Compare commits

..

14 Commits

Author SHA1 Message Date
notkshitij 58d38f9199 add answers for may-june 2025 + november-december 2025 pyqs for unit 6 (High Performance Computing Applications) 2026-05-26 01:41:12 +05:30
notkshitij ccca13880d add answers for may-june 2025 + november-december 2025 pyqs for unit 5 (CUDA Architecture) 2026-05-26 01:34:07 +05:30
notkshitij ccd6640ac9 add answers for may-june 2025 + november-december 2025 pyqs for unit 4 (Analytical Modeling of Parallel Programs) 2026-05-26 01:24:24 +05:30
notkshitij 1dcb16981b add answers for may-june 2025 + november-december 2025 pyqs for unit 3 (Parallel Communication) 2026-05-26 01:15:06 +05:30
notkshitij 344db2f477 add may-june 2025 + november-december 2025 pyqs for end-sem. 2026-05-25 23:20:03 +05:30
notkshitij 3f3b1a1978 add link for end-sem pyq answers in README. 2026-05-21 19:33:27 +05:30
notkshitij 786d318b88 add end-sem pyq answers for unit 6 (High Performance Computing Applications) 2026-05-21 19:29:44 +05:30
notkshitij fefa2383bb add end-sem pyq answers for unit 5 (CUDA Architecture) 2026-05-21 19:28:05 +05:30
notkshitij a90631ce37 add end-sem pyq answers for unit 4 (Analytical Modeling of Parallel Programs) 2026-05-21 19:23:03 +05:30
notkshitij 7a6b281521 add end-sem pyq answers for unit 3 (Parallel Communication) 2026-05-21 19:19:15 +05:30
notkshitij 84b5e3a059 add end-sem pyqs for HPC (may june 2023, may-june 2024) 2026-05-15 01:41:50 +05:30
notkshitij b8b405da94 fix title in markdown file for practical 4. 2026-05-04 23:53:57 +05:30
notkshitij d3ad26e1ca add link for 4th practical in README. 2026-05-04 23:53:26 +05:30
notkshitij 60783ed8cd add only the program file for CUDA program; practical 4. 2026-05-04 23:52:49 +05:30
11 changed files with 165 additions and 1 deletions
+1 -1
View File
@@ -1,4 +1,4 @@
# Practical-4 (CUDA Programs for Addition and Multiplication)
# Practical-4 (Vector Addition and Matrix Multiplication)
Problem Statement:
Write a CUDA Program for:
+161
View File
@@ -0,0 +1,161 @@
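// %%writefile cuda_program.cu   <- Colab/Jupyter cell magic, kept only as a note: when pasting this file into a notebook cell, put a bare "%%writefile cuda_program.cu" line above it.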
#include <iostream>
#include <cuda.h>
using namespace std;
// Threads per block along each launched dimension. The vector-addition launch
// uses BLOCK_SIZE threads per block; the matrix-multiplication launch uses
// BLOCK_SIZE x BLOCK_SIZE threads per block and a matrix side of K * BLOCK_SIZE.
#define BLOCK_SIZE 2
// ─── Vector Addition Kernel ──────────────────────────────────────────────────
// Element-wise sum C = A + B over N ints; every thread handles at most one
// index, derived from its 1D block and thread coordinates.
__global__ void vectorAdd(int *A, int *B, int *C, int N) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // The launch rounds the block count up, so the last block may contain
    // threads whose index lies past the end of the vectors; those do nothing.
    if (idx >= N)
        return;

    C[idx] = A[idx] + B[idx];
}
// ─── Matrix Multiplication Kernel ────────────────────────────────────────────
// Computes C = A * B for square N x N matrices stored row-major, one output
// element per thread.
//
// Launch layout: 2D grid of 2D blocks. The y coordinate selects the output
// row and the x coordinate the output column. The grid may over-cover the
// matrix; threads whose (row, col) fall outside N x N exit without touching
// memory. A, B and C must each hold N * N floats.
__global__ void matrixMul(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the tail: without this, a grid that is not an exact fit for the
    // matrix would read and write past the end of the buffers.
    if (row >= N || col >= N)
        return;

    // Offsets are computed in size_t because row * N exceeds the int range
    // once N grows beyond 46340.
    const size_t n = static_cast<size_t>(N);
    const size_t rowBase = static_cast<size_t>(row) * n;

    // Dot product of row `row` of A with column `col` of B.
    float sum = 0.0f;
    for (size_t k = 0; k < n; ++k)
        sum += A[rowBase + k] * B[k * n + col];

    C[rowBase + col] = sum;
}
// ─── Vector Addition ─────────────────────────────────────────────────────────
// Interactive demo: reads a vector size N from stdin, fills A[i] = i and
// B[i] = 2 * i on the host, computes C = A + B on the GPU with vectorAdd and
// prints all three vectors.
//
// A non-numeric or non-positive size, and any CUDA failure, is reported on
// stderr instead of being ignored. Host and device buffers are released on
// every path once they have been allocated.
void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    if (!(cin >> N) || N <= 0) {
        cerr << "Invalid vector size: expected a positive integer." << endl;
        return;
    }

    // Byte count is kept in size_t: N * sizeof(int) does not fit in an int
    // for large N.
    const size_t bytes = static_cast<size_t>(N) * sizeof(int);

    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];
    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }

    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;

    // Device allocation and transfer. Each step runs only if every earlier
    // step succeeded, so `err` always holds the first failure.
    int *dA = NULL, *dB = NULL, *dC = NULL;
    cudaError_t err = cudaMalloc(&dA, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dB, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dC, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // Round the block count up so the tail of the vector is covered.
        // Written as quotient + remainder test so the sum cannot overflow int.
        const int numBlocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);
        // A kernel launch returns nothing; a bad launch configuration is only
        // visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    // The blocking copy waits for the kernel, so a fault during kernel
    // execution is reported by this call.
    if (err == cudaSuccess) err = cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++) cout << hC[i] << " ";
        cout << endl;
    } else {
        cerr << "CUDA error during vector addition: "
             << cudaGetErrorString(err) << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    // cudaFree on a null pointer is a no-op, so this is safe even when an
    // allocation above failed.
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
// ─── Matrix Multiplication ───────────────────────────────────────────────────
// Prints an N x N row-major matrix to stdout, one row per line.
static void printSquareMatrix(const float *M, int N) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            cout << M[static_cast<size_t>(row) * N + col] << " ";
        cout << endl;
    }
}

// Interactive demo: reads K from stdin, builds two N x N matrices with
// N = K * BLOCK_SIZE (A filled with 2, B filled with 4), multiplies them on
// the GPU with matrixMul and prints A, B and the product.
//
// A non-numeric or out-of-range K, and any CUDA failure, is reported on
// stderr instead of being ignored. Host and device buffers are released on
// every path once they have been allocated.
void runMatrixMultiplication() {
    // The grid is K x K blocks and gridDim.y is limited to 65535 blocks.
    // Bounding K here also keeps N = K * BLOCK_SIZE inside the int range.
    const int kMaxGridDim = 65535;

    int K = 0;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
    if (!(cin >> K) || K <= 0 || K > kMaxGridDim) {
        cerr << "Invalid K: expected an integer between 1 and "
             << kMaxGridDim << "." << endl;
        return;
    }

    const int N = K * BLOCK_SIZE;
    cout << "Matrix size: " << N << " x " << N << endl;

    // Element and byte counts are kept in size_t: N * N * sizeof(float)
    // does not fit in an int for large N.
    const size_t elems = static_cast<size_t>(N) * static_cast<size_t>(N);
    const size_t bytes = elems * sizeof(float);

    // Host allocation and initialisation
    float *hA = new float[elems];
    float *hB = new float[elems];
    float *hC = new float[elems];
    for (size_t i = 0; i < elems; i++) {
        hA[i] = 2.0f;
        hB[i] = 4.0f;
    }

    cout << "\nMatrix A:\n";
    printSquareMatrix(hA, N);
    cout << "\nMatrix B:\n";
    printSquareMatrix(hB, N);

    // Device allocation and transfer. Each step runs only if every earlier
    // step succeeded, so `err` always holds the first failure.
    float *dA = NULL, *dB = NULL, *dC = NULL;
    cudaError_t err = cudaMalloc(&dA, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dB, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dC, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice);

    if (err == cudaSuccess) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);
        // A kernel launch returns nothing; a bad launch configuration is only
        // visible through cudaGetLastError().
        err = cudaGetLastError();
    }

    // The blocking copy waits for the kernel, so a fault during kernel
    // execution is reported by this call.
    if (err == cudaSuccess) err = cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) {
        cout << "\nResult C = A * B:\n";
        printSquareMatrix(hC, N);
    } else {
        cerr << "CUDA error during matrix multiplication: "
             << cudaGetErrorString(err) << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    // cudaFree on a null pointer is a no-op, so this is safe even when an
    // allocation above failed.
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
// Entry point: runs the vector-addition demo, then the matrix-multiplication
// demo, and prints a closing line.
int main() {
    runVectorAddition();
    runMatrixMultiplication();

    std::cout << "\nFinished." << std::endl;
    return 0;
}
+3
View File
@@ -13,6 +13,7 @@ This repository compiles essential resources for the SPPU Computer Engineering P
1. [Code-1 (Parallel BFS and DFS)](Codes/Code-1.cpp)
2. [Code-2 (Sequential and Parallel Bubble Sort and Merge Sort)](Codes/Code-2.cpp)
3. [Code-3 (Min, Max, Sum, Average)](Codes/Code-3.cpp)
4. [Code-4 (Vector Addition and Matrix Multiplication)](Codes/Code-4/)
### Practical
@@ -38,6 +39,8 @@ This repository compiles essential resources for the SPPU Computer Engineering P
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)
### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers/)
---
## Miscellaneous