Compare commits
14 Commits
aaa405c02a
..
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
58d38f9199
|
|||
|
ccca13880d
|
|||
|
ccd6640ac9
|
|||
|
1dcb16981b
|
|||
|
344db2f477
|
|||
|
3f3b1a1978
|
|||
|
786d318b88
|
|||
|
fefa2383bb
|
|||
|
a90631ce37
|
|||
|
7a6b281521
|
|||
|
84b5e3a059
|
|||
|
b8b405da94
|
|||
|
d3ad26e1ca
|
|||
|
60783ed8cd
|
@@ -1,4 +1,4 @@
|
|||||||
# Practical-4 (CUDA Programs for Addition and Multiplication)
|
# Practical-4 (Vector Addition and Matrix Multiplication)
|
||||||
|
|
||||||
Problem Statement:
|
Problem Statement:
|
||||||
Write a CUDA Program for:
|
Write a CUDA Program for:
|
||||||
|
|||||||
@@ -0,0 +1,161 @@
|
|||||||
|
# %%writefile cuda_program.cu
|
||||||
|
#include <iostream>
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
#define BLOCK_SIZE 2
|
||||||
|
|
||||||
|
// ─── Vector Addition Kernel ──────────────────────────────────────────────────
|
||||||
|
// Each thread computes a single element of C = A + B.
|
||||||
|
// Element-wise integer vector sum: C[k] = A[k] + B[k] for every k in [0, N).
//
// Launch layout: 1-D grid of 1-D blocks; the element handled by a thread is
// derived from blockIdx.x, blockDim.x and threadIdx.x only. The launch may
// contain more threads than elements: surplus threads exit without touching
// memory.
__global__ void vectorAdd(int *A, int *B, int *C, int N) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // Threads past the end of the vectors have nothing to do.
    if (idx >= N) {
        return;
    }

    C[idx] = A[idx] + B[idx];
}
|
||||||
|
|
||||||
|
// ─── Matrix Multiplication Kernel ────────────────────────────────────────────
|
||||||
|
// Each thread computes a single element of C = A * B.
|
||||||
|
// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.
|
||||||
|
// Computes the N x N row-major product C = A * B, one output element per thread.
//
// Launch layout: 2-D grid of 2-D blocks. The x dimension selects the output
// column and the y dimension selects the output row. The grid may overshoot N
// in either dimension, so N does not have to be a multiple of the block size.
//
// A, B: N * N input elements each, only read by the kernel.
// C:    N * N output elements; element (row, col) is written exactly once.
__global__ void matrixMul(float *A, float *B, float *C, int N) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the matrix must not run: with col >= N the flattened
    // index row * N + col aliases into the next row, and with row >= N it
    // falls past the end of the buffers.
    if (row >= N || col >= N)
        return;

    // Index arithmetic is done in size_t so row * N cannot overflow int.
    const size_t n = static_cast<size_t>(N);
    const size_t rowBase = static_cast<size_t>(row) * n;

    float sum = 0.0f;
    for (size_t k = 0; k < n; ++k)
        sum += A[rowBase + k] * B[k * n + col];

    C[rowBase + col] = sum;
}
|
||||||
|
|
||||||
|
// ─── Vector Addition ─────────────────────────────────────────────────────────
|
||||||
|
// Interactive vector-addition demo.
//
// Reads the vector length N from stdin, fills A[i] = i and B[i] = 2 * i on the
// host, prints both, computes C = A + B with the vectorAdd kernel and prints
// the result. Invalid input (non-numeric or N <= 0) and any CUDA failure are
// reported on stderr; in both cases the function returns without printing a
// result, after releasing whatever it allocated.
void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    if (!(cin >> N)) {
        // Reset the stream and drop the bad line so a later prompt can still
        // read its own input.
        cin.clear();
        cin.ignore(1 << 20, '\n');
        cerr << "Invalid input: vector size must be an integer." << endl;
        return;
    }
    if (N <= 0) {
        cerr << "Invalid vector size " << N << ": must be positive." << endl;
        return;
    }

    // Byte count kept in size_t: N * sizeof(int) does not fit in int for large N.
    const size_t bytes = static_cast<size_t>(N) * sizeof(int);

    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];

    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }

    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;

    // Reports a failed CUDA call on stderr and tells the caller whether to go on.
    auto ok = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess) return true;
        cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
        return false;
    };

    // Device allocation and transfer. Pointers start as nullptr so the cleanup
    // below is valid no matter which step failed.
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    bool success =
        ok(cudaMalloc(&dA, bytes), "cudaMalloc(dA)") &&
        ok(cudaMalloc(&dB, bytes), "cudaMalloc(dB)") &&
        ok(cudaMalloc(&dC, bytes), "cudaMalloc(dC)") &&
        ok(cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dA <- hA)") &&
        ok(cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dB <- hB)");

    if (success) {
        // Ceiling division so the tail of the vector is covered; written
        // without N + BLOCK_SIZE - 1 to avoid overflow near INT_MAX.
        const int numBlocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);

        // A kernel launch returns nothing: configuration errors are picked up
        // by cudaGetLastError(), execution errors by the blocking copy below.
        success = ok(cudaGetLastError(), "vectorAdd launch") &&
                  ok(cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hC <- dC)");
    }

    if (success) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++) cout << hC[i] << " ";
        cout << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
|
||||||
|
|
||||||
|
// ─── Matrix Multiplication ───────────────────────────────────────────────────
|
||||||
|
// Interactive matrix-multiplication demo.
//
// Reads K from stdin and builds two N x N matrices with N = K * BLOCK_SIZE
// (A filled with 2, B filled with 4). It prints both, computes C = A * B with
// the matrixMul kernel on a K x K grid of BLOCK_SIZE x BLOCK_SIZE blocks, and
// prints C. Invalid input and CUDA failures are reported on stderr; in both
// cases the function returns without printing a result, after releasing
// whatever it allocated.
void runMatrixMultiplication() {
    int K = 0;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
    if (!(cin >> K)) {
        // Reset the stream and drop the bad line so later reads are not poisoned.
        cin.clear();
        cin.ignore(1 << 20, '\n');
        cerr << "Invalid input: K must be an integer." << endl;
        return;
    }

    // The grid is K x K blocks and CUDA limits gridDim.y to 65535. Bounding K
    // here also keeps K * BLOCK_SIZE comfortably inside int for a small
    // BLOCK_SIZE.
    const int kMaxGridDim = 65535;
    if (K <= 0 || K > kMaxGridDim) {
        cerr << "Invalid K " << K << ": must be between 1 and " << kMaxGridDim << "." << endl;
        return;
    }

    const int N = K * BLOCK_SIZE;
    cout << "Matrix size: " << N << " x " << N << endl;

    // Element and byte counts are kept in size_t: N * N overflows int for
    // N >= 46341.
    const size_t count = static_cast<size_t>(N) * static_cast<size_t>(N);
    const size_t bytes = count * sizeof(float);

    // Host allocation and initialisation
    float *hA = new float[count];
    float *hB = new float[count];
    float *hC = new float[count];

    for (size_t i = 0; i < count; i++) {
        hA[i] = 2.0f;
        hB[i] = 4.0f;
    }

    // Prints a title followed by an N x N row-major matrix, one row per line.
    auto printMatrix = [N](const char *title, const float *m) {
        cout << title;
        for (int row = 0; row < N; row++) {
            for (int col = 0; col < N; col++)
                cout << m[static_cast<size_t>(row) * N + col] << " ";
            cout << endl;
        }
    };

    printMatrix("\nMatrix A:\n", hA);
    printMatrix("\nMatrix B:\n", hB);

    // Reports a failed CUDA call on stderr and tells the caller whether to go on.
    auto ok = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess) return true;
        cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << endl;
        return false;
    };

    // Device allocation and transfer. Pointers start as nullptr so the cleanup
    // below is valid no matter which step failed.
    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    bool success =
        ok(cudaMalloc(&dA, bytes), "cudaMalloc(dA)") &&
        ok(cudaMalloc(&dB, bytes), "cudaMalloc(dB)") &&
        ok(cudaMalloc(&dC, bytes), "cudaMalloc(dC)") &&
        ok(cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dA <- hA)") &&
        ok(cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dB <- hB)");

    if (success) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);

        // A kernel launch returns nothing: configuration errors are picked up
        // by cudaGetLastError(), execution errors by the blocking copy below.
        success = ok(cudaGetLastError(), "matrixMul launch") &&
                  ok(cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(hC <- dC)");
    }

    if (success)
        printMatrix("\nResult C = A * B:\n", hC);

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
|
||||||
|
|
||||||
|
// Program entry point: runs the vector-addition demo, then the
// matrix-multiplication demo, prints a closing line and exits with status 0.
int main() {
    // Each demo prompts for its own size on stdin.
    runVectorAddition();
    runMatrixMultiplication();

    std::cout << "\nFinished." << std::endl;

    return 0;
}
|
||||||
BIN
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
@@ -13,6 +13,7 @@ This repository compiles essential resources for the SPPU Computer Engineering P
|
|||||||
1. [Code-1 (Parallel BFS and DFS)](Codes/Code-1.cpp)
|
1. [Code-1 (Parallel BFS and DFS)](Codes/Code-1.cpp)
|
||||||
2. [Code-2 (Sequential and Parallel Bubble Sort and Merge Sort)](Codes/Code-2.cpp)
|
2. [Code-2 (Sequential and Parallel Bubble Sort and Merge Sort)](Codes/Code-2.cpp)
|
||||||
3. [Code-3 (Min, Max, Sum, Average)](Codes/Code-3.cpp)
|
3. [Code-3 (Min, Max, Sum, Average)](Codes/Code-3.cpp)
|
||||||
|
4. [Code-4 (Vector Addition and Matrix Multiplication)](Codes/Code-4/)
|
||||||
|
|
||||||
### Practical
|
### Practical
|
||||||
|
|
||||||
@@ -38,6 +39,8 @@ This repository compiles essential resources for the SPPU Computer Engineering P
|
|||||||
|
|
||||||
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)
|
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)
|
||||||
|
|
||||||
|
### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers/)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Miscellaneous
|
## Miscellaneous
|
||||||
|
|||||||
Reference in New Issue
Block a user