Compare commits
19 Commits
26ef8ceb1b
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
58d38f9199
|
|||
|
ccca13880d
|
|||
|
ccd6640ac9
|
|||
|
1dcb16981b
|
|||
|
344db2f477
|
|||
|
3f3b1a1978
|
|||
|
786d318b88
|
|||
|
fefa2383bb
|
|||
|
a90631ce37
|
|||
|
7a6b281521
|
|||
|
84b5e3a059
|
|||
|
b8b405da94
|
|||
|
d3ad26e1ca
|
|||
|
60783ed8cd
|
|||
|
aaa405c02a
|
|||
|
5f94348c49
|
|||
|
4e5913d6e4
|
|||
|
a521ac1ca1
|
|||
|
3a3c78ad6d
|
@@ -0,0 +1,3 @@
|
|||||||
|
attachments/change-runtime.png filter=lfs diff=lfs merge=lfs -text
|
||||||
|
attachments/runtime-navbar.png filter=lfs diff=lfs merge=lfs -text
|
||||||
|
attachments/select-t4-gpu.png filter=lfs diff=lfs merge=lfs -text
|
||||||
@@ -0,0 +1,267 @@
|
|||||||
|
# Practical-4 (Vector Addition and Matrix Multiplication)
|
||||||
|
|
||||||
|
Problem Statement:
|
||||||
|
Write a CUDA Program for:
|
||||||
|
1. Addition of two large vectors
|
||||||
|
2. Matrix Multiplication using CUDA C
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
1. Open [Google Colab](https://colab.research.google.com/)
|
||||||
|
2. Create a new Jupyter Notebook
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
### 1. After creating a new Jupyter notebook, click on "Runtime" in the navbar:
|
||||||
|
|
||||||
|
<img src="attachments/runtime-navbar.png" alt="Runtime in navbar in Google Colab" width=350>
|
||||||
|
|
||||||
|
### 2. Then, choose "Change runtime type":
|
||||||
|
|
||||||
|
<img src="attachments/change-runtime.png" alt="Change runtime type option in Runtime section on Google Colab" width=300>
|
||||||
|
|
||||||
|
### 3. Select "T4 GPU", and save:
|
||||||
|
|
||||||
|
<img src="attachments/select-t4-gpu.png" alt="T4 GPU option selected in Google Colab as Runtime" width=300>
|
||||||
|
|
||||||
|
### 4. Check if `nvcc` is installed:
|
||||||
|
|
||||||
|
```python3
|
||||||
|
!nvcc --version
|
||||||
|
```
|
||||||
|
|
||||||
|
### 5. Install `nvcc4jupyter`:
|
||||||
|
|
||||||
|
```python3
|
||||||
|
!pip install nvcc4jupyter
|
||||||
|
# Or if the above command fails, comment the above line and run
|
||||||
|
# !pip install git+https://git.kska.io/notkshitij/nvcc.git
|
||||||
|
```
|
||||||
|
|
||||||
|
### 6. Load it:
|
||||||
|
|
||||||
|
```python3
|
||||||
|
%load_ext nvcc4jupyter
|
||||||
|
```
|
||||||
|
|
||||||
|
### 7. Paste the code below into a new code cell:
|
||||||
|
|
||||||
|
```cu
|
||||||
|
%%writefile cuda_program.cu
|
||||||
|
#include <iostream>
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
#define BLOCK_SIZE 2
|
||||||
|
|
||||||
|
// Vector Addition Kernel
// Element-wise sum of two integer vectors: C[i] = A[i] + B[i] for 0 <= i < N.
// Each thread derives one global index from its block and thread coordinates
// (x dimension only) and handles at most one element.
__global__ void vectorAdd(int *A, int *B, int *C, int N) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads whose index falls past the end of the vectors (possible whenever
    // N is not a multiple of the block size) have nothing to do.
    if (idx >= N)
        return;

    C[idx] = A[idx] + B[idx];
}
|
||||||
|
|
||||||
|
// Matrix Multiplication Kernel
// Computes C = A * B for square N x N matrices stored row-major in flat arrays.
// Uses a 2D launch: the y coordinate selects the output row, the x coordinate
// the output column, and each in-range thread produces one element of C as the
// dot product of row `row` of A with column `col` of B.
__global__ void matrixMul(float *A, float *B, float *C, int N) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the matrix (grid larger than N in either dimension) must
    // not touch memory; without this guard they would read and write out of
    // bounds.
    if (row >= N || col >= N)
        return;

    // Index in size_t so that row * N + col cannot overflow int for large N.
    const size_t n = static_cast<size_t>(N);
    const size_t rowStart = static_cast<size_t>(row) * n;

    float sum = 0.0f;
    for (size_t k = 0; k < n; ++k)
        sum += A[rowStart + k] * B[k * n + col];

    C[rowStart + col] = sum;
}
|
||||||
|
|
||||||
|
// Vector Addition
// Interactive driver for the vectorAdd kernel. Reads the vector length N from
// standard input, fills A[i] = i and B[i] = 2 * i on the host, computes
// C = A + B on the GPU and prints all three vectors.
// Invalid input (non-numeric or N <= 0) and any CUDA failure are reported on
// stderr; in that case no result line is printed.
void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    if (!(cin >> N) || N <= 0) {
        cerr << "Invalid vector size: expected a positive integer." << endl;
        return;
    }

    // Byte count in size_t so N * sizeof(int) is never narrowed to int.
    const size_t bytes = static_cast<size_t>(N) * sizeof(int);

    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];

    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }

    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto ok = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess)
            return true;
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
        return false;
    };

    // Device allocation and transfer. The pointers start as nullptr so the
    // cleanup at the end is safe even if an allocation fails.
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    bool good = ok(cudaMalloc(&dA, bytes), "cudaMalloc A") &&
                ok(cudaMalloc(&dB, bytes), "cudaMalloc B") &&
                ok(cudaMalloc(&dC, bytes), "cudaMalloc C") &&
                ok(cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice), "copy A to device") &&
                ok(cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice), "copy B to device");

    if (good) {
        // Round the block count up so the tail of the vector is covered.
        // Written without N + BLOCK_SIZE - 1 to avoid int overflow near INT_MAX.
        const int numBlocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);

        // A kernel launch returns no status: configuration errors surface via
        // cudaGetLastError(), execution errors via the blocking copy that follows.
        good = ok(cudaGetLastError(), "vectorAdd launch") &&
               ok(cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (good) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++) cout << hC[i] << " ";
        cout << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
|
||||||
|
|
||||||
|
// Matrix Multiplication
// Interactive driver for the matrixMul kernel. Reads K from standard input,
// builds two N x N matrices (N = K * BLOCK_SIZE) filled with 2 and 4, computes
// C = A * B on the GPU with a K x K grid of BLOCK_SIZE x BLOCK_SIZE blocks and
// prints A, B and C.
// Invalid input and any CUDA failure are reported on stderr; in that case no
// result matrix is printed.
void runMatrixMultiplication() {
    // Largest number of blocks a grid may have in the y dimension.
    const int kMaxGridDimY = 65535;

    int K = 0;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
    if (!(cin >> K) || K <= 0 || K > kMaxGridDimY) {
        cerr << "Invalid K: expected an integer between 1 and " << kMaxGridDimY << "." << endl;
        return;
    }
    const int N = K * BLOCK_SIZE;

    cout << "Matrix size: " << N << " x " << N << endl;

    // Element and byte counts in size_t: N * N overflows int once N > 46340.
    const size_t count = static_cast<size_t>(N) * static_cast<size_t>(N);
    const size_t bytes = count * sizeof(float);

    // Host allocation and initialisation
    float *hA = new float[count];
    float *hB = new float[count];
    float *hC = new float[count];

    for (size_t i = 0; i < count; i++) {
        hA[i] = 2;
        hB[i] = 4;
    }

    // Prints a row-major N x N matrix, one row per line.
    auto printMatrix = [N](const float *m) {
        for (int row = 0; row < N; row++) {
            for (int col = 0; col < N; col++)
                cout << m[static_cast<size_t>(row) * N + col] << " ";
            cout << endl;
        }
    };

    cout << "\nMatrix A:\n";
    printMatrix(hA);

    cout << "\nMatrix B:\n";
    printMatrix(hB);

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto ok = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess)
            return true;
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
        return false;
    };

    // Device allocation and transfer. The pointers start as nullptr so the
    // cleanup at the end is safe even if an allocation fails.
    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    bool good = ok(cudaMalloc(&dA, bytes), "cudaMalloc A") &&
                ok(cudaMalloc(&dB, bytes), "cudaMalloc B") &&
                ok(cudaMalloc(&dC, bytes), "cudaMalloc C") &&
                ok(cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice), "copy A to device") &&
                ok(cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice), "copy B to device");

    if (good) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);

        // A kernel launch returns no status: configuration errors surface via
        // cudaGetLastError(), execution errors via the blocking copy that follows.
        good = ok(cudaGetLastError(), "matrixMul launch") &&
               ok(cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (good) {
        cout << "\nResult C = A * B:\n";
        printMatrix(hC);
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
|
||||||
|
|
||||||
|
// Program entry point: runs the vector addition demo, then the matrix
// multiplication demo, prints a closing message and exits with status 0.
int main() {
    runVectorAddition();
    runMatrixMultiplication();

    std::cout << "\nFinished.";
    std::cout << std::endl;

    return 0;
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8. Compile and run:
|
||||||
|
|
||||||
|
```python3
|
||||||
|
!nvcc cuda_program.cu -o cuda_program && ./cuda_program
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Sample output
|
||||||
|
|
||||||
|
```md
|
||||||
|
=== Vector Addition ===
|
||||||
|
Enter vector size: 2
|
||||||
|
|
||||||
|
Vector A: 0 1
|
||||||
|
Vector B: 0 2
|
||||||
|
Result A + B: 0 3
|
||||||
|
|
||||||
|
=== Matrix Multiplication ===
|
||||||
|
Enter K (matrix will be N x N where N = K * 2): 3
|
||||||
|
Matrix size: 6 x 6
|
||||||
|
|
||||||
|
Matrix A:
|
||||||
|
2 2 2 2 2 2
|
||||||
|
2 2 2 2 2 2
|
||||||
|
2 2 2 2 2 2
|
||||||
|
2 2 2 2 2 2
|
||||||
|
2 2 2 2 2 2
|
||||||
|
2 2 2 2 2 2
|
||||||
|
|
||||||
|
Matrix B:
|
||||||
|
4 4 4 4 4 4
|
||||||
|
4 4 4 4 4 4
|
||||||
|
4 4 4 4 4 4
|
||||||
|
4 4 4 4 4 4
|
||||||
|
4 4 4 4 4 4
|
||||||
|
4 4 4 4 4 4
|
||||||
|
|
||||||
|
Result C = A * B:
|
||||||
|
48 48 48 48 48 48
|
||||||
|
48 48 48 48 48 48
|
||||||
|
48 48 48 48 48 48
|
||||||
|
48 48 48 48 48 48
|
||||||
|
48 48 48 48 48 48
|
||||||
|
48 48 48 48 48 48
|
||||||
|
|
||||||
|
Finished.
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
@@ -0,0 +1,348 @@
|
|||||||
|
{
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"provenance": [],
|
||||||
|
"gpuType": "T4"
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python3",
|
||||||
|
"display_name": "Python 3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
},
|
||||||
|
"accelerator": "GPU"
|
||||||
|
},
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "sSZ5XEy-IFoj",
|
||||||
|
"outputId": "8bac00c3-0327-4682-f636-b6b253db5201"
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"nvcc: NVIDIA (R) Cuda compiler driver\n",
|
||||||
|
"Copyright (c) 2005-2025 NVIDIA Corporation\n",
|
||||||
|
"Built on Fri_Feb_21_20:23:50_PST_2025\n",
|
||||||
|
"Cuda compilation tools, release 12.8, V12.8.93\n",
|
||||||
|
"Build cuda_12.8.r12.8/compiler.35583870_0\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!nvcc --version"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"!pip install nvcc4jupyter\n",
|
||||||
|
"# Or if the above command fails, comment the above line and run\n",
|
||||||
|
"# !pip install git+https://git.kska.io/notkshitij/nvcc.git"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "1jHq-AKfIINd",
|
||||||
|
"outputId": "818ccbb0-9383-4be5-cd79-c9527c5806c9"
|
||||||
|
},
|
||||||
|
"execution_count": 25,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Requirement already satisfied: nvcc4jupyter in /usr/local/lib/python3.12/dist-packages (1.2.1)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"%load_ext nvcc4jupyter"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "9nuQsRZMIROH",
|
||||||
|
"outputId": "e8508f1c-9895-4e8c-e7f0-d9e34aab21a1"
|
||||||
|
},
|
||||||
|
"execution_count": 16,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"The nvcc4jupyter extension is already loaded. To reload it, use:\n",
|
||||||
|
" %reload_ext nvcc4jupyter\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"%%writefile cuda_program.cu\n",
|
||||||
|
"#include <iostream>\n",
|
||||||
|
"#include <cuda.h>\n",
|
||||||
|
"\n",
|
||||||
|
"using namespace std;\n",
|
||||||
|
"\n",
|
||||||
|
"#define BLOCK_SIZE 2\n",
|
||||||
|
"\n",
|
||||||
|
"// Vector Addition Kernel\n",
|
||||||
|
"// Each thread computes a single element of C = A + B.\n",
|
||||||
|
"__global__ void vectorAdd(int *A, int *B, int *C, int N) {\n",
|
||||||
|
" int i = blockIdx.x * blockDim.x + threadIdx.x;\n",
|
||||||
|
" // Guard against threads beyond the vector size (when N is not a multiple\n",
|
||||||
|
" // of the block size, some threads in the last block are out of range).\n",
|
||||||
|
" if (i < N)\n",
|
||||||
|
" C[i] = A[i] + B[i];\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"// Matrix Multiplication Kernel\n",
|
||||||
|
"// Each thread computes a single element of C = A * B.\n",
|
||||||
|
"// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.\n",
|
||||||
|
"__global__ void matrixMul(float *A, float *B, float *C, int N) {\n",
|
||||||
|
" int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
|
||||||
|
" int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
|
||||||
|
"\n",
|
||||||
|
" float sum = 0.0f;\n",
|
||||||
|
" for (int n = 0; n < N; ++n)\n",
|
||||||
|
" sum += A[row * N + n] * B[n * N + col];\n",
|
||||||
|
"\n",
|
||||||
|
" C[row * N + col] = sum;\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"// Vector Addition\n",
|
||||||
|
"void runVectorAddition() {\n",
|
||||||
|
" int N;\n",
|
||||||
|
" cout << \"\\n=== Vector Addition ===\" << endl;\n",
|
||||||
|
" cout << \"Enter vector size: \";\n",
|
||||||
|
" cin >> N;\n",
|
||||||
|
"\n",
|
||||||
|
" int size = N * sizeof(int);\n",
|
||||||
|
"\n",
|
||||||
|
" // Host allocation and initialisation\n",
|
||||||
|
" int *hA = new int[N];\n",
|
||||||
|
" int *hB = new int[N];\n",
|
||||||
|
" int *hC = new int[N];\n",
|
||||||
|
"\n",
|
||||||
|
" for (int i = 0; i < N; i++) {\n",
|
||||||
|
" hA[i] = i;\n",
|
||||||
|
" hB[i] = i * 2;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"\\nVector A: \";\n",
|
||||||
|
" for (int i = 0; i < N; i++) cout << hA[i] << \" \";\n",
|
||||||
|
" cout << \"\\nVector B: \";\n",
|
||||||
|
" for (int i = 0; i < N; i++) cout << hB[i] << \" \";\n",
|
||||||
|
" cout << endl;\n",
|
||||||
|
"\n",
|
||||||
|
" // Device allocation and transfer\n",
|
||||||
|
" int *dA, *dB, *dC;\n",
|
||||||
|
" cudaMalloc(&dA, size);\n",
|
||||||
|
" cudaMalloc(&dB, size);\n",
|
||||||
|
" cudaMalloc(&dC, size);\n",
|
||||||
|
"\n",
|
||||||
|
" cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
|
||||||
|
" cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
|
||||||
|
"\n",
|
||||||
|
" // Launch with enough blocks to cover all N elements.\n",
|
||||||
|
" // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.\n",
|
||||||
|
" int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;\n",
|
||||||
|
" vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);\n",
|
||||||
|
"\n",
|
||||||
|
" cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"Result A + B: \";\n",
|
||||||
|
" for (int i = 0; i < N; i++) cout << hC[i] << \" \";\n",
|
||||||
|
" cout << endl;\n",
|
||||||
|
"\n",
|
||||||
|
" delete[] hA;\n",
|
||||||
|
" delete[] hB;\n",
|
||||||
|
" delete[] hC;\n",
|
||||||
|
" cudaFree(dA);\n",
|
||||||
|
" cudaFree(dB);\n",
|
||||||
|
" cudaFree(dC);\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"// Matrix Multiplication\n",
|
||||||
|
"void runMatrixMultiplication() {\n",
|
||||||
|
" int K, N;\n",
|
||||||
|
" cout << \"\\n=== Matrix Multiplication ===\" << endl;\n",
|
||||||
|
" cout << \"Enter K (matrix will be N x N where N = K * \" << BLOCK_SIZE << \"): \";\n",
|
||||||
|
" cin >> K;\n",
|
||||||
|
" N = K * BLOCK_SIZE;\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"Matrix size: \" << N << \" x \" << N << endl;\n",
|
||||||
|
" int size = N * N * sizeof(float);\n",
|
||||||
|
"\n",
|
||||||
|
" // Host allocation and initialisation\n",
|
||||||
|
" float *hA = new float[N * N];\n",
|
||||||
|
" float *hB = new float[N * N];\n",
|
||||||
|
" float *hC = new float[N * N];\n",
|
||||||
|
"\n",
|
||||||
|
" for (int j = 0; j < N; j++) {\n",
|
||||||
|
" for (int i = 0; i < N; i++) {\n",
|
||||||
|
" hA[j * N + i] = 2;\n",
|
||||||
|
" hB[j * N + i] = 4;\n",
|
||||||
|
" }\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"\\nMatrix A:\\n\";\n",
|
||||||
|
" for (int row = 0; row < N; row++) {\n",
|
||||||
|
" for (int col = 0; col < N; col++)\n",
|
||||||
|
" cout << hA[row * N + col] << \" \";\n",
|
||||||
|
" cout << endl;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"\\nMatrix B:\\n\";\n",
|
||||||
|
" for (int row = 0; row < N; row++) {\n",
|
||||||
|
" for (int col = 0; col < N; col++)\n",
|
||||||
|
" cout << hB[row * N + col] << \" \";\n",
|
||||||
|
" cout << endl;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" // Device allocation and transfer\n",
|
||||||
|
" float *dA, *dB, *dC;\n",
|
||||||
|
" cudaMalloc(&dA, size);\n",
|
||||||
|
" cudaMalloc(&dB, size);\n",
|
||||||
|
" cudaMalloc(&dC, size);\n",
|
||||||
|
"\n",
|
||||||
|
" cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
|
||||||
|
" cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
|
||||||
|
"\n",
|
||||||
|
" // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.\n",
|
||||||
|
" // grid: K x K blocks, so total threads = N x N (one per output element).\n",
|
||||||
|
" dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);\n",
|
||||||
|
" dim3 grid(K, K);\n",
|
||||||
|
" matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);\n",
|
||||||
|
"\n",
|
||||||
|
" cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"\\nResult C = A * B:\\n\";\n",
|
||||||
|
" for (int row = 0; row < N; row++) {\n",
|
||||||
|
" for (int col = 0; col < N; col++)\n",
|
||||||
|
" cout << hC[row * N + col] << \" \";\n",
|
||||||
|
" cout << endl;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" delete[] hA;\n",
|
||||||
|
" delete[] hB;\n",
|
||||||
|
" delete[] hC;\n",
|
||||||
|
" cudaFree(dA);\n",
|
||||||
|
" cudaFree(dB);\n",
|
||||||
|
" cudaFree(dC);\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"int main() {\n",
|
||||||
|
" runVectorAddition();\n",
|
||||||
|
" runMatrixMultiplication();\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"\\nFinished.\" << endl;\n",
|
||||||
|
" return 0;\n",
|
||||||
|
"}\n"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "nvCj8UmhIh3o",
|
||||||
|
"outputId": "54a31aec-f860-4b72-a4e1-03a392def6f6"
|
||||||
|
},
|
||||||
|
"execution_count": 23,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Overwriting cuda_program.cu\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"!nvcc cuda_program.cu -o cuda_program && ./cuda_program"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "F7fC0LtbJ5o8",
|
||||||
|
"outputId": "9d5988b2-ad42-4b0b-c84d-c1698e778bb9"
|
||||||
|
},
|
||||||
|
"execution_count": 24,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"nvcc warning : Support for offline compilation for architectures prior to '<compute/sm/lto>_75' will be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n",
|
||||||
|
"\n",
|
||||||
|
"=== Vector Addition ===\n",
|
||||||
|
"Enter vector size: 2\n",
|
||||||
|
"\n",
|
||||||
|
"Vector A: 0 1 \n",
|
||||||
|
"Vector B: 0 2 \n",
|
||||||
|
"Result A + B: 0 3 \n",
|
||||||
|
"\n",
|
||||||
|
"=== Matrix Multiplication ===\n",
|
||||||
|
"Enter K (matrix will be N x N where N = K * 2): 3\n",
|
||||||
|
"Matrix size: 6 x 6\n",
|
||||||
|
"\n",
|
||||||
|
"Matrix A:\n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"\n",
|
||||||
|
"Matrix B:\n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"\n",
|
||||||
|
"Result C = A * B:\n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"\n",
|
||||||
|
"Finished.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [],
|
||||||
|
"metadata": {
|
||||||
|
"id": "HjhrulSNKHkq"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,161 @@
|
|||||||
|
// %%writefile cuda_program.cu
|
||||||
|
#include <iostream>
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
#define BLOCK_SIZE 2
|
||||||
|
|
||||||
|
// ─── Vector Addition Kernel ──────────────────────────────────────────────────
// Element-wise sum of two integer vectors: C[i] = A[i] + B[i] for 0 <= i < N.
// Each thread derives one global index from its block and thread coordinates
// (x dimension only) and handles at most one element.
__global__ void vectorAdd(int *A, int *B, int *C, int N) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads whose index falls past the end of the vectors (possible whenever
    // N is not a multiple of the block size) have nothing to do.
    if (idx >= N)
        return;

    C[idx] = A[idx] + B[idx];
}
|
||||||
|
|
||||||
|
// ─── Matrix Multiplication Kernel ────────────────────────────────────────────
// Computes C = A * B for square N x N matrices stored row-major in flat arrays.
// Uses a 2D launch: the y coordinate selects the output row, the x coordinate
// the output column, and each in-range thread produces one element of C as the
// dot product of row `row` of A with column `col` of B.
__global__ void matrixMul(float *A, float *B, float *C, int N) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the matrix (grid larger than N in either dimension) must
    // not touch memory; without this guard they would read and write out of
    // bounds.
    if (row >= N || col >= N)
        return;

    // Index in size_t so that row * N + col cannot overflow int for large N.
    const size_t n = static_cast<size_t>(N);
    const size_t rowStart = static_cast<size_t>(row) * n;

    float sum = 0.0f;
    for (size_t k = 0; k < n; ++k)
        sum += A[rowStart + k] * B[k * n + col];

    C[rowStart + col] = sum;
}
|
||||||
|
|
||||||
|
// ─── Vector Addition ─────────────────────────────────────────────────────────
// Interactive driver for the vectorAdd kernel. Reads the vector length N from
// standard input, fills A[i] = i and B[i] = 2 * i on the host, computes
// C = A + B on the GPU and prints all three vectors.
// Invalid input (non-numeric or N <= 0) and any CUDA failure are reported on
// stderr; in that case no result line is printed.
void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    if (!(cin >> N) || N <= 0) {
        cerr << "Invalid vector size: expected a positive integer." << endl;
        return;
    }

    // Byte count in size_t so N * sizeof(int) is never narrowed to int.
    const size_t bytes = static_cast<size_t>(N) * sizeof(int);

    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];

    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }

    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto ok = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess)
            return true;
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
        return false;
    };

    // Device allocation and transfer. The pointers start as nullptr so the
    // cleanup at the end is safe even if an allocation fails.
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    bool good = ok(cudaMalloc(&dA, bytes), "cudaMalloc A") &&
                ok(cudaMalloc(&dB, bytes), "cudaMalloc B") &&
                ok(cudaMalloc(&dC, bytes), "cudaMalloc C") &&
                ok(cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice), "copy A to device") &&
                ok(cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice), "copy B to device");

    if (good) {
        // Round the block count up so the tail of the vector is covered.
        // Written without N + BLOCK_SIZE - 1 to avoid int overflow near INT_MAX.
        const int numBlocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);

        // A kernel launch returns no status: configuration errors surface via
        // cudaGetLastError(), execution errors via the blocking copy that follows.
        good = ok(cudaGetLastError(), "vectorAdd launch") &&
               ok(cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (good) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++) cout << hC[i] << " ";
        cout << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
|
||||||
|
|
||||||
|
// ─── Matrix Multiplication ───────────────────────────────────────────────────
// Interactive driver for the matrixMul kernel. Reads K from standard input,
// builds two N x N matrices (N = K * BLOCK_SIZE) filled with 2 and 4, computes
// C = A * B on the GPU with a K x K grid of BLOCK_SIZE x BLOCK_SIZE blocks and
// prints A, B and C.
// Invalid input and any CUDA failure are reported on stderr; in that case no
// result matrix is printed.
void runMatrixMultiplication() {
    // Largest number of blocks a grid may have in the y dimension.
    const int kMaxGridDimY = 65535;

    int K = 0;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
    if (!(cin >> K) || K <= 0 || K > kMaxGridDimY) {
        cerr << "Invalid K: expected an integer between 1 and " << kMaxGridDimY << "." << endl;
        return;
    }
    const int N = K * BLOCK_SIZE;

    cout << "Matrix size: " << N << " x " << N << endl;

    // Element and byte counts in size_t: N * N overflows int once N > 46340.
    const size_t count = static_cast<size_t>(N) * static_cast<size_t>(N);
    const size_t bytes = count * sizeof(float);

    // Host allocation and initialisation
    float *hA = new float[count];
    float *hB = new float[count];
    float *hC = new float[count];

    for (size_t i = 0; i < count; i++) {
        hA[i] = 2;
        hB[i] = 4;
    }

    // Prints a row-major N x N matrix, one row per line.
    auto printMatrix = [N](const float *m) {
        for (int row = 0; row < N; row++) {
            for (int col = 0; col < N; col++)
                cout << m[static_cast<size_t>(row) * N + col] << " ";
            cout << endl;
        }
    };

    cout << "\nMatrix A:\n";
    printMatrix(hA);

    cout << "\nMatrix B:\n";
    printMatrix(hB);

    // Reports a failed CUDA call on stderr; returns true when the call succeeded.
    auto ok = [](cudaError_t status, const char *what) -> bool {
        if (status == cudaSuccess)
            return true;
        cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
        return false;
    };

    // Device allocation and transfer. The pointers start as nullptr so the
    // cleanup at the end is safe even if an allocation fails.
    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    bool good = ok(cudaMalloc(&dA, bytes), "cudaMalloc A") &&
                ok(cudaMalloc(&dB, bytes), "cudaMalloc B") &&
                ok(cudaMalloc(&dC, bytes), "cudaMalloc C") &&
                ok(cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice), "copy A to device") &&
                ok(cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice), "copy B to device");

    if (good) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);

        // A kernel launch returns no status: configuration errors surface via
        // cudaGetLastError(), execution errors via the blocking copy that follows.
        good = ok(cudaGetLastError(), "matrixMul launch") &&
               ok(cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (good) {
        cout << "\nResult C = A * B:\n";
        printMatrix(hC);
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
|
||||||
|
|
||||||
|
// Program entry point: runs the vector addition demo, then the matrix
// multiplication demo, prints a closing message and exits with status 0.
int main() {
    runVectorAddition();
    runMatrixMultiplication();

    std::cout << "\nFinished.";
    std::cout << std::endl;

    return 0;
}
|
||||||
BIN
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
@@ -10,6 +10,11 @@ This repository compiles essential resources for the SPPU Computer Engineering P
|
|||||||
|
|
||||||
### Codes
|
### Codes
|
||||||
|
|
||||||
|
1. [Code-1 (Parallel BFS and DFS)](Codes/Code-1.cpp)
|
||||||
|
2. [Code-2 (Sequential and Parallel Bubble Sort and Merge Sort)](Codes/Code-2.cpp)
|
||||||
|
3. [Code-3 (Min, Max, Sum, Average)](Codes/Code-3.cpp)
|
||||||
|
4. [Code-4 (Vector Addition and Matrix Multiplication)](Codes/Code-4/)
|
||||||
|
|
||||||
### Practical
|
### Practical
|
||||||
|
|
||||||
1. [Practical-1](Practical/Practical-1/)
|
1. [Practical-1](Practical/Practical-1/)
|
||||||
@@ -34,6 +39,8 @@ This repository compiles essential resources for the SPPU Computer Engineering P
|
|||||||
|
|
||||||
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)
|
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)
|
||||||
|
|
||||||
|
### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers/)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Miscellaneous
|
## Miscellaneous
|
||||||
|
|||||||
Reference in New Issue
Block a user