Compare commits

..

18 Commits

Author SHA1 Message Date
notkshitij 58d38f9199 add answers for may-june 2025 + november-december 2025 pyqs for unit 6 (High Performance Computing Applications) 2026-05-26 01:41:12 +05:30
notkshitij ccca13880d add answers for may-june 2025 + november-december 2025 pyqs for unit 5 (CUDA Architecture) 2026-05-26 01:34:07 +05:30
notkshitij ccd6640ac9 add answers for may-june 2025 + november-december 2025 pyqs for unit 4 (Analytical Modeling of Parallel Programs) 2026-05-26 01:24:24 +05:30
notkshitij 1dcb16981b add answers for may-june 2025 + november-december 2025 pyqs for unit 3 (Parallel Communication) 2026-05-26 01:15:06 +05:30
notkshitij 344db2f477 add may-june 2025 + november-december 2025 pyqs for end-sem. 2026-05-25 23:20:03 +05:30
notkshitij 3f3b1a1978 add link for end-sem pyq answers in README. 2026-05-21 19:33:27 +05:30
notkshitij 786d318b88 add end-sem pyq answers for unit 6 (High Performance Computing Applications) 2026-05-21 19:29:44 +05:30
notkshitij fefa2383bb add end-sem pyq answers for unit 5 (CUDA Architecture) 2026-05-21 19:28:05 +05:30
notkshitij a90631ce37 add end-sem pyq answers for unit 4 (Analytical Modeling of Parallel Programs) 2026-05-21 19:23:03 +05:30
notkshitij 7a6b281521 add end-sem pyq answers for unit 3 (Parallel Communication) 2026-05-21 19:19:15 +05:30
notkshitij 84b5e3a059 add end-sem pyqs for HPC (may june 2023, may-june 2024) 2026-05-15 01:41:50 +05:30
notkshitij b8b405da94 fix title in markdown file for practical 4. 2026-05-04 23:53:57 +05:30
notkshitij d3ad26e1ca add link for 4th practical in README. 2026-05-04 23:53:26 +05:30
notkshitij 60783ed8cd add only the program file for CUDA program; practical 4. 2026-05-04 23:52:49 +05:30
notkshitij aaa405c02a add Jupyter notebook for 4th practical; vector addition and matrix multiplication using CUDA C. 2026-05-04 23:51:16 +05:30
notkshitij 5f94348c49 fix formatting and resize attachments in instructions for executing practical 4 code. 2026-05-04 23:50:05 +05:30
notkshitij 4e5913d6e4 add instructions for executing 4th practical in Google Colab. 2026-05-04 23:46:08 +05:30
notkshitij a521ac1ca1 add attachments required for practical 4 (CUDA program) instructions. 2026-05-04 23:45:45 +05:30
16 changed files with 791 additions and 0 deletions
+3
View File
@@ -0,0 +1,3 @@
attachments/change-runtime.png filter=lfs diff=lfs merge=lfs -text
attachments/runtime-navbar.png filter=lfs diff=lfs merge=lfs -text
attachments/select-t4-gpu.png filter=lfs diff=lfs merge=lfs -text
+267
View File
@@ -0,0 +1,267 @@
# Practical-4 (Vector Addition and Matrix Multiplication)
Problem Statement:
Write a CUDA Program for:
1. Addition of two large vectors
2. Matrix Multiplication using CUDA C
---
## Prerequisites
1. Open [Google Colab](https://colab.research.google.com/)
2. Create a new Jupyter Notebook
---
## Steps
### 1. After creating a new Jupyter notebook, click on "Runtime" in the navbar:
<img src="attachments/runtime-navbar.png" alt="Runtime in navbar in Google Colab" width=350>
### 2. Then, choose "Change runtime type":
<img src="attachments/change-runtime.png" alt="Change runtime type option in Runtime section on Google Colab" width=300>
### 3. Select "T4 GPU", and save:
<img src="attachments/select-t4-gpu.png" alt="T4 GPU option selected in Google Colab as Runtime" width=300>
### 4. Check if `nvcc` is installed:
```python3
!nvcc --version
```
### 5. Install `nvcc4jupyter`:
```python3
!pip install nvcc4jupyter
# Or, if the above command fails, comment out the above line and run:
# !pip install git+https://git.kska.io/notkshitij/nvcc.git
```
### 6. Load it:
```python3
%load_ext nvcc4jupyter
```
### 7. Paste the below code in a new code block:
```cu
%%writefile cuda_program.cu
#include <iostream>
#include <cuda.h>
using namespace std;
#define BLOCK_SIZE 2
// Vector Addition Kernel
// Element-wise sum: every thread handles at most one index of C = A + B.
//   A, B : input vectors (device pointers), read at index idx
//   C    : output vector (device pointer), written at index idx
//   N    : number of elements; threads whose index is >= N do nothing
// Only the x components of the block and thread indices are used.
__global__ void vectorAdd(int *A, int *B, int *C, int N) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // The last block may contain threads past the end of the vectors
    // (N need not be a multiple of the block size) - let them exit early.
    if (idx >= N)
        return;

    C[idx] = A[idx] + B[idx];
}
// Matrix Multiplication Kernel
// Computes C = A * B for square N x N matrices stored row-major in flat arrays.
// Each in-range thread produces exactly one output element:
//   row comes from the y components of the block/thread indices,
//   col comes from the x components.
//   A, B : input matrices (device pointers), N * N floats each
//   C    : output matrix (device pointer), N * N floats
//   N    : matrix dimension
// Element offsets are computed in `int`, so N * N must fit in an int.
__global__ void matrixMul(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the matrix must do nothing. Without this guard a launch
    // whose grid over-covers N would read past the inputs and, because
    // row * N + col wraps into the next row, overwrite valid elements of C.
    if (row >= N || col >= N)
        return;

    // Dot product of row `row` of A with column `col` of B.
    float sum = 0.0f;
    for (int n = 0; n < N; ++n)
        sum += A[row * N + n] * B[n * N + col];

    C[row * N + col] = sum;
}
// Vector Addition
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool vectorAddCudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Interactive vector-addition demo.
// Reads the vector length N from stdin, fills A[i] = i and B[i] = 2 * i on the
// host, adds them on the device with the vectorAdd kernel and prints A, B and
// the result. Invalid input or any CUDA failure is reported on stderr and the
// result is not printed; host and device buffers are released in every case.
void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    // Reject unreadable, zero and negative sizes: a negative N would reach
    // new[] and a zero N would request a launch with zero blocks.
    if (!(cin >> N) || N <= 0) {
        cerr << "Vector size must be a positive integer." << endl;
        return;
    }

    // Byte count is kept in size_t so it cannot overflow an int for large N.
    const size_t size = static_cast<size_t>(N) * sizeof(int);

    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];
    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }

    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;

    // Device allocation and transfer. Pointers start out null so the cleanup
    // below is safe even if an early step fails; && stops at the first failure.
    int *dA = 0, *dB = 0, *dC = 0;
    bool ok = vectorAddCudaOk(cudaMalloc(&dA, size), "cudaMalloc A")
           && vectorAddCudaOk(cudaMalloc(&dB, size), "cudaMalloc B")
           && vectorAddCudaOk(cudaMalloc(&dC, size), "cudaMalloc C")
           && vectorAddCudaOk(cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice), "copy A to device")
           && vectorAddCudaOk(cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // Ceiling division without forming N + BLOCK_SIZE - 1, which could
        // overflow for N close to the int maximum.
        const int numBlocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);

        // A kernel launch returns nothing; a bad launch configuration is only
        // visible through cudaGetLastError(). The copy back then reports any
        // failure that happened while the kernel ran.
        ok = vectorAddCudaOk(cudaGetLastError(), "vectorAdd launch")
          && vectorAddCudaOk(cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (ok) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++) cout << hC[i] << " ";
        cout << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
// Matrix Multiplication
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool matrixMulCudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Prints an n x n row-major matrix, one row per line, elements separated
// (and followed) by a single space.
static void printSquareMatrix(const float *m, int n) {
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++)
            cout << m[row * n + col] << " ";
        cout << endl;
    }
}

// Interactive matrix-multiplication demo.
// Reads K from stdin, builds two N x N matrices (N = K * BLOCK_SIZE) filled
// with 2 and 4 respectively, multiplies them on the device with the matrixMul
// kernel and prints both inputs and the product. Invalid input or any CUDA
// failure is reported on stderr and the result is not printed; host and device
// buffers are released in every case.
void runMatrixMultiplication() {
    int K = 0;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";

    // matrixMul computes element offsets in int; 46340 is the largest N whose
    // square still fits in a 32-bit int (assumes int is 32 bits wide).
    const int maxK = 46340 / BLOCK_SIZE;
    if (!(cin >> K) || K <= 0 || K > maxK) {
        cerr << "K must be an integer between 1 and " << maxK << "." << endl;
        return;
    }
    const int N = K * BLOCK_SIZE;
    cout << "Matrix size: " << N << " x " << N << endl;

    // Element and byte counts are kept in size_t so N * N * sizeof(float)
    // is never truncated to int.
    const size_t count = static_cast<size_t>(N) * N;
    const size_t size = count * sizeof(float);

    // Host allocation and initialisation
    float *hA = new float[count];
    float *hB = new float[count];
    float *hC = new float[count];
    for (size_t i = 0; i < count; i++) {
        hA[i] = 2;
        hB[i] = 4;
    }

    cout << "\nMatrix A:\n";
    printSquareMatrix(hA, N);
    cout << "\nMatrix B:\n";
    printSquareMatrix(hB, N);

    // Device allocation and transfer. Pointers start out null so the cleanup
    // below is safe even if an early step fails; && stops at the first failure.
    float *dA = 0, *dB = 0, *dC = 0;
    bool ok = matrixMulCudaOk(cudaMalloc(&dA, size), "cudaMalloc A")
           && matrixMulCudaOk(cudaMalloc(&dB, size), "cudaMalloc B")
           && matrixMulCudaOk(cudaMalloc(&dC, size), "cudaMalloc C")
           && matrixMulCudaOk(cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice), "copy A to device")
           && matrixMulCudaOk(cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);

        // A kernel launch returns nothing; a bad launch configuration is only
        // visible through cudaGetLastError(). The copy back then reports any
        // failure that happened while the kernel ran.
        ok = matrixMulCudaOk(cudaGetLastError(), "matrixMul launch")
          && matrixMulCudaOk(cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (ok) {
        cout << "\nResult C = A * B:\n";
        printSquareMatrix(hC, N);
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
// Program entry point: runs the vector-addition demo, then the
// matrix-multiplication demo, and prints a closing line.
int main()
{
    runVectorAddition();
    runMatrixMultiplication();

    cout << "\nFinished.";
    cout << endl;

    return 0;
}
```
### 8. Compile and run:
```python3
!nvcc cuda_program.cu -o cuda_program && ./cuda_program
```
---
## Sample output
```md
=== Vector Addition ===
Enter vector size: 2
Vector A: 0 1
Vector B: 0 2
Result A + B: 0 3
=== Matrix Multiplication ===
Enter K (matrix will be N x N where N = K * 2): 3
Matrix size: 6 x 6
Matrix A:
2 2 2 2 2 2
2 2 2 2 2 2
2 2 2 2 2 2
2 2 2 2 2 2
2 2 2 2 2 2
2 2 2 2 2 2
Matrix B:
4 4 4 4 4 4
4 4 4 4 4 4
4 4 4 4 4 4
4 4 4 4 4 4
4 4 4 4 4 4
4 4 4 4 4 4
Result C = A * B:
48 48 48 48 48 48
48 48 48 48 48 48
48 48 48 48 48 48
48 48 48 48 48 48
48 48 48 48 48 48
48 48 48 48 48 48
Finished.
```
---
+348
View File
@@ -0,0 +1,348 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "sSZ5XEy-IFoj",
"outputId": "8bac00c3-0327-4682-f636-b6b253db5201"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"nvcc: NVIDIA (R) Cuda compiler driver\n",
"Copyright (c) 2005-2025 NVIDIA Corporation\n",
"Built on Fri_Feb_21_20:23:50_PST_2025\n",
"Cuda compilation tools, release 12.8, V12.8.93\n",
"Build cuda_12.8.r12.8/compiler.35583870_0\n"
]
}
],
"source": [
"!nvcc --version"
]
},
{
"cell_type": "code",
"source": [
"!pip install nvcc4jupyter\n",
"# Or if the above command fails, comment the above line and run\n",
"# !pip install git+https://git.kska.io/notkshitij/nvcc.git"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "1jHq-AKfIINd",
"outputId": "818ccbb0-9383-4be5-cd79-c9527c5806c9"
},
"execution_count": 25,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: nvcc4jupyter in /usr/local/lib/python3.12/dist-packages (1.2.1)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%load_ext nvcc4jupyter"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "9nuQsRZMIROH",
"outputId": "e8508f1c-9895-4e8c-e7f0-d9e34aab21a1"
},
"execution_count": 16,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"The nvcc4jupyter extension is already loaded. To reload it, use:\n",
" %reload_ext nvcc4jupyter\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%%writefile cuda_program.cu\n",
"#include <iostream>\n",
"#include <cuda.h>\n",
"\n",
"using namespace std;\n",
"\n",
"#define BLOCK_SIZE 2\n",
"\n",
"// Vector Addition Kernel\n",
"// Each thread computes a single element of C = A + B.\n",
"__global__ void vectorAdd(int *A, int *B, int *C, int N) {\n",
" int i = blockIdx.x * blockDim.x + threadIdx.x;\n",
" // Guard against threads beyond the vector size (when N is not a multiple\n",
" // of the block size, some threads in the last block are out of range).\n",
" if (i < N)\n",
" C[i] = A[i] + B[i];\n",
"}\n",
"\n",
"// Matrix Multiplication Kernel\n",
"// Each thread computes a single element of C = A * B.\n",
"// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.\n",
"__global__ void matrixMul(float *A, float *B, float *C, int N) {\n",
" int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
" int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
"\n",
" float sum = 0.0f;\n",
" for (int n = 0; n < N; ++n)\n",
" sum += A[row * N + n] * B[n * N + col];\n",
"\n",
" C[row * N + col] = sum;\n",
"}\n",
"\n",
"// Vector Addition\n",
"void runVectorAddition() {\n",
" int N;\n",
" cout << \"\\n=== Vector Addition ===\" << endl;\n",
" cout << \"Enter vector size: \";\n",
" cin >> N;\n",
"\n",
" int size = N * sizeof(int);\n",
"\n",
" // Host allocation and initialisation\n",
" int *hA = new int[N];\n",
" int *hB = new int[N];\n",
" int *hC = new int[N];\n",
"\n",
" for (int i = 0; i < N; i++) {\n",
" hA[i] = i;\n",
" hB[i] = i * 2;\n",
" }\n",
"\n",
" cout << \"\\nVector A: \";\n",
" for (int i = 0; i < N; i++) cout << hA[i] << \" \";\n",
" cout << \"\\nVector B: \";\n",
" for (int i = 0; i < N; i++) cout << hB[i] << \" \";\n",
" cout << endl;\n",
"\n",
" // Device allocation and transfer\n",
" int *dA, *dB, *dC;\n",
" cudaMalloc(&dA, size);\n",
" cudaMalloc(&dB, size);\n",
" cudaMalloc(&dC, size);\n",
"\n",
" cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
"\n",
" // Launch with enough blocks to cover all N elements.\n",
" // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.\n",
" int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;\n",
" vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);\n",
"\n",
" cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
"\n",
" cout << \"Result A + B: \";\n",
" for (int i = 0; i < N; i++) cout << hC[i] << \" \";\n",
" cout << endl;\n",
"\n",
" delete[] hA;\n",
" delete[] hB;\n",
" delete[] hC;\n",
" cudaFree(dA);\n",
" cudaFree(dB);\n",
" cudaFree(dC);\n",
"}\n",
"\n",
"// Matrix Multiplication\n",
"void runMatrixMultiplication() {\n",
" int K, N;\n",
" cout << \"\\n=== Matrix Multiplication ===\" << endl;\n",
" cout << \"Enter K (matrix will be N x N where N = K * \" << BLOCK_SIZE << \"): \";\n",
" cin >> K;\n",
" N = K * BLOCK_SIZE;\n",
"\n",
" cout << \"Matrix size: \" << N << \" x \" << N << endl;\n",
" int size = N * N * sizeof(float);\n",
"\n",
" // Host allocation and initialisation\n",
" float *hA = new float[N * N];\n",
" float *hB = new float[N * N];\n",
" float *hC = new float[N * N];\n",
"\n",
" for (int j = 0; j < N; j++) {\n",
" for (int i = 0; i < N; i++) {\n",
" hA[j * N + i] = 2;\n",
" hB[j * N + i] = 4;\n",
" }\n",
" }\n",
"\n",
" cout << \"\\nMatrix A:\\n\";\n",
" for (int row = 0; row < N; row++) {\n",
" for (int col = 0; col < N; col++)\n",
" cout << hA[row * N + col] << \" \";\n",
" cout << endl;\n",
" }\n",
"\n",
" cout << \"\\nMatrix B:\\n\";\n",
" for (int row = 0; row < N; row++) {\n",
" for (int col = 0; col < N; col++)\n",
" cout << hB[row * N + col] << \" \";\n",
" cout << endl;\n",
" }\n",
"\n",
" // Device allocation and transfer\n",
" float *dA, *dB, *dC;\n",
" cudaMalloc(&dA, size);\n",
" cudaMalloc(&dB, size);\n",
" cudaMalloc(&dC, size);\n",
"\n",
" cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
" cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
"\n",
" // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.\n",
" // grid: K x K blocks, so total threads = N x N (one per output element).\n",
" dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);\n",
" dim3 grid(K, K);\n",
" matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);\n",
"\n",
" cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
"\n",
" cout << \"\\nResult C = A * B:\\n\";\n",
" for (int row = 0; row < N; row++) {\n",
" for (int col = 0; col < N; col++)\n",
" cout << hC[row * N + col] << \" \";\n",
" cout << endl;\n",
" }\n",
"\n",
" delete[] hA;\n",
" delete[] hB;\n",
" delete[] hC;\n",
" cudaFree(dA);\n",
" cudaFree(dB);\n",
" cudaFree(dC);\n",
"}\n",
"\n",
"int main() {\n",
" runVectorAddition();\n",
" runMatrixMultiplication();\n",
"\n",
" cout << \"\\nFinished.\" << endl;\n",
" return 0;\n",
"}\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "nvCj8UmhIh3o",
"outputId": "54a31aec-f860-4b72-a4e1-03a392def6f6"
},
"execution_count": 23,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Overwriting cuda_program.cu\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!nvcc cuda_program.cu -o cuda_program && ./cuda_program"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "F7fC0LtbJ5o8",
"outputId": "9d5988b2-ad42-4b0b-c84d-c1698e778bb9"
},
"execution_count": 24,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"nvcc warning : Support for offline compilation for architectures prior to '<compute/sm/lto>_75' will be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n",
"\n",
"=== Vector Addition ===\n",
"Enter vector size: 2\n",
"\n",
"Vector A: 0 1 \n",
"Vector B: 0 2 \n",
"Result A + B: 0 3 \n",
"\n",
"=== Matrix Multiplication ===\n",
"Enter K (matrix will be N x N where N = K * 2): 3\n",
"Matrix size: 6 x 6\n",
"\n",
"Matrix A:\n",
"2 2 2 2 2 2 \n",
"2 2 2 2 2 2 \n",
"2 2 2 2 2 2 \n",
"2 2 2 2 2 2 \n",
"2 2 2 2 2 2 \n",
"2 2 2 2 2 2 \n",
"\n",
"Matrix B:\n",
"4 4 4 4 4 4 \n",
"4 4 4 4 4 4 \n",
"4 4 4 4 4 4 \n",
"4 4 4 4 4 4 \n",
"4 4 4 4 4 4 \n",
"4 4 4 4 4 4 \n",
"\n",
"Result C = A * B:\n",
"48 48 48 48 48 48 \n",
"48 48 48 48 48 48 \n",
"48 48 48 48 48 48 \n",
"48 48 48 48 48 48 \n",
"48 48 48 48 48 48 \n",
"48 48 48 48 48 48 \n",
"\n",
"Finished.\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "HjhrulSNKHkq"
},
"execution_count": null,
"outputs": []
}
]
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
+161
View File
@@ -0,0 +1,161 @@
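// %%writefile cuda_program.cu   (Jupyter cell magic, kept as a comment so this file compiles with nvcc; in a notebook cell use it uncommented as the first line)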
#include <iostream>
#include <cuda.h>
using namespace std;
#define BLOCK_SIZE 2
// ─── Vector Addition Kernel ──────────────────────────────────────────────────
// Element-wise sum: every thread handles at most one index of C = A + B.
//   A, B : input vectors (device pointers), read at index idx
//   C    : output vector (device pointer), written at index idx
//   N    : number of elements; threads whose index is >= N do nothing
// Only the x components of the block and thread indices are used.
__global__ void vectorAdd(int *A, int *B, int *C, int N) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // The last block may contain threads past the end of the vectors
    // (N need not be a multiple of the block size) - let them exit early.
    if (idx >= N)
        return;

    C[idx] = A[idx] + B[idx];
}
// ─── Matrix Multiplication Kernel ────────────────────────────────────────────
// Computes C = A * B for square N x N matrices stored row-major in flat arrays.
// Each in-range thread produces exactly one output element:
//   row comes from the y components of the block/thread indices,
//   col comes from the x components.
//   A, B : input matrices (device pointers), N * N floats each
//   C    : output matrix (device pointer), N * N floats
//   N    : matrix dimension
// Element offsets are computed in `int`, so N * N must fit in an int.
__global__ void matrixMul(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the matrix must do nothing. Without this guard a launch
    // whose grid over-covers N would read past the inputs and, because
    // row * N + col wraps into the next row, overwrite valid elements of C.
    if (row >= N || col >= N)
        return;

    // Dot product of row `row` of A with column `col` of B.
    float sum = 0.0f;
    for (int n = 0; n < N; ++n)
        sum += A[row * N + n] * B[n * N + col];

    C[row * N + col] = sum;
}
// ─── Vector Addition ─────────────────────────────────────────────────────────
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool vectorAddCudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Interactive vector-addition demo.
// Reads the vector length N from stdin, fills A[i] = i and B[i] = 2 * i on the
// host, adds them on the device with the vectorAdd kernel and prints A, B and
// the result. Invalid input or any CUDA failure is reported on stderr and the
// result is not printed; host and device buffers are released in every case.
void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    // Reject unreadable, zero and negative sizes: a negative N would reach
    // new[] and a zero N would request a launch with zero blocks.
    if (!(cin >> N) || N <= 0) {
        cerr << "Vector size must be a positive integer." << endl;
        return;
    }

    // Byte count is kept in size_t so it cannot overflow an int for large N.
    const size_t size = static_cast<size_t>(N) * sizeof(int);

    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];
    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }

    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;

    // Device allocation and transfer. Pointers start out null so the cleanup
    // below is safe even if an early step fails; && stops at the first failure.
    int *dA = 0, *dB = 0, *dC = 0;
    bool ok = vectorAddCudaOk(cudaMalloc(&dA, size), "cudaMalloc A")
           && vectorAddCudaOk(cudaMalloc(&dB, size), "cudaMalloc B")
           && vectorAddCudaOk(cudaMalloc(&dC, size), "cudaMalloc C")
           && vectorAddCudaOk(cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice), "copy A to device")
           && vectorAddCudaOk(cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // Ceiling division without forming N + BLOCK_SIZE - 1, which could
        // overflow for N close to the int maximum.
        const int numBlocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);

        // A kernel launch returns nothing; a bad launch configuration is only
        // visible through cudaGetLastError(). The copy back then reports any
        // failure that happened while the kernel ran.
        ok = vectorAddCudaOk(cudaGetLastError(), "vectorAdd launch")
          && vectorAddCudaOk(cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (ok) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++) cout << hC[i] << " ";
        cout << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
// ─── Matrix Multiplication ───────────────────────────────────────────────────
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool matrixMulCudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Prints an n x n row-major matrix, one row per line, elements separated
// (and followed) by a single space.
static void printSquareMatrix(const float *m, int n) {
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++)
            cout << m[row * n + col] << " ";
        cout << endl;
    }
}

// Interactive matrix-multiplication demo.
// Reads K from stdin, builds two N x N matrices (N = K * BLOCK_SIZE) filled
// with 2 and 4 respectively, multiplies them on the device with the matrixMul
// kernel and prints both inputs and the product. Invalid input or any CUDA
// failure is reported on stderr and the result is not printed; host and device
// buffers are released in every case.
void runMatrixMultiplication() {
    int K = 0;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";

    // matrixMul computes element offsets in int; 46340 is the largest N whose
    // square still fits in a 32-bit int (assumes int is 32 bits wide).
    const int maxK = 46340 / BLOCK_SIZE;
    if (!(cin >> K) || K <= 0 || K > maxK) {
        cerr << "K must be an integer between 1 and " << maxK << "." << endl;
        return;
    }
    const int N = K * BLOCK_SIZE;
    cout << "Matrix size: " << N << " x " << N << endl;

    // Element and byte counts are kept in size_t so N * N * sizeof(float)
    // is never truncated to int.
    const size_t count = static_cast<size_t>(N) * N;
    const size_t size = count * sizeof(float);

    // Host allocation and initialisation
    float *hA = new float[count];
    float *hB = new float[count];
    float *hC = new float[count];
    for (size_t i = 0; i < count; i++) {
        hA[i] = 2;
        hB[i] = 4;
    }

    cout << "\nMatrix A:\n";
    printSquareMatrix(hA, N);
    cout << "\nMatrix B:\n";
    printSquareMatrix(hB, N);

    // Device allocation and transfer. Pointers start out null so the cleanup
    // below is safe even if an early step fails; && stops at the first failure.
    float *dA = 0, *dB = 0, *dC = 0;
    bool ok = matrixMulCudaOk(cudaMalloc(&dA, size), "cudaMalloc A")
           && matrixMulCudaOk(cudaMalloc(&dB, size), "cudaMalloc B")
           && matrixMulCudaOk(cudaMalloc(&dC, size), "cudaMalloc C")
           && matrixMulCudaOk(cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice), "copy A to device")
           && matrixMulCudaOk(cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);

        // A kernel launch returns nothing; a bad launch configuration is only
        // visible through cudaGetLastError(). The copy back then reports any
        // failure that happened while the kernel ran.
        ok = matrixMulCudaOk(cudaGetLastError(), "matrixMul launch")
          && matrixMulCudaOk(cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (ok) {
        cout << "\nResult C = A * B:\n";
        printSquareMatrix(hC, N);
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
// Program entry point: runs the vector-addition demo, then the
// matrix-multiplication demo, and prints a closing line.
int main()
{
    runVectorAddition();
    runMatrixMultiplication();

    cout << "\nFinished.";
    cout << endl;

    return 0;
}
+3
View File
@@ -13,6 +13,7 @@ This repository compiles essential resources for the SPPU Computer Engineering P
1. [Code-1 (Parallel BFS and DFS)](Codes/Code-1.cpp)
2. [Code-2 (Sequential and Parallel Bubble Sort and Merge Sort)](Codes/Code-2.cpp)
3. [Code-3 (Min, Max, Sum, Average)](Codes/Code-3.cpp)
4. [Code-4 (Vector Addition and Matrix Multiplication)](Codes/Code-4/)
### Practical
@@ -38,6 +39,8 @@ This repository compiles essential resources for the SPPU Computer Engineering P
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)
### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers/)
---
## Miscellaneous