Compare commits
16 Commits
4e5913d6e4
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
58d38f9199
|
|||
|
ccca13880d
|
|||
|
ccd6640ac9
|
|||
|
1dcb16981b
|
|||
|
344db2f477
|
|||
|
3f3b1a1978
|
|||
|
786d318b88
|
|||
|
fefa2383bb
|
|||
|
a90631ce37
|
|||
|
7a6b281521
|
|||
|
84b5e3a059
|
|||
|
b8b405da94
|
|||
|
d3ad26e1ca
|
|||
|
60783ed8cd
|
|||
|
aaa405c02a
|
|||
|
5f94348c49
|
@@ -1,9 +1,9 @@
|
|||||||
# Practical-4 (CUDA Programs for Addition and Multiplication)
|
# Practical-4 (Vector Addition and Matrix Multiplication)
|
||||||
|
|
||||||
Problem Statement:
|
Problem Statement:
|
||||||
Write a CUDA Program for:
|
Write a CUDA Program for:
|
||||||
1. Addition of two large vectors
|
1. Addition of two large vectors
|
||||||
2. 2. Matrix Multiplication using CUDA C
|
2. Matrix Multiplication using CUDA C
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -18,15 +18,15 @@ Write a CUDA Program for:
|
|||||||
|
|
||||||
### 1. After creating a new Jupyter notebook, click on "Runtime" in the navbar:
|
### 1. After creating a new Jupyter notebook, click on "Runtime" in the navbar:
|
||||||
|
|
||||||

|
<img src="attachments/runtime-navbar.png" alt="Runtime in navbar in Google Colab" width=350>
|
||||||
|
|
||||||
### 2. Then, choose "Change runtime type":
|
### 2. Then, choose "Change runtime type":
|
||||||
|
|
||||||

|
<img src="attachments/change-runtime.png" alt="Change runtime type option in Runtime section on Google Colab" width=300>
|
||||||
|
|
||||||
### 3. Select "T4 GPU", and save:
|
### 3. Select "T4 GPU", and save:
|
||||||
|
|
||||||

|
<img src="attachments/select-t4-gpu.png" alt="T4 GPU option selected in Google Colab as Runtime" width=300>
|
||||||
|
|
||||||
### 4. Check if `nvcc` is installed:
|
### 4. Check if `nvcc` is installed:
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,348 @@
|
|||||||
|
{
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 0,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"provenance": [],
|
||||||
|
"gpuType": "T4"
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python3",
|
||||||
|
"display_name": "Python 3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"name": "python"
|
||||||
|
},
|
||||||
|
"accelerator": "GPU"
|
||||||
|
},
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "sSZ5XEy-IFoj",
|
||||||
|
"outputId": "8bac00c3-0327-4682-f636-b6b253db5201"
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"nvcc: NVIDIA (R) Cuda compiler driver\n",
|
||||||
|
"Copyright (c) 2005-2025 NVIDIA Corporation\n",
|
||||||
|
"Built on Fri_Feb_21_20:23:50_PST_2025\n",
|
||||||
|
"Cuda compilation tools, release 12.8, V12.8.93\n",
|
||||||
|
"Build cuda_12.8.r12.8/compiler.35583870_0\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"!nvcc --version"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"!pip install nvcc4jupyter\n",
|
||||||
|
"# Or if the above command fails, comment the above line and run\n",
|
||||||
|
"# !pip install git+https://git.kska.io/notkshitij/nvcc.git"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "1jHq-AKfIINd",
|
||||||
|
"outputId": "818ccbb0-9383-4be5-cd79-c9527c5806c9"
|
||||||
|
},
|
||||||
|
"execution_count": 25,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Requirement already satisfied: nvcc4jupyter in /usr/local/lib/python3.12/dist-packages (1.2.1)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"%load_ext nvcc4jupyter"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "9nuQsRZMIROH",
|
||||||
|
"outputId": "e8508f1c-9895-4e8c-e7f0-d9e34aab21a1"
|
||||||
|
},
|
||||||
|
"execution_count": 16,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"The nvcc4jupyter extension is already loaded. To reload it, use:\n",
|
||||||
|
" %reload_ext nvcc4jupyter\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"%%writefile cuda_program.cu\n",
|
||||||
|
"#include <iostream>\n",
|
||||||
|
"#include <cuda.h>\n",
|
||||||
|
"\n",
|
||||||
|
"using namespace std;\n",
|
||||||
|
"\n",
|
||||||
|
"#define BLOCK_SIZE 2\n",
|
||||||
|
"\n",
|
||||||
|
"// Vector Addition Kernel\n",
|
||||||
|
"// Each thread computes a single element of C = A + B.\n",
|
||||||
|
"__global__ void vectorAdd(int *A, int *B, int *C, int N) {\n",
|
||||||
|
" int i = blockIdx.x * blockDim.x + threadIdx.x;\n",
|
||||||
|
" // Guard against threads beyond the vector size (when N is not a multiple\n",
|
||||||
|
" // of the block size, some threads in the last block are out of range).\n",
|
||||||
|
" if (i < N)\n",
|
||||||
|
" C[i] = A[i] + B[i];\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"// Matrix Multiplication Kernel\n",
|
||||||
|
"// Each thread computes a single element of C = A * B.\n",
|
||||||
|
"// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.\n",
|
||||||
|
"__global__ void matrixMul(float *A, float *B, float *C, int N) {\n",
|
||||||
|
" int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
|
||||||
|
" int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
|
||||||
|
"\n",
|
||||||
|
" float sum = 0.0f;\n",
|
||||||
|
" for (int n = 0; n < N; ++n)\n",
|
||||||
|
" sum += A[row * N + n] * B[n * N + col];\n",
|
||||||
|
"\n",
|
||||||
|
" C[row * N + col] = sum;\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"// Vector Addition\n",
|
||||||
|
"void runVectorAddition() {\n",
|
||||||
|
" int N;\n",
|
||||||
|
" cout << \"\\n=== Vector Addition ===\" << endl;\n",
|
||||||
|
" cout << \"Enter vector size: \";\n",
|
||||||
|
" cin >> N;\n",
|
||||||
|
"\n",
|
||||||
|
" int size = N * sizeof(int);\n",
|
||||||
|
"\n",
|
||||||
|
" // Host allocation and initialisation\n",
|
||||||
|
" int *hA = new int[N];\n",
|
||||||
|
" int *hB = new int[N];\n",
|
||||||
|
" int *hC = new int[N];\n",
|
||||||
|
"\n",
|
||||||
|
" for (int i = 0; i < N; i++) {\n",
|
||||||
|
" hA[i] = i;\n",
|
||||||
|
" hB[i] = i * 2;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"\\nVector A: \";\n",
|
||||||
|
" for (int i = 0; i < N; i++) cout << hA[i] << \" \";\n",
|
||||||
|
" cout << \"\\nVector B: \";\n",
|
||||||
|
" for (int i = 0; i < N; i++) cout << hB[i] << \" \";\n",
|
||||||
|
" cout << endl;\n",
|
||||||
|
"\n",
|
||||||
|
" // Device allocation and transfer\n",
|
||||||
|
" int *dA, *dB, *dC;\n",
|
||||||
|
" cudaMalloc(&dA, size);\n",
|
||||||
|
" cudaMalloc(&dB, size);\n",
|
||||||
|
" cudaMalloc(&dC, size);\n",
|
||||||
|
"\n",
|
||||||
|
" cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
|
||||||
|
" cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
|
||||||
|
"\n",
|
||||||
|
" // Launch with enough blocks to cover all N elements.\n",
|
||||||
|
" // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.\n",
|
||||||
|
" int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;\n",
|
||||||
|
" vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);\n",
|
||||||
|
"\n",
|
||||||
|
" cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"Result A + B: \";\n",
|
||||||
|
" for (int i = 0; i < N; i++) cout << hC[i] << \" \";\n",
|
||||||
|
" cout << endl;\n",
|
||||||
|
"\n",
|
||||||
|
" delete[] hA;\n",
|
||||||
|
" delete[] hB;\n",
|
||||||
|
" delete[] hC;\n",
|
||||||
|
" cudaFree(dA);\n",
|
||||||
|
" cudaFree(dB);\n",
|
||||||
|
" cudaFree(dC);\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"// Matrix Multiplication\n",
|
||||||
|
"void runMatrixMultiplication() {\n",
|
||||||
|
" int K, N;\n",
|
||||||
|
" cout << \"\\n=== Matrix Multiplication ===\" << endl;\n",
|
||||||
|
" cout << \"Enter K (matrix will be N x N where N = K * \" << BLOCK_SIZE << \"): \";\n",
|
||||||
|
" cin >> K;\n",
|
||||||
|
" N = K * BLOCK_SIZE;\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"Matrix size: \" << N << \" x \" << N << endl;\n",
|
||||||
|
" int size = N * N * sizeof(float);\n",
|
||||||
|
"\n",
|
||||||
|
" // Host allocation and initialisation\n",
|
||||||
|
" float *hA = new float[N * N];\n",
|
||||||
|
" float *hB = new float[N * N];\n",
|
||||||
|
" float *hC = new float[N * N];\n",
|
||||||
|
"\n",
|
||||||
|
" for (int j = 0; j < N; j++) {\n",
|
||||||
|
" for (int i = 0; i < N; i++) {\n",
|
||||||
|
" hA[j * N + i] = 2;\n",
|
||||||
|
" hB[j * N + i] = 4;\n",
|
||||||
|
" }\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"\\nMatrix A:\\n\";\n",
|
||||||
|
" for (int row = 0; row < N; row++) {\n",
|
||||||
|
" for (int col = 0; col < N; col++)\n",
|
||||||
|
" cout << hA[row * N + col] << \" \";\n",
|
||||||
|
" cout << endl;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"\\nMatrix B:\\n\";\n",
|
||||||
|
" for (int row = 0; row < N; row++) {\n",
|
||||||
|
" for (int col = 0; col < N; col++)\n",
|
||||||
|
" cout << hB[row * N + col] << \" \";\n",
|
||||||
|
" cout << endl;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" // Device allocation and transfer\n",
|
||||||
|
" float *dA, *dB, *dC;\n",
|
||||||
|
" cudaMalloc(&dA, size);\n",
|
||||||
|
" cudaMalloc(&dB, size);\n",
|
||||||
|
" cudaMalloc(&dC, size);\n",
|
||||||
|
"\n",
|
||||||
|
" cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
|
||||||
|
" cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
|
||||||
|
"\n",
|
||||||
|
" // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.\n",
|
||||||
|
" // grid: K x K blocks, so total threads = N x N (one per output element).\n",
|
||||||
|
" dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);\n",
|
||||||
|
" dim3 grid(K, K);\n",
|
||||||
|
" matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);\n",
|
||||||
|
"\n",
|
||||||
|
" cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"\\nResult C = A * B:\\n\";\n",
|
||||||
|
" for (int row = 0; row < N; row++) {\n",
|
||||||
|
" for (int col = 0; col < N; col++)\n",
|
||||||
|
" cout << hC[row * N + col] << \" \";\n",
|
||||||
|
" cout << endl;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" delete[] hA;\n",
|
||||||
|
" delete[] hB;\n",
|
||||||
|
" delete[] hC;\n",
|
||||||
|
" cudaFree(dA);\n",
|
||||||
|
" cudaFree(dB);\n",
|
||||||
|
" cudaFree(dC);\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"int main() {\n",
|
||||||
|
" runVectorAddition();\n",
|
||||||
|
" runMatrixMultiplication();\n",
|
||||||
|
"\n",
|
||||||
|
" cout << \"\\nFinished.\" << endl;\n",
|
||||||
|
" return 0;\n",
|
||||||
|
"}\n"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "nvCj8UmhIh3o",
|
||||||
|
"outputId": "54a31aec-f860-4b72-a4e1-03a392def6f6"
|
||||||
|
},
|
||||||
|
"execution_count": 23,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Overwriting cuda_program.cu\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [
|
||||||
|
"!nvcc cuda_program.cu -o cuda_program && ./cuda_program"
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"base_uri": "https://localhost:8080/"
|
||||||
|
},
|
||||||
|
"id": "F7fC0LtbJ5o8",
|
||||||
|
"outputId": "9d5988b2-ad42-4b0b-c84d-c1698e778bb9"
|
||||||
|
},
|
||||||
|
"execution_count": 24,
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"nvcc warning : Support for offline compilation for architectures prior to '<compute/sm/lto>_75' will be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n",
|
||||||
|
"\n",
|
||||||
|
"=== Vector Addition ===\n",
|
||||||
|
"Enter vector size: 2\n",
|
||||||
|
"\n",
|
||||||
|
"Vector A: 0 1 \n",
|
||||||
|
"Vector B: 0 2 \n",
|
||||||
|
"Result A + B: 0 3 \n",
|
||||||
|
"\n",
|
||||||
|
"=== Matrix Multiplication ===\n",
|
||||||
|
"Enter K (matrix will be N x N where N = K * 2): 3\n",
|
||||||
|
"Matrix size: 6 x 6\n",
|
||||||
|
"\n",
|
||||||
|
"Matrix A:\n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"2 2 2 2 2 2 \n",
|
||||||
|
"\n",
|
||||||
|
"Matrix B:\n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"4 4 4 4 4 4 \n",
|
||||||
|
"\n",
|
||||||
|
"Result C = A * B:\n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"48 48 48 48 48 48 \n",
|
||||||
|
"\n",
|
||||||
|
"Finished.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"source": [],
|
||||||
|
"metadata": {
|
||||||
|
"id": "HjhrulSNKHkq"
|
||||||
|
},
|
||||||
|
"execution_count": null,
|
||||||
|
"outputs": []
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,161 @@
|
|||||||
|
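// %%writefile cuda_program.cu   <- Jupyter/Colab only: paste as the first line of the cell (without the leading "//") to save this source before compiling.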
|
||||||
|
#include <iostream>
|
||||||
|
#include <cuda.h>
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
#define BLOCK_SIZE 2
|
||||||
|
|
||||||
|
// ─── Vector Addition Kernel ──────────────────────────────────────────────────
// Element-wise sum of two int vectors: C[k] = A[k] + B[k] for 0 <= k < N.
// The element index is built from the x components of blockIdx, blockDim and
// threadIdx only, and each thread writes at most one element of C.
__global__ void vectorAdd(int *A, int *B, int *C, int N) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;

    // The launch rounds the block count up, so the last block may contain
    // threads whose index lies past the end of the vectors; they do nothing.
    if (idx >= N)
        return;

    C[idx] = B[idx] + A[idx];
}
|
||||||
|
|
||||||
|
// ─── Matrix Multiplication Kernel ────────────────────────────────────────────
// Computes C = A * B for square N x N matrices stored row-major in flat arrays.
// Thread (row, col) produces one output element: the dot product of row `row`
// of A with column `col` of B. `row` comes from the y components of the launch
// indices and `col` from the x components, so a 2D launch is expected.
//
// A, B : input matrices, N * N floats each
// C    : output matrix, N * N floats
// N    : matrix dimension
__global__ void matrixMul(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the matrix (possible whenever the grid covers more than
    // N threads in either dimension) must not touch memory.
    if (row >= N || col >= N)
        return;

    // Offsets are computed as size_t so that row * N does not overflow a
    // 32-bit int for large N.
    const size_t rowBase = static_cast<size_t>(row) * N;

    float sum = 0.0f;
    for (int n = 0; n < N; ++n)
        sum += A[rowBase + n] * B[static_cast<size_t>(n) * N + col];

    C[rowBase + col] = sum;
}
|
||||||
|
|
||||||
|
// ─── Vector Addition ─────────────────────────────────────────────────────────
// Reports a failed CUDA runtime call made by runVectorAddition on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool vectorAddCudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Interactive vector-addition demo.
// Reads the vector length N from stdin, fills A[i] = i and B[i] = 2 * i on the
// host, computes C = A + B with the vectorAdd kernel and prints A, B and C.
// A non-numeric or non-positive size, or any failing CUDA call, is reported on
// stderr; in that case no result is printed and all buffers are still released.
void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    if (!(cin >> N) || N <= 0) {
        cerr << "\nInvalid vector size: expected a positive integer." << endl;
        return;
    }

    // Byte count is kept in size_t: N * sizeof(int) no longer fits in an int
    // once N reaches 2^29.
    const size_t size = static_cast<size_t>(N) * sizeof(int);

    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];

    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }

    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;

    // Device allocation and transfer. The pointers start out null so the
    // cudaFree calls below are harmless if an earlier step failed.
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    bool ok = vectorAddCudaOk(cudaMalloc(&dA, size), "cudaMalloc A")
           && vectorAddCudaOk(cudaMalloc(&dB, size), "cudaMalloc B")
           && vectorAddCudaOk(cudaMalloc(&dC, size), "cudaMalloc C")
           && vectorAddCudaOk(cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice), "copy A to device")
           && vectorAddCudaOk(cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // Launch with enough blocks to cover all N elements.
        // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.
        int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);

        // A launch returns no status itself: configuration errors are read
        // with cudaGetLastError(), and the blocking copy back to the host
        // reports anything that went wrong while the kernel ran.
        ok = vectorAddCudaOk(cudaGetLastError(), "vectorAdd launch")
          && vectorAddCudaOk(cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (ok) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++) cout << hC[i] << " ";
        cout << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
|
||||||
|
|
||||||
|
// ─── Matrix Multiplication ───────────────────────────────────────────────────
// Reports a failed CUDA runtime call made by runMatrixMultiplication on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool matrixMulCudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    cerr << "CUDA error (" << what << "): " << cudaGetErrorString(status) << endl;
    return false;
}

// Prints an n x n row-major matrix to stdout: one row per line, every element
// followed by a single space.
static void printSquareMatrix(const float *m, int n) {
    for (int row = 0; row < n; row++) {
        for (int col = 0; col < n; col++)
            cout << m[static_cast<size_t>(row) * n + col] << " ";
        cout << endl;
    }
}

// Interactive matrix-multiplication demo.
// Reads K from stdin and works on N x N matrices with N = K * BLOCK_SIZE.
// A is filled with 2 and B with 4 on the host, C = A * B is computed with the
// matrixMul kernel, and A, B and C are printed.
// A non-numeric or non-positive K, or any failing CUDA call, is reported on
// stderr; in that case no result is printed and all buffers are still released.
void runMatrixMultiplication() {
    int K = 0, N;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
    if (!(cin >> K) || K <= 0) {
        cerr << "\nInvalid K: expected a positive integer." << endl;
        return;
    }
    N = K * BLOCK_SIZE;

    cout << "Matrix size: " << N << " x " << N << endl;

    // Element and byte counts are kept in size_t: N * N * sizeof(float) stops
    // fitting in an int long before N * N itself does.
    const size_t count = static_cast<size_t>(N) * N;
    const size_t size = count * sizeof(float);

    // Host allocation and initialisation
    float *hA = new float[count];
    float *hB = new float[count];
    float *hC = new float[count];

    for (size_t i = 0; i < count; i++) {
        hA[i] = 2;
        hB[i] = 4;
    }

    cout << "\nMatrix A:\n";
    printSquareMatrix(hA, N);

    cout << "\nMatrix B:\n";
    printSquareMatrix(hB, N);

    // Device allocation and transfer. The pointers start out null so the
    // cudaFree calls below are harmless if an earlier step failed.
    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    bool ok = matrixMulCudaOk(cudaMalloc(&dA, size), "cudaMalloc A")
           && matrixMulCudaOk(cudaMalloc(&dB, size), "cudaMalloc B")
           && matrixMulCudaOk(cudaMalloc(&dC, size), "cudaMalloc C")
           && matrixMulCudaOk(cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice), "copy A to device")
           && matrixMulCudaOk(cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice), "copy B to device");

    if (ok) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);

        // A launch returns no status itself: configuration errors (for example
        // a grid that is too large) are read with cudaGetLastError(), and the
        // blocking copy back to the host reports anything that went wrong
        // while the kernel ran.
        ok = matrixMulCudaOk(cudaGetLastError(), "matrixMul launch")
          && matrixMulCudaOk(cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost), "copy C to host");
    }

    if (ok) {
        cout << "\nResult C = A * B:\n";
        printSquareMatrix(hC, N);
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
|
||||||
|
|
||||||
|
// Program entry point: runs the vector-addition demo, then the
// matrix-multiplication demo, and prints a closing line.
// Always returns 0.
int main() {
    // Part 1 of the practical: vector addition.
    runVectorAddition();

    // Part 2 of the practical: matrix multiplication.
    runMatrixMultiplication();

    std::cout << "\nFinished." << std::endl;
    return 0;
}
|
||||||
BIN
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
@@ -13,6 +13,7 @@ This repository compiles essential resources for the SPPU Computer Engineering P
|
|||||||
1. [Code-1 (Parallel BFS and DFS)](Codes/Code-1.cpp)
|
1. [Code-1 (Parallel BFS and DFS)](Codes/Code-1.cpp)
|
||||||
2. [Code-2 (Sequential and Parallel Bubble Sort and Merge Sort)](Codes/Code-2.cpp)
|
2. [Code-2 (Sequential and Parallel Bubble Sort and Merge Sort)](Codes/Code-2.cpp)
|
||||||
3. [Code-3 (Min, Max, Sum, Average)](Codes/Code-3.cpp)
|
3. [Code-3 (Min, Max, Sum, Average)](Codes/Code-3.cpp)
|
||||||
|
4. [Code-4 (Vector Addition and Matrix Multiplication)](Codes/Code-4/)
|
||||||
|
|
||||||
### Practical
|
### Practical
|
||||||
|
|
||||||
@@ -38,6 +39,8 @@ This repository compiles essential resources for the SPPU Computer Engineering P
|
|||||||
|
|
||||||
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)
|
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)
|
||||||
|
|
||||||
|
### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers/)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Miscellaneous
|
## Miscellaneous
|
||||||
|
|||||||
Reference in New Issue
Block a user