From 4e5913d6e49b4eae9f531b68e17a5da99aa91270 Mon Sep 17 00:00:00 2001 From: Kshitij Date: Mon, 4 May 2026 23:46:08 +0530 Subject: [PATCH] add instructions for executing 4th practical in Google Colab. --- Codes/Code-4/Code-4-Steps.md | 267 +++++++++++++++++++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 Codes/Code-4/Code-4-Steps.md diff --git a/Codes/Code-4/Code-4-Steps.md b/Codes/Code-4/Code-4-Steps.md new file mode 100644 index 0000000..dcdc434 --- /dev/null +++ b/Codes/Code-4/Code-4-Steps.md @@ -0,0 +1,267 @@ +# Practical-4 (CUDA Programs for Addition and Multiplication) + +Problem Statement: +Write a CUDA Program for: +1. Addition of two large vectors +2. Matrix Multiplication using CUDA C + +--- + +## Prerequisites + +1. Open [Google Colab](https://colab.research.google.com/) +2. Create a new Jupyter Notebook + +--- + +## Steps + +### 1. After creating a new Jupyter notebook, click on "Runtime" in the navbar: + +![Runtime in navbar in Google Colab](attachments/runtime-navbar.png) + +### 2. Then, choose "Change runtime type": + +![Change runtime type option in Runtime section on Google Colab](attachments/change-runtime.png) + +### 3. Select "T4 GPU", and save: + +![T4 GPU option selected in Google Colab as Runtime](attachments/select-t4-gpu.png) + +### 4. Check if `nvcc` is installed: + +```python3 +!nvcc --version +``` + +### 5. Install `nvcc4jupyter`: + +```python3 +!pip install nvcc4jupyter +# Or if the above command fails, comment out the above line and run +# !pip install git+https://git.kska.io/notkshitij/nvcc.git +``` + +### 6. Load it: + +```python3 +%load_ext nvcc4jupyter +``` + +### 7. Paste the code below into a new code cell: + +```cu +%%writefile cuda_program.cu +#include <iostream> +#include <cuda_runtime.h> + +using namespace std; + +#define BLOCK_SIZE 2 + +// Vector Addition Kernel +// Each thread computes a single element of C = A + B. 
+__global__ void vectorAdd(int *A, int *B, int *C, int N) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + // Guard against threads beyond the vector size (when N is not a multiple + // of the block size, some threads in the last block are out of range). + if (i < N) + C[i] = A[i] + B[i]; +} + +// Matrix Multiplication Kernel +// Each thread computes a single element of C = A * B. +// Thread (row, col) sums the dot product of row `row` of A with column `col` of B. +__global__ void matrixMul(float *A, float *B, float *C, int N) { + int row = blockIdx.y * blockDim.y + threadIdx.y; + int col = blockIdx.x * blockDim.x + threadIdx.x; + + float sum = 0.0f; + for (int n = 0; n < N; ++n) + sum += A[row * N + n] * B[n * N + col]; + + C[row * N + col] = sum; +} + +// Vector Addition +void runVectorAddition() { + int N; + cout << "\n=== Vector Addition ===" << endl; + cout << "Enter vector size: "; + cin >> N; + + int size = N * sizeof(int); + + // Host allocation and initialisation + int *hA = new int[N]; + int *hB = new int[N]; + int *hC = new int[N]; + + for (int i = 0; i < N; i++) { + hA[i] = i; + hB[i] = i * 2; + } + + cout << "\nVector A: "; + for (int i = 0; i < N; i++) cout << hA[i] << " "; + cout << "\nVector B: "; + for (int i = 0; i < N; i++) cout << hB[i] << " "; + cout << endl; + + // Device allocation and transfer + int *dA, *dB, *dC; + cudaMalloc(&dA, size); + cudaMalloc(&dB, size); + cudaMalloc(&dC, size); + + cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice); + cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice); + + // Launch with enough blocks to cover all N elements. + // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail. 
+ int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE; + vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N); + + cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost); + + cout << "Result A + B: "; + for (int i = 0; i < N; i++) cout << hC[i] << " "; + cout << endl; + + delete[] hA; + delete[] hB; + delete[] hC; + cudaFree(dA); + cudaFree(dB); + cudaFree(dC); +} + +// Matrix Multiplication +void runMatrixMultiplication() { + int K, N; + cout << "\n=== Matrix Multiplication ===" << endl; + cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): "; + cin >> K; + N = K * BLOCK_SIZE; + + cout << "Matrix size: " << N << " x " << N << endl; + int size = N * N * sizeof(float); + + // Host allocation and initialisation + float *hA = new float[N * N]; + float *hB = new float[N * N]; + float *hC = new float[N * N]; + + for (int j = 0; j < N; j++) { + for (int i = 0; i < N; i++) { + hA[j * N + i] = 2; + hB[j * N + i] = 4; + } + } + + cout << "\nMatrix A:\n"; + for (int row = 0; row < N; row++) { + for (int col = 0; col < N; col++) + cout << hA[row * N + col] << " "; + cout << endl; + } + + cout << "\nMatrix B:\n"; + for (int row = 0; row < N; row++) { + for (int col = 0; col < N; col++) + cout << hB[row * N + col] << " "; + cout << endl; + } + + // Device allocation and transfer + float *dA, *dB, *dC; + cudaMalloc(&dA, size); + cudaMalloc(&dB, size); + cudaMalloc(&dC, size); + + cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice); + cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice); + + // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block. + // grid: K x K blocks, so total threads = N x N (one per output element). 
+ dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE); + dim3 grid(K, K); + matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N); + + cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost); + + cout << "\nResult C = A * B:\n"; + for (int row = 0; row < N; row++) { + for (int col = 0; col < N; col++) + cout << hC[row * N + col] << " "; + cout << endl; + } + + delete[] hA; + delete[] hB; + delete[] hC; + cudaFree(dA); + cudaFree(dB); + cudaFree(dC); +} + +int main() { + runVectorAddition(); + runMatrixMultiplication(); + + cout << "\nFinished." << endl; + return 0; +} +``` + +### 8. Compile and run: + +```python3 +!nvcc cuda_program.cu -o cuda_program && ./cuda_program +``` + +--- + +## Sample output + +```md +=== Vector Addition === +Enter vector size: 2 + +Vector A: 0 1 +Vector B: 0 2 +Result A + B: 0 3 + +=== Matrix Multiplication === +Enter K (matrix will be N x N where N = K * 2): 3 +Matrix size: 6 x 6 + +Matrix A: +2 2 2 2 2 2 +2 2 2 2 2 2 +2 2 2 2 2 2 +2 2 2 2 2 2 +2 2 2 2 2 2 +2 2 2 2 2 2 + +Matrix B: +4 4 4 4 4 4 +4 4 4 4 4 4 +4 4 4 4 4 4 +4 4 4 4 4 4 +4 4 4 4 4 4 +4 4 4 4 4 4 + +Result C = A * B: +48 48 48 48 48 48 +48 48 48 48 48 48 +48 48 48 48 48 48 +48 48 48 48 48 48 +48 48 48 48 48 48 +48 48 48 48 48 48 + +Finished. +``` + +--- +