From 4e5913d6e49b4eae9f531b68e17a5da99aa91270 Mon Sep 17 00:00:00 2001
From: Kshitij <notkshitij@git.kska.io>
Date: Mon, 4 May 2026 23:46:08 +0530
Subject: [PATCH] add instructions for executing 4th practical in Google Colab.

---
 Codes/Code-4/Code-4-Steps.md | 267 +++++++++++++++++++++++++++++++++++
 1 file changed, 267 insertions(+)
 create mode 100644 Codes/Code-4/Code-4-Steps.md
diff --git a/Codes/Code-4/Code-4-Steps.md b/Codes/Code-4/Code-4-Steps.md
new file mode 100644
index 0000000..dcdc434
--- /dev/null
+++ b/Codes/Code-4/Code-4-Steps.md
@@ -0,0 +1,267 @@
+# Practical-4 (CUDA Programs for Addition and Multiplication)
+
+Problem Statement:
+Write a CUDA Program for:
+1. Addition of two large vectors
+2. Matrix Multiplication using CUDA C
+
+---
+
+## Prerequisites
+
+1. Open [Google Colab](https://colab.research.google.com/)
+2. Create a new Jupyter Notebook
+
+---
+
+## Steps
+
+### 1. After creating a new Jupyter notebook, click on "Runtime" in the navbar:
+
+![Runtime in navbar in Google Colab](attachments/runtime-navbar.png)
+
+### 2. Then, choose "Change runtime type":
+
+![Change runtime type option in Runtime section on Google Colab](attachments/change-runtime.png)
+
+### 3. Select "T4 GPU", and save:
+
+![T4 GPU option selected in Google Colab as Runtime](attachments/select-t4-gpu.png)
+
+### 4. Check if `nvcc` is installed:
+
+```python3
+!nvcc --version
+```
+
+### 5. Install `nvcc4jupyter`:
+
+```python3
+!pip install nvcc4jupyter
+# If the command above fails, comment it out and run this instead:
+# !pip install git+https://git.kska.io/notkshitij/nvcc.git
+```
+
+### 6. Load it:
+
+```python3
+%load_ext nvcc4jupyter
+```
+
+### 7. Paste the code below into a new code cell:
+
+```cu
+%%writefile cuda_program.cu
+#include <iostream>
+#include <cuda.h>
+
+using namespace std;
+
+#define BLOCK_SIZE 2
+
+// Element-wise vector sum on the GPU: C[k] = A[k] + B[k] for 0 <= k < N.
+// Launched as a 1-D grid; every thread handles at most one element.
+__global__ void vectorAdd(int *A, int *B, int *C, int N) {
+    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
+    // The grid may hold more threads than elements (N need not be a multiple
+    // of the block size), so the surplus threads in the last block just exit.
+    if (idx >= N) return;
+    C[idx] = A[idx] + B[idx];
+}
+
+// Matrix Multiplication Kernel
+// Each thread computes a single element of C = A * B (row-major N x N matrices).
+// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.
+__global__ void matrixMul(float *A, float *B, float *C, int N) {
+    int row = blockIdx.y * blockDim.y + threadIdx.y;
+    int col = blockIdx.x * blockDim.x + threadIdx.x;
+    // Guard the grid tail: a launch that over-covers N must not touch memory.
+    if (row >= N || col >= N) return;
+    float sum = 0.0f;
+    for (int n = 0; n < N; ++n)
+        sum += A[row * N + n] * B[n * N + col];
+    C[row * N + col] = sum;
+}
+
+// Vector Addition
+// Reads N from stdin, fills A[i] = i and B[i] = 2 * i, computes C = A + B on
+// the GPU and prints all three vectors; failures are reported on stderr.
+void runVectorAddition() {
+    int N;
+    cout << "\n=== Vector Addition ===" << endl;
+    cout << "Enter vector size: ";
+    // A failed read or N <= 0 cannot be allocated or launched, so bail out early.
+    if (!(cin >> N) || N <= 0) {
+        cerr << "Invalid vector size." << endl;
+        return;
+    }
+    size_t size = static_cast<size_t>(N) * sizeof(int);  // size_t: no int overflow
+    // Host allocation and initialisation
+    int *hA = new int[N];
+    int *hB = new int[N];
+    int *hC = new int[N];
+    for (int i = 0; i < N; i++) {
+        hA[i] = i;
+        hB[i] = i * 2;
+    }
+
+    cout << "\nVector A: ";
+    for (int i = 0; i < N; i++) cout << hA[i] << " ";
+    cout << "\nVector B: ";
+    for (int i = 0; i < N; i++) cout << hB[i] << " ";
+    cout << endl;
+
+    // Device allocation and transfer; the first failure skips every later step.
+    int *dA = NULL, *dB = NULL, *dC = NULL;  // NULL keeps cudaFree safe on any path
+    cudaError_t err = cudaMalloc(&dA, size);
+    if (err == cudaSuccess) err = cudaMalloc(&dB, size);
+    if (err == cudaSuccess) err = cudaMalloc(&dC, size);
+    if (err == cudaSuccess) err = cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
+    if (err == cudaSuccess) err = cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);
+    if (err == cudaSuccess) {
+        int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;  // ceil-div covers the tail
+        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);
+        err = cudaGetLastError();  // reports an invalid launch configuration
+    }
+    // The blocking copy back also surfaces errors raised while the kernel ran.
+    if (err == cudaSuccess) err = cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);
+    if (err == cudaSuccess) {
+        cout << "Result A + B: ";
+        for (int i = 0; i < N; i++) cout << hC[i] << " ";
+        cout << endl;
+    } else {
+        cerr << "CUDA error: " << cudaGetErrorString(err) << endl;
+    }
+    delete[] hA; delete[] hB; delete[] hC;
+    cudaFree(dA); cudaFree(dB); cudaFree(dC);
+}
+
+// Matrix Multiplication
+// Reads K from stdin, fills two N x N matrices (N = K * BLOCK_SIZE) with 2 and 4,
+// multiplies them on the GPU and prints A, B and C; failures go to stderr.
+void runMatrixMultiplication() {
+    int K, N;
+    cout << "\n=== Matrix Multiplication ===" << endl;
+    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
+    // A failed read or K <= 0 would give an empty or negative-sized matrix.
+    if (!(cin >> K) || K <= 0) {
+        cerr << "Invalid value for K." << endl;
+        return;
+    }
+    N = K * BLOCK_SIZE;
+    cout << "Matrix size: " << N << " x " << N << endl;
+    size_t count = static_cast<size_t>(N) * N;  // size_t: N * N may not fit in int
+    size_t size = count * sizeof(float);
+    // Host allocation and initialisation (every element of A is 2, of B is 4)
+    float *hA = new float[count];
+    float *hB = new float[count];
+    float *hC = new float[count];
+    for (size_t i = 0; i < count; i++) {
+        hA[i] = 2;
+        hB[i] = 4;
+    }
+
+    cout << "\nMatrix A:\n";
+    for (int row = 0; row < N; row++) {
+        for (int col = 0; col < N; col++)
+            cout << hA[row * N + col] << " ";
+        cout << endl;
+    }
+
+    cout << "\nMatrix B:\n";
+    for (int row = 0; row < N; row++) {
+        for (int col = 0; col < N; col++)
+            cout << hB[row * N + col] << " ";
+        cout << endl;
+    }
+
+    // Device allocation and transfer; the first failure skips every later step.
+    float *dA = NULL, *dB = NULL, *dC = NULL;  // NULL keeps cudaFree safe on any path
+    cudaError_t err = cudaMalloc(&dA, size);
+    if (err == cudaSuccess) err = cudaMalloc(&dB, size);
+    if (err == cudaSuccess) err = cudaMalloc(&dC, size);
+    if (err == cudaSuccess) err = cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
+    if (err == cudaSuccess) err = cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);
+    if (err == cudaSuccess) {
+        // K x K blocks of BLOCK_SIZE x BLOCK_SIZE threads: one thread per element of C.
+        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
+        dim3 grid(K, K);
+        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);
+        err = cudaGetLastError();  // reports an invalid launch configuration
+    }
+    // The blocking copy back also surfaces errors raised while the kernel ran.
+    if (err == cudaSuccess) err = cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);
+    if (err == cudaSuccess) {
+        cout << "\nResult C = A * B:\n";
+        for (int row = 0; row < N; row++) {
+            for (int col = 0; col < N; col++)
+                cout << hC[row * N + col] << " ";
+            cout << endl;
+        }
+    } else {
+        cerr << "CUDA error: " << cudaGetErrorString(err) << endl;
+    }
+    delete[] hA; delete[] hB; delete[] hC;
+    cudaFree(dA); cudaFree(dB); cudaFree(dC);
+}
+
+int main() {
+    // Run both demos in order: vector addition first, then matrix multiplication.
+    runVectorAddition();
+    runMatrixMultiplication();
+    cout << "\nFinished." << endl;
+    return 0;
+}
+```
+
+### 8. Compile and run:
+
+```python3
+!nvcc cuda_program.cu -o cuda_program && ./cuda_program
+```
+
+---
+
+## Sample output
+
+```text
+=== Vector Addition ===
+Enter vector size: 2
+
+Vector A: 0 1 
+Vector B: 0 2 
+Result A + B: 0 3 
+
+=== Matrix Multiplication ===
+Enter K (matrix will be N x N where N = K * 2): 3
+Matrix size: 6 x 6
+
+Matrix A:
+2 2 2 2 2 2 
+2 2 2 2 2 2 
+2 2 2 2 2 2 
+2 2 2 2 2 2 
+2 2 2 2 2 2 
+2 2 2 2 2 2 
+
+Matrix B:
+4 4 4 4 4 4 
+4 4 4 4 4 4 
+4 4 4 4 4 4 
+4 4 4 4 4 4 
+4 4 4 4 4 4 
+4 4 4 4 4 4 
+
+Result C = A * B:
+48 48 48 48 48 48 
+48 48 48 48 48 48 
+48 48 48 48 48 48 
+48 48 48 48 48 48 
+48 48 48 48 48 48 
+48 48 48 48 48 48 
+
+Finished.
+```
+
+---
+