add answers for may-june 2025 + november-december 2025 pyqs for unit 6 (High Performance Computing Applications)

add answers for may-june 2025 + november-december 2025 pyqs for unit 5 (CUDA Architecture)
add answers for may-june 2025 + november-december 2025 pyqs for unit 4 (Analytical Modeling of Parallel Programs)
2026-05-26 01:41:12 +05:30 · 2026-05-26 01:34:07 +05:30 · 2026-05-26 01:24:24 +05:30 · 2026-05-26 01:15:06 +05:30 · 2026-05-25 23:20:03 +05:30 · 2026-05-21 19:33:27 +05:30
16 changed files with 795 additions and 0 deletions
@@ -0,0 +1,3 @@
 attachments/change-runtime.png filter=lfs diff=lfs merge=lfs -text
 attachments/runtime-navbar.png filter=lfs diff=lfs merge=lfs -text
 attachments/select-t4-gpu.png filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,267 @@
 # Practical-4 (Vector Addition and Matrix Multiplication)
 Problem Statement:
 Write a CUDA Program for:
 1. Addition of two large vectors
 2. Matrix Multiplication using CUDA C
 ---
 ## Prerequisites
 1. Open [Google Colab](https://colab.research.google.com/)
 2. Create a new Jupyter Notebook
 ---
 ## Steps
 ### 1. After creating a new Jupyter notebook, click on "Runtime" in the navbar:
 <img src="attachments/runtime-navbar.png" alt="Runtime in navbar in Google Colab" width=350>
 ### 2. Then, choose "Change runtime type":
 <img src="attachments/change-runtime.png" alt="Change runtime type option in Runtime section on Google Colab" width=300>
 ### 3. Select "T4 GPU", and save:
 <img src="attachments/select-t4-gpu.png" alt="T4 GPU option selected in Google Colab as Runtime" width=300>
 ### 4. Check if `nvcc` is installed:
 ```python3
 !nvcc --version
 ```
 ### 5. Install `nvcc4jupyter`:
 ```python3
 !pip install nvcc4jupyter
 # Or, if the above command fails, comment out the line above and run:
 # !pip install git+https://git.kska.io/notkshitij/nvcc.git
 ```
 ### 6. Load it:
 ```python3
 %load_ext nvcc4jupyter
 ```
 ### 7. Paste the below code in a new code block:
 ```cu
 %%writefile cuda_program.cu
 #include <iostream>
 #include <cuda.h>
 using namespace std;
 #define BLOCK_SIZE 2
 // Vector Addition Kernel
 // Element-wise sum of two int vectors: C[k] = A[k] + B[k] for 0 <= k < N.
 // Each thread derives one global index from its x block/thread coordinates
 // and handles that single element.
 __global__ void vectorAdd(int *A, int *B, int *C, int N) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // The grid is rounded up to whole blocks, so trailing threads of the last
    // block can land past the end of the vectors; they leave without touching memory.
    if (idx >= N)
        return;
    C[idx] = B[idx] + A[idx];
 }
 // Matrix Multiplication Kernel
 // Computes C = A * B for square N x N float matrices stored row-major
 // (element (r, c) lives at index r * N + c).
 // Expects a 2-D launch: the y coordinate selects the output row and the x
 // coordinate the output column, one thread per element of C.
 __global__ void matrixMul(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // When N is not a multiple of the block edge the grid has to be rounded
    // up, which leaves surplus threads outside the matrix. Without this guard
    // they would read and write out of bounds.
    if (row >= N || col >= N)
        return;
    // Dot product of row `row` of A with column `col` of B.
    float sum = 0.0f;
    for (int n = 0; n < N; ++n)
        sum += A[row * N + n] * B[n * N + col];
    C[row * N + col] = sum;
 }
 // Vector Addition
 // Interactive demo for the vectorAdd kernel. Reads the vector length N from
 // stdin, fills A[i] = i and B[i] = 2 * i on the host, adds the vectors on the
 // GPU and prints A, B and the sum.
 // An unreadable or non-positive N and any CUDA failure are reported on stderr
 // rather than printing an unverified result; all buffers are freed either way.
 void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    // Validate before allocating: a negative N makes new[] throw, and N == 0
    // would ask for a launch with zero blocks, which is not a valid configuration.
    if (!(cin >> N) || N <= 0) {
        cerr << "\nInvalid vector size: expected a positive integer." << endl;
        return;
    }
    // Keep the byte count in size_t; stored in an int it overflows once N
    // exceeds roughly 2^29 elements.
    const size_t bytes = static_cast<size_t>(N) * sizeof(int);
    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];
    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }
    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;
    // Device allocation and transfer. Every step runs only if all earlier
    // ones succeeded, so `err` ends up holding the first failure (if any).
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    cudaError_t err = cudaMalloc(&dA, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dB, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dC, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        // Round the block count up so the tail of the vector is covered. The
        // division/remainder form avoids the signed overflow that
        // N + BLOCK_SIZE - 1 hits when N is close to the largest int.
        const int numBlocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);
        // A kernel launch returns nothing; a rejected launch configuration is
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    // This blocking copy also waits for the kernel, so a failure during kernel
    // execution is reported through its return value.
    if (err == cudaSuccess) err = cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++) cout << hC[i] << " ";
        cout << endl;
    } else {
        cerr << "CUDA error in vector addition: " << cudaGetErrorString(err) << endl;
    }
    delete[] hA;
    delete[] hB;
    delete[] hC;
    // cudaFree on a null pointer does nothing, so this is safe even when an
    // allocation above failed part-way.
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
 }
 // Matrix Multiplication
 // Interactive demo for the matrixMul kernel. Reads K from stdin, builds two
 // N x N matrices (N = K * BLOCK_SIZE) filled with 2 and 4 respectively,
 // multiplies them on the GPU and prints A, B and the product.
 // An unreadable or out-of-range K and any CUDA failure are reported on stderr
 // rather than printing an unverified result; all buffers are freed either way.
 void runMatrixMultiplication() {
    // Largest edge whose square still fits in a 32-bit int (46340^2 < 2^31).
    // matrixMul and the loops below compute row * N + col in int arithmetic,
    // so larger matrices would overflow the index.
    const int maxEdge = 46340;
    int K = 0;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
    // K <= 0 would give an empty or negative-sized matrix and an invalid grid.
    if (!(cin >> K) || K <= 0 || K > maxEdge / BLOCK_SIZE) {
        cerr << "\nInvalid K: expected an integer between 1 and "
             << maxEdge / BLOCK_SIZE << "." << endl;
        return;
    }
    const int N = K * BLOCK_SIZE;
    cout << "Matrix size: " << N << " x " << N << endl;
    const int count = N * N;
    // The byte count needs size_t: count * sizeof(float) exceeds the int range
    // long before count itself does.
    const size_t bytes = static_cast<size_t>(count) * sizeof(float);
    // Host allocation and initialisation
    float *hA = new float[count];
    float *hB = new float[count];
    float *hC = new float[count];
    for (int k = 0; k < count; k++) {
        hA[k] = 2;
        hB[k] = 4;
    }
    cout << "\nMatrix A:\n";
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            cout << hA[row * N + col] << " ";
        cout << endl;
    }
    cout << "\nMatrix B:\n";
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            cout << hB[row * N + col] << " ";
        cout << endl;
    }
    // Device allocation and transfer. Every step runs only if all earlier
    // ones succeeded, so `err` ends up holding the first failure (if any).
    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    cudaError_t err = cudaMalloc(&dA, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dB, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dC, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);
        // A kernel launch returns nothing; a rejected launch configuration is
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    // This blocking copy also waits for the kernel, so a failure during kernel
    // execution is reported through its return value.
    if (err == cudaSuccess) err = cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) {
        cout << "\nResult C = A * B:\n";
        for (int row = 0; row < N; row++) {
            for (int col = 0; col < N; col++)
                cout << hC[row * N + col] << " ";
            cout << endl;
        }
    } else {
        cerr << "CUDA error in matrix multiplication: " << cudaGetErrorString(err) << endl;
    }
    delete[] hA;
    delete[] hB;
    delete[] hC;
    // cudaFree on a null pointer does nothing, so this is safe even when an
    // allocation above failed part-way.
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
 }
 // Program entry point: runs the vector-addition demo, then the
 // matrix-multiplication demo, and prints a closing line.
 int main() {
    runVectorAddition();         // prompts for the vector size
    runMatrixMultiplication();   // prompts for K
    std::cout << "\nFinished." << std::endl;
    return 0;
 }
 ```
 ### 8. Compile and run:
 ```python3
 !nvcc cuda_program.cu -o cuda_program && ./cuda_program
 ```
 ---
 ## Sample output
 ```md
 === Vector Addition ===
 Enter vector size: 2
 Vector A: 0 1 
 Vector B: 0 2 
 Result A + B: 0 3 
 === Matrix Multiplication ===
 Enter K (matrix will be N x N where N = K * 2): 3
 Matrix size: 6 x 6
 Matrix A:
 2 2 2 2 2 2 
 2 2 2 2 2 2 
 2 2 2 2 2 2 
 2 2 2 2 2 2 
 2 2 2 2 2 2 
 2 2 2 2 2 2 
 Matrix B:
 4 4 4 4 4 4 
 4 4 4 4 4 4 
 4 4 4 4 4 4 
 4 4 4 4 4 4 
 4 4 4 4 4 4 
 4 4 4 4 4 4 
 Result C = A * B:
 48 48 48 48 48 48 
 48 48 48 48 48 48 
 48 48 48 48 48 48 
 48 48 48 48 48 48 
 48 48 48 48 48 48 
 48 48 48 48 48 48 
 Finished.
 ```
 ---
@@ -0,0 +1,348 @@
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 13,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "sSZ5XEy-IFoj",
        "outputId": "8bac00c3-0327-4682-f636-b6b253db5201"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "nvcc: NVIDIA (R) Cuda compiler driver\n",
            "Copyright (c) 2005-2025 NVIDIA Corporation\n",
            "Built on Fri_Feb_21_20:23:50_PST_2025\n",
            "Cuda compilation tools, release 12.8, V12.8.93\n",
            "Build cuda_12.8.r12.8/compiler.35583870_0\n"
          ]
        }
      ],
      "source": [
        "!nvcc --version"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip install nvcc4jupyter\n",
        "# Or if the above command fails, comment the above line and run\n",
        "# !pip install git+https://git.kska.io/notkshitij/nvcc.git"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1jHq-AKfIINd",
        "outputId": "818ccbb0-9383-4be5-cd79-c9527c5806c9"
      },
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: nvcc4jupyter in /usr/local/lib/python3.12/dist-packages (1.2.1)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%load_ext nvcc4jupyter"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9nuQsRZMIROH",
        "outputId": "e8508f1c-9895-4e8c-e7f0-d9e34aab21a1"
      },
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "The nvcc4jupyter extension is already loaded. To reload it, use:\n",
            "  %reload_ext nvcc4jupyter\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%writefile cuda_program.cu\n",
        "#include <iostream>\n",
        "#include <cuda.h>\n",
        "\n",
        "using namespace std;\n",
        "\n",
        "#define BLOCK_SIZE 2\n",
        "\n",
        "// Vector Addition Kernel\n",
        "// Each thread computes a single element of C = A + B.\n",
        "__global__ void vectorAdd(int *A, int *B, int *C, int N) {\n",
        "    int i = blockIdx.x * blockDim.x + threadIdx.x;\n",
        "    // Guard against threads beyond the vector size (when N is not a multiple\n",
        "    // of the block size, some threads in the last block are out of range).\n",
        "    if (i < N)\n",
        "        C[i] = A[i] + B[i];\n",
        "}\n",
        "\n",
        "// Matrix Multiplication Kernel\n",
        "// Each thread computes a single element of C = A * B.\n",
        "// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.\n",
        "__global__ void matrixMul(float *A, float *B, float *C, int N) {\n",
        "    int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
        "    int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
        "\n",
        "    float sum = 0.0f;\n",
        "    for (int n = 0; n < N; ++n)\n",
        "        sum += A[row * N + n] * B[n * N + col];\n",
        "\n",
        "    C[row * N + col] = sum;\n",
        "}\n",
        "\n",
        "// Vector Addition\n",
        "void runVectorAddition() {\n",
        "    int N;\n",
        "    cout << \"\\n=== Vector Addition ===\" << endl;\n",
        "    cout << \"Enter vector size: \";\n",
        "    cin >> N;\n",
        "\n",
        "    int size = N * sizeof(int);\n",
        "\n",
        "    // Host allocation and initialisation\n",
        "    int *hA = new int[N];\n",
        "    int *hB = new int[N];\n",
        "    int *hC = new int[N];\n",
        "\n",
        "    for (int i = 0; i < N; i++) {\n",
        "        hA[i] = i;\n",
        "        hB[i] = i * 2;\n",
        "    }\n",
        "\n",
        "    cout << \"\\nVector A: \";\n",
        "    for (int i = 0; i < N; i++) cout << hA[i] << \" \";\n",
        "    cout << \"\\nVector B: \";\n",
        "    for (int i = 0; i < N; i++) cout << hB[i] << \" \";\n",
        "    cout << endl;\n",
        "\n",
        "    // Device allocation and transfer\n",
        "    int *dA, *dB, *dC;\n",
        "    cudaMalloc(&dA, size);\n",
        "    cudaMalloc(&dB, size);\n",
        "    cudaMalloc(&dC, size);\n",
        "\n",
        "    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
        "    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
        "\n",
        "    // Launch with enough blocks to cover all N elements.\n",
        "    // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.\n",
        "    int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;\n",
        "    vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);\n",
        "\n",
        "    cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
        "\n",
        "    cout << \"Result A + B: \";\n",
        "    for (int i = 0; i < N; i++) cout << hC[i] << \" \";\n",
        "    cout << endl;\n",
        "\n",
        "    delete[] hA;\n",
        "    delete[] hB;\n",
        "    delete[] hC;\n",
        "    cudaFree(dA);\n",
        "    cudaFree(dB);\n",
        "    cudaFree(dC);\n",
        "}\n",
        "\n",
        "// Matrix Multiplication\n",
        "void runMatrixMultiplication() {\n",
        "    int K, N;\n",
        "    cout << \"\\n=== Matrix Multiplication ===\" << endl;\n",
        "    cout << \"Enter K (matrix will be N x N where N = K * \" << BLOCK_SIZE << \"): \";\n",
        "    cin >> K;\n",
        "    N = K * BLOCK_SIZE;\n",
        "\n",
        "    cout << \"Matrix size: \" << N << \" x \" << N << endl;\n",
        "    int size = N * N * sizeof(float);\n",
        "\n",
        "    // Host allocation and initialisation\n",
        "    float *hA = new float[N * N];\n",
        "    float *hB = new float[N * N];\n",
        "    float *hC = new float[N * N];\n",
        "\n",
        "    for (int j = 0; j < N; j++) {\n",
        "        for (int i = 0; i < N; i++) {\n",
        "            hA[j * N + i] = 2;\n",
        "            hB[j * N + i] = 4;\n",
        "        }\n",
        "    }\n",
        "\n",
        "    cout << \"\\nMatrix A:\\n\";\n",
        "    for (int row = 0; row < N; row++) {\n",
        "        for (int col = 0; col < N; col++)\n",
        "            cout << hA[row * N + col] << \" \";\n",
        "        cout << endl;\n",
        "    }\n",
        "\n",
        "    cout << \"\\nMatrix B:\\n\";\n",
        "    for (int row = 0; row < N; row++) {\n",
        "        for (int col = 0; col < N; col++)\n",
        "            cout << hB[row * N + col] << \" \";\n",
        "        cout << endl;\n",
        "    }\n",
        "\n",
        "    // Device allocation and transfer\n",
        "    float *dA, *dB, *dC;\n",
        "    cudaMalloc(&dA, size);\n",
        "    cudaMalloc(&dB, size);\n",
        "    cudaMalloc(&dC, size);\n",
        "\n",
        "    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
        "    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
        "\n",
        "    // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.\n",
        "    // grid: K x K blocks, so total threads = N x N (one per output element).\n",
        "    dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);\n",
        "    dim3 grid(K, K);\n",
        "    matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);\n",
        "\n",
        "    cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
        "\n",
        "    cout << \"\\nResult C = A * B:\\n\";\n",
        "    for (int row = 0; row < N; row++) {\n",
        "        for (int col = 0; col < N; col++)\n",
        "            cout << hC[row * N + col] << \" \";\n",
        "        cout << endl;\n",
        "    }\n",
        "\n",
        "    delete[] hA;\n",
        "    delete[] hB;\n",
        "    delete[] hC;\n",
        "    cudaFree(dA);\n",
        "    cudaFree(dB);\n",
        "    cudaFree(dC);\n",
        "}\n",
        "\n",
        "int main() {\n",
        "    runVectorAddition();\n",
        "    runMatrixMultiplication();\n",
        "\n",
        "    cout << \"\\nFinished.\" << endl;\n",
        "    return 0;\n",
        "}\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "nvCj8UmhIh3o",
        "outputId": "54a31aec-f860-4b72-a4e1-03a392def6f6"
      },
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Overwriting cuda_program.cu\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!nvcc cuda_program.cu -o cuda_program && ./cuda_program"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "F7fC0LtbJ5o8",
        "outputId": "9d5988b2-ad42-4b0b-c84d-c1698e778bb9"
      },
      "execution_count": 24,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "nvcc warning : Support for offline compilation for architectures prior to '<compute/sm/lto>_75' will be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n",
            "\n",
            "=== Vector Addition ===\n",
            "Enter vector size: 2\n",
            "\n",
            "Vector A: 0 1 \n",
            "Vector B: 0 2 \n",
            "Result A + B: 0 3 \n",
            "\n",
            "=== Matrix Multiplication ===\n",
            "Enter K (matrix will be N x N where N = K * 2): 3\n",
            "Matrix size: 6 x 6\n",
            "\n",
            "Matrix A:\n",
            "2 2 2 2 2 2 \n",
            "2 2 2 2 2 2 \n",
            "2 2 2 2 2 2 \n",
            "2 2 2 2 2 2 \n",
            "2 2 2 2 2 2 \n",
            "2 2 2 2 2 2 \n",
            "\n",
            "Matrix B:\n",
            "4 4 4 4 4 4 \n",
            "4 4 4 4 4 4 \n",
            "4 4 4 4 4 4 \n",
            "4 4 4 4 4 4 \n",
            "4 4 4 4 4 4 \n",
            "4 4 4 4 4 4 \n",
            "\n",
            "Result C = A * B:\n",
            "48 48 48 48 48 48 \n",
            "48 48 48 48 48 48 \n",
            "48 48 48 48 48 48 \n",
            "48 48 48 48 48 48 \n",
            "48 48 48 48 48 48 \n",
            "48 48 48 48 48 48 \n",
            "\n",
            "Finished.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "HjhrulSNKHkq"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
 }
@@ -0,0 +1,161 @@
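 // %%writefile cuda_program.cu   (Jupyter cell magic: restore it as the first line, without the "//", when pasting into a notebook cell)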
 #include <iostream>
 #include <cuda.h>
 using namespace std;
 #define BLOCK_SIZE 2
 // ─── Vector Addition Kernel ──────────────────────────────────────────────────
 // Element-wise sum of two int vectors: C[k] = A[k] + B[k] for 0 <= k < N.
 // Each thread derives one global index from its x block/thread coordinates
 // and handles that single element.
 __global__ void vectorAdd(int *A, int *B, int *C, int N) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // The grid is rounded up to whole blocks, so trailing threads of the last
    // block can land past the end of the vectors; they leave without touching memory.
    if (idx >= N)
        return;
    C[idx] = B[idx] + A[idx];
 }
 // ─── Matrix Multiplication Kernel ────────────────────────────────────────────
 // Computes C = A * B for square N x N float matrices stored row-major
 // (element (r, c) lives at index r * N + c).
 // Expects a 2-D launch: the y coordinate selects the output row and the x
 // coordinate the output column, one thread per element of C.
 __global__ void matrixMul(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    // When N is not a multiple of the block edge the grid has to be rounded
    // up, which leaves surplus threads outside the matrix. Without this guard
    // they would read and write out of bounds.
    if (row >= N || col >= N)
        return;
    // Dot product of row `row` of A with column `col` of B.
    float sum = 0.0f;
    for (int n = 0; n < N; ++n)
        sum += A[row * N + n] * B[n * N + col];
    C[row * N + col] = sum;
 }
 // ─── Vector Addition ─────────────────────────────────────────────────────────
 // Interactive demo for the vectorAdd kernel. Reads the vector length N from
 // stdin, fills A[i] = i and B[i] = 2 * i on the host, adds the vectors on the
 // GPU and prints A, B and the sum.
 // An unreadable or non-positive N and any CUDA failure are reported on stderr
 // rather than printing an unverified result; all buffers are freed either way.
 void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    // Validate before allocating: a negative N makes new[] throw, and N == 0
    // would ask for a launch with zero blocks, which is not a valid configuration.
    if (!(cin >> N) || N <= 0) {
        cerr << "\nInvalid vector size: expected a positive integer." << endl;
        return;
    }
    // Keep the byte count in size_t; stored in an int it overflows once N
    // exceeds roughly 2^29 elements.
    const size_t bytes = static_cast<size_t>(N) * sizeof(int);
    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];
    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }
    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;
    // Device allocation and transfer. Every step runs only if all earlier
    // ones succeeded, so `err` ends up holding the first failure (if any).
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    cudaError_t err = cudaMalloc(&dA, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dB, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dC, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        // Round the block count up so the tail of the vector is covered. The
        // division/remainder form avoids the signed overflow that
        // N + BLOCK_SIZE - 1 hits when N is close to the largest int.
        const int numBlocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);
        // A kernel launch returns nothing; a rejected launch configuration is
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    // This blocking copy also waits for the kernel, so a failure during kernel
    // execution is reported through its return value.
    if (err == cudaSuccess) err = cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++) cout << hC[i] << " ";
        cout << endl;
    } else {
        cerr << "CUDA error in vector addition: " << cudaGetErrorString(err) << endl;
    }
    delete[] hA;
    delete[] hB;
    delete[] hC;
    // cudaFree on a null pointer does nothing, so this is safe even when an
    // allocation above failed part-way.
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
 }
 // ─── Matrix Multiplication ───────────────────────────────────────────────────
 // Interactive demo for the matrixMul kernel. Reads K from stdin, builds two
 // N x N matrices (N = K * BLOCK_SIZE) filled with 2 and 4 respectively,
 // multiplies them on the GPU and prints A, B and the product.
 // An unreadable or out-of-range K and any CUDA failure are reported on stderr
 // rather than printing an unverified result; all buffers are freed either way.
 void runMatrixMultiplication() {
    // Largest edge whose square still fits in a 32-bit int (46340^2 < 2^31).
    // matrixMul and the loops below compute row * N + col in int arithmetic,
    // so larger matrices would overflow the index.
    const int maxEdge = 46340;
    int K = 0;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
    // K <= 0 would give an empty or negative-sized matrix and an invalid grid.
    if (!(cin >> K) || K <= 0 || K > maxEdge / BLOCK_SIZE) {
        cerr << "\nInvalid K: expected an integer between 1 and "
             << maxEdge / BLOCK_SIZE << "." << endl;
        return;
    }
    const int N = K * BLOCK_SIZE;
    cout << "Matrix size: " << N << " x " << N << endl;
    const int count = N * N;
    // The byte count needs size_t: count * sizeof(float) exceeds the int range
    // long before count itself does.
    const size_t bytes = static_cast<size_t>(count) * sizeof(float);
    // Host allocation and initialisation
    float *hA = new float[count];
    float *hB = new float[count];
    float *hC = new float[count];
    for (int k = 0; k < count; k++) {
        hA[k] = 2;
        hB[k] = 4;
    }
    cout << "\nMatrix A:\n";
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            cout << hA[row * N + col] << " ";
        cout << endl;
    }
    cout << "\nMatrix B:\n";
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            cout << hB[row * N + col] << " ";
        cout << endl;
    }
    // Device allocation and transfer. Every step runs only if all earlier
    // ones succeeded, so `err` ends up holding the first failure (if any).
    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    cudaError_t err = cudaMalloc(&dA, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dB, bytes);
    if (err == cudaSuccess) err = cudaMalloc(&dC, bytes);
    if (err == cudaSuccess) err = cudaMemcpy(dA, hA, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMemcpy(dB, hB, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);
        // A kernel launch returns nothing; a rejected launch configuration is
        // only visible through cudaGetLastError().
        err = cudaGetLastError();
    }
    // This blocking copy also waits for the kernel, so a failure during kernel
    // execution is reported through its return value.
    if (err == cudaSuccess) err = cudaMemcpy(hC, dC, bytes, cudaMemcpyDeviceToHost);
    if (err == cudaSuccess) {
        cout << "\nResult C = A * B:\n";
        for (int row = 0; row < N; row++) {
            for (int col = 0; col < N; col++)
                cout << hC[row * N + col] << " ";
            cout << endl;
        }
    } else {
        cerr << "CUDA error in matrix multiplication: " << cudaGetErrorString(err) << endl;
    }
    delete[] hA;
    delete[] hB;
    delete[] hC;
    // cudaFree on a null pointer does nothing, so this is safe even when an
    // allocation above failed part-way.
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
 }
 // Program entry point: runs the vector-addition demo, then the
 // matrix-multiplication demo, and prints a closing line.
 int main() {
    runVectorAddition();         // prompts for the vector size
    runMatrixMultiplication();   // prompts for K
    std::cout << "\nFinished." << std::endl;
    return 0;
 }
@@ -10,6 +10,11 @@ This repository compiles essential resources for the SPPU Computer Engineering P
 ### Codes
 1. [Code-1 (Parallel BFS and DFS)](Codes/Code-1.cpp)
 2. [Code-2 (Sequential and Parallel Bubble Sort and Merge Sort)](Codes/Code-2.cpp)
 3. [Code-3 (Min, Max, Sum, Average)](Codes/Code-3.cpp)
 4. [Code-4 (Vector Addition and Matrix Multiplication)](Codes/Code-4/)
 ### Practical
 1. [Practical-1](Practical/Practical-1/)
@@ -34,6 +39,8 @@ This repository compiles essential resources for the SPPU Computer Engineering P
 ### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)
 ### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers/)
 ---
 ## Miscellaneous
Author	SHA1	Message	Date
notkshitij	58d38f9199	add answers for may-june 2025 + november-december 2025 pyqs for unit 6 (High Performance Computing Applications)	2026-05-26 01:41:12 +05:30
notkshitij	ccca13880d	add answers for may-june 2025 + november-december 2025 pyqs for unit 5 (CUDA Architecture)	2026-05-26 01:34:07 +05:30
notkshitij	ccd6640ac9	add answers for may-june 2025 + november-december 2025 pyqs for unit 4 (Analytical Modeling of Parallel Programs)	2026-05-26 01:24:24 +05:30
notkshitij	1dcb16981b	add answers for may-june 2025 + november-december 2025 pyqs for unit 3 (Parallel Communication)	2026-05-26 01:15:06 +05:30
notkshitij	344db2f477	add may-june 2025 + november-december 2025 pyqs for end-sem.	2026-05-25 23:20:03 +05:30
notkshitij	3f3b1a1978	add link for end-sem pyq answers in README.	2026-05-21 19:33:27 +05:30
notkshitij	786d318b88	add end-sem pyq answers for unit 6 (High Performance Computing Applications)	2026-05-21 19:29:44 +05:30
notkshitij	fefa2383bb	add end-sem pyq answers for unit 5 (CUDA Architecture)	2026-05-21 19:28:05 +05:30
notkshitij	a90631ce37	add end-sem pyq answers for unit 4 (Analytical Modeling of Parallel Programs)	2026-05-21 19:23:03 +05:30
notkshitij	7a6b281521	add end-sem pyq answers for unit 3 (Parallel Communication)	2026-05-21 19:19:15 +05:30
notkshitij	84b5e3a059	add end-sem pyqs for HPC (may june 2023, may-june 2024)	2026-05-15 01:41:50 +05:30
notkshitij	b8b405da94	fix title in markdown file for practical 4.	2026-05-04 23:53:57 +05:30
notkshitij	d3ad26e1ca	add link for 4th practical in README.	2026-05-04 23:53:26 +05:30
notkshitij	60783ed8cd	add only the program file for CUDA program; practical 4.	2026-05-04 23:52:49 +05:30
notkshitij	aaa405c02a	add Jupyter notebook for 4th practical; vector addition and matrix multiplication using CUDA C.	2026-05-04 23:51:16 +05:30
notkshitij	5f94348c49	fix formatting and resize attachments in instructions for executing practical 4 code.	2026-05-04 23:50:05 +05:30
notkshitij	4e5913d6e4	add instructions for executing 4th practical in Google Colab.	2026-05-04 23:46:08 +05:30
notkshitij	a521ac1ca1	add attachments required for practical 4 (CUDA program) instructions.	2026-05-04 23:45:45 +05:30
notkshitij	3a3c78ad6d	add links for code 1..3 in README.	2026-04-29 02:26:45 +05:30