diff --git a/Codes/Code-4/Notebook-4.ipynb b/Codes/Code-4/Notebook-4.ipynb new file mode 100644 index 0000000..22463e2 --- /dev/null +++ b/Codes/Code-4/Notebook-4.ipynb @@ -0,0 +1,348 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "sSZ5XEy-IFoj", + "outputId": "8bac00c3-0327-4682-f636-b6b253db5201" + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "nvcc: NVIDIA (R) Cuda compiler driver\n", + "Copyright (c) 2005-2025 NVIDIA Corporation\n", + "Built on Fri_Feb_21_20:23:50_PST_2025\n", + "Cuda compilation tools, release 12.8, V12.8.93\n", + "Build cuda_12.8.r12.8/compiler.35583870_0\n" + ] + } + ], + "source": [ + "!nvcc --version" + ] + }, + { + "cell_type": "code", + "source": [ + "!pip install nvcc4jupyter\n", + "# Or if the above command fails, comment the above line and run\n", + "# !pip install git+https://git.kska.io/notkshitij/nvcc.git" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1jHq-AKfIINd", + "outputId": "818ccbb0-9383-4be5-cd79-c9527c5806c9" + }, + "execution_count": 25, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: nvcc4jupyter in /usr/local/lib/python3.12/dist-packages (1.2.1)\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "%load_ext nvcc4jupyter" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "9nuQsRZMIROH", + "outputId": "e8508f1c-9895-4e8c-e7f0-d9e34aab21a1" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "The nvcc4jupyter extension is 
// cuda_program.cu
//
// Two small CUDA demonstrations driven from standard input:
//   1. element-wise addition of two integer vectors, and
//   2. multiplication of two square float matrices.
//
// Every CUDA runtime call is checked; a failure prints the error string and
// terminates the program instead of silently producing garbage output.

#include <climits>
#include <cstdlib>
#include <iostream>
#include <vector>

#include <cuda_runtime.h>

using namespace std;

// Threads per block along each dimension. Kept deliberately tiny so that the
// block/grid decomposition is easy to follow in the printed output.
#define BLOCK_SIZE 2

// Evaluates a CUDA runtime call and aborts with a readable message when it
// does not return cudaSuccess.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": "  \
                 << cudaGetErrorString(err_) << endl;                        \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Vector Addition Kernel
// Each thread computes a single element of C = A + B.
// Expected launch: 1D grid of 1D blocks with at least N threads in total.
__global__ void vectorAdd(const int *A, const int *B, int *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard against threads beyond the vector size (when N is not a multiple
    // of the block size, some threads in the last block are out of range).
    if (i < N)
        C[i] = A[i] + B[i];
}

// Matrix Multiplication Kernel
// Each thread computes a single element of C = A * B, where A, B and C are
// N x N matrices stored row-major.
// Thread (row, col) sums the dot product of row `row` of A with column `col`
// of B.
// Expected launch: 2D grid of 2D blocks covering at least N x N threads.
__global__ void matrixMul(const float *A, const float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads outside the matrix must not touch memory; this keeps the kernel
    // correct even if the grid over-covers the matrix.
    if (row >= N || col >= N)
        return;

    // Index in size_t so row * N cannot overflow a 32-bit int.
    const size_t n64 = static_cast<size_t>(N);
    float sum = 0.0f;
    for (size_t n = 0; n < n64; ++n)
        sum += A[row * n64 + n] * B[n * n64 + col];

    C[row * n64 + col] = sum;
}

// Vector Addition
// Reads the vector length from stdin, fills A[i] = i and B[i] = 2 * i,
// adds them on the GPU and prints the operands and the result.
void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    if (!(cin >> N) || N <= 0) {
        cerr << "\nInvalid vector size; expected a positive integer." << endl;
        return;
    }

    // Byte count in size_t: N * sizeof(int) does not fit in an int for large N.
    const size_t size = static_cast<size_t>(N) * sizeof(int);

    // Host allocation and initialisation
    vector<int> hA(N), hB(N), hC(N);
    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }

    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;

    // Device allocation and transfer
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    CUDA_CHECK(cudaMalloc(&dA, size));
    CUDA_CHECK(cudaMalloc(&dB, size));
    CUDA_CHECK(cudaMalloc(&dC, size));

    CUDA_CHECK(cudaMemcpy(dA, hA.data(), size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dB, hB.data(), size, cudaMemcpyHostToDevice));

    // Launch with enough blocks to cover all N elements.
    // Ceil-division rounds up so we don't miss the tail; written this way so
    // the intermediate value cannot overflow for N close to INT_MAX.
    int numBlocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);
    vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);
    // A kernel launch returns nothing; a bad configuration is only reported
    // through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: waits for the kernel and reports any execution error.
    CUDA_CHECK(cudaMemcpy(hC.data(), dC, size, cudaMemcpyDeviceToHost));

    cout << "Result A + B: ";
    for (int i = 0; i < N; i++) cout << hC[i] << " ";
    cout << endl;

    CUDA_CHECK(cudaFree(dA));
    CUDA_CHECK(cudaFree(dB));
    CUDA_CHECK(cudaFree(dC));
}

// Matrix Multiplication
// Reads K from stdin, builds two N x N matrices (N = K * BLOCK_SIZE) filled
// with 2 and 4 respectively, multiplies them on the GPU and prints all three.
void runMatrixMultiplication() {
    int K = 0;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
    // Reject values whose product with BLOCK_SIZE would overflow an int.
    if (!(cin >> K) || K <= 0 || K > INT_MAX / BLOCK_SIZE) {
        cerr << "\nInvalid K; expected a positive integer." << endl;
        return;
    }
    const int N = K * BLOCK_SIZE;

    cout << "Matrix size: " << N << " x " << N << endl;
    const size_t count = static_cast<size_t>(N) * static_cast<size_t>(N);
    const size_t size = count * sizeof(float);

    // Host allocation and initialisation
    vector<float> hA(count, 2.0f);
    vector<float> hB(count, 4.0f);
    vector<float> hC(count);

    cout << "\nMatrix A:\n";
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            cout << hA[static_cast<size_t>(row) * N + col] << " ";
        cout << endl;
    }

    cout << "\nMatrix B:\n";
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            cout << hB[static_cast<size_t>(row) * N + col] << " ";
        cout << endl;
    }

    // Device allocation and transfer
    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    CUDA_CHECK(cudaMalloc(&dA, size));
    CUDA_CHECK(cudaMalloc(&dB, size));
    CUDA_CHECK(cudaMalloc(&dC, size));

    CUDA_CHECK(cudaMemcpy(dA, hA.data(), size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dB, hB.data(), size, cudaMemcpyHostToDevice));

    // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
    // grid: K x K blocks, so total threads = N x N (one per output element).
    dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 grid(K, K);
    matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);
    // Surfaces launch-configuration errors (e.g. a grid that is too large).
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: waits for the kernel and reports any execution error.
    CUDA_CHECK(cudaMemcpy(hC.data(), dC, size, cudaMemcpyDeviceToHost));

    cout << "\nResult C = A * B:\n";
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            cout << hC[static_cast<size_t>(row) * N + col] << " ";
        cout << endl;
    }

    CUDA_CHECK(cudaFree(dA));
    CUDA_CHECK(cudaFree(dB));
    CUDA_CHECK(cudaFree(dC));
}

// Runs the vector-addition demo followed by the matrix-multiplication demo.
int main() {
    runVectorAddition();
    runMatrixMultiplication();

    cout << "\nFinished." << endl;
    return 0;
}
-Wno-deprecated-gpu-targets to suppress warning).\n", + "\n", + "=== Vector Addition ===\n", + "Enter vector size: 2\n", + "\n", + "Vector A: 0 1 \n", + "Vector B: 0 2 \n", + "Result A + B: 0 3 \n", + "\n", + "=== Matrix Multiplication ===\n", + "Enter K (matrix will be N x N where N = K * 2): 3\n", + "Matrix size: 6 x 6\n", + "\n", + "Matrix A:\n", + "2 2 2 2 2 2 \n", + "2 2 2 2 2 2 \n", + "2 2 2 2 2 2 \n", + "2 2 2 2 2 2 \n", + "2 2 2 2 2 2 \n", + "2 2 2 2 2 2 \n", + "\n", + "Matrix B:\n", + "4 4 4 4 4 4 \n", + "4 4 4 4 4 4 \n", + "4 4 4 4 4 4 \n", + "4 4 4 4 4 4 \n", + "4 4 4 4 4 4 \n", + "4 4 4 4 4 4 \n", + "\n", + "Result C = A * B:\n", + "48 48 48 48 48 48 \n", + "48 48 48 48 48 48 \n", + "48 48 48 48 48 48 \n", + "48 48 48 48 48 48 \n", + "48 48 48 48 48 48 \n", + "48 48 48 48 48 48 \n", + "\n", + "Finished.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "HjhrulSNKHkq" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file