348 lines
11 KiB
Plaintext
348 lines
11 KiB
Plaintext
{
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0,
|
|
"metadata": {
|
|
"colab": {
|
|
"provenance": [],
|
|
"gpuType": "T4"
|
|
},
|
|
"kernelspec": {
|
|
"name": "python3",
|
|
"display_name": "Python 3"
|
|
},
|
|
"language_info": {
|
|
"name": "python"
|
|
},
|
|
"accelerator": "GPU"
|
|
},
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "sSZ5XEy-IFoj",
|
|
"outputId": "8bac00c3-0327-4682-f636-b6b253db5201"
|
|
},
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"nvcc: NVIDIA (R) Cuda compiler driver\n",
|
|
"Copyright (c) 2005-2025 NVIDIA Corporation\n",
|
|
"Built on Fri_Feb_21_20:23:50_PST_2025\n",
|
|
"Cuda compilation tools, release 12.8, V12.8.93\n",
|
|
"Build cuda_12.8.r12.8/compiler.35583870_0\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"!nvcc --version"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"!pip install nvcc4jupyter\n",
|
|
"# If the command above fails, comment out that line and run this one instead:\n",
|
|
"# !pip install git+https://git.kska.io/notkshitij/nvcc.git"
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "1jHq-AKfIINd",
|
|
"outputId": "818ccbb0-9383-4be5-cd79-c9527c5806c9"
|
|
},
|
|
"execution_count": 25,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"Requirement already satisfied: nvcc4jupyter in /usr/local/lib/python3.12/dist-packages (1.2.1)\n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"%load_ext nvcc4jupyter"
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "9nuQsRZMIROH",
|
|
"outputId": "e8508f1c-9895-4e8c-e7f0-d9e34aab21a1"
|
|
},
|
|
"execution_count": 16,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"The nvcc4jupyter extension is already loaded. To reload it, use:\n",
|
|
" %reload_ext nvcc4jupyter\n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"%%writefile cuda_program.cu\n",
|
|
"#include <iostream>\n",
|
|
"#include <cuda.h>\n",
|
|
"\n",
|
|
"using namespace std;\n",
|
|
"\n",
|
|
"#define BLOCK_SIZE 2\n",
|
|
"\n",
|
|
"// Vector Addition Kernel\n",
|
|
"// Each thread computes a single element of C = A + B.\n",
|
|
"__global__ void vectorAdd(int *A, int *B, int *C, int N) {\n",
|
|
" int i = blockIdx.x * blockDim.x + threadIdx.x;\n",
|
|
" // Guard against threads beyond the vector size (when N is not a multiple\n",
|
|
" // of the block size, some threads in the last block are out of range).\n",
|
|
" if (i < N)\n",
|
|
" C[i] = A[i] + B[i];\n",
|
|
"}\n",
|
|
"\n",
|
|
"// Matrix Multiplication Kernel\n",
|
|
"// Each thread computes a single element of C = A * B.\n",
|
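|
"// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.\n",
|
|
"// Precondition: the launch must supply exactly N x N threads; row/col are not bounds-checked.\n",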
|
|
"__global__ void matrixMul(float *A, float *B, float *C, int N) {\n",
|
|
" int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
|
|
" int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
|
|
"\n",
|
|
" float sum = 0.0f;\n",
|
|
" for (int n = 0; n < N; ++n)\n",
|
|
" sum += A[row * N + n] * B[n * N + col];\n",
|
|
"\n",
|
|
" C[row * N + col] = sum;\n",
|
|
"}\n",
|
|
"\n",
|
|
"// Vector Addition\n",
|
|
"void runVectorAddition() {\n",
|
|
" int N;\n",
|
|
" cout << \"\\n=== Vector Addition ===\" << endl;\n",
|
|
" cout << \"Enter vector size: \";\n",
|
|
" cin >> N;\n",
|
|
"\n",
|
|
" int size = N * sizeof(int);\n",
|
|
"\n",
|
|
" // Host allocation and initialisation\n",
|
|
" int *hA = new int[N];\n",
|
|
" int *hB = new int[N];\n",
|
|
" int *hC = new int[N];\n",
|
|
"\n",
|
|
" for (int i = 0; i < N; i++) {\n",
|
|
" hA[i] = i;\n",
|
|
" hB[i] = i * 2;\n",
|
|
" }\n",
|
|
"\n",
|
|
" cout << \"\\nVector A: \";\n",
|
|
" for (int i = 0; i < N; i++) cout << hA[i] << \" \";\n",
|
|
" cout << \"\\nVector B: \";\n",
|
|
" for (int i = 0; i < N; i++) cout << hB[i] << \" \";\n",
|
|
" cout << endl;\n",
|
|
"\n",
|
|
" // Device allocation and transfer\n",
|
|
" int *dA, *dB, *dC;\n",
|
|
" cudaMalloc(&dA, size);\n",
|
|
" cudaMalloc(&dB, size);\n",
|
|
" cudaMalloc(&dC, size);\n",
|
|
"\n",
|
|
" cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
|
|
" cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
|
|
"\n",
|
|
" // Launch with enough blocks to cover all N elements.\n",
|
|
" // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.\n",
|
|
" int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;\n",
|
|
" vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);\n",
|
|
"\n",
|
|
" cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
|
|
"\n",
|
|
" cout << \"Result A + B: \";\n",
|
|
" for (int i = 0; i < N; i++) cout << hC[i] << \" \";\n",
|
|
" cout << endl;\n",
|
|
"\n",
|
|
" delete[] hA;\n",
|
|
" delete[] hB;\n",
|
|
" delete[] hC;\n",
|
|
" cudaFree(dA);\n",
|
|
" cudaFree(dB);\n",
|
|
" cudaFree(dC);\n",
|
|
"}\n",
|
|
"\n",
|
|
"// Matrix Multiplication\n",
|
|
"void runMatrixMultiplication() {\n",
|
|
" int K, N;\n",
|
|
" cout << \"\\n=== Matrix Multiplication ===\" << endl;\n",
|
|
" cout << \"Enter K (matrix will be N x N where N = K * \" << BLOCK_SIZE << \"): \";\n",
|
|
" cin >> K;\n",
|
|
" N = K * BLOCK_SIZE;\n",
|
|
"\n",
|
|
" cout << \"Matrix size: \" << N << \" x \" << N << endl;\n",
|
|
" int size = N * N * sizeof(float);\n",
|
|
"\n",
|
|
" // Host allocation and initialisation\n",
|
|
" float *hA = new float[N * N];\n",
|
|
" float *hB = new float[N * N];\n",
|
|
" float *hC = new float[N * N];\n",
|
|
"\n",
|
|
" for (int j = 0; j < N; j++) {\n",
|
|
" for (int i = 0; i < N; i++) {\n",
|
|
" hA[j * N + i] = 2;\n",
|
|
" hB[j * N + i] = 4;\n",
|
|
" }\n",
|
|
" }\n",
|
|
"\n",
|
|
" cout << \"\\nMatrix A:\\n\";\n",
|
|
" for (int row = 0; row < N; row++) {\n",
|
|
" for (int col = 0; col < N; col++)\n",
|
|
" cout << hA[row * N + col] << \" \";\n",
|
|
" cout << endl;\n",
|
|
" }\n",
|
|
"\n",
|
|
" cout << \"\\nMatrix B:\\n\";\n",
|
|
" for (int row = 0; row < N; row++) {\n",
|
|
" for (int col = 0; col < N; col++)\n",
|
|
" cout << hB[row * N + col] << \" \";\n",
|
|
" cout << endl;\n",
|
|
" }\n",
|
|
"\n",
|
|
" // Device allocation and transfer\n",
|
|
" float *dA, *dB, *dC;\n",
|
|
" cudaMalloc(&dA, size);\n",
|
|
" cudaMalloc(&dB, size);\n",
|
|
" cudaMalloc(&dC, size);\n",
|
|
"\n",
|
|
" cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
|
|
" cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
|
|
"\n",
|
|
" // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.\n",
|
|
" // grid: K x K blocks; because N = K * BLOCK_SIZE this is exactly N x N threads (one per output element), so no thread falls outside the matrix.\n",
|
|
" dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);\n",
|
|
" dim3 grid(K, K);\n",
|
|
" matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);\n",
|
|
"\n",
|
|
" cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
|
|
"\n",
|
|
" cout << \"\\nResult C = A * B:\\n\";\n",
|
|
" for (int row = 0; row < N; row++) {\n",
|
|
" for (int col = 0; col < N; col++)\n",
|
|
" cout << hC[row * N + col] << \" \";\n",
|
|
" cout << endl;\n",
|
|
" }\n",
|
|
"\n",
|
|
" delete[] hA;\n",
|
|
" delete[] hB;\n",
|
|
" delete[] hC;\n",
|
|
" cudaFree(dA);\n",
|
|
" cudaFree(dB);\n",
|
|
" cudaFree(dC);\n",
|
|
"}\n",
|
|
"\n",
|
|
"int main() {\n",
|
|
" runVectorAddition();\n",
|
|
" runMatrixMultiplication();\n",
|
|
"\n",
|
|
" cout << \"\\nFinished.\" << endl;\n",
|
|
" return 0;\n",
|
|
"}\n"
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "nvCj8UmhIh3o",
|
|
"outputId": "54a31aec-f860-4b72-a4e1-03a392def6f6"
|
|
},
|
|
"execution_count": 23,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"Overwriting cuda_program.cu\n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [
|
|
"!nvcc cuda_program.cu -o cuda_program && ./cuda_program"
|
|
],
|
|
"metadata": {
|
|
"colab": {
|
|
"base_uri": "https://localhost:8080/"
|
|
},
|
|
"id": "F7fC0LtbJ5o8",
|
|
"outputId": "9d5988b2-ad42-4b0b-c84d-c1698e778bb9"
|
|
},
|
|
"execution_count": 24,
|
|
"outputs": [
|
|
{
|
|
"output_type": "stream",
|
|
"name": "stdout",
|
|
"text": [
|
|
"nvcc warning : Support for offline compilation for architectures prior to '<compute/sm/lto>_75' will be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n",
|
|
"\n",
|
|
"=== Vector Addition ===\n",
|
|
"Enter vector size: 2\n",
|
|
"\n",
|
|
"Vector A: 0 1 \n",
|
|
"Vector B: 0 2 \n",
|
|
"Result A + B: 0 3 \n",
|
|
"\n",
|
|
"=== Matrix Multiplication ===\n",
|
|
"Enter K (matrix will be N x N where N = K * 2): 3\n",
|
|
"Matrix size: 6 x 6\n",
|
|
"\n",
|
|
"Matrix A:\n",
|
|
"2 2 2 2 2 2 \n",
|
|
"2 2 2 2 2 2 \n",
|
|
"2 2 2 2 2 2 \n",
|
|
"2 2 2 2 2 2 \n",
|
|
"2 2 2 2 2 2 \n",
|
|
"2 2 2 2 2 2 \n",
|
|
"\n",
|
|
"Matrix B:\n",
|
|
"4 4 4 4 4 4 \n",
|
|
"4 4 4 4 4 4 \n",
|
|
"4 4 4 4 4 4 \n",
|
|
"4 4 4 4 4 4 \n",
|
|
"4 4 4 4 4 4 \n",
|
|
"4 4 4 4 4 4 \n",
|
|
"\n",
|
|
"Result C = A * B:\n",
|
|
"48 48 48 48 48 48 \n",
|
|
"48 48 48 48 48 48 \n",
|
|
"48 48 48 48 48 48 \n",
|
|
"48 48 48 48 48 48 \n",
|
|
"48 48 48 48 48 48 \n",
|
|
"48 48 48 48 48 48 \n",
|
|
"\n",
|
|
"Finished.\n"
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"source": [],
|
|
"metadata": {
|
|
"id": "HjhrulSNKHkq"
|
|
},
|
|
"execution_count": null,
|
|
"outputs": []
|
|
}
|
|
]
|
|
} |