# Practical-4 (CUDA Programs for Addition and Multiplication)

Problem Statement: Write a CUDA Program for:

1. Addition of two large vectors
2. Matrix Multiplication using CUDA C

---

## Prerequisites

1. Open [Google Colab](https://colab.research.google.com/)
2. Create a new Jupyter Notebook

---

## Steps

### 1. After creating a new Jupyter notebook, click on "Runtime" in the navbar:

![Runtime in navbar in Google Colab](attachments/runtime-navbar.png)

### 2. Then, choose "Change runtime type":

![Change runtime type option in Runtime section on Google Colab](attachments/change-runtime.png)

### 3. Select "T4 GPU", and save:

![T4 GPU option selected in Google Colab as Runtime](attachments/select-t4-gpu.png)

### 4. Check if `nvcc` is installed:

```python3
!nvcc --version
```

### 5. Install `nvcc4jupyter`:

```python3
!pip install nvcc4jupyter
# Or if the above command fails, comment the above line and run
# !pip install git+https://git.kska.io/notkshitij/nvcc.git
```

### 6. Load it:

```python3
%load_ext nvcc4jupyter
```

### 7. Paste the below code in a new code block:

```cu
%%writefile cuda_program.cu
#include <iostream>
#include <cuda_runtime.h>
using namespace std;

// Threads per block along each dimension. Deliberately tiny so that small
// inputs still exercise several blocks.
#define BLOCK_SIZE 2

// Vector Addition Kernel
// Each thread computes a single element of C = A + B.
// Expected launch: 1D grid of 1D blocks with at least N threads in total.
__global__ void vectorAdd(int *A, int *B, int *C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against threads beyond the vector size (when N is not a multiple
    // of the block size, some threads in the last block are out of range).
    if (i < N)
        C[i] = A[i] + B[i];
}

// Matrix Multiplication Kernel
// Each thread computes a single element of C = A * B.
// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.
__global__ void matrixMul(const float *A, const float *B, float *C, int N) {
    // Expected launch: 2D grid of 2D blocks with one thread per element of C.
    // A, B and C are N x N matrices indexed row-major (element = row * N + col).
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard against threads outside the matrix so the kernel stays correct
    // even when the grid covers more than N x N threads.
    if (row >= N || col >= N)
        return;

    // Index with size_t: row * N + col no longer fits in an int once N > 46340.
    const size_t stride = static_cast<size_t>(N);
    float sum = 0.0f;
    for (size_t n = 0; n < stride; ++n)
        sum += A[row * stride + n] * B[n * stride + col];
    C[row * stride + col] = sum;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise, so calls can be
// chained with && and stop at the first failure.
static bool cudaOk(cudaError_t status, const char *what) {
    if (status == cudaSuccess)
        return true;
    cerr << "CUDA error during " << what << ": "
         << cudaGetErrorString(status) << endl;
    return false;
}

// Prints an N x N row-major matrix, one row per line, each element followed
// by a space.
static void printMatrix(const float *M, int N) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            cout << M[static_cast<size_t>(row) * N + col] << " ";
        cout << endl;
    }
}

// Vector Addition
// Reads the vector length from stdin, fills A[i] = i and B[i] = 2 * i on the
// host, adds them on the GPU and prints the inputs and the result.
// Invalid input or any CUDA failure is reported on stderr and the function
// returns without printing a result.
void runVectorAddition() {
    int N = 0;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    cin >> N;
    if (!cin || N <= 0) {
        cerr << "Invalid vector size; expected a positive integer." << endl;
        return;
    }

    // Byte count in size_t so that large N cannot overflow an int.
    const size_t size = static_cast<size_t>(N) * sizeof(int);

    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];
    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }

    cout << "\nVector A: ";
    for (int i = 0; i < N; i++)
        cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++)
        cout << hB[i] << " ";
    cout << endl;

    // Device allocation and transfer. Pointers start as nullptr so the
    // cleanup below is safe even if an allocation fails part-way through.
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    bool ok = cudaOk(cudaMalloc(&dA, size), "cudaMalloc(dA)") &&
              cudaOk(cudaMalloc(&dB, size), "cudaMalloc(dB)") &&
              cudaOk(cudaMalloc(&dC, size), "cudaMalloc(dC)") &&
              cudaOk(cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice),
                     "copy of A to device") &&
              cudaOk(cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice),
                     "copy of B to device");

    if (ok) {
        // Launch with enough blocks to cover all N elements.
        // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.
        int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);

        // cudaGetLastError() catches a rejected launch configuration; the
        // blocking copy back surfaces errors raised while the kernel ran.
        ok = cudaOk(cudaGetLastError(), "vectorAdd launch") &&
             cudaOk(cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost),
                    "copy of C to host");
    }

    if (ok) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++)
            cout << hC[i] << " ";
        cout << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}

// Matrix Multiplication
// Reads K from stdin, builds two N x N matrices (N = K * BLOCK_SIZE) filled
// with 2 and 4 respectively, multiplies them on the GPU and prints the inputs
// and the product.
// Invalid input or any CUDA failure is reported on stderr and the function
// returns without printing a result.
void runMatrixMultiplication() {
    // The grid is K x K blocks and CUDA limits the y-dimension of a grid to
    // 65535 blocks, so larger K could never be launched.
    const int maxK = 65535;

    int K = 0;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
    cin >> K;
    if (!cin || K <= 0 || K > maxK) {
        cerr << "Invalid K; expected an integer between 1 and " << maxK << "." << endl;
        return;
    }

    const int N = K * BLOCK_SIZE;
    cout << "Matrix size: " << N << " x " << N << endl;

    // Element and byte counts in size_t: N * N overflows an int for N > 46340.
    const size_t count = static_cast<size_t>(N) * N;
    const size_t size = count * sizeof(float);

    // Host allocation and initialisation
    float *hA = new float[count];
    float *hB = new float[count];
    float *hC = new float[count];
    for (size_t idx = 0; idx < count; idx++) {
        hA[idx] = 2;
        hB[idx] = 4;
    }

    cout << "\nMatrix A:\n";
    printMatrix(hA, N);
    cout << "\nMatrix B:\n";
    printMatrix(hB, N);

    // Device allocation and transfer. Pointers start as nullptr so the
    // cleanup below is safe even if an allocation fails part-way through.
    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    bool ok = cudaOk(cudaMalloc(&dA, size), "cudaMalloc(dA)") &&
              cudaOk(cudaMalloc(&dB, size), "cudaMalloc(dB)") &&
              cudaOk(cudaMalloc(&dC, size), "cudaMalloc(dC)") &&
              cudaOk(cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice),
                     "copy of A to device") &&
              cudaOk(cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice),
                     "copy of B to device");

    if (ok) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);

        // cudaGetLastError() catches a rejected launch configuration; the
        // blocking copy back surfaces errors raised while the kernel ran.
        ok = cudaOk(cudaGetLastError(), "matrixMul launch") &&
             cudaOk(cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost),
                    "copy of C to host");
    }

    if (ok) {
        cout << "\nResult C = A * B:\n";
        printMatrix(hC, N);
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}

// Runs the vector-addition demo followed by the matrix-multiplication demo.
int main() {
    runVectorAddition();
    runMatrixMultiplication();
    cout << "\nFinished." << endl;
    return 0;
}
```

### 8.
Compile and run:

```python3
!nvcc cuda_program.cu -o cuda_program && ./cuda_program
```

---

## Sample output

```md
=== Vector Addition ===
Enter vector size: 2

Vector A: 0 1
Vector B: 0 2
Result A + B: 0 3

=== Matrix Multiplication ===
Enter K (matrix will be N x N where N = K * 2): 3
Matrix size: 6 x 6

Matrix A:
2 2 2 2 2 2
2 2 2 2 2 2
2 2 2 2 2 2
2 2 2 2 2 2
2 2 2 2 2 2
2 2 2 2 2 2

Matrix B:
4 4 4 4 4 4
4 4 4 4 4 4
4 4 4 4 4 4
4 4 4 4 4 4
4 4 4 4 4 4
4 4 4 4 4 4

Result C = A * B:
48 48 48 48 48 48
48 48 48 48 48 48
48 48 48 48 48 48
48 48 48 48 48 48
48 48 48 48 48 48
48 48 48 48 48 48

Finished.
```

---