add answers for may-june 2025 + november-december 2025 pyqs for unit 6 (High Performance Computing Applications)

add answers for may-june 2025 + november-december 2025 pyqs for unit 5 (CUDA Architecture)
add answers for may-june 2025 + november-december 2025 pyqs for unit 4 (Analytical Modeling of Parallel Programs)
2026-05-26 01:41:12 +05:30 · 2026-05-26 01:34:07 +05:30 · 2026-05-26 01:24:24 +05:30 · 2026-05-26 01:15:06 +05:30 · 2026-05-25 23:20:03 +05:30 · 2026-05-21 19:33:27 +05:30
27 changed files with 1340 additions and 0 deletions
@@ -0,0 +1,195 @@
+// Code-1 (Parallel BFS and DFS)
+
+/*
+ * THIS CODE HAS BEEN TESTED AND IS FULLY OPERATIONAL.
+ *
+ * Problem Statement: 
+ *  Design and implement Parallel Breadth First Search and
+ *  Depth First Search based on existing algorithms using OpenMP.
+ *  Use a Tree or an undirected graph for BFS and DFS.
+ *
+ * Code from HighPerformanceComputing (SPPU - Final Year - Computer Engineering - Content)
+ * repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/HighPerformanceComputing
+ **/
+
+/*
+ * EXECUTION INSTRUCTIONS (Debian-based distributions):
+ *
+ * i) Install g++ with OpenMP support:
+ *   sudo apt update
+ *   sudo apt install g++
+ *
+ * ii) Compile:
+ *   g++ -fopenmp Code-1.cpp -o Code-1
+ *
+ * iii) Execute:
+ *   ./Code-1
+ **/
+
+// BEGINNING OF CODE
+#include <iostream>
+#include <vector>
+#include <omp.h>
+
+using namespace std;
+
+// Undirected graph stored as adjacency lists, with BFS and DFS traversals
+// whose neighbour expansion is parallelised with OpenMP.
+// Vertices are the integers 0 .. V-1. No method range-checks its arguments,
+// so callers must pass valid vertex ids.
+class Graph {
+    int V;                    // number of vertices
+    vector<vector<int>> adj;  // adj[u] holds the neighbours of vertex u
+
+public:
+    // Creates a graph with V vertices and no edges.
+    Graph(int V) : V(V), adj(V) {}
+
+    // Adds the undirected edge u-v by recording each endpoint in the other's
+    // adjacency list.
+    void addEdge(int u, int v) {
+        adj[u].push_back(v);
+        adj[v].push_back(u);
+    }
+
+    // Level-synchronous BFS from `start`: all nodes at the current depth (the
+    // "frontier") are printed, then expanded in parallel to build the next
+    // frontier. Nodes are printed level by level; the order of nodes inside
+    // one level depends on the order in which threads merge their results.
+    void parallelBFS(int start) {
+        vector<bool> visited(V, false);
+        vector<int> frontier;
+
+        visited[start] = true;
+        frontier.push_back(start);
+
+        cout << "Parallel BFS from node " << start << ": ";
+
+        while (!frontier.empty()) {
+            for (int u : frontier)
+                cout << u << " ";
+
+            vector<int> next_frontier;
+            const int frontier_size = static_cast<int>(frontier.size());
+
+            // Each thread accumulates its own local candidates to avoid
+            // contention on the shared next_frontier vector.
+            #pragma omp parallel
+            {
+                vector<int> local_next;
+
+                // nowait: threads that finish early skip the barrier at the
+                // end of the loop and go straight to the merge below.
+                // schedule(dynamic): iterations are handed out on demand,
+                // which helps when adjacency list sizes vary across nodes.
+                #pragma omp for nowait schedule(dynamic)
+                for (int i = 0; i < frontier_size; i++) {
+                    for (int v : adj[frontier[i]]) {
+                        // The check-and-set on visited[] must be one atomic
+                        // step, otherwise two threads could both see
+                        // visited[v]==false and both enqueue v. Every access
+                        // to visited[] in the region happens under this lock.
+                        bool should_visit = false;
+                        #pragma omp critical(graph_bfs_visited)
+                        {
+                            if (!visited[v]) {
+                                visited[v] = true;
+                                should_visit = true;
+                            }
+                        }
+                        // local_next is thread-private so no lock needed here.
+                        if (should_visit)
+                            local_next.push_back(v);
+                    }
+                }
+
+                // Merge: one thread at a time appends its local results.
+                // The critical sections are *named*: all unnamed critical
+                // sections share a single global lock, so distinct names are
+                // what keeps the merge from serializing against the
+                // visited[] check above.
+                #pragma omp critical(graph_bfs_merge)
+                {
+                    next_frontier.insert(next_frontier.end(),
+                                         local_next.begin(),
+                                         local_next.end());
+                }
+            } // implicit barrier: all threads finish before frontier is swapped
+
+            // Swap instead of copy-assigning: the old frontier is no longer
+            // needed and is released when next_frontier goes out of scope.
+            frontier.swap(next_frontier);
+        }
+
+        cout << endl;
+    }
+
+    // Iterative DFS from `start` using a vector as an explicit stack
+    // (push_back/pop_back). The stack and the writes to visited[] are only
+    // touched by the thread running the outer loop; the parallel region
+    // merely filters the neighbours of the node that was just popped.
+    void parallelDFS(int start) {
+        vector<bool> visited(V, false);
+        vector<int> stack;
+
+        stack.push_back(start);
+
+        cout << "Parallel DFS from node " << start << ": ";
+
+        while (!stack.empty()) {
+            int u = stack.back();
+            stack.pop_back();
+
+            // A node can be pushed several times before it is popped (it is
+            // only marked visited here, not when it is pushed). This guard
+            // ensures it is processed only once.
+            if (visited[u]) continue;
+            visited[u] = true;
+            cout << u << " ";
+
+            vector<int> to_push;
+            const vector<int>& neighbours = adj[u];
+            const int degree = static_cast<int>(neighbours.size());
+
+            #pragma omp parallel
+            {
+                vector<int> local_push;
+
+                #pragma omp for nowait schedule(dynamic)
+                for (int i = 0; i < degree; i++) {
+                    // visited[] is only read inside the region, never
+                    // written, so no critical section is needed here.
+                    if (!visited[neighbours[i]])
+                        local_push.push_back(neighbours[i]);
+                }
+
+                // Named so this merge does not share the global lock used by
+                // unnamed critical sections elsewhere in the program.
+                #pragma omp critical(graph_dfs_merge)
+                {
+                    to_push.insert(to_push.end(),
+                                   local_push.begin(),
+                                   local_push.end());
+                }
+            }
+
+            // Append in merge order; the last appended node is explored next.
+            stack.insert(stack.end(), to_push.begin(), to_push.end());
+        }
+
+        cout << endl;
+    }
+};
+
+int main() {
+    Graph g(6);
+
+    g.addEdge(0, 1);
+    g.addEdge(0, 2);
+    g.addEdge(1, 3);
+    g.addEdge(1, 4);
+    g.addEdge(2, 5);
+
+    g.parallelBFS(0);
+    g.parallelDFS(0);
+
+    return 0;
+}
+// END OF CODE
+
+/*
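+EXAMPLE OUTPUT (the order of nodes within a BFS level, and the DFS order, can vary between runs because it depends on thread scheduling):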
+
+$ ./Code-1 
+Parallel BFS from node 0: 0 1 2 5 3 4 
+Parallel DFS from node 0: 0 2 5 1 4 3 
+*/
+
@@ -0,0 +1,209 @@
+// Code-2 (Parallel Bubble Sort and Merge Sort)
+
+/*
+ * THIS CODE HAS BEEN TESTED AND IS FULLY OPERATIONAL.
+ *
+ * Problem Statement:
+ *  Write a program to implement Parallel Bubble Sort and Merge sort using OpenMP.
+ *  Use existing algorithms and measure the performance of sequential and parallel algorithms.
+ *
+ * Code from HighPerformanceComputing (SPPU - Final Year - Computer Engineering - Content)
+ * repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/HighPerformanceComputing
+ **/
+
+/*
+ * EXECUTION INSTRUCTIONS (Debian-based distributions):
+ *
+ * i) Install g++ with OpenMP support:
+ *   sudo apt update
+ *   sudo apt install g++
+ *
+ * ii) Compile:
+ *   g++ -fopenmp Code-2.cpp -o Code-2
+ *
+ * iii) Execute:
+ *   ./Code-2
+ **/
+
+// BEGINNING OF CODE
+#include <iostream>
+#include <vector>
+#include <cstdlib>
+#include <omp.h>
+
+using namespace std;
+
+// Writes every element of arr to stdout, each followed by one space, then
+// ends the line (which also flushes the stream).
+void printArray(const vector<int>& arr) {
+    for (auto it = arr.begin(); it != arr.end(); ++it) {
+        cout << *it << " ";
+    }
+    cout << endl;
+}
+
+// Bubble Sort
+
+// Sequential bubble sort (ascending order).
+// Each pass walks the unsorted prefix and swaps out-of-order neighbours, so
+// the largest remaining value settles at index `last`; the prefix then
+// shrinks by one element for the next pass.
+void sequentialBubbleSort(vector<int>& arr) {
+    const int count = arr.size();
+    for (int last = count - 1; last > 0; --last) {
+        for (int pos = 0; pos < last; ++pos) {
+            if (arr[pos + 1] < arr[pos])
+                swap(arr[pos], arr[pos + 1]);
+        }
+    }
+}
+
+// Parallel bubble sort using odd-even transposition (ascending order).
+// Standard bubble sort cannot be parallelized directly: thread on index j
+// and thread on index j+1 would both touch arr[j+1] simultaneously (data race).
+// Odd-even transposition alternates between two phases:
+//   even phase: compare pairs (0,1), (2,3), (4,5), ...
+//   odd phase:  compare pairs (1,2), (3,4), (5,6), ...
+// Within each phase every pair is independent, so threads never share
+// elements. n phases are executed for an array of n elements.
+void parallelBubbleSort(vector<int>& arr) {
+    const int n = static_cast<int>(arr.size());
+
+    // One thread team is created for the whole sort instead of one per
+    // phase, so the fork/join cost is paid once rather than n times. Every
+    // thread runs the phase loop; `phase` is declared inside the region and
+    // is therefore private to each thread.
+    #pragma omp parallel
+    {
+        for (int phase = 0; phase < n; phase++) {
+            // phase % 2 selects the even (0) or odd (1) phase and is also
+            // the index of the first pair compared in that phase.
+            #pragma omp for
+            for (int j = phase % 2; j < n - 1; j += 2) {
+                if (arr[j] > arr[j + 1])
+                    swap(arr[j], arr[j + 1]);
+            }
+            // The implicit barrier at the end of `omp for` keeps the phases
+            // separated: no thread starts phase k+1 before phase k is done.
+        }
+    }
+}
+
+// Merge Sort
+
+// Merges the adjacent sorted runs arr[left..mid] and arr[mid+1..right]
+// (inclusive bounds) into a single sorted run stored back in arr[left..right].
+// Both runs are copied into temporary buffers first, so the merge uses extra
+// memory proportional to the size of the merged range.
+void merge(vector<int>& arr, int left, int mid, int right) {
+    const int lowerLen = mid - left + 1;
+    const int upperLen = right - mid;
+
+    const vector<int> lower(arr.begin() + left, arr.begin() + left + lowerLen);
+    const vector<int> upper(arr.begin() + mid + 1, arr.begin() + mid + 1 + upperLen);
+
+    int a = 0;       // next unread element of `lower`
+    int b = 0;       // next unread element of `upper`
+    int out = left;  // next slot of arr to fill
+
+    while (a < lowerLen && b < upperLen) {
+        // Ties are taken from the lower run first.
+        if (upper[b] < lower[a])
+            arr[out++] = upper[b++];
+        else
+            arr[out++] = lower[a++];
+    }
+
+    // At most one of the two runs still has elements left.
+    for (; a < lowerLen; ++a) arr[out++] = lower[a];
+    for (; b < upperLen; ++b) arr[out++] = upper[b];
+}
+
+// Recursive top-down merge sort of arr[left..right] (inclusive bounds).
+// A range holding fewer than two elements is left untouched.
+void sequentialMergeSort(vector<int>& arr, int left, int right) {
+    if (left < right) {
+        // Midpoint written this way to avoid overflowing left + right.
+        const int mid = left + (right - left) / 2;
+        sequentialMergeSort(arr, left, mid);
+        sequentialMergeSort(arr, mid + 1, right);
+        merge(arr, left, mid, right);
+    }
+}
+
+// Recursive worker of the task-based parallel merge sort: sorts
+// arr[left..right] (inclusive bounds).
+//
+// "#pragma omp parallel sections" inside a recursive function would open a
+// nested parallel region at every level of recursion, which is costly.
+// Tasks are lighter: the runtime schedules them on the threads of the
+// already-running team, so this function is intended to be called from
+// inside a parallel region.
+//
+// depth: how many more recursion levels may spawn tasks. Once it reaches 0
+//        the two halves are sorted sequentially, so tasks are not created
+//        for pieces of work too small to pay for the tasking overhead.
+void parallelMergeSortHelper(vector<int>& arr, int left, int right, int depth) {
+    if (left >= right) return;
+    int mid = left + (right - left) / 2;
+
+    if (depth <= 0) {
+        // Below the cutoff: sort both halves on the current thread.
+        sequentialMergeSort(arr, left, mid);
+        sequentialMergeSort(arr, mid + 1, right);
+    } else {
+        // shared(arr) is required for correctness: under the OpenMP
+        // data-sharing rules a by-reference formal argument referenced in an
+        // orphaned task construct is implicitly firstprivate, so without the
+        // clause each task may sort a private copy of the vector and leave
+        // the caller's data untouched. left/mid/right/depth stay
+        // firstprivate, which captures their values at task creation.
+        #pragma omp task shared(arr)
+        parallelMergeSortHelper(arr, left, mid, depth - 1);
+
+        #pragma omp task shared(arr)
+        parallelMergeSortHelper(arr, mid + 1, right, depth - 1);
+
+        // Wait for both child tasks to finish before merging their results.
+        #pragma omp taskwait
+    }
+
+    merge(arr, left, mid, right);
+}
+
+// Entry point of the task-based parallel merge sort: sorts arr[left..right]
+// (inclusive bounds).
+//
+// max_depth: number of recursion levels that spawn tasks; at most
+//            2^max_depth subranges are sorted concurrently. Defaults to 4
+//            (up to 16 leaf tasks).
+void parallelMergeSort(vector<int>& arr, int left, int right, int max_depth = 4) {
+    // The parallel directive creates one thread team for the entire sort.
+    // All recursive tasks run on this team instead of creating new ones.
+    #pragma omp parallel
+    {
+        // single makes exactly one thread start the recursion; the other
+        // threads wait at the end of the single construct, where they can
+        // pick up the tasks as they are created.
+        #pragma omp single
+        parallelMergeSortHelper(arr, left, right, max_depth);
+    }
+}
+
+// Main function
+
+// Benchmark driver: fills an array with pseudo-random values, runs the
+// sequential and parallel variants of bubble sort and merge sort on
+// identical copies of it, and prints each wall-clock time and the speedup.
+int main() {
+    int n = 10000; // Adjust this to specify the number of elements.
+    vector<int> arr(n);
+
+    for (int& slot : arr)
+        slot = rand() % 10000;
+
+    // --- Sequential Bubble Sort ---
+    vector<int> seqArr = arr;
+    double t0 = omp_get_wtime();
+    sequentialBubbleSort(seqArr);
+    double time_seq_bubble = omp_get_wtime() - t0;
+    cout << "Sequential Bubble Sort time: " << time_seq_bubble << " seconds" << endl;
+
+    // --- Parallel Bubble Sort ---
+    vector<int> parArr = arr;
+    t0 = omp_get_wtime();
+    parallelBubbleSort(parArr);
+    double time_par_bubble = omp_get_wtime() - t0;
+    cout << "Parallel Bubble Sort time: " << time_par_bubble << " seconds" << endl;
+
+    cout << "Bubble Sort Speedup (Sequential / Parallel) = " << (time_seq_bubble / time_par_bubble) << "x" << endl;
+
+    // --- Sequential Merge Sort ---
+    // Start again from the unsorted input so every run sorts the same data.
+    seqArr = arr;
+    t0 = omp_get_wtime();
+    sequentialMergeSort(seqArr, 0, n - 1);
+    double time_seq_merge = omp_get_wtime() - t0;
+    cout << "\nSequential Merge Sort time: " << time_seq_merge << " seconds" << endl;
+
+    // --- Parallel Merge Sort ---
+    parArr = arr;
+    t0 = omp_get_wtime();
+    parallelMergeSort(parArr, 0, n - 1);
+    double time_par_merge = omp_get_wtime() - t0;
+    cout << "Parallel Merge Sort time: " << time_par_merge << " seconds" << endl;
+
+    cout << "Merge Sort Speedup (Sequential / Parallel) = " << (time_seq_merge / time_par_merge) << "x" << endl;
+
+    return 0;
+}
+// END OF CODE
+
+/*
+EXAMPLE OUTPUT (when n=10000):
+
+$ ./Code-2 
+Sequential Bubble Sort time: 0.955394 seconds
+Parallel Bubble Sort time: 0.282093 seconds
+Bubble Sort Speedup (Sequential / Parallel) = 3.38681x
+
+Sequential Merge Sort time: 0.0116294 seconds
+Parallel Merge Sort time: 0.00282529 seconds
+Merge Sort Speedup (Sequential / Parallel) = 4.11618x
+*/
+
@@ -0,0 +1,132 @@
+// Code-3 (Min, Max, Sum and Average Operations)
+
+/*
+ * THIS CODE HAS BEEN TESTED AND IS FULLY OPERATIONAL.
+ *
+ * Problem Statement: Implement Min, Max, Sum and Average operations using Parallel Reduction.
+ *
+ * Code from HighPerformanceComputing (SPPU - Final Year - Computer Engineering - Content)
+ * repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/HighPerformanceComputing
+ **/
+
+/*
+ * EXECUTION INSTRUCTIONS (Debian-based distributions):
+ *
+ * i) Install g++ with OpenMP support:
+ *   sudo apt update
+ *   sudo apt install g++
+ *
+ * ii) Compile:
+ *   g++ -fopenmp Code-3.cpp -o Code-3
+ *
+ * iii) Execute:
+ *   ./Code-3
+ **/
+
+// BEGINNING OF CODE
+#include <iostream>
+#include <vector>
+#include <omp.h>
+#include <cstdlib>
+
+using namespace std;
+
+// Computes min, max, sum and average of one million pseudo-random integers
+// twice - with a plain loop and with an OpenMP reduction - then prints both
+// result sets, both wall-clock times and the speedup.
+int main() {
+    // Uncomment to manually control thread count
+    // omp_set_num_threads(4);
+
+    // --- Input ---
+    int n = 1000000;
+    vector<int> nums(n);
+
+    for (int& slot : nums)
+        slot = rand() % 10000;
+
+    cout << "Input: " << n << " random integers in the range [0, 9999]." << endl << endl;
+
+    // --- Sequential ---
+    // The sums are long long: 1,000,000 values of up to 9,999 can add up to
+    // roughly 10 billion, beyond the ~2.1 billion a 32-bit int can hold.
+    int min_seq = nums[0];
+    int max_seq = nums[0];
+    long long sum_seq = 0;
+
+    double t0 = omp_get_wtime();
+    for (int i = 0; i < n; i++) {
+        min_seq = (nums[i] < min_seq) ? nums[i] : min_seq;
+        max_seq = (nums[i] > max_seq) ? nums[i] : max_seq;
+        sum_seq += nums[i];
+    }
+    double time_seq = omp_get_wtime() - t0;
+
+    // The average is derived outside the timed section in both versions.
+    double avg_seq = (double)sum_seq / n;
+
+    // --- Parallel ---
+    int min_par = nums[0];
+    int max_par = nums[0];
+    long long sum_par = 0;
+
+    t0 = omp_get_wtime();
+    // Each reduction clause gives every thread a private copy of the
+    // variable and combines the copies (min / max / +) when the loop ends,
+    // so the loop body needs no critical section.
+    #pragma omp parallel for reduction(min: min_par) reduction(max: max_par) reduction(+: sum_par)
+    for (int i = 0; i < n; i++) {
+        min_par = (nums[i] < min_par) ? nums[i] : min_par;
+        max_par = (nums[i] > max_par) ? nums[i] : max_par;
+        sum_par += nums[i];
+    }
+    double time_par = omp_get_wtime() - t0;
+
+    double avg_par = (double)sum_par / n;
+
+    // --- Output ---
+    cout << "--- Sequential Computation ---" << endl;
+    cout << "Minimum  : " << min_seq << endl;
+    cout << "Maximum  : " << max_seq << endl;
+    cout << "Sum      : " << sum_seq << endl;
+    cout << "Average  : " << avg_seq << endl;
+    cout << "Time     : " << time_seq << " seconds" << endl;
+
+    cout << "\n--- Parallel Computation ---" << endl;
+    cout << "Minimum  : " << min_par << endl;
+    cout << "Maximum  : " << max_par << endl;
+    cout << "Sum      : " << sum_par << endl;
+    cout << "Average  : " << avg_par << endl;
+    cout << "Time     : " << time_par << " seconds" << endl;
+
+    cout << "\nSpeedup (Sequential / Parallel) = " << (time_seq / time_par) << "x" << endl;
+
+    return 0;
+}
+// END OF CODE
+
+/*
+EXAMPLE OUTPUT:
+
+$ ./Code-3 
+Input: 1000000 random integers in the range [0, 9999].
+
+--- Sequential Computation ---
+Minimum  : 0
+Maximum  : 9999
+Sum      : 5000491283
+Average  : 5000.49
+Time     : 0.0205385 seconds
+
+--- Parallel Computation ---
+Minimum  : 0
+Maximum  : 9999
+Sum      : 5000491283
+Average  : 5000.49
+Time     : 0.0135714 seconds
+
+Speedup (Sequential / Parallel) = 1.51336x
+*/
@@ -0,0 +1,3 @@
+attachments/change-runtime.png filter=lfs diff=lfs merge=lfs -text
+attachments/runtime-navbar.png filter=lfs diff=lfs merge=lfs -text
+attachments/select-t4-gpu.png filter=lfs diff=lfs merge=lfs -text
@@ -0,0 +1,267 @@
+# Practical-4 (Vector Addition and Matrix Multiplication)
+
+Problem Statement:
+Write a CUDA Program for:
+1. Addition of two large vectors
+2. Matrix Multiplication using CUDA C
+
+---
+
+## Prerequisites
+
+1. Open [Google Colab](https://colab.research.google.com/)
+2. Create a new Jupyter Notebook
+
+---
+
+## Steps
+
+### 1. After creating a new Jupyter notebook, click on "Runtime" in the navbar:
+
+<img src="attachments/runtime-navbar.png" alt="Runtime in navbar in Google Colab" width=350>
+
+### 2. Then, choose "Change runtime type":
+
+<img src="attachments/change-runtime.png" alt="Change runtime type option in Runtime section on Google Colab" width=300>
+
+### 3. Select "T4 GPU", and save:
+
+<img src="attachments/select-t4-gpu.png" alt="T4 GPU option selected in Google Colab as Runtime" width=300>
+
+### 4. Check if `nvcc` is installed:
+
+```python3
+!nvcc --version
+```
+
+### 5. Install `nvcc4jupyter`:
+
+```python3
+!pip install nvcc4jupyter
+# Or if the above command fails, comment the above line and run
+# !pip install git+https://git.kska.io/notkshitij/nvcc.git
+```
+
+### 6. Load it:
+
+```python3
+%load_ext nvcc4jupyter
+```
+
+### 7. Paste the below code in a new code block:
+
+```cu
+%%writefile cuda_program.cu
+#include <iostream>
+#include <cuda.h>
+
+using namespace std;
+
+#define BLOCK_SIZE 2
+
+// Vector Addition Kernel
+// Element-wise sum C = A + B over N elements, one element per thread.
+__global__ void vectorAdd(int *A, int *B, int *C, int N) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // When N is not a multiple of the block size, the last block contains
+    // threads whose index lies past the end of the vectors; they do nothing.
+    if (idx >= N)
+        return;
+
+    C[idx] = A[idx] + B[idx];
+}
+
+// Matrix Multiplication Kernel
+// Computes C = A * B for row-major N x N matrices, one output element per
+// thread: thread (row, col) accumulates the dot product of row `row` of A
+// with column `col` of B.
+__global__ void matrixMul(float *A, float *B, float *C, int N) {
+    int row = blockIdx.y * blockDim.y + threadIdx.y;
+    int col = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Threads that fall outside the matrix must not touch memory. This makes
+    // the kernel safe for any launch configuration that covers the matrix,
+    // not only for N being an exact multiple of the block size.
+    if (row >= N || col >= N)
+        return;
+
+    float sum = 0.0f;
+    for (int n = 0; n < N; ++n)
+        sum += A[row * N + n] * B[n * N + col];
+
+    C[row * N + col] = sum;
+}
+
+// Vector Addition
+// Reads the vector size N from stdin, fills A[i] = i and B[i] = 2 * i on the
+// host, adds them on the GPU with vectorAdd and prints A, B and the result.
+// An unreadable or non-positive size is reported and the demo is skipped.
+void runVectorAddition() {
+    int N;
+    cout << "\n=== Vector Addition ===" << endl;
+    cout << "Enter vector size: ";
+    if (!(cin >> N) || N <= 0) {
+        cout << "\nInvalid vector size; skipping vector addition." << endl;
+        return;
+    }
+
+    // Byte count computed in size_t so it cannot overflow int for large N.
+    const size_t size = static_cast<size_t>(N) * sizeof(int);
+
+    // Host allocation and initialisation
+    int *hA = new int[N];
+    int *hB = new int[N];
+    int *hC = new int[N];
+
+    for (int i = 0; i < N; i++) {
+        hA[i] = i;
+        hB[i] = i * 2;
+    }
+
+    cout << "\nVector A: ";
+    for (int i = 0; i < N; i++) cout << hA[i] << " ";
+    cout << "\nVector B: ";
+    for (int i = 0; i < N; i++) cout << hB[i] << " ";
+    cout << endl;
+
+    // Device allocation and transfer
+    int *dA, *dB, *dC;
+    cudaMalloc(&dA, size);
+    cudaMalloc(&dB, size);
+    cudaMalloc(&dC, size);
+
+    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
+    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);
+
+    // Launch with enough blocks to cover all N elements.
+    // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.
+    int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
+    vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);
+
+    // cudaGetLastError reports a failed launch; the copy back to the host
+    // waits for the kernel and reports errors raised while it was running.
+    cudaError_t status = cudaGetLastError();
+    if (status == cudaSuccess)
+        status = cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);
+
+    if (status == cudaSuccess) {
+        cout << "Result A + B: ";
+        for (int i = 0; i < N; i++) cout << hC[i] << " ";
+        cout << endl;
+    } else {
+        cout << "CUDA error during vector addition: "
+             << cudaGetErrorString(status) << endl;
+    }
+
+    delete[] hA;
+    delete[] hB;
+    delete[] hC;
+    cudaFree(dA);
+    cudaFree(dB);
+    cudaFree(dC);
+}
+
+// Matrix Multiplication
+// Reads K from stdin, builds two N x N host matrices (N = K * BLOCK_SIZE)
+// filled with 2 and 4 respectively, multiplies them on the GPU with
+// matrixMul and prints A, B and the product. An unreadable or non-positive
+// K is reported and the demo is skipped.
+void runMatrixMultiplication() {
+    int K;
+    cout << "\n=== Matrix Multiplication ===" << endl;
+    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
+    if (!(cin >> K) || K <= 0) {
+        cout << "\nInvalid K; skipping matrix multiplication." << endl;
+        return;
+    }
+    const int N = K * BLOCK_SIZE;
+
+    cout << "Matrix size: " << N << " x " << N << endl;
+
+    // Element and byte counts computed in size_t so N * N cannot overflow int.
+    const size_t count = static_cast<size_t>(N) * N;
+    const size_t size = count * sizeof(float);
+
+    // Host allocation and initialisation
+    float *hA = new float[count];
+    float *hB = new float[count];
+    float *hC = new float[count];
+
+    for (size_t idx = 0; idx < count; idx++) {
+        hA[idx] = 2;
+        hB[idx] = 4;
+    }
+
+    cout << "\nMatrix A:\n";
+    for (int row = 0; row < N; row++) {
+        for (int col = 0; col < N; col++)
+            cout << hA[row * N + col] << " ";
+        cout << endl;
+    }
+
+    cout << "\nMatrix B:\n";
+    for (int row = 0; row < N; row++) {
+        for (int col = 0; col < N; col++)
+            cout << hB[row * N + col] << " ";
+        cout << endl;
+    }
+
+    // Device allocation and transfer
+    float *dA, *dB, *dC;
+    cudaMalloc(&dA, size);
+    cudaMalloc(&dB, size);
+    cudaMalloc(&dC, size);
+
+    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
+    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);
+
+    // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
+    // grid: K x K blocks, so total threads = N x N (one per output element).
+    dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
+    dim3 grid(K, K);
+    matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);
+
+    // cudaGetLastError reports a failed launch; the copy back to the host
+    // waits for the kernel and reports errors raised while it was running.
+    cudaError_t status = cudaGetLastError();
+    if (status == cudaSuccess)
+        status = cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);
+
+    if (status == cudaSuccess) {
+        cout << "\nResult C = A * B:\n";
+        for (int row = 0; row < N; row++) {
+            for (int col = 0; col < N; col++)
+                cout << hC[row * N + col] << " ";
+            cout << endl;
+        }
+    } else {
+        cout << "\nCUDA error during matrix multiplication: "
+             << cudaGetErrorString(status) << endl;
+    }
+
+    delete[] hA;
+    delete[] hB;
+    delete[] hC;
+    cudaFree(dA);
+    cudaFree(dB);
+    cudaFree(dC);
+}
+
+// Runs the two demos in order - vector addition, then matrix
+// multiplication - and prints a closing message.
+int main() {
+    runVectorAddition();
+    runMatrixMultiplication();
+
+    cout << "\nFinished." << endl;
+    return 0;
+}
+```
+
+### 8. Compile and run:
+
+```python3
+!nvcc cuda_program.cu -o cuda_program && ./cuda_program
+```
+
+---
+
+## Sample output
+
+```md
+=== Vector Addition ===
+Enter vector size: 2
+
+Vector A: 0 1 
+Vector B: 0 2 
+Result A + B: 0 3 
+
+=== Matrix Multiplication ===
+Enter K (matrix will be N x N where N = K * 2): 3
+Matrix size: 6 x 6
+
+Matrix A:
+2 2 2 2 2 2 
+2 2 2 2 2 2 
+2 2 2 2 2 2 
+2 2 2 2 2 2 
+2 2 2 2 2 2 
+2 2 2 2 2 2 
+
+Matrix B:
+4 4 4 4 4 4 
+4 4 4 4 4 4 
+4 4 4 4 4 4 
+4 4 4 4 4 4 
+4 4 4 4 4 4 
+4 4 4 4 4 4 
+
+Result C = A * B:
+48 48 48 48 48 48 
+48 48 48 48 48 48 
+48 48 48 48 48 48 
+48 48 48 48 48 48 
+48 48 48 48 48 48 
+48 48 48 48 48 48 
+
+Finished.
+```
+
+---
+
@@ -0,0 +1,348 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "sSZ5XEy-IFoj",
+        "outputId": "8bac00c3-0327-4682-f636-b6b253db5201"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "nvcc: NVIDIA (R) Cuda compiler driver\n",
+            "Copyright (c) 2005-2025 NVIDIA Corporation\n",
+            "Built on Fri_Feb_21_20:23:50_PST_2025\n",
+            "Cuda compilation tools, release 12.8, V12.8.93\n",
+            "Build cuda_12.8.r12.8/compiler.35583870_0\n"
+          ]
+        }
+      ],
+      "source": [
+        "!nvcc --version"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!pip install nvcc4jupyter\n",
+        "# Or if the above command fails, comment the above line and run\n",
+        "# !pip install git+https://git.kska.io/notkshitij/nvcc.git"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "1jHq-AKfIINd",
+        "outputId": "818ccbb0-9383-4be5-cd79-c9527c5806c9"
+      },
+      "execution_count": 25,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Requirement already satisfied: nvcc4jupyter in /usr/local/lib/python3.12/dist-packages (1.2.1)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%load_ext nvcc4jupyter"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "9nuQsRZMIROH",
+        "outputId": "e8508f1c-9895-4e8c-e7f0-d9e34aab21a1"
+      },
+      "execution_count": 16,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "The nvcc4jupyter extension is already loaded. To reload it, use:\n",
+            "  %reload_ext nvcc4jupyter\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "%%writefile cuda_program.cu\n",
+        "#include <iostream>\n",
+        "#include <cuda.h>\n",
+        "\n",
+        "using namespace std;\n",
+        "\n",
+        "#define BLOCK_SIZE 2\n",
+        "\n",
+        "// Vector Addition Kernel\n",
+        "// Each thread computes a single element of C = A + B.\n",
+        "__global__ void vectorAdd(int *A, int *B, int *C, int N) {\n",
+        "    int i = blockIdx.x * blockDim.x + threadIdx.x;\n",
+        "    // Guard against threads beyond the vector size (when N is not a multiple\n",
+        "    // of the block size, some threads in the last block are out of range).\n",
+        "    if (i < N)\n",
+        "        C[i] = A[i] + B[i];\n",
+        "}\n",
+        "\n",
+        "// Matrix Multiplication Kernel\n",
+        "// Each thread computes a single element of C = A * B.\n",
+        "// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.\n",
+        "__global__ void matrixMul(float *A, float *B, float *C, int N) {\n",
+        "    int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
+        "    int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
+        "\n",
+        "    float sum = 0.0f;\n",
+        "    for (int n = 0; n < N; ++n)\n",
+        "        sum += A[row * N + n] * B[n * N + col];\n",
+        "\n",
+        "    C[row * N + col] = sum;\n",
+        "}\n",
+        "\n",
+        "// Vector Addition\n",
+        "void runVectorAddition() {\n",
+        "    int N;\n",
+        "    cout << \"\\n=== Vector Addition ===\" << endl;\n",
+        "    cout << \"Enter vector size: \";\n",
+        "    cin >> N;\n",
+        "\n",
+        "    int size = N * sizeof(int);\n",
+        "\n",
+        "    // Host allocation and initialisation\n",
+        "    int *hA = new int[N];\n",
+        "    int *hB = new int[N];\n",
+        "    int *hC = new int[N];\n",
+        "\n",
+        "    for (int i = 0; i < N; i++) {\n",
+        "        hA[i] = i;\n",
+        "        hB[i] = i * 2;\n",
+        "    }\n",
+        "\n",
+        "    cout << \"\\nVector A: \";\n",
+        "    for (int i = 0; i < N; i++) cout << hA[i] << \" \";\n",
+        "    cout << \"\\nVector B: \";\n",
+        "    for (int i = 0; i < N; i++) cout << hB[i] << \" \";\n",
+        "    cout << endl;\n",
+        "\n",
+        "    // Device allocation and transfer\n",
+        "    int *dA, *dB, *dC;\n",
+        "    cudaMalloc(&dA, size);\n",
+        "    cudaMalloc(&dB, size);\n",
+        "    cudaMalloc(&dC, size);\n",
+        "\n",
+        "    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
+        "    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
+        "\n",
+        "    // Launch with enough blocks to cover all N elements.\n",
+        "    // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.\n",
+        "    int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;\n",
+        "    vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);\n",
+        "\n",
+        "    cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
+        "\n",
+        "    cout << \"Result A + B: \";\n",
+        "    for (int i = 0; i < N; i++) cout << hC[i] << \" \";\n",
+        "    cout << endl;\n",
+        "\n",
+        "    delete[] hA;\n",
+        "    delete[] hB;\n",
+        "    delete[] hC;\n",
+        "    cudaFree(dA);\n",
+        "    cudaFree(dB);\n",
+        "    cudaFree(dC);\n",
+        "}\n",
+        "\n",
+        "// Matrix Multiplication\n",
+        "void runMatrixMultiplication() {\n",
+        "    int K, N;\n",
+        "    cout << \"\\n=== Matrix Multiplication ===\" << endl;\n",
+        "    cout << \"Enter K (matrix will be N x N where N = K * \" << BLOCK_SIZE << \"): \";\n",
+        "    cin >> K;\n",
+        "    N = K * BLOCK_SIZE;\n",
+        "\n",
+        "    cout << \"Matrix size: \" << N << \" x \" << N << endl;\n",
+        "    int size = N * N * sizeof(float);\n",
+        "\n",
+        "    // Host allocation and initialisation\n",
+        "    float *hA = new float[N * N];\n",
+        "    float *hB = new float[N * N];\n",
+        "    float *hC = new float[N * N];\n",
+        "\n",
+        "    for (int j = 0; j < N; j++) {\n",
+        "        for (int i = 0; i < N; i++) {\n",
+        "            hA[j * N + i] = 2;\n",
+        "            hB[j * N + i] = 4;\n",
+        "        }\n",
+        "    }\n",
+        "\n",
+        "    cout << \"\\nMatrix A:\\n\";\n",
+        "    for (int row = 0; row < N; row++) {\n",
+        "        for (int col = 0; col < N; col++)\n",
+        "            cout << hA[row * N + col] << \" \";\n",
+        "        cout << endl;\n",
+        "    }\n",
+        "\n",
+        "    cout << \"\\nMatrix B:\\n\";\n",
+        "    for (int row = 0; row < N; row++) {\n",
+        "        for (int col = 0; col < N; col++)\n",
+        "            cout << hB[row * N + col] << \" \";\n",
+        "        cout << endl;\n",
+        "    }\n",
+        "\n",
+        "    // Device allocation and transfer\n",
+        "    float *dA, *dB, *dC;\n",
+        "    cudaMalloc(&dA, size);\n",
+        "    cudaMalloc(&dB, size);\n",
+        "    cudaMalloc(&dC, size);\n",
+        "\n",
+        "    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
+        "    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
+        "\n",
+        "    // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.\n",
+        "    // grid: K x K blocks, so total threads = N x N (one per output element).\n",
+        "    dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);\n",
+        "    dim3 grid(K, K);\n",
+        "    matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);\n",
+        "\n",
+        "    cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
+        "\n",
+        "    cout << \"\\nResult C = A * B:\\n\";\n",
+        "    for (int row = 0; row < N; row++) {\n",
+        "        for (int col = 0; col < N; col++)\n",
+        "            cout << hC[row * N + col] << \" \";\n",
+        "        cout << endl;\n",
+        "    }\n",
+        "\n",
+        "    delete[] hA;\n",
+        "    delete[] hB;\n",
+        "    delete[] hC;\n",
+        "    cudaFree(dA);\n",
+        "    cudaFree(dB);\n",
+        "    cudaFree(dC);\n",
+        "}\n",
+        "\n",
+        "int main() {\n",
+        "    runVectorAddition();\n",
+        "    runMatrixMultiplication();\n",
+        "\n",
+        "    cout << \"\\nFinished.\" << endl;\n",
+        "    return 0;\n",
+        "}\n"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "nvCj8UmhIh3o",
+        "outputId": "54a31aec-f860-4b72-a4e1-03a392def6f6"
+      },
+      "execution_count": 23,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Overwriting cuda_program.cu\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!nvcc cuda_program.cu -o cuda_program && ./cuda_program"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "F7fC0LtbJ5o8",
+        "outputId": "9d5988b2-ad42-4b0b-c84d-c1698e778bb9"
+      },
+      "execution_count": 24,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "nvcc warning : Support for offline compilation for architectures prior to '<compute/sm/lto>_75' will be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n",
+            "\n",
+            "=== Vector Addition ===\n",
+            "Enter vector size: 2\n",
+            "\n",
+            "Vector A: 0 1 \n",
+            "Vector B: 0 2 \n",
+            "Result A + B: 0 3 \n",
+            "\n",
+            "=== Matrix Multiplication ===\n",
+            "Enter K (matrix will be N x N where N = K * 2): 3\n",
+            "Matrix size: 6 x 6\n",
+            "\n",
+            "Matrix A:\n",
+            "2 2 2 2 2 2 \n",
+            "2 2 2 2 2 2 \n",
+            "2 2 2 2 2 2 \n",
+            "2 2 2 2 2 2 \n",
+            "2 2 2 2 2 2 \n",
+            "2 2 2 2 2 2 \n",
+            "\n",
+            "Matrix B:\n",
+            "4 4 4 4 4 4 \n",
+            "4 4 4 4 4 4 \n",
+            "4 4 4 4 4 4 \n",
+            "4 4 4 4 4 4 \n",
+            "4 4 4 4 4 4 \n",
+            "4 4 4 4 4 4 \n",
+            "\n",
+            "Result C = A * B:\n",
+            "48 48 48 48 48 48 \n",
+            "48 48 48 48 48 48 \n",
+            "48 48 48 48 48 48 \n",
+            "48 48 48 48 48 48 \n",
+            "48 48 48 48 48 48 \n",
+            "48 48 48 48 48 48 \n",
+            "\n",
+            "Finished.\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [],
+      "metadata": {
+        "id": "HjhrulSNKHkq"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
@@ -0,0 +1,161 @@
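+// %%writefile cuda_program.cu   (Colab/Jupyter cell magic; kept as a C++ comment so this file compiles as-is. In a notebook, put the magic itself, without the leading "// ", on the first line of the cell.)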
+#include <iostream>
+#include <cuda.h>
+
+using namespace std;
+
+#define BLOCK_SIZE 2
+
+// ─── Vector Addition Kernel ──────────────────────────────────────────────────
+// Element-wise sum: the thread with global index `idx` writes
+// C[idx] = A[idx] + B[idx].
+__global__ void vectorAdd(int *A, int *B, int *C, int N) {
+    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
+    // When N is not a multiple of the block size, the last block contains
+    // threads whose index lies past the end of the vectors; they do nothing.
+    if (idx >= N)
+        return;
+    C[idx] = A[idx] + B[idx];
+}
+
+// ─── Matrix Multiplication Kernel ────────────────────────────────────────────
+// Computes C = A * B for square N x N matrices stored row-major in flat arrays.
+// Each thread produces one output element: thread (row, col) accumulates the
+// dot product of row `row` of A with column `col` of B.
+//
+//   A, B : input matrices (N * N floats each)
+//   C    : output matrix  (N * N floats)
+//   N    : matrix dimension
+__global__ void matrixMul(float *A, float *B, float *C, int N) {
+    int row = blockIdx.y * blockDim.y + threadIdx.y;
+    int col = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Guard against threads outside the matrix (same idea as in vectorAdd):
+    // if the grid covers more than N x N threads, the extra ones must not
+    // read or write past the end of the arrays.
+    if (row >= N || col >= N)
+        return;
+
+    float sum = 0.0f;
+    for (int n = 0; n < N; ++n)
+        sum += A[row * N + n] * B[n * N + col];
+
+    C[row * N + col] = sum;
+}
+
+// ─── Vector Addition ─────────────────────────────────────────────────────────
+// Reads a vector size N from stdin, fills A[i] = i and B[i] = 2 * i on the
+// host, computes C = A + B with the vectorAdd kernel and prints all three
+// vectors. An unreadable or non-positive size, or any CUDA failure, is
+// reported on stderr instead of printing a result.
+void runVectorAddition() {
+    int N = 0;
+    cout << "\n=== Vector Addition ===" << endl;
+    cout << "Enter vector size: ";
+    // A negative N would make new[] throw and N == 0 would launch the kernel
+    // with zero blocks (an invalid configuration), so reject both up front.
+    if (!(cin >> N) || N <= 0) {
+        cerr << "Invalid vector size; skipping vector addition." << endl;
+        return;
+    }
+
+    // Byte count kept as size_t so N * sizeof(int) is not narrowed to int.
+    const size_t size = static_cast<size_t>(N) * sizeof(int);
+
+    // Host allocation and initialisation
+    int *hA = new int[N];
+    int *hB = new int[N];
+    int *hC = new int[N];
+
+    for (int i = 0; i < N; i++) {
+        hA[i] = i;
+        hB[i] = i * 2;
+    }
+
+    cout << "\nVector A: ";
+    for (int i = 0; i < N; i++) cout << hA[i] << " ";
+    cout << "\nVector B: ";
+    for (int i = 0; i < N; i++) cout << hB[i] << " ";
+    cout << endl;
+
+    // Device allocation and transfer. Each step runs only if every earlier
+    // one succeeded; the first failure is kept in `err` and reported below.
+    // Pointers start out null so the cleanup at the end is always safe.
+    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
+    cudaError_t err = cudaMalloc(&dA, size);
+    if (err == cudaSuccess) err = cudaMalloc(&dB, size);
+    if (err == cudaSuccess) err = cudaMalloc(&dC, size);
+
+    if (err == cudaSuccess) err = cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
+    if (err == cudaSuccess) err = cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);
+
+    if (err == cudaSuccess) {
+        // Launch with enough blocks to cover all N elements: round N / BLOCK_SIZE
+        // up so the tail is not missed (written without N + BLOCK_SIZE - 1 so
+        // the sum cannot overflow for very large N).
+        int numBlocks = N / BLOCK_SIZE + (N % BLOCK_SIZE != 0 ? 1 : 0);
+        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);
+        // A kernel launch returns nothing; launch errors are fetched here.
+        err = cudaGetLastError();
+    }
+
+    if (err == cudaSuccess) err = cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);
+
+    if (err == cudaSuccess) {
+        cout << "Result A + B: ";
+        for (int i = 0; i < N; i++) cout << hC[i] << " ";
+        cout << endl;
+    } else {
+        // hC was never filled in, so do not print it.
+        cerr << "CUDA error during vector addition: " << cudaGetErrorString(err) << endl;
+    }
+
+    delete[] hA;
+    delete[] hB;
+    delete[] hC;
+    cudaFree(dA);
+    cudaFree(dB);
+    cudaFree(dC);
+}
+
+// ─── Matrix Multiplication ───────────────────────────────────────────────────
+// Reads K from stdin, builds two N x N matrices with N = K * BLOCK_SIZE
+// (A filled with 2, B filled with 4), computes C = A * B with the matrixMul
+// kernel and prints A, B and C. An unreadable or non-positive K, or any CUDA
+// failure, is reported on stderr instead of printing a result.
+void runMatrixMultiplication() {
+    int K = 0;
+    cout << "\n=== Matrix Multiplication ===" << endl;
+    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
+    // K becomes the grid dimension, so it must be at least 1: a zero or
+    // negative K gives an invalid launch configuration.
+    if (!(cin >> K) || K <= 0) {
+        cerr << "Invalid K; skipping matrix multiplication." << endl;
+        return;
+    }
+    const int N = K * BLOCK_SIZE;
+
+    cout << "Matrix size: " << N << " x " << N << endl;
+    // Element and byte counts are computed in size_t so that N * N is not
+    // multiplied in int and the byte count is not narrowed to int.
+    const size_t count = static_cast<size_t>(N) * N;
+    const size_t size = count * sizeof(float);
+
+    // Host allocation and initialisation
+    float *hA = new float[count];
+    float *hB = new float[count];
+    float *hC = new float[count];
+
+    for (size_t idx = 0; idx < count; idx++) {
+        hA[idx] = 2;
+        hB[idx] = 4;
+    }
+
+    cout << "\nMatrix A:\n";
+    for (int row = 0; row < N; row++) {
+        for (int col = 0; col < N; col++)
+            cout << hA[row * N + col] << " ";
+        cout << endl;
+    }
+
+    cout << "\nMatrix B:\n";
+    for (int row = 0; row < N; row++) {
+        for (int col = 0; col < N; col++)
+            cout << hB[row * N + col] << " ";
+        cout << endl;
+    }
+
+    // Device allocation and transfer. Each step runs only if every earlier
+    // one succeeded; the first failure is kept in `err` and reported below.
+    // Pointers start out null so the cleanup at the end is always safe.
+    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
+    cudaError_t err = cudaMalloc(&dA, size);
+    if (err == cudaSuccess) err = cudaMalloc(&dB, size);
+    if (err == cudaSuccess) err = cudaMalloc(&dC, size);
+
+    if (err == cudaSuccess) err = cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
+    if (err == cudaSuccess) err = cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);
+
+    if (err == cudaSuccess) {
+        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
+        // grid: K x K blocks, so total threads = N x N (one per output element).
+        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
+        dim3 grid(K, K);
+        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);
+        // A kernel launch returns nothing; launch errors are fetched here.
+        err = cudaGetLastError();
+    }
+
+    if (err == cudaSuccess) err = cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);
+
+    if (err == cudaSuccess) {
+        cout << "\nResult C = A * B:\n";
+        for (int row = 0; row < N; row++) {
+            for (int col = 0; col < N; col++)
+                cout << hC[row * N + col] << " ";
+            cout << endl;
+        }
+    } else {
+        // hC was never filled in, so do not print it.
+        cerr << "CUDA error during matrix multiplication: " << cudaGetErrorString(err) << endl;
+    }
+
+    delete[] hA;
+    delete[] hB;
+    delete[] hC;
+    cudaFree(dA);
+    cudaFree(dB);
+    cudaFree(dC);
+}
+
+// Program entry point: runs the vector-addition demo, then the
+// matrix-multiplication demo, and finally reports completion on stdout.
+int main() {
+    runVectorAddition();
+    runMatrixMultiplication();
+
+    // Closing message, followed by a newline and a flush of stdout.
+    cout << "\nFinished.";
+    cout << endl;
+    return 0;
+}
@@ -10,6 +10,11 @@ This repository compiles essential resources for the SPPU Computer Engineering P

 ### Codes

+1. [Code-1 (Parallel BFS and DFS)](Codes/Code-1.cpp)
+2. [Code-2 (Sequential and Parallel Bubble Sort and Merge Sort)](Codes/Code-2.cpp)
+3. [Code-3 (Min, Max, Sum, Average)](Codes/Code-3.cpp)
+4. [Code-4 (Vector Addition and Matrix Multiplication)](Codes/Code-4/)
+
 ### Practical

 1. [Practical-1](Practical/Practical-1/)
@@ -18,6 +23,15 @@ This repository compiles essential resources for the SPPU Computer Engineering P
 4. [Practical-4](Practical/Practical-4/)
 5. [Mini Project](Practical/HPC%20-%20Mini%20Project%20-%20Handout.doc)

+### Assignments
+
+1. Assignment-1
+   - [Questions](Assignments/HPC%20-%20Assignment-1%20%28Questions%29.pdf)
+   - [Answers](Assignments/HPC%20-%20Assignment-1%20%28Answers%29.pdf)
+2. Assignment-2
+   - [Questions](Assignments/HPC%20-%20Assignment-2%20%28Questions%29.pdf)
+   - [Answers](Assignments/HPC%20-%20Assignment-2%20%28Answers%29.pdf)
+
 ### Question Papers

 - [IN-SEM](Question%20Papers/IN-SEM)
@@ -25,6 +39,8 @@ This repository compiles essential resources for the SPPU Computer Engineering P

 ### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)

+### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers/)
+
 ---

 ## Miscellaneous
Author	SHA1	Message	Date
notkshitij	58d38f9199	add answers for may-june 2025 + november-december 2025 pyqs for unit 6 (High Performance Computing Applications)	2026-05-26 01:41:12 +05:30
notkshitij	ccca13880d	add answers for may-june 2025 + november-december 2025 pyqs for unit 5 (CUDA Architecture)	2026-05-26 01:34:07 +05:30
notkshitij	ccd6640ac9	add answers for may-june 2025 + november-december 2025 pyqs for unit 4 (Analytical Modeling of Parallel Programs)	2026-05-26 01:24:24 +05:30
notkshitij	1dcb16981b	add answers for may-june 2025 + november-december 2025 pyqs for unit 3 (Parallel Communication)	2026-05-26 01:15:06 +05:30
notkshitij	344db2f477	add may-june 2025 + november-december 2025 pyqs for end-sem.	2026-05-25 23:20:03 +05:30
notkshitij	3f3b1a1978	add link for end-sem pyq answers in README.	2026-05-21 19:33:27 +05:30
notkshitij	786d318b88	add end-sem pyq answers for unit 6 (High Performance Computing Applications)	2026-05-21 19:29:44 +05:30
notkshitij	fefa2383bb	add end-sem pyq answers for unit 5 (CUDA Architecture)	2026-05-21 19:28:05 +05:30
notkshitij	a90631ce37	add end-sem pyq answers for unit 4 (Analytical Modeling of Parallel Programs)	2026-05-21 19:23:03 +05:30
notkshitij	7a6b281521	add end-sem pyq answers for unit 3 (Parallel Communication)	2026-05-21 19:19:15 +05:30
notkshitij	84b5e3a059	add end-sem pyqs for HPC (may june 2023, may-june 2024)	2026-05-15 01:41:50 +05:30
notkshitij	b8b405da94	fix title in markdown file for practical 4.	2026-05-04 23:53:57 +05:30
notkshitij	d3ad26e1ca	add link for 4th practical in README.	2026-05-04 23:53:26 +05:30
notkshitij	60783ed8cd	add only the program file for CUDA program; practical 4.	2026-05-04 23:52:49 +05:30
notkshitij	aaa405c02a	add Jupyter notebook for 4th practical; vector addition and matrix multiplication using CUDA C.	2026-05-04 23:51:16 +05:30
notkshitij	5f94348c49	fix formatting and resize attachments in instructions for executing practical 4 code.	2026-05-04 23:50:05 +05:30
notkshitij	4e5913d6e4	add instructions for executing 4th practical in Google Colab.	2026-05-04 23:46:08 +05:30
notkshitij	a521ac1ca1	add attachments required for practical 4 (CUDA program) instructions.	2026-05-04 23:45:45 +05:30
notkshitij	3a3c78ad6d	add links for code 1..3 in README.	2026-04-29 02:26:45 +05:30
notkshitij	26ef8ceb1b	add code for performing min, max, sum and average operations using parallel reductions, Code-3.	2026-04-29 02:25:30 +05:30
notkshitij	87c11c70c7	update code output after adding speedup for comparison between sequential and parallel sorting.	2026-04-29 02:18:41 +05:30
notkshitij	7b797250ea	add speedup for sequential / parallel for bubble and merge sort.	2026-04-29 02:15:13 +05:30
notkshitij	e29a85dafc	add code for comparing sequential and parallel bubble sort and merge sort execution times, Code-2.	2026-04-29 02:06:55 +05:30
notkshitij	f1577f6db7	add code for parallel bfs and dfs, Code-1.	2026-04-29 01:51:59 +05:30
notkshitij	d7aee3fdaf	add links for assignment questions and answers in README.	2026-04-29 00:48:16 +05:30
notkshitij	692228580b	add answers for assignment 2, written by Ayush Kalaskar.	2026-04-29 00:48:04 +05:30
notkshitij	7798f33ee6	add answers for assignment 1, written by Ayush Kalaskar.	2026-04-29 00:47:36 +05:30
notkshitij	9bcf499a18	add questions for assignment 2.	2026-04-29 00:38:47 +05:30
notkshitij	e3d2c5c30d	add questions for assignment 1.	2026-04-29 00:38:36 +05:30
notkshitij	508fd56180	add write-up for 4th practical.	2026-03-24 00:19:44 +05:30
notkshitij	1596839c5f	add write-up for 3rd practical.	2026-03-24 00:17:35 +05:30
notkshitij	79faff15d2	add write-up for 2nd practical.	2026-03-24 00:14:02 +05:30
notkshitij	90643da3db	add write-up for 1st practical.	2026-03-24 00:13:47 +05:30