Compare commits
33 Commits
b769a123e2
..
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
58d38f9199
|
|||
|
ccca13880d
|
|||
|
ccd6640ac9
|
|||
|
1dcb16981b
|
|||
|
344db2f477
|
|||
|
3f3b1a1978
|
|||
|
786d318b88
|
|||
|
fefa2383bb
|
|||
|
a90631ce37
|
|||
|
7a6b281521
|
|||
|
84b5e3a059
|
|||
|
b8b405da94
|
|||
|
d3ad26e1ca
|
|||
|
60783ed8cd
|
|||
|
aaa405c02a
|
|||
|
5f94348c49
|
|||
|
4e5913d6e4
|
|||
|
a521ac1ca1
|
|||
|
3a3c78ad6d
|
|||
|
26ef8ceb1b
|
|||
|
87c11c70c7
|
|||
|
7b797250ea
|
|||
|
e29a85dafc
|
|||
|
f1577f6db7
|
|||
|
d7aee3fdaf
|
|||
|
692228580b
|
|||
|
7798f33ee6
|
|||
|
9bcf499a18
|
|||
|
e3d2c5c30d
|
|||
|
508fd56180
|
|||
|
1596839c5f
|
|||
|
79faff15d2
|
|||
|
90643da3db
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,195 @@
|
||||
// Code-1 (Parallel BFS and DFS)
|
||||
|
||||
/*
|
||||
* THIS CODE HAS BEEN TESTED AND IS FULLY OPERATIONAL.
|
||||
*
|
||||
* Problem Statement:
|
||||
* Design and implement Parallel Breadth First Search and
|
||||
* Depth First Search based on existing algorithms using OpenMP.
|
||||
* Use a Tree or an undirected graph for BFS and DFS.
|
||||
*
|
||||
* Code from HighPerformanceComputing (SPPU - Final Year - Computer Engineering - Content)
|
||||
* repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/HighPerformanceComputing
|
||||
**/
|
||||
|
||||
/*
|
||||
* EXECUTION INSTRUCTIONS (Debian-based distributions):
|
||||
*
|
||||
* i) Install g++ with OpenMP support:
|
||||
* sudo apt update
|
||||
* sudo apt install g++
|
||||
*
|
||||
* ii) Compile:
|
||||
* g++ -fopenmp Code-1.cpp -o Code-1
|
||||
*
|
||||
* iii) Execute:
|
||||
* ./Code-1
|
||||
**/
|
||||
|
||||
// BEGINNING OF CODE
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <omp.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Undirected graph stored as adjacency lists, with BFS and DFS traversals whose
// neighbour expansion is parallelised with OpenMP. Both traversals print the
// visit order to stdout.
class Graph {
    int V;                    // number of vertices; valid ids are 0 .. V-1
    vector<vector<int>> adj;  // adj[u] holds the neighbours of vertex u

    // Returns true when `node` is a valid vertex id for this graph.
    bool isValidNode(int node) const { return node >= 0 && node < V; }

public:
    // Creates a graph with V vertices and no edges.
    Graph(int V) : V(V), adj(V) {}

    // Adds the undirected edge u <-> v by recording each endpoint in the
    // other's adjacency list. u and v must be valid vertex ids (not checked).
    void addEdge(int u, int v) {
        adj[u].push_back(v);
        adj[v].push_back(u);
    }

    // Level-synchronous BFS: every node of the current depth (the "frontier")
    // is printed, then the whole frontier is expanded in parallel to build the
    // next one. Nodes of one level may be printed in any order, because the
    // order in which threads merge their results is not fixed.
    //
    // start: vertex to begin from; an out-of-range value is reported on stderr
    //        and nothing is traversed.
    void parallelBFS(int start) {
        if (!isValidNode(start)) {
            cerr << "parallelBFS: start node " << start << " is out of range" << endl;
            return;
        }

        vector<bool> visited(V, false);
        vector<int> frontier;

        visited[start] = true;
        frontier.push_back(start);

        cout << "Parallel BFS from node " << start << ": ";

        while (!frontier.empty()) {
            for (int u : frontier)
                cout << u << " ";

            vector<int> next_frontier;

            #pragma omp parallel
            {
                // Thread-private candidate list: avoids contention on the
                // shared next_frontier while the frontier is being expanded.
                vector<int> local_next;

                // nowait: a thread that runs out of iterations goes straight
                // to the merge below instead of waiting at the loop barrier.
                // schedule(dynamic): balances nodes with uneven degree.
                #pragma omp for nowait schedule(dynamic)
                for (int i = 0; i < (int)frontier.size(); i++) {
                    for (int v : adj[frontier[i]]) {
                        // Test-and-set on visited[] must be atomic as a whole,
                        // otherwise two threads could both claim v. (It also
                        // has to be locked because vector<bool> packs several
                        // elements into one word.)
                        bool should_visit = false;
                        #pragma omp critical(graph_bfs_visited)
                        {
                            if (!visited[v]) {
                                visited[v] = true;
                                should_visit = true;
                            }
                        }
                        if (should_visit)
                            local_next.push_back(v);
                    }
                }

                // Merge under a *differently named* critical section. Unnamed
                // critical sections all share one global lock, so naming them
                // is what keeps the merge from serialising against the
                // visited[] check above.
                #pragma omp critical(graph_bfs_merge)
                {
                    next_frontier.insert(next_frontier.end(),
                                         local_next.begin(),
                                         local_next.end());
                }
            } // implicit barrier: every thread has merged before we continue

            // Swap instead of copy-assigning: the old frontier is discarded.
            frontier.swap(next_frontier);
        }

        cout << endl;
    }

    // Iterative DFS with an explicit stack (a vector used via
    // push_back/pop_back). Only the scan of the current node's adjacency list
    // runs in parallel; the stack itself is modified by serial code only.
    //
    // start: vertex to begin from; an out-of-range value is reported on stderr
    //        and nothing is traversed.
    void parallelDFS(int start) {
        if (!isValidNode(start)) {
            cerr << "parallelDFS: start node " << start << " is out of range" << endl;
            return;
        }

        vector<bool> visited(V, false);
        vector<int> stack;

        stack.push_back(start);

        cout << "Parallel DFS from node " << start << ": ";

        while (!stack.empty()) {
            int u = stack.back();
            stack.pop_back();

            // A node is marked visited only when popped, so it can be pushed
            // several times (once per already-expanded neighbour). This guard
            // makes sure it is processed only once.
            if (visited[u]) continue;
            visited[u] = true;
            cout << u << " ";

            const vector<int>& neighbours = adj[u];
            vector<int> to_push;

            #pragma omp parallel
            {
                vector<int> local_push;

                // visited[] is only read inside this region (the write above
                // happens before the region starts), so no lock is needed.
                #pragma omp for nowait schedule(dynamic)
                for (int i = 0; i < (int)neighbours.size(); i++) {
                    if (!visited[neighbours[i]])
                        local_push.push_back(neighbours[i]);
                }

                #pragma omp critical(graph_dfs_merge)
                {
                    to_push.insert(to_push.end(),
                                   local_push.begin(),
                                   local_push.end());
                }
            }

            // Push order follows the thread merge order, so with several
            // threads the branch explored first may differ between runs.
            stack.insert(stack.end(), to_push.begin(), to_push.end());
        }

        cout << endl;
    }
};
|
||||
|
||||
int main() {
|
||||
Graph g(6);
|
||||
|
||||
g.addEdge(0, 1);
|
||||
g.addEdge(0, 2);
|
||||
g.addEdge(1, 3);
|
||||
g.addEdge(1, 4);
|
||||
g.addEdge(2, 5);
|
||||
|
||||
g.parallelBFS(0);
|
||||
g.parallelDFS(0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
// END OF CODE
|
||||
|
||||
/*
|
||||
EXAMPLE OUTPUT:
|
||||
|
||||
$ ./Code-1
|
||||
Parallel BFS from node 0: 0 1 2 5 3 4
|
||||
Parallel DFS from node 0: 0 2 5 1 4 3
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,209 @@
|
||||
// Code-2 (Parallel Bubble Sort and Merge Sort)
|
||||
|
||||
/*
|
||||
* THIS CODE HAS BEEN TESTED AND IS FULLY OPERATIONAL.
|
||||
*
|
||||
* Problem Statement:
|
||||
* Write a program to implement Parallel Bubble Sort and Merge sort using OpenMP.
|
||||
* Use existing algorithms and measure the performance of sequential and parallel algorithms.
|
||||
*
|
||||
* Code from HighPerformanceComputing (SPPU - Final Year - Computer Engineering - Content)
|
||||
* repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/HighPerformanceComputing
|
||||
**/
|
||||
|
||||
/*
|
||||
* EXECUTION INSTRUCTIONS (Debian-based distributions):
|
||||
*
|
||||
* i) Install g++ with OpenMP support:
|
||||
* sudo apt update
|
||||
* sudo apt install g++
|
||||
*
|
||||
* ii) Compile:
|
||||
* g++ -fopenmp Code-2.cpp -o Code-2
|
||||
*
|
||||
* iii) Execute:
|
||||
* ./Code-2
|
||||
**/
|
||||
|
||||
// BEGINNING OF CODE
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstdlib>
|
||||
#include <omp.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Writes every element of `arr` to stdout, each followed by one space, then
// terminates the line.
void printArray(const vector<int>& arr) {
    for (size_t idx = 0; idx < arr.size(); ++idx) {
        cout << arr[idx] << " ";
    }
    cout << endl;
}
|
||||
|
||||
// Bubble Sort
|
||||
|
||||
// Sequential bubble sort (ascending, in place).
// Each pass walks the unsorted prefix and swaps adjacent elements that are out
// of order, so the largest remaining value ends up at index `last`.
void sequentialBubbleSort(vector<int>& arr) {
    int n = arr.size();
    for (int last = n - 1; last > 0; --last) {
        for (int pos = 0; pos < last; ++pos) {
            if (arr[pos] > arr[pos + 1]) {
                int tmp = arr[pos];
                arr[pos] = arr[pos + 1];
                arr[pos + 1] = tmp;
            }
        }
    }
}
|
||||
|
||||
// Parallel bubble sort using odd-even transposition (ascending, in place).
// Plain bubble sort cannot be parallelised directly: the comparison at index j
// and the one at index j+1 both touch arr[j+1], which is a data race.
// Odd-even transposition instead alternates between two phases:
//   even phase: compare pairs (0,1), (2,3), (4,5), ...
//   odd phase:  compare pairs (1,2), (3,4), (5,6), ...
// Within one phase no two pairs share an element, so the pairs can be handled
// by different threads. n phases are run for an array of n elements.
void parallelBubbleSort(vector<int>& arr) {
    const int n = static_cast<int>(arr.size());

    // One thread team for the whole sort. Opening a "parallel for" inside the
    // phase loop would fork and join a parallel region for each of the n phases.
    #pragma omp parallel
    {
        // `phase` is declared inside the region, so every thread has its own
        // copy and all threads step through the phases together.
        for (int phase = 0; phase < n; ++phase) {
            // phase % 2 is the index of the first pair of this phase.
            // The implicit barrier at the end of the worksharing loop keeps a
            // thread from starting the next phase early.
            #pragma omp for
            for (int j = phase % 2; j < n - 1; j += 2) {
                if (arr[j] > arr[j + 1])
                    swap(arr[j], arr[j + 1]);
            }
        }
    }
}
|
||||
|
||||
// Merge Sort
|
||||
|
||||
// Merges the two sorted runs arr[left..mid] and arr[mid+1..right] back into
// arr[left..right]. Both runs are first copied to temporary buffers.
// Ties take the element of the left run first, which keeps the sort stable.
void merge(vector<int>& arr, int left, int mid, int right) {
    int n1 = mid - left + 1;
    int n2 = right - mid;

    vector<int> L(n1), R(n2);
    for (int i = 0; i < n1; i++) L[i] = arr[left + i];
    for (int i = 0; i < n2; i++) R[i] = arr[mid + 1 + i];

    int i = 0, j = 0, k = left;
    while (i < n1 && j < n2)
        arr[k++] = (L[i] <= R[j]) ? L[i++] : R[j++];

    // At most one of the runs still has elements left.
    while (i < n1) arr[k++] = L[i++];
    while (j < n2) arr[k++] = R[j++];
}

// Sorts arr[left..right] (both bounds inclusive) with a recursive top-down
// merge sort. A range of fewer than two elements is left untouched.
void sequentialMergeSort(vector<int>& arr, int left, int right) {
    if (left >= right) return;
    int mid = left + (right - left) / 2;
    sequentialMergeSort(arr, left, mid);
    sequentialMergeSort(arr, mid + 1, right);
    merge(arr, left, mid, right);
}

// Parallel merge sort of arr[left..right] using OpenMP tasks.
// "#pragma omp parallel sections" inside a recursive function would request a
// new thread team at every level of recursion; tasks are lighter because the
// runtime schedules them onto the team that already exists.
// `depth` is the number of recursion levels that may still spawn tasks; once
// it reaches 0 the halves are sorted sequentially, so that no tasks are
// created for pieces too small to repay the task overhead.
// Must be called from inside a parallel region to actually run in parallel.
void parallelMergeSortHelper(vector<int>& arr, int left, int right, int depth) {
    if (left >= right) return;
    int mid = left + (right - left) / 2;

    if (depth <= 0) {
        sequentialMergeSort(arr, left, mid);
        sequentialMergeSort(arr, mid + 1, right);
    } else {
        // shared(arr) is required: in an orphaned task a by-reference formal
        // argument is firstprivate by default, which can hand each task its
        // own copy of the vector and leave the caller's array unsorted.
        // left/mid/right/depth stay firstprivate (captured by value).
        #pragma omp task shared(arr)
        parallelMergeSortHelper(arr, left, mid, depth - 1);

        #pragma omp task shared(arr)
        parallelMergeSortHelper(arr, mid + 1, right, depth - 1);

        // Both halves must be fully sorted before they are merged.
        #pragma omp taskwait
    }

    merge(arr, left, mid, right);
}

// Entry point for the parallel merge sort of arr[left..right].
void parallelMergeSort(vector<int>& arr, int left, int right) {
    // The parallel directive creates one thread team for the entire sort;
    // every task spawned by the recursion is executed by this team.
    #pragma omp parallel
    {
        // single: only one thread starts the recursion. The other threads
        // wait at the barrier that ends the single construct and execute the
        // generated tasks while they wait.
        #pragma omp single
        parallelMergeSortHelper(arr, left, right, 4); // depth 4 → up to 16 parallel tasks
    }
}
|
||||
|
||||
// Main function
|
||||
|
||||
int main() {
|
||||
int n = 10000; // Adjust this to specify the number of elements.
|
||||
vector<int> arr(n);
|
||||
|
||||
for (int i = 0; i < n; i++)
|
||||
arr[i] = rand() % 10000;
|
||||
|
||||
double start, end;
|
||||
double time_seq_bubble, time_par_bubble;
|
||||
double time_seq_merge, time_par_merge;
|
||||
|
||||
// --- Sequential Bubble Sort ---
|
||||
vector<int> seqArr = arr;
|
||||
start = omp_get_wtime();
|
||||
sequentialBubbleSort(seqArr);
|
||||
end = omp_get_wtime();
|
||||
time_seq_bubble = end - start;
|
||||
cout << "Sequential Bubble Sort time: " << time_seq_bubble << " seconds" << endl;
|
||||
|
||||
// --- Parallel Bubble Sort ---
|
||||
vector<int> parArr = arr;
|
||||
start = omp_get_wtime();
|
||||
parallelBubbleSort(parArr);
|
||||
end = omp_get_wtime();
|
||||
time_par_bubble = end - start;
|
||||
cout << "Parallel Bubble Sort time: " << time_par_bubble << " seconds" << endl;
|
||||
|
||||
cout << "Bubble Sort Speedup (Sequential / Parallel) = " << (time_seq_bubble / time_par_bubble) << "x" << endl;
|
||||
|
||||
// --- Sequential Merge Sort ---
|
||||
seqArr = arr;
|
||||
start = omp_get_wtime();
|
||||
sequentialMergeSort(seqArr, 0, n - 1);
|
||||
end = omp_get_wtime();
|
||||
time_seq_merge = end - start;
|
||||
cout << "\nSequential Merge Sort time: " << time_seq_merge << " seconds" << endl;
|
||||
|
||||
// --- Parallel Merge Sort ---
|
||||
parArr = arr;
|
||||
start = omp_get_wtime();
|
||||
parallelMergeSort(parArr, 0, n - 1);
|
||||
end = omp_get_wtime();
|
||||
time_par_merge = end - start;
|
||||
cout << "Parallel Merge Sort time: " << time_par_merge << " seconds" << endl;
|
||||
|
||||
cout << "Merge Sort Speedup (Sequential / Parallel) = " << (time_seq_merge / time_par_merge) << "x" << endl;
|
||||
|
||||
return 0;
|
||||
}
|
||||
// END OF CODE
|
||||
|
||||
/*
|
||||
EXAMPLE OUTPUT (when n=10000):
|
||||
|
||||
$ ./Code-2
|
||||
Sequential Bubble Sort time: 0.955394 seconds
|
||||
Parallel Bubble Sort time: 0.282093 seconds
|
||||
Bubble Sort Speedup (Sequential / Parallel) = 3.38681x
|
||||
|
||||
Sequential Merge Sort time: 0.0116294 seconds
|
||||
Parallel Merge Sort time: 0.00282529 seconds
|
||||
Merge Sort Speedup (Sequential / Parallel) = 4.11618x
|
||||
*/
|
||||
|
||||
@@ -0,0 +1,132 @@
|
||||
// Code-3 (Min, Max, Sum and Average Operations)
|
||||
|
||||
/*
|
||||
* THIS CODE HAS BEEN TESTED AND IS FULLY OPERATIONAL.
|
||||
*
|
||||
* Problem Statement: Implement Min, Max, Sum and Average operations using Parallel Reduction.
|
||||
*
|
||||
* Code from HighPerformanceComputing (SPPU - Final Year - Computer Engineering - Content)
|
||||
* repository on KSKA Git: https://git.kska.io/sppu-be-comp-content/HighPerformanceComputing
|
||||
**/
|
||||
|
||||
/*
|
||||
* EXECUTION INSTRUCTIONS (Debian-based distributions):
|
||||
*
|
||||
* i) Install g++ with OpenMP support:
|
||||
* sudo apt update
|
||||
* sudo apt install g++
|
||||
*
|
||||
* ii) Compile:
|
||||
* g++ -fopenmp Code-3.cpp -o Code-3
|
||||
*
|
||||
* iii) Execute:
|
||||
* ./Code-3
|
||||
**/
|
||||
|
||||
// BEGINNING OF CODE
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <omp.h>
|
||||
#include <cstdlib>
|
||||
|
||||
using namespace std;
|
||||
|
||||
// Computes min, max, sum and average of one million pseudo-random integers,
// once with a plain loop and once with an OpenMP reduction, then prints both
// result sets, both timings and the speedup.
int main() {
    // Uncomment to manually control thread count
    // omp_set_num_threads(4);

    // --- Input ---
    const int n = 1000000;
    vector<int> nums(n);
    for (int idx = 0; idx < n; ++idx)
        nums[idx] = rand() % 10000;

    cout << "Input: " << n << " random integers in the range [0, 9999]." << endl << endl;

    // --- Sequential ---
    // The sums are long long: up to 1,000,000 * 9,999 ≈ 10 billion, which
    // does not fit into a 32-bit int (~2.1 billion).
    int lo_seq = nums[0];
    int hi_seq = nums[0];
    long long total_seq = 0;

    double t_begin = omp_get_wtime();
    for (int idx = 0; idx < n; ++idx) {
        const int value = nums[idx];
        if (value < lo_seq) lo_seq = value;
        if (value > hi_seq) hi_seq = value;
        total_seq += value;
    }
    double t_finish = omp_get_wtime();

    // The average is derived outside the timed region, for both versions.
    const double mean_seq = (double)total_seq / n;
    const double time_seq = t_finish - t_begin;

    // --- Parallel ---
    int lo_par = nums[0];
    int hi_par = nums[0];
    long long total_par = 0;

    t_begin = omp_get_wtime();
    // Each reduction clause gives every thread a private copy of the variable
    // and combines the copies when the loop ends, so the loop body needs no
    // critical section even though all threads update the "same" variables.
    #pragma omp parallel for reduction(min: lo_par) reduction(max: hi_par) reduction(+: total_par)
    for (int idx = 0; idx < n; ++idx) {
        if (nums[idx] < lo_par) lo_par = nums[idx];
        if (nums[idx] > hi_par) hi_par = nums[idx];
        total_par += nums[idx];
    }
    t_finish = omp_get_wtime();

    const double mean_par = (double)total_par / n;
    const double time_par = t_finish - t_begin;

    // --- Output ---
    cout << "--- Sequential Computation ---" << endl;
    cout << "Minimum : " << lo_seq << endl;
    cout << "Maximum : " << hi_seq << endl;
    cout << "Sum : " << total_seq << endl;
    cout << "Average : " << mean_seq << endl;
    cout << "Time : " << time_seq << " seconds" << endl;

    cout << "\n--- Parallel Computation ---" << endl;
    cout << "Minimum : " << lo_par << endl;
    cout << "Maximum : " << hi_par << endl;
    cout << "Sum : " << total_par << endl;
    cout << "Average : " << mean_par << endl;
    cout << "Time : " << time_par << " seconds" << endl;

    cout << "\nSpeedup (Sequential / Parallel) = " << (time_seq / time_par) << "x" << endl;

    return 0;
}
|
||||
// END OF CODE
|
||||
|
||||
/*
|
||||
EXAMPLE OUTPUT:
|
||||
|
||||
$ ./Code-3
|
||||
Input: 1000000 random integers in the range [0, 9999].
|
||||
|
||||
--- Sequential Computation ---
|
||||
Minimum : 0
|
||||
Maximum : 9999
|
||||
Sum : 5000491283
|
||||
Average : 5000.49
|
||||
Time : 0.0205385 seconds
|
||||
|
||||
--- Parallel Computation ---
|
||||
Minimum : 0
|
||||
Maximum : 9999
|
||||
Sum : 5000491283
|
||||
Average : 5000.49
|
||||
Time : 0.0135714 seconds
|
||||
|
||||
Speedup (Sequential / Parallel) = 1.51336x
|
||||
*/
|
||||
@@ -0,0 +1,3 @@
|
||||
attachments/change-runtime.png filter=lfs diff=lfs merge=lfs -text
|
||||
attachments/runtime-navbar.png filter=lfs diff=lfs merge=lfs -text
|
||||
attachments/select-t4-gpu.png filter=lfs diff=lfs merge=lfs -text
|
||||
@@ -0,0 +1,267 @@
|
||||
# Practical-4 (Vector Addition and Matrix Multiplication)
|
||||
|
||||
Problem Statement:
|
||||
Write a CUDA Program for:
|
||||
1. Addition of two large vectors
|
||||
2. Matrix Multiplication using CUDA C
|
||||
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. Open [Google Colab](https://colab.research.google.com/)
|
||||
2. Create a new Jupyter Notebook
|
||||
|
||||
---
|
||||
|
||||
## Steps
|
||||
|
||||
### 1. After creating a new Jupyter notebook, click on "Runtime" in the navbar:
|
||||
|
||||
<img src="attachments/runtime-navbar.png" alt="Runtime in navbar in Google Colab" width=350>
|
||||
|
||||
### 2. Then, choose "Change runtime type":
|
||||
|
||||
<img src="attachments/change-runtime.png" alt="Change runtime type option in Runtime section on Google Colab" width=300>
|
||||
|
||||
### 3. Select "T4 GPU", and save:
|
||||
|
||||
<img src="attachments/select-t4-gpu.png" alt="T4 GPU option selected in Google Colab as Runtime" width=300>
|
||||
|
||||
### 4. Check if `nvcc` is installed:
|
||||
|
||||
```python3
|
||||
!nvcc --version
|
||||
```
|
||||
|
||||
### 5. Install `nvcc4jupyter`:
|
||||
|
||||
```python3
|
||||
!pip install nvcc4jupyter
|
||||
# Or, if the above command fails, comment out the line above and run:
|
||||
# !pip install git+https://git.kska.io/notkshitij/nvcc.git
|
||||
```
|
||||
|
||||
### 6. Load it:
|
||||
|
||||
```python3
|
||||
%load_ext nvcc4jupyter
|
||||
```
|
||||
|
||||
### 7. Paste the below code in a new code block:
|
||||
|
||||
```cu
|
||||
%%writefile cuda_program.cu
|
||||
#include <iostream>
|
||||
#include <cuda.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define BLOCK_SIZE 2
|
||||
|
||||
// Vector Addition Kernel
// Computes C[idx] = A[idx] + B[idx]; each thread handles at most one index.
__global__ void vectorAdd(int *A, int *B, int *C, int N) {
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    // When N is not a multiple of the block size, the last block contains
    // threads whose index lies past the end of the vectors; they do nothing.
    if (idx >= N)
        return;
    C[idx] = A[idx] + B[idx];
}
|
||||
|
||||
// Matrix Multiplication Kernel
// Computes C = A * B for N x N matrices indexed in row-major order.
// Each thread produces one element: thread (row, col) accumulates the dot
// product of row `row` of A with column `col` of B.
__global__ void matrixMul(float *A, float *B, float *C, int N) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Same guard as in vectorAdd: if the grid covers more than N x N threads
    // (N not a multiple of the block size), the surplus threads must not
    // read or write outside the matrices.
    if (row >= N || col >= N)
        return;

    float sum = 0.0f;
    for (int n = 0; n < N; ++n)
        sum += A[row * N + n] * B[n * N + col];

    C[row * N + col] = sum;
}
|
||||
|
||||
// Vector Addition
// Reads the vector size N from stdin, fills A[i] = i and B[i] = 2 * i on the
// host, computes C = A + B on the GPU and prints all three vectors.
// An invalid size or a failing CUDA call is reported on stderr.
void runVectorAddition() {
    int N;
    cout << "\n=== Vector Addition ===" << endl;
    cout << "Enter vector size: ";
    // Reject unreadable or non-positive sizes before they reach new[] and
    // cudaMalloc.
    if (!(cin >> N) || N <= 0) {
        cerr << "Invalid vector size." << endl;
        return;
    }

    // size_t: the byte count does not fit into an int for large N.
    const size_t size = static_cast<size_t>(N) * sizeof(int);

    // Host allocation and initialisation
    int *hA = new int[N];
    int *hB = new int[N];
    int *hC = new int[N];

    for (int i = 0; i < N; i++) {
        hA[i] = i;
        hB[i] = i * 2;
    }

    cout << "\nVector A: ";
    for (int i = 0; i < N; i++) cout << hA[i] << " ";
    cout << "\nVector B: ";
    for (int i = 0; i < N; i++) cout << hB[i] << " ";
    cout << endl;

    // Device allocation and transfer. Each step only runs if all previous
    // steps succeeded; the pointers start as nullptr so the cleanup at the end
    // is valid on every path (cudaFree on a null pointer does nothing).
    int *dA = nullptr, *dB = nullptr, *dC = nullptr;
    cudaError_t status = cudaMalloc(&dA, size);
    if (status == cudaSuccess) status = cudaMalloc(&dB, size);
    if (status == cudaSuccess) status = cudaMalloc(&dC, size);
    if (status == cudaSuccess) status = cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) status = cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        // Launch with enough blocks to cover all N elements.
        // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.
        int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
        vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);
        // A kernel launch returns nothing; launch errors must be queried.
        status = cudaGetLastError();
    }

    // The copy back waits for the kernel and reports errors from its execution.
    if (status == cudaSuccess) status = cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);

    if (status == cudaSuccess) {
        cout << "Result A + B: ";
        for (int i = 0; i < N; i++) cout << hC[i] << " ";
        cout << endl;
    } else {
        cerr << "CUDA error in vector addition: " << cudaGetErrorString(status) << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
|
||||
|
||||
// Prints the N x N row-major matrix `M`, one row per line, every element
// followed by a space.
static void printSquareMatrix(const float *M, int N) {
    for (int row = 0; row < N; row++) {
        for (int col = 0; col < N; col++)
            cout << M[row * N + col] << " ";
        cout << endl;
    }
}

// Matrix Multiplication
// Reads K from stdin, builds two N x N matrices (N = K * BLOCK_SIZE) filled
// with 2 and 4 respectively, multiplies them on the GPU and prints the inputs
// and the product. An invalid K or a failing CUDA call is reported on stderr.
void runMatrixMultiplication() {
    int K, N;
    cout << "\n=== Matrix Multiplication ===" << endl;
    cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
    // Reject unreadable or non-positive input before it is used as a size.
    if (!(cin >> K) || K <= 0) {
        cerr << "Invalid value for K." << endl;
        return;
    }
    N = K * BLOCK_SIZE;

    cout << "Matrix size: " << N << " x " << N << endl;
    // size_t: N * N * sizeof(float) overflows an int for moderately large N.
    const size_t count = static_cast<size_t>(N) * N;
    const size_t size = count * sizeof(float);

    // Host allocation and initialisation
    float *hA = new float[count];
    float *hB = new float[count];
    float *hC = new float[count];

    for (size_t i = 0; i < count; i++) {
        hA[i] = 2;
        hB[i] = 4;
    }

    cout << "\nMatrix A:\n";
    printSquareMatrix(hA, N);

    cout << "\nMatrix B:\n";
    printSquareMatrix(hB, N);

    // Device allocation and transfer. Each step only runs if all previous
    // steps succeeded; the pointers start as nullptr so the cleanup at the end
    // is valid on every path (cudaFree on a null pointer does nothing).
    float *dA = nullptr, *dB = nullptr, *dC = nullptr;
    cudaError_t status = cudaMalloc(&dA, size);
    if (status == cudaSuccess) status = cudaMalloc(&dB, size);
    if (status == cudaSuccess) status = cudaMalloc(&dC, size);
    if (status == cudaSuccess) status = cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) status = cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
        // grid: K x K blocks, so total threads = N x N (one per output element).
        dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
        dim3 grid(K, K);
        matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);
        // A kernel launch returns nothing; launch errors must be queried.
        status = cudaGetLastError();
    }

    // The copy back waits for the kernel and reports errors from its execution.
    if (status == cudaSuccess) status = cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);

    if (status == cudaSuccess) {
        cout << "\nResult C = A * B:\n";
        printSquareMatrix(hC, N);
    } else {
        cerr << "CUDA error in matrix multiplication: " << cudaGetErrorString(status) << endl;
    }

    delete[] hA;
    delete[] hB;
    delete[] hC;
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);
}
|
||||
|
||||
// Runs the vector-addition demo, then the matrix-multiplication demo; each one
// prompts for its own size on stdin.
int main() {
    runVectorAddition();
    runMatrixMultiplication();

    cout << "\nFinished." << endl;

    return 0;
}
|
||||
```
|
||||
|
||||
### 8. Compile and run:
|
||||
|
||||
```python3
|
||||
!nvcc cuda_program.cu -o cuda_program && ./cuda_program
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Sample output
|
||||
|
||||
```md
|
||||
=== Vector Addition ===
|
||||
Enter vector size: 2
|
||||
|
||||
Vector A: 0 1
|
||||
Vector B: 0 2
|
||||
Result A + B: 0 3
|
||||
|
||||
=== Matrix Multiplication ===
|
||||
Enter K (matrix will be N x N where N = K * 2): 3
|
||||
Matrix size: 6 x 6
|
||||
|
||||
Matrix A:
|
||||
2 2 2 2 2 2
|
||||
2 2 2 2 2 2
|
||||
2 2 2 2 2 2
|
||||
2 2 2 2 2 2
|
||||
2 2 2 2 2 2
|
||||
2 2 2 2 2 2
|
||||
|
||||
Matrix B:
|
||||
4 4 4 4 4 4
|
||||
4 4 4 4 4 4
|
||||
4 4 4 4 4 4
|
||||
4 4 4 4 4 4
|
||||
4 4 4 4 4 4
|
||||
4 4 4 4 4 4
|
||||
|
||||
Result C = A * B:
|
||||
48 48 48 48 48 48
|
||||
48 48 48 48 48 48
|
||||
48 48 48 48 48 48
|
||||
48 48 48 48 48 48
|
||||
48 48 48 48 48 48
|
||||
48 48 48 48 48 48
|
||||
|
||||
Finished.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
@@ -0,0 +1,348 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"provenance": [],
|
||||
"gpuType": "T4"
|
||||
},
|
||||
"kernelspec": {
|
||||
"name": "python3",
|
||||
"display_name": "Python 3"
|
||||
},
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
},
|
||||
"accelerator": "GPU"
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "sSZ5XEy-IFoj",
|
||||
"outputId": "8bac00c3-0327-4682-f636-b6b253db5201"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"nvcc: NVIDIA (R) Cuda compiler driver\n",
|
||||
"Copyright (c) 2005-2025 NVIDIA Corporation\n",
|
||||
"Built on Fri_Feb_21_20:23:50_PST_2025\n",
|
||||
"Cuda compilation tools, release 12.8, V12.8.93\n",
|
||||
"Build cuda_12.8.r12.8/compiler.35583870_0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!nvcc --version"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"!pip install nvcc4jupyter\n",
|
||||
"# Or if the above command fails, comment the above line and run\n",
|
||||
"# !pip install git+https://git.kska.io/notkshitij/nvcc.git"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "1jHq-AKfIINd",
|
||||
"outputId": "818ccbb0-9383-4be5-cd79-c9527c5806c9"
|
||||
},
|
||||
"execution_count": 25,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"Requirement already satisfied: nvcc4jupyter in /usr/local/lib/python3.12/dist-packages (1.2.1)\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"%load_ext nvcc4jupyter"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "9nuQsRZMIROH",
|
||||
"outputId": "e8508f1c-9895-4e8c-e7f0-d9e34aab21a1"
|
||||
},
|
||||
"execution_count": 16,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"The nvcc4jupyter extension is already loaded. To reload it, use:\n",
|
||||
" %reload_ext nvcc4jupyter\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"%%writefile cuda_program.cu\n",
|
||||
"#include <iostream>\n",
|
||||
"#include <cuda.h>\n",
|
||||
"\n",
|
||||
"using namespace std;\n",
|
||||
"\n",
|
||||
"#define BLOCK_SIZE 2\n",
|
||||
"\n",
|
||||
"// Vector Addition Kernel\n",
|
||||
"// Each thread computes a single element of C = A + B.\n",
|
||||
"__global__ void vectorAdd(int *A, int *B, int *C, int N) {\n",
|
||||
" int i = blockIdx.x * blockDim.x + threadIdx.x;\n",
|
||||
" // Guard against threads beyond the vector size (when N is not a multiple\n",
|
||||
" // of the block size, some threads in the last block are out of range).\n",
|
||||
" if (i < N)\n",
|
||||
" C[i] = A[i] + B[i];\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"// Matrix Multiplication Kernel\n",
|
||||
"// Each thread computes a single element of C = A * B.\n",
|
||||
"// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.\n",
|
||||
"__global__ void matrixMul(float *A, float *B, float *C, int N) {\n",
|
||||
" int row = blockIdx.y * blockDim.y + threadIdx.y;\n",
|
||||
" int col = blockIdx.x * blockDim.x + threadIdx.x;\n",
|
||||
"\n",
|
||||
" float sum = 0.0f;\n",
|
||||
" for (int n = 0; n < N; ++n)\n",
|
||||
" sum += A[row * N + n] * B[n * N + col];\n",
|
||||
"\n",
|
||||
" C[row * N + col] = sum;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"// Vector Addition\n",
|
||||
"void runVectorAddition() {\n",
|
||||
" int N;\n",
|
||||
" cout << \"\\n=== Vector Addition ===\" << endl;\n",
|
||||
" cout << \"Enter vector size: \";\n",
|
||||
" cin >> N;\n",
|
||||
"\n",
|
||||
" int size = N * sizeof(int);\n",
|
||||
"\n",
|
||||
" // Host allocation and initialisation\n",
|
||||
" int *hA = new int[N];\n",
|
||||
" int *hB = new int[N];\n",
|
||||
" int *hC = new int[N];\n",
|
||||
"\n",
|
||||
" for (int i = 0; i < N; i++) {\n",
|
||||
" hA[i] = i;\n",
|
||||
" hB[i] = i * 2;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" cout << \"\\nVector A: \";\n",
|
||||
" for (int i = 0; i < N; i++) cout << hA[i] << \" \";\n",
|
||||
" cout << \"\\nVector B: \";\n",
|
||||
" for (int i = 0; i < N; i++) cout << hB[i] << \" \";\n",
|
||||
" cout << endl;\n",
|
||||
"\n",
|
||||
" // Device allocation and transfer\n",
|
||||
" int *dA, *dB, *dC;\n",
|
||||
" cudaMalloc(&dA, size);\n",
|
||||
" cudaMalloc(&dB, size);\n",
|
||||
" cudaMalloc(&dC, size);\n",
|
||||
"\n",
|
||||
" cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
|
||||
" cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
|
||||
"\n",
|
||||
" // Launch with enough blocks to cover all N elements.\n",
|
||||
" // (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.\n",
|
||||
" int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;\n",
|
||||
" vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);\n",
|
||||
"\n",
|
||||
" cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
|
||||
"\n",
|
||||
" cout << \"Result A + B: \";\n",
|
||||
" for (int i = 0; i < N; i++) cout << hC[i] << \" \";\n",
|
||||
" cout << endl;\n",
|
||||
"\n",
|
||||
" delete[] hA;\n",
|
||||
" delete[] hB;\n",
|
||||
" delete[] hC;\n",
|
||||
" cudaFree(dA);\n",
|
||||
" cudaFree(dB);\n",
|
||||
" cudaFree(dC);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"// Matrix Multiplication\n",
|
||||
"void runMatrixMultiplication() {\n",
|
||||
" int K, N;\n",
|
||||
" cout << \"\\n=== Matrix Multiplication ===\" << endl;\n",
|
||||
" cout << \"Enter K (matrix will be N x N where N = K * \" << BLOCK_SIZE << \"): \";\n",
|
||||
" cin >> K;\n",
|
||||
" N = K * BLOCK_SIZE;\n",
|
||||
"\n",
|
||||
" cout << \"Matrix size: \" << N << \" x \" << N << endl;\n",
|
||||
" int size = N * N * sizeof(float);\n",
|
||||
"\n",
|
||||
" // Host allocation and initialisation\n",
|
||||
" float *hA = new float[N * N];\n",
|
||||
" float *hB = new float[N * N];\n",
|
||||
" float *hC = new float[N * N];\n",
|
||||
"\n",
|
||||
" for (int j = 0; j < N; j++) {\n",
|
||||
" for (int i = 0; i < N; i++) {\n",
|
||||
" hA[j * N + i] = 2;\n",
|
||||
" hB[j * N + i] = 4;\n",
|
||||
" }\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" cout << \"\\nMatrix A:\\n\";\n",
|
||||
" for (int row = 0; row < N; row++) {\n",
|
||||
" for (int col = 0; col < N; col++)\n",
|
||||
" cout << hA[row * N + col] << \" \";\n",
|
||||
" cout << endl;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" cout << \"\\nMatrix B:\\n\";\n",
|
||||
" for (int row = 0; row < N; row++) {\n",
|
||||
" for (int col = 0; col < N; col++)\n",
|
||||
" cout << hB[row * N + col] << \" \";\n",
|
||||
" cout << endl;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" // Device allocation and transfer\n",
|
||||
" float *dA, *dB, *dC;\n",
|
||||
" cudaMalloc(&dA, size);\n",
|
||||
" cudaMalloc(&dB, size);\n",
|
||||
" cudaMalloc(&dC, size);\n",
|
||||
"\n",
|
||||
" cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);\n",
|
||||
" cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);\n",
|
||||
"\n",
|
||||
" // threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.\n",
|
||||
" // grid: K x K blocks, so total threads = N x N (one per output element).\n",
|
||||
" dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);\n",
|
||||
" dim3 grid(K, K);\n",
|
||||
" matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);\n",
|
||||
"\n",
|
||||
" cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);\n",
|
||||
"\n",
|
||||
" cout << \"\\nResult C = A * B:\\n\";\n",
|
||||
" for (int row = 0; row < N; row++) {\n",
|
||||
" for (int col = 0; col < N; col++)\n",
|
||||
" cout << hC[row * N + col] << \" \";\n",
|
||||
" cout << endl;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" delete[] hA;\n",
|
||||
" delete[] hB;\n",
|
||||
" delete[] hC;\n",
|
||||
" cudaFree(dA);\n",
|
||||
" cudaFree(dB);\n",
|
||||
" cudaFree(dC);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"int main() {\n",
|
||||
" runVectorAddition();\n",
|
||||
" runMatrixMultiplication();\n",
|
||||
"\n",
|
||||
" cout << \"\\nFinished.\" << endl;\n",
|
||||
" return 0;\n",
|
||||
"}\n"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "nvCj8UmhIh3o",
|
||||
"outputId": "54a31aec-f860-4b72-a4e1-03a392def6f6"
|
||||
},
|
||||
"execution_count": 23,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"Overwriting cuda_program.cu\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"!nvcc cuda_program.cu -o cuda_program && ./cuda_program"
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/"
|
||||
},
|
||||
"id": "F7fC0LtbJ5o8",
|
||||
"outputId": "9d5988b2-ad42-4b0b-c84d-c1698e778bb9"
|
||||
},
|
||||
"execution_count": 24,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"name": "stdout",
|
||||
"text": [
|
||||
"nvcc warning : Support for offline compilation for architectures prior to '<compute/sm/lto>_75' will be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n",
|
||||
"\n",
|
||||
"=== Vector Addition ===\n",
|
||||
"Enter vector size: 2\n",
|
||||
"\n",
|
||||
"Vector A: 0 1 \n",
|
||||
"Vector B: 0 2 \n",
|
||||
"Result A + B: 0 3 \n",
|
||||
"\n",
|
||||
"=== Matrix Multiplication ===\n",
|
||||
"Enter K (matrix will be N x N where N = K * 2): 3\n",
|
||||
"Matrix size: 6 x 6\n",
|
||||
"\n",
|
||||
"Matrix A:\n",
|
||||
"2 2 2 2 2 2 \n",
|
||||
"2 2 2 2 2 2 \n",
|
||||
"2 2 2 2 2 2 \n",
|
||||
"2 2 2 2 2 2 \n",
|
||||
"2 2 2 2 2 2 \n",
|
||||
"2 2 2 2 2 2 \n",
|
||||
"\n",
|
||||
"Matrix B:\n",
|
||||
"4 4 4 4 4 4 \n",
|
||||
"4 4 4 4 4 4 \n",
|
||||
"4 4 4 4 4 4 \n",
|
||||
"4 4 4 4 4 4 \n",
|
||||
"4 4 4 4 4 4 \n",
|
||||
"4 4 4 4 4 4 \n",
|
||||
"\n",
|
||||
"Result C = A * B:\n",
|
||||
"48 48 48 48 48 48 \n",
|
||||
"48 48 48 48 48 48 \n",
|
||||
"48 48 48 48 48 48 \n",
|
||||
"48 48 48 48 48 48 \n",
|
||||
"48 48 48 48 48 48 \n",
|
||||
"48 48 48 48 48 48 \n",
|
||||
"\n",
|
||||
"Finished.\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"source": [],
|
||||
"metadata": {
|
||||
"id": "HjhrulSNKHkq"
|
||||
},
|
||||
"execution_count": null,
|
||||
"outputs": []
|
||||
}
|
||||
]
|
||||
}
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,161 @@
|
||||
// %%writefile cuda_program.cu   (Jupyter/Colab cell magic; strip the leading "// " when pasting this file into a notebook cell)
|
||||
#include <iostream>
|
||||
#include <cuda.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define BLOCK_SIZE 2
|
||||
|
||||
// ─── Vector Addition Kernel ──────────────────────────────────────────────────
|
||||
// Each thread computes a single element of C = A + B.
|
||||
__global__ void vectorAdd(int *A, int *B, int *C, int N) {
|
||||
int i = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
// Guard against threads beyond the vector size (when N is not a multiple
|
||||
// of the block size, some threads in the last block are out of range).
|
||||
if (i < N)
|
||||
C[i] = A[i] + B[i];
|
||||
}
|
||||
|
||||
// ─── Matrix Multiplication Kernel ────────────────────────────────────────────
|
||||
// Each thread computes a single element of C = A * B.
|
||||
// Thread (row, col) sums the dot product of row `row` of A with column `col` of B.
|
||||
__global__ void matrixMul(float *A, float *B, float *C, int N) {
|
||||
int row = blockIdx.y * blockDim.y + threadIdx.y;
|
||||
int col = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
|
||||
float sum = 0.0f;
|
||||
for (int n = 0; n < N; ++n)
|
||||
sum += A[row * N + n] * B[n * N + col];
|
||||
|
||||
C[row * N + col] = sum;
|
||||
}
|
||||
|
||||
// ─── Vector Addition ─────────────────────────────────────────────────────────
|
||||
void runVectorAddition() {
|
||||
int N;
|
||||
cout << "\n=== Vector Addition ===" << endl;
|
||||
cout << "Enter vector size: ";
|
||||
cin >> N;
|
||||
|
||||
int size = N * sizeof(int);
|
||||
|
||||
// Host allocation and initialisation
|
||||
int *hA = new int[N];
|
||||
int *hB = new int[N];
|
||||
int *hC = new int[N];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
hA[i] = i;
|
||||
hB[i] = i * 2;
|
||||
}
|
||||
|
||||
cout << "\nVector A: ";
|
||||
for (int i = 0; i < N; i++) cout << hA[i] << " ";
|
||||
cout << "\nVector B: ";
|
||||
for (int i = 0; i < N; i++) cout << hB[i] << " ";
|
||||
cout << endl;
|
||||
|
||||
// Device allocation and transfer
|
||||
int *dA, *dB, *dC;
|
||||
cudaMalloc(&dA, size);
|
||||
cudaMalloc(&dB, size);
|
||||
cudaMalloc(&dC, size);
|
||||
|
||||
cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);
|
||||
|
||||
// Launch with enough blocks to cover all N elements.
|
||||
// (N + BLOCK_SIZE - 1) / BLOCK_SIZE rounds up so we don't miss the tail.
|
||||
int numBlocks = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||
vectorAdd<<<numBlocks, BLOCK_SIZE>>>(dA, dB, dC, N);
|
||||
|
||||
cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);
|
||||
|
||||
cout << "Result A + B: ";
|
||||
for (int i = 0; i < N; i++) cout << hC[i] << " ";
|
||||
cout << endl;
|
||||
|
||||
delete[] hA;
|
||||
delete[] hB;
|
||||
delete[] hC;
|
||||
cudaFree(dA);
|
||||
cudaFree(dB);
|
||||
cudaFree(dC);
|
||||
}
|
||||
|
||||
// ─── Matrix Multiplication ───────────────────────────────────────────────────
|
||||
void runMatrixMultiplication() {
|
||||
int K, N;
|
||||
cout << "\n=== Matrix Multiplication ===" << endl;
|
||||
cout << "Enter K (matrix will be N x N where N = K * " << BLOCK_SIZE << "): ";
|
||||
cin >> K;
|
||||
N = K * BLOCK_SIZE;
|
||||
|
||||
cout << "Matrix size: " << N << " x " << N << endl;
|
||||
int size = N * N * sizeof(float);
|
||||
|
||||
// Host allocation and initialisation
|
||||
float *hA = new float[N * N];
|
||||
float *hB = new float[N * N];
|
||||
float *hC = new float[N * N];
|
||||
|
||||
for (int j = 0; j < N; j++) {
|
||||
for (int i = 0; i < N; i++) {
|
||||
hA[j * N + i] = 2;
|
||||
hB[j * N + i] = 4;
|
||||
}
|
||||
}
|
||||
|
||||
cout << "\nMatrix A:\n";
|
||||
for (int row = 0; row < N; row++) {
|
||||
for (int col = 0; col < N; col++)
|
||||
cout << hA[row * N + col] << " ";
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
cout << "\nMatrix B:\n";
|
||||
for (int row = 0; row < N; row++) {
|
||||
for (int col = 0; col < N; col++)
|
||||
cout << hB[row * N + col] << " ";
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
// Device allocation and transfer
|
||||
float *dA, *dB, *dC;
|
||||
cudaMalloc(&dA, size);
|
||||
cudaMalloc(&dB, size);
|
||||
cudaMalloc(&dC, size);
|
||||
|
||||
cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
|
||||
cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);
|
||||
|
||||
// threadBlock: BLOCK_SIZE x BLOCK_SIZE threads per block.
|
||||
// grid: K x K blocks, so total threads = N x N (one per output element).
|
||||
dim3 threadBlock(BLOCK_SIZE, BLOCK_SIZE);
|
||||
dim3 grid(K, K);
|
||||
matrixMul<<<grid, threadBlock>>>(dA, dB, dC, N);
|
||||
|
||||
cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);
|
||||
|
||||
cout << "\nResult C = A * B:\n";
|
||||
for (int row = 0; row < N; row++) {
|
||||
for (int col = 0; col < N; col++)
|
||||
cout << hC[row * N + col] << " ";
|
||||
cout << endl;
|
||||
}
|
||||
|
||||
delete[] hA;
|
||||
delete[] hB;
|
||||
delete[] hC;
|
||||
cudaFree(dA);
|
||||
cudaFree(dB);
|
||||
cudaFree(dC);
|
||||
}
|
||||
|
||||
int main() {
|
||||
runVectorAddition();
|
||||
runMatrixMultiplication();
|
||||
|
||||
cout << "\nFinished." << endl;
|
||||
return 0;
|
||||
}
|
||||
BIN
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
@@ -10,6 +10,11 @@ This repository compiles essential resources for the SPPU Computer Engineering P
|
||||
|
||||
### Codes
|
||||
|
||||
1. [Code-1 (Parallel BFS and DFS)](Codes/Code-1.cpp)
|
||||
2. [Code-2 (Sequential and Parallel Bubble Sort and Merge Sort)](Codes/Code-2.cpp)
|
||||
3. [Code-3 (Min, Max, Sum, Average)](Codes/Code-3.cpp)
|
||||
4. [Code-4 (Vector Addition and Matrix Multiplication)](Codes/Code-4/)
|
||||
|
||||
### Practical
|
||||
|
||||
1. [Practical-1](Practical/Practical-1/)
|
||||
@@ -18,6 +23,15 @@ This repository compiles essential resources for the SPPU Computer Engineering P
|
||||
4. [Practical-4](Practical/Practical-4/)
|
||||
5. [Mini Project](Practical/HPC%20-%20Mini%20Project%20-%20Handout.doc)
|
||||
|
||||
### Assignments
|
||||
|
||||
1. Assignment-1
|
||||
- [Questions](Assignments/HPC%20-%20Assignment-1%20%28Questions%29.pdf)
|
||||
- [Answers](Assignments/HPC%20-%20Assignment-1%20%28Answers%29.pdf)
|
||||
2. Assignment-2
|
||||
- [Questions](Assignments/HPC%20-%20Assignment-2%20%28Questions%29.pdf)
|
||||
- [Answers](Assignments/HPC%20-%20Assignment-2%20%28Answers%29.pdf)
|
||||
|
||||
### Question Papers
|
||||
|
||||
- [IN-SEM](Question%20Papers/IN-SEM)
|
||||
@@ -25,6 +39,8 @@ This repository compiles essential resources for the SPPU Computer Engineering P
|
||||
|
||||
### [IN-SEM PYQ Answers](Notes/IN-SEM%20PYQ%20Answers/)
|
||||
|
||||
### [END-SEM PYQ Answers](Notes/END-SEM%20PYQ%20Answers/)
|
||||
|
||||
---
|
||||
|
||||
## Miscellaneous
|
||||
|
||||
Reference in New Issue
Block a user