mirror of
https://github.com/andreinechaev/nvcc4jupyter.git
synced 2026-06-13 18:50:47 +05:30
Add readthedocs config and sphinx documentation
This commit is contained in:
@@ -0,0 +1,32 @@
|
|||||||
|
# .readthedocs.yaml
|
||||||
|
# Read the Docs configuration file
|
||||||
|
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
|
||||||
|
|
||||||
|
# Required
|
||||||
|
version: 2
|
||||||
|
|
||||||
|
# Set the OS, Python version and other tools you might need
|
||||||
|
build:
|
||||||
|
os: ubuntu-22.04
|
||||||
|
tools:
|
||||||
|
python: "3.12"
|
||||||
|
# You can also specify other tool versions:
|
||||||
|
# nodejs: "19"
|
||||||
|
# rust: "1.64"
|
||||||
|
# golang: "1.19"
|
||||||
|
|
||||||
|
# Build documentation in the "docs/" directory with Sphinx
|
||||||
|
sphinx:
|
||||||
|
configuration: docs/conf.py
|
||||||
|
|
||||||
|
# Optionally build your docs in additional formats such as PDF and ePub
|
||||||
|
# formats:
|
||||||
|
# - pdf
|
||||||
|
# - epub
|
||||||
|
|
||||||
|
# Optional but recommended, declare the Python requirements required
|
||||||
|
# to build your documentation
|
||||||
|
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
|
||||||
|
python:
|
||||||
|
install:
|
||||||
|
- requirements: docs/requirements.txt
|
||||||
@@ -0,0 +1,20 @@
|
|||||||
|
# Minimal makefile for Sphinx documentation
|
||||||
|
#
|
||||||
|
|
||||||
|
# You can set these variables from the command line, and also
|
||||||
|
# from the environment for the first two.
|
||||||
|
SPHINXOPTS ?=
|
||||||
|
SPHINXBUILD ?= sphinx-build
|
||||||
|
SOURCEDIR = source
|
||||||
|
BUILDDIR = build
|
||||||
|
|
||||||
|
# Put it first so that "make" without argument is like "make help".
|
||||||
|
help:
|
||||||
|
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
|
|
||||||
|
.PHONY: help Makefile
|
||||||
|
|
||||||
|
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||||
|
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||||
|
%: Makefile
|
||||||
|
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
@ECHO OFF
|
||||||
|
|
||||||
|
pushd %~dp0
|
||||||
|
|
||||||
|
REM Command file for Sphinx documentation
|
||||||
|
|
||||||
|
if "%SPHINXBUILD%" == "" (
|
||||||
|
set SPHINXBUILD=sphinx-build
|
||||||
|
)
|
||||||
|
set SOURCEDIR=source
|
||||||
|
set BUILDDIR=build
|
||||||
|
|
||||||
|
if "%1" == "" goto help
|
||||||
|
|
||||||
|
%SPHINXBUILD% >NUL 2>NUL
|
||||||
|
if errorlevel 9009 (
|
||||||
|
echo.
|
||||||
|
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||||
|
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||||
|
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||||
|
echo.may add the Sphinx directory to PATH.
|
||||||
|
echo.
|
||||||
|
echo.If you don't have Sphinx installed, grab it from
|
||||||
|
echo.http://sphinx-doc.org/
|
||||||
|
exit /b 1
|
||||||
|
)
|
||||||
|
|
||||||
|
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||||
|
goto end
|
||||||
|
|
||||||
|
:help
|
||||||
|
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||||
|
|
||||||
|
:end
|
||||||
|
popd
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
sphinx==7.1.2
|
||||||
|
sphinx-rtd-theme==1.3.0rc1
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
# Configuration file for the Sphinx documentation builder.
|
||||||
|
#
|
||||||
|
# For the full list of built-in configuration values, see the documentation:
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
||||||
|
|
||||||
|
# -- Project information -----------------------------------------------------
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
|
||||||
|
|
||||||
|
project = "nvcc4jupyter"
|
||||||
|
copyright = "2024, Andrei Nechaev & Cosmin Stefan Ciocan"
|
||||||
|
author = "Andrei Nechaev & Cosmin Stefan Ciocan"
|
||||||
|
release = "1.0.1"
|
||||||
|
version = "1.0.1"
|
||||||
|
|
||||||
|
# -- General configuration ---------------------------------------------------
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
|
||||||
|
|
||||||
|
extensions = [
|
||||||
|
"sphinx.ext.duration",
|
||||||
|
"sphinx.ext.doctest",
|
||||||
|
"sphinx.ext.autodoc",
|
||||||
|
"sphinx.ext.autosummary",
|
||||||
|
"sphinx.ext.intersphinx",
|
||||||
|
]
|
||||||
|
|
||||||
|
intersphinx_mapping = {
|
||||||
|
"python": ("https://docs.python.org/3/", None),
|
||||||
|
"sphinx": ("https://www.sphinx-doc.org/en/master/", None),
|
||||||
|
}
|
||||||
|
intersphinx_disabled_domains = ["std"]
|
||||||
|
|
||||||
|
templates_path = ["_templates"]
|
||||||
|
exclude_patterns = []
|
||||||
|
|
||||||
|
|
||||||
|
# -- Options for HTML output -------------------------------------------------
|
||||||
|
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
|
||||||
|
|
||||||
|
html_theme = "sphinx_rtd_theme"
|
||||||
|
html_static_path = ["_static"]
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
Welcome to nvcc4jupyter's documentation!
|
||||||
|
========================================
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
This project is under active development.
|
||||||
|
|
||||||
|
Contents
|
||||||
|
--------
|
||||||
|
|
||||||
|
.. toctree::
|
||||||
|
:maxdepth: 2
|
||||||
|
:caption: Contents:
|
||||||
|
|
||||||
|
usage
|
||||||
|
magics
|
||||||
|
|
||||||
|
Indices and tables
|
||||||
|
==================
|
||||||
|
|
||||||
|
* :ref:`genindex`
|
||||||
|
* :ref:`modindex`
|
||||||
|
* :ref:`search`
|
||||||
@@ -0,0 +1,172 @@
|
|||||||
|
**********
|
||||||
|
Magics API
|
||||||
|
**********
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
Arguments for profilers and the nvcc compiler can be passed in double
|
||||||
|
quotes so they can contain spaces and dashes.
|
||||||
|
|
||||||
|
------
|
||||||
|
|
||||||
|
.. _cuda_magic:
|
||||||
|
|
||||||
|
cuda
|
||||||
|
====
|
||||||
|
|
||||||
|
Magic command that compiles, runs, and profiles CUDA C++ code in the cell.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
|
||||||
|
- ``%%cuda``: Compile and run this cell.
|
||||||
|
- ``%%cuda -p``: Also runs the Nsight Compute profiler.
|
||||||
|
- ``%%cuda -p -a "<SPACE SEPARATED PROFILER ARGS>"``: Also runs the Nsight Compute profiler.
|
||||||
|
- ``%%cuda -t``: Outputs the "timeit" built-in magic results.
|
||||||
|
|
||||||
|
Options
|
||||||
|
-------
|
||||||
|
|
||||||
|
-t, --timeit
|
||||||
|
Boolean. If set, returns the output of the "timeit" built-in
|
||||||
|
ipython magic instead of stdout.
|
||||||
|
|
||||||
|
-p, --profile
|
||||||
|
Boolean. If set, runs the NVIDIA Nsight Compute profiler whose
|
||||||
|
output is appended to standard output.
|
||||||
|
|
||||||
|
-a, --profiler-args
|
||||||
|
String. Optional profiler arguments that can be space separated
|
||||||
|
by wrapping them in double quotes. See all options here:
|
||||||
|
`Nsight Compute CLI <https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html#command-line-options>`_
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
If both "\-\-profile" and "\-\-timeit" are used then no profiling is
|
||||||
|
done.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
::
|
||||||
|
|
||||||
|
# compile, run, and profile the code in the cell with the Nsight
|
||||||
|
# compute profiler while collecting only metrics from the
|
||||||
|
# "MemoryWorkloadAnalysis" section.
|
||||||
|
%%cuda --profile --profiler-args "--section MemoryWorkloadAnalysis"
|
||||||
|
|
||||||
|
------
|
||||||
|
|
||||||
|
.. _cuda_group_save_magic:
|
||||||
|
|
||||||
|
cuda_group_save
|
||||||
|
===============
|
||||||
|
|
||||||
|
Magic command that saves CUDA C++ code in the cell for later
|
||||||
|
compilation and execution with possibly more source files.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
|
||||||
|
- ``%%cuda_group_save -n <FILENAME> -g <GROUPNAME>``: Save the code in the current cell to a group of source files.
|
||||||
|
|
||||||
|
Options
|
||||||
|
-------
|
||||||
|
|
||||||
|
-n, --name
|
||||||
|
String. Required file name of the saved source file. Must have
|
||||||
|
either the ".cu" or ".h" extension. In order to import a header
|
||||||
|
file saved with this magic you can simply add '#include "<name>"'.
|
||||||
|
|
||||||
|
-g, --group
|
||||||
|
String. Required group name to which to add the saved source file.
|
||||||
|
Groups are source files that get compiled together and do not
|
||||||
|
interact with other groups. This allows you to have multiple
|
||||||
|
unrelated CUDA programs within the same jupyter notebook. Adding
|
||||||
|
files to a group named "shared" will make them available to all
|
||||||
|
other source file groups. One use case for the shared group is for
|
||||||
|
sharing error handling code which should be present in all CUDA
|
||||||
|
programs.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
::
|
||||||
|
|
||||||
|
# jupyter cell 1
|
||||||
|
%%cuda_group_save -n "error_handling.h" -g "shared"
|
||||||
|
<ERROR HANDLING CODE>
|
||||||
|
|
||||||
|
# jupyter cell 2
|
||||||
|
%%cuda_group_save -n "main.cu" -g "example_group"
|
||||||
|
#include "error_handling.h"
|
||||||
|
<YOUR CODE HERE>
|
||||||
|
|
||||||
|
------
|
||||||
|
|
||||||
|
.. _cuda_group_run_magic:
|
||||||
|
|
||||||
|
cuda_group_run
|
||||||
|
==============
|
||||||
|
|
||||||
|
Line magic command that compiles, runs, and profiles all source files
|
||||||
|
in a group.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
|
||||||
|
- ``%%cuda_group_run -g <GROUPNAME>``: Compiles, runs, and profiles the sources files in the given group.
|
||||||
|
|
||||||
|
Options
|
||||||
|
-------
|
||||||
|
|
||||||
|
-g, --group
|
||||||
|
String. Required group name whose source files should be deleted.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
All options from the "%%cuda" cell magic are inherited.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
::
|
||||||
|
|
||||||
|
# jupyter cell 1
|
||||||
|
%%cuda_group_save -n "error_handling.h" -g "shared"
|
||||||
|
<ERROR HANDLING CODE>
|
||||||
|
|
||||||
|
# jupyter cell 2
|
||||||
|
%%cuda_group_save -n "main.cu" -g "example_group"
|
||||||
|
#include "error_handling.h"
|
||||||
|
<YOUR CODE HERE>
|
||||||
|
|
||||||
|
# jupyter cell 3
|
||||||
|
%cuda_group_run -g "example_group" --profile
|
||||||
|
|
||||||
|
-----
|
||||||
|
|
||||||
|
.. _cuda_group_delete_magic:
|
||||||
|
|
||||||
|
cuda_group_delete
|
||||||
|
=================
|
||||||
|
|
||||||
|
Line magic command that deletes all source files in a group.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
|
||||||
|
- ``%%cuda_group_delete -g <GROUPNAME>``: Removes all source files in the given group.
|
||||||
|
|
||||||
|
Options
|
||||||
|
-------
|
||||||
|
|
||||||
|
-g, --group
|
||||||
|
String. Required group name whose source files should be deleted.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
--------
|
||||||
|
::
|
||||||
|
|
||||||
|
# jupyter cell 1
|
||||||
|
%%cuda_group_save -n "error_handling.h" -g "shared"
|
||||||
|
<ERROR HANDLING CODE>
|
||||||
|
|
||||||
|
# jupyter cell 2 - here we delete the error shared group; in
|
||||||
|
# practice this would be helpful if you want to overwrite some
|
||||||
|
# functionality that was defined earlier in the notebook
|
||||||
|
%cuda_group_delete -g "shared"
|
||||||
@@ -0,0 +1,265 @@
|
|||||||
|
Usage
|
||||||
|
=====
|
||||||
|
|
||||||
|
This IPython extension allows running CUDA C++ code in Jupyter notebook. This
|
||||||
|
is especially useful when combined with `Google Colab <https://colab.research.google.com/>`_
|
||||||
|
which provides CUDA capable GPUs with the CUDA toolkit already installed.
|
||||||
|
|
||||||
|
.. _installation:
|
||||||
|
|
||||||
|
Installation
|
||||||
|
------------
|
||||||
|
|
||||||
|
To use nvcc4jupyter, first install it using pip:
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
(venv) $ pip install nvcc4jupyter
|
||||||
|
|
||||||
|
.. _load_extension:
|
||||||
|
|
||||||
|
Load the Extension
|
||||||
|
------------------
|
||||||
|
|
||||||
|
Now we need to load the IPython extension to be able to use its cell and line
|
||||||
|
magic commands:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
%load_ext nvcc4jupyter
|
||||||
|
|
||||||
|
Hello World
|
||||||
|
-----------
|
||||||
|
|
||||||
|
We will use the :ref:`cuda <cuda_magic>` cell magic command to run a simple
|
||||||
|
hello world program.
|
||||||
|
|
||||||
|
.. code-block:: c++
|
||||||
|
|
||||||
|
%%cuda
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
__global__ void hello(){
|
||||||
|
printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(){
|
||||||
|
hello<<<2, 2>>>();
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
}
|
||||||
|
|
||||||
|
Groups
|
||||||
|
------
|
||||||
|
|
||||||
|
Now we will demonstrate a more complex scenario that uses source file groups.
|
||||||
|
If you want to split your code into multiple source files, either for code reuse
|
||||||
|
or just to have an easier to read project, you want to use groups. A group of
|
||||||
|
source files will be compiled together. Because of this, you can include headers
|
||||||
|
from the same group and use the code defined in other ".cu" files. There is also
|
||||||
|
a special group named "shared" whose files will be compiled together with all
|
||||||
|
other groups, which is a great feature for error handling code as we'll show now:
|
||||||
|
|
||||||
|
.. code-block:: c++
|
||||||
|
|
||||||
|
%%cuda_group_save --group shared --name "error_handling.h"
|
||||||
|
// error checking macro
|
||||||
|
#define cudaCheckErrors(msg) \
|
||||||
|
do { \
|
||||||
|
cudaError_t __err = cudaGetLastError(); \
|
||||||
|
if (__err != cudaSuccess) { \
|
||||||
|
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
|
||||||
|
msg, cudaGetErrorString(__err), \
|
||||||
|
__FILE__, __LINE__); \
|
||||||
|
fprintf(stderr, "*** FAILED - ABORTING\n"); \
|
||||||
|
exit(1); \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
|
||||||
|
Now we can use that error handling macro in this vector addition program but
|
||||||
|
also in other programs that we define in other Jupyter cells:
|
||||||
|
|
||||||
|
.. code-block:: c++
|
||||||
|
|
||||||
|
%%cuda
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "error_handling.h"
|
||||||
|
|
||||||
|
const int DSIZE = 4096;
|
||||||
|
const int block_size = 256;
|
||||||
|
|
||||||
|
// vector add kernel: C = A + B
|
||||||
|
__global__ void vadd(const float *A, const float *B, float *C, int ds){
|
||||||
|
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
|
if (idx < ds) {
|
||||||
|
C[idx] = A[idx] + B[idx];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(){
|
||||||
|
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
|
||||||
|
|
||||||
|
// allocate space for vectors in host memory
|
||||||
|
h_A = new float[DSIZE];
|
||||||
|
h_B = new float[DSIZE];
|
||||||
|
h_C = new float[DSIZE];
|
||||||
|
|
||||||
|
// initialize vectors in host memory to random values (except for the
|
||||||
|
// result vector whose values do not matter as they will be overwritten)
|
||||||
|
for (int i = 0; i < DSIZE; i++) {
|
||||||
|
h_A[i] = rand()/(float)RAND_MAX;
|
||||||
|
h_B[i] = rand()/(float)RAND_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate space for vectors in device memory
|
||||||
|
cudaMalloc(&d_A, DSIZE*sizeof(float));
|
||||||
|
cudaMalloc(&d_B, DSIZE*sizeof(float));
|
||||||
|
cudaMalloc(&d_C, DSIZE*sizeof(float));
|
||||||
|
cudaCheckErrors("cudaMalloc failure"); // error checking
|
||||||
|
|
||||||
|
// copy vectors A and B from host to device:
|
||||||
|
cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
|
||||||
|
cudaCheckErrors("cudaMemcpy H2D failure");
|
||||||
|
|
||||||
|
// launch the vector adding kernel
|
||||||
|
vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
|
||||||
|
cudaCheckErrors("kernel launch failure");
|
||||||
|
|
||||||
|
// wait for the kernel to finish execution
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
cudaCheckErrors("kernel execution failure");
|
||||||
|
|
||||||
|
cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
|
||||||
|
cudaCheckErrors("cudaMemcpy D2H failure");
|
||||||
|
|
||||||
|
printf("A[0] = %f\n", h_A[0]);
|
||||||
|
printf("B[0] = %f\n", h_B[0]);
|
||||||
|
printf("C[0] = %f\n", h_C[0]);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
Above we use the :ref:`cuda <cuda_magic>` magic command which saves the code
|
||||||
|
in the cell to an anonymous source file group, compiles, and executes that
|
||||||
|
code. This only allows us to have one source file (besides the ones in the
|
||||||
|
"shared" group). In order to have multiple source files we need to use the
|
||||||
|
:ref:`cuda_group_save <cuda_group_save_magic>` and
|
||||||
|
:ref:`cuda_group_run <cuda_group_run_magic>` magics.
|
||||||
|
|
||||||
|
First, we save the vector addition function to its own file:
|
||||||
|
|
||||||
|
|
||||||
|
.. code-block:: c++
|
||||||
|
|
||||||
|
%%cuda_group_save --name "vector_add.cu" --group "vector_add"
|
||||||
|
// vector add kernel: C = A + B
|
||||||
|
__global__ void vadd(const float *A, const float *B, float *C, int ds){
|
||||||
|
int idx = threadIdx.x + blockIdx.x * blockDim.x;
|
||||||
|
if (idx < ds) {
|
||||||
|
C[idx] = A[idx] + B[idx];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Now we create a header file so the main cuda file knows the signature of "vadd":
|
||||||
|
|
||||||
|
.. code-block:: c++
|
||||||
|
|
||||||
|
%%cuda_group_save --name "vector_add.h" --group "vector_add"
|
||||||
|
__global__ void vadd(const float *A, const float *B, float *C, int ds);
|
||||||
|
|
||||||
|
To tie it all together, we save the main cuda file, which includes our vector
|
||||||
|
addition code:
|
||||||
|
|
||||||
|
.. code-block:: c++
|
||||||
|
|
||||||
|
%%cuda_group_save --name "main.cu" --group "vector_add"
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "error_handling.h"
|
||||||
|
#include "vector_add.h"
|
||||||
|
|
||||||
|
const int DSIZE = 4096;
|
||||||
|
const int block_size = 256;
|
||||||
|
|
||||||
|
int main(){
|
||||||
|
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
|
||||||
|
|
||||||
|
// allocate space for vectors in host memory
|
||||||
|
h_A = new float[DSIZE];
|
||||||
|
h_B = new float[DSIZE];
|
||||||
|
h_C = new float[DSIZE];
|
||||||
|
|
||||||
|
// initialize vectors in host memory to random values (except for the
|
||||||
|
// result vector whose values do not matter as they will be overwritten)
|
||||||
|
for (int i = 0; i < DSIZE; i++) {
|
||||||
|
h_A[i] = rand()/(float)RAND_MAX;
|
||||||
|
h_B[i] = rand()/(float)RAND_MAX;
|
||||||
|
}
|
||||||
|
|
||||||
|
// allocate space for vectors in device memory
|
||||||
|
cudaMalloc(&d_A, DSIZE*sizeof(float));
|
||||||
|
cudaMalloc(&d_B, DSIZE*sizeof(float));
|
||||||
|
cudaMalloc(&d_C, DSIZE*sizeof(float));
|
||||||
|
cudaCheckErrors("cudaMalloc failure"); // error checking
|
||||||
|
|
||||||
|
// copy vectors A and B from host to device:
|
||||||
|
cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
|
||||||
|
cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
|
||||||
|
cudaCheckErrors("cudaMemcpy H2D failure");
|
||||||
|
|
||||||
|
// launch the vector adding kernel
|
||||||
|
vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
|
||||||
|
cudaCheckErrors("kernel launch failure");
|
||||||
|
|
||||||
|
// wait for the kernel to finish execution
|
||||||
|
cudaDeviceSynchronize();
|
||||||
|
cudaCheckErrors("kernel execution failure");
|
||||||
|
|
||||||
|
cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
|
||||||
|
cudaCheckErrors("cudaMemcpy D2H failure");
|
||||||
|
|
||||||
|
printf("A[0] = %f\n", h_A[0]);
|
||||||
|
printf("B[0] = %f\n", h_B[0]);
|
||||||
|
printf("C[0] = %f\n", h_C[0]);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
Now we can compile all the source files in the group and execute the main
|
||||||
|
function with the following command:
|
||||||
|
|
||||||
|
.. code-block:: c++
|
||||||
|
|
||||||
|
%cuda_group_run --group "vector_add"
|
||||||
|
|
||||||
|
Profiling
|
||||||
|
---------
|
||||||
|
|
||||||
|
Another important feature of nvcc4jupyter is its integration with the NVIDIA
|
||||||
|
Nsight Compute profiler, which you need to make sure is installed and its
|
||||||
|
executable can be found in a directory in your PATH environment variable.
|
||||||
|
|
||||||
|
In order to use it and provide the profiler with custom arguments, simply run:
|
||||||
|
|
||||||
|
.. code-block:: c++
|
||||||
|
|
||||||
|
%cuda_group_run --group "vector_add" --profile --profiler-args "--section SpeedOfLight"
|
||||||
|
|
||||||
|
Running the cell above will compile and execute the vector addition code in the
|
||||||
|
"vector_add" group and profile it, keeping only the metrics from the
|
||||||
|
"SpeedOfLight" section. The output will contain something similar to:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
Section: GPU Speed Of Light Throughput
|
||||||
|
----------------------- ------------- ------------
|
||||||
|
Metric Name Metric Unit Metric Value
|
||||||
|
----------------------- ------------- ------------
|
||||||
|
DRAM Frequency cycle/nsecond 4.65
|
||||||
|
SM Frequency cycle/usecond 544.31
|
||||||
|
Elapsed Cycles cycle 2,145
|
||||||
|
Memory Throughput % 3.19
|
||||||
|
DRAM Throughput % 3.19
|
||||||
|
Duration usecond 3.94
|
||||||
|
L1/TEX Cache Throughput % 6.67
|
||||||
|
L2 Cache Throughput % 1.98
|
||||||
|
SM Active Cycles cycle 383.65
|
||||||
|
Compute (SM) Throughput % 1.19
|
||||||
|
----------------------- ------------- ------------
|
||||||
Reference in New Issue
Block a user