Add readthedocs config and sphinx documentation

This commit is contained in:
Cosmin Ciocan
2024-01-12 14:47:30 +01:00
parent 6150ae5713
commit 1ca949d803
8 changed files with 589 additions and 0 deletions
+20
View File
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+35
View File
@@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd
+2
View File
@@ -0,0 +1,2 @@
sphinx==7.1.2
sphinx-rtd-theme==1.3.0rc1
+40
View File
@@ -0,0 +1,40 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
project = "nvcc4jupyter"
copyright = "2024, Andrei Nechaev & Cosmin Stefan Ciocan"
author = "Andrei Nechaev & Cosmin Stefan Ciocan"
release = "1.0.1"
version = "1.0.1"
# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
extensions = [
"sphinx.ext.duration",
"sphinx.ext.doctest",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.intersphinx",
]
intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"sphinx": ("https://www.sphinx-doc.org/en/master/", None),
}
intersphinx_disabled_domains = ["std"]
templates_path = ["_templates"]
exclude_patterns = []
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
html_theme = "sphinx_rtd_theme"
html_static_path = ["_static"]
+23
View File
@@ -0,0 +1,23 @@
Welcome to nvcc4jupyter's documentation!
========================================
.. note::
This project is under active development.
Contents
--------
.. toctree::
:maxdepth: 2
:caption: Contents:
usage
magics
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
+172
View File
@@ -0,0 +1,172 @@
**********
Magics API
**********
.. note::
Arguments for profilers and the nvcc compiler can be passed in double
quotes so they can contain spaces and dashes.
------
.. _cuda_magic:
cuda
====
Magic command that compiles, runs, and profiles CUDA C++ code in the cell.
Usage
-----
- ``%%cuda``: Compile and run this cell.
- ``%%cuda -p``: Also runs the Nsight Compute profiler.
- ``%%cuda -p -a "<SPACE SEPARATED PROFILER ARGS>"``: Also runs the Nsight Compute profiler.
- ``%%cuda -t``: Outputs the "timeit" built-in magic results.
Options
-------
-t, --timeit
Boolean. If set, returns the output of the "timeit" built-in
ipython magic instead of stdout.
-p, --profile
Boolean. If set, runs the NVIDIA Nsight Compute profiler whose
output is appended to standard output.
-a, --profiler-args
String. Optional profiler arguments that can be space separated
by wrapping them in double quotes. See all options here:
`Nsight Compute CLI <https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html#command-line-options>`_
.. note::
If both "\-\-profile" and "\-\-timeit" are used then no profiling is
done.
Examples
--------
::
# compile, run, and profile the code in the cell with the Nsight
# compute profiler while collecting only metrics from the
# "MemoryWorkloadAnalysis" section.
%%cuda --profile --profiler-args "--section MemoryWorkloadAnalysis"
------
.. _cuda_group_save_magic:
cuda_group_save
===============
Magic command that saves CUDA C++ code in the cell for later
compilation and execution with possibly more source files.
Usage
-----
- ``%%cuda_group_save -n <FILENAME> -g <GROUPNAME>``: Save the code in the current cell to a group of source files.
Options
-------
-n, --name
String. Required file name of the saved source file. Must have
either the ".cu" or ".h" extension. In order to import a header
file saved with this magic you can simply add '#include "<name>"'.
-g, --group
String. Required group name to which to add the saved source file.
Groups are source files that get compiled together and do not
interact with other groups. This allows you to have multiple
unrelated CUDA programs within the same jupyter notebook. Adding
files to a group named "shared" will make them available to all
other source file groups. One use case for the shared group is for
sharing error handling code which should be present in all CUDA
programs.
Examples
--------
::
# jupyter cell 1
%%cuda_group_save -n "error_handling.h" -g "shared"
<ERROR HANDLING CODE>
# jupyter cell 2
%%cuda_group_save -n "main.cu" -g "example_group"
#include "error_handling.h"
<YOUR CODE HERE>
------
.. _cuda_group_run_magic:
cuda_group_run
==============
Line magic command that compiles, runs, and profiles all source files
in a group.
Usage
-----
- ``%%cuda_group_run -g <GROUPNAME>``: Compiles, runs, and profiles the sources files in the given group.
Options
-------
-g, --group
String. Required group name whose source files should be deleted.
.. note::
All options from the "%%cuda" cell magic are inherited.
Examples
--------
::
# jupyter cell 1
%%cuda_group_save -n "error_handling.h" -g "shared"
<ERROR HANDLING CODE>
# jupyter cell 2
%%cuda_group_save -n "main.cu" -g "example_group"
#include "error_handling.h"
<YOUR CODE HERE>
# jupyter cell 3
%cuda_group_run -g "example_group" --profile
-----
.. _cuda_group_delete_magic:
cuda_group_delete
=================
Line magic command that deletes all source files in a group.
Usage
-----
- ``%%cuda_group_delete -g <GROUPNAME>``: Removes all source files in the given group.
Options
-------
-g, --group
String. Required group name whose source files should be deleted.
Examples
--------
::
# jupyter cell 1
%%cuda_group_save -n "error_handling.h" -g "shared"
<ERROR HANDLING CODE>
# jupyter cell 2 - here we delete the error shared group; in
# practice this would be helpful if you want to overwrite some
# functionality that was defined earlier in the notebook
%cuda_group_delete -g "shared"
+265
View File
@@ -0,0 +1,265 @@
Usage
=====
This IPython extension allows running CUDA C++ code in Jupyter notebook. This
is especially useful when combined with `Google Colab <https://colab.research.google.com/>`_
which provides CUDA capable GPUs with the CUDA toolkit already installed.
.. _installation:
Installation
------------
To use nvcc4jupyter, first install it using pip:
.. code-block:: console
(venv) $ pip install nvcc4jupyter
.. _load_extension:
Load the Extension
------------------
Now we need to load the IPython extension to be able to use its cell and line
magic commands:
.. code-block::
%load_ext nvcc4jupyter
Hello World
-----------
We will use the :ref:`cuda <cuda_magic>` cell magic command to run a simple
hello world program.
.. code-block:: c++
%%cuda
#include <stdio.h>
__global__ void hello(){
printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
}
int main(){
hello<<<2, 2>>>();
cudaDeviceSynchronize();
}
Groups
------
Now we will demonstrate a more complex scenario that uses source file groups.
If you want to split your code into multiple source files, either for code reuse
or just to have an easier to read project, you want to use groups. A group of
source files will be compiled together. Because of this, you can include headers
from the same group and use the code defined in other ".cu" files. There is also
a special group named "shared" whose files will be compiled together with all
other groups, which is a great feature for error handling code as we'll show now:
.. code-block:: c++
%%cuda_group_save --group shared --name "error_handling.h"
// error checking macro
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
Now we can use that error handling macro in this vector addition program but
also in other programs that we define in other Jupyter cells:
.. code-block:: c++
%%cuda
#include <stdio.h>
#include "error_handling.h"
const int DSIZE = 4096;
const int block_size = 256;
// vector add kernel: C = A + B
__global__ void vadd(const float *A, const float *B, float *C, int ds){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < ds) {
C[idx] = A[idx] + B[idx];
}
}
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
// allocate space for vectors in host memory
h_A = new float[DSIZE];
h_B = new float[DSIZE];
h_C = new float[DSIZE];
// initialize vectors in host memory to random values (except for the
// result vector whose values do not matter as they will be overwritten)
for (int i = 0; i < DSIZE; i++) {
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
}
// allocate space for vectors in device memory
cudaMalloc(&d_A, DSIZE*sizeof(float));
cudaMalloc(&d_B, DSIZE*sizeof(float));
cudaMalloc(&d_C, DSIZE*sizeof(float));
cudaCheckErrors("cudaMalloc failure"); // error checking
// copy vectors A and B from host to device:
cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy H2D failure");
// launch the vector adding kernel
vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
cudaCheckErrors("kernel launch failure");
// wait for the kernel to finish execution
cudaDeviceSynchronize();
cudaCheckErrors("kernel execution failure");
cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy D2H failure");
printf("A[0] = %f\n", h_A[0]);
printf("B[0] = %f\n", h_B[0]);
printf("C[0] = %f\n", h_C[0]);
return 0;
}
Above we use the :ref:`cuda <cuda_magic>` magic command which saves the code
in the cell to an anonymous source file group, compiles, and executes that
code. This only allows us to have one source file (besides the ones in the
"shared" group). In order to have multiple source files we need to use the
:ref:`cuda_group_save <cuda_group_save_magic>` and
:ref:`cuda_group_run <cuda_group_run_magic>` magics.
First, we save the vector addition function to its own file:
.. code-block:: c++
%%cuda_group_save --name "vector_add.cu" --group "vector_add"
// vector add kernel: C = A + B
__global__ void vadd(const float *A, const float *B, float *C, int ds){
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < ds) {
C[idx] = A[idx] + B[idx];
}
}
Now we create a header file so the main cuda file knows the signature of "vadd":
.. code-block:: c++
%%cuda_group_save --name "vector_add.h" --group "vector_add"
__global__ void vadd(const float *A, const float *B, float *C, int ds);
To tie it all together, we save the main cuda file, which includes our vector
addition code:
.. code-block:: c++
%%cuda_group_save --name "main.cu" --group "vector_add"
#include <stdio.h>
#include "error_handling.h"
#include "vector_add.h"
const int DSIZE = 4096;
const int block_size = 256;
int main(){
float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
// allocate space for vectors in host memory
h_A = new float[DSIZE];
h_B = new float[DSIZE];
h_C = new float[DSIZE];
// initialize vectors in host memory to random values (except for the
// result vector whose values do not matter as they will be overwritten)
for (int i = 0; i < DSIZE; i++) {
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
}
// allocate space for vectors in device memory
cudaMalloc(&d_A, DSIZE*sizeof(float));
cudaMalloc(&d_B, DSIZE*sizeof(float));
cudaMalloc(&d_C, DSIZE*sizeof(float));
cudaCheckErrors("cudaMalloc failure"); // error checking
// copy vectors A and B from host to device:
cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy H2D failure");
// launch the vector adding kernel
vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
cudaCheckErrors("kernel launch failure");
// wait for the kernel to finish execution
cudaDeviceSynchronize();
cudaCheckErrors("kernel execution failure");
cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy D2H failure");
printf("A[0] = %f\n", h_A[0]);
printf("B[0] = %f\n", h_B[0]);
printf("C[0] = %f\n", h_C[0]);
return 0;
}
Now we can compile all the source files in the group and execute the main
function with the following command:
.. code-block:: c++
%cuda_group_run --group "vector_add"
Profiling
---------
Another important feature of nvcc4jupyter is its integration with the NVIDIA
Nsight Compute profiler, which you need to make sure is installed and its
executable can be found in a directory in your PATH environment variable.
In order to use it and provide the profiler with custom arguments, simply run:
.. code-block:: c++
%cuda_group_run --group "vector_add" --profile --profiler-args "--section SpeedOfLight"
Running the cell above will compile and execute the vector addition code in the
"vector_add" group and profile it, keeping only the metrics from the
"SpeedOfLight" section. The output will contain something similar to:
.. code-block::
Section: GPU Speed Of Light Throughput
----------------------- ------------- ------------
Metric Name Metric Unit Metric Value
----------------------- ------------- ------------
DRAM Frequency cycle/nsecond 4.65
SM Frequency cycle/usecond 544.31
Elapsed Cycles cycle 2,145
Memory Throughput % 3.19
DRAM Throughput % 3.19
Duration usecond 3.94
L1/TEX Cache Throughput % 6.67
L2 Cache Throughput % 1.98
SM Active Cycles cycle 383.65
Compute (SM) Throughput % 1.19
----------------------- ------------- ------------