diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..7dc180b --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,32 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the OS, Python version and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.12" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the "docs/" directory with Sphinx +sphinx: + configuration: docs/conf.py + +# Optionally build your docs in additional formats such as PDF and ePub +# formats: +# - pdf +# - epub + +# Optional but recommended, declare the Python requirements required +# to build your documentation +# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..269cadc --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..fa98a78 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..1d628f2 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +sphinx==7.1.2 +sphinx-rtd-theme==1.3.0rc1 \ No newline at end of file diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..665059c --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,40 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "nvcc4jupyter" +copyright = "2024, Andrei Nechaev & Cosmin Stefan Ciocan" +author = "Andrei Nechaev & Cosmin Stefan Ciocan" +release = "1.0.1" +version = "1.0.1" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx.ext.duration", + "sphinx.ext.doctest", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", +] + +intersphinx_mapping = { + "python": ("https://docs.python.org/3/", None), + "sphinx": ("https://www.sphinx-doc.org/en/master/", None), +} +intersphinx_disabled_domains = ["std"] + +templates_path = ["_templates"] +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "sphinx_rtd_theme" +html_static_path = ["_static"] diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..b472c1d --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,23 @@ +Welcome to nvcc4jupyter's documentation! +======================================== + +.. note:: + + This project is under active development. + +Contents +-------- + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + usage + magics + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/magics.rst b/docs/source/magics.rst new file mode 100644 index 0000000..2073f35 --- /dev/null +++ b/docs/source/magics.rst @@ -0,0 +1,172 @@ +********** +Magics API +********** + +.. note:: + Arguments for profilers and the nvcc compiler can be passed in double + quotes so they can contain spaces and dashes. + +------ + +.. _cuda_magic: + +cuda +==== + +Magic command that compiles, runs, and profiles CUDA C++ code in the cell. + +Usage +----- + + - ``%%cuda``: Compile and run this cell. + - ``%%cuda -p``: Also runs the Nsight Compute profiler. + - ``%%cuda -p -a ""``: Also runs the Nsight Compute profiler. + - ``%%cuda -t``: Outputs the "timeit" built-in magic results. + +Options +------- + +-t, --timeit + Boolean. If set, returns the output of the "timeit" built-in + ipython magic instead of stdout. + +-p, --profile + Boolean. If set, runs the NVIDIA Nsight Compute profiler whose + output is appended to standard output. + +-a, --profiler-args + String. Optional profiler arguments that can be space separated + by wrapping them in double quotes. See all options here: + `Nsight Compute CLI `_ + +.. note:: + If both "\-\-profile" and "\-\-timeit" are used then no profiling is + done. + +Examples +-------- +:: + + # compile, run, and profile the code in the cell with the Nsight + # compute profiler while collecting only metrics from the + # "MemoryWorkloadAnalysis" section. + %%cuda --profile --profiler-args "--section MemoryWorkloadAnalysis" + +------ + +.. _cuda_group_save_magic: + +cuda_group_save +=============== + +Magic command that saves CUDA C++ code in the cell for later +compilation and execution with possibly more source files. + +Usage +----- + + - ``%%cuda_group_save -n -g ``: Save the code in the current cell to a group of source files. + +Options +------- + +-n, --name + String. Required file name of the saved source file. Must have + either the ".cu" or ".h" extension. In order to import a header + file saved with this magic you can simply add '#include ""'. + +-g, --group + String. Required group name to which to add the saved source file. + Groups are source files that get compiled together and do not + interact with other groups. This allows you to have multiple + unrelated CUDA programs within the same jupyter notebook. Adding + files to a group named "shared" will make them available to all + other source file groups. One use case for the shared group is for + sharing error handling code which should be present in all CUDA + programs. + +Examples +-------- +:: + + # jupyter cell 1 + %%cuda_group_save -n "error_handling.h" -g "shared" + + + # jupyter cell 2 + %%cuda_group_save -n "main.cu" -g "example_group" + #include "error_handling.h" + + +------ + +.. _cuda_group_run_magic: + +cuda_group_run +============== + +Line magic command that compiles, runs, and profiles all source files +in a group. + +Usage +----- + + - ``%%cuda_group_run -g ``: Compiles, runs, and profiles the sources files in the given group. + +Options +------- + +-g, --group + String. Required group name whose source files should be deleted. + +.. note:: + All options from the "%%cuda" cell magic are inherited. + +Examples +-------- +:: + + # jupyter cell 1 + %%cuda_group_save -n "error_handling.h" -g "shared" + + + # jupyter cell 2 + %%cuda_group_save -n "main.cu" -g "example_group" + #include "error_handling.h" + + + # jupyter cell 3 + %cuda_group_run -g "example_group" --profile + +----- + +.. _cuda_group_delete_magic: + +cuda_group_delete +================= + +Line magic command that deletes all source files in a group. + +Usage +----- + + - ``%%cuda_group_delete -g ``: Removes all source files in the given group. + +Options +------- + +-g, --group + String. Required group name whose source files should be deleted. + +Examples +-------- +:: + + # jupyter cell 1 + %%cuda_group_save -n "error_handling.h" -g "shared" + + + # jupyter cell 2 - here we delete the error shared group; in + # practice this would be helpful if you want to overwrite some + # functionality that was defined earlier in the notebook + %cuda_group_delete -g "shared" diff --git a/docs/source/usage.rst b/docs/source/usage.rst new file mode 100644 index 0000000..885e419 --- /dev/null +++ b/docs/source/usage.rst @@ -0,0 +1,265 @@ +Usage +===== + +This IPython extension allows running CUDA C++ code in Jupyter notebook. This +is especially useful when combined with `Google Colab `_ +which provides CUDA capable GPUs with the CUDA toolkit already installed. + +.. _installation: + +Installation +------------ + +To use nvcc4jupyter, first install it using pip: + +.. code-block:: console + + (venv) $ pip install nvcc4jupyter + +.. _load_extension: + +Load the Extension +------------------ + +Now we need to load the IPython extension to be able to use its cell and line +magic commands: + +.. code-block:: + + %load_ext nvcc4jupyter + +Hello World +----------- + +We will use the :ref:`cuda ` cell magic command to run a simple +hello world program. + +.. code-block:: c++ + + %%cuda + #include + + __global__ void hello(){ + printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x); + } + + int main(){ + hello<<<2, 2>>>(); + cudaDeviceSynchronize(); + } + +Groups +------ + +Now we will demonstrate a more complex scenario that uses source file groups. +If you want to split your code into multiple source files, either for code reuse +or just to have an easier to read project, you want to use groups. A group of +source files will be compiled together. Because of this, you can include headers +from the same group and use the code defined in other ".cu" files. There is also +a special group named "shared" whose files will be compiled together with all +other groups, which is a great feature for error handling code as we'll show now: + +.. code-block:: c++ + + %%cuda_group_save --group shared --name "error_handling.h" + // error checking macro + #define cudaCheckErrors(msg) \ + do { \ + cudaError_t __err = cudaGetLastError(); \ + if (__err != cudaSuccess) { \ + fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ + msg, cudaGetErrorString(__err), \ + __FILE__, __LINE__); \ + fprintf(stderr, "*** FAILED - ABORTING\n"); \ + exit(1); \ + } \ + } while (0) + +Now we can use that error handling macro in this vector addition program but +also in other programs that we define in other Jupyter cells: + +.. code-block:: c++ + + %%cuda + #include + #include "error_handling.h" + + const int DSIZE = 4096; + const int block_size = 256; + + // vector add kernel: C = A + B + __global__ void vadd(const float *A, const float *B, float *C, int ds){ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < ds) { + C[idx] = A[idx] + B[idx]; + } + } + + int main(){ + float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C; + + // allocate space for vectors in host memory + h_A = new float[DSIZE]; + h_B = new float[DSIZE]; + h_C = new float[DSIZE]; + + // initialize vectors in host memory to random values (except for the + // result vector whose values do not matter as they will be overwritten) + for (int i = 0; i < DSIZE; i++) { + h_A[i] = rand()/(float)RAND_MAX; + h_B[i] = rand()/(float)RAND_MAX; + } + + // allocate space for vectors in device memory + cudaMalloc(&d_A, DSIZE*sizeof(float)); + cudaMalloc(&d_B, DSIZE*sizeof(float)); + cudaMalloc(&d_C, DSIZE*sizeof(float)); + cudaCheckErrors("cudaMalloc failure"); // error checking + + // copy vectors A and B from host to device: + cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice); + cudaCheckErrors("cudaMemcpy H2D failure"); + + // launch the vector adding kernel + vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE); + cudaCheckErrors("kernel launch failure"); + + // wait for the kernel to finish execution + cudaDeviceSynchronize(); + cudaCheckErrors("kernel execution failure"); + + cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrors("cudaMemcpy D2H failure"); + + printf("A[0] = %f\n", h_A[0]); + printf("B[0] = %f\n", h_B[0]); + printf("C[0] = %f\n", h_C[0]); + return 0; + } + +Above we use the :ref:`cuda ` magic command which saves the code +in the cell to an anonymous source file group, compiles, and executes that +code. This only allows us to have one source file (besides the ones in the +"shared" group). In order to have multiple source files we need to use the +:ref:`cuda_group_save ` and +:ref:`cuda_group_run ` magics. + +First, we save the vector addition function to its own file: + + +.. code-block:: c++ + + %%cuda_group_save --name "vector_add.cu" --group "vector_add" + // vector add kernel: C = A + B + __global__ void vadd(const float *A, const float *B, float *C, int ds){ + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < ds) { + C[idx] = A[idx] + B[idx]; + } + } + +Now we create a header file so the main cuda file knows the signature of "vadd": + +.. code-block:: c++ + + %%cuda_group_save --name "vector_add.h" --group "vector_add" + __global__ void vadd(const float *A, const float *B, float *C, int ds); + +To tie it all together, we save the main cuda file, which includes our vector +addition code: + +.. code-block:: c++ + + %%cuda_group_save --name "main.cu" --group "vector_add" + #include + #include "error_handling.h" + #include "vector_add.h" + + const int DSIZE = 4096; + const int block_size = 256; + + int main(){ + float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C; + + // allocate space for vectors in host memory + h_A = new float[DSIZE]; + h_B = new float[DSIZE]; + h_C = new float[DSIZE]; + + // initialize vectors in host memory to random values (except for the + // result vector whose values do not matter as they will be overwritten) + for (int i = 0; i < DSIZE; i++) { + h_A[i] = rand()/(float)RAND_MAX; + h_B[i] = rand()/(float)RAND_MAX; + } + + // allocate space for vectors in device memory + cudaMalloc(&d_A, DSIZE*sizeof(float)); + cudaMalloc(&d_B, DSIZE*sizeof(float)); + cudaMalloc(&d_C, DSIZE*sizeof(float)); + cudaCheckErrors("cudaMalloc failure"); // error checking + + // copy vectors A and B from host to device: + cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice); + cudaCheckErrors("cudaMemcpy H2D failure"); + + // launch the vector adding kernel + vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE); + cudaCheckErrors("kernel launch failure"); + + // wait for the kernel to finish execution + cudaDeviceSynchronize(); + cudaCheckErrors("kernel execution failure"); + + cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost); + cudaCheckErrors("cudaMemcpy D2H failure"); + + printf("A[0] = %f\n", h_A[0]); + printf("B[0] = %f\n", h_B[0]); + printf("C[0] = %f\n", h_C[0]); + return 0; + } + +Now we can compile all the source files in the group and execute the main +function with the following command: + +.. code-block:: c++ + + %cuda_group_run --group "vector_add" + +Profiling +--------- + +Another important feature of nvcc4jupyter is its integration with the NVIDIA +Nsight Compute profiler, which you need to make sure is installed and its +executable can be found in a directory in your PATH environment variable. + +In order to use it and provide the profiler with custom arguments, simply run: + +.. code-block:: c++ + + %cuda_group_run --group "vector_add" --profile --profiler-args "--section SpeedOfLight" + +Running the cell above will compile and execute the vector addition code in the +"vector_add" group and profile it, keeping only the metrics from the +"SpeedOfLight" section. The output will contain something similar to: + +.. code-block:: + + Section: GPU Speed Of Light Throughput + ----------------------- ------------- ------------ + Metric Name Metric Unit Metric Value + ----------------------- ------------- ------------ + DRAM Frequency cycle/nsecond 4.65 + SM Frequency cycle/usecond 544.31 + Elapsed Cycles cycle 2,145 + Memory Throughput % 3.19 + DRAM Throughput % 3.19 + Duration usecond 3.94 + L1/TEX Cache Throughput % 6.67 + L2 Cache Throughput % 1.98 + SM Active Cycles cycle 383.65 + Compute (SM) Throughput % 1.19 + ----------------------- ------------- ------------