Add readthedocs config and sphinx documentation

2026-06-13 18:50:47 +05:30 · 2024-01-12 14:47:30 +01:00
parent 6150ae5713
commit 1ca949d803
8 changed files with 589 additions and 0 deletions
@@ -0,0 +1,32 @@
 # .readthedocs.yaml
 # Read the Docs configuration file
 # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 # Required
 version: 2
 # Set the OS, Python version and other tools you might need
 build:
  os: ubuntu-22.04
  tools:
    python: "3.12"
    # You can also specify other tool versions:
    # nodejs: "19"
    # rust: "1.64"
    # golang: "1.19"
 # Build documentation with Sphinx. The path is relative to the repository
 # root and must point at the conf.py inside the Sphinx source directory
 # (the docs Makefile and make.bat both use SOURCEDIR=source).
 sphinx:
  configuration: docs/source/conf.py
 # Optionally build your docs in additional formats such as PDF and ePub
 # formats:
 #    - pdf
 #    - epub
 # Optional but recommended, declare the Python requirements required
 # to build your documentation
 # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
 python:
   install:
   - requirements: docs/requirements.txt
@@ -0,0 +1,20 @@
 # Minimal makefile for Sphinx documentation
 #
 # You can set these variables from the command line, and also
 # from the environment for the first two.
 # SPHINXOPTS and SPHINXBUILD are assigned with "?=", so a value that is
 # already set is kept; SOURCEDIR and BUILDDIR are fixed relative paths.
 SPHINXOPTS    ?=
 SPHINXBUILD   ?= sphinx-build
 SOURCEDIR     = source
 BUILDDIR      = build
 # Put it first so that "make" without argument is like "make help".
 help:
 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
 # "Makefile" is listed as phony alongside "help"; it is also the
 # prerequisite of the catch-all rule below.
 .PHONY: help Makefile
 # Catch-all target: route all unknown targets to Sphinx using the new
 # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
 # $@ expands to the requested target name (e.g. "html"), which is passed
 # to sphinx-build as the builder for -M.
 %: Makefile
 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
@@ -0,0 +1,35 @@
@ECHO OFF
 REM Run from the directory that contains this script so the relative
 REM SOURCEDIR and BUILDDIR paths below resolve correctly.
 pushd %~dp0
 REM Command file for Sphinx documentation
 REM Fall back to the sphinx-build found on PATH unless the caller set SPHINXBUILD.
 if "%SPHINXBUILD%" == "" (
 	set SPHINXBUILD=sphinx-build
 )
 set SOURCEDIR=source
 set BUILDDIR=build
 REM With no builder argument, show the Sphinx help listing instead.
 if "%1" == "" goto help
 REM Probe the executable; errorlevel 9009 means the command was not found.
 %SPHINXBUILD% >NUL 2>NUL
 if errorlevel 9009 (
 	echo.
 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
 	echo.installed, then set the SPHINXBUILD environment variable to point
 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
 	echo.may add the Sphinx directory to PATH.
 	echo.
 	echo.If you don't have Sphinx installed, grab it from
 	echo.https://www.sphinx-doc.org/
 	REM Undo the pushd above before leaving so the caller's working
 	REM directory is restored on this error path as well.
 	popd
 	exit /b 1
 )
 REM Forward the requested builder and any extra options to sphinx-build.
 %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
 goto end
 :help
 %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
 :end
 popd
@@ -0,0 +1,2 @@
 # Exact version pins for the packages used to build the documentation.
 sphinx==7.1.2
 # NOTE(review): this pins a release candidate ("rc1"); consider moving to
 # the corresponding final release once confirmed compatible.
 sphinx-rtd-theme==1.3.0rc1
@@ -0,0 +1,40 @@
 # Sphinx build settings for the nvcc4jupyter documentation.
 #
 # Every built-in option is described in the Sphinx configuration reference:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 project = "nvcc4jupyter"
 author = "Andrei Nechaev & Cosmin Stefan Ciocan"
 # The copyright notice is the year followed by the author string above.
 copyright = f"2024, {author}"
 # The short version and the full release string are the same value.
 release = "1.0.1"
 version = release
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
 # All enabled extensions ship with Sphinx under the "sphinx.ext" package.
 extensions = [
     f"sphinx.ext.{ext_name}"
     for ext_name in (
         "duration",
         "doctest",
         "autodoc",
         "autosummary",
         "intersphinx",
     )
 ]
 # External documentation sets that cross-references may resolve against.
 intersphinx_mapping = dict(
     python=("https://docs.python.org/3/", None),
     sphinx=("https://www.sphinx-doc.org/en/master/", None),
 )
 intersphinx_disabled_domains = ["std"]
 templates_path = ["_templates"]
 exclude_patterns = list()
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 html_theme = "sphinx_rtd_theme"
 html_static_path = ["_static"]
@@ -0,0 +1,23 @@
 Welcome to nvcc4jupyter's documentation!
 ========================================
 .. note::
   This project is under active development.
 Contents
 --------
 .. toctree::
   :maxdepth: 2
   :caption: Contents:
   usage
   magics
 Indices and tables
 ==================
 * :ref:`genindex`
 * :ref:`modindex`
 * :ref:`search`
@@ -0,0 +1,172 @@
 **********
 Magics API
 **********
 .. note::
   Arguments for profilers and the nvcc compiler can be passed in double
   quotes so they can contain spaces and dashes.
 ------
 .. _cuda_magic:
 cuda
 ====
 Magic command that compiles, runs, and profiles CUDA C++ code in the cell.
 Usage
 -----
   - ``%%cuda``: Compile and run this cell.
   - ``%%cuda -p``: Also runs the Nsight Compute profiler.
   - ``%%cuda -p -a "<SPACE SEPARATED PROFILER ARGS>"``: Also runs the Nsight Compute profiler, passing it the given arguments.
   - ``%%cuda -t``: Outputs the "timeit" built-in magic results.
 Options
 -------
 -t, --timeit
   Boolean. If set, returns the output of the "timeit" built-in
   ipython magic instead of stdout.
 -p, --profile
   Boolean. If set, runs the NVIDIA Nsight Compute profiler whose
   output is appended to standard output.
 -a, --profiler-args
   String. Optional profiler arguments that can be space separated
   by wrapping them in double quotes. See all options here:
   `Nsight Compute CLI <https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html#command-line-options>`_
 .. note::
   If both "\-\-profile" and "\-\-timeit" are used then no profiling is
   done.
 Examples
 --------
 ::
   # compile, run, and profile the code in the cell with the Nsight
   # compute profiler while collecting only metrics from the
   # "MemoryWorkloadAnalysis" section.
   %%cuda --profile --profiler-args "--section MemoryWorkloadAnalysis"
 ------
 .. _cuda_group_save_magic:
 cuda_group_save
 ===============
 Magic command that saves CUDA C++ code in the cell for later
 compilation and execution with possibly more source files.
 Usage
 -----
   - ``%%cuda_group_save -n <FILENAME> -g <GROUPNAME>``: Save the code in the current cell to a group of source files.
 Options
 -------
 -n, --name
   String. Required file name of the saved source file. Must have
   either the ".cu" or ".h" extension. In order to import a header
   file saved with this magic you can simply add '#include "<name>"'.
 -g, --group
   String. Required group name to which to add the saved source file.
   Groups are source files that get compiled together and do not
   interact with other groups. This allows you to have multiple
   unrelated CUDA programs within the same jupyter notebook. Adding
   files to a group named "shared" will make them available to all
   other source file groups. One use case for the shared group is for
   sharing error handling code which should be present in all CUDA
   programs.
 Examples
 --------
 ::
   # jupyter cell 1
   %%cuda_group_save -n "error_handling.h" -g "shared"
   <ERROR HANDLING CODE>
   # jupyter cell 2
   %%cuda_group_save -n "main.cu" -g "example_group"
   #include "error_handling.h"
   <YOUR CODE HERE>
 ------
 .. _cuda_group_run_magic:
 cuda_group_run
 ==============
 Line magic command that compiles, runs, and profiles all source files
 in a group.
 Usage
 -----
   - ``%cuda_group_run -g <GROUPNAME>``: Compiles, runs, and profiles the source files in the given group.
 Options
 -------
 -g, --group
   String. Required group name whose source files should be compiled and run.
 .. note::
   All options from the "%%cuda" cell magic are inherited.
 Examples
 --------
 ::
   # jupyter cell 1
   %%cuda_group_save -n "error_handling.h" -g "shared"
   <ERROR HANDLING CODE>
   # jupyter cell 2
   %%cuda_group_save -n "main.cu" -g "example_group"
   #include "error_handling.h"
   <YOUR CODE HERE>
   # jupyter cell 3
   %cuda_group_run -g "example_group" --profile
 -----
 .. _cuda_group_delete_magic:
 cuda_group_delete
 =================
 Line magic command that deletes all source files in a group.
 Usage
 -----
   - ``%cuda_group_delete -g <GROUPNAME>``: Removes all source files in the given group.
 Options
 -------
 -g, --group
   String. Required group name whose source files should be deleted.
 Examples
 --------
 ::
   # jupyter cell 1
   %%cuda_group_save -n "error_handling.h" -g "shared"
   <ERROR HANDLING CODE>
   # jupyter cell 2 - here we delete the "shared" group; in
   # practice this would be helpful if you want to overwrite some
   # functionality that was defined earlier in the notebook
   %cuda_group_delete -g "shared"
@@ -0,0 +1,265 @@
 Usage
 =====
 This IPython extension allows running CUDA C++ code in a Jupyter notebook. This
 is especially useful when combined with `Google Colab <https://colab.research.google.com/>`_
 which provides CUDA capable GPUs with the CUDA toolkit already installed.
 .. _installation:
 Installation
 ------------
 To use nvcc4jupyter, first install it using pip:
 .. code-block:: console
    (venv) $ pip install nvcc4jupyter
 .. _load_extension:
 Load the Extension
 ------------------
 Now we need to load the IPython extension to be able to use its cell and line
 magic commands:
 .. code-block::
    %load_ext nvcc4jupyter
 Hello World
 -----------
 We will use the :ref:`cuda <cuda_magic>` cell magic command to run a simple
 hello world program.
 .. code-block:: c++
    %%cuda
    #include <stdio.h>
    __global__ void hello(){
        printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
    }
    int main(){
        hello<<<2, 2>>>();
        cudaDeviceSynchronize();
    }
 Groups
 ------
 Now we will demonstrate a more complex scenario that uses source file groups.
 If you want to split your code into multiple source files, either for code reuse
 or just to have an easier to read project, you want to use groups. A group of
 source files will be compiled together. Because of this, you can include headers
 from the same group and use the code defined in other ".cu" files. There is also
 a special group named "shared" whose files will be compiled together with all
 other groups, which is a great feature for error handling code as we'll show now:
 .. code-block:: c++
    %%cuda_group_save --group shared --name "error_handling.h"
    // error checking macro
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(__err), \
                    __FILE__, __LINE__); \
                fprintf(stderr, "*** FAILED - ABORTING\n"); \
                exit(1); \
            } \
        } while (0)
 Now we can use that error handling macro in this vector addition program but
 also in other programs that we define in other Jupyter cells:
 .. code-block:: c++
    %%cuda
    #include <stdio.h>
    #include "error_handling.h"
    const int DSIZE = 4096;
    const int block_size = 256;
    // vector add kernel: C = A + B
    __global__ void vadd(const float *A, const float *B, float *C, int ds){
        int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx < ds) {
            C[idx] = A[idx] + B[idx];
        }
    }
    int main(){
        float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
        // allocate space for vectors in host memory
        h_A = new float[DSIZE];
        h_B = new float[DSIZE];
        h_C = new float[DSIZE];
        // initialize vectors in host memory to random values (except for the
        // result vector whose values do not matter as they will be overwritten)
        for (int i = 0; i < DSIZE; i++) {
            h_A[i] = rand()/(float)RAND_MAX;
            h_B[i] = rand()/(float)RAND_MAX;
        }
        // allocate space for vectors in device memory
        cudaMalloc(&d_A, DSIZE*sizeof(float));
        cudaMalloc(&d_B, DSIZE*sizeof(float));
        cudaMalloc(&d_C, DSIZE*sizeof(float));
        cudaCheckErrors("cudaMalloc failure"); // error checking
        // copy vectors A and B from host to device:
        cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy H2D failure");
        // launch the vector adding kernel
        vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
        cudaCheckErrors("kernel launch failure");
        // wait for the kernel to finish execution
        cudaDeviceSynchronize();
        cudaCheckErrors("kernel execution failure");
        cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
        cudaCheckErrors("cudaMemcpy D2H failure");
        printf("A[0] = %f\n", h_A[0]);
        printf("B[0] = %f\n", h_B[0]);
        printf("C[0] = %f\n", h_C[0]);
        return 0;
    }
 Above we use the :ref:`cuda <cuda_magic>` magic command which saves the code
 in the cell to an anonymous source file group, compiles, and executes that
 code. This only allows us to have one source file (besides the ones in the
 "shared" group). In order to have multiple source files we need to use the
 :ref:`cuda_group_save <cuda_group_save_magic>` and
 :ref:`cuda_group_run <cuda_group_run_magic>` magics.
 First, we save the vector addition function to its own file:
 .. code-block:: c++
    %%cuda_group_save --name "vector_add.cu" --group "vector_add"
    // vector add kernel: C = A + B
    __global__ void vadd(const float *A, const float *B, float *C, int ds){
        int idx = threadIdx.x + blockIdx.x * blockDim.x;
        if (idx < ds) {
            C[idx] = A[idx] + B[idx];
        }
    }
 Now we create a header file so the main cuda file knows the signature of "vadd":
 .. code-block:: c++
    %%cuda_group_save --name "vector_add.h" --group "vector_add"
    __global__ void vadd(const float *A, const float *B, float *C, int ds);
 To tie it all together, we save the main cuda file, which includes our vector
 addition code:
 .. code-block:: c++
    %%cuda_group_save --name "main.cu" --group "vector_add"
    #include <stdio.h>
    #include "error_handling.h"
    #include "vector_add.h"
    const int DSIZE = 4096;
    const int block_size = 256;
    int main(){
        float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
        // allocate space for vectors in host memory
        h_A = new float[DSIZE];
        h_B = new float[DSIZE];
        h_C = new float[DSIZE];
        // initialize vectors in host memory to random values (except for the
        // result vector whose values do not matter as they will be overwritten)
        for (int i = 0; i < DSIZE; i++) {
            h_A[i] = rand()/(float)RAND_MAX;
            h_B[i] = rand()/(float)RAND_MAX;
        }
        // allocate space for vectors in device memory
        cudaMalloc(&d_A, DSIZE*sizeof(float));
        cudaMalloc(&d_B, DSIZE*sizeof(float));
        cudaMalloc(&d_C, DSIZE*sizeof(float));
        cudaCheckErrors("cudaMalloc failure"); // error checking
        // copy vectors A and B from host to device:
        cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
        cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy H2D failure");
        // launch the vector adding kernel
        vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
        cudaCheckErrors("kernel launch failure");
        // wait for the kernel to finish execution
        cudaDeviceSynchronize();
        cudaCheckErrors("kernel execution failure");
        cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
        cudaCheckErrors("cudaMemcpy D2H failure");
        printf("A[0] = %f\n", h_A[0]);
        printf("B[0] = %f\n", h_B[0]);
        printf("C[0] = %f\n", h_C[0]);
        return 0;
    }
 Now we can compile all the source files in the group and execute the main
 function with the following command:
 .. code-block:: c++
    %cuda_group_run --group "vector_add"
 Profiling
 ---------
 Another important feature of nvcc4jupyter is its integration with the NVIDIA
 Nsight Compute profiler, which you need to make sure is installed and its
 executable can be found in a directory in your PATH environment variable.
 In order to use it and provide the profiler with custom arguments, simply run:
 .. code-block:: c++
    %cuda_group_run --group "vector_add" --profile --profiler-args "--section SpeedOfLight"
 Running the cell above will compile and execute the vector addition code in the
 "vector_add" group and profile it, keeping only the metrics from the
 "SpeedOfLight" section. The output will contain something similar to:
 .. code-block::
    Section: GPU Speed Of Light Throughput
    ----------------------- ------------- ------------
    Metric Name               Metric Unit Metric Value
    ----------------------- ------------- ------------
    DRAM Frequency          cycle/nsecond         4.65
    SM Frequency            cycle/usecond       544.31
    Elapsed Cycles                  cycle        2,145
    Memory Throughput                   %         3.19
    DRAM Throughput                     %         3.19
    Duration                      usecond         3.94
    L1/TEX Cache Throughput             %         6.67
    L2 Cache Throughput                 %         1.98
    SM Active Cycles                cycle       383.65
    Compute (SM) Throughput             %         1.19
    ----------------------- ------------- ------------
		`@@ -0,0 +1,2 @@`
							`sphinx==7.1.2`
							`sphinx-rtd-theme==1.3.0rc1`