diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..7dc180b
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,32 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.12"
+    # You can also specify other tool versions:
+    # nodejs: "19"
+    # rust: "1.64"
+    # golang: "1.19"
+
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  configuration: docs/conf.py
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+#    - pdf
+#    - epub
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+   install:
+   - requirements: docs/requirements.txt
\ No newline at end of file
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..269cadc
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..fa98a78
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..1d628f2
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,2 @@
+sphinx==7.1.2
+sphinx-rtd-theme==1.3.0rc1
\ No newline at end of file
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..665059c
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,40 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+project = "nvcc4jupyter"
+copyright = "2024, Andrei Nechaev & Cosmin Stefan Ciocan"
+author = "Andrei Nechaev & Cosmin Stefan Ciocan"
+release = "1.0.1"
+version = "1.0.1"
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    "sphinx.ext.duration",
+    "sphinx.ext.doctest",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.intersphinx",
+]
+
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3/", None),
+    "sphinx": ("https://www.sphinx-doc.org/en/master/", None),
+}
+intersphinx_disabled_domains = ["std"]
+
+templates_path = ["_templates"]
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = "sphinx_rtd_theme"
+html_static_path = ["_static"]
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..b472c1d
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,23 @@
+Welcome to nvcc4jupyter's documentation!
+========================================
+
+.. note::
+
+   This project is under active development.
+
+Contents
+--------
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   usage
+   magics
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/docs/source/magics.rst b/docs/source/magics.rst
new file mode 100644
index 0000000..2073f35
--- /dev/null
+++ b/docs/source/magics.rst
@@ -0,0 +1,172 @@
+**********
+Magics API
+**********
+
+.. note::
+   Arguments for profilers and the nvcc compiler can be passed in double
+   quotes so they can contain spaces and dashes.
+
+------
+
+.. _cuda_magic:
+
+cuda
+====
+
+Magic command that compiles, runs, and profiles CUDA C++ code in the cell.
+
+Usage
+-----
+
+   - ``%%cuda``: Compile and run this cell.
+   - ``%%cuda -p``: Also runs the Nsight Compute profiler.
+   - ``%%cuda -p -a "<SPACE SEPARATED PROFILER ARGS>"``: Also runs the Nsight Compute profiler.
+   - ``%%cuda -t``: Outputs the "timeit" built-in magic results.
+
+Options
+-------
+
+-t, --timeit
+   Boolean. If set, returns the output of the "timeit" built-in
+   ipython magic instead of stdout.
+
+-p, --profile
+   Boolean. If set, runs the NVIDIA Nsight Compute profiler whose
+   output is appended to standard output.
+
+-a, --profiler-args
+   String. Optional profiler arguments that can be space separated
+   by wrapping them in double quotes. See all options here:
+   `Nsight Compute CLI <https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html#command-line-options>`_
+
+.. note::
+   If both "\-\-profile" and "\-\-timeit" are used then no profiling is
+   done.
+
+Examples
+--------
+::
+
+   # compile, run, and profile the code in the cell with the Nsight
+   # compute profiler while collecting only metrics from the
+   # "MemoryWorkloadAnalysis" section.
+   %%cuda --profile --profiler-args "--section MemoryWorkloadAnalysis"
+
+------
+
+.. _cuda_group_save_magic:
+
+cuda_group_save
+===============
+
+Magic command that saves CUDA C++ code in the cell for later
+compilation and execution with possibly more source files.
+
+Usage
+-----
+
+   - ``%%cuda_group_save -n <FILENAME> -g <GROUPNAME>``: Save the code in the current cell to a group of source files.
+
+Options
+-------
+
+-n, --name
+   String. Required file name of the saved source file. Must have
+   either the ".cu" or ".h" extension. In order to import a header
+   file saved with this magic you can simply add '#include "<name>"'.
+
+-g, --group
+   String. Required group name to which to add the saved source file.
+   Groups are source files that get compiled together and do not
+   interact with other groups. This allows you to have multiple
+   unrelated CUDA programs within the same jupyter notebook. Adding
+   files to a group named "shared" will make them available to all
+   other source file groups. One use case for the shared group is for
+   sharing error handling code which should be present in all CUDA
+   programs.
+
+Examples
+--------
+::
+
+   # jupyter cell 1
+   %%cuda_group_save -n "error_handling.h" -g "shared"
+   <ERROR HANDLING CODE>
+
+   # jupyter cell 2
+   %%cuda_group_save -n "main.cu" -g "example_group"
+   #include "error_handling.h"
+   <YOUR CODE HERE>
+
+------
+
+.. _cuda_group_run_magic:
+
+cuda_group_run
+==============
+
+Line magic command that compiles, runs, and profiles all source files
+in a group.
+
+Usage
+-----
+
+   - ``%%cuda_group_run -g <GROUPNAME>``: Compiles, runs, and profiles the sources files in the given group.
+
+Options
+-------
+
+-g, --group
+   String. Required group name whose source files should be deleted.
+
+.. note::
+   All options from the "%%cuda" cell magic are inherited.
+
+Examples
+--------
+::
+
+   # jupyter cell 1
+   %%cuda_group_save -n "error_handling.h" -g "shared"
+   <ERROR HANDLING CODE>
+
+   # jupyter cell 2
+   %%cuda_group_save -n "main.cu" -g "example_group"
+   #include "error_handling.h"
+   <YOUR CODE HERE>
+
+   # jupyter cell 3
+   %cuda_group_run -g "example_group" --profile
+
+-----
+
+.. _cuda_group_delete_magic:
+
+cuda_group_delete
+=================
+
+Line magic command that deletes all source files in a group.
+
+Usage
+-----
+
+   - ``%%cuda_group_delete -g <GROUPNAME>``: Removes all source files in the given group.
+
+Options
+-------
+
+-g, --group
+   String. Required group name whose source files should be deleted.
+
+Examples
+--------
+::
+
+   # jupyter cell 1
+   %%cuda_group_save -n "error_handling.h" -g "shared"
+   <ERROR HANDLING CODE>
+
+   # jupyter cell 2 - here we delete the error shared group; in
+   # practice this would be helpful if you want to overwrite some
+   # functionality that was defined earlier in the notebook
+   %cuda_group_delete -g "shared"
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
new file mode 100644
index 0000000..885e419
--- /dev/null
+++ b/docs/source/usage.rst
@@ -0,0 +1,265 @@
+Usage
+=====
+
+This IPython extension allows running CUDA C++ code in Jupyter notebook. This
+is especially useful when combined with `Google Colab <https://colab.research.google.com/>`_
+which provides CUDA capable GPUs with the CUDA toolkit already installed.
+
+.. _installation:
+
+Installation
+------------
+
+To use nvcc4jupyter, first install it using pip:
+
+.. code-block:: console
+
+    (venv) $ pip install nvcc4jupyter
+
+.. _load_extension:
+
+Load the Extension
+------------------
+
+Now we need to load the IPython extension to be able to use its cell and line
+magic commands:
+
+.. code-block::
+
+    %load_ext nvcc4jupyter
+
+Hello World
+-----------
+
+We will use the :ref:`cuda <cuda_magic>` cell magic command to run a simple
+hello world program.
+
+.. code-block:: c++
+
+    %%cuda
+    #include <stdio.h>
+
+    __global__ void hello(){
+        printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
+    }
+
+    int main(){
+        hello<<<2, 2>>>();
+        cudaDeviceSynchronize();
+    }
+
+Groups
+------
+
+Now we will demonstrate a more complex scenario that uses source file groups.
+If you want to split your code into multiple source files, either for code reuse
+or just to have an easier to read project, you want to use groups. A group of
+source files will be compiled together. Because of this, you can include headers
+from the same group and use the code defined in other ".cu" files. There is also
+a special group named "shared" whose files will be compiled together with all
+other groups, which is a great feature for error handling code as we'll show now:
+
+.. code-block:: c++
+
+    %%cuda_group_save --group shared --name "error_handling.h"
+    // error checking macro
+    #define cudaCheckErrors(msg) \
+        do { \
+            cudaError_t __err = cudaGetLastError(); \
+            if (__err != cudaSuccess) { \
+                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
+                    msg, cudaGetErrorString(__err), \
+                    __FILE__, __LINE__); \
+                fprintf(stderr, "*** FAILED - ABORTING\n"); \
+                exit(1); \
+            } \
+        } while (0)
+
+Now we can use that error handling macro in this vector addition program but
+also in other programs that we define in other Jupyter cells:
+
+.. code-block:: c++
+
+    %%cuda
+    #include <stdio.h>
+    #include "error_handling.h"
+
+    const int DSIZE = 4096;
+    const int block_size = 256;
+
+    // vector add kernel: C = A + B
+    __global__ void vadd(const float *A, const float *B, float *C, int ds){
+        int idx = threadIdx.x + blockIdx.x * blockDim.x;
+        if (idx < ds) {
+            C[idx] = A[idx] + B[idx];
+        }
+    }
+
+    int main(){
+        float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
+
+        // allocate space for vectors in host memory
+        h_A = new float[DSIZE];
+        h_B = new float[DSIZE];
+        h_C = new float[DSIZE];
+
+        // initialize vectors in host memory to random values (except for the
+        // result vector whose values do not matter as they will be overwritten)
+        for (int i = 0; i < DSIZE; i++) {
+            h_A[i] = rand()/(float)RAND_MAX;
+            h_B[i] = rand()/(float)RAND_MAX;
+        }
+
+        // allocate space for vectors in device memory
+        cudaMalloc(&d_A, DSIZE*sizeof(float));
+        cudaMalloc(&d_B, DSIZE*sizeof(float));
+        cudaMalloc(&d_C, DSIZE*sizeof(float));
+        cudaCheckErrors("cudaMalloc failure"); // error checking
+
+        // copy vectors A and B from host to device:
+        cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
+        cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
+        cudaCheckErrors("cudaMemcpy H2D failure");
+
+        // launch the vector adding kernel
+        vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
+        cudaCheckErrors("kernel launch failure");
+
+        // wait for the kernel to finish execution
+        cudaDeviceSynchronize();
+        cudaCheckErrors("kernel execution failure");
+
+        cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
+        cudaCheckErrors("cudaMemcpy D2H failure");
+
+        printf("A[0] = %f\n", h_A[0]);
+        printf("B[0] = %f\n", h_B[0]);
+        printf("C[0] = %f\n", h_C[0]);
+        return 0;
+    }
+
+Above we use the :ref:`cuda <cuda_magic>` magic command which saves the code
+in the cell to an anonymous source file group, compiles, and executes that
+code. This only allows us to have one source file (besides the ones in the
+"shared" group). In order to have multiple source files we need to use the
+:ref:`cuda_group_save <cuda_group_save_magic>` and
+:ref:`cuda_group_run <cuda_group_run_magic>` magics.
+
+First, we save the vector addition function to its own file:
+
+
+.. code-block:: c++
+
+    %%cuda_group_save --name "vector_add.cu" --group "vector_add"
+    // vector add kernel: C = A + B
+    __global__ void vadd(const float *A, const float *B, float *C, int ds){
+        int idx = threadIdx.x + blockIdx.x * blockDim.x;
+        if (idx < ds) {
+            C[idx] = A[idx] + B[idx];
+        }
+    }
+
+Now we create a header file so the main cuda file knows the signature of "vadd":
+
+.. code-block:: c++
+
+    %%cuda_group_save --name "vector_add.h" --group "vector_add"
+    __global__ void vadd(const float *A, const float *B, float *C, int ds);
+
+To tie it all together, we save the main cuda file, which includes our vector
+addition code:
+
+.. code-block:: c++
+
+    %%cuda_group_save --name "main.cu" --group "vector_add"
+    #include <stdio.h>
+    #include "error_handling.h"
+    #include "vector_add.h"
+
+    const int DSIZE = 4096;
+    const int block_size = 256;
+
+    int main(){
+        float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
+
+        // allocate space for vectors in host memory
+        h_A = new float[DSIZE];
+        h_B = new float[DSIZE];
+        h_C = new float[DSIZE];
+
+        // initialize vectors in host memory to random values (except for the
+        // result vector whose values do not matter as they will be overwritten)
+        for (int i = 0; i < DSIZE; i++) {
+            h_A[i] = rand()/(float)RAND_MAX;
+            h_B[i] = rand()/(float)RAND_MAX;
+        }
+
+        // allocate space for vectors in device memory
+        cudaMalloc(&d_A, DSIZE*sizeof(float));
+        cudaMalloc(&d_B, DSIZE*sizeof(float));
+        cudaMalloc(&d_C, DSIZE*sizeof(float));
+        cudaCheckErrors("cudaMalloc failure"); // error checking
+
+        // copy vectors A and B from host to device:
+        cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
+        cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
+        cudaCheckErrors("cudaMemcpy H2D failure");
+
+        // launch the vector adding kernel
+        vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
+        cudaCheckErrors("kernel launch failure");
+
+        // wait for the kernel to finish execution
+        cudaDeviceSynchronize();
+        cudaCheckErrors("kernel execution failure");
+
+        cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
+        cudaCheckErrors("cudaMemcpy D2H failure");
+
+        printf("A[0] = %f\n", h_A[0]);
+        printf("B[0] = %f\n", h_B[0]);
+        printf("C[0] = %f\n", h_C[0]);
+        return 0;
+    }
+
+Now we can compile all the source files in the group and execute the main
+function with the following command:
+
+.. code-block:: c++
+
+    %cuda_group_run --group "vector_add"
+
+Profiling
+---------
+
+Another important feature of nvcc4jupyter is its integration with the NVIDIA
+Nsight Compute profiler, which you need to make sure is installed and its
+executable can be found in a directory in your PATH environment variable.
+
+In order to use it and provide the profiler with custom arguments, simply run:
+
+.. code-block:: c++
+
+    %cuda_group_run --group "vector_add" --profile --profiler-args "--section SpeedOfLight"
+
+Running the cell above will compile and execute the vector addition code in the
+"vector_add" group and profile it, keeping only the metrics from the
+"SpeedOfLight" section. The output will contain something similar to:
+
+.. code-block::
+
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.65
+    SM Frequency            cycle/usecond       544.31
+    Elapsed Cycles                  cycle        2,145
+    Memory Throughput                   %         3.19
+    DRAM Throughput                     %         3.19
+    Duration                      usecond         3.94
+    L1/TEX Cache Throughput             %         6.67
+    L2 Cache Throughput                 %         1.98
+    SM Active Cycles                cycle       383.65
+    Compute (SM) Throughput             %         1.19
+    ----------------------- ------------- ------------