diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
new file mode 100644
index 0000000..9088efc
--- /dev/null
+++ b/.devcontainer/Dockerfile
@@ -0,0 +1,19 @@
+FROM ubuntu
+
+ARG VENV_PATH=/opt/dev-venv
+ENV VENV_ACTIVATE=${VENV_PATH}/bin/activate
+
+RUN apt update
+RUN apt install -y python3.10-venv nvidia-cuda-toolkit gcc vim git
+
+# the mkdir command bypasses a profiler error, which allows us to run the
+# profiler with host code only to at least check that the profiler parameters
+# are correctly provided; without this line, some tests will fail
+RUN mkdir -p /usr/lib/x86_64-linux-gnu/nsight-compute/sections
+
+# we create the virtualenv here so that the devcontainer.json setting
+# python.defaultInterpreterPath can be used to find it; if we do it in the
+# post_create.sh script, the virtualenv will not be loaded and features like
+# pylance, black, isort, etc. will not work
+RUN python3.10 -m venv ${VENV_PATH}
+RUN echo "source ${VENV_ACTIVATE}" >> ~/.bashrc
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 0000000..c6e997c
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,26 @@
+{
+    "name": "Python Environment",
+    "build": {
+        "dockerfile": "Dockerfile",
+        "context": ".."
+    },
+    "postCreateCommand": "bash .devcontainer/post_create.sh",
+    "customizations": {
+        "vscode": {
+            "extensions": [
+                "editorconfig.editorconfig",
+                "ms-azuretools.vscode-docker",
+                "ms-python.python",
+                "ms-python.vscode-pylance",
+                "ms-python.pylint",
+                "ms-python.isort",
+                "ms-python.flake8",
+                "ms-python.black-formatter",
+                "ryanluker.vscode-coverage-gutters"
+            ],
+            "settings": {
+                "python.defaultInterpreterPath": "/opt/dev-venv/bin/python"
+            }
+        }
+    }
+}
diff --git a/.devcontainer/post_create.sh b/.devcontainer/post_create.sh
new file mode 100644
index 0000000..15fd069
--- /dev/null
+++ b/.devcontainer/post_create.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# install developer dependencies
+pip install .[dev]
+
+# make sure the developer uses pre-commit hooks
+pre-commit install
diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..bd44966
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,5 @@
+[flake8]
+max-line-length = 79
+select = F,E,W,B,B901,B902,B903
+exclude = .eggs,.git,.tox,nssm,obj,out,packages,pywin32,tests,swagger_client
+ignore = E722,B001,W503,E203
diff --git a/.github/workflows/code-quality-master.yml b/.github/workflows/code-quality-master.yml
new file mode 100644
index 0000000..ab4f9f0
--- /dev/null
+++ b/.github/workflows/code-quality-master.yml
@@ -0,0 +1,22 @@
+# Same as `code-quality-pr.yml` but triggered on commit to master branch
+# and runs on all files (instead of only the changed ones)
+
+name: Code Quality Master
+
+on:
+  push:
+    branches: [master]
+
+jobs:
+  code-quality:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+
+      - name: Run pre-commits
+        uses: pre-commit/action@v2.0.3
diff --git a/.github/workflows/code-quality-pr.yml b/.github/workflows/code-quality-pr.yml
new file mode 100644
index 0000000..4bffba5
--- /dev/null
+++ b/.github/workflows/code-quality-pr.yml
@@ -0,0 +1,36 @@
+# This workflow finds which files were changed, prints them,
+# and runs `pre-commit` on those files.
+
+# Inspired by the sktime library:
+# https://github.com/alan-turing-institute/sktime/blob/main/.github/workflows/test.yml
+
+name: Code Quality PR
+
+on:
+  pull_request:
+    branches: [master, "release/*", "dev"]
+
+jobs:
+  code-quality:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v2
+
+      - name: Find modified files
+        id: file_changes
+        uses: trilom/file-changes-action@v1.2.4
+        with:
+          output: " "
+
+      - name: List modified files
+        run: echo '${{ steps.file_changes.outputs.files}}'
+
+      - name: Run pre-commits
+        uses: pre-commit/action@v2.0.3
+        with:
+          extra_args: --files ${{ steps.file_changes.outputs.files}}
diff --git a/.github/workflows/publish-to-pypi.yml b/.github/workflows/publish-to-pypi.yml
new file mode 100644
index 0000000..a92716f
--- /dev/null
+++ b/.github/workflows/publish-to-pypi.yml
@@ -0,0 +1,46 @@
+name: Publish Python 🐍 distribution 📦 to PyPI
+
+on: push
+
+jobs:
+  build:
+    name: Build distribution 📦
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: "3.x"
+    - name: Install pypa/build
+      run: python3 -m pip install build --user
+    - name: Build a binary wheel and a source tarball
+      run: python3 -m build
+    - name: Store the distribution packages
+      uses: actions/upload-artifact@v3
+      with:
+        name: python-package-distributions
+        path: dist/
+
+  publish-to-pypi:
+    name: >-
+      Publish Python 🐍 distribution 📦 to PyPI
+    if: startsWith(github.ref, 'refs/tags/')  # only publish to PyPI on tag pushes
+    needs:
+    - build
+    runs-on: ubuntu-latest
+    environment:
+      name: pypi
+      url: https://pypi.org/p/nvcc4jupyter
+    permissions:
+      id-token: write
+
+    steps:
+    - name: Download all the dists
+      uses: actions/download-artifact@v3
+      with:
+        name: python-package-distributions
+        path: dist/
+    - name: Publish distribution 📦 to PyPI
+      uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
new file mode 100644
index 0000000..6fd78e1
--- /dev/null
+++ b/.github/workflows/test.yml
@@ -0,0 +1,86 @@
+name: Tests
+
+on:
+  push:
+    branches: [master]
+  pull_request:
+    branches: [master, "release/*", "dev"]
+
+jobs:
+  run_tests_ubuntu:
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ["ubuntu-latest"]
+        python-version: ["3.10", "3.11", "3.12"]
+
+    timeout-minutes: 20
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{ matrix.python-version }}
+
+      # the mkdir command bypasses a profiler error, which allows us to run the
+      # profiler with host code only to at least check that the profiler
+      # parameters are correctly provided
+      - name: Install CUDA tools
+        run: |
+          sudo apt update
+          sudo apt install nvidia-cuda-toolkit
+          sudo mkdir -p /usr/lib/x86_64-linux-gnu/nsight-compute/sections
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r tests/requirements.txt
+
+      - name: List dependencies
+        run: |
+          python -m pip list
+
+      - name: Run pytest
+        run: |
+          pytest -v
+
+  # upload code coverage report
+  code-coverage:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+        with:
+          lfs: "true"
+      - run: git lfs pull
+
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v2
+        with:
+          python-version: "3.10"
+
+      - name: Install CUDA tools
+        run: |
+          sudo apt update
+          sudo apt install nvidia-cuda-toolkit
+          sudo mkdir -p /usr/lib/x86_64-linux-gnu/nsight-compute/sections
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r tests/requirements.txt
+          pip install pytest-cov[toml]
+
+      - name: Run tests and collect coverage
+        run: pytest --cov nvcc4jupyter
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.gitignore b/.gitignore
index 3d72576..3485fef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,37 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# Distribution / packaging
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+
+# Virtual Environment
+*env*
+
+# Misc
+.pytest_cache/
 .DS_Store
-.idea
\ No newline at end of file
+.idea
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..de41dce
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,53 @@
+default_language_version:
+  python: python3
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      # list of supported hooks: https://pre-commit.com/hooks.html
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-docstring-first
+      - id: check-yaml
+      - id: debug-statements
+      - id: detect-private-key
+      - id: check-executables-have-shebangs
+      - id: check-toml
+      - id: check-case-conflict
+      - id: check-added-large-files
+
+  # python code formatting
+  - repo: https://github.com/psf/black
+    rev: 23.12.1
+    hooks:
+      - id: black
+        args: ["--config", "pyproject.toml"]
+
+  # python import sorting
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: ["--settings-path", "pyproject.toml"]
+
+  # python check (PEP8), programming errors and code complexity
+  - repo: https://github.com/PyCQA/flake8
+    rev: 7.0.0
+    hooks:
+      - id: flake8
+        args: ["--config", ".flake8"]
+
+  # pylint check
+  - repo: https://github.com/pycqa/pylint
+    rev: v3.0.3
+    hooks:
+    -   id: pylint
+        args: ["--rcfile", "pyproject.toml"]
+
+  - repo: https://github.com/PyCQA/bandit
+    rev: 1.7.6
+    hooks:
+    - id: bandit
+      args: ["-c", "pyproject.toml"]
+      additional_dependencies: ["bandit[toml]"]
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
new file mode 100644
index 0000000..314290f
--- /dev/null
+++ b/.readthedocs.yaml
@@ -0,0 +1,32 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the OS, Python version and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.10"
+    # You can also specify other tool versions:
+    # nodejs: "19"
+    # rust: "1.64"
+    # golang: "1.19"
+
+# Build documentation in the "docs/" directory with Sphinx
+sphinx:
+  configuration: docs/source/conf.py
+
+# Optionally build your docs in additional formats such as PDF and ePub
+# formats:
+#    - pdf
+#    - epub
+
+# Optional but recommended, declare the Python requirements required
+# to build your documentation
+# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
+python:
+   install:
+   - requirements: docs/requirements.txt
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..4785bc9
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,33 @@
+{
+    "editor.formatOnSave": true,
+    "editor.formatOnPaste": true,
+    "files.trimTrailingWhitespace": true,
+    "files.autoSave": "onFocusChange",
+    "git.autofetch": true,
+    "[jsonc]": {
+        "editor.defaultFormatter": "vscode.json-language-features"
+    },
+    "[python]": {
+        "editor.defaultFormatter": "ms-python.black-formatter",
+        "editor.formatOnSave": true,
+        "editor.codeActionsOnSave": {
+            "source.organizeImports": "explicit"
+        },
+    },
+    "python.defaultInterpreterPath": "/usr/local/bin/python",
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true,
+    "pylint.args": [
+        "--rcfile=pyproject.toml"
+    ],
+    "black-formatter.args": [
+        "--config=pyproject.toml"
+    ],
+    "flake8.args": [
+        "--config",
+        ".flake8"
+    ],
+    "isort.args": [
+        "--settings-path=pyproject.toml"
+    ]
+}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..134312e
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2018-2024 Andrei Nechaev, Cosmin Stefan Ciocan and others
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
index b9ddce0..f60b626 100644
--- a/README.md
+++ b/README.md
@@ -1,37 +1,103 @@
-## NVCC Plugin for Jupyter notebook
+# nvcc4jupyter: CUDA C++ plugin for Jupyter Notebook
 
-### V2 is available
+| | |
+| --- | --- |
+| Testing | ![Python Versions][python-version] [![CI - Test][test-badge]][test-workflow] [![Coverage][coverage-badge]][coverage-results] |
+| Code Quality | [![Code style: black][black-badge]][black-project] [![security: bandit][bandit-badge]][bandit-project]|
+| Package | [![PyPI Latest Release][pypi-latest-version]][pypi-project-url] [![PyPI Downloads][pypi-downloads]][pypi-project-url] |
 
-V2 brings support of multiple source and header files.
+<!-- Testing badges -->
+[python-version]: https://img.shields.io/pypi/pyversions/nvcc4jupyter
+[test-badge]: https://github.com/cosminc98/nvcc4jupyter/actions/workflows/test.yml/badge.svg
+[test-workflow]: https://github.com/cosminc98/nvcc4jupyter/actions/workflows/test.yml
+[coverage-badge]: https://codecov.io/github/cosminc98/nvcc4jupyter/coverage.svg?branch=master
+[coverage-results]: https://codecov.io/gh/cosminc98/nvcc4jupyter
 
-##### Usage
+<!-- Code Quality badges -->
+[black-badge]: https://img.shields.io/badge/code%20style-black-000000.svg
+[black-project]: https://github.com/ambv/black
+[bandit-badge]: https://img.shields.io/badge/security-bandit-yellow.svg
+[bandit-project]: https://github.com/PyCQA/bandit
 
-- Install and load extension
-```
-!pip install git+https://github.com/andreinechaev/nvcc4jupyter.git
-%load_ext nvcc_plugin
+<!-- Package badges -->
+[pypi-project-url]: https://pypi.org/project/nvcc4jupyter/
+[pypi-latest-version]: https://img.shields.io/pypi/v/nvcc4jupyter.svg
+[pypi-downloads]: https://img.shields.io/pypi/dm/nvcc4jupyter.svg?label=PyPI%20downloads
+
+**nvcc4jupyter** is a Jupyter Notebook plugin that provides cell and line
+[magics](https://ipython.readthedocs.io/en/stable/interactive/magics.html)
+to allow running CUDA C++ code from a notebook. This is especially
+useful when combined with a hosted service such as Google's
+[Colab](https://colab.research.google.com/), which provides CUDA capable GPUs,
+so you can start learning CUDA C++ without having to install anything or even
+to own a GPU yourself.
+
+## Table of Contents
+
+- [Main Features](#main-features)
+- [Install](#install)
+- [Usage](#usage)
+- [License](#license)
+- [Documentation](#documentation)
+- [Contributing](#contributing)
+
+## Main Features
+Here are just a few of the things that nvcc4jupyter does well:
+
+  - [Easily run CUDA C++ code](https://nvcc4jupyter.readthedocs.io/en/latest/usage.html#hello-world)
+  - [Profile your code with NVIDIA Nsight Compute](https://nvcc4jupyter.readthedocs.io/en/latest/usage.html#profiling)
+  - [Share code between different programs in the same notebook / split your code into multiple files for improved readability](https://nvcc4jupyter.readthedocs.io/en/latest/usage.html#groups)
+
+## Install
+The installer for the latest released version is available at the [Python
+Package Index (PyPI)](https://pypi.org/project/nvcc4jupyter).
+
+```sh
+pip install nvcc4jupyter
 ```
 
-- Mark a cell to be treated as cuda cell
-> `%%cuda --name example.cu --compile false`
->> NOTE: The cell must contain either code or comments to be run successfully. 
->> It accepts 2 arguments. `-n` | `--name`  - which is the name of either CUDA source or Header
->> The name parameter must have extension `.cu` or `.h`
->> Second argument `-c` | `--compile`; default value is `false`. The argument is a flag to specify
->> if the cell will be compiled and run right away or not. It might be usefull if you're playing in
->> the `main` function
+## Usage
 
-- To compile and run all CUDA files you need to run
+First, load the extension to enable the magic commands:
 ```
-%%cuda_run
-# This line just to bypass an exeption and can contain any text
+%load_ext nvcc4jupyter
 ```
 
-- To profile your CUDA kernels using NVIDIA Nsight Compute CLI profiler you need to run
+Running a quick CUDA Hello World program:
+```c++
+%%cuda
+#include <stdio.h>
+
+__global__ void hello(){
+    printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
+}
+
+int main(){
+    hello<<<2, 2>>>();
+    cudaDeviceSynchronize();
+}
 ```
-%%cu --profile
+
+For more advanced use cases, see [the documentation](https://nvcc4jupyter.readthedocs.io/en/latest/usage.html).
+
+## Documentation
+The official documentation is hosted on [readthedocs](https://nvcc4jupyter.readthedocs.io/).
+
+## License
+[MIT](LICENSE)
+
+## Contributing
+
+Install the package with the development dependencies:
+```bash
+pip install .[dev]
 ```
-- You can add options to the profiler. Keep in mind that any argument after "--profiler-args" will be considered as a profiler argument. For example, to select which sections to collect metrics for you need to run
-```
-%%cu --profile --profiler-args --section SpeedOfLight --section MemoryWorkloadAnalysis --section Occupancy
+
+As a developer, make sure you install the pre-commit hook before committing any changes:
+```bash
+pre-commit install
 ```
+
+<hr>
+
+[Go to Top](#table-of-contents)
diff --git a/common/helper.py b/common/helper.py
deleted file mode 100644
index 17dcfff..0000000
--- a/common/helper.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import argparse
-
-
-def get_argparser():
-    parser = argparse.ArgumentParser(description='NVCCPlugin params')
-    parser.add_argument(
-        '-t', 
-        '--timeit',
-        action='store_true',
-        help='If set, returns the output of the "timeit" built-in ipython magic instead of stdout.',
-    )
-    parser.add_argument(
-        '-p', 
-        '--profile', 
-        action='store_true',
-        help='If set, runs the nvidia nsight compute profiler. Has no effect if used with --timeit.',
-    )
-    parser.add_argument(
-        '-a',
-        '--profiler-args',
-        type=str,
-        nargs=argparse.REMAINDER,
-        default=[],
-        help='Extra options that can be passed to the nvidia nsight compute profiler. '
-             'Must be the last option given to the argument parser so you can pass arguments with dashes.',
-    )
-    return parser
-
-
-def print_out(out: str):
-    for l in out.split('\n'):
-        print(l)
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d0c3cbf
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..319c288
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000..53fc1f3
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,2 @@
+sphinx==7.1.2
+sphinx-rtd-theme==1.3.0rc1
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 0000000..665059c
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,40 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+project = "nvcc4jupyter"
+copyright = "2024, Andrei Nechaev & Cosmin Stefan Ciocan"
+author = "Andrei Nechaev & Cosmin Stefan Ciocan"
+release = "1.0.1"
+version = "1.0.1"
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = [
+    "sphinx.ext.duration",
+    "sphinx.ext.doctest",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.intersphinx",
+]
+
+intersphinx_mapping = {
+    "python": ("https://docs.python.org/3/", None),
+    "sphinx": ("https://www.sphinx-doc.org/en/master/", None),
+}
+intersphinx_disabled_domains = ["std"]
+
+templates_path = ["_templates"]
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = "sphinx_rtd_theme"
+html_static_path = ["_static"]
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 0000000..1f07bdd
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,13 @@
+Welcome to nvcc4jupyter's documentation!
+========================================
+
+This IPython extension allows running CUDA C++ code in a Jupyter notebook. This
+is especially useful when combined with `Google Colab <https://colab.research.google.com/>`_
+which provides CUDA capable GPUs with the CUDA toolkit already installed.
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   usage
+   magics
diff --git a/docs/source/magics.rst b/docs/source/magics.rst
new file mode 100644
index 0000000..2073f35
--- /dev/null
+++ b/docs/source/magics.rst
@@ -0,0 +1,172 @@
+**********
+Magics API
+**********
+
+.. note::
+   Arguments for profilers and the nvcc compiler can be passed in double
+   quotes so they can contain spaces and dashes.
+
+------
+
+.. _cuda_magic:
+
+cuda
+====
+
+Magic command that compiles, runs, and profiles CUDA C++ code in the cell.
+
+Usage
+-----
+
+   - ``%%cuda``: Compile and run this cell.
+   - ``%%cuda -p``: Also runs the Nsight Compute profiler.
+   - ``%%cuda -p -a "<SPACE SEPARATED PROFILER ARGS>"``: Also runs the Nsight Compute profiler, passing it the given arguments.
+   - ``%%cuda -t``: Outputs the "timeit" built-in magic results.
+
+Options
+-------
+
+-t, --timeit
+   Boolean. If set, returns the output of the "timeit" built-in
+   ipython magic instead of stdout.
+
+-p, --profile
+   Boolean. If set, runs the NVIDIA Nsight Compute profiler whose
+   output is appended to standard output.
+
+-a, --profiler-args
+   String. Optional profiler arguments that can be space separated
+   by wrapping them in double quotes. See all options here:
+   `Nsight Compute CLI <https://docs.nvidia.com/nsight-compute/NsightComputeCli/index.html#command-line-options>`_
+
+.. note::
+   If both "\-\-profile" and "\-\-timeit" are used then no profiling is
+   done.
+
+Examples
+--------
+::
+
+   # compile, run, and profile the code in the cell with the Nsight
+   # compute profiler while collecting only metrics from the
+   # "MemoryWorkloadAnalysis" section.
+   %%cuda --profile --profiler-args "--section MemoryWorkloadAnalysis"
+
+------
+
+.. _cuda_group_save_magic:
+
+cuda_group_save
+===============
+
+Magic command that saves CUDA C++ code in the cell for later
+compilation and execution with possibly more source files.
+
+Usage
+-----
+
+   - ``%%cuda_group_save -n <FILENAME> -g <GROUPNAME>``: Save the code in the current cell to a group of source files.
+
+Options
+-------
+
+-n, --name
+   String. Required file name of the saved source file. Must have
+   either the ".cu" or ".h" extension. In order to import a header
+   file saved with this magic you can simply add '#include "<name>"'.
+
+-g, --group
+   String. Required group name to which to add the saved source file.
+   Groups are source files that get compiled together and do not
+   interact with other groups. This allows you to have multiple
+   unrelated CUDA programs within the same jupyter notebook. Adding
+   files to a group named "shared" will make them available to all
+   other source file groups. One use case for the shared group is for
+   sharing error handling code which should be present in all CUDA
+   programs.
+
+Examples
+--------
+::
+
+   # jupyter cell 1
+   %%cuda_group_save -n "error_handling.h" -g "shared"
+   <ERROR HANDLING CODE>
+
+   # jupyter cell 2
+   %%cuda_group_save -n "main.cu" -g "example_group"
+   #include "error_handling.h"
+   <YOUR CODE HERE>
+
+------
+
+.. _cuda_group_run_magic:
+
+cuda_group_run
+==============
+
+Line magic command that compiles, runs, and profiles all source files
+in a group.
+
+Usage
+-----
+
+   - ``%cuda_group_run -g <GROUPNAME>``: Compiles, runs, and profiles the source files in the given group.
+
+Options
+-------
+
+-g, --group
+   String. Required group name whose source files should be compiled and run.
+
+.. note::
+   All options from the "%%cuda" cell magic are inherited.
+
+Examples
+--------
+::
+
+   # jupyter cell 1
+   %%cuda_group_save -n "error_handling.h" -g "shared"
+   <ERROR HANDLING CODE>
+
+   # jupyter cell 2
+   %%cuda_group_save -n "main.cu" -g "example_group"
+   #include "error_handling.h"
+   <YOUR CODE HERE>
+
+   # jupyter cell 3
+   %cuda_group_run -g "example_group" --profile
+
+-----
+
+.. _cuda_group_delete_magic:
+
+cuda_group_delete
+=================
+
+Line magic command that deletes all source files in a group.
+
+Usage
+-----
+
+   - ``%cuda_group_delete -g <GROUPNAME>``: Removes all source files in the given group.
+
+Options
+-------
+
+-g, --group
+   String. Required group name whose source files should be deleted.
+
+Examples
+--------
+::
+
+   # jupyter cell 1
+   %%cuda_group_save -n "error_handling.h" -g "shared"
+   <ERROR HANDLING CODE>
+
+   # jupyter cell 2 - here we delete the "shared" group; in
+   # practice this would be helpful if you want to overwrite some
+   # functionality that was defined earlier in the notebook
+   %cuda_group_delete -g "shared"
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
new file mode 100644
index 0000000..38ff35c
--- /dev/null
+++ b/docs/source/usage.rst
@@ -0,0 +1,257 @@
+Usage
+=====
+
+Installation
+------------
+
+To use nvcc4jupyter, first install it using pip:
+
+.. code-block:: console
+
+    (venv) $ pip install nvcc4jupyter
+
+Load the Extension
+------------------
+
+Now we need to load the IPython extension to be able to use its cell and line
+magic commands:
+
+.. code-block::
+
+    %load_ext nvcc4jupyter
+
+Hello World
+-----------
+
+We will use the :ref:`cuda <cuda_magic>` cell magic command to run a simple
+hello world program.
+
+.. code-block:: c++
+
+    %%cuda
+    #include <stdio.h>
+
+    __global__ void hello(){
+        printf("Hello from block: %u, thread: %u\n", blockIdx.x, threadIdx.x);
+    }
+
+    int main(){
+        hello<<<2, 2>>>();
+        cudaDeviceSynchronize();
+    }
+
+Groups
+------
+
+Now we will demonstrate a more complex scenario that uses source file groups.
+If you want to split your code into multiple source files, either for code reuse
+or just to have an easier to read project, you want to use groups. A group of
+source files will be compiled together. Because of this, you can include headers
+from the same group and use the code defined in other ".cu" files. There is also
+a special group named "shared" whose files will be compiled together with all
+other groups, which is a great feature for error handling code as we'll show now:
+
+.. code-block:: c++
+
+    %%cuda_group_save --group shared --name "error_handling.h"
+    // error checking macro
+    #define cudaCheckErrors(msg) \
+        do { \
+            cudaError_t __err = cudaGetLastError(); \
+            if (__err != cudaSuccess) { \
+                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
+                    msg, cudaGetErrorString(__err), \
+                    __FILE__, __LINE__); \
+                fprintf(stderr, "*** FAILED - ABORTING\n"); \
+                exit(1); \
+            } \
+        } while (0)
+
+Now we can use that error handling macro in this vector addition program but
+also in other programs that we define in other Jupyter cells:
+
+.. code-block:: c++
+
+    %%cuda
+    #include <stdio.h>
+    #include "error_handling.h"
+
+    const int DSIZE = 4096;
+    const int block_size = 256;
+
+    // vector add kernel: C = A + B
+    __global__ void vadd(const float *A, const float *B, float *C, int ds){
+        int idx = threadIdx.x + blockIdx.x * blockDim.x;
+        if (idx < ds) {
+            C[idx] = A[idx] + B[idx];
+        }
+    }
+
+    int main(){
+        float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
+
+        // allocate space for vectors in host memory
+        h_A = new float[DSIZE];
+        h_B = new float[DSIZE];
+        h_C = new float[DSIZE];
+
+        // initialize vectors in host memory to random values (except for the
+        // result vector whose values do not matter as they will be overwritten)
+        for (int i = 0; i < DSIZE; i++) {
+            h_A[i] = rand()/(float)RAND_MAX;
+            h_B[i] = rand()/(float)RAND_MAX;
+        }
+
+        // allocate space for vectors in device memory
+        cudaMalloc(&d_A, DSIZE*sizeof(float));
+        cudaMalloc(&d_B, DSIZE*sizeof(float));
+        cudaMalloc(&d_C, DSIZE*sizeof(float));
+        cudaCheckErrors("cudaMalloc failure"); // error checking
+
+        // copy vectors A and B from host to device:
+        cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
+        cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
+        cudaCheckErrors("cudaMemcpy H2D failure");
+
+        // launch the vector adding kernel
+        vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
+        cudaCheckErrors("kernel launch failure");
+
+        // wait for the kernel to finish execution
+        cudaDeviceSynchronize();
+        cudaCheckErrors("kernel execution failure");
+
+        cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
+        cudaCheckErrors("cudaMemcpy D2H failure");
+
+        printf("A[0] = %f\n", h_A[0]);
+        printf("B[0] = %f\n", h_B[0]);
+        printf("C[0] = %f\n", h_C[0]);
+        return 0;
+    }
+
+Above we use the :ref:`cuda <cuda_magic>` magic command which saves the code
+in the cell to an anonymous source file group, compiles, and executes that
+code. This only allows us to have one source file (besides the ones in the
+"shared" group). In order to have multiple source files we need to use the
+:ref:`cuda_group_save <cuda_group_save_magic>` and
+:ref:`cuda_group_run <cuda_group_run_magic>` magics.
+
+First, we save the vector addition function to its own file:
+
+
+.. code-block:: c++
+
+    %%cuda_group_save --name "vector_add.cu" --group "vector_add"
+    // vector add kernel: C = A + B
+    __global__ void vadd(const float *A, const float *B, float *C, int ds){
+        int idx = threadIdx.x + blockIdx.x * blockDim.x;
+        if (idx < ds) {
+            C[idx] = A[idx] + B[idx];
+        }
+    }
+
+Now we create a header file so the main cuda file knows the signature of "vadd":
+
+.. code-block:: c++
+
+    %%cuda_group_save --name "vector_add.h" --group "vector_add"
+    __global__ void vadd(const float *A, const float *B, float *C, int ds);
+
+To tie it all together, we save the main cuda file, which includes our vector
+addition code:
+
+.. code-block:: c++
+
+    %%cuda_group_save --name "main.cu" --group "vector_add"
+    #include <stdio.h>
+    #include "error_handling.h"
+    #include "vector_add.h"
+
+    const int DSIZE = 4096;
+    const int block_size = 256;
+
+    int main(){
+        float *h_A, *h_B, *h_C, *d_A, *d_B, *d_C;
+
+        // allocate space for vectors in host memory
+        h_A = new float[DSIZE];
+        h_B = new float[DSIZE];
+        h_C = new float[DSIZE];
+
+        // initialize vectors in host memory to random values (except for the
+        // result vector whose values do not matter as they will be overwritten)
+        for (int i = 0; i < DSIZE; i++) {
+            h_A[i] = rand()/(float)RAND_MAX;
+            h_B[i] = rand()/(float)RAND_MAX;
+        }
+
+        // allocate space for vectors in device memory
+        cudaMalloc(&d_A, DSIZE*sizeof(float));
+        cudaMalloc(&d_B, DSIZE*sizeof(float));
+        cudaMalloc(&d_C, DSIZE*sizeof(float));
+        cudaCheckErrors("cudaMalloc failure"); // error checking
+
+        // copy vectors A and B from host to device:
+        cudaMemcpy(d_A, h_A, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
+        cudaMemcpy(d_B, h_B, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
+        cudaCheckErrors("cudaMemcpy H2D failure");
+
+        // launch the vector adding kernel
+        vadd<<<(DSIZE+block_size-1)/block_size, block_size>>>(d_A, d_B, d_C, DSIZE);
+        cudaCheckErrors("kernel launch failure");
+
+        // wait for the kernel to finish execution
+        cudaDeviceSynchronize();
+        cudaCheckErrors("kernel execution failure");
+
+        cudaMemcpy(h_C, d_C, DSIZE*sizeof(float), cudaMemcpyDeviceToHost);
+        cudaCheckErrors("cudaMemcpy D2H failure");
+
+        printf("A[0] = %f\n", h_A[0]);
+        printf("B[0] = %f\n", h_B[0]);
+        printf("C[0] = %f\n", h_C[0]);
+        return 0;
+    }
+
+Now we can compile all the source files in the group and execute the main
+function with the following command:
+
+.. code-block:: c++
+
+    %cuda_group_run --group "vector_add"
+
+Profiling
+---------
+
+Another important feature of nvcc4jupyter is its integration with the NVIDIA
+Nsight Compute profiler. Make sure the profiler is installed and that its
+executable can be found in a directory in your PATH environment variable.
+
+In order to use it and provide the profiler with custom arguments, simply run:
+
+.. code-block:: c++
+
+    %cuda_group_run --group "vector_add" --profile --profiler-args "--section SpeedOfLight"
+
+Running the cell above will compile and execute the vector addition code in the
+"vector_add" group and profile it, keeping only the metrics from the
+"SpeedOfLight" section. The output will contain something similar to:
+
+.. code-block::
+
+    Section: GPU Speed Of Light Throughput
+    ----------------------- ------------- ------------
+    Metric Name               Metric Unit Metric Value
+    ----------------------- ------------- ------------
+    DRAM Frequency          cycle/nsecond         4.65
+    SM Frequency            cycle/usecond       544.31
+    Elapsed Cycles                  cycle        2,145
+    Memory Throughput                   %         3.19
+    DRAM Throughput                     %         3.19
+    Duration                      usecond         3.94
+    L1/TEX Cache Throughput             %         6.67
+    L2 Cache Throughput                 %         1.98
+    SM Active Cycles                cycle       383.65
+    Compute (SM) Throughput             %         1.19
+    ----------------------- ------------- ------------
diff --git a/nvcc4jupyter/__init__.py b/nvcc4jupyter/__init__.py
new file mode 100644
index 0000000..97b8902
--- /dev/null
+++ b/nvcc4jupyter/__init__.py
@@ -0,0 +1,7 @@
+"""
+nvcc4jupyter: CUDA C++ plugin for Jupyter Notebook
+"""
+
+from .plugin import NVCCPlugin, load_ipython_extension  # noqa: F401
+
+__version__ = "1.0.3"
diff --git a/nvcc4jupyter/parsers.py b/nvcc4jupyter/parsers.py
new file mode 100644
index 0000000..e94afce
--- /dev/null
+++ b/nvcc4jupyter/parsers.py
@@ -0,0 +1,69 @@
+"""
+Parsers for the CUDA magic commands.
+"""
+
+import argparse
+
+
+def get_parser_cuda() -> argparse.ArgumentParser:
+    """
+    Build the "%%cuda" parser (--timeit, --profile, --profiler-args flags).
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "%%cuda magic that compiles and runs CUDA C++ code in this cell."
+            " See https://nvcc4jupyter.readthedocs.io/en/latest/magics.html#cuda"  # noqa: E501
+            " for usage details."
+        )
+    )
+    parser.add_argument("-t", "--timeit", action="store_true")
+    parser.add_argument("-p", "--profile", action="store_true")
+    parser.add_argument("-a", "--profiler-args", type=str, default="")
+    return parser
+
+
+def get_parser_cuda_group_run() -> argparse.ArgumentParser:
+    """
+    %%cuda_group_run magic command parser.
+    """
+    parser = get_parser_cuda()
+    parser.description = (
+        "%%cuda_group_run magic that compiles and runs source files in a given"
+        " group. See"
+        " https://nvcc4jupyter.readthedocs.io/en/latest/magics.html#cuda-group-run"  # noqa: E501
+        " for usage details."
+    )
+    parser.add_argument("-g", "--group", type=str, required=True)
+    return parser
+
+
+def get_parser_cuda_group_save() -> argparse.ArgumentParser:
+    """
+    Build the "%%cuda_group_save" parser (required --name and --group).
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "%%cuda_group_save magic that saves CUDA C++ code in this cell for"
+            " later compilation and execution with possibly more source files."
+            " See https://nvcc4jupyter.readthedocs.io/en/latest/magics.html#cuda-group-save"  # noqa: E501
+            " for usage details."
+        )
+    )
+    parser.add_argument("-n", "--name", type=str, required=True)
+    parser.add_argument("-g", "--group", type=str, required=True)
+    return parser
+
+
+def get_parser_cuda_group_delete() -> argparse.ArgumentParser:
+    """
+    %%cuda_group_delete magic command parser.
+    """
+    parser = argparse.ArgumentParser(
+        description=(
+            "%%cuda_group_delete magic that deletes all files in a group. See"
+            " https://nvcc4jupyter.readthedocs.io/en/latest/magics.html#cuda-group-delete"  # noqa: E501
+            " for usage details."
+        )
+    )
+    parser.add_argument("-g", "--group", type=str, required=True)
+    return parser
diff --git a/nvcc4jupyter/plugin.py b/nvcc4jupyter/plugin.py
new file mode 100644
index 0000000..269a2cc
--- /dev/null
+++ b/nvcc4jupyter/plugin.py
@@ -0,0 +1,321 @@
+"""
+nvcc4jupyter: CUDA C++ plugin for Jupyter Notebook
+"""
+
+import argparse
+import glob
+import os
+import shutil
+import subprocess
+import tempfile
+import uuid
+from typing import List, Optional
+
+# pylint: disable=import-error
+from IPython.core.interactiveshell import InteractiveShell
+from IPython.core.magic import Magics, cell_magic, line_magic, magics_class
+
+from . import parsers
+
+DEFAULT_EXEC_FNAME = "cuda_exec.out"
+SHARED_GROUP_NAME = "shared"
+
+
+def print_out(out: str) -> None:
+    """Print "out" one line at a time, split at newline characters."""
+    for line in out.split("\n"):
+        print(line)
+
+
+@magics_class
+class NVCCPlugin(Magics):
+    """
+    CUDA C++ plugin for Jupyter Notebook
+    """
+
+    def __init__(self, shell: InteractiveShell) -> None:
+        super().__init__(shell)
+        self.shell: InteractiveShell  # type hint not provided by parent class
+
+        self.parser_cuda = parsers.get_parser_cuda()
+        self.parser_cuda_group_save = parsers.get_parser_cuda_group_save()
+        self.parser_cuda_group_delete = parsers.get_parser_cuda_group_delete()
+        self.parser_cuda_group_run = parsers.get_parser_cuda_group_run()
+
+        self.workdir = tempfile.mkdtemp()  # one subdirectory per group
+        print(f'Source files will be saved in "{self.workdir}".')
+
+    def _save_source(
+        self, source_name: str, source_code: str, group_name: str
+    ) -> None:
+        """
+        Write source code to a .cu or .h file in the group's directory,
+        creating the directory if needed and overwriting any existing file of
+        the same name. Files saved to the group named "shared" are made
+        available when compiling any group.
+
+        Args:
+            source_name: The name of the source file. Must end in ".cu" or
+                ".h".
+            source_code: The source code to be written to the source file.
+            group_name: The name of the group directory where the file will be
+                saved.
+
+        Raises:
+            ValueError: If the source name does not have a proper extension.
+        """
+        _, ext = os.path.splitext(source_name)
+        if ext not in (".cu", ".h"):
+            raise ValueError(
+                f'Given source name "{source_name}" must end in ".h" or ".cu".'
+            )
+        group_dirpath = os.path.join(self.workdir, group_name)
+        os.makedirs(group_dirpath, exist_ok=True)
+        source_fpath = os.path.join(group_dirpath, source_name)
+        with open(source_fpath, "w", encoding="utf-8") as f:
+            f.write(source_code)
+
+    def _delete_group(self, group_name: str) -> None:
+        """
+        Removes the given group's directory with all its files, if it exists.
+
+        Args:
+            group_name: The name of the source files group.
+        """
+        group_dirpath = os.path.join(self.workdir, group_name)
+        if os.path.exists(group_dirpath):
+            shutil.rmtree(group_dirpath)
+
+    def _compile(
+        self, group_name: str, executable_fname: str = DEFAULT_EXEC_FNAME
+    ) -> str:
+        """
+        Compiles all source files in a given group together with all source
+        files from the group named "shared".
+
+        Args:
+            group_name: The name of the source file group to be compiled.
+            executable_fname: The output executable file name. Defaults to
+                "cuda_exec.out".
+
+        Raises:
+            RuntimeError: If the group does not exist or if does not have any
+                source files associated with it.
+
+        Returns:
+            The file path of the resulted executable file.
+        """
+        shared_dirpath = os.path.join(self.workdir, SHARED_GROUP_NAME)
+        group_dirpath = os.path.join(self.workdir, group_name)
+        if not os.path.exists(group_dirpath):
+            raise RuntimeError(f'Group "{group_name}" does not exist.')
+
+        source_files = list(glob.glob(os.path.join(group_dirpath, "*.cu")))
+        if len(source_files) == 0:
+            raise RuntimeError(
+                f'Group "{group_name}" does not have any source files.'
+            )
+        source_files.extend(
+            list(glob.glob(os.path.join(shared_dirpath, "*.cu")))
+        )
+
+        executable_fpath = os.path.join(group_dirpath, executable_fname)
+
+        args = [
+            "nvcc",
+            "-I" + shared_dirpath + "," + group_dirpath,
+        ]
+        args.extend(source_files)
+        args.extend(
+            [
+                "-o",
+                executable_fpath,
+                "-Wno-deprecated-gpu-targets",
+            ]
+        )
+        subprocess.check_output(args, stderr=subprocess.STDOUT)
+
+        return executable_fpath
+
+    def _run(
+        self,
+        exec_fpath: str,
+        timeit: bool = False,
+        profile: bool = False,
+        profiler_args: str = "",
+    ) -> str:
+        """
+        Runs a CUDA executable.
+
+        Args:
+            exec_fpath: The file path of the executable.
+            timeit: If True, returns the result of the "timeit" magic instead
+                of the standard output of the CUDA process. Defaults to False.
+            profile: If True, the executable is profiled with NVIDIA Nsight
+                Compute profiling tool and its output is added to stdout.
+                Defaults to False.
+            profiler_args: The profiler arguments used to customize the
+                information gathered by it and its overall behaviour. Defaults
+                to an empty string.
+
+        Returns:
+            The standard output of the CUDA process or the "timeit" magic
+            output.
+        """
+        if timeit:
+            stmt = (
+                f"subprocess.check_output(['{exec_fpath}'],"
+                " stderr=subprocess.STDOUT)"
+            )
+            output = self.shell.run_cell_magic(
+                magic_name="timeit", line="-q -o import subprocess", cell=stmt
+            )
+            # convert TimeitResult object to human readable string
+            output = str(output)
+        else:
+            run_args = []
+            if profile:
+                run_args.extend(["ncu"] + profiler_args.split())
+            run_args.append(exec_fpath)
+            output = subprocess.check_output(
+                run_args, stderr=subprocess.STDOUT
+            )
+            output = output.decode("utf8")
+
+        return output
+
+    def _compile_and_run(
+        self, group_name: str, args: argparse.Namespace
+    ) -> str:
+        try:
+            exec_fpath = self._compile(group_name)
+            output = self._run(
+                exec_fpath=exec_fpath,
+                timeit=args.timeit,
+                profile=args.profile,
+                profiler_args=args.profiler_args,
+            )
+        except subprocess.CalledProcessError as e:
+            output = e.output.decode("utf8")  # compiler or program output
+        return output
+
+    def _read_args(
+        self, line: str, parser: argparse.ArgumentParser
+    ) -> Optional[argparse.Namespace]:
+        """
+        Read arguments from the magic line. Makes sure to keep arguments
+        between double quotes together for use with profiler arguments or
+        compiler arguments.
+
+        Args:
+            line: The arguments on the line of the magic call in the jupyter
+                cell.
+            parser: The parser which will process the arguments after they are
+                correctly tokenized.
+
+        Returns:
+            The parsed arguments.
+        """
+        tokens = line.strip().split('"')
+        args_tokenized: List[str] = []
+        for index, tok in enumerate(tokens):
+            if index % 2 == 0:
+                # tokens found outside double quotes are split at whitespace
+                args_tokenized.extend(tok.split(" "))
+            else:
+                # anything found between double quotes will not be split
+                args_tokenized.append(tok)
+        args_tokenized = [arg for arg in args_tokenized if len(arg) > 0]
+
+        try:
+            return parser.parse_args(args_tokenized)
+        except SystemExit:
+            parser.print_help()
+            return None
+
+    @cell_magic
+    def cuda(self, line: str, cell: str) -> None:
+        """Compile and run the CUDA code in the cell and print its output.
+
+        Args:
+            line: The arguments on the line of the magic call in the jupyter
+                cell.
+            cell: All of the lines in the jupyter cell besides the magic call
+                itself. It should contain all of the source code to be
+                compiled and run.
+        """
+        args = self._read_args(line, self.parser_cuda)
+        if args is None:
+            return
+
+        group_name = str(uuid.uuid4())  # fresh, unique group for every call
+        self._save_source(
+            source_name="single_file.cu",
+            source_code=cell,
+            group_name=group_name,
+        )
+
+        output = self._compile_and_run(group_name, args)
+        print_out(output)
+
+    @cell_magic
+    def cuda_group_save(self, line: str, cell: str) -> None:
+        """
+        Save the CUDA code in the cell as a source file in the given group, to
+        be later compiled and executed by the "cuda_group_run" line magic.
+
+        Args:
+            line: The arguments on the line of the magic call in the jupyter
+                cell.
+            cell: All of the lines in the jupyter cell besides the magic call
+                itself. It should contain all of the source code to be
+                saved.
+        """
+        args = self._read_args(line, self.parser_cuda_group_save)
+        if args is None:
+            return
+
+        self._save_source(
+            source_name=args.name,
+            source_code=cell,
+            group_name=args.group,
+        )
+
+    @line_magic
+    def cuda_group_run(self, line: str) -> None:
+        """
+        Compile and run the given group's source files and print the output.
+
+        Args:
+            line: The arguments on the line of the magic call in the jupyter
+                cell.
+        """
+        args = self._read_args(line, self.parser_cuda_group_run)
+        if args is None:
+            return
+
+        output = self._compile_and_run(args.group, args)
+        print_out(output)
+
+    @line_magic
+    def cuda_group_delete(self, line: str) -> None:
+        """
+        Remove the given source file group and all the files saved in it.
+
+        Args:
+            line: The arguments on the line of the magic call in the jupyter
+                cell.
+        """
+        args = self._read_args(line, self.parser_cuda_group_delete)
+        if args is None:
+            return
+
+        self._delete_group(args.group)
+
+
+def load_ipython_extension(shell: InteractiveShell) -> None:
+    """
+    Hook used by IPython to load the extension; registers the plugin magics.
+    """
+    nvcc_plugin = NVCCPlugin(shell)
+    shell.register_magics(nvcc_plugin)
diff --git a/nvcc_plugin.py b/nvcc_plugin.py
deleted file mode 100644
index 81d2c8d..0000000
--- a/nvcc_plugin.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from v1.v1 import NVCCPlugin as NVCC_V1
-from v2.v2 import NVCCPluginV2 as NVCC_V2
-
-
-def load_ipython_extension(ip):
-    nvcc_plugin = NVCC_V1(ip)
-    ip.register_magics(nvcc_plugin)
-
-    nvcc_plugin_v2 = NVCC_V2(ip)
-    ip.register_magics(nvcc_plugin_v2)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..71966ef
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,291 @@
+[build-system]
+requires = ["hatchling >= 1.13.0"]
+build-backend = "hatchling.build"
+
+[project]
+name = "nvcc4jupyter"
+description = "Jupyter notebook plugin to run CUDA C/C++ code"
+readme = "README.md"
+requires-python = ">=3.10"
+license = {text = "MIT License"}
+authors = [
+    { name = "Andrei Nechaev", email = "lyfaradey@yahoo.com" },
+    { name = "Cosmin Stefan Ciocan", email = "ciocan.cosmin98@gmail.com" },
+]
+classifiers = [
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    'Programming Language :: Python :: 3.10',
+    'Programming Language :: Python :: 3.11',
+    'Programming Language :: Python :: 3.12',
+    'Environment :: GPU',
+    'Environment :: GPU :: NVIDIA CUDA',
+    'Framework :: IPython',
+    'Framework :: Jupyter',
+]
+dependencies = []
+dynamic = ["version"]
+
+[project.urls]
+documentation = 'https://nvcc4jupyter.readthedocs.io/'
+repository = 'https://github.com/andreinechaev/nvcc4jupyter'
+
+[tool.hatch.version]
+path = "nvcc4jupyter/__init__.py"
+
+[tool.hatch.build.targets.wheel]
+packages = ["nvcc4jupyter"]
+
+[project.optional-dependencies]
+testing = ["pytest>=7.4.3", "IPython>=8.19.0"]
+dev = ["pytest>=7.4.3", "IPython>=8.19.0", "pre-commit>=3.6.0", "pytest-cov[toml]>=4.1.0"]
+
+
+[tool.pytest.ini_options]
+addopts = [
+  "--color=yes",
+  "--durations=0",
+  "--strict-markers",
+  "--doctest-modules",
+]
+filterwarnings = [
+  "ignore::DeprecationWarning",
+  "ignore::UserWarning",
+]
+log_cli = "True"
+markers = [
+  "slow: slow tests",
+]
+minversion = "6.0"
+testpaths = "tests/"
+
+[tool.coverage.report]
+exclude_lines = [
+    "pragma: nocover",
+    "raise NotImplementedError",
+    "raise NotImplementedError()",
+    "if __name__ == .__main__.:",
+]
+
+[tool.isort]
+profile = "black"
+
+[tool.bandit]
+exclude_dirs = ["build","dist","tests","scripts"]
+number = 4
+recursive = true
+targets = "src"
+# B404 and B603 are skipped because the user can already run any arbitrary
+# command on their jupyter server
+skips = ["B101", "B311", "B404", "B603"]
+
+[tool.black]
+line-length = 79
+fast = true
+experimental-string-processing = true
+
+[tool.coverage.run]
+branch = true
+
+[tool.pyright]
+include = ["src"]
+exclude = [
+    "**/node_modules",
+    "**/__pycache__",
+]
+venv = "env37"
+
+reportMissingImports = true
+reportMissingTypeStubs = false
+
+pythonVersion = "3.7"
+pythonPlatform = "Linux"
+
+executionEnvironments = [
+  { root = "src" }
+]
+
+[tool.tox]
+legacy_tox_ini = """
+[tox]
+envlist = py, integration, spark, all
+[testenv]
+commands =
+    pytest -m "not integration and not spark" {posargs}
+[testenv:integration]
+commands =
+    pytest -m "integration" {posargs}
+[testenv:spark]
+extras = spark
+setenv =
+       PYSPARK_DRIVER_PYTHON = {envpython}
+       PYSPARK_PYTHON = {envpython}
+commands =
+    pytest -m "spark" {posargs}
+[testenv:all]
+extras = all
+setenv =
+       PYSPARK_DRIVER_PYTHON = {envpython}
+       PYSPARK_PYTHON = {envpython}
+commands =
+    pytest {posargs}
+"""
+
+[tool.pylint]
+extension-pkg-whitelist= [
+    "numpy",
+    "torch",
+    "cv2",
+    "pyodbc",
+    "pydantic",
+    "ciso8601",
+    "netcdf4",
+    "scipy"
+]
+ignore="CVS"
+ignore-patterns="test.*?py,conftest.py"
+ignore-paths="docs,tests"
+init-hook='import sys; sys.setrecursionlimit(8 * sys.getrecursionlimit())'
+jobs=0
+limit-inference-results=100
+persistent="yes"
+suggestion-mode="yes"
+unsafe-load-any-extension="no"
+
+[tool.pylint.'MESSAGES CONTROL']
+enable="c-extension-no-member"
+
+[tool.pylint.'REPORTS']
+evaluation="10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)"
+output-format="text"
+reports="no"
+score="yes"
+
+[tool.pylint.'REFACTORING']
+max-nested-blocks=5
+never-returning-functions="sys.exit"
+
+[tool.pylint.'BASIC']
+argument-naming-style="snake_case"
+attr-naming-style="snake_case"
+bad-names= [
+    "foo",
+    "bar"
+]
+class-attribute-naming-style="any"
+class-naming-style="PascalCase"
+const-naming-style="UPPER_CASE"
+docstring-min-length=-1
+function-naming-style="snake_case"
+good-names= [
+    "i",
+    "j",
+    "k",
+    "ex",
+    "Run",
+    "_"
+]
+include-naming-hint="yes"
+inlinevar-naming-style="any"
+method-naming-style="snake_case"
+module-naming-style="any"
+no-docstring-rgx="^_"
+property-classes="abc.abstractproperty"
+variable-naming-style="snake_case"
+
+[tool.pylint.'FORMAT']
+ignore-long-lines="^\\s*(# )?.*['\"]?<?https?://\\S+>?"
+indent-after-paren=4
+indent-string='    '
+max-line-length=79
+max-module-lines=1000
+single-line-class-stmt="no"
+single-line-if-stmt="no"
+
+[tool.pylint.'LOGGING']
+logging-format-style="old"
+logging-modules="logging"
+
+[tool.pylint.'MISCELLANEOUS']
+notes= [
+    "FIXME",
+    "XXX",
+    "TODO"
+]
+
+[tool.pylint.'SIMILARITIES']
+ignore-comments="yes"
+ignore-docstrings="yes"
+ignore-imports="yes"
+min-similarity-lines=7
+
+[tool.pylint.'SPELLING']
+max-spelling-suggestions=4
+spelling-store-unknown-words="no"
+
+[tool.pylint.'STRING']
+check-str-concat-over-line-jumps="no"
+
+[tool.pylint.'TYPECHECK']
+contextmanager-decorators="contextlib.contextmanager"
+generated-members="numpy.*,np.*,pyspark.sql.functions,collect_list"
+ignore-mixin-members="yes"
+ignore-none="yes"
+ignore-on-opaque-inference="yes"
+ignored-classes="optparse.Values,thread._local,_thread._local,numpy,torch,swagger_client"
+ignored-modules="numpy,torch,swagger_client,netCDF4,scipy"
+missing-member-hint="yes"
+missing-member-hint-distance=1
+missing-member-max-choices=1
+
+[tool.pylint.'VARIABLES']
+additional-builtins="dbutils"
+allow-global-unused-variables="yes"
+callbacks= [
+    "cb_",
+    "_cb"
+]
+dummy-variables-rgx="_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_"
+ignored-argument-names="_.*|^ignored_|^unused_"
+init-import="no"
+redefining-builtins-modules="six.moves,past.builtins,future.builtins,builtins,io"
+
+[tool.pylint.'CLASSES']
+defining-attr-methods= [
+    "__init__",
+    "__new__",
+    "setUp",
+    "__post_init__"
+]
+exclude-protected= [
+    "_asdict",
+    "_fields",
+    "_replace",
+    "_source",
+    "_make"
+]
+valid-classmethod-first-arg="cls"
+valid-metaclass-classmethod-first-arg="cls"
+
+[tool.pylint.'DESIGN']
+max-args=5
+max-attributes=7
+max-bool-expr=5
+max-branches=12
+max-locals=15
+max-parents=7
+max-public-methods=20
+max-returns=6
+max-statements=50
+min-public-methods=2
+
+[tool.pylint.'IMPORTS']
+allow-wildcard-with-all="no"
+analyse-fallback-blocks="no"
+deprecated-modules="optparse,tkinter.tix"
+
+[tool.pylint.'EXCEPTIONS']
+overgeneral-exceptions= [
+    "BaseException",
+    "Exception"
+]
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 0643352..0000000
--- a/setup.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from distutils.core import setup
-
-setup(
-    name='NVCCPlugin',
-    version='0.0.2',
-    author='Andrei Nechaev',
-    author_email='lyfaradey@yahoo.com',
-    py_modules=['nvcc_plugin', 'v2.v2', 'v1.v1', 'common.helper'],
-    url='https://github.com/andreinechaev/nvcc4jupyter',
-    license='LICENSE',
-    description='Jupyter notebook plugin to run CUDA C/C++ code',
-    # long_description=open('README.md').read(),
-)
diff --git a/common/__init__.py b/tests/__init__.py
similarity index 100%
rename from common/__init__.py
rename to tests/__init__.py
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..3bb2d59
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1 @@
+from .fixtures.fixtures import *  # noqa: F401,F403
diff --git a/v1/__init__.py b/tests/fixtures/__init__.py
similarity index 100%
rename from v1/__init__.py
rename to tests/fixtures/__init__.py
diff --git a/tests/fixtures/fixtures.py b/tests/fixtures/fixtures.py
new file mode 100644
index 0000000..93b88fb
--- /dev/null
+++ b/tests/fixtures/fixtures.py
@@ -0,0 +1,57 @@
+import glob
+import os
+
+import pytest
+from IPython.core.interactiveshell import InteractiveShell
+
+from nvcc4jupyter.plugin import NVCCPlugin
+
+
+@pytest.fixture(scope="session")
+def shell() -> InteractiveShell:
+    return InteractiveShell()
+
+
+@pytest.fixture(scope="session")
+def plugin(shell: InteractiveShell) -> NVCCPlugin:
+    return NVCCPlugin(shell=shell)
+
+
+@pytest.fixture(scope="session")
+def tests_path() -> str:
+    return "tests"
+
+
+@pytest.fixture(scope="session")
+def fixtures_path(tests_path: str) -> str:
+    return os.path.join(tests_path, "fixtures")
+
+
+@pytest.fixture(scope="session")
+def sample_magic_cu_line() -> str:
+    # fmt: off
+    return '--profile --profiler-args "--metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum"'  # noqa: E501
+    # fmt: on
+
+
+@pytest.fixture(scope="session")
+def sample_cuda_fpath(fixtures_path: str) -> str:
+    return os.path.join(fixtures_path, "single_file", "hello.cu")
+
+
+@pytest.fixture(scope="session")
+def sample_cuda_code(sample_cuda_fpath: str) -> str:
+    with open(sample_cuda_fpath, "r", encoding="utf-8") as f:
+        return f.read()
+
+
+@pytest.fixture(scope="session")
+def timeit_regex() -> str:
+    return r".+ ± .+ per loop \(mean ± std. dev. of .+ runs, .+ loops each\)"
+
+
+@pytest.fixture(scope="session")
+def multiple_source_fpaths(fixtures_path: str) -> list[str]:
+    pattern_h = os.path.join(fixtures_path, "multiple_files", "*.h")
+    pattern_cu = os.path.join(fixtures_path, "multiple_files", "*.cu")
+    return list(glob.glob(pattern_h)) + list(glob.glob(pattern_cu))
diff --git a/tests/fixtures/multiple_files/hello.cu b/tests/fixtures/multiple_files/hello.cu
new file mode 100644
index 0000000..7f6c3c6
--- /dev/null
+++ b/tests/fixtures/multiple_files/hello.cu
@@ -0,0 +1,6 @@
+#include <cstdio>
+#include "hello.h"
+
+__host__ void hello(){  // host-only function: no kernel is launched
+    printf("Hello World!\n");
+}
diff --git a/tests/fixtures/multiple_files/hello.h b/tests/fixtures/multiple_files/hello.h
new file mode 100644
index 0000000..f19e5d3
--- /dev/null
+++ b/tests/fixtures/multiple_files/hello.h
@@ -0,0 +1,6 @@
+#ifndef HELLO_H
+#define HELLO_H
+
+void hello();  // defined in hello.cu
+
+#endif
diff --git a/tests/fixtures/multiple_files/main.cu b/tests/fixtures/multiple_files/main.cu
new file mode 100644
index 0000000..5c0ebb8
--- /dev/null
+++ b/tests/fixtures/multiple_files/main.cu
@@ -0,0 +1,6 @@
+#include "hello.h"
+
+int main() {  // entry point: delegates to hello() declared in hello.h
+    hello();
+    return 0;
+}
diff --git a/tests/fixtures/single_file/hello.cu b/tests/fixtures/single_file/hello.cu
new file mode 100644
index 0000000..eda41aa
--- /dev/null
+++ b/tests/fixtures/single_file/hello.cu
@@ -0,0 +1,10 @@
+#include <cstdio>
+
+__host__ void hello(){  // host-only function: no kernel is launched
+    printf("Hello World!\n");
+}
+
+int main() {  // entry point: prints the greeting through hello()
+    hello();
+    return 0;
+}
diff --git a/tests/requirements.txt b/tests/requirements.txt
new file mode 100644
index 0000000..aac9161
--- /dev/null
+++ b/tests/requirements.txt
@@ -0,0 +1,2 @@
+pytest>=7.4.3
+IPython>=8.19.0
diff --git a/tests/test_plugin.py b/tests/test_plugin.py
new file mode 100644
index 0000000..05d340e
--- /dev/null
+++ b/tests/test_plugin.py
@@ -0,0 +1,221 @@
+import argparse
+import math
+import os
+import re
+import shutil
+from typing import List
+
+import pytest
+
+from nvcc4jupyter.plugin import NVCCPlugin
+
+
+def check_profiler_output(output: str):
+    """
+    Assert that the output is a single "Hello World!" line accompanied by at
+    least one profiler warning line, i.e. one starting with "==WARNING==".
+    """
+    prefix = "==WARNING=="
+    all_lines = output.strip().split("\n")
+    warnings = [text for text in all_lines if text.startswith(prefix)]
+    rest = [text for text in all_lines if not text.startswith(prefix)]
+    assert all(text == "Hello World!" for text in rest)
+    assert len(warnings) >= 1
+    assert len(warnings) == len(all_lines) - 1
+
+
+def copy_source_to_group(
+    source_fpath: str, group_name: str, workdir: str
+) -> str:
+    """
+    Copy a source file into <workdir>/<group_name>, creating that directory
+    when it is missing, and return the path of the copy.
+    """
+    target_dir = os.path.join(workdir, group_name)
+    os.makedirs(target_dir, exist_ok=True)
+    return shutil.copy(source_fpath, target_dir)
+
+
+@pytest.fixture(autouse=True, scope="function")
+def before_each(plugin: NVCCPlugin):
+    """Give every test a clean slate by wiping the plugin workdir first."""
+    shutil.rmtree(plugin.workdir, ignore_errors=True)
+    yield  # nothing to clean up after the test
+
+
+def test_save_source(plugin: NVCCPlugin, sample_cuda_code: str) -> None:
+    """A saved source lands in its group dir; a .txt name is rejected."""
+    group, fname = "test_save_source", "sample.cu"
+    plugin._save_source(fname, sample_cuda_code, group)
+    saved_fpath = os.path.join(plugin.workdir, group, fname)
+    assert os.path.exists(saved_fpath)
+    with open(saved_fpath, encoding="utf-8") as saved_file:
+        assert saved_file.read() == sample_cuda_code
+
+    # a file name with an unsupported extension must raise ValueError
+    with pytest.raises(ValueError):
+        plugin._save_source("wrong_extension.txt", sample_cuda_code, group)
+
+
+def test_delete_group(plugin: NVCCPlugin, sample_cuda_fpath: str) -> None:
+    """Deleting a group removes the source file stored in it."""
+    group = "test_delete_group"
+    workdir = plugin.workdir
+    copied_fpath = copy_source_to_group(sample_cuda_fpath, group, workdir)
+    assert os.path.exists(copied_fpath)
+    plugin._delete_group(group)
+    assert not os.path.exists(copied_fpath)
+
+
+def test_compile(
+    plugin: NVCCPlugin,
+    sample_cuda_fpath: str,
+):
+    """
+    Compiling a group yields an executable; compiling an unknown group, or
+    the same group once its source file is removed, raises RuntimeError.
+    """
+    # we artificially create a source file group in the plugin workdir
+    group = "test_compile"
+    src = copy_source_to_group(sample_cuda_fpath, group, plugin.workdir)
+    assert os.path.exists(plugin._compile(group))
+
+    with pytest.raises(RuntimeError):
+        plugin._compile("inexistent_group")
+
+    os.remove(src)  # the only source file of the group is gone now
+    with pytest.raises(RuntimeError):
+        plugin._compile(group)
+
+
+def test_run(
+    plugin: NVCCPlugin,
+    sample_cuda_fpath: str,
+):
+    """Running the compiled sample returns exactly the text it prints."""
+    group = "test_run"
+    copy_source_to_group(sample_cuda_fpath, group, plugin.workdir)
+
+    binary_fpath = plugin._compile(group)
+    assert plugin._run(binary_fpath) == "Hello World!\n"
+
+
+def test_run_timeit(
+    plugin: NVCCPlugin, sample_cuda_fpath: str, timeit_regex: str
+):
+    """With timeit=True the run output must match the timeit regex."""
+    group = "test_run_timeit"
+    copy_source_to_group(sample_cuda_fpath, group, plugin.workdir)
+    binary_fpath = plugin._compile(group)
+    output = plugin._run(binary_fpath, timeit=True)
+    matched = re.match(timeit_regex, output)
+    message = f'Output "{output}" does not match the regex "{timeit_regex}".'
+    assert matched is not None, message
+
+
+def test_run_profile(plugin: NVCCPlugin, sample_cuda_fpath: str):
+    """Profiling the sample with extra profiler arguments does not fail."""
+    group = "test_run_profile"
+    copy_source_to_group(sample_cuda_fpath, group, plugin.workdir)
+    binary_fpath = plugin._compile(group)
+
+    # because we are running without a kernel (in the test env we have no
+    # GPU) it does not matter what arguments we pass to the profiler as its
+    # output will always be just a few warnings; the reason we add them
+    # here is to test that no error is produced when passing the arguments
+    metrics_args = "--metrics l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum"
+    output = plugin._run(
+        binary_fpath,
+        profile=True,
+        profiler_args=metrics_args,
+    )
+    check_profiler_output(output)
+
+
+def test_compile_and_run_multiple_files(
+    plugin: NVCCPlugin, multiple_source_fpaths: List[str]
+):
+    """
+    Builds a single group out of the 3 cuda source files found in
+    tests/fixtures/multiple_files, then compiles and profiles it.
+    """
+    group = "test_compile_and_run_multiple_files"
+    for source_fpath in multiple_source_fpaths:
+        copy_source_to_group(source_fpath, group, plugin.workdir)
+
+    opts = argparse.Namespace(timeit=False, profile=True, profiler_args="")
+    output = plugin._compile_and_run(group, opts)
+    check_profiler_output(output)
+
+
+def test_compile_and_run_multiple_files_shared(
+    plugin: NVCCPlugin, multiple_source_fpaths: List[str]
+):
+    """
+    Compiles and executes the same 3 files from
+    tests/fixtures/multiple_files, but only main.cu is put in the group
+    under test. hello.cu and hello.h go to the "shared" group, which is
+    compiled together with all other groups; this makes it easy to share
+    error handling code and other very common code.
+    """
+    group = "test_compile_and_run_multiple_files_shared"
+    for source_fpath in multiple_source_fpaths:
+        # everything except the entry point goes to the "shared" group
+        is_main = os.path.basename(source_fpath) == "main.cu"
+        target_group = group if is_main else "shared"
+        copy_source_to_group(source_fpath, target_group, plugin.workdir)
+
+    opts = argparse.Namespace(timeit=False, profile=True, profiler_args="")
+    output = plugin._compile_and_run(group, opts)
+    check_profiler_output(output)
+
+
+def test_read_args(plugin: NVCCPlugin):
+    """A quoted value keeps its spaces and dashes when the line is parsed."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-a", type=str, required=True)
+    parser.add_argument("-b", type=float, required=True)
+    quoted = "--this has --spaces and --dashes"
+    parsed = plugin._read_args(f'-a "{quoted}" -b 0.75', parser)
+    assert parsed.a == quoted
+    assert math.isclose(parsed.b, 0.75)
+
+
+def test_magic_cuda(
+    capsys,
+    plugin: NVCCPlugin,
+    sample_cuda_code: str,
+    sample_magic_cu_line: str,
+) -> None:
+    plugin.cuda(sample_magic_cu_line, sample_cuda_code)  # magic under test
+    check_profiler_output(capsys.readouterr().out)  # captured stdout
+
+
+def test_magic_cuda_group_save(plugin: NVCCPlugin, sample_cuda_code: str):
+    """The group-save magic stores the cell in <workdir>/<group>/<name>."""
+    # the group is named after this test, like in the other tests
+    group, fname = "test_magic_cuda_group_save", "sample.cu"
+    plugin.cuda_group_save(f"-g {group} -n {fname}", sample_cuda_code)
+    saved_fpath = os.path.join(plugin.workdir, group, fname)
+    assert os.path.exists(saved_fpath)
+    with open(saved_fpath, encoding="utf-8") as saved_file:
+        assert saved_file.read() == sample_cuda_code
+
+
+def test_magic_cuda_group_run(
+    capsys, plugin: NVCCPlugin, sample_cuda_fpath: str
+) -> None:
+    gname = "test_magic_cuda_group_run"
+    copy_source_to_group(sample_cuda_fpath, gname, plugin.workdir)
+    plugin.cuda_group_run(f"--group {gname} --profile")  # magic under test
+    check_profiler_output(capsys.readouterr().out)  # captured stdout
+
+
+def test_magic_cuda_group_delete(plugin: NVCCPlugin, sample_cuda_fpath: str):
+    """The group-delete magic removes the group's files from the workdir."""
+    gname = "test_magic_cuda_group_delete"  # named after this test
+    workdir = plugin.workdir
+    source_fpath = copy_source_to_group(sample_cuda_fpath, gname, workdir)
+    assert os.path.exists(source_fpath)
+    plugin.cuda_group_delete(f"--group {gname}")
+    assert not os.path.exists(source_fpath)
diff --git a/v1/v1.py b/v1/v1.py
deleted file mode 100644
index ed1aa8e..0000000
--- a/v1/v1.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import os
-import subprocess
-import tempfile
-import uuid
-
-from IPython.core.magic import Magics, cell_magic, magics_class
-from common import helper
-
-compiler = '/usr/local/cuda/bin/nvcc'
-profiler = '/usr/local/cuda/bin/ncu'
-ext = '.cu'
-
-
-@magics_class
-class NVCCPlugin(Magics):
-
-    def __init__(self, shell):
-        super(NVCCPlugin, self).__init__(shell)
-
-        self.argparser = helper.get_argparser()
-
-    @staticmethod
-    def compile(file_path):
-        subprocess.check_output(
-            [compiler, file_path + ext, "-o", file_path + ".out", '-Wno-deprecated-gpu-targets'], stderr=subprocess.STDOUT)
-
-    def run(self, file_path, timeit=False, profile=False, profiler_args=[]):
-        if timeit:
-            stmt = f"subprocess.check_output(['{file_path}.out'], stderr=subprocess.STDOUT)"
-            output = self.shell.run_cell_magic(
-                magic_name="timeit", line="-q -o import subprocess", cell=stmt)
-            output = str(output) # convert TimeitResult object to human readable string
-        else:
-            run_args = []
-            if profile:
-                run_args.extend([profiler] + profiler_args)
-            run_args.append(file_path + ".out")
-            output = subprocess.check_output(run_args, stderr=subprocess.STDOUT)
-            output = output.decode('utf8')
-            
-        helper.print_out(output)
-        return None
-
-    @cell_magic
-    def cu(self, line, cell):
-        try:
-            args = self.argparser.parse_args(line.split())
-        except SystemExit as e:
-            self.argparser.print_help()
-            return
-
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            file_path = os.path.join(tmp_dir, str(uuid.uuid4()))
-            with open(file_path + ext, "w") as f:
-                f.write(cell)
-            try:
-                self.compile(file_path)
-                output = self.run(file_path, timeit=args.timeit, profile=args.profile, profiler_args=args.profiler_args)
-            except subprocess.CalledProcessError as e:
-                helper.print_out(e.output.decode("utf8"))
-                output = None
-        return output
diff --git a/v2/__init__.py b/v2/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/v2/v2.py b/v2/v2.py
deleted file mode 100644
index 41511d6..0000000
--- a/v2/v2.py
+++ /dev/null
@@ -1,107 +0,0 @@
-import os
-import subprocess
-
-from IPython.core.magic import Magics, cell_magic, magics_class
-from IPython.core.magic_arguments import argument, magic_arguments, parse_argstring
-from common import helper
-
-compiler = '/usr/local/cuda/bin/nvcc'
-profiler = '/usr/local/cuda/bin/ncu'
-
-
-@magics_class
-class NVCCPluginV2(Magics):
-
-    def __init__(self, shell):
-        super(NVCCPluginV2, self).__init__(shell)
-        self.argparser = helper.get_argparser()
-        current_dir = os.getcwd()
-        self.output_dir = os.path.join(current_dir, 'src')
-        if not os.path.exists(self.output_dir):
-            os.mkdir(self.output_dir)
-            print(f'created output directory at {self.output_dir}')
-        else:
-            print(f'directory {self.output_dir} already exists')
-
-        self.out = os.path.join(current_dir, "result.out")
-        print(f'Out bin {self.out}')
-
-    @staticmethod
-    def compile(output_dir, file_paths, out):
-        res = subprocess.check_output(
-            [compiler, '-I' + output_dir, file_paths, "-o", out, '-Wno-deprecated-gpu-targets'], stderr=subprocess.STDOUT)
-        res = res.decode()
-        helper.print_out(res)
-
-    def run(self, timeit=False, profile=False, profiler_args=[]):
-        if timeit:
-            stmt = f"subprocess.check_output(['{self.out}'], stderr=subprocess.STDOUT)"
-            output = self.shell.run_cell_magic(
-                magic_name="timeit", line="-q -o import subprocess", cell=stmt)
-            output = str(output) # convert TimeitResult object to human readable string
-        else:
-            run_args = []
-            if profile:
-                run_args.extend([profiler] + profiler_args)
-            run_args.append(self.out)
-            output = subprocess.check_output(run_args, stderr=subprocess.STDOUT)
-            output = output.decode('utf8')
-
-        helper.print_out(output)
-        return None
-
-    @magic_arguments()
-    @argument('-n', '--name', type=str, help='file name that will be produced by the cell. must end with .cu extension')
-    @argument('-c', '--compile', type=bool, help='Should be compiled?')
-    @cell_magic
-    def cuda(self, line='', cell=None):
-        args = parse_argstring(self.cuda, line)
-        ex = args.name.split('.')[-1]
-        if ex not in ['cu', 'h']:
-            raise Exception('name must end with .cu or .h')
-
-        if not os.path.exists(self.output_dir):
-            print(f'Output directory does not exist, creating')
-            try:
-                os.mkdir(self.output_dir)
-            except OSError:
-                print(f"Creation of the directory {self.output_dir} failed")
-            else:
-                print(f"Successfully created the directory {self.output_dir}")
-
-        file_path = os.path.join(self.output_dir, args.name)
-        with open(file_path, "w") as f:
-            f.write(cell)
-
-        if args.compile:
-            try:
-                self.compile(self.output_dir, file_path, self.out)
-                output = self.run(timeit=args.timeit, profile=args.profile, profiler_args=args.profiler_args)
-            except subprocess.CalledProcessError as e:
-                helper.print_out(e.output.decode("utf8"))
-                output = None
-        else:
-            output = f'File written in {file_path}'
-
-        return output
-
-    @cell_magic
-    def cuda_run(self, line='', cell=None):
-        try:
-            args = self.argparser.parse_args(line.split())
-        except SystemExit:
-            self.argparser.print_help()
-            return
-
-        try:
-            cuda_src = os.listdir(self.output_dir)
-            cuda_src = [os.path.join(self.output_dir, x)
-                        for x in cuda_src if x[-3:] == '.cu']
-            print(f'found sources: {cuda_src}')
-            self.compile(self.output_dir, ' '.join(cuda_src), self.out)
-            output = self.run(timeit=args.timeit, profile=args.profile, profiler_args=args.profiler_args)
-        except subprocess.CalledProcessError as e:
-            helper.print_out(e.output.decode("utf8"))
-            output = None
-
-        return output