diff --git a/docker/dli_lab/build_lab.sh b/docker/dli_lab/build_lab.sh index de1161bb..dbdd157c 100755 --- a/docker/dli_lab/build_lab.sh +++ b/docker/dli_lab/build_lab.sh @@ -1,3 +1,3 @@ -cp ../../docs_input/notebooks/gtc_lab/run_matx.py ./ +cp ../../docs_input/notebooks/scripts/run_matx.py ./ docker build -f lab.Dockerfile -t ghcr.io/nvidia/matx/lab:latest . -rm run_matx.py \ No newline at end of file +rm run_matx.py diff --git a/docker/dli_lab/lab.Dockerfile b/docker/dli_lab/lab.Dockerfile index b72ec181..f070ebb3 100644 --- a/docker/dli_lab/lab.Dockerfile +++ b/docker/dli_lab/lab.Dockerfile @@ -1,6 +1,5 @@ -FROM ghcr.io/nvidia/matx/production:latest AS devel - - +#FROM ghcr.io/nvidia/matx/production:latest AS devel +FROM gitlab-master.nvidia.com:5005/devtech-compute/sigx-group/container/build:12.8_x86_64_ubuntu22.04-amd64 AS devel # Prevent interactive prompts during package installation ENV DEBIAN_FRONTEND=noninteractive @@ -27,17 +26,14 @@ RUN apt-get update && apt-get install -y \ RUN python3 -m venv /opt/venv ENV PATH="/opt/venv/bin:$PATH" -RUN mkdir /root/.ipython/profile_default -RUN mkdir /root/.ipython/extensions -RUN echo c.InteractiveShellApp.extensions = ['run_matx'] >> /root/.ipython/profile_default/ipython_config.py -RUN cp ./run_matx.py /root/.ipython/extensions/ - # Install Python packages RUN pip3 install --no-cache-dir \ jupyter \ jupyterlab \ cmake \ nlohmann-json==3.11.2 \ + numpy \ + scipy \ xtl \ pugixml @@ -45,13 +41,23 @@ RUN pip install bash_kernel RUN python -m bash_kernel.install # Create jupyter config directory -RUN mkdir -p ~/.jupyter +RUN mkdir -p /root/.jupyter + +RUN mkdir -p /root/.ipython/profile_default +RUN mkdir -p /root/.ipython/extensions +RUN echo "c.InteractiveShellApp.extensions = ['run_matx']" >> /root/.ipython/profile_default/ipython_config.py +COPY ./run_matx.py /root/.ipython/extensions/ + +ENV PYTHONPATH="${PYTHONPATH}:/root/.ipython/extensions" + +RUN git clone https://github.com/NVIDIA/MatX.git /MatX +RUN cd /MatX && mkdir build && cd build && cmake .. -DMATX_EN_X86_FFTW=ON -DMATX_EN_FILEIO=ON -DMATX_EN_OPENBLAS=ON # Expose Jupyter port EXPOSE 8888 # Set working directory for Jupyter -WORKDIR /notebooks +WORKDIR /MatX/docs_input/notebooks/gtc_lab # Start Jupyter Lab CMD ["jupyter", "lab", "--ip=0.0.0.0", "--port=8888", "--no-browser", "--allow-root"] diff --git a/docs_input/notebooks/01_introduction.ipynb b/docs_input/notebooks/01_introduction.ipynb index 02a7708f..340c25dd 100644 --- a/docs_input/notebooks/01_introduction.ipynb +++ b/docs_input/notebooks/01_introduction.ipynb @@ -33,7 +33,7 @@ "metadata": {}, "source": [ "## Training Structure\n", - "This training contains a series of tutorials in increasing order of name that will guide you through basic and intermediate MatX features. Most tutorials will require you to open the source code, make a small change, and run it from the Jupyter notebook. As time permits, there is another notebook called `99_assignments.ipynb` that will give a series of problems to solve using MatX primitives. The assignments have verification code to make sure the answer is correct." + "This training contains a series of tutorials, ordered by name, that will guide you through basic and intermediate MatX features. Early tutorials use an in-line interpreter that lets you run MatX code natively in the Jupyter cells. Later, more complex, performance-sensitive tutorials will require you to edit code in separate source files and run it from the Jupyter notebook. 
As time permits, there is another notebook called `99_assignments.ipynb` that will give a series of problems to solve using MatX primitives. The assignments have verification code to make sure the answer is correct." ] }, { @@ -97,7 +97,7 @@ "metadata": {}, "source": [ "### 0. Library Import\n", "\n", - "During this tutorial, we will be writing our MatX code in a CUDA file (`.cu`), compiled via the CUDA Compiler, `nvcc`. If you're curious, the specific command line to build and execute code can be found [here](exercises/compile_and_run.sh).\n", + "During this tutorial, the includes and the Jupyter interpreter enable compilation; in deployed code, however, MatX is most often compiled via the CUDA Compiler, `nvcc`. If you're curious, an example command line to build and execute code can be found [here](exercises/compile_and_run.sh).\n", "\n", "When using MatX, be sure to import the library via:\n", "\n", @@ -114,6 +114,47 @@ "Be aware that since MatX mimics a lot of functionality from the standard library, you may see common names from MatX in your current namespace. It's usually recommended to not import the entire `matx` namespace unless absolutely necessary." ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "//todo this should be moved to a hidden init block that runs automatically when the notebook starts\n", + "#pragma cling add_library_path(\"/usr/local/cuda/lib64\")\n", + "#pragma cling add_library_path(\"/opt/xeus/cling/lib\")\n", + "//#pragma cling add_library_path(\"/usr/Lib/gcc/x86_64-Linux-gnu/11/\")\n", + "#pragma cling add_library_path(\"/usr/lib/x86_64-linux-gnu/openblas64-openmp/\")\n", + "#pragma cling add_include_path(\"/usr/local/cuda/include\")\n", + "#pragma cling add_include_path(\"/usr/include/x86_64-linux-gnu/openblas64-openmp\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/include\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/build/_deps/cccl-src/libcudacxx/include\")\n", + "//#pragma cling load(\"libgomp\")\n", + "#pragma cling load(\"libopenblas64\")\n", + "#pragma cling load(\"libcuda\")\n", + "#pragma cling load(\"libcudart\")\n", + "#pragma cling load(\"libcurand\")\n", + "#pragma cling load(\"libcublas\")\n", + "#pragma cling load(\"libcublasLt\")\n", + "\n", + "#include <iostream>\n", + "\n", + "#define MATX_EN_OPENBLAS\n", + "#define MATX_EN_OPENBLAS_LAPACK\n", + "#define MATX_OPENBLAS_64BITINT\n", + "\n", + "#include \"matx.h\"" ] + }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### 1. 
Creation\n", "\n", - "First, we create a tensor object:\n", - "\n", - "```c++\n", - "auto t2 = make_tensor<int>({5,4});\n", - "```\n", + "First, we create a tensor object:\n" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t2 = matx::make_tensor<int>({5,4});" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "\n", "While you can create `tensor_t` objects directly, it is advised to use `make_tensor` instead. `tensor_t` has several template parameters that may change in the future, and using `make_tensor` avoids that ambiguity and always returns the correct tensor type. One exception to this is when a user wants to use `tensor_t` as a class member variable. In that case you need to use `tensor_t` directly, and in the constructor of your class use `make_tensor` with your `tensor_t` variable as the first argument. This will be demonstrated later.\n", "\n", @@ -145,25 +209,100 @@ "source": [ "### 2. Initialization \n", "\n", - "After allocating the tensor, we initialize the underlying data:\n", - "\n", - "```c++\n", - "t2.SetVals({ {1, 2, 3, 4},\n", - " {5, 6, 7, 8},\n", - " {9, 10, 11, 12},\n", - " {13, 14, 15, 16},\n", - " {17, 18, 19, 20}});\n", - "\n", + "After allocating the tensor, we initialize the underlying data:\n" ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[5, 4], Strides:[4,1]\n", + "000000: 1 2 3 4 \n", + "000001: 5 6 7 8 \n", + "000002: 9 10 11 12 \n", + "000003: 13 14 15 16 \n", + "000004: 17 18 19 20 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t2.SetVals({ \n", + " {1, 2, 3, 4},\n", + " {5, 6, 7, 8},\n", + " {9, 10, 11, 12},\n", + " {13, 14, 15, 16},\n", + " {17, 18, 19, 20}\n", + " });\n", "\n", "t2.PrefetchDevice(0);\n", "\n", "matx::print(t2);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The tensor is initialized using a nested initializer list inside of the `SetVals` member function, specifying the values of the matrix. The initializer list is a single-nested list to match a 2D tensor shape, but this can be extended up to 4D tensors. 
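As a sketch of a higher-rank case (the tensor `t3` here is illustrative and not part of the original example), a 3D tensor would take a doubly nested list:\n",
    "\n",
    "```c++\n",
    "auto t3 = matx::make_tensor<int>({2, 2, 2});\n",
    "t3.SetVals({ {{1, 2}, {3, 4}},   // first 2x2 plane\n",
    "             {{5, 6}, {7, 8}} }); // second 2x2 plane\n",
    "```\n",
    "\n",
    "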
`operator()` is also available to set and get individual values of a tensor as an alternative:\n" ] }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[5, 4], Strides:[4,1]\n", + "000000: 42 2 3 4 \n", + "000001: 5 6 7 8 \n", + "000002: 9 10 11 12 \n", + "000003: 13 14 117 16 \n", + "000004: 17 18 19 20 \n", + "My updated value for (3,2): 117\n" + ] + }, + { + "data": { + "text/plain": [ + "(std::basic_ostream<char, std::char_traits<char> >::__ostream_type &) @0x7f5eafc34540\n" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t2(0,0) = 42;\n", + "t2(3,2) = 117;\n", + "\n", + "matx::print(t2);\n", + "\n", + "std::cout << \"My updated value for (3,2): \" << t2(3,2) << std::endl;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "\n", "**NOTE** The syntax above is executed on the host. This works for any type of memory accessible by the host, but will result in a segmentation fault if the tensor is backed by device memory.\n", "\n", @@ -177,32 +316,7 @@ "print(t2);\n", "``` \n", "\n", - "`print` is a utility function to print a tensor or operator's contents to stdout. Printing can be used with any type of operator, including ones that have no memory backing them (see upcoming generators section). If a tensor is being printed, the data backing it can reside either on the host or device, and MatX will move it before printing if needed. With no arguments `print` will print the entire contents of the tensor. However, the size of the printing can also be limited by passing a limit to each dimension. For example, `print(3,2)` would print the first 2 columns and 3 rows of the 2D tensor. The contents of the tensor printed should appear as an increasing sequence of numbers from the top to bottom rows.\n", - "\n", - "Open the file [exercises/example1_init.cu](exercises/example1_init.cu) and edit the contents where you see TODO markers." ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_init" ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1 2 3 4 \n", - "000001: 5 6 7 8 \n", - "000002: 9 10 11 12 \n", - "000003: 13 14 15 16 \n", - "000004: 17 18 19 20 \n", - "```" + "`print` is a utility function to print a tensor or operator's contents to stdout. Printing can be used with any type of operator, including ones that have no memory backing them (see upcoming generators section). If a tensor is being printed, the data backing it can reside either on the host or device, and MatX will move it before printing if needed. With no arguments `print` will print the entire contents of the tensor. However, the size of the printing can also be limited by passing a limit to each dimension. For example, `print(3,2)` would print the first 2 columns and 3 rows of the 2D tensor. The contents of the tensor printed should appear as an increasing sequence of numbers from the top to bottom rows." ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### 3. 
Permute\n", - "The next section calls `permute` on the returned view:\n", - "\n", - "```c++\n", - "t2p = permute(t2, {1,0});\n", - "print(t2p);\n", - "```\n", - "\n", - "`permute` returns a view of the data with the dimensions swapped to match the order of the initializer list argument. In this case there are only two dimensions being permuted on a 2D tensor, so it's equivalent to a matrix transpose. However, `permute` can be used on higher-order tensors with the dimensions swapped in any particular order. Like printing, `permute` can work on any type of operator as input and not just tensors backed by memory. Observe the data and size of the tensor is now transposed when using this view:\n", - "\n", - "![Permuted/Transposed 2D Tensor](img/dli-transpose.png)\n", - "\n", - "Open the file [exercises/example1_permute.cu](exercises/example1_permute.cu) and edit the contents where you see TODO markers." + "The next section calls `permute` on the tensor view:" ] }, { "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[4, 5], Strides:[1,4]\n", + "000000: 42 5 9 13 17 \n", + "000001: 2 6 10 14 18 \n", + "000002: 3 7 11 117 19 \n", + "000003: 4 8 12 16 20 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t2p = matx::permute(t2, {1,0});\n", + "matx::print(t2p);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`permute` returns a view of the data with the dimensions swapped to match the order of the initializer list argument. In this case there are only two dimensions being permuted on a 2D tensor, so it's equivalent to a matrix transpose. However, `permute` can be used on higher-order tensors with the dimensions swapped in any particular order. Like printing, `permute` can work on any type of operator as input and not just tensors backed by memory. Observe the data and size of the tensor is now transposed when using this view:\n", "\n", "![Permuted/Transposed 2D Tensor](img/dli-transpose.png)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### 4. Slice\n", - "The next line takes a slice of the 2D tensor by selecting a subset of data in both dimensions:\n", - "\n", - "```c++\n", - "auto t2s = slice(t2, {1,1}, {3, 3});\n", - "```\n", - "\n", - "`t2s` is now a view of the same data, but starting at index 1 and ending at index 3 (exclusive) on both dimensions. This is equivalent to Python using `t2[1:3, 1:3]`. 
Since a new sliced view is returned, the new view will have dimensions `{2, 2}`.\n", - "\n", - "![2D Slice](img/dli-slice.png)\n", - "\n", - " Open the file [exercises/example1_simple_slice.cu](exercises/example1_simple_slice.cu) and edit the contents where you see TODO markers.\n", - "\n", - "\n", - "\n" + "The next line takes a slice of the 2D tensor by selecting a subset of data in both dimensions:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_simple_slice" + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[2, 2], Strides:[4,1]\n", + "000000: 6 7 \n", + "000001: 10 11 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t2s = matx::slice(t2, {1,1}, {3, 3});\n", + "\n", + "matx::print(t2s);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 6 7 \n", - "000001: 10 11 \n", - "```" + "`t2s` is now a view of the same data, but starting at index 1 and ending at index 3 (exclusive) on both dimensions. This is equivalent to Python using `t2[1:3, 1:3]`. Since a new sliced view is returned, the new view will have dimensions `{2, 2}`.\n", + "\n", + "![2D Slice](img/dli-slice.png)" ] }, { @@ -302,40 +432,50 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The next line shows a variant of `slice` that can reduce the dimension of an operator:\n", - "\n", - "```c++\n", - "auto t1 = slice<1>(t2, {0, 1}, {matxEnd, matxDropDim});\n", - "```\n", - "\n", - "Using this form of `slice` requires a template argument with the rank of the new slice. The second parameter to `slice` takes the starting index for each dimension, while the third takes the ending index. To include all values from the beginning on, a special sentinel of `matxEnd` can be used. Similarly, `matxDropDim` is used to indicate this dimension is the one being sliced (i.e. removed). In this case we are slicing the second column of the tensor and all rows, which produces a new 1D tensor containing only the second column of the original tensor. This is equivalent to `t2[:,1]` in Python. \n", - "\n", - "![Column Slice](img/dli-slice_col.png)\n", - "\n", - "Open the file [exercises/example1_adv_slice_col.cu](exercises/example1_adv_slice_col.cu) and edit the contents where you see TODO markers." 
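+ "A slice can also skip elements with an optional stride argument. As a sketch (this uses the three-argument `slice` overload from the MatX docs and is not part of the original exercise), every other row of `t2` could be taken with:\n",
+ "\n",
+ "```c++\n",
+ "// start {0,0}, end {matxEnd, matxEnd}, stride 2 in rows and 1 in columns\n",
+ "auto t2every2 = matx::slice(t2, {0, 0}, {matx::matxEnd, matx::matxEnd}, {2, 1});\n",
+ "```\n",
+ "\n",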
+ "The next line shows a variant of `slice` that can reduce the dimension of an operator:" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_adv_slice_col" + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_1_i32: Tensor{int32_t} Rank: 1, Sizes:[5], Strides:[4]\n", + "000000: 2 \n", + "000001: 6 \n", + "000002: 10 \n", + "000003: 14 \n", + "000004: 18 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t1Col = matx::slice<1>(t2, {0, 1}, {matx::matxEnd, matx::matxDropDim});\n", + "\n", + "matx::print(t1Col);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 2 \n", - "000001: 6 \n", - "000002: 10 \n", - "000003: 14 \n", - "000004: 18 \n", - "```" + "Using this form of `slice` requires a template argument with the rank of the new slice. The second parameter to `slice` takes the starting index for each dimension, while the third takes the ending index. To include all values from the beginning on, a special sentinel of `matxEnd` can be used. Similarly, `matxDropDim` is used to indicate this dimension is the one being sliced (i.e. removed). In this case we are slicing the second column of the tensor and all rows, which produces a new 1D tensor containing only the second column of the original tensor. This is equivalent to `t2[:,1]` in Python. \n", + "\n", + "![Column Slice](img/dli-slice_col.png)" ] }, { @@ -343,37 +483,49 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Instead of slicing a single column, we can also slice a single row:\n", - "\n", - "```c++\n", - "auto t1 = slice<1>(t2, {1, 0}, {matxDropDim, matxEnd});\n", - "```\n", - "\n", - "![Row Slice](img/dli-slice_row.png)\n", - "\n", - "Open the file [exercises/example1_adv_slice_row.cu](exercises/example1_adv_slice_row.cu) and edit the contents where you see TODO markers.\n" + "Instead of slicing a single column, we can also slice a single row:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example1_adv_slice_row" + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_1_i32: Tensor{int32_t} Rank: 1, Sizes:[4], Strides:[1]\n", + "000000: 5 \n", + "000001: 6 \n", + "000002: 7 \n", + "000003: 8 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t1Row = matx::slice<1>(t2, {1, 0}, {matx::matxDropDim, matx::matxEnd});\n", + "\n", + "matx::print(t1Row);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 5 \n", - "000001: 6 \n", - "000002: 7 \n", - "000003: 8 \n", - "```" + "![Row Slice](img/dli-slice_row.png)\n", + "\n", + "\n" ] }, { @@ -385,93 +537,154 @@ "Note that since we reduced the dimension to a 1D tensor in both cases, printing a 1D tensor (vector) will appear the same in the direction the values are printed.\n", "\n", "### 5. 
Clone\n", - "The last line shows `clone`, which replicates a operator's dimensions into a higher-rank operator:\n", - "```c++\n", - "auto t2c = clone<2>(t1, {5, matxKeepDim});\n", - "```\n", - "\n", - "`clone` is used on a 1D tensor from the output of the previous example, and replicates the data of the `t1` vector into a 2D tensor with 5 rows where all rows match the data in `t1`. Cloning does not replicate the data in memory; instead, the same elements in `t1` are accessed repeatedly when different rows are accessed. This not only saves memory, but also benefits from the caches in the GPU by not hitting different addresses in memory for the same value. \n", - "\n", - "In this case `clone` was being used on a 1D view from a 2D tensor data set, but similar code works on taking any dimension tensor and increasing it to a higher dimension. The increase in dimensions is not restricted to one. For example, a scalar (0D tensor) can be cloned into a 4F tensor where a single value in memory would appear as a 4D tensor.\n", - "\n", - "![Permuted/Transposed 2D Tensor](img/dli-clone.png)\n", - "\n", - "Open the file [exercises/example1_clone.cu](exercises/example1_clone.cu) and edit the first TODO." + "The last line shows `clone`, which replicates an operator's dimensions into a higher-rank operator:" ] }, { "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[5, 4], Strides:[0,1]\n", + "000000: 5 6 7 8 \n", + "000001: 5 6 7 8 \n", + "000002: 5 6 7 8 \n", + "000003: 5 6 7 8 \n", + "000004: 5 6 7 8 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "auto t2c_rows = matx::clone<2>(t1Row, {5, matx::matxKeepDim});\n", + "\n", + "matx::print(t2c_rows);\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`clone` is used on a 1D tensor from the output of the previous example, and replicates the data of the `t1Row` vector into a 2D tensor with 5 rows where all rows match the data in `t1Row`. Cloning does not replicate the data in memory; instead, the same elements in `t1Row` are accessed repeatedly when different rows are accessed. This not only saves memory, but also benefits from the caches in the GPU by not hitting different addresses in memory for the same value. \n", "\n", "In this case `clone` was used on a 1D view from a 2D tensor data set, but similar code can take a tensor of any rank and clone it to a higher rank. The increase in dimensions is not restricted to one. For example, a scalar (0D tensor) can be cloned into a 4D tensor where a single value in memory would appear as a 4D tensor.\n", "\n", "![Row Clone](img/dli-clone.png)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "By changing which dimension is cloned, we can also take the same 1D tensor across columns. 
Edit the last file and clones across columns instead, and print the output of the cloned view.\n", + "By changing which dimension is cloned, we can also take the same 1D tensor across columns.\n", "\n", "![Column Clone](img/dli-clone-col.png)" ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1 1 1 1 1 \n", - "000001: 2 2 2 2 2 \n", - "000002: 3 3 3 3 3 \n", - "000003: 4 4 4 4 4 \n", - "```" + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[4, 5], Strides:[1,0]\n", + "000000: 5 5 5 5 5 \n", + "000001: 6 6 6 6 6 \n", + "000002: 7 7 7 7 7 \n", + "000003: 8 8 8 8 8 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "auto t2c_columns = matx::clone<2>(t1Row, {matx::matxKeepDim, 5});\n", + "\n", + "matx::print(t2c_columns);" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As mentioned at the beginning, views do not modify the underlying data; they simply provide the metadata needed to access the elements. To show this, we will modify `t2(1,0)`, which corresponds to the first value of our 1D slice, and watch how multiple elements of the cloned view are modified." ] }, { "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[5, 4], Strides:[4,1]\n", + "000000: 42 2 3 4 \n", + "000001: 10 6 7 8 \n", + "000002: 9 10 11 12 \n", + "000003: 13 14 117 16 \n", + "000004: 17 18 19 20 \n", + "tensor_1_i32: Tensor{int32_t} Rank: 1, Sizes:[4], Strides:[1]\n", + "000000: 10 \n", + "000001: 6 \n", + "000002: 7 \n", + "000003: 8 \n", + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[5, 4], Strides:[0,1]\n", + "000000: 10 6 7 8 \n", + "000001: 10 6 7 8 \n", + "000002: 10 6 7 8 \n", + "000003: 10 6 7 8 \n", + "000004: 10 6 7 8 \n", + "tensor_2_i32: Tensor{int32_t} Rank: 2, Sizes:[4, 5], Strides:[1,0]\n", + "000000: 10 10 10 10 10 \n", + "000001: 6 6 6 6 6 \n", + "000002: 7 7 7 7 7 \n", + "000003: 8 8 8 8 8 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x7f5e43dfec30\n" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t2(1,0) = 10;\n", + "matx::print(t2);\n", + "matx::print(t1Row);\n", + "matx::print(t2c_rows);\n", + "matx::print(t2c_columns);" ] }, { ] } ], "metadata": { "interpreter": { "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "display_name": 
"C++17", + "language": "C++", + "name": "cling-cpp17" }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.5" + "codemirror_mode": "c++", + "file_extension": ".c++", + "mimetype": "text/x-c++src", + "name": "c++" } }, "nbformat": 4, diff --git a/docs_input/notebooks/02_operators.ipynb b/docs_input/notebooks/02_operators.ipynb index 1c085dfd..222a782e 100644 --- a/docs_input/notebooks/02_operators.ipynb +++ b/docs_input/notebooks/02_operators.ipynb @@ -35,56 +35,125 @@ "The last topic in this exercise will cover MatX generators. MatX generators are an operator that can dynamically generate data from a formula without storing the interim values. For example, the values an identity matrix or a Hamming window can both be generated on-the-fly only by knowing the index of the value. Generators typically only take a Shape as input since their output is generated without input data." ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "//todo this should be moved to a hidden init block that runs automatically when the notebook starts\n", + "#pragma cling add_library_path(\"/usr/local/cuda/lib64\")\n", + "#pragma cling add_library_path(\"/opt/xeus/cling/lib\")\n", + "//#pragma cling add_library_path(\"/usr/Lib/gcc/x86_64-Linux-gnu/11/\")\n", + "#pragma cling add_library_path(\"/usr/lib/x86_64-linux-gnu/openblas64-openmp/\")\n", + "#pragma cling add_include_path(\"/usr/local/cuda/include\")\n", + "#pragma cling add_include_path(\"/usr/include/x86_64-linux-gnu/openblas64-openmp\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/include\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/build/_deps/cccl-src/libcudacxx/include\")\n", + "//#pragma cling load(\"libgomp\")\n", + "#pragma cling load(\"libopenblas64\")\n", + "#pragma cling load(\"libcuda\")\n", + "#pragma cling load(\"libcudart\")\n", + "#pragma cling load(\"libcurand\")\n", + "#pragma cling load(\"libcublas\")\n", + "#pragma cling load(\"libcublasLt\")\n", + "\n", + "#include \n", + "#include \n", + "\n", + "#define MATX_EN_OPENBLAS\n", + "#define MATX_EN_OPENBLAS_LAPACK\n", + "#define MATX_OPENBLAS_64BITINT\n", + "\n", + "#include \"matx.h\"" + ] + }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Initialization\n", - "As in the previous example, we need to declare tensors and initialize the data:\n", - "\n", - "```c++\n", - " auto A = make_tensor({2, 3});\n", - " auto B = make_tensor({2, 3});\n", - " auto C = make_tensor({2, 3});\n", - " auto V = make_tensor({3});\n", - "``` \n", - "\n", - "We create a single tensor Shape type that's used in multiple tensor types so that we don't have to repeat the size. After this code is executed, four data objects are created, and managed memory is allocated to account for the shape and type of each tensor. 
Next, the input tensor Views (`A` and `V`) are initiailized with an increasing data pattern:\n", - "\n", - "```c++\n", - " A.SetVals({ {1, 2, 3},\n", - " {4, 5, 6}});\n", - " \n", - " V.SetVals({7, 8, 9});\n", - "``` \n", - "\n", - "Open the file [exercises/example2_init.cu](exercises/example2_init.cu) and edit the contents where you see TODO markers." + "As in the previous example, we need to declare tensors and initialize the data:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example2_init" + "auto A = matx::make_tensor<float>({2, 3});\n", + "auto B = matx::make_tensor<float>({2, 3});\n", + "auto C = matx::make_tensor<float>({2, 3});\n", + "auto V = matx::make_tensor<float>({3});\n", + "auto E = matx::make_tensor<float>({8,8});\n", + "auto H = matx::make_tensor<float>({10});" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1.0000 2.0000 3.0000 \n", - "000001: 4.0000 5.0000 6.0000 \n", "\n", + "After this code is executed, six data objects are created, and managed memory is allocated to account for the shape and type of each tensor. Next, the input tensor Views (`A` and `V`) are initialized with an increasing data pattern:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 1.0000e+00 2.0000e+00 3.0000e+00 \n", + "000001: 4.0000e+00 5.0000e+00 6.0000e+00 \n", + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[3], Strides:[1]\n", + "000000: 7.0000e+00 \n", + "000001: 8.0000e+00 \n", + "000002: 9.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "A.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}\n", + " });\n", + " \n", + "V.SetVals({7, 8, 9});\n", + "\n", + "matx::print(A);\n", + "matx::print(V);" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Element-wise Scalar Addition\n", "For the first operator example, we add a scalar onto a tensor and assign it to another tensor. 
This can be thought of as tensor addition where the second tensor is an equally sized tensor filled with the scalar value. To make the separation of operators from executors explicit, we first create the operator `op` by using MatX's lazy assignment operator `=`. The statement on the right hand side can be read as \"Add the number 5 to operator A, and assign the result to tensor B\". Instantiating variable `op` generates a CUDA kernel that can then be executed with the `run()` method:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 6.0000e+00 7.0000e+00 8.0000e+00 \n", + "000001: 9.0000e+00 1.0000e+01 1.1000e+01 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example2_scalar" + "auto op = (B = A + 5);\n", + "op.run();\n", + "matx::print(B);\n", + "\n", + "matx::print((B = A + 5)); ///\todo remove after run is fixed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1.0000 2.0000 3.0000 \n", - "000001: 4.0000 5.0000 6.0000 \n", - "\n", - "000000: 2.0000 3.0000 4.0000 \n", - "000001: 5.0000 6.0000 7.0000 \n", - "```" + "The `run()` function takes an optional executor to determine what accelerator is used to perform the operation. When no argument is specified, the default executor is the CUDA default stream." ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Element-wise Tensor Addition\n", "The next section adds two tensors together element-wise. Just like with a scalar, the `+` operator works on two tensors. 
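The same pattern extends to the other arithmetic operators; as a sketch (this line is not part of the original example):\n",
    "\n",
    "```c++\n",
    "(C = A * B - A).run(); // element-wise multiply and subtract, fused into one kernel\n",
    "```\n",
    "\n",
    "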
Instead of creating a separate operator variable, this example shows how to create and execute an operator in a single line:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_tensor_add" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 8.0000e+00 1.0000e+01 1.2000e+01 \n", + "000001: 1.4000e+01 1.6000e+01 1.8000e+01 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Expected output:\n", - "```sh\n", - "000000: 1.0000 2.0000 3.0000 \n", - "000001: 4.0000 5.0000 6.0000 \n", + "A.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}});\n", + "\n", + "B.SetVals({ {7, 8, 9},\n", + " {10, 11, 12}});\n", + "\n", + "(C = A + B).run();\n", "\n", - "000000: 7.0000 8.0000 9.0000 \n", - "000001: 10.0000 11.0000 12.0000 \n", + "matx::print(C);\n", "\n", - "000000: 8.0000 10.0000 12.0000 \n", - "000001: 14.0000 16.0000 18.0000 \n", - "```" + "matx::print(C = A + B); ///\\todo remove after run is fixed" ] }, { @@ -182,39 +265,53 @@ "metadata": {}, "source": [ "## Element-wise Tensor Division\n", - "The division operator `/` can also be used on two tensors. In this example we reuse the `C` tensor from the last example and divide each element by 2:\n", - "\n", - "```c++\n", - " C.SetVals({ {7, 8, 9},\n", - " {10, 11, 12}});\n", - "\n", - " (C = C / 2).run(); \n", - "```\n", - "\n", - "With division, the usual C semantics apply - if the tensor type is an integral type, the results are rounded down. If the type is floating point, floating point division is performed. In this case we are using `float` types, so floating point division will occur.\n", - "\n", - "Open the file [exercises/example2_tensor_div.cu](exercises/example2_tensor_div.cu) and edit the contents where you see TODO markers." + "The division operator `/` can also be used on two tensors, or any scalar type that is compatible with the tensor's data. 
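For instance, a float tensor divided by a double literal promotes just as in ordinary C++ (a hypothetical variation, not part of the original example):\n",
    "\n",
    "```c++\n",
    "(C = C / 2.0).run(); // double scalar with a float tensor, promoted element-wise\n",
    "```\n",
    "\n",
    "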
In this example we reuse the `C` tensor from the last example and divide each element by 2:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_tensor_div" + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 7.0000e+00 8.0000e+00 9.0000e+00 \n", + "000001: 1.0000e+01 1.1000e+01 1.2000e+01 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 3.5000e+00 4.0000e+00 4.5000e+00 \n", + "000001: 5.0000e+00 5.5000e+00 6.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "C.SetVals({ {7, 8, 9},\n", + " {10, 11, 12}});\n", + "\n", + "(C = C / 2).run(); \n", + "\n", + "matx::print(C);\n", + "\n", + "matx::print(C = C / 2); ///\todo remove after run is fixed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 3.5000 4.0000 4.5000 \n", - "000001: 5.0000 5.5000 6.0000 \n", - "```" + "With division, the usual C semantics apply - if the tensor type is an integral type, the results are rounded down. If the type is floating point, floating point division is performed. In this case we are using `float` types, so floating point division will occur." ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Broadcasted Tensor Addition\n", - "Binary operators can be used on tensors of different ranks. In this section, we add a 1D tensor `M` onto a 2D tensor `C`. Unlike previous examples, the result is stored in the same tensor `C`, which is safe since the operation is element-wise and each thread runs independent of others. When operating on tensors of different ranks, the outer dimensions of both tensors must match. The tensor with the lower rank will be broadcasted on the higher dimensions when the operation is executing.\n", - "\n", - "```c++\n", - " A.SetVals({ {1, 2, 3},\n", - " {4, 5, 6}});\n", - " \n", - " V.SetVals({7, 8, 9});\n", - "\n", - " (C = C + M).run();\n", - "```\n", - "\n", - "The result of the operation will be `M` repeatedly added to all rows of `C`.\n", - "\n", - "Open the file [exercises/example2_mixed_rank.cu](exercises/example2_mixed_rank.cu) and edit the contents where you see TODO markers." + "Binary operators can be used on tensors of different ranks. In this section, we add a 1D tensor `V` onto a 2D tensor `C`. Unlike previous examples, the result is stored in the same tensor `C`, which is safe since the operation is element-wise and each thread runs independently of the others. When operating on tensors of different ranks, the outer dimensions of both tensors must match. The tensor with the lower rank will be broadcasted on the higher dimensions when the operation is executing.",
] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_mixed_rank" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", + "execution_count": 7, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 1.0000e+00 2.0000e+00 3.0000e+00 \n", + "000001: 4.0000e+00 5.0000e+00 6.0000e+00 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 8.0000e+00 1.0000e+01 1.2000e+01 \n", + "000001: 1.1000e+01 1.3000e+01 1.5000e+01 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Expected output:\n", - "```sh\n", - "000000: 8.0000 10.0000 12.0000 \n", - "000001: 11.0000 13.0000 15.0000 \n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Multiple Operators\n", - "Multiple operators can be combined in a single expression. The syntax is similar to using a high-level language like MATLAB where the order of operations is followed, and the final result is stored into the tensor on the left hand side of the lazy assignment operator `=`. Unlike most C++ libraries that use operator overloading for runtime expression parsing, MatX uses templates to parse the entire expression at compile-time. This removes all unnecessary interim loads and stores that would normally occur with the runtime approach. In this example, we combined 4 operators (three `+` and one `/`) in a single expression:\n", "\n", - "```c++\n", - " A.SetVals({ {1, 2, 3},\n", - " {4, 5, 6}});\n", - " \n", - " V.SetVals({7, 8, 9});\n", + "C.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}});\n", "\n", - " (C = (A + A + 1) / 2 + V).run();\n", - "```\n", + "V.SetVals({7, 8, 9});\n", "\n", - "Open the file [exercises/example2_multiple_ops.cu](exercises/example2_multiple_ops.cu) and edit the contents where you see TODO markers." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_multiple_ops" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Expected output:\n", - "```sh\n", - "000000: 8.5000 10.5000 12.5000 \n", - "000001: 11.5000 13.5000 15.5000\n", - "```" + "(C = C + V).run();\n", + "\n", + "matx::print(C);\n", + "\n", + "matx::print(C = C + V); ///\\todo remove after run is fixed" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Conditionals\n", - "Conditional statements operators are also available to take an action based on the value of an operator or tensor view. These actions can be anything from changing the computation to choosing where to store the data. In this example, we set the output of A based on whether the value in C is greater than 3. Note that `IFELSE` is an operator, and has the same `run()` method to execute the work as a standard expression.\n", "\n", - "```c++\n", - " IFELSE(C > 3, A = 1, A = 0).run();\n", - "```" + "The result of the operation will be `V` repeatedly added to all rows of `C`." ] }, { @@ -319,16 +378,49 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Open the file [exercises/example2_conditional.cu](exercises/example2_conditional.cu) and edit the contents where you see TODO markers." 
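+ "Broadcasting can also be written out explicitly with `clone` (a sketch, not part of the original example); the following is equivalent to the implicit broadcast above:\n",
+ "\n",
+ "```c++\n",
+ "// clone the rank-1 {3} operand up to the {2, 3} shape of C before adding\n",
+ "(C = C + matx::clone<2>(V, {2, matx::matxKeepDim})).run();\n",
+ "```\n",
+ "\n",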
+ "## Multiple Operators\n", + "Multiple operators can be combined in a single expression. The syntax is similar to using a high-level language like MATLAB where the order of operations is followed, and the final result is stored into the tensor on the left hand side of the lazy assignment operator `=`. Unlike most C++ libraries that use operator overloading for runtime expression parsing, MatX uses templates to parse the entire expression at compile-time. This removes all unnecessary interim loads and stores that would normally occur with the runtime approach. In this example, we combined 4 operators (three `+` and one `/`) in a single expression:" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 8.0000e+00 1.0000e+01 1.2000e+01 \n", + "000001: 1.1000e+01 1.3000e+01 1.5000e+01 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 8.5000e+00 1.0500e+01 1.2500e+01 \n", + "000001: 1.1500e+01 1.3500e+01 1.5500e+01 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example2_conditional" + "A.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}});\n", + "\n", + "V.SetVals({7, 8, 9});\n", + "\n", + "(C = (A + A + 1) / 2 + V).run();\n", + "\n", + "matx::print(C);\n", + "\n", + "matx::print((C = (A + A + 1) / 2 + V)); ///\\todo remove after run is fixed" ] }, { @@ -336,58 +428,54 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "000000: 0 0 0\n", - "000001: 1 1 1\n", - "```" + "## Conditionals\n", + "Conditional statements operators are also available to take an action based on the value of an operator or tensor view. These actions can be anything from changing the computation to choosing where to store the data. In this example, we set the output of A based on whether the value in C is greater than 3. Note that `IFELSE` is an operator, and has the same `run()` method to execute the work as a standard expression.\n" ] }, { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 1.0000e+00 2.0000e+00 3.0000e+00 \n", + "000001: 4.0000e+00 5.0000e+00 6.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "### Random Operator\n", - "The ``random`` operator provides a way to generate random numbers using various distrubtions. Random values can be useful for many applications, including generating noise in signal processing or initializing data for testing. In this example we take an existing tensor view (`A`) and populate it with random values from a normal distribution. 
Before setting the random values, we set all elements of `A` to zero to show the values change after randomizing.\n", - "\n", - "```c++\n", - "(A = 0).run()\n", - "(A = random<float>({4, 4}, NORMAL)).run();\n", - "```\n", - "\n", - "Open the file [exercises/example2_rand.cu](exercises/example2_rand.cu) and edit the contents where you see TODO markers." ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_rand" ] }, { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Expected output:\n", - "```sh\n", - "000000: -0.9247 -0.4253 -2.6438 0.1452\n", - "000001: -0.1209 -0.5797 -0.6229 -0.3284\n", - "000002: -1.0745 -0.3631 -1.6711 2.2655\n", - "000003: 0.3117 -0.1842 1.2866 1.1820\n", - "```" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ + "### Random Operator\n", + "The ``random`` operator provides a way to generate random numbers using various distributions. Random values can be useful for many applications, including generating noise in signal processing or initializing data for testing. In this example we take an existing tensor view (`A`) and populate it with random values from a normal distribution. 
Before setting the random values, we set all elements of `A` to zero to show the values change after randomizing.\n" ] }, { "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[2, 3], Strides:[3,1]\n", + "000000: 1.0000e+00 2.0000e+00 3.0000e+00 \n", + "000001: 4.0000e+00 5.0000e+00 6.0000e+00 \n", + "Operator{float} Rank: 2, Sizes:[2, 3]\n", + "000000: 4.2150e-41 4.2150e-41 4.2150e-41 \n", + "000001: 4.2150e-41 4.2150e-41 4.2150e-41 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "(A = 0).run();\n", + "\n", + "(A = matx::random<float>({2, 3}, matx::NORMAL)).run();\n", + "\n", + "matx::print(A);\n", + "\n", + "//broken output?\n", + "matx::print(matx::random<float>({2, 3}, matx::NORMAL)); ///\todo remove after run is fixed; broken anyway with no memory backing" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "In this example we store the current random values from the `random` operator into `A`. Instead of storing the random values in `A`, the `random` operator can be used directly in operator expressions, and each time it's used a different set of random values is generated." ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Generators\n", "Next, we introduce the concept of a generator by creating the identity matrix, scaling the values by `5`, and storing it in a tensor. MatX contains an `eye` operator for generating an identity matrix. Each time an element in the generator is accessed, `eye` simply returns a `1` for values in the diagonal, and `0` otherwise. 
Said differently, if the index for each rank is equal, the value is set to `1`. Since the goal is to have a diagonal matrix of fives, we multiply the generator by the scalar `5`. Because `eye` is a generator, the multiply and the identity matrix can be evaluated without storing any values. Since we're interested in seeing the results, we execute the operator and store it in the tensor `E`:\n" ] }, { "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 8], Strides:[8,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "Operator{int32_t} Rank: 2, Sizes:[8, 8]\n", + "000000: 5 0 0 0 0 0 0 0 \n", + "000001: 0 5 0 0 0 0 0 0 \n", + "000002: 0 0 5 0 0 0 0 0 \n", + "000003: 0 0 0 5 0 0 0 0 \n", + "000004: 0 0 0 0 5 0 0 0 \n", + "000005: 0 0 0 0 0 5 0 0 \n", + "000006: 0 0 0 0 0 0 5 0 \n", + "000007: 0 0 0 0 0 0 0 5 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "\n", + "(E = matx::eye({8, 8}) * 5).run();\n", + "\n", + "matx::print(E); \n", + "\n", + "matx::print(matx::eye({8, 8}) * 5); ///\todo remove after run is fixed" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "While `eye` is a fairly simple generator for creating ones on the diagonal, more complex generators exist for performing operations like windowing, or creating a linearly-spaced range of values. Below we use the `hamming` function to generate a Hamming window using the formula: $$ 0.54 - 0.46\cos\left(\frac{2{\pi}n}{N-1}\right) $$ where `n` is the sample number and `N` is the total number of samples. Since an array of sizes is passed into the generator, these two variables are computed at runtime and the size of the shape is used as the size of the Hamming window. The template argument on `hamming<0>` selects the axis the window is generated across, and other window functions, such as `hanning`, use the same convention:\n" ] }, { "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[10], Strides:[1]\n", + "000000: 0.0000e+00 \n", + "000001: 0.0000e+00 \n", + "000002: 0.0000e+00 \n", + "000003: 0.0000e+00 \n", + "000004: 0.0000e+00 \n", + "000005: 0.0000e+00 \n", + "000006: 0.0000e+00 \n", + "000007: 0.0000e+00 \n", + "000008: 0.0000e+00 \n", + "000009: 0.0000e+00 \n", + "Operator{float} Rank: 1, Sizes:[10]\n", + "000000: 8.0000e-02 \n", + "000001: 1.8762e-01 \n", + "000002: 4.6012e-01 \n", + "000003: 7.7000e-01 \n", + "000004: 9.7226e-01 \n", + "000005: 9.7226e-01 \n", + "000006: 7.7000e-01 \n", + "000007: 4.6012e-01 \n", + "000008: 1.8762e-01 \n", + "000009: 8.0000e-02 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "(H = matx::hamming<0>(H.Shape())).run();\n", + "\n", + "matx::print(H);\n", + "\n", + "matx::print(matx::hamming<0>(H.Shape())); ///\todo remove after run is fixed" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Visualizing Data\n", - "MatX uses PyBind and Plotly as a visualization backend of tensor views. Basic plots are supported, such as line, bar, scatter, and contour. Using the Hamming window data from the previous example, we can visualize what the window looks like:\n", - "\n", - "```c++\n", - "viz::line(B, \"Hamming Window\", \"Sample\", \"Amplitude\", \"hamming.html\");\n", - "```\n", - "\n", - "Visualizations in MatX are under the `matx::viz` nested namespace. The string parameters above give the plot a title, X axis name, and Y axis name, respectively. The last parameter is an optional filename to output the plot to. If the code was running natively in this notebook the plot would appear here, but since this is a compiled program, we output to a separate html file that you can open in your file tree. 
{ @@ -479,31 +597,62 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Visualizing Data\n", - "MatX uses PyBind and Plotly as a visualization backend of tensor views. Basic plots are supported, such as line, bar, scatter, and contour. Using the Hamming window data from the previous example, we can visualize what the window looks like:\n", - "\n", - "```c++\n", - "viz::line(B, \"Hamming Window\", \"Sample\", \"Amplitude\", \"hamming.html\");\n", - "```\n", - "\n", - "Visualizations in MatX are under the `matx::viz` nested namespace. The string parameters above give the plot a title, X axis name, and Y axis name, respectively. The last parameter is an optional filename to output the plot to. If the code was running natively in this notebook the plot would appear here, but since this is a compiled program, we output to a separate html file that you can open in your file tree. Note that since this is a standard Plotly plot, all the functionality such as zooming and panning are present." + "While `eye` is a fairly simple generator for creating ones on the diagonal, more complex generators exist for performing operations like windowing, or creating a linearly-spaced range of values. Below we use the `hamming` function to generate a Hamming window using the formula: $$ 0.54 - 0.46\\cos(\\frac{2{\\pi}n}{N}) $$ where `n` is the sample number and `N` is the total number of samples. Since an array of sizes is passed into the generator, these two variables are computed at runtime and the size of the shape is used as the size of the Hamming window. The template parameter on `hamming<0>` selects which axis the window is generated across, and any of the four possible axes can be used. Other window functions, such as `hanning`, use the same convention:\n" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example2_viz\n", - "\n", - "# Display plot\n", - "from IPython.display import IFrame\n", - "IFrame(src='./hamming.html', width=700, height=600)" + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[10], Strides:[1]\n", + "000000: 0.0000e+00 \n", + "000001: 0.0000e+00 \n", + "000002: 0.0000e+00 \n", + "000003: 0.0000e+00 \n", + "000004: 0.0000e+00 \n", + "000005: 0.0000e+00 \n", + "000006: 0.0000e+00 \n", + "000007: 0.0000e+00 \n", + "000008: 0.0000e+00 \n", + "000009: 0.0000e+00 \n", + "Operator{float} Rank: 1, Sizes:[10]\n", + "000000: 8.0000e-02 \n", + "000001: 1.8762e-01 \n", + "000002: 4.6012e-01 \n", + "000003: 7.7000e-01 \n", + "000004: 9.7226e-01 \n", + "000005: 9.7226e-01 \n", + "000006: 7.7000e-01 \n", + "000007: 4.6012e-01 \n", + "000008: 1.8762e-01 \n", + "000009: 8.0000e-02 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x757f1fdfec30\n" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(H = matx::hamming<0>(H.Shape())).run();\n", + "\n", + "matx::print(H);\n", + "\n", + "matx::print(matx::hamming<0>(H.Shape())); ///\todo remove once printing an unexecuted operator is fixed" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -514,25 +663,16 @@ } ], "metadata": { - "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" - }, "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "display_name": "C++17", + "language": "C++", + "name": "cling-cpp17" }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" + "codemirror_mode": "c++", + "file_extension": ".c++", + "mimetype": "text/x-c++src", + "name": "c++" } }, "nbformat": 4, diff --git a/docs_input/notebooks/03_transforms.ipynb b/docs_input/notebooks/03_transforms.ipynb index e148df73..982f7aaa 100644 --- a/docs_input/notebooks/03_transforms.ipynb +++ b/docs_input/notebooks/03_transforms.ipynb @@ -33,6 +33,49 @@ "Some executors use CUDA libraries to implement their functionality, and those libraries require either a handle or a plan to operate. MatX hides this complexity by creating and caching the plan on the first call, and using the same plan on future calls where possible. More advanced users may use the handle interface directly to avoid the caching. Only the caching interface will be covered in this tutorial since it's the recommended approach, but the non-cached version can be found in the documentation." ] },
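As a sketch of what that caching means in practice (the shape and fill are illustrative, and the comments simply restate the caching behavior described above rather than a measured guarantee):

```c++
auto x = matx::make_tensor<cuda::std::complex<float>>({1024});
// ... fill x with data ...

(x = matx::fft(x)).run();  // first call: the FFT plan is created and cached
(x = matx::fft(x)).run();  // same size/type/stride: the cached plan is reused
```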
{ + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "//todo this should be moved to a hidden init block that runs automatically when the notebook starts\n", + "#pragma cling add_library_path(\"/usr/local/cuda/lib64\")\n", + "#pragma cling add_library_path(\"/opt/xeus/cling/lib\")\n", + "//#pragma cling add_library_path(\"/usr/Lib/gcc/x86_64-Linux-gnu/11/\")\n", + "#pragma cling add_library_path(\"/usr/lib/x86_64-linux-gnu/openblas64-openmp/\")\n", + "#pragma cling add_include_path(\"/usr/local/cuda/include\")\n", + "#pragma cling add_include_path(\"/usr/include/x86_64-linux-gnu/openblas64-openmp\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/include\")\n", + "#pragma cling add_include_path(\"/opt/xeus/cling/tools/Jupyter/kernel/MatX/build/_deps/cccl-src/libcudacxx/include\")\n", + "//#pragma cling load(\"libgomp\")\n", + "#pragma cling load(\"libopenblas64\")\n", + "#pragma cling load(\"libcuda\")\n", + "#pragma cling load(\"libcudart\")\n", + "#pragma cling load(\"libcurand\")\n", + "#pragma cling load(\"libcublas\")\n", + "#pragma cling load(\"libcublasLt\")\n", + "#pragma cling load(\"libcufft\")\n", + "\n", + "#include \n", + "#include \n", + "\n", + "#define MATX_EN_OPENBLAS\n", + "#define MATX_EN_OPENBLAS_LAPACK\n", + "#define MATX_OPENBLAS_64BITINT\n", + "\n", + "#include \"matx.h\"" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ @@ -43,56 +86,75 @@ "\n", "We use rectangular matrices for `A` and `B`, while `C` will be a square matrix due to the outer dimensions of `A` and `B` matching. \n", "\n", - "```c++\n", - "(A = random({8, 4}, NORMAL)).run(); \n", - "(B = random({4, 8}, NORMAL)).run(); \n", - "\n", - "matmul(C, A, B);\n", - "``` \n", "\n", "Edit the contents of the cell below where you see TODO markers."
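For reference, once the TODO is filled in, a hypothetical host-side check of the result could look like the sketch below. It assumes the tensors live in host-accessible managed memory and that the device has been synchronized first; the tolerance is arbitrary:

```c++
// C(i, j) should equal the dot product of row i of A with column j of B.
cudaDeviceSynchronize();  // ensure the GEMM has finished before reading on the host
for (matx::index_t i = 0; i < C.Size(0); i++) {
  for (matx::index_t j = 0; j < C.Size(1); j++) {
    float ref = 0.0f;
    for (matx::index_t k = 0; k < A.Size(1); k++) {
      ref += A(i, k) * B(k, j);
    }
    if (fabsf(C(i, j) - ref) > 1e-4f) {
      printf("Mismatch at (%lld, %lld)\n", i, j);
    }
  }
}
```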
] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example3_gemm" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", + "execution_count": 2, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 4], Strides:[4,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "B:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[4, 8], Strides:[8,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "C:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 8], Strides:[8,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x78e507dfec30\n" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Expected output:\n", - "```sh\n", - "A:\n", - "000000: -0.9247 -0.4253 -2.6438 0.1452\n", - "000001: -0.1209 -0.5797 -0.6229 -0.3284\n", - "000002: -1.0745 -0.3631 -1.6711 2.2655\n", - "000003: 0.3117 -0.1842 1.2866 1.1820\n", - "000004: -0.1271 1.2169 1.4353 1.0605\n", - "000005: -0.4941 -1.4244 -0.7244 -1.2973\n", - "000006: 0.0697 -0.0074 1.8969 0.6878\n", - "000007: -0.0779 -0.8373 1.3506 -0.2879\n", - "B:\n", - "000000: 0.9911 1.0676 -0.6272 0.3202 -0.3110 -0.3441 -1.1709 -0.5371\n", - "000001: 1.3390 -0.2401 1.2149 -0.2052 1.2999 0.2181 -1.2135 -1.3723\n", - "000002: -0.4635 -0.4089 -0.0032 0.2967 -0.3587 -1.0455 -0.0450 -0.0985\n", - "000003: 1.7608 0.9107 0.0288 -1.1128 0.0929 -0.1502 -0.9854 0.7889\n", - "C:\n", - "000000: -0.0050 0.3283 0.0760 -1.1547 0.6966 2.9677 1.5747 1.4554\n", - "000001: -1.1856 -0.0342 -0.6359 0.2609 -0.5231 0.6156 1.1966 0.6628\n", - "000002: 3.2124 1.6864 0.3035 -3.2863 0.6721 1.6973 -0.4584 3.0275\n", - "000003: 1.5472 0.9272 -0.3894 -0.7960 -0.6881 -1.6701 -1.3640 0.8911\n", - "000004: 2.7056 -0.0490 1.5840 
-1.0446 1.2051 -1.3507 -2.4374 -0.9065\n", - "000005: -4.3456 -1.0707 -1.4556 1.3628 -1.5586 0.8115 3.6179 1.2680\n", - "000006: 0.3910 -0.0732 -0.0391 -0.1788 -0.6479 -2.1121 -0.8357 0.3284\n", - "000007: -2.3314 -0.6966 -0.9810 0.8679 -1.5754 -1.5246 1.3302 0.8306\n", - "```" + " auto A = matx::make_tensor<float>({8, 4});\n", + " auto B = matx::make_tensor<float>({4, 8});\n", + " auto C = matx::make_tensor<float>({8, 8});\n", + "\n", + " (A = matx::random<float>({8, 4}, matx::NORMAL)).run(); \n", + " (B = matx::random<float>({4, 8}, matx::NORMAL)).run(); \n", + "\n", + " // TODO: Perform a GEMM of C = A*B\n", + " (C = matx::matmul(A, B)).run();\n", + " \n", + " printf(\"A:\\n\");\n", + " matx::print(A);\n", + " printf(\"B:\\n\");\n", + " matx::print(B); \n", + " printf(\"C:\\n\");\n", + " matx::print(C); \n" ] }, { @@ -103,26 +165,34 @@ "metadata": {}, "source": [ "### FFT\n", "MatX provides an interface to do both 1D Fast Fourier Transforms (FFTs) and 2D FFTs. Any tensor above rank 1 will be batched in a 1D FFT, and any tensor above rank 2 will be batched in a 2D FFT. FFTs may either be done in-place or out-of-place by using the same or different variables for the output and inputs. Since the tensors are strongly-typed, the type of FFT (C2C, R2C, etc) is inferred by the tensor type at compile time. Similarly, the input and output size of the executor is deduced by the type of transform, and the input/output tensors must match those sizes. There's one exception to this rule, and it's when the input FFT is to be zero-padded at the end. In this case, the input tensor can be shorter than the output tensor, and the input will be zero-padded to the length of the output tensor. This is a common tactic used in signal and image processing for both speed and FFT resolution.\n", "\n", - "In this example, we execute a 1D batched FFT on a 2D tensor populated with random complex floating point data. Since the FFT executor is performed in-place, the input and output types of the tensors are the same, and the type of the FFT is inferred as a complex-to-complex (`C2C`). The FFT length is specified by the inner dimension of the tensor, or 4 in this example, and the outer dimension is the number of batches, or 2. After the FFT completes, we perform on IFFT on the same tensor using the `ifft` interface. Ignoring floating point inaccuracies, the result of `ifft(fft(A))` should be the same as `A`, and this is shown by printing the tensors at each step. To perform a batched FFT on columns instead of rows, the tensor can be transposed by calling the `Permute` function used in the first tutorial. When the library detects a permuted tensor is being used, it can use technique to speed the FFT up over the naive method of converting the data in memory.\n", - "\n", - "```c++\n", - "C.print();\n", - "fft(C, C);\n", - "C.print();\n", - "ifft(C, C); \n", - "C.print();\n", - "```\n", - "\n", - "Open the file [exercises/example3_1dfft.cu](exercises/example3_1dfft.cu) and edit the contents where you see TODO markers." + "In this example, we execute a 1D batched FFT on a 2D tensor populated with random complex floating point data. Since the FFT executor is performed in-place, the input and output types of the tensors are the same, and the type of the FFT is inferred as a complex-to-complex (`C2C`). The FFT length is specified by the inner dimension of the tensor, or 4 in this example, and the outer dimension is the number of batches, or 2. After the FFT completes, we perform an IFFT on the same tensor using the `ifft` interface. Ignoring floating point inaccuracies, the result of `ifft(fft(A))` should be the same as `A`, and this is shown by printing the tensors at each step. To perform a batched FFT on columns instead of rows, the tensor can be transposed by calling the `Permute` function used in the first tutorial. When the library detects a permuted tensor is being used, it can use techniques to speed up the FFT over the naive method of converting the data in memory." ] },
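The cell that follows is mostly commented out while operator printing is being fixed; a sketch of the intended round trip (the complex element type is an assumption based on the C2C discussion above) looks like:

```c++
auto D = matx::make_tensor<cuda::std::complex<float>>({2, 4});
// ... fill D with data ...

(D = matx::fft(D)).run();   // batched 1D C2C FFT: 2 batches of length 4
(D = matx::ifft(D)).run();  // inverse FFT restores D up to floating point error
matx::print(D);
```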
{ "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example3_1dfft" + "auto D = matx::make_tensor<cuda::std::complex<float>>({2, 4});\n", + "\n", + "// (D = matx::random(D.Shape(), matx::NORMAL)).run();\n", + "// matx::print(D);\n", + "\n", + "// (D = matx::fft(D)).run();\n", + "// matx::print(D);\n", + "\n", + "// (D = matx::ifft(D)).run(); \n", + "// matx::print(D);" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "Initial C tensor:\n", - "000000: -0.9247+0.9911j -0.4253+1.0676j -2.6438-0.6272j 0.1452+0.3202j \n", - "000001: -0.1209-0.3110j -0.5797-0.3441j -0.6229-1.1709j -0.3284-0.5371j \n", - "After FFT:\n", - "000000: -3.8487+1.7517j 2.4666+2.1889j -3.2883-1.0238j 0.9718+1.0478j \n", - "000001: -1.6518-2.3630j 0.6950+1.1112j 0.1644-0.6007j 0.3090+0.6085j \n", - "After IFFT and normalization:\n", - "000000: -0.9247+0.9911j -0.4253+1.0676j -2.6438-0.6272j 0.1452+0.3202j \n", - "000001: -0.1209-0.3110j -0.5797-0.3441j -0.6229-1.1709j -0.3284-0.5371j \n", - "```" + "Next, we take the same 2D tensor and perform a 2D FFT on it. Since the rank is 2, it will not be batched as in the previous example. " ] }, { - "attachments": {}, - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 4, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(void) @0x78e507dfec30\n" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Next, we take the same 2D tensor and perform a 2D FFT on it. Since the rank is 2, it will not be batched as in the previous example. \n", - "\n", - "```c++\n", - "C.print();\n", - "fft2(C, C);\n", - "C.print();\n", - "ifft2(C, C); \n", - "C.print();\n", - "```\n", "\n", - "As before, the results after the IFFT closely match the original `C` tensor, but with floating point error.\n", "\n", - "Open the file [exercises/example3_2dfft.cu](exercises/example3_2dfft.cu) and edit the contents where you see TODO markers."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example3_2dfft" + "// (D = matx::ifft2(D)).run(); \n", + "// matx::print(D);" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "Intial C tensor:\n", - "000000: -0.9247+0.9911j -0.4253+1.0676j -2.6438-0.6272j 0.1452+0.3202j \n", - "000001: -0.1209-0.3110j -0.5797-0.3441j -0.6229-1.1709j -0.3284-0.5371j \n", - "After FFT:\n", - "000000: -2.0506+1.4036j -0.0405-0.0434j -2.6438-0.6272j 0.1452+0.3202j \n", - "000001: -2.0051+2.7593j -0.4662-0.5353j -0.6229-1.1709j -0.3284-0.5371j \n", - "After IFFT and normalization:\n", - "000000: -1.8493+1.9823j -0.8507+2.1352j -0.6610-0.1568j 0.0363+0.0800j \n", - "000001: -0.2417-0.6220j -1.1595-0.6882j -0.1557-0.2927j -0.0821-0.1343j \n", - "```" + "As before, the results after the IFFT closely match the original `C` tensor, but with floating point error." ] }, { @@ -203,23 +248,61 @@ "MatX provides a set of optimized primitives to perform reductions on tensors for many common types. Reductions are supported across individual dimensions or on entire tensors, depending on the size of the output tensor. Currently supported reduction functions are `sum`, `min`, `max`,` mean`, `any`, and `all`.\n", "\n", "#### Full Reduction\n", - "In this example we reduce an entire tensor to a single value by applying the reduction across all dimensions of the tensor. We apply the same random initialization from previous examples on a 2D tensor `A`. Note that the output tensor must be zeroed for a `sum` reduction since that value is continually added to during the reduction. Not initializing the output tensor will give undefined results since the variables are used as accumulators throughout the reduction. With the tensor initialized, we perform both a `max` and `sum` reduction across all dimensions of the tensor:\n", - "\n", - "```c++\n", - "max(MD0, A);\n", - "sum(AD0, A);\n", - "```\n", - "\n", - "Open the file [exercises/example3_full_reduce.cu](exercises/example3_full_reduce.cu) and edit the contents where you see TODO markers." + "In this example we reduce an entire tensor to a single value by applying the reduction across all dimensions of the tensor. We apply the same random initialization from previous examples on a 2D tensor `A`. Note that the output tensor must be zeroed for a `sum` reduction since that value is continually added to during the reduction. Not initializing the output tensor will give undefined results since the variables are used as accumulators throughout the reduction. 
With the tensor initialized, we perform both a `max` and `sum` reduction across all dimensions of the tensor:\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 4], Strides:[4,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "Max: 0.000000\n", + "Sum: 0.000000\n" + ] + }, + { + "data": { + "text/plain": [ + "(int) 14\n" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example3_full_reduce" + "\n", + " auto MD0 = matx::make_tensor<float>({});\n", + " auto AD0 = matx::make_tensor<float>({});\n", + "\n", + " (A = matx::random<float>(A.Shape(), matx::NORMAL)).run(); \n", + " \n", + " // Initialize max and sum to 0\n", + " (MD0 = 0).run();\n", + " (AD0 = 0).run();\n", + "\n", + " (MD0 = max(A)).run();\n", + " (AD0 = sum(A)).run();\n", + "\n", + " printf(\"A:\\n\");\n", + " matx::print(A);\n", + " printf(\"Max: %f\\n\", MD0());\n", + " printf(\"Sum: %f\\n\", AD0()); " ] },
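The other reductions listed above follow the same pattern. A short sketch (the output element types here are assumptions, not from the tutorial):

```c++
auto mean0 = matx::make_tensor<float>({});
auto any0  = matx::make_tensor<int>({});

(mean0 = matx::mean(A)).run();        // arithmetic mean over every element
(any0  = matx::any(A > 0.5f)).run();  // non-zero if any element exceeds 0.5
```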
{ @@ -227,68 +310,107 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "A:\n", - "000000: -0.9247 -0.4253 -2.6438 0.1452 -0.1209 \n", - "000001: -0.5797 -0.6229 -0.3284 -1.0745 -0.3631 \n", - "000002: -1.6711 2.2655 0.3117 -0.1842 1.2866 \n", - "000003: 1.1820 -0.1271 1.2169 1.4353 1.0605 \n", - "Max: 2.265505\n", - "Sum: -0.162026\n", - "```" + "#### Dimensional Reductions\n", + "Reductions can also be performed across certain dimensions instead of the whole tensor. Dimensional reductions are useful in situations where each row contains data for a different user, for example, and we wish to sum up each user's data. By setting the output tensor view to a 1D tensor, independent reductions can be performed across the input tensor where each output element corresponds to a single row reduction from the input. Using the same tensor `A` from the previous example, we change the output to a 1D tensor and pass the dimension to reduce over:\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(void) @0x78e507dfec30\n" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", " auto MD1 = matx::make_tensor<float>({A.Size(0)});\n", " auto AD1 = matx::make_tensor<float>({A.Size(0)});\n", "\n", " (A = matx::random<float>(A.Shape(), matx::NORMAL)).run(); \n", " \n", " // Initialize max and sum to 0\n", " (MD1 = 0).run();\n", " (AD1 = 0).run();\n", "\n", " (MD1 = max(A, {1})).run();\n", " (AD1 = sum(A, {1})).run();" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Printing the new reduction tensors shows the reduced values across each row of the input tensor `A`." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "A:\n", "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 4], Strides:[4,1]\n", "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", "Max:\n", "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[8], Strides:[1]\n", "000000: 0.0000e+00 \n", "000001: 0.0000e+00 \n", "000002: 0.0000e+00 \n", "000003: 0.0000e+00 \n", "000004: 0.0000e+00 \n", "000005: 0.0000e+00 \n", "000006: 0.0000e+00 \n", "000007: 0.0000e+00 \n", "Sum:\n", "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[8], Strides:[1]\n", "000000: 0.0000e+00 \n", "000001: 0.0000e+00 \n", "000002: 0.0000e+00 \n", "000003: 0.0000e+00 \n", "000004: 0.0000e+00 \n", "000005: 0.0000e+00 \n", "000006: 0.0000e+00 \n", "000007: 0.0000e+00 \n" ] }, { "data": { "text/plain": [ "(void) @0x78e507dfec30\n" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ " printf(\"A:\\n\");\n", " matx::print(A);\n", " printf(\"Max:\\n\");\n", " matx::print(MD1);\n", " printf(\"Sum:\\n\");\n", " matx::print(AD1);" ] },
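Reducing along the other axis works the same way; in this sketch the axis list `{0}` is assumed to name the dimension being collapsed, mirroring the `{1}` used above:

```c++
// One result per column: collapse dimension 0 (the rows).
auto colMax = matx::make_tensor<float>({A.Size(1)});
(colMax = matx::max(A, {0})).run();
matx::print(colMax);
```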
{ @@ -297,20 +419,82 @@ "metadata": {}, "source": [ "### Convolution\n", - "MatX supports both 1D and 2D direct convolution using the `conv1d` and `conv2d` functions. FFT-based convolution can also be performed as a combination of existing primitives as a potentially faster alternative to direct convolution for large tensors. Both forms of direct convolution take in an extra mode which specifies how much of the output is saved, where `MATX_C_MODE_FULL` saves the entire filter ramp-up and down, `MATX_C_MODE_SAME` makes the input and output tensors the same size, and `MATX_C_MODE_VALID` only keeps valid samples (when the entire filter was part of the convolution). Convolution can be used to perform a rolling average of an input by making all filter values 1/N, where N is the length of the filter. In this example, we use a filter of length 3 to create a running average of the last 3 elements:\n", - "\n", - "```c++\n", - "conv1d(Co, C, filt, MATX_C_MODE_FULL, 0);\n", - "```" + "MatX supports both 1D and 2D direct convolution using the `conv1d` and `conv2d` functions. FFT-based convolution can also be performed as a combination of existing primitives as a potentially faster alternative to direct convolution for large tensors. Both forms of direct convolution take in an extra mode which specifies how much of the output is saved, where `MATX_C_MODE_FULL` saves the entire filter ramp-up and down, `MATX_C_MODE_SAME` makes the input and output tensors the same size, and `MATX_C_MODE_VALID` only keeps valid samples (when the entire filter was part of the convolution). Convolution can be used to perform a rolling average of an input by making all filter values 1/N, where N is the length of the filter. In this example, we use a filter of length 3 to create a running average of the last 3 elements:\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial CIn tensor:\n", + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[16], Strides:[1]\n", + "000000: 0.0000e+00 \n", + "000001: 0.0000e+00 \n", + "000002: 0.0000e+00 \n", + "000003: 0.0000e+00 \n", + "000004: 0.0000e+00 \n", + "000005: 0.0000e+00 \n", + "000006: 0.0000e+00 \n", + "000007: 0.0000e+00 \n", + "000008: 0.0000e+00 \n", + "000009: 0.0000e+00 \n", + "000010: 0.0000e+00 \n", + "000011: 0.0000e+00 \n", + "000012: 0.0000e+00 \n", + "000013: 0.0000e+00 \n", + "000014: 0.0000e+00 \n", + "000015: 0.0000e+00 \n", + "tensor_1_f32: Tensor{float} Rank: 1, Sizes:[18], Strides:[1]\n", + "000000: 0.0000e+00 \n", + "000001: 0.0000e+00 \n", + "000002: 0.0000e+00 \n", + "000003: 0.0000e+00 \n", + "000004: 0.0000e+00 \n", + "000005: 0.0000e+00 \n", + "000006: 0.0000e+00 \n", + "000007: 0.0000e+00 \n", + "000008: 0.0000e+00 \n", + "000009: 0.0000e+00 \n", + "000010: 0.0000e+00 \n", + "000011: 0.0000e+00 \n", + "000012: 0.0000e+00 \n", + "000013: 0.0000e+00 \n", + "000014: 0.0000e+00 \n", + "000015: 0.0000e+00 \n", + "000016: 0.0000e+00 \n", + "000017: 0.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x78e507dfec30\n" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example3_conv1d" + "auto CIn = matx::make_tensor<float>({16});\n", + "auto filt = matx::make_tensor<float>({3});\n", + "auto Co = matx::make_tensor<float>({16 + filt.Lsize() - 1});\n", + "\n", + "filt.SetVals({1.0/3, 1.0/3, 1.0/3});\n", + "\n", + "(CIn = matx::random<float>({16}, matx::NORMAL)).run(); \n", + "\n", + "printf(\"Initial CIn tensor:\\n\");\n", + "matx::print(CIn);\n", + "(Co = matx::conv1d(CIn, filt, 
matx::MATX_C_MODE_FULL)).run();\n", + "\n", + "matx::print(Co);" ] }, { @@ -318,45 +502,67 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Expected output:\n", - "```sh\n", - "Initial C tensor:\n", - "000000: -0.9247\n", - "000001: -0.4253\n", - "000002: -2.6438\n", - "000003: 0.1452\n", - "000004: -0.1209\n", - "000005: -0.5797\n", - "000006: -0.6229\n", - "000007: -0.3284\n", - "000008: -1.0745\n", - "000009: -0.3631\n", - "000010: -1.6711\n", - "000011: 2.2655\n", - "000012: 0.3117\n", - "000013: -0.1842\n", - "000014: 1.2866\n", - "000015: 1.1820\n", - "After conv1d:\n", - "000000: -0.3082\n", - "000001: -0.4500\n", - "000002: -1.3313\n", - "000003: -0.9747\n", - "000004: -0.8732\n", - "000005: -0.1851\n", - "000006: -0.4411\n", - "000007: -0.5103\n", - "000008: -0.6753\n", - "000009: -0.5887\n", - "000010: -1.0362\n", - "000011: 0.0771\n", - "000012: 0.3020\n", - "000013: 0.7977\n", - "000014: 0.4714\n", - "000015: 0.7615\n", - "000016: 0.8229\n", - "000017: 0.3940\n", - "```" + "Similar to a 1D convolution, a 2D convolution does the same computation over two dimensions. A tensor of at least rank 2 is needed for a 2D convolution. Below we use a filter of all ones using the `ones` operator to demonstrate the filter can also be an operator and not an existing tensor view. The result is the sum of the four values around each cell on the input:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initial C tensor:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 8], Strides:[8,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "After conv2d:\n", + "tensor_2_f32: Tensor{float} Rank: 2, Sizes:[8, 8], Strides:[8,1]\n", + "000000: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000001: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000002: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000003: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000004: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000005: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000006: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n", + "000007: 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 \n" + ] + }, + { + "data": { + "text/plain": [ + "(void) @0x78e507dfec30\n" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], 
+ "source": [ + "\n", + " auto CIn2 = matx::make_tensor({8,8});\n", + " auto filt2 = matx::ones({2, 2});\n", + " auto Co2 = matx::make_tensor({8, 8});\n", + "\n", + " (CIn2 = matx::random({8, 8}, matx::NORMAL)).run(); \n", + "\n", + " printf(\"Initial C tensor:\\n\");\n", + " matx::print(C);\n", + "\n", + " (Co2 = matx::conv2d(CIn2, filt, matx::MATX_C_MODE_SAME)).run();\n", + " \n", + " printf(\"After conv2d:\\n\");\n", + " matx::print(Co2);\n", + "\n" ] }, { @@ -364,39 +570,94 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Similar to a 1D convolution, a 2D convolution does the same computation over two dimensions. A tensor of at least rank 2 is needed for a 2D convolution. Below we use a filter of all ones using the `ones` operator to demonstrate the filter can also be an operator and not an existing tensor view. The result is the sum of the four values around each cell on the input:\n", - "\n", - "```c++\n", - "conv2d(Co, C, filt, MATX_C_MODE_FULL, 0);\n", - "```" + "Last, we mentioned above that convolution can also be done in the frequency domain using FFTs. This is the preferred method for larger tensors since FFTs are much faster than direct convolutions in large sizes, and because FFT libraries are highly-optimized. FFT convolution uses more memory than direct if the inputs are not to be destroyed since it requires running an FFT on both the input signal and filter before filtering. If not done in-place, this typically requires `2N + L - 1` new elements in memory, where N is the signal length and L is the filter length. A full FFT convolution example can be found in `fft_conv.cu` in the MatX examples, but the main convolution code is shown below:\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example3_conv2d" + "\n", + "///\\todo complete this tutorial. 
{ "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "!./exercises/compile_and_run.sh example3_conv2d" + "\n", + "///\todo complete this tutorial. This one is pretty complex, do we want to keep the validation code here?\n", + "\n", + "// using complex = cuda::std::complex<float>;\n", + "// cudaExecutor exec{};\n", + "\n", + "// index_t signal_size = 16;\n", + "// index_t filter_size = 3;\n", + "// index_t filtered_size = signal_size + filter_size - 1;\n", + "\n", + "// // Create time domain buffers\n", + "// auto sig_time = make_tensor<complex>({signal_size});\n", + "// auto filt_time = make_tensor<complex>({filter_size});\n", + "// auto time_out = make_tensor<complex>({filtered_size});\n", + "\n", + "// // Frequency domain buffers\n", + "// auto sig_freq = make_tensor<complex>({filtered_size});\n", + "// auto filt_freq = make_tensor<complex>({filtered_size});\n", + "\n", + "// // Fill the time domain signals with data\n", + "// for (index_t i = 0; i < signal_size; i++) {\n", + "// sig_time(i) = {-1.0f * (2.0f * static_cast<float>(i % 2) + 1.0f) *\n", + "// (static_cast<float>(i % 10) / 10.0f) +\n", + "// 0.1f,\n", + "// -1.0f * (static_cast<float>(i % 2) == 0.0f) *\n", + "// (static_cast<float>(i % 10) / 5.0f) -\n", + "// 0.1f};\n", + "// }\n", + "// for (index_t i = 0; i < filter_size; i++) {\n", + "// filt_time(i) = {static_cast<float>(i) / static_cast<float>(filter_size),\n", + "// static_cast<float>(-i) / static_cast<float>(filter_size) +\n", + "// 0.5f};\n", + "// }\n", + "\n", + "// TODO: Perform FFT convolution\n", + "// Perform the FFT in-place on both signal and filter\n", + "// (sig_freq = fft(sig_time)).run();\n", + "// (filt_freq = fft(filt_time)).run();\n", + "\n", + "// (sig_freq = sig_freq * filt_freq).run();\n", + "\n", + "// // IFFT in-place\n", + "// (sig_freq = ifft(sig_freq)).run(); \n", + "\n", + "\n", + "// Perform the FFT in-place on both signal and filter, do an element-wise multiply of the two, then IFFT that output\n", + "// (sig_freq = ifft(fft(sig_time, filtered_size) * fft(filt_time, filtered_size))).run(stream);\n", + "\n", + "// TODO: Perform a time-domain convolution\n", + "// conv1d(time_out, sig_time, filt_time, matxConvCorrMode_t::MATX_C_MODE_FULL, 0);\n", + "\n", + "// exec.sync();\n", + "\n", + "// // Compare signals\n", + "// for (index_t i = 0; i < filtered_size; i++) {\n", + "// if ( fabs(time_out(i).real() - sig_freq(i).real()) > 0.001 || \n", + "// fabs(time_out(i).imag() - sig_freq(i).imag()) > 0.001) {\n", + "// printf(\"Verification failed at item %lld. Direct=%f%+.2fj, FFT=%f%+.2fj\\n\", i,\n", + "// time_out(i).real(), time_out(i).imag(), sig_freq(i).real(), sig_freq(i).imag());\n", + "// return -1;\n", + "// }\n", + "// }\n", + "\n", + "std::cout << \"Verification successful\" << std::endl;\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Last, we mentioned above that convolution can also be done in the frequency domain using FFTs. This is the preferred method for larger tensors since FFTs are much faster than direct convolutions in large sizes, and because FFT libraries are highly-optimized. FFT convolution uses more memory than direct if the inputs are not to be destroyed since it requires running an FFT on both the input signal and filter before filtering. If not done in-place, this typically requires `2N + L - 1` new elements in memory, where N is the signal length and L is the filter length. 
A full FFT convolution example can be found in `fft_conv.cu` in the MatX examples, but the main convolution code is shown below:\n", - "\n", - "```c++\n", - " // Perform the FFT in-place on both signal and filter\n", - " (sig_freq = fft(sig_time)).run();\n", - " (filt_freq = fft(filt_time)).run();\n", - "\n", - " (sig_freq = sig_freq * filt_freq).run();\n", - "\n", - " // IFFT in-place\n", - " (sig_freq = ifft(sig_freq)).run();\n", - "```\n", "Since the expected output size of the full filtering operation is signal_len + filter_len - 1, both the filter and signal time domain inputs are shorter than the output. This would normally require a separate stage of allocating buffers of the appropriate size, zeroing them out, copying the time domain data to the buffers, and performing the FFT. However, MatX has an API to do all of this automatically in the library using asynchronous allocations. This gives the first call a noticeable performance hit, but subsequent calls will be close to the time without allocation. To recognize that automatic padding is wanted, MatX compares the output tensor size to the input tensor size to determine whether to pad the input with zeros. In this case the input signals (sig_time and filt_time) are shorter than the output tensors (sig_freq and filt_freq), so it will automatically zero-pad the input.\n", "\n", "The above expression can also be combined into a single line:\n", @@ -413,21 +674,7 @@ "conv1d(time_out, sig_time, filt_time, matxConvCorrMode_t::MATX_C_MODE_FULL, 0);\n", "```\n", "\n", - "To match the FFT results we do a full convolution to get all the samples from the filter ramp up and ramp down. However, if we wanted either valid or same mode we could slice the FFT convolution output at the appropriate places to give the same answer. Edit the file [exercises/example3_fft_conv.cu](exercises/example3_fft_conv.cu) and add the missing code where you see TODOs. After running the verification code at the bottom will check for accuracy.\n", - "\n", - "Expected output:\n", - "```sh\n", - "Verification successful\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!./exercises/compile_and_run.sh example3_fft_conv" + "To match the FFT results we do a full convolution to get all the samples from the filter ramp up and ramp down. However, if we wanted either valid or same mode we could slice the FFT convolution output at the appropriate places to give the same answer. Edit the cell above and add the missing code where you see TODOs. After running it, the verification code at the bottom will check for accuracy."
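As a sketch of the slicing mentioned above, a 'same'-mode result can be carved out of the full output; the offsets below assume an odd filter length and are illustrative:

```c++
// Keep signal_size samples centered in the full convolution output.
auto same = matx::slice(sig_freq, {(filter_size - 1) / 2},
                        {(filter_size - 1) / 2 + signal_size});
matx::print(same);
```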
] }, { @@ -446,12 +693,15 @@ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" }, "kernelspec": { - "display_name": "Python 3.6.9 64-bit", - "name": "python3" + "display_name": "C++17", + "language": "C++", + "name": "cling-cpp17" }, "language_info": { - "name": "python", - "version": "" + "codemirror_mode": "c++", + "file_extension": ".c++", + "mimetype": "text/x-c++src", + "name": "c++" } }, "nbformat": 4, diff --git a/docs_input/notebooks/05_fusion.ipynb b/docs_input/notebooks/05_fusion.ipynb new file mode 100644 index 00000000..4a15e305 --- /dev/null +++ b/docs_input/notebooks/05_fusion.ipynb @@ -0,0 +1,25 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fusing Operations in MatX" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## " + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs_input/notebooks/06_profiling.ipynb b/docs_input/notebooks/06_profiling.ipynb new file mode 100644 index 00000000..ff5f9330 --- /dev/null +++ b/docs_input/notebooks/06_profiling.ipynb @@ -0,0 +1,25 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiling and Optimizing with MatX" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Automated NVTX Ranges" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs_input/notebooks/exercises/example1_adv_slice_col.cu b/docs_input/notebooks/exercises/example1_adv_slice_col.cu deleted file mode 100644 index 082f8207..00000000 --- a/docs_input/notebooks/exercises/example1_adv_slice_col.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto t2 = make_tensor({5, 4}); - - // Initialize the tensor linearly - t2.SetVals({{1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16}, - {17, 18, 19, 20}}); - - // TODO: reduce tensor t2 to a 1D tensor by pulling the second column and all - // rows - auto t1 = ...; - - print(t1); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_adv_slice_row.cu b/docs_input/notebooks/exercises/example1_adv_slice_row.cu deleted file mode 100644 index a99e2540..00000000 --- a/docs_input/notebooks/exercises/example1_adv_slice_row.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto t2 = make_tensor({5, 4}); - - // Initialize the tensor linearly - t2.SetVals({{1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16}, - {17, 18, 19, 20}}); - - // TODO: reduce tensor t2 to a 1D tensor by pulling all columns and the - // second row - auto t1 = ...; - - print(t1); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_assignment1.cu b/docs_input/notebooks/exercises/example1_assignment1.cu deleted file mode 100644 index 14d923a2..00000000 --- a/docs_input/notebooks/exercises/example1_assignment1.cu +++ /dev/null @@ -1,127 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. 
Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -/** - * MatX training assignment 1. This training goes through basic tensor - * operations that were learned in the 01_introduction notebook. Uncomment each - * verification block as you go to ensure your solutions are correct. - */ - -int main() { - - /**************************************************************************************************** - * Create a rank-2 tensor data object of ints with 5 rows and 4 columns called - *"t2" - *https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#tensor-views - ****************************************************************************************************/ - - /*** End editing ***/ - - /**************************************************************************************************** - * Initialize the t2 view to a 4x5 matrix of increasing values starting at 1 - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#tensor-views - ****************************************************************************************************/ - // t2 = ; - /*** End editing ***/ - - /**************************************************************************************************** - * Get a slice of the second and third rows with all columns - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#slicing-and-dicing - *****************************************************************************************************/ - auto t2s = t2; - /*** End editing ***/ - - // Verify slice is correct - // for (int row = 1; row <= 2; row++) { - // for (int col = 0; col < t2.Size(1); col++) { - // if (t2(row, col) != t2s(row - 1, col)) { - // printf("Mismatch in sliced view! 
actual = %d, expected = %d\n", - // t2s(row - 1, col), t2(row, col)); exit(-1); - // } - // } - // } - - // print(t2s); - // printf("Slice verification passed!\n"); - - /**************************************************************************************************** - * Take the slice and clone it into a 3D tensor with new outer dimensions as - *follows: First dim: keep existing row dimension from t2s Second dim: 2 Third - *dim: keep existing col dimension from t2s - https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#increasing-dimensionality - *****************************************************************************************************/ - auto t3c = t2s; - /*** End editing ***/ - - // Verify clone - // for (int first = 0; first < t3c.Size(0); first++) { - // for (int sec = 0; sec < t3c.Size(1); sec++) { - // for (int third = 0; third < t3c.Size(2); third++) { - // if (t3c(first, sec, third) != t2s(first, third)) { - // printf("Mismatch in cloned view! actual = %d, expected = %d\n", - // t3c(first, sec, third), t2s(first, third)); exit(-1); - // } - // } - // } - // } - - // print(t3c); - // printf("Clone verification passed!\n"); - - /**************************************************************************************************** - * Permute the two outer dimensions of the cloned tensor - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#permuting - *****************************************************************************************************/ - auto t3p = t3c; - /*** End editing ***/ - - // Verify clone - // for (int first = 0; first < t3p.Size(0); first++) { - // for (int sec = 0; sec < t3p.Size(1); sec++) { - // for (int third = 0; third < t3p.Size(2); third++) { - // if (t3c(first, sec, third) != t2s(first, third)) { - // printf("Mismatch in permuted view! actual = %d, expected = %d\n", - // t3c(first, sec, third), t2s(sec, third)); exit(-1); - // } - // } - // } - // } - - // print(t3p); - // printf("Permute verification passed!\n"); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_clone.cu b/docs_input/notebooks/exercises/example1_clone.cu deleted file mode 100644 index 99d7cc05..00000000 --- a/docs_input/notebooks/exercises/example1_clone.cu +++ /dev/null @@ -1,56 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto t1 = make_tensor({4}); - - // Initialize the tensor linearly - t1.SetVals({1, 2, 3, 4}); - - // TODO: Clone tensor t1 into a 2D tensor by making a new outer dimension 5. - auto t2c = ...; - - print(t2c); - - // TODO: After compiling and running the code above, modify the first element - // in t1 to be 10 on the next line. Uncomment the print line as well. t1(0) = - // ... - - // print(t2c); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_init.cu b/docs_input/notebooks/exercises/example1_init.cu deleted file mode 100644 index 150bdda9..00000000 --- a/docs_input/notebooks/exercises/example1_init.cu +++ /dev/null @@ -1,54 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - // TODO: Create a 2D tensor of ints called t2data with dimensions 5, 4, and - // a view of that data using the default view. 
- - auto t2 = ; - - // Initialize the tensor linearly - t2.SetVals({ {1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16}, - {17, 18, 19, 20}}); - - print(t2); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_permute.cu b/docs_input/notebooks/exercises/example1_permute.cu deleted file mode 100644 index 00b7fb9a..00000000 --- a/docs_input/notebooks/exercises/example1_permute.cu +++ /dev/null @@ -1,54 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto t2 = make_tensor({5, 4}); - - // Initialize the tensor linearly - t2.SetVals({{1, 2, 3, 4}, - {5, 6, 7, 8}, - {9, 10, 11, 12}, - {13, 14, 15, 16}, - {17, 18, 19, 20}}); - - // TODO: Permute the view t2 such that the two dimensions are swapped - auto t2p = ...; - - print(t2p); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example1_simple_slice.cu b/docs_input/notebooks/exercises/example1_simple_slice.cu deleted file mode 100644 index 8c8dce35..00000000 --- a/docs_input/notebooks/exercises/example1_simple_slice.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. 
-//
-// 3. Neither the name of the copyright holder nor the names of its
-//    contributors may be used to endorse or promote products derived from
-//    this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-/////////////////////////////////////////////////////////////////////////////////
-
-#include <matx.h>
-
-using namespace matx;
-
-int main() {
-
-  auto t2 = make_tensor<int>({5, 4});
-
-  // Initialize the tensor linearly
-  t2.SetVals({{1, 2, 3, 4},
-              {5, 6, 7, 8},
-              {9, 10, 11, 12},
-              {13, 14, 15, 16},
-              {17, 18, 19, 20}});
-
-  // TODO: Create a slice of the view t2 starting at the second element and
-  // ending at the third element (inclusive) in both dimensions
-  auto t2s = ...;
-
-  print(t2s);
-
-  return 0;
-}
diff --git a/docs_input/notebooks/exercises/example2_assignment1.cu b/docs_input/notebooks/exercises/example2_assignment1.cu
deleted file mode 100644
index 69d2f3e6..00000000
--- a/docs_input/notebooks/exercises/example2_assignment1.cu
+++ /dev/null
@@ -1,158 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-// BSD 3-Clause License
-//
-// Copyright (c) 2021, NVIDIA Corporation
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// 1. Redistributions of source code must retain the above copyright notice, this
-//    list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright notice,
-//    this list of conditions and the following disclaimer in the documentation
-//    and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the copyright holder nor the names of its
-//    contributors may be used to endorse or promote products derived from
-//    this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-/////////////////////////////////////////////////////////////////////////////////
-
-#include <matx.h>
-
-using namespace matx;
-
-/**
- * MatX training assignment 2. This training goes through tensor operations that
- * were learned in the 02_operators notebook. Uncomment each verification block
- * as you go to ensure your solutions are correct.
- */
-
-int main() {
-  auto A = make_tensor<float>({2, 3});
-  auto B = make_tensor<float>({2, 3});
-  auto V = make_tensor<float>({3});
-
-  cudaExecutor exec{};
-
-  /****************************************************************************************************
-   * Initialize tensor A with increasing values from 0.5 to 3.0 in steps of 0.5,
-   * and tensor V from -1 to -3 in steps of -1.
-   ****************************************************************************************************/
-
-  /*** End editing ***/
-
-  // Verify init is correct
-  float step = 0.5;
-  for (int row = 0; row < A.Size(0); row++) {
-    for (int col = 0; col < A.Size(1); col++) {
-      if (A(row, col) != step) {
-        printf("Mismatch in A init view! actual = %f, expected = %f\n",
-               A(row, col), step);
-        exit(-1);
-      }
-      step += 0.5;
-    }
-  }
-
-  for (int col = 0; col < V.Size(0); col++) {
-    if (V(col) != (-1 + col * -1)) {
-      printf("Mismatch in V init view! actual = %f, expected = %f\n", V(col),
-             (float)(-1 + col * -1));
-      exit(-1);
-    }
-  }
-
-  print(A);
-  print(V);
-  printf("Init verification passed!\n");
-
-  /****************************************************************************************************
-   * Add 5.0 to all elements of A and store the results back in A
-   ****************************************************************************************************/
-
-  /*** End editing ***/
-
-  exec.sync();
-
-  step = 0.5;
-  for (int row = 0; row < A.Size(0); row++) {
-    for (int col = 0; col < A.Size(1); col++) {
-      if (A(row, col) != (5.0 + step)) {
-        printf("Mismatch in A sum view! actual = %f, expected = %f\n",
-               A(row, col), 5.0 + step);
-        exit(-1);
-      }
-      step += 0.5;
-    }
-  }
-
-  print(A);
-  printf("Sum verification passed!\n");
-
-  /****************************************************************************************************
-   * Clone V to match the dimensions of A, and subtract V from A. The results
-   * should be stored in A
-   *
-   * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#increasing-dimensionality
-   * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/tensorview.html#_CPPv4I0_iEN4matx12tensor_tE
-   *
-   ****************************************************************************************************/
-  /// auto tvs = ;
-  /*** End editing. ***/
-
-  // exec.sync();
-
-  // step = 0.5;
-  // for (int row = 0; row < A.Size(0); row++) {
-  //   for (int col = 0; col < A.Size(1); col++) {
-  //     if (A(row, col) != (5.0 + step - tvs(row, col))) {
-  //       printf("Mismatch in A sub view! actual = %f, expected = %f\n", A(row,
-  //       col), 5.0 + step - tvs(row, col)); exit(-1);
-  //     }
-  //     step += 0.5;
-  //   }
-  // }
-
-  // print(A);
-  // print(tvs);
-  // printf("Clone verification passed!\n");
-
-  /****************************************************************************************************
-   * Raise the matrix A to the power of 2 and multiply the output by two. Next,
-   * subtract the vector V from each row. Store the result in tensor B.
- * - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/tensorops.html#_CPPv4N4matx3powE2Op2Op - ****************************************************************************************************/ - - /*** End editing ***/ - - exec.sync(); - - for (int row = 0; row < B.Size(0); row++) { - for (int col = 0; col < B.Size(1); col++) { - if (B(row, col) != powf(A(row, col), 2) * 2 - V(col)) { - printf("Mismatch in B init view! actual = %f, expected = %f\n", - B(row, col), powf(A(row, col), 2) * 2 - V(col)); - exit(-1); - } - } - } - - print(B); - printf("Mixed verification passed!\n"); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_conditional.cu b/docs_input/notebooks/exercises/example2_conditional.cu deleted file mode 100644 index 30107902..00000000 --- a/docs_input/notebooks/exercises/example2_conditional.cu +++ /dev/null @@ -1,51 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - - C.SetVals({ {1, 2, 3}, - {4, 5, 6}}); - - - // TODO: Conditionally assign elements of A the value of 1 if the same element in C is > 3, or 0 otherwise - - - print(A); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_eye.cu b/docs_input/notebooks/exercises/example2_eye.cu deleted file mode 100644 index 77fe96a2..00000000 --- a/docs_input/notebooks/exercises/example2_eye.cu +++ /dev/null @@ -1,45 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto B = make_tensor({8, 8}); - - // TODO: Set tensor B such that it forms an identity matrix - - print(B); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_hamming.cu b/docs_input/notebooks/exercises/example2_hamming.cu deleted file mode 100644 index 574760a4..00000000 --- a/docs_input/notebooks/exercises/example2_hamming.cu +++ /dev/null @@ -1,47 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto B = make_tensor({10}); - - // TODO: Set tensor B such that it forms a Hamming window - - - print(B); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_init.cu b/docs_input/notebooks/exercises/example2_init.cu deleted file mode 100644 index 2e693abd..00000000 --- a/docs_input/notebooks/exercises/example2_init.cu +++ /dev/null @@ -1,53 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - auto V = make_tensor({3}); - - // TODO: Initialize the A tensor to contain values increasing from 1 to 6, and - // V from 7 to 9. 
- A = {}; - V = {}; - - print(A); - printf("\n"); - print(V); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_mixed_rank.cu b/docs_input/notebooks/exercises/example2_mixed_rank.cu deleted file mode 100644 index 660f5987..00000000 --- a/docs_input/notebooks/exercises/example2_mixed_rank.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - auto V = make_tensor({3}); - - C.SetVals({ {1, 2, 3}, - {4, 5, 6}}); - - V.SetVals({7, 8, 9}); - - // TODO: Add vector V to matrix C using rank expansion. Store result in C - - - print(C); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_multiple_ops.cu b/docs_input/notebooks/exercises/example2_multiple_ops.cu deleted file mode 100644 index 84048b48..00000000 --- a/docs_input/notebooks/exercises/example2_multiple_ops.cu +++ /dev/null @@ -1,54 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - auto V = make_tensor({3}); - - A.SetVals({ {1, 2, 3}, - {4, 5, 6}}); - - V.SetVals({7, 8, 9}); - - // TODO: Add A to itself plus 1, divide the result by 2, and add vector V. - - - print(C); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_rand.cu b/docs_input/notebooks/exercises/example2_rand.cu deleted file mode 100644 index 378b60ad..00000000 --- a/docs_input/notebooks/exercises/example2_rand.cu +++ /dev/null @@ -1,47 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto A = make_tensor({4, 4}); - - (A = 0).run(); - - // TODO: Set tensor A to normally-distributed random numbers - - - print(A); -} diff --git a/docs_input/notebooks/exercises/example2_scalar.cu b/docs_input/notebooks/exercises/example2_scalar.cu deleted file mode 100644 index 7c827005..00000000 --- a/docs_input/notebooks/exercises/example2_scalar.cu +++ /dev/null @@ -1,56 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - auto V = make_tensor({3}); - - A.SetVals({ {1, 2, 3}, - {4, 5, 6}}); - - V.SetVals({7, 8, 9}); - - // TODO: Add the value 1 to all elements of A and store the result in B - - - print(A); - printf("\n"); - print(B); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example2_tensor_add.cu b/docs_input/notebooks/exercises/example2_tensor_add.cu deleted file mode 100644 index 47d06344..00000000 --- a/docs_input/notebooks/exercises/example2_tensor_add.cu +++ /dev/null @@ -1,57 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - auto C = make_tensor({2, 3}); - auto V = make_tensor({3}); - - A.SetVals({ {1, 2, 3}, - {4, 5, 6}}); - - B.SetVals({ {7, 8, 9}, - {10, 11, 12}}); - - // TODO: Add tensors A and B and store the result in C - - - print(A); - printf("\n"); - print(B); - printf("\n"); - print(C); -} \ No newline at end of file diff --git a/docs_input/notebooks/exercises/example2_tensor_div.cu b/docs_input/notebooks/exercises/example2_tensor_div.cu deleted file mode 100644 index 9cf71f38..00000000 --- a/docs_input/notebooks/exercises/example2_tensor_div.cu +++ /dev/null @@ -1,45 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto C = make_tensor({2, 3}); - - C.SetVals({{7, 8, 9}, {10, 11, 12}}); - - // TODO: Divide tensor C by 2 and store in C - - print(C); -} \ No newline at end of file diff --git a/docs_input/notebooks/exercises/example2_viz.cu b/docs_input/notebooks/exercises/example2_viz.cu deleted file mode 100644 index 8c11ccb8..00000000 --- a/docs_input/notebooks/exercises/example2_viz.cu +++ /dev/null @@ -1,47 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-///////////////////////////////////////////////////////////////////////////////// - -#include -#include "matx_viz.h" - -using namespace matx; - -int main() { - auto B = make_tensor({10}); - - // TODO: Set tensor B such that it forms a Hamming window - (B = hamming_x(shape)).run(); - - viz::line(B, "Hamming Window", "Sample", "Amplitude", "hamming.html"); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_1dfft.cu b/docs_input/notebooks/exercises/example3_1dfft.cu deleted file mode 100644 index cd5bf70d..00000000 --- a/docs_input/notebooks/exercises/example3_1dfft.cu +++ /dev/null @@ -1,57 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto C = make_tensor>({2, 4}); - - (C = random({2, 4}, NORMAL)).run(); - - printf("Initial C tensor:\n"); - print(C); - - // TODO: Perform an in-place FFT on C across rows - - printf("After FFT:\n"); - print(C); - - // TODO: Perform an in-place IFFT on C across rows. - - printf("After IFFT and normalization:\n"); - print(C); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_2dfft.cu b/docs_input/notebooks/exercises/example3_2dfft.cu deleted file mode 100644 index ebe117df..00000000 --- a/docs_input/notebooks/exercises/example3_2dfft.cu +++ /dev/null @@ -1,56 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto C = make_tensor>({2, 4}); - - (C = random({2, 4}, NORMAL)).run(); - printf("Initial C tensor:\n"); - print(C); - - // TODO: Perform an in-place 2D FFT on C across rows - - printf("After FFT:\n"); - print(C); - - // TODO: Perform an in-place 2D IFFT on C across rows - - printf("After IFFT and normalization:\n"); - print(C); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_assignment1.cu b/docs_input/notebooks/exercises/example3_assignment1.cu deleted file mode 100644 index 07f37ef1..00000000 --- a/docs_input/notebooks/exercises/example3_assignment1.cu +++ /dev/null @@ -1,115 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -/** - * MatX training assignment 3. This training goes through tensor operations that - * were learned in the 03_transformations notebook. Uncomment each verification - * block as you go to ensure your solutions are correct. - */ - -int main() { - using complex = cuda::std::complex; - auto A = make_tensor({2, 3}); - auto B = make_tensor({2, 3}); - - cudaExecutor exec{}; - - /**************************************************************************************************** - * Use the random number generator with a seed of 12345 to generate - * normally-distributed numbers in the tensor A. Next, take the FFT across - * columns of A (a 2-element FFT), and store the results in-place back in A. - * An example of random number generation can be found in the second tutorial - * or in the quick start guide here: - * - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/quickstart.html#random-numbers - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/random.html - ****************************************************************************************************/ - - /*** End editing ***/ - - // Verify init is correct - B.SetVals({{{0.5927, -0.3677}, {-2.6895, 1.8154}, {-0.0129, 0.9246}}, - {{0.5646, 0.8638}, {1.6400, 0.3494}, {-0.5709, 0.5919}}}); - A.print(); - B.print(); - exec.sync(); - for (int row = 0; row < A.Size(0); row++) { - for (int col = 0; col < A.Size(1); col++) { - if (fabs(A(row, col).real() - B(row, col).real()) > 0.001) { - printf( - "Mismatch in real part of FFT view! actual = %f, expected = %f\n", - A(row, col).real(), B(row, col).real()); - exit(-1); - } - if (fabs(A(row, col).imag() - B(row, col).imag()) > 0.001) { - printf( - "Mismatch in imag part of FFT view! actual = %f, expected = %f\n", - A(row, col).imag(), B(row, col).imag()); - exit(-1); - } - } - } - - printf("FFT verification passed!\n"); - - /**************************************************************************************************** - * Create a 3D tensor of floats using a normal distribution and with shape - * 10x5x15. Reduce the entire tensor down to a single float containing the max - * value. Scale the original tensor by this max value and do another max - * reduction. The final reduction should be 1.0. - * - * Hint: the reduction function is named rmax and takes the output, input, and - * stream as parameters - * https://devtech-compute.gitlab-master-pages.nvidia.com/matx/api/reduce.html - ****************************************************************************************************/ - // Create and initialize 3D tensor - - // Create scalar tensor for reduction - tensor_t redv; - - /*** End editing ***/ - - // Verify init is correct - exec.sync(); - if (fabs(redv() - 1.0) > 0.001) { - printf("Mismatch on final reduction. 
Expected=1.0, actual = %f\n", redv()); - exit(-1); - } - - printf("Reduction verification passed!\n"); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_conv1d.cu b/docs_input/notebooks/exercises/example3_conv1d.cu deleted file mode 100644 index 1a5b83a7..00000000 --- a/docs_input/notebooks/exercises/example3_conv1d.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto C = make_tensor({16}); - auto filt = make_tensor({3}); - auto Co = make_tensor({16 + filt.Lsize() - 1}); - - filt.SetVals({1.0/3, 1.0/3, 1.0/3}); - - (C = random({16}, NORMAL)).run(); - - printf("Initial C tensor:\n"); - print(C); - - // TODO: Perform a 1D direct convolution on C with filter filt - - - printf("After conv1d:\n"); - print(Co); - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_conv2d.cu b/docs_input/notebooks/exercises/example3_conv2d.cu deleted file mode 100644 index d1ac06b3..00000000 --- a/docs_input/notebooks/exercises/example3_conv2d.cu +++ /dev/null @@ -1,58 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. 
Neither the name of the copyright holder nor the names of its
-//    contributors may be used to endorse or promote products derived from
-//    this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-/////////////////////////////////////////////////////////////////////////////////
-
-#include <matx.h>
-
-using namespace matx;
-
-int main() {
-  auto C = make_tensor<float>({8, 8});
-
-  auto filt = ones({2, 2});
-  auto Co = make_tensor<float>({8 + filt.Size(0) - 1, 8 + filt.Size(1) - 1});
-
-  (C = random<float>({8, 8}, NORMAL)).run();
-
-  printf("Initial C tensor:\n");
-  print(C);
-
-  // TODO: Perform a 2D direct convolution on C with filter filt
-
-
-  printf("After conv2d:\n");
-  print(Co);
-
-  return 0;
-}
diff --git a/docs_input/notebooks/exercises/example3_fft_conv.cu b/docs_input/notebooks/exercises/example3_fft_conv.cu
deleted file mode 100644
index d922ec82..00000000
--- a/docs_input/notebooks/exercises/example3_fft_conv.cu
+++ /dev/null
@@ -1,92 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-// BSD 3-Clause License
-//
-// Copyright (c) 2021, NVIDIA Corporation
-// All rights reserved.
-//
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are met:
-//
-// 1. Redistributions of source code must retain the above copyright notice, this
-//    list of conditions and the following disclaimer.
-//
-// 2. Redistributions in binary form must reproduce the above copyright notice,
-//    this list of conditions and the following disclaimer in the documentation
-//    and/or other materials provided with the distribution.
-//
-// 3. Neither the name of the copyright holder nor the names of its
-//    contributors may be used to endorse or promote products derived from
-//    this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - using complex = cuda::std::complex; - cudaExecutor exec{}; - - index_t signal_size = 16; - index_t filter_size = 3; - index_t filtered_size = signal_size + filter_size - 1; - - // Create time domain buffers - auto sig_time = make_tensor({signal_size}); - auto filt_time = make_tensor({filter_size}); - auto time_out = make_tensor({filtered_size}); - - // Frequency domain buffers - auto sig_freq = make_tensor({filtered_size}); - auto filt_freq = make_tensor({filtered_size}); - - // Fill the time domain signals with data - for (index_t i = 0; i < signal_size; i++) { - sig_time(i) = {-1.0f * (2.0f * static_cast(i % 2) + 1.0f) * - (static_cast(i % 10) / 10.0f) + - 0.1f, - -1.0f * (static_cast(i % 2) == 0.0f) * - (static_cast(i % 10) / 5.0f) - - 0.1f}; - } - for (index_t i = 0; i < filter_size; i++) { - filt_time(i) = {static_cast(i) / static_cast(filter_size), - static_cast(-i) / static_cast(filter_size) + - 0.5f}; - } - - // TODO: Perform FFT convolution - // Perform the FFT in-place on both signal and filter, do an element-wise multiply of the two, then IFFT that output - - - // TODO: Perform a time-domain convolution - - - exec.sync(); - - // Compare signals - for (index_t i = 0; i < filtered_size; i++) { - if ( fabs(time_out(i).real() - sig_freq(i).real()) > 0.001 || - fabs(time_out(i).imag() - sig_freq(i).imag()) > 0.001) { - printf("Verification failed at item %lld. Direct=%f%+.2fj, FFT=%f%+.2fj\n", i, - time_out(i).real(), time_out(i).imag(), sig_freq(i).real(), sig_freq(i).imag()); - return -1; - } - } - - std::cout << "Verification successful" << std::endl; - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_full_reduce.cu b/docs_input/notebooks/exercises/example3_full_reduce.cu deleted file mode 100644 index 49c81832..00000000 --- a/docs_input/notebooks/exercises/example3_full_reduce.cu +++ /dev/null @@ -1,58 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto A = make_tensor({4, 5}); - auto MD0 = make_tensor(); - auto AD0 = make_tensor(); - - (A = random({4, 5}, NORMAL)).run(); - - // Initialize max and average to 0 - (MD0 = 0).run(); - (AD0 = 0).run(); - - // TODO: Perform a max and sum reduction of A into MD0 and AD0, respectively. - - - printf("A:\n"); - print(A); - printf("Max: %f\n", MD0()); - printf("Sum: %f\n", AD0()); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_gemm.cu b/docs_input/notebooks/exercises/example3_gemm.cu deleted file mode 100644 index 824ec9af..00000000 --- a/docs_input/notebooks/exercises/example3_gemm.cu +++ /dev/null @@ -1,55 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - auto A = make_tensor({8, 4}); - auto B = make_tensor({4, 8}); - auto C = make_tensor({8, 8}); - - (A = random({8, 4}, NORMAL)).run(); - (B = random({4, 8}, NORMAL)).run(); - - // TODO: Perform a GEMM of C = A*B - - printf("A:\n"); - print(A); - printf("B:\n"); - print(B); - printf("C:\n"); - print(C); - - return 0; -} diff --git a/docs_input/notebooks/exercises/example3_partial_reduce.cu b/docs_input/notebooks/exercises/example3_partial_reduce.cu deleted file mode 100644 index e48e407b..00000000 --- a/docs_input/notebooks/exercises/example3_partial_reduce.cu +++ /dev/null @@ -1,61 +0,0 @@ -//////////////////////////////////////////////////////////////////////////////// -// BSD 3-Clause License -// -// Copyright (c) 2021, NVIDIA Corporation -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, this -// list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -///////////////////////////////////////////////////////////////////////////////// - -#include - -using namespace matx; - -int main() { - - auto A = make_tensor({4, 5}); - auto MD0 = make_tensor({4}); - auto AD0 = make_tensor({4}); - - (A = random(shape, NORMAL)).run(); - - // Initialize max and average to 0 - (MD1 = 0).run(); - (AD1 = 0).run(); - - // TODO: Reduce all rows of A by max where each reduction is a separate value in the vector MD1 - - - - printf("A:\n"); - print(A); - printf("Max:\n"); - print(MD1); - printf("Sum:\n"); - print(AD1); - - return 0; -} diff --git a/docs_input/notebooks/gtc_lab/01_lab_intro.ipynb b/docs_input/notebooks/gtc_lab/01_lab_intro.ipynb new file mode 100644 index 00000000..e354b73c --- /dev/null +++ b/docs_input/notebooks/gtc_lab/01_lab_intro.ipynb @@ -0,0 +1,792 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Notebook Usage\n", + "\n", + "This notebook uses a custom magic command `%%run_matx` to run MatX code. 
This command wraps the code you write in a cell, adds the necessary includes and compiler flags, and then compiles and runs the code. All code in this notebook can be copied and pasted into your own MatX code without any additions beyond the environment setup. The magic `%%run_matx` must be at the beginning of the cell for the code to compile properly.\n", + "\n", + "Since MatX is a C++ template library, some of the code may take many seconds to compile. The type of CPU, CUDA version, and complexity of the example can all affect compile times." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MatX GTC Lab Notebook\n", + "## Tensor Creation and Memory Backing\n", + "\n", + "Tensors are the base class of memory-backed storage in MatX. The tensor class is highly flexible, with many options for memory types, residency, and ownership. By default, tensors are allocated using CUDA managed memory so that they're available on both the host and device. This is great for quick prototyping and development, but for production code it's recommended to use device memory or pinned host memory for performance reasons. A set of utility `make_tensor` functions is provided to streamline and simplify tensor creation rather than declaring the tensor object directly.\n", + "\n", + "`make_tensor` takes one template parameter indicating the type of the tensor, and zero or more function parameters. Without any parameters the tensor is considered a \"null tensor\" and has no shape or memory backing it. This is useful when declaring a tensor that will be given a shape and allocation later. The sizes of the tensor are specified in curly braces, or in the case of a 0-D tensor, an empty set of braces. For a complete guide on creating tensors in different ways, please visit: https://nvidia.github.io/MatX/basics/creation.html.\n", + "\n", + "MatX uses several conventions that can be different from other libraries:\n", + "- Row-major memory layout\n", + "- 0-based or C-style indexing\n", + "- A rank 1 tensor is a different type entirely than a rank 2 tensor with one dimension of length 1\n", + "\n", + "In the following cell we demonstrate creating tensors of 0D (scalar), 1D, and 2D data. Tensors can be extended to any arbitrary rank by adding more dimensions, and the rank is limited only by the available memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "// declare a 0D integer tensor (scalar)\n", + "auto t0 = matx::make_tensor<int>({});\n", + "\n", + "// declare a 1D integer tensor of length 4\n", + "auto t1 = matx::make_tensor<int>({4});\n", + "\n", + "// declare a 2D fp32 tensor with shape 4x5 (4 rows and 5 columns)\n", + "auto t2 = matx::make_tensor<float>({4,5});\n", + "\n", + "// declare a tensor with user-provided memory; the element type is deduced from the pointer\n", + "int *myptr = new int[4*5];\n", + "auto t2_custom = matx::make_tensor(myptr, t2.Shape());\n", + "\n", + "// declare a tensor with the shape of tensor t2\n", + "auto t2_b = matx::make_tensor<float>(t2.Shape());\n", + "\n", + "std::cout << \"Done!\" << std::endl;" + ] + },
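+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An aside on memory spaces: since managed memory is only the default, the creation helpers can also take an explicit memory space for production use. The sketch below is minimal and hedged; the `matxMemorySpace_t` enum values shown come from the MatX creation guide linked above and should be treated as assumptions rather than guarantees for this lab's container.\n", + "\n", + "```c++\n", + "// Sketch: requesting explicit memory spaces at creation time (enum names assumed)\n", + "auto t_dev  = matx::make_tensor<float>({4, 5}, matx::MATX_DEVICE_MEMORY); // device-only memory\n", + "auto t_host = matx::make_tensor<float>({4, 5}, matx::MATX_HOST_MEMORY);   // pinned host memory\n", + "```\n", + "\n", + "Keep in mind that host-side accessors such as `SetVals` and `operator()` (covered next) cannot touch a device-only tensor directly." + ] + },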
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Printing & Assigning\n", + "MatX provides several utilities for initializing and viewing the data inside tensors.\n", + "\n", + "To set a series of values explicitly, the `SetVals` member function can be used by specifying values in initializer list syntax. The initializer list uses nested braces to match the shape of the tensor and is supported up to 4D tensors. For higher ranks or very large tensors MatX provides an IO API to read in data from a file. See the [IO section](https://nvidia.github.io/MatX/api/io/index.html) for more information. As an alternative, `operator()` can both get and set individual values, but it is not recommended for large tensors since setting values one at a time is not memory efficient. When setting values with either `SetVals` or `operator()`, the memory backing the tensor must be modifiable from the host. For example, attempting to access device memory through these functions on a system without unified memory will result in undefined behavior.\n", + "\n", + "`print` is a utility function to print a tensor or operator's contents to stdout. Printing can be used with any type of operator, including ones that have no memory backing them. With no arguments `print` will print the entire contents of the operator. The size of the printing can also be limited by passing a limit for each dimension. For example, `print(3,2)` would print the first 3 rows and 2 columns of a 2D tensor. Unlike `SetVals` and `operator()`, `print` can be used on tensors with memory not accessible from the host. In this case a copy will be performed to the host before printing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "// declare a 2D integer tensor with 4 rows and 5 columns\n", + "auto t2 = matx::make_tensor<int>({4,5});\n", + "\n", + "// set values in the tensor with SetVals\n", + "t2.SetVals({\n", + " {1, 2, 3, 4, 5},\n", + " {6, 7, 8, 9, 10},\n", + " {11, 12, 13, 14, 15},\n", + " {16, 17, 18, 19, 20}\n", + " });\n", + "\n", + "// print a tensor\n", + "matx::print(t2);\n", + "\n", + "// print elements of tensor. Memory MUST be host-accessible for this to work.\n", + "std::cout << t2(0,0) << std::endl;\n", + "\n", + "t2(0,0) = 42;\n", + "t2(3,2) = 117;\n", + "\n", + "matx::print(t2);\n", + "\n", + "std::cout << \"My updated value for (3,2): \" << t2(3,2) << std::endl;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Tensor Creation Operators\n", + "\n", + "MatX also has a number of pre-built creation routines. For the full list, see https://nvidia.github.io/MatX/api/creation/operators/index.html.\n", + "\n", + "Shown below are [linspace](https://nvidia.github.io/MatX/api/creation/operators/linspace.html), [range](https://nvidia.github.io/MatX/api/creation/operators/range.html), and [ones](https://nvidia.github.io/MatX/api/creation/operators/ones.html)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "std::cout << \"Linspace (steps=10, start=1, stop=10)\" << std::endl;\n", + "matx::print(matx::linspace<0>({10}, 1.f, 10.f));\n", + " \n", + "std::cout << std::endl << \"Range (shape=10, first=0, step=0.1)\" << std::endl;\n", + "matx::print(matx::range<0>({10}, 0.0f, 0.1f));\n", + " \n", + "std::cout << std::endl << \"Ones (2x3)\" << std::endl;\n", + "matx::print(matx::ones({2, 3}));" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 01_A: Creating your first tensor\n", + "\n", + "Try defining a new integer tensor of size `{3, 5}` and initialize its values in increasing order from 0 to 14. Once defined, print your tensor to ensure the values are as expected. Next, update element (1,2) to 101. Print the tensor again to ensure the update was valid."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "// Declare a tensor\n", + "\n", + "// SetVals in myTensor\n", + "\n", + "// Print your new tensor\n", + "\n", + "// Update the value at {1,2} to 101" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[01_A Solution](solutions/01_A.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Operator Views\n", + "MatX provides a powerful set of functions that enable arbitrary views into existing tensors without incurring additional memory storage or processing cost to reorganize the data. These views provide \"zero copy\" accessors to a tensor that can be used in MatX logic as if they were real memory-backed tensors.\n", + "\n", + "MatX has feature parity with most operations expected in CuPy / MATLAB style environments. A full table translating a given operation to its MatX equivalent can be found in our full documentation [here](https://nvidia.github.io/MatX/basics/matlabpython.html)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Permute\n", + "`permute` returns a view of the data with the dimensions swapped to match the order of the initializer list argument. In the example below we swap our two dimensions, equivalent to a matrix transpose. However, `permute` can be used on higher-order tensors with the dimensions swapped in any order." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "// declare a 2D integer tensor with 4 rows and 5 columns\n", + "auto t2 = matx::make_tensor<int>({4,5});\n", + "\n", + "// set values in the tensor\n", + "t2.SetVals({\n", + " {1, 2, 3, 4, 5},\n", + " {6, 7, 8, 9, 10},\n", + " {11, 12, 13, 14, 15},\n", + " {16, 17, 18, 19, 20}\n", + " }); \n", + "\n", + "// base tensor\n", + "matx::print(t2);\n", + "\n", + "// Permute axes 0 and 1 of the tensor\n", + "auto t2p = matx::permute(t2, {1,0});\n", + "\n", + "// print the permuted tensor to show the transposed data\n", + "matx::print(t2p);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Slice\n", + "`slice` provides a view of a subset of data in a tensor, allowing that subset to be used and manipulated as a new operator. The `slice` utility function takes the input operator and two or three initializer lists that define the range of the input operator the slice will contain. The ranges are defined with the start index and end (exclusive) index, and optional strides.
The sentinel value `matxEnd` can be used to indicate the end of the tensor rather than specifying its length.\n", + "\n", + "In the example below, `t2s` corresponds to rows 1-2 and columns 1-4 of the larger `t2` tensor (the end indices `{3, matxEnd}` are exclusive).\n", + "\n", + "![2D Slice](img/dli-slice.png)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "// declare a 2D integer tensor with 4 rows and 5 columns\n", + "auto t2 = matx::make_tensor<int>({4,5});\n", + "\n", + "// set values in the tensor\n", + "t2.SetVals({\n", + " {1, 2, 3, 4, 5},\n", + " {6, 7, 8, 9, 10},\n", + " {11, 12, 13, 14, 15},\n", + " {16, 17, 18, 19, 20}\n", + " }); \n", + "\n", + "// slice example 1: same rank\n", + "auto t2s = matx::slice(t2, {1,1}, {3, matx::matxEnd});\n", + "\n", + "// print the sliced tensor to show the subset of data\n", + "matx::print(t2s);" + ] + },
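+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The optional strides mentioned above select every N-th element per dimension. A minimal sketch, assuming the three-argument `slice` overload described in the slice documentation:\n", + "\n", + "```c++\n", + "// Sketch: keep every other row of t2 and every column (strides {2, 1})\n", + "auto t2_strided = matx::slice(t2, {0, 0}, {matx::matxEnd, matx::matxEnd}, {2, 1});\n", + "matx::print(t2_strided); // rows 0 and 2 of the original tensor\n", + "```" + ] + },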
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "To reduce the rank when slicing, `slice` can be used with a template parameter to define an operator of a lower rank (dimensionality) than the input tensor. This is useful for situations like selecting a row of a matrix, for example. In the second example, we demonstrate slicing column 1 from the t2 tensor.\n", + "\n", + "![Column Slice](img/dli-slice_col.png)\n", + "\n", + "MatX also includes several helper defines to make tensor bound definitions easier. The sentinel value `matxDropDim` is used to indicate which dimension is being sliced away (i.e., removed)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "// declare a 2D integer tensor with 4 rows and 5 columns\n", + "auto t2 = matx::make_tensor<int>({4,5});\n", + "\n", + "// set values in the tensor \n", + "t2.SetVals({\n", + " {1, 2, 3, 4, 5},\n", + " {6, 7, 8, 9, 10},\n", + " {11, 12, 13, 14, 15},\n", + " {16, 17, 18, 19, 20}\n", + " }); \n", + "\n", + "// slice example 2: Select all values of column 1\n", + "auto t1Col = matx::slice<1>(t2, {0, 1}, {matx::matxEnd, matx::matxDropDim});\n", + "\n", + "// print the sliced tensor to show the subset of data\n", + "matx::print(t1Col);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Clone\n", + "`clone` provides a utility function to expand a smaller-rank operator to a larger rank by replicating the original data. For example, a 1D tensor can be cloned to create a 2D or higher-rank tensor. Cloning does not copy or replicate the original data, but rather creates a new operator that references the same memory.\n", + "\n", + "In the clone example below, we will take the t1Col from our previous operation and clone it to build a 2D [5,4] tensor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "// declare a 2D integer tensor with 4 rows and 5 columns\n", + "auto t2 = matx::make_tensor<int>({4,5});\n", + "\n", + "// set values in the tensor \n", + "t2.SetVals({\n", + " {1, 2, 3, 4, 5},\n", + " {6, 7, 8, 9, 10},\n", + " {11, 12, 13, 14, 15},\n", + " {16, 17, 18, 19, 20}\n", + " }); \n", + "\n", + "// slice example 2: reducing rank requires a template parameter\n", + "auto t1Col = matx::slice<1>(t2, {0, 1}, {matx::matxEnd, matx::matxDropDim});\n", + "\n", + "// clone the sliced 1D tensor to create a new 2D tensor\n", + "auto t2c_cols = matx::clone<2>(t1Col, {5, matx::matxKeepDim});\n", + "\n", + "// print the cloned tensor to show the expanded data\n", + "matx::print(t2c_cols);\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### View Data Backing\n", + "We established earlier that views are not new data, but accessors into the original operator. This is a powerful tool when operating on the core data, but it's also different from some other languages where the programmer has no control over whether the data is copied. This also means that any changes to the original tensor will be reflected in all views of that tensor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx \n", + "\n", + "// declare a 2D integer tensor with 4 rows and 5 columns\n", + "auto t2 = matx::make_tensor<int>({4,5});\n", + "\n", + "// set values in the tensor \n", + "t2.SetVals({\n", + " {1, 2, 3, 4, 5},\n", + " {6, 7, 8, 9, 10},\n", + " {11, 12, 13, 14, 15},\n", + " {16, 17, 18, 19, 20}\n", + " }); \n", + "\n", + "// slice example 2: reducing rank requires a template parameter\n", + "auto t1Col = matx::slice<1>(t2, {0, 1}, {matx::matxEnd, matx::matxDropDim});\n", + "// clone the sliced 1D tensor to create a new 2D tensor\n", + "auto t2c_cols = matx::clone<2>(t1Col, {5, matx::matxKeepDim});\n", + "\n", + "\n", + "// modify the original tensor\n", + "t2(0,1) = 10;\n", + "// print our views to show the updated values\n", + "matx::print(t2);\n", + "matx::print(t1Col);\n", + "matx::print(t2c_cols);\n", + "\n", + "// modify the tensor through a view\n", + "t1Col(1) = 203;\n", + "std::cout << \"------------------- After 203 -------------------\" << std::endl;\n", + "\n", + "// print our views to show the updated values\n", + "matx::print(t2);\n", + "matx::print(t1Col);\n", + "matx::print(t2c_cols);" + ] + },
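+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another common zero-copy view, shown here only as a hedged sketch, is `reshape`, which reinterprets the same memory with a different shape; the exact argument form is an assumption based on the MatX manipulation operators documentation.\n", + "\n", + "```c++\n", + "auto t1_flat = matx::make_tensor<int>({20});\n", + "// View the same 20 elements as a 4x5 operator without copying\n", + "auto t2_view = matx::reshape(t1_flat, {4, 5});\n", + "```" + ] + },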
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 01_B: Operator Views\n", + "Let's demonstrate your new skills in creating views of a tensor. Using the pre-defined `baseTensor2D`, please create the following views:\n", + "\n", + "- The complete first row of `baseTensor2D`\n", + "- A 2D square of 4 elements, composed of the first 2 rows and 2 columns of data\n", + "- Assign the value 87 to the (0,1) element of `baseTensor2D` and observe the change through each view\n", + "\n", + "Print the output at each stage to ensure your views are working as expected." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "// Make tensor\n", + "auto baseTensor2D = matx::make_tensor<int>({3,5});\n", + "baseTensor2D.SetVals({\n", + " {1, 2, 3, 4, 5},\n", + " {6, 7, 8, 9, 10},\n", + " {11, 12, 13, 14, 15}\n", + "});\n", + "\n", + "\n", + "// Slice the first row of baseTensor2D\n", + "\n", + "// Create a 2D square of 4 elements, composed of the first 2 rows and 2 columns of data\n", + "\n", + "// Assign the value 87 to the (0,1) element of baseTensor2D and observe the change in each view" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[01_B Solution](solutions/01_B.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "## MatX Operators\n", + "All of the examples above show how to view the operator's data differently, but no manipulation was done to the data. To manipulate the data, we use operators to define what work is to be done, and then call `run` to execute the operation (more on this later).\n", + "\n", + "Operators in MatX are an abstract type that follow the [operator interface](https://nvidia.github.io/MatX/basics/concepts.html#operator). The operator interface dictates a small number of methods that must be implemented to be used in MatX expressions. Everything from tensors to `operator+` is considered an operator in MatX. Every operator in MatX except for tensors is lazily evaluated, meaning that the operation is not performed until the statement is executed. For example, the expression `C = A + B` will not perform the addition or assignment until the `run` function is called.\n", + "\n", + "Most operators are unary (operating on a single input) or binary (operating on two inputs), but operators can be defined for an arbitrary number of inputs. Advanced users can also define their own operators. MatX supports most of the standard element-wise operators a user would expect from a library like NumPy. Broadcasting of operators is also supported, which allows for operations between tensors of different ranks.\n", + "\n", + "Operator expressions follow the normal type promotion rules of C++, and any type errors or warnings will be reported at compile time.\n", + "\n", + "Below we'll demonstrate both scalar and matrix support for the basic binary operators (`+`, `-`, `*`, `/`)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "auto A = matx::make_tensor<float>({2, 3});\n", + "auto B = matx::make_tensor<float>({2, 3});\n", + "auto C = matx::make_tensor<float>({2, 3});\n", + "auto D = matx::make_tensor<float>({2, 2});\n", + "\n", + "A.SetVals({ {1.f, 2.f, 3.f},\n", + " {4.f, 5.f, 6.f}\n", + " });\n", + "\n", + "(B = A).run(); // `run` will be discussed in more detail later\n", + "\n", + "matx::print(A);\n", + "matx::print(B);\n", + "std::cout << \" val: \" << A(0,0) << std::endl;\n", + "\n", + "// Addition\n", + "matx::print(A + 5.0f); // Broadcasting a scalar to a matrix\n", + "matx::print(A + B); // Element-wise addition of two matrices\n", + "\n", + "// Subtraction\n", + "matx::print(A - 5.0f);\n", + "matx::print(A - B);\n", + "\n", + "// Multiplication\n", + "matx::print(A * 5.0f);\n", + "matx::print(A * B);\n", + "\n", + "// Division\n", + "matx::print(A / 5.0f);\n", + "matx::print(A / B);\n" + ] + },
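+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make the rank-broadcasting point above concrete, here is a minimal sketch using a 0-D tensor, which broadcasts against any shape (a sketch only; the variable names are illustrative):\n", + "\n", + "```c++\n", + "auto M  = matx::make_tensor<float>({2, 3});\n", + "auto s0 = matx::make_tensor<float>({});   // 0-D (scalar) tensor\n", + "(M = matx::ones(M.Shape())).run();\n", + "(s0 = 2.f).run();\n", + "(M = M * s0).run(); // s0 is broadcast element-wise across M\n", + "matx::print(M);     // every element is 2\n", + "```" + ] + },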
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exercise 01_C: Operators\n", + "Please use the provided A and B tensors to complete the following set of operations:\n", + "\n", + "- Multiply `A` by its scalar weight factor `aScale` to populate tensor `C`\n", + "- In-place subtract `bOffset` from the matrix `B`\n", + "- Add the `A` and `B` tensors to populate tensor `D`\n", + "\n", + "Keep in mind that rather than storing the result of an expression in a tensor, you may pass it to the `print` function directly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "auto A = matx::make_tensor<int>({2, 3});\n", + "auto B = matx::make_tensor<int>({2, 3});\n", + "auto C = matx::make_tensor<int>({2, 3});\n", + "auto D = matx::make_tensor<int>({2, 3});\n", + "\n", + "A.SetVals({ {1, 2, 3},\n", + " {4, 5, 6}\n", + " });\n", + "\n", + "(B = A).run();\n", + "\n", + "int aScale = 5;\n", + "int bOffset = 2;\n", + "\n", + "// Scale A by aScale\n", + "\n", + "// Subtract bOffset from B\n", + "\n", + "// Add the A and B tensors" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[01_C Solution](solutions/01_C.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Generator Operators\n", + "Generators are a type of operator that can generate values without another tensor or operator as input. For example, an identity function can generate a list of ones on the diagonal and zeros elsewhere. Generators are efficient since they require no memory and typically reduce to either a constant or an equation in the emitted code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "auto A = matx::make_tensor<float>({2, 3});\n", + "auto H = matx::make_tensor<float>({10});\n", + "\n", + "// random\n", + "(A = 0).run();\n", + "(A = matx::random<float>(A.Shape(), matx::NORMAL)).run();\n", + "matx::print(A);\n", + "\n", + "// eye\n", + "(A = matx::eye(A.Shape())).run();\n", + "matx::print(A);\n", + "\n", + "// hamming\n", + "(H = matx::hamming<0>(H.Shape())).run();\n", + "matx::print(H);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MatX Transform Operators\n", + "Transform operators take one or more inputs and call a backend library or kernel to manipulate the data. FFTs, GEMMs, and linear solvers are all types of transform operators. Compared to non-transform operators, transforms typically use some temporary memory and require synchronization that an element-wise operator does not. Transforms can be used in all of the same contexts as other operators. For example, `C = A * matmul(A, B)` mixes both a transform operator (`matmul`) and an element-wise operator (`*`). If necessary, MatX will asynchronously allocate temporary memory for the output of the transform and free it after the operation is complete.\n", + "\n", + "Transform operators operate over a fixed number of dimensions, and any higher dimensions are batched. For example, if a 4D tensor is passed into a GEMM, the left-most two dimensions are batched." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Matrix Multiplication (GEMM)\n", + "The `matmul` operator performs the matrix-matrix multiply of $$C = {\\alpha}A * B + {\\beta}C$$ where `A` is of dimensions `MxK`, `B` is `KxN`, and `C` is `MxN`. We first populate the `A` and `B` matrices with random values, then perform the matrix multiply. The `random` operator is used to populate the tensors with random values from a chosen distribution." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "auto A = matx::make_tensor<float>({4, 8});\n", + "auto B = matx::make_tensor<float>({8, 16});\n", + "auto C = matx::make_tensor<float>({4, 16});\n", + "\n", + "(A = matx::random<float>(A.Shape(), matx::NORMAL)).run();\n", + "(B = matx::random<float>(B.Shape(), matx::NORMAL)).run();\n", + "matx::print(A);\n", + "matx::print(B);\n", + "\n", + "(C = matx::matmul(A, B)).run();\n", + "matx::print(C);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fast Fourier Transform (FFT)\n", + "The `fft` operator performs a fast Fourier transform on the input operator. MatX supports both 1D and 2D FFTs via the `fft` and `fft2` functions, and their inverses `ifft` and `ifft2`. For a 1D FFT anything above the first dimension is batched, while for a 2D FFT anything above the second dimension is batched. Both complex and real inputs are supported, and the user is required to size the outputs appropriately for each case. For documentation on the FFT please see the [FFT documentation](https://nvidia.github.io/MatX/api/dft/fft/fft.html). \n", + "\n", + "When using complex types in MatX, the CCCL library is used for both CPU and GPU compatibility with `cuda::std::complex`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "using complex = cuda::std::complex<float>;\n", + "\n", + "auto A = matx::make_tensor<complex>({8});\n", + "auto B = matx::make_tensor<complex>({8});\n", + "\n", + "(A = matx::random<complex>(A.Shape(), matx::NORMAL)).run();\n", + "matx::print(A);\n", + "\n", + "(B = matx::fft(A)).run(); // Take forward FFT of A\n", + "matx::print(B);\n", + "\n", + "(A = matx::ifft(B)).run(); // Inverse FFT of B. Result should closely match the original A input\n", + "matx::print(A);" + ] + },
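+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make the batching rule above concrete: when a 2-D operator is passed to the 1-D `fft`, each row is transformed independently. A minimal sketch (the constant complex fill is just an assumption for illustration):\n", + "\n", + "```c++\n", + "using complex = cuda::std::complex<float>;\n", + "\n", + "auto X = matx::make_tensor<complex>({16, 8}); // 16 signals of length 8\n", + "(X = complex(1.f, 0.f)).run();                // fill with a constant for illustration\n", + "(X = matx::fft(X)).run();                     // 8-point FFT batched over the 16 rows\n", + "matx::print(X);\n", + "```" + ] + },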
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reductions\n", + "Reductions are a class of algorithms that reduce a number of inputs into one or more outputs. Examples of reductions include summing all elements of a tensor, finding the maximum or minimum value, or counting the number of non-zero elements. Reduction functions in MatX use similar names to their counterparts in NumPy and MATLAB, such as `sum`, `min`, `max`, `mean`, `any`, and `all`. By default a reduction operator will reduce over all elements of the tensor, but a list of axes can be specified to reduce over different dimensions.\n", + "\n", + "Below is a simple example calculating a full reduction (the max and sum) of our A data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "auto A = matx::make_tensor<float>({2, 3});\n", + "auto max_all = matx::make_tensor<float>({});\n", + "auto sum_all = matx::make_tensor<float>({});\n", + "\n", + "// Since we are taking the max of each column, our output tensor is 1D with the same number of elements as the number of columns in A\n", + "auto max_col = matx::make_tensor<float>({3}); \n", + "\n", + "(A = matx::random<float>(A.Shape(), matx::NORMAL)).run();\n", + "\n", + "// Max of data\n", + "(max_all = matx::max(A)).run();\n", + "// Sum of data\n", + "(sum_all = matx::sum(A)).run();\n", + "// Max of each column\n", + "(max_col = matx::max(A, {0})).run(); \n", + "\n", + "printf(\"A:\\n\");\n", + "matx::print(A);\n", + "\n", + "printf(\"Max: %f\\n\", max_all());\n", + "printf(\"Sum: %f\\n\", sum_all());\n", + "printf(\"Max Col: \\n\");\n", + "matx::print(max_col);" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "source": [ + "### Additional Transforms\n", + "MatX supports a wide range of transforms, including both sparse and dense solvers, tensor contractions, and more. Please review the [MatX documentation](https://nvidia.github.io/MatX/api/index.html) for an exhaustive list of supported operations." + ] + },
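+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As one example from that list, here is a hedged sketch of a dense-solver-style transform, `inv` (matrix inverse); treat the exact name and behavior as assumptions based on the linked documentation rather than something exercised in this lab:\n", + "\n", + "```c++\n", + "auto A    = matx::make_tensor<float>({4, 4});\n", + "auto Ainv = matx::make_tensor<float>({4, 4});\n", + "\n", + "(A = matx::random<float>(A.Shape(), matx::NORMAL)).run();\n", + "(Ainv = matx::inv(A)).run();        // matrix inverse via a backend solver\n", + "matx::print(matx::matmul(A, Ainv)); // should be close to the identity\n", + "```" + ] + },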
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 01_D: Transforms and Generators:\n", + "\n", + "For this example we will generate random data to verify the distribution of our generator functions. Please implement the following:\n", + "\n", + "- Generate three floating point 3D tensors with sizes 2x4x8, 2x8x8, and 2x4x8\n", + "- Populate the first two tensors with random values from a uniform distribution\n", + "- Perform a batched matrix multiply of the first two tensors and store the output in the third tensor\n", + "- Find the minimum values of each inner matrix of C (there should be 2 of them) and print the results\n", + "\n", + "Ensure that the minimum values printed match what you would expect in the third tensor." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "// Create floating point tensors with specified sizes\n", + "\n", + "// Generate random data\n", + "\n", + "// Perform matmul and print\n", + "\n", + "// Find min values and print" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[01_D Solution](solutions/01_D.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Executors\n", + "\n", + "In the examples above the `run()` statement executed the operator, but it was not specified *where* the operator was executed. The `run()` statement takes an optional executor argument that allows the user to specify where the operator should be executed. Without any parameters `run()` will execute using the CUDA executor on the default stream, but in practice specifying the executor explicitly is recommended.\n", + "\n", + "Executors in MatX are a powerful tool for executing the same operator in many contexts. A context can currently be a single GPU or CPU, but may be extended in the future to more powerful types. MatX attempts to achieve feature parity between the executors, regardless of the performance. For example, running an FFT on the CUDA executor will use the cuFFT library on the backend, while launching the same operator on the CPU will use the optional FFTW library. Like all operators, the user must be aware of memory locality as it relates to the executor. For example, passing a tensor with malloc'd host memory to a CUDA executor on platforms without HMM will result in undefined behavior. Using CUDA managed memory works on all current executors without worrying about locality, but may come at a small performance cost.\n", + "\n", + "In the example below the same operator is executed separately on a CUDA and a host executor. The results should match very closely, but may not match exactly due to how floating point is treated on different platforms." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "auto cuda_exec = matx::CUDAExecutor();\n", + "auto host_exec = matx::HostExecutor();\n", + "\n", + "auto A = matx::make_tensor<float>({2, 3}); // Allocated using CUDA managed memory\n", + "auto B = matx::make_tensor<float>({2, 3}); // Allocated using CUDA managed memory\n", + "auto C = matx::make_tensor<float>({2, 3}); // Allocated using CUDA managed memory\n", + "\n", + "(A = matx::random<float>(A.Shape(), matx::NORMAL)).run(cuda_exec); // Run on GPU\n", + "(B = matx::random<float>(B.Shape(), matx::NORMAL)).run(host_exec); // Run on CPU\n", + "\n", + "matx::print(A);\n", + "matx::print(B);\n", + "\n", + "auto assign_op = (C = A + B); // Operator representing the computation C = A + B\n", + "\n", + "assign_op.run(cuda_exec); // Run on GPU\n", + "matx::print(C);\n", + "\n", + "assign_op.run(host_exec); // Run on CPU\n", + "matx::print(C);" + ] + },
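+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Executors can also be bound to an existing CUDA stream, which is how MatX work is typically interleaved with other asynchronous work. A minimal sketch; the stream-accepting constructor is an assumption based on the executor documentation:\n", + "\n", + "```c++\n", + "cudaStream_t stream;\n", + "cudaStreamCreate(&stream);\n", + "\n", + "auto exec = matx::CUDAExecutor(stream); // assumed: construct an executor on a user-owned stream\n", + "auto A = matx::make_tensor<float>({2, 3});\n", + "(A = matx::ones(A.Shape())).run(exec);\n", + "exec.sync();                            // wait for work queued on this stream\n", + "\n", + "cudaStreamDestroy(stream);\n", + "```" + ] + },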
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's it! You've completed the introduction to MatX tutorial and are ready to move on to the [fusion lab](02_lab_fusion.ipynb)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs_input/notebooks/gtc_lab/02_lab_fusion.ipynb b/docs_input/notebooks/gtc_lab/02_lab_fusion.ipynb new file mode 100644 index 00000000..636b061c --- /dev/null +++ b/docs_input/notebooks/gtc_lab/02_lab_fusion.ipynb @@ -0,0 +1,370 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Operators and Lazy Evaluation\n", + "When writing a simple arithmetic expression like the following:\n", + "\n", + "```A = B * (cos(C) / D)```\n", + "\n", + "Using the typical order-of-operations rules, we evaluate the expression in parentheses first `(cos(C) / D)`, followed by the multiply `* B`, then the assignment `A =`. Written using standard C++ operator overloading, we would have a cosine, division, multiplication, and assignment overload. Each operator performs its respective task, then returns the value computed. That returned value is stored somewhere (either out to memory or possibly in a register), then the next operator uses that output as input into its own computation. Finally, the assignment writes the value, usually out to memory.\n", + "\n", + "To avoid the overhead of repeated accesses to global memory and multiple discrete operation calls, MatX uses a technique called **lazy evaluation** to reduce the total number of loads and stores. It does this by overloading each operator so that **instead of performing the operation, such as multiplication, it returns an object that represents the multiplication to be performed when it's needed.** The entire expression then generates a single type in C++ representing the full equation above, and when we ask for element (0,0) of A above, the value is computed on the fly without storing any intermediate values. This also implies that you can store an entire expression in a variable and nothing will be executed:\n", + "\n", + "`auto op = (B * (cos(C) / D));`\n", + "\n", + "In the example above `op` is not evaluated at creation, but is instead a handle to the operator that can calculate the result of the equation on the right-hand side.\n", + "\n", + "This operator can then be further combined with other expressions, which can increase code readability without loss of performance." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Executors\n", + "Operators are used in conjunction with executors and the ``run()`` syntax to dictate when a given operator is executed and on what acceleration hardware. Notebook 1 relied on automation in the `matx::print` function to ensure operators are evaluated and copied to the host to facilitate printing. This is convenient, but not realistic or performant in a real application. \n", + "\n", + "Executors are types that describe how to execute an operator expression or transform. They are similar to C++'s execution policies, and may even use C++ execution policies behind the scenes. Executors are designed so that the code can remain unchanged while executing on a variety of different targets. For these notebooks, a single executor `exec` is created at the top of each notebook and then used throughout. \n", + "\n", + "To execute work on a given executor, simply call the `run()` function on the operator you would like to execute, passing the executor that should do the work:\n", + "\n", + "```\n", + "\n", + "(A = B * (cos(C) / D)).run(exec); // immediate evaluation of fused operators into a memory-backed tensor\n", + "\n", + "auto myOp = B * (cos(C) / D); // define a lazy operator from the fused expression\n", + "(A = myOp).run(exec); // evaluate the operator into a memory-backed tensor\n", + "\n", + "(A2 = myOp * C).run(exec); // combine op with other tensors \n", + "(A3 = myOp * myOp).run(exec); // combine op with other ops\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fusion Example\n", + "\n", + "Below we will take the example operation from above `(A = B * (cos(C) / D)).run();` and express it in MatX both as individual operations and as a single fused operation to demonstrate the speedup."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "auto exec = matx::CUDAExecutor();\n", + "\n", + "matx::index_t size_x = 128;\n", + "matx::index_t size_y = 256;\n", + "\n", + "auto A = matx::make_tensor<float>({size_x, size_y});\n", + "auto B = matx::make_tensor<float>({size_x, size_y});\n", + "auto C = matx::make_tensor<float>({size_x, size_y});\n", + "auto D = matx::make_tensor<float>({size_x, size_y});\n", + "auto result = matx::make_tensor<float>({size_x, size_y});\n", + "\n", + "// ---- populate the data ---- //\n", + "(A = matx::random<float>(A.Shape(), matx::NORMAL)).run(exec);\n", + "(B = matx::random<float>(B.Shape(), matx::NORMAL)).run(exec);\n", + "(C = matx::random<float>(C.Shape(), matx::NORMAL)).run(exec);\n", + "(D = matx::random<float>(D.Shape(), matx::NORMAL)).run(exec);\n", + "(result = matx::zeros({size_x, size_y})).run(exec);\n", + "exec.sync();\n", + "\n", + "\n", + "// ---- first, individual, independent kernels ---- //\n", + "exec.start_timer();\n", + "(result = cos(C)).run(exec); \n", + "(result = result / D).run(exec); \n", + "(result = result * B).run(exec); \n", + "exec.stop_timer();\n", + "\n", + "std::cout <<\"Unfused time: \" << exec.get_time_ms() << \" ms\" << std::endl;\n", + "\n", + "// ---- fused operation ---- //\n", + "exec.start_timer();\n", + "(A = B * cos(C)/D).run(exec);\n", + "exec.stop_timer();\n", + "std::cout <<\"Fused time: \" << exec.get_time_ms() << \" ms\" << std::endl;\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Runtime Improvements\n", + "The fused version delivers all of the performance benefits we described above:\n", + "- a single kernel is submitted to the GPU to complete all operations\n", + "- memory is only read from global once and written to global once\n", + "\n", + "This results in significant performance improvements, both for launch latency and GPU kernel execution.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "cpp" + } + }, + "source": [ + "## Fusion with Operators\n", + "\n", + "Fusion is intuitive when all operands can be combined into a single statement (like above), and follows the natural pattern most programs would follow. The reality is often different for more complex algorithms, and this is where fusion can also provide significant benefit for readability and reuse in implementations where very complex terms are defined, in addition to the performance benefits we just showed. \n", + "\n", + "By combining the lazy evaluation of operators with the ability to combine operators, terms can be defined that clearly construct the specific math for each term, and then combined later to create the complete final expression for execution.\n", + "\n", + "Below we show how a more complex operation, comprising both element-wise operators and transforms, can be broken down into simple terms that can be reused.\n" + ] + },
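+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here is the pattern in miniature before the exercise: named lazy terms composed into one fused statement (a sketch reusing the tensors from the cell above; the term names are illustrative):\n", + "\n", + "```c++\n", + "auto term1 = matx::cos(C) / D;  // lazy: nothing executes here\n", + "auto term2 = B * term1;         // terms compose into a larger expression\n", + "(A = term2 + term1).run(exec);  // one fused kernel evaluates the whole tree\n", + "```" + ] + },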
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Exercise 02_A : Fusion Basics\n", + "\n", + "Use the following equation to create an implementation that utilizes fusion and reuse to optimize the underlying code. \n", + "\n", + "`result = A*C + B/D + ((D-C)/B)/(A*C)`\n", + "\n", + "An example implementation is given with all operations done individually; how much faster can you make it?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "auto exec = matx::CUDAExecutor();\n", + "\n", + "matx::index_t size_x = 128;\n", + "matx::index_t size_y = 256;\n", + "\n", + "auto A = matx::make_tensor<float>({size_x, size_y});\n", + "auto B = matx::make_tensor<float>({size_x, size_y});\n", + "auto C = matx::make_tensor<float>({size_x, size_y});\n", + "auto D = matx::make_tensor<float>({size_x, size_y});\n", + "auto result = matx::make_tensor<float>({size_x, size_y});\n", + "\n", + "// ---- populate the data ---- //\n", + "(A = matx::random<float>(A.Shape(), matx::NORMAL)).run(exec);\n", + "(B = matx::random<float>(B.Shape(), matx::NORMAL)).run(exec);\n", + "(C = matx::random<float>(C.Shape(), matx::NORMAL)).run(exec);\n", + "(D = matx::random<float>(D.Shape(), matx::NORMAL)).run(exec);\n", + "(result = matx::zeros({size_x, size_y})).run(exec);\n", + "exec.sync();\n", + "\n", + "// ---- Reference Implementation ---- //\n", + "exec.start_timer();\n", + "(result = A*C).run(exec);\n", + "(result += B/D).run(exec);\n", + "(result += ((D-C)/B)/(A*C)).run(exec);\n", + "exec.stop_timer();\n", + "std::cout <<\"Separate Operators Runtime: \" << exec.get_time_ms() << \" ms\" << std::endl;\n", + "\n", + "// ---- Exercise: Implementation ---- //\n", + "exec.start_timer();\n", + "//\n", + "// Your implementation here:\n", + "//\n", + "exec.stop_timer();\n", + "std::cout <<\"Exercise Runtime: \" << exec.get_time_ms() << \" ms\" << std::endl;\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[02_A Solution](solutions/02_A.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercise 02_B: Black-Scholes Fusion\n", + "\n", + "The Black-Scholes model is a fantastic real-world example of a set of equations that benefits greatly from operator fusion. Its complex expressions become significantly more readable when written as individual terms, and it also benefits from fusing those separate operational parts.
Below is a brief description of the Black-Scholes model and its composite terms:\n", + "\n", + "\n", + "$$\n", + "C(S_0, K, T) = S_0 \\,\\Phi\\bigl(d_1\\bigr) \\;-\\; K \\, e^{-rT} \\,\\Phi\\bigl(d_2\\bigr),\n", + "$$\n", + "\n", + "where\n", + "\n", + "$$\n", + "d_1 = \\frac{\\ln\\!\\bigl(\\tfrac{S_0}{K}\\bigr) + \\bigl(r + \\tfrac{\\sigma^2}{2}\\bigr)T}{\\sigma \\sqrt{T}},\n", + "\\quad\n", + "d_2 = d_1 - \\sigma \\sqrt{T}.\n", + "$$\n", + "\n", + "\n", + "Here:\n", + "- \\( S_0 \\) is the current stock price\n", + "- \\( K \\) is the strike price\n", + "- \\( T \\) is the time to maturity (in years)\n", + "- \\( r \\) is the risk-free interest rate (annualized)\n", + "- \\( $\\sigma$ \\) is the volatility of the underlying stock (annualized)\n", + "- \\( $\\Phi (\\cdot)$ \\) is the cumulative distribution function (CDF) of the standard normal distribution\n", + "\n", + "\n", + "\n", + "We can easily translate this by expressing each of the terms defined above as separate MatX operators, then fusing the execution of those operators in the final run command.\n", + "\n", + "Try breaking the equation below into the following operators:\n", + "\n", + "```\n", + "VsqrtT = V * sqrt(T);\n", + "d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ;\n", + "d2 = d1 - VsqrtT;\n", + "cdf_d1 = normcdf(d1);\n", + "cdf_d2 = normcdf(d2);\n", + "expRT = exp(-1 * r * T); \n", + "```\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "auto exec = matx::CUDAExecutor();\n", + "\n", + "using dtype = double;\n", + "matx::index_t input_size = 100;\n", + "\n", + "// ---- declare input data ---- //\n", + "auto K = matx::make_tensor<dtype>({input_size});\n", + "auto S = matx::make_tensor<dtype>({input_size});\n", + "auto V = matx::make_tensor<dtype>({input_size});\n", + "auto r = matx::make_tensor<dtype>({input_size});\n", + "auto T = matx::make_tensor<dtype>({input_size});\n", + "auto output = matx::make_tensor<dtype>({input_size}); \n", + "\n", + "// ---- populate the data ---- //\n", + "(K = matx::random<dtype>(K.Shape(), matx::NORMAL)).run(exec);\n", + "(S = matx::random<dtype>(S.Shape(), matx::NORMAL)).run(exec);\n", + "(V = matx::random<dtype>(V.Shape(), matx::NORMAL)).run(exec);\n", + "(r = matx::random<dtype>(r.Shape(), matx::NORMAL)).run(exec);\n", + "(T = matx::random<dtype>(T.Shape(), matx::NORMAL)).run(exec);\n", + "(output = matx::zeros({input_size})).run(exec);\n", + "exec.sync();\n", + "\n", + "// ---- Exercise: Implementation ---- //\n", + "exec.start_timer();\n", + "//\n", + "// Your implementation here:\n", + "//\n", + "exec.stop_timer();\n", + "std::cout <<\"Exercise Runtime: \" << exec.get_time_ms() << \" ms\" << std::endl;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[02_B Solution](solutions/02_B.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Too Much of a Good Thing: Corner Cases and Limitations\n", + "Some limitations exist that prevent the fusion of all operations.
Like all CUDA programs, there is an upper ceiling on how much compute is optimal for a single kernel: the kernel's complexity drives resource utilization (such as registers and shared memory), which may ultimately harm performance.\n", + "\n", + "Similarly, some lower-level APIs utilized by MatX may not support iterators or pre/post operations, and cannot be fused at the kernel level, so they may still require intermediate memory and separate kernels.\n", + "\n", + "To resolve this, MatX allocates asynchronous memory when required to create intermediate outputs that store information between non-fusable operations. This does not require any action from the user, but it may result in sub-optimal performance if asynchronous pools are not managed appropriately. \n", + "\n", + "An example is the FFT operations, which are backed by the cuFFT library on the GPU. Without using more advanced callback techniques, each FFT/IFFT call requires memory-backed tensors to operate, requiring MatX to break up the fused operation and generate individual kernels." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "auto exec = matx::CUDAExecutor();\n", + "\n", + "matx::index_t size_x = 12;\n", + "matx::index_t size_y = 12;\n", + "\n", + "// matx::index_t size_x = 128;\n", + "// matx::index_t size_y = 256;\n", + "\n", + "auto A = matx::make_tensor<cuda::std::complex<float>>({size_x, size_y});\n", + "auto B = matx::make_tensor<cuda::std::complex<float>>({size_x, size_y});\n", + "auto result = matx::make_tensor<cuda::std::complex<float>>({size_x, size_y});\n", + "\n", + "for (int i = 0; i < 10; i++) \n", + "{ \n", + " exec.start_timer();\n", + " (A = fft(A)).run(exec);\n", + " (A = A * B).run(exec);\n", + " (A = ifft(A)).run(exec);\n", + " exec.stop_timer();\n", + "}\n", + "std::cout <<\"NonFused Runtime: \" << exec.get_time_ms() << \" ms\" << std::endl;\n", + "\n", + "\n", + "for (int i = 0; i < 10; i++) \n", + "{ \n", + " exec.start_timer();\n", + " (A = ifft(fft(A)*B)).run(exec);\n", + " exec.stop_timer();\n", + "}\n", + "std::cout <<\"Fused Runtime: \" << exec.get_time_ms() << \" ms\" << std::endl;\n", + "\n", + "for (int i = 0; i < 10; i++) \n", + "{ \n", + " exec.start_timer();\n", + " (A = fft(A)*B).run(exec);\n", + " (A = ifft(A)).run(exec);\n", + " exec.stop_timer();\n", + "}\n", + "std::cout <<\"Partial Fused Runtime: \" << exec.get_time_ms() << \" ms\" << std::endl;\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs_input/notebooks/gtc_lab/03_lab_profiling.ipynb b/docs_input/notebooks/gtc_lab/03_lab_profiling.ipynb new file mode 100644 index 00000000..62c938ce --- /dev/null +++ b/docs_input/notebooks/gtc_lab/03_lab_profiling.ipynb @@ -0,0 +1,382 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*NOTE: This notebook is configured for bash execution, not C++, so you won't be able to run the C++ code examples shown*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Profiling in MatX\n", + "Improving performance is at the heart of MatX's value, so it must provide an easy-to-implement yet powerful
capability for benchmarking and analyzing code, both at deployment and during development.\n", + "\n", + "The NVIDIA software ecosystem provides a powerful profiling suite of tools through [Nsight Systems](https://developer.nvidia.com/nsight-systems) and [Nsight Compute](https://developer.nvidia.com/nsight-compute) that allows developers to gain great insight into the performance of their code and utilization of their hardware. MatX leverages this powerful ecosystem through the [NVTX toolkit](https://github.com/NVIDIA/NVTX), which allows developers to annotate their code for use with the Nsight suite of tools. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Executor Timers\n", + "Before we get into the meat of the profiling section, let's talk about a profiling approach you used earlier. The simplest, most lightweight (but less powerful) way to profile a block of MatX code is to leverage the built-in timer in the executor.\n", + "\n", + "These methods don't integrate with the Nsight toolset, but can be useful for quick and dirty analysis:\n", + "\n", + "```c++\n", + "exec.start_timer();\n", + "(C = A * fft(B)).run(exec);\n", + "exec.stop_timer();\n", + "std::cout << \"Execution time: \" << exec.get_time_ms() << \" ms\" << std::endl;\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Nsight Profile Without and With NVTX Wrapping\n", + "Below, see an Nsight Systems report without NVTX wrapping enabled. There's lots of good information for understanding \n", + "the program runtimes, including CUDA API calls, memory transfers, and kernel runtimes:\n", + "\n", + "![Nsight report without NVTX](img/no-nvtx-report.png)\n", + "\n", + "However, there is no information that clearly correlates the profiler's data to the application's logical structure.\n", + "NVTX enables the user to define profile ranges that make identifying bottlenecks and runtimes within your specific\n", + "application much easier. See the added NVTX ranges at the bottom of the updated profile report. Some of these ranges\n", + "are user-defined within the program and some are automatically generated by MatX:\n", + "\n", + "![Nsight report with NVTX](img/with-nvtx-report.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## MatX Profiling Tools\n", + "MatX provides an NVTX API to enable native compile-in profiling capabilities. The MatX NVTX API enables a user to \n", + "easily profile all MatX calls using built-in NVTX ranges, while also providing a convenient API for the user to insert \n", + "custom ranges in their own code. This API provides many convenience features such as:\n", + "\n", + "- A convenient compile-in/compile-out macro-based API\n", + "- Verbosity levels allowing varying degrees of profiling detail\n", + "- Built-in color rotation\n", + "- Automatic scope management and range naming\n", + "- An overloaded API for manual range specification\n", + "\n", + "MatX implements its NVTX API as a set of macros, which lets users easily compile NVTX functionality into or out of their code. This completely removes any runtime penalty NVTX might cause in the most latency-sensitive deployments.\n", + "\n", + "To enable the NVTX Profiling API, simply compile with ``MATX_NVTX_FLAGS=ON`` in the cmake command." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### User Defined Ranges\n", + "User-defined NVTX ranges require the user to provide a name and unique ID for each range.
The name will appear in the NVTX range of your Nsight profiles, while the unique ID is only used internally to track your ranges during deletion. Because of this, the unique ID **must** be unique for any ranges that overlap, otherwise you may delete the incorrect range during tear-down.\n", + "\n", + "Below is an example of a user-defined NVTX range:\n", + "\n", + "```c++\n", + "using dtype = double;\n", + "index_t input_size = 10;\n", + "// index_t input_size = 10000000; // increase size to measure performance\n", + "\n", + "MATX_NVTX_START_RANGE(\"Black-Scholes Memory Allocation\", 0)\n", + "// declare input data\n", + "auto K = matx::make_tensor<dtype>({input_size});\n", + "auto S = matx::make_tensor<dtype>({input_size});\n", + "auto V = matx::make_tensor<dtype>({input_size});\n", + "auto r = matx::make_tensor<dtype>({input_size});\n", + "auto T = matx::make_tensor<dtype>({input_size});\n", + "auto output = matx::make_tensor<dtype>({input_size}); \n", + "auto referenceOutput = matx::make_tensor<dtype>({input_size}); \n", + "MATX_NVTX_END_RANGE(0)\n", + "\n", + "\n", + "MATX_NVTX_START_RANGE(\"Black-Scholes Op Creation\", 1)\n", + "// create ops\n", + "auto VsqrtT = V * sqrt(T);\n", + "auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ;\n", + "auto d2 = d1 - VsqrtT;\n", + "auto cdf_d1 = normcdf(d1);\n", + "auto cdf_d2 = normcdf(d2);\n", + "auto expRT = exp(-1 * r * T); \n", + "MATX_NVTX_END_RANGE(1)\n", + "\n", + "MATX_NVTX_START_RANGE(\"Black-Scholes Execution\", 2)\n", + "// execute ops\n", + "(output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);\n", + "MATX_NVTX_END_RANGE(2)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Automatic Ranges\n", + "Alternative versions of the macros are provided to automate handling of the MatX NVTX ranges. `MATX_NVTX_START_RANGE` has an overload that allows its use without providing a unique ID. Instead, the macro returns an ID that can be stored in an int variable and later passed to the end-range call. When NVTX ranges are compiled out, the macros simply return 0 and no action is taken on the end call.\n", + "\n", + "Below is an example using the automatic enumeration feature:\n", + "\n", + "```c++\n", + "int bc_range = MATX_NVTX_START_RANGE(\"Black-Scholes Execution\");\n", + "(output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);\n", + "MATX_NVTX_END_RANGE(bc_range);\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scope Based Ranges\n", + "A final version of the API, `MATX_NVTX_START`, is provided that matches the life of the NVTX range to the life of the scope in which it is defined. This automatically generates a unique ID and does not need to be explicitly destroyed by the user. \n", + "\n", + "Similarly, it will inherit the name of the function it is called from, so it does not require a name. This is especially useful for automating ranges for entire functions.\n", + "\n", + "An example of this API is as follows:\n", + "\n", + "```c++\n", + "void myFunction()\n", + "{\n", + " MATX_NVTX_START(\"\");\n", + " \n", + " (output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Profile Level \n", + "The MatX NVTX API supports logging levels, allowing you to fine-tune the levels of NVTX ranges that are captured at a given time.
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "### Profile Level \n",
+ "The MatX NVTX API supports logging levels, allowing you to fine-tune the levels of NVTX ranges that are captured at a given time. The logging level is checked at runtime, so it can be dynamically changed throughout program execution. A utility macro, `MATX_NVTX_SET_LOG_LEVEL(LOG_LEVEL)`, is provided to change the level.\n",
+ "\n",
+ "All events default to the log level `MATX_NVTX_LOG_USER`, and the default verbosity is `MATX_NVTX_LOG_API`. \n",
+ "\n",
+ "\n",
+ "There are 5 increasing levels of verbosity:\n",
+ "\n",
+ "```c++\n",
+ "MATX_NVTX_LOG_NONE\n",
+ "MATX_NVTX_LOG_USER\n",
+ "MATX_NVTX_LOG_API\n",
+ "MATX_NVTX_LOG_INTERNAL\n",
+ "MATX_NVTX_LOG_ALL\n",
+ "``` \n",
+ "\n",
+ "`MATX_NVTX_LOG_NONE` ensures that no ranges are recorded.\n",
+ "`MATX_NVTX_LOG_ALL` ensures all NVTX ranges are recorded.\n",
+ "\n",
+ "Any intermediate level ensures that level and all levels above it are recorded. For example, if `MATX_NVTX_LOG_API`\n",
+ "is enabled, then all events of type `MATX_NVTX_LOG_USER` **AND** `MATX_NVTX_LOG_API` will be recorded.\n"
+ ]
+ },
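+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For example, the sample applications used later in this notebook raise the verbosity to capture every range (this is the same call used in `samples/simple_radar_pipeline.cu`):\n",
+ "\n",
+ "```c++\n",
+ "// record ranges of every log level, not just USER and API\n",
+ "MATX_NVTX_SET_LOG_LEVEL( matx::matx_nvxtLogLevels::MATX_NVTX_LOG_ALL );\n",
+ "```"
+ ]
+ },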
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Application Profiling Examples\n",
+ "In this section we're going to use some pre-built applications to demonstrate how to generate an Nsight Systems profile and do some basic, high-level analysis using the Nsight Systems CLI.\n",
+ "\n",
+ "To take advantage of the full Nsight Systems profiler, you must examine the profile report with the GUI, which isn't installed in this lab. We'll show screenshots of the output you'll see, but to interact with the reports yourself, head to https://developer.nvidia.com/nsight-systems to get started."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Kernel Fusion Application\n",
+ "The first application we'll be profiling is one you've already seen before. In `samples/kernel_fusion.cu`, we've implemented a simple application that demonstrates the same concepts learned in the earlier section about operator fusion.\n",
+ "\n",
+ "Specifically, we implement 2 ranges, looping over each 10 times to get an accurate timing analysis:\n",
+ "\n",
+ "```c++\n",
+ "// first individual, independent kernels\n",
+ "int unfused_range = MATX_NVTX_START_RANGE(\"Unfused Kernels\");\n",
+ "(result = cos(C)).run(exec);\n",
+ "(result = result / D).run(exec);\n",
+ "(result = result * B).run(exec);\n",
+ "MATX_NVTX_END_RANGE(unfused_range);\n",
+ "\n",
+ "// now, as a fused operation\n",
+ "int fused_range = MATX_NVTX_START_RANGE(\"Fused Operation\");\n",
+ "(A = B * cos(C)/D).run(exec);\n",
+ "MATX_NVTX_END_RANGE(fused_range);\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Run the cell below to generate an Nsight Systems profile report on the application, which is saved as `samples/kernel_fusion_report.nsys-rep`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nsys profile -o ./samples/kernel_fusion_report.nsys-rep ./samples/kernel_fusion"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "(Note: If for some reason, the program fails or the profile report isn't generated, there's a pre-loaded profile at `samples/backup_kernel_fusion_report.nsys-rep`)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This `.nsys-rep` file is what is used by Nsight Systems for profiling and is what you would load into the Nsight Systems GUI. To see some high-level statistics, let's use the CLI in the cell below:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nsys stats ./samples/kernel_fusion_report.nsys-rep"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the top section, `** NVTX Range Summary`, take note of the `MatX:Unfused Kernels` and `MatX:Fused Operation` ranges on the right. You should see something like:\n",
+ "```\n",
+ " ** NVTX Range Summary (nvtxsum):\n",
+ "\n",
+ "Time (%) Total Time (ns) Instances Avg (ns) Med (ns) Min (ns) Max (ns) StdDev (ns) Style Range \n",
+ "-------- --------------- --------- ----------- -------- -------- ---------- ----------- -------- -------------------------------------------------------------------------------------\n",
+ "...\n",
+ " 1.0 338,264 10 33,826.4 31,930.0 30,941 44,218 4,231.5 StartEnd MatX:Unfused Kernels\n",
+ " 0.4 145,858 10 14,585.8 13,614.0 13,019 21,775 2,730.8 StartEnd MatX:Fused Operation\n",
+ "...\n",
+ "```\n",
+ "\n",
+ "From the averages above, the fused version runs roughly 2.3x faster than the unfused sequence (about 14,586 ns vs. 33,826 ns per iteration)."
+ ]
+ },
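+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "If you only want the NVTX table, `nsys stats` can be limited to a single report with the `--report` flag. The report name `nvtxsum` comes straight from the summary header above (on newer Nsight Systems versions it may be spelled `nvtx_sum`):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nsys stats --report nvtxsum ./samples/kernel_fusion_report.nsys-rep"
+ ]
+ },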
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now let's look at the output you'd see in the Nsight Systems GUI. This plot shows a high-level view of all 10 iterations we just ran:\n",
+ "\n",
+ "![Fusion High Level](img/kernel-fusion-highlevel.png)\n",
+ "\n",
+ "We can zoom down to a single iteration to compare the two ranges:\n",
+ "\n",
+ "![Fusion Low Level](img/kernel-fusion-lowlevel.png)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Simple Radar Application\n",
+ "To demonstrate the power of NVTX ranges, we'll use a more complex example: the [Simple Radar Pipeline](https://github.com/NVIDIA/MatX/blob/main/examples/simple_radar_pipeline.cu) that comes with the MatX example codes. This pipeline showcases both the powerful acceleration MatX provides and the granular insight into performance we gain through the MatX NVTX API.\n",
+ "\n",
+ "You can view the files at `./samples/simple_radar_pipeline.cu` and `./samples/simple_radar_pipeline.h`.\n",
+ "\n",
+ "The pipeline is made up of 4 stages:\n",
+ "1. Pulse Compression - An FFT, a dot matrix multiply, and an inverse FFT\n",
+ "2. Three Pulse Canceller - A 1D convolution\n",
+ "3. Doppler Processing - A dot matrix multiply with a Hamming window and an FFT\n",
+ "4. CFAR Detection - An element-wise magnitude-squared, a 2D convolution, a dot matrix divide\n",
+ "\n",
+ "Which operations do you think will take the longest? The fastest?\n",
+ "\n",
+ "Run the cell below to generate a profile report."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nsys profile -o ./samples/simple_radar_pipeline_report.nsys-rep ./samples/simple_radar_pipeline"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "(Note: If for some reason, the program fails or the profile report isn't generated, there's a pre-loaded profile at `samples/backup_simple_radar_pipeline_report.nsys-rep`)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "See below for a high-level view of the profile output (this one is a lot more complicated!):\n",
+ "\n",
+ "![Radar High-Level](img/radar-highlevel.png)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here's a look, zoomed into a single pass-through of all 4 stages of the pipeline:\n",
+ "\n",
+ "![Radar Pipeline](img/radar-pipeline.png)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And zoomed in even further to just look at the Pulse Compression stage:\n",
+ "\n",
+ "![Radar Pulse Compression](img/radar-pulsecompression.png)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, run the cell below to see the CLI output for some high-level statistics:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "nsys stats ./samples/simple_radar_pipeline_report.nsys-rep"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Bash",
+ "language": "bash",
+ "name": "bash"
+ },
+ "language_info": {
+ "codemirror_mode": "shell",
+ "file_extension": ".sh",
+ "mimetype": "text/x-sh",
+ "name": "bash"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/docs_input/notebooks/img/dli-clone-col.png b/docs_input/notebooks/gtc_lab/img/dli-clone-col.png similarity index 100% rename from docs_input/notebooks/img/dli-clone-col.png rename to docs_input/notebooks/gtc_lab/img/dli-clone-col.png
diff --git a/docs_input/notebooks/img/dli-clone.png b/docs_input/notebooks/gtc_lab/img/dli-clone.png similarity index 100% rename from docs_input/notebooks/img/dli-clone.png rename to docs_input/notebooks/gtc_lab/img/dli-clone.png
diff --git a/docs_input/notebooks/gtc_lab/img/dli-fusion.png b/docs_input/notebooks/gtc_lab/img/dli-fusion.png new file mode 100755 index 00000000..5b5366a7 Binary files /dev/null and b/docs_input/notebooks/gtc_lab/img/dli-fusion.png differ
diff --git a/docs_input/notebooks/img/dli-matx-overview.png b/docs_input/notebooks/gtc_lab/img/dli-matx-overview.png similarity index 100% rename from docs_input/notebooks/img/dli-matx-overview.png rename to docs_input/notebooks/gtc_lab/img/dli-matx-overview.png
diff --git a/docs_input/notebooks/img/dli-slice.png b/docs_input/notebooks/gtc_lab/img/dli-slice.png similarity index 100% rename from docs_input/notebooks/img/dli-slice.png rename to docs_input/notebooks/gtc_lab/img/dli-slice.png
diff --git a/docs_input/notebooks/img/dli-slice_col.png b/docs_input/notebooks/gtc_lab/img/dli-slice_col.png similarity index 100% rename from docs_input/notebooks/img/dli-slice_col.png rename to docs_input/notebooks/gtc_lab/img/dli-slice_col.png
diff --git a/docs_input/notebooks/img/dli-slice_row.png b/docs_input/notebooks/gtc_lab/img/dli-slice_row.png similarity index 100% rename from docs_input/notebooks/img/dli-slice_row.png rename to docs_input/notebooks/gtc_lab/img/dli-slice_row.png
diff --git a/docs_input/notebooks/img/dli-transpose.png
b/docs_input/notebooks/gtc_lab/img/dli-transpose.png similarity index 100% rename from docs_input/notebooks/img/dli-transpose.png rename to docs_input/notebooks/gtc_lab/img/dli-transpose.png
diff --git a/docs_input/notebooks/gtc_lab/img/kernel-fusion-highlevel.png b/docs_input/notebooks/gtc_lab/img/kernel-fusion-highlevel.png new file mode 100755 index 00000000..62136a9d Binary files /dev/null and b/docs_input/notebooks/gtc_lab/img/kernel-fusion-highlevel.png differ
diff --git a/docs_input/notebooks/gtc_lab/img/kernel-fusion-lowlevel.png b/docs_input/notebooks/gtc_lab/img/kernel-fusion-lowlevel.png new file mode 100755 index 00000000..76d3e2ac Binary files /dev/null and b/docs_input/notebooks/gtc_lab/img/kernel-fusion-lowlevel.png differ
diff --git a/docs_input/notebooks/gtc_lab/img/no-nvtx-report.png b/docs_input/notebooks/gtc_lab/img/no-nvtx-report.png new file mode 100755 index 00000000..91396ecc Binary files /dev/null and b/docs_input/notebooks/gtc_lab/img/no-nvtx-report.png differ
diff --git a/docs_input/notebooks/gtc_lab/img/radar-highlevel.png b/docs_input/notebooks/gtc_lab/img/radar-highlevel.png new file mode 100755 index 00000000..6f5c0718 Binary files /dev/null and b/docs_input/notebooks/gtc_lab/img/radar-highlevel.png differ
diff --git a/docs_input/notebooks/gtc_lab/img/radar-pipeline.png b/docs_input/notebooks/gtc_lab/img/radar-pipeline.png new file mode 100755 index 00000000..870d12cc Binary files /dev/null and b/docs_input/notebooks/gtc_lab/img/radar-pipeline.png differ
diff --git a/docs_input/notebooks/gtc_lab/img/radar-pulsecompression.png b/docs_input/notebooks/gtc_lab/img/radar-pulsecompression.png new file mode 100755 index 00000000..e36f705a Binary files /dev/null and b/docs_input/notebooks/gtc_lab/img/radar-pulsecompression.png differ
diff --git a/docs_input/notebooks/gtc_lab/img/restart_kernel.png b/docs_input/notebooks/gtc_lab/img/restart_kernel.png new file mode 100755 index 00000000..4535dd18 Binary files /dev/null and b/docs_input/notebooks/gtc_lab/img/restart_kernel.png differ
diff --git a/docs_input/notebooks/gtc_lab/img/with-nvtx-report.png b/docs_input/notebooks/gtc_lab/img/with-nvtx-report.png new file mode 100755 index 00000000..2316fece Binary files /dev/null and b/docs_input/notebooks/gtc_lab/img/with-nvtx-report.png differ
diff --git a/docs_input/notebooks/gtc_lab/run.sh b/docs_input/notebooks/gtc_lab/run.sh new file mode 100755 index 00000000..f0155066 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/run.sh @@ -0,0 +1,16 @@
+#!/bin/bash
+USER_ID=$(id -u)
+GROUP_ID=$(id -g)
+#IMAGE_NAME=gitlab-master.nvidia.com:5005/tylera/playground/gtc-lab:latest
+IMAGE_NAME=gitlab-master.nvidia.com:5005/devtech-compute/sigx-group/container/gtc-lab:lite
+
+LAB_FOLDER="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
+MATX_ROOT_DIR="${LAB_FOLDER%/*/*/*}/"
+SCRATCH_DIR="/$(echo "$LAB_FOLDER" | cut -d'/' -f2-3)/"
+
+docker run -it --rm \
+ -p 8888:8888 \
+ --gpus all \
+ --ipc=host \
+ -w /MatX/docs_input/notebooks/gtc_lab \
+ $IMAGE_NAME
diff --git a/docs_input/notebooks/gtc_lab/running_outside_gtc.md b/docs_input/notebooks/gtc_lab/running_outside_gtc.md new file mode 100644 index 00000000..9093054c --- /dev/null +++ b/docs_input/notebooks/gtc_lab/running_outside_gtc.md @@ -0,0 +1,14 @@
+# Notebook Startup (Not needed for GTC Lab)
+
+## Container Startup
+Start the container with all normal options, adding `-p 8888:8888`.
+
+A sample `run.sh` script is provided in `MatX/docs_input/notebooks/gtc_lab`.
+
+## Start Jupyter server locally in container
+`jupyter notebook --ip=0.0.0.0 --port=8888 --no-browser --allow-root`
+
+Copy the token from the server startup output (the local URL should look something like):
+`http://127.0.0.1:8888/tree?token=a3ad60a152dcafe98d4eaecc22bd773b38f1e6e93312adae`
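+
+Alternatively, if the container runs on a remote machine, an SSH tunnel from your workstation forwards the port without exposing it publicly (the host name below is a placeholder):
+
+`ssh -L 8888:localhost:8888 user@remote-host`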
+
+The printed URL uses `127.0.0.1`; since the server binds to all interfaces (`--ip=0.0.0.0`), you can reach it from another machine by replacing that IP with the host's address. \ No newline at end of file
diff --git a/docs_input/notebooks/gtc_lab/samples/backup_kernel_fusion_report.nsys-rep b/docs_input/notebooks/gtc_lab/samples/backup_kernel_fusion_report.nsys-rep new file mode 100644 index 00000000..ff219198 Binary files /dev/null and b/docs_input/notebooks/gtc_lab/samples/backup_kernel_fusion_report.nsys-rep differ
diff --git a/docs_input/notebooks/gtc_lab/samples/backup_simple_radar_pipeline_report.nsys-rep b/docs_input/notebooks/gtc_lab/samples/backup_simple_radar_pipeline_report.nsys-rep new file mode 100644 index 00000000..7052b596 Binary files /dev/null and b/docs_input/notebooks/gtc_lab/samples/backup_simple_radar_pipeline_report.nsys-rep differ
diff --git a/docs_input/notebooks/gtc_lab/samples/kernel_fusion b/docs_input/notebooks/gtc_lab/samples/kernel_fusion new file mode 100755 index 00000000..fae10974 Binary files /dev/null and b/docs_input/notebooks/gtc_lab/samples/kernel_fusion differ
diff --git a/docs_input/notebooks/gtc_lab/samples/kernel_fusion.cu b/docs_input/notebooks/gtc_lab/samples/kernel_fusion.cu new file mode 100644 index 00000000..c12d9b97 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/samples/kernel_fusion.cu @@ -0,0 +1,46 @@
+#include "matx.h"
+
+int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
+{
+ MATX_ENTER_HANDLER();
+
+ cudaStream_t stream = 0;
+ matx::cudaExecutor exec{stream};
+
+ // // manually set to log all NVTX levels
+ // MATX_NVTX_SET_LOG_LEVEL( matx::matx_nvxtLogLevels::MATX_NVTX_LOG_ALL );
+
+ matx::index_t size_x = 128;
+ matx::index_t size_y = 256;
+
+ auto A = matx::make_tensor<float>({size_x, size_y});
+ auto B = matx::make_tensor<float>({size_x, size_y});
+ auto C = matx::make_tensor<float>({size_x, size_y});
+ auto D = matx::make_tensor<float>({size_x, size_y});
+ auto result = matx::make_tensor<float>({size_x, size_y});
+
+ // run once to warm-up
+ (result = cos(C)).run(exec);
+ (result = result / D).run(exec);
+ (result = result * B).run(exec);
+ (A = B * cos(C)/D).run(exec);
+ cudaStreamSynchronize(stream);
+
+ for (int i = 0; i < 10; i++) {
+
+ // first individual, independent kernels
+ int unfused_range = MATX_NVTX_START_RANGE("Unfused Kernels");
+ (result = cos(C)).run(exec);
+ (result = result / D).run(exec);
+ (result = result * B).run(exec);
+ MATX_NVTX_END_RANGE(unfused_range);
+
+ // now, as a fused operation
+ int fused_range = MATX_NVTX_START_RANGE("Fused Operation");
+ (A = B * cos(C)/D).run(exec);
+ MATX_NVTX_END_RANGE(fused_range);
+ }
+
+ MATX_EXIT_HANDLER();
+} \ No newline at end of file
diff --git a/docs_input/notebooks/gtc_lab/samples/simple_radar_pipeline b/docs_input/notebooks/gtc_lab/samples/simple_radar_pipeline new file mode 100755 index 00000000..5a96a22c Binary files /dev/null and b/docs_input/notebooks/gtc_lab/samples/simple_radar_pipeline differ
diff --git a/docs_input/notebooks/gtc_lab/samples/simple_radar_pipeline.cu b/docs_input/notebooks/gtc_lab/samples/simple_radar_pipeline.cu new file mode 100644 index 00000000..cc4bb356 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/samples/simple_radar_pipeline.cu @@ -0,0 +1,161 @@
+////////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +///////////////////////////////////////////////////////////////////////////////// + +#include "simple_radar_pipeline.h" + +int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) +{ + MATX_ENTER_HANDLER(); + index_t numChannels = 16; + index_t numPulses = 128; + index_t numSamples = 9000; + index_t waveformLength = 1000; + constexpr bool ENABLE_GRAPHS = false; + uint32_t iterations = 100; + constexpr int num_streams = 1; + cudaGraph_t graphs[num_streams]; + cudaGraphExec_t instances[num_streams]; + using complex = cuda::std::complex; + RadarPipeline *pipelines[num_streams]; + + std::cout << "Iterations: " << iterations << std::endl; + std::cout << "numChannels: " << numChannels << std::endl; + std::cout << "numPulses: " << numPulses << std::endl; + std::cout << "numSamples: " << numSamples << std::endl; + std::cout << "waveformLength: " << waveformLength << std::endl; + + // cuda stream to place work in + cudaStream_t streams[num_streams]; + + // manually set to log all NVTX levels + MATX_NVTX_SET_LOG_LEVEL( matx_nvxtLogLevels::MATX_NVTX_LOG_ALL ); + + // create some events for timing + cudaEvent_t starts[num_streams]; + cudaEvent_t stops[num_streams]; + + for (int s = 0; s < num_streams; s++) { + cudaEventCreate(&starts[s]); + cudaEventCreate(&stops[s]); + cudaStreamCreate(&streams[s]); + + MATX_NVTX_START_RANGE("Pipeline Initialize", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 1) + printf("Initializing data structures for stream %d...\n", s); + pipelines[s] = new RadarPipeline(numPulses, numSamples, waveformLength, numChannels, streams[s]); + MATX_NVTX_END_RANGE(1) + + pipelines[s]->sync(); + } + + MATX_NVTX_START_RANGE("Pipeline Test", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 2) + printf("Running test...\n"); + + auto run_pipeline = [&](int s) { + MATX_NVTX_START_RANGE("PulseCompression", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 21) + pipelines[s]->PulseCompression(); + MATX_NVTX_END_RANGE(21) + + 
MATX_NVTX_START_RANGE("ThreePulseCanceller", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 22) + pipelines[s]->ThreePulseCanceller(); + MATX_NVTX_END_RANGE(22) + + MATX_NVTX_START_RANGE("DopplerProcessing", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 23) + pipelines[s]->DopplerProcessing(); + MATX_NVTX_END_RANGE(23) + + MATX_NVTX_START_RANGE("CFARDetections", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 24) + pipelines[s]->CFARDetections(); + MATX_NVTX_END_RANGE(24) + }; + + // Warmup + for (int s = 0; s < num_streams; s++) { + run_pipeline(s); + } + + if (ENABLE_GRAPHS) { + for (int s = 0; s < num_streams; s++) { + cudaStreamBeginCapture(streams[s], cudaStreamCaptureModeGlobal); + run_pipeline(s); + cudaStreamEndCapture(streams[s], &graphs[s]); + cudaGraphInstantiate(&instances[s], graphs[s], NULL, NULL, 0); + } + } + + for (uint32_t i = 0; i < iterations; i++) { + for (int s = 0; s < num_streams; s++) { + if (i == 1) { + cudaEventRecord(starts[s], streams[s]); + } + + if (ENABLE_GRAPHS) { + cudaGraphLaunch(instances[s], streams[s]); + } + else { + run_pipeline(s); + } + } + } + + for (int s = 0; s < num_streams; s++) { + cudaEventRecord(stops[s], streams[s]); + pipelines[s]->sync(); + } + MATX_NVTX_END_RANGE(2) + + MATX_NVTX_START_RANGE("Pipeline Results", matx_nvxtLogLevels::MATX_NVTX_LOG_USER, 3) + float time_ms; + cudaEventElapsedTime(&time_ms, starts[num_streams-1], stops[num_streams-1]); + float time_s = time_ms * .001f; + + auto mult = iterations * numChannels * numPulses * num_streams; + printf("Pipeline finished in %.2fms, rate: %.2f pulses/channel/sec (%.2f Gbps)\n", + time_ms, + static_cast(mult) / time_s, + static_cast(mult*sizeof(complex)*numSamples*8)/time_s/1e9); + +for (int s = 0; s < num_streams; s++) { + cudaEventDestroy(starts[s]); + cudaEventDestroy(stops[s]); + cudaStreamDestroy(streams[s]); +} + + cudaDeviceSynchronize(); + MATX_CUDA_CHECK_LAST_ERROR(); + + matxPrintMemoryStatistics(); + + printf("Done\n"); + MATX_NVTX_END_RANGE(3) + MATX_EXIT_HANDLER(); + return 0; +} diff --git a/docs_input/notebooks/gtc_lab/samples/simple_radar_pipeline.h b/docs_input/notebooks/gtc_lab/samples/simple_radar_pipeline.h new file mode 100644 index 00000000..0f52ccf2 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/samples/simple_radar_pipeline.h @@ -0,0 +1,469 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +///////////////////////////////////////////////////////////////////////////////// + +#pragma once +#include "matx.h" +#include +#include + +using namespace matx; + +/** + * @brief Custom operator for calculating detection positions + * + * @tparam O output tensor type + * @tparam I1 Input power tensor type + * @tparam I2 Input burst averages tensor type + * @tparam I3 Input norm tensor type + * @tparam I4 Input probability of false alarm tensor type + */ +template +class calcDets : public BaseOp> { +private: + O out_; + I1 xpow_; + I2 ba_; + I3 norm_; + I4 pfa_; + +public: + + /** + * @brief Construct a new calcDets object + * + * @param out output tensor + * @param xpow Input power tensor + * @param ba Input burst averages tensor + * @param norm Input norm tensor + * @param pfa Input probability of false alarm tensor + */ + calcDets(O out, I1 xpow, I2 ba, I3 norm, I4 pfa) + : out_(out), xpow_(xpow), ba_(ba), norm_(norm), pfa_(pfa) + { + } + + /** + * @brief Get detection value at position + * + * @param idz Z position + * @param idy Y position + * @param idx X position + */ + __device__ inline void operator()(index_t idz, index_t idy, index_t idx) + { + typename I1::type xpow = xpow_(idz, idy, idx); + typename I2::type ba = ba_(idz, idy, idx); + typename I2::type norm = norm_(idz, idy, idx); + typename I2::type alpha = norm * (cuda::std::powf(pfa_, -1.0f / norm) - 1.f); + out_(idz, idy, idx) = (xpow > alpha * ba) ? 
1 : 0; + } + + /** + * @brief Get size of detection tensor across dimension + * + * @param i dimension + * @return Size of dimension + */ + __host__ __device__ inline index_t Size(uint32_t i) const + { + return out_.Size(i); + } + + /** + * @brief Return rank of detection tensor + * + * @return Rank of tensor + */ + static inline constexpr __host__ __device__ int32_t Rank() + { + return O::Rank(); + } +}; + +/** + * @brief Radar Pipeline object + * + * @tparam ComplexType type of complex value + */ +template > +class RadarPipeline { +public: + RadarPipeline() = delete; + ~RadarPipeline() + { + + } + + /** + * @brief Construct a new Radar Pipeline object + * + * @param _numPulses Number of pulses + * @param _numSamples Number of samples per pulse + * @param _wfLen Waveform length + * @param _numChannels Number of channels + * @param _stream CUDA stream + */ + RadarPipeline(const index_t _numPulses, const index_t _numSamples, + index_t _wfLen, index_t _numChannels, cudaStream_t _stream) + : numPulses(_numPulses), numSamples(_numSamples), waveformLength(_wfLen), + numChannels(_numChannels), stream(_stream), exec(_stream) + { + numSamplesRnd = 1; + while (numSamplesRnd < numSamples) { + numSamplesRnd *= 2; + } + + numPulsesRnd = 1; + while (numPulsesRnd <= numPulses) { + numPulsesRnd *= 2; + } + + numCompressedSamples = numSamples - waveformLength + 1; + + // waveform is of length waveform data but we pad to numSamples for fft + make_tensor(waveformView, {numSamplesRnd}); + make_tensor(norms); + make_tensor(inputView, + {numChannels, numPulses, numSamplesRnd}); + make_tensor(tpcView, + {numChannels, numPulsesRnd, numCompressedSamples}); + make_tensor(cancelMask, {3}); + make_tensor(normT, + {numChannels, numPulsesRnd + cfarMaskY - 1, + numCompressedSamples + cfarMaskX - 1}); + make_tensor(ba, + {numChannels, numPulsesRnd + cfarMaskY - 1, + numCompressedSamples + cfarMaskX - 1}); + make_tensor(dets, + {numChannels, numPulsesRnd, numCompressedSamples}); + make_tensor(xPow, + {numChannels, numPulsesRnd, numCompressedSamples}); + + cudaMemset(waveformView.Data(), 0, numSamplesRnd * sizeof(ComplexType)); + cudaMemset(inputView.Data(), 0, + inputView.TotalSize() * sizeof(ComplexType)); + cudaMemset(tpcView.Data(), 0, tpcView.TotalSize() * sizeof(ComplexType)); + + cancelMask.SetVals({1, -2, 1}); + + make_tensor(cfarMaskView, + {cfarMaskY, cfarMaskX}); + // Mask for cfar detection + // G == guard, R == reference, C == CUT + // mask = [ + // R R R R R ; + // R R R R R ; + // R R R R R ; + // R R R R R ; + // R R R R R ; + // R G G G R ; + // R G C G R ; + // R G G G R ; + // R R R R R ; + // R R R R R ; + // R R R R R ; + // R R R R R ; + // R R R R R ]; + // } + cfarMaskView.SetVals({{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1}, + {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}); + + // Pre-process CFAR convolution + (normT = conv2d(ones({numChannels, numPulsesRnd, numCompressedSamples}), + cfarMaskView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); + + cancelMask.PrefetchDevice(stream); + ba.PrefetchDevice(stream); + normT.PrefetchDevice(stream); + cfarMaskView.PrefetchDevice(stream); + dets.PrefetchDevice(stream); + waveformView.PrefetchDevice(stream); + norms.PrefetchDevice(stream); + inputView.PrefetchDevice(stream); + tpcView.PrefetchDevice(stream); + xPow.PrefetchDevice(stream); + } + + /** + * @brief Sync the pipeline using the underlying executor + * + */ + void sync() { + 
exec.sync(); + } + + /** + * @brief Stage 1 - Pulse compression - convolution via FFTs + * + * Pulse compression achieves high range resolution by applying intra-pulse + * modulation during transmit followed by applying a matched filter after + * reception. References: + * Richards, M. A., Scheer, J. A., Holm, W. A., "Principles of Modern + * Radar: Basic Principles", SciTech Publishing, Inc., 2010. Chapter 20. + * Also, http://en.wikipedia.org/wiki/Pulse_compression + */ + void PulseCompression() + { + // reshape waveform to be waveformLength + auto waveformPart = slice(waveformView, {0}, {waveformLength}); + auto waveformT = + waveformView.template Clone<3>({numChannels, numPulses, matxKeepDim}); + + auto waveformFull = slice(waveformView, {0}, {numSamplesRnd}); + + auto x = inputView; + + // create waveform (assuming waveform is the same for every pulse) + // this allows us to precompute waveform in frequency domain + // Apply a Hamming window to the waveform to suppress sidelobes. Other + // windows could be used as well (e.g., Taylor windows). Ultimately, it is + // just an element-wise weighting by a pre-computed window function. + (waveformPart = waveformPart * hamming<0>({waveformLength})).run(exec); + + // compute L2 norm + (norms = sum(abs2(waveformPart))).run(exec); + (norms = sqrt(norms)).run(exec); + + (waveformPart = waveformPart / norms).run(exec); + (waveformFull = fft(waveformPart, numSamplesRnd)).run(exec); + (waveformFull = conj(waveformFull)).run(exec); + + (x = fft(x)).run(exec); + (x = x * waveformT).run(exec); + (x = ifft(x)).run(exec); + } + + + /** + * @brief Stage 2 - Three-pulse canceller - 1D convolution + * + * The three-pulse canceller is a simple high-pass filter designed to suppress + * background, or "clutter", such as the ground and other non-moving objects. + * The three-pulse canceller is a pair of two-pulse cancellers implemented in + * a single stage. A two-pulse canceller just computes the difference between + * two subsequent pulses at each range bin. Thus, the two pulse canceller is + * equivalent to convolution in the pulse dimension with [1 -1] and the + * three-pulse canceller is convolution in the pulse dimension with [1 -2 1] + * ([1 -2 1] is just the convolution of [1 -1] with [1 -1], so it is + * effectively a sequence of two two-pulse cancellers). + * References: + * Richards, M. A., Scheer, J. A., Holm, W. A., "Principles of Modern Radar: + * Basic Principles", + * SciTech Publishing, Inc., 2010. Section 17.4. + */ + void ThreePulseCanceller() + { + auto x = slice(inputView.Permute({0, 2, 1}), + {0, 0, 0}, {numChannels, numCompressedSamples, numPulses}); + auto xo = slice(tpcView.Permute({0, 2, 1}), + {0, 0, 0}, {numChannels, numCompressedSamples, numPulses}); + (xo = conv1d(x, cancelMask, matxConvCorrMode_t::MATX_C_MODE_SAME)).run(exec); + } + + /** + * @brief Stage 3 - Doppler Processing - FFTs in pulse + * + * Doppler processing converts the range-pulse data to range-Doppler data via + * an FFT in the Doppler dimension. Explicit spectral analysis can then be + * performed, such as the detector that will follow as stage 4. + * References: + * Richards, M. A., Scheer, J. A., Holm, W. A., "Principles of Modern Radar: + * Basic Principles", + * SciTech Publishing, Inc., 2010. Section 17.5. + * + * Apply a window in pulse to suppress sidelobes. Using a Hamming window for + * simplicity, but others would work. repmat(). 
+ */ + void DopplerProcessing() + { + const index_t cpulses = numPulses - (cancelMask.Size(0) - 1); + + auto xc = + slice(tpcView, {0, 0, 0}, {numChannels, cpulses, numCompressedSamples}); + + auto xf = tpcView.Permute({0, 2, 1}); + + (xc = xc * hamming<1>({numChannels, numPulses - (cancelMask.Size(0) - 1), + numCompressedSamples})) + .run(exec); + (xf = fft(xf)).run(exec); + } + + /** + * @brief Stage 4 - Constant False Alarm Rate (CFAR) Detector - averaging or median + * + * filter CFAR detectors in general are designed to provide constant false + * alarm rates by dynamically adjusting detection thresholds based on certain + * statistical assumptions and interference estimates made from the data. + * References: + * Richards, M. A., Scheer, J. A., Holm, W. A., "Principles of Modern Radar: + * Basic Principles", + * SciTech Publishing, Inc., 2010. Section 16.4. + * Richards, M. A., "Fundamentals of Radar Signal Processing", McGraw-Hill, + * 2005. + * Chapter 7. alpha below corresponds to equation (7.17) + * Also, http://en.wikipedia.org/wiki/Constant_false_alarm_rate + + * CFAR works by using a training window to average cells "near" a cell + * under test (CUT) to estimate the background power for that cell. It is + * an assumption that the average of the nearby cells represents a + * reasonable background estimate. In general, there are guard cells (G) + * and reference cells (R) around the CUT. The guard cells prevent + * contributions of a potential target in the CUT from corrupting the + * background estimate. More reference cells are preferred to better + * estimate the background average. As implemented below, the CUT and + * guard cells form a hole within the training window, but CA-CFAR is + * largely just an averaging filter otherwise with a threshold check + * at each pixel after applying the filter. + * Currently, the window below is defined statically because it is then + * easy to visualize, but more typically the number of guard and + * reference cells would be given as input and the window would be + * constructed; we could update to such an approach, but I'm keeping + * it simple for now. + + * We apply CFAR to the power of X; X is still complex until this point + * Xpow = abs(X).^2; + */ + void CFARDetections() + { + (xPow = abs2(tpcView)).run(exec); + + // Estimate the background average power in each cell + // background_averages = conv2(Xpow, mask, 'same') ./ norm; + (ba = conv2d(xPow, cfarMaskView, matxConvCorrMode_t::MATX_C_MODE_FULL)).run(exec); + + // Computing number of cells contributing to each cell. + // This can be done with a convolution of the cfarMask with + // ones. + // norm = conv2(ones(size(X)), mask, 'same'); + auto normTrim = slice(normT, {0, cfarMaskY / 2, cfarMaskX / 2}, + {numChannels, numPulsesRnd + cfarMaskY / 2, + numCompressedSamples + cfarMaskX / 2}); + + auto baTrim = slice(ba, {0, cfarMaskY / 2, cfarMaskX / 2}, + {numChannels, numPulsesRnd + cfarMaskY / 2, + numCompressedSamples + cfarMaskX / 2}); + (baTrim = baTrim / normTrim).run(exec); + + // The scalar alpha is used as a multiplier on the background averages + // to achieve a constant false alarm rate (under certain assumptions); + // it is based upon the desired probability of false alarm (Pfa) and + // number of reference cells used to estimate the background for the + // CUT. For the purposes of computation, it can be assumed as a given + // constant, although it does vary at the edges due to the different + // training windows. 
+ // Declare a detection if the power exceeds the background estimate + // times alpha for a particular cell. + // dets(find(Xpow > alpha.*background_averages)) = 1; + + // These 2 branches are functionally equivalent. A custom op is more + // efficient as it can avoid repeated loads. +#if 0 + IFELSE(xPow > normTrim*(pow(pfa, -1.0f/normTrim) - 1.0f)*baTrim, + dets = 1, dets = 0).run(exec); +#else + calcDets(dets, xPow, baTrim, normTrim, pfa).run(exec); +#endif + } + + /** + * @brief Get the Input View object + * + * @return tensor_t view + */ + auto GetInputView() { return inputView; } + + /** + * @brief Get waveform view + * + * @return tensor_t view + */ + auto GetwaveformView() { return waveformView; } + + /** + * @brief Get TPC view + * + * @return tensor_t view + */ + auto GetTPCView() { return tpcView; } + + /** + * @brief Get the Detections object + * + * @return tensor_t view + */ + auto GetDetections() { return dets; } + + /** + * @brief Get the Background Averages object + * + * @return tensor_t view + */ + auto GetBackgroundAverages() { return ba; } + + /** + * @brief Get norm object + * + * @return tensor_t view + */ + auto GetnormT() { return normT; } + +private: + index_t numPulses; + index_t numSamples; + index_t waveformLength; + index_t numSamplesRnd; + index_t numPulsesRnd; + index_t numCompressedSamples; + index_t numChannels; + const index_t cfarMaskX = 13; + const index_t cfarMaskY = 5; + + static const constexpr float pfa = 1e-5f; + + tensor_t normT; + tensor_t ba; + tensor_t dets; + tensor_t cancelMask; + tensor_t xPow; + tensor_t waveformView; + tensor_t norms; + tensor_t inputView; + tensor_t tpcView; + tensor_t cfarMaskView; + + cudaStream_t stream; + cudaExecutor exec; +}; diff --git a/docs_input/notebooks/gtc_lab/solutions/01_A.ipynb b/docs_input/notebooks/gtc_lab/solutions/01_A.ipynb new file mode 100644 index 00000000..1f6b8fd3 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/solutions/01_A.ipynb @@ -0,0 +1,49 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Exercise 01_A: Creating your first tensor\n", + "\n", + "Try defining a new integer tensor of size `{3, 5}` and initialize its values in increasing order from 0 to 15. Once defined, print your tensor to ensure the values are as expected. Next, updated element (1,2) to 101. Print the tensor again to ensure the update was valid." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%run_matx\n", + "\n", + "// Declare a tensor\n", + "auto myTensor = matx::make_tensor({3, 5});\n", + "\n", + "// SetVals in myTensor\n", + "myTensor.SetVals({{0, 1, 2, 3, 4}, {5, 6, 7, 8, 9}, {10, 11, 12, 13, 14}});\n", + "\n", + "// Print your new tensor\n", + "matx::print(myTensor);\n", + "\n", + "// Update the value at {1,1} to 101\n", + "myTensor(1, 2) = 101;\n", + "\n", + "// Print the tensor again\n", + "matx::print(myTensor);\n", + "\n", + "// An alternative solution is to use `linspace` and treat the original tensor as a flat 1D tensor. 
`linspace` takes the total elements, start, and end.\n",
+ "auto reshaped_op = matx::reshape(matx::linspace<0>({15}, 0, 14), {3, 5});\n",
+ "matx::print(reshaped_op);\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs_input/notebooks/gtc_lab/solutions/01_B.ipynb b/docs_input/notebooks/gtc_lab/solutions/01_B.ipynb new file mode 100644 index 00000000..6ddb0eef --- /dev/null +++ b/docs_input/notebooks/gtc_lab/solutions/01_B.ipynb @@ -0,0 +1,58 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Exercise 01_B: Operator Views\n",
+ "Let's demonstrate your new skills in creating views of a tensor. Using the pre-defined `baseTensor2D`, please create the following views:\n",
+ "\n",
+ "- The complete first row of the `baseTensor2D`\n",
+ "- A 2D square of 4 elements, composed of the first 2 rows and 2 columns of data\n",
+ "- Assign the value 87 to element (1,1) of `baseTensor2D` and observe which views reflect the change.\n",
+ "\n",
+ "Print the output at each stage to ensure your views are working as expected."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%run_matx\n",
+ "\n",
+ "// Make tensor\n",
+ "auto baseTensor2D = matx::make_tensor<int>({3,5});\n",
+ "baseTensor2D.SetVals({\n",
+ " {1, 2, 3, 4, 5},\n",
+ " {6, 7, 8, 9, 10},\n",
+ " {11, 12, 13, 14, 15}\n",
+ "});\n",
+ "\n",
+ "\n",
+ "// Slice the first row of baseTensor\n",
+ "auto first_row = matx::slice<1>(baseTensor2D, {0, 0}, {matx::matxDropDim, matx::matxEnd});\n",
+ "matx::print(first_row);\n",
+ "\n",
+ "// Create a 2D square of 4 elements, composed of the first 2 rows and 2 columns of data\n",
+ "auto td_square = matx::slice(baseTensor2D, {0, 0}, {2, 2});\n",
+ "matx::print(td_square);\n",
+ "\n",
+ "// Assign the value 87 to the (1,1) element of baseTensor2D; the change is visible\n",
+ "// through td_square (rows 0-1, cols 0-1) but not through first_row (row 0 only)\n",
+ "baseTensor2D(1, 1) = 87;\n",
+ "matx::print(baseTensor2D);\n",
+ "matx::print(first_row);\n",
+ "matx::print(td_square);\n"
+ ]
+ },
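+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that `slice` returns a lightweight view that aliases `baseTensor2D`'s memory rather than copying it, which is why a write through the base tensor shows up when printing any overlapping view."
+ ]
+ }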
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs_input/notebooks/gtc_lab/solutions/01_C.ipynb b/docs_input/notebooks/gtc_lab/solutions/01_C.ipynb new file mode 100644 index 00000000..94a624a4 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/solutions/01_C.ipynb @@ -0,0 +1,61 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Exercise 01_C: Operators\n",
+ "Please use the provided A and B tensors to complete the following set of operations:\n",
+ "\n",
+ "- Multiply `A` by its scalar weight factor `aScale` to populate tensor `C`\n",
+ "- In-place subtract `bOffset` from the matrix `B`\n",
+ "- Add the `A` and `B` Tensors to populate tensor `D`\n",
+ "\n",
+ "Keep in mind that rather than storing the result of an expression in a tensor, you may pass it to the `print` function directly."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%run_matx\n",
+ "\n",
+ "auto A = matx::make_tensor<int>({2, 3});\n",
+ "auto B = matx::make_tensor<int>({2, 3});\n",
+ "auto C = matx::make_tensor<int>({2, 3});\n",
+ "auto D = matx::make_tensor<int>({2, 3});\n",
+ "\n",
+ "A.SetVals({ {1, 2, 3},\n",
+ " {4, 5, 6}\n",
+ " });\n",
+ "\n",
+ "(B = A).run();\n",
+ "matx::print(A);\n",
+ "matx::print(B);\n",
+ "\n",
+ "int aScale = 5;\n",
+ "int bOffset = 2;\n",
+ "\n",
+ "// Scale A by aScale\n",
+ "matx::print(A * aScale);\n",
+ "\n",
+ "// Subtract bOffset from B\n",
+ "matx::print(B - bOffset);\n",
+ "\n",
+ "// Add A and B Tensors\n",
+ "(D = A + B).run();\n",
+ "matx::print(D);"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs_input/notebooks/gtc_lab/solutions/01_D.ipynb b/docs_input/notebooks/gtc_lab/solutions/01_D.ipynb new file mode 100644 index 00000000..3c426628 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/solutions/01_D.ipynb @@ -0,0 +1,57 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Exercise 01_D: Transforms and Generators\n",
+ "\n",
+ "For this example we will generate random data to verify the distribution of our generator functions. Please implement the following:\n",
+ "\n",
+ "- Generate three floating point 3D tensors with sizes 2x4x8, 2x8x8, and 2x4x8\n",
+ "- Populate the first two tensors with random values from a uniform distribution\n",
+ "- Perform a batched matrix multiply of the first two tensors and store the output in the third tensor\n",
+ "- Find the minimum values of each inner matrix of C (there should be 2 of them) and print the results\n",
+ "\n",
+ "Ensure that the minimum values printed match what you would expect in the third tensor."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%run_matx\n",
+ "\n",
+ "// Create floating point tensors with specified sizes\n",
+ "auto A = matx::make_tensor<float>({2, 4, 8});\n",
+ "auto B = matx::make_tensor<float>({2, 8, 8});\n",
+ "auto C = matx::make_tensor<float>({2, 4, 8});\n",
+ "\n",
+ "// Generate random data\n",
+ "(A = matx::random<float>(A.Shape(), matx::UNIFORM)).run();\n",
+ "(B = matx::random<float>(B.Shape(), matx::UNIFORM)).run();\n",
+ "\n",
+ "matx::print(A);\n",
+ "matx::print(B);\n",
+ "\n",
+ "// Perform matmul and print\n",
+ "(C = matx::matmul(A, B)).run();\n",
+ "matx::print(C);\n",
+ "\n",
+ "// Find min values and print\n",
+ "auto min_rows = matx::min(C, {1,2});\n",
+ "matx::print(min_rows);"
+ ]
+ },
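+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Because `C` has shape `{2, 4, 8}`, reducing over dimensions `{1, 2}` collapses each 4x8 inner matrix to a single value, so `min_rows` contains exactly 2 entries, one per batch."
+ ]
+ }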
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs_input/notebooks/gtc_lab/solutions/02_A.ipynb b/docs_input/notebooks/gtc_lab/solutions/02_A.ipynb new file mode 100644 index 00000000..1cf6ace9 --- /dev/null +++ b/docs_input/notebooks/gtc_lab/solutions/02_A.ipynb @@ -0,0 +1,69 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exercise 02_A : Fusion Basics\n",
+ "\n",
+ "Use the following equation to create an implementation that utilizes fusion and operator reuse to optimize the underlying code. \n",
+ "\n",
+ "`result = A*C + B/D + ((D-C)/B)/(A*C) `\n",
+ "\n",
+ "An example implementation is given with all operations done individually; how much faster can you make it?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%run_matx\n",
+ "auto exec = matx::CUDAExecutor();\n",
+ "\n",
+ "matx::index_t size_x = 128;\n",
+ "matx::index_t size_y = 256;\n",
+ "\n",
+ "auto A = matx::make_tensor<float>({size_x, size_y});\n",
+ "auto B = matx::make_tensor<float>({size_x, size_y});\n",
+ "auto C = matx::make_tensor<float>({size_x, size_y});\n",
+ "auto D = matx::make_tensor<float>({size_x, size_y});\n",
+ "auto result = matx::make_tensor<float>({size_x, size_y});\n",
+ "\n",
+ "// ---- populate the data ---- //\n",
+ "(A = matx::random<float>(A.Shape(), matx::NORMAL)).run();\n",
+ "(B = matx::random<float>(B.Shape(), matx::NORMAL)).run();\n",
+ "(C = matx::random<float>(C.Shape(), matx::NORMAL)).run();\n",
+ "(D = matx::random<float>(D.Shape(), matx::NORMAL)).run();\n",
+ "(result = matx::zeros({size_x, size_y})).run(exec);\n",
+ "exec.sync();\n",
+ "\n",
+ "// ---- all crammed together ---- //\n",
+ "exec.start_timer();\n",
+ "(result = A * C + B / D + ((D - C) / B) / (A * C)).run(exec); \n",
+ "exec.stop_timer();\n",
+ "std::cout <<\"One Equation Runtime: \" << exec.get_time_ms() << \" ms\" << std::endl;\n",
+ "\n",
+ "// ---- ideal implementation with reuse of operators ---- //\n",
+ "exec.start_timer();\n",
+ "auto term1 = A * C; \n",
+ "auto term2 = B / D;\n",
+ "auto term3 = (D - C) / B;\n",
+ "auto term4 = term3 / term1;\n",
+ "(result = term1 + term2 + term4).run(exec);\n",
+ "exec.stop_timer();\n",
+ "std::cout <<\"Fused Operation Runtime: \" << exec.get_time_ms() << \" ms\" << std::endl; "
+ ]
+ },
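+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note that building `term1` through `term4` launches nothing on its own: MatX operators are lazily evaluated, so the intermediate terms are just expression objects, and the single `run(exec)` on the final statement executes the whole expression as one fused operation."
+ ]
+ }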
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs_input/notebooks/gtc_lab/solutions/02_B.ipynb b/docs_input/notebooks/gtc_lab/solutions/02_B.ipynb new file mode 100644 index 00000000..94e1e00d --- /dev/null +++ b/docs_input/notebooks/gtc_lab/solutions/02_B.ipynb @@ -0,0 +1,102 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Exercise: Black Scholes Fusion\n",
+ "\n",
+ "The Black-Scholes model provides a fantastic example of a real-world set of equations that greatly benefits from operator fusion. Its complex expressions become significantly more readable when written as individual terms, and those separate operational parts still fuse at execution time. Below is a brief description of the Black-Scholes model and its composite terms:\n",
+ "\n",
+ "\n",
+ "$$\n",
+ "C(S_0, K, T) = S_0 \\,\\Phi\\bigl(d_1\\bigr) \\;-\\; K \\, e^{-rT} \\,\\Phi\\bigl(d_2\\bigr),\n",
+ "$$\n",
+ "\n",
+ "where\n",
+ "\n",
+ "$$\n",
+ "d_1 = \\frac{\\ln\\!\\bigl(\\tfrac{S_0}{K}\\bigr) + \\bigl(r + \\tfrac{\\sigma^2}{2}\\bigr)T}{\\sigma \\sqrt{T}},\n",
+ "\\quad\n",
+ "d_2 = d_1 - \\sigma \\sqrt{T}.\n",
+ "$$\n",
+ "\n",
+ "Here:\n",
+ "- \\( S_0 \\) is the current stock price\n",
+ "- \\( K \\) is the strike price\n",
+ "- \\( T \\) is the time to maturity (in years)\n",
+ "- \\( r \\) is the risk-free interest rate (annualized)\n",
+ "- \\( \\sigma \\) is the volatility of the underlying stock (annualized)\n",
+ "- \\( \\Phi(\\cdot) \\) is the cumulative distribution function (CDF) of the standard normal distribution\n",
+ "\n",
+ "\n",
+ "\n",
+ "We can easily translate this by expressing each of the terms defined above as separate MatX operators, then fusing the execution of those operators in the final run command.\n",
+ "\n",
+ "Try breaking the equation below into the following operators:\n",
+ "\n",
+ "```\n",
+ "VsqrtT = V * sqrt(T);\n",
+ "d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ;\n",
+ "d2 = d1 - VsqrtT;\n",
+ "cdf_d1 = normcdf(d1);\n",
+ "cdf_d2 = normcdf(d2);\n",
+ "expRT = exp(-1 * r * T); \n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "vscode": {
+ "languageId": "plaintext"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "%%run_matx\n",
+ "auto exec = matx::CUDAExecutor();\n",
+ "\n",
+ "using dtype = double;\n",
+ "matx::index_t input_size = 100;\n",
+ "\n",
+ "\n",
+ "// ---- declare input data ---- //\n",
+ "auto K = matx::make_tensor<dtype>({input_size});\n",
+ "auto S = matx::make_tensor<dtype>({input_size});\n",
+ "auto V = matx::make_tensor<dtype>({input_size});\n",
+ "auto r = matx::make_tensor<dtype>({input_size});\n",
+ "auto T = matx::make_tensor<dtype>({input_size});\n",
+ "auto output = matx::make_tensor<dtype>({input_size}); \n",
+ "\n",
+ "// ---- populate the data ---- //\n",
+ "(K = matx::random<dtype>(K.Shape(), matx::NORMAL)).run();\n",
+ "(S = matx::random<dtype>(S.Shape(), matx::NORMAL)).run();\n",
+ "(V = matx::random<dtype>(V.Shape(), matx::NORMAL)).run();\n",
+ "(r = matx::random<dtype>(r.Shape(), matx::NORMAL)).run();\n",
+ "(T = matx::random<dtype>(T.Shape(), matx::NORMAL)).run();\n",
+ "(output = matx::zeros({input_size})).run(exec);\n",
+ "exec.sync();\n",
+ "\n",
+ "auto VsqrtT = V * sqrt(T);\n",
+ "auto d1 = (log(S / K) + (r + 0.5 * V * V) * T) / VsqrtT ;\n",
+ "auto d2 = d1 - VsqrtT;\n",
+ "auto cdf_d1 = matx::normcdf(d1);\n",
+ "auto cdf_d2 = matx::normcdf(d2);\n",
+ "auto expRT = exp(-1 * r * T); \n",
+ "exec.start_timer();\n",
+ "(output = S * cdf_d1 - K * expRT * cdf_d2).run(exec);\n",
+ "exec.stop_timer();\n",
+ "std::cout <<\"Fused Runtime: \" << exec.get_time_ms() << \" ms\" << std::endl;"
+ ]
+ }
+ ],
+ "metadata": {
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs_input/notebooks/scripts/run_matx.py b/docs_input/notebooks/scripts/run_matx.py new file mode 100644 index 00000000..18cc2732 --- /dev/null +++ b/docs_input/notebooks/scripts/run_matx.py @@ -0,0 +1,123 @@
+from IPython.core.magic import register_cell_magic
+import subprocess
+import os
+import threading
+
+# Simple progress bar from https://stackoverflow.com/questions/3173320/text-progress-bar-in-terminal-with-block-characters
+def printProgressBar (iteration, total, prefix = '', suffix = '', decimals = 1, length = 
100, fill = '█', printEnd = "\r"): + """ + Call in a loop to create terminal progress bar + @params: + iteration - Required : current iteration (Int) + total - Required : total iterations (Int) + prefix - Optional : prefix string (Str) + suffix - Optional : suffix string (Str) + decimals - Optional : positive number of decimals in percent complete (Int) + length - Optional : character length of bar (Int) + fill - Optional : bar fill character (Str) + printEnd - Optional : end character (e.g. "\r", "\r\n") (Str) + """ + percent = ("{0:." + str(decimals) + "f}").format(100 * (iteration / float(total))) + filledLength = int(length * iteration // total) + bar = fill * filledLength + '-' * (length - filledLength) + print(f'\r{prefix} |{bar}| {percent}% {suffix}', end = printEnd) + # Print New Line on Complete + if iteration == total: + print() + + +def run_command(cmd_list): + process = subprocess.Popen( + cmd_list, # directly pass the command as a list + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, # line-buffered + universal_newlines=True + ) + + cur_line = 0 + + def handle_stderr_line(line): + nonlocal cur_line + sp = line.split() + if len(sp) == 0: + return + + if sp[0] != '#$': + print(line) + return + + start_tokens = ('gcc', 'cudafe++', 'gcc', '"$CICC_PATH/cicc"', 'ptxas', 'fatbinary', 'rm', 'gcc', 'nvlink', 'fatbinary', 'rm', 'gcc', 'g++') + if sp[1] != start_tokens[cur_line]: + return + + printProgressBar(cur_line+1, len(start_tokens), prefix = 'Compiling...') + cur_line += 1 + + def read_stderr(pipe): + for line in pipe: + handle_stderr_line(line) + + + def read_stdout(pipe): + for line in pipe: + print(line, end='') # Print in real-time + + stderr_thread = threading.Thread(target=read_stderr, args=(process.stderr,)) + stdout_thread = threading.Thread(target=read_stdout, args=(process.stdout,)) + + stderr_thread.start() + stdout_thread.start() + + # Wait for process to complete + process.wait() + + # Wait for threads to finish + stderr_thread.join() + stdout_thread.join() + + cur_line = 0 + if process.returncode != 0: + print("\nCompilation failed!") + return False + + return True + + +def load_ipython_extension(ipython): + # Register any magic commands or perform setup here + from IPython.core.magic import register_cell_magic + + @register_cell_magic + def run_matx(line, cell): + output_code = f""" + #include + + int main() {{ + {cell} + }} + """ + + with open("/tmp/output.cu", "w") as f: + f.write(output_code) + with open("/tmp/output.cpp", "w") as f: + f.write(output_code) + + current_dir = os.getcwd() + + MATX_ROOT = '/MatX' + + nvcc_cmd = f'nvcc -v -forward-unknown-to-host-compiler -Ofc -std=c++17 -DMATX_DISABLE_CUB_CACHE -DMATX_ENABLE_FILEIO -DMATX_ENABLE_PYBIND11 -DMATX_EN_OMP -DMATX_EN_X86_FFTW -DMATX_NVTX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA '\ + f'-DTHRUST_DISABLE_ABI_NAMESPACE -DMATX_EN_OPENBLAS --generate-code=arch=compute_80,code=[sm_80] '\ + f'-I{MATX_ROOT}/include -I{MATX_ROOT}/include/matx/kernels -I"{MATX_ROOT}/build/_deps/cccl-src/lib/cmake/thrust/../../../thrust" -I"{MATX_ROOT}/build/_deps/cccl-src/lib/cmake/libcudacxx/../../../libcudacxx/include" -I"{MATX_ROOT}/build/_deps/cccl-src/lib/cmake/cub/../../../cub" -isystem "{MATX_ROOT}/build/_deps/pybind11-src/include" -isystem /usr/include/python3.10 -isystem "/usr/include/x86_64-linux-gnu/openblas64-openmp" -isystem "{MATX_ROOT}/build/_deps/cutensor-src/include" -isystem "{MATX_ROOT}/build/_deps/cutensornet-src/include" -isystem 
"{MATX_ROOT}/build/_deps/cudss-src/include" -isystem /usr/local/cuda/include'\ + f'-DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP -DTHRUST_IGNORE_ABI_NAMESPACE_ERROR -fopenmp -DMATX_ROOT=\"\" -fvisibility=hidden -lcuda -lcufft -lcublas -lcublasLt -lcurand -lfftw3 -lfftw3f -lfftw3_omp -lfftw3f_omp -lopenblas64 -o /tmp/output /tmp/output.cu' + + if run_command(nvcc_cmd.split()): + run_process = subprocess.run([f"/tmp/output"], capture_output=True, text=True) + + print(run_process.stdout) + if run_process.returncode != 0: + print(run_process.stderr) + + ipython.register_magic_function(run_matx, 'cell') \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 25f36bc2..8fcb625e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,5 +1,6 @@ set(examples simple_radar_pipeline + kernel_fusion recursive_filter channelize_poly_bench convolution diff --git a/examples/kernel_fusion.cu b/examples/kernel_fusion.cu new file mode 100644 index 00000000..c12d9b97 --- /dev/null +++ b/examples/kernel_fusion.cu @@ -0,0 +1,46 @@ +#include "matx.h" + +int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) +{ + MATX_ENTER_HANDLER(); + + cudaStream_t stream = 0; + matx::cudaExecutor exec{stream}; + + // // manually set to log all NVTX levels + // MATX_NVTX_SET_LOG_LEVEL( matx::matx_nvxtLogLevels::MATX_NVTX_LOG_ALL ); + + matx::index_t size_x = 128; + matx::index_t size_y = 256; + + auto A = matx::make_tensor({size_x, size_y}); + auto B = matx::make_tensor({size_x, size_y}); + auto C = matx::make_tensor({size_x, size_y}); + auto D = matx::make_tensor({size_x, size_y}); + auto result = matx::make_tensor({size_x, size_y}); + + // run once to warm-up + (result = cos(C)).run(exec); + (result = result / D).run(exec); + (result = result * B).run(exec); + (A = B * cos(C)/D).run(exec); + cudaStreamSynchronize(stream); + + for (int i = 0; i < 10; i++) { + + // first individual, independent kernels + int unfused_range = MATX_NVTX_START_RANGE("Unfused Kernels"); + (result = cos(C)).run(exec); + (result = result / D).run(exec); + (result = result * B).run(exec); + MATX_NVTX_END_RANGE(unfused_range); + + // now, as a fused operation + int fused_range = MATX_NVTX_START_RANGE("Fused Operation"); + (A = B * cos(C)/D).run(exec); + MATX_NVTX_END_RANGE(fused_range); + } + + MATX_EXIT_HANDLER(); + MATX_EXIT_HANDLER(); +} \ No newline at end of file