NVIDIA · rwgk · Jan 20, 2026 · Jan 20, 2026 · Jan 20, 2026 · Jan 22, 2026
diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
@@ -254,6 +254,16 @@ jobs:
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
         run: run-tests bindings
 
+      - name: Run cuda.bindings examples
+        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}  # step runs only when the flag is '0'
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        run: |
+          pushd cuda_bindings
+          ${SANITIZER_CMD} pytest -ra -s -vv examples/  # NOTE(review): SANITIZER_CMD is not set by this step - confirm it is exported earlier
+          popd
+
       - name: Run cuda.core tests
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}

diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
@@ -226,6 +226,17 @@ jobs:
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: run-tests bindings
 
+      - name: Run cuda.bindings examples
+        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}  # step runs only when the flag is '0'
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: |
+          pushd cuda_bindings
+          ${SANITIZER_CMD:-} pytest -ra -s -vv examples/  # ':-' keeps 'set -u' from aborting when SANITIZER_CMD is unset
+          popd
+
       - name: Run cuda.core tests
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}

diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
@@ -30,13 +30,13 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'H100',       GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'H100',       GPU_COUNT: '1', DRIVER: 'latest' }
-    - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100',       GPU_COUNT: '2', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 't4',         GPU_COUNT: '2', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
+    - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100',       GPU_COUNT: '2', DRIVER: 'latest' }
     - { ARCH: 'arm64', PY_VER: '3.10',  CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'arm64', PY_VER: '3.10',  CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'arm64', PY_VER: '3.10',  CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest' }

diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
@@ -1114,7 +1114,6 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
 
 
 def main():
-    common.pytest_skipif_cuda_include_not_found()
     common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0))
 
     print("[globalToShmemAsyncCopy] - Starting...")

diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py
@@ -1,33 +1,15 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-import os
 
 import numpy as np
 from common.helper_cuda import checkCudaErrors
+from cuda import pathfinder
 from cuda.bindings import driver as cuda
 from cuda.bindings import nvrtc
 from cuda.bindings import runtime as cudart
 
 
-def get_cuda_home():
-    cuda_home = os.getenv("CUDA_HOME")
-    if cuda_home is None:
-        cuda_home = os.getenv("CUDA_PATH")
-    return cuda_home
-
-
-def pytest_skipif_cuda_include_not_found():
-    import pytest
-
-    cuda_home = get_cuda_home()
-    if cuda_home is None:
-        pytest.skip("CUDA_HOME/CUDA_PATH not set")
-    cuda_include = os.path.join(cuda_home, "include")
-    if not os.path.exists(cuda_include):
-        pytest.skip(f"$CUDA_HOME/include does not exist: '{cuda_include}'")
-
-
 def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor):
     import pytest
 
@@ -44,16 +26,16 @@ def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor):
 
 class KernelHelper:
     def __init__(self, code, devID):
-        prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))
+        include_dirs = []
+        for libname in ("cudart", "cccl"):
+            hdr_dir = pathfinder.find_nvidia_header_directory(libname)
+            if hdr_dir is None:
+                import pytest
 
-        cuda_home = get_cuda_home()
-        assert cuda_home is not None
-        cuda_include = os.path.join(cuda_home, "include")
-        assert os.path.isdir(cuda_include)
-        include_dirs = [cuda_include]
-        cccl_include = os.path.join(cuda_include, "cccl")
-        if os.path.isdir(cccl_include):
-            include_dirs.insert(0, cccl_include)
+                pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None')
+            include_dirs.append(hdr_dir)
+
+        prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))
 
         # Initialize CUDA
         checkCudaErrors(cudart.cudaFree(0))

diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py
@@ -310,7 +310,7 @@ def createSource(self, kernel):
         freq = np.array(self.params.freqMax, dtype=np.float32)
 
         args = [buf, dt, freq, nt]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
         checkCudaErrors(
             cuda.cuLaunchKernel(
@@ -323,7 +323,7 @@ def createSource(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -351,7 +351,7 @@ def injectSource(self, kernel, iter):
         np_it = np.array(iter, dtype=np.uint32)
 
         args = [wavein + offset_sourceInject, src, np_it]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
         checkCudaErrors(
             cuda.cuLaunchKernel(
                 kernel.injectSource,
@@ -363,7 +363,7 @@ def injectSource(self, kernel, iter):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -391,7 +391,7 @@ def createVelocity(self, kernel):
         np_stride = np.array(stride, dtype=np.uint32)
 
         args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
 
@@ -407,7 +407,7 @@ def createVelocity(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -448,7 +448,7 @@ def executeCenter(self, kernel):
             np_nx,
             np_stride,
         ]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         # do center propagation from 2 * fd_order to nz - 2 * fd_order
         checkCudaErrors(
@@ -462,7 +462,7 @@ def executeCenter(self, kernel):
                 1,  # block dim
                 0,
                 self.streamCenter,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -503,7 +503,7 @@ def executeHalo(self, kernel):
             np_nx,
             np_stride,
         ]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         # do halo up
         checkCudaErrors(
@@ -517,7 +517,7 @@ def executeHalo(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -541,7 +541,7 @@ def executeHalo(self, kernel):
             np_nx,
             np_stride,
         ]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
         checkCudaErrors(
             cuda.cuLaunchKernel(
                 kernel.fdPropag,
@@ -553,7 +553,7 @@ def executeHalo(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments