From 833e6255303b20c361c3bd3b59f4dec7925a87bf Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 20 Jan 2026 14:19:00 -0800
Subject: [PATCH 1/7] ci: run cuda.bindings examples on Linux

---
 .github/workflows/test-wheel-linux.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 78a2fa1c4f..2f6cabcbf8 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -254,6 +254,16 @@ jobs:
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
         run: run-tests bindings
 
+      - name: Run cuda.bindings examples
+        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        run: |
+          pushd cuda_bindings
+          ${SANITIZER_CMD} python -m pytest -ra -s -vv examples/
+          popd
+
       - name: Run cuda.core tests
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}

From dbd6464386edb0cc7f59a05d9f38ab8de6642922 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 20 Jan 2026 14:21:14 -0800
Subject: [PATCH 2/7] ci: run cuda.bindings examples on Windows

---
 .github/workflows/test-wheel-windows.yml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 6d7164edc9..715d069bdd 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -226,6 +226,17 @@ jobs:
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: run-tests bindings
 
+      - name: Run cuda.bindings examples
+        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: |
+          pushd cuda_bindings
+          ${SANITIZER_CMD} python -m pytest -ra -s -vv examples/
+          popd
+
       - name: Run cuda.core tests
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}

From 64ba26e4a6f9d0bb88cb27c9133f8da3a8bea329 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Tue, 20 Jan 2026 15:17:21 -0800
Subject: [PATCH 3/7] ci: run bindings examples via pytest entrypoint

Analysis:
- examples were invoked via `python -m pytest` from within `cuda_bindings`
  so the repo checkout was on sys.path and imports resolved to the source tree
- `setuptools_scm` generates `cuda/bindings/_version.py` only in the built wheel,
  so the source tree lacks this file and `from cuda.bindings._version import __version__`
  fails during example collection
- running `pytest` via the installed entrypoint does not prepend the current
  directory to sys.path (unlike `python -m pytest`), so imports resolve to the
  installed wheel, which includes the generated version file

Change:
- switch Linux and Windows example steps to call `pytest` entrypoint
---
 .github/workflows/test-wheel-linux.yml   | 2 +-
 .github/workflows/test-wheel-windows.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 2f6cabcbf8..49a0298b34 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -261,7 +261,7 @@ jobs:
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
         run: |
           pushd cuda_bindings
-          ${SANITIZER_CMD} python -m pytest -ra -s -vv examples/
+          ${SANITIZER_CMD} pytest -ra -s -vv examples/
           popd
 
       - name: Run cuda.core tests
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 715d069bdd..0f9b25d7c2 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -234,7 +234,7 @@ jobs:
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           pushd cuda_bindings
-          ${SANITIZER_CMD} python -m pytest -ra -s -vv examples/
+          ${SANITIZER_CMD} pytest -ra -s -vv examples/
           popd
 
       - name: Run cuda.core tests

From 95024c4458f4a380b71066580486a7d06614ce98 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 22 Jan 2026 08:11:44 -0800
Subject: [PATCH 4/7] use pathfinder.find_nvidia_header_directory() in
 cuda_bindings examples KernelHelper

---
 cuda_bindings/examples/common/common.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py
index 13b57749a6..166d062bd8 100644
--- a/cuda_bindings/examples/common/common.py
+++ b/cuda_bindings/examples/common/common.py
@@ -5,6 +5,7 @@
 
 import numpy as np
 from common.helper_cuda import checkCudaErrors
+from cuda import pathfinder
 from cuda.bindings import driver as cuda
 from cuda.bindings import nvrtc
 from cuda.bindings import runtime as cudart
@@ -44,16 +45,16 @@ def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor):
 
 class KernelHelper:
     def __init__(self, code, devID):
-        prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))
+        include_dirs = []
+        for libname in ("cudart", "cccl"):
+            hdr_dir = pathfinder.find_nvidia_header_directory(libname)
+            if hdr_dir is None:
+                import pytest
+
+                pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None')
+            include_dirs.append(hdr_dir)
 
-        cuda_home = get_cuda_home()
-        assert cuda_home is not None
-        cuda_include = os.path.join(cuda_home, "include")
-        assert os.path.isdir(cuda_include)
-        include_dirs = [cuda_include]
-        cccl_include = os.path.join(cuda_include, "cccl")
-        if os.path.isdir(cccl_include):
-            include_dirs.insert(0, cccl_include)
+        prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))
 
         # Initialize CUDA
         checkCudaErrors(cudart.cudaFree(0))

From 7fa3f760ae27b52e7654c04825b753282d0631e2 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 22 Jan 2026 08:40:55 -0800
Subject: [PATCH 5/7] Remove pytest_skipif_cuda_include_not_found() and
 get_cuda_home() entirely under cuda_bindings/examples/

---
 .../globalToShmemAsyncCopy_test.py            |  1 -
 cuda_bindings/examples/common/common.py       | 19 -------------------
 2 files changed, 20 deletions(-)

diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
index 2250b07036..b82c9b02b4 100644
--- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
+++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
@@ -1114,7 +1114,6 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
 
 
 def main():
-    common.pytest_skipif_cuda_include_not_found()
     common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0))
 
     print("[globalToShmemAsyncCopy] - Starting...")
diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py
index 166d062bd8..ee1bef5acb 100644
--- a/cuda_bindings/examples/common/common.py
+++ b/cuda_bindings/examples/common/common.py
@@ -1,7 +1,6 @@
 # Copyright 2021-2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-import os
 
 import numpy as np
 from common.helper_cuda import checkCudaErrors
@@ -11,24 +10,6 @@
 from cuda.bindings import runtime as cudart
 
 
-def get_cuda_home():
-    cuda_home = os.getenv("CUDA_HOME")
-    if cuda_home is None:
-        cuda_home = os.getenv("CUDA_PATH")
-    return cuda_home
-
-
-def pytest_skipif_cuda_include_not_found():
-    import pytest
-
-    cuda_home = get_cuda_home()
-    if cuda_home is None:
-        pytest.skip("CUDA_HOME/CUDA_PATH not set")
-    cuda_include = os.path.join(cuda_home, "include")
-    if not os.path.exists(cuda_include):
-        pytest.skip(f"$CUDA_HOME/include does not exist: '{cuda_include}'")
-
-
 def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor):
     import pytest
 

From dbfa3db586905272b6ed0bf93141b1818866568d Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 22 Jan 2026 14:28:53 -0800
Subject: [PATCH 6/7] Replace py3.13 2-GPU job with 3.14t 2-GPU job

---
 ci/test-matrix.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index 81dd98f92a..56191bc6c5 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -30,13 +30,13 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'H100',       GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'H100',       GPU_COUNT: '1', DRIVER: 'latest' }
-    - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100',       GPU_COUNT: '2', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13',  CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 't4',         GPU_COUNT: '2', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14',  CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4',         GPU_COUNT: '1', DRIVER: 'latest' }
+    - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100',       GPU_COUNT: '2', DRIVER: 'latest' }
     - { ARCH: 'arm64', PY_VER: '3.10',  CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'arm64', PY_VER: '3.10',  CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'arm64', PY_VER: '3.10',  CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100',       GPU_COUNT: '1', DRIVER: 'latest' }

From 7e03dd5a6f6daf8405b4d238fb32d865013d17a0 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Thu, 22 Jan 2026 16:06:54 -0800
Subject: [PATCH 7/7] Fix kernel arg lifetimes in isoFDModelling example

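Bind the kernel-argument pointer array to a new name (`argsp`) instead of
rebinding `args`, so the list of argument arrays stays referenced through
each launch. Rebinding `args` dropped the only reference to temporaries such
as `wavein + offset_sourceInject`, leaving the pointer array with dangling
addresses; this surfaced as misaligned-address failures under free-threaded
Python.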
---
 .../examples/extra/isoFDModelling_test.py     | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py
index 8885c310b3..8d7c4538a9 100644
--- a/cuda_bindings/examples/extra/isoFDModelling_test.py
+++ b/cuda_bindings/examples/extra/isoFDModelling_test.py
@@ -310,7 +310,7 @@ def createSource(self, kernel):
         freq = np.array(self.params.freqMax, dtype=np.float32)
 
         args = [buf, dt, freq, nt]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
         checkCudaErrors(
             cuda.cuLaunchKernel(
@@ -323,7 +323,7 @@ def createSource(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -351,7 +351,7 @@ def injectSource(self, kernel, iter):
         np_it = np.array(iter, dtype=np.uint32)
 
         args = [wavein + offset_sourceInject, src, np_it]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
         checkCudaErrors(
             cuda.cuLaunchKernel(
                 kernel.injectSource,
@@ -363,7 +363,7 @@ def injectSource(self, kernel, iter):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -391,7 +391,7 @@ def createVelocity(self, kernel):
         np_stride = np.array(stride, dtype=np.uint32)
 
         args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
 
@@ -407,7 +407,7 @@ def createVelocity(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -448,7 +448,7 @@ def executeCenter(self, kernel):
             np_nx,
             np_stride,
         ]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         # do center propagation from 2 * fd_order to nz - 2 * fd_order
         checkCudaErrors(
@@ -462,7 +462,7 @@ def executeCenter(self, kernel):
                 1,  # block dim
                 0,
                 self.streamCenter,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -503,7 +503,7 @@ def executeHalo(self, kernel):
             np_nx,
             np_stride,
         ]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         # do halo up
         checkCudaErrors(
@@ -517,7 +517,7 @@ def executeHalo(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -541,7 +541,7 @@ def executeHalo(self, kernel):
             np_nx,
             np_stride,
         ]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
         checkCudaErrors(
             cuda.cuLaunchKernel(
                 kernel.fdPropag,
@@ -553,7 +553,7 @@ def executeHalo(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments