From 833e6255303b20c361c3bd3b59f4dec7925a87bf Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Tue, 20 Jan 2026 14:19:00 -0800 Subject: [PATCH 1/7] ci: run cuda.bindings examples on Linux --- .github/workflows/test-wheel-linux.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 78a2fa1c4f..2f6cabcbf8 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -254,6 +254,16 @@ jobs: LOCAL_CTK: ${{ matrix.LOCAL_CTK }} run: run-tests bindings + - name: Run cuda.bindings examples + if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + run: | + pushd cuda_bindings + ${SANITIZER_CMD} python -m pytest -ra -s -vv examples/ + popd + - name: Run cuda.core tests env: CUDA_VER: ${{ matrix.CUDA_VER }} From dbd6464386edb0cc7f59a05d9f38ab8de6642922 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Tue, 20 Jan 2026 14:21:14 -0800 Subject: [PATCH 2/7] ci: run cuda.bindings examples on Windows --- .github/workflows/test-wheel-windows.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 6d7164edc9..715d069bdd 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -226,6 +226,17 @@ jobs: shell: bash --noprofile --norc -xeuo pipefail {0} run: run-tests bindings + - name: Run cuda.bindings examples + if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }} + env: + CUDA_VER: ${{ matrix.CUDA_VER }} + LOCAL_CTK: ${{ matrix.LOCAL_CTK }} + shell: bash --noprofile --norc -xeuo pipefail {0} + run: | + pushd cuda_bindings + ${SANITIZER_CMD} python -m pytest -ra -s -vv examples/ + popd + - name: Run cuda.core tests env: CUDA_VER: ${{ matrix.CUDA_VER }} From 64ba26e4a6f9d0bb88cb27c9133f8da3a8bea329 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Tue, 20 Jan 2026 15:17:21 -0800 Subject: [PATCH 3/7] ci: run bindings examples via pytest entrypoint Analysis: - examples were invoked via `python -m pytest` from within `cuda_bindings` so the repo checkout was on sys.path and imports resolved to the source tree - `setuptools_scm` generates `cuda/bindings/_version.py` only in the built wheel, so the source tree lacks this file and `from cuda.bindings._version import __version__` fails during example collection - running `pytest` via the installed entrypoint avoids CWD precedence and keeps imports coming from the installed wheel, which includes the generated version file Change: - switch Linux and Windows example steps to call `pytest` entrypoint --- .github/workflows/test-wheel-linux.yml | 2 +- .github/workflows/test-wheel-windows.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 2f6cabcbf8..49a0298b34 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -261,7 +261,7 @@ jobs: LOCAL_CTK: ${{ matrix.LOCAL_CTK }} run: | pushd cuda_bindings - ${SANITIZER_CMD} python -m pytest -ra -s -vv examples/ + ${SANITIZER_CMD} pytest -ra -s -vv examples/ popd - name: Run cuda.core tests diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index 715d069bdd..0f9b25d7c2 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -234,7 +234,7 @@ jobs: shell: bash --noprofile --norc -xeuo pipefail {0} run: | pushd cuda_bindings - ${SANITIZER_CMD} python -m pytest -ra -s -vv examples/ + ${SANITIZER_CMD} pytest -ra -s -vv examples/ popd - name: Run cuda.core tests From 95024c4458f4a380b71066580486a7d06614ce98 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Thu, 22 Jan 2026 08:11:44 -0800 Subject: [PATCH 4/7] use pathfinder.find_nvidia_header_directory() in cuda_bindings examples KernelHelper --- cuda_bindings/examples/common/common.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index 13b57749a6..166d062bd8 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -5,6 +5,7 @@ import numpy as np from common.helper_cuda import checkCudaErrors +from cuda import pathfinder from cuda.bindings import driver as cuda from cuda.bindings import nvrtc from cuda.bindings import runtime as cudart @@ -44,16 +45,16 @@ def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor): class KernelHelper: def __init__(self, code, devID): - prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) + include_dirs = [] + for libname in ("cudart", "cccl"): + hdr_dir = pathfinder.find_nvidia_header_directory(libname) + if hdr_dir is None: + import pytest + + pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None') + include_dirs.append(hdr_dir) - cuda_home = get_cuda_home() - assert cuda_home is not None - cuda_include = os.path.join(cuda_home, "include") - assert os.path.isdir(cuda_include) - include_dirs = [cuda_include] - cccl_include = os.path.join(cuda_include, "cccl") - if os.path.isdir(cccl_include): - include_dirs.insert(0, cccl_include) + prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None)) # Initialize CUDA checkCudaErrors(cudart.cudaFree(0)) From 7fa3f760ae27b52e7654c04825b753282d0631e2 Mon Sep 17 00:00:00 2001 From: "Ralf W. 
Grosse-Kunstleve" Date: Thu, 22 Jan 2026 08:40:55 -0800 Subject: [PATCH 5/7] Remove pytest_skipif_cuda_include_not_found() and get_cuda_home() entirely under cuda_bindings/examples/ --- .../globalToShmemAsyncCopy_test.py | 1 - cuda_bindings/examples/common/common.py | 19 ------------------- 2 files changed, 20 deletions(-) diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py index 2250b07036..b82c9b02b4 100644 --- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py +++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py @@ -1114,7 +1114,6 @@ def MatrixMultiply(dimsA, dimsB, kernel_number): def main(): - common.pytest_skipif_cuda_include_not_found() common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0)) print("[globalToShmemAsyncCopy] - Starting...") diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py index 166d062bd8..ee1bef5acb 100644 --- a/cuda_bindings/examples/common/common.py +++ b/cuda_bindings/examples/common/common.py @@ -1,7 +1,6 @@ # Copyright 2021-2025 NVIDIA Corporation. All rights reserved. 
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE -import os import numpy as np from common.helper_cuda import checkCudaErrors @@ -11,24 +10,6 @@ from cuda.bindings import runtime as cudart -def get_cuda_home(): - cuda_home = os.getenv("CUDA_HOME") - if cuda_home is None: - cuda_home = os.getenv("CUDA_PATH") - return cuda_home - - -def pytest_skipif_cuda_include_not_found(): - import pytest - - cuda_home = get_cuda_home() - if cuda_home is None: - pytest.skip("CUDA_HOME/CUDA_PATH not set") - cuda_include = os.path.join(cuda_home, "include") - if not os.path.exists(cuda_include): - pytest.skip(f"$CUDA_HOME/include does not exist: '{cuda_include}'") - - def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor): import pytest From dbfa3db586905272b6ed0bf93141b1818866568d Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 22 Jan 2026 14:28:53 -0800 Subject: [PATCH 6/7] Replace py3.13 2-GPU job with 3.14t 2-GPU job --- ci/test-matrix.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 81dd98f92a..56191bc6c5 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -30,13 +30,13 @@ linux: - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', 
LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } From 7e03dd5a6f6daf8405b4d238fb32d865013d17a0 Mon Sep 17 00:00:00 2001 From: "Ralf W. Grosse-Kunstleve" Date: Thu, 22 Jan 2026 16:06:54 -0800 Subject: [PATCH 7/7] Fix kernel arg lifetimes in isoFDModelling example Bind the per-launch pointer array to a separate name (`argsp`) instead of rebinding `args`, so the argument arrays it points into stay referenced through each launch. This avoids free-threaded Python misaligned-address failures caused by temporary argument buffers that were otherwise dropped when `args` was rebound. 
--- .../examples/extra/isoFDModelling_test.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py index 8885c310b3..8d7c4538a9 100644 --- a/cuda_bindings/examples/extra/isoFDModelling_test.py +++ b/cuda_bindings/examples/extra/isoFDModelling_test.py @@ -310,7 +310,7 @@ def createSource(self, kernel): freq = np.array(self.params.freqMax, dtype=np.float32) args = [buf, dt, freq, nt] - args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) + argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) checkCudaErrors( cuda.cuLaunchKernel( @@ -323,7 +323,7 @@ def createSource(self, kernel): 1, # block dim 0, self.streamHalo, # shared mem and stream - args.ctypes.data, + argsp.ctypes.data, 0, ) ) # arguments @@ -351,7 +351,7 @@ def injectSource(self, kernel, iter): np_it = np.array(iter, dtype=np.uint32) args = [wavein + offset_sourceInject, src, np_it] - args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) + argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) checkCudaErrors( cuda.cuLaunchKernel( kernel.injectSource, @@ -363,7 +363,7 @@ def injectSource(self, kernel, iter): 1, # block dim 0, self.streamHalo, # shared mem and stream - args.ctypes.data, + argsp.ctypes.data, 0, ) ) # arguments @@ -391,7 +391,7 @@ def createVelocity(self, kernel): np_stride = np.array(stride, dtype=np.uint32) args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride] - args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) + argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) checkCudaErrors(cuda.cuCtxSetCurrent(self.context)) @@ -407,7 +407,7 @@ def createVelocity(self, kernel): 1, # block dim 0, self.streamHalo, # shared mem and stream - args.ctypes.data, + argsp.ctypes.data, 0, ) ) # arguments @@ -448,7 
+448,7 @@ def executeCenter(self, kernel): np_nx, np_stride, ] - args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) + argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) # do center propagation from 2 * fd_order to nz - 2 * fd_order checkCudaErrors( @@ -462,7 +462,7 @@ def executeCenter(self, kernel): 1, # block dim 0, self.streamCenter, # shared mem and stream - args.ctypes.data, + argsp.ctypes.data, 0, ) ) # arguments @@ -503,7 +503,7 @@ def executeHalo(self, kernel): np_nx, np_stride, ] - args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) + argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) # do halo up checkCudaErrors( @@ -517,7 +517,7 @@ def executeHalo(self, kernel): 1, # block dim 0, self.streamHalo, # shared mem and stream - args.ctypes.data, + argsp.ctypes.data, 0, ) ) # arguments @@ -541,7 +541,7 @@ def executeHalo(self, kernel): np_nx, np_stride, ] - args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) + argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64) checkCudaErrors( cuda.cuLaunchKernel( kernel.fdPropag, @@ -553,7 +553,7 @@ def executeHalo(self, kernel): 1, # block dim 0, self.streamHalo, # shared mem and stream - args.ctypes.data, + argsp.ctypes.data, 0, ) ) # arguments