diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml
index 78a2fa1c4f..49a0298b34 100644
--- a/.github/workflows/test-wheel-linux.yml
+++ b/.github/workflows/test-wheel-linux.yml
@@ -254,6 +254,16 @@ jobs:
           LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
         run: run-tests bindings
 
+      - name: Run cuda.bindings examples
+        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        run: |
+          pushd cuda_bindings
+          ${SANITIZER_CMD} pytest -ra -s -vv examples/
+          popd
+
       - name: Run cuda.core tests
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
index 6d7164edc9..0f9b25d7c2 100644
--- a/.github/workflows/test-wheel-windows.yml
+++ b/.github/workflows/test-wheel-windows.yml
@@ -226,6 +226,17 @@ jobs:
         shell: bash --noprofile --norc -xeuo pipefail {0}
         run: run-tests bindings
 
+      - name: Run cuda.bindings examples
+        if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
+        env:
+          CUDA_VER: ${{ matrix.CUDA_VER }}
+          LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
+        shell: bash --noprofile --norc -xeuo pipefail {0}
+        run: |
+          pushd cuda_bindings
+          ${SANITIZER_CMD} pytest -ra -s -vv examples/
+          popd
+
       - name: Run cuda.core tests
         env:
           CUDA_VER: ${{ matrix.CUDA_VER }}
diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml
index 81dd98f92a..56191bc6c5 100644
--- a/ci/test-matrix.yml
+++ b/ci/test-matrix.yml
@@ -30,13 +30,13 @@ linux:
     - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' }
-    - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
+    - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' }
     - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' }
     - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' }
diff --git a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
index 2250b07036..b82c9b02b4 100644
--- a/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
+++ b/cuda_bindings/examples/3_CUDA_Features/globalToShmemAsyncCopy_test.py
@@ -1114,7 +1114,6 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):
 
 
 def main():
-    common.pytest_skipif_cuda_include_not_found()
     common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0))
     print("[globalToShmemAsyncCopy] - Starting...")
 
diff --git a/cuda_bindings/examples/common/common.py b/cuda_bindings/examples/common/common.py
index 13b57749a6..ee1bef5acb 100644
--- a/cuda_bindings/examples/common/common.py
+++ b/cuda_bindings/examples/common/common.py
@@ -1,33 +1,15 @@
 # Copyright 2021-2025 NVIDIA Corporation. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-import os
 
 import numpy as np
 from common.helper_cuda import checkCudaErrors
+from cuda import pathfinder
 from cuda.bindings import driver as cuda
 from cuda.bindings import nvrtc
 from cuda.bindings import runtime as cudart
 
 
-def get_cuda_home():
-    cuda_home = os.getenv("CUDA_HOME")
-    if cuda_home is None:
-        cuda_home = os.getenv("CUDA_PATH")
-    return cuda_home
-
-
-def pytest_skipif_cuda_include_not_found():
-    import pytest
-
-    cuda_home = get_cuda_home()
-    if cuda_home is None:
-        pytest.skip("CUDA_HOME/CUDA_PATH not set")
-    cuda_include = os.path.join(cuda_home, "include")
-    if not os.path.exists(cuda_include):
-        pytest.skip(f"$CUDA_HOME/include does not exist: '{cuda_include}'")
-
-
 def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor):
     import pytest
 
@@ -44,16 +26,16 @@ def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor):
 
 class KernelHelper:
     def __init__(self, code, devID):
-        prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))
+        include_dirs = []
+        for libname in ("cudart", "cccl"):
+            hdr_dir = pathfinder.find_nvidia_header_directory(libname)
+            if hdr_dir is None:
+                import pytest
 
-        cuda_home = get_cuda_home()
-        assert cuda_home is not None
-        cuda_include = os.path.join(cuda_home, "include")
-        assert os.path.isdir(cuda_include)
-        include_dirs = [cuda_include]
-        cccl_include = os.path.join(cuda_include, "cccl")
-        if os.path.isdir(cccl_include):
-            include_dirs.insert(0, cccl_include)
+                pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None')
+            include_dirs.append(hdr_dir)
+
+        prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))
 
         # Initialize CUDA
         checkCudaErrors(cudart.cudaFree(0))
diff --git a/cuda_bindings/examples/extra/isoFDModelling_test.py b/cuda_bindings/examples/extra/isoFDModelling_test.py
index 8885c310b3..8d7c4538a9 100644
--- a/cuda_bindings/examples/extra/isoFDModelling_test.py
+++ b/cuda_bindings/examples/extra/isoFDModelling_test.py
@@ -310,7 +310,7 @@ def createSource(self, kernel):
         freq = np.array(self.params.freqMax, dtype=np.float32)
 
         args = [buf, dt, freq, nt]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
         checkCudaErrors(
             cuda.cuLaunchKernel(
@@ -323,7 +323,7 @@ def createSource(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -351,7 +351,7 @@ def injectSource(self, kernel, iter):
         np_it = np.array(iter, dtype=np.uint32)
 
         args = [wavein + offset_sourceInject, src, np_it]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
         checkCudaErrors(
             cuda.cuLaunchKernel(
                 kernel.injectSource,
@@ -363,7 +363,7 @@ def injectSource(self, kernel, iter):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -391,7 +391,7 @@ def createVelocity(self, kernel):
         np_stride = np.array(stride, dtype=np.uint32)
 
         args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
 
@@ -407,7 +407,7 @@ def createVelocity(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -448,7 +448,7 @@ def executeCenter(self, kernel):
             np_nx,
             np_stride,
         ]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         # do center propagation from 2 * fd_order to nz - 2 * fd_order
         checkCudaErrors(
@@ -462,7 +462,7 @@ def executeCenter(self, kernel):
                 1,  # block dim
                 0,
                 self.streamCenter,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -503,7 +503,7 @@ def executeHalo(self, kernel):
             np_nx,
             np_stride,
         ]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
         # do halo up
         checkCudaErrors(
@@ -517,7 +517,7 @@ def executeHalo(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments
@@ -541,7 +541,7 @@ def executeHalo(self, kernel):
             np_nx,
             np_stride,
         ]
-        args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
+        argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
         checkCudaErrors(
             cuda.cuLaunchKernel(
                 kernel.fdPropag,
@@ -553,7 +553,7 @@ def executeHalo(self, kernel):
                 1,  # block dim
                 0,
                 self.streamHalo,  # shared mem and stream
-                args.ctypes.data,
+                argsp.ctypes.data,
                 0,
             )
         )  # arguments