Skip to content
10 changes: 10 additions & 0 deletions .github/workflows/test-wheel-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,16 @@ jobs:
LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
run: run-tests bindings

# Runs pytest over the examples/ directory from inside cuda_bindings/.
# The step only executes when SKIP_CUDA_BINDINGS_TEST equals '0'.
# CUDA_VER and LOCAL_CTK are forwarded from the job matrix into the step's
# environment. The pytest command line is prefixed with ${SANITIZER_CMD},
# whose value is defined outside this step (not visible here).
- name: Run cuda.bindings examples
if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
env:
CUDA_VER: ${{ matrix.CUDA_VER }}
LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
run: |
pushd cuda_bindings
${SANITIZER_CMD} pytest -ra -s -vv examples/
popd
- name: Run cuda.core tests
env:
CUDA_VER: ${{ matrix.CUDA_VER }}
Expand Down
11 changes: 11 additions & 0 deletions .github/workflows/test-wheel-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,17 @@ jobs:
shell: bash --noprofile --norc -xeuo pipefail {0}
run: run-tests bindings

# Runs pytest over the examples/ directory from inside cuda_bindings/.
# The step only executes when SKIP_CUDA_BINDINGS_TEST equals '0'.
# CUDA_VER and LOCAL_CTK are forwarded from the job matrix into the step's
# environment. The script runs under bash with -x (trace), -e (exit on error),
# -u (unset variables are errors) and -o pipefail.
# NOTE(review): because of -u, ${SANITIZER_CMD} must be defined (possibly
# empty) before this step runs - confirm it is always exported upstream.
- name: Run cuda.bindings examples
if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
env:
CUDA_VER: ${{ matrix.CUDA_VER }}
LOCAL_CTK: ${{ matrix.LOCAL_CTK }}
shell: bash --noprofile --norc -xeuo pipefail {0}
run: |
pushd cuda_bindings
${SANITIZER_CMD} pytest -ra -s -vv examples/
popd
- name: Run cuda.core tests
env:
CUDA_VER: ${{ matrix.CUDA_VER }}
Expand Down
2 changes: 1 addition & 1 deletion ci/test-matrix.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ linux:
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' }
- { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' }
- { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1114,7 +1114,6 @@ def MatrixMultiply(dimsA, dimsB, kernel_number):


def main():
common.pytest_skipif_cuda_include_not_found()
common.pytest_skipif_compute_capability_too_low(findCudaDevice(), (7, 0))

print("[globalToShmemAsyncCopy] - Starting...")
Expand Down
38 changes: 10 additions & 28 deletions cuda_bindings/examples/common/common.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,15 @@
# Copyright 2021-2025 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import os

import numpy as np
from common.helper_cuda import checkCudaErrors
from cuda import pathfinder
from cuda.bindings import driver as cuda
from cuda.bindings import nvrtc
from cuda.bindings import runtime as cudart


def get_cuda_home():
    """Return the CUDA toolkit root named by the environment, or ``None``.

    ``CUDA_HOME`` is consulted first and ``CUDA_PATH`` second.  A variable
    that is unset *or set to an empty string* is skipped, so an empty
    ``CUDA_HOME`` no longer hides a usable ``CUDA_PATH`` and callers never
    receive ``""`` (which would turn ``os.path.join(home, "include")`` into
    the relative path ``"include"``).

    Returns:
        The first non-empty value found, or ``None`` when neither variable
        provides one.
    """
    for var_name in ("CUDA_HOME", "CUDA_PATH"):
        cuda_home = os.getenv(var_name)
        if cuda_home:
            return cuda_home
    return None


def pytest_skipif_cuda_include_not_found():
    """Skip the running pytest test when the CUDA include directory is missing.

    The toolkit root is obtained from ``get_cuda_home()``.  The test is
    skipped when no root is configured, or when ``<root>/include`` is not a
    directory.  ``pytest.skip`` raises, so nothing after a skip call runs.

    ``pytest`` is imported inside the function so that importing this module
    does not itself require pytest.
    """
    import pytest

    cuda_home = get_cuda_home()
    if cuda_home is None:
        pytest.skip("CUDA_HOME/CUDA_PATH not set")

    cuda_include = os.path.join(cuda_home, "include")
    # Use isdir rather than exists: a plain file named "include" is just as
    # unusable as a missing path, and code in this module that consumes the
    # path asserts os.path.isdir on it.
    if not os.path.isdir(cuda_include):
        pytest.skip(f"$CUDA_HOME/include does not exist: '{cuda_include}'")


def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor):
import pytest

Expand All @@ -44,16 +26,16 @@ def pytest_skipif_compute_capability_too_low(devID, required_cc_major_minor):

class KernelHelper:
def __init__(self, code, devID):
prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))
include_dirs = []
for libname in ("cudart", "cccl"):
hdr_dir = pathfinder.find_nvidia_header_directory(libname)
if hdr_dir is None:
import pytest

cuda_home = get_cuda_home()
assert cuda_home is not None
cuda_include = os.path.join(cuda_home, "include")
assert os.path.isdir(cuda_include)
include_dirs = [cuda_include]
cccl_include = os.path.join(cuda_include, "cccl")
if os.path.isdir(cccl_include):
include_dirs.insert(0, cccl_include)
pytest.skip(f'pathfinder.find_nvidia_header_directory("{libname}") returned None')
include_dirs.append(hdr_dir)

prog = checkCudaErrors(nvrtc.nvrtcCreateProgram(str.encode(code), b"sourceCode.cu", 0, None, None))

# Initialize CUDA
checkCudaErrors(cudart.cudaFree(0))
Expand Down
24 changes: 12 additions & 12 deletions cuda_bindings/examples/extra/isoFDModelling_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def createSource(self, kernel):
freq = np.array(self.params.freqMax, dtype=np.float32)

args = [buf, dt, freq, nt]
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
checkCudaErrors(cuda.cuCtxSetCurrent(self.context))
checkCudaErrors(
cuda.cuLaunchKernel(
Expand All @@ -323,7 +323,7 @@ def createSource(self, kernel):
1, # block dim
0,
self.streamHalo, # shared mem and stream
args.ctypes.data,
argsp.ctypes.data,
0,
)
) # arguments
Expand Down Expand Up @@ -351,7 +351,7 @@ def injectSource(self, kernel, iter):
np_it = np.array(iter, dtype=np.uint32)

args = [wavein + offset_sourceInject, src, np_it]
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
checkCudaErrors(
cuda.cuLaunchKernel(
kernel.injectSource,
Expand All @@ -363,7 +363,7 @@ def injectSource(self, kernel, iter):
1, # block dim
0,
self.streamHalo, # shared mem and stream
args.ctypes.data,
argsp.ctypes.data,
0,
)
) # arguments
Expand Down Expand Up @@ -391,7 +391,7 @@ def createVelocity(self, kernel):
np_stride = np.array(stride, dtype=np.uint32)

args = [vel + offset_velocity, np_dx_dt2, np_nz, np_nx, np_stride]
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)

checkCudaErrors(cuda.cuCtxSetCurrent(self.context))

Expand All @@ -407,7 +407,7 @@ def createVelocity(self, kernel):
1, # block dim
0,
self.streamHalo, # shared mem and stream
args.ctypes.data,
argsp.ctypes.data,
0,
)
) # arguments
Expand Down Expand Up @@ -448,7 +448,7 @@ def executeCenter(self, kernel):
np_nx,
np_stride,
]
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)

# do center propagation from 2 * fd_order to nz - 2 * fd_order
checkCudaErrors(
Expand All @@ -462,7 +462,7 @@ def executeCenter(self, kernel):
1, # block dim
0,
self.streamCenter, # shared mem and stream
args.ctypes.data,
argsp.ctypes.data,
0,
)
) # arguments
Expand Down Expand Up @@ -503,7 +503,7 @@ def executeHalo(self, kernel):
np_nx,
np_stride,
]
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)

# do halo up
checkCudaErrors(
Expand All @@ -517,7 +517,7 @@ def executeHalo(self, kernel):
1, # block dim
0,
self.streamHalo, # shared mem and stream
args.ctypes.data,
argsp.ctypes.data,
0,
)
) # arguments
Expand All @@ -541,7 +541,7 @@ def executeHalo(self, kernel):
np_nx,
np_stride,
]
args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
argsp = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
checkCudaErrors(
cuda.cuLaunchKernel(
kernel.fdPropag,
Expand All @@ -553,7 +553,7 @@ def executeHalo(self, kernel):
1, # block dim
0,
self.streamHalo, # shared mem and stream
args.ctypes.data,
argsp.ctypes.data,
0,
)
) # arguments
Expand Down
Loading