From f19d167bbb86e1110d27cf913b2292d762696877 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 15 Jan 2026 15:14:33 -0800 Subject: [PATCH 1/5] Add multi-GPU CI runners (t4 and h100 with 2 GPUs) Add GPU_COUNT field to test matrix to support multi-GPU configurations. This enables rigorous testing of peer access, device switching, and other multi-GPU functionality in CI. Closes #1501 --- .github/workflows/test-wheel-linux.yml | 2 +- .github/workflows/test-wheel-windows.yml | 2 +- ci/test-matrix.yml | 108 ++++++++++++----------- 3 files changed, 57 insertions(+), 55 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 4f417b6a81..e8ffccbe64 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -78,7 +78,7 @@ jobs: strategy: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} - runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-1" + runs-on: "linux-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}" # The build stage could fail but we want the CI to keep moving. if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} # Our self-hosted runners require a container diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml index eaabdf47d0..6d7164edc9 100644 --- a/.github/workflows/test-wheel-windows.yml +++ b/.github/workflows/test-wheel-windows.yml @@ -66,7 +66,7 @@ jobs: fail-fast: false matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} - runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-1" + runs-on: "windows-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}" steps: - name: Checkout ${{ github.event.repository.name }} uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index 77bcc2d3f6..c6e5fa0efc 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -8,7 +8,7 @@ # # Please keep the matrices sorted in ascending order by the following: # -# [ARCH, PY_VER, CUDA_VER, LOCAL_CTK, GPU, DRIVER] +# [ARCH, PY_VER, CUDA_VER, LOCAL_CTK, GPU, GPU_COUNT, DRIVER] # # Windows entries also include DRIVER_MODE. # @@ -17,62 +17,64 @@ linux: pull-request: - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'v100', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } - - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.11', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } nightly: [] special_runners: amd64: - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'H100', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'H100', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' } windows: pull-request: - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtx2080', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'v100', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtx4090', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'rtx4090', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtxpro6000', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', DRIVER: 'latest', DRIVER_MODE: 'TCC' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } - - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'rtx2080', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { ARCH: 'amd64', PY_VER: '3.10', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtx4090', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } + - { ARCH: 'amd64', PY_VER: '3.11', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'rtx4090', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'WDDM' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } + - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } + - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'MCDM' } nightly: [] From cc13ecf2b9f20d5d9b5465740cf19e36adc74ee2 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 15 Jan 2026 15:18:04 -0800 Subject: [PATCH 2/5] Remove special_runners, integrate entries into pull-request matrix Simplifies CI configuration by moving special runner entries directly into the pull-request matrix rather than handling them separately. --- .github/workflows/test-wheel-linux.yml | 6 ------ ci/test-matrix.yml | 10 ++++------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index e8ffccbe64..65fb0f4c5c 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -57,12 +57,6 @@ jobs: # Read base matrix from YAML file for the specific architecture TEST_MATRIX=$(yq -o json ".linux[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml) - # Add special runner for amd64 if applicable - if [[ "${ARCH}" == "amd64" ]]; then - SPECIAL_RUNNERS=$(yq -o json '.linux.special_runners.amd64' ci/test-matrix.yml) - TEST_MATRIX=$(echo "$TEST_MATRIX" | yq -o json ". + $SPECIAL_RUNNERS") - fi - # Apply matrix filter and wrap in include structure MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') diff --git a/ci/test-matrix.yml b/ci/test-matrix.yml index c6e5fa0efc..81dd98f92a 100644 --- a/ci/test-matrix.yml +++ b/ci/test-matrix.yml @@ -27,8 +27,12 @@ linux: - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.1.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } + - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } @@ -50,12 +54,6 @@ linux: - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } - { ARCH: 'arm64', PY_VER: '3.14t', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'a100', GPU_COUNT: '1', DRIVER: 'latest' } nightly: [] - special_runners: - amd64: - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'H100', GPU_COUNT: '1', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '2', DRIVER: 'latest' } - - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.1.0', LOCAL_CTK: '1', GPU: 'h100', GPU_COUNT: '2', DRIVER: 'latest' } windows: pull-request: From 074a6ca9dddeec5d2016a9059f617c187b4d0ec1 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 15 Jan 2026 15:27:50 -0800 Subject: [PATCH 3/5] Fix test to avoid cuRAND dependency Replace cp.random.default_rng() with cp.arange() to avoid requiring cuRAND library which may not match the installed CUDA version. --- cuda_core/tests/test_launcher.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index ae3e5531c1..baab983cb2 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -348,13 +348,11 @@ def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_reso else: array = cp.from_dlpack(buffer).view(dtype=dtype) - # Initialize data with random values + # Initialize data with arbitrary values if mr.is_host_accessible: - rng = np.random.default_rng() - array[:] = rng.random(size, dtype=dtype) + array[:] = np.arange(size, dtype=dtype) + 1.0 else: - rng = cp.random.default_rng() - array[:] = rng.random(size, dtype=dtype) + array[:] = cp.arange(size, dtype=dtype) + 1.0 # Store original values for verification original = array.copy() From 31843168d5c4fc070e2ea2bb79e74c168bb53544 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 15 Jan 2026 15:32:49 -0800 Subject: [PATCH 4/5] Add notes about GPU_COUNT and column alignment --- .github/workflows/test-wheel-linux.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-wheel-linux.yml b/.github/workflows/test-wheel-linux.yml index 65fb0f4c5c..78a2fa1c4f 100644 --- a/.github/workflows/test-wheel-linux.yml +++ b/.github/workflows/test-wheel-linux.yml @@ -67,7 +67,7 @@ jobs: echo "OLD_BRANCH=${OLD_BRANCH}" >> "$GITHUB_OUTPUT" test: - name: py${{ matrix.PY_VER }}, ${{ matrix.CUDA_VER }}, ${{ (matrix.LOCAL_CTK == '1' && 'local') || 'wheels' }}, ${{ matrix.GPU }} + name: py${{ matrix.PY_VER }}, ${{ matrix.CUDA_VER }}, ${{ (matrix.LOCAL_CTK == '1' && 'local') || 'wheels' }}, ${{ matrix.GPU }}${{ matrix.GPU_COUNT != '1' && format('(x{0})', matrix.GPU_COUNT) || '' }} needs: compute-matrix strategy: fail-fast: false From 4214c2c179eb999bbf0b05a663f4a0733b09bba1 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 15 Jan 2026 15:48:22 -0800 Subject: [PATCH 5/5] Use CuPy's new RNG to avoid libcurand dependency Switch from cp.random.random() (old RNG requiring libcurand.so) to cp.random.default_rng().random() (new RNG with pre-compiled curand device libs bundled in CuPy). --- cuda_core/examples/simple_multi_gpu_example.py | 10 ++++++---- cuda_core/tests/test_launcher.py | 8 +++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index 438a21c808..497a4309cf 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -88,8 +88,9 @@ def __cuda_stream__(self): # Allocate memory on GPU 0 # Note: This runs on CuPy's current stream for GPU 0 dev0.set_current() -a = cp.random.random(size, dtype=dtype) -b = cp.random.random(size, dtype=dtype) +rng = cp.random.default_rng() +a = rng.random(size, dtype=dtype) +b = rng.random(size, dtype=dtype) c = cp.empty_like(a) cp_stream0 = dev0.create_stream(StreamAdaptor(cp.cuda.get_current_stream())) @@ -103,8 +104,9 @@ def __cuda_stream__(self): # Allocate memory on GPU 1 # Note: This runs on CuPy's current stream for GPU 1. dev1.set_current() -x = cp.random.random(size, dtype=dtype) -y = cp.random.random(size, dtype=dtype) +rng = cp.random.default_rng() +x = rng.random(size, dtype=dtype) +y = rng.random(size, dtype=dtype) z = cp.empty_like(a) cp_stream1 = dev1.create_stream(StreamAdaptor(cp.cuda.get_current_stream())) diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index baab983cb2..ae3e5531c1 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -348,11 +348,13 @@ def test_launch_with_buffers_allocated_by_memory_resource(init_cuda, memory_reso else: array = cp.from_dlpack(buffer).view(dtype=dtype) - # Initialize data with arbitrary values + # Initialize data with random values if mr.is_host_accessible: - array[:] = np.arange(size, dtype=dtype) + 1.0 + rng = np.random.default_rng() + array[:] = rng.random(size, dtype=dtype) else: - array[:] = cp.arange(size, dtype=dtype) + 1.0 + rng = cp.random.default_rng() + array[:] = rng.random(size, dtype=dtype) # Store original values for verification original = array.copy()