From cca8dd7a6ab19b1710c9d801f3a6198fb7b29e35 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 6 Jan 2026 14:07:52 +0100 Subject: [PATCH 1/7] fix: Don't filter device nodes when disable-multiple-csv-devices is enabled This change allows the "multiple-csv-devices" feature to be disabled using the feature flag. Without this change the /dev/nvidia1 device node is filtered out of the required set of devices which breaks certain applications. Signed-off-by: Evan Lezar --- pkg/nvcdi/lib-csv.go | 49 +++++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/pkg/nvcdi/lib-csv.go b/pkg/nvcdi/lib-csv.go index 77fcd95b3..cea5ffc11 100644 --- a/pkg/nvcdi/lib-csv.go +++ b/pkg/nvcdi/lib-csv.go @@ -46,11 +46,7 @@ var _ deviceSpecGeneratorFactory = (*csvlib)(nil) // If NVML is not available or the disable-multiple-csv-devices feature flag is // enabled, a single device is assumed. func (l *csvlib) DeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, error) { - if l.featureFlags[FeatureDisableMultipleCSVDevices] { - return l.purecsvDeviceSpecGenerators(ids...) - } - hasNVML, _ := l.infolib.HasNvml() - if !hasNVML { + if l.usePureCSVDeviceSpecGenerator() { return l.purecsvDeviceSpecGenerators(ids...) } mixed, err := l.mixedDeviceSpecGenerators(ids...) 
@@ -61,6 +57,24 @@ func (l *csvlib) DeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, error return mixed, nil } +func (l *csvlib) usePureCSVDeviceSpecGenerator() bool { + if l.featureFlags[FeatureDisableMultipleCSVDevices] { + return true + } + hasNVML, _ := l.infolib.HasNvml() + if !hasNVML { + return true + } + asNvmlLib := (*nvmllib)(l) + err := asNvmlLib.init() + if err != nil { + return true + } + defer asNvmlLib.tryShutdown() + + return false +} + func (l *csvlib) purecsvDeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, error) { for _, id := range ids { switch id { @@ -74,6 +88,9 @@ func (l *csvlib) purecsvDeviceSpecGenerators(ids ...string) (DeviceSpecGenerator csvlib: l, index: 0, uuid: "", + // We set noFilterDeviceNodes to true to ensure that the /dev/nvidia[0-1] + // device nodes in the CSV files on the system are consumed as-is. + noFilterDeviceNodes: true, } return g, nil } @@ -86,8 +103,9 @@ func (l *csvlib) mixedDeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, // platform-specific CSV files. type csvDeviceGenerator struct { *csvlib - index int - uuid string + index int + uuid string + noFilterDeviceNodes bool } func (l *csvDeviceGenerator) GetUUID() (string, error) { @@ -132,14 +150,17 @@ func (l *csvDeviceGenerator) GetDeviceSpecs() ([]specs.Device, error) { // particular device is added to the set of device nodes to be discovered. func (l *csvDeviceGenerator) deviceNodeDiscoverer() (discover.Discover, error) { mountSpecs := tegra.Transform( - tegra.Transform( - tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...), - // We remove non-device nodes. - tegra.OnlyDeviceNodes(), - ), - // We remove the regular (nvidia[0-9]+) device nodes. - tegra.WithoutRegularDeviceNodes(), + tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...), + // We remove non-device nodes. + tegra.OnlyDeviceNodes(), ) + if !l.noFilterDeviceNodes { + mountSpecs = tegra.Transform( + mountSpecs, + // We remove the regular (nvidia[0-9]+) device nodes. 
+ tegra.WithoutRegularDeviceNodes(), + ) + } return tegra.New( tegra.WithLogger(l.logger), tegra.WithDriverRoot(l.driverRoot), From 6b74d0c44acf375232c4d2b3c8dedac2c399d1ed Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 6 Jan 2026 14:11:26 +0100 Subject: [PATCH 2/7] fix: Use pure CSV mode when a single device is found This change ensures that the pure CSV mode (with no device node filtering) is used when a single device is present on the system. This enables use cases such as VPI applications where the /dev/nvidia1 device node is used even though an iGPU is present. Signed-off-by: Evan Lezar --- pkg/nvcdi/lib-csv.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/nvcdi/lib-csv.go b/pkg/nvcdi/lib-csv.go index cea5ffc11..c01115462 100644 --- a/pkg/nvcdi/lib-csv.go +++ b/pkg/nvcdi/lib-csv.go @@ -72,7 +72,12 @@ func (l *csvlib) usePureCSVDeviceSpecGenerator() bool { } defer asNvmlLib.tryShutdown() - return false + numDevices, ret := l.nvmllib.DeviceGetCount() + if ret != nvml.SUCCESS { + return true + } + + return numDevices <= 1 } func (l *csvlib) purecsvDeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, error) { From b10bd59e26361decda4fd18e216ffbb3e2682438 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 7 Jan 2026 16:27:47 +0100 Subject: [PATCH 3/7] fix: Ensure that iGPU device nodes include /dev/nvidia2 Signed-off-by: Evan Lezar --- internal/platform-support/tegra/filter.go | 11 ++ .../platform-support/tegra/mount_specs.go | 4 + pkg/nvcdi/lib-csv.go | 125 ++++++++++++------ 3 files changed, 97 insertions(+), 43 deletions(-) diff --git a/internal/platform-support/tegra/filter.go b/internal/platform-support/tegra/filter.go index 989906dc0..8644b46d5 100644 --- a/internal/platform-support/tegra/filter.go +++ b/internal/platform-support/tegra/filter.go @@ -77,6 +77,9 @@ type matcherAsFilter struct { } type filterByMountSpecType map[csv.MountSpecType]filter +type filterByMountSpecPathsByTyper struct { + MountSpecPathsByTyper
+} type pathPatterns []string type pathPattern string @@ -125,6 +128,14 @@ func (p filterByMountSpecType) Apply(input MountSpecPathsByTyper) MountSpecPaths return ms } +func (p filterByMountSpecPathsByTyper) Apply(input MountSpecPathsByTyper) MountSpecPathsByTyper { + f := make(filterByMountSpecType) + for t, p := range p.MountSpecPathsByType() { + f[t] = &matcherAsFilter{pathPatterns(p)} + } + return f.Apply(input) +} + // apply uses a matcher to filter an input string. // Each element in the input that matches is skipped and the remaining elements // are returned. diff --git a/internal/platform-support/tegra/mount_specs.go b/internal/platform-support/tegra/mount_specs.go index 05c766830..acd3c9c66 100644 --- a/internal/platform-support/tegra/mount_specs.go +++ b/internal/platform-support/tegra/mount_specs.go @@ -130,6 +130,10 @@ func WithoutDeviceNodes() Transformer { } } +func Without(m MountSpecPathsByTyper) Transformer { + return filterByMountSpecPathsByTyper{m} +} + // WithoutRegularDeviceNodes creates a transfomer which removes // regular `/dev/nvidia[0-9]+` device nodes from the source. func WithoutRegularDeviceNodes() Transformer { diff --git a/pkg/nvcdi/lib-csv.go b/pkg/nvcdi/lib-csv.go index c01115462..ffd10cef5 100644 --- a/pkg/nvcdi/lib-csv.go +++ b/pkg/nvcdi/lib-csv.go @@ -93,9 +93,6 @@ func (l *csvlib) purecsvDeviceSpecGenerators(ids ...string) (DeviceSpecGenerator csvlib: l, index: 0, uuid: "", - // We set noFilterDeviceNodes to true to ensure that the /dev/nvidia[0-1] - // device nodes in the CSV files on the system are consumed as-is. - noFilterDeviceNodes: true, } return g, nil } @@ -108,9 +105,9 @@ func (l *csvlib) mixedDeviceSpecGenerators(ids ...string) (DeviceSpecGenerator, // platform-specific CSV files. 
type csvDeviceGenerator struct { *csvlib - index int - uuid string - noFilterDeviceNodes bool + index int + uuid string + mode string } func (l *csvDeviceGenerator) GetUUID() (string, error) { @@ -154,18 +151,6 @@ func (l *csvDeviceGenerator) GetDeviceSpecs() ([]specs.Device, error) { // - The device node (i.e. /dev/nvidia{{ .index }}) associated with this // particular device is added to the set of device nodes to be discovered. func (l *csvDeviceGenerator) deviceNodeDiscoverer() (discover.Discover, error) { - mountSpecs := tegra.Transform( - tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...), - // We remove non-device nodes. - tegra.OnlyDeviceNodes(), - ) - if !l.noFilterDeviceNodes { - mountSpecs = tegra.Transform( - mountSpecs, - // We remove the regular (nvidia[0-9]+) device nodes. - tegra.WithoutRegularDeviceNodes(), - ) - } return tegra.New( tegra.WithLogger(l.logger), tegra.WithDriverRoot(l.driverRoot), @@ -173,12 +158,45 @@ func (l *csvDeviceGenerator) deviceNodeDiscoverer() (discover.Discover, error) { tegra.WithHookCreator(l.hookCreator), tegra.WithLdconfigPath(l.ldconfigPath), tegra.WithLibrarySearchPaths(l.librarySearchPaths...), - tegra.WithMountSpecs( - mountSpecs, + tegra.WithMountSpecs(l.deviceNodeMountSpecs()), + ) +} + +func (l *csvDeviceGenerator) deviceNodeMountSpecs() tegra.MountSpecPathsByTyper { + mountSpecs := tegra.Transform( + tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...), + // We remove non-device nodes. + tegra.OnlyDeviceNodes(), + ) + switch l.mode { + // For a dGPU we remove all regular device nodes from the list of device + // nodes that we detect and only look for the node associated with the + // index. + case "dgpu": + return tegra.Merge( + tegra.Transform( + mountSpecs, + // We remove the regular (nvidia[0-9]+) device nodes. + tegra.WithoutRegularDeviceNodes(), + ), // We add the specific device node for this device. 
tegra.DeviceNodes(fmt.Sprintf("/dev/nvidia%d", l.index)), - ), - ) + ) + case "igpu": + return tegra.Merge( + tegra.Transform( + mountSpecs, + // We remove the /dev/nvidia1 device node. + // TODO: This assumes that the dGPU has the index 1 and remove + // it from the set of device nodes. + tegra.Without(tegra.DeviceNodes("/dev/nvidia1")), + ), + // We add the display device from the iGPU. + tegra.DeviceNodes("/dev/nvidia2"), + ) + default: + return mountSpecs + } } // GetCommonEdits generates a CDI specification that can be used for ANY devices @@ -272,35 +290,56 @@ func (l *mixedcsvlib) csvDeviceSpecGenerator(index int, uuid string, device nvml return nil, fmt.Errorf("is-integrated check failed for device (index=%v,uuid=%v)", index, uuid) } + if isIntegrated { + return l.iGPUDeviceSpecGenerator(index, uuid) + } + + return l.dGPUDeviceSpecGenerator(index, uuid, device) +} + +func (l *mixedcsvlib) dGPUDeviceSpecGenerator(index int, uuid string, device nvml.Device) (DeviceSpecGenerator, error) { + if index != 1 { + return nil, fmt.Errorf("unexpected device index for dGPU: %d", index) + } g := &csvDeviceGenerator{ csvlib: (*csvlib)(l), index: index, uuid: uuid, + mode: "dgpu", } - if !isIntegrated { - csvDeviceNodeDiscoverer, err := g.deviceNodeDiscoverer() - if err != nil { - return nil, fmt.Errorf("failed to create discoverer for devices nodes: %w", err) - } + csvDeviceNodeDiscoverer, err := g.deviceNodeDiscoverer() + if err != nil { + return nil, fmt.Errorf("failed to create discoverer for devices nodes: %w", err) + } + + // If this is not an integrated GPU, we also create a spec generator for + // the full GPU. + dgpu := (*nvmllib)(l).withInit(&fullGPUDeviceSpecGenerator{ + nvmllib: (*nvmllib)(l), + uuid: uuid, + index: index, + // For the CSV case, we include the control device nodes at a + // device level. 
+ additionalDiscoverers: []discover.Discover{ + (*nvmllib)(l).controlDeviceNodeDiscoverer(), + csvDeviceNodeDiscoverer, + }, + featureFlags: l.featureFlags, + }) + return dgpu, nil +} - // If this is not an integrated GPU, we also create a spec generator for - // the full GPU. - dgpu := (*nvmllib)(l).withInit(&fullGPUDeviceSpecGenerator{ - nvmllib: (*nvmllib)(l), - uuid: uuid, - index: index, - // For the CSV case, we include the control device nodes at a - // device level. - additionalDiscoverers: []discover.Discover{ - (*nvmllib)(l).controlDeviceNodeDiscoverer(), - csvDeviceNodeDiscoverer, - }, - featureFlags: l.featureFlags, - }) - return dgpu, nil +func (l *mixedcsvlib) iGPUDeviceSpecGenerator(index int, uuid string) (DeviceSpecGenerator, error) { + if index != 0 { + return nil, fmt.Errorf("unexpected device index for iGPU: %d", index) + } + g := &csvDeviceGenerator{ + csvlib: (*csvlib)(l), + index: index, + uuid: uuid, + mode: "igpu", } - return g, nil } From 6e78e228597ab653f5d295d722d1a0e9af4f2bb0 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 22 Jan 2026 16:14:35 +0100 Subject: [PATCH 4/7] TOFIX: add csv test Signed-off-by: Evan Lezar --- pkg/nvcdi/lib-csv_test.go | 205 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 205 insertions(+) create mode 100644 pkg/nvcdi/lib-csv_test.go diff --git a/pkg/nvcdi/lib-csv_test.go b/pkg/nvcdi/lib-csv_test.go new file mode 100644 index 000000000..f88e9b76f --- /dev/null +++ b/pkg/nvcdi/lib-csv_test.go @@ -0,0 +1,205 @@ +/** +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvcdi + +import ( + "bytes" + "encoding/json" + "path/filepath" + "testing" + + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + testlog "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/require" + "tags.cncf.io/container-device-interface/specs-go" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/test" +) + +func TestDeviceSpecGenerators(t *testing.T) { + t.Setenv("__NVCT_TESTING_DEVICES_ARE_FILES", "true") + moduleRoot, err := test.GetModuleRoot() + require.NoError(t, err) + + logger, _ := testlog.NewNullLogger() + + lookupRoot := filepath.Join(moduleRoot, "testdata", "lookup") + + testCases := []struct { + description string + + rootfsFolder string + + lib *csvlib + expectedError error + expectedSpecError error + expectedDeviceSpecs []specs.Device + }{ + { + description: "single orin CSV device", + rootfsFolder: "rootfs-orin", + lib: &csvlib{ + // test-case specific + infolib: &infoInterfaceMock{ + HasNvmlFunc: func() (bool, string) { return true, "forced" }, + }, + nvmllib: &mock.Interface{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 1, nvml.SUCCESS + }, + }, + }, + expectedDeviceSpecs: []specs.Device{ + { + Name: "0", + ContainerEdits: specs.ContainerEdits{ + DeviceNodes: []*specs.DeviceNode{ + {Path: "/dev/nvidia0", HostPath: 
"/dev/nvidia0"}, + {Path: "/dev/nvidia1", HostPath: "/dev/nvidia1"}, + }, + }, + }, + }, + }, + { + description: "thor device with dGPU", + rootfsFolder: "rootfs-thor-dgpu", + lib: &csvlib{ + // test-case specific + infolib: &infoInterfaceMock{ + HasNvmlFunc: func() (bool, string) { return true, "forced" }, + }, + nvmllib: &mock.Interface{ + InitFunc: func() nvml.Return { + return nvml.SUCCESS + }, + ShutdownFunc: func() nvml.Return { + return nvml.SUCCESS + }, + DeviceGetCountFunc: func() (int, nvml.Return) { + return 2, nvml.SUCCESS + }, + DeviceGetHandleByIndexFunc: func(n int) (nvml.Device, nvml.Return) { + switch n { + case 0: + device := &mock.Device{ + GetUUIDFunc: func() (string, nvml.Return) { + return "GPU-0", nvml.SUCCESS + }, + GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) { + return nvml.PciInfo{ + Bus: 1, + }, nvml.SUCCESS + }, + } + return device, nvml.SUCCESS + case 1: + device := &mock.Device{ + GetUUIDFunc: func() (string, nvml.Return) { + return "GPU-1", nvml.SUCCESS + }, + GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) { + return nvml.PciInfo{ + Bus: 3, + }, nvml.SUCCESS + }, + } + return device, nvml.SUCCESS + } + return nil, nvml.ERROR_INVALID_ARGUMENT + }, + }, + }, + expectedDeviceSpecs: []specs.Device{ + { + Name: "0", + ContainerEdits: specs.ContainerEdits{ + DeviceNodes: []*specs.DeviceNode{ + {Path: "/dev/nvidia0", HostPath: "/dev/nvidia0"}, + {Path: "/dev/nvidia2", HostPath: "/dev/nvidia2"}, + }, + }, + }, + { + Name: "1", + ContainerEdits: specs.ContainerEdits{ + DeviceNodes: []*specs.DeviceNode{ + {Path: "/dev/nvidia1", HostPath: "/dev/nvidia1"}, + }, + }, + }, + }, + }, + } + + for _, tc := range testCases { + driverRoot := filepath.Join(lookupRoot, tc.rootfsFolder) + + tc.lib.logger = logger + tc.lib.deviceNamers = []DeviceNamer{deviceNameIndex{}} + tc.lib.hookCreator = discover.NewHookCreator() + + tc.lib.devicelib = device.New(tc.lib.nvmllib) + + tc.lib.driverRoot = driverRoot + tc.lib.devRoot = driverRoot + 
tc.lib.csvFiles = []string{ + filepath.Join(driverRoot, "/etc/nvidia-container-runtime/host-files-for-container.d/devices.csv"), + filepath.Join(driverRoot, "/etc/nvidia-container-runtime/host-files-for-container.d/drivers.csv"), + } + + t.Run(tc.description, func(t *testing.T) { + generator, err := tc.lib.DeviceSpecGenerators("all") + + require.EqualValues(t, tc.expectedError, err) + + if tc.expectedError != nil { + return + } + + deviceSpecs, err := generator.GetDeviceSpecs() + require.EqualValues(t, tc.expectedSpecError, err) + require.EqualValues(t, tc.expectedDeviceSpecs, stripRoot(driverRoot, deviceSpecs)) + }) + } + +} + +func stripRoot[T any](root string, v T) T { + stringRep, err := json.Marshal(v) + if err != nil { + panic(err) + } + stringRep = bytes.ReplaceAll(stringRep, []byte(root), []byte("")) + + var modified T + err = json.Unmarshal(stringRep, &modified) + if err != nil { + panic(err) + } + return modified +} From c2bfab89abcc9c6f7b49e662278e15a0fbc5be18 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 3 Dec 2025 13:30:46 +0100 Subject: [PATCH 5/7] [no-relnote] Add minimal csv test filesystem Signed-off-by: Evan Lezar --- testdata/lookup/csv-mode/dev/nvidia0 | 0 .../host-files-for-container.d/devices.csv | 40 +++ .../host-files-for-container.d/drivers.csv | 264 ++++++++++++++++++ 3 files changed, 304 insertions(+) create mode 100644 testdata/lookup/csv-mode/dev/nvidia0 create mode 100644 testdata/lookup/csv-mode/etc/nvidia-container-runtime/host-files-for-container.d/devices.csv create mode 100644 testdata/lookup/csv-mode/etc/nvidia-container-runtime/host-files-for-container.d/drivers.csv diff --git a/testdata/lookup/csv-mode/dev/nvidia0 b/testdata/lookup/csv-mode/dev/nvidia0 new file mode 100644 index 000000000..e69de29bb diff --git a/testdata/lookup/csv-mode/etc/nvidia-container-runtime/host-files-for-container.d/devices.csv b/testdata/lookup/csv-mode/etc/nvidia-container-runtime/host-files-for-container.d/devices.csv new file mode 100644 
index 000000000..91e40a9eb --- /dev/null +++ b/testdata/lookup/csv-mode/etc/nvidia-container-runtime/host-files-for-container.d/devices.csv @@ -0,0 +1,40 @@ +dev, /dev/dri/card* +dev, /dev/dri/renderD* +dir, /dev/dri/by-path +dev, /dev/fb0 +dev, /dev/fb1 +dev, /dev/host1x-fence +dev, /dev/nvhost-as-gpu +dev, /dev/nvhost-ctrl-gpu +dev, /dev/nvhost-ctrl-nvdla0 +dev, /dev/nvhost-ctrl-nvdla1 +dev, /dev/nvhost-ctrl-pva0 +dev, /dev/nvhost-ctxsw-gpu +dev, /dev/nvhost-dbg-gpu +dev, /dev/nvhost-gpu +dev, /dev/nvhost-nvsched-gpu +dev, /dev/nvhost-power-gpu +dev, /dev/nvhost-prof-ctx-gpu +dev, /dev/nvhost-prof-dev-gpu +dev, /dev/nvhost-prof-gpu +dev, /dev/nvhost-sched-gpu +dev, /dev/nvhost-tsg-gpu +dev, /dev/nvgpu/igpu0/as +dev, /dev/nvgpu/igpu0/channel +dev, /dev/nvgpu/igpu0/ctrl +dev, /dev/nvgpu/igpu0/ctxsw +dev, /dev/nvgpu/igpu0/dbg +dev, /dev/nvgpu/igpu0/nvsched +dev, /dev/nvgpu/igpu0/power +dev, /dev/nvgpu/igpu0/prof +dev, /dev/nvgpu/igpu0/prof-ctx +dev, /dev/nvgpu/igpu0/prof-dev +dev, /dev/nvgpu/igpu0/sched +dev, /dev/nvgpu/igpu0/tsg +dev, /dev/nvidia-modeset +dev, /dev/nvidia0 +dev, /dev/nvidiactl +dev, /dev/nvmap +dev, /dev/nvsciipc +dev, /dev/v4l2-nvdec +dev, /dev/v4l2-nvenc diff --git a/testdata/lookup/csv-mode/etc/nvidia-container-runtime/host-files-for-container.d/drivers.csv b/testdata/lookup/csv-mode/etc/nvidia-container-runtime/host-files-for-container.d/drivers.csv new file mode 100644 index 000000000..4b3d4cd68 --- /dev/null +++ b/testdata/lookup/csv-mode/etc/nvidia-container-runtime/host-files-for-container.d/drivers.csv @@ -0,0 +1,264 @@ +lib, /usr/lib/aarch64-linux-gnu/libv4l2.so.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/desktop-shell.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/drm-backend.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/EGLWLInputEventExample +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/EGLWLMockNavigation +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/fullscreen-shell.so +lib, 
/usr/lib/aarch64-linux-gnu/nvidia/weston/gl-renderer.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/hmi-controller.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/ivi-controller.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/ivi-shell.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/LayerManagerControl +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/libilmClient.so.2.2.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/libilmCommon.so.2.2.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/libilmControl.so.2.2.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/libilmInput.so.2.2.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/libweston-6.so.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/libweston-desktop-6.so.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/simple-weston-client +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/spring-tool +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/wayland-backend.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-calibrator +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-clickdot +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-cliptest +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-content-protection +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-debug +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-desktop-shell +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-dnd +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-eventdemo +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-flower +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-fullscreen +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-image +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-info +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-ivi-shell-user-interface +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-keyboard +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-launch +lib, 
/usr/lib/aarch64-linux-gnu/nvidia/weston/weston-multi-resource +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-output-mode +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-resizor +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-scaler +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-screenshooter +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-simple-dmabuf-egldevice +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-simple-egl +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-simple-pattern-hdr +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-simple-shm +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-simple-touch +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-smoke +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-stacking +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-subsurfaces +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-terminal +lib, /usr/lib/aarch64-linux-gnu/nvidia/weston/weston-transformed +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvarguscamerasrc.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvcompositor.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvdrmvideosink.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnveglglessink.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnveglstreamsrc.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvegltransform.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvipcpipeline.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvivafilter.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvjpeg.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvtee.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvv4l2camerasrc.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvvidconv.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvvideo4linux2.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvvideosink.so +lib, 
/usr/lib/aarch64-linux-gnu/gstreamer-1.0/libgstnvvideosinks.so +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/nvgstcapture-1.0_README.txt +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/nvgstipctestapp-1.0_README.txt +lib, /usr/lib/aarch64-linux-gnu/gstreamer-1.0/nvgstplayer-1.0_README.txt +lib, /usr/lib/aarch64-linux-gnu/libgstnvegl-1.0.so.0 +lib, /usr/lib/aarch64-linux-gnu/libgstnvexifmeta.so +lib, /usr/lib/aarch64-linux-gnu/libgstnvivameta.so +lib, /usr/lib/aarch64-linux-gnu/libnvsample_cudaprocess.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libgstnvcustomhelper.so.1.0.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libgstnvdsseimeta.so.1.0.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnveglstreamproducer.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libgstnvcustomhelper.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libgstnvdsseimeta.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvdla_compiler.so +lib, /etc/vulkansc/icd.d/nvidia_icd_vksc.json +lib, /usr/lib/aarch64-linux-gnu/nvidia/ld.so.conf +lib, /usr/lib/aarch64-linux-gnu/nvidia/libcuda.so.1.1 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libGLX_nvidia.so.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libjetsonpower.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvargus.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvargus_socketclient.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvargus_socketserver.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvbuf_fdmap.so.1.0.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvbufsurface.so.1.0.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvbufsurftransform.so.1.0.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvcameratools.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvcamerautils.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvcam_imageencoder.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvcamlog.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvcamv4l2.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvcapture.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvcolorutil.so +lib, 
/usr/lib/aarch64-linux-gnu/nvidia/libnvcucompat.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvcudla.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvcuvidv4l2.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvdc.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvddk_2d_v2.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvddk_vic.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvdecode2eglimage.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvdla_runtime.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvdsbufferpool.so.1.0.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnveventlib.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvexif.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvfnet.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvfnetstoredefog.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvfnetstorehdfx.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvfusacapinterface.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvfusacap.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvgov_boot.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvgov_camera.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvgov_force.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvgov_generic.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvgov_gpucompute.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvgov_graphics.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvgov_il.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvgov_spincircle.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvgov_tbc.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvgov_ui.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-allocator.so.1 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-eglcore.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-egl-gbm.so.1.1.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-egl-wayland.so.1.1.11 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-glcore.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-glsi.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-glvkspirv.so.540.3.0 +lib, 
/usr/lib/aarch64-linux-gnu/nvidia/libnvidia-gpucomp.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-kms.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-ml.so.1 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-nvvm.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-ptxjitcompiler.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-rmapi-tegra.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-rtcore.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-tls.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-vksc-core.so.540.3.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvid_mapper.so.1.0.0 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvimp.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvisppg.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvisp.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvisp_utils.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvjpeg.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_2d.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia2d.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_dla.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_eglstream.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_ide_parser.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_ide_sci.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_iep_sci.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_ijpd_sci.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_ijpe_sci.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_iofa_sci.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_isp_ext.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedialdc.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmedia_tensor.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmm_contentpipe.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmmlite_image.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmmlite.so +lib, 
/usr/lib/aarch64-linux-gnu/nvidia/libnvmmlite_utils.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmmlite_video.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmm_parser.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmm.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvmm_utils.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvodm_imager.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvofsdk.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvoggopus.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvomxilclient.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvomx.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvosd.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvos.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvparser.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvphsd.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvphs.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvplayfair.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvpva_algorithms.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvpvaintf.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvpva.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvpvaumd.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvrm_chip.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvrm_gpu.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvrm_host1x.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvrm_mem.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvrm_stream.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvrm_surface.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvrm_sync.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvscf.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvscibuf.so.1 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvscicommon.so.1 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvscievent.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvsciipc.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvscistream.so.1 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvscisync.so.1 +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvsocsys.so +lib, 
/usr/lib/aarch64-linux-gnu/nvidia/libnvtegrahv.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvtracebuf.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvtvmr_2d.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvtvmr.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvv4l2.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvv4lconvert.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvvic.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvvideoencode_ppe.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libnvvideo.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libsensors.hal-client.nvs.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libsensors_hal.nvs.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libsensors.l4t.no_fusion.nvs.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libtegrav4l2.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libtegrawfd.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libv4l2_nvargus.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libv4l2_nvcuvidvideocodec.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libv4l2_nvvideocodec.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libVkLayer_json_gen.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libVkSCLayer_khronos_validation.so +lib, /usr/lib/aarch64-linux-gnu/nvidia/libvulkansc.so.1.0.10 +lib, /usr/lib/aarch64-linux-gnu/nvidia/nvidia_icd.json +lib, /usr/lib/aarch64-linux-gnu/tegra-egl/ld.so.conf +lib, /usr/lib/aarch64-linux-gnu/tegra-egl/libEGL_nvidia.so.0 +lib, /usr/lib/aarch64-linux-gnu/tegra-egl/libGLESv1_CM_nvidia.so.1 +lib, /usr/lib/aarch64-linux-gnu/tegra-egl/libGLESv2_nvidia.so.2 +lib, /usr/lib/aarch64-linux-gnu/tegra-egl/nvidia.json +sym, /etc/vulkan/icd.d/nvidia_icd.json +sym, /usr/lib/aarch64-linux-gnu/gbm/nvidia-drm_gbm.so +sym, /usr/lib/aarch64-linux-gnu/gbm/tegra_gbm.so +sym, /usr/lib/aarch64-linux-gnu/gbm/tegra-udrm_gbm.so +sym, /usr/lib/aarch64-linux-gnu/libcuda.so +sym, /usr/lib/aarch64-linux-gnu/libnvcucompat.so +sym, /usr/lib/aarch64-linux-gnu/libnvcudla.so +sym, /usr/lib/aarch64-linux-gnu/libv4l2.so.0.0.999999 +sym, 
/usr/lib/aarch64-linux-gnu/libv4lconvert.so.0.0.999999 +sym, /usr/lib/aarch64-linux-gnu/libv4l/plugins/nv/libv4l2_nvargus.so +sym, /usr/lib/aarch64-linux-gnu/libv4l/plugins/nv/libv4l2_nvcuvidvideocodec.so +sym, /usr/lib/aarch64-linux-gnu/libv4l/plugins/nv/libv4l2_nvvideocodec.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libcuda.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libcuda.so.1 +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvbufsurface.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvbufsurftransform.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvdsbufferpool.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-allocator.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-egl-gbm.so.1 +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-egl-wayland.so.1 +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-kms.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-nvvm.so.4 +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-ptxjitcompiler.so.1 +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-vksc-core.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvidia-vksc-core.so.1 +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvid_mapper.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvscibuf.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvscicommon.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvscistream.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libnvscisync.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libv4l2.so.0 +sym, /usr/lib/aarch64-linux-gnu/nvidia/libv4lconvert.so.0 +sym, /usr/lib/aarch64-linux-gnu/nvidia/libvulkansc.so +sym, /usr/lib/aarch64-linux-gnu/nvidia/libvulkansc.so.1 +sym, /usr/lib/aarch64-linux-gnu/tegra +sym, /usr/share/glvnd/egl_vendor.d/10_nvidia.json +lib, //lib/firmware/tegra19x/nvhost_nvdec040_ns.fw +lib, /lib/firmware/tegra19x/nvhost_nvdec040_ns.fw +lib, /lib/firmware/tegra23x/nvhost_nvdec050_desc_prod.bin +lib, /usr/sbin/nvidia-smi +lib, /usr/share/doc/nvidia-tegra/LICENSE.nvidia-smi From 42c50bf44ce27ec29174001d8e297fccbcd5e491 Mon Sep 17 00:00:00 2001 From: Evan 
Lezar Date: Wed, 3 Dec 2025 13:31:04 +0100 Subject: [PATCH 6/7] Add csv lib test Signed-off-by: Evan Lezar --- pkg/nvcdi/lib-csv_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/nvcdi/lib-csv_test.go b/pkg/nvcdi/lib-csv_test.go index f88e9b76f..62f086507 100644 --- a/pkg/nvcdi/lib-csv_test.go +++ b/pkg/nvcdi/lib-csv_test.go @@ -1,5 +1,5 @@ /** -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); From 48714071b1e75004ca76f47ea89aeacf23c037e4 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 22 Jan 2026 17:36:52 +0100 Subject: [PATCH 7/7] TOFIX: use Thor IGX mock Signed-off-by: Evan Lezar --- go.mod | 2 + go.sum | 2 - pkg/nvcdi/lib-csv.go | 15 +- pkg/nvcdi/lib-csv_test.go | 53 +- .../NVIDIA/go-nvml/pkg/nvml/mock/README.md | 396 +++++++++++++++ .../go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go | 405 ++------------- .../pkg/nvml/mock/dgxa100/mig-profile.go | 471 ------------------ .../NVIDIA/go-nvml/pkg/nvml/mock/gpus/a100.go | 455 +++++++++++++++++ .../NVIDIA/go-nvml/pkg/nvml/mock/gpus/a30.go | 249 +++++++++ .../NVIDIA/go-nvml/pkg/nvml/mock/gpus/b200.go | 360 +++++++++++++ .../NVIDIA/go-nvml/pkg/nvml/mock/gpus/gpu.go | 33 ++ .../NVIDIA/go-nvml/pkg/nvml/mock/gpus/h100.go | 326 ++++++++++++ .../NVIDIA/go-nvml/pkg/nvml/mock/gpus/h200.go | 326 ++++++++++++ .../go-nvml/pkg/nvml/mock/gpus/tegra.go | 19 + .../go-nvml/pkg/nvml/mock/server/options.go | 32 ++ .../go-nvml/pkg/nvml/mock/server/shared.go | 438 ++++++++++++++++ vendor/modules.txt | 5 +- 17 files changed, 2719 insertions(+), 868 deletions(-) create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/README.md delete mode 100644 
vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/a100.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/a30.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/b200.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/gpu.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/h100.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/h200.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/tegra.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/server/options.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/server/shared.go diff --git a/go.mod b/go.mod index 959d9c3a0..6942951a5 100644 --- a/go.mod +++ b/go.mod @@ -48,3 +48,5 @@ require ( gopkg.in/yaml.v3 v3.0.1 // indirect sigs.k8s.io/yaml v1.4.0 // indirect ) + +replace github.com/NVIDIA/go-nvml => ../go-nvml diff --git a/go.sum b/go.sum index 94f2ca87d..b17dcb83a 100644 --- a/go.sum +++ b/go.sum @@ -2,8 +2,6 @@ cyphar.com/go-pathrs v0.2.1 h1:9nx1vOgwVvX1mNBWDu93+vaceedpbsDqo+XuBGL40b8= cyphar.com/go-pathrs v0.2.1/go.mod h1:y8f1EMG7r+hCuFf/rXsKqMJrJAUoADZGNh5/vZPKcGc= github.com/NVIDIA/go-nvlib v0.9.1-0.20251202135446-d0f42ba016dd h1:MC1w/VYuo9Zt0se4SSx9BVid4a46ai+voN3knRvVWjE= github.com/NVIDIA/go-nvlib v0.9.1-0.20251202135446-d0f42ba016dd/go.mod h1:7mzx9FSdO9fXWP9NKuZmWkCwhkEcSWQFe2tmFwtLb9c= -github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= -github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= diff --git a/pkg/nvcdi/lib-csv.go 
b/pkg/nvcdi/lib-csv.go index ffd10cef5..5ddde70b5 100644 --- a/pkg/nvcdi/lib-csv.go +++ b/pkg/nvcdi/lib-csv.go @@ -173,14 +173,23 @@ func (l *csvDeviceGenerator) deviceNodeMountSpecs() tegra.MountSpecPathsByTyper // nodes that we detect and only look for the node associated with the // index. case "dgpu": - return tegra.Merge( + return tegra.Transform( tegra.Transform( mountSpecs, // We remove the regular (nvidia[0-9]+) device nodes. + // The device nodes for the GPU are discovered for the full GPU. tegra.WithoutRegularDeviceNodes(), ), - // We add the specific device node for this device. - tegra.DeviceNodes(fmt.Sprintf("/dev/nvidia%d", l.index)), + // We also ignore control device nodes since these are included in + // the full GPU spec generator. + tegra.Without( + tegra.DeviceNodes( + "/dev/nvidia-modeset", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-uvm", + "/dev/nvidiactl", + ), + ), ) case "igpu": return tegra.Merge( diff --git a/pkg/nvcdi/lib-csv_test.go b/pkg/nvcdi/lib-csv_test.go index 62f086507..089058c4b 100644 --- a/pkg/nvcdi/lib-csv_test.go +++ b/pkg/nvcdi/lib-csv_test.go @@ -30,6 +30,8 @@ import ( "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" "github.com/NVIDIA/go-nvml/pkg/nvml" "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/server" testlog "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/require" "tags.cncf.io/container-device-interface/specs-go" @@ -83,7 +85,6 @@ func TestDeviceSpecGenerators(t *testing.T) { ContainerEdits: specs.ContainerEdits{ DeviceNodes: []*specs.DeviceNode{ {Path: "/dev/nvidia0", HostPath: "/dev/nvidia0"}, - {Path: "/dev/nvidia1", HostPath: "/dev/nvidia1"}, }, }, }, @@ -97,46 +98,14 @@ func TestDeviceSpecGenerators(t *testing.T) { infolib: &infoInterfaceMock{ HasNvmlFunc: func() (bool, string) { return true, "forced" }, }, - nvmllib: &mock.Interface{ - InitFunc: func() nvml.Return { - return nvml.SUCCESS - }, - 
ShutdownFunc: func() nvml.Return { - return nvml.SUCCESS - }, - DeviceGetCountFunc: func() (int, nvml.Return) { - return 2, nvml.SUCCESS - }, - DeviceGetHandleByIndexFunc: func(n int) (nvml.Device, nvml.Return) { - switch n { - case 0: - device := &mock.Device{ - GetUUIDFunc: func() (string, nvml.Return) { - return "GPU-0", nvml.SUCCESS - }, - GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) { - return nvml.PciInfo{ - Bus: 1, - }, nvml.SUCCESS - }, - } - return device, nvml.SUCCESS - case 1: - device := &mock.Device{ - GetUUIDFunc: func() (string, nvml.Return) { - return "GPU-1", nvml.SUCCESS - }, - GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) { - return nvml.PciInfo{ - Bus: 3, - }, nvml.SUCCESS - }, - } - return device, nvml.SUCCESS - } - return nil, nvml.ERROR_INVALID_ARGUMENT - }, - }, + nvmllib: server.NewServerWithGPUs( + "580.00", + "", + 0, + gpus.THOR_IGX, + // TODO: This should be an RTX device. + gpus.A30_PCIE_24GB, + ), }, expectedDeviceSpecs: []specs.Device{ { @@ -144,6 +113,7 @@ func TestDeviceSpecGenerators(t *testing.T) { ContainerEdits: specs.ContainerEdits{ DeviceNodes: []*specs.DeviceNode{ {Path: "/dev/nvidia0", HostPath: "/dev/nvidia0"}, + {Path: "/dev/nvidiactl", HostPath: "/dev/nvidiactl"}, {Path: "/dev/nvidia2", HostPath: "/dev/nvidia2"}, }, }, @@ -153,6 +123,7 @@ func TestDeviceSpecGenerators(t *testing.T) { ContainerEdits: specs.ContainerEdits{ DeviceNodes: []*specs.DeviceNode{ {Path: "/dev/nvidia1", HostPath: "/dev/nvidia1"}, + {Path: "/dev/nvidiactl", HostPath: "/dev/nvidiactl"}, }, }, }, diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/README.md b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/README.md new file mode 100644 index 000000000..aabccb540 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/README.md @@ -0,0 +1,396 @@ +# NVML Mock Framework + +This package provides mock implementations of NVIDIA's NVML (NVIDIA Management Library) for testing and development purposes. 
The framework uses a shared factory system to define GPU configurations that can be easily extended and customized. + +## Architecture + +``` +pkg/nvml/mock/ +├── shared/ +│ ├── shared.go # Core shared factory and types +│ └── gpus/ # GPU configuration definitions +│ ├── a100.go # A100 GPU variants (Ampere) +│ ├── a30.go # A30 GPU variants (Ampere) +│ ├── h100.go # H100 GPU variants (Hopper) +│ ├── h200.go # H200 GPU variants (Hopper) +│ └── b200.go # B200 GPU variants (Blackwell) +├── dgxa100/ # DGX A100 implementation +│ ├── dgxa100.go # Server and device implementation +│ └── dgxa100_test.go # Comprehensive tests +├── dgxh100/ # DGX H100 implementation +│ ├── dgxh100.go # Server and device implementation +│ └── dgxh100_test.go # Comprehensive tests +├── dgxh200/ # DGX H200 implementation +│ ├── dgxh200.go # Server and device implementation +│ └── dgxh200_test.go # Comprehensive tests +└── dgxb200/ # DGX B200 implementation + ├── dgxb200.go # Server and device implementation + └── dgxb200_test.go # Comprehensive tests +``` + +## Core Concepts + +### Shared Factory (`shared.Config`) +Define the characteristics of individual GPU models including: + +- Device properties (name, architecture, brand, PCI device ID) +- Compute capabilities (CUDA version, compute capability) +- Memory configuration +- MIG (Multi-Instance GPU) profiles and placements + +### Server Configuration (`shared.ServerConfig`) +Define complete system configurations including: + +- GPU configuration and count +- Driver, NVML, and CUDA versions + +### MIG Profile Configuration (`shared.MIGProfileConfig`) +Define Multi-Instance GPU capabilities including: + +- GPU instance profiles (slice configurations) +- Compute instance profiles +- Placement constraints and possibilities + +## Usage Examples + +### Basic Usage + +```go +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh100" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxh200" + 
"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxb200" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus" +) + +// Create default systems +serverA100 := dgxa100.New() // A100-SXM4-40GB (8 GPUs) +serverH100 := dgxh100.New() // H100-SXM5-80GB (8 GPUs) +serverH200 := dgxh200.New() // H200-SXM5-141GB (8 GPUs) +serverB200 := dgxb200.New() // B200-SXM5-180GB (8 GPUs) + +// Create specific variants +serverA100_80GB := dgxa100.NewServerWithGPU(gpus.A100_SXM4_80GB) +serverH200_Custom := dgxh200.NewServerWithGPU(gpus.H200_SXM5_141GB) +serverB200_Custom := dgxb200.NewServerWithGPU(gpus.B200_SXM5_180GB) +``` + +### Device Creation + +```go +// Create devices with default configurations +deviceA100 := dgxa100.NewDevice(0) +deviceH100 := dgxh100.NewDevice(0) +deviceH200 := dgxh200.NewDevice(0) +deviceB200 := dgxb200.NewDevice(0) + +// Create devices with specific GPU variants +deviceA100_80GB := dgxa100.NewDeviceWithGPU(gpus.A100_SXM4_80GB, 0) +deviceH200_Custom := dgxh200.NewDeviceWithGPU(gpus.H200_SXM5_141GB, 1) +deviceB200_Custom := dgxb200.NewDeviceWithGPU(gpus.B200_SXM5_180GB, 2) +``` + +### Accessing GPU Configurations + +```go +// Available GPU configurations +// A100 Family +gpus.A100_SXM4_40GB // A100 SXM4 40GB +gpus.A100_SXM4_80GB // A100 SXM4 80GB +gpus.A100_PCIE_40GB // A100 PCIe 40GB +gpus.A100_PCIE_80GB // A100 PCIe 80GB + +// A30 Family +gpus.A30_PCIE_24GB // A30 PCIe 24GB + +// H100 Family +gpus.H100_SXM5_80GB // H100 SXM5 80GB + +// H200 Family +gpus.H200_SXM5_141GB // H200 SXM5 141GB + +// B200 Family +gpus.B200_SXM5_180GB // B200 SXM5 180GB + +// Inspect configurations +fmt.Printf("GPU: %s\n", gpus.A100_SXM4_80GB.Name) +fmt.Printf("Memory: %d MB\n", gpus.A100_SXM4_80GB.MemoryMB) +fmt.Printf("Architecture: %v\n", gpus.A100_SXM4_80GB.Architecture) +fmt.Printf("PCI Device ID: 0x%X\n", gpus.A100_SXM4_80GB.PciDeviceId) + +// Inspect H100 configuration +fmt.Printf("GPU: %s\n", gpus.H100_SXM5_80GB.Name) +fmt.Printf("Memory: %d MB\n", gpus.H100_SXM5_80GB.MemoryMB) 
+fmt.Printf("CUDA Major: %d\n", gpus.H100_SXM5_80GB.CudaMajor) + +// Inspect B200 configuration +fmt.Printf("GPU: %s\n", gpus.B200_SXM5_180GB.Name) +fmt.Printf("Memory: %d MB\n", gpus.B200_SXM5_180GB.MemoryMB) +fmt.Printf("CUDA Major: %d\n", gpus.B200_SXM5_180GB.CudaMajor) +``` + +## Available GPU Models + +### A100 Family (Ampere Architecture, 108 SMs) + +- **A100 SXM4 40GB** (`gpus.A100_SXM4_40GB`) + - Form factor: SXM4 + - Memory: 40GB HBM2 + - PCI Device ID: 0x20B010DE + - CUDA Capability: 8.0 + - SMs per slice: 14 (1-slice), 28 (2-slice), 42 (3-slice), 56 (4-slice), 98 (7-slice) + - MIG P2P: Not supported (`IsP2pSupported: 0`) + +- **A100 SXM4 80GB** (`gpus.A100_SXM4_80GB`) + - Form factor: SXM4 + - Memory: 80GB HBM2e + - PCI Device ID: 0x20B210DE + - CUDA Capability: 8.0 + +- **A100 PCIe 40GB** (`gpus.A100_PCIE_40GB`) + - Form factor: PCIe + - Memory: 40GB HBM2 + - PCI Device ID: 0x20F110DE + - CUDA Capability: 8.0 + +- **A100 PCIe 80GB** (`gpus.A100_PCIE_80GB`) + - Form factor: PCIe + - Memory: 80GB HBM2e + - PCI Device ID: 0x20B510DE + - CUDA Capability: 8.0 + +### A30 Family (Ampere Architecture, 56 SMs) + +- **A30 PCIe 24GB** (`gpus.A30_PCIE_24GB`) + - Form factor: PCIe + - Memory: 24GB HBM2 + - PCI Device ID: 0x20B710DE + - CUDA Capability: 8.0 + - SMs per slice: 14 (1-slice), 28 (2-slice), 56 (4-slice) + - MIG P2P: Not supported (`IsP2pSupported: 0`) + - MIG slices: 1, 2, 4 (no 3-slice or 7-slice support) + +### H100 Family (Hopper Architecture, 132 SMs) + +- **H100 SXM5 80GB** (`gpus.H100_SXM5_80GB`) + - Form factor: SXM5 + - Memory: 80GB HBM3 + - PCI Device ID: 0x233010DE + - CUDA Capability: 9.0 + - SMs per slice: 16 (1-slice), 32 (2-slice), 48 (3-slice), 64 (4-slice), 112 (7-slice) + - MIG P2P: Supported (`IsP2pSupported: 1`) + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles + +### H200 Family (Hopper Architecture, 132 SMs) + +- **H200 SXM5 141GB** (`gpus.H200_SXM5_141GB`) + - Form factor: SXM5 + - Memory: 141GB HBM3e + - PCI 
Device ID: 0x233310DE + - CUDA Capability: 9.0 + - SMs per slice: 16 (1-slice), 32 (2-slice), 48 (3-slice), 64 (4-slice), 112 (7-slice) + - MIG P2P: Supported (`IsP2pSupported: 1`) + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles + +### B200 Family (Blackwell Architecture, 144 SMs) + +- **B200 SXM5 180GB** (`gpus.B200_SXM5_180GB`) + - Form factor: SXM5 + - Memory: 180GB HBM3e + - PCI Device ID: 0x2B0010DE + - CUDA Capability: 10.0 + - SMs per slice: 18 (1-slice), 36 (2-slice), 54 (3-slice), 72 (4-slice), 126 (7-slice) + - MIG P2P: Supported (`IsP2pSupported: 1`) + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles + +## Available Server Models + +### DGX A100 Family + +- **DGX A100 40GB** (default) + - 8x A100 SXM4 40GB GPUs + - Driver: 550.54.15 + - NVML: 12.550.54.15 + - CUDA: 12040 + +### DGX H100 Family + +- **DGX H100 80GB** (default) + - 8x H100 SXM5 80GB GPUs + - Driver: 550.54.15 + - NVML: 12.550.54.15 + - CUDA: 12040 + +### DGX H200 Family + +- **DGX H200 141GB** (default) + - 8x H200 SXM5 141GB GPUs + - Driver: 550.54.15 + - NVML: 12.550.54.15 + - CUDA: 12040 + +### DGX B200 Family + +- **DGX B200 180GB** (default) + - 8x B200 SXM5 180GB GPUs + - Driver: 560.28.03 + - NVML: 12.560.28.03 + - CUDA: 12060 + +## MIG (Multi-Instance GPU) Support + +All GPU configurations include comprehensive MIG profile definitions: + +- **A100**: No P2P support in MIG (`IsP2pSupported: 0`) + - Memory profiles differ between 40GB and 80GB variants + - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices) + - 108 SMs total with 14 SMs per slice +- **A30**: No P2P support in MIG (`IsP2pSupported: 0`) + - Supports limited MIG slice configurations (1, 2, 4 slices only) + - 56 SMs total with 14 SMs per slice +- **H100**: Full P2P support in MIG (`IsP2pSupported: 1`) + - 80GB HBM3 memory with optimized slice allocations + - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices) + - 132 SMs total with 
16 SMs per slice + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles +- **H200**: Full P2P support in MIG (`IsP2pSupported: 1`) + - 141GB HBM3e memory with enhanced capacity + - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices) + - 132 SMs total with 16 SMs per slice + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles +- **B200**: Full P2P support in MIG (`IsP2pSupported: 1`) + - 180GB HBM3e memory with next-generation capacity + - Supports standard NVIDIA MIG slice configurations (1, 2, 3, 4, 7 slices) + - 144 SMs total with 18 SMs per slice + - Includes REV1 (media extensions) and REV2 (expanded memory) profiles + +### MIG Operations + +```go +// Create server with MIG support +server := dgxa100.New() +device, _ := server.DeviceGetHandleByIndex(0) + +// Enable MIG mode +device.SetMigMode(1) + +// Get available GPU instance profiles +profileInfo, ret := device.GetGpuInstanceProfileInfo(nvml.GPU_INSTANCE_PROFILE_1_SLICE) + +// Create GPU instance +gi, ret := device.CreateGpuInstance(&profileInfo) + +// Create compute instance within GPU instance +ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo( + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED +) +ci, ret := gi.CreateComputeInstance(&ciProfileInfo) +``` + +## Testing + +The framework includes comprehensive tests covering: + +- Server creation and device enumeration +- Device properties and capabilities +- MIG mode operations and lifecycle +- GPU and compute instance management +- Memory and PCI information +- Multi-device scenarios + +```bash +# Run all mock tests +go test ./pkg/nvml/mock/... 
+ +# Run generation specific tests +go test -v ./pkg/nvml/mock/dgxa100/ +go test -v ./pkg/nvml/mock/dgxh100/ +go test -v ./pkg/nvml/mock/dgxh200/ +go test -v ./pkg/nvml/mock/dgxb200/ + +# Run specific test +go test -v ./pkg/nvml/mock/dgxa100/ -run TestMIGProfilesExist +go test -v ./pkg/nvml/mock/dgxh100/ -run TestMIGProfilesExist +``` + +## Extending the Framework + +### Adding GPU Variants + +Add new configurations to the appropriate file in `shared/gpus/`: + +```go +var A100_PCIE_24GB = shared.Config{ + Name: "NVIDIA A100-PCIE-24GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 24576, // 24GB + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20F010DE, + MIGProfiles: a100_24gb_MIGProfiles, +} +``` + +### Adding GPU Generations + +1. **Create new package** (e.g., `dgxb200/`) +2. **Define GPU configurations** in `shared/gpus/b200.go` +3. **Define MIG profiles** with appropriate memory and SM allocations +4. **Implement server and device factory functions** +5. 
**Add comprehensive tests** + +Example structure for B200 generation: + +```go +// In shared/gpus/b200.go +var B200_SXM5_180GB = shared.Config{ + Name: "NVIDIA B200 180GB HBM3e", + Architecture: nvml.DEVICE_ARCH_BLACKWELL, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 184320, // 180GB + CudaMajor: 10, + CudaMinor: 0, + PciDeviceId: 0x2B0010DE, + MIGProfiles: b200_180gb_MIGProfiles, +} + +// In dgxb200/dgxb200.go +func New() *Server { + return shared.NewServerFromConfig(shared.ServerConfig{ + Config: gpus.B200_SXM5_180GB, + GPUCount: 8, + DriverVersion: "560.28.03", + NvmlVersion: "12.560.28.03", + CudaDriverVersion: 12060, + }) +} +``` + +## Backward Compatibility + +The framework maintains full backward compatibility: + +- All existing `dgxa100.New()`, `dgxh100.New()`, `dgxh200.New()`, `dgxb200.New()` calls continue to work unchanged +- Legacy global variables (`MIGProfiles`, `MIGPlacements`) are preserved for all generations +- Device names maintain "Mock" prefix for test compatibility +- All existing tests pass without modification +- All GPU configurations reference `shared/gpus` package for consistency +- Type aliases ensure seamless transition from generation-specific types + +## Performance Considerations + +- Configurations are defined as static variables (no runtime overhead) +- Device creation uses shared factory (fast) +- MIG profiles are shared between devices of the same type +- Mock functions use direct field access (minimal latency) + +## Implementation Notes + +- **Thread Safety**: Device implementations include proper mutex usage +- **Memory Management**: No memory leaks in device/instance lifecycle +- **Error Handling**: Proper NVML return codes for all operations +- **Standards Compliance**: Follows official NVML API patterns and behaviors +- **Separation of Concerns**: GPU configs in `shared/gpus`, server logic in package-specific files diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go 
b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go index af6503702..ba59261a1 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go @@ -17,365 +17,70 @@ package dgxa100 import ( - "fmt" - "sync" - - "github.com/google/uuid" - "github.com/NVIDIA/go-nvml/pkg/nvml" - "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/server" ) -type Server struct { - mock.Interface - mock.ExtendedInterface - Devices [8]nvml.Device - DriverVersion string - NvmlVersion string - CudaDriverVersion int -} -type Device struct { - mock.Device - sync.RWMutex - UUID string - Name string - Brand nvml.BrandType - Architecture nvml.DeviceArchitecture - PciBusID string - Minor int - Index int - CudaComputeCapability CudaComputeCapability - MigMode int - GpuInstances map[*GpuInstance]struct{} - GpuInstanceCounter uint32 - MemoryInfo nvml.Memory -} - -type GpuInstance struct { - mock.GpuInstance - sync.RWMutex - Info nvml.GpuInstanceInfo - ComputeInstances map[*ComputeInstance]struct{} - ComputeInstanceCounter uint32 -} - -type ComputeInstance struct { - mock.ComputeInstance - Info nvml.ComputeInstanceInfo -} - -type CudaComputeCapability struct { - Major int - Minor int -} - -var _ nvml.Interface = (*Server)(nil) -var _ nvml.Device = (*Device)(nil) -var _ nvml.GpuInstance = (*GpuInstance)(nil) -var _ nvml.ComputeInstance = (*ComputeInstance)(nil) +// Server is a type alias for server.Server maintained for backward compatibility. +// +// Deprecated: This type alias is maintained for backward compatibility only. +// The type may be removed in a future version. +type Server = server.Server + +// Device is a type alias for server.Device maintained for backward compatibility. +// +// Deprecated: This type alias is maintained for backward compatibility only. +// The type may be removed in a future version. 
+type Device = server.Device + +// GpuInstance is a type alias for server.GpuInstance maintained for backward compatibility. +// +// Deprecated: This type alias is maintained for backward compatibility only. +// The type may be removed in a future version. +type GpuInstance = server.GpuInstance + +// ComputeInstance is a type alias for server.ComputeInstance maintained for backward compatibility. +// +// Deprecated: This type alias is maintained for backward compatibility only. +// The type may be removed in a future version. +type ComputeInstance = server.ComputeInstance + +// CudaComputeCapability is a type alias for server.CudaComputeCapability maintained for backward compatibility. +// +// Deprecated: This type alias is maintained for backward compatibility only. +// The type may be removed in a future version. +type CudaComputeCapability = server.CudaComputeCapability func New() *Server { - server := &Server{ - Devices: [8]nvml.Device{ - NewDevice(0), - NewDevice(1), - NewDevice(2), - NewDevice(3), - NewDevice(4), - NewDevice(5), - NewDevice(6), - NewDevice(7), - }, - DriverVersion: "550.54.15", - NvmlVersion: "12.550.54.15", - CudaDriverVersion: 12040, - } - server.setMockFuncs() - return server + return NewWithGPUs(gpus.Multiple(8, gpus.A100_SXM4_40GB)...) 
} -func NewDevice(index int) *Device { - device := &Device{ - UUID: "GPU-" + uuid.New().String(), - Name: "Mock NVIDIA A100-SXM4-40GB", - Brand: nvml.BRAND_NVIDIA, - Architecture: nvml.DEVICE_ARCH_AMPERE, - PciBusID: fmt.Sprintf("0000:%02x:00.0", index), - Minor: index, - Index: index, - CudaComputeCapability: CudaComputeCapability{ - Major: 8, - Minor: 0, - }, - GpuInstances: make(map[*GpuInstance]struct{}), - GpuInstanceCounter: 0, - MemoryInfo: nvml.Memory{Total: 42949672960, Free: 0, Used: 0}, - } - device.setMockFuncs() - return device +func NewWithGPUs(gpus ...gpus.Config) *Server { + s, _ := server.New( + server.WithGPUs(gpus...), + server.WithDriverVersion("550.54.15"), + server.WithNVMLVersion("12.550.54.15"), + server.WithCUDADriverVersion(12040), + ) + return s } -func NewGpuInstance(info nvml.GpuInstanceInfo) *GpuInstance { - gi := &GpuInstance{ - Info: info, - ComputeInstances: make(map[*ComputeInstance]struct{}), - ComputeInstanceCounter: 0, - } - gi.setMockFuncs() - return gi -} - -func NewComputeInstance(info nvml.ComputeInstanceInfo) *ComputeInstance { - ci := &ComputeInstance{ - Info: info, - } - ci.setMockFuncs() - return ci -} - -func (s *Server) setMockFuncs() { - s.ExtensionsFunc = func() nvml.ExtendedInterface { - return s - } - - s.LookupSymbolFunc = func(symbol string) error { - return nil - } - - s.InitFunc = func() nvml.Return { - return nvml.SUCCESS - } - - s.ShutdownFunc = func() nvml.Return { - return nvml.SUCCESS +// Legacy globals for backward compatibility - expose the internal data +var ( + MIGProfiles = struct { + GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo + }{ + GpuInstanceProfiles: gpus.A100_SXM4_40GB.MIGProfiles.GpuInstanceProfiles, + ComputeInstanceProfiles: gpus.A100_SXM4_40GB.MIGProfiles.ComputeInstanceProfiles, } - s.SystemGetDriverVersionFunc = func() (string, nvml.Return) { - return s.DriverVersion, nvml.SUCCESS + MIGPlacements = struct 
{ + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement + }{ + GpuInstancePossiblePlacements: gpus.A100_SXM4_40GB.MIGProfiles.GpuInstancePlacements, + ComputeInstancePossiblePlacements: gpus.A100_SXM4_40GB.MIGProfiles.ComputeInstancePlacements, } - - s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) { - return s.NvmlVersion, nvml.SUCCESS - } - - s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) { - return s.CudaDriverVersion, nvml.SUCCESS - } - - s.DeviceGetCountFunc = func() (int, nvml.Return) { - return len(s.Devices), nvml.SUCCESS - } - - s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) { - if index < 0 || index >= len(s.Devices) { - return nil, nvml.ERROR_INVALID_ARGUMENT - } - return s.Devices[index], nvml.SUCCESS - } - - s.DeviceGetHandleByUUIDFunc = func(uuid string) (nvml.Device, nvml.Return) { - for _, d := range s.Devices { - if uuid == d.(*Device).UUID { - return d, nvml.SUCCESS - } - } - return nil, nvml.ERROR_INVALID_ARGUMENT - } - - s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) { - for _, d := range s.Devices { - if busID == d.(*Device).PciBusID { - return d, nvml.SUCCESS - } - } - return nil, nvml.ERROR_INVALID_ARGUMENT - } -} - -func (d *Device) setMockFuncs() { - d.GetMinorNumberFunc = func() (int, nvml.Return) { - return d.Minor, nvml.SUCCESS - } - - d.GetIndexFunc = func() (int, nvml.Return) { - return d.Index, nvml.SUCCESS - } - - d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) { - return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS - } - - d.GetUUIDFunc = func() (string, nvml.Return) { - return d.UUID, nvml.SUCCESS - } - - d.GetNameFunc = func() (string, nvml.Return) { - return d.Name, nvml.SUCCESS - } - - d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) { - return d.Brand, nvml.SUCCESS - } - - d.GetArchitectureFunc = 
func() (nvml.DeviceArchitecture, nvml.Return) { - return d.Architecture, nvml.SUCCESS - } - - d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) { - return d.MemoryInfo, nvml.SUCCESS - } - - d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) { - p := nvml.PciInfo{ - PciDeviceId: 0x20B010DE, - } - return p, nvml.SUCCESS - } - - d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) { - d.MigMode = mode - return nvml.SUCCESS, nvml.SUCCESS - } - - d.GetMigModeFunc = func() (int, int, nvml.Return) { - return d.MigMode, d.MigMode, nvml.SUCCESS - } - - d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) { - if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT { - return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT - } - - if _, exists := MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists { - return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - return MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS - } - - d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) { - return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS - } - - d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) { - d.Lock() - defer d.Unlock() - giInfo := nvml.GpuInstanceInfo{ - Device: d, - Id: d.GpuInstanceCounter, - ProfileId: info.Id, - } - d.GpuInstanceCounter++ - gi := NewGpuInstance(giInfo) - d.GpuInstances[gi] = struct{}{} - return gi, nvml.SUCCESS - } - - d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) { - d.Lock() - defer d.Unlock() - giInfo := nvml.GpuInstanceInfo{ - Device: d, - Id: d.GpuInstanceCounter, - ProfileId: info.Id, - Placement: *placement, - } - d.GpuInstanceCounter++ - gi := NewGpuInstance(giInfo) - d.GpuInstances[gi] = 
struct{}{} - return gi, nvml.SUCCESS - } - - d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) { - d.RLock() - defer d.RUnlock() - var gis []nvml.GpuInstance - for gi := range d.GpuInstances { - if gi.Info.ProfileId == info.Id { - gis = append(gis, gi) - } - } - return gis, nvml.SUCCESS - } -} - -func (gi *GpuInstance) setMockFuncs() { - gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) { - return gi.Info, nvml.SUCCESS - } - - gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) { - if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT - } - - if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - giProfileId := int(gi.Info.ProfileId) - - if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists { - return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED - } - - return MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS - } - - gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) { - return MIGPlacements.ComputeInstancePossiblePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS - } - - gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) { - gi.Lock() - defer gi.Unlock() - ciInfo := nvml.ComputeInstanceInfo{ - Device: gi.Info.Device, - GpuInstance: gi, - Id: gi.ComputeInstanceCounter, - ProfileId: info.Id, - } - gi.ComputeInstanceCounter++ - ci := NewComputeInstance(ciInfo) - gi.ComputeInstances[ci] 
= struct{}{} - return ci, nvml.SUCCESS - } - - gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) { - gi.RLock() - defer gi.RUnlock() - var cis []nvml.ComputeInstance - for ci := range gi.ComputeInstances { - if ci.Info.ProfileId == info.Id { - cis = append(cis, ci) - } - } - return cis, nvml.SUCCESS - } - - gi.DestroyFunc = func() nvml.Return { - d := gi.Info.Device.(*Device) - d.Lock() - defer d.Unlock() - delete(d.GpuInstances, gi) - return nvml.SUCCESS - } -} - -func (ci *ComputeInstance) setMockFuncs() { - ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) { - return ci.Info, nvml.SUCCESS - } - - ci.DestroyFunc = func() nvml.Return { - gi := ci.Info.GpuInstance.(*GpuInstance) - gi.Lock() - defer gi.Unlock() - delete(gi.ComputeInstances, ci) - return nvml.SUCCESS - } -} +) diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go deleted file mode 100644 index c4df4c833..000000000 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go +++ /dev/null @@ -1,471 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package dgxa100 - -import ( - "github.com/NVIDIA/go-nvml/pkg/nvml" -) - -// MIGProfiles holds the profile information for GIs and CIs in this mock server. 
-// We should consider auto-generating this object in the future. -var MIGProfiles = struct { - GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo - ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo -}{ - GpuInstanceProfiles: map[int]nvml.GpuInstanceProfileInfo{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 7, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 0, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 4864, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 1, - OfaCount: 1, - MemorySizeMB: 4864, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, - IsP2pSupported: 0, - SliceCount: 1, - InstanceCount: 4, - MultiprocessorCount: 14, - CopyEngineCount: 1, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 9856, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, - IsP2pSupported: 0, - SliceCount: 2, - InstanceCount: 3, - MultiprocessorCount: 28, - CopyEngineCount: 2, - DecoderCount: 1, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 9856, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, - IsP2pSupported: 0, - SliceCount: 3, - InstanceCount: 2, - MultiprocessorCount: 42, - CopyEngineCount: 3, - DecoderCount: 2, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 19968, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, - IsP2pSupported: 0, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - CopyEngineCount: 4, - DecoderCount: 2, - EncoderCount: 0, - JpegCount: 0, - OfaCount: 0, - MemorySizeMB: 
19968, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, - IsP2pSupported: 0, - SliceCount: 7, - InstanceCount: 1, - MultiprocessorCount: 98, - CopyEngineCount: 7, - DecoderCount: 5, - EncoderCount: 0, - JpegCount: 1, - OfaCount: 1, - MemorySizeMB: 40192, - }, - }, - ComputeInstanceProfiles: map[int]map[int]nvml.ComputeInstanceProfileInfo{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 0, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 1, - MultiprocessorCount: 14, - SharedCopyEngineCount: 1, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 2, - MultiprocessorCount: 14, - SharedCopyEngineCount: 2, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 1, - MultiprocessorCount: 28, - SharedCopyEngineCount: 2, - SharedDecoderCount: 1, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - 
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 3, - MultiprocessorCount: 14, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 1, - MultiprocessorCount: 28, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, - SliceCount: 3, - InstanceCount: 1, - MultiprocessorCount: 42, - SharedCopyEngineCount: 3, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 4, - MultiprocessorCount: 14, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 2, - MultiprocessorCount: 28, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - SharedCopyEngineCount: 4, - SharedDecoderCount: 2, - SharedEncoderCount: 0, - SharedJpegCount: 0, - SharedOfaCount: 0, - }, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, - SliceCount: 1, - InstanceCount: 7, - MultiprocessorCount: 14, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - 
SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, - SliceCount: 2, - InstanceCount: 3, - MultiprocessorCount: 28, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, - SliceCount: 3, - InstanceCount: 2, - MultiprocessorCount: 42, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, - SliceCount: 4, - InstanceCount: 1, - MultiprocessorCount: 56, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { - Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, - SliceCount: 7, - InstanceCount: 1, - MultiprocessorCount: 98, - SharedCopyEngineCount: 7, - SharedDecoderCount: 5, - SharedEncoderCount: 0, - SharedJpegCount: 1, - SharedOfaCount: 1, - }, - }, - }, -} - -// MIGPlacements holds the placement information for GIs and CIs in this mock server. -// We should consider auto-generating this object in the future. 
-var MIGPlacements = struct { - GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement - ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement -}{ - GpuInstancePossiblePlacements: map[int][]nvml.GpuInstancePlacement{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - { - Start: 0, - Size: 1, - }, - { - Start: 1, - Size: 1, - }, - { - Start: 2, - Size: 1, - }, - { - Start: 3, - Size: 1, - }, - { - Start: 4, - Size: 1, - }, - { - Start: 5, - Size: 1, - }, - { - Start: 6, - Size: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - { - Start: 0, - Size: 1, - }, - { - Start: 1, - Size: 1, - }, - { - Start: 2, - Size: 1, - }, - { - Start: 3, - Size: 1, - }, - { - Start: 4, - Size: 1, - }, - { - Start: 5, - Size: 1, - }, - { - Start: 6, - Size: 1, - }, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - { - Start: 0, - Size: 2, - }, - { - Start: 2, - Size: 2, - }, - { - Start: 4, - Size: 2, - }, - { - Start: 6, - Size: 2, - }, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - { - Start: 0, - Size: 2, - }, - { - Start: 2, - Size: 2, - }, - { - Start: 4, - Size: 2, - }, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - { - Start: 0, - Size: 4, - }, - { - Start: 4, - Size: 4, - }, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - { - Start: 0, - Size: 4, - }, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - { - Start: 0, - Size: 8, - }, - }, - }, - // TODO: Fill out ComputeInstancePossiblePlacements - ComputeInstancePossiblePlacements: map[int]map[int][]nvml.ComputeInstancePlacement{ - nvml.GPU_INSTANCE_PROFILE_1_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_2_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_3_SLICE: { - 
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_4_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, - }, - nvml.GPU_INSTANCE_PROFILE_7_SLICE: { - nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, - nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {}, - }, - }, -} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/a100.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/a100.go new file mode 100644 index 000000000..93ef989f9 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/a100.go @@ -0,0 +1,455 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// A100 GPU Variants with different memory profiles and PCI device IDs +var ( + A100_PCIE_40GB = Config{ + Name: "NVIDIA A100-PCIE-40GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 40960, + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20F110DE, + MIGProfiles: a100_40gb_MIGProfiles, + } + A100_PCIE_80GB = Config{ + Name: "NVIDIA A100-PCIE-80GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 81920, + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20B510DE, + MIGProfiles: a100_80gb_MIGProfiles, + } + A100_SXM4_40GB = Config{ + Name: "Mock NVIDIA A100-SXM4-40GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 40960, + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20B010DE, + MIGProfiles: a100_40gb_MIGProfiles, + } + A100_SXM4_80GB = Config{ + Name: "NVIDIA A100-SXM4-80GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 81920, + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20B210DE, + MIGProfiles: a100_80gb_MIGProfiles, + } +) + +var ( + a100_40gb_MIGProfiles = MIGProfileConfig{ + GpuInstanceProfiles: a100_40gb_GpuInstanceProfiles, + ComputeInstanceProfiles: a100_ComputeInstanceProfiles, + GpuInstancePlacements: a100_GpuInstancePlacements, + ComputeInstancePlacements: a100_ComputeInstancePlacements, + } + a100_80gb_MIGProfiles = MIGProfileConfig{ + GpuInstanceProfiles: a100_80gb_GpuInstanceProfiles, + ComputeInstanceProfiles: a100_ComputeInstanceProfiles, + GpuInstancePlacements: a100_GpuInstancePlacements, + ComputeInstancePlacements: a100_ComputeInstancePlacements, + } +) + +var ( + a100_40gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 14, + CopyEngineCount: 1, + 
DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 4864, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 4864, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 0, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 28, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 0, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 42, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 0, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 0, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 96, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 40192, + }, + } + a100_80gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 0, + 
SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 0, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 28, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 0, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 42, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 40192, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 0, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 40192, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 0, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 98, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 80384, + }, + } +) + +var a100_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + 
nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 48, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: 
nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + }, + }, +} + +var a100_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 8}, // Test expects Size 8 + }, +} + +var a100_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{ + 0: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + 1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + 2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + }, + }, + 3: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, + 4: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + 
{Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 8}, // Test expects Size 8 + }, + }, +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/a30.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/a30.go new file mode 100644 index 000000000..3cae016d4 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/a30.go @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// A30 GPU Variants with different memory profiles and PCI device IDs +var ( + A30_PCIE_24GB = Config{ + Name: "NVIDIA A30-PCIE-24GB", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 24576, + CudaMajor: 8, + CudaMinor: 0, + PciDeviceId: 0x20B710DE, + MIGProfiles: a30_24gb_MIGProfiles, + } +) + +var a30_24gb_MIGProfiles = MIGProfileConfig{ + GpuInstanceProfiles: a30_24gb_GpuInstanceProfiles, + ComputeInstanceProfiles: a30_ComputeInstanceProfiles, + GpuInstancePlacements: a30_GpuInstancePlacements, + ComputeInstancePlacements: a30_ComputeInstancePlacements, +} + +var a30_24gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 0, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 5836, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 5836, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 0, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 28, + CopyEngineCount: 2, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 11672, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1, + IsP2pSupported: 0, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + CopyEngineCount: 2, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 11672, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + 
IsP2pSupported: 0, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + CopyEngineCount: 4, + DecoderCount: 4, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 23344, + }, +} + +var a30_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 14, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 14, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 28, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + }, + }, +} + +var a30_GpuInstancePlacements = 
map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, +} + +var a30_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/b200.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/b200.go new file mode 100644 index 000000000..ddac85419 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/b200.go @@ -0,0 +1,360 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// B200 GPU Variants +var ( + B200_SXM5_180GB = Config{ + Name: "NVIDIA B200 180GB HBM3e", + Architecture: nvml.DEVICE_ARCH_BLACKWELL, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 184320, // 180GB + CudaMajor: 10, + CudaMinor: 0, + PciDeviceId: 0x2B0010DE, + MIGProfiles: b200_180gb_MIGProfiles, + } +) + +var ( + b200_180gb_MIGProfiles = MIGProfileConfig{ + GpuInstanceProfiles: b200_180gb_GpuInstanceProfiles, + ComputeInstanceProfiles: b200_ComputeInstanceProfiles, + GpuInstancePlacements: b200_GpuInstancePlacements, + ComputeInstancePlacements: b200_ComputeInstancePlacements, + } +) + +var ( + b200_180gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 18, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 23552, // 23GB (MIG 1g.23gb) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 1, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 23552, // 23GB (MIG 1g.23gb+me) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { 
+ Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 18, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 46080, // 45GB (MIG 1g.45gb) + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 1, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 36, + CopyEngineCount: 2, + DecoderCount: 2, + EncoderCount: 1, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 46080, // 45GB (MIG 2g.45gb) + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 1, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 54, + CopyEngineCount: 3, + DecoderCount: 3, + EncoderCount: 2, + JpegCount: 2, + OfaCount: 2, + MemorySizeMB: 92160, // 90GB (MIG 3g.90gb) + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 1, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 72, + CopyEngineCount: 4, + DecoderCount: 4, + EncoderCount: 2, + JpegCount: 2, + OfaCount: 2, + MemorySizeMB: 92160, // 90GB (MIG 4g.90gb) + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 1, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 126, + CopyEngineCount: 7, + DecoderCount: 7, + EncoderCount: 4, + JpegCount: 4, + OfaCount: 4, + MemorySizeMB: 184320, // 180GB (MIG 7g.180gb) + }, + } +) + +var b200_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + }, + 
}, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 18, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 36, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 36, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 54, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 36, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 72, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 18, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 36, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: 
nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 54, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 126, + }, + }, +} + +var b200_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, +} + +var b200_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{ + 0: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + 1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + 2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + }, + }, + 3: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, 
Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, + 4: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, + }, +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/gpu.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/gpu.go new file mode 100644 index 000000000..049c470ea --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/gpu.go @@ -0,0 +1,33 @@ +package gpus + +import "github.com/NVIDIA/go-nvml/pkg/nvml" + +func Multiple(count int, gpu Config) []Config { + gpus := make([]Config, count) + for i := range gpus { + gpus[i] = gpu + } + return gpus +} + +// Config contains the minimal configuration needed for a GPU generation +type Config struct { + Name string + Architecture nvml.DeviceArchitecture + Brand nvml.BrandType + MemoryMB uint64 + CudaMajor int + CudaMinor int + //Deprecated: Use PciInfo directly + PciDeviceId uint32 + PciInfo *nvml.PciInfo + MIGProfiles MIGProfileConfig +} + +// MIGProfileConfig contains MIG profile configuration for a GPU +type MIGProfileConfig struct { + GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo + GpuInstancePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePlacements map[int]map[int][]nvml.ComputeInstancePlacement +} diff --git 
a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/h100.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/h100.go new file mode 100644 index 000000000..c9c9a0253 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/h100.go @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// H100 GPU Variants +var ( + H100_SXM5_80GB = Config{ + Name: "NVIDIA H100 80GB HBM3", + Architecture: nvml.DEVICE_ARCH_HOPPER, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 81920, // 80GB + CudaMajor: 9, + CudaMinor: 0, + PciDeviceId: 0x233010DE, + MIGProfiles: h100_80gb_MIGProfiles, + } +) + +var ( + h100_80gb_MIGProfiles = MIGProfileConfig{ + GpuInstanceProfiles: h100_80gb_GpuInstanceProfiles, + ComputeInstanceProfiles: h100_ComputeInstanceProfiles, + GpuInstancePlacements: h100_GpuInstancePlacements, + ComputeInstancePlacements: h100_ComputeInstancePlacements, + } +) + +var ( + h100_80gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 10240, // 10GB (MIG 1g.10gb) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: 
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 10240, // 10GB (MIG 1g.10gb+me) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 20480, // 20GB (MIG 1g.20gb) + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 1, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 20480, // 20GB (MIG 2g.20gb) + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 1, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 40960, // 40GB (MIG 3g.40gb) + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 1, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 40960, // 40GB (MIG 4g.40gb) + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 1, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 81920, // 80GB (MIG 7g.80gb) + }, + } +) + +var h100_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: 
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 48, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + 
MultiprocessorCount: 48, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + }, + }, +} + +var h100_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, +} + +var h100_ComputeInstancePlacements = map[int]map[int][]nvml.ComputeInstancePlacement{ + 0: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + 1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + 2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + }, + }, + 3: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, + 4: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + 
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, + }, +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/h200.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/h200.go new file mode 100644 index 000000000..86e3b0014 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/h200.go @@ -0,0 +1,326 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package gpus + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// H200 GPU Variants +var ( + H200_SXM5_141GB = Config{ + Name: "NVIDIA H200 141GB HBM3e", + Architecture: nvml.DEVICE_ARCH_HOPPER, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 144384, // 141GB + CudaMajor: 9, + CudaMinor: 0, + PciDeviceId: 0x233310DE, + MIGProfiles: h200_141gb_MIGProfiles, + } +) + +var ( + h200_141gb_MIGProfiles = MIGProfileConfig{ + GpuInstanceProfiles: h200_141gb_GpuInstanceProfiles, + ComputeInstanceProfiles: h200_ComputeInstanceProfiles, + GpuInstancePlacements: h200_GpuInstancePlacements, + ComputeInstancePlacements: h200_ComputeInstancePlacements, + } +) + +var ( + h200_141gb_GpuInstanceProfiles = map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 18432, // 18GB (MIG 1g.18gb) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 18432, // 18GB (MIG 1g.18gb+me) + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 1, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 35840, // 35GB (MIG 1g.35gb) + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 1, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 35840, // 35GB (MIG 2g.35gb) + }, + 
nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 1, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 72704, // 71GB (MIG 3g.71gb) + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 1, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 72704, // 71GB (MIG 4g.71gb) + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 1, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 144384, // 141GB (MIG 7g.141gb) + }, + } +) + +var h200_ComputeInstanceProfiles = map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 16, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: 
nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 48, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 64, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 16, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 32, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 48, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 112, + }, + }, +} + +var h200_GpuInstancePlacements = map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, +} + +var h200_ComputeInstancePlacements = 
map[int]map[int][]nvml.ComputeInstancePlacement{ + 0: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + }, + }, + 1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + }, + 2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + }, + }, + 3: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + {Start: 0, Size: 4}, + }, + }, + 4: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + {Start: 0, Size: 1}, + {Start: 1, Size: 1}, + {Start: 2, Size: 1}, + {Start: 3, Size: 1}, + {Start: 4, Size: 1}, + {Start: 5, Size: 1}, + {Start: 6, Size: 1}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + {Start: 0, Size: 2}, + {Start: 2, Size: 2}, + {Start: 4, Size: 2}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + {Start: 0, Size: 3}, + {Start: 4, Size: 3}, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + {Start: 0, Size: 7}, + }, + }, +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/tegra.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/tegra.go new file mode 100644 index 000000000..26a8ac3bc --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus/tegra.go @@ -0,0 +1,19 @@ +package gpus + +import "github.com/NVIDIA/go-nvml/pkg/nvml" + +var ( + THOR_IGX = Config{ + Name: "NVIDIA Thor", + Architecture: nvml.DEVICE_ARCH_AMPERE, + Brand: nvml.BRAND_NVIDIA, + MemoryMB: 131882934272 / 1024 / 1024, + CudaMajor: 11, + CudaMinor: 0, + PciInfo: &nvml.PciInfo{ + Domain: 0, + Bus: 1, + 
Device: 0, + }, + } +) diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/server/options.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/server/options.go new file mode 100644 index 000000000..5a303293f --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/server/options.go @@ -0,0 +1,32 @@ +package server + +import "github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus" + +func WithGPUs(gpus ...gpus.Config) Option { + return func(o *options) error { + o.gpus = gpus + return nil + } +} + +func WithDriverVersion(version string) Option { + return func(o *options) error { + o.DriverVersion = version + return nil + } +} + +func WithNVMLVersion(version string) Option { + return func(o *options) error { + o.NvmlVersion = version + return nil + } +} + +// TODO: Add a string implementation using generics +func WithCUDADriverVersion(version int) Option { + return func(o *options) error { + o.CudaDriverVersion = version + return nil + } +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/server/shared.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/server/shared.go new file mode 100644 index 000000000..71fc51155 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/server/shared.go @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package server + +import ( + "fmt" + "sync" + + "github.com/google/uuid" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus" +) + +// Compile-time interface checks +var _ nvml.Interface = (*Server)(nil) +var _ nvml.ExtendedInterface = (*Server)(nil) + +type Option func(*options) error + +func New(opts ...Option) (*Server, error) { + o := &options{} + for _, opt := range opts { + if err := opt(o); err != nil { + return nil, err + } + } + // TODO: Check defaults and validity + return o.build(), nil +} + +// NewServerFromConfig creates a new server from the provided configuration +func (o *options) build() *Server { + devices := make([]nvml.Device, len(o.gpus)) + for i, gpu := range o.gpus { + devices[i] = NewDeviceFromConfig(gpu, i) + } + + server := &Server{ + Devices: devices, + DriverVersion: o.DriverVersion, + NvmlVersion: o.NvmlVersion, + CudaDriverVersion: o.CudaDriverVersion, + } + server.SetMockFuncs() + return server +} + +// GBtoMB is a conversion constant from GB to MB (1 GB = 1024 MB) +const GBtoMB = 1024 + +// options contains the minimal configuration needed for a server +type options struct { + gpus []gpus.Config + DriverVersion string + NvmlVersion string + CudaDriverVersion int +} + +// Server provides a reusable server implementation +type Server struct { + mock.Interface + mock.ExtendedInterface + Devices []nvml.Device + DriverVersion string + NvmlVersion string + CudaDriverVersion int +} + +// Device provides a reusable device implementation +type Device struct { + mock.Device + sync.RWMutex + Config gpus.Config // Embedded configuration + UUID string + PciBusID string + Minor int + Index int + MigMode int + GpuInstances map[*GpuInstance]struct{} + GpuInstanceCounter uint32 + MemoryInfo nvml.Memory +} + +// GpuInstance provides a reusable GPU instance implementation +type GpuInstance struct { + mock.GpuInstance + sync.RWMutex + Info nvml.GpuInstanceInfo + 
ComputeInstances map[*ComputeInstance]struct{} + ComputeInstanceCounter uint32 + MIGProfiles gpus.MIGProfileConfig +} + +// ComputeInstance provides a reusable compute instance implementation +type ComputeInstance struct { + mock.ComputeInstance + Info nvml.ComputeInstanceInfo +} + +// CudaComputeCapability represents CUDA compute capability +type CudaComputeCapability struct { + Major int + Minor int +} + +var _ nvml.Interface = (*Server)(nil) +var _ nvml.Device = (*Device)(nil) +var _ nvml.GpuInstance = (*GpuInstance)(nil) +var _ nvml.ComputeInstance = (*ComputeInstance)(nil) + +// NewServerWithGPUs creates a new server with heterogeneous GPU configurations +func NewServerWithGPUs(driverVersion, nvmlVersion string, cudaDriverVersion int, gpuConfigs ...gpus.Config) *Server { + devices := make([]nvml.Device, len(gpuConfigs)) + for i, config := range gpuConfigs { + devices[i] = NewDeviceFromConfig(config, i) + } + + server := &Server{ + Devices: devices, + DriverVersion: driverVersion, + NvmlVersion: nvmlVersion, + CudaDriverVersion: cudaDriverVersion, + } + server.SetMockFuncs() + return server +} + +// NewDeviceFromConfig creates a new device from the provided GPU configuration +func NewDeviceFromConfig(config gpus.Config, index int) *Device { + device := &Device{ + Config: config, + UUID: "GPU-" + uuid.New().String(), + PciBusID: fmt.Sprintf("0000:%02x:00.0", index), + Minor: index, + Index: index, + GpuInstances: make(map[*GpuInstance]struct{}), + GpuInstanceCounter: 0, + MemoryInfo: nvml.Memory{Total: config.MemoryMB * 1024 * 1024, Free: 0, Used: 0}, + } + device.SetMockFuncs() + return device +} + +// NewGpuInstanceFromInfo creates a new GPU instance +func NewGpuInstanceFromInfo(info nvml.GpuInstanceInfo, profiles gpus.MIGProfileConfig) *GpuInstance { + gi := &GpuInstance{ + Info: info, + ComputeInstances: make(map[*ComputeInstance]struct{}), + ComputeInstanceCounter: 0, + MIGProfiles: profiles, + } + gi.SetMockFuncs() + return gi +} + +// 
NewComputeInstanceFromInfo creates a new compute instance +func NewComputeInstanceFromInfo(info nvml.ComputeInstanceInfo) *ComputeInstance { + ci := &ComputeInstance{ + Info: info, + } + ci.SetMockFuncs() + return ci +} + +// SetMockFuncs configures all the mock function implementations for the server +func (s *Server) SetMockFuncs() { + s.ExtensionsFunc = func() nvml.ExtendedInterface { + return s + } + + s.LookupSymbolFunc = func(symbol string) error { + return nil + } + + s.InitFunc = func() nvml.Return { + return nvml.SUCCESS + } + + s.ShutdownFunc = func() nvml.Return { + return nvml.SUCCESS + } + + s.SystemGetDriverVersionFunc = func() (string, nvml.Return) { + return s.DriverVersion, nvml.SUCCESS + } + + s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) { + return s.NvmlVersion, nvml.SUCCESS + } + + s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) { + return s.CudaDriverVersion, nvml.SUCCESS + } + + s.DeviceGetCountFunc = func() (int, nvml.Return) { + return len(s.Devices), nvml.SUCCESS + } + + s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) { + if index < 0 || index >= len(s.Devices) { + return nil, nvml.ERROR_INVALID_ARGUMENT + } + return s.Devices[index], nvml.SUCCESS + } + + s.DeviceGetHandleByUUIDFunc = func(uuid string) (nvml.Device, nvml.Return) { + for _, d := range s.Devices { + if uuid == d.(*Device).UUID { + return d, nvml.SUCCESS + } + } + return nil, nvml.ERROR_INVALID_ARGUMENT + } + + s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) { + for _, d := range s.Devices { + if busID == d.(*Device).PciBusID { + return d, nvml.SUCCESS + } + } + return nil, nvml.ERROR_INVALID_ARGUMENT + } +} + +// SetMockFuncs configures all the mock function implementations for the device +func (d *Device) SetMockFuncs() { + d.GetMinorNumberFunc = func() (int, nvml.Return) { + return d.Minor, nvml.SUCCESS + } + + d.GetIndexFunc = func() (int, nvml.Return) { + return d.Index, nvml.SUCCESS + } + 
+ d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) { + return d.Config.CudaMajor, d.Config.CudaMinor, nvml.SUCCESS + } + + d.GetUUIDFunc = func() (string, nvml.Return) { + return d.UUID, nvml.SUCCESS + } + + d.GetNameFunc = func() (string, nvml.Return) { + return d.Config.Name, nvml.SUCCESS + } + + d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) { + return d.Config.Brand, nvml.SUCCESS + } + + d.GetArchitectureFunc = func() (nvml.DeviceArchitecture, nvml.Return) { + return d.Config.Architecture, nvml.SUCCESS + } + + d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) { + return d.MemoryInfo, nvml.SUCCESS + } + + d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) { + if d.Config.PciInfo != nil { + return *d.Config.PciInfo, nvml.SUCCESS + } + //nolint:staticcheck + id := d.Config.PciDeviceId + if id == 0 { + return nvml.PciInfo{}, nvml.ERROR_NOT_SUPPORTED + } + p := nvml.PciInfo{ + PciDeviceId: id, + } + return p, nvml.SUCCESS + } + + d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) { + d.MigMode = mode + return nvml.SUCCESS, nvml.SUCCESS + } + + d.GetMigModeFunc = func() (int, int, nvml.Return) { + return d.MigMode, d.MigMode, nvml.SUCCESS + } + + d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) { + if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT { + return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT + } + + if _, exists := d.Config.MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists { + return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + return d.Config.MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS + } + + d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) { + return d.Config.MIGProfiles.GpuInstancePlacements[int(info.Id)], nvml.SUCCESS + } + + d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, 
nvml.Return) { + d.Lock() + defer d.Unlock() + giInfo := nvml.GpuInstanceInfo{ + Device: d, + Id: d.GpuInstanceCounter, + ProfileId: info.Id, + } + d.GpuInstanceCounter++ + gi := NewGpuInstanceFromInfo(giInfo, d.Config.MIGProfiles) + d.GpuInstances[gi] = struct{}{} + return gi, nvml.SUCCESS + } + + d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) { + d.Lock() + defer d.Unlock() + giInfo := nvml.GpuInstanceInfo{ + Device: d, + Id: d.GpuInstanceCounter, + ProfileId: info.Id, + Placement: *placement, + } + d.GpuInstanceCounter++ + gi := NewGpuInstanceFromInfo(giInfo, d.Config.MIGProfiles) + d.GpuInstances[gi] = struct{}{} + return gi, nvml.SUCCESS + } + + d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) { + d.RLock() + defer d.RUnlock() + var gis []nvml.GpuInstance + for gi := range d.GpuInstances { + if gi.Info.ProfileId == info.Id { + gis = append(gis, gi) + } + } + return gis, nvml.SUCCESS + } +} + +// SetMockFuncs configures all the mock function implementations for the GPU instance +func (gi *GpuInstance) SetMockFuncs() { + gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) { + return gi.Info, nvml.SUCCESS + } + + gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) { + if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT + } + + if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + giProfileId := int(gi.Info.ProfileId) + + if _, exists := gi.MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + if _, exists := 
gi.MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + return gi.MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS + } + + gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) { + return gi.MIGProfiles.ComputeInstancePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS + } + + gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) { + gi.Lock() + defer gi.Unlock() + ciInfo := nvml.ComputeInstanceInfo{ + Device: gi.Info.Device, + GpuInstance: gi, + Id: gi.ComputeInstanceCounter, + ProfileId: info.Id, + } + gi.ComputeInstanceCounter++ + ci := NewComputeInstanceFromInfo(ciInfo) + gi.ComputeInstances[ci] = struct{}{} + return ci, nvml.SUCCESS + } + + gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) { + gi.RLock() + defer gi.RUnlock() + var cis []nvml.ComputeInstance + for ci := range gi.ComputeInstances { + if ci.Info.ProfileId == info.Id { + cis = append(cis, ci) + } + } + return cis, nvml.SUCCESS + } + + gi.DestroyFunc = func() nvml.Return { + d := gi.Info.Device.(*Device) + d.Lock() + defer d.Unlock() + delete(d.GpuInstances, gi) + return nvml.SUCCESS + } +} + +// SetMockFuncs configures all the mock function implementations for the compute instance +func (ci *ComputeInstance) SetMockFuncs() { + ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) { + return ci.Info, nvml.SUCCESS + } + + ci.DestroyFunc = func() nvml.Return { + gi := ci.Info.GpuInstance.(*GpuInstance) + gi.Lock() + defer gi.Unlock() + delete(gi.ComputeInstances, ci) + return nvml.SUCCESS + } +} diff --git a/vendor/modules.txt b/vendor/modules.txt index d8e20797d..70f791f4c 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -12,12 +12,14 @@ 
github.com/NVIDIA/go-nvlib/pkg/nvpci github.com/NVIDIA/go-nvlib/pkg/nvpci/bytes github.com/NVIDIA/go-nvlib/pkg/nvpci/mmio github.com/NVIDIA/go-nvlib/pkg/pciids -# github.com/NVIDIA/go-nvml v0.13.0-1 +# github.com/NVIDIA/go-nvml v0.13.0-1 => ../go-nvml ## explicit; go 1.20 github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml github.com/NVIDIA/go-nvml/pkg/nvml/mock github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100 +github.com/NVIDIA/go-nvml/pkg/nvml/mock/gpus +github.com/NVIDIA/go-nvml/pkg/nvml/mock/server # github.com/containerd/log v0.1.0 ## explicit; go 1.20 github.com/containerd/log @@ -243,3 +245,4 @@ tags.cncf.io/container-device-interface/pkg/parser # tags.cncf.io/container-device-interface/specs-go v1.1.0 ## explicit; go 1.19 tags.cncf.io/container-device-interface/specs-go +# github.com/NVIDIA/go-nvml => ../go-nvml