Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions internal/platform-support/tegra/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ type matcherAsFilter struct {
}

// filterByMountSpecType maps a mount spec type to a filter.
type filterByMountSpecType map[csv.MountSpecType]filter

// filterByMountSpecPathsByTyper wraps a set of mount spec paths by type so
// that it can be applied as a filter to another set (see its Apply method).
type filterByMountSpecPathsByTyper struct {
MountSpecPathsByTyper
}

// pathPatterns is a list of path patterns. The Apply method of
// filterByMountSpecPathsByTyper wraps it in a matcherAsFilter.
type pathPatterns []string

// pathPattern is a single pattern string.
// NOTE(review): presumably one element of pathPatterns; its matching rules
// are not visible here — confirm.
type pathPattern string
Expand Down Expand Up @@ -125,6 +128,14 @@ func (p filterByMountSpecType) Apply(input MountSpecPathsByTyper) MountSpecPaths
return ms
}

// Apply filters input using the wrapped mount spec paths. For every mount spec
// type in the wrapped set, the paths of that type are converted to
// pathPatterns and used as the matcher of the filter for that type. The
// filtering itself is delegated to filterByMountSpecType.Apply.
func (p filterByMountSpecPathsByTyper) Apply(input MountSpecPathsByTyper) MountSpecPathsByTyper {
	filters := make(filterByMountSpecType)
	for mountSpecType, patterns := range p.MountSpecPathsByType() {
		filters[mountSpecType] = &matcherAsFilter{pathPatterns(patterns)}
	}
	return filters.Apply(input)
}

// apply uses a matcher to filter an input string.
// Each element in the input that matches is skipped and the remaining elements
// are returned.
Expand Down
20 changes: 18 additions & 2 deletions internal/platform-support/tegra/mount_specs.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,25 @@ type Transformer interface {

// Transform applies the specified transforms to a set of mount specs by type.
// The result is itself a set of mount specs by type.
func Transform(input MountSpecPathsByTyper, t Transformer) MountSpecPathsByTyper {
func Transform(input MountSpecPathsByTyper, t ...Transformer) MountSpecPathsByTyper {
return transformMountSpecByPathsByType{
Transformer: t,
Transformer: allTransformers(t),
input: input,
}
}

// allTransformers is an ordered list of transformers that is itself usable as
// a single Transformer by chaining its elements.
type allTransformers []Transformer

// Apply runs the transformers in order, feeding the output of each one into
// the next. Nil entries are skipped; if there is nothing to apply, the input
// is returned unchanged.
func (ts allTransformers) Apply(input MountSpecPathsByTyper) MountSpecPathsByTyper {
	result := input
	for i := range ts {
		if transformer := ts[i]; transformer != nil {
			result = transformer.Apply(result)
		}
	}
	return result
}

type transformMountSpecByPathsByType struct {
Transformer
input MountSpecPathsByTyper
Expand Down Expand Up @@ -130,6 +142,10 @@ func WithoutDeviceNodes() Transformer {
}
}

// Without creates a transformer which removes the mount spec paths contained
// in m from the source.
func Without(m MountSpecPathsByTyper) Transformer {
	return filterByMountSpecPathsByTyper{MountSpecPathsByTyper: m}
}

// WithoutRegularDeviceNodes creates a transformer which removes
// regular `/dev/nvidia[0-9]+` device nodes from the source.
func WithoutRegularDeviceNodes() Transformer {
Expand Down
137 changes: 94 additions & 43 deletions pkg/nvcdi/lib-csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,6 @@ func (l *csvlib) purecsvDeviceSpecGenerators(ids ...string) (DeviceSpecGenerator
csvlib: l,
index: 0,
uuid: "",
// We set noFilterDeviceNodes to true to ensure that the /dev/nvidia[0-1]
// device nodes in the CSV files on the system are consumed as-is.
noFilterDeviceNodes: true,
}
return g, nil
}
Expand All @@ -108,11 +105,18 @@ func (l *csvlib) mixedDeviceSpecGenerators(ids ...string) (DeviceSpecGenerator,
// platform-specific CSV files.
type csvDeviceGenerator struct {
*csvlib
index int
uuid string
noFilterDeviceNodes bool
index int
uuid string
mode csvGeneratorMode
}

// csvGeneratorMode selects how a csvDeviceGenerator derives its device node
// mount specs from the CSV files.
type csvGeneratorMode string

const (
	// iGPUGeneratorMode is the mode used for an integrated GPU.
	iGPUGeneratorMode csvGeneratorMode = "igpu"
	// dGPUGeneratorMode is the mode used for a GPU that is not integrated.
	dGPUGeneratorMode csvGeneratorMode = "dgpu"
)

// GetUUID returns the UUID stored on the generator. The returned error is
// always nil.
func (l *csvDeviceGenerator) GetUUID() (string, error) {
	return l.uuid, nil
}
Expand Down Expand Up @@ -154,31 +158,57 @@ func (l *csvDeviceGenerator) GetDeviceSpecs() ([]specs.Device, error) {
// - The device node (i.e. /dev/nvidia{{ .index }}) associated with this
// particular device is added to the set of device nodes to be discovered.
func (l *csvDeviceGenerator) deviceNodeDiscoverer() (discover.Discover, error) {
// The tegra discoverer is configured from the generator's own settings.
return tegra.New(
tegra.WithLogger(l.logger),
tegra.WithDriverRoot(l.driverRoot),
tegra.WithDevRoot(l.devRoot),
tegra.WithHookCreator(l.hookCreator),
tegra.WithLdconfigPath(l.ldconfigPath),
tegra.WithLibrarySearchPaths(l.librarySearchPaths...),
// The mount specs to discover are computed by deviceNodeMountSpecs.
tegra.WithMountSpecs(l.deviceNodeMountSpecs()),
)
}

func (l *csvDeviceGenerator) deviceNodeMountSpecs() tegra.MountSpecPathsByTyper {
mountSpecs := tegra.Transform(
tegra.MountSpecsFromCSVFiles(l.logger, l.csvFiles...),
// We remove non-device nodes.
tegra.OnlyDeviceNodes(),
)
if !l.noFilterDeviceNodes {
mountSpecs = tegra.Transform(
switch l.mode {
case dGPUGeneratorMode:
return tegra.Transform(
mountSpecs,
// We remove the regular (nvidia[0-9]+) device nodes.
// For a dGPU we remove all regular device nodes (nvidia[0-9]+)
// from the list of device nodes taken from the CSV mount specs.
// The device nodes for the GPU are discovered for the full GPU.
tegra.WithoutRegularDeviceNodes(),
// We also ignore control device nodes since these are included in
// the full GPU spec generator.
tegra.Without(
tegra.DeviceNodes(
"/dev/nvidia-modeset",
"/dev/nvidia-uvm-tools",
"/dev/nvidia-uvm",
"/dev/nvidiactl",
),
),
)
case iGPUGeneratorMode:
return tegra.Merge(
tegra.Transform(
mountSpecs,
// We remove the /dev/nvidia1 device node.
// TODO: This assumes that the dGPU has index 1 and removes
// it from the set of device nodes.
tegra.Without(tegra.DeviceNodes("/dev/nvidia1")),
),
// We add the display device from the iGPU.
tegra.DeviceNodes("/dev/nvidia2"),
)
default:
return mountSpecs
}
return tegra.New(
tegra.WithLogger(l.logger),
tegra.WithDriverRoot(l.driverRoot),
tegra.WithDevRoot(l.devRoot),
tegra.WithHookCreator(l.hookCreator),
tegra.WithLdconfigPath(l.ldconfigPath),
tegra.WithLibrarySearchPaths(l.librarySearchPaths...),
tegra.WithMountSpecs(
mountSpecs,
// We add the specific device node for this device.
tegra.DeviceNodes(fmt.Sprintf("/dev/nvidia%d", l.index)),
),
)
}

// GetCommonEdits generates a CDI specification that can be used for ANY devices
Expand Down Expand Up @@ -272,35 +302,56 @@ func (l *mixedcsvlib) csvDeviceSpecGenerator(index int, uuid string, device nvml
return nil, fmt.Errorf("is-integrated check failed for device (index=%v,uuid=%v)", index, uuid)
}

if isIntegrated {
return l.iGPUDeviceSpecGenerator(index, uuid)
}

return l.dGPUDeviceSpecGenerator(index, uuid)
}

func (l *mixedcsvlib) dGPUDeviceSpecGenerator(index int, uuid string) (DeviceSpecGenerator, error) {
if index != 1 {
return nil, fmt.Errorf("unexpected device index for dGPU: %d", index)
}
g := &csvDeviceGenerator{
csvlib: (*csvlib)(l),
index: index,
uuid: uuid,
mode: dGPUGeneratorMode,
}

if !isIntegrated {
csvDeviceNodeDiscoverer, err := g.deviceNodeDiscoverer()
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for devices nodes: %w", err)
}
csvDeviceNodeDiscoverer, err := g.deviceNodeDiscoverer()
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for devices nodes: %w", err)
}

// If this is not an integrated GPU, we also create a spec generator for
// the full GPU.
dgpu := (*nvmllib)(l).withInit(&fullGPUDeviceSpecGenerator{
nvmllib: (*nvmllib)(l),
uuid: uuid,
index: index,
// For the CSV case, we include the control device nodes at a
// device level.
additionalDiscoverers: []discover.Discover{
(*nvmllib)(l).controlDeviceNodeDiscoverer(),
csvDeviceNodeDiscoverer,
},
featureFlags: l.featureFlags,
})
return dgpu, nil
}

// If this is not an integrated GPU, we also create a spec generator for
// the full GPU.
dgpu := (*nvmllib)(l).withInit(&fullGPUDeviceSpecGenerator{
nvmllib: (*nvmllib)(l),
uuid: uuid,
index: index,
// For the CSV case, we include the control device nodes at a
// device level.
additionalDiscoverers: []discover.Discover{
(*nvmllib)(l).controlDeviceNodeDiscoverer(),
csvDeviceNodeDiscoverer,
},
featureFlags: l.featureFlags,
})
return dgpu, nil
// iGPUDeviceSpecGenerator creates the device spec generator for an integrated
// GPU. The iGPU is only accepted at index 0; any other index results in an
// error. The returned generator is a csvDeviceGenerator in iGPU mode.
func (l *mixedcsvlib) iGPUDeviceSpecGenerator(index int, uuid string) (DeviceSpecGenerator, error) {
	if index != 0 {
		return nil, fmt.Errorf("unexpected device index for iGPU: %d", index)
	}

	return &csvDeviceGenerator{
		csvlib: (*csvlib)(l),
		index:  index,
		uuid:   uuid,
		mode:   iGPUGeneratorMode,
	}, nil
}

Expand Down
116 changes: 116 additions & 0 deletions pkg/nvcdi/lib-csv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,56 @@ func TestDeviceSpecGenerators(t *testing.T) {
},
},
},
{
description: "thor device with dGPU",
rootfsFolder: "rootfs-thor-dgpu",
lib: &csvlib{
// test-case specific
infolib: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) { return true, "forced" },
},
nvmllib: mockIGXServer(),
},
expectedDeviceSpecs: []specs.Device{
{
Name: "0",
ContainerEdits: specs.ContainerEdits{
DeviceNodes: []*specs.DeviceNode{
{Path: "/dev/nvidia0", HostPath: "/dev/nvidia0"},
{Path: "/dev/nvidiactl", HostPath: "/dev/nvidiactl"},
{Path: "/dev/nvidia2", HostPath: "/dev/nvidia2"},
},
},
},
{
Name: "1",
ContainerEdits: specs.ContainerEdits{
DeviceNodes: []*specs.DeviceNode{
{Path: "/dev/nvidia1", HostPath: "/dev/nvidia1"},
{Path: "/dev/nvidiactl", HostPath: "/dev/nvidiactl"},
},
},
},
},
expectedCommonEdits: &cdi.ContainerEdits{
ContainerEdits: &specs.ContainerEdits{
Hooks: []*specs.Hook{
{
HookName: "createContainer",
Path: "/usr/bin/nvidia-cdi-hook",
Args: []string{"nvidia-cdi-hook", "enable-cuda-compat", "--host-driver-version=540.3.0"},
Env: []string{"NVIDIA_CTK_DEBUG=false"},
},
{
HookName: "createContainer",
Path: "/usr/bin/nvidia-cdi-hook",
Args: []string{"nvidia-cdi-hook", "update-ldcache"},
Env: []string{"NVIDIA_CTK_DEBUG=false"},
},
},
},
},
},
}

for _, tc := range testCases {
Expand Down Expand Up @@ -179,3 +229,69 @@ func stripRoot[T any](root string, v T) T {
}
return modified
}

// mockIGXServer returns a mocked NVML interface describing a system with two
// GPUs: an "NVIDIA Thor" device at index 0 (UUID GPU-0, PCI bus 1) and an
// "RTX Pro 6000" device at index 1 (UUID GPU-1, PCI bus 3, minor number 1).
// The reported driver version is 540.3.0.
//
// TODO: We should move this mock to go-nvml/mock
func mockIGXServer() nvml.Interface {
	const (
		thorUUID = "GPU-0"
		rtxUUID  = "GPU-1"
	)

	thor := &mock.Device{
		GetNameFunc: func() (string, nvml.Return) {
			return "NVIDIA Thor", nvml.SUCCESS
		},
		GetUUIDFunc: func() (string, nvml.Return) {
			return thorUUID, nvml.SUCCESS
		},
		GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) {
			return nvml.PciInfo{
				Bus: 1,
			}, nvml.SUCCESS
		},
	}
	rtx := &mock.Device{
		GetNameFunc: func() (string, nvml.Return) {
			return "RTX Pro 6000", nvml.SUCCESS
		},
		GetUUIDFunc: func() (string, nvml.Return) {
			return rtxUUID, nvml.SUCCESS
		},
		GetPciInfoFunc: func() (nvml.PciInfo, nvml.Return) {
			return nvml.PciInfo{
				Bus: 3,
			}, nvml.SUCCESS
		},
		GetMinorNumberFunc: func() (int, nvml.Return) {
			return 1, nvml.SUCCESS
		},
	}

	// devices is ordered by NVML index; byUUID resolves the same handles by UUID.
	devices := []*mock.Device{thor, rtx}
	byUUID := map[string]*mock.Device{thorUUID: thor, rtxUUID: rtx}

	return &mock.Interface{
		InitFunc: func() nvml.Return {
			return nvml.SUCCESS
		},
		ShutdownFunc: func() nvml.Return {
			return nvml.SUCCESS
		},
		SystemGetDriverVersionFunc: func() (string, nvml.Return) {
			return "540.3.0", nvml.SUCCESS
		},
		DeviceGetCountFunc: func() (int, nvml.Return) {
			return len(devices), nvml.SUCCESS
		},
		DeviceGetHandleByIndexFunc: func(n int) (nvml.Device, nvml.Return) {
			// Return a literal nil for unknown indices so that callers see a
			// nil interface value.
			if n < 0 || n >= len(devices) {
				return nil, nvml.ERROR_INVALID_ARGUMENT
			}
			return devices[n], nvml.SUCCESS
		},
		DeviceGetHandleByUUIDFunc: func(s string) (nvml.Device, nvml.Return) {
			device, found := byUUID[s]
			if !found {
				return nil, nvml.ERROR_INVALID_ARGUMENT
			}
			return device, nvml.SUCCESS
		},
	}
}
Empty file.
Empty file.
Empty file.
Empty file.
Loading
Loading