From 1b0f07a124f483bb443df1ee0a8ffda34900504c Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 5 Jun 2025 15:48:09 +0200 Subject: [PATCH] Make CDI device requests consistent with other methods Following the refactoring of device request extraction, we can now make CDI device requests consistent with other methods. This change moves to using image.VisibleDevices instead of separate calls to CDIDevicesFromMounts and VisibleDevicesFromEnvVar. The handling of annotation-based requests will be addressed in a follow-up. Signed-off-by: Evan Lezar --- internal/config/image/cuda_image.go | 75 ++++++++++++++---------- internal/config/image/cuda_image_test.go | 6 +- internal/info/auto_test.go | 20 ++++++- internal/modifier/cdi.go | 63 +++++++++++--------- internal/modifier/cdi_test.go | 70 ++++++++++++++++++++++ 5 files changed, 171 insertions(+), 63 deletions(-) diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index b16e3fed..6c8e767e 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -54,8 +54,12 @@ type CUDA struct { // NewCUDAImageFromSpec creates a CUDA image from the input OCI runtime spec. // The process environment is read (if present) to construc the CUDA Image. func NewCUDAImageFromSpec(spec *specs.Spec, opts ...Option) (CUDA, error) { + if spec == nil { + return New(opts...) + } + var env []string - if spec != nil && spec.Process != nil { + if spec.Process != nil { env = spec.Process.Env } @@ -212,19 +216,12 @@ func parseMajorMinorVersion(version string) (string, error) { // OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { var hasCDIdevice bool - for _, device := range i.VisibleDevicesFromEnvVar() { + for _, device := range i.VisibleDevices() { if !parser.IsQualifiedName(device) { return false } hasCDIdevice = true } - - for _, device := range i.DevicesFromMounts() { - if !strings.HasPrefix(device, "cdi/") { - return false - } - hasCDIdevice = true - } return hasCDIdevice } @@ -276,20 +273,27 @@ func (i CUDA) VisibleDevicesFromEnvVar() []string { // visibleDevicesFromMounts returns the set of visible devices requested as mounts. func (i CUDA) visibleDevicesFromMounts() []string { var devices []string - for _, device := range i.DevicesFromMounts() { + for _, device := range i.requestsFromMounts() { switch { - case strings.HasPrefix(device, volumeMountDevicePrefixCDI): - continue case strings.HasPrefix(device, volumeMountDevicePrefixImex): continue + case strings.HasPrefix(device, volumeMountDevicePrefixCDI): + name, err := cdiDeviceMountRequest(device).qualifiedName() + if err != nil { + i.logger.Warningf("Ignoring invalid mount request for CDI device %v: %w", device, err) + continue + } + devices = append(devices, name) + default: + devices = append(devices, device) } - devices = append(devices, device) + } return devices } -// DevicesFromMounts returns a list of device specified as mounts. -func (i CUDA) DevicesFromMounts() []string { +// requestsFromMounts returns a list of device specified as mounts. +func (i CUDA) requestsFromMounts() []string { root := filepath.Clean(DeviceListAsVolumeMountsRoot) seen := make(map[string]bool) var devices []string @@ -321,23 +325,30 @@ func (i CUDA) DevicesFromMounts() []string { return devices } -// CDIDevicesFromMounts returns a list of CDI devices specified as mounts on the image. -func (i CUDA) CDIDevicesFromMounts() []string { - var devices []string - for _, mountDevice := range i.DevicesFromMounts() { - if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixCDI) { - continue - } - parts := strings.SplitN(strings.TrimPrefix(mountDevice, volumeMountDevicePrefixCDI), "/", 3) - if len(parts) != 3 { - continue - } - vendor := parts[0] - class := parts[1] - device := parts[2] - devices = append(devices, fmt.Sprintf("%s/%s=%s", vendor, class, device)) +// a cdiDeviceMountRequest represents a CDI device requests as a mount. +// Here the host path /dev/null is mounted to a particular path in the container. +// The container path has the form: +// /var/run/nvidia-container-devices/cdi/// +// or +// /var/run/nvidia-container-devices/cdi//= +type cdiDeviceMountRequest string + +// qualifiedName returns the fully-qualified name of the CDI device. +func (m cdiDeviceMountRequest) qualifiedName() (string, error) { + if !strings.HasPrefix(string(m), volumeMountDevicePrefixCDI) { + return "", fmt.Errorf("invalid mount CDI device request: %s", m) } - return devices + + requestedDevice := strings.TrimPrefix(string(m), volumeMountDevicePrefixCDI) + if parser.IsQualifiedName(requestedDevice) { + return requestedDevice, nil + } + + parts := strings.SplitN(requestedDevice, "/", 3) + if len(parts) != 3 { + return "", fmt.Errorf("invalid mount CDI device request: %s", m) + } + return fmt.Sprintf("%s/%s=%s", parts[0], parts[1], parts[2]), nil } // ImexChannelsFromEnvVar returns the list of IMEX channels requested for the image. @@ -352,7 +363,7 @@ func (i CUDA) ImexChannelsFromEnvVar() []string { // ImexChannelsFromMounts returns the list of IMEX channels requested for the image. func (i CUDA) ImexChannelsFromMounts() []string { var channels []string - for _, mountDevice := range i.DevicesFromMounts() { + for _, mountDevice := range i.requestsFromMounts() { if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixImex) { continue } diff --git a/internal/config/image/cuda_image_test.go b/internal/config/image/cuda_image_test.go index 044466d3..5c83ba8c 100644 --- a/internal/config/image/cuda_image_test.go +++ b/internal/config/image/cuda_image_test.go @@ -487,9 +487,9 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) { expectedDevices: []string{"GPU0-MIG0/0/1", "GPU1-MIG0/0/1"}, }, { - description: "cdi devices are ignored", - mounts: makeTestMounts("GPU0", "cdi/nvidia.com/gpu=all", "GPU1"), - expectedDevices: []string{"GPU0", "GPU1"}, + description: "cdi devices are included", + mounts: makeTestMounts("GPU0", "nvidia.com/gpu=all", "GPU1"), + expectedDevices: []string{"GPU0", "nvidia.com/gpu=all", "GPU1"}, }, { description: "imex devices are ignored", diff --git a/internal/info/auto_test.go b/internal/info/auto_test.go index ad147526..c2ab93d7 100644 --- a/internal/info/auto_test.go +++ b/internal/info/auto_test.go @@ -184,7 +184,7 @@ func TestResolveAutoMode(t *testing.T) { expectedMode: "legacy", }, { - description: "cdi mount and non-CDI envvar resolves to legacy", + description: "cdi mount and non-CDI envvar resolves to cdi", mode: "auto", envmap: map[string]string{ "NVIDIA_VISIBLE_DEVICES": "0", @@ -197,6 +197,22 @@ func TestResolveAutoMode(t *testing.T) { "tegra": false, "nvgpu": false, }, + expectedMode: "cdi", + }, + { + description: "non-cdi mount and CDI envvar resolves to legacy", + mode: "auto", + envmap: map[string]string{ + "NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0", + }, + mounts: []string{ + "/var/run/nvidia-container-devices/0", + }, + info: map[string]bool{ + "nvml": true, + "tegra": false, + "nvgpu": false, + }, expectedMode: "legacy", }, } @@ -232,6 +248,8 @@ func TestResolveAutoMode(t *testing.T) { image, _ := image.New( image.WithEnvMap(tc.envmap), image.WithMounts(mounts), + image.WithAcceptDeviceListAsVolumeMounts(true), + image.WithAcceptEnvvarUnprivileged(true), ) mode := resolveMode(logger, tc.mode, image, properties) require.EqualValues(t, tc.expectedMode, mode) diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index 6c7286af..a6bf02c3 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -66,57 +66,66 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe } func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.Config) ([]string, error) { + cdiModifier := &cdiModifier{ + logger: logger, + acceptDeviceListAsVolumeMounts: cfg.AcceptDeviceListAsVolumeMounts, + acceptEnvvarUnprivileged: cfg.AcceptEnvvarUnprivileged, + annotationPrefixes: cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes, + defaultKind: cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, + } + return cdiModifier.getDevicesFromSpec(ociSpec) +} + +// TODO: We should rename this type. +type cdiModifier struct { + logger logger.Interface + acceptDeviceListAsVolumeMounts bool + acceptEnvvarUnprivileged bool + annotationPrefixes []string + defaultKind string +} + +func (c *cdiModifier) getDevicesFromSpec(ociSpec oci.Spec) ([]string, error) { rawSpec, err := ociSpec.Load() if err != nil { return nil, fmt.Errorf("failed to load OCI spec: %v", err) } - annotationDevices, err := getAnnotationDevices(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes, rawSpec.Annotations) - if err != nil { - return nil, fmt.Errorf("failed to parse container annotations: %v", err) - } - if len(annotationDevices) > 0 { - return annotationDevices, nil + if rawSpec != nil { + annotationDevices, err := getAnnotationDevices(c.annotationPrefixes, rawSpec.Annotations) + if err != nil { + return nil, fmt.Errorf("failed to parse container annotations: %v", err) + } + if len(annotationDevices) > 0 { + return annotationDevices, nil + } } container, err := image.NewCUDAImageFromSpec( rawSpec, - image.WithLogger(logger), + image.WithLogger(c.logger), + image.WithAcceptDeviceListAsVolumeMounts(c.acceptDeviceListAsVolumeMounts), + image.WithAcceptEnvvarUnprivileged(c.acceptEnvvarUnprivileged), ) if err != nil { return nil, err } - if cfg.AcceptDeviceListAsVolumeMounts { - mountDevices := container.CDIDevicesFromMounts() - if len(mountDevices) > 0 { - return mountDevices, nil - } - } var devices []string seen := make(map[string]bool) - for _, name := range container.VisibleDevicesFromEnvVar() { + for _, name := range container.VisibleDevices() { if !parser.IsQualifiedName(name) { - name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name) + name = fmt.Sprintf("%s=%s", c.defaultKind, name) } if seen[name] { - logger.Debugf("Ignoring duplicate device %q", name) + c.logger.Debugf("Ignoring duplicate device %q", name) continue } + seen[name] = true devices = append(devices, name) } - if len(devices) == 0 { - return nil, nil - } - - if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged((*image.OCISpec)(rawSpec)) { - return devices, nil - } - - logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES: %v", devices) - - return nil, nil + return devices, nil } // getAnnotationDevices returns a list of devices specified in the annotations. diff --git a/internal/modifier/cdi_test.go b/internal/modifier/cdi_test.go index 88ff697a..21c9aac8 100644 --- a/internal/modifier/cdi_test.go +++ b/internal/modifier/cdi_test.go @@ -20,7 +20,11 @@ import ( "fmt" "testing" + "github.com/opencontainers/runtime-spec/specs-go" + testlog "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/require" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" ) func TestGetAnnotationDevices(t *testing.T) { @@ -90,3 +94,69 @@ func TestGetAnnotationDevices(t *testing.T) { }) } } + +func TestGetDevicesFromSpec(t *testing.T) { + logger, _ := testlog.NewNullLogger() + + testCases := []struct { + description string + input cdiModifier + spec *specs.Spec + expectedDevices []string + }{ + { + description: "empty spec yields no devices", + }, + { + description: "cdi devices from mounts", + input: cdiModifier{ + defaultKind: "nvidia.com/gpu", + acceptEnvvarUnprivileged: true, + acceptDeviceListAsVolumeMounts: true, + }, + spec: &specs.Spec{ + Mounts: []specs.Mount{ + { + Destination: "/var/run/nvidia-container-devices/cdi/nvidia.com/gpu/0", + Source: "/dev/null", + }, + { + Destination: "/var/run/nvidia-container-devices/cdi/nvidia.com/gpu/1", + Source: "/dev/null", + }, + }, + }, + expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1"}, + }, + { + description: "cdi devices from envvar", + input: cdiModifier{ + defaultKind: "nvidia.com/gpu", + acceptEnvvarUnprivileged: true, + acceptDeviceListAsVolumeMounts: true, + }, + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0,example.com/class=device"}, + }, + }, + expectedDevices: []string{"nvidia.com/gpu=0", "example.com/class=device"}, + }, + } + + for _, tc := range testCases { + tc.input.logger = logger + + spec := &oci.SpecMock{ + LoadFunc: func() (*specs.Spec, error) { + return tc.spec, nil + }, + } + + t.Run(tc.description, func(t *testing.T) { + devices, err := tc.input.getDevicesFromSpec(spec) + require.NoError(t, err) + require.EqualValues(t, tc.expectedDevices, devices) + }) + } +}