From 6fc6f2c594a3c14dfaac03b62038b0b1e73108a8 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 5 Jun 2025 13:21:25 +0200 Subject: [PATCH 1/2] [no-relnote] new helper func visibleEnvVars Signed-off-by: Carlos Eduardo Arango Gutierrez --- internal/config/image/cuda_image.go | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index b16e3fed..51914470 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -228,6 +228,21 @@ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { return hasCDIdevice } +// visibleEnvVars returns the environment variables that are used to determine device visibility. +// It returns the preferred environment variables that are set, or NVIDIA_VISIBLE_DEVICES if none are set. +func (i CUDA) visibleEnvVars() []string { + var envVars []string + for _, envVar := range i.preferredVisibleDeviceEnvVars { + if i.HasEnvvar(envVar) { + envVars = append(envVars, envVar) + } + } + if len(envVars) == 0 { + envVars = append(envVars, EnvVarNvidiaVisibleDevices) + } + return envVars +} + // VisibleDevices returns a list of devices requested in the container image. // If volume mount requests are enabled these are returned if requested, // otherwise device requests through environment variables are considered. @@ -255,7 +270,10 @@ func (i CUDA) VisibleDevices() []string { } // We log a warning if we are ignoring the environment variable requests. - i.logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES in unprivileged container") + envVars := i.visibleEnvVars() + if len(envVars) > 0 { + i.logger.Warningf("Ignoring devices requested by environment variable(s) in unprivileged container: %v", envVars) + } return nil } @@ -265,12 +283,8 @@ func (i CUDA) VisibleDevices() []string { // are used to determine the visible devices. If this is not the case, the // NVIDIA_VISIBLE_DEVICES environment variable is used. func (i CUDA) VisibleDevicesFromEnvVar() []string { - for _, envVar := range i.preferredVisibleDeviceEnvVars { - if i.HasEnvvar(envVar) { - return i.DevicesFromEnvvars(i.preferredVisibleDeviceEnvVars...).List() - } - } - return i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices).List() + envVars := i.visibleEnvVars() + return i.DevicesFromEnvvars(envVars...).List() } // visibleDevicesFromMounts returns the set of visible devices requested as mounts. From b4f0ad9c8e8120f303785829c5e393d0649b40c2 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 5 Jun 2025 13:25:46 +0200 Subject: [PATCH 2/2] BUGFIX: modifier: respect GPU volume-mount device requests The gated modifiers used to add support for GDS, Mofed, and CUDA Forward Comatibility only check the NVIDIA_VISIBLE_DEVICES envvar to determine whether GPUs are requested and modifications should be made. This means that use cases where volume mounts are used to request devices (e.g. when using the GPU Device Plugin) are not supported. This patch takes visibleDevicesFromEnvVar private, making VisibleDevices the only exported method to query valid devices. And edits the gated modifiers to use this func, ensuring device requests via mounts are also taken into acount. Signed-off-by: Carlos Eduardo Arango Gutierrez --- internal/config/image/cuda_image.go | 8 +- internal/config/image/cuda_image_test.go | 102 ++++++++++-- internal/modifier/cdi.go | 29 ++-- internal/modifier/cdi_test.go | 203 +++++++++++++++++++++++ internal/modifier/csv.go | 2 +- internal/modifier/gated.go | 2 +- internal/modifier/graphics.go | 2 +- 7 files changed, 313 insertions(+), 35 deletions(-) diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index 51914470..5c44a885 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -212,7 +212,7 @@ func parseMajorMinorVersion(version string) (string, error) { // OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { var hasCDIdevice bool - for _, device := range i.VisibleDevicesFromEnvVar() { + for _, device := range i.visibleDevicesFromEnvVar() { if !parser.IsQualifiedName(device) { return false } @@ -258,7 +258,7 @@ func (i CUDA) VisibleDevices() []string { } // Get the Fallback to reading from the environment variable if privileges are correct - envVarDeviceRequests := i.VisibleDevicesFromEnvVar() + envVarDeviceRequests := i.visibleDevicesFromEnvVar() if len(envVarDeviceRequests) == 0 { return nil } @@ -278,11 +278,11 @@ func (i CUDA) VisibleDevices() []string { return nil } -// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables. +// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables. // If any of the preferredVisibleDeviceEnvVars are present in the image, they // are used to determine the visible devices. If this is not the case, the // NVIDIA_VISIBLE_DEVICES environment variable is used. -func (i CUDA) VisibleDevicesFromEnvVar() []string { +func (i CUDA) visibleDevicesFromEnvVar() []string { envVars := i.visibleEnvVars() return i.DevicesFromEnvvars(envVars...).List() } diff --git a/internal/config/image/cuda_image_test.go b/internal/config/image/cuda_image_test.go index 044466d3..75fc06d7 100644 --- a/internal/config/image/cuda_image_test.go +++ b/internal/config/image/cuda_image_test.go @@ -429,7 +429,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { ) require.NoError(t, err) - devices := image.VisibleDevicesFromEnvVar() + devices := image.visibleDevicesFromEnvVar() require.EqualValues(t, tc.expectedDevices, devices) }) } @@ -508,13 +508,15 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) { func TestVisibleDevices(t *testing.T) { var tests = []struct { - description string - mountDevices []specs.Mount - envvarDevices string - privileged bool - acceptUnprivileged bool - acceptMounts bool - expectedDevices []string + description string + mountDevices []specs.Mount + envvarDevices string + privileged bool + acceptUnprivileged bool + acceptMounts bool + preferredVisibleDeviceEnvVars []string + env map[string]string + expectedDevices []string }{ { description: "Mount devices, unprivileged, no accept unprivileged", @@ -597,20 +599,92 @@ func TestVisibleDevices(t *testing.T) { acceptMounts: false, expectedDevices: nil, }, + // New test cases for visibleEnvVars functionality + { + description: "preferred env var set and present in env, privileged", + mountDevices: nil, + envvarDevices: "", + privileged: true, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + }, + expectedDevices: []string{"GPU-12345"}, + }, + { + description: "preferred env var set and present in env, unprivileged but accepted", + mountDevices: nil, + envvarDevices: "", + privileged: false, + acceptUnprivileged: true, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + }, + expectedDevices: []string{"GPU-12345"}, + }, + { + description: "preferred env var set and present in env, unprivileged and not accepted", + mountDevices: nil, + envvarDevices: "", + privileged: false, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + }, + expectedDevices: nil, + }, + { + description: "multiple preferred env vars, both present, privileged", + mountDevices: nil, + envvarDevices: "", + privileged: true, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + "DOCKER_RESOURCE_GPUS_ADDITIONAL": "GPU-67890", + }, + expectedDevices: []string{"GPU-12345", "GPU-67890"}, + }, + { + description: "preferred env var not present, fallback to NVIDIA_VISIBLE_DEVICES, privileged", + mountDevices: nil, + envvarDevices: "GPU-12345", + privileged: true, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + EnvVarNvidiaVisibleDevices: "GPU-12345", + }, + expectedDevices: []string{"GPU-12345"}, + }, } for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { - // Wrap the call to getDevices() in a closure. + // Create env map with both NVIDIA_VISIBLE_DEVICES and any additional env vars + env := make(map[string]string) + if tc.envvarDevices != "" { + env[EnvVarNvidiaVisibleDevices] = tc.envvarDevices + } + for k, v := range tc.env { + env[k] = v + } + image, err := New( - WithEnvMap( - map[string]string{ - EnvVarNvidiaVisibleDevices: tc.envvarDevices, - }, - ), + WithEnvMap(env), WithMounts(tc.mountDevices), WithPrivileged(tc.privileged), WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts), WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged), + WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...), ) require.NoError(t, err) require.Equal(t, tc.expectedDevices, image.VisibleDevices()) diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index 6c7286af..23d2627e 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -86,37 +86,38 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C if err != nil { return nil, err } - if cfg.AcceptDeviceListAsVolumeMounts { - mountDevices := container.CDIDevicesFromMounts() - if len(mountDevices) > 0 { - return mountDevices, nil - } - } var devices []string seen := make(map[string]bool) - for _, name := range container.VisibleDevicesFromEnvVar() { + for _, name := range container.VisibleDevices() { + // Skip empty device names + if strings.TrimSpace(name) == "" { + continue + } + if !parser.IsQualifiedName(name) { name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name) } + + // Skip devices that result in empty device IDs after qualification + if strings.HasSuffix(name, "=") { + logger.Debugf("Ignoring device with empty ID: %q", name) + continue + } + if seen[name] { logger.Debugf("Ignoring duplicate device %q", name) continue } devices = append(devices, name) + seen[name] = true } if len(devices) == 0 { return nil, nil } - if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged((*image.OCISpec)(rawSpec)) { - return devices, nil - } - - logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES: %v", devices) - - return nil, nil + return devices, nil } // getAnnotationDevices returns a list of devices specified in the annotations. diff --git a/internal/modifier/cdi_test.go b/internal/modifier/cdi_test.go index 88ff697a..f5ab134d 100644 --- a/internal/modifier/cdi_test.go +++ b/internal/modifier/cdi_test.go @@ -20,7 +20,12 @@ import ( "fmt" "testing" + "github.com/opencontainers/runtime-spec/specs-go" + testlog "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/require" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/config" + "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" ) func TestGetAnnotationDevices(t *testing.T) { @@ -90,3 +95,201 @@ func TestGetAnnotationDevices(t *testing.T) { }) } } + +func getTestConfig() *config.Config { + cfg, _ := config.GetDefault() + return cfg +} + +func getTestConfigWithAnnotations() *config.Config { + cfg, _ := config.GetDefault() + cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes = []string{"cdi.k8s.io/"} + return cfg +} + +func TestGetDevicesFromSpec(t *testing.T) { + logger, _ := testlog.NewNullLogger() + + testCases := []struct { + description string + spec *specs.Spec + config *config.Config + loadError error + expectedDevices []string + expectedError string + }{ + { + description: "empty spec, no devices specified", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{}, + }, + }, + config: getTestConfig(), + expectedDevices: nil, + expectedError: "", + }, + { + description: "NVIDIA_VISIBLE_DEVICES=all devices specified", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{ + "NVIDIA_VISIBLE_DEVICES=all", + }, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"nvidia.com/gpu=all"}, + expectedError: "", + }, + { + description: "devices from annotations", + spec: &specs.Spec{ + Annotations: map[string]string{ + "cdi.k8s.io/test": "example.com/device=device1,example.com/device=device2", + }, + Process: &specs.Process{ + Env: []string{}, + }, + }, + config: getTestConfigWithAnnotations(), + expectedDevices: []string{"example.com/device=device1", "example.com/device=device2"}, + }, + { + description: "devices from environment variables - single device", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"nvidia.com/gpu=0"}, + }, + { + description: "devices from environment variables - multiple unique devices", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1,2"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"}, + }, + { + description: "devices from environment variables - duplicate devices should be deduplicated", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1,0,2,1"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"}, + }, + { + description: "devices from environment variables - qualified device names", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=example.com/device=dev1,0,example.com/device=dev2"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"example.com/device=dev1", "nvidia.com/gpu=0", "example.com/device=dev2"}, + }, + { + description: "devices from environment variables - duplicate qualified device names should be deduplicated", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=example.com/device=dev1,0,example.com/device=dev1,nvidia.com/gpu=0"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"example.com/device=dev1", "nvidia.com/gpu=0"}, + }, + { + description: "annotation devices take precedence over environment variables", + spec: &specs.Spec{ + Annotations: map[string]string{ + "cdi.k8s.io/test": "example.com/device=annotation-device", + }, + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1"}, + }, + }, + config: getTestConfigWithAnnotations(), + expectedDevices: []string{"example.com/device=annotation-device"}, + }, + { + description: "devices from environment variables - empty and whitespace devices should be filtered", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0, ,1, \t ,2"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"}, + }, + { + description: "devices from environment variables - void should return empty", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=void"}, + }, + }, + config: getTestConfig(), + expectedDevices: nil, + }, + { + description: "devices from environment variables - none should be filtered out", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=none"}, + }, + }, + config: getTestConfig(), + expectedDevices: nil, + }, + { + description: "devices from environment variables - all empty devices should result in no devices", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES= , , \t "}, + }, + }, + config: getTestConfig(), + expectedDevices: nil, + }, + { + description: "error loading OCI spec", + loadError: fmt.Errorf("failed to load spec"), + config: getTestConfig(), + expectedError: "failed to load OCI spec", + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + mockSpec := &oci.SpecMock{ + LoadFunc: func() (*specs.Spec, error) { + if tc.loadError != nil { + return nil, tc.loadError + } + return tc.spec, nil + }, + } + + devices, err := getDevicesFromSpec(logger, mockSpec, tc.config) + + if tc.expectedError != "" { + require.Error(t, err) + require.Nil(t, devices) + require.Contains(t, err.Error(), tc.expectedError) + } else { + require.NoError(t, err) + require.ElementsMatch(t, tc.expectedDevices, devices) + } + + // Verify that Load was called exactly once + require.Len(t, mockSpec.LoadCalls(), 1) + }) + } +} diff --git a/internal/modifier/csv.go b/internal/modifier/csv.go index d3dc84e7..1f8a12f8 100644 --- a/internal/modifier/csv.go +++ b/internal/modifier/csv.go @@ -33,7 +33,7 @@ import ( // NewCSVModifier creates a modifier that applies modications to an OCI spec if required by the runtime wrapper. // The modifications are defined by CSV MountSpecs. func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) { - if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 { + if devices := container.VisibleDevices(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index a0239df8..e96946a4 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -37,7 +37,7 @@ import ( // // If not devices are selected, no changes are made. func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) { - if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 { + if devices := image.VisibleDevices(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/graphics.go b/internal/modifier/graphics.go index 6e602d7a..e949bd27 100644 --- a/internal/modifier/graphics.go +++ b/internal/modifier/graphics.go @@ -65,7 +65,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI // requiresGraphicsModifier determines whether a graphics modifier is required. func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) { - if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 { + if devices := cudaImage.VisibleDevices(); len(devices) == 0 { return false, "no devices requested" }