diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index b16e3fed..5c44a885 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -212,7 +212,7 @@ func parseMajorMinorVersion(version string) (string, error) { // OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { var hasCDIdevice bool - for _, device := range i.VisibleDevicesFromEnvVar() { + for _, device := range i.visibleDevicesFromEnvVar() { if !parser.IsQualifiedName(device) { return false } @@ -228,6 +228,21 @@ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { return hasCDIdevice } +// visibleEnvVars returns the environment variables that are used to determine device visibility. +// It returns the preferred environment variables that are set, or NVIDIA_VISIBLE_DEVICES if none are set. +func (i CUDA) visibleEnvVars() []string { + var envVars []string + for _, envVar := range i.preferredVisibleDeviceEnvVars { + if i.HasEnvvar(envVar) { + envVars = append(envVars, envVar) + } + } + if len(envVars) == 0 { + envVars = append(envVars, EnvVarNvidiaVisibleDevices) + } + return envVars +} + // VisibleDevices returns a list of devices requested in the container image. // If volume mount requests are enabled these are returned if requested, // otherwise device requests through environment variables are considered. @@ -243,7 +258,7 @@ func (i CUDA) VisibleDevices() []string { } // Get the Fallback to reading from the environment variable if privileges are correct - envVarDeviceRequests := i.VisibleDevicesFromEnvVar() + envVarDeviceRequests := i.visibleDevicesFromEnvVar() if len(envVarDeviceRequests) == 0 { return nil } @@ -255,22 +270,21 @@ func (i CUDA) VisibleDevices() []string { } // We log a warning if we are ignoring the environment variable requests. - i.logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES in unprivileged container") + envVars := i.visibleEnvVars() + if len(envVars) > 0 { + i.logger.Warningf("Ignoring devices requested by environment variable(s) in unprivileged container: %v", envVars) + } return nil } -// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables. +// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables. // If any of the preferredVisibleDeviceEnvVars are present in the image, they // are used to determine the visible devices. If this is not the case, the // NVIDIA_VISIBLE_DEVICES environment variable is used. -func (i CUDA) VisibleDevicesFromEnvVar() []string { - for _, envVar := range i.preferredVisibleDeviceEnvVars { - if i.HasEnvvar(envVar) { - return i.DevicesFromEnvvars(i.preferredVisibleDeviceEnvVars...).List() - } - } - return i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices).List() +func (i CUDA) visibleDevicesFromEnvVar() []string { + envVars := i.visibleEnvVars() + return i.DevicesFromEnvvars(envVars...).List() } // visibleDevicesFromMounts returns the set of visible devices requested as mounts. diff --git a/internal/config/image/cuda_image_test.go b/internal/config/image/cuda_image_test.go index 044466d3..75fc06d7 100644 --- a/internal/config/image/cuda_image_test.go +++ b/internal/config/image/cuda_image_test.go @@ -429,7 +429,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { ) require.NoError(t, err) - devices := image.VisibleDevicesFromEnvVar() + devices := image.visibleDevicesFromEnvVar() require.EqualValues(t, tc.expectedDevices, devices) }) } @@ -508,13 +508,15 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) { func TestVisibleDevices(t *testing.T) { var tests = []struct { - description string - mountDevices []specs.Mount - envvarDevices string - privileged bool - acceptUnprivileged bool - acceptMounts bool - expectedDevices []string + description string + mountDevices []specs.Mount + envvarDevices string + privileged bool + acceptUnprivileged bool + acceptMounts bool + preferredVisibleDeviceEnvVars []string + env map[string]string + expectedDevices []string }{ { description: "Mount devices, unprivileged, no accept unprivileged", @@ -597,20 +599,92 @@ func TestVisibleDevices(t *testing.T) { acceptMounts: false, expectedDevices: nil, }, + // New test cases for visibleEnvVars functionality + { + description: "preferred env var set and present in env, privileged", + mountDevices: nil, + envvarDevices: "", + privileged: true, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + }, + expectedDevices: []string{"GPU-12345"}, + }, + { + description: "preferred env var set and present in env, unprivileged but accepted", + mountDevices: nil, + envvarDevices: "", + privileged: false, + acceptUnprivileged: true, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + }, + expectedDevices: []string{"GPU-12345"}, + }, + { + description: "preferred env var set and present in env, unprivileged and not accepted", + mountDevices: nil, + envvarDevices: "", + privileged: false, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + }, + expectedDevices: nil, + }, + { + description: "multiple preferred env vars, both present, privileged", + mountDevices: nil, + envvarDevices: "", + privileged: true, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + "DOCKER_RESOURCE_GPUS_ADDITIONAL": "GPU-67890", + }, + expectedDevices: []string{"GPU-12345", "GPU-67890"}, + }, + { + description: "preferred env var not present, fallback to NVIDIA_VISIBLE_DEVICES, privileged", + mountDevices: nil, + envvarDevices: "GPU-12345", + privileged: true, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + EnvVarNvidiaVisibleDevices: "GPU-12345", + }, + expectedDevices: []string{"GPU-12345"}, + }, } for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { - // Wrap the call to getDevices() in a closure. + // Create env map with both NVIDIA_VISIBLE_DEVICES and any additional env vars + env := make(map[string]string) + if tc.envvarDevices != "" { + env[EnvVarNvidiaVisibleDevices] = tc.envvarDevices + } + for k, v := range tc.env { + env[k] = v + } + image, err := New( - WithEnvMap( - map[string]string{ - EnvVarNvidiaVisibleDevices: tc.envvarDevices, - }, - ), + WithEnvMap(env), WithMounts(tc.mountDevices), WithPrivileged(tc.privileged), WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts), WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged), + WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...), ) require.NoError(t, err) require.Equal(t, tc.expectedDevices, image.VisibleDevices()) diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index 6c7286af..23d2627e 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -86,37 +86,38 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C if err != nil { return nil, err } - if cfg.AcceptDeviceListAsVolumeMounts { - mountDevices := container.CDIDevicesFromMounts() - if len(mountDevices) > 0 { - return mountDevices, nil - } - } var devices []string seen := make(map[string]bool) - for _, name := range container.VisibleDevicesFromEnvVar() { + for _, name := range container.VisibleDevices() { + // Skip empty device names + if strings.TrimSpace(name) == "" { + continue + } + if !parser.IsQualifiedName(name) { name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name) } + + // Skip devices that result in empty device IDs after qualification + if strings.HasSuffix(name, "=") { + logger.Debugf("Ignoring device with empty ID: %q", name) + continue + } + if seen[name] { logger.Debugf("Ignoring duplicate device %q", name) continue } devices = append(devices, name) + seen[name] = true } if len(devices) == 0 { return nil, nil } - if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged((*image.OCISpec)(rawSpec)) { - return devices, nil - } - - logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES: %v", devices) - - return nil, nil + return devices, nil } // getAnnotationDevices returns a list of devices specified in the annotations. diff --git a/internal/modifier/cdi_test.go b/internal/modifier/cdi_test.go index 88ff697a..f5ab134d 100644 --- a/internal/modifier/cdi_test.go +++ b/internal/modifier/cdi_test.go @@ -20,7 +20,12 @@ import ( "fmt" "testing" + "github.com/opencontainers/runtime-spec/specs-go" + testlog "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/require" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/config" + "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" ) func TestGetAnnotationDevices(t *testing.T) { @@ -90,3 +95,201 @@ func TestGetAnnotationDevices(t *testing.T) { }) } } + +func getTestConfig() *config.Config { + cfg, _ := config.GetDefault() + return cfg +} + +func getTestConfigWithAnnotations() *config.Config { + cfg, _ := config.GetDefault() + cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes = []string{"cdi.k8s.io/"} + return cfg +} + +func TestGetDevicesFromSpec(t *testing.T) { + logger, _ := testlog.NewNullLogger() + + testCases := []struct { + description string + spec *specs.Spec + config *config.Config + loadError error + expectedDevices []string + expectedError string + }{ + { + description: "empty spec, no devices specified", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{}, + }, + }, + config: getTestConfig(), + expectedDevices: nil, + expectedError: "", + }, + { + description: "NVIDIA_VISIBLE_DEVICES=all devices specified", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{ + "NVIDIA_VISIBLE_DEVICES=all", + }, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"nvidia.com/gpu=all"}, + expectedError: "", + }, + { + description: "devices from annotations", + spec: &specs.Spec{ + Annotations: map[string]string{ + "cdi.k8s.io/test": "example.com/device=device1,example.com/device=device2", + }, + Process: &specs.Process{ + Env: []string{}, + }, + }, + config: getTestConfigWithAnnotations(), + expectedDevices: []string{"example.com/device=device1", "example.com/device=device2"}, + }, + { + description: "devices from environment variables - single device", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"nvidia.com/gpu=0"}, + }, + { + description: "devices from environment variables - multiple unique devices", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1,2"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"}, + }, + { + description: "devices from environment variables - duplicate devices should be deduplicated", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1,0,2,1"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"}, + }, + { + description: "devices from environment variables - qualified device names", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=example.com/device=dev1,0,example.com/device=dev2"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"example.com/device=dev1", "nvidia.com/gpu=0", "example.com/device=dev2"}, + }, + { + description: "devices from environment variables - duplicate qualified device names should be deduplicated", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=example.com/device=dev1,0,example.com/device=dev1,nvidia.com/gpu=0"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"example.com/device=dev1", "nvidia.com/gpu=0"}, + }, + { + description: "annotation devices take precedence over environment variables", + spec: &specs.Spec{ + Annotations: map[string]string{ + "cdi.k8s.io/test": "example.com/device=annotation-device", + }, + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1"}, + }, + }, + config: getTestConfigWithAnnotations(), + expectedDevices: []string{"example.com/device=annotation-device"}, + }, + { + description: "devices from environment variables - empty and whitespace devices should be filtered", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=0, ,1, \t ,2"}, + }, + }, + config: getTestConfig(), + expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"}, + }, + { + description: "devices from environment variables - void should return empty", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=void"}, + }, + }, + config: getTestConfig(), + expectedDevices: nil, + }, + { + description: "devices from environment variables - none should be filtered out", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES=none"}, + }, + }, + config: getTestConfig(), + expectedDevices: nil, + }, + { + description: "devices from environment variables - all empty devices should result in no devices", + spec: &specs.Spec{ + Process: &specs.Process{ + Env: []string{"NVIDIA_VISIBLE_DEVICES= , , \t "}, + }, + }, + config: getTestConfig(), + expectedDevices: nil, + }, + { + description: "error loading OCI spec", + loadError: fmt.Errorf("failed to load spec"), + config: getTestConfig(), + expectedError: "failed to load OCI spec", + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + mockSpec := &oci.SpecMock{ + LoadFunc: func() (*specs.Spec, error) { + if tc.loadError != nil { + return nil, tc.loadError + } + return tc.spec, nil + }, + } + + devices, err := getDevicesFromSpec(logger, mockSpec, tc.config) + + if tc.expectedError != "" { + require.Error(t, err) + require.Nil(t, devices) + require.Contains(t, err.Error(), tc.expectedError) + } else { + require.NoError(t, err) + require.ElementsMatch(t, tc.expectedDevices, devices) + } + + // Verify that Load was called exactly once + require.Len(t, mockSpec.LoadCalls(), 1) + }) + } +} diff --git a/internal/modifier/csv.go b/internal/modifier/csv.go index d3dc84e7..1f8a12f8 100644 --- a/internal/modifier/csv.go +++ b/internal/modifier/csv.go @@ -33,7 +33,7 @@ import ( // NewCSVModifier creates a modifier that applies modications to an OCI spec if required by the runtime wrapper. // The modifications are defined by CSV MountSpecs. func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) { - if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 { + if devices := container.VisibleDevices(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index a0239df8..e96946a4 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -37,7 +37,7 @@ import ( // // If not devices are selected, no changes are made. func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) { - if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 { + if devices := image.VisibleDevices(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/graphics.go b/internal/modifier/graphics.go index 6e602d7a..e949bd27 100644 --- a/internal/modifier/graphics.go +++ b/internal/modifier/graphics.go @@ -65,7 +65,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI // requiresGraphicsModifier determines whether a graphics modifier is required. func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) { - if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 { + if devices := cudaImage.VisibleDevices(); len(devices) == 0 { return false, "no devices requested" }