From 5fe7b0651461ac6c0fb65fa80ca5dbb10f073be1 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 5 Jun 2025 13:21:25 +0200 Subject: [PATCH 1/4] [no-relnote] new helper func visibleEnvVars Signed-off-by: Carlos Eduardo Arango Gutierrez Signed-off-by: Evan Lezar --- internal/config/image/cuda_image.go | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index c295d105..fced9c9d 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -232,6 +232,22 @@ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { return hasCDIdevice } +// visibleEnvVars returns the environment variables that are used to determine device visibility. +// It returns the preferred environment variables that are set, or NVIDIA_VISIBLE_DEVICES if none are set. +func (i CUDA) visibleEnvVars() []string { + var envVars []string + for _, envVar := range i.preferredVisibleDeviceEnvVars { + if !i.HasEnvvar(envVar) { + continue + } + envVars = append(envVars, envVar) + } + if len(envVars) > 0 { + return envVars + } + return []string{EnvVarNvidiaVisibleDevices} +} + // VisibleDevices returns a list of devices requested in the container image. // If volume mount requests are enabled these are returned if requested, // otherwise device requests through environment variables are considered. @@ -265,7 +281,10 @@ func (i CUDA) VisibleDevices() []string { } // We log a warning if we are ignoring the environment variable requests. - i.logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES in unprivileged container") + envVars := i.visibleEnvVars() + if len(envVars) > 0 { + i.logger.Warningf("Ignoring devices requested by environment variable(s) in unprivileged container: %v", envVars) + } return nil } @@ -300,12 +319,8 @@ func (i CUDA) cdiDeviceRequestsFromAnnotations() []string { // are used to determine the visible devices. If this is not the case, the // NVIDIA_VISIBLE_DEVICES environment variable is used. func (i CUDA) VisibleDevicesFromEnvVar() []string { - for _, envVar := range i.preferredVisibleDeviceEnvVars { - if i.HasEnvvar(envVar) { - return i.DevicesFromEnvvars(i.preferredVisibleDeviceEnvVars...).List() - } - } - return i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices).List() + envVars := i.visibleEnvVars() + return i.DevicesFromEnvvars(envVars...).List() } // visibleDevicesFromMounts returns the set of visible devices requested as mounts. From f4f7da65f1d5b483111c75c4e3ca97cde3eead5d Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 13 Jun 2025 16:25:11 +0200 Subject: [PATCH 2/4] Ensure consistent sorting of annotation devices Signed-off-by: Evan Lezar --- internal/config/image/cuda_image.go | 14 +++++++++++--- internal/modifier/cdi_test.go | 2 +- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index fced9c9d..7d5bf7c1 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -19,6 +19,7 @@ package image import ( "fmt" "path/filepath" + "slices" "strconv" "strings" @@ -300,17 +301,24 @@ func (i CUDA) cdiDeviceRequestsFromAnnotations() []string { return nil } - var devices []string - for key, value := range i.annotations { + var annotationKeys []string + for key := range i.annotations { for _, prefix := range i.annotationsPrefixes { if strings.HasPrefix(key, prefix) { - devices = append(devices, strings.Split(value, ",")...) + annotationKeys = append(annotationKeys, key) // There is no need to check additional prefixes since we // typically deduplicate devices in any case. break } } } + // We sort the annotationKeys for consistent results. + slices.Sort(annotationKeys) + + var devices []string + for _, key := range annotationKeys { + devices = append(devices, strings.Split(i.annotations[key], ",")...) + } return devices } diff --git a/internal/modifier/cdi_test.go b/internal/modifier/cdi_test.go index 881e8b2c..a5ca78f5 100644 --- a/internal/modifier/cdi_test.go +++ b/internal/modifier/cdi_test.go @@ -98,7 +98,7 @@ func TestDeviceRequests(t *testing.T) { "another-prefix/bar": "example.com/device=baz", }, }, - expectedDevices: []string{"example.com/device=bar", "example.com/device=baz"}, + expectedDevices: []string{"example.com/device=baz", "example.com/device=bar"}, }, { description: "multiple matching annotations with duplicate devices", From d03a06029ad5c94fbdebf9e15c914ff4e784fcf1 Mon Sep 17 00:00:00 2001 From: Carlos Eduardo Arango Gutierrez Date: Thu, 5 Jun 2025 13:25:46 +0200 Subject: [PATCH 3/4] BUGFIX: modifier: respect GPU volume-mount device requests The gated modifiers used to add support for GDS, Mofed, and CUDA Forward Comatibility only check the NVIDIA_VISIBLE_DEVICES envvar to determine whether GPUs are requested and modifications should be made. This means that use cases where volume mounts are used to request devices are not supported. This change ensures that device extraction is consistent for all use cases. Signed-off-by: Carlos Eduardo Arango Gutierrez Signed-off-by: Evan Lezar --- internal/config/image/cuda_image.go | 6 +- internal/config/image/cuda_image_test.go | 102 +++++++++++++++++++---- internal/modifier/csv.go | 2 +- internal/modifier/gated.go | 2 +- internal/modifier/graphics.go | 20 +++-- internal/modifier/graphics_test.go | 18 ++-- 6 files changed, 113 insertions(+), 37 deletions(-) diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index 7d5bf7c1..49184dbd 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -270,7 +270,7 @@ func (i CUDA) VisibleDevices() []string { } // Get the Fallback to reading from the environment variable if privileges are correct - envVarDeviceRequests := i.VisibleDevicesFromEnvVar() + envVarDeviceRequests := i.visibleDevicesFromEnvVar() if len(envVarDeviceRequests) == 0 { return nil } @@ -322,11 +322,11 @@ func (i CUDA) cdiDeviceRequestsFromAnnotations() []string { return devices } -// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables. +// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables. // If any of the preferredVisibleDeviceEnvVars are present in the image, they // are used to determine the visible devices. If this is not the case, the // NVIDIA_VISIBLE_DEVICES environment variable is used. -func (i CUDA) VisibleDevicesFromEnvVar() []string { +func (i CUDA) visibleDevicesFromEnvVar() []string { envVars := i.visibleEnvVars() return i.DevicesFromEnvvars(envVars...).List() } diff --git a/internal/config/image/cuda_image_test.go b/internal/config/image/cuda_image_test.go index 7302a695..aad69d31 100644 --- a/internal/config/image/cuda_image_test.go +++ b/internal/config/image/cuda_image_test.go @@ -429,7 +429,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { ) require.NoError(t, err) - devices := image.VisibleDevicesFromEnvVar() + devices := image.visibleDevicesFromEnvVar() require.EqualValues(t, tc.expectedDevices, devices) }) } @@ -508,13 +508,15 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) { func TestVisibleDevices(t *testing.T) { var tests = []struct { - description string - mountDevices []specs.Mount - envvarDevices string - privileged bool - acceptUnprivileged bool - acceptMounts bool - expectedDevices []string + description string + mountDevices []specs.Mount + envvarDevices string + privileged bool + acceptUnprivileged bool + acceptMounts bool + preferredVisibleDeviceEnvVars []string + env map[string]string + expectedDevices []string }{ { description: "Mount devices, unprivileged, no accept unprivileged", @@ -597,20 +599,92 @@ func TestVisibleDevices(t *testing.T) { acceptMounts: false, expectedDevices: nil, }, + // New test cases for visibleEnvVars functionality + { + description: "preferred env var set and present in env, privileged", + mountDevices: nil, + envvarDevices: "", + privileged: true, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + }, + expectedDevices: []string{"GPU-12345"}, + }, + { + description: "preferred env var set and present in env, unprivileged but accepted", + mountDevices: nil, + envvarDevices: "", + privileged: false, + acceptUnprivileged: true, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + }, + expectedDevices: []string{"GPU-12345"}, + }, + { + description: "preferred env var set and present in env, unprivileged and not accepted", + mountDevices: nil, + envvarDevices: "", + privileged: false, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + }, + expectedDevices: nil, + }, + { + description: "multiple preferred env vars, both present, privileged", + mountDevices: nil, + envvarDevices: "", + privileged: true, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, + env: map[string]string{ + "DOCKER_RESOURCE_GPUS": "GPU-12345", + "DOCKER_RESOURCE_GPUS_ADDITIONAL": "GPU-67890", + }, + expectedDevices: []string{"GPU-12345", "GPU-67890"}, + }, + { + description: "preferred env var not present, fallback to NVIDIA_VISIBLE_DEVICES, privileged", + mountDevices: nil, + envvarDevices: "GPU-12345", + privileged: true, + acceptUnprivileged: false, + acceptMounts: true, + preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"}, + env: map[string]string{ + EnvVarNvidiaVisibleDevices: "GPU-12345", + }, + expectedDevices: []string{"GPU-12345"}, + }, } for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { - // Wrap the call to getDevices() in a closure. + // Create env map with both NVIDIA_VISIBLE_DEVICES and any additional env vars + env := make(map[string]string) + if tc.envvarDevices != "" { + env[EnvVarNvidiaVisibleDevices] = tc.envvarDevices + } + for k, v := range tc.env { + env[k] = v + } + image, err := New( - WithEnvMap( - map[string]string{ - EnvVarNvidiaVisibleDevices: tc.envvarDevices, - }, - ), + WithEnvMap(env), WithMounts(tc.mountDevices), WithPrivileged(tc.privileged), WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts), WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged), + WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...), ) require.NoError(t, err) require.Equal(t, tc.expectedDevices, image.VisibleDevices()) diff --git a/internal/modifier/csv.go b/internal/modifier/csv.go index d3dc84e7..1f8a12f8 100644 --- a/internal/modifier/csv.go +++ b/internal/modifier/csv.go @@ -33,7 +33,7 @@ import ( // NewCSVModifier creates a modifier that applies modications to an OCI spec if required by the runtime wrapper. // The modifications are defined by CSV MountSpecs. func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) { - if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 { + if devices := container.VisibleDevices(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index a0239df8..e96946a4 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -37,7 +37,7 @@ import ( // // If not devices are selected, no changes are made. func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) { - if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 { + if devices := image.VisibleDevices(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/graphics.go b/internal/modifier/graphics.go index 6e602d7a..cd024a89 100644 --- a/internal/modifier/graphics.go +++ b/internal/modifier/graphics.go @@ -29,9 +29,10 @@ import ( // NewGraphicsModifier constructs a modifier that injects graphics-related modifications into an OCI runtime specification. // The value of the NVIDIA_DRIVER_CAPABILITIES environment variable is checked to determine if this modification should be made. -func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerImage image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) { - if required, reason := requiresGraphicsModifier(containerImage); !required { - logger.Infof("No graphics modifier required: %v", reason) +func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, container image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) { + devices, reason := requiresGraphicsModifier(container) + if len(devices) == 0 { + logger.Infof("No graphics modifier required; %v", reason) return nil, nil } @@ -48,7 +49,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI devRoot := driver.Root drmNodes, err := discover.NewDRMNodesDiscoverer( logger, - containerImage.DevicesFromEnvvars(image.EnvVarNvidiaVisibleDevices), + image.NewVisibleDevices(devices...), devRoot, hookCreator, ) @@ -64,14 +65,15 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI } // requiresGraphicsModifier determines whether a graphics modifier is required. -func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) { - if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 { - return false, "no devices requested" +func requiresGraphicsModifier(cudaImage image.CUDA) ([]string, string) { + devices := cudaImage.VisibleDevices() + if len(devices) == 0 { + return nil, "no devices requested" } if !cudaImage.GetDriverCapabilities().Any(image.DriverCapabilityGraphics, image.DriverCapabilityDisplay) { - return false, "no required capabilities requested" + return nil, "no required capabilities requested" } - return true, "" + return devices, "" } diff --git a/internal/modifier/graphics_test.go b/internal/modifier/graphics_test.go index 186af48a..540fee5a 100644 --- a/internal/modifier/graphics_test.go +++ b/internal/modifier/graphics_test.go @@ -26,9 +26,9 @@ import ( func TestGraphicsModifier(t *testing.T) { testCases := []struct { - description string - envmap map[string]string - expectedRequired bool + description string + envmap map[string]string + expectedDevices []string }{ { description: "empty image does not create modifier", @@ -52,7 +52,7 @@ func TestGraphicsModifier(t *testing.T) { "NVIDIA_VISIBLE_DEVICES": "all", "NVIDIA_DRIVER_CAPABILITIES": "all", }, - expectedRequired: true, + expectedDevices: []string{"all"}, }, { description: "devices with graphics capability creates modifier", @@ -60,7 +60,7 @@ func TestGraphicsModifier(t *testing.T) { "NVIDIA_VISIBLE_DEVICES": "all", "NVIDIA_DRIVER_CAPABILITIES": "graphics", }, - expectedRequired: true, + expectedDevices: []string{"all"}, }, { description: "devices with compute,graphics capability creates modifier", @@ -68,7 +68,7 @@ func TestGraphicsModifier(t *testing.T) { "NVIDIA_VISIBLE_DEVICES": "all", "NVIDIA_DRIVER_CAPABILITIES": "compute,graphics", }, - expectedRequired: true, + expectedDevices: []string{"all"}, }, { description: "devices with display capability creates modifier", @@ -76,7 +76,7 @@ func TestGraphicsModifier(t *testing.T) { "NVIDIA_VISIBLE_DEVICES": "all", "NVIDIA_DRIVER_CAPABILITIES": "display", }, - expectedRequired: true, + expectedDevices: []string{"all"}, }, { description: "devices with display,graphics capability creates modifier", @@ -84,7 +84,7 @@ func TestGraphicsModifier(t *testing.T) { "NVIDIA_VISIBLE_DEVICES": "all", "NVIDIA_DRIVER_CAPABILITIES": "display,graphics", }, - expectedRequired: true, + expectedDevices: []string{"all"}, }, } @@ -94,7 +94,7 @@ func TestGraphicsModifier(t *testing.T) { image.WithEnvMap(tc.envmap), ) required, _ := requiresGraphicsModifier(image) - require.EqualValues(t, tc.expectedRequired, required) + require.EqualValues(t, tc.expectedDevices, required) }) } } From 82b62898bf6f624f7969f8a9950912f350325b76 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 17 Jun 2025 11:43:55 +0200 Subject: [PATCH 4/4] [no-relnote] Make devicesFromEnvvars private Signed-off-by: Evan Lezar --- internal/config/image/cuda_image.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index 49184dbd..d1e9a604 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -144,8 +144,8 @@ func (i CUDA) HasDisableRequire() bool { return false } -// DevicesFromEnvvars returns the devices requested by the image through environment variables -func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices { +// devicesFromEnvvars returns the devices requested by the image through environment variables +func (i CUDA) devicesFromEnvvars(envVars ...string) []string { // We concantenate all the devices from the specified env. var isSet bool var devices []string @@ -166,15 +166,15 @@ func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices { // Environment variable unset with legacy image: default to "all". if !isSet && len(devices) == 0 && i.IsLegacy() { - return NewVisibleDevices("all") + devices = []string{"all"} } // Environment variable unset or empty or "void": return nil if len(devices) == 0 || requested["void"] { - return NewVisibleDevices("void") + devices = []string{"void"} } - return NewVisibleDevices(devices...) + return NewVisibleDevices(devices...).List() } // GetDriverCapabilities returns the requested driver capabilities. @@ -328,7 +328,7 @@ func (i CUDA) cdiDeviceRequestsFromAnnotations() []string { // NVIDIA_VISIBLE_DEVICES environment variable is used. func (i CUDA) visibleDevicesFromEnvVar() []string { envVars := i.visibleEnvVars() - return i.DevicesFromEnvvars(envVars...).List() + return i.devicesFromEnvvars(envVars...) } // visibleDevicesFromMounts returns the set of visible devices requested as mounts. @@ -414,7 +414,7 @@ func (m cdiDeviceMountRequest) qualifiedName() (string, error) { // ImexChannelsFromEnvVar returns the list of IMEX channels requested for the image. func (i CUDA) ImexChannelsFromEnvVar() []string { - imexChannels := i.DevicesFromEnvvars(EnvVarNvidiaImexChannels).List() + imexChannels := i.devicesFromEnvvars(EnvVarNvidiaImexChannels) if len(imexChannels) == 1 && imexChannels[0] == "all" { return nil }