BUGFIX: modifier: respect GPU volume-mount device requests

The gated modifiers used to add support for GDS, Mofed, and CUDA Forward Compatibility
only check the NVIDIA_VISIBLE_DEVICES envvar to determine whether GPUs are requested
and modifications should be made. This means that use cases where volume mounts are
used to request devices are not supported.

This change ensures that device extraction is consistent for all use cases.

Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Carlos Eduardo Arango Gutierrez 2025-06-05 13:25:46 +02:00 committed by Evan Lezar
parent f4f7da65f1
commit d03a06029a
No known key found for this signature in database
6 changed files with 113 additions and 37 deletions

View File

@ -270,7 +270,7 @@ func (i CUDA) VisibleDevices() []string {
}
// Fall back to reading from the environment variable if privileges are correct
envVarDeviceRequests := i.VisibleDevicesFromEnvVar()
envVarDeviceRequests := i.visibleDevicesFromEnvVar()
if len(envVarDeviceRequests) == 0 {
return nil
}
@ -322,11 +322,11 @@ func (i CUDA) cdiDeviceRequestsFromAnnotations() []string {
return devices
}
// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
// If any of the preferredVisibleDeviceEnvVars are present in the image, they
// are used to determine the visible devices. If this is not the case, the
// NVIDIA_VISIBLE_DEVICES environment variable is used.
func (i CUDA) VisibleDevicesFromEnvVar() []string {
func (i CUDA) visibleDevicesFromEnvVar() []string {
envVars := i.visibleEnvVars()
return i.DevicesFromEnvvars(envVars...).List()
}

View File

@ -429,7 +429,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
)
require.NoError(t, err)
devices := image.VisibleDevicesFromEnvVar()
devices := image.visibleDevicesFromEnvVar()
require.EqualValues(t, tc.expectedDevices, devices)
})
}
@ -514,6 +514,8 @@ func TestVisibleDevices(t *testing.T) {
privileged bool
acceptUnprivileged bool
acceptMounts bool
preferredVisibleDeviceEnvVars []string
env map[string]string
expectedDevices []string
}{
{
@ -597,20 +599,92 @@ func TestVisibleDevices(t *testing.T) {
acceptMounts: false,
expectedDevices: nil,
},
// New test cases for visibleEnvVars functionality
{
description: "preferred env var set and present in env, privileged",
mountDevices: nil,
envvarDevices: "",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
},
expectedDevices: []string{"GPU-12345"},
},
{
description: "preferred env var set and present in env, unprivileged but accepted",
mountDevices: nil,
envvarDevices: "",
privileged: false,
acceptUnprivileged: true,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
},
expectedDevices: []string{"GPU-12345"},
},
{
description: "preferred env var set and present in env, unprivileged and not accepted",
mountDevices: nil,
envvarDevices: "",
privileged: false,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
},
expectedDevices: nil,
},
{
description: "multiple preferred env vars, both present, privileged",
mountDevices: nil,
envvarDevices: "",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
"DOCKER_RESOURCE_GPUS_ADDITIONAL": "GPU-67890",
},
expectedDevices: []string{"GPU-12345", "GPU-67890"},
},
{
description: "preferred env var not present, fallback to NVIDIA_VISIBLE_DEVICES, privileged",
mountDevices: nil,
envvarDevices: "GPU-12345",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
EnvVarNvidiaVisibleDevices: "GPU-12345",
},
expectedDevices: []string{"GPU-12345"},
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
// Wrap the call to getDevices() in a closure.
// Create env map with both NVIDIA_VISIBLE_DEVICES and any additional env vars
env := make(map[string]string)
if tc.envvarDevices != "" {
env[EnvVarNvidiaVisibleDevices] = tc.envvarDevices
}
for k, v := range tc.env {
env[k] = v
}
image, err := New(
WithEnvMap(
map[string]string{
EnvVarNvidiaVisibleDevices: tc.envvarDevices,
},
),
WithEnvMap(env),
WithMounts(tc.mountDevices),
WithPrivileged(tc.privileged),
WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts),
WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged),
WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...),
)
require.NoError(t, err)
require.Equal(t, tc.expectedDevices, image.VisibleDevices())

View File

@ -33,7 +33,7 @@ import (
// NewCSVModifier creates a modifier that applies modifications to an OCI spec if required by the runtime wrapper.
// The modifications are defined by CSV MountSpecs.
func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) {
if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 {
if devices := container.VisibleDevices(); len(devices) == 0 {
logger.Infof("No modification required; no devices requested")
return nil, nil
}

View File

@ -37,7 +37,7 @@ import (
//
// If no devices are selected, no changes are made.
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 {
if devices := image.VisibleDevices(); len(devices) == 0 {
logger.Infof("No modification required; no devices requested")
return nil, nil
}

View File

@ -29,9 +29,10 @@ import (
// NewGraphicsModifier constructs a modifier that injects graphics-related modifications into an OCI runtime specification.
// The value of the NVIDIA_DRIVER_CAPABILITIES environment variable is checked to determine if this modification should be made.
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerImage image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
if required, reason := requiresGraphicsModifier(containerImage); !required {
logger.Infof("No graphics modifier required: %v", reason)
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, container image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
devices, reason := requiresGraphicsModifier(container)
if len(devices) == 0 {
logger.Infof("No graphics modifier required; %v", reason)
return nil, nil
}
@ -48,7 +49,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI
devRoot := driver.Root
drmNodes, err := discover.NewDRMNodesDiscoverer(
logger,
containerImage.DevicesFromEnvvars(image.EnvVarNvidiaVisibleDevices),
image.NewVisibleDevices(devices...),
devRoot,
hookCreator,
)
@ -64,14 +65,15 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI
}
// requiresGraphicsModifier determines whether a graphics modifier is required.
func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) {
if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 {
return false, "no devices requested"
func requiresGraphicsModifier(cudaImage image.CUDA) ([]string, string) {
devices := cudaImage.VisibleDevices()
if len(devices) == 0 {
return nil, "no devices requested"
}
if !cudaImage.GetDriverCapabilities().Any(image.DriverCapabilityGraphics, image.DriverCapabilityDisplay) {
return false, "no required capabilities requested"
return nil, "no required capabilities requested"
}
return true, ""
return devices, ""
}

View File

@ -28,7 +28,7 @@ func TestGraphicsModifier(t *testing.T) {
testCases := []struct {
description string
envmap map[string]string
expectedRequired bool
expectedDevices []string
}{
{
description: "empty image does not create modifier",
@ -52,7 +52,7 @@ func TestGraphicsModifier(t *testing.T) {
"NVIDIA_VISIBLE_DEVICES": "all",
"NVIDIA_DRIVER_CAPABILITIES": "all",
},
expectedRequired: true,
expectedDevices: []string{"all"},
},
{
description: "devices with graphics capability creates modifier",
@ -60,7 +60,7 @@ func TestGraphicsModifier(t *testing.T) {
"NVIDIA_VISIBLE_DEVICES": "all",
"NVIDIA_DRIVER_CAPABILITIES": "graphics",
},
expectedRequired: true,
expectedDevices: []string{"all"},
},
{
description: "devices with compute,graphics capability creates modifier",
@ -68,7 +68,7 @@ func TestGraphicsModifier(t *testing.T) {
"NVIDIA_VISIBLE_DEVICES": "all",
"NVIDIA_DRIVER_CAPABILITIES": "compute,graphics",
},
expectedRequired: true,
expectedDevices: []string{"all"},
},
{
description: "devices with display capability creates modifier",
@ -76,7 +76,7 @@ func TestGraphicsModifier(t *testing.T) {
"NVIDIA_VISIBLE_DEVICES": "all",
"NVIDIA_DRIVER_CAPABILITIES": "display",
},
expectedRequired: true,
expectedDevices: []string{"all"},
},
{
description: "devices with display,graphics capability creates modifier",
@ -84,7 +84,7 @@ func TestGraphicsModifier(t *testing.T) {
"NVIDIA_VISIBLE_DEVICES": "all",
"NVIDIA_DRIVER_CAPABILITIES": "display,graphics",
},
expectedRequired: true,
expectedDevices: []string{"all"},
},
}
@ -94,7 +94,7 @@ func TestGraphicsModifier(t *testing.T) {
image.WithEnvMap(tc.envmap),
)
required, _ := requiresGraphicsModifier(image)
require.EqualValues(t, tc.expectedRequired, required)
require.EqualValues(t, tc.expectedDevices, required)
})
}
}