BUGFIX: modifier: respect GPU volume-mount device requests

The gated modifiers used to add support for GDS, Mofed, and CUDA Forward Comatibility only check the NVIDIA_VISIBLE_DEVICES envvar to determine whether GPUs are requested and modifications should be made. This means that use cases where volume mounts are used to request devices (e.g. when using the GPU Device Plugin) are not supported.

This patch takes visibleDevicesFromEnvVar private, making VisibleDevices the only exported method to query valid devices.
And edits the gated modifiers to use this func, ensuring device requests
via mounts are also taken into acount.

Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
This commit is contained in:
Carlos Eduardo Arango Gutierrez 2025-06-05 13:25:46 +02:00
parent 6fc6f2c594
commit b4f0ad9c8e
No known key found for this signature in database
GPG Key ID: 42D9CB42F300A852
7 changed files with 313 additions and 35 deletions

View File

@ -212,7 +212,7 @@ func parseMajorMinorVersion(version string) (string, error) {
// OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/
func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
var hasCDIdevice bool
for _, device := range i.VisibleDevicesFromEnvVar() {
for _, device := range i.visibleDevicesFromEnvVar() {
if !parser.IsQualifiedName(device) {
return false
}
@ -258,7 +258,7 @@ func (i CUDA) VisibleDevices() []string {
}
// Get the Fallback to reading from the environment variable if privileges are correct
envVarDeviceRequests := i.VisibleDevicesFromEnvVar()
envVarDeviceRequests := i.visibleDevicesFromEnvVar()
if len(envVarDeviceRequests) == 0 {
return nil
}
@ -278,11 +278,11 @@ func (i CUDA) VisibleDevices() []string {
return nil
}
// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
// If any of the preferredVisibleDeviceEnvVars are present in the image, they
// are used to determine the visible devices. If this is not the case, the
// NVIDIA_VISIBLE_DEVICES environment variable is used.
func (i CUDA) VisibleDevicesFromEnvVar() []string {
func (i CUDA) visibleDevicesFromEnvVar() []string {
envVars := i.visibleEnvVars()
return i.DevicesFromEnvvars(envVars...).List()
}

View File

@ -429,7 +429,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
)
require.NoError(t, err)
devices := image.VisibleDevicesFromEnvVar()
devices := image.visibleDevicesFromEnvVar()
require.EqualValues(t, tc.expectedDevices, devices)
})
}
@ -508,13 +508,15 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) {
func TestVisibleDevices(t *testing.T) {
var tests = []struct {
description string
mountDevices []specs.Mount
envvarDevices string
privileged bool
acceptUnprivileged bool
acceptMounts bool
expectedDevices []string
description string
mountDevices []specs.Mount
envvarDevices string
privileged bool
acceptUnprivileged bool
acceptMounts bool
preferredVisibleDeviceEnvVars []string
env map[string]string
expectedDevices []string
}{
{
description: "Mount devices, unprivileged, no accept unprivileged",
@ -597,20 +599,92 @@ func TestVisibleDevices(t *testing.T) {
acceptMounts: false,
expectedDevices: nil,
},
// New test cases for visibleEnvVars functionality
{
description: "preferred env var set and present in env, privileged",
mountDevices: nil,
envvarDevices: "",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
},
expectedDevices: []string{"GPU-12345"},
},
{
description: "preferred env var set and present in env, unprivileged but accepted",
mountDevices: nil,
envvarDevices: "",
privileged: false,
acceptUnprivileged: true,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
},
expectedDevices: []string{"GPU-12345"},
},
{
description: "preferred env var set and present in env, unprivileged and not accepted",
mountDevices: nil,
envvarDevices: "",
privileged: false,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
},
expectedDevices: nil,
},
{
description: "multiple preferred env vars, both present, privileged",
mountDevices: nil,
envvarDevices: "",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
"DOCKER_RESOURCE_GPUS_ADDITIONAL": "GPU-67890",
},
expectedDevices: []string{"GPU-12345", "GPU-67890"},
},
{
description: "preferred env var not present, fallback to NVIDIA_VISIBLE_DEVICES, privileged",
mountDevices: nil,
envvarDevices: "GPU-12345",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
EnvVarNvidiaVisibleDevices: "GPU-12345",
},
expectedDevices: []string{"GPU-12345"},
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
// Wrap the call to getDevices() in a closure.
// Create env map with both NVIDIA_VISIBLE_DEVICES and any additional env vars
env := make(map[string]string)
if tc.envvarDevices != "" {
env[EnvVarNvidiaVisibleDevices] = tc.envvarDevices
}
for k, v := range tc.env {
env[k] = v
}
image, err := New(
WithEnvMap(
map[string]string{
EnvVarNvidiaVisibleDevices: tc.envvarDevices,
},
),
WithEnvMap(env),
WithMounts(tc.mountDevices),
WithPrivileged(tc.privileged),
WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts),
WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged),
WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...),
)
require.NoError(t, err)
require.Equal(t, tc.expectedDevices, image.VisibleDevices())

View File

@ -86,37 +86,38 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C
if err != nil {
return nil, err
}
if cfg.AcceptDeviceListAsVolumeMounts {
mountDevices := container.CDIDevicesFromMounts()
if len(mountDevices) > 0 {
return mountDevices, nil
}
}
var devices []string
seen := make(map[string]bool)
for _, name := range container.VisibleDevicesFromEnvVar() {
for _, name := range container.VisibleDevices() {
// Skip empty device names
if strings.TrimSpace(name) == "" {
continue
}
if !parser.IsQualifiedName(name) {
name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name)
}
// Skip devices that result in empty device IDs after qualification
if strings.HasSuffix(name, "=") {
logger.Debugf("Ignoring device with empty ID: %q", name)
continue
}
if seen[name] {
logger.Debugf("Ignoring duplicate device %q", name)
continue
}
devices = append(devices, name)
seen[name] = true
}
if len(devices) == 0 {
return nil, nil
}
if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged((*image.OCISpec)(rawSpec)) {
return devices, nil
}
logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES: %v", devices)
return nil, nil
return devices, nil
}
// getAnnotationDevices returns a list of devices specified in the annotations.

View File

@ -20,7 +20,12 @@ import (
"fmt"
"testing"
"github.com/opencontainers/runtime-spec/specs-go"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
)
func TestGetAnnotationDevices(t *testing.T) {
@ -90,3 +95,201 @@ func TestGetAnnotationDevices(t *testing.T) {
})
}
}
func getTestConfig() *config.Config {
cfg, _ := config.GetDefault()
return cfg
}
func getTestConfigWithAnnotations() *config.Config {
cfg, _ := config.GetDefault()
cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes = []string{"cdi.k8s.io/"}
return cfg
}
func TestGetDevicesFromSpec(t *testing.T) {
logger, _ := testlog.NewNullLogger()
testCases := []struct {
description string
spec *specs.Spec
config *config.Config
loadError error
expectedDevices []string
expectedError string
}{
{
description: "empty spec, no devices specified",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{},
},
},
config: getTestConfig(),
expectedDevices: nil,
expectedError: "",
},
{
description: "NVIDIA_VISIBLE_DEVICES=all devices specified",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{
"NVIDIA_VISIBLE_DEVICES=all",
},
},
},
config: getTestConfig(),
expectedDevices: []string{"nvidia.com/gpu=all"},
expectedError: "",
},
{
description: "devices from annotations",
spec: &specs.Spec{
Annotations: map[string]string{
"cdi.k8s.io/test": "example.com/device=device1,example.com/device=device2",
},
Process: &specs.Process{
Env: []string{},
},
},
config: getTestConfigWithAnnotations(),
expectedDevices: []string{"example.com/device=device1", "example.com/device=device2"},
},
{
description: "devices from environment variables - single device",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=0"},
},
},
config: getTestConfig(),
expectedDevices: []string{"nvidia.com/gpu=0"},
},
{
description: "devices from environment variables - multiple unique devices",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1,2"},
},
},
config: getTestConfig(),
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"},
},
{
description: "devices from environment variables - duplicate devices should be deduplicated",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1,0,2,1"},
},
},
config: getTestConfig(),
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"},
},
{
description: "devices from environment variables - qualified device names",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=example.com/device=dev1,0,example.com/device=dev2"},
},
},
config: getTestConfig(),
expectedDevices: []string{"example.com/device=dev1", "nvidia.com/gpu=0", "example.com/device=dev2"},
},
{
description: "devices from environment variables - duplicate qualified device names should be deduplicated",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=example.com/device=dev1,0,example.com/device=dev1,nvidia.com/gpu=0"},
},
},
config: getTestConfig(),
expectedDevices: []string{"example.com/device=dev1", "nvidia.com/gpu=0"},
},
{
description: "annotation devices take precedence over environment variables",
spec: &specs.Spec{
Annotations: map[string]string{
"cdi.k8s.io/test": "example.com/device=annotation-device",
},
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1"},
},
},
config: getTestConfigWithAnnotations(),
expectedDevices: []string{"example.com/device=annotation-device"},
},
{
description: "devices from environment variables - empty and whitespace devices should be filtered",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=0, ,1, \t ,2"},
},
},
config: getTestConfig(),
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"},
},
{
description: "devices from environment variables - void should return empty",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=void"},
},
},
config: getTestConfig(),
expectedDevices: nil,
},
{
description: "devices from environment variables - none should be filtered out",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=none"},
},
},
config: getTestConfig(),
expectedDevices: nil,
},
{
description: "devices from environment variables - all empty devices should result in no devices",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES= , , \t "},
},
},
config: getTestConfig(),
expectedDevices: nil,
},
{
description: "error loading OCI spec",
loadError: fmt.Errorf("failed to load spec"),
config: getTestConfig(),
expectedError: "failed to load OCI spec",
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
mockSpec := &oci.SpecMock{
LoadFunc: func() (*specs.Spec, error) {
if tc.loadError != nil {
return nil, tc.loadError
}
return tc.spec, nil
},
}
devices, err := getDevicesFromSpec(logger, mockSpec, tc.config)
if tc.expectedError != "" {
require.Error(t, err)
require.Nil(t, devices)
require.Contains(t, err.Error(), tc.expectedError)
} else {
require.NoError(t, err)
require.ElementsMatch(t, tc.expectedDevices, devices)
}
// Verify that Load was called exactly once
require.Len(t, mockSpec.LoadCalls(), 1)
})
}
}

View File

@ -33,7 +33,7 @@ import (
// NewCSVModifier creates a modifier that applies modications to an OCI spec if required by the runtime wrapper.
// The modifications are defined by CSV MountSpecs.
func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) {
if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 {
if devices := container.VisibleDevices(); len(devices) == 0 {
logger.Infof("No modification required; no devices requested")
return nil, nil
}

View File

@ -37,7 +37,7 @@ import (
//
// If not devices are selected, no changes are made.
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 {
if devices := image.VisibleDevices(); len(devices) == 0 {
logger.Infof("No modification required; no devices requested")
return nil, nil
}

View File

@ -65,7 +65,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI
// requiresGraphicsModifier determines whether a graphics modifier is required.
func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) {
if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 {
if devices := cudaImage.VisibleDevices(); len(devices) == 0 {
return false, "no devices requested"
}