mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-11 09:04:37 +00:00
BUGFIX: modifier: respect GPU volume-mount device requests
The gated modifiers used to add support for GDS, Mofed, and CUDA Forward Comatibility only check the NVIDIA_VISIBLE_DEVICES envvar to determine whether GPUs are requested and modifications should be made. This means that use cases where volume mounts are used to request devices (e.g. when using the GPU Device Plugin) are not supported. This patch takes visibleDevicesFromEnvVar private, making VisibleDevices the only exported method to query valid devices. And edits the gated modifiers to use this func, ensuring device requests via mounts are also taken into acount. Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
This commit is contained in:
parent
6fc6f2c594
commit
b4f0ad9c8e
@ -212,7 +212,7 @@ func parseMajorMinorVersion(version string) (string, error) {
|
||||
// OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/
|
||||
func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
|
||||
var hasCDIdevice bool
|
||||
for _, device := range i.VisibleDevicesFromEnvVar() {
|
||||
for _, device := range i.visibleDevicesFromEnvVar() {
|
||||
if !parser.IsQualifiedName(device) {
|
||||
return false
|
||||
}
|
||||
@ -258,7 +258,7 @@ func (i CUDA) VisibleDevices() []string {
|
||||
}
|
||||
|
||||
// Get the Fallback to reading from the environment variable if privileges are correct
|
||||
envVarDeviceRequests := i.VisibleDevicesFromEnvVar()
|
||||
envVarDeviceRequests := i.visibleDevicesFromEnvVar()
|
||||
if len(envVarDeviceRequests) == 0 {
|
||||
return nil
|
||||
}
|
||||
@ -278,11 +278,11 @@ func (i CUDA) VisibleDevices() []string {
|
||||
return nil
|
||||
}
|
||||
|
||||
// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
|
||||
// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
|
||||
// If any of the preferredVisibleDeviceEnvVars are present in the image, they
|
||||
// are used to determine the visible devices. If this is not the case, the
|
||||
// NVIDIA_VISIBLE_DEVICES environment variable is used.
|
||||
func (i CUDA) VisibleDevicesFromEnvVar() []string {
|
||||
func (i CUDA) visibleDevicesFromEnvVar() []string {
|
||||
envVars := i.visibleEnvVars()
|
||||
return i.DevicesFromEnvvars(envVars...).List()
|
||||
}
|
||||
|
@ -429,7 +429,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
|
||||
)
|
||||
|
||||
require.NoError(t, err)
|
||||
devices := image.VisibleDevicesFromEnvVar()
|
||||
devices := image.visibleDevicesFromEnvVar()
|
||||
require.EqualValues(t, tc.expectedDevices, devices)
|
||||
})
|
||||
}
|
||||
@ -508,13 +508,15 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) {
|
||||
|
||||
func TestVisibleDevices(t *testing.T) {
|
||||
var tests = []struct {
|
||||
description string
|
||||
mountDevices []specs.Mount
|
||||
envvarDevices string
|
||||
privileged bool
|
||||
acceptUnprivileged bool
|
||||
acceptMounts bool
|
||||
expectedDevices []string
|
||||
description string
|
||||
mountDevices []specs.Mount
|
||||
envvarDevices string
|
||||
privileged bool
|
||||
acceptUnprivileged bool
|
||||
acceptMounts bool
|
||||
preferredVisibleDeviceEnvVars []string
|
||||
env map[string]string
|
||||
expectedDevices []string
|
||||
}{
|
||||
{
|
||||
description: "Mount devices, unprivileged, no accept unprivileged",
|
||||
@ -597,20 +599,92 @@ func TestVisibleDevices(t *testing.T) {
|
||||
acceptMounts: false,
|
||||
expectedDevices: nil,
|
||||
},
|
||||
// New test cases for visibleEnvVars functionality
|
||||
{
|
||||
description: "preferred env var set and present in env, privileged",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "",
|
||||
privileged: true,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
|
||||
env: map[string]string{
|
||||
"DOCKER_RESOURCE_GPUS": "GPU-12345",
|
||||
},
|
||||
expectedDevices: []string{"GPU-12345"},
|
||||
},
|
||||
{
|
||||
description: "preferred env var set and present in env, unprivileged but accepted",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "",
|
||||
privileged: false,
|
||||
acceptUnprivileged: true,
|
||||
acceptMounts: true,
|
||||
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
|
||||
env: map[string]string{
|
||||
"DOCKER_RESOURCE_GPUS": "GPU-12345",
|
||||
},
|
||||
expectedDevices: []string{"GPU-12345"},
|
||||
},
|
||||
{
|
||||
description: "preferred env var set and present in env, unprivileged and not accepted",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "",
|
||||
privileged: false,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
|
||||
env: map[string]string{
|
||||
"DOCKER_RESOURCE_GPUS": "GPU-12345",
|
||||
},
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "multiple preferred env vars, both present, privileged",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "",
|
||||
privileged: true,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
|
||||
env: map[string]string{
|
||||
"DOCKER_RESOURCE_GPUS": "GPU-12345",
|
||||
"DOCKER_RESOURCE_GPUS_ADDITIONAL": "GPU-67890",
|
||||
},
|
||||
expectedDevices: []string{"GPU-12345", "GPU-67890"},
|
||||
},
|
||||
{
|
||||
description: "preferred env var not present, fallback to NVIDIA_VISIBLE_DEVICES, privileged",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "GPU-12345",
|
||||
privileged: true,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
|
||||
env: map[string]string{
|
||||
EnvVarNvidiaVisibleDevices: "GPU-12345",
|
||||
},
|
||||
expectedDevices: []string{"GPU-12345"},
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
// Wrap the call to getDevices() in a closure.
|
||||
// Create env map with both NVIDIA_VISIBLE_DEVICES and any additional env vars
|
||||
env := make(map[string]string)
|
||||
if tc.envvarDevices != "" {
|
||||
env[EnvVarNvidiaVisibleDevices] = tc.envvarDevices
|
||||
}
|
||||
for k, v := range tc.env {
|
||||
env[k] = v
|
||||
}
|
||||
|
||||
image, err := New(
|
||||
WithEnvMap(
|
||||
map[string]string{
|
||||
EnvVarNvidiaVisibleDevices: tc.envvarDevices,
|
||||
},
|
||||
),
|
||||
WithEnvMap(env),
|
||||
WithMounts(tc.mountDevices),
|
||||
WithPrivileged(tc.privileged),
|
||||
WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts),
|
||||
WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged),
|
||||
WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...),
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, tc.expectedDevices, image.VisibleDevices())
|
||||
|
@ -86,37 +86,38 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if cfg.AcceptDeviceListAsVolumeMounts {
|
||||
mountDevices := container.CDIDevicesFromMounts()
|
||||
if len(mountDevices) > 0 {
|
||||
return mountDevices, nil
|
||||
}
|
||||
}
|
||||
|
||||
var devices []string
|
||||
seen := make(map[string]bool)
|
||||
for _, name := range container.VisibleDevicesFromEnvVar() {
|
||||
for _, name := range container.VisibleDevices() {
|
||||
// Skip empty device names
|
||||
if strings.TrimSpace(name) == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if !parser.IsQualifiedName(name) {
|
||||
name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name)
|
||||
}
|
||||
|
||||
// Skip devices that result in empty device IDs after qualification
|
||||
if strings.HasSuffix(name, "=") {
|
||||
logger.Debugf("Ignoring device with empty ID: %q", name)
|
||||
continue
|
||||
}
|
||||
|
||||
if seen[name] {
|
||||
logger.Debugf("Ignoring duplicate device %q", name)
|
||||
continue
|
||||
}
|
||||
devices = append(devices, name)
|
||||
seen[name] = true
|
||||
}
|
||||
|
||||
if len(devices) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged((*image.OCISpec)(rawSpec)) {
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES: %v", devices)
|
||||
|
||||
return nil, nil
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
// getAnnotationDevices returns a list of devices specified in the annotations.
|
||||
|
@ -20,7 +20,12 @@ import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
testlog "github.com/sirupsen/logrus/hooks/test"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
)
|
||||
|
||||
func TestGetAnnotationDevices(t *testing.T) {
|
||||
@ -90,3 +95,201 @@ func TestGetAnnotationDevices(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func getTestConfig() *config.Config {
|
||||
cfg, _ := config.GetDefault()
|
||||
return cfg
|
||||
}
|
||||
|
||||
func getTestConfigWithAnnotations() *config.Config {
|
||||
cfg, _ := config.GetDefault()
|
||||
cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes = []string{"cdi.k8s.io/"}
|
||||
return cfg
|
||||
}
|
||||
|
||||
func TestGetDevicesFromSpec(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
|
||||
testCases := []struct {
|
||||
description string
|
||||
spec *specs.Spec
|
||||
config *config.Config
|
||||
loadError error
|
||||
expectedDevices []string
|
||||
expectedError string
|
||||
}{
|
||||
{
|
||||
description: "empty spec, no devices specified",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: nil,
|
||||
expectedError: "",
|
||||
},
|
||||
{
|
||||
description: "NVIDIA_VISIBLE_DEVICES=all devices specified",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{
|
||||
"NVIDIA_VISIBLE_DEVICES=all",
|
||||
},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"nvidia.com/gpu=all"},
|
||||
expectedError: "",
|
||||
},
|
||||
{
|
||||
description: "devices from annotations",
|
||||
spec: &specs.Spec{
|
||||
Annotations: map[string]string{
|
||||
"cdi.k8s.io/test": "example.com/device=device1,example.com/device=device2",
|
||||
},
|
||||
Process: &specs.Process{
|
||||
Env: []string{},
|
||||
},
|
||||
},
|
||||
config: getTestConfigWithAnnotations(),
|
||||
expectedDevices: []string{"example.com/device=device1", "example.com/device=device2"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - single device",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"nvidia.com/gpu=0"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - multiple unique devices",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1,2"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - duplicate devices should be deduplicated",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1,0,2,1"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - qualified device names",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=example.com/device=dev1,0,example.com/device=dev2"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"example.com/device=dev1", "nvidia.com/gpu=0", "example.com/device=dev2"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - duplicate qualified device names should be deduplicated",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=example.com/device=dev1,0,example.com/device=dev1,nvidia.com/gpu=0"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"example.com/device=dev1", "nvidia.com/gpu=0"},
|
||||
},
|
||||
{
|
||||
description: "annotation devices take precedence over environment variables",
|
||||
spec: &specs.Spec{
|
||||
Annotations: map[string]string{
|
||||
"cdi.k8s.io/test": "example.com/device=annotation-device",
|
||||
},
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1"},
|
||||
},
|
||||
},
|
||||
config: getTestConfigWithAnnotations(),
|
||||
expectedDevices: []string{"example.com/device=annotation-device"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - empty and whitespace devices should be filtered",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0, ,1, \t ,2"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - void should return empty",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=void"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - none should be filtered out",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=none"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - all empty devices should result in no devices",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES= , , \t "},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "error loading OCI spec",
|
||||
loadError: fmt.Errorf("failed to load spec"),
|
||||
config: getTestConfig(),
|
||||
expectedError: "failed to load OCI spec",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
mockSpec := &oci.SpecMock{
|
||||
LoadFunc: func() (*specs.Spec, error) {
|
||||
if tc.loadError != nil {
|
||||
return nil, tc.loadError
|
||||
}
|
||||
return tc.spec, nil
|
||||
},
|
||||
}
|
||||
|
||||
devices, err := getDevicesFromSpec(logger, mockSpec, tc.config)
|
||||
|
||||
if tc.expectedError != "" {
|
||||
require.Error(t, err)
|
||||
require.Nil(t, devices)
|
||||
require.Contains(t, err.Error(), tc.expectedError)
|
||||
} else {
|
||||
require.NoError(t, err)
|
||||
require.ElementsMatch(t, tc.expectedDevices, devices)
|
||||
}
|
||||
|
||||
// Verify that Load was called exactly once
|
||||
require.Len(t, mockSpec.LoadCalls(), 1)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -33,7 +33,7 @@ import (
|
||||
// NewCSVModifier creates a modifier that applies modications to an OCI spec if required by the runtime wrapper.
|
||||
// The modifications are defined by CSV MountSpecs.
|
||||
func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) {
|
||||
if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
if devices := container.VisibleDevices(); len(devices) == 0 {
|
||||
logger.Infof("No modification required; no devices requested")
|
||||
return nil, nil
|
||||
}
|
||||
|
@ -37,7 +37,7 @@ import (
|
||||
//
|
||||
// If not devices are selected, no changes are made.
|
||||
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
|
||||
if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
if devices := image.VisibleDevices(); len(devices) == 0 {
|
||||
logger.Infof("No modification required; no devices requested")
|
||||
return nil, nil
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI
|
||||
|
||||
// requiresGraphicsModifier determines whether a graphics modifier is required.
|
||||
func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) {
|
||||
if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
if devices := cudaImage.VisibleDevices(); len(devices) == 0 {
|
||||
return false, "no devices requested"
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user