mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-11 00:57:15 +00:00
Merge b4f0ad9c8e
into 27f5ec83de
This commit is contained in:
commit
07ad80b910
@ -212,7 +212,7 @@ func parseMajorMinorVersion(version string) (string, error) {
|
||||
// OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/
|
||||
func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
|
||||
var hasCDIdevice bool
|
||||
for _, device := range i.VisibleDevicesFromEnvVar() {
|
||||
for _, device := range i.visibleDevicesFromEnvVar() {
|
||||
if !parser.IsQualifiedName(device) {
|
||||
return false
|
||||
}
|
||||
@ -228,6 +228,21 @@ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
|
||||
return hasCDIdevice
|
||||
}
|
||||
|
||||
// visibleEnvVars returns the environment variables that are used to determine device visibility.
|
||||
// It returns the preferred environment variables that are set, or NVIDIA_VISIBLE_DEVICES if none are set.
|
||||
func (i CUDA) visibleEnvVars() []string {
|
||||
var envVars []string
|
||||
for _, envVar := range i.preferredVisibleDeviceEnvVars {
|
||||
if i.HasEnvvar(envVar) {
|
||||
envVars = append(envVars, envVar)
|
||||
}
|
||||
}
|
||||
if len(envVars) == 0 {
|
||||
envVars = append(envVars, EnvVarNvidiaVisibleDevices)
|
||||
}
|
||||
return envVars
|
||||
}
|
||||
|
||||
// VisibleDevices returns a list of devices requested in the container image.
|
||||
// If volume mount requests are enabled these are returned if requested,
|
||||
// otherwise device requests through environment variables are considered.
|
||||
@ -243,7 +258,7 @@ func (i CUDA) VisibleDevices() []string {
|
||||
}
|
||||
|
||||
// Get the Fallback to reading from the environment variable if privileges are correct
|
||||
envVarDeviceRequests := i.VisibleDevicesFromEnvVar()
|
||||
envVarDeviceRequests := i.visibleDevicesFromEnvVar()
|
||||
if len(envVarDeviceRequests) == 0 {
|
||||
return nil
|
||||
}
|
||||
@ -255,22 +270,21 @@ func (i CUDA) VisibleDevices() []string {
|
||||
}
|
||||
|
||||
// We log a warning if we are ignoring the environment variable requests.
|
||||
i.logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES in unprivileged container")
|
||||
envVars := i.visibleEnvVars()
|
||||
if len(envVars) > 0 {
|
||||
i.logger.Warningf("Ignoring devices requested by environment variable(s) in unprivileged container: %v", envVars)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
|
||||
// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
|
||||
// If any of the preferredVisibleDeviceEnvVars are present in the image, they
|
||||
// are used to determine the visible devices. If this is not the case, the
|
||||
// NVIDIA_VISIBLE_DEVICES environment variable is used.
|
||||
func (i CUDA) VisibleDevicesFromEnvVar() []string {
|
||||
for _, envVar := range i.preferredVisibleDeviceEnvVars {
|
||||
if i.HasEnvvar(envVar) {
|
||||
return i.DevicesFromEnvvars(i.preferredVisibleDeviceEnvVars...).List()
|
||||
}
|
||||
}
|
||||
return i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices).List()
|
||||
func (i CUDA) visibleDevicesFromEnvVar() []string {
|
||||
envVars := i.visibleEnvVars()
|
||||
return i.DevicesFromEnvvars(envVars...).List()
|
||||
}
|
||||
|
||||
// visibleDevicesFromMounts returns the set of visible devices requested as mounts.
|
||||
|
@ -429,7 +429,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
|
||||
)
|
||||
|
||||
require.NoError(t, err)
|
||||
devices := image.VisibleDevicesFromEnvVar()
|
||||
devices := image.visibleDevicesFromEnvVar()
|
||||
require.EqualValues(t, tc.expectedDevices, devices)
|
||||
})
|
||||
}
|
||||
@ -508,13 +508,15 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) {
|
||||
|
||||
func TestVisibleDevices(t *testing.T) {
|
||||
var tests = []struct {
|
||||
description string
|
||||
mountDevices []specs.Mount
|
||||
envvarDevices string
|
||||
privileged bool
|
||||
acceptUnprivileged bool
|
||||
acceptMounts bool
|
||||
expectedDevices []string
|
||||
description string
|
||||
mountDevices []specs.Mount
|
||||
envvarDevices string
|
||||
privileged bool
|
||||
acceptUnprivileged bool
|
||||
acceptMounts bool
|
||||
preferredVisibleDeviceEnvVars []string
|
||||
env map[string]string
|
||||
expectedDevices []string
|
||||
}{
|
||||
{
|
||||
description: "Mount devices, unprivileged, no accept unprivileged",
|
||||
@ -597,20 +599,92 @@ func TestVisibleDevices(t *testing.T) {
|
||||
acceptMounts: false,
|
||||
expectedDevices: nil,
|
||||
},
|
||||
// New test cases for visibleEnvVars functionality
|
||||
{
|
||||
description: "preferred env var set and present in env, privileged",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "",
|
||||
privileged: true,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
|
||||
env: map[string]string{
|
||||
"DOCKER_RESOURCE_GPUS": "GPU-12345",
|
||||
},
|
||||
expectedDevices: []string{"GPU-12345"},
|
||||
},
|
||||
{
|
||||
description: "preferred env var set and present in env, unprivileged but accepted",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "",
|
||||
privileged: false,
|
||||
acceptUnprivileged: true,
|
||||
acceptMounts: true,
|
||||
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
|
||||
env: map[string]string{
|
||||
"DOCKER_RESOURCE_GPUS": "GPU-12345",
|
||||
},
|
||||
expectedDevices: []string{"GPU-12345"},
|
||||
},
|
||||
{
|
||||
description: "preferred env var set and present in env, unprivileged and not accepted",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "",
|
||||
privileged: false,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
|
||||
env: map[string]string{
|
||||
"DOCKER_RESOURCE_GPUS": "GPU-12345",
|
||||
},
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "multiple preferred env vars, both present, privileged",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "",
|
||||
privileged: true,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
|
||||
env: map[string]string{
|
||||
"DOCKER_RESOURCE_GPUS": "GPU-12345",
|
||||
"DOCKER_RESOURCE_GPUS_ADDITIONAL": "GPU-67890",
|
||||
},
|
||||
expectedDevices: []string{"GPU-12345", "GPU-67890"},
|
||||
},
|
||||
{
|
||||
description: "preferred env var not present, fallback to NVIDIA_VISIBLE_DEVICES, privileged",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "GPU-12345",
|
||||
privileged: true,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
|
||||
env: map[string]string{
|
||||
EnvVarNvidiaVisibleDevices: "GPU-12345",
|
||||
},
|
||||
expectedDevices: []string{"GPU-12345"},
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
// Wrap the call to getDevices() in a closure.
|
||||
// Create env map with both NVIDIA_VISIBLE_DEVICES and any additional env vars
|
||||
env := make(map[string]string)
|
||||
if tc.envvarDevices != "" {
|
||||
env[EnvVarNvidiaVisibleDevices] = tc.envvarDevices
|
||||
}
|
||||
for k, v := range tc.env {
|
||||
env[k] = v
|
||||
}
|
||||
|
||||
image, err := New(
|
||||
WithEnvMap(
|
||||
map[string]string{
|
||||
EnvVarNvidiaVisibleDevices: tc.envvarDevices,
|
||||
},
|
||||
),
|
||||
WithEnvMap(env),
|
||||
WithMounts(tc.mountDevices),
|
||||
WithPrivileged(tc.privileged),
|
||||
WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts),
|
||||
WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged),
|
||||
WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...),
|
||||
)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, tc.expectedDevices, image.VisibleDevices())
|
||||
|
@ -86,37 +86,38 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if cfg.AcceptDeviceListAsVolumeMounts {
|
||||
mountDevices := container.CDIDevicesFromMounts()
|
||||
if len(mountDevices) > 0 {
|
||||
return mountDevices, nil
|
||||
}
|
||||
}
|
||||
|
||||
var devices []string
|
||||
seen := make(map[string]bool)
|
||||
for _, name := range container.VisibleDevicesFromEnvVar() {
|
||||
for _, name := range container.VisibleDevices() {
|
||||
// Skip empty device names
|
||||
if strings.TrimSpace(name) == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
if !parser.IsQualifiedName(name) {
|
||||
name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name)
|
||||
}
|
||||
|
||||
// Skip devices that result in empty device IDs after qualification
|
||||
if strings.HasSuffix(name, "=") {
|
||||
logger.Debugf("Ignoring device with empty ID: %q", name)
|
||||
continue
|
||||
}
|
||||
|
||||
if seen[name] {
|
||||
logger.Debugf("Ignoring duplicate device %q", name)
|
||||
continue
|
||||
}
|
||||
devices = append(devices, name)
|
||||
seen[name] = true
|
||||
}
|
||||
|
||||
if len(devices) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged((*image.OCISpec)(rawSpec)) {
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES: %v", devices)
|
||||
|
||||
return nil, nil
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
// getAnnotationDevices returns a list of devices specified in the annotations.
|
||||
|
@ -20,7 +20,12 @@ import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
testlog "github.com/sirupsen/logrus/hooks/test"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
)
|
||||
|
||||
func TestGetAnnotationDevices(t *testing.T) {
|
||||
@ -90,3 +95,201 @@ func TestGetAnnotationDevices(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func getTestConfig() *config.Config {
|
||||
cfg, _ := config.GetDefault()
|
||||
return cfg
|
||||
}
|
||||
|
||||
func getTestConfigWithAnnotations() *config.Config {
|
||||
cfg, _ := config.GetDefault()
|
||||
cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes = []string{"cdi.k8s.io/"}
|
||||
return cfg
|
||||
}
|
||||
|
||||
func TestGetDevicesFromSpec(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
|
||||
testCases := []struct {
|
||||
description string
|
||||
spec *specs.Spec
|
||||
config *config.Config
|
||||
loadError error
|
||||
expectedDevices []string
|
||||
expectedError string
|
||||
}{
|
||||
{
|
||||
description: "empty spec, no devices specified",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: nil,
|
||||
expectedError: "",
|
||||
},
|
||||
{
|
||||
description: "NVIDIA_VISIBLE_DEVICES=all devices specified",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{
|
||||
"NVIDIA_VISIBLE_DEVICES=all",
|
||||
},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"nvidia.com/gpu=all"},
|
||||
expectedError: "",
|
||||
},
|
||||
{
|
||||
description: "devices from annotations",
|
||||
spec: &specs.Spec{
|
||||
Annotations: map[string]string{
|
||||
"cdi.k8s.io/test": "example.com/device=device1,example.com/device=device2",
|
||||
},
|
||||
Process: &specs.Process{
|
||||
Env: []string{},
|
||||
},
|
||||
},
|
||||
config: getTestConfigWithAnnotations(),
|
||||
expectedDevices: []string{"example.com/device=device1", "example.com/device=device2"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - single device",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"nvidia.com/gpu=0"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - multiple unique devices",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1,2"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - duplicate devices should be deduplicated",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1,0,2,1"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - qualified device names",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=example.com/device=dev1,0,example.com/device=dev2"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"example.com/device=dev1", "nvidia.com/gpu=0", "example.com/device=dev2"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - duplicate qualified device names should be deduplicated",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=example.com/device=dev1,0,example.com/device=dev1,nvidia.com/gpu=0"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"example.com/device=dev1", "nvidia.com/gpu=0"},
|
||||
},
|
||||
{
|
||||
description: "annotation devices take precedence over environment variables",
|
||||
spec: &specs.Spec{
|
||||
Annotations: map[string]string{
|
||||
"cdi.k8s.io/test": "example.com/device=annotation-device",
|
||||
},
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,1"},
|
||||
},
|
||||
},
|
||||
config: getTestConfigWithAnnotations(),
|
||||
expectedDevices: []string{"example.com/device=annotation-device"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - empty and whitespace devices should be filtered",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0, ,1, \t ,2"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1", "nvidia.com/gpu=2"},
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - void should return empty",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=void"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - none should be filtered out",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=none"},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "devices from environment variables - all empty devices should result in no devices",
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES= , , \t "},
|
||||
},
|
||||
},
|
||||
config: getTestConfig(),
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "error loading OCI spec",
|
||||
loadError: fmt.Errorf("failed to load spec"),
|
||||
config: getTestConfig(),
|
||||
expectedError: "failed to load OCI spec",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
mockSpec := &oci.SpecMock{
|
||||
LoadFunc: func() (*specs.Spec, error) {
|
||||
if tc.loadError != nil {
|
||||
return nil, tc.loadError
|
||||
}
|
||||
return tc.spec, nil
|
||||
},
|
||||
}
|
||||
|
||||
devices, err := getDevicesFromSpec(logger, mockSpec, tc.config)
|
||||
|
||||
if tc.expectedError != "" {
|
||||
require.Error(t, err)
|
||||
require.Nil(t, devices)
|
||||
require.Contains(t, err.Error(), tc.expectedError)
|
||||
} else {
|
||||
require.NoError(t, err)
|
||||
require.ElementsMatch(t, tc.expectedDevices, devices)
|
||||
}
|
||||
|
||||
// Verify that Load was called exactly once
|
||||
require.Len(t, mockSpec.LoadCalls(), 1)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -33,7 +33,7 @@ import (
|
||||
// NewCSVModifier creates a modifier that applies modications to an OCI spec if required by the runtime wrapper.
|
||||
// The modifications are defined by CSV MountSpecs.
|
||||
func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) {
|
||||
if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
if devices := container.VisibleDevices(); len(devices) == 0 {
|
||||
logger.Infof("No modification required; no devices requested")
|
||||
return nil, nil
|
||||
}
|
||||
|
@ -37,7 +37,7 @@ import (
|
||||
//
|
||||
// If not devices are selected, no changes are made.
|
||||
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
|
||||
if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
if devices := image.VisibleDevices(); len(devices) == 0 {
|
||||
logger.Infof("No modification required; no devices requested")
|
||||
return nil, nil
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI
|
||||
|
||||
// requiresGraphicsModifier determines whether a graphics modifier is required.
|
||||
func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) {
|
||||
if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
if devices := cudaImage.VisibleDevices(); len(devices) == 0 {
|
||||
return false, "no devices requested"
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user