Merge pull request #1130 from ArangoGutierrez/fix/1049

BUGFIX: modifier: respect GPU volume-mount device requests
This commit is contained in:
Evan Lezar 2025-06-17 15:04:22 +02:00 committed by GitHub
commit 208896d87d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 153 additions and 54 deletions

View File

@ -19,6 +19,7 @@ package image
import (
"fmt"
"path/filepath"
"slices"
"strconv"
"strings"
@ -143,8 +144,8 @@ func (i CUDA) HasDisableRequire() bool {
return false
}
// DevicesFromEnvvars returns the devices requested by the image through environment variables
func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices {
// devicesFromEnvvars returns the devices requested by the image through environment variables
func (i CUDA) devicesFromEnvvars(envVars ...string) []string {
// We concatenate all the devices from the specified env.
var isSet bool
var devices []string
@ -165,15 +166,15 @@ func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices {
// Environment variable unset with legacy image: default to "all".
if !isSet && len(devices) == 0 && i.IsLegacy() {
return NewVisibleDevices("all")
devices = []string{"all"}
}
// Environment variable unset or empty or "void": return nil
if len(devices) == 0 || requested["void"] {
return NewVisibleDevices("void")
devices = []string{"void"}
}
return NewVisibleDevices(devices...)
return NewVisibleDevices(devices...).List()
}
// GetDriverCapabilities returns the requested driver capabilities.
@ -232,6 +233,22 @@ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
return hasCDIdevice
}
// visibleEnvVars lists the environment variables that are consulted when
// determining device visibility.
// Each entry of preferredVisibleDeviceEnvVars for which HasEnvvar reports true
// is included, in the configured order. When none of them qualifies, the result
// is the single entry NVIDIA_VISIBLE_DEVICES (EnvVarNvidiaVisibleDevices), so
// the returned slice is never empty.
func (i CUDA) visibleEnvVars() []string {
	var present []string
	for _, name := range i.preferredVisibleDeviceEnvVars {
		if i.HasEnvvar(name) {
			present = append(present, name)
		}
	}
	// No preferred variable is set: fall back to the default variable.
	if len(present) == 0 {
		return []string{EnvVarNvidiaVisibleDevices}
	}
	return present
}
// VisibleDevices returns a list of devices requested in the container image.
// If volume mount requests are enabled these are returned if requested,
// otherwise device requests through environment variables are considered.
@ -253,7 +270,7 @@ func (i CUDA) VisibleDevices() []string {
}
// Fall back to reading from the environment variable if privileges are correct
envVarDeviceRequests := i.VisibleDevicesFromEnvVar()
envVarDeviceRequests := i.visibleDevicesFromEnvVar()
if len(envVarDeviceRequests) == 0 {
return nil
}
@ -265,7 +282,10 @@ func (i CUDA) VisibleDevices() []string {
}
// We log a warning if we are ignoring the environment variable requests.
i.logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES in unprivileged container")
envVars := i.visibleEnvVars()
if len(envVars) > 0 {
i.logger.Warningf("Ignoring devices requested by environment variable(s) in unprivileged container: %v", envVars)
}
return nil
}
@ -281,31 +301,34 @@ func (i CUDA) cdiDeviceRequestsFromAnnotations() []string {
return nil
}
var devices []string
for key, value := range i.annotations {
var annotationKeys []string
for key := range i.annotations {
for _, prefix := range i.annotationsPrefixes {
if strings.HasPrefix(key, prefix) {
devices = append(devices, strings.Split(value, ",")...)
annotationKeys = append(annotationKeys, key)
// There is no need to check additional prefixes since we
// typically deduplicate devices in any case.
break
}
}
}
// We sort the annotationKeys for consistent results.
slices.Sort(annotationKeys)
var devices []string
for _, key := range annotationKeys {
devices = append(devices, strings.Split(i.annotations[key], ",")...)
}
return devices
}
// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
// visibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
// If any of the preferredVisibleDeviceEnvVars are present in the image, they
// are used to determine the visible devices. If this is not the case, the
// NVIDIA_VISIBLE_DEVICES environment variable is used.
func (i CUDA) VisibleDevicesFromEnvVar() []string {
for _, envVar := range i.preferredVisibleDeviceEnvVars {
if i.HasEnvvar(envVar) {
return i.DevicesFromEnvvars(i.preferredVisibleDeviceEnvVars...).List()
}
}
return i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices).List()
func (i CUDA) visibleDevicesFromEnvVar() []string {
envVars := i.visibleEnvVars()
return i.devicesFromEnvvars(envVars...)
}
// visibleDevicesFromMounts returns the set of visible devices requested as mounts.
@ -391,7 +414,7 @@ func (m cdiDeviceMountRequest) qualifiedName() (string, error) {
// ImexChannelsFromEnvVar returns the list of IMEX channels requested for the image.
func (i CUDA) ImexChannelsFromEnvVar() []string {
imexChannels := i.DevicesFromEnvvars(EnvVarNvidiaImexChannels).List()
imexChannels := i.devicesFromEnvvars(EnvVarNvidiaImexChannels)
if len(imexChannels) == 1 && imexChannels[0] == "all" {
return nil
}

View File

@ -429,7 +429,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
)
require.NoError(t, err)
devices := image.VisibleDevicesFromEnvVar()
devices := image.visibleDevicesFromEnvVar()
require.EqualValues(t, tc.expectedDevices, devices)
})
}
@ -508,13 +508,15 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) {
func TestVisibleDevices(t *testing.T) {
var tests = []struct {
description string
mountDevices []specs.Mount
envvarDevices string
privileged bool
acceptUnprivileged bool
acceptMounts bool
expectedDevices []string
description string
mountDevices []specs.Mount
envvarDevices string
privileged bool
acceptUnprivileged bool
acceptMounts bool
preferredVisibleDeviceEnvVars []string
env map[string]string
expectedDevices []string
}{
{
description: "Mount devices, unprivileged, no accept unprivileged",
@ -597,20 +599,92 @@ func TestVisibleDevices(t *testing.T) {
acceptMounts: false,
expectedDevices: nil,
},
// New test cases for visibleEnvVars functionality
{
description: "preferred env var set and present in env, privileged",
mountDevices: nil,
envvarDevices: "",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
},
expectedDevices: []string{"GPU-12345"},
},
{
description: "preferred env var set and present in env, unprivileged but accepted",
mountDevices: nil,
envvarDevices: "",
privileged: false,
acceptUnprivileged: true,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
},
expectedDevices: []string{"GPU-12345"},
},
{
description: "preferred env var set and present in env, unprivileged and not accepted",
mountDevices: nil,
envvarDevices: "",
privileged: false,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
},
expectedDevices: nil,
},
{
description: "multiple preferred env vars, both present, privileged",
mountDevices: nil,
envvarDevices: "",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
"DOCKER_RESOURCE_GPUS": "GPU-12345",
"DOCKER_RESOURCE_GPUS_ADDITIONAL": "GPU-67890",
},
expectedDevices: []string{"GPU-12345", "GPU-67890"},
},
{
description: "preferred env var not present, fallback to NVIDIA_VISIBLE_DEVICES, privileged",
mountDevices: nil,
envvarDevices: "GPU-12345",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS"},
env: map[string]string{
EnvVarNvidiaVisibleDevices: "GPU-12345",
},
expectedDevices: []string{"GPU-12345"},
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
// Wrap the call to getDevices() in a closure.
// Create env map with both NVIDIA_VISIBLE_DEVICES and any additional env vars
env := make(map[string]string)
if tc.envvarDevices != "" {
env[EnvVarNvidiaVisibleDevices] = tc.envvarDevices
}
for k, v := range tc.env {
env[k] = v
}
image, err := New(
WithEnvMap(
map[string]string{
EnvVarNvidiaVisibleDevices: tc.envvarDevices,
},
),
WithEnvMap(env),
WithMounts(tc.mountDevices),
WithPrivileged(tc.privileged),
WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts),
WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged),
WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...),
)
require.NoError(t, err)
require.Equal(t, tc.expectedDevices, image.VisibleDevices())

View File

@ -98,7 +98,7 @@ func TestDeviceRequests(t *testing.T) {
"another-prefix/bar": "example.com/device=baz",
},
},
expectedDevices: []string{"example.com/device=bar", "example.com/device=baz"},
expectedDevices: []string{"example.com/device=baz", "example.com/device=bar"},
},
{
description: "multiple matching annotations with duplicate devices",

View File

@ -33,7 +33,7 @@ import (
// NewCSVModifier creates a modifier that applies modifications to an OCI spec if required by the runtime wrapper.
// The modifications are defined by CSV MountSpecs.
func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) {
if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 {
if devices := container.VisibleDevices(); len(devices) == 0 {
logger.Infof("No modification required; no devices requested")
return nil, nil
}

View File

@ -37,7 +37,7 @@ import (
//
// If no devices are selected, no changes are made.
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 {
if devices := image.VisibleDevices(); len(devices) == 0 {
logger.Infof("No modification required; no devices requested")
return nil, nil
}

View File

@ -29,9 +29,10 @@ import (
// NewGraphicsModifier constructs a modifier that injects graphics-related modifications into an OCI runtime specification.
// The value of the NVIDIA_DRIVER_CAPABILITIES environment variable is checked to determine if this modification should be made.
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerImage image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
if required, reason := requiresGraphicsModifier(containerImage); !required {
logger.Infof("No graphics modifier required: %v", reason)
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, container image.CUDA, driver *root.Driver, hookCreator discover.HookCreator) (oci.SpecModifier, error) {
devices, reason := requiresGraphicsModifier(container)
if len(devices) == 0 {
logger.Infof("No graphics modifier required; %v", reason)
return nil, nil
}
@ -48,7 +49,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI
devRoot := driver.Root
drmNodes, err := discover.NewDRMNodesDiscoverer(
logger,
containerImage.DevicesFromEnvvars(image.EnvVarNvidiaVisibleDevices),
image.NewVisibleDevices(devices...),
devRoot,
hookCreator,
)
@ -64,14 +65,15 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerI
}
// requiresGraphicsModifier determines whether a graphics modifier is required.
func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) {
if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 {
return false, "no devices requested"
func requiresGraphicsModifier(cudaImage image.CUDA) ([]string, string) {
devices := cudaImage.VisibleDevices()
if len(devices) == 0 {
return nil, "no devices requested"
}
if !cudaImage.GetDriverCapabilities().Any(image.DriverCapabilityGraphics, image.DriverCapabilityDisplay) {
return false, "no required capabilities requested"
return nil, "no required capabilities requested"
}
return true, ""
return devices, ""
}

View File

@ -26,9 +26,9 @@ import (
func TestGraphicsModifier(t *testing.T) {
testCases := []struct {
description string
envmap map[string]string
expectedRequired bool
description string
envmap map[string]string
expectedDevices []string
}{
{
description: "empty image does not create modifier",
@ -52,7 +52,7 @@ func TestGraphicsModifier(t *testing.T) {
"NVIDIA_VISIBLE_DEVICES": "all",
"NVIDIA_DRIVER_CAPABILITIES": "all",
},
expectedRequired: true,
expectedDevices: []string{"all"},
},
{
description: "devices with graphics capability creates modifier",
@ -60,7 +60,7 @@ func TestGraphicsModifier(t *testing.T) {
"NVIDIA_VISIBLE_DEVICES": "all",
"NVIDIA_DRIVER_CAPABILITIES": "graphics",
},
expectedRequired: true,
expectedDevices: []string{"all"},
},
{
description: "devices with compute,graphics capability creates modifier",
@ -68,7 +68,7 @@ func TestGraphicsModifier(t *testing.T) {
"NVIDIA_VISIBLE_DEVICES": "all",
"NVIDIA_DRIVER_CAPABILITIES": "compute,graphics",
},
expectedRequired: true,
expectedDevices: []string{"all"},
},
{
description: "devices with display capability creates modifier",
@ -76,7 +76,7 @@ func TestGraphicsModifier(t *testing.T) {
"NVIDIA_VISIBLE_DEVICES": "all",
"NVIDIA_DRIVER_CAPABILITIES": "display",
},
expectedRequired: true,
expectedDevices: []string{"all"},
},
{
description: "devices with display,graphics capability creates modifier",
@ -84,7 +84,7 @@ func TestGraphicsModifier(t *testing.T) {
"NVIDIA_VISIBLE_DEVICES": "all",
"NVIDIA_DRIVER_CAPABILITIES": "display,graphics",
},
expectedRequired: true,
expectedDevices: []string{"all"},
},
}
@ -94,7 +94,7 @@ func TestGraphicsModifier(t *testing.T) {
image.WithEnvMap(tc.envmap),
)
required, _ := requiresGraphicsModifier(image)
require.EqualValues(t, tc.expectedRequired, required)
require.EqualValues(t, tc.expectedDevices, required)
})
}
}