Merge pull request #1110 from ArangoGutierrez/i/1049

Refactor extracting requested devices from the container image
This commit is contained in:
Evan Lezar 2025-06-05 13:00:32 +02:00 committed by GitHub
commit b55255e31f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 653 additions and 438 deletions

View File

@ -13,10 +13,6 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
)
const (
capSysAdmin = "CAP_SYS_ADMIN"
)
type nvidiaConfig struct {
Devices []string
MigConfigDevices string
@ -103,9 +99,9 @@ func loadSpec(path string) (spec *Spec) {
return
}
func isPrivileged(s *Spec) bool {
if s.Process.Capabilities == nil {
return false
func (s *Spec) GetCapabilities() []string {
if s == nil || s.Process == nil || s.Process.Capabilities == nil {
return nil
}
var caps []string
@ -118,67 +114,22 @@ func isPrivileged(s *Spec) bool {
if err != nil {
log.Panicln("could not decode Process.Capabilities in OCI spec:", err)
}
for _, c := range caps {
if c == capSysAdmin {
return true
}
}
return false
return caps
}
// Otherwise, parse s.Process.Capabilities as:
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L54
process := specs.Process{
Env: s.Process.Env,
}
err := json.Unmarshal(*s.Process.Capabilities, &process.Capabilities)
capabilities := specs.LinuxCapabilities{}
err := json.Unmarshal(*s.Process.Capabilities, &capabilities)
if err != nil {
log.Panicln("could not decode Process.Capabilities in OCI spec:", err)
}
fullSpec := specs.Spec{
Version: *s.Version,
Process: &process,
}
return image.IsPrivileged(&fullSpec)
return image.OCISpecCapabilities(capabilities).GetCapabilities()
}
func getDevicesFromEnvvar(containerImage image.CUDA, swarmResourceEnvvars []string) []string {
// We check if the image has at least one of the Swarm resource envvars defined and use this
// if specified.
for _, envvar := range swarmResourceEnvvars {
if containerImage.HasEnvvar(envvar) {
return containerImage.DevicesFromEnvvars(swarmResourceEnvvars...).List()
}
}
return containerImage.VisibleDevicesFromEnvVar()
}
func (hookConfig *hookConfig) getDevices(image image.CUDA, privileged bool) []string {
// If enabled, try and get the device list from volume mounts first
if hookConfig.AcceptDeviceListAsVolumeMounts {
devices := image.VisibleDevicesFromMounts()
if len(devices) > 0 {
return devices
}
}
// Fallback to reading from the environment variable if privileges are correct
devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars())
if len(devices) == 0 {
return nil
}
if privileged || hookConfig.AcceptEnvvarUnprivileged {
return devices
}
configName := hookConfig.getConfigOption("AcceptEnvvarUnprivileged")
log.Printf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES (privileged=%v, %v=%v) ", privileged, configName, hookConfig.AcceptEnvvarUnprivileged)
return nil
func isPrivileged(s *Spec) bool {
return image.IsPrivileged(s)
}
func getMigConfigDevices(i image.CUDA) *string {
@ -225,7 +176,6 @@ func (hookConfig *hookConfig) getDriverCapabilities(cudaImage image.CUDA, legacy
// We use the default driver capabilities by default. This is filtered to only include the
// supported capabilities
supportedDriverCapabilities := image.NewDriverCapabilities(hookConfig.SupportedDriverCapabilities)
capabilities := supportedDriverCapabilities.Intersection(image.DefaultDriverCapabilities)
capsEnvSpecified := cudaImage.HasEnvvar(image.EnvVarNvidiaDriverCapabilities)
@ -251,7 +201,7 @@ func (hookConfig *hookConfig) getDriverCapabilities(cudaImage image.CUDA, legacy
func (hookConfig *hookConfig) getNvidiaConfig(image image.CUDA, privileged bool) *nvidiaConfig {
legacyImage := image.IsLegacy()
devices := hookConfig.getDevices(image, privileged)
devices := image.VisibleDevices()
if len(devices) == 0 {
// empty devices means this is not a GPU container.
return nil
@ -306,20 +256,25 @@ func (hookConfig *hookConfig) getContainerConfig() (config containerConfig) {
s := loadSpec(path.Join(b, "config.json"))
image, err := image.New(
privileged := isPrivileged(s)
i, err := image.New(
image.WithEnv(s.Process.Env),
image.WithMounts(s.Mounts),
image.WithPrivileged(privileged),
image.WithDisableRequire(hookConfig.DisableRequire),
image.WithAcceptDeviceListAsVolumeMounts(hookConfig.AcceptDeviceListAsVolumeMounts),
image.WithAcceptEnvvarUnprivileged(hookConfig.AcceptEnvvarUnprivileged),
image.WithPreferredVisibleDevicesEnvVars(hookConfig.getSwarmResourceEnvvars()...),
)
if err != nil {
log.Panicln(err)
}
privileged := isPrivileged(s)
return containerConfig{
Pid: h.Pid,
Rootfs: s.Root.Path,
Image: image,
Nvidia: hookConfig.getNvidiaConfig(image, privileged),
Image: i,
Nvidia: hookConfig.getNvidiaConfig(i, privileged),
}
}

View File

@ -1,10 +1,8 @@
package main
import (
"path/filepath"
"testing"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/require"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
@ -479,7 +477,10 @@ func TestGetNvidiaConfig(t *testing.T) {
t.Run(tc.description, func(t *testing.T) {
image, _ := image.New(
image.WithEnvMap(tc.env),
image.WithPrivileged(tc.privileged),
image.WithPreferredVisibleDevicesEnvVars(tc.hookConfig.getSwarmResourceEnvvars()...),
)
// Wrap the call to getNvidiaConfig() in a closure.
var cfg *nvidiaConfig
getConfig := func() {
@ -518,340 +519,6 @@ func TestGetNvidiaConfig(t *testing.T) {
}
}
func TestDeviceListSourcePriority(t *testing.T) {
var tests = []struct {
description string
mountDevices []specs.Mount
envvarDevices string
privileged bool
acceptUnprivileged bool
acceptMounts bool
expectedDevices []string
}{
{
description: "Mount devices, unprivileged, no accept unprivileged",
mountDevices: []specs.Mount{
{
Source: "/dev/null",
Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU0"),
},
{
Source: "/dev/null",
Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU1"),
},
},
envvarDevices: "GPU2,GPU3",
privileged: false,
acceptUnprivileged: false,
acceptMounts: true,
expectedDevices: []string{"GPU0", "GPU1"},
},
{
description: "No mount devices, unprivileged, no accept unprivileged",
mountDevices: nil,
envvarDevices: "GPU0,GPU1",
privileged: false,
acceptUnprivileged: false,
acceptMounts: true,
expectedDevices: nil,
},
{
description: "No mount devices, privileged, no accept unprivileged",
mountDevices: nil,
envvarDevices: "GPU0,GPU1",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
expectedDevices: []string{"GPU0", "GPU1"},
},
{
description: "No mount devices, unprivileged, accept unprivileged",
mountDevices: nil,
envvarDevices: "GPU0,GPU1",
privileged: false,
acceptUnprivileged: true,
acceptMounts: true,
expectedDevices: []string{"GPU0", "GPU1"},
},
{
description: "Mount devices, unprivileged, accept unprivileged, no accept mounts",
mountDevices: []specs.Mount{
{
Source: "/dev/null",
Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU0"),
},
{
Source: "/dev/null",
Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU1"),
},
},
envvarDevices: "GPU2,GPU3",
privileged: false,
acceptUnprivileged: true,
acceptMounts: false,
expectedDevices: []string{"GPU2", "GPU3"},
},
{
description: "Mount devices, unprivileged, no accept unprivileged, no accept mounts",
mountDevices: []specs.Mount{
{
Source: "/dev/null",
Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU0"),
},
{
Source: "/dev/null",
Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU1"),
},
},
envvarDevices: "GPU2,GPU3",
privileged: false,
acceptUnprivileged: false,
acceptMounts: false,
expectedDevices: nil,
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
// Wrap the call to getDevices() in a closure.
var devices []string
getDevices := func() {
image, _ := image.New(
image.WithEnvMap(
map[string]string{
image.EnvVarNvidiaVisibleDevices: tc.envvarDevices,
},
),
image.WithMounts(tc.mountDevices),
)
defaultConfig, _ := config.GetDefault()
cfg := &hookConfig{defaultConfig}
cfg.AcceptEnvvarUnprivileged = tc.acceptUnprivileged
cfg.AcceptDeviceListAsVolumeMounts = tc.acceptMounts
devices = cfg.getDevices(image, tc.privileged)
}
// For all other tests, just grab the devices and check the results
getDevices()
require.Equal(t, tc.expectedDevices, devices)
})
}
}
func TestGetDevicesFromEnvvar(t *testing.T) {
envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
gpuID := "GPU-12345"
anotherGPUID := "GPU-67890"
thirdGPUID := "MIG-12345"
var tests = []struct {
description string
swarmResourceEnvvars []string
env map[string]string
expectedDevices []string
}{
{
description: "empty env returns nil for non-legacy image",
},
{
description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: "",
},
},
{
description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: "void",
},
},
{
description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image",
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: "none",
},
expectedDevices: []string{""},
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image",
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: gpuID,
},
expectedDevices: []string{gpuID},
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: gpuID,
image.EnvVarCudaVersion: "legacy",
},
expectedDevices: []string{gpuID},
},
{
description: "empty env returns all for legacy image",
env: map[string]string{
image.EnvVarCudaVersion: "legacy",
},
expectedDevices: []string{"all"},
},
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is ignored when
// not enabled
{
description: "missing NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
},
},
{
description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: "",
envDockerResourceGPUs: anotherGPUID,
},
},
{
description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: "void",
envDockerResourceGPUs: anotherGPUID,
},
},
{
description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image",
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: "none",
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: []string{""},
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image",
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: []string{gpuID},
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
image.EnvVarCudaVersion: "legacy",
},
expectedDevices: []string{gpuID},
},
{
description: "empty env returns all for legacy image",
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
image.EnvVarCudaVersion: "legacy",
},
expectedDevices: []string{"all"},
},
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
// enabled
{
description: "empty env returns nil for non-legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
},
{
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: "",
},
},
{
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: "void",
},
},
{
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: "none",
},
expectedDevices: []string{""},
},
{
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: gpuID,
},
expectedDevices: []string{gpuID},
},
{
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: gpuID,
image.EnvVarCudaVersion: "legacy",
},
expectedDevices: []string{gpuID},
},
{
description: "DOCKER_RESOURCE_GPUS is selected if present",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: []string{anotherGPUID},
},
{
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: []string{anotherGPUID},
},
{
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: []string{anotherGPUID},
},
{
description: "All available swarm resource envvars are selected and override NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS": thirdGPUID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: []string{thirdGPUID, anotherGPUID},
},
{
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS override NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
image.EnvVarNvidiaVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: []string{anotherGPUID},
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
image, _ := image.New(
image.WithEnvMap(tc.env),
)
devices := getDevicesFromEnvvar(image, tc.swarmResourceEnvvars)
require.EqualValues(t, tc.expectedDevices, devices)
})
}
}
func TestGetDriverCapabilities(t *testing.T) {
supportedCapabilities := "compute,display,utility,video"

View File

@ -88,7 +88,7 @@ func (c hookConfig) getConfigOption(fieldName string) string {
// getSwarmResourceEnvvars returns the swarm resource envvars for the config.
func (c *hookConfig) getSwarmResourceEnvvars() []string {
if c.SwarmResource == "" {
if c == nil || c.SwarmResource == "" {
return nil
}

View File

@ -21,22 +21,36 @@ import (
"strings"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
)
type builder struct {
env map[string]string
mounts []specs.Mount
CUDA
disableRequire bool
}
// Option is a functional option for creating a CUDA image.
type Option func(*builder) error
// New creates a new CUDA image from the input options.
func New(opt ...Option) (CUDA, error) {
b := &builder{}
b := &builder{
CUDA: CUDA{
acceptEnvvarUnprivileged: true,
},
}
for _, o := range opt {
if err := o(b); err != nil {
return CUDA{}, err
}
}
if b.logger == nil {
b.logger = logger.New()
}
if b.env == nil {
b.env = make(map[string]string)
}
@ -50,15 +64,22 @@ func (b builder) build() (CUDA, error) {
b.env[EnvVarNvidiaDisableRequire] = "true"
}
c := CUDA{
env: b.env,
mounts: b.mounts,
}
return c, nil
return b.CUDA, nil
}
// Option is a functional option for creating a CUDA image.
type Option func(*builder) error
func WithAcceptDeviceListAsVolumeMounts(acceptDeviceListAsVolumeMounts bool) Option {
return func(b *builder) error {
b.acceptDeviceListAsVolumeMounts = acceptDeviceListAsVolumeMounts
return nil
}
}
func WithAcceptEnvvarUnprivileged(acceptEnvvarUnprivileged bool) Option {
return func(b *builder) error {
b.acceptEnvvarUnprivileged = acceptEnvvarUnprivileged
return nil
}
}
// WithDisableRequire sets the disable require option.
func WithDisableRequire(disableRequire bool) Option {
@ -93,6 +114,14 @@ func WithEnvMap(env map[string]string) Option {
}
}
// WithLogger sets the logger to use when creating the CUDA image.
func WithLogger(logger logger.Interface) Option {
return func(b *builder) error {
b.logger = logger
return nil
}
}
// WithMounts sets the mounts associated with the CUDA image.
func WithMounts(mounts []specs.Mount) Option {
return func(b *builder) error {
@ -100,3 +129,20 @@ func WithMounts(mounts []specs.Mount) Option {
return nil
}
}
// WithPreferredVisibleDevicesEnvVars sets the environment variables that
// should take precedence over the default NVIDIA_VISIBLE_DEVICES.
func WithPreferredVisibleDevicesEnvVars(preferredVisibleDeviceEnvVars ...string) Option {
return func(b *builder) error {
b.preferredVisibleDeviceEnvVars = preferredVisibleDeviceEnvVars
return nil
}
}
// WithPrivileged sets whether an image is privileged or not.
func WithPrivileged(isPrivileged bool) Option {
return func(b *builder) error {
b.isPrivileged = isPrivileged
return nil
}
}

View File

@ -25,6 +25,8 @@ import (
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/mod/semver"
"tags.cncf.io/container-device-interface/pkg/parser"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
)
const (
@ -38,27 +40,37 @@ const (
// a map of environment variable to values that can be used to perform lookups
// such as requirements.
type CUDA struct {
logger logger.Interface
env map[string]string
isPrivileged bool
mounts []specs.Mount
acceptDeviceListAsVolumeMounts bool
acceptEnvvarUnprivileged bool
preferredVisibleDeviceEnvVars []string
}
// NewCUDAImageFromSpec creates a CUDA image from the input OCI runtime spec.
// The process environment is read (if present) to construc the CUDA Image.
func NewCUDAImageFromSpec(spec *specs.Spec) (CUDA, error) {
func NewCUDAImageFromSpec(spec *specs.Spec, opts ...Option) (CUDA, error) {
var env []string
if spec != nil && spec.Process != nil {
env = spec.Process.Env
}
return New(
specOpts := []Option{
WithEnv(env),
WithMounts(spec.Mounts),
)
WithPrivileged(IsPrivileged((*OCISpec)(spec))),
}
return New(append(opts, specOpts...)...)
}
// NewCUDAImageFromEnv creates a CUDA image from the input environment. The environment
// newCUDAImageFromEnv creates a CUDA image from the input environment. The environment
// is a list of strings of the form ENVAR=VALUE.
func NewCUDAImageFromEnv(env []string) (CUDA, error) {
func newCUDAImageFromEnv(env []string) (CUDA, error) {
return New(WithEnv(env))
}
@ -216,14 +228,53 @@ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
return hasCDIdevice
}
// VisibleDevicesFromEnvVar returns the set of visible devices requested through
// the NVIDIA_VISIBLE_DEVICES environment variable.
// VisibleDevices returns a list of devices requested in the container image.
// If volume mount requests are enabled these are returned if requested,
// otherwise device requests through environment variables are considered.
// In cases where environment variable requests required privileged containers,
// such devices requests are ignored.
func (i CUDA) VisibleDevices() []string {
// If enabled, try and get the device list from volume mounts first
if i.acceptDeviceListAsVolumeMounts {
volumeMountDeviceRequests := i.visibleDevicesFromMounts()
if len(volumeMountDeviceRequests) > 0 {
return volumeMountDeviceRequests
}
}
// Get the Fallback to reading from the environment variable if privileges are correct
envVarDeviceRequests := i.VisibleDevicesFromEnvVar()
if len(envVarDeviceRequests) == 0 {
return nil
}
// If the container is privileged, or environment variable requests are
// allowed for unprivileged containers, these devices are returned.
if i.isPrivileged || i.acceptEnvvarUnprivileged {
return envVarDeviceRequests
}
// We log a warning if we are ignoring the environment variable requests.
i.logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES in unprivileged container")
return nil
}
// VisibleDevicesFromEnvVar returns the set of visible devices requested through environment variables.
// If any of the preferredVisibleDeviceEnvVars are present in the image, they
// are used to determine the visible devices. If this is not the case, the
// NVIDIA_VISIBLE_DEVICES environment variable is used.
func (i CUDA) VisibleDevicesFromEnvVar() []string {
for _, envVar := range i.preferredVisibleDeviceEnvVars {
if i.HasEnvvar(envVar) {
return i.DevicesFromEnvvars(i.preferredVisibleDeviceEnvVars...).List()
}
}
return i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices).List()
}
// VisibleDevicesFromMounts returns the set of visible devices requested as mounts.
func (i CUDA) VisibleDevicesFromMounts() []string {
// visibleDevicesFromMounts returns the set of visible devices requested as mounts.
func (i CUDA) visibleDevicesFromMounts() []string {
var devices []string
for _, device := range i.DevicesFromMounts() {
switch {
@ -238,7 +289,6 @@ func (i CUDA) VisibleDevicesFromMounts() []string {
}
// DevicesFromMounts returns a list of device specified as mounts.
// TODO: This should be merged with getDevicesFromMounts used in the NVIDIA Container Runtime
func (i CUDA) DevicesFromMounts() []string {
root := filepath.Clean(DeviceListAsVolumeMountsRoot)
seen := make(map[string]bool)

View File

@ -21,9 +21,91 @@ import (
"testing"
"github.com/opencontainers/runtime-spec/specs-go"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
)
func TestNewCUDAImageFromSpec(t *testing.T) {
logger, _ := testlog.NewNullLogger()
testCases := []struct {
description string
spec *specs.Spec
options []Option
expected CUDA
}{
{
description: "no env vars",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{},
},
},
expected: CUDA{
logger: logger,
env: map[string]string{},
acceptEnvvarUnprivileged: true,
},
},
{
description: "NVIDIA_VISIBLE_DEVICES=all",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=all"},
},
},
expected: CUDA{
logger: logger,
env: map[string]string{"NVIDIA_VISIBLE_DEVICES": "all"},
acceptEnvvarUnprivileged: true,
},
},
{
description: "Spec overrides options",
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=all"},
},
Mounts: []specs.Mount{
{
Source: "/spec-source",
Destination: "/spec-destination",
},
},
},
options: []Option{
WithEnvMap(map[string]string{"OTHER": "value"}),
WithMounts([]specs.Mount{
{
Source: "/option-source",
Destination: "/option-destination",
},
}),
},
expected: CUDA{
logger: logger,
env: map[string]string{"NVIDIA_VISIBLE_DEVICES": "all"},
mounts: []specs.Mount{
{
Source: "/spec-source",
Destination: "/spec-destination",
},
},
acceptEnvvarUnprivileged: true,
},
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
options := append([]Option{WithLogger(logger)}, tc.options...)
image, err := NewCUDAImageFromSpec(tc.spec, options...)
require.NoError(t, err)
require.EqualValues(t, tc.expected, image)
})
}
}
func TestParseMajorMinorVersionValid(t *testing.T) {
var tests = []struct {
version string
@ -122,7 +204,7 @@ func TestGetRequirements(t *testing.T) {
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
image, err := NewCUDAImageFromEnv(tc.env)
image, err := newCUDAImageFromEnv(tc.env)
require.NoError(t, err)
requirements, err := image.GetRequirements()
@ -133,6 +215,226 @@ func TestGetRequirements(t *testing.T) {
}
}
func TestGetDevicesFromEnvvar(t *testing.T) {
envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
gpuID := "GPU-12345"
anotherGPUID := "GPU-67890"
thirdGPUID := "MIG-12345"
var tests = []struct {
description string
preferredVisibleDeviceEnvVars []string
env map[string]string
expectedDevices []string
}{
{
description: "empty env returns nil for non-legacy image",
},
{
description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
EnvVarNvidiaVisibleDevices: "",
},
},
{
description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
EnvVarNvidiaVisibleDevices: "void",
},
},
{
description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image",
env: map[string]string{
EnvVarNvidiaVisibleDevices: "none",
},
expectedDevices: []string{""},
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image",
env: map[string]string{
EnvVarNvidiaVisibleDevices: gpuID,
},
expectedDevices: []string{gpuID},
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
env: map[string]string{
EnvVarNvidiaVisibleDevices: gpuID,
EnvVarCudaVersion: "legacy",
},
expectedDevices: []string{gpuID},
},
{
description: "empty env returns all for legacy image",
env: map[string]string{
EnvVarCudaVersion: "legacy",
},
expectedDevices: []string{"all"},
},
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is ignored when
// not enabled
{
description: "missing NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
},
},
{
description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
EnvVarNvidiaVisibleDevices: "",
envDockerResourceGPUs: anotherGPUID,
},
},
{
description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
EnvVarNvidiaVisibleDevices: "void",
envDockerResourceGPUs: anotherGPUID,
},
},
{
description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image",
env: map[string]string{
EnvVarNvidiaVisibleDevices: "none",
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: []string{""},
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image",
env: map[string]string{
EnvVarNvidiaVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: []string{gpuID},
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
env: map[string]string{
EnvVarNvidiaVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
EnvVarCudaVersion: "legacy",
},
expectedDevices: []string{gpuID},
},
{
description: "empty env returns all for legacy image",
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
EnvVarCudaVersion: "legacy",
},
expectedDevices: []string{"all"},
},
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
// enabled
{
description: "empty env returns nil for non-legacy image",
preferredVisibleDeviceEnvVars: []string{envDockerResourceGPUs},
},
{
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
preferredVisibleDeviceEnvVars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: "",
},
},
{
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
preferredVisibleDeviceEnvVars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: "void",
},
},
{
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
preferredVisibleDeviceEnvVars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: "none",
},
expectedDevices: []string{""},
},
{
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
preferredVisibleDeviceEnvVars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: gpuID,
},
expectedDevices: []string{gpuID},
},
{
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
preferredVisibleDeviceEnvVars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: gpuID,
EnvVarCudaVersion: "legacy",
},
expectedDevices: []string{gpuID},
},
{
description: "DOCKER_RESOURCE_GPUS is selected if present",
preferredVisibleDeviceEnvVars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: []string{anotherGPUID},
},
{
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
preferredVisibleDeviceEnvVars: []string{envDockerResourceGPUs},
env: map[string]string{
EnvVarNvidiaVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: []string{anotherGPUID},
},
{
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present",
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
EnvVarNvidiaVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: []string{anotherGPUID},
},
{
description: "All available swarm resource envvars are selected and override NVIDIA_VISIBLE_DEVICES if present",
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
EnvVarNvidiaVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS": thirdGPUID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: []string{thirdGPUID, anotherGPUID},
},
{
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS override NVIDIA_VISIBLE_DEVICES if present",
preferredVisibleDeviceEnvVars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
EnvVarNvidiaVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: []string{anotherGPUID},
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
image, err := New(
WithEnvMap(tc.env),
WithPrivileged(true),
WithAcceptDeviceListAsVolumeMounts(false),
WithAcceptEnvvarUnprivileged(false),
WithPreferredVisibleDevicesEnvVars(tc.preferredVisibleDeviceEnvVars...),
)
require.NoError(t, err)
devices := image.VisibleDevicesFromEnvVar()
require.EqualValues(t, tc.expectedDevices, devices)
})
}
}
func TestGetVisibleDevicesFromMounts(t *testing.T) {
var tests = []struct {
description string
@ -197,8 +499,121 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) {
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
image, _ := New(WithMounts(tc.mounts))
require.Equal(t, tc.expectedDevices, image.VisibleDevicesFromMounts())
image, err := New(WithMounts(tc.mounts))
require.NoError(t, err)
require.Equal(t, tc.expectedDevices, image.visibleDevicesFromMounts())
})
}
}
func TestVisibleDevices(t *testing.T) {
var tests = []struct {
description string
mountDevices []specs.Mount
envvarDevices string
privileged bool
acceptUnprivileged bool
acceptMounts bool
expectedDevices []string
}{
{
description: "Mount devices, unprivileged, no accept unprivileged",
mountDevices: []specs.Mount{
{
Source: "/dev/null",
Destination: filepath.Join(DeviceListAsVolumeMountsRoot, "GPU0"),
},
{
Source: "/dev/null",
Destination: filepath.Join(DeviceListAsVolumeMountsRoot, "GPU1"),
},
},
envvarDevices: "GPU2,GPU3",
privileged: false,
acceptUnprivileged: false,
acceptMounts: true,
expectedDevices: []string{"GPU0", "GPU1"},
},
{
description: "No mount devices, unprivileged, no accept unprivileged",
mountDevices: nil,
envvarDevices: "GPU0,GPU1",
privileged: false,
acceptUnprivileged: false,
acceptMounts: true,
expectedDevices: nil,
},
{
description: "No mount devices, privileged, no accept unprivileged",
mountDevices: nil,
envvarDevices: "GPU0,GPU1",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
expectedDevices: []string{"GPU0", "GPU1"},
},
{
description: "No mount devices, unprivileged, accept unprivileged",
mountDevices: nil,
envvarDevices: "GPU0,GPU1",
privileged: false,
acceptUnprivileged: true,
acceptMounts: true,
expectedDevices: []string{"GPU0", "GPU1"},
},
{
description: "Mount devices, unprivileged, accept unprivileged, no accept mounts",
mountDevices: []specs.Mount{
{
Source: "/dev/null",
Destination: filepath.Join(DeviceListAsVolumeMountsRoot, "GPU0"),
},
{
Source: "/dev/null",
Destination: filepath.Join(DeviceListAsVolumeMountsRoot, "GPU1"),
},
},
envvarDevices: "GPU2,GPU3",
privileged: false,
acceptUnprivileged: true,
acceptMounts: false,
expectedDevices: []string{"GPU2", "GPU3"},
},
{
description: "Mount devices, unprivileged, no accept unprivileged, no accept mounts",
mountDevices: []specs.Mount{
{
Source: "/dev/null",
Destination: filepath.Join(DeviceListAsVolumeMountsRoot, "GPU0"),
},
{
Source: "/dev/null",
Destination: filepath.Join(DeviceListAsVolumeMountsRoot, "GPU1"),
},
},
envvarDevices: "GPU2,GPU3",
privileged: false,
acceptUnprivileged: false,
acceptMounts: false,
expectedDevices: nil,
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
// Wrap the call to getDevices() in a closure.
image, err := New(
WithEnvMap(
map[string]string{
EnvVarNvidiaVisibleDevices: tc.envvarDevices,
},
),
WithMounts(tc.mountDevices),
WithPrivileged(tc.privileged),
WithAcceptDeviceListAsVolumeMounts(tc.acceptMounts),
WithAcceptEnvvarUnprivileged(tc.acceptUnprivileged),
)
require.NoError(t, err)
require.Equal(t, tc.expectedDevices, image.VisibleDevices())
})
}
}
@ -224,7 +639,7 @@ func TestImexChannelsFromEnvVar(t *testing.T) {
for _, tc := range testCases {
for id, baseEnvvars := range map[string][]string{"": nil, "legacy": {"CUDA_VERSION=1.2.3"}} {
t.Run(tc.description+id, func(t *testing.T) {
i, err := NewCUDAImageFromEnv(append(baseEnvvars, tc.env...))
i, err := newCUDAImageFromEnv(append(baseEnvvars, tc.env...))
require.NoError(t, err)
channels := i.ImexChannelsFromEnvVar()

View File

@ -24,20 +24,39 @@ const (
capSysAdmin = "CAP_SYS_ADMIN"
)
type CapabilitiesGetter interface {
GetCapabilities() []string
}
type OCISpec specs.Spec
type OCISpecCapabilities specs.LinuxCapabilities
// IsPrivileged returns true if the container is a privileged container.
func IsPrivileged(s *specs.Spec) bool {
if s.Process.Capabilities == nil {
func IsPrivileged(s CapabilitiesGetter) bool {
if s == nil {
return false
}
// We only make sure that the bounding capabibility set has
// CAP_SYS_ADMIN. This allows us to make sure that the container was
// actually started as '--privileged', but also allow non-root users to
// access the privileged NVIDIA capabilities.
for _, c := range s.Process.Capabilities.Bounding {
for _, c := range s.GetCapabilities() {
if c == capSysAdmin {
return true
}
}
return false
}
func (s OCISpec) GetCapabilities() []string {
if s.Process == nil || s.Process.Capabilities == nil {
return nil
}
return (*OCISpecCapabilities)(s.Process.Capabilities).GetCapabilities()
}
func (c OCISpecCapabilities) GetCapabilities() []string {
// We only make sure that the bounding capability set has
// CAP_SYS_ADMIN. This allows us to make sure that the container was
// actually started as '--privileged', but also allow non-root users to
// access the privileged NVIDIA capabilities.
return c.Bounding
}

View File

@ -0,0 +1,57 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package image
import (
"testing"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/stretchr/testify/require"
)
func TestIsPrivileged(t *testing.T) {
var tests = []struct {
spec specs.Spec
expected bool
}{
{
specs.Spec{
Process: &specs.Process{
Capabilities: &specs.LinuxCapabilities{
Bounding: []string{"CAP_SYS_ADMIN"},
},
},
},
true,
},
{
specs.Spec{
Process: &specs.Process{
Capabilities: &specs.LinuxCapabilities{
Bounding: []string{"CAP_SYS_FOO"},
},
},
},
false,
},
}
for i, tc := range tests {
privileged := IsPrivileged((*OCISpec)(&tc.spec))
require.Equal(t, tc.expected, privileged, "%d: %v", i, tc)
}
}

View File

@ -79,7 +79,10 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C
return annotationDevices, nil
}
container, err := image.NewCUDAImageFromSpec(rawSpec)
container, err := image.NewCUDAImageFromSpec(
rawSpec,
image.WithLogger(logger),
)
if err != nil {
return nil, err
}
@ -107,7 +110,7 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C
return nil, nil
}
if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged(rawSpec) {
if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged((*image.OCISpec)(rawSpec)) {
return devices, nil
}

View File

@ -70,7 +70,10 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
}
image, err := image.NewCUDAImageFromSpec(rawSpec)
image, err := image.NewCUDAImageFromSpec(
rawSpec,
image.WithLogger(logger),
)
if err != nil {
return nil, err
}