diff --git a/cmd/nvidia-container-runtime-hook/container_config.go b/cmd/nvidia-container-runtime-hook/container_config.go index fa39bf2f..ac34fb93 100644 --- a/cmd/nvidia-container-runtime-hook/container_config.go +++ b/cmd/nvidia-container-runtime-hook/container_config.go @@ -6,8 +6,6 @@ import ( "log" "os" "path" - "path/filepath" - "strings" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/mod/semver" @@ -15,31 +13,15 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" ) -const ( - envCUDAVersion = "CUDA_VERSION" - envNVRequirePrefix = "NVIDIA_REQUIRE_" - envNVRequireCUDA = envNVRequirePrefix + "CUDA" - envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE" - envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES" - envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES" - envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES" - envNVImexChannels = "NVIDIA_IMEX_CHANNELS" - envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" -) - const ( capSysAdmin = "CAP_SYS_ADMIN" ) -const ( - deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices" -) - type nvidiaConfig struct { - Devices string + Devices []string MigConfigDevices string MigMonitorDevices string - ImexChannels string + ImexChannels []string DriverCapabilities string // Requirements defines the requirements DSL for the container to run. // This is empty if no specific requirements are needed, or if requirements are @@ -77,23 +59,14 @@ type LinuxCapabilities struct { Ambient []string `json:"ambient,omitempty" platform:"linux"` } -// Mount from OCI runtime spec -// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L103 -type Mount struct { - Destination string `json:"destination"` - Type string `json:"type,omitempty" platform:"linux,solaris"` - Source string `json:"source,omitempty"` - Options []string `json:"options,omitempty"` -} - // Spec from OCI runtime spec // We use pointers to structs, similarly to the latest version of runtime-spec: // https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L5-L28 type Spec struct { - Version *string `json:"ociVersion"` - Process *Process `json:"process,omitempty"` - Root *Root `json:"root,omitempty"` - Mounts []Mount `json:"mounts,omitempty"` + Version *string `json:"ociVersion"` + Process *Process `json:"process,omitempty"` + Root *Root `json:"root,omitempty"` + Mounts []specs.Mount `json:"mounts,omitempty"` } // HookState holds state information about the hook @@ -172,82 +145,30 @@ func isPrivileged(s *Spec) bool { return image.IsPrivileged(&fullSpec) } -func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string { +func getDevicesFromEnvvar(containerImage image.CUDA, swarmResourceEnvvars []string) []string { // We check if the image has at least one of the Swarm resource envvars defined and use this // if specified. - var hasSwarmEnvvar bool for _, envvar := range swarmResourceEnvvars { - if image.HasEnvvar(envvar) { - hasSwarmEnvvar = true - break + if containerImage.HasEnvvar(envvar) { + return containerImage.DevicesFromEnvvars(swarmResourceEnvvars...).List() } } - var devices []string - if hasSwarmEnvvar { - devices = image.DevicesFromEnvvars(swarmResourceEnvvars...).List() - } else { - devices = image.DevicesFromEnvvars(envNVVisibleDevices).List() - } - - if len(devices) == 0 { - return nil - } - - devicesString := strings.Join(devices, ",") - - return &devicesString + return containerImage.VisibleDevicesFromEnvVar() } -func getDevicesFromMounts(mounts []Mount) *string { - var devices []string - for _, m := range mounts { - root := filepath.Clean(deviceListAsVolumeMountsRoot) - source := filepath.Clean(m.Source) - destination := filepath.Clean(m.Destination) - - // Only consider mounts who's host volume is /dev/null - if source != "/dev/null" { - continue - } - // Only consider container mount points that begin with 'root' - if len(destination) < len(root) { - continue - } - if destination[:len(root)] != root { - continue - } - // Grab the full path beyond 'root' and add it to the list of devices - device := destination[len(root):] - if len(device) > 0 && device[0] == '/' { - device = device[1:] - } - if len(device) == 0 { - continue - } - devices = append(devices, device) - } - - if devices == nil { - return nil - } - - ret := strings.Join(devices, ",") - return &ret -} - -func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *string { +func getDevices(hookConfig *HookConfig, image image.CUDA, privileged bool) []string { // If enabled, try and get the device list from volume mounts first if hookConfig.AcceptDeviceListAsVolumeMounts { - devices := getDevicesFromMounts(mounts) - if devices != nil { + devices := image.VisibleDevicesFromMounts() + if len(devices) > 0 { return devices } } // Fallback to reading from the environment variable if privileges are correct devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars()) - if devices == nil { + if len(devices) == 0 { return nil } if privileged || hookConfig.AcceptEnvvarUnprivileged { @@ -260,12 +181,12 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil return nil } -func getMigConfigDevices(image image.CUDA) *string { - return getMigDevices(image, envNVMigConfigDevices) +func getMigConfigDevices(i image.CUDA) *string { + return getMigDevices(i, image.EnvVarNvidiaMigConfigDevices) } -func getMigMonitorDevices(image image.CUDA) *string { - return getMigDevices(image, envNVMigMonitorDevices) +func getMigMonitorDevices(i image.CUDA) *string { + return getMigDevices(i, image.EnvVarNvidiaMigMonitorDevices) } func getMigDevices(image image.CUDA, envvar string) *string { @@ -276,12 +197,24 @@ func getMigDevices(image image.CUDA, envvar string) *string { return &devices } -func getImexChannels(image image.CUDA) *string { - if !image.HasEnvvar(envNVImexChannels) { +func getImexChannels(hookConfig *HookConfig, image image.CUDA, privileged bool) []string { + // If enabled, try and get the device list from volume mounts first + if hookConfig.AcceptDeviceListAsVolumeMounts { + devices := image.ImexChannelsFromMounts() + if len(devices) > 0 { + return devices + } + } + devices := image.ImexChannelsFromEnvVar() + if len(devices) == 0 { return nil } - chans := image.Getenv(envNVImexChannels) - return &chans + + if privileged || hookConfig.AcceptEnvvarUnprivileged { + return devices + } + + return nil } func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage bool) image.DriverCapabilities { @@ -291,8 +224,8 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo capabilities := supportedDriverCapabilities.Intersection(image.DefaultDriverCapabilities) - capsEnvSpecified := cudaImage.HasEnvvar(envNVDriverCapabilities) - capsEnv := cudaImage.Getenv(envNVDriverCapabilities) + capsEnvSpecified := cudaImage.HasEnvvar(image.EnvVarNvidiaDriverCapabilities) + capsEnv := cudaImage.Getenv(image.EnvVarNvidiaDriverCapabilities) if !capsEnvSpecified && legacyImage { // Environment variable unset with legacy image: set all capabilities. @@ -311,14 +244,12 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo return capabilities } -func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *nvidiaConfig { +func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, privileged bool) *nvidiaConfig { legacyImage := image.IsLegacy() - var devices string - if d := getDevices(hookConfig, image, mounts, privileged); d != nil { - devices = *d - } else { - // 'nil' devices means this is not a GPU container. + devices := getDevices(hookConfig, image, privileged) + if len(devices) == 0 { + // empty devices means this is not a GPU container. return nil } @@ -338,10 +269,7 @@ func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, p log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container") } - var imexChannels string - if c := getImexChannels(image); c != nil { - imexChannels = *c - } + imexChannels := getImexChannels(hookConfig, image, privileged) driverCapabilities := hookConfig.getDriverCapabilities(image, legacyImage).String() @@ -376,6 +304,7 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { image, err := image.New( image.WithEnv(s.Process.Env), + image.WithMounts(s.Mounts), image.WithDisableRequire(hook.DisableRequire), ) if err != nil { @@ -387,6 +316,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { Pid: h.Pid, Rootfs: s.Root.Path, Image: image, - Nvidia: getNvidiaConfig(&hook, image, s.Mounts, privileged), + Nvidia: getNvidiaConfig(&hook, image, privileged), } } diff --git a/cmd/nvidia-container-runtime-hook/container_config_test.go b/cmd/nvidia-container-runtime-hook/container_config_test.go index 43fac8aa..cad9ca6e 100644 --- a/cmd/nvidia-container-runtime-hook/container_config_test.go +++ b/cmd/nvidia-container-runtime-hook/container_config_test.go @@ -1,10 +1,10 @@ package main import ( - "fmt" "path/filepath" "testing" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/require" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" @@ -34,11 +34,11 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, no devices, no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", + image.EnvVarCudaVersion: "9.0", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -46,12 +46,12 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices 'all', no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "all", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "all", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -59,8 +59,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices 'empty', no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "", }, privileged: false, expectedConfig: nil, @@ -68,8 +68,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices 'void', no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "void", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "void", }, privileged: false, expectedConfig: nil, @@ -77,12 +77,12 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices 'none', no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "none", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "none", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "", + Devices: []string{""}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -90,12 +90,12 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -103,13 +103,13 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, capabilities 'empty', no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -117,13 +117,13 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, capabilities 'all', no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "all", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "all", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -131,13 +131,13 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, capabilities set, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{"cuda>=9.0"}, }, @@ -145,15 +145,15 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, capabilities set, requirements set", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", - envNVRequirePrefix + "REQ0": "req0=true", - envNVRequirePrefix + "REQ1": "req1=false", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", + image.NvidiaRequirePrefix + "REQ0": "req0=true", + image.NvidiaRequirePrefix + "REQ1": "req1=false", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"}, }, @@ -161,33 +161,33 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, capabilities set, requirements set, disable requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", - envNVRequirePrefix + "REQ0": "req0=true", - envNVRequirePrefix + "REQ1": "req1=false", - envNVDisableRequire: "true", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", + image.NvidiaRequirePrefix + "REQ0": "req0=true", + image.NvidiaRequirePrefix + "REQ1": "req1=false", + image.EnvVarNvidiaDisableRequire: "true", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{}, }, }, { - description: "Modern image, no devices, no capabilities, no requirements, no envCUDAVersion", + description: "Modern image, no devices, no capabilities, no requirements, no image.EnvVarCudaVersion", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", }, privileged: false, expectedConfig: nil, }, { - description: "Modern image, no devices, no capabilities, no requirement, envCUDAVersion set", + description: "Modern image, no devices, no capabilities, no requirement, image.EnvVarCudaVersion set", env: map[string]string{ - envCUDAVersion: "9.0", - envNVRequireCUDA: "cuda>=9.0", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", }, privileged: false, expectedConfig: nil, @@ -195,12 +195,12 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'all', no capabilities, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "all", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "all", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -208,8 +208,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'empty', no capabilities, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "", }, privileged: false, expectedConfig: nil, @@ -217,8 +217,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'void', no capabilities, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "void", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "void", }, privileged: false, expectedConfig: nil, @@ -226,12 +226,12 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'none', no capabilities, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "none", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "none", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "", + Devices: []string{""}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -239,12 +239,12 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, no capabilities, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -252,13 +252,13 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, capabilities 'empty', no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -266,13 +266,13 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, capabilities 'all', no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "all", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "all", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -280,13 +280,13 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, capabilities set, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{"cuda>=9.0"}, }, @@ -294,15 +294,15 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, capabilities set, requirements set", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", - envNVRequirePrefix + "REQ0": "req0=true", - envNVRequirePrefix + "REQ1": "req1=false", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", + image.NvidiaRequirePrefix + "REQ0": "req0=true", + image.NvidiaRequirePrefix + "REQ1": "req1=false", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"}, }, @@ -310,16 +310,16 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, capabilities set, requirements set, disable requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", - envNVRequirePrefix + "REQ0": "req0=true", - envNVRequirePrefix + "REQ1": "req1=false", - envNVDisableRequire: "true", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", + image.NvidiaRequirePrefix + "REQ0": "req0=true", + image.NvidiaRequirePrefix + "REQ1": "req1=false", + image.EnvVarNvidiaDisableRequire: "true", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{}, }, @@ -327,12 +327,12 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "No cuda envs, devices 'all'", env: map[string]string{ - envNVVisibleDevices: "all", + image.EnvVarNvidiaVisibleDevices: "all", }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{}, }, @@ -340,13 +340,13 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'all', migConfig set, privileged", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "all", - envNVMigConfigDevices: "mig0,mig1", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaMigConfigDevices: "mig0,mig1", }, privileged: true, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, MigConfigDevices: "mig0,mig1", DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, @@ -355,9 +355,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'all', migConfig set, unprivileged", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "all", - envNVMigConfigDevices: "mig0,mig1", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaMigConfigDevices: "mig0,mig1", }, privileged: false, expectedPanic: true, @@ -365,13 +365,13 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'all', migMonitor set, privileged", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "all", - envNVMigMonitorDevices: "mig0,mig1", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaMigMonitorDevices: "mig0,mig1", }, privileged: true, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, MigMonitorDevices: "mig0,mig1", DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, @@ -380,9 +380,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'all', migMonitor set, unprivileged", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "all", - envNVMigMonitorDevices: "mig0,mig1", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaMigMonitorDevices: "mig0,mig1", }, privileged: false, expectedPanic: true, @@ -390,52 +390,52 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Hook config set as driver-capabilities-all", env: map[string]string{ - envNVVisibleDevices: "all", - envNVDriverCapabilities: "all", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaDriverCapabilities: "all", }, privileged: true, hookConfig: &HookConfig{ SupportedDriverCapabilities: "video,display", }, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: "display,video", }, }, { description: "Hook config set, envvar sets driver-capabilities", env: map[string]string{ - envNVVisibleDevices: "all", - envNVDriverCapabilities: "video,display", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaDriverCapabilities: "video,display", }, privileged: true, hookConfig: &HookConfig{ SupportedDriverCapabilities: "video,display,compute,utility", }, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: "display,video", }, }, { description: "Hook config set, envvar unset sets default driver-capabilities", env: map[string]string{ - envNVVisibleDevices: "all", + image.EnvVarNvidiaVisibleDevices: "all", }, privileged: true, hookConfig: &HookConfig{ SupportedDriverCapabilities: "video,display,utility,compute", }, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), }, }, { description: "Hook config set, swarmResource overrides device selection", env: map[string]string{ - envNVVisibleDevices: "all", - "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", + image.EnvVarNvidiaVisibleDevices: "all", + "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", }, privileged: true, hookConfig: &HookConfig{ @@ -443,15 +443,15 @@ func TestGetNvidiaConfig(t *testing.T) { SupportedDriverCapabilities: "video,display,utility,compute", }, expectedConfig: &nvidiaConfig{ - Devices: "GPU1,GPU2", + Devices: []string{"GPU1", "GPU2"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), }, }, { description: "Hook config set, comma separated swarmResource is split and overrides device selection", env: map[string]string{ - envNVVisibleDevices: "all", - "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", + image.EnvVarNvidiaVisibleDevices: "all", + "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", }, privileged: true, hookConfig: &HookConfig{ @@ -459,7 +459,7 @@ func TestGetNvidiaConfig(t *testing.T) { SupportedDriverCapabilities: "video,display,utility,compute", }, expectedConfig: &nvidiaConfig{ - Devices: "GPU1,GPU2", + Devices: []string{"GPU1", "GPU2"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), }, }, @@ -477,7 +477,7 @@ func TestGetNvidiaConfig(t *testing.T) { defaultConfig, _ := getDefaultHookConfig() hookConfig = &defaultConfig } - config = getNvidiaConfig(hookConfig, image, nil, tc.privileged) + config = getNvidiaConfig(hookConfig, image, tc.privileged) } // For any tests that are expected to panic, make sure they do. @@ -507,111 +507,33 @@ func TestGetNvidiaConfig(t *testing.T) { } } -func TestGetDevicesFromMounts(t *testing.T) { - var tests = []struct { - description string - mounts []Mount - expectedDevices *string - }{ - { - description: "No mounts", - mounts: nil, - expectedDevices: nil, - }, - { - description: "Host path is not /dev/null", - mounts: []Mount{ - { - Source: "/not/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"), - }, - }, - expectedDevices: nil, - }, - { - description: "Container path is not prefixed by 'root'", - mounts: []Mount{ - { - Source: "/dev/null", - Destination: filepath.Join("/other/prefix", "GPU0"), - }, - }, - expectedDevices: nil, - }, - { - description: "Container path is only 'root'", - mounts: []Mount{ - { - Source: "/dev/null", - Destination: deviceListAsVolumeMountsRoot, - }, - }, - expectedDevices: nil, - }, - { - description: "Discover 2 devices", - mounts: []Mount{ - { - Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"), - }, - { - Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"), - }, - }, - expectedDevices: &[]string{"GPU0,GPU1"}[0], - }, - { - description: "Discover 2 devices with slashes in the name", - mounts: []Mount{ - { - Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0-MIG0/0/1"), - }, - { - Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1-MIG0/0/1"), - }, - }, - expectedDevices: &[]string{"GPU0-MIG0/0/1,GPU1-MIG0/0/1"}[0], - }, - } - for _, tc := range tests { - t.Run(tc.description, func(t *testing.T) { - devices := getDevicesFromMounts(tc.mounts) - require.Equal(t, tc.expectedDevices, devices) - }) - } -} - func TestDeviceListSourcePriority(t *testing.T) { var tests = []struct { description string - mountDevices []Mount + mountDevices []specs.Mount envvarDevices string privileged bool acceptUnprivileged bool acceptMounts bool - expectedDevices *string + expectedDevices []string }{ { description: "Mount devices, unprivileged, no accept unprivileged", - mountDevices: []Mount{ + mountDevices: []specs.Mount{ { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU0"), }, { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU1"), }, }, envvarDevices: "GPU2,GPU3", privileged: false, acceptUnprivileged: false, acceptMounts: true, - expectedDevices: &[]string{"GPU0,GPU1"}[0], + expectedDevices: []string{"GPU0", "GPU1"}, }, { description: "No mount devices, unprivileged, no accept unprivileged", @@ -629,7 +551,7 @@ func TestDeviceListSourcePriority(t *testing.T) { privileged: true, acceptUnprivileged: false, acceptMounts: true, - expectedDevices: &[]string{"GPU0,GPU1"}[0], + expectedDevices: []string{"GPU0", "GPU1"}, }, { description: "No mount devices, unprivileged, accept unprivileged", @@ -638,36 +560,36 @@ func TestDeviceListSourcePriority(t *testing.T) { privileged: false, acceptUnprivileged: true, acceptMounts: true, - expectedDevices: &[]string{"GPU0,GPU1"}[0], + expectedDevices: []string{"GPU0", "GPU1"}, }, { description: "Mount devices, unprivileged, accept unprivileged, no accept mounts", - mountDevices: []Mount{ + mountDevices: []specs.Mount{ { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU0"), }, { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU1"), }, }, envvarDevices: "GPU2,GPU3", privileged: false, acceptUnprivileged: true, acceptMounts: false, - expectedDevices: &[]string{"GPU2,GPU3"}[0], + expectedDevices: []string{"GPU2", "GPU3"}, }, { description: "Mount devices, unprivileged, no accept unprivileged, no accept mounts", - mountDevices: []Mount{ + mountDevices: []specs.Mount{ { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU0"), }, { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU1"), }, }, envvarDevices: "GPU2,GPU3", @@ -680,19 +602,20 @@ func TestDeviceListSourcePriority(t *testing.T) { for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { // Wrap the call to getDevices() in a closure. - var devices *string + var devices []string getDevices := func() { image, _ := image.New( image.WithEnvMap( map[string]string{ - envNVVisibleDevices: tc.envvarDevices, + image.EnvVarNvidiaVisibleDevices: tc.envvarDevices, }, ), + image.WithMounts(tc.mountDevices), ) hookConfig, _ := getDefaultHookConfig() hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged hookConfig.AcceptDeviceListAsVolumeMounts = tc.acceptMounts - devices = getDevices(&hookConfig, image, tc.mountDevices, tc.privileged) + devices = getDevices(&hookConfig, image, tc.privileged) } // For all other tests, just grab the devices and check the results @@ -704,8 +627,6 @@ func TestDeviceListSourcePriority(t *testing.T) { } func TestGetDevicesFromEnvvar(t *testing.T) { - all := "all" - empty := "" envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS" gpuID := "GPU-12345" anotherGPUID := "GPU-67890" @@ -715,7 +636,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { description string swarmResourceEnvvars []string env map[string]string - expectedDevices *string + expectedDevices []string }{ { description: "empty env returns nil for non-legacy image", @@ -723,43 +644,43 @@ func TestGetDevicesFromEnvvar(t *testing.T) { { description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "", + image.EnvVarNvidiaVisibleDevices: "", }, }, { description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "void", + image.EnvVarNvidiaVisibleDevices: "void", }, }, { description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "none", + image.EnvVarNvidiaVisibleDevices: "none", }, - expectedDevices: &empty, + expectedDevices: []string{""}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image", env: map[string]string{ - envNVVisibleDevices: gpuID, + image.EnvVarNvidiaVisibleDevices: gpuID, }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image", env: map[string]string{ - envNVVisibleDevices: gpuID, - envCUDAVersion: "legacy", + image.EnvVarNvidiaVisibleDevices: gpuID, + image.EnvVarCudaVersion: "legacy", }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "empty env returns all for legacy image", env: map[string]string{ - envCUDAVersion: "legacy", + image.EnvVarCudaVersion: "legacy", }, - expectedDevices: &all, + expectedDevices: []string{"all"}, }, // Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is ignored when // not enabled @@ -772,49 +693,49 @@ func TestGetDevicesFromEnvvar(t *testing.T) { { description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "", - envDockerResourceGPUs: anotherGPUID, + image.EnvVarNvidiaVisibleDevices: "", + envDockerResourceGPUs: anotherGPUID, }, }, { description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "void", - envDockerResourceGPUs: anotherGPUID, + image.EnvVarNvidiaVisibleDevices: "void", + envDockerResourceGPUs: anotherGPUID, }, }, { description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "none", - envDockerResourceGPUs: anotherGPUID, + image.EnvVarNvidiaVisibleDevices: "none", + envDockerResourceGPUs: anotherGPUID, }, - expectedDevices: &empty, + expectedDevices: []string{""}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image", env: map[string]string{ - envNVVisibleDevices: gpuID, - envDockerResourceGPUs: anotherGPUID, + image.EnvVarNvidiaVisibleDevices: gpuID, + envDockerResourceGPUs: anotherGPUID, }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image", env: map[string]string{ - envNVVisibleDevices: gpuID, - envDockerResourceGPUs: anotherGPUID, - envCUDAVersion: "legacy", + image.EnvVarNvidiaVisibleDevices: gpuID, + envDockerResourceGPUs: anotherGPUID, + image.EnvVarCudaVersion: "legacy", }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "empty env returns all for legacy image", env: map[string]string{ - envDockerResourceGPUs: anotherGPUID, - envCUDAVersion: "legacy", + envDockerResourceGPUs: anotherGPUID, + image.EnvVarCudaVersion: "legacy", }, - expectedDevices: &all, + expectedDevices: []string{"all"}, }, // Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when // enabled @@ -842,7 +763,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { env: map[string]string{ envDockerResourceGPUs: "none", }, - expectedDevices: &empty, + expectedDevices: []string{""}, }, { description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image", @@ -850,16 +771,16 @@ func TestGetDevicesFromEnvvar(t *testing.T) { env: map[string]string{ envDockerResourceGPUs: gpuID, }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "DOCKER_RESOURCE_GPUS set returns value for legacy image", swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ - envDockerResourceGPUs: gpuID, - envCUDAVersion: "legacy", + envDockerResourceGPUs: gpuID, + image.EnvVarCudaVersion: "legacy", }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "DOCKER_RESOURCE_GPUS is selected if present", @@ -867,63 +788,54 @@ func TestGetDevicesFromEnvvar(t *testing.T) { env: map[string]string{ envDockerResourceGPUs: anotherGPUID, }, - expectedDevices: &anotherGPUID, + expectedDevices: []string{anotherGPUID}, }, { description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ - envNVVisibleDevices: gpuID, - envDockerResourceGPUs: anotherGPUID, + image.EnvVarNvidiaVisibleDevices: gpuID, + envDockerResourceGPUs: anotherGPUID, }, - expectedDevices: &anotherGPUID, + expectedDevices: []string{anotherGPUID}, }, { description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present", swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"}, env: map[string]string{ - envNVVisibleDevices: gpuID, + image.EnvVarNvidiaVisibleDevices: gpuID, "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, }, - expectedDevices: &anotherGPUID, + expectedDevices: []string{anotherGPUID}, }, { description: "All available swarm resource envvars are selected and override NVIDIA_VISIBLE_DEVICES if present", swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, env: map[string]string{ - envNVVisibleDevices: gpuID, + image.EnvVarNvidiaVisibleDevices: gpuID, "DOCKER_RESOURCE_GPUS": thirdGPUID, "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, }, - expectedDevices: func() *string { - result := fmt.Sprintf("%s,%s", thirdGPUID, anotherGPUID) - return &result - }(), + expectedDevices: []string{thirdGPUID, anotherGPUID}, }, { description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS override NVIDIA_VISIBLE_DEVICES if present", swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, env: map[string]string{ - envNVVisibleDevices: gpuID, + image.EnvVarNvidiaVisibleDevices: gpuID, "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, }, - expectedDevices: &anotherGPUID, + expectedDevices: []string{anotherGPUID}, }, } - for i, tc := range tests { + for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { image, _ := image.New( image.WithEnvMap(tc.env), ) devices := getDevicesFromEnvvar(image, tc.swarmResourceEnvvars) - if tc.expectedDevices == nil { - require.Nil(t, devices, "%d: %v", i, tc) - return - } - - require.NotNil(t, devices, "%d: %v", i, tc) - require.Equal(t, *tc.expectedDevices, *devices, "%d: %v", i, tc) + require.EqualValues(t, tc.expectedDevices, devices) }) } } @@ -943,7 +855,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is set for legacy image", env: map[string]string{ - envNVDriverCapabilities: "display,video", + image.EnvVarNvidiaDriverCapabilities: "display,video", }, legacyImage: true, supportedCapabilities: supportedCapabilities, @@ -952,7 +864,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is all for legacy image", env: map[string]string{ - envNVDriverCapabilities: "all", + image.EnvVarNvidiaDriverCapabilities: "all", }, legacyImage: true, supportedCapabilities: supportedCapabilities, @@ -961,7 +873,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is empty for legacy image", env: map[string]string{ - envNVDriverCapabilities: "", + image.EnvVarNvidiaDriverCapabilities: "", }, legacyImage: true, supportedCapabilities: supportedCapabilities, @@ -977,7 +889,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is set for modern image", env: map[string]string{ - envNVDriverCapabilities: "display,video", + image.EnvVarNvidiaDriverCapabilities: "display,video", }, legacyImage: false, supportedCapabilities: supportedCapabilities, @@ -993,7 +905,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is all for modern image", env: map[string]string{ - envNVDriverCapabilities: "all", + image.EnvVarNvidiaDriverCapabilities: "all", }, legacyImage: false, supportedCapabilities: supportedCapabilities, @@ -1002,7 +914,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is empty for modern image", env: map[string]string{ - envNVDriverCapabilities: "", + image.EnvVarNvidiaDriverCapabilities: "", }, legacyImage: false, supportedCapabilities: supportedCapabilities, @@ -1011,7 +923,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Invalid capabilities panic", env: map[string]string{ - envNVDriverCapabilities: "compute,utility", + image.EnvVarNvidiaDriverCapabilities: "compute,utility", }, supportedCapabilities: "not-compute,not-utility", expectedPanic: true, diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index 1f4bd525..a0cfe3ec 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -120,8 +120,8 @@ func doPrestart() { if cli.NoCgroups { args = append(args, "--no-cgroups") } - if len(nvidia.Devices) > 0 { - args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices)) + if devicesString := strings.Join(nvidia.Devices, ","); len(devicesString) > 0 { + args = append(args, fmt.Sprintf("--device=%s", devicesString)) } if len(nvidia.MigConfigDevices) > 0 { args = append(args, fmt.Sprintf("--mig-config=%s", nvidia.MigConfigDevices)) @@ -129,8 +129,8 @@ func doPrestart() { if len(nvidia.MigMonitorDevices) > 0 { args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices)) } - if len(nvidia.ImexChannels) > 0 { - args = append(args, fmt.Sprintf("--imex-channel=%s", nvidia.ImexChannels)) + if imexString := strings.Join(nvidia.ImexChannels, ","); len(imexString) > 0 { + args = append(args, fmt.Sprintf("--imex-channel=%s", imexString)) } for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") { diff --git a/internal/config/image/builder.go b/internal/config/image/builder.go index da9025f0..332d017a 100644 --- a/internal/config/image/builder.go +++ b/internal/config/image/builder.go @@ -47,7 +47,7 @@ func New(opt ...Option) (CUDA, error) { // build creates a CUDA image from the builder. func (b builder) build() (CUDA, error) { if b.disableRequire { - b.env[envNVDisableRequire] = "true" + b.env[EnvVarNvidiaDisableRequire] = "true" } c := CUDA{ diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index 75e622b7..4298e634 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -28,12 +28,10 @@ import ( ) const ( - envCUDAVersion = "CUDA_VERSION" - envNVRequirePrefix = "NVIDIA_REQUIRE_" - envNVRequireCUDA = envNVRequirePrefix + "CUDA" - envNVRequireJetpack = envNVRequirePrefix + "JETPACK" - envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE" - envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" + DeviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices" + + volumeMountDevicePrefixCDI = "cdi/" + volumeMountDevicePrefixImex = "imex/" ) // CUDA represents a CUDA image that can be used for GPU computing. This wraps @@ -80,8 +78,8 @@ func (i CUDA) HasEnvvar(key string) bool { // image is considered legacy if it has a CUDA_VERSION environment variable defined // and no NVIDIA_REQUIRE_CUDA environment variable defined. func (i CUDA) IsLegacy() bool { - legacyCudaVersion := i.env[envCUDAVersion] - cudaRequire := i.env[envNVRequireCUDA] + legacyCudaVersion := i.env[EnvVarCudaVersion] + cudaRequire := i.env[EnvVarNvidiaRequireCuda] return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0 } @@ -95,7 +93,7 @@ func (i CUDA) GetRequirements() ([]string, error) { // All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli var requirements []string for name, value := range i.env { - if strings.HasPrefix(name, envNVRequirePrefix) && !strings.HasPrefix(name, envNVRequireJetpack) { + if strings.HasPrefix(name, NvidiaRequirePrefix) && !strings.HasPrefix(name, EnvVarNvidiaRequireJetpack) { requirements = append(requirements, value) } } @@ -113,7 +111,7 @@ func (i CUDA) GetRequirements() ([]string, error) { // HasDisableRequire checks for the value of the NVIDIA_DISABLE_REQUIRE. If set // to a valid (true) boolean value this can be used to disable the requirement checks func (i CUDA) HasDisableRequire() bool { - if disable, exists := i.env[envNVDisableRequire]; exists { + if disable, exists := i.env[EnvVarNvidiaDisableRequire]; exists { // i.logger.Debugf("NVIDIA_DISABLE_REQUIRE=%v; skipping requirement checks", disable) d, _ := strconv.ParseBool(disable) return d @@ -157,7 +155,7 @@ func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices { // GetDriverCapabilities returns the requested driver capabilities. func (i CUDA) GetDriverCapabilities() DriverCapabilities { - env := i.env[envNVDriverCapabilities] + env := i.env[EnvVarNvidiaDriverCapabilities] capabilities := make(DriverCapabilities) for _, c := range strings.Split(env, ",") { @@ -168,7 +166,7 @@ func (i CUDA) GetDriverCapabilities() DriverCapabilities { } func (i CUDA) legacyVersion() (string, error) { - cudaVersion := i.env[envCUDAVersion] + cudaVersion := i.env[EnvVarCudaVersion] majorMinor, err := parseMajorMinorVersion(cudaVersion) if err != nil { return "", fmt.Errorf("invalid CUDA version %v: %v", cudaVersion, err) @@ -202,7 +200,7 @@ func parseMajorMinorVersion(version string) (string, error) { // OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { var hasCDIdevice bool - for _, device := range i.DevicesFromEnvvars("NVIDIA_VISIBLE_DEVICES").List() { + for _, device := range i.VisibleDevicesFromEnvVar() { if !parser.IsQualifiedName(device) { return false } @@ -218,14 +216,31 @@ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { return hasCDIdevice } -const ( - deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices" -) +// VisibleDevicesFromEnvVar returns the set of visible devices requested through +// the NVIDIA_VISIBLE_DEVICES environment variable. +func (i CUDA) VisibleDevicesFromEnvVar() []string { + return i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices).List() +} + +// VisibleDevicesFromMounts returns the set of visible devices requested as mounts. +func (i CUDA) VisibleDevicesFromMounts() []string { + var devices []string + for _, device := range i.DevicesFromMounts() { + switch { + case strings.HasPrefix(device, volumeMountDevicePrefixCDI): + continue + case strings.HasPrefix(device, volumeMountDevicePrefixImex): + continue + } + devices = append(devices, device) + } + return devices +} // DevicesFromMounts returns a list of device specified as mounts. // TODO: This should be merged with getDevicesFromMounts used in the NVIDIA Container Runtime func (i CUDA) DevicesFromMounts() []string { - root := filepath.Clean(deviceListAsVolumeMountsRoot) + root := filepath.Clean(DeviceListAsVolumeMountsRoot) seen := make(map[string]bool) var devices []string for _, m := range i.mounts { @@ -260,10 +275,10 @@ func (i CUDA) DevicesFromMounts() []string { func (i CUDA) CDIDevicesFromMounts() []string { var devices []string for _, mountDevice := range i.DevicesFromMounts() { - if !strings.HasPrefix(mountDevice, "cdi/") { + if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixCDI) { continue } - parts := strings.SplitN(strings.TrimPrefix(mountDevice, "cdi/"), "/", 3) + parts := strings.SplitN(strings.TrimPrefix(mountDevice, volumeMountDevicePrefixCDI), "/", 3) if len(parts) != 3 { continue } @@ -275,6 +290,19 @@ func (i CUDA) CDIDevicesFromMounts() []string { return devices } -func (i CUDA) IsEnabled(envvar string) bool { - return i.Getenv(envvar) == "enabled" +// ImexChannelsFromEnvVar returns the list of IMEX channels requested for the image. +func (i CUDA) ImexChannelsFromEnvVar() []string { + return i.DevicesFromEnvvars(EnvVarNvidiaImexChannels).List() +} + +// ImexChannelsFromMounts returns the list of IMEX channels requested for the image. +func (i CUDA) ImexChannelsFromMounts() []string { + var channels []string + for _, mountDevice := range i.DevicesFromMounts() { + if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixImex) { + continue + } + channels = append(channels, strings.TrimPrefix(mountDevice, volumeMountDevicePrefixImex)) + } + return channels } diff --git a/internal/config/image/cuda_image_test.go b/internal/config/image/cuda_image_test.go index 406ad443..39bf30b4 100644 --- a/internal/config/image/cuda_image_test.go +++ b/internal/config/image/cuda_image_test.go @@ -17,8 +17,10 @@ package image import ( + "path/filepath" "testing" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/require" ) @@ -130,3 +132,85 @@ func TestGetRequirements(t *testing.T) { } } + +func TestGetVisibleDevicesFromMounts(t *testing.T) { + var tests = []struct { + description string + mounts []specs.Mount + expectedDevices []string + }{ + { + description: "No mounts", + mounts: nil, + expectedDevices: nil, + }, + { + description: "Host path is not /dev/null", + mounts: []specs.Mount{ + { + Source: "/not/dev/null", + Destination: filepath.Join(DeviceListAsVolumeMountsRoot, "GPU0"), + }, + }, + expectedDevices: nil, + }, + { + description: "Container path is not prefixed by 'root'", + mounts: []specs.Mount{ + { + Source: "/dev/null", + Destination: filepath.Join("/other/prefix", "GPU0"), + }, + }, + expectedDevices: nil, + }, + { + description: "Container path is only 'root'", + mounts: []specs.Mount{ + { + Source: "/dev/null", + Destination: DeviceListAsVolumeMountsRoot, + }, + }, + expectedDevices: nil, + }, + { + description: "Discover 2 devices", + mounts: makeTestMounts("GPU0", "GPU1"), + expectedDevices: []string{"GPU0", "GPU1"}, + }, + { + description: "Discover 2 devices with slashes in the name", + mounts: makeTestMounts("GPU0-MIG0/0/1", "GPU1-MIG0/0/1"), + expectedDevices: []string{"GPU0-MIG0/0/1", "GPU1-MIG0/0/1"}, + }, + { + description: "cdi devices are ignored", + mounts: makeTestMounts("GPU0", "cdi/nvidia.com/gpu=all", "GPU1"), + expectedDevices: []string{"GPU0", "GPU1"}, + }, + { + description: "imex devices are ignored", + mounts: makeTestMounts("GPU0", "imex/0", "GPU1"), + expectedDevices: []string{"GPU0", "GPU1"}, + }, + } + for _, tc := range tests { + t.Run(tc.description, func(t *testing.T) { + image, _ := New(WithMounts(tc.mounts)) + require.Equal(t, tc.expectedDevices, image.VisibleDevicesFromMounts()) + }) + } +} + +func makeTestMounts(paths ...string) []specs.Mount { + var mounts []specs.Mount + for _, path := range paths { + mount := specs.Mount{ + Source: "/dev/null", + Destination: filepath.Join(DeviceListAsVolumeMountsRoot, path), + } + mounts = append(mounts, mount) + } + return mounts +} diff --git a/internal/config/image/envvars.go b/internal/config/image/envvars.go new file mode 100644 index 00000000..0789f22f --- /dev/null +++ b/internal/config/image/envvars.go @@ -0,0 +1,31 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package image + +const ( + EnvVarCudaVersion = "CUDA_VERSION" + EnvVarNvidiaDisableRequire = "NVIDIA_DISABLE_REQUIRE" + EnvVarNvidiaDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" + EnvVarNvidiaImexChannels = "NVIDIA_IMEX_CHANNELS" + EnvVarNvidiaMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES" + EnvVarNvidiaMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES" + EnvVarNvidiaRequireCuda = NvidiaRequirePrefix + "CUDA" + EnvVarNvidiaRequireJetpack = NvidiaRequirePrefix + "JETPACK" + EnvVarNvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES" + + NvidiaRequirePrefix = "NVIDIA_REQUIRE_" +) diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index c5af4f88..90cd481b 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -90,11 +90,9 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C } } - envDevices := container.DevicesFromEnvvars(visibleDevicesEnvvar) - var devices []string seen := make(map[string]bool) - for _, name := range envDevices.List() { + for _, name := range container.VisibleDevicesFromEnvVar() { if !parser.IsQualifiedName(name) { name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name) } diff --git a/internal/modifier/csv.go b/internal/modifier/csv.go index 0905d5da..a6129432 100644 --- a/internal/modifier/csv.go +++ b/internal/modifier/csv.go @@ -30,23 +30,16 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" ) -const ( - visibleDevicesEnvvar = "NVIDIA_VISIBLE_DEVICES" - visibleDevicesVoid = "void" - - nvidiaRequireJetpackEnvvar = "NVIDIA_REQUIRE_JETPACK" -) - // NewCSVModifier creates a modifier that applies modications to an OCI spec if required by the runtime wrapper. // The modifications are defined by CSV MountSpecs. -func NewCSVModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) { - if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { +func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) { + if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } logger.Infof("Constructing modifier from config: %+v", *cfg) - if err := checkRequirements(logger, image); err != nil { + if err := checkRequirements(logger, container); err != nil { return nil, fmt.Errorf("requirements not met: %v", err) } @@ -55,7 +48,7 @@ func NewCSVModifier(logger logger.Interface, cfg *config.Config, image image.CUD return nil, fmt.Errorf("failed to get list of CSV files: %v", err) } - if image.Getenv(nvidiaRequireJetpackEnvvar) != "csv-mounts=all" { + if container.Getenv(image.EnvVarNvidiaRequireJetpack) != "csv-mounts=all" { csvFiles = csv.BaseFilesOnly(csvFiles) } diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index 5a7141b9..2c39d207 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -36,7 +36,7 @@ import ( // // If not devices are selected, no changes are made. func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) { - if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { + if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/graphics.go b/internal/modifier/graphics.go index d9784d5e..31aa6ef3 100644 --- a/internal/modifier/graphics.go +++ b/internal/modifier/graphics.go @@ -29,8 +29,8 @@ import ( // NewGraphicsModifier constructs a modifier that injects graphics-related modifications into an OCI runtime specification. // The value of the NVIDIA_DRIVER_CAPABILITIES environment variable is checked to determine if this modification should be made. -func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver) (oci.SpecModifier, error) { - if required, reason := requiresGraphicsModifier(image); !required { +func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerImage image.CUDA, driver *root.Driver) (oci.SpecModifier, error) { + if required, reason := requiresGraphicsModifier(containerImage); !required { logger.Infof("No graphics modifier required: %v", reason) return nil, nil } @@ -50,7 +50,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image imag devRoot := driver.Root drmNodes, err := discover.NewDRMNodesDiscoverer( logger, - image.DevicesFromEnvvars(visibleDevicesEnvvar), + containerImage.DevicesFromEnvvars(image.EnvVarNvidiaVisibleDevices), devRoot, nvidiaCDIHookPath, ) @@ -67,7 +67,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image imag // requiresGraphicsModifier determines whether a graphics modifier is required. func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) { - if devices := cudaImage.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { + if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 { return false, "no devices requested" } diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index b98f4529..8ed9e5aa 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -24,6 +24,7 @@ import ( "github.com/NVIDIA/go-nvml/pkg/nvml" "tags.cncf.io/container-device-interface/pkg/cdi" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" @@ -200,7 +201,7 @@ func (m *wrapper) GetCommonEdits() (*cdi.ContainerEdits, error) { if err != nil { return nil, err } - edits.Env = append(edits.Env, "NVIDIA_VISIBLE_DEVICES=void") + edits.Env = append(edits.Env, image.EnvVarNvidiaVisibleDevices+"=void") return edits, nil }