From 1991b3ef2afe2dd4142c994e2343284478c199d3 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 14 Oct 2024 14:53:36 +0200 Subject: [PATCH 1/3] [no-relnote] Use string slice for devices in hook Signed-off-by: Evan Lezar --- .../container_config.go | 43 ++----- .../container_config_test.go | 118 ++++++++---------- cmd/nvidia-container-runtime-hook/main.go | 4 +- 3 files changed, 67 insertions(+), 98 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/container_config.go b/cmd/nvidia-container-runtime-hook/container_config.go index fa39bf2f..b509cf41 100644 --- a/cmd/nvidia-container-runtime-hook/container_config.go +++ b/cmd/nvidia-container-runtime-hook/container_config.go @@ -7,7 +7,6 @@ import ( "os" "path" "path/filepath" - "strings" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/mod/semver" @@ -36,7 +35,7 @@ const ( ) type nvidiaConfig struct { - Devices string + Devices []string MigConfigDevices string MigMonitorDevices string ImexChannels string @@ -172,34 +171,19 @@ func isPrivileged(s *Spec) bool { return image.IsPrivileged(&fullSpec) } -func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string { +func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) []string { // We check if the image has at least one of the Swarm resource envvars defined and use this // if specified. - var hasSwarmEnvvar bool for _, envvar := range swarmResourceEnvvars { if image.HasEnvvar(envvar) { - hasSwarmEnvvar = true - break + return image.DevicesFromEnvvars(swarmResourceEnvvars...).List() } } - var devices []string - if hasSwarmEnvvar { - devices = image.DevicesFromEnvvars(swarmResourceEnvvars...).List() - } else { - devices = image.DevicesFromEnvvars(envNVVisibleDevices).List() - } - - if len(devices) == 0 { - return nil - } - - devicesString := strings.Join(devices, ",") - - return &devicesString + return image.DevicesFromEnvvars(envNVVisibleDevices).List() } -func getDevicesFromMounts(mounts []Mount) *string { +func getDevicesFromMounts(mounts []Mount) []string { var devices []string for _, m := range mounts { root := filepath.Clean(deviceListAsVolumeMountsRoot) @@ -232,22 +216,21 @@ func getDevicesFromMounts(mounts []Mount) *string { return nil } - ret := strings.Join(devices, ",") - return &ret + return devices } -func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *string { +func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) []string { // If enabled, try and get the device list from volume mounts first if hookConfig.AcceptDeviceListAsVolumeMounts { devices := getDevicesFromMounts(mounts) - if devices != nil { + if len(devices) > 0 { return devices } } // Fallback to reading from the environment variable if privileges are correct devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars()) - if devices == nil { + if len(devices) == 0 { return nil } if privileged || hookConfig.AcceptEnvvarUnprivileged { @@ -314,11 +297,9 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *nvidiaConfig { legacyImage := image.IsLegacy() - var devices string - if d := getDevices(hookConfig, image, mounts, privileged); d != nil { - devices = *d - } else { - // 'nil' devices means this is not a GPU container. + devices := getDevices(hookConfig, image, mounts, privileged) + if len(devices) == 0 { + // empty devices means this is not a GPU container. return nil } diff --git a/cmd/nvidia-container-runtime-hook/container_config_test.go b/cmd/nvidia-container-runtime-hook/container_config_test.go index 43fac8aa..8f852ff8 100644 --- a/cmd/nvidia-container-runtime-hook/container_config_test.go +++ b/cmd/nvidia-container-runtime-hook/container_config_test.go @@ -1,7 +1,6 @@ package main import ( - "fmt" "path/filepath" "testing" @@ -38,7 +37,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -51,7 +50,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -82,7 +81,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "", + Devices: []string{""}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -95,7 +94,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -109,7 +108,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -123,7 +122,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -137,7 +136,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{"cuda>=9.0"}, }, @@ -153,7 +152,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"}, }, @@ -170,7 +169,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{}, }, @@ -200,7 +199,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -231,7 +230,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "", + Devices: []string{""}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -244,7 +243,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -258,7 +257,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -272,7 +271,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: image.SupportedDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, }, @@ -286,7 +285,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{"cuda>=9.0"}, }, @@ -302,7 +301,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"}, }, @@ -319,7 +318,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "gpu0,gpu1", + Devices: []string{"gpu0", "gpu1"}, DriverCapabilities: "display,video", Requirements: []string{}, }, @@ -332,7 +331,7 @@ func TestGetNvidiaConfig(t *testing.T) { privileged: false, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{}, }, @@ -346,7 +345,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: true, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, MigConfigDevices: "mig0,mig1", DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, @@ -371,7 +370,7 @@ func TestGetNvidiaConfig(t *testing.T) { }, privileged: true, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, MigMonitorDevices: "mig0,mig1", DriverCapabilities: image.DefaultDriverCapabilities.String(), Requirements: []string{"cuda>=9.0"}, @@ -398,7 +397,7 @@ func TestGetNvidiaConfig(t *testing.T) { SupportedDriverCapabilities: "video,display", }, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: "display,video", }, }, @@ -413,7 +412,7 @@ func TestGetNvidiaConfig(t *testing.T) { SupportedDriverCapabilities: "video,display,compute,utility", }, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: "display,video", }, }, @@ -427,7 +426,7 @@ func TestGetNvidiaConfig(t *testing.T) { SupportedDriverCapabilities: "video,display,utility,compute", }, expectedConfig: &nvidiaConfig{ - Devices: "all", + Devices: []string{"all"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), }, }, @@ -443,7 +442,7 @@ func TestGetNvidiaConfig(t *testing.T) { SupportedDriverCapabilities: "video,display,utility,compute", }, expectedConfig: &nvidiaConfig{ - Devices: "GPU1,GPU2", + Devices: []string{"GPU1", "GPU2"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), }, }, @@ -459,7 +458,7 @@ func TestGetNvidiaConfig(t *testing.T) { SupportedDriverCapabilities: "video,display,utility,compute", }, expectedConfig: &nvidiaConfig{ - Devices: "GPU1,GPU2", + Devices: []string{"GPU1", "GPU2"}, DriverCapabilities: image.DefaultDriverCapabilities.String(), }, }, @@ -511,7 +510,7 @@ func TestGetDevicesFromMounts(t *testing.T) { var tests = []struct { description string mounts []Mount - expectedDevices *string + expectedDevices []string }{ { description: "No mounts", @@ -560,7 +559,7 @@ func TestGetDevicesFromMounts(t *testing.T) { Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"), }, }, - expectedDevices: &[]string{"GPU0,GPU1"}[0], + expectedDevices: []string{"GPU0", "GPU1"}, }, { description: "Discover 2 devices with slashes in the name", @@ -574,7 +573,7 @@ func TestGetDevicesFromMounts(t *testing.T) { Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1-MIG0/0/1"), }, }, - expectedDevices: &[]string{"GPU0-MIG0/0/1,GPU1-MIG0/0/1"}[0], + expectedDevices: []string{"GPU0-MIG0/0/1", "GPU1-MIG0/0/1"}, }, } for _, tc := range tests { @@ -593,7 +592,7 @@ func TestDeviceListSourcePriority(t *testing.T) { privileged bool acceptUnprivileged bool acceptMounts bool - expectedDevices *string + expectedDevices []string }{ { description: "Mount devices, unprivileged, no accept unprivileged", @@ -611,7 +610,7 @@ func TestDeviceListSourcePriority(t *testing.T) { privileged: false, acceptUnprivileged: false, acceptMounts: true, - expectedDevices: &[]string{"GPU0,GPU1"}[0], + expectedDevices: []string{"GPU0", "GPU1"}, }, { description: "No mount devices, unprivileged, no accept unprivileged", @@ -629,7 +628,7 @@ func TestDeviceListSourcePriority(t *testing.T) { privileged: true, acceptUnprivileged: false, acceptMounts: true, - expectedDevices: &[]string{"GPU0,GPU1"}[0], + expectedDevices: []string{"GPU0", "GPU1"}, }, { description: "No mount devices, unprivileged, accept unprivileged", @@ -638,7 +637,7 @@ func TestDeviceListSourcePriority(t *testing.T) { privileged: false, acceptUnprivileged: true, acceptMounts: true, - expectedDevices: &[]string{"GPU0,GPU1"}[0], + expectedDevices: []string{"GPU0", "GPU1"}, }, { description: "Mount devices, unprivileged, accept unprivileged, no accept mounts", @@ -656,7 +655,7 @@ func TestDeviceListSourcePriority(t *testing.T) { privileged: false, acceptUnprivileged: true, acceptMounts: false, - expectedDevices: &[]string{"GPU2,GPU3"}[0], + expectedDevices: []string{"GPU2", "GPU3"}, }, { description: "Mount devices, unprivileged, no accept unprivileged, no accept mounts", @@ -680,7 +679,7 @@ func TestDeviceListSourcePriority(t *testing.T) { for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { // Wrap the call to getDevices() in a closure. - var devices *string + var devices []string getDevices := func() { image, _ := image.New( image.WithEnvMap( @@ -704,8 +703,6 @@ func TestDeviceListSourcePriority(t *testing.T) { } func TestGetDevicesFromEnvvar(t *testing.T) { - all := "all" - empty := "" envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS" gpuID := "GPU-12345" anotherGPUID := "GPU-67890" @@ -715,7 +712,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { description string swarmResourceEnvvars []string env map[string]string - expectedDevices *string + expectedDevices []string }{ { description: "empty env returns nil for non-legacy image", @@ -737,14 +734,14 @@ func TestGetDevicesFromEnvvar(t *testing.T) { env: map[string]string{ envNVVisibleDevices: "none", }, - expectedDevices: &empty, + expectedDevices: []string{""}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image", env: map[string]string{ envNVVisibleDevices: gpuID, }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image", @@ -752,14 +749,14 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envNVVisibleDevices: gpuID, envCUDAVersion: "legacy", }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "empty env returns all for legacy image", env: map[string]string{ envCUDAVersion: "legacy", }, - expectedDevices: &all, + expectedDevices: []string{"all"}, }, // Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is ignored when // not enabled @@ -789,7 +786,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envNVVisibleDevices: "none", envDockerResourceGPUs: anotherGPUID, }, - expectedDevices: &empty, + expectedDevices: []string{""}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image", @@ -797,7 +794,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envNVVisibleDevices: gpuID, envDockerResourceGPUs: anotherGPUID, }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image", @@ -806,7 +803,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envDockerResourceGPUs: anotherGPUID, envCUDAVersion: "legacy", }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "empty env returns all for legacy image", @@ -814,7 +811,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envDockerResourceGPUs: anotherGPUID, envCUDAVersion: "legacy", }, - expectedDevices: &all, + expectedDevices: []string{"all"}, }, // Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when // enabled @@ -842,7 +839,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { env: map[string]string{ envDockerResourceGPUs: "none", }, - expectedDevices: &empty, + expectedDevices: []string{""}, }, { description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image", @@ -850,7 +847,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { env: map[string]string{ envDockerResourceGPUs: gpuID, }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "DOCKER_RESOURCE_GPUS set returns value for legacy image", @@ -859,7 +856,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envDockerResourceGPUs: gpuID, envCUDAVersion: "legacy", }, - expectedDevices: &gpuID, + expectedDevices: []string{gpuID}, }, { description: "DOCKER_RESOURCE_GPUS is selected if present", @@ -867,7 +864,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { env: map[string]string{ envDockerResourceGPUs: anotherGPUID, }, - expectedDevices: &anotherGPUID, + expectedDevices: []string{anotherGPUID}, }, { description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", @@ -876,7 +873,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envNVVisibleDevices: gpuID, envDockerResourceGPUs: anotherGPUID, }, - expectedDevices: &anotherGPUID, + expectedDevices: []string{anotherGPUID}, }, { description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present", @@ -885,7 +882,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envNVVisibleDevices: gpuID, "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, }, - expectedDevices: &anotherGPUID, + expectedDevices: []string{anotherGPUID}, }, { description: "All available swarm resource envvars are selected and override NVIDIA_VISIBLE_DEVICES if present", @@ -895,10 +892,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { "DOCKER_RESOURCE_GPUS": thirdGPUID, "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, }, - expectedDevices: func() *string { - result := fmt.Sprintf("%s,%s", thirdGPUID, anotherGPUID) - return &result - }(), + expectedDevices: []string{thirdGPUID, anotherGPUID}, }, { description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS override NVIDIA_VISIBLE_DEVICES if present", @@ -907,23 +901,17 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envNVVisibleDevices: gpuID, "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, }, - expectedDevices: &anotherGPUID, + expectedDevices: []string{anotherGPUID}, }, } - for i, tc := range tests { + for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { image, _ := image.New( image.WithEnvMap(tc.env), ) devices := getDevicesFromEnvvar(image, tc.swarmResourceEnvvars) - if tc.expectedDevices == nil { - require.Nil(t, devices, "%d: %v", i, tc) - return - } - - require.NotNil(t, devices, "%d: %v", i, tc) - require.Equal(t, *tc.expectedDevices, *devices, "%d: %v", i, tc) + require.EqualValues(t, tc.expectedDevices, devices) }) } } diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index 1f4bd525..ad3e208d 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -120,8 +120,8 @@ func doPrestart() { if cli.NoCgroups { args = append(args, "--no-cgroups") } - if len(nvidia.Devices) > 0 { - args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices)) + if devicesString := strings.Join(nvidia.Devices, ","); len(devicesString) > 0 { + args = append(args, fmt.Sprintf("--device=%s", devicesString)) } if len(nvidia.MigConfigDevices) > 0 { args = append(args, fmt.Sprintf("--mig-config=%s", nvidia.MigConfigDevices)) From 92df542f2f324ca16431dc824108e3c068a89334 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 14 Oct 2024 15:06:06 +0200 Subject: [PATCH 2/3] [no-relnote] Use image.CUDA to extract visible devices Signed-off-by: Evan Lezar --- .../container_config.go | 107 ++---- .../container_config_test.go | 350 +++++++----------- internal/config/image/builder.go | 2 +- internal/config/image/cuda_image.go | 49 ++- internal/config/image/cuda_image_test.go | 79 ++++ internal/config/image/envvars.go | 31 ++ internal/modifier/cdi.go | 4 +- internal/modifier/csv.go | 15 +- internal/modifier/gated.go | 2 +- internal/modifier/graphics.go | 8 +- pkg/nvcdi/lib.go | 3 +- 11 files changed, 313 insertions(+), 337 deletions(-) create mode 100644 internal/config/image/envvars.go diff --git a/cmd/nvidia-container-runtime-hook/container_config.go b/cmd/nvidia-container-runtime-hook/container_config.go index b509cf41..e982c695 100644 --- a/cmd/nvidia-container-runtime-hook/container_config.go +++ b/cmd/nvidia-container-runtime-hook/container_config.go @@ -6,7 +6,6 @@ import ( "log" "os" "path" - "path/filepath" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/mod/semver" @@ -14,26 +13,10 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" ) -const ( - envCUDAVersion = "CUDA_VERSION" - envNVRequirePrefix = "NVIDIA_REQUIRE_" - envNVRequireCUDA = envNVRequirePrefix + "CUDA" - envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE" - envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES" - envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES" - envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES" - envNVImexChannels = "NVIDIA_IMEX_CHANNELS" - envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" -) - const ( capSysAdmin = "CAP_SYS_ADMIN" ) -const ( - deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices" -) - type nvidiaConfig struct { Devices []string MigConfigDevices string @@ -76,23 +59,14 @@ type LinuxCapabilities struct { Ambient []string `json:"ambient,omitempty" platform:"linux"` } -// Mount from OCI runtime spec -// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L103 -type Mount struct { - Destination string `json:"destination"` - Type string `json:"type,omitempty" platform:"linux,solaris"` - Source string `json:"source,omitempty"` - Options []string `json:"options,omitempty"` -} - // Spec from OCI runtime spec // We use pointers to structs, similarly to the latest version of runtime-spec: // https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L5-L28 type Spec struct { - Version *string `json:"ociVersion"` - Process *Process `json:"process,omitempty"` - Root *Root `json:"root,omitempty"` - Mounts []Mount `json:"mounts,omitempty"` + Version *string `json:"ociVersion"` + Process *Process `json:"process,omitempty"` + Root *Root `json:"root,omitempty"` + Mounts []specs.Mount `json:"mounts,omitempty"` } // HookState holds state information about the hook @@ -171,58 +145,22 @@ func isPrivileged(s *Spec) bool { return image.IsPrivileged(&fullSpec) } -func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) []string { +func getDevicesFromEnvvar(containerImage image.CUDA, swarmResourceEnvvars []string) []string { // We check if the image has at least one of the Swarm resource envvars defined and use this // if specified. for _, envvar := range swarmResourceEnvvars { - if image.HasEnvvar(envvar) { - return image.DevicesFromEnvvars(swarmResourceEnvvars...).List() + if containerImage.HasEnvvar(envvar) { + return containerImage.DevicesFromEnvvars(swarmResourceEnvvars...).List() } } - return image.DevicesFromEnvvars(envNVVisibleDevices).List() + return containerImage.VisibleDevicesFromEnvVar() } -func getDevicesFromMounts(mounts []Mount) []string { - var devices []string - for _, m := range mounts { - root := filepath.Clean(deviceListAsVolumeMountsRoot) - source := filepath.Clean(m.Source) - destination := filepath.Clean(m.Destination) - - // Only consider mounts who's host volume is /dev/null - if source != "/dev/null" { - continue - } - // Only consider container mount points that begin with 'root' - if len(destination) < len(root) { - continue - } - if destination[:len(root)] != root { - continue - } - // Grab the full path beyond 'root' and add it to the list of devices - device := destination[len(root):] - if len(device) > 0 && device[0] == '/' { - device = device[1:] - } - if len(device) == 0 { - continue - } - devices = append(devices, device) - } - - if devices == nil { - return nil - } - - return devices -} - -func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) []string { +func getDevices(hookConfig *HookConfig, image image.CUDA, privileged bool) []string { // If enabled, try and get the device list from volume mounts first if hookConfig.AcceptDeviceListAsVolumeMounts { - devices := getDevicesFromMounts(mounts) + devices := image.VisibleDevicesFromMounts() if len(devices) > 0 { return devices } @@ -243,12 +181,12 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil return nil } -func getMigConfigDevices(image image.CUDA) *string { - return getMigDevices(image, envNVMigConfigDevices) +func getMigConfigDevices(i image.CUDA) *string { + return getMigDevices(i, image.EnvVarNvidiaMigConfigDevices) } -func getMigMonitorDevices(image image.CUDA) *string { - return getMigDevices(image, envNVMigMonitorDevices) +func getMigMonitorDevices(i image.CUDA) *string { + return getMigDevices(i, image.EnvVarNvidiaMigMonitorDevices) } func getMigDevices(image image.CUDA, envvar string) *string { @@ -259,11 +197,11 @@ func getMigDevices(image image.CUDA, envvar string) *string { return &devices } -func getImexChannels(image image.CUDA) *string { - if !image.HasEnvvar(envNVImexChannels) { +func getImexChannels(i image.CUDA) *string { + if !i.HasEnvvar(image.EnvVarNvidiaImexChannels) { return nil } - chans := image.Getenv(envNVImexChannels) + chans := i.Getenv(image.EnvVarNvidiaImexChannels) return &chans } @@ -274,8 +212,8 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo capabilities := supportedDriverCapabilities.Intersection(image.DefaultDriverCapabilities) - capsEnvSpecified := cudaImage.HasEnvvar(envNVDriverCapabilities) - capsEnv := cudaImage.Getenv(envNVDriverCapabilities) + capsEnvSpecified := cudaImage.HasEnvvar(image.EnvVarNvidiaDriverCapabilities) + capsEnv := cudaImage.Getenv(image.EnvVarNvidiaDriverCapabilities) if !capsEnvSpecified && legacyImage { // Environment variable unset with legacy image: set all capabilities. @@ -294,10 +232,10 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo return capabilities } -func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privileged bool) *nvidiaConfig { +func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, privileged bool) *nvidiaConfig { legacyImage := image.IsLegacy() - devices := getDevices(hookConfig, image, mounts, privileged) + devices := getDevices(hookConfig, image, privileged) if len(devices) == 0 { // empty devices means this is not a GPU container. return nil @@ -357,6 +295,7 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { image, err := image.New( image.WithEnv(s.Process.Env), + image.WithMounts(s.Mounts), image.WithDisableRequire(hook.DisableRequire), ) if err != nil { @@ -368,6 +307,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { Pid: h.Pid, Rootfs: s.Root.Path, Image: image, - Nvidia: getNvidiaConfig(&hook, image, s.Mounts, privileged), + Nvidia: getNvidiaConfig(&hook, image, privileged), } } diff --git a/cmd/nvidia-container-runtime-hook/container_config_test.go b/cmd/nvidia-container-runtime-hook/container_config_test.go index 8f852ff8..cad9ca6e 100644 --- a/cmd/nvidia-container-runtime-hook/container_config_test.go +++ b/cmd/nvidia-container-runtime-hook/container_config_test.go @@ -4,6 +4,7 @@ import ( "path/filepath" "testing" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/require" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" @@ -33,7 +34,7 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, no devices, no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", + image.EnvVarCudaVersion: "9.0", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -45,8 +46,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices 'all', no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "all", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "all", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -58,8 +59,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices 'empty', no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "", }, privileged: false, expectedConfig: nil, @@ -67,8 +68,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices 'void', no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "void", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "void", }, privileged: false, expectedConfig: nil, @@ -76,8 +77,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices 'none', no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "none", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "none", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -89,8 +90,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, no capabilities, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -102,9 +103,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, capabilities 'empty', no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -116,9 +117,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, capabilities 'all', no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "all", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "all", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -130,9 +131,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, capabilities set, no requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -144,11 +145,11 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, capabilities set, requirements set", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", - envNVRequirePrefix + "REQ0": "req0=true", - envNVRequirePrefix + "REQ1": "req1=false", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", + image.NvidiaRequirePrefix + "REQ0": "req0=true", + image.NvidiaRequirePrefix + "REQ1": "req1=false", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -160,12 +161,12 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Legacy image, devices set, capabilities set, requirements set, disable requirements", env: map[string]string{ - envCUDAVersion: "9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", - envNVRequirePrefix + "REQ0": "req0=true", - envNVRequirePrefix + "REQ1": "req1=false", - envNVDisableRequire: "true", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", + image.NvidiaRequirePrefix + "REQ0": "req0=true", + image.NvidiaRequirePrefix + "REQ1": "req1=false", + image.EnvVarNvidiaDisableRequire: "true", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -175,18 +176,18 @@ func TestGetNvidiaConfig(t *testing.T) { }, }, { - description: "Modern image, no devices, no capabilities, no requirements, no envCUDAVersion", + description: "Modern image, no devices, no capabilities, no requirements, no image.EnvVarCudaVersion", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", }, privileged: false, expectedConfig: nil, }, { - description: "Modern image, no devices, no capabilities, no requirement, envCUDAVersion set", + description: "Modern image, no devices, no capabilities, no requirement, image.EnvVarCudaVersion set", env: map[string]string{ - envCUDAVersion: "9.0", - envNVRequireCUDA: "cuda>=9.0", + image.EnvVarCudaVersion: "9.0", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", }, privileged: false, expectedConfig: nil, @@ -194,8 +195,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'all', no capabilities, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "all", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "all", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -207,8 +208,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'empty', no capabilities, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "", }, privileged: false, expectedConfig: nil, @@ -216,8 +217,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'void', no capabilities, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "void", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "void", }, privileged: false, expectedConfig: nil, @@ -225,8 +226,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'none', no capabilities, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "none", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "none", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -238,8 +239,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, no capabilities, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -251,9 +252,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, capabilities 'empty', no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -265,9 +266,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, capabilities 'all', no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "all", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "all", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -279,9 +280,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, capabilities set, no requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -293,11 +294,11 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, capabilities set, requirements set", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", - envNVRequirePrefix + "REQ0": "req0=true", - envNVRequirePrefix + "REQ1": "req1=false", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", + image.NvidiaRequirePrefix + "REQ0": "req0=true", + image.NvidiaRequirePrefix + "REQ1": "req1=false", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -309,12 +310,12 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices set, capabilities set, requirements set, disable requirements", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "gpu0,gpu1", - envNVDriverCapabilities: "video,display", - envNVRequirePrefix + "REQ0": "req0=true", - envNVRequirePrefix + "REQ1": "req1=false", - envNVDisableRequire: "true", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "gpu0,gpu1", + image.EnvVarNvidiaDriverCapabilities: "video,display", + image.NvidiaRequirePrefix + "REQ0": "req0=true", + image.NvidiaRequirePrefix + "REQ1": "req1=false", + image.EnvVarNvidiaDisableRequire: "true", }, privileged: false, expectedConfig: &nvidiaConfig{ @@ -326,7 +327,7 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "No cuda envs, devices 'all'", env: map[string]string{ - envNVVisibleDevices: "all", + image.EnvVarNvidiaVisibleDevices: "all", }, privileged: false, @@ -339,9 +340,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'all', migConfig set, privileged", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "all", - envNVMigConfigDevices: "mig0,mig1", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaMigConfigDevices: "mig0,mig1", }, privileged: true, expectedConfig: &nvidiaConfig{ @@ -354,9 +355,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'all', migConfig set, unprivileged", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "all", - envNVMigConfigDevices: "mig0,mig1", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaMigConfigDevices: "mig0,mig1", }, privileged: false, expectedPanic: true, @@ -364,9 +365,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'all', migMonitor set, privileged", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "all", - envNVMigMonitorDevices: "mig0,mig1", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaMigMonitorDevices: "mig0,mig1", }, privileged: true, expectedConfig: &nvidiaConfig{ @@ -379,9 +380,9 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Modern image, devices 'all', migMonitor set, unprivileged", env: map[string]string{ - envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "all", - envNVMigMonitorDevices: "mig0,mig1", + image.EnvVarNvidiaRequireCuda: "cuda>=9.0", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaMigMonitorDevices: "mig0,mig1", }, privileged: false, expectedPanic: true, @@ -389,8 +390,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Hook config set as driver-capabilities-all", env: map[string]string{ - envNVVisibleDevices: "all", - envNVDriverCapabilities: "all", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaDriverCapabilities: "all", }, privileged: true, hookConfig: &HookConfig{ @@ -404,8 +405,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Hook config set, envvar sets driver-capabilities", env: map[string]string{ - envNVVisibleDevices: "all", - envNVDriverCapabilities: "video,display", + image.EnvVarNvidiaVisibleDevices: "all", + image.EnvVarNvidiaDriverCapabilities: "video,display", }, privileged: true, hookConfig: &HookConfig{ @@ -419,7 +420,7 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Hook config set, envvar unset sets default driver-capabilities", env: map[string]string{ - envNVVisibleDevices: "all", + image.EnvVarNvidiaVisibleDevices: "all", }, privileged: true, hookConfig: &HookConfig{ @@ -433,8 +434,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Hook config set, swarmResource overrides device selection", env: map[string]string{ - envNVVisibleDevices: "all", - "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", + image.EnvVarNvidiaVisibleDevices: "all", + "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", }, privileged: true, hookConfig: &HookConfig{ @@ -449,8 +450,8 @@ func TestGetNvidiaConfig(t *testing.T) { { description: "Hook config set, comma separated swarmResource is split and overrides device selection", env: map[string]string{ - envNVVisibleDevices: "all", - "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", + image.EnvVarNvidiaVisibleDevices: "all", + "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", }, privileged: true, hookConfig: &HookConfig{ @@ -476,7 +477,7 @@ func TestGetNvidiaConfig(t *testing.T) { defaultConfig, _ := getDefaultHookConfig() hookConfig = &defaultConfig } - config = getNvidiaConfig(hookConfig, image, nil, tc.privileged) + config = getNvidiaConfig(hookConfig, image, tc.privileged) } // For any tests that are expected to panic, make sure they do. @@ -506,88 +507,10 @@ func TestGetNvidiaConfig(t *testing.T) { } } -func TestGetDevicesFromMounts(t *testing.T) { - var tests = []struct { - description string - mounts []Mount - expectedDevices []string - }{ - { - description: "No mounts", - mounts: nil, - expectedDevices: nil, - }, - { - description: "Host path is not /dev/null", - mounts: []Mount{ - { - Source: "/not/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"), - }, - }, - expectedDevices: nil, - }, - { - description: "Container path is not prefixed by 'root'", - mounts: []Mount{ - { - Source: "/dev/null", - Destination: filepath.Join("/other/prefix", "GPU0"), - }, - }, - expectedDevices: nil, - }, - { - description: "Container path is only 'root'", - mounts: []Mount{ - { - Source: "/dev/null", - Destination: deviceListAsVolumeMountsRoot, - }, - }, - expectedDevices: nil, - }, - { - description: "Discover 2 devices", - mounts: []Mount{ - { - Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"), - }, - { - Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"), - }, - }, - expectedDevices: []string{"GPU0", "GPU1"}, - }, - { - description: "Discover 2 devices with slashes in the name", - mounts: []Mount{ - { - Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0-MIG0/0/1"), - }, - { - Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1-MIG0/0/1"), - }, - }, - expectedDevices: []string{"GPU0-MIG0/0/1", "GPU1-MIG0/0/1"}, - }, - } - for _, tc := range tests { - t.Run(tc.description, func(t *testing.T) { - devices := getDevicesFromMounts(tc.mounts) - require.Equal(t, tc.expectedDevices, devices) - }) - } -} - func TestDeviceListSourcePriority(t *testing.T) { var tests = []struct { description string - mountDevices []Mount + mountDevices []specs.Mount envvarDevices string privileged bool acceptUnprivileged bool @@ -596,14 +519,14 @@ func TestDeviceListSourcePriority(t *testing.T) { }{ { description: "Mount devices, unprivileged, no accept unprivileged", - mountDevices: []Mount{ + mountDevices: []specs.Mount{ { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU0"), }, { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU1"), }, }, envvarDevices: "GPU2,GPU3", @@ -641,14 +564,14 @@ func TestDeviceListSourcePriority(t *testing.T) { }, { description: "Mount devices, unprivileged, accept unprivileged, no accept mounts", - mountDevices: []Mount{ + mountDevices: []specs.Mount{ { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU0"), }, { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU1"), }, }, envvarDevices: "GPU2,GPU3", @@ -659,14 +582,14 @@ func TestDeviceListSourcePriority(t *testing.T) { }, { description: "Mount devices, unprivileged, no accept unprivileged, no accept mounts", - mountDevices: []Mount{ + mountDevices: []specs.Mount{ { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU0"), }, { Source: "/dev/null", - Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"), + Destination: filepath.Join(image.DeviceListAsVolumeMountsRoot, "GPU1"), }, }, envvarDevices: "GPU2,GPU3", @@ -684,14 +607,15 @@ func TestDeviceListSourcePriority(t *testing.T) { image, _ := image.New( image.WithEnvMap( map[string]string{ - envNVVisibleDevices: tc.envvarDevices, + image.EnvVarNvidiaVisibleDevices: tc.envvarDevices, }, ), + image.WithMounts(tc.mountDevices), ) hookConfig, _ := getDefaultHookConfig() hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged hookConfig.AcceptDeviceListAsVolumeMounts = tc.acceptMounts - devices = getDevices(&hookConfig, image, tc.mountDevices, tc.privileged) + devices = getDevices(&hookConfig, image, tc.privileged) } // For all other tests, just grab the devices and check the results @@ -720,41 +644,41 @@ func TestGetDevicesFromEnvvar(t *testing.T) { { description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "", + image.EnvVarNvidiaVisibleDevices: "", }, }, { description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "void", + image.EnvVarNvidiaVisibleDevices: "void", }, }, { description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "none", + image.EnvVarNvidiaVisibleDevices: "none", }, expectedDevices: []string{""}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image", env: map[string]string{ - envNVVisibleDevices: gpuID, + image.EnvVarNvidiaVisibleDevices: gpuID, }, expectedDevices: []string{gpuID}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image", env: map[string]string{ - envNVVisibleDevices: gpuID, - envCUDAVersion: "legacy", + image.EnvVarNvidiaVisibleDevices: gpuID, + image.EnvVarCudaVersion: "legacy", }, expectedDevices: []string{gpuID}, }, { description: "empty env returns all for legacy image", env: map[string]string{ - envCUDAVersion: "legacy", + image.EnvVarCudaVersion: "legacy", }, expectedDevices: []string{"all"}, }, @@ -769,47 +693,47 @@ func TestGetDevicesFromEnvvar(t *testing.T) { { description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "", - envDockerResourceGPUs: anotherGPUID, + image.EnvVarNvidiaVisibleDevices: "", + envDockerResourceGPUs: anotherGPUID, }, }, { description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "void", - envDockerResourceGPUs: anotherGPUID, + image.EnvVarNvidiaVisibleDevices: "void", + envDockerResourceGPUs: anotherGPUID, }, }, { description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image", env: map[string]string{ - envNVVisibleDevices: "none", - envDockerResourceGPUs: anotherGPUID, + image.EnvVarNvidiaVisibleDevices: "none", + envDockerResourceGPUs: anotherGPUID, }, expectedDevices: []string{""}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image", env: map[string]string{ - envNVVisibleDevices: gpuID, - envDockerResourceGPUs: anotherGPUID, + image.EnvVarNvidiaVisibleDevices: gpuID, + envDockerResourceGPUs: anotherGPUID, }, expectedDevices: []string{gpuID}, }, { description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image", env: map[string]string{ - envNVVisibleDevices: gpuID, - envDockerResourceGPUs: anotherGPUID, - envCUDAVersion: "legacy", + image.EnvVarNvidiaVisibleDevices: gpuID, + envDockerResourceGPUs: anotherGPUID, + image.EnvVarCudaVersion: "legacy", }, expectedDevices: []string{gpuID}, }, { description: "empty env returns all for legacy image", env: map[string]string{ - envDockerResourceGPUs: anotherGPUID, - envCUDAVersion: "legacy", + envDockerResourceGPUs: anotherGPUID, + image.EnvVarCudaVersion: "legacy", }, expectedDevices: []string{"all"}, }, @@ -853,8 +777,8 @@ func TestGetDevicesFromEnvvar(t *testing.T) { description: "DOCKER_RESOURCE_GPUS set returns value for legacy image", swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ - envDockerResourceGPUs: gpuID, - envCUDAVersion: "legacy", + envDockerResourceGPUs: gpuID, + image.EnvVarCudaVersion: "legacy", }, expectedDevices: []string{gpuID}, }, @@ -870,8 +794,8 @@ func TestGetDevicesFromEnvvar(t *testing.T) { description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ - envNVVisibleDevices: gpuID, - envDockerResourceGPUs: anotherGPUID, + image.EnvVarNvidiaVisibleDevices: gpuID, + envDockerResourceGPUs: anotherGPUID, }, expectedDevices: []string{anotherGPUID}, }, @@ -879,7 +803,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present", swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"}, env: map[string]string{ - envNVVisibleDevices: gpuID, + image.EnvVarNvidiaVisibleDevices: gpuID, "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, }, expectedDevices: []string{anotherGPUID}, @@ -888,7 +812,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { description: "All available swarm resource envvars are selected and override NVIDIA_VISIBLE_DEVICES if present", swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, env: map[string]string{ - envNVVisibleDevices: gpuID, + image.EnvVarNvidiaVisibleDevices: gpuID, "DOCKER_RESOURCE_GPUS": thirdGPUID, "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, }, @@ -898,7 +822,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) { description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS override NVIDIA_VISIBLE_DEVICES if present", swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, env: map[string]string{ - envNVVisibleDevices: gpuID, + image.EnvVarNvidiaVisibleDevices: gpuID, "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, }, expectedDevices: []string{anotherGPUID}, @@ -931,7 +855,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is set for legacy image", env: map[string]string{ - envNVDriverCapabilities: "display,video", + image.EnvVarNvidiaDriverCapabilities: "display,video", }, legacyImage: true, supportedCapabilities: supportedCapabilities, @@ -940,7 +864,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is all for legacy image", env: map[string]string{ - envNVDriverCapabilities: "all", + image.EnvVarNvidiaDriverCapabilities: "all", }, legacyImage: true, supportedCapabilities: supportedCapabilities, @@ -949,7 +873,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is empty for legacy image", env: map[string]string{ - envNVDriverCapabilities: "", + image.EnvVarNvidiaDriverCapabilities: "", }, legacyImage: true, supportedCapabilities: supportedCapabilities, @@ -965,7 +889,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is set for modern image", env: map[string]string{ - envNVDriverCapabilities: "display,video", + image.EnvVarNvidiaDriverCapabilities: "display,video", }, legacyImage: false, supportedCapabilities: supportedCapabilities, @@ -981,7 +905,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is all for modern image", env: map[string]string{ - envNVDriverCapabilities: "all", + image.EnvVarNvidiaDriverCapabilities: "all", }, legacyImage: false, supportedCapabilities: supportedCapabilities, @@ -990,7 +914,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Env is empty for modern image", env: map[string]string{ - envNVDriverCapabilities: "", + image.EnvVarNvidiaDriverCapabilities: "", }, legacyImage: false, supportedCapabilities: supportedCapabilities, @@ -999,7 +923,7 @@ func TestGetDriverCapabilities(t *testing.T) { { description: "Invalid capabilities panic", env: map[string]string{ - envNVDriverCapabilities: "compute,utility", + image.EnvVarNvidiaDriverCapabilities: "compute,utility", }, supportedCapabilities: "not-compute,not-utility", expectedPanic: true, diff --git a/internal/config/image/builder.go b/internal/config/image/builder.go index da9025f0..332d017a 100644 --- a/internal/config/image/builder.go +++ b/internal/config/image/builder.go @@ -47,7 +47,7 @@ func New(opt ...Option) (CUDA, error) { // build creates a CUDA image from the builder. func (b builder) build() (CUDA, error) { if b.disableRequire { - b.env[envNVDisableRequire] = "true" + b.env[EnvVarNvidiaDisableRequire] = "true" } c := CUDA{ diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index 75e622b7..f6c0979b 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -28,12 +28,9 @@ import ( ) const ( - envCUDAVersion = "CUDA_VERSION" - envNVRequirePrefix = "NVIDIA_REQUIRE_" - envNVRequireCUDA = envNVRequirePrefix + "CUDA" - envNVRequireJetpack = envNVRequirePrefix + "JETPACK" - envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE" - envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" + DeviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices" + + volumeMountDevicePrefixCDI = "cdi/" ) // CUDA represents a CUDA image that can be used for GPU computing. This wraps @@ -80,8 +77,8 @@ func (i CUDA) HasEnvvar(key string) bool { // image is considered legacy if it has a CUDA_VERSION environment variable defined // and no NVIDIA_REQUIRE_CUDA environment variable defined. func (i CUDA) IsLegacy() bool { - legacyCudaVersion := i.env[envCUDAVersion] - cudaRequire := i.env[envNVRequireCUDA] + legacyCudaVersion := i.env[EnvVarCudaVersion] + cudaRequire := i.env[EnvVarNvidiaRequireCuda] return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0 } @@ -95,7 +92,7 @@ func (i CUDA) GetRequirements() ([]string, error) { // All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli var requirements []string for name, value := range i.env { - if strings.HasPrefix(name, envNVRequirePrefix) && !strings.HasPrefix(name, envNVRequireJetpack) { + if strings.HasPrefix(name, NvidiaRequirePrefix) && !strings.HasPrefix(name, EnvVarNvidiaRequireJetpack) { requirements = append(requirements, value) } } @@ -113,7 +110,7 @@ func (i CUDA) GetRequirements() ([]string, error) { // HasDisableRequire checks for the value of the NVIDIA_DISABLE_REQUIRE. If set // to a valid (true) boolean value this can be used to disable the requirement checks func (i CUDA) HasDisableRequire() bool { - if disable, exists := i.env[envNVDisableRequire]; exists { + if disable, exists := i.env[EnvVarNvidiaDisableRequire]; exists { // i.logger.Debugf("NVIDIA_DISABLE_REQUIRE=%v; skipping requirement checks", disable) d, _ := strconv.ParseBool(disable) return d @@ -157,7 +154,7 @@ func (i CUDA) DevicesFromEnvvars(envVars ...string) VisibleDevices { // GetDriverCapabilities returns the requested driver capabilities. func (i CUDA) GetDriverCapabilities() DriverCapabilities { - env := i.env[envNVDriverCapabilities] + env := i.env[EnvVarNvidiaDriverCapabilities] capabilities := make(DriverCapabilities) for _, c := range strings.Split(env, ",") { @@ -168,7 +165,7 @@ func (i CUDA) GetDriverCapabilities() DriverCapabilities { } func (i CUDA) legacyVersion() (string, error) { - cudaVersion := i.env[envCUDAVersion] + cudaVersion := i.env[EnvVarCudaVersion] majorMinor, err := parseMajorMinorVersion(cudaVersion) if err != nil { return "", fmt.Errorf("invalid CUDA version %v: %v", cudaVersion, err) @@ -202,7 +199,7 @@ func parseMajorMinorVersion(version string) (string, error) { // OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { var hasCDIdevice bool - for _, device := range i.DevicesFromEnvvars("NVIDIA_VISIBLE_DEVICES").List() { + for _, device := range i.VisibleDevicesFromEnvVar() { if !parser.IsQualifiedName(device) { return false } @@ -218,14 +215,28 @@ func (i CUDA) OnlyFullyQualifiedCDIDevices() bool { return hasCDIdevice } -const ( - deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices" -) +// VisibleDevicesFromEnvVar returns the set of visible devices requested through +// the NVIDIA_VISIBLE_DEVICES environment variable. +func (i CUDA) VisibleDevicesFromEnvVar() []string { + return i.DevicesFromEnvvars(EnvVarNvidiaVisibleDevices).List() +} + +// VisibleDevicesFromMounts returns the set of visible devices requested as mounts. +func (i CUDA) VisibleDevicesFromMounts() []string { + var devices []string + for _, device := range i.DevicesFromMounts() { + if strings.HasPrefix(device, volumeMountDevicePrefixCDI) { + continue + } + devices = append(devices, device) + } + return devices +} // DevicesFromMounts returns a list of device specified as mounts. // TODO: This should be merged with getDevicesFromMounts used in the NVIDIA Container Runtime func (i CUDA) DevicesFromMounts() []string { - root := filepath.Clean(deviceListAsVolumeMountsRoot) + root := filepath.Clean(DeviceListAsVolumeMountsRoot) seen := make(map[string]bool) var devices []string for _, m := range i.mounts { @@ -260,10 +271,10 @@ func (i CUDA) DevicesFromMounts() []string { func (i CUDA) CDIDevicesFromMounts() []string { var devices []string for _, mountDevice := range i.DevicesFromMounts() { - if !strings.HasPrefix(mountDevice, "cdi/") { + if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixCDI) { continue } - parts := strings.SplitN(strings.TrimPrefix(mountDevice, "cdi/"), "/", 3) + parts := strings.SplitN(strings.TrimPrefix(mountDevice, volumeMountDevicePrefixCDI), "/", 3) if len(parts) != 3 { continue } diff --git a/internal/config/image/cuda_image_test.go b/internal/config/image/cuda_image_test.go index 406ad443..9d9d5044 100644 --- a/internal/config/image/cuda_image_test.go +++ b/internal/config/image/cuda_image_test.go @@ -17,8 +17,10 @@ package image import ( + "path/filepath" "testing" + "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/require" ) @@ -130,3 +132,80 @@ func TestGetRequirements(t *testing.T) { } } + +func TestGetVisibleDevicesFromMounts(t *testing.T) { + var tests = []struct { + description string + mounts []specs.Mount + expectedDevices []string + }{ + { + description: "No mounts", + mounts: nil, + expectedDevices: nil, + }, + { + description: "Host path is not /dev/null", + mounts: []specs.Mount{ + { + Source: "/not/dev/null", + Destination: filepath.Join(DeviceListAsVolumeMountsRoot, "GPU0"), + }, + }, + expectedDevices: nil, + }, + { + description: "Container path is not prefixed by 'root'", + mounts: []specs.Mount{ + { + Source: "/dev/null", + Destination: filepath.Join("/other/prefix", "GPU0"), + }, + }, + expectedDevices: nil, + }, + { + description: "Container path is only 'root'", + mounts: []specs.Mount{ + { + Source: "/dev/null", + Destination: DeviceListAsVolumeMountsRoot, + }, + }, + expectedDevices: nil, + }, + { + description: "Discover 2 devices", + mounts: makeTestMounts("GPU0", "GPU1"), + expectedDevices: []string{"GPU0", "GPU1"}, + }, + { + description: "Discover 2 devices with slashes in the name", + mounts: makeTestMounts("GPU0-MIG0/0/1", "GPU1-MIG0/0/1"), + expectedDevices: []string{"GPU0-MIG0/0/1", "GPU1-MIG0/0/1"}, + }, + { + description: "cdi devices are ignored", + mounts: makeTestMounts("GPU0", "cdi/nvidia.com/gpu=all", "GPU1"), + expectedDevices: []string{"GPU0", "GPU1"}, + }, + } + for _, tc := range tests { + t.Run(tc.description, func(t *testing.T) { + image, _ := New(WithMounts(tc.mounts)) + require.Equal(t, tc.expectedDevices, image.VisibleDevicesFromMounts()) + }) + } +} + +func makeTestMounts(paths ...string) []specs.Mount { + var mounts []specs.Mount + for _, path := range paths { + mount := specs.Mount{ + Source: "/dev/null", + Destination: filepath.Join(DeviceListAsVolumeMountsRoot, path), + } + mounts = append(mounts, mount) + } + return mounts +} diff --git a/internal/config/image/envvars.go b/internal/config/image/envvars.go new file mode 100644 index 00000000..0789f22f --- /dev/null +++ b/internal/config/image/envvars.go @@ -0,0 +1,31 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package image + +const ( + EnvVarCudaVersion = "CUDA_VERSION" + EnvVarNvidiaDisableRequire = "NVIDIA_DISABLE_REQUIRE" + EnvVarNvidiaDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" + EnvVarNvidiaImexChannels = "NVIDIA_IMEX_CHANNELS" + EnvVarNvidiaMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES" + EnvVarNvidiaMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES" + EnvVarNvidiaRequireCuda = NvidiaRequirePrefix + "CUDA" + EnvVarNvidiaRequireJetpack = NvidiaRequirePrefix + "JETPACK" + EnvVarNvidiaVisibleDevices = "NVIDIA_VISIBLE_DEVICES" + + NvidiaRequirePrefix = "NVIDIA_REQUIRE_" +) diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index c5af4f88..90cd481b 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -90,11 +90,9 @@ func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.C } } - envDevices := container.DevicesFromEnvvars(visibleDevicesEnvvar) - var devices []string seen := make(map[string]bool) - for _, name := range envDevices.List() { + for _, name := range container.VisibleDevicesFromEnvVar() { if !parser.IsQualifiedName(name) { name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name) } diff --git a/internal/modifier/csv.go b/internal/modifier/csv.go index 0905d5da..a6129432 100644 --- a/internal/modifier/csv.go +++ b/internal/modifier/csv.go @@ -30,23 +30,16 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" ) -const ( - visibleDevicesEnvvar = "NVIDIA_VISIBLE_DEVICES" - visibleDevicesVoid = "void" - - nvidiaRequireJetpackEnvvar = "NVIDIA_REQUIRE_JETPACK" -) - // NewCSVModifier creates a modifier that applies modications to an OCI spec if required by the runtime wrapper. // The modifications are defined by CSV MountSpecs. -func NewCSVModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) { - if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { +func NewCSVModifier(logger logger.Interface, cfg *config.Config, container image.CUDA) (oci.SpecModifier, error) { + if devices := container.VisibleDevicesFromEnvVar(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } logger.Infof("Constructing modifier from config: %+v", *cfg) - if err := checkRequirements(logger, image); err != nil { + if err := checkRequirements(logger, container); err != nil { return nil, fmt.Errorf("requirements not met: %v", err) } @@ -55,7 +48,7 @@ func NewCSVModifier(logger logger.Interface, cfg *config.Config, image image.CUD return nil, fmt.Errorf("failed to get list of CSV files: %v", err) } - if image.Getenv(nvidiaRequireJetpackEnvvar) != "csv-mounts=all" { + if container.Getenv(image.EnvVarNvidiaRequireJetpack) != "csv-mounts=all" { csvFiles = csv.BaseFilesOnly(csvFiles) } diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index 5a7141b9..2c39d207 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -36,7 +36,7 @@ import ( // // If not devices are selected, no changes are made. func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) { - if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { + if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 { logger.Infof("No modification required; no devices requested") return nil, nil } diff --git a/internal/modifier/graphics.go b/internal/modifier/graphics.go index d9784d5e..31aa6ef3 100644 --- a/internal/modifier/graphics.go +++ b/internal/modifier/graphics.go @@ -29,8 +29,8 @@ import ( // NewGraphicsModifier constructs a modifier that injects graphics-related modifications into an OCI runtime specification. // The value of the NVIDIA_DRIVER_CAPABILITIES environment variable is checked to determine if this modification should be made. -func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver) (oci.SpecModifier, error) { - if required, reason := requiresGraphicsModifier(image); !required { +func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerImage image.CUDA, driver *root.Driver) (oci.SpecModifier, error) { + if required, reason := requiresGraphicsModifier(containerImage); !required { logger.Infof("No graphics modifier required: %v", reason) return nil, nil } @@ -50,7 +50,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image imag devRoot := driver.Root drmNodes, err := discover.NewDRMNodesDiscoverer( logger, - image.DevicesFromEnvvars(visibleDevicesEnvvar), + containerImage.DevicesFromEnvvars(image.EnvVarNvidiaVisibleDevices), devRoot, nvidiaCDIHookPath, ) @@ -67,7 +67,7 @@ func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image imag // requiresGraphicsModifier determines whether a graphics modifier is required. func requiresGraphicsModifier(cudaImage image.CUDA) (bool, string) { - if devices := cudaImage.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 { + if devices := cudaImage.VisibleDevicesFromEnvVar(); len(devices) == 0 { return false, "no devices requested" } diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index b98f4529..8ed9e5aa 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -24,6 +24,7 @@ import ( "github.com/NVIDIA/go-nvml/pkg/nvml" "tags.cncf.io/container-device-interface/pkg/cdi" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" "github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils" @@ -200,7 +201,7 @@ func (m *wrapper) GetCommonEdits() (*cdi.ContainerEdits, error) { if err != nil { return nil, err } - edits.Env = append(edits.Env, "NVIDIA_VISIBLE_DEVICES=void") + edits.Env = append(edits.Env, image.EnvVarNvidiaVisibleDevices+"=void") return edits, nil } From 2e6712d2bcd192bf2982fe8fe8734580f8031a14 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 14 Oct 2024 14:32:06 +0200 Subject: [PATCH 3/3] Allow IMEX channels to be requested as volume mounts This change allows IMEX channels to be requested using the volume mount mechanism. A mount from /dev/null to /var/run/nvidia-container-devices/imex/{{ .ChannelID }} is equivalent to including {{ .ChannelID }} in the NVIDIA_IMEX_CHANNELS envvironment variables. Signed-off-by: Evan Lezar --- .../container_config.go | 27 ++++++++++++------- cmd/nvidia-container-runtime-hook/main.go | 4 +-- internal/config/image/cuda_image.go | 25 ++++++++++++++--- internal/config/image/cuda_image_test.go | 5 ++++ 4 files changed, 46 insertions(+), 15 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/container_config.go b/cmd/nvidia-container-runtime-hook/container_config.go index e982c695..ac34fb93 100644 --- a/cmd/nvidia-container-runtime-hook/container_config.go +++ b/cmd/nvidia-container-runtime-hook/container_config.go @@ -21,7 +21,7 @@ type nvidiaConfig struct { Devices []string MigConfigDevices string MigMonitorDevices string - ImexChannels string + ImexChannels []string DriverCapabilities string // Requirements defines the requirements DSL for the container to run. // This is empty if no specific requirements are needed, or if requirements are @@ -197,12 +197,24 @@ func getMigDevices(image image.CUDA, envvar string) *string { return &devices } -func getImexChannels(i image.CUDA) *string { - if !i.HasEnvvar(image.EnvVarNvidiaImexChannels) { +func getImexChannels(hookConfig *HookConfig, image image.CUDA, privileged bool) []string { + // If enabled, try and get the device list from volume mounts first + if hookConfig.AcceptDeviceListAsVolumeMounts { + devices := image.ImexChannelsFromMounts() + if len(devices) > 0 { + return devices + } + } + devices := image.ImexChannelsFromEnvVar() + if len(devices) == 0 { return nil } - chans := i.Getenv(image.EnvVarNvidiaImexChannels) - return &chans + + if privileged || hookConfig.AcceptEnvvarUnprivileged { + return devices + } + + return nil } func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage bool) image.DriverCapabilities { @@ -257,10 +269,7 @@ func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, privileged bool) log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container") } - var imexChannels string - if c := getImexChannels(image); c != nil { - imexChannels = *c - } + imexChannels := getImexChannels(hookConfig, image, privileged) driverCapabilities := hookConfig.getDriverCapabilities(image, legacyImage).String() diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index ad3e208d..a0cfe3ec 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -129,8 +129,8 @@ func doPrestart() { if len(nvidia.MigMonitorDevices) > 0 { args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices)) } - if len(nvidia.ImexChannels) > 0 { - args = append(args, fmt.Sprintf("--imex-channel=%s", nvidia.ImexChannels)) + if imexString := strings.Join(nvidia.ImexChannels, ","); len(imexString) > 0 { + args = append(args, fmt.Sprintf("--imex-channel=%s", imexString)) } for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") { diff --git a/internal/config/image/cuda_image.go b/internal/config/image/cuda_image.go index f6c0979b..4298e634 100644 --- a/internal/config/image/cuda_image.go +++ b/internal/config/image/cuda_image.go @@ -30,7 +30,8 @@ import ( const ( DeviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices" - volumeMountDevicePrefixCDI = "cdi/" + volumeMountDevicePrefixCDI = "cdi/" + volumeMountDevicePrefixImex = "imex/" ) // CUDA represents a CUDA image that can be used for GPU computing. This wraps @@ -225,7 +226,10 @@ func (i CUDA) VisibleDevicesFromEnvVar() []string { func (i CUDA) VisibleDevicesFromMounts() []string { var devices []string for _, device := range i.DevicesFromMounts() { - if strings.HasPrefix(device, volumeMountDevicePrefixCDI) { + switch { + case strings.HasPrefix(device, volumeMountDevicePrefixCDI): + continue + case strings.HasPrefix(device, volumeMountDevicePrefixImex): continue } devices = append(devices, device) @@ -286,6 +290,19 @@ func (i CUDA) CDIDevicesFromMounts() []string { return devices } -func (i CUDA) IsEnabled(envvar string) bool { - return i.Getenv(envvar) == "enabled" +// ImexChannelsFromEnvVar returns the list of IMEX channels requested for the image. +func (i CUDA) ImexChannelsFromEnvVar() []string { + return i.DevicesFromEnvvars(EnvVarNvidiaImexChannels).List() +} + +// ImexChannelsFromMounts returns the list of IMEX channels requested for the image. +func (i CUDA) ImexChannelsFromMounts() []string { + var channels []string + for _, mountDevice := range i.DevicesFromMounts() { + if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixImex) { + continue + } + channels = append(channels, strings.TrimPrefix(mountDevice, volumeMountDevicePrefixImex)) + } + return channels } diff --git a/internal/config/image/cuda_image_test.go b/internal/config/image/cuda_image_test.go index 9d9d5044..39bf30b4 100644 --- a/internal/config/image/cuda_image_test.go +++ b/internal/config/image/cuda_image_test.go @@ -189,6 +189,11 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) { mounts: makeTestMounts("GPU0", "cdi/nvidia.com/gpu=all", "GPU1"), expectedDevices: []string{"GPU0", "GPU1"}, }, + { + description: "imex devices are ignored", + mounts: makeTestMounts("GPU0", "imex/0", "GPU1"), + expectedDevices: []string{"GPU0", "GPU1"}, + }, } for _, tc := range tests { t.Run(tc.description, func(t *testing.T) {