From f0bdfbebe4b96de53fc58e8bbd21ab3fbf38cb42 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 30 Sep 2022 05:05:56 +0200 Subject: [PATCH] Add support for multiple swarm resource envvars This change allows the swarm-resource config option to specify a comma-separated list of environment variables instead of a single environment variable. The first environment variable matched is considered and other environment variables are ignored. Signed-off-by: Evan Lezar --- .../container_config.go | 15 +-- .../container_config_test.go | 110 ++++++++++++++---- .../hook_config.go | 20 ++++ .../hook_config_test.go | 56 +++++++++ 4 files changed, 168 insertions(+), 33 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/container_config.go b/cmd/nvidia-container-runtime-hook/container_config.go index 64d737ce..2168c5e2 100644 --- a/cmd/nvidia-container-runtime-hook/container_config.go +++ b/cmd/nvidia-container-runtime-hook/container_config.go @@ -13,8 +13,6 @@ import ( "golang.org/x/mod/semver" ) -var envSwarmGPU *string - const ( envCUDAVersion = "CUDA_VERSION" envNVRequirePrefix = "NVIDIA_REQUIRE_" @@ -165,13 +163,9 @@ func isPrivileged(s *Spec) bool { return false } -func getDevicesFromEnvvar(image image.CUDA) *string { - // Build a list of envvars to consider. - envVars := []string{envNVVisibleDevices} - if envSwarmGPU != nil { - // The Swarm envvar has higher precedence. - envVars = append([]string{*envSwarmGPU}, envVars...) - } +func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string { + // Build a list of envvars to consider. Note that the Swarm Resource envvars have a higher precedence. + envVars := append(swarmResourceEnvvars, envNVVisibleDevices) devices := image.DevicesFromEnvvars(envVars...) if len(devices) == 0 { @@ -230,7 +224,7 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil } // Fallback to reading from the environment variable if privileges are correct - devices := getDevicesFromEnvvar(image) + devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars()) if devices == nil { return nil } @@ -348,7 +342,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { } privileged := isPrivileged(s) - envSwarmGPU = hook.SwarmResource return containerConfig{ Pid: h.Pid, Rootfs: s.Root.Path, diff --git a/cmd/nvidia-container-runtime-hook/container_config_test.go b/cmd/nvidia-container-runtime-hook/container_config_test.go index f1660db1..b01e219a 100644 --- a/cmd/nvidia-container-runtime-hook/container_config_test.go +++ b/cmd/nvidia-container-runtime-hook/container_config_test.go @@ -449,6 +449,44 @@ func TestGetNvidiaConfig(t *testing.T) { DriverCapabilities: defaultDriverCapabilities.String(), }, }, + { + description: "Hook config set, swarmResource overrides device selection", + env: map[string]string{ + envNVVisibleDevices: "all", + "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", + }, + privileged: true, + hookConfig: &HookConfig{ + SwarmResource: func() *string { + s := "DOCKER_SWARM_RESOURCE" + return &s + }(), + SupportedDriverCapabilities: "video,display,utility,compute", + }, + expectedConfig: &nvidiaConfig{ + Devices: "GPU1,GPU2", + DriverCapabilities: defaultDriverCapabilities.String(), + }, + }, + { + description: "Hook config set, comma separated swarmResource is split and overrides device selection", + env: map[string]string{ + envNVVisibleDevices: "all", + "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", + }, + privileged: true, + hookConfig: &HookConfig{ + SwarmResource: func() *string { + s := "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE" + return &s + }(), + SupportedDriverCapabilities: "video,display,utility,compute", + }, + expectedConfig: &nvidiaConfig{ + Devices: "GPU1,GPU2", + DriverCapabilities: defaultDriverCapabilities.String(), + }, + }, } for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { @@ -689,12 +727,13 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS" gpuID := "GPU-12345" anotherGPUID := "GPU-67890" + thirdGPUID := "MIG-12345" var tests = []struct { - description string - envSwarmGPU *string - env map[string]string - expectedDevices *string + description string + swarmResourceEnvvars []string + env map[string]string + expectedDevices *string }{ { description: "empty env returns nil for non-legacy image", @@ -798,42 +837,42 @@ func TestGetDevicesFromEnvvar(t *testing.T) { // Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when // enabled { - description: "empty env returns nil for non-legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "empty env returns nil for non-legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, }, { - description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: "", }, }, { - description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: "void", }, }, { - description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: "none", }, expectedDevices: &empty, }, { - description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: gpuID, }, expectedDevices: &gpuID, }, { - description: "DOCKER_RESOURCE_GPUS set returns value for legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "DOCKER_RESOURCE_GPUS set returns value for legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: gpuID, envCUDAVersion: "legacy", @@ -841,28 +880,55 @@ func TestGetDevicesFromEnvvar(t *testing.T) { expectedDevices: &gpuID, }, { - description: "DOCKER_RESOURCE_GPUS is selected if present", - envSwarmGPU: &envDockerResourceGPUs, + description: "DOCKER_RESOURCE_GPUS is selected if present", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: anotherGPUID, }, expectedDevices: &anotherGPUID, }, { - description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", - envSwarmGPU: &envDockerResourceGPUs, + description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envNVVisibleDevices: gpuID, envDockerResourceGPUs: anotherGPUID, }, expectedDevices: &anotherGPUID, }, + { + description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present", + swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"}, + env: map[string]string{ + envNVVisibleDevices: gpuID, + "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, + }, + expectedDevices: &anotherGPUID, + }, + { + description: "First available swarm resource envvar is selected and overrides NVIDIA_VISIBLE_DEVICES if present", + swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, + env: map[string]string{ + envNVVisibleDevices: gpuID, + "DOCKER_RESOURCE_GPUS": thirdGPUID, + "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, + }, + expectedDevices: &thirdGPUID, + }, + { + description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", + swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, + env: map[string]string{ + envNVVisibleDevices: gpuID, + "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, + }, + expectedDevices: &anotherGPUID, + }, } for i, tc := range tests { t.Run(tc.description, func(t *testing.T) { - envSwarmGPU = tc.envSwarmGPU - devices := getDevicesFromEnvvar(image.CUDA(tc.env)) + devices := getDevicesFromEnvvar(image.CUDA(tc.env), tc.swarmResourceEnvvars) if tc.expectedDevices == nil { require.Nil(t, devices, "%d: %v", i, tc) return diff --git a/cmd/nvidia-container-runtime-hook/hook_config.go b/cmd/nvidia-container-runtime-hook/hook_config.go index 9913ccc5..16922c29 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config.go +++ b/cmd/nvidia-container-runtime-hook/hook_config.go @@ -5,6 +5,7 @@ import ( "os" "path" "reflect" + "strings" "github.com/BurntSushi/toml" "github.com/NVIDIA/nvidia-container-toolkit/internal/config" @@ -116,3 +117,22 @@ func (c HookConfig) getConfigOption(fieldName string) string { } return v } + +// getSwarmResourceEnvvars returns the swarm resource envvars for the config. +func (c *HookConfig) getSwarmResourceEnvvars() []string { + if c.SwarmResource == nil { + return nil + } + + candidates := strings.Split(*c.SwarmResource, ",") + + var envvars []string + for _, c := range candidates { + trimmed := strings.TrimSpace(c) + if len(trimmed) > 0 { + envvars = append(envvars, trimmed) + } + } + + return envvars +} diff --git a/cmd/nvidia-container-runtime-hook/hook_config_test.go b/cmd/nvidia-container-runtime-hook/hook_config_test.go index 4886664d..510e66a4 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config_test.go +++ b/cmd/nvidia-container-runtime-hook/hook_config_test.go @@ -103,3 +103,59 @@ func TestGetHookConfig(t *testing.T) { }) } } + +func TestGetSwarmResourceEnvvars(t *testing.T) { + testCases := []struct { + value string + expected []string + }{ + { + value: "nil", + expected: nil, + }, + { + value: "", + expected: nil, + }, + { + value: " ", + expected: nil, + }, + { + value: "single", + expected: []string{"single"}, + }, + { + value: "single ", + expected: []string{"single"}, + }, + { + value: "one,two", + expected: []string{"one", "two"}, + }, + { + value: "one ,two", + expected: []string{"one", "two"}, + }, + { + value: "one, two", + expected: []string{"one", "two"}, + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + c := &HookConfig{ + SwarmResource: func() *string { + if tc.value == "nil" { + return nil + } + return &tc.value + }(), + } + + envvars := c.getSwarmResourceEnvvars() + require.EqualValues(t, tc.expected, envvars) + }) + } +}