diff --git a/.gitignore b/.gitignore index 588f7091..e0c78c39 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,8 @@ dist /coverage.out* /test/output/ /nvidia-container-runtime +/nvidia-container-runtime-hook /nvidia-container-toolkit /nvidia-ctk /shared-* +/release-* \ No newline at end of file diff --git a/cmd/nvidia-container-runtime-hook/container_config.go b/cmd/nvidia-container-runtime-hook/container_config.go index 64d737ce..2168c5e2 100644 --- a/cmd/nvidia-container-runtime-hook/container_config.go +++ b/cmd/nvidia-container-runtime-hook/container_config.go @@ -13,8 +13,6 @@ import ( "golang.org/x/mod/semver" ) -var envSwarmGPU *string - const ( envCUDAVersion = "CUDA_VERSION" envNVRequirePrefix = "NVIDIA_REQUIRE_" @@ -165,13 +163,9 @@ func isPrivileged(s *Spec) bool { return false } -func getDevicesFromEnvvar(image image.CUDA) *string { - // Build a list of envvars to consider. - envVars := []string{envNVVisibleDevices} - if envSwarmGPU != nil { - // The Swarm envvar has higher precedence. - envVars = append([]string{*envSwarmGPU}, envVars...) - } +func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string { + // Build a list of envvars to consider. Note that the Swarm Resource envvars have a higher precedence. + envVars := append(swarmResourceEnvvars, envNVVisibleDevices) devices := image.DevicesFromEnvvars(envVars...) if len(devices) == 0 { @@ -230,7 +224,7 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil } // Fallback to reading from the environment variable if privileges are correct - devices := getDevicesFromEnvvar(image) + devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars()) if devices == nil { return nil } @@ -348,7 +342,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { } privileged := isPrivileged(s) - envSwarmGPU = hook.SwarmResource return containerConfig{ Pid: h.Pid, Rootfs: s.Root.Path, diff --git a/cmd/nvidia-container-runtime-hook/container_config_test.go b/cmd/nvidia-container-runtime-hook/container_config_test.go index f1660db1..59431d5c 100644 --- a/cmd/nvidia-container-runtime-hook/container_config_test.go +++ b/cmd/nvidia-container-runtime-hook/container_config_test.go @@ -69,7 +69,7 @@ func TestGetNvidiaConfig(t *testing.T) { description: "Legacy image, devices 'void', no capabilities, no requirements", env: map[string]string{ envCUDAVersion: "9.0", - envNVVisibleDevices: "", + envNVVisibleDevices: "void", }, privileged: false, expectedConfig: nil, @@ -226,7 +226,7 @@ func TestGetNvidiaConfig(t *testing.T) { description: "Modern image, devices 'void', no capabilities, no requirements", env: map[string]string{ envNVRequireCUDA: "cuda>=9.0", - envNVVisibleDevices: "", + envNVVisibleDevices: "void", }, privileged: false, expectedConfig: nil, @@ -449,6 +449,44 @@ func TestGetNvidiaConfig(t *testing.T) { DriverCapabilities: defaultDriverCapabilities.String(), }, }, + { + description: "Hook config set, swarmResource overrides device selection", + env: map[string]string{ + envNVVisibleDevices: "all", + "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", + }, + privileged: true, + hookConfig: &HookConfig{ + SwarmResource: func() *string { + s := "DOCKER_SWARM_RESOURCE" + return &s + }(), + SupportedDriverCapabilities: "video,display,utility,compute", + }, + expectedConfig: &nvidiaConfig{ + Devices: "GPU1,GPU2", + DriverCapabilities: defaultDriverCapabilities.String(), + }, + }, + { + description: "Hook config set, comma separated swarmResource is split and overrides device selection", + env: map[string]string{ + envNVVisibleDevices: "all", + "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", + }, + privileged: true, + hookConfig: &HookConfig{ + SwarmResource: func() *string { + s := "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE" + return &s + }(), + SupportedDriverCapabilities: "video,display,utility,compute", + }, + expectedConfig: &nvidiaConfig{ + Devices: "GPU1,GPU2", + DriverCapabilities: defaultDriverCapabilities.String(), + }, + }, } for _, tc := range tests { t.Run(tc.description, func(t *testing.T) { @@ -689,12 +727,13 @@ func TestGetDevicesFromEnvvar(t *testing.T) { envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS" gpuID := "GPU-12345" anotherGPUID := "GPU-67890" + thirdGPUID := "MIG-12345" var tests = []struct { - description string - envSwarmGPU *string - env map[string]string - expectedDevices *string + description string + swarmResourceEnvvars []string + env map[string]string + expectedDevices *string }{ { description: "empty env returns nil for non-legacy image", @@ -798,42 +837,42 @@ func TestGetDevicesFromEnvvar(t *testing.T) { // Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when // enabled { - description: "empty env returns nil for non-legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "empty env returns nil for non-legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, }, { - description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: "", }, }, { - description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: "void", }, }, { - description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: "none", }, expectedDevices: &empty, }, { - description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: gpuID, }, expectedDevices: &gpuID, }, { - description: "DOCKER_RESOURCE_GPUS set returns value for legacy image", - envSwarmGPU: &envDockerResourceGPUs, + description: "DOCKER_RESOURCE_GPUS set returns value for legacy image", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: gpuID, envCUDAVersion: "legacy", @@ -841,28 +880,55 @@ func TestGetDevicesFromEnvvar(t *testing.T) { expectedDevices: &gpuID, }, { - description: "DOCKER_RESOURCE_GPUS is selected if present", - envSwarmGPU: &envDockerResourceGPUs, + description: "DOCKER_RESOURCE_GPUS is selected if present", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envDockerResourceGPUs: anotherGPUID, }, expectedDevices: &anotherGPUID, }, { - description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", - envSwarmGPU: &envDockerResourceGPUs, + description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", + swarmResourceEnvvars: []string{envDockerResourceGPUs}, env: map[string]string{ envNVVisibleDevices: gpuID, envDockerResourceGPUs: anotherGPUID, }, expectedDevices: &anotherGPUID, }, + { + description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present", + swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"}, + env: map[string]string{ + envNVVisibleDevices: gpuID, + "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, + }, + expectedDevices: &anotherGPUID, + }, + { + description: "First available swarm resource envvar is selected and overrides NVIDIA_VISIBLE_DEVICES if present", + swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, + env: map[string]string{ + envNVVisibleDevices: gpuID, + "DOCKER_RESOURCE_GPUS": thirdGPUID, + "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, + }, + expectedDevices: &thirdGPUID, + }, + { + description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", + swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, + env: map[string]string{ + envNVVisibleDevices: gpuID, + "DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, + }, + expectedDevices: &anotherGPUID, + }, } for i, tc := range tests { t.Run(tc.description, func(t *testing.T) { - envSwarmGPU = tc.envSwarmGPU - devices := getDevicesFromEnvvar(image.CUDA(tc.env)) + devices := getDevicesFromEnvvar(image.CUDA(tc.env), tc.swarmResourceEnvvars) if tc.expectedDevices == nil { require.Nil(t, devices, "%d: %v", i, tc) return diff --git a/cmd/nvidia-container-runtime-hook/hook_config.go b/cmd/nvidia-container-runtime-hook/hook_config.go index 9913ccc5..16922c29 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config.go +++ b/cmd/nvidia-container-runtime-hook/hook_config.go @@ -5,6 +5,7 @@ import ( "os" "path" "reflect" + "strings" "github.com/BurntSushi/toml" "github.com/NVIDIA/nvidia-container-toolkit/internal/config" @@ -116,3 +117,22 @@ func (c HookConfig) getConfigOption(fieldName string) string { } return v } + +// getSwarmResourceEnvvars returns the swarm resource envvars for the config. +func (c *HookConfig) getSwarmResourceEnvvars() []string { + if c.SwarmResource == nil { + return nil + } + + candidates := strings.Split(*c.SwarmResource, ",") + + var envvars []string + for _, c := range candidates { + trimmed := strings.TrimSpace(c) + if len(trimmed) > 0 { + envvars = append(envvars, trimmed) + } + } + + return envvars +} diff --git a/cmd/nvidia-container-runtime-hook/hook_config_test.go b/cmd/nvidia-container-runtime-hook/hook_config_test.go index 4886664d..510e66a4 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config_test.go +++ b/cmd/nvidia-container-runtime-hook/hook_config_test.go @@ -103,3 +103,59 @@ func TestGetHookConfig(t *testing.T) { }) } } + +func TestGetSwarmResourceEnvvars(t *testing.T) { + testCases := []struct { + value string + expected []string + }{ + { + value: "nil", + expected: nil, + }, + { + value: "", + expected: nil, + }, + { + value: " ", + expected: nil, + }, + { + value: "single", + expected: []string{"single"}, + }, + { + value: "single ", + expected: []string{"single"}, + }, + { + value: "one,two", + expected: []string{"one", "two"}, + }, + { + value: "one ,two", + expected: []string{"one", "two"}, + }, + { + value: "one, two", + expected: []string{"one", "two"}, + }, + } + + for i, tc := range testCases { + t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { + c := &HookConfig{ + SwarmResource: func() *string { + if tc.value == "nil" { + return nil + } + return &tc.value + }(), + } + + envvars := c.getSwarmResourceEnvvars() + require.EqualValues(t, tc.expected, envvars) + }) + } +}