mirror of
				https://github.com/NVIDIA/nvidia-container-toolkit
				synced 2025-06-26 18:18:24 +00:00 
			
		
		
		
	Merge branch 'multiple-docker-swarm' into 'main'
Add support for multiple swarm resource envvars. See merge request nvidia/container-toolkit/container-toolkit!220.
This commit is contained in:
		
						commit
						38513d5a53
					
				
							
								
								
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -4,6 +4,8 @@ dist | ||||
| /coverage.out* | ||||
| /test/output/ | ||||
| /nvidia-container-runtime | ||||
| /nvidia-container-runtime-hook | ||||
| /nvidia-container-toolkit | ||||
| /nvidia-ctk | ||||
| /shared-* | ||||
| /release-* | ||||
| @ -13,8 +13,6 @@ import ( | ||||
| 	"golang.org/x/mod/semver" | ||||
| ) | ||||
| 
 | ||||
| var envSwarmGPU *string | ||||
| 
 | ||||
| const ( | ||||
| 	envCUDAVersion          = "CUDA_VERSION" | ||||
| 	envNVRequirePrefix      = "NVIDIA_REQUIRE_" | ||||
| @ -165,13 +163,9 @@ func isPrivileged(s *Spec) bool { | ||||
| 	return false | ||||
| } | ||||
| 
 | ||||
| func getDevicesFromEnvvar(image image.CUDA) *string { | ||||
| 	// Build a list of envvars to consider.
 | ||||
| 	envVars := []string{envNVVisibleDevices} | ||||
| 	if envSwarmGPU != nil { | ||||
| 		// The Swarm envvar has higher precedence.
 | ||||
| 		envVars = append([]string{*envSwarmGPU}, envVars...) | ||||
| 	} | ||||
| func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string { | ||||
| 	// Build a list of envvars to consider. Note that the Swarm Resource envvars have a higher precedence.
 | ||||
| 	envVars := append(swarmResourceEnvvars, envNVVisibleDevices) | ||||
| 
 | ||||
| 	devices := image.DevicesFromEnvvars(envVars...) | ||||
| 	if len(devices) == 0 { | ||||
| @ -230,7 +224,7 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil | ||||
| 	} | ||||
| 
 | ||||
| 	// Fallback to reading from the environment variable if privileges are correct
 | ||||
| 	devices := getDevicesFromEnvvar(image) | ||||
| 	devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars()) | ||||
| 	if devices == nil { | ||||
| 		return nil | ||||
| 	} | ||||
| @ -348,7 +342,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { | ||||
| 	} | ||||
| 
 | ||||
| 	privileged := isPrivileged(s) | ||||
| 	envSwarmGPU = hook.SwarmResource | ||||
| 	return containerConfig{ | ||||
| 		Pid:    h.Pid, | ||||
| 		Rootfs: s.Root.Path, | ||||
|  | ||||
| @ -69,7 +69,7 @@ func TestGetNvidiaConfig(t *testing.T) { | ||||
| 			description: "Legacy image, devices 'void', no capabilities, no requirements", | ||||
| 			env: map[string]string{ | ||||
| 				envCUDAVersion:      "9.0", | ||||
| 				envNVVisibleDevices: "", | ||||
| 				envNVVisibleDevices: "void", | ||||
| 			}, | ||||
| 			privileged:     false, | ||||
| 			expectedConfig: nil, | ||||
| @ -226,7 +226,7 @@ func TestGetNvidiaConfig(t *testing.T) { | ||||
| 			description: "Modern image, devices 'void', no capabilities, no requirements", | ||||
| 			env: map[string]string{ | ||||
| 				envNVRequireCUDA:    "cuda>=9.0", | ||||
| 				envNVVisibleDevices: "", | ||||
| 				envNVVisibleDevices: "void", | ||||
| 			}, | ||||
| 			privileged:     false, | ||||
| 			expectedConfig: nil, | ||||
| @ -449,6 +449,44 @@ func TestGetNvidiaConfig(t *testing.T) { | ||||
| 				DriverCapabilities: defaultDriverCapabilities.String(), | ||||
| 			}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description: "Hook config set, swarmResource overrides device selection", | ||||
| 			env: map[string]string{ | ||||
| 				envNVVisibleDevices:     "all", | ||||
| 				"DOCKER_SWARM_RESOURCE": "GPU1,GPU2", | ||||
| 			}, | ||||
| 			privileged: true, | ||||
| 			hookConfig: &HookConfig{ | ||||
| 				SwarmResource: func() *string { | ||||
| 					s := "DOCKER_SWARM_RESOURCE" | ||||
| 					return &s | ||||
| 				}(), | ||||
| 				SupportedDriverCapabilities: "video,display,utility,compute", | ||||
| 			}, | ||||
| 			expectedConfig: &nvidiaConfig{ | ||||
| 				Devices:            "GPU1,GPU2", | ||||
| 				DriverCapabilities: defaultDriverCapabilities.String(), | ||||
| 			}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description: "Hook config set, comma separated swarmResource is split and overrides device selection", | ||||
| 			env: map[string]string{ | ||||
| 				envNVVisibleDevices:     "all", | ||||
| 				"DOCKER_SWARM_RESOURCE": "GPU1,GPU2", | ||||
| 			}, | ||||
| 			privileged: true, | ||||
| 			hookConfig: &HookConfig{ | ||||
| 				SwarmResource: func() *string { | ||||
| 					s := "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE" | ||||
| 					return &s | ||||
| 				}(), | ||||
| 				SupportedDriverCapabilities: "video,display,utility,compute", | ||||
| 			}, | ||||
| 			expectedConfig: &nvidiaConfig{ | ||||
| 				Devices:            "GPU1,GPU2", | ||||
| 				DriverCapabilities: defaultDriverCapabilities.String(), | ||||
| 			}, | ||||
| 		}, | ||||
| 	} | ||||
| 	for _, tc := range tests { | ||||
| 		t.Run(tc.description, func(t *testing.T) { | ||||
| @ -689,12 +727,13 @@ func TestGetDevicesFromEnvvar(t *testing.T) { | ||||
| 	envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS" | ||||
| 	gpuID := "GPU-12345" | ||||
| 	anotherGPUID := "GPU-67890" | ||||
| 	thirdGPUID := "MIG-12345" | ||||
| 
 | ||||
| 	var tests = []struct { | ||||
| 		description     string | ||||
| 		envSwarmGPU     *string | ||||
| 		env             map[string]string | ||||
| 		expectedDevices *string | ||||
| 		description          string | ||||
| 		swarmResourceEnvvars []string | ||||
| 		env                  map[string]string | ||||
| 		expectedDevices      *string | ||||
| 	}{ | ||||
| 		{ | ||||
| 			description: "empty env returns nil for non-legacy image", | ||||
| @ -798,42 +837,42 @@ func TestGetDevicesFromEnvvar(t *testing.T) { | ||||
| 		// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
 | ||||
| 		// enabled
 | ||||
| 		{ | ||||
| 			description: "empty env returns nil for non-legacy image", | ||||
| 			envSwarmGPU: &envDockerResourceGPUs, | ||||
| 			description:          "empty env returns nil for non-legacy image", | ||||
| 			swarmResourceEnvvars: []string{envDockerResourceGPUs}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image", | ||||
| 			envSwarmGPU: &envDockerResourceGPUs, | ||||
| 			description:          "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image", | ||||
| 			swarmResourceEnvvars: []string{envDockerResourceGPUs}, | ||||
| 			env: map[string]string{ | ||||
| 				envDockerResourceGPUs: "", | ||||
| 			}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image", | ||||
| 			envSwarmGPU: &envDockerResourceGPUs, | ||||
| 			description:          "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image", | ||||
| 			swarmResourceEnvvars: []string{envDockerResourceGPUs}, | ||||
| 			env: map[string]string{ | ||||
| 				envDockerResourceGPUs: "void", | ||||
| 			}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image", | ||||
| 			envSwarmGPU: &envDockerResourceGPUs, | ||||
| 			description:          "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image", | ||||
| 			swarmResourceEnvvars: []string{envDockerResourceGPUs}, | ||||
| 			env: map[string]string{ | ||||
| 				envDockerResourceGPUs: "none", | ||||
| 			}, | ||||
| 			expectedDevices: &empty, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image", | ||||
| 			envSwarmGPU: &envDockerResourceGPUs, | ||||
| 			description:          "DOCKER_RESOURCE_GPUS set returns value for non-legacy image", | ||||
| 			swarmResourceEnvvars: []string{envDockerResourceGPUs}, | ||||
| 			env: map[string]string{ | ||||
| 				envDockerResourceGPUs: gpuID, | ||||
| 			}, | ||||
| 			expectedDevices: &gpuID, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description: "DOCKER_RESOURCE_GPUS set returns value for legacy image", | ||||
| 			envSwarmGPU: &envDockerResourceGPUs, | ||||
| 			description:          "DOCKER_RESOURCE_GPUS set returns value for legacy image", | ||||
| 			swarmResourceEnvvars: []string{envDockerResourceGPUs}, | ||||
| 			env: map[string]string{ | ||||
| 				envDockerResourceGPUs: gpuID, | ||||
| 				envCUDAVersion:        "legacy", | ||||
| @ -841,28 +880,55 @@ func TestGetDevicesFromEnvvar(t *testing.T) { | ||||
| 			expectedDevices: &gpuID, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description: "DOCKER_RESOURCE_GPUS is selected if present", | ||||
| 			envSwarmGPU: &envDockerResourceGPUs, | ||||
| 			description:          "DOCKER_RESOURCE_GPUS is selected if present", | ||||
| 			swarmResourceEnvvars: []string{envDockerResourceGPUs}, | ||||
| 			env: map[string]string{ | ||||
| 				envDockerResourceGPUs: anotherGPUID, | ||||
| 			}, | ||||
| 			expectedDevices: &anotherGPUID, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", | ||||
| 			envSwarmGPU: &envDockerResourceGPUs, | ||||
| 			description:          "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", | ||||
| 			swarmResourceEnvvars: []string{envDockerResourceGPUs}, | ||||
| 			env: map[string]string{ | ||||
| 				envNVVisibleDevices:   gpuID, | ||||
| 				envDockerResourceGPUs: anotherGPUID, | ||||
| 			}, | ||||
| 			expectedDevices: &anotherGPUID, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description:          "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present", | ||||
| 			swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"}, | ||||
| 			env: map[string]string{ | ||||
| 				envNVVisibleDevices:               gpuID, | ||||
| 				"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, | ||||
| 			}, | ||||
| 			expectedDevices: &anotherGPUID, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description:          "First available swarm resource envvar is selected and overrides NVIDIA_VISIBLE_DEVICES if present", | ||||
| 			swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, | ||||
| 			env: map[string]string{ | ||||
| 				envNVVisibleDevices:               gpuID, | ||||
| 				"DOCKER_RESOURCE_GPUS":            thirdGPUID, | ||||
| 				"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, | ||||
| 			}, | ||||
| 			expectedDevices: &thirdGPUID, | ||||
| 		}, | ||||
| 		{ | ||||
| 			description:          "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", | ||||
| 			swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"}, | ||||
| 			env: map[string]string{ | ||||
| 				envNVVisibleDevices:               gpuID, | ||||
| 				"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID, | ||||
| 			}, | ||||
| 			expectedDevices: &anotherGPUID, | ||||
| 		}, | ||||
| 	} | ||||
| 
 | ||||
| 	for i, tc := range tests { | ||||
| 		t.Run(tc.description, func(t *testing.T) { | ||||
| 			envSwarmGPU = tc.envSwarmGPU | ||||
| 			devices := getDevicesFromEnvvar(image.CUDA(tc.env)) | ||||
| 			devices := getDevicesFromEnvvar(image.CUDA(tc.env), tc.swarmResourceEnvvars) | ||||
| 			if tc.expectedDevices == nil { | ||||
| 				require.Nil(t, devices, "%d: %v", i, tc) | ||||
| 				return | ||||
|  | ||||
| @ -5,6 +5,7 @@ import ( | ||||
| 	"os" | ||||
| 	"path" | ||||
| 	"reflect" | ||||
| 	"strings" | ||||
| 
 | ||||
| 	"github.com/BurntSushi/toml" | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config" | ||||
| @ -116,3 +117,22 @@ func (c HookConfig) getConfigOption(fieldName string) string { | ||||
| 	} | ||||
| 	return v | ||||
| } | ||||
| 
 | ||||
| // getSwarmResourceEnvvars returns the swarm resource envvars for the config.
 | ||||
| func (c *HookConfig) getSwarmResourceEnvvars() []string { | ||||
| 	if c.SwarmResource == nil { | ||||
| 		return nil | ||||
| 	} | ||||
| 
 | ||||
| 	candidates := strings.Split(*c.SwarmResource, ",") | ||||
| 
 | ||||
| 	var envvars []string | ||||
| 	for _, c := range candidates { | ||||
| 		trimmed := strings.TrimSpace(c) | ||||
| 		if len(trimmed) > 0 { | ||||
| 			envvars = append(envvars, trimmed) | ||||
| 		} | ||||
| 	} | ||||
| 
 | ||||
| 	return envvars | ||||
| } | ||||
|  | ||||
| @ -103,3 +103,59 @@ func TestGetHookConfig(t *testing.T) { | ||||
| 		}) | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| func TestGetSwarmResourceEnvvars(t *testing.T) { | ||||
| 	testCases := []struct { | ||||
| 		value    string | ||||
| 		expected []string | ||||
| 	}{ | ||||
| 		{ | ||||
| 			value:    "nil", | ||||
| 			expected: nil, | ||||
| 		}, | ||||
| 		{ | ||||
| 			value:    "", | ||||
| 			expected: nil, | ||||
| 		}, | ||||
| 		{ | ||||
| 			value:    " ", | ||||
| 			expected: nil, | ||||
| 		}, | ||||
| 		{ | ||||
| 			value:    "single", | ||||
| 			expected: []string{"single"}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			value:    "single ", | ||||
| 			expected: []string{"single"}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			value:    "one,two", | ||||
| 			expected: []string{"one", "two"}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			value:    "one ,two", | ||||
| 			expected: []string{"one", "two"}, | ||||
| 		}, | ||||
| 		{ | ||||
| 			value:    "one, two", | ||||
| 			expected: []string{"one", "two"}, | ||||
| 		}, | ||||
| 	} | ||||
| 
 | ||||
| 	for i, tc := range testCases { | ||||
| 		t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { | ||||
| 			c := &HookConfig{ | ||||
| 				SwarmResource: func() *string { | ||||
| 					if tc.value == "nil" { | ||||
| 						return nil | ||||
| 					} | ||||
| 					return &tc.value | ||||
| 				}(), | ||||
| 			} | ||||
| 
 | ||||
| 			envvars := c.getSwarmResourceEnvvars() | ||||
| 			require.EqualValues(t, tc.expected, envvars) | ||||
| 		}) | ||||
| 	} | ||||
| } | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user