mirror of
				https://github.com/NVIDIA/nvidia-container-toolkit
				synced 2025-06-26 18:18:24 +00:00 
			
		
		
		
	Add support for multiple swarm resource envvars
This change allows the swarm-resource config option to specify a comma-separated list of environment variables instead of a single environment variable. The first environment variable matched is considered and other environment variables are ignored. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
		
							parent
							
								
									a4fa61d05d
								
							
						
					
					
						commit
						f0bdfbebe4
					
				@ -13,8 +13,6 @@ import (
 | 
			
		||||
	"golang.org/x/mod/semver"
 | 
			
		||||
)
 | 
			
		||||
 | 
			
		||||
var envSwarmGPU *string
 | 
			
		||||
 | 
			
		||||
const (
 | 
			
		||||
	envCUDAVersion          = "CUDA_VERSION"
 | 
			
		||||
	envNVRequirePrefix      = "NVIDIA_REQUIRE_"
 | 
			
		||||
@ -165,13 +163,9 @@ func isPrivileged(s *Spec) bool {
 | 
			
		||||
	return false
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func getDevicesFromEnvvar(image image.CUDA) *string {
 | 
			
		||||
	// Build a list of envvars to consider.
 | 
			
		||||
	envVars := []string{envNVVisibleDevices}
 | 
			
		||||
	if envSwarmGPU != nil {
 | 
			
		||||
		// The Swarm envvar has higher precedence.
 | 
			
		||||
		envVars = append([]string{*envSwarmGPU}, envVars...)
 | 
			
		||||
	}
 | 
			
		||||
func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string {
 | 
			
		||||
	// Build a list of envvars to consider. Note that the Swarm Resource envvars have a higher precedence.
 | 
			
		||||
	envVars := append(swarmResourceEnvvars, envNVVisibleDevices)
 | 
			
		||||
 | 
			
		||||
	devices := image.DevicesFromEnvvars(envVars...)
 | 
			
		||||
	if len(devices) == 0 {
 | 
			
		||||
@ -230,7 +224,7 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	// Fallback to reading from the environment variable if privileges are correct
 | 
			
		||||
	devices := getDevicesFromEnvvar(image)
 | 
			
		||||
	devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars())
 | 
			
		||||
	if devices == nil {
 | 
			
		||||
		return nil
 | 
			
		||||
	}
 | 
			
		||||
@ -348,7 +342,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	privileged := isPrivileged(s)
 | 
			
		||||
	envSwarmGPU = hook.SwarmResource
 | 
			
		||||
	return containerConfig{
 | 
			
		||||
		Pid:    h.Pid,
 | 
			
		||||
		Rootfs: s.Root.Path,
 | 
			
		||||
 | 
			
		||||
@ -449,6 +449,44 @@ func TestGetNvidiaConfig(t *testing.T) {
 | 
			
		||||
				DriverCapabilities: defaultDriverCapabilities.String(),
 | 
			
		||||
			},
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description: "Hook config set, swarmResource overrides device selection",
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envNVVisibleDevices:     "all",
 | 
			
		||||
				"DOCKER_SWARM_RESOURCE": "GPU1,GPU2",
 | 
			
		||||
			},
 | 
			
		||||
			privileged: true,
 | 
			
		||||
			hookConfig: &HookConfig{
 | 
			
		||||
				SwarmResource: func() *string {
 | 
			
		||||
					s := "DOCKER_SWARM_RESOURCE"
 | 
			
		||||
					return &s
 | 
			
		||||
				}(),
 | 
			
		||||
				SupportedDriverCapabilities: "video,display,utility,compute",
 | 
			
		||||
			},
 | 
			
		||||
			expectedConfig: &nvidiaConfig{
 | 
			
		||||
				Devices:            "GPU1,GPU2",
 | 
			
		||||
				DriverCapabilities: defaultDriverCapabilities.String(),
 | 
			
		||||
			},
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description: "Hook config set, comma separated swarmResource is split and overrides device selection",
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envNVVisibleDevices:     "all",
 | 
			
		||||
				"DOCKER_SWARM_RESOURCE": "GPU1,GPU2",
 | 
			
		||||
			},
 | 
			
		||||
			privileged: true,
 | 
			
		||||
			hookConfig: &HookConfig{
 | 
			
		||||
				SwarmResource: func() *string {
 | 
			
		||||
					s := "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE"
 | 
			
		||||
					return &s
 | 
			
		||||
				}(),
 | 
			
		||||
				SupportedDriverCapabilities: "video,display,utility,compute",
 | 
			
		||||
			},
 | 
			
		||||
			expectedConfig: &nvidiaConfig{
 | 
			
		||||
				Devices:            "GPU1,GPU2",
 | 
			
		||||
				DriverCapabilities: defaultDriverCapabilities.String(),
 | 
			
		||||
			},
 | 
			
		||||
		},
 | 
			
		||||
	}
 | 
			
		||||
	for _, tc := range tests {
 | 
			
		||||
		t.Run(tc.description, func(t *testing.T) {
 | 
			
		||||
@ -689,12 +727,13 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
 | 
			
		||||
	envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
 | 
			
		||||
	gpuID := "GPU-12345"
 | 
			
		||||
	anotherGPUID := "GPU-67890"
 | 
			
		||||
	thirdGPUID := "MIG-12345"
 | 
			
		||||
 | 
			
		||||
	var tests = []struct {
 | 
			
		||||
		description     string
 | 
			
		||||
		envSwarmGPU     *string
 | 
			
		||||
		env             map[string]string
 | 
			
		||||
		expectedDevices *string
 | 
			
		||||
		description          string
 | 
			
		||||
		swarmResourceEnvvars []string
 | 
			
		||||
		env                  map[string]string
 | 
			
		||||
		expectedDevices      *string
 | 
			
		||||
	}{
 | 
			
		||||
		{
 | 
			
		||||
			description: "empty env returns nil for non-legacy image",
 | 
			
		||||
@ -798,42 +837,42 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
 | 
			
		||||
		// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
 | 
			
		||||
		// enabled
 | 
			
		||||
		{
 | 
			
		||||
			description: "empty env returns nil for non-legacy image",
 | 
			
		||||
			envSwarmGPU: &envDockerResourceGPUs,
 | 
			
		||||
			description:          "empty env returns nil for non-legacy image",
 | 
			
		||||
			swarmResourceEnvvars: []string{envDockerResourceGPUs},
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
 | 
			
		||||
			envSwarmGPU: &envDockerResourceGPUs,
 | 
			
		||||
			description:          "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
 | 
			
		||||
			swarmResourceEnvvars: []string{envDockerResourceGPUs},
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envDockerResourceGPUs: "",
 | 
			
		||||
			},
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
 | 
			
		||||
			envSwarmGPU: &envDockerResourceGPUs,
 | 
			
		||||
			description:          "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
 | 
			
		||||
			swarmResourceEnvvars: []string{envDockerResourceGPUs},
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envDockerResourceGPUs: "void",
 | 
			
		||||
			},
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
 | 
			
		||||
			envSwarmGPU: &envDockerResourceGPUs,
 | 
			
		||||
			description:          "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
 | 
			
		||||
			swarmResourceEnvvars: []string{envDockerResourceGPUs},
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envDockerResourceGPUs: "none",
 | 
			
		||||
			},
 | 
			
		||||
			expectedDevices: &empty,
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
 | 
			
		||||
			envSwarmGPU: &envDockerResourceGPUs,
 | 
			
		||||
			description:          "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
 | 
			
		||||
			swarmResourceEnvvars: []string{envDockerResourceGPUs},
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envDockerResourceGPUs: gpuID,
 | 
			
		||||
			},
 | 
			
		||||
			expectedDevices: &gpuID,
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
 | 
			
		||||
			envSwarmGPU: &envDockerResourceGPUs,
 | 
			
		||||
			description:          "DOCKER_RESOURCE_GPUS set returns value for legacy image",
 | 
			
		||||
			swarmResourceEnvvars: []string{envDockerResourceGPUs},
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envDockerResourceGPUs: gpuID,
 | 
			
		||||
				envCUDAVersion:        "legacy",
 | 
			
		||||
@ -841,28 +880,55 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
 | 
			
		||||
			expectedDevices: &gpuID,
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description: "DOCKER_RESOURCE_GPUS is selected if present",
 | 
			
		||||
			envSwarmGPU: &envDockerResourceGPUs,
 | 
			
		||||
			description:          "DOCKER_RESOURCE_GPUS is selected if present",
 | 
			
		||||
			swarmResourceEnvvars: []string{envDockerResourceGPUs},
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envDockerResourceGPUs: anotherGPUID,
 | 
			
		||||
			},
 | 
			
		||||
			expectedDevices: &anotherGPUID,
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
 | 
			
		||||
			envSwarmGPU: &envDockerResourceGPUs,
 | 
			
		||||
			description:          "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
 | 
			
		||||
			swarmResourceEnvvars: []string{envDockerResourceGPUs},
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envNVVisibleDevices:   gpuID,
 | 
			
		||||
				envDockerResourceGPUs: anotherGPUID,
 | 
			
		||||
			},
 | 
			
		||||
			expectedDevices: &anotherGPUID,
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description:          "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present",
 | 
			
		||||
			swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"},
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envNVVisibleDevices:               gpuID,
 | 
			
		||||
				"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
 | 
			
		||||
			},
 | 
			
		||||
			expectedDevices: &anotherGPUID,
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description:          "First available swarm resource envvar is selected and overrides NVIDIA_VISIBLE_DEVICES if present",
 | 
			
		||||
			swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envNVVisibleDevices:               gpuID,
 | 
			
		||||
				"DOCKER_RESOURCE_GPUS":            thirdGPUID,
 | 
			
		||||
				"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
 | 
			
		||||
			},
 | 
			
		||||
			expectedDevices: &thirdGPUID,
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			description:          "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
 | 
			
		||||
			swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
 | 
			
		||||
			env: map[string]string{
 | 
			
		||||
				envNVVisibleDevices:               gpuID,
 | 
			
		||||
				"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
 | 
			
		||||
			},
 | 
			
		||||
			expectedDevices: &anotherGPUID,
 | 
			
		||||
		},
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	for i, tc := range tests {
 | 
			
		||||
		t.Run(tc.description, func(t *testing.T) {
 | 
			
		||||
			envSwarmGPU = tc.envSwarmGPU
 | 
			
		||||
			devices := getDevicesFromEnvvar(image.CUDA(tc.env))
 | 
			
		||||
			devices := getDevicesFromEnvvar(image.CUDA(tc.env), tc.swarmResourceEnvvars)
 | 
			
		||||
			if tc.expectedDevices == nil {
 | 
			
		||||
				require.Nil(t, devices, "%d: %v", i, tc)
 | 
			
		||||
				return
 | 
			
		||||
 | 
			
		||||
@ -5,6 +5,7 @@ import (
 | 
			
		||||
	"os"
 | 
			
		||||
	"path"
 | 
			
		||||
	"reflect"
 | 
			
		||||
	"strings"
 | 
			
		||||
 | 
			
		||||
	"github.com/BurntSushi/toml"
 | 
			
		||||
	"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
 | 
			
		||||
@ -116,3 +117,22 @@ func (c HookConfig) getConfigOption(fieldName string) string {
 | 
			
		||||
	}
 | 
			
		||||
	return v
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
// getSwarmResourceEnvvars returns the swarm resource envvars for the config.
 | 
			
		||||
func (c *HookConfig) getSwarmResourceEnvvars() []string {
 | 
			
		||||
	if c.SwarmResource == nil {
 | 
			
		||||
		return nil
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	candidates := strings.Split(*c.SwarmResource, ",")
 | 
			
		||||
 | 
			
		||||
	var envvars []string
 | 
			
		||||
	for _, c := range candidates {
 | 
			
		||||
		trimmed := strings.TrimSpace(c)
 | 
			
		||||
		if len(trimmed) > 0 {
 | 
			
		||||
			envvars = append(envvars, trimmed)
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return envvars
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -103,3 +103,59 @@ func TestGetHookConfig(t *testing.T) {
 | 
			
		||||
		})
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
func TestGetSwarmResourceEnvvars(t *testing.T) {
 | 
			
		||||
	testCases := []struct {
 | 
			
		||||
		value    string
 | 
			
		||||
		expected []string
 | 
			
		||||
	}{
 | 
			
		||||
		{
 | 
			
		||||
			value:    "nil",
 | 
			
		||||
			expected: nil,
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			value:    "",
 | 
			
		||||
			expected: nil,
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			value:    " ",
 | 
			
		||||
			expected: nil,
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			value:    "single",
 | 
			
		||||
			expected: []string{"single"},
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			value:    "single ",
 | 
			
		||||
			expected: []string{"single"},
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			value:    "one,two",
 | 
			
		||||
			expected: []string{"one", "two"},
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			value:    "one ,two",
 | 
			
		||||
			expected: []string{"one", "two"},
 | 
			
		||||
		},
 | 
			
		||||
		{
 | 
			
		||||
			value:    "one, two",
 | 
			
		||||
			expected: []string{"one", "two"},
 | 
			
		||||
		},
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	for i, tc := range testCases {
 | 
			
		||||
		t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
 | 
			
		||||
			c := &HookConfig{
 | 
			
		||||
				SwarmResource: func() *string {
 | 
			
		||||
					if tc.value == "nil" {
 | 
			
		||||
						return nil
 | 
			
		||||
					}
 | 
			
		||||
					return &tc.value
 | 
			
		||||
				}(),
 | 
			
		||||
			}
 | 
			
		||||
 | 
			
		||||
			envvars := c.getSwarmResourceEnvvars()
 | 
			
		||||
			require.EqualValues(t, tc.expected, envvars)
 | 
			
		||||
		})
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user