Add support for multiple swarm resource envvars

This change allows the swarm-resource config option to specify a
comma-separated list of environment variables instead of a single
environment variable.

The listed environment variables are checked in order: the first one
that is set is used and the remaining ones are ignored.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2022-09-30 05:05:56 +02:00
parent a4fa61d05d
commit f0bdfbebe4
4 changed files with 168 additions and 33 deletions

View File

@ -13,8 +13,6 @@ import (
"golang.org/x/mod/semver" "golang.org/x/mod/semver"
) )
var envSwarmGPU *string
const ( const (
envCUDAVersion = "CUDA_VERSION" envCUDAVersion = "CUDA_VERSION"
envNVRequirePrefix = "NVIDIA_REQUIRE_" envNVRequirePrefix = "NVIDIA_REQUIRE_"
@ -165,13 +163,9 @@ func isPrivileged(s *Spec) bool {
return false return false
} }
func getDevicesFromEnvvar(image image.CUDA) *string { func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string {
// Build a list of envvars to consider. // Build a list of envvars to consider. Note that the Swarm Resource envvars have a higher precedence.
envVars := []string{envNVVisibleDevices} envVars := append(swarmResourceEnvvars, envNVVisibleDevices)
if envSwarmGPU != nil {
// The Swarm envvar has higher precedence.
envVars = append([]string{*envSwarmGPU}, envVars...)
}
devices := image.DevicesFromEnvvars(envVars...) devices := image.DevicesFromEnvvars(envVars...)
if len(devices) == 0 { if len(devices) == 0 {
@ -230,7 +224,7 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil
} }
// Fallback to reading from the environment variable if privileges are correct // Fallback to reading from the environment variable if privileges are correct
devices := getDevicesFromEnvvar(image) devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars())
if devices == nil { if devices == nil {
return nil return nil
} }
@ -348,7 +342,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
} }
privileged := isPrivileged(s) privileged := isPrivileged(s)
envSwarmGPU = hook.SwarmResource
return containerConfig{ return containerConfig{
Pid: h.Pid, Pid: h.Pid,
Rootfs: s.Root.Path, Rootfs: s.Root.Path,

View File

@ -449,6 +449,44 @@ func TestGetNvidiaConfig(t *testing.T) {
DriverCapabilities: defaultDriverCapabilities.String(), DriverCapabilities: defaultDriverCapabilities.String(),
}, },
}, },
{
description: "Hook config set, swarmResource overrides device selection",
env: map[string]string{
envNVVisibleDevices: "all",
"DOCKER_SWARM_RESOURCE": "GPU1,GPU2",
},
privileged: true,
hookConfig: &HookConfig{
SwarmResource: func() *string {
s := "DOCKER_SWARM_RESOURCE"
return &s
}(),
SupportedDriverCapabilities: "video,display,utility,compute",
},
expectedConfig: &nvidiaConfig{
Devices: "GPU1,GPU2",
DriverCapabilities: defaultDriverCapabilities.String(),
},
},
{
description: "Hook config set, comma separated swarmResource is split and overrides device selection",
env: map[string]string{
envNVVisibleDevices: "all",
"DOCKER_SWARM_RESOURCE": "GPU1,GPU2",
},
privileged: true,
hookConfig: &HookConfig{
SwarmResource: func() *string {
s := "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE"
return &s
}(),
SupportedDriverCapabilities: "video,display,utility,compute",
},
expectedConfig: &nvidiaConfig{
Devices: "GPU1,GPU2",
DriverCapabilities: defaultDriverCapabilities.String(),
},
},
} }
for _, tc := range tests { for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) { t.Run(tc.description, func(t *testing.T) {
@ -689,12 +727,13 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS" envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
gpuID := "GPU-12345" gpuID := "GPU-12345"
anotherGPUID := "GPU-67890" anotherGPUID := "GPU-67890"
thirdGPUID := "MIG-12345"
var tests = []struct { var tests = []struct {
description string description string
envSwarmGPU *string swarmResourceEnvvars []string
env map[string]string env map[string]string
expectedDevices *string expectedDevices *string
}{ }{
{ {
description: "empty env returns nil for non-legacy image", description: "empty env returns nil for non-legacy image",
@ -798,42 +837,42 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when // Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
// enabled // enabled
{ {
description: "empty env returns nil for non-legacy image", description: "empty env returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs, swarmResourceEnvvars: []string{envDockerResourceGPUs},
}, },
{ {
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image", description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs, swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{ env: map[string]string{
envDockerResourceGPUs: "", envDockerResourceGPUs: "",
}, },
}, },
{ {
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image", description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs, swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{ env: map[string]string{
envDockerResourceGPUs: "void", envDockerResourceGPUs: "void",
}, },
}, },
{ {
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image", description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs, swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{ env: map[string]string{
envDockerResourceGPUs: "none", envDockerResourceGPUs: "none",
}, },
expectedDevices: &empty, expectedDevices: &empty,
}, },
{ {
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image", description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs, swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{ env: map[string]string{
envDockerResourceGPUs: gpuID, envDockerResourceGPUs: gpuID,
}, },
expectedDevices: &gpuID, expectedDevices: &gpuID,
}, },
{ {
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image", description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
envSwarmGPU: &envDockerResourceGPUs, swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{ env: map[string]string{
envDockerResourceGPUs: gpuID, envDockerResourceGPUs: gpuID,
envCUDAVersion: "legacy", envCUDAVersion: "legacy",
@ -841,28 +880,55 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
expectedDevices: &gpuID, expectedDevices: &gpuID,
}, },
{ {
description: "DOCKER_RESOURCE_GPUS is selected if present", description: "DOCKER_RESOURCE_GPUS is selected if present",
envSwarmGPU: &envDockerResourceGPUs, swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{ env: map[string]string{
envDockerResourceGPUs: anotherGPUID, envDockerResourceGPUs: anotherGPUID,
}, },
expectedDevices: &anotherGPUID, expectedDevices: &anotherGPUID,
}, },
{ {
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present", description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
envSwarmGPU: &envDockerResourceGPUs, swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{ env: map[string]string{
envNVVisibleDevices: gpuID, envNVVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID, envDockerResourceGPUs: anotherGPUID,
}, },
expectedDevices: &anotherGPUID, expectedDevices: &anotherGPUID,
}, },
{
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
envNVVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
{
description: "First available swarm resource envvar is selected and overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
envNVVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS": thirdGPUID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: &thirdGPUID,
},
{
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
envNVVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
} }
for i, tc := range tests { for i, tc := range tests {
t.Run(tc.description, func(t *testing.T) { t.Run(tc.description, func(t *testing.T) {
envSwarmGPU = tc.envSwarmGPU devices := getDevicesFromEnvvar(image.CUDA(tc.env), tc.swarmResourceEnvvars)
devices := getDevicesFromEnvvar(image.CUDA(tc.env))
if tc.expectedDevices == nil { if tc.expectedDevices == nil {
require.Nil(t, devices, "%d: %v", i, tc) require.Nil(t, devices, "%d: %v", i, tc)
return return

View File

@ -5,6 +5,7 @@ import (
"os" "os"
"path" "path"
"reflect" "reflect"
"strings"
"github.com/BurntSushi/toml" "github.com/BurntSushi/toml"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config"
@ -116,3 +117,22 @@ func (c HookConfig) getConfigOption(fieldName string) string {
} }
return v return v
} }
// getSwarmResourceEnvvars returns the names of the swarm resource envvars
// configured for the hook.
//
// The SwarmResource option is treated as a comma-separated list. Surrounding
// whitespace is trimmed from each entry and empty entries are dropped; the
// order of the remaining entries is preserved. A nil slice is returned if the
// receiver is nil, the option is unset, or no non-empty entries remain.
func (c *HookConfig) getSwarmResourceEnvvars() []string {
	// Guard against a nil receiver as well as an unset option so that a caller
	// holding a nil *HookConfig does not trigger a nil-pointer dereference.
	if c == nil || c.SwarmResource == nil {
		return nil
	}

	// envvars is deliberately left nil (not pre-sized) so that a value with no
	// usable entries yields nil rather than an empty, non-nil slice.
	var envvars []string
	for _, candidate := range strings.Split(*c.SwarmResource, ",") {
		if name := strings.TrimSpace(candidate); name != "" {
			envvars = append(envvars, name)
		}
	}
	return envvars
}

View File

@ -103,3 +103,59 @@ func TestGetHookConfig(t *testing.T) {
}) })
} }
} }
// TestGetSwarmResourceEnvvars checks that HookConfig.getSwarmResourceEnvvars
// splits the SwarmResource option on commas, trims whitespace around each
// entry, and returns nil when the option is unset or has no non-empty entries.
func TestGetSwarmResourceEnvvars(t *testing.T) {
	// unset is a sentinel value: a case using it leaves SwarmResource as a nil
	// pointer instead of pointing at a string.
	const unset = "nil"

	cases := []struct {
		value    string
		expected []string
	}{
		{value: unset},
		{value: ""},
		{value: " "},
		{value: "single", expected: []string{"single"}},
		{value: "single ", expected: []string{"single"}},
		{value: "one,two", expected: []string{"one", "two"}},
		{value: "one ,two", expected: []string{"one", "two"}},
		{value: "one, two", expected: []string{"one", "two"}},
	}

	for idx, tc := range cases {
		// Subtests are named after the index of the case in the table.
		t.Run(fmt.Sprintf("%d", idx), func(t *testing.T) {
			var swarmResource *string
			if tc.value != unset {
				swarmResource = &tc.value
			}
			config := &HookConfig{SwarmResource: swarmResource}

			require.EqualValues(t, tc.expected, config.getSwarmResourceEnvvars())
		})
	}
}