Add support for multiple swarm resource envvars

This change allows the swarm-resource config option to specify a
comma-separated list of environment variables instead of a single
environment variable.

The first environment variable matched is considered and other
environment variables are ignored.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2022-09-30 05:05:56 +02:00
parent a4fa61d05d
commit f0bdfbebe4
4 changed files with 168 additions and 33 deletions

View File

@ -13,8 +13,6 @@ import (
"golang.org/x/mod/semver"
)
var envSwarmGPU *string
const (
envCUDAVersion = "CUDA_VERSION"
envNVRequirePrefix = "NVIDIA_REQUIRE_"
@ -165,13 +163,9 @@ func isPrivileged(s *Spec) bool {
return false
}
func getDevicesFromEnvvar(image image.CUDA) *string {
// Build a list of envvars to consider.
envVars := []string{envNVVisibleDevices}
if envSwarmGPU != nil {
// The Swarm envvar has higher precedence.
envVars = append([]string{*envSwarmGPU}, envVars...)
}
func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string {
// Build a list of envvars to consider. Note that the Swarm Resource envvars have a higher precedence.
envVars := append(swarmResourceEnvvars, envNVVisibleDevices)
devices := image.DevicesFromEnvvars(envVars...)
if len(devices) == 0 {
@ -230,7 +224,7 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil
}
// Fallback to reading from the environment variable if privileges are correct
devices := getDevicesFromEnvvar(image)
devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars())
if devices == nil {
return nil
}
@ -348,7 +342,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
}
privileged := isPrivileged(s)
envSwarmGPU = hook.SwarmResource
return containerConfig{
Pid: h.Pid,
Rootfs: s.Root.Path,

View File

@ -449,6 +449,44 @@ func TestGetNvidiaConfig(t *testing.T) {
DriverCapabilities: defaultDriverCapabilities.String(),
},
},
{
description: "Hook config set, swarmResource overrides device selection",
env: map[string]string{
envNVVisibleDevices: "all",
"DOCKER_SWARM_RESOURCE": "GPU1,GPU2",
},
privileged: true,
hookConfig: &HookConfig{
SwarmResource: func() *string {
s := "DOCKER_SWARM_RESOURCE"
return &s
}(),
SupportedDriverCapabilities: "video,display,utility,compute",
},
expectedConfig: &nvidiaConfig{
Devices: "GPU1,GPU2",
DriverCapabilities: defaultDriverCapabilities.String(),
},
},
{
description: "Hook config set, comma separated swarmResource is split and overrides device selection",
env: map[string]string{
envNVVisibleDevices: "all",
"DOCKER_SWARM_RESOURCE": "GPU1,GPU2",
},
privileged: true,
hookConfig: &HookConfig{
SwarmResource: func() *string {
s := "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE"
return &s
}(),
SupportedDriverCapabilities: "video,display,utility,compute",
},
expectedConfig: &nvidiaConfig{
Devices: "GPU1,GPU2",
DriverCapabilities: defaultDriverCapabilities.String(),
},
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
@ -689,10 +727,11 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
gpuID := "GPU-12345"
anotherGPUID := "GPU-67890"
thirdGPUID := "MIG-12345"
var tests = []struct {
description string
envSwarmGPU *string
swarmResourceEnvvars []string
env map[string]string
expectedDevices *string
}{
@ -799,25 +838,25 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
// enabled
{
description: "empty env returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
swarmResourceEnvvars: []string{envDockerResourceGPUs},
},
{
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: "",
},
},
{
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: "void",
},
},
{
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: "none",
},
@ -825,7 +864,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
},
{
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: gpuID,
},
@ -833,7 +872,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
},
{
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
envSwarmGPU: &envDockerResourceGPUs,
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: gpuID,
envCUDAVersion: "legacy",
@ -842,7 +881,7 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
},
{
description: "DOCKER_RESOURCE_GPUS is selected if present",
envSwarmGPU: &envDockerResourceGPUs,
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
},
@ -850,19 +889,46 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
},
{
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
envSwarmGPU: &envDockerResourceGPUs,
swarmResourceEnvvars: []string{envDockerResourceGPUs},
env: map[string]string{
envNVVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
{
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
envNVVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
{
description: "First available swarm resource envvar is selected and overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
envNVVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS": thirdGPUID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: &thirdGPUID,
},
{
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
env: map[string]string{
envNVVisibleDevices: gpuID,
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
}
for i, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
envSwarmGPU = tc.envSwarmGPU
devices := getDevicesFromEnvvar(image.CUDA(tc.env))
devices := getDevicesFromEnvvar(image.CUDA(tc.env), tc.swarmResourceEnvvars)
if tc.expectedDevices == nil {
require.Nil(t, devices, "%d: %v", i, tc)
return

View File

@ -5,6 +5,7 @@ import (
"os"
"path"
"reflect"
"strings"
"github.com/BurntSushi/toml"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
@ -116,3 +117,22 @@ func (c HookConfig) getConfigOption(fieldName string) string {
}
return v
}
// getSwarmResourceEnvvars returns the swarm resource envvars for the config.
func (c *HookConfig) getSwarmResourceEnvvars() []string {
if c.SwarmResource == nil {
return nil
}
candidates := strings.Split(*c.SwarmResource, ",")
var envvars []string
for _, c := range candidates {
trimmed := strings.TrimSpace(c)
if len(trimmed) > 0 {
envvars = append(envvars, trimmed)
}
}
return envvars
}

View File

@ -103,3 +103,59 @@ func TestGetHookConfig(t *testing.T) {
})
}
}
func TestGetSwarmResourceEnvvars(t *testing.T) {
testCases := []struct {
value string
expected []string
}{
{
value: "nil",
expected: nil,
},
{
value: "",
expected: nil,
},
{
value: " ",
expected: nil,
},
{
value: "single",
expected: []string{"single"},
},
{
value: "single ",
expected: []string{"single"},
},
{
value: "one,two",
expected: []string{"one", "two"},
},
{
value: "one ,two",
expected: []string{"one", "two"},
},
{
value: "one, two",
expected: []string{"one", "two"},
},
}
for i, tc := range testCases {
t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
c := &HookConfig{
SwarmResource: func() *string {
if tc.value == "nil" {
return nil
}
return &tc.value
}(),
}
envvars := c.getSwarmResourceEnvvars()
require.EqualValues(t, tc.expected, envvars)
})
}
}