mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-22 08:18:32 +00:00
Add support for multiple swarm resource envvars
This change allows the swarm-resource config option to specify a comma-separated list of environment variables instead of a single environment variable. The first environment variable matched is considered and other environment variables are ignored. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
parent
a4fa61d05d
commit
f0bdfbebe4
@ -13,8 +13,6 @@ import (
|
|||||||
"golang.org/x/mod/semver"
|
"golang.org/x/mod/semver"
|
||||||
)
|
)
|
||||||
|
|
||||||
var envSwarmGPU *string
|
|
||||||
|
|
||||||
const (
|
const (
|
||||||
envCUDAVersion = "CUDA_VERSION"
|
envCUDAVersion = "CUDA_VERSION"
|
||||||
envNVRequirePrefix = "NVIDIA_REQUIRE_"
|
envNVRequirePrefix = "NVIDIA_REQUIRE_"
|
||||||
@ -165,13 +163,9 @@ func isPrivileged(s *Spec) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
func getDevicesFromEnvvar(image image.CUDA) *string {
|
func getDevicesFromEnvvar(image image.CUDA, swarmResourceEnvvars []string) *string {
|
||||||
// Build a list of envvars to consider.
|
// Build a list of envvars to consider. Note that the Swarm Resource envvars have a higher precedence.
|
||||||
envVars := []string{envNVVisibleDevices}
|
envVars := append(swarmResourceEnvvars, envNVVisibleDevices)
|
||||||
if envSwarmGPU != nil {
|
|
||||||
// The Swarm envvar has higher precedence.
|
|
||||||
envVars = append([]string{*envSwarmGPU}, envVars...)
|
|
||||||
}
|
|
||||||
|
|
||||||
devices := image.DevicesFromEnvvars(envVars...)
|
devices := image.DevicesFromEnvvars(envVars...)
|
||||||
if len(devices) == 0 {
|
if len(devices) == 0 {
|
||||||
@ -230,7 +224,7 @@ func getDevices(hookConfig *HookConfig, image image.CUDA, mounts []Mount, privil
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Fallback to reading from the environment variable if privileges are correct
|
// Fallback to reading from the environment variable if privileges are correct
|
||||||
devices := getDevicesFromEnvvar(image)
|
devices := getDevicesFromEnvvar(image, hookConfig.getSwarmResourceEnvvars())
|
||||||
if devices == nil {
|
if devices == nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -348,7 +342,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
privileged := isPrivileged(s)
|
privileged := isPrivileged(s)
|
||||||
envSwarmGPU = hook.SwarmResource
|
|
||||||
return containerConfig{
|
return containerConfig{
|
||||||
Pid: h.Pid,
|
Pid: h.Pid,
|
||||||
Rootfs: s.Root.Path,
|
Rootfs: s.Root.Path,
|
||||||
|
@ -449,6 +449,44 @@ func TestGetNvidiaConfig(t *testing.T) {
|
|||||||
DriverCapabilities: defaultDriverCapabilities.String(),
|
DriverCapabilities: defaultDriverCapabilities.String(),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
description: "Hook config set, swarmResource overrides device selection",
|
||||||
|
env: map[string]string{
|
||||||
|
envNVVisibleDevices: "all",
|
||||||
|
"DOCKER_SWARM_RESOURCE": "GPU1,GPU2",
|
||||||
|
},
|
||||||
|
privileged: true,
|
||||||
|
hookConfig: &HookConfig{
|
||||||
|
SwarmResource: func() *string {
|
||||||
|
s := "DOCKER_SWARM_RESOURCE"
|
||||||
|
return &s
|
||||||
|
}(),
|
||||||
|
SupportedDriverCapabilities: "video,display,utility,compute",
|
||||||
|
},
|
||||||
|
expectedConfig: &nvidiaConfig{
|
||||||
|
Devices: "GPU1,GPU2",
|
||||||
|
DriverCapabilities: defaultDriverCapabilities.String(),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
description: "Hook config set, comma separated swarmResource is split and overrides device selection",
|
||||||
|
env: map[string]string{
|
||||||
|
envNVVisibleDevices: "all",
|
||||||
|
"DOCKER_SWARM_RESOURCE": "GPU1,GPU2",
|
||||||
|
},
|
||||||
|
privileged: true,
|
||||||
|
hookConfig: &HookConfig{
|
||||||
|
SwarmResource: func() *string {
|
||||||
|
s := "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE"
|
||||||
|
return &s
|
||||||
|
}(),
|
||||||
|
SupportedDriverCapabilities: "video,display,utility,compute",
|
||||||
|
},
|
||||||
|
expectedConfig: &nvidiaConfig{
|
||||||
|
Devices: "GPU1,GPU2",
|
||||||
|
DriverCapabilities: defaultDriverCapabilities.String(),
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
for _, tc := range tests {
|
for _, tc := range tests {
|
||||||
t.Run(tc.description, func(t *testing.T) {
|
t.Run(tc.description, func(t *testing.T) {
|
||||||
@ -689,12 +727,13 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
|
|||||||
envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
|
envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
|
||||||
gpuID := "GPU-12345"
|
gpuID := "GPU-12345"
|
||||||
anotherGPUID := "GPU-67890"
|
anotherGPUID := "GPU-67890"
|
||||||
|
thirdGPUID := "MIG-12345"
|
||||||
|
|
||||||
var tests = []struct {
|
var tests = []struct {
|
||||||
description string
|
description string
|
||||||
envSwarmGPU *string
|
swarmResourceEnvvars []string
|
||||||
env map[string]string
|
env map[string]string
|
||||||
expectedDevices *string
|
expectedDevices *string
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
description: "empty env returns nil for non-legacy image",
|
description: "empty env returns nil for non-legacy image",
|
||||||
@ -798,42 +837,42 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
|
|||||||
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
|
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
|
||||||
// enabled
|
// enabled
|
||||||
{
|
{
|
||||||
description: "empty env returns nil for non-legacy image",
|
description: "empty env returns nil for non-legacy image",
|
||||||
envSwarmGPU: &envDockerResourceGPUs,
|
swarmResourceEnvvars: []string{envDockerResourceGPUs},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
|
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
|
||||||
envSwarmGPU: &envDockerResourceGPUs,
|
swarmResourceEnvvars: []string{envDockerResourceGPUs},
|
||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envDockerResourceGPUs: "",
|
envDockerResourceGPUs: "",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
|
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
|
||||||
envSwarmGPU: &envDockerResourceGPUs,
|
swarmResourceEnvvars: []string{envDockerResourceGPUs},
|
||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envDockerResourceGPUs: "void",
|
envDockerResourceGPUs: "void",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
|
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
|
||||||
envSwarmGPU: &envDockerResourceGPUs,
|
swarmResourceEnvvars: []string{envDockerResourceGPUs},
|
||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envDockerResourceGPUs: "none",
|
envDockerResourceGPUs: "none",
|
||||||
},
|
},
|
||||||
expectedDevices: &empty,
|
expectedDevices: &empty,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
|
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
|
||||||
envSwarmGPU: &envDockerResourceGPUs,
|
swarmResourceEnvvars: []string{envDockerResourceGPUs},
|
||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envDockerResourceGPUs: gpuID,
|
envDockerResourceGPUs: gpuID,
|
||||||
},
|
},
|
||||||
expectedDevices: &gpuID,
|
expectedDevices: &gpuID,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
|
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
|
||||||
envSwarmGPU: &envDockerResourceGPUs,
|
swarmResourceEnvvars: []string{envDockerResourceGPUs},
|
||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envDockerResourceGPUs: gpuID,
|
envDockerResourceGPUs: gpuID,
|
||||||
envCUDAVersion: "legacy",
|
envCUDAVersion: "legacy",
|
||||||
@ -841,28 +880,55 @@ func TestGetDevicesFromEnvvar(t *testing.T) {
|
|||||||
expectedDevices: &gpuID,
|
expectedDevices: &gpuID,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "DOCKER_RESOURCE_GPUS is selected if present",
|
description: "DOCKER_RESOURCE_GPUS is selected if present",
|
||||||
envSwarmGPU: &envDockerResourceGPUs,
|
swarmResourceEnvvars: []string{envDockerResourceGPUs},
|
||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envDockerResourceGPUs: anotherGPUID,
|
envDockerResourceGPUs: anotherGPUID,
|
||||||
},
|
},
|
||||||
expectedDevices: &anotherGPUID,
|
expectedDevices: &anotherGPUID,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
|
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
|
||||||
envSwarmGPU: &envDockerResourceGPUs,
|
swarmResourceEnvvars: []string{envDockerResourceGPUs},
|
||||||
env: map[string]string{
|
env: map[string]string{
|
||||||
envNVVisibleDevices: gpuID,
|
envNVVisibleDevices: gpuID,
|
||||||
envDockerResourceGPUs: anotherGPUID,
|
envDockerResourceGPUs: anotherGPUID,
|
||||||
},
|
},
|
||||||
expectedDevices: &anotherGPUID,
|
expectedDevices: &anotherGPUID,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL overrides NVIDIA_VISIBLE_DEVICES if present",
|
||||||
|
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS_ADDITIONAL"},
|
||||||
|
env: map[string]string{
|
||||||
|
envNVVisibleDevices: gpuID,
|
||||||
|
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
|
||||||
|
},
|
||||||
|
expectedDevices: &anotherGPUID,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
description: "First available swarm resource envvar is selected and overrides NVIDIA_VISIBLE_DEVICES if present",
|
||||||
|
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
|
||||||
|
env: map[string]string{
|
||||||
|
envNVVisibleDevices: gpuID,
|
||||||
|
"DOCKER_RESOURCE_GPUS": thirdGPUID,
|
||||||
|
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
|
||||||
|
},
|
||||||
|
expectedDevices: &thirdGPUID,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
description: "DOCKER_RESOURCE_GPUS_ADDITIONAL or DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
|
||||||
|
swarmResourceEnvvars: []string{"DOCKER_RESOURCE_GPUS", "DOCKER_RESOURCE_GPUS_ADDITIONAL"},
|
||||||
|
env: map[string]string{
|
||||||
|
envNVVisibleDevices: gpuID,
|
||||||
|
"DOCKER_RESOURCE_GPUS_ADDITIONAL": anotherGPUID,
|
||||||
|
},
|
||||||
|
expectedDevices: &anotherGPUID,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, tc := range tests {
|
for i, tc := range tests {
|
||||||
t.Run(tc.description, func(t *testing.T) {
|
t.Run(tc.description, func(t *testing.T) {
|
||||||
envSwarmGPU = tc.envSwarmGPU
|
devices := getDevicesFromEnvvar(image.CUDA(tc.env), tc.swarmResourceEnvvars)
|
||||||
devices := getDevicesFromEnvvar(image.CUDA(tc.env))
|
|
||||||
if tc.expectedDevices == nil {
|
if tc.expectedDevices == nil {
|
||||||
require.Nil(t, devices, "%d: %v", i, tc)
|
require.Nil(t, devices, "%d: %v", i, tc)
|
||||||
return
|
return
|
||||||
|
@ -5,6 +5,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
"reflect"
|
"reflect"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"github.com/BurntSushi/toml"
|
"github.com/BurntSushi/toml"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||||
@ -116,3 +117,22 @@ func (c HookConfig) getConfigOption(fieldName string) string {
|
|||||||
}
|
}
|
||||||
return v
|
return v
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getSwarmResourceEnvvars returns the swarm resource envvars for the config.
|
||||||
|
func (c *HookConfig) getSwarmResourceEnvvars() []string {
|
||||||
|
if c.SwarmResource == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
candidates := strings.Split(*c.SwarmResource, ",")
|
||||||
|
|
||||||
|
var envvars []string
|
||||||
|
for _, c := range candidates {
|
||||||
|
trimmed := strings.TrimSpace(c)
|
||||||
|
if len(trimmed) > 0 {
|
||||||
|
envvars = append(envvars, trimmed)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return envvars
|
||||||
|
}
|
||||||
|
@ -103,3 +103,59 @@ func TestGetHookConfig(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestGetSwarmResourceEnvvars(t *testing.T) {
|
||||||
|
testCases := []struct {
|
||||||
|
value string
|
||||||
|
expected []string
|
||||||
|
}{
|
||||||
|
{
|
||||||
|
value: "nil",
|
||||||
|
expected: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
value: "",
|
||||||
|
expected: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
value: " ",
|
||||||
|
expected: nil,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
value: "single",
|
||||||
|
expected: []string{"single"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
value: "single ",
|
||||||
|
expected: []string{"single"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
value: "one,two",
|
||||||
|
expected: []string{"one", "two"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
value: "one ,two",
|
||||||
|
expected: []string{"one", "two"},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
value: "one, two",
|
||||||
|
expected: []string{"one", "two"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, tc := range testCases {
|
||||||
|
t.Run(fmt.Sprintf("%d", i), func(t *testing.T) {
|
||||||
|
c := &HookConfig{
|
||||||
|
SwarmResource: func() *string {
|
||||||
|
if tc.value == "nil" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return &tc.value
|
||||||
|
}(),
|
||||||
|
}
|
||||||
|
|
||||||
|
envvars := c.getSwarmResourceEnvvars()
|
||||||
|
require.EqualValues(t, tc.expected, envvars)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user