From 01b4381282f5b30df0c8f75c820fdf75d83c19b1 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Wed, 22 Jan 2020 04:42:00 -0800 Subject: [PATCH] Add ability to merge envvars of the form NVIDIA_VISIBLE_DEVICES_* This allows someone to (for example) pass the following environment variables: NVIDIA_VISIBLE_DEVICES_0="0,1" NVIDIA_VISIBLE_DEVICES_1="2,3" NVIDIA_VISIBLE_DEVICES_WHATEVER="4,5" and have the nvidia-container-toolkit automatically merge these into: NVIDIA_VISIBLE_DEVICES="0,1,2,3,4,5" This is useful (for example) if the full list of devices comes from multiple, disparate sources. Note: If NVIDIA_VISIBLE_DEVICES already exists as an environment variable, the merged value overrides it; the original value is *excluded* from the merge. We exclude the original value to ensure that we have a way to override the default value of NVIDIA_VISIBLE_DEVICES set to "all" inside a container image. Signed-off-by: Kevin Klues --- config/config.toml.amzn | 1 + config/config.toml.centos | 1 + config/config.toml.debian | 1 + config/config.toml.opensuse-leap | 1 + config/config.toml.ubuntu | 1 + container_config_test.go | 131 +++++++++++++++++++++++++++++++ pkg/container_config.go | 15 +++- pkg/hook_config.go | 42 +++++----- 8 files changed, 171 insertions(+), 22 deletions(-) create mode 100644 container_config_test.go diff --git a/config/config.toml.amzn b/config/config.toml.amzn index 33d8e827..23d056ca 100644 --- a/config/config.toml.amzn +++ b/config/config.toml.amzn @@ -11,6 +11,7 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig" +#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.centos b/config/config.toml.centos index 33d8e827..23d056ca 100644 --- a/config/config.toml.centos +++ b/config/config.toml.centos @@ -11,6 +11,7 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig" 
+#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.debian b/config/config.toml.debian index 33d8e827..23d056ca 100644 --- a/config/config.toml.debian +++ b/config/config.toml.debian @@ -11,6 +11,7 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig" +#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.opensuse-leap b/config/config.toml.opensuse-leap index cee982ad..4d51e51f 100644 --- a/config/config.toml.opensuse-leap +++ b/config/config.toml.opensuse-leap @@ -11,6 +11,7 @@ load-kmods = true #no-cgroups = false user = "root:video" ldconfig = "@/sbin/ldconfig" +#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.ubuntu b/config/config.toml.ubuntu index 0acaae6a..1c2c2b6e 100644 --- a/config/config.toml.ubuntu +++ b/config/config.toml.ubuntu @@ -11,6 +11,7 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig.real" +#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/container_config_test.go b/container_config_test.go new file mode 100644 index 00000000..7e5b0699 --- /dev/null +++ b/container_config_test.go @@ -0,0 +1,131 @@ +package main + +import ( + "github.com/stretchr/testify/require" + "sort" + "strings" + "testing" +) + +func TestMergeVisibleDevicesEnvvars(t *testing.T) { + var tests = []struct { + name string + input []string + expected string + enableMerge bool + }{ + { + "Simple Merge Enabled", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "0,1,2,3,4,5", + true, + }, + { + "Simple Merge Disabled", + []string{ + 
"NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "", + false, + }, + { + "Merge No Override (Enabled)", + []string{ + "NVIDIA_VISIBLE_DEVICES=all", + }, + "all", + true, + }, + { + "Merge No Override (Disabled)", + []string{ + "NVIDIA_VISIBLE_DEVICES=all", + }, + "all", + false, + }, + { + "Merge Override (Enabled, Before)", + []string{ + "NVIDIA_VISIBLE_DEVICES=all", + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "0,1,2,3,4,5", + true, + }, + { + "Merge Override (Enabled, After)", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + "NVIDIA_VISIBLE_DEVICES=all", + }, + "0,1,2,3,4,5", + true, + }, + { + "Merge Override (Enabled, In Between)", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES=all", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "0,1,2,3,4,5", + true, + }, + { + "Merge Override (Disabled, Before)", + []string{ + "NVIDIA_VISIBLE_DEVICES=all", + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "all", + false, + }, + { + "Merge Override (Disabled, After)", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + "NVIDIA_VISIBLE_DEVICES=all", + }, + "all", + false, + }, + { + "Merge Override (Disabled, In Between)", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES=all", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "all", + false, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + config := CLIConfig{ + AlphaMergeVisibleDevicesEnvvars: tc.enableMerge, + } + envvars := getEnvMap(tc.input, config) + devices := strings.Split(envvars[envNVVisibleDevices], ",") + sort.Strings(devices) + 
require.Equal(t, tc.expected, strings.Join(devices, ",")) + }) + } +} diff --git a/pkg/container_config.go b/pkg/container_config.go index 26f23341..ac2ab7fa 100644 --- a/pkg/container_config.go +++ b/pkg/container_config.go @@ -82,7 +82,7 @@ func parseCudaVersion(cudaVersion string) (vmaj, vmin, vpatch uint32) { return } -func getEnvMap(e []string) (m map[string]string) { +func getEnvMap(e []string, config CLIConfig) (m map[string]string) { m = make(map[string]string) for _, s := range e { p := strings.SplitN(s, "=", 2) @@ -91,6 +91,17 @@ func getEnvMap(e []string) (m map[string]string) { } m[p[0]] = p[1] } + if config.AlphaMergeVisibleDevicesEnvvars { + var mergable []string + for k, v := range m { + if strings.HasPrefix(k, envNVVisibleDevices+"_") { + mergable = append(mergable, v) + } + } + if len(mergable) > 0 { + m[envNVVisibleDevices] = strings.Join(mergable, ",") + } + } return } @@ -254,7 +265,7 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { s := loadSpec(path.Join(b, "config.json")) - env := getEnvMap(s.Process.Env) + env := getEnvMap(s.Process.Env, hook.NvidiaContainerCLI) envSwarmGPU = hook.SwarmResource return containerConfig{ Pid: h.Pid, diff --git a/pkg/hook_config.go b/pkg/hook_config.go index 320b842d..eff2f3f4 100644 --- a/pkg/hook_config.go +++ b/pkg/hook_config.go @@ -20,16 +20,17 @@ var defaultPaths = [...]string{ // CLIConfig: options for nvidia-container-cli. 
type CLIConfig struct { - Root *string `toml:"root"` - Path *string `toml:"path"` - Environment []string `toml:"environment"` - Debug *string `toml:"debug"` - Ldcache *string `toml:"ldcache"` - LoadKmods bool `toml:"load-kmods"` - NoPivot bool `toml:"no-pivot"` - NoCgroups bool `toml:"no-cgroups"` - User *string `toml:"user"` - Ldconfig *string `toml:"ldconfig"` + Root *string `toml:"root"` + Path *string `toml:"path"` + Environment []string `toml:"environment"` + Debug *string `toml:"debug"` + Ldcache *string `toml:"ldcache"` + LoadKmods bool `toml:"load-kmods"` + NoPivot bool `toml:"no-pivot"` + NoCgroups bool `toml:"no-cgroups"` + User *string `toml:"user"` + Ldconfig *string `toml:"ldconfig"` + AlphaMergeVisibleDevicesEnvvars bool `toml:"alpha-merge-visible-devices-envvars"` } type HookConfig struct { @@ -44,16 +45,17 @@ func getDefaultHookConfig() (config HookConfig) { DisableRequire: false, SwarmResource: nil, NvidiaContainerCLI: CLIConfig{ - Root: nil, - Path: nil, - Environment: []string{}, - Debug: nil, - Ldcache: nil, - LoadKmods: true, - NoPivot: false, - NoCgroups: false, - User: nil, - Ldconfig: nil, + Root: nil, + Path: nil, + Environment: []string{}, + Debug: nil, + Ldcache: nil, + LoadKmods: true, + NoPivot: false, + NoCgroups: false, + User: nil, + Ldconfig: nil, + AlphaMergeVisibleDevicesEnvvars: false, }, } }