diff --git a/Makefile b/Makefile index 72fa84e6..3a7618e0 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ MKDIR ?= mkdir DIST_DIR ?= $(CURDIR)/dist LIB_NAME := nvidia-container-toolkit -LIB_VERSION := 1.0.5 +LIB_VERSION := 1.1.0 GOLANG_VERSION := 1.14.2 GOLANG_PKG_PATH := github.com/NVIDIA/container-toolkit/pkg diff --git a/config/config.toml.amzn b/config/config.toml.amzn index 33d8e827..23d056ca 100644 --- a/config/config.toml.amzn +++ b/config/config.toml.amzn @@ -11,6 +11,7 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig" +#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.centos b/config/config.toml.centos index 33d8e827..23d056ca 100644 --- a/config/config.toml.centos +++ b/config/config.toml.centos @@ -11,6 +11,7 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig" +#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.debian b/config/config.toml.debian index 33d8e827..23d056ca 100644 --- a/config/config.toml.debian +++ b/config/config.toml.debian @@ -11,6 +11,7 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig" +#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.opensuse-leap b/config/config.toml.opensuse-leap index cee982ad..4d51e51f 100644 --- a/config/config.toml.opensuse-leap +++ b/config/config.toml.opensuse-leap @@ -11,6 +11,7 @@ load-kmods = true #no-cgroups = false user = "root:video" ldconfig = "@/sbin/ldconfig" +#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.ubuntu b/config/config.toml.ubuntu index 0acaae6a..1c2c2b6e 100644 
--- a/config/config.toml.ubuntu +++ b/config/config.toml.ubuntu @@ -11,6 +11,7 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig.real" +#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/container_config_test.go b/container_config_test.go new file mode 100644 index 00000000..7e5b0699 --- /dev/null +++ b/container_config_test.go @@ -0,0 +1,131 @@ +package main + +import ( + "github.com/stretchr/testify/require" + "sort" + "strings" + "testing" +) + +func TestMergeVisibleDevicesEnvvars(t *testing.T) { + var tests = []struct { + name string + input []string + expected string + enableMerge bool + }{ + { + "Simple Merge Enabled", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "0,1,2,3,4,5", + true, + }, + { + "Simple Merge Disabled", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "", + false, + }, + { + "Merge No Override (Enabled)", + []string{ + "NVIDIA_VISIBLE_DEVICES=all", + }, + "all", + true, + }, + { + "Merge No Override (Disabled)", + []string{ + "NVIDIA_VISIBLE_DEVICES=all", + }, + "all", + false, + }, + { + "Merge Override (Enabled, Before)", + []string{ + "NVIDIA_VISIBLE_DEVICES=all", + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "0,1,2,3,4,5", + true, + }, + { + "Merge Override (Enabled, After)", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + "NVIDIA_VISIBLE_DEVICES=all", + }, + "0,1,2,3,4,5", + true, + }, + { + "Merge Override (Enabled, In Between)", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES=all", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "0,1,2,3,4,5", + true, + }, 
+ { + "Merge Override (Disabled, Before)", + []string{ + "NVIDIA_VISIBLE_DEVICES=all", + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "all", + false, + }, + { + "Merge Override (Disabled, After)", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + "NVIDIA_VISIBLE_DEVICES=all", + }, + "all", + false, + }, + { + "Merge Override (Disabled, In Between)", + []string{ + "NVIDIA_VISIBLE_DEVICES_0=0,1", + "NVIDIA_VISIBLE_DEVICES_1=2,3", + "NVIDIA_VISIBLE_DEVICES=all", + "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", + }, + "all", + false, + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + config := CLIConfig{ + AlphaMergeVisibleDevicesEnvvars: tc.enableMerge, + } + envvars := getEnvMap(tc.input, config) + devices := strings.Split(envvars[envNVVisibleDevices], ",") + sort.Strings(devices) + require.Equal(t, tc.expected, strings.Join(devices, ",")) + }) + } +} diff --git a/packaging/debian/changelog b/packaging/debian/changelog index 21a25929..5c6364c8 100644 --- a/packaging/debian/changelog +++ b/packaging/debian/changelog @@ -1,4 +1,12 @@ -nvidia-container-toolkit (@VERSION@) UNRELEASED; urgency=medium +nvidia-container-toolkit (1.1.0-1) UNRELEASED; urgency=medium + + * Add ability to merge envvars of the form NVIDIA_VISIBLE_DEVICES_* (Closes: #XXXXXX) + * Extend fields we inspect in the runc spec to include Linux capabilities (Closes: #XXXXXX) + * Add support for MIG (Closes: #XXXXXX) + + -- NVIDIA CORPORATION Wed, 07 Mar 2018 05:47:37 +0000 + +nvidia-container-toolkit (1.0.5-1) UNRELEASED; urgency=medium * Initial release. Replaces older package nvidia-container-runtime-hook. 
(Closes: #XXXXXX) diff --git a/packaging/rpm/SPECS/nvidia-container-toolkit.spec b/packaging/rpm/SPECS/nvidia-container-toolkit.spec index e3cad551..2c3f0b9a 100644 --- a/packaging/rpm/SPECS/nvidia-container-toolkit.spec +++ b/packaging/rpm/SPECS/nvidia-container-toolkit.spec @@ -53,3 +53,7 @@ rm -f %{_bindir}/nvidia-container-runtime-hook /usr/share/containers/oci/hooks.d/oci-nvidia-hook.json %changelog +* Fri May 15 2020 NVIDIA CORPORATION 1.1.0-1 + - Add ability to merge envvars of the form NVIDIA_VISIBLE_DEVICES_* + - Extend fields we inspect in the runc spec to include Linux capabilities + - Add support for MIG diff --git a/pkg/container_config.go b/pkg/container_config.go index 26f23341..3c87c307 100644 --- a/pkg/container_config.go +++ b/pkg/container_config.go @@ -18,6 +18,8 @@ const ( envNVRequireCUDA = envNVRequirePrefix + "CUDA" envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE" envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES" + envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES" + envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES" envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" ) @@ -26,8 +28,14 @@ const ( defaultDriverCapabilities = "utility" ) +const ( + capSysAdmin = "CAP_SYS_ADMIN" +) + type nvidiaConfig struct { Devices string + MigConfigDevices string + MigMonitorDevices string DriverCapabilities string Requirements []string DisableRequire bool @@ -47,7 +55,17 @@ type Root struct { // github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L57 type Process struct { - Env []string `json:"env,omitempty"` + Env []string `json:"env,omitempty"` + Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"` +} + +// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L61 +type LinuxCapabilities struct { + Bounding []string `json:"bounding,omitempty" platform:"linux"` + Effective []string `json:"effective,omitempty" platform:"linux"` + Inheritable []string `json:"inheritable,omitempty" 
platform:"linux"` + Permitted []string `json:"permitted,omitempty" platform:"linux"` + Ambient []string `json:"ambient,omitempty" platform:"linux"` } // We use pointers to structs, similarly to the latest version of runtime-spec: @@ -82,7 +100,7 @@ func parseCudaVersion(cudaVersion string) (vmaj, vmin, vpatch uint32) { return } -func getEnvMap(e []string) (m map[string]string) { +func getEnvMap(e []string, config CLIConfig) (m map[string]string) { m = make(map[string]string) for _, s := range e { p := strings.SplitN(s, "=", 2) @@ -91,6 +109,17 @@ func getEnvMap(e []string) (m map[string]string) { } m[p[0]] = p[1] } + if config.AlphaMergeVisibleDevicesEnvvars { + var mergable []string + for k, v := range m { + if strings.HasPrefix(k, envNVVisibleDevices+"_") { + mergable = append(mergable, v) + } + } + if len(mergable) > 0 { + m[envNVVisibleDevices] = strings.Join(mergable, ",") + } + } return } @@ -113,6 +142,31 @@ func loadSpec(path string) (spec *Spec) { return } +func isPrivileged(caps *LinuxCapabilities) bool { + if caps == nil { + return false + } + + hasCapSysAdmin := func(caps []string) bool { + for _, c := range caps { + if c == capSysAdmin { + return true + } + } + return false + } + + // We only make sure that the bounding capability set has + // CAP_SYS_ADMIN. This allows us to make sure that the container was + // actually started as '--privileged', but also allow non-root users to + // access the privileged NVIDIA capabilities. 
+ if !hasCapSysAdmin(caps.Bounding) { + return false + } + + return true +} + func getDevices(env map[string]string) *string { gpuVars := []string{envNVVisibleDevices} if envSwarmGPU != nil { @@ -128,6 +182,26 @@ func getDevices(env map[string]string) *string { return nil } +func getMigConfigDevices(env map[string]string) *string { + gpuVars := []string{envNVMigConfigDevices} + for _, gpuVar := range gpuVars { + if devices, ok := env[gpuVar]; ok { + return &devices + } + } + return nil +} + +func getMigMonitorDevices(env map[string]string) *string { + gpuVars := []string{envNVMigMonitorDevices} + for _, gpuVar := range gpuVars { + if devices, ok := env[gpuVar]; ok { + return &devices + } + } + return nil +} + func getDriverCapabilities(env map[string]string) *string { if capabilities, ok := env[envNVDriverCapabilities]; ok { return &capabilities @@ -147,7 +221,7 @@ func getRequirements(env map[string]string) []string { } // Mimic the new CUDA images if no capabilities or devices are specified. -func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig { +func getNvidiaConfigLegacy(env map[string]string, privileged bool) *nvidiaConfig { var devices string if d := getDevices(env); d == nil { // Environment variable unset: default to "all". @@ -163,6 +237,22 @@ func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig { devices = "" } + var migConfigDevices string + if d := getMigConfigDevices(env); d != nil { + migConfigDevices = *d + } + if !privileged && migConfigDevices != "" { + log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container") + } + + var migMonitorDevices string + if d := getMigMonitorDevices(env); d != nil { + migMonitorDevices = *d + } + if !privileged && migMonitorDevices != "" { + log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container") + } + var driverCapabilities string if c := getDriverCapabilities(env); c == nil { // Environment variable unset: default to "all". 
@@ -189,18 +279,20 @@ func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig { return &nvidiaConfig{ Devices: devices, + MigConfigDevices: migConfigDevices, + MigMonitorDevices: migMonitorDevices, DriverCapabilities: driverCapabilities, Requirements: requirements, DisableRequire: disableRequire, } } -func getNvidiaConfig(env map[string]string) *nvidiaConfig { +func getNvidiaConfig(env map[string]string, privileged bool) *nvidiaConfig { legacyCudaVersion := env[envCUDAVersion] cudaRequire := env[envNVRequireCUDA] if len(legacyCudaVersion) > 0 && len(cudaRequire) == 0 { // Legacy CUDA image detected. - return getNvidiaConfigLegacy(env) + return getNvidiaConfigLegacy(env, privileged) } var devices string @@ -215,6 +307,22 @@ func getNvidiaConfig(env map[string]string) *nvidiaConfig { devices = "" } + var migConfigDevices string + if d := getMigConfigDevices(env); d != nil { + migConfigDevices = *d + } + if !privileged && migConfigDevices != "" { + log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container") + } + + var migMonitorDevices string + if d := getMigMonitorDevices(env); d != nil { + migMonitorDevices = *d + } + if !privileged && migMonitorDevices != "" { + log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container") + } + var driverCapabilities string if c := getDriverCapabilities(env); c == nil || len(*c) == 0 { // Environment variable unset or set but empty: use default capability. 
@@ -234,6 +342,8 @@ func getNvidiaConfig(env map[string]string) *nvidiaConfig { return &nvidiaConfig{ Devices: devices, + MigConfigDevices: migConfigDevices, + MigMonitorDevices: migMonitorDevices, DriverCapabilities: driverCapabilities, Requirements: requirements, DisableRequire: disableRequire, @@ -254,12 +364,13 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { s := loadSpec(path.Join(b, "config.json")) - env := getEnvMap(s.Process.Env) + env := getEnvMap(s.Process.Env, hook.NvidiaContainerCLI) + privileged := isPrivileged(s.Process.Capabilities) envSwarmGPU = hook.SwarmResource return containerConfig{ Pid: h.Pid, Rootfs: s.Root.Path, Env: env, - Nvidia: getNvidiaConfig(env), + Nvidia: getNvidiaConfig(env, privileged), } } diff --git a/pkg/hook_config.go b/pkg/hook_config.go index 320b842d..eff2f3f4 100644 --- a/pkg/hook_config.go +++ b/pkg/hook_config.go @@ -20,16 +20,17 @@ var defaultPaths = [...]string{ // CLIConfig: options for nvidia-container-cli. type CLIConfig struct { - Root *string `toml:"root"` - Path *string `toml:"path"` - Environment []string `toml:"environment"` - Debug *string `toml:"debug"` - Ldcache *string `toml:"ldcache"` - LoadKmods bool `toml:"load-kmods"` - NoPivot bool `toml:"no-pivot"` - NoCgroups bool `toml:"no-cgroups"` - User *string `toml:"user"` - Ldconfig *string `toml:"ldconfig"` + Root *string `toml:"root"` + Path *string `toml:"path"` + Environment []string `toml:"environment"` + Debug *string `toml:"debug"` + Ldcache *string `toml:"ldcache"` + LoadKmods bool `toml:"load-kmods"` + NoPivot bool `toml:"no-pivot"` + NoCgroups bool `toml:"no-cgroups"` + User *string `toml:"user"` + Ldconfig *string `toml:"ldconfig"` + AlphaMergeVisibleDevicesEnvvars bool `toml:"alpha-merge-visible-devices-envvars"` } type HookConfig struct { @@ -44,16 +45,17 @@ func getDefaultHookConfig() (config HookConfig) { DisableRequire: false, SwarmResource: nil, NvidiaContainerCLI: CLIConfig{ - Root: nil, - Path: nil, - Environment: 
[]string{}, - Debug: nil, - Ldcache: nil, - LoadKmods: true, - NoPivot: false, - NoCgroups: false, - User: nil, - Ldconfig: nil, + Root: nil, + Path: nil, + Environment: []string{}, + Debug: nil, + Ldcache: nil, + LoadKmods: true, + NoPivot: false, + NoCgroups: false, + User: nil, + Ldconfig: nil, + AlphaMergeVisibleDevicesEnvvars: false, }, } } diff --git a/pkg/main.go b/pkg/main.go index 010ff359..13f8197c 100644 --- a/pkg/main.go +++ b/pkg/main.go @@ -126,6 +126,12 @@ func doPrestart() { if len(nvidia.Devices) > 0 { args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices)) } + if len(nvidia.MigConfigDevices) > 0 { + args = append(args, fmt.Sprintf("--mig-config=%s", nvidia.MigConfigDevices)) + } + if len(nvidia.MigMonitorDevices) > 0 { + args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices)) + } for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") { if len(cap) == 0 {