Merge branch '1.1.0-staging' into 'master'

1.1.0 staging

See merge request nvidia/container-toolkit/container-toolkit!7
This commit is contained in:
Renaud Gaubert 2020-05-15 19:39:41 +00:00
commit 976428af2c
12 changed files with 296 additions and 29 deletions

View File

@ -5,7 +5,7 @@ MKDIR ?= mkdir
DIST_DIR ?= $(CURDIR)/dist
LIB_NAME := nvidia-container-toolkit
LIB_VERSION := 1.0.5
LIB_VERSION := 1.1.0
GOLANG_VERSION := 1.14.2
GOLANG_PKG_PATH := github.com/NVIDIA/container-toolkit/pkg

View File

@ -11,6 +11,7 @@ load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig"
#alpha-merge-visible-devices-envvars = false
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

View File

@ -11,6 +11,7 @@ load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig"
#alpha-merge-visible-devices-envvars = false
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

View File

@ -11,6 +11,7 @@ load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig"
#alpha-merge-visible-devices-envvars = false
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

View File

@ -11,6 +11,7 @@ load-kmods = true
#no-cgroups = false
user = "root:video"
ldconfig = "@/sbin/ldconfig"
#alpha-merge-visible-devices-envvars = false
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

View File

@ -11,6 +11,7 @@ load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig.real"
#alpha-merge-visible-devices-envvars = false
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

131
container_config_test.go Normal file
View File

@ -0,0 +1,131 @@
package main
import (
"github.com/stretchr/testify/require"
"sort"
"strings"
"testing"
)
func TestMergeVisibleDevicesEnvvars(t *testing.T) {
var tests = []struct {
name string
input []string
expected string
enableMerge bool
}{
{
"Simple Merge Enabled",
[]string{
"NVIDIA_VISIBLE_DEVICES_0=0,1",
"NVIDIA_VISIBLE_DEVICES_1=2,3",
"NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5",
},
"0,1,2,3,4,5",
true,
},
{
"Simple Merge Disabled",
[]string{
"NVIDIA_VISIBLE_DEVICES_0=0,1",
"NVIDIA_VISIBLE_DEVICES_1=2,3",
"NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5",
},
"",
false,
},
{
"Merge No Override (Enabled)",
[]string{
"NVIDIA_VISIBLE_DEVICES=all",
},
"all",
true,
},
{
"Merge No Override (Disabled)",
[]string{
"NVIDIA_VISIBLE_DEVICES=all",
},
"all",
false,
},
{
"Merge Override (Enabled, Before)",
[]string{
"NVIDIA_VISIBLE_DEVICES=all",
"NVIDIA_VISIBLE_DEVICES_0=0,1",
"NVIDIA_VISIBLE_DEVICES_1=2,3",
"NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5",
},
"0,1,2,3,4,5",
true,
},
{
"Merge Override (Enabled, After)",
[]string{
"NVIDIA_VISIBLE_DEVICES_0=0,1",
"NVIDIA_VISIBLE_DEVICES_1=2,3",
"NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5",
"NVIDIA_VISIBLE_DEVICES=all",
},
"0,1,2,3,4,5",
true,
},
{
"Merge Override (Enabled, In Between)",
[]string{
"NVIDIA_VISIBLE_DEVICES_0=0,1",
"NVIDIA_VISIBLE_DEVICES_1=2,3",
"NVIDIA_VISIBLE_DEVICES=all",
"NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5",
},
"0,1,2,3,4,5",
true,
},
{
"Merge Override (Disabled, Before)",
[]string{
"NVIDIA_VISIBLE_DEVICES=all",
"NVIDIA_VISIBLE_DEVICES_0=0,1",
"NVIDIA_VISIBLE_DEVICES_1=2,3",
"NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5",
},
"all",
false,
},
{
"Merge Override (Disabled, After)",
[]string{
"NVIDIA_VISIBLE_DEVICES_0=0,1",
"NVIDIA_VISIBLE_DEVICES_1=2,3",
"NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5",
"NVIDIA_VISIBLE_DEVICES=all",
},
"all",
false,
},
{
"Merge Override (Disabled, In Between)",
[]string{
"NVIDIA_VISIBLE_DEVICES_0=0,1",
"NVIDIA_VISIBLE_DEVICES_1=2,3",
"NVIDIA_VISIBLE_DEVICES=all",
"NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5",
},
"all",
false,
},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
config := CLIConfig{
AlphaMergeVisibleDevicesEnvvars: tc.enableMerge,
}
envvars := getEnvMap(tc.input, config)
devices := strings.Split(envvars[envNVVisibleDevices], ",")
sort.Strings(devices)
require.Equal(t, tc.expected, strings.Join(devices, ","))
})
}
}

View File

@ -1,4 +1,12 @@
nvidia-container-toolkit (@VERSION@) UNRELEASED; urgency=medium
nvidia-container-toolkit (1.1.0-1) UNRELEASED; urgency=medium
* Add ability to merge envars of the form NVIDIA_VISIBLE_DEVICES_* (Closes: #XXXXXX)
* Extend fields we inspect in the runc spec to include linux capabilities (Closes: #XXXXXX)
* Add support for MIG (Closes: #XXXXXX)
-- NVIDIA CORPORATION <cudatools@nvidia.com> Wed, 07 Mar 2018 05:47:37 +0000
nvidia-container-toolkit (1.0.5-1) UNRELEASED; urgency=medium
* Initial release. Replaces older package nvidia-container-runtime-hook. (Closes: #XXXXXX)

View File

@ -53,3 +53,7 @@ rm -f %{_bindir}/nvidia-container-runtime-hook
/usr/share/containers/oci/hooks.d/oci-nvidia-hook.json
%changelog
* Fri May 15 2020 NVIDIA CORPORATION <cudatools@nvidia.com> 1.1.0-1
- Add ability to merge envars of the form NVIDIA_VISIBLE_DEVICES_*
- Extend fields we inspect in the runc spec to include linux capabilities
- Add support for MIG

View File

@ -18,6 +18,8 @@ const (
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES"
envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES"
envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
)
@ -26,8 +28,14 @@ const (
defaultDriverCapabilities = "utility"
)
const (
capSysAdmin = "CAP_SYS_ADMIN"
)
type nvidiaConfig struct {
Devices string
MigConfigDevices string
MigMonitorDevices string
DriverCapabilities string
Requirements []string
DisableRequire bool
@ -47,7 +55,17 @@ type Root struct {
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L57
type Process struct {
Env []string `json:"env,omitempty"`
Env []string `json:"env,omitempty"`
Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"`
}
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L61
type LinuxCapabilities struct {
Bounding []string `json:"bounding,omitempty" platform:"linux"`
Effective []string `json:"effective,omitempty" platform:"linux"`
Inheritable []string `json:"inheritable,omitempty" platform:"linux"`
Permitted []string `json:"permitted,omitempty" platform:"linux"`
Ambient []string `json:"ambient,omitempty" platform:"linux"`
}
// We use pointers to structs, similarly to the latest version of runtime-spec:
@ -82,7 +100,7 @@ func parseCudaVersion(cudaVersion string) (vmaj, vmin, vpatch uint32) {
return
}
func getEnvMap(e []string) (m map[string]string) {
func getEnvMap(e []string, config CLIConfig) (m map[string]string) {
m = make(map[string]string)
for _, s := range e {
p := strings.SplitN(s, "=", 2)
@ -91,6 +109,17 @@ func getEnvMap(e []string) (m map[string]string) {
}
m[p[0]] = p[1]
}
if config.AlphaMergeVisibleDevicesEnvvars {
var mergable []string
for k, v := range m {
if strings.HasPrefix(k, envNVVisibleDevices+"_") {
mergable = append(mergable, v)
}
}
if len(mergable) > 0 {
m[envNVVisibleDevices] = strings.Join(mergable, ",")
}
}
return
}
@ -113,6 +142,31 @@ func loadSpec(path string) (spec *Spec) {
return
}
func isPrivileged(caps *LinuxCapabilities) bool {
if caps == nil {
return false
}
hasCapSysAdmin := func(caps []string) bool {
for _, c := range caps {
if c == capSysAdmin {
return true
}
}
return false
}
// We only make sure that the bounding capabibility set has
// CAP_SYS_ADMIN. This allows us to make sure that the container was
// actually started as '--privileged', but also allow non-root users to
// access the priviliged NVIDIA capabilities.
if !hasCapSysAdmin(caps.Bounding) {
return false
}
return true
}
func getDevices(env map[string]string) *string {
gpuVars := []string{envNVVisibleDevices}
if envSwarmGPU != nil {
@ -128,6 +182,26 @@ func getDevices(env map[string]string) *string {
return nil
}
func getMigConfigDevices(env map[string]string) *string {
gpuVars := []string{envNVMigConfigDevices}
for _, gpuVar := range gpuVars {
if devices, ok := env[gpuVar]; ok {
return &devices
}
}
return nil
}
func getMigMonitorDevices(env map[string]string) *string {
gpuVars := []string{envNVMigMonitorDevices}
for _, gpuVar := range gpuVars {
if devices, ok := env[gpuVar]; ok {
return &devices
}
}
return nil
}
func getDriverCapabilities(env map[string]string) *string {
if capabilities, ok := env[envNVDriverCapabilities]; ok {
return &capabilities
@ -147,7 +221,7 @@ func getRequirements(env map[string]string) []string {
}
// Mimic the new CUDA images if no capabilities or devices are specified.
func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig {
func getNvidiaConfigLegacy(env map[string]string, privileged bool) *nvidiaConfig {
var devices string
if d := getDevices(env); d == nil {
// Environment variable unset: default to "all".
@ -163,6 +237,22 @@ func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig {
devices = ""
}
var migConfigDevices string
if d := getMigConfigDevices(env); d != nil {
migConfigDevices = *d
}
if !privileged && migConfigDevices != "" {
log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container")
}
var migMonitorDevices string
if d := getMigMonitorDevices(env); d != nil {
migMonitorDevices = *d
}
if !privileged && migMonitorDevices != "" {
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
}
var driverCapabilities string
if c := getDriverCapabilities(env); c == nil {
// Environment variable unset: default to "all".
@ -189,18 +279,20 @@ func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig {
return &nvidiaConfig{
Devices: devices,
MigConfigDevices: migConfigDevices,
MigMonitorDevices: migMonitorDevices,
DriverCapabilities: driverCapabilities,
Requirements: requirements,
DisableRequire: disableRequire,
}
}
func getNvidiaConfig(env map[string]string) *nvidiaConfig {
func getNvidiaConfig(env map[string]string, privileged bool) *nvidiaConfig {
legacyCudaVersion := env[envCUDAVersion]
cudaRequire := env[envNVRequireCUDA]
if len(legacyCudaVersion) > 0 && len(cudaRequire) == 0 {
// Legacy CUDA image detected.
return getNvidiaConfigLegacy(env)
return getNvidiaConfigLegacy(env, privileged)
}
var devices string
@ -215,6 +307,22 @@ func getNvidiaConfig(env map[string]string) *nvidiaConfig {
devices = ""
}
var migConfigDevices string
if d := getMigConfigDevices(env); d != nil {
migConfigDevices = *d
}
if !privileged && migConfigDevices != "" {
log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container")
}
var migMonitorDevices string
if d := getMigMonitorDevices(env); d != nil {
migMonitorDevices = *d
}
if !privileged && migMonitorDevices != "" {
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
}
var driverCapabilities string
if c := getDriverCapabilities(env); c == nil || len(*c) == 0 {
// Environment variable unset or set but empty: use default capability.
@ -234,6 +342,8 @@ func getNvidiaConfig(env map[string]string) *nvidiaConfig {
return &nvidiaConfig{
Devices: devices,
MigConfigDevices: migConfigDevices,
MigMonitorDevices: migMonitorDevices,
DriverCapabilities: driverCapabilities,
Requirements: requirements,
DisableRequire: disableRequire,
@ -254,12 +364,13 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
s := loadSpec(path.Join(b, "config.json"))
env := getEnvMap(s.Process.Env)
env := getEnvMap(s.Process.Env, hook.NvidiaContainerCLI)
privileged := isPrivileged(s.Process.Capabilities)
envSwarmGPU = hook.SwarmResource
return containerConfig{
Pid: h.Pid,
Rootfs: s.Root.Path,
Env: env,
Nvidia: getNvidiaConfig(env),
Nvidia: getNvidiaConfig(env, privileged),
}
}

View File

@ -20,16 +20,17 @@ var defaultPaths = [...]string{
// CLIConfig: options for nvidia-container-cli.
type CLIConfig struct {
Root *string `toml:"root"`
Path *string `toml:"path"`
Environment []string `toml:"environment"`
Debug *string `toml:"debug"`
Ldcache *string `toml:"ldcache"`
LoadKmods bool `toml:"load-kmods"`
NoPivot bool `toml:"no-pivot"`
NoCgroups bool `toml:"no-cgroups"`
User *string `toml:"user"`
Ldconfig *string `toml:"ldconfig"`
Root *string `toml:"root"`
Path *string `toml:"path"`
Environment []string `toml:"environment"`
Debug *string `toml:"debug"`
Ldcache *string `toml:"ldcache"`
LoadKmods bool `toml:"load-kmods"`
NoPivot bool `toml:"no-pivot"`
NoCgroups bool `toml:"no-cgroups"`
User *string `toml:"user"`
Ldconfig *string `toml:"ldconfig"`
AlphaMergeVisibleDevicesEnvvars bool `toml:"alpha-merge-visible-devices-envvars"`
}
type HookConfig struct {
@ -44,16 +45,17 @@ func getDefaultHookConfig() (config HookConfig) {
DisableRequire: false,
SwarmResource: nil,
NvidiaContainerCLI: CLIConfig{
Root: nil,
Path: nil,
Environment: []string{},
Debug: nil,
Ldcache: nil,
LoadKmods: true,
NoPivot: false,
NoCgroups: false,
User: nil,
Ldconfig: nil,
Root: nil,
Path: nil,
Environment: []string{},
Debug: nil,
Ldcache: nil,
LoadKmods: true,
NoPivot: false,
NoCgroups: false,
User: nil,
Ldconfig: nil,
AlphaMergeVisibleDevicesEnvvars: false,
},
}
}

View File

@ -126,6 +126,12 @@ func doPrestart() {
if len(nvidia.Devices) > 0 {
args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices))
}
if len(nvidia.MigConfigDevices) > 0 {
args = append(args, fmt.Sprintf("--mig-config=%s", nvidia.MigConfigDevices))
}
if len(nvidia.MigMonitorDevices) > 0 {
args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices))
}
for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") {
if len(cap) == 0 {