Merge branch 'upstream-add-alternate-device-list' into 'master'

Add the ability to pull the device list from mounted files instead of just Envvars

See merge request nvidia/container-toolkit/container-toolkit!15
This commit is contained in:
Kevin Klues 2020-07-24 13:18:53 +00:00
commit 4448319605
11 changed files with 288 additions and 192 deletions

View File

@ -1,5 +1,7 @@
disable-require = false
#swarm-resource = "DOCKER_RESOURCE_GPU"
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#look-for-nvidia-visible-devices-as-volume-mounts-under = "/var/run/nvidia-container-devices"
[nvidia-container-cli]
#root = "/run/nvidia/driver"
@ -11,7 +13,6 @@ load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig"
#alpha-merge-visible-devices-envvars = false
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

View File

@ -1,5 +1,7 @@
disable-require = false
#swarm-resource = "DOCKER_RESOURCE_GPU"
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#look-for-nvidia-visible-devices-as-volume-mounts-under = "/var/run/nvidia-container-devices"
[nvidia-container-cli]
#root = "/run/nvidia/driver"
@ -11,7 +13,6 @@ load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig"
#alpha-merge-visible-devices-envvars = false
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

View File

@ -1,5 +1,7 @@
disable-require = false
#swarm-resource = "DOCKER_RESOURCE_GPU"
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#look-for-nvidia-visible-devices-as-volume-mounts-under = "/var/run/nvidia-container-devices"
[nvidia-container-cli]
#root = "/run/nvidia/driver"
@ -11,7 +13,6 @@ load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig"
#alpha-merge-visible-devices-envvars = false
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

View File

@ -1,5 +1,7 @@
disable-require = false
#swarm-resource = "DOCKER_RESOURCE_GPU"
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#look-for-nvidia-visible-devices-as-volume-mounts-under = "/var/run/nvidia-container-devices"
[nvidia-container-cli]
#root = "/run/nvidia/driver"
@ -11,7 +13,6 @@ load-kmods = true
#no-cgroups = false
user = "root:video"
ldconfig = "@/sbin/ldconfig"
#alpha-merge-visible-devices-envvars = false
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

View File

@ -1,5 +1,7 @@
disable-require = false
#swarm-resource = "DOCKER_RESOURCE_GPU"
#accept-nvidia-visible-devices-envvar-when-unprivileged = true
#look-for-nvidia-visible-devices-as-volume-mounts-under = "/var/run/nvidia-container-devices"
[nvidia-container-cli]
#root = "/run/nvidia/driver"
@ -11,7 +13,6 @@ load-kmods = true
#no-cgroups = false
#user = "root:video"
ldconfig = "@/sbin/ldconfig.real"
#alpha-merge-visible-devices-envvars = false
[nvidia-container-runtime]
#debug = "/var/log/nvidia-container-runtime.log"

View File

@ -1,131 +0,0 @@
package main
import (
"github.com/stretchr/testify/require"
"sort"
"strings"
"testing"
)
// TestMergeVisibleDevicesEnvvars exercises getEnvMap with the
// AlphaMergeVisibleDevicesEnvvars option switched on and off.
//
// Each case feeds a list of "KEY=VALUE" strings to getEnvMap and then checks
// the resulting NVIDIA_VISIBLE_DEVICES entry. The individual device ids are
// sorted before the comparison so the expectation does not depend on the
// order in which the suffixed variables were combined.
func TestMergeVisibleDevicesEnvvars(t *testing.T) {
	// Environment entries shared by the cases below.
	const (
		plainAll = "NVIDIA_VISIBLE_DEVICES=all"
		suffix0  = "NVIDIA_VISIBLE_DEVICES_0=0,1"
		suffix1  = "NVIDIA_VISIBLE_DEVICES_1=2,3"
		suffixW  = "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5"
		merged   = "0,1,2,3,4,5"
	)

	type mergeCase struct {
		name        string
		input       []string
		expected    string
		enableMerge bool
	}

	cases := []mergeCase{
		{
			name:        "Simple Merge Enabled",
			input:       []string{suffix0, suffix1, suffixW},
			expected:    merged,
			enableMerge: true,
		},
		{
			name:        "Simple Merge Disabled",
			input:       []string{suffix0, suffix1, suffixW},
			expected:    "",
			enableMerge: false,
		},
		{
			name:        "Merge No Override (Enabled)",
			input:       []string{plainAll},
			expected:    "all",
			enableMerge: true,
		},
		{
			name:        "Merge No Override (Disabled)",
			input:       []string{plainAll},
			expected:    "all",
			enableMerge: false,
		},
		{
			name:        "Merge Override (Enabled, Before)",
			input:       []string{plainAll, suffix0, suffix1, suffixW},
			expected:    merged,
			enableMerge: true,
		},
		{
			name:        "Merge Override (Enabled, After)",
			input:       []string{suffix0, suffix1, suffixW, plainAll},
			expected:    merged,
			enableMerge: true,
		},
		{
			name:        "Merge Override (Enabled, In Between)",
			input:       []string{suffix0, suffix1, plainAll, suffixW},
			expected:    merged,
			enableMerge: true,
		},
		{
			name:        "Merge Override (Disabled, Before)",
			input:       []string{plainAll, suffix0, suffix1, suffixW},
			expected:    "all",
			enableMerge: false,
		},
		{
			name:        "Merge Override (Disabled, After)",
			input:       []string{suffix0, suffix1, suffixW, plainAll},
			expected:    "all",
			enableMerge: false,
		},
		{
			name:        "Merge Override (Disabled, In Between)",
			input:       []string{suffix0, suffix1, plainAll, suffixW},
			expected:    "all",
			enableMerge: false,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			cliConfig := CLIConfig{
				AlphaMergeVisibleDevicesEnvvars: tc.enableMerge,
			}
			env := getEnvMap(tc.input, cliConfig)

			// Normalize the order of the device ids before comparing.
			got := strings.Split(env[envNVVisibleDevices], ",")
			sort.Strings(got)

			require.Equal(t, tc.expected, strings.Join(got, ","))
		})
	}
}

12
pkg/Godeps/Godeps.json generated
View File

@ -1,12 +0,0 @@
{
"ImportPath": "github.com/nvidia/nvidia-container-runtime/toolkit/nvidia-container-toolkit",
"GoVersion": "go1.9",
"GodepVersion": "v80",
"Deps": [
{
"ImportPath": "github.com/BurntSushi/toml",
"Comment": "v0.3.0-7-ga368813",
"Rev": "a368813c5e648fee92e5f6c30e3944ff9d5e8895"
}
]
}

Binary file not shown.

View File

@ -6,6 +6,7 @@ import (
"log"
"os"
"path"
"path/filepath"
"strconv"
"strings"
@ -73,6 +74,15 @@ type LinuxCapabilities struct {
Ambient []string `json:"ambient,omitempty" platform:"linux"`
}
// Mount from OCI runtime spec
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L103
//
// Mount is one entry of the "mounts" list decoded from the container's
// config.json. Only Destination is always present in the JSON; the other
// fields are optional (omitempty). See the linked spec for the full semantics
// of each field.
type Mount struct {
// Destination is the mount point inside the container.
Destination string `json:"destination"`
// Type is the kind of mount, as given in the spec (optional).
Type string `json:"type,omitempty" platform:"linux,solaris"`
// Source is the host-side path that is mounted (optional).
Source string `json:"source,omitempty"`
// Options holds the mount options, as given in the spec (optional).
Options []string `json:"options,omitempty"`
}
// Spec from OCI runtime spec
// We use pointers to structs, similarly to the latest version of runtime-spec:
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L5-L28
@ -80,6 +90,7 @@ type Spec struct {
Version *string `json:"ociVersion"`
Process *Process `json:"process,omitempty"`
Root *Root `json:"root,omitempty"`
Mounts []Mount `json:"mounts,omitempty"`
}
// HookState holds state information about the hook
@ -108,7 +119,7 @@ func parseCudaVersion(cudaVersion string) (vmaj, vmin, vpatch uint32) {
return
}
func getEnvMap(e []string, config CLIConfig) (m map[string]string) {
func getEnvMap(e []string) (m map[string]string) {
m = make(map[string]string)
for _, s := range e {
p := strings.SplitN(s, "=", 2)
@ -117,17 +128,6 @@ func getEnvMap(e []string, config CLIConfig) (m map[string]string) {
}
m[p[0]] = p[1]
}
if config.AlphaMergeVisibleDevicesEnvvars {
var mergable []string
for k, v := range m {
if strings.HasPrefix(k, envNVVisibleDevices+"_") {
mergable = append(mergable, v)
}
}
if len(mergable) > 0 {
m[envNVVisibleDevices] = strings.Join(mergable, ",")
}
}
return
}
@ -198,7 +198,7 @@ func isLegacyCUDAImage(env map[string]string) bool {
return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0
}
func getDevices(env map[string]string, legacyImage bool) *string {
func getDevicesFromEnvvar(env map[string]string, legacyImage bool) *string {
// Build a list of envvars to consider.
envVars := []string{envNVVisibleDevices}
if envSwarmGPU != nil {
@ -236,6 +236,65 @@ func getDevices(env map[string]string, legacyImage bool) *string {
return devices
}
// getDevicesFromMounts builds the device list from the container's volume
// mounts.
//
// A mount contributes a device when its source is /dev/null and its
// destination lies strictly below 'root'. The device name is the destination
// path relative to 'root', so it may itself contain slashes. All paths are
// normalized with filepath.Clean before they are compared.
//
// The discovered names are returned joined by commas, in mount order. nil is
// returned when no mount qualifies.
func getDevicesFromMounts(root string, mounts []Mount) *string {
	// Match on a whole path component: a plain string-prefix comparison would
	// also accept a sibling such as "<root>-other/GPU0" and report the bogus
	// device "-other/GPU0".
	prefix := filepath.Clean(root)
	if !strings.HasSuffix(prefix, "/") {
		prefix += "/"
	}

	var devices []string
	for _, m := range mounts {
		// Only consider mounts whose host volume is /dev/null
		if filepath.Clean(m.Source) != "/dev/null" {
			continue
		}

		// Only consider container mount points located below 'root'. A
		// destination equal to 'root' itself never carries the trailing
		// separator after Clean, so it is rejected here as well.
		destination := filepath.Clean(m.Destination)
		if !strings.HasPrefix(destination, prefix) {
			continue
		}

		// Grab the full path beyond 'root' and add it to the list of devices
		device := strings.TrimPrefix(destination, prefix)
		if device == "" {
			continue
		}
		devices = append(devices, device)
	}

	if len(devices) == 0 {
		return nil
	}

	ret := strings.Join(devices, ",")
	return &ret
}
// getDevices selects the device list for the container.
//
// Volume mounts located under *hookConfig.DeviceListVolumeMount take
// precedence; hookConfig.DeviceListVolumeMount is dereferenced
// unconditionally and therefore must be non-nil. When the mounts yield
// nothing, the list is read from the environment instead, which is only
// permitted for privileged containers or when
// hookConfig.AcceptEnvvarUnprivileged is set.
//
// It returns nil when neither source provides a device list, and panics (via
// log.Panicln) when the environment provides one but the container is not
// allowed to use it.
func getDevices(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool, legacyImage bool) *string {
	// The mount-based list wins whenever it is present.
	if fromMounts := getDevicesFromMounts(*hookConfig.DeviceListVolumeMount, mounts); fromMounts != nil {
		return fromMounts
	}

	// Otherwise consult the environment, subject to the privilege check.
	fromEnv := getDevicesFromEnvvar(env, legacyImage)
	switch {
	case fromEnv == nil:
		return nil
	case privileged, hookConfig.AcceptEnvvarUnprivileged:
		return fromEnv
	}

	// An envvar list was supplied but may not be honored.
	log.Panicln("insufficient privileges to read device list from NVIDIA_VISIBLE_DEVICES envvar")
	return nil
}
func getMigConfigDevices(env map[string]string) *string {
if devices, ok := env[envNVMigConfigDevices]; ok {
return &devices
@ -296,11 +355,11 @@ func getRequirements(env map[string]string, legacyImage bool) []string {
return requirements
}
func getNvidiaConfig(env map[string]string, privileged bool) *nvidiaConfig {
func getNvidiaConfig(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool) *nvidiaConfig {
legacyImage := isLegacyCUDAImage(env)
var devices string
if d := getDevices(env, legacyImage); d != nil {
if d := getDevices(hookConfig, env, mounts, privileged, legacyImage); d != nil {
devices = *d
} else {
// 'nil' devices means this is not a GPU container.
@ -357,13 +416,13 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
s := loadSpec(path.Join(b, "config.json"))
env := getEnvMap(s.Process.Env, hook.NvidiaContainerCLI)
env := getEnvMap(s.Process.Env)
privileged := isPrivileged(s)
envSwarmGPU = hook.SwarmResource
return containerConfig{
Pid: h.Pid,
Rootfs: s.Root.Path,
Env: env,
Nvidia: getNvidiaConfig(env, privileged),
Nvidia: getNvidiaConfig(&hook, env, s.Mounts, privileged),
}
}

View File

@ -1,6 +1,7 @@
package main
import (
"path/filepath"
"reflect"
"testing"
)
@ -407,7 +408,8 @@ func TestGetNvidiaConfig(t *testing.T) {
// Wrap the call to getNvidiaConfig() in a closure.
var config *nvidiaConfig
getConfig := func() {
config = getNvidiaConfig(tc.env, tc.privileged)
hookConfig := getDefaultHookConfig()
config = getNvidiaConfig(&hookConfig, tc.env, nil, tc.privileged)
}
// For any tests that are expected to panic, make sure they do.
@ -449,6 +451,173 @@ func TestGetNvidiaConfig(t *testing.T) {
}
}
// TestGetDevicesFromMounts checks which volume mounts getDevicesFromMounts
// converts into device names: only mounts of /dev/null whose destination lies
// below the given root count, and the name is the path beyond that root.
func TestGetDevicesFromMounts(t *testing.T) {
	// strPtr returns a pointer to a copy of s, for building expectations.
	strPtr := func(s string) *string { return &s }

	// show renders an optional device list without dereferencing nil, so a
	// mismatch involving a nil value is reported as a test failure instead of
	// crashing with a nil pointer dereference.
	show := func(s *string) string {
		if s == nil {
			return "<nil>"
		}
		return *s
	}

	var tests = []struct {
		description     string
		root            string
		mounts          []Mount
		expectedDevices *string
	}{
		{
			description:     "No mounts",
			root:            defaultDeviceListVolumeMount,
			mounts:          nil,
			expectedDevices: nil,
		},
		{
			description: "Host path is not /dev/null",
			root:        defaultDeviceListVolumeMount,
			mounts: []Mount{
				{
					Source:      "/not/dev/null",
					Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU0"),
				},
			},
			expectedDevices: nil,
		},
		{
			description: "Container path is not prefixed by 'root'",
			root:        defaultDeviceListVolumeMount,
			mounts: []Mount{
				{
					Source:      "/dev/null",
					Destination: filepath.Join("/other/prefix", "GPU0"),
				},
			},
			expectedDevices: nil,
		},
		{
			description: "Container path is only 'root'",
			root:        defaultDeviceListVolumeMount,
			mounts: []Mount{
				{
					Source:      "/dev/null",
					Destination: defaultDeviceListVolumeMount,
				},
			},
			expectedDevices: nil,
		},
		{
			description: "Discover 2 devices",
			root:        defaultDeviceListVolumeMount,
			mounts: []Mount{
				{
					Source:      "/dev/null",
					Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU0"),
				},
				{
					Source:      "/dev/null",
					Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU1"),
				},
			},
			expectedDevices: strPtr("GPU0,GPU1"),
		},
		{
			description: "Discover 2 devices with slashes in the name",
			root:        defaultDeviceListVolumeMount,
			mounts: []Mount{
				{
					Source:      "/dev/null",
					Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU0-MIG0/0/1"),
				},
				{
					Source:      "/dev/null",
					Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU1-MIG0/0/1"),
				},
			},
			expectedDevices: strPtr("GPU0-MIG0/0/1,GPU1-MIG0/0/1"),
		},
	}

	for _, tc := range tests {
		t.Run(tc.description, func(t *testing.T) {
			devices := getDevicesFromMounts(tc.root, tc.mounts)
			if !reflect.DeepEqual(devices, tc.expectedDevices) {
				t.Errorf("Unexpected devices (got: %v, wanted: %v)", show(devices), show(tc.expectedDevices))
			}
		})
	}
}
// TestDeviceListSourcePriority checks how getDevices arbitrates between the
// two device-list sources: volume mounts are preferred, and the
// NVIDIA_VISIBLE_DEVICES envvar is only honored for privileged containers or
// when AcceptEnvvarUnprivileged is enabled (otherwise getDevices panics).
func TestDeviceListSourcePriority(t *testing.T) {
	// strPtr returns a pointer to a copy of s, for building expectations.
	strPtr := func(s string) *string { return &s }

	// show renders an optional device list without dereferencing nil, so a
	// mismatch involving a nil value is reported as a test failure instead of
	// crashing with a nil pointer dereference.
	show := func(s *string) string {
		if s == nil {
			return "<nil>"
		}
		return *s
	}

	var tests = []struct {
		description        string
		mountDevices       []Mount
		envvarDevices      string
		privileged         bool
		acceptUnprivileged bool
		expectedDevices    *string
		expectedPanic      bool
	}{
		{
			description: "Mount devices, unprivileged, no accept unprivileged",
			mountDevices: []Mount{
				{
					Source:      "/dev/null",
					Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU0"),
				},
				{
					Source:      "/dev/null",
					Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU1"),
				},
			},
			envvarDevices:      "GPU2,GPU3",
			privileged:         false,
			acceptUnprivileged: false,
			expectedDevices:    strPtr("GPU0,GPU1"),
		},
		{
			description:        "No mount devices, unprivileged, no accept unprivileged",
			mountDevices:       nil,
			envvarDevices:      "GPU0,GPU1",
			privileged:         false,
			acceptUnprivileged: false,
			expectedPanic:      true,
		},
		{
			description:        "No mount devices, privileged, no accept unprivileged",
			mountDevices:       nil,
			envvarDevices:      "GPU0,GPU1",
			privileged:         true,
			acceptUnprivileged: false,
			expectedDevices:    strPtr("GPU0,GPU1"),
		},
		{
			description:        "No mount devices, unprivileged, accept unprivileged",
			mountDevices:       nil,
			envvarDevices:      "GPU0,GPU1",
			privileged:         false,
			acceptUnprivileged: true,
			expectedDevices:    strPtr("GPU0,GPU1"),
		},
	}

	for _, tc := range tests {
		t.Run(tc.description, func(t *testing.T) {
			// Wrap the call to getDevices() in a closure. The closure has its
			// own name so it does not shadow the function under test.
			var devices *string
			callGetDevices := func() {
				env := map[string]string{
					envNVVisibleDevices: tc.envvarDevices,
				}
				hookConfig := getDefaultHookConfig()
				hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged
				devices = getDevices(&hookConfig, env, tc.mountDevices, tc.privileged, false)
			}

			// For any tests that are expected to panic, make sure they do.
			if tc.expectedPanic {
				mustPanic(t, callGetDevices)
				return
			}

			// For all other tests, just grab the devices and check the results
			callGetDevices()
			if !reflect.DeepEqual(devices, tc.expectedDevices) {
				t.Errorf("Unexpected devices (got: %v, wanted: %v)", show(devices), show(tc.expectedDevices))
			}
		})
	}
}
func elementsMatch(slice0, slice1 []string) bool {
map0 := make(map[string]int)
map1 := make(map[string]int)

View File

@ -13,6 +13,10 @@ const (
driverPath = "/run/nvidia/driver"
)
const (
// defaultDeviceListVolumeMount is the value getDefaultHookConfig assigns to
// HookConfig.DeviceListVolumeMount: the directory inside the container under
// which volume mounts are interpreted as the list of requested devices.
defaultDeviceListVolumeMount = "/var/run/nvidia-container-devices"
)
var defaultPaths = [...]string{
path.Join(driverPath, configPath),
configPath,
@ -20,43 +24,45 @@ var defaultPaths = [...]string{
// CLIConfig : options for nvidia-container-cli.
type CLIConfig struct {
Root *string `toml:"root"`
Path *string `toml:"path"`
Environment []string `toml:"environment"`
Debug *string `toml:"debug"`
Ldcache *string `toml:"ldcache"`
LoadKmods bool `toml:"load-kmods"`
NoPivot bool `toml:"no-pivot"`
NoCgroups bool `toml:"no-cgroups"`
User *string `toml:"user"`
Ldconfig *string `toml:"ldconfig"`
AlphaMergeVisibleDevicesEnvvars bool `toml:"alpha-merge-visible-devices-envvars"`
Root *string `toml:"root"`
Path *string `toml:"path"`
Environment []string `toml:"environment"`
Debug *string `toml:"debug"`
Ldcache *string `toml:"ldcache"`
LoadKmods bool `toml:"load-kmods"`
NoPivot bool `toml:"no-pivot"`
NoCgroups bool `toml:"no-cgroups"`
User *string `toml:"user"`
Ldconfig *string `toml:"ldconfig"`
}
// HookConfig : options for the nvidia-container-toolkit.
type HookConfig struct {
DisableRequire bool `toml:"disable-require"`
SwarmResource *string `toml:"swarm-resource"`
DisableRequire bool `toml:"disable-require"`
SwarmResource *string `toml:"swarm-resource"`
AcceptEnvvarUnprivileged bool `toml:"accept-nvidia-visible-devices-envvar-when-unprivileged"`
DeviceListVolumeMount *string `toml:"look-for-nvidia-visible-devices-as-volume-mounts-under"`
NvidiaContainerCLI CLIConfig `toml:"nvidia-container-cli"`
}
func getDefaultHookConfig() (config HookConfig) {
return HookConfig{
DisableRequire: false,
SwarmResource: nil,
DisableRequire: false,
SwarmResource: nil,
AcceptEnvvarUnprivileged: true,
DeviceListVolumeMount: &[]string{defaultDeviceListVolumeMount}[0],
NvidiaContainerCLI: CLIConfig{
Root: nil,
Path: nil,
Environment: []string{},
Debug: nil,
Ldcache: nil,
LoadKmods: true,
NoPivot: false,
NoCgroups: false,
User: nil,
Ldconfig: nil,
AlphaMergeVisibleDevicesEnvvars: false,
Root: nil,
Path: nil,
Environment: []string{},
Debug: nil,
Ldcache: nil,
LoadKmods: true,
NoPivot: false,
NoCgroups: false,
User: nil,
Ldconfig: nil,
},
}
}