diff --git a/config/config.toml.amzn b/config/config.toml.amzn index 23d056ca..4de3d1c9 100644 --- a/config/config.toml.amzn +++ b/config/config.toml.amzn @@ -1,5 +1,7 @@ disable-require = false #swarm-resource = "DOCKER_RESOURCE_GPU" +#accept-nvidia-visible-devices-envvar-when-unprivileged = true +#look-for-nvidia-visible-devices-as-volume-mounts-under = "/var/run/nvidia-container-devices" [nvidia-container-cli] #root = "/run/nvidia/driver" @@ -11,7 +13,6 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig" -#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.centos b/config/config.toml.centos index 23d056ca..4de3d1c9 100644 --- a/config/config.toml.centos +++ b/config/config.toml.centos @@ -1,5 +1,7 @@ disable-require = false #swarm-resource = "DOCKER_RESOURCE_GPU" +#accept-nvidia-visible-devices-envvar-when-unprivileged = true +#look-for-nvidia-visible-devices-as-volume-mounts-under = "/var/run/nvidia-container-devices" [nvidia-container-cli] #root = "/run/nvidia/driver" @@ -11,7 +13,6 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig" -#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.debian b/config/config.toml.debian index 23d056ca..4de3d1c9 100644 --- a/config/config.toml.debian +++ b/config/config.toml.debian @@ -1,5 +1,7 @@ disable-require = false #swarm-resource = "DOCKER_RESOURCE_GPU" +#accept-nvidia-visible-devices-envvar-when-unprivileged = true +#look-for-nvidia-visible-devices-as-volume-mounts-under = "/var/run/nvidia-container-devices" [nvidia-container-cli] #root = "/run/nvidia/driver" @@ -11,7 +13,6 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig" -#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.opensuse-leap b/config/config.toml.opensuse-leap index 4d51e51f..fe28e163 100644 --- a/config/config.toml.opensuse-leap +++ b/config/config.toml.opensuse-leap @@ -1,5 +1,7 @@ disable-require = false #swarm-resource = "DOCKER_RESOURCE_GPU" +#accept-nvidia-visible-devices-envvar-when-unprivileged = true +#look-for-nvidia-visible-devices-as-volume-mounts-under = "/var/run/nvidia-container-devices" [nvidia-container-cli] #root = "/run/nvidia/driver" @@ -11,7 +13,6 @@ load-kmods = true #no-cgroups = false user = "root:video" ldconfig = "@/sbin/ldconfig" -#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/config/config.toml.ubuntu b/config/config.toml.ubuntu index 1c2c2b6e..061c0759 100644 --- a/config/config.toml.ubuntu +++ b/config/config.toml.ubuntu @@ -1,5 +1,7 @@ disable-require = false #swarm-resource = "DOCKER_RESOURCE_GPU" +#accept-nvidia-visible-devices-envvar-when-unprivileged = true +#look-for-nvidia-visible-devices-as-volume-mounts-under = "/var/run/nvidia-container-devices" [nvidia-container-cli] #root = "/run/nvidia/driver" @@ -11,7 +13,6 @@ load-kmods = true #no-cgroups = false #user = "root:video" ldconfig = "@/sbin/ldconfig.real" -#alpha-merge-visible-devices-envvars = false [nvidia-container-runtime] #debug = "/var/log/nvidia-container-runtime.log" diff --git a/container_config_test.go b/container_config_test.go deleted file mode 100644 index 7e5b0699..00000000 --- a/container_config_test.go +++ /dev/null @@ -1,131 +0,0 @@ -package main - -import ( - "github.com/stretchr/testify/require" - "sort" - "strings" - "testing" -) - -func TestMergeVisibleDevicesEnvvars(t *testing.T) { - var tests = []struct { - name string - input []string - expected string - enableMerge bool - }{ - { - "Simple Merge Enabled", - []string{ - "NVIDIA_VISIBLE_DEVICES_0=0,1", - "NVIDIA_VISIBLE_DEVICES_1=2,3", - "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", - }, - "0,1,2,3,4,5", - true, - }, - { - "Simple Merge Disabled", - []string{ - "NVIDIA_VISIBLE_DEVICES_0=0,1", - "NVIDIA_VISIBLE_DEVICES_1=2,3", - "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", - }, - "", - false, - }, - { - "Merge No Override (Enabled)", - []string{ - "NVIDIA_VISIBLE_DEVICES=all", - }, - "all", - true, - }, - { - "Merge No Override (Disabled)", - []string{ - "NVIDIA_VISIBLE_DEVICES=all", - }, - "all", - false, - }, - { - "Merge Override (Enabled, Before)", - []string{ - "NVIDIA_VISIBLE_DEVICES=all", - "NVIDIA_VISIBLE_DEVICES_0=0,1", - "NVIDIA_VISIBLE_DEVICES_1=2,3", - "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", - }, - "0,1,2,3,4,5", - true, - }, - { - "Merge Override (Enabled, After)", - []string{ - "NVIDIA_VISIBLE_DEVICES_0=0,1", - "NVIDIA_VISIBLE_DEVICES_1=2,3", - "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", - "NVIDIA_VISIBLE_DEVICES=all", - }, - "0,1,2,3,4,5", - true, - }, - { - "Merge Override (Enabled, In Between)", - []string{ - "NVIDIA_VISIBLE_DEVICES_0=0,1", - "NVIDIA_VISIBLE_DEVICES_1=2,3", - "NVIDIA_VISIBLE_DEVICES=all", - "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", - }, - "0,1,2,3,4,5", - true, - }, - { - "Merge Override (Disabled, Before)", - []string{ - "NVIDIA_VISIBLE_DEVICES=all", - "NVIDIA_VISIBLE_DEVICES_0=0,1", - "NVIDIA_VISIBLE_DEVICES_1=2,3", - "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", - }, - "all", - false, - }, - { - "Merge Override (Disabled, After)", - []string{ - "NVIDIA_VISIBLE_DEVICES_0=0,1", - "NVIDIA_VISIBLE_DEVICES_1=2,3", - "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", - "NVIDIA_VISIBLE_DEVICES=all", - }, - "all", - false, - }, - { - "Merge Override (Disabled, In Between)", - []string{ - "NVIDIA_VISIBLE_DEVICES_0=0,1", - "NVIDIA_VISIBLE_DEVICES_1=2,3", - "NVIDIA_VISIBLE_DEVICES=all", - "NVIDIA_VISIBLE_DEVICES_WHATEVER=4,5", - }, - "all", - false, - }, - } - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - config := CLIConfig{ - AlphaMergeVisibleDevicesEnvvars: tc.enableMerge, - } - envvars := getEnvMap(tc.input, config) - devices := strings.Split(envvars[envNVVisibleDevices], ",") - sort.Strings(devices) - require.Equal(t, tc.expected, strings.Join(devices, ",")) - }) - } -} diff --git a/pkg/Godeps/Godeps.json b/pkg/Godeps/Godeps.json deleted file mode 100644 index e6db9bf9..00000000 --- a/pkg/Godeps/Godeps.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "ImportPath": "github.com/nvidia/nvidia-container-runtime/toolkit/nvidia-container-toolkit", - "GoVersion": "go1.9", - "GodepVersion": "v80", - "Deps": [ - { - "ImportPath": "github.com/BurntSushi/toml", - "Comment": "v0.3.0-7-ga368813", - "Rev": "a368813c5e648fee92e5f6c30e3944ff9d5e8895" - } - ] -} diff --git a/pkg/container-toolkit b/pkg/container-toolkit deleted file mode 100755 index 1a8af0de..00000000 Binary files a/pkg/container-toolkit and /dev/null differ diff --git a/pkg/container_config.go b/pkg/container_config.go index 15bbc0f4..49ab5bb3 100644 --- a/pkg/container_config.go +++ b/pkg/container_config.go @@ -6,6 +6,7 @@ import ( "log" "os" "path" + "path/filepath" "strconv" "strings" @@ -73,6 +74,15 @@ type LinuxCapabilities struct { Ambient []string `json:"ambient,omitempty" platform:"linux"` } +// Mount from OCI runtime spec +// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L103 +type Mount struct { + Destination string `json:"destination"` + Type string `json:"type,omitempty" platform:"linux,solaris"` + Source string `json:"source,omitempty"` + Options []string `json:"options,omitempty"` +} + // Spec from OCI runtime spec // We use pointers to structs, similarly to the latest version of runtime-spec: // https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L5-L28 @@ -80,6 +90,7 @@ type Spec struct { Version *string `json:"ociVersion"` Process *Process `json:"process,omitempty"` Root *Root `json:"root,omitempty"` + Mounts []Mount `json:"mounts,omitempty"` } // HookState holds state information about the hook @@ -108,7 +119,7 @@ func parseCudaVersion(cudaVersion string) (vmaj, vmin, vpatch uint32) { return } -func getEnvMap(e []string, config CLIConfig) (m map[string]string) { +func getEnvMap(e []string) (m map[string]string) { m = make(map[string]string) for _, s := range e { p := strings.SplitN(s, "=", 2) @@ -117,17 +128,6 @@ func getEnvMap(e []string, config CLIConfig) (m map[string]string) { } m[p[0]] = p[1] } - if config.AlphaMergeVisibleDevicesEnvvars { - var mergable []string - for k, v := range m { - if strings.HasPrefix(k, envNVVisibleDevices+"_") { - mergable = append(mergable, v) - } - } - if len(mergable) > 0 { - m[envNVVisibleDevices] = strings.Join(mergable, ",") - } - } return } @@ -198,7 +198,7 @@ func isLegacyCUDAImage(env map[string]string) bool { return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0 } -func getDevices(env map[string]string, legacyImage bool) *string { +func getDevicesFromEnvvar(env map[string]string, legacyImage bool) *string { // Build a list of envvars to consider. envVars := []string{envNVVisibleDevices} if envSwarmGPU != nil { @@ -236,6 +236,65 @@ func getDevices(env map[string]string, legacyImage bool) *string { return devices } +func getDevicesFromMounts(root string, mounts []Mount) *string { + var devices []string + for _, m := range mounts { + root := filepath.Clean(root) + source := filepath.Clean(m.Source) + destination := filepath.Clean(m.Destination) + + // Only consider mounts who's host volume is /dev/null + if source != "/dev/null" { + continue + } + // Only consider container mount points that begin with 'root' + if len(destination) < len(root) { + continue + } + if destination[:len(root)] != root { + continue + } + // Grab the full path beyond 'root' and add it to the list of devices + device := destination[len(root):] + if len(device) > 0 && device[0] == '/' { + device = device[1:] + } + if len(device) == 0 { + continue + } + devices = append(devices, device) + } + + if devices == nil { + return nil + } + + ret := strings.Join(devices, ",") + return &ret +} + +func getDevices(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool, legacyImage bool) *string { + // Try and get the device list from mount volumes first + devices := getDevicesFromMounts(*hookConfig.DeviceListVolumeMount, mounts) + if devices != nil { + return devices + } + + // Fallback to reading from the environment variable if privileges are correct + devices = getDevicesFromEnvvar(env, legacyImage) + if devices == nil { + return nil + } + if privileged || hookConfig.AcceptEnvvarUnprivileged { + return devices + } + + // Error out otherwise + log.Panicln("insufficient privileges to read device list from NVIDIA_VISIBLE_DEVICES envvar") + + return nil +} + func getMigConfigDevices(env map[string]string) *string { if devices, ok := env[envNVMigConfigDevices]; ok { return &devices @@ -296,11 +355,11 @@ func getRequirements(env map[string]string, legacyImage bool) []string { return requirements } -func getNvidiaConfig(env map[string]string, privileged bool) *nvidiaConfig { +func getNvidiaConfig(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool) *nvidiaConfig { legacyImage := isLegacyCUDAImage(env) var devices string - if d := getDevices(env, legacyImage); d != nil { + if d := getDevices(hookConfig, env, mounts, privileged, legacyImage); d != nil { devices = *d } else { // 'nil' devices means this is not a GPU container. @@ -357,13 +416,13 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { s := loadSpec(path.Join(b, "config.json")) - env := getEnvMap(s.Process.Env, hook.NvidiaContainerCLI) + env := getEnvMap(s.Process.Env) privileged := isPrivileged(s) envSwarmGPU = hook.SwarmResource return containerConfig{ Pid: h.Pid, Rootfs: s.Root.Path, Env: env, - Nvidia: getNvidiaConfig(env, privileged), + Nvidia: getNvidiaConfig(&hook, env, s.Mounts, privileged), } } diff --git a/pkg/container_test.go b/pkg/container_test.go index 365e9c03..98c40d2c 100644 --- a/pkg/container_test.go +++ b/pkg/container_test.go @@ -1,6 +1,7 @@ package main import ( + "path/filepath" "reflect" "testing" ) @@ -407,7 +408,8 @@ func TestGetNvidiaConfig(t *testing.T) { // Wrap the call to getNvidiaConfig() in a closure. var config *nvidiaConfig getConfig := func() { - config = getNvidiaConfig(tc.env, tc.privileged) + hookConfig := getDefaultHookConfig() + config = getNvidiaConfig(&hookConfig, tc.env, nil, tc.privileged) } // For any tests that are expected to panic, make sure they do. @@ -449,6 +451,173 @@ func TestGetNvidiaConfig(t *testing.T) { } } +func TestGetDevicesFromMounts(t *testing.T) { + var tests = []struct { + description string + root string + mounts []Mount + expectedDevices *string + }{ + { + description: "No mounts", + root: defaultDeviceListVolumeMount, + mounts: nil, + expectedDevices: nil, + }, + { + description: "Host path is not /dev/null", + root: defaultDeviceListVolumeMount, + mounts: []Mount{ + { + Source: "/not/dev/null", + Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU0"), + }, + }, + expectedDevices: nil, + }, + { + description: "Container path is not prefixed by 'root'", + root: defaultDeviceListVolumeMount, + mounts: []Mount{ + { + Source: "/dev/null", + Destination: filepath.Join("/other/prefix", "GPU0"), + }, + }, + expectedDevices: nil, + }, + { + description: "Container path is only 'root'", + root: defaultDeviceListVolumeMount, + mounts: []Mount{ + { + Source: "/dev/null", + Destination: defaultDeviceListVolumeMount, + }, + }, + expectedDevices: nil, + }, + { + description: "Discover 2 devices", + root: defaultDeviceListVolumeMount, + mounts: []Mount{ + { + Source: "/dev/null", + Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU0"), + }, + { + Source: "/dev/null", + Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU1"), + }, + }, + expectedDevices: &[]string{"GPU0,GPU1"}[0], + }, + { + description: "Discover 2 devices with slashes in the name", + root: defaultDeviceListVolumeMount, + mounts: []Mount{ + { + Source: "/dev/null", + Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU0-MIG0/0/1"), + }, + { + Source: "/dev/null", + Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU1-MIG0/0/1"), + }, + }, + expectedDevices: &[]string{"GPU0-MIG0/0/1,GPU1-MIG0/0/1"}[0], + }, + } + for _, tc := range tests { + t.Run(tc.description, func(t *testing.T) { + devices := getDevicesFromMounts(tc.root, tc.mounts) + if !reflect.DeepEqual(devices, tc.expectedDevices) { + t.Errorf("Unexpected devices (got: %v, wanted: %v)", *devices, *tc.expectedDevices) + } + }) + } +} + +func TestDeviceListSourcePriority(t *testing.T) { + var tests = []struct { + description string + mountDevices []Mount + envvarDevices string + privileged bool + acceptUnprivileged bool + expectedDevices *string + expectedPanic bool + }{ + { + description: "Mount devices, unprivileged, no accept unprivileged", + mountDevices: []Mount{ + { + Source: "/dev/null", + Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU0"), + }, + { + Source: "/dev/null", + Destination: filepath.Join(defaultDeviceListVolumeMount, "GPU1"), + }, + }, + envvarDevices: "GPU2,GPU3", + privileged: false, + acceptUnprivileged: false, + expectedDevices: &[]string{"GPU0,GPU1"}[0], + }, + { + description: "No mount devices, unprivileged, no accept unprivileged", + mountDevices: nil, + envvarDevices: "GPU0,GPU1", + privileged: false, + acceptUnprivileged: false, + expectedPanic: true, + }, + { + description: "No mount devices, privileged, no accept unprivileged", + mountDevices: nil, + envvarDevices: "GPU0,GPU1", + privileged: true, + acceptUnprivileged: false, + expectedDevices: &[]string{"GPU0,GPU1"}[0], + }, + { + description: "No mount devices, unprivileged, accept unprivileged", + mountDevices: nil, + envvarDevices: "GPU0,GPU1", + privileged: false, + acceptUnprivileged: true, + expectedDevices: &[]string{"GPU0,GPU1"}[0], + }, + } + for _, tc := range tests { + t.Run(tc.description, func(t *testing.T) { + // Wrap the call to getDevices() in a closure. + var devices *string + getDevices := func() { + env := map[string]string{ + envNVVisibleDevices: tc.envvarDevices, + } + hookConfig := getDefaultHookConfig() + hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged + devices = getDevices(&hookConfig, env, tc.mountDevices, tc.privileged, false) + } + + // For any tests that are expected to panic, make sure they do. + if tc.expectedPanic { + mustPanic(t, getDevices) + return + } + + // For all other tests, just grab the devices and check the results + getDevices() + if !reflect.DeepEqual(devices, tc.expectedDevices) { + t.Errorf("Unexpected devices (got: %v, wanted: %v)", *devices, *tc.expectedDevices) + } + }) + } +} + func elementsMatch(slice0, slice1 []string) bool { map0 := make(map[string]int) map1 := make(map[string]int) diff --git a/pkg/hook_config.go b/pkg/hook_config.go index 3790fc46..09b78e14 100644 --- a/pkg/hook_config.go +++ b/pkg/hook_config.go @@ -13,6 +13,10 @@ const ( driverPath = "/run/nvidia/driver" ) +const ( + defaultDeviceListVolumeMount = "/var/run/nvidia-container-devices" +) + var defaultPaths = [...]string{ path.Join(driverPath, configPath), configPath, @@ -20,43 +24,45 @@ var defaultPaths = [...]string{ // CLIConfig : options for nvidia-container-cli. type CLIConfig struct { - Root *string `toml:"root"` - Path *string `toml:"path"` - Environment []string `toml:"environment"` - Debug *string `toml:"debug"` - Ldcache *string `toml:"ldcache"` - LoadKmods bool `toml:"load-kmods"` - NoPivot bool `toml:"no-pivot"` - NoCgroups bool `toml:"no-cgroups"` - User *string `toml:"user"` - Ldconfig *string `toml:"ldconfig"` - AlphaMergeVisibleDevicesEnvvars bool `toml:"alpha-merge-visible-devices-envvars"` + Root *string `toml:"root"` + Path *string `toml:"path"` + Environment []string `toml:"environment"` + Debug *string `toml:"debug"` + Ldcache *string `toml:"ldcache"` + LoadKmods bool `toml:"load-kmods"` + NoPivot bool `toml:"no-pivot"` + NoCgroups bool `toml:"no-cgroups"` + User *string `toml:"user"` + Ldconfig *string `toml:"ldconfig"` } // HookConfig : options for the nvidia-container-toolkit. type HookConfig struct { - DisableRequire bool `toml:"disable-require"` - SwarmResource *string `toml:"swarm-resource"` + DisableRequire bool `toml:"disable-require"` + SwarmResource *string `toml:"swarm-resource"` + AcceptEnvvarUnprivileged bool `toml:"accept-nvidia-visible-devices-envvar-when-unprivileged"` + DeviceListVolumeMount *string `toml:"look-for-nvidia-visible-devices-as-volume-mounts-under"` NvidiaContainerCLI CLIConfig `toml:"nvidia-container-cli"` } func getDefaultHookConfig() (config HookConfig) { return HookConfig{ - DisableRequire: false, - SwarmResource: nil, + DisableRequire: false, + SwarmResource: nil, + AcceptEnvvarUnprivileged: true, + DeviceListVolumeMount: &[]string{defaultDeviceListVolumeMount}[0], NvidiaContainerCLI: CLIConfig{ - Root: nil, - Path: nil, - Environment: []string{}, - Debug: nil, - Ldcache: nil, - LoadKmods: true, - NoPivot: false, - NoCgroups: false, - User: nil, - Ldconfig: nil, - AlphaMergeVisibleDevicesEnvvars: false, + Root: nil, + Path: nil, + Environment: []string{}, + Debug: nil, + Ldcache: nil, + LoadKmods: true, + NoPivot: false, + NoCgroups: false, + User: nil, + Ldconfig: nil, }, } }