Move pkg to cmd/nvidia-container-toolkit

This change moves the pkg folder to `cmd/nvidia-container-toolkit` to
better match go best practices. This allows, for example, for the
`cmd/nvidia-container-toolkit` to be go installed.

The only package included in `pkg` was `main`.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2021-06-08 15:20:17 +02:00
parent f25698e96e
commit f72b79cc2a
7 changed files with 2 additions and 3 deletions

View File

@@ -0,0 +1,27 @@
package main
import (
"log"
)
func capabilityToCLI(cap string) string {
switch cap {
case "compute":
return "--compute"
case "compat32":
return "--compat32"
case "graphics":
return "--graphics"
case "utility":
return "--utility"
case "video":
return "--video"
case "display":
return "--display"
case "ngx":
return "--ngx"
default:
log.Panicln("unknown driver capability:", cap)
}
return ""
}

View File

@@ -0,0 +1,435 @@
package main
import (
"encoding/json"
"fmt"
"log"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"golang.org/x/mod/semver"
)
var envSwarmGPU *string
const (
envCUDAVersion = "CUDA_VERSION"
envNVRequirePrefix = "NVIDIA_REQUIRE_"
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES"
envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES"
envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
)
const (
allDriverCapabilities = "compute,compat32,graphics,utility,video,display,ngx"
defaultDriverCapabilities = "utility,compute"
)
const (
capSysAdmin = "CAP_SYS_ADMIN"
)
const (
deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"
)
type nvidiaConfig struct {
Devices string
MigConfigDevices string
MigMonitorDevices string
DriverCapabilities string
Requirements []string
DisableRequire bool
}
type containerConfig struct {
Pid int
Rootfs string
Env map[string]string
Nvidia *nvidiaConfig
}
// Root from OCI runtime spec
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L94-L100
type Root struct {
Path string `json:"path"`
}
// Process from OCI runtime spec
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L57
type Process struct {
Env []string `json:"env,omitempty"`
Capabilities *json.RawMessage `json:"capabilities,omitempty" platform:"linux"`
}
// LinuxCapabilities from OCI runtime spec
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L61
type LinuxCapabilities struct {
Bounding []string `json:"bounding,omitempty" platform:"linux"`
Effective []string `json:"effective,omitempty" platform:"linux"`
Inheritable []string `json:"inheritable,omitempty" platform:"linux"`
Permitted []string `json:"permitted,omitempty" platform:"linux"`
Ambient []string `json:"ambient,omitempty" platform:"linux"`
}
// Mount from OCI runtime spec
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L103
type Mount struct {
Destination string `json:"destination"`
Type string `json:"type,omitempty" platform:"linux,solaris"`
Source string `json:"source,omitempty"`
Options []string `json:"options,omitempty"`
}
// Spec from OCI runtime spec
// We use pointers to structs, similarly to the latest version of runtime-spec:
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L5-L28
type Spec struct {
Version *string `json:"ociVersion"`
Process *Process `json:"process,omitempty"`
Root *Root `json:"root,omitempty"`
Mounts []Mount `json:"mounts,omitempty"`
}
// HookState holds state information about the hook
type HookState struct {
Pid int `json:"pid,omitempty"`
// After 17.06, runc is using the runtime spec:
// github.com/docker/runc/blob/17.06/libcontainer/configs/config.go#L262-L263
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/state.go#L3-L17
Bundle string `json:"bundle"`
// Before 17.06, runc used a custom struct that didn't conform to the spec:
// github.com/docker/runc/blob/17.03.x/libcontainer/configs/config.go#L245-L252
BundlePath string `json:"bundlePath"`
}
func parseCudaVersion(cudaVersion string) (vmaj, vmin, vpatch uint32) {
if _, err := fmt.Sscanf(cudaVersion, "%d.%d.%d\n", &vmaj, &vmin, &vpatch); err != nil {
vpatch = 0
if _, err := fmt.Sscanf(cudaVersion, "%d.%d\n", &vmaj, &vmin); err != nil {
vmin = 0
if _, err := fmt.Sscanf(cudaVersion, "%d\n", &vmaj); err != nil {
log.Panicln("invalid CUDA version:", cudaVersion)
}
}
}
return
}
func getEnvMap(e []string) (m map[string]string) {
m = make(map[string]string)
for _, s := range e {
p := strings.SplitN(s, "=", 2)
if len(p) != 2 {
log.Panicln("environment error")
}
m[p[0]] = p[1]
}
return
}
func loadSpec(path string) (spec *Spec) {
f, err := os.Open(path)
if err != nil {
log.Panicln("could not open OCI spec:", err)
}
defer f.Close()
if err = json.NewDecoder(f).Decode(&spec); err != nil {
log.Panicln("could not decode OCI spec:", err)
}
if spec.Version == nil {
log.Panicln("Version is empty in OCI spec")
}
if spec.Process == nil {
log.Panicln("Process is empty in OCI spec")
}
if spec.Root == nil {
log.Panicln("Root is empty in OCI spec")
}
return
}
func isPrivileged(s *Spec) bool {
if s.Process.Capabilities == nil {
return false
}
var caps []string
// If v1.1.0-rc1 <= OCI version < v1.0.0-rc5 parse s.Process.Capabilities as:
// github.com/opencontainers/runtime-spec/blob/v1.0.0-rc1/specs-go/config.go#L30-L54
rc1cmp := semver.Compare("v"+*s.Version, "v1.0.0-rc1")
rc5cmp := semver.Compare("v"+*s.Version, "v1.0.0-rc5")
if (rc1cmp == 1 || rc1cmp == 0) && (rc5cmp == -1) {
err := json.Unmarshal(*s.Process.Capabilities, &caps)
if err != nil {
log.Panicln("could not decode Process.Capabilities in OCI spec:", err)
}
// Otherwise, parse s.Process.Capabilities as:
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L54
} else {
var lc LinuxCapabilities
err := json.Unmarshal(*s.Process.Capabilities, &lc)
if err != nil {
log.Panicln("could not decode Process.Capabilities in OCI spec:", err)
}
// We only make sure that the bounding capabibility set has
// CAP_SYS_ADMIN. This allows us to make sure that the container was
// actually started as '--privileged', but also allow non-root users to
// access the privileged NVIDIA capabilities.
caps = lc.Bounding
}
for _, c := range caps {
if c == capSysAdmin {
return true
}
}
return false
}
func isLegacyCUDAImage(env map[string]string) bool {
legacyCudaVersion := env[envCUDAVersion]
cudaRequire := env[envNVRequireCUDA]
return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0
}
func getDevicesFromEnvvar(env map[string]string, legacyImage bool) *string {
// Build a list of envvars to consider.
envVars := []string{envNVVisibleDevices}
if envSwarmGPU != nil {
// The Swarm envvar has higher precedence.
envVars = append([]string{*envSwarmGPU}, envVars...)
}
// Grab a reference to devices from the first envvar
// in the list that actually exists in the environment.
var devices *string
for _, envVar := range envVars {
if devs, ok := env[envVar]; ok {
devices = &devs
break
}
}
// Environment variable unset with legacy image: default to "all".
if devices == nil && legacyImage {
all := "all"
return &all
}
// Environment variable unset or empty or "void": return nil
if devices == nil || len(*devices) == 0 || *devices == "void" {
return nil
}
// Environment variable set to "none": reset to "".
if *devices == "none" {
empty := ""
return &empty
}
// Any other value.
return devices
}
func getDevicesFromMounts(mounts []Mount) *string {
var devices []string
for _, m := range mounts {
root := filepath.Clean(deviceListAsVolumeMountsRoot)
source := filepath.Clean(m.Source)
destination := filepath.Clean(m.Destination)
// Only consider mounts who's host volume is /dev/null
if source != "/dev/null" {
continue
}
// Only consider container mount points that begin with 'root'
if len(destination) < len(root) {
continue
}
if destination[:len(root)] != root {
continue
}
// Grab the full path beyond 'root' and add it to the list of devices
device := destination[len(root):]
if len(device) > 0 && device[0] == '/' {
device = device[1:]
}
if len(device) == 0 {
continue
}
devices = append(devices, device)
}
if devices == nil {
return nil
}
ret := strings.Join(devices, ",")
return &ret
}
func getDevices(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool, legacyImage bool) *string {
// If enabled, try and get the device list from volume mounts first
if hookConfig.AcceptDeviceListAsVolumeMounts {
devices := getDevicesFromMounts(mounts)
if devices != nil {
return devices
}
}
// Fallback to reading from the environment variable if privileges are correct
devices := getDevicesFromEnvvar(env, legacyImage)
if devices == nil {
return nil
}
if privileged || hookConfig.AcceptEnvvarUnprivileged {
return devices
}
configName := hookConfig.getConfigOption("AcceptEnvvarUnprivileged")
log.Printf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES (privileged=%v, %v=%v) ", privileged, configName, hookConfig.AcceptEnvvarUnprivileged)
return nil
}
func getMigConfigDevices(env map[string]string) *string {
if devices, ok := env[envNVMigConfigDevices]; ok {
return &devices
}
return nil
}
func getMigMonitorDevices(env map[string]string) *string {
if devices, ok := env[envNVMigMonitorDevices]; ok {
return &devices
}
return nil
}
func getDriverCapabilities(env map[string]string, legacyImage bool) *string {
// Grab a reference to the capabilities from the envvar
// if it actually exists in the environment.
var capabilities *string
if caps, ok := env[envNVDriverCapabilities]; ok {
capabilities = &caps
}
// Environment variable unset with legacy image: set all capabilities.
if capabilities == nil && legacyImage {
allCaps := allDriverCapabilities
return &allCaps
}
// Environment variable unset or set but empty: set default capabilities.
if capabilities == nil || len(*capabilities) == 0 {
defaultCaps := defaultDriverCapabilities
return &defaultCaps
}
// Environment variable set to "all": set all capabilities.
if *capabilities == "all" {
allCaps := allDriverCapabilities
return &allCaps
}
// Any other value
return capabilities
}
func getRequirements(env map[string]string, legacyImage bool) []string {
// All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli
var requirements []string
for name, value := range env {
if strings.HasPrefix(name, envNVRequirePrefix) {
requirements = append(requirements, value)
}
}
if legacyImage {
vmaj, vmin, _ := parseCudaVersion(env[envCUDAVersion])
cudaRequire := fmt.Sprintf("cuda>=%d.%d", vmaj, vmin)
requirements = append(requirements, cudaRequire)
}
return requirements
}
func getNvidiaConfig(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool) *nvidiaConfig {
legacyImage := isLegacyCUDAImage(env)
var devices string
if d := getDevices(hookConfig, env, mounts, privileged, legacyImage); d != nil {
devices = *d
} else {
// 'nil' devices means this is not a GPU container.
return nil
}
var migConfigDevices string
if d := getMigConfigDevices(env); d != nil {
migConfigDevices = *d
}
if !privileged && migConfigDevices != "" {
log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container")
}
var migMonitorDevices string
if d := getMigMonitorDevices(env); d != nil {
migMonitorDevices = *d
}
if !privileged && migMonitorDevices != "" {
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
}
var driverCapabilities string
if c := getDriverCapabilities(env, legacyImage); c != nil {
driverCapabilities = *c
}
requirements := getRequirements(env, legacyImage)
// Don't fail on invalid values.
disableRequire, _ := strconv.ParseBool(env[envNVDisableRequire])
return &nvidiaConfig{
Devices: devices,
MigConfigDevices: migConfigDevices,
MigMonitorDevices: migMonitorDevices,
DriverCapabilities: driverCapabilities,
Requirements: requirements,
DisableRequire: disableRequire,
}
}
func getContainerConfig(hook HookConfig) (config containerConfig) {
var h HookState
d := json.NewDecoder(os.Stdin)
if err := d.Decode(&h); err != nil {
log.Panicln("could not decode container state:", err)
}
b := h.Bundle
if len(b) == 0 {
b = h.BundlePath
}
s := loadSpec(path.Join(b, "config.json"))
env := getEnvMap(s.Process.Env)
privileged := isPrivileged(s)
envSwarmGPU = hook.SwarmResource
return containerConfig{
Pid: h.Pid,
Rootfs: s.Root.Path,
Env: env,
Nvidia: getNvidiaConfig(&hook, env, s.Mounts, privileged),
}
}

View File

@@ -0,0 +1,824 @@
package main
import (
"path/filepath"
"testing"
"github.com/stretchr/testify/require"
)
func TestGetNvidiaConfig(t *testing.T) {
var tests = []struct {
description string
env map[string]string
privileged bool
expectedConfig *nvidiaConfig
expectedPanic bool
}{
{
description: "No environment, unprivileged",
env: map[string]string{},
privileged: false,
expectedConfig: nil,
},
{
description: "No environment, privileged",
env: map[string]string{},
privileged: true,
expectedConfig: nil,
},
{
description: "Legacy image, no devices, no capabilities, no requirements",
env: map[string]string{
envCUDAVersion: "9.0",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "all",
DriverCapabilities: allDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Legacy image, devices 'all', no capabilities, no requirements",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "all",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "all",
DriverCapabilities: allDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Legacy image, devices 'empty', no capabilities, no requirements",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "",
},
privileged: false,
expectedConfig: nil,
},
{
description: "Legacy image, devices 'void', no capabilities, no requirements",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "",
},
privileged: false,
expectedConfig: nil,
},
{
description: "Legacy image, devices 'none', no capabilities, no requirements",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "none",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "",
DriverCapabilities: allDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Legacy image, devices set, no capabilities, no requirements",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "gpu0,gpu1",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: allDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Legacy image, devices set, capabilities 'empty', no requirements",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "gpu0,gpu1",
envNVDriverCapabilities: "",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: defaultDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Legacy image, devices set, capabilities 'all', no requirements",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "gpu0,gpu1",
envNVDriverCapabilities: "all",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: allDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Legacy image, devices set, capabilities set, no requirements",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "gpu0,gpu1",
envNVDriverCapabilities: "cap0,cap1",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: "cap0,cap1",
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Legacy image, devices set, capabilities set, requirements set",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "gpu0,gpu1",
envNVDriverCapabilities: "cap0,cap1",
envNVRequirePrefix + "REQ0": "req0=true",
envNVRequirePrefix + "REQ1": "req1=false",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: "cap0,cap1",
Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"},
DisableRequire: false,
},
},
{
description: "Legacy image, devices set, capabilities set, requirements set, disable requirements",
env: map[string]string{
envCUDAVersion: "9.0",
envNVVisibleDevices: "gpu0,gpu1",
envNVDriverCapabilities: "cap0,cap1",
envNVRequirePrefix + "REQ0": "req0=true",
envNVRequirePrefix + "REQ1": "req1=false",
envNVDisableRequire: "true",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: "cap0,cap1",
Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"},
DisableRequire: true,
},
},
{
description: "Modern image, no devices, no capabilities, no requirements, no envCUDAVersion",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
},
privileged: false,
expectedConfig: nil,
},
{
description: "Modern image, no devices, no capabilities, no requirement, envCUDAVersion set",
env: map[string]string{
envCUDAVersion: "9.0",
envNVRequireCUDA: "cuda>=9.0",
},
privileged: false,
expectedConfig: nil,
},
{
description: "Modern image, devices 'all', no capabilities, no requirements",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "all",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "all",
DriverCapabilities: defaultDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Modern image, devices 'empty', no capabilities, no requirements",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "",
},
privileged: false,
expectedConfig: nil,
},
{
description: "Modern image, devices 'void', no capabilities, no requirements",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "",
},
privileged: false,
expectedConfig: nil,
},
{
description: "Modern image, devices 'none', no capabilities, no requirements",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "none",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "",
DriverCapabilities: defaultDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Modern image, devices set, no capabilities, no requirements",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "gpu0,gpu1",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: defaultDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Modern image, devices set, capabilities 'empty', no requirements",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "gpu0,gpu1",
envNVDriverCapabilities: "",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: defaultDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Modern image, devices set, capabilities 'all', no requirements",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "gpu0,gpu1",
envNVDriverCapabilities: "all",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: allDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Modern image, devices set, capabilities set, no requirements",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "gpu0,gpu1",
envNVDriverCapabilities: "cap0,cap1",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: "cap0,cap1",
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Modern image, devices set, capabilities set, requirements set",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "gpu0,gpu1",
envNVDriverCapabilities: "cap0,cap1",
envNVRequirePrefix + "REQ0": "req0=true",
envNVRequirePrefix + "REQ1": "req1=false",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: "cap0,cap1",
Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"},
DisableRequire: false,
},
},
{
description: "Modern image, devices set, capabilities set, requirements set, disable requirements",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "gpu0,gpu1",
envNVDriverCapabilities: "cap0,cap1",
envNVRequirePrefix + "REQ0": "req0=true",
envNVRequirePrefix + "REQ1": "req1=false",
envNVDisableRequire: "true",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "gpu0,gpu1",
DriverCapabilities: "cap0,cap1",
Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"},
DisableRequire: true,
},
},
{
description: "No cuda envs, devices 'all'",
env: map[string]string{
envNVVisibleDevices: "all",
},
privileged: false,
expectedConfig: &nvidiaConfig{
Devices: "all",
DriverCapabilities: defaultDriverCapabilities,
Requirements: []string{},
DisableRequire: false,
},
},
{
description: "Modern image, devices 'all', migConfig set, privileged",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "all",
envNVMigConfigDevices: "mig0,mig1",
},
privileged: true,
expectedConfig: &nvidiaConfig{
Devices: "all",
MigConfigDevices: "mig0,mig1",
DriverCapabilities: defaultDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Modern image, devices 'all', migConfig set, unprivileged",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "all",
envNVMigConfigDevices: "mig0,mig1",
},
privileged: false,
expectedPanic: true,
},
{
description: "Modern image, devices 'all', migMonitor set, privileged",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "all",
envNVMigMonitorDevices: "mig0,mig1",
},
privileged: true,
expectedConfig: &nvidiaConfig{
Devices: "all",
MigMonitorDevices: "mig0,mig1",
DriverCapabilities: defaultDriverCapabilities,
Requirements: []string{"cuda>=9.0"},
DisableRequire: false,
},
},
{
description: "Modern image, devices 'all', migMonitor set, unprivileged",
env: map[string]string{
envNVRequireCUDA: "cuda>=9.0",
envNVVisibleDevices: "all",
envNVMigMonitorDevices: "mig0,mig1",
},
privileged: false,
expectedPanic: true,
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
// Wrap the call to getNvidiaConfig() in a closure.
var config *nvidiaConfig
getConfig := func() {
hookConfig := getDefaultHookConfig()
config = getNvidiaConfig(&hookConfig, tc.env, nil, tc.privileged)
}
// For any tests that are expected to panic, make sure they do.
if tc.expectedPanic {
require.Panics(t, getConfig)
return
}
// For all other tests, just grab the config
getConfig()
// And start comparing the test results to the expected results.
if tc.expectedConfig == nil {
require.Nil(t, config, tc.description)
return
}
require.NotNil(t, config, tc.description)
require.Equal(t, tc.expectedConfig.Devices, config.Devices)
require.Equal(t, tc.expectedConfig.MigConfigDevices, config.MigConfigDevices)
require.Equal(t, tc.expectedConfig.MigMonitorDevices, config.MigMonitorDevices)
require.Equal(t, tc.expectedConfig.DriverCapabilities, config.DriverCapabilities)
require.ElementsMatch(t, tc.expectedConfig.Requirements, config.Requirements)
require.Equal(t, tc.expectedConfig.DisableRequire, config.DisableRequire)
})
}
}
func TestGetDevicesFromMounts(t *testing.T) {
var tests = []struct {
description string
mounts []Mount
expectedDevices *string
}{
{
description: "No mounts",
mounts: nil,
expectedDevices: nil,
},
{
description: "Host path is not /dev/null",
mounts: []Mount{
{
Source: "/not/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"),
},
},
expectedDevices: nil,
},
{
description: "Container path is not prefixed by 'root'",
mounts: []Mount{
{
Source: "/dev/null",
Destination: filepath.Join("/other/prefix", "GPU0"),
},
},
expectedDevices: nil,
},
{
description: "Container path is only 'root'",
mounts: []Mount{
{
Source: "/dev/null",
Destination: deviceListAsVolumeMountsRoot,
},
},
expectedDevices: nil,
},
{
description: "Discover 2 devices",
mounts: []Mount{
{
Source: "/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"),
},
{
Source: "/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"),
},
},
expectedDevices: &[]string{"GPU0,GPU1"}[0],
},
{
description: "Discover 2 devices with slashes in the name",
mounts: []Mount{
{
Source: "/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0-MIG0/0/1"),
},
{
Source: "/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1-MIG0/0/1"),
},
},
expectedDevices: &[]string{"GPU0-MIG0/0/1,GPU1-MIG0/0/1"}[0],
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
devices := getDevicesFromMounts(tc.mounts)
require.Equal(t, tc.expectedDevices, devices)
})
}
}
func TestDeviceListSourcePriority(t *testing.T) {
var tests = []struct {
description string
mountDevices []Mount
envvarDevices string
privileged bool
acceptUnprivileged bool
acceptMounts bool
expectedDevices *string
}{
{
description: "Mount devices, unprivileged, no accept unprivileged",
mountDevices: []Mount{
{
Source: "/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"),
},
{
Source: "/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"),
},
},
envvarDevices: "GPU2,GPU3",
privileged: false,
acceptUnprivileged: false,
acceptMounts: true,
expectedDevices: &[]string{"GPU0,GPU1"}[0],
},
{
description: "No mount devices, unprivileged, no accept unprivileged",
mountDevices: nil,
envvarDevices: "GPU0,GPU1",
privileged: false,
acceptUnprivileged: false,
acceptMounts: true,
expectedDevices: nil,
},
{
description: "No mount devices, privileged, no accept unprivileged",
mountDevices: nil,
envvarDevices: "GPU0,GPU1",
privileged: true,
acceptUnprivileged: false,
acceptMounts: true,
expectedDevices: &[]string{"GPU0,GPU1"}[0],
},
{
description: "No mount devices, unprivileged, accept unprivileged",
mountDevices: nil,
envvarDevices: "GPU0,GPU1",
privileged: false,
acceptUnprivileged: true,
acceptMounts: true,
expectedDevices: &[]string{"GPU0,GPU1"}[0],
},
{
description: "Mount devices, unprivileged, accept unprivileged, no accept mounts",
mountDevices: []Mount{
{
Source: "/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"),
},
{
Source: "/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"),
},
},
envvarDevices: "GPU2,GPU3",
privileged: false,
acceptUnprivileged: true,
acceptMounts: false,
expectedDevices: &[]string{"GPU2,GPU3"}[0],
},
{
description: "Mount devices, unprivileged, no accept unprivileged, no accept mounts",
mountDevices: []Mount{
{
Source: "/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"),
},
{
Source: "/dev/null",
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"),
},
},
envvarDevices: "GPU2,GPU3",
privileged: false,
acceptUnprivileged: false,
acceptMounts: false,
expectedDevices: nil,
},
}
for _, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
// Wrap the call to getDevices() in a closure.
var devices *string
getDevices := func() {
env := map[string]string{
envNVVisibleDevices: tc.envvarDevices,
}
hookConfig := getDefaultHookConfig()
hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged
hookConfig.AcceptDeviceListAsVolumeMounts = tc.acceptMounts
devices = getDevices(&hookConfig, env, tc.mountDevices, tc.privileged, false)
}
// For all other tests, just grab the devices and check the results
getDevices()
require.Equal(t, tc.expectedDevices, devices)
})
}
}
func TestGetDevicesFromEnvvar(t *testing.T) {
all := "all"
empty := ""
envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
gpuID := "GPU-12345"
anotherGPUID := "GPU-67890"
var tests = []struct {
description string
envSwarmGPU *string
env map[string]string
legacyImage bool
expectedDevices *string
}{
{
description: "empty env returns nil for non-legacy image",
},
{
description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
envNVVisibleDevices: "",
},
},
{
description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
envNVVisibleDevices: "void",
},
},
{
description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image",
env: map[string]string{
envNVVisibleDevices: "none",
},
expectedDevices: &empty,
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image",
env: map[string]string{
envNVVisibleDevices: gpuID,
},
expectedDevices: &gpuID,
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
env: map[string]string{
envNVVisibleDevices: gpuID,
},
legacyImage: true,
expectedDevices: &gpuID,
},
{
description: "empty env returns all for legacy image",
legacyImage: true,
expectedDevices: &all,
},
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is ignored when
// not enabled
{
description: "missing NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
},
},
{
description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
envNVVisibleDevices: "",
envDockerResourceGPUs: anotherGPUID,
},
},
{
description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
env: map[string]string{
envNVVisibleDevices: "void",
envDockerResourceGPUs: anotherGPUID,
},
},
{
description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image",
env: map[string]string{
envNVVisibleDevices: "none",
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: &empty,
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image",
env: map[string]string{
envNVVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: &gpuID,
},
{
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
env: map[string]string{
envNVVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
},
legacyImage: true,
expectedDevices: &gpuID,
},
{
description: "empty env returns all for legacy image",
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
},
legacyImage: true,
expectedDevices: &all,
},
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
// enabled
{
description: "empty env returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
},
{
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: "",
},
},
{
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: "void",
},
},
{
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: "none",
},
expectedDevices: &empty,
},
{
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: gpuID,
},
expectedDevices: &gpuID,
},
{
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: gpuID,
},
legacyImage: true,
expectedDevices: &gpuID,
},
{
description: "DOCKER_RESOURCE_GPUS is selected if present",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
{
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
envSwarmGPU: &envDockerResourceGPUs,
env: map[string]string{
envNVVisibleDevices: gpuID,
envDockerResourceGPUs: anotherGPUID,
},
expectedDevices: &anotherGPUID,
},
}
for i, tc := range tests {
t.Run(tc.description, func(t *testing.T) {
envSwarmGPU = tc.envSwarmGPU
devices := getDevicesFromEnvvar(tc.env, tc.legacyImage)
if tc.expectedDevices == nil {
require.Nil(t, devices, "%d: %v", i, tc)
return
}
require.NotNil(t, devices, "%d: %v", i, tc)
require.Equal(t, *tc.expectedDevices, *devices, "%d: %v", i, tc)
})
}
}

View File

@@ -0,0 +1,104 @@
package main
import (
"log"
"os"
"path"
"reflect"
"github.com/BurntSushi/toml"
)
const (
configPath = "/etc/nvidia-container-runtime/config.toml"
driverPath = "/run/nvidia/driver"
)
var defaultPaths = [...]string{
path.Join(driverPath, configPath),
configPath,
}
// CLIConfig : options for nvidia-container-cli.
type CLIConfig struct {
Root *string `toml:"root"`
Path *string `toml:"path"`
Environment []string `toml:"environment"`
Debug *string `toml:"debug"`
Ldcache *string `toml:"ldcache"`
LoadKmods bool `toml:"load-kmods"`
NoPivot bool `toml:"no-pivot"`
NoCgroups bool `toml:"no-cgroups"`
User *string `toml:"user"`
Ldconfig *string `toml:"ldconfig"`
}
// HookConfig : options for the nvidia-container-toolkit.
type HookConfig struct {
DisableRequire bool `toml:"disable-require"`
SwarmResource *string `toml:"swarm-resource"`
AcceptEnvvarUnprivileged bool `toml:"accept-nvidia-visible-devices-envvar-when-unprivileged"`
AcceptDeviceListAsVolumeMounts bool `toml:"accept-nvidia-visible-devices-as-volume-mounts"`
NvidiaContainerCLI CLIConfig `toml:"nvidia-container-cli"`
}
func getDefaultHookConfig() (config HookConfig) {
return HookConfig{
DisableRequire: false,
SwarmResource: nil,
AcceptEnvvarUnprivileged: true,
AcceptDeviceListAsVolumeMounts: false,
NvidiaContainerCLI: CLIConfig{
Root: nil,
Path: nil,
Environment: []string{},
Debug: nil,
Ldcache: nil,
LoadKmods: true,
NoPivot: false,
NoCgroups: false,
User: nil,
Ldconfig: nil,
},
}
}
func getHookConfig() (config HookConfig) {
var err error
if len(*configflag) > 0 {
config = getDefaultHookConfig()
_, err = toml.DecodeFile(*configflag, &config)
if err != nil {
log.Panicln("couldn't open configuration file:", err)
}
} else {
for _, p := range defaultPaths {
config = getDefaultHookConfig()
_, err = toml.DecodeFile(p, &config)
if err == nil {
break
} else if !os.IsNotExist(err) {
log.Panicln("couldn't open default configuration file:", err)
}
}
}
return config
}
// getConfigOption returns the toml config option associated with the
// specified struct field.
func (c HookConfig) getConfigOption(fieldName string) string {
t := reflect.TypeOf(c)
f, ok := t.FieldByName(fieldName)
if !ok {
return fieldName
}
v, ok := f.Tag.Lookup("toml")
if !ok {
return fieldName
}
return v
}

View File

@@ -0,0 +1,134 @@
package main
import (
"encoding/json"
"testing"
"github.com/stretchr/testify/require"
)
func TestParseCudaVersionValid(t *testing.T) {
var tests = []struct {
version string
expected [3]uint32
}{
{"0", [3]uint32{0, 0, 0}},
{"8", [3]uint32{8, 0, 0}},
{"7.5", [3]uint32{7, 5, 0}},
{"9.0.116", [3]uint32{9, 0, 116}},
{"4294967295.4294967295.4294967295", [3]uint32{4294967295, 4294967295, 4294967295}},
}
for i, c := range tests {
vmaj, vmin, vpatch := parseCudaVersion(c.version)
version := [3]uint32{vmaj, vmin, vpatch}
require.Equal(t, c.expected, version, "%d: %v", i, c)
}
}
func TestParseCudaVersionInvalid(t *testing.T) {
var tests = []string{
"foo",
"foo.5.10",
"9.0.116.50",
"9.0.116foo",
"7.foo",
"9.0.bar",
"9.4294967296",
"9.0.116.",
"9..0",
"9.",
".5.10",
"-9",
"+9",
"-9.1.116",
"-9.-1.-116",
}
for _, c := range tests {
require.Panics(t, func() {
parseCudaVersion(c)
}, "parseCudaVersion(%v)", c)
}
}
func TestIsPrivileged(t *testing.T) {
var tests = []struct {
spec string
expected bool
}{
{
`
{
"ociVersion": "1.0.0",
"process": {
"capabilities": {
"bounding": [ "CAP_SYS_ADMIN" ]
}
}
}
`,
true,
},
{
`
{
"ociVersion": "1.0.0",
"process": {
"capabilities": {
"bounding": [ "CAP_SYS_OTHER" ]
}
}
}
`,
false,
},
{
`
{
"ociVersion": "1.0.0",
"process": {}
}
`,
false,
},
{
`
{
"ociVersion": "1.0.0-rc2-dev",
"process": {
"capabilities": [ "CAP_SYS_ADMIN" ]
}
}
`,
true,
},
{
`
{
"ociVersion": "1.0.0-rc2-dev",
"process": {
"capabilities": [ "CAP_SYS_OTHER" ]
}
}
`,
false,
},
{
`
{
"ociVersion": "1.0.0-rc2-dev",
"process": {}
}
`,
false,
},
}
for i, tc := range tests {
var spec Spec
_ = json.Unmarshal([]byte(tc.spec), &spec)
privileged := isPrivileged(&spec)
require.Equal(t, tc.expected, privileged, "%d: %v", i, tc)
}
}

View File

@@ -0,0 +1,188 @@
package main
import (
"flag"
"fmt"
"log"
"os"
"os/exec"
"path"
"path/filepath"
"runtime"
"runtime/debug"
"strconv"
"strings"
"syscall"
)
var (
debugflag = flag.Bool("debug", false, "enable debug output")
configflag = flag.String("config", "", "configuration file")
defaultPATH = []string{"/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"}
)
func exit() {
if err := recover(); err != nil {
if _, ok := err.(runtime.Error); ok {
log.Println(err)
}
if *debugflag {
log.Printf("%s", debug.Stack())
}
os.Exit(1)
}
os.Exit(0)
}
func getPATH(config CLIConfig) string {
dirs := filepath.SplitList(os.Getenv("PATH"))
// directories from the hook environment have higher precedence
dirs = append(dirs, defaultPATH...)
if config.Root != nil {
rootDirs := []string{}
for _, dir := range dirs {
rootDirs = append(rootDirs, path.Join(*config.Root, dir))
}
// directories with the root prefix have higher precedence
dirs = append(rootDirs, dirs...)
}
return strings.Join(dirs, ":")
}
func getCLIPath(config CLIConfig) string {
if config.Path != nil {
return *config.Path
}
if err := os.Setenv("PATH", getPATH(config)); err != nil {
log.Panicln("couldn't set PATH variable:", err)
}
path, err := exec.LookPath("nvidia-container-cli")
if err != nil {
log.Panicln("couldn't find binary nvidia-container-cli in", os.Getenv("PATH"), ":", err)
}
return path
}
// getRootfsPath returns an absolute path. We don't need to resolve symlinks for now.
func getRootfsPath(config containerConfig) string {
rootfs, err := filepath.Abs(config.Rootfs)
if err != nil {
log.Panicln(err)
}
return rootfs
}
func doPrestart() {
var err error
defer exit()
log.SetFlags(0)
hook := getHookConfig()
cli := hook.NvidiaContainerCLI
container := getContainerConfig(hook)
nvidia := container.Nvidia
if nvidia == nil {
// Not a GPU container, nothing to do.
return
}
rootfs := getRootfsPath(container)
args := []string{getCLIPath(cli)}
if cli.Root != nil {
args = append(args, fmt.Sprintf("--root=%s", *cli.Root))
}
if cli.LoadKmods {
args = append(args, "--load-kmods")
}
if cli.NoPivot {
args = append(args, "--no-pivot")
}
if *debugflag {
args = append(args, "--debug=/dev/stderr")
} else if cli.Debug != nil {
args = append(args, fmt.Sprintf("--debug=%s", *cli.Debug))
}
if cli.Ldcache != nil {
args = append(args, fmt.Sprintf("--ldcache=%s", *cli.Ldcache))
}
if cli.User != nil {
args = append(args, fmt.Sprintf("--user=%s", *cli.User))
}
args = append(args, "configure")
if cli.Ldconfig != nil {
args = append(args, fmt.Sprintf("--ldconfig=%s", *cli.Ldconfig))
}
if cli.NoCgroups {
args = append(args, "--no-cgroups")
}
if len(nvidia.Devices) > 0 {
args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices))
}
if len(nvidia.MigConfigDevices) > 0 {
args = append(args, fmt.Sprintf("--mig-config=%s", nvidia.MigConfigDevices))
}
if len(nvidia.MigMonitorDevices) > 0 {
args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices))
}
for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") {
if len(cap) == 0 {
break
}
args = append(args, capabilityToCLI(cap))
}
if !hook.DisableRequire && !nvidia.DisableRequire {
for _, req := range nvidia.Requirements {
args = append(args, fmt.Sprintf("--require=%s", req))
}
}
args = append(args, fmt.Sprintf("--pid=%s", strconv.FormatUint(uint64(container.Pid), 10)))
args = append(args, rootfs)
env := append(os.Environ(), cli.Environment...)
err = syscall.Exec(args[0], args, env)
log.Panicln("exec failed:", err)
}
func usage() {
fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
flag.PrintDefaults()
fmt.Fprintf(os.Stderr, "\nCommands:\n")
fmt.Fprintf(os.Stderr, " prestart\n run the prestart hook\n")
fmt.Fprintf(os.Stderr, " poststart\n no-op\n")
fmt.Fprintf(os.Stderr, " poststop\n no-op\n")
}
func main() {
flag.Usage = usage
flag.Parse()
args := flag.Args()
if len(args) == 0 {
flag.Usage()
os.Exit(2)
}
switch args[0] {
case "prestart":
doPrestart()
os.Exit(0)
case "poststart":
fallthrough
case "poststop":
os.Exit(0)
default:
flag.Usage()
os.Exit(2)
}
}