mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
Move pkg to cmd/nvidia-container-toolkit
This change moves the pkg folder to `cmd/nvidia-container-toolkit` to better match go best practices. This allows, for example, for the `cmd/nvidia-container-toolkit` to be go installed. The only package included in `pkg` was `main`. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
27
cmd/nvidia-container-toolkit/capabilities.go
Normal file
27
cmd/nvidia-container-toolkit/capabilities.go
Normal file
@@ -0,0 +1,27 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
)
|
||||
|
||||
func capabilityToCLI(cap string) string {
|
||||
switch cap {
|
||||
case "compute":
|
||||
return "--compute"
|
||||
case "compat32":
|
||||
return "--compat32"
|
||||
case "graphics":
|
||||
return "--graphics"
|
||||
case "utility":
|
||||
return "--utility"
|
||||
case "video":
|
||||
return "--video"
|
||||
case "display":
|
||||
return "--display"
|
||||
case "ngx":
|
||||
return "--ngx"
|
||||
default:
|
||||
log.Panicln("unknown driver capability:", cap)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
435
cmd/nvidia-container-toolkit/container_config.go
Normal file
435
cmd/nvidia-container-toolkit/container_config.go
Normal file
@@ -0,0 +1,435 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/mod/semver"
|
||||
)
|
||||
|
||||
var envSwarmGPU *string
|
||||
|
||||
const (
|
||||
envCUDAVersion = "CUDA_VERSION"
|
||||
envNVRequirePrefix = "NVIDIA_REQUIRE_"
|
||||
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
|
||||
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
|
||||
envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
|
||||
envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES"
|
||||
envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES"
|
||||
envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
|
||||
)
|
||||
|
||||
const (
|
||||
allDriverCapabilities = "compute,compat32,graphics,utility,video,display,ngx"
|
||||
defaultDriverCapabilities = "utility,compute"
|
||||
)
|
||||
|
||||
const (
|
||||
capSysAdmin = "CAP_SYS_ADMIN"
|
||||
)
|
||||
|
||||
const (
|
||||
deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"
|
||||
)
|
||||
|
||||
type nvidiaConfig struct {
|
||||
Devices string
|
||||
MigConfigDevices string
|
||||
MigMonitorDevices string
|
||||
DriverCapabilities string
|
||||
Requirements []string
|
||||
DisableRequire bool
|
||||
}
|
||||
|
||||
type containerConfig struct {
|
||||
Pid int
|
||||
Rootfs string
|
||||
Env map[string]string
|
||||
Nvidia *nvidiaConfig
|
||||
}
|
||||
|
||||
// Root from OCI runtime spec
|
||||
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L94-L100
|
||||
type Root struct {
|
||||
Path string `json:"path"`
|
||||
}
|
||||
|
||||
// Process from OCI runtime spec
|
||||
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L57
|
||||
type Process struct {
|
||||
Env []string `json:"env,omitempty"`
|
||||
Capabilities *json.RawMessage `json:"capabilities,omitempty" platform:"linux"`
|
||||
}
|
||||
|
||||
// LinuxCapabilities from OCI runtime spec
|
||||
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L61
|
||||
type LinuxCapabilities struct {
|
||||
Bounding []string `json:"bounding,omitempty" platform:"linux"`
|
||||
Effective []string `json:"effective,omitempty" platform:"linux"`
|
||||
Inheritable []string `json:"inheritable,omitempty" platform:"linux"`
|
||||
Permitted []string `json:"permitted,omitempty" platform:"linux"`
|
||||
Ambient []string `json:"ambient,omitempty" platform:"linux"`
|
||||
}
|
||||
|
||||
// Mount from OCI runtime spec
|
||||
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L103
|
||||
type Mount struct {
|
||||
Destination string `json:"destination"`
|
||||
Type string `json:"type,omitempty" platform:"linux,solaris"`
|
||||
Source string `json:"source,omitempty"`
|
||||
Options []string `json:"options,omitempty"`
|
||||
}
|
||||
|
||||
// Spec from OCI runtime spec
|
||||
// We use pointers to structs, similarly to the latest version of runtime-spec:
|
||||
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L5-L28
|
||||
type Spec struct {
|
||||
Version *string `json:"ociVersion"`
|
||||
Process *Process `json:"process,omitempty"`
|
||||
Root *Root `json:"root,omitempty"`
|
||||
Mounts []Mount `json:"mounts,omitempty"`
|
||||
}
|
||||
|
||||
// HookState holds state information about the hook
|
||||
type HookState struct {
|
||||
Pid int `json:"pid,omitempty"`
|
||||
// After 17.06, runc is using the runtime spec:
|
||||
// github.com/docker/runc/blob/17.06/libcontainer/configs/config.go#L262-L263
|
||||
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/state.go#L3-L17
|
||||
Bundle string `json:"bundle"`
|
||||
// Before 17.06, runc used a custom struct that didn't conform to the spec:
|
||||
// github.com/docker/runc/blob/17.03.x/libcontainer/configs/config.go#L245-L252
|
||||
BundlePath string `json:"bundlePath"`
|
||||
}
|
||||
|
||||
func parseCudaVersion(cudaVersion string) (vmaj, vmin, vpatch uint32) {
|
||||
if _, err := fmt.Sscanf(cudaVersion, "%d.%d.%d\n", &vmaj, &vmin, &vpatch); err != nil {
|
||||
vpatch = 0
|
||||
if _, err := fmt.Sscanf(cudaVersion, "%d.%d\n", &vmaj, &vmin); err != nil {
|
||||
vmin = 0
|
||||
if _, err := fmt.Sscanf(cudaVersion, "%d\n", &vmaj); err != nil {
|
||||
log.Panicln("invalid CUDA version:", cudaVersion)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func getEnvMap(e []string) (m map[string]string) {
|
||||
m = make(map[string]string)
|
||||
for _, s := range e {
|
||||
p := strings.SplitN(s, "=", 2)
|
||||
if len(p) != 2 {
|
||||
log.Panicln("environment error")
|
||||
}
|
||||
m[p[0]] = p[1]
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func loadSpec(path string) (spec *Spec) {
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
log.Panicln("could not open OCI spec:", err)
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
if err = json.NewDecoder(f).Decode(&spec); err != nil {
|
||||
log.Panicln("could not decode OCI spec:", err)
|
||||
}
|
||||
if spec.Version == nil {
|
||||
log.Panicln("Version is empty in OCI spec")
|
||||
}
|
||||
if spec.Process == nil {
|
||||
log.Panicln("Process is empty in OCI spec")
|
||||
}
|
||||
if spec.Root == nil {
|
||||
log.Panicln("Root is empty in OCI spec")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
func isPrivileged(s *Spec) bool {
|
||||
if s.Process.Capabilities == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
var caps []string
|
||||
// If v1.1.0-rc1 <= OCI version < v1.0.0-rc5 parse s.Process.Capabilities as:
|
||||
// github.com/opencontainers/runtime-spec/blob/v1.0.0-rc1/specs-go/config.go#L30-L54
|
||||
rc1cmp := semver.Compare("v"+*s.Version, "v1.0.0-rc1")
|
||||
rc5cmp := semver.Compare("v"+*s.Version, "v1.0.0-rc5")
|
||||
if (rc1cmp == 1 || rc1cmp == 0) && (rc5cmp == -1) {
|
||||
err := json.Unmarshal(*s.Process.Capabilities, &caps)
|
||||
if err != nil {
|
||||
log.Panicln("could not decode Process.Capabilities in OCI spec:", err)
|
||||
}
|
||||
// Otherwise, parse s.Process.Capabilities as:
|
||||
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L54
|
||||
} else {
|
||||
var lc LinuxCapabilities
|
||||
err := json.Unmarshal(*s.Process.Capabilities, &lc)
|
||||
if err != nil {
|
||||
log.Panicln("could not decode Process.Capabilities in OCI spec:", err)
|
||||
}
|
||||
// We only make sure that the bounding capabibility set has
|
||||
// CAP_SYS_ADMIN. This allows us to make sure that the container was
|
||||
// actually started as '--privileged', but also allow non-root users to
|
||||
// access the privileged NVIDIA capabilities.
|
||||
caps = lc.Bounding
|
||||
}
|
||||
|
||||
for _, c := range caps {
|
||||
if c == capSysAdmin {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
}
|
||||
|
||||
func isLegacyCUDAImage(env map[string]string) bool {
|
||||
legacyCudaVersion := env[envCUDAVersion]
|
||||
cudaRequire := env[envNVRequireCUDA]
|
||||
return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0
|
||||
}
|
||||
|
||||
func getDevicesFromEnvvar(env map[string]string, legacyImage bool) *string {
|
||||
// Build a list of envvars to consider.
|
||||
envVars := []string{envNVVisibleDevices}
|
||||
if envSwarmGPU != nil {
|
||||
// The Swarm envvar has higher precedence.
|
||||
envVars = append([]string{*envSwarmGPU}, envVars...)
|
||||
}
|
||||
|
||||
// Grab a reference to devices from the first envvar
|
||||
// in the list that actually exists in the environment.
|
||||
var devices *string
|
||||
for _, envVar := range envVars {
|
||||
if devs, ok := env[envVar]; ok {
|
||||
devices = &devs
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
// Environment variable unset with legacy image: default to "all".
|
||||
if devices == nil && legacyImage {
|
||||
all := "all"
|
||||
return &all
|
||||
}
|
||||
|
||||
// Environment variable unset or empty or "void": return nil
|
||||
if devices == nil || len(*devices) == 0 || *devices == "void" {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Environment variable set to "none": reset to "".
|
||||
if *devices == "none" {
|
||||
empty := ""
|
||||
return &empty
|
||||
}
|
||||
|
||||
// Any other value.
|
||||
return devices
|
||||
}
|
||||
|
||||
func getDevicesFromMounts(mounts []Mount) *string {
|
||||
var devices []string
|
||||
for _, m := range mounts {
|
||||
root := filepath.Clean(deviceListAsVolumeMountsRoot)
|
||||
source := filepath.Clean(m.Source)
|
||||
destination := filepath.Clean(m.Destination)
|
||||
|
||||
// Only consider mounts who's host volume is /dev/null
|
||||
if source != "/dev/null" {
|
||||
continue
|
||||
}
|
||||
// Only consider container mount points that begin with 'root'
|
||||
if len(destination) < len(root) {
|
||||
continue
|
||||
}
|
||||
if destination[:len(root)] != root {
|
||||
continue
|
||||
}
|
||||
// Grab the full path beyond 'root' and add it to the list of devices
|
||||
device := destination[len(root):]
|
||||
if len(device) > 0 && device[0] == '/' {
|
||||
device = device[1:]
|
||||
}
|
||||
if len(device) == 0 {
|
||||
continue
|
||||
}
|
||||
devices = append(devices, device)
|
||||
}
|
||||
|
||||
if devices == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
ret := strings.Join(devices, ",")
|
||||
return &ret
|
||||
}
|
||||
|
||||
func getDevices(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool, legacyImage bool) *string {
|
||||
// If enabled, try and get the device list from volume mounts first
|
||||
if hookConfig.AcceptDeviceListAsVolumeMounts {
|
||||
devices := getDevicesFromMounts(mounts)
|
||||
if devices != nil {
|
||||
return devices
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback to reading from the environment variable if privileges are correct
|
||||
devices := getDevicesFromEnvvar(env, legacyImage)
|
||||
if devices == nil {
|
||||
return nil
|
||||
}
|
||||
if privileged || hookConfig.AcceptEnvvarUnprivileged {
|
||||
return devices
|
||||
}
|
||||
|
||||
configName := hookConfig.getConfigOption("AcceptEnvvarUnprivileged")
|
||||
log.Printf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES (privileged=%v, %v=%v) ", privileged, configName, hookConfig.AcceptEnvvarUnprivileged)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func getMigConfigDevices(env map[string]string) *string {
|
||||
if devices, ok := env[envNVMigConfigDevices]; ok {
|
||||
return &devices
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func getMigMonitorDevices(env map[string]string) *string {
|
||||
if devices, ok := env[envNVMigMonitorDevices]; ok {
|
||||
return &devices
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func getDriverCapabilities(env map[string]string, legacyImage bool) *string {
|
||||
// Grab a reference to the capabilities from the envvar
|
||||
// if it actually exists in the environment.
|
||||
var capabilities *string
|
||||
if caps, ok := env[envNVDriverCapabilities]; ok {
|
||||
capabilities = &caps
|
||||
}
|
||||
|
||||
// Environment variable unset with legacy image: set all capabilities.
|
||||
if capabilities == nil && legacyImage {
|
||||
allCaps := allDriverCapabilities
|
||||
return &allCaps
|
||||
}
|
||||
|
||||
// Environment variable unset or set but empty: set default capabilities.
|
||||
if capabilities == nil || len(*capabilities) == 0 {
|
||||
defaultCaps := defaultDriverCapabilities
|
||||
return &defaultCaps
|
||||
}
|
||||
|
||||
// Environment variable set to "all": set all capabilities.
|
||||
if *capabilities == "all" {
|
||||
allCaps := allDriverCapabilities
|
||||
return &allCaps
|
||||
}
|
||||
|
||||
// Any other value
|
||||
return capabilities
|
||||
}
|
||||
|
||||
func getRequirements(env map[string]string, legacyImage bool) []string {
|
||||
// All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli
|
||||
var requirements []string
|
||||
for name, value := range env {
|
||||
if strings.HasPrefix(name, envNVRequirePrefix) {
|
||||
requirements = append(requirements, value)
|
||||
}
|
||||
}
|
||||
if legacyImage {
|
||||
vmaj, vmin, _ := parseCudaVersion(env[envCUDAVersion])
|
||||
cudaRequire := fmt.Sprintf("cuda>=%d.%d", vmaj, vmin)
|
||||
requirements = append(requirements, cudaRequire)
|
||||
}
|
||||
return requirements
|
||||
}
|
||||
|
||||
func getNvidiaConfig(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool) *nvidiaConfig {
|
||||
legacyImage := isLegacyCUDAImage(env)
|
||||
|
||||
var devices string
|
||||
if d := getDevices(hookConfig, env, mounts, privileged, legacyImage); d != nil {
|
||||
devices = *d
|
||||
} else {
|
||||
// 'nil' devices means this is not a GPU container.
|
||||
return nil
|
||||
}
|
||||
|
||||
var migConfigDevices string
|
||||
if d := getMigConfigDevices(env); d != nil {
|
||||
migConfigDevices = *d
|
||||
}
|
||||
if !privileged && migConfigDevices != "" {
|
||||
log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container")
|
||||
}
|
||||
|
||||
var migMonitorDevices string
|
||||
if d := getMigMonitorDevices(env); d != nil {
|
||||
migMonitorDevices = *d
|
||||
}
|
||||
if !privileged && migMonitorDevices != "" {
|
||||
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
|
||||
}
|
||||
|
||||
var driverCapabilities string
|
||||
if c := getDriverCapabilities(env, legacyImage); c != nil {
|
||||
driverCapabilities = *c
|
||||
}
|
||||
|
||||
requirements := getRequirements(env, legacyImage)
|
||||
|
||||
// Don't fail on invalid values.
|
||||
disableRequire, _ := strconv.ParseBool(env[envNVDisableRequire])
|
||||
|
||||
return &nvidiaConfig{
|
||||
Devices: devices,
|
||||
MigConfigDevices: migConfigDevices,
|
||||
MigMonitorDevices: migMonitorDevices,
|
||||
DriverCapabilities: driverCapabilities,
|
||||
Requirements: requirements,
|
||||
DisableRequire: disableRequire,
|
||||
}
|
||||
}
|
||||
|
||||
func getContainerConfig(hook HookConfig) (config containerConfig) {
|
||||
var h HookState
|
||||
d := json.NewDecoder(os.Stdin)
|
||||
if err := d.Decode(&h); err != nil {
|
||||
log.Panicln("could not decode container state:", err)
|
||||
}
|
||||
|
||||
b := h.Bundle
|
||||
if len(b) == 0 {
|
||||
b = h.BundlePath
|
||||
}
|
||||
|
||||
s := loadSpec(path.Join(b, "config.json"))
|
||||
|
||||
env := getEnvMap(s.Process.Env)
|
||||
privileged := isPrivileged(s)
|
||||
envSwarmGPU = hook.SwarmResource
|
||||
return containerConfig{
|
||||
Pid: h.Pid,
|
||||
Rootfs: s.Root.Path,
|
||||
Env: env,
|
||||
Nvidia: getNvidiaConfig(&hook, env, s.Mounts, privileged),
|
||||
}
|
||||
}
|
||||
824
cmd/nvidia-container-toolkit/container_test.go
Normal file
824
cmd/nvidia-container-toolkit/container_test.go
Normal file
@@ -0,0 +1,824 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestGetNvidiaConfig(t *testing.T) {
|
||||
var tests = []struct {
|
||||
description string
|
||||
env map[string]string
|
||||
privileged bool
|
||||
expectedConfig *nvidiaConfig
|
||||
expectedPanic bool
|
||||
}{
|
||||
{
|
||||
description: "No environment, unprivileged",
|
||||
env: map[string]string{},
|
||||
privileged: false,
|
||||
expectedConfig: nil,
|
||||
},
|
||||
{
|
||||
description: "No environment, privileged",
|
||||
env: map[string]string{},
|
||||
privileged: true,
|
||||
expectedConfig: nil,
|
||||
},
|
||||
{
|
||||
description: "Legacy image, no devices, no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "all",
|
||||
DriverCapabilities: allDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Legacy image, devices 'all', no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVVisibleDevices: "all",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "all",
|
||||
DriverCapabilities: allDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Legacy image, devices 'empty', no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVVisibleDevices: "",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: nil,
|
||||
},
|
||||
{
|
||||
description: "Legacy image, devices 'void', no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVVisibleDevices: "",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: nil,
|
||||
},
|
||||
{
|
||||
description: "Legacy image, devices 'none', no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVVisibleDevices: "none",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "",
|
||||
DriverCapabilities: allDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Legacy image, devices set, no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: allDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Legacy image, devices set, capabilities 'empty', no requirements",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
envNVDriverCapabilities: "",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: defaultDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Legacy image, devices set, capabilities 'all', no requirements",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
envNVDriverCapabilities: "all",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: allDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Legacy image, devices set, capabilities set, no requirements",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
envNVDriverCapabilities: "cap0,cap1",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: "cap0,cap1",
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Legacy image, devices set, capabilities set, requirements set",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
envNVDriverCapabilities: "cap0,cap1",
|
||||
envNVRequirePrefix + "REQ0": "req0=true",
|
||||
envNVRequirePrefix + "REQ1": "req1=false",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: "cap0,cap1",
|
||||
Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Legacy image, devices set, capabilities set, requirements set, disable requirements",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
envNVDriverCapabilities: "cap0,cap1",
|
||||
envNVRequirePrefix + "REQ0": "req0=true",
|
||||
envNVRequirePrefix + "REQ1": "req1=false",
|
||||
envNVDisableRequire: "true",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: "cap0,cap1",
|
||||
Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"},
|
||||
DisableRequire: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, no devices, no capabilities, no requirements, no envCUDAVersion",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: nil,
|
||||
},
|
||||
{
|
||||
description: "Modern image, no devices, no capabilities, no requirement, envCUDAVersion set",
|
||||
env: map[string]string{
|
||||
envCUDAVersion: "9.0",
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: nil,
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices 'all', no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "all",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "all",
|
||||
DriverCapabilities: defaultDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices 'empty', no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: nil,
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices 'void', no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: nil,
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices 'none', no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "none",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "",
|
||||
DriverCapabilities: defaultDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices set, no capabilities, no requirements",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: defaultDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices set, capabilities 'empty', no requirements",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
envNVDriverCapabilities: "",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: defaultDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices set, capabilities 'all', no requirements",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
envNVDriverCapabilities: "all",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: allDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices set, capabilities set, no requirements",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
envNVDriverCapabilities: "cap0,cap1",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: "cap0,cap1",
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices set, capabilities set, requirements set",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
envNVDriverCapabilities: "cap0,cap1",
|
||||
envNVRequirePrefix + "REQ0": "req0=true",
|
||||
envNVRequirePrefix + "REQ1": "req1=false",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: "cap0,cap1",
|
||||
Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices set, capabilities set, requirements set, disable requirements",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "gpu0,gpu1",
|
||||
envNVDriverCapabilities: "cap0,cap1",
|
||||
envNVRequirePrefix + "REQ0": "req0=true",
|
||||
envNVRequirePrefix + "REQ1": "req1=false",
|
||||
envNVDisableRequire: "true",
|
||||
},
|
||||
privileged: false,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "gpu0,gpu1",
|
||||
DriverCapabilities: "cap0,cap1",
|
||||
Requirements: []string{"cuda>=9.0", "req0=true", "req1=false"},
|
||||
DisableRequire: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "No cuda envs, devices 'all'",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: "all",
|
||||
},
|
||||
privileged: false,
|
||||
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "all",
|
||||
DriverCapabilities: defaultDriverCapabilities,
|
||||
Requirements: []string{},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices 'all', migConfig set, privileged",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "all",
|
||||
envNVMigConfigDevices: "mig0,mig1",
|
||||
},
|
||||
privileged: true,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "all",
|
||||
MigConfigDevices: "mig0,mig1",
|
||||
DriverCapabilities: defaultDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices 'all', migConfig set, unprivileged",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "all",
|
||||
envNVMigConfigDevices: "mig0,mig1",
|
||||
},
|
||||
privileged: false,
|
||||
expectedPanic: true,
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices 'all', migMonitor set, privileged",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "all",
|
||||
envNVMigMonitorDevices: "mig0,mig1",
|
||||
},
|
||||
privileged: true,
|
||||
expectedConfig: &nvidiaConfig{
|
||||
Devices: "all",
|
||||
MigMonitorDevices: "mig0,mig1",
|
||||
DriverCapabilities: defaultDriverCapabilities,
|
||||
Requirements: []string{"cuda>=9.0"},
|
||||
DisableRequire: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "Modern image, devices 'all', migMonitor set, unprivileged",
|
||||
env: map[string]string{
|
||||
envNVRequireCUDA: "cuda>=9.0",
|
||||
envNVVisibleDevices: "all",
|
||||
envNVMigMonitorDevices: "mig0,mig1",
|
||||
},
|
||||
privileged: false,
|
||||
expectedPanic: true,
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
// Wrap the call to getNvidiaConfig() in a closure.
|
||||
var config *nvidiaConfig
|
||||
getConfig := func() {
|
||||
hookConfig := getDefaultHookConfig()
|
||||
config = getNvidiaConfig(&hookConfig, tc.env, nil, tc.privileged)
|
||||
}
|
||||
|
||||
// For any tests that are expected to panic, make sure they do.
|
||||
if tc.expectedPanic {
|
||||
require.Panics(t, getConfig)
|
||||
return
|
||||
}
|
||||
|
||||
// For all other tests, just grab the config
|
||||
getConfig()
|
||||
|
||||
// And start comparing the test results to the expected results.
|
||||
if tc.expectedConfig == nil {
|
||||
require.Nil(t, config, tc.description)
|
||||
return
|
||||
}
|
||||
|
||||
require.NotNil(t, config, tc.description)
|
||||
|
||||
require.Equal(t, tc.expectedConfig.Devices, config.Devices)
|
||||
require.Equal(t, tc.expectedConfig.MigConfigDevices, config.MigConfigDevices)
|
||||
require.Equal(t, tc.expectedConfig.MigMonitorDevices, config.MigMonitorDevices)
|
||||
require.Equal(t, tc.expectedConfig.DriverCapabilities, config.DriverCapabilities)
|
||||
|
||||
require.ElementsMatch(t, tc.expectedConfig.Requirements, config.Requirements)
|
||||
require.Equal(t, tc.expectedConfig.DisableRequire, config.DisableRequire)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetDevicesFromMounts(t *testing.T) {
|
||||
var tests = []struct {
|
||||
description string
|
||||
mounts []Mount
|
||||
expectedDevices *string
|
||||
}{
|
||||
{
|
||||
description: "No mounts",
|
||||
mounts: nil,
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "Host path is not /dev/null",
|
||||
mounts: []Mount{
|
||||
{
|
||||
Source: "/not/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"),
|
||||
},
|
||||
},
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "Container path is not prefixed by 'root'",
|
||||
mounts: []Mount{
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join("/other/prefix", "GPU0"),
|
||||
},
|
||||
},
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "Container path is only 'root'",
|
||||
mounts: []Mount{
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: deviceListAsVolumeMountsRoot,
|
||||
},
|
||||
},
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "Discover 2 devices",
|
||||
mounts: []Mount{
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"),
|
||||
},
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"),
|
||||
},
|
||||
},
|
||||
expectedDevices: &[]string{"GPU0,GPU1"}[0],
|
||||
},
|
||||
{
|
||||
description: "Discover 2 devices with slashes in the name",
|
||||
mounts: []Mount{
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0-MIG0/0/1"),
|
||||
},
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1-MIG0/0/1"),
|
||||
},
|
||||
},
|
||||
expectedDevices: &[]string{"GPU0-MIG0/0/1,GPU1-MIG0/0/1"}[0],
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
devices := getDevicesFromMounts(tc.mounts)
|
||||
require.Equal(t, tc.expectedDevices, devices)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestDeviceListSourcePriority(t *testing.T) {
|
||||
var tests = []struct {
|
||||
description string
|
||||
mountDevices []Mount
|
||||
envvarDevices string
|
||||
privileged bool
|
||||
acceptUnprivileged bool
|
||||
acceptMounts bool
|
||||
expectedDevices *string
|
||||
}{
|
||||
{
|
||||
description: "Mount devices, unprivileged, no accept unprivileged",
|
||||
mountDevices: []Mount{
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"),
|
||||
},
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"),
|
||||
},
|
||||
},
|
||||
envvarDevices: "GPU2,GPU3",
|
||||
privileged: false,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
expectedDevices: &[]string{"GPU0,GPU1"}[0],
|
||||
},
|
||||
{
|
||||
description: "No mount devices, unprivileged, no accept unprivileged",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "GPU0,GPU1",
|
||||
privileged: false,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
expectedDevices: nil,
|
||||
},
|
||||
{
|
||||
description: "No mount devices, privileged, no accept unprivileged",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "GPU0,GPU1",
|
||||
privileged: true,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: true,
|
||||
expectedDevices: &[]string{"GPU0,GPU1"}[0],
|
||||
},
|
||||
{
|
||||
description: "No mount devices, unprivileged, accept unprivileged",
|
||||
mountDevices: nil,
|
||||
envvarDevices: "GPU0,GPU1",
|
||||
privileged: false,
|
||||
acceptUnprivileged: true,
|
||||
acceptMounts: true,
|
||||
expectedDevices: &[]string{"GPU0,GPU1"}[0],
|
||||
},
|
||||
{
|
||||
description: "Mount devices, unprivileged, accept unprivileged, no accept mounts",
|
||||
mountDevices: []Mount{
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"),
|
||||
},
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"),
|
||||
},
|
||||
},
|
||||
envvarDevices: "GPU2,GPU3",
|
||||
privileged: false,
|
||||
acceptUnprivileged: true,
|
||||
acceptMounts: false,
|
||||
expectedDevices: &[]string{"GPU2,GPU3"}[0],
|
||||
},
|
||||
{
|
||||
description: "Mount devices, unprivileged, no accept unprivileged, no accept mounts",
|
||||
mountDevices: []Mount{
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU0"),
|
||||
},
|
||||
{
|
||||
Source: "/dev/null",
|
||||
Destination: filepath.Join(deviceListAsVolumeMountsRoot, "GPU1"),
|
||||
},
|
||||
},
|
||||
envvarDevices: "GPU2,GPU3",
|
||||
privileged: false,
|
||||
acceptUnprivileged: false,
|
||||
acceptMounts: false,
|
||||
expectedDevices: nil,
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
// Wrap the call to getDevices() in a closure.
|
||||
var devices *string
|
||||
getDevices := func() {
|
||||
env := map[string]string{
|
||||
envNVVisibleDevices: tc.envvarDevices,
|
||||
}
|
||||
hookConfig := getDefaultHookConfig()
|
||||
hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged
|
||||
hookConfig.AcceptDeviceListAsVolumeMounts = tc.acceptMounts
|
||||
devices = getDevices(&hookConfig, env, tc.mountDevices, tc.privileged, false)
|
||||
}
|
||||
|
||||
// For all other tests, just grab the devices and check the results
|
||||
getDevices()
|
||||
|
||||
require.Equal(t, tc.expectedDevices, devices)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetDevicesFromEnvvar(t *testing.T) {
|
||||
all := "all"
|
||||
empty := ""
|
||||
envDockerResourceGPUs := "DOCKER_RESOURCE_GPUS"
|
||||
gpuID := "GPU-12345"
|
||||
anotherGPUID := "GPU-67890"
|
||||
|
||||
var tests = []struct {
|
||||
description string
|
||||
envSwarmGPU *string
|
||||
env map[string]string
|
||||
legacyImage bool
|
||||
expectedDevices *string
|
||||
}{
|
||||
{
|
||||
description: "empty env returns nil for non-legacy image",
|
||||
},
|
||||
{
|
||||
description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: "",
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: "void",
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: "none",
|
||||
},
|
||||
expectedDevices: &empty,
|
||||
},
|
||||
{
|
||||
description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: gpuID,
|
||||
},
|
||||
expectedDevices: &gpuID,
|
||||
},
|
||||
{
|
||||
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: gpuID,
|
||||
},
|
||||
legacyImage: true,
|
||||
expectedDevices: &gpuID,
|
||||
},
|
||||
{
|
||||
description: "empty env returns all for legacy image",
|
||||
legacyImage: true,
|
||||
expectedDevices: &all,
|
||||
},
|
||||
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is ignored when
|
||||
// not enabled
|
||||
{
|
||||
description: "missing NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
|
||||
env: map[string]string{
|
||||
envDockerResourceGPUs: anotherGPUID,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "blank NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: "",
|
||||
envDockerResourceGPUs: anotherGPUID,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "'void' NVIDIA_VISIBLE_DEVICES returns nil for non-legacy image",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: "void",
|
||||
envDockerResourceGPUs: anotherGPUID,
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "'none' NVIDIA_VISIBLE_DEVICES returns empty for non-legacy image",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: "none",
|
||||
envDockerResourceGPUs: anotherGPUID,
|
||||
},
|
||||
expectedDevices: &empty,
|
||||
},
|
||||
{
|
||||
description: "NVIDIA_VISIBLE_DEVICES set returns value for non-legacy image",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: gpuID,
|
||||
envDockerResourceGPUs: anotherGPUID,
|
||||
},
|
||||
expectedDevices: &gpuID,
|
||||
},
|
||||
{
|
||||
description: "NVIDIA_VISIBLE_DEVICES set returns value for legacy image",
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: gpuID,
|
||||
envDockerResourceGPUs: anotherGPUID,
|
||||
},
|
||||
legacyImage: true,
|
||||
expectedDevices: &gpuID,
|
||||
},
|
||||
{
|
||||
description: "empty env returns all for legacy image",
|
||||
env: map[string]string{
|
||||
envDockerResourceGPUs: anotherGPUID,
|
||||
},
|
||||
legacyImage: true,
|
||||
expectedDevices: &all,
|
||||
},
|
||||
// Add the `DOCKER_RESOURCE_GPUS` envvar and ensure that this is selected when
|
||||
// enabled
|
||||
{
|
||||
description: "empty env returns nil for non-legacy image",
|
||||
envSwarmGPU: &envDockerResourceGPUs,
|
||||
},
|
||||
{
|
||||
description: "blank DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
|
||||
envSwarmGPU: &envDockerResourceGPUs,
|
||||
env: map[string]string{
|
||||
envDockerResourceGPUs: "",
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "'void' DOCKER_RESOURCE_GPUS returns nil for non-legacy image",
|
||||
envSwarmGPU: &envDockerResourceGPUs,
|
||||
env: map[string]string{
|
||||
envDockerResourceGPUs: "void",
|
||||
},
|
||||
},
|
||||
{
|
||||
description: "'none' DOCKER_RESOURCE_GPUS returns empty for non-legacy image",
|
||||
envSwarmGPU: &envDockerResourceGPUs,
|
||||
env: map[string]string{
|
||||
envDockerResourceGPUs: "none",
|
||||
},
|
||||
expectedDevices: &empty,
|
||||
},
|
||||
{
|
||||
description: "DOCKER_RESOURCE_GPUS set returns value for non-legacy image",
|
||||
envSwarmGPU: &envDockerResourceGPUs,
|
||||
env: map[string]string{
|
||||
envDockerResourceGPUs: gpuID,
|
||||
},
|
||||
expectedDevices: &gpuID,
|
||||
},
|
||||
{
|
||||
description: "DOCKER_RESOURCE_GPUS set returns value for legacy image",
|
||||
envSwarmGPU: &envDockerResourceGPUs,
|
||||
env: map[string]string{
|
||||
envDockerResourceGPUs: gpuID,
|
||||
},
|
||||
legacyImage: true,
|
||||
expectedDevices: &gpuID,
|
||||
},
|
||||
{
|
||||
description: "DOCKER_RESOURCE_GPUS is selected if present",
|
||||
envSwarmGPU: &envDockerResourceGPUs,
|
||||
env: map[string]string{
|
||||
envDockerResourceGPUs: anotherGPUID,
|
||||
},
|
||||
expectedDevices: &anotherGPUID,
|
||||
},
|
||||
{
|
||||
description: "DOCKER_RESOURCE_GPUS overrides NVIDIA_VISIBLE_DEVICES if present",
|
||||
envSwarmGPU: &envDockerResourceGPUs,
|
||||
env: map[string]string{
|
||||
envNVVisibleDevices: gpuID,
|
||||
envDockerResourceGPUs: anotherGPUID,
|
||||
},
|
||||
expectedDevices: &anotherGPUID,
|
||||
},
|
||||
}
|
||||
|
||||
for i, tc := range tests {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
envSwarmGPU = tc.envSwarmGPU
|
||||
devices := getDevicesFromEnvvar(tc.env, tc.legacyImage)
|
||||
if tc.expectedDevices == nil {
|
||||
require.Nil(t, devices, "%d: %v", i, tc)
|
||||
return
|
||||
}
|
||||
|
||||
require.NotNil(t, devices, "%d: %v", i, tc)
|
||||
require.Equal(t, *tc.expectedDevices, *devices, "%d: %v", i, tc)
|
||||
})
|
||||
}
|
||||
}
|
||||
104
cmd/nvidia-container-toolkit/hook_config.go
Normal file
104
cmd/nvidia-container-toolkit/hook_config.go
Normal file
@@ -0,0 +1,104 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
"path"
|
||||
"reflect"
|
||||
|
||||
"github.com/BurntSushi/toml"
|
||||
)
|
||||
|
||||
const (
|
||||
configPath = "/etc/nvidia-container-runtime/config.toml"
|
||||
driverPath = "/run/nvidia/driver"
|
||||
)
|
||||
|
||||
var defaultPaths = [...]string{
|
||||
path.Join(driverPath, configPath),
|
||||
configPath,
|
||||
}
|
||||
|
||||
// CLIConfig : options for nvidia-container-cli.
|
||||
type CLIConfig struct {
|
||||
Root *string `toml:"root"`
|
||||
Path *string `toml:"path"`
|
||||
Environment []string `toml:"environment"`
|
||||
Debug *string `toml:"debug"`
|
||||
Ldcache *string `toml:"ldcache"`
|
||||
LoadKmods bool `toml:"load-kmods"`
|
||||
NoPivot bool `toml:"no-pivot"`
|
||||
NoCgroups bool `toml:"no-cgroups"`
|
||||
User *string `toml:"user"`
|
||||
Ldconfig *string `toml:"ldconfig"`
|
||||
}
|
||||
|
||||
// HookConfig : options for the nvidia-container-toolkit.
|
||||
type HookConfig struct {
|
||||
DisableRequire bool `toml:"disable-require"`
|
||||
SwarmResource *string `toml:"swarm-resource"`
|
||||
AcceptEnvvarUnprivileged bool `toml:"accept-nvidia-visible-devices-envvar-when-unprivileged"`
|
||||
AcceptDeviceListAsVolumeMounts bool `toml:"accept-nvidia-visible-devices-as-volume-mounts"`
|
||||
|
||||
NvidiaContainerCLI CLIConfig `toml:"nvidia-container-cli"`
|
||||
}
|
||||
|
||||
func getDefaultHookConfig() (config HookConfig) {
|
||||
return HookConfig{
|
||||
DisableRequire: false,
|
||||
SwarmResource: nil,
|
||||
AcceptEnvvarUnprivileged: true,
|
||||
AcceptDeviceListAsVolumeMounts: false,
|
||||
NvidiaContainerCLI: CLIConfig{
|
||||
Root: nil,
|
||||
Path: nil,
|
||||
Environment: []string{},
|
||||
Debug: nil,
|
||||
Ldcache: nil,
|
||||
LoadKmods: true,
|
||||
NoPivot: false,
|
||||
NoCgroups: false,
|
||||
User: nil,
|
||||
Ldconfig: nil,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func getHookConfig() (config HookConfig) {
|
||||
var err error
|
||||
|
||||
if len(*configflag) > 0 {
|
||||
config = getDefaultHookConfig()
|
||||
_, err = toml.DecodeFile(*configflag, &config)
|
||||
if err != nil {
|
||||
log.Panicln("couldn't open configuration file:", err)
|
||||
}
|
||||
} else {
|
||||
for _, p := range defaultPaths {
|
||||
config = getDefaultHookConfig()
|
||||
_, err = toml.DecodeFile(p, &config)
|
||||
if err == nil {
|
||||
break
|
||||
} else if !os.IsNotExist(err) {
|
||||
log.Panicln("couldn't open default configuration file:", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return config
|
||||
}
|
||||
|
||||
// getConfigOption returns the toml config option associated with the
|
||||
// specified struct field.
|
||||
func (c HookConfig) getConfigOption(fieldName string) string {
|
||||
t := reflect.TypeOf(c)
|
||||
f, ok := t.FieldByName(fieldName)
|
||||
if !ok {
|
||||
return fieldName
|
||||
}
|
||||
v, ok := f.Tag.Lookup("toml")
|
||||
if !ok {
|
||||
return fieldName
|
||||
}
|
||||
return v
|
||||
}
|
||||
134
cmd/nvidia-container-toolkit/hook_test.go
Normal file
134
cmd/nvidia-container-toolkit/hook_test.go
Normal file
@@ -0,0 +1,134 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestParseCudaVersionValid(t *testing.T) {
|
||||
var tests = []struct {
|
||||
version string
|
||||
expected [3]uint32
|
||||
}{
|
||||
{"0", [3]uint32{0, 0, 0}},
|
||||
{"8", [3]uint32{8, 0, 0}},
|
||||
{"7.5", [3]uint32{7, 5, 0}},
|
||||
{"9.0.116", [3]uint32{9, 0, 116}},
|
||||
{"4294967295.4294967295.4294967295", [3]uint32{4294967295, 4294967295, 4294967295}},
|
||||
}
|
||||
for i, c := range tests {
|
||||
vmaj, vmin, vpatch := parseCudaVersion(c.version)
|
||||
|
||||
version := [3]uint32{vmaj, vmin, vpatch}
|
||||
|
||||
require.Equal(t, c.expected, version, "%d: %v", i, c)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseCudaVersionInvalid(t *testing.T) {
|
||||
var tests = []string{
|
||||
"foo",
|
||||
"foo.5.10",
|
||||
"9.0.116.50",
|
||||
"9.0.116foo",
|
||||
"7.foo",
|
||||
"9.0.bar",
|
||||
"9.4294967296",
|
||||
"9.0.116.",
|
||||
"9..0",
|
||||
"9.",
|
||||
".5.10",
|
||||
"-9",
|
||||
"+9",
|
||||
"-9.1.116",
|
||||
"-9.-1.-116",
|
||||
}
|
||||
for _, c := range tests {
|
||||
require.Panics(t, func() {
|
||||
parseCudaVersion(c)
|
||||
}, "parseCudaVersion(%v)", c)
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsPrivileged(t *testing.T) {
|
||||
var tests = []struct {
|
||||
spec string
|
||||
expected bool
|
||||
}{
|
||||
{
|
||||
`
|
||||
{
|
||||
"ociVersion": "1.0.0",
|
||||
"process": {
|
||||
"capabilities": {
|
||||
"bounding": [ "CAP_SYS_ADMIN" ]
|
||||
}
|
||||
}
|
||||
}
|
||||
`,
|
||||
true,
|
||||
},
|
||||
{
|
||||
`
|
||||
{
|
||||
"ociVersion": "1.0.0",
|
||||
"process": {
|
||||
"capabilities": {
|
||||
"bounding": [ "CAP_SYS_OTHER" ]
|
||||
}
|
||||
}
|
||||
}
|
||||
`,
|
||||
false,
|
||||
},
|
||||
{
|
||||
`
|
||||
{
|
||||
"ociVersion": "1.0.0",
|
||||
"process": {}
|
||||
}
|
||||
`,
|
||||
false,
|
||||
},
|
||||
{
|
||||
`
|
||||
{
|
||||
"ociVersion": "1.0.0-rc2-dev",
|
||||
"process": {
|
||||
"capabilities": [ "CAP_SYS_ADMIN" ]
|
||||
}
|
||||
}
|
||||
`,
|
||||
true,
|
||||
},
|
||||
{
|
||||
`
|
||||
{
|
||||
"ociVersion": "1.0.0-rc2-dev",
|
||||
"process": {
|
||||
"capabilities": [ "CAP_SYS_OTHER" ]
|
||||
}
|
||||
}
|
||||
`,
|
||||
false,
|
||||
},
|
||||
{
|
||||
`
|
||||
{
|
||||
"ociVersion": "1.0.0-rc2-dev",
|
||||
"process": {}
|
||||
}
|
||||
`,
|
||||
false,
|
||||
},
|
||||
}
|
||||
for i, tc := range tests {
|
||||
var spec Spec
|
||||
_ = json.Unmarshal([]byte(tc.spec), &spec)
|
||||
privileged := isPrivileged(&spec)
|
||||
|
||||
require.Equal(t, tc.expected, privileged, "%d: %v", i, tc)
|
||||
}
|
||||
}
|
||||
188
cmd/nvidia-container-toolkit/main.go
Normal file
188
cmd/nvidia-container-toolkit/main.go
Normal file
@@ -0,0 +1,188 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"runtime/debug"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
var (
|
||||
debugflag = flag.Bool("debug", false, "enable debug output")
|
||||
configflag = flag.String("config", "", "configuration file")
|
||||
|
||||
defaultPATH = []string{"/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"}
|
||||
)
|
||||
|
||||
func exit() {
|
||||
if err := recover(); err != nil {
|
||||
if _, ok := err.(runtime.Error); ok {
|
||||
log.Println(err)
|
||||
}
|
||||
if *debugflag {
|
||||
log.Printf("%s", debug.Stack())
|
||||
}
|
||||
os.Exit(1)
|
||||
}
|
||||
os.Exit(0)
|
||||
}
|
||||
|
||||
func getPATH(config CLIConfig) string {
|
||||
dirs := filepath.SplitList(os.Getenv("PATH"))
|
||||
// directories from the hook environment have higher precedence
|
||||
dirs = append(dirs, defaultPATH...)
|
||||
|
||||
if config.Root != nil {
|
||||
rootDirs := []string{}
|
||||
for _, dir := range dirs {
|
||||
rootDirs = append(rootDirs, path.Join(*config.Root, dir))
|
||||
}
|
||||
// directories with the root prefix have higher precedence
|
||||
dirs = append(rootDirs, dirs...)
|
||||
}
|
||||
return strings.Join(dirs, ":")
|
||||
}
|
||||
|
||||
func getCLIPath(config CLIConfig) string {
|
||||
if config.Path != nil {
|
||||
return *config.Path
|
||||
}
|
||||
|
||||
if err := os.Setenv("PATH", getPATH(config)); err != nil {
|
||||
log.Panicln("couldn't set PATH variable:", err)
|
||||
}
|
||||
|
||||
path, err := exec.LookPath("nvidia-container-cli")
|
||||
if err != nil {
|
||||
log.Panicln("couldn't find binary nvidia-container-cli in", os.Getenv("PATH"), ":", err)
|
||||
}
|
||||
return path
|
||||
}
|
||||
|
||||
// getRootfsPath returns an absolute path. We don't need to resolve symlinks for now.
|
||||
func getRootfsPath(config containerConfig) string {
|
||||
rootfs, err := filepath.Abs(config.Rootfs)
|
||||
if err != nil {
|
||||
log.Panicln(err)
|
||||
}
|
||||
return rootfs
|
||||
}
|
||||
|
||||
func doPrestart() {
|
||||
var err error
|
||||
|
||||
defer exit()
|
||||
log.SetFlags(0)
|
||||
|
||||
hook := getHookConfig()
|
||||
cli := hook.NvidiaContainerCLI
|
||||
|
||||
container := getContainerConfig(hook)
|
||||
nvidia := container.Nvidia
|
||||
if nvidia == nil {
|
||||
// Not a GPU container, nothing to do.
|
||||
return
|
||||
}
|
||||
|
||||
rootfs := getRootfsPath(container)
|
||||
|
||||
args := []string{getCLIPath(cli)}
|
||||
if cli.Root != nil {
|
||||
args = append(args, fmt.Sprintf("--root=%s", *cli.Root))
|
||||
}
|
||||
if cli.LoadKmods {
|
||||
args = append(args, "--load-kmods")
|
||||
}
|
||||
if cli.NoPivot {
|
||||
args = append(args, "--no-pivot")
|
||||
}
|
||||
if *debugflag {
|
||||
args = append(args, "--debug=/dev/stderr")
|
||||
} else if cli.Debug != nil {
|
||||
args = append(args, fmt.Sprintf("--debug=%s", *cli.Debug))
|
||||
}
|
||||
if cli.Ldcache != nil {
|
||||
args = append(args, fmt.Sprintf("--ldcache=%s", *cli.Ldcache))
|
||||
}
|
||||
if cli.User != nil {
|
||||
args = append(args, fmt.Sprintf("--user=%s", *cli.User))
|
||||
}
|
||||
args = append(args, "configure")
|
||||
|
||||
if cli.Ldconfig != nil {
|
||||
args = append(args, fmt.Sprintf("--ldconfig=%s", *cli.Ldconfig))
|
||||
}
|
||||
if cli.NoCgroups {
|
||||
args = append(args, "--no-cgroups")
|
||||
}
|
||||
if len(nvidia.Devices) > 0 {
|
||||
args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices))
|
||||
}
|
||||
if len(nvidia.MigConfigDevices) > 0 {
|
||||
args = append(args, fmt.Sprintf("--mig-config=%s", nvidia.MigConfigDevices))
|
||||
}
|
||||
if len(nvidia.MigMonitorDevices) > 0 {
|
||||
args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices))
|
||||
}
|
||||
|
||||
for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") {
|
||||
if len(cap) == 0 {
|
||||
break
|
||||
}
|
||||
args = append(args, capabilityToCLI(cap))
|
||||
}
|
||||
|
||||
if !hook.DisableRequire && !nvidia.DisableRequire {
|
||||
for _, req := range nvidia.Requirements {
|
||||
args = append(args, fmt.Sprintf("--require=%s", req))
|
||||
}
|
||||
}
|
||||
|
||||
args = append(args, fmt.Sprintf("--pid=%s", strconv.FormatUint(uint64(container.Pid), 10)))
|
||||
args = append(args, rootfs)
|
||||
|
||||
env := append(os.Environ(), cli.Environment...)
|
||||
err = syscall.Exec(args[0], args, env)
|
||||
log.Panicln("exec failed:", err)
|
||||
}
|
||||
|
||||
func usage() {
|
||||
fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0])
|
||||
flag.PrintDefaults()
|
||||
fmt.Fprintf(os.Stderr, "\nCommands:\n")
|
||||
fmt.Fprintf(os.Stderr, " prestart\n run the prestart hook\n")
|
||||
fmt.Fprintf(os.Stderr, " poststart\n no-op\n")
|
||||
fmt.Fprintf(os.Stderr, " poststop\n no-op\n")
|
||||
}
|
||||
|
||||
func main() {
|
||||
flag.Usage = usage
|
||||
flag.Parse()
|
||||
|
||||
args := flag.Args()
|
||||
if len(args) == 0 {
|
||||
flag.Usage()
|
||||
os.Exit(2)
|
||||
}
|
||||
|
||||
switch args[0] {
|
||||
case "prestart":
|
||||
doPrestart()
|
||||
os.Exit(0)
|
||||
case "poststart":
|
||||
fallthrough
|
||||
case "poststop":
|
||||
os.Exit(0)
|
||||
default:
|
||||
flag.Usage()
|
||||
os.Exit(2)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user