mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-01-23 02:57:01 +00:00
20604621e4
For most practical purposes, it should be fine to set NVIDIA_DRIVER_CAPABILITIES=all nowadays. Historically, these different capabilities exist because they were added incrementally, with varying degrees of stability. It's fairly common to run with GPUs in containers today, but a few years ago the driver didn't support them very well, and it was important to make sure the libraries being injected into the container actually worked in a containerized environment. When they didn't, it was common to get information leaks, crashes, or even silent failures. In the past, whenever a new set of libraries was being vetted for injection, a new capability was added to make sure that users had control to explicitly include only those libraries they were comfortable having injected into their containers. The idea being that whoever puts together a container image for use with GPUs should have the knowledge of what capabilities the software in that container image requires, and can set the NVIDIA_DRIVER_CAPABILITIES envvar in that image appropriately. After some back and forth, we've decided it doesn't quite make sense to set it to "all" just yet, but we should set it to "utility, compute" instead of just "utility", so that at least the core CUDA libraries work by default (once installed in the container). Signed-off-by: Kevin Klues <kklues@nvidia.com>
435 lines
12 KiB
Go
435 lines
12 KiB
Go
package main
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"path"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
|
|
"golang.org/x/mod/semver"
|
|
)
|
|
|
|
// envSwarmGPU, when non-nil, names an additional environment variable that
// getDevicesFromEnvvar consults ahead of NVIDIA_VISIBLE_DEVICES. It is assigned
// from the hook configuration in getContainerConfig.
var envSwarmGPU *string

// Names of the container environment variables inspected by the hook.
const (
	envCUDAVersion          = "CUDA_VERSION"
	envNVRequirePrefix      = "NVIDIA_REQUIRE_"
	envNVRequireCUDA        = envNVRequirePrefix + "CUDA"
	envNVDisableRequire     = "NVIDIA_DISABLE_REQUIRE"
	envNVVisibleDevices     = "NVIDIA_VISIBLE_DEVICES"
	envNVMigConfigDevices   = "NVIDIA_MIG_CONFIG_DEVICES"
	envNVMigMonitorDevices  = "NVIDIA_MIG_MONITOR_DEVICES"
	envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
)

// Driver capability lists used by getDriverCapabilities: the expansion of the
// shorthand "all", and the value applied when the envvar is unset or empty.
const (
	allDriverCapabilities     = "compute,compat32,graphics,utility,video,display,ngx"
	defaultDriverCapabilities = "utility,compute"
)

// capSysAdmin is the Linux capability that isPrivileged looks for.
const (
	capSysAdmin = "CAP_SYS_ADMIN"
)

// deviceListAsVolumeMountsRoot is the container directory under which
// /dev/null mounts are interpreted as device requests by getDevicesFromMounts.
const (
	deviceListAsVolumeMountsRoot = "/var/run/nvidia-container-devices"
)
|
|
|
|
// nvidiaConfig is the GPU-related configuration that getNvidiaConfig derives
// from a container's environment and mounts.
type nvidiaConfig struct {
	// Devices is the requested device list; it is empty when the request was "none".
	Devices string
	// MigConfigDevices is the value of NVIDIA_MIG_CONFIG_DEVICES, if set.
	MigConfigDevices string
	// MigMonitorDevices is the value of NVIDIA_MIG_MONITOR_DEVICES, if set.
	MigMonitorDevices string
	// DriverCapabilities is the comma-separated capability list after
	// defaults and the "all" shorthand have been resolved.
	DriverCapabilities string
	// Requirements holds the values of all NVIDIA_REQUIRE_* envvars, plus a
	// synthesized "cuda>=X.Y" entry for legacy CUDA images.
	Requirements []string
	// DisableRequire is NVIDIA_DISABLE_REQUIRE parsed as a boolean
	// (false when unset or unparsable).
	DisableRequire bool
}
|
|
|
|
type containerConfig struct {
|
|
Pid int
|
|
Rootfs string
|
|
Env map[string]string
|
|
Nvidia *nvidiaConfig
|
|
}
|
|
|
|
// Root from OCI runtime spec
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L94-L100
//
// Only the "path" member is decoded; it is what getContainerConfig reports as
// the container's rootfs.
type Root struct {
	Path string `json:"path"`
}
|
|
|
|
// Process from OCI runtime spec
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L57
//
// Capabilities is kept as raw JSON because its layout depends on the spec
// version; isPrivileged decodes it once the version is known.
type Process struct {
	Env          []string         `json:"env,omitempty"`
	Capabilities *json.RawMessage `json:"capabilities,omitempty" platform:"linux"`
}
|
|
|
|
// LinuxCapabilities from OCI runtime spec
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L61
//
// isPrivileged decodes Process.Capabilities into this type and inspects only
// the Bounding set.
type LinuxCapabilities struct {
	Bounding    []string `json:"bounding,omitempty" platform:"linux"`
	Effective   []string `json:"effective,omitempty" platform:"linux"`
	Inheritable []string `json:"inheritable,omitempty" platform:"linux"`
	Permitted   []string `json:"permitted,omitempty" platform:"linux"`
	Ambient     []string `json:"ambient,omitempty" platform:"linux"`
}
|
|
|
|
// Mount from OCI runtime spec
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L103
//
// getDevicesFromMounts reads Source and Destination to recognise device
// requests expressed as volume mounts.
type Mount struct {
	Destination string   `json:"destination"`
	Type        string   `json:"type,omitempty" platform:"linux,solaris"`
	Source      string   `json:"source,omitempty"`
	Options     []string `json:"options,omitempty"`
}
|
|
|
|
// Spec from OCI runtime spec
|
|
// We use pointers to structs, similarly to the latest version of runtime-spec:
|
|
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L5-L28
|
|
type Spec struct {
|
|
Version *string `json:"ociVersion"`
|
|
Process *Process `json:"process,omitempty"`
|
|
Root *Root `json:"root,omitempty"`
|
|
Mounts []Mount `json:"mounts,omitempty"`
|
|
}
|
|
|
|
// HookState holds state information about the hook
//
// It is decoded from the JSON document the runtime writes to the hook's
// stdin. Both bundle fields are declared so that either runc generation can be
// read; getContainerConfig prefers Bundle and falls back to BundlePath.
type HookState struct {
	Pid int `json:"pid,omitempty"`
	// After 17.06, runc is using the runtime spec:
	// github.com/docker/runc/blob/17.06/libcontainer/configs/config.go#L262-L263
	// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/state.go#L3-L17
	Bundle string `json:"bundle"`
	// Before 17.06, runc used a custom struct that didn't conform to the spec:
	// github.com/docker/runc/blob/17.03.x/libcontainer/configs/config.go#L245-L252
	BundlePath string `json:"bundlePath"`
}
|
|
|
|
// parseCudaVersion splits a CUDA version string of the form "X", "X.Y" or
// "X.Y.Z" into its numeric components. Components that are absent from the
// string are returned as 0.
//
// The three accepted layouts are tried from most to least specific. The
// trailing "\n" in each format makes Sscanf reject input with anything left
// over after the last number. If none of the layouts match, the function
// panics via log.Panicln.
func parseCudaVersion(cudaVersion string) (vmaj, vmin, vpatch uint32) {
	if _, err := fmt.Sscanf(cudaVersion, "%d.%d.%d\n", &vmaj, &vmin, &vpatch); err == nil {
		return vmaj, vmin, vpatch
	}

	// A failed scan may have stored leading components already; clear the ones
	// the next, shorter layout does not overwrite.
	vpatch = 0
	if _, err := fmt.Sscanf(cudaVersion, "%d.%d\n", &vmaj, &vmin); err == nil {
		return vmaj, vmin, vpatch
	}

	vmin = 0
	if _, err := fmt.Sscanf(cudaVersion, "%d\n", &vmaj); err != nil {
		log.Panicln("invalid CUDA version:", cudaVersion)
	}
	return vmaj, vmin, vpatch
}
|
|
|
|
// getEnvMap converts a list of "NAME=value" entries into a name -> value map.
//
// Each entry is split at its first '=', so values may themselves contain '='.
// When a name occurs more than once the last entry wins. An entry without any
// '=' is treated as a malformed environment and causes a panic via
// log.Panicln. The returned map is never nil.
func getEnvMap(e []string) (m map[string]string) {
	m = make(map[string]string, len(e))
	for _, entry := range e {
		sep := strings.IndexByte(entry, '=')
		if sep < 0 {
			log.Panicln("environment error")
		}
		m[entry[:sep]] = entry[sep+1:]
	}
	return m
}
|
|
|
|
func loadSpec(path string) (spec *Spec) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
log.Panicln("could not open OCI spec:", err)
|
|
}
|
|
defer f.Close()
|
|
|
|
if err = json.NewDecoder(f).Decode(&spec); err != nil {
|
|
log.Panicln("could not decode OCI spec:", err)
|
|
}
|
|
if spec.Version == nil {
|
|
log.Panicln("Version is empty in OCI spec")
|
|
}
|
|
if spec.Process == nil {
|
|
log.Panicln("Process is empty in OCI spec")
|
|
}
|
|
if spec.Root == nil {
|
|
log.Panicln("Root is empty in OCI spec")
|
|
}
|
|
return
|
|
}
|
|
|
|
func isPrivileged(s *Spec) bool {
|
|
if s.Process.Capabilities == nil {
|
|
return false
|
|
}
|
|
|
|
var caps []string
|
|
// If v1.1.0-rc1 <= OCI version < v1.0.0-rc5 parse s.Process.Capabilities as:
|
|
// github.com/opencontainers/runtime-spec/blob/v1.0.0-rc1/specs-go/config.go#L30-L54
|
|
rc1cmp := semver.Compare("v"+*s.Version, "v1.0.0-rc1")
|
|
rc5cmp := semver.Compare("v"+*s.Version, "v1.0.0-rc5")
|
|
if (rc1cmp == 1 || rc1cmp == 0) && (rc5cmp == -1) {
|
|
err := json.Unmarshal(*s.Process.Capabilities, &caps)
|
|
if err != nil {
|
|
log.Panicln("could not decode Process.Capabilities in OCI spec:", err)
|
|
}
|
|
// Otherwise, parse s.Process.Capabilities as:
|
|
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L54
|
|
} else {
|
|
var lc LinuxCapabilities
|
|
err := json.Unmarshal(*s.Process.Capabilities, &lc)
|
|
if err != nil {
|
|
log.Panicln("could not decode Process.Capabilities in OCI spec:", err)
|
|
}
|
|
// We only make sure that the bounding capabibility set has
|
|
// CAP_SYS_ADMIN. This allows us to make sure that the container was
|
|
// actually started as '--privileged', but also allow non-root users to
|
|
// access the privileged NVIDIA capabilities.
|
|
caps = lc.Bounding
|
|
}
|
|
|
|
for _, c := range caps {
|
|
if c == capSysAdmin {
|
|
return true
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
func isLegacyCUDAImage(env map[string]string) bool {
|
|
legacyCudaVersion := env[envCUDAVersion]
|
|
cudaRequire := env[envNVRequireCUDA]
|
|
return len(legacyCudaVersion) > 0 && len(cudaRequire) == 0
|
|
}
|
|
|
|
func getDevicesFromEnvvar(env map[string]string, legacyImage bool) *string {
|
|
// Build a list of envvars to consider.
|
|
envVars := []string{envNVVisibleDevices}
|
|
if envSwarmGPU != nil {
|
|
// The Swarm envvar has higher precedence.
|
|
envVars = append([]string{*envSwarmGPU}, envVars...)
|
|
}
|
|
|
|
// Grab a reference to devices from the first envvar
|
|
// in the list that actually exists in the environment.
|
|
var devices *string
|
|
for _, envVar := range envVars {
|
|
if devs, ok := env[envVar]; ok {
|
|
devices = &devs
|
|
}
|
|
}
|
|
|
|
// Environment variable unset with legacy image: default to "all".
|
|
if devices == nil && legacyImage {
|
|
all := "all"
|
|
return &all
|
|
}
|
|
|
|
// Environment variable unset or empty or "void": return nil
|
|
if devices == nil || len(*devices) == 0 || *devices == "void" {
|
|
return nil
|
|
}
|
|
|
|
// Environment variable set to "none": reset to "".
|
|
if *devices == "none" {
|
|
empty := ""
|
|
return &empty
|
|
}
|
|
|
|
// Any other value.
|
|
return devices
|
|
}
|
|
|
|
func getDevicesFromMounts(mounts []Mount) *string {
|
|
var devices []string
|
|
for _, m := range mounts {
|
|
root := filepath.Clean(deviceListAsVolumeMountsRoot)
|
|
source := filepath.Clean(m.Source)
|
|
destination := filepath.Clean(m.Destination)
|
|
|
|
// Only consider mounts who's host volume is /dev/null
|
|
if source != "/dev/null" {
|
|
continue
|
|
}
|
|
// Only consider container mount points that begin with 'root'
|
|
if len(destination) < len(root) {
|
|
continue
|
|
}
|
|
if destination[:len(root)] != root {
|
|
continue
|
|
}
|
|
// Grab the full path beyond 'root' and add it to the list of devices
|
|
device := destination[len(root):]
|
|
if len(device) > 0 && device[0] == '/' {
|
|
device = device[1:]
|
|
}
|
|
if len(device) == 0 {
|
|
continue
|
|
}
|
|
devices = append(devices, device)
|
|
}
|
|
|
|
if devices == nil {
|
|
return nil
|
|
}
|
|
|
|
ret := strings.Join(devices, ",")
|
|
return &ret
|
|
}
|
|
|
|
func getDevices(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool, legacyImage bool) *string {
|
|
// If enabled, try and get the device list from volume mounts first
|
|
if hookConfig.AcceptDeviceListAsVolumeMounts {
|
|
devices := getDevicesFromMounts(mounts)
|
|
if devices != nil {
|
|
return devices
|
|
}
|
|
}
|
|
|
|
// Fallback to reading from the environment variable if privileges are correct
|
|
devices := getDevicesFromEnvvar(env, legacyImage)
|
|
if devices == nil {
|
|
return nil
|
|
}
|
|
if privileged || hookConfig.AcceptEnvvarUnprivileged {
|
|
return devices
|
|
}
|
|
|
|
// Error out otherwise
|
|
log.Panicln("insufficient privileges to read device list from NVIDIA_VISIBLE_DEVICES envvar")
|
|
|
|
return nil
|
|
}
|
|
|
|
func getMigConfigDevices(env map[string]string) *string {
|
|
if devices, ok := env[envNVMigConfigDevices]; ok {
|
|
return &devices
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func getMigMonitorDevices(env map[string]string) *string {
|
|
if devices, ok := env[envNVMigMonitorDevices]; ok {
|
|
return &devices
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func getDriverCapabilities(env map[string]string, legacyImage bool) *string {
|
|
// Grab a reference to the capabilities from the envvar
|
|
// if it actually exists in the environment.
|
|
var capabilities *string
|
|
if caps, ok := env[envNVDriverCapabilities]; ok {
|
|
capabilities = &caps
|
|
}
|
|
|
|
// Environment variable unset with legacy image: set all capabilities.
|
|
if capabilities == nil && legacyImage {
|
|
allCaps := allDriverCapabilities
|
|
return &allCaps
|
|
}
|
|
|
|
// Environment variable unset or set but empty: set default capabilities.
|
|
if capabilities == nil || len(*capabilities) == 0 {
|
|
defaultCaps := defaultDriverCapabilities
|
|
return &defaultCaps
|
|
}
|
|
|
|
// Environment variable set to "all": set all capabilities.
|
|
if *capabilities == "all" {
|
|
allCaps := allDriverCapabilities
|
|
return &allCaps
|
|
}
|
|
|
|
// Any other value
|
|
return capabilities
|
|
}
|
|
|
|
func getRequirements(env map[string]string, legacyImage bool) []string {
|
|
// All variables with the "NVIDIA_REQUIRE_" prefix are passed to nvidia-container-cli
|
|
var requirements []string
|
|
for name, value := range env {
|
|
if strings.HasPrefix(name, envNVRequirePrefix) {
|
|
requirements = append(requirements, value)
|
|
}
|
|
}
|
|
if legacyImage {
|
|
vmaj, vmin, _ := parseCudaVersion(env[envCUDAVersion])
|
|
cudaRequire := fmt.Sprintf("cuda>=%d.%d", vmaj, vmin)
|
|
requirements = append(requirements, cudaRequire)
|
|
}
|
|
return requirements
|
|
}
|
|
|
|
func getNvidiaConfig(hookConfig *HookConfig, env map[string]string, mounts []Mount, privileged bool) *nvidiaConfig {
|
|
legacyImage := isLegacyCUDAImage(env)
|
|
|
|
var devices string
|
|
if d := getDevices(hookConfig, env, mounts, privileged, legacyImage); d != nil {
|
|
devices = *d
|
|
} else {
|
|
// 'nil' devices means this is not a GPU container.
|
|
return nil
|
|
}
|
|
|
|
var migConfigDevices string
|
|
if d := getMigConfigDevices(env); d != nil {
|
|
migConfigDevices = *d
|
|
}
|
|
if !privileged && migConfigDevices != "" {
|
|
log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container")
|
|
}
|
|
|
|
var migMonitorDevices string
|
|
if d := getMigMonitorDevices(env); d != nil {
|
|
migMonitorDevices = *d
|
|
}
|
|
if !privileged && migMonitorDevices != "" {
|
|
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
|
|
}
|
|
|
|
var driverCapabilities string
|
|
if c := getDriverCapabilities(env, legacyImage); c != nil {
|
|
driverCapabilities = *c
|
|
}
|
|
|
|
requirements := getRequirements(env, legacyImage)
|
|
|
|
// Don't fail on invalid values.
|
|
disableRequire, _ := strconv.ParseBool(env[envNVDisableRequire])
|
|
|
|
return &nvidiaConfig{
|
|
Devices: devices,
|
|
MigConfigDevices: migConfigDevices,
|
|
MigMonitorDevices: migMonitorDevices,
|
|
DriverCapabilities: driverCapabilities,
|
|
Requirements: requirements,
|
|
DisableRequire: disableRequire,
|
|
}
|
|
}
|
|
|
|
func getContainerConfig(hook HookConfig) (config containerConfig) {
|
|
var h HookState
|
|
d := json.NewDecoder(os.Stdin)
|
|
if err := d.Decode(&h); err != nil {
|
|
log.Panicln("could not decode container state:", err)
|
|
}
|
|
|
|
b := h.Bundle
|
|
if len(b) == 0 {
|
|
b = h.BundlePath
|
|
}
|
|
|
|
s := loadSpec(path.Join(b, "config.json"))
|
|
|
|
env := getEnvMap(s.Process.Env)
|
|
privileged := isPrivileged(s)
|
|
envSwarmGPU = hook.SwarmResource
|
|
return containerConfig{
|
|
Pid: h.Pid,
|
|
Rootfs: s.Root.Path,
|
|
Env: env,
|
|
Nvidia: getNvidiaConfig(&hook, env, s.Mounts, privileged),
|
|
}
|
|
}
|