mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-01-22 10:35:38 +00:00
Merge branch 'internal-add-mig-config-monitor' into 'master'
Add support for mig-config and mig-monitor as privileged capabilities. See merge request dl/container-dev/nvidia-container-toolkit!3.
This commit is contained in:
commit
fcc1d116f0
@ -18,6 +18,8 @@ const (
|
||||
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
|
||||
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
|
||||
envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
|
||||
envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES"
|
||||
envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES"
|
||||
envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
|
||||
)
|
||||
|
||||
@ -26,8 +28,14 @@ const (
|
||||
defaultDriverCapabilities = "utility"
|
||||
)
|
||||
|
||||
// capSysAdmin is the capability name that isPrivileged looks for in a
// container's bounding capability set.
const capSysAdmin = "CAP_SYS_ADMIN"
|
||||
|
||||
type nvidiaConfig struct {
|
||||
Devices string
|
||||
MigConfigDevices string
|
||||
MigMonitorDevices string
|
||||
DriverCapabilities string
|
||||
Requirements []string
|
||||
DisableRequire bool
|
||||
@ -47,7 +55,17 @@ type Root struct {
|
||||
|
||||
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L57
|
||||
type Process struct {
|
||||
Env []string `json:"env,omitempty"`
|
||||
Env []string `json:"env,omitempty"`
|
||||
Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"`
|
||||
}
|
||||
|
||||
// LinuxCapabilities mirrors the capability sets of the OCI runtime-spec
// process section. Each field is a list of capability names.
//
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L61
type LinuxCapabilities struct {
	// Bounding is the only set consulted by isPrivileged.
	Bounding    []string `json:"bounding,omitempty" platform:"linux"`
	Effective   []string `json:"effective,omitempty" platform:"linux"`
	Inheritable []string `json:"inheritable,omitempty" platform:"linux"`
	Permitted   []string `json:"permitted,omitempty" platform:"linux"`
	Ambient     []string `json:"ambient,omitempty" platform:"linux"`
}
|
||||
|
||||
// We use pointers to structs, similarly to the latest version of runtime-spec:
|
||||
@ -124,6 +142,31 @@ func loadSpec(path string) (spec *Spec) {
|
||||
return
|
||||
}
|
||||
|
||||
func isPrivileged(caps *LinuxCapabilities) bool {
|
||||
if caps == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
hasCapSysAdmin := func(caps []string) bool {
|
||||
for _, c := range caps {
|
||||
if c == capSysAdmin {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// We only make sure that the bounding capabibility set has
|
||||
// CAP_SYS_ADMIN. This allows us to make sure that the container was
|
||||
// actually started as '--privileged', but also allow non-root users to
|
||||
// access the priviliged NVIDIA capabilities.
|
||||
if !hasCapSysAdmin(caps.Bounding) {
|
||||
return false
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func getDevices(env map[string]string) *string {
|
||||
gpuVars := []string{envNVVisibleDevices}
|
||||
if envSwarmGPU != nil {
|
||||
@ -139,6 +182,26 @@ func getDevices(env map[string]string) *string {
|
||||
return nil
|
||||
}
|
||||
|
||||
func getMigConfigDevices(env map[string]string) *string {
|
||||
gpuVars := []string{envNVMigConfigDevices}
|
||||
for _, gpuVar := range gpuVars {
|
||||
if devices, ok := env[gpuVar]; ok {
|
||||
return &devices
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func getMigMonitorDevices(env map[string]string) *string {
|
||||
gpuVars := []string{envNVMigMonitorDevices}
|
||||
for _, gpuVar := range gpuVars {
|
||||
if devices, ok := env[gpuVar]; ok {
|
||||
return &devices
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func getDriverCapabilities(env map[string]string) *string {
|
||||
if capabilities, ok := env[envNVDriverCapabilities]; ok {
|
||||
return &capabilities
|
||||
@ -158,7 +221,7 @@ func getRequirements(env map[string]string) []string {
|
||||
}
|
||||
|
||||
// Mimic the new CUDA images if no capabilities or devices are specified.
|
||||
func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig {
|
||||
func getNvidiaConfigLegacy(env map[string]string, privileged bool) *nvidiaConfig {
|
||||
var devices string
|
||||
if d := getDevices(env); d == nil {
|
||||
// Environment variable unset: default to "all".
|
||||
@ -174,6 +237,22 @@ func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig {
|
||||
devices = ""
|
||||
}
|
||||
|
||||
var migConfigDevices string
|
||||
if d := getMigConfigDevices(env); d != nil {
|
||||
migConfigDevices = *d
|
||||
}
|
||||
if !privileged && migConfigDevices != "" {
|
||||
log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container")
|
||||
}
|
||||
|
||||
var migMonitorDevices string
|
||||
if d := getMigMonitorDevices(env); d != nil {
|
||||
migMonitorDevices = *d
|
||||
}
|
||||
if !privileged && migMonitorDevices != "" {
|
||||
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
|
||||
}
|
||||
|
||||
var driverCapabilities string
|
||||
if c := getDriverCapabilities(env); c == nil {
|
||||
// Environment variable unset: default to "all".
|
||||
@ -200,18 +279,20 @@ func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig {
|
||||
|
||||
return &nvidiaConfig{
|
||||
Devices: devices,
|
||||
MigConfigDevices: migConfigDevices,
|
||||
MigMonitorDevices: migMonitorDevices,
|
||||
DriverCapabilities: driverCapabilities,
|
||||
Requirements: requirements,
|
||||
DisableRequire: disableRequire,
|
||||
}
|
||||
}
|
||||
|
||||
func getNvidiaConfig(env map[string]string) *nvidiaConfig {
|
||||
func getNvidiaConfig(env map[string]string, privileged bool) *nvidiaConfig {
|
||||
legacyCudaVersion := env[envCUDAVersion]
|
||||
cudaRequire := env[envNVRequireCUDA]
|
||||
if len(legacyCudaVersion) > 0 && len(cudaRequire) == 0 {
|
||||
// Legacy CUDA image detected.
|
||||
return getNvidiaConfigLegacy(env)
|
||||
return getNvidiaConfigLegacy(env, privileged)
|
||||
}
|
||||
|
||||
var devices string
|
||||
@ -226,6 +307,22 @@ func getNvidiaConfig(env map[string]string) *nvidiaConfig {
|
||||
devices = ""
|
||||
}
|
||||
|
||||
var migConfigDevices string
|
||||
if d := getMigConfigDevices(env); d != nil {
|
||||
migConfigDevices = *d
|
||||
}
|
||||
if !privileged && migConfigDevices != "" {
|
||||
log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container")
|
||||
}
|
||||
|
||||
var migMonitorDevices string
|
||||
if d := getMigMonitorDevices(env); d != nil {
|
||||
migMonitorDevices = *d
|
||||
}
|
||||
if !privileged && migMonitorDevices != "" {
|
||||
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
|
||||
}
|
||||
|
||||
var driverCapabilities string
|
||||
if c := getDriverCapabilities(env); c == nil || len(*c) == 0 {
|
||||
// Environment variable unset or set but empty: use default capability.
|
||||
@ -245,6 +342,8 @@ func getNvidiaConfig(env map[string]string) *nvidiaConfig {
|
||||
|
||||
return &nvidiaConfig{
|
||||
Devices: devices,
|
||||
MigConfigDevices: migConfigDevices,
|
||||
MigMonitorDevices: migMonitorDevices,
|
||||
DriverCapabilities: driverCapabilities,
|
||||
Requirements: requirements,
|
||||
DisableRequire: disableRequire,
|
||||
@ -266,11 +365,12 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
|
||||
s := loadSpec(path.Join(b, "config.json"))
|
||||
|
||||
env := getEnvMap(s.Process.Env, hook.NvidiaContainerCLI)
|
||||
privileged := isPrivileged(s.Process.Capabilities)
|
||||
envSwarmGPU = hook.SwarmResource
|
||||
return containerConfig{
|
||||
Pid: h.Pid,
|
||||
Rootfs: s.Root.Path,
|
||||
Env: env,
|
||||
Nvidia: getNvidiaConfig(env),
|
||||
Nvidia: getNvidiaConfig(env, privileged),
|
||||
}
|
||||
}
|
||||
|
@ -126,6 +126,12 @@ func doPrestart() {
|
||||
if len(nvidia.Devices) > 0 {
|
||||
args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices))
|
||||
}
|
||||
if len(nvidia.MigConfigDevices) > 0 {
|
||||
args = append(args, fmt.Sprintf("--mig-config=%s", nvidia.MigConfigDevices))
|
||||
}
|
||||
if len(nvidia.MigMonitorDevices) > 0 {
|
||||
args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices))
|
||||
}
|
||||
|
||||
for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") {
|
||||
if len(cap) == 0 {
|
||||
|
Loading…
Reference in New Issue
Block a user