mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-02-16 17:42:20 +00:00
Merge branch 'internal-add-mig-config-monitor' into 'master'
Add support for mig-config and mig-monitor as privileged capabilities. See merge request dl/container-dev/nvidia-container-toolkit!3
This commit is contained in:
commit
fcc1d116f0
@ -18,6 +18,8 @@ const (
|
|||||||
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
|
envNVRequireCUDA = envNVRequirePrefix + "CUDA"
|
||||||
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
|
envNVDisableRequire = "NVIDIA_DISABLE_REQUIRE"
|
||||||
envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
|
envNVVisibleDevices = "NVIDIA_VISIBLE_DEVICES"
|
||||||
|
envNVMigConfigDevices = "NVIDIA_MIG_CONFIG_DEVICES"
|
||||||
|
envNVMigMonitorDevices = "NVIDIA_MIG_MONITOR_DEVICES"
|
||||||
envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
|
envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES"
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -26,8 +28,14 @@ const (
|
|||||||
defaultDriverCapabilities = "utility"
|
defaultDriverCapabilities = "utility"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// capSysAdmin is the name of the Linux capability that is looked for when
// deciding whether a container is privileged.
const capSysAdmin = "CAP_SYS_ADMIN"
|
||||||
|
|
||||||
type nvidiaConfig struct {
|
type nvidiaConfig struct {
|
||||||
Devices string
|
Devices string
|
||||||
|
MigConfigDevices string
|
||||||
|
MigMonitorDevices string
|
||||||
DriverCapabilities string
|
DriverCapabilities string
|
||||||
Requirements []string
|
Requirements []string
|
||||||
DisableRequire bool
|
DisableRequire bool
|
||||||
@ -47,7 +55,17 @@ type Root struct {
|
|||||||
|
|
||||||
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L57
|
// github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L57
|
||||||
type Process struct {
|
type Process struct {
|
||||||
Env []string `json:"env,omitempty"`
|
Env []string `json:"env,omitempty"`
|
||||||
|
Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"`
|
||||||
|
}
|
||||||
|
|
||||||
|
// https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L61
|
||||||
|
type LinuxCapabilities struct {
|
||||||
|
Bounding []string `json:"bounding,omitempty" platform:"linux"`
|
||||||
|
Effective []string `json:"effective,omitempty" platform:"linux"`
|
||||||
|
Inheritable []string `json:"inheritable,omitempty" platform:"linux"`
|
||||||
|
Permitted []string `json:"permitted,omitempty" platform:"linux"`
|
||||||
|
Ambient []string `json:"ambient,omitempty" platform:"linux"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// We use pointers to structs, similarly to the latest version of runtime-spec:
|
// We use pointers to structs, similarly to the latest version of runtime-spec:
|
||||||
@ -124,6 +142,31 @@ func loadSpec(path string) (spec *Spec) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func isPrivileged(caps *LinuxCapabilities) bool {
|
||||||
|
if caps == nil {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
hasCapSysAdmin := func(caps []string) bool {
|
||||||
|
for _, c := range caps {
|
||||||
|
if c == capSysAdmin {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// We only make sure that the bounding capabibility set has
|
||||||
|
// CAP_SYS_ADMIN. This allows us to make sure that the container was
|
||||||
|
// actually started as '--privileged', but also allow non-root users to
|
||||||
|
// access the priviliged NVIDIA capabilities.
|
||||||
|
if !hasCapSysAdmin(caps.Bounding) {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
func getDevices(env map[string]string) *string {
|
func getDevices(env map[string]string) *string {
|
||||||
gpuVars := []string{envNVVisibleDevices}
|
gpuVars := []string{envNVVisibleDevices}
|
||||||
if envSwarmGPU != nil {
|
if envSwarmGPU != nil {
|
||||||
@ -139,6 +182,26 @@ func getDevices(env map[string]string) *string {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getMigConfigDevices(env map[string]string) *string {
|
||||||
|
gpuVars := []string{envNVMigConfigDevices}
|
||||||
|
for _, gpuVar := range gpuVars {
|
||||||
|
if devices, ok := env[gpuVar]; ok {
|
||||||
|
return &devices
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func getMigMonitorDevices(env map[string]string) *string {
|
||||||
|
gpuVars := []string{envNVMigMonitorDevices}
|
||||||
|
for _, gpuVar := range gpuVars {
|
||||||
|
if devices, ok := env[gpuVar]; ok {
|
||||||
|
return &devices
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
func getDriverCapabilities(env map[string]string) *string {
|
func getDriverCapabilities(env map[string]string) *string {
|
||||||
if capabilities, ok := env[envNVDriverCapabilities]; ok {
|
if capabilities, ok := env[envNVDriverCapabilities]; ok {
|
||||||
return &capabilities
|
return &capabilities
|
||||||
@ -158,7 +221,7 @@ func getRequirements(env map[string]string) []string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Mimic the new CUDA images if no capabilities or devices are specified.
|
// Mimic the new CUDA images if no capabilities or devices are specified.
|
||||||
func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig {
|
func getNvidiaConfigLegacy(env map[string]string, privileged bool) *nvidiaConfig {
|
||||||
var devices string
|
var devices string
|
||||||
if d := getDevices(env); d == nil {
|
if d := getDevices(env); d == nil {
|
||||||
// Environment variable unset: default to "all".
|
// Environment variable unset: default to "all".
|
||||||
@ -174,6 +237,22 @@ func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig {
|
|||||||
devices = ""
|
devices = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var migConfigDevices string
|
||||||
|
if d := getMigConfigDevices(env); d != nil {
|
||||||
|
migConfigDevices = *d
|
||||||
|
}
|
||||||
|
if !privileged && migConfigDevices != "" {
|
||||||
|
log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container")
|
||||||
|
}
|
||||||
|
|
||||||
|
var migMonitorDevices string
|
||||||
|
if d := getMigMonitorDevices(env); d != nil {
|
||||||
|
migMonitorDevices = *d
|
||||||
|
}
|
||||||
|
if !privileged && migMonitorDevices != "" {
|
||||||
|
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
|
||||||
|
}
|
||||||
|
|
||||||
var driverCapabilities string
|
var driverCapabilities string
|
||||||
if c := getDriverCapabilities(env); c == nil {
|
if c := getDriverCapabilities(env); c == nil {
|
||||||
// Environment variable unset: default to "all".
|
// Environment variable unset: default to "all".
|
||||||
@ -200,18 +279,20 @@ func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig {
|
|||||||
|
|
||||||
return &nvidiaConfig{
|
return &nvidiaConfig{
|
||||||
Devices: devices,
|
Devices: devices,
|
||||||
|
MigConfigDevices: migConfigDevices,
|
||||||
|
MigMonitorDevices: migMonitorDevices,
|
||||||
DriverCapabilities: driverCapabilities,
|
DriverCapabilities: driverCapabilities,
|
||||||
Requirements: requirements,
|
Requirements: requirements,
|
||||||
DisableRequire: disableRequire,
|
DisableRequire: disableRequire,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func getNvidiaConfig(env map[string]string) *nvidiaConfig {
|
func getNvidiaConfig(env map[string]string, privileged bool) *nvidiaConfig {
|
||||||
legacyCudaVersion := env[envCUDAVersion]
|
legacyCudaVersion := env[envCUDAVersion]
|
||||||
cudaRequire := env[envNVRequireCUDA]
|
cudaRequire := env[envNVRequireCUDA]
|
||||||
if len(legacyCudaVersion) > 0 && len(cudaRequire) == 0 {
|
if len(legacyCudaVersion) > 0 && len(cudaRequire) == 0 {
|
||||||
// Legacy CUDA image detected.
|
// Legacy CUDA image detected.
|
||||||
return getNvidiaConfigLegacy(env)
|
return getNvidiaConfigLegacy(env, privileged)
|
||||||
}
|
}
|
||||||
|
|
||||||
var devices string
|
var devices string
|
||||||
@ -226,6 +307,22 @@ func getNvidiaConfig(env map[string]string) *nvidiaConfig {
|
|||||||
devices = ""
|
devices = ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
var migConfigDevices string
|
||||||
|
if d := getMigConfigDevices(env); d != nil {
|
||||||
|
migConfigDevices = *d
|
||||||
|
}
|
||||||
|
if !privileged && migConfigDevices != "" {
|
||||||
|
log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container")
|
||||||
|
}
|
||||||
|
|
||||||
|
var migMonitorDevices string
|
||||||
|
if d := getMigMonitorDevices(env); d != nil {
|
||||||
|
migMonitorDevices = *d
|
||||||
|
}
|
||||||
|
if !privileged && migMonitorDevices != "" {
|
||||||
|
log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container")
|
||||||
|
}
|
||||||
|
|
||||||
var driverCapabilities string
|
var driverCapabilities string
|
||||||
if c := getDriverCapabilities(env); c == nil || len(*c) == 0 {
|
if c := getDriverCapabilities(env); c == nil || len(*c) == 0 {
|
||||||
// Environment variable unset or set but empty: use default capability.
|
// Environment variable unset or set but empty: use default capability.
|
||||||
@ -245,6 +342,8 @@ func getNvidiaConfig(env map[string]string) *nvidiaConfig {
|
|||||||
|
|
||||||
return &nvidiaConfig{
|
return &nvidiaConfig{
|
||||||
Devices: devices,
|
Devices: devices,
|
||||||
|
MigConfigDevices: migConfigDevices,
|
||||||
|
MigMonitorDevices: migMonitorDevices,
|
||||||
DriverCapabilities: driverCapabilities,
|
DriverCapabilities: driverCapabilities,
|
||||||
Requirements: requirements,
|
Requirements: requirements,
|
||||||
DisableRequire: disableRequire,
|
DisableRequire: disableRequire,
|
||||||
@ -266,11 +365,12 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
|
|||||||
s := loadSpec(path.Join(b, "config.json"))
|
s := loadSpec(path.Join(b, "config.json"))
|
||||||
|
|
||||||
env := getEnvMap(s.Process.Env, hook.NvidiaContainerCLI)
|
env := getEnvMap(s.Process.Env, hook.NvidiaContainerCLI)
|
||||||
|
privileged := isPrivileged(s.Process.Capabilities)
|
||||||
envSwarmGPU = hook.SwarmResource
|
envSwarmGPU = hook.SwarmResource
|
||||||
return containerConfig{
|
return containerConfig{
|
||||||
Pid: h.Pid,
|
Pid: h.Pid,
|
||||||
Rootfs: s.Root.Path,
|
Rootfs: s.Root.Path,
|
||||||
Env: env,
|
Env: env,
|
||||||
Nvidia: getNvidiaConfig(env),
|
Nvidia: getNvidiaConfig(env, privileged),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -126,6 +126,12 @@ func doPrestart() {
|
|||||||
if len(nvidia.Devices) > 0 {
|
if len(nvidia.Devices) > 0 {
|
||||||
args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices))
|
args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices))
|
||||||
}
|
}
|
||||||
|
if len(nvidia.MigConfigDevices) > 0 {
|
||||||
|
args = append(args, fmt.Sprintf("--mig-config=%s", nvidia.MigConfigDevices))
|
||||||
|
}
|
||||||
|
if len(nvidia.MigMonitorDevices) > 0 {
|
||||||
|
args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices))
|
||||||
|
}
|
||||||
|
|
||||||
for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") {
|
for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") {
|
||||||
if len(cap) == 0 {
|
if len(cap) == 0 {
|
||||||
|
Loading…
Reference in New Issue
Block a user