mirror of
				https://github.com/NVIDIA/nvidia-container-toolkit
				synced 2025-06-26 18:18:24 +00:00 
			
		
		
		
	Merge branch 'internal-add-mig-config-monitor' into 'master'
Add support for mig-config and mig-monitor as privileged capabilities. See merge request dl/container-dev/nvidia-container-toolkit!3
This commit is contained in:
		
						commit
						fcc1d116f0
					
				| @ -18,6 +18,8 @@ const ( | ||||
| 	envNVRequireCUDA        = envNVRequirePrefix + "CUDA" | ||||
| 	envNVDisableRequire     = "NVIDIA_DISABLE_REQUIRE" | ||||
| 	envNVVisibleDevices     = "NVIDIA_VISIBLE_DEVICES" | ||||
| 	envNVMigConfigDevices   = "NVIDIA_MIG_CONFIG_DEVICES" | ||||
| 	envNVMigMonitorDevices  = "NVIDIA_MIG_MONITOR_DEVICES" | ||||
| 	envNVDriverCapabilities = "NVIDIA_DRIVER_CAPABILITIES" | ||||
| ) | ||||
| 
 | ||||
| @ -26,8 +28,14 @@ const ( | ||||
| 	defaultDriverCapabilities = "utility" | ||||
| ) | ||||
| 
 | ||||
| const ( | ||||
| 	capSysAdmin = "CAP_SYS_ADMIN" | ||||
| ) | ||||
| 
 | ||||
| type nvidiaConfig struct { | ||||
| 	Devices            string | ||||
| 	MigConfigDevices   string | ||||
| 	MigMonitorDevices  string | ||||
| 	DriverCapabilities string | ||||
| 	Requirements       []string | ||||
| 	DisableRequire     bool | ||||
| @ -47,7 +55,17 @@ type Root struct { | ||||
| 
 | ||||
| // github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L30-L57
 | ||||
| type Process struct { | ||||
| 	Env []string `json:"env,omitempty"` | ||||
| 	Env          []string           `json:"env,omitempty"` | ||||
| 	Capabilities *LinuxCapabilities `json:"capabilities,omitempty" platform:"linux"` | ||||
| } | ||||
| 
 | ||||
| // https://github.com/opencontainers/runtime-spec/blob/v1.0.0/specs-go/config.go#L61
 | ||||
| type LinuxCapabilities struct { | ||||
| 	Bounding    []string `json:"bounding,omitempty" platform:"linux"` | ||||
| 	Effective   []string `json:"effective,omitempty" platform:"linux"` | ||||
| 	Inheritable []string `json:"inheritable,omitempty" platform:"linux"` | ||||
| 	Permitted   []string `json:"permitted,omitempty" platform:"linux"` | ||||
| 	Ambient     []string `json:"ambient,omitempty" platform:"linux"` | ||||
| } | ||||
| 
 | ||||
| // We use pointers to structs, similarly to the latest version of runtime-spec:
 | ||||
| @ -124,6 +142,31 @@ func loadSpec(path string) (spec *Spec) { | ||||
| 	return | ||||
| } | ||||
| 
 | ||||
| func isPrivileged(caps *LinuxCapabilities) bool { | ||||
| 	if caps == nil { | ||||
| 		return false | ||||
| 	} | ||||
| 
 | ||||
| 	hasCapSysAdmin := func(caps []string) bool { | ||||
| 		for _, c := range caps { | ||||
| 			if c == capSysAdmin { | ||||
| 				return true | ||||
| 			} | ||||
| 		} | ||||
| 		return false | ||||
| 	} | ||||
| 
 | ||||
| 	// We only make sure that the bounding capability set has
 | ||||
| 	// CAP_SYS_ADMIN. This allows us to make sure that the container was
 | ||||
| 	// actually started as '--privileged', but also allow non-root users to
 | ||||
| 	// access the privileged NVIDIA capabilities.
 | ||||
| 	if !hasCapSysAdmin(caps.Bounding) { | ||||
| 		return false | ||||
| 	} | ||||
| 
 | ||||
| 	return true | ||||
| } | ||||
| 
 | ||||
| func getDevices(env map[string]string) *string { | ||||
| 	gpuVars := []string{envNVVisibleDevices} | ||||
| 	if envSwarmGPU != nil { | ||||
| @ -139,6 +182,26 @@ func getDevices(env map[string]string) *string { | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func getMigConfigDevices(env map[string]string) *string { | ||||
| 	gpuVars := []string{envNVMigConfigDevices} | ||||
| 	for _, gpuVar := range gpuVars { | ||||
| 		if devices, ok := env[gpuVar]; ok { | ||||
| 			return &devices | ||||
| 		} | ||||
| 	} | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func getMigMonitorDevices(env map[string]string) *string { | ||||
| 	gpuVars := []string{envNVMigMonitorDevices} | ||||
| 	for _, gpuVar := range gpuVars { | ||||
| 		if devices, ok := env[gpuVar]; ok { | ||||
| 			return &devices | ||||
| 		} | ||||
| 	} | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func getDriverCapabilities(env map[string]string) *string { | ||||
| 	if capabilities, ok := env[envNVDriverCapabilities]; ok { | ||||
| 		return &capabilities | ||||
| @ -158,7 +221,7 @@ func getRequirements(env map[string]string) []string { | ||||
| } | ||||
| 
 | ||||
| // Mimic the new CUDA images if no capabilities or devices are specified.
 | ||||
| func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig { | ||||
| func getNvidiaConfigLegacy(env map[string]string, privileged bool) *nvidiaConfig { | ||||
| 	var devices string | ||||
| 	if d := getDevices(env); d == nil { | ||||
| 		// Environment variable unset: default to "all".
 | ||||
| @ -174,6 +237,22 @@ func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig { | ||||
| 		devices = "" | ||||
| 	} | ||||
| 
 | ||||
| 	var migConfigDevices string | ||||
| 	if d := getMigConfigDevices(env); d != nil { | ||||
| 		migConfigDevices = *d | ||||
| 	} | ||||
| 	if !privileged && migConfigDevices != "" { | ||||
| 		log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container") | ||||
| 	} | ||||
| 
 | ||||
| 	var migMonitorDevices string | ||||
| 	if d := getMigMonitorDevices(env); d != nil { | ||||
| 		migMonitorDevices = *d | ||||
| 	} | ||||
| 	if !privileged && migMonitorDevices != "" { | ||||
| 		log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container") | ||||
| 	} | ||||
| 
 | ||||
| 	var driverCapabilities string | ||||
| 	if c := getDriverCapabilities(env); c == nil { | ||||
| 		// Environment variable unset: default to "all".
 | ||||
| @ -200,18 +279,20 @@ func getNvidiaConfigLegacy(env map[string]string) *nvidiaConfig { | ||||
| 
 | ||||
| 	return &nvidiaConfig{ | ||||
| 		Devices:            devices, | ||||
| 		MigConfigDevices:   migConfigDevices, | ||||
| 		MigMonitorDevices:  migMonitorDevices, | ||||
| 		DriverCapabilities: driverCapabilities, | ||||
| 		Requirements:       requirements, | ||||
| 		DisableRequire:     disableRequire, | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| func getNvidiaConfig(env map[string]string) *nvidiaConfig { | ||||
| func getNvidiaConfig(env map[string]string, privileged bool) *nvidiaConfig { | ||||
| 	legacyCudaVersion := env[envCUDAVersion] | ||||
| 	cudaRequire := env[envNVRequireCUDA] | ||||
| 	if len(legacyCudaVersion) > 0 && len(cudaRequire) == 0 { | ||||
| 		// Legacy CUDA image detected.
 | ||||
| 		return getNvidiaConfigLegacy(env) | ||||
| 		return getNvidiaConfigLegacy(env, privileged) | ||||
| 	} | ||||
| 
 | ||||
| 	var devices string | ||||
| @ -226,6 +307,22 @@ func getNvidiaConfig(env map[string]string) *nvidiaConfig { | ||||
| 		devices = "" | ||||
| 	} | ||||
| 
 | ||||
| 	var migConfigDevices string | ||||
| 	if d := getMigConfigDevices(env); d != nil { | ||||
| 		migConfigDevices = *d | ||||
| 	} | ||||
| 	if !privileged && migConfigDevices != "" { | ||||
| 		log.Panicln("cannot set MIG_CONFIG_DEVICES in non privileged container") | ||||
| 	} | ||||
| 
 | ||||
| 	var migMonitorDevices string | ||||
| 	if d := getMigMonitorDevices(env); d != nil { | ||||
| 		migMonitorDevices = *d | ||||
| 	} | ||||
| 	if !privileged && migMonitorDevices != "" { | ||||
| 		log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container") | ||||
| 	} | ||||
| 
 | ||||
| 	var driverCapabilities string | ||||
| 	if c := getDriverCapabilities(env); c == nil || len(*c) == 0 { | ||||
| 		// Environment variable unset or set but empty: use default capability.
 | ||||
| @ -245,6 +342,8 @@ func getNvidiaConfig(env map[string]string) *nvidiaConfig { | ||||
| 
 | ||||
| 	return &nvidiaConfig{ | ||||
| 		Devices:            devices, | ||||
| 		MigConfigDevices:   migConfigDevices, | ||||
| 		MigMonitorDevices:  migMonitorDevices, | ||||
| 		DriverCapabilities: driverCapabilities, | ||||
| 		Requirements:       requirements, | ||||
| 		DisableRequire:     disableRequire, | ||||
| @ -266,11 +365,12 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { | ||||
| 	s := loadSpec(path.Join(b, "config.json")) | ||||
| 
 | ||||
| 	env := getEnvMap(s.Process.Env, hook.NvidiaContainerCLI) | ||||
| 	privileged := isPrivileged(s.Process.Capabilities) | ||||
| 	envSwarmGPU = hook.SwarmResource | ||||
| 	return containerConfig{ | ||||
| 		Pid:    h.Pid, | ||||
| 		Rootfs: s.Root.Path, | ||||
| 		Env:    env, | ||||
| 		Nvidia: getNvidiaConfig(env), | ||||
| 		Nvidia: getNvidiaConfig(env, privileged), | ||||
| 	} | ||||
| } | ||||
|  | ||||
| @ -126,6 +126,12 @@ func doPrestart() { | ||||
| 	if len(nvidia.Devices) > 0 { | ||||
| 		args = append(args, fmt.Sprintf("--device=%s", nvidia.Devices)) | ||||
| 	} | ||||
| 	if len(nvidia.MigConfigDevices) > 0 { | ||||
| 		args = append(args, fmt.Sprintf("--mig-config=%s", nvidia.MigConfigDevices)) | ||||
| 	} | ||||
| 	if len(nvidia.MigMonitorDevices) > 0 { | ||||
| 		args = append(args, fmt.Sprintf("--mig-monitor=%s", nvidia.MigMonitorDevices)) | ||||
| 	} | ||||
| 
 | ||||
| 	for _, cap := range strings.Split(nvidia.DriverCapabilities, ",") { | ||||
| 		if len(cap) == 0 { | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user