mirror of
				https://github.com/NVIDIA/nvidia-container-toolkit
				synced 2025-06-26 18:18:24 +00:00 
			
		
		
		
	Extend the 'runtime.nvidia.com/gpu' CDI device kind to support full-GPUs specified by index or UUID
Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
This commit is contained in:
		
							parent
							
								
									08ef3e7969
								
							
						
					
					
						commit
						ae1b7e126c
					
				| @ -18,9 +18,13 @@ package modifier | ||||
| 
 | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"strconv" | ||||
| 	"strings" | ||||
| 
 | ||||
| 	nvdevice "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" | ||||
| 	"github.com/NVIDIA/go-nvlib/pkg/nvml" | ||||
| 	"tags.cncf.io/container-device-interface/pkg/parser" | ||||
| 	"tags.cncf.io/container-device-interface/specs-go" | ||||
| 
 | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config" | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" | ||||
| @ -152,7 +156,8 @@ func getAnnotationDevices(prefixes []string, annotations map[string]string) ([]s | ||||
| func filterAutomaticDevices(devices []string) []string { | ||||
| 	var automatic []string | ||||
| 	for _, device := range devices { | ||||
| 		if device == "runtime.nvidia.com/gpu=all" { | ||||
| 		vendor, class, _ := parser.ParseDevice(device) | ||||
| 		if vendor == "runtime.nvidia.com" && class == "gpu" { | ||||
| 			automatic = append(automatic, device) | ||||
| 		} | ||||
| 	} | ||||
| @ -176,9 +181,6 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de | ||||
| 	return cdiModifier, nil | ||||
| } | ||||
| 
 | ||||
| // TODO: use the requested devices when generating the CDI spec once we add
 | ||||
| // automatic CDI generation for more than just the 'runtime.nvidia.com/gpu=all'
 | ||||
| // device
 | ||||
| func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) { | ||||
| 	cdilib, err := nvcdi.New( | ||||
| 		nvcdi.WithLogger(logger), | ||||
| @ -191,5 +193,67 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic | ||||
| 		return nil, fmt.Errorf("failed to construct CDI library: %w", err) | ||||
| 	} | ||||
| 
 | ||||
| 	return cdilib.GetSpec() | ||||
| 	names := []string{} | ||||
| 	for _, device := range devices { | ||||
| 		_, _, name := parser.ParseDevice(device) | ||||
| 		if name == "all" { | ||||
| 			return cdilib.GetSpec() | ||||
| 		} | ||||
| 		names = append(names, name) | ||||
| 	} | ||||
| 
 | ||||
| 	// Note: The below code currently only supports generating CDI spec modifications
 | ||||
| 	// for full-GPUs, specified either by index or UUID. MIG devices are not
 | ||||
| 	// supported.
 | ||||
| 	nvmlLib := nvml.New() | ||||
| 	ret := nvmlLib.Init() | ||||
| 	if ret != nvml.SUCCESS { | ||||
| 		return nil, fmt.Errorf("failed to initialized NVML: %w", ret) | ||||
| 	} | ||||
| 	nvdevice := nvdevice.New(nvdevice.WithNvml(nvmlLib)) | ||||
| 
 | ||||
| 	deviceSpecs := []specs.Device{} | ||||
| 	for _, name := range names { | ||||
| 		logger.Debugf("Getting CDI spec edits for device %q", name) | ||||
| 		// Get a device handle by either index or UUID
 | ||||
| 		var nvmlDevice nvml.Device | ||||
| 		if idx, err := strconv.Atoi(name); err == nil { | ||||
| 			nvmlDevice, err = nvmlLib.DeviceGetHandleByIndex(idx) | ||||
| 			if err != nvml.SUCCESS { | ||||
| 				return nil, fmt.Errorf("failed to get device handle for index '%v': %w", idx, err) | ||||
| 			} | ||||
| 		} else { | ||||
| 			nvmlDevice, err = nvmlLib.DeviceGetHandleByUUID(name) | ||||
| 			if err != nvml.SUCCESS { | ||||
| 				return nil, fmt.Errorf("failed to get device handle for UUID '%v': %w", name, err) | ||||
| 			} | ||||
| 		} | ||||
| 
 | ||||
| 		nvlibDevice, err := nvdevice.NewDevice(nvmlDevice) | ||||
| 		if err != nil { | ||||
| 			return nil, fmt.Errorf("failed to construct device: %w", err) | ||||
| 		} | ||||
| 
 | ||||
| 		gpuEdits, err := cdilib.GetGPUDeviceEdits(nvlibDevice) | ||||
| 		if err != nil { | ||||
| 			return nil, fmt.Errorf("failed to get CDI spec edits for GPU %q: %w", name, err) | ||||
| 		} | ||||
| 		gpuDevice := specs.Device{ | ||||
| 			Name:           name, | ||||
| 			ContainerEdits: *gpuEdits.ContainerEdits, | ||||
| 		} | ||||
| 		deviceSpecs = append(deviceSpecs, gpuDevice) | ||||
| 	} | ||||
| 
 | ||||
| 	commonEdits, err := cdilib.GetCommonEdits() | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to get common CDI spec edits: %w", err) | ||||
| 	} | ||||
| 
 | ||||
| 	return spec.New( | ||||
| 		spec.WithDeviceSpecs(deviceSpecs), | ||||
| 		spec.WithEdits(*commonEdits.ContainerEdits), | ||||
| 		spec.WithVendor("runtime.nvidia.com"), | ||||
| 		spec.WithClass("gpu"), | ||||
| 	) | ||||
| } | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user