From ae1b7e126c2d0710937143302d2579c63ca87bdf Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Thu, 30 Nov 2023 14:16:10 -0800 Subject: [PATCH] Extend the 'runtime.nvidia.com/gpu' CDI device kind to support full-GPUs specified by index or UUID Signed-off-by: Christopher Desiniotis --- internal/modifier/cdi.go | 74 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index 76e8dab7..855277c7 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -18,9 +18,13 @@ package modifier import ( "fmt" + "strconv" "strings" + nvdevice "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/go-nvlib/pkg/nvml" "tags.cncf.io/container-device-interface/pkg/parser" + "tags.cncf.io/container-device-interface/specs-go" "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" @@ -152,7 +156,8 @@ func getAnnotationDevices(prefixes []string, annotations map[string]string) ([]s func filterAutomaticDevices(devices []string) []string { var automatic []string for _, device := range devices { - if device == "runtime.nvidia.com/gpu=all" { + vendor, class, _ := parser.ParseDevice(device) + if vendor == "runtime.nvidia.com" && class == "gpu" { automatic = append(automatic, device) } } @@ -176,9 +181,6 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de return cdiModifier, nil } -// TODO: use the requested devices when generating the CDI spec once we add -// automatic CDI generation for more than just the 'runtime.nvidia.com/gpu=all' -// device func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) { cdilib, err := nvcdi.New( nvcdi.WithLogger(logger), @@ -191,5 +193,67 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic return nil, fmt.Errorf("failed to construct CDI library: %w", err) } - return cdilib.GetSpec() + names := []string{} + for _, device := range devices { + _, _, name := parser.ParseDevice(device) + if name == "all" { + return cdilib.GetSpec() + } + names = append(names, name) + } + + // Note: The below code currently only supports generating CDI spec modifications + // for full-GPUs, specified either by index or UUID. MIG devices are not + // supported. + nvmlLib := nvml.New() + ret := nvmlLib.Init() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("failed to initialized NVML: %w", ret) + } + nvdevice := nvdevice.New(nvdevice.WithNvml(nvmlLib)) + + deviceSpecs := []specs.Device{} + for _, name := range names { + logger.Debugf("Getting CDI spec edits for device %q", name) + // Get a device handle by either index or UUID + var nvmlDevice nvml.Device + if idx, err := strconv.Atoi(name); err == nil { + nvmlDevice, err = nvmlLib.DeviceGetHandleByIndex(idx) + if err != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get device handle for index '%v': %w", idx, err) + } + } else { + nvmlDevice, err = nvmlLib.DeviceGetHandleByUUID(name) + if err != nvml.SUCCESS { + return nil, fmt.Errorf("failed to get device handle for UUID '%v': %w", name, err) + } + } + + nvlibDevice, err := nvdevice.NewDevice(nvmlDevice) + if err != nil { + return nil, fmt.Errorf("failed to construct device: %w", err) + } + + gpuEdits, err := cdilib.GetGPUDeviceEdits(nvlibDevice) + if err != nil { + return nil, fmt.Errorf("failed to get CDI spec edits for GPU %q: %w", name, err) + } + gpuDevice := specs.Device{ + Name: name, + ContainerEdits: *gpuEdits.ContainerEdits, + } + deviceSpecs = append(deviceSpecs, gpuDevice) + } + + commonEdits, err := cdilib.GetCommonEdits() + if err != nil { + return nil, fmt.Errorf("failed to get common CDI spec edits: %w", err) + } + + return spec.New( + spec.WithDeviceSpecs(deviceSpecs), + spec.WithEdits(*commonEdits.ContainerEdits), + spec.WithVendor("runtime.nvidia.com"), + spec.WithClass("gpu"), + ) }