mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-02-16 17:42:20 +00:00
Extend the 'runtime.nvidia.com/gpu' CDI device kind to support full-GPUs specified by index or UUID
Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
This commit is contained in:
parent
08ef3e7969
commit
ae1b7e126c
@ -18,9 +18,13 @@ package modifier
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
nvdevice "github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
||||||
|
"github.com/NVIDIA/go-nvlib/pkg/nvml"
|
||||||
"tags.cncf.io/container-device-interface/pkg/parser"
|
"tags.cncf.io/container-device-interface/pkg/parser"
|
||||||
|
"tags.cncf.io/container-device-interface/specs-go"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||||
@ -152,7 +156,8 @@ func getAnnotationDevices(prefixes []string, annotations map[string]string) ([]s
|
|||||||
func filterAutomaticDevices(devices []string) []string {
|
func filterAutomaticDevices(devices []string) []string {
|
||||||
var automatic []string
|
var automatic []string
|
||||||
for _, device := range devices {
|
for _, device := range devices {
|
||||||
if device == "runtime.nvidia.com/gpu=all" {
|
vendor, class, _ := parser.ParseDevice(device)
|
||||||
|
if vendor == "runtime.nvidia.com" && class == "gpu" {
|
||||||
automatic = append(automatic, device)
|
automatic = append(automatic, device)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -176,9 +181,6 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de
|
|||||||
return cdiModifier, nil
|
return cdiModifier, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: use the requested devices when generating the CDI spec once we add
|
|
||||||
// automatic CDI generation for more than just the 'runtime.nvidia.com/gpu=all'
|
|
||||||
// device
|
|
||||||
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
|
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
|
||||||
cdilib, err := nvcdi.New(
|
cdilib, err := nvcdi.New(
|
||||||
nvcdi.WithLogger(logger),
|
nvcdi.WithLogger(logger),
|
||||||
@ -191,5 +193,67 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
|
|||||||
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return cdilib.GetSpec()
|
names := []string{}
|
||||||
|
for _, device := range devices {
|
||||||
|
_, _, name := parser.ParseDevice(device)
|
||||||
|
if name == "all" {
|
||||||
|
return cdilib.GetSpec()
|
||||||
|
}
|
||||||
|
names = append(names, name)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Note: The below code currently only supports generating CDI spec modifications
|
||||||
|
// for full-GPUs, specified either by index or UUID. MIG devices are not
|
||||||
|
// supported.
|
||||||
|
nvmlLib := nvml.New()
|
||||||
|
ret := nvmlLib.Init()
|
||||||
|
if ret != nvml.SUCCESS {
|
||||||
|
return nil, fmt.Errorf("failed to initialized NVML: %w", ret)
|
||||||
|
}
|
||||||
|
nvdevice := nvdevice.New(nvdevice.WithNvml(nvmlLib))
|
||||||
|
|
||||||
|
deviceSpecs := []specs.Device{}
|
||||||
|
for _, name := range names {
|
||||||
|
logger.Debugf("Getting CDI spec edits for device %q", name)
|
||||||
|
// Get a device handle by either index or UUID
|
||||||
|
var nvmlDevice nvml.Device
|
||||||
|
if idx, err := strconv.Atoi(name); err == nil {
|
||||||
|
nvmlDevice, err = nvmlLib.DeviceGetHandleByIndex(idx)
|
||||||
|
if err != nvml.SUCCESS {
|
||||||
|
return nil, fmt.Errorf("failed to get device handle for index '%v': %w", idx, err)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
nvmlDevice, err = nvmlLib.DeviceGetHandleByUUID(name)
|
||||||
|
if err != nvml.SUCCESS {
|
||||||
|
return nil, fmt.Errorf("failed to get device handle for UUID '%v': %w", name, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
nvlibDevice, err := nvdevice.NewDevice(nvmlDevice)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to construct device: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
gpuEdits, err := cdilib.GetGPUDeviceEdits(nvlibDevice)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get CDI spec edits for GPU %q: %w", name, err)
|
||||||
|
}
|
||||||
|
gpuDevice := specs.Device{
|
||||||
|
Name: name,
|
||||||
|
ContainerEdits: *gpuEdits.ContainerEdits,
|
||||||
|
}
|
||||||
|
deviceSpecs = append(deviceSpecs, gpuDevice)
|
||||||
|
}
|
||||||
|
|
||||||
|
commonEdits, err := cdilib.GetCommonEdits()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get common CDI spec edits: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return spec.New(
|
||||||
|
spec.WithDeviceSpecs(deviceSpecs),
|
||||||
|
spec.WithEdits(*commonEdits.ContainerEdits),
|
||||||
|
spec.WithVendor("runtime.nvidia.com"),
|
||||||
|
spec.WithClass("gpu"),
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user