Add GetDeviceSpecsByID() API to the nvcdi Interface

Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
This commit is contained in:
Christopher Desiniotis
2023-12-04 12:57:12 -08:00
parent ae1b7e126c
commit b9ac54b922
8 changed files with 109 additions and 51 deletions

View File

@@ -51,4 +51,5 @@ type Interface interface {
GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error)
GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error)
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error)
GetDeviceSpecsByID(...string) ([]specs.Device, error)
}

View File

@@ -81,3 +81,10 @@ func (l *gdslib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
// GetMIGDeviceSpecs is unsupported by the gdslib implementation. It ignores
// its arguments and always returns a nil spec together with an error.
func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
	err := fmt.Errorf("GetMIGDeviceSpecs is not supported")
	return nil, err
}
// GetDeviceSpecsByID is unsupported by the gdslib implementation. Whatever
// identifiers are supplied, it returns nil device specs and an error.
func (l *gdslib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
	err := fmt.Errorf("GetDeviceSpecsByID is not supported")
	return nil, err
}

View File

@@ -94,3 +94,10 @@ func (l *csvlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
// GetMIGDeviceSpecs is unsupported by the csvlib implementation. It ignores
// its arguments and always returns a nil spec together with an error.
func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
	var unsupported *specs.Device
	return unsupported, fmt.Errorf("GetMIGDeviceSpecs is not supported for CSV files")
}
// GetDeviceSpecsByID is unsupported by the csvlib implementation. Whatever
// identifiers are supplied, it returns nil device specs and an error.
func (l *csvlib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
	var unsupported []specs.Device
	return unsupported, fmt.Errorf("GetDeviceSpecsByID is not supported for CSV files")
}

View File

@@ -18,6 +18,7 @@ package nvcdi
import (
"fmt"
"strconv"
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
"github.com/NVIDIA/go-nvlib/pkg/nvml"
@@ -75,6 +76,72 @@ func (l *nvmllib) GetCommonEdits() (*cdi.ContainerEdits, error) {
return edits.FromDiscoverer(common)
}
// GetDeviceSpecsByID returns the CDI device specs for the GPU(s) represented by
// the provided identifiers, where an identifier is an index or UUID of a valid
// GPU device.
//
// If any identifier is the literal "all", the remaining identifiers are
// ignored and the result of GetAllDeviceSpecs is returned. Otherwise NVML is
// initialized for the duration of the call, each identifier is resolved to a
// device handle, and one spec is produced per identifier in input order, using
// the identifier itself as the CDI device name.
//
// An error is returned if NVML cannot be initialized, if any identifier cannot
// be resolved to a device, or if the edits for a device cannot be generated.
// TODO: support identifiers that correspond to MIG devices
func (l *nvmllib) GetDeviceSpecsByID(identifiers ...string) ([]specs.Device, error) {
	for _, id := range identifiers {
		if id == "all" {
			return l.GetAllDeviceSpecs()
		}
	}

	if r := l.nvmllib.Init(); r != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to initialize NVML: %w", r)
	}
	defer func() {
		if r := l.nvmllib.Shutdown(); r != nvml.SUCCESS {
			// %w is only meaningful to fmt.Errorf; a logging call must use
			// %v, otherwise the output contains a "%!w(...)" artifact.
			l.logger.Warningf("failed to shutdown NVML: %v", r)
		}
	}()

	nvmlDevices, err := l.getNVMLDevicesByID(identifiers...)
	if err != nil {
		return nil, fmt.Errorf("failed to get NVML device handles: %w", err)
	}

	var deviceSpecs []specs.Device
	// getNVMLDevicesByID yields exactly one handle per identifier (or fails),
	// so index i addresses the identifier that produced nvmlDevices[i].
	for i, nvmlDevice := range nvmlDevices {
		nvlibDevice, err := l.devicelib.NewDevice(nvmlDevice)
		if err != nil {
			return nil, fmt.Errorf("failed to construct device: %w", err)
		}
		deviceEdits, err := l.GetGPUDeviceEdits(nvlibDevice)
		if err != nil {
			return nil, fmt.Errorf("failed to get CDI device edits for identifier %q: %w", identifiers[i], err)
		}
		deviceSpec := specs.Device{
			Name:           identifiers[i],
			ContainerEdits: *deviceEdits.ContainerEdits,
		}
		deviceSpecs = append(deviceSpecs, deviceSpec)
	}
	return deviceSpecs, nil
}
// getNVMLDevicesByID resolves every identifier to an NVML device handle and
// returns the handles in the same order as the identifiers. Each identifier is
// first looked up as a UUID; if that lookup does not succeed and the
// identifier parses as an integer, it is looked up as a device index. The
// first identifier that cannot be resolved either way aborts the call with an
// error and a nil slice.
// TODO: move this to go-nvlib?
func (l *nvmllib) getNVMLDevicesByID(identifiers ...string) ([]nvml.Device, error) {
	handles := []nvml.Device{}
	for _, identifier := range identifiers {
		handle, ret := l.nvmllib.DeviceGetHandleByUUID(identifier)
		if ret == nvml.SUCCESS {
			handles = append(handles, handle)
			continue
		}

		// TODO: check for a MIG device index
		index, parseErr := strconv.Atoi(identifier)
		if parseErr == nil {
			handle, ret = l.nvmllib.DeviceGetHandleByIndex(index)
			if ret == nvml.SUCCESS {
				handles = append(handles, handle)
				continue
			}
		}

		return nil, fmt.Errorf("failed to get NVML device handle for identifier %q", identifier)
	}
	return handles, nil
}
func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device
err := l.devicelib.VisitDevices(func(i int, d device.Device) error {

View File

@@ -81,3 +81,10 @@ func (l *wsllib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
// GetMIGDeviceSpecs is unsupported by the wsllib implementation. It ignores
// its arguments and always returns a nil spec together with an error.
func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
	err := fmt.Errorf("GetMIGDeviceSpecs is not supported on WSL")
	return nil, err
}
// GetDeviceSpecsByID is unsupported by the wsllib implementation. Whatever
// identifiers are supplied, it returns nil device specs and an error.
func (l *wsllib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
	err := fmt.Errorf("GetDeviceSpecsByID is not supported on WSL")
	return nil, err
}

View File

@@ -188,3 +188,10 @@ func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi
// GetMIGDeviceSpecs is unsupported by the managementlib implementation. It
// ignores its arguments and always returns a nil spec together with an error.
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
	var unsupported *specs.Device
	return unsupported, fmt.Errorf("GetMIGDeviceSpecs is not supported")
}
// GetDeviceSpecsByID is unsupported by the managementlib implementation.
// Whatever identifiers are supplied, it returns nil device specs and an error.
//
// The receiver is named m to match the other managementlib methods.
func (m *managementlib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
	return nil, fmt.Errorf("GetDeviceSpecsByID is not supported")
}

View File

@@ -81,3 +81,10 @@ func (l *mofedlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Cont
// GetMIGDeviceSpecs is unsupported by the mofedlib implementation. It ignores
// its arguments and always returns a nil spec together with an error.
func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
	err := fmt.Errorf("GetMIGDeviceSpecs is not supported")
	return nil, err
}
// GetDeviceSpecsByID is unsupported by the mofedlib implementation. Whatever
// identifiers are supplied, it returns nil device specs and an error.
func (l *mofedlib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
	var unsupported []specs.Device
	return unsupported, fmt.Errorf("GetDeviceSpecsByID is not supported")
}