mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-22 00:08:11 +00:00
Add GetDeviceSpecsByID() API to the nvcdi Interface
Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com>
This commit is contained in:
parent
ae1b7e126c
commit
b9ac54b922
@ -18,13 +18,9 @@ package modifier
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"strconv"
|
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
nvdevice "github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
|
||||||
"github.com/NVIDIA/go-nvlib/pkg/nvml"
|
|
||||||
"tags.cncf.io/container-device-interface/pkg/parser"
|
"tags.cncf.io/container-device-interface/pkg/parser"
|
||||||
"tags.cncf.io/container-device-interface/specs-go"
|
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||||
@ -193,56 +189,15 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
|
|||||||
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
names := []string{}
|
identifiers := []string{}
|
||||||
for _, device := range devices {
|
for _, device := range devices {
|
||||||
_, _, name := parser.ParseDevice(device)
|
_, _, id := parser.ParseDevice(device)
|
||||||
if name == "all" {
|
identifiers = append(identifiers, id)
|
||||||
return cdilib.GetSpec()
|
|
||||||
}
|
|
||||||
names = append(names, name)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Note: The below code currently only supports generating CDI spec modifications
|
deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...)
|
||||||
// for full-GPUs, specified either by index or UUID. MIG devices are not
|
if err != nil {
|
||||||
// supported.
|
return nil, fmt.Errorf("failed to get CDI device specs: %w", err)
|
||||||
nvmlLib := nvml.New()
|
|
||||||
ret := nvmlLib.Init()
|
|
||||||
if ret != nvml.SUCCESS {
|
|
||||||
return nil, fmt.Errorf("failed to initialized NVML: %w", ret)
|
|
||||||
}
|
|
||||||
nvdevice := nvdevice.New(nvdevice.WithNvml(nvmlLib))
|
|
||||||
|
|
||||||
deviceSpecs := []specs.Device{}
|
|
||||||
for _, name := range names {
|
|
||||||
logger.Debugf("Getting CDI spec edits for device %q", name)
|
|
||||||
// Get a device handle by either index or UUID
|
|
||||||
var nvmlDevice nvml.Device
|
|
||||||
if idx, err := strconv.Atoi(name); err == nil {
|
|
||||||
nvmlDevice, err = nvmlLib.DeviceGetHandleByIndex(idx)
|
|
||||||
if err != nvml.SUCCESS {
|
|
||||||
return nil, fmt.Errorf("failed to get device handle for index '%v': %w", idx, err)
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
nvmlDevice, err = nvmlLib.DeviceGetHandleByUUID(name)
|
|
||||||
if err != nvml.SUCCESS {
|
|
||||||
return nil, fmt.Errorf("failed to get device handle for UUID '%v': %w", name, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
nvlibDevice, err := nvdevice.NewDevice(nvmlDevice)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to construct device: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
gpuEdits, err := cdilib.GetGPUDeviceEdits(nvlibDevice)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to get CDI spec edits for GPU %q: %w", name, err)
|
|
||||||
}
|
|
||||||
gpuDevice := specs.Device{
|
|
||||||
Name: name,
|
|
||||||
ContainerEdits: *gpuEdits.ContainerEdits,
|
|
||||||
}
|
|
||||||
deviceSpecs = append(deviceSpecs, gpuDevice)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
commonEdits, err := cdilib.GetCommonEdits()
|
commonEdits, err := cdilib.GetCommonEdits()
|
||||||
|
@ -51,4 +51,5 @@ type Interface interface {
|
|||||||
GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error)
|
GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error)
|
||||||
GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error)
|
GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error)
|
||||||
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error)
|
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error)
|
||||||
|
GetDeviceSpecsByID(...string) ([]specs.Device, error)
|
||||||
}
|
}
|
||||||
|
@ -81,3 +81,10 @@ func (l *gdslib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
|
|||||||
func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
||||||
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
|
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetDeviceSpecsByID returns the CDI device specs for the GPU(s) represented by
|
||||||
|
// the provided identifiers, where an identifier is an index or UUID of a valid
|
||||||
|
// GPU device.
|
||||||
|
func (l *gdslib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
|
||||||
|
return nil, fmt.Errorf("GetDeviceSpecsByID is not supported")
|
||||||
|
}
|
||||||
|
@ -94,3 +94,10 @@ func (l *csvlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
|
|||||||
func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
||||||
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported for CSV files")
|
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported for CSV files")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetDeviceSpecsByID returns the CDI device specs for the GPU(s) represented by
|
||||||
|
// the provided identifiers, where an identifier is an index or UUID of a valid
|
||||||
|
// GPU device.
|
||||||
|
func (l *csvlib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
|
||||||
|
return nil, fmt.Errorf("GetDeviceSpecsByID is not supported for CSV files")
|
||||||
|
}
|
||||||
|
@ -18,6 +18,7 @@ package nvcdi
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
||||||
"github.com/NVIDIA/go-nvlib/pkg/nvml"
|
"github.com/NVIDIA/go-nvlib/pkg/nvml"
|
||||||
@ -75,6 +76,72 @@ func (l *nvmllib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
|||||||
return edits.FromDiscoverer(common)
|
return edits.FromDiscoverer(common)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetDeviceSpecsByID returns the CDI device specs for the GPU(s) represented by
|
||||||
|
// the provided identifiers, where an identifier is an index or UUID of a valid
|
||||||
|
// GPU device.
|
||||||
|
// TODO: support identifiers that correspond to MIG devices
|
||||||
|
func (l *nvmllib) GetDeviceSpecsByID(identifiers ...string) ([]specs.Device, error) {
|
||||||
|
for _, id := range identifiers {
|
||||||
|
if id == "all" {
|
||||||
|
return l.GetAllDeviceSpecs()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var deviceSpecs []specs.Device
|
||||||
|
|
||||||
|
if r := l.nvmllib.Init(); r != nvml.SUCCESS {
|
||||||
|
return nil, fmt.Errorf("failed to initialize NVML: %w", r)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if r := l.nvmllib.Shutdown(); r != nvml.SUCCESS {
|
||||||
|
l.logger.Warningf("failed to shutdown NVML: %w", r)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
nvmlDevices, err := l.getNVMLDevicesByID(identifiers...)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get NVML device handles: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
for i, nvmlDevice := range nvmlDevices {
|
||||||
|
nvlibDevice, err := l.devicelib.NewDevice(nvmlDevice)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to construct device: %w", err)
|
||||||
|
}
|
||||||
|
deviceEdits, err := l.GetGPUDeviceEdits(nvlibDevice)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to get CDI device edits for identifier %q: %w", identifiers[i], err)
|
||||||
|
}
|
||||||
|
deviceSpec := specs.Device{
|
||||||
|
Name: identifiers[i],
|
||||||
|
ContainerEdits: *deviceEdits.ContainerEdits,
|
||||||
|
}
|
||||||
|
deviceSpecs = append(deviceSpecs, deviceSpec)
|
||||||
|
}
|
||||||
|
|
||||||
|
return deviceSpecs, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: move this to go-nvlib?
|
||||||
|
func (l *nvmllib) getNVMLDevicesByID(identifiers ...string) ([]nvml.Device, error) {
|
||||||
|
devices := []nvml.Device{}
|
||||||
|
for _, id := range identifiers {
|
||||||
|
if dev, err := l.nvmllib.DeviceGetHandleByUUID(id); err == nvml.SUCCESS {
|
||||||
|
devices = append(devices, dev)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// TODO: check for a MIG device index
|
||||||
|
if idx, err := strconv.Atoi(id); err == nil {
|
||||||
|
if dev, err := l.nvmllib.DeviceGetHandleByIndex(idx); err == nvml.SUCCESS {
|
||||||
|
devices = append(devices, dev)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil, fmt.Errorf("failed to get NVML device handle for identifier %q", id)
|
||||||
|
}
|
||||||
|
return devices, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) {
|
func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) {
|
||||||
var deviceSpecs []specs.Device
|
var deviceSpecs []specs.Device
|
||||||
err := l.devicelib.VisitDevices(func(i int, d device.Device) error {
|
err := l.devicelib.VisitDevices(func(i int, d device.Device) error {
|
||||||
|
@ -81,3 +81,10 @@ func (l *wsllib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
|
|||||||
func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
||||||
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported on WSL")
|
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported on WSL")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetDeviceSpecsByID returns the CDI device specs for the GPU(s) represented by
|
||||||
|
// the provided identifiers, where an identifier is an index or UUID of a valid
|
||||||
|
// GPU device.
|
||||||
|
func (l *wsllib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
|
||||||
|
return nil, fmt.Errorf("GetDeviceSpecsByID is not supported on WSL")
|
||||||
|
}
|
||||||
|
@ -188,3 +188,10 @@ func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi
|
|||||||
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
||||||
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
|
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetDeviceSpecsByID returns the CDI device specs for the GPU(s) represented by
|
||||||
|
// the provided identifiers, where an identifier is an index or UUID of a valid
|
||||||
|
// GPU device.
|
||||||
|
func (l *managementlib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
|
||||||
|
return nil, fmt.Errorf("GetDeviceSpecsByID is not supported")
|
||||||
|
}
|
||||||
|
@ -81,3 +81,10 @@ func (l *mofedlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Cont
|
|||||||
func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
||||||
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
|
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetDeviceSpecsByID returns the CDI device specs for the GPU(s) represented by
|
||||||
|
// the provided identifiers, where an identifier is an index or UUID of a valid
|
||||||
|
// GPU device.
|
||||||
|
func (l *mofedlib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
|
||||||
|
return nil, fmt.Errorf("GetDeviceSpecsByID is not supported")
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user