Allow multiple device name strategies to be specified

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2023-03-21 15:51:36 +02:00
parent a8d48808d7
commit 52da12cf9a
14 changed files with 127 additions and 64 deletions

View File

@ -2,6 +2,8 @@
* Add a `--spec-dir` option to the `nvidia-ctk cdi generate` command. This allows specs outside of `/etc/cdi` and `/var/run/cdi` to be processed.
* Add support for extracting device major number from `/proc/devices` if `nvidia` is used as a device name over `nvidia-frontend`.
* Allow multiple device naming strategies for the `nvidia-ctk cdi generate` command. This allows a single
CDI spec to be generated that includes GPUs by both index and UUID.
## v1.15.0-rc.3
* Fix bug in `nvidia-ctk hook update-ldcache` where default `--ldconfig-path` value was not applied.

View File

@ -44,7 +44,7 @@ type command struct {
type options struct {
output string
format string
deviceNameStrategy string
deviceNameStrategies cli.StringSlice
driverRoot string
devRoot string
nvidiaCTKPath string
@ -109,11 +109,11 @@ func (m command) build() *cli.Command {
Usage: "Specify the root where `/dev` is located. If this is not specified, the driver-root is assumed.",
Destination: &opts.devRoot,
},
&cli.StringFlag{
&cli.StringSliceFlag{
Name: "device-name-strategy",
Usage: "Specify the strategy for generating device names. One of [index | uuid | type-index]",
Value: nvcdi.DeviceNameStrategyIndex,
Destination: &opts.deviceNameStrategy,
Usage: "Specify the strategy for generating device names. If this is specified multiple times, the devices will be duplicated for each strategy. One of [index | uuid | type-index]",
Value: cli.NewStringSlice(nvcdi.DeviceNameStrategyIndex),
Destination: &opts.deviceNameStrategies,
},
&cli.StringFlag{
Name: "driver-root",
@ -185,10 +185,12 @@ func (m command) validateFlags(c *cli.Context, opts *options) error {
return fmt.Errorf("invalid discovery mode: %v", opts.mode)
}
_, err := nvcdi.NewDeviceNamer(opts.deviceNameStrategy)
for _, strategy := range opts.deviceNameStrategies.Value() {
_, err := nvcdi.NewDeviceNamer(strategy)
if err != nil {
return err
}
}
opts.nvidiaCTKPath = config.ResolveNVIDIACTKPath(m.logger, opts.nvidiaCTKPath)
@ -241,10 +243,14 @@ func formatFromFilename(filename string) string {
}
func (m command) generateSpec(opts *options) (spec.Interface, error) {
deviceNamer, err := nvcdi.NewDeviceNamer(opts.deviceNameStrategy)
var deviceNamers []nvcdi.DeviceNamer
for _, strategy := range opts.deviceNameStrategies.Value() {
deviceNamer, err := nvcdi.NewDeviceNamer(strategy)
if err != nil {
return nil, fmt.Errorf("failed to create device namer: %v", err)
}
deviceNamers = append(deviceNamers, deviceNamer)
}
cdilib, err := nvcdi.New(
nvcdi.WithLogger(m.logger),
@ -252,7 +258,7 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) {
nvcdi.WithDevRoot(opts.devRoot),
nvcdi.WithNVIDIACTKPath(opts.nvidiaCTKPath),
nvcdi.WithLdconfigPath(opts.ldconfigPath),
nvcdi.WithDeviceNamer(deviceNamer),
nvcdi.WithDeviceNamers(deviceNamers...),
nvcdi.WithMode(opts.mode),
nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()),
nvcdi.WithCSVFiles(opts.csv.files.Value()),

View File

@ -48,8 +48,8 @@ type Interface interface {
GetCommonEdits() (*cdi.ContainerEdits, error)
GetAllDeviceSpecs() ([]specs.Device, error)
GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error)
GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error)
GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error)
GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error)
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error)
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error)
GetDeviceSpecsByID(...string) ([]specs.Device, error)
}

View File

@ -34,23 +34,26 @@ import (
)
// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) {
func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) {
edits, err := l.GetGPUDeviceEdits(d)
if err != nil {
return nil, fmt.Errorf("failed to get edits for device: %v", err)
}
name, err := l.deviceNamer.GetDeviceName(i, convert{d})
var deviceSpecs []specs.Device
names, err := l.deviceNamers.GetDeviceNames(i, convert{d})
if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err)
}
for _, name := range names {
spec := specs.Device{
Name: name,
ContainerEdits: *edits.ContainerEdits,
}
deviceSpecs = append(deviceSpecs, spec)
}
return &spec, nil
return deviceSpecs, nil
}
// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'.

View File

@ -68,7 +68,7 @@ func (l *gdslib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
}
// GetGPUDeviceSpecs is unsupported for the gdslib specs
func (l *gdslib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) {
func (l *gdslib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
}
@ -78,7 +78,7 @@ func (l *gdslib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
}
// GetMIGDeviceSpecs is unsupported for the gdslib specs
func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
}

View File

@ -58,16 +58,20 @@ func (l *csvlib) GetAllDeviceSpecs() ([]specs.Device, error) {
return nil, fmt.Errorf("failed to create container edits for CSV files: %v", err)
}
name, err := l.deviceNamer.GetDeviceName(0, uuidUnsupported{})
names, err := l.deviceNamers.GetDeviceNames(0, uuidIgnored{})
if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err)
}
var deviceSpecs []specs.Device
for _, name := range names {
deviceSpec := specs.Device{
Name: name,
ContainerEdits: *e.ContainerEdits,
}
return []specs.Device{deviceSpec}, nil
deviceSpecs = append(deviceSpecs, deviceSpec)
}
return deviceSpecs, nil
}
// GetCommonEdits generates a CDI specification that can be used for ANY devices
@ -82,7 +86,7 @@ func (l *csvlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
}
// GetGPUDeviceSpecs is unsupported for CSV files and always returns an error.
func (l *csvlib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) {
func (l *csvlib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported for CSV files")
}
@ -92,7 +96,7 @@ func (l *csvlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
}
// GetMIGDeviceSpecs is unsupported for CSV files and always returns an error.
func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported for CSV files")
}

View File

@ -208,11 +208,11 @@ func (l *nvmllib) getEditsForMIGDevice(nvmlDevice nvml.Device) (*cdi.ContainerEd
func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device
err := l.devicelib.VisitDevices(func(i int, d device.Device) error {
deviceSpec, err := l.GetGPUDeviceSpecs(i, d)
specsForDevice, err := l.GetGPUDeviceSpecs(i, d)
if err != nil {
return err
}
deviceSpecs = append(deviceSpecs, *deviceSpec)
deviceSpecs = append(deviceSpecs, specsForDevice...)
return nil
})
@ -225,11 +225,11 @@ func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) {
func (l *nvmllib) getMigDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device
err := l.devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error {
deviceSpec, err := l.GetMIGDeviceSpecs(i, d, j, mig)
specsForDevice, err := l.GetMIGDeviceSpecs(i, d, j, mig)
if err != nil {
return err
}
deviceSpecs = append(deviceSpecs, *deviceSpec)
deviceSpecs = append(deviceSpecs, specsForDevice...)
return nil
})

View File

@ -68,7 +68,7 @@ func (l *wsllib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
}
// GetGPUDeviceSpecs is unsupported on WSL and always returns an error.
func (l *wsllib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) {
func (l *wsllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported on WSL")
}
@ -78,7 +78,7 @@ func (l *wsllib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
}
// GetMIGDeviceSpecs is unsupported on WSL and always returns an error.
func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported on WSL")
}

View File

@ -44,7 +44,7 @@ type nvcdilib struct {
nvmllib nvml.Interface
mode string
devicelib device.Interface
deviceNamer DeviceNamer
deviceNamers DeviceNamers
driverRoot string
devRoot string
nvidiaCTKPath string
@ -75,8 +75,9 @@ func New(opts ...Option) (Interface, error) {
if l.logger == nil {
l.logger = logger.New()
}
if l.deviceNamer == nil {
l.deviceNamer, _ = NewDeviceNamer(DeviceNameStrategyIndex)
if len(l.deviceNamers) == 0 {
indexNamer, _ := NewDeviceNamer(DeviceNameStrategyIndex)
l.deviceNamers = []DeviceNamer{indexNamer}
}
if l.driverRoot == "" {
l.driverRoot = "/"

View File

@ -175,7 +175,7 @@ func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, e
}
// GetGPUDeviceSpecs is unsupported for the managementlib specs
func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) {
func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
}
@ -185,7 +185,7 @@ func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi
}
// GetMIGDeviceSpecs is unsupported for the managementlib specs
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
}

View File

@ -31,23 +31,25 @@ import (
)
// GetMIGDeviceSpecs returns the CDI device specs for the MIG device represented by 'mig' on the parent GPU 'd'.
func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) (*specs.Device, error) {
func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) ([]specs.Device, error) {
edits, err := l.GetMIGDeviceEdits(d, mig)
if err != nil {
return nil, fmt.Errorf("failed to get edits for device: %v", err)
}
name, err := l.deviceNamer.GetMigDeviceName(i, convert{d}, j, convert{mig})
names, err := l.deviceNamers.GetMigDeviceNames(i, convert{d}, j, convert{mig})
if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err)
}
var deviceSpecs []specs.Device
for _, name := range names {
spec := specs.Device{
Name: name,
ContainerEdits: *edits.ContainerEdits,
}
return &spec, nil
deviceSpecs = append(deviceSpecs, spec)
}
return deviceSpecs, nil
}
// GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'.

View File

@ -68,7 +68,7 @@ func (l *mofedlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error)
}
// GetGPUDeviceSpecs is unsupported for the mofedlib specs
func (l *mofedlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) {
func (l *mofedlib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
}
@ -78,7 +78,7 @@ func (l *mofedlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Cont
}
// GetMIGDeviceSpecs is unsupported for the mofedlib specs
func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
}

View File

@ -28,6 +28,9 @@ type UUIDer interface {
GetUUID() (string, error)
}
// DeviceNamers represents a list of device namers.
// GetDeviceNames and GetMigDeviceNames query every namer in the list and
// collect the non-empty names that they return.
type DeviceNamers []DeviceNamer
// DeviceNamer is an interface for getting device names
type DeviceNamer interface {
GetDeviceName(int, UUIDer) (string, error)
@ -102,6 +105,12 @@ type convert struct {
nvmlUUIDer
}
// uuidIgnored is a UUID source that never fails: it reports an empty UUID
// together with a nil error.
type uuidIgnored struct{}

// GetUUID always returns the empty string and no error.
func (uuidIgnored) GetUUID() (string, error) {
	var none string
	return none, nil
}
type uuidUnsupported struct{}
func (m convert) GetUUID() (string, error) {
@ -120,3 +129,39 @@ var errUUIDUnsupported = errors.New("GetUUID is not supported")
func (m uuidUnsupported) GetUUID() (string, error) {
return "", errUUIDUnsupported
}
// GetDeviceNames asks every namer in the list for a device name, passing i and
// d through unchanged, and returns the resulting names in list order.
// Namers that produce an empty name are skipped. The first error reported by a
// namer is returned as-is and aborts the collection. If no namer yields a
// non-empty name (including the case of an empty list) an error is returned.
func (l DeviceNamers) GetDeviceNames(i int, d UUIDer) ([]string, error) {
	var collected []string
	for idx := range l {
		candidate, err := l[idx].GetDeviceName(i, d)
		switch {
		case err != nil:
			return nil, err
		case candidate != "":
			// Only non-empty names contribute to the result.
			collected = append(collected, candidate)
		}
	}
	if len(collected) > 0 {
		return collected, nil
	}
	return nil, errors.New("no names defined")
}
// GetMigDeviceNames asks every namer in the list for a MIG device name,
// passing i, d, j and mig through unchanged, and returns the resulting names
// in list order.
// Namers that produce an empty name are skipped. The first error reported by a
// namer is returned as-is and aborts the collection. If no namer yields a
// non-empty name (including the case of an empty list) an error is returned.
func (l DeviceNamers) GetMigDeviceNames(i int, d UUIDer, j int, mig UUIDer) ([]string, error) {
	var collected []string
	for idx := range l {
		candidate, err := l[idx].GetMigDeviceName(i, d, j, mig)
		switch {
		case err != nil:
			return nil, err
		case candidate != "":
			// Only non-empty names contribute to the result.
			collected = append(collected, candidate)
		}
	}
	if len(collected) > 0 {
		return collected, nil
	}
	return nil, errors.New("no names defined")
}

View File

@ -34,10 +34,10 @@ func WithDeviceLib(devicelib device.Interface) Option {
}
}
// WithDeviceNamer sets the device namer for the library
func WithDeviceNamer(namer DeviceNamer) Option {
// WithDeviceNamers sets the device namers for the library
func WithDeviceNamers(namers ...DeviceNamer) Option {
return func(l *nvcdilib) {
l.deviceNamer = namer
l.deviceNamers = namers
}
}