Allow multiple device name strategies to be specified

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2023-03-21 15:51:36 +02:00
parent a8d48808d7
commit 52da12cf9a
14 changed files with 127 additions and 64 deletions

View File

@ -2,6 +2,8 @@
* Add a `--spec-dir` option to the `nvidia-ctk cdi generate` command. This allows specs outside of `/etc/cdi` and `/var/run/cdi` to be processed. * Add a `--spec-dir` option to the `nvidia-ctk cdi generate` command. This allows specs outside of `/etc/cdi` and `/var/run/cdi` to be processed.
* Add support for extracting device major number from `/proc/devices` if `nvidia` is used as a device name over `nvidia-frontend`. * Add support for extracting device major number from `/proc/devices` if `nvidia` is used as a device name over `nvidia-frontend`.
* Allow multiple device naming strategies for the `nvidia-ctk cdi generate` command. This allows a single
CDI spec to be generated that includes GPUs by index and UUID.
## v1.15.0-rc.3 ## v1.15.0-rc.3
* Fix bug in `nvidia-ctk hook update-ldcache` where default `--ldconfig-path` value was not applied. * Fix bug in `nvidia-ctk hook update-ldcache` where default `--ldconfig-path` value was not applied.

View File

@ -42,16 +42,16 @@ type command struct {
} }
type options struct { type options struct {
output string output string
format string format string
deviceNameStrategy string deviceNameStrategies cli.StringSlice
driverRoot string driverRoot string
devRoot string devRoot string
nvidiaCTKPath string nvidiaCTKPath string
ldconfigPath string ldconfigPath string
mode string mode string
vendor string vendor string
class string class string
librarySearchPaths cli.StringSlice librarySearchPaths cli.StringSlice
@ -109,11 +109,11 @@ func (m command) build() *cli.Command {
Usage: "Specify the root where `/dev` is located. If this is not specified, the driver-root is assumed.", Usage: "Specify the root where `/dev` is located. If this is not specified, the driver-root is assumed.",
Destination: &opts.devRoot, Destination: &opts.devRoot,
}, },
&cli.StringFlag{ &cli.StringSliceFlag{
Name: "device-name-strategy", Name: "device-name-strategy",
Usage: "Specify the strategy for generating device names. One of [index | uuid | type-index]", Usage: "Specify the strategy for generating device names. If this is specified multiple times, the devices will be duplicated for each strategy. One of [index | uuid | type-index]",
Value: nvcdi.DeviceNameStrategyIndex, Value: cli.NewStringSlice(nvcdi.DeviceNameStrategyIndex),
Destination: &opts.deviceNameStrategy, Destination: &opts.deviceNameStrategies,
}, },
&cli.StringFlag{ &cli.StringFlag{
Name: "driver-root", Name: "driver-root",
@ -185,9 +185,11 @@ func (m command) validateFlags(c *cli.Context, opts *options) error {
return fmt.Errorf("invalid discovery mode: %v", opts.mode) return fmt.Errorf("invalid discovery mode: %v", opts.mode)
} }
_, err := nvcdi.NewDeviceNamer(opts.deviceNameStrategy) for _, strategy := range opts.deviceNameStrategies.Value() {
if err != nil { _, err := nvcdi.NewDeviceNamer(strategy)
return err if err != nil {
return err
}
} }
opts.nvidiaCTKPath = config.ResolveNVIDIACTKPath(m.logger, opts.nvidiaCTKPath) opts.nvidiaCTKPath = config.ResolveNVIDIACTKPath(m.logger, opts.nvidiaCTKPath)
@ -241,9 +243,13 @@ func formatFromFilename(filename string) string {
} }
func (m command) generateSpec(opts *options) (spec.Interface, error) { func (m command) generateSpec(opts *options) (spec.Interface, error) {
deviceNamer, err := nvcdi.NewDeviceNamer(opts.deviceNameStrategy) var deviceNamers []nvcdi.DeviceNamer
if err != nil { for _, strategy := range opts.deviceNameStrategies.Value() {
return nil, fmt.Errorf("failed to create device namer: %v", err) deviceNamer, err := nvcdi.NewDeviceNamer(strategy)
if err != nil {
return nil, fmt.Errorf("failed to create device namer: %v", err)
}
deviceNamers = append(deviceNamers, deviceNamer)
} }
cdilib, err := nvcdi.New( cdilib, err := nvcdi.New(
@ -252,7 +258,7 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) {
nvcdi.WithDevRoot(opts.devRoot), nvcdi.WithDevRoot(opts.devRoot),
nvcdi.WithNVIDIACTKPath(opts.nvidiaCTKPath), nvcdi.WithNVIDIACTKPath(opts.nvidiaCTKPath),
nvcdi.WithLdconfigPath(opts.ldconfigPath), nvcdi.WithLdconfigPath(opts.ldconfigPath),
nvcdi.WithDeviceNamer(deviceNamer), nvcdi.WithDeviceNamers(deviceNamers...),
nvcdi.WithMode(opts.mode), nvcdi.WithMode(opts.mode),
nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()), nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()),
nvcdi.WithCSVFiles(opts.csv.files.Value()), nvcdi.WithCSVFiles(opts.csv.files.Value()),

View File

@ -48,8 +48,8 @@ type Interface interface {
GetCommonEdits() (*cdi.ContainerEdits, error) GetCommonEdits() (*cdi.ContainerEdits, error)
GetAllDeviceSpecs() ([]specs.Device, error) GetAllDeviceSpecs() ([]specs.Device, error)
GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error)
GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error)
GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error)
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error)
GetDeviceSpecsByID(...string) ([]specs.Device, error) GetDeviceSpecsByID(...string) ([]specs.Device, error)
} }

View File

@ -34,23 +34,26 @@ import (
) )
// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. // GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) { func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) {
edits, err := l.GetGPUDeviceEdits(d) edits, err := l.GetGPUDeviceEdits(d)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get edits for device: %v", err) return nil, fmt.Errorf("failed to get edits for device: %v", err)
} }
name, err := l.deviceNamer.GetDeviceName(i, convert{d}) var deviceSpecs []specs.Device
names, err := l.deviceNamers.GetDeviceNames(i, convert{d})
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err) return nil, fmt.Errorf("failed to get device name: %v", err)
} }
for _, name := range names {
spec := specs.Device{ spec := specs.Device{
Name: name, Name: name,
ContainerEdits: *edits.ContainerEdits, ContainerEdits: *edits.ContainerEdits,
}
deviceSpecs = append(deviceSpecs, spec)
} }
return &spec, nil return deviceSpecs, nil
} }
// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'. // GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'.

View File

@ -68,7 +68,7 @@ func (l *gdslib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
} }
// GetGPUDeviceSpecs is unsupported for the gdslib specs // GetGPUDeviceSpecs is unsupported for the gdslib specs
func (l *gdslib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) { func (l *gdslib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported") return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
} }
@ -78,7 +78,7 @@ func (l *gdslib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
} }
// GetMIGDeviceSpecs is unsupported for the gdslib specs // GetMIGDeviceSpecs is unsupported for the gdslib specs
func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported") return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
} }

View File

@ -58,16 +58,20 @@ func (l *csvlib) GetAllDeviceSpecs() ([]specs.Device, error) {
return nil, fmt.Errorf("failed to create container edits for CSV files: %v", err) return nil, fmt.Errorf("failed to create container edits for CSV files: %v", err)
} }
name, err := l.deviceNamer.GetDeviceName(0, uuidUnsupported{}) names, err := l.deviceNamers.GetDeviceNames(0, uuidIgnored{})
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err) return nil, fmt.Errorf("failed to get device name: %v", err)
} }
var deviceSpecs []specs.Device
deviceSpec := specs.Device{ for _, name := range names {
Name: name, deviceSpec := specs.Device{
ContainerEdits: *e.ContainerEdits, Name: name,
ContainerEdits: *e.ContainerEdits,
}
deviceSpecs = append(deviceSpecs, deviceSpec)
} }
return []specs.Device{deviceSpec}, nil
return deviceSpecs, nil
} }
// GetCommonEdits generates a CDI specification that can be used for ANY devices // GetCommonEdits generates a CDI specification that can be used for ANY devices
@ -82,7 +86,7 @@ func (l *csvlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
} }
// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. // GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
func (l *csvlib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) { func (l *csvlib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported for CSV files") return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported for CSV files")
} }
@ -92,7 +96,7 @@ func (l *csvlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
} }
// GetMIGDeviceSpecs returns the CDI device specs for the full MIG represented by 'device'. // GetMIGDeviceSpecs returns the CDI device specs for the full MIG represented by 'device'.
func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported for CSV files") return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported for CSV files")
} }

View File

@ -208,11 +208,11 @@ func (l *nvmllib) getEditsForMIGDevice(nvmlDevice nvml.Device) (*cdi.ContainerEd
func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) { func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device var deviceSpecs []specs.Device
err := l.devicelib.VisitDevices(func(i int, d device.Device) error { err := l.devicelib.VisitDevices(func(i int, d device.Device) error {
deviceSpec, err := l.GetGPUDeviceSpecs(i, d) specsForDevice, err := l.GetGPUDeviceSpecs(i, d)
if err != nil { if err != nil {
return err return err
} }
deviceSpecs = append(deviceSpecs, *deviceSpec) deviceSpecs = append(deviceSpecs, specsForDevice...)
return nil return nil
}) })
@ -225,11 +225,11 @@ func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) {
func (l *nvmllib) getMigDeviceSpecs() ([]specs.Device, error) { func (l *nvmllib) getMigDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device var deviceSpecs []specs.Device
err := l.devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error { err := l.devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error {
deviceSpec, err := l.GetMIGDeviceSpecs(i, d, j, mig) specsForDevice, err := l.GetMIGDeviceSpecs(i, d, j, mig)
if err != nil { if err != nil {
return err return err
} }
deviceSpecs = append(deviceSpecs, *deviceSpec) deviceSpecs = append(deviceSpecs, specsForDevice...)
return nil return nil
}) })

View File

@ -68,7 +68,7 @@ func (l *wsllib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
} }
// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. // GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
func (l *wsllib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) { func (l *wsllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported on WSL") return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported on WSL")
} }
@ -78,7 +78,7 @@ func (l *wsllib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai
} }
// GetMIGDeviceSpecs returns the CDI device specs for the full MIG represented by 'device'. // GetMIGDeviceSpecs returns the CDI device specs for the full MIG represented by 'device'.
func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported on WSL") return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported on WSL")
} }

View File

@ -44,7 +44,7 @@ type nvcdilib struct {
nvmllib nvml.Interface nvmllib nvml.Interface
mode string mode string
devicelib device.Interface devicelib device.Interface
deviceNamer DeviceNamer deviceNamers DeviceNamers
driverRoot string driverRoot string
devRoot string devRoot string
nvidiaCTKPath string nvidiaCTKPath string
@ -75,8 +75,9 @@ func New(opts ...Option) (Interface, error) {
if l.logger == nil { if l.logger == nil {
l.logger = logger.New() l.logger = logger.New()
} }
if l.deviceNamer == nil { if len(l.deviceNamers) == 0 {
l.deviceNamer, _ = NewDeviceNamer(DeviceNameStrategyIndex) indexNamer, _ := NewDeviceNamer(DeviceNameStrategyIndex)
l.deviceNamers = []DeviceNamer{indexNamer}
} }
if l.driverRoot == "" { if l.driverRoot == "" {
l.driverRoot = "/" l.driverRoot = "/"

View File

@ -175,7 +175,7 @@ func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, e
} }
// GetGPUDeviceSpecs is unsupported for the managementlib specs // GetGPUDeviceSpecs is unsupported for the managementlib specs
func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) { func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported") return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
} }
@ -185,7 +185,7 @@ func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi
} }
// GetMIGDeviceSpecs is unsupported for the managementlib specs // GetMIGDeviceSpecs is unsupported for the managementlib specs
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported") return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
} }

View File

@ -31,23 +31,25 @@ import (
) )
// GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. // GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) (*specs.Device, error) { func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) ([]specs.Device, error) {
edits, err := l.GetMIGDeviceEdits(d, mig) edits, err := l.GetMIGDeviceEdits(d, mig)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get edits for device: %v", err) return nil, fmt.Errorf("failed to get edits for device: %v", err)
} }
name, err := l.deviceNamer.GetMigDeviceName(i, convert{d}, j, convert{mig}) names, err := l.deviceNamers.GetMigDeviceNames(i, convert{d}, j, convert{mig})
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err) return nil, fmt.Errorf("failed to get device name: %v", err)
} }
var deviceSpecs []specs.Device
spec := specs.Device{ for _, name := range names {
Name: name, spec := specs.Device{
ContainerEdits: *edits.ContainerEdits, Name: name,
ContainerEdits: *edits.ContainerEdits,
}
deviceSpecs = append(deviceSpecs, spec)
} }
return deviceSpecs, nil
return &spec, nil
} }
// GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'. // GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'.

View File

@ -68,7 +68,7 @@ func (l *mofedlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error)
} }
// GetGPUDeviceSpecs is unsupported for the mofedlib specs // GetGPUDeviceSpecs is unsupported for the mofedlib specs
func (l *mofedlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) { func (l *mofedlib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) {
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported") return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
} }
@ -78,7 +78,7 @@ func (l *mofedlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Cont
} }
// GetMIGDeviceSpecs is unsupported for the mofedlib specs // GetMIGDeviceSpecs is unsupported for the mofedlib specs
func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported") return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
} }

View File

@ -28,6 +28,9 @@ type UUIDer interface {
GetUUID() (string, error) GetUUID() (string, error)
} }
// DeviceNamers represents a list of device namers.
// Its GetDeviceNames and GetMigDeviceNames methods query every namer in list
// order and collect the non-empty names that they return.
type DeviceNamers []DeviceNamer
// DeviceNamer is an interface for getting device names // DeviceNamer is an interface for getting device names
type DeviceNamer interface { type DeviceNamer interface {
GetDeviceName(int, UUIDer) (string, error) GetDeviceName(int, UUIDer) (string, error)
@ -102,6 +105,12 @@ type convert struct {
nvmlUUIDer nvmlUUIDer
} }
// uuidIgnored is a UUIDer that has no UUID to report. Unlike uuidUnsupported,
// which returns errUUIDUnsupported, it yields an empty UUID without an error.
type uuidIgnored struct{}

// GetUUID always returns the empty string and a nil error.
func (uuidIgnored) GetUUID() (string, error) {
	return "", nil
}
type uuidUnsupported struct{} type uuidUnsupported struct{}
func (m convert) GetUUID() (string, error) { func (m convert) GetUUID() (string, error) {
@ -120,3 +129,39 @@ var errUUIDUnsupported = errors.New("GetUUID is not supported")
func (m uuidUnsupported) GetUUID() (string, error) { func (m uuidUnsupported) GetUUID() (string, error) {
return "", errUUIDUnsupported return "", errUUIDUnsupported
} }
// GetDeviceNames returns the names for the device with index i and UUIDer d
// as generated by each namer in the list, in list order.
// Namers that return an empty name are skipped. The first error returned by a
// namer aborts the lookup and is returned as-is. If no namer produces a
// non-empty name (including the case of an empty list), an error is returned.
func (l DeviceNamers) GetDeviceNames(i int, d UUIDer) ([]string, error) {
	var collected []string
	for idx := range l {
		candidate, err := l[idx].GetDeviceName(i, d)
		switch {
		case err != nil:
			return nil, err
		case candidate != "":
			collected = append(collected, candidate)
		}
	}
	if len(collected) > 0 {
		return collected, nil
	}
	return nil, errors.New("no names defined")
}
// GetMigDeviceNames returns the names for the MIG device with index j and
// UUIDer mig on the parent device with index i and UUIDer d, as generated by
// each namer in the list, in list order.
// Namers that return an empty name are skipped. The first error returned by a
// namer aborts the lookup and is returned as-is. If no namer produces a
// non-empty name (including the case of an empty list), an error is returned.
func (l DeviceNamers) GetMigDeviceNames(i int, d UUIDer, j int, mig UUIDer) ([]string, error) {
	var collected []string
	for _, n := range l {
		migName, err := n.GetMigDeviceName(i, d, j, mig)
		if err != nil {
			return nil, err
		}
		if migName != "" {
			collected = append(collected, migName)
		}
	}
	if len(collected) > 0 {
		return collected, nil
	}
	return nil, errors.New("no names defined")
}

View File

@ -34,10 +34,10 @@ func WithDeviceLib(devicelib device.Interface) Option {
} }
} }
// WithDeviceNamer sets the device namer for the library // WithDeviceNamers sets the device namer for the library
func WithDeviceNamer(namer DeviceNamer) Option { func WithDeviceNamers(namers ...DeviceNamer) Option {
return func(l *nvcdilib) { return func(l *nvcdilib) {
l.deviceNamer = namer l.deviceNamers = namers
} }
} }