From 52da12cf9a8642cb2b39ae99e2568bb029078b83 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 21 Mar 2023 15:51:36 +0200 Subject: [PATCH] Allow multiple device name strategies to be specified Signed-off-by: Evan Lezar --- CHANGELOG.md | 2 ++ cmd/nvidia-ctk/cdi/generate/generate.go | 48 ++++++++++++++----------- pkg/nvcdi/api.go | 4 +-- pkg/nvcdi/full-gpu-nvml.go | 17 +++++---- pkg/nvcdi/gds.go | 4 +-- pkg/nvcdi/lib-csv.go | 20 ++++++----- pkg/nvcdi/lib-nvml.go | 8 ++--- pkg/nvcdi/lib-wsl.go | 4 +-- pkg/nvcdi/lib.go | 7 ++-- pkg/nvcdi/management.go | 4 +-- pkg/nvcdi/mig-device-nvml.go | 18 +++++----- pkg/nvcdi/mofed.go | 4 +-- pkg/nvcdi/namer.go | 45 +++++++++++++++++++++++ pkg/nvcdi/options.go | 6 ++-- 14 files changed, 127 insertions(+), 64 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 04d20094..2838e367 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ * Add a `--spec-dir` option to the `nvidia-ctk cdi generate` command. This allows specs outside of `/etc/cdi` and `/var/run/cdi` to be processed. * Add support for extracting device major number from `/proc/devices` if `nvidia` is used as a device name over `nvidia-frontend`. +* Allow multiple device naming strategies for `nvidia-ctk cdi generate` command. This allows a single + CDI spec to be generated that includes GPUs by index and UUID. ## v1.15.0-rc.3 * Fix bug in `nvidia-ctk hook update-ldcache` where default `--ldconfig-path` value was not applied. diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go index 4eed5d5d..e6abf432 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate.go +++ b/cmd/nvidia-ctk/cdi/generate/generate.go @@ -42,16 +42,16 @@ type command struct { } type options struct { - output string - format string - deviceNameStrategy string - driverRoot string - devRoot string - nvidiaCTKPath string - ldconfigPath string - mode string - vendor string - class string + output string + format string + deviceNameStrategies cli.StringSlice + driverRoot string + devRoot string + nvidiaCTKPath string + ldconfigPath string + mode string + vendor string + class string librarySearchPaths cli.StringSlice @@ -109,11 +109,11 @@ func (m command) build() *cli.Command { Usage: "Specify the root where `/dev` is located. If this is not specified, the driver-root is assumed.", Destination: &opts.devRoot, }, - &cli.StringFlag{ + &cli.StringSliceFlag{ Name: "device-name-strategy", - Usage: "Specify the strategy for generating device names. One of [index | uuid | type-index]", - Value: nvcdi.DeviceNameStrategyIndex, - Destination: &opts.deviceNameStrategy, + Usage: "Specify the strategy for generating device names. If this is specified multiple times, the devices will be duplicated for each strategy. One of [index | uuid | type-index]", + Value: cli.NewStringSlice(nvcdi.DeviceNameStrategyIndex), + Destination: &opts.deviceNameStrategies, }, &cli.StringFlag{ Name: "driver-root", @@ -185,9 +185,11 @@ func (m command) validateFlags(c *cli.Context, opts *options) error { return fmt.Errorf("invalid discovery mode: %v", opts.mode) } - _, err := nvcdi.NewDeviceNamer(opts.deviceNameStrategy) - if err != nil { - return err + for _, strategy := range opts.deviceNameStrategies.Value() { + _, err := nvcdi.NewDeviceNamer(strategy) + if err != nil { + return err + } } opts.nvidiaCTKPath = config.ResolveNVIDIACTKPath(m.logger, opts.nvidiaCTKPath) @@ -241,9 +243,13 @@ func formatFromFilename(filename string) string { } func (m command) generateSpec(opts *options) (spec.Interface, error) { - deviceNamer, err := nvcdi.NewDeviceNamer(opts.deviceNameStrategy) - if err != nil { - return nil, fmt.Errorf("failed to create device namer: %v", err) + var deviceNamers []nvcdi.DeviceNamer + for _, strategy := range opts.deviceNameStrategies.Value() { + deviceNamer, err := nvcdi.NewDeviceNamer(strategy) + if err != nil { + return nil, fmt.Errorf("failed to create device namer: %v", err) + } + deviceNamers = append(deviceNamers, deviceNamer) } cdilib, err := nvcdi.New( @@ -252,7 +258,7 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) { nvcdi.WithDevRoot(opts.devRoot), nvcdi.WithNVIDIACTKPath(opts.nvidiaCTKPath), nvcdi.WithLdconfigPath(opts.ldconfigPath), - nvcdi.WithDeviceNamer(deviceNamer), + nvcdi.WithDeviceNamers(deviceNamers...), nvcdi.WithMode(opts.mode), nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()), nvcdi.WithCSVFiles(opts.csv.files.Value()), diff --git a/pkg/nvcdi/api.go b/pkg/nvcdi/api.go index 27c264de..1c84fefa 100644 --- a/pkg/nvcdi/api.go +++ b/pkg/nvcdi/api.go @@ -48,8 +48,8 @@ type Interface interface { GetCommonEdits() (*cdi.ContainerEdits, error) GetAllDeviceSpecs() ([]specs.Device, error) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) - GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) + GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) - GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) + GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) GetDeviceSpecsByID(...string) ([]specs.Device, error) } diff --git a/pkg/nvcdi/full-gpu-nvml.go b/pkg/nvcdi/full-gpu-nvml.go index 6793cbbc..9e4013ef 100644 --- a/pkg/nvcdi/full-gpu-nvml.go +++ b/pkg/nvcdi/full-gpu-nvml.go @@ -34,23 +34,26 @@ import ( ) // GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. -func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) { +func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) { edits, err := l.GetGPUDeviceEdits(d) if err != nil { return nil, fmt.Errorf("failed to get edits for device: %v", err) } - name, err := l.deviceNamer.GetDeviceName(i, convert{d}) + var deviceSpecs []specs.Device + names, err := l.deviceNamers.GetDeviceNames(i, convert{d}) if err != nil { return nil, fmt.Errorf("failed to get device name: %v", err) } - - spec := specs.Device{ - Name: name, - ContainerEdits: *edits.ContainerEdits, + for _, name := range names { + spec := specs.Device{ + Name: name, + ContainerEdits: *edits.ContainerEdits, + } + deviceSpecs = append(deviceSpecs, spec) } - return &spec, nil + return deviceSpecs, nil } // GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'. diff --git a/pkg/nvcdi/gds.go b/pkg/nvcdi/gds.go index 74a186c1..915cb94a 100644 --- a/pkg/nvcdi/gds.go +++ b/pkg/nvcdi/gds.go @@ -68,7 +68,7 @@ func (l *gdslib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) { } // GetGPUDeviceSpecs is unsupported for the gdslib specs -func (l *gdslib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) { +func (l *gdslib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) { return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported") } @@ -78,7 +78,7 @@ func (l *gdslib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai } // GetMIGDeviceSpecs is unsupported for the gdslib specs -func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { +func (l *gdslib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) { return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported") } diff --git a/pkg/nvcdi/lib-csv.go b/pkg/nvcdi/lib-csv.go index 77b5a6d4..4acc5cd6 100644 --- a/pkg/nvcdi/lib-csv.go +++ b/pkg/nvcdi/lib-csv.go @@ -58,16 +58,20 @@ func (l *csvlib) GetAllDeviceSpecs() ([]specs.Device, error) { return nil, fmt.Errorf("failed to create container edits for CSV files: %v", err) } - name, err := l.deviceNamer.GetDeviceName(0, uuidUnsupported{}) + names, err := l.deviceNamers.GetDeviceNames(0, uuidIgnored{}) if err != nil { return nil, fmt.Errorf("failed to get device name: %v", err) } - - deviceSpec := specs.Device{ - Name: name, - ContainerEdits: *e.ContainerEdits, + var deviceSpecs []specs.Device + for _, name := range names { + deviceSpec := specs.Device{ + Name: name, + ContainerEdits: *e.ContainerEdits, + } + deviceSpecs = append(deviceSpecs, deviceSpec) } - return []specs.Device{deviceSpec}, nil + + return deviceSpecs, nil } // GetCommonEdits generates a CDI specification that can be used for ANY devices @@ -82,7 +86,7 @@ func (l *csvlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) { } // GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. -func (l *csvlib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) { +func (l *csvlib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) { return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported for CSV files") } @@ -92,7 +96,7 @@ func (l *csvlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai } // GetMIGDeviceSpecs returns the CDI device specs for the full MIG represented by 'device'. -func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { +func (l *csvlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) { return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported for CSV files") } diff --git a/pkg/nvcdi/lib-nvml.go b/pkg/nvcdi/lib-nvml.go index ae7f3173..0403901a 100644 --- a/pkg/nvcdi/lib-nvml.go +++ b/pkg/nvcdi/lib-nvml.go @@ -208,11 +208,11 @@ func (l *nvmllib) getEditsForMIGDevice(nvmlDevice nvml.Device) (*cdi.ContainerEd func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) { var deviceSpecs []specs.Device err := l.devicelib.VisitDevices(func(i int, d device.Device) error { - deviceSpec, err := l.GetGPUDeviceSpecs(i, d) + specsForDevice, err := l.GetGPUDeviceSpecs(i, d) if err != nil { return err } - deviceSpecs = append(deviceSpecs, *deviceSpec) + deviceSpecs = append(deviceSpecs, specsForDevice...) return nil }) @@ -225,11 +225,11 @@ func (l *nvmllib) getGPUDeviceSpecs() ([]specs.Device, error) { func (l *nvmllib) getMigDeviceSpecs() ([]specs.Device, error) { var deviceSpecs []specs.Device err := l.devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error { - deviceSpec, err := l.GetMIGDeviceSpecs(i, d, j, mig) + specsForDevice, err := l.GetMIGDeviceSpecs(i, d, j, mig) if err != nil { return err } - deviceSpecs = append(deviceSpecs, *deviceSpec) + deviceSpecs = append(deviceSpecs, specsForDevice...) return nil }) diff --git a/pkg/nvcdi/lib-wsl.go b/pkg/nvcdi/lib-wsl.go index 620aa75d..5e8ea97b 100644 --- a/pkg/nvcdi/lib-wsl.go +++ b/pkg/nvcdi/lib-wsl.go @@ -68,7 +68,7 @@ func (l *wsllib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) { } // GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. -func (l *wsllib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) { +func (l *wsllib) GetGPUDeviceSpecs(i int, d device.Device) ([]specs.Device, error) { return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported on WSL") } @@ -78,7 +78,7 @@ func (l *wsllib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Contai } // GetMIGDeviceSpecs returns the CDI device specs for the full MIG represented by 'device'. -func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { +func (l *wsllib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) { return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported on WSL") } diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index 80916540..53692e06 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -44,7 +44,7 @@ type nvcdilib struct { nvmllib nvml.Interface mode string devicelib device.Interface - deviceNamer DeviceNamer + deviceNamers DeviceNamers driverRoot string devRoot string nvidiaCTKPath string @@ -75,8 +75,9 @@ func New(opts ...Option) (Interface, error) { if l.logger == nil { l.logger = logger.New() } - if l.deviceNamer == nil { - l.deviceNamer, _ = NewDeviceNamer(DeviceNameStrategyIndex) + if len(l.deviceNamers) == 0 { + indexNamer, _ := NewDeviceNamer(DeviceNameStrategyIndex) + l.deviceNamers = []DeviceNamer{indexNamer} } if l.driverRoot == "" { l.driverRoot = "/" diff --git a/pkg/nvcdi/management.go b/pkg/nvcdi/management.go index f21ac34b..c906db2c 100644 --- a/pkg/nvcdi/management.go +++ b/pkg/nvcdi/management.go @@ -175,7 +175,7 @@ func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, e } // GetGPUDeviceSpecs is unsupported for the managementlib specs -func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) { +func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) { return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported") } @@ -185,7 +185,7 @@ func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi } // GetMIGDeviceSpecs is unsupported for the managementlib specs -func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { +func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) { return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported") } diff --git a/pkg/nvcdi/mig-device-nvml.go b/pkg/nvcdi/mig-device-nvml.go index a96ad9f7..86ce04d2 100644 --- a/pkg/nvcdi/mig-device-nvml.go +++ b/pkg/nvcdi/mig-device-nvml.go @@ -31,23 +31,25 @@ import ( ) // GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'. -func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) (*specs.Device, error) { +func (l *nvmllib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) ([]specs.Device, error) { edits, err := l.GetMIGDeviceEdits(d, mig) if err != nil { return nil, fmt.Errorf("failed to get edits for device: %v", err) } - name, err := l.deviceNamer.GetMigDeviceName(i, convert{d}, j, convert{mig}) + names, err := l.deviceNamers.GetMigDeviceNames(i, convert{d}, j, convert{mig}) if err != nil { return nil, fmt.Errorf("failed to get device name: %v", err) } - - spec := specs.Device{ - Name: name, - ContainerEdits: *edits.ContainerEdits, + var deviceSpecs []specs.Device + for _, name := range names { + spec := specs.Device{ + Name: name, + ContainerEdits: *edits.ContainerEdits, + } + deviceSpecs = append(deviceSpecs, spec) } - - return &spec, nil + return deviceSpecs, nil } // GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'. diff --git a/pkg/nvcdi/mofed.go b/pkg/nvcdi/mofed.go index 607b7baf..9f45cfc9 100644 --- a/pkg/nvcdi/mofed.go +++ b/pkg/nvcdi/mofed.go @@ -68,7 +68,7 @@ func (l *mofedlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) } // GetGPUDeviceSpecs is unsupported for the mofedlib specs -func (l *mofedlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) { +func (l *mofedlib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) { return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported") } @@ -78,7 +78,7 @@ func (l *mofedlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.Cont } // GetMIGDeviceSpecs is unsupported for the mofedlib specs -func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { +func (l *mofedlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) { return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported") } diff --git a/pkg/nvcdi/namer.go b/pkg/nvcdi/namer.go index b460611c..e6a27f45 100644 --- a/pkg/nvcdi/namer.go +++ b/pkg/nvcdi/namer.go @@ -28,6 +28,9 @@ type UUIDer interface { GetUUID() (string, error) } +// DeviceNamers represents a list of device namers +type DeviceNamers []DeviceNamer + // DeviceNamer is an interface for getting device names type DeviceNamer interface { GetDeviceName(int, UUIDer) (string, error) @@ -102,6 +105,12 @@ type convert struct { nvmlUUIDer } +type uuidIgnored struct{} + +func (m uuidIgnored) GetUUID() (string, error) { + return "", nil +} + type uuidUnsupported struct{} func (m convert) GetUUID() (string, error) { @@ -120,3 +129,39 @@ var errUUIDUnsupported = errors.New("GetUUID is not supported") func (m uuidUnsupported) GetUUID() (string, error) { return "", errUUIDUnsupported } + +func (l DeviceNamers) GetDeviceNames(i int, d UUIDer) ([]string, error) { + var names []string + for _, namer := range l { + name, err := namer.GetDeviceName(i, d) + if err != nil { + return nil, err + } + if name == "" { + continue + } + names = append(names, name) + } + if len(names) == 0 { + return nil, errors.New("no names defined") + } + return names, nil +} + +func (l DeviceNamers) GetMigDeviceNames(i int, d UUIDer, j int, mig UUIDer) ([]string, error) { + var names []string + for _, namer := range l { + name, err := namer.GetMigDeviceName(i, d, j, mig) + if err != nil { + return nil, err + } + if name == "" { + continue + } + names = append(names, name) + } + if len(names) == 0 { + return nil, errors.New("no names defined") + } + return names, nil +} diff --git a/pkg/nvcdi/options.go b/pkg/nvcdi/options.go index 731e2ee6..388bb0a2 100644 --- a/pkg/nvcdi/options.go +++ b/pkg/nvcdi/options.go @@ -34,10 +34,10 @@ func WithDeviceLib(devicelib device.Interface) Option { } } -// WithDeviceNamer sets the device namer for the library -func WithDeviceNamer(namer DeviceNamer) Option { +// WithDeviceNamers sets the device namer for the library +func WithDeviceNamers(namers ...DeviceNamer) Option { return func(l *nvcdilib) { - l.deviceNamer = namer + l.deviceNamers = namers } }