Add --device-name-strategy flag for CDI spec generation

This change adds a --device-name-strategy flag for generating a CDI
specification. This allows a CDI spec to be generated with the following
names used for devices:

* type-index: gpu0 and mig0:1
* index: 0 and 0:1
* uuid: GPU and MIG UUIDs

Note that the use of 'index' generates a v0.5.0 CDI specification since
this relaxes the restriction on the device names.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2023-01-20 15:58:04 +01:00
parent 6706024687
commit 89bf81a9db
2 changed files with 119 additions and 9 deletions

View File

@ -44,10 +44,11 @@ type command struct {
} }
type config struct { type config struct {
output string output string
format string format string
root string deviceNameStrategy string
nvidiaCTKPath string root string
nvidiaCTKPath string
} }
// NewCommand constructs a generate-cdi command with the specified logger // NewCommand constructs a generate-cdi command with the specified logger
@ -86,6 +87,12 @@ func (m command) build() *cli.Command {
Value: formatYAML, Value: formatYAML,
Destination: &cfg.format, Destination: &cfg.format,
}, },
&cli.StringFlag{
Name: "device-name-strategy",
Usage: "Specify the strategy for generating device names. One of [type-index | index | uuid]",
Value: deviceNameStrategyTypeIndex,
Destination: &cfg.deviceNameStrategy,
},
&cli.StringFlag{ &cli.StringFlag{
Name: "root", Name: "root",
Usage: "Specify the root to use when discovering the entities that should be included in the CDI specification.", Usage: "Specify the root to use when discovering the entities that should be included in the CDI specification.",
@ -110,13 +117,24 @@ func (m command) validateFlags(r *cli.Context, cfg *config) error {
return fmt.Errorf("invalid output format: %v", cfg.format) return fmt.Errorf("invalid output format: %v", cfg.format)
} }
_, err := NewDeviceNamer(cfg.deviceNameStrategy)
if err != nil {
return err
}
return nil return nil
} }
func (m command) run(c *cli.Context, cfg *config) error { func (m command) run(c *cli.Context, cfg *config) error {
deviceNamer, err := NewDeviceNamer(cfg.deviceNameStrategy)
if err != nil {
return fmt.Errorf("failed to create device namer: %v", err)
}
spec, err := m.generateSpec( spec, err := m.generateSpec(
cfg.root, cfg.root,
discover.FindNvidiaCTK(m.logger, cfg.nvidiaCTKPath), discover.FindNvidiaCTK(m.logger, cfg.nvidiaCTKPath),
deviceNamer,
) )
if err != nil { if err != nil {
return fmt.Errorf("failed to generate CDI spec: %v", err) return fmt.Errorf("failed to generate CDI spec: %v", err)
@ -196,7 +214,7 @@ func writeToOutput(format string, data []byte, output io.Writer) error {
return nil return nil
} }
func (m command) generateSpec(root string, nvidiaCTKPath string) (*specs.Spec, error) { func (m command) generateSpec(root string, nvidiaCTKPath string, namer deviceNamer) (*specs.Spec, error) {
nvmllib := nvml.New() nvmllib := nvml.New()
if r := nvmllib.Init(); r != nvml.SUCCESS { if r := nvmllib.Init(); r != nvml.SUCCESS {
return nil, r return nil, r
@ -205,7 +223,7 @@ func (m command) generateSpec(root string, nvidiaCTKPath string) (*specs.Spec, e
devicelib := device.New(device.WithNvml(nvmllib)) devicelib := device.New(device.WithNvml(nvmllib))
deviceSpecs, err := m.generateDeviceSpecs(devicelib, root, nvidiaCTKPath) deviceSpecs, err := m.generateDeviceSpecs(devicelib, root, nvidiaCTKPath, namer)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to create device CDI specs: %v", err) return nil, fmt.Errorf("failed to create device CDI specs: %v", err)
} }
@ -266,7 +284,7 @@ func (m command) generateSpec(root string, nvidiaCTKPath string) (*specs.Spec, e
return &spec, nil return &spec, nil
} }
func (m command) generateDeviceSpecs(devicelib device.Interface, root string, nvidiaCTKPath string) ([]specs.Device, error) { func (m command) generateDeviceSpecs(devicelib device.Interface, root string, nvidiaCTKPath string, namer deviceNamer) ([]specs.Device, error) {
var deviceSpecs []specs.Device var deviceSpecs []specs.Device
err := devicelib.VisitDevices(func(i int, d device.Device) error { err := devicelib.VisitDevices(func(i int, d device.Device) error {
@ -287,8 +305,12 @@ func (m command) generateDeviceSpecs(devicelib device.Interface, root string, nv
return fmt.Errorf("failed to create container edits for device: %v", err) return fmt.Errorf("failed to create container edits for device: %v", err)
} }
deviceName, err := namer.GetDeviceName(i, d)
if err != nil {
return fmt.Errorf("failed to get device name: %v", err)
}
deviceSpec := specs.Device{ deviceSpec := specs.Device{
Name: fmt.Sprintf("gpu%d", i), Name: deviceName,
ContainerEdits: *deviceEdits.ContainerEdits, ContainerEdits: *deviceEdits.ContainerEdits,
} }
@ -310,8 +332,12 @@ func (m command) generateDeviceSpecs(devicelib device.Interface, root string, nv
return fmt.Errorf("failed to create container edits for MIG device: %v", err) return fmt.Errorf("failed to create container edits for MIG device: %v", err)
} }
deviceName, err := namer.GetMigDeviceName(i, j, mig)
if err != nil {
return fmt.Errorf("failed to get device name: %v", err)
}
deviceSpec := specs.Device{ deviceSpec := specs.Device{
Name: fmt.Sprintf("mig%v:%v", i, j), Name: deviceName,
ContainerEdits: *deviceEdits.ContainerEdits, ContainerEdits: *deviceEdits.ContainerEdits,
} }

View File

@ -0,0 +1,84 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package generate
import (
"fmt"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
// deviceNamer produces the names under which devices are recorded in a
// generated CDI spec. One method covers full GPUs, the other MIG devices.
type deviceNamer interface {
	// GetDeviceName returns the name to use for the full GPU gpu, given the
	// integer index gpuIndex passed alongside it by the caller.
	GetDeviceName(gpuIndex int, gpu device.Device) (string, error)
	// GetMigDeviceName returns the name to use for the MIG device mig, given
	// the two integer indices passed alongside it by the caller.
	// NOTE(review): presumably gpuIndex is the parent GPU's index and
	// migIndex the MIG device's index on that GPU; confirm against the caller.
	GetMigDeviceName(gpuIndex int, migIndex int, mig device.MigDevice) (string, error)
}
// Strategy names understood by NewDeviceNamer.
const (
	// deviceNameStrategyTypeIndex selects index-based names carrying a
	// device-type prefix ("gpu" or "mig").
	deviceNameStrategyTypeIndex = "type-index"
	// deviceNameStrategyIndex selects index-based names without any prefix.
	deviceNameStrategyIndex = "index"
	// deviceNameStrategyUUID selects names equal to the device UUID.
	deviceNameStrategyUUID = "uuid"
)
// deviceNameIndex is a deviceNamer that derives names from device indices,
// each preceded by a per-type prefix. The zero value has empty prefixes and
// therefore yields the bare indices.
type deviceNameIndex struct {
	// gpuPrefix is written in front of the index of a full GPU.
	gpuPrefix string
	// migPrefix is written in front of the index pair of a MIG device.
	migPrefix string
}
// deviceNameUUID is a deviceNamer that names each device by the UUID the
// device itself reports. It carries no configuration.
type deviceNameUUID struct{}
// NewDeviceNamer returns the deviceNamer that implements the given strategy.
// The result is used to name GPU and MIG devices while generating a CDI spec.
//
// Recognised strategies are "type-index" (indices prefixed with "gpu"/"mig"),
// "index" (bare indices) and "uuid" (device UUIDs). Any other value yields a
// nil namer and an error that quotes the rejected strategy.
func NewDeviceNamer(strategy string) (deviceNamer, error) {
	switch strategy {
	case deviceNameStrategyTypeIndex:
		return deviceNameIndex{gpuPrefix: "gpu", migPrefix: "mig"}, nil
	case deviceNameStrategyIndex:
		// No prefixes: names consist of the indices alone.
		return deviceNameIndex{}, nil
	case deviceNameStrategyUUID:
		return deviceNameUUID{}, nil
	default:
		return nil, fmt.Errorf("invalid device name strategy: %v", strategy)
	}
}
// GetDeviceName builds the name of a full GPU as the configured GPU prefix
// immediately followed by the index i. The device d is not consulted and the
// returned error is always nil.
func (s deviceNameIndex) GetDeviceName(i int, d device.Device) (string, error) {
	name := fmt.Sprintf("%s%d", s.gpuPrefix, i)
	return name, nil
}
// GetMigDeviceName builds the name of a MIG device as the configured MIG
// prefix followed by the two indices joined with a colon ("<i>:<j>"). The
// device d is not consulted and the returned error is always nil.
func (s deviceNameIndex) GetMigDeviceName(i, j int, d device.MigDevice) (string, error) {
	name := fmt.Sprintf("%s%d:%d", s.migPrefix, i, j)
	return name, nil
}
// GetDeviceName names a full GPU by the UUID obtained from d.GetUUID(). The
// index i is ignored. If the UUID query does not return nvml.SUCCESS, an empty
// name and an error carrying the returned status are produced.
func (s deviceNameUUID) GetDeviceName(i int, d device.Device) (string, error) {
	id, status := d.GetUUID()
	if status == nvml.SUCCESS {
		return id, nil
	}
	return "", fmt.Errorf("failed to get device UUID: %v", status)
}
// GetMigDeviceName names a MIG device by the UUID obtained from d.GetUUID().
// The indices i and j are ignored. If the UUID query does not return
// nvml.SUCCESS, an empty name and an error carrying the returned status are
// produced.
func (s deviceNameUUID) GetMigDeviceName(i, j int, d device.MigDevice) (string, error) {
	id, status := d.GetUUID()
	if status == nvml.SUCCESS {
		return id, nil
	}
	return "", fmt.Errorf("failed to get device UUID: %v", status)
}