Add nvcdi package with basic CDI generation API

This change adds an nvcdi package that exposes a basic API for
CDI spec generation. This is used from the nvidia-ctk cdi generate
command and can be consumed by DRA implementations and the device plugin.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2022-12-02 14:17:52 +01:00
parent 179133c8ad
commit 5b110fba2d
11 changed files with 456 additions and 189 deletions

View File

@ -17,9 +17,12 @@
package generate package generate
import ( import (
"fmt"
"path/filepath" "path/filepath"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go" "github.com/container-orchestrated-devices/container-device-interface/specs-go"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
) )
@ -33,6 +36,16 @@ type deviceFolderPermissions struct {
var _ discover.Discover = (*deviceFolderPermissions)(nil) var _ discover.Discover = (*deviceFolderPermissions)(nil)
// GetDeviceFolderPermissionHookEdits gets the edits required for device folder permissions discoverer
func GetDeviceFolderPermissionHookEdits(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, deviceSpecs []specs.Device) (*cdi.ContainerEdits, error) {
deviceFolderPermissionHooks, err := NewDeviceFolderPermissionHookDiscoverer(logger, driverRoot, nvidiaCTKPath, deviceSpecs)
if err != nil {
return nil, fmt.Errorf("failed to generated permission hooks for device nodes: %v", err)
}
return edits.FromDiscoverer(deviceFolderPermissionHooks)
}
// NewDeviceFolderPermissionHookDiscoverer creates a discoverer that can be used to update the permissions for the parent folders of nested device nodes from the specified set of device specs. // NewDeviceFolderPermissionHookDiscoverer creates a discoverer that can be used to update the permissions for the parent folders of nested device nodes from the specified set of device specs.
// This works around an issue with rootless podman when using crun as a low-level runtime. // This works around an issue with rootless podman when using crun as a low-level runtime.
// See https://github.com/containers/crun/issues/1047 // See https://github.com/containers/crun/issues/1047

View File

@ -23,8 +23,8 @@ import (
"path/filepath" "path/filepath"
"strings" "strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
specs "github.com/container-orchestrated-devices/container-device-interface/specs-go" specs "github.com/container-orchestrated-devices/container-device-interface/specs-go"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
@ -90,7 +90,7 @@ func (m command) build() *cli.Command {
&cli.StringFlag{ &cli.StringFlag{
Name: "device-name-strategy", Name: "device-name-strategy",
Usage: "Specify the strategy for generating device names. One of [index | uuid | type-index]", Usage: "Specify the strategy for generating device names. One of [index | uuid | type-index]",
Value: deviceNameStrategyIndex, Value: nvcdi.DeviceNameStrategyIndex,
Destination: &cfg.deviceNameStrategy, Destination: &cfg.deviceNameStrategy,
}, },
&cli.StringFlag{ &cli.StringFlag{
@ -117,7 +117,7 @@ func (m command) validateFlags(r *cli.Context, cfg *config) error {
return fmt.Errorf("invalid output format: %v", cfg.format) return fmt.Errorf("invalid output format: %v", cfg.format)
} }
_, err := newDeviceNamer(cfg.deviceNameStrategy) _, err := nvcdi.NewDeviceNamer(cfg.deviceNameStrategy)
if err != nil { if err != nil {
return err return err
} }
@ -126,16 +126,7 @@ func (m command) validateFlags(r *cli.Context, cfg *config) error {
} }
func (m command) run(c *cli.Context, cfg *config) error { func (m command) run(c *cli.Context, cfg *config) error {
deviceNamer, err := newDeviceNamer(cfg.deviceNameStrategy) spec, err := m.generateSpec(cfg)
if err != nil {
return fmt.Errorf("failed to create device namer: %v", err)
}
spec, err := m.generateSpec(
cfg.driverRoot,
discover.FindNvidiaCTK(m.logger, cfg.nvidiaCTKPath),
deviceNamer,
)
if err != nil { if err != nil {
return fmt.Errorf("failed to generate CDI spec: %v", err) return fmt.Errorf("failed to generate CDI spec: %v", err)
} }
@ -214,7 +205,12 @@ func writeToOutput(format string, data []byte, output io.Writer) error {
return nil return nil
} }
func (m command) generateSpec(driverRoot string, nvidiaCTKPath string, namer deviceNamer) (*specs.Spec, error) { func (m command) generateSpec(cfg *config) (*specs.Spec, error) {
deviceNamer, err := nvcdi.NewDeviceNamer(cfg.deviceNameStrategy)
if err != nil {
return nil, fmt.Errorf("failed to create device namer: %v", err)
}
nvmllib := nvml.New() nvmllib := nvml.New()
if r := nvmllib.Init(); r != nvml.SUCCESS { if r := nvmllib.Init(); r != nvml.SUCCESS {
return nil, r return nil, r
@ -223,7 +219,16 @@ func (m command) generateSpec(driverRoot string, nvidiaCTKPath string, namer dev
devicelib := device.New(device.WithNvml(nvmllib)) devicelib := device.New(device.WithNvml(nvmllib))
deviceSpecs, err := m.generateDeviceSpecs(devicelib, driverRoot, nvidiaCTKPath, namer) cdilib := nvcdi.New(
nvcdi.WithLogger(m.logger),
nvcdi.WithDriverRoot(cfg.driverRoot),
nvcdi.WithNVIDIACTKPath(cfg.nvidiaCTKPath),
nvcdi.WithDeviceNamer(deviceNamer),
nvcdi.WithDeviceLib(devicelib),
nvcdi.WithNvmlLib(nvmllib),
)
deviceSpecs, err := cdilib.GetAllDeviceSpecs()
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to create device CDI specs: %v", err) return nil, fmt.Errorf("failed to create device CDI specs: %v", err)
} }
@ -232,20 +237,16 @@ func (m command) generateSpec(driverRoot string, nvidiaCTKPath string, namer dev
deviceSpecs = append(deviceSpecs, allDevice) deviceSpecs = append(deviceSpecs, allDevice)
common, err := NewCommonDiscoverer(m.logger, driverRoot, nvidiaCTKPath, nvmllib) commonEdits, err := cdilib.GetCommonEdits()
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err) return nil, fmt.Errorf("failed to create edits common for entities: %v", err)
}
deviceFolderPermissionEdits, err := GetDeviceFolderPermissionHookEdits(m.logger, cfg.driverRoot, cfg.nvidiaCTKPath, deviceSpecs)
if err != nil {
return nil, fmt.Errorf("failed to generated edits for device folder permissions: %v", err)
} }
deviceFolderPermissionHooks, err := NewDeviceFolderPermissionHookDiscoverer(m.logger, driverRoot, nvidiaCTKPath, deviceSpecs) commonEdits.Append(deviceFolderPermissionEdits)
if err != nil {
return nil, fmt.Errorf("failed to generated permission hooks for device nodes: %v", err)
}
commonEdits, err := edits.FromDiscoverer(discover.Merge(common, deviceFolderPermissionHooks))
if err != nil {
return nil, fmt.Errorf("failed to create container edits for common entities: %v", err)
}
// We construct the spec and determine the minimum required version based on the specification. // We construct the spec and determine the minimum required version based on the specification.
spec := specs.Spec{ spec := specs.Spec{
@ -266,73 +267,6 @@ func (m command) generateSpec(driverRoot string, nvidiaCTKPath string, namer dev
return &spec, nil return &spec, nil
} }
func (m command) generateDeviceSpecs(devicelib device.Interface, driverRoot string, nvidiaCTKPath string, namer deviceNamer) ([]specs.Device, error) {
var deviceSpecs []specs.Device
err := devicelib.VisitDevices(func(i int, d device.Device) error {
isMigEnabled, err := d.IsMigEnabled()
if err != nil {
return fmt.Errorf("failed to check whether device is MIG device: %v", err)
}
if isMigEnabled {
return nil
}
device, err := NewFullGPUDiscoverer(m.logger, driverRoot, nvidiaCTKPath, d)
if err != nil {
return fmt.Errorf("failed to create device: %v", err)
}
deviceEdits, err := edits.FromDiscoverer(device)
if err != nil {
return fmt.Errorf("failed to create container edits for device: %v", err)
}
deviceName, err := namer.GetDeviceName(i, d)
if err != nil {
return fmt.Errorf("failed to get device name: %v", err)
}
deviceSpec := specs.Device{
Name: deviceName,
ContainerEdits: *deviceEdits.ContainerEdits,
}
deviceSpecs = append(deviceSpecs, deviceSpec)
return nil
})
if err != nil {
return nil, fmt.Errorf("failed to generate CDI spec for GPU devices: %v", err)
}
err = devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error {
device, err := NewMigDeviceDiscoverer(m.logger, "", d, mig)
if err != nil {
return fmt.Errorf("failed to create MIG device: %v", err)
}
deviceEdits, err := edits.FromDiscoverer(device)
if err != nil {
return fmt.Errorf("failed to create container edits for MIG device: %v", err)
}
deviceName, err := namer.GetMigDeviceName(i, j, mig)
if err != nil {
return fmt.Errorf("failed to get device name: %v", err)
}
deviceSpec := specs.Device{
Name: deviceName,
ContainerEdits: *deviceEdits.ContainerEdits,
}
deviceSpecs = append(deviceSpecs, deviceSpec)
return nil
})
if err != nil {
return nil, fmt.Errorf("falied to generate CDI spec for MIG devices: %v", err)
}
return deviceSpecs, nil
}
// createAllDevice creates an 'all' device which combines the edits from the previous devices // createAllDevice creates an 'all' device which combines the edits from the previous devices
func createAllDevice(deviceSpecs []specs.Device) specs.Device { func createAllDevice(deviceSpecs []specs.Device) specs.Device {
edits := edits.NewContainerEdits() edits := edits.NewContainerEdits()

View File

@ -1,75 +0,0 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package generate
import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
"github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
// NewMigDeviceDiscoverer creates a discoverer for the specified mig device and its parent.
func NewMigDeviceDiscoverer(logger *logrus.Logger, driverRoot string, parent device.Device, d device.MigDevice) (discover.Discover, error) {
minor, ret := parent.GetMinorNumber()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
}
parentPath := fmt.Sprintf("/dev/nvidia%d", minor)
migCaps, err := nvcaps.NewMigCaps()
if err != nil {
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
}
gi, ret := d.GetGpuInstanceId()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret)
}
ci, ret := d.GetComputeInstanceId()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret)
}
giCap := nvcaps.NewGPUInstanceCap(minor, gi)
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
if err != nil {
return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
}
ciCap := nvcaps.NewComputeInstanceCap(minor, gi, ci)
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
if err != nil {
return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
}
deviceNodes := discover.NewCharDeviceDiscoverer(
logger,
[]string{
parentPath,
giCapDevicePath,
ciCapDevicePath,
},
driverRoot,
)
return deviceNodes, nil
}

33
pkg/nvcdi/api.go Normal file
View File

@ -0,0 +1,33 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package nvcdi
import (
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
)
// Interface defines the API for the nvcdi package
type Interface interface {
GetCommonEdits() (*cdi.ContainerEdits, error)
GetAllDeviceSpecs() ([]specs.Device, error)
GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error)
GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error)
GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error)
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error)
}

View File

@ -14,20 +14,33 @@
# limitations under the License. # limitations under the License.
**/ **/
package generate package nvcdi
import ( import (
"fmt" "fmt"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
) )
// NewCommonDiscoverer returns a discoverer for entities that are not associated with a specific CDI device. // GetCommonEdits generates a CDI specification that can be used for ANY devices
func (l *nvcdilib) GetCommonEdits() (*cdi.ContainerEdits, error) {
common, err := newCommonDiscoverer(l.logger, l.driverRoot, l.nvidiaCTKPath, l.nvmllib)
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err)
}
return edits.FromDiscoverer(common)
}
// newCommonDiscoverer returns a discoverer for entities that are not associated with a specific CDI device.
// This includes driver libraries and meta devices, for example. // This includes driver libraries and meta devices, for example.
func NewCommonDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, nvmllib nvml.Interface) (discover.Discover, error) { func newCommonDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, nvmllib nvml.Interface) (discover.Discover, error) {
metaDevices := discover.NewDeviceDiscoverer( metaDevices := discover.NewDeviceDiscoverer(
logger, logger,
lookup.NewCharDeviceLocator( lookup.NewCharDeviceLocator(

View File

@ -14,7 +14,7 @@
# limitations under the License. # limitations under the License.
**/ **/
package generate package nvcdi
import ( import (
"fmt" "fmt"

View File

@ -14,7 +14,7 @@
# limitations under the License. # limitations under the License.
**/ **/
package generate package nvcdi
import ( import (
"fmt" "fmt"
@ -23,12 +23,50 @@ import (
"strings" "strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm" "github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
) )
// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
func (l *nvcdilib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) {
edits, err := l.GetGPUDeviceEdits(d)
if err != nil {
return nil, fmt.Errorf("failed to get edits for device: %v", err)
}
name, err := l.deviceNamer.GetDeviceName(i, d)
if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err)
}
spec := specs.Device{
Name: name,
ContainerEdits: *edits.ContainerEdits,
}
return &spec, nil
}
// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'.
func (l *nvcdilib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error) {
device, err := newFullGPUDiscoverer(l.logger, l.driverRoot, l.nvidiaCTKPath, d)
if err != nil {
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
}
editsForDevice, err := edits.FromDiscoverer(device)
if err != nil {
return nil, fmt.Errorf("failed to create container edits for device: %v", err)
}
return editsForDevice, nil
}
// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links // byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links
type byPathHookDiscoverer struct { type byPathHookDiscoverer struct {
logger *logrus.Logger logger *logrus.Logger
@ -39,8 +77,8 @@ type byPathHookDiscoverer struct {
var _ discover.Discover = (*byPathHookDiscoverer)(nil) var _ discover.Discover = (*byPathHookDiscoverer)(nil)
// NewFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device. // newFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device.
func NewFullGPUDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, d device.Device) (discover.Discover, error) { func newFullGPUDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, d device.Device) (discover.Discover, error) {
// TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface. // TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface.
// This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin. // This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin.
minor, ret := d.GetMinorNumber() minor, ret := d.GetMinorNumber()

114
pkg/nvcdi/lib.go Normal file
View File

@ -0,0 +1,114 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package nvcdi
import (
"fmt"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
"github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
type nvcdilib struct {
logger *logrus.Logger
nvmllib nvml.Interface
devicelib device.Interface
deviceNamer DeviceNamer
driverRoot string
nvidiaCTKPath string
}
// New creates a new nvcdi library
func New(opts ...Option) Interface {
l := &nvcdilib{}
if l.nvmllib == nil {
l.nvmllib = nvml.New()
}
if l.devicelib == nil {
l.devicelib = device.New(device.WithNvml(l.nvmllib))
}
if l.logger == nil {
l.logger = logrus.StandardLogger()
}
if l.deviceNamer == nil {
l.deviceNamer, _ = NewDeviceNamer(DeviceNameStrategyIndex)
}
if l.driverRoot == "" {
l.driverRoot = "/"
}
if l.nvidiaCTKPath == "" {
l.nvidiaCTKPath = "/usr/bin/nvidia-ctk"
}
return l
}
// GetAllDeviceSpecs returns the device specs for all available devices.
func (l *nvcdilib) GetAllDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device
gpuDeviceSpecs, err := l.getGPUDeviceSpecs()
if err != nil {
return nil, err
}
deviceSpecs = append(deviceSpecs, gpuDeviceSpecs...)
migDeviceSpecs, err := l.getMigDeviceSpecs()
if err != nil {
return nil, err
}
deviceSpecs = append(deviceSpecs, migDeviceSpecs...)
return deviceSpecs, nil
}
func (l *nvcdilib) getGPUDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device
err := l.devicelib.VisitDevices(func(i int, d device.Device) error {
deviceSpec, err := l.GetGPUDeviceSpecs(i, d)
if err != nil {
return err
}
deviceSpecs = append(deviceSpecs, *deviceSpec)
return nil
})
if err != nil {
return nil, fmt.Errorf("failed to generate CDI edits for GPU devices: %v", err)
}
return deviceSpecs, err
}
func (l *nvcdilib) getMigDeviceSpecs() ([]specs.Device, error) {
var deviceSpecs []specs.Device
err := l.devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error {
deviceSpec, err := l.GetMIGDeviceSpecs(i, d, j, mig)
if err != nil {
return err
}
deviceSpecs = append(deviceSpecs, *deviceSpec)
return nil
})
if err != nil {
return nil, fmt.Errorf("failed to generate CDI edits for GPU devices: %v", err)
}
return deviceSpecs, err
}

124
pkg/nvcdi/mig-device.go Normal file
View File

@ -0,0 +1,124 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package nvcdi
import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
"github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
// GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
func (l *nvcdilib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) (*specs.Device, error) {
edits, err := l.GetMIGDeviceEdits(d, mig)
if err != nil {
return nil, fmt.Errorf("failed to get edits for device: %v", err)
}
name, err := l.deviceNamer.GetMigDeviceName(i, d, j, mig)
if err != nil {
return nil, fmt.Errorf("failed to get device name: %v", err)
}
spec := specs.Device{
Name: name,
ContainerEdits: *edits.ContainerEdits,
}
return &spec, nil
}
// GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'.
func (l *nvcdilib) GetMIGDeviceEdits(parent device.Device, mig device.MigDevice) (*cdi.ContainerEdits, error) {
gpu, ret := parent.GetMinorNumber()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting GPU minor: %v", ret)
}
gi, ret := mig.GetGpuInstanceId()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret)
}
ci, ret := mig.GetComputeInstanceId()
if ret != nvml.SUCCESS {
return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret)
}
editsForDevice, err := GetEditsForComputeInstance(l.logger, l.driverRoot, gpu, gi, ci)
if err != nil {
return nil, fmt.Errorf("failed to create container edits for MIG device: %v", err)
}
return editsForDevice, nil
}
// GetEditsForComputeInstance returns the CDI edits for a particular compute instance defined by the (gpu, gi, ci) tuple
func GetEditsForComputeInstance(logger *logrus.Logger, driverRoot string, gpu int, gi int, ci int) (*cdi.ContainerEdits, error) {
computeInstance, err := newComputeInstanceDiscoverer(logger, driverRoot, gpu, gi, ci)
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for Compute Instance: %v", err)
}
editsForDevice, err := edits.FromDiscoverer(computeInstance)
if err != nil {
return nil, fmt.Errorf("failed to create container edits for Compute Instance: %v", err)
}
return editsForDevice, nil
}
// newComputeInstanceDiscoverer returns a discoverer for the specified compute instance
func newComputeInstanceDiscoverer(logger *logrus.Logger, driverRoot string, gpu int, gi int, ci int) (discover.Discover, error) {
parentPath := fmt.Sprintf("/dev/nvidia%d", gpu)
migCaps, err := nvcaps.NewMigCaps()
if err != nil {
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
}
giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
if err != nil {
return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
}
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
if err != nil {
return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
}
deviceNodes := discover.NewCharDeviceDiscoverer(
logger,
[]string{
parentPath,
giCapDevicePath,
ciCapDevicePath,
},
driverRoot,
)
return deviceNodes, nil
}

View File

@ -14,7 +14,7 @@
# limitations under the License. # limitations under the License.
**/ **/
package generate package nvcdi
import ( import (
"fmt" "fmt"
@ -23,15 +23,20 @@ import (
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
) )
type deviceNamer interface { // DeviceNamer is an interface for getting device names
type DeviceNamer interface {
GetDeviceName(int, device.Device) (string, error) GetDeviceName(int, device.Device) (string, error)
GetMigDeviceName(int, int, device.MigDevice) (string, error) GetMigDeviceName(int, device.Device, int, device.MigDevice) (string, error)
} }
// Supported device naming strategies
const ( const (
deviceNameStrategyIndex = "index" // DeviceNameStrategyIndex generates devices names such as 0 or 1:0
deviceNameStrategyTypeIndex = "type-index" DeviceNameStrategyIndex = "index"
deviceNameStrategyUUID = "uuid" // DeviceNameStrategyTypeIndex generates devices names such as gpu0 or mig1:0
DeviceNameStrategyTypeIndex = "type-index"
// DeviceNameStrategyUUID uses the device UUID as the name
DeviceNameStrategyUUID = "uuid"
) )
type deviceNameIndex struct { type deviceNameIndex struct {
@ -40,15 +45,15 @@ type deviceNameIndex struct {
} }
type deviceNameUUID struct{} type deviceNameUUID struct{}
// newDeviceNamer creates a Device Namer based on the supplied strategy. // NewDeviceNamer creates a Device Namer based on the supplied strategy.
// This namer can be used to construct the names for MIG and GPU devices when generating the CDI spec. // This namer can be used to construct the names for MIG and GPU devices when generating the CDI spec.
func newDeviceNamer(strategy string) (deviceNamer, error) { func NewDeviceNamer(strategy string) (DeviceNamer, error) {
switch strategy { switch strategy {
case deviceNameStrategyIndex: case DeviceNameStrategyIndex:
return deviceNameIndex{}, nil return deviceNameIndex{}, nil
case deviceNameStrategyTypeIndex: case DeviceNameStrategyTypeIndex:
return deviceNameIndex{gpuPrefix: "gpu", migPrefix: "mig"}, nil return deviceNameIndex{gpuPrefix: "gpu", migPrefix: "mig"}, nil
case deviceNameStrategyUUID: case DeviceNameStrategyUUID:
return deviceNameUUID{}, nil return deviceNameUUID{}, nil
} }
@ -61,7 +66,7 @@ func (s deviceNameIndex) GetDeviceName(i int, d device.Device) (string, error) {
} }
// GetMigDeviceName returns the name for the specified device based on the naming strategy // GetMigDeviceName returns the name for the specified device based on the naming strategy
func (s deviceNameIndex) GetMigDeviceName(i int, j int, d device.MigDevice) (string, error) { func (s deviceNameIndex) GetMigDeviceName(i int, d device.Device, j int, mig device.MigDevice) (string, error) {
return fmt.Sprintf("%s%d:%d", s.migPrefix, i, j), nil return fmt.Sprintf("%s%d:%d", s.migPrefix, i, j), nil
} }
@ -75,8 +80,8 @@ func (s deviceNameUUID) GetDeviceName(i int, d device.Device) (string, error) {
} }
// GetMigDeviceName returns the name for the specified device based on the naming strategy // GetMigDeviceName returns the name for the specified device based on the naming strategy
func (s deviceNameUUID) GetMigDeviceName(i int, j int, d device.MigDevice) (string, error) { func (s deviceNameUUID) GetMigDeviceName(i int, d device.Device, j int, mig device.MigDevice) (string, error) {
uuid, ret := d.GetUUID() uuid, ret := mig.GetUUID()
if ret != nvml.SUCCESS { if ret != nvml.SUCCESS {
return "", fmt.Errorf("failed to get device UUID: %v", ret) return "", fmt.Errorf("failed to get device UUID: %v", ret)
} }

68
pkg/nvcdi/options.go Normal file
View File

@ -0,0 +1,68 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package nvcdi
import (
"github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
// Option is a function that configures the nvcdilib
type Option func(*nvcdilib)
// WithDeviceLib sets the device library for the library
func WithDeviceLib(devicelib device.Interface) Option {
return func(l *nvcdilib) {
l.devicelib = devicelib
}
}
// WithDeviceNamer sets the device namer for the library
func WithDeviceNamer(namer DeviceNamer) Option {
return func(l *nvcdilib) {
l.deviceNamer = namer
}
}
// WithDriverRoot sets the driver root for the library
func WithDriverRoot(root string) Option {
return func(l *nvcdilib) {
l.driverRoot = root
}
}
// WithLogger sets the logger for the library
func WithLogger(logger *logrus.Logger) Option {
return func(l *nvcdilib) {
l.logger = logger
}
}
// WithNVIDIACTKPath sets the path to the NVIDIA Container Toolkit CLI path for the library
func WithNVIDIACTKPath(path string) Option {
return func(l *nvcdilib) {
l.nvidiaCTKPath = path
}
}
// WithNvmlLib sets the nvml library for the library
func WithNvmlLib(nvmllib nvml.Interface) Option {
return func(l *nvcdilib) {
l.nvmllib = nvmllib
}
}