Add devRoot option to CDI api

A driverRoot defines both the driver library root and the
root for device nodes. In the case of preinstalled drivers or
the driver container, these are equal, but in cases such as GKE
they do not match. In this case, drivers are extracted to a folder
and devices exist at the root /.

The changes here add a devRoot option to the nvcdi API that allows the
parent of /dev to be specified explicitly.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2023-11-14 16:57:37 +01:00
parent f6e3593a72
commit d4e21fdd10
15 changed files with 73 additions and 44 deletions

View File

@@ -20,22 +20,19 @@ import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/go-nvlib/pkg/nvml"
)
// newCommonNVMLDiscoverer returns a discoverer for entities that are not associated with a specific CDI device.
// This includes driver libraries and meta devices, for example.
func newCommonNVMLDiscoverer(logger logger.Interface, driverRoot string, nvidiaCTKPath string, nvmllib nvml.Interface) (discover.Discover, error) {
func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) {
metaDevices := discover.NewDeviceDiscoverer(
logger,
l.logger,
lookup.NewCharDeviceLocator(
lookup.WithLogger(logger),
lookup.WithRoot(driverRoot),
lookup.WithLogger(l.logger),
lookup.WithRoot(l.devRoot),
),
driverRoot,
l.devRoot,
[]string{
"/dev/nvidia-modeset",
"/dev/nvidia-uvm-tools",
@@ -44,12 +41,12 @@ func newCommonNVMLDiscoverer(logger logger.Interface, driverRoot string, nvidiaC
},
)
graphicsMounts, err := discover.NewGraphicsMountsDiscoverer(logger, driverRoot, nvidiaCTKPath)
graphicsMounts, err := discover.NewGraphicsMountsDiscoverer(l.logger, l.driverRoot, l.nvidiaCTKPath)
if err != nil {
logger.Warningf("failed to create discoverer for graphics mounts: %v", err)
l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err)
}
driverFiles, err := NewDriverDiscoverer(logger, driverRoot, nvidiaCTKPath, nvmllib)
driverFiles, err := NewDriverDiscoverer(l.logger, l.driverRoot, l.nvidiaCTKPath, l.nvmllib)
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err)
}

View File

@@ -26,11 +26,11 @@ const (
)
// newDXGDeviceDiscoverer returns a Discoverer for DXG devices under WSL2.
func newDXGDeviceDiscoverer(logger logger.Interface, driverRoot string) discover.Discover {
func newDXGDeviceDiscoverer(logger logger.Interface, devRoot string) discover.Discover {
deviceNodes := discover.NewCharDeviceDiscoverer(
logger,
[]string{dxgDeviceNode},
driverRoot,
devRoot,
)
return deviceNodes

View File

@@ -54,7 +54,7 @@ func (l *nvmllib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, erro
// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'.
func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error) {
device, err := newFullGPUDiscoverer(l.logger, l.driverRoot, l.nvidiaCTKPath, d)
device, err := newFullGPUDiscoverer(l.logger, l.devRoot, l.nvidiaCTKPath, d)
if err != nil {
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
}
@@ -70,7 +70,7 @@ func (l *nvmllib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error
// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links
type byPathHookDiscoverer struct {
logger logger.Interface
driverRoot string
devRoot string
nvidiaCTKPath string
pciBusID string
deviceNodes discover.Discover
@@ -79,7 +79,7 @@ type byPathHookDiscoverer struct {
var _ discover.Discover = (*byPathHookDiscoverer)(nil)
// newFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device.
func newFullGPUDiscoverer(logger logger.Interface, driverRoot string, nvidiaCTKPath string, d device.Device) (discover.Discover, error) {
func newFullGPUDiscoverer(logger logger.Interface, devRoot string, nvidiaCTKPath string, d device.Device) (discover.Discover, error) {
// TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface.
// This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin.
minor, ret := d.GetMinorNumber()
@@ -104,12 +104,12 @@ func newFullGPUDiscoverer(logger logger.Interface, driverRoot string, nvidiaCTKP
deviceNodes := discover.NewCharDeviceDiscoverer(
logger,
deviceNodePaths,
driverRoot,
devRoot,
)
byPathHooks := &byPathHookDiscoverer{
logger: logger,
driverRoot: driverRoot,
devRoot: devRoot,
nvidiaCTKPath: nvidiaCTKPath,
pciBusID: pciBusID,
deviceNodes: deviceNodes,
@@ -117,7 +117,7 @@ func newFullGPUDiscoverer(logger logger.Interface, driverRoot string, nvidiaCTKP
deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer(
logger,
driverRoot,
devRoot,
nvidiaCTKPath,
deviceNodes,
)
@@ -189,7 +189,7 @@ func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) {
var links []string
for _, c := range candidates {
linkPath := filepath.Join(d.driverRoot, c)
linkPath := filepath.Join(d.devRoot, c)
device, err := os.Readlink(linkPath)
if err != nil {
d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath)

View File

@@ -33,7 +33,7 @@ var _ Interface = (*gdslib)(nil)
// GetAllDeviceSpecs returns the device specs for all available devices.
func (l *gdslib) GetAllDeviceSpecs() ([]specs.Device, error) {
discoverer, err := discover.NewGDSDiscoverer(l.logger, l.driverRoot)
discoverer, err := discover.NewGDSDiscoverer(l.logger, l.driverRoot, l.devRoot)
if err != nil {
return nil, fmt.Errorf("failed to create GPUDirect Storage discoverer: %v", err)
}

View File

@@ -42,6 +42,7 @@ func (l *csvlib) GetAllDeviceSpecs() ([]specs.Device, error) {
d, err := tegra.New(
tegra.WithLogger(l.logger),
tegra.WithDriverRoot(l.driverRoot),
tegra.WithDevRoot(l.devRoot),
tegra.WithNVIDIACTKPath(l.nvidiaCTKPath),
tegra.WithCSVFiles(l.csvFiles),
tegra.WithLibrarySearchPaths(l.librarySearchPaths...),

View File

@@ -66,7 +66,7 @@ func (l *nvmllib) GetAllDeviceSpecs() ([]specs.Device, error) {
// GetCommonEdits generates a CDI specification that can be used for ANY devices
func (l *nvmllib) GetCommonEdits() (*cdi.ContainerEdits, error) {
common, err := newCommonNVMLDiscoverer(l.logger, l.driverRoot, l.nvidiaCTKPath, l.nvmllib)
common, err := l.newCommonNVMLDiscoverer()
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err)
}

View File

@@ -37,7 +37,7 @@ func (l *wsllib) GetSpec() (spec.Interface, error) {
// GetAllDeviceSpecs returns the device specs for all available devices.
func (l *wsllib) GetAllDeviceSpecs() ([]specs.Device, error) {
device := newDXGDeviceDiscoverer(l.logger, l.driverRoot)
device := newDXGDeviceDiscoverer(l.logger, l.devRoot)
deviceEdits, err := edits.FromDiscoverer(device)
if err != nil {
return nil, fmt.Errorf("failed to create container edits for DXG device: %v", err)

View File

@@ -44,6 +44,7 @@ type nvcdilib struct {
devicelib device.Interface
deviceNamer DeviceNamer
driverRoot string
devRoot string
nvidiaCTKPath string
librarySearchPaths []string
@@ -76,6 +77,9 @@ func New(opts ...Option) (Interface, error) {
if l.driverRoot == "" {
l.driverRoot = "/"
}
if l.devRoot == "" {
l.devRoot = l.driverRoot
}
if l.nvidiaCTKPath == "" {
l.nvidiaCTKPath = "/usr/bin/nvidia-ctk"
}

View File

@@ -117,12 +117,12 @@ func (m *managementlib) newManagementDeviceDiscoverer() (discover.Discover, erro
"/dev/nvidia-uvm",
"/dev/nvidiactl",
},
m.driverRoot,
m.devRoot,
)
deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer(
m.logger,
m.driverRoot,
m.devRoot,
m.nvidiaCTKPath,
deviceNodes,
)

View File

@@ -47,6 +47,13 @@ func WithDriverRoot(root string) Option {
}
}
// WithDevRoot sets the root where /dev is located.
func WithDevRoot(root string) Option {
return func(l *nvcdilib) {
l.devRoot = root
}
}
// WithLogger sets the logger for the library
func WithLogger(logger logger.Interface) Option {
return func(l *nvcdilib) {

View File

@@ -26,7 +26,7 @@ import (
type deviceFolderPermissions struct {
logger logger.Interface
driverRoot string
devRoot string
nvidiaCTKPath string
devices discover.Discover
}
@@ -39,10 +39,10 @@ var _ discover.Discover = (*deviceFolderPermissions)(nil)
// The nested devices that are applicable to the NVIDIA GPU devices are:
// - DRM devices at /dev/dri/*
// - NVIDIA Caps devices at /dev/nvidia-caps/*
func newDeviceFolderPermissionHookDiscoverer(logger logger.Interface, driverRoot string, nvidiaCTKPath string, devices discover.Discover) discover.Discover {
func newDeviceFolderPermissionHookDiscoverer(logger logger.Interface, devRoot string, nvidiaCTKPath string, devices discover.Discover) discover.Discover {
d := &deviceFolderPermissions{
logger: logger,
driverRoot: driverRoot,
devRoot: devRoot,
nvidiaCTKPath: nvidiaCTKPath,
devices: devices,
}