Merge branch 'CNT-3965/clean-up-by-path-symlinks' into 'main'

Improve handling of /dev/dri devices and nested device paths

See merge request nvidia/container-toolkit/container-toolkit!307
This commit is contained in:
Evan Lezar 2023-03-01 10:25:48 +00:00
commit accba4ead5
3 changed files with 77 additions and 51 deletions

View File

@ -273,12 +273,6 @@ func (m command) generateSpec(cfg *config) (*specs.Spec, error) {
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to create edits common for entities: %v", err) return nil, fmt.Errorf("failed to create edits common for entities: %v", err)
} }
deviceFolderPermissionEdits, err := GetDeviceFolderPermissionHookEdits(m.logger, cfg.driverRoot, cfg.nvidiaCTKPath, deviceSpecs)
if err != nil {
return nil, fmt.Errorf("failed to generated edits for device folder permissions: %v", err)
}
commonEdits.Append(deviceFolderPermissionEdits)
// We construct the spec and determine the minimum required version based on the specification. // We construct the spec and determine the minimum required version based on the specification.
spec := specs.Spec{ spec := specs.Spec{

View File

@ -73,6 +73,7 @@ type byPathHookDiscoverer struct {
driverRoot string driverRoot string
nvidiaCTKPath string nvidiaCTKPath string
pciBusID string pciBusID string
deviceNodes discover.Discover
} }
var _ discover.Discover = (*byPathHookDiscoverer)(nil) var _ discover.Discover = (*byPathHookDiscoverer)(nil)
@ -111,11 +112,20 @@ func newFullGPUDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPat
driverRoot: driverRoot, driverRoot: driverRoot,
nvidiaCTKPath: nvidiaCTKPath, nvidiaCTKPath: nvidiaCTKPath,
pciBusID: pciBusID, pciBusID: pciBusID,
deviceNodes: deviceNodes,
} }
deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer(
logger,
driverRoot,
nvidiaCTKPath,
deviceNodes,
)
dd := discover.Merge( dd := discover.Merge(
deviceNodes, deviceNodes,
byPathHooks, byPathHooks,
deviceFolderPermissionHooks,
) )
return dd, nil return dd, nil
@ -158,6 +168,20 @@ func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) {
} }
func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) { func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) {
devices, err := d.deviceNodes.Devices()
if err != nil {
return nil, fmt.Errorf("failed to discover device nodes: %v", err)
}
if len(devices) == 0 {
return nil, nil
}
selectedDevices := make(map[string]bool)
for _, d := range devices {
selectedDevices[d.HostPath] = true
}
candidates := []string{ candidates := []string{
fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID), fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID),
fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID), fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID),
@ -172,6 +196,14 @@ func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) {
continue continue
} }
deviceNode := device
if !filepath.IsAbs(device) {
deviceNode = filepath.Join(filepath.Dir(linkPath), device)
}
if !selectedDevices[deviceNode] {
d.logger.Debugf("ignoring device symlink %v -> %v since %v is not mounted", linkPath, device, deviceNode)
continue
}
d.logger.Debugf("adding device symlink %v -> %v", linkPath, device) d.logger.Debugf("adding device symlink %v -> %v", linkPath, device)
links = append(links, fmt.Sprintf("%v::%v", device, linkPath)) links = append(links, fmt.Sprintf("%v::%v", device, linkPath))
} }

View File

@ -14,16 +14,13 @@
# limitations under the License. # limitations under the License.
**/ **/
package generate package nvcdi
import ( import (
"fmt" "fmt"
"path/filepath" "path/filepath"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
) )
@ -31,60 +28,26 @@ type deviceFolderPermissions struct {
logger *logrus.Logger logger *logrus.Logger
driverRoot string driverRoot string
nvidiaCTKPath string nvidiaCTKPath string
folders []string devices discover.Discover
} }
var _ discover.Discover = (*deviceFolderPermissions)(nil) var _ discover.Discover = (*deviceFolderPermissions)(nil)
// GetDeviceFolderPermissionHookEdits gets the edits required for device folder permissions discoverer // newDeviceFolderPermissionHookDiscoverer creates a discoverer that can be used to update the permissions for the parent folders of nested device nodes returned by the specified devices discoverer.
func GetDeviceFolderPermissionHookEdits(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, deviceSpecs []specs.Device) (*cdi.ContainerEdits, error) {
deviceFolderPermissionHooks, err := NewDeviceFolderPermissionHookDiscoverer(logger, driverRoot, nvidiaCTKPath, deviceSpecs)
if err != nil {
return nil, fmt.Errorf("failed to generated permission hooks for device nodes: %v", err)
}
return edits.FromDiscoverer(deviceFolderPermissionHooks)
}
// NewDeviceFolderPermissionHookDiscoverer creates a discoverer that can be used to update the permissions for the parent folders of nested device nodes from the specified set of device specs.
// This works around an issue with rootless podman when using crun as a low-level runtime. // This works around an issue with rootless podman when using crun as a low-level runtime.
// See https://github.com/containers/crun/issues/1047 // See https://github.com/containers/crun/issues/1047
// The nested devices that are applicable to the NVIDIA GPU devices are: // The nested devices that are applicable to the NVIDIA GPU devices are:
// - DRM devices at /dev/dri/* // - DRM devices at /dev/dri/*
// - NVIDIA Caps devices at /dev/nvidia-caps/* // - NVIDIA Caps devices at /dev/nvidia-caps/*
func NewDeviceFolderPermissionHookDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, deviceSpecs []specs.Device) (discover.Discover, error) { func newDeviceFolderPermissionHookDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, devices discover.Discover) discover.Discover {
var folders []string
seen := make(map[string]bool)
for _, device := range deviceSpecs {
for _, dn := range device.ContainerEdits.DeviceNodes {
df := filepath.Dir(dn.Path)
if seen[df] {
continue
}
// We only consider the special case paths
if df != "/dev/dri" && df != "/dev/nvidia-caps" {
continue
}
folders = append(folders, df)
seen[df] = true
}
if len(folders) == 2 {
break
}
}
if len(folders) == 0 {
return discover.None{}, nil
}
d := &deviceFolderPermissions{ d := &deviceFolderPermissions{
logger: logger, logger: logger,
driverRoot: driverRoot, driverRoot: driverRoot,
nvidiaCTKPath: nvidiaCTKPath, nvidiaCTKPath: nvidiaCTKPath,
folders: folders, devices: devices,
} }
return d, nil return d
} }
// Devices are empty for this discoverer // Devices are empty for this discoverer
@ -94,12 +57,16 @@ func (d *deviceFolderPermissions) Devices() ([]discover.Device, error) {
// Hooks returns a set of hooks that sets the file mode to 755 of parent folders for nested device nodes. // Hooks returns a set of hooks that sets the file mode to 755 of parent folders for nested device nodes.
func (d *deviceFolderPermissions) Hooks() ([]discover.Hook, error) { func (d *deviceFolderPermissions) Hooks() ([]discover.Hook, error) {
if len(d.folders) == 0 { folders, err := d.getDeviceSubfolders()
if err != nil {
return nil, fmt.Errorf("failed to get device subfolders: %v", err)
}
if len(folders) == 0 {
return nil, nil return nil, nil
} }
args := []string{"--mode", "755"} args := []string{"--mode", "755"}
for _, folder := range d.folders { for _, folder := range folders {
args = append(args, "--path", folder) args = append(args, "--path", folder)
} }
@ -112,6 +79,39 @@ func (d *deviceFolderPermissions) Hooks() ([]discover.Hook, error) {
return []discover.Hook{hook}, nil return []discover.Hook{hook}, nil
} }
// getDeviceSubfolders returns the distinct parent folders of the device nodes
// reported by the wrapped discoverer, keeping only the special-case folders
// /dev/dri and /dev/nvidia-caps. Folders are listed in the order in which they
// are first encountered; a nil slice is returned when none match.
func (d *deviceFolderPermissions) getDeviceSubfolders() ([]string, error) {
	discovered, err := d.devices.Devices()
	if err != nil {
		return nil, fmt.Errorf("failed to get devices: %v", err)
	}

	// pending holds the special-case folders that have not been emitted yet.
	// An entry is removed once its folder is recorded, so each folder is
	// returned at most once and the scan stops as soon as every one is found.
	pending := map[string]bool{
		"/dev/dri":         true,
		"/dev/nvidia-caps": true,
	}

	var result []string
	for i := 0; i < len(discovered) && len(pending) > 0; i++ {
		parent := filepath.Dir(discovered[i].Path)
		if !pending[parent] {
			// Either not a special-case folder or already recorded.
			continue
		}
		delete(pending, parent)
		result = append(result, parent)
	}

	return result, nil
}
// Mounts are empty for this discoverer // Mounts are empty for this discoverer
func (d *deviceFolderPermissions) Mounts() ([]discover.Mount, error) { func (d *deviceFolderPermissions) Mounts() ([]discover.Mount, error) {
return nil, nil return nil, nil