From 29cbbe83f97936fc0ca3c7189b0fa8f1691bd9a9 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 1 Mar 2023 12:16:38 +0200 Subject: [PATCH] Add management mode to CDI spec generation API These changes add support for generating a management spec to the nvcdi API. A management spec consists of a single CDI device (`all`) which includes all expected NVIDIA device nodes, driver libraries, binaries, and IPC sockets. Signed-off-by: Evan Lezar --- pkg/nvcdi/api.go | 2 + pkg/nvcdi/driver-nvml.go | 4 + pkg/nvcdi/lib.go | 5 ++ pkg/nvcdi/management.go | 182 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 193 insertions(+) create mode 100644 pkg/nvcdi/management.go diff --git a/pkg/nvcdi/api.go b/pkg/nvcdi/api.go index 85bace99..a1a26e2d 100644 --- a/pkg/nvcdi/api.go +++ b/pkg/nvcdi/api.go @@ -30,6 +30,8 @@ const ( ModeNvml = "nvml" // ModeWsl configures the CDI spec generator to generate a WSL spec. ModeWsl = "wsl" + // ModeManagement configures the CDI spec generator to generate a management spec. + ModeManagement = "management" ) // Interface defines the API for the nvcdi package diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go index 981ebda2..eee3ffd8 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/pkg/nvcdi/driver-nvml.go @@ -36,6 +36,10 @@ func NewDriverDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath return nil, fmt.Errorf("failed to determine driver version: %v", r) } + return newDriverVersionDiscoverer(logger, driverRoot, nvidiaCTKPath, version) +} + +func newDriverVersionDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, version string) (discover.Discover, error) { libraries, err := NewDriverLibraryDiscoverer(logger, driverRoot, nvidiaCTKPath, version) if err != nil { return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err) diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index c13a2ab5..ba94822e 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -73,6 +73,11 @@ func New(opts ...Option) Interface { var lib Interface switch l.resolveMode() { + case ModeManagement: + if l.vendor == "" { + l.vendor = "management.nvidia.com" + } + lib = (*managementlib)(l) case ModeNvml: if l.nvmllib == nil { l.nvmllib = nvml.New() diff --git a/pkg/nvcdi/management.go b/pkg/nvcdi/management.go new file mode 100644 index 00000000..484393a0 --- /dev/null +++ b/pkg/nvcdi/management.go @@ -0,0 +1,182 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvcdi + +import ( + "fmt" + "path/filepath" + "strings" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" + "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" + "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" + "github.com/container-orchestrated-devices/container-device-interface/specs-go" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" +) + +type managementlib nvcdilib + +var _ Interface = (*managementlib)(nil) + +// GetAllDeviceSpecs returns all device specs for use in managemnt containers. +// A single device with the name `all` is returned. +func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) { + devices, err := m.newManagementDeviceDiscoverer() + if err != nil { + return nil, fmt.Errorf("failed to create device discoverer: %v", err) + } + + edits, err := edits.FromDiscoverer(devices) + if err != nil { + return nil, fmt.Errorf("failed to create edits from discoverer: %v", err) + } + + if len(edits.DeviceNodes) == 0 { + return nil, fmt.Errorf("no NVIDIA device nodes found") + } + + device := specs.Device{ + Name: "all", + ContainerEdits: *edits.ContainerEdits, + } + return []specs.Device{device}, nil +} + +// GetCommonEdits returns the common edits for use in managementlib containers. +func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { + locator, err := lookup.NewLibraryLocator( + m.logger, + m.driverRoot, + ) + if err != nil { + return nil, fmt.Errorf("failed to create library locator: %v", err) + } + + candidates, err := locator.Locate("libcuda.so") + if err != nil { + return nil, fmt.Errorf("failed to locate libcuda.so: %v", err) + } + libcudaPath := candidates[0] + + version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.") + if version == "" { + return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath) + } + + driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version) + if err != nil { + return nil, fmt.Errorf("failed to create driver library discoverer: %v", err) + } + + edits, err := edits.FromDiscoverer(driver) + if err != nil { + return nil, fmt.Errorf("failed to create edits from discoverer: %v", err) + } + + return edits, nil +} + +type managementDiscoverer struct { + discover.Discover +} + +// newManagementDeviceDiscoverer returns a discover.Discover that discovers device nodes for use in managementlib containers. +// NVML is not used to query devices and all device nodes are returned. +func (m *managementlib) newManagementDeviceDiscoverer() (discover.Discover, error) { + deviceNodes := discover.NewCharDeviceDiscoverer( + m.logger, + []string{ + "/dev/nvidia*", + "/dev/nvidia-caps/nvidia-cap*", + "/dev/nvidia-modeset", + "/dev/nvidia-uvm-tools", + "/dev/nvidia-uvm", + "/dev/nvidiactl", + }, + m.driverRoot, + ) + + deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer( + m.logger, + m.driverRoot, + m.nvidiaCTKPath, + deviceNodes, + ) + + d := discover.Merge( + &managementDiscoverer{deviceNodes}, + deviceFolderPermissionHooks, + ) + return d, nil +} + +func (m *managementDiscoverer) Devices() ([]discover.Device, error) { + devices, err := m.Discover.Devices() + if err != nil { + return devices, err + } + + var filteredDevices []discover.Device + for _, device := range devices { + if m.nodeIsBlocked(device.HostPath) { + continue + } + filteredDevices = append(filteredDevices, device) + } + + return filteredDevices, nil +} + +// nodeIsBlocked returns true if the specified device node should be ignored. +func (m managementDiscoverer) nodeIsBlocked(path string) bool { + blockedPrefixes := []string{"nvidia-fs", "nvidia-nvswitch", "nvidia-nvlink"} + nodeName := filepath.Base(path) + for _, prefix := range blockedPrefixes { + if strings.HasPrefix(nodeName, prefix) { + return true + } + } + return false +} + +// GetSpec is unsppported for the managementlib specs. +// managementlib is typically wrapped by a spec that implements GetSpec. +func (m *managementlib) GetSpec() (spec.Interface, error) { + return nil, fmt.Errorf("GetSpec is not supported") +} + +// GetGPUDeviceEdits is unsupported for the managementlib specs +func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) { + return nil, fmt.Errorf("GetGPUDeviceEdits is not supported") +} + +// GetGPUDeviceSpecs is unsupported for the managementlib specs +func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) { + return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported") +} + +// GetMIGDeviceEdits is unsupported for the managementlib specs +func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) { + return nil, fmt.Errorf("GetMIGDeviceEdits is not supported") +} + +// GetMIGDeviceSpecs is unsupported for the managementlib specs +func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { + return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported") +}