mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-22 00:08:11 +00:00
Add management mode to CDI spec generation API
These changes add support for generating a management spec to the nvcdi API. A management spec consists of a single CDI device (`all`) which includes all expected NVIDIA device nodes, driver libraries, binaries, and IPC sockets. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
parent
64b16acb1f
commit
29cbbe83f9
@ -30,6 +30,8 @@ const (
|
||||
ModeNvml = "nvml"
|
||||
// ModeWsl configures the CDI spec generator to generate a WSL spec.
|
||||
ModeWsl = "wsl"
|
||||
// ModeManagement configures the CDI spec generator to generate a management spec.
|
||||
ModeManagement = "management"
|
||||
)
|
||||
|
||||
// Interface defines the API for the nvcdi package
|
||||
|
@ -36,6 +36,10 @@ func NewDriverDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath
|
||||
return nil, fmt.Errorf("failed to determine driver version: %v", r)
|
||||
}
|
||||
|
||||
return newDriverVersionDiscoverer(logger, driverRoot, nvidiaCTKPath, version)
|
||||
}
|
||||
|
||||
func newDriverVersionDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, version string) (discover.Discover, error) {
|
||||
libraries, err := NewDriverLibraryDiscoverer(logger, driverRoot, nvidiaCTKPath, version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err)
|
||||
|
@ -73,6 +73,11 @@ func New(opts ...Option) Interface {
|
||||
|
||||
var lib Interface
|
||||
switch l.resolveMode() {
|
||||
case ModeManagement:
|
||||
if l.vendor == "" {
|
||||
l.vendor = "management.nvidia.com"
|
||||
}
|
||||
lib = (*managementlib)(l)
|
||||
case ModeNvml:
|
||||
if l.nvmllib == nil {
|
||||
l.nvmllib = nvml.New()
|
||||
|
182
pkg/nvcdi/management.go
Normal file
182
pkg/nvcdi/management.go
Normal file
@ -0,0 +1,182 @@
|
||||
/**
|
||||
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||
)
|
||||
|
||||
// managementlib is a distinct named type whose underlying type is nvcdilib.
// New converts the configured *nvcdilib to *managementlib when the resolved
// mode is ModeManagement, so the methods defined on this type provide the
// management-mode implementation of Interface.
type managementlib nvcdilib
|
||||
|
||||
// Compile-time check that *managementlib implements Interface.
var _ Interface = (*managementlib)(nil)
|
||||
|
||||
// GetAllDeviceSpecs returns all device specs for use in managemnt containers.
|
||||
// A single device with the name `all` is returned.
|
||||
func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
|
||||
devices, err := m.newManagementDeviceDiscoverer()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
|
||||
}
|
||||
|
||||
edits, err := edits.FromDiscoverer(devices)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create edits from discoverer: %v", err)
|
||||
}
|
||||
|
||||
if len(edits.DeviceNodes) == 0 {
|
||||
return nil, fmt.Errorf("no NVIDIA device nodes found")
|
||||
}
|
||||
|
||||
device := specs.Device{
|
||||
Name: "all",
|
||||
ContainerEdits: *edits.ContainerEdits,
|
||||
}
|
||||
return []specs.Device{device}, nil
|
||||
}
|
||||
|
||||
// GetCommonEdits returns the common edits for use in managementlib containers.
|
||||
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
||||
locator, err := lookup.NewLibraryLocator(
|
||||
m.logger,
|
||||
m.driverRoot,
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create library locator: %v", err)
|
||||
}
|
||||
|
||||
candidates, err := locator.Locate("libcuda.so")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
|
||||
}
|
||||
libcudaPath := candidates[0]
|
||||
|
||||
version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.")
|
||||
if version == "" {
|
||||
return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
|
||||
}
|
||||
|
||||
driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create driver library discoverer: %v", err)
|
||||
}
|
||||
|
||||
edits, err := edits.FromDiscoverer(driver)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create edits from discoverer: %v", err)
|
||||
}
|
||||
|
||||
return edits, nil
|
||||
}
|
||||
|
||||
// managementDiscoverer wraps a discover.Discover and overrides Devices so that
// device nodes for which nodeIsBlocked reports true are dropped from the result.
// All other methods are those of the embedded discoverer.
type managementDiscoverer struct {
|
||||
discover.Discover
|
||||
}
|
||||
|
||||
// newManagementDeviceDiscoverer returns a discover.Discover that discovers device nodes for use in managementlib containers.
|
||||
// NVML is not used to query devices and all device nodes are returned.
|
||||
func (m *managementlib) newManagementDeviceDiscoverer() (discover.Discover, error) {
|
||||
deviceNodes := discover.NewCharDeviceDiscoverer(
|
||||
m.logger,
|
||||
[]string{
|
||||
"/dev/nvidia*",
|
||||
"/dev/nvidia-caps/nvidia-cap*",
|
||||
"/dev/nvidia-modeset",
|
||||
"/dev/nvidia-uvm-tools",
|
||||
"/dev/nvidia-uvm",
|
||||
"/dev/nvidiactl",
|
||||
},
|
||||
m.driverRoot,
|
||||
)
|
||||
|
||||
deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer(
|
||||
m.logger,
|
||||
m.driverRoot,
|
||||
m.nvidiaCTKPath,
|
||||
deviceNodes,
|
||||
)
|
||||
|
||||
d := discover.Merge(
|
||||
&managementDiscoverer{deviceNodes},
|
||||
deviceFolderPermissionHooks,
|
||||
)
|
||||
return d, nil
|
||||
}
|
||||
|
||||
func (m *managementDiscoverer) Devices() ([]discover.Device, error) {
|
||||
devices, err := m.Discover.Devices()
|
||||
if err != nil {
|
||||
return devices, err
|
||||
}
|
||||
|
||||
var filteredDevices []discover.Device
|
||||
for _, device := range devices {
|
||||
if m.nodeIsBlocked(device.HostPath) {
|
||||
continue
|
||||
}
|
||||
filteredDevices = append(filteredDevices, device)
|
||||
}
|
||||
|
||||
return filteredDevices, nil
|
||||
}
|
||||
|
||||
// nodeIsBlocked returns true if the specified device node should be ignored.
|
||||
func (m managementDiscoverer) nodeIsBlocked(path string) bool {
|
||||
blockedPrefixes := []string{"nvidia-fs", "nvidia-nvswitch", "nvidia-nvlink"}
|
||||
nodeName := filepath.Base(path)
|
||||
for _, prefix := range blockedPrefixes {
|
||||
if strings.HasPrefix(nodeName, prefix) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// GetSpec is unsppported for the managementlib specs.
|
||||
// managementlib is typically wrapped by a spec that implements GetSpec.
|
||||
func (m *managementlib) GetSpec() (spec.Interface, error) {
|
||||
return nil, fmt.Errorf("GetSpec is not supported")
|
||||
}
|
||||
|
||||
// GetGPUDeviceEdits is unsupported for the managementlib specs
|
||||
func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
|
||||
return nil, fmt.Errorf("GetGPUDeviceEdits is not supported")
|
||||
}
|
||||
|
||||
// GetGPUDeviceSpecs is unsupported for the managementlib specs
|
||||
func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) {
|
||||
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
|
||||
}
|
||||
|
||||
// GetMIGDeviceEdits is unsupported for the managementlib specs
|
||||
func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) {
|
||||
return nil, fmt.Errorf("GetMIGDeviceEdits is not supported")
|
||||
}
|
||||
|
||||
// GetMIGDeviceSpecs is unsupported for the managementlib specs
|
||||
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
||||
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
|
||||
}
|
Loading…
Reference in New Issue
Block a user