mirror of
				https://github.com/NVIDIA/nvidia-container-toolkit
				synced 2025-06-26 18:18:24 +00:00 
			
		
		
		
	Add management mode to CDI spec generation API
These changes add support for generating a management spec to the nvcdi API. A management spec consists of a single CDI device (`all`) which includes all expected NVIDIA device nodes, driver libraries, binaries, and IPC sockets. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
		
							parent
							
								
									64b16acb1f
								
							
						
					
					
						commit
						29cbbe83f9
					
				| @ -30,6 +30,8 @@ const ( | ||||
| 	ModeNvml = "nvml" | ||||
| 	// ModeWsl configures the CDI spec generator to generate a WSL spec.
 | ||||
| 	ModeWsl = "wsl" | ||||
| 	// ModeManagement configures the CDI spec generator to generate a management spec.
 | ||||
| 	ModeManagement = "management" | ||||
| ) | ||||
| 
 | ||||
| // Interface defines the API for the nvcdi package
 | ||||
|  | ||||
| @ -36,6 +36,10 @@ func NewDriverDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath | ||||
| 		return nil, fmt.Errorf("failed to determine driver version: %v", r) | ||||
| 	} | ||||
| 
 | ||||
| 	return newDriverVersionDiscoverer(logger, driverRoot, nvidiaCTKPath, version) | ||||
| } | ||||
| 
 | ||||
| func newDriverVersionDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, version string) (discover.Discover, error) { | ||||
| 	libraries, err := NewDriverLibraryDiscoverer(logger, driverRoot, nvidiaCTKPath, version) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err) | ||||
|  | ||||
| @ -73,6 +73,11 @@ func New(opts ...Option) Interface { | ||||
| 
 | ||||
| 	var lib Interface | ||||
| 	switch l.resolveMode() { | ||||
| 	case ModeManagement: | ||||
| 		if l.vendor == "" { | ||||
| 			l.vendor = "management.nvidia.com" | ||||
| 		} | ||||
| 		lib = (*managementlib)(l) | ||||
| 	case ModeNvml: | ||||
| 		if l.nvmllib == nil { | ||||
| 			l.nvmllib = nvml.New() | ||||
|  | ||||
							
								
								
									
										182
									
								
								pkg/nvcdi/management.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										182
									
								
								pkg/nvcdi/management.go
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,182 @@ | ||||
| /** | ||||
| # Copyright (c) NVIDIA CORPORATION.  All rights reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #     http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| **/ | ||||
| 
 | ||||
| package nvcdi | ||||
| 
 | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"path/filepath" | ||||
| 	"strings" | ||||
| 
 | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/internal/edits" | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" | ||||
| 	"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" | ||||
| 	"github.com/container-orchestrated-devices/container-device-interface/specs-go" | ||||
| 	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" | ||||
| ) | ||||
| 
 | ||||
// managementlib is a named type whose underlying type is nvcdilib. It provides
// the Interface implementation that is selected for the management mode.
type managementlib nvcdilib

// Compile-time assertion that *managementlib implements Interface.
var _ Interface = (*managementlib)(nil)
| 
 | ||||
| // GetAllDeviceSpecs returns all device specs for use in managemnt containers.
 | ||||
| // A single device with the name `all` is returned.
 | ||||
| func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) { | ||||
| 	devices, err := m.newManagementDeviceDiscoverer() | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create device discoverer: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	edits, err := edits.FromDiscoverer(devices) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create edits from discoverer: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	if len(edits.DeviceNodes) == 0 { | ||||
| 		return nil, fmt.Errorf("no NVIDIA device nodes found") | ||||
| 	} | ||||
| 
 | ||||
| 	device := specs.Device{ | ||||
| 		Name:           "all", | ||||
| 		ContainerEdits: *edits.ContainerEdits, | ||||
| 	} | ||||
| 	return []specs.Device{device}, nil | ||||
| } | ||||
| 
 | ||||
| // GetCommonEdits returns the common edits for use in managementlib containers.
 | ||||
| func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { | ||||
| 	locator, err := lookup.NewLibraryLocator( | ||||
| 		m.logger, | ||||
| 		m.driverRoot, | ||||
| 	) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create library locator: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	candidates, err := locator.Locate("libcuda.so") | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to locate libcuda.so: %v", err) | ||||
| 	} | ||||
| 	libcudaPath := candidates[0] | ||||
| 
 | ||||
| 	version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.") | ||||
| 	if version == "" { | ||||
| 		return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath) | ||||
| 	} | ||||
| 
 | ||||
| 	driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create driver library discoverer: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	edits, err := edits.FromDiscoverer(driver) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create edits from discoverer: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	return edits, nil | ||||
| } | ||||
| 
 | ||||
// managementDiscoverer wraps a discover.Discover by embedding it. The Devices
// method defined on this type removes blocked device nodes from the devices
// reported by the embedded discoverer.
type managementDiscoverer struct {
	discover.Discover
}
| 
 | ||||
| // newManagementDeviceDiscoverer returns a discover.Discover that discovers device nodes for use in managementlib containers.
 | ||||
| // NVML is not used to query devices and all device nodes are returned.
 | ||||
| func (m *managementlib) newManagementDeviceDiscoverer() (discover.Discover, error) { | ||||
| 	deviceNodes := discover.NewCharDeviceDiscoverer( | ||||
| 		m.logger, | ||||
| 		[]string{ | ||||
| 			"/dev/nvidia*", | ||||
| 			"/dev/nvidia-caps/nvidia-cap*", | ||||
| 			"/dev/nvidia-modeset", | ||||
| 			"/dev/nvidia-uvm-tools", | ||||
| 			"/dev/nvidia-uvm", | ||||
| 			"/dev/nvidiactl", | ||||
| 		}, | ||||
| 		m.driverRoot, | ||||
| 	) | ||||
| 
 | ||||
| 	deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer( | ||||
| 		m.logger, | ||||
| 		m.driverRoot, | ||||
| 		m.nvidiaCTKPath, | ||||
| 		deviceNodes, | ||||
| 	) | ||||
| 
 | ||||
| 	d := discover.Merge( | ||||
| 		&managementDiscoverer{deviceNodes}, | ||||
| 		deviceFolderPermissionHooks, | ||||
| 	) | ||||
| 	return d, nil | ||||
| } | ||||
| 
 | ||||
| func (m *managementDiscoverer) Devices() ([]discover.Device, error) { | ||||
| 	devices, err := m.Discover.Devices() | ||||
| 	if err != nil { | ||||
| 		return devices, err | ||||
| 	} | ||||
| 
 | ||||
| 	var filteredDevices []discover.Device | ||||
| 	for _, device := range devices { | ||||
| 		if m.nodeIsBlocked(device.HostPath) { | ||||
| 			continue | ||||
| 		} | ||||
| 		filteredDevices = append(filteredDevices, device) | ||||
| 	} | ||||
| 
 | ||||
| 	return filteredDevices, nil | ||||
| } | ||||
| 
 | ||||
| // nodeIsBlocked returns true if the specified device node should be ignored.
 | ||||
| func (m managementDiscoverer) nodeIsBlocked(path string) bool { | ||||
| 	blockedPrefixes := []string{"nvidia-fs", "nvidia-nvswitch", "nvidia-nvlink"} | ||||
| 	nodeName := filepath.Base(path) | ||||
| 	for _, prefix := range blockedPrefixes { | ||||
| 		if strings.HasPrefix(nodeName, prefix) { | ||||
| 			return true | ||||
| 		} | ||||
| 	} | ||||
| 	return false | ||||
| } | ||||
| 
 | ||||
| // GetSpec is unsppported for the managementlib specs.
 | ||||
| // managementlib is typically wrapped by a spec that implements GetSpec.
 | ||||
| func (m *managementlib) GetSpec() (spec.Interface, error) { | ||||
| 	return nil, fmt.Errorf("GetSpec is not supported") | ||||
| } | ||||
| 
 | ||||
| // GetGPUDeviceEdits is unsupported for the managementlib specs
 | ||||
| func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) { | ||||
| 	return nil, fmt.Errorf("GetGPUDeviceEdits is not supported") | ||||
| } | ||||
| 
 | ||||
| // GetGPUDeviceSpecs is unsupported for the managementlib specs
 | ||||
| func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) { | ||||
| 	return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported") | ||||
| } | ||||
| 
 | ||||
| // GetMIGDeviceEdits is unsupported for the managementlib specs
 | ||||
| func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) { | ||||
| 	return nil, fmt.Errorf("GetMIGDeviceEdits is not supported") | ||||
| } | ||||
| 
 | ||||
| // GetMIGDeviceSpecs is unsupported for the managementlib specs
 | ||||
| func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { | ||||
| 	return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported") | ||||
| } | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user