mirror of
				https://github.com/NVIDIA/nvidia-container-toolkit
				synced 2025-06-26 18:18:24 +00:00 
			
		
		
		
	Merge branch 'CNT-3897/generate-management-container-spec' into 'main'
Generate CDI specs for management containers See merge request nvidia/container-toolkit/container-toolkit!314
This commit is contained in:
		
						commit
						cb5006c73f
					
				| @ -130,6 +130,7 @@ func (m command) validateFlags(c *cli.Context, cfg *config) error { | ||||
| 	case nvcdi.ModeAuto: | ||||
| 	case nvcdi.ModeNvml: | ||||
| 	case nvcdi.ModeWsl: | ||||
| 	case nvcdi.ModeManagement: | ||||
| 	default: | ||||
| 		return fmt.Errorf("invalid discovery mode: %v", cfg.mode) | ||||
| 	} | ||||
|  | ||||
| @ -30,6 +30,8 @@ const ( | ||||
| 	ModeNvml = "nvml" | ||||
| 	// ModeWsl configures the CDI spec generator to generate a WSL spec.
 | ||||
| 	ModeWsl = "wsl" | ||||
| 	// ModeManagement configures the CDI spec generator to generate a management spec.
 | ||||
| 	ModeManagement = "management" | ||||
| ) | ||||
| 
 | ||||
| // Interface defines the API for the nvcdi package
 | ||||
|  | ||||
| @ -36,6 +36,10 @@ func NewDriverDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath | ||||
| 		return nil, fmt.Errorf("failed to determine driver version: %v", r) | ||||
| 	} | ||||
| 
 | ||||
| 	return newDriverVersionDiscoverer(logger, driverRoot, nvidiaCTKPath, version) | ||||
| } | ||||
| 
 | ||||
| func newDriverVersionDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, version string) (discover.Discover, error) { | ||||
| 	libraries, err := NewDriverLibraryDiscoverer(logger, driverRoot, nvidiaCTKPath, version) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err) | ||||
|  | ||||
| @ -73,6 +73,11 @@ func New(opts ...Option) Interface { | ||||
| 
 | ||||
| 	var lib Interface | ||||
| 	switch l.resolveMode() { | ||||
| 	case ModeManagement: | ||||
| 		if l.vendor == "" { | ||||
| 			l.vendor = "management.nvidia.com" | ||||
| 		} | ||||
| 		lib = (*managementlib)(l) | ||||
| 	case ModeNvml: | ||||
| 		if l.nvmllib == nil { | ||||
| 			l.nvmllib = nvml.New() | ||||
|  | ||||
							
								
								
									
										182
									
								
								pkg/nvcdi/management.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										182
									
								
								pkg/nvcdi/management.go
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,182 @@ | ||||
| /** | ||||
| # Copyright (c) NVIDIA CORPORATION.  All rights reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #     http://www.apache.org/licenses/LICENSE-2.0
 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| **/ | ||||
| 
 | ||||
| package nvcdi | ||||
| 
 | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"path/filepath" | ||||
| 	"strings" | ||||
| 
 | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/internal/edits" | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" | ||||
| 	"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" | ||||
| 	"github.com/container-orchestrated-devices/container-device-interface/specs-go" | ||||
| 	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" | ||||
| ) | ||||
| 
 | ||||
// managementlib is a distinct type with nvcdilib as its underlying type. It
// carries the Interface implementation that New selects for ModeManagement.
| type managementlib nvcdilib | ||||
| 
 | ||||
// Compile-time assertion that *managementlib satisfies Interface.
| var _ Interface = (*managementlib)(nil) | ||||
| 
 | ||||
| // GetAllDeviceSpecs returns all device specs for use in managemnt containers.
 | ||||
| // A single device with the name `all` is returned.
 | ||||
| func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) { | ||||
| 	devices, err := m.newManagementDeviceDiscoverer() | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create device discoverer: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	edits, err := edits.FromDiscoverer(devices) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create edits from discoverer: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	if len(edits.DeviceNodes) == 0 { | ||||
| 		return nil, fmt.Errorf("no NVIDIA device nodes found") | ||||
| 	} | ||||
| 
 | ||||
| 	device := specs.Device{ | ||||
| 		Name:           "all", | ||||
| 		ContainerEdits: *edits.ContainerEdits, | ||||
| 	} | ||||
| 	return []specs.Device{device}, nil | ||||
| } | ||||
| 
 | ||||
| // GetCommonEdits returns the common edits for use in managementlib containers.
 | ||||
| func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { | ||||
| 	locator, err := lookup.NewLibraryLocator( | ||||
| 		m.logger, | ||||
| 		m.driverRoot, | ||||
| 	) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create library locator: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	candidates, err := locator.Locate("libcuda.so") | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to locate libcuda.so: %v", err) | ||||
| 	} | ||||
| 	libcudaPath := candidates[0] | ||||
| 
 | ||||
| 	version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.") | ||||
| 	if version == "" { | ||||
| 		return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath) | ||||
| 	} | ||||
| 
 | ||||
| 	driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create driver library discoverer: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	edits, err := edits.FromDiscoverer(driver) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create edits from discoverer: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	return edits, nil | ||||
| } | ||||
| 
 | ||||
// managementDiscoverer wraps a discover.Discover. Its Devices method filters
// the wrapped discoverer's devices through nodeIsBlocked; all other methods
// are promoted from the embedded discoverer unchanged.
| type managementDiscoverer struct { | ||||
| 	discover.Discover | ||||
| } | ||||
| 
 | ||||
| // newManagementDeviceDiscoverer returns a discover.Discover that discovers device nodes for use in managementlib containers.
 | ||||
| // NVML is not used to query devices and all device nodes are returned.
 | ||||
| func (m *managementlib) newManagementDeviceDiscoverer() (discover.Discover, error) { | ||||
| 	deviceNodes := discover.NewCharDeviceDiscoverer( | ||||
| 		m.logger, | ||||
| 		[]string{ | ||||
| 			"/dev/nvidia*", | ||||
| 			"/dev/nvidia-caps/nvidia-cap*", | ||||
| 			"/dev/nvidia-modeset", | ||||
| 			"/dev/nvidia-uvm-tools", | ||||
| 			"/dev/nvidia-uvm", | ||||
| 			"/dev/nvidiactl", | ||||
| 		}, | ||||
| 		m.driverRoot, | ||||
| 	) | ||||
| 
 | ||||
| 	deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer( | ||||
| 		m.logger, | ||||
| 		m.driverRoot, | ||||
| 		m.nvidiaCTKPath, | ||||
| 		deviceNodes, | ||||
| 	) | ||||
| 
 | ||||
| 	d := discover.Merge( | ||||
| 		&managementDiscoverer{deviceNodes}, | ||||
| 		deviceFolderPermissionHooks, | ||||
| 	) | ||||
| 	return d, nil | ||||
| } | ||||
| 
 | ||||
| func (m *managementDiscoverer) Devices() ([]discover.Device, error) { | ||||
| 	devices, err := m.Discover.Devices() | ||||
| 	if err != nil { | ||||
| 		return devices, err | ||||
| 	} | ||||
| 
 | ||||
| 	var filteredDevices []discover.Device | ||||
| 	for _, device := range devices { | ||||
| 		if m.nodeIsBlocked(device.HostPath) { | ||||
| 			continue | ||||
| 		} | ||||
| 		filteredDevices = append(filteredDevices, device) | ||||
| 	} | ||||
| 
 | ||||
| 	return filteredDevices, nil | ||||
| } | ||||
| 
 | ||||
| // nodeIsBlocked returns true if the specified device node should be ignored.
 | ||||
| func (m managementDiscoverer) nodeIsBlocked(path string) bool { | ||||
| 	blockedPrefixes := []string{"nvidia-fs", "nvidia-nvswitch", "nvidia-nvlink"} | ||||
| 	nodeName := filepath.Base(path) | ||||
| 	for _, prefix := range blockedPrefixes { | ||||
| 		if strings.HasPrefix(nodeName, prefix) { | ||||
| 			return true | ||||
| 		} | ||||
| 	} | ||||
| 	return false | ||||
| } | ||||
| 
 | ||||
| // GetSpec is unsppported for the managementlib specs.
 | ||||
| // managementlib is typically wrapped by a spec that implements GetSpec.
 | ||||
| func (m *managementlib) GetSpec() (spec.Interface, error) { | ||||
| 	return nil, fmt.Errorf("GetSpec is not supported") | ||||
| } | ||||
| 
 | ||||
| // GetGPUDeviceEdits is unsupported for the managementlib specs
 | ||||
| func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) { | ||||
| 	return nil, fmt.Errorf("GetGPUDeviceEdits is not supported") | ||||
| } | ||||
| 
 | ||||
| // GetGPUDeviceSpecs is unsupported for the managementlib specs
 | ||||
| func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) { | ||||
| 	return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported") | ||||
| } | ||||
| 
 | ||||
| // GetMIGDeviceEdits is unsupported for the managementlib specs
 | ||||
| func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) { | ||||
| 	return nil, fmt.Errorf("GetMIGDeviceEdits is not supported") | ||||
| } | ||||
| 
 | ||||
| // GetMIGDeviceSpecs is unsupported for the managementlib specs
 | ||||
| func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) { | ||||
| 	return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported") | ||||
| } | ||||
| @ -23,7 +23,7 @@ testing::toolkit::install() { | ||||
| 		READLINK="greadlink" | ||||
| 	fi | ||||
| 
 | ||||
| 	testing::docker_run::toolkit::shell 'toolkit install --toolkit-root=/usr/local/nvidia/toolkit' | ||||
| 	testing::docker_run::toolkit::shell 'toolkit install --toolkit-root=/usr/local/nvidia/toolkit --cdi-output-dir=""' | ||||
| 	docker run --rm -v "${shared_dir}:/work" alpine sh -c "chown -R ${uid}:${gid} /work/" | ||||
| 
 | ||||
| 	# Ensure toolkit dir is correctly setup | ||||
|  | ||||
| @ -23,6 +23,9 @@ import ( | ||||
| 	"path/filepath" | ||||
| 	"strings" | ||||
| 
 | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" | ||||
| 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform" | ||||
| 	"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" | ||||
| 	toml "github.com/pelletier/go-toml" | ||||
| 	log "github.com/sirupsen/logrus" | ||||
| 	"github.com/urfave/cli/v2" | ||||
| @ -41,12 +44,18 @@ const ( | ||||
| 
 | ||||
// options holds the settings used by the toolkit installation. The flag
// definitions visible in main bind CLI flags to these fields via Destination.
| type options struct { | ||||
| 	DriverRoot               string | ||||
	// DriverRootCtrPath is set from the driver-root-ctr-path flag. It is used as
	// the driver root for CDI spec generation and is passed, together with
	// DriverRoot, to the root transformer applied to the generated spec.
	// NOTE(review): presumably the path at which the driver root is visible
	// inside this container - confirm.
| 	DriverRootCtrPath        string | ||||
| 	ContainerRuntimeMode     string | ||||
| 	ContainerRuntimeDebug    string | ||||
| 	ContainerRuntimeLogLevel string | ||||
| 	ContainerCLIDebug        string | ||||
| 	toolkitRoot              string | ||||
| 
 | ||||
	// cdiOutputDir is the directory the CDI spec is written to; an empty value
	// disables CDI spec generation.
| 	cdiOutputDir string | ||||
	// cdiKind is the value of the cdi-kind flag. validateOptions parses it into
	// cdiVendor and cdiClass.
| 	cdiKind      string | ||||
| 	cdiVendor    string | ||||
| 	cdiClass     string | ||||
| 
 | ||||
| 	acceptNVIDIAVisibleDevicesWhenUnprivileged bool | ||||
| 	acceptNVIDIAVisibleDevicesAsVolumeMounts   bool | ||||
| } | ||||
| @ -98,6 +107,12 @@ func main() { | ||||
| 			Destination: &opts.DriverRoot, | ||||
| 			EnvVars:     []string{"NVIDIA_DRIVER_ROOT"}, | ||||
| 		}, | ||||
| 		&cli.StringFlag{ | ||||
| 			Name:        "driver-root-ctr-path", | ||||
| 			Value:       DefaultNvidiaDriverRoot, | ||||
| 			Destination: &opts.DriverRootCtrPath, | ||||
| 			EnvVars:     []string{"DRIVER_ROOT_CTR_PATH"}, | ||||
| 		}, | ||||
| 		&cli.StringFlag{ | ||||
| 			Name:        "nvidia-container-runtime-debug", | ||||
| 			Usage:       "Specify the location of the debug log file for the NVIDIA Container Runtime", | ||||
| @ -140,6 +155,18 @@ func main() { | ||||
| 			Destination: &opts.toolkitRoot, | ||||
| 			EnvVars:     []string{"TOOLKIT_ROOT"}, | ||||
| 		}, | ||||
| 		&cli.StringFlag{ | ||||
| 			Name:        "cdi-output-dir", | ||||
| 			Usage:       "the directory where the CDI output files are to be written. If this is set to '', no CDI specification is generated.", | ||||
| 			Value:       "/var/run/cdi", | ||||
| 			Destination: &opts.cdiOutputDir, | ||||
| 		}, | ||||
| 		&cli.StringFlag{ | ||||
| 			Name:        "cdi-kind", | ||||
| 			Usage:       "the vendor string to use for the generated CDI specification", | ||||
| 			Value:       "management.nvidia.com/gpu", | ||||
| 			Destination: &opts.cdiKind, | ||||
| 		}, | ||||
| 	} | ||||
| 
 | ||||
| 	// Update the subcommand flags with the common subcommand flags
 | ||||
| @ -158,6 +185,16 @@ func validateOptions(c *cli.Context, opts *options) error { | ||||
| 		return fmt.Errorf("invalid --toolkit-root option: %v", opts.toolkitRoot) | ||||
| 	} | ||||
| 
 | ||||
| 	vendor, class := cdi.ParseQualifier(opts.cdiKind) | ||||
| 	if err := cdi.ValidateVendorName(vendor); err != nil { | ||||
| 		return fmt.Errorf("invalid CDI vendor name: %v", err) | ||||
| 	} | ||||
| 	if err := cdi.ValidateClassName(class); err != nil { | ||||
| 		return fmt.Errorf("invalid CDI class name: %v", err) | ||||
| 	} | ||||
| 	opts.cdiVendor = vendor | ||||
| 	opts.cdiClass = class | ||||
| 
 | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| @ -215,7 +252,12 @@ func Install(cli *cli.Context, opts *options) error { | ||||
| 		return fmt.Errorf("error installing NVIDIA container toolkit config: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	return nil | ||||
| 	nvidiaCTKPath, err := installContainerToolkitCLI(opts.toolkitRoot) | ||||
| 	if err != nil { | ||||
| 		return fmt.Errorf("error installing NVIDIA Container Toolkit CLI: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	return generateCDISpec(opts, nvidiaCTKPath) | ||||
| } | ||||
| 
 | ||||
| // installContainerLibraries locates and installs the libraries that are part of
 | ||||
| @ -326,6 +368,19 @@ func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutable | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| // installContainerToolkitCLI installs the nvidia-ctk CLI executable and wrapper.
 | ||||
| func installContainerToolkitCLI(toolkitDir string) (string, error) { | ||||
| 	e := executable{ | ||||
| 		source: "/usr/bin/nvidia-ctk", | ||||
| 		target: executableTarget{ | ||||
| 			dotfileName: "nvidia-ctk.real", | ||||
| 			wrapperName: "nvidia-ctk", | ||||
| 		}, | ||||
| 	} | ||||
| 
 | ||||
| 	return e.install(toolkitDir) | ||||
| } | ||||
| 
 | ||||
| // installContainerCLI sets up the NVIDIA container CLI executable, copying the executable
 | ||||
| // and implementing the required wrapper
 | ||||
| func installContainerCLI(toolkitRoot string) (string, error) { | ||||
| @ -509,3 +564,42 @@ func createDirectories(dir ...string) error { | ||||
| 	} | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| // generateCDISpec generates a CDI spec for use in managemnt containers
 | ||||
| func generateCDISpec(opts *options, nvidiaCTKPath string) error { | ||||
| 	if opts.cdiOutputDir == "" { | ||||
| 		log.Info("Skipping CDI spec generation (no output directory specified)") | ||||
| 		return nil | ||||
| 	} | ||||
| 
 | ||||
| 	cdilib := nvcdi.New( | ||||
| 		nvcdi.WithMode(nvcdi.ModeManagement), | ||||
| 		nvcdi.WithDriverRoot(opts.DriverRootCtrPath), | ||||
| 		nvcdi.WithNVIDIACTKPath(nvidiaCTKPath), | ||||
| 		nvcdi.WithVendor(opts.cdiVendor), | ||||
| 		nvcdi.WithClass(opts.cdiClass), | ||||
| 	) | ||||
| 
 | ||||
| 	spec, err := cdilib.GetSpec() | ||||
| 	if err != nil { | ||||
| 		return fmt.Errorf("failed to genereate CDI spec for management containers: %v", err) | ||||
| 	} | ||||
| 	err = transform.NewRootTransformer( | ||||
| 		opts.DriverRootCtrPath, | ||||
| 		opts.DriverRoot, | ||||
| 	).Transform(spec.Raw()) | ||||
| 	if err != nil { | ||||
| 		return fmt.Errorf("failed to transform driver root in CDI spec: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	name, err := cdi.GenerateNameForSpec(spec.Raw()) | ||||
| 	if err != nil { | ||||
| 		return fmt.Errorf("failed to generate CDI name for management containers: %v", err) | ||||
| 	} | ||||
| 	err = spec.Save(filepath.Join(opts.cdiOutputDir, name)) | ||||
| 	if err != nil { | ||||
| 		return fmt.Errorf("failed to save CDI spec for management containers: %v", err) | ||||
| 	} | ||||
| 
 | ||||
| 	return nil | ||||
| } | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user