mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
Merge branch 'CNT-3897/generate-management-container-spec' into 'main'
Generate CDI specs for management containers. See merge request nvidia/container-toolkit/container-toolkit!314
This commit is contained in:
commit
cb5006c73f
@ -130,6 +130,7 @@ func (m command) validateFlags(c *cli.Context, cfg *config) error {
|
|||||||
case nvcdi.ModeAuto:
|
case nvcdi.ModeAuto:
|
||||||
case nvcdi.ModeNvml:
|
case nvcdi.ModeNvml:
|
||||||
case nvcdi.ModeWsl:
|
case nvcdi.ModeWsl:
|
||||||
|
case nvcdi.ModeManagement:
|
||||||
default:
|
default:
|
||||||
return fmt.Errorf("invalid discovery mode: %v", cfg.mode)
|
return fmt.Errorf("invalid discovery mode: %v", cfg.mode)
|
||||||
}
|
}
|
||||||
|
@ -30,6 +30,8 @@ const (
|
|||||||
ModeNvml = "nvml"
|
ModeNvml = "nvml"
|
||||||
// ModeWsl configures the CDI spec generator to generate a WSL spec.
|
// ModeWsl configures the CDI spec generator to generate a WSL spec.
|
||||||
ModeWsl = "wsl"
|
ModeWsl = "wsl"
|
||||||
|
// ModeManagement configures the CDI spec generator to generate a management spec.
|
||||||
|
ModeManagement = "management"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Interface defines the API for the nvcdi package
|
// Interface defines the API for the nvcdi package
|
||||||
|
@ -36,6 +36,10 @@ func NewDriverDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath
|
|||||||
return nil, fmt.Errorf("failed to determine driver version: %v", r)
|
return nil, fmt.Errorf("failed to determine driver version: %v", r)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return newDriverVersionDiscoverer(logger, driverRoot, nvidiaCTKPath, version)
|
||||||
|
}
|
||||||
|
|
||||||
|
func newDriverVersionDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, version string) (discover.Discover, error) {
|
||||||
libraries, err := NewDriverLibraryDiscoverer(logger, driverRoot, nvidiaCTKPath, version)
|
libraries, err := NewDriverLibraryDiscoverer(logger, driverRoot, nvidiaCTKPath, version)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err)
|
return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err)
|
||||||
|
@ -73,6 +73,11 @@ func New(opts ...Option) Interface {
|
|||||||
|
|
||||||
var lib Interface
|
var lib Interface
|
||||||
switch l.resolveMode() {
|
switch l.resolveMode() {
|
||||||
|
case ModeManagement:
|
||||||
|
if l.vendor == "" {
|
||||||
|
l.vendor = "management.nvidia.com"
|
||||||
|
}
|
||||||
|
lib = (*managementlib)(l)
|
||||||
case ModeNvml:
|
case ModeNvml:
|
||||||
if l.nvmllib == nil {
|
if l.nvmllib == nil {
|
||||||
l.nvmllib = nvml.New()
|
l.nvmllib = nvml.New()
|
||||||
|
182
pkg/nvcdi/management.go
Normal file
182
pkg/nvcdi/management.go
Normal file
@ -0,0 +1,182 @@
|
|||||||
|
/**
|
||||||
|
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
**/
|
||||||
|
|
||||||
|
package nvcdi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
||||||
|
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||||
|
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
|
||||||
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||||
|
)
|
||||||
|
|
||||||
|
type managementlib nvcdilib
|
||||||
|
|
||||||
|
var _ Interface = (*managementlib)(nil)
|
||||||
|
|
||||||
|
// GetAllDeviceSpecs returns all device specs for use in managemnt containers.
|
||||||
|
// A single device with the name `all` is returned.
|
||||||
|
func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
|
||||||
|
devices, err := m.newManagementDeviceDiscoverer()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
edits, err := edits.FromDiscoverer(devices)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to create edits from discoverer: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(edits.DeviceNodes) == 0 {
|
||||||
|
return nil, fmt.Errorf("no NVIDIA device nodes found")
|
||||||
|
}
|
||||||
|
|
||||||
|
device := specs.Device{
|
||||||
|
Name: "all",
|
||||||
|
ContainerEdits: *edits.ContainerEdits,
|
||||||
|
}
|
||||||
|
return []specs.Device{device}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetCommonEdits returns the common edits for use in managementlib containers.
|
||||||
|
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
||||||
|
locator, err := lookup.NewLibraryLocator(
|
||||||
|
m.logger,
|
||||||
|
m.driverRoot,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to create library locator: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
candidates, err := locator.Locate("libcuda.so")
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
|
||||||
|
}
|
||||||
|
libcudaPath := candidates[0]
|
||||||
|
|
||||||
|
version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.")
|
||||||
|
if version == "" {
|
||||||
|
return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to create driver library discoverer: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
edits, err := edits.FromDiscoverer(driver)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to create edits from discoverer: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return edits, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// managementDiscoverer wraps a discover.Discover by embedding it. Its Devices
// method filters the devices reported by the wrapped discoverer.
type managementDiscoverer struct {
|
||||||
|
discover.Discover
|
||||||
|
}
|
||||||
|
|
||||||
|
// newManagementDeviceDiscoverer returns a discover.Discover that discovers device nodes for use in managementlib containers.
|
||||||
|
// NVML is not used to query devices and all device nodes are returned.
|
||||||
|
func (m *managementlib) newManagementDeviceDiscoverer() (discover.Discover, error) {
|
||||||
|
deviceNodes := discover.NewCharDeviceDiscoverer(
|
||||||
|
m.logger,
|
||||||
|
[]string{
|
||||||
|
"/dev/nvidia*",
|
||||||
|
"/dev/nvidia-caps/nvidia-cap*",
|
||||||
|
"/dev/nvidia-modeset",
|
||||||
|
"/dev/nvidia-uvm-tools",
|
||||||
|
"/dev/nvidia-uvm",
|
||||||
|
"/dev/nvidiactl",
|
||||||
|
},
|
||||||
|
m.driverRoot,
|
||||||
|
)
|
||||||
|
|
||||||
|
deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer(
|
||||||
|
m.logger,
|
||||||
|
m.driverRoot,
|
||||||
|
m.nvidiaCTKPath,
|
||||||
|
deviceNodes,
|
||||||
|
)
|
||||||
|
|
||||||
|
d := discover.Merge(
|
||||||
|
&managementDiscoverer{deviceNodes},
|
||||||
|
deviceFolderPermissionHooks,
|
||||||
|
)
|
||||||
|
return d, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *managementDiscoverer) Devices() ([]discover.Device, error) {
|
||||||
|
devices, err := m.Discover.Devices()
|
||||||
|
if err != nil {
|
||||||
|
return devices, err
|
||||||
|
}
|
||||||
|
|
||||||
|
var filteredDevices []discover.Device
|
||||||
|
for _, device := range devices {
|
||||||
|
if m.nodeIsBlocked(device.HostPath) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
filteredDevices = append(filteredDevices, device)
|
||||||
|
}
|
||||||
|
|
||||||
|
return filteredDevices, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// nodeIsBlocked returns true if the specified device node should be ignored.
|
||||||
|
func (m managementDiscoverer) nodeIsBlocked(path string) bool {
|
||||||
|
blockedPrefixes := []string{"nvidia-fs", "nvidia-nvswitch", "nvidia-nvlink"}
|
||||||
|
nodeName := filepath.Base(path)
|
||||||
|
for _, prefix := range blockedPrefixes {
|
||||||
|
if strings.HasPrefix(nodeName, prefix) {
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetSpec is unsppported for the managementlib specs.
|
||||||
|
// managementlib is typically wrapped by a spec that implements GetSpec.
|
||||||
|
func (m *managementlib) GetSpec() (spec.Interface, error) {
|
||||||
|
return nil, fmt.Errorf("GetSpec is not supported")
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetGPUDeviceEdits is unsupported for the managementlib specs
|
||||||
|
func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
|
||||||
|
return nil, fmt.Errorf("GetGPUDeviceEdits is not supported")
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetGPUDeviceSpecs is unsupported for the managementlib specs
|
||||||
|
func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) {
|
||||||
|
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetMIGDeviceEdits is unsupported for the managementlib specs
|
||||||
|
func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) {
|
||||||
|
return nil, fmt.Errorf("GetMIGDeviceEdits is not supported")
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetMIGDeviceSpecs is unsupported for the managementlib specs
|
||||||
|
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
|
||||||
|
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
|
||||||
|
}
|
@ -23,7 +23,7 @@ testing::toolkit::install() {
|
|||||||
READLINK="greadlink"
|
READLINK="greadlink"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
testing::docker_run::toolkit::shell 'toolkit install --toolkit-root=/usr/local/nvidia/toolkit'
|
testing::docker_run::toolkit::shell 'toolkit install --toolkit-root=/usr/local/nvidia/toolkit --cdi-output-dir=""'
|
||||||
docker run --rm -v "${shared_dir}:/work" alpine sh -c "chown -R ${uid}:${gid} /work/"
|
docker run --rm -v "${shared_dir}:/work" alpine sh -c "chown -R ${uid}:${gid} /work/"
|
||||||
|
|
||||||
# Ensure toolkit dir is correctly setup
|
# Ensure toolkit dir is correctly setup
|
||||||
|
@ -23,6 +23,9 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform"
|
||||||
|
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||||
toml "github.com/pelletier/go-toml"
|
toml "github.com/pelletier/go-toml"
|
||||||
log "github.com/sirupsen/logrus"
|
log "github.com/sirupsen/logrus"
|
||||||
"github.com/urfave/cli/v2"
|
"github.com/urfave/cli/v2"
|
||||||
@ -41,12 +44,18 @@ const (
|
|||||||
|
|
||||||
type options struct {
|
type options struct {
|
||||||
DriverRoot string
|
DriverRoot string
|
||||||
|
DriverRootCtrPath string
|
||||||
ContainerRuntimeMode string
|
ContainerRuntimeMode string
|
||||||
ContainerRuntimeDebug string
|
ContainerRuntimeDebug string
|
||||||
ContainerRuntimeLogLevel string
|
ContainerRuntimeLogLevel string
|
||||||
ContainerCLIDebug string
|
ContainerCLIDebug string
|
||||||
toolkitRoot string
|
toolkitRoot string
|
||||||
|
|
||||||
|
cdiOutputDir string
|
||||||
|
cdiKind string
|
||||||
|
cdiVendor string
|
||||||
|
cdiClass string
|
||||||
|
|
||||||
acceptNVIDIAVisibleDevicesWhenUnprivileged bool
|
acceptNVIDIAVisibleDevicesWhenUnprivileged bool
|
||||||
acceptNVIDIAVisibleDevicesAsVolumeMounts bool
|
acceptNVIDIAVisibleDevicesAsVolumeMounts bool
|
||||||
}
|
}
|
||||||
@ -98,6 +107,12 @@ func main() {
|
|||||||
Destination: &opts.DriverRoot,
|
Destination: &opts.DriverRoot,
|
||||||
EnvVars: []string{"NVIDIA_DRIVER_ROOT"},
|
EnvVars: []string{"NVIDIA_DRIVER_ROOT"},
|
||||||
},
|
},
|
||||||
|
&cli.StringFlag{
|
||||||
|
Name: "driver-root-ctr-path",
|
||||||
|
Value: DefaultNvidiaDriverRoot,
|
||||||
|
Destination: &opts.DriverRootCtrPath,
|
||||||
|
EnvVars: []string{"DRIVER_ROOT_CTR_PATH"},
|
||||||
|
},
|
||||||
&cli.StringFlag{
|
&cli.StringFlag{
|
||||||
Name: "nvidia-container-runtime-debug",
|
Name: "nvidia-container-runtime-debug",
|
||||||
Usage: "Specify the location of the debug log file for the NVIDIA Container Runtime",
|
Usage: "Specify the location of the debug log file for the NVIDIA Container Runtime",
|
||||||
@ -140,6 +155,18 @@ func main() {
|
|||||||
Destination: &opts.toolkitRoot,
|
Destination: &opts.toolkitRoot,
|
||||||
EnvVars: []string{"TOOLKIT_ROOT"},
|
EnvVars: []string{"TOOLKIT_ROOT"},
|
||||||
},
|
},
|
||||||
|
&cli.StringFlag{
|
||||||
|
Name: "cdi-output-dir",
|
||||||
|
Usage: "the directory where the CDI output files are to be written. If this is set to '', no CDI specification is generated.",
|
||||||
|
Value: "/var/run/cdi",
|
||||||
|
Destination: &opts.cdiOutputDir,
|
||||||
|
},
|
||||||
|
&cli.StringFlag{
|
||||||
|
Name: "cdi-kind",
|
||||||
|
Usage: "the vendor string to use for the generated CDI specification",
|
||||||
|
Value: "management.nvidia.com/gpu",
|
||||||
|
Destination: &opts.cdiKind,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update the subcommand flags with the common subcommand flags
|
// Update the subcommand flags with the common subcommand flags
|
||||||
@ -158,6 +185,16 @@ func validateOptions(c *cli.Context, opts *options) error {
|
|||||||
return fmt.Errorf("invalid --toolkit-root option: %v", opts.toolkitRoot)
|
return fmt.Errorf("invalid --toolkit-root option: %v", opts.toolkitRoot)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
vendor, class := cdi.ParseQualifier(opts.cdiKind)
|
||||||
|
if err := cdi.ValidateVendorName(vendor); err != nil {
|
||||||
|
return fmt.Errorf("invalid CDI vendor name: %v", err)
|
||||||
|
}
|
||||||
|
if err := cdi.ValidateClassName(class); err != nil {
|
||||||
|
return fmt.Errorf("invalid CDI class name: %v", err)
|
||||||
|
}
|
||||||
|
opts.cdiVendor = vendor
|
||||||
|
opts.cdiClass = class
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -215,7 +252,12 @@ func Install(cli *cli.Context, opts *options) error {
|
|||||||
return fmt.Errorf("error installing NVIDIA container toolkit config: %v", err)
|
return fmt.Errorf("error installing NVIDIA container toolkit config: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil
|
nvidiaCTKPath, err := installContainerToolkitCLI(opts.toolkitRoot)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("error installing NVIDIA Container Toolkit CLI: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return generateCDISpec(opts, nvidiaCTKPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
// installContainerLibraries locates and installs the libraries that are part of
|
// installContainerLibraries locates and installs the libraries that are part of
|
||||||
@ -326,6 +368,19 @@ func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutable
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// installContainerToolkitCLI installs the nvidia-ctk CLI executable and wrapper.
|
||||||
|
func installContainerToolkitCLI(toolkitDir string) (string, error) {
|
||||||
|
e := executable{
|
||||||
|
source: "/usr/bin/nvidia-ctk",
|
||||||
|
target: executableTarget{
|
||||||
|
dotfileName: "nvidia-ctk.real",
|
||||||
|
wrapperName: "nvidia-ctk",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
return e.install(toolkitDir)
|
||||||
|
}
|
||||||
|
|
||||||
// installContainerCLI sets up the NVIDIA container CLI executable, copying the executable
|
// installContainerCLI sets up the NVIDIA container CLI executable, copying the executable
|
||||||
// and implementing the required wrapper
|
// and implementing the required wrapper
|
||||||
func installContainerCLI(toolkitRoot string) (string, error) {
|
func installContainerCLI(toolkitRoot string) (string, error) {
|
||||||
@ -509,3 +564,42 @@ func createDirectories(dir ...string) error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// generateCDISpec generates a CDI spec for use in managemnt containers
|
||||||
|
func generateCDISpec(opts *options, nvidiaCTKPath string) error {
|
||||||
|
if opts.cdiOutputDir == "" {
|
||||||
|
log.Info("Skipping CDI spec generation (no output directory specified)")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
cdilib := nvcdi.New(
|
||||||
|
nvcdi.WithMode(nvcdi.ModeManagement),
|
||||||
|
nvcdi.WithDriverRoot(opts.DriverRootCtrPath),
|
||||||
|
nvcdi.WithNVIDIACTKPath(nvidiaCTKPath),
|
||||||
|
nvcdi.WithVendor(opts.cdiVendor),
|
||||||
|
nvcdi.WithClass(opts.cdiClass),
|
||||||
|
)
|
||||||
|
|
||||||
|
spec, err := cdilib.GetSpec()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to genereate CDI spec for management containers: %v", err)
|
||||||
|
}
|
||||||
|
err = transform.NewRootTransformer(
|
||||||
|
opts.DriverRootCtrPath,
|
||||||
|
opts.DriverRoot,
|
||||||
|
).Transform(spec.Raw())
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to transform driver root in CDI spec: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
name, err := cdi.GenerateNameForSpec(spec.Raw())
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to generate CDI name for management containers: %v", err)
|
||||||
|
}
|
||||||
|
err = spec.Save(filepath.Join(opts.cdiOutputDir, name))
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to save CDI spec for management containers: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user