Merge branch 'CNT-3897/generate-management-container-spec' into 'main'

Generate CDI specs for management containers

See merge request nvidia/container-toolkit/container-toolkit!314
This commit is contained in:
Evan Lezar 2023-03-06 16:23:13 +00:00
commit cb5006c73f
7 changed files with 290 additions and 2 deletions

View File

@ -130,6 +130,7 @@ func (m command) validateFlags(c *cli.Context, cfg *config) error {
case nvcdi.ModeAuto: case nvcdi.ModeAuto:
case nvcdi.ModeNvml: case nvcdi.ModeNvml:
case nvcdi.ModeWsl: case nvcdi.ModeWsl:
case nvcdi.ModeManagement:
default: default:
return fmt.Errorf("invalid discovery mode: %v", cfg.mode) return fmt.Errorf("invalid discovery mode: %v", cfg.mode)
} }

View File

@ -30,6 +30,8 @@ const (
ModeNvml = "nvml" ModeNvml = "nvml"
// ModeWsl configures the CDI spec generator to generate a WSL spec. // ModeWsl configures the CDI spec generator to generate a WSL spec.
ModeWsl = "wsl" ModeWsl = "wsl"
// ModeManagement configures the CDI spec generator to generate a management spec.
ModeManagement = "management"
) )
// Interface defines the API for the nvcdi package // Interface defines the API for the nvcdi package

View File

@ -36,6 +36,10 @@ func NewDriverDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath
return nil, fmt.Errorf("failed to determine driver version: %v", r) return nil, fmt.Errorf("failed to determine driver version: %v", r)
} }
return newDriverVersionDiscoverer(logger, driverRoot, nvidiaCTKPath, version)
}
func newDriverVersionDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, version string) (discover.Discover, error) {
libraries, err := NewDriverLibraryDiscoverer(logger, driverRoot, nvidiaCTKPath, version) libraries, err := NewDriverLibraryDiscoverer(logger, driverRoot, nvidiaCTKPath, version)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err) return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err)

View File

@ -73,6 +73,11 @@ func New(opts ...Option) Interface {
var lib Interface var lib Interface
switch l.resolveMode() { switch l.resolveMode() {
case ModeManagement:
if l.vendor == "" {
l.vendor = "management.nvidia.com"
}
lib = (*managementlib)(l)
case ModeNvml: case ModeNvml:
if l.nvmllib == nil { if l.nvmllib == nil {
l.nvmllib = nvml.New() l.nvmllib = nvml.New()

182
pkg/nvcdi/management.go Normal file
View File

@ -0,0 +1,182 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package nvcdi
import (
"fmt"
"path/filepath"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
)
// managementlib is the CDI library variant used for management containers.
// It shares the underlying type of nvcdilib, so an *nvcdilib can be converted
// to a *managementlib directly.
type managementlib nvcdilib
// Compile-time assertion that *managementlib implements Interface.
var _ Interface = (*managementlib)(nil)
// GetAllDeviceSpecs returns the device specs for use in management containers.
// The result always consists of exactly one device, named `all`, that carries
// the container edits generated from the management device discoverer. An error
// is returned if the discoverer or the edits cannot be created, or if the
// generated edits contain no device nodes.
func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
	discoverer, err := m.newManagementDeviceDiscoverer()
	if err != nil {
		return nil, fmt.Errorf("failed to create device discoverer: %v", err)
	}

	containerEdits, err := edits.FromDiscoverer(discoverer)
	if err != nil {
		return nil, fmt.Errorf("failed to create edits from discoverer: %v", err)
	}
	// A spec without any device nodes is of no use, so treat it as an error.
	if len(containerEdits.DeviceNodes) < 1 {
		return nil, fmt.Errorf("no NVIDIA device nodes found")
	}

	return []specs.Device{
		{
			Name:           "all",
			ContainerEdits: *containerEdits.ContainerEdits,
		},
	}, nil
}
// GetCommonEdits returns the common edits for use in management containers.
//
// The driver version is not queried through NVML. Instead it is derived from
// the file name of the first libcuda.so candidate located under the driver
// root, which is expected to have the form libcuda.so.<version>. That version
// is then used to construct the driver discoverer from which the edits are
// generated.
//
// An error is returned if the library locator cannot be created, if no
// libcuda.so candidate is found, if the candidate's file name does not carry a
// version suffix, or if the driver discoverer or its edits cannot be created.
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
	locator, err := lookup.NewLibraryLocator(
		m.logger,
		m.driverRoot,
	)
	if err != nil {
		return nil, fmt.Errorf("failed to create library locator: %v", err)
	}

	candidates, err := locator.Locate("libcuda.so")
	if err != nil {
		return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
	}
	// Guard against a locator that reports success without any results;
	// indexing an empty slice below would panic.
	if len(candidates) == 0 {
		return nil, fmt.Errorf("failed to locate libcuda.so: no candidates found")
	}
	libcudaPath := candidates[0]

	// strings.TrimPrefix returns its input unchanged when the prefix is
	// absent, so the prefix must be checked explicitly. Otherwise an
	// unversioned name such as "libcuda.so" would be accepted as the version.
	const versionedPrefix = "libcuda.so."
	libcudaName := filepath.Base(libcudaPath)
	version := strings.TrimPrefix(libcudaName, versionedPrefix)
	if !strings.HasPrefix(libcudaName, versionedPrefix) || version == "" {
		return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
	}

	driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
	if err != nil {
		return nil, fmt.Errorf("failed to create driver library discoverer: %v", err)
	}

	driverEdits, err := edits.FromDiscoverer(driver)
	if err != nil {
		return nil, fmt.Errorf("failed to create edits from discoverer: %v", err)
	}

	return driverEdits, nil
}
// managementDiscoverer wraps a discover.Discover. All methods of the embedded
// discoverer are promoted unchanged except Devices, which is overridden to
// drop device nodes that are blocked for management containers.
type managementDiscoverer struct {
discover.Discover
}
// newManagementDeviceDiscoverer builds the discover.Discover used to find the
// device nodes for management containers. NVML is not used to query devices;
// instead every character device matching a fixed set of /dev/nvidia* patterns
// under the driver root is considered.
//
// The result merges two discoverers: the character device discoverer wrapped
// in a managementDiscoverer (which filters out blocked nodes), and a device
// folder permission hook discoverer. Note that the hook discoverer is given the
// unwrapped (unfiltered) character device discoverer.
//
// The returned error is always nil.
func (m *managementlib) newManagementDeviceDiscoverer() (discover.Discover, error) {
	patterns := []string{
		"/dev/nvidia*",
		"/dev/nvidia-caps/nvidia-cap*",
		"/dev/nvidia-modeset",
		"/dev/nvidia-uvm-tools",
		"/dev/nvidia-uvm",
		"/dev/nvidiactl",
	}
	charDevices := discover.NewCharDeviceDiscoverer(m.logger, patterns, m.driverRoot)

	permissionHooks := newDeviceFolderPermissionHookDiscoverer(
		m.logger,
		m.driverRoot,
		m.nvidiaCTKPath,
		charDevices,
	)

	filteredDevices := &managementDiscoverer{Discover: charDevices}

	return discover.Merge(filteredDevices, permissionHooks), nil
}
// Devices returns the devices reported by the wrapped discoverer, omitting
// every device whose host path refers to a blocked node (see nodeIsBlocked).
// If the wrapped discoverer fails, its result and error are passed through
// unmodified. The returned slice is nil when no device remains after filtering.
func (m *managementDiscoverer) Devices() ([]discover.Device, error) {
	all, err := m.Discover.Devices()
	if err != nil {
		return all, err
	}

	var allowed []discover.Device
	for i := range all {
		if !m.nodeIsBlocked(all[i].HostPath) {
			allowed = append(allowed, all[i])
		}
	}
	return allowed, nil
}
// nodeIsBlocked reports whether the device node at the given path should be
// ignored. Only the final path element is inspected: a node is blocked when
// its name starts with "nvidia-fs", "nvidia-nvswitch", or "nvidia-nvlink".
func (m managementDiscoverer) nodeIsBlocked(path string) bool {
	name := filepath.Base(path)
	switch {
	case strings.HasPrefix(name, "nvidia-fs"),
		strings.HasPrefix(name, "nvidia-nvswitch"),
		strings.HasPrefix(name, "nvidia-nvlink"):
		return true
	}
	return false
}
// GetSpec is unsupported for the managementlib specs and always returns an
// error. managementlib is typically wrapped by a type that implements GetSpec.
func (m *managementlib) GetSpec() (spec.Interface, error) {
	err := fmt.Errorf("GetSpec is not supported")
	return nil, err
}
// GetGPUDeviceEdits is unsupported for the managementlib specs; the argument
// is ignored and an error is always returned.
func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
	err := fmt.Errorf("GetGPUDeviceEdits is not supported")
	return nil, err
}
// GetGPUDeviceSpecs is unsupported for the managementlib specs; the arguments
// are ignored and an error is always returned.
func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error) {
	err := fmt.Errorf("GetGPUDeviceSpecs is not supported")
	return nil, err
}
// GetMIGDeviceEdits is unsupported for the managementlib specs; the arguments
// are ignored and an error is always returned.
func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) {
	err := fmt.Errorf("GetMIGDeviceEdits is not supported")
	return nil, err
}
// GetMIGDeviceSpecs is unsupported for the managementlib specs; the arguments
// are ignored and an error is always returned.
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error) {
	err := fmt.Errorf("GetMIGDeviceSpecs is not supported")
	return nil, err
}

View File

@ -23,7 +23,7 @@ testing::toolkit::install() {
READLINK="greadlink" READLINK="greadlink"
fi fi
testing::docker_run::toolkit::shell 'toolkit install --toolkit-root=/usr/local/nvidia/toolkit' testing::docker_run::toolkit::shell 'toolkit install --toolkit-root=/usr/local/nvidia/toolkit --cdi-output-dir=""'
docker run --rm -v "${shared_dir}:/work" alpine sh -c "chown -R ${uid}:${gid} /work/" docker run --rm -v "${shared_dir}:/work" alpine sh -c "chown -R ${uid}:${gid} /work/"
# Ensure toolkit dir is correctly setup # Ensure toolkit dir is correctly setup

View File

@ -23,6 +23,9 @@ import (
"path/filepath" "path/filepath"
"strings" "strings"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
toml "github.com/pelletier/go-toml" toml "github.com/pelletier/go-toml"
log "github.com/sirupsen/logrus" log "github.com/sirupsen/logrus"
"github.com/urfave/cli/v2" "github.com/urfave/cli/v2"
@ -41,12 +44,18 @@ const (
type options struct { type options struct {
DriverRoot string DriverRoot string
DriverRootCtrPath string
ContainerRuntimeMode string ContainerRuntimeMode string
ContainerRuntimeDebug string ContainerRuntimeDebug string
ContainerRuntimeLogLevel string ContainerRuntimeLogLevel string
ContainerCLIDebug string ContainerCLIDebug string
toolkitRoot string toolkitRoot string
cdiOutputDir string
cdiKind string
cdiVendor string
cdiClass string
acceptNVIDIAVisibleDevicesWhenUnprivileged bool acceptNVIDIAVisibleDevicesWhenUnprivileged bool
acceptNVIDIAVisibleDevicesAsVolumeMounts bool acceptNVIDIAVisibleDevicesAsVolumeMounts bool
} }
@ -98,6 +107,12 @@ func main() {
Destination: &opts.DriverRoot, Destination: &opts.DriverRoot,
EnvVars: []string{"NVIDIA_DRIVER_ROOT"}, EnvVars: []string{"NVIDIA_DRIVER_ROOT"},
}, },
&cli.StringFlag{
Name: "driver-root-ctr-path",
Value: DefaultNvidiaDriverRoot,
Destination: &opts.DriverRootCtrPath,
EnvVars: []string{"DRIVER_ROOT_CTR_PATH"},
},
&cli.StringFlag{ &cli.StringFlag{
Name: "nvidia-container-runtime-debug", Name: "nvidia-container-runtime-debug",
Usage: "Specify the location of the debug log file for the NVIDIA Container Runtime", Usage: "Specify the location of the debug log file for the NVIDIA Container Runtime",
@ -140,6 +155,18 @@ func main() {
Destination: &opts.toolkitRoot, Destination: &opts.toolkitRoot,
EnvVars: []string{"TOOLKIT_ROOT"}, EnvVars: []string{"TOOLKIT_ROOT"},
}, },
&cli.StringFlag{
Name: "cdi-output-dir",
Usage: "the directory where the CDI output files are to be written. If this is set to '', no CDI specification is generated.",
Value: "/var/run/cdi",
Destination: &opts.cdiOutputDir,
},
&cli.StringFlag{
Name: "cdi-kind",
Usage: "the vendor string to use for the generated CDI specification",
Value: "management.nvidia.com/gpu",
Destination: &opts.cdiKind,
},
} }
// Update the subcommand flags with the common subcommand flags // Update the subcommand flags with the common subcommand flags
@ -158,6 +185,16 @@ func validateOptions(c *cli.Context, opts *options) error {
return fmt.Errorf("invalid --toolkit-root option: %v", opts.toolkitRoot) return fmt.Errorf("invalid --toolkit-root option: %v", opts.toolkitRoot)
} }
vendor, class := cdi.ParseQualifier(opts.cdiKind)
if err := cdi.ValidateVendorName(vendor); err != nil {
return fmt.Errorf("invalid CDI vendor name: %v", err)
}
if err := cdi.ValidateClassName(class); err != nil {
return fmt.Errorf("invalid CDI class name: %v", err)
}
opts.cdiVendor = vendor
opts.cdiClass = class
return nil return nil
} }
@ -215,7 +252,12 @@ func Install(cli *cli.Context, opts *options) error {
return fmt.Errorf("error installing NVIDIA container toolkit config: %v", err) return fmt.Errorf("error installing NVIDIA container toolkit config: %v", err)
} }
return nil nvidiaCTKPath, err := installContainerToolkitCLI(opts.toolkitRoot)
if err != nil {
return fmt.Errorf("error installing NVIDIA Container Toolkit CLI: %v", err)
}
return generateCDISpec(opts, nvidiaCTKPath)
} }
// installContainerLibraries locates and installs the libraries that are part of // installContainerLibraries locates and installs the libraries that are part of
@ -326,6 +368,19 @@ func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutable
return nil return nil
} }
// installContainerToolkitCLI installs the nvidia-ctk CLI executable and wrapper.
func installContainerToolkitCLI(toolkitDir string) (string, error) {
e := executable{
source: "/usr/bin/nvidia-ctk",
target: executableTarget{
dotfileName: "nvidia-ctk.real",
wrapperName: "nvidia-ctk",
},
}
return e.install(toolkitDir)
}
// installContainerCLI sets up the NVIDIA container CLI executable, copying the executable // installContainerCLI sets up the NVIDIA container CLI executable, copying the executable
// and implementing the required wrapper // and implementing the required wrapper
func installContainerCLI(toolkitRoot string) (string, error) { func installContainerCLI(toolkitRoot string) (string, error) {
@ -509,3 +564,42 @@ func createDirectories(dir ...string) error {
} }
return nil return nil
} }
// generateCDISpec generates a CDI spec for use in management containers and
// saves it to opts.cdiOutputDir.
//
// If opts.cdiOutputDir is empty, spec generation is skipped and nil is
// returned. Otherwise the spec is generated in management mode using
// opts.DriverRootCtrPath as the driver root, nvidiaCTKPath as the path to the
// nvidia-ctk executable, and opts.cdiVendor / opts.cdiClass as the CDI vendor
// and class. Before saving, a root transformer constructed from
// opts.DriverRootCtrPath and opts.DriverRoot is applied to the raw spec
// (presumably rewriting paths from the driver root as seen in this container
// to the driver root on the host; confirm against the transform package).
//
// The file name is derived from the spec itself via cdi.GenerateNameForSpec.
func generateCDISpec(opts *options, nvidiaCTKPath string) error {
	if opts.cdiOutputDir == "" {
		log.Info("Skipping CDI spec generation (no output directory specified)")
		return nil
	}

	cdilib := nvcdi.New(
		nvcdi.WithMode(nvcdi.ModeManagement),
		nvcdi.WithDriverRoot(opts.DriverRootCtrPath),
		nvcdi.WithNVIDIACTKPath(nvidiaCTKPath),
		nvcdi.WithVendor(opts.cdiVendor),
		nvcdi.WithClass(opts.cdiClass),
	)

	spec, err := cdilib.GetSpec()
	if err != nil {
		return fmt.Errorf("failed to generate CDI spec for management containers: %v", err)
	}

	err = transform.NewRootTransformer(
		opts.DriverRootCtrPath,
		opts.DriverRoot,
	).Transform(spec.Raw())
	if err != nil {
		return fmt.Errorf("failed to transform driver root in CDI spec: %v", err)
	}

	name, err := cdi.GenerateNameForSpec(spec.Raw())
	if err != nil {
		return fmt.Errorf("failed to generate CDI name for management containers: %v", err)
	}

	err = spec.Save(filepath.Join(opts.cdiOutputDir, name))
	if err != nil {
		return fmt.Errorf("failed to save CDI spec for management containers: %v", err)
	}

	return nil
}