Mirror of https://github.com/NVIDIA/nvidia-container-toolkit
Add nvidia-ctk info generate-cdi command
This change adds functionality to generate CDI specifications for all devices detected on the system. A specification containing all GPUs and MIG devices is generated. All libraries in the host ldcache that have an NVIDIA driver version suffix are included, as are the required binaries and IPC sockets. A hook (based on the nvidia-ctk hook subcommand) that updates the ldcache in the container for the injected libraries is also added to the CDI specification.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
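The generated specification is written to the path given by `--output` (for example `nvidia-ctk info generate-cdi --output=/etc/cdi/nvidia.yaml`), where CDI-enabled runtimes resolve it by fully-qualified device name. As a sketch of the consumer side, the snippet below uses the registry API from the same `container-device-interface` package this change imports; the `nvidia.com/gpu=0` device name follows the conventions established by this commit, but the consumer code itself is illustrative and not part of the change.

```go
package main

import (
	"fmt"

	"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
	oci "github.com/opencontainers/runtime-spec/specs-go"
)

func main() {
	// Load CDI specs from the default spec directories (e.g. /etc/cdi).
	registry := cdi.GetRegistry()
	_ = registry.Refresh()

	// The OCI runtime spec that the container edits are applied to.
	ociSpec := &oci.Spec{}

	// "nvidia.com/gpu" is the Kind set by generate-cdi; "0" is the name
	// assigned to the first full GPU.
	unresolved, err := registry.InjectDevices(ociSpec, "nvidia.com/gpu=0")
	if err != nil {
		fmt.Printf("injection failed: %v (unresolved: %v)\n", err, unresolved)
		return
	}
	fmt.Println("device nodes, mounts, and the update-ldcache hook injected")
}
```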
This commit is contained in:
parent a0065456d0
commit d37c17857e

cmd/nvidia-ctk/info/generate-cdi/generate-cdi.go (new file, 376 lines)
@@ -0,0 +1,376 @@
```go
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package cdi

import (
	"fmt"
	"io"
	"os"
	"path/filepath"
	"strings"

	"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
	"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
	specs "github.com/container-orchestrated-devices/container-device-interface/specs-go"
	"github.com/sirupsen/logrus"
	"github.com/urfave/cli/v2"
	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
	"sigs.k8s.io/yaml"
)

const (
	nvidiaCTKExecutable      = "nvidia-ctk"
	nvidiaCTKDefaultFilePath = "/usr/bin/" + nvidiaCTKExecutable
)

type command struct {
	logger *logrus.Logger
}

type config struct {
	output   string
	jsonMode bool
}

// NewCommand constructs a generate-cdi command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
	c := command{
		logger: logger,
	}
	return c.build()
}

// build creates the CLI command
func (m command) build() *cli.Command {
	cfg := config{}

	// Create the 'generate-cdi' command
	c := cli.Command{
		Name:  "generate-cdi",
		Usage: "Generate CDI specifications for use with CDI-enabled runtimes",
		Action: func(c *cli.Context) error {
			return m.run(c, &cfg)
		},
	}

	c.Flags = []cli.Flag{
		&cli.StringFlag{
			Name:        "output",
			Usage:       "Specify the file to output the generated CDI specification to. If this is '-' or '' the specification is output to STDOUT",
			Destination: &cfg.output,
		},
		&cli.BoolFlag{
			Name:        "json",
			Usage:       "Output the generated CDI spec in JSON mode instead of YAML",
			Destination: &cfg.jsonMode,
		},
	}

	return &c
}

func (m command) run(c *cli.Context, cfg *config) error {
	spec, err := m.generateSpec()
	if err != nil {
		return fmt.Errorf("failed to generate CDI spec: %v", err)
	}

	var outputTo io.Writer
	if cfg.output == "" || cfg.output == "-" {
		outputTo = os.Stdout
	} else {
		outputFile, err := os.Create(cfg.output)
		if err != nil {
			return fmt.Errorf("failed to create output file: %v", err)
		}
		defer outputFile.Close()
		outputTo = outputFile
	}

	// The extension of the output file, if specified, overrides the --json flag
	if filepath.Ext(cfg.output) == ".json" {
		cfg.jsonMode = true
	} else if filepath.Ext(cfg.output) == ".yaml" || filepath.Ext(cfg.output) == ".yml" {
		cfg.jsonMode = false
	}

	data, err := yaml.Marshal(spec)
	if err != nil {
		return fmt.Errorf("failed to marshal CDI spec: %v", err)
	}

	if cfg.jsonMode {
		data, err = yaml.YAMLToJSONStrict(data)
		if err != nil {
			return fmt.Errorf("failed to convert CDI spec from YAML to JSON: %v", err)
		}
	}

	_, err = outputTo.Write(data)
	if err != nil {
		return fmt.Errorf("failed to write output: %v", err)
	}

	return nil
}

func (m command) generateSpec() (*specs.Spec, error) {
	nvmllib := nvml.New()
	if r := nvmllib.Init(); r != nvml.SUCCESS {
		return nil, r
	}
	defer nvmllib.Shutdown()

	devicelib := device.New(device.WithNvml(nvmllib))

	spec := specs.Spec{
		Version:        specs.CurrentVersion,
		Kind:           "nvidia.com/gpu",
		ContainerEdits: specs.ContainerEdits{},
	}
	err := devicelib.VisitDevices(func(i int, d device.Device) error {
		isMig, err := d.IsMigEnabled()
		if err != nil {
			return fmt.Errorf("failed to check whether device is MIG device: %v", err)
		}
		if isMig {
			return nil
		}
		device, err := generateEditsForDevice(newGPUDevice(i, d))
		if err != nil {
			return fmt.Errorf("failed to generate CDI spec for device %v: %v", i, err)
		}

		spec.Devices = append(spec.Devices, device)
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("failed to generate CDI spec for GPU devices: %v", err)
	}

	err = devicelib.VisitMigDevices(func(i int, d device.Device, j int, m device.MigDevice) error {
		device, err := generateEditsForDevice(newMigDevice(i, j, m))
		if err != nil {
			return fmt.Errorf("failed to generate CDI spec for device %v: %v", i, err)
		}

		spec.Devices = append(spec.Devices, device)
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("failed to generate CDI spec for MIG devices: %v", err)
	}

	libraries, err := m.findLibs(nvmllib)
	if err != nil {
		return nil, fmt.Errorf("failed to locate driver libraries: %v", err)
	}

	binaries, err := m.findBinaries()
	if err != nil {
		return nil, fmt.Errorf("failed to locate driver binaries: %v", err)
	}

	ipcs, err := m.findIPC()
	if err != nil {
		return nil, fmt.Errorf("failed to locate driver IPC sockets: %v", err)
	}

	spec.ContainerEdits.Mounts = generateMountsForPaths(libraries, binaries, ipcs)

	ldcacheUpdateHook := m.generateUpdateLdCacheHook(libraries)

	spec.ContainerEdits.Hooks = []*specs.Hook{ldcacheUpdateHook}

	return &spec, nil
}

func generateEditsForDevice(name string, d deviceInfo) (specs.Device, error) {
	var deviceNodes []*specs.DeviceNode

	deviceNodePaths, err := d.GetDeviceNodes()
	if err != nil {
		return specs.Device{}, fmt.Errorf("failed to get paths for device: %v", err)
	}
	for _, p := range deviceNodePaths {
		deviceNode := specs.DeviceNode{
			Path: p,
			// TODO: Set the host path dependent on the root
			HostPath: p,
		}
		deviceNodes = append(deviceNodes, &deviceNode)
	}
	device := specs.Device{
		Name: name,
		ContainerEdits: specs.ContainerEdits{
			DeviceNodes: deviceNodes,
		},
	}

	return device, nil
}

func (m command) findLibs(nvmllib nvml.Interface) ([]string, error) {
	version, r := nvmllib.SystemGetDriverVersion()
	if r != nvml.SUCCESS {
		return nil, fmt.Errorf("failed to determine driver version: %v", r)
	}
	m.logger.Infof("Using driver version %v", version)

	cache, err := ldcache.New(m.logger, "")
	if err != nil {
		return nil, fmt.Errorf("failed to load ldcache: %v", err)
	}

	libs32, libs64 := cache.List()

	var libs []string
	for _, l := range libs64 {
		if strings.HasSuffix(l, version) {
			m.logger.Infof("found 64-bit driver lib: %v", l)
			libs = append(libs, l)
		}
	}

	for _, l := range libs32 {
		if strings.HasSuffix(l, version) {
			m.logger.Infof("found 32-bit driver lib: %v", l)
			libs = append(libs, l)
		}
	}

	return libs, nil
}

func (m command) findBinaries() ([]string, error) {
	candidates := []string{
		"nvidia-smi",              /* System management interface */
		"nvidia-debugdump",        /* GPU coredump utility */
		"nvidia-persistenced",     /* Persistence mode utility */
		"nvidia-cuda-mps-control", /* Multi process service CLI */
		"nvidia-cuda-mps-server",  /* Multi process service server */
	}

	locator := lookup.NewExecutableLocator(m.logger, "")

	var binaries []string
	for _, c := range candidates {
		targets, err := locator.Locate(c)
		if err != nil {
			m.logger.Warningf("skipping %v: %v", c, err)
			continue
		}

		binaries = append(binaries, targets[0])
	}
	return binaries, nil
}

func (m command) findIPC() ([]string, error) {
	candidates := []string{
		"/var/run/nvidia-persistenced/socket",
		"/var/run/nvidia-fabricmanager/socket",
		// TODO: This can be controlled by the NV_MPS_PIPE_DIR envvar
		"/tmp/nvidia-mps",
	}

	locator := lookup.NewFileLocator(m.logger, "")

	var ipcs []string
	for _, c := range candidates {
		targets, err := locator.Locate(c)
		if err != nil {
			m.logger.Warningf("skipping %v: %v", c, err)
			continue
		}

		ipcs = append(ipcs, targets[0])
	}
	return ipcs, nil
}

func generateMountsForPaths(pathSets ...[]string) []*specs.Mount {
	var mounts []*specs.Mount
	for _, paths := range pathSets {
		for _, p := range paths {
			mount := specs.Mount{
				HostPath: p,
				// We may want to adjust the container path
				ContainerPath: p,
				Type:          "bind",
				Options: []string{
					"ro",
					"nosuid",
					"nodev",
					"bind",
				},
			}
			mounts = append(mounts, &mount)
		}
	}
	return mounts
}

func (m command) generateUpdateLdCacheHook(libraries []string) *specs.Hook {
	locator := lookup.NewExecutableLocator(m.logger, "")

	hookPath := nvidiaCTKDefaultFilePath
	targets, err := locator.Locate(nvidiaCTKExecutable)
	if err != nil {
		m.logger.Warnf("Failed to locate %v: %v", nvidiaCTKExecutable, err)
	} else {
		m.logger.Debugf("Found %v candidates: %v", nvidiaCTKExecutable, targets)
		hookPath = targets[0]
	}
	m.logger.Debugf("Using NVIDIA Container Toolkit CLI path %v", hookPath)

	folders := getLibraryPaths(libraries)

	args := []string{hookPath, "hook", "update-ldcache"}
	for _, f := range folders {
		args = append(args, "--folder", f)
	}

	hook := specs.Hook{
		HookName: cdi.CreateContainerHook,
		Path:     hookPath,
		Args:     args,
	}

	return &hook
}

// getLibraryPaths returns the directories in which the libraries can be found
func getLibraryPaths(libraries []string) []string {
	var paths []string
	checked := make(map[string]bool)

	for _, l := range libraries {
		dir := filepath.Dir(l)
		if dir == "" {
			continue
		}
		if checked[dir] {
			continue
		}
		checked[dir] = true
		paths = append(paths, dir)
	}
	return paths
}
```
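To make the result of `generateSpec` concrete, the sketch below assembles (and marshals, as `run` does) the spec the code above would produce for a host with a single non-MIG GPU. The field layout and hook arguments mirror the functions above; the device minor number, library path, and driver version are illustrative assumptions about the host.

```go
package main

import (
	"fmt"

	"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
	specs "github.com/container-orchestrated-devices/container-device-interface/specs-go"
	"sigs.k8s.io/yaml"
)

func main() {
	spec := specs.Spec{
		Version: specs.CurrentVersion,
		Kind:    "nvidia.com/gpu",
		Devices: []specs.Device{
			{
				Name: "0", // as produced by newGPUDevice(0, d)
				ContainerEdits: specs.ContainerEdits{
					DeviceNodes: []*specs.DeviceNode{
						{Path: "/dev/nvidia0", HostPath: "/dev/nvidia0"},
					},
				},
			},
		},
		ContainerEdits: specs.ContainerEdits{
			Mounts: []*specs.Mount{
				{
					// Assumed driver library path; findLibs selects these by
					// the driver-version suffix in the host ldcache.
					HostPath:      "/usr/lib/x86_64-linux-gnu/libcuda.so.525.60.13",
					ContainerPath: "/usr/lib/x86_64-linux-gnu/libcuda.so.525.60.13",
					Type:          "bind",
					Options:       []string{"ro", "nosuid", "nodev", "bind"},
				},
			},
			Hooks: []*specs.Hook{
				{
					HookName: cdi.CreateContainerHook,
					Path:     "/usr/bin/nvidia-ctk",
					Args: []string{
						"/usr/bin/nvidia-ctk", "hook", "update-ldcache",
						"--folder", "/usr/lib/x86_64-linux-gnu",
					},
				},
			},
		},
	}

	// Marshal to YAML, exactly as run() does for the default output mode.
	data, _ := yaml.Marshal(spec)
	fmt.Println(string(data))
}
```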
cmd/nvidia-ctk/info/generate-cdi/nvml_devices.go (new file, 123 lines)
@@ -0,0 +1,123 @@
```go
/*
 * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cdi

import (
	"fmt"

	"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)

// nvmlDevice wraps an nvml.Device with more functions.
type nvmlDevice struct {
	nvml.Device
}

// nvmlMigDevice allows for specific functions of nvmlDevice to be overridden.
type nvmlMigDevice nvmlDevice

// deviceInfo defines the information required to construct a Device
type deviceInfo interface {
	GetUUID() (string, error)
	GetDeviceNodes() ([]string, error)
}

var _ deviceInfo = (*nvmlDevice)(nil)
var _ deviceInfo = (*nvmlMigDevice)(nil)

func newGPUDevice(i int, gpu device.Device) (string, nvmlDevice) {
	return fmt.Sprintf("%v", i), nvmlDevice{gpu}
}

func newMigDevice(i int, j int, mig device.MigDevice) (string, nvmlMigDevice) {
	return fmt.Sprintf("%v:%v", i, j), nvmlMigDevice{mig}
}

// GetUUID returns the UUID of the device
func (d nvmlDevice) GetUUID() (string, error) {
	uuid, ret := d.Device.GetUUID()
	if ret != nvml.SUCCESS {
		return "", ret
	}
	return uuid, nil
}

// GetUUID returns the UUID of the device
func (d nvmlMigDevice) GetUUID() (string, error) {
	return nvmlDevice(d).GetUUID()
}

// GetDeviceNodes returns the device node paths for a GPU device
func (d nvmlDevice) GetDeviceNodes() ([]string, error) {
	minor, ret := d.GetMinorNumber()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
	}
	path := fmt.Sprintf("/dev/nvidia%d", minor)

	return []string{path}, nil
}

// GetDeviceNodes returns the device node paths for a MIG device
func (d nvmlMigDevice) GetDeviceNodes() ([]string, error) {
	parent, ret := d.GetDeviceHandleFromMigDeviceHandle()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting parent device: %v", ret)
	}
	minor, ret := parent.GetMinorNumber()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
	}
	parentPath := fmt.Sprintf("/dev/nvidia%d", minor)

	migCaps, err := nvcaps.NewMigCaps()
	if err != nil {
		return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
	}

	gi, ret := d.GetGpuInstanceId()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret)
	}

	ci, ret := d.GetComputeInstanceId()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret)
	}

	giCap := nvcaps.NewGPUInstanceCap(minor, gi)
	giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
	if err != nil {
		return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
	}

	ciCap := nvcaps.NewComputeInstanceCap(minor, gi, ci)
	ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
	if err != nil {
		return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
	}

	devicePaths := []string{
		parentPath,
		giCapDevicePath,
		ciCapDevicePath,
	}

	return devicePaths, nil
}
```
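For a full GPU, `GetDeviceNodes` returns the single `/dev/nvidia<minor>` node; for a MIG device it returns the parent node plus the GPU-instance and compute-instance capability nodes resolved via `nvcaps` (on most hosts these live under `/dev/nvidia-caps/`). The naming scheme that ties these devices back to the CDI spec is small enough to show directly; the indices below are assumptions for illustration.

```go
package main

import "fmt"

func main() {
	// Mirrors the naming scheme from newGPUDevice and newMigDevice above:
	// full GPUs are named by index, MIG devices by <gpu>:<instance>.
	gpuName := fmt.Sprintf("%v", 0)       // "0"
	migName := fmt.Sprintf("%v:%v", 0, 1) // "0:1"

	// Fully-qualified CDI device names combine these with the Kind
	// set in generate-cdi.go.
	fmt.Println("nvidia.com/gpu=" + gpuName) // nvidia.com/gpu=0
	fmt.Println("nvidia.com/gpu=" + migName) // nvidia.com/gpu=0:1
}
```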
cmd/nvidia-ctk/info/info.go (new file, 50 lines)
@@ -0,0 +1,50 @@
```go
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package info

import (
	cdi "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/info/generate-cdi"
	"github.com/sirupsen/logrus"
	"github.com/urfave/cli/v2"
)

type command struct {
	logger *logrus.Logger
}

// NewCommand constructs an info command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
	c := command{
		logger: logger,
	}
	return c.build()
}

// build creates the CLI command
func (m command) build() *cli.Command {
	// Create the 'info' command
	info := cli.Command{
		Name:  "info",
		Usage: "Provide information about the system",
	}

	info.Subcommands = []*cli.Command{
		cdi.NewCommand(m.logger),
	}

	return &info
}
```
cmd/nvidia-ctk/main.go (modified)
@@ -20,8 +20,10 @@ import (
```go
	"os"

	"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook"
	infoCLI "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/info"
	"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/runtime"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/info"

	log "github.com/sirupsen/logrus"
	cli "github.com/urfave/cli/v2"
)
```
@@ -72,6 +74,7 @@ func main() {
```go
	c.Commands = []*cli.Command{
		hook.NewCommand(logger),
		runtime.NewCommand(logger),
		infoCLI.NewCommand(logger),
	}

	// Run the CLI
```