Merge branch 'CNT-3898/improve-cdi-annotations' into 'main'

Improve handling of environment variable devices in CDI mode

See merge request nvidia/container-toolkit/container-toolkit!321
This commit is contained in:
Evan Lezar 2023-03-08 04:37:41 +00:00
commit a2adbc1133
4 changed files with 59 additions and 44 deletions

View File

@ -69,6 +69,9 @@ func TestGetConfig(t *testing.T) {
CSV: csvModeConfig{
MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d",
},
CDI: cdiModeConfig{
DefaultKind: "nvidia.com/gpu",
},
},
},
NVIDIACTKConfig: CTKConfig{
@ -86,6 +89,7 @@ func TestGetConfig(t *testing.T) {
"nvidia-container-runtime.log-level = \"debug\"",
"nvidia-container-runtime.runtimes = [\"/some/runtime\",]",
"nvidia-container-runtime.mode = \"not-auto\"",
"nvidia-container-runtime.modes.cdi.default-kind = \"example.vendor.com/device\"",
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
},
@ -102,6 +106,9 @@ func TestGetConfig(t *testing.T) {
CSV: csvModeConfig{
MountSpecPath: "/not/etc/nvidia-container-runtime/host-files-for-container.d",
},
CDI: cdiModeConfig{
DefaultKind: "example.vendor.com/device",
},
},
},
NVIDIACTKConfig: CTKConfig{
@ -121,6 +128,8 @@ func TestGetConfig(t *testing.T) {
"log-level = \"debug\"",
"runtimes = [\"/some/runtime\",]",
"mode = \"not-auto\"",
"[nvidia-container-runtime.modes.cdi]",
"default-kind = \"example.vendor.com/device\"",
"[nvidia-container-runtime.modes.csv]",
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"[nvidia-ctk]",
@ -139,6 +148,9 @@ func TestGetConfig(t *testing.T) {
CSV: csvModeConfig{
MountSpecPath: "/not/etc/nvidia-container-runtime/host-files-for-container.d",
},
CDI: cdiModeConfig{
DefaultKind: "example.vendor.com/device",
},
},
},
NVIDIACTKConfig: CTKConfig{

View File

@ -50,6 +50,8 @@ type modesConfig struct {
type cdiModeConfig struct {
// SpecDirs allows for the default spec dirs for CDI to be overridden
SpecDirs []string `toml:"spec-dirs"`
// DefaultKind sets the default kind to be used when constructing fully-qualified CDI device names
DefaultKind string `toml:"default-kind"`
}
type csvModeConfig struct {
@ -94,6 +96,9 @@ func GetDefaultRuntimeConfig() *RuntimeConfig {
CSV: csvModeConfig{
MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d",
},
CDI: cdiModeConfig{
DefaultKind: "nvidia.com/gpu",
},
},
}

View File

@ -18,7 +18,6 @@ package modifier
import (
"fmt"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
@ -38,7 +37,7 @@ type cdiModifier struct {
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES enviroment variable is
// used to select the devices to include.
func NewCDIModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
devices, err := getDevicesFromSpec(ociSpec)
devices, err := getDevicesFromSpec(logger, ociSpec, cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind)
if err != nil {
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
}
@ -46,6 +45,7 @@ func NewCDIModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec)
logger.Debugf("No devices requested; no modification required.")
return nil, nil
}
logger.Debugf("Creating CDI modifier for devices: %v", devices)
specDirs := cdi.DefaultSpecDirs
if len(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.SpecDirs) > 0 {
@ -61,34 +61,36 @@ func NewCDIModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec)
return m, nil
}
func getDevicesFromSpec(ociSpec oci.Spec) ([]string, error) {
func getDevicesFromSpec(logger *logrus.Logger, ociSpec oci.Spec, defaultKind string) ([]string, error) {
rawSpec, err := ociSpec.Load()
if err != nil {
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
}
_, annotationDevices, err := cdi.ParseAnnotations(rawSpec.Annotations)
if err != nil {
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
}
if len(annotationDevices) > 0 {
return annotationDevices, nil
}
image, err := image.NewCUDAImageFromSpec(rawSpec)
if err != nil {
return nil, err
}
envDevices := image.DevicesFromEnvvars(visibleDevicesEnvvar)
_, annotationDevices, err := cdi.ParseAnnotations(rawSpec.Annotations)
if err != nil {
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
}
uniqueDevices := make(map[string]struct{})
for _, name := range append(envDevices.List(), annotationDevices...) {
if !cdi.IsQualifiedName(name) {
name = cdi.QualifiedName("nvidia.com", "gpu", name)
}
uniqueDevices[name] = struct{}{}
}
var devices []string
for name := range uniqueDevices {
seen := make(map[string]bool)
for _, name := range envDevices.List() {
if !cdi.IsQualifiedName(name) {
name = fmt.Sprintf("%s=%s", defaultKind, name)
}
if seen[name] {
logger.Debugf("Ignoring duplicate device %q", name)
continue
}
devices = append(devices, name)
}
@ -105,21 +107,8 @@ func (m cdiModifier) Modify(spec *specs.Spec) error {
m.logger.Debugf("The following error was triggered when refreshing the CDI registry: %v", err)
}
devices := m.devices
for _, d := range devices {
if d == "nvidia.com/gpu=all" {
devices = []string{}
for _, candidate := range registry.DeviceDB().ListDevices() {
if strings.HasPrefix(candidate, "nvidia.com/gpu=") {
devices = append(devices, candidate)
}
}
break
}
}
m.logger.Debugf("Injecting devices using CDI: %v", devices)
_, err := registry.InjectDevices(spec, devices...)
m.logger.Debugf("Injecting devices using CDI: %v", m.devices)
_, err := registry.InjectDevices(spec, m.devices...)
if err != nil {
return fmt.Errorf("failed to inject CDI devices: %v", err)
}

View File

@ -43,13 +43,16 @@ const (
)
type options struct {
DriverRoot string
DriverRootCtrPath string
ContainerRuntimeMode string
ContainerRuntimeDebug string
ContainerRuntimeLogLevel string
ContainerCLIDebug string
toolkitRoot string
DriverRoot string
DriverRootCtrPath string
ContainerRuntimeMode string
ContainerRuntimeModesCdiDefaultKind string
ContainerRuntimeDebug string
ContainerRuntimeLogLevel string
ContainerCLIDebug string
toolkitRoot string
cdiOutputDir string
cdiKind string
@ -129,6 +132,11 @@ func main() {
Destination: &opts.ContainerRuntimeMode,
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODE"},
},
&cli.StringFlag{
Name: "nvidia-container-runtime-modes.cdi.default-kind",
Destination: &opts.ContainerRuntimeModesCdiDefaultKind,
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODES_CDI_DEFAULT_KIND"},
},
&cli.StringFlag{
Name: "nvidia-container-cli-debug",
Usage: "Specify the location of the debug log file for the NVIDIA Container CLI",
@ -345,10 +353,11 @@ func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutable
// Set the debug options if selected
debugOptions := map[string]string{
"nvidia-container-runtime.debug": opts.ContainerRuntimeDebug,
"nvidia-container-runtime.log-level": opts.ContainerRuntimeLogLevel,
"nvidia-container-runtime.mode": opts.ContainerRuntimeMode,
"nvidia-container-cli.debug": opts.ContainerCLIDebug,
"nvidia-container-runtime.debug": opts.ContainerRuntimeDebug,
"nvidia-container-runtime.log-level": opts.ContainerRuntimeLogLevel,
"nvidia-container-runtime.mode": opts.ContainerRuntimeMode,
"nvidia-container-runtime.modes.cdi.default-kind": opts.ContainerRuntimeModesCdiDefaultKind,
"nvidia-container-cli.debug": opts.ContainerCLIDebug,
}
for key, value := range debugOptions {
if value == "" {