mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-03-20 11:58:28 +00:00
Merge branch 'CNT-3898/improve-cdi-annotations' into 'main'
Improve handling of environment variable devices in CDI mode See merge request nvidia/container-toolkit/container-toolkit!321
This commit is contained in:
commit
a2adbc1133
@ -69,6 +69,9 @@ func TestGetConfig(t *testing.T) {
|
|||||||
CSV: csvModeConfig{
|
CSV: csvModeConfig{
|
||||||
MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d",
|
MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d",
|
||||||
},
|
},
|
||||||
|
CDI: cdiModeConfig{
|
||||||
|
DefaultKind: "nvidia.com/gpu",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
NVIDIACTKConfig: CTKConfig{
|
NVIDIACTKConfig: CTKConfig{
|
||||||
@ -86,6 +89,7 @@ func TestGetConfig(t *testing.T) {
|
|||||||
"nvidia-container-runtime.log-level = \"debug\"",
|
"nvidia-container-runtime.log-level = \"debug\"",
|
||||||
"nvidia-container-runtime.runtimes = [\"/some/runtime\",]",
|
"nvidia-container-runtime.runtimes = [\"/some/runtime\",]",
|
||||||
"nvidia-container-runtime.mode = \"not-auto\"",
|
"nvidia-container-runtime.mode = \"not-auto\"",
|
||||||
|
"nvidia-container-runtime.modes.cdi.default-kind = \"example.vendor.com/device\"",
|
||||||
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
||||||
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
|
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
|
||||||
},
|
},
|
||||||
@ -102,6 +106,9 @@ func TestGetConfig(t *testing.T) {
|
|||||||
CSV: csvModeConfig{
|
CSV: csvModeConfig{
|
||||||
MountSpecPath: "/not/etc/nvidia-container-runtime/host-files-for-container.d",
|
MountSpecPath: "/not/etc/nvidia-container-runtime/host-files-for-container.d",
|
||||||
},
|
},
|
||||||
|
CDI: cdiModeConfig{
|
||||||
|
DefaultKind: "example.vendor.com/device",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
NVIDIACTKConfig: CTKConfig{
|
NVIDIACTKConfig: CTKConfig{
|
||||||
@ -121,6 +128,8 @@ func TestGetConfig(t *testing.T) {
|
|||||||
"log-level = \"debug\"",
|
"log-level = \"debug\"",
|
||||||
"runtimes = [\"/some/runtime\",]",
|
"runtimes = [\"/some/runtime\",]",
|
||||||
"mode = \"not-auto\"",
|
"mode = \"not-auto\"",
|
||||||
|
"[nvidia-container-runtime.modes.cdi]",
|
||||||
|
"default-kind = \"example.vendor.com/device\"",
|
||||||
"[nvidia-container-runtime.modes.csv]",
|
"[nvidia-container-runtime.modes.csv]",
|
||||||
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
||||||
"[nvidia-ctk]",
|
"[nvidia-ctk]",
|
||||||
@ -139,6 +148,9 @@ func TestGetConfig(t *testing.T) {
|
|||||||
CSV: csvModeConfig{
|
CSV: csvModeConfig{
|
||||||
MountSpecPath: "/not/etc/nvidia-container-runtime/host-files-for-container.d",
|
MountSpecPath: "/not/etc/nvidia-container-runtime/host-files-for-container.d",
|
||||||
},
|
},
|
||||||
|
CDI: cdiModeConfig{
|
||||||
|
DefaultKind: "example.vendor.com/device",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
NVIDIACTKConfig: CTKConfig{
|
NVIDIACTKConfig: CTKConfig{
|
||||||
|
@ -50,6 +50,8 @@ type modesConfig struct {
|
|||||||
type cdiModeConfig struct {
|
type cdiModeConfig struct {
|
||||||
// SpecDirs allows for the default spec dirs for CDI to be overridden
|
// SpecDirs allows for the default spec dirs for CDI to be overridden
|
||||||
SpecDirs []string `toml:"spec-dirs"`
|
SpecDirs []string `toml:"spec-dirs"`
|
||||||
|
// DefaultKind sets the default kind to be used when constructing fully-qualified CDI device names
|
||||||
|
DefaultKind string `toml:"default-kind"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type csvModeConfig struct {
|
type csvModeConfig struct {
|
||||||
@ -94,6 +96,9 @@ func GetDefaultRuntimeConfig() *RuntimeConfig {
|
|||||||
CSV: csvModeConfig{
|
CSV: csvModeConfig{
|
||||||
MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d",
|
MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d",
|
||||||
},
|
},
|
||||||
|
CDI: cdiModeConfig{
|
||||||
|
DefaultKind: "nvidia.com/gpu",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,7 +18,6 @@ package modifier
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||||
@ -38,7 +37,7 @@ type cdiModifier struct {
|
|||||||
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES enviroment variable is
|
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES enviroment variable is
|
||||||
// used to select the devices to include.
|
// used to select the devices to include.
|
||||||
func NewCDIModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
func NewCDIModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
||||||
devices, err := getDevicesFromSpec(ociSpec)
|
devices, err := getDevicesFromSpec(logger, ociSpec, cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
|
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
|
||||||
}
|
}
|
||||||
@ -46,6 +45,7 @@ func NewCDIModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec)
|
|||||||
logger.Debugf("No devices requested; no modification required.")
|
logger.Debugf("No devices requested; no modification required.")
|
||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
logger.Debugf("Creating CDI modifier for devices: %v", devices)
|
||||||
|
|
||||||
specDirs := cdi.DefaultSpecDirs
|
specDirs := cdi.DefaultSpecDirs
|
||||||
if len(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.SpecDirs) > 0 {
|
if len(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.SpecDirs) > 0 {
|
||||||
@ -61,34 +61,36 @@ func NewCDIModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec)
|
|||||||
return m, nil
|
return m, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func getDevicesFromSpec(ociSpec oci.Spec) ([]string, error) {
|
func getDevicesFromSpec(logger *logrus.Logger, ociSpec oci.Spec, defaultKind string) ([]string, error) {
|
||||||
rawSpec, err := ociSpec.Load()
|
rawSpec, err := ociSpec.Load()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_, annotationDevices, err := cdi.ParseAnnotations(rawSpec.Annotations)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
|
||||||
|
}
|
||||||
|
if len(annotationDevices) > 0 {
|
||||||
|
return annotationDevices, nil
|
||||||
|
}
|
||||||
|
|
||||||
image, err := image.NewCUDAImageFromSpec(rawSpec)
|
image, err := image.NewCUDAImageFromSpec(rawSpec)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
envDevices := image.DevicesFromEnvvars(visibleDevicesEnvvar)
|
envDevices := image.DevicesFromEnvvars(visibleDevicesEnvvar)
|
||||||
|
|
||||||
_, annotationDevices, err := cdi.ParseAnnotations(rawSpec.Annotations)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
uniqueDevices := make(map[string]struct{})
|
|
||||||
for _, name := range append(envDevices.List(), annotationDevices...) {
|
|
||||||
if !cdi.IsQualifiedName(name) {
|
|
||||||
name = cdi.QualifiedName("nvidia.com", "gpu", name)
|
|
||||||
}
|
|
||||||
uniqueDevices[name] = struct{}{}
|
|
||||||
}
|
|
||||||
|
|
||||||
var devices []string
|
var devices []string
|
||||||
for name := range uniqueDevices {
|
seen := make(map[string]bool)
|
||||||
|
for _, name := range envDevices.List() {
|
||||||
|
if !cdi.IsQualifiedName(name) {
|
||||||
|
name = fmt.Sprintf("%s=%s", defaultKind, name)
|
||||||
|
}
|
||||||
|
if seen[name] {
|
||||||
|
logger.Debugf("Ignoring duplicate device %q", name)
|
||||||
|
continue
|
||||||
|
}
|
||||||
devices = append(devices, name)
|
devices = append(devices, name)
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -105,21 +107,8 @@ func (m cdiModifier) Modify(spec *specs.Spec) error {
|
|||||||
m.logger.Debugf("The following error was triggered when refreshing the CDI registry: %v", err)
|
m.logger.Debugf("The following error was triggered when refreshing the CDI registry: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
devices := m.devices
|
m.logger.Debugf("Injecting devices using CDI: %v", m.devices)
|
||||||
for _, d := range devices {
|
_, err := registry.InjectDevices(spec, m.devices...)
|
||||||
if d == "nvidia.com/gpu=all" {
|
|
||||||
devices = []string{}
|
|
||||||
for _, candidate := range registry.DeviceDB().ListDevices() {
|
|
||||||
if strings.HasPrefix(candidate, "nvidia.com/gpu=") {
|
|
||||||
devices = append(devices, candidate)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
m.logger.Debugf("Injecting devices using CDI: %v", devices)
|
|
||||||
_, err := registry.InjectDevices(spec, devices...)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to inject CDI devices: %v", err)
|
return fmt.Errorf("failed to inject CDI devices: %v", err)
|
||||||
}
|
}
|
||||||
|
@ -45,9 +45,12 @@ const (
|
|||||||
type options struct {
|
type options struct {
|
||||||
DriverRoot string
|
DriverRoot string
|
||||||
DriverRootCtrPath string
|
DriverRootCtrPath string
|
||||||
|
|
||||||
ContainerRuntimeMode string
|
ContainerRuntimeMode string
|
||||||
|
ContainerRuntimeModesCdiDefaultKind string
|
||||||
ContainerRuntimeDebug string
|
ContainerRuntimeDebug string
|
||||||
ContainerRuntimeLogLevel string
|
ContainerRuntimeLogLevel string
|
||||||
|
|
||||||
ContainerCLIDebug string
|
ContainerCLIDebug string
|
||||||
toolkitRoot string
|
toolkitRoot string
|
||||||
|
|
||||||
@ -129,6 +132,11 @@ func main() {
|
|||||||
Destination: &opts.ContainerRuntimeMode,
|
Destination: &opts.ContainerRuntimeMode,
|
||||||
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODE"},
|
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODE"},
|
||||||
},
|
},
|
||||||
|
&cli.StringFlag{
|
||||||
|
Name: "nvidia-container-runtime-modes.cdi.default-kind",
|
||||||
|
Destination: &opts.ContainerRuntimeModesCdiDefaultKind,
|
||||||
|
EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODES_CDI_DEFAULT_KIND"},
|
||||||
|
},
|
||||||
&cli.StringFlag{
|
&cli.StringFlag{
|
||||||
Name: "nvidia-container-cli-debug",
|
Name: "nvidia-container-cli-debug",
|
||||||
Usage: "Specify the location of the debug log file for the NVIDIA Container CLI",
|
Usage: "Specify the location of the debug log file for the NVIDIA Container CLI",
|
||||||
@ -348,6 +356,7 @@ func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutable
|
|||||||
"nvidia-container-runtime.debug": opts.ContainerRuntimeDebug,
|
"nvidia-container-runtime.debug": opts.ContainerRuntimeDebug,
|
||||||
"nvidia-container-runtime.log-level": opts.ContainerRuntimeLogLevel,
|
"nvidia-container-runtime.log-level": opts.ContainerRuntimeLogLevel,
|
||||||
"nvidia-container-runtime.mode": opts.ContainerRuntimeMode,
|
"nvidia-container-runtime.mode": opts.ContainerRuntimeMode,
|
||||||
|
"nvidia-container-runtime.modes.cdi.default-kind": opts.ContainerRuntimeModesCdiDefaultKind,
|
||||||
"nvidia-container-cli.debug": opts.ContainerCLIDebug,
|
"nvidia-container-cli.debug": opts.ContainerCLIDebug,
|
||||||
}
|
}
|
||||||
for key, value := range debugOptions {
|
for key, value := range debugOptions {
|
||||||
|
Loading…
Reference in New Issue
Block a user