From f00439c93ea6a3c84e8d9307d64b56866773aaa6 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 6 Mar 2023 13:30:40 +0200 Subject: [PATCH 1/3] Add nvidia-container-runtime.modes.cdi.default-kind config option Signed-off-by: Evan Lezar --- internal/config/config_test.go | 12 ++++++++++++ internal/config/runtime.go | 5 +++++ 2 files changed, 17 insertions(+) diff --git a/internal/config/config_test.go b/internal/config/config_test.go index f4501bf3..12683ae2 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -69,6 +69,9 @@ func TestGetConfig(t *testing.T) { CSV: csvModeConfig{ MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d", }, + CDI: cdiModeConfig{ + DefaultKind: "nvidia.com/gpu", + }, }, }, NVIDIACTKConfig: CTKConfig{ @@ -86,6 +89,7 @@ func TestGetConfig(t *testing.T) { "nvidia-container-runtime.log-level = \"debug\"", "nvidia-container-runtime.runtimes = [\"/some/runtime\",]", "nvidia-container-runtime.mode = \"not-auto\"", + "nvidia-container-runtime.modes.cdi.default-kind = \"example.vendor.com/device\"", "nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", "nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"", }, @@ -102,6 +106,9 @@ func TestGetConfig(t *testing.T) { CSV: csvModeConfig{ MountSpecPath: "/not/etc/nvidia-container-runtime/host-files-for-container.d", }, + CDI: cdiModeConfig{ + DefaultKind: "example.vendor.com/device", + }, }, }, NVIDIACTKConfig: CTKConfig{ @@ -121,6 +128,8 @@ func TestGetConfig(t *testing.T) { "log-level = \"debug\"", "runtimes = [\"/some/runtime\",]", "mode = \"not-auto\"", + "[nvidia-container-runtime.modes.cdi]", + "default-kind = \"example.vendor.com/device\"", "[nvidia-container-runtime.modes.csv]", "mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", "[nvidia-ctk]", @@ -139,6 +148,9 @@ func TestGetConfig(t *testing.T) { CSV: csvModeConfig{ MountSpecPath: 
"/not/etc/nvidia-container-runtime/host-files-for-container.d", }, + CDI: cdiModeConfig{ + DefaultKind: "example.vendor.com/device", + }, }, }, NVIDIACTKConfig: CTKConfig{ diff --git a/internal/config/runtime.go b/internal/config/runtime.go index a4a79d38..0248754e 100644 --- a/internal/config/runtime.go +++ b/internal/config/runtime.go @@ -50,6 +50,8 @@ type modesConfig struct { type cdiModeConfig struct { // SpecDirs allows for the default spec dirs for CDI to be overridden SpecDirs []string `toml:"spec-dirs"` + // DefaultKind sets the default kind to be used when constructing fully-qualified CDI device names + DefaultKind string `toml:"default-kind"` } type csvModeConfig struct { @@ -94,6 +96,9 @@ func GetDefaultRuntimeConfig() *RuntimeConfig { CSV: csvModeConfig{ MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d", }, + CDI: cdiModeConfig{ + DefaultKind: "nvidia.com/gpu", + }, }, } From 6d220ed9a2837fd26ded363daa4cd2f9200a9b77 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 6 Mar 2023 13:40:21 +0200 Subject: [PATCH 2/3] Rework selection of devices in CDI mode The following changes are made: * The default-cdi-kind config option is used to convert an envvar entry to a fully-qualified device name * If annotation devices exist, these are used instead of the envvar devices. * The `all` device is no longer treated as a special case and MUST exist in the CDI spec. 
Signed-off-by: Evan Lezar --- internal/modifier/cdi.go | 55 ++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index cffe2967..eb15b4bf 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -18,7 +18,6 @@ package modifier import ( "fmt" - "strings" "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" @@ -38,7 +37,7 @@ type cdiModifier struct { // CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES enviroment variable is // used to select the devices to include. func NewCDIModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) { - devices, err := getDevicesFromSpec(ociSpec) + devices, err := getDevicesFromSpec(logger, ociSpec, cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind) if err != nil { return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err) } @@ -46,6 +45,7 @@ func NewCDIModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) logger.Debugf("No devices requested; no modification required.") return nil, nil } + logger.Debugf("Creating CDI modifier for devices: %v", devices) specDirs := cdi.DefaultSpecDirs if len(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.SpecDirs) > 0 { @@ -61,34 +61,36 @@ func NewCDIModifier(logger *logrus.Logger, cfg *config.Config, ociSpec oci.Spec) return m, nil } -func getDevicesFromSpec(ociSpec oci.Spec) ([]string, error) { +func getDevicesFromSpec(logger *logrus.Logger, ociSpec oci.Spec, defaultKind string) ([]string, error) { rawSpec, err := ociSpec.Load() if err != nil { return nil, fmt.Errorf("failed to load OCI spec: %v", err) } + _, annotationDevices, err := cdi.ParseAnnotations(rawSpec.Annotations) + if err != nil { + return nil, fmt.Errorf("failed to parse container annotations: %v", err) + } + if len(annotationDevices) > 0 { 
+ return annotationDevices, nil + } + image, err := image.NewCUDAImageFromSpec(rawSpec) if err != nil { return nil, err } - envDevices := image.DevicesFromEnvvars(visibleDevicesEnvvar) - _, annotationDevices, err := cdi.ParseAnnotations(rawSpec.Annotations) - if err != nil { - return nil, fmt.Errorf("failed to parse container annotations: %v", err) - } - - uniqueDevices := make(map[string]struct{}) - for _, name := range append(envDevices.List(), annotationDevices...) { - if !cdi.IsQualifiedName(name) { - name = cdi.QualifiedName("nvidia.com", "gpu", name) - } - uniqueDevices[name] = struct{}{} - } - var devices []string - for name := range uniqueDevices { + seen := make(map[string]bool) + for _, name := range envDevices.List() { + if !cdi.IsQualifiedName(name) { + name = fmt.Sprintf("%s=%s", defaultKind, name) + } + if seen[name] { + logger.Debugf("Ignoring duplicate device %q", name) + continue + } devices = append(devices, name) } @@ -105,21 +107,8 @@ func (m cdiModifier) Modify(spec *specs.Spec) error { m.logger.Debugf("The following error was triggered when refreshing the CDI registry: %v", err) } - devices := m.devices - for _, d := range devices { - if d == "nvidia.com/gpu=all" { - devices = []string{} - for _, candidate := range registry.DeviceDB().ListDevices() { - if strings.HasPrefix(candidate, "nvidia.com/gpu=") { - devices = append(devices, candidate) - } - } - break - } - } - - m.logger.Debugf("Injecting devices using CDI: %v", devices) - _, err := registry.InjectDevices(spec, devices...) + m.logger.Debugf("Injecting devices using CDI: %v", m.devices) + _, err := registry.InjectDevices(spec, m.devices...) 
if err != nil { return fmt.Errorf("failed to inject CDI devices: %v", err) } From 1e6fe40c764950f1e801806b309d9bc8e21d339c Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 7 Mar 2023 16:17:49 +0200 Subject: [PATCH 3/3] Allow nvidia-container-runtime.modes.cdi.default-kind to be set This change allows the nvidia-container-runtime.modes.cdi.default-kind to be set in the toolkit-container. The NVIDIA_CONTAINER_RUNTIME_MODES_CDI_DEFAULT_KIND envvar is used. Signed-off-by: Evan Lezar --- tools/container/toolkit/toolkit.go | 31 +++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/tools/container/toolkit/toolkit.go b/tools/container/toolkit/toolkit.go index c3153e97..e1ffb716 100644 --- a/tools/container/toolkit/toolkit.go +++ b/tools/container/toolkit/toolkit.go @@ -43,13 +43,16 @@ const ( ) type options struct { - DriverRoot string - DriverRootCtrPath string - ContainerRuntimeMode string - ContainerRuntimeDebug string - ContainerRuntimeLogLevel string - ContainerCLIDebug string - toolkitRoot string + DriverRoot string + DriverRootCtrPath string + + ContainerRuntimeMode string + ContainerRuntimeModesCdiDefaultKind string + ContainerRuntimeDebug string + ContainerRuntimeLogLevel string + + ContainerCLIDebug string + toolkitRoot string cdiOutputDir string cdiKind string @@ -129,6 +132,11 @@ func main() { Destination: &opts.ContainerRuntimeMode, EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODE"}, }, + &cli.StringFlag{ + Name: "nvidia-container-runtime-modes.cdi.default-kind", + Destination: &opts.ContainerRuntimeModesCdiDefaultKind, + EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODES_CDI_DEFAULT_KIND"}, + }, &cli.StringFlag{ Name: "nvidia-container-cli-debug", Usage: "Specify the location of the debug log file for the NVIDIA Container CLI", @@ -345,10 +353,11 @@ func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutable // Set the debug options if selected debugOptions := map[string]string{ - 
"nvidia-container-runtime.debug": opts.ContainerRuntimeDebug, - "nvidia-container-runtime.log-level": opts.ContainerRuntimeLogLevel, - "nvidia-container-runtime.mode": opts.ContainerRuntimeMode, - "nvidia-container-cli.debug": opts.ContainerCLIDebug, + "nvidia-container-runtime.debug": opts.ContainerRuntimeDebug, + "nvidia-container-runtime.log-level": opts.ContainerRuntimeLogLevel, + "nvidia-container-runtime.mode": opts.ContainerRuntimeMode, + "nvidia-container-runtime.modes.cdi.default-kind": opts.ContainerRuntimeModesCdiDefaultKind, + "nvidia-container-cli.debug": opts.ContainerCLIDebug, } for key, value := range debugOptions { if value == "" {