diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b1a34c1..077c46fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ * Generate a simplified CDI specification by default. This means that entities in the common edits in a spec are not included in device definitions. * Also return an error from the nvcdi.New constructor instead of panicing. * Detect XOrg libraries for injection and CDI spec generation. +* Add `nvidia-container-runtime.modes.cdi.annotation-prefixes` config option that allows the CDI annotation prefixes that are read to be overridden. * [libnvidia-container] Fix segmentation fault when RPC initialization fails. * [libnvidia-container] Build centos variants of the NVIDIA Container Library with static libtirpc v1.3.2. diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 4c378d55..72f03336 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -71,7 +71,8 @@ func TestGetConfig(t *testing.T) { MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d", }, CDI: cdiModeConfig{ - DefaultKind: "nvidia.com/gpu", + DefaultKind: "nvidia.com/gpu", + AnnotationPrefixes: []string{"cdi.k8s.io/"}, }, }, }, @@ -92,6 +93,7 @@ func TestGetConfig(t *testing.T) { "nvidia-container-runtime.runtimes = [\"/some/runtime\",]", "nvidia-container-runtime.mode = \"not-auto\"", "nvidia-container-runtime.modes.cdi.default-kind = \"example.vendor.com/device\"", + "nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]", "nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", "nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"", }, @@ -111,6 +113,10 @@ func TestGetConfig(t *testing.T) { }, CDI: cdiModeConfig{ DefaultKind: "example.vendor.com/device", + AnnotationPrefixes: []string{ + "cdi.k8s.io/", + "example.vendor.com/", + }, }, }, }, @@ -134,6 +140,7 @@ func TestGetConfig(t *testing.T) { "mode = \"not-auto\"", "[nvidia-container-runtime.modes.cdi]", "default-kind = \"example.vendor.com/device\"", + "annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]", "[nvidia-container-runtime.modes.csv]", "mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", "[nvidia-ctk]", @@ -155,6 +162,10 @@ func TestGetConfig(t *testing.T) { }, CDI: cdiModeConfig{ DefaultKind: "example.vendor.com/device", + AnnotationPrefixes: []string{ + "cdi.k8s.io/", + "example.vendor.com/", + }, }, }, }, diff --git a/internal/config/engine/containerd/config_v1.go b/internal/config/engine/containerd/config_v1.go index d68c9d18..a470a581 100644 --- a/internal/config/engine/containerd/config_v1.go +++ b/internal/config/engine/containerd/config_v1.go @@ -50,12 +50,15 @@ func (c *ConfigV1) AddRuntime(name string, path string, setAsDefault bool) error config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "runtime_engine"}, "") config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "privileged_without_host_devices"}, false) } - cdiAnnotations := []interface{}{"cdi.k8s.io/*"} - containerAnnotations, ok := config.GetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "container_annotations"}).([]interface{}) - if ok && containerAnnotations != nil { - cdiAnnotations = append(containerAnnotations, cdiAnnotations...) + + if len(c.ContainerAnnotations) > 0 { + annotations, err := (*Config)(c).getRuntimeAnnotations([]string{"plugins", "cri", "containerd", "runtimes", name, "container_annotations"}) + if err != nil { + return err + } + annotations = append(c.ContainerAnnotations, annotations...) + config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "container_annotations"}, annotations) } - config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "container_annotations"}, cdiAnnotations) config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "options", "BinaryName"}, path) config.SetPath([]string{"plugins", "cri", "containerd", "runtimes", name, "options", "Runtime"}, path) diff --git a/internal/config/engine/containerd/config_v2.go b/internal/config/engine/containerd/config_v2.go index 8156e470..fa1287fa 100644 --- a/internal/config/engine/containerd/config_v2.go +++ b/internal/config/engine/containerd/config_v2.go @@ -45,12 +45,14 @@ func (c *Config) AddRuntime(name string, path string, setAsDefault bool) error { config.SetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "privileged_without_host_devices"}, false) } - cdiAnnotations := []interface{}{"cdi.k8s.io/*"} - containerAnnotations, ok := config.GetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "container_annotations"}).([]interface{}) - if ok && containerAnnotations != nil { - cdiAnnotations = append(containerAnnotations, cdiAnnotations...) + if len(c.ContainerAnnotations) > 0 { + annotations, err := c.getRuntimeAnnotations([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "container_annotations"}) + if err != nil { + return err + } + annotations = append(c.ContainerAnnotations, annotations...) + config.SetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "container_annotations"}, annotations) } - config.SetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "container_annotations"}, cdiAnnotations) config.SetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes", name, "options", "BinaryName"}, path) @@ -62,6 +64,32 @@ func (c *Config) AddRuntime(name string, path string, setAsDefault bool) error { return nil } +func (c *Config) getRuntimeAnnotations(path []string) ([]string, error) { + if c == nil || c.Tree == nil { + return nil, nil + } + + config := *c.Tree + if !config.HasPath(path) { + return nil, nil + } + annotationsI, ok := config.GetPath(path).([]interface{}) + if !ok { + return nil, fmt.Errorf("invalid annotations: %v", annotationsI) + } + + var annotations []string + for _, annotation := range annotationsI { + a, ok := annotation.(string) + if !ok { + return nil, fmt.Errorf("invalid annotation: %v", annotation) + } + annotations = append(annotations, a) + } + + return annotations, nil +} + // DefaultRuntime returns the default runtime for the cri-o config func (c Config) DefaultRuntime() string { if runtime, ok := c.GetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "default_runtime_name"}).(string); ok { diff --git a/internal/config/engine/containerd/containerd.go b/internal/config/engine/containerd/containerd.go index 3ea1c65e..e163c41d 100644 --- a/internal/config/engine/containerd/containerd.go +++ b/internal/config/engine/containerd/containerd.go @@ -26,6 +26,7 @@ type Config struct { *toml.Tree RuntimeType string UseDefaultRuntimeName bool + ContainerAnnotations []string } // New creates a containerd config with the specified options diff --git a/internal/config/engine/containerd/option.go b/internal/config/engine/containerd/option.go index 1c0497df..92f2073d 100644 --- a/internal/config/engine/containerd/option.go +++ b/internal/config/engine/containerd/option.go @@ -30,9 +30,10 @@ const ( ) type builder struct { - path string - runtimeType string - useLegacyConfig bool + path string + runtimeType string + useLegacyConfig bool + containerAnnotations []string } // Option defines a function that can be used to configure the config builder @@ -59,6 +60,13 @@ func WithUseLegacyConfig(useLegacyConfig bool) Option { } } +// WithContainerAnnotations sets the container annotations for the config builder +func WithContainerAnnotations(containerAnnotations ...string) Option { + return func(b *builder) { + b.containerAnnotations = containerAnnotations + } +} + func (b *builder) build() (engine.Interface, error) { if b.path == "" { return nil, fmt.Errorf("config path is empty") @@ -74,6 +82,7 @@ func (b *builder) build() (engine.Interface, error) { } config.RuntimeType = b.runtimeType config.UseDefaultRuntimeName = !b.useLegacyConfig + config.ContainerAnnotations = b.containerAnnotations version, err := config.parseVersion(b.useLegacyConfig) if err != nil { diff --git a/internal/config/runtime.go b/internal/config/runtime.go index 0248754e..8ff8c6e8 100644 --- a/internal/config/runtime.go +++ b/internal/config/runtime.go @@ -19,6 +19,7 @@ package config import ( "fmt" + "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/pelletier/go-toml" "github.com/sirupsen/logrus" ) @@ -52,6 +53,8 @@ type cdiModeConfig struct { SpecDirs []string `toml:"spec-dirs"` // DefaultKind sets the default kind to be used when constructing fully-qualified CDI device names DefaultKind string `toml:"default-kind"` + // AnnotationPrefixes sets the allowed prefixes for CDI annotation-based device injection + AnnotationPrefixes []string `toml:"annotation-prefixes"` } type csvModeConfig struct { @@ -98,6 +101,9 @@ func GetDefaultRuntimeConfig() *RuntimeConfig { }, CDI: cdiModeConfig{ DefaultKind: "nvidia.com/gpu", + AnnotationPrefixes: []string{ + cdi.AnnotationPrefix, + }, }, }, } diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index a5f42eb8..879967b9 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -18,6 +18,7 @@ package modifier import ( "fmt" + "strings" "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" @@ -67,7 +68,7 @@ func getDevicesFromSpec(logger *logrus.Logger, ociSpec oci.Spec, cfg *config.Con return nil, fmt.Errorf("failed to load OCI spec: %v", err) } - _, annotationDevices, err := cdi.ParseAnnotations(rawSpec.Annotations) + annotationDevices, err := getAnnotationDevices(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes, rawSpec.Annotations) if err != nil { return nil, fmt.Errorf("failed to parse container annotations: %v", err) } @@ -107,6 +108,38 @@ func getDevicesFromSpec(logger *logrus.Logger, ociSpec oci.Spec, cfg *config.Con return nil, nil } +// getAnnotationDevices returns a list of devices specified in the annotations. +// Keys starting with the specified prefixes are considered and expected to contain a comma-separated list of +// fully-qualified CDI devices names. If any device name is not fully-quality an error is returned. +// The list of returned devices is deduplicated. +func getAnnotationDevices(prefixes []string, annotations map[string]string) ([]string, error) { + devicesByKey := make(map[string][]string) + for key, value := range annotations { + for _, prefix := range prefixes { + if strings.HasPrefix(key, prefix) { + devicesByKey[key] = strings.Split(value, ",") + } + } + } + + seen := make(map[string]bool) + var annotationDevices []string + for key, devices := range devicesByKey { + for _, device := range devices { + if !cdi.IsQualifiedName(device) { + return nil, fmt.Errorf("invalid device name %q in annotation %q", device, key) + } + if seen[device] { + continue + } + annotationDevices = append(annotationDevices, device) + seen[device] = true + } + } + + return annotationDevices, nil +} + // Modify loads the CDI registry and injects the specified CDI devices into the OCI runtime specification. func (m cdiModifier) Modify(spec *specs.Spec) error { registry := cdi.GetRegistry( diff --git a/internal/modifier/cdi_test.go b/internal/modifier/cdi_test.go new file mode 100644 index 00000000..88ff697a --- /dev/null +++ b/internal/modifier/cdi_test.go @@ -0,0 +1,92 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package modifier + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestGetAnnotationDevices(t *testing.T) { + testCases := []struct { + description string + prefixes []string + annotations map[string]string + expectedDevices []string + expectedError error + }{ + { + description: "no annotations", + }, + { + description: "no matching annotations", + prefixes: []string{"not-prefix/"}, + annotations: map[string]string{ + "prefix/foo": "example.com/device=bar", + }, + }, + { + description: "single matching annotation", + prefixes: []string{"prefix/"}, + annotations: map[string]string{ + "prefix/foo": "example.com/device=bar", + }, + expectedDevices: []string{"example.com/device=bar"}, + }, + { + description: "multiple matching annotations", + prefixes: []string{"prefix/", "another-prefix/"}, + annotations: map[string]string{ + "prefix/foo": "example.com/device=bar", + "another-prefix/bar": "example.com/device=baz", + }, + expectedDevices: []string{"example.com/device=bar", "example.com/device=baz"}, + }, + { + description: "multiple matching annotations with duplicate devices", + prefixes: []string{"prefix/", "another-prefix/"}, + annotations: map[string]string{ + "prefix/foo": "example.com/device=bar", + "another-prefix/bar": "example.com/device=bar", + }, + expectedDevices: []string{"example.com/device=bar"}, + }, + { + description: "invalid devices", + prefixes: []string{"prefix/"}, + annotations: map[string]string{ + "prefix/foo": "example.com/device", + }, + expectedError: fmt.Errorf("invalid device %q", "example.com/device"), + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + devices, err := getAnnotationDevices(tc.prefixes, tc.annotations) + if tc.expectedError != nil { + require.Error(t, err) + return + } + + require.NoError(t, err) + require.ElementsMatch(t, tc.expectedDevices, devices) + }) + } +} diff --git a/tools/container/containerd/config_v1_test.go b/tools/container/containerd/config_v1_test.go index f3c0ce1d..9625d90b 100644 --- a/tools/container/containerd/config_v1_test.go +++ b/tools/container/containerd/config_v1_test.go @@ -332,6 +332,7 @@ func TestUpdateV1Config(t *testing.T) { Tree: config, UseDefaultRuntimeName: true, RuntimeType: runtimeType, + ContainerAnnotations: []string{"cdi.k8s.io/*"}, } err = UpdateConfig(v1, o) @@ -585,6 +586,7 @@ func TestUpdateV1ConfigWithRuncPresent(t *testing.T) { Tree: config, UseDefaultRuntimeName: true, RuntimeType: runtimeType, + ContainerAnnotations: []string{"cdi.k8s.io/*"}, } err = UpdateConfig(v1, o) diff --git a/tools/container/containerd/config_v2_test.go b/tools/container/containerd/config_v2_test.go index d54340c2..c14fb58f 100644 --- a/tools/container/containerd/config_v2_test.go +++ b/tools/container/containerd/config_v2_test.go @@ -279,8 +279,9 @@ func TestUpdateV2Config(t *testing.T) { require.NoError(t, err) v2 := &containerd.Config{ - Tree: config, - RuntimeType: runtimeType, + Tree: config, + RuntimeType: runtimeType, + ContainerAnnotations: []string{"cdi.k8s.io/*"}, } err = UpdateConfig(v2, o) @@ -520,8 +521,9 @@ func TestUpdateV2ConfigWithRuncPresent(t *testing.T) { require.NoError(t, err) v2 := &containerd.Config{ - Tree: config, - RuntimeType: runtimeType, + Tree: config, + RuntimeType: runtimeType, + ContainerAnnotations: []string{"cdi.k8s.io/*"}, } err = UpdateConfig(v2, o) diff --git a/tools/container/containerd/containerd.go b/tools/container/containerd/containerd.go index b87769a3..011a27d9 100644 --- a/tools/container/containerd/containerd.go +++ b/tools/container/containerd/containerd.go @@ -72,6 +72,8 @@ type options struct { hostRootMount string runtimeDir string useLegacyConfig bool + + ContainerRuntimeModesCDIAnnotationPrefixes cli.StringSlice } func main() { @@ -173,6 +175,11 @@ func main() { Destination: &options.useLegacyConfig, EnvVars: []string{"CONTAINERD_USE_LEGACY_CONFIG"}, }, + &cli.StringSliceFlag{ + Name: "nvidia-container-runtime-modes.cdi.annotation-prefixes", + Destination: &options.ContainerRuntimeModesCDIAnnotationPrefixes, + EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODES_CDI_ANNOTATION_PREFIXES"}, + }, } // Update the subcommand flags with the common subcommand flags @@ -199,6 +206,7 @@ func Setup(c *cli.Context, o *options) error { containerd.WithPath(o.config), containerd.WithRuntimeType(o.runtimeType), containerd.WithUseLegacyConfig(o.useLegacyConfig), + containerd.WithContainerAnnotations(o.containerAnnotationsFromCDIPrefixes()...), ) if err != nil { return fmt.Errorf("unable to load config: %v", err) @@ -241,6 +249,7 @@ func Cleanup(c *cli.Context, o *options) error { containerd.WithPath(o.config), containerd.WithRuntimeType(o.runtimeType), containerd.WithUseLegacyConfig(o.useLegacyConfig), + containerd.WithContainerAnnotations(o.containerAnnotationsFromCDIPrefixes()...), ) if err != nil { return fmt.Errorf("unable to load config: %v", err) @@ -434,3 +443,13 @@ func RestartContainerdSystemd(hostRootMount string) error { return nil } + +// containerAnnotationsFromCDIPrefixes returns the container annotations to set for the given CDI prefixes. +func (o *options) containerAnnotationsFromCDIPrefixes() []string { + var annotations []string + for _, prefix := range o.ContainerRuntimeModesCDIAnnotationPrefixes.Value() { + annotations = append(annotations, prefix+"*") + } + + return annotations +} diff --git a/tools/container/toolkit/toolkit.go b/tools/container/toolkit/toolkit.go index e33a9d2b..1f8d9b40 100644 --- a/tools/container/toolkit/toolkit.go +++ b/tools/container/toolkit/toolkit.go @@ -47,10 +47,12 @@ type options struct { DriverRoot string DriverRootCtrPath string - ContainerRuntimeMode string - ContainerRuntimeModesCdiDefaultKind string - ContainerRuntimeDebug string - ContainerRuntimeLogLevel string + ContainerRuntimeMode string + ContainerRuntimeDebug string + ContainerRuntimeLogLevel string + + ContainerRuntimeModesCdiDefaultKind string + ContainerRuntimeModesCDIAnnotationPrefixes cli.StringSlice ContainerRuntimeHookSkipModeDetection bool @@ -65,6 +67,8 @@ type options struct { acceptNVIDIAVisibleDevicesWhenUnprivileged bool acceptNVIDIAVisibleDevicesAsVolumeMounts bool + + ignoreErrors bool } func main() { @@ -121,26 +125,34 @@ func main() { EnvVars: []string{"DRIVER_ROOT_CTR_PATH"}, }, &cli.StringFlag{ - Name: "nvidia-container-runtime-debug", + Name: "nvidia-container-runtime.debug", + Aliases: []string{"nvidia-container-runtime-debug"}, Usage: "Specify the location of the debug log file for the NVIDIA Container Runtime", Destination: &opts.ContainerRuntimeDebug, EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_DEBUG"}, }, &cli.StringFlag{ - Name: "nvidia-container-runtime-debug-log-level", + Name: "nvidia-container-runtime.log-level", + Aliases: []string{"nvidia-container-runtime-debug-log-level"}, Destination: &opts.ContainerRuntimeLogLevel, EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_LOG_LEVEL"}, }, &cli.StringFlag{ - Name: "nvidia-container-runtime-mode", + Name: "nvidia-container-runtime.mode", + Aliases: []string{"nvidia-container-runtime-mode"}, Destination: &opts.ContainerRuntimeMode, EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODE"}, }, &cli.StringFlag{ - Name: "nvidia-container-runtime-modes.cdi.default-kind", + Name: "nvidia-container-runtime.modes.cdi.default-kind", Destination: &opts.ContainerRuntimeModesCdiDefaultKind, EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODES_CDI_DEFAULT_KIND"}, }, + &cli.StringSliceFlag{ + Name: "nvidia-container-runtime.modes.cdi.annotation-prefixes", + Destination: &opts.ContainerRuntimeModesCDIAnnotationPrefixes, + EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_MODES_CDI_ANNOTATION_PREFIXES"}, + }, &cli.BoolFlag{ Name: "nvidia-container-runtime-hook.skip-mode-detection", Value: true, @@ -148,7 +160,8 @@ func main() { EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_HOOK_SKIP_MODE_DETECTION"}, }, &cli.StringFlag{ - Name: "nvidia-container-cli-debug", + Name: "nvidia-container-cli.debug", + Aliases: []string{"nvidia-container-cli-debug"}, Usage: "Specify the location of the debug log file for the NVIDIA Container CLI", Destination: &opts.ContainerCLIDebug, EnvVars: []string{"NVIDIA_CONTAINER_CLI_DEBUG"}, @@ -194,6 +207,12 @@ func main() { Destination: &opts.cdiKind, EnvVars: []string{"CDI_KIND"}, }, + &cli.BoolFlag{ + Name: "ignore-errors", + Usage: "ignore errors when installing the NVIDIA Container toolkit. This is used for testing purposes only.", + Hidden: true, + Destination: &opts.ignoreErrors, + }, } // Update the subcommand flags with the common subcommand flags @@ -242,46 +261,62 @@ func Install(cli *cli.Context, opts *options) error { log.Infof("Removing existing NVIDIA container toolkit installation") err := os.RemoveAll(opts.toolkitRoot) - if err != nil { + if err != nil && !opts.ignoreErrors { return fmt.Errorf("error removing toolkit directory: %v", err) + } else if err != nil { + log.Errorf("Ignoring error: %v", fmt.Errorf("error removing toolkit directory: %v", err)) } toolkitConfigDir := filepath.Join(opts.toolkitRoot, ".config", "nvidia-container-runtime") toolkitConfigPath := filepath.Join(toolkitConfigDir, configFilename) err = createDirectories(opts.toolkitRoot, toolkitConfigDir) - if err != nil { + if err != nil && !opts.ignoreErrors { return fmt.Errorf("could not create required directories: %v", err) + } else if err != nil { + log.Errorf("Ignoring error: %v", fmt.Errorf("could not create required directories: %v", err)) } err = installContainerLibraries(opts.toolkitRoot) - if err != nil { + if err != nil && !opts.ignoreErrors { return fmt.Errorf("error installing NVIDIA container library: %v", err) + } else if err != nil { + log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA container library: %v", err)) } err = installContainerRuntimes(opts.toolkitRoot, opts.DriverRoot) - if err != nil { + if err != nil && !opts.ignoreErrors { return fmt.Errorf("error installing NVIDIA container runtime: %v", err) + } else if err != nil { + log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA container runtime: %v", err)) } nvidiaContainerCliExecutable, err := installContainerCLI(opts.toolkitRoot) - if err != nil { + if err != nil && !opts.ignoreErrors { return fmt.Errorf("error installing NVIDIA container CLI: %v", err) + } else if err != nil { + log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA container CLI: %v", err)) } _, err = installRuntimeHook(opts.toolkitRoot, toolkitConfigPath) - if err != nil { + if err != nil && !opts.ignoreErrors { return fmt.Errorf("error installing NVIDIA container runtime hook: %v", err) + } else if err != nil { + log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA container runtime hook: %v", err)) } nvidiaCTKPath, err := installContainerToolkitCLI(opts.toolkitRoot) - if err != nil { + if err != nil && !opts.ignoreErrors { return fmt.Errorf("error installing NVIDIA Container Toolkit CLI: %v", err) + } else if err != nil { + log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA Container Toolkit CLI: %v", err)) } - err = installToolkitConfig(toolkitConfigPath, nvidiaContainerCliExecutable, nvidiaCTKPath, opts) - if err != nil { + err = installToolkitConfig(cli, toolkitConfigPath, nvidiaContainerCliExecutable, nvidiaCTKPath, opts) + if err != nil && !opts.ignoreErrors { return fmt.Errorf("error installing NVIDIA container toolkit config: %v", err) + } else if err != nil { + log.Errorf("Ignoring error: %v", fmt.Errorf("error installing NVIDIA container toolkit config: %v", err)) } return generateCDISpec(opts, nvidiaCTKPath) @@ -337,10 +372,10 @@ func installLibrary(libName string, toolkitRoot string) error { // installToolkitConfig installs the config file for the NVIDIA container toolkit ensuring // that the settings are updated to match the desired install and nvidia driver directories. -func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutablePath string, nvidiaCTKPath string, opts *options) error { +func installToolkitConfig(c *cli.Context, toolkitConfigPath string, nvidiaContainerCliExecutablePath string, nvidiaCTKPath string, opts *options) error { log.Infof("Installing NVIDIA container toolkit config '%v'", toolkitConfigPath) - config, err := toml.LoadFile(nvidiaContainerToolkitConfigSource) + config, err := loadConfig(nvidiaContainerToolkitConfigSource) if err != nil { return fmt.Errorf("could not open source config file: %v", err) } @@ -351,45 +386,64 @@ func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutable } defer targetConfig.Close() - // Set the options in the root toml table - config.Set("accept-nvidia-visible-devices-envvar-when-unprivileged", opts.acceptNVIDIAVisibleDevicesWhenUnprivileged) - config.Set("accept-nvidia-visible-devices-as-volume-mounts", opts.acceptNVIDIAVisibleDevicesAsVolumeMounts) - - nvidiaContainerCliKey := func(p string) []string { - return []string{"nvidia-container-cli", p} - } - // Read the ldconfig path from the config as this may differ per platform // On ubuntu-based systems this ends in `.real` - ldconfigPath := fmt.Sprintf("%s", config.GetPath(nvidiaContainerCliKey("ldconfig"))) - + ldconfigPath := fmt.Sprintf("%s", config.GetDefault("nvidia-container-cli.ldconfig", "/sbin/ldconfig")) // Use the driver run root as the root: driverLdconfigPath := "@" + filepath.Join(opts.DriverRoot, strings.TrimPrefix(ldconfigPath, "@/")) - config.SetPath(nvidiaContainerCliKey("root"), opts.DriverRoot) - config.SetPath(nvidiaContainerCliKey("path"), nvidiaContainerCliExecutablePath) - config.SetPath(nvidiaContainerCliKey("ldconfig"), driverLdconfigPath) - - // Set the debug options if selected - debugOptions := map[string]string{ - "nvidia-container-runtime.debug": opts.ContainerRuntimeDebug, - "nvidia-container-runtime.log-level": opts.ContainerRuntimeLogLevel, - "nvidia-container-runtime.mode": opts.ContainerRuntimeMode, - "nvidia-container-runtime.modes.cdi.default-kind": opts.ContainerRuntimeModesCdiDefaultKind, - "nvidia-container-cli.debug": opts.ContainerCLIDebug, + configValues := map[string]interface{}{ + // Set the options in the root toml table + "accept-nvidia-visible-devices-envvar-when-unprivileged": opts.acceptNVIDIAVisibleDevicesWhenUnprivileged, + "accept-nvidia-visible-devices-as-volume-mounts": opts.acceptNVIDIAVisibleDevicesAsVolumeMounts, + // Set the nvidia-container-cli options + "nvidia-container-cli.root": opts.DriverRoot, + "nvidia-container-cli.path": nvidiaContainerCliExecutablePath, + "nvidia-container-cli.ldconfig": driverLdconfigPath, + // Set nvidia-ctk options + "nvidia-ctk.path": nvidiaCTKPath, + // Set the nvidia-container-runtime-hook options + "nvidia-container-runtime-hook.skip-mode-detection": opts.ContainerRuntimeHookSkipModeDetection, } - for key, value := range debugOptions { - if value == "" { - continue - } + for key, value := range configValues { config.Set(key, value) } - // Set nvidia-ctk options - config.Set("nvidia-ctk.path", nvidiaCTKPath) + // Set the optional config options + optionalConfigValues := map[string]interface{}{ + "nvidia-container-runtime.debug": opts.ContainerRuntimeDebug, + "nvidia-container-runtime.log-level": opts.ContainerRuntimeLogLevel, + "nvidia-container-runtime.mode": opts.ContainerRuntimeMode, + "nvidia-container-runtime.modes.cdi.annotation-prefixes": opts.ContainerRuntimeModesCDIAnnotationPrefixes, + "nvidia-container-runtime.modes.cdi.default-kind": opts.ContainerRuntimeModesCdiDefaultKind, + "nvidia-container-cli.debug": opts.ContainerCLIDebug, + } + for key, value := range optionalConfigValues { + if !c.IsSet(key) { + log.Infof("Skipping unset option: %v", key) + continue + } + if value == nil { + log.Infof("Skipping option with nil value: %v", key) + continue + } - // Set the nvidia-container-runtime-hook options - config.Set("nvidia-container-runtime-hook.skip-mode-detection", opts.ContainerRuntimeHookSkipModeDetection) + switch v := value.(type) { + case string: + if v == "" { + continue + } + case cli.StringSlice: + if len(v.Value()) == 0 { + continue + } + value = v.Value() + default: + log.Warnf("Unexpected type for option %v=%v: %T", key, value, v) + } + + config.Set(key, value) + } _, err = config.WriteTo(targetConfig) if err != nil { @@ -402,6 +456,16 @@ func installToolkitConfig(toolkitConfigPath string, nvidiaContainerCliExecutable return nil } +func loadConfig(path string) (*toml.Tree, error) { + _, err := os.Stat(path) + if err == nil { + return toml.LoadFile(path) + } else if os.IsNotExist(err) { + return toml.TreeFromMap(nil) + } + return nil, err +} + // installContainerToolkitCLI installs the nvidia-ctk CLI executable and wrapper. func installContainerToolkitCLI(toolkitDir string) (string, error) { e := executable{