From 54ea4a24a7d6894213d6fb6a6f07703bc1b0989a Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Sun, 9 Mar 2025 13:29:06 +0200 Subject: [PATCH] Load NVIDIA Kernel Modules for JIT-CDI mode This change attempts to load the nvidia, nvidia-uvm, and nvidia-modeset kernel modules before generating the automatic (jit) CDI specification. The kernel modules can be controlled by the nvidia-container-runtime.modes.jit-cdi.load-kernel-modules config option. If this is set to the empty list, then no kernel modules are loaded. Errors in loading the kernel modules are logged, but ignored. Signed-off-by: Evan Lezar --- cmd/nvidia-ctk-installer/main_test.go | 15 +++++++++++++++ internal/config/config.go | 3 +++ internal/config/config_test.go | 21 +++++++++++++++++++++ internal/config/runtime.go | 13 +++++++++++-- internal/config/toml_test.go | 3 +++ internal/lookup/root/root.go | 20 ++++++++++++++++++++ internal/modifier/cdi.go | 16 +++++++++++----- internal/runtime/runtime_factory.go | 6 +++--- 8 files changed, 87 insertions(+), 10 deletions(-) diff --git a/cmd/nvidia-ctk-installer/main_test.go b/cmd/nvidia-ctk-installer/main_test.go index 759ae8c1..1e3f8006 100644 --- a/cmd/nvidia-ctk-installer/main_test.go +++ b/cmd/nvidia-ctk-installer/main_test.go @@ -141,6 +141,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.jit-cdi] + load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -202,6 +205,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.jit-cdi] + load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot 
}}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -266,6 +272,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.jit-cdi] + load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -327,6 +336,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.jit-cdi] + load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -410,6 +422,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.jit-cdi] + load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true diff --git a/internal/config/config.go b/internal/config/config.go index 652cc83a..5d17d674 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -121,6 +121,9 @@ func GetDefault() (*Config, error) { AnnotationPrefixes: []string{cdi.AnnotationPrefix}, SpecDirs: cdi.DefaultSpecDirs, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 963058e1..7b4d638c 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: []string{"cdi.k8s.io/"}, 
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -102,6 +105,7 @@ func TestGetConfig(t *testing.T) { "nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]", "nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]", "nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", + "nvidia-container-runtime.modes.jit-cdi.load-kernel-modules = [\"foo\"]", "nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"", "nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"", }, @@ -134,6 +138,9 @@ func TestGetConfig(t *testing.T) { "/not/var/run/cdi", }, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"foo"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -178,6 +185,9 @@ func TestGetConfig(t *testing.T) { "/var/run/cdi", }, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -213,6 +223,8 @@ func TestGetConfig(t *testing.T) { "spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]", "[nvidia-container-runtime.modes.csv]", "mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", + "[nvidia-container-runtime.modes.jit-cdi]", + "load-kernel-modules = [\"foo\"]", "[nvidia-container-runtime-hook]", "path = \"/foo/bar/nvidia-container-runtime-hook\"", "[nvidia-ctk]", @@ -247,6 +259,9 @@ func TestGetConfig(t *testing.T) { "/not/var/run/cdi", }, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"foo"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -283,6 +298,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: []string{"cdi.k8s.io/"}, SpecDirs: 
[]string{"/etc/cdi", "/var/run/cdi"}, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -322,6 +340,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: []string{"cdi.k8s.io/"}, SpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ diff --git a/internal/config/runtime.go b/internal/config/runtime.go index 2ba1b7a8..ea9869b9 100644 --- a/internal/config/runtime.go +++ b/internal/config/runtime.go @@ -29,8 +29,9 @@ type RuntimeConfig struct { // modesConfig defines (optional) per-mode configs type modesConfig struct { - CSV csvModeConfig `toml:"csv"` - CDI cdiModeConfig `toml:"cdi"` + CSV csvModeConfig `toml:"csv"` + CDI cdiModeConfig `toml:"cdi"` + JitCDI jitCDIModeConfig `toml:"jit-cdi"` } type cdiModeConfig struct { @@ -45,3 +46,11 @@ type cdiModeConfig struct { type csvModeConfig struct { MountSpecPath string `toml:"mount-spec-path"` } + +type jitCDIModeConfig struct { + // LoadKernelModules defines the names of the kernel modules that should be + // loaded before generating a just-in-time CDI specification. + // The module names must start with `nvidia` and if no modules are specified + // no kernel modules are loaded. 
+ LoadKernelModules []string `toml:"load-kernel-modules"` +} diff --git a/internal/config/toml_test.go b/internal/config/toml_test.go index f7c649f7..96cff3b8 100644 --- a/internal/config/toml_test.go +++ b/internal/config/toml_test.go @@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"] [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" +[nvidia-container-runtime.modes.jit-cdi] +load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "nvidia-container-runtime-hook" skip-mode-detection = false diff --git a/internal/lookup/root/root.go b/internal/lookup/root/root.go index d0c83701..a5f19aab 100644 --- a/internal/lookup/root/root.go +++ b/internal/lookup/root/root.go @@ -17,12 +17,15 @@ package root import ( + "errors" + "fmt" "os" "path/filepath" "strings" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules" ) // Driver represents a filesystem in which a set of drivers or devices is defined. @@ -125,3 +128,20 @@ func xdgDataDirs() []string { return []string{"/usr/local/share", "/usr/share"} } + +// LoadKernelModules loads the specified kernel modules in the driver root. +// Errors in loading a module do not prevent other modules from being attempted. 
+func (r *Driver) LoadKernelModules(moduleNames ...string) error { + modules := nvmodules.New( + nvmodules.WithLogger(r.logger), + nvmodules.WithRoot(r.Root), + ) + + var errs error + for _, moduleName := range moduleNames { + if err := modules.Load(moduleName); err != nil { + errs = errors.Join(errs, fmt.Errorf("failed to load kernel module %q: %w", moduleName, err)) + } + } + return errs +} diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index 90cd481b..bc9a7de3 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -25,6 +25,7 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" "github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi" "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" @@ -34,7 +35,7 @@ import ( // NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the // CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is // used to select the devices to include. 
-func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) { +func NewCDIModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) { devices, err := getDevicesFromSpec(logger, ociSpec, cfg) if err != nil { return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err) @@ -50,7 +51,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices") } if len(automaticDevices) > 0 { - automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices) + automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, driver, automaticDevices) if err == nil { return automaticModifier, nil } @@ -163,9 +164,9 @@ func filterAutomaticDevices(devices []string) []string { return automatic } -func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) { +func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (oci.SpecModifier, error) { logger.Debugf("Generating in-memory CDI specs for devices %v", devices) - spec, err := generateAutomaticCDISpec(logger, cfg, devices) + spec, err := generateAutomaticCDISpec(logger, cfg, driver, devices) if err != nil { return nil, fmt.Errorf("failed to generate CDI spec: %w", err) } @@ -180,7 +181,7 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de return cdiModifier, nil } -func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) { +func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (spec.Interface, error) { cdilib, err := nvcdi.New( nvcdi.WithLogger(logger), 
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path), @@ -192,6 +193,11 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic return nil, fmt.Errorf("failed to construct CDI library: %w", err) } + // TODO: Consider moving this into the nvcdi API. + if err := driver.LoadKernelModules(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules...); err != nil { + logger.Warningf("Ignoring error(s) loading kernel modules: %v", err) + } + identifiers := []string{} for _, device := range devices { _, _, id := parser.ParseDevice(device) diff --git a/internal/runtime/runtime_factory.go b/internal/runtime/runtime_factory.go index e88213dc..9ee12c48 100644 --- a/internal/runtime/runtime_factory.go +++ b/internal/runtime/runtime_factory.go @@ -77,7 +77,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image) // We update the mode here so that we can continue passing just the config to other functions. 
cfg.NVIDIAContainerRuntimeConfig.Mode = mode - modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image) + modeModifier, err := newModeModifier(logger, mode, cfg, driver, ociSpec, image) if err != nil { return nil, err } @@ -107,14 +107,14 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp return modifiers, nil } -func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) { +func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) { switch mode { case "legacy": return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil case "csv": return modifier.NewCSVModifier(logger, cfg, image) case "cdi": - return modifier.NewCDIModifier(logger, cfg, ociSpec) + return modifier.NewCDIModifier(logger, cfg, driver, ociSpec) } return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)