commit bee7158a56
Evan Lezar 2025-03-21 09:13:50 +08:00 committed by GitHub
8 changed files with 87 additions and 10 deletions

View File

@@ -79,6 +79,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
@@ -140,6 +143,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
@@ -204,6 +210,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
@@ -265,6 +274,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
@@ -348,6 +360,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
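Editor's note (not part of the commit): the same [nvidia-container-runtime.modes.jit-cdi] table is rendered into every installed config variant above, so the default module list can be overridden in config.toml. A minimal sketch of how such an override decodes; the pelletier/go-toml/v2 library and the local structs are illustrative stand-ins for the toolkit's own config machinery, which is shown later in this diff.

package main

import (
	"fmt"

	"github.com/pelletier/go-toml/v2"
)

// Local mirror of just the new table; the real type is internal/config's
// jitCDIModeConfig, added further down in this commit.
type jitCDI struct {
	LoadKernelModules []string `toml:"load-kernel-modules"`
}

type overlay struct {
	NVIDIAContainerRuntime struct {
		Modes struct {
			JitCDI jitCDI `toml:"jit-cdi"`
		} `toml:"modes"`
	} `toml:"nvidia-container-runtime"`
}

func main() {
	doc := []byte("[nvidia-container-runtime.modes.jit-cdi]\n" +
		"load-kernel-modules = [\"nvidia\", \"nvidia-uvm\"]\n")

	var o overlay
	if err := toml.Unmarshal(doc, &o); err != nil {
		panic(err)
	}
	fmt.Println(o.NVIDIAContainerRuntime.Modes.JitCDI.LoadKernelModules)
	// Prints: [nvidia nvidia-uvm]
}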

View File

@@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
SpecDirs: cdi.DefaultSpecDirs,
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
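Editor's note (not part of the commit): GetDefault now seeds the jit-cdi mode with the three core NVIDIA kernel modules. A minimal sketch of reading that default back, as a hypothetical example test; internal/config can only be imported from inside the toolkit module.

package config_test

import (
	"fmt"

	"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
)

func ExampleGetDefault_jitCDI() {
	cfg, err := config.GetDefault()
	if err != nil {
		panic(err)
	}
	// Same field path as used by the CDI modifier later in this diff.
	fmt.Println(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules)
	// Output: [nvidia nvidia-uvm nvidia-modeset]
}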

View File

@@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -102,6 +105,7 @@ func TestGetConfig(t *testing.T) {
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"nvidia-container-runtime.modes.jit-cdi.load-kernel-modules = [\"foo\"]",
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
},
@@ -134,6 +138,9 @@ func TestGetConfig(t *testing.T) {
"/not/var/run/cdi",
},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"foo"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -178,6 +185,9 @@ func TestGetConfig(t *testing.T) {
"/var/run/cdi",
},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -213,6 +223,8 @@ func TestGetConfig(t *testing.T) {
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
"[nvidia-container-runtime.modes.csv]",
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"[nvidia-container-runtime.modes.jit-cdi]",
"load-kernel-modules = [\"foo\"]",
"[nvidia-container-runtime-hook]",
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
"[nvidia-ctk]",
@@ -247,6 +259,9 @@ func TestGetConfig(t *testing.T) {
"/not/var/run/cdi",
},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"foo"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -283,6 +298,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -322,6 +340,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{

View File

@@ -29,8 +29,9 @@ type RuntimeConfig struct {
// modesConfig defines (optional) per-mode configs
type modesConfig struct {
CSV csvModeConfig `toml:"csv"`
CDI cdiModeConfig `toml:"cdi"`
CSV csvModeConfig `toml:"csv"`
CDI cdiModeConfig `toml:"cdi"`
JitCDI jitCDIModeConfig `toml:"jit-cdi"`
}
type cdiModeConfig struct {
@@ -45,3 +46,11 @@ type cdiModeConfig struct {
type csvModeConfig struct {
MountSpecPath string `toml:"mount-spec-path"`
}
type jitCDIModeConfig struct {
// LoadKernelModules defines the names of the kernel modules that should be
// loaded before generating a just-in-time CDI specification.
// The module names must start with `nvidia`; if no modules are specified,
// no kernel modules are loaded.
LoadKernelModules []string `toml:"load-kernel-modules"`
}
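Editor's note (not part of the commit): the comment above restricts module names to the `nvidia` prefix, but this diff does not show where, or whether, that constraint is enforced. A purely illustrative check of the documented rule:

package main

import (
	"fmt"
	"strings"
)

// isAllowedJitCDIModule mirrors the documented rule only; it is not part of
// this commit, and the toolkit's actual enforcement (if any) is not shown here.
func isAllowedJitCDIModule(name string) bool {
	return strings.HasPrefix(name, "nvidia")
}

func main() {
	for _, m := range []string{"nvidia", "nvidia-uvm", "i915"} {
		fmt.Printf("%-12s allowed: %v\n", m, isAllowedJitCDIModule(m))
	}
}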

View File

@@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
[nvidia-container-runtime-hook]
path = "nvidia-container-runtime-hook"
skip-mode-detection = false

View File

@@ -17,12 +17,15 @@
package root
import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules"
)
// Driver represents a filesystem in which a set of drivers or devices is defined.
@@ -125,3 +128,20 @@ func xdgDataDirs() []string {
return []string{"/usr/local/share", "/usr/share"}
}
// LoadKernelModules loads the specified kernel modules in the driver root.
// Errors in loading a module do not prevent other modules from being attempted.
func (r *Driver) LoadKernelModules(moduleNames ...string) error {
modules := nvmodules.New(
nvmodules.WithLogger(r.logger),
nvmodules.WithRoot(r.Root),
)
var errs error
for _, moduleName := range moduleNames {
if err := modules.Load(moduleName); err != nil {
errs = errors.Join(errs, fmt.Errorf("failed to load kernel module %q: %w", moduleName, err))
}
}
return errs
}
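Editor's note (not part of the commit): a load failure for one module does not stop the others; the individual errors are aggregated with errors.Join, and the result is nil when every module loads or when no names are given. A usage sketch, assuming a *root.Driver constructed elsewhere (its constructor is not part of this diff) and a hypothetical package name:

package example

import (
	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
)

// loadConfiguredModules mirrors how the CDI modifier (next file) consumes the
// new method: failures are logged as warnings and never abort spec generation.
func loadConfiguredModules(log logger.Interface, driver *root.Driver, names []string) {
	if err := driver.LoadKernelModules(names...); err != nil {
		log.Warningf("Ignoring error(s) loading kernel modules: %v", err)
	}
}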

View File

@@ -25,6 +25,7 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
@@ -34,7 +35,7 @@ import (
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
// used to select the devices to include.
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
func NewCDIModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) {
devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
if err != nil {
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
@@ -50,7 +51,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
}
if len(automaticDevices) > 0 {
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, driver, automaticDevices)
if err == nil {
return automaticModifier, nil
}
@@ -163,9 +164,9 @@ func filterAutomaticDevices(devices []string) []string {
return automatic
}
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (oci.SpecModifier, error) {
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
spec, err := generateAutomaticCDISpec(logger, cfg, driver, devices)
if err != nil {
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
}
@@ -180,7 +181,7 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de
return cdiModifier, nil
}
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (spec.Interface, error) {
cdilib, err := nvcdi.New(
nvcdi.WithLogger(logger),
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
@@ -192,6 +193,11 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
}
// TODO: Consider moving this into the nvcdi API.
if err := driver.LoadKernelModules(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules...); err != nil {
logger.Warningf("Ignoring error(s) loading kernel modules: %v", err)
}
identifiers := []string{}
for _, device := range devices {
_, _, id := parser.ParseDevice(device)
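Editor's note (not part of the commit): NewCDIModifier and the helpers below it now take the driver root purely so the configured jit-cdi kernel modules can be loaded before the in-memory CDI spec is generated; a load failure only produces the warning above. Sketch of the new call shape; the wrapper and its package are placeholders, and the internal/modifier import path is inferred from the internal/modifier/cdi import earlier in this file.

package example

import (
	"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
)

// buildCDIModifier shows the extra *root.Driver argument; in the toolkit the
// real caller is newModeModifier in the runtime package (next file).
func buildCDIModifier(log logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) {
	return modifier.NewCDIModifier(log, cfg, driver, ociSpec)
}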

View File

@@ -77,7 +77,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
// We update the mode here so that we can continue passing just the config to other functions.
cfg.NVIDIAContainerRuntimeConfig.Mode = mode
modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image)
modeModifier, err := newModeModifier(logger, mode, cfg, driver, ociSpec, image)
if err != nil {
return nil, err
}
@@ -107,14 +107,14 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
return modifiers, nil
}
func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
switch mode {
case "legacy":
return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
case "csv":
return modifier.NewCSVModifier(logger, cfg, image)
case "cdi":
return modifier.NewCDIModifier(logger, cfg, ociSpec)
return modifier.NewCDIModifier(logger, cfg, driver, ociSpec)
}
return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)