mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-04-02 12:00:18 +00:00
Merge 54ea4a24a7
into e33f15a128
This commit is contained in:
commit
bee7158a56
@ -79,6 +79,9 @@ swarm-resource = ""
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.jit-cdi]
|
||||
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||
skip-mode-detection = true
|
||||
@ -140,6 +143,9 @@ swarm-resource = ""
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.jit-cdi]
|
||||
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||
skip-mode-detection = true
|
||||
@ -204,6 +210,9 @@ swarm-resource = ""
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.jit-cdi]
|
||||
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||
skip-mode-detection = true
|
||||
@ -265,6 +274,9 @@ swarm-resource = ""
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.jit-cdi]
|
||||
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||
skip-mode-detection = true
|
||||
@ -348,6 +360,9 @@ swarm-resource = ""
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.jit-cdi]
|
||||
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||
skip-mode-detection = true
|
||||
|
@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
|
||||
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
|
||||
SpecDirs: cdi.DefaultSpecDirs,
|
||||
},
|
||||
JitCDI: jitCDIModeConfig{
|
||||
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
|
@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
|
||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||
},
|
||||
JitCDI: jitCDIModeConfig{
|
||||
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@ -102,6 +105,7 @@ func TestGetConfig(t *testing.T) {
|
||||
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
|
||||
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
|
||||
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
||||
"nvidia-container-runtime.modes.jit-cdi.load-kernel-modules = [\"foo\"]",
|
||||
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
|
||||
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
|
||||
},
|
||||
@ -134,6 +138,9 @@ func TestGetConfig(t *testing.T) {
|
||||
"/not/var/run/cdi",
|
||||
},
|
||||
},
|
||||
JitCDI: jitCDIModeConfig{
|
||||
LoadKernelModules: []string{"foo"},
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@ -178,6 +185,9 @@ func TestGetConfig(t *testing.T) {
|
||||
"/var/run/cdi",
|
||||
},
|
||||
},
|
||||
JitCDI: jitCDIModeConfig{
|
||||
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@ -213,6 +223,8 @@ func TestGetConfig(t *testing.T) {
|
||||
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
|
||||
"[nvidia-container-runtime.modes.csv]",
|
||||
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
||||
"[nvidia-container-runtime.modes.jit-cdi]",
|
||||
"load-kernel-modules = [\"foo\"]",
|
||||
"[nvidia-container-runtime-hook]",
|
||||
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
|
||||
"[nvidia-ctk]",
|
||||
@ -247,6 +259,9 @@ func TestGetConfig(t *testing.T) {
|
||||
"/not/var/run/cdi",
|
||||
},
|
||||
},
|
||||
JitCDI: jitCDIModeConfig{
|
||||
LoadKernelModules: []string{"foo"},
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@ -283,6 +298,9 @@ func TestGetConfig(t *testing.T) {
|
||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||
},
|
||||
JitCDI: jitCDIModeConfig{
|
||||
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@ -322,6 +340,9 @@ func TestGetConfig(t *testing.T) {
|
||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||
},
|
||||
JitCDI: jitCDIModeConfig{
|
||||
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
|
@ -29,8 +29,9 @@ type RuntimeConfig struct {
|
||||
|
||||
// modesConfig defines (optional) per-mode configs
|
||||
type modesConfig struct {
|
||||
CSV csvModeConfig `toml:"csv"`
|
||||
CDI cdiModeConfig `toml:"cdi"`
|
||||
CSV csvModeConfig `toml:"csv"`
|
||||
CDI cdiModeConfig `toml:"cdi"`
|
||||
JitCDI jitCDIModeConfig `toml:"jit-cdi"`
|
||||
}
|
||||
|
||||
type cdiModeConfig struct {
|
||||
@ -45,3 +46,11 @@ type cdiModeConfig struct {
|
||||
type csvModeConfig struct {
|
||||
MountSpecPath string `toml:"mount-spec-path"`
|
||||
}
|
||||
|
||||
type jitCDIModeConfig struct {
|
||||
// LoadKernelModules defines the names of the kernel modules that should be
|
||||
// loaded before generating a just-in-time CDI specification.
|
||||
// The module names must start with `nvidia` and if no modules are specified
|
||||
// no kernel modules are loaded.
|
||||
LoadKernelModules []string `toml:"load-kernel-modules"`
|
||||
}
|
||||
|
@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.jit-cdi]
|
||||
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "nvidia-container-runtime-hook"
|
||||
skip-mode-detection = false
|
||||
|
@ -17,12 +17,15 @@
|
||||
package root
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules"
|
||||
)
|
||||
|
||||
// Driver represents a filesystem in which a set of drivers or devices is defined.
|
||||
@ -125,3 +128,20 @@ func xdgDataDirs() []string {
|
||||
|
||||
return []string{"/usr/local/share", "/usr/share"}
|
||||
}
|
||||
|
||||
// LoadKmods loads the specified kernel modules in the driver root.
|
||||
// Errors in loading a module do not prevent other modules from being attempted.
|
||||
func (r *Driver) LoadKernelModules(moduleNames ...string) error {
|
||||
modules := nvmodules.New(
|
||||
nvmodules.WithLogger(r.logger),
|
||||
nvmodules.WithRoot(r.Root),
|
||||
)
|
||||
|
||||
var errs error
|
||||
for _, moduleName := range moduleNames {
|
||||
if err := modules.Load(moduleName); err != nil {
|
||||
errs = errors.Join(errs, fmt.Errorf("failed to load kernel module %q: %w", moduleName, err))
|
||||
}
|
||||
}
|
||||
return errs
|
||||
}
|
||||
|
@ -25,6 +25,7 @@ import (
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
||||
@ -34,7 +35,7 @@ import (
|
||||
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
|
||||
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
|
||||
// used to select the devices to include.
|
||||
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
||||
func NewCDIModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
||||
devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
|
||||
@ -50,7 +51,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
|
||||
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
|
||||
}
|
||||
if len(automaticDevices) > 0 {
|
||||
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
|
||||
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, driver, automaticDevices)
|
||||
if err == nil {
|
||||
return automaticModifier, nil
|
||||
}
|
||||
@ -163,9 +164,9 @@ func filterAutomaticDevices(devices []string) []string {
|
||||
return automatic
|
||||
}
|
||||
|
||||
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
|
||||
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (oci.SpecModifier, error) {
|
||||
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
|
||||
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
|
||||
spec, err := generateAutomaticCDISpec(logger, cfg, driver, devices)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
|
||||
}
|
||||
@ -180,7 +181,7 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de
|
||||
return cdiModifier, nil
|
||||
}
|
||||
|
||||
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
|
||||
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (spec.Interface, error) {
|
||||
cdilib, err := nvcdi.New(
|
||||
nvcdi.WithLogger(logger),
|
||||
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
|
||||
@ -192,6 +193,11 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
|
||||
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
||||
}
|
||||
|
||||
// TODO: Consider moving this into the nvcdi API.
|
||||
if err := driver.LoadKernelModules(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules...); err != nil {
|
||||
logger.Warningf("Ignoring error(s) loading kernel modules: %v", err)
|
||||
}
|
||||
|
||||
identifiers := []string{}
|
||||
for _, device := range devices {
|
||||
_, _, id := parser.ParseDevice(device)
|
||||
|
@ -77,7 +77,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
|
||||
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
|
||||
// We update the mode here so that we can continue passing just the config to other functions.
|
||||
cfg.NVIDIAContainerRuntimeConfig.Mode = mode
|
||||
modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image)
|
||||
modeModifier, err := newModeModifier(logger, mode, cfg, driver, ociSpec, image)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -107,14 +107,14 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
|
||||
return modifiers, nil
|
||||
}
|
||||
|
||||
func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
|
||||
func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
|
||||
switch mode {
|
||||
case "legacy":
|
||||
return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
|
||||
case "csv":
|
||||
return modifier.NewCSVModifier(logger, cfg, image)
|
||||
case "cdi":
|
||||
return modifier.NewCDIModifier(logger, cfg, ociSpec)
|
||||
return modifier.NewCDIModifier(logger, cfg, driver, ociSpec)
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)
|
||||
|
Loading…
Reference in New Issue
Block a user