From 7b801a0ce0fca4ab32ed97a13272f0cc57c281fd Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 31 May 2023 11:16:30 +0200 Subject: [PATCH] Add option to load NVIDIA kernel modules These changes add a --load-kernel-modules option to the nvidia-ctk system commands. If specified, the NVIDIA kernel modules (nvidia, nvidia-uvm, and nvidia-modeset) are loaded before any operations on device nodes are performed. Signed-off-by: Evan Lezar --- .../create-dev-char-symlinks.go | 57 +++++++++++++++---- .../create-device-nodes.go | 8 +++ internal/system/options.go | 7 +++ internal/system/system.go | 35 ++++++++++-- 4 files changed, 92 insertions(+), 15 deletions(-) diff --git a/cmd/nvidia-ctk/system/create-dev-char-symlinks/create-dev-char-symlinks.go b/cmd/nvidia-ctk/system/create-dev-char-symlinks/create-dev-char-symlinks.go index f84734b3..03b9de78 100644 --- a/cmd/nvidia-ctk/system/create-dev-char-symlinks/create-dev-char-symlinks.go +++ b/cmd/nvidia-ctk/system/create-dev-char-symlinks/create-dev-char-symlinks.go @@ -24,6 +24,7 @@ import ( "strings" "syscall" + "github.com/NVIDIA/nvidia-container-toolkit/internal/system" "github.com/fsnotify/fsnotify" "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" @@ -38,11 +39,12 @@ type command struct { } type config struct { - devCharPath string - driverRoot string - dryRun bool - watch bool - createAll bool + devCharPath string + driverRoot string + dryRun bool + watch bool + createAll bool + loadKernelModules bool } // NewCommand constructs a command sub-command with the specified logger @@ -97,6 +99,12 @@ func (m command) build() *cli.Command { Destination: &cfg.createAll, EnvVars: []string{"CREATE_ALL"}, }, + &cli.BoolFlag{ + Name: "load-kernel-modules", + Usage: "Load the NVIDIA kernel modules before creating symlinks. 
This is only applicable when --create-all is set.", + Destination: &cfg.loadKernelModules, + EnvVars: []string{"LOAD_KERNEL_MODULES"}, + }, &cli.BoolFlag{ Name: "dry-run", Usage: "If set, the command will not create any symlinks.", @@ -114,6 +122,11 @@ func (m command) validateFlags(r *cli.Context, cfg *config) error { return fmt.Errorf("create-all and watch are mutually exclusive") } + if cfg.loadKernelModules && !cfg.createAll { + m.logger.Warn("load-kernel-modules is only applicable when create-all is set; ignoring") + cfg.loadKernelModules = false + } + return nil } @@ -137,6 +150,7 @@ func (m command) run(c *cli.Context, cfg *config) error { WithDriverRoot(cfg.driverRoot), WithDryRun(cfg.dryRun), WithCreateAll(cfg.createAll), + WithLoadKernelModules(cfg.loadKernelModules), ) if err != nil { return fmt.Errorf("failed to create symlink creator: %v", err) @@ -186,12 +200,13 @@ create: } type linkCreator struct { - logger *logrus.Logger - lister nodeLister - driverRoot string - devCharPath string - dryRun bool - createAll bool + logger *logrus.Logger + lister nodeLister + driverRoot string + devCharPath string + dryRun bool + createAll bool + loadKernelModules bool } // Creator is an interface for creating symlinks to /dev/nv* devices in /dev/char. @@ -218,6 +233,19 @@ func NewSymlinkCreator(opts ...Option) (Creator, error) { c.devCharPath = defaultDevCharPath } + if c.loadKernelModules { + s, err := system.New( + system.WithLogger(c.logger), + system.WithDryRun(c.dryRun), + ) + if err != nil { + return nil, err + } + if err := s.LoadNVIDIAKernelModules(); err != nil { + return nil, fmt.Errorf("failed to load NVIDIA kernel modules: %v", err) + } + } + if c.createAll { lister, err := newAllPossible(c.logger, c.driverRoot) if err != nil { @@ -265,6 +293,13 @@ func WithCreateAll(createAll bool) Option { } } +// WithLoadKernelModules sets the loadKernelModules flag for the linkCreator. 
+func WithLoadKernelModules(loadKernelModules bool) Option { + return func(lc *linkCreator) { + lc.loadKernelModules = loadKernelModules + } +} + // CreateLinks creates symlinks for all NVIDIA device nodes found in the driver root. func (m linkCreator) CreateLinks() error { deviceNodes, err := m.lister.DeviceNodes() diff --git a/cmd/nvidia-ctk/system/create-device-nodes/create-device-nodes.go b/cmd/nvidia-ctk/system/create-device-nodes/create-device-nodes.go index d1fb346e..508177bf 100644 --- a/cmd/nvidia-ctk/system/create-device-nodes/create-device-nodes.go +++ b/cmd/nvidia-ctk/system/create-device-nodes/create-device-nodes.go @@ -34,6 +34,8 @@ type options struct { dryRun bool control bool + + loadKernelModules bool } // NewCommand constructs a command sub-command with the specified logger @@ -72,6 +74,11 @@ func (m command) build() *cli.Command { Usage: "create all control device nodes: nvidiactl, nvidia-modeset, nvidia-uvm, nvidia-uvm-tools", Destination: &opts.control, }, + &cli.BoolFlag{ + Name: "load-kernel-modules", + Usage: "load the NVIDIA Kernel Modules before creating devices nodes", + Destination: &opts.loadKernelModules, + }, &cli.BoolFlag{ Name: "dry-run", Usage: "if set, the command will not create any symlinks.", @@ -92,6 +99,7 @@ func (m command) run(c *cli.Context, opts *options) error { s, err := system.New( system.WithLogger(m.logger), system.WithDryRun(opts.dryRun), + system.WithLoadKernelModules(opts.loadKernelModules), ) if err != nil { return fmt.Errorf("failed to create library: %v", err) diff --git a/internal/system/options.go b/internal/system/options.go index fb0fbb38..de3bf21d 100644 --- a/internal/system/options.go +++ b/internal/system/options.go @@ -34,3 +34,10 @@ func WithDryRun(dryRun bool) Option { i.dryRun = dryRun } } + +// WithLoadKernelModules sets the load kernel modules flag +func WithLoadKernelModules(loadKernelModules bool) Option { + return func(i *Interface) { + i.loadKernelModules = loadKernelModules + } +} diff --git 
a/internal/system/system.go b/internal/system/system.go index d3ad63eb..fe745160 100644 --- a/internal/system/system.go +++ b/internal/system/system.go @@ -19,6 +19,7 @@ package system import ( "fmt" "os" + "os/exec" "path/filepath" "strings" @@ -29,10 +30,10 @@ import ( // Interface is the interface for the system command type Interface struct { - logger *logrus.Logger - dryRun bool - - nvidiaDevices nvidiaDevices + logger *logrus.Logger + dryRun bool + loadKernelModules bool + nvidiaDevices nvidiaDevices } // New constructs a system command with the specified options @@ -44,6 +45,12 @@ func New(opts ...Option) (*Interface, error) { opt(i) } + if i.loadKernelModules { + if err := i.LoadNVIDIAKernelModules(); err != nil { + return nil, fmt.Errorf("failed to load kernel modules: %v", err) + } + } + devices, err := devices.GetNVIDIADevices() if err != nil { return nil, fmt.Errorf("failed to create devices info: %v", err) @@ -108,6 +115,26 @@ func (m *Interface) createDeviceNode(path string, major int, minor int) error { return unix.Chmod(path, 0666) } +// LoadNVIDIAKernelModules loads the NVIDIA kernel modules. +func (m *Interface) LoadNVIDIAKernelModules() error { + modules := []string{"nvidia", "nvidia-uvm", "nvidia-modeset"} + + for _, module := range modules { + if m.dryRun { + m.logger.Infof("Running: /sbin/modprobe %s", module) + continue + } + cmd := exec.Command("/sbin/modprobe", module) + + if output, err := cmd.CombinedOutput(); err != nil { + m.logger.Debugf("Failed to load kernel module %s: %v", module, string(output)) + return fmt.Errorf("failed to load kernel module %s: %v", module, err) + } + } + + return nil +} + type nvidiaDevices struct { devices.Devices }