mirror of
				https://github.com/NVIDIA/nvidia-container-toolkit
				synced 2025-06-26 18:18:24 +00:00 
			
		
		
		
	Merge branch 'add-mod-probe' into 'main'
Add option to load NVIDIA kernel modules See merge request nvidia/container-toolkit/container-toolkit!409
This commit is contained in:
		
						commit
						6b1e8171c8
					
				| @ -10,6 +10,9 @@ | |||||||
| * Use *.* pattern to locate libcuda.so when generating a CDI specification to support platforms where a patch version is not specified. | * Use *.* pattern to locate libcuda.so when generating a CDI specification to support platforms where a patch version is not specified. | ||||||
| * Update go-nvlib to skip devices that are not MIG capable when generating CDI specifications. | * Update go-nvlib to skip devices that are not MIG capable when generating CDI specifications. | ||||||
| * Add `nvidia-container-runtime-hook.path` config option to specify NVIDIA Container Runtime Hook path explicitly. | * Add `nvidia-container-runtime-hook.path` config option to specify NVIDIA Container Runtime Hook path explicitly. | ||||||
|  | * Fix bug in creation of `/dev/char` symlinks by failing operation if kernel modules are not loaded. | ||||||
|  | * Add option to load kernel modules when creating device nodes | ||||||
|  | * Add option to create device nodes when creating `/dev/char` symlinks | ||||||
| 
 | 
 | ||||||
| ## v1.13.1 | ## v1.13.1 | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -24,6 +24,7 @@ import ( | |||||||
| 	"strings" | 	"strings" | ||||||
| 	"syscall" | 	"syscall" | ||||||
| 
 | 
 | ||||||
|  | 	"github.com/NVIDIA/nvidia-container-toolkit/internal/system" | ||||||
| 	"github.com/fsnotify/fsnotify" | 	"github.com/fsnotify/fsnotify" | ||||||
| 	"github.com/sirupsen/logrus" | 	"github.com/sirupsen/logrus" | ||||||
| 	"github.com/urfave/cli/v2" | 	"github.com/urfave/cli/v2" | ||||||
| @ -43,6 +44,8 @@ type config struct { | |||||||
| 	dryRun            bool | 	dryRun            bool | ||||||
| 	watch             bool | 	watch             bool | ||||||
| 	createAll         bool | 	createAll         bool | ||||||
|  | 	createDeviceNodes bool | ||||||
|  | 	loadKernelModules bool | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // NewCommand constructs a command sub-command with the specified logger
 | // NewCommand constructs a command sub-command with the specified logger
 | ||||||
| @ -97,6 +100,18 @@ func (m command) build() *cli.Command { | |||||||
| 			Destination: &cfg.createAll, | 			Destination: &cfg.createAll, | ||||||
| 			EnvVars:     []string{"CREATE_ALL"}, | 			EnvVars:     []string{"CREATE_ALL"}, | ||||||
| 		}, | 		}, | ||||||
|  | 		&cli.BoolFlag{ | ||||||
|  | 			Name:        "load-kernel-modules", | ||||||
|  | 			Usage:       "Load the NVIDIA kernel modules before creating symlinks. This is only applicable when --create-all is set.", | ||||||
|  | 			Destination: &cfg.loadKernelModules, | ||||||
|  | 			EnvVars:     []string{"LOAD_KERNEL_MODULES"}, | ||||||
|  | 		}, | ||||||
|  | 		&cli.BoolFlag{ | ||||||
|  | 			Name:        "create-device-nodes", | ||||||
|  | 			Usage:       "Create the NVIDIA control device nodes in the driver root if they do not exist. This is only applicable when --create-all is set", | ||||||
|  | 			Destination: &cfg.createDeviceNodes, | ||||||
|  | 			EnvVars:     []string{"CREATE_DEVICE_NODES"}, | ||||||
|  | 		}, | ||||||
| 		&cli.BoolFlag{ | 		&cli.BoolFlag{ | ||||||
| 			Name:        "dry-run", | 			Name:        "dry-run", | ||||||
| 			Usage:       "If set, the command will not create any symlinks.", | 			Usage:       "If set, the command will not create any symlinks.", | ||||||
| @ -114,6 +129,16 @@ func (m command) validateFlags(r *cli.Context, cfg *config) error { | |||||||
| 		return fmt.Errorf("create-all and watch are mutually exclusive") | 		return fmt.Errorf("create-all and watch are mutually exclusive") | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	if cfg.loadKernelModules && !cfg.createAll { | ||||||
|  | 		m.logger.Warn("load-kernel-modules is only applicable when create-all is set; ignoring") | ||||||
|  | 		cfg.loadKernelModules = false | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	if cfg.createDeviceNodes && !cfg.createAll { | ||||||
|  | 		m.logger.Warn("create-device-nodes is only applicable when create-all is set; ignoring") | ||||||
|  | 		cfg.createDeviceNodes = false | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	return nil | 	return nil | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -137,6 +162,8 @@ func (m command) run(c *cli.Context, cfg *config) error { | |||||||
| 		WithDriverRoot(cfg.driverRoot), | 		WithDriverRoot(cfg.driverRoot), | ||||||
| 		WithDryRun(cfg.dryRun), | 		WithDryRun(cfg.dryRun), | ||||||
| 		WithCreateAll(cfg.createAll), | 		WithCreateAll(cfg.createAll), | ||||||
|  | 		WithLoadKernelModules(cfg.loadKernelModules), | ||||||
|  | 		WithCreateDeviceNodes(cfg.createDeviceNodes), | ||||||
| 	) | 	) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return fmt.Errorf("failed to create symlink creator: %v", err) | 		return fmt.Errorf("failed to create symlink creator: %v", err) | ||||||
| @ -192,6 +219,8 @@ type linkCreator struct { | |||||||
| 	devCharPath       string | 	devCharPath       string | ||||||
| 	dryRun            bool | 	dryRun            bool | ||||||
| 	createAll         bool | 	createAll         bool | ||||||
|  | 	createDeviceNodes bool | ||||||
|  | 	loadKernelModules bool | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // Creator is an interface for creating symlinks to /dev/nv* devices in /dev/char.
 | // Creator is an interface for creating symlinks to /dev/nv* devices in /dev/char.
 | ||||||
| @ -218,6 +247,10 @@ func NewSymlinkCreator(opts ...Option) (Creator, error) { | |||||||
| 		c.devCharPath = defaultDevCharPath | 		c.devCharPath = defaultDevCharPath | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	if err := c.setup(); err != nil { | ||||||
|  | 		return nil, err | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	if c.createAll { | 	if c.createAll { | ||||||
| 		lister, err := newAllPossible(c.logger, c.driverRoot) | 		lister, err := newAllPossible(c.logger, c.driverRoot) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| @ -230,6 +263,34 @@ func NewSymlinkCreator(opts ...Option) (Creator, error) { | |||||||
| 	return c, nil | 	return c, nil | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | func (m linkCreator) setup() error { | ||||||
|  | 	if !m.loadKernelModules && !m.createDeviceNodes { | ||||||
|  | 		return nil | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	s, err := system.New( | ||||||
|  | 		system.WithLogger(m.logger), | ||||||
|  | 		system.WithDryRun(m.dryRun), | ||||||
|  | 	) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return err | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	if m.loadKernelModules { | ||||||
|  | 		if err := s.LoadNVIDIAKernelModules(); err != nil { | ||||||
|  | 			return fmt.Errorf("failed to load NVIDIA kernel modules: %v", err) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	if m.createDeviceNodes { | ||||||
|  | 		if err := s.CreateNVIDIAControlDeviceNodesAt(m.driverRoot); err != nil { | ||||||
|  | 			return fmt.Errorf("failed to create NVIDIA device nodes: %v", err) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return nil | ||||||
|  | } | ||||||
|  | 
 | ||||||
| // WithDriverRoot sets the driver root path.
 | // WithDriverRoot sets the driver root path.
 | ||||||
| func WithDriverRoot(root string) Option { | func WithDriverRoot(root string) Option { | ||||||
| 	return func(c *linkCreator) { | 	return func(c *linkCreator) { | ||||||
| @ -265,6 +326,20 @@ func WithCreateAll(createAll bool) Option { | |||||||
| 	} | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | // WithLoadKernelModules sets the loadKernelModules flag for the linkCreator.
 | ||||||
|  | func WithLoadKernelModules(loadKernelModules bool) Option { | ||||||
|  | 	return func(lc *linkCreator) { | ||||||
|  | 		lc.loadKernelModules = loadKernelModules | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | // WithCreateDeviceNodes sets the createDeviceNodes flag for the linkCreator.
 | ||||||
|  | func WithCreateDeviceNodes(createDeviceNodes bool) Option { | ||||||
|  | 	return func(lc *linkCreator) { | ||||||
|  | 		lc.createDeviceNodes = createDeviceNodes | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
| // CreateLinks creates symlinks for all NVIDIA device nodes found in the driver root.
 | // CreateLinks creates symlinks for all NVIDIA device nodes found in the driver root.
 | ||||||
| func (m linkCreator) CreateLinks() error { | func (m linkCreator) CreateLinks() error { | ||||||
| 	deviceNodes, err := m.lister.DeviceNodes() | 	deviceNodes, err := m.lister.DeviceNodes() | ||||||
|  | |||||||
| @ -34,6 +34,8 @@ type options struct { | |||||||
| 	dryRun bool | 	dryRun bool | ||||||
| 
 | 
 | ||||||
| 	control bool | 	control bool | ||||||
|  | 
 | ||||||
|  | 	loadKernelModules bool | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // NewCommand constructs a command sub-command with the specified logger
 | // NewCommand constructs a command sub-command with the specified logger
 | ||||||
| @ -72,6 +74,11 @@ func (m command) build() *cli.Command { | |||||||
| 			Usage:       "create all control device nodes: nvidiactl, nvidia-modeset, nvidia-uvm, nvidia-uvm-tools", | 			Usage:       "create all control device nodes: nvidiactl, nvidia-modeset, nvidia-uvm, nvidia-uvm-tools", | ||||||
| 			Destination: &opts.control, | 			Destination: &opts.control, | ||||||
| 		}, | 		}, | ||||||
|  | 		&cli.BoolFlag{ | ||||||
|  | 			Name:        "load-kernel-modules", | ||||||
|  | 			Usage:       "load the NVIDIA Kernel Modules before creating devices nodes", | ||||||
|  | 			Destination: &opts.loadKernelModules, | ||||||
|  | 		}, | ||||||
| 		&cli.BoolFlag{ | 		&cli.BoolFlag{ | ||||||
| 			Name:        "dry-run", | 			Name:        "dry-run", | ||||||
| 			Usage:       "if set, the command will not create any symlinks.", | 			Usage:       "if set, the command will not create any symlinks.", | ||||||
| @ -92,6 +99,7 @@ func (m command) run(c *cli.Context, opts *options) error { | |||||||
| 	s, err := system.New( | 	s, err := system.New( | ||||||
| 		system.WithLogger(m.logger), | 		system.WithLogger(m.logger), | ||||||
| 		system.WithDryRun(opts.dryRun), | 		system.WithDryRun(opts.dryRun), | ||||||
|  | 		system.WithLoadKernelModules(opts.loadKernelModules), | ||||||
| 	) | 	) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return fmt.Errorf("failed to create library: %v", err) | 		return fmt.Errorf("failed to create library: %v", err) | ||||||
|  | |||||||
| @ -34,3 +34,10 @@ func WithDryRun(dryRun bool) Option { | |||||||
| 		i.dryRun = dryRun | 		i.dryRun = dryRun | ||||||
| 	} | 	} | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | // WithLoadKernelModules sets the load kernel modules flag
 | ||||||
|  | func WithLoadKernelModules(loadKernelModules bool) Option { | ||||||
|  | 	return func(i *Interface) { | ||||||
|  | 		i.loadKernelModules = loadKernelModules | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | |||||||
| @ -19,6 +19,7 @@ package system | |||||||
| import ( | import ( | ||||||
| 	"fmt" | 	"fmt" | ||||||
| 	"os" | 	"os" | ||||||
|  | 	"os/exec" | ||||||
| 	"path/filepath" | 	"path/filepath" | ||||||
| 	"strings" | 	"strings" | ||||||
| 
 | 
 | ||||||
| @ -31,7 +32,7 @@ import ( | |||||||
| type Interface struct { | type Interface struct { | ||||||
| 	logger            *logrus.Logger | 	logger            *logrus.Logger | ||||||
| 	dryRun            bool | 	dryRun            bool | ||||||
| 
 | 	loadKernelModules bool | ||||||
| 	nvidiaDevices     nvidiaDevices | 	nvidiaDevices     nvidiaDevices | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| @ -44,6 +45,12 @@ func New(opts ...Option) (*Interface, error) { | |||||||
| 		opt(i) | 		opt(i) | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
|  | 	if i.loadKernelModules { | ||||||
|  | 		if err := i.LoadNVIDIAKernelModules(); err != nil { | ||||||
|  | 			return nil, fmt.Errorf("failed to load kernel modules: %v", err) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
| 	devices, err := devices.GetNVIDIADevices() | 	devices, err := devices.GetNVIDIADevices() | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
| 		return nil, fmt.Errorf("failed to create devices info: %v", err) | 		return nil, fmt.Errorf("failed to create devices info: %v", err) | ||||||
| @ -108,6 +115,26 @@ func (m *Interface) createDeviceNode(path string, major int, minor int) error { | |||||||
| 	return unix.Chmod(path, 0666) | 	return unix.Chmod(path, 0666) | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | // LoadNVIDIAKernelModules loads the NVIDIA kernel modules.
 | ||||||
|  | func (m *Interface) LoadNVIDIAKernelModules() error { | ||||||
|  | 	modules := []string{"nvidia", "nvidia-uvm", "nvidia-modeset"} | ||||||
|  | 
 | ||||||
|  | 	for _, module := range modules { | ||||||
|  | 		if m.dryRun { | ||||||
|  | 			m.logger.Infof("Running: /sbin/modprobe %s", module) | ||||||
|  | 			continue | ||||||
|  | 		} | ||||||
|  | 		cmd := exec.Command("/sbin/modprobe", module) | ||||||
|  | 
 | ||||||
|  | 		if output, err := cmd.CombinedOutput(); err != nil { | ||||||
|  | 			m.logger.Debugf("Failed to load kernel module %s: %v", module, string(output)) | ||||||
|  | 			return fmt.Errorf("failed to load kernel module %s: %v", module, err) | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	return nil | ||||||
|  | } | ||||||
|  | 
 | ||||||
| type nvidiaDevices struct { | type nvidiaDevices struct { | ||||||
| 	devices.Devices | 	devices.Devices | ||||||
| } | } | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user