From e774c51c97fbb185857693ccde1887260157a92d Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 27 Mar 2023 22:11:55 +0200 Subject: [PATCH 1/2] Add nvidia-ctk system create-device-nodes command This change adds an nvidia-ctk system create-device-nodes command for creating NVIDIA device nodes. Currently this is limited to control devices (nvidia-uvm, nvidia-uvm-tools, nvidia-modeset, nvidiactl). A --dry-run mode is included for outputting commands that would be executed and the driver root can be specified. Signed-off-by: Evan Lezar --- .../create-device-nodes.go | 107 +++++++++++++ cmd/nvidia-ctk/system/system.go | 2 + internal/system/options.go | 36 +++++ internal/system/system.go | 149 ++++++++++++++++++ 4 files changed, 294 insertions(+) create mode 100644 cmd/nvidia-ctk/system/create-device-nodes/create-device-nodes.go create mode 100644 internal/system/options.go create mode 100644 internal/system/system.go diff --git a/cmd/nvidia-ctk/system/create-device-nodes/create-device-nodes.go b/cmd/nvidia-ctk/system/create-device-nodes/create-device-nodes.go new file mode 100644 index 00000000..939c8525 --- /dev/null +++ b/cmd/nvidia-ctk/system/create-device-nodes/create-device-nodes.go @@ -0,0 +1,107 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/
+
+package createdevicenodes
+
+import (
+	"fmt"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/system"
+	"github.com/sirupsen/logrus"
+	"github.com/urfave/cli/v2"
+)
+
+type command struct {
+	logger *logrus.Logger
+}
+
+type options struct {
+	driverRoot string // root passed to CreateNVIDIAControlDeviceNodesAt
+
+	dryRun bool // forwarded to system.WithDryRun
+
+	control bool // set by --control-devices; gates creation of the control device nodes
+}
+
+// NewCommand constructs the create-device-nodes sub-command with the specified logger.
+func NewCommand(logger *logrus.Logger) *cli.Command {
+	c := command{
+		logger: logger,
+	}
+	return c.build()
+}
+
+// build assembles the cli.Command, binding each flag to a field of a shared options value.
+func (m command) build() *cli.Command {
+	opts := options{}
+
+	c := cli.Command{
+		Name:  "create-device-nodes",
+		Usage: "A utility to create NVIDIA device nodes",
+		Before: func(c *cli.Context) error {
+			return m.validateFlags(c, &opts)
+		},
+		Action: func(c *cli.Context) error {
+			return m.run(c, &opts)
+		},
+	}
+
+	c.Flags = []cli.Flag{
+		&cli.StringFlag{
+			Name:        "driver-root",
+			Usage:       "the path to the driver root. Device nodes will be created at `DRIVER_ROOT`/dev",
+			Value:       "/",
+			Destination: &opts.driverRoot,
+			EnvVars:     []string{"DRIVER_ROOT"},
+		},
+		&cli.BoolFlag{
+			Name:        "control-devices",
+			Usage:       "create all control device nodes: nvidiactl, nvidia-modeset, nvidia-uvm, nvidia-uvm-tools",
+			Destination: &opts.control,
+		},
+		&cli.BoolFlag{
+			Name:        "dry-run",
+			Usage:       "if set, the command will not create any device nodes.",
+			Value:       false,
+			Destination: &opts.dryRun,
+			EnvVars:     []string{"DRY_RUN"},
+		},
+	}
+
+	return &c
+}
+
+func (m command) validateFlags(r *cli.Context, opts *options) error {
+	return nil // no flag validation is performed at present
+}
+
+func (m command) run(c *cli.Context, opts *options) error {
+	s, err := system.New(
+		system.WithLogger(m.logger),
+		system.WithDryRun(opts.dryRun),
+	)
+	if err != nil {
+		return fmt.Errorf("failed to create library: %w", err)
+	}
+
+	if opts.control { // without --control-devices nothing is created
+		m.logger.Infof("Creating control device nodes at %s", opts.driverRoot)
+		if err := s.CreateNVIDIAControlDeviceNodesAt(opts.driverRoot); err != nil {
+			return fmt.Errorf("failed to create control device nodes: %w", err)
+		}
+	}
+	return nil
+}
diff --git a/cmd/nvidia-ctk/system/system.go b/cmd/nvidia-ctk/system/system.go
index a9f2e2ea..b4d2f049 100644
--- a/cmd/nvidia-ctk/system/system.go
+++ b/cmd/nvidia-ctk/system/system.go
@@ -18,6 +18,7 @@ package system
 
 import (
 	devchar "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/system/create-dev-char-symlinks"
+	devicenodes "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/system/create-device-nodes"
 	"github.com/sirupsen/logrus"
 	"github.com/urfave/cli/v2"
 )
@@ -43,6 +44,7 @@ func (m command) build() *cli.Command {
 
 	system.Subcommands = []*cli.Command{
 		devchar.NewCommand(m.logger),
+		devicenodes.NewCommand(m.logger),
 	}
 
 	return &system
diff --git a/internal/system/options.go b/internal/system/options.go
new file mode 100644
index 00000000..fb0fbb38
--- /dev/null
+++ b/internal/system/options.go
@@ -0,0 +1,36 @@
+/**
+# Copyright (c) NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package system
+
+import "github.com/sirupsen/logrus"
+
+// Option is a functional option that modifies an Interface; New applies each one in order.
+type Option func(*Interface)
+
+// WithLogger returns an Option that sets the logger of the Interface.
+func WithLogger(logger *logrus.Logger) Option {
+	return func(i *Interface) {
+		i.logger = logger
+	}
+}
+
+// WithDryRun returns an Option that sets the dryRun flag of the Interface.
+func WithDryRun(dryRun bool) Option {
+	return func(i *Interface) {
+		i.dryRun = dryRun
+	}
+}
diff --git a/internal/system/system.go b/internal/system/system.go
new file mode 100644
index 00000000..d3ad63eb
--- /dev/null
+++ b/internal/system/system.go
@@ -0,0 +1,149 @@
+/**
+# Copyright (c) NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package system
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+// Interface creates NVIDIA device nodes; despite its name it is a concrete struct configured through Options.
+type Interface struct {
+	logger *logrus.Logger
+	dryRun bool
+
+	nvidiaDevices nvidiaDevices
+}
+
+// New constructs an Interface with the specified options and populates it from devices.GetNVIDIADevices.
+func New(opts ...Option) (*Interface, error) {
+	i := &Interface{
+		logger: logrus.StandardLogger(), // default; replaced when WithLogger is supplied
+	}
+	for _, opt := range opts {
+		opt(i)
+	}
+
+	devs, err := devices.GetNVIDIADevices() // named devs so the devices package is not shadowed
+	if err != nil {
+		return nil, fmt.Errorf("failed to create devices info: %w", err)
+	}
+	i.nvidiaDevices = nvidiaDevices{devs}
+
+	return i, nil
+}
+
+// CreateNVIDIAControlDeviceNodesAt creates the NVIDIA control device nodes associated with the NVIDIA driver at the specified root.
+func (m *Interface) CreateNVIDIAControlDeviceNodesAt(root string) error {
+	controlNodes := []string{"/dev/nvidiactl", "/dev/nvidia-modeset", "/dev/nvidia-uvm", "/dev/nvidia-uvm-tools"}
+
+	for _, node := range controlNodes {
+		path := filepath.Join(root, node)
+		err := m.CreateNVIDIADeviceNode(path)
+		if err != nil { // stop at the first failure; later nodes are not attempted
+			return fmt.Errorf("failed to create device node %s: %w", path, err)
+		}
+	}
+
+	return nil
+}
+
+// CreateNVIDIADeviceNode creates a specified device node associated with the NVIDIA driver.
+func (m *Interface) CreateNVIDIADeviceNode(path string) error {
+	node := filepath.Base(path) // major/minor lookup is keyed on the file name only
+	if !strings.HasPrefix(node, "nvidia") {
+		return fmt.Errorf("invalid device node %q", node)
+	}
+
+	major, err := m.nvidiaDevices.Major(node)
+	if err != nil {
+		return fmt.Errorf("failed to determine major: %w", err)
+	}
+
+	minor, err := m.nvidiaDevices.Minor(node)
+	if err != nil {
+		return fmt.Errorf("failed to determine minor: %w", err)
+	}
+
+	return m.createDeviceNode(path, int(major), int(minor))
+}
+
+func (m *Interface) createDeviceNode(path string, major int, minor int) error {
+	if m.dryRun { // only log the equivalent mknod command; nothing is created
+		m.logger.Infof("Running: mknod --mode=0666 %s c %d %d", path, major, minor)
+		return nil
+	}
+
+	if _, err := os.Stat(path); err == nil {
+		m.logger.Infof("Skipping: %s already exists", path)
+		return nil
+	} else if !os.IsNotExist(err) {
+		return fmt.Errorf("failed to stat %s: %w", path, err)
+	}
+
+	err := unix.Mknod(path, unix.S_IFCHR, int(unix.Mkdev(uint32(major), uint32(minor))))
+	if err != nil {
+		return err
+	}
+	return unix.Chmod(path, 0666) // permissions are set explicitly after creation
+}
+
+type nvidiaDevices struct {
+	devices.Devices
+}
+
+// Major returns the major number for the specified NVIDIA device node.
+// If the device node is not supported, an error is returned.
+func (n *nvidiaDevices) Major(node string) (int64, error) {
+	var valid bool
+	var major devices.Major
+	switch node {
+	case "nvidia-uvm", "nvidia-uvm-tools":
+		major, valid = n.Get(devices.NVIDIAUVM)
+	case "nvidia-modeset", "nvidiactl":
+		major, valid = n.Get(devices.NVIDIAGPU)
+	}
+
+	if !valid { // unknown node name, or Get returned false for it
+		return 0, fmt.Errorf("invalid device node %q", node)
+	}
+
+	return int64(major), nil
+}
+
+// Minor returns the minor number for the specified NVIDIA device node.
+// If the device node is not supported, an error is returned.
+func (n *nvidiaDevices) Minor(node string) (int64, error) {
+	switch node { // each supported node name maps to a constant from the devices package
+	case "nvidia-modeset":
+		return devices.NVIDIAModesetMinor, nil
+	case "nvidia-uvm-tools":
+		return devices.NVIDIAUVMToolsMinor, nil
+	case "nvidia-uvm":
+		return devices.NVIDIAUVMMinor, nil
+	case "nvidiactl":
+		return devices.NVIDIACTLMinor, nil
+	}
+
+	return 0, fmt.Errorf("invalid device node %q", node) // any other name is unsupported
+}
From cdaaf5e46fe8fd05cbb7de0ebe0a2db849c16a4d Mon Sep 17 00:00:00 2001
From: Evan Lezar
Date: Mon, 27 Mar 2023 23:02:24 +0200
Subject: [PATCH 2/2] Generate device nodes when creating management spec

Signed-off-by: Evan Lezar
---
 tools/container/toolkit/toolkit.go | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tools/container/toolkit/toolkit.go b/tools/container/toolkit/toolkit.go
index f07d48b9..e33a9d2b 100644
--- a/tools/container/toolkit/toolkit.go
+++ b/tools/container/toolkit/toolkit.go
@@ -23,6 +23,7 @@ import (
 	"path/filepath"
 	"strings"
 
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/system"
 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform"
 	"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
@@ -608,6 +609,16 @@ func generateCDISpec(opts *options, nvidiaCTKPath string) error {
 		return nil
 	}
 
+	log.Infof("Creating control device nodes at %v", opts.DriverRootCtrPath)
+	s, err := system.New()
+	if err != nil {
+		return fmt.Errorf("failed to create library: %v", err)
+	}
+	if err := s.CreateNVIDIAControlDeviceNodesAt(opts.DriverRootCtrPath); err != nil {
+		return fmt.Errorf("failed to create control device nodes: %v", err)
+	}
+
+	log.Info("Generating CDI spec for management containers")
 	cdilib, err := nvcdi.New(
 		nvcdi.WithMode(nvcdi.ModeManagement),
 		nvcdi.WithDriverRoot(opts.DriverRootCtrPath),