Add --create-all mode to creation of dev/char symlinks

This change adds a --create-all mode to the create-dev-char-symlinks hook.
This mode creates all POSSIBLE symlinks to device nodes for regular and cap
devices, with the number of GPUs inferred from the PCI device information.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2023-01-24 10:06:21 +01:00
parent 95394e0fc8
commit 1d7e419008
2 changed files with 219 additions and 7 deletions

View File

@ -0,0 +1,175 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package devchar
import (
"fmt"
"path/filepath"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
"github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci"
)
// allPossible is the nodeLister returned by newAllPossible. Its DeviceNodes
// method enumerates the device nodes that could exist for the GPUs found in
// the PCI device information, instead of inspecting which nodes are present.
type allPossible struct {
// logger receives the informational message emitted when no GPUs are found.
logger *logrus.Logger
// driverRoot is joined in front of every generated device node path and in
// front of the PCI devices root that is scanned for GPUs.
driverRoot string
// deviceMajors is queried by device name to obtain the major number that is
// stored in each generated device node.
deviceMajors devices.Devices
// migCaps maps a MIG capability to its minor; the minor also provides the
// device path of the matching capability device node.
migCaps nvcaps.MigCaps
}
// newAllPossible returns a new allPossible device node lister.
// This lister lists all possible device nodes for NVIDIA GPUs, control devices, and capability devices.
//
// The NVIDIA device majors and the MIG capabilities are read once, here, and
// stored in the lister. An error is returned if either of them cannot be read,
// or if a device major that the lister needs in order to build its device
// nodes is not registered.
func newAllPossible(logger *logrus.Logger, driverRoot string) (nodeLister, error) {
	deviceMajors, err := devices.GetNVIDIADevices()
	if err != nil {
		return nil, fmt.Errorf("failed reading device majors: %w", err)
	}

	migCaps, err := nvcaps.NewMigCaps()
	if err != nil {
		return nil, fmt.Errorf("failed to read MIG caps: %w", err)
	}
	if migCaps == nil {
		// Treat absent MIG capabilities as an empty set.
		migCaps = make(nvcaps.MigCaps)
	}

	// newDeviceNode discards the "found" result of the major lookup, so a
	// missing major would otherwise end up in device nodes unnoticed. Check
	// the majors that will be used up front and fail with a clear error.
	// The GPU and UVM majors are always used for the control and GPU device
	// nodes; the caps major is only used when MIG capabilities are present.
	requiredMajors := []devices.Name{devices.NVIDIAGPU, devices.NVIDIAUVM}
	if len(migCaps) > 0 {
		requiredMajors = append(requiredMajors, devices.NVIDIACaps)
	}
	for _, name := range requiredMajors {
		if _, exists := deviceMajors.Get(name); !exists {
			return nil, fmt.Errorf("missing required device major %v", name)
		}
	}

	l := allPossible{
		logger:       logger,
		driverRoot:   driverRoot,
		deviceMajors: deviceMajors,
		migCaps:      migCaps,
	}

	return l, nil
}
// DeviceNodes lists every device node that may exist for the NVIDIA GPUs
// discovered through the PCI device information below the driver root: the
// control devices first, followed by the GPU and capability device nodes of
// each GPU index in turn.
//
// When no GPUs are discovered this is logged and a nil list is returned
// without an error.
func (m allPossible) DeviceNodes() ([]deviceNode, error) {
	pciRoot := filepath.Join(m.driverRoot, nvpci.PCIDevicesRoot)
	gpus, err := nvpci.NewFrom(pciRoot).GetGPUs()
	if err != nil {
		return nil, fmt.Errorf("failed to get GPU information: %v", err)
	}

	if len(gpus) == 0 {
		m.logger.Infof("No NVIDIA devices found in %s", m.driverRoot)
		return nil, nil
	}

	nodes, err := m.getControlDeviceNodes()
	if err != nil {
		return nil, fmt.Errorf("failed to get control device nodes: %v", err)
	}

	// Only the number of GPUs is used: each index in [0, len(gpus)) is treated
	// as a GPU number when generating the per-GPU nodes.
	for index := range gpus {
		nodes = append(nodes, m.getGPUDeviceNodes(index)...)
		nodes = append(nodes, m.getNVCapDeviceNodes(index)...)
	}

	return nodes, nil
}
// getControlDeviceNodes builds the list of control device nodes: the four
// fixed control devices of the GPU and UVM drivers, followed by the MIG
// "config" and "monitor" capability devices for which a minor is known.
// The returned error is always nil.
func (m allPossible) getControlDeviceNodes() ([]deviceNode, error) {
	// Control devices that are always listed.
	nodes := []deviceNode{
		m.newDeviceNode(devices.NVIDIAGPU, "/dev/nvidia-modeset", devices.NVIDIAModesetMinor),
		m.newDeviceNode(devices.NVIDIAGPU, "/dev/nvidiactl", devices.NVIDIACTLMinor),
		m.newDeviceNode(devices.NVIDIAUVM, "/dev/nvidia-uvm", devices.NVIDIAUVMMinor),
		m.newDeviceNode(devices.NVIDIAUVM, "/dev/nvidia-uvm-tools", devices.NVIDIAUVMToolsMinor),
	}

	// MIG control devices are only listed when present in the MIG caps.
	for _, name := range []nvcaps.MigCap{"config", "monitor"} {
		if minor, ok := m.migCaps[name]; ok {
			nodes = append(nodes, m.newDeviceNode(devices.NVIDIACaps, minor.DevicePath(), int(minor)))
		}
	}

	return nodes, nil
}
// getGPUDeviceNodes returns the single device node for the given GPU number:
// /dev/nvidia<gpu>, with the GPU number also used as the minor.
func (m allPossible) getGPUDeviceNodes(gpu int) []deviceNode {
	nodePath := fmt.Sprintf("/dev/nvidia%d", gpu)
	return []deviceNode{m.newDeviceNode(devices.NVIDIAGPU, nodePath, gpu)}
}
// getNVCapDeviceNodes returns the capability device nodes for the given GPU.
//
// GPU instances are probed in the MIG caps starting at index 0 and probing
// stops at the first index that has no entry. For every GPU instance found,
// its compute instances are probed the same way. Each GPU instance node is
// listed before the nodes of its compute instances. The result is nil when
// the GPU has no GPU instance 0 entry.
func (m allPossible) getNVCapDeviceNodes(gpu int) []deviceNode {
	var nodes []deviceNode
	// addNode appends the caps device node that belongs to a MIG minor.
	addNode := func(minor nvcaps.MigMinor) {
		nodes = append(nodes, m.newDeviceNode(devices.NVIDIACaps, minor.DevicePath(), int(minor)))
	}

	for gi := 0; ; gi++ {
		giMinor, found := m.migCaps[nvcaps.NewGPUInstanceCap(gpu, gi)]
		if !found {
			break
		}
		addNode(giMinor)

		for ci := 0; ; ci++ {
			ciMinor, found := m.migCaps[nvcaps.NewComputeInstanceCap(gpu, gi, ci)]
			if !found {
				break
			}
			addNode(ciMinor)
		}
	}

	return nodes
}
// newDeviceNode builds a device node for the named device class with the
// given minor. The major is looked up in the device majors by deviceName and
// the path is placed below the driver root.
func (m allPossible) newDeviceNode(deviceName devices.Name, path string, minor int) deviceNode {
	// The lookup's "found" flag is discarded.
	// NOTE(review): if deviceName has no registered major the node presumably
	// ends up with major 0 - confirm that callers guarantee the major exists.
	major, _ := m.deviceMajors.Get(deviceName)

	var node deviceNode
	node.path = filepath.Join(m.driverRoot, path)
	node.major = uint32(major)
	node.minor = uint32(minor)
	return node
}

View File

@ -42,6 +42,7 @@ type config struct {
driverRoot string driverRoot string
dryRun bool dryRun bool
watch bool watch bool
createAll bool
} }
// NewCommand constructs a hook sub-command with the specified logger // NewCommand constructs a hook sub-command with the specified logger
@ -60,6 +61,9 @@ func (m command) build() *cli.Command {
c := cli.Command{ c := cli.Command{
Name: "create-dev-char-symlinks", Name: "create-dev-char-symlinks",
Usage: "A hook to create symlinks to possible /dev/nv* devices in /dev/char", Usage: "A hook to create symlinks to possible /dev/nv* devices in /dev/char",
Before: func(c *cli.Context) error {
return m.validateFlags(c, &cfg)
},
Action: func(c *cli.Context) error { Action: func(c *cli.Context) error {
return m.run(c, &cfg) return m.run(c, &cfg)
}, },
@ -87,6 +91,12 @@ func (m command) build() *cli.Command {
Destination: &cfg.watch, Destination: &cfg.watch,
EnvVars: []string{"WATCH"}, EnvVars: []string{"WATCH"},
}, },
&cli.BoolFlag{
Name: "create-all",
Usage: "Create all possible /dev/char symlinks instead of limiting these to existing device nodes.",
Destination: &cfg.createAll,
EnvVars: []string{"CREATE_ALL"},
},
&cli.BoolFlag{ &cli.BoolFlag{
Name: "dry-run", Name: "dry-run",
Usage: "If set, the command will not create any symlinks.", Usage: "If set, the command will not create any symlinks.",
@ -99,8 +109,15 @@ func (m command) build() *cli.Command {
return &c return &c
} }
func (m command) run(c *cli.Context, cfg *config) error { func (m command) validateFlags(r *cli.Context, cfg *config) error {
if cfg.createAll && cfg.watch {
return fmt.Errorf("create-all and watch are mutually exclusive")
}
return nil
}
func (m command) run(c *cli.Context, cfg *config) error {
var watcher *fsnotify.Watcher var watcher *fsnotify.Watcher
var sigs chan os.Signal var sigs chan os.Signal
@ -114,14 +131,19 @@ func (m command) run(c *cli.Context, cfg *config) error {
sigs = newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT) sigs = newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
} }
l := NewSymlinkCreator( l, err := NewSymlinkCreator(
WithLogger(m.logger), WithLogger(m.logger),
WithDevCharPath(cfg.devCharPath), WithDevCharPath(cfg.devCharPath),
WithDriverRoot(cfg.driverRoot), WithDriverRoot(cfg.driverRoot),
WithDryRun(cfg.dryRun), WithDryRun(cfg.dryRun),
WithCreateAll(cfg.createAll),
) )
if err != nil {
return fmt.Errorf("failed to create symlink creator: %v", err)
}
create: create:
err := l.CreateLinks() err = l.CreateLinks()
if err != nil { if err != nil {
return fmt.Errorf("failed to create links: %v", err) return fmt.Errorf("failed to create links: %v", err)
} }
@ -169,6 +191,7 @@ type linkCreator struct {
driverRoot string driverRoot string
devCharPath string devCharPath string
dryRun bool dryRun bool
createAll bool
} }
// Creator is an interface for creating symlinks to /dev/nv* devices in /dev/char. // Creator is an interface for creating symlinks to /dev/nv* devices in /dev/char.
@ -180,7 +203,7 @@ type Creator interface {
type Option func(*linkCreator) type Option func(*linkCreator)
// NewSymlinkCreator creates a new linkCreator. // NewSymlinkCreator creates a new linkCreator.
func NewSymlinkCreator(opts ...Option) Creator { func NewSymlinkCreator(opts ...Option) (Creator, error) {
c := linkCreator{} c := linkCreator{}
for _, opt := range opts { for _, opt := range opts {
opt(&c) opt(&c)
@ -194,10 +217,17 @@ func NewSymlinkCreator(opts ...Option) Creator {
if c.devCharPath == "" { if c.devCharPath == "" {
c.devCharPath = defaultDevCharPath c.devCharPath = defaultDevCharPath
} }
if c.lister == nil {
if c.createAll {
lister, err := newAllPossible(c.logger, c.driverRoot)
if err != nil {
return nil, fmt.Errorf("failed to create all possible device lister: %v", err)
}
c.lister = lister
} else {
c.lister = existing{c.logger, c.driverRoot} c.lister = existing{c.logger, c.driverRoot}
} }
return c return c, nil
} }
// WithDriverRoot sets the driver root path. // WithDriverRoot sets the driver root path.
@ -228,7 +258,14 @@ func WithLogger(logger *logrus.Logger) Option {
} }
} }
// CreateLinks creates symlinks for all device nodes returned by the configured lister. // WithCreateAll sets the createAll flag for the linkCreator.
func WithCreateAll(createAll bool) Option {
return func(lc *linkCreator) {
lc.createAll = createAll
}
}
// CreateLinks creates symlinks for all NVIDIA device nodes found in the driver root.
func (m linkCreator) CreateLinks() error { func (m linkCreator) CreateLinks() error {
deviceNodes, err := m.lister.DeviceNodes() deviceNodes, err := m.lister.DeviceNodes()
if err != nil { if err != nil {