Move create-dev-char-symlinks subcommand from hook to system

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2023-01-27 11:41:30 +01:00
parent 962d38e9dd
commit 8188400c97
5 changed files with 5 additions and 6 deletions

View File

@@ -1,175 +0,0 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package devchar
import (
"fmt"
"path/filepath"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
"github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci"
)
type allPossible struct {
logger *logrus.Logger
driverRoot string
deviceMajors devices.Devices
migCaps nvcaps.MigCaps
}
// newAllPossible returns a new allPossible device node lister.
// This lister lists all possible device nodes for NVIDIA GPUs, control devices, and capability devices.
func newAllPossible(logger *logrus.Logger, driverRoot string) (nodeLister, error) {
deviceMajors, err := devices.GetNVIDIADevices()
if err != nil {
return nil, fmt.Errorf("failed reading device majors: %v", err)
}
migCaps, err := nvcaps.NewMigCaps()
if err != nil {
return nil, fmt.Errorf("failed to read MIG caps: %v", err)
}
if migCaps == nil {
migCaps = make(nvcaps.MigCaps)
}
l := allPossible{
logger: logger,
driverRoot: driverRoot,
deviceMajors: deviceMajors,
migCaps: migCaps,
}
return l, nil
}
// DeviceNodes returns a list of all possible device nodes for NVIDIA GPUs, control devices, and capability devices.
func (m allPossible) DeviceNodes() ([]deviceNode, error) {
gpus, err := nvpci.NewFrom(
filepath.Join(m.driverRoot, nvpci.PCIDevicesRoot),
).GetGPUs()
if err != nil {
return nil, fmt.Errorf("failed to get GPU information: %v", err)
}
count := len(gpus)
if count == 0 {
m.logger.Infof("No NVIDIA devices found in %s", m.driverRoot)
return nil, nil
}
deviceNodes, err := m.getControlDeviceNodes()
if err != nil {
return nil, fmt.Errorf("failed to get control device nodes: %v", err)
}
for gpu := 0; gpu < count; gpu++ {
deviceNodes = append(deviceNodes, m.getGPUDeviceNodes(gpu)...)
deviceNodes = append(deviceNodes, m.getNVCapDeviceNodes(gpu)...)
}
return deviceNodes, nil
}
// getControlDeviceNodes generates a list of control devices
func (m allPossible) getControlDeviceNodes() ([]deviceNode, error) {
var deviceNodes []deviceNode
// Define the control devices for standard GPUs.
controlDevices := []deviceNode{
m.newDeviceNode(devices.NVIDIAGPU, "/dev/nvidia-modeset", devices.NVIDIAModesetMinor),
m.newDeviceNode(devices.NVIDIAGPU, "/dev/nvidiactl", devices.NVIDIACTLMinor),
m.newDeviceNode(devices.NVIDIAUVM, "/dev/nvidia-uvm", devices.NVIDIAUVMMinor),
m.newDeviceNode(devices.NVIDIAUVM, "/dev/nvidia-uvm-tools", devices.NVIDIAUVMToolsMinor),
}
deviceNodes = append(deviceNodes, controlDevices...)
for _, migControlDevice := range []nvcaps.MigCap{"config", "monitor"} {
migControlMinor, exist := m.migCaps[migControlDevice]
if !exist {
continue
}
d := m.newDeviceNode(
devices.NVIDIACaps,
migControlMinor.DevicePath(),
int(migControlMinor),
)
deviceNodes = append(deviceNodes, d)
}
return deviceNodes, nil
}
// getGPUDeviceNodes generates a list of device nodes for a given GPU.
func (m allPossible) getGPUDeviceNodes(gpu int) []deviceNode {
d := m.newDeviceNode(
devices.NVIDIAGPU,
fmt.Sprintf("/dev/nvidia%d", gpu),
gpu,
)
return []deviceNode{d}
}
// getNVCapDeviceNodes generates a list of cap device nodes for a given GPU.
func (m allPossible) getNVCapDeviceNodes(gpu int) []deviceNode {
var selectedCapMinors []nvcaps.MigMinor
for gi := 0; ; gi++ {
giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
giMinor, exist := m.migCaps[giCap]
if !exist {
break
}
selectedCapMinors = append(selectedCapMinors, giMinor)
for ci := 0; ; ci++ {
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
ciMinor, exist := m.migCaps[ciCap]
if !exist {
break
}
selectedCapMinors = append(selectedCapMinors, ciMinor)
}
}
var deviceNodes []deviceNode
for _, capMinor := range selectedCapMinors {
d := m.newDeviceNode(
devices.NVIDIACaps,
capMinor.DevicePath(),
int(capMinor),
)
deviceNodes = append(deviceNodes, d)
}
return deviceNodes
}
// newDeviceNode creates a new device node with the specified path and major/minor numbers.
// The path is adjusted for the specified driver root.
func (m allPossible) newDeviceNode(deviceName devices.Name, path string, minor int) deviceNode {
major, _ := m.deviceMajors.Get(deviceName)
return deviceNode{
path: filepath.Join(m.driverRoot, path),
major: uint32(major),
minor: uint32(minor),
}
}

View File

@@ -1,332 +0,0 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package devchar
import (
"fmt"
"os"
"os/signal"
"path/filepath"
"strings"
"syscall"
"github.com/fsnotify/fsnotify"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
const (
defaultDevCharPath = "/dev/char"
)
type command struct {
logger *logrus.Logger
}
type config struct {
devCharPath string
driverRoot string
dryRun bool
watch bool
createAll bool
}
// NewCommand constructs a hook sub-command with the specified logger
func NewCommand(logger *logrus.Logger) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
// build
func (m command) build() *cli.Command {
cfg := config{}
// Create the 'create-dev-char-symlinks' command
c := cli.Command{
Name: "create-dev-char-symlinks",
Usage: "A hook to create symlinks to possible /dev/nv* devices in /dev/char",
Before: func(c *cli.Context) error {
return m.validateFlags(c, &cfg)
},
Action: func(c *cli.Context) error {
return m.run(c, &cfg)
},
}
c.Flags = []cli.Flag{
&cli.StringFlag{
Name: "dev-char-path",
Usage: "The path at which the symlinks will be created. Symlinks will be created as `DEV_CHAR`/MAJOR:MINOR where MAJOR and MINOR are the major and minor numbers of a corresponding device node.",
Value: defaultDevCharPath,
Destination: &cfg.devCharPath,
EnvVars: []string{"DEV_CHAR_PATH"},
},
&cli.StringFlag{
Name: "driver-root",
Usage: "The path to the driver root. `DRIVER_ROOT`/dev is searched for NVIDIA device nodes.",
Value: "/",
Destination: &cfg.driverRoot,
EnvVars: []string{"DRIVER_ROOT"},
},
&cli.BoolFlag{
Name: "watch",
Usage: "If set, the command will watch for changes to the driver root and recreate the symlinks when changes are detected.",
Value: false,
Destination: &cfg.watch,
EnvVars: []string{"WATCH"},
},
&cli.BoolFlag{
Name: "create-all",
Usage: "Create all possible /dev/char symlinks instead of limiting these to existing device nodes.",
Destination: &cfg.createAll,
EnvVars: []string{"CREATE_ALL"},
},
&cli.BoolFlag{
Name: "dry-run",
Usage: "If set, the command will not create any symlinks.",
Value: false,
Destination: &cfg.dryRun,
EnvVars: []string{"DRY_RUN"},
},
}
return &c
}
func (m command) validateFlags(r *cli.Context, cfg *config) error {
if cfg.createAll && cfg.watch {
return fmt.Errorf("create-all and watch are mutually exclusive")
}
return nil
}
func (m command) run(c *cli.Context, cfg *config) error {
var watcher *fsnotify.Watcher
var sigs chan os.Signal
if cfg.watch {
watcher, err := newFSWatcher(filepath.Join(cfg.driverRoot, "dev"))
if err != nil {
return fmt.Errorf("failed to create FS watcher: %v", err)
}
defer watcher.Close()
sigs = newOSWatcher(syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
}
l, err := NewSymlinkCreator(
WithLogger(m.logger),
WithDevCharPath(cfg.devCharPath),
WithDriverRoot(cfg.driverRoot),
WithDryRun(cfg.dryRun),
WithCreateAll(cfg.createAll),
)
if err != nil {
return fmt.Errorf("failed to create symlink creator: %v", err)
}
create:
err = l.CreateLinks()
if err != nil {
return fmt.Errorf("failed to create links: %v", err)
}
if !cfg.watch {
return nil
}
for {
select {
case event := <-watcher.Events:
deviceNode := filepath.Base(event.Name)
if !strings.HasPrefix(deviceNode, "nvidia") {
continue
}
if event.Op&fsnotify.Create == fsnotify.Create {
m.logger.Infof("%s created, restarting.", event.Name)
goto create
}
if event.Op&fsnotify.Create == fsnotify.Remove {
m.logger.Infof("%s removed. Ignoring", event.Name)
}
// Watch for any other fs errors and log them.
case err := <-watcher.Errors:
m.logger.Errorf("inotify: %s", err)
// React to signals
case s := <-sigs:
switch s {
case syscall.SIGHUP:
m.logger.Infof("Received SIGHUP, recreating symlinks.")
goto create
default:
m.logger.Infof("Received signal %q, shutting down.", s)
return nil
}
}
}
}
type linkCreator struct {
logger *logrus.Logger
lister nodeLister
driverRoot string
devCharPath string
dryRun bool
createAll bool
}
// Creator is an interface for creating symlinks to /dev/nv* devices in /dev/char.
type Creator interface {
CreateLinks() error
}
// Option is a functional option for configuring the linkCreator.
type Option func(*linkCreator)
// NewSymlinkCreator creates a new linkCreator.
func NewSymlinkCreator(opts ...Option) (Creator, error) {
c := linkCreator{}
for _, opt := range opts {
opt(&c)
}
if c.logger == nil {
c.logger = logrus.StandardLogger()
}
if c.driverRoot == "" {
c.driverRoot = "/"
}
if c.devCharPath == "" {
c.devCharPath = defaultDevCharPath
}
if c.createAll {
lister, err := newAllPossible(c.logger, c.driverRoot)
if err != nil {
return nil, fmt.Errorf("failed to create all possible device lister: %v", err)
}
c.lister = lister
} else {
c.lister = existing{c.logger, c.driverRoot}
}
return c, nil
}
// WithDriverRoot sets the driver root path.
func WithDriverRoot(root string) Option {
return func(c *linkCreator) {
c.driverRoot = root
}
}
// WithDevCharPath sets the path at which the symlinks will be created.
func WithDevCharPath(path string) Option {
return func(c *linkCreator) {
c.devCharPath = path
}
}
// WithDryRun sets the dry run flag.
func WithDryRun(dryRun bool) Option {
return func(c *linkCreator) {
c.dryRun = dryRun
}
}
// WithLogger sets the logger.
func WithLogger(logger *logrus.Logger) Option {
return func(c *linkCreator) {
c.logger = logger
}
}
// WithCreateAll sets the createAll flag for the linkCreator.
func WithCreateAll(createAll bool) Option {
return func(lc *linkCreator) {
lc.createAll = createAll
}
}
// CreateLinks creates symlinks for all NVIDIA device nodes found in the driver root.
func (m linkCreator) CreateLinks() error {
deviceNodes, err := m.lister.DeviceNodes()
if err != nil {
return fmt.Errorf("failed to get device nodes: %v", err)
}
if len(deviceNodes) != 0 && !m.dryRun {
err := os.MkdirAll(m.devCharPath, 0755)
if err != nil {
return fmt.Errorf("failed to create directory %s: %v", m.devCharPath, err)
}
}
for _, deviceNode := range deviceNodes {
target := deviceNode.path
linkPath := filepath.Join(m.devCharPath, deviceNode.devCharName())
m.logger.Infof("Creating link %s => %s", linkPath, target)
if m.dryRun {
continue
}
err = os.Symlink(target, linkPath)
if err != nil {
m.logger.Warnf("Could not create symlink: %v", err)
}
}
return nil
}
type deviceNode struct {
path string
major uint32
minor uint32
}
func (d deviceNode) devCharName() string {
return fmt.Sprintf("%d:%d", d.major, d.minor)
}
func newFSWatcher(files ...string) (*fsnotify.Watcher, error) {
watcher, err := fsnotify.NewWatcher()
if err != nil {
return nil, err
}
for _, f := range files {
err = watcher.Add(f)
if err != nil {
watcher.Close()
return nil, err
}
}
return watcher, nil
}
func newOSWatcher(sigs ...os.Signal) chan os.Signal {
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, sigs...)
return sigChan
}

View File

@@ -1,95 +0,0 @@
/**
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package devchar
import (
"path/filepath"
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type nodeLister interface {
DeviceNodes() ([]deviceNode, error)
}
type existing struct {
logger *logrus.Logger
driverRoot string
}
// DeviceNodes returns a list of NVIDIA device nodes in the specified root.
// The nvidia-nvswitch* and nvidia-nvlink devices are excluded.
func (m existing) DeviceNodes() ([]deviceNode, error) {
locator := lookup.NewCharDeviceLocator(
lookup.WithLogger(m.logger),
lookup.WithRoot(m.driverRoot),
lookup.WithOptional(true),
)
devices, err := locator.Locate("/dev/nvidia*")
if err != nil {
m.logger.Warnf("Error while locating device: %v", err)
}
capDevices, err := locator.Locate("/dev/nvidia-caps/nvidia-*")
if err != nil {
m.logger.Warnf("Error while locating caps device: %v", err)
}
if len(devices) == 0 && len(capDevices) == 0 {
m.logger.Infof("No NVIDIA devices found in %s", m.driverRoot)
return nil, nil
}
var deviceNodes []deviceNode
for _, d := range append(devices, capDevices...) {
if m.nodeIsBlocked(d) {
continue
}
var stat unix.Stat_t
err := unix.Stat(d, &stat)
if err != nil {
m.logger.Warnf("Could not stat device: %v", err)
continue
}
deviceNode := deviceNode{
path: d,
major: unix.Major(uint64(stat.Rdev)),
minor: unix.Minor(uint64(stat.Rdev)),
}
deviceNodes = append(deviceNodes, deviceNode)
}
return deviceNodes, nil
}
// nodeIsBlocked returns true if the specified device node should be ignored.
func (m existing) nodeIsBlocked(path string) bool {
blockedPrefixes := []string{"nvidia-fs", "nvidia-nvswitch", "nvidia-nvlink"}
nodeName := filepath.Base(path)
for _, prefix := range blockedPrefixes {
if strings.HasPrefix(nodeName, prefix) {
return true
}
}
return false
}

View File

@@ -18,7 +18,7 @@ package hook
import (
chmod "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook/chmod"
devchar "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook/create-dev-char-symlinks"
symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook/create-symlinks"
ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/hook/update-ldcache"
"github.com/sirupsen/logrus"
@@ -49,7 +49,6 @@ func (m hookCommand) build() *cli.Command {
ldcache.NewCommand(m.logger),
symlinks.NewCommand(m.logger),
chmod.NewCommand(m.logger),
devchar.NewCommand(m.logger),
}
return &hook