diff --git a/cmd/nvidia-ctk-installer/main_test.go b/cmd/nvidia-ctk-installer/main_test.go index 759ae8c1..1e3f8006 100644 --- a/cmd/nvidia-ctk-installer/main_test.go +++ b/cmd/nvidia-ctk-installer/main_test.go @@ -141,6 +141,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.jit-cdi] + load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -202,6 +205,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.jit-cdi] + load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -266,6 +272,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.jit-cdi] + load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -327,6 +336,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.jit-cdi] + load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -410,6 +422,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.jit-cdi] + load-kernel-modules = 
["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true diff --git a/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go b/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go index cafb8f9c..62aaf32d 100644 --- a/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go +++ b/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go @@ -145,21 +145,9 @@ func (m allPossible) getGPUDeviceNodes(gpu int) []deviceNode { // getNVCapDeviceNodes generates a list of cap device nodes for a given GPU. func (m allPossible) getNVCapDeviceNodes(gpu int) []deviceNode { var selectedCapMinors []nvcaps.MigMinor - for gi := 0; ; gi++ { - giCap := nvcaps.NewGPUInstanceCap(gpu, gi) - giMinor, exist := m.migCaps[giCap] - if !exist { - break - } - selectedCapMinors = append(selectedCapMinors, giMinor) - for ci := 0; ; ci++ { - ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci) - ciMinor, exist := m.migCaps[ciCap] - if !exist { - break - } - selectedCapMinors = append(selectedCapMinors, ciMinor) - } + + for _, capMinors := range m.migCaps.FilterForGPU(nvcaps.Index(gpu)) { + selectedCapMinors = append(selectedCapMinors, capMinors) } var deviceNodes []deviceNode diff --git a/internal/config/config.go b/internal/config/config.go index 652cc83a..5d17d674 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -121,6 +121,9 @@ func GetDefault() (*Config, error) { AnnotationPrefixes: []string{cdi.AnnotationPrefix}, SpecDirs: cdi.DefaultSpecDirs, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 963058e1..7b4d638c 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: 
[]string{"cdi.k8s.io/"}, SpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -102,6 +105,7 @@ func TestGetConfig(t *testing.T) { "nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]", "nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]", "nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", + "nvidia-container-runtime.modes.jit-cdi.load-kernel-modules = [\"foo\"]", "nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"", "nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"", }, @@ -134,6 +138,9 @@ func TestGetConfig(t *testing.T) { "/not/var/run/cdi", }, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"foo"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -178,6 +185,9 @@ func TestGetConfig(t *testing.T) { "/var/run/cdi", }, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -213,6 +223,8 @@ func TestGetConfig(t *testing.T) { "spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]", "[nvidia-container-runtime.modes.csv]", "mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", + "[nvidia-container-runtime.modes.jit-cdi]", + "load-kernel-modules = [\"foo\"]", "[nvidia-container-runtime-hook]", "path = \"/foo/bar/nvidia-container-runtime-hook\"", "[nvidia-ctk]", @@ -247,6 +259,9 @@ func TestGetConfig(t *testing.T) { "/not/var/run/cdi", }, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"foo"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -283,6 +298,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: 
[]string{"cdi.k8s.io/"}, SpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -322,6 +340,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: []string{"cdi.k8s.io/"}, SpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, }, + JitCDI: jitCDIModeConfig{ + LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"}, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ diff --git a/internal/config/runtime.go b/internal/config/runtime.go index 2ba1b7a8..ea9869b9 100644 --- a/internal/config/runtime.go +++ b/internal/config/runtime.go @@ -29,8 +29,9 @@ type RuntimeConfig struct { // modesConfig defines (optional) per-mode configs type modesConfig struct { - CSV csvModeConfig `toml:"csv"` - CDI cdiModeConfig `toml:"cdi"` + CSV csvModeConfig `toml:"csv"` + CDI cdiModeConfig `toml:"cdi"` + JitCDI jitCDIModeConfig `toml:"jit-cdi"` } type cdiModeConfig struct { @@ -45,3 +46,11 @@ type cdiModeConfig struct { type csvModeConfig struct { MountSpecPath string `toml:"mount-spec-path"` } + +type jitCDIModeConfig struct { + // LoadKernelModules defines the names of the kernel modules that should be + // loaded before generating a just-in-time CDI specification. + // The module names must start with `nvidia` and if no modules are specified + // no kernel modules are loaded. 
+ LoadKernelModules []string `toml:"load-kernel-modules"` +} diff --git a/internal/config/toml_test.go b/internal/config/toml_test.go index f7c649f7..96cff3b8 100644 --- a/internal/config/toml_test.go +++ b/internal/config/toml_test.go @@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"] [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" +[nvidia-container-runtime.modes.jit-cdi] +load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"] + [nvidia-container-runtime-hook] path = "nvidia-container-runtime-hook" skip-mode-detection = false diff --git a/internal/info/proc/devices/builder.go b/internal/info/proc/devices/builder.go index 6da9a90d..23a4eaf0 100644 --- a/internal/info/proc/devices/builder.go +++ b/internal/info/proc/devices/builder.go @@ -45,7 +45,7 @@ func New(opts ...Option) Devices { type Option func(*builder) // WithDeviceToMajor specifies an explicit device name to major number map. -func WithDeviceToMajor(deviceToMajor map[string]int) Option { +func WithDeviceToMajor(deviceToMajor map[string]uint32) Option { return func(b *builder) { b.asMap = make(devices) for name, major := range deviceToMajor { diff --git a/internal/info/proc/devices/devices.go b/internal/info/proc/devices/devices.go index 5927c837..a1bfb274 100644 --- a/internal/info/proc/devices/devices.go +++ b/internal/info/proc/devices/devices.go @@ -45,7 +45,7 @@ const ( type Name string // Major represents a device major as specified under /proc/devices -type Major int +type Major uint32 // Devices represents the set of devices under /proc/devices // @@ -130,8 +130,8 @@ func nvidiaDeviceFrom(reader io.Reader) (Devices, error) { return nvidiaDevices, nil } -func devicesFrom(reader io.Reader) map[string]int { - allDevices := make(map[string]int) +func devicesFrom(reader io.Reader) map[string]uint32 { + allDevices := make(map[string]uint32) scanner := bufio.NewScanner(reader) for scanner.Scan() { device, major, err := 
processProcDeviceLine(scanner.Text()) @@ -143,11 +143,11 @@ func devicesFrom(reader io.Reader) map[string]int { return allDevices } -func processProcDeviceLine(line string) (string, int, error) { +func processProcDeviceLine(line string) (string, uint32, error) { trimmed := strings.TrimSpace(line) var name string - var major int + var major uint32 n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name) if n == 2 { diff --git a/internal/info/proc/devices/devices_test.go b/internal/info/proc/devices/devices_test.go index 1669dee6..945614d7 100644 --- a/internal/info/proc/devices/devices_test.go +++ b/internal/info/proc/devices/devices_test.go @@ -25,7 +25,7 @@ import ( ) func TestNvidiaDevices(t *testing.T) { - perDriverDeviceMaps := map[string]map[string]int{ + perDriverDeviceMaps := map[string]map[string]uint32{ "pre550": { "nvidia-frontend": 195, "nvidia-nvlink": 234, @@ -100,7 +100,7 @@ func TestProcessDeviceFileLine(t *testing.T) { testCases := []struct { line string name string - major int + major uint32 err bool }{ {"", "", 0, true}, diff --git a/internal/lookup/root/root.go b/internal/lookup/root/root.go index d0c83701..a5f19aab 100644 --- a/internal/lookup/root/root.go +++ b/internal/lookup/root/root.go @@ -17,12 +17,15 @@ package root import ( + "errors" + "fmt" "os" "path/filepath" "strings" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" + "github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules" ) // Driver represents a filesystem in which a set of drivers or devices is defined. @@ -125,3 +128,20 @@ func xdgDataDirs() []string { return []string{"/usr/local/share", "/usr/share"} } + +// LoadKernelModules loads the specified kernel modules in the driver root. +// Errors in loading a module do not prevent other modules from being attempted.
+func (r *Driver) LoadKernelModules(moduleNames ...string) error { + modules := nvmodules.New( + nvmodules.WithLogger(r.logger), + nvmodules.WithRoot(r.Root), + ) + + var errs error + for _, moduleName := range moduleNames { + if err := modules.Load(moduleName); err != nil { + errs = errors.Join(errs, fmt.Errorf("failed to load kernel module %q: %w", moduleName, err)) + } + } + return errs +} diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index 90cd481b..3291fe0b 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -22,11 +22,15 @@ import ( "tags.cncf.io/container-device-interface/pkg/parser" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root" "github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi" "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" + "github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvdevices" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" ) @@ -34,7 +38,7 @@ import ( // NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the // CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is // used to select the devices to include. 
-func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) { +func NewCDIModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) { devices, err := getDevicesFromSpec(logger, ociSpec, cfg) if err != nil { return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err) @@ -50,7 +54,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices") } if len(automaticDevices) > 0 { - automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices) + automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, driver, automaticDevices) if err == nil { return automaticModifier, nil } @@ -163,9 +167,9 @@ func filterAutomaticDevices(devices []string) []string { return automatic } -func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) { +func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (oci.SpecModifier, error) { logger.Debugf("Generating in-memory CDI specs for devices %v", devices) - spec, err := generateAutomaticCDISpec(logger, cfg, devices) + spec, err := generateAutomaticCDISpec(logger, cfg, driver, devices) if err != nil { return nil, fmt.Errorf("failed to generate CDI spec: %w", err) } @@ -180,7 +184,7 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de return cdiModifier, nil } -func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) { +func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (spec.Interface, error) { cdilib, err := nvcdi.New( nvcdi.WithLogger(logger), 
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path), @@ -192,12 +196,19 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic return nil, fmt.Errorf("failed to construct CDI library: %w", err) } - identifiers := []string{} + // TODO: Consider moving this into the nvcdi API. + if err := driver.LoadKernelModules(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules...); err != nil { + logger.Warningf("Ignoring error(s) loading kernel modules: %v", err) + } + + var identifiers []string for _, device := range devices { _, _, id := parser.ParseDevice(device) identifiers = append(identifiers, id) } + tryCreateDeviceNodes(logger, driver, identifiers...) + deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...) if err != nil { return nil, fmt.Errorf("failed to get CDI device specs: %w", err) @@ -215,3 +226,27 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic spec.WithClass("gpu"), ) } + +func tryCreateDeviceNodes(logger logger.Interface, driver *root.Driver, identifiers ...string) { + devices, err := nvdevices.New( + nvdevices.WithLogger(logger), + nvdevices.WithDevRoot(driver.Root), + ) + if err != nil { + logger.Warningf("Failed to create devices library: %v", err) + return + } + if err := devices.CreateNVIDIAControlDevices(); err != nil { + logger.Warningf("Failed to create control devices: %v", err) + } + if err := devices.CreateNVIDIACapsControlDeviceNodes(); err != nil { + logger.Warningf("Failed to create nvidia-caps control devices: %v", err) + } + + for _, id := range identifiers { + identifier := device.Identifier(id) + if err := devices.CreateDeviceNodes(identifier); err != nil { + logger.Warningf("Error creating device nodes for %v: %v", identifier, err) + } + } +} diff --git a/internal/nvcaps/nvcaps.go b/internal/nvcaps/nvcaps.go index 48d98ccf..8e6037d8 100644 --- a/internal/nvcaps/nvcaps.go +++ b/internal/nvcaps/nvcaps.go @@ -36,10 +36,20 @@ const ( nvcapsDevicePath = 
"/dev/nvidia-caps" ) -// MigMinor represents the minor number of a MIG device -type MigMinor int +// An Index represents a gpu, ci, or gi index. +// We use uint32 as this typically maps to a device minor number. +type Index uint32 -// MigCap represents the path to a MIG cap file +// MigMinor represents the minor number of a MIG device +type MigMinor Index + +// MigCap represents the path to a MIG cap file. +// These are listed in /proc/driver/nvidia-caps/mig-minors and have one of the +// follown forms: +// - config +// - monitor +// - gpu{{ .gpuIndex }}/gi{{ .gi }}/access +// - gpu{{ .gpuIndex }}/gi{{ .gi }}/ci {{ .ci }}/access type MigCap string // MigCaps stores a map of MIG cap file paths to MIG minors @@ -47,16 +57,41 @@ type MigCaps map[MigCap]MigMinor // NewGPUInstanceCap creates a MigCap for the specified MIG GPU instance. // A GPU instance is uniquely defined by the GPU minor number and GI instance ID. -func NewGPUInstanceCap(gpu, gi int) MigCap { +func NewGPUInstanceCap[T uint32 | int | Index](gpu, gi T) MigCap { return MigCap(fmt.Sprintf("gpu%d/gi%d/access", gpu, gi)) } // NewComputeInstanceCap creates a MigCap for the specified MIG Compute instance. // A GPU instance is uniquely defined by the GPU minor number, GI instance ID, and CI instance ID. -func NewComputeInstanceCap(gpu, gi, ci int) MigCap { +func NewComputeInstanceCap[T uint32 | int | Index](gpu, gi, ci T) MigCap { return MigCap(fmt.Sprintf("gpu%d/gi%d/ci%d/access", gpu, gi, ci)) } +// FilterForGPU limits the MIG Caps to those associated with a particular GPU. 
+func (m MigCaps) FilterForGPU(gpu Index) MigCaps { + if m == nil { + return nil + } + filtered := make(MigCaps) + for gi := Index(0); ; gi++ { + giCap := NewGPUInstanceCap(gpu, gi) + giMinor, exist := m[giCap] + if !exist { + break + } + filtered[giCap] = giMinor + for ci := Index(0); ; ci++ { + ciCap := NewComputeInstanceCap(gpu, gi, ci) + ciMinor, exist := m[ciCap] + if !exist { + break + } + filtered[ciCap] = ciMinor + } + } + return filtered +} + // GetCapDevicePath returns the path to the cap device for the specified cap. // An error is returned if the cap is invalid. func (m MigCaps) GetCapDevicePath(cap MigCap) (string, error) { @@ -113,7 +148,7 @@ func processMigMinorsLine(line string) (MigCap, MigMinor, error) { return "", 0, fmt.Errorf("invalid MIG minors line: '%v'", line) } - minor, err := strconv.Atoi(parts[1]) + minor, err := strconv.ParseUint(parts[1], 10, 32) if err != nil { return "", 0, fmt.Errorf("error reading MIG minor from '%v': %v", line, err) } diff --git a/internal/oci/spec_mock.go b/internal/oci/spec_mock.go index f004d69c..ff8ff647 100644 --- a/internal/oci/spec_mock.go +++ b/internal/oci/spec_mock.go @@ -4,9 +4,8 @@ package oci import ( - "sync" - "github.com/opencontainers/runtime-spec/specs-go" + "sync" ) // Ensure, that SpecMock does implement Spec. diff --git a/internal/runtime/runtime_factory.go b/internal/runtime/runtime_factory.go index e88213dc..9ee12c48 100644 --- a/internal/runtime/runtime_factory.go +++ b/internal/runtime/runtime_factory.go @@ -77,7 +77,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image) // We update the mode here so that we can continue passing just the config to other functions. 
cfg.NVIDIAContainerRuntimeConfig.Mode = mode - modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image) + modeModifier, err := newModeModifier(logger, mode, cfg, driver, ociSpec, image) if err != nil { return nil, err } @@ -107,14 +107,14 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp return modifiers, nil } -func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) { +func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) { switch mode { case "legacy": return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil case "csv": return modifier.NewCSVModifier(logger, cfg, image) case "cdi": - return modifier.NewCDIModifier(logger, cfg, ociSpec) + return modifier.NewCDIModifier(logger, cfg, driver, ociSpec) } return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode) diff --git a/internal/system/nvdevices/control-device-nodes.go b/internal/system/nvdevices/control-device-nodes.go new file mode 100644 index 00000000..793fb8e7 --- /dev/null +++ b/internal/system/nvdevices/control-device-nodes.go @@ -0,0 +1,122 @@ +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package nvdevices + +import ( + "errors" + "fmt" + "path/filepath" + "strings" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices" + "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" +) + +// A controlDeviceNode represents an NVIDIA devices node for control or meta devices. +// Such device nodes are typically required regardless of which GPU is being accessed. +type controlDeviceNode string + +func (c controlDeviceNode) path() string { + return filepath.Join("dev", string(c)) +} + +// CreateNVIDIAControlDevices creates the NVIDIA control device nodes at the configured devRoot. +func (m *Interface) CreateNVIDIAControlDevices() error { + controlNodes := []controlDeviceNode{"nvidiactl", "nvidia-modeset", "nvidia-uvm", "nvidia-uvm-tools"} + for _, node := range controlNodes { + if err := m.createControlDeviceNode(node); err != nil { + return fmt.Errorf("failed to create device node %s: %w", node, err) + } + } + return nil +} + +// CreateNVIDIACapsControlDeviceNodes creates the nvidia-caps control device nodes at the configured devRoot. +func (m *Interface) CreateNVIDIACapsControlDeviceNodes() error { + capsMajor, exists := m.Get("nvidia-caps") + if !exists { + return nil + } + + var errs error + for _, migCap := range []nvcaps.MigCap{"config", "monitor"} { + migMinor, exists := m.migCaps[migCap] + if !exists { + continue + } + deviceNodePath := migMinor.DevicePath() + if err := m.createDeviceNode(deviceNodePath, capsMajor, uint32(migMinor)); err != nil { + errs = errors.Join(errs, fmt.Errorf("failed to create nvidia-caps device node %v: %w", deviceNodePath, err)) + } + } + return errs +} + +// createControlDeviceNode creates the specified NVIDIA device node at the configured devRoot. 
+func (m *Interface) createControlDeviceNode(node controlDeviceNode) error { + if !strings.HasPrefix(string(node), "nvidia") { + return fmt.Errorf("invalid device node %q: %w", node, errInvalidDeviceNode) + } + + major, err := m.controlDeviceNodeMajor(node) + if err != nil { + return fmt.Errorf("failed to determine major: %w", err) + } + + minor, err := m.controlDeviceNodeMinor(node) + if err != nil { + return fmt.Errorf("failed to determine minor: %w", err) + } + + return m.createDeviceNode(node.path(), major, minor) +} + +// controlDeviceNodeMajor returns the major number for the specified NVIDIA control device node. +// If the device node is not supported, an error is returned. +func (m *Interface) controlDeviceNodeMajor(node controlDeviceNode) (devices.Major, error) { + var valid bool + var major devices.Major + switch node { + case "nvidia-uvm", "nvidia-uvm-tools": + major, valid = m.Get(devices.NVIDIAUVM) + case "nvidia-modeset", "nvidiactl": + major, valid = m.Get(devices.NVIDIAGPU) + } + + if valid { + return major, nil + } + + return 0, errInvalidDeviceNode +} + +// controlDeviceNodeMinor returns the minor number for the specified NVIDIA control device node. +// If the device node is not supported, an error is returned. 
+func (m *Interface) controlDeviceNodeMinor(node controlDeviceNode) (uint32, error) { + switch node { + case "nvidia-modeset": + return devices.NVIDIAModesetMinor, nil + case "nvidia-uvm-tools": + return devices.NVIDIAUVMToolsMinor, nil + case "nvidia-uvm": + return devices.NVIDIAUVMMinor, nil + case "nvidiactl": + return devices.NVIDIACTLMinor, nil + } + + return 0, errInvalidDeviceNode +} diff --git a/internal/system/nvdevices/devices.go b/internal/system/nvdevices/devices.go index f667f6b7..28fb90ce 100644 --- a/internal/system/nvdevices/devices.go +++ b/internal/system/nvdevices/devices.go @@ -22,8 +22,11 @@ import ( "path/filepath" "strings" + "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" + "github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" ) var errInvalidDeviceNode = errors.New("invalid device node") @@ -38,6 +41,8 @@ type Interface struct { // devRoot is the root directory where device nodes are expected to exist. devRoot string + migCaps nvcaps.MigCaps + mknoder } @@ -62,6 +67,14 @@ func New(opts ...Option) (*Interface, error) { i.Devices = devices } + if i.migCaps == nil { + migCaps, err := nvcaps.NewMigCaps() + if err != nil { + return nil, fmt.Errorf("failed to load MIG caps: %w", err) + } + i.migCaps = migCaps + } + if i.dryRun { i.mknoder = &mknodLogger{i.logger} } else { @@ -70,77 +83,40 @@ func New(opts ...Option) (*Interface, error) { return i, nil } -// CreateNVIDIAControlDevices creates the NVIDIA control device nodes at the configured devRoot. -func (m *Interface) CreateNVIDIAControlDevices() error { - controlNodes := []string{"nvidiactl", "nvidia-modeset", "nvidia-uvm", "nvidia-uvm-tools"} - for _, node := range controlNodes { - err := m.CreateNVIDIADevice(node) +// CreateDeviceNodes creates the device nodes for a device with the specified identifier. 
+// GPU and MIG indices select a single parent GPU, whereas UUIDs and "all" +// result in device nodes being created for all GPUs. +// An error is returned if the identifier is invalid or a device node could not be created. +func (m *Interface) CreateDeviceNodes(id device.Identifier) error { + switch { + case id.IsGpuIndex(): + gpuIndex, err := toIndex(string(id)) if err != nil { - return fmt.Errorf("failed to create device node %s: %w", node, err) + return fmt.Errorf("invalid GPU index %v: %w", id, err) + } + return m.createGPUDeviceNode(gpuIndex) + case id.IsMigIndex(): + indices := strings.Split(string(id), ":") + if len(indices) != 2 { + return fmt.Errorf("invalid MIG index %v", id) + } + gpuIndex, err := toIndex(indices[0]) + if err != nil { + return fmt.Errorf("invalid parent index %v: %w", indices[0], err) + } + if err := m.createGPUDeviceNode(gpuIndex); err != nil { + return fmt.Errorf("failed to create parent device node: %w", err) } - } - return nil -} -// CreateNVIDIADevice creates the specified NVIDIA device node at the configured devRoot. -func (m *Interface) CreateNVIDIADevice(node string) error { - node = filepath.Base(node) - if !strings.HasPrefix(node, "nvidia") { - return fmt.Errorf("invalid device node %q: %w", node, errInvalidDeviceNode) + return m.createMigDeviceNodes(gpuIndex) + case id.IsGpuUUID(), id.IsMigUUID(), id == "all": + return m.createAllGPUDeviceNodes() + default: + return fmt.Errorf("invalid device identifier: %v", id) } - - major, err := m.Major(node) - if err != nil { - return fmt.Errorf("failed to determine major: %w", err) - } - - minor, err := m.Minor(node) - if err != nil { - return fmt.Errorf("failed to determine minor: %w", err) - } - - return m.createDeviceNode(filepath.Join("dev", node), int(major), int(minor)) } // createDeviceNode creates the specified device node with the require major and minor numbers. // If a devRoot is configured, this is prepended to the path.
-func (m *Interface) createDeviceNode(path string, major int, minor int) error { +func (m *Interface) createDeviceNode(path string, major devices.Major, minor uint32) error { path = filepath.Join(m.devRoot, path) - return m.Mknode(path, major, minor) -} - -// Major returns the major number for the specified NVIDIA device node. -// If the device node is not supported, an error is returned. -func (m *Interface) Major(node string) (int64, error) { - var valid bool - var major devices.Major - switch node { - case "nvidia-uvm", "nvidia-uvm-tools": - major, valid = m.Get(devices.NVIDIAUVM) - case "nvidia-modeset", "nvidiactl": - major, valid = m.Get(devices.NVIDIAGPU) - } - - if valid { - return int64(major), nil - } - - return 0, errInvalidDeviceNode -} - -// Minor returns the minor number for the specified NVIDIA device node. -// If the device node is not supported, an error is returned. -func (m *Interface) Minor(node string) (int64, error) { - switch node { - case "nvidia-modeset": - return devices.NVIDIAModesetMinor, nil - case "nvidia-uvm-tools": - return devices.NVIDIAUVMToolsMinor, nil - case "nvidia-uvm": - return devices.NVIDIAUVMMinor, nil - case "nvidiactl": - return devices.NVIDIACTLMinor, nil - } - - return 0, errInvalidDeviceNode + return m.Mknode(path, uint32(major), minor) } diff --git a/internal/system/nvdevices/devices_test.go b/internal/system/nvdevices/devices_test.go index d4d8616c..9f4c13af 100644 --- a/internal/system/nvdevices/devices_test.go +++ b/internal/system/nvdevices/devices_test.go @@ -30,13 +30,13 @@ func TestCreateControlDevices(t *testing.T) { logger, _ := testlog.NewNullLogger() nvidiaDevices := devices.New( - devices.WithDeviceToMajor(map[string]int{ + devices.WithDeviceToMajor(map[string]uint32{ "nvidia-frontend": 195, "nvidia-uvm": 243, }), ) nvidia550Devices := devices.New( - devices.WithDeviceToMajor(map[string]int{ + devices.WithDeviceToMajor(map[string]uint32{ "nvidia": 195, "nvidia-uvm": 243, }), @@ -52,8 +52,8 @@ func 
TestCreateControlDevices(t *testing.T) {
 		expectedError error
 		expectedCalls []struct {
 			S  string
-			N1 int
-			N2 int
+			V1 uint32
+			V2 uint32
 		}
 	}{
 		{
@@ -63,8 +63,8 @@ func TestCreateControlDevices(t *testing.T) {
 			mknodeError: nil,
 			expectedCalls: []struct {
 				S  string
-				N1 int
-				N2 int
+				V1 uint32
+				V2 uint32
 			}{
 				{"/dev/nvidiactl", 195, 255},
 				{"/dev/nvidia-modeset", 195, 254},
@@ -79,8 +79,8 @@ func TestCreateControlDevices(t *testing.T) {
 			mknodeError: nil,
 			expectedCalls: []struct {
 				S  string
-				N1 int
-				N2 int
+				V1 uint32
+				V2 uint32
 			}{
 				{"/dev/nvidiactl", 195, 255},
 				{"/dev/nvidia-modeset", 195, 254},
@@ -95,8 +95,8 @@ func TestCreateControlDevices(t *testing.T) {
 			mknodeError: nil,
 			expectedCalls: []struct {
 				S  string
-				N1 int
-				N2 int
+				V1 uint32
+				V2 uint32
 			}{
 				{"/some/root/dev/nvidiactl", 195, 255},
 				{"/some/root/dev/nvidia-modeset", 195, 254},
@@ -112,8 +112,8 @@ func TestCreateControlDevices(t *testing.T) {
 			// We expect the first call to this to fail, and the rest to be skipped
 			expectedCalls: []struct {
 				S  string
-				N1 int
-				N2 int
+				V1 uint32
+				V2 uint32
 			}{
 				{"/dev/nvidiactl", 195, 255},
 			},
@@ -132,7 +132,7 @@ func TestCreateControlDevices(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.description, func(t *testing.T) {
 			mknode := &mknoderMock{
-				MknodeFunc: func(string, int, int) error {
+				MknodeFunc: func(string, uint32, uint32) error {
 					return tc.mknodeError
 				},
 			}
diff --git a/internal/system/nvdevices/gpu-device-nodes.go b/internal/system/nvdevices/gpu-device-nodes.go
new file mode 100644
index 00000000..9075a0b1
--- /dev/null
+++ b/internal/system/nvdevices/gpu-device-nodes.go
@@ -0,0 +1,109 @@
+/**
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package nvdevices
+
+import (
+	"errors"
+	"fmt"
+	"path/filepath"
+	"strconv"
+
+	"github.com/NVIDIA/go-nvlib/pkg/nvpci"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
+)
+
+// gpuIndex identifies a GPU. It is used both in the /dev/nvidia<index> device
+// node path and as the minor number of that device node.
+type gpuIndex nvcaps.Index
+
+// toIndex parses a base-10 string as a gpuIndex. An error is returned if the
+// string is not an unsigned integer that fits in 32 bits.
+func toIndex(index string) (gpuIndex, error) {
+	i, err := strconv.ParseUint(index, 10, 32)
+	if err != nil {
+		return 0, err
+	}
+	return gpuIndex(i), nil
+}
+
+// createGPUDeviceNode creates the /dev/nvidia<gpu> device node for the
+// specified GPU, using the GPU index as the minor number. An error is returned
+// if no major number is available for devices.NVIDIAGPU.
+func (m *Interface) createGPUDeviceNode(gpu gpuIndex) error {
+	major, exists := m.Get(devices.NVIDIAGPU)
+	if !exists {
+		return fmt.Errorf("failed to determine device major; nvidia kernel module may not be loaded")
+	}
+
+	deviceNodePath := fmt.Sprintf("/dev/nvidia%d", gpu)
+	if err := m.createDeviceNode(deviceNodePath, major, uint32(gpu)); err != nil {
+		return fmt.Errorf("failed to create device node %v: %w", deviceNodePath, err)
+	}
+	return nil
+}
+
+// createMigDeviceNodes creates the nvidia-caps device nodes associated with
+// the specified GPU. If no major number is available for nvidia-caps, no device
+// nodes are created and nil is returned. Creation is attempted for every
+// device node and the individual failures are joined in the returned error.
+func (m *Interface) createMigDeviceNodes(gpu gpuIndex) error {
+	capsMajor, exists := m.Get("nvidia-caps")
+	if !exists {
+		return nil
+	}
+	var errs error
+	for _, capsDeviceMinor := range m.migCaps.FilterForGPU(nvcaps.Index(gpu)) {
+		capDevicePath := capsDeviceMinor.DevicePath()
+		// Only record actual failures: fmt.Errorf returns a non-nil error even
+		// when the wrapped error is nil.
+		if err := m.createDeviceNode(capDevicePath, capsMajor, uint32(capsDeviceMinor)); err != nil {
+			errs = errors.Join(errs, fmt.Errorf("failed to create %v: %w", capDevicePath, err))
+		}
+	}
+	return errs
+}
+
+// createAllGPUDeviceNodes creates the GPU device node and the associated
+// nvidia-caps device nodes for every GPU reported by nvpci. Creation is
+// attempted for every GPU and the individual failures are joined in the
+// returned error.
+//
+// NOTE(review): GPUs are addressed by the indices 0..len(gpus)-1, which assumes
+// that device minor numbers are contiguous and start at 0 -- confirm.
+func (m *Interface) createAllGPUDeviceNodes() error {
+	gpus, err := nvpci.New(
+		nvpci.WithPCIDevicesRoot(filepath.Join(m.devRoot, nvpci.PCIDevicesRoot)),
+		nvpci.WithLogger(m.logger),
+	).GetGPUs()
+	if err != nil {
+		return fmt.Errorf("failed to get GPU information from PCI: %w", err)
+	}
+
+	count := gpuIndex(len(gpus))
+	if count == 0 {
+		return nil
+	}
+
+	var errs error
+	for gpu := gpuIndex(0); gpu < count; gpu++ {
+		errs = errors.Join(errs, m.createGPUDeviceNode(gpu))
+		errs = errors.Join(errs, m.createMigDeviceNodes(gpu))
+	}
+	return errs
+}
diff --git a/internal/system/nvdevices/mknod.go b/internal/system/nvdevices/mknod.go
index 5754fc40..30eef032 100644
--- a/internal/system/nvdevices/mknod.go
+++ b/internal/system/nvdevices/mknod.go
@@ -25,16 +25,16 @@ import (
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
 )
 
-//go:generate moq -stub -out mknod_mock.go . mknoder
+//go:generate moq -fmt=goimports -rm -stub -out mknod_mock.go . mknoder
 type mknoder interface {
-	Mknode(string, int, int) error
+	Mknode(string, uint32, uint32) error
 }
 
 type mknodLogger struct {
 	logger.Interface
 }
 
-func (m *mknodLogger) Mknode(path string, major, minor int) error {
+func (m *mknodLogger) Mknode(path string, major uint32, minor uint32) error {
 	m.Infof("Running: mknod --mode=0666 %s c %d %d", path, major, minor)
 	return nil
 }
@@ -43,7 +43,7 @@ type mknodUnix struct {
 	logger logger.Interface
 }
 
-func (m *mknodUnix) Mknode(path string, major, minor int) error {
+func (m *mknodUnix) Mknode(path string, major uint32, minor uint32) error {
 	// TODO: Ensure that the existing device node has the correct properties.
if _, err := os.Stat(path); err == nil { m.logger.Infof("Skipping: %s already exists", path) @@ -52,7 +52,7 @@ func (m *mknodUnix) Mknode(path string, major, minor int) error { return fmt.Errorf("failed to stat %s: %v", path, err) } - err := unix.Mknod(path, unix.S_IFCHR, int(unix.Mkdev(uint32(major), uint32(minor)))) + err := unix.Mknod(path, unix.S_IFCHR, int(unix.Mkdev(major, minor))) if err != nil { return err } diff --git a/internal/system/nvdevices/mknod_mock.go b/internal/system/nvdevices/mknod_mock.go index 4bb384fa..f4e7bace 100644 --- a/internal/system/nvdevices/mknod_mock.go +++ b/internal/system/nvdevices/mknod_mock.go @@ -17,7 +17,7 @@ var _ mknoder = &mknoderMock{} // // // make and configure a mocked mknoder // mockedmknoder := &mknoderMock{ -// MknodeFunc: func(s string, n1 int, n2 int) error { +// MknodeFunc: func(s string, v1 uint32, v2 uint32) error { // panic("mock out the Mknode method") // }, // } @@ -28,7 +28,7 @@ var _ mknoder = &mknoderMock{} // } type mknoderMock struct { // MknodeFunc mocks the Mknode method. - MknodeFunc func(s string, n1 int, n2 int) error + MknodeFunc func(s string, v1 uint32, v2 uint32) error // calls tracks calls to the methods. calls struct { @@ -36,25 +36,25 @@ type mknoderMock struct { Mknode []struct { // S is the s argument value. S string - // N1 is the n1 argument value. - N1 int - // N2 is the n2 argument value. - N2 int + // V1 is the v1 argument value. + V1 uint32 + // V2 is the v2 argument value. + V2 uint32 } } lockMknode sync.RWMutex } // Mknode calls MknodeFunc. 
-func (mock *mknoderMock) Mknode(s string, n1 int, n2 int) error { +func (mock *mknoderMock) Mknode(s string, v1 uint32, v2 uint32) error { callInfo := struct { S string - N1 int - N2 int + V1 uint32 + V2 uint32 }{ S: s, - N1: n1, - N2: n2, + V1: v1, + V2: v2, } mock.lockMknode.Lock() mock.calls.Mknode = append(mock.calls.Mknode, callInfo) @@ -65,7 +65,7 @@ func (mock *mknoderMock) Mknode(s string, n1 int, n2 int) error { ) return errOut } - return mock.MknodeFunc(s, n1, n2) + return mock.MknodeFunc(s, v1, v2) } // MknodeCalls gets all the calls that were made to Mknode. @@ -74,13 +74,13 @@ func (mock *mknoderMock) Mknode(s string, n1 int, n2 int) error { // len(mockedmknoder.MknodeCalls()) func (mock *mknoderMock) MknodeCalls() []struct { S string - N1 int - N2 int + V1 uint32 + V2 uint32 } { var calls []struct { S string - N1 int - N2 int + V1 uint32 + V2 uint32 } mock.lockMknode.RLock() calls = mock.calls.Mknode diff --git a/pkg/nvcdi/namer_nvml_mock.go b/pkg/nvcdi/namer_nvml_mock.go index 6a704b45..f81a1eee 100644 --- a/pkg/nvcdi/namer_nvml_mock.go +++ b/pkg/nvcdi/namer_nvml_mock.go @@ -4,9 +4,8 @@ package nvcdi import ( - "sync" - "github.com/NVIDIA/go-nvml/pkg/nvml" + "sync" ) // Ensure, that nvmlUUIDerMock does implement nvmlUUIDer.