mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
Merge 699608902b
into 57f077fce7
This commit is contained in:
commit
b8d54eb9dc
@ -141,6 +141,9 @@ swarm-resource = ""
|
|||||||
[nvidia-container-runtime.modes.csv]
|
[nvidia-container-runtime.modes.csv]
|
||||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||||
|
|
||||||
|
[nvidia-container-runtime.modes.jit-cdi]
|
||||||
|
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||||
|
|
||||||
[nvidia-container-runtime-hook]
|
[nvidia-container-runtime-hook]
|
||||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||||
skip-mode-detection = true
|
skip-mode-detection = true
|
||||||
@ -202,6 +205,9 @@ swarm-resource = ""
|
|||||||
[nvidia-container-runtime.modes.csv]
|
[nvidia-container-runtime.modes.csv]
|
||||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||||
|
|
||||||
|
[nvidia-container-runtime.modes.jit-cdi]
|
||||||
|
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||||
|
|
||||||
[nvidia-container-runtime-hook]
|
[nvidia-container-runtime-hook]
|
||||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||||
skip-mode-detection = true
|
skip-mode-detection = true
|
||||||
@ -266,6 +272,9 @@ swarm-resource = ""
|
|||||||
[nvidia-container-runtime.modes.csv]
|
[nvidia-container-runtime.modes.csv]
|
||||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||||
|
|
||||||
|
[nvidia-container-runtime.modes.jit-cdi]
|
||||||
|
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||||
|
|
||||||
[nvidia-container-runtime-hook]
|
[nvidia-container-runtime-hook]
|
||||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||||
skip-mode-detection = true
|
skip-mode-detection = true
|
||||||
@ -327,6 +336,9 @@ swarm-resource = ""
|
|||||||
[nvidia-container-runtime.modes.csv]
|
[nvidia-container-runtime.modes.csv]
|
||||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||||
|
|
||||||
|
[nvidia-container-runtime.modes.jit-cdi]
|
||||||
|
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||||
|
|
||||||
[nvidia-container-runtime-hook]
|
[nvidia-container-runtime-hook]
|
||||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||||
skip-mode-detection = true
|
skip-mode-detection = true
|
||||||
@ -410,6 +422,9 @@ swarm-resource = ""
|
|||||||
[nvidia-container-runtime.modes.csv]
|
[nvidia-container-runtime.modes.csv]
|
||||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||||
|
|
||||||
|
[nvidia-container-runtime.modes.jit-cdi]
|
||||||
|
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||||
|
|
||||||
[nvidia-container-runtime-hook]
|
[nvidia-container-runtime-hook]
|
||||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||||
skip-mode-detection = true
|
skip-mode-detection = true
|
||||||
|
@ -145,21 +145,9 @@ func (m allPossible) getGPUDeviceNodes(gpu int) []deviceNode {
|
|||||||
// getNVCapDeviceNodes generates a list of cap device nodes for a given GPU.
|
// getNVCapDeviceNodes generates a list of cap device nodes for a given GPU.
|
||||||
func (m allPossible) getNVCapDeviceNodes(gpu int) []deviceNode {
|
func (m allPossible) getNVCapDeviceNodes(gpu int) []deviceNode {
|
||||||
var selectedCapMinors []nvcaps.MigMinor
|
var selectedCapMinors []nvcaps.MigMinor
|
||||||
for gi := 0; ; gi++ {
|
|
||||||
giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
|
for _, capMinors := range m.migCaps.FilterForGPU(nvcaps.Index(gpu)) {
|
||||||
giMinor, exist := m.migCaps[giCap]
|
selectedCapMinors = append(selectedCapMinors, capMinors)
|
||||||
if !exist {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
selectedCapMinors = append(selectedCapMinors, giMinor)
|
|
||||||
for ci := 0; ; ci++ {
|
|
||||||
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
|
|
||||||
ciMinor, exist := m.migCaps[ciCap]
|
|
||||||
if !exist {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
selectedCapMinors = append(selectedCapMinors, ciMinor)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
var deviceNodes []deviceNode
|
var deviceNodes []deviceNode
|
||||||
|
@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
|
|||||||
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
|
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
|
||||||
SpecDirs: cdi.DefaultSpecDirs,
|
SpecDirs: cdi.DefaultSpecDirs,
|
||||||
},
|
},
|
||||||
|
JitCDI: jitCDIModeConfig{
|
||||||
|
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||||
|
@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
|
|||||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||||
},
|
},
|
||||||
|
JitCDI: jitCDIModeConfig{
|
||||||
|
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||||
@ -102,6 +105,7 @@ func TestGetConfig(t *testing.T) {
|
|||||||
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
|
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
|
||||||
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
|
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
|
||||||
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
||||||
|
"nvidia-container-runtime.modes.jit-cdi.load-kernel-modules = [\"foo\"]",
|
||||||
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
|
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
|
||||||
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
|
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
|
||||||
},
|
},
|
||||||
@ -134,6 +138,9 @@ func TestGetConfig(t *testing.T) {
|
|||||||
"/not/var/run/cdi",
|
"/not/var/run/cdi",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
JitCDI: jitCDIModeConfig{
|
||||||
|
LoadKernelModules: []string{"foo"},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||||
@ -178,6 +185,9 @@ func TestGetConfig(t *testing.T) {
|
|||||||
"/var/run/cdi",
|
"/var/run/cdi",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
JitCDI: jitCDIModeConfig{
|
||||||
|
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||||
@ -213,6 +223,8 @@ func TestGetConfig(t *testing.T) {
|
|||||||
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
|
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
|
||||||
"[nvidia-container-runtime.modes.csv]",
|
"[nvidia-container-runtime.modes.csv]",
|
||||||
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
||||||
|
"[nvidia-container-runtime.modes.jit-cdi]",
|
||||||
|
"load-kernel-modules = [\"foo\"]",
|
||||||
"[nvidia-container-runtime-hook]",
|
"[nvidia-container-runtime-hook]",
|
||||||
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
|
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
|
||||||
"[nvidia-ctk]",
|
"[nvidia-ctk]",
|
||||||
@ -247,6 +259,9 @@ func TestGetConfig(t *testing.T) {
|
|||||||
"/not/var/run/cdi",
|
"/not/var/run/cdi",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
JitCDI: jitCDIModeConfig{
|
||||||
|
LoadKernelModules: []string{"foo"},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||||
@ -283,6 +298,9 @@ func TestGetConfig(t *testing.T) {
|
|||||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||||
},
|
},
|
||||||
|
JitCDI: jitCDIModeConfig{
|
||||||
|
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||||
@ -322,6 +340,9 @@ func TestGetConfig(t *testing.T) {
|
|||||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||||
},
|
},
|
||||||
|
JitCDI: jitCDIModeConfig{
|
||||||
|
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
|
||||||
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||||
|
@ -29,8 +29,9 @@ type RuntimeConfig struct {
|
|||||||
|
|
||||||
// modesConfig defines (optional) per-mode configs
|
// modesConfig defines (optional) per-mode configs
|
||||||
type modesConfig struct {
|
type modesConfig struct {
|
||||||
CSV csvModeConfig `toml:"csv"`
|
CSV csvModeConfig `toml:"csv"`
|
||||||
CDI cdiModeConfig `toml:"cdi"`
|
CDI cdiModeConfig `toml:"cdi"`
|
||||||
|
JitCDI jitCDIModeConfig `toml:"jit-cdi"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type cdiModeConfig struct {
|
type cdiModeConfig struct {
|
||||||
@ -45,3 +46,11 @@ type cdiModeConfig struct {
|
|||||||
type csvModeConfig struct {
|
type csvModeConfig struct {
|
||||||
MountSpecPath string `toml:"mount-spec-path"`
|
MountSpecPath string `toml:"mount-spec-path"`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type jitCDIModeConfig struct {
|
||||||
|
// LoadKernelModules defines the names of the kernel modules that should be
|
||||||
|
// loaded before generating a just-in-time CDI specification.
|
||||||
|
// The module names must start with `nvidia` and if no modules are specified
|
||||||
|
// no kernel modules are loaded.
|
||||||
|
LoadKernelModules []string `toml:"load-kernel-modules"`
|
||||||
|
}
|
||||||
|
@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
|
|||||||
[nvidia-container-runtime.modes.csv]
|
[nvidia-container-runtime.modes.csv]
|
||||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||||
|
|
||||||
|
[nvidia-container-runtime.modes.jit-cdi]
|
||||||
|
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
|
||||||
|
|
||||||
[nvidia-container-runtime-hook]
|
[nvidia-container-runtime-hook]
|
||||||
path = "nvidia-container-runtime-hook"
|
path = "nvidia-container-runtime-hook"
|
||||||
skip-mode-detection = false
|
skip-mode-detection = false
|
||||||
|
@ -45,7 +45,7 @@ func New(opts ...Option) Devices {
|
|||||||
type Option func(*builder)
|
type Option func(*builder)
|
||||||
|
|
||||||
// WithDeviceToMajor specifies an explicit device name to major number map.
|
// WithDeviceToMajor specifies an explicit device name to major number map.
|
||||||
func WithDeviceToMajor(deviceToMajor map[string]int) Option {
|
func WithDeviceToMajor(deviceToMajor map[string]uint32) Option {
|
||||||
return func(b *builder) {
|
return func(b *builder) {
|
||||||
b.asMap = make(devices)
|
b.asMap = make(devices)
|
||||||
for name, major := range deviceToMajor {
|
for name, major := range deviceToMajor {
|
||||||
|
@ -45,7 +45,7 @@ const (
|
|||||||
type Name string
|
type Name string
|
||||||
|
|
||||||
// Major represents a device major as specified under /proc/devices
|
// Major represents a device major as specified under /proc/devices
|
||||||
type Major int
|
type Major uint32
|
||||||
|
|
||||||
// Devices represents the set of devices under /proc/devices
|
// Devices represents the set of devices under /proc/devices
|
||||||
//
|
//
|
||||||
@ -130,8 +130,8 @@ func nvidiaDeviceFrom(reader io.Reader) (Devices, error) {
|
|||||||
return nvidiaDevices, nil
|
return nvidiaDevices, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func devicesFrom(reader io.Reader) map[string]int {
|
func devicesFrom(reader io.Reader) map[string]uint32 {
|
||||||
allDevices := make(map[string]int)
|
allDevices := make(map[string]uint32)
|
||||||
scanner := bufio.NewScanner(reader)
|
scanner := bufio.NewScanner(reader)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
device, major, err := processProcDeviceLine(scanner.Text())
|
device, major, err := processProcDeviceLine(scanner.Text())
|
||||||
@ -143,11 +143,11 @@ func devicesFrom(reader io.Reader) map[string]int {
|
|||||||
return allDevices
|
return allDevices
|
||||||
}
|
}
|
||||||
|
|
||||||
func processProcDeviceLine(line string) (string, int, error) {
|
func processProcDeviceLine(line string) (string, uint32, error) {
|
||||||
trimmed := strings.TrimSpace(line)
|
trimmed := strings.TrimSpace(line)
|
||||||
|
|
||||||
var name string
|
var name string
|
||||||
var major int
|
var major uint32
|
||||||
|
|
||||||
n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name)
|
n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name)
|
||||||
if n == 2 {
|
if n == 2 {
|
||||||
|
@ -25,7 +25,7 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestNvidiaDevices(t *testing.T) {
|
func TestNvidiaDevices(t *testing.T) {
|
||||||
perDriverDeviceMaps := map[string]map[string]int{
|
perDriverDeviceMaps := map[string]map[string]uint32{
|
||||||
"pre550": {
|
"pre550": {
|
||||||
"nvidia-frontend": 195,
|
"nvidia-frontend": 195,
|
||||||
"nvidia-nvlink": 234,
|
"nvidia-nvlink": 234,
|
||||||
@ -100,7 +100,7 @@ func TestProcessDeviceFileLine(t *testing.T) {
|
|||||||
testCases := []struct {
|
testCases := []struct {
|
||||||
line string
|
line string
|
||||||
name string
|
name string
|
||||||
major int
|
major uint32
|
||||||
err bool
|
err bool
|
||||||
}{
|
}{
|
||||||
{"", "", 0, true},
|
{"", "", 0, true},
|
||||||
|
@ -17,12 +17,15 @@
|
|||||||
package root
|
package root
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Driver represents a filesystem in which a set of drivers or devices is defined.
|
// Driver represents a filesystem in which a set of drivers or devices is defined.
|
||||||
@ -125,3 +128,20 @@ func xdgDataDirs() []string {
|
|||||||
|
|
||||||
return []string{"/usr/local/share", "/usr/share"}
|
return []string{"/usr/local/share", "/usr/share"}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// LoadKernelModules loads the specified kernel modules in the driver root.
|
||||||
|
// Errors in loading a module do not prevent other modules from being attempted.
|
||||||
|
func (r *Driver) LoadKernelModules(moduleNames ...string) error {
|
||||||
|
modules := nvmodules.New(
|
||||||
|
nvmodules.WithLogger(r.logger),
|
||||||
|
nvmodules.WithRoot(r.Root),
|
||||||
|
)
|
||||||
|
|
||||||
|
var errs error
|
||||||
|
for _, moduleName := range moduleNames {
|
||||||
|
if err := modules.Load(moduleName); err != nil {
|
||||||
|
errs = errors.Join(errs, fmt.Errorf("failed to load kernel module %q: %w", moduleName, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return errs
|
||||||
|
}
|
||||||
|
@ -22,11 +22,15 @@ import (
|
|||||||
|
|
||||||
"tags.cncf.io/container-device-interface/pkg/parser"
|
"tags.cncf.io/container-device-interface/pkg/parser"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvdevices"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
||||||
)
|
)
|
||||||
@ -34,7 +38,7 @@ import (
|
|||||||
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
|
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
|
||||||
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
|
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
|
||||||
// used to select the devices to include.
|
// used to select the devices to include.
|
||||||
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
func NewCDIModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
||||||
devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
|
devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
|
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
|
||||||
@ -50,7 +54,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
|
|||||||
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
|
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
|
||||||
}
|
}
|
||||||
if len(automaticDevices) > 0 {
|
if len(automaticDevices) > 0 {
|
||||||
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
|
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, driver, automaticDevices)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
return automaticModifier, nil
|
return automaticModifier, nil
|
||||||
}
|
}
|
||||||
@ -163,9 +167,9 @@ func filterAutomaticDevices(devices []string) []string {
|
|||||||
return automatic
|
return automatic
|
||||||
}
|
}
|
||||||
|
|
||||||
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
|
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (oci.SpecModifier, error) {
|
||||||
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
|
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
|
||||||
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
|
spec, err := generateAutomaticCDISpec(logger, cfg, driver, devices)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
|
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
|
||||||
}
|
}
|
||||||
@ -180,7 +184,7 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de
|
|||||||
return cdiModifier, nil
|
return cdiModifier, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
|
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (spec.Interface, error) {
|
||||||
cdilib, err := nvcdi.New(
|
cdilib, err := nvcdi.New(
|
||||||
nvcdi.WithLogger(logger),
|
nvcdi.WithLogger(logger),
|
||||||
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
|
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
|
||||||
@ -192,12 +196,19 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
|
|||||||
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
identifiers := []string{}
|
// TODO: Consider moving this into the nvcdi API.
|
||||||
|
if err := driver.LoadKernelModules(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules...); err != nil {
|
||||||
|
logger.Warningf("Ignoring error(s) loading kernel modules: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var identifiers []string
|
||||||
for _, device := range devices {
|
for _, device := range devices {
|
||||||
_, _, id := parser.ParseDevice(device)
|
_, _, id := parser.ParseDevice(device)
|
||||||
identifiers = append(identifiers, id)
|
identifiers = append(identifiers, id)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tryCreateDeviceNodes(logger, driver, identifiers...)
|
||||||
|
|
||||||
deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...)
|
deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to get CDI device specs: %w", err)
|
return nil, fmt.Errorf("failed to get CDI device specs: %w", err)
|
||||||
@ -215,3 +226,27 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
|
|||||||
spec.WithClass("gpu"),
|
spec.WithClass("gpu"),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func tryCreateDeviceNodes(logger logger.Interface, driver *root.Driver, identifiers ...string) {
|
||||||
|
devices, err := nvdevices.New(
|
||||||
|
nvdevices.WithLogger(logger),
|
||||||
|
nvdevices.WithDevRoot(driver.Root),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
logger.Warningf("Failed to create devices library: %v", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if err := devices.CreateNVIDIAControlDevices(); err != nil {
|
||||||
|
logger.Warningf("Failed to create control devices: %v", err)
|
||||||
|
}
|
||||||
|
if err := devices.CreateNVIDIACapsControlDeviceNodes(); err != nil {
|
||||||
|
logger.Warningf("Failed to create nvidia-caps control devices: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, id := range identifiers {
|
||||||
|
identifier := device.Identifier(id)
|
||||||
|
if err := devices.CreateDeviceNodes(identifier); err != nil {
|
||||||
|
logger.Warningf("Error creating device nodes for %v: %v", identifier, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -36,10 +36,20 @@ const (
|
|||||||
nvcapsDevicePath = "/dev/nvidia-caps"
|
nvcapsDevicePath = "/dev/nvidia-caps"
|
||||||
)
|
)
|
||||||
|
|
||||||
// MigMinor represents the minor number of a MIG device
|
// An Index represents a gpu, ci, or gi index.
|
||||||
type MigMinor int
|
// We use uint32 as this typically maps to a device minor number.
|
||||||
|
type Index uint32
|
||||||
|
|
||||||
// MigCap represents the path to a MIG cap file
|
// MigMinor represents the minor number of a MIG device
|
||||||
|
type MigMinor Index
|
||||||
|
|
||||||
|
// MigCap represents the path to a MIG cap file.
|
||||||
|
// These are listed in /proc/driver/nvidia-caps/mig-minors and have one of the
|
||||||
|
// following forms:
|
||||||
|
// - config
|
||||||
|
// - monitor
|
||||||
|
// - gpu{{ .gpuIndex }}/gi{{ .gi }}/access
|
||||||
|
// - gpu{{ .gpuIndex }}/gi{{ .gi }}/ci{{ .ci }}/access
|
||||||
type MigCap string
|
type MigCap string
|
||||||
|
|
||||||
// MigCaps stores a map of MIG cap file paths to MIG minors
|
// MigCaps stores a map of MIG cap file paths to MIG minors
|
||||||
@ -47,16 +57,41 @@ type MigCaps map[MigCap]MigMinor
|
|||||||
|
|
||||||
// NewGPUInstanceCap creates a MigCap for the specified MIG GPU instance.
|
// NewGPUInstanceCap creates a MigCap for the specified MIG GPU instance.
|
||||||
// A GPU instance is uniquely defined by the GPU minor number and GI instance ID.
|
// A GPU instance is uniquely defined by the GPU minor number and GI instance ID.
|
||||||
func NewGPUInstanceCap(gpu, gi int) MigCap {
|
func NewGPUInstanceCap[T uint32 | int | Index](gpu, gi T) MigCap {
|
||||||
return MigCap(fmt.Sprintf("gpu%d/gi%d/access", gpu, gi))
|
return MigCap(fmt.Sprintf("gpu%d/gi%d/access", gpu, gi))
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewComputeInstanceCap creates a MigCap for the specified MIG Compute instance.
|
// NewComputeInstanceCap creates a MigCap for the specified MIG Compute instance.
|
||||||
// A Compute instance is uniquely defined by the GPU minor number, GI instance ID, and CI instance ID.
|
// A Compute instance is uniquely defined by the GPU minor number, GI instance ID, and CI instance ID.
|
||||||
func NewComputeInstanceCap(gpu, gi, ci int) MigCap {
|
func NewComputeInstanceCap[T uint32 | int | Index](gpu, gi, ci T) MigCap {
|
||||||
return MigCap(fmt.Sprintf("gpu%d/gi%d/ci%d/access", gpu, gi, ci))
|
return MigCap(fmt.Sprintf("gpu%d/gi%d/ci%d/access", gpu, gi, ci))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// FilterForGPU limits the MIG Caps to those associated with a particular GPU.
|
||||||
|
func (m MigCaps) FilterForGPU(gpu Index) MigCaps {
|
||||||
|
if m == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
filtered := make(MigCaps)
|
||||||
|
for gi := Index(0); ; gi++ {
|
||||||
|
giCap := NewGPUInstanceCap(gpu, gi)
|
||||||
|
giMinor, exist := m[giCap]
|
||||||
|
if !exist {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
filtered[giCap] = giMinor
|
||||||
|
for ci := Index(0); ; ci++ {
|
||||||
|
ciCap := NewComputeInstanceCap(gpu, gi, ci)
|
||||||
|
ciMinor, exist := m[ciCap]
|
||||||
|
if !exist {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
filtered[ciCap] = ciMinor
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return filtered
|
||||||
|
}
|
||||||
|
|
||||||
// GetCapDevicePath returns the path to the cap device for the specified cap.
|
// GetCapDevicePath returns the path to the cap device for the specified cap.
|
||||||
// An error is returned if the cap is invalid.
|
// An error is returned if the cap is invalid.
|
||||||
func (m MigCaps) GetCapDevicePath(cap MigCap) (string, error) {
|
func (m MigCaps) GetCapDevicePath(cap MigCap) (string, error) {
|
||||||
@ -113,7 +148,7 @@ func processMigMinorsLine(line string) (MigCap, MigMinor, error) {
|
|||||||
return "", 0, fmt.Errorf("invalid MIG minors line: '%v'", line)
|
return "", 0, fmt.Errorf("invalid MIG minors line: '%v'", line)
|
||||||
}
|
}
|
||||||
|
|
||||||
minor, err := strconv.Atoi(parts[1])
|
minor, err := strconv.ParseUint(parts[1], 10, 32)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", 0, fmt.Errorf("error reading MIG minor from '%v': %v", line, err)
|
return "", 0, fmt.Errorf("error reading MIG minor from '%v': %v", line, err)
|
||||||
}
|
}
|
||||||
|
@ -4,9 +4,8 @@
|
|||||||
package oci
|
package oci
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"sync"
|
|
||||||
|
|
||||||
"github.com/opencontainers/runtime-spec/specs-go"
|
"github.com/opencontainers/runtime-spec/specs-go"
|
||||||
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Ensure, that SpecMock does implement Spec.
|
// Ensure, that SpecMock does implement Spec.
|
||||||
|
@ -77,7 +77,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
|
|||||||
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
|
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
|
||||||
// We update the mode here so that we can continue passing just the config to other functions.
|
// We update the mode here so that we can continue passing just the config to other functions.
|
||||||
cfg.NVIDIAContainerRuntimeConfig.Mode = mode
|
cfg.NVIDIAContainerRuntimeConfig.Mode = mode
|
||||||
modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image)
|
modeModifier, err := newModeModifier(logger, mode, cfg, driver, ociSpec, image)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@ -107,14 +107,14 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
|
|||||||
return modifiers, nil
|
return modifiers, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
|
func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
|
||||||
switch mode {
|
switch mode {
|
||||||
case "legacy":
|
case "legacy":
|
||||||
return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
|
return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
|
||||||
case "csv":
|
case "csv":
|
||||||
return modifier.NewCSVModifier(logger, cfg, image)
|
return modifier.NewCSVModifier(logger, cfg, image)
|
||||||
case "cdi":
|
case "cdi":
|
||||||
return modifier.NewCDIModifier(logger, cfg, ociSpec)
|
return modifier.NewCDIModifier(logger, cfg, driver, ociSpec)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)
|
return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)
|
||||||
|
122
internal/system/nvdevices/control-device-nodes.go
Normal file
122
internal/system/nvdevices/control-device-nodes.go
Normal file
@ -0,0 +1,122 @@
|
|||||||
|
/**
|
||||||
|
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
**/
|
||||||
|
|
||||||
|
package nvdevices
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"path/filepath"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
|
||||||
|
)
|
||||||
|
|
||||||
|
// controlDeviceNode is the name of an NVIDIA control or meta device node
// (for example "nvidiactl" or "nvidia-uvm").
// Such device nodes are typically required regardless of which GPU is being accessed.
type controlDeviceNode string

// path returns the location of the device node relative to the device root,
// i.e. the node name joined onto the "dev" directory.
func (c controlDeviceNode) path() string {
	nodeName := string(c)
	return filepath.Join("dev", nodeName)
}
|
||||||
|
|
||||||
|
// CreateNVIDIAControlDevices creates the NVIDIA control device nodes at the configured devRoot.
|
||||||
|
func (m *Interface) CreateNVIDIAControlDevices() error {
|
||||||
|
controlNodes := []controlDeviceNode{"nvidiactl", "nvidia-modeset", "nvidia-uvm", "nvidia-uvm-tools"}
|
||||||
|
for _, node := range controlNodes {
|
||||||
|
if err := m.createControlDeviceNode(node); err != nil {
|
||||||
|
return fmt.Errorf("failed to create device node %s: %w", node, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// CreateNVIDIACapsControlDeviceNodes creates the nvidia-caps control device nodes at the configured devRoot.
|
||||||
|
func (m *Interface) CreateNVIDIACapsControlDeviceNodes() error {
|
||||||
|
capsMajor, exists := m.Get("nvidia-caps")
|
||||||
|
if !exists {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var errs error
|
||||||
|
for _, migCap := range []nvcaps.MigCap{"config", "monitor"} {
|
||||||
|
migMinor, exists := m.migCaps[migCap]
|
||||||
|
if !exists {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
deviceNodePath := migMinor.DevicePath()
|
||||||
|
if err := m.createDeviceNode(deviceNodePath, capsMajor, uint32(migMinor)); err != nil {
|
||||||
|
errs = errors.Join(errs, fmt.Errorf("failed to create nvidia-caps device node %v: %w", deviceNodePath, err))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return errs
|
||||||
|
}
|
||||||
|
|
||||||
|
// createControlDeviceNode creates the specified NVIDIA device node at the configured devRoot.
|
||||||
|
func (m *Interface) createControlDeviceNode(node controlDeviceNode) error {
|
||||||
|
if !strings.HasPrefix(string(node), "nvidia") {
|
||||||
|
return fmt.Errorf("invalid device node %q: %w", node, errInvalidDeviceNode)
|
||||||
|
}
|
||||||
|
|
||||||
|
major, err := m.controlDeviceNodeMajor(node)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to determine major: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
minor, err := m.controlDeviceNodeMinor(node)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to determine minor: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return m.createDeviceNode(node.path(), major, minor)
|
||||||
|
}
|
||||||
|
|
||||||
|
// controlDeviceNodeMajor returns the major number for the specified NVIDIA control device node.
|
||||||
|
// If the device node is not supported, an error is returned.
|
||||||
|
func (m *Interface) controlDeviceNodeMajor(node controlDeviceNode) (devices.Major, error) {
|
||||||
|
var valid bool
|
||||||
|
var major devices.Major
|
||||||
|
switch node {
|
||||||
|
case "nvidia-uvm", "nvidia-uvm-tools":
|
||||||
|
major, valid = m.Get(devices.NVIDIAUVM)
|
||||||
|
case "nvidia-modeset", "nvidiactl":
|
||||||
|
major, valid = m.Get(devices.NVIDIAGPU)
|
||||||
|
}
|
||||||
|
|
||||||
|
if valid {
|
||||||
|
return major, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0, errInvalidDeviceNode
|
||||||
|
}
|
||||||
|
|
||||||
|
// controlDeviceNodeMinor returns the minor number for the specified NVIDIA control device node.
|
||||||
|
// If the device node is not supported, an error is returned.
|
||||||
|
func (m *Interface) controlDeviceNodeMinor(node controlDeviceNode) (uint32, error) {
|
||||||
|
switch node {
|
||||||
|
case "nvidia-modeset":
|
||||||
|
return devices.NVIDIAModesetMinor, nil
|
||||||
|
case "nvidia-uvm-tools":
|
||||||
|
return devices.NVIDIAUVMToolsMinor, nil
|
||||||
|
case "nvidia-uvm":
|
||||||
|
return devices.NVIDIAUVMMinor, nil
|
||||||
|
case "nvidiactl":
|
||||||
|
return devices.NVIDIACTLMinor, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0, errInvalidDeviceNode
|
||||||
|
}
|
@ -22,8 +22,11 @@ import (
|
|||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
|
||||||
)
|
)
|
||||||
|
|
||||||
var errInvalidDeviceNode = errors.New("invalid device node")
|
var errInvalidDeviceNode = errors.New("invalid device node")
|
||||||
@ -38,6 +41,8 @@ type Interface struct {
|
|||||||
// devRoot is the root directory where device nodes are expected to exist.
|
// devRoot is the root directory where device nodes are expected to exist.
|
||||||
devRoot string
|
devRoot string
|
||||||
|
|
||||||
|
migCaps nvcaps.MigCaps
|
||||||
|
|
||||||
mknoder
|
mknoder
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -62,6 +67,14 @@ func New(opts ...Option) (*Interface, error) {
|
|||||||
i.Devices = devices
|
i.Devices = devices
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if i.migCaps == nil {
|
||||||
|
migCaps, err := nvcaps.NewMigCaps()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to load MIG caps: %w", err)
|
||||||
|
}
|
||||||
|
i.migCaps = migCaps
|
||||||
|
}
|
||||||
|
|
||||||
if i.dryRun {
|
if i.dryRun {
|
||||||
i.mknoder = &mknodLogger{i.logger}
|
i.mknoder = &mknodLogger{i.logger}
|
||||||
} else {
|
} else {
|
||||||
@ -70,77 +83,40 @@ func New(opts ...Option) (*Interface, error) {
|
|||||||
return i, nil
|
return i, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// CreateNVIDIAControlDevices creates the NVIDIA control device nodes at the configured devRoot.
|
// CreateDeviceNodes creates the device nodes for a device with the specified identifier.
|
||||||
func (m *Interface) CreateNVIDIAControlDevices() error {
|
// An error is returned if the identifier is invalid or if device node creation fails.
|
||||||
controlNodes := []string{"nvidiactl", "nvidia-modeset", "nvidia-uvm", "nvidia-uvm-tools"}
|
func (m *Interface) CreateDeviceNodes(id device.Identifier) error {
|
||||||
for _, node := range controlNodes {
|
switch {
|
||||||
err := m.CreateNVIDIADevice(node)
|
case id.IsGpuIndex():
|
||||||
|
gpuIndex, err := toIndex(string(id))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return fmt.Errorf("failed to create device node %s: %w", node, err)
|
return fmt.Errorf("invalid GPU index: %v", id)
|
||||||
|
}
|
||||||
|
return m.createGPUDeviceNode(gpuIndex)
|
||||||
|
case id.IsMigIndex():
|
||||||
|
indices := strings.Split(string(id), ":")
|
||||||
|
if len(indices) != 2 {
|
||||||
|
return fmt.Errorf("invalid MIG index %v", id)
|
||||||
|
}
|
||||||
|
gpuIndex, err := toIndex(indices[0])
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("invalid parent index %v: %w", indices[0], err)
|
||||||
|
}
|
||||||
|
if err := m.createGPUDeviceNode(gpuIndex); err != nil {
|
||||||
|
return fmt.Errorf("failed to create parent device node: %w", err)
|
||||||
}
|
}
|
||||||
}
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// CreateNVIDIADevice creates the specified NVIDIA device node at the configured devRoot.
|
return m.createMigDeviceNodes(gpuIndex)
|
||||||
func (m *Interface) CreateNVIDIADevice(node string) error {
|
case id.IsGpuUUID(), id.IsMigUUID(), id == "all":
|
||||||
node = filepath.Base(node)
|
return m.createAllGPUDeviceNodes()
|
||||||
if !strings.HasPrefix(node, "nvidia") {
|
default:
|
||||||
return fmt.Errorf("invalid device node %q: %w", node, errInvalidDeviceNode)
|
return fmt.Errorf("invalid device identifier: %v", id)
|
||||||
}
|
}
|
||||||
|
|
||||||
major, err := m.Major(node)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to determine major: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
minor, err := m.Minor(node)
|
|
||||||
if err != nil {
|
|
||||||
return fmt.Errorf("failed to determine minor: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
return m.createDeviceNode(filepath.Join("dev", node), int(major), int(minor))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// createDeviceNode creates the specified device node with the require major and minor numbers.
|
// createDeviceNode creates the specified device node with the required major and minor numbers.
|
||||||
// If a devRoot is configured, this is prepended to the path.
|
// If a devRoot is configured, this is prepended to the path.
|
||||||
func (m *Interface) createDeviceNode(path string, major int, minor int) error {
|
func (m *Interface) createDeviceNode(path string, major devices.Major, minor uint32) error {
|
||||||
path = filepath.Join(m.devRoot, path)
|
path = filepath.Join(m.devRoot, path)
|
||||||
return m.Mknode(path, major, minor)
|
return m.Mknode(path, uint32(major), minor)
|
||||||
}
|
|
||||||
|
|
||||||
// Major returns the major number for the specified NVIDIA device node.
|
|
||||||
// If the device node is not supported, an error is returned.
|
|
||||||
func (m *Interface) Major(node string) (int64, error) {
|
|
||||||
var valid bool
|
|
||||||
var major devices.Major
|
|
||||||
switch node {
|
|
||||||
case "nvidia-uvm", "nvidia-uvm-tools":
|
|
||||||
major, valid = m.Get(devices.NVIDIAUVM)
|
|
||||||
case "nvidia-modeset", "nvidiactl":
|
|
||||||
major, valid = m.Get(devices.NVIDIAGPU)
|
|
||||||
}
|
|
||||||
|
|
||||||
if valid {
|
|
||||||
return int64(major), nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0, errInvalidDeviceNode
|
|
||||||
}
|
|
||||||
|
|
||||||
// Minor returns the minor number for the specified NVIDIA device node.
|
|
||||||
// If the device node is not supported, an error is returned.
|
|
||||||
func (m *Interface) Minor(node string) (int64, error) {
|
|
||||||
switch node {
|
|
||||||
case "nvidia-modeset":
|
|
||||||
return devices.NVIDIAModesetMinor, nil
|
|
||||||
case "nvidia-uvm-tools":
|
|
||||||
return devices.NVIDIAUVMToolsMinor, nil
|
|
||||||
case "nvidia-uvm":
|
|
||||||
return devices.NVIDIAUVMMinor, nil
|
|
||||||
case "nvidiactl":
|
|
||||||
return devices.NVIDIACTLMinor, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return 0, errInvalidDeviceNode
|
|
||||||
}
|
}
|
||||||
|
@ -30,13 +30,13 @@ func TestCreateControlDevices(t *testing.T) {
|
|||||||
logger, _ := testlog.NewNullLogger()
|
logger, _ := testlog.NewNullLogger()
|
||||||
|
|
||||||
nvidiaDevices := devices.New(
|
nvidiaDevices := devices.New(
|
||||||
devices.WithDeviceToMajor(map[string]int{
|
devices.WithDeviceToMajor(map[string]uint32{
|
||||||
"nvidia-frontend": 195,
|
"nvidia-frontend": 195,
|
||||||
"nvidia-uvm": 243,
|
"nvidia-uvm": 243,
|
||||||
}),
|
}),
|
||||||
)
|
)
|
||||||
nvidia550Devices := devices.New(
|
nvidia550Devices := devices.New(
|
||||||
devices.WithDeviceToMajor(map[string]int{
|
devices.WithDeviceToMajor(map[string]uint32{
|
||||||
"nvidia": 195,
|
"nvidia": 195,
|
||||||
"nvidia-uvm": 243,
|
"nvidia-uvm": 243,
|
||||||
}),
|
}),
|
||||||
@ -52,8 +52,8 @@ func TestCreateControlDevices(t *testing.T) {
|
|||||||
expectedError error
|
expectedError error
|
||||||
expectedCalls []struct {
|
expectedCalls []struct {
|
||||||
S string
|
S string
|
||||||
N1 int
|
V1 uint32
|
||||||
N2 int
|
V2 uint32
|
||||||
}
|
}
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
@ -63,8 +63,8 @@ func TestCreateControlDevices(t *testing.T) {
|
|||||||
mknodeError: nil,
|
mknodeError: nil,
|
||||||
expectedCalls: []struct {
|
expectedCalls: []struct {
|
||||||
S string
|
S string
|
||||||
N1 int
|
V1 uint32
|
||||||
N2 int
|
V2 uint32
|
||||||
}{
|
}{
|
||||||
{"/dev/nvidiactl", 195, 255},
|
{"/dev/nvidiactl", 195, 255},
|
||||||
{"/dev/nvidia-modeset", 195, 254},
|
{"/dev/nvidia-modeset", 195, 254},
|
||||||
@ -79,8 +79,8 @@ func TestCreateControlDevices(t *testing.T) {
|
|||||||
mknodeError: nil,
|
mknodeError: nil,
|
||||||
expectedCalls: []struct {
|
expectedCalls: []struct {
|
||||||
S string
|
S string
|
||||||
N1 int
|
V1 uint32
|
||||||
N2 int
|
V2 uint32
|
||||||
}{
|
}{
|
||||||
{"/dev/nvidiactl", 195, 255},
|
{"/dev/nvidiactl", 195, 255},
|
||||||
{"/dev/nvidia-modeset", 195, 254},
|
{"/dev/nvidia-modeset", 195, 254},
|
||||||
@ -95,8 +95,8 @@ func TestCreateControlDevices(t *testing.T) {
|
|||||||
mknodeError: nil,
|
mknodeError: nil,
|
||||||
expectedCalls: []struct {
|
expectedCalls: []struct {
|
||||||
S string
|
S string
|
||||||
N1 int
|
V1 uint32
|
||||||
N2 int
|
V2 uint32
|
||||||
}{
|
}{
|
||||||
{"/some/root/dev/nvidiactl", 195, 255},
|
{"/some/root/dev/nvidiactl", 195, 255},
|
||||||
{"/some/root/dev/nvidia-modeset", 195, 254},
|
{"/some/root/dev/nvidia-modeset", 195, 254},
|
||||||
@ -112,8 +112,8 @@ func TestCreateControlDevices(t *testing.T) {
|
|||||||
// We expect the first call to this to fail, and the rest to be skipped
|
// We expect the first call to this to fail, and the rest to be skipped
|
||||||
expectedCalls: []struct {
|
expectedCalls: []struct {
|
||||||
S string
|
S string
|
||||||
N1 int
|
V1 uint32
|
||||||
N2 int
|
V2 uint32
|
||||||
}{
|
}{
|
||||||
{"/dev/nvidiactl", 195, 255},
|
{"/dev/nvidiactl", 195, 255},
|
||||||
},
|
},
|
||||||
@ -132,7 +132,7 @@ func TestCreateControlDevices(t *testing.T) {
|
|||||||
for _, tc := range testCases {
|
for _, tc := range testCases {
|
||||||
t.Run(tc.description, func(t *testing.T) {
|
t.Run(tc.description, func(t *testing.T) {
|
||||||
mknode := &mknoderMock{
|
mknode := &mknoderMock{
|
||||||
MknodeFunc: func(string, int, int) error {
|
MknodeFunc: func(string, uint32, uint32) error {
|
||||||
return tc.mknodeError
|
return tc.mknodeError
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
88
internal/system/nvdevices/gpu-device-nodes.go
Normal file
88
internal/system/nvdevices/gpu-device-nodes.go
Normal file
@ -0,0 +1,88 @@
|
|||||||
|
/**
|
||||||
|
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
**/
|
||||||
|
|
||||||
|
package nvdevices
|
||||||
|
|
||||||
|
import (
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"path/filepath"
|
||||||
|
"strconv"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/go-nvlib/pkg/nvpci"
|
||||||
|
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
|
||||||
|
)
|
||||||
|
|
||||||
|
type gpuIndex nvcaps.Index
|
||||||
|
|
||||||
|
func toIndex(index string) (gpuIndex, error) {
|
||||||
|
i, err := strconv.ParseUint(index, 10, 32)
|
||||||
|
if err != nil {
|
||||||
|
return 0, err
|
||||||
|
}
|
||||||
|
return gpuIndex(i), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *Interface) createGPUDeviceNode(gpu gpuIndex) error {
|
||||||
|
major, exists := m.Get(devices.NVIDIAGPU)
|
||||||
|
if !exists {
|
||||||
|
return fmt.Errorf("failed to determine device major; nvidia kernel module may not be loaded")
|
||||||
|
}
|
||||||
|
|
||||||
|
deviceNodePath := fmt.Sprintf("/dev/nvidia%d", gpu)
|
||||||
|
if err := m.createDeviceNode(deviceNodePath, major, uint32(gpu)); err != nil {
|
||||||
|
return fmt.Errorf("failed to create device node %v: %w", deviceNodePath, err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *Interface) createMigDeviceNodes(gpu gpuIndex) error {
|
||||||
|
capsMajor, exists := m.Get("nvidia-caps")
|
||||||
|
if !exists {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
var errs error
|
||||||
|
for _, capsDeviceMinor := range m.migCaps.FilterForGPU(nvcaps.Index(gpu)) {
|
||||||
|
capDevicePath := capsDeviceMinor.DevicePath()
|
||||||
|
err := m.createDeviceNode(capDevicePath, capsMajor, uint32(capsDeviceMinor))
|
||||||
|
errs = errors.Join(errs, fmt.Errorf("failed to create %v: %w", capDevicePath, err))
|
||||||
|
}
|
||||||
|
return errs
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *Interface) createAllGPUDeviceNodes() error {
|
||||||
|
gpus, err := nvpci.New(
|
||||||
|
nvpci.WithPCIDevicesRoot(filepath.Join(m.devRoot, nvpci.PCIDevicesRoot)),
|
||||||
|
nvpci.WithLogger(m.logger),
|
||||||
|
).GetGPUs()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to get GPU information from PCI: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
count := gpuIndex(len(gpus))
|
||||||
|
if count == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var errs error
|
||||||
|
for gpuIndex := gpuIndex(0); gpuIndex < count; gpuIndex++ {
|
||||||
|
errs = errors.Join(errs, m.createGPUDeviceNode(gpuIndex))
|
||||||
|
errs = errors.Join(errs, m.createMigDeviceNodes(gpuIndex))
|
||||||
|
}
|
||||||
|
return errs
|
||||||
|
}
|
@ -25,16 +25,16 @@ import (
|
|||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||||
)
|
)
|
||||||
|
|
||||||
//go:generate moq -stub -out mknod_mock.go . mknoder
|
//go:generate moq -fmt=goimports -rm -stub -out mknod_mock.go . mknoder
|
||||||
type mknoder interface {
|
type mknoder interface {
|
||||||
Mknode(string, int, int) error
|
Mknode(string, uint32, uint32) error
|
||||||
}
|
}
|
||||||
|
|
||||||
type mknodLogger struct {
|
type mknodLogger struct {
|
||||||
logger.Interface
|
logger.Interface
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *mknodLogger) Mknode(path string, major, minor int) error {
|
func (m *mknodLogger) Mknode(path string, major uint32, minor uint32) error {
|
||||||
m.Infof("Running: mknod --mode=0666 %s c %d %d", path, major, minor)
|
m.Infof("Running: mknod --mode=0666 %s c %d %d", path, major, minor)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -43,7 +43,7 @@ type mknodUnix struct {
|
|||||||
logger logger.Interface
|
logger logger.Interface
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *mknodUnix) Mknode(path string, major, minor int) error {
|
func (m *mknodUnix) Mknode(path string, major uint32, minor uint32) error {
|
||||||
// TODO: Ensure that the existing device node has the correct properties.
|
// TODO: Ensure that the existing device node has the correct properties.
|
||||||
if _, err := os.Stat(path); err == nil {
|
if _, err := os.Stat(path); err == nil {
|
||||||
m.logger.Infof("Skipping: %s already exists", path)
|
m.logger.Infof("Skipping: %s already exists", path)
|
||||||
@ -52,7 +52,7 @@ func (m *mknodUnix) Mknode(path string, major, minor int) error {
|
|||||||
return fmt.Errorf("failed to stat %s: %v", path, err)
|
return fmt.Errorf("failed to stat %s: %v", path, err)
|
||||||
}
|
}
|
||||||
|
|
||||||
err := unix.Mknod(path, unix.S_IFCHR, int(unix.Mkdev(uint32(major), uint32(minor))))
|
err := unix.Mknod(path, unix.S_IFCHR, int(unix.Mkdev(major, minor)))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -17,7 +17,7 @@ var _ mknoder = &mknoderMock{}
|
|||||||
//
|
//
|
||||||
// // make and configure a mocked mknoder
|
// // make and configure a mocked mknoder
|
||||||
// mockedmknoder := &mknoderMock{
|
// mockedmknoder := &mknoderMock{
|
||||||
// MknodeFunc: func(s string, n1 int, n2 int) error {
|
// MknodeFunc: func(s string, v1 uint32, v2 uint32) error {
|
||||||
// panic("mock out the Mknode method")
|
// panic("mock out the Mknode method")
|
||||||
// },
|
// },
|
||||||
// }
|
// }
|
||||||
@ -28,7 +28,7 @@ var _ mknoder = &mknoderMock{}
|
|||||||
// }
|
// }
|
||||||
type mknoderMock struct {
|
type mknoderMock struct {
|
||||||
// MknodeFunc mocks the Mknode method.
|
// MknodeFunc mocks the Mknode method.
|
||||||
MknodeFunc func(s string, n1 int, n2 int) error
|
MknodeFunc func(s string, v1 uint32, v2 uint32) error
|
||||||
|
|
||||||
// calls tracks calls to the methods.
|
// calls tracks calls to the methods.
|
||||||
calls struct {
|
calls struct {
|
||||||
@ -36,25 +36,25 @@ type mknoderMock struct {
|
|||||||
Mknode []struct {
|
Mknode []struct {
|
||||||
// S is the s argument value.
|
// S is the s argument value.
|
||||||
S string
|
S string
|
||||||
// N1 is the n1 argument value.
|
// V1 is the v1 argument value.
|
||||||
N1 int
|
V1 uint32
|
||||||
// N2 is the n2 argument value.
|
// V2 is the v2 argument value.
|
||||||
N2 int
|
V2 uint32
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
lockMknode sync.RWMutex
|
lockMknode sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
// Mknode calls MknodeFunc.
|
// Mknode calls MknodeFunc.
|
||||||
func (mock *mknoderMock) Mknode(s string, n1 int, n2 int) error {
|
func (mock *mknoderMock) Mknode(s string, v1 uint32, v2 uint32) error {
|
||||||
callInfo := struct {
|
callInfo := struct {
|
||||||
S string
|
S string
|
||||||
N1 int
|
V1 uint32
|
||||||
N2 int
|
V2 uint32
|
||||||
}{
|
}{
|
||||||
S: s,
|
S: s,
|
||||||
N1: n1,
|
V1: v1,
|
||||||
N2: n2,
|
V2: v2,
|
||||||
}
|
}
|
||||||
mock.lockMknode.Lock()
|
mock.lockMknode.Lock()
|
||||||
mock.calls.Mknode = append(mock.calls.Mknode, callInfo)
|
mock.calls.Mknode = append(mock.calls.Mknode, callInfo)
|
||||||
@ -65,7 +65,7 @@ func (mock *mknoderMock) Mknode(s string, n1 int, n2 int) error {
|
|||||||
)
|
)
|
||||||
return errOut
|
return errOut
|
||||||
}
|
}
|
||||||
return mock.MknodeFunc(s, n1, n2)
|
return mock.MknodeFunc(s, v1, v2)
|
||||||
}
|
}
|
||||||
|
|
||||||
// MknodeCalls gets all the calls that were made to Mknode.
|
// MknodeCalls gets all the calls that were made to Mknode.
|
||||||
@ -74,13 +74,13 @@ func (mock *mknoderMock) Mknode(s string, n1 int, n2 int) error {
|
|||||||
// len(mockedmknoder.MknodeCalls())
|
// len(mockedmknoder.MknodeCalls())
|
||||||
func (mock *mknoderMock) MknodeCalls() []struct {
|
func (mock *mknoderMock) MknodeCalls() []struct {
|
||||||
S string
|
S string
|
||||||
N1 int
|
V1 uint32
|
||||||
N2 int
|
V2 uint32
|
||||||
} {
|
} {
|
||||||
var calls []struct {
|
var calls []struct {
|
||||||
S string
|
S string
|
||||||
N1 int
|
V1 uint32
|
||||||
N2 int
|
V2 uint32
|
||||||
}
|
}
|
||||||
mock.lockMknode.RLock()
|
mock.lockMknode.RLock()
|
||||||
calls = mock.calls.Mknode
|
calls = mock.calls.Mknode
|
||||||
|
@ -4,9 +4,8 @@
|
|||||||
package nvcdi
|
package nvcdi
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"sync"
|
|
||||||
|
|
||||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||||
|
"sync"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Ensure, that nvmlUUIDerMock does implement nvmlUUIDer.
|
// Ensure, that nvmlUUIDerMock does implement nvmlUUIDer.
|
||||||
|
Loading…
Reference in New Issue
Block a user