From 5d5166cbb67b3e17fe5f578888885cc849aae341 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Sun, 9 Mar 2025 13:29:06 +0200
Subject: [PATCH 1/8] Load NVIDIA Kernel Modules for JIT-CDI mode

This change attempts to load the nvidia, nvidia-uvm, and nvidia-modeset
kernel modules before generating the automatic (jit) CDI specification.

The kernel modules can be controlled by the

nvidia-container-runtime.modes.jit-cdi.load-kernel-modules

config option. If this is set to the empty list, then no kernel modules
are loaded.

Errors in loading the kernel modules are logged, but ignored.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 cmd/nvidia-ctk-installer/main_test.go | 15 +++++++++++++++
 internal/config/config.go             |  3 +++
 internal/config/config_test.go        | 21 +++++++++++++++++++++
 internal/config/runtime.go            | 13 +++++++++++--
 internal/config/toml_test.go          |  3 +++
 internal/lookup/root/root.go          | 20 ++++++++++++++++++++
 internal/modifier/cdi.go              | 16 +++++++++++-----
 internal/runtime/runtime_factory.go   |  6 +++---
 8 files changed, 87 insertions(+), 10 deletions(-)

diff --git a/cmd/nvidia-ctk-installer/main_test.go b/cmd/nvidia-ctk-installer/main_test.go
index 759ae8c1..1e3f8006 100644
--- a/cmd/nvidia-ctk-installer/main_test.go
+++ b/cmd/nvidia-ctk-installer/main_test.go
@@ -141,6 +141,9 @@ swarm-resource = ""
     [nvidia-container-runtime.modes.csv]
       mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
 
+    [nvidia-container-runtime.modes.jit-cdi]
+      load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
+
 [nvidia-container-runtime-hook]
   path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
   skip-mode-detection = true
@@ -202,6 +205,9 @@ swarm-resource = ""
     [nvidia-container-runtime.modes.csv]
       mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
 
+    [nvidia-container-runtime.modes.jit-cdi]
+      load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
+
 [nvidia-container-runtime-hook]
   path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
   skip-mode-detection = true
@@ -266,6 +272,9 @@ swarm-resource = ""
     [nvidia-container-runtime.modes.csv]
       mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
 
+    [nvidia-container-runtime.modes.jit-cdi]
+      load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
+
 [nvidia-container-runtime-hook]
   path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
   skip-mode-detection = true
@@ -327,6 +336,9 @@ swarm-resource = ""
     [nvidia-container-runtime.modes.csv]
       mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
 
+    [nvidia-container-runtime.modes.jit-cdi]
+      load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
+
 [nvidia-container-runtime-hook]
   path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
   skip-mode-detection = true
@@ -410,6 +422,9 @@ swarm-resource = ""
     [nvidia-container-runtime.modes.csv]
       mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
 
+    [nvidia-container-runtime.modes.jit-cdi]
+      load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
+
 [nvidia-container-runtime-hook]
   path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
   skip-mode-detection = true
diff --git a/internal/config/config.go b/internal/config/config.go
index 652cc83a..5d17d674 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
 					AnnotationPrefixes: []string{cdi.AnnotationPrefix},
 					SpecDirs:           cdi.DefaultSpecDirs,
 				},
+				JitCDI: jitCDIModeConfig{
+					LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
+				},
 			},
 		},
 		NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index 963058e1..7b4d638c 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
 							AnnotationPrefixes: []string{"cdi.k8s.io/"},
 							SpecDirs:           []string{"/etc/cdi", "/var/run/cdi"},
 						},
+						JitCDI: jitCDIModeConfig{
+							LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
+						},
 					},
 				},
 				NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -102,6 +105,7 @@ func TestGetConfig(t *testing.T) {
 				"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
 				"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
 				"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
+				"nvidia-container-runtime.modes.jit-cdi.load-kernel-modules = [\"foo\"]",
 				"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
 				"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
 			},
@@ -134,6 +138,9 @@ func TestGetConfig(t *testing.T) {
 								"/not/var/run/cdi",
 							},
 						},
+						JitCDI: jitCDIModeConfig{
+							LoadKernelModules: []string{"foo"},
+						},
 					},
 				},
 				NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -178,6 +185,9 @@ func TestGetConfig(t *testing.T) {
 								"/var/run/cdi",
 							},
 						},
+						JitCDI: jitCDIModeConfig{
+							LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
+						},
 					},
 				},
 				NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -213,6 +223,8 @@ func TestGetConfig(t *testing.T) {
 				"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
 				"[nvidia-container-runtime.modes.csv]",
 				"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
+				"[nvidia-container-runtime.modes.jit-cdi]",
+				"load-kernel-modules = [\"foo\"]",
 				"[nvidia-container-runtime-hook]",
 				"path = \"/foo/bar/nvidia-container-runtime-hook\"",
 				"[nvidia-ctk]",
@@ -247,6 +259,9 @@ func TestGetConfig(t *testing.T) {
 								"/not/var/run/cdi",
 							},
 						},
+						JitCDI: jitCDIModeConfig{
+							LoadKernelModules: []string{"foo"},
+						},
 					},
 				},
 				NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -283,6 +298,9 @@ func TestGetConfig(t *testing.T) {
 							AnnotationPrefixes: []string{"cdi.k8s.io/"},
 							SpecDirs:           []string{"/etc/cdi", "/var/run/cdi"},
 						},
+						JitCDI: jitCDIModeConfig{
+							LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
+						},
 					},
 				},
 				NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
@@ -322,6 +340,9 @@ func TestGetConfig(t *testing.T) {
 							AnnotationPrefixes: []string{"cdi.k8s.io/"},
 							SpecDirs:           []string{"/etc/cdi", "/var/run/cdi"},
 						},
+						JitCDI: jitCDIModeConfig{
+							LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
+						},
 					},
 				},
 				NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
diff --git a/internal/config/runtime.go b/internal/config/runtime.go
index 2ba1b7a8..ea9869b9 100644
--- a/internal/config/runtime.go
+++ b/internal/config/runtime.go
@@ -29,8 +29,9 @@ type RuntimeConfig struct {
 
 // modesConfig defines (optional) per-mode configs
 type modesConfig struct {
-	CSV csvModeConfig `toml:"csv"`
-	CDI cdiModeConfig `toml:"cdi"`
+	CSV    csvModeConfig    `toml:"csv"`
+	CDI    cdiModeConfig    `toml:"cdi"`
+	JitCDI jitCDIModeConfig `toml:"jit-cdi"`
 }
 
 type cdiModeConfig struct {
@@ -45,3 +46,11 @@ type cdiModeConfig struct {
 type csvModeConfig struct {
 	MountSpecPath string `toml:"mount-spec-path"`
 }
+
+type jitCDIModeConfig struct {
+	// LoadKernelModules defines the names of the kernel modules that should be
+	// loaded before generating a just-in-time CDI specification.
+	// The module names must start with `nvidia` and if no modules are specified
+	// no kernel modules are loaded.
+	LoadKernelModules []string `toml:"load-kernel-modules"`
+}
diff --git a/internal/config/toml_test.go b/internal/config/toml_test.go
index f7c649f7..96cff3b8 100644
--- a/internal/config/toml_test.go
+++ b/internal/config/toml_test.go
@@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
 [nvidia-container-runtime.modes.csv]
 mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
 
+[nvidia-container-runtime.modes.jit-cdi]
+load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]
+
 [nvidia-container-runtime-hook]
 path = "nvidia-container-runtime-hook"
 skip-mode-detection = false
diff --git a/internal/lookup/root/root.go b/internal/lookup/root/root.go
index d0c83701..a5f19aab 100644
--- a/internal/lookup/root/root.go
+++ b/internal/lookup/root/root.go
@@ -17,12 +17,15 @@
 package root
 
 import (
+	"errors"
+	"fmt"
 	"os"
 	"path/filepath"
 	"strings"
 
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules"
 )
 
 // Driver represents a filesystem in which a set of drivers or devices is defined.
@@ -125,3 +128,20 @@ func xdgDataDirs() []string {
 
 	return []string{"/usr/local/share", "/usr/share"}
 }
+
+// LoadKernelModules loads the specified kernel modules in the driver root.
+// Errors in loading a module do not prevent other modules from being attempted.
+func (r *Driver) LoadKernelModules(moduleNames ...string) error {
+	modules := nvmodules.New(
+		nvmodules.WithLogger(r.logger),
+		nvmodules.WithRoot(r.Root),
+	)
+
+	var errs error
+	for _, moduleName := range moduleNames {
+		if err := modules.Load(moduleName); err != nil {
+			errs = errors.Join(errs, fmt.Errorf("failed to load kernel module %q: %w", moduleName, err))
+		}
+	}
+	return errs
+}
diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go
index 90cd481b..bc9a7de3 100644
--- a/internal/modifier/cdi.go
+++ b/internal/modifier/cdi.go
@@ -25,6 +25,7 @@ import (
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
@@ -34,7 +35,7 @@ import (
 // NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
 // CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
 // used to select the devices to include.
-func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
+func NewCDIModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) {
 	devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
 	if err != nil {
 		return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
@@ -50,7 +51,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
 		return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
 	}
 	if len(automaticDevices) > 0 {
-		automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
+		automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, driver, automaticDevices)
 		if err == nil {
 			return automaticModifier, nil
 		}
@@ -163,9 +164,9 @@ func filterAutomaticDevices(devices []string) []string {
 	return automatic
 }
 
-func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
+func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (oci.SpecModifier, error) {
 	logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
-	spec, err := generateAutomaticCDISpec(logger, cfg, devices)
+	spec, err := generateAutomaticCDISpec(logger, cfg, driver, devices)
 	if err != nil {
 		return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
 	}
@@ -180,7 +181,7 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de
 	return cdiModifier, nil
 }
 
-func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
+func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (spec.Interface, error) {
 	cdilib, err := nvcdi.New(
 		nvcdi.WithLogger(logger),
 		nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
@@ -192,6 +193,11 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
 		return nil, fmt.Errorf("failed to construct CDI library: %w", err)
 	}
 
+	// TODO: Consider moving this into the nvcdi API.
+	if err := driver.LoadKernelModules(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules...); err != nil {
+		logger.Warningf("Ignoring error(s) loading kernel modules: %v", err)
+	}
+
 	identifiers := []string{}
 	for _, device := range devices {
 		_, _, id := parser.ParseDevice(device)
diff --git a/internal/runtime/runtime_factory.go b/internal/runtime/runtime_factory.go
index e88213dc..9ee12c48 100644
--- a/internal/runtime/runtime_factory.go
+++ b/internal/runtime/runtime_factory.go
@@ -77,7 +77,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
 	mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
 	// We update the mode here so that we can continue passing just the config to other functions.
 	cfg.NVIDIAContainerRuntimeConfig.Mode = mode
-	modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image)
+	modeModifier, err := newModeModifier(logger, mode, cfg, driver, ociSpec, image)
 	if err != nil {
 		return nil, err
 	}
@@ -107,14 +107,14 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
 	return modifiers, nil
 }
 
-func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
+func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec, image image.CUDA) (oci.SpecModifier, error) {
 	switch mode {
 	case "legacy":
 		return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
 	case "csv":
 		return modifier.NewCSVModifier(logger, cfg, image)
 	case "cdi":
-		return modifier.NewCDIModifier(logger, cfg, ociSpec)
+		return modifier.NewCDIModifier(logger, cfg, driver, ociSpec)
 	}
 
 	return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)

From d757f6e68cb53e45caf324e706f92c3a01e66782 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Mon, 10 Mar 2025 13:50:41 +0200
Subject: [PATCH 2/8] [no-relnote] Move control device nodes to separate file

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 .../system/nvdevices/control-device-nodes.go  | 99 +++++++++++++++++++
 internal/system/nvdevices/devices.go          | 69 -------------
 2 files changed, 99 insertions(+), 69 deletions(-)
 create mode 100644 internal/system/nvdevices/control-device-nodes.go

diff --git a/internal/system/nvdevices/control-device-nodes.go b/internal/system/nvdevices/control-device-nodes.go
new file mode 100644
index 00000000..2b5c6c14
--- /dev/null
+++ b/internal/system/nvdevices/control-device-nodes.go
@@ -0,0 +1,99 @@
+/**
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package nvdevices
+
+import (
+	"fmt"
+	"path/filepath"
+	"strings"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
+)
+
+// A controlDeviceNode represents an NVIDIA devices node for control or meta devices.
+// Such device nodes are typically required regardless of which GPU is being accessed.
+type controlDeviceNode string
+
+func (c controlDeviceNode) path() string {
+	return filepath.Join("dev", string(c))
+}
+
+// CreateNVIDIAControlDevices creates the NVIDIA control device nodes at the configured devRoot.
+func (m *Interface) CreateNVIDIAControlDevices() error {
+	controlNodes := []controlDeviceNode{"nvidiactl", "nvidia-modeset", "nvidia-uvm", "nvidia-uvm-tools"}
+	for _, node := range controlNodes {
+		if err := m.createControlDeviceNode(node); err != nil {
+			return fmt.Errorf("failed to create device node %s: %w", node, err)
+		}
+	}
+	return nil
+}
+
+// createControlDeviceNode creates the specified NVIDIA device node at the configured devRoot.
+func (m *Interface) createControlDeviceNode(node controlDeviceNode) error {
+	if !strings.HasPrefix(string(node), "nvidia") {
+		return fmt.Errorf("invalid device node %q: %w", node, errInvalidDeviceNode)
+	}
+
+	major, err := m.controlDeviceNodeMajor(node)
+	if err != nil {
+		return fmt.Errorf("failed to determine major: %w", err)
+	}
+
+	minor, err := m.controlDeviceNodeMinor(node)
+	if err != nil {
+		return fmt.Errorf("failed to determine minor: %w", err)
+	}
+
+	return m.createDeviceNode(node.path(), int(major), int(minor))
+}
+
+// controlDeviceNodeMajor returns the major number for the specified NVIDIA control device node.
+// If the device node is not supported, an error is returned.
+func (m *Interface) controlDeviceNodeMajor(node controlDeviceNode) (int64, error) {
+	var valid bool
+	var major devices.Major
+	switch node {
+	case "nvidia-uvm", "nvidia-uvm-tools":
+		major, valid = m.Get(devices.NVIDIAUVM)
+	case "nvidia-modeset", "nvidiactl":
+		major, valid = m.Get(devices.NVIDIAGPU)
+	}
+
+	if valid {
+		return int64(major), nil
+	}
+
+	return 0, errInvalidDeviceNode
+}
+
+// controlDeviceNodeMinor returns the minor number for the specified NVIDIA control device node.
+// If the device node is not supported, an error is returned.
+func (m *Interface) controlDeviceNodeMinor(node controlDeviceNode) (int64, error) {
+	switch node {
+	case "nvidia-modeset":
+		return devices.NVIDIAModesetMinor, nil
+	case "nvidia-uvm-tools":
+		return devices.NVIDIAUVMToolsMinor, nil
+	case "nvidia-uvm":
+		return devices.NVIDIAUVMMinor, nil
+	case "nvidiactl":
+		return devices.NVIDIACTLMinor, nil
+	}
+
+	return 0, errInvalidDeviceNode
+}
diff --git a/internal/system/nvdevices/devices.go b/internal/system/nvdevices/devices.go
index f667f6b7..ef935078 100644
--- a/internal/system/nvdevices/devices.go
+++ b/internal/system/nvdevices/devices.go
@@ -20,7 +20,6 @@ import (
 	"errors"
 	"fmt"
 	"path/filepath"
-	"strings"
 
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
@@ -70,77 +69,9 @@ func New(opts ...Option) (*Interface, error) {
 	return i, nil
 }
 
-// CreateNVIDIAControlDevices creates the NVIDIA control device nodes at the configured devRoot.
-func (m *Interface) CreateNVIDIAControlDevices() error {
-	controlNodes := []string{"nvidiactl", "nvidia-modeset", "nvidia-uvm", "nvidia-uvm-tools"}
-	for _, node := range controlNodes {
-		err := m.CreateNVIDIADevice(node)
-		if err != nil {
-			return fmt.Errorf("failed to create device node %s: %w", node, err)
-		}
-	}
-	return nil
-}
-
-// CreateNVIDIADevice creates the specified NVIDIA device node at the configured devRoot.
-func (m *Interface) CreateNVIDIADevice(node string) error {
-	node = filepath.Base(node)
-	if !strings.HasPrefix(node, "nvidia") {
-		return fmt.Errorf("invalid device node %q: %w", node, errInvalidDeviceNode)
-	}
-
-	major, err := m.Major(node)
-	if err != nil {
-		return fmt.Errorf("failed to determine major: %w", err)
-	}
-
-	minor, err := m.Minor(node)
-	if err != nil {
-		return fmt.Errorf("failed to determine minor: %w", err)
-	}
-
-	return m.createDeviceNode(filepath.Join("dev", node), int(major), int(minor))
-}
-
 // createDeviceNode creates the specified device node with the require major and minor numbers.
 // If a devRoot is configured, this is prepended to the path.
 func (m *Interface) createDeviceNode(path string, major int, minor int) error {
 	path = filepath.Join(m.devRoot, path)
 	return m.Mknode(path, major, minor)
 }
-
-// Major returns the major number for the specified NVIDIA device node.
-// If the device node is not supported, an error is returned.
-func (m *Interface) Major(node string) (int64, error) {
-	var valid bool
-	var major devices.Major
-	switch node {
-	case "nvidia-uvm", "nvidia-uvm-tools":
-		major, valid = m.Get(devices.NVIDIAUVM)
-	case "nvidia-modeset", "nvidiactl":
-		major, valid = m.Get(devices.NVIDIAGPU)
-	}
-
-	if valid {
-		return int64(major), nil
-	}
-
-	return 0, errInvalidDeviceNode
-}
-
-// Minor returns the minor number for the specified NVIDIA device node.
-// If the device node is not supported, an error is returned.
-func (m *Interface) Minor(node string) (int64, error) {
-	switch node {
-	case "nvidia-modeset":
-		return devices.NVIDIAModesetMinor, nil
-	case "nvidia-uvm-tools":
-		return devices.NVIDIAUVMToolsMinor, nil
-	case "nvidia-uvm":
-		return devices.NVIDIAUVMMinor, nil
-	case "nvidiactl":
-		return devices.NVIDIACTLMinor, nil
-	}
-
-	return 0, errInvalidDeviceNode
-}

From 4523b2e35ddf45dfe22caabe2fa1fc0810c5f1b2 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Mon, 10 Mar 2025 13:51:00 +0200
Subject: [PATCH 3/8] [no-relnote] Add function to filter nvcaps by GPU

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 internal/nvcaps/nvcaps.go | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/internal/nvcaps/nvcaps.go b/internal/nvcaps/nvcaps.go
index 48d98ccf..8f6e272a 100644
--- a/internal/nvcaps/nvcaps.go
+++ b/internal/nvcaps/nvcaps.go
@@ -39,7 +39,13 @@ const (
 // MigMinor represents the minor number of a MIG device
 type MigMinor int
 
-// MigCap represents the path to a MIG cap file
+// MigCap represents the path to a MIG cap file.
+// These are listed in /proc/driver/nvidia-caps/mig-minors and have one of the
+// following forms:
+//   - config
+//   - monitor
+//   - gpu{{ .gpuIndex }}/gi{{ .gi }}/access
+//   - gpu{{ .gpuIndex }}/gi{{ .gi }}/ci{{ .ci }}/access
 type MigCap string
 
 // MigCaps stores a map of MIG cap file paths to MIG minors
@@ -57,6 +63,31 @@ func NewComputeInstanceCap(gpu, gi, ci int) MigCap {
 	return MigCap(fmt.Sprintf("gpu%d/gi%d/ci%d/access", gpu, gi, ci))
 }
 
+// FilterForGPU limits the MIG Caps to those associated with a particular GPU.
+func (m MigCaps) FilterForGPU(gpu int) MigCaps {
+	if m == nil {
+		return nil
+	}
+	filtered := make(MigCaps)
+	for gi := 0; ; gi++ {
+		giCap := NewGPUInstanceCap(gpu, gi)
+		giMinor, exist := m[giCap]
+		if !exist {
+			break
+		}
+		filtered[giCap] = giMinor
+		for ci := 0; ; ci++ {
+			ciCap := NewComputeInstanceCap(gpu, gi, ci)
+			ciMinor, exist := m[ciCap]
+			if !exist {
+				break
+			}
+			filtered[ciCap] = ciMinor
+		}
+	}
+	return filtered
+}
+
 // GetCapDevicePath returns the path to the cap device for the specified cap.
 // An error is returned if the cap is invalid.
 func (m MigCaps) GetCapDevicePath(cap MigCap) (string, error) {

From 76b6d4d38f4162a8e7beca52ee3cca5eeb754f89 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Mon, 10 Mar 2025 13:51:53 +0200
Subject: [PATCH 4/8] [no-relnote] Add functions to create gpu device nodes

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 .../system/nvdevices/control-device-nodes.go  | 23 ++++++
 internal/system/nvdevices/devices.go          | 46 +++++++++++
 internal/system/nvdevices/gpu-device-nodes.go | 76 +++++++++++++++++++
 3 files changed, 145 insertions(+)
 create mode 100644 internal/system/nvdevices/gpu-device-nodes.go

diff --git a/internal/system/nvdevices/control-device-nodes.go b/internal/system/nvdevices/control-device-nodes.go
index 2b5c6c14..b7a3b4a8 100644
--- a/internal/system/nvdevices/control-device-nodes.go
+++ b/internal/system/nvdevices/control-device-nodes.go
@@ -17,11 +17,13 @@
 package nvdevices
 
 import (
+	"errors"
 	"fmt"
 	"path/filepath"
 	"strings"
 
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
 )
 
 // A controlDeviceNode represents an NVIDIA devices node for control or meta devices.
@@ -43,6 +45,27 @@ func (m *Interface) CreateNVIDIAControlDevices() error {
 	return nil
 }
 
+// CreateNVIDIACapsControlDeviceNodes creates the nvidia-caps control device nodes at the configured devRoot.
+func (m *Interface) CreateNVIDIACapsControlDeviceNodes() error {
+	capsMajor, exists := m.Get("nvidia-caps")
+	if !exists {
+		return nil
+	}
+
+	var errs error
+	for _, migCap := range []nvcaps.MigCap{"config", "monitor"} {
+		migMinor, exists := m.migCaps[migCap]
+		if !exists {
+			continue
+		}
+		deviceNodePath := migMinor.DevicePath()
+		if err := m.createDeviceNode(deviceNodePath, int(capsMajor), int(migMinor)); err != nil {
+			errs = errors.Join(errs, fmt.Errorf("failed to create nvidia-caps device node %v: %w", deviceNodePath, err))
+		}
+	}
+	return errs
+}
+
 // createControlDeviceNode creates the specified NVIDIA device node at the configured devRoot.
 func (m *Interface) createControlDeviceNode(node controlDeviceNode) error {
 	if !strings.HasPrefix(string(node), "nvidia") {
diff --git a/internal/system/nvdevices/devices.go b/internal/system/nvdevices/devices.go
index ef935078..882af59e 100644
--- a/internal/system/nvdevices/devices.go
+++ b/internal/system/nvdevices/devices.go
@@ -20,9 +20,14 @@ import (
 	"errors"
 	"fmt"
 	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
 
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
 )
 
 var errInvalidDeviceNode = errors.New("invalid device node")
@@ -37,6 +42,8 @@ type Interface struct {
 	// devRoot is the root directory where device nodes are expected to exist.
 	devRoot string
 
+	migCaps nvcaps.MigCaps
+
 	mknoder
 }
 
@@ -61,6 +68,14 @@ func New(opts ...Option) (*Interface, error) {
 		i.Devices = devices
 	}
 
+	if i.migCaps == nil {
+		migCaps, err := nvcaps.NewMigCaps()
+		if err != nil {
+			return nil, fmt.Errorf("failed to load MIG caps: %w", err)
+		}
+		i.migCaps = migCaps
+	}
+
 	if i.dryRun {
 		i.mknoder = &mknodLogger{i.logger}
 	} else {
@@ -69,6 +84,37 @@ func New(opts ...Option) (*Interface, error) {
 	return i, nil
 }
 
+// CreateDeviceNodes creates the device nodes for a device with the specified identifier.
+// An error is returned if the identifier is invalid or if creating a device node fails.
+func (m *Interface) CreateDeviceNodes(id device.Identifier) error {
+	switch {
+	case id.IsGpuIndex():
+		index, err := strconv.Atoi(string(id))
+		if err != nil {
+			return fmt.Errorf("invalid GPU index: %v", id)
+		}
+		return m.createGPUDeviceNode(index)
+	case id.IsMigIndex():
+		indices := strings.Split(string(id), ":")
+		if len(indices) != 2 {
+			return fmt.Errorf("invalid MIG index %v", id)
+		}
+		gpuIndex, err := strconv.Atoi(indices[0])
+		if err != nil {
+			return fmt.Errorf("invalid parent index %v: %w", indices[0], err)
+		}
+		if err := m.createGPUDeviceNode(gpuIndex); err != nil {
+			return fmt.Errorf("failed to create parent device node: %w", err)
+		}
+
+		return m.createMigDeviceNodes(gpuIndex)
+	case id.IsGpuUUID(), id.IsMigUUID(), id == "all":
+		return m.createAllGPUDeviceNodes()
+	default:
+		return fmt.Errorf("invalid device identifier: %v", id)
+	}
+}
+
 // createDeviceNode creates the specified device node with the require major and minor numbers.
 // If a devRoot is configured, this is prepended to the path.
 func (m *Interface) createDeviceNode(path string, major int, minor int) error {
diff --git a/internal/system/nvdevices/gpu-device-nodes.go b/internal/system/nvdevices/gpu-device-nodes.go
new file mode 100644
index 00000000..be75f7a9
--- /dev/null
+++ b/internal/system/nvdevices/gpu-device-nodes.go
@@ -0,0 +1,92 @@
+/**
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package nvdevices
+
+import (
+	"errors"
+	"fmt"
+	"path/filepath"
+
+	"github.com/NVIDIA/go-nvlib/pkg/nvpci"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
+)
+
+// createGPUDeviceNode creates the /dev/nvidia{gpuIndex} device node using the
+// major number that m.Get reports for devices.NVIDIAGPU and gpuIndex as the
+// minor number. An error is returned if the major number is not available or
+// if the device node cannot be created.
+func (m *Interface) createGPUDeviceNode(gpuIndex int) error {
+	major, exists := m.Get(devices.NVIDIAGPU)
+	if !exists {
+		return fmt.Errorf("failed to determine device major; nvidia kernel module may not be loaded")
+	}
+
+	deviceNodePath := fmt.Sprintf("/dev/nvidia%d", gpuIndex)
+	if err := m.createDeviceNode(deviceNodePath, int(major), gpuIndex); err != nil {
+		return fmt.Errorf("failed to create device node %v: %w", deviceNodePath, err)
+	}
+	return nil
+}
+
+// createMigDeviceNodes creates the nvidia-caps device nodes for the MIG caps
+// associated with the GPU at gpuIndex. If no nvidia-caps major is available
+// this is a no-op. Failures for individual device nodes are collected so that
+// the remaining nodes are still attempted; nil is returned if no device node
+// creation failed.
+func (m *Interface) createMigDeviceNodes(gpuIndex int) error {
+	capsMajor, exists := m.Get("nvidia-caps")
+	if !exists {
+		return nil
+	}
+	var errs error
+	for _, capsDeviceMinor := range m.migCaps.FilterForGPU(gpuIndex) {
+		capDevicePath := capsDeviceMinor.DevicePath()
+		// Only record actual failures: wrapping a nil error with fmt.Errorf
+		// yields a non-nil error, which would report a failure on success.
+		if err := m.createDeviceNode(capDevicePath, int(capsMajor), int(capsDeviceMinor)); err != nil {
+			errs = errors.Join(errs, fmt.Errorf("failed to create %v: %w", capDevicePath, err))
+		}
+	}
+	return errs
+}
+
+// createAllGPUDeviceNodes creates the GPU device node, and any associated MIG
+// cap device nodes, for each index from 0 up to the number of GPUs that nvpci
+// finds under the PCI devices root below m.devRoot. Errors for individual
+// indices are joined and returned after all indices have been attempted.
+func (m *Interface) createAllGPUDeviceNodes() error {
+	gpus, err := nvpci.New(
+		nvpci.WithPCIDevicesRoot(filepath.Join(m.devRoot, nvpci.PCIDevicesRoot)),
+		nvpci.WithLogger(m.logger),
+	).GetGPUs()
+	if err != nil {
+		return fmt.Errorf("failed to get GPU information from PCI: %w", err)
+	}
+
+	count := len(gpus)
+	if count == 0 {
+		return nil
+	}
+
+	var errs error
+	for gpuIndex := 0; gpuIndex < count; gpuIndex++ {
+		errs = errors.Join(errs, m.createGPUDeviceNode(gpuIndex))
+		errs = errors.Join(errs, m.createMigDeviceNodes(gpuIndex))
+	}
+	return errs
+}

From 1cfaef4b01a83eb72534e62ede45e5c820081ed6 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Sun, 9 Mar 2025 13:49:56 +0200
Subject: [PATCH 5/8] Create device nodes in JIT-CDI mode

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 internal/info/proc/devices/builder.go         |  2 +-
 internal/info/proc/devices/devices.go         | 10 +++---
 internal/modifier/cdi.go                      | 31 +++++++++++++++++-
 internal/nvcaps/nvcaps.go                     |  4 +--
 internal/oci/spec_mock.go                     |  3 +-
 .../system/nvdevices/control-device-nodes.go  | 10 +++---
 internal/system/nvdevices/devices.go          | 14 ++++----
 internal/system/nvdevices/gpu-device-nodes.go | 14 ++++----
 internal/system/nvdevices/mknod.go            | 12 ++++---
 internal/system/nvdevices/mknod_mock.go       | 32 +++++++++----------
 pkg/nvcdi/namer_nvml_mock.go                  |  3 +-
 11 files changed, 82 insertions(+), 53 deletions(-)

diff --git a/internal/info/proc/devices/builder.go b/internal/info/proc/devices/builder.go
index 6da9a90d..23a4eaf0 100644
--- a/internal/info/proc/devices/builder.go
+++ b/internal/info/proc/devices/builder.go
@@ -45,7 +45,7 @@ func New(opts ...Option) Devices {
 type Option func(*builder)
 
 // WithDeviceToMajor specifies an explicit device name to major number map.
-func WithDeviceToMajor(deviceToMajor map[string]int) Option {
+func WithDeviceToMajor(deviceToMajor map[string]uint32) Option {
 	return func(b *builder) {
 		b.asMap = make(devices)
 		for name, major := range deviceToMajor {
diff --git a/internal/info/proc/devices/devices.go b/internal/info/proc/devices/devices.go
index 5927c837..a1bfb274 100644
--- a/internal/info/proc/devices/devices.go
+++ b/internal/info/proc/devices/devices.go
@@ -45,7 +45,7 @@ const (
 type Name string
 
 // Major represents a device major as specified under /proc/devices
-type Major int
+type Major uint32
 
 // Devices represents the set of devices under /proc/devices
 //
@@ -130,8 +130,8 @@ func nvidiaDeviceFrom(reader io.Reader) (Devices, error) {
 	return nvidiaDevices, nil
 }
 
-func devicesFrom(reader io.Reader) map[string]int {
-	allDevices := make(map[string]int)
+func devicesFrom(reader io.Reader) map[string]uint32 {
+	allDevices := make(map[string]uint32)
 	scanner := bufio.NewScanner(reader)
 	for scanner.Scan() {
 		device, major, err := processProcDeviceLine(scanner.Text())
@@ -143,11 +143,11 @@ func devicesFrom(reader io.Reader) map[string]int {
 	return allDevices
 }
 
-func processProcDeviceLine(line string) (string, int, error) {
+func processProcDeviceLine(line string) (string, uint32, error) {
 	trimmed := strings.TrimSpace(line)
 
 	var name string
-	var major int
+	var major uint32
 
 	n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name)
 	if n == 2 {
diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go
index bc9a7de3..3291fe0b 100644
--- a/internal/modifier/cdi.go
+++ b/internal/modifier/cdi.go
@@ -22,12 +22,15 @@ import (
 
 	"tags.cncf.io/container-device-interface/pkg/parser"
 
+	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
+
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvdevices"
 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
 )
@@ -198,12 +201,14 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, drive
 		logger.Warningf("Ignoring error(s) loading kernel modules: %v", err)
 	}
 
-	identifiers := []string{}
+	var identifiers []string
 	for _, device := range devices {
 		_, _, id := parser.ParseDevice(device)
 		identifiers = append(identifiers, id)
 	}
 
+	tryCreateDeviceNodes(logger, driver, identifiers...)
+
 	deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...)
 	if err != nil {
 		return nil, fmt.Errorf("failed to get CDI device specs: %w", err)
@@ -221,3 +226,27 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, drive
 		spec.WithClass("gpu"),
 	)
 }
+
+// tryCreateDeviceNodes makes a best-effort attempt to create the NVIDIA control,
+// nvidia-caps control, and per-device nodes for the requested identifiers.
+// Every failure is logged as a warning and is otherwise ignored.
+func tryCreateDeviceNodes(logger logger.Interface, driver *root.Driver, identifiers ...string) {
+	lib, err := nvdevices.New(nvdevices.WithLogger(logger), nvdevices.WithDevRoot(driver.Root))
+	if err != nil {
+		logger.Warningf("Failed to create devices library: %v", err)
+		return
+	}
+	if err := lib.CreateNVIDIAControlDevices(); err != nil {
+		logger.Warningf("Failed to create control devices: %v", err)
+	}
+	if err := lib.CreateNVIDIACapsControlDeviceNodes(); err != nil {
+		logger.Warningf("Failed to create nvidia-caps control devices: %v", err)
+	}
+	// A failure for one identifier does not stop processing of the rest.
+	for _, raw := range identifiers {
+		id := device.Identifier(raw)
+		if err := lib.CreateDeviceNodes(id); err != nil {
+			logger.Warningf("Error creating device nodes for %v: %v", id, err)
+		}
+	}
+}
diff --git a/internal/nvcaps/nvcaps.go b/internal/nvcaps/nvcaps.go
index 8f6e272a..36c674b4 100644
--- a/internal/nvcaps/nvcaps.go
+++ b/internal/nvcaps/nvcaps.go
@@ -37,7 +37,7 @@ const (
 )
 
 // MigMinor represents the minor number of a MIG device
-type MigMinor int
+type MigMinor uint32
 
 // MigCap represents the path to a MIG cap file.
 // These are listed in /proc/driver/nvidia-caps/mig-minors and have one of the
@@ -144,7 +144,7 @@ func processMigMinorsLine(line string) (MigCap, MigMinor, error) {
 		return "", 0, fmt.Errorf("invalid MIG minors line: '%v'", line)
 	}
 
-	minor, err := strconv.Atoi(parts[1])
+	minor, err := strconv.ParseUint(parts[1], 10, 32)
 	if err != nil {
 		return "", 0, fmt.Errorf("error reading MIG minor from '%v': %v", line, err)
 	}
diff --git a/internal/oci/spec_mock.go b/internal/oci/spec_mock.go
index f004d69c..ff8ff647 100644
--- a/internal/oci/spec_mock.go
+++ b/internal/oci/spec_mock.go
@@ -4,9 +4,8 @@
 package oci
 
 import (
-	"sync"
-
 	"github.com/opencontainers/runtime-spec/specs-go"
+	"sync"
 )
 
 // Ensure, that SpecMock does implement Spec.
diff --git a/internal/system/nvdevices/control-device-nodes.go b/internal/system/nvdevices/control-device-nodes.go
index b7a3b4a8..793fb8e7 100644
--- a/internal/system/nvdevices/control-device-nodes.go
+++ b/internal/system/nvdevices/control-device-nodes.go
@@ -59,7 +59,7 @@ func (m *Interface) CreateNVIDIACapsControlDeviceNodes() error {
 			continue
 		}
 		deviceNodePath := migMinor.DevicePath()
-		if err := m.createDeviceNode(deviceNodePath, int(capsMajor), int(migMinor)); err != nil {
+		if err := m.createDeviceNode(deviceNodePath, capsMajor, uint32(migMinor)); err != nil {
 			errs = errors.Join(errs, fmt.Errorf("failed to create nvidia-caps device node %v: %w", deviceNodePath, err))
 		}
 	}
@@ -82,12 +82,12 @@ func (m *Interface) createControlDeviceNode(node controlDeviceNode) error {
 		return fmt.Errorf("failed to determine minor: %w", err)
 	}
 
-	return m.createDeviceNode(node.path(), int(major), int(minor))
+	return m.createDeviceNode(node.path(), major, minor)
 }
 
 // controlDeviceNodeMajor returns the major number for the specified NVIDIA control device node.
 // If the device node is not supported, an error is returned.
-func (m *Interface) controlDeviceNodeMajor(node controlDeviceNode) (int64, error) {
+func (m *Interface) controlDeviceNodeMajor(node controlDeviceNode) (devices.Major, error) {
 	var valid bool
 	var major devices.Major
 	switch node {
@@ -98,7 +98,7 @@ func (m *Interface) controlDeviceNodeMajor(node controlDeviceNode) (int64, error
 	}
 
 	if valid {
-		return int64(major), nil
+		return major, nil
 	}
 
 	return 0, errInvalidDeviceNode
@@ -106,7 +106,7 @@ func (m *Interface) controlDeviceNodeMajor(node controlDeviceNode) (int64, error
 
 // controlDeviceNodeMinor returns the minor number for the specified NVIDIA control device node.
 // If the device node is not supported, an error is returned.
-func (m *Interface) controlDeviceNodeMinor(node controlDeviceNode) (int64, error) {
+func (m *Interface) controlDeviceNodeMinor(node controlDeviceNode) (uint32, error) {
 	switch node {
 	case "nvidia-modeset":
 		return devices.NVIDIAModesetMinor, nil
diff --git a/internal/system/nvdevices/devices.go b/internal/system/nvdevices/devices.go
index 882af59e..9fa2eb1f 100644
--- a/internal/system/nvdevices/devices.go
+++ b/internal/system/nvdevices/devices.go
@@ -89,25 +89,25 @@ func New(opts ...Option) (*Interface, error) {
 func (m *Interface) CreateDeviceNodes(id device.Identifier) error {
 	switch {
 	case id.IsGpuIndex():
-		index, err := strconv.Atoi(string(id))
+		index, err := strconv.ParseUint(string(id), 10, 32)
 		if err != nil {
 			return fmt.Errorf("invalid GPU index: %v", id)
 		}
-		return m.createGPUDeviceNode(index)
+		return m.createGPUDeviceNode(uint32(index))
 	case id.IsMigIndex():
 		indices := strings.Split(string(id), ":")
 		if len(indices) != 2 {
 			return fmt.Errorf("invalid MIG index %v", id)
 		}
-		gpuIndex, err := strconv.Atoi(indices[0])
+		gpuIndex, err := strconv.ParseUint(indices[0], 10, 32)
 		if err != nil {
 			return fmt.Errorf("invalid parent index %v: %w", indices[0], err)
 		}
-		if err := m.createGPUDeviceNode(gpuIndex); err != nil {
+		if err := m.createGPUDeviceNode(uint32(gpuIndex)); err != nil {
 			return fmt.Errorf("failed to create parent device node: %w", err)
 		}
 
-		return m.createMigDeviceNodes(gpuIndex)
+		return m.createMigDeviceNodes(uint32(gpuIndex))
 	case id.IsGpuUUID(), id.IsMigUUID(), id == "all":
 		return m.createAllGPUDeviceNodes()
 	default:
@@ -117,7 +117,7 @@ func (m *Interface) CreateDeviceNodes(id device.Identifier) error {
 
-// createDeviceNode creates the specified device node with the require major and minor numbers.
+// createDeviceNode creates the specified device node with the required major and minor numbers.
 // If a devRoot is configured, this is prepended to the path.
-func (m *Interface) createDeviceNode(path string, major int, minor int) error {
+func (m *Interface) createDeviceNode(path string, major devices.Major, minor uint32) error {
 	path = filepath.Join(m.devRoot, path)
-	return m.Mknode(path, major, minor)
+	return m.Mknode(path, uint32(major), minor)
 }
diff --git a/internal/system/nvdevices/gpu-device-nodes.go b/internal/system/nvdevices/gpu-device-nodes.go
index be75f7a9..b6ea7240 100644
--- a/internal/system/nvdevices/gpu-device-nodes.go
+++ b/internal/system/nvdevices/gpu-device-nodes.go
@@ -26,28 +26,28 @@ import (
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
 )
 
-func (m *Interface) createGPUDeviceNode(gpuIndex int) error {
+func (m *Interface) createGPUDeviceNode(gpuIndex uint32) error {
 	major, exists := m.Get(devices.NVIDIAGPU)
 	if !exists {
 		return fmt.Errorf("failed to determine device major; nvidia kernel module may not be loaded")
 	}
 
 	deviceNodePath := fmt.Sprintf("/dev/nvidia%d", gpuIndex)
-	if err := m.createDeviceNode(deviceNodePath, int(major), gpuIndex); err != nil {
+	if err := m.createDeviceNode(deviceNodePath, major, uint32(gpuIndex)); err != nil {
 		return fmt.Errorf("failed to create device node %v: %w", deviceNodePath, err)
 	}
 	return nil
 }
 
-func (m *Interface) createMigDeviceNodes(gpuIndex int) error {
+func (m *Interface) createMigDeviceNodes(gpuIndex uint32) error {
 	capsMajor, exists := m.Get("nvidia-caps")
 	if !exists {
 		return nil
 	}
 	var errs error
-	for _, capsDeviceMinor := range m.migCaps.FilterForGPU(gpuIndex) {
+	for _, capsDeviceMinor := range m.migCaps.FilterForGPU(int(gpuIndex)) {
 		capDevicePath := capsDeviceMinor.DevicePath()
-		err := m.createDeviceNode(capDevicePath, int(capsMajor), int(capsDeviceMinor))
+		err := m.createDeviceNode(capDevicePath, capsMajor, uint32(capsDeviceMinor))
 		errs = errors.Join(errs, fmt.Errorf("failed to create %v: %w", capDevicePath, err))
 	}
 	return errs
@@ -62,13 +62,13 @@ func (m *Interface) createAllGPUDeviceNodes() error {
 		return fmt.Errorf("failed to get GPU information from PCI: %w", err)
 	}
 
-	count := len(gpus)
+	count := uint32(len(gpus))
 	if count == 0 {
 		return nil
 	}
 
 	var errs error
-	for gpuIndex := 0; gpuIndex < count; gpuIndex++ {
+	for gpuIndex := uint32(0); gpuIndex < count; gpuIndex++ {
 		errs = errors.Join(errs, m.createGPUDeviceNode(gpuIndex))
 		errs = errors.Join(errs, m.createMigDeviceNodes(gpuIndex))
 	}
diff --git a/internal/system/nvdevices/mknod.go b/internal/system/nvdevices/mknod.go
index 5754fc40..0b35e9d6 100644
--- a/internal/system/nvdevices/mknod.go
+++ b/internal/system/nvdevices/mknod.go
@@ -25,16 +25,18 @@ import (
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
 )
 
-//go:generate moq -stub -out mknod_mock.go . mknoder
+type mint uint32
+
+//go:generate moq -fmt=goimports -rm -stub -out mknod_mock.go . mknoder
 type mknoder interface {
-	Mknode(string, int, int) error
+	Mknode(string, uint32, uint32) error
 }
 
 type mknodLogger struct {
 	logger.Interface
 }
 
-func (m *mknodLogger) Mknode(path string, major, minor int) error {
+func (m *mknodLogger) Mknode(path string, major uint32, minor uint32) error {
 	m.Infof("Running: mknod --mode=0666 %s c %d %d", path, major, minor)
 	return nil
 }
@@ -43,7 +45,7 @@ type mknodUnix struct {
 	logger logger.Interface
 }
 
-func (m *mknodUnix) Mknode(path string, major, minor int) error {
+func (m *mknodUnix) Mknode(path string, major uint32, minor uint32) error {
 	// TODO: Ensure that the existing device node has the correct properties.
 	if _, err := os.Stat(path); err == nil {
 		m.logger.Infof("Skipping: %s already exists", path)
@@ -52,7 +54,7 @@ func (m *mknodUnix) Mknode(path string, major, minor int) error {
 		return fmt.Errorf("failed to stat %s: %v", path, err)
 	}
 
-	err := unix.Mknod(path, unix.S_IFCHR, int(unix.Mkdev(uint32(major), uint32(minor))))
+	err := unix.Mknod(path, unix.S_IFCHR, int(unix.Mkdev(major, minor)))
 	if err != nil {
 		return err
 	}
diff --git a/internal/system/nvdevices/mknod_mock.go b/internal/system/nvdevices/mknod_mock.go
index 4bb384fa..f4e7bace 100644
--- a/internal/system/nvdevices/mknod_mock.go
+++ b/internal/system/nvdevices/mknod_mock.go
@@ -17,7 +17,7 @@ var _ mknoder = &mknoderMock{}
 //
 //		// make and configure a mocked mknoder
 //		mockedmknoder := &mknoderMock{
-//			MknodeFunc: func(s string, n1 int, n2 int) error {
+//			MknodeFunc: func(s string, v1 uint32, v2 uint32) error {
 //				panic("mock out the Mknode method")
 //			},
 //		}
@@ -28,7 +28,7 @@ var _ mknoder = &mknoderMock{}
 //	}
 type mknoderMock struct {
 	// MknodeFunc mocks the Mknode method.
-	MknodeFunc func(s string, n1 int, n2 int) error
+	MknodeFunc func(s string, v1 uint32, v2 uint32) error
 
 	// calls tracks calls to the methods.
 	calls struct {
@@ -36,25 +36,25 @@ type mknoderMock struct {
 		Mknode []struct {
 			// S is the s argument value.
 			S string
-			// N1 is the n1 argument value.
-			N1 int
-			// N2 is the n2 argument value.
-			N2 int
+			// V1 is the v1 argument value.
+			V1 uint32
+			// V2 is the v2 argument value.
+			V2 uint32
 		}
 	}
 	lockMknode sync.RWMutex
 }
 
 // Mknode calls MknodeFunc.
-func (mock *mknoderMock) Mknode(s string, n1 int, n2 int) error {
+func (mock *mknoderMock) Mknode(s string, v1 uint32, v2 uint32) error {
 	callInfo := struct {
 		S  string
-		N1 int
-		N2 int
+		V1 uint32
+		V2 uint32
 	}{
 		S:  s,
-		N1: n1,
-		N2: n2,
+		V1: v1,
+		V2: v2,
 	}
 	mock.lockMknode.Lock()
 	mock.calls.Mknode = append(mock.calls.Mknode, callInfo)
@@ -65,7 +65,7 @@ func (mock *mknoderMock) Mknode(s string, n1 int, n2 int) error {
 		)
 		return errOut
 	}
-	return mock.MknodeFunc(s, n1, n2)
+	return mock.MknodeFunc(s, v1, v2)
 }
 
 // MknodeCalls gets all the calls that were made to Mknode.
@@ -74,13 +74,13 @@ func (mock *mknoderMock) Mknode(s string, n1 int, n2 int) error {
 //	len(mockedmknoder.MknodeCalls())
 func (mock *mknoderMock) MknodeCalls() []struct {
 	S  string
-	N1 int
-	N2 int
+	V1 uint32
+	V2 uint32
 } {
 	var calls []struct {
 		S  string
-		N1 int
-		N2 int
+		V1 uint32
+		V2 uint32
 	}
 	mock.lockMknode.RLock()
 	calls = mock.calls.Mknode
diff --git a/pkg/nvcdi/namer_nvml_mock.go b/pkg/nvcdi/namer_nvml_mock.go
index 6a704b45..f81a1eee 100644
--- a/pkg/nvcdi/namer_nvml_mock.go
+++ b/pkg/nvcdi/namer_nvml_mock.go
@@ -4,9 +4,8 @@
 package nvcdi
 
 import (
-	"sync"
-
 	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"sync"
 )
 
 // Ensure, that nvmlUUIDerMock does implement nvmlUUIDer.

From f3b730c805b6b0bcca62efd5f76bd083e450c0e4 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Wed, 12 Mar 2025 12:42:37 +0200
Subject: [PATCH 6/8] [no-relnote] Use FilterForGPU to create all devchar
 symlinks

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 .../system/create-dev-char-symlinks/all.go     | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)

diff --git a/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go b/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go
index cafb8f9c..62aaf32d 100644
--- a/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go
+++ b/cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go
@@ -145,21 +145,9 @@ func (m allPossible) getGPUDeviceNodes(gpu int) []deviceNode {
 // getNVCapDeviceNodes generates a list of cap device nodes for a given GPU.
 func (m allPossible) getNVCapDeviceNodes(gpu int) []deviceNode {
 	var selectedCapMinors []nvcaps.MigMinor
-	for gi := 0; ; gi++ {
-		giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
-		giMinor, exist := m.migCaps[giCap]
-		if !exist {
-			break
-		}
-		selectedCapMinors = append(selectedCapMinors, giMinor)
-		for ci := 0; ; ci++ {
-			ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
-			ciMinor, exist := m.migCaps[ciCap]
-			if !exist {
-				break
-			}
-			selectedCapMinors = append(selectedCapMinors, ciMinor)
-		}
+
+	for _, capMinors := range m.migCaps.FilterForGPU(nvcaps.Index(gpu)) {
+		selectedCapMinors = append(selectedCapMinors, capMinors)
 	}
 
 	var deviceNodes []deviceNode

From d4b331fbbb068a4063f7a56ca3afefcb6870f310 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Wed, 12 Mar 2025 12:43:07 +0200
Subject: [PATCH 7/8] Add Index type to nvcaps

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 internal/nvcaps/nvcaps.go | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/internal/nvcaps/nvcaps.go b/internal/nvcaps/nvcaps.go
index 36c674b4..8e6037d8 100644
--- a/internal/nvcaps/nvcaps.go
+++ b/internal/nvcaps/nvcaps.go
@@ -36,8 +36,12 @@ const (
 	nvcapsDevicePath     = "/dev/nvidia-caps"
 )
 
+// An Index represents a GPU, GPU instance (GI), or compute instance (CI) index.
+// We use uint32 as this typically maps to a device minor number.
+type Index uint32
+
 // MigMinor represents the minor number of a MIG device
-type MigMinor uint32
+type MigMinor Index
 
 // MigCap represents the path to a MIG cap file.
 // These are listed in /proc/driver/nvidia-caps/mig-minors and have one of the
@@ -53,30 +57,30 @@ type MigCaps map[MigCap]MigMinor
 
 // NewGPUInstanceCap creates a MigCap for the specified MIG GPU instance.
 // A GPU instance is uniquely defined by the GPU minor number and GI instance ID.
-func NewGPUInstanceCap(gpu, gi int) MigCap {
+func NewGPUInstanceCap[T uint32 | int | Index](gpu, gi T) MigCap {
 	return MigCap(fmt.Sprintf("gpu%d/gi%d/access", gpu, gi))
 }
 
 // NewComputeInstanceCap creates a MigCap for the specified MIG Compute instance.
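-// A GPU instance is uniquely defined by the GPU minor number, GI instance ID, and CI instance ID.
+// A compute instance is uniquely defined by the GPU minor number, GI instance ID, and CI instance ID.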
-func NewComputeInstanceCap(gpu, gi, ci int) MigCap {
+func NewComputeInstanceCap[T uint32 | int | Index](gpu, gi, ci T) MigCap {
 	return MigCap(fmt.Sprintf("gpu%d/gi%d/ci%d/access", gpu, gi, ci))
 }
 
 // FilterForGPU limits the MIG Caps to those associated with a particular GPU.
-func (m MigCaps) FilterForGPU(gpu int) MigCaps {
+func (m MigCaps) FilterForGPU(gpu Index) MigCaps {
 	if m == nil {
 		return nil
 	}
 	filtered := make(MigCaps)
-	for gi := 0; ; gi++ {
+	for gi := Index(0); ; gi++ {
 		giCap := NewGPUInstanceCap(gpu, gi)
 		giMinor, exist := m[giCap]
 		if !exist {
 			break
 		}
 		filtered[giCap] = giMinor
-		for ci := 0; ; ci++ {
+		for ci := Index(0); ; ci++ {
 			ciCap := NewComputeInstanceCap(gpu, gi, ci)
 			ciMinor, exist := m[ciCap]
 			if !exist {

From 699608902ba8b82e32fa93005770f75962e40143 Mon Sep 17 00:00:00 2001
From: Evan Lezar <elezar@nvidia.com>
Date: Wed, 12 Mar 2025 12:43:39 +0200
Subject: [PATCH 8/8] TOFIX

Signed-off-by: Evan Lezar <elezar@nvidia.com>
---
 internal/info/proc/devices/devices_test.go    |  4 +--
 internal/system/nvdevices/devices.go          | 11 ++++----
 internal/system/nvdevices/devices_test.go     | 26 +++++++++----------
 internal/system/nvdevices/gpu-device-nodes.go | 26 ++++++++++++++-----
 internal/system/nvdevices/mknod.go            |  2 --
 5 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/internal/info/proc/devices/devices_test.go b/internal/info/proc/devices/devices_test.go
index 1669dee6..945614d7 100644
--- a/internal/info/proc/devices/devices_test.go
+++ b/internal/info/proc/devices/devices_test.go
@@ -25,7 +25,7 @@ import (
 )
 
 func TestNvidiaDevices(t *testing.T) {
-	perDriverDeviceMaps := map[string]map[string]int{
+	perDriverDeviceMaps := map[string]map[string]uint32{
 		"pre550": {
 			"nvidia-frontend": 195,
 			"nvidia-nvlink":   234,
@@ -100,7 +100,7 @@ func TestProcessDeviceFileLine(t *testing.T) {
 	testCases := []struct {
 		line  string
 		name  string
-		major int
+		major uint32
 		err   bool
 	}{
 		{"", "", 0, true},
diff --git a/internal/system/nvdevices/devices.go b/internal/system/nvdevices/devices.go
index 9fa2eb1f..28fb90ce 100644
--- a/internal/system/nvdevices/devices.go
+++ b/internal/system/nvdevices/devices.go
@@ -20,7 +20,6 @@ import (
 	"errors"
 	"fmt"
 	"path/filepath"
-	"strconv"
 	"strings"
 
 	"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
@@ -89,25 +88,25 @@ func New(opts ...Option) (*Interface, error) {
 func (m *Interface) CreateDeviceNodes(id device.Identifier) error {
 	switch {
 	case id.IsGpuIndex():
-		index, err := strconv.ParseUint(string(id), 10, 32)
+		gpuIndex, err := toIndex(string(id))
 		if err != nil {
 			return fmt.Errorf("invalid GPU index: %v", id)
 		}
-		return m.createGPUDeviceNode(uint32(index))
+		return m.createGPUDeviceNode(gpuIndex)
 	case id.IsMigIndex():
 		indices := strings.Split(string(id), ":")
 		if len(indices) != 2 {
 			return fmt.Errorf("invalid MIG index %v", id)
 		}
-		gpuIndex, err := strconv.ParseUint(indices[0], 10, 32)
+		gpuIndex, err := toIndex(indices[0])
 		if err != nil {
 			return fmt.Errorf("invalid parent index %v: %w", indices[0], err)
 		}
-		if err := m.createGPUDeviceNode(uint32(gpuIndex)); err != nil {
+		if err := m.createGPUDeviceNode(gpuIndex); err != nil {
 			return fmt.Errorf("failed to create parent device node: %w", err)
 		}
 
-		return m.createMigDeviceNodes(uint32(gpuIndex))
+		return m.createMigDeviceNodes(gpuIndex)
 	case id.IsGpuUUID(), id.IsMigUUID(), id == "all":
 		return m.createAllGPUDeviceNodes()
 	default:
diff --git a/internal/system/nvdevices/devices_test.go b/internal/system/nvdevices/devices_test.go
index d4d8616c..9f4c13af 100644
--- a/internal/system/nvdevices/devices_test.go
+++ b/internal/system/nvdevices/devices_test.go
@@ -30,13 +30,13 @@ func TestCreateControlDevices(t *testing.T) {
 	logger, _ := testlog.NewNullLogger()
 
 	nvidiaDevices := devices.New(
-		devices.WithDeviceToMajor(map[string]int{
+		devices.WithDeviceToMajor(map[string]uint32{
 			"nvidia-frontend": 195,
 			"nvidia-uvm":      243,
 		}),
 	)
 	nvidia550Devices := devices.New(
-		devices.WithDeviceToMajor(map[string]int{
+		devices.WithDeviceToMajor(map[string]uint32{
 			"nvidia":     195,
 			"nvidia-uvm": 243,
 		}),
@@ -52,8 +52,8 @@ func TestCreateControlDevices(t *testing.T) {
 		expectedError error
 		expectedCalls []struct {
 			S  string
-			N1 int
-			N2 int
+			V1 uint32
+			V2 uint32
 		}
 	}{
 		{
@@ -63,8 +63,8 @@ func TestCreateControlDevices(t *testing.T) {
 			mknodeError: nil,
 			expectedCalls: []struct {
 				S  string
-				N1 int
-				N2 int
+				V1 uint32
+				V2 uint32
 			}{
 				{"/dev/nvidiactl", 195, 255},
 				{"/dev/nvidia-modeset", 195, 254},
@@ -79,8 +79,8 @@ func TestCreateControlDevices(t *testing.T) {
 			mknodeError: nil,
 			expectedCalls: []struct {
 				S  string
-				N1 int
-				N2 int
+				V1 uint32
+				V2 uint32
 			}{
 				{"/dev/nvidiactl", 195, 255},
 				{"/dev/nvidia-modeset", 195, 254},
@@ -95,8 +95,8 @@ func TestCreateControlDevices(t *testing.T) {
 			mknodeError: nil,
 			expectedCalls: []struct {
 				S  string
-				N1 int
-				N2 int
+				V1 uint32
+				V2 uint32
 			}{
 				{"/some/root/dev/nvidiactl", 195, 255},
 				{"/some/root/dev/nvidia-modeset", 195, 254},
@@ -112,8 +112,8 @@ func TestCreateControlDevices(t *testing.T) {
 			// We expect the first call to this to fail, and the rest to be skipped
 			expectedCalls: []struct {
 				S  string
-				N1 int
-				N2 int
+				V1 uint32
+				V2 uint32
 			}{
 				{"/dev/nvidiactl", 195, 255},
 			},
@@ -132,7 +132,7 @@ func TestCreateControlDevices(t *testing.T) {
 	for _, tc := range testCases {
 		t.Run(tc.description, func(t *testing.T) {
 			mknode := &mknoderMock{
-				MknodeFunc: func(string, int, int) error {
+				MknodeFunc: func(string, uint32, uint32) error {
 					return tc.mknodeError
 				},
 			}
diff --git a/internal/system/nvdevices/gpu-device-nodes.go b/internal/system/nvdevices/gpu-device-nodes.go
index b6ea7240..9075a0b1 100644
--- a/internal/system/nvdevices/gpu-device-nodes.go
+++ b/internal/system/nvdevices/gpu-device-nodes.go
@@ -20,35 +20,56 @@ import (
 	"errors"
 	"fmt"
 	"path/filepath"
+	"strconv"
 
 	"github.com/NVIDIA/go-nvlib/pkg/nvpci"
 
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
 )
 
-func (m *Interface) createGPUDeviceNode(gpuIndex uint32) error {
+// gpuIndex is the index of a GPU; it is also used as the minor number of the
+// GPU's /dev/nvidia<index> device node.
+type gpuIndex nvcaps.Index
+
+// toIndex parses index as a base-10 unsigned integer that fits in 32 bits.
+func toIndex(index string) (gpuIndex, error) {
+	i, err := strconv.ParseUint(index, 10, 32)
+	if err != nil {
+		return 0, err
+	}
+	return gpuIndex(i), nil
+}
+
+func (m *Interface) createGPUDeviceNode(gpu gpuIndex) error {
 	major, exists := m.Get(devices.NVIDIAGPU)
 	if !exists {
 		return fmt.Errorf("failed to determine device major; nvidia kernel module may not be loaded")
 	}
 
-	deviceNodePath := fmt.Sprintf("/dev/nvidia%d", gpuIndex)
-	if err := m.createDeviceNode(deviceNodePath, major, uint32(gpuIndex)); err != nil {
+	deviceNodePath := fmt.Sprintf("/dev/nvidia%d", gpu)
+	if err := m.createDeviceNode(deviceNodePath, major, uint32(gpu)); err != nil {
 		return fmt.Errorf("failed to create device node %v: %w", deviceNodePath, err)
 	}
 	return nil
 }
 
-func (m *Interface) createMigDeviceNodes(gpuIndex uint32) error {
+// createMigDeviceNodes creates the nvidia-caps device nodes associated with the
+// specified GPU. If no nvidia-caps major is known, nothing is created and nil is returned.
+func (m *Interface) createMigDeviceNodes(gpu gpuIndex) error {
 	capsMajor, exists := m.Get("nvidia-caps")
 	if !exists {
 		return nil
 	}
 	var errs error
-	for _, capsDeviceMinor := range m.migCaps.FilterForGPU(int(gpuIndex)) {
+	for _, capsDeviceMinor := range m.migCaps.FilterForGPU(nvcaps.Index(gpu)) {
 		capDevicePath := capsDeviceMinor.DevicePath()
 		err := m.createDeviceNode(capDevicePath, capsMajor, uint32(capsDeviceMinor))
-		errs = errors.Join(errs, fmt.Errorf("failed to create %v: %w", capDevicePath, err))
+		// fmt.Errorf always returns a non-nil error, so only wrap and record
+		// actual failures; otherwise every successful mknod is reported as an error.
+		if err != nil {
+			errs = errors.Join(errs, fmt.Errorf("failed to create %v: %w", capDevicePath, err))
+		}
 	}
 	return errs
 }
@@ -62,13 +83,13 @@ func (m *Interface) createAllGPUDeviceNodes() error {
 		return fmt.Errorf("failed to get GPU information from PCI: %w", err)
 	}
 
-	count := uint32(len(gpus))
+	count := gpuIndex(len(gpus))
 	if count == 0 {
 		return nil
 	}
 
 	var errs error
-	for gpuIndex := uint32(0); gpuIndex < count; gpuIndex++ {
-		errs = errors.Join(errs, m.createGPUDeviceNode(gpuIndex))
-		errs = errors.Join(errs, m.createMigDeviceNodes(gpuIndex))
+	for gpu := gpuIndex(0); gpu < count; gpu++ {
+		errs = errors.Join(errs, m.createGPUDeviceNode(gpu))
+		errs = errors.Join(errs, m.createMigDeviceNodes(gpu))
 	}
diff --git a/internal/system/nvdevices/mknod.go b/internal/system/nvdevices/mknod.go
index 0b35e9d6..30eef032 100644
--- a/internal/system/nvdevices/mknod.go
+++ b/internal/system/nvdevices/mknod.go
@@ -25,8 +25,6 @@ import (
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
 )
 
-type mint uint32
-
 //go:generate moq -fmt=goimports -rm -stub -out mknod_mock.go . mknoder
 type mknoder interface {
 	Mknode(string, uint32, uint32) error