diff --git a/internal/info/proc/devices/devices_test.go b/internal/info/proc/devices/devices_test.go index 1669dee6..945614d7 100644 --- a/internal/info/proc/devices/devices_test.go +++ b/internal/info/proc/devices/devices_test.go @@ -25,7 +25,7 @@ import ( ) func TestNvidiaDevices(t *testing.T) { - perDriverDeviceMaps := map[string]map[string]int{ + perDriverDeviceMaps := map[string]map[string]uint32{ "pre550": { "nvidia-frontend": 195, "nvidia-nvlink": 234, @@ -100,7 +100,7 @@ func TestProcessDeviceFileLine(t *testing.T) { testCases := []struct { line string name string - major int + major uint32 err bool }{ {"", "", 0, true}, diff --git a/internal/system/nvdevices/devices.go b/internal/system/nvdevices/devices.go index 9fa2eb1f..28fb90ce 100644 --- a/internal/system/nvdevices/devices.go +++ b/internal/system/nvdevices/devices.go @@ -20,7 +20,6 @@ import ( "errors" "fmt" "path/filepath" - "strconv" "strings" "github.com/NVIDIA/go-nvlib/pkg/nvlib/device" @@ -89,25 +88,25 @@ func New(opts ...Option) (*Interface, error) { func (m *Interface) CreateDeviceNodes(id device.Identifier) error { switch { case id.IsGpuIndex(): - index, err := strconv.ParseUint(string(id), 10, 32) + gpuIndex, err := toIndex(string(id)) if err != nil { return fmt.Errorf("invalid GPU index: %v", id) } - return m.createGPUDeviceNode(uint32(index)) + return m.createGPUDeviceNode(gpuIndex) case id.IsMigIndex(): indices := strings.Split(string(id), ":") if len(indices) != 2 { return fmt.Errorf("invalid MIG index %v", id) } - gpuIndex, err := strconv.ParseUint(indices[0], 10, 32) + gpuIndex, err := toIndex(indices[0]) if err != nil { return fmt.Errorf("invalid parent index %v: %w", indices[0], err) } - if err := m.createGPUDeviceNode(uint32(gpuIndex)); err != nil { + if err := m.createGPUDeviceNode(gpuIndex); err != nil { return fmt.Errorf("failed to create parent device node: %w", err) } - return m.createMigDeviceNodes(uint32(gpuIndex)) + return m.createMigDeviceNodes(gpuIndex) case id.IsGpuUUID(), id.IsMigUUID(), id == "all": return m.createAllGPUDeviceNodes() default: diff --git a/internal/system/nvdevices/devices_test.go b/internal/system/nvdevices/devices_test.go index d4d8616c..9f4c13af 100644 --- a/internal/system/nvdevices/devices_test.go +++ b/internal/system/nvdevices/devices_test.go @@ -30,13 +30,13 @@ func TestCreateControlDevices(t *testing.T) { logger, _ := testlog.NewNullLogger() nvidiaDevices := devices.New( - devices.WithDeviceToMajor(map[string]int{ + devices.WithDeviceToMajor(map[string]uint32{ "nvidia-frontend": 195, "nvidia-uvm": 243, }), ) nvidia550Devices := devices.New( - devices.WithDeviceToMajor(map[string]int{ + devices.WithDeviceToMajor(map[string]uint32{ "nvidia": 195, "nvidia-uvm": 243, }), @@ -52,8 +52,8 @@ func TestCreateControlDevices(t *testing.T) { expectedError error expectedCalls []struct { S string - N1 int - N2 int + V1 uint32 + V2 uint32 } }{ { @@ -63,8 +63,8 @@ func TestCreateControlDevices(t *testing.T) { mknodeError: nil, expectedCalls: []struct { S string - N1 int - N2 int + V1 uint32 + V2 uint32 }{ {"/dev/nvidiactl", 195, 255}, {"/dev/nvidia-modeset", 195, 254}, @@ -79,8 +79,8 @@ func TestCreateControlDevices(t *testing.T) { mknodeError: nil, expectedCalls: []struct { S string - N1 int - N2 int + V1 uint32 + V2 uint32 }{ {"/dev/nvidiactl", 195, 255}, {"/dev/nvidia-modeset", 195, 254}, @@ -95,8 +95,8 @@ func TestCreateControlDevices(t *testing.T) { mknodeError: nil, expectedCalls: []struct { S string - N1 int - N2 int + V1 uint32 + V2 uint32 }{ {"/some/root/dev/nvidiactl", 195, 255}, {"/some/root/dev/nvidia-modeset", 195, 254}, @@ -112,8 +112,8 @@ func TestCreateControlDevices(t *testing.T) { // We expect the first call to this to fail, and the rest to be skipped expectedCalls: []struct { S string - N1 int - N2 int + V1 uint32 + V2 uint32 }{ {"/dev/nvidiactl", 195, 255}, }, @@ -132,7 +132,7 @@ func TestCreateControlDevices(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { mknode := &mknoderMock{ - MknodeFunc: func(string, int, int) error { + MknodeFunc: func(string, uint32, uint32) error { return tc.mknodeError }, } diff --git a/internal/system/nvdevices/gpu-device-nodes.go b/internal/system/nvdevices/gpu-device-nodes.go index b6ea7240..9075a0b1 100644 --- a/internal/system/nvdevices/gpu-device-nodes.go +++ b/internal/system/nvdevices/gpu-device-nodes.go @@ -20,32 +20,44 @@ import ( "errors" "fmt" "path/filepath" + "strconv" "github.com/NVIDIA/go-nvlib/pkg/nvpci" "github.com/NVIDIA/nvidia-container-toolkit/internal/info/proc/devices" + "github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps" ) -func (m *Interface) createGPUDeviceNode(gpuIndex uint32) error { +type gpuIndex nvcaps.Index + +func toIndex(index string) (gpuIndex, error) { + i, err := strconv.ParseUint(index, 10, 32) + if err != nil { + return 0, err + } + return gpuIndex(i), nil +} + +func (m *Interface) createGPUDeviceNode(gpu gpuIndex) error { major, exists := m.Get(devices.NVIDIAGPU) if !exists { return fmt.Errorf("failed to determine device major; nvidia kernel module may not be loaded") } - deviceNodePath := fmt.Sprintf("/dev/nvidia%d", gpuIndex) - if err := m.createDeviceNode(deviceNodePath, major, uint32(gpuIndex)); err != nil { + deviceNodePath := fmt.Sprintf("/dev/nvidia%d", gpu) + if err := m.createDeviceNode(deviceNodePath, major, uint32(gpu)); err != nil { return fmt.Errorf("failed to create device node %v: %w", deviceNodePath, err) } return nil } -func (m *Interface) createMigDeviceNodes(gpuIndex uint32) error { +func (m *Interface) createMigDeviceNodes(gpu gpuIndex) error { capsMajor, exists := m.Get("nvidia-caps") if !exists { return nil } var errs error - for _, capsDeviceMinor := range m.migCaps.FilterForGPU(int(gpuIndex)) { + for _, capsDeviceMinor := range m.migCaps.FilterForGPU(nvcaps.Index(gpu)) { capDevicePath := capsDeviceMinor.DevicePath() err := m.createDeviceNode(capDevicePath, capsMajor, uint32(capsDeviceMinor)) errs = errors.Join(errs, fmt.Errorf("failed to create %v: %w", capDevicePath, err)) @@ -62,13 +74,13 @@ func (m *Interface) createAllGPUDeviceNodes() error { return fmt.Errorf("failed to get GPU information from PCI: %w", err) } - count := uint32(len(gpus)) + count := gpuIndex(len(gpus)) if count == 0 { return nil } var errs error - for gpuIndex := uint32(0); gpuIndex < count; gpuIndex++ { + for gpuIndex := gpuIndex(0); gpuIndex < count; gpuIndex++ { errs = errors.Join(errs, m.createGPUDeviceNode(gpuIndex)) errs = errors.Join(errs, m.createMigDeviceNodes(gpuIndex)) } diff --git a/internal/system/nvdevices/mknod.go b/internal/system/nvdevices/mknod.go index 0b35e9d6..30eef032 100644 --- a/internal/system/nvdevices/mknod.go +++ b/internal/system/nvdevices/mknod.go @@ -25,8 +25,6 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" ) -type mint uint32 - //go:generate moq -fmt=goimports -rm -stub -out mknod_mock.go . mknoder type mknoder interface { Mknode(string, uint32, uint32) error