From f414ac28653084d2398109d042bdedca133d3de1 Mon Sep 17 00:00:00 2001 From: Tariq Ibrahim Date: Mon, 5 Feb 2024 21:35:30 -0800 Subject: [PATCH 1/2] add fallback logic when retrieving major number of the nvidia control device Signed-off-by: Tariq Ibrahim --- internal/info/proc/devices/devices.go | 23 +++++++++-- internal/info/proc/devices/devices_test.go | 5 +++ internal/system/nvdevices/devices_test.go | 45 ++++++++++++++++++++-- 3 files changed, 65 insertions(+), 8 deletions(-) diff --git a/internal/info/proc/devices/devices.go b/internal/info/proc/devices/devices.go index 95430428..232df3b2 100644 --- a/internal/info/proc/devices/devices.go +++ b/internal/info/proc/devices/devices.go @@ -33,7 +33,7 @@ const ( NVIDIAModesetMinor = 254 NVIDIAFrontend = Name("nvidia-frontend") - NVIDIAGPU = NVIDIAFrontend + NVIDIAGPU = Name("nvidia") NVIDIACaps = Name("nvidia-caps") NVIDIAUVM = Name("nvidia-uvm") @@ -65,10 +65,25 @@ func (d devices) Exists(name Name) bool { return exists } -// Get a Device from Devices +// Get a Device from Devices. It also has fallback logic to ensure device name changes in /proc/devices are handled +// For e.g:- For GPU drivers 550.40.x or greater, the gpu device has been renamed from "nvidia-frontend" to "nvidia". func (d devices) Get(name Name) (Major, bool) { - device, exists := d[name] - return device, exists + for _, n := range name.getWithFallback() { + device, exists := d[n] + if exists { + return device, true + } + } + return 0, false +} + +// getWithFallback returns a prioritised list of device names for a specific name. +// This allows multiple names to be associated with a single name to support various driver versions. +func (n Name) getWithFallback() []Name { + if n == NVIDIAGPU || n == NVIDIAFrontend { + return []Name{NVIDIAGPU, NVIDIAFrontend} + } + return []Name{n} } // GetNVIDIADevices returns the set of NVIDIA Devices on the machine diff --git a/internal/info/proc/devices/devices_test.go b/internal/info/proc/devices/devices_test.go index 037ccbe0..97c56b7a 100644 --- a/internal/info/proc/devices/devices_test.go +++ b/internal/info/proc/devices/devices_test.go @@ -41,6 +41,11 @@ func TestNvidiaDevices(t *testing.T) { } _, exists := nvidiaDevices.Get("bogus") require.False(t, exists, "Unexpected 'bogus' device found") + + // assert that nvidia and nvidia-frontend can be used interchangeably and have the device major numbers + m, exists := nvidiaDevices.Get("nvidia") + require.True(t, exists) + require.Equal(t, devices["nvidia-frontend"], m) } func TestProcessDeviceFile(t *testing.T) { diff --git a/internal/system/nvdevices/devices_test.go b/internal/system/nvdevices/devices_test.go index 388f91dc..e1b68e65 100644 --- a/internal/system/nvdevices/devices_test.go +++ b/internal/system/nvdevices/devices_test.go @@ -31,11 +31,25 @@ func TestCreateControlDevices(t *testing.T) { nvidiaDevices := &devices.DevicesMock{ GetFunc: func(name devices.Name) (devices.Major, bool) { - devices := map[devices.Name]devices.Major{ + devs := map[devices.Name]devices.Major{ "nvidia-frontend": 195, "nvidia-uvm": 243, } - return devices[name], true + + // devs550_40 represents the device map from the nvidia gpu drivers >= 550.40.x + devs550_40 := map[devices.Name]devices.Major{ + "nvidia": 195, + "nvidia-uvm": 243, + } + + d, ok := devs[name] + if ok { + return d, ok + } + + // if device d is not found, fallback to the second mock device map + d, ok = devs550_40[name] + return d, ok }, } @@ -46,6 +60,7 @@ func TestCreateControlDevices(t *testing.T) { root string devices devices.Devices mknodeError error + hasError bool expectedError error expectedCalls []struct { S string @@ -58,6 +73,7 @@ func TestCreateControlDevices(t *testing.T) { root: "", devices: nvidiaDevices, mknodeError: nil, + hasError: false, expectedCalls: []struct { S string N1 int @@ -73,6 +89,7 @@ func TestCreateControlDevices(t *testing.T) { description: "some root specified", root: "/some/root", devices: nvidiaDevices, + hasError: false, mknodeError: nil, expectedCalls: []struct { S string @@ -88,6 +105,7 @@ func TestCreateControlDevices(t *testing.T) { { description: "mknod error returns error", devices: nvidiaDevices, + hasError: true, mknodeError: mknodeError, expectedError: mknodeError, // We expect the first call to this to fail, and the rest to be skipped @@ -106,8 +124,24 @@ func TestCreateControlDevices(t *testing.T) { return 0, false }, }, + hasError: true, expectedError: errInvalidDeviceNode, }, + { + description: "nvidia device renamed from nvidia-frontend to nvidia", + devices: nvidiaDevices, + hasError: false, + expectedCalls: []struct { + S string + N1 int + N2 int + }{ + {"/dev/nvidiactl", 195, 255}, + {"/dev/nvidia-modeset", 195, 254}, + {"/dev/nvidia-uvm", 243, 0}, + {"/dev/nvidia-uvm-tools", 243, 1}, + }, + }, } for _, tc := range testCases { @@ -126,9 +160,12 @@ func TestCreateControlDevices(t *testing.T) { d.mknoder = mknode err := d.CreateNVIDIAControlDevices() - require.ErrorIs(t, err, tc.expectedError) + if tc.hasError { + require.ErrorContains(t, err, tc.expectedError.Error()) + } else { + require.Nil(t, err) + } require.EqualValues(t, tc.expectedCalls, mknode.MknodeCalls()) }) } - } From e64b723b71be7419df525e1f5d1a0cb74a5a9006 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 6 Feb 2024 11:19:43 +0100 Subject: [PATCH 2/2] Add proc.devices.New constructor Signed-off-by: Evan Lezar --- internal/info/proc/devices/builder.go | 62 +++++++++++++++++ internal/info/proc/devices/devices.go | 36 +++++----- internal/info/proc/devices/devices_mock.go | 40 +++++++++++ internal/info/proc/devices/devices_test.go | 70 ++++++++++++------- internal/system/nvdevices/devices_test.go | 79 ++++++++-------------- 5 files changed, 195 insertions(+), 92 deletions(-) create mode 100644 internal/info/proc/devices/builder.go diff --git a/internal/info/proc/devices/builder.go b/internal/info/proc/devices/builder.go new file mode 100644 index 00000000..6da9a90d --- /dev/null +++ b/internal/info/proc/devices/builder.go @@ -0,0 +1,62 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package devices + +type builder struct { + asMap devices + filter func(string) bool +} + +// New creates a new devices struct with the specified options. +func New(opts ...Option) Devices { + b := &builder{} + for _, opt := range opts { + opt(b) + } + + if b.filter == nil { + b.filter = func(string) bool { return false } + } + + devices := make(devices) + for k, v := range b.asMap { + if b.filter(string(k)) { + continue + } + devices[k] = v + } + return devices +} + +type Option func(*builder) + +// WithDeviceToMajor specifies an explicit device name to major number map. +func WithDeviceToMajor(deviceToMajor map[string]int) Option { + return func(b *builder) { + b.asMap = make(devices) + for name, major := range deviceToMajor { + b.asMap[Name(name)] = Major(major) + } + } +} + +// WithFilter specifies a filter to exclude devices. +func WithFilter(filter func(string) bool) Option { + return func(b *builder) { + b.filter = filter + } +} diff --git a/internal/info/proc/devices/devices.go b/internal/info/proc/devices/devices.go index 232df3b2..e0863f97 100644 --- a/internal/info/proc/devices/devices.go +++ b/internal/info/proc/devices/devices.go @@ -53,12 +53,18 @@ type Major int type Devices interface { Exists(Name) bool Get(Name) (Major, bool) + Count() int } type devices map[Name]Major var _ Devices = devices(nil) +// Count returns the number of devices defined. +func (d devices) Count() int { + return len(d) +} + // Exists checks if a Device with a given name exists or not func (d devices) Exists(name Name) bool { _, exists := d[name] @@ -109,27 +115,23 @@ func nvidiaDevices(devicesPath string) (Devices, error) { var errNoNvidiaDevices = errors.New("no NVIDIA devices found") -func nvidiaDeviceFrom(reader io.Reader) (devices, error) { +func nvidiaDeviceFrom(reader io.Reader) (Devices, error) { allDevices := devicesFrom(reader) - nvidiaDevices := make(devices) - var hasNvidiaDevices bool - for n, d := range allDevices { - if !strings.HasPrefix(string(n), nvidiaDevicePrefix) { - continue - } - nvidiaDevices[n] = d - hasNvidiaDevices = true - } - - if !hasNvidiaDevices { + nvidiaDevices := New( + WithDeviceToMajor(allDevices), + WithFilter(func(n string) bool { + return !strings.HasPrefix(n, nvidiaDevicePrefix) + }), + ) + if nvidiaDevices.Count() == 0 { return nil, errNoNvidiaDevices } return nvidiaDevices, nil } -func devicesFrom(reader io.Reader) devices { - allDevices := make(devices) +func devicesFrom(reader io.Reader) map[string]int { + allDevices := make(map[string]int) scanner := bufio.NewScanner(reader) for scanner.Scan() { device, major, err := processProcDeviceLine(scanner.Text()) @@ -141,11 +143,11 @@ func devicesFrom(reader io.Reader) devices { return allDevices } -func processProcDeviceLine(line string) (Name, Major, error) { +func processProcDeviceLine(line string) (string, int, error) { trimmed := strings.TrimSpace(line) - var name Name - var major Major + var name string + var major int n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name) if n == 2 { diff --git a/internal/info/proc/devices/devices_mock.go b/internal/info/proc/devices/devices_mock.go index 315541c2..8f32cd8b 100644 --- a/internal/info/proc/devices/devices_mock.go +++ b/internal/info/proc/devices/devices_mock.go @@ -17,6 +17,9 @@ var _ Devices = &DevicesMock{} // // // make and configure a mocked Devices // mockedDevices := &DevicesMock{ +// CountFunc: func() int { +// panic("mock out the Count method") +// }, // ExistsFunc: func(name Name) bool { // panic("mock out the Exists method") // }, @@ -30,6 +33,9 @@ var _ Devices = &DevicesMock{} // // } type DevicesMock struct { + // CountFunc mocks the Count method. + CountFunc func() int + // ExistsFunc mocks the Exists method. ExistsFunc func(name Name) bool @@ -38,6 +44,9 @@ type DevicesMock struct { // calls tracks calls to the methods. calls struct { + // Count holds details about calls to the Count method. + Count []struct { + } // Exists holds details about calls to the Exists method. Exists []struct { // Name is the name argument value. @@ -49,10 +58,41 @@ type DevicesMock struct { Name Name } } + lockCount sync.RWMutex lockExists sync.RWMutex lockGet sync.RWMutex } +// Count calls CountFunc. +func (mock *DevicesMock) Count() int { + callInfo := struct { + }{} + mock.lockCount.Lock() + mock.calls.Count = append(mock.calls.Count, callInfo) + mock.lockCount.Unlock() + if mock.CountFunc == nil { + var ( + nOut int + ) + return nOut + } + return mock.CountFunc() +} + +// CountCalls gets all the calls that were made to Count. +// Check the length with: +// +// len(mockedDevices.CountCalls()) +func (mock *DevicesMock) CountCalls() []struct { +} { + var calls []struct { + } + mock.lockCount.RLock() + calls = mock.calls.Count + mock.lockCount.RUnlock() + return calls +} + // Exists calls ExistsFunc. func (mock *DevicesMock) Exists(name Name) bool { callInfo := struct { diff --git a/internal/info/proc/devices/devices_test.go b/internal/info/proc/devices/devices_test.go index 97c56b7a..1669dee6 100644 --- a/internal/info/proc/devices/devices_test.go +++ b/internal/info/proc/devices/devices_test.go @@ -25,27 +25,46 @@ import ( ) func TestNvidiaDevices(t *testing.T) { - devices := map[Name]Major{ - "nvidia-frontend": 195, - "nvidia-nvlink": 234, - "nvidia-caps": 235, - "nvidia-uvm": 510, - "nvidia-nvswitch": 511, + perDriverDeviceMaps := map[string]map[string]int{ + "pre550": { + "nvidia-frontend": 195, + "nvidia-nvlink": 234, + "nvidia-caps": 235, + "nvidia-uvm": 510, + "nvidia-nvswitch": 511, + }, + "post550": { + "nvidia": 195, + "nvidia-nvlink": 234, + "nvidia-caps": 235, + "nvidia-uvm": 510, + "nvidia-nvswitch": 511, + }, } - nvidiaDevices := testDevices(devices) - for name, major := range devices { - device, exists := nvidiaDevices.Get(name) - require.True(t, exists, "Unexpected missing device") - require.Equal(t, device, major, "Unexpected device major") - } - _, exists := nvidiaDevices.Get("bogus") - require.False(t, exists, "Unexpected 'bogus' device found") + for k, devices := range perDriverDeviceMaps { + nvidiaDevices := New(WithDeviceToMajor(devices)) + t.Run(k, func(t *testing.T) { + // Each of the expected devices needs to exist. + for name, major := range devices { + device, exists := nvidiaDevices.Get(Name(name)) + require.True(t, exists) + require.Equal(t, device, Major(major)) + } + // An unexpected device cannot exist + _, exists := nvidiaDevices.Get("bogus") + require.False(t, exists) - // assert that nvidia and nvidia-frontend can be used interchangeably and have the device major numbers - m, exists := nvidiaDevices.Get("nvidia") - require.True(t, exists) - require.Equal(t, devices["nvidia-frontend"], m) + // Regardles of the driver version, the nvidia and nvidia-frontend + // names are supported and have the same value. + nvidia, exists := nvidiaDevices.Get(NVIDIAGPU) + require.True(t, exists) + nvidiaFrontend, exists := nvidiaDevices.Get(NVIDIAFrontend) + require.True(t, exists) + require.Equal(t, nvidia, nvidiaFrontend) + }) + + } } func TestProcessDeviceFile(t *testing.T) { @@ -57,6 +76,7 @@ func TestProcessDeviceFile(t *testing.T) { {lines: []string{}, expectedError: errNoNvidiaDevices}, {lines: []string{"Not a valid line:"}, expectedError: errNoNvidiaDevices}, {lines: []string{"195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}}, + {lines: []string{"195 nvidia"}, expected: devices{"nvidia": 195}}, {lines: []string{"195 nvidia-frontend", "235 nvidia-caps"}, expected: devices{"nvidia-frontend": 195, "nvidia-caps": 235}}, {lines: []string{" 195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}}, {lines: []string{"Not a valid line:", "", "195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}}, @@ -68,7 +88,10 @@ func TestProcessDeviceFile(t *testing.T) { d, err := nvidiaDeviceFrom(contents) require.ErrorIs(t, err, tc.expectedError) - require.EqualValues(t, tc.expected, d) + if tc.expectedError == nil { + require.EqualValues(t, tc.expected, d.(devices)) + } + }) } } @@ -76,8 +99,8 @@ func TestProcessDeviceFile(t *testing.T) { func TestProcessDeviceFileLine(t *testing.T) { testCases := []struct { line string - name Name - major Major + name string + major int err bool }{ {"", "", 0, true}, @@ -102,8 +125,3 @@ func TestProcessDeviceFileLine(t *testing.T) { }) } } - -// testDevices creates a set of test NVIDIA devices -func testDevices(d map[Name]Major) Devices { - return devices(d) -} diff --git a/internal/system/nvdevices/devices_test.go b/internal/system/nvdevices/devices_test.go index e1b68e65..d4d8616c 100644 --- a/internal/system/nvdevices/devices_test.go +++ b/internal/system/nvdevices/devices_test.go @@ -29,29 +29,18 @@ import ( func TestCreateControlDevices(t *testing.T) { logger, _ := testlog.NewNullLogger() - nvidiaDevices := &devices.DevicesMock{ - GetFunc: func(name devices.Name) (devices.Major, bool) { - devs := map[devices.Name]devices.Major{ - "nvidia-frontend": 195, - "nvidia-uvm": 243, - } - - // devs550_40 represents the device map from the nvidia gpu drivers >= 550.40.x - devs550_40 := map[devices.Name]devices.Major{ - "nvidia": 195, - "nvidia-uvm": 243, - } - - d, ok := devs[name] - if ok { - return d, ok - } - - // if device d is not found, fallback to the second mock device map - d, ok = devs550_40[name] - return d, ok - }, - } + nvidiaDevices := devices.New( + devices.WithDeviceToMajor(map[string]int{ + "nvidia-frontend": 195, + "nvidia-uvm": 243, + }), + ) + nvidia550Devices := devices.New( + devices.WithDeviceToMajor(map[string]int{ + "nvidia": 195, + "nvidia-uvm": 243, + }), + ) mknodeError := errors.New("mknode error") @@ -60,7 +49,6 @@ func TestCreateControlDevices(t *testing.T) { root string devices devices.Devices mknodeError error - hasError bool expectedError error expectedCalls []struct { S string @@ -69,11 +57,26 @@ func TestCreateControlDevices(t *testing.T) { } }{ { - description: "no root specified", + description: "no root specified; pre 550 driver", root: "", devices: nvidiaDevices, mknodeError: nil, - hasError: false, + expectedCalls: []struct { + S string + N1 int + N2 int + }{ + {"/dev/nvidiactl", 195, 255}, + {"/dev/nvidia-modeset", 195, 254}, + {"/dev/nvidia-uvm", 243, 0}, + {"/dev/nvidia-uvm-tools", 243, 1}, + }, + }, + { + description: "no root specified; 550 driver", + root: "", + devices: nvidia550Devices, + mknodeError: nil, expectedCalls: []struct { S string N1 int @@ -89,7 +92,6 @@ func TestCreateControlDevices(t *testing.T) { description: "some root specified", root: "/some/root", devices: nvidiaDevices, - hasError: false, mknodeError: nil, expectedCalls: []struct { S string @@ -105,7 +107,6 @@ func TestCreateControlDevices(t *testing.T) { { description: "mknod error returns error", devices: nvidiaDevices, - hasError: true, mknodeError: mknodeError, expectedError: mknodeError, // We expect the first call to this to fail, and the rest to be skipped @@ -124,24 +125,8 @@ func TestCreateControlDevices(t *testing.T) { return 0, false }, }, - hasError: true, expectedError: errInvalidDeviceNode, }, - { - description: "nvidia device renamed from nvidia-frontend to nvidia", - devices: nvidiaDevices, - hasError: false, - expectedCalls: []struct { - S string - N1 int - N2 int - }{ - {"/dev/nvidiactl", 195, 255}, - {"/dev/nvidia-modeset", 195, 254}, - {"/dev/nvidia-uvm", 243, 0}, - {"/dev/nvidia-uvm-tools", 243, 1}, - }, - }, } for _, tc := range testCases { @@ -160,11 +145,7 @@ func TestCreateControlDevices(t *testing.T) { d.mknoder = mknode err := d.CreateNVIDIAControlDevices() - if tc.hasError { - require.ErrorContains(t, err, tc.expectedError.Error()) - } else { - require.Nil(t, err) - } + require.ErrorIs(t, err, tc.expectedError) require.EqualValues(t, tc.expectedCalls, mknode.MknodeCalls()) }) }