Add fallback logic when retrieving the major number of the NVIDIA control device

Signed-off-by: Tariq Ibrahim <tibrahim@nvidia.com>
This commit is contained in:
Tariq Ibrahim 2024-02-05 21:35:30 -08:00
parent 2f3600af9a
commit f414ac2865
3 changed files with 65 additions and 8 deletions

View File

@ -33,7 +33,7 @@ const (
NVIDIAModesetMinor = 254 NVIDIAModesetMinor = 254
NVIDIAFrontend = Name("nvidia-frontend") NVIDIAFrontend = Name("nvidia-frontend")
NVIDIAGPU = NVIDIAFrontend NVIDIAGPU = Name("nvidia")
NVIDIACaps = Name("nvidia-caps") NVIDIACaps = Name("nvidia-caps")
NVIDIAUVM = Name("nvidia-uvm") NVIDIAUVM = Name("nvidia-uvm")
@ -65,10 +65,25 @@ func (d devices) Exists(name Name) bool {
return exists return exists
} }
// Get a Device from Devices // Get a Device from Devices. It also has fallback logic to ensure device name changes in /proc/devices are handled
// For example, for GPU drivers 550.40.x or greater, the GPU device has been renamed from "nvidia-frontend" to "nvidia".
func (d devices) Get(name Name) (Major, bool) { func (d devices) Get(name Name) (Major, bool) {
device, exists := d[name] for _, n := range name.getWithFallback() {
return device, exists device, exists := d[n]
if exists {
return device, true
}
}
return 0, false
}
// getWithFallback returns a prioritised list of device names for a specific name.
// This allows multiple names to be associated with a single name to support various driver versions.
func (n Name) getWithFallback() []Name {
if n == NVIDIAGPU || n == NVIDIAFrontend {
return []Name{NVIDIAGPU, NVIDIAFrontend}
}
return []Name{n}
} }
// GetNVIDIADevices returns the set of NVIDIA Devices on the machine // GetNVIDIADevices returns the set of NVIDIA Devices on the machine

View File

@ -41,6 +41,11 @@ func TestNvidiaDevices(t *testing.T) {
} }
_, exists := nvidiaDevices.Get("bogus") _, exists := nvidiaDevices.Get("bogus")
require.False(t, exists, "Unexpected 'bogus' device found") require.False(t, exists, "Unexpected 'bogus' device found")
// assert that nvidia and nvidia-frontend can be used interchangeably and have the same device major number
m, exists := nvidiaDevices.Get("nvidia")
require.True(t, exists)
require.Equal(t, devices["nvidia-frontend"], m)
} }
func TestProcessDeviceFile(t *testing.T) { func TestProcessDeviceFile(t *testing.T) {

View File

@ -31,11 +31,25 @@ func TestCreateControlDevices(t *testing.T) {
nvidiaDevices := &devices.DevicesMock{ nvidiaDevices := &devices.DevicesMock{
GetFunc: func(name devices.Name) (devices.Major, bool) { GetFunc: func(name devices.Name) (devices.Major, bool) {
devices := map[devices.Name]devices.Major{ devs := map[devices.Name]devices.Major{
"nvidia-frontend": 195, "nvidia-frontend": 195,
"nvidia-uvm": 243, "nvidia-uvm": 243,
} }
return devices[name], true
// devs550_40 represents the device map from the nvidia gpu drivers >= 550.40.x
devs550_40 := map[devices.Name]devices.Major{
"nvidia": 195,
"nvidia-uvm": 243,
}
d, ok := devs[name]
if ok {
return d, ok
}
// if device d is not found, fall back to the second mock device map
d, ok = devs550_40[name]
return d, ok
}, },
} }
@ -46,6 +60,7 @@ func TestCreateControlDevices(t *testing.T) {
root string root string
devices devices.Devices devices devices.Devices
mknodeError error mknodeError error
hasError bool
expectedError error expectedError error
expectedCalls []struct { expectedCalls []struct {
S string S string
@ -58,6 +73,7 @@ func TestCreateControlDevices(t *testing.T) {
root: "", root: "",
devices: nvidiaDevices, devices: nvidiaDevices,
mknodeError: nil, mknodeError: nil,
hasError: false,
expectedCalls: []struct { expectedCalls: []struct {
S string S string
N1 int N1 int
@ -73,6 +89,7 @@ func TestCreateControlDevices(t *testing.T) {
description: "some root specified", description: "some root specified",
root: "/some/root", root: "/some/root",
devices: nvidiaDevices, devices: nvidiaDevices,
hasError: false,
mknodeError: nil, mknodeError: nil,
expectedCalls: []struct { expectedCalls: []struct {
S string S string
@ -88,6 +105,7 @@ func TestCreateControlDevices(t *testing.T) {
{ {
description: "mknod error returns error", description: "mknod error returns error",
devices: nvidiaDevices, devices: nvidiaDevices,
hasError: true,
mknodeError: mknodeError, mknodeError: mknodeError,
expectedError: mknodeError, expectedError: mknodeError,
// We expect the first call to this to fail, and the rest to be skipped // We expect the first call to this to fail, and the rest to be skipped
@ -106,8 +124,24 @@ func TestCreateControlDevices(t *testing.T) {
return 0, false return 0, false
}, },
}, },
hasError: true,
expectedError: errInvalidDeviceNode, expectedError: errInvalidDeviceNode,
}, },
{
description: "nvidia device renamed from nvidia-frontend to nvidia",
devices: nvidiaDevices,
hasError: false,
expectedCalls: []struct {
S string
N1 int
N2 int
}{
{"/dev/nvidiactl", 195, 255},
{"/dev/nvidia-modeset", 195, 254},
{"/dev/nvidia-uvm", 243, 0},
{"/dev/nvidia-uvm-tools", 243, 1},
},
},
} }
for _, tc := range testCases { for _, tc := range testCases {
@ -126,9 +160,12 @@ func TestCreateControlDevices(t *testing.T) {
d.mknoder = mknode d.mknoder = mknode
err := d.CreateNVIDIAControlDevices() err := d.CreateNVIDIAControlDevices()
require.ErrorIs(t, err, tc.expectedError) if tc.hasError {
require.ErrorContains(t, err, tc.expectedError.Error())
} else {
require.Nil(t, err)
}
require.EqualValues(t, tc.expectedCalls, mknode.MknodeCalls()) require.EqualValues(t, tc.expectedCalls, mknode.MknodeCalls())
}) })
} }
} }