mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-22 00:08:11 +00:00
add fallback logic when retrieving major number of the nvidia control device
Signed-off-by: Tariq Ibrahim <tibrahim@nvidia.com>
This commit is contained in:
parent
2f3600af9a
commit
f414ac2865
@ -33,7 +33,7 @@ const (
|
||||
NVIDIAModesetMinor = 254
|
||||
|
||||
NVIDIAFrontend = Name("nvidia-frontend")
|
||||
NVIDIAGPU = NVIDIAFrontend
|
||||
NVIDIAGPU = Name("nvidia")
|
||||
NVIDIACaps = Name("nvidia-caps")
|
||||
NVIDIAUVM = Name("nvidia-uvm")
|
||||
|
||||
@ -65,10 +65,25 @@ func (d devices) Exists(name Name) bool {
|
||||
return exists
|
||||
}
|
||||
|
||||
// Get a Device from Devices
|
||||
// Get a Device from Devices. It also has fallback logic to ensure device name changes in /proc/devices are handled
|
||||
// For e.g:- For GPU drivers 550.40.x or greater, the gpu device has been renamed from "nvidia-frontend" to "nvidia".
|
||||
func (d devices) Get(name Name) (Major, bool) {
|
||||
device, exists := d[name]
|
||||
return device, exists
|
||||
for _, n := range name.getWithFallback() {
|
||||
device, exists := d[n]
|
||||
if exists {
|
||||
return device, true
|
||||
}
|
||||
}
|
||||
return 0, false
|
||||
}
|
||||
|
||||
// getWithFallback returns a prioritised list of device names for a specific name.
|
||||
// This allows multiple names to be associated with a single name to support various driver versions.
|
||||
func (n Name) getWithFallback() []Name {
|
||||
if n == NVIDIAGPU || n == NVIDIAFrontend {
|
||||
return []Name{NVIDIAGPU, NVIDIAFrontend}
|
||||
}
|
||||
return []Name{n}
|
||||
}
|
||||
|
||||
// GetNVIDIADevices returns the set of NVIDIA Devices on the machine
|
||||
|
@ -41,6 +41,11 @@ func TestNvidiaDevices(t *testing.T) {
|
||||
}
|
||||
_, exists := nvidiaDevices.Get("bogus")
|
||||
require.False(t, exists, "Unexpected 'bogus' device found")
|
||||
|
||||
// assert that nvidia and nvidia-frontend can be used interchangeably and have the device major numbers
|
||||
m, exists := nvidiaDevices.Get("nvidia")
|
||||
require.True(t, exists)
|
||||
require.Equal(t, devices["nvidia-frontend"], m)
|
||||
}
|
||||
|
||||
func TestProcessDeviceFile(t *testing.T) {
|
||||
|
@ -31,11 +31,25 @@ func TestCreateControlDevices(t *testing.T) {
|
||||
|
||||
nvidiaDevices := &devices.DevicesMock{
|
||||
GetFunc: func(name devices.Name) (devices.Major, bool) {
|
||||
devices := map[devices.Name]devices.Major{
|
||||
devs := map[devices.Name]devices.Major{
|
||||
"nvidia-frontend": 195,
|
||||
"nvidia-uvm": 243,
|
||||
}
|
||||
return devices[name], true
|
||||
|
||||
// devs550_40 represents the device map from the nvidia gpu drivers >= 550.40.x
|
||||
devs550_40 := map[devices.Name]devices.Major{
|
||||
"nvidia": 195,
|
||||
"nvidia-uvm": 243,
|
||||
}
|
||||
|
||||
d, ok := devs[name]
|
||||
if ok {
|
||||
return d, ok
|
||||
}
|
||||
|
||||
// if device d is not found, fallback to the second mock device map
|
||||
d, ok = devs550_40[name]
|
||||
return d, ok
|
||||
},
|
||||
}
|
||||
|
||||
@ -46,6 +60,7 @@ func TestCreateControlDevices(t *testing.T) {
|
||||
root string
|
||||
devices devices.Devices
|
||||
mknodeError error
|
||||
hasError bool
|
||||
expectedError error
|
||||
expectedCalls []struct {
|
||||
S string
|
||||
@ -58,6 +73,7 @@ func TestCreateControlDevices(t *testing.T) {
|
||||
root: "",
|
||||
devices: nvidiaDevices,
|
||||
mknodeError: nil,
|
||||
hasError: false,
|
||||
expectedCalls: []struct {
|
||||
S string
|
||||
N1 int
|
||||
@ -73,6 +89,7 @@ func TestCreateControlDevices(t *testing.T) {
|
||||
description: "some root specified",
|
||||
root: "/some/root",
|
||||
devices: nvidiaDevices,
|
||||
hasError: false,
|
||||
mknodeError: nil,
|
||||
expectedCalls: []struct {
|
||||
S string
|
||||
@ -88,6 +105,7 @@ func TestCreateControlDevices(t *testing.T) {
|
||||
{
|
||||
description: "mknod error returns error",
|
||||
devices: nvidiaDevices,
|
||||
hasError: true,
|
||||
mknodeError: mknodeError,
|
||||
expectedError: mknodeError,
|
||||
// We expect the first call to this to fail, and the rest to be skipped
|
||||
@ -106,8 +124,24 @@ func TestCreateControlDevices(t *testing.T) {
|
||||
return 0, false
|
||||
},
|
||||
},
|
||||
hasError: true,
|
||||
expectedError: errInvalidDeviceNode,
|
||||
},
|
||||
{
|
||||
description: "nvidia device renamed from nvidia-frontend to nvidia",
|
||||
devices: nvidiaDevices,
|
||||
hasError: false,
|
||||
expectedCalls: []struct {
|
||||
S string
|
||||
N1 int
|
||||
N2 int
|
||||
}{
|
||||
{"/dev/nvidiactl", 195, 255},
|
||||
{"/dev/nvidia-modeset", 195, 254},
|
||||
{"/dev/nvidia-uvm", 243, 0},
|
||||
{"/dev/nvidia-uvm-tools", 243, 1},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
@ -126,9 +160,12 @@ func TestCreateControlDevices(t *testing.T) {
|
||||
d.mknoder = mknode
|
||||
|
||||
err := d.CreateNVIDIAControlDevices()
|
||||
require.ErrorIs(t, err, tc.expectedError)
|
||||
if tc.hasError {
|
||||
require.ErrorContains(t, err, tc.expectedError.Error())
|
||||
} else {
|
||||
require.Nil(t, err)
|
||||
}
|
||||
require.EqualValues(t, tc.expectedCalls, mknode.MknodeCalls())
|
||||
})
|
||||
}
|
||||
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user