diff --git a/CHANGELOG.md b/CHANGELOG.md index 7f03f2c5..5180dde4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ # NVIDIA Container Toolkit Changelog +* Add support for extracting device major number from `/proc/devices` if `nvidia` is used as a device name over `nvidia-frontend`. + ## v1.14.5 * Fix `nvidia-ctk runtime configure --cdi.enabled` for Docker. This was incorrectly setting `experimental = true` instead of setting `features.cdi = true`. diff --git a/internal/info/proc/devices/builder.go b/internal/info/proc/devices/builder.go new file mode 100644 index 00000000..ed93939a --- /dev/null +++ b/internal/info/proc/devices/builder.go @@ -0,0 +1,63 @@ +/** +# Copyright 2024 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package devices + +type builder struct { + asMap devices + filter func(string) bool +} + +// New creates a new devices struct with the specified options. +func New(opts ...Option) Devices { + b := &builder{} + for _, opt := range opts { + opt(b) + } + + if b.filter == nil { + b.filter = func(string) bool { return false } + } + + devices := make(devices) + for k, v := range b.asMap { + if b.filter(string(k)) { + continue + } + devices[k] = v + } + return devices +} + +// Option defines a functional option. +type Option func(*builder) + +// WithDeviceToMajor specifies an explicit device name to major number map. +func WithDeviceToMajor(deviceToMajor map[string]int) Option { + return func(b *builder) { + b.asMap = make(devices) + for name, major := range deviceToMajor { + b.asMap[Name(name)] = Major(major) + } + } +} + +// WithFilter specifies a filter to exclude devices. +func WithFilter(filter func(string) bool) Option { + return func(b *builder) { + b.filter = filter + } +} diff --git a/internal/info/proc/devices/devices.go b/internal/info/proc/devices/devices.go index 95430428..e0863f97 100644 --- a/internal/info/proc/devices/devices.go +++ b/internal/info/proc/devices/devices.go @@ -33,7 +33,7 @@ const ( NVIDIAModesetMinor = 254 NVIDIAFrontend = Name("nvidia-frontend") - NVIDIAGPU = NVIDIAFrontend + NVIDIAGPU = Name("nvidia") NVIDIACaps = Name("nvidia-caps") NVIDIAUVM = Name("nvidia-uvm") @@ -53,22 +53,43 @@ type Major int type Devices interface { Exists(Name) bool Get(Name) (Major, bool) + Count() int } type devices map[Name]Major var _ Devices = devices(nil) +// Count returns the number of devices defined. +func (d devices) Count() int { + return len(d) +} + // Exists checks if a Device with a given name exists or not func (d devices) Exists(name Name) bool { _, exists := d[name] return exists } -// Get a Device from Devices +// Get a Device from Devices. It also has fallback logic to ensure device name changes in /proc/devices are handled +// For e.g:- For GPU drivers 550.40.x or greater, the gpu device has been renamed from "nvidia-frontend" to "nvidia". func (d devices) Get(name Name) (Major, bool) { - device, exists := d[name] - return device, exists + for _, n := range name.getWithFallback() { + device, exists := d[n] + if exists { + return device, true + } + } + return 0, false +} + +// getWithFallback returns a prioritised list of device names for a specific name. +// This allows multiple names to be associated with a single name to support various driver versions. +func (n Name) getWithFallback() []Name { + if n == NVIDIAGPU || n == NVIDIAFrontend { + return []Name{NVIDIAGPU, NVIDIAFrontend} + } + return []Name{n} } // GetNVIDIADevices returns the set of NVIDIA Devices on the machine @@ -94,27 +115,23 @@ func nvidiaDevices(devicesPath string) (Devices, error) { var errNoNvidiaDevices = errors.New("no NVIDIA devices found") -func nvidiaDeviceFrom(reader io.Reader) (devices, error) { +func nvidiaDeviceFrom(reader io.Reader) (Devices, error) { allDevices := devicesFrom(reader) - nvidiaDevices := make(devices) - var hasNvidiaDevices bool - for n, d := range allDevices { - if !strings.HasPrefix(string(n), nvidiaDevicePrefix) { - continue - } - nvidiaDevices[n] = d - hasNvidiaDevices = true - } - - if !hasNvidiaDevices { + nvidiaDevices := New( + WithDeviceToMajor(allDevices), + WithFilter(func(n string) bool { + return !strings.HasPrefix(n, nvidiaDevicePrefix) + }), + ) + if nvidiaDevices.Count() == 0 { return nil, errNoNvidiaDevices } return nvidiaDevices, nil } -func devicesFrom(reader io.Reader) devices { - allDevices := make(devices) +func devicesFrom(reader io.Reader) map[string]int { + allDevices := make(map[string]int) scanner := bufio.NewScanner(reader) for scanner.Scan() { device, major, err := processProcDeviceLine(scanner.Text()) @@ -126,11 +143,11 @@ func devicesFrom(reader io.Reader) devices { return allDevices } -func processProcDeviceLine(line string) (Name, Major, error) { +func processProcDeviceLine(line string) (string, int, error) { trimmed := strings.TrimSpace(line) - var name Name - var major Major + var name string + var major int n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name) if n == 2 { diff --git a/internal/info/proc/devices/devices_mock.go b/internal/info/proc/devices/devices_mock.go index 315541c2..8f32cd8b 100644 --- a/internal/info/proc/devices/devices_mock.go +++ b/internal/info/proc/devices/devices_mock.go @@ -17,6 +17,9 @@ var _ Devices = &DevicesMock{} // // // make and configure a mocked Devices // mockedDevices := &DevicesMock{ +// CountFunc: func() int { +// panic("mock out the Count method") +// }, // ExistsFunc: func(name Name) bool { // panic("mock out the Exists method") // }, @@ -30,6 +33,9 @@ var _ Devices = &DevicesMock{} // // } type DevicesMock struct { + // CountFunc mocks the Count method. + CountFunc func() int + // ExistsFunc mocks the Exists method. ExistsFunc func(name Name) bool @@ -38,6 +44,9 @@ type DevicesMock struct { // calls tracks calls to the methods. calls struct { + // Count holds details about calls to the Count method. + Count []struct { + } // Exists holds details about calls to the Exists method. Exists []struct { // Name is the name argument value. @@ -49,10 +58,41 @@ type DevicesMock struct { Name Name } } + lockCount sync.RWMutex lockExists sync.RWMutex lockGet sync.RWMutex } +// Count calls CountFunc. +func (mock *DevicesMock) Count() int { + callInfo := struct { + }{} + mock.lockCount.Lock() + mock.calls.Count = append(mock.calls.Count, callInfo) + mock.lockCount.Unlock() + if mock.CountFunc == nil { + var ( + nOut int + ) + return nOut + } + return mock.CountFunc() +} + +// CountCalls gets all the calls that were made to Count. +// Check the length with: +// +// len(mockedDevices.CountCalls()) +func (mock *DevicesMock) CountCalls() []struct { +} { + var calls []struct { + } + mock.lockCount.RLock() + calls = mock.calls.Count + mock.lockCount.RUnlock() + return calls +} + // Exists calls ExistsFunc. func (mock *DevicesMock) Exists(name Name) bool { callInfo := struct { diff --git a/internal/info/proc/devices/devices_test.go b/internal/info/proc/devices/devices_test.go index 037ccbe0..1669dee6 100644 --- a/internal/info/proc/devices/devices_test.go +++ b/internal/info/proc/devices/devices_test.go @@ -25,22 +25,46 @@ import ( ) func TestNvidiaDevices(t *testing.T) { - devices := map[Name]Major{ - "nvidia-frontend": 195, - "nvidia-nvlink": 234, - "nvidia-caps": 235, - "nvidia-uvm": 510, - "nvidia-nvswitch": 511, + perDriverDeviceMaps := map[string]map[string]int{ + "pre550": { + "nvidia-frontend": 195, + "nvidia-nvlink": 234, + "nvidia-caps": 235, + "nvidia-uvm": 510, + "nvidia-nvswitch": 511, + }, + "post550": { + "nvidia": 195, + "nvidia-nvlink": 234, + "nvidia-caps": 235, + "nvidia-uvm": 510, + "nvidia-nvswitch": 511, + }, } - nvidiaDevices := testDevices(devices) - for name, major := range devices { - device, exists := nvidiaDevices.Get(name) - require.True(t, exists, "Unexpected missing device") - require.Equal(t, device, major, "Unexpected device major") + for k, devices := range perDriverDeviceMaps { + nvidiaDevices := New(WithDeviceToMajor(devices)) + t.Run(k, func(t *testing.T) { + // Each of the expected devices needs to exist. + for name, major := range devices { + device, exists := nvidiaDevices.Get(Name(name)) + require.True(t, exists) + require.Equal(t, device, Major(major)) + } + // An unexpected device cannot exist + _, exists := nvidiaDevices.Get("bogus") + require.False(t, exists) + + // Regardles of the driver version, the nvidia and nvidia-frontend + // names are supported and have the same value. + nvidia, exists := nvidiaDevices.Get(NVIDIAGPU) + require.True(t, exists) + nvidiaFrontend, exists := nvidiaDevices.Get(NVIDIAFrontend) + require.True(t, exists) + require.Equal(t, nvidia, nvidiaFrontend) + }) + } - _, exists := nvidiaDevices.Get("bogus") - require.False(t, exists, "Unexpected 'bogus' device found") } func TestProcessDeviceFile(t *testing.T) { @@ -52,6 +76,7 @@ func TestProcessDeviceFile(t *testing.T) { {lines: []string{}, expectedError: errNoNvidiaDevices}, {lines: []string{"Not a valid line:"}, expectedError: errNoNvidiaDevices}, {lines: []string{"195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}}, + {lines: []string{"195 nvidia"}, expected: devices{"nvidia": 195}}, {lines: []string{"195 nvidia-frontend", "235 nvidia-caps"}, expected: devices{"nvidia-frontend": 195, "nvidia-caps": 235}}, {lines: []string{" 195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}}, {lines: []string{"Not a valid line:", "", "195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}}, @@ -63,7 +88,10 @@ func TestProcessDeviceFile(t *testing.T) { d, err := nvidiaDeviceFrom(contents) require.ErrorIs(t, err, tc.expectedError) - require.EqualValues(t, tc.expected, d) + if tc.expectedError == nil { + require.EqualValues(t, tc.expected, d.(devices)) + } + }) } } @@ -71,8 +99,8 @@ func TestProcessDeviceFile(t *testing.T) { func TestProcessDeviceFileLine(t *testing.T) { testCases := []struct { line string - name Name - major Major + name string + major int err bool }{ {"", "", 0, true}, @@ -97,8 +125,3 @@ func TestProcessDeviceFileLine(t *testing.T) { }) } } - -// testDevices creates a set of test NVIDIA devices -func testDevices(d map[Name]Major) Devices { - return devices(d) -} diff --git a/internal/system/nvdevices/devices_test.go b/internal/system/nvdevices/devices_test.go index 5d94bc75..0af15320 100644 --- a/internal/system/nvdevices/devices_test.go +++ b/internal/system/nvdevices/devices_test.go @@ -28,15 +28,18 @@ import ( func TestCreateControlDevices(t *testing.T) { logger, _ := testlog.NewNullLogger() - nvidiaDevices := &devices.DevicesMock{ - GetFunc: func(name devices.Name) (devices.Major, bool) { - devices := map[devices.Name]devices.Major{ - "nvidia-frontend": 195, - "nvidia-uvm": 243, - } - return devices[name], true - }, - } + nvidiaDevices := devices.New( + devices.WithDeviceToMajor(map[string]int{ + "nvidia-frontend": 195, + "nvidia-uvm": 243, + }), + ) + nvidia550Devices := devices.New( + devices.WithDeviceToMajor(map[string]int{ + "nvidia": 195, + "nvidia-uvm": 243, + }), + ) mknodeError := errors.New("mknode error") @@ -53,7 +56,7 @@ func TestCreateControlDevices(t *testing.T) { } }{ { - description: "no root specified", + description: "no root specified; pre 550 driver", root: "", devices: nvidiaDevices, mknodeError: nil, @@ -68,6 +71,22 @@ func TestCreateControlDevices(t *testing.T) { {"/dev/nvidia-uvm-tools", 243, 1}, }, }, + { + description: "no root specified; 550 driver", + root: "", + devices: nvidia550Devices, + mknodeError: nil, + expectedCalls: []struct { + S string + N1 int + N2 int + }{ + {"/dev/nvidiactl", 195, 255}, + {"/dev/nvidia-modeset", 195, 254}, + {"/dev/nvidia-uvm", 243, 0}, + {"/dev/nvidia-uvm-tools", 243, 1}, + }, + }, { description: "some root specified", root: "/some/root", @@ -129,5 +148,4 @@ func TestCreateControlDevices(t *testing.T) { require.EqualValues(t, tc.expectedCalls, mknode.MknodeCalls()) }) } - }