mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-22 00:08:11 +00:00
Support both nvidia and nvidia-frontend when extracting major
With newer driver versions the nvidia-frontend name in /proc/devices was replaced with nvidia. This change allows both nvidia and nvidia-frontend to be used to extract the expected device major (195). Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
parent
9ea3360701
commit
4e2861fe77
@ -1,5 +1,7 @@
|
|||||||
# NVIDIA Container Toolkit Changelog
|
# NVIDIA Container Toolkit Changelog
|
||||||
|
|
||||||
|
* Add support for extracting device major number from `/proc/devices` if `nvidia` is used as a device name over `nvidia-frontend`.
|
||||||
|
|
||||||
## v1.14.5
|
## v1.14.5
|
||||||
* Fix `nvidia-ctk runtime configure --cdi.enabled` for Docker. This was incorrectly setting `experimental = true` instead
|
* Fix `nvidia-ctk runtime configure --cdi.enabled` for Docker. This was incorrectly setting `experimental = true` instead
|
||||||
of setting `features.cdi = true`.
|
of setting `features.cdi = true`.
|
||||||
|
63
internal/info/proc/devices/builder.go
Normal file
63
internal/info/proc/devices/builder.go
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
/**
|
||||||
|
# Copyright 2024 NVIDIA CORPORATION
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
**/
|
||||||
|
|
||||||
|
package devices
|
||||||
|
|
||||||
|
type builder struct {
|
||||||
|
asMap devices
|
||||||
|
filter func(string) bool
|
||||||
|
}
|
||||||
|
|
||||||
|
// New creates a new devices struct with the specified options.
|
||||||
|
func New(opts ...Option) Devices {
|
||||||
|
b := &builder{}
|
||||||
|
for _, opt := range opts {
|
||||||
|
opt(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
if b.filter == nil {
|
||||||
|
b.filter = func(string) bool { return false }
|
||||||
|
}
|
||||||
|
|
||||||
|
devices := make(devices)
|
||||||
|
for k, v := range b.asMap {
|
||||||
|
if b.filter(string(k)) {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
devices[k] = v
|
||||||
|
}
|
||||||
|
return devices
|
||||||
|
}
|
||||||
|
|
||||||
|
// Option defines a functional option.
|
||||||
|
type Option func(*builder)
|
||||||
|
|
||||||
|
// WithDeviceToMajor specifies an explicit device name to major number map.
|
||||||
|
func WithDeviceToMajor(deviceToMajor map[string]int) Option {
|
||||||
|
return func(b *builder) {
|
||||||
|
b.asMap = make(devices)
|
||||||
|
for name, major := range deviceToMajor {
|
||||||
|
b.asMap[Name(name)] = Major(major)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// WithFilter specifies a filter to exclude devices.
|
||||||
|
func WithFilter(filter func(string) bool) Option {
|
||||||
|
return func(b *builder) {
|
||||||
|
b.filter = filter
|
||||||
|
}
|
||||||
|
}
|
@ -33,7 +33,7 @@ const (
|
|||||||
NVIDIAModesetMinor = 254
|
NVIDIAModesetMinor = 254
|
||||||
|
|
||||||
NVIDIAFrontend = Name("nvidia-frontend")
|
NVIDIAFrontend = Name("nvidia-frontend")
|
||||||
NVIDIAGPU = NVIDIAFrontend
|
NVIDIAGPU = Name("nvidia")
|
||||||
NVIDIACaps = Name("nvidia-caps")
|
NVIDIACaps = Name("nvidia-caps")
|
||||||
NVIDIAUVM = Name("nvidia-uvm")
|
NVIDIAUVM = Name("nvidia-uvm")
|
||||||
|
|
||||||
@ -53,22 +53,43 @@ type Major int
|
|||||||
type Devices interface {
|
type Devices interface {
|
||||||
Exists(Name) bool
|
Exists(Name) bool
|
||||||
Get(Name) (Major, bool)
|
Get(Name) (Major, bool)
|
||||||
|
Count() int
|
||||||
}
|
}
|
||||||
|
|
||||||
type devices map[Name]Major
|
type devices map[Name]Major
|
||||||
|
|
||||||
var _ Devices = devices(nil)
|
var _ Devices = devices(nil)
|
||||||
|
|
||||||
|
// Count returns the number of devices defined.
|
||||||
|
func (d devices) Count() int {
|
||||||
|
return len(d)
|
||||||
|
}
|
||||||
|
|
||||||
// Exists checks if a Device with a given name exists or not
|
// Exists checks if a Device with a given name exists or not
|
||||||
func (d devices) Exists(name Name) bool {
|
func (d devices) Exists(name Name) bool {
|
||||||
_, exists := d[name]
|
_, exists := d[name]
|
||||||
return exists
|
return exists
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get a Device from Devices
|
// Get a Device from Devices. It also has fallback logic to ensure device name changes in /proc/devices are handled
|
||||||
|
// For e.g:- For GPU drivers 550.40.x or greater, the gpu device has been renamed from "nvidia-frontend" to "nvidia".
|
||||||
func (d devices) Get(name Name) (Major, bool) {
|
func (d devices) Get(name Name) (Major, bool) {
|
||||||
device, exists := d[name]
|
for _, n := range name.getWithFallback() {
|
||||||
return device, exists
|
device, exists := d[n]
|
||||||
|
if exists {
|
||||||
|
return device, true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 0, false
|
||||||
|
}
|
||||||
|
|
||||||
|
// getWithFallback returns a prioritised list of device names for a specific name.
|
||||||
|
// This allows multiple names to be associated with a single name to support various driver versions.
|
||||||
|
func (n Name) getWithFallback() []Name {
|
||||||
|
if n == NVIDIAGPU || n == NVIDIAFrontend {
|
||||||
|
return []Name{NVIDIAGPU, NVIDIAFrontend}
|
||||||
|
}
|
||||||
|
return []Name{n}
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetNVIDIADevices returns the set of NVIDIA Devices on the machine
|
// GetNVIDIADevices returns the set of NVIDIA Devices on the machine
|
||||||
@ -94,27 +115,23 @@ func nvidiaDevices(devicesPath string) (Devices, error) {
|
|||||||
|
|
||||||
var errNoNvidiaDevices = errors.New("no NVIDIA devices found")
|
var errNoNvidiaDevices = errors.New("no NVIDIA devices found")
|
||||||
|
|
||||||
func nvidiaDeviceFrom(reader io.Reader) (devices, error) {
|
func nvidiaDeviceFrom(reader io.Reader) (Devices, error) {
|
||||||
allDevices := devicesFrom(reader)
|
allDevices := devicesFrom(reader)
|
||||||
nvidiaDevices := make(devices)
|
|
||||||
|
|
||||||
var hasNvidiaDevices bool
|
nvidiaDevices := New(
|
||||||
for n, d := range allDevices {
|
WithDeviceToMajor(allDevices),
|
||||||
if !strings.HasPrefix(string(n), nvidiaDevicePrefix) {
|
WithFilter(func(n string) bool {
|
||||||
continue
|
return !strings.HasPrefix(n, nvidiaDevicePrefix)
|
||||||
}
|
}),
|
||||||
nvidiaDevices[n] = d
|
)
|
||||||
hasNvidiaDevices = true
|
if nvidiaDevices.Count() == 0 {
|
||||||
}
|
|
||||||
|
|
||||||
if !hasNvidiaDevices {
|
|
||||||
return nil, errNoNvidiaDevices
|
return nil, errNoNvidiaDevices
|
||||||
}
|
}
|
||||||
return nvidiaDevices, nil
|
return nvidiaDevices, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func devicesFrom(reader io.Reader) devices {
|
func devicesFrom(reader io.Reader) map[string]int {
|
||||||
allDevices := make(devices)
|
allDevices := make(map[string]int)
|
||||||
scanner := bufio.NewScanner(reader)
|
scanner := bufio.NewScanner(reader)
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
device, major, err := processProcDeviceLine(scanner.Text())
|
device, major, err := processProcDeviceLine(scanner.Text())
|
||||||
@ -126,11 +143,11 @@ func devicesFrom(reader io.Reader) devices {
|
|||||||
return allDevices
|
return allDevices
|
||||||
}
|
}
|
||||||
|
|
||||||
func processProcDeviceLine(line string) (Name, Major, error) {
|
func processProcDeviceLine(line string) (string, int, error) {
|
||||||
trimmed := strings.TrimSpace(line)
|
trimmed := strings.TrimSpace(line)
|
||||||
|
|
||||||
var name Name
|
var name string
|
||||||
var major Major
|
var major int
|
||||||
|
|
||||||
n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name)
|
n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name)
|
||||||
if n == 2 {
|
if n == 2 {
|
||||||
|
@ -17,6 +17,9 @@ var _ Devices = &DevicesMock{}
|
|||||||
//
|
//
|
||||||
// // make and configure a mocked Devices
|
// // make and configure a mocked Devices
|
||||||
// mockedDevices := &DevicesMock{
|
// mockedDevices := &DevicesMock{
|
||||||
|
// CountFunc: func() int {
|
||||||
|
// panic("mock out the Count method")
|
||||||
|
// },
|
||||||
// ExistsFunc: func(name Name) bool {
|
// ExistsFunc: func(name Name) bool {
|
||||||
// panic("mock out the Exists method")
|
// panic("mock out the Exists method")
|
||||||
// },
|
// },
|
||||||
@ -30,6 +33,9 @@ var _ Devices = &DevicesMock{}
|
|||||||
//
|
//
|
||||||
// }
|
// }
|
||||||
type DevicesMock struct {
|
type DevicesMock struct {
|
||||||
|
// CountFunc mocks the Count method.
|
||||||
|
CountFunc func() int
|
||||||
|
|
||||||
// ExistsFunc mocks the Exists method.
|
// ExistsFunc mocks the Exists method.
|
||||||
ExistsFunc func(name Name) bool
|
ExistsFunc func(name Name) bool
|
||||||
|
|
||||||
@ -38,6 +44,9 @@ type DevicesMock struct {
|
|||||||
|
|
||||||
// calls tracks calls to the methods.
|
// calls tracks calls to the methods.
|
||||||
calls struct {
|
calls struct {
|
||||||
|
// Count holds details about calls to the Count method.
|
||||||
|
Count []struct {
|
||||||
|
}
|
||||||
// Exists holds details about calls to the Exists method.
|
// Exists holds details about calls to the Exists method.
|
||||||
Exists []struct {
|
Exists []struct {
|
||||||
// Name is the name argument value.
|
// Name is the name argument value.
|
||||||
@ -49,10 +58,41 @@ type DevicesMock struct {
|
|||||||
Name Name
|
Name Name
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
lockCount sync.RWMutex
|
||||||
lockExists sync.RWMutex
|
lockExists sync.RWMutex
|
||||||
lockGet sync.RWMutex
|
lockGet sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Count calls CountFunc.
|
||||||
|
func (mock *DevicesMock) Count() int {
|
||||||
|
callInfo := struct {
|
||||||
|
}{}
|
||||||
|
mock.lockCount.Lock()
|
||||||
|
mock.calls.Count = append(mock.calls.Count, callInfo)
|
||||||
|
mock.lockCount.Unlock()
|
||||||
|
if mock.CountFunc == nil {
|
||||||
|
var (
|
||||||
|
nOut int
|
||||||
|
)
|
||||||
|
return nOut
|
||||||
|
}
|
||||||
|
return mock.CountFunc()
|
||||||
|
}
|
||||||
|
|
||||||
|
// CountCalls gets all the calls that were made to Count.
|
||||||
|
// Check the length with:
|
||||||
|
//
|
||||||
|
// len(mockedDevices.CountCalls())
|
||||||
|
func (mock *DevicesMock) CountCalls() []struct {
|
||||||
|
} {
|
||||||
|
var calls []struct {
|
||||||
|
}
|
||||||
|
mock.lockCount.RLock()
|
||||||
|
calls = mock.calls.Count
|
||||||
|
mock.lockCount.RUnlock()
|
||||||
|
return calls
|
||||||
|
}
|
||||||
|
|
||||||
// Exists calls ExistsFunc.
|
// Exists calls ExistsFunc.
|
||||||
func (mock *DevicesMock) Exists(name Name) bool {
|
func (mock *DevicesMock) Exists(name Name) bool {
|
||||||
callInfo := struct {
|
callInfo := struct {
|
||||||
|
@ -25,22 +25,46 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
func TestNvidiaDevices(t *testing.T) {
|
func TestNvidiaDevices(t *testing.T) {
|
||||||
devices := map[Name]Major{
|
perDriverDeviceMaps := map[string]map[string]int{
|
||||||
"nvidia-frontend": 195,
|
"pre550": {
|
||||||
"nvidia-nvlink": 234,
|
"nvidia-frontend": 195,
|
||||||
"nvidia-caps": 235,
|
"nvidia-nvlink": 234,
|
||||||
"nvidia-uvm": 510,
|
"nvidia-caps": 235,
|
||||||
"nvidia-nvswitch": 511,
|
"nvidia-uvm": 510,
|
||||||
|
"nvidia-nvswitch": 511,
|
||||||
|
},
|
||||||
|
"post550": {
|
||||||
|
"nvidia": 195,
|
||||||
|
"nvidia-nvlink": 234,
|
||||||
|
"nvidia-caps": 235,
|
||||||
|
"nvidia-uvm": 510,
|
||||||
|
"nvidia-nvswitch": 511,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
nvidiaDevices := testDevices(devices)
|
for k, devices := range perDriverDeviceMaps {
|
||||||
for name, major := range devices {
|
nvidiaDevices := New(WithDeviceToMajor(devices))
|
||||||
device, exists := nvidiaDevices.Get(name)
|
t.Run(k, func(t *testing.T) {
|
||||||
require.True(t, exists, "Unexpected missing device")
|
// Each of the expected devices needs to exist.
|
||||||
require.Equal(t, device, major, "Unexpected device major")
|
for name, major := range devices {
|
||||||
|
device, exists := nvidiaDevices.Get(Name(name))
|
||||||
|
require.True(t, exists)
|
||||||
|
require.Equal(t, device, Major(major))
|
||||||
|
}
|
||||||
|
// An unexpected device cannot exist
|
||||||
|
_, exists := nvidiaDevices.Get("bogus")
|
||||||
|
require.False(t, exists)
|
||||||
|
|
||||||
|
// Regardles of the driver version, the nvidia and nvidia-frontend
|
||||||
|
// names are supported and have the same value.
|
||||||
|
nvidia, exists := nvidiaDevices.Get(NVIDIAGPU)
|
||||||
|
require.True(t, exists)
|
||||||
|
nvidiaFrontend, exists := nvidiaDevices.Get(NVIDIAFrontend)
|
||||||
|
require.True(t, exists)
|
||||||
|
require.Equal(t, nvidia, nvidiaFrontend)
|
||||||
|
})
|
||||||
|
|
||||||
}
|
}
|
||||||
_, exists := nvidiaDevices.Get("bogus")
|
|
||||||
require.False(t, exists, "Unexpected 'bogus' device found")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestProcessDeviceFile(t *testing.T) {
|
func TestProcessDeviceFile(t *testing.T) {
|
||||||
@ -52,6 +76,7 @@ func TestProcessDeviceFile(t *testing.T) {
|
|||||||
{lines: []string{}, expectedError: errNoNvidiaDevices},
|
{lines: []string{}, expectedError: errNoNvidiaDevices},
|
||||||
{lines: []string{"Not a valid line:"}, expectedError: errNoNvidiaDevices},
|
{lines: []string{"Not a valid line:"}, expectedError: errNoNvidiaDevices},
|
||||||
{lines: []string{"195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}},
|
{lines: []string{"195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}},
|
||||||
|
{lines: []string{"195 nvidia"}, expected: devices{"nvidia": 195}},
|
||||||
{lines: []string{"195 nvidia-frontend", "235 nvidia-caps"}, expected: devices{"nvidia-frontend": 195, "nvidia-caps": 235}},
|
{lines: []string{"195 nvidia-frontend", "235 nvidia-caps"}, expected: devices{"nvidia-frontend": 195, "nvidia-caps": 235}},
|
||||||
{lines: []string{" 195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}},
|
{lines: []string{" 195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}},
|
||||||
{lines: []string{"Not a valid line:", "", "195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}},
|
{lines: []string{"Not a valid line:", "", "195 nvidia-frontend"}, expected: devices{"nvidia-frontend": 195}},
|
||||||
@ -63,7 +88,10 @@ func TestProcessDeviceFile(t *testing.T) {
|
|||||||
d, err := nvidiaDeviceFrom(contents)
|
d, err := nvidiaDeviceFrom(contents)
|
||||||
require.ErrorIs(t, err, tc.expectedError)
|
require.ErrorIs(t, err, tc.expectedError)
|
||||||
|
|
||||||
require.EqualValues(t, tc.expected, d)
|
if tc.expectedError == nil {
|
||||||
|
require.EqualValues(t, tc.expected, d.(devices))
|
||||||
|
}
|
||||||
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -71,8 +99,8 @@ func TestProcessDeviceFile(t *testing.T) {
|
|||||||
func TestProcessDeviceFileLine(t *testing.T) {
|
func TestProcessDeviceFileLine(t *testing.T) {
|
||||||
testCases := []struct {
|
testCases := []struct {
|
||||||
line string
|
line string
|
||||||
name Name
|
name string
|
||||||
major Major
|
major int
|
||||||
err bool
|
err bool
|
||||||
}{
|
}{
|
||||||
{"", "", 0, true},
|
{"", "", 0, true},
|
||||||
@ -97,8 +125,3 @@ func TestProcessDeviceFileLine(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// testDevices creates a set of test NVIDIA devices
|
|
||||||
func testDevices(d map[Name]Major) Devices {
|
|
||||||
return devices(d)
|
|
||||||
}
|
|
||||||
|
@ -28,15 +28,18 @@ import (
|
|||||||
func TestCreateControlDevices(t *testing.T) {
|
func TestCreateControlDevices(t *testing.T) {
|
||||||
logger, _ := testlog.NewNullLogger()
|
logger, _ := testlog.NewNullLogger()
|
||||||
|
|
||||||
nvidiaDevices := &devices.DevicesMock{
|
nvidiaDevices := devices.New(
|
||||||
GetFunc: func(name devices.Name) (devices.Major, bool) {
|
devices.WithDeviceToMajor(map[string]int{
|
||||||
devices := map[devices.Name]devices.Major{
|
"nvidia-frontend": 195,
|
||||||
"nvidia-frontend": 195,
|
"nvidia-uvm": 243,
|
||||||
"nvidia-uvm": 243,
|
}),
|
||||||
}
|
)
|
||||||
return devices[name], true
|
nvidia550Devices := devices.New(
|
||||||
},
|
devices.WithDeviceToMajor(map[string]int{
|
||||||
}
|
"nvidia": 195,
|
||||||
|
"nvidia-uvm": 243,
|
||||||
|
}),
|
||||||
|
)
|
||||||
|
|
||||||
mknodeError := errors.New("mknode error")
|
mknodeError := errors.New("mknode error")
|
||||||
|
|
||||||
@ -53,7 +56,7 @@ func TestCreateControlDevices(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
description: "no root specified",
|
description: "no root specified; pre 550 driver",
|
||||||
root: "",
|
root: "",
|
||||||
devices: nvidiaDevices,
|
devices: nvidiaDevices,
|
||||||
mknodeError: nil,
|
mknodeError: nil,
|
||||||
@ -68,6 +71,22 @@ func TestCreateControlDevices(t *testing.T) {
|
|||||||
{"/dev/nvidia-uvm-tools", 243, 1},
|
{"/dev/nvidia-uvm-tools", 243, 1},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
description: "no root specified; 550 driver",
|
||||||
|
root: "",
|
||||||
|
devices: nvidia550Devices,
|
||||||
|
mknodeError: nil,
|
||||||
|
expectedCalls: []struct {
|
||||||
|
S string
|
||||||
|
N1 int
|
||||||
|
N2 int
|
||||||
|
}{
|
||||||
|
{"/dev/nvidiactl", 195, 255},
|
||||||
|
{"/dev/nvidia-modeset", 195, 254},
|
||||||
|
{"/dev/nvidia-uvm", 243, 0},
|
||||||
|
{"/dev/nvidia-uvm-tools", 243, 1},
|
||||||
|
},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
description: "some root specified",
|
description: "some root specified",
|
||||||
root: "/some/root",
|
root: "/some/root",
|
||||||
@ -129,5 +148,4 @@ func TestCreateControlDevices(t *testing.T) {
|
|||||||
require.EqualValues(t, tc.expectedCalls, mknode.MknodeCalls())
|
require.EqualValues(t, tc.expectedCalls, mknode.MknodeCalls())
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user