mirror of
https://github.com/clearml/go-nvlib
synced 2025-04-06 13:54:58 +00:00
Merge branch 'driver-detection' into 'main'
Detect driver bound to an NvidiaPCIDevice and mdev device. See merge request nvidia/cloud-native/go-nvlib!11
This commit is contained in:
commit
f281b5e581
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,2 +1,3 @@
|
||||
*.swp
|
||||
*.swo
|
||||
*.test
|
||||
|
@ -20,7 +20,6 @@ import (
|
||||
"fmt"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci/bytes"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
)
|
||||
@ -34,7 +33,7 @@ var _ Interface = (*MockNvmdev)(nil)
|
||||
|
||||
// NewMock creates new mock mediated (vGPU) and parent PCI devices and removes old devices
|
||||
func NewMock() (mock *MockNvmdev, rerr error) {
|
||||
mdevParentsRootDir, err := ioutil.TempDir("", "")
|
||||
mdevParentsRootDir, err := os.MkdirTemp(os.TempDir(), "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -43,7 +42,7 @@ func NewMock() (mock *MockNvmdev, rerr error) {
|
||||
os.RemoveAll(mdevParentsRootDir)
|
||||
}
|
||||
}()
|
||||
mdevDevicesRootDir, err := ioutil.TempDir("", "")
|
||||
mdevDevicesRootDir, err := os.MkdirTemp(os.TempDir(), "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -184,14 +183,29 @@ func (m *MockNvmdev) AddMockA100Parent(address string, numaNode int) error {
|
||||
|
||||
// AddMockA100Mdev creates an A100 like MDEV (vGPU) mock device.
|
||||
// The corresponding mocked parent A100 device must be created beforehand.
|
||||
func (m *MockNvmdev) AddMockA100Mdev(uuid string, mdevType string, parentMdevTypeDir string) error {
|
||||
deviceDir := filepath.Join(m.mdevDevicesRoot, uuid)
|
||||
err := os.MkdirAll(deviceDir, 0755)
|
||||
func (m *MockNvmdev) AddMockA100Mdev(uuid string, mdevType string, mdevTypeDir string, parentDeviceDir string) error {
|
||||
mdevDeviceDir := filepath.Join(parentDeviceDir, uuid)
|
||||
err := os.Mkdir(mdevDeviceDir, 0755)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = os.Symlink(parentMdevTypeDir, filepath.Join(deviceDir, "mdev_type"))
|
||||
parentMdevTypeDir := filepath.Join(parentDeviceDir, "mdev_supported_types", mdevTypeDir)
|
||||
err = os.Symlink(parentMdevTypeDir, filepath.Join(mdevDeviceDir, "mdev_type"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = os.Create(filepath.Join(mdevDeviceDir, "vfio_mdev"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = os.Symlink(filepath.Join(mdevDeviceDir, "vfio_mdev"), filepath.Join(mdevDeviceDir, "driver"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = os.Symlink(mdevDeviceDir, filepath.Join(m.mdevDevicesRoot, uuid))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -19,7 +19,6 @@ package nvmdev
|
||||
import (
|
||||
"fmt"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
@ -57,6 +56,7 @@ type Device struct {
|
||||
Path string
|
||||
UUID string
|
||||
MDEVType string
|
||||
Driver string
|
||||
Parent *ParentDevice
|
||||
}
|
||||
|
||||
@ -67,7 +67,7 @@ func New() Interface {
|
||||
|
||||
// GetAllParentDevices returns all NVIDIA Parent PCI devices on the system
|
||||
func (m *nvmdev) GetAllParentDevices() ([]*ParentDevice, error) {
|
||||
deviceDirs, err := ioutil.ReadDir(m.mdevParentsRoot)
|
||||
deviceDirs, err := os.ReadDir(m.mdevParentsRoot)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read PCI bus devices: %v", err)
|
||||
}
|
||||
@ -101,7 +101,7 @@ func (m *nvmdev) GetAllParentDevices() ([]*ParentDevice, error) {
|
||||
|
||||
// GetAllDevices returns all NVIDIA mdev (vGPU) devices on the system
|
||||
func (m *nvmdev) GetAllDevices() ([]*Device, error) {
|
||||
deviceDirs, err := ioutil.ReadDir(m.mdevDevicesRoot)
|
||||
deviceDirs, err := os.ReadDir(m.mdevDevicesRoot)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read MDEV devices directory: %v", err)
|
||||
}
|
||||
@ -144,37 +144,49 @@ func NewDevice(root string, uuid string) (*Device, error) {
|
||||
return nil, fmt.Errorf("error getting mdev type: %v", err)
|
||||
}
|
||||
|
||||
driver, err := m.driver()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error detecting driver: %v", err)
|
||||
}
|
||||
|
||||
device := Device{
|
||||
Path: path,
|
||||
UUID: uuid,
|
||||
MDEVType: mdevType,
|
||||
Driver: driver,
|
||||
Parent: parent,
|
||||
}
|
||||
|
||||
return &device, nil
|
||||
}
|
||||
|
||||
// mdev represents the path to an NVIDIA mdev (vGPU) device.
|
||||
type mdev string
|
||||
|
||||
func newMdev(devicePath string) (mdev, error) {
|
||||
mdevTypeDir, err := filepath.EvalSymlinks(path.Join(devicePath, "mdev_type"))
|
||||
mdevDir, err := filepath.EvalSymlinks(devicePath)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error resolving mdev_type link: %v", err)
|
||||
return "", fmt.Errorf("error resolving symlink for %s: %v", devicePath, err)
|
||||
}
|
||||
|
||||
return mdev(mdevTypeDir), nil
|
||||
return mdev(mdevDir), nil
|
||||
}
|
||||
|
||||
// String returns the mdev value converted to a plain string.
func (m mdev) String() string {
|
||||
return string(m)
|
||||
}
|
||||
func (m mdev) parentDevicePath() string {
|
||||
// /sys/bus/pci/devices/<addr>/mdev_supported_types/<mdev_type>
|
||||
return path.Dir(path.Dir(string(m)))
|
||||
// /sys/bus/pci/devices/<addr>/<uuid>
|
||||
return path.Dir(string(m))
|
||||
}
|
||||
|
||||
func (m mdev) Type() (string, error) {
|
||||
mdevType, err := ioutil.ReadFile(path.Join(string(m), "name"))
|
||||
mdevTypeDir, err := filepath.EvalSymlinks(path.Join(string(m), "mdev_type"))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error resolving mdev_type link for mdev %s: %v", m, err)
|
||||
}
|
||||
|
||||
mdevType, err := os.ReadFile(path.Join(mdevTypeDir, "name"))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("unable to read mdev_type name for mdev %s: %v", m, err)
|
||||
}
|
||||
@ -188,6 +200,14 @@ func (m mdev) Type() (string, error) {
|
||||
return mdevTypeSplit[1], nil
|
||||
}
|
||||
|
||||
// driver resolves the "driver" symlink located under the mdev path and
// returns the base name of the resolved target. If the symlink cannot be
// resolved, the error from filepath.EvalSymlinks is returned unchanged
// together with an empty string.
func (m mdev) driver() (string, error) {
|
||||
driver, err := filepath.EvalSymlinks(path.Join(string(m), "driver"))
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return filepath.Base(driver), nil
|
||||
}
|
||||
|
||||
// NewParentDevice constructs a ParentDevice
|
||||
func NewParentDevice(devicePath string) (*ParentDevice, error) {
|
||||
nvdevice, err := nvpci.NewDevice(devicePath)
|
||||
@ -205,7 +225,7 @@ func NewParentDevice(devicePath string) (*ParentDevice, error) {
|
||||
}
|
||||
mdevTypesMap := make(map[string]string)
|
||||
for _, path := range paths {
|
||||
name, err := ioutil.ReadFile(path)
|
||||
name, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read file %s: %v", path, err)
|
||||
}
|
||||
@ -292,7 +312,7 @@ func (p *ParentDevice) GetAvailableMDEVInstances(mdevType string) (int, error) {
|
||||
return -1, nil
|
||||
}
|
||||
|
||||
available, err := ioutil.ReadFile(filepath.Join(mdevPath, "available_instances"))
|
||||
available, err := os.ReadFile(filepath.Join(mdevPath, "available_instances"))
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("unable to read available_instances file: %v", err)
|
||||
}
|
||||
|
@ -18,7 +18,6 @@ package nvmdev
|
||||
|
||||
import (
|
||||
"github.com/stretchr/testify/require"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
@ -41,11 +40,11 @@ func TestNvmdev(t *testing.T) {
|
||||
require.Nil(t, err, "Error checking if A100-4Q vGPU type is available for creation")
|
||||
require.True(t, available, "A100-4C should be available to create")
|
||||
|
||||
err = nvmdev.AddMockA100Mdev("b1914f0a-15cf-416e-8967-55fc7cb68e20", "A100-4C",
|
||||
filepath.Join(parentDevs[0].Path, "mdev_supported_types/nvidia-500"))
|
||||
err = nvmdev.AddMockA100Mdev("b1914f0a-15cf-416e-8967-55fc7cb68e20", "A100-4C", "nvidia-500", parentDevs[0].Path)
|
||||
require.Nil(t, err, "Error adding Mock A100 mediated device")
|
||||
|
||||
mdevs, err := nvmdev.GetAllDevices()
|
||||
require.Nil(t, err, "Error getting NVIDIA MDEV (vGPU) devices")
|
||||
require.Equal(t, 1, len(mdevs), "Wrong number of NVIDIA MDEV (vGPU) devices")
|
||||
require.Equal(t, "vfio_mdev", mdevs[0].Driver, "Wrong driver detected for mdev device")
|
||||
}
|
||||
|
@ -18,7 +18,7 @@ package nvpci
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci/bytes"
|
||||
)
|
||||
@ -71,7 +71,7 @@ type PCICapabilities struct {
|
||||
}
|
||||
|
||||
func (cs *ConfigSpace) Read() (ConfigSpaceIO, error) {
|
||||
config, err := ioutil.ReadFile(cs.Path)
|
||||
config, err := os.ReadFile(cs.Path)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open file: %v", err)
|
||||
}
|
||||
|
@ -18,7 +18,6 @@ package nvpci
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
@ -34,7 +33,7 @@ var _ Interface = (*MockNvpci)(nil)
|
||||
|
||||
// NewMockNvpci creates a new mock PCI and removes old devices
|
||||
func NewMockNvpci() (mock *MockNvpci, rerr error) {
|
||||
rootDir, err := ioutil.TempDir("", "")
|
||||
rootDir, err := os.MkdirTemp(os.TempDir(), "")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -91,6 +90,15 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = os.Create(filepath.Join(deviceDir, "nvidia"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = os.Symlink(filepath.Join(deviceDir, "nvidia"), filepath.Join(deviceDir, "driver"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -18,9 +18,9 @@ package nvpci
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"path"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
@ -70,6 +70,7 @@ type NvidiaPCIDevice struct {
|
||||
Vendor uint16
|
||||
Class uint32
|
||||
Device uint16
|
||||
Driver string
|
||||
NumaNode int
|
||||
Config *ConfigSpace
|
||||
Resources MemoryResources
|
||||
@ -104,7 +105,7 @@ func (d *NvidiaPCIDevice) IsResetAvailable() bool {
|
||||
|
||||
// Reset performs a reset to apply a new configuration at the HW level
|
||||
func (d *NvidiaPCIDevice) Reset() error {
|
||||
err := ioutil.WriteFile(path.Join(d.Path, "reset"), []byte("1"), 0)
|
||||
err := os.WriteFile(path.Join(d.Path, "reset"), []byte("1"), 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to write to reset file: %v", err)
|
||||
}
|
||||
@ -123,7 +124,7 @@ func NewFrom(root string) Interface {
|
||||
|
||||
// GetAllDevices returns all Nvidia PCI devices on the system
|
||||
func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
|
||||
deviceDirs, err := ioutil.ReadDir(p.pciDevicesRoot)
|
||||
deviceDirs, err := os.ReadDir(p.pciDevicesRoot)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read PCI bus devices: %v", err)
|
||||
}
|
||||
@ -159,7 +160,7 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
|
||||
func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
|
||||
address := path.Base(devicePath)
|
||||
|
||||
vendor, err := ioutil.ReadFile(path.Join(devicePath, "vendor"))
|
||||
vendor, err := os.ReadFile(path.Join(devicePath, "vendor"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read PCI device vendor id for %s: %v", address, err)
|
||||
}
|
||||
@ -173,7 +174,7 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
class, err := ioutil.ReadFile(path.Join(devicePath, "class"))
|
||||
class, err := os.ReadFile(path.Join(devicePath, "class"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read PCI device class for %s: %v", address, err)
|
||||
}
|
||||
@ -183,7 +184,7 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
|
||||
return nil, fmt.Errorf("unable to convert class string to uint32: %v", classStr)
|
||||
}
|
||||
|
||||
device, err := ioutil.ReadFile(path.Join(devicePath, "device"))
|
||||
device, err := os.ReadFile(path.Join(devicePath, "device"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read PCI device id for %s: %v", address, err)
|
||||
}
|
||||
@ -193,7 +194,16 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
|
||||
return nil, fmt.Errorf("unable to convert device string to uint16: %v", deviceStr)
|
||||
}
|
||||
|
||||
numa, err := ioutil.ReadFile(path.Join(devicePath, "numa_node"))
|
||||
driver, err := filepath.EvalSymlinks(path.Join(devicePath, "driver"))
|
||||
if err == nil {
|
||||
driver = filepath.Base(driver)
|
||||
} else if os.IsNotExist(err) {
|
||||
driver = ""
|
||||
} else {
|
||||
return nil, fmt.Errorf("unable to detect driver for %s: %v", address, err)
|
||||
}
|
||||
|
||||
numa, err := os.ReadFile(path.Join(devicePath, "numa_node"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err)
|
||||
}
|
||||
@ -207,7 +217,7 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
|
||||
Path: path.Join(devicePath, "config"),
|
||||
}
|
||||
|
||||
resource, err := ioutil.ReadFile(path.Join(devicePath, "resource"))
|
||||
resource, err := os.ReadFile(path.Join(devicePath, "resource"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read PCI resource file for %s: %v", address, err)
|
||||
}
|
||||
@ -239,6 +249,7 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
|
||||
Vendor: uint16(vendorID),
|
||||
Class: uint32(classID),
|
||||
Device: uint16(deviceID),
|
||||
Driver: driver,
|
||||
NumaNode: int(numaNode),
|
||||
Config: config,
|
||||
Resources: resources,
|
||||
|
@ -45,6 +45,7 @@ func TestNvpci(t *testing.T) {
|
||||
require.Nil(t, err, "Error reading config")
|
||||
require.Equal(t, devices[0].Vendor, config.GetVendorID(), "Vendor IDs do not match")
|
||||
require.Equal(t, devices[0].Device, config.GetDeviceID(), "Device IDs do not match")
|
||||
require.Equal(t, "nvidia", devices[0].Driver, "Wrong driver detected for device")
|
||||
|
||||
capabilities, err := config.GetPCICapabilities()
|
||||
require.Nil(t, err, "Error getting PCI capabilities")
|
||||
|
Loading…
Reference in New Issue
Block a user