Merge branch 'driver-detection' into 'main'

Detect driver bound to an NvidiaPCIDevice and mdev device

See merge request nvidia/cloud-native/go-nvlib!11
This commit is contained in:
Christopher Desiniotis 2022-07-14 20:39:17 +00:00
commit f281b5e581
8 changed files with 87 additions and 33 deletions

1
.gitignore vendored
View File

@ -1,2 +1,3 @@
*.swp *.swp
*.swo *.swo
*.test

View File

@ -20,7 +20,6 @@ import (
"fmt" "fmt"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci/bytes" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci/bytes"
"io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
) )
@ -34,7 +33,7 @@ var _ Interface = (*MockNvmdev)(nil)
// NewMock creates new mock mediated (vGPU) and parent PCI devices and removes old devices // NewMock creates new mock mediated (vGPU) and parent PCI devices and removes old devices
func NewMock() (mock *MockNvmdev, rerr error) { func NewMock() (mock *MockNvmdev, rerr error) {
mdevParentsRootDir, err := ioutil.TempDir("", "") mdevParentsRootDir, err := os.MkdirTemp(os.TempDir(), "")
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -43,7 +42,7 @@ func NewMock() (mock *MockNvmdev, rerr error) {
os.RemoveAll(mdevParentsRootDir) os.RemoveAll(mdevParentsRootDir)
} }
}() }()
mdevDevicesRootDir, err := ioutil.TempDir("", "") mdevDevicesRootDir, err := os.MkdirTemp(os.TempDir(), "")
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -184,14 +183,29 @@ func (m *MockNvmdev) AddMockA100Parent(address string, numaNode int) error {
// AddMockA100Mdev creates an A100 like MDEV (vGPU) mock device. // AddMockA100Mdev creates an A100 like MDEV (vGPU) mock device.
// The corresponding mocked parent A100 device must be created beforehand. // The corresponding mocked parent A100 device must be created beforehand.
func (m *MockNvmdev) AddMockA100Mdev(uuid string, mdevType string, parentMdevTypeDir string) error { func (m *MockNvmdev) AddMockA100Mdev(uuid string, mdevType string, mdevTypeDir string, parentDeviceDir string) error {
deviceDir := filepath.Join(m.mdevDevicesRoot, uuid) mdevDeviceDir := filepath.Join(parentDeviceDir, uuid)
err := os.MkdirAll(deviceDir, 0755) err := os.Mkdir(mdevDeviceDir, 0755)
if err != nil { if err != nil {
return err return err
} }
err = os.Symlink(parentMdevTypeDir, filepath.Join(deviceDir, "mdev_type")) parentMdevTypeDir := filepath.Join(parentDeviceDir, "mdev_supported_types", mdevTypeDir)
err = os.Symlink(parentMdevTypeDir, filepath.Join(mdevDeviceDir, "mdev_type"))
if err != nil {
return err
}
_, err = os.Create(filepath.Join(mdevDeviceDir, "vfio_mdev"))
if err != nil {
return err
}
err = os.Symlink(filepath.Join(mdevDeviceDir, "vfio_mdev"), filepath.Join(mdevDeviceDir, "driver"))
if err != nil {
return err
}
err = os.Symlink(mdevDeviceDir, filepath.Join(m.mdevDevicesRoot, uuid))
if err != nil { if err != nil {
return err return err
} }

View File

@ -19,7 +19,6 @@ package nvmdev
import ( import (
"fmt" "fmt"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci"
"io/ioutil"
"os" "os"
"path" "path"
"path/filepath" "path/filepath"
@ -57,6 +56,7 @@ type Device struct {
Path string Path string
UUID string UUID string
MDEVType string MDEVType string
Driver string
Parent *ParentDevice Parent *ParentDevice
} }
@ -67,7 +67,7 @@ func New() Interface {
// GetAllParentDevices returns all NVIDIA Parent PCI devices on the system // GetAllParentDevices returns all NVIDIA Parent PCI devices on the system
func (m *nvmdev) GetAllParentDevices() ([]*ParentDevice, error) { func (m *nvmdev) GetAllParentDevices() ([]*ParentDevice, error) {
deviceDirs, err := ioutil.ReadDir(m.mdevParentsRoot) deviceDirs, err := os.ReadDir(m.mdevParentsRoot)
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to read PCI bus devices: %v", err) return nil, fmt.Errorf("unable to read PCI bus devices: %v", err)
} }
@ -101,7 +101,7 @@ func (m *nvmdev) GetAllParentDevices() ([]*ParentDevice, error) {
// GetAllDevices returns all NVIDIA mdev (vGPU) devices on the system // GetAllDevices returns all NVIDIA mdev (vGPU) devices on the system
func (m *nvmdev) GetAllDevices() ([]*Device, error) { func (m *nvmdev) GetAllDevices() ([]*Device, error) {
deviceDirs, err := ioutil.ReadDir(m.mdevDevicesRoot) deviceDirs, err := os.ReadDir(m.mdevDevicesRoot)
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to read MDEV devices directory: %v", err) return nil, fmt.Errorf("unable to read MDEV devices directory: %v", err)
} }
@ -144,37 +144,49 @@ func NewDevice(root string, uuid string) (*Device, error) {
return nil, fmt.Errorf("error getting mdev type: %v", err) return nil, fmt.Errorf("error getting mdev type: %v", err)
} }
driver, err := m.driver()
if err != nil {
return nil, fmt.Errorf("error detecting driver: %v", err)
}
device := Device{ device := Device{
Path: path, Path: path,
UUID: uuid, UUID: uuid,
MDEVType: mdevType, MDEVType: mdevType,
Driver: driver,
Parent: parent, Parent: parent,
} }
return &device, nil return &device, nil
} }
// mdev represents the path to an NVIDIA mdev (vGPU) device.
type mdev string type mdev string
func newMdev(devicePath string) (mdev, error) { func newMdev(devicePath string) (mdev, error) {
mdevTypeDir, err := filepath.EvalSymlinks(path.Join(devicePath, "mdev_type")) mdevDir, err := filepath.EvalSymlinks(devicePath)
if err != nil { if err != nil {
return "", fmt.Errorf("error resolving mdev_type link: %v", err) return "", fmt.Errorf("error resolving symlink for %s: %v", devicePath, err)
} }
return mdev(mdevTypeDir), nil return mdev(mdevDir), nil
} }
func (m mdev) String() string { func (m mdev) String() string {
return string(m) return string(m)
} }
func (m mdev) parentDevicePath() string { func (m mdev) parentDevicePath() string {
// /sys/bus/pci/devices/<addr>/mdev_supported_types/<mdev_type> // /sys/bus/pci/devices/<addr>/<uuid>
return path.Dir(path.Dir(string(m))) return path.Dir(string(m))
} }
func (m mdev) Type() (string, error) { func (m mdev) Type() (string, error) {
mdevType, err := ioutil.ReadFile(path.Join(string(m), "name")) mdevTypeDir, err := filepath.EvalSymlinks(path.Join(string(m), "mdev_type"))
if err != nil {
return "", fmt.Errorf("error resolving mdev_type link for mdev %s: %v", m, err)
}
mdevType, err := os.ReadFile(path.Join(mdevTypeDir, "name"))
if err != nil { if err != nil {
return "", fmt.Errorf("unable to read mdev_type name for mdev %s: %v", m, err) return "", fmt.Errorf("unable to read mdev_type name for mdev %s: %v", m, err)
} }
@ -188,6 +200,14 @@ func (m mdev) Type() (string, error) {
return mdevTypeSplit[1], nil return mdevTypeSplit[1], nil
} }
// driver returns the name of the driver the mdev device is bound to.
//
// The name is the base name of the path obtained by resolving the "driver"
// entry inside the mdev device directory. If that entry (or its target) does
// not exist, the device is treated as not bound to any driver and an empty
// string is returned with a nil error; this mirrors how nvpci.NewDevice
// handles a missing "driver" link for PCI devices. Any other error hit while
// resolving the link is returned unchanged.
func (m mdev) driver() (string, error) {
	driverPath, err := filepath.EvalSymlinks(path.Join(string(m), "driver"))
	switch {
	case err == nil:
		return filepath.Base(driverPath), nil
	case os.IsNotExist(err):
		// No driver link: report "no driver" instead of failing device construction.
		return "", nil
	default:
		return "", err
	}
}
// NewParentDevice constructs a ParentDevice // NewParentDevice constructs a ParentDevice
func NewParentDevice(devicePath string) (*ParentDevice, error) { func NewParentDevice(devicePath string) (*ParentDevice, error) {
nvdevice, err := nvpci.NewDevice(devicePath) nvdevice, err := nvpci.NewDevice(devicePath)
@ -205,7 +225,7 @@ func NewParentDevice(devicePath string) (*ParentDevice, error) {
} }
mdevTypesMap := make(map[string]string) mdevTypesMap := make(map[string]string)
for _, path := range paths { for _, path := range paths {
name, err := ioutil.ReadFile(path) name, err := os.ReadFile(path)
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to read file %s: %v", path, err) return nil, fmt.Errorf("unable to read file %s: %v", path, err)
} }
@ -292,7 +312,7 @@ func (p *ParentDevice) GetAvailableMDEVInstances(mdevType string) (int, error) {
return -1, nil return -1, nil
} }
available, err := ioutil.ReadFile(filepath.Join(mdevPath, "available_instances")) available, err := os.ReadFile(filepath.Join(mdevPath, "available_instances"))
if err != nil { if err != nil {
return -1, fmt.Errorf("unable to read available_instances file: %v", err) return -1, fmt.Errorf("unable to read available_instances file: %v", err)
} }

View File

@ -18,7 +18,6 @@ package nvmdev
import ( import (
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"path/filepath"
"testing" "testing"
) )
@ -41,11 +40,11 @@ func TestNvmdev(t *testing.T) {
require.Nil(t, err, "Error checking if A100-4Q vGPU type is available for creation") require.Nil(t, err, "Error checking if A100-4Q vGPU type is available for creation")
require.True(t, available, "A100-4C should be available to create") require.True(t, available, "A100-4C should be available to create")
err = nvmdev.AddMockA100Mdev("b1914f0a-15cf-416e-8967-55fc7cb68e20", "A100-4C", err = nvmdev.AddMockA100Mdev("b1914f0a-15cf-416e-8967-55fc7cb68e20", "A100-4C", "nvidia-500", parentDevs[0].Path)
filepath.Join(parentDevs[0].Path, "mdev_supported_types/nvidia-500"))
require.Nil(t, err, "Error adding Mock A100 mediated device") require.Nil(t, err, "Error adding Mock A100 mediated device")
mdevs, err := nvmdev.GetAllDevices() mdevs, err := nvmdev.GetAllDevices()
require.Nil(t, err, "Error getting NVIDIA MDEV (vGPU) devices") require.Nil(t, err, "Error getting NVIDIA MDEV (vGPU) devices")
require.Equal(t, 1, len(mdevs), "Wrong number of NVIDIA MDEV (vGPU) devices") require.Equal(t, 1, len(mdevs), "Wrong number of NVIDIA MDEV (vGPU) devices")
require.Equal(t, "vfio_mdev", mdevs[0].Driver, "Wrong driver detected for mdev device")
} }

View File

@ -18,7 +18,7 @@ package nvpci
import ( import (
"fmt" "fmt"
"io/ioutil" "os"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci/bytes" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci/bytes"
) )
@ -71,7 +71,7 @@ type PCICapabilities struct {
} }
func (cs *ConfigSpace) Read() (ConfigSpaceIO, error) { func (cs *ConfigSpace) Read() (ConfigSpaceIO, error) {
config, err := ioutil.ReadFile(cs.Path) config, err := os.ReadFile(cs.Path)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to open file: %v", err) return nil, fmt.Errorf("failed to open file: %v", err)
} }

View File

@ -18,7 +18,6 @@ package nvpci
import ( import (
"fmt" "fmt"
"io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
@ -34,7 +33,7 @@ var _ Interface = (*MockNvpci)(nil)
// NewMockNvpci create new mock PCI and remove old devices // NewMockNvpci create new mock PCI and remove old devices
func NewMockNvpci() (mock *MockNvpci, rerr error) { func NewMockNvpci() (mock *MockNvpci, rerr error) {
rootDir, err := ioutil.TempDir("", "") rootDir, err := os.MkdirTemp(os.TempDir(), "")
if err != nil { if err != nil {
return nil, err return nil, err
} }
@ -91,6 +90,15 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
return err return err
} }
_, err = os.Create(filepath.Join(deviceDir, "nvidia"))
if err != nil {
return err
}
err = os.Symlink(filepath.Join(deviceDir, "nvidia"), filepath.Join(deviceDir, "driver"))
if err != nil {
return err
}
numa, err := os.Create(filepath.Join(deviceDir, "numa_node")) numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
if err != nil { if err != nil {
return err return err

View File

@ -18,9 +18,9 @@ package nvpci
import ( import (
"fmt" "fmt"
"io/ioutil"
"os" "os"
"path" "path"
"path/filepath"
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
@ -70,6 +70,7 @@ type NvidiaPCIDevice struct {
Vendor uint16 Vendor uint16
Class uint32 Class uint32
Device uint16 Device uint16
Driver string
NumaNode int NumaNode int
Config *ConfigSpace Config *ConfigSpace
Resources MemoryResources Resources MemoryResources
@ -104,7 +105,7 @@ func (d *NvidiaPCIDevice) IsResetAvailable() bool {
// Reset perform a reset to apply a new configuration at HW level // Reset perform a reset to apply a new configuration at HW level
func (d *NvidiaPCIDevice) Reset() error { func (d *NvidiaPCIDevice) Reset() error {
err := ioutil.WriteFile(path.Join(d.Path, "reset"), []byte("1"), 0) err := os.WriteFile(path.Join(d.Path, "reset"), []byte("1"), 0)
if err != nil { if err != nil {
return fmt.Errorf("unable to write to reset file: %v", err) return fmt.Errorf("unable to write to reset file: %v", err)
} }
@ -123,7 +124,7 @@ func NewFrom(root string) Interface {
// GetAllDevices returns all Nvidia PCI devices on the system // GetAllDevices returns all Nvidia PCI devices on the system
func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) { func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
deviceDirs, err := ioutil.ReadDir(p.pciDevicesRoot) deviceDirs, err := os.ReadDir(p.pciDevicesRoot)
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to read PCI bus devices: %v", err) return nil, fmt.Errorf("unable to read PCI bus devices: %v", err)
} }
@ -159,7 +160,7 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
func NewDevice(devicePath string) (*NvidiaPCIDevice, error) { func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
address := path.Base(devicePath) address := path.Base(devicePath)
vendor, err := ioutil.ReadFile(path.Join(devicePath, "vendor")) vendor, err := os.ReadFile(path.Join(devicePath, "vendor"))
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to read PCI device vendor id for %s: %v", address, err) return nil, fmt.Errorf("unable to read PCI device vendor id for %s: %v", address, err)
} }
@ -173,7 +174,7 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
return nil, nil return nil, nil
} }
class, err := ioutil.ReadFile(path.Join(devicePath, "class")) class, err := os.ReadFile(path.Join(devicePath, "class"))
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to read PCI device class for %s: %v", address, err) return nil, fmt.Errorf("unable to read PCI device class for %s: %v", address, err)
} }
@ -183,7 +184,7 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
return nil, fmt.Errorf("unable to convert class string to uint32: %v", classStr) return nil, fmt.Errorf("unable to convert class string to uint32: %v", classStr)
} }
device, err := ioutil.ReadFile(path.Join(devicePath, "device")) device, err := os.ReadFile(path.Join(devicePath, "device"))
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to read PCI device id for %s: %v", address, err) return nil, fmt.Errorf("unable to read PCI device id for %s: %v", address, err)
} }
@ -193,7 +194,16 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
return nil, fmt.Errorf("unable to convert device string to uint16: %v", deviceStr) return nil, fmt.Errorf("unable to convert device string to uint16: %v", deviceStr)
} }
numa, err := ioutil.ReadFile(path.Join(devicePath, "numa_node")) driver, err := filepath.EvalSymlinks(path.Join(devicePath, "driver"))
if err == nil {
driver = filepath.Base(driver)
} else if os.IsNotExist(err) {
driver = ""
} else {
return nil, fmt.Errorf("unable to detect driver for %s: %v", address, err)
}
numa, err := os.ReadFile(path.Join(devicePath, "numa_node"))
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err) return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err)
} }
@ -207,7 +217,7 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
Path: path.Join(devicePath, "config"), Path: path.Join(devicePath, "config"),
} }
resource, err := ioutil.ReadFile(path.Join(devicePath, "resource")) resource, err := os.ReadFile(path.Join(devicePath, "resource"))
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to read PCI resource file for %s: %v", address, err) return nil, fmt.Errorf("unable to read PCI resource file for %s: %v", address, err)
} }
@ -239,6 +249,7 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
Vendor: uint16(vendorID), Vendor: uint16(vendorID),
Class: uint32(classID), Class: uint32(classID),
Device: uint16(deviceID), Device: uint16(deviceID),
Driver: driver,
NumaNode: int(numaNode), NumaNode: int(numaNode),
Config: config, Config: config,
Resources: resources, Resources: resources,

View File

@ -45,6 +45,7 @@ func TestNvpci(t *testing.T) {
require.Nil(t, err, "Error reading config") require.Nil(t, err, "Error reading config")
require.Equal(t, devices[0].Vendor, config.GetVendorID(), "Vendor IDs do not match") require.Equal(t, devices[0].Vendor, config.GetVendorID(), "Vendor IDs do not match")
require.Equal(t, devices[0].Device, config.GetDeviceID(), "Device IDs do not match") require.Equal(t, devices[0].Device, config.GetDeviceID(), "Device IDs do not match")
require.Equal(t, "nvidia", devices[0].Driver, "Wrong driver detected for device")
capabilities, err := config.GetPCICapabilities() capabilities, err := config.GetPCICapabilities()
require.Nil(t, err, "Error getting PCI capabilities") require.Nil(t, err, "Error getting PCI capabilities")