1
0
mirror of https://github.com/clearml/go-nvlib synced 2025-05-11 15:21:25 +00:00

Detect iommu_group for PCI and mdev devices

This commit is contained in:
Christopher Desiniotis 2022-07-25 23:20:03 +00:00
parent f281b5e581
commit f52cd402a1
6 changed files with 114 additions and 31 deletions

View File

@ -100,6 +100,24 @@ func (m *MockNvmdev) AddMockA100Parent(address string, numaNode int) error {
return err return err
} }
_, err = os.Create(filepath.Join(deviceDir, "nvidia"))
if err != nil {
return err
}
err = os.Symlink(filepath.Join(deviceDir, "nvidia"), filepath.Join(deviceDir, "driver"))
if err != nil {
return err
}
_, err = os.Create(filepath.Join(deviceDir, "20"))
if err != nil {
return err
}
err = os.Symlink(filepath.Join(deviceDir, "20"), filepath.Join(deviceDir, "iommu_group"))
if err != nil {
return err
}
numa, err := os.Create(filepath.Join(deviceDir, "numa_node")) numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
if err != nil { if err != nil {
return err return err
@ -201,6 +219,12 @@ func (m *MockNvmdev) AddMockA100Mdev(uuid string, mdevType string, mdevTypeDir s
return err return err
} }
err = os.Symlink(filepath.Join(mdevDeviceDir, "vfio_mdev"), filepath.Join(mdevDeviceDir, "driver")) err = os.Symlink(filepath.Join(mdevDeviceDir, "vfio_mdev"), filepath.Join(mdevDeviceDir, "driver"))
_, err = os.Create(filepath.Join(mdevDeviceDir, "200"))
if err != nil {
return err
}
err = os.Symlink(filepath.Join(mdevDeviceDir, "200"), filepath.Join(mdevDeviceDir, "iommu_group"))
if err != nil { if err != nil {
return err return err
} }

View File

@ -53,11 +53,12 @@ type ParentDevice struct {
// Device represents an NVIDIA MDEV (vGPU) device // Device represents an NVIDIA MDEV (vGPU) device
type Device struct { type Device struct {
Path string Path string
UUID string UUID string
MDEVType string MDEVType string
Driver string Driver string
Parent *ParentDevice IommuGroup int
Parent *ParentDevice
} }
// New interface that allows us to get a list of all NVIDIA parent and MDEV (vGPU) devices // New interface that allows us to get a list of all NVIDIA parent and MDEV (vGPU) devices
@ -149,12 +150,18 @@ func NewDevice(root string, uuid string) (*Device, error) {
return nil, fmt.Errorf("error detecting driver: %v", err) return nil, fmt.Errorf("error detecting driver: %v", err)
} }
iommuGroup, err := m.iommuGroup()
if err != nil {
return nil, fmt.Errorf("error getting iommu_group: %v", err)
}
device := Device{ device := Device{
Path: path, Path: path,
UUID: uuid, UUID: uuid,
MDEVType: mdevType, MDEVType: mdevType,
Driver: driver, Driver: driver,
Parent: parent, IommuGroup: iommuGroup,
Parent: parent,
} }
return &device, nil return &device, nil
@ -175,15 +182,25 @@ func newMdev(devicePath string) (mdev, error) {
func (m mdev) String() string { func (m mdev) String() string {
return string(m) return string(m)
} }
// resolve follows the symlink named target inside the mdev device directory
// and returns the fully resolved path.
//
// The underlying error is wrapped (not just formatted) so callers can still
// inspect it, e.g. with errors.Is(err, fs.ErrNotExist) when the link is
// missing.
func (m mdev) resolve(target string) (string, error) {
	resolved, err := filepath.EvalSymlinks(path.Join(string(m), target))
	if err != nil {
		return "", fmt.Errorf("error resolving %q: %w", target, err)
	}
	return resolved, nil
}
func (m mdev) parentDevicePath() string { func (m mdev) parentDevicePath() string {
// /sys/bus/pci/devices/<addr>/<uuid> // /sys/bus/pci/devices/<addr>/<uuid>
return path.Dir(string(m)) return path.Dir(string(m))
} }
func (m mdev) Type() (string, error) { func (m mdev) Type() (string, error) {
mdevTypeDir, err := filepath.EvalSymlinks(path.Join(string(m), "mdev_type")) mdevTypeDir, err := m.resolve("mdev_type")
if err != nil { if err != nil {
return "", fmt.Errorf("error resolving mdev_type link for mdev %s: %v", m, err) return "", err
} }
mdevType, err := os.ReadFile(path.Join(mdevTypeDir, "name")) mdevType, err := os.ReadFile(path.Join(mdevTypeDir, "name"))
@ -201,13 +218,27 @@ func (m mdev) Type() (string, error) {
} }
func (m mdev) driver() (string, error) { func (m mdev) driver() (string, error) {
driver, err := filepath.EvalSymlinks(path.Join(string(m), "driver")) driver, err := m.resolve("driver")
if err != nil { if err != nil {
return "", err return "", err
} }
return filepath.Base(driver), nil return filepath.Base(driver), nil
} }
// iommuGroup returns the IOMMU group number of the mdev device, taken from
// the final path element of the resolved "iommu_group" symlink.
//
// On failure it returns -1 together with an error: either the error from
// resolving the link, or a wrapped parse error when the link target's name is
// not an integer.
func (m mdev) iommuGroup() (int, error) {
	iommu, err := m.resolve("iommu_group")
	if err != nil {
		return -1, err
	}

	iommuGroupStr := strings.TrimSpace(filepath.Base(iommu))
	// Base 0 is kept so the accepted syntax is unchanged. The bit size is the
	// native int width so the conversion to int below cannot silently
	// truncate on 32-bit platforms.
	iommuGroup, err := strconv.ParseInt(iommuGroupStr, 0, strconv.IntSize)
	if err != nil {
		// The wrapped strconv error already names the offending string and
		// the reason (invalid syntax or out of range).
		return -1, fmt.Errorf("unable to convert iommu_group to int: %w", err)
	}
	return int(iommuGroup), nil
}
// NewParentDevice constructs a ParentDevice // NewParentDevice constructs a ParentDevice
func NewParentDevice(devicePath string) (*ParentDevice, error) { func NewParentDevice(devicePath string) (*ParentDevice, error) {
nvdevice, err := nvpci.NewDevice(devicePath) nvdevice, err := nvpci.NewDevice(devicePath)

View File

@ -46,5 +46,7 @@ func TestNvmdev(t *testing.T) {
mdevs, err := nvmdev.GetAllDevices() mdevs, err := nvmdev.GetAllDevices()
require.Nil(t, err, "Error getting NVIDIA MDEV (vGPU) devices") require.Nil(t, err, "Error getting NVIDIA MDEV (vGPU) devices")
require.Equal(t, 1, len(mdevs), "Wrong number of NVIDIA MDEV (vGPU) devices") require.Equal(t, 1, len(mdevs), "Wrong number of NVIDIA MDEV (vGPU) devices")
require.Equal(t, "A100-4C", mdevs[0].MDEVType, "Wrong value for mdev_type")
require.Equal(t, "vfio_mdev", mdevs[0].Driver, "Wrong driver detected for mdev device") require.Equal(t, "vfio_mdev", mdevs[0].Driver, "Wrong driver detected for mdev device")
require.Equal(t, 200, mdevs[0].IommuGroup, "Wrong value for iommu_group")
} }

View File

@ -99,6 +99,15 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
return err return err
} }
_, err = os.Create(filepath.Join(deviceDir, "20"))
if err != nil {
return err
}
err = os.Symlink(filepath.Join(deviceDir, "20"), filepath.Join(deviceDir, "iommu_group"))
if err != nil {
return err
}
numa, err := os.Create(filepath.Join(deviceDir, "numa_node")) numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
if err != nil { if err != nil {
return err return err

View File

@ -65,15 +65,16 @@ var _ ResourceInterface = (*MemoryResources)(nil)
// NvidiaPCIDevice represents a PCI device for an NVIDIA product // NvidiaPCIDevice represents a PCI device for an NVIDIA product
type NvidiaPCIDevice struct { type NvidiaPCIDevice struct {
Path string Path string
Address string Address string
Vendor uint16 Vendor uint16
Class uint32 Class uint32
Device uint16 Device uint16
Driver string Driver string
NumaNode int IommuGroup int
Config *ConfigSpace NumaNode int
Resources MemoryResources Config *ConfigSpace
Resources MemoryResources
} }
// IsVGAController if class == 0x300 // IsVGAController if class == 0x300
@ -203,6 +204,20 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
return nil, fmt.Errorf("unable to detect driver for %s: %v", address, err) return nil, fmt.Errorf("unable to detect driver for %s: %v", address, err)
} }
var iommuGroup int64
iommu, err := filepath.EvalSymlinks(path.Join(devicePath, "iommu_group"))
if err == nil {
iommuGroupStr := strings.TrimSpace(filepath.Base(iommu))
iommuGroup, err = strconv.ParseInt(iommuGroupStr, 0, 64)
if err != nil {
return nil, fmt.Errorf("unable to convert iommu_group string to int64: %v", iommuGroupStr)
}
} else if os.IsNotExist(err) {
iommuGroup = -1
} else {
return nil, fmt.Errorf("unable to detect iommu_group for %s: %v", address, err)
}
numa, err := os.ReadFile(path.Join(devicePath, "numa_node")) numa, err := os.ReadFile(path.Join(devicePath, "numa_node"))
if err != nil { if err != nil {
return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err) return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err)
@ -244,15 +259,16 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
} }
nvdevice := &NvidiaPCIDevice{ nvdevice := &NvidiaPCIDevice{
Path: devicePath, Path: devicePath,
Address: address, Address: address,
Vendor: uint16(vendorID), Vendor: uint16(vendorID),
Class: uint32(classID), Class: uint32(classID),
Device: uint16(deviceID), Device: uint16(deviceID),
Driver: driver, Driver: driver,
NumaNode: int(numaNode), IommuGroup: int(iommuGroup),
Config: config, NumaNode: int(numaNode),
Resources: resources, Config: config,
Resources: resources,
} }
return nvdevice, nil return nvdevice, nil

View File

@ -46,6 +46,7 @@ func TestNvpci(t *testing.T) {
require.Equal(t, devices[0].Vendor, config.GetVendorID(), "Vendor IDs do not match") require.Equal(t, devices[0].Vendor, config.GetVendorID(), "Vendor IDs do not match")
require.Equal(t, devices[0].Device, config.GetDeviceID(), "Device IDs do not match") require.Equal(t, devices[0].Device, config.GetDeviceID(), "Device IDs do not match")
require.Equal(t, "nvidia", devices[0].Driver, "Wrong driver detected for device") require.Equal(t, "nvidia", devices[0].Driver, "Wrong driver detected for device")
require.Equal(t, 20, devices[0].IommuGroup, "Wrong iommu_group detected for device")
capabilities, err := config.GetPCICapabilities() capabilities, err := config.GetPCICapabilities()
require.Nil(t, err, "Error getting PCI capabilities") require.Nil(t, err, "Error getting PCI capabilities")