mirror of
https://github.com/clearml/go-nvlib
synced 2025-04-06 13:54:58 +00:00
Merge branch 'iommu-group' into 'main'
Detect iommu_group for PCI and mdev devices. See merge request nvidia/cloud-native/go-nvlib!12
This commit is contained in:
commit
c7f47cb02a
@ -100,6 +100,24 @@ func (m *MockNvmdev) AddMockA100Parent(address string, numaNode int) error {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = os.Create(filepath.Join(deviceDir, "nvidia"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = os.Symlink(filepath.Join(deviceDir, "nvidia"), filepath.Join(deviceDir, "driver"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = os.Create(filepath.Join(deviceDir, "20"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = os.Symlink(filepath.Join(deviceDir, "20"), filepath.Join(deviceDir, "iommu_group"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
|
||||
if err != nil {
|
||||
return err
|
||||
@ -201,6 +219,12 @@ func (m *MockNvmdev) AddMockA100Mdev(uuid string, mdevType string, mdevTypeDir s
|
||||
return err
|
||||
}
|
||||
err = os.Symlink(filepath.Join(mdevDeviceDir, "vfio_mdev"), filepath.Join(mdevDeviceDir, "driver"))
|
||||
|
||||
_, err = os.Create(filepath.Join(mdevDeviceDir, "200"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = os.Symlink(filepath.Join(mdevDeviceDir, "200"), filepath.Join(mdevDeviceDir, "iommu_group"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -53,11 +53,12 @@ type ParentDevice struct {
|
||||
|
||||
// Device represents an NVIDIA MDEV (vGPU) device
|
||||
type Device struct {
|
||||
Path string
|
||||
UUID string
|
||||
MDEVType string
|
||||
Driver string
|
||||
Parent *ParentDevice
|
||||
Path string
|
||||
UUID string
|
||||
MDEVType string
|
||||
Driver string
|
||||
IommuGroup int
|
||||
Parent *ParentDevice
|
||||
}
|
||||
|
||||
// New interface that allows us to get a list of all NVIDIA parent and MDEV (vGPU) devices
|
||||
@ -149,12 +150,18 @@ func NewDevice(root string, uuid string) (*Device, error) {
|
||||
return nil, fmt.Errorf("error detecting driver: %v", err)
|
||||
}
|
||||
|
||||
iommuGroup, err := m.iommuGroup()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting iommu_group: %v", err)
|
||||
}
|
||||
|
||||
device := Device{
|
||||
Path: path,
|
||||
UUID: uuid,
|
||||
MDEVType: mdevType,
|
||||
Driver: driver,
|
||||
Parent: parent,
|
||||
Path: path,
|
||||
UUID: uuid,
|
||||
MDEVType: mdevType,
|
||||
Driver: driver,
|
||||
IommuGroup: iommuGroup,
|
||||
Parent: parent,
|
||||
}
|
||||
|
||||
return &device, nil
|
||||
@ -175,15 +182,25 @@ func newMdev(devicePath string) (mdev, error) {
|
||||
func (m mdev) String() string {
|
||||
return string(m)
|
||||
}
|
||||
|
||||
func (m mdev) resolve(target string) (string, error) {
|
||||
resolved, err := filepath.EvalSymlinks(path.Join(string(m), target))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error resolving %q: %v", target, err)
|
||||
}
|
||||
|
||||
return resolved, nil
|
||||
}
|
||||
|
||||
func (m mdev) parentDevicePath() string {
|
||||
// /sys/bus/pci/devices/<addr>/<uuid>
|
||||
return path.Dir(string(m))
|
||||
}
|
||||
|
||||
func (m mdev) Type() (string, error) {
|
||||
mdevTypeDir, err := filepath.EvalSymlinks(path.Join(string(m), "mdev_type"))
|
||||
mdevTypeDir, err := m.resolve("mdev_type")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error resolving mdev_type link for mdev %s: %v", m, err)
|
||||
return "", err
|
||||
}
|
||||
|
||||
mdevType, err := os.ReadFile(path.Join(mdevTypeDir, "name"))
|
||||
@ -201,13 +218,27 @@ func (m mdev) Type() (string, error) {
|
||||
}
|
||||
|
||||
func (m mdev) driver() (string, error) {
|
||||
driver, err := filepath.EvalSymlinks(path.Join(string(m), "driver"))
|
||||
driver, err := m.resolve("driver")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return filepath.Base(driver), nil
|
||||
}
|
||||
|
||||
func (m mdev) iommuGroup() (int, error) {
|
||||
iommu, err := m.resolve("iommu_group")
|
||||
if err != nil {
|
||||
return -1, err
|
||||
}
|
||||
iommuGroupStr := strings.TrimSpace(filepath.Base(iommu))
|
||||
iommuGroup, err := strconv.ParseInt(iommuGroupStr, 0, 64)
|
||||
if err != nil {
|
||||
return -1, fmt.Errorf("unable to convert iommu_group string to int64: %v", iommuGroupStr)
|
||||
}
|
||||
|
||||
return int(iommuGroup), nil
|
||||
}
|
||||
|
||||
// NewParentDevice constructs a ParentDevice
|
||||
func NewParentDevice(devicePath string) (*ParentDevice, error) {
|
||||
nvdevice, err := nvpci.NewDevice(devicePath)
|
||||
|
@ -46,5 +46,7 @@ func TestNvmdev(t *testing.T) {
|
||||
mdevs, err := nvmdev.GetAllDevices()
|
||||
require.Nil(t, err, "Error getting NVIDIA MDEV (vGPU) devices")
|
||||
require.Equal(t, 1, len(mdevs), "Wrong number of NVIDIA MDEV (vGPU) devices")
|
||||
require.Equal(t, "A100-4C", mdevs[0].MDEVType, "Wrong value for mdev_type")
|
||||
require.Equal(t, "vfio_mdev", mdevs[0].Driver, "Wrong driver detected for mdev device")
|
||||
require.Equal(t, 200, mdevs[0].IommuGroup, "Wrong value for iommu_group")
|
||||
}
|
||||
|
@ -99,6 +99,15 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = os.Create(filepath.Join(deviceDir, "20"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = os.Symlink(filepath.Join(deviceDir, "20"), filepath.Join(deviceDir, "iommu_group"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -65,15 +65,16 @@ var _ ResourceInterface = (*MemoryResources)(nil)
|
||||
|
||||
// NvidiaPCIDevice represents a PCI device for an NVIDIA product
|
||||
type NvidiaPCIDevice struct {
|
||||
Path string
|
||||
Address string
|
||||
Vendor uint16
|
||||
Class uint32
|
||||
Device uint16
|
||||
Driver string
|
||||
NumaNode int
|
||||
Config *ConfigSpace
|
||||
Resources MemoryResources
|
||||
Path string
|
||||
Address string
|
||||
Vendor uint16
|
||||
Class uint32
|
||||
Device uint16
|
||||
Driver string
|
||||
IommuGroup int
|
||||
NumaNode int
|
||||
Config *ConfigSpace
|
||||
Resources MemoryResources
|
||||
}
|
||||
|
||||
// IsVGAController if class == 0x300
|
||||
@ -203,6 +204,20 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
|
||||
return nil, fmt.Errorf("unable to detect driver for %s: %v", address, err)
|
||||
}
|
||||
|
||||
var iommuGroup int64
|
||||
iommu, err := filepath.EvalSymlinks(path.Join(devicePath, "iommu_group"))
|
||||
if err == nil {
|
||||
iommuGroupStr := strings.TrimSpace(filepath.Base(iommu))
|
||||
iommuGroup, err = strconv.ParseInt(iommuGroupStr, 0, 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to convert iommu_group string to int64: %v", iommuGroupStr)
|
||||
}
|
||||
} else if os.IsNotExist(err) {
|
||||
iommuGroup = -1
|
||||
} else {
|
||||
return nil, fmt.Errorf("unable to detect iommu_group for %s: %v", address, err)
|
||||
}
|
||||
|
||||
numa, err := os.ReadFile(path.Join(devicePath, "numa_node"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err)
|
||||
@ -244,15 +259,16 @@ func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
|
||||
}
|
||||
|
||||
nvdevice := &NvidiaPCIDevice{
|
||||
Path: devicePath,
|
||||
Address: address,
|
||||
Vendor: uint16(vendorID),
|
||||
Class: uint32(classID),
|
||||
Device: uint16(deviceID),
|
||||
Driver: driver,
|
||||
NumaNode: int(numaNode),
|
||||
Config: config,
|
||||
Resources: resources,
|
||||
Path: devicePath,
|
||||
Address: address,
|
||||
Vendor: uint16(vendorID),
|
||||
Class: uint32(classID),
|
||||
Device: uint16(deviceID),
|
||||
Driver: driver,
|
||||
IommuGroup: int(iommuGroup),
|
||||
NumaNode: int(numaNode),
|
||||
Config: config,
|
||||
Resources: resources,
|
||||
}
|
||||
|
||||
return nvdevice, nil
|
||||
|
@ -46,6 +46,7 @@ func TestNvpci(t *testing.T) {
|
||||
require.Equal(t, devices[0].Vendor, config.GetVendorID(), "Vendor IDs do not match")
|
||||
require.Equal(t, devices[0].Device, config.GetDeviceID(), "Device IDs do not match")
|
||||
require.Equal(t, "nvidia", devices[0].Driver, "Wrong driver detected for device")
|
||||
require.Equal(t, 20, devices[0].IommuGroup, "Wrong iommu_group detected for device")
|
||||
|
||||
capabilities, err := config.GetPCICapabilities()
|
||||
require.Nil(t, err, "Error getting PCI capabilities")
|
||||
|
Loading…
Reference in New Issue
Block a user