From 505f83b94307ecd87512f4696a24143aaa128afd Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis Date: Wed, 25 May 2022 16:34:29 +0000 Subject: [PATCH] Add nvmdev package for mdev (vGPU) devices --- pkg/nvmdev/mock.go | 200 +++++++++++++++++++++++++ pkg/nvmdev/nvmdev.go | 306 ++++++++++++++++++++++++++++++++++++++ pkg/nvmdev/nvmdev_test.go | 51 +++++++ pkg/nvpci/config.go | 27 ++-- pkg/nvpci/mock.go | 12 +- pkg/nvpci/nvpci.go | 211 ++++++++++++++------------ 6 files changed, 693 insertions(+), 114 deletions(-) create mode 100644 pkg/nvmdev/mock.go create mode 100644 pkg/nvmdev/nvmdev.go create mode 100644 pkg/nvmdev/nvmdev_test.go diff --git a/pkg/nvmdev/mock.go b/pkg/nvmdev/mock.go new file mode 100644 index 0000000..76a82e2 --- /dev/null +++ b/pkg/nvmdev/mock.go @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package nvmdev + +import ( + "fmt" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci/bytes" + "io/ioutil" + "os" + "path/filepath" +) + +// MockNvmdev mock mdev device +type MockNvmdev struct { + *nvmdev +} + +var _ Interface = (*MockNvmdev)(nil) + +// NewMock creates a mock nvmdev rooted at two new temporary directories: one for parent PCI devices and one for mediated (vGPU) devices +func NewMock() (mock *MockNvmdev, rerr error) { + mdevParentsRootDir, err := ioutil.TempDir("", "") + if err != nil { + return nil, err + } + defer func() { + if rerr != nil { + os.RemoveAll(mdevParentsRootDir) + } + }() + mdevDevicesRootDir, err := ioutil.TempDir("", "") + if err != nil { + return nil, err + } + defer func() { + if rerr != nil { + os.RemoveAll(mdevDevicesRootDir) + } + }() + + mock = &MockNvmdev{ + &nvmdev{mdevParentsRootDir, mdevDevicesRootDir}, + } + + return mock, nil +} + +// Cleanup removes the mocked mediated (vGPU) and parent PCI devices root folders +func (m *MockNvmdev) Cleanup() { + os.RemoveAll(m.mdevParentsRoot) + os.RemoveAll(m.mdevDevicesRoot) +} + +// AddMockA100Parent creates an A100 like parent GPU mock device +func (m *MockNvmdev) AddMockA100Parent(address string, numaNode int) error { + deviceDir := filepath.Join(m.mdevParentsRoot, address) + err := os.MkdirAll(deviceDir, 0755) + if err != nil { + return err + } + + vendor, err := os.Create(filepath.Join(deviceDir, "vendor")) + if err != nil { + return err + } + _, err = vendor.WriteString(fmt.Sprintf("0x%x", nvpci.PCINvidiaVendorID)) + if err != nil { + return err + } + + class, err := os.Create(filepath.Join(deviceDir, "class")) + if err != nil { + return err + } + _, err = class.WriteString(fmt.Sprintf("0x%x", nvpci.PCI3dControllerClass)) + if err != nil { + return err + } + + device, err := os.Create(filepath.Join(deviceDir, "device")) + if err != nil { + return err + } + _, err = device.WriteString("0x20bf") + if err != nil { + return err + } + + numa, err := 
os.Create(filepath.Join(deviceDir, "numa_node")) + if err != nil { + return err + } + _, err = numa.WriteString(fmt.Sprintf("%v", numaNode)) + if err != nil { + return err + } + + config, err := os.Create(filepath.Join(deviceDir, "config")) + if err != nil { + return err + } + _data := make([]byte, nvpci.PCICfgSpaceStandardSize) + data := bytes.New(&_data) + data.Write16(0, nvpci.PCINvidiaVendorID) + data.Write16(2, uint16(0x20bf)) + data.Write8(nvpci.PCIStatusBytePosition, nvpci.PCIStatusCapabilityList) + _, err = config.Write(*data.Raw()) + if err != nil { + return err + } + + bar0 := []uint64{0x00000000c2000000, 0x00000000c2ffffff, 0x0000000000040200} + resource, err := os.Create(filepath.Join(deviceDir, "resource")) + if err != nil { + return err + } + _, err = resource.WriteString(fmt.Sprintf("0x%x 0x%x 0x%x", bar0[0], bar0[1], bar0[2])) + if err != nil { + return err + } + + pmcID := uint32(0x170000a1) + resource0, err := os.Create(filepath.Join(deviceDir, "resource0")) + if err != nil { + return err + } + _data = make([]byte, bar0[1]-bar0[0]+1) + data = bytes.New(&_data).LittleEndian() + data.Write32(0, pmcID) + _, err = resource0.Write(*data.Raw()) + if err != nil { + return err + } + + mdevSupportedTypes := []string{"A100-4C", "A100-5C", "A100-8C", "A100-10C", + "A100-20C", "A100-40C", "A100-1-5CME", "A100-1-5C", "A100-2-10C", "A100-3-20C", + "A100-4-20C", "A100-7-40C"} + mdevSupportedTypesDir := filepath.Join(deviceDir, "mdev_supported_types") + err = os.MkdirAll(mdevSupportedTypesDir, 0755) + if err != nil { + return err + } + for i, mdevTypeName := range mdevSupportedTypes { + mdevTypeDir := filepath.Join(mdevSupportedTypesDir, fmt.Sprintf("nvidia-%d", 500+i)) + err := os.MkdirAll(mdevTypeDir, 0755) + if err != nil { + return err + } + name, err := os.Create(filepath.Join(mdevTypeDir, "name")) + if err != nil { + return err + } + _, err = name.WriteString(fmt.Sprintf("NVIDIA %s", mdevTypeName)) + if err != nil { + return err + } + availableInstances, 
err := os.Create(filepath.Join(mdevTypeDir, "available_instances")) + if err != nil { + return err + } + _, err = availableInstances.WriteString("1") + if err != nil { + return err + } + } + + return nil +} + +// AddMockA100Mdev creates an A100 like MDEV (vGPU) mock device. +// The corresponding mocked parent A100 device must be created beforehand. +func (m *MockNvmdev) AddMockA100Mdev(uuid string, mdevType string, parentMdevTypeDir string) error { + deviceDir := filepath.Join(m.mdevDevicesRoot, uuid) + err := os.MkdirAll(deviceDir, 0755) + if err != nil { + return err + } + + err = os.Symlink(parentMdevTypeDir, filepath.Join(deviceDir, "mdev_type")) + if err != nil { + return err + } + + return nil +} diff --git a/pkg/nvmdev/nvmdev.go b/pkg/nvmdev/nvmdev.go new file mode 100644 index 0000000..ea45836 --- /dev/null +++ b/pkg/nvmdev/nvmdev.go @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package nvmdev + +import ( + "fmt" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci" + "io/ioutil" + "os" + "path" + "path/filepath" + "sort" + "strconv" + "strings" +) + +const ( + mdevParentsRoot = "/sys/class/mdev_bus" + mdevDevicesRoot = "/sys/bus/mdev/devices" +) + +// Interface allows us to get a list of NVIDIA MDEV (vGPU) and parent devices +type Interface interface { + GetAllDevices() ([]*Device, error) + GetAllParentDevices() ([]*ParentDevice, error) +} + +type nvmdev struct { + mdevParentsRoot string + mdevDevicesRoot string +} + +var _ Interface = (*nvmdev)(nil) + +// ParentDevice represents an NVIDIA parent PCI device +type ParentDevice struct { + *nvpci.NvidiaPCIDevice + mdevPaths map[string]string +} + +// Device represents an NVIDIA MDEV (vGPU) device +type Device struct { + Path string + UUID string + MDEVType string + Parent *ParentDevice +} + +// New interface that allows us to get a list of all NVIDIA parent and MDEV (vGPU) devices +func New() Interface { + return &nvmdev{mdevParentsRoot, mdevDevicesRoot} +} + +// GetAllParentDevices returns all NVIDIA Parent PCI devices on the system +func (m *nvmdev) GetAllParentDevices() ([]*ParentDevice, error) { + deviceDirs, err := ioutil.ReadDir(m.mdevParentsRoot) + if err != nil { + return nil, fmt.Errorf("unable to read PCI bus devices: %v", err) + } + + var nvdevices []*ParentDevice + for _, deviceDir := range deviceDirs { + devicePath := path.Join(m.mdevParentsRoot, deviceDir.Name()) + nvdevice, err := NewParentDevice(devicePath) + if err != nil { + return nil, fmt.Errorf("error constructing NVIDIA parent device: %v", err) + } + if nvdevice == nil { + continue + } + nvdevices = append(nvdevices, nvdevice) + } + + addressToID := func(address string) uint64 { + address = strings.ReplaceAll(address, ":", "") + address = strings.ReplaceAll(address, ".", "") + id, _ := strconv.ParseUint(address, 16, 64) + return id + } + + sort.Slice(nvdevices, func(i, j int) bool { + return 
addressToID(nvdevices[i].Address) < addressToID(nvdevices[j].Address) + }) + + return nvdevices, nil +} + +// GetAllDevices returns all NVIDIA mdev (vGPU) devices on the system +func (m *nvmdev) GetAllDevices() ([]*Device, error) { + deviceDirs, err := ioutil.ReadDir(m.mdevDevicesRoot) + if err != nil { + return nil, fmt.Errorf("unable to read MDEV devices directory: %v", err) + } + + var nvdevices []*Device + for _, deviceDir := range deviceDirs { + nvdevice, err := NewDevice(m.mdevDevicesRoot, deviceDir.Name()) + if err != nil { + return nil, fmt.Errorf("error constructing MDEV device: %v", err) + } + if nvdevice == nil { + continue + } + nvdevices = append(nvdevices, nvdevice) + } + + return nvdevices, nil +} + +// NewDevice constructs a Device, which represents an NVIDIA mdev (vGPU) device +func NewDevice(root string, uuid string) (*Device, error) { + path := path.Join(root, uuid) + + m, err := newMdev(path) + if err != nil { + return nil, err + } + + parent, err := NewParentDevice(m.parentDevicePath()) + if err != nil { + return nil, fmt.Errorf("error constructing NVIDIA PCI device: %v", err) + } + + if parent == nil { + return nil, nil + } + + mdevType, err := m.Type() + if err != nil { + return nil, fmt.Errorf("error getting mdev type: %v", err) + } + + device := Device{ + Path: path, + UUID: uuid, + MDEVType: mdevType, + Parent: parent, + } + + return &device, nil +} + +type mdev string + +func newMdev(devicePath string) (mdev, error) { + mdevTypeDir, err := filepath.EvalSymlinks(path.Join(devicePath, "mdev_type")) + if err != nil { + return "", fmt.Errorf("error resolving mdev_type link: %v", err) + } + + return mdev(mdevTypeDir), nil +} + +func (m mdev) String() string { + return string(m) +} +func (m mdev) parentDevicePath() string { + // /sys/bus/pci/devices/<address>/mdev_supported_types/<mdev_type> + return path.Dir(path.Dir(string(m))) +} + +func (m mdev) Type() (string, error) { + mdevType, err := ioutil.ReadFile(path.Join(string(m), "name")) + if err != nil { + return 
"", fmt.Errorf("unable to read mdev_type name for mdev %s: %v", m, err) + } + // file in the format: [NVIDIA|GRID] <vGPU type> + mdevTypeStr := strings.TrimSpace(string(mdevType)) + mdevTypeSplit := strings.SplitN(mdevTypeStr, " ", 2) + if len(mdevTypeSplit) != 2 { + return "", fmt.Errorf("unable to parse mdev_type name %s for mdev %s", mdevTypeStr, m) + } + + return mdevTypeSplit[1], nil +} + +// NewParentDevice constructs a ParentDevice +func NewParentDevice(devicePath string) (*ParentDevice, error) { + nvdevice, err := nvpci.NewDevice(devicePath) + if err != nil { + return nil, fmt.Errorf("failed to construct NVIDIA PCI device: %v", err) + } + if nvdevice == nil { + // not an NVIDIA device + return nil, err + } + + paths, err := filepath.Glob(fmt.Sprintf("%s/mdev_supported_types/nvidia-*/name", nvdevice.Path)) + if err != nil { + return nil, fmt.Errorf("unable to get files in mdev_supported_types directory: %v", err) + } + mdevTypesMap := make(map[string]string) + for _, path := range paths { + name, err := ioutil.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("unable to read file %s: %v", path, err) + } + // file in the format: [NVIDIA|GRID] <vGPU type> + nameStr := strings.TrimSpace(string(name)) + nameSplit := strings.SplitN(nameStr, " ", 2) + if len(nameSplit) != 2 { + return nil, fmt.Errorf("unable to parse mdev_type name %s at path %s", nameStr, path) + } + nameStr = nameSplit[len(nameSplit)-1] + + mdevTypesMap[nameStr] = filepath.Dir(path) + } + + return &ParentDevice{nvdevice, mdevTypesMap}, err +} + +// CreateMDEVDevice creates a mediated device (vGPU) on the parent GPU +func (p *ParentDevice) CreateMDEVDevice(mdevType string, id string) error { + mdevPath, ok := p.mdevPaths[mdevType] + if !ok { + return fmt.Errorf("unable to create mdev %s: mdev not supported by parent device %s", mdevType, p.Address) + } + f, err := os.OpenFile(filepath.Join(mdevPath, "create"), os.O_WRONLY|os.O_SYNC, 0200) + if err != nil { + return fmt.Errorf("unable to open create file: %v", 
err) + } + _, err = f.WriteString(id) + if err != nil { + return fmt.Errorf("unable to create mdev: %v", err) + } + return nil +} + +// DeleteMDEVDevice deletes a mediated device (vGPU) +func (p *ParentDevice) DeleteMDEVDevice(id string) error { + removeFile, err := os.OpenFile(filepath.Join(p.Path, id, "remove"), os.O_WRONLY|os.O_SYNC, 0200) + if err != nil { + return fmt.Errorf("unable to open remove file: %v", err) + } + _, err = removeFile.WriteString("1") + if err != nil { + return fmt.Errorf("unable to delete mdev: %v", err) + } + + return nil +} + +// Delete deletes a mediated device (vGPU) +func (m *Device) Delete() error { + removeFile, err := os.OpenFile(filepath.Join(m.Path, "remove"), os.O_WRONLY|os.O_SYNC, 0200) + if err != nil { + return fmt.Errorf("unable to open remove file: %v", err) + } + _, err = removeFile.WriteString("1") + if err != nil { + return fmt.Errorf("unable to delete mdev: %v", err) + } + + return nil +} + +// IsMDEVTypeSupported checks if the mdevType is supported by the GPU +func (p *ParentDevice) IsMDEVTypeSupported(mdevType string) bool { + _, found := p.mdevPaths[mdevType] + return found +} + +// IsMDEVTypeAvailable checks if a vGPU instance of mdevType can be created on the parent GPU +func (p *ParentDevice) IsMDEVTypeAvailable(mdevType string) (bool, error) { + availableInstances, err := p.GetAvailableMDEVInstances(mdevType) + if err != nil { + return false, fmt.Errorf("failed to get available instances for mdev type %s: %v", mdevType, err) + } + + return (availableInstances > 0), nil +} + +// GetAvailableMDEVInstances returns the available instances for mdevType. +// Return -1 if mdevType is not supported for the device. 
+func (p *ParentDevice) GetAvailableMDEVInstances(mdevType string) (int, error) { + mdevPath, ok := p.mdevPaths[mdevType] + if !ok { + return -1, nil + } + + available, err := ioutil.ReadFile(filepath.Join(mdevPath, "available_instances")) + if err != nil { + return -1, fmt.Errorf("unable to read available_instances file: %v", err) + } + + availableInstances, err := strconv.Atoi(strings.TrimSpace(string(available))) + if err != nil { + return -1, fmt.Errorf("unable to convert available_instances to an int: %v", err) + } + + return availableInstances, nil +} diff --git a/pkg/nvmdev/nvmdev_test.go b/pkg/nvmdev/nvmdev_test.go new file mode 100644 index 0000000..43815f4 --- /dev/null +++ b/pkg/nvmdev/nvmdev_test.go @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package nvmdev + +import ( + "github.com/stretchr/testify/require" + "path/filepath" + "testing" +) + +func TestNvmdev(t *testing.T) { + nvmdev, err := NewMock() + require.Nil(t, err, "Error creating MockNvmdev") + defer nvmdev.Cleanup() + + err = nvmdev.AddMockA100Parent("0000:3b:04.1", 0) + require.Nil(t, err, "Error adding Mock A100 parent device to MockNvmdev") + parentDevs, err := nvmdev.GetAllParentDevices() + require.Nil(t, err, "Error getting parent GPU devices") + require.Equal(t, 1, len(parentDevs), "Wrong number of parent GPU devices") + + parentA100 := parentDevs[0] + supported := parentA100.IsMDEVTypeSupported("A100-4C") + require.True(t, supported, "A100-4C should be a supported vGPU type") + + available, err := parentA100.IsMDEVTypeAvailable("A100-4C") + require.Nil(t, err, "Error checking if A100-4Q vGPU type is available for creation") + require.True(t, available, "A100-4C should be available to create") + + err = nvmdev.AddMockA100Mdev("b1914f0a-15cf-416e-8967-55fc7cb68e20", "A100-4C", + filepath.Join(parentDevs[0].Path, "mdev_supported_types/nvidia-500")) + require.Nil(t, err, "Error adding Mock A100 mediated device") + + mdevs, err := nvmdev.GetAllDevices() + require.Nil(t, err, "Error getting NVIDIA MDEV (vGPU) devices") + require.Equal(t, 1, len(mdevs), "Wrong number of NVIDIA MDEV (vGPU) devices") +} diff --git a/pkg/nvpci/config.go b/pkg/nvpci/config.go index 5373a0f..7cd2920 100644 --- a/pkg/nvpci/config.go +++ b/pkg/nvpci/config.go @@ -24,11 +24,16 @@ import ( ) const ( - pciCfgSpaceStandardSize = 256 - pciCfgSpaceExtendedSize = 4096 - pciCapabilityListPointer = 0x34 - pciStatusCapabilityList = 0x10 - pciStatusBytePosition = 0x06 + // PCICfgSpaceStandardSize represents the size in bytes of the standard config space + PCICfgSpaceStandardSize = 256 + // PCICfgSpaceExtendedSize represents the size in bytes of the extended config space + PCICfgSpaceExtendedSize = 4096 + // PCICapabilityListPointer represents offset for the capability 
list pointer + PCICapabilityListPointer = 0x34 + // PCIStatusCapabilityList represents the status register bit which indicates capability list support + PCIStatusCapabilityList = 0x10 + // PCIStatusBytePosition represents the position of the status register + PCIStatusBytePosition = 0x06 ) // ConfigSpace PCI configuration space (standard extended) file path @@ -87,12 +92,12 @@ func (cs *configSpaceIO) GetPCICapabilities() (*PCICapabilities, error) { make(map[uint16]*PCIExtendedCapability), } - support := cs.Read8(pciStatusBytePosition) & pciStatusCapabilityList + support := cs.Read8(PCIStatusBytePosition) & PCIStatusCapabilityList if support == 0 { return nil, fmt.Errorf("pci device does not support capability list") } - soffset := cs.Read8(pciCapabilityListPointer) + soffset := cs.Read8(PCICapabilityListPointer) if int(soffset) >= cs.Len() { return nil, fmt.Errorf("capability list pointer out of bounds") } @@ -101,7 +106,7 @@ func (cs *configSpaceIO) GetPCICapabilities() (*PCICapabilities, error) { if soffset == 0xff { return nil, fmt.Errorf("config space broken") } - if int(soffset) >= pciCfgSpaceStandardSize { + if int(soffset) >= PCICfgSpaceStandardSize { return nil, fmt.Errorf("standard capability list pointer out of bounds") } data := cs.Read32(int(soffset)) @@ -112,16 +117,16 @@ func (cs *configSpaceIO) GetPCICapabilities() (*PCICapabilities, error) { soffset = uint8((data >> 8) & 0xff) } - if cs.Len() <= pciCfgSpaceStandardSize { + if cs.Len() <= PCICfgSpaceStandardSize { return caps, nil } - eoffset := uint16(pciCfgSpaceStandardSize) + eoffset := uint16(PCICfgSpaceStandardSize) for eoffset != 0 { if eoffset == 0xffff { return nil, fmt.Errorf("config space broken") } - if int(eoffset) >= pciCfgSpaceExtendedSize { + if int(eoffset) >= PCICfgSpaceExtendedSize { return nil, fmt.Errorf("extended capability list pointer out of bounds") } data := cs.Read32(int(eoffset)) diff --git a/pkg/nvpci/mock.go b/pkg/nvpci/mock.go index 7448d0d..5c13ae1 100644 --- 
a/pkg/nvpci/mock.go +++ b/pkg/nvpci/mock.go @@ -45,7 +45,7 @@ func NewMockNvpci() (mock *MockNvpci, rerr error) { }() mock = &MockNvpci{ - &nvpci{rootDir}, + NewFrom(rootDir).(*nvpci), } return mock, nil @@ -68,7 +68,7 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error { if err != nil { return err } - _, err = vendor.WriteString(fmt.Sprintf("0x%x", pciNvidiaVendorID)) + _, err = vendor.WriteString(fmt.Sprintf("0x%x", PCINvidiaVendorID)) if err != nil { return err } @@ -77,7 +77,7 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error { if err != nil { return err } - _, err = class.WriteString(fmt.Sprintf("0x%x", pci3dControllerClass)) + _, err = class.WriteString(fmt.Sprintf("0x%x", PCI3dControllerClass)) if err != nil { return err } @@ -104,11 +104,11 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error { if err != nil { return err } - _data := make([]byte, pciCfgSpaceStandardSize) + _data := make([]byte, PCICfgSpaceStandardSize) data := bytes.New(&_data) - data.Write16(0, pciNvidiaVendorID) + data.Write16(0, PCINvidiaVendorID) data.Write16(2, uint16(0x20bf)) - data.Write8(pciStatusBytePosition, pciStatusCapabilityList) + data.Write8(PCIStatusBytePosition, PCIStatusCapabilityList) _, err = config.Write(*data.Raw()) if err != nil { return err diff --git a/pkg/nvpci/nvpci.go b/pkg/nvpci/nvpci.go index 1638369..ef8cd04 100644 --- a/pkg/nvpci/nvpci.go +++ b/pkg/nvpci/nvpci.go @@ -27,16 +27,16 @@ import ( ) const ( - // pciDevicesRoot represents base path for all pci devices under sysfs - pciDevicesRoot = "/sys/bus/pci/devices" - // pciNvidiaVendorID represents PCI vendor id for NVIDIA - pciNvidiaVendorID uint16 = 0x10de - // pciVgaControllerClass represents the PCI class for VGA Controllers - pciVgaControllerClass uint32 = 0x030000 - // pci3dControllerClass represents the PCI class for 3D Graphics accellerators - pci3dControllerClass uint32 = 0x030200 - // pciNvSwitchClass represents the PCI class for NVSwitches - 
pciNvSwitchClass uint32 = 0x068000 + // PCIDevicesRoot represents base path for all pci devices under sysfs + PCIDevicesRoot = "/sys/bus/pci/devices" + // PCINvidiaVendorID represents PCI vendor id for NVIDIA + PCINvidiaVendorID uint16 = 0x10de + // PCIVgaControllerClass represents the PCI class for VGA Controllers + PCIVgaControllerClass uint32 = 0x030000 + // PCI3dControllerClass represents the PCI class for 3D Graphics accelerators + PCI3dControllerClass uint32 = 0x030200 + // PCINvSwitchClass represents the PCI class for NVSwitches + PCINvSwitchClass uint32 = 0x068000 ) // Interface allows us to get a list of all NVIDIA PCI devices @@ -68,17 +68,17 @@ type NvidiaPCIDevice struct { // IsVGAController if class == 0x300 func (d *NvidiaPCIDevice) IsVGAController() bool { - return d.Class == pciVgaControllerClass + return d.Class == PCIVgaControllerClass } // Is3DController if class == 0x302 func (d *NvidiaPCIDevice) Is3DController() bool { - return d.Class == pci3dControllerClass + return d.Class == PCI3dControllerClass } // IsNVSwitch if classe == 0x068 func (d *NvidiaPCIDevice) IsNVSwitch() bool { - return d.Class == pciNvSwitchClass + return d.Class == PCINvSwitchClass } // IsGPU either VGA for older cards or 3D for newer @@ -104,7 +104,12 @@ func (d *NvidiaPCIDevice) Reset() error { // New interface that allows us to get a list of all NVIDIA PCI devices func New() Interface { - return &nvpci{pciDevicesRoot} + return &nvpci{PCIDevicesRoot} +} + +// NewFrom interface allows us to get a list of all NVIDIA PCI devices at a specific root directory +func NewFrom(root string) Interface { + return &nvpci{root} } // GetAllDevices returns all Nvidia PCI devices on the system @@ -117,93 +122,13 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) { var nvdevices []*NvidiaPCIDevice for _, deviceDir := range deviceDirs { devicePath := path.Join(p.pciDevicesRoot, deviceDir.Name()) - address := deviceDir.Name() - - vendor, err := 
ioutil.ReadFile(path.Join(devicePath, "vendor")) + nvdevice, err := NewDevice(devicePath) if err != nil { - return nil, fmt.Errorf("unable to read PCI device vendor id for %s: %v", address, err) + return nil, fmt.Errorf("error constructing NVIDIA PCI device %s: %v", deviceDir.Name(), err) } - vendorStr := strings.TrimSpace(string(vendor)) - vendorID, err := strconv.ParseUint(vendorStr, 0, 16) - if err != nil { - return nil, fmt.Errorf("unable to convert vendor string to uint16: %v", vendorStr) - } - - if uint16(vendorID) != pciNvidiaVendorID { + if nvdevice == nil { continue } - - class, err := ioutil.ReadFile(path.Join(devicePath, "class")) - if err != nil { - return nil, fmt.Errorf("unable to read PCI device class for %s: %v", address, err) - } - classStr := strings.TrimSpace(string(class)) - classID, err := strconv.ParseUint(classStr, 0, 32) - if err != nil { - return nil, fmt.Errorf("unable to convert class string to uint32: %v", classStr) - } - - device, err := ioutil.ReadFile(path.Join(devicePath, "device")) - if err != nil { - return nil, fmt.Errorf("unable to read PCI device id for %s: %v", address, err) - } - deviceStr := strings.TrimSpace(string(device)) - deviceID, err := strconv.ParseUint(deviceStr, 0, 16) - if err != nil { - return nil, fmt.Errorf("unable to convert device string to uint16: %v", deviceStr) - } - - numa, err := ioutil.ReadFile(path.Join(devicePath, "numa_node")) - if err != nil { - return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err) - } - numaStr := strings.TrimSpace(string(numa)) - numaNode, err := strconv.ParseInt(numaStr, 0, 64) - if err != nil { - return nil, fmt.Errorf("unable to convert NUMA node string to int64: %v", numaNode) - } - - config := &ConfigSpace{ - Path: path.Join(devicePath, "config"), - } - - resource, err := ioutil.ReadFile(path.Join(devicePath, "resource")) - if err != nil { - return nil, fmt.Errorf("unable to read PCI resource file for %s: %v", address, err) - } - - resources := 
make(map[int]*MemoryResource) - for i, line := range strings.Split(strings.TrimSpace(string(resource)), "\n") { - values := strings.Split(line, " ") - if len(values) != 3 { - return nil, fmt.Errorf("more than 3 entries in line '%d' of resource file", i) - } - - start, _ := strconv.ParseUint(values[0], 0, 64) - end, _ := strconv.ParseUint(values[1], 0, 64) - flags, _ := strconv.ParseUint(values[2], 0, 64) - - if (end - start) != 0 { - resources[i] = &MemoryResource{ - uintptr(start), - uintptr(end), - flags, - fmt.Sprintf("%s/resource%d", devicePath, i), - } - } - } - - nvdevice := &NvidiaPCIDevice{ - Path: devicePath, - Address: address, - Vendor: uint16(vendorID), - Class: uint32(classID), - Device: uint16(deviceID), - NumaNode: int(numaNode), - Config: config, - Resources: resources, - } - nvdevices = append(nvdevices, nvdevice) } @@ -221,6 +146,98 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) { return nvdevices, nil } +// NewDevice constructs an NvidiaPCIDevice +func NewDevice(devicePath string) (*NvidiaPCIDevice, error) { + address := path.Base(devicePath) + + vendor, err := ioutil.ReadFile(path.Join(devicePath, "vendor")) + if err != nil { + return nil, fmt.Errorf("unable to read PCI device vendor id for %s: %v", address, err) + } + vendorStr := strings.TrimSpace(string(vendor)) + vendorID, err := strconv.ParseUint(vendorStr, 0, 16) + if err != nil { + return nil, fmt.Errorf("unable to convert vendor string to uint16: %v", vendorStr) + } + + if uint16(vendorID) != PCINvidiaVendorID { + return nil, nil + } + + class, err := ioutil.ReadFile(path.Join(devicePath, "class")) + if err != nil { + return nil, fmt.Errorf("unable to read PCI device class for %s: %v", address, err) + } + classStr := strings.TrimSpace(string(class)) + classID, err := strconv.ParseUint(classStr, 0, 32) + if err != nil { + return nil, fmt.Errorf("unable to convert class string to uint32: %v", classStr) + } + + device, err := ioutil.ReadFile(path.Join(devicePath, "device")) 
+ if err != nil { + return nil, fmt.Errorf("unable to read PCI device id for %s: %v", address, err) + } + deviceStr := strings.TrimSpace(string(device)) + deviceID, err := strconv.ParseUint(deviceStr, 0, 16) + if err != nil { + return nil, fmt.Errorf("unable to convert device string to uint16: %v", deviceStr) + } + + numa, err := ioutil.ReadFile(path.Join(devicePath, "numa_node")) + if err != nil { + return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err) + } + numaStr := strings.TrimSpace(string(numa)) + numaNode, err := strconv.ParseInt(numaStr, 0, 64) + if err != nil { + return nil, fmt.Errorf("unable to convert NUMA node string to int64: %v", numaNode) + } + + config := &ConfigSpace{ + Path: path.Join(devicePath, "config"), + } + + resource, err := ioutil.ReadFile(path.Join(devicePath, "resource")) + if err != nil { + return nil, fmt.Errorf("unable to read PCI resource file for %s: %v", address, err) + } + + resources := make(map[int]*MemoryResource) + for i, line := range strings.Split(strings.TrimSpace(string(resource)), "\n") { + values := strings.Split(line, " ") + if len(values) != 3 { + return nil, fmt.Errorf("more than 3 entries in line '%d' of resource file", i) + } + + start, _ := strconv.ParseUint(values[0], 0, 64) + end, _ := strconv.ParseUint(values[1], 0, 64) + flags, _ := strconv.ParseUint(values[2], 0, 64) + + if (end - start) != 0 { + resources[i] = &MemoryResource{ + uintptr(start), + uintptr(end), + flags, + fmt.Sprintf("%s/resource%d", devicePath, i), + } + } + } + + nvdevice := &NvidiaPCIDevice{ + Path: devicePath, + Address: address, + Vendor: uint16(vendorID), + Class: uint32(classID), + Device: uint16(deviceID), + NumaNode: int(numaNode), + Config: config, + Resources: resources, + } + + return nvdevice, nil +} + // Get3DControllers returns all NVIDIA 3D Controller PCI devices on the system func (p *nvpci) Get3DControllers() ([]*NvidiaPCIDevice, error) { devices, err := p.GetAllDevices()