diff --git a/pkg/nvpci/config.go b/pkg/nvpci/config.go
new file mode 100644
index 0000000..415e13b
--- /dev/null
+++ b/pkg/nvpci/config.go
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package nvpci
+
+import (
+	"fmt"
+	"io/ioutil"
+
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/bytes"
+)
+
+const (
+	// pciCfgSpaceStandardSize is the size in bytes of the standard config space
+	pciCfgSpaceStandardSize = 256
+	// pciCfgSpaceExtendedSize is the size in bytes of the extended config space
+	pciCfgSpaceExtendedSize = 4096
+	// pciCapabilityListPointer is the offset of the standard capability list pointer
+	pciCapabilityListPointer = 0x34
+)
+
+// ConfigSpace identifies the file that holds the config space of a PCI device
+type ConfigSpace struct {
+	Path string
+}
+
+// ConfigSpaceIO gives byte-level access to the contents of a config space and
+// decodes the fields exposed by its Get* methods
+type ConfigSpaceIO interface {
+	bytes.Bytes
+	GetVendorID() uint16
+	GetDeviceID() uint16
+	GetPCICapabilities() (*PCICapabilities, error)
+}
+
+type configSpaceIO struct {
+	bytes.Bytes
+}
+
+// PCIStandardCapability wraps the bytes GetPCICapabilities slices out of the
+// config space for one standard capability
+type PCIStandardCapability struct {
+	bytes.Bytes
+}
+
+// PCIExtendedCapability wraps the bytes GetPCICapabilities slices out of the
+// config space for one extended capability, together with its version field
+type PCIExtendedCapability struct {
+	bytes.Bytes
+	Version uint8
+}
+
+// PCICapabilities holds the capabilities found in a config space, keyed by
+// capability ID
+type PCICapabilities struct {
+	Standard map[uint8]*PCIStandardCapability
+	Extended map[uint16]*PCIExtendedCapability
+}
+
+// Read loads the whole file at cs.Path into memory and returns an accessor
+// for the bytes that were read
+func (cs *ConfigSpace) Read() (ConfigSpaceIO, error) {
+	config, err := ioutil.ReadFile(cs.Path)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open file: %v", err)
+	}
+	return &configSpaceIO{bytes.New(&config)}, nil
+}
+
+// GetVendorID returns the 16-bit value stored at offset 0
+func (cs *configSpaceIO) GetVendorID() uint16 {
+	return cs.Read16(0)
+}
+
+// GetDeviceID returns the 16-bit value stored at offset 2
+func (cs *configSpaceIO) GetDeviceID() uint16 {
+	return cs.Read16(2)
+}
+
+// GetPCICapabilities walks the standard capability list and, when more than
+// pciCfgSpaceStandardSize bytes are available, the extended capability list,
+// and returns the capabilities it finds keyed by ID. An error is returned when
+// a list pointer leads outside the available bytes, when a list loops back on
+// itself, or when an all-ones pointer or header is read.
+func (cs *configSpaceIO) GetPCICapabilities() (*PCICapabilities, error) {
+	caps := &PCICapabilities{
+		make(map[uint8]*PCIStandardCapability),
+		make(map[uint16]*PCIExtendedCapability),
+	}
+
+	// The list pointer itself has to be readable before it can be followed
+	if cs.Len() <= pciCapabilityListPointer {
+		return nil, fmt.Errorf("capability list pointer out of bounds")
+	}
+	soffset := cs.Read8(pciCapabilityListPointer)
+	if int(soffset) >= cs.Len() {
+		return nil, fmt.Errorf("capability list pointer out of bounds")
+	}
+
+	// visited records every header offset already decoded so that a list
+	// which points back into itself is reported instead of looping forever
+	visited := make(map[int]bool)
+
+	for soffset != 0 {
+		if soffset == 0xff {
+			return nil, fmt.Errorf("config space broken")
+		}
+		offset := int(soffset)
+		// A 32-bit header is read below, so all four bytes must be available
+		if offset+4 > cs.Len() {
+			return nil, fmt.Errorf("standard capability list pointer out of bounds")
+		}
+		if visited[offset] {
+			return nil, fmt.Errorf("standard capability list contains a loop")
+		}
+		visited[offset] = true
+
+		// Header layout: bits 7:0 capability ID, bits 15:8 next pointer
+		data := cs.Read32(offset)
+		id := uint8(data & 0xff)
+		caps.Standard[id] = &PCIStandardCapability{
+			cs.Slice(offset, cs.Len()-offset),
+		}
+		soffset = uint8((data >> 8) & 0xff)
+	}
+
+	if cs.Len() <= pciCfgSpaceStandardSize {
+		return caps, nil
+	}
+
+	eoffset := pciCfgSpaceStandardSize
+	for eoffset != 0 {
+		if eoffset < pciCfgSpaceStandardSize || eoffset >= pciCfgSpaceExtendedSize || eoffset+4 > cs.Len() {
+			return nil, fmt.Errorf("extended capability list pointer out of bounds")
+		}
+		if visited[eoffset] {
+			return nil, fmt.Errorf("extended capability list contains a loop")
+		}
+		visited[eoffset] = true
+
+		data := cs.Read32(eoffset)
+		if data == 0xffffffff {
+			return nil, fmt.Errorf("config space broken")
+		}
+		if data == 0 {
+			// An all-zero header marks the absence of (further) extended
+			// capabilities and is not a capability in its own right
+			break
+		}
+
+		// Header layout: bits 15:0 capability ID, bits 19:16 version, bits
+		// 31:20 offset of the next capability (0 terminates the list)
+		id := uint16(data & 0xffff)
+		version := uint8((data >> 16) & 0xf)
+		caps.Extended[id] = &PCIExtendedCapability{
+			cs.Slice(eoffset, cs.Len()-eoffset),
+			version,
+		}
+		eoffset = int((data >> 20) & 0xffc)
+	}
+
+	return caps, nil
+}
diff --git a/pkg/nvpci/mock.go b/pkg/nvpci/mock.go
new file mode 100644
index 0000000..a1acb9d
--- /dev/null
+++ b/pkg/nvpci/mock.go
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package nvpci
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/bytes"
+)
+
+// MockA100 is an nvpci whose device root is a temporary directory populated
+// with the files of a single mock A100 GPU
+type MockA100 struct {
+	*nvpci
+}
+
+// Cleanup removes the temporary directory that backs the mock
+func (m *MockA100) Cleanup() {
+	os.RemoveAll(m.pciDevicesRoot)
+}
+
+var _ Interface = (*MockA100)(nil)
+
+// NewMockA100 creates a temporary directory holding the vendor, class,
+// device, config, resource and resource0 files of one mock device at address
+// 0000:80:05.1 and returns an nvpci rooted at that directory. Callers should
+// invoke Cleanup once they are done with the mock. When an error is returned
+// the temporary directory has already been removed.
+func NewMockA100() (mock *MockA100, rerr error) {
+	rootDir, err := ioutil.TempDir("", "")
+	if err != nil {
+		return nil, err
+	}
+	// Do not leave a partially populated tree behind if setup fails
+	defer func() {
+		if rerr != nil {
+			os.RemoveAll(rootDir)
+		}
+	}()
+
+	// The device is represented by a directory named after its PCI address
+	deviceDir := filepath.Join(rootDir, "0000:80:05.1")
+	err = os.MkdirAll(deviceDir, 0755)
+	if err != nil {
+		return nil, err
+	}
+
+	// writeFile creates name inside deviceDir, fills it with contents and
+	// closes it again, so no file descriptor outlives the call
+	writeFile := func(name string, contents []byte) error {
+		return ioutil.WriteFile(filepath.Join(deviceDir, name), contents, 0644)
+	}
+
+	err = writeFile("vendor", []byte(fmt.Sprintf("0x%x", pciNvidiaVendorID)))
+	if err != nil {
+		return nil, err
+	}
+
+	err = writeFile("class", []byte(fmt.Sprintf("0x%x", pci3dControllerClass)))
+	if err != nil {
+		return nil, err
+	}
+
+	err = writeFile("device", []byte("0x20bf"))
+	if err != nil {
+		return nil, err
+	}
+
+	// Config space: pciCfgSpaceStandardSize bytes, all zero apart from the
+	// vendor and device IDs
+	_data := make([]byte, pciCfgSpaceStandardSize)
+	data := bytes.New(&_data)
+	data.Write16(0, pciNvidiaVendorID)
+	data.Write16(2, uint16(0x20bf))
+	err = writeFile("config", *data.Raw())
+	if err != nil {
+		return nil, err
+	}
+
+	// A single "start end flags" line describing BAR0
+	bar0 := []uint64{0x00000000c2000000, 0x00000000c2ffffff, 0x0000000000040200}
+	err = writeFile("resource", []byte(fmt.Sprintf("0x%x 0x%x 0x%x", bar0[0], bar0[1], bar0[2])))
+	if err != nil {
+		return nil, err
+	}
+
+	// BAR0 contents: as large as the range above, with the PMC ID in the
+	// first 32-bit register
+	pmcID := uint32(0x170000a1)
+	_data = make([]byte, bar0[1]-bar0[0]+1)
+	data = bytes.New(&_data).LittleEndian()
+	data.Write32(0, pmcID)
+	err = writeFile("resource0", *data.Raw())
+	if err != nil {
+		return nil, err
+	}
+
+	mock = &MockA100{
+		&nvpci{rootDir},
+	}
+
+	return mock, nil
+}
diff --git a/pkg/nvpci/nvpci.go b/pkg/nvpci/nvpci.go
new file mode 100644
index 0000000..0bd5b21
--- /dev/null
+++ b/pkg/nvpci/nvpci.go
@@ -0,0 +1,273 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package nvpci
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path"
+	"sort"
+	"strconv"
+	"strings"
+)
+
+const (
+	// pciDevicesRoot represents base path for all pci devices under sysfs
+	pciDevicesRoot = "/sys/bus/pci/devices"
+	// pciNvidiaVendorID represents PCI vendor id for NVIDIA
+	pciNvidiaVendorID uint16 = 0x10de
+	// pciVgaControllerClass represents the PCI class for VGA Controllers
+	pciVgaControllerClass uint32 = 0x030000
+	// pci3dControllerClass represents the PCI class for 3D Graphics accelerators
+	pci3dControllerClass uint32 = 0x030200
+	// pciNvSwitchClass represents the PCI class for NVSwitches
+	pciNvSwitchClass uint32 = 0x068000
+)
+
+// Interface allows us to get a list of all NVIDIA PCI devices
+type Interface interface {
+	GetAllDevices() ([]*NvidiaPCIDevice, error)
+	Get3DControllers() ([]*NvidiaPCIDevice, error)
+	GetVGAControllers() ([]*NvidiaPCIDevice, error)
+	GetNVSwitches() ([]*NvidiaPCIDevice, error)
+	GetGPUs() ([]*NvidiaPCIDevice, error)
+}
+
+// nvpci implements Interface on top of the directory named by pciDevicesRoot
+type nvpci struct {
+	pciDevicesRoot string
+}
+
+var _ Interface = (*nvpci)(nil)
+
+// NvidiaPCIDevice represents a PCI device for an NVIDIA product
+type NvidiaPCIDevice struct {
+	// Path is the directory of the device below the PCI devices root
+	Path string
+	// Address is the name of that directory
+	Address string
+	// Vendor, Class and Device are parsed from the files of the same name
+	Vendor uint16
+	Class  uint32
+	Device uint16
+	// Config refers to the "config" file of the device
+	Config *ConfigSpace
+	// Resources holds the entries of the "resource" file whose start and end
+	// differ, keyed by line index
+	Resources map[int]*MemoryResource
+}
+
+// IsVGAController reports whether the device class is the VGA controller class
+func (d *NvidiaPCIDevice) IsVGAController() bool {
+	return d.Class == pciVgaControllerClass
+}
+
+// Is3DController reports whether the device class is the 3D controller class
+func (d *NvidiaPCIDevice) Is3DController() bool {
+	return d.Class == pci3dControllerClass
+}
+
+// IsNVSwitch reports whether the device class is the NVSwitch class
+func (d *NvidiaPCIDevice) IsNVSwitch() bool {
+	return d.Class == pciNvSwitchClass
+}
+
+// IsGPU reports whether the device is a VGA controller or a 3D controller
+func (d *NvidiaPCIDevice) IsGPU() bool {
+	return d.IsVGAController() || d.Is3DController()
+}
+
+// IsResetAvailable reports whether a "reset" file can be stat'ed in the
+// device directory
+func (d *NvidiaPCIDevice) IsResetAvailable() bool {
+	_, err := os.Stat(path.Join(d.Path, "reset"))
+	return err == nil
+}
+
+// Reset writes "1" to the "reset" file in the device directory
+func (d *NvidiaPCIDevice) Reset() error {
+	err := ioutil.WriteFile(path.Join(d.Path, "reset"), []byte("1"), 0)
+	if err != nil {
+		return fmt.Errorf("unable to write to reset file: %v", err)
+	}
+	return nil
+}
+
+// New returns an Interface that enumerates devices under pciDevicesRoot
+func New() Interface {
+	return &nvpci{pciDevicesRoot}
+}
+
+// GetAllDevices returns all Nvidia PCI devices on the system, sorted by PCI
+// address. Devices from other vendors are skipped; a failure to read or parse
+// any of the inspected files aborts the enumeration with an error.
+func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
+	deviceDirs, err := ioutil.ReadDir(p.pciDevicesRoot)
+	if err != nil {
+		return nil, fmt.Errorf("unable to read PCI bus devices: %v", err)
+	}
+
+	var nvdevices []*NvidiaPCIDevice
+	for _, deviceDir := range deviceDirs {
+		devicePath := path.Join(p.pciDevicesRoot, deviceDir.Name())
+		address := deviceDir.Name()
+
+		vendor, err := ioutil.ReadFile(path.Join(devicePath, "vendor"))
+		if err != nil {
+			return nil, fmt.Errorf("unable to read PCI device vendor id for %s: %v", address, err)
+		}
+		vendorStr := strings.TrimSpace(string(vendor))
+		vendorID, err := strconv.ParseUint(vendorStr, 0, 16)
+		if err != nil {
+			return nil, fmt.Errorf("unable to convert vendor string to uint16: %v", vendorStr)
+		}
+
+		// Only NVIDIA devices are of interest; any other device is skipped
+		// before further files are read
+		if uint16(vendorID) != pciNvidiaVendorID {
+			continue
+		}
+
+		class, err := ioutil.ReadFile(path.Join(devicePath, "class"))
+		if err != nil {
+			return nil, fmt.Errorf("unable to read PCI device class for %s: %v", address, err)
+		}
+		classStr := strings.TrimSpace(string(class))
+		classID, err := strconv.ParseUint(classStr, 0, 32)
+		if err != nil {
+			return nil, fmt.Errorf("unable to convert class string to uint32: %v", classStr)
+		}
+
+		device, err := ioutil.ReadFile(path.Join(devicePath, "device"))
+		if err != nil {
+			return nil, fmt.Errorf("unable to read PCI device id for %s: %v", address, err)
+		}
+		deviceStr := strings.TrimSpace(string(device))
+		deviceID, err := strconv.ParseUint(deviceStr, 0, 16)
+		if err != nil {
+			return nil, fmt.Errorf("unable to convert device string to uint16: %v", deviceStr)
+		}
+
+		// The config space is not read here; only its location is recorded
+		config := &ConfigSpace{
+			Path: path.Join(devicePath, "config"),
+		}
+
+		resource, err := ioutil.ReadFile(path.Join(devicePath, "resource"))
+		if err != nil {
+			return nil, fmt.Errorf("unable to read PCI resource file for %s: %v", address, err)
+		}
+
+		// Every line of the resource file carries three values: start, end, flags
+		resources := make(map[int]*MemoryResource)
+		for i, line := range strings.Split(strings.TrimSpace(string(resource)), "\n") {
+			values := strings.Fields(line)
+			if len(values) != 3 {
+				return nil, fmt.Errorf("expected 3 entries in line '%d' of resource file for %s, found %d", i, address, len(values))
+			}
+
+			start, err := strconv.ParseUint(values[0], 0, 64)
+			if err != nil {
+				return nil, fmt.Errorf("unable to convert start of line '%d' of resource file for %s: %v", i, address, err)
+			}
+			end, err := strconv.ParseUint(values[1], 0, 64)
+			if err != nil {
+				return nil, fmt.Errorf("unable to convert end of line '%d' of resource file for %s: %v", i, address, err)
+			}
+			flags, err := strconv.ParseUint(values[2], 0, 64)
+			if err != nil {
+				return nil, fmt.Errorf("unable to convert flags of line '%d' of resource file for %s: %v", i, address, err)
+			}
+
+			// Lines whose start and end are equal are not recorded
+			if (end - start) != 0 {
+				resources[i] = &MemoryResource{
+					uintptr(start),
+					uintptr(end),
+					flags,
+					fmt.Sprintf("%s/resource%d", devicePath, i),
+				}
+			}
+		}
+
+		nvdevice := &NvidiaPCIDevice{
+			Path:      devicePath,
+			Address:   address,
+			Vendor:    uint16(vendorID),
+			Class:     uint32(classID),
+			Device:    uint16(deviceID),
+			Config:    config,
+			Resources: resources,
+		}
+
+		nvdevices = append(nvdevices, nvdevice)
+	}
+
+	// Order devices by the hexadecimal value of their address with the ':' and
+	// '.' separators removed; parse errors are deliberately ignored here
+	addressToID := func(address string) uint64 {
+		address = strings.ReplaceAll(address, ":", "")
+		address = strings.ReplaceAll(address, ".", "")
+		id, _ := strconv.ParseUint(address, 16, 64)
+		return id
+	}
+
+	sort.Slice(nvdevices, func(i, j int) bool {
+		return addressToID(nvdevices[i].Address) < addressToID(nvdevices[j].Address)
+	})
+
+	return nvdevices, nil
+}
+
+// filterDevices returns the devices reported by GetAllDevices for which keep
+// returns true, in the order GetAllDevices produced them
+func (p *nvpci) filterDevices(keep func(*NvidiaPCIDevice) bool) ([]*NvidiaPCIDevice, error) {
+	devices, err := p.GetAllDevices()
+	if err != nil {
+		return nil, fmt.Errorf("error getting all NVIDIA devices: %v", err)
+	}
+
+	var filtered []*NvidiaPCIDevice
+	for _, d := range devices {
+		if keep(d) {
+			filtered = append(filtered, d)
+		}
+	}
+
+	return filtered, nil
+}
+
+// Get3DControllers returns all NVIDIA 3D Controller PCI devices on the system
+func (p *nvpci) Get3DControllers() ([]*NvidiaPCIDevice, error) {
+	return p.filterDevices((*NvidiaPCIDevice).Is3DController)
+}
+
+// GetVGAControllers returns all NVIDIA VGA Controller PCI devices on the system
+func (p *nvpci) GetVGAControllers() ([]*NvidiaPCIDevice, error) {
+	return p.filterDevices((*NvidiaPCIDevice).IsVGAController)
+}
+
+// GetNVSwitches returns all NVIDIA NVSwitch PCI devices on the system
+func (p *nvpci) GetNVSwitches() ([]*NvidiaPCIDevice, error) {
+	return p.filterDevices((*NvidiaPCIDevice).IsNVSwitch)
+}
+
+// GetGPUs returns all NVIDIA GPU devices on the system
+func (p *nvpci) GetGPUs() ([]*NvidiaPCIDevice, error) {
+	return p.filterDevices((*NvidiaPCIDevice).IsGPU)
+}
diff --git a/pkg/nvpci/nvpci_test.go b/pkg/nvpci/nvpci_test.go
new file mode 100644
index 0000000..ab3c544
--- /dev/null
+++ b/pkg/nvpci/nvpci_test.go
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package nvpci
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// ga100PmcID is the value expected in the first 32-bit register of BAR0
+const ga100PmcID = uint32(0x170000a1)
+
+// TestNvpci walks the mock A100 end to end: enumeration, config space and BAR0
+func TestNvpci(t *testing.T) {
+	mock, err := NewMockA100()
+	require.Nil(t, err, "Error creating NewMockA100")
+	defer mock.Cleanup()
+
+	gpus, err := mock.GetGPUs()
+	require.Nil(t, err, "Error getting GPUs")
+	require.Equal(t, 1, len(gpus), "Wrong number of GPU devices")
+	gpu := gpus[0]
+	require.Equal(t, 1, len(gpu.Resources), "Wrong number GPU resources found")
+
+	cfg, err := gpu.Config.Read()
+	require.Nil(t, err, "Error reading config")
+	require.Equal(t, gpu.Vendor, cfg.GetVendorID(), "Vendor IDs do not match")
+	require.Equal(t, gpu.Device, cfg.GetDeviceID(), "Device IDs do not match")
+
+	caps, err := cfg.GetPCICapabilities()
+	require.Nil(t, err, "Error getting PCI capabilities")
+	require.Equal(t, 0, len(caps.Standard), "Wrong number of standard PCI capabilities")
+	require.Equal(t, 0, len(caps.Extended), "Wrong number of extended PCI capabilities")
+
+	region := gpu.Resources[0]
+	bar0, err := region.Open()
+	require.Nil(t, err, "Error opening bar0")
+	defer func() {
+		if cerr := bar0.Close(); cerr != nil {
+			t.Errorf("Error closing bar0: %v", cerr)
+		}
+	}()
+	require.Equal(t, int(region.End-region.Start+1), bar0.Len())
+	require.Equal(t, ga100PmcID, bar0.Read32(0))
+}
diff --git a/pkg/nvpci/resources.go b/pkg/nvpci/resources.go
new file mode 100644
index 0000000..e4d6b57
--- /dev/null
+++ b/pkg/nvpci/resources.go
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package nvpci
+
+import (
+	"fmt"
+
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/mmio"
+)
+
+const (
+	pmcEndianRegister = 0x4
+	pmcLittleEndian   = 0x0
+	pmcBigEndian      = 0x01000001
+)
+
+// MemoryResource describes one memory region: its range, flags and backing file
+type MemoryResource struct {
+	Start uintptr
+	End   uintptr
+	Flags uint64
+	Path  string
+}
+
+// Open maps the resource read-write; see withDetectedEndianness for byte order
+func (mr *MemoryResource) Open() (mmio.Mmio, error) {
+	return withDetectedEndianness(mmio.OpenRW(mr.Path, 0, int(mr.End-mr.Start+1)))
+}
+
+// OpenReadOnly maps the resource read-only; byte order is chosen as in Open
+func (mr *MemoryResource) OpenReadOnly() (mmio.Mmio, error) {
+	return withDetectedEndianness(mmio.OpenRO(mr.Path, 0, int(mr.End-mr.Start+1)))
+}
+
+// withDetectedEndianness returns the big- or little-endian view selected by the
+// value at pmcEndianRegister; for any other value m is closed and rejected
+func withDetectedEndianness(m mmio.Mmio, err error) (mmio.Mmio, error) {
+	if err != nil {
+		return nil, fmt.Errorf("failed to open file for mmio: %v", err)
+	}
+	value := m.Read32(pmcEndianRegister)
+	switch value {
+	case pmcBigEndian:
+		return m.BigEndian(), nil
+	case pmcLittleEndian:
+		return m.LittleEndian(), nil
+	}
+	_ = m.Close()
+	return nil, fmt.Errorf("unknown endianness for mmio: 0x%x", value)
+}