mirror of
https://github.com/clearml/go-nvlib
synced 2025-04-22 07:04:34 +00:00
Add nvmdev package for mdev (vGPU) devices
This commit is contained in:
parent
cc0dadbb96
commit
505f83b943
200
pkg/nvmdev/mock.go
Normal file
200
pkg/nvmdev/mock.go
Normal file
@ -0,0 +1,200 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package nvmdev
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci"
|
||||||
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci/bytes"
|
||||||
|
"io/ioutil"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
)
|
||||||
|
|
||||||
|
// MockNvmdev is a mock of the mdev Interface. It embeds *nvmdev, so the
// device-listing methods are inherited as-is and operate on whichever parent
// and device root directories the embedded value holds.
type MockNvmdev struct {
	*nvmdev
}

// Compile-time check that *MockNvmdev satisfies Interface.
var _ Interface = (*MockNvmdev)(nil)
|
||||||
|
|
||||||
|
// NewMock creates new mock mediated (vGPU) and parent PCI devices and removes old devices
|
||||||
|
func NewMock() (mock *MockNvmdev, rerr error) {
|
||||||
|
mdevParentsRootDir, err := ioutil.TempDir("", "")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if rerr != nil {
|
||||||
|
os.RemoveAll(mdevParentsRootDir)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
mdevDevicesRootDir, err := ioutil.TempDir("", "")
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
if rerr != nil {
|
||||||
|
os.RemoveAll(mdevDevicesRootDir)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
mock = &MockNvmdev{
|
||||||
|
&nvmdev{mdevParentsRootDir, mdevDevicesRootDir},
|
||||||
|
}
|
||||||
|
|
||||||
|
return mock, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup removes the mocked mediated (vGPU) and parent PCI devices root folders
|
||||||
|
func (m *MockNvmdev) Cleanup() {
|
||||||
|
os.RemoveAll(m.mdevParentsRoot)
|
||||||
|
os.RemoveAll(m.mdevDevicesRoot)
|
||||||
|
}
|
||||||
|
|
||||||
|
// AddMockA100Parent creates an A100 like parent GPU mock device
|
||||||
|
func (m *MockNvmdev) AddMockA100Parent(address string, numaNode int) error {
|
||||||
|
deviceDir := filepath.Join(m.mdevParentsRoot, address)
|
||||||
|
err := os.MkdirAll(deviceDir, 0755)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
vendor, err := os.Create(filepath.Join(deviceDir, "vendor"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_, err = vendor.WriteString(fmt.Sprintf("0x%x", nvpci.PCINvidiaVendorID))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
class, err := os.Create(filepath.Join(deviceDir, "class"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_, err = class.WriteString(fmt.Sprintf("0x%x", nvpci.PCI3dControllerClass))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
device, err := os.Create(filepath.Join(deviceDir, "device"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_, err = device.WriteString("0x20bf")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_, err = numa.WriteString(fmt.Sprintf("%v", numaNode))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
config, err := os.Create(filepath.Join(deviceDir, "config"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_data := make([]byte, nvpci.PCICfgSpaceStandardSize)
|
||||||
|
data := bytes.New(&_data)
|
||||||
|
data.Write16(0, nvpci.PCINvidiaVendorID)
|
||||||
|
data.Write16(2, uint16(0x20bf))
|
||||||
|
data.Write8(nvpci.PCIStatusBytePosition, nvpci.PCIStatusCapabilityList)
|
||||||
|
_, err = config.Write(*data.Raw())
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
bar0 := []uint64{0x00000000c2000000, 0x00000000c2ffffff, 0x0000000000040200}
|
||||||
|
resource, err := os.Create(filepath.Join(deviceDir, "resource"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_, err = resource.WriteString(fmt.Sprintf("0x%x 0x%x 0x%x", bar0[0], bar0[1], bar0[2]))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
pmcID := uint32(0x170000a1)
|
||||||
|
resource0, err := os.Create(filepath.Join(deviceDir, "resource0"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_data = make([]byte, bar0[1]-bar0[0]+1)
|
||||||
|
data = bytes.New(&_data).LittleEndian()
|
||||||
|
data.Write32(0, pmcID)
|
||||||
|
_, err = resource0.Write(*data.Raw())
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
mdevSupportedTypes := []string{"A100-4C", "A100-5C", "A100-8C", "A100-10C",
|
||||||
|
"A100-20C", "A100-40C", "A100-1-5CME", "A100-1-5C", "A100-2-10C", "A100-3-20C",
|
||||||
|
"A100-4-20C", "A100-7-40C"}
|
||||||
|
mdevSupportedTypesDir := filepath.Join(deviceDir, "mdev_supported_types")
|
||||||
|
err = os.MkdirAll(mdevSupportedTypesDir, 0755)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for i, mdevTypeName := range mdevSupportedTypes {
|
||||||
|
mdevTypeDir := filepath.Join(mdevSupportedTypesDir, fmt.Sprintf("nvidia-%d", 500+i))
|
||||||
|
err := os.MkdirAll(mdevTypeDir, 0755)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
name, err := os.Create(filepath.Join(mdevTypeDir, "name"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_, err = name.WriteString(fmt.Sprintf("NVIDIA %s", mdevTypeName))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
availableInstances, err := os.Create(filepath.Join(mdevTypeDir, "available_instances"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
_, err = availableInstances.WriteString("1")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// AddMockA100Mdev creates an A100 like MDEV (vGPU) mock device.
|
||||||
|
// The corresponding mocked parent A100 device must be created beforehand.
|
||||||
|
func (m *MockNvmdev) AddMockA100Mdev(uuid string, mdevType string, parentMdevTypeDir string) error {
|
||||||
|
deviceDir := filepath.Join(m.mdevDevicesRoot, uuid)
|
||||||
|
err := os.MkdirAll(deviceDir, 0755)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
err = os.Symlink(parentMdevTypeDir, filepath.Join(deviceDir, "mdev_type"))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
306
pkg/nvmdev/nvmdev.go
Normal file
306
pkg/nvmdev/nvmdev.go
Normal file
@ -0,0 +1,306 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package nvmdev
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvpci"
|
||||||
|
"io/ioutil"
|
||||||
|
"os"
|
||||||
|
"path"
|
||||||
|
"path/filepath"
|
||||||
|
"sort"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
	// mdevParentsRoot is the sysfs directory New uses as the root from which
	// parent devices are enumerated.
	mdevParentsRoot = "/sys/class/mdev_bus"
	// mdevDevicesRoot is the sysfs directory New uses as the root from which
	// mediated (vGPU) devices are enumerated.
	mdevDevicesRoot = "/sys/bus/mdev/devices"
)
|
||||||
|
|
||||||
|
// Interface allows us to get a list of NVIDIA MDEV (vGPU) and parent devices
type Interface interface {
	// GetAllDevices returns the NVIDIA mediated (vGPU) devices.
	GetAllDevices() ([]*Device, error)
	// GetAllParentDevices returns the NVIDIA parent PCI devices.
	GetAllParentDevices() ([]*ParentDevice, error)
}
|
||||||
|
|
||||||
|
// nvmdev implements Interface by reading two directory trees.
type nvmdev struct {
	// mdevParentsRoot is the directory whose entries are inspected as parent
	// devices.
	mdevParentsRoot string
	// mdevDevicesRoot is the directory whose entries are inspected as mediated
	// devices.
	mdevDevicesRoot string
}

// Compile-time check that *nvmdev satisfies Interface.
var _ Interface = (*nvmdev)(nil)
|
||||||
|
|
||||||
|
// ParentDevice represents an NVIDIA parent PCI device
type ParentDevice struct {
	*nvpci.NvidiaPCIDevice
	// mdevPaths maps a vGPU type name (the part of the type's "name" file after
	// the first space) to that type's directory under mdev_supported_types.
	mdevPaths map[string]string
}
|
||||||
|
|
||||||
|
// Device represents an NVIDIA MDEV (vGPU) device
type Device struct {
	// Path is the device directory: the devices root joined with the UUID.
	Path string
	// UUID is the name of the device directory entry.
	UUID string
	// MDEVType is the vGPU type name read from the mdev type's "name" file,
	// without the leading vendor word.
	MDEVType string
	// Parent is the parent PCI device this mediated device belongs to.
	Parent *ParentDevice
}
|
||||||
|
|
||||||
|
// New interface that allows us to get a list of all NVIDIA parent and MDEV (vGPU) devices
|
||||||
|
func New() Interface {
|
||||||
|
return &nvmdev{mdevParentsRoot, mdevDevicesRoot}
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAllParentDevices returns all NVIDIA Parent PCI devices on the system
|
||||||
|
func (m *nvmdev) GetAllParentDevices() ([]*ParentDevice, error) {
|
||||||
|
deviceDirs, err := ioutil.ReadDir(m.mdevParentsRoot)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to read PCI bus devices: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var nvdevices []*ParentDevice
|
||||||
|
for _, deviceDir := range deviceDirs {
|
||||||
|
devicePath := path.Join(m.mdevParentsRoot, deviceDir.Name())
|
||||||
|
nvdevice, err := NewParentDevice(devicePath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error constructing NVIDIA parent device: %v", err)
|
||||||
|
}
|
||||||
|
if nvdevice == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
nvdevices = append(nvdevices, nvdevice)
|
||||||
|
}
|
||||||
|
|
||||||
|
addressToID := func(address string) uint64 {
|
||||||
|
address = strings.ReplaceAll(address, ":", "")
|
||||||
|
address = strings.ReplaceAll(address, ".", "")
|
||||||
|
id, _ := strconv.ParseUint(address, 16, 64)
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Slice(nvdevices, func(i, j int) bool {
|
||||||
|
return addressToID(nvdevices[i].Address) < addressToID(nvdevices[j].Address)
|
||||||
|
})
|
||||||
|
|
||||||
|
return nvdevices, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAllDevices returns all NVIDIA mdev (vGPU) devices on the system
|
||||||
|
func (m *nvmdev) GetAllDevices() ([]*Device, error) {
|
||||||
|
deviceDirs, err := ioutil.ReadDir(m.mdevDevicesRoot)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to read MDEV devices directory: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var nvdevices []*Device
|
||||||
|
for _, deviceDir := range deviceDirs {
|
||||||
|
nvdevice, err := NewDevice(m.mdevDevicesRoot, deviceDir.Name())
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error constructing MDEV device: %v", err)
|
||||||
|
}
|
||||||
|
if nvdevice == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
nvdevices = append(nvdevices, nvdevice)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nvdevices, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDevice constructs a Device, which represents an NVIDIA mdev (vGPU) device
|
||||||
|
func NewDevice(root string, uuid string) (*Device, error) {
|
||||||
|
path := path.Join(root, uuid)
|
||||||
|
|
||||||
|
m, err := newMdev(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
parent, err := NewParentDevice(m.parentDevicePath())
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error constructing NVIDIA PCI device: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if parent == nil {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
mdevType, err := m.Type()
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error getting mdev type: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
device := Device{
|
||||||
|
Path: path,
|
||||||
|
UUID: uuid,
|
||||||
|
MDEVType: mdevType,
|
||||||
|
Parent: parent,
|
||||||
|
}
|
||||||
|
|
||||||
|
return &device, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// mdev is the symlink-resolved path of the mdev type directory that a mediated
// device's "mdev_type" link points to.
type mdev string

// newMdev resolves the "mdev_type" link inside devicePath and returns the
// resulting directory as an mdev. It fails if the link cannot be resolved.
func newMdev(devicePath string) (mdev, error) {
	typeLink := path.Join(devicePath, "mdev_type")
	resolved, err := filepath.EvalSymlinks(typeLink)
	if err != nil {
		return "", fmt.Errorf("error resolving mdev_type link: %v", err)
	}

	return mdev(resolved), nil
}

// String returns the mdev type directory path.
func (m mdev) String() string {
	return string(m)
}

// parentDevicePath returns the parent device directory, i.e. the directory two
// levels above the mdev type directory.
func (m mdev) parentDevicePath() string {
	// layout: /sys/bus/pci/devices/<addr>/mdev_supported_types/<mdev_type>
	supportedTypesDir := path.Dir(string(m))
	return path.Dir(supportedTypesDir)
}

// Type reads the "name" file in the mdev type directory and returns the vGPU
// type, which is everything after the first space of the trimmed contents.
// It fails if the file cannot be read or contains no space.
func (m mdev) Type() (string, error) {
	raw, err := ioutil.ReadFile(path.Join(string(m), "name"))
	if err != nil {
		return "", fmt.Errorf("unable to read mdev_type name for mdev %s: %v", m, err)
	}

	// file in the format: [NVIDIA|GRID] <vGPU type>
	fullName := strings.TrimSpace(string(raw))
	sep := strings.Index(fullName, " ")
	if sep < 0 {
		return "", fmt.Errorf("unable to parse mdev_type name %s for mdev %s", fullName, m)
	}

	return fullName[sep+1:], nil
}
|
||||||
|
|
||||||
|
// NewParentDevice constructs a ParentDevice
|
||||||
|
func NewParentDevice(devicePath string) (*ParentDevice, error) {
|
||||||
|
nvdevice, err := nvpci.NewDevice(devicePath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to construct NVIDIA PCI device: %v", err)
|
||||||
|
}
|
||||||
|
if nvdevice == nil {
|
||||||
|
// not a NVIDIA device
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
paths, err := filepath.Glob(fmt.Sprintf("%s/mdev_supported_types/nvidia-*/name", nvdevice.Path))
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to get files in mdev_supported_types directory: %v", err)
|
||||||
|
}
|
||||||
|
mdevTypesMap := make(map[string]string)
|
||||||
|
for _, path := range paths {
|
||||||
|
name, err := ioutil.ReadFile(path)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("unable to read file %s: %v", path, err)
|
||||||
|
}
|
||||||
|
// file in the format: [NVIDIA|GRID] <vGPU type>
|
||||||
|
nameStr := strings.TrimSpace(string(name))
|
||||||
|
nameSplit := strings.SplitN(nameStr, " ", 2)
|
||||||
|
if len(nameSplit) != 2 {
|
||||||
|
return nil, fmt.Errorf("unable to parse mdev_type name %s at path %s", nameStr, path)
|
||||||
|
}
|
||||||
|
nameStr = nameSplit[len(nameSplit)-1]
|
||||||
|
|
||||||
|
mdevTypesMap[nameStr] = filepath.Dir(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &ParentDevice{nvdevice, mdevTypesMap}, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// CreateMDEVDevice creates a mediated device (vGPU) on the parent GPU
|
||||||
|
func (p *ParentDevice) CreateMDEVDevice(mdevType string, id string) error {
|
||||||
|
mdevPath, ok := p.mdevPaths[mdevType]
|
||||||
|
if !ok {
|
||||||
|
return fmt.Errorf("unable to create mdev %s: mdev not supported by parent device %s", mdevType, p.Address)
|
||||||
|
}
|
||||||
|
f, err := os.OpenFile(filepath.Join(mdevPath, "create"), os.O_WRONLY|os.O_SYNC, 0200)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("unable to open create file: %v", err)
|
||||||
|
}
|
||||||
|
_, err = f.WriteString(id)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("unable to create mdev: %v", err)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// DeleteMDEVDevice deletes a mediated device (vGPU)
|
||||||
|
func (p *ParentDevice) DeleteMDEVDevice(id string) error {
|
||||||
|
removeFile, err := os.OpenFile(filepath.Join(p.Path, id, "remove"), os.O_WRONLY|os.O_SYNC, 0200)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("unable to open remove file: %v", err)
|
||||||
|
}
|
||||||
|
_, err = removeFile.WriteString("1")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("unable to delete mdev: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Delete deletes a mediated device (vGPU)
|
||||||
|
func (m *Device) Delete() error {
|
||||||
|
removeFile, err := os.OpenFile(filepath.Join(m.Path, "remove"), os.O_WRONLY|os.O_SYNC, 0200)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("unable to open remove file: %v", err)
|
||||||
|
}
|
||||||
|
_, err = removeFile.WriteString("1")
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("unable to delete mdev: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsMDEVTypeSupported checks if the mdevType is supported by the GPU
|
||||||
|
func (p *ParentDevice) IsMDEVTypeSupported(mdevType string) bool {
|
||||||
|
_, found := p.mdevPaths[mdevType]
|
||||||
|
return found
|
||||||
|
}
|
||||||
|
|
||||||
|
// IsMDEVTypeAvailable checks if a vGPU instance of mdevType can be created on the parent GPU
|
||||||
|
func (p *ParentDevice) IsMDEVTypeAvailable(mdevType string) (bool, error) {
|
||||||
|
availableInstances, err := p.GetAvailableMDEVInstances(mdevType)
|
||||||
|
if err != nil {
|
||||||
|
return false, fmt.Errorf("failed to get available instances for mdev type %s: %v", mdevType, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return (availableInstances > 0), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetAvailableMDEVInstances returns the available instances for mdevType.
|
||||||
|
// Return -1 if mdevType is not supported for the device.
|
||||||
|
func (p *ParentDevice) GetAvailableMDEVInstances(mdevType string) (int, error) {
|
||||||
|
mdevPath, ok := p.mdevPaths[mdevType]
|
||||||
|
if !ok {
|
||||||
|
return -1, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
available, err := ioutil.ReadFile(filepath.Join(mdevPath, "available_instances"))
|
||||||
|
if err != nil {
|
||||||
|
return -1, fmt.Errorf("unable to read available_instances file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
availableInstances, err := strconv.Atoi(strings.TrimSpace(string(available)))
|
||||||
|
if err != nil {
|
||||||
|
return -1, fmt.Errorf("unable to convert available_instances to an int: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return availableInstances, nil
|
||||||
|
}
|
51
pkg/nvmdev/nvmdev_test.go
Normal file
51
pkg/nvmdev/nvmdev_test.go
Normal file
@ -0,0 +1,51 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package nvmdev
|
||||||
|
|
||||||
|
import (
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestNvmdev(t *testing.T) {
|
||||||
|
nvmdev, err := NewMock()
|
||||||
|
require.Nil(t, err, "Error creating MockNvmdev")
|
||||||
|
defer nvmdev.Cleanup()
|
||||||
|
|
||||||
|
err = nvmdev.AddMockA100Parent("0000:3b:04.1", 0)
|
||||||
|
require.Nil(t, err, "Error adding Mock A100 parent device to MockNvmdev")
|
||||||
|
parentDevs, err := nvmdev.GetAllParentDevices()
|
||||||
|
require.Nil(t, err, "Error getting parent GPU devices")
|
||||||
|
require.Equal(t, 1, len(parentDevs), "Wrong number of parent GPU devices")
|
||||||
|
|
||||||
|
parentA100 := parentDevs[0]
|
||||||
|
supported := parentA100.IsMDEVTypeSupported("A100-4C")
|
||||||
|
require.True(t, supported, "A100-4C should be a supported vGPU type")
|
||||||
|
|
||||||
|
available, err := parentA100.IsMDEVTypeAvailable("A100-4C")
|
||||||
|
require.Nil(t, err, "Error checking if A100-4Q vGPU type is available for creation")
|
||||||
|
require.True(t, available, "A100-4C should be available to create")
|
||||||
|
|
||||||
|
err = nvmdev.AddMockA100Mdev("b1914f0a-15cf-416e-8967-55fc7cb68e20", "A100-4C",
|
||||||
|
filepath.Join(parentDevs[0].Path, "mdev_supported_types/nvidia-500"))
|
||||||
|
require.Nil(t, err, "Error adding Mock A100 mediated device")
|
||||||
|
|
||||||
|
mdevs, err := nvmdev.GetAllDevices()
|
||||||
|
require.Nil(t, err, "Error getting NVIDIA MDEV (vGPU) devices")
|
||||||
|
require.Equal(t, 1, len(mdevs), "Wrong number of NVIDIA MDEV (vGPU) devices")
|
||||||
|
}
|
@ -24,11 +24,16 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
pciCfgSpaceStandardSize = 256
|
// PCICfgSpaceStandardSize represents the size in bytes of the standard config space
|
||||||
pciCfgSpaceExtendedSize = 4096
|
PCICfgSpaceStandardSize = 256
|
||||||
pciCapabilityListPointer = 0x34
|
// PCICfgSpaceExtendedSize represents the size in bytes of the extended config space
|
||||||
pciStatusCapabilityList = 0x10
|
PCICfgSpaceExtendedSize = 4096
|
||||||
pciStatusBytePosition = 0x06
|
// PCICapabilityListPointer represents offset for the capability list pointer
|
||||||
|
PCICapabilityListPointer = 0x34
|
||||||
|
// PCIStatusCapabilityList represents the status register bit which indicates capability list support
|
||||||
|
PCIStatusCapabilityList = 0x10
|
||||||
|
// PCIStatusBytePosition represents the position of the status register
|
||||||
|
PCIStatusBytePosition = 0x06
|
||||||
)
|
)
|
||||||
|
|
||||||
// ConfigSpace PCI configuration space (standard extended) file path
|
// ConfigSpace PCI configuration space (standard extended) file path
|
||||||
@ -87,12 +92,12 @@ func (cs *configSpaceIO) GetPCICapabilities() (*PCICapabilities, error) {
|
|||||||
make(map[uint16]*PCIExtendedCapability),
|
make(map[uint16]*PCIExtendedCapability),
|
||||||
}
|
}
|
||||||
|
|
||||||
support := cs.Read8(pciStatusBytePosition) & pciStatusCapabilityList
|
support := cs.Read8(PCIStatusBytePosition) & PCIStatusCapabilityList
|
||||||
if support == 0 {
|
if support == 0 {
|
||||||
return nil, fmt.Errorf("pci device does not support capability list")
|
return nil, fmt.Errorf("pci device does not support capability list")
|
||||||
}
|
}
|
||||||
|
|
||||||
soffset := cs.Read8(pciCapabilityListPointer)
|
soffset := cs.Read8(PCICapabilityListPointer)
|
||||||
if int(soffset) >= cs.Len() {
|
if int(soffset) >= cs.Len() {
|
||||||
return nil, fmt.Errorf("capability list pointer out of bounds")
|
return nil, fmt.Errorf("capability list pointer out of bounds")
|
||||||
}
|
}
|
||||||
@ -101,7 +106,7 @@ func (cs *configSpaceIO) GetPCICapabilities() (*PCICapabilities, error) {
|
|||||||
if soffset == 0xff {
|
if soffset == 0xff {
|
||||||
return nil, fmt.Errorf("config space broken")
|
return nil, fmt.Errorf("config space broken")
|
||||||
}
|
}
|
||||||
if int(soffset) >= pciCfgSpaceStandardSize {
|
if int(soffset) >= PCICfgSpaceStandardSize {
|
||||||
return nil, fmt.Errorf("standard capability list pointer out of bounds")
|
return nil, fmt.Errorf("standard capability list pointer out of bounds")
|
||||||
}
|
}
|
||||||
data := cs.Read32(int(soffset))
|
data := cs.Read32(int(soffset))
|
||||||
@ -112,16 +117,16 @@ func (cs *configSpaceIO) GetPCICapabilities() (*PCICapabilities, error) {
|
|||||||
soffset = uint8((data >> 8) & 0xff)
|
soffset = uint8((data >> 8) & 0xff)
|
||||||
}
|
}
|
||||||
|
|
||||||
if cs.Len() <= pciCfgSpaceStandardSize {
|
if cs.Len() <= PCICfgSpaceStandardSize {
|
||||||
return caps, nil
|
return caps, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
eoffset := uint16(pciCfgSpaceStandardSize)
|
eoffset := uint16(PCICfgSpaceStandardSize)
|
||||||
for eoffset != 0 {
|
for eoffset != 0 {
|
||||||
if eoffset == 0xffff {
|
if eoffset == 0xffff {
|
||||||
return nil, fmt.Errorf("config space broken")
|
return nil, fmt.Errorf("config space broken")
|
||||||
}
|
}
|
||||||
if int(eoffset) >= pciCfgSpaceExtendedSize {
|
if int(eoffset) >= PCICfgSpaceExtendedSize {
|
||||||
return nil, fmt.Errorf("extended capability list pointer out of bounds")
|
return nil, fmt.Errorf("extended capability list pointer out of bounds")
|
||||||
}
|
}
|
||||||
data := cs.Read32(int(eoffset))
|
data := cs.Read32(int(eoffset))
|
||||||
|
@ -45,7 +45,7 @@ func NewMockNvpci() (mock *MockNvpci, rerr error) {
|
|||||||
}()
|
}()
|
||||||
|
|
||||||
mock = &MockNvpci{
|
mock = &MockNvpci{
|
||||||
&nvpci{rootDir},
|
NewFrom(rootDir).(*nvpci),
|
||||||
}
|
}
|
||||||
|
|
||||||
return mock, nil
|
return mock, nil
|
||||||
@ -68,7 +68,7 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
_, err = vendor.WriteString(fmt.Sprintf("0x%x", pciNvidiaVendorID))
|
_, err = vendor.WriteString(fmt.Sprintf("0x%x", PCINvidiaVendorID))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -77,7 +77,7 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
_, err = class.WriteString(fmt.Sprintf("0x%x", pci3dControllerClass))
|
_, err = class.WriteString(fmt.Sprintf("0x%x", PCI3dControllerClass))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
@ -104,11 +104,11 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
_data := make([]byte, pciCfgSpaceStandardSize)
|
_data := make([]byte, PCICfgSpaceStandardSize)
|
||||||
data := bytes.New(&_data)
|
data := bytes.New(&_data)
|
||||||
data.Write16(0, pciNvidiaVendorID)
|
data.Write16(0, PCINvidiaVendorID)
|
||||||
data.Write16(2, uint16(0x20bf))
|
data.Write16(2, uint16(0x20bf))
|
||||||
data.Write8(pciStatusBytePosition, pciStatusCapabilityList)
|
data.Write8(PCIStatusBytePosition, PCIStatusCapabilityList)
|
||||||
_, err = config.Write(*data.Raw())
|
_, err = config.Write(*data.Raw())
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -27,16 +27,16 @@ import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
// pciDevicesRoot represents base path for all pci devices under sysfs
|
// PCIDevicesRoot represents base path for all pci devices under sysfs
|
||||||
pciDevicesRoot = "/sys/bus/pci/devices"
|
PCIDevicesRoot = "/sys/bus/pci/devices"
|
||||||
// pciNvidiaVendorID represents PCI vendor id for NVIDIA
|
// PCINvidiaVendorID represents PCI vendor id for NVIDIA
|
||||||
pciNvidiaVendorID uint16 = 0x10de
|
PCINvidiaVendorID uint16 = 0x10de
|
||||||
// pciVgaControllerClass represents the PCI class for VGA Controllers
|
// PCIVgaControllerClass represents the PCI class for VGA Controllers
|
||||||
pciVgaControllerClass uint32 = 0x030000
|
PCIVgaControllerClass uint32 = 0x030000
|
||||||
// pci3dControllerClass represents the PCI class for 3D Graphics accellerators
|
// PCI3dControllerClass represents the PCI class for 3D Graphics accelerators
|
||||||
pci3dControllerClass uint32 = 0x030200
|
PCI3dControllerClass uint32 = 0x030200
|
||||||
// pciNvSwitchClass represents the PCI class for NVSwitches
|
// PCINvSwitchClass represents the PCI class for NVSwitches
|
||||||
pciNvSwitchClass uint32 = 0x068000
|
PCINvSwitchClass uint32 = 0x068000
|
||||||
)
|
)
|
||||||
|
|
||||||
// Interface allows us to get a list of all NVIDIA PCI devices
|
// Interface allows us to get a list of all NVIDIA PCI devices
|
||||||
@ -68,17 +68,17 @@ type NvidiaPCIDevice struct {
|
|||||||
|
|
||||||
// IsVGAController if class == 0x300
|
// IsVGAController if class == 0x300
|
||||||
func (d *NvidiaPCIDevice) IsVGAController() bool {
|
func (d *NvidiaPCIDevice) IsVGAController() bool {
|
||||||
return d.Class == pciVgaControllerClass
|
return d.Class == PCIVgaControllerClass
|
||||||
}
|
}
|
||||||
|
|
||||||
// Is3DController if class == 0x302
|
// Is3DController if class == 0x302
|
||||||
func (d *NvidiaPCIDevice) Is3DController() bool {
|
func (d *NvidiaPCIDevice) Is3DController() bool {
|
||||||
return d.Class == pci3dControllerClass
|
return d.Class == PCI3dControllerClass
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsNVSwitch if classe == 0x068
|
// IsNVSwitch if classe == 0x068
|
||||||
func (d *NvidiaPCIDevice) IsNVSwitch() bool {
|
func (d *NvidiaPCIDevice) IsNVSwitch() bool {
|
||||||
return d.Class == pciNvSwitchClass
|
return d.Class == PCINvSwitchClass
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsGPU either VGA for older cards or 3D for newer
|
// IsGPU either VGA for older cards or 3D for newer
|
||||||
@ -104,7 +104,12 @@ func (d *NvidiaPCIDevice) Reset() error {
|
|||||||
|
|
||||||
// New interface that allows us to get a list of all NVIDIA PCI devices
|
// New interface that allows us to get a list of all NVIDIA PCI devices
|
||||||
func New() Interface {
|
func New() Interface {
|
||||||
return &nvpci{pciDevicesRoot}
|
return &nvpci{PCIDevicesRoot}
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewFrom interface allows us to get a list of all NVIDIA PCI devices at a specific root directory
|
||||||
|
func NewFrom(root string) Interface {
|
||||||
|
return &nvpci{root}
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetAllDevices returns all Nvidia PCI devices on the system
|
// GetAllDevices returns all Nvidia PCI devices on the system
|
||||||
@ -117,7 +122,33 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
|
|||||||
var nvdevices []*NvidiaPCIDevice
|
var nvdevices []*NvidiaPCIDevice
|
||||||
for _, deviceDir := range deviceDirs {
|
for _, deviceDir := range deviceDirs {
|
||||||
devicePath := path.Join(p.pciDevicesRoot, deviceDir.Name())
|
devicePath := path.Join(p.pciDevicesRoot, deviceDir.Name())
|
||||||
address := deviceDir.Name()
|
nvdevice, err := NewDevice(devicePath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("error constructing NVIDIA PCI device %s: %v", deviceDir.Name(), err)
|
||||||
|
}
|
||||||
|
if nvdevice == nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
nvdevices = append(nvdevices, nvdevice)
|
||||||
|
}
|
||||||
|
|
||||||
|
addressToID := func(address string) uint64 {
|
||||||
|
address = strings.ReplaceAll(address, ":", "")
|
||||||
|
address = strings.ReplaceAll(address, ".", "")
|
||||||
|
id, _ := strconv.ParseUint(address, 16, 64)
|
||||||
|
return id
|
||||||
|
}
|
||||||
|
|
||||||
|
sort.Slice(nvdevices, func(i, j int) bool {
|
||||||
|
return addressToID(nvdevices[i].Address) < addressToID(nvdevices[j].Address)
|
||||||
|
})
|
||||||
|
|
||||||
|
return nvdevices, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewDevice constructs an NvidiaPCIDevice
|
||||||
|
func NewDevice(devicePath string) (*NvidiaPCIDevice, error) {
|
||||||
|
address := path.Base(devicePath)
|
||||||
|
|
||||||
vendor, err := ioutil.ReadFile(path.Join(devicePath, "vendor"))
|
vendor, err := ioutil.ReadFile(path.Join(devicePath, "vendor"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -129,8 +160,8 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
|
|||||||
return nil, fmt.Errorf("unable to convert vendor string to uint16: %v", vendorStr)
|
return nil, fmt.Errorf("unable to convert vendor string to uint16: %v", vendorStr)
|
||||||
}
|
}
|
||||||
|
|
||||||
if uint16(vendorID) != pciNvidiaVendorID {
|
if uint16(vendorID) != PCINvidiaVendorID {
|
||||||
continue
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
class, err := ioutil.ReadFile(path.Join(devicePath, "class"))
|
class, err := ioutil.ReadFile(path.Join(devicePath, "class"))
|
||||||
@ -204,21 +235,7 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
|
|||||||
Resources: resources,
|
Resources: resources,
|
||||||
}
|
}
|
||||||
|
|
||||||
nvdevices = append(nvdevices, nvdevice)
|
return nvdevice, nil
|
||||||
}
|
|
||||||
|
|
||||||
addressToID := func(address string) uint64 {
|
|
||||||
address = strings.ReplaceAll(address, ":", "")
|
|
||||||
address = strings.ReplaceAll(address, ".", "")
|
|
||||||
id, _ := strconv.ParseUint(address, 16, 64)
|
|
||||||
return id
|
|
||||||
}
|
|
||||||
|
|
||||||
sort.Slice(nvdevices, func(i, j int) bool {
|
|
||||||
return addressToID(nvdevices[i].Address) < addressToID(nvdevices[j].Address)
|
|
||||||
})
|
|
||||||
|
|
||||||
return nvdevices, nil
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get3DControllers returns all NVIDIA 3D Controller PCI devices on the system
|
// Get3DControllers returns all NVIDIA 3D Controller PCI devices on the system
|
||||||
|
Loading…
Reference in New Issue
Block a user