Merge pull request #558 from NVIDIA/dependabot/go_modules/main/github.com/NVIDIA/go-nvlib-0.6.0

Bump github.com/NVIDIA/go-nvlib from 0.5.0 to 0.6.0
This commit is contained in:
Evan Lezar 2024-07-02 16:21:02 +02:00 committed by GitHub
commit 15c884e99f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 259 additions and 39 deletions

2
go.mod
View File

@ -3,7 +3,7 @@ module github.com/NVIDIA/nvidia-container-toolkit
go 1.20
require (
github.com/NVIDIA/go-nvlib v0.5.0
github.com/NVIDIA/go-nvlib v0.6.0
github.com/NVIDIA/go-nvml v0.12.4-0
github.com/fsnotify/fsnotify v1.7.0
github.com/opencontainers/runtime-spec v1.2.0

4
go.sum
View File

@ -1,5 +1,5 @@
github.com/NVIDIA/go-nvlib v0.5.0 h1:951KGrfr+p3cs89alO9z/ZxPPWKxwht9tx9rxiADoLI=
github.com/NVIDIA/go-nvlib v0.5.0/go.mod h1:87z49ULPr4GWPSGfSIp3taU4XENRYN/enIg88MzcL4k=
github.com/NVIDIA/go-nvlib v0.6.0 h1:zAMBzCYT9xeyRQo0tb7HJbStkzajD6e5joyaQqJ2OGU=
github.com/NVIDIA/go-nvlib v0.6.0/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY=
github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg=
github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=

View File

@ -18,6 +18,7 @@ package device
import (
"fmt"
"strings"
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
@ -30,6 +31,7 @@ type Device interface {
GetCudaComputeCapabilityAsString() (string, error)
GetMigDevices() ([]MigDevice, error)
GetMigProfiles() ([]MigProfile, error)
GetPCIBusID() (string, error)
IsMigCapable() (bool, error)
IsMigEnabled() (bool, error)
VisitMigDevices(func(j int, m MigDevice) error) error
@ -140,6 +142,29 @@ func (d *device) GetBrandAsString() (string, error) {
return "", fmt.Errorf("error interpreting device brand as string: %v", brand)
}
// GetPCIBusID returns the device's PCI bus ID as a lower-case string.
//
// The bus ID reported by GetPciInfo is read up to (not including) its first
// NUL byte. A leading "0000" is then dropped from the result, unless the ID
// is exactly "0000".
func (d *device) GetPCIBusID() (string, error) {
	pciInfo, ret := d.GetPciInfo()
	if ret != nvml.SUCCESS {
		return "", fmt.Errorf("error getting PCI info: %w", ret)
	}

	// The bus ID is a fixed-size, NUL-terminated buffer; keep only the
	// characters before the terminator.
	var sb strings.Builder
	for _, c := range pciInfo.BusId {
		if byte(c) == 0 {
			break
		}
		sb.WriteByte(byte(c))
	}

	busID := strings.ToLower(sb.String())
	if busID == "0000" {
		return busID, nil
	}
	return strings.TrimPrefix(busID, "0000"), nil
}
// GetCudaComputeCapabilityAsString returns the Device's CUDA compute capability as a version string.
func (d *device) GetCudaComputeCapabilityAsString() (string, error) {
major, minor, ret := d.GetCudaComputeCapability()

View File

@ -20,6 +20,8 @@ import (
"fmt"
"os"
"path/filepath"
"regexp"
"strconv"
"github.com/NVIDIA/go-nvlib/pkg/nvpci/bytes"
)
@ -55,14 +57,82 @@ func (m *MockNvpci) Cleanup() {
os.RemoveAll(m.pciDevicesRoot)
}
// pciAddressPattern matches a complete PCI address of the form "0000:bb:dd.f":
// the domain 0000, two lower-case hex digits each for bus and device, and a
// single decimal digit for the function. It is anchored so that strings which
// merely contain an address are rejected, and compiled once at package scope.
var pciAddressPattern = regexp.MustCompile(`^0{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]$`)

// validatePCIAddress reports whether addr is a well-formed PCI address.
//
// It returns nil when the whole of addr matches pciAddressPattern and a
// descriptive error otherwise.
func validatePCIAddress(addr string) error {
	if !pciAddressPattern.MatchString(addr) {
		return fmt.Errorf(`invalid PCI address should match 0{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]: %s`, addr)
	}
	return nil
}
// AddMockA100 Create an A100 like GPU mock device.
func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
deviceDir := filepath.Join(m.pciDevicesRoot, address)
err := os.MkdirAll(deviceDir, 0755)
// AddMockA100 creates an A100-like GPU mock device at the given PCI address
// under the mock PCI devices root.
//
// The address is validated first. The device directory is then populated with
// the common NVIDIA GPU files, an IOMMU group file with an "iommu_group"
// symlink pointing at it, and a "numa_node" file containing numaNode. When
// sriov describes a physical function, "sriov_totalvfs" and "sriov_numvfs"
// are written and one mock virtual function is created for each of the
// NumVFs configured.
//
// It returns the first error encountered; anything created before the error
// is left in place.
func (m *MockNvpci) AddMockA100(address string, numaNode int, sriov *SriovInfo) error {
	if err := validatePCIAddress(address); err != nil {
		return err
	}

	deviceDir := filepath.Join(m.pciDevicesRoot, address)
	if err := os.MkdirAll(deviceDir, 0755); err != nil {
		return err
	}

	if err := createNVIDIAgpuFiles(deviceDir); err != nil {
		return err
	}

	const iommuGroup = 20

	// os.WriteFile creates, writes and closes the file in one step, so no
	// file handles are leaked (os.Create would leave them open).
	iommuGroupPath := filepath.Join(deviceDir, strconv.Itoa(iommuGroup))
	if err := os.WriteFile(iommuGroupPath, nil, 0666); err != nil {
		return err
	}
	if err := os.Symlink(iommuGroupPath, filepath.Join(deviceDir, "iommu_group")); err != nil {
		return err
	}

	numaPath := filepath.Join(deviceDir, "numa_node")
	if err := os.WriteFile(numaPath, []byte(strconv.Itoa(numaNode)), 0666); err != nil {
		return err
	}

	// Nothing more to do unless the device is an SR-IOV physical function.
	if sriov == nil || sriov.PhysicalFunction == nil {
		return nil
	}
	pf := sriov.PhysicalFunction

	totalVFsPath := filepath.Join(deviceDir, "sriov_totalvfs")
	if err := os.WriteFile(totalVFsPath, []byte(strconv.FormatUint(pf.TotalVFs, 10)), 0666); err != nil {
		return err
	}

	numVFsPath := filepath.Join(deviceDir, "sriov_numvfs")
	if err := os.WriteFile(numVFsPath, []byte(strconv.FormatUint(pf.NumVFs, 10)), 0666); err != nil {
		return err
	}

	// Virtual function IDs are 1-based.
	for i := 1; i <= int(pf.NumVFs); i++ {
		if err := m.createVf(address, i, iommuGroup, numaNode); err != nil {
			return err
		}
	}

	return nil
}
func createNVIDIAgpuFiles(deviceDir string) error {
vendor, err := os.Create(filepath.Join(deviceDir, "vendor"))
if err != nil {
return err
@ -99,24 +169,6 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
return err
}
_, err = os.Create(filepath.Join(deviceDir, "20"))
if err != nil {
return err
}
err = os.Symlink(filepath.Join(deviceDir, "20"), filepath.Join(deviceDir, "iommu_group"))
if err != nil {
return err
}
numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
if err != nil {
return err
}
_, err = numa.WriteString(fmt.Sprintf("%v", numaNode))
if err != nil {
return err
}
config, err := os.Create(filepath.Join(deviceDir, "config"))
if err != nil {
return err
@ -156,3 +208,53 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
return nil
}
// createVf creates a mock SR-IOV virtual function for the physical function
// at pfAddress.
//
// The VF address is derived from pfAddress by replacing its final character
// (the PCI function number) with that number plus id. The VF directory is
// populated with the common NVIDIA GPU files, an IOMMU group file named
// iommu_group+id with an "iommu_group" symlink pointing at it, a "numa_node"
// file containing numaNode, and a "physfn" symlink pointing back at the
// physical function's directory.
//
// It returns an error if pfAddress is empty, if its last character is not a
// decimal digit, or if any filesystem operation fails.
func (m *MockNvpci) createVf(pfAddress string, id, iommu_group, numaNode int) error {
	// Guard against indexing into an empty address.
	if pfAddress == "" {
		return fmt.Errorf("physical function pci address must not be empty")
	}

	// The last character of the address is the PCI function number.
	last := len(pfAddress) - 1
	functionID := pfAddress[last:]
	functionNumber, err := strconv.Atoi(functionID)
	if err != nil {
		return fmt.Errorf("can't convert physical function pci address function number %s to integer: %w", functionID, err)
	}

	vfAddress := pfAddress[:last] + strconv.Itoa(functionNumber+id)

	deviceDir := filepath.Join(m.pciDevicesRoot, vfAddress)
	if err := os.MkdirAll(deviceDir, 0755); err != nil {
		return err
	}

	if err := createNVIDIAgpuFiles(deviceDir); err != nil {
		return err
	}

	// os.WriteFile creates, writes and closes the file in one step, so no
	// file handles are leaked (os.Create would leave them open).
	vfIommuGroupPath := filepath.Join(deviceDir, strconv.Itoa(iommu_group+id))
	if err := os.WriteFile(vfIommuGroupPath, nil, 0666); err != nil {
		return err
	}
	if err := os.Symlink(vfIommuGroupPath, filepath.Join(deviceDir, "iommu_group")); err != nil {
		return err
	}

	numaPath := filepath.Join(deviceDir, "numa_node")
	if err := os.WriteFile(numaPath, []byte(strconv.Itoa(numaNode)), 0666); err != nil {
		return err
	}

	// The "physfn" symlink marks this device as a VF of the physical function.
	return os.Symlink(filepath.Join(m.pciDevicesRoot, pfAddress), filepath.Join(deviceDir, "physfn"))
}

View File

@ -76,6 +76,32 @@ type nvpci struct {
var _ Interface = (*nvpci)(nil)
var _ ResourceInterface = (*MemoryResources)(nil)
// SriovInfo indicates whether a device is a virtual function (VF) or a
// physical function (PF) for SR-IOV capable devices.
// Only one of the fields should be set at any given time; when both are nil
// the device is neither (see IsPF and IsVF).
type SriovInfo struct {
// PhysicalFunction is non-nil when the device is an SR-IOV physical function.
PhysicalFunction *SriovPhysicalFunction
// VirtualFunction is non-nil when the device is an SR-IOV virtual function.
VirtualFunction *SriovVirtualFunction
}
// SriovPhysicalFunction stores information about an SR-IOV physical function.
type SriovPhysicalFunction struct {
// TotalVFs is the value parsed from the device's "sriov_totalvfs" file.
TotalVFs uint64
// NumVFs is the value parsed from the device's "sriov_numvfs" file.
NumVFs uint64
}
// SriovVirtualFunction keeps data about an SR-IOV virtual function.
type SriovVirtualFunction struct {
// PhysicalFunction is the device that the virtual function's "physfn"
// symlink resolves to.
PhysicalFunction *NvidiaPCIDevice
}
// IsPF reports whether s describes an SR-IOV physical function.
// It is safe to call on a nil receiver, for which it returns false.
func (s *SriovInfo) IsPF() bool {
	if s == nil {
		return false
	}
	return s.PhysicalFunction != nil
}
// IsVF reports whether s describes an SR-IOV virtual function.
// It is safe to call on a nil receiver, for which it returns false.
func (s *SriovInfo) IsVF() bool {
	if s == nil {
		return false
	}
	return s.VirtualFunction != nil
}
// NvidiaPCIDevice represents a PCI device for an NVIDIA product.
type NvidiaPCIDevice struct {
Path string
@ -90,7 +116,7 @@ type NvidiaPCIDevice struct {
NumaNode int
Config *ConfigSpace
Resources MemoryResources
IsVF bool
SriovInfo SriovInfo
}
// IsVGAController if class == 0x300.
@ -178,9 +204,11 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
}
var nvdevices []*NvidiaPCIDevice
// Cache devices for each GetAllDevices invocation to speed things up.
cache := make(map[string]*NvidiaPCIDevice)
for _, deviceDir := range deviceDirs {
deviceAddress := deviceDir.Name()
nvdevice, err := p.GetGPUByPciBusID(deviceAddress)
nvdevice, err := p.getGPUByPciBusID(deviceAddress, cache)
if err != nil {
return nil, fmt.Errorf("error constructing NVIDIA PCI device %s: %v", deviceAddress, err)
}
@ -206,6 +234,16 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
// GetGPUByPciBusID constructs an NvidiaPCIDevice for the specified address (PCI Bus ID).
// The lookup is performed without a device cache.
func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) {
	// A nil cache forces the device information to be read from sysfs.
	var noCache map[string]*NvidiaPCIDevice
	return p.getGPUByPciBusID(address, noCache)
}
func (p *nvpci) getGPUByPciBusID(address string, cache map[string]*NvidiaPCIDevice) (*NvidiaPCIDevice, error) {
if cache != nil {
if pciDevice, exists := cache[address]; exists {
return pciDevice, nil
}
}
devicePath := filepath.Join(p.pciDevicesRoot, address)
vendor, err := os.ReadFile(path.Join(devicePath, "vendor"))
@ -265,16 +303,6 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) {
return nil, fmt.Errorf("unable to detect iommu_group for %s: %v", address, err)
}
// device is a virtual function (VF) if "physfn" symlink exists.
var isVF bool
_, err = filepath.EvalSymlinks(path.Join(devicePath, "physfn"))
if err == nil {
isVF = true
}
if err != nil && !os.IsNotExist(err) {
return nil, fmt.Errorf("unable to resolve %s: %v", path.Join(devicePath, "physfn"), err)
}
numa, err := os.ReadFile(path.Join(devicePath, "numa_node"))
if err != nil {
return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err)
@ -328,6 +356,28 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) {
className = UnknownClassString
}
var sriovInfo SriovInfo
// Device is a virtual function (VF) if "physfn" symlink exists.
physFnAddress, err := filepath.EvalSymlinks(path.Join(devicePath, "physfn"))
if err == nil {
physFn, err := p.getGPUByPciBusID(filepath.Base(physFnAddress), cache)
if err != nil {
return nil, fmt.Errorf("unable to detect physfn for %s: %v", address, err)
}
sriovInfo = SriovInfo{
VirtualFunction: &SriovVirtualFunction{
PhysicalFunction: physFn,
},
}
} else if os.IsNotExist(err) {
sriovInfo, err = p.getSriovInfoForPhysicalFunction(devicePath)
if err != nil {
return nil, fmt.Errorf("unable to read SRIOV physical function details for %s: %v", devicePath, err)
}
} else {
return nil, fmt.Errorf("unable to read %s: %v", path.Join(devicePath, "physfn"), err)
}
nvdevice := &NvidiaPCIDevice{
Path: devicePath,
Address: address,
@ -339,9 +389,14 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) {
NumaNode: int(numaNode),
Config: config,
Resources: resources,
IsVF: isVF,
DeviceName: deviceName,
ClassName: className,
SriovInfo: sriovInfo,
}
// Cache physical functions only as VF can't be a root device.
if cache != nil && sriovInfo.IsPF() {
cache[address] = nvdevice
}
return nvdevice, nil
@ -407,7 +462,7 @@ func (p *nvpci) GetGPUs() ([]*NvidiaPCIDevice, error) {
var filtered []*NvidiaPCIDevice
for _, d := range devices {
if d.IsGPU() && !d.IsVF {
if d.IsGPU() && !d.SriovInfo.IsVF() {
filtered = append(filtered, d)
}
}
@ -428,3 +483,41 @@ func (p *nvpci) GetGPUByIndex(i int) (*NvidiaPCIDevice, error) {
return gpus[i], nil
}
// getSriovInfoForPhysicalFunction reads the SR-IOV physical function details
// of the device at devicePath from its "sriov_totalvfs" and "sriov_numvfs"
// files.
//
// If "sriov_totalvfs" does not exist the device is not treated as an SR-IOV
// physical function: the zero SriovInfo (both fields nil) is returned with a
// nil error. Otherwise both files must be readable and contain an unsigned
// decimal integer that fits in 16 bits (surrounding whitespace is ignored);
// any failure is returned as a wrapped error together with the zero SriovInfo.
func (p *nvpci) getSriovInfoForPhysicalFunction(devicePath string) (sriovInfo SriovInfo, err error) {
	// Read the file directly instead of stat-ing it first: a missing file is
	// detected from the read error, which avoids a second syscall and a
	// check-then-use window.
	totalVfsRaw, err := os.ReadFile(filepath.Join(devicePath, "sriov_totalvfs"))
	if os.IsNotExist(err) {
		return SriovInfo{}, nil
	}
	if err != nil {
		return SriovInfo{}, fmt.Errorf("unable to read sriov_totalvfs: %w", err)
	}
	totalVfs, err := strconv.ParseUint(strings.TrimSpace(string(totalVfsRaw)), 10, 16)
	if err != nil {
		return SriovInfo{}, fmt.Errorf("unable to convert sriov_totalvfs to uint64: %w", err)
	}

	numVfsRaw, err := os.ReadFile(filepath.Join(devicePath, "sriov_numvfs"))
	if err != nil {
		return SriovInfo{}, fmt.Errorf("unable to read sriov_numvfs: %w", err)
	}
	numVfs, err := strconv.ParseUint(strings.TrimSpace(string(numVfsRaw)), 10, 16)
	if err != nil {
		return SriovInfo{}, fmt.Errorf("unable to convert sriov_numvfs to uint64: %w", err)
	}

	return SriovInfo{
		PhysicalFunction: &SriovPhysicalFunction{
			TotalVFs: totalVfs,
			NumVFs:   numVfs,
		},
	}, nil
}

2
vendor/modules.txt vendored
View File

@ -1,4 +1,4 @@
# github.com/NVIDIA/go-nvlib v0.5.0
# github.com/NVIDIA/go-nvlib v0.6.0
## explicit; go 1.20
github.com/NVIDIA/go-nvlib/pkg/nvlib/device
github.com/NVIDIA/go-nvlib/pkg/nvlib/info