mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-21 15:57:49 +00:00
Merge pull request #558 from NVIDIA/dependabot/go_modules/main/github.com/NVIDIA/go-nvlib-0.6.0
Bump github.com/NVIDIA/go-nvlib from 0.5.0 to 0.6.0
This commit is contained in:
commit
15c884e99f
2
go.mod
2
go.mod
@ -3,7 +3,7 @@ module github.com/NVIDIA/nvidia-container-toolkit
|
||||
go 1.20
|
||||
|
||||
require (
|
||||
github.com/NVIDIA/go-nvlib v0.5.0
|
||||
github.com/NVIDIA/go-nvlib v0.6.0
|
||||
github.com/NVIDIA/go-nvml v0.12.4-0
|
||||
github.com/fsnotify/fsnotify v1.7.0
|
||||
github.com/opencontainers/runtime-spec v1.2.0
|
||||
|
4
go.sum
4
go.sum
@ -1,5 +1,5 @@
|
||||
github.com/NVIDIA/go-nvlib v0.5.0 h1:951KGrfr+p3cs89alO9z/ZxPPWKxwht9tx9rxiADoLI=
|
||||
github.com/NVIDIA/go-nvlib v0.5.0/go.mod h1:87z49ULPr4GWPSGfSIp3taU4XENRYN/enIg88MzcL4k=
|
||||
github.com/NVIDIA/go-nvlib v0.6.0 h1:zAMBzCYT9xeyRQo0tb7HJbStkzajD6e5joyaQqJ2OGU=
|
||||
github.com/NVIDIA/go-nvlib v0.6.0/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY=
|
||||
github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg=
|
||||
github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ=
|
||||
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
|
||||
|
25
vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go
generated
vendored
25
vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go
generated
vendored
@ -18,6 +18,7 @@ package device
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
)
|
||||
@ -30,6 +31,7 @@ type Device interface {
|
||||
GetCudaComputeCapabilityAsString() (string, error)
|
||||
GetMigDevices() ([]MigDevice, error)
|
||||
GetMigProfiles() ([]MigProfile, error)
|
||||
GetPCIBusID() (string, error)
|
||||
IsMigCapable() (bool, error)
|
||||
IsMigEnabled() (bool, error)
|
||||
VisitMigDevices(func(j int, m MigDevice) error) error
|
||||
@ -140,6 +142,29 @@ func (d *device) GetBrandAsString() (string, error) {
|
||||
return "", fmt.Errorf("error interpreting device brand as string: %v", brand)
|
||||
}
|
||||
|
||||
// GetPCIBusID returns the string representation of the bus ID.
|
||||
func (d *device) GetPCIBusID() (string, error) {
|
||||
info, ret := d.GetPciInfo()
|
||||
if ret != nvml.SUCCESS {
|
||||
return "", fmt.Errorf("error getting PCI info: %w", ret)
|
||||
}
|
||||
|
||||
var bytes []byte
|
||||
for _, b := range info.BusId {
|
||||
if byte(b) == '\x00' {
|
||||
break
|
||||
}
|
||||
bytes = append(bytes, byte(b))
|
||||
}
|
||||
id := strings.ToLower(string(bytes))
|
||||
|
||||
if id != "0000" {
|
||||
id = strings.TrimPrefix(id, "0000")
|
||||
}
|
||||
|
||||
return id, nil
|
||||
}
|
||||
|
||||
// GetCudaComputeCapabilityAsString returns the Device's CUDA compute capability as a version string.
|
||||
func (d *device) GetCudaComputeCapabilityAsString() (string, error) {
|
||||
major, minor, ret := d.GetCudaComputeCapability()
|
||||
|
144
vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go
generated
vendored
144
vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go
generated
vendored
@ -20,6 +20,8 @@ import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
"github.com/NVIDIA/go-nvlib/pkg/nvpci/bytes"
|
||||
)
|
||||
@ -55,14 +57,82 @@ func (m *MockNvpci) Cleanup() {
|
||||
os.RemoveAll(m.pciDevicesRoot)
|
||||
}
|
||||
|
||||
// mockPCIAddressPattern matches a complete PCI address in domain 0000, e.g.
// "0000:80:05.1". It is anchored so that strings which merely contain a valid
// address are rejected, and compiled once rather than on every call.
var mockPCIAddressPattern = regexp.MustCompile(`^0{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]$`)

// validatePCIAddress returns an error unless addr is, in its entirety, a
// lower-case PCI address of the form 0000:bb:dd.f.
func validatePCIAddress(addr string) error {
	if !mockPCIAddressPattern.MatchString(addr) {
		return fmt.Errorf(`invalid PCI address should match 0{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]: %s`, addr)
	}

	return nil
}
|
||||
|
||||
// AddMockA100 Create an A100 like GPU mock device.
|
||||
func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
|
||||
deviceDir := filepath.Join(m.pciDevicesRoot, address)
|
||||
err := os.MkdirAll(deviceDir, 0755)
|
||||
func (m *MockNvpci) AddMockA100(address string, numaNode int, sriov *SriovInfo) error {
|
||||
err := validatePCIAddress(address)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
deviceDir := filepath.Join(m.pciDevicesRoot, address)
|
||||
err = os.MkdirAll(deviceDir, 0755)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = createNVIDIAgpuFiles(deviceDir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
iommuGroup := 20
|
||||
_, err = os.Create(filepath.Join(deviceDir, strconv.Itoa(iommuGroup)))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = os.Symlink(filepath.Join(deviceDir, strconv.Itoa(iommuGroup)), filepath.Join(deviceDir, "iommu_group"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = numa.WriteString(fmt.Sprintf("%v", numaNode))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if sriov != nil && sriov.PhysicalFunction != nil {
|
||||
totalVFs, err := os.Create(filepath.Join(deviceDir, "sriov_totalvfs"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = fmt.Fprintf(totalVFs, "%d", sriov.PhysicalFunction.TotalVFs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
numVFs, err := os.Create(filepath.Join(deviceDir, "sriov_numvfs"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = fmt.Fprintf(numVFs, "%d", sriov.PhysicalFunction.NumVFs)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for i := 1; i <= int(sriov.PhysicalFunction.NumVFs); i++ {
|
||||
err = m.createVf(address, i, iommuGroup, numaNode)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func createNVIDIAgpuFiles(deviceDir string) error {
|
||||
vendor, err := os.Create(filepath.Join(deviceDir, "vendor"))
|
||||
if err != nil {
|
||||
return err
|
||||
@ -99,24 +169,6 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
|
||||
return err
|
||||
}
|
||||
|
||||
_, err = os.Create(filepath.Join(deviceDir, "20"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = os.Symlink(filepath.Join(deviceDir, "20"), filepath.Join(deviceDir, "iommu_group"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = numa.WriteString(fmt.Sprintf("%v", numaNode))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
config, err := os.Create(filepath.Join(deviceDir, "config"))
|
||||
if err != nil {
|
||||
return err
|
||||
@ -156,3 +208,53 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *MockNvpci) createVf(pfAddress string, id, iommu_group, numaNode int) error {
|
||||
functionID := pfAddress[len(pfAddress)-1]
|
||||
// we are verifying the last character of pfAddress is integer.
|
||||
functionNumber, err := strconv.Atoi(string(functionID))
|
||||
if err != nil {
|
||||
return fmt.Errorf("can't conver physical function pci address function number %s to integer: %v", string(functionID), err)
|
||||
}
|
||||
|
||||
vfFunctionNumber := functionNumber + id
|
||||
vfAddress := pfAddress[:len(pfAddress)-1] + strconv.Itoa(vfFunctionNumber)
|
||||
|
||||
deviceDir := filepath.Join(m.pciDevicesRoot, vfAddress)
|
||||
err = os.MkdirAll(deviceDir, 0755)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = createNVIDIAgpuFiles(deviceDir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
vfIommuGroup := strconv.Itoa(iommu_group + id)
|
||||
|
||||
_, err = os.Create(filepath.Join(deviceDir, vfIommuGroup))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
err = os.Symlink(filepath.Join(deviceDir, vfIommuGroup), filepath.Join(deviceDir, "iommu_group"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
numa, err := os.Create(filepath.Join(deviceDir, "numa_node"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = numa.WriteString(fmt.Sprintf("%v", numaNode))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
err = os.Symlink(filepath.Join(m.pciDevicesRoot, pfAddress), filepath.Join(deviceDir, "physfn"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
121
vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go
generated
vendored
121
vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go
generated
vendored
@ -76,6 +76,32 @@ type nvpci struct {
|
||||
var _ Interface = (*nvpci)(nil)
|
||||
var _ ResourceInterface = (*MemoryResources)(nil)
|
||||
|
||||
// SriovInfo indicates whether device is VF/PF for SRIOV capable devices.
|
||||
// Only one should be set at any given time.
|
||||
type SriovInfo struct {
|
||||
PhysicalFunction *SriovPhysicalFunction
|
||||
VirtualFunction *SriovVirtualFunction
|
||||
}
|
||||
|
||||
// SriovPhysicalFunction stores info about SRIOV physical function.
|
||||
type SriovPhysicalFunction struct {
|
||||
TotalVFs uint64
|
||||
NumVFs uint64
|
||||
}
|
||||
|
||||
// SriovVirtualFunction keeps data about SRIOV virtual function.
|
||||
type SriovVirtualFunction struct {
|
||||
PhysicalFunction *NvidiaPCIDevice
|
||||
}
|
||||
|
||||
func (s *SriovInfo) IsPF() bool {
|
||||
return s != nil && s.PhysicalFunction != nil
|
||||
}
|
||||
|
||||
func (s *SriovInfo) IsVF() bool {
|
||||
return s != nil && s.VirtualFunction != nil
|
||||
}
|
||||
|
||||
// NvidiaPCIDevice represents a PCI device for an NVIDIA product.
|
||||
type NvidiaPCIDevice struct {
|
||||
Path string
|
||||
@ -90,7 +116,7 @@ type NvidiaPCIDevice struct {
|
||||
NumaNode int
|
||||
Config *ConfigSpace
|
||||
Resources MemoryResources
|
||||
IsVF bool
|
||||
SriovInfo SriovInfo
|
||||
}
|
||||
|
||||
// IsVGAController if class == 0x300.
|
||||
@ -178,9 +204,11 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
|
||||
}
|
||||
|
||||
var nvdevices []*NvidiaPCIDevice
|
||||
// Cache devices for each GetAllDevices invocation to speed things up.
|
||||
cache := make(map[string]*NvidiaPCIDevice)
|
||||
for _, deviceDir := range deviceDirs {
|
||||
deviceAddress := deviceDir.Name()
|
||||
nvdevice, err := p.GetGPUByPciBusID(deviceAddress)
|
||||
nvdevice, err := p.getGPUByPciBusID(deviceAddress, cache)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error constructing NVIDIA PCI device %s: %v", deviceAddress, err)
|
||||
}
|
||||
@ -206,6 +234,16 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) {
|
||||
|
||||
// GetGPUByPciBusID constructs an NvidiaPCIDevice for the specified address (PCI Bus ID).
|
||||
func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) {
|
||||
// Pass nil as to force reading device information from sysfs.
|
||||
return p.getGPUByPciBusID(address, nil)
|
||||
}
|
||||
|
||||
func (p *nvpci) getGPUByPciBusID(address string, cache map[string]*NvidiaPCIDevice) (*NvidiaPCIDevice, error) {
|
||||
if cache != nil {
|
||||
if pciDevice, exists := cache[address]; exists {
|
||||
return pciDevice, nil
|
||||
}
|
||||
}
|
||||
devicePath := filepath.Join(p.pciDevicesRoot, address)
|
||||
|
||||
vendor, err := os.ReadFile(path.Join(devicePath, "vendor"))
|
||||
@ -265,16 +303,6 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) {
|
||||
return nil, fmt.Errorf("unable to detect iommu_group for %s: %v", address, err)
|
||||
}
|
||||
|
||||
// device is a virtual function (VF) if "physfn" symlink exists.
|
||||
var isVF bool
|
||||
_, err = filepath.EvalSymlinks(path.Join(devicePath, "physfn"))
|
||||
if err == nil {
|
||||
isVF = true
|
||||
}
|
||||
if err != nil && !os.IsNotExist(err) {
|
||||
return nil, fmt.Errorf("unable to resolve %s: %v", path.Join(devicePath, "physfn"), err)
|
||||
}
|
||||
|
||||
numa, err := os.ReadFile(path.Join(devicePath, "numa_node"))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err)
|
||||
@ -328,6 +356,28 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) {
|
||||
className = UnknownClassString
|
||||
}
|
||||
|
||||
var sriovInfo SriovInfo
|
||||
// Device is a virtual function (VF) if "physfn" symlink exists.
|
||||
physFnAddress, err := filepath.EvalSymlinks(path.Join(devicePath, "physfn"))
|
||||
if err == nil {
|
||||
physFn, err := p.getGPUByPciBusID(filepath.Base(physFnAddress), cache)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to detect physfn for %s: %v", address, err)
|
||||
}
|
||||
sriovInfo = SriovInfo{
|
||||
VirtualFunction: &SriovVirtualFunction{
|
||||
PhysicalFunction: physFn,
|
||||
},
|
||||
}
|
||||
} else if os.IsNotExist(err) {
|
||||
sriovInfo, err = p.getSriovInfoForPhysicalFunction(devicePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("unable to read SRIOV physical function details for %s: %v", devicePath, err)
|
||||
}
|
||||
} else {
|
||||
return nil, fmt.Errorf("unable to read %s: %v", path.Join(devicePath, "physfn"), err)
|
||||
}
|
||||
|
||||
nvdevice := &NvidiaPCIDevice{
|
||||
Path: devicePath,
|
||||
Address: address,
|
||||
@ -339,9 +389,14 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) {
|
||||
NumaNode: int(numaNode),
|
||||
Config: config,
|
||||
Resources: resources,
|
||||
IsVF: isVF,
|
||||
DeviceName: deviceName,
|
||||
ClassName: className,
|
||||
SriovInfo: sriovInfo,
|
||||
}
|
||||
|
||||
// Cache physical functions only as VF can't be a root device.
|
||||
if cache != nil && sriovInfo.IsPF() {
|
||||
cache[address] = nvdevice
|
||||
}
|
||||
|
||||
return nvdevice, nil
|
||||
@ -407,7 +462,7 @@ func (p *nvpci) GetGPUs() ([]*NvidiaPCIDevice, error) {
|
||||
|
||||
var filtered []*NvidiaPCIDevice
|
||||
for _, d := range devices {
|
||||
if d.IsGPU() && !d.IsVF {
|
||||
if d.IsGPU() && !d.SriovInfo.IsVF() {
|
||||
filtered = append(filtered, d)
|
||||
}
|
||||
}
|
||||
@ -428,3 +483,41 @@ func (p *nvpci) GetGPUByIndex(i int) (*NvidiaPCIDevice, error) {
|
||||
|
||||
return gpus[i], nil
|
||||
}
|
||||
|
||||
func (p *nvpci) getSriovInfoForPhysicalFunction(devicePath string) (sriovInfo SriovInfo, err error) {
|
||||
totalVfsPath := filepath.Join(devicePath, "sriov_totalvfs")
|
||||
numVfsPath := filepath.Join(devicePath, "sriov_numvfs")
|
||||
|
||||
// No file for sriov_totalvfs exists? Not an SRIOV device, return nil
|
||||
_, err = os.Stat(totalVfsPath)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
return sriovInfo, nil
|
||||
}
|
||||
sriovTotalVfs, err := os.ReadFile(totalVfsPath)
|
||||
if err != nil {
|
||||
return sriovInfo, fmt.Errorf("unable to read sriov_totalvfs: %v", err)
|
||||
}
|
||||
totalVfsStr := strings.TrimSpace(string(sriovTotalVfs))
|
||||
totalVfsInt, err := strconv.ParseUint(totalVfsStr, 10, 16)
|
||||
if err != nil {
|
||||
return sriovInfo, fmt.Errorf("unable to convert sriov_totalvfs to uint64: %v", err)
|
||||
}
|
||||
|
||||
sriovNumVfs, err := os.ReadFile(numVfsPath)
|
||||
if err != nil {
|
||||
return sriovInfo, fmt.Errorf("unable to read sriov_numvfs for: %v", err)
|
||||
}
|
||||
numVfsStr := strings.TrimSpace(string(sriovNumVfs))
|
||||
numVfsInt, err := strconv.ParseUint(numVfsStr, 10, 16)
|
||||
if err != nil {
|
||||
return sriovInfo, fmt.Errorf("unable to convert sriov_numvfs to uint64: %v", err)
|
||||
}
|
||||
|
||||
sriovInfo = SriovInfo{
|
||||
PhysicalFunction: &SriovPhysicalFunction{
|
||||
TotalVFs: totalVfsInt,
|
||||
NumVFs: numVfsInt,
|
||||
},
|
||||
}
|
||||
return sriovInfo, nil
|
||||
}
|
||||
|
2
vendor/modules.txt
vendored
2
vendor/modules.txt
vendored
@ -1,4 +1,4 @@
|
||||
# github.com/NVIDIA/go-nvlib v0.5.0
|
||||
# github.com/NVIDIA/go-nvlib v0.6.0
|
||||
## explicit; go 1.20
|
||||
github.com/NVIDIA/go-nvlib/pkg/nvlib/device
|
||||
github.com/NVIDIA/go-nvlib/pkg/nvlib/info
|
||||
|
Loading…
Reference in New Issue
Block a user