From ca528c4f530d0a2fb7bbb0190c740259d16ab213 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sun, 23 Jun 2024 08:42:01 +0000 Subject: [PATCH] Bump github.com/NVIDIA/go-nvlib from 0.5.0 to 0.6.0 Bumps [github.com/NVIDIA/go-nvlib](https://github.com/NVIDIA/go-nvlib) from 0.5.0 to 0.6.0. - [Release notes](https://github.com/NVIDIA/go-nvlib/releases) - [Commits](https://github.com/NVIDIA/go-nvlib/compare/v0.5.0...v0.6.0) --- updated-dependencies: - dependency-name: github.com/NVIDIA/go-nvlib dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- go.mod | 2 +- go.sum | 4 +- .../go-nvlib/pkg/nvlib/device/device.go | 25 +++ .../NVIDIA/go-nvlib/pkg/nvpci/mock.go | 144 +++++++++++++++--- .../NVIDIA/go-nvlib/pkg/nvpci/nvpci.go | 121 +++++++++++++-- vendor/modules.txt | 2 +- 6 files changed, 259 insertions(+), 39 deletions(-) diff --git a/go.mod b/go.mod index 0eb046eb..ebe8485d 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/NVIDIA/nvidia-container-toolkit go 1.20 require ( - github.com/NVIDIA/go-nvlib v0.5.0 + github.com/NVIDIA/go-nvlib v0.6.0 github.com/NVIDIA/go-nvml v0.12.4-0 github.com/fsnotify/fsnotify v1.7.0 github.com/opencontainers/runtime-spec v1.2.0 diff --git a/go.sum b/go.sum index 9fec4ce7..0cf58698 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -github.com/NVIDIA/go-nvlib v0.5.0 h1:951KGrfr+p3cs89alO9z/ZxPPWKxwht9tx9rxiADoLI= -github.com/NVIDIA/go-nvlib v0.5.0/go.mod h1:87z49ULPr4GWPSGfSIp3taU4XENRYN/enIg88MzcL4k= +github.com/NVIDIA/go-nvlib v0.6.0 h1:zAMBzCYT9xeyRQo0tb7HJbStkzajD6e5joyaQqJ2OGU= +github.com/NVIDIA/go-nvlib v0.6.0/go.mod h1:9UrsLGx/q1OrENygXjOuM5Ey5KCtiZhbvBlbUIxtGWY= github.com/NVIDIA/go-nvml v0.12.4-0 h1:4tkbB3pT1O77JGr0gQ6uD8FrsUPqP1A/EOEm2wI1TUg= github.com/NVIDIA/go-nvml v0.12.4-0/go.mod h1:8Llmj+1Rr+9VGGwZuRer5N/aCjxGuR5nPb/9ebBiIEQ= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go index 5e1510ca..5b21fc13 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvlib/device/device.go @@ -18,6 +18,7 @@ package device import ( "fmt" + "strings" "github.com/NVIDIA/go-nvml/pkg/nvml" ) @@ -30,6 +31,7 @@ type Device interface { GetCudaComputeCapabilityAsString() (string, error) GetMigDevices() ([]MigDevice, error) GetMigProfiles() ([]MigProfile, error) + GetPCIBusID() (string, error) IsMigCapable() (bool, error) IsMigEnabled() (bool, error) VisitMigDevices(func(j int, m MigDevice) error) error @@ -140,6 +142,29 @@ func (d *device) GetBrandAsString() (string, error) { return "", fmt.Errorf("error interpreting device brand as string: %v", brand) } +// GetPCIBusID returns the string representation of the bus ID. +func (d *device) GetPCIBusID() (string, error) { + info, ret := d.GetPciInfo() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("error getting PCI info: %w", ret) + } + + var bytes []byte + for _, b := range info.BusId { + if byte(b) == '\x00' { + break + } + bytes = append(bytes, byte(b)) + } + id := strings.ToLower(string(bytes)) + + if id != "0000" { + id = strings.TrimPrefix(id, "0000") + } + + return id, nil +} + // GetCudaComputeCapabilityAsString returns the Device's CUDA compute capability as a version string. func (d *device) GetCudaComputeCapabilityAsString() (string, error) { major, minor, ret := d.GetCudaComputeCapability() diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go index 7c1b69dd..9b3d6e2a 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/mock.go @@ -20,6 +20,8 @@ import ( "fmt" "os" "path/filepath" + "regexp" + "strconv" "github.com/NVIDIA/go-nvlib/pkg/nvpci/bytes" ) @@ -55,14 +57,82 @@ func (m *MockNvpci) Cleanup() { os.RemoveAll(m.pciDevicesRoot) } +func validatePCIAddress(addr string) error { + r := regexp.MustCompile(`0{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]`) + if !r.Match([]byte(addr)) { + return fmt.Errorf(`invalid PCI address should match 0{4}:[0-9a-f]{2}:[0-9a-f]{2}\.[0-9]: %s`, addr) + } + + return nil +} + // AddMockA100 Create an A100 like GPU mock device. -func (m *MockNvpci) AddMockA100(address string, numaNode int) error { - deviceDir := filepath.Join(m.pciDevicesRoot, address) - err := os.MkdirAll(deviceDir, 0755) +func (m *MockNvpci) AddMockA100(address string, numaNode int, sriov *SriovInfo) error { + err := validatePCIAddress(address) if err != nil { return err } + deviceDir := filepath.Join(m.pciDevicesRoot, address) + err = os.MkdirAll(deviceDir, 0755) + if err != nil { + return err + } + + err = createNVIDIAgpuFiles(deviceDir) + if err != nil { + return err + } + + iommuGroup := 20 + _, err = os.Create(filepath.Join(deviceDir, strconv.Itoa(iommuGroup))) + if err != nil { + return err + } + err = os.Symlink(filepath.Join(deviceDir, strconv.Itoa(iommuGroup)), filepath.Join(deviceDir, "iommu_group")) + if err != nil { + return err + } + + numa, err := os.Create(filepath.Join(deviceDir, "numa_node")) + if err != nil { + return err + } + _, err = numa.WriteString(fmt.Sprintf("%v", numaNode)) + if err != nil { + return err + } + + if sriov != nil && sriov.PhysicalFunction != nil { + totalVFs, err := os.Create(filepath.Join(deviceDir, "sriov_totalvfs")) + if err != nil { + return err + } + _, err = fmt.Fprintf(totalVFs, "%d", sriov.PhysicalFunction.TotalVFs) + if err != nil { + return err + } + + numVFs, err := os.Create(filepath.Join(deviceDir, "sriov_numvfs")) + if err != nil { + return err + } + _, err = fmt.Fprintf(numVFs, "%d", sriov.PhysicalFunction.NumVFs) + if err != nil { + return err + } + for i := 1; i <= int(sriov.PhysicalFunction.NumVFs); i++ { + err = m.createVf(address, i, iommuGroup, numaNode) + if err != nil { + return err + } + } + } + + return nil +} + +func createNVIDIAgpuFiles(deviceDir string) error { vendor, err := os.Create(filepath.Join(deviceDir, "vendor")) if err != nil { return err @@ -99,24 +169,6 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error { return err } - _, err = os.Create(filepath.Join(deviceDir, "20")) - if err != nil { - return err - } - err = os.Symlink(filepath.Join(deviceDir, "20"), filepath.Join(deviceDir, "iommu_group")) - if err != nil { - return err - } - - numa, err := os.Create(filepath.Join(deviceDir, "numa_node")) - if err != nil { - return err - } - _, err = numa.WriteString(fmt.Sprintf("%v", numaNode)) - if err != nil { - return err - } - config, err := os.Create(filepath.Join(deviceDir, "config")) if err != nil { return err @@ -156,3 +208,53 @@ func (m *MockNvpci) AddMockA100(address string, numaNode int) error { return nil } + +func (m *MockNvpci) createVf(pfAddress string, id, iommu_group, numaNode int) error { + functionID := pfAddress[len(pfAddress)-1] + // we are verifying the last character of pfAddress is integer. + functionNumber, err := strconv.Atoi(string(functionID)) + if err != nil { + return fmt.Errorf("can't conver physical function pci address function number %s to integer: %v", string(functionID), err) + } + + vfFunctionNumber := functionNumber + id + vfAddress := pfAddress[:len(pfAddress)-1] + strconv.Itoa(vfFunctionNumber) + + deviceDir := filepath.Join(m.pciDevicesRoot, vfAddress) + err = os.MkdirAll(deviceDir, 0755) + if err != nil { + return err + } + + err = createNVIDIAgpuFiles(deviceDir) + if err != nil { + return err + } + + vfIommuGroup := strconv.Itoa(iommu_group + id) + + _, err = os.Create(filepath.Join(deviceDir, vfIommuGroup)) + if err != nil { + return err + } + err = os.Symlink(filepath.Join(deviceDir, vfIommuGroup), filepath.Join(deviceDir, "iommu_group")) + if err != nil { + return err + } + + numa, err := os.Create(filepath.Join(deviceDir, "numa_node")) + if err != nil { + return err + } + _, err = numa.WriteString(fmt.Sprintf("%v", numaNode)) + if err != nil { + return err + } + + err = os.Symlink(filepath.Join(m.pciDevicesRoot, pfAddress), filepath.Join(deviceDir, "physfn")) + if err != nil { + return err + } + + return nil +} diff --git a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go index 6d83a577..6ff197b1 100644 --- a/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go +++ b/vendor/github.com/NVIDIA/go-nvlib/pkg/nvpci/nvpci.go @@ -76,6 +76,32 @@ type nvpci struct { var _ Interface = (*nvpci)(nil) var _ ResourceInterface = (*MemoryResources)(nil) +// SriovInfo indicates whether device is VF/PF for SRIOV capable devices. +// Only one should be set at any given time. +type SriovInfo struct { + PhysicalFunction *SriovPhysicalFunction + VirtualFunction *SriovVirtualFunction +} + +// SriovPhysicalFunction stores info about SRIOV physical function. +type SriovPhysicalFunction struct { + TotalVFs uint64 + NumVFs uint64 +} + +// SriovVirtualFunction keeps data about SRIOV virtual function. +type SriovVirtualFunction struct { + PhysicalFunction *NvidiaPCIDevice +} + +func (s *SriovInfo) IsPF() bool { + return s != nil && s.PhysicalFunction != nil +} + +func (s *SriovInfo) IsVF() bool { + return s != nil && s.VirtualFunction != nil +} + // NvidiaPCIDevice represents a PCI device for an NVIDIA product. type NvidiaPCIDevice struct { Path string @@ -90,7 +116,7 @@ type NvidiaPCIDevice struct { NumaNode int Config *ConfigSpace Resources MemoryResources - IsVF bool + SriovInfo SriovInfo } // IsVGAController if class == 0x300. @@ -178,9 +204,11 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) { } var nvdevices []*NvidiaPCIDevice + // Cache devices for each GetAllDevices invocation to speed things up. + cache := make(map[string]*NvidiaPCIDevice) for _, deviceDir := range deviceDirs { deviceAddress := deviceDir.Name() - nvdevice, err := p.GetGPUByPciBusID(deviceAddress) + nvdevice, err := p.getGPUByPciBusID(deviceAddress, cache) if err != nil { return nil, fmt.Errorf("error constructing NVIDIA PCI device %s: %v", deviceAddress, err) } @@ -206,6 +234,16 @@ func (p *nvpci) GetAllDevices() ([]*NvidiaPCIDevice, error) { // GetGPUByPciBusID constructs an NvidiaPCIDevice for the specified address (PCI Bus ID). func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) { + // Pass nil as to force reading device information from sysfs. + return p.getGPUByPciBusID(address, nil) +} + +func (p *nvpci) getGPUByPciBusID(address string, cache map[string]*NvidiaPCIDevice) (*NvidiaPCIDevice, error) { + if cache != nil { + if pciDevice, exists := cache[address]; exists { + return pciDevice, nil + } + } devicePath := filepath.Join(p.pciDevicesRoot, address) vendor, err := os.ReadFile(path.Join(devicePath, "vendor")) @@ -265,16 +303,6 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) { return nil, fmt.Errorf("unable to detect iommu_group for %s: %v", address, err) } - // device is a virtual function (VF) if "physfn" symlink exists. - var isVF bool - _, err = filepath.EvalSymlinks(path.Join(devicePath, "physfn")) - if err == nil { - isVF = true - } - if err != nil && !os.IsNotExist(err) { - return nil, fmt.Errorf("unable to resolve %s: %v", path.Join(devicePath, "physfn"), err) - } - numa, err := os.ReadFile(path.Join(devicePath, "numa_node")) if err != nil { return nil, fmt.Errorf("unable to read PCI NUMA node for %s: %v", address, err) @@ -328,6 +356,28 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) { className = UnknownClassString } + var sriovInfo SriovInfo + // Device is a virtual function (VF) if "physfn" symlink exists. + physFnAddress, err := filepath.EvalSymlinks(path.Join(devicePath, "physfn")) + if err == nil { + physFn, err := p.getGPUByPciBusID(filepath.Base(physFnAddress), cache) + if err != nil { + return nil, fmt.Errorf("unable to detect physfn for %s: %v", address, err) + } + sriovInfo = SriovInfo{ + VirtualFunction: &SriovVirtualFunction{ + PhysicalFunction: physFn, + }, + } + } else if os.IsNotExist(err) { + sriovInfo, err = p.getSriovInfoForPhysicalFunction(devicePath) + if err != nil { + return nil, fmt.Errorf("unable to read SRIOV physical function details for %s: %v", devicePath, err) + } + } else { + return nil, fmt.Errorf("unable to read %s: %v", path.Join(devicePath, "physfn"), err) + } + nvdevice := &NvidiaPCIDevice{ Path: devicePath, Address: address, @@ -339,9 +389,14 @@ func (p *nvpci) GetGPUByPciBusID(address string) (*NvidiaPCIDevice, error) { NumaNode: int(numaNode), Config: config, Resources: resources, - IsVF: isVF, DeviceName: deviceName, ClassName: className, + SriovInfo: sriovInfo, + } + + // Cache physical functions only as VF can't be a root device. + if cache != nil && sriovInfo.IsPF() { + cache[address] = nvdevice } return nvdevice, nil @@ -407,7 +462,7 @@ func (p *nvpci) GetGPUs() ([]*NvidiaPCIDevice, error) { var filtered []*NvidiaPCIDevice for _, d := range devices { - if d.IsGPU() && !d.IsVF { + if d.IsGPU() && !d.SriovInfo.IsVF() { filtered = append(filtered, d) } } @@ -428,3 +483,41 @@ func (p *nvpci) GetGPUByIndex(i int) (*NvidiaPCIDevice, error) { return gpus[i], nil } + +func (p *nvpci) getSriovInfoForPhysicalFunction(devicePath string) (sriovInfo SriovInfo, err error) { + totalVfsPath := filepath.Join(devicePath, "sriov_totalvfs") + numVfsPath := filepath.Join(devicePath, "sriov_numvfs") + + // No file for sriov_totalvfs exists? Not an SRIOV device, return nil + _, err = os.Stat(totalVfsPath) + if err != nil && os.IsNotExist(err) { + return sriovInfo, nil + } + sriovTotalVfs, err := os.ReadFile(totalVfsPath) + if err != nil { + return sriovInfo, fmt.Errorf("unable to read sriov_totalvfs: %v", err) + } + totalVfsStr := strings.TrimSpace(string(sriovTotalVfs)) + totalVfsInt, err := strconv.ParseUint(totalVfsStr, 10, 16) + if err != nil { + return sriovInfo, fmt.Errorf("unable to convert sriov_totalvfs to uint64: %v", err) + } + + sriovNumVfs, err := os.ReadFile(numVfsPath) + if err != nil { + return sriovInfo, fmt.Errorf("unable to read sriov_numvfs for: %v", err) + } + numVfsStr := strings.TrimSpace(string(sriovNumVfs)) + numVfsInt, err := strconv.ParseUint(numVfsStr, 10, 16) + if err != nil { + return sriovInfo, fmt.Errorf("unable to convert sriov_numvfs to uint64: %v", err) + } + + sriovInfo = SriovInfo{ + PhysicalFunction: &SriovPhysicalFunction{ + TotalVFs: totalVfsInt, + NumVFs: numVfsInt, + }, + } + return sriovInfo, nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 589e675c..d4b21741 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,4 +1,4 @@ -# github.com/NVIDIA/go-nvlib v0.5.0 +# github.com/NVIDIA/go-nvlib v0.6.0 ## explicit; go 1.20 github.com/NVIDIA/go-nvlib/pkg/nvlib/device github.com/NVIDIA/go-nvlib/pkg/nvlib/info