Update vendoring

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2023-11-15 21:38:54 +01:00
parent c63fb35ba8
commit 2ff2d84283
57 changed files with 4299 additions and 1606 deletions

View File

@@ -0,0 +1,98 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package device
import (
"github.com/NVIDIA/go-nvlib/pkg/nvml"
)
// Interface provides the API to the 'device' package.
//
// It exposes constructors for Device, MigDevice and MigProfile values,
// validation and parsing of MIG profile strings, and Get*/Visit* helpers for
// enumerating devices, MIG devices and MIG profiles.
type Interface interface {
AssertValidMigProfileFormat(profile string) error
GetDevices() ([]Device, error)
GetMigDevices() ([]MigDevice, error)
GetMigProfiles() ([]MigProfile, error)
NewDevice(d nvml.Device) (Device, error)
NewDeviceByUUID(uuid string) (Device, error)
NewMigDevice(d nvml.Device) (MigDevice, error)
NewMigDeviceByUUID(uuid string) (MigDevice, error)
NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, migMemorySizeMB, deviceMemorySizeBytes uint64) (MigProfile, error)
ParseMigProfile(profile string) (MigProfile, error)
VisitDevices(func(i int, d Device) error) error
VisitMigDevices(func(i int, d Device, j int, m MigDevice) error) error
VisitMigProfiles(func(p MigProfile) error) error
}
// devicelib is the implementation of Interface returned by New.
type devicelib struct {
// nvml is the NVML binding used for every device query.
nvml nvml.Interface
// skippedDevices is the set of device model names that are skipped when
// visiting top-level devices.
skippedDevices map[string]struct{}
// verifySymbols toggles the symbol lookup done by hasSymbol. New sets it
// to true when no option configured it.
verifySymbols *bool
// migProfiles caches the result of GetMigProfiles.
migProfiles []MigProfile
}
// Compile-time assertion that *devicelib satisfies Interface.
var _ Interface = &devicelib{}
// New creates a new instance of the 'device' interface.
//
// Options are applied in the order given. Any setting that is still unset
// afterwards receives its default: a fresh NVML binding, symbol verification
// enabled, and the DGX Display models as the set of skipped devices.
func New(opts ...Option) Interface {
	lib := new(devicelib)
	for i := range opts {
		opts[i](lib)
	}

	if lib.nvml == nil {
		lib.nvml = nvml.New()
	}
	if lib.verifySymbols == nil {
		enabled := true
		lib.verifySymbols = &enabled
	}
	if lib.skippedDevices == nil {
		applyDefaultSkips := WithSkippedDevices("DGX Display", "NVIDIA DGX Display")
		applyDefaultSkips(lib)
	}
	return lib
}
// WithNvml returns an Option that sets the NVML library used by the 'device'
// interface.
func WithNvml(nvml nvml.Interface) Option {
	setLibrary := func(lib *devicelib) {
		lib.nvml = nvml
	}
	return setLibrary
}
// WithVerifySymbols returns an Option that toggles whether select symbols are
// verified to exist in dynamic libraries before they are called.
func WithVerifySymbols(verify bool) Option {
	setVerify := func(lib *devicelib) {
		lib.verifySymbols = &verify
	}
	return setVerify
}
// WithSkippedDevices returns an Option that adds the given model names to the
// set of devices to be skipped. The set is created on first use, so applying
// the option with no names still leaves a non-nil (empty) set behind.
func WithSkippedDevices(names ...string) Option {
	return func(lib *devicelib) {
		skipped := lib.skippedDevices
		if skipped == nil {
			skipped = make(map[string]struct{})
			lib.skippedDevices = skipped
		}
		for i := range names {
			skipped[names[i]] = struct{}{}
		}
	}
}
// Option defines a function for passing options to the New() call.
// Each Option receives the devicelib being constructed and modifies it in place.
type Option func(*devicelib)

View File

@@ -0,0 +1,473 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package device
import (
"fmt"
"github.com/NVIDIA/go-nvlib/pkg/nvml"
)
// Device defines the set of extended functions associated with a device.Device.
//
// It embeds nvml.Device and adds string renderings of selected properties as
// well as helpers for querying MIG capability, MIG devices and MIG profiles.
type Device interface {
nvml.Device
GetArchitectureAsString() (string, error)
GetBrandAsString() (string, error)
GetCudaComputeCapabilityAsString() (string, error)
GetMigDevices() ([]MigDevice, error)
GetMigProfiles() ([]MigProfile, error)
IsMigCapable() (bool, error)
IsMigEnabled() (bool, error)
VisitMigDevices(func(j int, m MigDevice) error) error
VisitMigProfiles(func(p MigProfile) error) error
}
// device wraps an nvml.Device together with the devicelib that created it.
type device struct {
nvml.Device
// lib is the owning devicelib; it supplies symbol checks, the skip list and
// the MIG constructors.
lib *devicelib
// migProfiles caches the result of GetMigProfiles for this device.
migProfiles []MigProfile
}
// Compile-time assertion that *device satisfies Device.
var _ Device = &device{}
// NewDevice wraps an nvml.Device in a Device.
func (d *devicelib) NewDevice(dev nvml.Device) (Device, error) {
	wrapped, err := d.newDevice(dev)
	return wrapped, err
}
// NewDeviceByUUID looks up the NVML handle for the given UUID and wraps it in
// a Device. An error is returned when the handle lookup does not succeed.
func (d *devicelib) NewDeviceByUUID(uuid string) (Device, error) {
	handle, ret := d.nvml.DeviceGetHandleByUUID(uuid)
	if ret == nvml.SUCCESS {
		return d.newDevice(handle)
	}
	return nil, fmt.Errorf("error getting device handle for uuid '%v': %v", uuid, ret)
}
// newDevice creates a device wrapper around an nvml.Device. The MIG profile
// cache starts out empty.
func (d *devicelib) newDevice(dev nvml.Device) (*device, error) {
	wrapper := &device{
		Device:      dev,
		lib:         d,
		migProfiles: nil,
	}
	return wrapper, nil
}
// GetArchitectureAsString returns the Device architecture as a string.
// An error is returned when the architecture cannot be queried or when the
// reported value has no known name.
func (d *device) GetArchitectureAsString() (string, error) {
	arch, ret := d.GetArchitecture()
	if ret != nvml.SUCCESS {
		return "", fmt.Errorf("error getting device architecture: %v", ret)
	}

	var name string
	switch arch {
	case nvml.DEVICE_ARCH_KEPLER:
		name = "Kepler"
	case nvml.DEVICE_ARCH_MAXWELL:
		name = "Maxwell"
	case nvml.DEVICE_ARCH_PASCAL:
		name = "Pascal"
	case nvml.DEVICE_ARCH_VOLTA:
		name = "Volta"
	case nvml.DEVICE_ARCH_TURING:
		name = "Turing"
	case nvml.DEVICE_ARCH_AMPERE:
		name = "Ampere"
	case nvml.DEVICE_ARCH_ADA:
		name = "Ada"
	case nvml.DEVICE_ARCH_HOPPER:
		name = "Hopper"
	case nvml.DEVICE_ARCH_UNKNOWN:
		name = "Unknown"
	default:
		return "", fmt.Errorf("error interpreting device architecture as string: %v", arch)
	}
	return name, nil
}
// GetBrandAsString returns the Device brand as a string.
// An error is returned when the brand cannot be queried or when the reported
// value has no known name.
func (d *device) GetBrandAsString() (string, error) {
	brand, ret := d.GetBrand()
	if ret != nvml.SUCCESS {
		return "", fmt.Errorf("error getting device brand: %v", ret)
	}

	var name string
	switch brand {
	case nvml.BRAND_UNKNOWN:
		name = "Unknown"
	case nvml.BRAND_QUADRO:
		name = "Quadro"
	case nvml.BRAND_TESLA:
		name = "Tesla"
	case nvml.BRAND_NVS:
		name = "NVS"
	case nvml.BRAND_GRID:
		name = "Grid"
	case nvml.BRAND_GEFORCE:
		name = "GeForce"
	case nvml.BRAND_TITAN:
		name = "Titan"
	case nvml.BRAND_NVIDIA_VAPPS:
		name = "NvidiaVApps"
	case nvml.BRAND_NVIDIA_VPC:
		name = "NvidiaVPC"
	case nvml.BRAND_NVIDIA_VCS:
		name = "NvidiaVCS"
	case nvml.BRAND_NVIDIA_VWS:
		name = "NvidiaVWS"
	// Deprecated in favor of nvml.BRAND_NVIDIA_CLOUD_GAMING
	//case nvml.BRAND_NVIDIA_VGAMING:
	//	name = "VGaming"
	case nvml.BRAND_NVIDIA_CLOUD_GAMING:
		name = "NvidiaCloudGaming"
	case nvml.BRAND_QUADRO_RTX:
		name = "QuadroRTX"
	case nvml.BRAND_NVIDIA_RTX:
		name = "NvidiaRTX"
	case nvml.BRAND_NVIDIA:
		name = "Nvidia"
	case nvml.BRAND_GEFORCE_RTX:
		name = "GeForceRTX"
	case nvml.BRAND_TITAN_RTX:
		name = "TitanRTX"
	default:
		return "", fmt.Errorf("error interpreting device brand as string: %v", brand)
	}
	return name, nil
}
// GetCudaComputeCapabilityAsString returns the Device's CUDA compute
// capability formatted as "<major>.<minor>".
func (d *device) GetCudaComputeCapabilityAsString() (string, error) {
	major, minor, ret := d.GetCudaComputeCapability()
	if ret == nvml.SUCCESS {
		return fmt.Sprintf("%d.%d", major, minor), nil
	}
	return "", fmt.Errorf("error getting CUDA compute capability: %v", ret)
}
// IsMigCapable checks if a device is capable of having MIG partitions created on it.
//
// The device is reported as not capable when the nvmlDeviceGetMigMode symbol
// is unavailable or when the MIG mode query returns ERROR_NOT_SUPPORTED. Any
// other non-success return is surfaced as an error.
func (d *device) IsMigCapable() (bool, error) {
	if !d.lib.hasSymbol("nvmlDeviceGetMigMode") {
		return false, nil
	}

	switch _, _, ret := nvml.Device(d).GetMigMode(); ret {
	case nvml.SUCCESS:
		return true, nil
	case nvml.ERROR_NOT_SUPPORTED:
		return false, nil
	default:
		return false, fmt.Errorf("error getting MIG mode: %v", ret)
	}
}
// IsMigEnabled checks if a device has MIG mode currently enabled on it.
//
// The device is reported as not enabled when the nvmlDeviceGetMigMode symbol
// is unavailable or when the MIG mode query returns ERROR_NOT_SUPPORTED. Any
// other non-success return is surfaced as an error.
func (d *device) IsMigEnabled() (bool, error) {
	if !d.lib.hasSymbol("nvmlDeviceGetMigMode") {
		return false, nil
	}

	current, _, ret := nvml.Device(d).GetMigMode()
	switch ret {
	case nvml.SUCCESS:
		return current == nvml.DEVICE_MIG_ENABLE, nil
	case nvml.ERROR_NOT_SUPPORTED:
		return false, nil
	default:
		return false, fmt.Errorf("error getting MIG mode: %v", ret)
	}
}
// VisitMigDevices walks a top-level device and invokes a callback function for
// each MIG device configured on it.
//
// Devices that are not MIG capable are silently skipped. Indices for which
// NVML reports ERROR_NOT_FOUND or ERROR_INVALID_ARGUMENT are ignored. The
// callback receives the MIG index and the wrapped MIG device; iteration stops
// at the first error, which is wrapped so that callers can still match it with
// errors.Is / errors.As.
func (d *device) VisitMigDevices(visit func(int, MigDevice) error) error {
	capable, err := d.IsMigCapable()
	if err != nil {
		return fmt.Errorf("error checking if GPU is MIG capable: %w", err)
	}
	if !capable {
		return nil
	}

	count, ret := nvml.Device(d).GetMaxMigDeviceCount()
	if ret != nvml.SUCCESS {
		return fmt.Errorf("error getting max MIG device count: %v", ret)
	}

	for i := 0; i < count; i++ {
		handle, ret := nvml.Device(d).GetMigDeviceHandleByIndex(i)
		// Unpopulated or invalid MIG slots are not errors; just move on.
		if ret == nvml.ERROR_NOT_FOUND || ret == nvml.ERROR_INVALID_ARGUMENT {
			continue
		}
		if ret != nvml.SUCCESS {
			return fmt.Errorf("error getting MIG device handle at index '%v': %v", i, ret)
		}

		mig, err := d.lib.NewMigDevice(handle)
		if err != nil {
			return fmt.Errorf("error creating new MIG device wrapper: %w", err)
		}
		if err := visit(i, mig); err != nil {
			return fmt.Errorf("error visiting MIG device: %w", err)
		}
	}
	return nil
}
// VisitMigProfiles walks a top-level device and invokes a callback function for
// each unique MIG Profile that can be configured on it.
//
// Devices that are not MIG capable are silently skipped. GPU Instance profiles
// for which NVML reports ERROR_NOT_SUPPORTED or ERROR_INVALID_ARGUMENT are
// ignored. Iteration stops at the first error, which is wrapped so that
// callers can still match it with errors.Is / errors.As.
func (d *device) VisitMigProfiles(visit func(MigProfile) error) error {
	capable, err := d.IsMigCapable()
	if err != nil {
		return fmt.Errorf("error checking if GPU is MIG capable: %w", err)
	}
	if !capable {
		return nil
	}

	memory, ret := d.GetMemoryInfo()
	if ret != nvml.SUCCESS {
		return fmt.Errorf("error getting device memory info: %v", ret)
	}

	for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ {
		giProfileInfo, ret := d.GetGpuInstanceProfileInfo(i)
		if ret == nvml.ERROR_NOT_SUPPORTED || ret == nvml.ERROR_INVALID_ARGUMENT {
			continue
		}
		if ret != nvml.SUCCESS {
			return fmt.Errorf("error getting GPU Instance profile info: %v", ret)
		}

		for j := 0; j < nvml.COMPUTE_INSTANCE_PROFILE_COUNT; j++ {
			for k := 0; k < nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT; k++ {
				p, err := d.lib.NewMigProfile(i, j, k, giProfileInfo.MemorySizeMB, memory.Total)
				if err != nil {
					return fmt.Errorf("error creating MIG profile: %w", err)
				}

				// NOTE: The NVML API doesn't currently let us query the set of
				// valid Compute Instance profiles without first instantiating
				// a GPU Instance to check against. In theory, it should be
				// possible to get this information without a reference to a
				// GPU instance, but no API is provided for that at the moment.
				// We run the checks below to weed out invalid profiles
				// heuristically, given what we know about how they are
				// physically constructed. In the future we should do this via
				// NVML once a proper API for this exists.
				pi := p.GetInfo()
				if pi.C > pi.G {
					continue
				}
				if (pi.C < pi.G) && ((pi.C * 2) > (pi.G + 1)) {
					continue
				}

				if err := visit(p); err != nil {
					return fmt.Errorf("error visiting MIG profile: %w", err)
				}
			}
		}
	}
	return nil
}
// GetMigDevices gets the set of MIG devices associated with a top-level device.
// It collects every MIG device reported by VisitMigDevices.
func (d *device) GetMigDevices() ([]MigDevice, error) {
	var collected []MigDevice
	collect := func(_ int, mig MigDevice) error {
		collected = append(collected, mig)
		return nil
	}
	if err := d.VisitMigDevices(collect); err != nil {
		return nil, err
	}
	return collected, nil
}
// GetMigProfiles gets the set of unique MIG profiles associated with a
// top-level device. A non-nil result is cached on the device and returned
// directly by later calls.
func (d *device) GetMigProfiles() ([]MigProfile, error) {
	// Serve from the cache when it has been populated.
	if cached := d.migProfiles; cached != nil {
		return cached, nil
	}

	// Otherwise enumerate the profiles...
	var found []MigProfile
	collect := func(p MigProfile) error {
		found = append(found, p)
		return nil
	}
	if err := d.VisitMigProfiles(collect); err != nil {
		return nil, err
	}

	// ...and remember them for next time.
	d.migProfiles = found
	return found, nil
}
// isSkipped reports whether the device's name is present in the owning
// library's set of skipped devices.
func (d *device) isSkipped() (bool, error) {
	name, ret := d.GetName()
	if ret != nvml.SUCCESS {
		return false, fmt.Errorf("error getting device name: %v", ret)
	}
	_, skipped := d.lib.skippedDevices[name]
	return skipped, nil
}
// VisitDevices visits each top-level device and invokes a callback function
// for it.
//
// Devices whose name is in the skip list are not passed to the callback. The
// callback receives the NVML index of the device and its wrapper; iteration
// stops at the first error, which is wrapped so that callers can still match
// it with errors.Is / errors.As.
func (d *devicelib) VisitDevices(visit func(int, Device) error) error {
	count, ret := d.nvml.DeviceGetCount()
	if ret != nvml.SUCCESS {
		return fmt.Errorf("error getting device count: %v", ret)
	}

	for i := 0; i < count; i++ {
		handle, ret := d.nvml.DeviceGetHandleByIndex(i)
		if ret != nvml.SUCCESS {
			return fmt.Errorf("error getting device handle for index '%v': %v", i, ret)
		}

		dev, err := d.newDevice(handle)
		if err != nil {
			return fmt.Errorf("error creating new device wrapper: %w", err)
		}

		skip, err := dev.isSkipped()
		if err != nil {
			return fmt.Errorf("error checking whether device is skipped: %w", err)
		}
		if skip {
			continue
		}

		if err := visit(i, dev); err != nil {
			return fmt.Errorf("error visiting device: %w", err)
		}
	}
	return nil
}
// VisitMigDevices walks all top-level devices and invokes a callback function
// for each MIG device configured on them.
//
// The callback receives the index and wrapper of the parent device followed by
// the index and wrapper of the MIG device. Errors are wrapped at each level so
// that callers can still match the underlying error with errors.Is / errors.As.
func (d *devicelib) VisitMigDevices(visit func(int, Device, int, MigDevice) error) error {
	err := d.VisitDevices(func(i int, dev Device) error {
		err := dev.VisitMigDevices(func(j int, mig MigDevice) error {
			if err := visit(i, dev, j, mig); err != nil {
				return fmt.Errorf("error visiting MIG device: %w", err)
			}
			return nil
		})
		if err != nil {
			return fmt.Errorf("error visiting device: %w", err)
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("error visiting devices: %w", err)
	}
	return nil
}
// VisitMigProfiles walks all top-level devices and invokes a callback function
// for each unique MIG profile found on them.
//
// Uniqueness is decided by the profile's String() form: a profile is passed to
// the callback only the first time that string is seen, and it is recorded as
// seen only after the callback succeeded. Errors are wrapped at each level so
// that callers can still match the underlying error with errors.Is / errors.As.
func (d *devicelib) VisitMigProfiles(visit func(MigProfile) error) error {
	visited := make(map[string]bool)
	err := d.VisitDevices(func(i int, dev Device) error {
		err := dev.VisitMigProfiles(func(p MigProfile) error {
			// Render the key once instead of once per map access.
			key := p.String()
			if visited[key] {
				return nil
			}
			if err := visit(p); err != nil {
				return fmt.Errorf("error visiting MIG profile: %w", err)
			}
			visited[key] = true
			return nil
		})
		if err != nil {
			return fmt.Errorf("error visiting device: %w", err)
		}
		return nil
	})
	if err != nil {
		return fmt.Errorf("error visiting devices: %w", err)
	}
	return nil
}
// GetDevices gets the set of all top-level devices, i.e. every device that
// VisitDevices passes to its callback.
func (d *devicelib) GetDevices() ([]Device, error) {
	var collected []Device
	collect := func(_ int, dev Device) error {
		collected = append(collected, dev)
		return nil
	}
	if err := d.VisitDevices(collect); err != nil {
		return nil, err
	}
	return collected, nil
}
// GetMigDevices gets the set of MIG devices across all top-level devices.
func (d *devicelib) GetMigDevices() ([]MigDevice, error) {
	var collected []MigDevice
	collect := func(_ int, _ Device, _ int, mig MigDevice) error {
		collected = append(collected, mig)
		return nil
	}
	if err := d.VisitMigDevices(collect); err != nil {
		return nil, err
	}
	return collected, nil
}
// GetMigProfiles gets the set of unique MIG profiles across all top-level
// devices. A non-nil result is cached on the library and returned directly by
// later calls.
func (d *devicelib) GetMigProfiles() ([]MigProfile, error) {
	// Serve from the cache when it has been populated.
	if cached := d.migProfiles; cached != nil {
		return cached, nil
	}

	// Otherwise enumerate the profiles...
	var found []MigProfile
	collect := func(p MigProfile) error {
		found = append(found, p)
		return nil
	}
	if err := d.VisitMigProfiles(collect); err != nil {
		return nil, err
	}

	// ...and remember them for next time.
	d.migProfiles = found
	return found, nil
}
// hasSymbol checks to see if the given symbol is present in the NVML library.
// If devicelib is configured to not verify symbols, then all symbols are
// assumed to exist. A symbol counts as present when Lookup returns nil.
func (d *devicelib) hasSymbol(symbol string) bool {
	if verify := *d.verifySymbols; verify {
		return d.nvml.Lookup(symbol) == nil
	}
	return true
}

View File

@@ -0,0 +1,157 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package device
import (
"fmt"
"github.com/NVIDIA/go-nvlib/pkg/nvml"
)
// MigDevice defines the set of extended functions associated with a MIG device.
// It embeds nvml.Device and adds a lookup of the MIG profile the device was
// created from.
type MigDevice interface {
nvml.Device
GetProfile() (MigProfile, error)
}
// migdevice wraps an nvml.Device handle that refers to a MIG device.
type migdevice struct {
nvml.Device
// lib is the owning devicelib, used to construct the MIG profile.
lib *devicelib
// profile caches the result of GetProfile.
profile MigProfile
}
// Compile-time assertion that *migdevice satisfies MigDevice.
var _ MigDevice = &migdevice{}
// NewMigDevice builds a new MigDevice from an nvml.Device.
// It fails when the handle cannot be checked or does not refer to a MIG device.
func (d *devicelib) NewMigDevice(handle nvml.Device) (MigDevice, error) {
	isMig, ret := handle.IsMigDeviceHandle()
	switch {
	case ret != nvml.SUCCESS:
		return nil, fmt.Errorf("error checking if device is a MIG device: %v", ret)
	case !isMig:
		return nil, fmt.Errorf("not a MIG device")
	}
	return &migdevice{Device: handle, lib: d, profile: nil}, nil
}
// NewMigDeviceByUUID looks up the NVML handle for the given UUID and builds a
// MigDevice from it.
func (d *devicelib) NewMigDeviceByUUID(uuid string) (MigDevice, error) {
	handle, ret := d.nvml.DeviceGetHandleByUUID(uuid)
	if ret == nvml.SUCCESS {
		return d.NewMigDevice(handle)
	}
	return nil, fmt.Errorf("error getting device handle for uuid '%v': %v", uuid, ret)
}
// GetProfile returns the MIG profile associated with a MIG device.
//
// The profile is resolved by querying the parent device, the GPU Instance and
// the Compute Instance behind this MIG device, and then searching the profile
// index space for the entry whose GPU Instance and Compute Instance profile
// IDs match. The first successful result is cached on the device. An error
// from constructing the profile is wrapped so that callers can still match it
// with errors.Is / errors.As.
func (m *migdevice) GetProfile() (MigProfile, error) {
	if m.profile != nil {
		return m.profile, nil
	}

	parent, ret := m.Device.GetDeviceHandleFromMigDeviceHandle()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting parent device handle: %v", ret)
	}

	parentMemoryInfo, ret := parent.GetMemoryInfo()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting parent memory info: %v", ret)
	}

	attributes, ret := m.Device.GetAttributes()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting MIG device attributes: %v", ret)
	}

	giID, ret := m.Device.GetGpuInstanceId()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting MIG device GPU Instance ID: %v", ret)
	}

	ciID, ret := m.Device.GetComputeInstanceId()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting MIG device Compute Instance ID: %v", ret)
	}

	gi, ret := parent.GetGpuInstanceById(giID)
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting GPU Instance: %v", ret)
	}

	ci, ret := gi.GetComputeInstanceById(ciID)
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting Compute Instance: %v", ret)
	}

	giInfo, ret := gi.GetInfo()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting GPU Instance info: %v", ret)
	}

	ciInfo, ret := ci.GetInfo()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting Compute Instance info: %v", ret)
	}

	for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ {
		giProfileInfo, ret := parent.GetGpuInstanceProfileInfo(i)
		// Profile indices the device does not know about are skipped.
		if ret == nvml.ERROR_NOT_SUPPORTED || ret == nvml.ERROR_INVALID_ARGUMENT {
			continue
		}
		if ret != nvml.SUCCESS {
			return nil, fmt.Errorf("error getting GPU Instance profile info: %v", ret)
		}
		if giProfileInfo.Id != giInfo.ProfileId {
			continue
		}

		for j := 0; j < nvml.COMPUTE_INSTANCE_PROFILE_COUNT; j++ {
			for k := 0; k < nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT; k++ {
				ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo(j, k)
				if ret == nvml.ERROR_NOT_SUPPORTED || ret == nvml.ERROR_INVALID_ARGUMENT {
					continue
				}
				if ret != nvml.SUCCESS {
					return nil, fmt.Errorf("error getting Compute Instance profile info: %v", ret)
				}
				if ciProfileInfo.Id != ciInfo.ProfileId {
					continue
				}

				p, err := m.lib.NewMigProfile(i, j, k, attributes.MemorySizeMB, parentMemoryInfo.Total)
				if err != nil {
					return nil, fmt.Errorf("error creating MIG profile: %w", err)
				}

				m.profile = p
				return p, nil
			}
		}
	}

	return nil, fmt.Errorf("no matching profile IDs found")
}

View File

@@ -0,0 +1,331 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package device
import (
"fmt"
"math"
"sort"
"strconv"
"strings"
"github.com/NVIDIA/go-nvlib/pkg/nvml"
)
const (
// AttributeMediaExtensions holds the string representation for the media extension MIG profile attribute.
// It appears after the '+' in profile names such as "1c.1g.5gb+me".
AttributeMediaExtensions = "me"
)
// MigProfile represents a specific MIG profile.
// Examples include "1g.5gb", "2g.10gb", "1c.2g.10gb", or "1c.1g.5gb+me", etc.
//
// String renders the profile name, GetInfo exposes the underlying fields,
// Equals compares against another profile and Matches compares against a
// profile name.
type MigProfile interface {
String() string
GetInfo() MigProfileInfo
Equals(other MigProfile) bool
Matches(profile string) bool
}
// MigProfileInfo holds all info associated with a specific MIG profile
type MigProfileInfo struct {
// C is the compute slice count, rendered as "<C>c" in the profile name.
C int
// G is the GPU slice count, rendered as "<G>g" in the profile name.
G int
// GB is the memory size in GB, rendered as "<GB>gb" in the profile name.
GB int
// Attributes lists the profile attributes rendered after '+' (e.g. "me").
Attributes []string
// GIProfileID, CIProfileID and CIEngProfileID are the profile IDs the
// profile was constructed from.
GIProfileID int
CIProfileID int
CIEngProfileID int
}
// Compile-time assertion that *MigProfileInfo satisfies MigProfile.
var _ MigProfile = &MigProfileInfo{}
// NewMigProfile constructs a new MigProfile from the GPU Instance, Compute
// Instance and Compute Instance engine profile IDs used to create it.
//
// The slice counts are derived from the profile IDs, the media-extensions
// attribute is added for the *_REV1 GPU Instance profiles, and the memory size
// in GB is computed from the MIG memory size and the total device memory. An
// unknown GPU Instance or Compute Instance profile ID yields an error.
func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, migMemorySizeMB, deviceMemorySizeBytes uint64) (MigProfile, error) {
	var gpuSlices int
	switch giProfileID {
	case nvml.GPU_INSTANCE_PROFILE_1_SLICE,
		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2:
		gpuSlices = 1
	case nvml.GPU_INSTANCE_PROFILE_2_SLICE,
		nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
		gpuSlices = 2
	case nvml.GPU_INSTANCE_PROFILE_3_SLICE:
		gpuSlices = 3
	case nvml.GPU_INSTANCE_PROFILE_4_SLICE:
		gpuSlices = 4
	case nvml.GPU_INSTANCE_PROFILE_6_SLICE:
		gpuSlices = 6
	case nvml.GPU_INSTANCE_PROFILE_7_SLICE:
		gpuSlices = 7
	case nvml.GPU_INSTANCE_PROFILE_8_SLICE:
		gpuSlices = 8
	default:
		return nil, fmt.Errorf("invalid GPU Instance Profile ID: %v", giProfileID)
	}

	var computeSlices int
	switch ciProfileID {
	case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1:
		computeSlices = 1
	case nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE:
		computeSlices = 2
	case nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE:
		computeSlices = 3
	case nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE:
		computeSlices = 4
	case nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE:
		computeSlices = 6
	case nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE:
		computeSlices = 7
	case nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE:
		computeSlices = 8
	default:
		return nil, fmt.Errorf("invalid Compute Instance Profile ID: %v", ciProfileID)
	}

	// Only the REV1 variants of the 1- and 2-slice GPU Instance profiles
	// carry the media-extensions attribute.
	var attributes []string
	if giProfileID == nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1 ||
		giProfileID == nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1 {
		attributes = append(attributes, AttributeMediaExtensions)
	}

	memoryGB := int(getMigMemorySizeGB(deviceMemorySizeBytes, migMemorySizeMB))

	profile := &MigProfileInfo{
		C:              computeSlices,
		G:              gpuSlices,
		GB:             memoryGB,
		Attributes:     attributes,
		GIProfileID:    giProfileID,
		CIProfileID:    ciProfileID,
		CIEngProfileID: ciEngProfileID,
	}
	return profile, nil
}
// AssertValidMigProfileFormat checks if the string is in the proper format to
// represent a MIG profile. It returns the parse error, or nil when the string
// parses cleanly.
func (d *devicelib) AssertValidMigProfileFormat(profile string) error {
	if _, _, _, _, err := parseMigProfile(profile); err != nil {
		return err
	}
	return nil
}
// ParseMigProfile converts a string representation of a MigProfile into an
// object.
//
// The string is matched against the MIG profiles reported by GetMigProfiles,
// and the first profile that matches is returned. An error from enumerating
// the profiles is wrapped so that callers can still match it with errors.Is /
// errors.As. An error is also returned when no profile matches.
func (d *devicelib) ParseMigProfile(profile string) (MigProfile, error) {
	profiles, err := d.GetMigProfiles()
	if err != nil {
		return nil, fmt.Errorf("error getting list of possible MIG profiles: %w", err)
	}

	for _, p := range profiles {
		if p.Matches(profile) {
			return p, nil
		}
	}

	return nil, fmt.Errorf("unable to parse profile string into a valid profile")
}
// String returns the string representation of a Profile.
//
// The form is "<G>g.<GB>gb" when the compute and GPU slice counts are equal
// and "<C>c.<G>g.<GB>gb" otherwise. Attributes, if any, are appended after a
// '+' as a comma-separated list.
func (p MigProfileInfo) String() string {
	var b strings.Builder
	if p.C != p.G {
		fmt.Fprintf(&b, "%dc.", p.C)
	}
	fmt.Fprintf(&b, "%dg.%dgb", p.G, p.GB)
	if len(p.Attributes) > 0 {
		b.WriteString("+")
		b.WriteString(strings.Join(p.Attributes, ","))
	}
	return b.String()
}
// GetInfo returns detailed info about a Profile. The returned value is a copy
// of the receiver.
func (p MigProfileInfo) GetInfo() MigProfileInfo {
	info := p
	return info
}
// Equals checks if two Profiles are identical or not.
//
// The slice counts, the memory size and the three profile IDs are compared.
// The Attributes list is not part of the comparison.
func (p MigProfileInfo) Equals(other MigProfile) bool {
	o := other.GetInfo()
	return p.C == o.C &&
		p.G == o.G &&
		p.GB == o.GB &&
		p.GIProfileID == o.GIProfileID &&
		p.CIProfileID == o.CIProfileID &&
		p.CIEngProfileID == o.CIEngProfileID
}
// Matches checks if a MigProfile matches the string passed in.
//
// The string is parsed into its slice counts, memory size and attributes. It
// matches when all three numbers are equal and both sides carry the same set
// of attributes, irrespective of order. A string that does not parse never
// matches.
func (p MigProfileInfo) Matches(profile string) bool {
	c, g, gb, attrs, err := parseMigProfile(profile)
	if err != nil {
		return false
	}
	if c != p.C || g != p.G || gb != p.GB {
		return false
	}
	if len(attrs) != len(p.Attributes) {
		return false
	}

	// Sort a private copy: p is passed by value, but its Attributes slice
	// still shares a backing array with the caller's profile, so sorting it in
	// place would silently reorder the caller's data.
	want := append([]string(nil), p.Attributes...)
	sort.Strings(want)
	sort.Strings(attrs)
	for i := range want {
		if want[i] != attrs[i] {
			return false
		}
	}
	return true
}
// parseMigProfile splits a profile string into its compute slice count, GPU
// slice count, memory size in GB and attribute list.
//
// Everything before the first '+' is parsed as the numeric fields; everything
// after it, if present, is parsed as the attribute list. On failure the three
// numbers are -1 and the attribute list is nil.
func parseMigProfile(profile string) (int, int, int, []string, error) {
	// The empty string cannot be parsed.
	if profile == "" {
		return -1, -1, -1, nil, fmt.Errorf("profile is the empty string")
	}

	// Separate the numeric fields from the optional attribute list.
	parts := strings.SplitN(profile, "+", 2)

	c, g, gb, err := parseMigProfileFields(parts[0])
	if err != nil {
		return -1, -1, -1, nil, fmt.Errorf("cannot parse fields of '%v': %v", profile, err)
	}

	// Attributes are only present when there was a '+'.
	var attrs []string
	if len(parts) > 1 {
		attrs, err = parseMigProfileAttributes(parts[1])
		if err != nil {
			return -1, -1, -1, nil, fmt.Errorf("cannot parse attributes of '%v': %v", profile, err)
		}
	}

	return c, g, gb, attrs, nil
}
// parseMigProfileField parses one "<number><field>" component of a MIG profile
// string (for example "2g" with field "g") and returns the number.
//
// The component must carry no surrounding whitespace, must end in field, and
// the part before field must be an unsigned decimal number. On failure the
// returned value is -1.
func parseMigProfileField(s string, field string) (int, error) {
	if strings.TrimSpace(s) != s {
		return -1, fmt.Errorf("leading or trailing spaces on '%%d%s'", field)
	}

	if !strings.HasSuffix(s, field) {
		return -1, fmt.Errorf("missing '%s' from '%%d%s'", field, field)
	}

	digits := strings.TrimSuffix(s, field)
	// strconv.Atoi accepts an explicit sign, but a slice count or memory size
	// is never signed: "-1g" or "+1g" is not a valid profile component.
	if strings.HasPrefix(digits, "-") || strings.HasPrefix(digits, "+") {
		return -1, fmt.Errorf("malformed number in '%%d%s'", field)
	}

	v, err := strconv.Atoi(digits)
	if err != nil {
		return -1, fmt.Errorf("malformed number in '%%d%s'", field)
	}

	return v, nil
}
// parseMigProfileFields parses the numeric part of a MIG profile string.
//
// Two forms are accepted: "<c>c.<g>g.<gb>gb" and the short form "<g>g.<gb>gb",
// in which the compute slice count equals the GPU slice count. The result is
// (c, g, gb); on failure all three are -1.
func parseMigProfileFields(s string) (int, int, int, error) {
	parts := strings.SplitN(s, ".", 3)

	var cPart, gPart, gbPart string
	switch len(parts) {
	case 3:
		cPart, gPart, gbPart = parts[0], parts[1], parts[2]
	case 2:
		gPart, gbPart = parts[0], parts[1]
	default:
		return -1, -1, -1, fmt.Errorf("parsed wrong number of fields, expected 2 or 3")
	}

	explicitC := len(parts) == 3
	var c int
	if explicitC {
		parsed, err := parseMigProfileField(cPart, "c")
		if err != nil {
			return -1, -1, -1, err
		}
		c = parsed
	}

	g, err := parseMigProfileField(gPart, "g")
	if err != nil {
		return -1, -1, -1, err
	}

	gb, err := parseMigProfileField(gbPart, "gb")
	if err != nil {
		return -1, -1, -1, err
	}

	// The short form implies as many compute slices as GPU slices.
	if !explicitC {
		c = g
	}
	return c, g, gb, nil
}
// parseMigProfileAttributes parses the comma-separated attribute list that
// follows the '+' in a MIG profile string.
//
// Every attribute must be non-empty, unique within the list, free of
// surrounding whitespace, must not begin with a digit, and may contain only
// ASCII letters and digits. The attributes are returned in input order.
func parseMigProfileAttributes(s string) ([]string, error) {
	// strings.Split with a non-empty separator always yields at least one
	// element, so an empty input shows up as a single empty attribute below.
	attr := strings.Split(s, ",")

	seen := make(map[string]struct{}, len(attr))
	for _, a := range attr {
		if _, dup := seen[a]; dup {
			return nil, fmt.Errorf("non unique attribute in list")
		}
		if a == "" {
			return nil, fmt.Errorf("empty attribute in list")
		}
		if strings.TrimSpace(a) != a {
			return nil, fmt.Errorf("leading or trailing spaces in attribute")
		}
		if a[0] >= '0' && a[0] <= '9' {
			return nil, fmt.Errorf("attribute begins with a number")
		}
		for _, c := range a {
			isLower := c >= 'a' && c <= 'z'
			isUpper := c >= 'A' && c <= 'Z'
			isDigit := c >= '0' && c <= '9'
			if !isLower && !isUpper && !isDigit {
				return nil, fmt.Errorf("non alpha-numeric character or digit in attribute")
			}
		}
		seen[a] = struct{}{}
	}
	return attr, nil
}
// getMigMemorySizeGB computes the memory size, in GB, used in a MIG profile
// name.
//
// totalDeviceMemory is the parent device's memory in bytes and migMemorySizeMB
// is the MIG device's memory in MB. The MIG share of the device memory is
// rounded up to the next multiple of 1/8, then multiplied by the device memory
// rounded up to whole GB, and the product is rounded to the nearest integer.
// A device memory of 0 yields 0.
func getMigMemorySizeGB(totalDeviceMemory, migMemorySizeMB uint64) uint64 {
	const (
		fracDenominator = 8
		oneMB           = 1024 * 1024
		oneGB           = 1024 * 1024 * 1024
	)

	// Without this guard the division below produces Inf/NaN, and converting
	// NaN to uint64 has an implementation-defined result.
	if totalDeviceMemory == 0 {
		return 0
	}

	fractionalGpuMem := (float64(migMemorySizeMB) * oneMB) / float64(totalDeviceMemory)
	fractionalGpuMem = math.Ceil(fractionalGpuMem*fracDenominator) / fracDenominator
	totalMemGB := float64((totalDeviceMemory + oneGB - 1) / oneGB)
	return uint64(math.Round(fractionalGpuMem * totalMemGB))
}

View File

@@ -0,0 +1,102 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package info
import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/NVIDIA/go-nvml/pkg/dl"
)
// Interface provides the API to the info package.
// Each method returns a detection result together with a human-readable
// reason describing how the result was reached.
type Interface interface {
HasDXCore() (bool, string)
HasNvml() (bool, string)
IsTegraSystem() (bool, string)
}
// infolib is the implementation of Interface in this package.
type infolib struct {
// root is the directory that the Tegra detection file paths are joined onto.
root string
}
// Compile-time assertion that *infolib satisfies Interface.
var _ Interface = &infolib{}
// HasDXCore returns true if DXCore is detected on the system, i.e. if the
// libdxcore.so library can be loaded. The string explains the result.
func (i *infolib) HasDXCore() (bool, string) {
	const libraryName = "libdxcore.so"

	err := assertHasLibrary(libraryName)
	if err == nil {
		return true, "found DXCore library"
	}
	return false, fmt.Sprintf("could not load DXCore library: %v", err)
}
// HasNvml returns true if NVML is detected on the system, i.e. if the
// libnvidia-ml.so.1 library can be loaded. The string explains the result.
func (i *infolib) HasNvml() (bool, string) {
	const libraryName = "libnvidia-ml.so.1"

	err := assertHasLibrary(libraryName)
	if err == nil {
		return true, "found NVML library"
	}
	return false, fmt.Sprintf("could not load NVML library: %v", err)
}
// IsTegraSystem returns true if the system is detected as a Tegra-based system.
//
// Two indicators below the configured root are consulted in order: the
// presence of the regular file /etc/nv_tegra_release, and otherwise whether
// the contents of /sys/devices/soc0/family start with "tegra"
// (case-insensitive). The string explains the result.
func (i *infolib) IsTegraSystem() (bool, string) {
	releasePath := filepath.Join(i.root, "/etc/nv_tegra_release")
	familyPath := filepath.Join(i.root, "/sys/devices/soc0/family")

	if st, err := os.Stat(releasePath); err == nil && !st.IsDir() {
		return true, fmt.Sprintf("%v found", releasePath)
	}

	st, err := os.Stat(familyPath)
	if err != nil || st.IsDir() {
		return false, fmt.Sprintf("%v file not found", familyPath)
	}

	raw, err := os.ReadFile(familyPath)
	if err != nil {
		return false, fmt.Sprintf("could not read %v", familyPath)
	}

	family := strings.ToLower(string(raw))
	if !strings.HasPrefix(family, "tegra") {
		return false, fmt.Sprintf("%v has no 'tegra' prefix", familyPath)
	}
	return true, fmt.Sprintf("%v has 'tegra' prefix", familyPath)
}
// assertHasLibrary returns an error if the specified library cannot be loaded.
// The library is opened with RTLD_LAZY and closed again before returning.
func assertHasLibrary(libraryName string) error {
	const libraryLoadFlags = dl.RTLD_LAZY

	lib := dl.New(libraryName, libraryLoadFlags)
	err := lib.Open()
	if err != nil {
		return err
	}
	// Only loadability matters here; release the library again right away.
	lib.Close()
	return nil
}

View File

@@ -0,0 +1,39 @@
/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package info
// Option defines a function for passing options to the New() call.
// Each Option receives the infolib being constructed and modifies it in place.
type Option func(*infolib)
// New creates a new instance of the 'info' interface.
// Options are applied in the order given; an empty root defaults to "/".
func New(opts ...Option) Interface {
	lib := new(infolib)
	for idx := range opts {
		opts[idx](lib)
	}
	if lib.root == "" {
		lib.root = "/"
	}
	return lib
}
// WithRoot returns an Option that sets the root of the 'info' interface.
func WithRoot(root string) Option {
	setRoot := func(lib *infolib) {
		lib.root = root
	}
	return setRoot
}