mirror of
https://github.com/clearml/go-nvlib
synced 2025-02-07 05:17:43 +00:00
Update mig-profile parsing / name generation after go-nvml v12.0 bump
Signed-off-by: Kevin Klues <kklues@nvidia.com>
This commit is contained in:
parent
1f178b880d
commit
642041d1e0
@ -39,6 +39,7 @@ type Interface interface {
|
||||
type devicelib struct {
|
||||
nvml nvml.Interface
|
||||
skippedDevices map[string]struct{}
|
||||
verifySymbols *bool
|
||||
}
|
||||
|
||||
var _ Interface = &devicelib{}
|
||||
@ -52,6 +53,10 @@ func New(opts ...Option) Interface {
|
||||
if d.nvml == nil {
|
||||
d.nvml = nvml.New()
|
||||
}
|
||||
if d.verifySymbols == nil {
|
||||
verify := true
|
||||
d.verifySymbols = &verify
|
||||
}
|
||||
if d.skippedDevices == nil {
|
||||
WithSkippedDevices(
|
||||
"DGX Display",
|
||||
@ -68,6 +73,13 @@ func WithNvml(nvml nvml.Interface) Option {
|
||||
}
|
||||
}
|
||||
|
||||
// WithVerifySymbols provides an option to toggle whether to verify select symbols exist in dynamic libraries before calling them
|
||||
func WithVerifySymbols(verify bool) Option {
|
||||
return func(d *devicelib) {
|
||||
d.verifySymbols = &verify
|
||||
}
|
||||
}
|
||||
|
||||
// WithSkippedDevices provides an Option to set devices to be skipped by model name
|
||||
func WithSkippedDevices(names ...string) Option {
|
||||
return func(d *devicelib) {
|
||||
|
@ -36,7 +36,8 @@ type Device interface {
|
||||
|
||||
type device struct {
|
||||
nvml.Device
|
||||
lib *devicelib
|
||||
lib *devicelib
|
||||
migProfiles []MigProfile
|
||||
}
|
||||
|
||||
var _ Device = &device{}
|
||||
@ -57,12 +58,12 @@ func (d *devicelib) NewDeviceByUUID(uuid string) (Device, error) {
|
||||
|
||||
// newDevice creates a device from an nvml.Device
|
||||
func (d *devicelib) newDevice(dev nvml.Device) (*device, error) {
|
||||
return &device{dev, d}, nil
|
||||
return &device{dev, d, nil}, nil
|
||||
}
|
||||
|
||||
// IsMigCapable checks if a device is capable of having MIG partitions created on it
|
||||
func (d *device) IsMigCapable() (bool, error) {
|
||||
err := nvmlLookupSymbol("nvmlDeviceGetMigMode")
|
||||
err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode")
|
||||
if err != nil {
|
||||
return false, nil
|
||||
}
|
||||
@ -80,7 +81,7 @@ func (d *device) IsMigCapable() (bool, error) {
|
||||
|
||||
// IsMigEnabled checks if a device has MIG mode currently enabled on it
|
||||
func (d *device) IsMigEnabled() (bool, error) {
|
||||
err := nvmlLookupSymbol("nvmlDeviceGetMigMode")
|
||||
err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode")
|
||||
if err != nil {
|
||||
return false, nil
|
||||
}
|
||||
@ -161,6 +162,20 @@ func (d *device) VisitMigProfiles(visit func(MigProfile) error) error {
|
||||
return fmt.Errorf("error creating MIG profile: %v", err)
|
||||
}
|
||||
|
||||
// NOTE: The NVML API doesn't currently let us query the set of
|
||||
// valid Compute Instance profiles without first instantiating
|
||||
// a GPU Instance to check against. In theory, it should be
|
||||
// possible to get this information without a reference to a
|
||||
// GPU instance, but no API is provided for that at the moment.
|
||||
// We run the checks below to weed out invalid profiles
|
||||
// heuristically, given what we know about how they are
|
||||
// physically constructed. In the future we should do this via
|
||||
// NVML once a proper API for this exists.
|
||||
pi := p.GetInfo()
|
||||
if (pi.C * 2) > (pi.G + 1) {
|
||||
continue
|
||||
}
|
||||
|
||||
err = visit(p)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error visiting MIG profile: %v", err)
|
||||
@ -186,6 +201,12 @@ func (d *device) GetMigDevices() ([]MigDevice, error) {
|
||||
|
||||
// GetMigProfiles gets the set of unique MIG profiles associated with a top-level device
|
||||
func (d *device) GetMigProfiles() ([]MigProfile, error) {
|
||||
// Return the cached list if available
|
||||
if d.migProfiles != nil {
|
||||
return d.migProfiles, nil
|
||||
}
|
||||
|
||||
// Otherwise generate it...
|
||||
var profiles []MigProfile
|
||||
err := d.VisitMigProfiles(func(p MigProfile) error {
|
||||
profiles = append(profiles, p)
|
||||
@ -194,6 +215,9 @@ func (d *device) GetMigProfiles() ([]MigProfile, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// And cache it before returning
|
||||
d.migProfiles = profiles
|
||||
return profiles, nil
|
||||
}
|
||||
|
||||
@ -333,7 +357,13 @@ func (d *devicelib) GetMigProfiles() ([]MigProfile, error) {
|
||||
}
|
||||
|
||||
// nvmlLookupSymbol checks to see if the given symbol is present in the NVML library
|
||||
func nvmlLookupSymbol(symbol string) error {
|
||||
func (d *devicelib) nvmlLookupSymbol(symbol string) error {
|
||||
// If devicelib is configured to not verify symbols, then we short-circuit here
|
||||
if !*d.verifySymbols {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Otherwise we lookup the provided symbol and verify it is available
|
||||
lib := dl.New("libnvidia-ml.so.1", dl.RTLD_LAZY|dl.RTLD_GLOBAL)
|
||||
if lib == nil {
|
||||
return fmt.Errorf("error instantiating DynamicLibrary for NVML")
|
||||
|
@ -19,6 +19,7 @@ package device
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
@ -36,6 +37,7 @@ type MigProfile interface {
|
||||
String() string
|
||||
GetInfo() MigProfileInfo
|
||||
Equals(other MigProfile) bool
|
||||
Matches(profile string) bool
|
||||
}
|
||||
|
||||
// MigProfileInfo holds all info associated with a specific MIG profile
|
||||
@ -55,11 +57,12 @@ var _ MigProfile = &MigProfileInfo{}
|
||||
func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, migMemorySizeMB, deviceMemorySizeBytes uint64) (MigProfile, error) {
|
||||
giSlices := 0
|
||||
switch giProfileID {
|
||||
case nvml.GPU_INSTANCE_PROFILE_1_SLICE:
|
||||
case nvml.GPU_INSTANCE_PROFILE_1_SLICE,
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2:
|
||||
giSlices = 1
|
||||
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1:
|
||||
giSlices = 1
|
||||
case nvml.GPU_INSTANCE_PROFILE_2_SLICE:
|
||||
case nvml.GPU_INSTANCE_PROFILE_2_SLICE,
|
||||
nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
|
||||
giSlices = 2
|
||||
case nvml.GPU_INSTANCE_PROFILE_3_SLICE:
|
||||
giSlices = 3
|
||||
@ -77,7 +80,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,
|
||||
|
||||
ciSlices := 0
|
||||
switch ciProfileID {
|
||||
case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE:
|
||||
case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1:
|
||||
ciSlices = 1
|
||||
case nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE:
|
||||
ciSlices = 2
|
||||
@ -97,7 +101,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,
|
||||
|
||||
var attrs []string
|
||||
switch giProfileID {
|
||||
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1:
|
||||
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
|
||||
nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
|
||||
attrs = append(attrs, AttributeMediaExtensions)
|
||||
}
|
||||
|
||||
@ -116,84 +121,18 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,
|
||||
|
||||
// ParseMigProfile converts a string representation of a MigProfile into an object
|
||||
func (d *devicelib) ParseMigProfile(profile string) (MigProfile, error) {
|
||||
var err error
|
||||
var c, g, gb int
|
||||
var attrs []string
|
||||
|
||||
if len(profile) == 0 {
|
||||
return nil, fmt.Errorf("empty Profile string")
|
||||
}
|
||||
|
||||
split := strings.SplitN(profile, "+", 2)
|
||||
if len(split) == 2 {
|
||||
attrs, err = parseMigProfileAttributes(split[1])
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing attributes following '+' in Profile string: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
c, g, gb, err = parseMigProfileFields(split[0])
|
||||
profiles, err := d.GetMigProfiles()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error parsing '.' separated fields in Profile string: %v", err)
|
||||
return nil, fmt.Errorf("error getting list of possible MIG profiles: %v", err)
|
||||
}
|
||||
|
||||
p := &MigProfileInfo{
|
||||
C: c,
|
||||
G: g,
|
||||
GB: gb,
|
||||
Attributes: attrs,
|
||||
}
|
||||
|
||||
switch c {
|
||||
case 1:
|
||||
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
|
||||
case 2:
|
||||
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
|
||||
case 3:
|
||||
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
|
||||
case 4:
|
||||
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
|
||||
case 6:
|
||||
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
|
||||
case 7:
|
||||
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
|
||||
case 8:
|
||||
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown Compute Instance slice size: %v", c)
|
||||
}
|
||||
|
||||
switch g {
|
||||
case 1:
|
||||
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE
|
||||
case 2:
|
||||
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_2_SLICE
|
||||
case 3:
|
||||
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_3_SLICE
|
||||
case 4:
|
||||
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_4_SLICE
|
||||
case 6:
|
||||
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_6_SLICE
|
||||
case 7:
|
||||
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_7_SLICE
|
||||
case 8:
|
||||
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_8_SLICE
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown GPU Instance slice size: %v", g)
|
||||
}
|
||||
|
||||
p.CIEngProfileID = nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED
|
||||
|
||||
for _, a := range attrs {
|
||||
switch a {
|
||||
case AttributeMediaExtensions:
|
||||
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown Profile attribute: %v", a)
|
||||
for _, p := range profiles {
|
||||
if p.Matches(profile) {
|
||||
return p, nil
|
||||
}
|
||||
}
|
||||
|
||||
return p, nil
|
||||
return nil, fmt.Errorf("unable to parse profile string into a valid profile")
|
||||
}
|
||||
|
||||
// String returns the string representation of a Profile
|
||||
@ -240,6 +179,55 @@ func (p *MigProfileInfo) Equals(other MigProfile) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// Matches checks if a MigProfile matches the string passed in
|
||||
func (p *MigProfileInfo) Matches(profile string) bool {
|
||||
// If we are handed the empty string, there is nothing to check
|
||||
if profile == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
// Split by + to separate out attributes
|
||||
split := strings.SplitN(profile, "+", 2)
|
||||
|
||||
// Check to make sure the c, g, and gb values match
|
||||
c, g, gb, err := parseMigProfileFields(split[0])
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
if c != p.C {
|
||||
return false
|
||||
}
|
||||
if g != p.G {
|
||||
return false
|
||||
}
|
||||
if gb != p.GB {
|
||||
return false
|
||||
}
|
||||
|
||||
// If we have no attributes we are done
|
||||
if len(split) == 1 {
|
||||
return true
|
||||
}
|
||||
|
||||
// Make sure we have the same set of attributes
|
||||
attrs, err := parseMigProfileAttributes(split[1])
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
if len(attrs) != len(p.Attributes) {
|
||||
return false
|
||||
}
|
||||
sort.Strings(attrs)
|
||||
sort.Strings(p.Attributes)
|
||||
for i, a := range p.Attributes {
|
||||
if a != attrs[i] {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func parseMigProfileField(s string, field string) (int, error) {
|
||||
if strings.TrimSpace(s) != s {
|
||||
return -1, fmt.Errorf("leading or trailing spaces on '%%d%s'", field)
|
||||
|
@ -21,6 +21,7 @@ import (
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||
)
|
||||
|
||||
func TestParseMigProfile(t *testing.T) {
|
||||
@ -256,7 +257,55 @@ func TestParseMigProfile(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
d := New()
|
||||
mockDevice := &nvml.DeviceMock{
|
||||
GetNameFunc: func() (string, nvml.Return) {
|
||||
return "MockDevice", nvml.SUCCESS
|
||||
},
|
||||
GetMigModeFunc: func() (int, int, nvml.Return) {
|
||||
return nvml.DEVICE_MIG_ENABLE, nvml.DEVICE_MIG_ENABLE, nvml.SUCCESS
|
||||
},
|
||||
GetMemoryInfoFunc: func() (nvml.Memory, nvml.Return) {
|
||||
memory := nvml.Memory{
|
||||
Total: 40 * 1024 * 1024 * 1024,
|
||||
}
|
||||
return memory, nvml.SUCCESS
|
||||
},
|
||||
GetGpuInstanceProfileInfoFunc: func(Profile int) (nvml.GpuInstanceProfileInfo, nvml.Return) {
|
||||
info := nvml.GpuInstanceProfileInfo{}
|
||||
switch Profile {
|
||||
case nvml.GPU_INSTANCE_PROFILE_1_SLICE,
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1:
|
||||
info.MemorySizeMB = 5 * 1024
|
||||
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2:
|
||||
info.MemorySizeMB = 10 * 1024
|
||||
case nvml.GPU_INSTANCE_PROFILE_2_SLICE,
|
||||
nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
|
||||
info.MemorySizeMB = 10 * 1024
|
||||
case nvml.GPU_INSTANCE_PROFILE_3_SLICE:
|
||||
info.MemorySizeMB = 20 * 1024
|
||||
case nvml.GPU_INSTANCE_PROFILE_4_SLICE:
|
||||
info.MemorySizeMB = 20 * 1024
|
||||
case nvml.GPU_INSTANCE_PROFILE_7_SLICE:
|
||||
info.MemorySizeMB = 40 * 1024
|
||||
case nvml.GPU_INSTANCE_PROFILE_6_SLICE,
|
||||
nvml.GPU_INSTANCE_PROFILE_8_SLICE:
|
||||
fallthrough
|
||||
default:
|
||||
return info, nvml.ERROR_NOT_SUPPORTED
|
||||
}
|
||||
return info, nvml.SUCCESS
|
||||
},
|
||||
}
|
||||
mockNvml := &nvml.InterfaceMock{
|
||||
DeviceGetCountFunc: func() (int, nvml.Return) {
|
||||
return 1, nvml.SUCCESS
|
||||
},
|
||||
DeviceGetHandleByIndexFunc: func(Index int) (nvml.Device, nvml.Return) {
|
||||
return mockDevice, nvml.SUCCESS
|
||||
},
|
||||
}
|
||||
|
||||
d := New(WithNvml(mockNvml), WithVerifySymbols(false))
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
_, err := d.ParseMigProfile(tc.device)
|
||||
|
@ -65,19 +65,22 @@ const (
|
||||
GPU_INSTANCE_PROFILE_7_SLICE = nvml.GPU_INSTANCE_PROFILE_7_SLICE
|
||||
GPU_INSTANCE_PROFILE_8_SLICE = nvml.GPU_INSTANCE_PROFILE_8_SLICE
|
||||
GPU_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1
|
||||
GPU_INSTANCE_PROFILE_1_SLICE_REV2 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2
|
||||
GPU_INSTANCE_PROFILE_2_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1
|
||||
GPU_INSTANCE_PROFILE_COUNT = nvml.GPU_INSTANCE_PROFILE_COUNT
|
||||
)
|
||||
|
||||
// Compute Instance Profiles
|
||||
const (
|
||||
COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT
|
||||
COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1
|
||||
COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT
|
||||
)
|
||||
|
||||
// Compute Instance Engine Profiles
|
||||
|
Loading…
Reference in New Issue
Block a user