Update mig-profile parsing / name generation after go-nvml v12.0 bump

Signed-off-by: Kevin Klues <kklues@nvidia.com>
This commit is contained in:
Kevin Klues 2023-03-23 11:47:12 +00:00
parent 1f178b880d
commit 642041d1e0
5 changed files with 174 additions and 92 deletions

View File

@ -39,6 +39,7 @@ type Interface interface {
// devicelib is the concrete type behind Interface. Its fields are populated
// through Option functions; New fills in defaults for any that are left unset.
type devicelib struct {
// nvml is the NVML binding used by the library. New falls back to nvml.New()
// when no binding was supplied via WithNvml.
nvml nvml.Interface
// skippedDevices holds the model names of devices to be skipped (see
// WithSkippedDevices). The map is used as a set, hence the empty struct values.
skippedDevices map[string]struct{}
// verifySymbols toggles whether select symbols are verified to exist in the
// dynamic library before they are called. It is a pointer so that "unset"
// (nil) can be told apart from an explicit false; New defaults it to true.
verifySymbols *bool
}
var _ Interface = &devicelib{}
@ -52,6 +53,10 @@ func New(opts ...Option) Interface {
if d.nvml == nil {
d.nvml = nvml.New()
}
if d.verifySymbols == nil {
verify := true
d.verifySymbols = &verify
}
if d.skippedDevices == nil {
WithSkippedDevices(
"DGX Display",
@ -68,6 +73,13 @@ func WithNvml(nvml nvml.Interface) Option {
}
}
// WithVerifySymbols returns an Option that switches symbol verification on or
// off, i.e. whether select symbols are checked for existence in dynamic
// libraries before they are called.
func WithVerifySymbols(verify bool) Option {
	// Keep a dedicated copy of the flag so the option can store its address.
	enabled := verify
	return func(lib *devicelib) {
		lib.verifySymbols = &enabled
	}
}
// WithSkippedDevices provides an Option to set devices to be skipped by model name
func WithSkippedDevices(names ...string) Option {
return func(d *devicelib) {

View File

@ -36,7 +36,8 @@ type Device interface {
type device struct {
nvml.Device
lib *devicelib
lib *devicelib
migProfiles []MigProfile
}
var _ Device = &device{}
@ -57,12 +58,12 @@ func (d *devicelib) NewDeviceByUUID(uuid string) (Device, error) {
// newDevice creates a device from an nvml.Device
func (d *devicelib) newDevice(dev nvml.Device) (*device, error) {
return &device{dev, d}, nil
return &device{dev, d, nil}, nil
}
// IsMigCapable checks if a device is capable of having MIG partitions created on it
func (d *device) IsMigCapable() (bool, error) {
err := nvmlLookupSymbol("nvmlDeviceGetMigMode")
err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode")
if err != nil {
return false, nil
}
@ -80,7 +81,7 @@ func (d *device) IsMigCapable() (bool, error) {
// IsMigEnabled checks if a device has MIG mode currently enabled on it
func (d *device) IsMigEnabled() (bool, error) {
err := nvmlLookupSymbol("nvmlDeviceGetMigMode")
err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode")
if err != nil {
return false, nil
}
@ -161,6 +162,20 @@ func (d *device) VisitMigProfiles(visit func(MigProfile) error) error {
return fmt.Errorf("error creating MIG profile: %v", err)
}
// NOTE: The NVML API doesn't currently let us query the set of
// valid Compute Instance profiles without first instantiating
// a GPU Instance to check against. In theory, it should be
// possible to get this information without a reference to a
// GPU instance, but no API is provided for that at the moment.
// We run the checks below to weed out invalid profiles
// heuristically, given what we know about how they are
// physically constructed. In the future we should do this via
// NVML once a proper API for this exists.
pi := p.GetInfo()
if (pi.C * 2) > (pi.G + 1) {
continue
}
err = visit(p)
if err != nil {
return fmt.Errorf("error visiting MIG profile: %v", err)
@ -186,6 +201,12 @@ func (d *device) GetMigDevices() ([]MigDevice, error) {
// GetMigProfiles gets the set of unique MIG profiles associated with a top-level device
func (d *device) GetMigProfiles() ([]MigProfile, error) {
// Return the cached list if available
if d.migProfiles != nil {
return d.migProfiles, nil
}
// Otherwise generate it...
var profiles []MigProfile
err := d.VisitMigProfiles(func(p MigProfile) error {
profiles = append(profiles, p)
@ -194,6 +215,9 @@ func (d *device) GetMigProfiles() ([]MigProfile, error) {
if err != nil {
return nil, err
}
// And cache it before returning
d.migProfiles = profiles
return profiles, nil
}
@ -333,7 +357,13 @@ func (d *devicelib) GetMigProfiles() ([]MigProfile, error) {
}
// nvmlLookupSymbol checks to see if the given symbol is present in the NVML library
func nvmlLookupSymbol(symbol string) error {
func (d *devicelib) nvmlLookupSymbol(symbol string) error {
// If devicelib is configured to not verify symbols, then we short-circuit here
if !*d.verifySymbols {
return nil
}
// Otherwise we look up the provided symbol and verify it is available
lib := dl.New("libnvidia-ml.so.1", dl.RTLD_LAZY|dl.RTLD_GLOBAL)
if lib == nil {
return fmt.Errorf("error instantiating DynamicLibrary for NVML")

View File

@ -19,6 +19,7 @@ package device
import (
"fmt"
"math"
"sort"
"strconv"
"strings"
@ -36,6 +37,7 @@ type MigProfile interface {
String() string
GetInfo() MigProfileInfo
Equals(other MigProfile) bool
Matches(profile string) bool
}
// MigProfileInfo holds all info associated with a specific MIG profile
@ -55,11 +57,12 @@ var _ MigProfile = &MigProfileInfo{}
func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, migMemorySizeMB, deviceMemorySizeBytes uint64) (MigProfile, error) {
giSlices := 0
switch giProfileID {
case nvml.GPU_INSTANCE_PROFILE_1_SLICE:
case nvml.GPU_INSTANCE_PROFILE_1_SLICE,
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2:
giSlices = 1
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1:
giSlices = 1
case nvml.GPU_INSTANCE_PROFILE_2_SLICE:
case nvml.GPU_INSTANCE_PROFILE_2_SLICE,
nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
giSlices = 2
case nvml.GPU_INSTANCE_PROFILE_3_SLICE:
giSlices = 3
@ -77,7 +80,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,
ciSlices := 0
switch ciProfileID {
case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE:
case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1:
ciSlices = 1
case nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE:
ciSlices = 2
@ -97,7 +101,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,
var attrs []string
switch giProfileID {
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1:
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
attrs = append(attrs, AttributeMediaExtensions)
}
@ -116,84 +121,18 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,
// ParseMigProfile converts a string representation of a MigProfile into an object
func (d *devicelib) ParseMigProfile(profile string) (MigProfile, error) {
var err error
var c, g, gb int
var attrs []string
if len(profile) == 0 {
return nil, fmt.Errorf("empty Profile string")
}
split := strings.SplitN(profile, "+", 2)
if len(split) == 2 {
attrs, err = parseMigProfileAttributes(split[1])
if err != nil {
return nil, fmt.Errorf("error parsing attributes following '+' in Profile string: %v", err)
}
}
c, g, gb, err = parseMigProfileFields(split[0])
profiles, err := d.GetMigProfiles()
if err != nil {
return nil, fmt.Errorf("error parsing '.' separated fields in Profile string: %v", err)
return nil, fmt.Errorf("error getting list of possible MIG profiles: %v", err)
}
p := &MigProfileInfo{
C: c,
G: g,
GB: gb,
Attributes: attrs,
}
switch c {
case 1:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
case 2:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
case 3:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
case 4:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
case 6:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
case 7:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
case 8:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
default:
return nil, fmt.Errorf("unknown Compute Instance slice size: %v", c)
}
switch g {
case 1:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE
case 2:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_2_SLICE
case 3:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_3_SLICE
case 4:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_4_SLICE
case 6:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_6_SLICE
case 7:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_7_SLICE
case 8:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_8_SLICE
default:
return nil, fmt.Errorf("unknown GPU Instance slice size: %v", g)
}
p.CIEngProfileID = nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED
for _, a := range attrs {
switch a {
case AttributeMediaExtensions:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1
default:
return nil, fmt.Errorf("unknown Profile attribute: %v", a)
for _, p := range profiles {
if p.Matches(profile) {
return p, nil
}
}
return p, nil
return nil, fmt.Errorf("unable to parse profile string into a valid profile")
}
// String returns the string representation of a Profile
@ -240,6 +179,55 @@ func (p *MigProfileInfo) Equals(other MigProfile) bool {
return false
}
// Matches checks if a MigProfile matches the string passed in.
//
// The string is split once on '+'. The part before the '+' is handed to
// parseMigProfileFields and must yield the same c, g and gb values as the
// profile. The part after the '+' (if any) is handed to
// parseMigProfileAttributes and must yield exactly the same set of attributes
// as the profile, irrespective of order. A string without a '+' section is
// treated as having no attributes, so it only matches a profile that has none.
//
// The empty string and any string that fails to parse never match. The
// receiver is not modified.
func (p *MigProfileInfo) Matches(profile string) bool {
	// If we are handed the empty string, there is nothing to check
	if profile == "" {
		return false
	}

	// Split by + to separate out attributes
	split := strings.SplitN(profile, "+", 2)

	// Check to make sure the c, g, and gb values match
	c, g, gb, err := parseMigProfileFields(split[0])
	if err != nil {
		return false
	}
	if c != p.C || g != p.G || gb != p.GB {
		return false
	}

	// Parse the attributes, if any were given. With no '+' section attrs stays
	// nil, which must still be compared against the profile's attributes:
	// otherwise a plain profile string would also match its attribute-carrying
	// variants.
	var attrs []string
	if len(split) == 2 {
		attrs, err = parseMigProfileAttributes(split[1])
		if err != nil {
			return false
		}
	}
	if len(attrs) != len(p.Attributes) {
		return false
	}

	// Compare the two sets order-independently. Sort a copy of the profile's
	// attributes rather than the slice itself so that a matching query never
	// reorders data owned by the (possibly cached and shared) profile.
	want := append([]string(nil), p.Attributes...)
	sort.Strings(want)
	sort.Strings(attrs)
	for i := range want {
		if want[i] != attrs[i] {
			return false
		}
	}

	return true
}
func parseMigProfileField(s string, field string) (int, error) {
if strings.TrimSpace(s) != s {
return -1, fmt.Errorf("leading or trailing spaces on '%%d%s'", field)

View File

@ -21,6 +21,7 @@ import (
"testing"
"github.com/stretchr/testify/require"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
func TestParseMigProfile(t *testing.T) {
@ -256,7 +257,55 @@ func TestParseMigProfile(t *testing.T) {
},
}
d := New()
mockDevice := &nvml.DeviceMock{
GetNameFunc: func() (string, nvml.Return) {
return "MockDevice", nvml.SUCCESS
},
GetMigModeFunc: func() (int, int, nvml.Return) {
return nvml.DEVICE_MIG_ENABLE, nvml.DEVICE_MIG_ENABLE, nvml.SUCCESS
},
GetMemoryInfoFunc: func() (nvml.Memory, nvml.Return) {
memory := nvml.Memory{
Total: 40 * 1024 * 1024 * 1024,
}
return memory, nvml.SUCCESS
},
GetGpuInstanceProfileInfoFunc: func(Profile int) (nvml.GpuInstanceProfileInfo, nvml.Return) {
info := nvml.GpuInstanceProfileInfo{}
switch Profile {
case nvml.GPU_INSTANCE_PROFILE_1_SLICE,
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1:
info.MemorySizeMB = 5 * 1024
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2:
info.MemorySizeMB = 10 * 1024
case nvml.GPU_INSTANCE_PROFILE_2_SLICE,
nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
info.MemorySizeMB = 10 * 1024
case nvml.GPU_INSTANCE_PROFILE_3_SLICE:
info.MemorySizeMB = 20 * 1024
case nvml.GPU_INSTANCE_PROFILE_4_SLICE:
info.MemorySizeMB = 20 * 1024
case nvml.GPU_INSTANCE_PROFILE_7_SLICE:
info.MemorySizeMB = 40 * 1024
case nvml.GPU_INSTANCE_PROFILE_6_SLICE,
nvml.GPU_INSTANCE_PROFILE_8_SLICE:
fallthrough
default:
return info, nvml.ERROR_NOT_SUPPORTED
}
return info, nvml.SUCCESS
},
}
mockNvml := &nvml.InterfaceMock{
DeviceGetCountFunc: func() (int, nvml.Return) {
return 1, nvml.SUCCESS
},
DeviceGetHandleByIndexFunc: func(Index int) (nvml.Device, nvml.Return) {
return mockDevice, nvml.SUCCESS
},
}
d := New(WithNvml(mockNvml), WithVerifySymbols(false))
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
_, err := d.ParseMigProfile(tc.device)

View File

@ -65,19 +65,22 @@ const (
GPU_INSTANCE_PROFILE_7_SLICE = nvml.GPU_INSTANCE_PROFILE_7_SLICE
GPU_INSTANCE_PROFILE_8_SLICE = nvml.GPU_INSTANCE_PROFILE_8_SLICE
GPU_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1
GPU_INSTANCE_PROFILE_1_SLICE_REV2 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2
GPU_INSTANCE_PROFILE_2_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1
GPU_INSTANCE_PROFILE_COUNT = nvml.GPU_INSTANCE_PROFILE_COUNT
)
// Compute Instance Profiles
const (
COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT
COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1
COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT
)
// Compute Instance Engine Profiles