From 1f178b880d2b4f3f6385ab615f3fa9d0783053ca Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Thu, 23 Mar 2023 10:27:47 +0000 Subject: [PATCH 1/2] Update golang version in go module to 1.20 Signed-off-by: Kevin Klues --- go.mod | 8 +++++++- vendor/gopkg.in/yaml.v3/go.mod | 5 ----- vendor/modules.txt | 7 +++++-- 3 files changed, 12 insertions(+), 8 deletions(-) delete mode 100644 vendor/gopkg.in/yaml.v3/go.mod diff --git a/go.mod b/go.mod index e4cf706..259755e 100644 --- a/go.mod +++ b/go.mod @@ -1,8 +1,14 @@ module gitlab.com/nvidia/cloud-native/go-nvlib -go 1.16 +go 1.20 require ( github.com/NVIDIA/go-nvml v0.12.0-1 github.com/stretchr/testify v1.7.0 ) + +require ( + github.com/davecgh/go-spew v1.1.0 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c // indirect +) diff --git a/vendor/gopkg.in/yaml.v3/go.mod b/vendor/gopkg.in/yaml.v3/go.mod deleted file mode 100644 index f407ea3..0000000 --- a/vendor/gopkg.in/yaml.v3/go.mod +++ /dev/null @@ -1,5 +0,0 @@ -module "gopkg.in/yaml.v3" - -require ( - "gopkg.in/check.v1" v0.0.0-20161208181325-20d25e280405 -) diff --git a/vendor/modules.txt b/vendor/modules.txt index 803eeb3..fcff859 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,14 +1,17 @@ # github.com/NVIDIA/go-nvml v0.12.0-1 -## explicit +## explicit; go 1.15 github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml # github.com/davecgh/go-spew v1.1.0 +## explicit github.com/davecgh/go-spew/spew # github.com/pmezard/go-difflib v1.0.0 +## explicit github.com/pmezard/go-difflib/difflib # github.com/stretchr/testify v1.7.0 -## explicit +## explicit; go 1.13 github.com/stretchr/testify/assert github.com/stretchr/testify/require # gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c +## explicit gopkg.in/yaml.v3 From 642041d1e044427fac2809732dff7e9c55007d83 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Thu, 23 Mar 2023 11:47:12 +0000 Subject: [PATCH 2/2] Update mig-profile 
parsing / name generation after go-nvml v12.0 bump Signed-off-by: Kevin Klues --- pkg/nvlib/device/api.go | 12 +++ pkg/nvlib/device/device.go | 40 +++++++- pkg/nvlib/device/mig_profile.go | 144 ++++++++++++--------------- pkg/nvlib/device/mig_profile_test.go | 51 +++++++++- pkg/nvml/consts.go | 19 ++-- 5 files changed, 174 insertions(+), 92 deletions(-) diff --git a/pkg/nvlib/device/api.go b/pkg/nvlib/device/api.go index 1643fcc..c6605fc 100644 --- a/pkg/nvlib/device/api.go +++ b/pkg/nvlib/device/api.go @@ -39,6 +39,7 @@ type Interface interface { type devicelib struct { nvml nvml.Interface skippedDevices map[string]struct{} + verifySymbols *bool } var _ Interface = &devicelib{} @@ -52,6 +53,10 @@ func New(opts ...Option) Interface { if d.nvml == nil { d.nvml = nvml.New() } + if d.verifySymbols == nil { + verify := true + d.verifySymbols = &verify + } if d.skippedDevices == nil { WithSkippedDevices( "DGX Display", @@ -68,6 +73,13 @@ func WithNvml(nvml nvml.Interface) Option { } } +// WithVerifySymbols provides an option to toggle whether to verify select symbols exist in dynamic libraries before calling them +func WithVerifySymbols(verify bool) Option { + return func(d *devicelib) { + d.verifySymbols = &verify + } +} + // WithSkippedDevices provides an Option to set devices to be skipped by model name func WithSkippedDevices(names ...string) Option { return func(d *devicelib) { diff --git a/pkg/nvlib/device/device.go b/pkg/nvlib/device/device.go index 3d549e4..f6f3caa 100644 --- a/pkg/nvlib/device/device.go +++ b/pkg/nvlib/device/device.go @@ -36,7 +36,8 @@ type Device interface { type device struct { nvml.Device - lib *devicelib + lib *devicelib + migProfiles []MigProfile } var _ Device = &device{} @@ -57,12 +58,12 @@ func (d *devicelib) NewDeviceByUUID(uuid string) (Device, error) { // newDevice creates a device from an nvml.Device func (d *devicelib) newDevice(dev nvml.Device) (*device, error) { - return &device{dev, d}, nil + return &device{dev, d, nil}, nil } 
// IsMigCapable checks if a device is capable of having MIG paprtitions created on it func (d *device) IsMigCapable() (bool, error) { - err := nvmlLookupSymbol("nvmlDeviceGetMigMode") + err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode") if err != nil { return false, nil } @@ -80,7 +81,7 @@ func (d *device) IsMigCapable() (bool, error) { // IsMigEnabled checks if a device has MIG mode currently enabled on it func (d *device) IsMigEnabled() (bool, error) { - err := nvmlLookupSymbol("nvmlDeviceGetMigMode") + err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode") if err != nil { return false, nil } @@ -161,6 +162,20 @@ func (d *device) VisitMigProfiles(visit func(MigProfile) error) error { return fmt.Errorf("error creating MIG profile: %v", err) } + // NOTE: The NVML API doesn't currently let us query the set of + // valid Compute Instance profiles without first instantiating + // a GPU Instance to check against. In theory, it should be + // possible to get this information without a reference to a + // GPU instance, but no API is provided for that at the moment. + // We run the checks below to weed out invalid profiles + // heuristically, given what we know about how they are + // physically constructed. In the future we should do this via + // NVML once a proper API for this exists. + pi := p.GetInfo() + if (pi.C * 2) > (pi.G + 1) { + continue + } + err = visit(p) if err != nil { return fmt.Errorf("error visiting MIG profile: %v", err) @@ -186,6 +201,12 @@ func (d *device) GetMigDevices() ([]MigDevice, error) { // GetMigProfiles gets the set of unique MIG profiles associated with a top-level device func (d *device) GetMigProfiles() ([]MigProfile, error) { + // Return the cached list if available + if d.migProfiles != nil { + return d.migProfiles, nil + } + + // Otherwise generate it... 
var profiles []MigProfile err := d.VisitMigProfiles(func(p MigProfile) error { profiles = append(profiles, p) @@ -194,6 +215,9 @@ func (d *device) GetMigProfiles() ([]MigProfile, error) { if err != nil { return nil, err } + + // And cache it before returning + d.migProfiles = profiles return profiles, nil } @@ -333,7 +357,13 @@ func (d *devicelib) GetMigProfiles() ([]MigProfile, error) { } // nvmlLookupSymbol checks to see if the given symbol is present in the NVML library -func nvmlLookupSymbol(symbol string) error { +func (d *devicelib) nvmlLookupSymbol(symbol string) error { + // If devicelib is configured to not verify symbols, then we short-circuit here + if !*d.verifySymbols { + return nil + } + + // Otherwise we lookup the provided symbol and verify it is available lib := dl.New("libnvidia-ml.so.1", dl.RTLD_LAZY|dl.RTLD_GLOBAL) if lib == nil { return fmt.Errorf("error instantiating DynamicLibrary for NVML") diff --git a/pkg/nvlib/device/mig_profile.go b/pkg/nvlib/device/mig_profile.go index 5aa00e6..7581db5 100644 --- a/pkg/nvlib/device/mig_profile.go +++ b/pkg/nvlib/device/mig_profile.go @@ -19,6 +19,7 @@ package device import ( "fmt" "math" + "sort" "strconv" "strings" @@ -36,6 +37,7 @@ type MigProfile interface { String() string GetInfo() MigProfileInfo Equals(other MigProfile) bool + Matches(profile string) bool } // MigProfileInfo holds all info associated with a specific MIG profile @@ -55,11 +57,12 @@ var _ MigProfile = &MigProfileInfo{} func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, migMemorySizeMB, deviceMemorySizeBytes uint64) (MigProfile, error) { giSlices := 0 switch giProfileID { - case nvml.GPU_INSTANCE_PROFILE_1_SLICE: + case nvml.GPU_INSTANCE_PROFILE_1_SLICE, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: giSlices = 1 - case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: - giSlices = 1 - case nvml.GPU_INSTANCE_PROFILE_2_SLICE: + case nvml.GPU_INSTANCE_PROFILE_2_SLICE, + 
nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: giSlices = 2 case nvml.GPU_INSTANCE_PROFILE_3_SLICE: giSlices = 3 @@ -77,7 +80,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, ciSlices := 0 switch ciProfileID { - case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: + case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1: ciSlices = 1 case nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: ciSlices = 2 @@ -97,7 +101,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, var attrs []string switch giProfileID { - case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: + case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: attrs = append(attrs, AttributeMediaExtensions) } @@ -116,84 +121,18 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, // ParseMigProfile converts a string representation of a MigProfile into an object func (d *devicelib) ParseMigProfile(profile string) (MigProfile, error) { - var err error - var c, g, gb int - var attrs []string - - if len(profile) == 0 { - return nil, fmt.Errorf("empty Profile string") - } - - split := strings.SplitN(profile, "+", 2) - if len(split) == 2 { - attrs, err = parseMigProfileAttributes(split[1]) - if err != nil { - return nil, fmt.Errorf("error parsing attributes following '+' in Profile string: %v", err) - } - } - - c, g, gb, err = parseMigProfileFields(split[0]) + profiles, err := d.GetMigProfiles() if err != nil { - return nil, fmt.Errorf("error parsing '.' 
separated fields in Profile string: %v", err) + return nil, fmt.Errorf("error getting list of possible MIG profiles: %v", err) } - p := &MigProfileInfo{ - C: c, - G: g, - GB: gb, - Attributes: attrs, - } - - switch c { - case 1: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE - case 2: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE - case 3: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE - case 4: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE - case 6: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE - case 7: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE - case 8: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE - default: - return nil, fmt.Errorf("unknown Compute Instance slice size: %v", c) - } - - switch g { - case 1: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE - case 2: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_2_SLICE - case 3: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_3_SLICE - case 4: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_4_SLICE - case 6: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_6_SLICE - case 7: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_7_SLICE - case 8: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_8_SLICE - default: - return nil, fmt.Errorf("unknown GPU Instance slice size: %v", g) - } - - p.CIEngProfileID = nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED - - for _, a := range attrs { - switch a { - case AttributeMediaExtensions: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1 - default: - return nil, fmt.Errorf("unknown Profile attribute: %v", a) + for _, p := range profiles { + if p.Matches(profile) { + return p, nil } } - return p, nil + return nil, fmt.Errorf("unable to parse profile string into a valid profile") } // String returns the string representation of a Profile @@ -240,6 +179,55 @@ func (p *MigProfileInfo) Equals(other MigProfile) bool { return false } +// Matches checks if a MigProfile matches the string passed in +func (p *MigProfileInfo) 
Matches(profile string) bool { + // If we are handed the empty string, there is nothing to check + if profile == "" { + return false + } + + // Split by + to separate out attributes + split := strings.SplitN(profile, "+", 2) + + // Check to make sure the c, g, and gb values match + c, g, gb, err := parseMigProfileFields(split[0]) + if err != nil { + return false + } + if c != p.C { + return false + } + if g != p.G { + return false + } + if gb != p.GB { + return false + } + + // If we have no attributes we are done + if len(split) == 1 { + return true + } + + // Make sure we have the same set of attributes + attrs, err := parseMigProfileAttributes(split[1]) + if err != nil { + return false + } + if len(attrs) != len(p.Attributes) { + return false + } + sort.Strings(attrs) + sort.Strings(p.Attributes) + for i, a := range p.Attributes { + if a != attrs[i] { + return false + } + } + + return true +} + func parseMigProfileField(s string, field string) (int, error) { if strings.TrimSpace(s) != s { return -1, fmt.Errorf("leading or trailing spaces on '%%d%s'", field) diff --git a/pkg/nvlib/device/mig_profile_test.go b/pkg/nvlib/device/mig_profile_test.go index ab19773..d22550a 100644 --- a/pkg/nvlib/device/mig_profile_test.go +++ b/pkg/nvlib/device/mig_profile_test.go @@ -21,6 +21,7 @@ import ( "testing" "github.com/stretchr/testify/require" + "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" ) func TestParseMigProfile(t *testing.T) { @@ -256,7 +257,55 @@ func TestParseMigProfile(t *testing.T) { }, } - d := New() + mockDevice := &nvml.DeviceMock{ + GetNameFunc: func() (string, nvml.Return) { + return "MockDevice", nvml.SUCCESS + }, + GetMigModeFunc: func() (int, int, nvml.Return) { + return nvml.DEVICE_MIG_ENABLE, nvml.DEVICE_MIG_ENABLE, nvml.SUCCESS + }, + GetMemoryInfoFunc: func() (nvml.Memory, nvml.Return) { + memory := nvml.Memory{ + Total: 40 * 1024 * 1024 * 1024, + } + return memory, nvml.SUCCESS + }, + GetGpuInstanceProfileInfoFunc: func(Profile int) 
(nvml.GpuInstanceProfileInfo, nvml.Return) { + info := nvml.GpuInstanceProfileInfo{} + switch Profile { + case nvml.GPU_INSTANCE_PROFILE_1_SLICE, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: + info.MemorySizeMB = 5 * 1024 + case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: + info.MemorySizeMB = 10 * 1024 + case nvml.GPU_INSTANCE_PROFILE_2_SLICE, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: + info.MemorySizeMB = 10 * 1024 + case nvml.GPU_INSTANCE_PROFILE_3_SLICE: + info.MemorySizeMB = 20 * 1024 + case nvml.GPU_INSTANCE_PROFILE_4_SLICE: + info.MemorySizeMB = 20 * 1024 + case nvml.GPU_INSTANCE_PROFILE_7_SLICE: + info.MemorySizeMB = 40 * 1024 + case nvml.GPU_INSTANCE_PROFILE_6_SLICE, + nvml.GPU_INSTANCE_PROFILE_8_SLICE: + fallthrough + default: + return info, nvml.ERROR_NOT_SUPPORTED + } + return info, nvml.SUCCESS + }, + } + mockNvml := &nvml.InterfaceMock{ + DeviceGetCountFunc: func() (int, nvml.Return) { + return 1, nvml.SUCCESS + }, + DeviceGetHandleByIndexFunc: func(Index int) (nvml.Device, nvml.Return) { + return mockDevice, nvml.SUCCESS + }, + } + + d := New(WithNvml(mockNvml), WithVerifySymbols(false)) for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { _, err := d.ParseMigProfile(tc.device) diff --git a/pkg/nvml/consts.go b/pkg/nvml/consts.go index b353ef9..8e61416 100644 --- a/pkg/nvml/consts.go +++ b/pkg/nvml/consts.go @@ -65,19 +65,22 @@ const ( GPU_INSTANCE_PROFILE_7_SLICE = nvml.GPU_INSTANCE_PROFILE_7_SLICE GPU_INSTANCE_PROFILE_8_SLICE = nvml.GPU_INSTANCE_PROFILE_8_SLICE GPU_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1 + GPU_INSTANCE_PROFILE_1_SLICE_REV2 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2 + GPU_INSTANCE_PROFILE_2_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1 GPU_INSTANCE_PROFILE_COUNT = nvml.GPU_INSTANCE_PROFILE_COUNT ) // Compute Instance Profiles const ( - COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE - COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE - 
COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE - COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE - COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE - COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE - COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE - COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT + COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE + COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE + COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE + COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE + COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE + COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE + COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE + COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 + COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT ) // Compute Instance Engine Profiles