Update go-nvlib to skip non-MIG devices

This change updates go-nvlib to ensure that GPUs that are not MIG-capable are skipped when generating CDI specifications for MIG devices.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2025-06-26 18:18:24 +00:00 · 2023-05-22 15:28:38 +02:00 · 2023-05-22 15:28:38 +02:00 · e11f65e51e
commit e11f65e51e
parent 3ea02d13fc
12 changed files with 432 additions and 139 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -7,6 +7,8 @@
 * Generate CDI specification files with `644` permissions to allow rootless applications (e.g. podman)
 * Add `nvidia-ctk cdi list` command to show the known CDI devices.
 * Add support for generating merged devices (e.g. `all` device) to the nvcdi API.
+* Use `*.*` pattern to locate `libcuda.so` when generating a CDI specification to support platforms where a patch version is not specified.
+* Update go-nvlib to skip devices that are not MIG capable when generating CDI specifications.

 ## v1.13.1

--- a/go.mod
+++ b/go.mod
@ -4,7 +4,7 @@ go 1.20

 require (
 	github.com/BurntSushi/toml v1.2.1
-	github.com/NVIDIA/go-nvml v0.12.0-0
+	github.com/NVIDIA/go-nvml v0.12.0-1
 	github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a
 	github.com/fsnotify/fsnotify v1.5.4
 	github.com/opencontainers/runtime-spec v1.1.0-rc.2
@ -12,7 +12,7 @@ require (
 	github.com/sirupsen/logrus v1.9.0
 	github.com/stretchr/testify v1.8.1
 	github.com/urfave/cli/v2 v2.3.0
-	gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438
+	gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386
 	golang.org/x/mod v0.5.0
 	golang.org/x/sys v0.7.0
 )
--- a/go.sum
+++ b/go.sum
@ -1,9 +1,8 @@
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
 github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak=
 github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
-github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
-github.com/NVIDIA/go-nvml v0.12.0-0 h1:eHYNHbzAsMgWYshf6dEmTY66/GCXnORJFnzm3TNH4mc=
-github.com/NVIDIA/go-nvml v0.12.0-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
+github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM=
+github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
 github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
 github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
 github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a h1:sP3PcgyIkRlHqfF3Jfpe/7G8kf/qpzG4C8r94y9hLbE=
@ -78,8 +77,8 @@ github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHo
 github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ=
 github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74=
 github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y=
-gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438 h1:+qRai7XRl8omFQVCeHcaWzL542Yw64vfmuXG+79ZCIc=
-gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438/go.mod h1:GStidGxhaqJhYFW1YpOnLvYCbL2EsM0od7IW4u7+JgU=
+gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386 h1:byHxP+mlgNQ4GX31owfgCIq5fJCsdJMchiJHGuM2rxw=
+gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386/go.mod h1:KYZksBgh18o+uzgnpDazzG4LVYtnfB96VXHMXypEtik=
 golang.org/x/mod v0.5.0 h1:UG21uOlmZabA4fW5i7ZX6bjw1xELEGg/ZLgZq9auk/Q=
 golang.org/x/mod v0.5.0/go.mod h1:5OXOZSfqPIIbmVBIIKWRFfZjPR0E5r58TLhUjH0a2Ro=
 golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
--- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go
+++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go
@ -2001,9 +2001,16 @@ func DeviceGetGpuInstancePossiblePlacements(Device Device, Info *GpuInstanceProf
 	if Info == nil {
 		return nil, ERROR_INVALID_ARGUMENT
 	}
-	var Count uint32 = Info.InstanceCount
+	var Count uint32
+	ret := nvmlDeviceGetGpuInstancePossiblePlacements(Device, Info.Id, nil, &Count)
+	if ret != SUCCESS {
+		return nil, ret
+	}
+	if Count == 0 {
+		return []GpuInstancePlacement{}, ret
+	}
 	Placements := make([]GpuInstancePlacement, Count)
-	ret := nvmlDeviceGetGpuInstancePossiblePlacements(Device, Info.Id, &Placements[0], &Count)
+	ret = nvmlDeviceGetGpuInstancePossiblePlacements(Device, Info.Id, &Placements[0], &Count)
 	return Placements[:Count], ret
 }

@ -2577,9 +2584,9 @@ func (Device Device) GetVgpuSchedulerCapabilities() (VgpuSchedulerCapabilities,
 }

 // nvml.GpuInstanceGetComputeInstancePossiblePlacements()
-func GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, ProfileId int) ([]ComputeInstancePlacement, Return) {
+func GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, Info *ComputeInstanceProfileInfo) ([]ComputeInstancePlacement, Return) {
 	var Count uint32
-	ret := nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, uint32(ProfileId), nil, &Count)
+	ret := nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, Info.Id, nil, &Count)
 	if ret != SUCCESS {
 		return nil, ret
 	}
@ -2587,21 +2594,21 @@ func GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, Pr
 		return []ComputeInstancePlacement{}, ret
 	}
 	PlacementArray := make([]ComputeInstancePlacement, Count)
-	ret = nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, uint32(ProfileId), &PlacementArray[0], &Count)
+	ret = nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, Info.Id, &PlacementArray[0], &Count)
 	return PlacementArray, ret
 }

-func (GpuInstance GpuInstance) GetComputeInstancePossiblePlacements(ProfileId int) ([]ComputeInstancePlacement, Return) {
-	return GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, ProfileId)
+func (GpuInstance GpuInstance) GetComputeInstancePossiblePlacements(Info *ComputeInstanceProfileInfo) ([]ComputeInstancePlacement, Return) {
+	return GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, Info)
 }

 // nvml.GpuInstanceCreateComputeInstanceWithPlacement()
-func GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance GpuInstance, ProfileId int, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return {
-	return nvmlGpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, uint32(ProfileId), Placement, ComputeInstance)
+func GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance GpuInstance, Info *ComputeInstanceProfileInfo, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return {
+	return nvmlGpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, Info.Id, Placement, ComputeInstance)
 }

-func (GpuInstance GpuInstance) CreateComputeInstanceWithPlacement(ProfileId int, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return {
-	return GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, ProfileId, Placement, ComputeInstance)
+func (GpuInstance GpuInstance) CreateComputeInstanceWithPlacement(Info *ComputeInstanceProfileInfo, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return {
+	return GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, Info, Placement, ComputeInstance)
 }

 // nvml.DeviceGetGpuFabricInfo()
--- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/api.go
+++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/api.go
@ -22,6 +22,7 @@ import (

 // Interface provides the API to the 'device' package
 type Interface interface {
+	AssertValidMigProfileFormat(profile string) error
 	GetDevices() ([]Device, error)
 	GetMigDevices() ([]MigDevice, error)
 	GetMigProfiles() ([]MigProfile, error)
@ -39,6 +40,8 @@ type Interface interface {
 type devicelib struct {
 	nvml           nvml.Interface
 	skippedDevices map[string]struct{}
+	verifySymbols  *bool
+	migProfiles    []MigProfile
 }

 var _ Interface = &devicelib{}
@ -52,6 +55,10 @@ func New(opts ...Option) Interface {
 	if d.nvml == nil {
 		d.nvml = nvml.New()
 	}
+	if d.verifySymbols == nil {
+		verify := true
+		d.verifySymbols = &verify
+	}
 	if d.skippedDevices == nil {
 		WithSkippedDevices(
 			"DGX Display",
@ -68,6 +75,13 @@ func WithNvml(nvml nvml.Interface) Option {
 	}
 }

+// WithVerifySymbols provides an option to toggle whether to verify select symbols exist in dynamic libraries before calling them
+func WithVerifySymbols(verify bool) Option {
+	return func(d *devicelib) {
+		d.verifySymbols = &verify
+	}
+}
+
 // WithSkippedDevices provides an Option to set devices to be skipped by model name
 func WithSkippedDevices(names ...string) Option {
 	return func(d *devicelib) {
--- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/device.go
+++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/device.go
@ -26,6 +26,9 @@ import (
 // Device defines the set of extended functions associated with a device.Device
 type Device interface {
 	nvml.Device
+	GetArchitectureAsString() (string, error)
+	GetBrandAsString() (string, error)
+	GetCudaComputeCapabilityAsString() (string, error)
 	GetMigDevices() ([]MigDevice, error)
 	GetMigProfiles() ([]MigProfile, error)
 	IsMigCapable() (bool, error)
@ -36,7 +39,8 @@ type Device interface {

 type device struct {
 	nvml.Device
-	lib *devicelib
+	lib         *devicelib
+	migProfiles []MigProfile
 }

 var _ Device = &device{}
@ -57,12 +61,98 @@ func (d *devicelib) NewDeviceByUUID(uuid string) (Device, error) {

 // newDevice creates a device from an nvml.Device
 func (d *devicelib) newDevice(dev nvml.Device) (*device, error) {
-	return &device{dev, d}, nil
+	return &device{dev, d, nil}, nil
+}
+
+// GetArchitectureAsString returns the Device architecture as a string
+func (d *device) GetArchitectureAsString() (string, error) {
+	arch, ret := d.GetArchitecture()
+	if ret != nvml.SUCCESS {
+		return "", fmt.Errorf("error getting device architecture: %v", ret)
+	}
+	switch arch {
+	case nvml.DEVICE_ARCH_KEPLER:
+		return "Kepler", nil
+	case nvml.DEVICE_ARCH_MAXWELL:
+		return "Maxwell", nil
+	case nvml.DEVICE_ARCH_PASCAL:
+		return "Pascal", nil
+	case nvml.DEVICE_ARCH_VOLTA:
+		return "Volta", nil
+	case nvml.DEVICE_ARCH_TURING:
+		return "Turing", nil
+	case nvml.DEVICE_ARCH_AMPERE:
+		return "Ampere", nil
+	case nvml.DEVICE_ARCH_ADA:
+		return "Ada", nil
+	case nvml.DEVICE_ARCH_HOPPER:
+		return "Hopper", nil
+	case nvml.DEVICE_ARCH_UNKNOWN:
+		return "Unknown", nil
+	}
+	return "", fmt.Errorf("error interpreting device architecture as string: %v", arch)
+}
+
+// GetBrandAsString returns the Device brand as a string
+func (d *device) GetBrandAsString() (string, error) {
+	brand, ret := d.GetBrand()
+	if ret != nvml.SUCCESS {
+		return "", fmt.Errorf("error getting device brand: %v", ret)
+	}
+	switch brand {
+	case nvml.BRAND_UNKNOWN:
+		return "Unknown", nil
+	case nvml.BRAND_QUADRO:
+		return "Quadro", nil
+	case nvml.BRAND_TESLA:
+		return "Tesla", nil
+	case nvml.BRAND_NVS:
+		return "NVS", nil
+	case nvml.BRAND_GRID:
+		return "Grid", nil
+	case nvml.BRAND_GEFORCE:
+		return "GeForce", nil
+	case nvml.BRAND_TITAN:
+		return "Titan", nil
+	case nvml.BRAND_NVIDIA_VAPPS:
+		return "NvidiaVApps", nil
+	case nvml.BRAND_NVIDIA_VPC:
+		return "NvidiaVPC", nil
+	case nvml.BRAND_NVIDIA_VCS:
+		return "NvidiaVCS", nil
+	case nvml.BRAND_NVIDIA_VWS:
+		return "NvidiaVWS", nil
+	// Deprecated in favor of nvml.BRAND_NVIDIA_CLOUD_GAMING
+	//case nvml.BRAND_NVIDIA_VGAMING:
+	//	return "VGaming", nil
+	case nvml.BRAND_NVIDIA_CLOUD_GAMING:
+		return "NvidiaCloudGaming", nil
+	case nvml.BRAND_QUADRO_RTX:
+		return "QuadroRTX", nil
+	case nvml.BRAND_NVIDIA_RTX:
+		return "NvidiaRTX", nil
+	case nvml.BRAND_NVIDIA:
+		return "Nvidia", nil
+	case nvml.BRAND_GEFORCE_RTX:
+		return "GeForceRTX", nil
+	case nvml.BRAND_TITAN_RTX:
+		return "TitanRTX", nil
+	}
+	return "", fmt.Errorf("error interpreting device brand as string: %v", brand)
+}
+
+// GetCudaComputeCapabilityAsString returns the Device's CUDA compute capability as a version string
+func (d *device) GetCudaComputeCapabilityAsString() (string, error) {
+	major, minor, ret := d.GetCudaComputeCapability()
+	if ret != nvml.SUCCESS {
+		return "", fmt.Errorf("error getting CUDA compute capability: %v", ret)
+	}
+	return fmt.Sprintf("%d.%d", major, minor), nil
 }

 // IsMigCapable checks if a device is capable of having MIG paprtitions created on it
 func (d *device) IsMigCapable() (bool, error) {
-	err := nvmlLookupSymbol("nvmlDeviceGetMigMode")
+	err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode")
 	if err != nil {
 		return false, nil
 	}
@ -80,7 +170,7 @@ func (d *device) IsMigCapable() (bool, error) {

 // IsMigEnabled checks if a device has MIG mode currently enabled on it
 func (d *device) IsMigEnabled() (bool, error) {
-	err := nvmlLookupSymbol("nvmlDeviceGetMigMode")
+	err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode")
 	if err != nil {
 		return false, nil
 	}
@ -98,6 +188,14 @@ func (d *device) IsMigEnabled() (bool, error) {

 // VisitMigDevices walks a top-level device and invokes a callback function for each MIG device configured on it
 func (d *device) VisitMigDevices(visit func(int, MigDevice) error) error {
+	capable, err := d.IsMigCapable()
+	if err != nil {
+		return fmt.Errorf("error checking if GPU is MIG capable: %v", err)
+	}
+	if !capable {
+		return nil
+	}
+
 	count, ret := nvml.Device(d).GetMaxMigDeviceCount()
 	if ret != nvml.SUCCESS {
 		return fmt.Errorf("error getting max MIG device count: %v", ret)
@ -161,6 +259,23 @@ func (d *device) VisitMigProfiles(visit func(MigProfile) error) error {
 					return fmt.Errorf("error creating MIG profile: %v", err)
 				}

+				// NOTE: The NVML API doesn't currently let us query the set of
+				// valid Compute Instance profiles without first instantiating
+				// a GPU Instance to check against. In theory, it should be
+				// possible to get this information without a reference to a
+				// GPU instance, but no API is provided for that at the moment.
+				// We run the checks below to weed out invalid profiles
+				// heuristically, given what we know about how they are
+				// physically constructed. In the future we should do this via
+				// NVML once a proper API for this exists.
+				pi := p.GetInfo()
+				if pi.C > pi.G {
+					continue
+				}
+				if (pi.C < pi.G) && ((pi.C * 2) > (pi.G + 1)) {
+					continue
+				}
+
 				err = visit(p)
 				if err != nil {
 					return fmt.Errorf("error visiting MIG profile: %v", err)
@ -186,6 +301,12 @@ func (d *device) GetMigDevices() ([]MigDevice, error) {

 // GetMigProfiles gets the set of unique MIG profiles associated with a top-level device
 func (d *device) GetMigProfiles() ([]MigProfile, error) {
+	// Return the cached list if available
+	if d.migProfiles != nil {
+		return d.migProfiles, nil
+	}
+
+	// Otherwise generate it...
 	var profiles []MigProfile
 	err := d.VisitMigProfiles(func(p MigProfile) error {
 		profiles = append(profiles, p)
@ -194,6 +315,9 @@ func (d *device) GetMigProfiles() ([]MigProfile, error) {
 	if err != nil {
 		return nil, err
 	}
+
+	// And cache it before returning
+	d.migProfiles = profiles
 	return profiles, nil
 }

@ -321,6 +445,12 @@ func (d *devicelib) GetMigDevices() ([]MigDevice, error) {

 // GetMigProfiles gets the set of unique MIG profiles across all top-level devices
 func (d *devicelib) GetMigProfiles() ([]MigProfile, error) {
+	// Return the cached list if available
+	if d.migProfiles != nil {
+		return d.migProfiles, nil
+	}
+
+	// Otherwise generate it...
 	var profiles []MigProfile
 	err := d.VisitMigProfiles(func(p MigProfile) error {
 		profiles = append(profiles, p)
@ -329,11 +459,20 @@ func (d *devicelib) GetMigProfiles() ([]MigProfile, error) {
 	if err != nil {
 		return nil, err
 	}
+
+	// And cache it before returning
+	d.migProfiles = profiles
 	return profiles, nil
 }

 // nvmlLookupSymbol checks to see if the given symbol is present in the NVML library
-func nvmlLookupSymbol(symbol string) error {
+func (d *devicelib) nvmlLookupSymbol(symbol string) error {
+	// If devicelib is configured to not verify symbols, then we short-circuit here
+	if !*d.verifySymbols {
+		return nil
+	}
+
+	// Otherwise we lookup the provided symbol and verify it is available
 	lib := dl.New("libnvidia-ml.so.1", dl.RTLD_LAZY|dl.RTLD_GLOBAL)
 	if lib == nil {
 		return fmt.Errorf("error instantiating DynamicLibrary for NVML")
--- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/mig_profile.go
+++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/mig_profile.go
@ -19,6 +19,7 @@ package device
 import (
 	"fmt"
 	"math"
+	"sort"
 	"strconv"
 	"strings"

@ -36,6 +37,7 @@ type MigProfile interface {
 	String() string
 	GetInfo() MigProfileInfo
 	Equals(other MigProfile) bool
+	Matches(profile string) bool
 }

 // MigProfileInfo holds all info associated with a specific MIG profile
@ -55,11 +57,12 @@ var _ MigProfile = &MigProfileInfo{}
 func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, migMemorySizeMB, deviceMemorySizeBytes uint64) (MigProfile, error) {
 	giSlices := 0
 	switch giProfileID {
-	case nvml.GPU_INSTANCE_PROFILE_1_SLICE:
+	case nvml.GPU_INSTANCE_PROFILE_1_SLICE,
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2:
 		giSlices = 1
-	case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1:
-		giSlices = 1
-	case nvml.GPU_INSTANCE_PROFILE_2_SLICE:
+	case nvml.GPU_INSTANCE_PROFILE_2_SLICE,
+		nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
 		giSlices = 2
 	case nvml.GPU_INSTANCE_PROFILE_3_SLICE:
 		giSlices = 3
@ -77,7 +80,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,

 	ciSlices := 0
 	switch ciProfileID {
-	case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE:
+	case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+		nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1:
 		ciSlices = 1
 	case nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE:
 		ciSlices = 2
@ -97,7 +101,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,

 	var attrs []string
 	switch giProfileID {
-	case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1:
+	case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
+		nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
 		attrs = append(attrs, AttributeMediaExtensions)
 	}

@ -114,90 +119,30 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,
 	return p, nil
 }

+// AssertValidMigProfileFormat checks if the string is in the proper format to represent a MIG profile
+func (d *devicelib) AssertValidMigProfileFormat(profile string) error {
+	_, _, _, _, err := parseMigProfile(profile)
+	return err
+}
+
 // ParseMigProfile converts a string representation of a MigProfile into an object
 func (d *devicelib) ParseMigProfile(profile string) (MigProfile, error) {
-	var err error
-	var c, g, gb int
-	var attrs []string
-
-	if len(profile) == 0 {
-		return nil, fmt.Errorf("empty Profile string")
-	}
-
-	split := strings.SplitN(profile, "+", 2)
-	if len(split) == 2 {
-		attrs, err = parseMigProfileAttributes(split[1])
-		if err != nil {
-			return nil, fmt.Errorf("error parsing attributes following '+' in Profile string: %v", err)
-		}
-	}
-
-	c, g, gb, err = parseMigProfileFields(split[0])
+	profiles, err := d.GetMigProfiles()
 	if err != nil {
-		return nil, fmt.Errorf("error parsing '.' separated fields in Profile string: %v", err)
+		return nil, fmt.Errorf("error getting list of possible MIG profiles: %v", err)
 	}

-	p := &MigProfileInfo{
-		C:          c,
-		G:          g,
-		GB:         gb,
-		Attributes: attrs,
-	}
-
-	switch c {
-	case 1:
-		p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
-	case 2:
-		p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
-	case 3:
-		p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
-	case 4:
-		p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
-	case 6:
-		p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
-	case 7:
-		p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
-	case 8:
-		p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
-	default:
-		return nil, fmt.Errorf("unknown Compute Instance slice size: %v", c)
-	}
-
-	switch g {
-	case 1:
-		p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE
-	case 2:
-		p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_2_SLICE
-	case 3:
-		p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_3_SLICE
-	case 4:
-		p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_4_SLICE
-	case 6:
-		p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_6_SLICE
-	case 7:
-		p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_7_SLICE
-	case 8:
-		p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_8_SLICE
-	default:
-		return nil, fmt.Errorf("unknown GPU Instance slice size: %v", g)
-	}
-
-	p.CIEngProfileID = nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED
-
-	for _, a := range attrs {
-		switch a {
-		case AttributeMediaExtensions:
-			p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1
-		default:
-			return nil, fmt.Errorf("unknown Profile attribute: %v", a)
+	for _, p := range profiles {
+		if p.Matches(profile) {
+			return p, nil
 		}
 	}

-	return p, nil
+	return nil, fmt.Errorf("unable to parse profile string into a valid profile")
 }

 // String returns the string representation of a Profile
-func (p *MigProfileInfo) String() string {
+func (p MigProfileInfo) String() string {
 	var suffix string
 	if len(p.Attributes) > 0 {
 		suffix = "+" + strings.Join(p.Attributes, ",")
@ -209,35 +154,89 @@ func (p *MigProfileInfo) String() string {
 }

 // GetInfo returns detailed info about a Profile
-func (p *MigProfileInfo) GetInfo() MigProfileInfo {
-	return *p
+func (p MigProfileInfo) GetInfo() MigProfileInfo {
+	return p
 }

 // Equals checks if two Profiles are identical or not
-func (p *MigProfileInfo) Equals(other MigProfile) bool {
-	switch o := other.(type) {
-	case *MigProfileInfo:
-		if p.C != o.C {
-			return false
-		}
-		if p.G != o.G {
-			return false
-		}
-		if p.GB != o.GB {
-			return false
-		}
-		if p.GIProfileID != o.GIProfileID {
-			return false
-		}
-		if p.CIProfileID != o.CIProfileID {
-			return false
-		}
-		if p.CIEngProfileID != o.CIEngProfileID {
-			return false
-		}
-		return true
+func (p MigProfileInfo) Equals(other MigProfile) bool {
+	o := other.GetInfo()
+	if p.C != o.C {
+		return false
 	}
-	return false
+	if p.G != o.G {
+		return false
+	}
+	if p.GB != o.GB {
+		return false
+	}
+	if p.GIProfileID != o.GIProfileID {
+		return false
+	}
+	if p.CIProfileID != o.CIProfileID {
+		return false
+	}
+	if p.CIEngProfileID != o.CIEngProfileID {
+		return false
+	}
+	return true
+}
+
+// Matches checks if a MigProfile matches the string passed in
+func (p MigProfileInfo) Matches(profile string) bool {
+	c, g, gb, attrs, err := parseMigProfile(profile)
+	if err != nil {
+		return false
+	}
+	if c != p.C {
+		return false
+	}
+	if g != p.G {
+		return false
+	}
+	if gb != p.GB {
+		return false
+	}
+	if len(attrs) != len(p.Attributes) {
+		return false
+	}
+	sort.Strings(attrs)
+	sort.Strings(p.Attributes)
+	for i, a := range p.Attributes {
+		if a != attrs[i] {
+			return false
+		}
+	}
+	return true
+}
+
+func parseMigProfile(profile string) (int, int, int, []string, error) {
+	// If we are handed the empty string, we cannot parse it
+	if profile == "" {
+		return -1, -1, -1, nil, fmt.Errorf("profile is the empty string")
+	}
+
+	// Split by + to separate out attributes
+	split := strings.SplitN(profile, "+", 2)
+
+	// Parse the c, g, and gb fields that precede any attributes
+	c, g, gb, err := parseMigProfileFields(split[0])
+	if err != nil {
+		return -1, -1, -1, nil, fmt.Errorf("cannot parse fields of '%v': %v", profile, err)
+	}
+
+	// If we have no attributes we are done
+	if len(split) == 1 {
+		return c, g, gb, nil, nil
+	}
+
+	// Otherwise parse the attributes that follow the '+'
+	attrs, err := parseMigProfileAttributes(split[1])
+	if err != nil {
+		return -1, -1, -1, nil, fmt.Errorf("cannot parse attributes of '%v': %v", profile, err)
+	}
+
+	return c, g, gb, attrs, nil
 }

 func parseMigProfileField(s string, field string) (int, error) {
--- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/consts.go
+++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/consts.go
@ -49,6 +49,42 @@ const (
 	ERROR_UNKNOWN                 = Return(nvml.ERROR_UNKNOWN)
 )

+// Device architecture constants
+const (
+	DEVICE_ARCH_KEPLER  = nvml.DEVICE_ARCH_KEPLER
+	DEVICE_ARCH_MAXWELL = nvml.DEVICE_ARCH_MAXWELL
+	DEVICE_ARCH_PASCAL  = nvml.DEVICE_ARCH_PASCAL
+	DEVICE_ARCH_VOLTA   = nvml.DEVICE_ARCH_VOLTA
+	DEVICE_ARCH_TURING  = nvml.DEVICE_ARCH_TURING
+	DEVICE_ARCH_AMPERE  = nvml.DEVICE_ARCH_AMPERE
+	DEVICE_ARCH_ADA     = nvml.DEVICE_ARCH_ADA
+	DEVICE_ARCH_HOPPER  = nvml.DEVICE_ARCH_HOPPER
+	DEVICE_ARCH_UNKNOWN = nvml.DEVICE_ARCH_UNKNOWN
+)
+
+// Device brand constants
+const (
+	BRAND_UNKNOWN             = BrandType(nvml.BRAND_UNKNOWN)
+	BRAND_QUADRO              = BrandType(nvml.BRAND_QUADRO)
+	BRAND_TESLA               = BrandType(nvml.BRAND_TESLA)
+	BRAND_NVS                 = BrandType(nvml.BRAND_NVS)
+	BRAND_GRID                = BrandType(nvml.BRAND_GRID)
+	BRAND_GEFORCE             = BrandType(nvml.BRAND_GEFORCE)
+	BRAND_TITAN               = BrandType(nvml.BRAND_TITAN)
+	BRAND_NVIDIA_VAPPS        = BrandType(nvml.BRAND_NVIDIA_VAPPS)
+	BRAND_NVIDIA_VPC          = BrandType(nvml.BRAND_NVIDIA_VPC)
+	BRAND_NVIDIA_VCS          = BrandType(nvml.BRAND_NVIDIA_VCS)
+	BRAND_NVIDIA_VWS          = BrandType(nvml.BRAND_NVIDIA_VWS)
+	BRAND_NVIDIA_CLOUD_GAMING = BrandType(nvml.BRAND_NVIDIA_CLOUD_GAMING)
+	BRAND_NVIDIA_VGAMING      = BrandType(nvml.BRAND_NVIDIA_VGAMING)
+	BRAND_QUADRO_RTX          = BrandType(nvml.BRAND_QUADRO_RTX)
+	BRAND_NVIDIA_RTX          = BrandType(nvml.BRAND_NVIDIA_RTX)
+	BRAND_NVIDIA              = BrandType(nvml.BRAND_NVIDIA)
+	BRAND_GEFORCE_RTX         = BrandType(nvml.BRAND_GEFORCE_RTX)
+	BRAND_TITAN_RTX           = BrandType(nvml.BRAND_TITAN_RTX)
+	BRAND_COUNT               = BrandType(nvml.BRAND_COUNT)
+)
+
 // MIG Mode constants
 const (
 	DEVICE_MIG_ENABLE  = nvml.DEVICE_MIG_ENABLE
@ -65,19 +101,22 @@ const (
 	GPU_INSTANCE_PROFILE_7_SLICE      = nvml.GPU_INSTANCE_PROFILE_7_SLICE
 	GPU_INSTANCE_PROFILE_8_SLICE      = nvml.GPU_INSTANCE_PROFILE_8_SLICE
 	GPU_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1
+	GPU_INSTANCE_PROFILE_1_SLICE_REV2 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2
+	GPU_INSTANCE_PROFILE_2_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1
 	GPU_INSTANCE_PROFILE_COUNT        = nvml.GPU_INSTANCE_PROFILE_COUNT
 )

 // Compute Instance Profiles
 const (
-	COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
-	COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
-	COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
-	COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
-	COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
-	COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
-	COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
-	COMPUTE_INSTANCE_PROFILE_COUNT   = nvml.COMPUTE_INSTANCE_PROFILE_COUNT
+	COMPUTE_INSTANCE_PROFILE_1_SLICE      = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
+	COMPUTE_INSTANCE_PROFILE_2_SLICE      = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
+	COMPUTE_INSTANCE_PROFILE_3_SLICE      = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
+	COMPUTE_INSTANCE_PROFILE_4_SLICE      = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
+	COMPUTE_INSTANCE_PROFILE_6_SLICE      = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
+	COMPUTE_INSTANCE_PROFILE_7_SLICE      = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
+	COMPUTE_INSTANCE_PROFILE_8_SLICE      = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
+	COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1
+	COMPUTE_INSTANCE_PROFILE_COUNT        = nvml.COMPUTE_INSTANCE_PROFILE_COUNT
 )

 // Compute Instance Engine Profiles
--- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device.go
+++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device.go
@ -150,12 +150,24 @@ func (d nvmlDevice) GetAttributes() (DeviceAttributes, Return) {
 	return DeviceAttributes(a), Return(r)
 }

-// GetName returns the device attributes for a MIG device
+// GetName returns the product name of a Device
 func (d nvmlDevice) GetName() (string, Return) {
 	n, r := nvml.Device(d).GetName()
 	return n, Return(r)
 }

+// GetBrand returns the brand of a Device
+func (d nvmlDevice) GetBrand() (BrandType, Return) {
+	b, r := nvml.Device(d).GetBrand()
+	return BrandType(b), Return(r)
+}
+
+// GetArchitecture returns the architecture of a Device
+func (d nvmlDevice) GetArchitecture() (DeviceArchitecture, Return) {
+	a, r := nvml.Device(d).GetArchitecture()
+	return DeviceArchitecture(a), Return(r)
+}
+
 // RegisterEvents registers the specified event set and type with the device
 func (d nvmlDevice) RegisterEvents(EventTypes uint64, Set EventSet) Return {
 	return Return(nvml.Device(d).RegisterEvents(EventTypes, nvml.EventSet(Set)))
--- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device_mock.go
+++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device_mock.go
@ -20,9 +20,15 @@ var _ Device = &DeviceMock{}
 //			CreateGpuInstanceWithPlacementFunc: func(gpuInstanceProfileInfo *GpuInstanceProfileInfo, gpuInstancePlacement *GpuInstancePlacement) (GpuInstance, Return) {
 //				panic("mock out the CreateGpuInstanceWithPlacement method")
 //			},
+//			GetArchitectureFunc: func() (DeviceArchitecture, Return) {
+//				panic("mock out the GetArchitecture method")
+//			},
 //			GetAttributesFunc: func() (DeviceAttributes, Return) {
 //				panic("mock out the GetAttributes method")
 //			},
+//			GetBrandFunc: func() (BrandType, Return) {
+//				panic("mock out the GetBrand method")
+//			},
 //			GetComputeInstanceIdFunc: func() (int, Return) {
 //				panic("mock out the GetComputeInstanceId method")
 //			},
@ -96,9 +102,15 @@ type DeviceMock struct {
 	// CreateGpuInstanceWithPlacementFunc mocks the CreateGpuInstanceWithPlacement method.
 	CreateGpuInstanceWithPlacementFunc func(gpuInstanceProfileInfo *GpuInstanceProfileInfo, gpuInstancePlacement *GpuInstancePlacement) (GpuInstance, Return)

+	// GetArchitectureFunc mocks the GetArchitecture method.
+	GetArchitectureFunc func() (DeviceArchitecture, Return)
+
 	// GetAttributesFunc mocks the GetAttributes method.
 	GetAttributesFunc func() (DeviceAttributes, Return)

+	// GetBrandFunc mocks the GetBrand method.
+	GetBrandFunc func() (BrandType, Return)
+
 	// GetComputeInstanceIdFunc mocks the GetComputeInstanceId method.
 	GetComputeInstanceIdFunc func() (int, Return)

@ -171,9 +183,15 @@ type DeviceMock struct {
 			// GpuInstancePlacement is the gpuInstancePlacement argument value.
 			GpuInstancePlacement *GpuInstancePlacement
 		}
+		// GetArchitecture holds details about calls to the GetArchitecture method.
+		GetArchitecture []struct {
+		}
 		// GetAttributes holds details about calls to the GetAttributes method.
 		GetAttributes []struct {
 		}
+		// GetBrand holds details about calls to the GetBrand method.
+		GetBrand []struct {
+		}
 		// GetComputeInstanceId holds details about calls to the GetComputeInstanceId method.
 		GetComputeInstanceId []struct {
 		}
@ -255,7 +273,9 @@ type DeviceMock struct {
 		}
 	}
 	lockCreateGpuInstanceWithPlacement     sync.RWMutex
+	lockGetArchitecture                    sync.RWMutex
 	lockGetAttributes                      sync.RWMutex
+	lockGetBrand                           sync.RWMutex
 	lockGetComputeInstanceId               sync.RWMutex
 	lockGetCudaComputeCapability           sync.RWMutex
 	lockGetDeviceHandleFromMigDeviceHandle sync.RWMutex
@ -315,6 +335,33 @@ func (mock *DeviceMock) CreateGpuInstanceWithPlacementCalls() []struct {
 	return calls
 }

+// GetArchitecture records the call in mock.calls.GetArchitecture and returns the result of GetArchitectureFunc; it panics if GetArchitectureFunc is nil.
+func (mock *DeviceMock) GetArchitecture() (DeviceArchitecture, Return) {
+	if mock.GetArchitectureFunc == nil {
+		panic("DeviceMock.GetArchitectureFunc: method is nil but Device.GetArchitecture was just called")
+	}
+	callInfo := struct {
+	}{} // the method takes no arguments, so the call record is an empty struct
+	mock.lockGetArchitecture.Lock()
+	mock.calls.GetArchitecture = append(mock.calls.GetArchitecture, callInfo)
+	mock.lockGetArchitecture.Unlock() // released before GetArchitectureFunc is invoked
+	return mock.GetArchitectureFunc()
+}
+
+// GetArchitectureCalls returns the calls recorded for GetArchitecture (one empty struct per call), read while holding lockGetArchitecture for reading.
+// Check the length with:
+//
+//	len(mockedDevice.GetArchitectureCalls())
+func (mock *DeviceMock) GetArchitectureCalls() []struct {
+} {
+	var calls []struct {
+	}
+	mock.lockGetArchitecture.RLock()
+	calls = mock.calls.GetArchitecture // assigns the slice itself; the elements are not copied
+	mock.lockGetArchitecture.RUnlock()
+	return calls
+}
+
 // GetAttributes calls GetAttributesFunc.
 func (mock *DeviceMock) GetAttributes() (DeviceAttributes, Return) {
 	if mock.GetAttributesFunc == nil {
@ -342,6 +389,33 @@ func (mock *DeviceMock) GetAttributesCalls() []struct {
 	return calls
 }

+// GetBrand records the call in mock.calls.GetBrand and returns the result of GetBrandFunc; it panics if GetBrandFunc is nil.
+func (mock *DeviceMock) GetBrand() (BrandType, Return) {
+	if mock.GetBrandFunc == nil {
+		panic("DeviceMock.GetBrandFunc: method is nil but Device.GetBrand was just called")
+	}
+	callInfo := struct {
+	}{} // the method takes no arguments, so the call record is an empty struct
+	mock.lockGetBrand.Lock()
+	mock.calls.GetBrand = append(mock.calls.GetBrand, callInfo)
+	mock.lockGetBrand.Unlock() // released before GetBrandFunc is invoked
+	return mock.GetBrandFunc()
+}
+
+// GetBrandCalls returns the calls recorded for GetBrand (one empty struct per call), read while holding lockGetBrand for reading.
+// Check the length with:
+//
+//	len(mockedDevice.GetBrandCalls())
+func (mock *DeviceMock) GetBrandCalls() []struct {
+} {
+	var calls []struct {
+	}
+	mock.lockGetBrand.RLock()
+	calls = mock.calls.GetBrand // assigns the slice itself; the elements are not copied
+	mock.lockGetBrand.RUnlock()
+	return calls
+}
+
 // GetComputeInstanceId calls GetComputeInstanceIdFunc.
 func (mock *DeviceMock) GetComputeInstanceId() (int, Return) {
 	if mock.GetComputeInstanceIdFunc == nil {
--- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/types.go
+++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/types.go
@ -40,7 +40,9 @@ type Interface interface {
 //go:generate moq -out device_mock.go . Device
 type Device interface {
 	CreateGpuInstanceWithPlacement(*GpuInstanceProfileInfo, *GpuInstancePlacement) (GpuInstance, Return)
+	GetArchitecture() (DeviceArchitecture, Return)
 	GetAttributes() (DeviceAttributes, Return)
+	GetBrand() (BrandType, Return)
 	GetComputeInstanceId() (int, Return)
 	GetCudaComputeCapability() (int, int, Return)
 	GetDeviceHandleFromMigDeviceHandle() (Device, Return)
@ -136,3 +138,9 @@ type ComputeInstancePlacement nvml.ComputeInstancePlacement

 // DeviceAttributes stores information about MIG devices
 type DeviceAttributes nvml.DeviceAttributes
+
+// DeviceArchitecture represents the hardware architecture of a GPU device; it is a named type defined from nvml.DeviceArchitecture.
+type DeviceArchitecture nvml.DeviceArchitecture
+
+// BrandType represents the brand of a GPU device; it is a named type defined from nvml.BrandType.
+type BrandType nvml.BrandType
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -2,7 +2,7 @@
 ## explicit; go 1.16
 github.com/BurntSushi/toml
 github.com/BurntSushi/toml/internal
-# github.com/NVIDIA/go-nvml v0.12.0-0
+# github.com/NVIDIA/go-nvml v0.12.0-1
 ## explicit; go 1.15
 github.com/NVIDIA/go-nvml/pkg/dl
 github.com/NVIDIA/go-nvml/pkg/nvml
@ -62,8 +62,8 @@ github.com/syndtr/gocapability/capability
 github.com/urfave/cli/v2
 # github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb
 ## explicit
-# gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438
-## explicit; go 1.16
+# gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386
+## explicit; go 1.20
 gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device
 gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info
 gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml