diff --git a/CHANGELOG.md b/CHANGELOG.md index d2d515f5..d9e48600 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ * Generate CDI specification files with `644` permissions to allow rootless applications (e.g. podman) * Add `nvidia-ctk cdi list` command to show the known CDI devices. * Add support for generating merged devices (e.g. `all` device) to the nvcdi API. +* Use *.* pattern to locate libcuda.so when generating a CDI specification to support platforms where a patch version is not specified. +* Update go-nvlib to skip devices that are not MIG capable when generating CDI specifications. ## v1.13.1 diff --git a/go.mod b/go.mod index 94f0875c..ec0d7534 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.20 require ( github.com/BurntSushi/toml v1.2.1 - github.com/NVIDIA/go-nvml v0.12.0-0 + github.com/NVIDIA/go-nvml v0.12.0-1 github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a github.com/fsnotify/fsnotify v1.5.4 github.com/opencontainers/runtime-spec v1.1.0-rc.2 @@ -12,7 +12,7 @@ require ( github.com/sirupsen/logrus v1.9.0 github.com/stretchr/testify v1.8.1 github.com/urfave/cli/v2 v2.3.0 - gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438 + gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386 golang.org/x/mod v0.5.0 golang.org/x/sys v0.7.0 ) diff --git a/go.sum b/go.sum index b87ae41d..3722f9e4 100644 --- a/go.sum +++ b/go.sum @@ -1,9 +1,8 @@ github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak= github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= -github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= -github.com/NVIDIA/go-nvml v0.12.0-0 h1:eHYNHbzAsMgWYshf6dEmTY66/GCXnORJFnzm3TNH4mc= -github.com/NVIDIA/go-nvml v0.12.0-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= +github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM= +github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a h1:sP3PcgyIkRlHqfF3Jfpe/7G8kf/qpzG4C8r94y9hLbE= @@ -78,8 +77,8 @@ github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHo github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74= github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= -gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438 h1:+qRai7XRl8omFQVCeHcaWzL542Yw64vfmuXG+79ZCIc= -gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438/go.mod h1:GStidGxhaqJhYFW1YpOnLvYCbL2EsM0od7IW4u7+JgU= +gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386 h1:byHxP+mlgNQ4GX31owfgCIq5fJCsdJMchiJHGuM2rxw= +gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386/go.mod h1:KYZksBgh18o+uzgnpDazzG4LVYtnfB96VXHMXypEtik= golang.org/x/mod v0.5.0 h1:UG21uOlmZabA4fW5i7ZX6bjw1xELEGg/ZLgZq9auk/Q= golang.org/x/mod v0.5.0/go.mod h1:5OXOZSfqPIIbmVBIIKWRFfZjPR0E5r58TLhUjH0a2Ro= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go index f19da394..91a7baa9 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/device.go @@ -2001,9 +2001,16 @@ func DeviceGetGpuInstancePossiblePlacements(Device Device, Info *GpuInstanceProf if Info == nil { return nil, ERROR_INVALID_ARGUMENT } - var Count uint32 = Info.InstanceCount + var Count uint32 + ret := nvmlDeviceGetGpuInstancePossiblePlacements(Device, Info.Id, nil, &Count) + if ret != SUCCESS { + return nil, ret + } + if Count == 0 { + return []GpuInstancePlacement{}, ret + } Placements := make([]GpuInstancePlacement, Count) - ret := nvmlDeviceGetGpuInstancePossiblePlacements(Device, Info.Id, &Placements[0], &Count) + ret = nvmlDeviceGetGpuInstancePossiblePlacements(Device, Info.Id, &Placements[0], &Count) return Placements[:Count], ret } @@ -2577,9 +2584,9 @@ func (Device Device) GetVgpuSchedulerCapabilities() (VgpuSchedulerCapabilities, } // nvml.GpuInstanceGetComputeInstancePossiblePlacements() -func GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, ProfileId int) ([]ComputeInstancePlacement, Return) { +func GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, Info *ComputeInstanceProfileInfo) ([]ComputeInstancePlacement, Return) { var Count uint32 - ret := nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, uint32(ProfileId), nil, &Count) + ret := nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, Info.Id, nil, &Count) if ret != SUCCESS { return nil, ret } @@ -2587,21 +2594,21 @@ func GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, Pr return []ComputeInstancePlacement{}, ret } PlacementArray := make([]ComputeInstancePlacement, Count) - ret = nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, uint32(ProfileId), &PlacementArray[0], &Count) + ret = nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, Info.Id, &PlacementArray[0], &Count) return PlacementArray, ret } -func (GpuInstance GpuInstance) GetComputeInstancePossiblePlacements(ProfileId int) ([]ComputeInstancePlacement, Return) { - return GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, ProfileId) +func (GpuInstance GpuInstance) GetComputeInstancePossiblePlacements(Info *ComputeInstanceProfileInfo) ([]ComputeInstancePlacement, Return) { + return GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, Info) } // nvml.GpuInstanceCreateComputeInstanceWithPlacement() -func GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance GpuInstance, ProfileId int, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return { - return nvmlGpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, uint32(ProfileId), Placement, ComputeInstance) +func GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance GpuInstance, Info *ComputeInstanceProfileInfo, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return { + return nvmlGpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, Info.Id, Placement, ComputeInstance) } -func (GpuInstance GpuInstance) CreateComputeInstanceWithPlacement(ProfileId int, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return { - return GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, ProfileId, Placement, ComputeInstance) +func (GpuInstance GpuInstance) CreateComputeInstanceWithPlacement(Info *ComputeInstanceProfileInfo, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return { + return GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, Info, Placement, ComputeInstance) } // nvml.DeviceGetGpuFabricInfo() diff --git a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/api.go b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/api.go index 1643fcc9..777789b4 100644 --- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/api.go +++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/api.go @@ -22,6 +22,7 @@ import ( // Interface provides the API to the 'device' package type Interface interface { + AssertValidMigProfileFormat(profile string) error GetDevices() ([]Device, error) GetMigDevices() ([]MigDevice, error) GetMigProfiles() ([]MigProfile, error) @@ -39,6 +40,8 @@ type Interface interface { type devicelib struct { nvml nvml.Interface skippedDevices map[string]struct{} + verifySymbols *bool + migProfiles []MigProfile } var _ Interface = &devicelib{} @@ -52,6 +55,10 @@ func New(opts ...Option) Interface { if d.nvml == nil { d.nvml = nvml.New() } + if d.verifySymbols == nil { + verify := true + d.verifySymbols = &verify + } if d.skippedDevices == nil { WithSkippedDevices( "DGX Display", @@ -68,6 +75,13 @@ func WithNvml(nvml nvml.Interface) Option { } } +// WithVerifySymbols provides an option to toggle whether to verify select symbols exist in dynamic libraries before calling them +func WithVerifySymbols(verify bool) Option { + return func(d *devicelib) { + d.verifySymbols = &verify + } +} + // WithSkippedDevices provides an Option to set devices to be skipped by model name func WithSkippedDevices(names ...string) Option { return func(d *devicelib) { diff --git a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/device.go b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/device.go index 3d549e46..62e6c3ed 100644 --- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/device.go +++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/device.go @@ -26,6 +26,9 @@ import ( // Device defines the set of extended functions associated with a device.Device type Device interface { nvml.Device + GetArchitectureAsString() (string, error) + GetBrandAsString() (string, error) + GetCudaComputeCapabilityAsString() (string, error) GetMigDevices() ([]MigDevice, error) GetMigProfiles() ([]MigProfile, error) IsMigCapable() (bool, error) @@ -36,7 +39,8 @@ type Device interface { type device struct { nvml.Device - lib *devicelib + lib *devicelib + migProfiles []MigProfile } var _ Device = &device{} @@ -57,12 +61,98 @@ func (d *devicelib) NewDeviceByUUID(uuid string) (Device, error) { // newDevice creates a device from an nvml.Device func (d *devicelib) newDevice(dev nvml.Device) (*device, error) { - return &device{dev, d}, nil + return &device{dev, d, nil}, nil +} + +// GetArchitectureAsString returns the Device architecture as a string +func (d *device) GetArchitectureAsString() (string, error) { + arch, ret := d.GetArchitecture() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("error getting device architecture: %v", ret) + } + switch arch { + case nvml.DEVICE_ARCH_KEPLER: + return "Kepler", nil + case nvml.DEVICE_ARCH_MAXWELL: + return "Maxwell", nil + case nvml.DEVICE_ARCH_PASCAL: + return "Pascal", nil + case nvml.DEVICE_ARCH_VOLTA: + return "Volta", nil + case nvml.DEVICE_ARCH_TURING: + return "Turing", nil + case nvml.DEVICE_ARCH_AMPERE: + return "Ampere", nil + case nvml.DEVICE_ARCH_ADA: + return "Ada", nil + case nvml.DEVICE_ARCH_HOPPER: + return "Hopper", nil + case nvml.DEVICE_ARCH_UNKNOWN: + return "Unknown", nil + } + return "", fmt.Errorf("error interpreting device architecture as string: %v", arch) +} + +// GetBrandAsString returns the Device architecture as a string +func (d *device) GetBrandAsString() (string, error) { + brand, ret := d.GetBrand() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("error getting device brand: %v", ret) + } + switch brand { + case nvml.BRAND_UNKNOWN: + return "Unknown", nil + case nvml.BRAND_QUADRO: + return "Quadro", nil + case nvml.BRAND_TESLA: + return "Tesla", nil + case nvml.BRAND_NVS: + return "NVS", nil + case nvml.BRAND_GRID: + return "Grid", nil + case nvml.BRAND_GEFORCE: + return "GeForce", nil + case nvml.BRAND_TITAN: + return "Titan", nil + case nvml.BRAND_NVIDIA_VAPPS: + return "NvidiaVApps", nil + case nvml.BRAND_NVIDIA_VPC: + return "NvidiaVPC", nil + case nvml.BRAND_NVIDIA_VCS: + return "NvidiaVCS", nil + case nvml.BRAND_NVIDIA_VWS: + return "NvidiaVWS", nil + // Deprecated in favor of nvml.BRAND_NVIDIA_CLOUD_GAMING + //case nvml.BRAND_NVIDIA_VGAMING: + // return "VGaming", nil + case nvml.BRAND_NVIDIA_CLOUD_GAMING: + return "NvidiaCloudGaming", nil + case nvml.BRAND_QUADRO_RTX: + return "QuadroRTX", nil + case nvml.BRAND_NVIDIA_RTX: + return "NvidiaRTX", nil + case nvml.BRAND_NVIDIA: + return "Nvidia", nil + case nvml.BRAND_GEFORCE_RTX: + return "GeForceRTX", nil + case nvml.BRAND_TITAN_RTX: + return "TitanRTX", nil + } + return "", fmt.Errorf("error interpreting device brand as string: %v", brand) +} + +// GetCudaComputeCapabilityAsString returns the Device's CUDA compute capability as a version string +func (d *device) GetCudaComputeCapabilityAsString() (string, error) { + major, minor, ret := d.GetCudaComputeCapability() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("error getting CUDA compute capability: %v", ret) + } + return fmt.Sprintf("%d.%d", major, minor), nil } // IsMigCapable checks if a device is capable of having MIG paprtitions created on it func (d *device) IsMigCapable() (bool, error) { - err := nvmlLookupSymbol("nvmlDeviceGetMigMode") + err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode") if err != nil { return false, nil } @@ -80,7 +170,7 @@ func (d *device) IsMigCapable() (bool, error) { // IsMigEnabled checks if a device has MIG mode currently enabled on it func (d *device) IsMigEnabled() (bool, error) { - err := nvmlLookupSymbol("nvmlDeviceGetMigMode") + err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode") if err != nil { return false, nil } @@ -98,6 +188,14 @@ func (d *device) IsMigEnabled() (bool, error) { // VisitMigDevices walks a top-level device and invokes a callback function for each MIG device configured on it func (d *device) VisitMigDevices(visit func(int, MigDevice) error) error { + capable, err := d.IsMigCapable() + if err != nil { + return fmt.Errorf("error checking if GPU is MIG capable: %v", err) + } + if !capable { + return nil + } + count, ret := nvml.Device(d).GetMaxMigDeviceCount() if ret != nvml.SUCCESS { return fmt.Errorf("error getting max MIG device count: %v", ret) @@ -161,6 +259,23 @@ func (d *device) VisitMigProfiles(visit func(MigProfile) error) error { return fmt.Errorf("error creating MIG profile: %v", err) } + // NOTE: The NVML API doesn't currently let us query the set of + // valid Compute Instance profiles without first instantiating + // a GPU Instance to check against. In theory, it should be + // possible to get this information without a reference to a + // GPU instance, but no API is provided for that at the moment. + // We run the checks below to weed out invalid profiles + // heuristically, given what we know about how they are + // physically constructed. In the future we should do this via + // NVML once a proper API for this exists. + pi := p.GetInfo() + if pi.C > pi.G { + continue + } + if (pi.C < pi.G) && ((pi.C * 2) > (pi.G + 1)) { + continue + } + err = visit(p) if err != nil { return fmt.Errorf("error visiting MIG profile: %v", err) @@ -186,6 +301,12 @@ func (d *device) GetMigDevices() ([]MigDevice, error) { // GetMigProfiles gets the set of unique MIG profiles associated with a top-level device func (d *device) GetMigProfiles() ([]MigProfile, error) { + // Return the cached list if available + if d.migProfiles != nil { + return d.migProfiles, nil + } + + // Otherwise generate it... var profiles []MigProfile err := d.VisitMigProfiles(func(p MigProfile) error { profiles = append(profiles, p) @@ -194,6 +315,9 @@ func (d *device) GetMigProfiles() ([]MigProfile, error) { if err != nil { return nil, err } + + // And cache it before returning + d.migProfiles = profiles return profiles, nil } @@ -321,6 +445,12 @@ func (d *devicelib) GetMigDevices() ([]MigDevice, error) { // GetMigProfiles gets the set of unique MIG profiles across all top-level devices func (d *devicelib) GetMigProfiles() ([]MigProfile, error) { + // Return the cached list if available + if d.migProfiles != nil { + return d.migProfiles, nil + } + + // Otherwise generate it... var profiles []MigProfile err := d.VisitMigProfiles(func(p MigProfile) error { profiles = append(profiles, p) @@ -329,11 +459,20 @@ func (d *devicelib) GetMigProfiles() ([]MigProfile, error) { if err != nil { return nil, err } + + // And cache it before returning + d.migProfiles = profiles return profiles, nil } // nvmlLookupSymbol checks to see if the given symbol is present in the NVML library -func nvmlLookupSymbol(symbol string) error { +func (d *devicelib) nvmlLookupSymbol(symbol string) error { + // If devicelib is configured to not verify symbols, then we short-circuit here + if !*d.verifySymbols { + return nil + } + + // Otherwise we lookup the provided symbol and verify it is available lib := dl.New("libnvidia-ml.so.1", dl.RTLD_LAZY|dl.RTLD_GLOBAL) if lib == nil { return fmt.Errorf("error instantiating DynamicLibrary for NVML") diff --git a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/mig_profile.go b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/mig_profile.go index 5aa00e6b..13a5dbf4 100644 --- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/mig_profile.go +++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device/mig_profile.go @@ -19,6 +19,7 @@ package device import ( "fmt" "math" + "sort" "strconv" "strings" @@ -36,6 +37,7 @@ type MigProfile interface { String() string GetInfo() MigProfileInfo Equals(other MigProfile) bool + Matches(profile string) bool } // MigProfileInfo holds all info associated with a specific MIG profile @@ -55,11 +57,12 @@ var _ MigProfile = &MigProfileInfo{} func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, migMemorySizeMB, deviceMemorySizeBytes uint64) (MigProfile, error) { giSlices := 0 switch giProfileID { - case nvml.GPU_INSTANCE_PROFILE_1_SLICE: + case nvml.GPU_INSTANCE_PROFILE_1_SLICE, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: giSlices = 1 - case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: - giSlices = 1 - case nvml.GPU_INSTANCE_PROFILE_2_SLICE: + case nvml.GPU_INSTANCE_PROFILE_2_SLICE, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: giSlices = 2 case nvml.GPU_INSTANCE_PROFILE_3_SLICE: giSlices = 3 @@ -77,7 +80,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, ciSlices := 0 switch ciProfileID { - case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: + case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1: ciSlices = 1 case nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: ciSlices = 2 @@ -97,7 +101,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, var attrs []string switch giProfileID { - case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: + case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1: attrs = append(attrs, AttributeMediaExtensions) } @@ -114,90 +119,30 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, return p, nil } +// AssertValidMigProfileFormat checks if the string is in the proper format to represent a MIG profile +func (d *devicelib) AssertValidMigProfileFormat(profile string) error { + _, _, _, _, err := parseMigProfile(profile) + return err +} + // ParseMigProfile converts a string representation of a MigProfile into an object func (d *devicelib) ParseMigProfile(profile string) (MigProfile, error) { - var err error - var c, g, gb int - var attrs []string - - if len(profile) == 0 { - return nil, fmt.Errorf("empty Profile string") - } - - split := strings.SplitN(profile, "+", 2) - if len(split) == 2 { - attrs, err = parseMigProfileAttributes(split[1]) - if err != nil { - return nil, fmt.Errorf("error parsing attributes following '+' in Profile string: %v", err) - } - } - - c, g, gb, err = parseMigProfileFields(split[0]) + profiles, err := d.GetMigProfiles() if err != nil { - return nil, fmt.Errorf("error parsing '.' separated fields in Profile string: %v", err) + return nil, fmt.Errorf("error getting list of possible MIG profiles: %v", err) } - p := &MigProfileInfo{ - C: c, - G: g, - GB: gb, - Attributes: attrs, - } - - switch c { - case 1: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE - case 2: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE - case 3: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE - case 4: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE - case 6: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE - case 7: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE - case 8: - p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE - default: - return nil, fmt.Errorf("unknown Compute Instance slice size: %v", c) - } - - switch g { - case 1: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE - case 2: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_2_SLICE - case 3: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_3_SLICE - case 4: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_4_SLICE - case 6: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_6_SLICE - case 7: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_7_SLICE - case 8: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_8_SLICE - default: - return nil, fmt.Errorf("unknown GPU Instance slice size: %v", g) - } - - p.CIEngProfileID = nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED - - for _, a := range attrs { - switch a { - case AttributeMediaExtensions: - p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1 - default: - return nil, fmt.Errorf("unknown Profile attribute: %v", a) + for _, p := range profiles { + if p.Matches(profile) { + return p, nil } } - return p, nil + return nil, fmt.Errorf("unable to parse profile string into a valid profile") } // String returns the string representation of a Profile -func (p *MigProfileInfo) String() string { +func (p MigProfileInfo) String() string { var suffix string if len(p.Attributes) > 0 { suffix = "+" + strings.Join(p.Attributes, ",") @@ -209,35 +154,89 @@ func (p *MigProfileInfo) String() string { } // GetInfo returns detailed info about a Profile -func (p *MigProfileInfo) GetInfo() MigProfileInfo { - return *p +func (p MigProfileInfo) GetInfo() MigProfileInfo { + return p } // Equals checks if two Profiles are identical or not -func (p *MigProfileInfo) Equals(other MigProfile) bool { - switch o := other.(type) { - case *MigProfileInfo: - if p.C != o.C { - return false - } - if p.G != o.G { - return false - } - if p.GB != o.GB { - return false - } - if p.GIProfileID != o.GIProfileID { - return false - } - if p.CIProfileID != o.CIProfileID { - return false - } - if p.CIEngProfileID != o.CIEngProfileID { - return false - } - return true +func (p MigProfileInfo) Equals(other MigProfile) bool { + o := other.GetInfo() + if p.C != o.C { + return false } - return false + if p.G != o.G { + return false + } + if p.GB != o.GB { + return false + } + if p.GIProfileID != o.GIProfileID { + return false + } + if p.CIProfileID != o.CIProfileID { + return false + } + if p.CIEngProfileID != o.CIEngProfileID { + return false + } + return true +} + +// Matches checks if a MigProfile matches the string passed in +func (p MigProfileInfo) Matches(profile string) bool { + c, g, gb, attrs, err := parseMigProfile(profile) + if err != nil { + return false + } + if c != p.C { + return false + } + if g != p.G { + return false + } + if gb != p.GB { + return false + } + if len(attrs) != len(p.Attributes) { + return false + } + sort.Strings(attrs) + sort.Strings(p.Attributes) + for i, a := range p.Attributes { + if a != attrs[i] { + return false + } + } + return true +} + +func parseMigProfile(profile string) (int, int, int, []string, error) { + // If we are handed the empty string, we cannot parse it + if profile == "" { + return -1, -1, -1, nil, fmt.Errorf("profile is the empty string") + } + + // Split by + to separate out attributes + split := strings.SplitN(profile, "+", 2) + + // Check to make sure the c, g, and gb values match + c, g, gb, err := parseMigProfileFields(split[0]) + if err != nil { + return -1, -1, -1, nil, fmt.Errorf("cannot parse fields of '%v': %v", profile, err) + } + + // If we have no attributes we are done + if len(split) == 1 { + return c, g, gb, nil, nil + } + + // Make sure we have the same set of attributes + attrs, err := parseMigProfileAttributes(split[1]) + if err != nil { + return -1, -1, -1, nil, fmt.Errorf("cannot parse attributes of '%v': %v", profile, err) + } + + return c, g, gb, attrs, nil } func parseMigProfileField(s string, field string) (int, error) { diff --git a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/consts.go b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/consts.go index b353ef9d..c9b85de4 100644 --- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/consts.go +++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/consts.go @@ -49,6 +49,42 @@ const ( ERROR_UNKNOWN = Return(nvml.ERROR_UNKNOWN) ) +// Device architecture constants +const ( + DEVICE_ARCH_KEPLER = nvml.DEVICE_ARCH_KEPLER + DEVICE_ARCH_MAXWELL = nvml.DEVICE_ARCH_MAXWELL + DEVICE_ARCH_PASCAL = nvml.DEVICE_ARCH_PASCAL + DEVICE_ARCH_VOLTA = nvml.DEVICE_ARCH_VOLTA + DEVICE_ARCH_TURING = nvml.DEVICE_ARCH_TURING + DEVICE_ARCH_AMPERE = nvml.DEVICE_ARCH_AMPERE + DEVICE_ARCH_ADA = nvml.DEVICE_ARCH_ADA + DEVICE_ARCH_HOPPER = nvml.DEVICE_ARCH_HOPPER + DEVICE_ARCH_UNKNOWN = nvml.DEVICE_ARCH_UNKNOWN +) + +// Device brand constants +const ( + BRAND_UNKNOWN = BrandType(nvml.BRAND_UNKNOWN) + BRAND_QUADRO = BrandType(nvml.BRAND_QUADRO) + BRAND_TESLA = BrandType(nvml.BRAND_TESLA) + BRAND_NVS = BrandType(nvml.BRAND_NVS) + BRAND_GRID = BrandType(nvml.BRAND_GRID) + BRAND_GEFORCE = BrandType(nvml.BRAND_GEFORCE) + BRAND_TITAN = BrandType(nvml.BRAND_TITAN) + BRAND_NVIDIA_VAPPS = BrandType(nvml.BRAND_NVIDIA_VAPPS) + BRAND_NVIDIA_VPC = BrandType(nvml.BRAND_NVIDIA_VPC) + BRAND_NVIDIA_VCS = BrandType(nvml.BRAND_NVIDIA_VCS) + BRAND_NVIDIA_VWS = BrandType(nvml.BRAND_NVIDIA_VWS) + BRAND_NVIDIA_CLOUD_GAMING = BrandType(nvml.BRAND_NVIDIA_CLOUD_GAMING) + BRAND_NVIDIA_VGAMING = BrandType(nvml.BRAND_NVIDIA_VGAMING) + BRAND_QUADRO_RTX = BrandType(nvml.BRAND_QUADRO_RTX) + BRAND_NVIDIA_RTX = BrandType(nvml.BRAND_NVIDIA_RTX) + BRAND_NVIDIA = BrandType(nvml.BRAND_NVIDIA) + BRAND_GEFORCE_RTX = BrandType(nvml.BRAND_GEFORCE_RTX) + BRAND_TITAN_RTX = BrandType(nvml.BRAND_TITAN_RTX) + BRAND_COUNT = BrandType(nvml.BRAND_COUNT) +) + // MIG Mode constants const ( DEVICE_MIG_ENABLE = nvml.DEVICE_MIG_ENABLE @@ -65,19 +101,22 @@ const ( GPU_INSTANCE_PROFILE_7_SLICE = nvml.GPU_INSTANCE_PROFILE_7_SLICE GPU_INSTANCE_PROFILE_8_SLICE = nvml.GPU_INSTANCE_PROFILE_8_SLICE GPU_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1 + GPU_INSTANCE_PROFILE_1_SLICE_REV2 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2 + GPU_INSTANCE_PROFILE_2_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1 GPU_INSTANCE_PROFILE_COUNT = nvml.GPU_INSTANCE_PROFILE_COUNT ) // Compute Instance Profiles const ( - COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE - COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE - COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE - COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE - COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE - COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE - COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE - COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT + COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE + COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE + COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE + COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE + COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE + COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE + COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE + COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 + COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT ) // Compute Instance Engine Profiles diff --git a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device.go b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device.go index faaac256..3c318a7b 100644 --- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device.go +++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device.go @@ -150,12 +150,24 @@ func (d nvmlDevice) GetAttributes() (DeviceAttributes, Return) { return DeviceAttributes(a), Return(r) } -// GetName returns the device attributes for a MIG device +// GetName returns the product name of a Device func (d nvmlDevice) GetName() (string, Return) { n, r := nvml.Device(d).GetName() return n, Return(r) } +// GetBrand returns the brand of a Device +func (d nvmlDevice) GetBrand() (BrandType, Return) { + b, r := nvml.Device(d).GetBrand() + return BrandType(b), Return(r) +} + +// GetArchitecture returns the architecture of a Device +func (d nvmlDevice) GetArchitecture() (DeviceArchitecture, Return) { + a, r := nvml.Device(d).GetArchitecture() + return DeviceArchitecture(a), Return(r) +} + // RegisterEvents registers the specified event set and type with the device func (d nvmlDevice) RegisterEvents(EventTypes uint64, Set EventSet) Return { return Return(nvml.Device(d).RegisterEvents(EventTypes, nvml.EventSet(Set))) diff --git a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device_mock.go b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device_mock.go index 0093e4e0..34e563c8 100644 --- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device_mock.go +++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/device_mock.go @@ -20,9 +20,15 @@ var _ Device = &DeviceMock{} // CreateGpuInstanceWithPlacementFunc: func(gpuInstanceProfileInfo *GpuInstanceProfileInfo, gpuInstancePlacement *GpuInstancePlacement) (GpuInstance, Return) { // panic("mock out the CreateGpuInstanceWithPlacement method") // }, +// GetArchitectureFunc: func() (DeviceArchitecture, Return) { +// panic("mock out the GetArchitecture method") +// }, // GetAttributesFunc: func() (DeviceAttributes, Return) { // panic("mock out the GetAttributes method") // }, +// GetBrandFunc: func() (BrandType, Return) { +// panic("mock out the GetBrand method") +// }, // GetComputeInstanceIdFunc: func() (int, Return) { // panic("mock out the GetComputeInstanceId method") // }, @@ -96,9 +102,15 @@ type DeviceMock struct { // CreateGpuInstanceWithPlacementFunc mocks the CreateGpuInstanceWithPlacement method. CreateGpuInstanceWithPlacementFunc func(gpuInstanceProfileInfo *GpuInstanceProfileInfo, gpuInstancePlacement *GpuInstancePlacement) (GpuInstance, Return) + // GetArchitectureFunc mocks the GetArchitecture method. + GetArchitectureFunc func() (DeviceArchitecture, Return) + // GetAttributesFunc mocks the GetAttributes method. GetAttributesFunc func() (DeviceAttributes, Return) + // GetBrandFunc mocks the GetBrand method. + GetBrandFunc func() (BrandType, Return) + // GetComputeInstanceIdFunc mocks the GetComputeInstanceId method. GetComputeInstanceIdFunc func() (int, Return) @@ -171,9 +183,15 @@ type DeviceMock struct { // GpuInstancePlacement is the gpuInstancePlacement argument value. GpuInstancePlacement *GpuInstancePlacement } + // GetArchitecture holds details about calls to the GetArchitecture method. + GetArchitecture []struct { + } // GetAttributes holds details about calls to the GetAttributes method. GetAttributes []struct { } + // GetBrand holds details about calls to the GetBrand method. + GetBrand []struct { + } // GetComputeInstanceId holds details about calls to the GetComputeInstanceId method. GetComputeInstanceId []struct { } @@ -255,7 +273,9 @@ type DeviceMock struct { } } lockCreateGpuInstanceWithPlacement sync.RWMutex + lockGetArchitecture sync.RWMutex lockGetAttributes sync.RWMutex + lockGetBrand sync.RWMutex lockGetComputeInstanceId sync.RWMutex lockGetCudaComputeCapability sync.RWMutex lockGetDeviceHandleFromMigDeviceHandle sync.RWMutex @@ -315,6 +335,33 @@ func (mock *DeviceMock) CreateGpuInstanceWithPlacementCalls() []struct { return calls } +// GetArchitecture calls GetArchitectureFunc. +func (mock *DeviceMock) GetArchitecture() (DeviceArchitecture, Return) { + if mock.GetArchitectureFunc == nil { + panic("DeviceMock.GetArchitectureFunc: method is nil but Device.GetArchitecture was just called") + } + callInfo := struct { + }{} + mock.lockGetArchitecture.Lock() + mock.calls.GetArchitecture = append(mock.calls.GetArchitecture, callInfo) + mock.lockGetArchitecture.Unlock() + return mock.GetArchitectureFunc() +} + +// GetArchitectureCalls gets all the calls that were made to GetArchitecture. +// Check the length with: +// +// len(mockedDevice.GetArchitectureCalls()) +func (mock *DeviceMock) GetArchitectureCalls() []struct { +} { + var calls []struct { + } + mock.lockGetArchitecture.RLock() + calls = mock.calls.GetArchitecture + mock.lockGetArchitecture.RUnlock() + return calls +} + // GetAttributes calls GetAttributesFunc. func (mock *DeviceMock) GetAttributes() (DeviceAttributes, Return) { if mock.GetAttributesFunc == nil { @@ -342,6 +389,33 @@ func (mock *DeviceMock) GetAttributesCalls() []struct { return calls } +// GetBrand calls GetBrandFunc. +func (mock *DeviceMock) GetBrand() (BrandType, Return) { + if mock.GetBrandFunc == nil { + panic("DeviceMock.GetBrandFunc: method is nil but Device.GetBrand was just called") + } + callInfo := struct { + }{} + mock.lockGetBrand.Lock() + mock.calls.GetBrand = append(mock.calls.GetBrand, callInfo) + mock.lockGetBrand.Unlock() + return mock.GetBrandFunc() +} + +// GetBrandCalls gets all the calls that were made to GetBrand. +// Check the length with: +// +// len(mockedDevice.GetBrandCalls()) +func (mock *DeviceMock) GetBrandCalls() []struct { +} { + var calls []struct { + } + mock.lockGetBrand.RLock() + calls = mock.calls.GetBrand + mock.lockGetBrand.RUnlock() + return calls +} + // GetComputeInstanceId calls GetComputeInstanceIdFunc. func (mock *DeviceMock) GetComputeInstanceId() (int, Return) { if mock.GetComputeInstanceIdFunc == nil { diff --git a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/types.go b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/types.go index b1c97c0d..39d005f6 100644 --- a/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/types.go +++ b/vendor/gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml/types.go @@ -40,7 +40,9 @@ type Interface interface { //go:generate moq -out device_mock.go . Device type Device interface { CreateGpuInstanceWithPlacement(*GpuInstanceProfileInfo, *GpuInstancePlacement) (GpuInstance, Return) + GetArchitecture() (DeviceArchitecture, Return) GetAttributes() (DeviceAttributes, Return) + GetBrand() (BrandType, Return) GetComputeInstanceId() (int, Return) GetCudaComputeCapability() (int, int, Return) GetDeviceHandleFromMigDeviceHandle() (Device, Return) @@ -136,3 +138,9 @@ type ComputeInstancePlacement nvml.ComputeInstancePlacement // DeviceAttributes stores information about MIG devices type DeviceAttributes nvml.DeviceAttributes + +// DeviceArchitecture represents the hardware architecture of a GPU device +type DeviceArchitecture nvml.DeviceArchitecture + +// BrandType represents the brand of a GPU device +type BrandType nvml.BrandType diff --git a/vendor/modules.txt b/vendor/modules.txt index 70323b39..4715e44e 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -2,7 +2,7 @@ ## explicit; go 1.16 github.com/BurntSushi/toml github.com/BurntSushi/toml/internal -# github.com/NVIDIA/go-nvml v0.12.0-0 +# github.com/NVIDIA/go-nvml v0.12.0-1 ## explicit; go 1.15 github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml @@ -62,8 +62,8 @@ github.com/syndtr/gocapability/capability github.com/urfave/cli/v2 # github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb ## explicit -# gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438 -## explicit; go 1.16 +# gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386 +## explicit; go 1.20 gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml