Update go-nvlib to skip non-MIG devices

This change updates go-nvlib to ensure that non-migcapable GPUs
are skipped when generating CDI specifications for MIG devices.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2023-05-22 15:28:38 +02:00
parent 3ea02d13fc
commit e11f65e51e
12 changed files with 432 additions and 139 deletions

View File

@ -7,6 +7,8 @@
* Generate CDI specification files with `644` permissions to allow rootless applications (e.g. podman)
* Add `nvidia-ctk cdi list` command to show the known CDI devices.
* Add support for generating merged devices (e.g. `all` device) to the nvcdi API.
* Use *.* pattern to locate libcuda.so when generating a CDI specification to support platforms where a patch version is not specified.
* Update go-nvlib to skip devices that are not MIG capable when generating CDI specifications.
## v1.13.1

4
go.mod
View File

@ -4,7 +4,7 @@ go 1.20
require (
github.com/BurntSushi/toml v1.2.1
github.com/NVIDIA/go-nvml v0.12.0-0
github.com/NVIDIA/go-nvml v0.12.0-1
github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a
github.com/fsnotify/fsnotify v1.5.4
github.com/opencontainers/runtime-spec v1.1.0-rc.2
@ -12,7 +12,7 @@ require (
github.com/sirupsen/logrus v1.9.0
github.com/stretchr/testify v1.8.1
github.com/urfave/cli/v2 v2.3.0
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386
golang.org/x/mod v0.5.0
golang.org/x/sys v0.7.0
)

9
go.sum
View File

@ -1,9 +1,8 @@
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/toml v1.2.1 h1:9F2/+DoOYIOksmaJFPw1tGFy1eDnIJXg+UHjuD8lTak=
github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/NVIDIA/go-nvml v0.11.6-0.0.20220823120812-7e2082095e82/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/NVIDIA/go-nvml v0.12.0-0 h1:eHYNHbzAsMgWYshf6dEmTY66/GCXnORJFnzm3TNH4mc=
github.com/NVIDIA/go-nvml v0.12.0-0/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM=
github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs=
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=
github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
github.com/container-orchestrated-devices/container-device-interface v0.5.4-0.20230111111500-5b3b5d81179a h1:sP3PcgyIkRlHqfF3Jfpe/7G8kf/qpzG4C8r94y9hLbE=
@ -78,8 +77,8 @@ github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHo
github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ=
github.com/xeipuuv/gojsonschema v1.2.0 h1:LhYJRs+L4fBtjZUfuSZIKGeVu0QRy8e5Xi7D17UxZ74=
github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y=
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438 h1:+qRai7XRl8omFQVCeHcaWzL542Yw64vfmuXG+79ZCIc=
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438/go.mod h1:GStidGxhaqJhYFW1YpOnLvYCbL2EsM0od7IW4u7+JgU=
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386 h1:byHxP+mlgNQ4GX31owfgCIq5fJCsdJMchiJHGuM2rxw=
gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386/go.mod h1:KYZksBgh18o+uzgnpDazzG4LVYtnfB96VXHMXypEtik=
golang.org/x/mod v0.5.0 h1:UG21uOlmZabA4fW5i7ZX6bjw1xELEGg/ZLgZq9auk/Q=
golang.org/x/mod v0.5.0/go.mod h1:5OXOZSfqPIIbmVBIIKWRFfZjPR0E5r58TLhUjH0a2Ro=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=

View File

@ -2001,9 +2001,16 @@ func DeviceGetGpuInstancePossiblePlacements(Device Device, Info *GpuInstanceProf
if Info == nil {
return nil, ERROR_INVALID_ARGUMENT
}
var Count uint32 = Info.InstanceCount
var Count uint32
ret := nvmlDeviceGetGpuInstancePossiblePlacements(Device, Info.Id, nil, &Count)
if ret != SUCCESS {
return nil, ret
}
if Count == 0 {
return []GpuInstancePlacement{}, ret
}
Placements := make([]GpuInstancePlacement, Count)
ret := nvmlDeviceGetGpuInstancePossiblePlacements(Device, Info.Id, &Placements[0], &Count)
ret = nvmlDeviceGetGpuInstancePossiblePlacements(Device, Info.Id, &Placements[0], &Count)
return Placements[:Count], ret
}
@ -2577,9 +2584,9 @@ func (Device Device) GetVgpuSchedulerCapabilities() (VgpuSchedulerCapabilities,
}
// nvml.GpuInstanceGetComputeInstancePossiblePlacements()
func GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, ProfileId int) ([]ComputeInstancePlacement, Return) {
func GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, Info *ComputeInstanceProfileInfo) ([]ComputeInstancePlacement, Return) {
var Count uint32
ret := nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, uint32(ProfileId), nil, &Count)
ret := nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, Info.Id, nil, &Count)
if ret != SUCCESS {
return nil, ret
}
@ -2587,21 +2594,21 @@ func GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance GpuInstance, Pr
return []ComputeInstancePlacement{}, ret
}
PlacementArray := make([]ComputeInstancePlacement, Count)
ret = nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, uint32(ProfileId), &PlacementArray[0], &Count)
ret = nvmlGpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, Info.Id, &PlacementArray[0], &Count)
return PlacementArray, ret
}
func (GpuInstance GpuInstance) GetComputeInstancePossiblePlacements(ProfileId int) ([]ComputeInstancePlacement, Return) {
return GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, ProfileId)
func (GpuInstance GpuInstance) GetComputeInstancePossiblePlacements(Info *ComputeInstanceProfileInfo) ([]ComputeInstancePlacement, Return) {
return GpuInstanceGetComputeInstancePossiblePlacements(GpuInstance, Info)
}
// nvml.GpuInstanceCreateComputeInstanceWithPlacement()
func GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance GpuInstance, ProfileId int, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return {
return nvmlGpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, uint32(ProfileId), Placement, ComputeInstance)
func GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance GpuInstance, Info *ComputeInstanceProfileInfo, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return {
return nvmlGpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, Info.Id, Placement, ComputeInstance)
}
func (GpuInstance GpuInstance) CreateComputeInstanceWithPlacement(ProfileId int, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return {
return GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, ProfileId, Placement, ComputeInstance)
func (GpuInstance GpuInstance) CreateComputeInstanceWithPlacement(Info *ComputeInstanceProfileInfo, Placement *ComputeInstancePlacement, ComputeInstance *ComputeInstance) Return {
return GpuInstanceCreateComputeInstanceWithPlacement(GpuInstance, Info, Placement, ComputeInstance)
}
// nvml.DeviceGetGpuFabricInfo()

View File

@ -22,6 +22,7 @@ import (
// Interface provides the API to the 'device' package
type Interface interface {
AssertValidMigProfileFormat(profile string) error
GetDevices() ([]Device, error)
GetMigDevices() ([]MigDevice, error)
GetMigProfiles() ([]MigProfile, error)
@ -39,6 +40,8 @@ type Interface interface {
type devicelib struct {
nvml nvml.Interface
skippedDevices map[string]struct{}
verifySymbols *bool
migProfiles []MigProfile
}
var _ Interface = &devicelib{}
@ -52,6 +55,10 @@ func New(opts ...Option) Interface {
if d.nvml == nil {
d.nvml = nvml.New()
}
if d.verifySymbols == nil {
verify := true
d.verifySymbols = &verify
}
if d.skippedDevices == nil {
WithSkippedDevices(
"DGX Display",
@ -68,6 +75,13 @@ func WithNvml(nvml nvml.Interface) Option {
}
}
// WithVerifySymbols provides an option to toggle whether to verify select symbols exist in dynamic libraries before calling them
func WithVerifySymbols(verify bool) Option {
return func(d *devicelib) {
d.verifySymbols = &verify
}
}
// WithSkippedDevices provides an Option to set devices to be skipped by model name
func WithSkippedDevices(names ...string) Option {
return func(d *devicelib) {

View File

@ -26,6 +26,9 @@ import (
// Device defines the set of extended functions associated with a device.Device
type Device interface {
nvml.Device
GetArchitectureAsString() (string, error)
GetBrandAsString() (string, error)
GetCudaComputeCapabilityAsString() (string, error)
GetMigDevices() ([]MigDevice, error)
GetMigProfiles() ([]MigProfile, error)
IsMigCapable() (bool, error)
@ -36,7 +39,8 @@ type Device interface {
type device struct {
nvml.Device
lib *devicelib
lib *devicelib
migProfiles []MigProfile
}
var _ Device = &device{}
@ -57,12 +61,98 @@ func (d *devicelib) NewDeviceByUUID(uuid string) (Device, error) {
// newDevice creates a device from an nvml.Device
func (d *devicelib) newDevice(dev nvml.Device) (*device, error) {
return &device{dev, d}, nil
return &device{dev, d, nil}, nil
}
// GetArchitectureAsString returns the Device architecture as a string
func (d *device) GetArchitectureAsString() (string, error) {
arch, ret := d.GetArchitecture()
if ret != nvml.SUCCESS {
return "", fmt.Errorf("error getting device architecture: %v", ret)
}
switch arch {
case nvml.DEVICE_ARCH_KEPLER:
return "Kepler", nil
case nvml.DEVICE_ARCH_MAXWELL:
return "Maxwell", nil
case nvml.DEVICE_ARCH_PASCAL:
return "Pascal", nil
case nvml.DEVICE_ARCH_VOLTA:
return "Volta", nil
case nvml.DEVICE_ARCH_TURING:
return "Turing", nil
case nvml.DEVICE_ARCH_AMPERE:
return "Ampere", nil
case nvml.DEVICE_ARCH_ADA:
return "Ada", nil
case nvml.DEVICE_ARCH_HOPPER:
return "Hopper", nil
case nvml.DEVICE_ARCH_UNKNOWN:
return "Unknown", nil
}
return "", fmt.Errorf("error interpreting device architecture as string: %v", arch)
}
// GetBrandAsString returns the Device architecture as a string
func (d *device) GetBrandAsString() (string, error) {
brand, ret := d.GetBrand()
if ret != nvml.SUCCESS {
return "", fmt.Errorf("error getting device brand: %v", ret)
}
switch brand {
case nvml.BRAND_UNKNOWN:
return "Unknown", nil
case nvml.BRAND_QUADRO:
return "Quadro", nil
case nvml.BRAND_TESLA:
return "Tesla", nil
case nvml.BRAND_NVS:
return "NVS", nil
case nvml.BRAND_GRID:
return "Grid", nil
case nvml.BRAND_GEFORCE:
return "GeForce", nil
case nvml.BRAND_TITAN:
return "Titan", nil
case nvml.BRAND_NVIDIA_VAPPS:
return "NvidiaVApps", nil
case nvml.BRAND_NVIDIA_VPC:
return "NvidiaVPC", nil
case nvml.BRAND_NVIDIA_VCS:
return "NvidiaVCS", nil
case nvml.BRAND_NVIDIA_VWS:
return "NvidiaVWS", nil
// Deprecated in favor of nvml.BRAND_NVIDIA_CLOUD_GAMING
//case nvml.BRAND_NVIDIA_VGAMING:
// return "VGaming", nil
case nvml.BRAND_NVIDIA_CLOUD_GAMING:
return "NvidiaCloudGaming", nil
case nvml.BRAND_QUADRO_RTX:
return "QuadroRTX", nil
case nvml.BRAND_NVIDIA_RTX:
return "NvidiaRTX", nil
case nvml.BRAND_NVIDIA:
return "Nvidia", nil
case nvml.BRAND_GEFORCE_RTX:
return "GeForceRTX", nil
case nvml.BRAND_TITAN_RTX:
return "TitanRTX", nil
}
return "", fmt.Errorf("error interpreting device brand as string: %v", brand)
}
// GetCudaComputeCapabilityAsString returns the Device's CUDA compute capability as a version string
func (d *device) GetCudaComputeCapabilityAsString() (string, error) {
major, minor, ret := d.GetCudaComputeCapability()
if ret != nvml.SUCCESS {
return "", fmt.Errorf("error getting CUDA compute capability: %v", ret)
}
return fmt.Sprintf("%d.%d", major, minor), nil
}
// IsMigCapable checks if a device is capable of having MIG paprtitions created on it
func (d *device) IsMigCapable() (bool, error) {
err := nvmlLookupSymbol("nvmlDeviceGetMigMode")
err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode")
if err != nil {
return false, nil
}
@ -80,7 +170,7 @@ func (d *device) IsMigCapable() (bool, error) {
// IsMigEnabled checks if a device has MIG mode currently enabled on it
func (d *device) IsMigEnabled() (bool, error) {
err := nvmlLookupSymbol("nvmlDeviceGetMigMode")
err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode")
if err != nil {
return false, nil
}
@ -98,6 +188,14 @@ func (d *device) IsMigEnabled() (bool, error) {
// VisitMigDevices walks a top-level device and invokes a callback function for each MIG device configured on it
func (d *device) VisitMigDevices(visit func(int, MigDevice) error) error {
capable, err := d.IsMigCapable()
if err != nil {
return fmt.Errorf("error checking if GPU is MIG capable: %v", err)
}
if !capable {
return nil
}
count, ret := nvml.Device(d).GetMaxMigDeviceCount()
if ret != nvml.SUCCESS {
return fmt.Errorf("error getting max MIG device count: %v", ret)
@ -161,6 +259,23 @@ func (d *device) VisitMigProfiles(visit func(MigProfile) error) error {
return fmt.Errorf("error creating MIG profile: %v", err)
}
// NOTE: The NVML API doesn't currently let us query the set of
// valid Compute Instance profiles without first instantiating
// a GPU Instance to check against. In theory, it should be
// possible to get this information without a reference to a
// GPU instance, but no API is provided for that at the moment.
// We run the checks below to weed out invalid profiles
// heuristically, given what we know about how they are
// physically constructed. In the future we should do this via
// NVML once a proper API for this exists.
pi := p.GetInfo()
if pi.C > pi.G {
continue
}
if (pi.C < pi.G) && ((pi.C * 2) > (pi.G + 1)) {
continue
}
err = visit(p)
if err != nil {
return fmt.Errorf("error visiting MIG profile: %v", err)
@ -186,6 +301,12 @@ func (d *device) GetMigDevices() ([]MigDevice, error) {
// GetMigProfiles gets the set of unique MIG profiles associated with a top-level device
func (d *device) GetMigProfiles() ([]MigProfile, error) {
// Return the cached list if available
if d.migProfiles != nil {
return d.migProfiles, nil
}
// Otherwise generate it...
var profiles []MigProfile
err := d.VisitMigProfiles(func(p MigProfile) error {
profiles = append(profiles, p)
@ -194,6 +315,9 @@ func (d *device) GetMigProfiles() ([]MigProfile, error) {
if err != nil {
return nil, err
}
// And cache it before returning
d.migProfiles = profiles
return profiles, nil
}
@ -321,6 +445,12 @@ func (d *devicelib) GetMigDevices() ([]MigDevice, error) {
// GetMigProfiles gets the set of unique MIG profiles across all top-level devices
func (d *devicelib) GetMigProfiles() ([]MigProfile, error) {
// Return the cached list if available
if d.migProfiles != nil {
return d.migProfiles, nil
}
// Otherwise generate it...
var profiles []MigProfile
err := d.VisitMigProfiles(func(p MigProfile) error {
profiles = append(profiles, p)
@ -329,11 +459,20 @@ func (d *devicelib) GetMigProfiles() ([]MigProfile, error) {
if err != nil {
return nil, err
}
// And cache it before returning
d.migProfiles = profiles
return profiles, nil
}
// nvmlLookupSymbol checks to see if the given symbol is present in the NVML library
func nvmlLookupSymbol(symbol string) error {
func (d *devicelib) nvmlLookupSymbol(symbol string) error {
// If devicelib is configured to not verify symbols, then we short-circuit here
if !*d.verifySymbols {
return nil
}
// Otherwise we lookup the provided symbol and verify it is available
lib := dl.New("libnvidia-ml.so.1", dl.RTLD_LAZY|dl.RTLD_GLOBAL)
if lib == nil {
return fmt.Errorf("error instantiating DynamicLibrary for NVML")

View File

@ -19,6 +19,7 @@ package device
import (
"fmt"
"math"
"sort"
"strconv"
"strings"
@ -36,6 +37,7 @@ type MigProfile interface {
String() string
GetInfo() MigProfileInfo
Equals(other MigProfile) bool
Matches(profile string) bool
}
// MigProfileInfo holds all info associated with a specific MIG profile
@ -55,11 +57,12 @@ var _ MigProfile = &MigProfileInfo{}
func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int, migMemorySizeMB, deviceMemorySizeBytes uint64) (MigProfile, error) {
giSlices := 0
switch giProfileID {
case nvml.GPU_INSTANCE_PROFILE_1_SLICE:
case nvml.GPU_INSTANCE_PROFILE_1_SLICE,
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2:
giSlices = 1
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1:
giSlices = 1
case nvml.GPU_INSTANCE_PROFILE_2_SLICE:
case nvml.GPU_INSTANCE_PROFILE_2_SLICE,
nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
giSlices = 2
case nvml.GPU_INSTANCE_PROFILE_3_SLICE:
giSlices = 3
@ -77,7 +80,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,
ciSlices := 0
switch ciProfileID {
case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE:
case nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1:
ciSlices = 1
case nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE:
ciSlices = 2
@ -97,7 +101,8 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,
var attrs []string
switch giProfileID {
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1:
case nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1:
attrs = append(attrs, AttributeMediaExtensions)
}
@ -114,90 +119,30 @@ func (d *devicelib) NewMigProfile(giProfileID, ciProfileID, ciEngProfileID int,
return p, nil
}
// AssertValidMigProfileFormat checks if the string is in the proper format to represent a MIG profile
func (d *devicelib) AssertValidMigProfileFormat(profile string) error {
_, _, _, _, err := parseMigProfile(profile)
return err
}
// ParseMigProfile converts a string representation of a MigProfile into an object
func (d *devicelib) ParseMigProfile(profile string) (MigProfile, error) {
var err error
var c, g, gb int
var attrs []string
if len(profile) == 0 {
return nil, fmt.Errorf("empty Profile string")
}
split := strings.SplitN(profile, "+", 2)
if len(split) == 2 {
attrs, err = parseMigProfileAttributes(split[1])
if err != nil {
return nil, fmt.Errorf("error parsing attributes following '+' in Profile string: %v", err)
}
}
c, g, gb, err = parseMigProfileFields(split[0])
profiles, err := d.GetMigProfiles()
if err != nil {
return nil, fmt.Errorf("error parsing '.' separated fields in Profile string: %v", err)
return nil, fmt.Errorf("error getting list of possible MIG profiles: %v", err)
}
p := &MigProfileInfo{
C: c,
G: g,
GB: gb,
Attributes: attrs,
}
switch c {
case 1:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
case 2:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
case 3:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
case 4:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
case 6:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
case 7:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
case 8:
p.CIProfileID = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
default:
return nil, fmt.Errorf("unknown Compute Instance slice size: %v", c)
}
switch g {
case 1:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE
case 2:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_2_SLICE
case 3:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_3_SLICE
case 4:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_4_SLICE
case 6:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_6_SLICE
case 7:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_7_SLICE
case 8:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_8_SLICE
default:
return nil, fmt.Errorf("unknown GPU Instance slice size: %v", g)
}
p.CIEngProfileID = nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED
for _, a := range attrs {
switch a {
case AttributeMediaExtensions:
p.GIProfileID = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1
default:
return nil, fmt.Errorf("unknown Profile attribute: %v", a)
for _, p := range profiles {
if p.Matches(profile) {
return p, nil
}
}
return p, nil
return nil, fmt.Errorf("unable to parse profile string into a valid profile")
}
// String returns the string representation of a Profile
func (p *MigProfileInfo) String() string {
func (p MigProfileInfo) String() string {
var suffix string
if len(p.Attributes) > 0 {
suffix = "+" + strings.Join(p.Attributes, ",")
@ -209,35 +154,89 @@ func (p *MigProfileInfo) String() string {
}
// GetInfo returns detailed info about a Profile
func (p *MigProfileInfo) GetInfo() MigProfileInfo {
return *p
func (p MigProfileInfo) GetInfo() MigProfileInfo {
return p
}
// Equals checks if two Profiles are identical or not
func (p *MigProfileInfo) Equals(other MigProfile) bool {
switch o := other.(type) {
case *MigProfileInfo:
if p.C != o.C {
return false
}
if p.G != o.G {
return false
}
if p.GB != o.GB {
return false
}
if p.GIProfileID != o.GIProfileID {
return false
}
if p.CIProfileID != o.CIProfileID {
return false
}
if p.CIEngProfileID != o.CIEngProfileID {
return false
}
return true
func (p MigProfileInfo) Equals(other MigProfile) bool {
o := other.GetInfo()
if p.C != o.C {
return false
}
return false
if p.G != o.G {
return false
}
if p.GB != o.GB {
return false
}
if p.GIProfileID != o.GIProfileID {
return false
}
if p.CIProfileID != o.CIProfileID {
return false
}
if p.CIEngProfileID != o.CIEngProfileID {
return false
}
return true
}
// Matches checks if a MigProfile matches the string passed in
func (p MigProfileInfo) Matches(profile string) bool {
c, g, gb, attrs, err := parseMigProfile(profile)
if err != nil {
return false
}
if c != p.C {
return false
}
if g != p.G {
return false
}
if gb != p.GB {
return false
}
if len(attrs) != len(p.Attributes) {
return false
}
sort.Strings(attrs)
sort.Strings(p.Attributes)
for i, a := range p.Attributes {
if a != attrs[i] {
return false
}
}
return true
}
func parseMigProfile(profile string) (int, int, int, []string, error) {
// If we are handed the empty string, we cannot parse it
if profile == "" {
return -1, -1, -1, nil, fmt.Errorf("profile is the empty string")
}
// Split by + to separate out attributes
split := strings.SplitN(profile, "+", 2)
// Check to make sure the c, g, and gb values match
c, g, gb, err := parseMigProfileFields(split[0])
if err != nil {
return -1, -1, -1, nil, fmt.Errorf("cannot parse fields of '%v': %v", profile, err)
}
// If we have no attributes we are done
if len(split) == 1 {
return c, g, gb, nil, nil
}
// Make sure we have the same set of attributes
attrs, err := parseMigProfileAttributes(split[1])
if err != nil {
return -1, -1, -1, nil, fmt.Errorf("cannot parse attributes of '%v': %v", profile, err)
}
return c, g, gb, attrs, nil
}
func parseMigProfileField(s string, field string) (int, error) {

View File

@ -49,6 +49,42 @@ const (
ERROR_UNKNOWN = Return(nvml.ERROR_UNKNOWN)
)
// Device architecture constants
const (
DEVICE_ARCH_KEPLER = nvml.DEVICE_ARCH_KEPLER
DEVICE_ARCH_MAXWELL = nvml.DEVICE_ARCH_MAXWELL
DEVICE_ARCH_PASCAL = nvml.DEVICE_ARCH_PASCAL
DEVICE_ARCH_VOLTA = nvml.DEVICE_ARCH_VOLTA
DEVICE_ARCH_TURING = nvml.DEVICE_ARCH_TURING
DEVICE_ARCH_AMPERE = nvml.DEVICE_ARCH_AMPERE
DEVICE_ARCH_ADA = nvml.DEVICE_ARCH_ADA
DEVICE_ARCH_HOPPER = nvml.DEVICE_ARCH_HOPPER
DEVICE_ARCH_UNKNOWN = nvml.DEVICE_ARCH_UNKNOWN
)
// Device brand constants
const (
BRAND_UNKNOWN = BrandType(nvml.BRAND_UNKNOWN)
BRAND_QUADRO = BrandType(nvml.BRAND_QUADRO)
BRAND_TESLA = BrandType(nvml.BRAND_TESLA)
BRAND_NVS = BrandType(nvml.BRAND_NVS)
BRAND_GRID = BrandType(nvml.BRAND_GRID)
BRAND_GEFORCE = BrandType(nvml.BRAND_GEFORCE)
BRAND_TITAN = BrandType(nvml.BRAND_TITAN)
BRAND_NVIDIA_VAPPS = BrandType(nvml.BRAND_NVIDIA_VAPPS)
BRAND_NVIDIA_VPC = BrandType(nvml.BRAND_NVIDIA_VPC)
BRAND_NVIDIA_VCS = BrandType(nvml.BRAND_NVIDIA_VCS)
BRAND_NVIDIA_VWS = BrandType(nvml.BRAND_NVIDIA_VWS)
BRAND_NVIDIA_CLOUD_GAMING = BrandType(nvml.BRAND_NVIDIA_CLOUD_GAMING)
BRAND_NVIDIA_VGAMING = BrandType(nvml.BRAND_NVIDIA_VGAMING)
BRAND_QUADRO_RTX = BrandType(nvml.BRAND_QUADRO_RTX)
BRAND_NVIDIA_RTX = BrandType(nvml.BRAND_NVIDIA_RTX)
BRAND_NVIDIA = BrandType(nvml.BRAND_NVIDIA)
BRAND_GEFORCE_RTX = BrandType(nvml.BRAND_GEFORCE_RTX)
BRAND_TITAN_RTX = BrandType(nvml.BRAND_TITAN_RTX)
BRAND_COUNT = BrandType(nvml.BRAND_COUNT)
)
// MIG Mode constants
const (
DEVICE_MIG_ENABLE = nvml.DEVICE_MIG_ENABLE
@ -65,19 +101,22 @@ const (
GPU_INSTANCE_PROFILE_7_SLICE = nvml.GPU_INSTANCE_PROFILE_7_SLICE
GPU_INSTANCE_PROFILE_8_SLICE = nvml.GPU_INSTANCE_PROFILE_8_SLICE
GPU_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1
GPU_INSTANCE_PROFILE_1_SLICE_REV2 = nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2
GPU_INSTANCE_PROFILE_2_SLICE_REV1 = nvml.GPU_INSTANCE_PROFILE_2_SLICE_REV1
GPU_INSTANCE_PROFILE_COUNT = nvml.GPU_INSTANCE_PROFILE_COUNT
)
// Compute Instance Profiles
const (
COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT
COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
COMPUTE_INSTANCE_PROFILE_6_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_6_SLICE
COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1 = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE_REV1
COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT
)
// Compute Instance Engine Profiles

View File

@ -150,12 +150,24 @@ func (d nvmlDevice) GetAttributes() (DeviceAttributes, Return) {
return DeviceAttributes(a), Return(r)
}
// GetName returns the device attributes for a MIG device
// GetName returns the product name of a Device
func (d nvmlDevice) GetName() (string, Return) {
n, r := nvml.Device(d).GetName()
return n, Return(r)
}
// GetBrand returns the brand of a Device
func (d nvmlDevice) GetBrand() (BrandType, Return) {
b, r := nvml.Device(d).GetBrand()
return BrandType(b), Return(r)
}
// GetArchitecture returns the architecture of a Device
func (d nvmlDevice) GetArchitecture() (DeviceArchitecture, Return) {
a, r := nvml.Device(d).GetArchitecture()
return DeviceArchitecture(a), Return(r)
}
// RegisterEvents registers the specified event set and type with the device
func (d nvmlDevice) RegisterEvents(EventTypes uint64, Set EventSet) Return {
return Return(nvml.Device(d).RegisterEvents(EventTypes, nvml.EventSet(Set)))

View File

@ -20,9 +20,15 @@ var _ Device = &DeviceMock{}
// CreateGpuInstanceWithPlacementFunc: func(gpuInstanceProfileInfo *GpuInstanceProfileInfo, gpuInstancePlacement *GpuInstancePlacement) (GpuInstance, Return) {
// panic("mock out the CreateGpuInstanceWithPlacement method")
// },
// GetArchitectureFunc: func() (DeviceArchitecture, Return) {
// panic("mock out the GetArchitecture method")
// },
// GetAttributesFunc: func() (DeviceAttributes, Return) {
// panic("mock out the GetAttributes method")
// },
// GetBrandFunc: func() (BrandType, Return) {
// panic("mock out the GetBrand method")
// },
// GetComputeInstanceIdFunc: func() (int, Return) {
// panic("mock out the GetComputeInstanceId method")
// },
@ -96,9 +102,15 @@ type DeviceMock struct {
// CreateGpuInstanceWithPlacementFunc mocks the CreateGpuInstanceWithPlacement method.
CreateGpuInstanceWithPlacementFunc func(gpuInstanceProfileInfo *GpuInstanceProfileInfo, gpuInstancePlacement *GpuInstancePlacement) (GpuInstance, Return)
// GetArchitectureFunc mocks the GetArchitecture method.
GetArchitectureFunc func() (DeviceArchitecture, Return)
// GetAttributesFunc mocks the GetAttributes method.
GetAttributesFunc func() (DeviceAttributes, Return)
// GetBrandFunc mocks the GetBrand method.
GetBrandFunc func() (BrandType, Return)
// GetComputeInstanceIdFunc mocks the GetComputeInstanceId method.
GetComputeInstanceIdFunc func() (int, Return)
@ -171,9 +183,15 @@ type DeviceMock struct {
// GpuInstancePlacement is the gpuInstancePlacement argument value.
GpuInstancePlacement *GpuInstancePlacement
}
// GetArchitecture holds details about calls to the GetArchitecture method.
GetArchitecture []struct {
}
// GetAttributes holds details about calls to the GetAttributes method.
GetAttributes []struct {
}
// GetBrand holds details about calls to the GetBrand method.
GetBrand []struct {
}
// GetComputeInstanceId holds details about calls to the GetComputeInstanceId method.
GetComputeInstanceId []struct {
}
@ -255,7 +273,9 @@ type DeviceMock struct {
}
}
lockCreateGpuInstanceWithPlacement sync.RWMutex
lockGetArchitecture sync.RWMutex
lockGetAttributes sync.RWMutex
lockGetBrand sync.RWMutex
lockGetComputeInstanceId sync.RWMutex
lockGetCudaComputeCapability sync.RWMutex
lockGetDeviceHandleFromMigDeviceHandle sync.RWMutex
@ -315,6 +335,33 @@ func (mock *DeviceMock) CreateGpuInstanceWithPlacementCalls() []struct {
return calls
}
// GetArchitecture calls GetArchitectureFunc.
func (mock *DeviceMock) GetArchitecture() (DeviceArchitecture, Return) {
if mock.GetArchitectureFunc == nil {
panic("DeviceMock.GetArchitectureFunc: method is nil but Device.GetArchitecture was just called")
}
callInfo := struct {
}{}
mock.lockGetArchitecture.Lock()
mock.calls.GetArchitecture = append(mock.calls.GetArchitecture, callInfo)
mock.lockGetArchitecture.Unlock()
return mock.GetArchitectureFunc()
}
// GetArchitectureCalls gets all the calls that were made to GetArchitecture.
// Check the length with:
//
// len(mockedDevice.GetArchitectureCalls())
func (mock *DeviceMock) GetArchitectureCalls() []struct {
} {
var calls []struct {
}
mock.lockGetArchitecture.RLock()
calls = mock.calls.GetArchitecture
mock.lockGetArchitecture.RUnlock()
return calls
}
// GetAttributes calls GetAttributesFunc.
func (mock *DeviceMock) GetAttributes() (DeviceAttributes, Return) {
if mock.GetAttributesFunc == nil {
@ -342,6 +389,33 @@ func (mock *DeviceMock) GetAttributesCalls() []struct {
return calls
}
// GetBrand calls GetBrandFunc.
func (mock *DeviceMock) GetBrand() (BrandType, Return) {
if mock.GetBrandFunc == nil {
panic("DeviceMock.GetBrandFunc: method is nil but Device.GetBrand was just called")
}
callInfo := struct {
}{}
mock.lockGetBrand.Lock()
mock.calls.GetBrand = append(mock.calls.GetBrand, callInfo)
mock.lockGetBrand.Unlock()
return mock.GetBrandFunc()
}
// GetBrandCalls gets all the calls that were made to GetBrand.
// Check the length with:
//
// len(mockedDevice.GetBrandCalls())
func (mock *DeviceMock) GetBrandCalls() []struct {
} {
var calls []struct {
}
mock.lockGetBrand.RLock()
calls = mock.calls.GetBrand
mock.lockGetBrand.RUnlock()
return calls
}
// GetComputeInstanceId calls GetComputeInstanceIdFunc.
func (mock *DeviceMock) GetComputeInstanceId() (int, Return) {
if mock.GetComputeInstanceIdFunc == nil {

View File

@ -40,7 +40,9 @@ type Interface interface {
//go:generate moq -out device_mock.go . Device
type Device interface {
CreateGpuInstanceWithPlacement(*GpuInstanceProfileInfo, *GpuInstancePlacement) (GpuInstance, Return)
GetArchitecture() (DeviceArchitecture, Return)
GetAttributes() (DeviceAttributes, Return)
GetBrand() (BrandType, Return)
GetComputeInstanceId() (int, Return)
GetCudaComputeCapability() (int, int, Return)
GetDeviceHandleFromMigDeviceHandle() (Device, Return)
@ -136,3 +138,9 @@ type ComputeInstancePlacement nvml.ComputeInstancePlacement
// DeviceAttributes stores information about MIG devices
type DeviceAttributes nvml.DeviceAttributes
// DeviceArchitecture represents the hardware architecture of a GPU device
type DeviceArchitecture nvml.DeviceArchitecture
// BrandType represents the brand of a GPU device
type BrandType nvml.BrandType

6
vendor/modules.txt vendored
View File

@ -2,7 +2,7 @@
## explicit; go 1.16
github.com/BurntSushi/toml
github.com/BurntSushi/toml/internal
# github.com/NVIDIA/go-nvml v0.12.0-0
# github.com/NVIDIA/go-nvml v0.12.0-1
## explicit; go 1.15
github.com/NVIDIA/go-nvml/pkg/dl
github.com/NVIDIA/go-nvml/pkg/nvml
@ -62,8 +62,8 @@ github.com/syndtr/gocapability/capability
github.com/urfave/cli/v2
# github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb
## explicit
# gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230209143738-95328d8c4438
## explicit; go 1.16
# gitlab.com/nvidia/cloud-native/go-nvlib v0.0.0-20230522132528-649703f6b386
## explicit; go 1.20
gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device
gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info
gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml