From 631bde023f580dbb3273a3597bf9626f8aea725e Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Fri, 24 Mar 2023 14:23:26 +0000 Subject: [PATCH] Add ability to query device architecture and cuda compute capability Signed-off-by: Kevin Klues --- pkg/nvlib/device/device.go | 40 ++++++++++++++++++++++++++++++++++++++ pkg/nvml/consts.go | 13 +++++++++++++ pkg/nvml/device.go | 8 +++++++- pkg/nvml/device_mock.go | 37 +++++++++++++++++++++++++++++++++++ pkg/nvml/types.go | 4 ++++ 5 files changed, 101 insertions(+), 1 deletion(-) diff --git a/pkg/nvlib/device/device.go b/pkg/nvlib/device/device.go index f6f3caa..40325dc 100644 --- a/pkg/nvlib/device/device.go +++ b/pkg/nvlib/device/device.go @@ -26,6 +26,8 @@ import ( // Device defines the set of extended functions associated with a device.Device type Device interface { nvml.Device + GetArchitectureAsString() (string, error) + GetCudaComputeCapabilityAsString() (string, error) GetMigDevices() ([]MigDevice, error) GetMigProfiles() ([]MigProfile, error) IsMigCapable() (bool, error) @@ -61,6 +63,44 @@ func (d *devicelib) newDevice(dev nvml.Device) (*device, error) { return &device{dev, d, nil}, nil } +// GetArchitectureAsString returns the Device architecture as a string +func (d *device) GetArchitectureAsString() (string, error) { + arch, ret := d.GetArchitecture() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("error getting device architecture: %v", ret) + } + switch arch { + case nvml.DEVICE_ARCH_KEPLER: + return "Kepler", nil + case nvml.DEVICE_ARCH_MAXWELL: + return "Maxwell", nil + case nvml.DEVICE_ARCH_PASCAL: + return "Pascal", nil + case nvml.DEVICE_ARCH_VOLTA: + return "Volta", nil + case nvml.DEVICE_ARCH_TURING: + return "Turing", nil + case nvml.DEVICE_ARCH_AMPERE: + return "Ampere", nil + case nvml.DEVICE_ARCH_ADA: + return "Ada", nil + case nvml.DEVICE_ARCH_HOPPER: + return "Hopper", nil + case nvml.DEVICE_ARCH_UNKNOWN: + return "Unknown", nil + } + return "", fmt.Errorf("error interpreting device 
architecture as string: %v", arch) +} + +// GetCudaComputeCapabilityAsString returns the Device's CUDA compute capability as a version string +func (d *device) GetCudaComputeCapabilityAsString() (string, error) { + major, minor, ret := d.GetCudaComputeCapability() + if ret != nvml.SUCCESS { + return "", fmt.Errorf("error getting CUDA compute capability: %v", ret) + } + return fmt.Sprintf("%d.%d", major, minor), nil +} + // IsMigCapable checks if a device is capable of having MIG paprtitions created on it func (d *device) IsMigCapable() (bool, error) { err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode") diff --git a/pkg/nvml/consts.go b/pkg/nvml/consts.go index 8e61416..6cccee7 100644 --- a/pkg/nvml/consts.go +++ b/pkg/nvml/consts.go @@ -49,6 +49,19 @@ const ( ERROR_UNKNOWN = Return(nvml.ERROR_UNKNOWN) ) +// Device architecture constants +const ( + DEVICE_ARCH_KEPLER = nvml.DEVICE_ARCH_KEPLER + DEVICE_ARCH_MAXWELL = nvml.DEVICE_ARCH_MAXWELL + DEVICE_ARCH_PASCAL = nvml.DEVICE_ARCH_PASCAL + DEVICE_ARCH_VOLTA = nvml.DEVICE_ARCH_VOLTA + DEVICE_ARCH_TURING = nvml.DEVICE_ARCH_TURING + DEVICE_ARCH_AMPERE = nvml.DEVICE_ARCH_AMPERE + DEVICE_ARCH_ADA = nvml.DEVICE_ARCH_ADA + DEVICE_ARCH_HOPPER = nvml.DEVICE_ARCH_HOPPER + DEVICE_ARCH_UNKNOWN = nvml.DEVICE_ARCH_UNKNOWN +) + // MIG Mode constants const ( DEVICE_MIG_ENABLE = nvml.DEVICE_MIG_ENABLE diff --git a/pkg/nvml/device.go b/pkg/nvml/device.go index faaac25..ddfe6ed 100644 --- a/pkg/nvml/device.go +++ b/pkg/nvml/device.go @@ -150,12 +150,18 @@ func (d nvmlDevice) GetAttributes() (DeviceAttributes, Return) { return DeviceAttributes(a), Return(r) } -// GetName returns the device attributes for a MIG device +// GetName returns the product name of a Device func (d nvmlDevice) GetName() (string, Return) { n, r := nvml.Device(d).GetName() return n, Return(r) } +// GetArchitecture returns the architecture of a Device +func (d nvmlDevice) GetArchitecture() (DeviceArchitecture, Return) { + a, r := nvml.Device(d).GetArchitecture() 
+ return DeviceArchitecture(a), Return(r) +} + // RegisterEvents registers the specified event set and type with the device func (d nvmlDevice) RegisterEvents(EventTypes uint64, Set EventSet) Return { return Return(nvml.Device(d).RegisterEvents(EventTypes, nvml.EventSet(Set))) diff --git a/pkg/nvml/device_mock.go b/pkg/nvml/device_mock.go index 0093e4e..636e01c 100644 --- a/pkg/nvml/device_mock.go +++ b/pkg/nvml/device_mock.go @@ -20,6 +20,9 @@ var _ Device = &DeviceMock{} // CreateGpuInstanceWithPlacementFunc: func(gpuInstanceProfileInfo *GpuInstanceProfileInfo, gpuInstancePlacement *GpuInstancePlacement) (GpuInstance, Return) { // panic("mock out the CreateGpuInstanceWithPlacement method") // }, +// GetArchitectureFunc: func() (DeviceArchitecture, Return) { +// panic("mock out the GetArchitecture method") +// }, // GetAttributesFunc: func() (DeviceAttributes, Return) { // panic("mock out the GetAttributes method") // }, @@ -96,6 +99,9 @@ type DeviceMock struct { // CreateGpuInstanceWithPlacementFunc mocks the CreateGpuInstanceWithPlacement method. CreateGpuInstanceWithPlacementFunc func(gpuInstanceProfileInfo *GpuInstanceProfileInfo, gpuInstancePlacement *GpuInstancePlacement) (GpuInstance, Return) + // GetArchitectureFunc mocks the GetArchitecture method. + GetArchitectureFunc func() (DeviceArchitecture, Return) + // GetAttributesFunc mocks the GetAttributes method. GetAttributesFunc func() (DeviceAttributes, Return) @@ -171,6 +177,9 @@ type DeviceMock struct { // GpuInstancePlacement is the gpuInstancePlacement argument value. GpuInstancePlacement *GpuInstancePlacement } + // GetArchitecture holds details about calls to the GetArchitecture method. + GetArchitecture []struct { + } // GetAttributes holds details about calls to the GetAttributes method. 
GetAttributes []struct { } @@ -255,6 +264,7 @@ type DeviceMock struct { } } lockCreateGpuInstanceWithPlacement sync.RWMutex + lockGetArchitecture sync.RWMutex lockGetAttributes sync.RWMutex lockGetComputeInstanceId sync.RWMutex lockGetCudaComputeCapability sync.RWMutex @@ -315,6 +325,33 @@ func (mock *DeviceMock) CreateGpuInstanceWithPlacementCalls() []struct { return calls } +// GetArchitecture calls GetArchitectureFunc. +func (mock *DeviceMock) GetArchitecture() (DeviceArchitecture, Return) { + if mock.GetArchitectureFunc == nil { + panic("DeviceMock.GetArchitectureFunc: method is nil but Device.GetArchitecture was just called") + } + callInfo := struct { + }{} + mock.lockGetArchitecture.Lock() + mock.calls.GetArchitecture = append(mock.calls.GetArchitecture, callInfo) + mock.lockGetArchitecture.Unlock() + return mock.GetArchitectureFunc() +} + +// GetArchitectureCalls gets all the calls that were made to GetArchitecture. +// Check the length with: +// +// len(mockedDevice.GetArchitectureCalls()) +func (mock *DeviceMock) GetArchitectureCalls() []struct { +} { + var calls []struct { + } + mock.lockGetArchitecture.RLock() + calls = mock.calls.GetArchitecture + mock.lockGetArchitecture.RUnlock() + return calls +} + // GetAttributes calls GetAttributesFunc. func (mock *DeviceMock) GetAttributes() (DeviceAttributes, Return) { if mock.GetAttributesFunc == nil { diff --git a/pkg/nvml/types.go b/pkg/nvml/types.go index b1c97c0..c360a47 100644 --- a/pkg/nvml/types.go +++ b/pkg/nvml/types.go @@ -40,6 +40,7 @@ type Interface interface { //go:generate moq -out device_mock.go . 
Device type Device interface { CreateGpuInstanceWithPlacement(*GpuInstanceProfileInfo, *GpuInstancePlacement) (GpuInstance, Return) + GetArchitecture() (DeviceArchitecture, Return) GetAttributes() (DeviceAttributes, Return) GetComputeInstanceId() (int, Return) GetCudaComputeCapability() (int, int, Return) @@ -136,3 +137,6 @@ type ComputeInstancePlacement nvml.ComputeInstancePlacement // DeviceAttributes stores information about MIG devices type DeviceAttributes nvml.DeviceAttributes + +// DeviceArchitecture represents the hardware architecture of a GPU device +type DeviceArchitecture nvml.DeviceArchitecture