From 631bde023f580dbb3273a3597bf9626f8aea725e Mon Sep 17 00:00:00 2001
From: Kevin Klues <kklues@nvidia.com>
Date: Fri, 24 Mar 2023 14:23:26 +0000
Subject: [PATCH] Add ability to query device architecture and CUDA compute
 capability

Signed-off-by: Kevin Klues <kklues@nvidia.com>
---
 pkg/nvlib/device/device.go | 40 ++++++++++++++++++++++++++++++++++++++
 pkg/nvml/consts.go         | 13 +++++++++++++
 pkg/nvml/device.go         |  8 +++++++-
 pkg/nvml/device_mock.go    | 37 +++++++++++++++++++++++++++++++++++
 pkg/nvml/types.go          |  4 ++++
 5 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/pkg/nvlib/device/device.go b/pkg/nvlib/device/device.go
index f6f3caa..40325dc 100644
--- a/pkg/nvlib/device/device.go
+++ b/pkg/nvlib/device/device.go
@@ -26,6 +26,8 @@ import (
 // Device defines the set of extended functions associated with a device.Device
 type Device interface {
 	nvml.Device
+	GetArchitectureAsString() (string, error)
+	GetCudaComputeCapabilityAsString() (string, error)
 	GetMigDevices() ([]MigDevice, error)
 	GetMigProfiles() ([]MigProfile, error)
 	IsMigCapable() (bool, error)
@@ -61,6 +63,44 @@ func (d *devicelib) newDevice(dev nvml.Device) (*device, error) {
 	return &device{dev, d, nil}, nil
 }
 
+// GetArchitectureAsString returns the Device architecture as a name such as "Ampere"; a non-SUCCESS return from GetArchitecture or an unlisted value yields an error
+func (d *device) GetArchitectureAsString() (string, error) {
+	arch, ret := d.GetArchitecture()
+	if ret != nvml.SUCCESS {
+		return "", fmt.Errorf("error getting device architecture: %v", ret)
+	}
+	switch arch { // one case per architecture constant this function knows how to name
+	case nvml.DEVICE_ARCH_KEPLER:
+		return "Kepler", nil
+	case nvml.DEVICE_ARCH_MAXWELL:
+		return "Maxwell", nil
+	case nvml.DEVICE_ARCH_PASCAL:
+		return "Pascal", nil
+	case nvml.DEVICE_ARCH_VOLTA:
+		return "Volta", nil
+	case nvml.DEVICE_ARCH_TURING:
+		return "Turing", nil
+	case nvml.DEVICE_ARCH_AMPERE:
+		return "Ampere", nil
+	case nvml.DEVICE_ARCH_ADA:
+		return "Ada", nil
+	case nvml.DEVICE_ARCH_HOPPER:
+		return "Hopper", nil
+	case nvml.DEVICE_ARCH_UNKNOWN:
+		return "Unknown", nil // DEVICE_ARCH_UNKNOWN is reported as a name, not as an error
+	}
+	return "", fmt.Errorf("error interpreting device architecture as string: %v", arch) // reached only when no case above matched
+}
+
+// GetCudaComputeCapabilityAsString returns the Device's CUDA compute capability formatted as "<major>.<minor>", or an error on a non-SUCCESS return
+func (d *device) GetCudaComputeCapabilityAsString() (string, error) {
+	major, minor, ret := d.GetCudaComputeCapability()
+	if ret != nvml.SUCCESS {
+		return "", fmt.Errorf("error getting CUDA compute capability: %v", ret)
+	}
+	return fmt.Sprintf("%d.%d", major, minor), nil
+}
+
 // IsMigCapable checks if a device is capable of having MIG paprtitions created on it
 func (d *device) IsMigCapable() (bool, error) {
 	err := d.lib.nvmlLookupSymbol("nvmlDeviceGetMigMode")
diff --git a/pkg/nvml/consts.go b/pkg/nvml/consts.go
index 8e61416..6cccee7 100644
--- a/pkg/nvml/consts.go
+++ b/pkg/nvml/consts.go
@@ -49,6 +49,19 @@ const (
 	ERROR_UNKNOWN                 = Return(nvml.ERROR_UNKNOWN)
 )
 
+// Device architecture constants, aliased one-to-one from the imported nvml package
+const (
+	DEVICE_ARCH_KEPLER  = nvml.DEVICE_ARCH_KEPLER
+	DEVICE_ARCH_MAXWELL = nvml.DEVICE_ARCH_MAXWELL
+	DEVICE_ARCH_PASCAL  = nvml.DEVICE_ARCH_PASCAL
+	DEVICE_ARCH_VOLTA   = nvml.DEVICE_ARCH_VOLTA
+	DEVICE_ARCH_TURING  = nvml.DEVICE_ARCH_TURING
+	DEVICE_ARCH_AMPERE  = nvml.DEVICE_ARCH_AMPERE
+	DEVICE_ARCH_ADA     = nvml.DEVICE_ARCH_ADA
+	DEVICE_ARCH_HOPPER  = nvml.DEVICE_ARCH_HOPPER
+	DEVICE_ARCH_UNKNOWN = nvml.DEVICE_ARCH_UNKNOWN
+)
+
 // MIG Mode constants
 const (
 	DEVICE_MIG_ENABLE  = nvml.DEVICE_MIG_ENABLE
diff --git a/pkg/nvml/device.go b/pkg/nvml/device.go
index faaac25..ddfe6ed 100644
--- a/pkg/nvml/device.go
+++ b/pkg/nvml/device.go
@@ -150,12 +150,18 @@ func (d nvmlDevice) GetAttributes() (DeviceAttributes, Return) {
 	return DeviceAttributes(a), Return(r)
 }
 
-// GetName returns the device attributes for a MIG device
+// GetName returns the product name of a Device
 func (d nvmlDevice) GetName() (string, Return) {
 	n, r := nvml.Device(d).GetName()
 	return n, Return(r)
 }
 
+// GetArchitecture returns the architecture of a Device, converting the wrapped nvml call's results to this package's DeviceArchitecture and Return types
+func (d nvmlDevice) GetArchitecture() (DeviceArchitecture, Return) {
+	a, r := nvml.Device(d).GetArchitecture()
+	return DeviceArchitecture(a), Return(r)
+}
+
 // RegisterEvents registers the specified event set and type with the device
 func (d nvmlDevice) RegisterEvents(EventTypes uint64, Set EventSet) Return {
 	return Return(nvml.Device(d).RegisterEvents(EventTypes, nvml.EventSet(Set)))
diff --git a/pkg/nvml/device_mock.go b/pkg/nvml/device_mock.go
index 0093e4e..636e01c 100644
--- a/pkg/nvml/device_mock.go
+++ b/pkg/nvml/device_mock.go
@@ -20,6 +20,9 @@ var _ Device = &DeviceMock{}
 //			CreateGpuInstanceWithPlacementFunc: func(gpuInstanceProfileInfo *GpuInstanceProfileInfo, gpuInstancePlacement *GpuInstancePlacement) (GpuInstance, Return) {
 //				panic("mock out the CreateGpuInstanceWithPlacement method")
 //			},
+//			GetArchitectureFunc: func() (DeviceArchitecture, Return) {
+//				panic("mock out the GetArchitecture method")
+//			},
 //			GetAttributesFunc: func() (DeviceAttributes, Return) {
 //				panic("mock out the GetAttributes method")
 //			},
@@ -96,6 +99,9 @@ type DeviceMock struct {
 	// CreateGpuInstanceWithPlacementFunc mocks the CreateGpuInstanceWithPlacement method.
 	CreateGpuInstanceWithPlacementFunc func(gpuInstanceProfileInfo *GpuInstanceProfileInfo, gpuInstancePlacement *GpuInstancePlacement) (GpuInstance, Return)
 
+	// GetArchitectureFunc mocks the GetArchitecture method.
+	GetArchitectureFunc func() (DeviceArchitecture, Return)
+
 	// GetAttributesFunc mocks the GetAttributes method.
 	GetAttributesFunc func() (DeviceAttributes, Return)
 
@@ -171,6 +177,9 @@ type DeviceMock struct {
 			// GpuInstancePlacement is the gpuInstancePlacement argument value.
 			GpuInstancePlacement *GpuInstancePlacement
 		}
+		// GetArchitecture holds details about calls to the GetArchitecture method.
+		GetArchitecture []struct {
+		}
 		// GetAttributes holds details about calls to the GetAttributes method.
 		GetAttributes []struct {
 		}
@@ -255,6 +264,7 @@ type DeviceMock struct {
 		}
 	}
 	lockCreateGpuInstanceWithPlacement     sync.RWMutex
+	lockGetArchitecture                    sync.RWMutex
 	lockGetAttributes                      sync.RWMutex
 	lockGetComputeInstanceId               sync.RWMutex
 	lockGetCudaComputeCapability           sync.RWMutex
@@ -315,6 +325,33 @@ func (mock *DeviceMock) CreateGpuInstanceWithPlacementCalls() []struct {
 	return calls
 }
 
+// GetArchitecture records the call and then calls GetArchitectureFunc; it panics if GetArchitectureFunc is nil.
+func (mock *DeviceMock) GetArchitecture() (DeviceArchitecture, Return) {
+	if mock.GetArchitectureFunc == nil {
+		panic("DeviceMock.GetArchitectureFunc: method is nil but Device.GetArchitecture was just called")
+	}
+	callInfo := struct {
+	}{}
+	mock.lockGetArchitecture.Lock()
+	mock.calls.GetArchitecture = append(mock.calls.GetArchitecture, callInfo)
+	mock.lockGetArchitecture.Unlock() // released before invoking the user-supplied func below
+	return mock.GetArchitectureFunc()
+}
+
+// GetArchitectureCalls gets all the calls that were made to GetArchitecture.
+// Check the length with:
+//
+//	len(mockedDevice.GetArchitectureCalls())
+func (mock *DeviceMock) GetArchitectureCalls() []struct {
+} {
+	var calls []struct {
+	}
+	mock.lockGetArchitecture.RLock()
+	calls = mock.calls.GetArchitecture // the recorded slice itself is returned, not a copy
+	mock.lockGetArchitecture.RUnlock()
+	return calls
+}
+
 // GetAttributes calls GetAttributesFunc.
 func (mock *DeviceMock) GetAttributes() (DeviceAttributes, Return) {
 	if mock.GetAttributesFunc == nil {
diff --git a/pkg/nvml/types.go b/pkg/nvml/types.go
index b1c97c0..c360a47 100644
--- a/pkg/nvml/types.go
+++ b/pkg/nvml/types.go
@@ -40,6 +40,7 @@ type Interface interface {
 //go:generate moq -out device_mock.go . Device
 type Device interface {
 	CreateGpuInstanceWithPlacement(*GpuInstanceProfileInfo, *GpuInstancePlacement) (GpuInstance, Return)
+	GetArchitecture() (DeviceArchitecture, Return)
 	GetAttributes() (DeviceAttributes, Return)
 	GetComputeInstanceId() (int, Return)
 	GetCudaComputeCapability() (int, int, Return)
@@ -136,3 +137,6 @@ type ComputeInstancePlacement nvml.ComputeInstancePlacement
 
 // DeviceAttributes stores information about MIG devices
 type DeviceAttributes nvml.DeviceAttributes
+
+// DeviceArchitecture represents the hardware architecture of a GPU device; it is defined on the imported nvml package's DeviceArchitecture type
+type DeviceArchitecture nvml.DeviceArchitecture