Bump go-nvlib to v0.2.0 and go-nvml v0.12.0-3

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2024-03-15 15:09:12 +02:00
parent 2733661125
commit b9abb44613
18 changed files with 501 additions and 47 deletions

View File

@@ -152,3 +152,12 @@ const (
FEATURE_DISABLED = EnableState(nvml.FEATURE_DISABLED)
FEATURE_ENABLED = EnableState(nvml.FEATURE_ENABLED)
)
// Compute mode constants.
//
// Each constant is the nvml package value of the same name, converted to this
// package's ComputeMode type.
const (
COMPUTEMODE_DEFAULT = ComputeMode(nvml.COMPUTEMODE_DEFAULT)
COMPUTEMODE_EXCLUSIVE_THREAD = ComputeMode(nvml.COMPUTEMODE_EXCLUSIVE_THREAD)
COMPUTEMODE_PROHIBITED = ComputeMode(nvml.COMPUTEMODE_PROHIBITED)
COMPUTEMODE_EXCLUSIVE_PROCESS = ComputeMode(nvml.COMPUTEMODE_EXCLUSIVE_PROCESS)
COMPUTEMODE_COUNT = ComputeMode(nvml.COMPUTEMODE_COUNT)
)

View File

@@ -22,6 +22,11 @@ type nvmlDevice nvml.Device
var _ Device = (*nvmlDevice)(nil)
// nvmlDeviceHandle exposes the wrapped device as a *nvml.Device.
// The receiver is taken by value, so the returned pointer refers to a copy of
// d rather than to the caller's variable.
func (d nvmlDevice) nvmlDeviceHandle() *nvml.Device {
	handle := nvml.Device(d)
	return &handle
}
// GetIndex returns the index of a Device
func (d nvmlDevice) GetIndex() (int, Return) {
i, r := nvml.Device(d).GetIndex()
@@ -181,12 +186,12 @@ func (d nvmlDevice) GetSupportedEventTypes() (uint64, Return) {
// GetTopologyCommonAncestor retrieves the common ancestor for two devices.
func (d nvmlDevice) GetTopologyCommonAncestor(o Device) (GpuTopologyLevel, Return) {
other, ok := o.(nvmlDevice)
if !ok {
other := o.nvmlDeviceHandle()
if other == nil {
return 0, ERROR_INVALID_ARGUMENT
}
l, r := nvml.Device(d).GetTopologyCommonAncestor(nvml.Device(other))
l, r := nvml.Device(d).GetTopologyCommonAncestor(*other)
return GpuTopologyLevel(l), Return(r)
}
@@ -202,3 +207,9 @@ func (d nvmlDevice) GetNvLinkRemotePciInfo(link int) (PciInfo, Return) {
p, r := nvml.Device(d).GetNvLinkRemotePciInfo(link)
return PciInfo(p), Return(r)
}
// SetComputeMode applies the requested compute mode to the device.
// The mode is converted to nvml.ComputeMode, forwarded to the underlying
// nvml.Device, and the result is converted back to this package's Return type.
func (d nvmlDevice) SetComputeMode(mode ComputeMode) Return {
	return Return(nvml.Device(d).SetComputeMode(nvml.ComputeMode(mode)))
}

View File

@@ -4,6 +4,7 @@
package nvml
import (
"github.com/NVIDIA/go-nvml/pkg/nvml"
"sync"
)
@@ -98,9 +99,15 @@ var _ Device = &DeviceMock{}
// RegisterEventsFunc: func(v uint64, eventSet EventSet) Return {
// panic("mock out the RegisterEvents method")
// },
// SetComputeModeFunc: func(computeMode ComputeMode) Return {
// panic("mock out the SetComputeMode method")
// },
// SetMigModeFunc: func(Mode int) (Return, Return) {
// panic("mock out the SetMigMode method")
// },
// nvmlDeviceHandleFunc: func() *nvml.Device {
// panic("mock out the nvmlDeviceHandle method")
// },
// }
//
// // use mockedDevice in code that requires Device
@@ -189,9 +196,15 @@ type DeviceMock struct {
// RegisterEventsFunc mocks the RegisterEvents method.
RegisterEventsFunc func(v uint64, eventSet EventSet) Return
// SetComputeModeFunc mocks the SetComputeMode method.
SetComputeModeFunc func(computeMode ComputeMode) Return
// SetMigModeFunc mocks the SetMigMode method.
SetMigModeFunc func(Mode int) (Return, Return)
// nvmlDeviceHandleFunc mocks the nvmlDeviceHandle method.
nvmlDeviceHandleFunc func() *nvml.Device
// calls tracks calls to the methods.
calls struct {
// CreateGpuInstanceWithPlacement holds details about calls to the CreateGpuInstanceWithPlacement method.
@@ -299,11 +312,19 @@ type DeviceMock struct {
// EventSet is the eventSet argument value.
EventSet EventSet
}
// SetComputeMode holds details about calls to the SetComputeMode method.
SetComputeMode []struct {
// ComputeMode is the computeMode argument value.
ComputeMode ComputeMode
}
// SetMigMode holds details about calls to the SetMigMode method.
SetMigMode []struct {
// Mode is the Mode argument value.
Mode int
}
// nvmlDeviceHandle holds details about calls to the nvmlDeviceHandle method.
nvmlDeviceHandle []struct {
}
}
lockCreateGpuInstanceWithPlacement sync.RWMutex
lockGetArchitecture sync.RWMutex
@@ -332,7 +353,9 @@ type DeviceMock struct {
lockGetUUID sync.RWMutex
lockIsMigDeviceHandle sync.RWMutex
lockRegisterEvents sync.RWMutex
lockSetComputeMode sync.RWMutex
lockSetMigMode sync.RWMutex
locknvmlDeviceHandle sync.RWMutex
}
// CreateGpuInstanceWithPlacement calls CreateGpuInstanceWithPlacementFunc.
@@ -1122,6 +1145,38 @@ func (mock *DeviceMock) RegisterEventsCalls() []struct {
return calls
}
// SetComputeMode records the call and then delegates to SetComputeModeFunc.
// It panics if SetComputeModeFunc has not been set.
func (mock *DeviceMock) SetComputeMode(computeMode ComputeMode) Return {
	if mock.SetComputeModeFunc == nil {
		panic("DeviceMock.SetComputeModeFunc: method is nil but Device.SetComputeMode was just called")
	}
	// Only the bookkeeping is guarded; the lock is released before the
	// user-supplied function runs.
	mock.lockSetComputeMode.Lock()
	mock.calls.SetComputeMode = append(mock.calls.SetComputeMode, struct {
		ComputeMode ComputeMode
	}{
		ComputeMode: computeMode,
	})
	mock.lockSetComputeMode.Unlock()
	return mock.SetComputeModeFunc(computeMode)
}
// SetComputeModeCalls returns the recorded calls to SetComputeMode.
// Check the length with:
//
//	len(mockedDevice.SetComputeModeCalls())
func (mock *DeviceMock) SetComputeModeCalls() []struct {
	ComputeMode ComputeMode
} {
	mock.lockSetComputeMode.RLock()
	defer mock.lockSetComputeMode.RUnlock()
	return mock.calls.SetComputeMode
}
// SetMigMode calls SetMigModeFunc.
func (mock *DeviceMock) SetMigMode(Mode int) (Return, Return) {
if mock.SetMigModeFunc == nil {
@@ -1153,3 +1208,30 @@ func (mock *DeviceMock) SetMigModeCalls() []struct {
mock.lockSetMigMode.RUnlock()
return calls
}
// nvmlDeviceHandle records the call and then delegates to nvmlDeviceHandleFunc.
// It panics if nvmlDeviceHandleFunc has not been set.
func (mock *DeviceMock) nvmlDeviceHandle() *nvml.Device {
	if mock.nvmlDeviceHandleFunc == nil {
		panic("DeviceMock.nvmlDeviceHandleFunc: method is nil but Device.nvmlDeviceHandle was just called")
	}
	// Only the bookkeeping is guarded; the lock is released before the
	// user-supplied function runs.
	mock.locknvmlDeviceHandle.Lock()
	mock.calls.nvmlDeviceHandle = append(mock.calls.nvmlDeviceHandle, struct{}{})
	mock.locknvmlDeviceHandle.Unlock()
	return mock.nvmlDeviceHandleFunc()
}
// nvmlDeviceHandleCalls returns the recorded calls to nvmlDeviceHandle.
// Check the length with:
//
//	len(mockedDevice.nvmlDeviceHandleCalls())
func (mock *DeviceMock) nvmlDeviceHandleCalls() []struct {
} {
	mock.locknvmlDeviceHandle.RLock()
	defer mock.locknvmlDeviceHandle.RUnlock()
	return mock.calls.nvmlDeviceHandle
}

View File

@@ -67,7 +67,10 @@ type Device interface {
GetUUID() (string, Return)
IsMigDeviceHandle() (bool, Return)
RegisterEvents(uint64, EventSet) Return
SetComputeMode(ComputeMode) Return
SetMigMode(Mode int) (Return, Return)
// nvmlDeviceHandle returns a pointer to the underlying NVML device.
nvmlDeviceHandle() *nvml.Device
}
// GpuInstance defines the functions implemented by a GpuInstance
@@ -154,3 +157,6 @@ type GpuTopologyLevel nvml.GpuTopologyLevel
// EnableState represents a generic enable/disable enum
type EnableState nvml.EnableState
// ComputeMode represents the compute mode for a device.
// It is declared from nvml.ComputeMode, so values convert directly between
// the two types.
type ComputeMode nvml.ComputeMode

View File

@@ -104,7 +104,7 @@ func (dl *DynamicLibrary) Lookup(symbol string) error {
var pointer unsafe.Pointer
if err := withOSLock(func() error {
// Call dlError() to clear out any previous errors.
dlError()
_ = dlError()
pointer = C.dlsym(dl.handle, sym)
if pointer == nil {
return fmt.Errorf("symbol %q not found: %w", symbol, dlError())

View File

@@ -31,7 +31,7 @@ type dynamicLibrary interface {
Close() error
}
// Interface represents the interace for the NVML library.
// Interface represents the interface for the NVML library.
type Interface interface {
GetLibrary() Library
}

View File

@@ -38,9 +38,10 @@ var errLibraryAlreadyLoaded = errors.New("library already loaded")
// This includes a reference to the underlying DynamicLibrary
type library struct {
sync.Mutex
path string
flags int
dl dynamicLibrary
path string
flags int
refcount refcount
dl dynamicLibrary
}
// libnvml is a global instance of the nvml library.
@@ -77,16 +78,17 @@ var newDynamicLibrary = func(path string, flags int) dynamicLibrary {
// load initializes the library and updates the versioned symbols.
// Multiple calls to an already loaded library will return without error.
func (l *library) load() error {
func (l *library) load() (rerr error) {
l.Lock()
defer l.Unlock()
if l.dl != nil {
defer func() { l.refcount.IncOnNoError(rerr) }()
if l.refcount > 0 {
return nil
}
dl := newDynamicLibrary(l.path, l.flags)
err := dl.Open()
if err != nil {
if err := dl.Open(); err != nil {
return fmt.Errorf("error opening %s: %w", l.path, err)
}
@@ -99,16 +101,16 @@ func (l *library) load() error {
// close the underlying library and ensure that the global pointer to the
// library is set to nil to ensure that subsequent calls to open will reinitialize it.
// Multiple calls to an already closed nvml library will return without error.
func (l *library) close() error {
func (l *library) close() (rerr error) {
l.Lock()
defer l.Unlock()
if l.dl == nil {
defer func() { l.refcount.DecOnNoError(rerr) }()
if l.refcount != 1 {
return nil
}
err := l.dl.Close()
if err != nil {
if err := l.dl.Close(); err != nil {
return fmt.Errorf("error closing %s: %w", l.path, err)
}
@@ -160,12 +162,7 @@ func (pis ProcessInfo_v1Slice) ToProcessInfoSlice() []ProcessInfo {
func (pis ProcessInfo_v2Slice) ToProcessInfoSlice() []ProcessInfo {
var newInfos []ProcessInfo
for _, pi := range pis {
info := ProcessInfo{
Pid: pi.Pid,
UsedGpuMemory: pi.UsedGpuMemory,
GpuInstanceId: pi.GpuInstanceId,
ComputeInstanceId: pi.ComputeInstanceId,
}
info := ProcessInfo(pi)
newInfos = append(newInfos, info)
}
return newInfos

31
vendor/github.com/NVIDIA/go-nvml/pkg/nvml/refcount.go generated vendored Normal file
View File

@@ -0,0 +1,31 @@
/**
# Copyright 2024 NVIDIA CORPORATION
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package nvml
// refcount is an integer reference counter whose updates are conditional on
// the outcome of an operation. It performs no locking of its own.
type refcount int

// IncOnNoError increments the counter by one when err is nil and leaves it
// untouched otherwise.
func (r *refcount) IncOnNoError(err error) {
	if err != nil {
		return
	}
	*r++
}

// DecOnNoError decrements the counter by one when err is nil and the counter
// is strictly positive. In every other case the counter is left untouched, so
// this method never moves the counter below zero.
func (r *refcount) DecOnNoError(err error) {
	if err != nil || *r <= 0 {
		return
	}
	*r--
}

View File

@@ -14,7 +14,80 @@
package nvml
import (
"fmt"
)
// nvml.ErrorString()
func ErrorString(Result Return) string {
return nvmlErrorString(Result)
func ErrorString(r Return) string {
if err := GetLibrary().Lookup("nvmlErrorString"); err != nil {
return fallbackErrorStringFunc(r)
}
return nvmlErrorString(r)
}
// fallbackErrorStringFunc is a basic stand-in for nvmlErrorString.
// It maps each known Return value to its symbolic name, so an error string
// can be produced even when the NVML library is not loaded. Values without a
// dedicated case are rendered numerically.
var fallbackErrorStringFunc = func(r Return) string {
	var name string
	switch r {
	case SUCCESS:
		name = "SUCCESS"
	case ERROR_UNINITIALIZED:
		name = "ERROR_UNINITIALIZED"
	case ERROR_INVALID_ARGUMENT:
		name = "ERROR_INVALID_ARGUMENT"
	case ERROR_NOT_SUPPORTED:
		name = "ERROR_NOT_SUPPORTED"
	case ERROR_NO_PERMISSION:
		name = "ERROR_NO_PERMISSION"
	case ERROR_ALREADY_INITIALIZED:
		name = "ERROR_ALREADY_INITIALIZED"
	case ERROR_NOT_FOUND:
		name = "ERROR_NOT_FOUND"
	case ERROR_INSUFFICIENT_SIZE:
		name = "ERROR_INSUFFICIENT_SIZE"
	case ERROR_INSUFFICIENT_POWER:
		name = "ERROR_INSUFFICIENT_POWER"
	case ERROR_DRIVER_NOT_LOADED:
		name = "ERROR_DRIVER_NOT_LOADED"
	case ERROR_TIMEOUT:
		name = "ERROR_TIMEOUT"
	case ERROR_IRQ_ISSUE:
		name = "ERROR_IRQ_ISSUE"
	case ERROR_LIBRARY_NOT_FOUND:
		name = "ERROR_LIBRARY_NOT_FOUND"
	case ERROR_FUNCTION_NOT_FOUND:
		name = "ERROR_FUNCTION_NOT_FOUND"
	case ERROR_CORRUPTED_INFOROM:
		name = "ERROR_CORRUPTED_INFOROM"
	case ERROR_GPU_IS_LOST:
		name = "ERROR_GPU_IS_LOST"
	case ERROR_RESET_REQUIRED:
		name = "ERROR_RESET_REQUIRED"
	case ERROR_OPERATING_SYSTEM:
		name = "ERROR_OPERATING_SYSTEM"
	case ERROR_LIB_RM_VERSION_MISMATCH:
		name = "ERROR_LIB_RM_VERSION_MISMATCH"
	case ERROR_IN_USE:
		name = "ERROR_IN_USE"
	case ERROR_MEMORY:
		name = "ERROR_MEMORY"
	case ERROR_NO_DATA:
		name = "ERROR_NO_DATA"
	case ERROR_VGPU_ECC_NOT_SUPPORTED:
		name = "ERROR_VGPU_ECC_NOT_SUPPORTED"
	case ERROR_INSUFFICIENT_RESOURCES:
		name = "ERROR_INSUFFICIENT_RESOURCES"
	case ERROR_FREQ_NOT_SUPPORTED:
		name = "ERROR_FREQ_NOT_SUPPORTED"
	case ERROR_ARGUMENT_VERSION_MISMATCH:
		name = "ERROR_ARGUMENT_VERSION_MISMATCH"
	case ERROR_DEPRECATED:
		name = "ERROR_DEPRECATED"
	case ERROR_UNKNOWN:
		name = "ERROR_UNKNOWN"
	default:
		// Unrecognized codes still yield a usable message.
		name = fmt.Sprintf("unknown return value: %d", r)
	}
	return name
}