diff --git a/go.mod b/go.mod index 43d6762d..34c40459 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/NVIDIA/nvidia-container-toolkit go 1.20 require ( - github.com/NVIDIA/go-nvml v0.12.0-1 + github.com/NVIDIA/go-nvml v0.12.0-2 github.com/fsnotify/fsnotify v1.7.0 github.com/opencontainers/runtime-spec v1.1.0 github.com/pelletier/go-toml v1.9.5 diff --git a/go.sum b/go.sum index 254a8873..84f896bc 100644 --- a/go.sum +++ b/go.sum @@ -1,5 +1,5 @@ -github.com/NVIDIA/go-nvml v0.12.0-1 h1:6mdjtlFo+17dWL7VFPfuRMtf0061TF4DKls9pkSw6uM= -github.com/NVIDIA/go-nvml v0.12.0-1/go.mod h1:hy7HYeQy335x6nEss0Ne3PYqleRa6Ct+VKD9RQ4nyFs= +github.com/NVIDIA/go-nvml v0.12.0-2 h1:Sg239yy7jmopu/cuvYauoMj9fOpcGMngxVxxS1EBXeY= +github.com/NVIDIA/go-nvml v0.12.0-2/go.mod h1:7ruy85eOM73muOc/I37euONSwEyFqZsv5ED9AogD4G0= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w= @@ -46,9 +46,13 @@ github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI= diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/dl/dl.go b/vendor/github.com/NVIDIA/go-nvml/pkg/dl/dl.go index 21a02091..34948a72 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/dl/dl.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/dl/dl.go @@ -15,7 +15,9 @@ package dl import ( + "errors" "fmt" + "runtime" "unsafe" ) @@ -25,45 +27,72 @@ import ( import "C" const ( - RTLD_LAZY = C.RTLD_LAZY - RTLD_NOW = C.RTLD_NOW - RTLD_GLOBAL = C.RTLD_GLOBAL - RTLD_LOCAL = C.RTLD_LOCAL + RTLD_LAZY = C.RTLD_LAZY + RTLD_NOW = C.RTLD_NOW + RTLD_GLOBAL = C.RTLD_GLOBAL + RTLD_LOCAL = C.RTLD_LOCAL RTLD_NODELETE = C.RTLD_NODELETE - RTLD_NOLOAD = C.RTLD_NOLOAD - RTLD_DEEPBIND = C.RTLD_DEEPBIND + RTLD_NOLOAD = C.RTLD_NOLOAD ) -type DynamicLibrary struct{ - Name string - Flags int +type DynamicLibrary struct { + Name string + Flags int handle unsafe.Pointer } func New(name string, flags int) *DynamicLibrary { return &DynamicLibrary{ - Name: name, - Flags: flags, + Name: name, + Flags: flags, handle: nil, - } + } +} + +func withOSLock(action func() error) error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + return action() +} + +func dlError() error { + lastErr := C.dlerror() + if lastErr == nil { + return nil + } + return errors.New(C.GoString(lastErr)) } func (dl *DynamicLibrary) Open() error { name := C.CString(dl.Name) defer C.free(unsafe.Pointer(name)) - handle := C.dlopen(name, C.int(dl.Flags)) - if handle == C.NULL { - return fmt.Errorf("%s", C.GoString(C.dlerror())) + if err := withOSLock(func() error { + handle := C.dlopen(name, C.int(dl.Flags)) + if handle == nil { + return dlError() + } + dl.handle = handle + return nil + }); err != nil { + return err } - dl.handle = handle return nil } func (dl *DynamicLibrary) Close() error { - err := C.dlclose(dl.handle) - if err != 0 { - return fmt.Errorf("%s", C.GoString(C.dlerror())) + if dl.handle == nil { + return nil + } + if err := withOSLock(func() error { + if C.dlclose(dl.handle) != 0 { + return dlError() + } + dl.handle = nil + return nil + }); err != nil { + return err } return nil } @@ -72,11 +101,17 @@ func (dl *DynamicLibrary) Lookup(symbol string) error { sym := C.CString(symbol) defer C.free(unsafe.Pointer(sym)) - C.dlerror() // Clear out any previous errors - C.dlsym(dl.handle, sym) - err := C.dlerror() - if unsafe.Pointer(err) == C.NULL { + var pointer unsafe.Pointer + if err := withOSLock(func() error { + // Call dlError() to clear out any previous errors. + _ = dlError() + pointer = C.dlsym(dl.handle, sym) + if pointer == nil { + return fmt.Errorf("symbol %q not found: %w", symbol, dlError()) + } return nil + }); err != nil { + return err } - return fmt.Errorf("%s", C.GoString(err)) + return nil } diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/dl/dl_linux.go b/vendor/github.com/NVIDIA/go-nvml/pkg/dl/dl_linux.go new file mode 100644 index 00000000..ae3acd07 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/dl/dl_linux.go @@ -0,0 +1,26 @@ +/** +# Copyright (c) NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package dl + +// #cgo LDFLAGS: -ldl +// #include +// #include +import "C" + +const ( + RTLD_DEEPBIND = C.RTLD_DEEPBIND +) diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/api.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/api.go new file mode 100644 index 00000000..4885e8e9 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/api.go @@ -0,0 +1,37 @@ +/** +# Copyright 2023 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvml + +// Library defines a set of functions defined on the underlying dynamic library. +type Library interface { + Lookup(string) error +} + +// dynamicLibrary is an interface for abstacting the underlying library. +// This also allows for mocking and testing. + +//go:generate moq -stub -out dynamicLibrary_mock.go . dynamicLibrary +type dynamicLibrary interface { + Lookup(string) error + Open() error + Close() error +} + +// Interface represents the interface for the NVML library. +type Interface interface { + GetLibrary() Library +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go index 1a0efaf6..f4cecfbc 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/const.go @@ -18,7 +18,8 @@ package nvml /* -#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files +#cgo linux LDFLAGS: -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files +#cgo darwin LDFLAGS: -Wl,-undefined,dynamic_lookup #cgo CFLAGS: -DNVML_NO_UNVERSIONED_FUNC_DEFS=1 #include "nvml.h" #include diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/dynamicLibrary_mock.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/dynamicLibrary_mock.go new file mode 100644 index 00000000..b785431c --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/dynamicLibrary_mock.go @@ -0,0 +1,157 @@ +// Code generated by moq; DO NOT EDIT. +// github.com/matryer/moq + +package nvml + +import ( + "sync" +) + +// Ensure, that dynamicLibraryMock does implement dynamicLibrary. +// If this is not the case, regenerate this file with moq. +var _ dynamicLibrary = &dynamicLibraryMock{} + +// dynamicLibraryMock is a mock implementation of dynamicLibrary. +// +// func TestSomethingThatUsesdynamicLibrary(t *testing.T) { +// +// // make and configure a mocked dynamicLibrary +// mockeddynamicLibrary := &dynamicLibraryMock{ +// CloseFunc: func() error { +// panic("mock out the Close method") +// }, +// LookupFunc: func(s string) error { +// panic("mock out the Lookup method") +// }, +// OpenFunc: func() error { +// panic("mock out the Open method") +// }, +// } +// +// // use mockeddynamicLibrary in code that requires dynamicLibrary +// // and then make assertions. +// +// } +type dynamicLibraryMock struct { + // CloseFunc mocks the Close method. + CloseFunc func() error + + // LookupFunc mocks the Lookup method. + LookupFunc func(s string) error + + // OpenFunc mocks the Open method. + OpenFunc func() error + + // calls tracks calls to the methods. + calls struct { + // Close holds details about calls to the Close method. + Close []struct { + } + // Lookup holds details about calls to the Lookup method. + Lookup []struct { + // S is the s argument value. + S string + } + // Open holds details about calls to the Open method. + Open []struct { + } + } + lockClose sync.RWMutex + lockLookup sync.RWMutex + lockOpen sync.RWMutex +} + +// Close calls CloseFunc. +func (mock *dynamicLibraryMock) Close() error { + callInfo := struct { + }{} + mock.lockClose.Lock() + mock.calls.Close = append(mock.calls.Close, callInfo) + mock.lockClose.Unlock() + if mock.CloseFunc == nil { + var ( + errOut error + ) + return errOut + } + return mock.CloseFunc() +} + +// CloseCalls gets all the calls that were made to Close. +// Check the length with: +// +// len(mockeddynamicLibrary.CloseCalls()) +func (mock *dynamicLibraryMock) CloseCalls() []struct { +} { + var calls []struct { + } + mock.lockClose.RLock() + calls = mock.calls.Close + mock.lockClose.RUnlock() + return calls +} + +// Lookup calls LookupFunc. +func (mock *dynamicLibraryMock) Lookup(s string) error { + callInfo := struct { + S string + }{ + S: s, + } + mock.lockLookup.Lock() + mock.calls.Lookup = append(mock.calls.Lookup, callInfo) + mock.lockLookup.Unlock() + if mock.LookupFunc == nil { + var ( + errOut error + ) + return errOut + } + return mock.LookupFunc(s) +} + +// LookupCalls gets all the calls that were made to Lookup. +// Check the length with: +// +// len(mockeddynamicLibrary.LookupCalls()) +func (mock *dynamicLibraryMock) LookupCalls() []struct { + S string +} { + var calls []struct { + S string + } + mock.lockLookup.RLock() + calls = mock.calls.Lookup + mock.lockLookup.RUnlock() + return calls +} + +// Open calls OpenFunc. +func (mock *dynamicLibraryMock) Open() error { + callInfo := struct { + }{} + mock.lockOpen.Lock() + mock.calls.Open = append(mock.calls.Open, callInfo) + mock.lockOpen.Unlock() + if mock.OpenFunc == nil { + var ( + errOut error + ) + return errOut + } + return mock.OpenFunc() +} + +// OpenCalls gets all the calls that were made to Open. +// Check the length with: +// +// len(mockeddynamicLibrary.OpenCalls()) +func (mock *dynamicLibraryMock) OpenCalls() []struct { +} { + var calls []struct { + } + mock.lockOpen.RLock() + calls = mock.calls.Open + mock.lockOpen.RUnlock() + return calls +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/gpm.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/gpm.go new file mode 100644 index 00000000..c46c5d78 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/gpm.go @@ -0,0 +1,93 @@ +// Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package nvml + +// nvml.GpmMetricsGet() +type GpmMetricsGetVType struct { + metricsGet *GpmMetricsGetType +} + +func GpmMetricsGetV(MetricsGet *GpmMetricsGetType) GpmMetricsGetVType { + return GpmMetricsGetVType{MetricsGet} +} + +func (MetricsGetV GpmMetricsGetVType) V1() Return { + MetricsGetV.metricsGet.Version = 1 + return nvmlGpmMetricsGet(MetricsGetV.metricsGet) +} + +func GpmMetricsGet(MetricsGet *GpmMetricsGetType) Return { + MetricsGet.Version = GPM_METRICS_GET_VERSION + return nvmlGpmMetricsGet(MetricsGet) +} + +// nvml.GpmSampleFree() +func GpmSampleFree(GpmSample GpmSample) Return { + return nvmlGpmSampleFree(GpmSample) +} + +// nvml.GpmSampleAlloc() +func GpmSampleAlloc(GpmSample *GpmSample) Return { + return nvmlGpmSampleAlloc(GpmSample) +} + +// nvml.GpmSampleGet() +func GpmSampleGet(Device Device, GpmSample GpmSample) Return { + return nvmlGpmSampleGet(Device, GpmSample) +} + +func (Device Device) GpmSampleGet(GpmSample GpmSample) Return { + return GpmSampleGet(Device, GpmSample) +} + +// nvml.GpmQueryDeviceSupport() +type GpmSupportV struct { + device Device +} + +func GpmQueryDeviceSupportV(Device Device) GpmSupportV { + return GpmSupportV{Device} +} + +func (Device Device) GpmQueryDeviceSupportV() GpmSupportV { + return GpmSupportV{Device} +} + +func (GpmSupportV GpmSupportV) V1() (GpmSupport, Return) { + var GpmSupport GpmSupport + GpmSupport.Version = 1 + ret := nvmlGpmQueryDeviceSupport(GpmSupportV.device, &GpmSupport) + return GpmSupport, ret +} + +func GpmQueryDeviceSupport(Device Device) (GpmSupport, Return) { + var GpmSupport GpmSupport + GpmSupport.Version = GPM_SUPPORT_VERSION + ret := nvmlGpmQueryDeviceSupport(Device, &GpmSupport) + return GpmSupport, ret +} + +func (Device Device) GpmQueryDeviceSupport() (GpmSupport, Return) { + return GpmQueryDeviceSupport(Device) +} + +// nvml.GpmMigSampleGet() +func GpmMigSampleGet(Device Device, GpuInstanceId int, GpmSample GpmSample) Return { + return nvmlGpmMigSampleGet(Device, uint32(GpuInstanceId), GpmSample) +} + +func (Device Device) GpmMigSampleGet(GpuInstanceId int, GpmSample GpmSample) Return { + return GpmMigSampleGet(Device, GpuInstanceId, GpmSample) +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/init.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/init.go index 1572f81f..e2bc943b 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/init.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/init.go @@ -14,45 +14,21 @@ package nvml -import ( - "fmt" - - "github.com/NVIDIA/go-nvml/pkg/dl" -) - import "C" -const ( - nvmlLibraryName = "libnvidia-ml.so.1" - nvmlLibraryLoadFlags = dl.RTLD_LAZY | dl.RTLD_GLOBAL -) - -var nvml *dl.DynamicLibrary - // nvml.Init() func Init() Return { - lib := dl.New(nvmlLibraryName, nvmlLibraryLoadFlags) - err := lib.Open() - if err != nil { + if err := libnvml.load(); err != nil { return ERROR_LIBRARY_NOT_FOUND } - - nvml = lib - updateVersionedSymbols() - return nvmlInit() } // nvml.InitWithFlags() func InitWithFlags(Flags uint32) Return { - lib := dl.New(nvmlLibraryName, nvmlLibraryLoadFlags) - err := lib.Open() - if err != nil { + if err := libnvml.load(); err != nil { return ERROR_LIBRARY_NOT_FOUND } - - nvml = lib - return nvmlInitWithFlags(Flags) } @@ -63,156 +39,10 @@ func Shutdown() Return { return ret } - err := nvml.Close() + err := libnvml.close() if err != nil { - panic(fmt.Sprintf("error closing %s: %v", nvmlLibraryName, err)) + panic(err) } return ret } - -// Default all versioned APIs to v1 (to infer the types) -var nvmlInit = nvmlInit_v1 -var nvmlDeviceGetPciInfo = nvmlDeviceGetPciInfo_v1 -var nvmlDeviceGetCount = nvmlDeviceGetCount_v1 -var nvmlDeviceGetHandleByIndex = nvmlDeviceGetHandleByIndex_v1 -var nvmlDeviceGetHandleByPciBusId = nvmlDeviceGetHandleByPciBusId_v1 -var nvmlDeviceGetNvLinkRemotePciInfo = nvmlDeviceGetNvLinkRemotePciInfo_v1 -var nvmlDeviceRemoveGpu = nvmlDeviceRemoveGpu_v1 -var nvmlDeviceGetGridLicensableFeatures = nvmlDeviceGetGridLicensableFeatures_v1 -var nvmlEventSetWait = nvmlEventSetWait_v1 -var nvmlDeviceGetAttributes = nvmlDeviceGetAttributes_v1 -var nvmlComputeInstanceGetInfo = nvmlComputeInstanceGetInfo_v1 -var DeviceGetComputeRunningProcesses = deviceGetComputeRunningProcesses_v1 -var DeviceGetGraphicsRunningProcesses = deviceGetGraphicsRunningProcesses_v1 -var DeviceGetMPSComputeRunningProcesses = deviceGetMPSComputeRunningProcesses_v1 -var GetBlacklistDeviceCount = GetExcludedDeviceCount -var GetBlacklistDeviceInfoByIndex = GetExcludedDeviceInfoByIndex -var nvmlDeviceGetGpuInstancePossiblePlacements = nvmlDeviceGetGpuInstancePossiblePlacements_v1 -var nvmlVgpuInstanceGetLicenseInfo = nvmlVgpuInstanceGetLicenseInfo_v1 - -type BlacklistDeviceInfo = ExcludedDeviceInfo -type ProcessInfo_v1Slice []ProcessInfo_v1 -type ProcessInfo_v2Slice []ProcessInfo_v2 - -func (pis ProcessInfo_v1Slice) ToProcessInfoSlice() []ProcessInfo { - var newInfos []ProcessInfo - for _, pi := range pis { - info := ProcessInfo{ - Pid: pi.Pid, - UsedGpuMemory: pi.UsedGpuMemory, - GpuInstanceId: 0xFFFFFFFF, // GPU instance ID is invalid in v1 - ComputeInstanceId: 0xFFFFFFFF, // Compute instance ID is invalid in v1 - } - newInfos = append(newInfos, info) - } - return newInfos -} - -func (pis ProcessInfo_v2Slice) ToProcessInfoSlice() []ProcessInfo { - var newInfos []ProcessInfo - for _, pi := range pis { - info := ProcessInfo{ - Pid: pi.Pid, - UsedGpuMemory: pi.UsedGpuMemory, - GpuInstanceId: pi.GpuInstanceId, - ComputeInstanceId: pi.ComputeInstanceId, - } - newInfos = append(newInfos, info) - } - return newInfos -} - -// updateVersionedSymbols() -func updateVersionedSymbols() { - err := nvml.Lookup("nvmlInit_v2") - if err == nil { - nvmlInit = nvmlInit_v2 - } - err = nvml.Lookup("nvmlDeviceGetPciInfo_v2") - if err == nil { - nvmlDeviceGetPciInfo = nvmlDeviceGetPciInfo_v2 - } - err = nvml.Lookup("nvmlDeviceGetPciInfo_v3") - if err == nil { - nvmlDeviceGetPciInfo = nvmlDeviceGetPciInfo_v3 - } - err = nvml.Lookup("nvmlDeviceGetCount_v2") - if err == nil { - nvmlDeviceGetCount = nvmlDeviceGetCount_v2 - } - err = nvml.Lookup("nvmlDeviceGetHandleByIndex_v2") - if err == nil { - nvmlDeviceGetHandleByIndex = nvmlDeviceGetHandleByIndex_v2 - } - err = nvml.Lookup("nvmlDeviceGetHandleByPciBusId_v2") - if err == nil { - nvmlDeviceGetHandleByPciBusId = nvmlDeviceGetHandleByPciBusId_v2 - } - err = nvml.Lookup("nvmlDeviceGetNvLinkRemotePciInfo_v2") - if err == nil { - nvmlDeviceGetNvLinkRemotePciInfo = nvmlDeviceGetNvLinkRemotePciInfo_v2 - } - // Unable to overwrite nvmlDeviceRemoveGpu() because the v2 function takes - // a different set of parameters than the v1 function. - //err = nvml.Lookup("nvmlDeviceRemoveGpu_v2") - //if err == nil { - // nvmlDeviceRemoveGpu = nvmlDeviceRemoveGpu_v2 - //} - err = nvml.Lookup("nvmlDeviceGetGridLicensableFeatures_v2") - if err == nil { - nvmlDeviceGetGridLicensableFeatures = nvmlDeviceGetGridLicensableFeatures_v2 - } - err = nvml.Lookup("nvmlDeviceGetGridLicensableFeatures_v3") - if err == nil { - nvmlDeviceGetGridLicensableFeatures = nvmlDeviceGetGridLicensableFeatures_v3 - } - err = nvml.Lookup("nvmlDeviceGetGridLicensableFeatures_v4") - if err == nil { - nvmlDeviceGetGridLicensableFeatures = nvmlDeviceGetGridLicensableFeatures_v4 - } - err = nvml.Lookup("nvmlEventSetWait_v2") - if err == nil { - nvmlEventSetWait = nvmlEventSetWait_v2 - } - err = nvml.Lookup("nvmlDeviceGetAttributes_v2") - if err == nil { - nvmlDeviceGetAttributes = nvmlDeviceGetAttributes_v2 - } - err = nvml.Lookup("nvmlComputeInstanceGetInfo_v2") - if err == nil { - nvmlComputeInstanceGetInfo = nvmlComputeInstanceGetInfo_v2 - } - err = nvml.Lookup("nvmlDeviceGetComputeRunningProcesses_v2") - if err == nil { - DeviceGetComputeRunningProcesses = deviceGetComputeRunningProcesses_v2 - } - err = nvml.Lookup("nvmlDeviceGetComputeRunningProcesses_v3") - if err == nil { - DeviceGetComputeRunningProcesses = deviceGetComputeRunningProcesses_v3 - } - err = nvml.Lookup("nvmlDeviceGetGraphicsRunningProcesses_v2") - if err == nil { - DeviceGetGraphicsRunningProcesses = deviceGetGraphicsRunningProcesses_v2 - } - err = nvml.Lookup("nvmlDeviceGetGraphicsRunningProcesses_v3") - if err == nil { - DeviceGetGraphicsRunningProcesses = deviceGetGraphicsRunningProcesses_v3 - } - err = nvml.Lookup("nvmlDeviceGetMPSComputeRunningProcesses_v2") - if err == nil { - DeviceGetMPSComputeRunningProcesses = deviceGetMPSComputeRunningProcesses_v2 - } - err = nvml.Lookup("nvmlDeviceGetMPSComputeRunningProcesses_v3") - if err == nil { - DeviceGetMPSComputeRunningProcesses = deviceGetMPSComputeRunningProcesses_v3 - } - err = nvml.Lookup("nvmlDeviceGetGpuInstancePossiblePlacements_v2") - if err == nil { - nvmlDeviceGetGpuInstancePossiblePlacements = nvmlDeviceGetGpuInstancePossiblePlacements_v2 - } - err = nvml.Lookup("nvmlVgpuInstanceGetLicenseInfo_v2") - if err == nil { - nvmlVgpuInstanceGetLicenseInfo = nvmlVgpuInstanceGetLicenseInfo_v2 - } -} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/lib.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/lib.go new file mode 100644 index 00000000..4d5eb8e0 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/lib.go @@ -0,0 +1,297 @@ +/** +# Copyright 2023 NVIDIA CORPORATION +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvml + +import ( + "errors" + "fmt" + "sync" + + "github.com/NVIDIA/go-nvml/pkg/dl" +) + +import "C" + +const ( + defaultNvmlLibraryName = "libnvidia-ml.so.1" + defaultNvmlLibraryLoadFlags = dl.RTLD_LAZY | dl.RTLD_GLOBAL +) + +var errLibraryNotLoaded = errors.New("library not loaded") +var errLibraryAlreadyLoaded = errors.New("library already loaded") + +// library represents an nvml library. +// This includes a reference to the underlying DynamicLibrary +type library struct { + sync.Mutex + path string + flags int + dl dynamicLibrary +} + +// libnvml is a global instance of the nvml library. +var libnvml = library{ + path: defaultNvmlLibraryName, + flags: defaultNvmlLibraryLoadFlags, +} + +var _ Interface = (*library)(nil) + +// GetLibrary returns a the library as a Library interface. +func (l *library) GetLibrary() Library { + return l +} + +// GetLibrary returns a representation of the underlying library that implements the Library interface. +func GetLibrary() Library { + return libnvml.GetLibrary() +} + +// Lookup checks whether the specified library symbol exists in the library. +// Note that this requires that the library be loaded. +func (l *library) Lookup(name string) error { + if l == nil || l.dl == nil { + return fmt.Errorf("error looking up %s: %w", name, errLibraryNotLoaded) + } + return l.dl.Lookup(name) +} + +// newDynamicLibrary is a function variable that can be overridden for testing. +var newDynamicLibrary = func(path string, flags int) dynamicLibrary { + return dl.New(path, flags) +} + +// load initializes the library and updates the versioned symbols. +// Multiple calls to an already loaded library will return without error. +func (l *library) load() error { + l.Lock() + defer l.Unlock() + if l.dl != nil { + return nil + } + + dl := newDynamicLibrary(l.path, l.flags) + err := dl.Open() + if err != nil { + return fmt.Errorf("error opening %s: %w", l.path, err) + } + + l.dl = dl + l.updateVersionedSymbols() + + return nil +} + +// close the underlying library and ensure that the global pointer to the +// library is set to nil to ensure that subsequent calls to open will reinitialize it. +// Multiple calls to an already closed nvml library will return without error. +func (l *library) close() error { + l.Lock() + defer l.Unlock() + + if l.dl == nil { + return nil + } + + err := l.dl.Close() + if err != nil { + return fmt.Errorf("error closing %s: %w", l.path, err) + } + + l.dl = nil + + return nil +} + +// Default all versioned APIs to v1 (to infer the types) +var nvmlInit = nvmlInit_v1 +var nvmlDeviceGetPciInfo = nvmlDeviceGetPciInfo_v1 +var nvmlDeviceGetCount = nvmlDeviceGetCount_v1 +var nvmlDeviceGetHandleByIndex = nvmlDeviceGetHandleByIndex_v1 +var nvmlDeviceGetHandleByPciBusId = nvmlDeviceGetHandleByPciBusId_v1 +var nvmlDeviceGetNvLinkRemotePciInfo = nvmlDeviceGetNvLinkRemotePciInfo_v1 +var nvmlDeviceRemoveGpu = nvmlDeviceRemoveGpu_v1 +var nvmlDeviceGetGridLicensableFeatures = nvmlDeviceGetGridLicensableFeatures_v1 +var nvmlEventSetWait = nvmlEventSetWait_v1 +var nvmlDeviceGetAttributes = nvmlDeviceGetAttributes_v1 +var nvmlComputeInstanceGetInfo = nvmlComputeInstanceGetInfo_v1 +var DeviceGetComputeRunningProcesses = deviceGetComputeRunningProcesses_v1 +var DeviceGetGraphicsRunningProcesses = deviceGetGraphicsRunningProcesses_v1 +var DeviceGetMPSComputeRunningProcesses = deviceGetMPSComputeRunningProcesses_v1 +var GetBlacklistDeviceCount = GetExcludedDeviceCount +var GetBlacklistDeviceInfoByIndex = GetExcludedDeviceInfoByIndex +var nvmlDeviceGetGpuInstancePossiblePlacements = nvmlDeviceGetGpuInstancePossiblePlacements_v1 +var nvmlVgpuInstanceGetLicenseInfo = nvmlVgpuInstanceGetLicenseInfo_v1 + +// BlacklistDeviceInfo was replaced by ExcludedDeviceInfo +type BlacklistDeviceInfo = ExcludedDeviceInfo + +type ProcessInfo_v1Slice []ProcessInfo_v1 +type ProcessInfo_v2Slice []ProcessInfo_v2 + +func (pis ProcessInfo_v1Slice) ToProcessInfoSlice() []ProcessInfo { + var newInfos []ProcessInfo + for _, pi := range pis { + info := ProcessInfo{ + Pid: pi.Pid, + UsedGpuMemory: pi.UsedGpuMemory, + GpuInstanceId: 0xFFFFFFFF, // GPU instance ID is invalid in v1 + ComputeInstanceId: 0xFFFFFFFF, // Compute instance ID is invalid in v1 + } + newInfos = append(newInfos, info) + } + return newInfos +} + +func (pis ProcessInfo_v2Slice) ToProcessInfoSlice() []ProcessInfo { + var newInfos []ProcessInfo + for _, pi := range pis { + info := ProcessInfo(pi) + newInfos = append(newInfos, info) + } + return newInfos +} + +// updateVersionedSymbols checks for versioned symbols in the loaded dynamic library. +// If newer versioned symbols exist, these replace the default `v1` symbols initialized above. +// When new versioned symbols are added, these would have to be initialized above and have +// corresponding checks and subsequent assignments added below. +func (l *library) updateVersionedSymbols() { + err := l.Lookup("nvmlInit_v2") + if err == nil { + nvmlInit = nvmlInit_v2 + } + err = l.Lookup("nvmlDeviceGetPciInfo_v2") + if err == nil { + nvmlDeviceGetPciInfo = nvmlDeviceGetPciInfo_v2 + } + err = l.Lookup("nvmlDeviceGetPciInfo_v3") + if err == nil { + nvmlDeviceGetPciInfo = nvmlDeviceGetPciInfo_v3 + } + err = l.Lookup("nvmlDeviceGetCount_v2") + if err == nil { + nvmlDeviceGetCount = nvmlDeviceGetCount_v2 + } + err = l.Lookup("nvmlDeviceGetHandleByIndex_v2") + if err == nil { + nvmlDeviceGetHandleByIndex = nvmlDeviceGetHandleByIndex_v2 + } + err = l.Lookup("nvmlDeviceGetHandleByPciBusId_v2") + if err == nil { + nvmlDeviceGetHandleByPciBusId = nvmlDeviceGetHandleByPciBusId_v2 + } + err = l.Lookup("nvmlDeviceGetNvLinkRemotePciInfo_v2") + if err == nil { + nvmlDeviceGetNvLinkRemotePciInfo = nvmlDeviceGetNvLinkRemotePciInfo_v2 + } + // Unable to overwrite nvmlDeviceRemoveGpu() because the v2 function takes + // a different set of parameters than the v1 function. + //err = l.Lookup("nvmlDeviceRemoveGpu_v2") + //if err == nil { + // nvmlDeviceRemoveGpu = nvmlDeviceRemoveGpu_v2 + //} + err = l.Lookup("nvmlDeviceGetGridLicensableFeatures_v2") + if err == nil { + nvmlDeviceGetGridLicensableFeatures = nvmlDeviceGetGridLicensableFeatures_v2 + } + err = l.Lookup("nvmlDeviceGetGridLicensableFeatures_v3") + if err == nil { + nvmlDeviceGetGridLicensableFeatures = nvmlDeviceGetGridLicensableFeatures_v3 + } + err = l.Lookup("nvmlDeviceGetGridLicensableFeatures_v4") + if err == nil { + nvmlDeviceGetGridLicensableFeatures = nvmlDeviceGetGridLicensableFeatures_v4 + } + err = l.Lookup("nvmlEventSetWait_v2") + if err == nil { + nvmlEventSetWait = nvmlEventSetWait_v2 + } + err = l.Lookup("nvmlDeviceGetAttributes_v2") + if err == nil { + nvmlDeviceGetAttributes = nvmlDeviceGetAttributes_v2 + } + err = l.Lookup("nvmlComputeInstanceGetInfo_v2") + if err == nil { + nvmlComputeInstanceGetInfo = nvmlComputeInstanceGetInfo_v2 + } + err = l.Lookup("nvmlDeviceGetComputeRunningProcesses_v2") + if err == nil { + DeviceGetComputeRunningProcesses = deviceGetComputeRunningProcesses_v2 + } + err = l.Lookup("nvmlDeviceGetComputeRunningProcesses_v3") + if err == nil { + DeviceGetComputeRunningProcesses = deviceGetComputeRunningProcesses_v3 + } + err = l.Lookup("nvmlDeviceGetGraphicsRunningProcesses_v2") + if err == nil { + DeviceGetGraphicsRunningProcesses = deviceGetGraphicsRunningProcesses_v2 + } + err = l.Lookup("nvmlDeviceGetGraphicsRunningProcesses_v3") + if err == nil { + DeviceGetGraphicsRunningProcesses = deviceGetGraphicsRunningProcesses_v3 + } + err = l.Lookup("nvmlDeviceGetMPSComputeRunningProcesses_v2") + if err == nil { + DeviceGetMPSComputeRunningProcesses = deviceGetMPSComputeRunningProcesses_v2 + } + err = l.Lookup("nvmlDeviceGetMPSComputeRunningProcesses_v3") + if err == nil { + DeviceGetMPSComputeRunningProcesses = deviceGetMPSComputeRunningProcesses_v3 + } + err = l.Lookup("nvmlDeviceGetGpuInstancePossiblePlacements_v2") + if err == nil { + nvmlDeviceGetGpuInstancePossiblePlacements = nvmlDeviceGetGpuInstancePossiblePlacements_v2 + } + err = l.Lookup("nvmlVgpuInstanceGetLicenseInfo_v2") + if err == nil { + nvmlVgpuInstanceGetLicenseInfo = nvmlVgpuInstanceGetLicenseInfo_v2 + } +} + +// LibraryOption represents a functional option to configure the underlying NVML library +type LibraryOption func(*library) + +// WithLibraryPath provides an option to set the library name to be used by the NVML library. +func WithLibraryPath(path string) LibraryOption { + return func(l *library) { + l.path = path + } +} + +// SetLibraryOptions applies the specified options to the NVML library. +// If this is called when a library is already loaded, and error is raised. +func SetLibraryOptions(opts ...LibraryOption) error { + libnvml.Lock() + defer libnvml.Unlock() + if libnvml.dl != nil { + return errLibraryAlreadyLoaded + } + + for _, opt := range opts { + opt(&libnvml) + } + + if libnvml.path == "" { + libnvml.path = defaultNvmlLibraryName + } + if libnvml.flags == 0 { + libnvml.flags = defaultNvmlLibraryLoadFlags + } + + return nil +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go index f63dfe8e..9bd6965d 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/nvml.go @@ -18,7 +18,8 @@ package nvml /* -#cgo LDFLAGS: -Wl,--unresolved-symbols=ignore-in-object-files +#cgo linux LDFLAGS: -Wl,--export-dynamic -Wl,--unresolved-symbols=ignore-in-object-files +#cgo darwin LDFLAGS: -Wl,-undefined,dynamic_lookup #cgo CFLAGS: -DNVML_NO_UNVERSIONED_FUNC_DEFS=1 #include "nvml.h" #include diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go index bbb93e3d..2366fb70 100644 --- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/vgpu.go @@ -438,7 +438,7 @@ func GetVgpuVersion() (VgpuVersion, VgpuVersion, Return) { // nvml.SetVgpuVersion() func SetVgpuVersion(VgpuVersion *VgpuVersion) Return { - return SetVgpuVersion(VgpuVersion) + return nvmlSetVgpuVersion(VgpuVersion) } // nvml.VgpuInstanceClearAccountingPids() diff --git a/vendor/modules.txt b/vendor/modules.txt index 5395d086..ea8ff96d 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1,4 +1,4 @@ -# github.com/NVIDIA/go-nvml v0.12.0-1 +# github.com/NVIDIA/go-nvml v0.12.0-2 ## explicit; go 1.15 github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml