nvsandboxutils: Add usage of GetDriverVersion API

This change includes the usage of Sandboxutils GetDriverVersion API to
retrieve the CUDA driver version. If the library is not available on the
system or the API call fails for some other reason, it will fallback to
the NVML API to return the driver version.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
Signed-off-by: Huy Nguyen <huyn@nvidia.com>
Signed-off-by: Sananya Majumder <sananyam@nvidia.com>
This commit is contained in:
Sananya Majumder 2024-09-24 10:05:11 -07:00
parent dcbf5bc81f
commit 7b770f63c3
2 changed files with 48 additions and 0 deletions

View File

@ -27,6 +27,7 @@ import (
"tags.cncf.io/container-device-interface/specs-go"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
)
@ -52,6 +53,19 @@ func (l *nvmllib) GetAllDeviceSpecs() ([]specs.Device, error) {
}
}()
if l.nvsandboxutilslib != nil {
if r := l.nvsandboxutilslib.Init(l.driverRoot); r != nvsandboxutils.SUCCESS {
l.logger.Warningf("Failed to init nvsandboxutils: %v; ignoring", r)
l.nvsandboxutilslib = nil
}
defer func() {
if l.nvsandboxutilslib == nil {
return
}
_ = l.nvsandboxutilslib.Shutdown()
}()
}
gpuDeviceSpecs, err := l.getGPUDeviceSpecs()
if err != nil {
return nil, err

View File

@ -26,6 +26,7 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils"
"github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform"
@ -43,6 +44,7 @@ type wrapper struct {
type nvcdilib struct {
logger logger.Interface
nvmllib nvml.Interface
nvsandboxutilslib nvsandboxutils.Interface
mode string
devicelib device.Interface
deviceNamers DeviceNamers
@ -107,6 +109,19 @@ func New(opts ...Option) (Interface, error) {
}
l.nvmllib = nvml.New(nvmlOpts...)
}
if l.nvsandboxutilslib == nil {
var nvsandboxutilsOpts []nvsandboxutils.LibraryOption
// Set the library path for libnvidia-sandboxutils
candidates, err := l.driver.Libraries().Locate("libnvidia-sandboxutils.so.1")
if err != nil {
l.logger.Warningf("Ignoring error in locating libnvidia-sandboxutils.so.1: %v", err)
} else {
libNvidiaSandboxutilsPath := candidates[0]
l.logger.Infof("Using %v", libNvidiaSandboxutilsPath)
nvsandboxutilsOpts = append(nvsandboxutilsOpts, nvsandboxutils.WithLibraryPath(libNvidiaSandboxutilsPath))
}
l.nvsandboxutilslib = nvsandboxutils.New(nvsandboxutilsOpts...)
}
if l.devicelib == nil {
l.devicelib = device.New(l.nvmllib)
}
@ -214,6 +229,16 @@ func (l *nvcdilib) resolveMode() (rmode string) {
// getCudaVersion returns the CUDA version of the current system.
func (l *nvcdilib) getCudaVersion() (string, error) {
version, err := l.getCudaVersionNvsandboxutils()
if err == nil {
return version, err
}
// Fallback to NVML
return l.getCudaVersionNvml()
}
func (l *nvcdilib) getCudaVersionNvml() (string, error) {
if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
return "", fmt.Errorf("nvml not detected: %v", reason)
}
@ -236,3 +261,12 @@ func (l *nvcdilib) getCudaVersion() (string, error) {
}
return version, nil
}
func (l *nvcdilib) getCudaVersionNvsandboxutils() (string, error) {
// Sandboxutils initialization should happen before this function is called
version, ret := l.nvsandboxutilslib.GetDriverVersion()
if ret != nvsandboxutils.SUCCESS {
return "", fmt.Errorf("%v", ret)
}
return version, nil
}