Fix generation of management CDI spec in containers

Since we relied on finding libcuda.so in the LDCache to determine both the CUDA
version and the expected directory for the driver libraries, the generation of the
management CDI specifications fails in containers where the LDCache has not been updated.

This change falls back to searching a set of predefined paths instead when the lookup of
libcuda.so in the cache fails.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2023-03-23 11:50:11 +02:00
parent 5e0684e99d
commit 9506bd9da0
4 changed files with 105 additions and 34 deletions

View File

@ -2,6 +2,7 @@
## v1.13.0-rc.3
* Fix the generation of CDI specifications for management containers when the driver libraries are not in the LDCache.
* Prefer /run over /var/run when locating nvidia-persistenced and nvidia-fabricmanager sockets.
* Only initialize NVML for modes that require it when runing `nvidia-ctk cdi generate`
* [libnvidia-container] Fix segmentation fault when RPC initialization fails.

View File

@ -22,7 +22,6 @@ import (
"strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
@ -136,26 +135,25 @@ func NewDriverBinariesDiscoverer(logger *logrus.Logger, driverRoot string) disco
func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) {
logger.Infof("Using driver version %v", version)
cache, err := ldcache.New(logger, driverRoot)
l := cudaLocator{
logger: logger,
driverRoot: driverRoot,
}
libCudaPaths, err := l.Locate("libcuda.so." + version)
if err != nil {
return nil, fmt.Errorf("failed to load ldcache: %v", err)
return nil, fmt.Errorf("failed to locate libcuda.so.%v: %v", version, err)
}
libRoot := filepath.Dir(libCudaPaths[0])
libs32, libs64 := cache.List()
libraries := lookup.NewFileLocator(
lookup.WithLogger(logger),
lookup.WithSearchPaths(libRoot),
lookup.WithOptional(true),
)
var libs []string
for _, l := range libs64 {
if strings.HasSuffix(l, version) {
logger.Infof("found 64-bit driver lib: %v", l)
libs = append(libs, l)
}
}
for _, l := range libs32 {
if strings.HasSuffix(l, version) {
logger.Infof("found 32-bit driver lib: %v", l)
libs = append(libs, l)
}
libs, err := libraries.Locate("*.so." + version)
if err != nil {
return nil, fmt.Errorf("failed to locate libraries for driver version %v: %v", version, err)
}
if driverRoot == "/" || driverRoot == "" {
@ -169,3 +167,43 @@ func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([
return relative, nil
}
type cudaLocator struct {
logger *logrus.Logger
driverRoot string
}
// Locate returns the path to the libcuda.so.RMVERSION file.
func (l *cudaLocator) Locate(pattern string) ([]string, error) {
ldcacheLocator, err := lookup.NewLibraryLocator(
l.logger,
l.driverRoot,
)
if err != nil {
l.logger.Debugf("Failed to create LDCache locator: %v", err)
}
candidates, err := ldcacheLocator.Locate("libcuda.so")
if err == nil {
for _, c := range candidates {
if match, err := filepath.Match(pattern, filepath.Base(c)); err != nil || !match {
l.logger.Debugf("Skipping non-matching candidate %v: %v", c, err)
continue
}
return []string{c}, nil
}
}
l.logger.Debugf("Could not locate %q in LDCache: Checking predefined library paths.", pattern)
pathLocator := lookup.NewFileLocator(
lookup.WithLogger(l.logger),
lookup.WithRoot(l.driverRoot),
lookup.WithSearchPaths(
"/usr/lib64",
"/usr/lib/x86_64-linux-gnu",
"/usr/lib/aarch64-linux-gnu",
),
lookup.WithCount(1),
)
return pathLocator.Locate(pattern)
}

View File

@ -17,6 +17,8 @@
package nvcdi
import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
@ -151,3 +153,24 @@ func (l *nvcdilib) resolveMode() (rmode string) {
return ModeNvml
}
// getCudaVersion returns the CUDA version of the current system.
func (l *nvcdilib) getCudaVersion() (string, error) {
if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
return "", fmt.Errorf("nvml not detected: %v", reason)
}
if l.nvmllib == nil {
return "", fmt.Errorf("nvml library not initialized")
}
r := l.nvmllib.Init()
if r != nvml.SUCCESS {
return "", fmt.Errorf("failed to initialize nvml: %v", r)
}
defer l.nvmllib.Shutdown()
version, r := l.nvmllib.SystemGetDriverVersion()
if r != nvml.SUCCESS {
return "", fmt.Errorf("failed to get driver version: %v", r)
}
return version, nil
}

View File

@ -23,7 +23,6 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
@ -60,23 +59,9 @@ func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
// GetCommonEdits returns the common edits for use in managementlib containers.
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
locator, err := lookup.NewLibraryLocator(
m.logger,
m.driverRoot,
)
version, err := m.getCudaVersion()
if err != nil {
return nil, fmt.Errorf("failed to create library locator: %v", err)
}
candidates, err := locator.Locate("libcuda.so")
if err != nil {
return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
}
libcudaPath := candidates[0]
version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.")
if version == "" {
return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
}
driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
@ -92,6 +77,30 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
return edits, nil
}
// getCudaVersion returns the CUDA version for use in managementlib containers.
func (m *managementlib) getCudaVersion() (string, error) {
version, err := (*nvcdilib)(m).getCudaVersion()
if err == nil {
return version, nil
}
l := cudaLocator{
logger: m.logger,
driverRoot: m.driverRoot,
}
libCudaPaths, err := l.Locate("libcuda.so.*.*.*")
if err != nil {
return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
}
libCudaPath := libCudaPaths[0]
version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")
return version, nil
}
type managementDiscoverer struct {
discover.Discover
}