From 9506bd9da087af1790070efca5a8d842e8d31c9c Mon Sep 17 00:00:00 2001
From: Evan Lezar
Date: Thu, 23 Mar 2023 11:50:11 +0200
Subject: [PATCH] Fix generation of management CDI spec in containers

Since we relied on finding libcuda.so in the LDCache to determine both
the CUDA version and the expected directory for the driver libraries, the
generation of the management CDI specifications fails in containers where
the LDCache has not been updated.

This change falls back to searching a set of predefined paths instead when
the lookup of libcuda.so in the cache fails.

Signed-off-by: Evan Lezar
---
 CHANGELOG.md             |  1 +
 pkg/nvcdi/driver-nvml.go | 72 ++++++++++++++++++++++++++++++----------
 pkg/nvcdi/lib.go         | 23 +++++++++++++
 pkg/nvcdi/management.go  | 43 ++++++++++++++----------
 4 files changed, 105 insertions(+), 34 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index f7c765de..b9b52b48 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## v1.13.0-rc.3
 
+* Fix the generation of CDI specifications for management containers when the driver libraries are not in the LDCache.
 * Prefer /run over /var/run when locating nvidia-persistenced and nvidia-fabricmanager sockets.
 * Only initialize NVML for modes that require it when runing `nvidia-ctk cdi generate`
 * [libnvidia-container] Fix segmentation fault when RPC initialization fails.
diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go
index a9901590..f623ae30 100644
--- a/pkg/nvcdi/driver-nvml.go
+++ b/pkg/nvcdi/driver-nvml.go
@@ -22,7 +22,6 @@ import (
 	"strings"
 
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
-	"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
 	"github.com/sirupsen/logrus"
 	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
@@ -136,26 +135,25 @@ func NewDriverBinariesDiscoverer(logger *logrus.Logger, driverRoot string) disco
 func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) {
 	logger.Infof("Using driver version %v", version)
 
-	cache, err := ldcache.New(logger, driverRoot)
+	l := cudaLocator{
+		logger:     logger,
+		driverRoot: driverRoot,
+	}
+	libCudaPaths, err := l.Locate("libcuda.so." + version)
 	if err != nil {
-		return nil, fmt.Errorf("failed to load ldcache: %v", err)
+		return nil, fmt.Errorf("failed to locate libcuda.so.%v: %v", version, err)
 	}
+	libRoot := filepath.Dir(libCudaPaths[0]) // directory of the first libcuda.so.<version> match
 
-	libs32, libs64 := cache.List()
+	libraries := lookup.NewFileLocator(
+		lookup.WithLogger(logger),
+		lookup.WithSearchPaths(libRoot),
+		lookup.WithOptional(true), // presumably: finding no match is not an error - confirm in the lookup package
+	)
 
-	var libs []string
-	for _, l := range libs64 {
-		if strings.HasSuffix(l, version) {
-			logger.Infof("found 64-bit driver lib: %v", l)
-			libs = append(libs, l)
-		}
-	}
-
-	for _, l := range libs32 {
-		if strings.HasSuffix(l, version) {
-			logger.Infof("found 32-bit driver lib: %v", l)
-			libs = append(libs, l)
-		}
+	libs, err := libraries.Locate("*.so."
+ version)
+	if err != nil {
+		return nil, fmt.Errorf("failed to locate libraries for driver version %v: %v", version, err)
 	}
 
 	if driverRoot == "/" || driverRoot == "" {
@@ -169,3 +167,52 @@ func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([
 
 	return relative, nil
 }
+
+// cudaLocator locates libcuda.so libraries relative to driverRoot, preferring
+// entries from the LDCache and falling back to a fixed set of library
+// directories.
+type cudaLocator struct {
+	logger     *logrus.Logger
+	driverRoot string
+}
+
+// Locate returns the path of a libcuda.so file whose basename matches pattern
+// (for example libcuda.so.RMVERSION).
+//
+// The libcuda.so candidates from the LDCache are checked first. If the LDCache
+// locator cannot be created, its lookup fails, or no candidate matches, the
+// predefined library directories are searched instead.
+func (l *cudaLocator) Locate(pattern string) ([]string, error) {
+	ldcacheLocator, err := lookup.NewLibraryLocator(
+		l.logger,
+		l.driverRoot,
+	)
+	if err != nil {
+		// The locator returned alongside an error must not be used (it may
+		// be nil); skip the LDCache lookup and fall through to the
+		// predefined library paths.
+		l.logger.Debugf("Failed to create LDCache locator: %v", err)
+	} else if candidates, err := ldcacheLocator.Locate("libcuda.so"); err == nil {
+		for _, c := range candidates {
+			if match, err := filepath.Match(pattern, filepath.Base(c)); err != nil || !match {
+				l.logger.Debugf("Skipping non-matching candidate %v: %v", c, err)
+				continue
+			}
+			return []string{c}, nil
+		}
+	}
+	l.logger.Debugf("Could not locate %q in LDCache: Checking predefined library paths.", pattern)
+
+	pathLocator := lookup.NewFileLocator(
+		lookup.WithLogger(l.logger),
+		lookup.WithRoot(l.driverRoot),
+		lookup.WithSearchPaths(
+			"/usr/lib64",
+			"/usr/lib/x86_64-linux-gnu",
+			"/usr/lib/aarch64-linux-gnu",
+		),
+		lookup.WithCount(1),
+	)
+
+	return pathLocator.Locate(pattern)
+}
diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go
index 5027870a..646b06bd 100644
--- a/pkg/nvcdi/lib.go
+++ b/pkg/nvcdi/lib.go
@@ -17,6 +17,8 @@
 package nvcdi
 
 import (
+	"fmt"
+
 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
 	"github.com/sirupsen/logrus"
 	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
@@ -151,3 +153,24 @@ func (l *nvcdilib) resolveMode() (rmode string) {
 
 	return ModeNvml
 }
+
+// getCudaVersion returns the driver version reported by NVML (SystemGetDriverVersion).
+func (l *nvcdilib) getCudaVersion() (string, error) {
+	if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
+		return "", fmt.Errorf("nvml not detected: %v", reason)
+	}
+	if l.nvmllib == nil {
+		return "", fmt.Errorf("nvml library not initialized")
+	}
+	r := l.nvmllib.Init()
+	if r != nvml.SUCCESS {
+		return "", fmt.Errorf("failed to initialize nvml: %v", r)
+	}
+	defer l.nvmllib.Shutdown() // pairs with the successful Init above; runs once the version has been queried
+
+	version, r := l.nvmllib.SystemGetDriverVersion() // the value returned to the caller is NVML's driver version
+	if r != nvml.SUCCESS {
+		return "", fmt.Errorf("failed to get driver version: %v", r)
+	}
+	return version, nil
+}
diff --git a/pkg/nvcdi/management.go b/pkg/nvcdi/management.go
index 484393a0..305023ff 100644
--- a/pkg/nvcdi/management.go
+++ b/pkg/nvcdi/management.go
@@ -23,7 +23,6 @@ import (
 
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
-	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
 	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
 	"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
 	"github.com/container-orchestrated-devices/container-device-interface/specs-go"
@@ -60,23 +59,9 @@ func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
 
 // GetCommonEdits returns the common edits for use in managementlib containers.
 func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
-	locator, err := lookup.NewLibraryLocator(
-		m.logger,
-		m.driverRoot,
-	)
+	version, err := m.getCudaVersion()
 	if err != nil {
-		return nil, fmt.Errorf("failed to create library locator: %v", err)
-	}
-
-	candidates, err := locator.Locate("libcuda.so")
-	if err != nil {
-		return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
-	}
-	libcudaPath := candidates[0]
-
-	version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.")
-	if version == "" {
-		return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
+		return nil, fmt.Errorf("failed to get CUDA version: %v", err)
 	}
 
 	driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
@@ -92,6 +77,30 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
 	return edits, nil
 }
 
+// getCudaVersion returns the version from the nvcdilib lookup, falling back to the suffix of a located libcuda.so.*.*.* file.
+func (m *managementlib) getCudaVersion() (string, error) {
+	version, err := (*nvcdilib)(m).getCudaVersion() // convert to nvcdilib to reuse its getCudaVersion
+	if err == nil {
+		return version, nil
+	}
+
+	l := cudaLocator{
+		logger:     m.logger,
+		driverRoot: m.driverRoot,
+	}
+
+	libCudaPaths, err := l.Locate("libcuda.so.*.*.*") // fallback: only reached when the lookup above returned an error
+	if err != nil {
+		return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
+	}
+
+	libCudaPath := libCudaPaths[0]
+
+	version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.") // keep only what follows "libcuda.so." in the file name
+
+	return version, nil
+}
+
 type managementDiscoverer struct {
 	discover.Discover
 }