diff --git a/CHANGELOG.md b/CHANGELOG.md index f7c765de..b9b52b48 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## v1.13.0-rc.3 +* Fix the generation of CDI specifications for management containers when the driver libraries are not in the LDCache. * Prefer /run over /var/run when locating nvidia-persistenced and nvidia-fabricmanager sockets. * Only initialize NVML for modes that require it when runing `nvidia-ctk cdi generate` * [libnvidia-container] Fix segmentation fault when RPC initialization fails. diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go index a9901590..f623ae30 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/pkg/nvcdi/driver-nvml.go @@ -22,7 +22,6 @@ import ( "strings" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" - "github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" "github.com/sirupsen/logrus" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" @@ -136,26 +135,25 @@ func NewDriverBinariesDiscoverer(logger *logrus.Logger, driverRoot string) disco func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) { logger.Infof("Using driver version %v", version) - cache, err := ldcache.New(logger, driverRoot) + l := cudaLocator{ + logger: logger, + driverRoot: driverRoot, + } + libCudaPaths, err := l.Locate("libcuda.so." + version) if err != nil { - return nil, fmt.Errorf("failed to load ldcache: %v", err) + return nil, fmt.Errorf("failed to locate libcuda.so.%v: %v", version, err) } + libRoot := filepath.Dir(libCudaPaths[0]) - libs32, libs64 := cache.List() + libraries := lookup.NewFileLocator( + lookup.WithLogger(logger), + lookup.WithSearchPaths(libRoot), + lookup.WithOptional(true), + ) - var libs []string - for _, l := range libs64 { - if strings.HasSuffix(l, version) { - logger.Infof("found 64-bit driver lib: %v", l) - libs = append(libs, l) - } - } - - for _, l := range libs32 { - if strings.HasSuffix(l, version) { - logger.Infof("found 32-bit driver lib: %v", l) - libs = append(libs, l) - } + libs, err := libraries.Locate("*.so." + version) + if err != nil { + return nil, fmt.Errorf("failed to locate libraries for driver version %v: %v", version, err) } if driverRoot == "/" || driverRoot == "" { @@ -169,3 +167,43 @@ func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([ return relative, nil } + +type cudaLocator struct { + logger *logrus.Logger + driverRoot string +} + +// Locate returns the path to the libcuda.so.RMVERSION file. +func (l *cudaLocator) Locate(pattern string) ([]string, error) { + ldcacheLocator, err := lookup.NewLibraryLocator( + l.logger, + l.driverRoot, + ) + if err != nil { + l.logger.Debugf("Failed to create LDCache locator: %v", err) + } + candidates, err := ldcacheLocator.Locate("libcuda.so") + if err == nil { + for _, c := range candidates { + if match, err := filepath.Match(pattern, filepath.Base(c)); err != nil || !match { + l.logger.Debugf("Skipping non-matching candidate %v: %v", c, err) + continue + } + return []string{c}, nil + } + } + l.logger.Debugf("Could not locate %q in LDCache: Checking predefined library paths.", pattern) + + pathLocator := lookup.NewFileLocator( + lookup.WithLogger(l.logger), + lookup.WithRoot(l.driverRoot), + lookup.WithSearchPaths( + "/usr/lib64", + "/usr/lib/x86_64-linux-gnu", + "/usr/lib/aarch64-linux-gnu", + ), + lookup.WithCount(1), + ) + + return pathLocator.Locate(pattern) +} diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index 5027870a..646b06bd 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -17,6 +17,8 @@ package nvcdi import ( + "fmt" + "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" "github.com/sirupsen/logrus" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" @@ -151,3 +153,24 @@ func (l *nvcdilib) resolveMode() (rmode string) { return ModeNvml } + +// getCudaVersion returns the CUDA version of the current system. +func (l *nvcdilib) getCudaVersion() (string, error) { + if hasNVML, reason := l.infolib.HasNvml(); !hasNVML { + return "", fmt.Errorf("nvml not detected: %v", reason) + } + if l.nvmllib == nil { + return "", fmt.Errorf("nvml library not initialized") + } + r := l.nvmllib.Init() + if r != nvml.SUCCESS { + return "", fmt.Errorf("failed to initialize nvml: %v", r) + } + defer l.nvmllib.Shutdown() + + version, r := l.nvmllib.SystemGetDriverVersion() + if r != nvml.SUCCESS { + return "", fmt.Errorf("failed to get driver version: %v", r) + } + return version, nil +} diff --git a/pkg/nvcdi/management.go b/pkg/nvcdi/management.go index 484393a0..305023ff 100644 --- a/pkg/nvcdi/management.go +++ b/pkg/nvcdi/management.go @@ -23,7 +23,6 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits" - "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/container-orchestrated-devices/container-device-interface/specs-go" @@ -60,23 +59,9 @@ func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) { // GetCommonEdits returns the common edits for use in managementlib containers. func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { - locator, err := lookup.NewLibraryLocator( - m.logger, - m.driverRoot, - ) + version, err := m.getCudaVersion() if err != nil { - return nil, fmt.Errorf("failed to create library locator: %v", err) - } - - candidates, err := locator.Locate("libcuda.so") - if err != nil { - return nil, fmt.Errorf("failed to locate libcuda.so: %v", err) - } - libcudaPath := candidates[0] - - version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.") - if version == "" { - return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath) + return nil, fmt.Errorf("failed to get CUDA version: %v", err) } driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version) @@ -92,6 +77,30 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { return edits, nil } +// getCudaVersion returns the CUDA version for use in managementlib containers. +func (m *managementlib) getCudaVersion() (string, error) { + version, err := (*nvcdilib)(m).getCudaVersion() + if err == nil { + return version, nil + } + + l := cudaLocator{ + logger: m.logger, + driverRoot: m.driverRoot, + } + + libCudaPaths, err := l.Locate("libcuda.so.*.*.*") + if err != nil { + return "", fmt.Errorf("failed to locate libcuda.so: %v", err) + } + + libCudaPath := libCudaPaths[0] + + version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.") + + return version, nil +} + type managementDiscoverer struct { discover.Discover }