mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-22 00:08:11 +00:00
Fix generation of management CDI spec in containers
Since we relied on finding libcuda.so in the LDCache to determine both the CUDA version and the expected directory for the driver libraries, the generation of the management CDI specifications fails in containers where the LDCache has not been updated. This change falls back to searching a set of predefined paths instead when the lookup of libcuda.so in the cache fails. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
parent
5e0684e99d
commit
9506bd9da0
@ -2,6 +2,7 @@
|
||||
|
||||
## v1.13.0-rc.3
|
||||
|
||||
* Fix the generation of CDI specifications for management containers when the driver libraries are not in the LDCache.
|
||||
* Prefer /run over /var/run when locating nvidia-persistenced and nvidia-fabricmanager sockets.
|
||||
* Only initialize NVML for modes that require it when running `nvidia-ctk cdi generate`.
|
||||
* [libnvidia-container] Fix segmentation fault when RPC initialization fails.
|
||||
|
@ -22,7 +22,6 @@ import (
|
||||
"strings"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||
"github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||
@ -136,26 +135,25 @@ func NewDriverBinariesDiscoverer(logger *logrus.Logger, driverRoot string) disco
|
||||
func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) {
|
||||
logger.Infof("Using driver version %v", version)
|
||||
|
||||
cache, err := ldcache.New(logger, driverRoot)
|
||||
l := cudaLocator{
|
||||
logger: logger,
|
||||
driverRoot: driverRoot,
|
||||
}
|
||||
libCudaPaths, err := l.Locate("libcuda.so." + version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load ldcache: %v", err)
|
||||
return nil, fmt.Errorf("failed to locate libcuda.so.%v: %v", version, err)
|
||||
}
|
||||
libRoot := filepath.Dir(libCudaPaths[0])
|
||||
|
||||
libs32, libs64 := cache.List()
|
||||
libraries := lookup.NewFileLocator(
|
||||
lookup.WithLogger(logger),
|
||||
lookup.WithSearchPaths(libRoot),
|
||||
lookup.WithOptional(true),
|
||||
)
|
||||
|
||||
var libs []string
|
||||
for _, l := range libs64 {
|
||||
if strings.HasSuffix(l, version) {
|
||||
logger.Infof("found 64-bit driver lib: %v", l)
|
||||
libs = append(libs, l)
|
||||
}
|
||||
}
|
||||
|
||||
for _, l := range libs32 {
|
||||
if strings.HasSuffix(l, version) {
|
||||
logger.Infof("found 32-bit driver lib: %v", l)
|
||||
libs = append(libs, l)
|
||||
}
|
||||
libs, err := libraries.Locate("*.so." + version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to locate libraries for driver version %v: %v", version, err)
|
||||
}
|
||||
|
||||
if driverRoot == "/" || driverRoot == "" {
|
||||
@ -169,3 +167,43 @@ func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([
|
||||
|
||||
return relative, nil
|
||||
}
|
||||
|
||||
type cudaLocator struct {
|
||||
logger *logrus.Logger
|
||||
driverRoot string
|
||||
}
|
||||
|
||||
// Locate returns the path to the libcuda.so.RMVERSION file.
|
||||
func (l *cudaLocator) Locate(pattern string) ([]string, error) {
|
||||
ldcacheLocator, err := lookup.NewLibraryLocator(
|
||||
l.logger,
|
||||
l.driverRoot,
|
||||
)
|
||||
if err != nil {
|
||||
l.logger.Debugf("Failed to create LDCache locator: %v", err)
|
||||
}
|
||||
candidates, err := ldcacheLocator.Locate("libcuda.so")
|
||||
if err == nil {
|
||||
for _, c := range candidates {
|
||||
if match, err := filepath.Match(pattern, filepath.Base(c)); err != nil || !match {
|
||||
l.logger.Debugf("Skipping non-matching candidate %v: %v", c, err)
|
||||
continue
|
||||
}
|
||||
return []string{c}, nil
|
||||
}
|
||||
}
|
||||
l.logger.Debugf("Could not locate %q in LDCache: Checking predefined library paths.", pattern)
|
||||
|
||||
pathLocator := lookup.NewFileLocator(
|
||||
lookup.WithLogger(l.logger),
|
||||
lookup.WithRoot(l.driverRoot),
|
||||
lookup.WithSearchPaths(
|
||||
"/usr/lib64",
|
||||
"/usr/lib/x86_64-linux-gnu",
|
||||
"/usr/lib/aarch64-linux-gnu",
|
||||
),
|
||||
lookup.WithCount(1),
|
||||
)
|
||||
|
||||
return pathLocator.Locate(pattern)
|
||||
}
|
||||
|
@ -17,6 +17,8 @@
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
||||
"github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||
@ -151,3 +153,24 @@ func (l *nvcdilib) resolveMode() (rmode string) {
|
||||
|
||||
return ModeNvml
|
||||
}
|
||||
|
||||
// getCudaVersion returns the CUDA version of the current system.
|
||||
func (l *nvcdilib) getCudaVersion() (string, error) {
|
||||
if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
|
||||
return "", fmt.Errorf("nvml not detected: %v", reason)
|
||||
}
|
||||
if l.nvmllib == nil {
|
||||
return "", fmt.Errorf("nvml library not initialized")
|
||||
}
|
||||
r := l.nvmllib.Init()
|
||||
if r != nvml.SUCCESS {
|
||||
return "", fmt.Errorf("failed to initialize nvml: %v", r)
|
||||
}
|
||||
defer l.nvmllib.Shutdown()
|
||||
|
||||
version, r := l.nvmllib.SystemGetDriverVersion()
|
||||
if r != nvml.SUCCESS {
|
||||
return "", fmt.Errorf("failed to get driver version: %v", r)
|
||||
}
|
||||
return version, nil
|
||||
}
|
||||
|
@ -23,7 +23,6 @@ import (
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
|
||||
@ -60,23 +59,9 @@ func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
|
||||
|
||||
// GetCommonEdits returns the common edits for use in managementlib containers.
|
||||
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
||||
locator, err := lookup.NewLibraryLocator(
|
||||
m.logger,
|
||||
m.driverRoot,
|
||||
)
|
||||
version, err := m.getCudaVersion()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create library locator: %v", err)
|
||||
}
|
||||
|
||||
candidates, err := locator.Locate("libcuda.so")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
|
||||
}
|
||||
libcudaPath := candidates[0]
|
||||
|
||||
version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.")
|
||||
if version == "" {
|
||||
return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
|
||||
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
|
||||
}
|
||||
|
||||
driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
|
||||
@ -92,6 +77,30 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
||||
return edits, nil
|
||||
}
|
||||
|
||||
// getCudaVersion returns the CUDA version for use in managementlib containers.
|
||||
func (m *managementlib) getCudaVersion() (string, error) {
|
||||
version, err := (*nvcdilib)(m).getCudaVersion()
|
||||
if err == nil {
|
||||
return version, nil
|
||||
}
|
||||
|
||||
l := cudaLocator{
|
||||
logger: m.logger,
|
||||
driverRoot: m.driverRoot,
|
||||
}
|
||||
|
||||
libCudaPaths, err := l.Locate("libcuda.so.*.*.*")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
|
||||
}
|
||||
|
||||
libCudaPath := libCudaPaths[0]
|
||||
|
||||
version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")
|
||||
|
||||
return version, nil
|
||||
}
|
||||
|
||||
// managementDiscoverer embeds a discover.Discover, so the embedded
// discoverer's methods are promoted to this type.
// NOTE(review): presumably selected methods are overridden for management
// containers; those methods are not visible in this hunk — confirm.
type managementDiscoverer struct {
|
||||
discover.Discover
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user