Fix generation of management CDI spec in containers

Since we relied on finding libcuda.so in the LDCache to determine both the CUDA
version and the expected directory for the driver libraries, the generation of the
management CDI specifications fails in containers where the LDCache has not been updated.

This change falls back to searching a set of predefined paths instead when the lookup of
libcuda.so in the cache fails.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2023-03-23 11:50:11 +02:00
parent 5e0684e99d
commit 9506bd9da0
4 changed files with 105 additions and 34 deletions

View File

@ -2,6 +2,7 @@
## v1.13.0-rc.3 ## v1.13.0-rc.3
* Fix the generation of CDI specifications for management containers when the driver libraries are not in the LDCache.
* Prefer /run over /var/run when locating nvidia-persistenced and nvidia-fabricmanager sockets. * Prefer /run over /var/run when locating nvidia-persistenced and nvidia-fabricmanager sockets.
* Only initialize NVML for modes that require it when runing `nvidia-ctk cdi generate` * Only initialize NVML for modes that require it when runing `nvidia-ctk cdi generate`
* [libnvidia-container] Fix segmentation fault when RPC initialization fails. * [libnvidia-container] Fix segmentation fault when RPC initialization fails.

View File

@ -22,7 +22,6 @@ import (
"strings" "strings"
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup" "github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
@ -136,26 +135,25 @@ func NewDriverBinariesDiscoverer(logger *logrus.Logger, driverRoot string) disco
func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) { func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) {
logger.Infof("Using driver version %v", version) logger.Infof("Using driver version %v", version)
cache, err := ldcache.New(logger, driverRoot) l := cudaLocator{
logger: logger,
driverRoot: driverRoot,
}
libCudaPaths, err := l.Locate("libcuda.so." + version)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to load ldcache: %v", err) return nil, fmt.Errorf("failed to locate libcuda.so.%v: %v", version, err)
} }
libRoot := filepath.Dir(libCudaPaths[0])
libs32, libs64 := cache.List() libraries := lookup.NewFileLocator(
lookup.WithLogger(logger),
lookup.WithSearchPaths(libRoot),
lookup.WithOptional(true),
)
var libs []string libs, err := libraries.Locate("*.so." + version)
for _, l := range libs64 { if err != nil {
if strings.HasSuffix(l, version) { return nil, fmt.Errorf("failed to locate libraries for driver version %v: %v", version, err)
logger.Infof("found 64-bit driver lib: %v", l)
libs = append(libs, l)
}
}
for _, l := range libs32 {
if strings.HasSuffix(l, version) {
logger.Infof("found 32-bit driver lib: %v", l)
libs = append(libs, l)
}
} }
if driverRoot == "/" || driverRoot == "" { if driverRoot == "/" || driverRoot == "" {
@ -169,3 +167,43 @@ func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([
return relative, nil return relative, nil
} }
type cudaLocator struct {
logger *logrus.Logger
driverRoot string
}
// Locate returns the path to the libcuda.so.RMVERSION file.
func (l *cudaLocator) Locate(pattern string) ([]string, error) {
ldcacheLocator, err := lookup.NewLibraryLocator(
l.logger,
l.driverRoot,
)
if err != nil {
l.logger.Debugf("Failed to create LDCache locator: %v", err)
}
candidates, err := ldcacheLocator.Locate("libcuda.so")
if err == nil {
for _, c := range candidates {
if match, err := filepath.Match(pattern, filepath.Base(c)); err != nil || !match {
l.logger.Debugf("Skipping non-matching candidate %v: %v", c, err)
continue
}
return []string{c}, nil
}
}
l.logger.Debugf("Could not locate %q in LDCache: Checking predefined library paths.", pattern)
pathLocator := lookup.NewFileLocator(
lookup.WithLogger(l.logger),
lookup.WithRoot(l.driverRoot),
lookup.WithSearchPaths(
"/usr/lib64",
"/usr/lib/x86_64-linux-gnu",
"/usr/lib/aarch64-linux-gnu",
),
lookup.WithCount(1),
)
return pathLocator.Locate(pattern)
}

View File

@ -17,6 +17,8 @@
package nvcdi package nvcdi
import ( import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/sirupsen/logrus" "github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
@ -151,3 +153,24 @@ func (l *nvcdilib) resolveMode() (rmode string) {
return ModeNvml return ModeNvml
} }
// getCudaVersion returns the CUDA version of the current system.
func (l *nvcdilib) getCudaVersion() (string, error) {
if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
return "", fmt.Errorf("nvml not detected: %v", reason)
}
if l.nvmllib == nil {
return "", fmt.Errorf("nvml library not initialized")
}
r := l.nvmllib.Init()
if r != nvml.SUCCESS {
return "", fmt.Errorf("failed to initialize nvml: %v", r)
}
defer l.nvmllib.Shutdown()
version, r := l.nvmllib.SystemGetDriverVersion()
if r != nvml.SUCCESS {
return "", fmt.Errorf("failed to get driver version: %v", r)
}
return version, nil
}

View File

@ -23,7 +23,6 @@ import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover" "github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits" "github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec" "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"github.com/container-orchestrated-devices/container-device-interface/specs-go" "github.com/container-orchestrated-devices/container-device-interface/specs-go"
@ -60,23 +59,9 @@ func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
// GetCommonEdits returns the common edits for use in managementlib containers. // GetCommonEdits returns the common edits for use in managementlib containers.
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
locator, err := lookup.NewLibraryLocator( version, err := m.getCudaVersion()
m.logger,
m.driverRoot,
)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to create library locator: %v", err) return nil, fmt.Errorf("failed to get CUDA version: %v", err)
}
candidates, err := locator.Locate("libcuda.so")
if err != nil {
return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
}
libcudaPath := candidates[0]
version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.")
if version == "" {
return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
} }
driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version) driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
@ -92,6 +77,30 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
return edits, nil return edits, nil
} }
// getCudaVersion returns the CUDA version for use in managementlib containers.
func (m *managementlib) getCudaVersion() (string, error) {
version, err := (*nvcdilib)(m).getCudaVersion()
if err == nil {
return version, nil
}
l := cudaLocator{
logger: m.logger,
driverRoot: m.driverRoot,
}
libCudaPaths, err := l.Locate("libcuda.so.*.*.*")
if err != nil {
return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
}
libCudaPath := libCudaPaths[0]
version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")
return version, nil
}
type managementDiscoverer struct { type managementDiscoverer struct {
discover.Discover discover.Discover
} }