mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2024-11-22 08:18:32 +00:00
Merge branch 'CNT-4052/fix-arm-management-containers' into 'main'
Fix generation of management CDI spec in containers See merge request nvidia/container-toolkit/container-toolkit!354
This commit is contained in:
commit
df618d3cba
@ -2,6 +2,7 @@
|
||||
|
||||
## v1.13.0-rc.3
|
||||
|
||||
* Fix the generation of CDI specifications for management containers when the driver libraries are not in the LDCache.
|
||||
* Prefer /run over /var/run when locating nvidia-persistenced and nvidia-fabricmanager sockets.
|
||||
* Only initialize NVML for modes that require it when running `nvidia-ctk cdi generate`.
|
||||
* [libnvidia-container] Fix segmentation fault when RPC initialization fails.
|
||||
|
@ -22,7 +22,6 @@ import (
|
||||
"strings"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||
"github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||
@ -136,26 +135,25 @@ func NewDriverBinariesDiscoverer(logger *logrus.Logger, driverRoot string) disco
|
||||
func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) {
|
||||
logger.Infof("Using driver version %v", version)
|
||||
|
||||
cache, err := ldcache.New(logger, driverRoot)
|
||||
l := cudaLocator{
|
||||
logger: logger,
|
||||
driverRoot: driverRoot,
|
||||
}
|
||||
libCudaPaths, err := l.Locate("libcuda.so." + version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load ldcache: %v", err)
|
||||
return nil, fmt.Errorf("failed to locate libcuda.so.%v: %v", version, err)
|
||||
}
|
||||
libRoot := filepath.Dir(libCudaPaths[0])
|
||||
|
||||
libs32, libs64 := cache.List()
|
||||
libraries := lookup.NewFileLocator(
|
||||
lookup.WithLogger(logger),
|
||||
lookup.WithSearchPaths(libRoot),
|
||||
lookup.WithOptional(true),
|
||||
)
|
||||
|
||||
var libs []string
|
||||
for _, l := range libs64 {
|
||||
if strings.HasSuffix(l, version) {
|
||||
logger.Infof("found 64-bit driver lib: %v", l)
|
||||
libs = append(libs, l)
|
||||
}
|
||||
}
|
||||
|
||||
for _, l := range libs32 {
|
||||
if strings.HasSuffix(l, version) {
|
||||
logger.Infof("found 32-bit driver lib: %v", l)
|
||||
libs = append(libs, l)
|
||||
}
|
||||
libs, err := libraries.Locate("*.so." + version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to locate libraries for driver version %v: %v", version, err)
|
||||
}
|
||||
|
||||
if driverRoot == "/" || driverRoot == "" {
|
||||
@ -169,3 +167,43 @@ func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([
|
||||
|
||||
return relative, nil
|
||||
}
|
||||
|
||||
// cudaLocator locates libcuda.so libraries for a given driver root. The logger
// receives debug output produced during lookup, and driverRoot is the root that
// is handed to both the ldcache-based and the path-based locators.
type cudaLocator struct {
|
||||
logger *logrus.Logger
|
||||
driverRoot string
|
||||
}
|
||||
|
||||
// Locate returns the path to the libcuda.so.RMVERSION file.
|
||||
func (l *cudaLocator) Locate(pattern string) ([]string, error) {
|
||||
ldcacheLocator, err := lookup.NewLibraryLocator(
|
||||
l.logger,
|
||||
l.driverRoot,
|
||||
)
|
||||
if err != nil {
|
||||
l.logger.Debugf("Failed to create LDCache locator: %v", err)
|
||||
}
|
||||
candidates, err := ldcacheLocator.Locate("libcuda.so")
|
||||
if err == nil {
|
||||
for _, c := range candidates {
|
||||
if match, err := filepath.Match(pattern, filepath.Base(c)); err != nil || !match {
|
||||
l.logger.Debugf("Skipping non-matching candidate %v: %v", c, err)
|
||||
continue
|
||||
}
|
||||
return []string{c}, nil
|
||||
}
|
||||
}
|
||||
l.logger.Debugf("Could not locate %q in LDCache: Checking predefined library paths.", pattern)
|
||||
|
||||
pathLocator := lookup.NewFileLocator(
|
||||
lookup.WithLogger(l.logger),
|
||||
lookup.WithRoot(l.driverRoot),
|
||||
lookup.WithSearchPaths(
|
||||
"/usr/lib64",
|
||||
"/usr/lib/x86_64-linux-gnu",
|
||||
"/usr/lib/aarch64-linux-gnu",
|
||||
),
|
||||
lookup.WithCount(1),
|
||||
)
|
||||
|
||||
return pathLocator.Locate(pattern)
|
||||
}
|
||||
|
@ -17,6 +17,8 @@
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
||||
"github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||
@ -151,3 +153,24 @@ func (l *nvcdilib) resolveMode() (rmode string) {
|
||||
|
||||
return ModeNvml
|
||||
}
|
||||
|
||||
// getCudaVersion returns the CUDA version of the current system.
|
||||
func (l *nvcdilib) getCudaVersion() (string, error) {
|
||||
if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
|
||||
return "", fmt.Errorf("nvml not detected: %v", reason)
|
||||
}
|
||||
if l.nvmllib == nil {
|
||||
return "", fmt.Errorf("nvml library not initialized")
|
||||
}
|
||||
r := l.nvmllib.Init()
|
||||
if r != nvml.SUCCESS {
|
||||
return "", fmt.Errorf("failed to initialize nvml: %v", r)
|
||||
}
|
||||
defer l.nvmllib.Shutdown()
|
||||
|
||||
version, r := l.nvmllib.SystemGetDriverVersion()
|
||||
if r != nvml.SUCCESS {
|
||||
return "", fmt.Errorf("failed to get driver version: %v", r)
|
||||
}
|
||||
return version, nil
|
||||
}
|
||||
|
@ -23,7 +23,6 @@ import (
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
|
||||
@ -60,23 +59,9 @@ func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
|
||||
|
||||
// GetCommonEdits returns the common edits for use in managementlib containers.
|
||||
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
||||
locator, err := lookup.NewLibraryLocator(
|
||||
m.logger,
|
||||
m.driverRoot,
|
||||
)
|
||||
version, err := m.getCudaVersion()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create library locator: %v", err)
|
||||
}
|
||||
|
||||
candidates, err := locator.Locate("libcuda.so")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
|
||||
}
|
||||
libcudaPath := candidates[0]
|
||||
|
||||
version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.")
|
||||
if version == "" {
|
||||
return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
|
||||
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
|
||||
}
|
||||
|
||||
driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
|
||||
@ -92,6 +77,30 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
||||
return edits, nil
|
||||
}
|
||||
|
||||
// getCudaVersion returns the CUDA version for use in managementlib containers.
|
||||
func (m *managementlib) getCudaVersion() (string, error) {
|
||||
version, err := (*nvcdilib)(m).getCudaVersion()
|
||||
if err == nil {
|
||||
return version, nil
|
||||
}
|
||||
|
||||
l := cudaLocator{
|
||||
logger: m.logger,
|
||||
driverRoot: m.driverRoot,
|
||||
}
|
||||
|
||||
libCudaPaths, err := l.Locate("libcuda.so.*.*.*")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
|
||||
}
|
||||
|
||||
libCudaPath := libCudaPaths[0]
|
||||
|
||||
version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")
|
||||
|
||||
return version, nil
|
||||
}
|
||||
|
||||
// managementDiscoverer embeds a discover.Discover, so the embedded value's
// methods are promoted onto this type.
// NOTE(review): presumably used to adjust discovery results for management
// containers; confirm against the methods defined on this type elsewhere in the file.
type managementDiscoverer struct {
|
||||
discover.Discover
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user