mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-04-21 22:55:21 +00:00
Merge branch 'CNT-4052/fix-arm-management-containers' into 'main'
Fix generation of management CDI spec in containers See merge request nvidia/container-toolkit/container-toolkit!354
This commit is contained in:
commit
df618d3cba
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
## v1.13.0-rc.3
|
## v1.13.0-rc.3
|
||||||
|
|
||||||
|
* Fix the generation of CDI specifications for management containers when the driver libraries are not in the LDCache.
|
||||||
* Prefer /run over /var/run when locating nvidia-persistenced and nvidia-fabricmanager sockets.
|
* Prefer /run over /var/run when locating nvidia-persistenced and nvidia-fabricmanager sockets.
|
||||||
* Only initialize NVML for modes that require it when running `nvidia-ctk cdi generate`
|
* Only initialize NVML for modes that require it when running `nvidia-ctk cdi generate`
|
||||||
* [libnvidia-container] Fix segmentation fault when RPC initialization fails.
|
* [libnvidia-container] Fix segmentation fault when RPC initialization fails.
|
||||||
|
@ -22,7 +22,6 @@ import (
|
|||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||||
@ -136,26 +135,25 @@ func NewDriverBinariesDiscoverer(logger *logrus.Logger, driverRoot string) disco
|
|||||||
func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) {
|
func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) {
|
||||||
logger.Infof("Using driver version %v", version)
|
logger.Infof("Using driver version %v", version)
|
||||||
|
|
||||||
cache, err := ldcache.New(logger, driverRoot)
|
l := cudaLocator{
|
||||||
|
logger: logger,
|
||||||
|
driverRoot: driverRoot,
|
||||||
|
}
|
||||||
|
libCudaPaths, err := l.Locate("libcuda.so." + version)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to load ldcache: %v", err)
|
return nil, fmt.Errorf("failed to locate libcuda.so.%v: %v", version, err)
|
||||||
}
|
}
|
||||||
|
libRoot := filepath.Dir(libCudaPaths[0])
|
||||||
|
|
||||||
libs32, libs64 := cache.List()
|
libraries := lookup.NewFileLocator(
|
||||||
|
lookup.WithLogger(logger),
|
||||||
|
lookup.WithSearchPaths(libRoot),
|
||||||
|
lookup.WithOptional(true),
|
||||||
|
)
|
||||||
|
|
||||||
var libs []string
|
libs, err := libraries.Locate("*.so." + version)
|
||||||
for _, l := range libs64 {
|
if err != nil {
|
||||||
if strings.HasSuffix(l, version) {
|
return nil, fmt.Errorf("failed to locate libraries for driver version %v: %v", version, err)
|
||||||
logger.Infof("found 64-bit driver lib: %v", l)
|
|
||||||
libs = append(libs, l)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, l := range libs32 {
|
|
||||||
if strings.HasSuffix(l, version) {
|
|
||||||
logger.Infof("found 32-bit driver lib: %v", l)
|
|
||||||
libs = append(libs, l)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if driverRoot == "/" || driverRoot == "" {
|
if driverRoot == "/" || driverRoot == "" {
|
||||||
@ -169,3 +167,43 @@ func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([
|
|||||||
|
|
||||||
return relative, nil
|
return relative, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type cudaLocator struct {
|
||||||
|
logger *logrus.Logger
|
||||||
|
driverRoot string
|
||||||
|
}
|
||||||
|
|
||||||
|
// Locate returns the path to the libcuda.so.RMVERSION file.
|
||||||
|
func (l *cudaLocator) Locate(pattern string) ([]string, error) {
|
||||||
|
ldcacheLocator, err := lookup.NewLibraryLocator(
|
||||||
|
l.logger,
|
||||||
|
l.driverRoot,
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
l.logger.Debugf("Failed to create LDCache locator: %v", err)
|
||||||
|
}
|
||||||
|
candidates, err := ldcacheLocator.Locate("libcuda.so")
|
||||||
|
if err == nil {
|
||||||
|
for _, c := range candidates {
|
||||||
|
if match, err := filepath.Match(pattern, filepath.Base(c)); err != nil || !match {
|
||||||
|
l.logger.Debugf("Skipping non-matching candidate %v: %v", c, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
return []string{c}, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
l.logger.Debugf("Could not locate %q in LDCache: Checking predefined library paths.", pattern)
|
||||||
|
|
||||||
|
pathLocator := lookup.NewFileLocator(
|
||||||
|
lookup.WithLogger(l.logger),
|
||||||
|
lookup.WithRoot(l.driverRoot),
|
||||||
|
lookup.WithSearchPaths(
|
||||||
|
"/usr/lib64",
|
||||||
|
"/usr/lib/x86_64-linux-gnu",
|
||||||
|
"/usr/lib/aarch64-linux-gnu",
|
||||||
|
),
|
||||||
|
lookup.WithCount(1),
|
||||||
|
)
|
||||||
|
|
||||||
|
return pathLocator.Locate(pattern)
|
||||||
|
}
|
||||||
|
@ -17,6 +17,8 @@
|
|||||||
package nvcdi
|
package nvcdi
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
||||||
"github.com/sirupsen/logrus"
|
"github.com/sirupsen/logrus"
|
||||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||||
@ -151,3 +153,24 @@ func (l *nvcdilib) resolveMode() (rmode string) {
|
|||||||
|
|
||||||
return ModeNvml
|
return ModeNvml
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getCudaVersion returns the CUDA version of the current system.
|
||||||
|
func (l *nvcdilib) getCudaVersion() (string, error) {
|
||||||
|
if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
|
||||||
|
return "", fmt.Errorf("nvml not detected: %v", reason)
|
||||||
|
}
|
||||||
|
if l.nvmllib == nil {
|
||||||
|
return "", fmt.Errorf("nvml library not initialized")
|
||||||
|
}
|
||||||
|
r := l.nvmllib.Init()
|
||||||
|
if r != nvml.SUCCESS {
|
||||||
|
return "", fmt.Errorf("failed to initialize nvml: %v", r)
|
||||||
|
}
|
||||||
|
defer l.nvmllib.Shutdown()
|
||||||
|
|
||||||
|
version, r := l.nvmllib.SystemGetDriverVersion()
|
||||||
|
if r != nvml.SUCCESS {
|
||||||
|
return "", fmt.Errorf("failed to get driver version: %v", r)
|
||||||
|
}
|
||||||
|
return version, nil
|
||||||
|
}
|
||||||
|
@ -23,7 +23,6 @@ import (
|
|||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
||||||
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||||
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
|
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
|
||||||
@ -60,23 +59,9 @@ func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
|
|||||||
|
|
||||||
// GetCommonEdits returns the common edits for use in managementlib containers.
|
// GetCommonEdits returns the common edits for use in managementlib containers.
|
||||||
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
||||||
locator, err := lookup.NewLibraryLocator(
|
version, err := m.getCudaVersion()
|
||||||
m.logger,
|
|
||||||
m.driverRoot,
|
|
||||||
)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to create library locator: %v", err)
|
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
|
||||||
}
|
|
||||||
|
|
||||||
candidates, err := locator.Locate("libcuda.so")
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to locate libcuda.so: %v", err)
|
|
||||||
}
|
|
||||||
libcudaPath := candidates[0]
|
|
||||||
|
|
||||||
version := strings.TrimPrefix(filepath.Base(libcudaPath), "libcuda.so.")
|
|
||||||
if version == "" {
|
|
||||||
return nil, fmt.Errorf("failed to determine libcuda.so version from path: %q", libcudaPath)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
|
driver, err := newDriverVersionDiscoverer(m.logger, m.driverRoot, m.nvidiaCTKPath, version)
|
||||||
@ -92,6 +77,30 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
|||||||
return edits, nil
|
return edits, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getCudaVersion returns the CUDA version for use in managementlib containers.
|
||||||
|
func (m *managementlib) getCudaVersion() (string, error) {
|
||||||
|
version, err := (*nvcdilib)(m).getCudaVersion()
|
||||||
|
if err == nil {
|
||||||
|
return version, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
l := cudaLocator{
|
||||||
|
logger: m.logger,
|
||||||
|
driverRoot: m.driverRoot,
|
||||||
|
}
|
||||||
|
|
||||||
|
libCudaPaths, err := l.Locate("libcuda.so.*.*.*")
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
libCudaPath := libCudaPaths[0]
|
||||||
|
|
||||||
|
version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")
|
||||||
|
|
||||||
|
return version, nil
|
||||||
|
}
|
||||||
|
|
||||||
type managementDiscoverer struct {
|
type managementDiscoverer struct {
|
||||||
discover.Discover
|
discover.Discover
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user