2022-12-02 13:17:52 +00:00
|
|
|
/**
|
|
|
|
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
**/
|
|
|
|
|
|
|
|
package nvcdi
|
|
|
|
|
|
|
|
import (
|
2023-03-23 09:50:11 +00:00
|
|
|
"fmt"
|
|
|
|
|
2023-11-15 20:36:23 +00:00
|
|
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
|
|
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/info"
|
2024-04-18 12:50:00 +00:00
|
|
|
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
2024-03-04 14:08:33 +00:00
|
|
|
"tags.cncf.io/container-device-interface/pkg/cdi"
|
2023-12-01 01:10:10 +00:00
|
|
|
|
2024-10-14 13:06:06 +00:00
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
2023-03-22 12:27:43 +00:00
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
2023-11-21 15:08:16 +00:00
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
|
2024-09-24 17:05:11 +00:00
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvsandboxutils"
|
2023-06-22 13:49:27 +00:00
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv"
|
2023-02-20 12:29:52 +00:00
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
2023-03-17 09:47:31 +00:00
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform"
|
2022-12-02 13:17:52 +00:00
|
|
|
)
|
|
|
|
|
2023-02-20 12:29:52 +00:00
|
|
|
type wrapper struct {
|
|
|
|
Interface
|
2023-02-22 14:40:27 +00:00
|
|
|
|
|
|
|
vendor string
|
|
|
|
class string
|
2023-03-17 09:47:31 +00:00
|
|
|
|
|
|
|
mergedDeviceOptions []transform.MergedDeviceOption
|
2023-02-20 12:29:52 +00:00
|
|
|
}
|
|
|
|
|
2022-12-02 13:17:52 +00:00
|
|
|
type nvcdilib struct {
|
2023-07-18 10:02:37 +00:00
|
|
|
logger logger.Interface
|
|
|
|
nvmllib nvml.Interface
|
2024-09-24 17:05:11 +00:00
|
|
|
nvsandboxutilslib nvsandboxutils.Interface
|
2023-07-18 10:02:37 +00:00
|
|
|
mode string
|
|
|
|
devicelib device.Interface
|
2023-03-21 13:51:36 +00:00
|
|
|
deviceNamers DeviceNamers
|
2023-07-18 10:02:37 +00:00
|
|
|
driverRoot string
|
2023-11-14 15:57:37 +00:00
|
|
|
devRoot string
|
2024-04-24 08:47:45 +00:00
|
|
|
nvidiaCDIHookPath string
|
2023-12-15 00:46:00 +00:00
|
|
|
ldconfigPath string
|
2023-11-24 15:45:19 +00:00
|
|
|
configSearchPaths []string
|
2023-07-18 10:02:37 +00:00
|
|
|
librarySearchPaths []string
|
2023-02-13 15:04:30 +00:00
|
|
|
|
2023-09-22 14:30:20 +00:00
|
|
|
csvFiles []string
|
|
|
|
csvIgnorePatterns []string
|
2023-05-10 12:23:05 +00:00
|
|
|
|
2023-02-22 14:40:27 +00:00
|
|
|
vendor string
|
|
|
|
class string
|
|
|
|
|
2023-11-21 15:08:16 +00:00
|
|
|
driver *root.Driver
|
2023-02-13 15:04:30 +00:00
|
|
|
infolib info.Interface
|
2023-03-17 09:47:31 +00:00
|
|
|
|
|
|
|
mergedDeviceOptions []transform.MergedDeviceOption
|
2022-12-02 13:17:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// New creates a new nvcdi library
|
2023-03-22 12:04:12 +00:00
|
|
|
func New(opts ...Option) (Interface, error) {
|
2022-12-02 13:17:52 +00:00
|
|
|
l := &nvcdilib{}
|
2023-02-15 00:32:40 +00:00
|
|
|
for _, opt := range opts {
|
|
|
|
opt(l)
|
|
|
|
}
|
2023-02-16 15:29:53 +00:00
|
|
|
if l.mode == "" {
|
2023-02-20 14:27:34 +00:00
|
|
|
l.mode = ModeAuto
|
2022-12-02 13:17:52 +00:00
|
|
|
}
|
|
|
|
if l.logger == nil {
|
2023-03-22 12:27:43 +00:00
|
|
|
l.logger = logger.New()
|
2022-12-02 13:17:52 +00:00
|
|
|
}
|
2023-03-21 13:51:36 +00:00
|
|
|
if len(l.deviceNamers) == 0 {
|
|
|
|
indexNamer, _ := NewDeviceNamer(DeviceNameStrategyIndex)
|
|
|
|
l.deviceNamers = []DeviceNamer{indexNamer}
|
2022-12-02 13:17:52 +00:00
|
|
|
}
|
2024-05-28 10:05:38 +00:00
|
|
|
if l.nvidiaCDIHookPath == "" {
|
|
|
|
l.nvidiaCDIHookPath = "/usr/bin/nvidia-cdi-hook"
|
|
|
|
}
|
2022-12-02 13:17:52 +00:00
|
|
|
if l.driverRoot == "" {
|
|
|
|
l.driverRoot = "/"
|
|
|
|
}
|
2023-11-14 15:57:37 +00:00
|
|
|
if l.devRoot == "" {
|
|
|
|
l.devRoot = l.driverRoot
|
|
|
|
}
|
2024-05-28 10:05:38 +00:00
|
|
|
l.driver = root.New(
|
|
|
|
root.WithLogger(l.logger),
|
|
|
|
root.WithDriverRoot(l.driverRoot),
|
|
|
|
root.WithLibrarySearchPaths(l.librarySearchPaths...),
|
|
|
|
)
|
|
|
|
if l.nvmllib == nil {
|
2024-04-04 13:00:21 +00:00
|
|
|
var nvmlOpts []nvml.LibraryOption
|
|
|
|
candidates, err := l.driver.Libraries().Locate("libnvidia-ml.so.1")
|
|
|
|
if err != nil {
|
|
|
|
l.logger.Warningf("Ignoring error in locating libnvidia-ml.so.1: %v", err)
|
|
|
|
} else {
|
|
|
|
libNvidiaMlPath := candidates[0]
|
|
|
|
l.logger.Infof("Using %v", libNvidiaMlPath)
|
|
|
|
nvmlOpts = append(nvmlOpts, nvml.WithLibraryPath(libNvidiaMlPath))
|
|
|
|
}
|
|
|
|
l.nvmllib = nvml.New(nvmlOpts...)
|
2024-05-28 10:05:38 +00:00
|
|
|
}
|
2024-09-24 17:05:11 +00:00
|
|
|
if l.nvsandboxutilslib == nil {
|
|
|
|
var nvsandboxutilsOpts []nvsandboxutils.LibraryOption
|
|
|
|
// Set the library path for libnvidia-sandboxutils
|
|
|
|
candidates, err := l.driver.Libraries().Locate("libnvidia-sandboxutils.so.1")
|
|
|
|
if err != nil {
|
|
|
|
l.logger.Warningf("Ignoring error in locating libnvidia-sandboxutils.so.1: %v", err)
|
|
|
|
} else {
|
|
|
|
libNvidiaSandboxutilsPath := candidates[0]
|
|
|
|
l.logger.Infof("Using %v", libNvidiaSandboxutilsPath)
|
|
|
|
nvsandboxutilsOpts = append(nvsandboxutilsOpts, nvsandboxutils.WithLibraryPath(libNvidiaSandboxutilsPath))
|
|
|
|
}
|
|
|
|
l.nvsandboxutilslib = nvsandboxutils.New(nvsandboxutilsOpts...)
|
|
|
|
}
|
2024-05-28 10:05:38 +00:00
|
|
|
if l.devicelib == nil {
|
2024-05-28 11:28:28 +00:00
|
|
|
l.devicelib = device.New(l.nvmllib)
|
2022-12-02 13:17:52 +00:00
|
|
|
}
|
2023-02-13 15:04:30 +00:00
|
|
|
if l.infolib == nil {
|
2024-03-26 10:37:09 +00:00
|
|
|
l.infolib = info.New(
|
|
|
|
info.WithRoot(l.driverRoot),
|
|
|
|
info.WithLogger(l.logger),
|
|
|
|
info.WithNvmlLib(l.nvmllib),
|
|
|
|
info.WithDeviceLib(l.devicelib),
|
|
|
|
)
|
2023-02-13 15:04:30 +00:00
|
|
|
}
|
2022-12-02 13:17:52 +00:00
|
|
|
|
2023-02-20 12:29:52 +00:00
|
|
|
var lib Interface
|
2023-02-13 15:04:30 +00:00
|
|
|
switch l.resolveMode() {
|
2023-05-10 12:23:05 +00:00
|
|
|
case ModeCSV:
|
|
|
|
if len(l.csvFiles) == 0 {
|
2023-05-10 14:28:59 +00:00
|
|
|
l.csvFiles = csv.DefaultFileList()
|
2023-05-10 12:23:05 +00:00
|
|
|
}
|
|
|
|
lib = (*csvlib)(l)
|
2023-03-01 10:16:38 +00:00
|
|
|
case ModeManagement:
|
|
|
|
if l.vendor == "" {
|
|
|
|
l.vendor = "management.nvidia.com"
|
|
|
|
}
|
|
|
|
lib = (*managementlib)(l)
|
2023-02-20 14:27:34 +00:00
|
|
|
case ModeNvml:
|
2023-02-20 12:29:52 +00:00
|
|
|
lib = (*nvmllib)(l)
|
2023-02-20 14:27:34 +00:00
|
|
|
case ModeWsl:
|
2023-02-20 12:29:52 +00:00
|
|
|
lib = (*wsllib)(l)
|
2023-03-06 12:34:40 +00:00
|
|
|
case ModeGds:
|
|
|
|
if l.class == "" {
|
|
|
|
l.class = "gds"
|
|
|
|
}
|
|
|
|
lib = (*gdslib)(l)
|
2023-03-06 12:41:07 +00:00
|
|
|
case ModeMofed:
|
|
|
|
if l.class == "" {
|
|
|
|
l.class = "mofed"
|
|
|
|
}
|
|
|
|
lib = (*mofedlib)(l)
|
2023-02-20 12:29:52 +00:00
|
|
|
default:
|
2023-03-22 12:04:12 +00:00
|
|
|
return nil, fmt.Errorf("unknown mode %q", l.mode)
|
2023-02-20 12:29:52 +00:00
|
|
|
}
|
|
|
|
|
2023-02-22 14:40:27 +00:00
|
|
|
w := wrapper{
|
2023-03-17 09:47:31 +00:00
|
|
|
Interface: lib,
|
|
|
|
vendor: l.vendor,
|
|
|
|
class: l.class,
|
|
|
|
mergedDeviceOptions: l.mergedDeviceOptions,
|
2023-02-22 14:40:27 +00:00
|
|
|
}
|
2023-03-22 12:04:12 +00:00
|
|
|
return &w, nil
|
2023-02-20 12:29:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// GetSpec combines the device specs and common edits from the wrapped Interface to a single spec.Interface.
|
|
|
|
func (l *wrapper) GetSpec() (spec.Interface, error) {
|
|
|
|
deviceSpecs, err := l.GetAllDeviceSpecs()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
edits, err := l.GetCommonEdits()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2022-12-02 13:17:52 +00:00
|
|
|
}
|
2023-02-16 15:29:53 +00:00
|
|
|
|
2023-02-22 14:19:22 +00:00
|
|
|
return spec.New(
|
|
|
|
spec.WithDeviceSpecs(deviceSpecs),
|
|
|
|
spec.WithEdits(*edits.ContainerEdits),
|
2023-02-22 14:40:27 +00:00
|
|
|
spec.WithVendor(l.vendor),
|
|
|
|
spec.WithClass(l.class),
|
2023-03-17 09:47:31 +00:00
|
|
|
spec.WithMergedDeviceOptions(l.mergedDeviceOptions...),
|
2023-02-22 14:19:22 +00:00
|
|
|
)
|
2022-12-02 13:17:52 +00:00
|
|
|
}
|
2023-02-13 15:04:30 +00:00
|
|
|
|
2024-03-04 14:08:33 +00:00
|
|
|
// GetCommonEdits returns the wrapped edits and adds additional edits on top.
|
|
|
|
func (m *wrapper) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
|
|
|
edits, err := m.Interface.GetCommonEdits()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2024-10-14 13:06:06 +00:00
|
|
|
edits.Env = append(edits.Env, image.EnvVarNvidiaVisibleDevices+"=void")
|
2024-03-04 14:08:33 +00:00
|
|
|
|
|
|
|
return edits, nil
|
|
|
|
}
|
|
|
|
|
2023-02-13 15:04:30 +00:00
|
|
|
// resolveMode resolves the mode for CDI spec generation based on the current system.
|
|
|
|
func (l *nvcdilib) resolveMode() (rmode string) {
|
2023-02-20 14:27:34 +00:00
|
|
|
if l.mode != ModeAuto {
|
2023-02-13 15:04:30 +00:00
|
|
|
return l.mode
|
|
|
|
}
|
|
|
|
defer func() {
|
2024-03-26 10:37:09 +00:00
|
|
|
l.logger.Infof("Auto-detected mode as '%v'", rmode)
|
2023-02-13 15:04:30 +00:00
|
|
|
}()
|
|
|
|
|
2024-03-26 10:37:09 +00:00
|
|
|
platform := l.infolib.ResolvePlatform()
|
|
|
|
switch platform {
|
|
|
|
case info.PlatformNVML:
|
|
|
|
return ModeNvml
|
|
|
|
case info.PlatformTegra:
|
2023-05-10 12:28:09 +00:00
|
|
|
return ModeCSV
|
2024-03-26 10:37:09 +00:00
|
|
|
case info.PlatformWSL:
|
|
|
|
return ModeWsl
|
2023-05-10 12:28:09 +00:00
|
|
|
}
|
2024-03-26 10:37:09 +00:00
|
|
|
l.logger.Warningf("Unsupported platform detected: %v; assuming %v", platform, ModeNvml)
|
2023-02-20 14:27:34 +00:00
|
|
|
return ModeNvml
|
2023-02-13 15:04:30 +00:00
|
|
|
}
|
2023-03-23 09:50:11 +00:00
|
|
|
|
|
|
|
// getCudaVersion returns the CUDA version of the current system.
|
|
|
|
func (l *nvcdilib) getCudaVersion() (string, error) {
|
2024-09-24 17:05:11 +00:00
|
|
|
version, err := l.getCudaVersionNvsandboxutils()
|
|
|
|
if err == nil {
|
|
|
|
return version, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Fallback to NVML
|
|
|
|
return l.getCudaVersionNvml()
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *nvcdilib) getCudaVersionNvml() (string, error) {
|
2023-03-23 09:50:11 +00:00
|
|
|
if hasNVML, reason := l.infolib.HasNvml(); !hasNVML {
|
|
|
|
return "", fmt.Errorf("nvml not detected: %v", reason)
|
|
|
|
}
|
|
|
|
if l.nvmllib == nil {
|
|
|
|
return "", fmt.Errorf("nvml library not initialized")
|
|
|
|
}
|
|
|
|
r := l.nvmllib.Init()
|
|
|
|
if r != nvml.SUCCESS {
|
|
|
|
return "", fmt.Errorf("failed to initialize nvml: %v", r)
|
|
|
|
}
|
2023-08-25 14:48:11 +00:00
|
|
|
defer func() {
|
|
|
|
if r := l.nvmllib.Shutdown(); r != nvml.SUCCESS {
|
|
|
|
l.logger.Warningf("failed to shutdown NVML: %v", r)
|
|
|
|
}
|
|
|
|
}()
|
2023-03-23 09:50:11 +00:00
|
|
|
|
|
|
|
version, r := l.nvmllib.SystemGetDriverVersion()
|
|
|
|
if r != nvml.SUCCESS {
|
|
|
|
return "", fmt.Errorf("failed to get driver version: %v", r)
|
|
|
|
}
|
|
|
|
return version, nil
|
|
|
|
}
|
2024-09-24 17:05:11 +00:00
|
|
|
|
|
|
|
func (l *nvcdilib) getCudaVersionNvsandboxutils() (string, error) {
|
2024-10-02 07:36:51 +00:00
|
|
|
if l.nvsandboxutilslib == nil {
|
|
|
|
return "", fmt.Errorf("libnvsandboxutils is not available")
|
|
|
|
}
|
|
|
|
|
2024-09-24 17:05:11 +00:00
|
|
|
// Sandboxutils initialization should happen before this function is called
|
|
|
|
version, ret := l.nvsandboxutilslib.GetDriverVersion()
|
|
|
|
if ret != nvsandboxutils.SUCCESS {
|
|
|
|
return "", fmt.Errorf("%v", ret)
|
|
|
|
}
|
|
|
|
return version, nil
|
|
|
|
}
|