2023-03-01 10:16:38 +00:00
|
|
|
/**
|
|
|
|
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
**/
|
|
|
|
|
|
|
|
package nvcdi
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"path/filepath"
|
|
|
|
"strings"
|
|
|
|
|
2023-11-15 20:36:23 +00:00
|
|
|
"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"
|
2023-12-01 01:10:10 +00:00
|
|
|
"tags.cncf.io/container-device-interface/pkg/cdi"
|
|
|
|
"tags.cncf.io/container-device-interface/specs-go"
|
|
|
|
|
2023-03-01 10:16:38 +00:00
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
2023-03-23 20:03:52 +00:00
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/cuda"
|
2023-03-01 10:16:38 +00:00
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
|
|
|
)
|
|
|
|
|
|
|
|
type managementlib nvcdilib
|
|
|
|
|
|
|
|
var _ Interface = (*managementlib)(nil)
|
|
|
|
|
|
|
|
// GetAllDeviceSpecs returns all device specs for use in managemnt containers.
|
|
|
|
// A single device with the name `all` is returned.
|
|
|
|
func (m *managementlib) GetAllDeviceSpecs() ([]specs.Device, error) {
|
|
|
|
devices, err := m.newManagementDeviceDiscoverer()
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
edits, err := edits.FromDiscoverer(devices)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to create edits from discoverer: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if len(edits.DeviceNodes) == 0 {
|
|
|
|
return nil, fmt.Errorf("no NVIDIA device nodes found")
|
|
|
|
}
|
|
|
|
|
|
|
|
device := specs.Device{
|
|
|
|
Name: "all",
|
|
|
|
ContainerEdits: *edits.ContainerEdits,
|
|
|
|
}
|
|
|
|
return []specs.Device{device}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetCommonEdits returns the common edits for use in managementlib containers.
|
|
|
|
func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
2023-03-23 09:50:11 +00:00
|
|
|
version, err := m.getCudaVersion()
|
2023-03-01 10:16:38 +00:00
|
|
|
if err != nil {
|
2023-03-23 09:50:11 +00:00
|
|
|
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
|
2023-03-01 10:16:38 +00:00
|
|
|
}
|
|
|
|
|
2024-09-16 09:19:02 +00:00
|
|
|
driver, err := (*nvcdilib)(m).newDriverVersionDiscoverer(version)
|
2023-03-01 10:16:38 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to create driver library discoverer: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
edits, err := edits.FromDiscoverer(driver)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to create edits from discoverer: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return edits, nil
|
|
|
|
}
|
|
|
|
|
2023-03-23 09:50:11 +00:00
|
|
|
// getCudaVersion returns the CUDA version for use in managementlib containers.
|
|
|
|
func (m *managementlib) getCudaVersion() (string, error) {
|
|
|
|
version, err := (*nvcdilib)(m).getCudaVersion()
|
|
|
|
if err == nil {
|
|
|
|
return version, nil
|
|
|
|
}
|
|
|
|
|
2023-03-23 20:03:52 +00:00
|
|
|
libCudaPaths, err := cuda.New(
|
2023-11-21 15:08:16 +00:00
|
|
|
m.driver.Libraries(),
|
2023-05-22 11:53:19 +00:00
|
|
|
).Locate(".*.*")
|
2023-03-23 09:50:11 +00:00
|
|
|
if err != nil {
|
|
|
|
return "", fmt.Errorf("failed to locate libcuda.so: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
libCudaPath := libCudaPaths[0]
|
|
|
|
|
|
|
|
version = strings.TrimPrefix(filepath.Base(libCudaPath), "libcuda.so.")
|
|
|
|
|
|
|
|
return version, nil
|
|
|
|
}
|
|
|
|
|
2023-03-01 10:16:38 +00:00
|
|
|
type managementDiscoverer struct {
|
|
|
|
discover.Discover
|
|
|
|
}
|
|
|
|
|
|
|
|
// newManagementDeviceDiscoverer returns a discover.Discover that discovers device nodes for use in managementlib containers.
|
|
|
|
// NVML is not used to query devices and all device nodes are returned.
|
|
|
|
func (m *managementlib) newManagementDeviceDiscoverer() (discover.Discover, error) {
|
|
|
|
deviceNodes := discover.NewCharDeviceDiscoverer(
|
|
|
|
m.logger,
|
2023-11-20 14:03:36 +00:00
|
|
|
m.devRoot,
|
2023-03-01 10:16:38 +00:00
|
|
|
[]string{
|
|
|
|
"/dev/nvidia*",
|
|
|
|
"/dev/nvidia-caps/nvidia-cap*",
|
|
|
|
"/dev/nvidia-modeset",
|
|
|
|
"/dev/nvidia-uvm-tools",
|
|
|
|
"/dev/nvidia-uvm",
|
|
|
|
"/dev/nvidiactl",
|
|
|
|
},
|
|
|
|
)
|
|
|
|
|
|
|
|
deviceFolderPermissionHooks := newDeviceFolderPermissionHookDiscoverer(
|
|
|
|
m.logger,
|
2023-11-14 15:57:37 +00:00
|
|
|
m.devRoot,
|
2024-04-24 08:47:45 +00:00
|
|
|
m.nvidiaCDIHookPath,
|
2023-03-01 10:16:38 +00:00
|
|
|
deviceNodes,
|
|
|
|
)
|
|
|
|
|
|
|
|
d := discover.Merge(
|
|
|
|
&managementDiscoverer{deviceNodes},
|
|
|
|
deviceFolderPermissionHooks,
|
|
|
|
)
|
|
|
|
return d, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (m *managementDiscoverer) Devices() ([]discover.Device, error) {
|
|
|
|
devices, err := m.Discover.Devices()
|
|
|
|
if err != nil {
|
|
|
|
return devices, err
|
|
|
|
}
|
|
|
|
|
|
|
|
var filteredDevices []discover.Device
|
|
|
|
for _, device := range devices {
|
|
|
|
if m.nodeIsBlocked(device.HostPath) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
filteredDevices = append(filteredDevices, device)
|
|
|
|
}
|
|
|
|
|
|
|
|
return filteredDevices, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// nodeIsBlocked returns true if the specified device node should be ignored.
|
|
|
|
func (m managementDiscoverer) nodeIsBlocked(path string) bool {
|
|
|
|
blockedPrefixes := []string{"nvidia-fs", "nvidia-nvswitch", "nvidia-nvlink"}
|
|
|
|
nodeName := filepath.Base(path)
|
|
|
|
for _, prefix := range blockedPrefixes {
|
|
|
|
if strings.HasPrefix(nodeName, prefix) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetSpec is unsppported for the managementlib specs.
|
|
|
|
// managementlib is typically wrapped by a spec that implements GetSpec.
|
|
|
|
func (m *managementlib) GetSpec() (spec.Interface, error) {
|
|
|
|
return nil, fmt.Errorf("GetSpec is not supported")
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetGPUDeviceEdits is unsupported for the managementlib specs
|
|
|
|
func (m *managementlib) GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error) {
|
|
|
|
return nil, fmt.Errorf("GetGPUDeviceEdits is not supported")
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetGPUDeviceSpecs is unsupported for the managementlib specs
|
2023-03-21 13:51:36 +00:00
|
|
|
func (m *managementlib) GetGPUDeviceSpecs(int, device.Device) ([]specs.Device, error) {
|
2023-03-01 10:16:38 +00:00
|
|
|
return nil, fmt.Errorf("GetGPUDeviceSpecs is not supported")
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetMIGDeviceEdits is unsupported for the managementlib specs
|
|
|
|
func (m *managementlib) GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error) {
|
|
|
|
return nil, fmt.Errorf("GetMIGDeviceEdits is not supported")
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetMIGDeviceSpecs is unsupported for the managementlib specs
|
2023-03-21 13:51:36 +00:00
|
|
|
func (m *managementlib) GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) {
|
2023-03-01 10:16:38 +00:00
|
|
|
return nil, fmt.Errorf("GetMIGDeviceSpecs is not supported")
|
|
|
|
}
|
2023-12-04 20:57:12 +00:00
|
|
|
|
|
|
|
// GetDeviceSpecsByID returns the CDI device specs for the GPU(s) represented by
|
|
|
|
// the provided identifiers, where an identifier is an index or UUID of a valid
|
|
|
|
// GPU device.
|
|
|
|
func (l *managementlib) GetDeviceSpecsByID(...string) ([]specs.Device, error) {
|
|
|
|
return nil, fmt.Errorf("GetDeviceSpecsByID is not supported")
|
|
|
|
}
|