mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
Add nvcdi package with basic CDI generation API
This change adds an nvcdi package that exposes a basic API for CDI spec generation. This is used from the nvidia-ctk cdi generate command and can be consumed by DRA implementations and the device plugin. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
33
pkg/nvcdi/api.go
Normal file
33
pkg/nvcdi/api.go
Normal file
@@ -0,0 +1,33 @@
|
||||
/**
|
||||
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||
)
|
||||
|
||||
// Interface defines the API for the nvcdi package
|
||||
type Interface interface {
|
||||
GetCommonEdits() (*cdi.ContainerEdits, error)
|
||||
GetAllDeviceSpecs() ([]specs.Device, error)
|
||||
GetGPUDeviceEdits(device.Device) (*cdi.ContainerEdits, error)
|
||||
GetGPUDeviceSpecs(int, device.Device) (*specs.Device, error)
|
||||
GetMIGDeviceEdits(device.Device, device.MigDevice) (*cdi.ContainerEdits, error)
|
||||
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) (*specs.Device, error)
|
||||
}
|
||||
76
pkg/nvcdi/common.go
Normal file
76
pkg/nvcdi/common.go
Normal file
@@ -0,0 +1,76 @@
|
||||
/**
|
||||
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||
)
|
||||
|
||||
// GetCommonEdits generates a CDI specification that can be used for ANY devices
|
||||
func (l *nvcdilib) GetCommonEdits() (*cdi.ContainerEdits, error) {
|
||||
common, err := newCommonDiscoverer(l.logger, l.driverRoot, l.nvidiaCTKPath, l.nvmllib)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create discoverer for common entities: %v", err)
|
||||
}
|
||||
|
||||
return edits.FromDiscoverer(common)
|
||||
}
|
||||
|
||||
// newCommonDiscoverer returns a discoverer for entities that are not associated with a specific CDI device.
|
||||
// This includes driver libraries and meta devices, for example.
|
||||
func newCommonDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, nvmllib nvml.Interface) (discover.Discover, error) {
|
||||
metaDevices := discover.NewDeviceDiscoverer(
|
||||
logger,
|
||||
lookup.NewCharDeviceLocator(
|
||||
lookup.WithLogger(logger),
|
||||
lookup.WithRoot(driverRoot),
|
||||
),
|
||||
driverRoot,
|
||||
[]string{
|
||||
"/dev/nvidia-modeset",
|
||||
"/dev/nvidia-uvm-tools",
|
||||
"/dev/nvidia-uvm",
|
||||
"/dev/nvidiactl",
|
||||
},
|
||||
)
|
||||
|
||||
graphicsMounts, err := discover.NewGraphicsMountsDiscoverer(logger, driverRoot)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error constructing discoverer for graphics mounts: %v", err)
|
||||
}
|
||||
|
||||
driverFiles, err := NewDriverDiscoverer(logger, driverRoot, nvidiaCTKPath, nvmllib)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err)
|
||||
}
|
||||
|
||||
d := discover.Merge(
|
||||
metaDevices,
|
||||
graphicsMounts,
|
||||
driverFiles,
|
||||
)
|
||||
|
||||
return d, nil
|
||||
}
|
||||
162
pkg/nvcdi/driver.go
Normal file
162
pkg/nvcdi/driver.go
Normal file
@@ -0,0 +1,162 @@
|
||||
/**
|
||||
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/ldcache"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
|
||||
"github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||
)
|
||||
|
||||
// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation.
|
||||
// The supplied NVML Library is used to query the expected driver version.
|
||||
func NewDriverDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, nvmllib nvml.Interface) (discover.Discover, error) {
|
||||
version, r := nvmllib.SystemGetDriverVersion()
|
||||
if r != nvml.SUCCESS {
|
||||
return nil, fmt.Errorf("failed to determine driver version: %v", r)
|
||||
}
|
||||
|
||||
libraries, err := NewDriverLibraryDiscoverer(logger, driverRoot, nvidiaCTKPath, version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err)
|
||||
}
|
||||
|
||||
ipcs, err := discover.NewIPCDiscoverer(logger, driverRoot)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err)
|
||||
}
|
||||
|
||||
firmwares := NewDriverFirmwareDiscoverer(logger, driverRoot, version)
|
||||
|
||||
binaries := NewDriverBinariesDiscoverer(logger, driverRoot)
|
||||
|
||||
d := discover.Merge(
|
||||
libraries,
|
||||
ipcs,
|
||||
firmwares,
|
||||
binaries,
|
||||
)
|
||||
|
||||
return d, nil
|
||||
}
|
||||
|
||||
// NewDriverLibraryDiscoverer creates a discoverer for the libraries associated with the specified driver version.
|
||||
func NewDriverLibraryDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, version string) (discover.Discover, error) {
|
||||
libraryPaths, err := getVersionLibs(logger, driverRoot, version)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get libraries for driver version: %v", err)
|
||||
}
|
||||
|
||||
libraries := discover.NewMounts(
|
||||
logger,
|
||||
lookup.NewFileLocator(
|
||||
lookup.WithLogger(logger),
|
||||
lookup.WithRoot(driverRoot),
|
||||
),
|
||||
driverRoot,
|
||||
libraryPaths,
|
||||
)
|
||||
|
||||
cfg := &discover.Config{
|
||||
DriverRoot: driverRoot,
|
||||
NvidiaCTKPath: nvidiaCTKPath,
|
||||
}
|
||||
hooks, _ := discover.NewLDCacheUpdateHook(logger, libraries, cfg)
|
||||
|
||||
d := discover.Merge(
|
||||
libraries,
|
||||
hooks,
|
||||
)
|
||||
|
||||
return d, nil
|
||||
}
|
||||
|
||||
// NewDriverFirmwareDiscoverer creates a discoverer for GSP firmware associated with the specified driver version.
|
||||
func NewDriverFirmwareDiscoverer(logger *logrus.Logger, driverRoot string, version string) discover.Discover {
|
||||
gspFirmwarePath := filepath.Join("/lib/firmware/nvidia", version, "gsp.bin")
|
||||
return discover.NewMounts(
|
||||
logger,
|
||||
lookup.NewFileLocator(
|
||||
lookup.WithLogger(logger),
|
||||
lookup.WithRoot(driverRoot),
|
||||
),
|
||||
driverRoot,
|
||||
[]string{gspFirmwarePath},
|
||||
)
|
||||
}
|
||||
|
||||
// NewDriverBinariesDiscoverer creates a discoverer for GSP firmware associated with the GPU driver.
|
||||
func NewDriverBinariesDiscoverer(logger *logrus.Logger, driverRoot string) discover.Discover {
|
||||
return discover.NewMounts(
|
||||
logger,
|
||||
lookup.NewExecutableLocator(logger, driverRoot),
|
||||
driverRoot,
|
||||
[]string{
|
||||
"nvidia-smi", /* System management interface */
|
||||
"nvidia-debugdump", /* GPU coredump utility */
|
||||
"nvidia-persistenced", /* Persistence mode utility */
|
||||
"nvidia-cuda-mps-control", /* Multi process service CLI */
|
||||
"nvidia-cuda-mps-server", /* Multi process service server */
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
// getVersionLibs checks the LDCache for libraries ending in the specified driver version.
|
||||
// Although the ldcache at the specified driverRoot is queried, the paths are returned relative to this driverRoot.
|
||||
// This allows the standard mount location logic to be used for resolving the mounts.
|
||||
func getVersionLibs(logger *logrus.Logger, driverRoot string, version string) ([]string, error) {
|
||||
logger.Infof("Using driver version %v", version)
|
||||
|
||||
cache, err := ldcache.New(logger, driverRoot)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load ldcache: %v", err)
|
||||
}
|
||||
|
||||
libs32, libs64 := cache.List()
|
||||
|
||||
var libs []string
|
||||
for _, l := range libs64 {
|
||||
if strings.HasSuffix(l, version) {
|
||||
logger.Infof("found 64-bit driver lib: %v", l)
|
||||
libs = append(libs, l)
|
||||
}
|
||||
}
|
||||
|
||||
for _, l := range libs32 {
|
||||
if strings.HasSuffix(l, version) {
|
||||
logger.Infof("found 32-bit driver lib: %v", l)
|
||||
libs = append(libs, l)
|
||||
}
|
||||
}
|
||||
|
||||
if driverRoot == "/" || driverRoot == "" {
|
||||
return libs, nil
|
||||
}
|
||||
|
||||
var relative []string
|
||||
for _, l := range libs {
|
||||
relative = append(relative, strings.TrimPrefix(l, driverRoot))
|
||||
}
|
||||
|
||||
return relative, nil
|
||||
}
|
||||
198
pkg/nvcdi/full-gpu.go
Normal file
198
pkg/nvcdi/full-gpu.go
Normal file
@@ -0,0 +1,198 @@
|
||||
/**
|
||||
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/info/drm"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
|
||||
"github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||
)
|
||||
|
||||
// GetGPUDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
|
||||
func (l *nvcdilib) GetGPUDeviceSpecs(i int, d device.Device) (*specs.Device, error) {
|
||||
edits, err := l.GetGPUDeviceEdits(d)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get edits for device: %v", err)
|
||||
}
|
||||
|
||||
name, err := l.deviceNamer.GetDeviceName(i, d)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get device name: %v", err)
|
||||
}
|
||||
|
||||
spec := specs.Device{
|
||||
Name: name,
|
||||
ContainerEdits: *edits.ContainerEdits,
|
||||
}
|
||||
|
||||
return &spec, nil
|
||||
}
|
||||
|
||||
// GetGPUDeviceEdits returns the CDI edits for the full GPU represented by 'device'.
|
||||
func (l *nvcdilib) GetGPUDeviceEdits(d device.Device) (*cdi.ContainerEdits, error) {
|
||||
device, err := newFullGPUDiscoverer(l.logger, l.driverRoot, l.nvidiaCTKPath, d)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create device discoverer: %v", err)
|
||||
}
|
||||
|
||||
editsForDevice, err := edits.FromDiscoverer(device)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create container edits for device: %v", err)
|
||||
}
|
||||
|
||||
return editsForDevice, nil
|
||||
}
|
||||
|
||||
// byPathHookDiscoverer discovers the entities required for injecting by-path DRM device links
|
||||
type byPathHookDiscoverer struct {
|
||||
logger *logrus.Logger
|
||||
driverRoot string
|
||||
nvidiaCTKPath string
|
||||
pciBusID string
|
||||
}
|
||||
|
||||
var _ discover.Discover = (*byPathHookDiscoverer)(nil)
|
||||
|
||||
// newFullGPUDiscoverer creates a discoverer for the full GPU defined by the specified device.
|
||||
func newFullGPUDiscoverer(logger *logrus.Logger, driverRoot string, nvidiaCTKPath string, d device.Device) (discover.Discover, error) {
|
||||
// TODO: The functionality to get device paths should be integrated into the go-nvlib/pkg/device.Device interface.
|
||||
// This will allow reuse here and in other code where the paths are queried such as the NVIDIA device plugin.
|
||||
minor, ret := d.GetMinorNumber()
|
||||
if ret != nvml.SUCCESS {
|
||||
return nil, fmt.Errorf("error getting GPU device minor number: %v", ret)
|
||||
}
|
||||
path := fmt.Sprintf("/dev/nvidia%d", minor)
|
||||
|
||||
pciInfo, ret := d.GetPciInfo()
|
||||
if ret != nvml.SUCCESS {
|
||||
return nil, fmt.Errorf("error getting PCI info for device: %v", ret)
|
||||
}
|
||||
pciBusID := getBusID(pciInfo)
|
||||
|
||||
drmDeviceNodes, err := drm.GetDeviceNodesByBusID(pciBusID)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to determine DRM devices for %v: %v", pciBusID, err)
|
||||
}
|
||||
|
||||
deviceNodePaths := append([]string{path}, drmDeviceNodes...)
|
||||
|
||||
deviceNodes := discover.NewCharDeviceDiscoverer(
|
||||
logger,
|
||||
deviceNodePaths,
|
||||
driverRoot,
|
||||
)
|
||||
|
||||
byPathHooks := &byPathHookDiscoverer{
|
||||
logger: logger,
|
||||
driverRoot: driverRoot,
|
||||
nvidiaCTKPath: nvidiaCTKPath,
|
||||
pciBusID: pciBusID,
|
||||
}
|
||||
|
||||
dd := discover.Merge(
|
||||
deviceNodes,
|
||||
byPathHooks,
|
||||
)
|
||||
|
||||
return dd, nil
|
||||
}
|
||||
|
||||
// Devices returns the empty list for the by-path hook discoverer
|
||||
func (d *byPathHookDiscoverer) Devices() ([]discover.Device, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
// Hooks returns the hooks for the GPU device.
|
||||
// The following hooks are detected:
|
||||
// 1. A hook to create /dev/dri/by-path symlinks
|
||||
func (d *byPathHookDiscoverer) Hooks() ([]discover.Hook, error) {
|
||||
links, err := d.deviceNodeLinks()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to discover DRA device links: %v", err)
|
||||
}
|
||||
if len(links) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
var args []string
|
||||
for _, l := range links {
|
||||
args = append(args, "--link", l)
|
||||
}
|
||||
|
||||
hook := discover.CreateNvidiaCTKHook(
|
||||
d.nvidiaCTKPath,
|
||||
"create-symlinks",
|
||||
args...,
|
||||
)
|
||||
|
||||
return []discover.Hook{hook}, nil
|
||||
}
|
||||
|
||||
// Mounts returns an empty slice for a full GPU
|
||||
func (d *byPathHookDiscoverer) Mounts() ([]discover.Mount, error) {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (d *byPathHookDiscoverer) deviceNodeLinks() ([]string, error) {
|
||||
candidates := []string{
|
||||
fmt.Sprintf("/dev/dri/by-path/pci-%s-card", d.pciBusID),
|
||||
fmt.Sprintf("/dev/dri/by-path/pci-%s-render", d.pciBusID),
|
||||
}
|
||||
|
||||
var links []string
|
||||
for _, c := range candidates {
|
||||
linkPath := filepath.Join(d.driverRoot, c)
|
||||
device, err := os.Readlink(linkPath)
|
||||
if err != nil {
|
||||
d.logger.Warningf("Failed to evaluate symlink %v; ignoring", linkPath)
|
||||
continue
|
||||
}
|
||||
|
||||
d.logger.Debugf("adding device symlink %v -> %v", linkPath, device)
|
||||
links = append(links, fmt.Sprintf("%v::%v", device, linkPath))
|
||||
}
|
||||
|
||||
return links, nil
|
||||
}
|
||||
|
||||
// getBusID provides a utility function that returns the string representation of the bus ID.
|
||||
func getBusID(p nvml.PciInfo) string {
|
||||
var bytes []byte
|
||||
for _, b := range p.BusId {
|
||||
if byte(b) == '\x00' {
|
||||
break
|
||||
}
|
||||
bytes = append(bytes, byte(b))
|
||||
}
|
||||
id := strings.ToLower(string(bytes))
|
||||
|
||||
if id != "0000" {
|
||||
id = strings.TrimPrefix(id, "0000")
|
||||
}
|
||||
|
||||
return id
|
||||
}
|
||||
114
pkg/nvcdi/lib.go
Normal file
114
pkg/nvcdi/lib.go
Normal file
@@ -0,0 +1,114 @@
|
||||
/**
|
||||
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
|
||||
"github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||
)
|
||||
|
||||
type nvcdilib struct {
|
||||
logger *logrus.Logger
|
||||
nvmllib nvml.Interface
|
||||
devicelib device.Interface
|
||||
deviceNamer DeviceNamer
|
||||
driverRoot string
|
||||
nvidiaCTKPath string
|
||||
}
|
||||
|
||||
// New creates a new nvcdi library
|
||||
func New(opts ...Option) Interface {
|
||||
l := &nvcdilib{}
|
||||
|
||||
if l.nvmllib == nil {
|
||||
l.nvmllib = nvml.New()
|
||||
}
|
||||
if l.devicelib == nil {
|
||||
l.devicelib = device.New(device.WithNvml(l.nvmllib))
|
||||
}
|
||||
if l.logger == nil {
|
||||
l.logger = logrus.StandardLogger()
|
||||
}
|
||||
if l.deviceNamer == nil {
|
||||
l.deviceNamer, _ = NewDeviceNamer(DeviceNameStrategyIndex)
|
||||
}
|
||||
if l.driverRoot == "" {
|
||||
l.driverRoot = "/"
|
||||
}
|
||||
if l.nvidiaCTKPath == "" {
|
||||
l.nvidiaCTKPath = "/usr/bin/nvidia-ctk"
|
||||
}
|
||||
|
||||
return l
|
||||
}
|
||||
|
||||
// GetAllDeviceSpecs returns the device specs for all available devices.
|
||||
func (l *nvcdilib) GetAllDeviceSpecs() ([]specs.Device, error) {
|
||||
var deviceSpecs []specs.Device
|
||||
|
||||
gpuDeviceSpecs, err := l.getGPUDeviceSpecs()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
deviceSpecs = append(deviceSpecs, gpuDeviceSpecs...)
|
||||
|
||||
migDeviceSpecs, err := l.getMigDeviceSpecs()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
deviceSpecs = append(deviceSpecs, migDeviceSpecs...)
|
||||
|
||||
return deviceSpecs, nil
|
||||
}
|
||||
|
||||
func (l *nvcdilib) getGPUDeviceSpecs() ([]specs.Device, error) {
|
||||
var deviceSpecs []specs.Device
|
||||
err := l.devicelib.VisitDevices(func(i int, d device.Device) error {
|
||||
deviceSpec, err := l.GetGPUDeviceSpecs(i, d)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
deviceSpecs = append(deviceSpecs, *deviceSpec)
|
||||
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to generate CDI edits for GPU devices: %v", err)
|
||||
}
|
||||
return deviceSpecs, err
|
||||
}
|
||||
|
||||
func (l *nvcdilib) getMigDeviceSpecs() ([]specs.Device, error) {
|
||||
var deviceSpecs []specs.Device
|
||||
err := l.devicelib.VisitMigDevices(func(i int, d device.Device, j int, mig device.MigDevice) error {
|
||||
deviceSpec, err := l.GetMIGDeviceSpecs(i, d, j, mig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
deviceSpecs = append(deviceSpecs, *deviceSpec)
|
||||
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to generate CDI edits for GPU devices: %v", err)
|
||||
}
|
||||
return deviceSpecs, err
|
||||
}
|
||||
124
pkg/nvcdi/mig-device.go
Normal file
124
pkg/nvcdi/mig-device.go
Normal file
@@ -0,0 +1,124 @@
|
||||
/**
|
||||
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/edits"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/nvcaps"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
|
||||
"github.com/container-orchestrated-devices/container-device-interface/specs-go"
|
||||
"github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||
)
|
||||
|
||||
// GetMIGDeviceSpecs returns the CDI device specs for the full GPU represented by 'device'.
|
||||
func (l *nvcdilib) GetMIGDeviceSpecs(i int, d device.Device, j int, mig device.MigDevice) (*specs.Device, error) {
|
||||
edits, err := l.GetMIGDeviceEdits(d, mig)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get edits for device: %v", err)
|
||||
}
|
||||
|
||||
name, err := l.deviceNamer.GetMigDeviceName(i, d, j, mig)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get device name: %v", err)
|
||||
}
|
||||
|
||||
spec := specs.Device{
|
||||
Name: name,
|
||||
ContainerEdits: *edits.ContainerEdits,
|
||||
}
|
||||
|
||||
return &spec, nil
|
||||
}
|
||||
|
||||
// GetMIGDeviceEdits returns the CDI edits for the MIG device represented by 'mig' on 'parent'.
|
||||
func (l *nvcdilib) GetMIGDeviceEdits(parent device.Device, mig device.MigDevice) (*cdi.ContainerEdits, error) {
|
||||
gpu, ret := parent.GetMinorNumber()
|
||||
if ret != nvml.SUCCESS {
|
||||
return nil, fmt.Errorf("error getting GPU minor: %v", ret)
|
||||
}
|
||||
|
||||
gi, ret := mig.GetGpuInstanceId()
|
||||
if ret != nvml.SUCCESS {
|
||||
return nil, fmt.Errorf("error getting GPU Instance ID: %v", ret)
|
||||
}
|
||||
|
||||
ci, ret := mig.GetComputeInstanceId()
|
||||
if ret != nvml.SUCCESS {
|
||||
return nil, fmt.Errorf("error getting Compute Instance ID: %v", ret)
|
||||
}
|
||||
|
||||
editsForDevice, err := GetEditsForComputeInstance(l.logger, l.driverRoot, gpu, gi, ci)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create container edits for MIG device: %v", err)
|
||||
}
|
||||
|
||||
return editsForDevice, nil
|
||||
}
|
||||
|
||||
// GetEditsForComputeInstance returns the CDI edits for a particular compute instance defined by the (gpu, gi, ci) tuple
|
||||
func GetEditsForComputeInstance(logger *logrus.Logger, driverRoot string, gpu int, gi int, ci int) (*cdi.ContainerEdits, error) {
|
||||
computeInstance, err := newComputeInstanceDiscoverer(logger, driverRoot, gpu, gi, ci)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create discoverer for Compute Instance: %v", err)
|
||||
}
|
||||
|
||||
editsForDevice, err := edits.FromDiscoverer(computeInstance)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create container edits for Compute Instance: %v", err)
|
||||
}
|
||||
|
||||
return editsForDevice, nil
|
||||
}
|
||||
|
||||
// newComputeInstanceDiscoverer returns a discoverer for the specified compute instance
|
||||
func newComputeInstanceDiscoverer(logger *logrus.Logger, driverRoot string, gpu int, gi int, ci int) (discover.Discover, error) {
|
||||
parentPath := fmt.Sprintf("/dev/nvidia%d", gpu)
|
||||
|
||||
migCaps, err := nvcaps.NewMigCaps()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error getting MIG capability device paths: %v", err)
|
||||
}
|
||||
|
||||
giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
|
||||
giCapDevicePath, err := migCaps.GetCapDevicePath(giCap)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get GI cap device path: %v", err)
|
||||
}
|
||||
|
||||
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
|
||||
ciCapDevicePath, err := migCaps.GetCapDevicePath(ciCap)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to get CI cap device path: %v", err)
|
||||
}
|
||||
|
||||
deviceNodes := discover.NewCharDeviceDiscoverer(
|
||||
logger,
|
||||
[]string{
|
||||
parentPath,
|
||||
giCapDevicePath,
|
||||
ciCapDevicePath,
|
||||
},
|
||||
driverRoot,
|
||||
)
|
||||
|
||||
return deviceNodes, nil
|
||||
}
|
||||
89
pkg/nvcdi/namer.go
Normal file
89
pkg/nvcdi/namer.go
Normal file
@@ -0,0 +1,89 @@
|
||||
/**
|
||||
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||
)
|
||||
|
||||
// DeviceNamer is an interface for getting device names
|
||||
type DeviceNamer interface {
|
||||
GetDeviceName(int, device.Device) (string, error)
|
||||
GetMigDeviceName(int, device.Device, int, device.MigDevice) (string, error)
|
||||
}
|
||||
|
||||
// Supported device naming strategies
|
||||
const (
|
||||
// DeviceNameStrategyIndex generates devices names such as 0 or 1:0
|
||||
DeviceNameStrategyIndex = "index"
|
||||
// DeviceNameStrategyTypeIndex generates devices names such as gpu0 or mig1:0
|
||||
DeviceNameStrategyTypeIndex = "type-index"
|
||||
// DeviceNameStrategyUUID uses the device UUID as the name
|
||||
DeviceNameStrategyUUID = "uuid"
|
||||
)
|
||||
|
||||
type deviceNameIndex struct {
|
||||
gpuPrefix string
|
||||
migPrefix string
|
||||
}
|
||||
type deviceNameUUID struct{}
|
||||
|
||||
// NewDeviceNamer creates a Device Namer based on the supplied strategy.
|
||||
// This namer can be used to construct the names for MIG and GPU devices when generating the CDI spec.
|
||||
func NewDeviceNamer(strategy string) (DeviceNamer, error) {
|
||||
switch strategy {
|
||||
case DeviceNameStrategyIndex:
|
||||
return deviceNameIndex{}, nil
|
||||
case DeviceNameStrategyTypeIndex:
|
||||
return deviceNameIndex{gpuPrefix: "gpu", migPrefix: "mig"}, nil
|
||||
case DeviceNameStrategyUUID:
|
||||
return deviceNameUUID{}, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("invalid device name strategy: %v", strategy)
|
||||
}
|
||||
|
||||
// GetDeviceName returns the name for the specified device based on the naming strategy
|
||||
func (s deviceNameIndex) GetDeviceName(i int, d device.Device) (string, error) {
|
||||
return fmt.Sprintf("%s%d", s.gpuPrefix, i), nil
|
||||
}
|
||||
|
||||
// GetMigDeviceName returns the name for the specified device based on the naming strategy
|
||||
func (s deviceNameIndex) GetMigDeviceName(i int, d device.Device, j int, mig device.MigDevice) (string, error) {
|
||||
return fmt.Sprintf("%s%d:%d", s.migPrefix, i, j), nil
|
||||
}
|
||||
|
||||
// GetDeviceName returns the name for the specified device based on the naming strategy
|
||||
func (s deviceNameUUID) GetDeviceName(i int, d device.Device) (string, error) {
|
||||
uuid, ret := d.GetUUID()
|
||||
if ret != nvml.SUCCESS {
|
||||
return "", fmt.Errorf("failed to get device UUID: %v", ret)
|
||||
}
|
||||
return uuid, nil
|
||||
}
|
||||
|
||||
// GetMigDeviceName returns the name for the specified device based on the naming strategy
|
||||
func (s deviceNameUUID) GetMigDeviceName(i int, d device.Device, j int, mig device.MigDevice) (string, error) {
|
||||
uuid, ret := mig.GetUUID()
|
||||
if ret != nvml.SUCCESS {
|
||||
return "", fmt.Errorf("failed to get device UUID: %v", ret)
|
||||
}
|
||||
return uuid, nil
|
||||
}
|
||||
68
pkg/nvcdi/options.go
Normal file
68
pkg/nvcdi/options.go
Normal file
@@ -0,0 +1,68 @@
|
||||
/**
|
||||
# Copyright (c) NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package nvcdi
|
||||
|
||||
import (
|
||||
"github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/device"
|
||||
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||||
)
|
||||
|
||||
// Option is a function that configures the nvcdilib
|
||||
type Option func(*nvcdilib)
|
||||
|
||||
// WithDeviceLib sets the device library for the library
|
||||
func WithDeviceLib(devicelib device.Interface) Option {
|
||||
return func(l *nvcdilib) {
|
||||
l.devicelib = devicelib
|
||||
}
|
||||
}
|
||||
|
||||
// WithDeviceNamer sets the device namer for the library
|
||||
func WithDeviceNamer(namer DeviceNamer) Option {
|
||||
return func(l *nvcdilib) {
|
||||
l.deviceNamer = namer
|
||||
}
|
||||
}
|
||||
|
||||
// WithDriverRoot sets the driver root for the library
|
||||
func WithDriverRoot(root string) Option {
|
||||
return func(l *nvcdilib) {
|
||||
l.driverRoot = root
|
||||
}
|
||||
}
|
||||
|
||||
// WithLogger sets the logger for the library
|
||||
func WithLogger(logger *logrus.Logger) Option {
|
||||
return func(l *nvcdilib) {
|
||||
l.logger = logger
|
||||
}
|
||||
}
|
||||
|
||||
// WithNVIDIACTKPath sets the path to the NVIDIA Container Toolkit CLI path for the library
|
||||
func WithNVIDIACTKPath(path string) Option {
|
||||
return func(l *nvcdilib) {
|
||||
l.nvidiaCTKPath = path
|
||||
}
|
||||
}
|
||||
|
||||
// WithNvmlLib sets the nvml library for the library
|
||||
func WithNvmlLib(nvmllib nvml.Interface) Option {
|
||||
return func(l *nvcdilib) {
|
||||
l.nvmllib = nvmllib
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user