nvidia-container-toolkit/internal/modifier/cdi.go
Evan Lezar 1bc2a9fee3
Return annotation devices from VisibleDevices
This change includes annotation devices in CUDA.VisibleDevices
with the highest priority. This allows for the CDI device
request extraction to be consistent across all request mechanisms.

Note that this does change behaviour in the following ways:
1. Annotations are considered when resolving the runtime mode.
2. Incorrectly formed device names in annotations are no longer treated as an error.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2025-06-13 14:56:08 +02:00

194 lines
5.9 KiB
Go

/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package modifier
import (
"fmt"
"tags.cncf.io/container-device-interface/pkg/parser"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
)
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
// used to select the devices to include.
func NewCDIModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
deviceRequestor := newCDIDeviceRequestor(
logger,
image,
cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind,
)
devices := deviceRequestor.DeviceRequests()
if len(devices) == 0 {
logger.Debugf("No devices requested; no modification required.")
return nil, nil
}
logger.Debugf("Creating CDI modifier for devices: %v", devices)
automaticDevices := filterAutomaticDevices(devices)
if len(automaticDevices) != len(devices) && len(automaticDevices) > 0 {
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
}
if len(automaticDevices) > 0 {
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
if err == nil {
return automaticModifier, nil
}
logger.Warningf("Failed to create the automatic CDI modifier: %w", err)
logger.Debugf("Falling back to the standard CDI modifier")
}
return cdi.New(
cdi.WithLogger(logger),
cdi.WithDevices(devices...),
cdi.WithSpecDirs(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.SpecDirs...),
)
}
type deviceRequestor interface {
DeviceRequests() []string
}
type cdiDeviceRequestor struct {
image image.CUDA
logger logger.Interface
defaultKind string
}
func newCDIDeviceRequestor(logger logger.Interface, image image.CUDA, defaultKind string) deviceRequestor {
c := &cdiDeviceRequestor{
logger: logger,
image: image,
defaultKind: defaultKind,
}
return withUniqueDevices(c)
}
func (c *cdiDeviceRequestor) DeviceRequests() []string {
if c == nil {
return nil
}
var devices []string
for _, name := range c.image.VisibleDevices() {
if !parser.IsQualifiedName(name) {
name = fmt.Sprintf("%s=%s", c.defaultKind, name)
}
devices = append(devices, name)
}
return devices
}
// filterAutomaticDevices searches for "automatic" device names in the input slice.
// "Automatic" devices are a well-defined list of CDI device names which, when requested,
// trigger the generation of a CDI spec at runtime. This removes the need to generate a
// CDI spec on the system a-priori as well as keep it up-to-date.
func filterAutomaticDevices(devices []string) []string {
var automatic []string
for _, device := range devices {
vendor, class, _ := parser.ParseDevice(device)
if vendor == "runtime.nvidia.com" && class == "gpu" {
automatic = append(automatic, device)
}
}
return automatic
}
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
if err != nil {
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
}
cdiDeviceRequestor, err := cdi.New(
cdi.WithLogger(logger),
cdi.WithSpec(spec.Raw()),
)
if err != nil {
return nil, fmt.Errorf("failed to construct CDI modifier: %w", err)
}
return cdiDeviceRequestor, nil
}
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
cdilib, err := nvcdi.New(
nvcdi.WithLogger(logger),
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root),
nvcdi.WithVendor("runtime.nvidia.com"),
nvcdi.WithClass("gpu"),
)
if err != nil {
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
}
identifiers := []string{}
for _, device := range devices {
_, _, id := parser.ParseDevice(device)
identifiers = append(identifiers, id)
}
deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...)
if err != nil {
return nil, fmt.Errorf("failed to get CDI device specs: %w", err)
}
commonEdits, err := cdilib.GetCommonEdits()
if err != nil {
return nil, fmt.Errorf("failed to get common CDI spec edits: %w", err)
}
return spec.New(
spec.WithDeviceSpecs(deviceSpecs),
spec.WithEdits(*commonEdits.ContainerEdits),
spec.WithVendor("runtime.nvidia.com"),
spec.WithClass("gpu"),
)
}
type deduplicatedDeviceRequestor struct {
deviceRequestor
}
func withUniqueDevices(deviceRequestor deviceRequestor) deviceRequestor {
return &deduplicatedDeviceRequestor{deviceRequestor: deviceRequestor}
}
func (d *deduplicatedDeviceRequestor) DeviceRequests() []string {
if d == nil {
return nil
}
seen := make(map[string]bool)
var devices []string
for _, device := range d.deviceRequestor.DeviceRequests() {
if seen[device] {
continue
}
seen[device] = true
devices = append(devices, device)
}
return devices
}