nvidia-container-toolkit/internal/modifier/cdi.go
Evan Lezar 0ed757faee
Some checks failed
CI Pipeline / code-scanning (push) Has been cancelled
CI Pipeline / variables (push) Has been cancelled
CI Pipeline / golang (push) Has been cancelled
CI Pipeline / image (push) Has been cancelled
CI Pipeline / e2e-test (push) Has been cancelled
TOFIX: split auto
Signed-off-by: Evan Lezar <elezar@nvidia.com>
2025-06-12 10:18:01 +02:00

170 lines
5.6 KiB
Go

/**
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package modifier
import (
	"fmt"
	"sort"
	"strings"

	"tags.cncf.io/container-device-interface/pkg/parser"

	"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
)
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
// CDI specifications available on the system.
//
// The devices to include are read from the OCI specification (see getDevicesFromSpec). Device names
// that are not fully-qualified are qualified using the default kind from the CDI mode config, or
// automaticDeviceKind when isJitCDI is set.
//
// The function returns:
//   - (nil, nil) if no devices are requested, meaning that no modification is required;
//   - an error if the devices cannot be determined, or if automatic devices (as selected by
//     filterAutomaticDevices) are requested together with other CDI devices;
//   - the automatic CDI modifier if only automatic devices are requested and it can be created;
//   - the standard CDI modifier for the requested devices otherwise, including as a fallback when
//     the automatic CDI modifier cannot be created.
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec, isJitCDI bool) (oci.SpecModifier, error) {
	defaultKind := cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind
	if isJitCDI {
		defaultKind = automaticDeviceKind
	}

	devices, err := getDevicesFromSpec(logger, ociSpec, cfg, defaultKind)
	if err != nil {
		return nil, fmt.Errorf("failed to get required devices from OCI specification: %w", err)
	}
	if len(devices) == 0 {
		logger.Debugf("No devices requested; no modification required.")
		return nil, nil
	}
	logger.Debugf("Creating CDI modifier for devices: %v", devices)

	if automaticDevices := filterAutomaticDevices(devices); len(automaticDevices) > 0 {
		// Automatic devices cannot be combined with any other CDI devices.
		if len(automaticDevices) != len(devices) {
			return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
		}

		automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
		if err == nil {
			return automaticModifier, nil
		}
		// The %w verb is only meaningful in fmt.Errorf; use %v when logging an error.
		logger.Warningf("Failed to create the automatic CDI modifier: %v", err)
		logger.Debugf("Falling back to the standard CDI modifier")
	}

	return cdi.New(
		cdi.WithLogger(logger),
		cdi.WithDevices(devices...),
		cdi.WithSpecDirs(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.SpecDirs...),
	)
}
// getDevicesFromSpec returns the CDI devices requested in the supplied OCI specification.
// The relevant settings are copied from cfg, and defaultKind is used to qualify device names
// that are not already fully-qualified CDI device names.
func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.Config, defaultKind string) ([]string, error) {
	// The local is deliberately not called cdiModifier so that it does not shadow the type.
	resolver := &cdiModifier{
		logger:                         logger,
		defaultKind:                    defaultKind,
		annotationPrefixes:             cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes,
		acceptEnvvarUnprivileged:       cfg.AcceptEnvvarUnprivileged,
		acceptDeviceListAsVolumeMounts: cfg.AcceptDeviceListAsVolumeMounts,
	}

	return resolver.getDevicesFromSpec(ociSpec)
}
// cdiModifier holds the settings used to determine which CDI devices are
// requested by a container's OCI specification.
// TODO: We should rename this type.
type cdiModifier struct {
// logger receives debug output, e.g. when a duplicate device is ignored.
logger logger.Interface
// acceptDeviceListAsVolumeMounts is forwarded to the CUDA image constructor
// via image.WithAcceptDeviceListAsVolumeMounts.
acceptDeviceListAsVolumeMounts bool
// acceptEnvvarUnprivileged is forwarded to the CUDA image constructor via
// image.WithAcceptEnvvarUnprivileged.
acceptEnvvarUnprivileged bool
// annotationPrefixes lists the annotation key prefixes whose values are
// read as comma-separated CDI device names.
annotationPrefixes []string
// defaultKind is prepended (as "<defaultKind>=<device>") to device names
// that are not already fully-qualified CDI device names.
defaultKind string
}
// getDevicesFromSpec determines the CDI devices requested by the container described by ociSpec.
//
// Devices listed in annotations matching the configured prefixes take precedence: if any are found
// they are returned as-is. Otherwise the visible devices reported by the CUDA image constructed
// from the spec are used; each is normalized to a qualified device name and duplicates are
// dropped, keeping the first occurrence.
//
// An error is returned if the spec cannot be loaded, the annotations cannot be parsed, or the CUDA
// image cannot be constructed.
func (c *cdiModifier) getDevicesFromSpec(ociSpec oci.Spec) ([]string, error) {
	rawSpec, err := ociSpec.Load()
	if err != nil {
		return nil, fmt.Errorf("failed to load OCI spec: %v", err)
	}

	// Annotations are only available when a spec was actually loaded.
	if rawSpec != nil {
		fromAnnotations, err := getAnnotationDevices(c.annotationPrefixes, rawSpec.Annotations)
		switch {
		case err != nil:
			return nil, fmt.Errorf("failed to parse container annotations: %v", err)
		case len(fromAnnotations) > 0:
			return fromAnnotations, nil
		}
	}

	cudaImage, err := image.NewCUDAImageFromSpec(
		rawSpec,
		image.WithLogger(c.logger),
		image.WithAcceptDeviceListAsVolumeMounts(c.acceptDeviceListAsVolumeMounts),
		image.WithAcceptEnvvarUnprivileged(c.acceptEnvvarUnprivileged),
	)
	if err != nil {
		return nil, err
	}

	var requested []string
	known := make(map[string]struct{})
	for _, visible := range cudaImage.VisibleDevices() {
		device := c.normalizeDevice(visible)
		if _, duplicate := known[device]; duplicate {
			c.logger.Debugf("Ignoring duplicate device %q", device)
			continue
		}
		known[device] = struct{}{}
		requested = append(requested, device)
	}

	return requested, nil
}
// normalizeDevice returns device unchanged if it is already a qualified CDI device name.
// Otherwise the name is qualified with the default kind, giving "<defaultKind>=<device>".
func (c *cdiModifier) normalizeDevice(device string) string {
	if parser.IsQualifiedName(device) {
		return device
	}
	return c.defaultKind + "=" + device
}
// getAnnotationDevices returns a list of devices specified in the annotations.
// Keys starting with any of the specified prefixes are considered and are expected to contain a
// comma-separated list of fully-qualified CDI device names. If any device name is not
// fully-qualified an error is returned.
//
// The list of returned devices is deduplicated. Matching annotations are processed in sorted key
// order so that both the order of the returned devices and the error reported for invalid input
// are deterministic. A nil slice is returned if no annotation matches.
func getAnnotationDevices(prefixes []string, annotations map[string]string) ([]string, error) {
	// Collect every annotation key that matches at least one prefix.
	var keys []string
	for key := range annotations {
		for _, prefix := range prefixes {
			if strings.HasPrefix(key, prefix) {
				keys = append(keys, key)
				// A key is only considered once, even if several prefixes match.
				break
			}
		}
	}
	// Map iteration order is random; sort the keys to get a stable result.
	sort.Strings(keys)

	seen := make(map[string]bool)
	var annotationDevices []string
	for _, key := range keys {
		for _, device := range strings.Split(annotations[key], ",") {
			if !parser.IsQualifiedName(device) {
				return nil, fmt.Errorf("invalid device name %q in annotation %q", device, key)
			}
			if seen[device] {
				continue
			}
			seen[device] = true
			annotationDevices = append(annotationDevices, device)
		}
	}

	return annotationDevices, nil
}