2022-06-20 15:03:02 +00:00
|
|
|
/**
|
|
|
|
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
**/
|
|
|
|
|
|
|
|
package modifier
|
|
|
|
|
|
|
|
import (
	"fmt"
	"sort"
	"strings"

	"tags.cncf.io/container-device-interface/pkg/parser"

	"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
	"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
	"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
)
|
|
|
|
|
|
|
|
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
|
2023-08-25 14:14:06 +00:00
|
|
|
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
|
2022-06-20 15:03:02 +00:00
|
|
|
// used to select the devices to include.
|
2023-03-22 12:27:43 +00:00
|
|
|
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
2023-03-09 09:15:34 +00:00
|
|
|
devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
|
2022-06-22 08:08:52 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
|
|
|
|
}
|
|
|
|
if len(devices) == 0 {
|
|
|
|
logger.Debugf("No devices requested; no modification required.")
|
|
|
|
return nil, nil
|
|
|
|
}
|
2023-03-06 11:40:21 +00:00
|
|
|
logger.Debugf("Creating CDI modifier for devices: %v", devices)
|
2022-06-22 08:08:52 +00:00
|
|
|
|
2023-11-16 23:16:34 +00:00
|
|
|
automaticDevices := filterAutomaticDevices(devices)
|
|
|
|
if len(automaticDevices) != len(devices) && len(automaticDevices) > 0 {
|
|
|
|
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
|
|
|
|
}
|
|
|
|
if len(automaticDevices) > 0 {
|
2023-12-06 15:54:30 +00:00
|
|
|
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
|
|
|
|
if err == nil {
|
|
|
|
return automaticModifier, nil
|
|
|
|
}
|
|
|
|
logger.Warningf("Failed to create the automatic CDI modifier: %w", err)
|
|
|
|
logger.Debugf("Falling back to the standard CDI modifier")
|
2023-11-16 23:16:34 +00:00
|
|
|
}
|
|
|
|
|
2023-06-22 12:45:25 +00:00
|
|
|
return cdi.New(
|
|
|
|
cdi.WithLogger(logger),
|
|
|
|
cdi.WithDevices(devices...),
|
|
|
|
cdi.WithSpecDirs(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.SpecDirs...),
|
|
|
|
)
|
2022-06-22 08:08:52 +00:00
|
|
|
}
|
|
|
|
|
2023-03-22 12:27:43 +00:00
|
|
|
func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.Config) ([]string, error) {
|
2022-06-20 15:03:02 +00:00
|
|
|
rawSpec, err := ociSpec.Load()
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
|
|
|
}
|
|
|
|
|
2023-03-23 18:18:22 +00:00
|
|
|
annotationDevices, err := getAnnotationDevices(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes, rawSpec.Annotations)
|
2022-06-22 08:08:52 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
|
2022-06-20 15:03:02 +00:00
|
|
|
}
|
2023-03-06 11:40:21 +00:00
|
|
|
if len(annotationDevices) > 0 {
|
|
|
|
return annotationDevices, nil
|
|
|
|
}
|
2022-06-20 15:03:02 +00:00
|
|
|
|
2023-03-09 09:15:34 +00:00
|
|
|
container, err := image.NewCUDAImageFromSpec(rawSpec)
|
2023-03-06 11:40:21 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2022-06-20 15:03:02 +00:00
|
|
|
}
|
2023-10-10 11:48:38 +00:00
|
|
|
if cfg.AcceptDeviceListAsVolumeMounts {
|
|
|
|
mountDevices := container.CDIDevicesFromMounts()
|
|
|
|
if len(mountDevices) > 0 {
|
|
|
|
return mountDevices, nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-06-22 08:08:52 +00:00
|
|
|
var devices []string
|
2023-03-06 11:40:21 +00:00
|
|
|
seen := make(map[string]bool)
|
2024-10-14 13:06:06 +00:00
|
|
|
for _, name := range container.VisibleDevicesFromEnvVar() {
|
2023-06-22 12:45:25 +00:00
|
|
|
if !parser.IsQualifiedName(name) {
|
2023-03-09 09:15:34 +00:00
|
|
|
name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name)
|
2023-03-06 11:40:21 +00:00
|
|
|
}
|
|
|
|
if seen[name] {
|
|
|
|
logger.Debugf("Ignoring duplicate device %q", name)
|
|
|
|
continue
|
|
|
|
}
|
2022-06-22 08:08:52 +00:00
|
|
|
devices = append(devices, name)
|
2022-06-20 15:03:02 +00:00
|
|
|
}
|
|
|
|
|
2023-03-09 09:15:34 +00:00
|
|
|
if len(devices) == 0 {
|
|
|
|
return nil, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged(rawSpec) {
|
|
|
|
return devices, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES: %v", devices)
|
|
|
|
|
2023-03-10 21:11:29 +00:00
|
|
|
return nil, nil
|
2022-06-20 15:03:02 +00:00
|
|
|
}
|
|
|
|
|
2023-03-23 18:18:22 +00:00
|
|
|
// getAnnotationDevices returns a list of devices specified in the annotations.
|
|
|
|
// Keys starting with the specified prefixes are considered and expected to contain a comma-separated list of
|
|
|
|
// fully-qualified CDI devices names. If any device name is not fully-quality an error is returned.
|
|
|
|
// The list of returned devices is deduplicated.
|
|
|
|
func getAnnotationDevices(prefixes []string, annotations map[string]string) ([]string, error) {
|
|
|
|
devicesByKey := make(map[string][]string)
|
|
|
|
for key, value := range annotations {
|
|
|
|
for _, prefix := range prefixes {
|
|
|
|
if strings.HasPrefix(key, prefix) {
|
|
|
|
devicesByKey[key] = strings.Split(value, ",")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
seen := make(map[string]bool)
|
|
|
|
var annotationDevices []string
|
|
|
|
for key, devices := range devicesByKey {
|
|
|
|
for _, device := range devices {
|
2023-06-22 12:45:25 +00:00
|
|
|
if !parser.IsQualifiedName(device) {
|
2023-03-23 18:18:22 +00:00
|
|
|
return nil, fmt.Errorf("invalid device name %q in annotation %q", device, key)
|
|
|
|
}
|
|
|
|
if seen[device] {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
annotationDevices = append(annotationDevices, device)
|
|
|
|
seen[device] = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return annotationDevices, nil
|
|
|
|
}
|
2023-11-16 23:16:34 +00:00
|
|
|
|
|
|
|
// filterAutomaticDevices searches for "automatic" device names in the input slice.
|
|
|
|
// "Automatic" devices are a well-defined list of CDI device names which, when requested,
|
|
|
|
// trigger the generation of a CDI spec at runtime. This removes the need to generate a
|
|
|
|
// CDI spec on the system a-priori as well as keep it up-to-date.
|
|
|
|
func filterAutomaticDevices(devices []string) []string {
|
|
|
|
var automatic []string
|
|
|
|
for _, device := range devices {
|
2023-11-30 22:16:10 +00:00
|
|
|
vendor, class, _ := parser.ParseDevice(device)
|
|
|
|
if vendor == "runtime.nvidia.com" && class == "gpu" {
|
2023-11-16 23:16:34 +00:00
|
|
|
automatic = append(automatic, device)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return automatic
|
|
|
|
}
|
|
|
|
|
|
|
|
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
|
|
|
|
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
|
|
|
|
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
|
|
|
|
}
|
|
|
|
cdiModifier, err := cdi.New(
|
|
|
|
cdi.WithLogger(logger),
|
|
|
|
cdi.WithSpec(spec.Raw()),
|
|
|
|
)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to construct CDI modifier: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return cdiModifier, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
|
|
|
|
cdilib, err := nvcdi.New(
|
|
|
|
nvcdi.WithLogger(logger),
|
2024-04-24 08:47:45 +00:00
|
|
|
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
|
2023-11-16 23:16:34 +00:00
|
|
|
nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root),
|
|
|
|
nvcdi.WithVendor("runtime.nvidia.com"),
|
|
|
|
nvcdi.WithClass("gpu"),
|
|
|
|
)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
|
|
|
}
|
|
|
|
|
2023-12-04 20:57:12 +00:00
|
|
|
identifiers := []string{}
|
2023-11-30 22:16:10 +00:00
|
|
|
for _, device := range devices {
|
2023-12-04 20:57:12 +00:00
|
|
|
_, _, id := parser.ParseDevice(device)
|
|
|
|
identifiers = append(identifiers, id)
|
2023-11-30 22:16:10 +00:00
|
|
|
}
|
|
|
|
|
2023-12-04 20:57:12 +00:00
|
|
|
deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to get CDI device specs: %w", err)
|
2023-11-30 22:16:10 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
commonEdits, err := cdilib.GetCommonEdits()
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to get common CDI spec edits: %w", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
return spec.New(
|
|
|
|
spec.WithDeviceSpecs(deviceSpecs),
|
|
|
|
spec.WithEdits(*commonEdits.ContainerEdits),
|
|
|
|
spec.WithVendor("runtime.nvidia.com"),
|
|
|
|
spec.WithClass("gpu"),
|
|
|
|
)
|
2023-11-16 23:16:34 +00:00
|
|
|
}
|