mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-15 10:58:45 +00:00
The gated modifiers used to add support for GDS, Mofed, and CUDA Forward Comatibility only check the NVIDIA_VISIBLE_DEVICES envvar to determine whether GPUs are requested and modifications should be made. This means that use cases where volume mounts are used to request devices (e.g. when using the GPU Device Plugin) are not supported. This patch takes visibleDevicesFromEnvVar private, making VisibleDevices the only exported method to query valid devices. And edits the gated modifiers to use this func, ensuring device requests via mounts are also taken into acount. Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
222 lines
7.1 KiB
Go
222 lines
7.1 KiB
Go
/**
|
|
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
**/
|
|
|
|
package modifier
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
|
|
"tags.cncf.io/container-device-interface/pkg/parser"
|
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
|
)
|
|
|
|
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
|
|
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
|
|
// used to select the devices to include.
|
|
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
|
|
devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
|
|
}
|
|
if len(devices) == 0 {
|
|
logger.Debugf("No devices requested; no modification required.")
|
|
return nil, nil
|
|
}
|
|
logger.Debugf("Creating CDI modifier for devices: %v", devices)
|
|
|
|
automaticDevices := filterAutomaticDevices(devices)
|
|
if len(automaticDevices) != len(devices) && len(automaticDevices) > 0 {
|
|
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
|
|
}
|
|
if len(automaticDevices) > 0 {
|
|
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
|
|
if err == nil {
|
|
return automaticModifier, nil
|
|
}
|
|
logger.Warningf("Failed to create the automatic CDI modifier: %w", err)
|
|
logger.Debugf("Falling back to the standard CDI modifier")
|
|
}
|
|
|
|
return cdi.New(
|
|
cdi.WithLogger(logger),
|
|
cdi.WithDevices(devices...),
|
|
cdi.WithSpecDirs(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.SpecDirs...),
|
|
)
|
|
}
|
|
|
|
func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.Config) ([]string, error) {
|
|
rawSpec, err := ociSpec.Load()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
|
}
|
|
|
|
annotationDevices, err := getAnnotationDevices(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes, rawSpec.Annotations)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
|
|
}
|
|
if len(annotationDevices) > 0 {
|
|
return annotationDevices, nil
|
|
}
|
|
|
|
container, err := image.NewCUDAImageFromSpec(
|
|
rawSpec,
|
|
image.WithLogger(logger),
|
|
)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var devices []string
|
|
seen := make(map[string]bool)
|
|
for _, name := range container.VisibleDevices() {
|
|
// Skip empty device names
|
|
if strings.TrimSpace(name) == "" {
|
|
continue
|
|
}
|
|
|
|
if !parser.IsQualifiedName(name) {
|
|
name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name)
|
|
}
|
|
|
|
// Skip devices that result in empty device IDs after qualification
|
|
if strings.HasSuffix(name, "=") {
|
|
logger.Debugf("Ignoring device with empty ID: %q", name)
|
|
continue
|
|
}
|
|
|
|
if seen[name] {
|
|
logger.Debugf("Ignoring duplicate device %q", name)
|
|
continue
|
|
}
|
|
devices = append(devices, name)
|
|
seen[name] = true
|
|
}
|
|
|
|
if len(devices) == 0 {
|
|
return nil, nil
|
|
}
|
|
|
|
return devices, nil
|
|
}
|
|
|
|
// getAnnotationDevices returns a list of devices specified in the annotations.
|
|
// Keys starting with the specified prefixes are considered and expected to contain a comma-separated list of
|
|
// fully-qualified CDI devices names. If any device name is not fully-quality an error is returned.
|
|
// The list of returned devices is deduplicated.
|
|
func getAnnotationDevices(prefixes []string, annotations map[string]string) ([]string, error) {
|
|
devicesByKey := make(map[string][]string)
|
|
for key, value := range annotations {
|
|
for _, prefix := range prefixes {
|
|
if strings.HasPrefix(key, prefix) {
|
|
devicesByKey[key] = strings.Split(value, ",")
|
|
}
|
|
}
|
|
}
|
|
|
|
seen := make(map[string]bool)
|
|
var annotationDevices []string
|
|
for key, devices := range devicesByKey {
|
|
for _, device := range devices {
|
|
if !parser.IsQualifiedName(device) {
|
|
return nil, fmt.Errorf("invalid device name %q in annotation %q", device, key)
|
|
}
|
|
if seen[device] {
|
|
continue
|
|
}
|
|
annotationDevices = append(annotationDevices, device)
|
|
seen[device] = true
|
|
}
|
|
}
|
|
|
|
return annotationDevices, nil
|
|
}
|
|
|
|
// filterAutomaticDevices searches for "automatic" device names in the input slice.
|
|
// "Automatic" devices are a well-defined list of CDI device names which, when requested,
|
|
// trigger the generation of a CDI spec at runtime. This removes the need to generate a
|
|
// CDI spec on the system a-priori as well as keep it up-to-date.
|
|
func filterAutomaticDevices(devices []string) []string {
|
|
var automatic []string
|
|
for _, device := range devices {
|
|
vendor, class, _ := parser.ParseDevice(device)
|
|
if vendor == "runtime.nvidia.com" && class == "gpu" {
|
|
automatic = append(automatic, device)
|
|
}
|
|
}
|
|
return automatic
|
|
}
|
|
|
|
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
|
|
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
|
|
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
|
|
}
|
|
cdiModifier, err := cdi.New(
|
|
cdi.WithLogger(logger),
|
|
cdi.WithSpec(spec.Raw()),
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to construct CDI modifier: %w", err)
|
|
}
|
|
|
|
return cdiModifier, nil
|
|
}
|
|
|
|
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
|
|
cdilib, err := nvcdi.New(
|
|
nvcdi.WithLogger(logger),
|
|
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
|
|
nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root),
|
|
nvcdi.WithVendor("runtime.nvidia.com"),
|
|
nvcdi.WithClass("gpu"),
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
|
}
|
|
|
|
identifiers := []string{}
|
|
for _, device := range devices {
|
|
_, _, id := parser.ParseDevice(device)
|
|
identifiers = append(identifiers, id)
|
|
}
|
|
|
|
deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get CDI device specs: %w", err)
|
|
}
|
|
|
|
commonEdits, err := cdilib.GetCommonEdits()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get common CDI spec edits: %w", err)
|
|
}
|
|
|
|
return spec.New(
|
|
spec.WithDeviceSpecs(deviceSpecs),
|
|
spec.WithEdits(*commonEdits.ContainerEdits),
|
|
spec.WithVendor("runtime.nvidia.com"),
|
|
spec.WithClass("gpu"),
|
|
)
|
|
}
|