mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-15 19:09:19 +00:00
This change includes annotation devices in CUDA.VisibleDevices with the highest priority. This allows for the CDI device request extraction to be consistent across all request mechanisms. Note that this does change behaviour in the following ways: 1. Annotations are considered when resolving the runtime mode. 2. Incorrectly formed device names in annotations are no longer treated as an error. Signed-off-by: Evan Lezar <elezar@nvidia.com>
194 lines
5.9 KiB
Go
194 lines
5.9 KiB
Go
/**
|
|
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
**/
|
|
|
|
package modifier
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"tags.cncf.io/container-device-interface/pkg/parser"
|
|
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
|
)
|
|
|
|
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
|
|
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
|
|
// used to select the devices to include.
|
|
func NewCDIModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
|
|
deviceRequestor := newCDIDeviceRequestor(
|
|
logger,
|
|
image,
|
|
cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind,
|
|
)
|
|
devices := deviceRequestor.DeviceRequests()
|
|
if len(devices) == 0 {
|
|
logger.Debugf("No devices requested; no modification required.")
|
|
return nil, nil
|
|
}
|
|
logger.Debugf("Creating CDI modifier for devices: %v", devices)
|
|
|
|
automaticDevices := filterAutomaticDevices(devices)
|
|
if len(automaticDevices) != len(devices) && len(automaticDevices) > 0 {
|
|
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
|
|
}
|
|
if len(automaticDevices) > 0 {
|
|
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
|
|
if err == nil {
|
|
return automaticModifier, nil
|
|
}
|
|
logger.Warningf("Failed to create the automatic CDI modifier: %w", err)
|
|
logger.Debugf("Falling back to the standard CDI modifier")
|
|
}
|
|
|
|
return cdi.New(
|
|
cdi.WithLogger(logger),
|
|
cdi.WithDevices(devices...),
|
|
cdi.WithSpecDirs(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.SpecDirs...),
|
|
)
|
|
}
|
|
|
|
// deviceRequestor is implemented by types that can report a list of
// requested device names.
type deviceRequestor interface {
	// DeviceRequests returns the names of the requested devices.
	DeviceRequests() []string
}
|
|
|
|
type cdiDeviceRequestor struct {
|
|
image image.CUDA
|
|
logger logger.Interface
|
|
defaultKind string
|
|
}
|
|
|
|
func newCDIDeviceRequestor(logger logger.Interface, image image.CUDA, defaultKind string) deviceRequestor {
|
|
c := &cdiDeviceRequestor{
|
|
logger: logger,
|
|
image: image,
|
|
defaultKind: defaultKind,
|
|
}
|
|
return withUniqueDevices(c)
|
|
}
|
|
|
|
func (c *cdiDeviceRequestor) DeviceRequests() []string {
|
|
if c == nil {
|
|
return nil
|
|
}
|
|
var devices []string
|
|
for _, name := range c.image.VisibleDevices() {
|
|
if !parser.IsQualifiedName(name) {
|
|
name = fmt.Sprintf("%s=%s", c.defaultKind, name)
|
|
}
|
|
devices = append(devices, name)
|
|
}
|
|
|
|
return devices
|
|
}
|
|
|
|
// filterAutomaticDevices searches for "automatic" device names in the input slice.
|
|
// "Automatic" devices are a well-defined list of CDI device names which, when requested,
|
|
// trigger the generation of a CDI spec at runtime. This removes the need to generate a
|
|
// CDI spec on the system a-priori as well as keep it up-to-date.
|
|
func filterAutomaticDevices(devices []string) []string {
|
|
var automatic []string
|
|
for _, device := range devices {
|
|
vendor, class, _ := parser.ParseDevice(device)
|
|
if vendor == "runtime.nvidia.com" && class == "gpu" {
|
|
automatic = append(automatic, device)
|
|
}
|
|
}
|
|
return automatic
|
|
}
|
|
|
|
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
|
|
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
|
|
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
|
|
}
|
|
cdiDeviceRequestor, err := cdi.New(
|
|
cdi.WithLogger(logger),
|
|
cdi.WithSpec(spec.Raw()),
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to construct CDI modifier: %w", err)
|
|
}
|
|
|
|
return cdiDeviceRequestor, nil
|
|
}
|
|
|
|
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
|
|
cdilib, err := nvcdi.New(
|
|
nvcdi.WithLogger(logger),
|
|
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
|
|
nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root),
|
|
nvcdi.WithVendor("runtime.nvidia.com"),
|
|
nvcdi.WithClass("gpu"),
|
|
)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
|
}
|
|
|
|
identifiers := []string{}
|
|
for _, device := range devices {
|
|
_, _, id := parser.ParseDevice(device)
|
|
identifiers = append(identifiers, id)
|
|
}
|
|
|
|
deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get CDI device specs: %w", err)
|
|
}
|
|
|
|
commonEdits, err := cdilib.GetCommonEdits()
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to get common CDI spec edits: %w", err)
|
|
}
|
|
|
|
return spec.New(
|
|
spec.WithDeviceSpecs(deviceSpecs),
|
|
spec.WithEdits(*commonEdits.ContainerEdits),
|
|
spec.WithVendor("runtime.nvidia.com"),
|
|
spec.WithClass("gpu"),
|
|
)
|
|
}
|
|
|
|
type deduplicatedDeviceRequestor struct {
|
|
deviceRequestor
|
|
}
|
|
|
|
func withUniqueDevices(deviceRequestor deviceRequestor) deviceRequestor {
|
|
return &deduplicatedDeviceRequestor{deviceRequestor: deviceRequestor}
|
|
}
|
|
|
|
func (d *deduplicatedDeviceRequestor) DeviceRequests() []string {
|
|
if d == nil {
|
|
return nil
|
|
}
|
|
seen := make(map[string]bool)
|
|
var devices []string
|
|
for _, device := range d.deviceRequestor.DeviceRequests() {
|
|
if seen[device] {
|
|
continue
|
|
}
|
|
seen[device] = true
|
|
devices = append(devices, device)
|
|
}
|
|
return devices
|
|
}
|