mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-13 18:10:00 +00:00
Make CDI device requests consistent with other methods
Following the refactoring of device request extraction, we can now make CDI device requests consistent with other methods. This change moves to using image.VisibleDevices instead of separate calls to CDIDevicesFromMounts and VisibleDevicesFromEnvVar. The handling of annotation-based requests will be addressed in a follow-up. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
parent
27f5ec83de
commit
1b0f07a124
@ -54,8 +54,12 @@ type CUDA struct {
|
||||
// NewCUDAImageFromSpec creates a CUDA image from the input OCI runtime spec.
|
||||
// The process environment is read (if present) to construc the CUDA Image.
|
||||
func NewCUDAImageFromSpec(spec *specs.Spec, opts ...Option) (CUDA, error) {
|
||||
if spec == nil {
|
||||
return New(opts...)
|
||||
}
|
||||
|
||||
var env []string
|
||||
if spec != nil && spec.Process != nil {
|
||||
if spec.Process != nil {
|
||||
env = spec.Process.Env
|
||||
}
|
||||
|
||||
@ -212,19 +216,12 @@ func parseMajorMinorVersion(version string) (string, error) {
|
||||
// OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/
|
||||
func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
|
||||
var hasCDIdevice bool
|
||||
for _, device := range i.VisibleDevicesFromEnvVar() {
|
||||
for _, device := range i.VisibleDevices() {
|
||||
if !parser.IsQualifiedName(device) {
|
||||
return false
|
||||
}
|
||||
hasCDIdevice = true
|
||||
}
|
||||
|
||||
for _, device := range i.DevicesFromMounts() {
|
||||
if !strings.HasPrefix(device, "cdi/") {
|
||||
return false
|
||||
}
|
||||
hasCDIdevice = true
|
||||
}
|
||||
return hasCDIdevice
|
||||
}
|
||||
|
||||
@ -276,20 +273,27 @@ func (i CUDA) VisibleDevicesFromEnvVar() []string {
|
||||
// visibleDevicesFromMounts returns the set of visible devices requested as mounts.
|
||||
func (i CUDA) visibleDevicesFromMounts() []string {
|
||||
var devices []string
|
||||
for _, device := range i.DevicesFromMounts() {
|
||||
for _, device := range i.requestsFromMounts() {
|
||||
switch {
|
||||
case strings.HasPrefix(device, volumeMountDevicePrefixCDI):
|
||||
continue
|
||||
case strings.HasPrefix(device, volumeMountDevicePrefixImex):
|
||||
continue
|
||||
case strings.HasPrefix(device, volumeMountDevicePrefixCDI):
|
||||
name, err := cdiDeviceMountRequest(device).qualifiedName()
|
||||
if err != nil {
|
||||
i.logger.Warningf("Ignoring invalid mount request for CDI device %v: %w", device, err)
|
||||
continue
|
||||
}
|
||||
devices = append(devices, name)
|
||||
default:
|
||||
devices = append(devices, device)
|
||||
}
|
||||
devices = append(devices, device)
|
||||
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
// DevicesFromMounts returns a list of device specified as mounts.
|
||||
func (i CUDA) DevicesFromMounts() []string {
|
||||
// requestsFromMounts returns a list of device specified as mounts.
|
||||
func (i CUDA) requestsFromMounts() []string {
|
||||
root := filepath.Clean(DeviceListAsVolumeMountsRoot)
|
||||
seen := make(map[string]bool)
|
||||
var devices []string
|
||||
@ -321,23 +325,30 @@ func (i CUDA) DevicesFromMounts() []string {
|
||||
return devices
|
||||
}
|
||||
|
||||
// CDIDevicesFromMounts returns a list of CDI devices specified as mounts on the image.
|
||||
func (i CUDA) CDIDevicesFromMounts() []string {
|
||||
var devices []string
|
||||
for _, mountDevice := range i.DevicesFromMounts() {
|
||||
if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixCDI) {
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(strings.TrimPrefix(mountDevice, volumeMountDevicePrefixCDI), "/", 3)
|
||||
if len(parts) != 3 {
|
||||
continue
|
||||
}
|
||||
vendor := parts[0]
|
||||
class := parts[1]
|
||||
device := parts[2]
|
||||
devices = append(devices, fmt.Sprintf("%s/%s=%s", vendor, class, device))
|
||||
// a cdiDeviceMountRequest represents a CDI device requests as a mount.
|
||||
// Here the host path /dev/null is mounted to a particular path in the container.
|
||||
// The container path has the form:
|
||||
// /var/run/nvidia-container-devices/cdi/<vendor>/<class>/<device>
|
||||
// or
|
||||
// /var/run/nvidia-container-devices/cdi/<vendor>/<class>=<device>
|
||||
type cdiDeviceMountRequest string
|
||||
|
||||
// qualifiedName returns the fully-qualified name of the CDI device.
|
||||
func (m cdiDeviceMountRequest) qualifiedName() (string, error) {
|
||||
if !strings.HasPrefix(string(m), volumeMountDevicePrefixCDI) {
|
||||
return "", fmt.Errorf("invalid mount CDI device request: %s", m)
|
||||
}
|
||||
return devices
|
||||
|
||||
requestedDevice := strings.TrimPrefix(string(m), volumeMountDevicePrefixCDI)
|
||||
if parser.IsQualifiedName(requestedDevice) {
|
||||
return requestedDevice, nil
|
||||
}
|
||||
|
||||
parts := strings.SplitN(requestedDevice, "/", 3)
|
||||
if len(parts) != 3 {
|
||||
return "", fmt.Errorf("invalid mount CDI device request: %s", m)
|
||||
}
|
||||
return fmt.Sprintf("%s/%s=%s", parts[0], parts[1], parts[2]), nil
|
||||
}
|
||||
|
||||
// ImexChannelsFromEnvVar returns the list of IMEX channels requested for the image.
|
||||
@ -352,7 +363,7 @@ func (i CUDA) ImexChannelsFromEnvVar() []string {
|
||||
// ImexChannelsFromMounts returns the list of IMEX channels requested for the image.
|
||||
func (i CUDA) ImexChannelsFromMounts() []string {
|
||||
var channels []string
|
||||
for _, mountDevice := range i.DevicesFromMounts() {
|
||||
for _, mountDevice := range i.requestsFromMounts() {
|
||||
if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixImex) {
|
||||
continue
|
||||
}
|
||||
|
@ -487,9 +487,9 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) {
|
||||
expectedDevices: []string{"GPU0-MIG0/0/1", "GPU1-MIG0/0/1"},
|
||||
},
|
||||
{
|
||||
description: "cdi devices are ignored",
|
||||
mounts: makeTestMounts("GPU0", "cdi/nvidia.com/gpu=all", "GPU1"),
|
||||
expectedDevices: []string{"GPU0", "GPU1"},
|
||||
description: "cdi devices are included",
|
||||
mounts: makeTestMounts("GPU0", "nvidia.com/gpu=all", "GPU1"),
|
||||
expectedDevices: []string{"GPU0", "nvidia.com/gpu=all", "GPU1"},
|
||||
},
|
||||
{
|
||||
description: "imex devices are ignored",
|
||||
|
@ -184,7 +184,7 @@ func TestResolveAutoMode(t *testing.T) {
|
||||
expectedMode: "legacy",
|
||||
},
|
||||
{
|
||||
description: "cdi mount and non-CDI envvar resolves to legacy",
|
||||
description: "cdi mount and non-CDI envvar resolves to cdi",
|
||||
mode: "auto",
|
||||
envmap: map[string]string{
|
||||
"NVIDIA_VISIBLE_DEVICES": "0",
|
||||
@ -197,6 +197,22 @@ func TestResolveAutoMode(t *testing.T) {
|
||||
"tegra": false,
|
||||
"nvgpu": false,
|
||||
},
|
||||
expectedMode: "cdi",
|
||||
},
|
||||
{
|
||||
description: "non-cdi mount and CDI envvar resolves to legacy",
|
||||
mode: "auto",
|
||||
envmap: map[string]string{
|
||||
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0",
|
||||
},
|
||||
mounts: []string{
|
||||
"/var/run/nvidia-container-devices/0",
|
||||
},
|
||||
info: map[string]bool{
|
||||
"nvml": true,
|
||||
"tegra": false,
|
||||
"nvgpu": false,
|
||||
},
|
||||
expectedMode: "legacy",
|
||||
},
|
||||
}
|
||||
@ -232,6 +248,8 @@ func TestResolveAutoMode(t *testing.T) {
|
||||
image, _ := image.New(
|
||||
image.WithEnvMap(tc.envmap),
|
||||
image.WithMounts(mounts),
|
||||
image.WithAcceptDeviceListAsVolumeMounts(true),
|
||||
image.WithAcceptEnvvarUnprivileged(true),
|
||||
)
|
||||
mode := resolveMode(logger, tc.mode, image, properties)
|
||||
require.EqualValues(t, tc.expectedMode, mode)
|
||||
|
@ -66,57 +66,66 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
|
||||
}
|
||||
|
||||
func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.Config) ([]string, error) {
|
||||
cdiModifier := &cdiModifier{
|
||||
logger: logger,
|
||||
acceptDeviceListAsVolumeMounts: cfg.AcceptDeviceListAsVolumeMounts,
|
||||
acceptEnvvarUnprivileged: cfg.AcceptEnvvarUnprivileged,
|
||||
annotationPrefixes: cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes,
|
||||
defaultKind: cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind,
|
||||
}
|
||||
return cdiModifier.getDevicesFromSpec(ociSpec)
|
||||
}
|
||||
|
||||
// TODO: We should rename this type.
|
||||
type cdiModifier struct {
|
||||
logger logger.Interface
|
||||
acceptDeviceListAsVolumeMounts bool
|
||||
acceptEnvvarUnprivileged bool
|
||||
annotationPrefixes []string
|
||||
defaultKind string
|
||||
}
|
||||
|
||||
func (c *cdiModifier) getDevicesFromSpec(ociSpec oci.Spec) ([]string, error) {
|
||||
rawSpec, err := ociSpec.Load()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
||||
}
|
||||
|
||||
annotationDevices, err := getAnnotationDevices(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes, rawSpec.Annotations)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
|
||||
}
|
||||
if len(annotationDevices) > 0 {
|
||||
return annotationDevices, nil
|
||||
if rawSpec != nil {
|
||||
annotationDevices, err := getAnnotationDevices(c.annotationPrefixes, rawSpec.Annotations)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
|
||||
}
|
||||
if len(annotationDevices) > 0 {
|
||||
return annotationDevices, nil
|
||||
}
|
||||
}
|
||||
|
||||
container, err := image.NewCUDAImageFromSpec(
|
||||
rawSpec,
|
||||
image.WithLogger(logger),
|
||||
image.WithLogger(c.logger),
|
||||
image.WithAcceptDeviceListAsVolumeMounts(c.acceptDeviceListAsVolumeMounts),
|
||||
image.WithAcceptEnvvarUnprivileged(c.acceptEnvvarUnprivileged),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if cfg.AcceptDeviceListAsVolumeMounts {
|
||||
mountDevices := container.CDIDevicesFromMounts()
|
||||
if len(mountDevices) > 0 {
|
||||
return mountDevices, nil
|
||||
}
|
||||
}
|
||||
|
||||
var devices []string
|
||||
seen := make(map[string]bool)
|
||||
for _, name := range container.VisibleDevicesFromEnvVar() {
|
||||
for _, name := range container.VisibleDevices() {
|
||||
if !parser.IsQualifiedName(name) {
|
||||
name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name)
|
||||
name = fmt.Sprintf("%s=%s", c.defaultKind, name)
|
||||
}
|
||||
if seen[name] {
|
||||
logger.Debugf("Ignoring duplicate device %q", name)
|
||||
c.logger.Debugf("Ignoring duplicate device %q", name)
|
||||
continue
|
||||
}
|
||||
seen[name] = true
|
||||
devices = append(devices, name)
|
||||
}
|
||||
|
||||
if len(devices) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged((*image.OCISpec)(rawSpec)) {
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES: %v", devices)
|
||||
|
||||
return nil, nil
|
||||
return devices, nil
|
||||
}
|
||||
|
||||
// getAnnotationDevices returns a list of devices specified in the annotations.
|
||||
|
@ -20,7 +20,11 @@ import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
testlog "github.com/sirupsen/logrus/hooks/test"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
)
|
||||
|
||||
func TestGetAnnotationDevices(t *testing.T) {
|
||||
@ -90,3 +94,69 @@ func TestGetAnnotationDevices(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetDevicesFromSpec(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
|
||||
testCases := []struct {
|
||||
description string
|
||||
input cdiModifier
|
||||
spec *specs.Spec
|
||||
expectedDevices []string
|
||||
}{
|
||||
{
|
||||
description: "empty spec yields no devices",
|
||||
},
|
||||
{
|
||||
description: "cdi devices from mounts",
|
||||
input: cdiModifier{
|
||||
defaultKind: "nvidia.com/gpu",
|
||||
acceptEnvvarUnprivileged: true,
|
||||
acceptDeviceListAsVolumeMounts: true,
|
||||
},
|
||||
spec: &specs.Spec{
|
||||
Mounts: []specs.Mount{
|
||||
{
|
||||
Destination: "/var/run/nvidia-container-devices/cdi/nvidia.com/gpu/0",
|
||||
Source: "/dev/null",
|
||||
},
|
||||
{
|
||||
Destination: "/var/run/nvidia-container-devices/cdi/nvidia.com/gpu/1",
|
||||
Source: "/dev/null",
|
||||
},
|
||||
},
|
||||
},
|
||||
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1"},
|
||||
},
|
||||
{
|
||||
description: "cdi devices from envvar",
|
||||
input: cdiModifier{
|
||||
defaultKind: "nvidia.com/gpu",
|
||||
acceptEnvvarUnprivileged: true,
|
||||
acceptDeviceListAsVolumeMounts: true,
|
||||
},
|
||||
spec: &specs.Spec{
|
||||
Process: &specs.Process{
|
||||
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,example.com/class=device"},
|
||||
},
|
||||
},
|
||||
expectedDevices: []string{"nvidia.com/gpu=0", "example.com/class=device"},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
tc.input.logger = logger
|
||||
|
||||
spec := &oci.SpecMock{
|
||||
LoadFunc: func() (*specs.Spec, error) {
|
||||
return tc.spec, nil
|
||||
},
|
||||
}
|
||||
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
devices, err := tc.input.getDevicesFromSpec(spec)
|
||||
require.NoError(t, err)
|
||||
require.EqualValues(t, tc.expectedDevices, devices)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user