This commit is contained in:
Evan Lezar 2025-06-05 19:47:06 +00:00 committed by GitHub
commit 824aa5a005
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 171 additions and 63 deletions

View File

@ -54,8 +54,12 @@ type CUDA struct {
// NewCUDAImageFromSpec creates a CUDA image from the input OCI runtime spec.
// The process environment is read (if present) to construc the CUDA Image.
func NewCUDAImageFromSpec(spec *specs.Spec, opts ...Option) (CUDA, error) {
if spec == nil {
return New(opts...)
}
var env []string
if spec != nil && spec.Process != nil {
if spec.Process != nil {
env = spec.Process.Env
}
@ -212,19 +216,12 @@ func parseMajorMinorVersion(version string) (string, error) {
// OnlyFullyQualifiedCDIDevices returns true if all devices requested in the image are requested as CDI devices/
func (i CUDA) OnlyFullyQualifiedCDIDevices() bool {
var hasCDIdevice bool
for _, device := range i.VisibleDevicesFromEnvVar() {
for _, device := range i.VisibleDevices() {
if !parser.IsQualifiedName(device) {
return false
}
hasCDIdevice = true
}
for _, device := range i.DevicesFromMounts() {
if !strings.HasPrefix(device, "cdi/") {
return false
}
hasCDIdevice = true
}
return hasCDIdevice
}
@ -276,20 +273,27 @@ func (i CUDA) VisibleDevicesFromEnvVar() []string {
// visibleDevicesFromMounts returns the set of visible devices requested as mounts.
func (i CUDA) visibleDevicesFromMounts() []string {
var devices []string
for _, device := range i.DevicesFromMounts() {
for _, device := range i.requestsFromMounts() {
switch {
case strings.HasPrefix(device, volumeMountDevicePrefixCDI):
continue
case strings.HasPrefix(device, volumeMountDevicePrefixImex):
continue
case strings.HasPrefix(device, volumeMountDevicePrefixCDI):
name, err := cdiDeviceMountRequest(device).qualifiedName()
if err != nil {
i.logger.Warningf("Ignoring invalid mount request for CDI device %v: %w", device, err)
continue
}
devices = append(devices, name)
default:
devices = append(devices, device)
}
devices = append(devices, device)
}
return devices
}
// DevicesFromMounts returns a list of device specified as mounts.
func (i CUDA) DevicesFromMounts() []string {
// requestsFromMounts returns a list of device specified as mounts.
func (i CUDA) requestsFromMounts() []string {
root := filepath.Clean(DeviceListAsVolumeMountsRoot)
seen := make(map[string]bool)
var devices []string
@ -321,23 +325,30 @@ func (i CUDA) DevicesFromMounts() []string {
return devices
}
// CDIDevicesFromMounts returns a list of CDI devices specified as mounts on the image.
func (i CUDA) CDIDevicesFromMounts() []string {
var devices []string
for _, mountDevice := range i.DevicesFromMounts() {
if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixCDI) {
continue
}
parts := strings.SplitN(strings.TrimPrefix(mountDevice, volumeMountDevicePrefixCDI), "/", 3)
if len(parts) != 3 {
continue
}
vendor := parts[0]
class := parts[1]
device := parts[2]
devices = append(devices, fmt.Sprintf("%s/%s=%s", vendor, class, device))
// a cdiDeviceMountRequest represents a CDI device requests as a mount.
// Here the host path /dev/null is mounted to a particular path in the container.
// The container path has the form:
// /var/run/nvidia-container-devices/cdi/<vendor>/<class>/<device>
// or
// /var/run/nvidia-container-devices/cdi/<vendor>/<class>=<device>
type cdiDeviceMountRequest string
// qualifiedName returns the fully-qualified name of the CDI device.
func (m cdiDeviceMountRequest) qualifiedName() (string, error) {
if !strings.HasPrefix(string(m), volumeMountDevicePrefixCDI) {
return "", fmt.Errorf("invalid mount CDI device request: %s", m)
}
return devices
requestedDevice := strings.TrimPrefix(string(m), volumeMountDevicePrefixCDI)
if parser.IsQualifiedName(requestedDevice) {
return requestedDevice, nil
}
parts := strings.SplitN(requestedDevice, "/", 3)
if len(parts) != 3 {
return "", fmt.Errorf("invalid mount CDI device request: %s", m)
}
return fmt.Sprintf("%s/%s=%s", parts[0], parts[1], parts[2]), nil
}
// ImexChannelsFromEnvVar returns the list of IMEX channels requested for the image.
@ -352,7 +363,7 @@ func (i CUDA) ImexChannelsFromEnvVar() []string {
// ImexChannelsFromMounts returns the list of IMEX channels requested for the image.
func (i CUDA) ImexChannelsFromMounts() []string {
var channels []string
for _, mountDevice := range i.DevicesFromMounts() {
for _, mountDevice := range i.requestsFromMounts() {
if !strings.HasPrefix(mountDevice, volumeMountDevicePrefixImex) {
continue
}

View File

@ -487,9 +487,9 @@ func TestGetVisibleDevicesFromMounts(t *testing.T) {
expectedDevices: []string{"GPU0-MIG0/0/1", "GPU1-MIG0/0/1"},
},
{
description: "cdi devices are ignored",
mounts: makeTestMounts("GPU0", "cdi/nvidia.com/gpu=all", "GPU1"),
expectedDevices: []string{"GPU0", "GPU1"},
description: "cdi devices are included",
mounts: makeTestMounts("GPU0", "nvidia.com/gpu=all", "GPU1"),
expectedDevices: []string{"GPU0", "nvidia.com/gpu=all", "GPU1"},
},
{
description: "imex devices are ignored",

View File

@ -184,7 +184,7 @@ func TestResolveAutoMode(t *testing.T) {
expectedMode: "legacy",
},
{
description: "cdi mount and non-CDI envvar resolves to legacy",
description: "cdi mount and non-CDI envvar resolves to cdi",
mode: "auto",
envmap: map[string]string{
"NVIDIA_VISIBLE_DEVICES": "0",
@ -197,6 +197,22 @@ func TestResolveAutoMode(t *testing.T) {
"tegra": false,
"nvgpu": false,
},
expectedMode: "cdi",
},
{
description: "non-cdi mount and CDI envvar resolves to legacy",
mode: "auto",
envmap: map[string]string{
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0",
},
mounts: []string{
"/var/run/nvidia-container-devices/0",
},
info: map[string]bool{
"nvml": true,
"tegra": false,
"nvgpu": false,
},
expectedMode: "legacy",
},
}
@ -232,6 +248,8 @@ func TestResolveAutoMode(t *testing.T) {
image, _ := image.New(
image.WithEnvMap(tc.envmap),
image.WithMounts(mounts),
image.WithAcceptDeviceListAsVolumeMounts(true),
image.WithAcceptEnvvarUnprivileged(true),
)
mode := resolveMode(logger, tc.mode, image, properties)
require.EqualValues(t, tc.expectedMode, mode)

View File

@ -66,57 +66,66 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
}
func getDevicesFromSpec(logger logger.Interface, ociSpec oci.Spec, cfg *config.Config) ([]string, error) {
cdiModifier := &cdiModifier{
logger: logger,
acceptDeviceListAsVolumeMounts: cfg.AcceptDeviceListAsVolumeMounts,
acceptEnvvarUnprivileged: cfg.AcceptEnvvarUnprivileged,
annotationPrefixes: cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes,
defaultKind: cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind,
}
return cdiModifier.getDevicesFromSpec(ociSpec)
}
// TODO: We should rename this type.
type cdiModifier struct {
logger logger.Interface
acceptDeviceListAsVolumeMounts bool
acceptEnvvarUnprivileged bool
annotationPrefixes []string
defaultKind string
}
func (c *cdiModifier) getDevicesFromSpec(ociSpec oci.Spec) ([]string, error) {
rawSpec, err := ociSpec.Load()
if err != nil {
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
}
annotationDevices, err := getAnnotationDevices(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes, rawSpec.Annotations)
if err != nil {
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
}
if len(annotationDevices) > 0 {
return annotationDevices, nil
if rawSpec != nil {
annotationDevices, err := getAnnotationDevices(c.annotationPrefixes, rawSpec.Annotations)
if err != nil {
return nil, fmt.Errorf("failed to parse container annotations: %v", err)
}
if len(annotationDevices) > 0 {
return annotationDevices, nil
}
}
container, err := image.NewCUDAImageFromSpec(
rawSpec,
image.WithLogger(logger),
image.WithLogger(c.logger),
image.WithAcceptDeviceListAsVolumeMounts(c.acceptDeviceListAsVolumeMounts),
image.WithAcceptEnvvarUnprivileged(c.acceptEnvvarUnprivileged),
)
if err != nil {
return nil, err
}
if cfg.AcceptDeviceListAsVolumeMounts {
mountDevices := container.CDIDevicesFromMounts()
if len(mountDevices) > 0 {
return mountDevices, nil
}
}
var devices []string
seen := make(map[string]bool)
for _, name := range container.VisibleDevicesFromEnvVar() {
for _, name := range container.VisibleDevices() {
if !parser.IsQualifiedName(name) {
name = fmt.Sprintf("%s=%s", cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind, name)
name = fmt.Sprintf("%s=%s", c.defaultKind, name)
}
if seen[name] {
logger.Debugf("Ignoring duplicate device %q", name)
c.logger.Debugf("Ignoring duplicate device %q", name)
continue
}
seen[name] = true
devices = append(devices, name)
}
if len(devices) == 0 {
return nil, nil
}
if cfg.AcceptEnvvarUnprivileged || image.IsPrivileged((*image.OCISpec)(rawSpec)) {
return devices, nil
}
logger.Warningf("Ignoring devices specified in NVIDIA_VISIBLE_DEVICES: %v", devices)
return nil, nil
return devices, nil
}
// getAnnotationDevices returns a list of devices specified in the annotations.

View File

@ -20,7 +20,11 @@ import (
"fmt"
"testing"
"github.com/opencontainers/runtime-spec/specs-go"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
)
func TestGetAnnotationDevices(t *testing.T) {
@ -90,3 +94,69 @@ func TestGetAnnotationDevices(t *testing.T) {
})
}
}
func TestGetDevicesFromSpec(t *testing.T) {
logger, _ := testlog.NewNullLogger()
testCases := []struct {
description string
input cdiModifier
spec *specs.Spec
expectedDevices []string
}{
{
description: "empty spec yields no devices",
},
{
description: "cdi devices from mounts",
input: cdiModifier{
defaultKind: "nvidia.com/gpu",
acceptEnvvarUnprivileged: true,
acceptDeviceListAsVolumeMounts: true,
},
spec: &specs.Spec{
Mounts: []specs.Mount{
{
Destination: "/var/run/nvidia-container-devices/cdi/nvidia.com/gpu/0",
Source: "/dev/null",
},
{
Destination: "/var/run/nvidia-container-devices/cdi/nvidia.com/gpu/1",
Source: "/dev/null",
},
},
},
expectedDevices: []string{"nvidia.com/gpu=0", "nvidia.com/gpu=1"},
},
{
description: "cdi devices from envvar",
input: cdiModifier{
defaultKind: "nvidia.com/gpu",
acceptEnvvarUnprivileged: true,
acceptDeviceListAsVolumeMounts: true,
},
spec: &specs.Spec{
Process: &specs.Process{
Env: []string{"NVIDIA_VISIBLE_DEVICES=0,example.com/class=device"},
},
},
expectedDevices: []string{"nvidia.com/gpu=0", "example.com/class=device"},
},
}
for _, tc := range testCases {
tc.input.logger = logger
spec := &oci.SpecMock{
LoadFunc: func() (*specs.Spec, error) {
return tc.spec, nil
},
}
t.Run(tc.description, func(t *testing.T) {
devices, err := tc.input.getDevicesFromSpec(spec)
require.NoError(t, err)
require.EqualValues(t, tc.expectedDevices, devices)
})
}
}