Merge branch 'CNT-4301/resolve-auto-to-cdi' into 'main'

Resolve auto mode to cdi if all devices are cdi devices.

See merge request nvidia/container-toolkit/container-toolkit!421
2025-06-26 18:18:24 +00:00 · 2023-06-13 14:48:32 +00:00 · 2023-06-13 14:48:32 +00:00 · 743d290577
commit 743d290577
parent 2da32970b9 82347eb9bc
9 changed files with 328 additions and 57 deletions
--- a/cmd/nvidia-container-runtime-hook/container_config.go
+++ b/cmd/nvidia-container-runtime-hook/container_config.go
@ -45,7 +45,7 @@ type nvidiaConfig struct {
 type containerConfig struct {
 	Pid    int
 	Rootfs string
-	Env    map[string]string
+	Image  image.CUDA
 	Nvidia *nvidiaConfig
 }

@ -362,7 +362,7 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
 	return containerConfig{
 		Pid:    h.Pid,
 		Rootfs: s.Root.Path,
-		Env:    image,
+		Image:  image,
 		Nvidia: getNvidiaConfig(&hook, image, s.Mounts, privileged),
 	}
 }
--- a/cmd/nvidia-container-runtime-hook/main.go
+++ b/cmd/nvidia-container-runtime-hook/main.go
@ -78,10 +78,6 @@ func doPrestart() {
 	}
 	cli := hook.NvidiaContainerCLI

-	if !hook.NVIDIAContainerRuntimeHook.SkipModeDetection && info.ResolveAutoMode(&logInterceptor{}, hook.NVIDIAContainerRuntime.Mode) != "legacy" {
-		log.Panicln("invoking the NVIDIA Container Runtime Hook directly (e.g. specifying the docker --gpus flag) is not supported. Please use the NVIDIA Container Runtime (e.g. specify the --runtime=nvidia flag) instead.")
-	}
-
 	container := getContainerConfig(*hook)
 	nvidia := container.Nvidia
 	if nvidia == nil {
@ -89,6 +85,10 @@ func doPrestart() {
 		return
 	}

+	if !hook.NVIDIAContainerRuntimeHook.SkipModeDetection && info.ResolveAutoMode(&logInterceptor{}, hook.NVIDIAContainerRuntime.Mode, container.Image) != "legacy" {
+		log.Panicln("invoking the NVIDIA Container Runtime Hook directly (e.g. specifying the docker --gpus flag) is not supported. Please use the NVIDIA Container Runtime (e.g. specify the --runtime=nvidia flag) instead.")
+	}
+
 	rootfs := getRootfsPath(container)

 	args := []string{getCLIPath(cli)}
--- a/internal/info/auto.go
+++ b/internal/info/auto.go
@ -17,26 +17,50 @@
 package info

 import (
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
+	"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
 	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
 )

+// infoInterface provides an alias for mocking.
+//
+//go:generate moq -stub -out info-interface_mock.go . infoInterface
+type infoInterface info.Interface
+
+type resolver struct {
+	logger logger.Interface
+	info   info.Interface
+}
+
 // ResolveAutoMode determines the correct mode for the platform if set to "auto"
-func ResolveAutoMode(logger logger.Interface, mode string) (rmode string) {
+func ResolveAutoMode(logger logger.Interface, mode string, image image.CUDA) (rmode string) {
+	nvinfo := info.New()
+	r := resolver{
+		logger: logger,
+		info:   nvinfo,
+	}
+	return r.resolveMode(mode, image)
+}
+
+// resolveMode determines the correct mode for the platform if set to "auto"
+func (r resolver) resolveMode(mode string, image image.CUDA) (rmode string) {
 	if mode != "auto" {
 		return mode
 	}
 	defer func() {
-		logger.Infof("Auto-detected mode as '%v'", rmode)
+		r.logger.Infof("Auto-detected mode as '%v'", rmode)
 	}()

-	nvinfo := info.New()
+	if onlyFullyQualifiedCDIDevices(image) {
+		return "cdi"
+	}

-	isTegra, reason := nvinfo.IsTegraSystem()
-	logger.Debugf("Is Tegra-based system? %v: %v", isTegra, reason)
+	isTegra, reason := r.info.IsTegraSystem()
+	r.logger.Debugf("Is Tegra-based system? %v: %v", isTegra, reason)

-	hasNVML, reason := nvinfo.HasNvml()
-	logger.Debugf("Has NVML? %v: %v", hasNVML, reason)
+	hasNVML, reason := r.info.HasNvml()
+	r.logger.Debugf("Has NVML? %v: %v", hasNVML, reason)

 	if isTegra && !hasNVML {
 		return "csv"
@ -44,3 +68,14 @@ func ResolveAutoMode(logger logger.Interface, mode string) (rmode string) {

 	return "legacy"
 }
+
+func onlyFullyQualifiedCDIDevices(image image.CUDA) bool {
+	var hasCDIdevice bool
+	for _, device := range image.DevicesFromEnvvars("NVIDIA_VISIBLE_DEVICES").List() {
+		if !cdi.IsQualifiedName(device) {
+			return false
+		}
+		hasCDIdevice = true
+	}
+	return hasCDIdevice
+}
--- a/internal/info/auto_test.go
+++ b/internal/info/auto_test.go
@ -19,8 +19,10 @@ package info
 import (
 	"testing"

+	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
 	testlog "github.com/sirupsen/logrus/hooks/test"
 	"github.com/stretchr/testify/require"
+	"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
 )

 func TestResolveAutoMode(t *testing.T) {
@ -30,23 +32,123 @@ func TestResolveAutoMode(t *testing.T) {
 		description  string
 		mode         string
 		expectedMode string
+		info         info.Interface
+		image        image.CUDA
 	}{
 		{
 			description:  "non-auto resolves to input",
 			mode:         "not-auto",
 			expectedMode: "not-auto",
 		},
-		// TODO: The following test is brittle in that it will break on Tegra-based systems.
-		// {
-		// 	description:  "auto resolves to legacy",
-		// 	mode:         "auto",
-		// 	expectedMode: "legacy",
-		// },
+		{
+			description: "nvml non-tegra resolves to legacy",
+			mode:        "auto",
+			info: &infoInterfaceMock{
+				HasNvmlFunc: func() (bool, string) {
+					return true, "nvml"
+				},
+				IsTegraSystemFunc: func() (bool, string) {
+					return false, "tegra"
+				},
+			},
+			expectedMode: "legacy",
+		},
+		{
+			description: "non-nvml non-tegra resolves to legacy",
+			mode:        "auto",
+			info: &infoInterfaceMock{
+				HasNvmlFunc: func() (bool, string) {
+					return false, "nvml"
+				},
+				IsTegraSystemFunc: func() (bool, string) {
+					return false, "tegra"
+				},
+			},
+			expectedMode: "legacy",
+		},
+		{
+			description: "nvml tegra resolves to legacy",
+			mode:        "auto",
+			info: &infoInterfaceMock{
+				HasNvmlFunc: func() (bool, string) {
+					return true, "nvml"
+				},
+				IsTegraSystemFunc: func() (bool, string) {
+					return true, "tegra"
+				},
+			},
+			expectedMode: "legacy",
+		},
+		{
+			description: "non-nvml tegra resolves to csv",
+			mode:        "auto",
+			info: &infoInterfaceMock{
+				HasNvmlFunc: func() (bool, string) {
+					return false, "nvml"
+				},
+				IsTegraSystemFunc: func() (bool, string) {
+					return true, "tegra"
+				},
+			},
+			expectedMode: "csv",
+		},
+		{
+			description:  "cdi devices resolves to cdi",
+			mode:         "auto",
+			expectedMode: "cdi",
+			image: image.CUDA{
+				"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=all",
+			},
+		},
+		{
+			description:  "multiple cdi devices resolves to cdi",
+			mode:         "auto",
+			expectedMode: "cdi",
+			image: image.CUDA{
+				"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,nvidia.com/gpu=1",
+			},
+		},
+		{
+			description: "at least one non-cdi device resolves to legacy",
+			mode:        "auto",
+			image: image.CUDA{
+				"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
+			},
+			info: &infoInterfaceMock{
+				HasNvmlFunc: func() (bool, string) {
+					return true, "nvml"
+				},
+				IsTegraSystemFunc: func() (bool, string) {
+					return true, "tegra"
+				},
+			},
+			expectedMode: "legacy",
+		},
+		{
+			description: "at least one non-cdi device resolves to csv",
+			mode:        "auto",
+			image: image.CUDA{
+				"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
+			},
+			info: &infoInterfaceMock{
+				HasNvmlFunc: func() (bool, string) {
+					return false, "nvml"
+				},
+				IsTegraSystemFunc: func() (bool, string) {
+					return true, "tegra"
+				},
+			},
+			expectedMode: "csv",
+		},
 	}

 	for _, tc := range testCases {
 		t.Run(tc.description, func(t *testing.T) {
-			mode := ResolveAutoMode(logger, tc.mode)
+			r := resolver{
+				logger: logger,
+				info:   tc.info,
+			}
+			mode := r.resolveMode(tc.mode, tc.image)
 			require.EqualValues(t, tc.expectedMode, mode)
 		})
 	}
--- a/internal/info/info-interface_mock.go
+++ b/internal/info/info-interface_mock.go
@ -0,0 +1,153 @@
+// Code generated by moq; DO NOT EDIT.
+// github.com/matryer/moq
+
+package info
+
+import (
+	"sync"
+)
+
+// Ensure, that infoInterfaceMock does implement infoInterface.
+// If this is not the case, regenerate this file with moq.
+var _ infoInterface = &infoInterfaceMock{}
+
+// infoInterfaceMock is a mock implementation of infoInterface.
+//
+//	func TestSomethingThatUsesinfoInterface(t *testing.T) {
+//
+//		// make and configure a mocked infoInterface
+//		mockedinfoInterface := &infoInterfaceMock{
+//			HasDXCoreFunc: func() (bool, string) {
+//				panic("mock out the HasDXCore method")
+//			},
+//			HasNvmlFunc: func() (bool, string) {
+//				panic("mock out the HasNvml method")
+//			},
+//			IsTegraSystemFunc: func() (bool, string) {
+//				panic("mock out the IsTegraSystem method")
+//			},
+//		}
+//
+//		// use mockedinfoInterface in code that requires infoInterface
+//		// and then make assertions.
+//
+//	}
+type infoInterfaceMock struct {
+	// HasDXCoreFunc mocks the HasDXCore method.
+	HasDXCoreFunc func() (bool, string)
+
+	// HasNvmlFunc mocks the HasNvml method.
+	HasNvmlFunc func() (bool, string)
+
+	// IsTegraSystemFunc mocks the IsTegraSystem method.
+	IsTegraSystemFunc func() (bool, string)
+
+	// calls tracks calls to the methods.
+	calls struct {
+		// HasDXCore holds details about calls to the HasDXCore method.
+		HasDXCore []struct {
+		}
+		// HasNvml holds details about calls to the HasNvml method.
+		HasNvml []struct {
+		}
+		// IsTegraSystem holds details about calls to the IsTegraSystem method.
+		IsTegraSystem []struct {
+		}
+	}
+	lockHasDXCore     sync.RWMutex
+	lockHasNvml       sync.RWMutex
+	lockIsTegraSystem sync.RWMutex
+}
+
+// HasDXCore calls HasDXCoreFunc.
+func (mock *infoInterfaceMock) HasDXCore() (bool, string) {
+	callInfo := struct {
+	}{}
+	mock.lockHasDXCore.Lock()
+	mock.calls.HasDXCore = append(mock.calls.HasDXCore, callInfo)
+	mock.lockHasDXCore.Unlock()
+	if mock.HasDXCoreFunc == nil {
+		var (
+			bOut bool
+			sOut string
+		)
+		return bOut, sOut
+	}
+	return mock.HasDXCoreFunc()
+}
+
+// HasDXCoreCalls gets all the calls that were made to HasDXCore.
+// Check the length with:
+//
+//	len(mockedinfoInterface.HasDXCoreCalls())
+func (mock *infoInterfaceMock) HasDXCoreCalls() []struct {
+} {
+	var calls []struct {
+	}
+	mock.lockHasDXCore.RLock()
+	calls = mock.calls.HasDXCore
+	mock.lockHasDXCore.RUnlock()
+	return calls
+}
+
+// HasNvml calls HasNvmlFunc.
+func (mock *infoInterfaceMock) HasNvml() (bool, string) {
+	callInfo := struct {
+	}{}
+	mock.lockHasNvml.Lock()
+	mock.calls.HasNvml = append(mock.calls.HasNvml, callInfo)
+	mock.lockHasNvml.Unlock()
+	if mock.HasNvmlFunc == nil {
+		var (
+			bOut bool
+			sOut string
+		)
+		return bOut, sOut
+	}
+	return mock.HasNvmlFunc()
+}
+
+// HasNvmlCalls gets all the calls that were made to HasNvml.
+// Check the length with:
+//
+//	len(mockedinfoInterface.HasNvmlCalls())
+func (mock *infoInterfaceMock) HasNvmlCalls() []struct {
+} {
+	var calls []struct {
+	}
+	mock.lockHasNvml.RLock()
+	calls = mock.calls.HasNvml
+	mock.lockHasNvml.RUnlock()
+	return calls
+}
+
+// IsTegraSystem calls IsTegraSystemFunc.
+func (mock *infoInterfaceMock) IsTegraSystem() (bool, string) {
+	callInfo := struct {
+	}{}
+	mock.lockIsTegraSystem.Lock()
+	mock.calls.IsTegraSystem = append(mock.calls.IsTegraSystem, callInfo)
+	mock.lockIsTegraSystem.Unlock()
+	if mock.IsTegraSystemFunc == nil {
+		var (
+			bOut bool
+			sOut string
+		)
+		return bOut, sOut
+	}
+	return mock.IsTegraSystemFunc()
+}
+
+// IsTegraSystemCalls gets all the calls that were made to IsTegraSystem.
+// Check the length with:
+//
+//	len(mockedinfoInterface.IsTegraSystemCalls())
+func (mock *infoInterfaceMock) IsTegraSystemCalls() []struct {
+} {
+	var calls []struct {
+	}
+	mock.lockIsTegraSystem.RLock()
+	calls = mock.calls.IsTegraSystem
+	mock.lockIsTegraSystem.RUnlock()
+	return calls
+}
--- a/internal/modifier/gds.go
+++ b/internal/modifier/gds.go
@ -32,17 +32,7 @@ const (

 // NewGDSModifier creates the modifiers for GDS devices.
 // If the spec does not contain the NVIDIA_GDS=enabled environment variable no changes are made.
-func NewGDSModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
-	rawSpec, err := ociSpec.Load()
-	if err != nil {
-		return nil, fmt.Errorf("failed to load OCI spec: %v", err)
-	}
-
-	image, err := image.NewCUDAImageFromSpec(rawSpec)
-	if err != nil {
-		return nil, err
-	}
-
+func NewGDSModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
 	if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 {
 		logger.Infof("No modification required; no devices requested")
 		return nil, nil
--- a/internal/modifier/graphics.go
+++ b/internal/modifier/graphics.go
@ -28,17 +28,7 @@ import (

 // NewGraphicsModifier constructs a modifier that injects graphics-related modifications into an OCI runtime specification.
 // The value of the NVIDIA_DRIVER_CAPABILITIES environment variable is checked to determine if this modification should be made.
-func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
-	rawSpec, err := ociSpec.Load()
-	if err != nil {
-		return nil, fmt.Errorf("failed to load OCI spec: %v", err)
-	}
-
-	image, err := image.NewCUDAImageFromSpec(rawSpec)
-	if err != nil {
-		return nil, err
-	}
-
+func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
 	if required, reason := requiresGraphicsModifier(image); !required {
 		logger.Infof("No graphics modifier required: %v", reason)
 		return nil, nil
--- a/internal/modifier/mofed.go
+++ b/internal/modifier/mofed.go
@ -32,17 +32,7 @@ const (

 // NewMOFEDModifier creates the modifiers for MOFED devices.
 // If the spec does not contain the NVIDIA_MOFED=enabled environment variable no changes are made.
-func NewMOFEDModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
-	rawSpec, err := ociSpec.Load()
-	if err != nil {
-		return nil, fmt.Errorf("failed to load OCI spec: %v", err)
-	}
-
-	image, err := image.NewCUDAImageFromSpec(rawSpec)
-	if err != nil {
-		return nil, err
-	}
-
+func NewMOFEDModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
 	if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 {
 		logger.Infof("No modification required; no devices requested")
 		return nil, nil
--- a/internal/runtime/runtime_factory.go
+++ b/internal/runtime/runtime_factory.go
@ -20,6 +20,7 @@ import (
 	"fmt"

 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/info"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier"
@ -61,7 +62,17 @@ func newNVIDIAContainerRuntime(logger logger.Interface, cfg *config.Config, argv

 // newSpecModifier is a factory method that constructs an OCI spec modifier based on the provided config.
 func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec, argv []string) (oci.SpecModifier, error) {
-	mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode)
+	rawSpec, err := ociSpec.Load()
+	if err != nil {
+		return nil, fmt.Errorf("failed to load OCI spec: %v", err)
+	}
+
+	image, err := image.NewCUDAImageFromSpec(rawSpec)
+	if err != nil {
+		return nil, err
+	}
+
+	mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
 	modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, argv)
 	if err != nil {
 		return nil, err
@ -71,17 +82,17 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
 		return modeModifier, nil
 	}

-	graphicsModifier, err := modifier.NewGraphicsModifier(logger, cfg, ociSpec)
+	graphicsModifier, err := modifier.NewGraphicsModifier(logger, cfg, image)
 	if err != nil {
 		return nil, err
 	}

-	gdsModifier, err := modifier.NewGDSModifier(logger, cfg, ociSpec)
+	gdsModifier, err := modifier.NewGDSModifier(logger, cfg, image)
 	if err != nil {
 		return nil, err
 	}

-	mofedModifier, err := modifier.NewMOFEDModifier(logger, cfg, ociSpec)
+	mofedModifier, err := modifier.NewMOFEDModifier(logger, cfg, image)
 	if err != nil {
 		return nil, err
 	}