Merge branch 'CNT-4301/resolve-auto-to-cdi' into 'main'

Resolve auto mode to cdi if all devices are cdi devices

See merge request nvidia/container-toolkit/container-toolkit!421
This commit is contained in:
Evan Lezar 2023-06-13 14:48:32 +00:00
commit 743d290577
9 changed files with 328 additions and 57 deletions

View File

@ -45,7 +45,7 @@ type nvidiaConfig struct {
type containerConfig struct {
Pid int
Rootfs string
Env map[string]string
Image image.CUDA
Nvidia *nvidiaConfig
}
@ -362,7 +362,7 @@ func getContainerConfig(hook HookConfig) (config containerConfig) {
return containerConfig{
Pid: h.Pid,
Rootfs: s.Root.Path,
Env: image,
Image: image,
Nvidia: getNvidiaConfig(&hook, image, s.Mounts, privileged),
}
}

View File

@ -78,10 +78,6 @@ func doPrestart() {
}
cli := hook.NvidiaContainerCLI
if !hook.NVIDIAContainerRuntimeHook.SkipModeDetection && info.ResolveAutoMode(&logInterceptor{}, hook.NVIDIAContainerRuntime.Mode) != "legacy" {
log.Panicln("invoking the NVIDIA Container Runtime Hook directly (e.g. specifying the docker --gpus flag) is not supported. Please use the NVIDIA Container Runtime (e.g. specify the --runtime=nvidia flag) instead.")
}
container := getContainerConfig(*hook)
nvidia := container.Nvidia
if nvidia == nil {
@ -89,6 +85,10 @@ func doPrestart() {
return
}
if !hook.NVIDIAContainerRuntimeHook.SkipModeDetection && info.ResolveAutoMode(&logInterceptor{}, hook.NVIDIAContainerRuntime.Mode, container.Image) != "legacy" {
log.Panicln("invoking the NVIDIA Container Runtime Hook directly (e.g. specifying the docker --gpus flag) is not supported. Please use the NVIDIA Container Runtime (e.g. specify the --runtime=nvidia flag) instead.")
}
rootfs := getRootfsPath(container)
args := []string{getCLIPath(cli)}

View File

@ -17,26 +17,50 @@
package info
import (
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
)
// infoInterface is a package-local defined type whose underlying type is
// info.Interface. Declaring it in this package gives moq a local name from
// which infoInterfaceMock is generated (see the go:generate directive below).
//
//go:generate moq -stub -out info-interface_mock.go . infoInterface
type infoInterface info.Interface
// resolver bundles the dependencies used when resolving the "auto" mode:
// a logger for reporting the decision and an info.Interface that is queried
// for platform properties (IsTegraSystem, HasNvml).
type resolver struct {
// logger receives the debug and info messages emitted during resolution.
logger logger.Interface
// info answers the platform queries; tests inject a mock through this field.
info info.Interface
}
// ResolveAutoMode determines the correct mode for the platform if set to "auto"
func ResolveAutoMode(logger logger.Interface, mode string) (rmode string) {
func ResolveAutoMode(logger logger.Interface, mode string, image image.CUDA) (rmode string) {
	// Wire up a resolver backed by the real platform information source and
	// delegate the decision to resolveMode, which holds the actual logic.
	return resolver{
		logger: logger,
		info:   info.New(),
	}.resolveMode(mode, image)
}
// resolveMode determines the correct mode for the platform if set to "auto"
func (r resolver) resolveMode(mode string, image image.CUDA) (rmode string) {
if mode != "auto" {
return mode
}
defer func() {
logger.Infof("Auto-detected mode as '%v'", rmode)
r.logger.Infof("Auto-detected mode as '%v'", rmode)
}()
nvinfo := info.New()
if onlyFullyQualifiedCDIDevices(image) {
return "cdi"
}
isTegra, reason := nvinfo.IsTegraSystem()
logger.Debugf("Is Tegra-based system? %v: %v", isTegra, reason)
isTegra, reason := r.info.IsTegraSystem()
r.logger.Debugf("Is Tegra-based system? %v: %v", isTegra, reason)
hasNVML, reason := nvinfo.HasNvml()
logger.Debugf("Has NVML? %v: %v", hasNVML, reason)
hasNVML, reason := r.info.HasNvml()
r.logger.Debugf("Has NVML? %v: %v", hasNVML, reason)
if isTegra && !hasNVML {
return "csv"
@ -44,3 +68,14 @@ func ResolveAutoMode(logger logger.Interface, mode string) (rmode string) {
return "legacy"
}
// onlyFullyQualifiedCDIDevices reports whether the image requests at least one
// device through the NVIDIA_VISIBLE_DEVICES environment variable and every
// requested device is a fully-qualified CDI device name.
func onlyFullyQualifiedCDIDevices(image image.CUDA) bool {
	requested := image.DevicesFromEnvvars("NVIDIA_VISIBLE_DEVICES").List()
	// No requested devices means there is nothing to identify as CDI.
	if len(requested) == 0 {
		return false
	}
	for i := range requested {
		// A single non-CDI entry disqualifies the whole request.
		if !cdi.IsQualifiedName(requested[i]) {
			return false
		}
	}
	return true
}

View File

@ -19,8 +19,10 @@ package info
import (
"testing"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvlib/info"
)
func TestResolveAutoMode(t *testing.T) {
@ -30,23 +32,123 @@ func TestResolveAutoMode(t *testing.T) {
description string
mode string
expectedMode string
info info.Interface
image image.CUDA
}{
{
description: "non-auto resolves to input",
mode: "not-auto",
expectedMode: "not-auto",
},
// TODO: The following test is brittle in that it will break on Tegra-based systems.
// {
// description: "auto resolves to legacy",
// mode: "auto",
// expectedMode: "legacy",
// },
{
description: "nvml non-tegra resolves to legacy",
mode: "auto",
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return true, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return false, "tegra"
},
},
expectedMode: "legacy",
},
{
description: "non-nvml non-tegra resolves to legacy",
mode: "auto",
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return false, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return false, "tegra"
},
},
expectedMode: "legacy",
},
{
description: "nvml tegra resolves to legacy",
mode: "auto",
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return true, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return true, "tegra"
},
},
expectedMode: "legacy",
},
{
description: "non-nvml tegra resolves to csv",
mode: "auto",
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return false, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return true, "tegra"
},
},
expectedMode: "csv",
},
{
description: "cdi devices resolves to cdi",
mode: "auto",
expectedMode: "cdi",
image: image.CUDA{
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=all",
},
},
{
description: "multiple cdi devices resolves to cdi",
mode: "auto",
expectedMode: "cdi",
image: image.CUDA{
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,nvidia.com/gpu=1",
},
},
{
description: "at least one non-cdi device resolves to legacy",
mode: "auto",
image: image.CUDA{
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
},
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return true, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return true, "tegra"
},
},
expectedMode: "legacy",
},
{
description: "at least one non-cdi device resolves to csv",
mode: "auto",
image: image.CUDA{
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
},
info: &infoInterfaceMock{
HasNvmlFunc: func() (bool, string) {
return false, "nvml"
},
IsTegraSystemFunc: func() (bool, string) {
return true, "tegra"
},
},
expectedMode: "csv",
},
}
for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
mode := ResolveAutoMode(logger, tc.mode)
r := resolver{
logger: logger,
info: tc.info,
}
mode := r.resolveMode(tc.mode, tc.image)
require.EqualValues(t, tc.expectedMode, mode)
})
}

View File

@ -0,0 +1,153 @@
// Code generated by moq; DO NOT EDIT.
// github.com/matryer/moq
package info
import (
"sync"
)
// Ensure, that infoInterfaceMock does implement infoInterface.
// If this is not the case, regenerate this file with moq.
var _ infoInterface = &infoInterfaceMock{}
// infoInterfaceMock is a mock implementation of infoInterface.
//
// func TestSomethingThatUsesinfoInterface(t *testing.T) {
//
// // make and configure a mocked infoInterface
// mockedinfoInterface := &infoInterfaceMock{
// HasDXCoreFunc: func() (bool, string) {
// panic("mock out the HasDXCore method")
// },
// HasNvmlFunc: func() (bool, string) {
// panic("mock out the HasNvml method")
// },
// IsTegraSystemFunc: func() (bool, string) {
// panic("mock out the IsTegraSystem method")
// },
// }
//
// // use mockedinfoInterface in code that requires infoInterface
// // and then make assertions.
//
// }
type infoInterfaceMock struct {
// HasDXCoreFunc mocks the HasDXCore method.
HasDXCoreFunc func() (bool, string)
// HasNvmlFunc mocks the HasNvml method.
HasNvmlFunc func() (bool, string)
// IsTegraSystemFunc mocks the IsTegraSystem method.
IsTegraSystemFunc func() (bool, string)
// calls tracks calls to the methods.
calls struct {
// HasDXCore holds details about calls to the HasDXCore method.
HasDXCore []struct {
}
// HasNvml holds details about calls to the HasNvml method.
HasNvml []struct {
}
// IsTegraSystem holds details about calls to the IsTegraSystem method.
IsTegraSystem []struct {
}
}
lockHasDXCore sync.RWMutex
lockHasNvml sync.RWMutex
lockIsTegraSystem sync.RWMutex
}
// HasDXCore calls HasDXCoreFunc.
func (mock *infoInterfaceMock) HasDXCore() (bool, string) {
callInfo := struct {
}{}
mock.lockHasDXCore.Lock()
mock.calls.HasDXCore = append(mock.calls.HasDXCore, callInfo)
mock.lockHasDXCore.Unlock()
if mock.HasDXCoreFunc == nil {
var (
bOut bool
sOut string
)
return bOut, sOut
}
return mock.HasDXCoreFunc()
}
// HasDXCoreCalls gets all the calls that were made to HasDXCore.
// Check the length with:
//
// len(mockedinfoInterface.HasDXCoreCalls())
func (mock *infoInterfaceMock) HasDXCoreCalls() []struct {
} {
var calls []struct {
}
mock.lockHasDXCore.RLock()
calls = mock.calls.HasDXCore
mock.lockHasDXCore.RUnlock()
return calls
}
// HasNvml calls HasNvmlFunc.
func (mock *infoInterfaceMock) HasNvml() (bool, string) {
callInfo := struct {
}{}
mock.lockHasNvml.Lock()
mock.calls.HasNvml = append(mock.calls.HasNvml, callInfo)
mock.lockHasNvml.Unlock()
if mock.HasNvmlFunc == nil {
var (
bOut bool
sOut string
)
return bOut, sOut
}
return mock.HasNvmlFunc()
}
// HasNvmlCalls gets all the calls that were made to HasNvml.
// Check the length with:
//
// len(mockedinfoInterface.HasNvmlCalls())
func (mock *infoInterfaceMock) HasNvmlCalls() []struct {
} {
var calls []struct {
}
mock.lockHasNvml.RLock()
calls = mock.calls.HasNvml
mock.lockHasNvml.RUnlock()
return calls
}
// IsTegraSystem calls IsTegraSystemFunc.
func (mock *infoInterfaceMock) IsTegraSystem() (bool, string) {
callInfo := struct {
}{}
mock.lockIsTegraSystem.Lock()
mock.calls.IsTegraSystem = append(mock.calls.IsTegraSystem, callInfo)
mock.lockIsTegraSystem.Unlock()
if mock.IsTegraSystemFunc == nil {
var (
bOut bool
sOut string
)
return bOut, sOut
}
return mock.IsTegraSystemFunc()
}
// IsTegraSystemCalls gets all the calls that were made to IsTegraSystem.
// Check the length with:
//
// len(mockedinfoInterface.IsTegraSystemCalls())
func (mock *infoInterfaceMock) IsTegraSystemCalls() []struct {
} {
var calls []struct {
}
mock.lockIsTegraSystem.RLock()
calls = mock.calls.IsTegraSystem
mock.lockIsTegraSystem.RUnlock()
return calls
}

View File

@ -32,17 +32,7 @@ const (
// NewGDSModifier creates the modifiers for GDS devices.
// If the spec does not contain the NVIDIA_GDS=enabled environment variable no changes are made.
func NewGDSModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
rawSpec, err := ociSpec.Load()
if err != nil {
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
}
image, err := image.NewCUDAImageFromSpec(rawSpec)
if err != nil {
return nil, err
}
func NewGDSModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 {
logger.Infof("No modification required; no devices requested")
return nil, nil

View File

@ -28,17 +28,7 @@ import (
// NewGraphicsModifier constructs a modifier that injects graphics-related modifications into an OCI runtime specification.
// The value of the NVIDIA_DRIVER_CAPABILITIES environment variable is checked to determine if this modification should be made.
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
rawSpec, err := ociSpec.Load()
if err != nil {
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
}
image, err := image.NewCUDAImageFromSpec(rawSpec)
if err != nil {
return nil, err
}
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
if required, reason := requiresGraphicsModifier(image); !required {
logger.Infof("No graphics modifier required: %v", reason)
return nil, nil

View File

@ -32,17 +32,7 @@ const (
// NewMOFEDModifier creates the modifiers for MOFED devices.
// If the spec does not contain the NVIDIA_MOFED=enabled environment variable no changes are made.
func NewMOFEDModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
rawSpec, err := ociSpec.Load()
if err != nil {
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
}
image, err := image.NewCUDAImageFromSpec(rawSpec)
if err != nil {
return nil, err
}
func NewMOFEDModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
if devices := image.DevicesFromEnvvars(visibleDevicesEnvvar); len(devices.List()) == 0 {
logger.Infof("No modification required; no devices requested")
return nil, nil

View File

@ -20,6 +20,7 @@ import (
"fmt"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/NVIDIA/nvidia-container-toolkit/internal/info"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier"
@ -61,7 +62,17 @@ func newNVIDIAContainerRuntime(logger logger.Interface, cfg *config.Config, argv
// newSpecModifier is a factory method that constructs an OCI spec modifier based on the provided config.
func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec, argv []string) (oci.SpecModifier, error) {
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode)
rawSpec, err := ociSpec.Load()
if err != nil {
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
}
image, err := image.NewCUDAImageFromSpec(rawSpec)
if err != nil {
return nil, err
}
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, argv)
if err != nil {
return nil, err
@ -71,17 +82,17 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
return modeModifier, nil
}
graphicsModifier, err := modifier.NewGraphicsModifier(logger, cfg, ociSpec)
graphicsModifier, err := modifier.NewGraphicsModifier(logger, cfg, image)
if err != nil {
return nil, err
}
gdsModifier, err := modifier.NewGDSModifier(logger, cfg, ociSpec)
gdsModifier, err := modifier.NewGDSModifier(logger, cfg, image)
if err != nil {
return nil, err
}
mofedModifier, err := modifier.NewMOFEDModifier(logger, cfg, ociSpec)
mofedModifier, err := modifier.NewMOFEDModifier(logger, cfg, image)
if err != nil {
return nil, err
}