mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-16 11:30:20 +00:00
Merge 153a277002
into b33d475ff3
This commit is contained in:
commit
8e38fed488
@ -242,7 +242,14 @@ func (hookConfig *hookConfig) getNvidiaConfig(image image.CUDA, privileged bool)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (hookConfig *hookConfig) getContainerConfig() (config containerConfig) {
|
func (hookConfig *hookConfig) getContainerConfig() (config *containerConfig) {
|
||||||
|
hookConfig.Lock()
|
||||||
|
defer hookConfig.Unlock()
|
||||||
|
|
||||||
|
if hookConfig.containerConfig != nil {
|
||||||
|
return hookConfig.containerConfig
|
||||||
|
}
|
||||||
|
|
||||||
var h HookState
|
var h HookState
|
||||||
d := json.NewDecoder(os.Stdin)
|
d := json.NewDecoder(os.Stdin)
|
||||||
if err := d.Decode(&h); err != nil {
|
if err := d.Decode(&h); err != nil {
|
||||||
@ -271,10 +278,13 @@ func (hookConfig *hookConfig) getContainerConfig() (config containerConfig) {
|
|||||||
log.Panicln(err)
|
log.Panicln(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
return containerConfig{
|
cc := containerConfig{
|
||||||
Pid: h.Pid,
|
Pid: h.Pid,
|
||||||
Rootfs: s.Root.Path,
|
Rootfs: s.Root.Path,
|
||||||
Image: i,
|
Image: i,
|
||||||
Nvidia: hookConfig.getNvidiaConfig(i, privileged),
|
Nvidia: hookConfig.getNvidiaConfig(i, privileged),
|
||||||
}
|
}
|
||||||
|
hookConfig.containerConfig = &cc
|
||||||
|
|
||||||
|
return hookConfig.containerConfig
|
||||||
}
|
}
|
||||||
|
@ -487,7 +487,7 @@ func TestGetNvidiaConfig(t *testing.T) {
|
|||||||
hookCfg := tc.hookConfig
|
hookCfg := tc.hookConfig
|
||||||
if hookCfg == nil {
|
if hookCfg == nil {
|
||||||
defaultConfig, _ := config.GetDefault()
|
defaultConfig, _ := config.GetDefault()
|
||||||
hookCfg = &hookConfig{defaultConfig}
|
hookCfg = &hookConfig{Config: defaultConfig}
|
||||||
}
|
}
|
||||||
cfg = hookCfg.getNvidiaConfig(image, tc.privileged)
|
cfg = hookCfg.getNvidiaConfig(image, tc.privileged)
|
||||||
}
|
}
|
||||||
|
@ -7,9 +7,11 @@ import (
|
|||||||
"path"
|
"path"
|
||||||
"reflect"
|
"reflect"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
|
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||||
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/info"
|
||||||
)
|
)
|
||||||
|
|
||||||
const (
|
const (
|
||||||
@ -20,7 +22,9 @@ const (
|
|||||||
// hookConfig wraps the toolkit config.
|
// hookConfig wraps the toolkit config.
|
||||||
// This allows for functions to be defined on the local type.
|
// This allows for functions to be defined on the local type.
|
||||||
type hookConfig struct {
|
type hookConfig struct {
|
||||||
|
sync.Mutex
|
||||||
*config.Config
|
*config.Config
|
||||||
|
containerConfig *containerConfig
|
||||||
}
|
}
|
||||||
|
|
||||||
// loadConfig loads the required paths for the hook config.
|
// loadConfig loads the required paths for the hook config.
|
||||||
@ -55,7 +59,7 @@ func getHookConfig() (*hookConfig, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to load config: %v", err)
|
return nil, fmt.Errorf("failed to load config: %v", err)
|
||||||
}
|
}
|
||||||
config := &hookConfig{cfg}
|
config := &hookConfig{Config: cfg}
|
||||||
|
|
||||||
allSupportedDriverCapabilities := image.SupportedDriverCapabilities
|
allSupportedDriverCapabilities := image.SupportedDriverCapabilities
|
||||||
if config.SupportedDriverCapabilities == "all" {
|
if config.SupportedDriverCapabilities == "all" {
|
||||||
@ -73,8 +77,8 @@ func getHookConfig() (*hookConfig, error) {
|
|||||||
|
|
||||||
// getConfigOption returns the toml config option associated with the
|
// getConfigOption returns the toml config option associated with the
|
||||||
// specified struct field.
|
// specified struct field.
|
||||||
func (c hookConfig) getConfigOption(fieldName string) string {
|
func (c *hookConfig) getConfigOption(fieldName string) string {
|
||||||
t := reflect.TypeOf(c)
|
t := reflect.TypeOf(&c)
|
||||||
f, ok := t.FieldByName(fieldName)
|
f, ok := t.FieldByName(fieldName)
|
||||||
if !ok {
|
if !ok {
|
||||||
return fieldName
|
return fieldName
|
||||||
@ -127,3 +131,21 @@ func (c *hookConfig) nvidiaContainerCliCUDACompatModeFlags() []string {
|
|||||||
}
|
}
|
||||||
return []string{flag}
|
return []string{flag}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *hookConfig) assertModeIsLegacy() error {
|
||||||
|
if c.NVIDIAContainerRuntimeHookConfig.SkipModeDetection {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
mr := info.NewRuntimeModeResolver(
|
||||||
|
info.WithLogger(&logInterceptor{}),
|
||||||
|
info.WithImage(&c.containerConfig.Image),
|
||||||
|
info.WithDefaultMode(info.RuntimeModeLegacy),
|
||||||
|
)
|
||||||
|
|
||||||
|
mode := mr.ResolveRuntimeMode(c.NVIDIAContainerRuntimeConfig.Mode)
|
||||||
|
if mode == "legacy" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return fmt.Errorf("invoking the NVIDIA Container Runtime Hook directly (e.g. specifying the docker --gpus flag) is not supported. Please use the NVIDIA Container Runtime (e.g. specify the --runtime=nvidia flag) instead")
|
||||||
|
}
|
||||||
|
@ -90,10 +90,10 @@ func TestGetHookConfig(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var cfg hookConfig
|
var cfg *hookConfig
|
||||||
getHookConfig := func() {
|
getHookConfig := func() {
|
||||||
c, _ := getHookConfig()
|
c, _ := getHookConfig()
|
||||||
cfg = *c
|
cfg = c
|
||||||
}
|
}
|
||||||
|
|
||||||
if tc.expectedPanic {
|
if tc.expectedPanic {
|
||||||
|
@ -55,7 +55,7 @@ func getCLIPath(config config.ContainerCLIConfig) string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// getRootfsPath returns an absolute path. We don't need to resolve symlinks for now.
|
// getRootfsPath returns an absolute path. We don't need to resolve symlinks for now.
|
||||||
func getRootfsPath(config containerConfig) string {
|
func getRootfsPath(config *containerConfig) string {
|
||||||
rootfs, err := filepath.Abs(config.Rootfs)
|
rootfs, err := filepath.Abs(config.Rootfs)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Panicln(err)
|
log.Panicln(err)
|
||||||
@ -82,8 +82,8 @@ func doPrestart() {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
if !hook.NVIDIAContainerRuntimeHookConfig.SkipModeDetection && info.ResolveAutoMode(&logInterceptor{}, hook.NVIDIAContainerRuntimeConfig.Mode, container.Image) != "legacy" {
|
if err := hook.assertModeIsLegacy(); err != nil {
|
||||||
log.Panicln("invoking the NVIDIA Container Runtime Hook directly (e.g. specifying the docker --gpus flag) is not supported. Please use the NVIDIA Container Runtime (e.g. specify the --runtime=nvidia flag) instead.")
|
log.Panicf("%v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
rootfs := getRootfsPath(container)
|
rootfs := getRootfsPath(container)
|
||||||
|
@ -122,11 +122,10 @@ func TestGoodInput(t *testing.T) {
|
|||||||
err = cmdCreate.Run()
|
err = cmdCreate.Run()
|
||||||
require.NoError(t, err, "runtime should not return an error")
|
require.NoError(t, err, "runtime should not return an error")
|
||||||
|
|
||||||
// Check config.json for NVIDIA prestart hook
|
// Check config.json to ensure that the NVIDIA prestart was not inserted.
|
||||||
spec, err = cfg.getRuntimeSpec()
|
spec, err = cfg.getRuntimeSpec()
|
||||||
require.NoError(t, err, "should be no errors when reading and parsing spec from config.json")
|
require.NoError(t, err, "should be no errors when reading and parsing spec from config.json")
|
||||||
require.NotEmpty(t, spec.Hooks, "there should be hooks in config.json")
|
require.Empty(t, spec.Hooks, "there should be no hooks in config.json")
|
||||||
require.Equal(t, 1, nvidiaHookCount(spec.Hooks), "exactly one nvidia prestart hook should be inserted correctly into config.json")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// NVIDIA prestart hook already present in config file
|
// NVIDIA prestart hook already present in config file
|
||||||
@ -168,11 +167,10 @@ func TestDuplicateHook(t *testing.T) {
|
|||||||
output, err := cmdCreate.CombinedOutput()
|
output, err := cmdCreate.CombinedOutput()
|
||||||
require.NoErrorf(t, err, "runtime should not return an error", "output=%v", string(output))
|
require.NoErrorf(t, err, "runtime should not return an error", "output=%v", string(output))
|
||||||
|
|
||||||
// Check config.json for NVIDIA prestart hook
|
// Check config.json to ensure that the NVIDIA prestart hook was removed.
|
||||||
spec, err = cfg.getRuntimeSpec()
|
spec, err = cfg.getRuntimeSpec()
|
||||||
require.NoError(t, err, "should be no errors when reading and parsing spec from config.json")
|
require.NoError(t, err, "should be no errors when reading and parsing spec from config.json")
|
||||||
require.NotEmpty(t, spec.Hooks, "there should be hooks in config.json")
|
require.Empty(t, spec.Hooks, "there should be no hooks in config.json")
|
||||||
require.Equal(t, 1, nvidiaHookCount(spec.Hooks), "exactly one nvidia prestart hook should be inserted correctly into config.json")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// addNVIDIAHook is a basic wrapper for an addHookModifier that is used for
|
// addNVIDIAHook is a basic wrapper for an addHookModifier that is used for
|
||||||
@ -240,18 +238,3 @@ func (c testConfig) generateNewRuntimeSpec() error {
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return number of valid NVIDIA prestart hooks in runtime spec
|
|
||||||
func nvidiaHookCount(hooks *specs.Hooks) int {
|
|
||||||
if hooks == nil {
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
count := 0
|
|
||||||
for _, hook := range hooks.Prestart {
|
|
||||||
if strings.Contains(hook.Path, nvidiaHook) {
|
|
||||||
count++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return count
|
|
||||||
}
|
|
||||||
|
@ -19,6 +19,7 @@ package image
|
|||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"slices"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
@ -281,17 +282,24 @@ func (i CUDA) cdiDeviceRequestsFromAnnotations() []string {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
var devices []string
|
var annotationKeys []string
|
||||||
for key, value := range i.annotations {
|
for key := range i.annotations {
|
||||||
for _, prefix := range i.annotationsPrefixes {
|
for _, prefix := range i.annotationsPrefixes {
|
||||||
if strings.HasPrefix(key, prefix) {
|
if strings.HasPrefix(key, prefix) {
|
||||||
devices = append(devices, strings.Split(value, ",")...)
|
annotationKeys = append(annotationKeys, key)
|
||||||
// There is no need to check additional prefixes since we
|
// There is no need to check additional prefixes since we
|
||||||
// typically deduplicate devices in any case.
|
// typically deduplicate devices in any case.
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// We sort the annotationKeys for consistent results.
|
||||||
|
slices.Sort(annotationKeys)
|
||||||
|
|
||||||
|
var devices []string
|
||||||
|
for _, key := range annotationKeys {
|
||||||
|
devices = append(devices, strings.Split(i.annotations[key], ",")...)
|
||||||
|
}
|
||||||
return devices
|
return devices
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,34 +23,101 @@ import (
|
|||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||||
)
|
)
|
||||||
|
|
||||||
// ResolveAutoMode determines the correct mode for the platform if set to "auto"
|
// A RuntimeMode is used to select a specific mode of operation for the NVIDIA Container Runtime.
|
||||||
func ResolveAutoMode(logger logger.Interface, mode string, image image.CUDA) (rmode string) {
|
type RuntimeMode string
|
||||||
return resolveMode(logger, mode, image, nil)
|
|
||||||
|
const (
|
||||||
|
RuntimeModeLegacy = RuntimeMode("legacy")
|
||||||
|
RuntimeModeCSV = RuntimeMode("csv")
|
||||||
|
RuntimeModeCDI = RuntimeMode("cdi")
|
||||||
|
RuntimeModeJitCDI = RuntimeMode("jit-cdi")
|
||||||
|
)
|
||||||
|
|
||||||
|
type RuntimeModeResolver interface {
|
||||||
|
ResolveRuntimeMode(string) RuntimeMode
|
||||||
}
|
}
|
||||||
|
|
||||||
func resolveMode(logger logger.Interface, mode string, image image.CUDA, propertyExtractor info.PropertyExtractor) (rmode string) {
|
type modeResolver struct {
|
||||||
|
logger logger.Interface
|
||||||
|
// TODO: This only needs to consider the requested devices.
|
||||||
|
image *image.CUDA
|
||||||
|
propertyExtractor info.PropertyExtractor
|
||||||
|
defaultMode RuntimeMode
|
||||||
|
}
|
||||||
|
|
||||||
|
type Option func(*modeResolver)
|
||||||
|
|
||||||
|
func WithDefaultMode(defaultMode RuntimeMode) Option {
|
||||||
|
return func(mr *modeResolver) {
|
||||||
|
mr.defaultMode = defaultMode
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func WithLogger(logger logger.Interface) Option {
|
||||||
|
return func(mr *modeResolver) {
|
||||||
|
mr.logger = logger
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func WithImage(image *image.CUDA) Option {
|
||||||
|
return func(mr *modeResolver) {
|
||||||
|
mr.image = image
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func WithPropertyExtractor(propertyExtractor info.PropertyExtractor) Option {
|
||||||
|
return func(mr *modeResolver) {
|
||||||
|
mr.propertyExtractor = propertyExtractor
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewRuntimeModeResolver(opts ...Option) RuntimeModeResolver {
|
||||||
|
r := &modeResolver{
|
||||||
|
defaultMode: RuntimeModeJitCDI,
|
||||||
|
}
|
||||||
|
for _, opt := range opts {
|
||||||
|
opt(r)
|
||||||
|
}
|
||||||
|
if r.logger == nil {
|
||||||
|
r.logger = &logger.NullLogger{}
|
||||||
|
}
|
||||||
|
|
||||||
|
return r
|
||||||
|
}
|
||||||
|
|
||||||
|
// ResolveAutoMode determines the correct mode for the platform if set to "auto"
|
||||||
|
func ResolveAutoMode(logger logger.Interface, mode string, image image.CUDA) (rmode RuntimeMode) {
|
||||||
|
r := modeResolver{
|
||||||
|
logger: logger,
|
||||||
|
image: &image,
|
||||||
|
propertyExtractor: nil,
|
||||||
|
}
|
||||||
|
return r.ResolveRuntimeMode(mode)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (m *modeResolver) ResolveRuntimeMode(mode string) (rmode RuntimeMode) {
|
||||||
if mode != "auto" {
|
if mode != "auto" {
|
||||||
logger.Infof("Using requested mode '%s'", mode)
|
m.logger.Infof("Using requested mode '%s'", mode)
|
||||||
return mode
|
return RuntimeMode(mode)
|
||||||
}
|
}
|
||||||
defer func() {
|
defer func() {
|
||||||
logger.Infof("Auto-detected mode as '%v'", rmode)
|
m.logger.Infof("Auto-detected mode as '%v'", rmode)
|
||||||
}()
|
}()
|
||||||
|
|
||||||
if image.OnlyFullyQualifiedCDIDevices() {
|
if m.image.OnlyFullyQualifiedCDIDevices() {
|
||||||
return "cdi"
|
return RuntimeModeCDI
|
||||||
}
|
}
|
||||||
|
|
||||||
nvinfo := info.New(
|
nvinfo := info.New(
|
||||||
info.WithLogger(logger),
|
info.WithLogger(m.logger),
|
||||||
info.WithPropertyExtractor(propertyExtractor),
|
info.WithPropertyExtractor(m.propertyExtractor),
|
||||||
)
|
)
|
||||||
|
|
||||||
switch nvinfo.ResolvePlatform() {
|
switch nvinfo.ResolvePlatform() {
|
||||||
case info.PlatformNVML, info.PlatformWSL:
|
case info.PlatformNVML, info.PlatformWSL:
|
||||||
return "legacy"
|
return m.defaultMode
|
||||||
case info.PlatformTegra:
|
case info.PlatformTegra:
|
||||||
return "csv"
|
return RuntimeModeCSV
|
||||||
}
|
}
|
||||||
return "legacy"
|
return m.defaultMode
|
||||||
}
|
}
|
||||||
|
@ -43,11 +43,16 @@ func TestResolveAutoMode(t *testing.T) {
|
|||||||
mode: "not-auto",
|
mode: "not-auto",
|
||||||
expectedMode: "not-auto",
|
expectedMode: "not-auto",
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
description: "legacy resolves to legacy",
|
||||||
|
mode: "legacy",
|
||||||
|
expectedMode: "legacy",
|
||||||
|
},
|
||||||
{
|
{
|
||||||
description: "no info defaults to legacy",
|
description: "no info defaults to legacy",
|
||||||
mode: "auto",
|
mode: "auto",
|
||||||
info: map[string]bool{},
|
info: map[string]bool{},
|
||||||
expectedMode: "legacy",
|
expectedMode: "jit-cdi",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "non-nvml, non-tegra, nvgpu resolves to csv",
|
description: "non-nvml, non-tegra, nvgpu resolves to csv",
|
||||||
@ -80,14 +85,14 @@ func TestResolveAutoMode(t *testing.T) {
|
|||||||
expectedMode: "csv",
|
expectedMode: "csv",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "nvml, non-tegra, non-nvgpu resolves to legacy",
|
description: "nvml, non-tegra, non-nvgpu resolves to jit-cdi",
|
||||||
mode: "auto",
|
mode: "auto",
|
||||||
info: map[string]bool{
|
info: map[string]bool{
|
||||||
"nvml": true,
|
"nvml": true,
|
||||||
"tegra": false,
|
"tegra": false,
|
||||||
"nvgpu": false,
|
"nvgpu": false,
|
||||||
},
|
},
|
||||||
expectedMode: "legacy",
|
expectedMode: "jit-cdi",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "nvml, non-tegra, nvgpu resolves to csv",
|
description: "nvml, non-tegra, nvgpu resolves to csv",
|
||||||
@ -100,14 +105,14 @@ func TestResolveAutoMode(t *testing.T) {
|
|||||||
expectedMode: "csv",
|
expectedMode: "csv",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "nvml, tegra, non-nvgpu resolves to legacy",
|
description: "nvml, tegra, non-nvgpu resolves to jit-cdi",
|
||||||
mode: "auto",
|
mode: "auto",
|
||||||
info: map[string]bool{
|
info: map[string]bool{
|
||||||
"nvml": true,
|
"nvml": true,
|
||||||
"tegra": true,
|
"tegra": true,
|
||||||
"nvgpu": false,
|
"nvgpu": false,
|
||||||
},
|
},
|
||||||
expectedMode: "legacy",
|
expectedMode: "jit-cdi",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "nvml, tegra, nvgpu resolves to csv",
|
description: "nvml, tegra, nvgpu resolves to csv",
|
||||||
@ -136,7 +141,7 @@ func TestResolveAutoMode(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "at least one non-cdi device resolves to legacy",
|
description: "at least one non-cdi device resolves to jit-cdi",
|
||||||
mode: "auto",
|
mode: "auto",
|
||||||
envmap: map[string]string{
|
envmap: map[string]string{
|
||||||
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
|
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0,0",
|
||||||
@ -146,7 +151,7 @@ func TestResolveAutoMode(t *testing.T) {
|
|||||||
"tegra": false,
|
"tegra": false,
|
||||||
"nvgpu": false,
|
"nvgpu": false,
|
||||||
},
|
},
|
||||||
expectedMode: "legacy",
|
expectedMode: "jit-cdi",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "at least one non-cdi device resolves to csv",
|
description: "at least one non-cdi device resolves to csv",
|
||||||
@ -170,7 +175,7 @@ func TestResolveAutoMode(t *testing.T) {
|
|||||||
expectedMode: "cdi",
|
expectedMode: "cdi",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "cdi mount and non-CDI devices resolves to legacy",
|
description: "cdi mount and non-CDI devices resolves to jit-cdi",
|
||||||
mode: "auto",
|
mode: "auto",
|
||||||
mounts: []string{
|
mounts: []string{
|
||||||
"/var/run/nvidia-container-devices/cdi/nvidia.com/gpu/0",
|
"/var/run/nvidia-container-devices/cdi/nvidia.com/gpu/0",
|
||||||
@ -181,7 +186,7 @@ func TestResolveAutoMode(t *testing.T) {
|
|||||||
"tegra": false,
|
"tegra": false,
|
||||||
"nvgpu": false,
|
"nvgpu": false,
|
||||||
},
|
},
|
||||||
expectedMode: "legacy",
|
expectedMode: "jit-cdi",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "cdi mount and non-CDI envvar resolves to cdi",
|
description: "cdi mount and non-CDI envvar resolves to cdi",
|
||||||
@ -199,22 +204,6 @@ func TestResolveAutoMode(t *testing.T) {
|
|||||||
},
|
},
|
||||||
expectedMode: "cdi",
|
expectedMode: "cdi",
|
||||||
},
|
},
|
||||||
{
|
|
||||||
description: "non-cdi mount and CDI envvar resolves to legacy",
|
|
||||||
mode: "auto",
|
|
||||||
envmap: map[string]string{
|
|
||||||
"NVIDIA_VISIBLE_DEVICES": "nvidia.com/gpu=0",
|
|
||||||
},
|
|
||||||
mounts: []string{
|
|
||||||
"/var/run/nvidia-container-devices/0",
|
|
||||||
},
|
|
||||||
info: map[string]bool{
|
|
||||||
"nvml": true,
|
|
||||||
"tegra": false,
|
|
||||||
"nvgpu": false,
|
|
||||||
},
|
|
||||||
expectedMode: "legacy",
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tc := range testCases {
|
for _, tc := range testCases {
|
||||||
@ -251,7 +240,12 @@ func TestResolveAutoMode(t *testing.T) {
|
|||||||
image.WithAcceptDeviceListAsVolumeMounts(true),
|
image.WithAcceptDeviceListAsVolumeMounts(true),
|
||||||
image.WithAcceptEnvvarUnprivileged(true),
|
image.WithAcceptEnvvarUnprivileged(true),
|
||||||
)
|
)
|
||||||
mode := resolveMode(logger, tc.mode, image, properties)
|
mr := NewRuntimeModeResolver(
|
||||||
|
WithLogger(logger),
|
||||||
|
WithImage(&image),
|
||||||
|
WithPropertyExtractor(properties),
|
||||||
|
)
|
||||||
|
mode := mr.ResolveRuntimeMode(tc.mode)
|
||||||
require.EqualValues(t, tc.expectedMode, mode)
|
require.EqualValues(t, tc.expectedMode, mode)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@ package modifier
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
"tags.cncf.io/container-device-interface/pkg/parser"
|
"tags.cncf.io/container-device-interface/pkg/parser"
|
||||||
|
|
||||||
@ -27,17 +28,27 @@ import (
|
|||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
|
||||||
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
automaticDeviceVendor = "runtime.nvidia.com"
|
||||||
|
automaticDeviceClass = "gpu"
|
||||||
|
automaticDeviceKind = automaticDeviceVendor + "/" + automaticDeviceClass
|
||||||
|
automaticDevicePrefix = automaticDeviceKind + "="
|
||||||
)
|
)
|
||||||
|
|
||||||
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
|
// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
|
||||||
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
|
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
|
||||||
// used to select the devices to include.
|
// used to select the devices to include.
|
||||||
func NewCDIModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
|
func NewCDIModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, isJitCDI bool) (oci.SpecModifier, error) {
|
||||||
|
defaultKind := cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind
|
||||||
|
if isJitCDI {
|
||||||
|
defaultKind = automaticDeviceKind
|
||||||
|
}
|
||||||
deviceRequestor := newCDIDeviceRequestor(
|
deviceRequestor := newCDIDeviceRequestor(
|
||||||
logger,
|
logger,
|
||||||
image,
|
image,
|
||||||
cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.DefaultKind,
|
defaultKind,
|
||||||
)
|
)
|
||||||
devices := deviceRequestor.DeviceRequests()
|
devices := deviceRequestor.DeviceRequests()
|
||||||
if len(devices) == 0 {
|
if len(devices) == 0 {
|
||||||
@ -107,17 +118,36 @@ func (c *cdiDeviceRequestor) DeviceRequests() []string {
|
|||||||
func filterAutomaticDevices(devices []string) []string {
|
func filterAutomaticDevices(devices []string) []string {
|
||||||
var automatic []string
|
var automatic []string
|
||||||
for _, device := range devices {
|
for _, device := range devices {
|
||||||
vendor, class, _ := parser.ParseDevice(device)
|
if !strings.HasPrefix(device, automaticDevicePrefix) {
|
||||||
if vendor == "runtime.nvidia.com" && class == "gpu" {
|
continue
|
||||||
automatic = append(automatic, device)
|
|
||||||
}
|
}
|
||||||
|
automatic = append(automatic, device)
|
||||||
}
|
}
|
||||||
return automatic
|
return automatic
|
||||||
}
|
}
|
||||||
|
|
||||||
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
|
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
|
||||||
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
|
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
|
||||||
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
|
|
||||||
|
var identifiers []string
|
||||||
|
for _, device := range devices {
|
||||||
|
identifiers = append(identifiers, strings.TrimPrefix(device, automaticDevicePrefix))
|
||||||
|
}
|
||||||
|
|
||||||
|
cdilib, err := nvcdi.New(
|
||||||
|
nvcdi.WithLogger(logger),
|
||||||
|
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
|
||||||
|
nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root),
|
||||||
|
nvcdi.WithVendor(automaticDeviceVendor),
|
||||||
|
nvcdi.WithClass(automaticDeviceClass),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: We should try to load the kernel modules and create the device nodes here.
|
||||||
|
// Failures should raise a warning and not error out.
|
||||||
|
spec, err := cdilib.GetSpec(identifiers...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
|
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
|
||||||
}
|
}
|
||||||
@ -132,27 +162,6 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de
|
|||||||
return cdiDeviceRequestor, nil
|
return cdiDeviceRequestor, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
|
|
||||||
cdilib, err := nvcdi.New(
|
|
||||||
nvcdi.WithLogger(logger),
|
|
||||||
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
|
|
||||||
nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root),
|
|
||||||
nvcdi.WithVendor("runtime.nvidia.com"),
|
|
||||||
nvcdi.WithClass("gpu"),
|
|
||||||
)
|
|
||||||
if err != nil {
|
|
||||||
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
var identifiers []string
|
|
||||||
for _, device := range devices {
|
|
||||||
_, _, id := parser.ParseDevice(device)
|
|
||||||
identifiers = append(identifiers, id)
|
|
||||||
}
|
|
||||||
|
|
||||||
return cdilib.GetSpec(identifiers...)
|
|
||||||
}
|
|
||||||
|
|
||||||
type deduplicatedDeviceRequestor struct {
|
type deduplicatedDeviceRequestor struct {
|
||||||
deviceRequestor
|
deviceRequestor
|
||||||
}
|
}
|
||||||
|
@ -70,6 +70,18 @@ func TestDeviceRequests(t *testing.T) {
|
|||||||
},
|
},
|
||||||
expectedDevices: []string{"nvidia.com/gpu=0", "example.com/class=device"},
|
expectedDevices: []string{"nvidia.com/gpu=0", "example.com/class=device"},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
description: "cdi devices from envvar with default kind",
|
||||||
|
input: cdiDeviceRequestor{
|
||||||
|
defaultKind: "runtime.nvidia.com/gpu",
|
||||||
|
},
|
||||||
|
spec: &specs.Spec{
|
||||||
|
Process: &specs.Process{
|
||||||
|
Env: []string{"NVIDIA_VISIBLE_DEVICES=all"},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
expectedDevices: []string{"runtime.nvidia.com/gpu=all"},
|
||||||
|
},
|
||||||
{
|
{
|
||||||
description: "no matching annotations",
|
description: "no matching annotations",
|
||||||
prefixes: []string{"not-prefix/"},
|
prefixes: []string{"not-prefix/"},
|
||||||
@ -98,7 +110,7 @@ func TestDeviceRequests(t *testing.T) {
|
|||||||
"another-prefix/bar": "example.com/device=baz",
|
"another-prefix/bar": "example.com/device=baz",
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
expectedDevices: []string{"example.com/device=bar", "example.com/device=baz"},
|
expectedDevices: []string{"example.com/device=baz", "example.com/device=bar"},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
description: "multiple matching annotations with duplicate devices",
|
description: "multiple matching annotations with duplicate devices",
|
||||||
|
@ -101,14 +101,14 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
|
|||||||
return modifiers, nil
|
return modifiers, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
|
func newModeModifier(logger logger.Interface, mode info.RuntimeMode, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
|
||||||
switch mode {
|
switch mode {
|
||||||
case "legacy":
|
case info.RuntimeModeLegacy:
|
||||||
return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
|
return modifier.NewStableRuntimeModifier(logger, cfg.NVIDIAContainerRuntimeHookConfig.Path), nil
|
||||||
case "csv":
|
case info.RuntimeModeCSV:
|
||||||
return modifier.NewCSVModifier(logger, cfg, image)
|
return modifier.NewCSVModifier(logger, cfg, image)
|
||||||
case "cdi":
|
case info.RuntimeModeCDI, info.RuntimeModeJitCDI:
|
||||||
return modifier.NewCDIModifier(logger, cfg, image)
|
return modifier.NewCDIModifier(logger, cfg, image, mode == info.RuntimeModeJitCDI)
|
||||||
}
|
}
|
||||||
|
|
||||||
return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)
|
return nil, fmt.Errorf("invalid runtime mode: %v", cfg.NVIDIAContainerRuntimeConfig.Mode)
|
||||||
@ -119,7 +119,7 @@ func newModeModifier(logger logger.Interface, mode string, cfg *config.Config, i
|
|||||||
// The image is also used to determine the runtime mode to apply.
|
// The image is also used to determine the runtime mode to apply.
|
||||||
// If a non-CDI mode is detected we ensure that the image does not process
|
// If a non-CDI mode is detected we ensure that the image does not process
|
||||||
// annotation devices.
|
// annotation devices.
|
||||||
func initRuntimeModeAndImage(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (string, *image.CUDA, error) {
|
func initRuntimeModeAndImage(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (info.RuntimeMode, *image.CUDA, error) {
|
||||||
rawSpec, err := ociSpec.Load()
|
rawSpec, err := ociSpec.Load()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
return "", nil, fmt.Errorf("failed to load OCI spec: %v", err)
|
||||||
@ -136,9 +136,13 @@ func initRuntimeModeAndImage(logger logger.Interface, cfg *config.Config, ociSpe
|
|||||||
return "", nil, err
|
return "", nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
|
modeResolver := info.NewRuntimeModeResolver(
|
||||||
|
info.WithLogger(logger),
|
||||||
|
info.WithImage(&image),
|
||||||
|
)
|
||||||
|
mode := modeResolver.ResolveRuntimeMode(cfg.NVIDIAContainerRuntimeConfig.Mode)
|
||||||
// We update the mode here so that we can continue passing just the config to other functions.
|
// We update the mode here so that we can continue passing just the config to other functions.
|
||||||
cfg.NVIDIAContainerRuntimeConfig.Mode = mode
|
cfg.NVIDIAContainerRuntimeConfig.Mode = string(mode)
|
||||||
|
|
||||||
if mode == "cdi" || len(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes) == 0 {
|
if mode == "cdi" || len(cfg.NVIDIAContainerRuntimeConfig.Modes.CDI.AnnotationPrefixes) == 0 {
|
||||||
return mode, &image, nil
|
return mode, &image, nil
|
||||||
@ -154,12 +158,12 @@ func initRuntimeModeAndImage(logger logger.Interface, cfg *config.Config, ociSpe
|
|||||||
}
|
}
|
||||||
|
|
||||||
// supportedModifierTypes returns the modifiers supported for a specific runtime mode.
|
// supportedModifierTypes returns the modifiers supported for a specific runtime mode.
|
||||||
func supportedModifierTypes(mode string) []string {
|
func supportedModifierTypes(mode info.RuntimeMode) []string {
|
||||||
switch mode {
|
switch mode {
|
||||||
case "cdi":
|
case info.RuntimeModeCDI, info.RuntimeModeJitCDI:
|
||||||
// For CDI mode we make no additional modifications.
|
// For CDI mode we make no additional modifications.
|
||||||
return []string{"nvidia-hook-remover", "mode"}
|
return []string{"nvidia-hook-remover", "mode"}
|
||||||
case "csv":
|
case info.RuntimeModeCSV:
|
||||||
// For CSV mode we support mode and feature-gated modification.
|
// For CSV mode we support mode and feature-gated modification.
|
||||||
return []string{"nvidia-hook-remover", "feature-gated", "mode"}
|
return []string{"nvidia-hook-remover", "feature-gated", "mode"}
|
||||||
default:
|
default:
|
||||||
|
1
tests/output/bundle/config.json
Normal file
1
tests/output/bundle/config.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
{"ociVersion":"1.0.1-dev","process":{"terminal":true,"user":{"uid":0,"gid":0},"args":["sh"],"env":["PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin","TERM=xterm"],"cwd":"/","capabilities":{"bounding":["CAP_AUDIT_WRITE","CAP_KILL","CAP_NET_BIND_SERVICE"],"effective":["CAP_AUDIT_WRITE","CAP_KILL","CAP_NET_BIND_SERVICE"],"inheritable":["CAP_AUDIT_WRITE","CAP_KILL","CAP_NET_BIND_SERVICE"],"permitted":["CAP_AUDIT_WRITE","CAP_KILL","CAP_NET_BIND_SERVICE"],"ambient":["CAP_AUDIT_WRITE","CAP_KILL","CAP_NET_BIND_SERVICE"]},"rlimits":[{"type":"RLIMIT_NOFILE","hard":1024,"soft":1024}],"noNewPrivileges":true},"root":{"path":"rootfs","readonly":true},"hostname":"runc","mounts":[{"destination":"/proc","type":"proc","source":"proc"},{"destination":"/dev","type":"tmpfs","source":"tmpfs","options":["nosuid","strictatime","mode=755","size=65536k"]},{"destination":"/dev/pts","type":"devpts","source":"devpts","options":["nosuid","noexec","newinstance","ptmxmode=0666","mode=0620","gid=5"]},{"destination":"/dev/shm","type":"tmpfs","source":"shm","options":["nosuid","noexec","nodev","mode=1777","size=65536k"]},{"destination":"/dev/mqueue","type":"mqueue","source":"mqueue","options":["nosuid","noexec","nodev"]},{"destination":"/sys","type":"sysfs","source":"sysfs","options":["nosuid","noexec","nodev","ro"]},{"destination":"/sys/fs/cgroup","type":"cgroup","source":"cgroup","options":["nosuid","noexec","nodev","relatime","ro"]}],"hooks":{},"linux":{"resources":{"devices":[{"allow":false,"access":"rwm"}]},"namespaces":[{"type":"pid"},{"type":"network"},{"type":"ipc"},{"type":"uts"},{"type":"mount"}],"maskedPaths":["/proc/kcore","/proc/latency_stats","/proc/timer_list","/proc/timer_stats","/proc/sched_debug","/sys/firmware","/proc/scsi"],"readonlyPaths":["/proc/asound","/proc/bus","/proc/fs","/proc/irq","/proc/sys","/proc/sysrq-trigger"]}}
|
Loading…
Reference in New Issue
Block a user