mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-05-28 17:18:49 +00:00
Merge pull request #1055 from elezar/add-cuda-compat-mode
Some checks are pending
Some checks are pending
Add nvidia-container-cli.compat-mode config option
This commit is contained in:
commit
72b2ee9ce0
@ -104,3 +104,26 @@ func (c *hookConfig) getSwarmResourceEnvvars() []string {
|
||||
|
||||
return envvars
|
||||
}
|
||||
|
||||
// nvidiaContainerCliCUDACompatModeFlags returns required --cuda-compat-mode
|
||||
// flag(s) depending on the hook and runtime configurations.
|
||||
func (c *hookConfig) nvidiaContainerCliCUDACompatModeFlags() []string {
|
||||
var flag string
|
||||
switch c.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode {
|
||||
case config.CUDACompatModeLdconfig:
|
||||
flag = "--cuda-compat-mode=ldconfig"
|
||||
case config.CUDACompatModeMount:
|
||||
flag = "--cuda-compat-mode=mount"
|
||||
case config.CUDACompatModeDisabled, config.CUDACompatModeHook:
|
||||
flag = "--cuda-compat-mode=disabled"
|
||||
default:
|
||||
if !c.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
|
||||
flag = "--cuda-compat-mode=disabled"
|
||||
}
|
||||
}
|
||||
|
||||
if flag == "" {
|
||||
return nil
|
||||
}
|
||||
return []string{flag}
|
||||
}
|
||||
|
@ -114,9 +114,8 @@ func doPrestart() {
|
||||
}
|
||||
args = append(args, "configure")
|
||||
|
||||
if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
|
||||
args = append(args, "--no-cntlibs")
|
||||
}
|
||||
args = append(args, hook.nvidiaContainerCliCUDACompatModeFlags()...)
|
||||
|
||||
if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" {
|
||||
args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath))
|
||||
}
|
||||
|
@ -79,6 +79,9 @@ swarm-resource = ""
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.legacy]
|
||||
cuda-compat-mode = "ldconfig"
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||
skip-mode-detection = true
|
||||
@ -140,6 +143,9 @@ swarm-resource = ""
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.legacy]
|
||||
cuda-compat-mode = "ldconfig"
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||
skip-mode-detection = true
|
||||
@ -204,6 +210,9 @@ swarm-resource = ""
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.legacy]
|
||||
cuda-compat-mode = "ldconfig"
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||
skip-mode-detection = true
|
||||
@ -265,6 +274,9 @@ swarm-resource = ""
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.legacy]
|
||||
cuda-compat-mode = "ldconfig"
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||
skip-mode-detection = true
|
||||
@ -348,6 +360,9 @@ swarm-resource = ""
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.legacy]
|
||||
cuda-compat-mode = "ldconfig"
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
|
||||
skip-mode-detection = true
|
||||
|
@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
|
||||
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
|
||||
SpecDirs: cdi.DefaultSpecDirs,
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: defaultCUDACompatMode,
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
|
@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
|
||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "ldconfig",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@ -93,6 +96,7 @@ func TestGetConfig(t *testing.T) {
|
||||
"nvidia-container-cli.load-kmods = false",
|
||||
"nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"",
|
||||
"nvidia-container-cli.user = \"foo:bar\"",
|
||||
"nvidia-container-cli.cuda-compat-mode = \"mount\"",
|
||||
"nvidia-container-runtime.debug = \"/foo/bar\"",
|
||||
"nvidia-container-runtime.discover-mode = \"not-legacy\"",
|
||||
"nvidia-container-runtime.log-level = \"debug\"",
|
||||
@ -102,6 +106,7 @@ func TestGetConfig(t *testing.T) {
|
||||
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
|
||||
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
|
||||
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
||||
"nvidia-container-runtime.modes.legacy.cuda-compat-mode = \"mount\"",
|
||||
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
|
||||
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
|
||||
},
|
||||
@ -134,6 +139,9 @@ func TestGetConfig(t *testing.T) {
|
||||
"/not/var/run/cdi",
|
||||
},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "mount",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@ -178,6 +186,9 @@ func TestGetConfig(t *testing.T) {
|
||||
"/var/run/cdi",
|
||||
},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "ldconfig",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@ -200,6 +211,7 @@ func TestGetConfig(t *testing.T) {
|
||||
"root = \"/bar/baz\"",
|
||||
"load-kmods = false",
|
||||
"ldconfig = \"@/foo/bar/ldconfig\"",
|
||||
"cuda-compat-mode = \"mount\"",
|
||||
"user = \"foo:bar\"",
|
||||
"[nvidia-container-runtime]",
|
||||
"debug = \"/foo/bar\"",
|
||||
@ -213,6 +225,8 @@ func TestGetConfig(t *testing.T) {
|
||||
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
|
||||
"[nvidia-container-runtime.modes.csv]",
|
||||
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
|
||||
"[nvidia-container-runtime.modes.legacy]",
|
||||
"cuda-compat-mode = \"mount\"",
|
||||
"[nvidia-container-runtime-hook]",
|
||||
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
|
||||
"[nvidia-ctk]",
|
||||
@ -247,6 +261,9 @@ func TestGetConfig(t *testing.T) {
|
||||
"/not/var/run/cdi",
|
||||
},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "mount",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@ -283,6 +300,9 @@ func TestGetConfig(t *testing.T) {
|
||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "ldconfig",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
@ -322,6 +342,9 @@ func TestGetConfig(t *testing.T) {
|
||||
AnnotationPrefixes: []string{"cdi.k8s.io/"},
|
||||
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
|
||||
},
|
||||
Legacy: legacyModeConfig{
|
||||
CUDACompatMode: "ldconfig",
|
||||
},
|
||||
},
|
||||
},
|
||||
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
|
||||
|
@ -29,8 +29,9 @@ type RuntimeConfig struct {
|
||||
|
||||
// modesConfig defines (optional) per-mode configs
|
||||
type modesConfig struct {
|
||||
CSV csvModeConfig `toml:"csv"`
|
||||
CDI cdiModeConfig `toml:"cdi"`
|
||||
CSV csvModeConfig `toml:"csv"`
|
||||
CDI cdiModeConfig `toml:"cdi"`
|
||||
Legacy legacyModeConfig `toml:"legacy"`
|
||||
}
|
||||
|
||||
type cdiModeConfig struct {
|
||||
@ -45,3 +46,31 @@ type cdiModeConfig struct {
|
||||
type csvModeConfig struct {
|
||||
MountSpecPath string `toml:"mount-spec-path"`
|
||||
}
|
||||
|
||||
type legacyModeConfig struct {
|
||||
// CUDACompatMode sets the mode to be used to make CUDA Forward Compat
|
||||
// libraries discoverable in the container.
|
||||
CUDACompatMode cudaCompatMode `toml:"cuda-compat-mode,omitempty"`
|
||||
}
|
||||
|
||||
type cudaCompatMode string
|
||||
|
||||
const (
|
||||
defaultCUDACompatMode = CUDACompatModeLdconfig
|
||||
// CUDACompatModeDisabled explicitly disables the handling of CUDA Forward
|
||||
// Compatibility in the NVIDIA Container Runtime and NVIDIA Container
|
||||
// Runtime Hook.
|
||||
CUDACompatModeDisabled = cudaCompatMode("disabled")
|
||||
// CUDACompatModeHook uses a container lifecycle hook to implement CUDA
|
||||
// Forward Compatibility support. This requires the use of the NVIDIA
|
||||
// Container Runtime and is not compatible with use cases where only the
|
||||
// NVIDIA Container Runtime Hook is used (e.g. the Docker --gpus flag).
|
||||
CUDACompatModeHook = cudaCompatMode("hook")
|
||||
// CUDACompatModeLdconfig adds the folders containing CUDA Forward Compat
|
||||
// libraries to the ldconfig command invoked from the NVIDIA Container
|
||||
// Runtime Hook.
|
||||
CUDACompatModeLdconfig = cudaCompatMode("ldconfig")
|
||||
// CUDACompatModeMount mounts CUDA Forward Compat folders from the container
|
||||
// to the container when using the NVIDIA Container Runtime Hook.
|
||||
CUDACompatModeMount = cudaCompatMode("mount")
|
||||
)
|
||||
|
@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
|
||||
[nvidia-container-runtime.modes.csv]
|
||||
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"
|
||||
|
||||
[nvidia-container-runtime.modes.legacy]
|
||||
cuda-compat-mode = "ldconfig"
|
||||
|
||||
[nvidia-container-runtime-hook]
|
||||
path = "nvidia-container-runtime-hook"
|
||||
skip-mode-detection = false
|
||||
|
@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
|
||||
discoverers = append(discoverers, d)
|
||||
}
|
||||
|
||||
if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
|
||||
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
|
||||
discoverers = append(discoverers, compatLibHookDiscoverer)
|
||||
// For legacy mode, we also need to inject a hook to update the LDCache
|
||||
// after we have modifed the configuration.
|
||||
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" {
|
||||
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
|
||||
logger,
|
||||
discover.None{},
|
||||
cfg.NVIDIACTKConfig.Path,
|
||||
"",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
|
||||
}
|
||||
discoverers = append(discoverers, ldcacheUpdateHookDiscoverer)
|
||||
// If the feature flag has explicitly been toggled, we don't make any modification.
|
||||
if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
|
||||
cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err)
|
||||
}
|
||||
discoverers = append(discoverers, cudaCompatDiscoverer)
|
||||
}
|
||||
|
||||
return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
|
||||
}
|
||||
|
||||
func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) {
|
||||
// For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook.
|
||||
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" && cfg.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode != config.CUDACompatModeHook {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
|
||||
// For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook.
|
||||
if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" {
|
||||
return compatLibHookDiscoverer, nil
|
||||
}
|
||||
|
||||
// For legacy mode, we also need to inject a hook to update the LDCache
|
||||
// after we have modifed the configuration.
|
||||
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
|
||||
logger,
|
||||
discover.None{},
|
||||
cfg.NVIDIACTKConfig.Path,
|
||||
"",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
|
||||
}
|
||||
|
||||
return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user