From 742ff3784697c4d1de84e44edaa5d251ae8697f3 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 29 Apr 2025 14:00:12 +0200 Subject: [PATCH] Add cuda-compat-mode config option This change adds an nvidia-container-runtime.modes.legacy.cuda-compat-mode config option. This can be set to one of four values: * ldconfig (default): the --cuda-compat-mode=ldconfig flag is passed to the nvidia-container-cli * mount: the --cuda-compat-mode=mount flag is passed to the nvidia-container-cli * disabled: the --cuda-compat-mode=disabled flag is passed to the nvidia-container-cli * hook: the --cuda-compat-mode=disabled flag is passed to the nvidia-container-cli AND the enable-cuda-compat hook is used to provide forward compatibility. Note that the disable-cuda-compat-lib-hook feature flag will prevent the enable-cuda-compat hook from being used. This change also means that the allow-cuda-compat-libs-from-container feature flag no longer has any effect. Signed-off-by: Evan Lezar --- cmd/nvidia-container-runtime-hook/main.go | 14 ++++++- cmd/nvidia-ctk-installer/main_test.go | 15 +++++++ internal/config/config.go | 3 ++ internal/config/config_test.go | 23 +++++++++++ internal/config/runtime.go | 33 ++++++++++++++- internal/config/toml_test.go | 3 ++ internal/modifier/gated.go | 49 +++++++++++++++-------- 7 files changed, 120 insertions(+), 20 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index e864a51d..b5a3f823 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -114,9 +114,19 @@ func doPrestart() { } args = append(args, "configure") - if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() { - args = append(args, "--no-cntlibs") + switch hook.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode { + case config.CUDACompatModeLdconfig: + args = append(args, "--cuda-compat-mode=ldconfig") + case config.CUDACompatModeMount: + args = append(args, 
"--cuda-compat-mode=mount") + case config.CUDACompatModeDisabled, config.CUDACompatModeHook: + args = append(args, "--cuda-compat-mode=disabled") + default: + if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() { + args = append(args, "--cuda-compat-mode=disabled") + } } + if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" { args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath)) } diff --git a/cmd/nvidia-ctk-installer/main_test.go b/cmd/nvidia-ctk-installer/main_test.go index b98bf3ea..444a3f22 100644 --- a/cmd/nvidia-ctk-installer/main_test.go +++ b/cmd/nvidia-ctk-installer/main_test.go @@ -79,6 +79,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.legacy] + cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -140,6 +143,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.legacy] + cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -204,6 +210,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.legacy] + cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -265,6 +274,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.legacy] + cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot 
}}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -348,6 +360,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.legacy] + cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true diff --git a/internal/config/config.go b/internal/config/config.go index 652cc83a..0029623b 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -121,6 +121,9 @@ func GetDefault() (*Config, error) { AnnotationPrefixes: []string{cdi.AnnotationPrefix}, SpecDirs: cdi.DefaultSpecDirs, }, + Legacy: legacyModeConfig{ + CUDACompatMode: defaultCUDACompatMode, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 963058e1..3cfce7d6 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: []string{"cdi.k8s.io/"}, SpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "ldconfig", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -93,6 +96,7 @@ func TestGetConfig(t *testing.T) { "nvidia-container-cli.load-kmods = false", "nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"", "nvidia-container-cli.user = \"foo:bar\"", + "nvidia-container-cli.cuda-compat-mode = \"mount\"", "nvidia-container-runtime.debug = \"/foo/bar\"", "nvidia-container-runtime.discover-mode = \"not-legacy\"", "nvidia-container-runtime.log-level = \"debug\"", @@ -102,6 +106,7 @@ func TestGetConfig(t *testing.T) { "nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]", "nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]", 
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", + "nvidia-container-runtime.modes.legacy.cuda-compat-mode = \"mount\"", "nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"", "nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"", }, @@ -134,6 +139,9 @@ func TestGetConfig(t *testing.T) { "/not/var/run/cdi", }, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "mount", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -178,6 +186,9 @@ func TestGetConfig(t *testing.T) { "/var/run/cdi", }, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "ldconfig", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -200,6 +211,7 @@ func TestGetConfig(t *testing.T) { "root = \"/bar/baz\"", "load-kmods = false", "ldconfig = \"@/foo/bar/ldconfig\"", + "cuda-compat-mode = \"mount\"", "user = \"foo:bar\"", "[nvidia-container-runtime]", "debug = \"/foo/bar\"", @@ -213,6 +225,8 @@ func TestGetConfig(t *testing.T) { "spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]", "[nvidia-container-runtime.modes.csv]", "mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", + "[nvidia-container-runtime.modes.legacy]", + "cuda-compat-mode = \"mount\"", "[nvidia-container-runtime-hook]", "path = \"/foo/bar/nvidia-container-runtime-hook\"", "[nvidia-ctk]", @@ -247,6 +261,9 @@ func TestGetConfig(t *testing.T) { "/not/var/run/cdi", }, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "mount", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -283,6 +300,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: []string{"cdi.k8s.io/"}, SpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "ldconfig", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -322,6 +342,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: []string{"cdi.k8s.io/"}, SpecDirs: []string{"/etc/cdi", 
"/var/run/cdi"}, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "ldconfig", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ diff --git a/internal/config/runtime.go b/internal/config/runtime.go index 2ba1b7a8..315f23ba 100644 --- a/internal/config/runtime.go +++ b/internal/config/runtime.go @@ -29,8 +29,9 @@ type RuntimeConfig struct { // modesConfig defines (optional) per-mode configs type modesConfig struct { - CSV csvModeConfig `toml:"csv"` - CDI cdiModeConfig `toml:"cdi"` + CSV csvModeConfig `toml:"csv"` + CDI cdiModeConfig `toml:"cdi"` + Legacy legacyModeConfig `toml:"legacy"` } type cdiModeConfig struct { @@ -45,3 +46,31 @@ type cdiModeConfig struct { type csvModeConfig struct { MountSpecPath string `toml:"mount-spec-path"` } + +type legacyModeConfig struct { + // CUDACompatMode sets the mode to be used to make CUDA Forward Compat + // libraries discoverable in the container. + CUDACompatMode cudaCompatMode `toml:"cuda-compat-mode,omitempty"` +} + +type cudaCompatMode string + +const ( + defaultCUDACompatMode = CUDACompatModeLdconfig + // CUDACompatModeDisabled explicitly disables the handling of CUDA Forward + // Compatibility in the NVIDIA Container Runtime and NVIDIA Container + // Runtime Hook. + CUDACompatModeDisabled = cudaCompatMode("disabled") + // CUDACompatModeHook uses a container lifecycle hook to implement CUDA + // Forward Compatibility support. This requires the use of the NVIDIA + // Container Runtime and is not compatible with use cases where only the + // NVIDIA Container Runtime Hook is used (e.g. the Docker --gpus flag). + CUDACompatModeHook = cudaCompatMode("hook") + // CUDACompatModeLdconfig adds the folders containing CUDA Forward Compat + // libraries to the ldconfig command invoked from the NVIDIA Container + // Runtime Hook. 
+ CUDACompatModeLdconfig = cudaCompatMode("ldconfig") + // CUDACompatModeMount mounts CUDA Forward Compat folders from the container + // to the container when using the NVIDIA Container Runtime Hook. + CUDACompatModeMount = cudaCompatMode("mount") +) diff --git a/internal/config/toml_test.go b/internal/config/toml_test.go index f7c649f7..c4394864 100644 --- a/internal/config/toml_test.go +++ b/internal/config/toml_test.go @@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"] [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" +[nvidia-container-runtime.modes.legacy] +cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "nvidia-container-runtime-hook" skip-mode-detection = false diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index 8320286e..584391aa 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } - if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() { - compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver) - discoverers = append(discoverers, compatLibHookDiscoverer) - // For legacy mode, we also need to inject a hook to update the LDCache - // after we have modifed the configuration. - if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" { - ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook( - logger, - discover.None{}, - cfg.NVIDIACTKConfig.Path, - "", - ) - if err != nil { - return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err) - } - discoverers = append(discoverers, ldcacheUpdateHookDiscoverer) + // If the feature flag has explicitly been toggled, we don't make any modification. 
+ if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() { + cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver) + if err != nil { + return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err) } + discoverers = append(discoverers, cudaCompatDiscoverer) } return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...)) } + +func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) { + // For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook. + if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" && cfg.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode != config.CUDACompatModeHook { + return nil, nil + } + + compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver) + // For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook. + if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" { + return compatLibHookDiscoverer, nil + } + + // For legacy mode, we also need to inject a hook to update the LDCache + // after we have modified the configuration. + ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook( + logger, + discover.None{}, + cfg.NVIDIACTKConfig.Path, + "", + ) + if err != nil { + return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err) + } + + return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil +}