From aa0cb99bbb284c8a2daf00bbfeeb5379f4c785e1 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Tue, 29 Apr 2025 14:00:12 +0200 Subject: [PATCH] Add cuda-compat-mode config option Signed-off-by: Evan Lezar --- cmd/nvidia-container-runtime-hook/main.go | 12 +++++- cmd/nvidia-ctk-installer/main_test.go | 5 +++ internal/config/cli.go | 10 +++++ internal/config/config.go | 7 ++-- internal/config/config_test.go | 50 +++++++++++++---------- internal/config/toml_test.go | 1 + internal/modifier/gated.go | 49 ++++++++++++++-------- 7 files changed, 93 insertions(+), 41 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index e864a51d..185587f0 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -114,9 +114,19 @@ func doPrestart() { } args = append(args, "configure") - if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() { + switch cli.CUDACompatMode { + case config.CUDACompatModeLdconfig: + args = append(args, "--cuda-compat-mode="+config.CUDACompatModeLdconfig) + case config.CUDACompatModeMount: + args = append(args, "--cuda-compat-mode="+config.CUDACompatModeMount) + case config.CUDACompatModeDisabled, config.CUDACompatModeHook: args = append(args, "--no-cntlibs") + default: + if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() { + args = append(args, "--no-cntlibs") + } } + if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" { args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath)) } diff --git a/cmd/nvidia-ctk-installer/main_test.go b/cmd/nvidia-ctk-installer/main_test.go index b98bf3ea..6099b9db 100644 --- a/cmd/nvidia-ctk-installer/main_test.go +++ b/cmd/nvidia-ctk-installer/main_test.go @@ -53,6 +53,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v swarm-resource = "" [nvidia-container-cli] + cuda-compat-mode = "ldconfig" debug = "" environment = [] ldcache = "" @@ -114,6 
+115,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v swarm-resource = "" [nvidia-container-cli] + cuda-compat-mode = "ldconfig" debug = "" environment = [] ldcache = "" @@ -178,6 +180,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v swarm-resource = "" [nvidia-container-cli] + cuda-compat-mode = "ldconfig" debug = "" environment = [] ldcache = "" @@ -239,6 +242,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v swarm-resource = "" [nvidia-container-cli] + cuda-compat-mode = "ldconfig" debug = "" environment = [] ldcache = "" @@ -322,6 +326,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v swarm-resource = "" [nvidia-container-cli] + cuda-compat-mode = "ldconfig" debug = "" environment = [] ldcache = "" diff --git a/internal/config/cli.go b/internal/config/cli.go index 3621df25..b7c3ea7c 100644 --- a/internal/config/cli.go +++ b/internal/config/cli.go @@ -22,6 +22,13 @@ import ( "strings" ) +const ( + CUDACompatModeMount = "mount" + CUDACompatModeLdconfig = "ldconfig" + CUDACompatModeHook = "hook" + CUDACompatModeDisabled = "disabled" +) + // ContainerCLIConfig stores the options for the nvidia-container-cli type ContainerCLIConfig struct { Root string `toml:"root"` @@ -44,6 +51,9 @@ type ContainerCLIConfig struct { // is required, the features.allow-ldconfig-from-container feature gate must // be enabled explicitly. Ldconfig ldconfigPath `toml:"ldconfig"` + // CUDACompatMode sets the mode to be used to make CUDA Forward Compat + // libraries discoverable in the container. + CUDACompatMode string `toml:"cuda-compat-mode,omitempty"` } // NormalizeLDConfigPath returns the resolved path of the configured LDConfig binary. 
diff --git a/internal/config/config.go b/internal/config/config.go index 652cc83a..58f04586 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -100,9 +100,10 @@ func GetDefault() (*Config, error) { AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: image.SupportedDriverCapabilities.String(), NVIDIAContainerCLIConfig: ContainerCLIConfig{ - LoadKmods: true, - Ldconfig: getLdConfigPath(), - User: getUserGroup(), + LoadKmods: true, + Ldconfig: getLdConfigPath(), + User: getUserGroup(), + CUDACompatMode: CUDACompatModeLdconfig, }, NVIDIACTKConfig: CTKConfig{ Path: nvidiaCTKExecutable, diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 963058e1..1ce3a7ff 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -56,9 +56,10 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ - Root: "", - LoadKmods: true, - Ldconfig: "@/test/ld/config/path", + Root: "", + LoadKmods: true, + Ldconfig: "@/test/ld/config/path", + CUDACompatMode: "ldconfig", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/dev/null", @@ -93,6 +94,7 @@ func TestGetConfig(t *testing.T) { "nvidia-container-cli.load-kmods = false", "nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"", "nvidia-container-cli.user = \"foo:bar\"", + "nvidia-container-cli.cuda-compat-mode = \"mount\"", "nvidia-container-runtime.debug = \"/foo/bar\"", "nvidia-container-runtime.discover-mode = \"not-legacy\"", "nvidia-container-runtime.log-level = \"debug\"", @@ -109,10 +111,11 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: false, SupportedDriverCapabilities: "compute,utility", NVIDIAContainerCLIConfig: ContainerCLIConfig{ - Root: "/bar/baz", - LoadKmods: false, - Ldconfig: "@/foo/bar/ldconfig", - User: "foo:bar", + Root: "/bar/baz", + LoadKmods: false, + Ldconfig: 
"@/foo/bar/ldconfig", + User: "foo:bar", + CUDACompatMode: "mount", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/foo/bar", @@ -156,8 +159,9 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ - Ldconfig: "/foo/bar/ldconfig", - LoadKmods: true, + Ldconfig: "/foo/bar/ldconfig", + LoadKmods: true, + CUDACompatMode: "ldconfig", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/dev/null", @@ -200,6 +204,7 @@ func TestGetConfig(t *testing.T) { "root = \"/bar/baz\"", "load-kmods = false", "ldconfig = \"@/foo/bar/ldconfig\"", + "cuda-compat-mode = \"mount\"", "user = \"foo:bar\"", "[nvidia-container-runtime]", "debug = \"/foo/bar\"", @@ -222,10 +227,11 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: false, SupportedDriverCapabilities: "compute,utility", NVIDIAContainerCLIConfig: ContainerCLIConfig{ - Root: "/bar/baz", - LoadKmods: false, - Ldconfig: "@/foo/bar/ldconfig", - User: "foo:bar", + Root: "/bar/baz", + LoadKmods: false, + Ldconfig: "@/foo/bar/ldconfig", + CUDACompatMode: "mount", + User: "foo:bar", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/foo/bar", @@ -264,10 +270,11 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ - Root: "", - LoadKmods: true, - Ldconfig: "@/test/ld/config/path", - User: "root:video", + Root: "", + LoadKmods: true, + Ldconfig: "@/test/ld/config/path", + CUDACompatMode: "ldconfig", + User: "root:video", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/dev/null", @@ -303,10 +310,11 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: 
ContainerCLIConfig{ - Root: "", - LoadKmods: true, - Ldconfig: "@/test/ld/config/path", - User: "foo:bar", + Root: "", + LoadKmods: true, + Ldconfig: "@/test/ld/config/path", + CUDACompatMode: "ldconfig", + User: "foo:bar", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/dev/null", diff --git a/internal/config/toml_test.go b/internal/config/toml_test.go index f7c649f7..71a07b7a 100644 --- a/internal/config/toml_test.go +++ b/internal/config/toml_test.go @@ -48,6 +48,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v #swarm-resource = "DOCKER_RESOURCE_GPU" [nvidia-container-cli] +cuda-compat-mode = "ldconfig" #debug = "/var/log/nvidia-container-toolkit.log" environment = [] #ldcache = "/etc/ld.so.cache" diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index 8320286e..8559ed40 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } - if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() { - compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver) - discoverers = append(discoverers, compatLibHookDiscoverer) - // For legacy mode, we also need to inject a hook to update the LDCache - // after we have modifed the configuration. - if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" { - ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook( - logger, - discover.None{}, - cfg.NVIDIACTKConfig.Path, - "", - ) - if err != nil { - return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err) - } - discoverers = append(discoverers, ldcacheUpdateHookDiscoverer) + // If the feature flag has explicitly been toggled, we don't make any modification. 
+ if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() { + cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver) + if err != nil { + return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err) } + discoverers = append(discoverers, cudaCompatDiscoverer) } return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...)) } + +func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) { + // For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook. + if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" && cfg.NVIDIAContainerCLIConfig.CUDACompatMode != config.CUDACompatModeHook { + return nil, nil + } + + compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver) + // For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook. + if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" { + return compatLibHookDiscoverer, nil + } + + // For legacy mode, we also need to inject a hook to update the LDCache + // after we have modified the configuration. + ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook( + logger, + discover.None{}, + cfg.NVIDIACTKConfig.Path, + "", + ) + if err != nil { + return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err) + } + + return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil +}