diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index e864a51d..efed9d8d 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -114,9 +114,19 @@ func doPrestart() { } args = append(args, "configure") - if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() { - args = append(args, "--no-cntlibs") + switch cli.CUDACompatMode { + case config.CUDACompatModeLdconfig: + args = append(args, "--cuda-compat-mode=ldconfig") + case config.CUDACompatModeMount: + args = append(args, "--cuda-compat-mode=mount") + case config.CUDACompatModeDisabled, config.CUDACompatModeHook: + args = append(args, "--cuda-compat-mode=disabled") + default: + if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() { + args = append(args, "--cuda-compat-mode=disabled") + } } + if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" { args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath)) } diff --git a/cmd/nvidia-ctk-installer/main_test.go b/cmd/nvidia-ctk-installer/main_test.go index b98bf3ea..6099b9db 100644 --- a/cmd/nvidia-ctk-installer/main_test.go +++ b/cmd/nvidia-ctk-installer/main_test.go @@ -53,6 +53,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v swarm-resource = "" [nvidia-container-cli] + cuda-compat-mode = "ldconfig" debug = "" environment = [] ldcache = "" @@ -114,6 +115,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v swarm-resource = "" [nvidia-container-cli] + cuda-compat-mode = "ldconfig" debug = "" environment = [] ldcache = "" @@ -178,6 +180,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v swarm-resource = "" [nvidia-container-cli] + cuda-compat-mode = "ldconfig" debug = "" environment = [] ldcache = "" @@ -239,6 +242,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v swarm-resource = "" 
[nvidia-container-cli] + cuda-compat-mode = "ldconfig" debug = "" environment = [] ldcache = "" @@ -322,6 +326,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v swarm-resource = "" [nvidia-container-cli] + cuda-compat-mode = "ldconfig" debug = "" environment = [] ldcache = "" diff --git a/internal/config/cli.go b/internal/config/cli.go index 3621df25..aeaefd4b 100644 --- a/internal/config/cli.go +++ b/internal/config/cli.go @@ -22,6 +22,27 @@ import ( "strings" ) +type cudaCompatMode string + +const ( + // CUDACompatModeDisabled explicitly disables the handling of CUDA Forward + // Compatibility in the NVIDIA Container Runtime and NVIDIA Container + // Runtime Hook. + CUDACompatModeDisabled = cudaCompatMode("disabled") + // CUDACompatModeHook uses a container lifecycle hook to implement CUDA + // Forward Compatibility support. This requires the use of the NVIDIA + // Container Runtime and is not compatible with use cases where only the + // NVIDIA Container Runtime Hook is used (e.g. the Docker --gpus flag). + CUDACompatModeHook = cudaCompatMode("hook") + // CUDACompatModeLdconfig adds the folders containing CUDA Forward Compat + // libraries to the ldconfig command invoked from the NVIDIA Container + // Runtime Hook. + CUDACompatModeLdconfig = cudaCompatMode("ldconfig") + // CUDACompatModeMount mounts CUDA Forward Compat folders from the container + // to the container when using the NVIDIA Container Runtime Hook. + CUDACompatModeMount = cudaCompatMode("mount") +) + // ContainerCLIConfig stores the options for the nvidia-container-cli type ContainerCLIConfig struct { Root string `toml:"root"` @@ -44,6 +65,9 @@ type ContainerCLIConfig struct { // is required, the features.allow-ldconfig-from-container feature gate must // be enabled explicitly. Ldconfig ldconfigPath `toml:"ldconfig"` + // CUDACompatMode sets the mode to be used to make CUDA Forward Compat + // libraries discoverable in the container. 
+ CUDACompatMode cudaCompatMode `toml:"cuda-compat-mode,omitempty"` } // NormalizeLDConfigPath returns the resolved path of the configured LDConfig binary. diff --git a/internal/config/config.go b/internal/config/config.go index 652cc83a..58f04586 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -100,9 +100,10 @@ func GetDefault() (*Config, error) { AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: image.SupportedDriverCapabilities.String(), NVIDIAContainerCLIConfig: ContainerCLIConfig{ - LoadKmods: true, - Ldconfig: getLdConfigPath(), - User: getUserGroup(), + LoadKmods: true, + Ldconfig: getLdConfigPath(), + User: getUserGroup(), + CUDACompatMode: CUDACompatModeLdconfig, }, NVIDIACTKConfig: CTKConfig{ Path: nvidiaCTKExecutable, diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 963058e1..1ce3a7ff 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -56,9 +56,10 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ - Root: "", - LoadKmods: true, - Ldconfig: "@/test/ld/config/path", + Root: "", + LoadKmods: true, + Ldconfig: "@/test/ld/config/path", + CUDACompatMode: "ldconfig", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/dev/null", @@ -93,6 +94,7 @@ func TestGetConfig(t *testing.T) { "nvidia-container-cli.load-kmods = false", "nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"", "nvidia-container-cli.user = \"foo:bar\"", + "nvidia-container-cli.cuda-compat-mode = \"mount\"", "nvidia-container-runtime.debug = \"/foo/bar\"", "nvidia-container-runtime.discover-mode = \"not-legacy\"", "nvidia-container-runtime.log-level = \"debug\"", @@ -109,10 +111,11 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: false, SupportedDriverCapabilities: "compute,utility", NVIDIAContainerCLIConfig: 
ContainerCLIConfig{ - Root: "/bar/baz", - LoadKmods: false, - Ldconfig: "@/foo/bar/ldconfig", - User: "foo:bar", + Root: "/bar/baz", + LoadKmods: false, + Ldconfig: "@/foo/bar/ldconfig", + User: "foo:bar", + CUDACompatMode: "mount", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/foo/bar", @@ -156,8 +159,9 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ - Ldconfig: "/foo/bar/ldconfig", - LoadKmods: true, + Ldconfig: "/foo/bar/ldconfig", + LoadKmods: true, + CUDACompatMode: "ldconfig", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/dev/null", @@ -200,6 +204,7 @@ func TestGetConfig(t *testing.T) { "root = \"/bar/baz\"", "load-kmods = false", "ldconfig = \"@/foo/bar/ldconfig\"", + "cuda-compat-mode = \"mount\"", "user = \"foo:bar\"", "[nvidia-container-runtime]", "debug = \"/foo/bar\"", @@ -222,10 +227,11 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: false, SupportedDriverCapabilities: "compute,utility", NVIDIAContainerCLIConfig: ContainerCLIConfig{ - Root: "/bar/baz", - LoadKmods: false, - Ldconfig: "@/foo/bar/ldconfig", - User: "foo:bar", + Root: "/bar/baz", + LoadKmods: false, + Ldconfig: "@/foo/bar/ldconfig", + CUDACompatMode: "mount", + User: "foo:bar", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/foo/bar", @@ -264,10 +270,11 @@ func TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ - Root: "", - LoadKmods: true, - Ldconfig: "@/test/ld/config/path", - User: "root:video", + Root: "", + LoadKmods: true, + Ldconfig: "@/test/ld/config/path", + CUDACompatMode: "ldconfig", + User: "root:video", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/dev/null", @@ -303,10 +310,11 @@ func 
TestGetConfig(t *testing.T) { AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ - Root: "", - LoadKmods: true, - Ldconfig: "@/test/ld/config/path", - User: "foo:bar", + Root: "", + LoadKmods: true, + Ldconfig: "@/test/ld/config/path", + CUDACompatMode: "ldconfig", + User: "foo:bar", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/dev/null", diff --git a/internal/config/toml_test.go b/internal/config/toml_test.go index f7c649f7..71a07b7a 100644 --- a/internal/config/toml_test.go +++ b/internal/config/toml_test.go @@ -48,6 +48,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v #swarm-resource = "DOCKER_RESOURCE_GPU" [nvidia-container-cli] +cuda-compat-mode = "ldconfig" #debug = "/var/log/nvidia-container-toolkit.log" environment = [] #ldcache = "/etc/ld.so.cache" diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index 8320286e..8559ed40 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } - if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() { - compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver) - discoverers = append(discoverers, compatLibHookDiscoverer) - // For legacy mode, we also need to inject a hook to update the LDCache - // after we have modifed the configuration. 
-		if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" {
-			ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
-				logger,
-				discover.None{},
-				cfg.NVIDIACTKConfig.Path,
-				"",
-			)
-			if err != nil {
-				return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
-			}
-			discoverers = append(discoverers, ldcacheUpdateHookDiscoverer)
+	// If the feature flag has explicitly been toggled, we don't make any modification.
+	if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
+		cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver)
+		if err != nil {
+			return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err)
 		}
+		discoverers = append(discoverers, cudaCompatDiscoverer)
 	}
 
 	return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
 }
+
+// getCudaCompatModeDiscoverer returns the discoverer used to enable CUDA Forward
+// Compat support through the enable-cuda-compat hook. In legacy mode the hook is
+// only included if cuda-compat-mode is set to hook, and is then paired with a hook
+// to update the LDCache. When no hook applies, a no-op discoverer is returned
+// instead of a nil interface, so callers can use the result without a nil check.
+func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) {
+	// For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook.
+	if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" && cfg.NVIDIAContainerCLIConfig.CUDACompatMode != config.CUDACompatModeHook {
+		return discover.None{}, nil
+	}
+
+	compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
+	// For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook.
+	if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" {
+		return compatLibHookDiscoverer, nil
+	}
+
+	// For legacy mode, we also need to inject a hook to update the LDCache
+	// after we have modified the configuration.
+	ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(logger, discover.None{}, cfg.NVIDIACTKConfig.Path, "")
+	if err != nil {
+		return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
+	}
+
+	return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil
+}