diff --git a/cmd/nvidia-container-runtime-hook/hook_config.go b/cmd/nvidia-container-runtime-hook/hook_config.go index f88815e5..3ad8e3b1 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config.go +++ b/cmd/nvidia-container-runtime-hook/hook_config.go @@ -104,3 +104,26 @@ func (c *hookConfig) getSwarmResourceEnvvars() []string { return envvars } + +// nvidiaContainerCliCUDACompatModeFlags returns required --cuda-compat-mode +// flag(s) depending on the hook and runtime configurations. +func (c *hookConfig) nvidiaContainerCliCUDACompatModeFlags() []string { + var flag string + switch c.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode { + case config.CUDACompatModeLdconfig: + flag = "--cuda-compat-mode=ldconfig" + case config.CUDACompatModeMount: + flag = "--cuda-compat-mode=mount" + case config.CUDACompatModeDisabled, config.CUDACompatModeHook: + flag = "--cuda-compat-mode=disabled" + default: + if !c.Features.AllowCUDACompatLibsFromContainer.IsEnabled() { + flag = "--cuda-compat-mode=disabled" + } + } + + if flag == "" { + return nil + } + return []string{flag} +} diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index e864a51d..c77fa390 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -114,9 +114,8 @@ func doPrestart() { } args = append(args, "configure") - if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() { - args = append(args, "--no-cntlibs") - } + args = append(args, hook.nvidiaContainerCliCUDACompatModeFlags()...) 
+ if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" { args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath)) } diff --git a/cmd/nvidia-ctk-installer/main_test.go b/cmd/nvidia-ctk-installer/main_test.go index b98bf3ea..444a3f22 100644 --- a/cmd/nvidia-ctk-installer/main_test.go +++ b/cmd/nvidia-ctk-installer/main_test.go @@ -79,6 +79,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.legacy] + cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -140,6 +143,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.legacy] + cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -204,6 +210,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.legacy] + cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -265,6 +274,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.legacy] + cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true @@ -348,6 +360,9 @@ swarm-resource = "" [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" + [nvidia-container-runtime.modes.legacy] + cuda-compat-mode = "ldconfig" + 
[nvidia-container-runtime-hook] path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook" skip-mode-detection = true diff --git a/internal/config/config.go b/internal/config/config.go index 652cc83a..0029623b 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -121,6 +121,9 @@ func GetDefault() (*Config, error) { AnnotationPrefixes: []string{cdi.AnnotationPrefix}, SpecDirs: cdi.DefaultSpecDirs, }, + Legacy: legacyModeConfig{ + CUDACompatMode: defaultCUDACompatMode, + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 963058e1..3cfce7d6 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: []string{"cdi.k8s.io/"}, SpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "ldconfig", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -93,6 +96,7 @@ func TestGetConfig(t *testing.T) { "nvidia-container-cli.load-kmods = false", "nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"", "nvidia-container-cli.user = \"foo:bar\"", + "nvidia-container-cli.cuda-compat-mode = \"mount\"", "nvidia-container-runtime.debug = \"/foo/bar\"", "nvidia-container-runtime.discover-mode = \"not-legacy\"", "nvidia-container-runtime.log-level = \"debug\"", @@ -102,6 +106,7 @@ func TestGetConfig(t *testing.T) { "nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]", "nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]", "nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", + "nvidia-container-runtime.modes.legacy.cuda-compat-mode = \"mount\"", "nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"", "nvidia-ctk.path = 
\"/foo/bar/nvidia-ctk\"", }, @@ -134,6 +139,9 @@ func TestGetConfig(t *testing.T) { "/not/var/run/cdi", }, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "mount", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -178,6 +186,9 @@ func TestGetConfig(t *testing.T) { "/var/run/cdi", }, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "ldconfig", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -200,6 +211,7 @@ func TestGetConfig(t *testing.T) { "root = \"/bar/baz\"", "load-kmods = false", "ldconfig = \"@/foo/bar/ldconfig\"", + "cuda-compat-mode = \"mount\"", "user = \"foo:bar\"", "[nvidia-container-runtime]", "debug = \"/foo/bar\"", @@ -213,6 +225,8 @@ func TestGetConfig(t *testing.T) { "spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]", "[nvidia-container-runtime.modes.csv]", "mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"", + "[nvidia-container-runtime.modes.legacy]", + "cuda-compat-mode = \"mount\"", "[nvidia-container-runtime-hook]", "path = \"/foo/bar/nvidia-container-runtime-hook\"", "[nvidia-ctk]", @@ -247,6 +261,9 @@ func TestGetConfig(t *testing.T) { "/not/var/run/cdi", }, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "mount", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -283,6 +300,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: []string{"cdi.k8s.io/"}, SpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "ldconfig", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ @@ -322,6 +342,9 @@ func TestGetConfig(t *testing.T) { AnnotationPrefixes: []string{"cdi.k8s.io/"}, SpecDirs: []string{"/etc/cdi", "/var/run/cdi"}, }, + Legacy: legacyModeConfig{ + CUDACompatMode: "ldconfig", + }, }, }, NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ diff --git a/internal/config/runtime.go b/internal/config/runtime.go index 2ba1b7a8..315f23ba 100644 --- a/internal/config/runtime.go +++ 
b/internal/config/runtime.go @@ -29,8 +29,9 @@ type RuntimeConfig struct { // modesConfig defines (optional) per-mode configs type modesConfig struct { - CSV csvModeConfig `toml:"csv"` - CDI cdiModeConfig `toml:"cdi"` + CSV csvModeConfig `toml:"csv"` + CDI cdiModeConfig `toml:"cdi"` + Legacy legacyModeConfig `toml:"legacy"` } type cdiModeConfig struct { @@ -45,3 +46,31 @@ type cdiModeConfig struct { type csvModeConfig struct { MountSpecPath string `toml:"mount-spec-path"` } + +type legacyModeConfig struct { + // CUDACompatMode sets the mode to be used to make CUDA Forward Compat + // libraries discoverable in the container. + CUDACompatMode cudaCompatMode `toml:"cuda-compat-mode,omitempty"` +} + +type cudaCompatMode string + +const ( + defaultCUDACompatMode = CUDACompatModeLdconfig + // CUDACompatModeDisabled explicitly disables the handling of CUDA Forward + // Compatibility in the NVIDIA Container Runtime and NVIDIA Container + // Runtime Hook. + CUDACompatModeDisabled = cudaCompatMode("disabled") + // CUDACompatModeHook uses a container lifecycle hook to implement CUDA + // Forward Compatibility support. This requires the use of the NVIDIA + // Container Runtime and is not compatible with use cases where only the + // NVIDIA Container Runtime Hook is used (e.g. the Docker --gpus flag). + CUDACompatModeHook = cudaCompatMode("hook") + // CUDACompatModeLdconfig adds the folders containing CUDA Forward Compat + // libraries to the ldconfig command invoked from the NVIDIA Container + // Runtime Hook. + CUDACompatModeLdconfig = cudaCompatMode("ldconfig") + // CUDACompatModeMount mounts CUDA Forward Compat folders from the container + // to the container when using the NVIDIA Container Runtime Hook. 
+ CUDACompatModeMount = cudaCompatMode("mount") +) diff --git a/internal/config/toml_test.go b/internal/config/toml_test.go index f7c649f7..c4394864 100644 --- a/internal/config/toml_test.go +++ b/internal/config/toml_test.go @@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"] [nvidia-container-runtime.modes.csv] mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d" +[nvidia-container-runtime.modes.legacy] +cuda-compat-mode = "ldconfig" + [nvidia-container-runtime-hook] path = "nvidia-container-runtime-hook" skip-mode-detection = false diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index 8320286e..584391aa 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } - if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() { - compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver) - discoverers = append(discoverers, compatLibHookDiscoverer) - // For legacy mode, we also need to inject a hook to update the LDCache - // after we have modifed the configuration. - if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" { - ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook( - logger, - discover.None{}, - cfg.NVIDIACTKConfig.Path, - "", - ) - if err != nil { - return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err) - } - discoverers = append(discoverers, ldcacheUpdateHookDiscoverer) + // If the feature flag has explicitly been toggled, we don't make any modification. 
+ if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() { + cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver) + if err != nil { + return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err) } + discoverers = append(discoverers, cudaCompatDiscoverer) } return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...)) } + +func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) { + // For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook. + if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" && cfg.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode != config.CUDACompatModeHook { + return nil, nil + } + + compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver) + // For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook. + if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" { + return compatLibHookDiscoverer, nil + } + + // For legacy mode, we also need to inject a hook to update the LDCache + // after we have modified the configuration. + ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook( + logger, + discover.None{}, + cfg.NVIDIACTKConfig.Path, + "", + ) + if err != nil { + return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err) + } + + return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil +}