Add cuda-compat-mode config option
Some checks failed
CI Pipeline / code-scanning (push) Has been cancelled
CI Pipeline / variables (push) Has been cancelled
CI Pipeline / golang (push) Has been cancelled
CI Pipeline / image (push) Has been cancelled
CI Pipeline / e2e-test (push) Has been cancelled

This change adds an nvidia-container-runtime.modes.legacy.cuda-compat-mode
config option. This can be set to one of four values:

* ldconfig (default): the --cuda-compat-mode=ldconfig flag is passed to the nvidia-container-cli
* mount: the --cuda-compat-mode=mount flag is passed to the nvidia-conainer-cli
* disabled: the --cuda-compat-mode=disabled flag is passed to the nvidia-container-cli
* hook: the --cuda-compat-mode=disabled flag is passed to the nvidia-container-cli AND the
  enable-cuda-compat hook is used to provide forward compatibility.

Note that the disable-cuda-compat-lib-hook feature flag will prevent the enable-cuda-compat
hook from being used. This change also means that the allow-cuda-compat-libs-from-container
feature flag no longer has any effect.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2025-04-29 14:00:12 +02:00
parent 0c254711e7
commit 50959cf409
8 changed files with 133 additions and 21 deletions

View File

@@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
discoverers = append(discoverers, d)
}
if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
discoverers = append(discoverers, compatLibHookDiscoverer)
// For legacy mode, we also need to inject a hook to update the LDCache
// after we have modifed the configuration.
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" {
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
logger,
discover.None{},
cfg.NVIDIACTKConfig.Path,
"",
)
if err != nil {
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
}
discoverers = append(discoverers, ldcacheUpdateHookDiscoverer)
// If the feature flag has explicitly been toggled, we don't make any modification.
if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver)
if err != nil {
return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err)
}
discoverers = append(discoverers, cudaCompatDiscoverer)
}
return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
}
func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) {
// For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook.
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" && cfg.NVIDIAContainerRuntimeConfig.Modes.Legacy.CUDACompatMode != config.CUDACompatModeHook {
return nil, nil
}
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
// For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook.
if cfg.NVIDIAContainerRuntimeConfig.Mode != "legacy" {
return compatLibHookDiscoverer, nil
}
// For legacy mode, we also need to inject a hook to update the LDCache
// after we have modifed the configuration.
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
logger,
discover.None{},
cfg.NVIDIACTKConfig.Path,
"",
)
if err != nil {
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
}
return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil
}