Add cuda-compat-mode config option
Some checks failed
CI Pipeline / code-scanning (push) Has been cancelled
CI Pipeline / variables (push) Has been cancelled
CI Pipeline / golang (push) Has been cancelled
CI Pipeline / image (push) Has been cancelled
CI Pipeline / e2e-test (push) Has been cancelled

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2025-04-29 14:00:12 +02:00
parent dccdfeddd1
commit aa0cb99bbb
No known key found for this signature in database
7 changed files with 93 additions and 41 deletions

View File

@ -114,9 +114,19 @@ func doPrestart() {
} }
args = append(args, "configure") args = append(args, "configure")
if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() { switch cli.CUDACompatMode {
case config.CUDACompatModeLdconfig:
args = append(args, "--cuda-compat-mode="+config.CUDACompatModeLdconfig)
case config.CUDACompatModeMount:
args = append(args, "--cuda-compat-mode="+config.CUDACompatModeMount)
case config.CUDACompatModeDisabled, config.CUDACompatModeHook:
args = append(args, "--no-cntlibs") args = append(args, "--no-cntlibs")
default:
if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
args = append(args, "--no-cntlibs")
}
} }
if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" { if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" {
args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath)) args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath))
} }

View File

@ -53,6 +53,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
swarm-resource = "" swarm-resource = ""
[nvidia-container-cli] [nvidia-container-cli]
cuda-compat-mode = "ldconfig"
debug = "" debug = ""
environment = [] environment = []
ldcache = "" ldcache = ""
@ -114,6 +115,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
swarm-resource = "" swarm-resource = ""
[nvidia-container-cli] [nvidia-container-cli]
cuda-compat-mode = "ldconfig"
debug = "" debug = ""
environment = [] environment = []
ldcache = "" ldcache = ""
@ -178,6 +180,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
swarm-resource = "" swarm-resource = ""
[nvidia-container-cli] [nvidia-container-cli]
cuda-compat-mode = "ldconfig"
debug = "" debug = ""
environment = [] environment = []
ldcache = "" ldcache = ""
@ -239,6 +242,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
swarm-resource = "" swarm-resource = ""
[nvidia-container-cli] [nvidia-container-cli]
cuda-compat-mode = "ldconfig"
debug = "" debug = ""
environment = [] environment = []
ldcache = "" ldcache = ""
@ -322,6 +326,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
swarm-resource = "" swarm-resource = ""
[nvidia-container-cli] [nvidia-container-cli]
cuda-compat-mode = "ldconfig"
debug = "" debug = ""
environment = [] environment = []
ldcache = "" ldcache = ""

View File

@ -22,6 +22,13 @@ import (
"strings" "strings"
) )
// Supported values for the nvidia-container-cli cuda-compat-mode config
// option, which selects how CUDA Forward Compat libraries are made
// discoverable in the container.
const (
	// CUDACompatModeLdconfig is forwarded to the nvidia-container-cli as
	// --cuda-compat-mode=ldconfig. It is the value set in the default config.
	CUDACompatModeLdconfig = "ldconfig"
	// CUDACompatModeMount is forwarded to the nvidia-container-cli as
	// --cuda-compat-mode=mount.
	CUDACompatModeMount = "mount"
	// CUDACompatModeHook causes the nvidia-container-cli to be invoked with
	// --no-cntlibs; the enable-cuda-compat hook is expected to handle the
	// compat libraries instead.
	CUDACompatModeHook = "hook"
	// CUDACompatModeDisabled causes the nvidia-container-cli to be invoked
	// with --no-cntlibs.
	CUDACompatModeDisabled = "disabled"
)
// ContainerCLIConfig stores the options for the nvidia-container-cli // ContainerCLIConfig stores the options for the nvidia-container-cli
type ContainerCLIConfig struct { type ContainerCLIConfig struct {
Root string `toml:"root"` Root string `toml:"root"`
@ -44,6 +51,9 @@ type ContainerCLIConfig struct {
// is required, the features.allow-ldconfig-from-container feature gate must // is required, the features.allow-ldconfig-from-container feature gate must
// be enabled explicitly. // be enabled explicitly.
Ldconfig ldconfigPath `toml:"ldconfig"` Ldconfig ldconfigPath `toml:"ldconfig"`
// CUDACompatMode sets the mode to be used to make CUDA Forward Compat
// libraries discoverable in the container.
CUDACompatMode string `toml:"cuda-compat-mode,omitempty"`
} }
// NormalizeLDConfigPath returns the resolved path of the configured LDConfig binary. // NormalizeLDConfigPath returns the resolved path of the configured LDConfig binary.

View File

@ -100,9 +100,10 @@ func GetDefault() (*Config, error) {
AcceptEnvvarUnprivileged: true, AcceptEnvvarUnprivileged: true,
SupportedDriverCapabilities: image.SupportedDriverCapabilities.String(), SupportedDriverCapabilities: image.SupportedDriverCapabilities.String(),
NVIDIAContainerCLIConfig: ContainerCLIConfig{ NVIDIAContainerCLIConfig: ContainerCLIConfig{
LoadKmods: true, LoadKmods: true,
Ldconfig: getLdConfigPath(), Ldconfig: getLdConfigPath(),
User: getUserGroup(), User: getUserGroup(),
CUDACompatMode: CUDACompatModeLdconfig,
}, },
NVIDIACTKConfig: CTKConfig{ NVIDIACTKConfig: CTKConfig{
Path: nvidiaCTKExecutable, Path: nvidiaCTKExecutable,

View File

@ -56,9 +56,10 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: true, AcceptEnvvarUnprivileged: true,
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
NVIDIAContainerCLIConfig: ContainerCLIConfig{ NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "", Root: "",
LoadKmods: true, LoadKmods: true,
Ldconfig: "@/test/ld/config/path", Ldconfig: "@/test/ld/config/path",
CUDACompatMode: "ldconfig",
}, },
NVIDIAContainerRuntimeConfig: RuntimeConfig{ NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/dev/null", DebugFilePath: "/dev/null",
@ -93,6 +94,7 @@ func TestGetConfig(t *testing.T) {
"nvidia-container-cli.load-kmods = false", "nvidia-container-cli.load-kmods = false",
"nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"", "nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"",
"nvidia-container-cli.user = \"foo:bar\"", "nvidia-container-cli.user = \"foo:bar\"",
"nvidia-container-cli.cuda-compat-mode = \"mount\"",
"nvidia-container-runtime.debug = \"/foo/bar\"", "nvidia-container-runtime.debug = \"/foo/bar\"",
"nvidia-container-runtime.discover-mode = \"not-legacy\"", "nvidia-container-runtime.discover-mode = \"not-legacy\"",
"nvidia-container-runtime.log-level = \"debug\"", "nvidia-container-runtime.log-level = \"debug\"",
@ -109,10 +111,11 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: false, AcceptEnvvarUnprivileged: false,
SupportedDriverCapabilities: "compute,utility", SupportedDriverCapabilities: "compute,utility",
NVIDIAContainerCLIConfig: ContainerCLIConfig{ NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "/bar/baz", Root: "/bar/baz",
LoadKmods: false, LoadKmods: false,
Ldconfig: "@/foo/bar/ldconfig", Ldconfig: "@/foo/bar/ldconfig",
User: "foo:bar", User: "foo:bar",
CUDACompatMode: "mount",
}, },
NVIDIAContainerRuntimeConfig: RuntimeConfig{ NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/foo/bar", DebugFilePath: "/foo/bar",
@ -156,8 +159,9 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: true, AcceptEnvvarUnprivileged: true,
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
NVIDIAContainerCLIConfig: ContainerCLIConfig{ NVIDIAContainerCLIConfig: ContainerCLIConfig{
Ldconfig: "/foo/bar/ldconfig", Ldconfig: "/foo/bar/ldconfig",
LoadKmods: true, LoadKmods: true,
CUDACompatMode: "ldconfig",
}, },
NVIDIAContainerRuntimeConfig: RuntimeConfig{ NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/dev/null", DebugFilePath: "/dev/null",
@ -200,6 +204,7 @@ func TestGetConfig(t *testing.T) {
"root = \"/bar/baz\"", "root = \"/bar/baz\"",
"load-kmods = false", "load-kmods = false",
"ldconfig = \"@/foo/bar/ldconfig\"", "ldconfig = \"@/foo/bar/ldconfig\"",
"cuda-compat-mode = \"mount\"",
"user = \"foo:bar\"", "user = \"foo:bar\"",
"[nvidia-container-runtime]", "[nvidia-container-runtime]",
"debug = \"/foo/bar\"", "debug = \"/foo/bar\"",
@ -222,10 +227,11 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: false, AcceptEnvvarUnprivileged: false,
SupportedDriverCapabilities: "compute,utility", SupportedDriverCapabilities: "compute,utility",
NVIDIAContainerCLIConfig: ContainerCLIConfig{ NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "/bar/baz", Root: "/bar/baz",
LoadKmods: false, LoadKmods: false,
Ldconfig: "@/foo/bar/ldconfig", Ldconfig: "@/foo/bar/ldconfig",
User: "foo:bar", CUDACompatMode: "mount",
User: "foo:bar",
}, },
NVIDIAContainerRuntimeConfig: RuntimeConfig{ NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/foo/bar", DebugFilePath: "/foo/bar",
@ -264,10 +270,11 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: true, AcceptEnvvarUnprivileged: true,
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
NVIDIAContainerCLIConfig: ContainerCLIConfig{ NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "", Root: "",
LoadKmods: true, LoadKmods: true,
Ldconfig: "@/test/ld/config/path", Ldconfig: "@/test/ld/config/path",
User: "root:video", CUDACompatMode: "ldconfig",
User: "root:video",
}, },
NVIDIAContainerRuntimeConfig: RuntimeConfig{ NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/dev/null", DebugFilePath: "/dev/null",
@ -303,10 +310,11 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: true, AcceptEnvvarUnprivileged: true,
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
NVIDIAContainerCLIConfig: ContainerCLIConfig{ NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "", Root: "",
LoadKmods: true, LoadKmods: true,
Ldconfig: "@/test/ld/config/path", Ldconfig: "@/test/ld/config/path",
User: "foo:bar", CUDACompatMode: "ldconfig",
User: "foo:bar",
}, },
NVIDIAContainerRuntimeConfig: RuntimeConfig{ NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/dev/null", DebugFilePath: "/dev/null",

View File

@ -48,6 +48,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
#swarm-resource = "DOCKER_RESOURCE_GPU" #swarm-resource = "DOCKER_RESOURCE_GPU"
[nvidia-container-cli] [nvidia-container-cli]
cuda-compat-mode = "ldconfig"
#debug = "/var/log/nvidia-container-toolkit.log" #debug = "/var/log/nvidia-container-toolkit.log"
environment = [] environment = []
#ldcache = "/etc/ld.so.cache" #ldcache = "/etc/ld.so.cache"

View File

@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
discoverers = append(discoverers, d) discoverers = append(discoverers, d)
} }
if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() { // If the feature flag has explicitly been toggled, we don't make any modification.
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver) if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
discoverers = append(discoverers, compatLibHookDiscoverer) cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver)
// For legacy mode, we also need to inject a hook to update the LDCache if err != nil {
// after we have modifed the configuration. return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err)
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" {
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
logger,
discover.None{},
cfg.NVIDIACTKConfig.Path,
"",
)
if err != nil {
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
}
discoverers = append(discoverers, ldcacheUpdateHookDiscoverer)
} }
discoverers = append(discoverers, cudaCompatDiscoverer)
} }
return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...)) return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
} }
// getCudaCompatModeDiscoverer returns the discoverer used to make CUDA compat
// libraries available in the container, based on the configured runtime mode
// and cuda-compat-mode:
//
//   - legacy runtime mode with a cuda-compat-mode other than "hook": a no-op
//     discoverer is returned.
//   - non-legacy runtime modes: the enable-cuda-compat hook discoverer is
//     returned as is.
//   - legacy runtime mode with cuda-compat-mode "hook": the enable-cuda-compat
//     hook merged with a hook that updates the LDCache.
//
// On success the returned discoverer is never nil, so callers can append it to
// a list of discoverers unconditionally. An error is only returned if the
// ldcache update hook cannot be constructed.
func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) {
	isLegacy := cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy"

	// For legacy mode, we only include the enable-cuda-compat hook if
	// cuda-compat-mode is set to hook. A no-op discoverer is returned instead
	// of a nil interface so that callers need not guard against nil.
	if isLegacy && cfg.NVIDIAContainerCLIConfig.CUDACompatMode != config.CUDACompatModeHook {
		return discover.None{}, nil
	}

	compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)

	// For non-legacy modes we return the hook as is. These modes *should*
	// already include the update-ldcache hook.
	if !isLegacy {
		return compatLibHookDiscoverer, nil
	}

	// For legacy mode, we also need to inject a hook to update the LDCache
	// after we have modified the configuration.
	ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
		logger,
		discover.None{},
		cfg.NVIDIACTKConfig.Path,
		"",
	)
	if err != nil {
		return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
	}

	return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil
}