Add cuda-compat-mode config option
Some checks failed
CI Pipeline / code-scanning (push) Has been cancelled
CI Pipeline / variables (push) Has been cancelled
CI Pipeline / golang (push) Has been cancelled
CI Pipeline / image (push) Has been cancelled
CI Pipeline / e2e-test (push) Has been cancelled

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2025-04-29 14:00:12 +02:00
parent dccdfeddd1
commit aa0cb99bbb
No known key found for this signature in database
7 changed files with 93 additions and 41 deletions

View File

@ -114,9 +114,19 @@ func doPrestart() {
}
args = append(args, "configure")
if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
switch cli.CUDACompatMode {
case config.CUDACompatModeLdconfig:
args = append(args, "--cuda-compat-mode="+config.CUDACompatModeLdconfig)
case config.CUDACompatModeMount:
args = append(args, "--cuda-compat-mode="+config.CUDACompatModeMount)
case config.CUDACompatModeDisabled, config.CUDACompatModeHook:
args = append(args, "--no-cntlibs")
default:
if !hook.Features.AllowCUDACompatLibsFromContainer.IsEnabled() {
args = append(args, "--no-cntlibs")
}
}
if ldconfigPath := cli.NormalizeLDConfigPath(); ldconfigPath != "" {
args = append(args, fmt.Sprintf("--ldconfig=%s", ldconfigPath))
}

View File

@ -53,6 +53,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
swarm-resource = ""
[nvidia-container-cli]
cuda-compat-mode = "ldconfig"
debug = ""
environment = []
ldcache = ""
@ -114,6 +115,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
swarm-resource = ""
[nvidia-container-cli]
cuda-compat-mode = "ldconfig"
debug = ""
environment = []
ldcache = ""
@ -178,6 +180,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
swarm-resource = ""
[nvidia-container-cli]
cuda-compat-mode = "ldconfig"
debug = ""
environment = []
ldcache = ""
@ -239,6 +242,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
swarm-resource = ""
[nvidia-container-cli]
cuda-compat-mode = "ldconfig"
debug = ""
environment = []
ldcache = ""
@ -322,6 +326,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
swarm-resource = ""
[nvidia-container-cli]
cuda-compat-mode = "ldconfig"
debug = ""
environment = []
ldcache = ""

View File

@ -22,6 +22,13 @@ import (
"strings"
)
// Values accepted by the cuda-compat-mode option of the nvidia-container-cli
// config section (see ContainerCLIConfig.CUDACompatMode).
const (
	// CUDACompatModeDisabled is the "disabled" mode.
	CUDACompatModeDisabled = "disabled"
	// CUDACompatModeHook is the "hook" mode.
	// NOTE(review): presumably this selects the enable-cuda-compat hook as the
	// mechanism for exposing the compat libraries; confirm against the runtime.
	CUDACompatModeHook = "hook"
	// CUDACompatModeLdconfig is the "ldconfig" mode.
	CUDACompatModeLdconfig = "ldconfig"
	// CUDACompatModeMount is the "mount" mode.
	CUDACompatModeMount = "mount"
)
// ContainerCLIConfig stores the options for the nvidia-container-cli
type ContainerCLIConfig struct {
Root string `toml:"root"`
@ -44,6 +51,9 @@ type ContainerCLIConfig struct {
// is required, the features.allow-ldconfig-from-container feature gate must
// be enabled explicitly.
Ldconfig ldconfigPath `toml:"ldconfig"`
// CUDACompatMode sets the mode to be used to make CUDA Forward Compat
// libraries discoverable in the container.
CUDACompatMode string `toml:"cuda-compat-mode,omitempty"`
}
// NormalizeLDConfigPath returns the resolved path of the configured LDConfig binary.

View File

@ -100,9 +100,10 @@ func GetDefault() (*Config, error) {
AcceptEnvvarUnprivileged: true,
SupportedDriverCapabilities: image.SupportedDriverCapabilities.String(),
NVIDIAContainerCLIConfig: ContainerCLIConfig{
LoadKmods: true,
Ldconfig: getLdConfigPath(),
User: getUserGroup(),
LoadKmods: true,
Ldconfig: getLdConfigPath(),
User: getUserGroup(),
CUDACompatMode: CUDACompatModeLdconfig,
},
NVIDIACTKConfig: CTKConfig{
Path: nvidiaCTKExecutable,

View File

@ -56,9 +56,10 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: true,
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "",
LoadKmods: true,
Ldconfig: "@/test/ld/config/path",
Root: "",
LoadKmods: true,
Ldconfig: "@/test/ld/config/path",
CUDACompatMode: "ldconfig",
},
NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/dev/null",
@ -93,6 +94,7 @@ func TestGetConfig(t *testing.T) {
"nvidia-container-cli.load-kmods = false",
"nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"",
"nvidia-container-cli.user = \"foo:bar\"",
"nvidia-container-cli.cuda-compat-mode = \"mount\"",
"nvidia-container-runtime.debug = \"/foo/bar\"",
"nvidia-container-runtime.discover-mode = \"not-legacy\"",
"nvidia-container-runtime.log-level = \"debug\"",
@ -109,10 +111,11 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: false,
SupportedDriverCapabilities: "compute,utility",
NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "/bar/baz",
LoadKmods: false,
Ldconfig: "@/foo/bar/ldconfig",
User: "foo:bar",
Root: "/bar/baz",
LoadKmods: false,
Ldconfig: "@/foo/bar/ldconfig",
User: "foo:bar",
CUDACompatMode: "mount",
},
NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/foo/bar",
@ -156,8 +159,9 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: true,
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
NVIDIAContainerCLIConfig: ContainerCLIConfig{
Ldconfig: "/foo/bar/ldconfig",
LoadKmods: true,
Ldconfig: "/foo/bar/ldconfig",
LoadKmods: true,
CUDACompatMode: "ldconfig",
},
NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/dev/null",
@ -200,6 +204,7 @@ func TestGetConfig(t *testing.T) {
"root = \"/bar/baz\"",
"load-kmods = false",
"ldconfig = \"@/foo/bar/ldconfig\"",
"cuda-compat-mode = \"mount\"",
"user = \"foo:bar\"",
"[nvidia-container-runtime]",
"debug = \"/foo/bar\"",
@ -222,10 +227,11 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: false,
SupportedDriverCapabilities: "compute,utility",
NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "/bar/baz",
LoadKmods: false,
Ldconfig: "@/foo/bar/ldconfig",
User: "foo:bar",
Root: "/bar/baz",
LoadKmods: false,
Ldconfig: "@/foo/bar/ldconfig",
CUDACompatMode: "mount",
User: "foo:bar",
},
NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/foo/bar",
@ -264,10 +270,11 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: true,
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "",
LoadKmods: true,
Ldconfig: "@/test/ld/config/path",
User: "root:video",
Root: "",
LoadKmods: true,
Ldconfig: "@/test/ld/config/path",
CUDACompatMode: "ldconfig",
User: "root:video",
},
NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/dev/null",
@ -303,10 +310,11 @@ func TestGetConfig(t *testing.T) {
AcceptEnvvarUnprivileged: true,
SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video",
NVIDIAContainerCLIConfig: ContainerCLIConfig{
Root: "",
LoadKmods: true,
Ldconfig: "@/test/ld/config/path",
User: "foo:bar",
Root: "",
LoadKmods: true,
Ldconfig: "@/test/ld/config/path",
CUDACompatMode: "ldconfig",
User: "foo:bar",
},
NVIDIAContainerRuntimeConfig: RuntimeConfig{
DebugFilePath: "/dev/null",

View File

@ -48,6 +48,7 @@ supported-driver-capabilities = "compat32,compute,display,graphics,ngx,utility,v
#swarm-resource = "DOCKER_RESOURCE_GPU"
[nvidia-container-cli]
cuda-compat-mode = "ldconfig"
#debug = "/var/log/nvidia-container-toolkit.log"
environment = []
#ldcache = "/etc/ld.so.cache"

View File

@ -79,24 +79,41 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
discoverers = append(discoverers, d)
}
if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
discoverers = append(discoverers, compatLibHookDiscoverer)
// For legacy mode, we also need to inject a hook to update the LDCache
// after we have modified the configuration.
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" {
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
logger,
discover.None{},
cfg.NVIDIACTKConfig.Path,
"",
)
if err != nil {
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
}
discoverers = append(discoverers, ldcacheUpdateHookDiscoverer)
// If the feature flag has explicitly been toggled, we don't make any modification.
if !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
cudaCompatDiscoverer, err := getCudaCompatModeDiscoverer(logger, cfg, driver)
if err != nil {
return nil, fmt.Errorf("failed to construct CUDA Compat discoverer: %w", err)
}
discoverers = append(discoverers, cudaCompatDiscoverer)
}
return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
}
// getCudaCompatModeDiscoverer constructs the discoverer responsible for the
// enable-cuda-compat hook, based on the runtime mode and the configured
// nvidia-container-cli cuda-compat-mode.
//
//   - Legacy runtime mode with a cuda-compat-mode other than "hook": no hook is
//     injected and a no-op discoverer is returned.
//   - Non-legacy runtime modes: only the CUDA compat hook discoverer is returned.
//   - Legacy runtime mode with cuda-compat-mode "hook": the CUDA compat hook is
//     merged with an ldcache update hook.
//
// An error is returned only if the ldcache update hook cannot be constructed.
func getCudaCompatModeDiscoverer(logger logger.Interface, cfg *config.Config, driver *root.Driver) (discover.Discover, error) {
	isLegacy := cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy"

	// For legacy mode, we only include the enable-cuda-compat hook if cuda-compat-mode is set to hook.
	if isLegacy && cfg.NVIDIAContainerCLIConfig.CUDACompatMode != config.CUDACompatModeHook {
		// Return the no-op discoverer instead of a nil interface: callers append
		// the result to a list of discoverers without checking it for nil.
		return discover.None{}, nil
	}

	compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
	// For non-legacy modes we return the hook as is. These modes *should* already include the update-ldcache hook.
	if !isLegacy {
		return compatLibHookDiscoverer, nil
	}

	// For legacy mode, we also need to inject a hook to update the LDCache
	// after we have modified the configuration.
	ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
		logger,
		discover.None{},
		cfg.NVIDIACTKConfig.Path,
		"",
	)
	if err != nil {
		return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
	}

	return discover.Merge(compatLibHookDiscoverer, ldcacheUpdateHookDiscoverer), nil
}