From ba1ed3232f011853fe82da5eff796fb79dbeeb52 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 18 Sep 2024 22:10:09 +0200 Subject: [PATCH 1/4] Skip injection of nvidia-persistenced socket by default This changes skips the injection of the nvidia-persistenced socket by default. An include-persistenced-socket feature flag is added to allow the injection of this socket to be explicitly requested. Signed-off-by: Evan Lezar --- cmd/nvidia-container-runtime-hook/main.go | 6 +++ internal/config/features.go | 65 ++++++++++++----------- internal/modifier/gated.go | 8 +-- 3 files changed, 45 insertions(+), 34 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index faaf0b51..f53a649a 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -89,6 +89,12 @@ func doPrestart() { rootfs := getRootfsPath(container) args := []string{getCLIPath(cli)} + + // Only include the nvidia-persistenced socket if it is explicitly enabled. + if !hook.Features.IncludePersistencedSocket.IsEnabled() { + args = append(args, "--no-persistenced") + } + if cli.Root != "" { args = append(args, fmt.Sprintf("--root=%s", cli.Root)) } diff --git a/internal/config/features.go b/internal/config/features.go index dfc6b165..cbbecf3c 100644 --- a/internal/config/features.go +++ b/internal/config/features.go @@ -19,10 +19,11 @@ package config type featureName string const ( - FeatureGDS = featureName("gds") - FeatureMOFED = featureName("mofed") - FeatureNVSWITCH = featureName("nvswitch") - FeatureGDRCopy = featureName("gdrcopy") + FeatureGDS = featureName("gds") + FeatureMOFED = featureName("mofed") + FeatureNVSWITCH = featureName("nvswitch") + FeatureGDRCopy = featureName("gdrcopy") + FeatureIncludePersistencedSocket = featureName("include-persistenced-socket") ) // features specifies a set of named features. @@ -31,53 +32,57 @@ type features struct { MOFED *feature `toml:"mofed,omitempty"` NVSWITCH *feature `toml:"nvswitch,omitempty"` GDRCopy *feature `toml:"gdrcopy,omitempty"` + // IncludePersistencedSocket enables the injection of the nvidia-persistenced + // socket into containers. + IncludePersistencedSocket *feature `toml:"include-persistenced-socket,omitempty"` } type feature bool -// IsEnabled checks whether a specified named feature is enabled. +// IsEnabledInEnvironment checks whether a specified named feature is enabled. // An optional list of environments to check for feature-specific environment // variables can also be supplied. -func (fs features) IsEnabled(n featureName, in ...getenver) bool { - featureEnvvars := map[featureName]string{ - FeatureGDS: "NVIDIA_GDS", - FeatureMOFED: "NVIDIA_MOFED", - FeatureNVSWITCH: "NVIDIA_NVSWITCH", - FeatureGDRCopy: "NVIDIA_GDRCOPY", - } - - envvar := featureEnvvars[n] +func (fs features) IsEnabledInEnvironment(n featureName, in ...getenver) bool { switch n { + // Features with envvar overrides case FeatureGDS: - return fs.GDS.isEnabled(envvar, in...) + return fs.GDS.isEnabledWithEnvvarOverride("NVIDIA_GDS", in...) case FeatureMOFED: - return fs.MOFED.isEnabled(envvar, in...) + return fs.MOFED.isEnabledWithEnvvarOverride("NVIDIA_MOFED", in...) case FeatureNVSWITCH: - return fs.NVSWITCH.isEnabled(envvar, in...) + return fs.NVSWITCH.isEnabledWithEnvvarOverride("NVIDIA_NVSWITCH", in...) case FeatureGDRCopy: - return fs.GDRCopy.isEnabled(envvar, in...) + return fs.GDRCopy.isEnabledWithEnvvarOverride("NVIDIA_GDRCOPY", in...) + // Features without envvar overrides + case FeatureIncludePersistencedSocket: + return fs.IncludePersistencedSocket.IsEnabled() default: return false } } -// isEnabled checks whether a feature is enabled. -// If the enabled value is explicitly set, this is returned, otherwise the -// associated envvar is checked in the specified getenver for the string "enabled" -// A CUDA container / image can be passed here. -func (f *feature) isEnabled(envvar string, ins ...getenver) bool { +// IsEnabled checks whether a feature is enabled. +func (f *feature) IsEnabled() bool { if f != nil { return bool(*f) } - if envvar == "" { - return false - } - for _, in := range ins { - if in.Getenv(envvar) == "enabled" { - return true + return false +} + +// isEnabledWithEnvvarOverride checks whether a feature is enabled and allows an envvar to overide the feature. +// If the enabled value is explicitly set, this is returned, otherwise the +// associated envvar is checked in the specified getenver for the string "enabled" +// A CUDA container / image can be passed here. +func (f *feature) isEnabledWithEnvvarOverride(envvar string, ins ...getenver) bool { + if envvar != "" { + for _, in := range ins { + if in.Getenv(envvar) == "enabled" { + return true + } } } - return false + + return f.IsEnabled() } type getenver interface { diff --git a/internal/modifier/gated.go b/internal/modifier/gated.go index 5bed3eaf..70322b35 100644 --- a/internal/modifier/gated.go +++ b/internal/modifier/gated.go @@ -46,7 +46,7 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image driverRoot := cfg.NVIDIAContainerCLIConfig.Root devRoot := cfg.NVIDIAContainerCLIConfig.Root - if cfg.Features.IsEnabled(config.FeatureGDS, image) { + if cfg.Features.IsEnabledInEnvironment(config.FeatureGDS, image) { d, err := discover.NewGDSDiscoverer(logger, driverRoot, devRoot) if err != nil { return nil, fmt.Errorf("failed to construct discoverer for GDS devices: %w", err) @@ -54,7 +54,7 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } - if cfg.Features.IsEnabled(config.FeatureMOFED, image) { + if cfg.Features.IsEnabledInEnvironment(config.FeatureMOFED, image) { d, err := discover.NewMOFEDDiscoverer(logger, devRoot) if err != nil { return nil, fmt.Errorf("failed to construct discoverer for MOFED devices: %w", err) @@ -62,7 +62,7 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } - if cfg.Features.IsEnabled(config.FeatureNVSWITCH, image) { + if cfg.Features.IsEnabledInEnvironment(config.FeatureNVSWITCH, image) { d, err := discover.NewNvSwitchDiscoverer(logger, devRoot) if err != nil { return nil, fmt.Errorf("failed to construct discoverer for NVSWITCH devices: %w", err) @@ -70,7 +70,7 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image discoverers = append(discoverers, d) } - if cfg.Features.IsEnabled(config.FeatureGDRCopy, image) { + if cfg.Features.IsEnabledInEnvironment(config.FeatureGDRCopy, image) { d, err := discover.NewGDRCopyDiscoverer(logger, devRoot) if err != nil { return nil, fmt.Errorf("failed to construct discoverer for GDRCopy devices: %w", err) From a4bfccc3fec84d629c9794824e0c6cfc3d90cab7 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 16 Sep 2024 11:19:02 +0200 Subject: [PATCH 2/4] Use include-persistenced-socket feature for CDI mode This change ensures that the internal CDI representation includes the persistenced socket if the include-persistenced-socket feature flag is enabled. Signed-off-by: Evan Lezar --- internal/discover/ipc.go | 13 ++++++++----- internal/modifier/cdi.go | 1 + pkg/nvcdi/common-nvml.go | 2 +- pkg/nvcdi/driver-nvml.go | 22 +++++++++++----------- pkg/nvcdi/lib.go | 2 ++ pkg/nvcdi/management.go | 2 +- pkg/nvcdi/options.go | 11 +++++++++++ 7 files changed, 35 insertions(+), 18 deletions(-) diff --git a/internal/discover/ipc.go b/internal/discover/ipc.go index f636290f..f3d106b5 100644 --- a/internal/discover/ipc.go +++ b/internal/discover/ipc.go @@ -24,7 +24,13 @@ import ( type ipcMounts mounts // NewIPCDiscoverer creats a discoverer for NVIDIA IPC sockets. -func NewIPCDiscoverer(logger logger.Interface, driverRoot string) (Discover, error) { +func NewIPCDiscoverer(logger logger.Interface, driverRoot string, includePersistencedSocket bool) (Discover, error) { + var requiredSockets []string + if includePersistencedSocket { + requiredSockets = append(requiredSockets, "/nvidia-persistenced/socket") + } + requiredSockets = append(requiredSockets, "/nvidia-fabricmanager/socket") + sockets := newMounts( logger, lookup.NewFileLocator( @@ -34,10 +40,7 @@ func NewIPCDiscoverer(logger logger.Interface, driverRoot string) (Discover, err lookup.WithCount(1), ), driverRoot, - []string{ - "/nvidia-persistenced/socket", - "/nvidia-fabricmanager/socket", - }, + requiredSockets, ) mps := newMounts( diff --git a/internal/modifier/cdi.go b/internal/modifier/cdi.go index c5af4f88..d3e4e935 100644 --- a/internal/modifier/cdi.go +++ b/internal/modifier/cdi.go @@ -189,6 +189,7 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic nvcdi.WithDriverRoot(cfg.NVIDIAContainerCLIConfig.Root), nvcdi.WithVendor("runtime.nvidia.com"), nvcdi.WithClass("gpu"), + nvcdi.WithOptInFeature("include-persistenced-socket", cfg.Features.IncludePersistencedSocket.IsEnabled()), ) if err != nil { return nil, fmt.Errorf("failed to construct CDI library: %w", err) diff --git a/pkg/nvcdi/common-nvml.go b/pkg/nvcdi/common-nvml.go index 4dd1bc35..6e9661cb 100644 --- a/pkg/nvcdi/common-nvml.go +++ b/pkg/nvcdi/common-nvml.go @@ -41,7 +41,7 @@ func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) { l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err) } - driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, l.nvmllib) + driverFiles, err := l.NewDriverDiscoverer() if err != nil { return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err) } diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go index 8fb39888..59ff91c2 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/pkg/nvcdi/driver-nvml.go @@ -34,41 +34,41 @@ import ( // NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation. // The supplied NVML Library is used to query the expected driver version. -func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, nvmllib nvml.Interface) (discover.Discover, error) { - if r := nvmllib.Init(); r != nvml.SUCCESS { +func (l *nvmllib) NewDriverDiscoverer() (discover.Discover, error) { + if r := l.nvmllib.Init(); r != nvml.SUCCESS { return nil, fmt.Errorf("failed to initialize NVML: %v", r) } defer func() { - if r := nvmllib.Shutdown(); r != nvml.SUCCESS { - logger.Warningf("failed to shutdown NVML: %v", r) + if r := l.nvmllib.Shutdown(); r != nvml.SUCCESS { + l.logger.Warningf("failed to shutdown NVML: %v", r) } }() - version, r := nvmllib.SystemGetDriverVersion() + version, r := l.nvmllib.SystemGetDriverVersion() if r != nvml.SUCCESS { return nil, fmt.Errorf("failed to determine driver version: %v", r) } - return newDriverVersionDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version) + return (*nvcdilib)(l).newDriverVersionDiscoverer(version) } -func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) { - libraries, err := NewDriverLibraryDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version) +func (l *nvcdilib) newDriverVersionDiscoverer(version string) (discover.Discover, error) { + libraries, err := NewDriverLibraryDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, version) if err != nil { return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err) } - ipcs, err := discover.NewIPCDiscoverer(logger, driver.Root) + ipcs, err := discover.NewIPCDiscoverer(l.logger, l.driver.Root, l.optInFeatures["include-persistenced-socket"]) if err != nil { return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err) } - firmwares, err := NewDriverFirmwareDiscoverer(logger, driver.Root, version) + firmwares, err := NewDriverFirmwareDiscoverer(l.logger, l.driver.Root, version) if err != nil { return nil, fmt.Errorf("failed to create discoverer for GSP firmware: %v", err) } - binaries := NewDriverBinariesDiscoverer(logger, driver.Root) + binaries := NewDriverBinariesDiscoverer(l.logger, l.driver.Root) d := discover.Merge( libraries, diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index d2db3b6c..e14b2971 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -63,6 +63,8 @@ type nvcdilib struct { infolib info.Interface mergedDeviceOptions []transform.MergedDeviceOption + + optInFeatures map[string]bool } // New creates a new nvcdi library diff --git a/pkg/nvcdi/management.go b/pkg/nvcdi/management.go index 4648e5bb..8f7709af 100644 --- a/pkg/nvcdi/management.go +++ b/pkg/nvcdi/management.go @@ -66,7 +66,7 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { return nil, fmt.Errorf("failed to get CUDA version: %v", err) } - driver, err := newDriverVersionDiscoverer(m.logger, m.driver, m.nvidiaCDIHookPath, m.ldconfigPath, version) + driver, err := (*nvcdilib)(m).newDriverVersionDiscoverer(version) if err != nil { return nil, fmt.Errorf("failed to create driver library discoverer: %v", err) } diff --git a/pkg/nvcdi/options.go b/pkg/nvcdi/options.go index 417687b9..762b5ac6 100644 --- a/pkg/nvcdi/options.go +++ b/pkg/nvcdi/options.go @@ -155,3 +155,14 @@ func WithLibrarySearchPaths(paths []string) Option { o.librarySearchPaths = paths } } + +// WithOptInFeature sets a specific opt-in feature. +// Note that previous opt-in-features are not removed. +func WithOptInFeature(feature string, enabled bool) Option { + return func(n *nvcdilib) { + if n.optInFeatures == nil { + n.optInFeatures = make(map[string]bool) + } + n.optInFeatures[feature] = enabled + } +} From 70da6cfa50fb934d8da321db39254ffbb688c7ff Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Mon, 16 Sep 2024 11:27:57 +0200 Subject: [PATCH 3/4] Allow inclusion of persistenced socket in CDI specification This change adds an include-persistenced-socket flag to the nvidia-ctk cdi generate command that ensures that a generated specification includes the nvidia-persistenced socket if present on the host. Note that for mangement mode, these sockets are always included if detected. Signed-off-by: Evan Lezar --- cmd/nvidia-ctk/cdi/generate/generate.go | 8 ++++++++ pkg/nvcdi/lib.go | 3 +++ 2 files changed, 11 insertions(+) diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go index 9f9e994b..9c2da04f 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate.go +++ b/cmd/nvidia-ctk/cdi/generate/generate.go @@ -60,6 +60,8 @@ type options struct { files cli.StringSlice ignorePatterns cli.StringSlice } + + includePersistencedSocket bool } // NewCommand constructs a generate-cdi command with the specified logger @@ -169,6 +171,11 @@ func (m command) build() *cli.Command { Usage: "Specify a pattern the CSV mount specifications.", Destination: &opts.csv.ignorePatterns, }, + &cli.BoolFlag{ + Name: "include-persistenced-socket", + Usage: "Include the nvidia-persistenced socket in the generated CDI specification.", + Destination: &opts.includePersistencedSocket, + }, } return &c @@ -273,6 +280,7 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) { nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()), nvcdi.WithCSVFiles(opts.csv.files.Value()), nvcdi.WithCSVIgnorePatterns(opts.csv.ignorePatterns.Value()), + nvcdi.WithOptInFeature("include-persistenced-socket", opts.includePersistencedSocket), ) if err != nil { return nil, fmt.Errorf("failed to create CDI library: %v", err) diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index e14b2971..e535763b 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -132,6 +132,9 @@ func New(opts ...Option) (Interface, error) { if l.vendor == "" { l.vendor = "management.nvidia.com" } + // For management specifications we always allow the fabricmanager and + // persistenced sockets. + WithOptInFeature("include-persistenced-socket", true)(l) lib = (*managementlib)(l) case ModeNvml: lib = (*nvmllib)(l) From 9c2476c98d2d1414c26d7c93c7d4d4a8d4fed763 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 18 Sep 2024 22:20:56 +0200 Subject: [PATCH 4/4] Expose opt-in features in toolkit-container This change enables opt-in (off-by-default) features to be opted into. These features can be toggled by name by specifying the (repeated) --opt-in-feature command line argument or as a comma-separated list in the NVIDIA_CONTAINER_TOOLKIT_OPT_IN_FEATURES environment variable. Signed-off-by: Evan Lezar --- tools/container/toolkit/toolkit.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/container/toolkit/toolkit.go b/tools/container/toolkit/toolkit.go index 8175ed4e..c9d0db6a 100644 --- a/tools/container/toolkit/toolkit.go +++ b/tools/container/toolkit/toolkit.go @@ -81,6 +81,8 @@ type options struct { acceptNVIDIAVisibleDevicesAsVolumeMounts bool ignoreErrors bool + + optInFeatures cli.StringSlice } func main() { @@ -250,6 +252,12 @@ func main() { Destination: &opts.createDeviceNodes, EnvVars: []string{"CREATE_DEVICE_NODES"}, }, + &cli.StringSliceFlag{ + Name: "opt-in-feature", + Hidden: true, + Destination: &opts.optInFeatures, + EnvVars: []string{"NVIDIA_CONTAINER_TOOLKIT_OPT_IN_FEATURES"}, + }, } // Update the subcommand flags with the common subcommand flags @@ -518,6 +526,10 @@ func installToolkitConfig(c *cli.Context, toolkitConfigPath string, nvidiaContai "nvidia-container-runtime.runtimes": opts.ContainerRuntimeRuntimes, "nvidia-container-cli.debug": opts.ContainerCLIDebug, } + for _, feature := range opts.optInFeatures.Value() { + optionalConfigValues["features."+feature] = true + } + for key, value := range optionalConfigValues { if !c.IsSet(key) { log.Infof("Skipping unset option: %v", key)