From f85204307852f4f97cc28884ef4f45a801ce8947 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 7 Mar 2025 14:15:31 +0200 Subject: [PATCH 1/2] Allow enable-cuda-compat hook to be disabled in CDI spec generation This change adds support to the nvcdi package to opt out of specific hooks. Currently only the `enable-cuda-compat` hook is supported. This allows clients to generate a CDI spec that is compatible with older nvidia-cdi-hook CLIs. Signed-off-by: Evan Lezar --- pkg/nvcdi/api.go | 10 ++++++ pkg/nvcdi/common-nvml.go | 2 +- pkg/nvcdi/driver-nvml.go | 62 ++++++++++++++++++--------------- pkg/nvcdi/hooks.go | 30 ++++++++++++++++ pkg/nvcdi/lib.go | 6 +++- pkg/nvcdi/management.go | 2 +- pkg/nvcdi/options.go | 11 ++++++ third_party/libnvidia-container | 2 +- 8 files changed, 93 insertions(+), 32 deletions(-) create mode 100644 pkg/nvcdi/hooks.go diff --git a/pkg/nvcdi/api.go b/pkg/nvcdi/api.go index f1c7b97a..2988026f 100644 --- a/pkg/nvcdi/api.go +++ b/pkg/nvcdi/api.go @@ -35,3 +35,13 @@ type Interface interface { GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error) GetDeviceSpecsByID(...string) ([]specs.Device, error) } + +// A HookName refers to one of the predefined set of CDI hooks that may be +// included in the generated CDI specification. +type HookName string + +const ( + // HookEnableCudaCompat refers to the hook used to enable CUDA Forward Compatibility. + // This was added with v1.17.5 of the NVIDIA Container Toolkit. 
+ HookEnableCudaCompat = HookName("enable-cuda-compat") +) diff --git a/pkg/nvcdi/common-nvml.go b/pkg/nvcdi/common-nvml.go index 4dd1bc35..6e9661cb 100644 --- a/pkg/nvcdi/common-nvml.go +++ b/pkg/nvcdi/common-nvml.go @@ -41,7 +41,7 @@ func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) { l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err) } - driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, l.nvmllib) + driverFiles, err := l.NewDriverDiscoverer() if err != nil { return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err) } diff --git a/pkg/nvcdi/driver-nvml.go b/pkg/nvcdi/driver-nvml.go index 782d60fc..f49f1129 100644 --- a/pkg/nvcdi/driver-nvml.go +++ b/pkg/nvcdi/driver-nvml.go @@ -34,41 +34,41 @@ import ( // NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation. // The supplied NVML Library is used to query the expected driver version. 
-func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, nvmllib nvml.Interface) (discover.Discover, error) { - if r := nvmllib.Init(); r != nvml.SUCCESS { +func (l *nvmllib) NewDriverDiscoverer() (discover.Discover, error) { + if r := l.nvmllib.Init(); r != nvml.SUCCESS { return nil, fmt.Errorf("failed to initialize NVML: %v", r) } defer func() { - if r := nvmllib.Shutdown(); r != nvml.SUCCESS { - logger.Warningf("failed to shutdown NVML: %v", r) + if r := l.nvmllib.Shutdown(); r != nvml.SUCCESS { + l.logger.Warningf("failed to shutdown NVML: %v", r) } }() - version, r := nvmllib.SystemGetDriverVersion() + version, r := l.nvmllib.SystemGetDriverVersion() if r != nvml.SUCCESS { return nil, fmt.Errorf("failed to determine driver version: %v", r) } - return newDriverVersionDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version) + return (*nvcdilib)(l).newDriverVersionDiscoverer(version) } -func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) { - libraries, err := NewDriverLibraryDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version) +func (l *nvcdilib) newDriverVersionDiscoverer(version string) (discover.Discover, error) { + libraries, err := l.NewDriverLibraryDiscoverer(version) if err != nil { return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err) } - ipcs, err := discover.NewIPCDiscoverer(logger, driver.Root) + ipcs, err := discover.NewIPCDiscoverer(l.logger, l.driver.Root) if err != nil { return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err) } - firmwares, err := NewDriverFirmwareDiscoverer(logger, driver.Root, version) + firmwares, err := NewDriverFirmwareDiscoverer(l.logger, l.driver.Root, version) if err != nil { return nil, fmt.Errorf("failed to create discoverer for GSP firmware: %v", err) } - binaries := 
NewDriverBinariesDiscoverer(logger, driver.Root) + binaries := NewDriverBinariesDiscoverer(l.logger, l.driver.Root) d := discover.Merge( libraries, @@ -81,35 +81,41 @@ func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nv } // NewDriverLibraryDiscoverer creates a discoverer for the libraries associated with the specified driver version. -func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) { - libraryPaths, err := getVersionLibs(logger, driver, version) +func (l *nvcdilib) NewDriverLibraryDiscoverer(version string) (discover.Discover, error) { + libraryPaths, err := getVersionLibs(l.logger, l.driver, version) if err != nil { return nil, fmt.Errorf("failed to get libraries for driver version: %v", err) } libraries := discover.NewMounts( - logger, + l.logger, lookup.NewFileLocator( - lookup.WithLogger(logger), - lookup.WithRoot(driver.Root), + lookup.WithLogger(l.logger), + lookup.WithRoot(l.driver.Root), ), - driver.Root, + l.driver.Root, libraryPaths, ) - // TODO: The following should use the version directly. - cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, nvidiaCDIHookPath, driver) - updateLDCache, _ := discover.NewLDCacheUpdateHook(logger, libraries, nvidiaCDIHookPath, ldconfigPath) + var discoverers []discover.Discover - d := discover.Merge( - discover.WithDriverDotSoSymlinks( - libraries, - version, - nvidiaCDIHookPath, - ), - cudaCompatLibHookDiscoverer, - updateLDCache, + driverDotSoSymlinksDiscoverer := discover.WithDriverDotSoSymlinks( + libraries, + version, + l.nvidiaCDIHookPath, ) + discoverers = append(discoverers, driverDotSoSymlinksDiscoverer) + + if l.HookIsSupported(HookEnableCudaCompat) { + // TODO: The following should use the version directly. 
+ cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(l.logger, l.nvidiaCDIHookPath, l.driver) + discoverers = append(discoverers, cudaCompatLibHookDiscoverer) + } + + updateLDCache, _ := discover.NewLDCacheUpdateHook(l.logger, libraries, l.nvidiaCDIHookPath, l.ldconfigPath) + discoverers = append(discoverers, updateLDCache) + + d := discover.Merge(discoverers...) return d, nil } diff --git a/pkg/nvcdi/hooks.go b/pkg/nvcdi/hooks.go new file mode 100644 index 00000000..a4620dc8 --- /dev/null +++ b/pkg/nvcdi/hooks.go @@ -0,0 +1,30 @@ +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package nvcdi + +// disabledHooks allows individual hooks to be disabled. +type disabledHooks map[HookName]bool + +// HookIsSupported checks whether a hook of the specified name is supported. +// Hooks must be explicitly disabled, meaning that if no hooks have been +// disabled, all hooks are supported. 
+func (l *nvcdilib) HookIsSupported(h HookName) bool { + if len(l.disabledHooks) == 0 { + return true + } + return !l.disabledHooks[h] +} diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index d6710862..1b181c8d 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -54,11 +54,15 @@ type nvcdilib struct { infolib info.Interface mergedDeviceOptions []transform.MergedDeviceOption + + disabledHooks disabledHooks } // New creates a new nvcdi library func New(opts ...Option) (Interface, error) { - l := &nvcdilib{} + l := &nvcdilib{ + disabledHooks: make(disabledHooks), + } for _, opt := range opts { opt(l) } diff --git a/pkg/nvcdi/management.go b/pkg/nvcdi/management.go index dee63a14..f0fa900e 100644 --- a/pkg/nvcdi/management.go +++ b/pkg/nvcdi/management.go @@ -80,7 +80,7 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) { return nil, fmt.Errorf("failed to get CUDA version: %v", err) } - driver, err := newDriverVersionDiscoverer(m.logger, m.driver, m.nvidiaCDIHookPath, m.ldconfigPath, version) + driver, err := (*nvcdilib)(m).newDriverVersionDiscoverer(version) if err != nil { return nil, fmt.Errorf("failed to create driver library discoverer: %v", err) } diff --git a/pkg/nvcdi/options.go b/pkg/nvcdi/options.go index 362545d2..f38f2b4a 100644 --- a/pkg/nvcdi/options.go +++ b/pkg/nvcdi/options.go @@ -155,3 +155,14 @@ func WithLibrarySearchPaths(paths []string) Option { o.librarySearchPaths = paths } } + +// WithDisabledHook allows specific hooks to be disabled. +// This option can be specified multiple times, once for each hook to disable. 
+func WithDisabledHook(hook HookName) Option { + return func(o *nvcdilib) { + if o.disabledHooks == nil { + o.disabledHooks = make(map[HookName]bool) + } + o.disabledHooks[hook] = true + } +} diff --git a/third_party/libnvidia-container b/third_party/libnvidia-container index 95d3e865..f23e5e55 160000 --- a/third_party/libnvidia-container +++ b/third_party/libnvidia-container @@ -1 +1 @@ -Subproject commit 95d3e86522976061e856724867ebcaf75c4e9b60 +Subproject commit f23e5e55ea27b3680aef363436d4bcf7659e0bfc From 0f299c3431ee53e27d48ee939af3a228db00536c Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 7 Mar 2025 14:44:32 +0200 Subject: [PATCH 2/2] Disable enable-cuda-compat hook for management containers Management containers don't generally need forward compatibility. We disable the enable-cuda-compat hook to not include this in the generated CDI specifications. Signed-off-by: Evan Lezar --- cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go | 6 ------ pkg/nvcdi/lib.go | 2 ++ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go b/cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go index dfc33120..855141ff 100644 --- a/cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go +++ b/cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go @@ -80,12 +80,6 @@ containerEdits: - libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so hookName: createContainer path: {{ .toolkitRoot }}/nvidia-cdi-hook - - args: - - nvidia-cdi-hook - - enable-cuda-compat - - --host-driver-version=999.88.77 - hookName: createContainer - path: {{ .toolkitRoot }}/nvidia-cdi-hook - args: - nvidia-cdi-hook - update-ldcache diff --git a/pkg/nvcdi/lib.go b/pkg/nvcdi/lib.go index 1b181c8d..8e7653b4 100644 --- a/pkg/nvcdi/lib.go +++ b/pkg/nvcdi/lib.go @@ -144,6 +144,8 @@ func New(opts ...Option) (Interface, error) { if l.vendor == "" { l.vendor = "management.nvidia.com" } + // Management containers in general do not 
require CUDA Forward compatibility. + l.disabledHooks[HookEnableCudaCompat] = true lib = (*managementlib)(l) case ModeNvml: lib = (*nvmllib)(l)