Merge pull request #968 from elezar/allow-hooks-disable
Some checks failed
CI Pipeline / code-scanning (push) Has been cancelled
CI Pipeline / variables (push) Has been cancelled
CI Pipeline / golang (push) Has been cancelled
CI Pipeline / image (push) Has been cancelled
CI Pipeline / e2e-test (push) Has been cancelled

Allow enable-cuda-compat hook to be disabled in CDI spec generation
This commit is contained in:
Evan Lezar 2025-03-07 16:54:34 +02:00 committed by GitHub
commit e436533a6f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 95 additions and 38 deletions

View File

@ -80,12 +80,6 @@ containerEdits:
- libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so
hookName: createContainer
path: {{ .toolkitRoot }}/nvidia-cdi-hook
- args:
- nvidia-cdi-hook
- enable-cuda-compat
- --host-driver-version=999.88.77
hookName: createContainer
path: {{ .toolkitRoot }}/nvidia-cdi-hook
- args:
- nvidia-cdi-hook
- update-ldcache

View File

@ -35,3 +35,13 @@ type Interface interface {
GetMIGDeviceSpecs(int, device.Device, int, device.MigDevice) ([]specs.Device, error)
GetDeviceSpecsByID(...string) ([]specs.Device, error)
}
// A HookName refers to one of the predefined set of CDI hooks that may be
// included in the generated CDI specification.
type HookName string
const (
// HookEnableCudaCompat refers to the hook used to enable CUDA Forward Compatibility.
// This was added with v1.17.5 of the NVIDIA Container Toolkit.
HookEnableCudaCompat = HookName("enable-cuda-compat")
)

View File

@ -41,7 +41,7 @@ func (l *nvmllib) newCommonNVMLDiscoverer() (discover.Discover, error) {
l.logger.Warningf("failed to create discoverer for graphics mounts: %v", err)
}
driverFiles, err := NewDriverDiscoverer(l.logger, l.driver, l.nvidiaCDIHookPath, l.ldconfigPath, l.nvmllib)
driverFiles, err := l.NewDriverDiscoverer()
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for driver files: %v", err)
}

View File

@ -34,41 +34,41 @@ import (
// NewDriverDiscoverer creates a discoverer for the libraries and binaries associated with a driver installation.
// The supplied NVML Library is used to query the expected driver version.
func NewDriverDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath string, ldconfigPath string, nvmllib nvml.Interface) (discover.Discover, error) {
if r := nvmllib.Init(); r != nvml.SUCCESS {
func (l *nvmllib) NewDriverDiscoverer() (discover.Discover, error) {
if r := l.nvmllib.Init(); r != nvml.SUCCESS {
return nil, fmt.Errorf("failed to initialize NVML: %v", r)
}
defer func() {
if r := nvmllib.Shutdown(); r != nvml.SUCCESS {
logger.Warningf("failed to shutdown NVML: %v", r)
if r := l.nvmllib.Shutdown(); r != nvml.SUCCESS {
l.logger.Warningf("failed to shutdown NVML: %v", r)
}
}()
version, r := nvmllib.SystemGetDriverVersion()
version, r := l.nvmllib.SystemGetDriverVersion()
if r != nvml.SUCCESS {
return nil, fmt.Errorf("failed to determine driver version: %v", r)
}
return newDriverVersionDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version)
return (*nvcdilib)(l).newDriverVersionDiscoverer(version)
}
func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) {
libraries, err := NewDriverLibraryDiscoverer(logger, driver, nvidiaCDIHookPath, ldconfigPath, version)
func (l *nvcdilib) newDriverVersionDiscoverer(version string) (discover.Discover, error) {
libraries, err := l.NewDriverLibraryDiscoverer(version)
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for driver libraries: %v", err)
}
ipcs, err := discover.NewIPCDiscoverer(logger, driver.Root)
ipcs, err := discover.NewIPCDiscoverer(l.logger, l.driver.Root)
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for IPC sockets: %v", err)
}
firmwares, err := NewDriverFirmwareDiscoverer(logger, driver.Root, version)
firmwares, err := NewDriverFirmwareDiscoverer(l.logger, l.driver.Root, version)
if err != nil {
return nil, fmt.Errorf("failed to create discoverer for GSP firmware: %v", err)
}
binaries := NewDriverBinariesDiscoverer(logger, driver.Root)
binaries := NewDriverBinariesDiscoverer(l.logger, l.driver.Root)
d := discover.Merge(
libraries,
@ -81,35 +81,41 @@ func newDriverVersionDiscoverer(logger logger.Interface, driver *root.Driver, nv
}
// NewDriverLibraryDiscoverer creates a discoverer for the libraries associated with the specified driver version.
func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nvidiaCDIHookPath, ldconfigPath, version string) (discover.Discover, error) {
libraryPaths, err := getVersionLibs(logger, driver, version)
func (l *nvcdilib) NewDriverLibraryDiscoverer(version string) (discover.Discover, error) {
libraryPaths, err := getVersionLibs(l.logger, l.driver, version)
if err != nil {
return nil, fmt.Errorf("failed to get libraries for driver version: %v", err)
}
libraries := discover.NewMounts(
logger,
l.logger,
lookup.NewFileLocator(
lookup.WithLogger(logger),
lookup.WithRoot(driver.Root),
lookup.WithLogger(l.logger),
lookup.WithRoot(l.driver.Root),
),
driver.Root,
l.driver.Root,
libraryPaths,
)
// TODO: The following should use the version directly.
cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, nvidiaCDIHookPath, driver)
updateLDCache, _ := discover.NewLDCacheUpdateHook(logger, libraries, nvidiaCDIHookPath, ldconfigPath)
var discoverers []discover.Discover
d := discover.Merge(
discover.WithDriverDotSoSymlinks(
libraries,
version,
nvidiaCDIHookPath,
),
cudaCompatLibHookDiscoverer,
updateLDCache,
driverDotSoSymlinksDiscoverer := discover.WithDriverDotSoSymlinks(
libraries,
version,
l.nvidiaCDIHookPath,
)
discoverers = append(discoverers, driverDotSoSymlinksDiscoverer)
if l.HookIsSupported(HookEnableCudaCompat) {
// TODO: The following should use the version directly.
cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(l.logger, l.nvidiaCDIHookPath, l.driver)
discoverers = append(discoverers, cudaCompatLibHookDiscoverer)
}
updateLDCache, _ := discover.NewLDCacheUpdateHook(l.logger, libraries, l.nvidiaCDIHookPath, l.ldconfigPath)
discoverers = append(discoverers, updateLDCache)
d := discover.Merge(discoverers...)
return d, nil
}

30
pkg/nvcdi/hooks.go Normal file
View File

@ -0,0 +1,30 @@
/**
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package nvcdi
// disabledHooks allows individual hooks to be disabled.
type disabledHooks map[HookName]bool
// HookIsSupported checks whether a hook of the specified name is supported.
// Hooks must be explicitly disabled, meaning that if no disabled hooks are
// all hooks are supported.
func (l *nvcdilib) HookIsSupported(h HookName) bool {
if len(l.disabledHooks) == 0 {
return true
}
return !l.disabledHooks[h]
}

View File

@ -54,11 +54,15 @@ type nvcdilib struct {
infolib info.Interface
mergedDeviceOptions []transform.MergedDeviceOption
disabledHooks disabledHooks
}
// New creates a new nvcdi library
func New(opts ...Option) (Interface, error) {
l := &nvcdilib{}
l := &nvcdilib{
disabledHooks: make(disabledHooks),
}
for _, opt := range opts {
opt(l)
}
@ -140,6 +144,8 @@ func New(opts ...Option) (Interface, error) {
if l.vendor == "" {
l.vendor = "management.nvidia.com"
}
// Management containers in general do not require CUDA Forward compatibility.
l.disabledHooks[HookEnableCudaCompat] = true
lib = (*managementlib)(l)
case ModeNvml:
lib = (*nvmllib)(l)

View File

@ -80,7 +80,7 @@ func (m *managementlib) GetCommonEdits() (*cdi.ContainerEdits, error) {
return nil, fmt.Errorf("failed to get CUDA version: %v", err)
}
driver, err := newDriverVersionDiscoverer(m.logger, m.driver, m.nvidiaCDIHookPath, m.ldconfigPath, version)
driver, err := (*nvcdilib)(m).newDriverVersionDiscoverer(version)
if err != nil {
return nil, fmt.Errorf("failed to create driver library discoverer: %v", err)
}

View File

@ -155,3 +155,14 @@ func WithLibrarySearchPaths(paths []string) Option {
o.librarySearchPaths = paths
}
}
// WithDisabledHook allows specific hooks to the disabled.
// This option can be specified multiple times for each hook.
func WithDisabledHook(hook HookName) Option {
return func(o *nvcdilib) {
if o.disabledHooks == nil {
o.disabledHooks = make(map[HookName]bool)
}
o.disabledHooks[hook] = true
}
}

@ -1 +1 @@
Subproject commit 95d3e86522976061e856724867ebcaf75c4e9b60
Subproject commit f23e5e55ea27b3680aef363436d4bcf7659e0bfc