Merge pull request #1103 from elezar/reenable-nvsandboxutils
Some checks failed
CI Pipeline / code-scanning (push) Has been cancelled
CI Pipeline / variables (push) Has been cancelled
CI Pipeline / golang (push) Has been cancelled
CI Pipeline / image (push) Has been cancelled
CI Pipeline / e2e-test (push) Has been cancelled

Reenable nvsandboxutils for driver discovery
This commit is contained in:
Evan Lezar 2025-05-23 11:38:46 +02:00 committed by GitHub
commit b934c68bef
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 48 additions and 18 deletions

View File

@ -45,3 +45,13 @@ const (
// This was added with v1.17.5 of the NVIDIA Container Toolkit.
HookEnableCudaCompat = HookName("enable-cuda-compat")
)
// FeatureFlag names an individual feature of the CDI API that callers may
// toggle. Every feature is off unless it is explicitly enabled.
type FeatureFlag string

const (
	// FeatureDisableNvsandboxUtils turns off the use of nvsandboxutils when
	// devices are queried.
	FeatureDisableNvsandboxUtils FeatureFlag = "disable-nvsandbox-utils"
)

View File

@ -56,6 +56,8 @@ type nvcdilib struct {
mergedDeviceOptions []transform.MergedDeviceOption
featureFlags map[FeatureFlag]bool
disabledHooks disabledHooks
hookCreator discover.HookCreator
}
@ -64,6 +66,7 @@ type nvcdilib struct {
func New(opts ...Option) (Interface, error) {
l := &nvcdilib{
disabledHooks: make(disabledHooks),
featureFlags: make(map[FeatureFlag]bool),
}
for _, opt := range opts {
opt(l)
@ -108,24 +111,7 @@ func New(opts ...Option) (Interface, error) {
}
l.nvmllib = nvml.New(nvmlOpts...)
}
// TODO: Repeated calls to nvsandboxutils.Init and Shutdown are causing
// segmentation violations. Here we disabled nvsandbox utils unless explicitly
// specified.
// This will be reenabled as soon as we have more visibility into why this is
// happening and a mechanism to detect and disable this if required.
// if l.nvsandboxutilslib == nil {
// var nvsandboxutilsOpts []nvsandboxutils.LibraryOption
// // Set the library path for libnvidia-sandboxutils
// candidates, err := l.driver.Libraries().Locate("libnvidia-sandboxutils.so.1")
// if err != nil {
// l.logger.Warningf("Ignoring error in locating libnvidia-sandboxutils.so.1: %v", err)
// } else {
// libNvidiaSandboxutilsPath := candidates[0]
// l.logger.Infof("Using %v", libNvidiaSandboxutilsPath)
// nvsandboxutilsOpts = append(nvsandboxutilsOpts, nvsandboxutils.WithLibraryPath(libNvidiaSandboxutilsPath))
// }
// l.nvsandboxutilslib = nvsandboxutils.New(nvsandboxutilsOpts...)
// }
l.nvsandboxutilslib = l.getNvsandboxUtilsLib()
if l.devicelib == nil {
l.devicelib = device.New(l.nvmllib)
}
@ -231,3 +217,26 @@ func (l *nvcdilib) getCudaVersionNvsandboxutils() (string, error) {
}
return version, nil
}
// getNvsandboxUtilsLib returns the nvsandboxutilslib to use for CDI spec
// generation.
//
// It returns nil when the FeatureDisableNvsandboxUtils feature flag is set,
// and returns l.nvsandboxutilslib unchanged when one has already been
// configured. Otherwise a new library is constructed: if
// libnvidia-sandboxutils.so.1 can be located through the driver, the first
// candidate is passed to the library as its library path. A failed or empty
// lookup is logged and the library is constructed without an explicit path.
func (l *nvcdilib) getNvsandboxUtilsLib() nvsandboxutils.Interface {
	if l.featureFlags[FeatureDisableNvsandboxUtils] {
		return nil
	}
	if l.nvsandboxutilslib != nil {
		return l.nvsandboxutilslib
	}

	var nvsandboxutilsOpts []nvsandboxutils.LibraryOption
	// Set the library path for libnvidia-sandboxutils
	candidates, err := l.driver.Libraries().Locate("libnvidia-sandboxutils.so.1")
	switch {
	case err != nil:
		l.logger.Warningf("Ignoring error in locating libnvidia-sandboxutils.so.1: %v", err)
	case len(candidates) == 0:
		// A locator may report success without any results; indexing
		// candidates[0] in that case would panic.
		l.logger.Warningf("Ignoring empty result in locating libnvidia-sandboxutils.so.1")
	default:
		libNvidiaSandboxutilsPath := candidates[0]
		l.logger.Infof("Using %v", libNvidiaSandboxutilsPath)
		nvsandboxutilsOpts = append(nvsandboxutilsOpts, nvsandboxutils.WithLibraryPath(libNvidiaSandboxutilsPath))
	}
	return nvsandboxutils.New(nvsandboxutilsOpts...)
}

View File

@ -166,3 +166,14 @@ func WithDisabledHook(hook HookName) Option {
o.disabledHooks[hook] = true
}
}
// WithFeatureFlag returns an Option that switches the given feature flag on.
// Pass the option once per flag to enable several features.
func WithFeatureFlag(featureFlag FeatureFlag) Option {
	enable := func(lib *nvcdilib) {
		// The map may not have been initialised yet; writing to a nil map
		// would panic.
		if lib.featureFlags == nil {
			lib.featureFlags = map[FeatureFlag]bool{}
		}
		lib.featureFlags[featureFlag] = true
	}
	return enable
}