Add feature gate to require NVIDIA kernel modules

This change adds an opt-in feature to the NVIDIA Container Runtime that
only uses the NVIDIA runtime if the NVIDIA kernel modules are loaded.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2024-10-30 15:05:28 +01:00
parent efb18a72ad
commit 7263d26817
No known key found for this signature in database
2 changed files with 22 additions and 0 deletions

View File

@ -21,6 +21,9 @@ type features struct {
// DisableImexChannelCreation ensures that the implicit creation of // DisableImexChannelCreation ensures that the implicit creation of
// requested IMEX channels is skipped when invoking the nvidia-container-cli. // requested IMEX channels is skipped when invoking the nvidia-container-cli.
DisableImexChannelCreation *feature `toml:"disable-imex-channel-creation,omitempty"` DisableImexChannelCreation *feature `toml:"disable-imex-channel-creation,omitempty"`
// RequireNvidiaKernelModules indicates that the NVIDIA kernel module must be
// loaded for the NVIDIA Container Runtime to perform any OCI spec modifications.
RequireNvidiaKernelModules *feature `toml:"require-nvidia-kernel-module,omitempty"`
} }
//nolint:unused //nolint:unused

View File

@ -18,6 +18,7 @@ package runtime
import ( import (
"fmt" "fmt"
"os"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
@ -41,6 +42,11 @@ func newNVIDIAContainerRuntime(logger logger.Interface, cfg *config.Config, argv
return lowLevelRuntime, nil return lowLevelRuntime, nil
} }
if cfg.Features.RequireNvidiaKernelModules.IsEnabled() && !isNvidiaModuleLoaded() {
logger.Tracef("NVIDIA driver modules are not yet loaded; skipping modifer")
return lowLevelRuntime, nil
}
ociSpec, err := oci.NewSpec(logger, argv) ociSpec, err := oci.NewSpec(logger, argv)
if err != nil { if err != nil {
return nil, fmt.Errorf("error constructing OCI specification: %v", err) return nil, fmt.Errorf("error constructing OCI specification: %v", err)
@ -62,6 +68,19 @@ func newNVIDIAContainerRuntime(logger logger.Interface, cfg *config.Config, argv
return r, nil return r, nil
} }
// isNvidiaModuleLoaded reports whether /proc/driver/nvidia/version can be
// stat'ed. A successful stat is treated as the signal that the NVIDIA kernel
// module is loaded; any stat error (including a missing file) yields false.
func isNvidiaModuleLoaded() bool {
	// TODO: The earlier shell-based implementation inspected /proc/modules
	// instead of the driver's proc entry:
	//   cat /proc/modules | grep -e \"^nvidia \" >/dev/null 2>&1
	//   if [ "${?}" != "0" ]; then
	//     echo "nvidia driver modules are not yet loaded, invoking runc directly"
	//     exec runc "$@"
	//   fi
	if _, statErr := os.Stat("/proc/driver/nvidia/version"); statErr != nil {
		return false
	}
	return true
}
// newSpecModifier is a factory method that constructs an OCI spec modifier based on the provided config. // newSpecModifier is a factory method that constructs an OCI spec modifier based on the provided config.
func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec, driver *root.Driver) (oci.SpecModifier, error) { func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec, driver *root.Driver) (oci.SpecModifier, error) {
rawSpec, err := ociSpec.Load() rawSpec, err := ociSpec.Load()