2021-10-11 14:31:02 +00:00
/**
# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package main
import (
2024-06-14 12:15:54 +00:00
"errors"
2021-10-11 14:31:02 +00:00
"fmt"
"io"
"os"
"path/filepath"
"strings"
toml "github.com/pelletier/go-toml"
log "github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
2023-11-01 11:40:51 +00:00
"tags.cncf.io/container-device-interface/pkg/cdi"
"tags.cncf.io/container-device-interface/pkg/parser"
2023-12-01 01:10:10 +00:00
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvdevices"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
transformroot "github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/transform/root"
2021-10-11 14:31:02 +00:00
)
// Locations read by the installer and names of the files it manages below the
// toolkit root.
const (
	// DefaultNvidiaDriverRoot is the default value of the --driver-root and
	// --driver-root-ctr-path flags.
	DefaultNvidiaDriverRoot = "/run/nvidia/driver"

	// Source paths of the components that are installed (or, for the config,
	// loaded) by this command.
	nvidiaContainerCliSource           = "/usr/bin/nvidia-container-cli"
	nvidiaContainerRuntimeHookSource   = "/usr/bin/nvidia-container-runtime-hook"
	nvidiaContainerToolkitConfigSource = "/etc/nvidia-container-runtime/config.toml"

	// configFilename is the name of the config file written to the toolkit
	// config directory.
	configFilename = "config.toml"
	// toolkitPidFilename is the name of the file that TryDelete skips when
	// clearing the toolkit root.
	toolkitPidFilename = "toolkit.pid"
)
2022-07-25 08:01:33 +00:00
type options struct {
2023-03-07 14:17:49 +00:00
DriverRoot string
2024-02-14 09:53:38 +00:00
DevRoot string
2023-03-07 14:17:49 +00:00
DriverRootCtrPath string
2024-02-14 09:53:38 +00:00
DevRootCtrPath string
2023-03-07 14:17:49 +00:00
2023-03-23 18:40:19 +00:00
ContainerRuntimeMode string
ContainerRuntimeDebug string
ContainerRuntimeLogLevel string
ContainerRuntimeModesCdiDefaultKind string
ContainerRuntimeModesCDIAnnotationPrefixes cli . StringSlice
2023-03-07 14:17:49 +00:00
2023-03-28 15:39:17 +00:00
ContainerRuntimeRuntimes cli . StringSlice
2023-03-09 07:49:50 +00:00
ContainerRuntimeHookSkipModeDetection bool
2023-03-07 14:17:49 +00:00
ContainerCLIDebug string
toolkitRoot string
2022-07-25 08:31:31 +00:00
2023-03-13 16:18:54 +00:00
cdiEnabled bool
2023-03-01 10:44:32 +00:00
cdiOutputDir string
cdiKind string
cdiVendor string
cdiClass string
2024-02-09 13:08:22 +00:00
createDeviceNodes cli . StringSlice
2022-07-25 08:31:31 +00:00
acceptNVIDIAVisibleDevicesWhenUnprivileged bool
acceptNVIDIAVisibleDevicesAsVolumeMounts bool
2023-03-28 14:20:27 +00:00
ignoreErrors bool
2024-09-18 20:20:56 +00:00
optInFeatures cli . StringSlice
2022-07-25 08:01:33 +00:00
}
2021-10-11 14:31:02 +00:00
func main ( ) {
2022-07-25 08:01:33 +00:00
opts := options { }
2021-10-11 14:31:02 +00:00
// Create the top-level CLI
c := cli . NewApp ( )
c . Name = "toolkit"
c . Usage = "Manage the NVIDIA container toolkit"
c . Version = "0.1.0"
// Create the 'install' subcommand
install := cli . Command { }
install . Name = "install"
install . Usage = "Install the components of the NVIDIA container toolkit"
install . ArgsUsage = "<toolkit_directory>"
2022-07-25 08:08:36 +00:00
install . Before = func ( c * cli . Context ) error {
2022-07-25 08:13:54 +00:00
return validateOptions ( c , & opts )
2022-07-25 08:08:36 +00:00
}
2022-07-25 08:01:33 +00:00
install . Action = func ( c * cli . Context ) error {
return Install ( c , & opts )
}
2021-10-11 14:31:02 +00:00
// Create the 'delete' command
delete := cli . Command { }
delete . Name = "delete"
delete . Usage = "Delete the NVIDIA container toolkit"
delete . ArgsUsage = "<toolkit_directory>"
2022-07-25 08:08:36 +00:00
delete . Before = func ( c * cli . Context ) error {
2022-07-25 08:13:54 +00:00
return validateOptions ( c , & opts )
2022-07-25 08:08:36 +00:00
}
delete . Action = func ( c * cli . Context ) error {
2024-06-14 12:15:54 +00:00
return TryDelete ( c , & opts )
2022-07-25 08:08:36 +00:00
}
2021-10-11 14:31:02 +00:00
// Register the subcommand with the top-level CLI
c . Commands = [ ] * cli . Command {
& install ,
& delete ,
}
flags := [ ] cli . Flag {
& cli . StringFlag {
2024-02-09 13:28:02 +00:00
Name : "driver-root" ,
Aliases : [ ] string { "nvidia-driver-root" } ,
2021-10-11 14:31:02 +00:00
Value : DefaultNvidiaDriverRoot ,
2022-07-25 08:01:33 +00:00
Destination : & opts . DriverRoot ,
2024-02-09 13:28:02 +00:00
EnvVars : [ ] string { "NVIDIA_DRIVER_ROOT" , "DRIVER_ROOT" } ,
2021-10-11 14:31:02 +00:00
} ,
2023-03-01 10:44:32 +00:00
& cli . StringFlag {
Name : "driver-root-ctr-path" ,
Value : DefaultNvidiaDriverRoot ,
Destination : & opts . DriverRootCtrPath ,
EnvVars : [ ] string { "DRIVER_ROOT_CTR_PATH" } ,
} ,
2024-02-14 09:53:38 +00:00
& cli . StringFlag {
Name : "dev-root" ,
Usage : "Specify the root where `/dev` is located. If this is not specified, the driver-root is assumed." ,
Destination : & opts . DevRoot ,
EnvVars : [ ] string { "NVIDIA_DEV_ROOT" , "DEV_ROOT" } ,
} ,
& cli . StringFlag {
Name : "dev-root-ctr-path" ,
Usage : "Specify the root where `/dev` is located in the container. If this is not specified, the driver-root-ctr-path is assumed." ,
Destination : & opts . DevRootCtrPath ,
EnvVars : [ ] string { "DEV_ROOT_CTR_PATH" } ,
} ,
2021-10-11 14:31:02 +00:00
& cli . StringFlag {
2023-03-23 18:40:19 +00:00
Name : "nvidia-container-runtime.debug" ,
Aliases : [ ] string { "nvidia-container-runtime-debug" } ,
2021-10-11 14:31:02 +00:00
Usage : "Specify the location of the debug log file for the NVIDIA Container Runtime" ,
2022-07-25 08:01:33 +00:00
Destination : & opts . ContainerRuntimeDebug ,
2021-10-11 14:31:02 +00:00
EnvVars : [ ] string { "NVIDIA_CONTAINER_RUNTIME_DEBUG" } ,
} ,
& cli . StringFlag {
2023-03-23 18:40:19 +00:00
Name : "nvidia-container-runtime.log-level" ,
Aliases : [ ] string { "nvidia-container-runtime-debug-log-level" } ,
2022-07-25 08:01:33 +00:00
Destination : & opts . ContainerRuntimeLogLevel ,
2021-10-11 14:31:02 +00:00
EnvVars : [ ] string { "NVIDIA_CONTAINER_RUNTIME_LOG_LEVEL" } ,
} ,
2023-02-17 16:04:46 +00:00
& cli . StringFlag {
2023-03-23 18:40:19 +00:00
Name : "nvidia-container-runtime.mode" ,
Aliases : [ ] string { "nvidia-container-runtime-mode" } ,
2023-02-17 16:04:46 +00:00
Destination : & opts . ContainerRuntimeMode ,
EnvVars : [ ] string { "NVIDIA_CONTAINER_RUNTIME_MODE" } ,
} ,
2023-03-07 14:17:49 +00:00
& cli . StringFlag {
2023-03-23 18:40:19 +00:00
Name : "nvidia-container-runtime.modes.cdi.default-kind" ,
2023-03-07 14:17:49 +00:00
Destination : & opts . ContainerRuntimeModesCdiDefaultKind ,
EnvVars : [ ] string { "NVIDIA_CONTAINER_RUNTIME_MODES_CDI_DEFAULT_KIND" } ,
} ,
2023-03-23 18:40:19 +00:00
& cli . StringSliceFlag {
Name : "nvidia-container-runtime.modes.cdi.annotation-prefixes" ,
Destination : & opts . ContainerRuntimeModesCDIAnnotationPrefixes ,
EnvVars : [ ] string { "NVIDIA_CONTAINER_RUNTIME_MODES_CDI_ANNOTATION_PREFIXES" } ,
} ,
2023-03-28 15:39:17 +00:00
& cli . StringSliceFlag {
Name : "nvidia-container-runtime.runtimes" ,
Destination : & opts . ContainerRuntimeRuntimes ,
EnvVars : [ ] string { "NVIDIA_CONTAINER_RUNTIME_RUNTIMES" } ,
} ,
2023-03-09 07:49:50 +00:00
& cli . BoolFlag {
Name : "nvidia-container-runtime-hook.skip-mode-detection" ,
Value : true ,
Destination : & opts . ContainerRuntimeHookSkipModeDetection ,
EnvVars : [ ] string { "NVIDIA_CONTAINER_RUNTIME_HOOK_SKIP_MODE_DETECTION" } ,
} ,
2021-10-11 14:31:02 +00:00
& cli . StringFlag {
2023-03-23 18:40:19 +00:00
Name : "nvidia-container-cli.debug" ,
Aliases : [ ] string { "nvidia-container-cli-debug" } ,
2021-10-11 14:31:02 +00:00
Usage : "Specify the location of the debug log file for the NVIDIA Container CLI" ,
2022-07-25 08:01:33 +00:00
Destination : & opts . ContainerCLIDebug ,
2021-10-11 14:31:02 +00:00
EnvVars : [ ] string { "NVIDIA_CONTAINER_CLI_DEBUG" } ,
} ,
2022-07-25 08:31:31 +00:00
& cli . BoolFlag {
Name : "accept-nvidia-visible-devices-envvar-when-unprivileged" ,
Usage : "Set the accept-nvidia-visible-devices-envvar-when-unprivileged config option" ,
2022-08-09 08:50:51 +00:00
Value : true ,
2022-07-25 08:31:31 +00:00
Destination : & opts . acceptNVIDIAVisibleDevicesWhenUnprivileged ,
EnvVars : [ ] string { "ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED" } ,
} ,
& cli . BoolFlag {
Name : "accept-nvidia-visible-devices-as-volume-mounts" ,
Usage : "Set the accept-nvidia-visible-devices-as-volume-mounts config option" ,
2022-08-09 08:27:51 +00:00
Destination : & opts . acceptNVIDIAVisibleDevicesAsVolumeMounts ,
2022-07-25 08:31:31 +00:00
EnvVars : [ ] string { "ACCEPT_NVIDIA_VISIBLE_DEVICES_AS_VOLUME_MOUNTS" } ,
} ,
2022-07-25 08:13:54 +00:00
& cli . StringFlag {
Name : "toolkit-root" ,
Usage : "The directory where the NVIDIA Container toolkit is to be installed" ,
Required : true ,
Destination : & opts . toolkitRoot ,
EnvVars : [ ] string { "TOOLKIT_ROOT" } ,
} ,
2023-03-13 16:18:54 +00:00
& cli . BoolFlag {
Name : "cdi-enabled" ,
Aliases : [ ] string { "enable-cdi" } ,
Usage : "enable the generation of a CDI specification" ,
Destination : & opts . cdiEnabled ,
EnvVars : [ ] string { "CDI_ENABLED" , "ENABLE_CDI" } ,
} ,
2023-03-01 10:44:32 +00:00
& cli . StringFlag {
Name : "cdi-output-dir" ,
Usage : "the directory where the CDI output files are to be written. If this is set to '', no CDI specification is generated." ,
Value : "/var/run/cdi" ,
Destination : & opts . cdiOutputDir ,
2023-03-09 10:25:05 +00:00
EnvVars : [ ] string { "CDI_OUTPUT_DIR" } ,
2023-03-01 10:44:32 +00:00
} ,
& cli . StringFlag {
Name : "cdi-kind" ,
Usage : "the vendor string to use for the generated CDI specification" ,
Value : "management.nvidia.com/gpu" ,
Destination : & opts . cdiKind ,
2023-03-09 10:25:05 +00:00
EnvVars : [ ] string { "CDI_KIND" } ,
2023-03-01 10:44:32 +00:00
} ,
2023-03-28 14:20:27 +00:00
& cli . BoolFlag {
Name : "ignore-errors" ,
Usage : "ignore errors when installing the NVIDIA Container toolkit. This is used for testing purposes only." ,
Hidden : true ,
Destination : & opts . ignoreErrors ,
} ,
2024-02-09 13:08:22 +00:00
& cli . StringSliceFlag {
Name : "create-device-nodes" ,
Usage : "(Only applicable with --cdi-enabled) specifies which device nodes should be created. If any one of the options is set to '' or 'none', no device nodes will be created." ,
Value : cli . NewStringSlice ( "control" ) ,
Destination : & opts . createDeviceNodes ,
EnvVars : [ ] string { "CREATE_DEVICE_NODES" } ,
} ,
2024-09-18 20:20:56 +00:00
& cli . StringSliceFlag {
Name : "opt-in-feature" ,
Hidden : true ,
Destination : & opts . optInFeatures ,
EnvVars : [ ] string { "NVIDIA_CONTAINER_TOOLKIT_OPT_IN_FEATURES" } ,
} ,
2021-10-11 14:31:02 +00:00
}
// Update the subcommand flags with the common subcommand flags
install . Flags = append ( [ ] cli . Flag { } , flags ... )
2022-07-25 08:13:54 +00:00
delete . Flags = append ( [ ] cli . Flag { } , flags ... )
2021-10-11 14:31:02 +00:00
// Run the top-level CLI
if err := c . Run ( os . Args ) ; err != nil {
log . Fatal ( fmt . Errorf ( "error: %v" , err ) )
}
}
2022-07-25 08:13:54 +00:00
// validateOptions checks whether the specified options are valid
func validateOptions ( c * cli . Context , opts * options ) error {
if opts . toolkitRoot == "" {
return fmt . Errorf ( "invalid --toolkit-root option: %v" , opts . toolkitRoot )
2021-10-11 14:31:02 +00:00
}
2023-08-25 14:15:30 +00:00
vendor , class := parser . ParseQualifier ( opts . cdiKind )
if err := parser . ValidateVendorName ( vendor ) ; err != nil {
2023-03-01 10:44:32 +00:00
return fmt . Errorf ( "invalid CDI vendor name: %v" , err )
}
2023-08-25 14:15:30 +00:00
if err := parser . ValidateClassName ( class ) ; err != nil {
2023-03-01 10:44:32 +00:00
return fmt . Errorf ( "invalid CDI class name: %v" , err )
}
opts . cdiVendor = vendor
opts . cdiClass = class
2024-02-09 13:08:22 +00:00
if opts . cdiEnabled && opts . cdiOutputDir == "" {
log . Warning ( "Skipping CDI spec generation (no output directory specified)" )
opts . cdiEnabled = false
}
isDisabled := false
for _ , mode := range opts . createDeviceNodes . Value ( ) {
if mode != "" && mode != "none" && mode != "control" {
return fmt . Errorf ( "invalid --create-device-nodes value: %v" , mode )
}
if mode == "" || mode == "none" {
isDisabled = true
break
}
}
if ! opts . cdiEnabled && ! isDisabled {
log . Info ( "disabling device node creation since --cdi-enabled=false" )
isDisabled = true
}
if isDisabled {
opts . createDeviceNodes = * cli . NewStringSlice ( )
}
2021-10-11 14:31:02 +00:00
return nil
}
2024-06-14 12:15:54 +00:00
// TryDelete attempts to remove the specified toolkit folder.
// A toolkit.pid file -- if present -- is skipped.
func TryDelete ( cli * cli . Context , opts * options ) error {
log . Infof ( "Attempting to delete NVIDIA container toolkit from '%v'" , opts . toolkitRoot )
contents , err := os . ReadDir ( opts . toolkitRoot )
if err != nil && errors . Is ( err , os . ErrNotExist ) {
return nil
} else if err != nil {
return fmt . Errorf ( "failed to read the contents of %v: %w" , opts . toolkitRoot , err )
}
for _ , content := range contents {
if content . Name ( ) == toolkitPidFilename {
continue
}
name := filepath . Join ( opts . toolkitRoot , content . Name ( ) )
if err := os . RemoveAll ( name ) ; err != nil {
log . Warningf ( "could not remove %v: %v" , name , err )
}
}
if err := os . RemoveAll ( opts . toolkitRoot ) ; err != nil {
log . Warningf ( "could not remove %v: %v" , opts . toolkitRoot , err )
2021-10-11 14:31:02 +00:00
}
return nil
}
// Install installs the components of the NVIDIA container toolkit.
// Any existing installation is removed.
2022-07-25 08:01:33 +00:00
func Install ( cli * cli . Context , opts * options ) error {
2022-07-25 08:12:47 +00:00
log . Infof ( "Installing NVIDIA container toolkit to '%v'" , opts . toolkitRoot )
2021-10-11 14:31:02 +00:00
log . Infof ( "Removing existing NVIDIA container toolkit installation" )
2022-07-25 08:12:47 +00:00
err := os . RemoveAll ( opts . toolkitRoot )
2023-03-28 14:20:27 +00:00
if err != nil && ! opts . ignoreErrors {
2021-10-11 14:31:02 +00:00
return fmt . Errorf ( "error removing toolkit directory: %v" , err )
2023-03-28 14:20:27 +00:00
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "error removing toolkit directory: %v" , err ) )
2021-10-11 14:31:02 +00:00
}
2022-07-25 08:12:47 +00:00
toolkitConfigDir := filepath . Join ( opts . toolkitRoot , ".config" , "nvidia-container-runtime" )
2021-10-11 14:31:02 +00:00
toolkitConfigPath := filepath . Join ( toolkitConfigDir , configFilename )
2022-07-25 08:12:47 +00:00
err = createDirectories ( opts . toolkitRoot , toolkitConfigDir )
2023-03-28 14:20:27 +00:00
if err != nil && ! opts . ignoreErrors {
2021-10-11 14:31:02 +00:00
return fmt . Errorf ( "could not create required directories: %v" , err )
2023-03-28 14:20:27 +00:00
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "could not create required directories: %v" , err ) )
2021-10-11 14:31:02 +00:00
}
2022-07-25 08:12:47 +00:00
err = installContainerLibraries ( opts . toolkitRoot )
2023-03-28 14:20:27 +00:00
if err != nil && ! opts . ignoreErrors {
2021-10-11 14:31:02 +00:00
return fmt . Errorf ( "error installing NVIDIA container library: %v" , err )
2023-03-28 14:20:27 +00:00
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "error installing NVIDIA container library: %v" , err ) )
2021-10-11 14:31:02 +00:00
}
2022-07-25 08:12:47 +00:00
err = installContainerRuntimes ( opts . toolkitRoot , opts . DriverRoot )
2023-03-28 14:20:27 +00:00
if err != nil && ! opts . ignoreErrors {
2021-10-11 14:31:02 +00:00
return fmt . Errorf ( "error installing NVIDIA container runtime: %v" , err )
2023-03-28 14:20:27 +00:00
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "error installing NVIDIA container runtime: %v" , err ) )
2021-10-11 14:31:02 +00:00
}
2022-07-25 08:12:47 +00:00
nvidiaContainerCliExecutable , err := installContainerCLI ( opts . toolkitRoot )
2023-03-28 14:20:27 +00:00
if err != nil && ! opts . ignoreErrors {
2021-10-11 14:31:02 +00:00
return fmt . Errorf ( "error installing NVIDIA container CLI: %v" , err )
2023-03-28 14:20:27 +00:00
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "error installing NVIDIA container CLI: %v" , err ) )
2021-10-11 14:31:02 +00:00
}
2023-05-24 08:34:01 +00:00
nvidiaContainerRuntimeHookPath , err := installRuntimeHook ( opts . toolkitRoot , toolkitConfigPath )
2023-03-28 14:20:27 +00:00
if err != nil && ! opts . ignoreErrors {
2021-10-11 14:31:02 +00:00
return fmt . Errorf ( "error installing NVIDIA container runtime hook: %v" , err )
2023-03-28 14:20:27 +00:00
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "error installing NVIDIA container runtime hook: %v" , err ) )
2021-10-11 14:31:02 +00:00
}
2023-03-09 15:39:12 +00:00
nvidiaCTKPath , err := installContainerToolkitCLI ( opts . toolkitRoot )
2023-03-28 14:20:27 +00:00
if err != nil && ! opts . ignoreErrors {
2023-03-09 15:39:12 +00:00
return fmt . Errorf ( "error installing NVIDIA Container Toolkit CLI: %v" , err )
2023-03-28 14:20:27 +00:00
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "error installing NVIDIA Container Toolkit CLI: %v" , err ) )
2021-10-11 14:31:02 +00:00
}
2024-04-24 08:47:45 +00:00
nvidiaCDIHookPath , err := installContainerCDIHookCLI ( opts . toolkitRoot )
if err != nil && ! opts . ignoreErrors {
return fmt . Errorf ( "error installing NVIDIA Container CDI Hook CLI: %v" , err )
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "error installing NVIDIA Container CDI Hook CLI: %v" , err ) )
}
2023-05-24 08:34:01 +00:00
err = installToolkitConfig ( cli , toolkitConfigPath , nvidiaContainerCliExecutable , nvidiaCTKPath , nvidiaContainerRuntimeHookPath , opts )
2023-03-28 14:20:27 +00:00
if err != nil && ! opts . ignoreErrors {
2023-03-09 15:39:12 +00:00
return fmt . Errorf ( "error installing NVIDIA container toolkit config: %v" , err )
2023-03-28 14:20:27 +00:00
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "error installing NVIDIA container toolkit config: %v" , err ) )
2023-03-01 12:51:11 +00:00
}
2024-02-09 13:08:22 +00:00
err = createDeviceNodes ( opts )
if err != nil && ! opts . ignoreErrors {
return fmt . Errorf ( "error creating device nodes: %v" , err )
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "error creating device nodes: %v" , err ) )
}
2024-04-24 08:47:45 +00:00
err = generateCDISpec ( opts , nvidiaCDIHookPath )
2024-02-09 13:17:27 +00:00
if err != nil && ! opts . ignoreErrors {
return fmt . Errorf ( "error generating CDI specification: %v" , err )
} else if err != nil {
log . Errorf ( "Ignoring error: %v" , fmt . Errorf ( "error generating CDI specification: %v" , err ) )
}
return nil
2021-10-11 14:31:02 +00:00
}
2022-01-21 13:23:23 +00:00
// installContainerLibraries locates and installs the libraries that are part of
// the nvidia-container-toolkit.
2021-10-11 14:31:02 +00:00
// A predefined set of library candidates are considered, with the first one
// resulting in success being installed to the toolkit folder. The install process
// resolves the symlink for the library and copies the versioned library itself.
2022-07-25 08:12:47 +00:00
func installContainerLibraries ( toolkitRoot string ) error {
log . Infof ( "Installing NVIDIA container library to '%v'" , toolkitRoot )
2021-10-11 14:31:02 +00:00
2022-01-21 13:23:23 +00:00
libs := [ ] string {
"libnvidia-container.so.1" ,
"libnvidia-container-go.so.1" ,
}
for _ , l := range libs {
2022-07-25 08:12:47 +00:00
err := installLibrary ( l , toolkitRoot )
2022-01-21 13:23:23 +00:00
if err != nil {
return fmt . Errorf ( "failed to install %s: %v" , l , err )
}
}
return nil
}
// installLibrary installs the specified library to the toolkit directory.
2022-07-25 08:12:47 +00:00
func installLibrary ( libName string , toolkitRoot string ) error {
2021-10-11 14:31:02 +00:00
libraryPath , err := findLibrary ( "" , libName )
if err != nil {
return fmt . Errorf ( "error locating NVIDIA container library: %v" , err )
}
2022-07-25 08:12:47 +00:00
installedLibPath , err := installFileToFolder ( toolkitRoot , libraryPath )
2021-10-11 14:31:02 +00:00
if err != nil {
2022-07-25 08:12:47 +00:00
return fmt . Errorf ( "error installing %v to %v: %v" , libraryPath , toolkitRoot , err )
2021-10-11 14:31:02 +00:00
}
log . Infof ( "Installed '%v' to '%v'" , libraryPath , installedLibPath )
if filepath . Base ( installedLibPath ) == libName {
return nil
}
2022-07-25 08:12:47 +00:00
err = installSymlink ( toolkitRoot , libName , installedLibPath )
2021-10-11 14:31:02 +00:00
if err != nil {
return fmt . Errorf ( "error installing symlink for NVIDIA container library: %v" , err )
}
return nil
}
// installToolkitConfig installs the config file for the NVIDIA container toolkit ensuring
// that the settings are updated to match the desired install and nvidia driver directories.
2023-05-24 08:34:01 +00:00
func installToolkitConfig ( c * cli . Context , toolkitConfigPath string , nvidiaContainerCliExecutablePath string , nvidiaCTKPath string , nvidaContainerRuntimeHookPath string , opts * options ) error {
2021-10-11 14:31:02 +00:00
log . Infof ( "Installing NVIDIA container toolkit config '%v'" , toolkitConfigPath )
2023-11-14 15:56:50 +00:00
cfg , err := loadConfig ( nvidiaContainerToolkitConfigSource )
2021-10-11 14:31:02 +00:00
if err != nil {
return fmt . Errorf ( "could not open source config file: %v" , err )
}
targetConfig , err := os . Create ( toolkitConfigPath )
if err != nil {
return fmt . Errorf ( "could not create target config file: %v" , err )
}
defer targetConfig . Close ( )
// Read the ldconfig path from the config as this may differ per platform
// On ubuntu-based systems this ends in `.real`
2023-11-14 15:56:50 +00:00
ldconfigPath := fmt . Sprintf ( "%s" , cfg . GetDefault ( "nvidia-container-cli.ldconfig" , "/sbin/ldconfig" ) )
2021-10-11 14:31:02 +00:00
// Use the driver run root as the root:
2023-11-14 15:56:50 +00:00
driverLdconfigPath := config . NormalizeLDConfigPath ( "@" + filepath . Join ( opts . DriverRoot , strings . TrimPrefix ( ldconfigPath , "@/" ) ) )
2021-10-11 14:31:02 +00:00
2023-03-23 18:51:00 +00:00
configValues := map [ string ] interface { } {
// Set the options in the root toml table
"accept-nvidia-visible-devices-envvar-when-unprivileged" : opts . acceptNVIDIAVisibleDevicesWhenUnprivileged ,
"accept-nvidia-visible-devices-as-volume-mounts" : opts . acceptNVIDIAVisibleDevicesAsVolumeMounts ,
// Set the nvidia-container-cli options
"nvidia-container-cli.root" : opts . DriverRoot ,
"nvidia-container-cli.path" : nvidiaContainerCliExecutablePath ,
"nvidia-container-cli.ldconfig" : driverLdconfigPath ,
// Set nvidia-ctk options
"nvidia-ctk.path" : nvidiaCTKPath ,
// Set the nvidia-container-runtime-hook options
2023-05-24 08:34:01 +00:00
"nvidia-container-runtime-hook.path" : nvidaContainerRuntimeHookPath ,
2023-03-23 18:51:00 +00:00
"nvidia-container-runtime-hook.skip-mode-detection" : opts . ContainerRuntimeHookSkipModeDetection ,
}
for key , value := range configValues {
2023-11-14 15:56:50 +00:00
cfg . Set ( key , value )
2023-03-23 18:51:00 +00:00
}
2021-10-11 14:31:02 +00:00
2023-03-23 18:40:19 +00:00
// Set the optional config options
optionalConfigValues := map [ string ] interface { } {
"nvidia-container-runtime.debug" : opts . ContainerRuntimeDebug ,
"nvidia-container-runtime.log-level" : opts . ContainerRuntimeLogLevel ,
"nvidia-container-runtime.mode" : opts . ContainerRuntimeMode ,
"nvidia-container-runtime.modes.cdi.annotation-prefixes" : opts . ContainerRuntimeModesCDIAnnotationPrefixes ,
"nvidia-container-runtime.modes.cdi.default-kind" : opts . ContainerRuntimeModesCdiDefaultKind ,
2023-03-28 15:39:17 +00:00
"nvidia-container-runtime.runtimes" : opts . ContainerRuntimeRuntimes ,
2023-03-23 18:40:19 +00:00
"nvidia-container-cli.debug" : opts . ContainerCLIDebug ,
2021-10-11 14:31:02 +00:00
}
2024-09-18 20:20:56 +00:00
for _ , feature := range opts . optInFeatures . Value ( ) {
optionalConfigValues [ "features." + feature ] = true
}
2023-03-23 18:40:19 +00:00
for key , value := range optionalConfigValues {
if ! c . IsSet ( key ) {
log . Infof ( "Skipping unset option: %v" , key )
continue
}
if value == nil {
log . Infof ( "Skipping option with nil value: %v" , key )
2021-10-11 14:31:02 +00:00
continue
}
2023-03-23 18:40:19 +00:00
switch v := value . ( type ) {
case string :
if v == "" {
continue
}
case cli . StringSlice :
if len ( v . Value ( ) ) == 0 {
continue
}
value = v . Value ( )
default :
2023-06-06 19:46:38 +00:00
log . Warningf ( "Unexpected type for option %v=%v: %T" , key , value , v )
2023-03-23 18:40:19 +00:00
}
2023-03-09 15:39:12 +00:00
2023-11-14 15:56:50 +00:00
cfg . Set ( key , value )
2021-10-11 14:31:02 +00:00
}
2023-03-09 07:49:50 +00:00
2023-11-14 15:56:50 +00:00
if _ , err := cfg . WriteTo ( targetConfig ) ; err != nil {
2021-10-11 14:31:02 +00:00
return fmt . Errorf ( "error writing config: %v" , err )
}
2022-08-09 09:44:19 +00:00
os . Stdout . WriteString ( "Using config:\n" )
2023-11-14 15:56:50 +00:00
if _ , err = cfg . WriteTo ( os . Stdout ) ; err != nil {
2023-08-25 14:48:11 +00:00
log . Warningf ( "Failed to output config to STDOUT: %v" , err )
}
2022-08-09 09:44:19 +00:00
2021-10-11 14:31:02 +00:00
return nil
}
2023-03-28 14:20:27 +00:00
// loadConfig loads the TOML file at the specified path. If the file does not
// exist, a tree created from a nil map is returned instead. Any other error
// from checking the path is returned unchanged.
func loadConfig(path string) (*toml.Tree, error) {
	switch _, statErr := os.Stat(path); {
	case statErr == nil:
		return toml.LoadFile(path)
	case os.IsNotExist(statErr):
		return toml.TreeFromMap(nil)
	default:
		return nil, statErr
	}
}
2023-03-01 12:51:11 +00:00
// installContainerToolkitCLI installs the nvidia-ctk CLI executable and wrapper.
func installContainerToolkitCLI ( toolkitDir string ) ( string , error ) {
e := executable {
source : "/usr/bin/nvidia-ctk" ,
target : executableTarget {
dotfileName : "nvidia-ctk.real" ,
wrapperName : "nvidia-ctk" ,
} ,
}
return e . install ( toolkitDir )
}
2024-04-24 08:47:45 +00:00
// installContainerCDIHookCLI installs the nvidia-cdi-hook CLI executable and wrapper.
func installContainerCDIHookCLI ( toolkitDir string ) ( string , error ) {
e := executable {
source : "/usr/bin/nvidia-cdi-hook" ,
target : executableTarget {
dotfileName : "nvidia-cdi-hook.real" ,
wrapperName : "nvidia-cdi-hook" ,
} ,
}
return e . install ( toolkitDir )
}
2021-10-11 14:31:02 +00:00
// installContainerCLI sets up the NVIDIA container CLI executable, copying the executable
// and implementing the required wrapper
2022-07-25 08:12:47 +00:00
func installContainerCLI ( toolkitRoot string ) ( string , error ) {
2021-10-11 14:31:02 +00:00
log . Infof ( "Installing NVIDIA container CLI from '%v'" , nvidiaContainerCliSource )
env := map [ string ] string {
2022-07-25 08:12:47 +00:00
"LD_LIBRARY_PATH" : toolkitRoot ,
2021-10-11 14:31:02 +00:00
}
e := executable {
source : nvidiaContainerCliSource ,
target : executableTarget {
dotfileName : "nvidia-container-cli.real" ,
wrapperName : "nvidia-container-cli" ,
} ,
env : env ,
}
2022-07-25 08:12:47 +00:00
installedPath , err := e . install ( toolkitRoot )
2021-10-11 14:31:02 +00:00
if err != nil {
return "" , fmt . Errorf ( "error installing NVIDIA container CLI: %v" , err )
}
return installedPath , nil
}
// installRuntimeHook sets up the NVIDIA runtime hook, copying the executable
// and implementing the required wrapper
2022-07-25 08:12:47 +00:00
func installRuntimeHook ( toolkitRoot string , configFilePath string ) ( string , error ) {
2021-10-11 14:31:02 +00:00
log . Infof ( "Installing NVIDIA container runtime hook from '%v'" , nvidiaContainerRuntimeHookSource )
argLines := [ ] string {
fmt . Sprintf ( "-config \"%s\"" , configFilePath ) ,
}
e := executable {
source : nvidiaContainerRuntimeHookSource ,
target : executableTarget {
2021-09-06 09:58:03 +00:00
dotfileName : "nvidia-container-runtime-hook.real" ,
wrapperName : "nvidia-container-runtime-hook" ,
2021-10-11 14:31:02 +00:00
} ,
argLines : argLines ,
}
2022-07-25 08:12:47 +00:00
installedPath , err := e . install ( toolkitRoot )
2021-10-11 14:31:02 +00:00
if err != nil {
return "" , fmt . Errorf ( "error installing NVIDIA container runtime hook: %v" , err )
}
2022-07-25 08:12:47 +00:00
err = installSymlink ( toolkitRoot , "nvidia-container-toolkit" , installedPath )
2021-10-11 14:31:02 +00:00
if err != nil {
return "" , fmt . Errorf ( "error installing symlink to NVIDIA container runtime hook: %v" , err )
}
return installedPath , nil
}
// installSymlink creates a symlink in the toolkitDirectory that points to the specified target.
// Note: The target is assumed to be local to the toolkit directory
2022-07-25 08:12:47 +00:00
func installSymlink ( toolkitRoot string , link string , target string ) error {
symlinkPath := filepath . Join ( toolkitRoot , link )
2021-10-11 14:31:02 +00:00
targetPath := filepath . Base ( target )
log . Infof ( "Creating symlink '%v' -> '%v'" , symlinkPath , targetPath )
err := os . Symlink ( targetPath , symlinkPath )
if err != nil {
return fmt . Errorf ( "error creating symlink '%v' => '%v': %v" , symlinkPath , targetPath , err )
}
return nil
}
// installFileToFolder copies the file src into destFolder, keeping its base
// name; the directory part of src is ignored.
// e.g. installFileToFolder("/output/path", "/some/path/file.txt")
// will result in a file "/output/path/file.txt" being generated.
// The path of the generated file is returned.
func installFileToFolder(destFolder string, src string) (string, error) {
	return installFileToFolderWithName(destFolder, filepath.Base(src), src)
}
// installFileToFolderWithName copies src to a file called name in destFolder
// (cp src destFolder/name) and returns the path of the copy.
func installFileToFolderWithName(destFolder string, name, src string) (string, error) {
	target := filepath.Join(destFolder, name)
	if err := installFile(target, src); err != nil {
		return "", fmt.Errorf("error copying '%v' to '%v': %v", src, target, err)
	}
	return target, nil
}
// installFile copies a file from src to dest and maintains
// file modes.
//
// dest is created (or truncated if it exists), filled with the contents of
// src, closed, and then given the mode of src. An error is returned if any of
// these steps fails, including the close of the destination, since a failed
// close can mean that written data never reached the file.
func installFile(dest string, src string) error {
	log.Infof("Installing '%v' to '%v'", src, dest)

	source, err := os.Open(src)
	if err != nil {
		return fmt.Errorf("error opening source: %v", err)
	}
	defer source.Close()

	destination, err := os.Create(dest)
	if err != nil {
		return fmt.Errorf("error creating destination: %v", err)
	}

	// Always close the destination, but report a copy error in preference to
	// a close error as it is the root cause.
	_, copyErr := io.Copy(destination, source)
	closeErr := destination.Close()
	if copyErr != nil {
		return fmt.Errorf("error copying file: %v", copyErr)
	}
	if closeErr != nil {
		return fmt.Errorf("error closing destination: %v", closeErr)
	}

	if err := applyModeFromSource(dest, src); err != nil {
		return fmt.Errorf("error setting destination file mode: %v", err)
	}
	return nil
}
// applyModeFromSource changes the mode of the file at dest to the mode
// reported for the file at src. An error is returned if src cannot be
// inspected or the mode of dest cannot be changed.
func applyModeFromSource(dest string, src string) error {
	info, statErr := os.Stat(src)
	if statErr != nil {
		return fmt.Errorf("error getting file info for '%v': %v", src, statErr)
	}

	if chmodErr := os.Chmod(dest, info.Mode()); chmodErr != nil {
		return fmt.Errorf("error setting mode for '%v': %v", dest, chmodErr)
	}
	return nil
}
// findLibrary searches a set of candidate libraries in the specified root for
// a given library name
func findLibrary ( root string , libName string ) ( string , error ) {
log . Infof ( "Finding library %v (root=%v)" , libName , root )
candidateDirs := [ ] string {
"/usr/lib64" ,
"/usr/lib/x86_64-linux-gnu" ,
2022-02-08 10:36:57 +00:00
"/usr/lib/aarch64-linux-gnu" ,
2021-10-11 14:31:02 +00:00
}
for _ , d := range candidateDirs {
l := filepath . Join ( root , d , libName )
log . Infof ( "Checking library candidate '%v'" , l )
libraryCandidate , err := resolveLink ( l )
if err != nil {
log . Infof ( "Skipping library candidate '%v': %v" , l , err )
continue
}
return libraryCandidate , nil
}
return "" , fmt . Errorf ( "error locating library '%v'" , libName )
}
// resolveLink evaluates any symlinks in l and returns the resulting path; for
// a regular file this is the file itself.
// This is equivalent to running `readlink -f ${l}`.
// A resolution that changes the path is logged.
func resolveLink(l string) (string, error) {
	target, err := filepath.EvalSymlinks(l)
	if err != nil {
		return "", fmt.Errorf("error resolving link '%v': %v", l, err)
	}

	if target != l {
		log.Infof("Resolved link: '%v' => '%v'", l, target)
	}
	return target, nil
}
// createDirectories creates each of the specified directories, including any
// missing parents, with mode 0755. It stops at the first failure and returns
// the error.
func createDirectories(dir ...string) error {
	for _, path := range dir {
		log.Infof("Creating directory '%v'", path)
		if err := os.MkdirAll(path, 0755); err != nil {
			return fmt.Errorf("error creating directory: %v", err)
		}
	}
	return nil
}
2023-03-01 10:44:32 +00:00
2024-02-09 13:08:22 +00:00
func createDeviceNodes ( opts * options ) error {
modes := opts . createDeviceNodes . Value ( )
if len ( modes ) == 0 {
2023-03-01 10:44:32 +00:00
return nil
}
2023-06-12 18:46:56 +00:00
devices , err := nvdevices . New (
2024-02-14 09:53:38 +00:00
nvdevices . WithDevRoot ( opts . DevRootCtrPath ) ,
2023-06-12 18:46:56 +00:00
)
2023-03-27 21:02:24 +00:00
if err != nil {
return fmt . Errorf ( "failed to create library: %v" , err )
}
2024-02-09 13:08:22 +00:00
for _ , mode := range modes {
2024-02-14 09:53:38 +00:00
log . Infof ( "Creating %v device nodes at %v" , mode , opts . DevRootCtrPath )
2024-02-09 13:08:22 +00:00
if mode != "control" {
log . Warningf ( "Unrecognised device mode: %v" , mode )
continue
}
if err := devices . CreateNVIDIAControlDevices ( ) ; err != nil {
return fmt . Errorf ( "failed to create control device nodes: %v" , err )
}
2023-03-27 21:02:24 +00:00
}
2024-02-09 13:08:22 +00:00
return nil
}
2023-03-27 21:02:24 +00:00
2024-04-24 08:47:45 +00:00
// generateCDISpec generates a CDI spec for use in management containers
func generateCDISpec ( opts * options , nvidiaCDIHookPath string ) error {
2024-02-09 13:08:22 +00:00
if ! opts . cdiEnabled {
return nil
}
2023-03-27 21:02:24 +00:00
log . Info ( "Generating CDI spec for management containers" )
2023-03-22 12:04:12 +00:00
cdilib , err := nvcdi . New (
2023-03-01 10:44:32 +00:00
nvcdi . WithMode ( nvcdi . ModeManagement ) ,
nvcdi . WithDriverRoot ( opts . DriverRootCtrPath ) ,
2024-02-14 09:53:38 +00:00
nvcdi . WithDevRoot ( opts . DevRootCtrPath ) ,
2024-04-24 08:47:45 +00:00
nvcdi . WithNVIDIACDIHookPath ( nvidiaCDIHookPath ) ,
2023-03-01 10:44:32 +00:00
nvcdi . WithVendor ( opts . cdiVendor ) ,
nvcdi . WithClass ( opts . cdiClass ) ,
)
2023-03-22 12:04:12 +00:00
if err != nil {
return fmt . Errorf ( "failed to create CDI library for management containers: %v" , err )
}
2023-03-01 10:44:32 +00:00
spec , err := cdilib . GetSpec ( )
if err != nil {
return fmt . Errorf ( "failed to genereate CDI spec for management containers: %v" , err )
}
2024-02-14 09:53:38 +00:00
transformer := transformroot . NewDriverTransformer (
transformroot . WithDriverRoot ( opts . DriverRootCtrPath ) ,
transformroot . WithTargetDriverRoot ( opts . DriverRoot ) ,
transformroot . WithDevRoot ( opts . DevRootCtrPath ) ,
transformroot . WithTargetDevRoot ( opts . DevRoot ) ,
)
if err := transformer . Transform ( spec . Raw ( ) ) ; err != nil {
2023-03-01 10:44:32 +00:00
return fmt . Errorf ( "failed to transform driver root in CDI spec: %v" , err )
}
name , err := cdi . GenerateNameForSpec ( spec . Raw ( ) )
if err != nil {
return fmt . Errorf ( "failed to generate CDI name for management containers: %v" , err )
}
err = spec . Save ( filepath . Join ( opts . cdiOutputDir , name ) )
if err != nil {
return fmt . Errorf ( "failed to save CDI spec for management containers: %v" , err )
}
return nil
}