2021-10-11 14:31:02 +00:00
/**
# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package main
import (
"fmt"
"net"
"os"
"syscall"
"time"
2023-06-09 15:28:42 +00:00
"github.com/NVIDIA/nvidia-container-toolkit/internal/info"
2023-04-24 11:58:03 +00:00
"github.com/NVIDIA/nvidia-container-toolkit/pkg/config/engine/docker"
2023-06-09 15:17:54 +00:00
"github.com/NVIDIA/nvidia-container-toolkit/tools/container"
2021-10-11 14:31:02 +00:00
log "github.com/sirupsen/logrus"
cli "github.com/urfave/cli/v2"
)
// Default values for the command line flags registered in main.
const (
	// defaultConfig is the default value of the "config" flag.
	defaultConfig = "/etc/docker/daemon.json"
	// defaultSocket is the default value of the "socket" flag.
	defaultSocket = "/var/run/docker.sock"
	// defaultRestartMode is the default value of the "restart-mode" flag.
	defaultRestartMode = "signal"
	// defaultHostRootMount is the default value of the (hidden) "host-root" flag.
	defaultHostRootMount = "/host"
	// defaultRuntimeName specifies the NVIDIA runtime to be used as the default
	// runtime if setting the default runtime is enabled.
	defaultRuntimeName = "nvidia"
	// defaultSetAsDefault is the default value of the (hidden) "set-as-default" flag.
	defaultSetAsDefault = true
)

// Settings used by SignalDocker when signalling the docker daemon.
const (
	// maxReloadAttempts bounds how many times signalling is attempted.
	maxReloadAttempts = 6
	// reloadBackoff is the pause between two consecutive attempts.
	reloadBackoff = 5 * time.Second
	// socketMessageToGetPID is the request written to the docker socket; the
	// reply's ancillary data is used to obtain the daemon's PID.
	socketMessageToGetPID = "GET /info HTTP/1.0\r\n\r\n"
)
// options stores the configuration from the command line or environment variables
type options struct {
2023-06-09 15:17:54 +00:00
container . Options
2021-10-11 14:31:02 +00:00
}
func main ( ) {
options := options { }
// Create the top-level CLI
c := cli . NewApp ( )
c . Name = "docker"
c . Usage = "Update docker config with the nvidia runtime"
2023-06-09 15:28:42 +00:00
c . Version = info . GetVersionString ( )
2021-10-11 14:31:02 +00:00
// Create the 'setup' subcommand
setup := cli . Command { }
setup . Name = "setup"
setup . Usage = "Trigger docker config to be updated"
setup . ArgsUsage = "<runtime_dirname>"
setup . Action = func ( c * cli . Context ) error {
return Setup ( c , & options )
}
2023-06-09 14:50:51 +00:00
setup . Before = func ( c * cli . Context ) error {
2023-06-09 15:17:54 +00:00
return container . ParseArgs ( c , & options . Options )
2023-06-09 14:50:51 +00:00
}
2021-10-11 14:31:02 +00:00
// Create the 'cleanup' subcommand
cleanup := cli . Command { }
cleanup . Name = "cleanup"
cleanup . Usage = "Trigger any updates made to docker config to be undone"
cleanup . ArgsUsage = "<runtime_dirname>"
cleanup . Action = func ( c * cli . Context ) error {
return Cleanup ( c , & options )
}
2023-06-09 14:50:51 +00:00
cleanup . Before = func ( c * cli . Context ) error {
2023-06-09 15:17:54 +00:00
return container . ParseArgs ( c , & options . Options )
2023-06-09 14:50:51 +00:00
}
2021-10-11 14:31:02 +00:00
// Register the subcommands with the top-level CLI
c . Commands = [ ] * cli . Command {
& setup ,
& cleanup ,
}
// Setup common flags across both subcommands. All subcommands get the same
// set of flags even if they don't use some of them. This is so that we
// only require the user to specify one set of flags for both 'startup'
// and 'cleanup' to simplify things.
commonFlags := [ ] cli . Flag {
& cli . StringFlag {
Name : "config" ,
Usage : "Path to docker config file" ,
Value : defaultConfig ,
2023-06-09 15:17:54 +00:00
Destination : & options . Config ,
2023-06-27 12:13:00 +00:00
EnvVars : [ ] string { "RUNTIME_CONFIG" , "DOCKER_CONFIG" } ,
2021-10-11 14:31:02 +00:00
} ,
& cli . StringFlag {
Name : "socket" ,
Usage : "Path to the docker socket file" ,
Value : defaultSocket ,
2023-06-09 15:17:54 +00:00
Destination : & options . Socket ,
2023-06-27 12:13:00 +00:00
EnvVars : [ ] string { "RUNTIME_SOCKET" , "DOCKER_SOCKET" } ,
2023-03-24 09:54:58 +00:00
} ,
& cli . StringFlag {
Name : "restart-mode" ,
2023-06-10 10:41:53 +00:00
Usage : "Specify how docker should be restarted; If 'none' is selected it will not be restarted [signal | systemd | none ]" ,
2023-03-24 09:54:58 +00:00
Value : defaultRestartMode ,
2023-06-09 15:17:54 +00:00
Destination : & options . RestartMode ,
2023-06-27 12:13:00 +00:00
EnvVars : [ ] string { "RUNTIME_RESTART_MODE" , "DOCKER_RESTART_MODE" } ,
2023-03-24 09:54:58 +00:00
} ,
& cli . StringFlag {
Name : "host-root" ,
Usage : "Specify the path to the host root to be used when restarting docker using systemd" ,
Value : defaultHostRootMount ,
2023-06-09 15:17:54 +00:00
Destination : & options . HostRootMount ,
2023-03-24 09:54:58 +00:00
EnvVars : [ ] string { "HOST_ROOT_MOUNT" } ,
// Restart using systemd is currently not supported.
// We hide this option for the time being.
Hidden : true ,
2021-10-11 14:31:02 +00:00
} ,
& cli . StringFlag {
Name : "runtime-name" ,
2023-06-27 12:13:00 +00:00
Aliases : [ ] string { "nvidia-runtime-name" , "runtime-class" } ,
2021-10-11 14:31:02 +00:00
Usage : "Specify the name of the `nvidia` runtime. If set-as-default is selected, the runtime is used as the default runtime." ,
Value : defaultRuntimeName ,
2023-06-09 15:17:54 +00:00
Destination : & options . RuntimeName ,
2023-06-27 12:13:00 +00:00
EnvVars : [ ] string { "NVIDIA_RUNTIME_NAME" , "DOCKER_RUNTIME_NAME" } ,
2021-10-11 14:31:02 +00:00
} ,
2023-06-09 14:50:51 +00:00
& cli . StringFlag {
Name : "nvidia-runtime-dir" ,
Aliases : [ ] string { "runtime-dir" } ,
Usage : "The path where the nvidia-container-runtime binaries are located. If this is not specified, the first argument will be used instead" ,
2023-06-09 15:17:54 +00:00
Destination : & options . RuntimeDir ,
2023-06-09 14:50:51 +00:00
EnvVars : [ ] string { "NVIDIA_RUNTIME_DIR" } ,
} ,
2021-10-11 14:31:02 +00:00
& cli . BoolFlag {
Name : "set-as-default" ,
Usage : "Set the `nvidia` runtime as the default runtime. If --runtime-name is specified as `nvidia-experimental` the experimental runtime is set as the default runtime instead" ,
Value : defaultSetAsDefault ,
2023-06-09 15:17:54 +00:00
Destination : & options . SetAsDefault ,
2023-06-27 12:13:00 +00:00
EnvVars : [ ] string { "NVIDIA_RUNTIME_SET_AS_DEFAULT" , "DOCKER_SET_AS_DEFAULT" } ,
2021-10-11 14:31:02 +00:00
Hidden : true ,
} ,
}
// Update the subcommand flags with the common subcommand flags
setup . Flags = append ( [ ] cli . Flag { } , commonFlags ... )
cleanup . Flags = append ( [ ] cli . Flag { } , commonFlags ... )
// Run the top-level CLI
if err := c . Run ( os . Args ) ; err != nil {
log . Errorf ( "Error running docker configuration: %v" , err )
os . Exit ( 1 )
}
}
// Setup updates docker configuration to include the nvidia runtime and reloads it
func Setup ( c * cli . Context , o * options ) error {
log . Infof ( "Starting 'setup' for %v" , c . App . Name )
2023-02-23 13:43:15 +00:00
cfg , err := docker . New (
2023-06-09 15:17:54 +00:00
docker . WithPath ( o . Config ) ,
2023-02-23 13:43:15 +00:00
)
2021-10-11 14:31:02 +00:00
if err != nil {
return fmt . Errorf ( "unable to load config: %v" , err )
}
2023-06-09 15:17:54 +00:00
err = o . Configure ( cfg )
2021-10-11 14:31:02 +00:00
if err != nil {
2023-06-09 15:17:54 +00:00
return fmt . Errorf ( "unable to configure docker: %v" , err )
2021-10-11 14:31:02 +00:00
}
2022-02-08 13:09:07 +00:00
err = RestartDocker ( o )
2021-10-11 14:31:02 +00:00
if err != nil {
2022-02-08 13:09:07 +00:00
return fmt . Errorf ( "unable to restart docker: %v" , err )
2021-10-11 14:31:02 +00:00
}
log . Infof ( "Completed 'setup' for %v" , c . App . Name )
return nil
}
// Cleanup reverts docker configuration to remove the nvidia runtime and reloads it
func Cleanup ( c * cli . Context , o * options ) error {
log . Infof ( "Starting 'cleanup' for %v" , c . App . Name )
2023-02-23 13:43:15 +00:00
cfg , err := docker . New (
2023-06-09 15:17:54 +00:00
docker . WithPath ( o . Config ) ,
2023-02-23 13:43:15 +00:00
)
2021-10-11 14:31:02 +00:00
if err != nil {
return fmt . Errorf ( "unable to load config: %v" , err )
}
2023-06-09 15:17:54 +00:00
err = o . Unconfigure ( cfg )
2021-10-11 14:31:02 +00:00
if err != nil {
2023-06-09 15:17:54 +00:00
return fmt . Errorf ( "unable to unconfigure docker: %v" , err )
2023-02-23 13:43:15 +00:00
}
2021-10-11 14:31:02 +00:00
2022-02-08 13:09:07 +00:00
err = RestartDocker ( o )
2021-10-11 14:31:02 +00:00
if err != nil {
return fmt . Errorf ( "unable to signal docker: %v" , err )
}
log . Infof ( "Completed 'cleanup' for %v" , c . App . Name )
return nil
}
2022-02-08 13:09:07 +00:00
// RestartDocker restarts docker depending on the value of restartModeFlag
func RestartDocker ( o * options ) error {
2023-06-10 10:41:53 +00:00
return o . Restart ( "docker" , SignalDocker )
2022-02-08 13:09:07 +00:00
}
2021-10-11 14:31:02 +00:00
// SignalDocker sends a SIGHUP signal to docker daemon
func SignalDocker ( socket string ) error {
log . Infof ( "Sending SIGHUP signal to docker" )
// Wrap the logic to perform the SIGHUP in a function so we can retry it on failure
retriable := func ( ) error {
conn , err := net . Dial ( "unix" , socket )
if err != nil {
return fmt . Errorf ( "unable to dial: %v" , err )
}
defer conn . Close ( )
sconn , err := conn . ( * net . UnixConn ) . SyscallConn ( )
if err != nil {
return fmt . Errorf ( "unable to get syscall connection: %v" , err )
}
err1 := sconn . Control ( func ( fd uintptr ) {
err = syscall . SetsockoptInt ( int ( fd ) , syscall . SOL_SOCKET , syscall . SO_PASSCRED , 1 )
} )
if err1 != nil {
return fmt . Errorf ( "unable to issue call on socket fd: %v" , err1 )
}
if err != nil {
return fmt . Errorf ( "unable to SetsockoptInt on socket fd: %v" , err )
}
_ , _ , err = conn . ( * net . UnixConn ) . WriteMsgUnix ( [ ] byte ( socketMessageToGetPID ) , nil , nil )
if err != nil {
return fmt . Errorf ( "unable to WriteMsgUnix on socket fd: %v" , err )
}
oob := make ( [ ] byte , 1024 )
_ , oobn , _ , _ , err := conn . ( * net . UnixConn ) . ReadMsgUnix ( nil , oob )
if err != nil {
return fmt . Errorf ( "unable to ReadMsgUnix on socket fd: %v" , err )
}
oob = oob [ : oobn ]
scm , err := syscall . ParseSocketControlMessage ( oob )
if err != nil {
return fmt . Errorf ( "unable to ParseSocketControlMessage from message received on socket fd: %v" , err )
}
ucred , err := syscall . ParseUnixCredentials ( & scm [ 0 ] )
if err != nil {
return fmt . Errorf ( "unable to ParseUnixCredentials from message received on socket fd: %v" , err )
}
err = syscall . Kill ( int ( ucred . Pid ) , syscall . SIGHUP )
if err != nil {
return fmt . Errorf ( "unable to send SIGHUP to 'docker' process: %v" , err )
}
return nil
}
// Try to send a SIGHUP up to maxReloadAttempts times
var err error
for i := 0 ; i < maxReloadAttempts ; i ++ {
err = retriable ( )
if err == nil {
break
}
if i == maxReloadAttempts - 1 {
break
}
2023-06-06 19:46:38 +00:00
log . Warningf ( "Error signaling docker, attempt %v/%v: %v" , i + 1 , maxReloadAttempts , err )
2021-10-11 14:31:02 +00:00
time . Sleep ( reloadBackoff )
}
if err != nil {
2023-06-06 19:46:38 +00:00
log . Warningf ( "Max retries reached %v/%v, aborting" , maxReloadAttempts , maxReloadAttempts )
2021-10-11 14:31:02 +00:00
return err
}
log . Infof ( "Successfully signaled docker" )
return nil
}