Rework restart logic

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar 2023-06-10 12:41:53 +02:00
parent 19ca4338f2
commit 524802df2b
4 changed files with 57 additions and 94 deletions

View File

@ -18,6 +18,8 @@ package container
import (
"fmt"
"os"
"os/exec"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/engine"
"github.com/NVIDIA/nvidia-container-toolkit/tools/container/operator"
@ -25,6 +27,12 @@ import (
"github.com/urfave/cli/v2"
)
const (
restartModeNone = "none"
restartModeSignal = "signal"
restartModeSystemd = "systemd"
)
// Options defines the shared options for the CLIs to configure containers runtimes.
type Options struct {
Config string
@ -121,3 +129,41 @@ func (o Options) RevertConfig(cfg engine.Interface) error {
return nil
}
// Restart restarts the specified service
func (o Options) Restart(service string, withSignal func(string) error) error {
switch o.RestartMode {
case restartModeNone:
logrus.Warnf("Skipping restart of %v due to --restart-mode=%v", service, o.RestartMode)
return nil
case restartModeSignal:
return withSignal(o.Socket)
case restartModeSystemd:
return o.SystemdRestart(service)
}
return fmt.Errorf("invalid restart mode specified: %v", o.RestartMode)
}
// SystemdRestart restarts the specified service using systemd
func (o Options) SystemdRestart(service string) error {
var args []string
var msg string
if o.HostRootMount != "" {
msg = " on host"
args = append(args, "chroot", o.HostRootMount)
}
args = append(args, "systemctl", "restart", service)
logrus.Infof("Restarting %v%v using systemd: %v", service, msg, args)
cmd := exec.Command(args[0], args[1:]...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
err := cmd.Run()
if err != nil {
return fmt.Errorf("error restarting %v using systemd: %v", service, err)
}
return nil
}

View File

@ -20,7 +20,6 @@ import (
"fmt"
"net"
"os"
"os/exec"
"syscall"
"time"
@ -32,10 +31,6 @@ import (
)
const (
restartModeSignal = "signal"
restartModeSystemd = "systemd"
restartModeNone = "none"
nvidiaRuntimeName = "nvidia"
nvidiaRuntimeBinary = "nvidia-container-runtime"
nvidiaExperimentalRuntimeName = "nvidia-experimental"
@ -46,7 +41,7 @@ const (
defaultRuntimeClass = "nvidia"
defaultRuntmeType = "io.containerd.runc.v2"
defaultSetAsDefault = true
defaultRestartMode = restartModeSignal
defaultRestartMode = "signal"
defaultHostRootMount = "/host"
reloadBackoff = 5 * time.Second
@ -257,31 +252,16 @@ func Cleanup(c *cli.Context, o *options) error {
// RestartContainerd restarts containerd depending on the value of restartModeFlag
func RestartContainerd(o *options) error {
switch o.RestartMode {
case restartModeNone:
log.Warnf("Skipping sending signal to containerd due to --restart-mode=%v", o.RestartMode)
return nil
case restartModeSignal:
err := SignalContainerd(o)
if err != nil {
return fmt.Errorf("unable to signal containerd: %v", err)
}
case restartModeSystemd:
return RestartContainerdSystemd(o.HostRootMount)
default:
return fmt.Errorf("Invalid restart mode specified: %v", o.RestartMode)
}
return nil
return o.Restart("containerd", SignalContainerd)
}
// SignalContainerd sends a SIGHUP signal to the containerd daemon
func SignalContainerd(o *options) error {
func SignalContainerd(socket string) error {
log.Infof("Sending SIGHUP signal to containerd")
// Wrap the logic to perform the SIGHUP in a function so we can retry it on failure
retriable := func() error {
conn, err := net.Dial("unix", o.Socket)
conn, err := net.Dial("unix", socket)
if err != nil {
return fmt.Errorf("unable to dial: %v", err)
}
@ -355,24 +335,6 @@ func SignalContainerd(o *options) error {
return nil
}
// RestartContainerdSystemd restarts containerd using systemctl
func RestartContainerdSystemd(hostRootMount string) error {
log.Infof("Restarting containerd using systemd and host root mounted at %v", hostRootMount)
command := "chroot"
args := []string{hostRootMount, "systemctl", "restart", "containerd"}
cmd := exec.Command(command, args...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
err := cmd.Run()
if err != nil {
return fmt.Errorf("error restarting containerd using systemd: %v", err)
}
return nil
}
// containerAnnotationsFromCDIPrefixes returns the container annotations to set for the given CDI prefixes.
func (o *options) containerAnnotationsFromCDIPrefixes() []string {
var annotations []string

View File

@ -20,7 +20,6 @@ import (
"encoding/json"
"fmt"
"os"
"os/exec"
"path/filepath"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
@ -32,9 +31,6 @@ import (
)
const (
restartModeSystemd = "systemd"
restartModeNone = "none"
defaultConfigMode = "hook"
// Hook-based settings
@ -46,7 +42,7 @@ const (
defaultSocket = "/var/run/crio/crio.sock"
defaultRuntimeClass = "nvidia"
defaultSetAsDefault = true
defaultRestartMode = restartModeSystemd
defaultRestartMode = "systemd"
defaultHostRootMount = "/host"
)
@ -117,6 +113,7 @@ func main() {
Value: "",
Destination: &options.Socket,
EnvVars: []string{"CRIO_SOCKET", "RUNTIME_SOCKET"},
// Note: We hide this option since restarting cri-o via a socket is not supported.
Hidden: true,
},
&cli.StringFlag{
@ -179,7 +176,6 @@ func main() {
Destination: &options.configMode,
EnvVars: []string{"CRIO_CONFIG_MODE"},
},
// The flags below are only used by the 'setup' command.
}
// Update the subcommand flags with the common subcommand flags
@ -341,31 +337,5 @@ func generateOciHook(toolkitDir string) podmanHook {
// RestartCrio restarts crio depending on the value of restartModeFlag
func RestartCrio(o *options) error {
switch o.RestartMode {
case restartModeNone:
log.Warnf("Skipping restart of crio due to --restart-mode=%v", o.RestartMode)
return nil
case restartModeSystemd:
return RestartCrioSystemd(o.HostRootMount)
default:
return fmt.Errorf("invalid restart mode specified: %v", o.RestartMode)
}
}
// RestartCrioSystemd restarts cri-o using systemctl
func RestartCrioSystemd(hostRootMount string) error {
log.Infof("Restarting cri-o using systemd and host root mounted at %v", hostRootMount)
command := "chroot"
args := []string{hostRootMount, "systemctl", "restart", "crio"}
cmd := exec.Command(command, args...)
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
err := cmd.Run()
if err != nil {
return fmt.Errorf("error restarting crio using systemd: %v", err)
}
return nil
return o.Restart("crio", func(string) error { return fmt.Errorf("supporting crio via signal is unsupported") })
}

View File

@ -31,9 +31,6 @@ import (
)
const (
restartModeSignal = "signal"
restartModeNone = "none"
nvidiaRuntimeName = "nvidia"
nvidiaRuntimeBinary = "nvidia-container-runtime"
nvidiaExperimentalRuntimeName = "nvidia-experimental"
@ -44,7 +41,7 @@ const (
defaultSetAsDefault = true
// defaultRuntimeName specifies the NVIDIA runtime to be use as the default runtime if setting the default runtime is enabled
defaultRuntimeName = nvidiaRuntimeName
defaultRestartMode = restartModeSignal
defaultRestartMode = "signal"
defaultHostRootMount = "/host"
reloadBackoff = 5 * time.Second
@ -119,7 +116,7 @@ func main() {
},
&cli.StringFlag{
Name: "restart-mode",
Usage: "Specify how docker should be restarted; If 'none' is selected it will not be restarted [signal | none]",
Usage: "Specify how docker should be restarted; If 'none' is selected it will not be restarted [signal | systemd | none ]",
Value: defaultRestartMode,
Destination: &options.RestartMode,
EnvVars: []string{"DOCKER_RESTART_MODE", "RUNTIME_RESTART_MODE"},
@ -224,19 +221,7 @@ func Cleanup(c *cli.Context, o *options) error {
// RestartDocker restarts docker depending on the value of restartModeFlag
func RestartDocker(o *options) error {
switch o.RestartMode {
case restartModeNone:
log.Warnf("Skipping sending signal to docker due to --restart-mode=%v", o.RestartMode)
case restartModeSignal:
err := SignalDocker(o.Socket)
if err != nil {
return fmt.Errorf("unable to signal docker: %v", err)
}
default:
return fmt.Errorf("invalid restart mode specified: %v", o.RestartMode)
}
return nil
return o.Restart("docker", SignalDocker)
}
// SignalDocker sends a SIGHUP signal to docker daemon