From 9a697e340ba1fdfa9712e827f1c25528a2e9915b Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Fri, 24 Jun 2022 07:28:10 +0200 Subject: [PATCH] Add support for updating crio configs This adds support for updating crio configs (instead of installing hooks) and adds crio support to the nvidia-ctk runtime configure command. Signed-off-by: Evan Lezar --- cmd/nvidia-ctk/runtime/configure/configure.go | 48 ++++- internal/config/crio/crio.go | 125 +++++++++++ tools/container/crio/crio.go | 198 +++++++++++++++++- 3 files changed, 367 insertions(+), 4 deletions(-) create mode 100644 internal/config/crio/crio.go diff --git a/cmd/nvidia-ctk/runtime/configure/configure.go b/cmd/nvidia-ctk/runtime/configure/configure.go index 6788b751..13689986 100644 --- a/cmd/nvidia-ctk/runtime/configure/configure.go +++ b/cmd/nvidia-ctk/runtime/configure/configure.go @@ -22,7 +22,9 @@ import ( "os" "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-ctk/runtime/nvidia" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config/crio" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/docker" + "github.com/pelletier/go-toml" "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" ) @@ -31,6 +33,7 @@ const ( defaultRuntime = "docker" defaultDockerConfigFilePath = "/etc/docker/daemon.json" + defaultCrioConfigFilePath = "/etc/crio/crio.conf" ) type command struct { @@ -75,7 +78,7 @@ func (m command) build() *cli.Command { }, &cli.StringFlag{ Name: "runtime", - Usage: "the target runtime engine. One of [docker]", + Usage: "the target runtime engine. One of [crio, docker]", Value: defaultRuntime, Destination: &config.runtime, }, @@ -108,6 +111,8 @@ func (m command) build() *cli.Command { func (m command) configureWrapper(c *cli.Context, config *config) error { switch config.runtime { + case "crio": + return m.configureCrio(c, config) case "docker": return m.configureDocker(c, config) } @@ -152,3 +157,44 @@ func (m command) configureDocker(c *cli.Context, config *config) error { return nil } + +// configureCrio updates the crio config to enable the NVIDIA Container Runtime +func (m command) configureCrio(c *cli.Context, config *config) error { + configFilePath := config.configFilePath + if configFilePath == "" { + configFilePath = defaultCrioConfigFilePath + } + + cfg, err := crio.LoadConfig(configFilePath) + if err != nil { + return fmt.Errorf("unable to load config: %v", err) + } + + err = crio.UpdateConfig( + cfg, + config.nvidiaOptions.RuntimeName, + config.nvidiaOptions.RuntimePath, + config.nvidiaOptions.SetAsDefault, + ) + if err != nil { + return fmt.Errorf("unable to update config: %v", err) + } + + if config.dryRun { + output, err := toml.Marshal(cfg) + if err != nil { + return fmt.Errorf("unable to convert to TOML: %v", err) + } + os.Stdout.WriteString(fmt.Sprintf("%s\n", output)) + return nil + } + err = crio.FlushConfig(configFilePath, cfg) + if err != nil { + return fmt.Errorf("unable to flush config: %v", err) + } + + m.logger.Infof("Wrote updated config to %v", configFilePath) + m.logger.Infof("It is recommended that the cri-o daemon be restarted.") + + return nil +} diff --git a/internal/config/crio/crio.go b/internal/config/crio/crio.go new file mode 100644 index 00000000..89338a40 --- /dev/null +++ b/internal/config/crio/crio.go @@ -0,0 +1,125 @@ +/** +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package crio + +import ( + "fmt" + "os" + + "github.com/pelletier/go-toml" + log "github.com/sirupsen/logrus" +) + +// LoadConfig loads the cri-o config from disk +func LoadConfig(config string) (*toml.Tree, error) { + log.Infof("Loading config: %v", config) + + info, err := os.Stat(config) + if os.IsExist(err) && info.IsDir() { + return nil, fmt.Errorf("config file is a directory") + } + + configFile := config + if os.IsNotExist(err) { + configFile = "/dev/null" + log.Infof("Config file does not exist, creating new one") + } + + cfg, err := toml.LoadFile(configFile) + if err != nil { + return nil, err + } + + log.Infof("Successfully loaded config") + + return cfg, nil +} + +// UpdateConfig updates the cri-o config to include the NVIDIA Container Runtime +func UpdateConfig(config *toml.Tree, runtimeClass string, runtimePath string, setAsDefault bool) error { + switch runc := config.Get("crio.runtime.runtimes.runc").(type) { + case *toml.Tree: + runc, _ = toml.Load(runc.String()) + config.SetPath([]string{"crio", "runtime", "runtimes", runtimeClass}, runc) + } + + config.SetPath([]string{"crio", "runtime", "runtimes", runtimeClass, "runtime_path"}, runtimePath) + config.SetPath([]string{"crio", "runtime", "runtimes", runtimeClass, "runtime_type"}, "oci") + + if setAsDefault { + config.SetPath([]string{"crio", "runtime", "default_runtime"}, runtimeClass) + } + + return nil +} + +// RevertConfig reverts the cri-o config to remove the NVIDIA Container Runtime +func RevertConfig(config *toml.Tree, runtimeClass string) error { + if runtime, ok := config.GetPath([]string{"crio", "runtime", "default_runtime"}).(string); ok { + if runtimeClass == runtime { + config.DeletePath([]string{"crio", "runtime", "default_runtime"}) + } + } + + runtimeClassPath := []string{"crio", "runtime", "runtimes", runtimeClass} + config.DeletePath(runtimeClassPath) + for i := 0; i < len(runtimeClassPath); i++ { + remainingPath := runtimeClassPath[:len(runtimeClassPath)-i] + if entry, ok := config.GetPath(remainingPath).(*toml.Tree); ok { + if len(entry.Keys()) != 0 { + break + } + config.DeletePath(remainingPath) + } + } + + return nil +} + +// FlushConfig flushes the updated/reverted config out to disk +func FlushConfig(config string, cfg *toml.Tree) error { + log.Infof("Flushing config") + + output, err := cfg.ToTomlString() + if err != nil { + return fmt.Errorf("unable to convert to TOML: %v", err) + } + + switch len(output) { + case 0: + err := os.Remove(config) + if err != nil { + return fmt.Errorf("unable to remove empty file: %v", err) + } + log.Infof("Config empty, removing file") + default: + f, err := os.Create(config) + if err != nil { + return fmt.Errorf("unable to open '%v' for writing: %v", config, err) + } + defer f.Close() + + _, err = f.WriteString(output) + if err != nil { + return fmt.Errorf("unable to write output: %v", err) + } + } + + log.Infof("Successfully flushed config") + + return nil +} diff --git a/tools/container/crio/crio.go b/tools/container/crio/crio.go index 8579a944..daf5ec39 100644 --- a/tools/container/crio/crio.go +++ b/tools/container/crio/crio.go @@ -20,23 +20,47 @@ import ( "encoding/json" "fmt" "os" + "os/exec" "path/filepath" "github.com/NVIDIA/nvidia-container-toolkit/internal/config" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config/crio" + "github.com/pelletier/go-toml" log "github.com/sirupsen/logrus" cli "github.com/urfave/cli/v2" ) const ( + restartModeSystemd = "systemd" + restartModeNone = "none" + + defaultConfigMode = "hook" + + // Hook-based settings defaultHooksDir = "/usr/share/containers/oci/hooks.d" defaultHookFilename = "oci-nvidia-hook.json" + + // Config-based settings + defaultConfig = "/etc/crio/crio.conf" + defaultRuntimeClass = "nvidia" + defaultSetAsDefault = true + defaultRestartMode = restartModeSystemd + defaultHostRootMount = "/host" ) -// options stores the configuration from the command line or environment variables +// options stores the configuration from the command linek or environment variables type options struct { + configMode string + hooksDir string hookFilename string runtimeDir string + + config string + runtimeClass string + setAsDefault bool + restartMode string + hostRootMount string } func main() { @@ -52,7 +76,7 @@ func main() { // Create the 'setup' subcommand setup := cli.Command{} setup.Name = "setup" - setup.Usage = "Create the cri-o hook required to run NVIDIA GPU containers" + setup.Usage = "Configure cri-o for NVIDIA GPU containers" setup.ArgsUsage = "" setup.Action = func(c *cli.Context) error { return Setup(c, &options) @@ -64,7 +88,7 @@ func main() { // Create the 'cleanup' subcommand cleanup := cli.Command{} cleanup.Name = "cleanup" - cleanup.Usage = "Remove the NVIDIA cri-o hook" + cleanup.Usage = "Remove the NVIDIA-specific cri-o configuration" cleanup.Action = func(c *cli.Context) error { return Cleanup(c, &options) } @@ -97,6 +121,50 @@ func main() { EnvVars: []string{"CRIO_HOOK_FILENAME"}, DefaultText: defaultHookFilename, }, + &cli.StringFlag{ + Name: "config-mode", + Usage: "the configuration mode to use. One of [hook | config]", + Value: defaultConfigMode, + Destination: &options.configMode, + EnvVars: []string{"CRIO_CONFIG_MODE"}, + }, + &cli.StringFlag{ + Name: "config", + Usage: "Path to the cri-o config file", + Value: defaultConfig, + Destination: &options.config, + EnvVars: []string{"CRIO_CONFIG"}, + }, + &cli.StringFlag{ + Name: "runtime-class", + Usage: "The name of the runtime class to set for the nvidia-container-runtime", + Value: defaultRuntimeClass, + Destination: &options.runtimeClass, + EnvVars: []string{"CRIO_RUNTIME_CLASS"}, + }, + // The flags below are only used by the 'setup' command. + &cli.BoolFlag{ + Name: "set-as-default", + Usage: "Set nvidia-container-runtime as the default runtime", + Value: defaultSetAsDefault, + Destination: &options.setAsDefault, + EnvVars: []string{"CRIO_SET_AS_DEFAULT"}, + Hidden: true, + }, + &cli.StringFlag{ + Name: "restart-mode", + Usage: "Specify how cri-o should be restarted; If 'none' is selected, it will not be restarted [systemd | none]", + Value: defaultRestartMode, + Destination: &options.restartMode, + EnvVars: []string{"CRIO_RESTART_MODE"}, + }, + &cli.StringFlag{ + Name: "host-root", + Usage: "Specify the path to the host root to be used when restarting crio using systemd", + Value: defaultHostRootMount, + Destination: &options.hostRootMount, + EnvVars: []string{"HOST_ROOT_MOUNT"}, + }, } // Update the subcommand flags with the common subcommand flags @@ -113,6 +181,20 @@ func main() { func Setup(c *cli.Context, o *options) error { log.Infof("Starting 'setup' for %v", c.App.Name) + switch o.configMode { + case "hook": + return setupHook(o) + case "config": + return setupConfig(o) + default: + return fmt.Errorf("invalid config-mode '%v'", o.configMode) + } +} + +// setupHook installs the prestart hook required to launch GPU-enabled containers +func setupHook(o *options) error { + log.Infof("Installing prestart hook") + err := os.MkdirAll(o.hooksDir, 0755) if err != nil { return fmt.Errorf("error creating hooks directory %v: %v", o.hooksDir, err) @@ -127,10 +209,51 @@ func Setup(c *cli.Context, o *options) error { return nil } +// setupConfig updates the cri-o config for the NVIDIA container runtime +func setupConfig(o *options) error { + log.Infof("Updating config file") + + cfg, err := crio.LoadConfig(o.config) + if err != nil { + return fmt.Errorf("unable to load config: %v", err) + } + + err = UpdateConfig(cfg, o) + if err != nil { + return fmt.Errorf("unable to update config: %v", err) + } + + err = crio.FlushConfig(o.config, cfg) + if err != nil { + return fmt.Errorf("unable to flush config: %v", err) + } + + err = RestartCrio(o) + if err != nil { + return fmt.Errorf("unable to restart crio: %v", err) + } + + return nil +} + // Cleanup removes the specified prestart hook func Cleanup(c *cli.Context, o *options) error { log.Infof("Starting 'cleanup' for %v", c.App.Name) + switch o.configMode { + case "hook": + return cleanupHook(o) + case "config": + return cleanupConfig(o) + default: + return fmt.Errorf("invalid config-mode '%v'", o.configMode) + } +} + +// cleanupHook removes the prestart hook +func cleanupHook(o *options) error { + log.Infof("Removing prestart hook") + hookPath := getHookPath(o.hooksDir, o.hookFilename) err := os.Remove(hookPath) if err != nil { @@ -140,6 +263,33 @@ func Cleanup(c *cli.Context, o *options) error { return nil } +// cleanupConfig removes the NVIDIA container runtime from the cri-o config +func cleanupConfig(o *options) error { + log.Infof("Reverting config file modifications") + + cfg, err := crio.LoadConfig(o.config) + if err != nil { + return fmt.Errorf("unable to load config: %v", err) + } + + err = RevertConfig(cfg, o) + if err != nil { + return fmt.Errorf("unable to update config: %v", err) + } + + err = crio.FlushConfig(o.config, cfg) + if err != nil { + return fmt.Errorf("unable to flush config: %v", err) + } + + err = RestartCrio(o) + if err != nil { + return fmt.Errorf("unable to restart crio: %v", err) + } + + return nil +} + // ParseArgs parses the command line arguments to the CLI func ParseArgs(c *cli.Context, o *options) error { args := c.Args() @@ -193,3 +343,45 @@ func generateOciHook(toolkitDir string) podmanHook { } return hook } + +// UpdateConfig updates the cri-o config to include the NVIDIA Container Runtime +func UpdateConfig(config *toml.Tree, o *options) error { + runtimePath := filepath.Join(o.runtimeDir, "nvidia-container-runtime") + return crio.UpdateConfig(config, o.runtimeClass, runtimePath, o.setAsDefault) +} + +// RevertConfig reverts the cri-o config to remove the NVIDIA Container Runtime +func RevertConfig(config *toml.Tree, o *options) error { + return crio.RevertConfig(config, o.runtimeClass) +} + +// RestartCrio restarts crio depending on the value of restartModeFlag +func RestartCrio(o *options) error { + switch o.restartMode { + case restartModeNone: + log.Warnf("Skipping restart of crio due to --restart-mode=%v", o.restartMode) + return nil + case restartModeSystemd: + return RestartCrioSystemd(o.hostRootMount) + default: + return fmt.Errorf("invalid restart mode specified: %v", o.restartMode) + } +} + +// RestartCrioSystemd restarts cri-o using systemctl +func RestartCrioSystemd(hostRootMount string) error { + log.Infof("Restarting cri-o using systemd and host root mounted at %v", hostRootMount) + + command := "chroot" + args := []string{hostRootMount, "systemctl", "restart", "crio"} + + cmd := exec.Command(command, args...) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + err := cmd.Run() + if err != nil { + return fmt.Errorf("error restarting crio using systemd: %v", err) + } + + return nil +}