diff --git a/tools/container/README.md b/tools/container/README.md new file mode 100644 index 00000000..3197b358 --- /dev/null +++ b/tools/container/README.md @@ -0,0 +1,75 @@ +## Introduction + +This repository contains tools that allow docker, containerd, or cri-o to be configured to use the NVIDIA Container Toolkit. + +*Note*: These were copied from the [`container-config` repository](https://gitlab.com/nvidia/container-toolkit/container-config/-/tree/383587f766a55177ede0e39e3810a974043e503e) and are being migrated to commands installed with the NVIDIA Container Toolkit. + +### Docker + +After building the `docker` binary, run: +```bash +docker setup \ + --runtime-name NAME \ + /run/nvidia/toolkit +``` + +Configure the `nvidia-container-runtime` as a docker runtime named `NAME`. If the `--runtime-name` flag is not specified, this runtime would be called `nvidia`. A runtime named `nvidia-experimental` will also be configured using the `nvidia-container-runtime-experimental` OCI-compliant runtime shim. + +Since `--set-as-default` is enabled by default, the specified runtime name will also be set as the default docker runtime. This can be disabled by explicitly specifying `--set-as-default=false`. + +**Note**: If `--runtime-name` is specified as `nvidia-experimental` explicitly, the `nvidia-experimental` runtime will be configured as the default runtime, with the `nvidia` runtime still configured and available for use. + +The following table describes the behaviour for different `--runtime-name` and `--set-as-default` flag combinations. 
+ +| Flags | Installed Runtimes | Default Runtime | +|-------------------------------------------------------------|:--------------------------------|:----------------------| +| **NONE SPECIFIED** | `nvidia`, `nvidia-experimental` | `nvidia` | +| `--runtime-name nvidia` | `nvidia`, `nvidia-experimental` | `nvidia` | +| `--runtime-name NAME` | `NAME`, `nvidia-experimental` | `NAME` | +| `--runtime-name nvidia-experimental` | `nvidia`, `nvidia-experimental` | `nvidia-experimental` | +| `--set-as-default` | `nvidia`, `nvidia-experimental` | `nvidia` | +| `--set-as-default --runtime-name nvidia` | `nvidia`, `nvidia-experimental` | `nvidia` | +| `--set-as-default --runtime-name NAME` | `NAME`, `nvidia-experimental` | `NAME` | +| `--set-as-default --runtime-name nvidia-experimental` | `nvidia`, `nvidia-experimental` | `nvidia-experimental` | +| `--set-as-default=false` | `nvidia`, `nvidia-experimental` | **NOT SET** | +| `--set-as-default=false --runtime-name NAME` | `NAME`, `nvidia-experimental` | **NOT SET** | +| `--set-as-default=false --runtime-name nvidia` | `nvidia`, `nvidia-experimental` | **NOT SET** | +| `--set-as-default=false --runtime-name nvidia-experimental` | `nvidia`, `nvidia-experimental` | **NOT SET** | + +These combinations also hold for the environment variables that map to the command line flags: `DOCKER_RUNTIME_NAME`, `DOCKER_SET_AS_DEFAULT`. + +### Containerd +After building the `containerd` binary, run: +```bash +containerd setup \ + --runtime-class NAME \ + /run/nvidia/toolkit +``` + +Configure the `nvidia-container-runtime` as a runtime class named `NAME`. If the `--runtime-class` flag is not specified, this runtime would be called `nvidia`. A runtime class named `nvidia-experimental` will also be configured using the `nvidia-container-runtime-experimental` OCI-compliant runtime shim. 
+ +Adding the `--set-as-default` flag as follows: +```bash +containerd setup \ + --runtime-class NAME \ + --set-as-default \ + /run/nvidia/toolkit +``` +will set the runtime class `NAME` (or `nvidia` if not specified) as the default runtime class. + +**Note**: If `--runtime-class` is specified as `nvidia-experimental` explicitly and `--set-as-default` is specified, the `nvidia-experimental` runtime will be configured as the default runtime class, with the `nvidia` runtime class still configured and available for use. + +The following table describes the behaviour for different `--runtime-class` and `--set-as-default` flag combinations. + +| Flags | Installed Runtime Classes | Default Runtime Class | +|--------------------------------------------------------|:--------------------------------|:----------------------| +| **NONE SPECIFIED** | `nvidia`, `nvidia-experimental` | **NOT SET** | +| `--runtime-class NAME` | `NAME`, `nvidia-experimental` | **NOT SET** | +| `--runtime-class nvidia` | `nvidia`, `nvidia-experimental` | **NOT SET** | +| `--runtime-class nvidia-experimental` | `nvidia`, `nvidia-experimental` | **NOT SET** | +| `--set-as-default` | `nvidia`, `nvidia-experimental` | `nvidia` | +| `--set-as-default --runtime-class NAME` | `NAME`, `nvidia-experimental` | `NAME` | +| `--set-as-default --runtime-class nvidia` | `nvidia`, `nvidia-experimental` | `nvidia` | +| `--set-as-default --runtime-class nvidia-experimental` | `nvidia`, `nvidia-experimental` | `nvidia-experimental` | + +These combinations also hold for the environment variables that map to the command line flags. diff --git a/tools/container/containerd/config.go b/tools/container/containerd/config.go new file mode 100644 index 00000000..8182eb9e --- /dev/null +++ b/tools/container/containerd/config.go @@ -0,0 +1,116 @@ +/** +# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package main + +import ( + "github.com/pelletier/go-toml" +) + +// UpdateReverter defines the interface for applying and reverting configurations +type UpdateReverter interface { + Update(o *options) error + Revert(o *options) error +} + +type config struct { + *toml.Tree + version int64 + cri string + binaryKey string +} + +// update sets the config version and adds the specified runtime class to the containerd config, seeding it with a copy of the existing runc entry when one is present. +// If setAsDefault is specified, the runtime class is also written as the +// default_runtime_name. +func (config *config) update(runtimeClass string, runtimeType string, runtimeBinary string, setAsDefault bool) { + config.Set("version", config.version) + + runcPath := config.runcPath() + runtimeClassPath := config.runtimeClassPath(runtimeClass) + + switch runc := config.GetPath(runcPath).(type) { + case *toml.Tree: + runc, _ = toml.Load(runc.String()) + config.SetPath(runtimeClassPath, runc) + } + + config.initRuntime(runtimeClassPath, runtimeType, runtimeBinary) + + if setAsDefault { + defaultRuntimeNamePath := config.defaultRuntimeNamePath() + config.SetPath(defaultRuntimeNamePath, runtimeClass) + } +} + +// revert removes the configuration applied in an update call. 
+func (config *config) revert(runtimeClass string) { + runtimeClassPath := config.runtimeClassPath(runtimeClass) + defaultRuntimeNamePath := config.defaultRuntimeNamePath() + + config.DeletePath(runtimeClassPath) + if runtime, ok := config.GetPath(defaultRuntimeNamePath).(string); ok { + if runtimeClass == runtime { + config.DeletePath(defaultRuntimeNamePath) + } + } + + for i := 0; i < len(runtimeClassPath); i++ { + if runtimes, ok := config.GetPath(runtimeClassPath[:len(runtimeClassPath)-i]).(*toml.Tree); ok { + if len(runtimes.Keys()) == 0 { + config.DeletePath(runtimeClassPath[:len(runtimeClassPath)-i]) + } + } + } + + if len(config.Keys()) == 1 && config.Keys()[0] == "version" { + config.Delete("version") + } +} + +// initRuntime creates a runtime config at path if it does not exist and ensures that the +// binary path of the runtime is set under its options table, keyed by config.binaryKey. +func (config *config) initRuntime(path []string, runtimeType string, binary string) { + if config.GetPath(path) == nil { + config.SetPath(append(path, "runtime_type"), runtimeType) + config.SetPath(append(path, "runtime_root"), "") + config.SetPath(append(path, "runtime_engine"), "") + config.SetPath(append(path, "privileged_without_host_devices"), false) + } + + binaryPath := append(path, "options", config.binaryKey) + config.SetPath(binaryPath, binary) +} + +func (config config) runcPath() []string { + return config.runtimeClassPath("runc") +} + +func (config config) runtimeClassBinaryPath(runtimeClass string) []string { + return append(config.runtimeClassPath(runtimeClass), "options", config.binaryKey) +} + +func (config config) runtimeClassPath(runtimeClass string) []string { + return append(config.containerdPath(), "runtimes", runtimeClass) +} + +func (config config) defaultRuntimeNamePath() []string { + return append(config.containerdPath(), "default_runtime_name") +} + +func (config config) containerdPath() []string { + return []string{"plugins", config.cri, "containerd"} +} diff --git 
a/tools/container/containerd/config_v1.go b/tools/container/containerd/config_v1.go new file mode 100644 index 00000000..e46ec21b --- /dev/null +++ b/tools/container/containerd/config_v1.go @@ -0,0 +1,126 @@ +/** +# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package main + +import ( + "path" + + "github.com/pelletier/go-toml" + log "github.com/sirupsen/logrus" +) + +// configV1 represents a V1 containerd config +type configV1 struct { + config +} + +func newConfigV1(cfg *toml.Tree) UpdateReverter { + c := configV1{ + config: config{ + Tree: cfg, + version: 1, + cri: "cri", + binaryKey: "Runtime", + }, + } + + return &c +} + +// Update performs an update specific to v1 of the containerd config +func (config *configV1) Update(o *options) error { + + // For v1 config, the `default_runtime_name` setting is only supported + // for containerd version at least v1.3 + supportsDefaultRuntimeName := !o.useLegacyConfig + + defaultRuntime := o.getDefaultRuntime() + + for runtimeClass, runtimeBinary := range o.getRuntimeBinaries() { + isDefaultRuntime := runtimeClass == defaultRuntime + config.update(runtimeClass, o.runtimeType, runtimeBinary, isDefaultRuntime && supportsDefaultRuntimeName) + + if !isDefaultRuntime { + continue + } + + if supportsDefaultRuntimeName { + defaultRuntimePath := append(config.containerdPath(), "default_runtime") + if config.GetPath(defaultRuntimePath) != nil { + log.Warnf("The 
setting of default_runtime (%v) in containerd is deprecated", defaultRuntimePath) + } + continue + } + + log.Warnf("Setting default_runtime is deprecated") + defaultRuntimePath := append(config.containerdPath(), "default_runtime") + config.initRuntime(defaultRuntimePath, o.runtimeType, runtimeBinary) + } + return nil +} + +// Revert performs a revert specific to v1 of the containerd config. It first cleans up the deprecated default_runtime table and then reverts the runtime classes. NOTE(review): the final loop ranges over nvidiaRuntimeBinaries, whereas configV2.Revert and the default_runtime check above range over o.getRuntimeBinaries(); presumably a custom runtime class name is therefore left in place here - confirm whether this is intended. +func (config *configV1) Revert(o *options) error { + defaultRuntimePath := append(config.containerdPath(), "default_runtime") + defaultRuntimeOptionsPath := append(defaultRuntimePath, "options") + if runtime, ok := config.GetPath(append(defaultRuntimeOptionsPath, "Runtime")).(string); ok { + for _, runtimeBinary := range o.getRuntimeBinaries() { + if path.Base(runtimeBinary) == path.Base(runtime) { + config.DeletePath(append(defaultRuntimeOptionsPath, "Runtime")) + break + } + } + } + + if options, ok := config.GetPath(defaultRuntimeOptionsPath).(*toml.Tree); ok { + if len(options.Keys()) == 0 { + config.DeletePath(defaultRuntimeOptionsPath) + } + } + + if runtime, ok := config.GetPath(defaultRuntimePath).(*toml.Tree); ok { + fields := []string{"runtime_type", "runtime_root", "runtime_engine", "privileged_without_host_devices"} + if len(runtime.Keys()) <= len(fields) { + matches := []string{} + for _, f := range fields { + e := runtime.Get(f) + if e != nil { + matches = append(matches, f) + } + } + if len(matches) == len(runtime.Keys()) { + for _, m := range matches { + runtime.Delete(m) + } + } + } + } + + for i := 0; i < len(defaultRuntimePath); i++ { + if runtimes, ok := config.GetPath(defaultRuntimePath[:len(defaultRuntimePath)-i]).(*toml.Tree); ok { + if len(runtimes.Keys()) == 0 { + config.DeletePath(defaultRuntimePath[:len(defaultRuntimePath)-i]) + } + } + } + + for runtimeClass := range nvidiaRuntimeBinaries { + config.revert(runtimeClass) + } + + return nil +} diff --git a/tools/container/containerd/config_v1_test.go b/tools/container/containerd/config_v1_test.go new 
file mode 100644 index 00000000..246d5274 --- /dev/null +++ b/tools/container/containerd/config_v1_test.go @@ -0,0 +1,365 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package main + +import ( + "testing" + + "github.com/pelletier/go-toml" + "github.com/stretchr/testify/require" +) + +func TestUpdateV1ConfigDefaultRuntime(t *testing.T) { + const runtimeDir = "/test/runtime/dir" + + testCases := []struct { + legacyConfig bool + setAsDefault bool + runtimeClass string + expectedDefaultRuntimeName interface{} + expectedDefaultRuntimeBinary interface{} + }{ + {}, + { + legacyConfig: true, + setAsDefault: false, + expectedDefaultRuntimeName: nil, + expectedDefaultRuntimeBinary: nil, + }, + { + legacyConfig: true, + setAsDefault: true, + expectedDefaultRuntimeName: nil, + expectedDefaultRuntimeBinary: "/test/runtime/dir/nvidia-container-runtime", + }, + { + legacyConfig: true, + setAsDefault: true, + runtimeClass: "NAME", + expectedDefaultRuntimeName: nil, + expectedDefaultRuntimeBinary: "/test/runtime/dir/nvidia-container-runtime", + }, + { + legacyConfig: true, + setAsDefault: true, + runtimeClass: "nvidia-experimental", + expectedDefaultRuntimeName: nil, + expectedDefaultRuntimeBinary: "/test/runtime/dir/nvidia-container-runtime-experimental", + }, + { + legacyConfig: false, + setAsDefault: false, + expectedDefaultRuntimeName: nil, + expectedDefaultRuntimeBinary: nil, + }, + { + legacyConfig: 
false, + setAsDefault: true, + expectedDefaultRuntimeName: "nvidia", + expectedDefaultRuntimeBinary: nil, + }, + { + legacyConfig: false, + setAsDefault: true, + runtimeClass: "NAME", + expectedDefaultRuntimeName: "NAME", + expectedDefaultRuntimeBinary: nil, + }, + { + legacyConfig: false, + setAsDefault: true, + runtimeClass: "nvidia-experimental", + expectedDefaultRuntimeName: "nvidia-experimental", + expectedDefaultRuntimeBinary: nil, + }, + } + + for i, tc := range testCases { + o := &options{ + useLegacyConfig: tc.legacyConfig, + setAsDefault: tc.setAsDefault, + runtimeClass: tc.runtimeClass, + runtimeType: runtimeType, + runtimeDir: runtimeDir, + } + + config, err := toml.TreeFromMap(map[string]interface{}{}) + require.NoError(t, err, "%d: %v", i, tc) + + err = UpdateV1Config(config, o) + require.NoError(t, err, "%d: %v", i, tc) + + defaultRuntimeName := config.GetPath([]string{"plugins", "cri", "containerd", "default_runtime_name"}) + require.EqualValues(t, tc.expectedDefaultRuntimeName, defaultRuntimeName, "%d: %v", i, tc) + + defaultRuntime := config.GetPath([]string{"plugins", "cri", "containerd", "default_runtime"}) + if tc.expectedDefaultRuntimeBinary == nil { + require.Nil(t, defaultRuntime, "%d: %v", i, tc) + } else { + expected, err := runtimeTomlConfigV1(tc.expectedDefaultRuntimeBinary.(string)) + require.NoError(t, err, "%d: %v", i, tc) + + configContents, _ := toml.Marshal(defaultRuntime.(*toml.Tree)) + expectedContents, _ := toml.Marshal(expected) + + require.Equal(t, string(expectedContents), string(configContents), "%d: %v: %v", i, tc) + } + + } +} + +func TestUpdateV1Config(t *testing.T) { + const runtimeDir = "/test/runtime/dir" + const expectedVersion = int64(1) + + expectedBinaries := []string{ + "/test/runtime/dir/nvidia-container-runtime", + "/test/runtime/dir/nvidia-container-runtime-experimental", + } + + testCases := []struct { + runtimeClass string + expectedRuntimes []string + }{ + { + runtimeClass: "nvidia", + expectedRuntimes: 
[]string{"nvidia", "nvidia-experimental"}, + }, + { + runtimeClass: "NAME", + expectedRuntimes: []string{"NAME", "nvidia-experimental"}, + }, + { + runtimeClass: "nvidia-experimental", + expectedRuntimes: []string{"nvidia", "nvidia-experimental"}, + }, + } + + for i, tc := range testCases { + o := &options{ + runtimeClass: tc.runtimeClass, + runtimeType: runtimeType, + runtimeDir: runtimeDir, + } + + config, err := toml.TreeFromMap(map[string]interface{}{}) + require.NoError(t, err, "%d: %v", i, tc) + + err = UpdateV1Config(config, o) + require.NoError(t, err, "%d: %v", i, tc) + + version, ok := config.Get("version").(int64) + require.True(t, ok) + require.EqualValues(t, expectedVersion, version) + + runtimes, ok := config.GetPath([]string{"plugins", "cri", "containerd", "runtimes"}).(*toml.Tree) + require.True(t, ok) + + runtimeClasses := runtimes.Keys() + require.ElementsMatch(t, tc.expectedRuntimes, runtimeClasses, "%d: %v", i, tc) + + for i, r := range tc.expectedRuntimes { + runtimeConfig := runtimes.Get(r) + + expected, err := runtimeTomlConfigV1(expectedBinaries[i]) + require.NoError(t, err, "%d: %v", i, tc) + + configContents, _ := toml.Marshal(runtimeConfig) + expectedContents, _ := toml.Marshal(expected) + + require.Equal(t, string(expectedContents), string(configContents), "%d: %v: %v", i, r, tc) + + } + } +} + +func TestUpdateV1ConfigWithRuncPresent(t *testing.T) { + const runcBinary = "/runc-binary" + const runtimeDir = "/test/runtime/dir" + const expectedVersion = int64(1) + + expectedBinaries := []string{ + runcBinary, + "/test/runtime/dir/nvidia-container-runtime", + "/test/runtime/dir/nvidia-container-runtime-experimental", + } + + testCases := []struct { + runtimeClass string + expectedRuntimes []string + }{ + { + runtimeClass: "nvidia", + expectedRuntimes: []string{"runc", "nvidia", "nvidia-experimental"}, + }, + { + runtimeClass: "NAME", + expectedRuntimes: []string{"runc", "NAME", "nvidia-experimental"}, + }, + { + runtimeClass: 
"nvidia-experimental", + expectedRuntimes: []string{"runc", "nvidia", "nvidia-experimental"}, + }, + } + + for i, tc := range testCases { + o := &options{ + runtimeClass: tc.runtimeClass, + runtimeType: runtimeType, + runtimeDir: runtimeDir, + } + + config, err := toml.TreeFromMap(runcConfigMapV1("/runc-binary")) + require.NoError(t, err, "%d: %v", i, tc) + + err = UpdateV1Config(config, o) + require.NoError(t, err, "%d: %v", i, tc) + + version, ok := config.Get("version").(int64) + require.True(t, ok) + require.EqualValues(t, expectedVersion, version) + + runtimes, ok := config.GetPath([]string{"plugins", "cri", "containerd", "runtimes"}).(*toml.Tree) + require.True(t, ok) + + runtimeClasses := runtimes.Keys() + require.ElementsMatch(t, tc.expectedRuntimes, runtimeClasses, "%d: %v", i, tc) + + for i, r := range tc.expectedRuntimes { + runtimeConfig := runtimes.Get(r) + + expected, err := toml.TreeFromMap(runcRuntimeConfigMapV1(expectedBinaries[i])) + require.NoError(t, err, "%d: %v", i, tc) + + configContents, _ := toml.Marshal(runtimeConfig) + expectedContents, _ := toml.Marshal(expected) + + require.Equal(t, string(expectedContents), string(configContents), "%d: %v: %v", i, r, tc) + + } + } +} + +func TestRevertV1Config(t *testing.T) { + testCases := []struct { + config map[string]interface { + } + expected map[string]interface{} + }{ + {}, + { + config: map[string]interface{}{ + "version": int64(1), + }, + }, + { + config: map[string]interface{}{ + "version": int64(1), + "plugins": map[string]interface{}{ + "cri": map[string]interface{}{ + "containerd": map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia": runtimeMapV1("/test/runtime/dir/nvidia-container-runtime"), + "nvidia-experimental": runtimeMapV1("/test/runtime/dir/nvidia-container-runtime-experimental"), + }, + }, + }, + }, + }, + }, + { + config: map[string]interface{}{ + "version": int64(1), + "plugins": map[string]interface{}{ + "cri": map[string]interface{}{ + "containerd": 
map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia": runtimeMapV1("/test/runtime/dir/nvidia-container-runtime"), + "nvidia-experimental": runtimeMapV1("/test/runtime/dir/nvidia-container-runtime-experimental"), + }, + "default_runtime": runtimeMapV1("/test/runtime/dir/nvidia-container-runtime"), + "default_runtime_name": "nvidia", + }, + }, + }, + }, + }, + } + + for i, tc := range testCases { + o := &options{ + runtimeClass: "nvidia", + } + + config, err := toml.TreeFromMap(tc.config) + require.NoError(t, err, "%d: %v", i, tc) + + expected, err := toml.TreeFromMap(tc.expected) + require.NoError(t, err, "%d: %v", i, tc) + + err = RevertV1Config(config, o) + require.NoError(t, err, "%d: %v", i, tc) + + configContents, _ := toml.Marshal(config) + expectedContents, _ := toml.Marshal(expected) + + require.Equal(t, string(expectedContents), string(configContents), "%d: %v", i, tc) + } +} + +func runtimeTomlConfigV1(binary string) (*toml.Tree, error) { + return toml.TreeFromMap(runtimeMapV1(binary)) +} + +func runtimeMapV1(binary string) map[string]interface{} { + return map[string]interface{}{ + "runtime_type": runtimeType, + "runtime_root": "", + "runtime_engine": "", + "privileged_without_host_devices": false, + "options": map[string]interface{}{ + "Runtime": binary, + }, + } +} + +func runcConfigMapV1(binary string) map[string]interface{} { + return map[string]interface{}{ + "plugins": map[string]interface{}{ + "cri": map[string]interface{}{ + "containerd": map[string]interface{}{ + "runtimes": map[string]interface{}{ + "runc": runcRuntimeConfigMapV1(binary), + }, + }, + }, + }, + } +} + +func runcRuntimeConfigMapV1(binary string) map[string]interface{} { + return map[string]interface{}{ + "runtime_type": "runc_runtime_type", + "runtime_root": "runc_runtime_root", + "runtime_engine": "runc_runtime_engine", + "privileged_without_host_devices": true, + "options": map[string]interface{}{ + "runc-option": "value", + "Runtime": binary, + }, + } +} 
diff --git a/tools/container/containerd/config_v2.go b/tools/container/containerd/config_v2.go new file mode 100644 index 00000000..4d0ef4d9 --- /dev/null +++ b/tools/container/containerd/config_v2.go @@ -0,0 +1,59 @@ +/** +# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package main + +import ( + "github.com/pelletier/go-toml" +) + +// configV2 represents a V2 containerd config; newConfigV2 builds it with version 2, the io.containerd.grpc.v1.cri plugin key and the BinaryName option key. +type configV2 struct { + config +} + +func newConfigV2(cfg *toml.Tree) UpdateReverter { + c := configV2{ + config: config{ + Tree: cfg, + version: 2, + cri: "io.containerd.grpc.v1.cri", + binaryKey: "BinaryName", + }, + } + + return &c +} + +// Update performs an update specific to v2 of the containerd config: every runtime class returned by o.getRuntimeBinaries() is added, and the one equal to o.getDefaultRuntime() is also set as the default. It always returns nil. +func (config *configV2) Update(o *options) error { + defaultRuntime := o.getDefaultRuntime() + for runtimeClass, runtimeBinary := range o.getRuntimeBinaries() { + setAsDefault := defaultRuntime == runtimeClass + config.update(runtimeClass, o.runtimeType, runtimeBinary, setAsDefault) + } + + return nil +} + +// Revert performs a revert specific to v2 of the containerd config: every runtime class returned by o.getRuntimeBinaries() is passed to config.revert. It always returns nil. +func (config *configV2) Revert(o *options) error { + for runtimeClass := range o.getRuntimeBinaries() { + config.revert(runtimeClass) + } + + return nil +} diff --git a/tools/container/containerd/config_v2_test.go b/tools/container/containerd/config_v2_test.go new file mode 100644 index 00000000..9e2342d6 --- /dev/null +++ 
b/tools/container/containerd/config_v2_test.go @@ -0,0 +1,329 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package main + +import ( + "testing" + + "github.com/pelletier/go-toml" + "github.com/stretchr/testify/require" +) + +const ( + runtimeType = "runtime_type" +) + +func TestUpdateV2ConfigDefaultRuntime(t *testing.T) { + const runtimeDir = "/test/runtime/dir" + + testCases := []struct { + setAsDefault bool + runtimeClass string + expectedDefaultRuntimeName interface{} + }{ + {}, + { + setAsDefault: false, + runtimeClass: "nvidia", + expectedDefaultRuntimeName: nil, + }, + { + setAsDefault: false, + runtimeClass: "NAME", + expectedDefaultRuntimeName: nil, + }, + { + setAsDefault: false, + runtimeClass: "nvidia-experimental", + expectedDefaultRuntimeName: nil, + }, + { + setAsDefault: true, + runtimeClass: "nvidia", + expectedDefaultRuntimeName: "nvidia", + }, + { + setAsDefault: true, + runtimeClass: "NAME", + expectedDefaultRuntimeName: "NAME", + }, + { + setAsDefault: true, + runtimeClass: "nvidia-experimental", + expectedDefaultRuntimeName: "nvidia-experimental", + }, + } + + for i, tc := range testCases { + o := &options{ + setAsDefault: tc.setAsDefault, + runtimeClass: tc.runtimeClass, + runtimeDir: runtimeDir, + } + + config, err := toml.TreeFromMap(map[string]interface{}{}) + require.NoError(t, err, "%d: %v", i, tc) + + err = UpdateV2Config(config, o) + require.NoError(t, err, 
"%d: %v", i, tc) + + defaultRuntimeName := config.GetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "default_runtime_name"}) + require.EqualValues(t, tc.expectedDefaultRuntimeName, defaultRuntimeName, "%d: %v", i, tc) + } +} + +func TestUpdateV2Config(t *testing.T) { + const runtimeDir = "/test/runtime/dir" + const expectedVersion = int64(2) + + expectedBinaries := []string{ + "/test/runtime/dir/nvidia-container-runtime", + "/test/runtime/dir/nvidia-container-runtime-experimental", + } + + testCases := []struct { + runtimeClass string + expectedRuntimes []string + }{ + { + runtimeClass: "nvidia", + expectedRuntimes: []string{"nvidia", "nvidia-experimental"}, + }, + { + runtimeClass: "NAME", + expectedRuntimes: []string{"NAME", "nvidia-experimental"}, + }, + { + runtimeClass: "nvidia-experimental", + expectedRuntimes: []string{"nvidia", "nvidia-experimental"}, + }, + } + + for i, tc := range testCases { + o := &options{ + runtimeClass: tc.runtimeClass, + runtimeType: runtimeType, + runtimeDir: runtimeDir, + } + + config, err := toml.TreeFromMap(map[string]interface{}{}) + require.NoError(t, err, "%d: %v", i, tc) + + err = UpdateV2Config(config, o) + require.NoError(t, err, "%d: %v", i, tc) + + version, ok := config.Get("version").(int64) + require.True(t, ok) + require.EqualValues(t, expectedVersion, version, "%d: %v", i, tc) + + runtimes, ok := config.GetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes"}).(*toml.Tree) + require.True(t, ok) + + runtimeClasses := runtimes.Keys() + require.ElementsMatch(t, tc.expectedRuntimes, runtimeClasses, "%d: %v", i, tc) + + for i, r := range tc.expectedRuntimes { + runtimeConfig := runtimes.Get(r) + + expected, err := runtimeTomlConfigV2(expectedBinaries[i]) + require.NoError(t, err, "%d: %v", i, tc) + + configContents, _ := toml.Marshal(runtimeConfig) + expectedContents, _ := toml.Marshal(expected) + + require.Equal(t, string(expectedContents), string(configContents), "%d: %v: 
%v", i, r, tc) + + } + } + +} + +func TestUpdateV2ConfigWithRuncPresent(t *testing.T) { + const runcBinary = "/runc-binary" + const runtimeDir = "/test/runtime/dir" + const expectedVersion = int64(2) + + expectedBinaries := []string{ + runcBinary, + "/test/runtime/dir/nvidia-container-runtime", + "/test/runtime/dir/nvidia-container-runtime-experimental", + } + + testCases := []struct { + runtimeClass string + expectedRuntimes []string + }{ + { + runtimeClass: "nvidia", + expectedRuntimes: []string{"runc", "nvidia", "nvidia-experimental"}, + }, + { + runtimeClass: "NAME", + expectedRuntimes: []string{"runc", "NAME", "nvidia-experimental"}, + }, + { + runtimeClass: "nvidia-experimental", + expectedRuntimes: []string{"runc", "nvidia", "nvidia-experimental"}, + }, + } + + for i, tc := range testCases { + o := &options{ + runtimeClass: tc.runtimeClass, + runtimeType: runtimeType, + runtimeDir: runtimeDir, + } + + config, err := toml.TreeFromMap(runcConfigMapV2("/runc-binary")) + require.NoError(t, err, "%d: %v", i, tc) + + err = UpdateV2Config(config, o) + require.NoError(t, err, "%d: %v", i, tc) + + version, ok := config.Get("version").(int64) + require.True(t, ok) + require.EqualValues(t, expectedVersion, version) + + runtimes, ok := config.GetPath([]string{"plugins", "io.containerd.grpc.v1.cri", "containerd", "runtimes"}).(*toml.Tree) + require.True(t, ok, "%d: %v", i, tc) + + runtimeClasses := runtimes.Keys() + require.ElementsMatch(t, tc.expectedRuntimes, runtimeClasses, "%d: %v", i, tc) + + for i, r := range tc.expectedRuntimes { + runtimeConfig := runtimes.Get(r) + + expected, err := toml.TreeFromMap(runcRuntimeConfigMapV2(expectedBinaries[i])) + require.NoError(t, err, "%d: %v", i, tc) + + configContents, _ := toml.Marshal(runtimeConfig) + expectedContents, _ := toml.Marshal(expected) + + require.Equal(t, string(expectedContents), string(configContents), "%d: %v: %v", i, r, tc) + + } + } +} + +func TestRevertV2Config(t *testing.T) { + testCases := []struct { + 
config map[string]interface { + } + expected map[string]interface{} + }{ + {}, + { + config: map[string]interface{}{ + "version": int64(2), + }, + }, + { + config: map[string]interface{}{ + "version": int64(2), + "plugins": map[string]interface{}{ + "io.containerd.grpc.v1.cri": map[string]interface{}{ + "containerd": map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia": runtimeMapV2("/test/runtime/dir/nvidia-container-runtime"), + "nvidia-experimental": runtimeMapV2("/test/runtime/dir/nvidia-container-runtime-experimental"), + }, + }, + }, + }, + }, + }, + { + config: map[string]interface{}{ + "version": int64(2), + "plugins": map[string]interface{}{ + "io.containerd.grpc.v1.cri": map[string]interface{}{ + "containerd": map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia": runtimeMapV2("/test/runtime/dir/nvidia-container-runtime"), + "nvidia-experimental": runtimeMapV2("/test/runtime/dir/nvidia-container-runtime-experimental"), + }, + "default_runtime_name": "nvidia", + }, + }, + }, + }, + }, + } + + for i, tc := range testCases { + o := &options{ + runtimeClass: "nvidia", + } + + config, err := toml.TreeFromMap(tc.config) + require.NoError(t, err, "%d: %v", i, tc) + + expected, err := toml.TreeFromMap(tc.expected) + require.NoError(t, err, "%d: %v", i, tc) + + err = RevertV2Config(config, o) + require.NoError(t, err, "%d: %v", i, tc) + + configContents, _ := toml.Marshal(config) + expectedContents, _ := toml.Marshal(expected) + + require.Equal(t, string(expectedContents), string(configContents), "%d: %v", i, tc) + } +} + +func runtimeTomlConfigV2(binary string) (*toml.Tree, error) { + return toml.TreeFromMap(runtimeMapV2(binary)) +} + +func runtimeMapV2(binary string) map[string]interface{} { + return map[string]interface{}{ + "runtime_type": runtimeType, + "runtime_root": "", + "runtime_engine": "", + "privileged_without_host_devices": false, + "options": map[string]interface{}{ + "BinaryName": binary, + }, + } +} + +func 
runcConfigMapV2(binary string) map[string]interface{} { + return map[string]interface{}{ + "plugins": map[string]interface{}{ + "io.containerd.grpc.v1.cri": map[string]interface{}{ + "containerd": map[string]interface{}{ + "runtimes": map[string]interface{}{ + "runc": runcRuntimeConfigMapV2(binary), + }, + }, + }, + }, + } +} + +func runcRuntimeConfigMapV2(binary string) map[string]interface{} { + return map[string]interface{}{ + "runtime_type": "runc_runtime_type", + "runtime_root": "runc_runtime_root", + "runtime_engine": "runc_runtime_engine", + "privileged_without_host_devices": true, + "options": map[string]interface{}{ + "runc-option": "value", + "BinaryName": binary, + }, + } +} diff --git a/tools/container/containerd/containerd.go b/tools/container/containerd/containerd.go new file mode 100644 index 00000000..7a34ba2e --- /dev/null +++ b/tools/container/containerd/containerd.go @@ -0,0 +1,587 @@ +/** +# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+*/ + +package main + +import ( + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "syscall" + "time" + + "github.com/containerd/containerd/plugin" + toml "github.com/pelletier/go-toml" + log "github.com/sirupsen/logrus" + cli "github.com/urfave/cli/v2" +) + +const ( + restartModeSignal = "signal" + restartModeSystemd = "systemd" + restartModeNone = "NONE" + + nvidiaRuntimeName = "nvidia" + nvidiaRuntimeBinary = "nvidia-container-runtime" + nvidiaExperimentalRuntimeName = "nvidia-experimental" + nvidiaExperimentalRuntimeBinary = "nvidia-container-runtime-experimental" + + defaultConfig = "/etc/containerd/config.toml" + defaultSocket = "/run/containerd/containerd.sock" + defaultRuntimeClass = "nvidia" + defaultRuntmeType = plugin.RuntimeRuncV2 + defaultSetAsDefault = true + defaultRestartMode = restartModeSignal + defaultHostRootMount = "/host" + + reloadBackoff = 5 * time.Second + maxReloadAttempts = 6 + + socketMessageToGetPID = "" +) + +// nvidiaRuntimeBinaries defines a map of runtime names to binary names +var nvidiaRuntimeBinaries = map[string]string{ + nvidiaRuntimeName: nvidiaRuntimeBinary, + nvidiaExperimentalRuntimeName: nvidiaExperimentalRuntimeBinary, +} + +// options stores the configuration from the command line or environment variables +type options struct { + config string + socket string + runtimeClass string + runtimeType string + setAsDefault bool + restartMode string + hostRootMount string + runtimeDir string + useLegacyConfig bool +} + +func main() { + options := options{} + + // Create the top-level CLI + c := cli.NewApp() + c.Name = "containerd" + c.Usage = "Update a containerd config with the nvidia-container-runtime" + c.Version = "0.1.0" + + // Create the 'setup' subcommand + setup := cli.Command{} + setup.Name = "setup" + setup.Usage = "Trigger a containerd config to be updated" + setup.ArgsUsage = "" + setup.Action = func(c *cli.Context) error { + return Setup(c, &options) + } + + // Create the 'cleanup' subcommand + cleanup := 
cli.Command{} + cleanup.Name = "cleanup" + cleanup.Usage = "Trigger any updates made to a containerd config to be undone" + cleanup.ArgsUsage = "" + cleanup.Action = func(c *cli.Context) error { + return Cleanup(c, &options) + } + + // Register the subcommands with the top-level CLI + c.Commands = []*cli.Command{ + &setup, + &cleanup, + } + + // Setup common flags across both subcommands. All subcommands get the same + // set of flags even if they don't use some of them. This is so that we + // only require the user to specify one set of flags for both 'startup' + // and 'cleanup' to simplify things. + commonFlags := []cli.Flag{ + &cli.StringFlag{ + Name: "config", + Aliases: []string{"c"}, + Usage: "Path to the containerd config file", + Value: defaultConfig, + Destination: &options.config, + EnvVars: []string{"CONTAINERD_CONFIG"}, + }, + &cli.StringFlag{ + Name: "socket", + Aliases: []string{"s"}, + Usage: "Path to the containerd socket file", + Value: defaultSocket, + Destination: &options.socket, + EnvVars: []string{"CONTAINERD_SOCKET"}, + }, + &cli.StringFlag{ + Name: "runtime-class", + Aliases: []string{"r"}, + Usage: "The name of the runtime class to set for the nvidia-container-runtime", + Value: defaultRuntimeClass, + Destination: &options.runtimeClass, + EnvVars: []string{"CONTAINERD_RUNTIME_CLASS"}, + }, + &cli.StringFlag{ + Name: "runtime-type", + Usage: "The runtime_type to use for the configured runtime classes", + Value: defaultRuntmeType, + Destination: &options.runtimeType, + EnvVars: []string{"CONTAINERD_RUNTIME_TYPE"}, + }, + // The flags below are only used by the 'setup' command. 
+ &cli.BoolFlag{ + Name: "set-as-default", + Aliases: []string{"d"}, + Usage: "Set nvidia-container-runtime as the default runtime", + Value: defaultSetAsDefault, + Destination: &options.setAsDefault, + EnvVars: []string{"CONTAINERD_SET_AS_DEFAULT"}, + Hidden: true, + }, + &cli.StringFlag{ + Name: "restart-mode", + Usage: "Specify how containerd should be restarted; [signal | systemd]", + Value: defaultRestartMode, + Destination: &options.restartMode, + EnvVars: []string{"CONTAINERD_RESTART_MODE"}, + }, + &cli.StringFlag{ + Name: "host-root", + Usage: "Specify the path to the host root to be used when restarting containerd using systemd", + Value: defaultHostRootMount, + Destination: &options.hostRootMount, + EnvVars: []string{"HOST_ROOT_MOUNT"}, + }, + &cli.BoolFlag{ + Name: "use-legacy-config", + Usage: "Specify whether a legacy (pre v1.3) config should be used", + Destination: &options.useLegacyConfig, + EnvVars: []string{"CONTAINERD_USE_LEGACY_CONFIG"}, + }, + } + + // Update the subcommand flags with the common subcommand flags + setup.Flags = append([]cli.Flag{}, commonFlags...) + cleanup.Flags = append([]cli.Flag{}, commonFlags...) 
+ + // Run the top-level CLI + if err := c.Run(os.Args); err != nil { + log.Fatal(fmt.Errorf("Error: %v", err)) + } +} + +// Setup updates a containerd configuration to include the nvidia-container-runtime and reloads it +func Setup(c *cli.Context, o *options) error { + log.Infof("Starting 'setup' for %v", c.App.Name) + + runtimeDir, err := ParseArgs(c) + if err != nil { + return fmt.Errorf("unable to parse args: %v", err) + } + o.runtimeDir = runtimeDir + + cfg, err := LoadConfig(o.config) + if err != nil { + return fmt.Errorf("unable to load config: %v", err) + } + + version, err := ParseVersion(cfg, o.useLegacyConfig) + if err != nil { + return fmt.Errorf("unable to parse version: %v", err) + } + + err = UpdateConfig(cfg, o, version) + if err != nil { + return fmt.Errorf("unable to update config: %v", err) + } + + err = FlushConfig(o.config, cfg) + if err != nil { + return fmt.Errorf("unable to flush config: %v", err) + } + + err = RestartContainerd(o) + if err != nil { + return fmt.Errorf("unable to restart containerd: %v", err) + } + + log.Infof("Completed 'setup' for %v", c.App.Name) + + return nil +} + +// Cleanup reverts a containerd configuration to remove the nvidia-container-runtime and reloads it +func Cleanup(c *cli.Context, o *options) error { + log.Infof("Starting 'cleanup' for %v", c.App.Name) + + _, err := ParseArgs(c) + if err != nil { + return fmt.Errorf("unable to parse args: %v", err) + } + + cfg, err := LoadConfig(o.config) + if err != nil { + return fmt.Errorf("unable to load config: %v", err) + } + + version, err := ParseVersion(cfg, o.useLegacyConfig) + if err != nil { + return fmt.Errorf("unable to parse version: %v", err) + } + + err = RevertConfig(cfg, o, version) + if err != nil { + return fmt.Errorf("unable to revert config: %v", err) + } + + err = FlushConfig(o.config, cfg) + if err != nil { + return fmt.Errorf("unable to flush config: %v", err) + } + + err = RestartContainerd(o) + if err != nil { + return fmt.Errorf("unable to 
restart containerd: %v", err) + } + + log.Infof("Completed 'cleanup' for %v", c.App.Name) + + return nil +} + +// ParseArgs parses the command line arguments to the CLI +func ParseArgs(c *cli.Context) (string, error) { + args := c.Args() + + log.Infof("Parsing arguments: %v", args.Slice()) + if args.Len() != 1 { + return "", fmt.Errorf("incorrect number of arguments") + } + runtimeDir := args.Get(0) + log.Infof("Successfully parsed arguments") + + return runtimeDir, nil +} + +// LoadConfig loads the containerd config from disk. A missing file is read as /dev/null; a path that is a directory is rejected. +func LoadConfig(config string) (*toml.Tree, error) { + log.Infof("Loading config: %v", config) + + info, err := os.Stat(config) + if err == nil && info.IsDir() { + return nil, fmt.Errorf("config file is a directory") + } + + configFile := config + if os.IsNotExist(err) { + configFile = "/dev/null" + log.Infof("Config file does not exist, creating new one") + } + + cfg, err := toml.LoadFile(configFile) + if err != nil { + return nil, err + } + + log.Infof("Successfully loaded config") + + return cfg, nil +} + +// ParseVersion parses the version field out of the containerd config +func ParseVersion(config *toml.Tree, useLegacyConfig bool) (int, error) { + var defaultVersion int + if !useLegacyConfig { + defaultVersion = 2 + } else { + defaultVersion = 1 + } + + var version int + switch v := config.Get("version").(type) { + case nil: + switch len(config.Keys()) { + case 0: // No config exists, or the config file is empty, use version inferred from containerd + version = defaultVersion + default: // A config file exists, has content, and no version is set + version = 1 + } + case int64: + version = int(v) + default: + return -1, fmt.Errorf("unsupported type for version field: %v", v) + } + log.Infof("Config version: %v", version) + + if version == 1 { + log.Warnf("Support for containerd config version 1 is deprecated") + } + + return version, nil +} + +// UpdateConfig updates the containerd config to include the nvidia-container-runtime 
+func UpdateConfig(config *toml.Tree, o *options, version int) error { + var err error + + log.Infof("Updating config") + switch version { + case 1: + err = UpdateV1Config(config, o) + case 2: + err = UpdateV2Config(config, o) + default: + err = fmt.Errorf("unsupported containerd config version: %v", version) + } + if err != nil { + return err + } + log.Infof("Successfully updated config") + + return nil +} + +// RevertConfig reverts the containerd config to remove the nvidia-container-runtime +func RevertConfig(config *toml.Tree, o *options, version int) error { + var err error + + log.Infof("Reverting config") + switch version { + case 1: + err = RevertV1Config(config, o) + case 2: + err = RevertV2Config(config, o) + default: + err = fmt.Errorf("unsupported containerd config version: %v", version) + } + if err != nil { + return err + } + log.Infof("Successfully reverted config") + + return nil +} + +// UpdateV1Config performs an update specific to v1 of the containerd config +func UpdateV1Config(config *toml.Tree, o *options) error { + c := newConfigV1(config) + return c.Update(o) +} + +// RevertV1Config performs a revert specific to v1 of the containerd config +func RevertV1Config(config *toml.Tree, o *options) error { + c := newConfigV1(config) + return c.Revert(o) +} + +// UpdateV2Config performs an update specific to v2 of the containerd config +func UpdateV2Config(config *toml.Tree, o *options) error { + c := newConfigV2(config) + return c.Update(o) +} + +// RevertV2Config performs a revert specific to v2 of the containerd config +func RevertV2Config(config *toml.Tree, o *options) error { + c := newConfigV2(config) + return c.Revert(o) +} + +// FlushConfig flushes the updated/reverted config out to disk +func FlushConfig(config string, cfg *toml.Tree) error { + log.Infof("Flushing config") + + output, err := cfg.ToTomlString() + if err != nil { + return fmt.Errorf("unable to convert to TOML: %v", err) + } + + switch len(output) { + case 0: + err := 
os.Remove(config) + if err != nil { + return fmt.Errorf("unable to remove empty file: %v", err) + } + log.Infof("Config empty, removing file") + default: + f, err := os.Create(config) + if err != nil { + return fmt.Errorf("unable to open '%v' for writing: %v", config, err) + } + defer f.Close() + + _, err = f.WriteString(output) + if err != nil { + return fmt.Errorf("unable to write output: %v", err) + } + } + + log.Infof("Successfully flushed config") + + return nil +} + +// RestartContainerd restarts containerd depending on the value of restartModeFlag +func RestartContainerd(o *options) error { + switch o.restartMode { + case restartModeNone: + log.Warnf("Skipping sending signal to containerd due to --restart-mode=%v", o.restartMode) + return nil + case restartModeSignal: + err := SignalContainerd(o) + if err != nil { + return fmt.Errorf("unable to signal containerd: %v", err) + } + case restartModeSystemd: + return RestartContainerdSystemd(o.hostRootMount) + default: + return fmt.Errorf("Invalid restart mode specified: %v", o.restartMode) + } + + return nil +} + +// SignalContainerd sends a SIGHUP signal to the containerd daemon +func SignalContainerd(o *options) error { + log.Infof("Sending SIGHUP signal to containerd") + + // Wrap the logic to perform the SIGHUP in a function so we can retry it on failure + retriable := func() error { + conn, err := net.Dial("unix", o.socket) + if err != nil { + return fmt.Errorf("unable to dial: %v", err) + } + defer conn.Close() + + sconn, err := conn.(*net.UnixConn).SyscallConn() + if err != nil { + return fmt.Errorf("unable to get syscall connection: %v", err) + } + + err1 := sconn.Control(func(fd uintptr) { + err = syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, syscall.SO_PASSCRED, 1) + }) + if err1 != nil { + return fmt.Errorf("unable to issue call on socket fd: %v", err1) + } + if err != nil { + return fmt.Errorf("unable to SetsockoptInt on socket fd: %v", err) + } + + _, _, err = 
conn.(*net.UnixConn).WriteMsgUnix([]byte(socketMessageToGetPID), nil, nil) + if err != nil { + return fmt.Errorf("unable to WriteMsgUnix on socket fd: %v", err) + } + + oob := make([]byte, 1024) + _, oobn, _, _, err := conn.(*net.UnixConn).ReadMsgUnix(nil, oob) + if err != nil { + return fmt.Errorf("unable to ReadMsgUnix on socket fd: %v", err) + } + + oob = oob[:oobn] + scm, err := syscall.ParseSocketControlMessage(oob) + if err != nil { + return fmt.Errorf("unable to ParseSocketControlMessage from message received on socket fd: %v", err) + } + + ucred, err := syscall.ParseUnixCredentials(&scm[0]) + if err != nil { + return fmt.Errorf("unable to ParseUnixCredentials from message received on socket fd: %v", err) + } + + err = syscall.Kill(int(ucred.Pid), syscall.SIGHUP) + if err != nil { + return fmt.Errorf("unable to send SIGHUP to 'containerd' process: %v", err) + } + + return nil + } + + // Try to send a SIGHUP up to maxReloadAttempts times + var err error + for i := 0; i < maxReloadAttempts; i++ { + err = retriable() + if err == nil { + break + } + if i == maxReloadAttempts-1 { + break + } + log.Warnf("Error signaling containerd, attempt %v/%v: %v", i+1, maxReloadAttempts, err) + time.Sleep(reloadBackoff) + } + if err != nil { + log.Warnf("Max retries reached %v/%v, aborting", maxReloadAttempts, maxReloadAttempts) + return err + } + + log.Infof("Successfully signaled containerd") + + return nil +} + +// RestartContainerdSystemd restarts containerd using systemctl +func RestartContainerdSystemd(hostRootMount string) error { + log.Infof("Restarting containerd using systemd and host root mounted at %v", hostRootMount) + + command := "chroot" + args := []string{hostRootMount, "systemctl", "restart", "containerd"} + + cmd := exec.Command(command, args...) 
+ cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + err := cmd.Run() + if err != nil { + return fmt.Errorf("error restarting containerd using systemd: %v", err) + } + + return nil +} + +// getDefaultRuntime returns the default runtime for the configured options. +// If the configuration is invalid or the default runtimes should not be set +// the empty string is returned. +func (o options) getDefaultRuntime() string { + if o.setAsDefault { + if o.runtimeClass == nvidiaExperimentalRuntimeName { + return nvidiaExperimentalRuntimeName + } + if o.runtimeClass == "" { + return defaultRuntimeClass + } + return o.runtimeClass + } + return "" +} + +// getRuntimeBinaries returns a map of runtime names to binary paths. This includes the +// renaming of the `nvidia` runtime as per the --runtime-class command line flag. +func (o options) getRuntimeBinaries() map[string]string { + runtimeBinaries := make(map[string]string) + + for rt, bin := range nvidiaRuntimeBinaries { + runtime := rt + if o.runtimeClass != "" && o.runtimeClass != nvidiaExperimentalRuntimeName && runtime == defaultRuntimeClass { + runtime = o.runtimeClass + } + + runtimeBinaries[runtime] = filepath.Join(o.runtimeDir, bin) + } + + return runtimeBinaries +} diff --git a/tools/container/containerd/containerd_test.go b/tools/container/containerd/containerd_test.go new file mode 100644 index 00000000..d26cd45b --- /dev/null +++ b/tools/container/containerd/containerd_test.go @@ -0,0 +1,106 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package main + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestOptions(t *testing.T) { + testCases := []struct { + options options + expectedDefaultRuntime string + expectedRuntimeBinaries map[string]string + }{ + { + expectedRuntimeBinaries: map[string]string{ + "nvidia": "nvidia-container-runtime", + "nvidia-experimental": "nvidia-container-runtime-experimental", + }, + }, + { + options: options{ + setAsDefault: true, + }, + expectedDefaultRuntime: "nvidia", + expectedRuntimeBinaries: map[string]string{ + "nvidia": "nvidia-container-runtime", + "nvidia-experimental": "nvidia-container-runtime-experimental", + }, + }, + { + options: options{ + setAsDefault: true, + runtimeClass: "nvidia", + }, + expectedDefaultRuntime: "nvidia", + expectedRuntimeBinaries: map[string]string{ + "nvidia": "nvidia-container-runtime", + "nvidia-experimental": "nvidia-container-runtime-experimental", + }, + }, + { + options: options{ + setAsDefault: true, + runtimeClass: "NAME", + }, + expectedDefaultRuntime: "NAME", + expectedRuntimeBinaries: map[string]string{ + "NAME": "nvidia-container-runtime", + "nvidia-experimental": "nvidia-container-runtime-experimental", + }, + }, + { + options: options{ + setAsDefault: false, + runtimeClass: "NAME", + }, + expectedRuntimeBinaries: map[string]string{ + "NAME": "nvidia-container-runtime", + "nvidia-experimental": "nvidia-container-runtime-experimental", + }, + }, + { + options: options{ + setAsDefault: true, + runtimeClass: "nvidia-experimental", + }, + expectedDefaultRuntime: "nvidia-experimental", + expectedRuntimeBinaries: map[string]string{ + "nvidia": "nvidia-container-runtime", + "nvidia-experimental": "nvidia-container-runtime-experimental", + }, + }, + { + options: options{ + setAsDefault: false, + runtimeClass: "nvidia-experimental", + }, + expectedRuntimeBinaries: map[string]string{ + "nvidia": 
"nvidia-container-runtime", + "nvidia-experimental": "nvidia-container-runtime-experimental", + }, + }, + } + + for i, tc := range testCases { + require.Equal(t, tc.expectedDefaultRuntime, tc.options.getDefaultRuntime(), "%d: %v", i, tc) + require.EqualValues(t, tc.expectedRuntimeBinaries, tc.options.getRuntimeBinaries(), "%d: %v", i, tc) + } +} diff --git a/tools/container/crio/crio.go b/tools/container/crio/crio.go new file mode 100644 index 00000000..594f75dc --- /dev/null +++ b/tools/container/crio/crio.go @@ -0,0 +1,185 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+*/ +package main + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + + hooks "github.com/containers/podman/v2/pkg/hooks/1.0.0" + rspec "github.com/opencontainers/runtime-spec/specs-go" + log "github.com/sirupsen/logrus" + cli "github.com/urfave/cli/v2" +) + +const ( + defaultHooksDir = "/usr/share/containers/oci/hooks.d" + defaultHookFilename = "oci-nvidia-hook.json" +) + +var hooksDirFlag string +var hookFilenameFlag string +var tooklitDirArg string + +func main() { + // Create the top-level CLI + c := cli.NewApp() + c.Name = "crio" + c.Usage = "Update cri-o hooks to include the NVIDIA runtime hook" + c.ArgsUsage = "" + c.Version = "0.1.0" + + // Create the 'setup' subcommand + setup := cli.Command{} + setup.Name = "setup" + setup.Usage = "Create the cri-o hook required to run NVIDIA GPU containers" + setup.ArgsUsage = "" + setup.Action = Setup + setup.Before = ParseArgs + + // Create the 'cleanup' subcommand + cleanup := cli.Command{} + cleanup.Name = "cleanup" + cleanup.Usage = "Remove the NVIDIA cri-o hook" + cleanup.Action = Cleanup + + // Register the subcommands with the top-level CLI + c.Commands = []*cli.Command{ + &setup, + &cleanup, + } + + // Setup common flags across both subcommands. All subcommands get the same + // set of flags even if they don't use some of them. This is so that we + // only require the user to specify one set of flags for both 'startup' + // and 'cleanup' to simplify things. 
+ commonFlags := []cli.Flag{ + &cli.StringFlag{ + Name: "hooks-dir", + Aliases: []string{"d"}, + Usage: "path to the cri-o hooks directory", + Value: defaultHooksDir, + Destination: &hooksDirFlag, + EnvVars: []string{"CRIO_HOOKS_DIR"}, + DefaultText: defaultHooksDir, + }, + &cli.StringFlag{ + Name: "hook-filename", + Aliases: []string{"f"}, + Usage: "filename of the cri-o hook that will be created / removed in the hooks directory", + Value: defaultHookFilename, + Destination: &hookFilenameFlag, + EnvVars: []string{"CRIO_HOOK_FILENAME"}, + DefaultText: defaultHookFilename, + }, + } + + // Update the subcommand flags with the common subcommand flags + setup.Flags = append([]cli.Flag{}, commonFlags...) + cleanup.Flags = append([]cli.Flag{}, commonFlags...) + + // Run the top-level CLI + if err := c.Run(os.Args); err != nil { + log.Fatal(fmt.Errorf("error: %v", err)) + } +} + +// Setup installs the prestart hook required to launch GPU-enabled containers +func Setup(c *cli.Context) error { + log.Infof("Starting 'setup' for %v", c.App.Name) + + err := os.MkdirAll(hooksDirFlag, 0755) + if err != nil { + return fmt.Errorf("error creating hooks directory %v: %v", hooksDirFlag, err) + } + + hookPath := getHookPath(hooksDirFlag, hookFilenameFlag) + err = createHook(tooklitDirArg, hookPath) + if err != nil { + return fmt.Errorf("error creating hook: %v", err) + } + + return nil +} + +// Cleanup removes the specified prestart hook +func Cleanup(c *cli.Context) error { + log.Infof("Starting 'cleanup' for %v", c.App.Name) + + hookPath := getHookPath(hooksDirFlag, hookFilenameFlag) + err := os.Remove(hookPath) + if err != nil { + return fmt.Errorf("error removing hook '%v': %v", hookPath, err) + } + + return nil +} + +// ParseArgs parses the command line arguments to the CLI +func ParseArgs(c *cli.Context) error { + args := c.Args() + + log.Infof("Parsing arguments: %v", args.Slice()) + if c.NArg() != 1 { + return fmt.Errorf("incorrect number of arguments") + } + tooklitDirArg = 
args.Get(0) + log.Infof("Successfully parsed arguments") + + return nil +} + +func createHook(toolkitDir string, hookPath string) error { + hook, err := os.Create(hookPath) + if err != nil { + return fmt.Errorf("error creating hook file '%v': %v", hookPath, err) + } + defer hook.Close() + + encoder := json.NewEncoder(hook) + err = encoder.Encode(generateOciHook(tooklitDirArg)) + if err != nil { + return fmt.Errorf("error writing hook file '%v': %v", hookPath, err) + } + return nil +} + +func getHookPath(hooksDir string, hookFilename string) string { + return filepath.Join(hooksDir, hookFilename) +} + +func generateOciHook(toolkitDir string) hooks.Hook { + hookPath := filepath.Join(toolkitDir, "nvidia-container-toolkit") + envPath := "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" + toolkitDir + always := true + + hook := hooks.Hook{ + Version: "1.0.0", + Stages: []string{"prestart"}, + Hook: rspec.Hook{ + Path: hookPath, + Args: []string{"nvidia-container-toolkit", "prestart"}, + Env: []string{envPath}, + }, + When: hooks.When{ + Always: &always, + Commands: []string{".*"}, + }, + } + return hook +} diff --git a/tools/container/docker/docker.go b/tools/container/docker/docker.go new file mode 100644 index 00000000..c85f7475 --- /dev/null +++ b/tools/container/docker/docker.go @@ -0,0 +1,462 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+*/ + +package main + +import ( + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "net" + "os" + "path/filepath" + "syscall" + "time" + + log "github.com/sirupsen/logrus" + cli "github.com/urfave/cli/v2" +) + +const ( + nvidiaRuntimeName = "nvidia" + nvidiaRuntimeBinary = "nvidia-container-runtime" + nvidiaExperimentalRuntimeName = "nvidia-experimental" + nvidiaExperimentalRuntimeBinary = "nvidia-container-runtime-experimental" + + defaultConfig = "/etc/docker/daemon.json" + defaultSocket = "/var/run/docker.sock" + defaultSetAsDefault = true + // defaultRuntimeName specifies the NVIDIA runtime to be use as the default runtime if setting the default runtime is enabled + defaultRuntimeName = nvidiaRuntimeName + + reloadBackoff = 5 * time.Second + maxReloadAttempts = 6 + + defaultDockerRuntime = "runc" + socketMessageToGetPID = "GET /info HTTP/1.0\r\n\r\n" +) + +// nvidiaRuntimeBinaries defines a map of runtime names to binary names +var nvidiaRuntimeBinaries = map[string]string{ + nvidiaRuntimeName: nvidiaRuntimeBinary, + nvidiaExperimentalRuntimeName: nvidiaExperimentalRuntimeBinary, +} + +// options stores the configuration from the command line or environment variables +type options struct { + config string + socket string + runtimeName string + setAsDefault bool + runtimeDir string +} + +func main() { + options := options{} + + // Create the top-level CLI + c := cli.NewApp() + c.Name = "docker" + c.Usage = "Update docker config with the nvidia runtime" + c.Version = "0.1.0" + + // Create the 'setup' subcommand + setup := cli.Command{} + setup.Name = "setup" + setup.Usage = "Trigger docker config to be updated" + setup.ArgsUsage = "" + setup.Action = func(c *cli.Context) error { + return Setup(c, &options) + } + + // Create the 'cleanup' subcommand + cleanup := cli.Command{} + cleanup.Name = "cleanup" + cleanup.Usage = "Trigger any updates made to docker config to be undone" + cleanup.ArgsUsage = "" + cleanup.Action = func(c *cli.Context) error { + return 
Cleanup(c, &options) + } + + // Register the subcommands with the top-level CLI + c.Commands = []*cli.Command{ + &setup, + &cleanup, + } + + // Setup common flags across both subcommands. All subcommands get the same + // set of flags even if they don't use some of them. This is so that we + // only require the user to specify one set of flags for both 'startup' + // and 'cleanup' to simplify things. + commonFlags := []cli.Flag{ + &cli.StringFlag{ + Name: "config", + Aliases: []string{"c"}, + Usage: "Path to docker config file", + Value: defaultConfig, + Destination: &options.config, + EnvVars: []string{"DOCKER_CONFIG"}, + }, + &cli.StringFlag{ + Name: "socket", + Aliases: []string{"s"}, + Usage: "Path to the docker socket file", + Value: defaultSocket, + Destination: &options.socket, + EnvVars: []string{"DOCKER_SOCKET"}, + }, + // The flags below are only used by the 'setup' command. + &cli.StringFlag{ + Name: "runtime-name", + Aliases: []string{"r"}, + Usage: "Specify the name of the `nvidia` runtime. If set-as-default is selected, the runtime is used as the default runtime.", + Value: defaultRuntimeName, + Destination: &options.runtimeName, + EnvVars: []string{"DOCKER_RUNTIME_NAME"}, + }, + &cli.BoolFlag{ + Name: "set-as-default", + Aliases: []string{"d"}, + Usage: "Set the `nvidia` runtime as the default runtime. If --runtime-name is specified as `nvidia-experimental` the experimental runtime is set as the default runtime instead", + Value: defaultSetAsDefault, + Destination: &options.setAsDefault, + EnvVars: []string{"DOCKER_SET_AS_DEFAULT"}, + Hidden: true, + }, + } + + // Update the subcommand flags with the common subcommand flags + setup.Flags = append([]cli.Flag{}, commonFlags...) + cleanup.Flags = append([]cli.Flag{}, commonFlags...) 
+ + // Run the top-level CLI + if err := c.Run(os.Args); err != nil { + log.Errorf("Error running docker configuration: %v", err) + os.Exit(1) + } +} + +// Setup updates docker configuration to include the nvidia runtime and reloads it +func Setup(c *cli.Context, o *options) error { + log.Infof("Starting 'setup' for %v", c.App.Name) + + runtimeDir, err := ParseArgs(c) + if err != nil { + return fmt.Errorf("unable to parse args: %v", err) + } + o.runtimeDir = runtimeDir + + cfg, err := LoadConfig(o.config) + if err != nil { + return fmt.Errorf("unable to load config: %v", err) + } + + err = UpdateConfig(cfg, o) + if err != nil { + return fmt.Errorf("unable to update config: %v", err) + } + + err = FlushConfig(cfg, o.config) + if err != nil { + return fmt.Errorf("unable to flush config: %v", err) + } + + err = SignalDocker(o.socket) + if err != nil { + return fmt.Errorf("unable to signal docker: %v", err) + } + + log.Infof("Completed 'setup' for %v", c.App.Name) + + return nil +} + +// Cleanup reverts docker configuration to remove the nvidia runtime and reloads it +func Cleanup(c *cli.Context, o *options) error { + log.Infof("Starting 'cleanup' for %v", c.App.Name) + + _, err := ParseArgs(c) + if err != nil { + return fmt.Errorf("unable to parse args: %v", err) + } + + cfg, err := LoadConfig(o.config) + if err != nil { + return fmt.Errorf("unable to load config: %v", err) + } + + err = RevertConfig(cfg) + if err != nil { + return fmt.Errorf("unable to revert config: %v", err) + } + + err = FlushConfig(cfg, o.config) + if err != nil { + return fmt.Errorf("unable to flush config: %v", err) + } + + err = SignalDocker(o.socket) + if err != nil { + return fmt.Errorf("unable to signal docker: %v", err) + } + + log.Infof("Completed 'cleanup' for %v", c.App.Name) + + return nil +} + +// ParseArgs parses the command line arguments to the CLI +func ParseArgs(c *cli.Context) (string, error) { + args := c.Args() + + log.Infof("Parsing arguments: %v", args.Slice()) + if 
args.Len() != 1 { + return "", fmt.Errorf("incorrect number of arguments") + } + runtimeDir := args.Get(0) + log.Infof("Successfully parsed arguments") + + return runtimeDir, nil +} + +// LoadConfig loads the docker config from disk. If the config file does not exist +// an empty config is returned instead of an error. +func LoadConfig(config string) (map[string]interface{}, error) { + log.Infof("Loading config: %v", config) + + info, err := os.Stat(config) + // info is only valid when Stat succeeds, and Stat reports an existing + // directory with a nil error (os.IsExist never matches a Stat result). + if err == nil && info.IsDir() { + return nil, fmt.Errorf("config file is a directory") + } + + cfg := make(map[string]interface{}) + + if os.IsNotExist(err) { + log.Infof("Config file does not exist, creating new one") + return cfg, nil + } + + readBytes, err := ioutil.ReadFile(config) + if err != nil { + return nil, fmt.Errorf("unable to read config: %v", err) + } + + reader := bytes.NewReader(readBytes) + if err := json.NewDecoder(reader).Decode(&cfg); err != nil { + return nil, err + } + + log.Infof("Successfully loaded config") + return cfg, nil +} + +// UpdateConfig updates the docker config to include the nvidia runtimes +func UpdateConfig(config map[string]interface{}, o *options) error { + defaultRuntime := o.getDefaultRuntime() + if defaultRuntime != "" { + config["default-runtime"] = defaultRuntime + } + + runtimes := make(map[string]interface{}) + if _, exists := config["runtimes"]; exists { + runtimes = config["runtimes"].(map[string]interface{}) + } + + for name, rt := range o.runtimes() { + runtimes[name] = rt + } + + config["runtimes"] = runtimes + return nil +} + +// RevertConfig reverts the docker config to remove the nvidia runtime +func RevertConfig(config map[string]interface{}) error { + if _, exists := config["default-runtime"]; exists { + defaultRuntime := config["default-runtime"].(string) + if _, exists := nvidiaRuntimeBinaries[defaultRuntime]; exists { + config["default-runtime"] = defaultDockerRuntime + } + } + + if _, exists := config["runtimes"]; exists { + runtimes := config["runtimes"].(map[string]interface{}) + + for name := range nvidiaRuntimeBinaries { +
delete(runtimes, name) + } + + if len(runtimes) == 0 { + delete(config, "runtimes") + } + } + return nil +} + +// FlushConfig flushes the updated/reverted config out to disk +func FlushConfig(cfg map[string]interface{}, config string) error { + log.Infof("Flushing config") + + output, err := json.MarshalIndent(cfg, "", " ") + if err != nil { + return fmt.Errorf("unable to convert to JSON: %v", err) + } + + switch len(output) { + case 0: + err := os.Remove(config) + if err != nil { + return fmt.Errorf("unable to remove empty file: %v", err) + } + log.Infof("Config empty, removing file") + default: + f, err := os.Create(config) + if err != nil { + return fmt.Errorf("unable to open %v for writing: %v", config, err) + } + defer f.Close() + + _, err = f.WriteString(string(output)) + if err != nil { + return fmt.Errorf("unable to write output: %v", err) + } + } + + log.Infof("Successfully flushed config") + + return nil +} + +// SignalDocker sends a SIGHUP signal to docker daemon +func SignalDocker(socket string) error { + log.Infof("Sending SIGHUP signal to docker") + + // Wrap the logic to perform the SIGHUP in a function so we can retry it on failure + retriable := func() error { + conn, err := net.Dial("unix", socket) + if err != nil { + return fmt.Errorf("unable to dial: %v", err) + } + defer conn.Close() + + sconn, err := conn.(*net.UnixConn).SyscallConn() + if err != nil { + return fmt.Errorf("unable to get syscall connection: %v", err) + } + + err1 := sconn.Control(func(fd uintptr) { + err = syscall.SetsockoptInt(int(fd), syscall.SOL_SOCKET, syscall.SO_PASSCRED, 1) + }) + if err1 != nil { + return fmt.Errorf("unable to issue call on socket fd: %v", err1) + } + if err != nil { + return fmt.Errorf("unable to SetsockoptInt on socket fd: %v", err) + } + + _, _, err = conn.(*net.UnixConn).WriteMsgUnix([]byte(socketMessageToGetPID), nil, nil) + if err != nil { + return fmt.Errorf("unable to WriteMsgUnix on socket fd: %v", err) + } + + oob := make([]byte, 1024) + _, 
oobn, _, _, err := conn.(*net.UnixConn).ReadMsgUnix(nil, oob) + if err != nil { + return fmt.Errorf("unable to ReadMsgUnix on socket fd: %v", err) + } + + oob = oob[:oobn] + scm, err := syscall.ParseSocketControlMessage(oob) + if err != nil { + return fmt.Errorf("unable to ParseSocketControlMessage from message received on socket fd: %v", err) + } + // No out-of-band data yields an empty slice and a nil error; indexing + // scm[0] below would then panic instead of allowing a retry. + if len(scm) == 0 { + return fmt.Errorf("no socket control message received on socket fd") + } + + ucred, err := syscall.ParseUnixCredentials(&scm[0]) + if err != nil { + return fmt.Errorf("unable to ParseUnixCredentials from message received on socket fd: %v", err) + } + + err = syscall.Kill(int(ucred.Pid), syscall.SIGHUP) + if err != nil { + return fmt.Errorf("unable to send SIGHUP to 'docker' process: %v", err) + } + + return nil + } + + // Try to send a SIGHUP up to maxReloadAttempts times + var err error + for i := 0; i < maxReloadAttempts; i++ { + err = retriable() + if err == nil { + break + } + if i == maxReloadAttempts-1 { + break + } + log.Warnf("Error signaling docker, attempt %v/%v: %v", i+1, maxReloadAttempts, err) + time.Sleep(reloadBackoff) + } + if err != nil { + log.Warnf("Max retries reached %v/%v, aborting", maxReloadAttempts, maxReloadAttempts) + return err + } + + log.Infof("Successfully signaled docker") + + return nil +} + +// getDefaultRuntime returns the default runtime for the configured options. +// If the configuration is invalid or the default runtimes should not be set +// the empty string is returned. +func (o options) getDefaultRuntime() string { + if !o.setAsDefault { + return "" + } + + return o.runtimeName +} + +// runtimes returns the docker runtime definitions for the supported nvidia runtimes +// for the given options.
This includes the path with the options runtimeDir applied +func (o options) runtimes() map[string]interface{} { + runtimes := make(map[string]interface{}) + for r, bin := range o.getRuntimeBinaries() { + runtimes[r] = map[string]interface{}{ + "path": bin, + "args": []string{}, + } + } + return runtimes +} + +// getRuntimeBinaries returns a map of runtime names to binary paths. This includes the +// renaming of the `nvidia` runtime as per the --runtime-name command line flag. +func (o options) getRuntimeBinaries() map[string]string { + runtimeBinaries := make(map[string]string) + + for rt, bin := range nvidiaRuntimeBinaries { + runtime := rt + // Only the entry keyed by defaultRuntimeName is renamed, and only when runtimeName + // is set to something other than nvidiaExperimentalRuntimeName. + if o.runtimeName != "" && o.runtimeName != nvidiaExperimentalRuntimeName && runtime == defaultRuntimeName { + runtime = o.runtimeName + } + + runtimeBinaries[runtime] = filepath.Join(o.runtimeDir, bin) + } + + return runtimeBinaries +} diff --git a/tools/container/docker/docker_test.go b/tools/container/docker/docker_test.go new file mode 100644 index 00000000..c43ffa83 --- /dev/null +++ b/tools/container/docker/docker_test.go @@ -0,0 +1,423 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+*/ + +package main + +import ( + "encoding/json" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestUpdateConfigDefaultRuntime(t *testing.T) { + const runtimeDir = "/test/runtime/dir" + + testCases := []struct { + setAsDefault bool + runtimeName string + expectedDefaultRuntimeName interface{} + }{ + {}, + { + setAsDefault: false, + expectedDefaultRuntimeName: nil, + }, + { + setAsDefault: true, + runtimeName: "NAME", + expectedDefaultRuntimeName: "NAME", + }, + { + setAsDefault: true, + runtimeName: "nvidia-experimental", + expectedDefaultRuntimeName: "nvidia-experimental", + }, + { + setAsDefault: true, + runtimeName: "nvidia", + expectedDefaultRuntimeName: "nvidia", + }, + } + + for i, tc := range testCases { + o := &options{ + setAsDefault: tc.setAsDefault, + runtimeName: tc.runtimeName, + runtimeDir: runtimeDir, + } + + config := map[string]interface{}{} + + err := UpdateConfig(config, o) + require.NoError(t, err, "%d: %v", i, tc) + + defaultRuntimeName := config["default-runtime"] + require.EqualValues(t, tc.expectedDefaultRuntimeName, defaultRuntimeName, "%d: %v", i, tc) + } +} + +func TestUpdateConfig(t *testing.T) { + const runtimeDir = "/test/runtime/dir" + + testCases := []struct { + config map[string]interface{} + setAsDefault bool + runtimeName string + expectedConfig map[string]interface{} + }{ + { + config: map[string]interface{}{}, + setAsDefault: false, + expectedConfig: map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + "nvidia-experimental": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime-experimental", + "args": []string{}, + }, + }, + }, + }, + { + config: map[string]interface{}{}, + setAsDefault: false, + runtimeName: "NAME", + expectedConfig: map[string]interface{}{ + "runtimes": map[string]interface{}{ + "NAME": map[string]interface{}{ + "path": 
"/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + "nvidia-experimental": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime-experimental", + "args": []string{}, + }, + }, + }, + }, + { + config: map[string]interface{}{}, + setAsDefault: false, + runtimeName: "nvidia-experimental", + expectedConfig: map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + "nvidia-experimental": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime-experimental", + "args": []string{}, + }, + }, + }, + }, + { + config: map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "nvidia-container-runtime", + "args": []string{}, + }, + }, + }, + setAsDefault: false, + expectedConfig: map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + "nvidia-experimental": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime-experimental", + "args": []string{}, + }, + }, + }, + }, + { + config: map[string]interface{}{ + "runtimes": map[string]interface{}{ + "not-nvidia": map[string]interface{}{ + "path": "some-other-path", + "args": []string{}, + }, + }, + }, + expectedConfig: map[string]interface{}{ + "runtimes": map[string]interface{}{ + "not-nvidia": map[string]interface{}{ + "path": "some-other-path", + "args": []string{}, + }, + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + "nvidia-experimental": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime-experimental", + "args": []string{}, + }, + }, + }, + }, + { + config: map[string]interface{}{ + "default-runtime": "runc", + }, + setAsDefault: true, + 
runtimeName: "nvidia", + expectedConfig: map[string]interface{}{ + "default-runtime": "nvidia", + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + "nvidia-experimental": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime-experimental", + "args": []string{}, + }, + }, + }, + }, + { + config: map[string]interface{}{ + "default-runtime": "runc", + }, + setAsDefault: true, + runtimeName: "nvidia-experimental", + expectedConfig: map[string]interface{}{ + "default-runtime": "nvidia-experimental", + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + "nvidia-experimental": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime-experimental", + "args": []string{}, + }, + }, + }, + }, + { + config: map[string]interface{}{ + "exec-opts": []string{"native.cgroupdriver=systemd"}, + "log-driver": "json-file", + "log-opts": map[string]string{ + "max-size": "100m", + }, + "storage-driver": "overlay2", + }, + expectedConfig: map[string]interface{}{ + "exec-opts": []string{"native.cgroupdriver=systemd"}, + "log-driver": "json-file", + "log-opts": map[string]string{ + "max-size": "100m", + }, + "storage-driver": "overlay2", + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + "nvidia-experimental": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime-experimental", + "args": []string{}, + }, + }, + }, + }, + } + + for i, tc := range testCases { + options := &options{ + setAsDefault: tc.setAsDefault, + runtimeName: tc.runtimeName, + runtimeDir: runtimeDir, + } + err := UpdateConfig(tc.config, options) + require.NoError(t, err, "%d: %v", i, tc) + + configContent, err := 
json.MarshalIndent(tc.config, "", " ") + require.NoError(t, err) + + expectedContent, err := json.MarshalIndent(tc.expectedConfig, "", " ") + require.NoError(t, err) + + require.EqualValues(t, string(expectedContent), string(configContent), "%d: %v", i, tc) + } +} + +func TestRevertConfig(t *testing.T) { + testCases := []struct { + config map[string]interface{} + expectedConfig map[string]interface{} + }{ + { + config: map[string]interface{}{}, + expectedConfig: map[string]interface{}{}, + }, + { + config: map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + }, + }, + expectedConfig: map[string]interface{}{}, + }, + { + config: map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia-experimental": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + }, + }, + expectedConfig: map[string]interface{}{}, + }, + { + config: map[string]interface{}{ + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + "nvidia-experimental": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime-experimental", + "args": []string{}, + }, + }, + }, + expectedConfig: map[string]interface{}{}, + }, + { + config: map[string]interface{}{ + "default-runtime": "nvidia", + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + }, + }, + expectedConfig: map[string]interface{}{ + "default-runtime": "runc", + }, + }, + { + config: map[string]interface{}{ + "default-runtime": "not-nvidia", + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + }, + }, + expectedConfig: 
map[string]interface{}{ + "default-runtime": "not-nvidia", + }, + }, + { + config: map[string]interface{}{ + "exec-opts": []string{"native.cgroupdriver=systemd"}, + "log-driver": "json-file", + "log-opts": map[string]string{ + "max-size": "100m", + }, + "storage-driver": "overlay2", + "runtimes": map[string]interface{}{ + "nvidia": map[string]interface{}{ + "path": "/test/runtime/dir/nvidia-container-runtime", + "args": []string{}, + }, + }, + }, + expectedConfig: map[string]interface{}{ + "exec-opts": []string{"native.cgroupdriver=systemd"}, + "log-driver": "json-file", + "log-opts": map[string]string{ + "max-size": "100m", + }, + "storage-driver": "overlay2", + }, + }, + } + + for i, tc := range testCases { + err := RevertConfig(tc.config) + + require.NoError(t, err, "%d: %v", i, tc) + + configContent, err := json.MarshalIndent(tc.config, "", " ") + require.NoError(t, err) + + expectedContent, err := json.MarshalIndent(tc.expectedConfig, "", " ") + require.NoError(t, err) + + require.EqualValues(t, string(expectedContent), string(configContent), "%d: %v", i, tc) + } +} + +func TestFlagsDefaultRuntime(t *testing.T) { + testCases := []struct { + setAsDefault bool + runtimeName string + expected string + }{ + { + expected: "", + }, + { + runtimeName: "not-bool", + expected: "", + }, + { + setAsDefault: false, + runtimeName: "nvidia", + expected: "", + }, + { + setAsDefault: true, + runtimeName: "nvidia", + expected: "nvidia", + }, + { + setAsDefault: true, + runtimeName: "nvidia-experimental", + expected: "nvidia-experimental", + }, + } + + for i, tc := range testCases { + f := options{ + setAsDefault: tc.setAsDefault, + runtimeName: tc.runtimeName, + } + + require.Equal(t, tc.expected, f.getDefaultRuntime(), "%d: %v", i, tc) + } +} diff --git a/tools/container/nvidia-toolkit/run.go b/tools/container/nvidia-toolkit/run.go new file mode 100644 index 00000000..69abc9ba --- /dev/null +++ b/tools/container/nvidia-toolkit/run.go @@ -0,0 +1,290 @@ +package main + +import 
( + "fmt" + "os" + "os/exec" + "os/signal" + "path/filepath" + "strings" + "syscall" + + log "github.com/sirupsen/logrus" + cli "github.com/urfave/cli/v2" + unix "golang.org/x/sys/unix" +) + +const ( + runDir = "/run/nvidia" + pidFile = runDir + "/toolkit.pid" + toolkitCommand = "toolkit" + toolkitSubDir = "toolkit" + + defaultToolkitArgs = "" + defaultRuntime = "docker" + defaultRuntimeArgs = "" +) + +var availableRuntimes = map[string]struct{}{"docker": {}, "crio": {}, "containerd": {}} + +var waitingForSignal = make(chan bool, 1) +var signalReceived = make(chan bool, 1) + +var destinationArg string +var noDaemonFlag bool +var toolkitArgsFlag string +var runtimeFlag string +var runtimeArgsFlag string + +// Version defines the CLI version. This is set at build time using LD FLAGS +var Version = "development" + +func main() { + // Create the top-level CLI + c := cli.NewApp() + c.Name = "nvidia-toolkit" + c.Usage = "Install the nvidia-container-toolkit for use by a given runtime" + c.UsageText = "DESTINATION [-n | --no-daemon] [-t | --toolkit-args] [-r | --runtime] [-u | --runtime-args]" + c.Description = "DESTINATION points to the host path underneath which the nvidia-container-toolkit should be installed.\nIt will be installed at ${DESTINATION}/toolkit" + c.Version = Version + c.Action = Run + + // Setup flags for the CLI + c.Flags = []cli.Flag{ + &cli.BoolFlag{ + Name: "no-daemon", + Aliases: []string{"n"}, + Usage: "terminate immediatly after setting up the runtime. Note that no cleanup will be performed", + Destination: &noDaemonFlag, + EnvVars: []string{"NO_DAEMON"}, + }, + &cli.StringFlag{ + Name: "toolkit-args", + Aliases: []string{"t"}, + Usage: "arguments to pass to the underlying 'toolkit' command", + Value: defaultToolkitArgs, + Destination: &toolkitArgsFlag, + EnvVars: []string{"TOOLKIT_ARGS"}, + }, + &cli.StringFlag{ + Name: "runtime", + Aliases: []string{"r"}, + Usage: "the runtime to setup on this node. 
One of {'docker', 'crio', 'containerd'}", + Value: defaultRuntime, + Destination: &runtimeFlag, + EnvVars: []string{"RUNTIME"}, + }, + &cli.StringFlag{ + Name: "runtime-args", + Aliases: []string{"u"}, + Usage: "arguments to pass to 'docker', 'crio', or 'containerd' setup command", + Value: defaultRuntimeArgs, + Destination: &runtimeArgsFlag, + EnvVars: []string{"RUNTIME_ARGS"}, + }, + } + + // Run the CLI + log.Infof("Starting %v", c.Name) + + remainingArgs, err := ParseArgs(os.Args) + if err != nil { + log.Errorf("Error: unable to parse arguments: %v", err) + os.Exit(1) + } + + if err := c.Run(remainingArgs); err != nil { + log.Errorf("error running nvidia-toolkit: %v", err) + os.Exit(1) + } + + log.Infof("Completed %v", c.Name) +} + +// Run runs the core logic of the CLI +func Run(c *cli.Context) error { + err := verifyFlags() + if err != nil { + return fmt.Errorf("unable to verify flags: %v", err) + } + + err = initialize() + if err != nil { + return fmt.Errorf("unable to initialize: %v", err) + } + defer shutdown() + + err = installToolkit() + if err != nil { + return fmt.Errorf("unable to install toolkit: %v", err) + } + + err = setupRuntime() + if err != nil { + return fmt.Errorf("unable to setup runtime: %v", err) + } + + if !noDaemonFlag { + err = waitForSignal() + if err != nil { + return fmt.Errorf("unable to wait for signal: %v", err) + } + + err = cleanupRuntime() + if err != nil { + return fmt.Errorf("unable to cleanup runtime: %v", err) + } + } + + return nil +} + +// ParseArgs parses the command line arguments and returns the remaining arguments +func ParseArgs(args []string) ([]string, error) { + log.Infof("Parsing arguments") + + numPositionalArgs := 2 // Includes command itself + + if len(args) < numPositionalArgs { + return nil, fmt.Errorf("missing arguments") + } + + for _, arg := range args { + if arg == "--help" || arg == "-h" { + return []string{args[0], arg}, nil + } + if arg == "--version" || arg == "-v" { + return []string{args[0], arg}, 
nil + } + } + + for _, arg := range args[:numPositionalArgs] { + if strings.HasPrefix(arg, "-") { + return nil, fmt.Errorf("unexpected flag where argument should be") + } + } + + for _, arg := range args[numPositionalArgs:] { + if !strings.HasPrefix(arg, "-") { + return nil, fmt.Errorf("unexpected argument where flag should be") + } + } + + destinationArg = args[1] + + return append([]string{args[0]}, args[numPositionalArgs:]...), nil +} + +func verifyFlags() error { + log.Infof("Verifying Flags") + if _, exists := availableRuntimes[runtimeFlag]; !exists { + return fmt.Errorf("unknown runtime: %v", runtimeFlag) + } + return nil +} + +func initialize() error { + log.Infof("Initializing") + + f, err := os.Create(pidFile) + if err != nil { + return fmt.Errorf("unable to create pidfile: %v", err) + } + + err = unix.Flock(int(f.Fd()), unix.LOCK_EX|unix.LOCK_NB) + if err != nil { + log.Warnf("Unable to get exclusive lock on '%v'", pidFile) + log.Warnf("This normally means an instance of the NVIDIA toolkit Container is already running, aborting") + return fmt.Errorf("unable to get flock on pidfile: %v", err) + } + + _, err = f.WriteString(fmt.Sprintf("%v\n", os.Getpid())) + if err != nil { + return fmt.Errorf("unable to write PID to pidfile: %v", err) + } + + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGQUIT, syscall.SIGPIPE, syscall.SIGTERM) + go func() { + <-sigs + select { + case <-waitingForSignal: + signalReceived <- true + default: + log.Infof("Signal received, exiting early") + shutdown() + os.Exit(0) + } + }() + + return nil +} + +func installToolkit() error { + toolkitDir := filepath.Join(destinationArg, toolkitSubDir) + + log.Infof("Installing toolkit") + + cmdline := fmt.Sprintf("%v install %v %v\n", toolkitCommand, toolkitArgsFlag, toolkitDir) + cmd := exec.Command("sh", "-c", cmdline) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + err := cmd.Run() + if err != nil { + return fmt.Errorf("error running %v 
command: %v", toolkitCommand, err) + } + + return nil +} + +func setupRuntime() error { + toolkitDir := filepath.Join(destinationArg, toolkitSubDir) + + log.Infof("Setting up runtime") + + cmdline := fmt.Sprintf("%v setup %v %v\n", runtimeFlag, runtimeArgsFlag, toolkitDir) + + cmd := exec.Command("sh", "-c", cmdline) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + err := cmd.Run() + if err != nil { + return fmt.Errorf("error running %v command: %v", runtimeFlag, err) + } + + return nil +} + +func waitForSignal() error { + log.Infof("Waiting for signal") + waitingForSignal <- true + <-signalReceived + return nil +} + +func cleanupRuntime() error { + toolkitDir := filepath.Join(destinationArg, toolkitSubDir) + + log.Infof("Cleaning up Runtime") + + cmdline := fmt.Sprintf("%v cleanup %v %v\n", runtimeFlag, runtimeArgsFlag, toolkitDir) + + cmd := exec.Command("sh", "-c", cmdline) + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + err := cmd.Run() + if err != nil { + return fmt.Errorf("error running %v command: %v", runtimeFlag, err) + } + + return nil +} + +func shutdown() { + log.Infof("Shutting Down") + + err := os.Remove(pidFile) + if err != nil { + log.Warnf("Unable to remove pidfile: %v", err) + } +} diff --git a/tools/container/toolkit/executable.go b/tools/container/toolkit/executable.go new file mode 100644 index 00000000..0d59e375 --- /dev/null +++ b/tools/container/toolkit/executable.go @@ -0,0 +1,153 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package main + +import ( + "fmt" + "io" + "os" + "path/filepath" + "sort" + "strings" + + log "github.com/sirupsen/logrus" +) + +// executableTarget holds the file names used when installing an executable: the name +// of the copied source file (dotfileName) and the name of its wrapper (wrapperName). +type executableTarget struct { + dotfileName string + wrapperName string +} + +// executable describes an executable to install: its source path, its target file +// names, and the environment variables, preceding lines and argument lines that are +// written to its wrapper. +type executable struct { + source string + target executableTarget + env map[string]string + preLines []string + argLines []string +} + +// install installs an executable component of the NVIDIA container toolkit. The source executable +// is copied to a `.real` file and a wrapper is created to set up the environment as required. +// The path to the installed wrapper is returned. +func (e executable) install(destFolder string) (string, error) { + log.Infof("Installing executable '%v' to %v", e.source, destFolder) + + dotfileName := e.dotfileName() + + installedDotfileName, err := installFileToFolderWithName(destFolder, dotfileName, e.source) + if err != nil { + return "", fmt.Errorf("error installing file '%v' as '%v': %v", e.source, dotfileName, err) + } + log.Infof("Installed '%v'", installedDotfileName) + + wrapperFilename, err := e.installWrapper(destFolder, installedDotfileName) + if err != nil { + return "", fmt.Errorf("error wrapping '%v': %v", installedDotfileName, err) + } + log.Infof("Installed wrapper '%v'", wrapperFilename) + + return wrapperFilename, nil +} + +// dotfileName returns the file name under which the source executable is installed. +func (e executable) dotfileName() string { + return e.target.dotfileName +} + +// wrapperName returns the file name of the wrapper created for the executable. +func (e executable) wrapperName() string { + return e.target.wrapperName +} + +// installWrapper creates the wrapper file named e.wrapperName() in destFolder, writes +// the wrapper contents that call dotfileName, and makes the file executable. The path +// to the wrapper is returned. +func (e executable) installWrapper(destFolder string, dotfileName string) (string, error) { + wrapperPath := filepath.Join(destFolder, e.wrapperName()) + wrapper, err := os.Create(wrapperPath) + if err != nil { + return "", fmt.Errorf("error creating executable wrapper: %v", err) + } + defer wrapper.Close() + + err = e.writeWrapperTo(wrapper, destFolder, dotfileName) + if err != nil { + return "", fmt.Errorf("error writing wrapper contents: %v", err) + } + + err =
ensureExecutable(wrapperPath) + if err != nil { + return "", fmt.Errorf("error making wrapper executable: %v", err) + } + return wrapperPath, nil +} + +// writeWrapperTo writes the wrapper shell script to wrapper: the shebang, e.preLines, +// the environment variables sorted by name (with destFolder prepended to PATH), the +// call to dotfileName, e.argLines and finally "$@". Occurrences of destDirPattern in +// the preLines, environment values and argLines are replaced with destFolder. +func (e executable) writeWrapperTo(wrapper io.Writer, destFolder string, dotfileName string) error { + r := newReplacements(destDirPattern, destFolder) + + // Add the shebang + fmt.Fprintln(wrapper, "#! /bin/sh") + + // Add the preceding lines if any + for _, line := range e.preLines { + fmt.Fprintf(wrapper, "%s\n", r.apply(line)) + } + + // Update the path to include the destination folder. The variables are copied + // into a new map so that setting PATH does not modify e.env. + env := make(map[string]string, len(e.env)+1) + for name, value := range e.env { + env[name] = value + } + + path, specified := env["PATH"] + if !specified { + path = "$PATH" + } + env["PATH"] = strings.Join([]string{destFolder, path}, ":") + + sortedEnvvars := make([]string, 0, len(env)) + for name := range env { + sortedEnvvars = append(sortedEnvvars, name) + } + sort.Strings(sortedEnvvars) + + for _, name := range sortedEnvvars { + fmt.Fprintf(wrapper, "%s=%s \\\n", name, r.apply(env[name])) + } + // Add the call to the target executable + fmt.Fprintf(wrapper, "%s \\\n", dotfileName) + + // Insert additional lines in the `arg` list + for _, line := range e.argLines { + fmt.Fprintf(wrapper, "\t%s \\\n", r.apply(line)) + } + // Add the script arguments "$@" + fmt.Fprintln(wrapper, "\t\"$@\"") + + return nil +} + +// ensureExecutable is equivalent to running chmod +x on the specified file +func ensureExecutable(path string) error { + info, err := os.Stat(path) + if err != nil { + return fmt.Errorf("error getting file info for '%v': %v", path, err) + } + executableMode := info.Mode() | 0111 + err = os.Chmod(path, executableMode) + if err != nil { + return fmt.Errorf("error setting executable mode for '%v': %v", path, err) + } + return nil +} diff --git a/tools/container/toolkit/executable_test.go b/tools/container/toolkit/executable_test.go new file mode 100644 index 00000000..572ee2bb --- /dev/null +++ b/tools/container/toolkit/executable_test.go @@
-0,0 +1,152 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package main + +import ( + "bytes" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestWrapper(t *testing.T) { + const shebang = "#! /bin/sh" + const destFolder = "/dest/folder" + const dotfileName = "source.real" + + testCases := []struct { + e executable + expectedLines []string + }{ + { + e: executable{}, + expectedLines: []string{ + shebang, + "PATH=/dest/folder:$PATH \\", + "source.real \\", + "\t\"$@\"", + "", + }, + }, + { + e: executable{ + env: map[string]string{ + "PATH": "some-path", + }, + }, + expectedLines: []string{ + shebang, + "PATH=/dest/folder:some-path \\", + "source.real \\", + "\t\"$@\"", + "", + }, + }, + { + e: executable{ + preLines: []string{ + "preline1", + "preline2", + }, + }, + expectedLines: []string{ + shebang, + "preline1", + "preline2", + "PATH=/dest/folder:$PATH \\", + "source.real \\", + "\t\"$@\"", + "", + }, + }, + { + e: executable{ + argLines: []string{ + "argline1", + "argline2", + }, + }, + expectedLines: []string{ + shebang, + "PATH=/dest/folder:$PATH \\", + "source.real \\", + "\targline1 \\", + "\targline2 \\", + "\t\"$@\"", + "", + }, + }, + } + + for i, tc := range testCases { + buf := &bytes.Buffer{} + + err := tc.e.writeWrapperTo(buf, destFolder, dotfileName) + require.NoError(t, err) + + exepectedContents := 
strings.Join(tc.expectedLines, "\n") + require.Equal(t, exepectedContents, buf.String(), "%v: %v", i, tc) + } +} + +func TestInstallExecutable(t *testing.T) { + inputFolder, err := os.MkdirTemp("", "") + require.NoError(t, err) + defer os.RemoveAll(inputFolder) + + // Create the source file + source := filepath.Join(inputFolder, "input") + sourceFile, err := os.Create(source) + + base := filepath.Base(source) + + require.NoError(t, err) + require.NoError(t, sourceFile.Close()) + + e := executable{ + source: source, + target: executableTarget{ + dotfileName: "input.real", + wrapperName: "input", + }, + } + + destFolder, err := os.MkdirTemp("", "output-*") + require.NoError(t, err) + defer os.RemoveAll(destFolder) + + installed, err := e.install(destFolder) + + require.NoError(t, err) + require.Equal(t, filepath.Join(destFolder, base), installed) + + // Now check the post conditions: + sourceInfo, err := os.Stat(source) + require.NoError(t, err) + + destInfo, err := os.Stat(filepath.Join(destFolder, base+".real")) + require.NoError(t, err) + require.Equal(t, sourceInfo.Size(), destInfo.Size()) + require.Equal(t, sourceInfo.Mode(), destInfo.Mode()) + + wrapperInfo, err := os.Stat(installed) + require.NoError(t, err) + require.NotEqual(t, 0, wrapperInfo.Mode()&0111) +} diff --git a/tools/container/toolkit/replacements.go b/tools/container/toolkit/replacements.go new file mode 100644 index 00000000..4ff67a40 --- /dev/null +++ b/tools/container/toolkit/replacements.go @@ -0,0 +1,45 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
const (
	// destDirPattern is the placeholder that stands in for the destination
	// directory in strings that are later passed through replacements.apply.
	destDirPattern = "@destDir@"
)

// replacements maps a pattern to the text that should be substituted for it.
type replacements map[string]string

// newReplacements builds a replacements map from a flat list of
// (pattern, substitute) pairs. A trailing pattern without a substitute is
// ignored.
func newReplacements(rules ...string) replacements {
	table := make(replacements)
	// Walk the substitutes (odd positions); each one pairs with the entry
	// immediately before it.
	for j := 1; j < len(rules); j += 2 {
		table[rules[j-1]] = rules[j]
	}
	return table
}

// apply returns input with every occurrence of each pattern in r replaced by
// its substitute. Patterns are applied one after another in map iteration
// order.
func (r replacements) apply(input string) string {
	for pattern, substitute := range r {
		input = strings.ReplaceAll(input, pattern, substitute)
	}
	return input
}
+*/ + +package main + +import ( + "fmt" + "path/filepath" + "strings" + + log "github.com/sirupsen/logrus" +) + +const ( + nvidiaContainerRuntimeSource = "/usr/bin/nvidia-container-runtime" + nvidiaContainerRuntimeTarget = "nvidia-container-runtime.real" + nvidiaContainerRuntimeWrapper = "nvidia-container-runtime" + + nvidiaExperimentalContainerRuntimeSource = "nvidia-container-runtime.experimental" + nvidiaExperimentalContainerRuntimeTarget = nvidiaExperimentalContainerRuntimeSource + nvidiaExperimentalContainerRuntimeWrapper = "nvidia-container-runtime-experimental" +) + +// installContainerRuntimes sets up the NVIDIA container runtimes, copying the executables +// and implementing the required wrapper +func installContainerRuntimes(toolkitDir string, driverRoot string) error { + r := newNvidiaContainerRuntimeInstaller() + + _, err := r.install(toolkitDir) + if err != nil { + return fmt.Errorf("error installing NVIDIA container runtime: %v", err) + } + + // Install the experimental runtime and treat failures as non-fatal. 
+ err = installExperimentalRuntime(toolkitDir, driverRoot) + if err != nil { + log.Warnf("Could not install experimental runtime: %v", err) + } + + return nil +} + +// installExperimentalRuntime ensures that the experimental NVIDIA Container runtime is installed +func installExperimentalRuntime(toolkitDir string, driverRoot string) error { + libraryRoot, err := findLibraryRoot(driverRoot) + if err != nil { + log.Warnf("Error finding library path for root %v: %v", driverRoot, err) + } + log.Infof("Using library root %v", libraryRoot) + + e := newNvidiaContainerRuntimeExperimentalInstaller(libraryRoot) + _, err = e.install(toolkitDir) + if err != nil { + return fmt.Errorf("error installing experimental NVIDIA Container Runtime: %v", err) + } + + return nil +} + +func newNvidiaContainerRuntimeInstaller() *executable { + target := executableTarget{ + dotfileName: nvidiaContainerRuntimeTarget, + wrapperName: nvidiaContainerRuntimeWrapper, + } + return newRuntimeInstaller(nvidiaContainerRuntimeSource, target, nil) +} + +func newNvidiaContainerRuntimeExperimentalInstaller(libraryRoot string) *executable { + target := executableTarget{ + dotfileName: nvidiaExperimentalContainerRuntimeTarget, + wrapperName: nvidiaExperimentalContainerRuntimeWrapper, + } + + env := make(map[string]string) + if libraryRoot != "" { + env["LD_LIBRARY_PATH"] = strings.Join([]string{libraryRoot, "$LD_LIBRARY_PATH"}, ":") + } + return newRuntimeInstaller(nvidiaExperimentalContainerRuntimeSource, target, env) +} + +func newRuntimeInstaller(source string, target executableTarget, env map[string]string) *executable { + preLines := []string{ + "", + "cat /proc/modules | grep -e \"^nvidia \" >/dev/null 2>&1", + "if [ \"${?}\" != \"0\" ]; then", + " echo \"nvidia driver modules are not yet loaded, invoking runc directly\"", + " exec runc \"$@\"", + "fi", + "", + } + + runtimeEnv := make(map[string]string) + runtimeEnv["XDG_CONFIG_HOME"] = filepath.Join(destDirPattern, ".config") + for k, v := range env 
{ + runtimeEnv[k] = v + } + + r := executable{ + source: source, + target: target, + env: runtimeEnv, + preLines: preLines, + } + + return &r +} + +func findLibraryRoot(root string) (string, error) { + libnvidiamlPath, err := findManagementLibrary(root) + if err != nil { + return "", fmt.Errorf("error locating NVIDIA management library: %v", err) + } + + return filepath.Dir(libnvidiamlPath), nil +} + +func findManagementLibrary(root string) (string, error) { + return findLibrary(root, "libnvidia-ml.so") +} diff --git a/tools/container/toolkit/runtime_test.go b/tools/container/toolkit/runtime_test.go new file mode 100644 index 00000000..06b1b3c2 --- /dev/null +++ b/tools/container/toolkit/runtime_test.go @@ -0,0 +1,90 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package main + +import ( + "bytes" + "strings" + "testing" + + "github.com/stretchr/testify/require" +) + +func TestNvidiaContainerRuntimeInstallerWrapper(t *testing.T) { + r := newNvidiaContainerRuntimeInstaller() + + const shebang = "#! 
/bin/sh" + const destFolder = "/dest/folder" + const dotfileName = "source.real" + + buf := &bytes.Buffer{} + + err := r.writeWrapperTo(buf, destFolder, dotfileName) + require.NoError(t, err) + + expectedLines := []string{ + shebang, + "", + "cat /proc/modules | grep -e \"^nvidia \" >/dev/null 2>&1", + "if [ \"${?}\" != \"0\" ]; then", + " echo \"nvidia driver modules are not yet loaded, invoking runc directly\"", + " exec runc \"$@\"", + "fi", + "", + "PATH=/dest/folder:$PATH \\", + "XDG_CONFIG_HOME=/dest/folder/.config \\", + "source.real \\", + "\t\"$@\"", + "", + } + + exepectedContents := strings.Join(expectedLines, "\n") + require.Equal(t, exepectedContents, buf.String()) +} + +func TestExperimentalContainerRuntimeInstallerWrapper(t *testing.T) { + r := newNvidiaContainerRuntimeExperimentalInstaller("/some/root/usr/lib64") + + const shebang = "#! /bin/sh" + const destFolder = "/dest/folder" + const dotfileName = "source.real" + + buf := &bytes.Buffer{} + + err := r.writeWrapperTo(buf, destFolder, dotfileName) + require.NoError(t, err) + + expectedLines := []string{ + shebang, + "", + "cat /proc/modules | grep -e \"^nvidia \" >/dev/null 2>&1", + "if [ \"${?}\" != \"0\" ]; then", + " echo \"nvidia driver modules are not yet loaded, invoking runc directly\"", + " exec runc \"$@\"", + "fi", + "", + "LD_LIBRARY_PATH=/some/root/usr/lib64:$LD_LIBRARY_PATH \\", + "PATH=/dest/folder:$PATH \\", + "XDG_CONFIG_HOME=/dest/folder/.config \\", + "source.real \\", + "\t\"$@\"", + "", + } + + exepectedContents := strings.Join(expectedLines, "\n") + require.Equal(t, exepectedContents, buf.String()) +} diff --git a/tools/container/toolkit/toolkit.go b/tools/container/toolkit/toolkit.go new file mode 100644 index 00000000..6aa3fc3b --- /dev/null +++ b/tools/container/toolkit/toolkit.go @@ -0,0 +1,449 @@ +/** +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +*/ + +package main + +import ( + "fmt" + "io" + "os" + "path/filepath" + "strings" + + toml "github.com/pelletier/go-toml" + log "github.com/sirupsen/logrus" + "github.com/urfave/cli/v2" +) + +const ( + // DefaultNvidiaDriverRoot specifies the default NVIDIA driver run directory + DefaultNvidiaDriverRoot = "/run/nvidia/driver" + + nvidiaContainerCliSource = "/usr/bin/nvidia-container-cli" + nvidiaContainerRuntimeHookSource = "/usr/bin/nvidia-container-toolkit" + + nvidiaContainerToolkitConfigSource = "/etc/nvidia-container-runtime/config.toml" + configFilename = "config.toml" +) + +var toolkitDirArg string +var nvidiaDriverRootFlag string +var nvidiaContainerRuntimeDebugFlag string +var nvidiaContainerRuntimeLogLevelFlag string +var nvidiaContainerCLIDebugFlag string + +func main() { + // Create the top-level CLI + c := cli.NewApp() + c.Name = "toolkit" + c.Usage = "Manage the NVIDIA container toolkit" + c.Version = "0.1.0" + + // Create the 'install' subcommand + install := cli.Command{} + install.Name = "install" + install.Usage = "Install the components of the NVIDIA container toolkit" + install.ArgsUsage = "" + install.Before = parseArgs + install.Action = Install + + // Create the 'delete' command + delete := cli.Command{} + delete.Name = "delete" + delete.Usage = "Delete the NVIDIA container toolkit" + delete.ArgsUsage = "" + delete.Before = parseArgs + delete.Action = Delete + + // Register the subcommand with the 
top-level CLI + c.Commands = []*cli.Command{ + &install, + &delete, + } + + flags := []cli.Flag{ + &cli.StringFlag{ + Name: "nvidia-driver-root", + Value: DefaultNvidiaDriverRoot, + Destination: &nvidiaDriverRootFlag, + EnvVars: []string{"NVIDIA_DRIVER_ROOT"}, + }, + &cli.StringFlag{ + Name: "nvidia-container-runtime-debug", + Usage: "Specify the location of the debug log file for the NVIDIA Container Runtime", + Destination: &nvidiaContainerRuntimeDebugFlag, + EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_DEBUG"}, + }, + &cli.StringFlag{ + Name: "nvidia-container-runtime-debug-log-level", + Destination: &nvidiaContainerRuntimeLogLevelFlag, + EnvVars: []string{"NVIDIA_CONTAINER_RUNTIME_LOG_LEVEL"}, + }, + &cli.StringFlag{ + Name: "nvidia-container-cli-debug", + Usage: "Specify the location of the debug log file for the NVIDIA Container CLI", + Destination: &nvidiaContainerCLIDebugFlag, + EnvVars: []string{"NVIDIA_CONTAINER_CLI_DEBUG"}, + }, + } + + // Update the subcommand flags with the common subcommand flags + install.Flags = append([]cli.Flag{}, flags...) + + // Run the top-level CLI + if err := c.Run(os.Args); err != nil { + log.Fatal(fmt.Errorf("error: %v", err)) + } +} + +// parseArgs parses the command line arguments to the CLI +func parseArgs(c *cli.Context) error { + args := c.Args() + + log.Infof("Parsing arguments: %v", args.Slice()) + if c.NArg() != 1 { + return fmt.Errorf("incorrect number of arguments") + } + toolkitDirArg = args.Get(0) + log.Infof("Successfully parsed arguments") + + return nil +} + +// Delete removes the NVIDIA container toolkit +func Delete(cli *cli.Context) error { + log.Infof("Deleting NVIDIA container toolkit from '%v'", toolkitDirArg) + err := os.RemoveAll(toolkitDirArg) + if err != nil { + return fmt.Errorf("error deleting toolkit directory: %v", err) + } + return nil +} + +// Install installs the components of the NVIDIA container toolkit. +// Any existing installation is removed. 
+func Install(cli *cli.Context) error { + log.Infof("Installing NVIDIA container toolkit to '%v'", toolkitDirArg) + + log.Infof("Removing existing NVIDIA container toolkit installation") + err := os.RemoveAll(toolkitDirArg) + if err != nil { + return fmt.Errorf("error removing toolkit directory: %v", err) + } + + toolkitConfigDir := filepath.Join(toolkitDirArg, ".config", "nvidia-container-runtime") + toolkitConfigPath := filepath.Join(toolkitConfigDir, configFilename) + + err = createDirectories(toolkitDirArg, toolkitConfigDir) + if err != nil { + return fmt.Errorf("could not create required directories: %v", err) + } + + err = installContainerLibrary(toolkitDirArg) + if err != nil { + return fmt.Errorf("error installing NVIDIA container library: %v", err) + } + + err = installContainerRuntimes(toolkitDirArg, nvidiaDriverRootFlag) + if err != nil { + return fmt.Errorf("error installing NVIDIA container runtime: %v", err) + } + + nvidiaContainerCliExecutable, err := installContainerCLI(toolkitDirArg) + if err != nil { + return fmt.Errorf("error installing NVIDIA container CLI: %v", err) + } + + _, err = installRuntimeHook(toolkitDirArg, toolkitConfigPath) + if err != nil { + return fmt.Errorf("error installing NVIDIA container runtime hook: %v", err) + } + + err = installToolkitConfig(toolkitConfigPath, nvidiaDriverRootFlag, nvidiaContainerCliExecutable) + if err != nil { + return fmt.Errorf("error installing NVIDIA container toolkit config: %v", err) + } + + return nil +} + +// installContainerLibrary locates and installs the libnvidia-container.so.1 library. +// A predefined set of library candidates are considered, with the first one +// resulting in success being installed to the toolkit folder. The install process +// resolves the symlink for the library and copies the versioned library itself. 
+func installContainerLibrary(toolkitDir string) error { + log.Infof("Installing NVIDIA container library to '%v'", toolkitDir) + + const libName = "libnvidia-container.so.1" + libraryPath, err := findLibrary("", libName) + if err != nil { + return fmt.Errorf("error locating NVIDIA container library: %v", err) + } + + installedLibPath, err := installFileToFolder(toolkitDir, libraryPath) + if err != nil { + return fmt.Errorf("error installing %v to %v: %v", libraryPath, toolkitDir, err) + } + log.Infof("Installed '%v' to '%v'", libraryPath, installedLibPath) + + if filepath.Base(installedLibPath) == libName { + return nil + } + + err = installSymlink(toolkitDir, libName, installedLibPath) + if err != nil { + return fmt.Errorf("error installing symlink for NVIDIA container library: %v", err) + } + + return nil +} + +// installToolkitConfig installs the config file for the NVIDIA container toolkit ensuring +// that the settings are updated to match the desired install and nvidia driver directories. 
+func installToolkitConfig(toolkitConfigPath string, nvidiaDriverDir string, nvidiaContainerCliExecutablePath string) error { + log.Infof("Installing NVIDIA container toolkit config '%v'", toolkitConfigPath) + + config, err := toml.LoadFile(nvidiaContainerToolkitConfigSource) + if err != nil { + return fmt.Errorf("could not open source config file: %v", err) + } + + targetConfig, err := os.Create(toolkitConfigPath) + if err != nil { + return fmt.Errorf("could not create target config file: %v", err) + } + defer targetConfig.Close() + + nvidiaContainerCliKey := func(p string) []string { + return []string{"nvidia-container-cli", p} + } + + // Read the ldconfig path from the config as this may differ per platform + // On ubuntu-based systems this ends in `.real` + ldconfigPath := fmt.Sprintf("%s", config.GetPath(nvidiaContainerCliKey("ldconfig"))) + + // Use the driver run root as the root: + driverLdconfigPath := "@" + filepath.Join(nvidiaDriverDir, strings.TrimPrefix(ldconfigPath, "@/")) + + config.SetPath(nvidiaContainerCliKey("root"), nvidiaDriverDir) + config.SetPath(nvidiaContainerCliKey("path"), nvidiaContainerCliExecutablePath) + config.SetPath(nvidiaContainerCliKey("ldconfig"), driverLdconfigPath) + + // Set the debug options if selected + debugOptions := map[string]string{ + "nvidia-container-runtime.debug": nvidiaContainerRuntimeDebugFlag, + "nvidia-container-runtime.log-level": nvidiaContainerRuntimeLogLevelFlag, + "nvidia-container-cli.debug": nvidiaContainerCLIDebugFlag, + } + for key, value := range debugOptions { + if value == "" { + continue + } + if config.Get(key) != nil { + continue + } + config.Set(key, value) + } + + _, err = config.WriteTo(targetConfig) + if err != nil { + return fmt.Errorf("error writing config: %v", err) + } + return nil +} + +// installContainerCLI sets up the NVIDIA container CLI executable, copying the executable +// and implementing the required wrapper +func installContainerCLI(toolkitDir string) (string, error) { + 
log.Infof("Installing NVIDIA container CLI from '%v'", nvidiaContainerCliSource) + + env := map[string]string{ + "LD_LIBRARY_PATH": toolkitDir, + } + + e := executable{ + source: nvidiaContainerCliSource, + target: executableTarget{ + dotfileName: "nvidia-container-cli.real", + wrapperName: "nvidia-container-cli", + }, + env: env, + } + + installedPath, err := e.install(toolkitDir) + if err != nil { + return "", fmt.Errorf("error installing NVIDIA container CLI: %v", err) + } + return installedPath, nil +} + +// installRuntimeHook sets up the NVIDIA runtime hook, copying the executable +// and implementing the required wrapper +func installRuntimeHook(toolkitDir string, configFilePath string) (string, error) { + log.Infof("Installing NVIDIA container runtime hook from '%v'", nvidiaContainerRuntimeHookSource) + + argLines := []string{ + fmt.Sprintf("-config \"%s\"", configFilePath), + } + + e := executable{ + source: nvidiaContainerRuntimeHookSource, + target: executableTarget{ + dotfileName: "nvidia-container-toolkit.real", + wrapperName: "nvidia-container-toolkit", + }, + argLines: argLines, + } + + installedPath, err := e.install(toolkitDir) + if err != nil { + return "", fmt.Errorf("error installing NVIDIA container runtime hook: %v", err) + } + + err = installSymlink(toolkitDir, "nvidia-container-runtime-hook", installedPath) + if err != nil { + return "", fmt.Errorf("error installing symlink to NVIDIA container runtime hook: %v", err) + } + + return installedPath, nil +} + +// installSymlink creates a symlink in the toolkitDirectory that points to the specified target. 
+// Note: The target is assumed to be local to the toolkit directory +func installSymlink(toolkitDir string, link string, target string) error { + symlinkPath := filepath.Join(toolkitDir, link) + targetPath := filepath.Base(target) + log.Infof("Creating symlink '%v' -> '%v'", symlinkPath, targetPath) + + err := os.Symlink(targetPath, symlinkPath) + if err != nil { + return fmt.Errorf("error creating symlink '%v' => '%v': %v", symlinkPath, targetPath, err) + } + return nil +} + +// installFileToFolder copies a source file to a destination folder. +// The path of the input file is ignored. +// e.g. installFileToFolder("/some/path/file.txt", "/output/path") +// will result in a file "/output/path/file.txt" being generated +func installFileToFolder(destFolder string, src string) (string, error) { + name := filepath.Base(src) + return installFileToFolderWithName(destFolder, name, src) +} + +// cp src destFolder/name +func installFileToFolderWithName(destFolder string, name, src string) (string, error) { + dest := filepath.Join(destFolder, name) + err := installFile(dest, src) + if err != nil { + return "", fmt.Errorf("error copying '%v' to '%v': %v", src, dest, err) + } + return dest, nil +} + +// installFile copies a file from src to dest and maintains +// file modes +func installFile(dest string, src string) error { + log.Infof("Installing '%v' to '%v'", src, dest) + + source, err := os.Open(src) + if err != nil { + return fmt.Errorf("error opening source: %v", err) + } + defer source.Close() + + destination, err := os.Create(dest) + if err != nil { + return fmt.Errorf("error creating destination: %v", err) + } + defer destination.Close() + + _, err = io.Copy(destination, source) + if err != nil { + return fmt.Errorf("error copying file: %v", err) + } + + err = applyModeFromSource(dest, src) + if err != nil { + return fmt.Errorf("error setting destination file mode: %v", err) + } + return nil +} + +// applyModeFromSource sets the file mode for a destination file +// 
to match that of a specified source file +func applyModeFromSource(dest string, src string) error { + sourceInfo, err := os.Stat(src) + if err != nil { + return fmt.Errorf("error getting file info for '%v': %v", src, err) + } + err = os.Chmod(dest, sourceInfo.Mode()) + if err != nil { + return fmt.Errorf("error setting mode for '%v': %v", dest, err) + } + return nil +} + +// findLibrary searches a set of candidate libraries in the specified root for +// a given library name +func findLibrary(root string, libName string) (string, error) { + log.Infof("Finding library %v (root=%v)", libName, root) + + candidateDirs := []string{ + "/usr/lib64", + "/usr/lib/x86_64-linux-gnu", + } + + for _, d := range candidateDirs { + l := filepath.Join(root, d, libName) + log.Infof("Checking library candidate '%v'", l) + + libraryCandidate, err := resolveLink(l) + if err != nil { + log.Infof("Skipping library candidate '%v': %v", l, err) + continue + } + + return libraryCandidate, nil + } + + return "", fmt.Errorf("error locating library '%v'", libName) +} + +// resolveLink finds the target of a symlink or the file itself in the +// case of a regular file. +// This is equivalent to running `readlink -f ${l}` +func resolveLink(l string) (string, error) { + resolved, err := filepath.EvalSymlinks(l) + if err != nil { + return "", fmt.Errorf("error resolving link '%v': %v", l, err) + } + if l != resolved { + log.Infof("Resolved link: '%v' => '%v'", l, resolved) + } + return resolved, nil +} + +func createDirectories(dir ...string) error { + for _, d := range dir { + log.Infof("Creating directory '%v'", d) + err := os.MkdirAll(d, 0755) + if err != nil { + return fmt.Errorf("error creating directory: %v", err) + } + } + return nil +}