From 1afada7de5bce511830ff324f2a535cc0f6a157a Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 11 Sep 2024 11:40:34 +0200 Subject: [PATCH 1/3] [no-relnote] Remove unused code Signed-off-by: Evan Lezar --- cmd/nvidia-container-runtime-hook/hook_config.go | 9 --------- internal/config/hook.go | 10 ---------- internal/config/runtime.go | 10 ---------- 3 files changed, 29 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/hook_config.go b/cmd/nvidia-container-runtime-hook/hook_config.go index afdb8bbc..4936af67 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config.go +++ b/cmd/nvidia-container-runtime-hook/hook_config.go @@ -20,15 +20,6 @@ const ( // HookConfig : options for the nvidia-container-runtime-hook. type HookConfig config.Config -func getDefaultHookConfig() (HookConfig, error) { - defaultCfg, err := config.GetDefault() - if err != nil { - return HookConfig{}, err - } - - return *(*HookConfig)(defaultCfg), nil -} - // loadConfig loads the required paths for the hook config. func loadConfig() (*config.Config, error) { var configPaths []string diff --git a/internal/config/hook.go b/internal/config/hook.go index 3292f138..4d8b448f 100644 --- a/internal/config/hook.go +++ b/internal/config/hook.go @@ -24,13 +24,3 @@ type RuntimeHookConfig struct { // SkipModeDetection disables the mode check for the runtime hook. SkipModeDetection bool `toml:"skip-mode-detection"` } - -// GetDefaultRuntimeHookConfig defines the default values for the config -func GetDefaultRuntimeHookConfig() (*RuntimeHookConfig, error) { - cfg, err := GetDefault() - if err != nil { - return nil, err - } - - return &cfg.NVIDIAContainerRuntimeHookConfig, nil -} diff --git a/internal/config/runtime.go b/internal/config/runtime.go index ed9ea646..2ba1b7a8 100644 --- a/internal/config/runtime.go +++ b/internal/config/runtime.go @@ -45,13 +45,3 @@ type cdiModeConfig struct { type csvModeConfig struct { MountSpecPath string `toml:"mount-spec-path"` } - -// GetDefaultRuntimeConfig defines the default values for the config -func GetDefaultRuntimeConfig() (*RuntimeConfig, error) { - cfg, err := GetDefault() - if err != nil { - return nil, err - } - - return &cfg.NVIDIAContainerRuntimeConfig, nil -} From c0764366d99c1296878c8559c10d3d58fde4b873 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 11 Sep 2024 11:56:55 +0200 Subject: [PATCH 2/3] [no-relnote] Refactor config handling for hook This change removes indirect calls to get the default config from the nvidia-container-runtime-hook. Signed-off-by: Evan Lezar --- .../container_config.go | 20 ++--- .../container_config_test.go | 78 +++++++++++-------- .../hook_config.go | 15 ++-- .../hook_config_test.go | 13 ++-- cmd/nvidia-container-runtime-hook/main.go | 2 +- 5 files changed, 74 insertions(+), 54 deletions(-) diff --git a/cmd/nvidia-container-runtime-hook/container_config.go b/cmd/nvidia-container-runtime-hook/container_config.go index ac34fb93..3ae8c98f 100644 --- a/cmd/nvidia-container-runtime-hook/container_config.go +++ b/cmd/nvidia-container-runtime-hook/container_config.go @@ -157,7 +157,7 @@ func getDevicesFromEnvvar(containerImage image.CUDA, swarmResourceEnvvars []stri return containerImage.VisibleDevicesFromEnvVar() } -func getDevices(hookConfig *HookConfig, image image.CUDA, privileged bool) []string { +func (hookConfig *hookConfig) getDevices(image image.CUDA, privileged bool) []string { // If enabled, try and get the device list from volume mounts first if hookConfig.AcceptDeviceListAsVolumeMounts { devices := image.VisibleDevicesFromMounts() @@ -197,7 +197,7 @@ func getMigDevices(image image.CUDA, envvar string) *string { return &devices } -func getImexChannels(hookConfig *HookConfig, image image.CUDA, privileged bool) []string { +func (hookConfig *hookConfig) getImexChannels(image image.CUDA, privileged bool) []string { // If enabled, try and get the device list from volume mounts first if hookConfig.AcceptDeviceListAsVolumeMounts { devices := image.ImexChannelsFromMounts() @@ -217,10 +217,10 @@ func getImexChannels(hookConfig *HookConfig, image image.CUDA, privileged bool) return nil } -func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage bool) image.DriverCapabilities { +func (hookConfig *hookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage bool) image.DriverCapabilities { // We use the default driver capabilities by default. This is filtered to only include the // supported capabilities - supportedDriverCapabilities := image.NewDriverCapabilities(c.SupportedDriverCapabilities) + supportedDriverCapabilities := image.NewDriverCapabilities(hookConfig.SupportedDriverCapabilities) capabilities := supportedDriverCapabilities.Intersection(image.DefaultDriverCapabilities) @@ -244,10 +244,10 @@ func (c *HookConfig) getDriverCapabilities(cudaImage image.CUDA, legacyImage boo return capabilities } -func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, privileged bool) *nvidiaConfig { +func (hookConfig *hookConfig) getNvidiaConfig(image image.CUDA, privileged bool) *nvidiaConfig { legacyImage := image.IsLegacy() - devices := getDevices(hookConfig, image, privileged) + devices := hookConfig.getDevices(image, privileged) if len(devices) == 0 { // empty devices means this is not a GPU container. return nil @@ -269,7 +269,7 @@ func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, privileged bool) log.Panicln("cannot set MIG_MONITOR_DEVICES in non privileged container") } - imexChannels := getImexChannels(hookConfig, image, privileged) + imexChannels := hookConfig.getImexChannels(image, privileged) driverCapabilities := hookConfig.getDriverCapabilities(image, legacyImage).String() @@ -288,7 +288,7 @@ func getNvidiaConfig(hookConfig *HookConfig, image image.CUDA, privileged bool) } } -func getContainerConfig(hook HookConfig) (config containerConfig) { +func (hookConfig *hookConfig) getContainerConfig() (config containerConfig) { var h HookState d := json.NewDecoder(os.Stdin) if err := d.Decode(&h); err != nil { @@ -305,7 +305,7 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { image, err := image.New( image.WithEnv(s.Process.Env), image.WithMounts(s.Mounts), - image.WithDisableRequire(hook.DisableRequire), + image.WithDisableRequire(hookConfig.DisableRequire), ) if err != nil { log.Panicln(err) @@ -316,6 +316,6 @@ func getContainerConfig(hook HookConfig) (config containerConfig) { Pid: h.Pid, Rootfs: s.Root.Path, Image: image, - Nvidia: getNvidiaConfig(&hook, image, privileged), + Nvidia: hookConfig.getNvidiaConfig(image, privileged), } } diff --git a/cmd/nvidia-container-runtime-hook/container_config_test.go b/cmd/nvidia-container-runtime-hook/container_config_test.go index cad9ca6e..1f3858b1 100644 --- a/cmd/nvidia-container-runtime-hook/container_config_test.go +++ b/cmd/nvidia-container-runtime-hook/container_config_test.go @@ -7,6 +7,7 @@ import ( "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/require" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" ) @@ -15,7 +16,7 @@ func TestGetNvidiaConfig(t *testing.T) { description string env map[string]string privileged bool - hookConfig *HookConfig + hookConfig *hookConfig expectedConfig *nvidiaConfig expectedPanic bool }{ @@ -394,8 +395,10 @@ func TestGetNvidiaConfig(t *testing.T) { image.EnvVarNvidiaDriverCapabilities: "all", }, privileged: true, - hookConfig: &HookConfig{ - SupportedDriverCapabilities: "video,display", + hookConfig: &hookConfig{ + Config: &config.Config{ + SupportedDriverCapabilities: "video,display", + }, }, expectedConfig: &nvidiaConfig{ Devices: []string{"all"}, @@ -409,8 +412,10 @@ func TestGetNvidiaConfig(t *testing.T) { image.EnvVarNvidiaDriverCapabilities: "video,display", }, privileged: true, - hookConfig: &HookConfig{ - SupportedDriverCapabilities: "video,display,compute,utility", + hookConfig: &hookConfig{ + Config: &config.Config{ + SupportedDriverCapabilities: "video,display,compute,utility", + }, }, expectedConfig: &nvidiaConfig{ Devices: []string{"all"}, @@ -423,8 +428,10 @@ func TestGetNvidiaConfig(t *testing.T) { image.EnvVarNvidiaVisibleDevices: "all", }, privileged: true, - hookConfig: &HookConfig{ - SupportedDriverCapabilities: "video,display,utility,compute", + hookConfig: &hookConfig{ + Config: &config.Config{ + SupportedDriverCapabilities: "video,display,utility,compute", + }, }, expectedConfig: &nvidiaConfig{ Devices: []string{"all"}, @@ -438,9 +445,11 @@ func TestGetNvidiaConfig(t *testing.T) { "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", }, privileged: true, - hookConfig: &HookConfig{ - SwarmResource: "DOCKER_SWARM_RESOURCE", - SupportedDriverCapabilities: "video,display,utility,compute", + hookConfig: &hookConfig{ + Config: &config.Config{ + SwarmResource: "DOCKER_SWARM_RESOURCE", + SupportedDriverCapabilities: "video,display,utility,compute", + }, }, expectedConfig: &nvidiaConfig{ Devices: []string{"GPU1", "GPU2"}, @@ -454,9 +463,11 @@ func TestGetNvidiaConfig(t *testing.T) { "DOCKER_SWARM_RESOURCE": "GPU1,GPU2", }, privileged: true, - hookConfig: &HookConfig{ - SwarmResource: "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE", - SupportedDriverCapabilities: "video,display,utility,compute", + hookConfig: &hookConfig{ + Config: &config.Config{ + SwarmResource: "NOT_DOCKER_SWARM_RESOURCE,DOCKER_SWARM_RESOURCE", + SupportedDriverCapabilities: "video,display,utility,compute", + }, }, expectedConfig: &nvidiaConfig{ Devices: []string{"GPU1", "GPU2"}, @@ -470,14 +481,14 @@ func TestGetNvidiaConfig(t *testing.T) { image.WithEnvMap(tc.env), ) // Wrap the call to getNvidiaConfig() in a closure. - var config *nvidiaConfig + var cfg *nvidiaConfig getConfig := func() { - hookConfig := tc.hookConfig - if hookConfig == nil { - defaultConfig, _ := getDefaultHookConfig() - hookConfig = &defaultConfig + hookCfg := tc.hookConfig + if hookCfg == nil { + defaultConfig, _ := config.GetDefault() + hookCfg = &hookConfig{defaultConfig} } - config = getNvidiaConfig(hookConfig, image, tc.privileged) + cfg = hookCfg.getNvidiaConfig(image, tc.privileged) } // For any tests that are expected to panic, make sure they do. @@ -491,18 +502,18 @@ func TestGetNvidiaConfig(t *testing.T) { // And start comparing the test results to the expected results. if tc.expectedConfig == nil { - require.Nil(t, config, tc.description) + require.Nil(t, cfg, tc.description) return } - require.NotNil(t, config, tc.description) + require.NotNil(t, cfg, tc.description) - require.Equal(t, tc.expectedConfig.Devices, config.Devices) - require.Equal(t, tc.expectedConfig.MigConfigDevices, config.MigConfigDevices) - require.Equal(t, tc.expectedConfig.MigMonitorDevices, config.MigMonitorDevices) - require.Equal(t, tc.expectedConfig.DriverCapabilities, config.DriverCapabilities) + require.Equal(t, tc.expectedConfig.Devices, cfg.Devices) + require.Equal(t, tc.expectedConfig.MigConfigDevices, cfg.MigConfigDevices) + require.Equal(t, tc.expectedConfig.MigMonitorDevices, cfg.MigMonitorDevices) + require.Equal(t, tc.expectedConfig.DriverCapabilities, cfg.DriverCapabilities) - require.ElementsMatch(t, tc.expectedConfig.Requirements, config.Requirements) + require.ElementsMatch(t, tc.expectedConfig.Requirements, cfg.Requirements) }) } } @@ -612,10 +623,11 @@ func TestDeviceListSourcePriority(t *testing.T) { ), image.WithMounts(tc.mountDevices), ) - hookConfig, _ := getDefaultHookConfig() - hookConfig.AcceptEnvvarUnprivileged = tc.acceptUnprivileged - hookConfig.AcceptDeviceListAsVolumeMounts = tc.acceptMounts - devices = getDevices(&hookConfig, image, tc.privileged) + defaultConfig, _ := config.GetDefault() + cfg := &hookConfig{defaultConfig} + cfg.AcceptEnvvarUnprivileged = tc.acceptUnprivileged + cfg.AcceptDeviceListAsVolumeMounts = tc.acceptMounts + devices = cfg.getDevices(image, tc.privileged) } // For all other tests, just grab the devices and check the results @@ -940,8 +952,10 @@ func TestGetDriverCapabilities(t *testing.T) { t.Run(tc.description, func(t *testing.T) { var capabilities string - c := HookConfig{ - SupportedDriverCapabilities: tc.supportedCapabilities, + c := hookConfig{ + Config: &config.Config{ + SupportedDriverCapabilities: tc.supportedCapabilities, + }, } image, _ := image.New( diff --git a/cmd/nvidia-container-runtime-hook/hook_config.go b/cmd/nvidia-container-runtime-hook/hook_config.go index 4936af67..f88815e5 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config.go +++ b/cmd/nvidia-container-runtime-hook/hook_config.go @@ -17,8 +17,11 @@ const ( driverPath = "/run/nvidia/driver" ) -// HookConfig : options for the nvidia-container-runtime-hook. -type HookConfig config.Config +// hookConfig wraps the toolkit config. +// This allows for functions to be defined on the local type. +type hookConfig struct { + *config.Config +} // loadConfig loads the required paths for the hook config. func loadConfig() (*config.Config, error) { @@ -47,12 +50,12 @@ func loadConfig() (*config.Config, error) { return config.GetDefault() } -func getHookConfig() (*HookConfig, error) { +func getHookConfig() (*hookConfig, error) { cfg, err := loadConfig() if err != nil { return nil, fmt.Errorf("failed to load config: %v", err) } - config := (*HookConfig)(cfg) + config := &hookConfig{cfg} allSupportedDriverCapabilities := image.SupportedDriverCapabilities if config.SupportedDriverCapabilities == "all" { @@ -70,7 +73,7 @@ func getHookConfig() (*HookConfig, error) { // getConfigOption returns the toml config option associated with the // specified struct field. -func (c HookConfig) getConfigOption(fieldName string) string { +func (c hookConfig) getConfigOption(fieldName string) string { t := reflect.TypeOf(c) f, ok := t.FieldByName(fieldName) if !ok { @@ -84,7 +87,7 @@ func (c HookConfig) getConfigOption(fieldName string) string { } // getSwarmResourceEnvvars returns the swarm resource envvars for the config. -func (c *HookConfig) getSwarmResourceEnvvars() []string { +func (c *hookConfig) getSwarmResourceEnvvars() []string { if c.SwarmResource == "" { return nil } diff --git a/cmd/nvidia-container-runtime-hook/hook_config_test.go b/cmd/nvidia-container-runtime-hook/hook_config_test.go index 7c50ec12..ade45d27 100644 --- a/cmd/nvidia-container-runtime-hook/hook_config_test.go +++ b/cmd/nvidia-container-runtime-hook/hook_config_test.go @@ -23,6 +23,7 @@ import ( "github.com/stretchr/testify/require" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/config/image" ) @@ -89,10 +90,10 @@ func TestGetHookConfig(t *testing.T) { } } - var config HookConfig + var cfg hookConfig getHookConfig := func() { c, _ := getHookConfig() - config = *c + cfg = *c } if tc.expectedPanic { @@ -102,7 +103,7 @@ func TestGetHookConfig(t *testing.T) { getHookConfig() - require.EqualValues(t, tc.expectedDriverCapabilities, config.SupportedDriverCapabilities) + require.EqualValues(t, tc.expectedDriverCapabilities, cfg.SupportedDriverCapabilities) }) } } @@ -144,8 +145,10 @@ func TestGetSwarmResourceEnvvars(t *testing.T) { for i, tc := range testCases { t.Run(fmt.Sprintf("%d", i), func(t *testing.T) { - c := &HookConfig{ - SwarmResource: tc.value, + c := &hookConfig{ + Config: &config.Config{ + SwarmResource: tc.value, + }, } envvars := c.getSwarmResourceEnvvars() diff --git a/cmd/nvidia-container-runtime-hook/main.go b/cmd/nvidia-container-runtime-hook/main.go index a0cfe3ec..cf2322ef 100644 --- a/cmd/nvidia-container-runtime-hook/main.go +++ b/cmd/nvidia-container-runtime-hook/main.go @@ -75,7 +75,7 @@ func doPrestart() { } cli := hook.NVIDIAContainerCLIConfig - container := getContainerConfig(*hook) + container := hook.getContainerConfig() nvidia := container.Nvidia if nvidia == nil { // Not a GPU container, nothing to do. From 00f1d5a62755ce3ddcf992521f3ad28ea0363b07 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Wed, 11 Sep 2024 16:47:16 +0200 Subject: [PATCH 3/3] Only allow host-relative LDConfig paths This change only allows host-relative LDConfig paths. An allow-ldconfig-from-container feature flag is added to allow for this the default behaviour to be changed. Signed-off-by: Evan Lezar --- internal/config/cli.go | 60 +++++++-- internal/config/cli_test.go | 6 +- internal/config/config.go | 19 ++- internal/config/config_test.go | 163 +++++++++++++++++++----- internal/config/features.go | 7 +- internal/config/toml.go | 13 ++ internal/config/toml_test.go | 31 ++++- tools/container/toolkit/toolkit_test.go | 2 +- 8 files changed, 247 insertions(+), 54 deletions(-) diff --git a/internal/config/cli.go b/internal/config/cli.go index 4a1e199d..3621df25 100644 --- a/internal/config/cli.go +++ b/internal/config/cli.go @@ -17,6 +17,7 @@ package config import ( + "fmt" "os" "strings" ) @@ -34,29 +35,62 @@ type ContainerCLIConfig struct { NoPivot bool `toml:"no-pivot,omitempty"` NoCgroups bool `toml:"no-cgroups"` User string `toml:"user"` - Ldconfig string `toml:"ldconfig"` + // Ldconfig represents the path to the ldconfig binary to be used to update + // the ldcache in a container as it is being created. + // If this path starts with a '@' the path is relative to the host and if + // not it is treated as a container path. + // + // Note that the use of container paths are disabled by default and if this + // is required, the features.allow-ldconfig-from-container feature gate must + // be enabled explicitly. + Ldconfig ldconfigPath `toml:"ldconfig"` } // NormalizeLDConfigPath returns the resolved path of the configured LDConfig binary. // This is only done for host LDConfigs and is required to handle systems where // /sbin/ldconfig is a wrapper around /sbin/ldconfig.real. func (c *ContainerCLIConfig) NormalizeLDConfigPath() string { - return NormalizeLDConfigPath(c.Ldconfig) + return string(c.Ldconfig.normalize()) +} + +// An ldconfigPath is used to represent the path to ldconfig. +type ldconfigPath string + +func (p ldconfigPath) assertValid(allowContainerRelativePath bool) error { + if p.isHostRelative() { + return nil + } + if allowContainerRelativePath { + return nil + } + return fmt.Errorf("nvidia-container-cli.ldconfig value %q is not host-relative (does not start with a '@')", p) +} + +func (p ldconfigPath) isHostRelative() bool { + return strings.HasPrefix(string(p), "@") +} + +// normalize returns the resolved path of the configured LDConfig binary. +// This is only done for host LDConfigs and is required to handle systems where +// /sbin/ldconfig is a wrapper around /sbin/ldconfig.real. +func (p ldconfigPath) normalize() ldconfigPath { + if !p.isHostRelative() { + return p + } + + path := string(p) + trimmedPath := strings.TrimSuffix(strings.TrimPrefix(path, "@"), ".real") + // If the .real path exists, we return that. + if _, err := os.Stat(trimmedPath + ".real"); err == nil { + return ldconfigPath("@" + trimmedPath + ".real") + } + // If the .real path does not exists (or cannot be read) we return the non-.real path. + return ldconfigPath("@" + trimmedPath) } // NormalizeLDConfigPath returns the resolved path of the configured LDConfig binary. // This is only done for host LDConfigs and is required to handle systems where // /sbin/ldconfig is a wrapper around /sbin/ldconfig.real. func NormalizeLDConfigPath(path string) string { - if !strings.HasPrefix(path, "@") { - return path - } - - trimmedPath := strings.TrimSuffix(strings.TrimPrefix(path, "@"), ".real") - // If the .real path exists, we return that. - if _, err := os.Stat(trimmedPath + ".real"); err == nil { - return "@" + trimmedPath + ".real" - } - // If the .real path does not exists (or cannot be read) we return the non-.real path. - return "@" + trimmedPath + return string(ldconfigPath(path).normalize()) } diff --git a/internal/config/cli_test.go b/internal/config/cli_test.go index 100bea95..0d7bf27b 100644 --- a/internal/config/cli_test.go +++ b/internal/config/cli_test.go @@ -33,7 +33,7 @@ func TestNormalizeLDConfigPath(t *testing.T) { testCases := []struct { description string - ldconfig string + ldconfig ldconfigPath expected string }{ { @@ -51,12 +51,12 @@ func TestNormalizeLDConfigPath(t *testing.T) { }, { description: "host .real file exists is returned", - ldconfig: "@" + filepath.Join(testDir, "exists.real"), + ldconfig: ldconfigPath("@" + filepath.Join(testDir, "exists.real")), expected: "@" + filepath.Join(testDir, "exists.real"), }, { description: "host resolves .real file", - ldconfig: "@" + filepath.Join(testDir, "exists"), + ldconfig: ldconfigPath("@" + filepath.Join(testDir, "exists")), expected: "@" + filepath.Join(testDir, "exists.real"), }, { diff --git a/internal/config/config.go b/internal/config/config.go index 33b8ba4d..652cc83a 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -18,6 +18,7 @@ package config import ( "bufio" + "errors" "os" "path/filepath" "strings" @@ -51,6 +52,8 @@ var ( NVIDIAContainerToolkitExecutable = "nvidia-container-toolkit" ) +var errInvalidConfig = errors.New("invalid config value") + // Config represents the contents of the config.toml file for the NVIDIA Container Toolkit // Note: This is currently duplicated by the HookConfig in cmd/nvidia-container-toolkit/hook_config.go type Config struct { @@ -127,8 +130,20 @@ func GetDefault() (*Config, error) { return &d, nil } -func getLdConfigPath() string { - return NormalizeLDConfigPath("@/sbin/ldconfig") +// assertValid checks for a valid config. +func (c *Config) assertValid() error { + err := c.NVIDIAContainerCLIConfig.Ldconfig.assertValid(c.Features.AllowLDConfigFromContainer.IsEnabled()) + if err != nil { + return errors.Join(err, errInvalidConfig) + } + return nil +} + +// getLdConfigPath allows us to override this function for testing. +var getLdConfigPath = getLdConfigPathStub + +func getLdConfigPathStub() ldconfigPath { + return ldconfigPath("@/sbin/ldconfig").normalize() } func getUserGroup() string { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 0873ebd2..963058e1 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -44,23 +44,21 @@ func TestGetConfigWithCustomConfig(t *testing.T) { func TestGetConfig(t *testing.T) { testCases := []struct { - description string - contents []string - expectedError error - inspectLdconfig bool - distIdsLike []string - expectedConfig *Config + description string + contents []string + expectedError error + distIdsLike []string + expectedConfig *Config }{ { - description: "empty config is default", - inspectLdconfig: true, + description: "empty config is default", expectedConfig: &Config{ AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ Root: "", LoadKmods: true, - Ldconfig: "WAS_CHECKED", + Ldconfig: "@/test/ld/config/path", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ DebugFilePath: "/dev/null", @@ -93,7 +91,7 @@ func TestGetConfig(t *testing.T) { "supported-driver-capabilities = \"compute,utility\"", "nvidia-container-cli.root = \"/bar/baz\"", "nvidia-container-cli.load-kmods = false", - "nvidia-container-cli.ldconfig = \"/foo/bar/ldconfig\"", + "nvidia-container-cli.ldconfig = \"@/foo/bar/ldconfig\"", "nvidia-container-cli.user = \"foo:bar\"", "nvidia-container-runtime.debug = \"/foo/bar\"", "nvidia-container-runtime.discover-mode = \"not-legacy\"", @@ -113,7 +111,7 @@ func TestGetConfig(t *testing.T) { NVIDIAContainerCLIConfig: ContainerCLIConfig{ Root: "/bar/baz", LoadKmods: false, - Ldconfig: "/foo/bar/ldconfig", + Ldconfig: "@/foo/bar/ldconfig", User: "foo:bar", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ @@ -146,6 +144,53 @@ func TestGetConfig(t *testing.T) { }, }, }, + { + description: "feature allows ldconfig to be overridden", + contents: []string{ + "[nvidia-container-cli]", + "ldconfig = \"/foo/bar/ldconfig\"", + "[features]", + "allow-ldconfig-from-container = true", + }, + expectedConfig: &Config{ + AcceptEnvvarUnprivileged: true, + SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", + NVIDIAContainerCLIConfig: ContainerCLIConfig{ + Ldconfig: "/foo/bar/ldconfig", + LoadKmods: true, + }, + NVIDIAContainerRuntimeConfig: RuntimeConfig{ + DebugFilePath: "/dev/null", + LogLevel: "info", + Runtimes: []string{"docker-runc", "runc", "crun"}, + Mode: "auto", + Modes: modesConfig{ + CSV: csvModeConfig{ + MountSpecPath: "/etc/nvidia-container-runtime/host-files-for-container.d", + }, + CDI: cdiModeConfig{ + DefaultKind: "nvidia.com/gpu", + AnnotationPrefixes: []string{ + "cdi.k8s.io/", + }, + SpecDirs: []string{ + "/etc/cdi", + "/var/run/cdi", + }, + }, + }, + }, + NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{ + Path: "nvidia-container-runtime-hook", + }, + NVIDIACTKConfig: CTKConfig{ + Path: "nvidia-ctk", + }, + Features: features{ + AllowLDConfigFromContainer: ptr(feature(true)), + }, + }, + }, { description: "config options set in section", contents: []string{ @@ -154,7 +199,7 @@ func TestGetConfig(t *testing.T) { "[nvidia-container-cli]", "root = \"/bar/baz\"", "load-kmods = false", - "ldconfig = \"/foo/bar/ldconfig\"", + "ldconfig = \"@/foo/bar/ldconfig\"", "user = \"foo:bar\"", "[nvidia-container-runtime]", "debug = \"/foo/bar\"", @@ -179,7 +224,7 @@ func TestGetConfig(t *testing.T) { NVIDIAContainerCLIConfig: ContainerCLIConfig{ Root: "/bar/baz", LoadKmods: false, - Ldconfig: "/foo/bar/ldconfig", + Ldconfig: "@/foo/bar/ldconfig", User: "foo:bar", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ @@ -213,16 +258,15 @@ func TestGetConfig(t *testing.T) { }, }, { - description: "suse config", - distIdsLike: []string{"suse", "opensuse"}, - inspectLdconfig: true, + description: "suse config", + distIdsLike: []string{"suse", "opensuse"}, expectedConfig: &Config{ AcceptEnvvarUnprivileged: true, SupportedDriverCapabilities: "compat32,compute,display,graphics,ngx,utility,video", NVIDIAContainerCLIConfig: ContainerCLIConfig{ Root: "", LoadKmods: true, - Ldconfig: "WAS_CHECKED", + Ldconfig: "@/test/ld/config/path", User: "root:video", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ @@ -250,9 +294,8 @@ func TestGetConfig(t *testing.T) { }, }, { - description: "suse config overrides user", - distIdsLike: []string{"suse", "opensuse"}, - inspectLdconfig: true, + description: "suse config overrides user", + distIdsLike: []string{"suse", "opensuse"}, contents: []string{ "nvidia-container-cli.user = \"foo:bar\"", }, @@ -262,7 +305,7 @@ func TestGetConfig(t *testing.T) { NVIDIAContainerCLIConfig: ContainerCLIConfig{ Root: "", LoadKmods: true, - Ldconfig: "WAS_CHECKED", + Ldconfig: "@/test/ld/config/path", User: "foo:bar", }, NVIDIAContainerRuntimeConfig: RuntimeConfig{ @@ -293,6 +336,7 @@ func TestGetConfig(t *testing.T) { for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { + defer setGetLdConfigPathForTest()() defer setGetDistIDLikeForTest(tc.distIdsLike)() reader := strings.NewReader(strings.Join(tc.contents, "\n")) @@ -305,21 +349,63 @@ func TestGetConfig(t *testing.T) { cfg, err := tomlCfg.Config() require.NoError(t, err) - // We first handle the ldconfig path since this is currently system-dependent. - if tc.inspectLdconfig { - ldconfig := cfg.NVIDIAContainerCLIConfig.Ldconfig - require.True(t, strings.HasPrefix(ldconfig, "@/sbin/ldconfig")) - remaining := strings.TrimPrefix(ldconfig, "@/sbin/ldconfig") - require.True(t, remaining == ".real" || remaining == "") - - cfg.NVIDIAContainerCLIConfig.Ldconfig = "WAS_CHECKED" - } - require.EqualValues(t, tc.expectedConfig, cfg) }) } } +func TestAssertValid(t *testing.T) { + defer setGetLdConfigPathForTest()() + + testCases := []struct { + description string + config *Config + expectedError error + }{ + { + description: "default is valid", + config: func() *Config { + config, _ := GetDefault() + return config + }(), + }, + { + description: "alternative host ldconfig path is valid", + config: &Config{ + NVIDIAContainerCLIConfig: ContainerCLIConfig{ + Ldconfig: "@/some/host/path", + }, + }, + }, + { + description: "non-host path is invalid", + config: &Config{ + NVIDIAContainerCLIConfig: ContainerCLIConfig{ + Ldconfig: "/non/host/path", + }, + }, + expectedError: errInvalidConfig, + }, + { + description: "feature flag allows non-host path", + config: &Config{ + NVIDIAContainerCLIConfig: ContainerCLIConfig{ + Ldconfig: "/non/host/path", + }, + Features: features{ + AllowLDConfigFromContainer: ptr(feature(true)), + }, + }, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + require.ErrorIs(t, tc.config.assertValid(), tc.expectedError) + }) + } +} + // setGetDistIDsLikeForTest overrides the distribution IDs that would normally be read from the /etc/os-release file. func setGetDistIDLikeForTest(ids []string) func() { if ids == nil { @@ -335,3 +421,18 @@ func setGetDistIDLikeForTest(ids []string) func() { getDistIDLike = original } } + +// prt returns a reference to whatever type is passed into it +func ptr[T any](x T) *T { + return &x +} + +func setGetLdConfigPathForTest() func() { + previous := getLdConfigPath + getLdConfigPath = func() ldconfigPath { + return "@/test/ld/config/path" + } + return func() { + getLdConfigPath = previous + } +} diff --git a/internal/config/features.go b/internal/config/features.go index 80d3c95a..b418daef 100644 --- a/internal/config/features.go +++ b/internal/config/features.go @@ -21,14 +21,15 @@ type features struct { // DisableImexChannelCreation ensures that the implicit creation of // requested IMEX channels is skipped when invoking the nvidia-container-cli. DisableImexChannelCreation *feature `toml:"disable-imex-channel-creation,omitempty"` + // AllowLDConfigFromContainer allows non-host ldconfig paths to be used. + // If this feature flag is not set to 'true' only host-rooted config paths + // (i.e. paths starting with an '@' are considered valid) + AllowLDConfigFromContainer *feature `toml:"allow-ldconfig-from-container,omitempty"` } -//nolint:unused type feature bool // IsEnabled checks whether a feature is explicitly enabled. -// -//nolint:unused func (f *feature) IsEnabled() bool { if f != nil { return bool(*f) diff --git a/internal/config/toml.go b/internal/config/toml.go index 4c39e6b4..fbda458f 100644 --- a/internal/config/toml.go +++ b/internal/config/toml.go @@ -108,6 +108,19 @@ func loadConfigTomlFrom(reader io.Reader) (*Toml, error) { // Config returns the typed config associated with the toml tree. func (t *Toml) Config() (*Config, error) { + cfg, err := t.configNoOverrides() + if err != nil { + return nil, err + } + if err := cfg.assertValid(); err != nil { + return nil, err + } + return cfg, nil +} + +// configNoOverrides returns the typed config associated with the toml tree. +// This config does not include feature-specific overrides. +func (t *Toml) configNoOverrides() (*Config, error) { cfg, err := GetDefault() if err != nil { return nil, err diff --git a/internal/config/toml_test.go b/internal/config/toml_test.go index e017db15..f7c649f7 100644 --- a/internal/config/toml_test.go +++ b/internal/config/toml_test.go @@ -198,9 +198,12 @@ func TestTomlContents(t *testing.T) { } func TestConfigFromToml(t *testing.T) { + defer setGetLdConfigPathForTest()() + testCases := []struct { description string contents map[string]interface{} + expectedError error expectedConfig *Config }{ { @@ -226,13 +229,39 @@ func TestConfigFromToml(t *testing.T) { return c }(), }, + { + description: "invalid ldconfig value raises error", + contents: map[string]interface{}{ + "nvidia-container-cli": map[string]interface{}{ + "ldconfig": "/some/ldconfig/path", + }, + }, + expectedError: errInvalidConfig, + }, + { + description: "feature allows ldconfig override", + contents: map[string]interface{}{ + "nvidia-container-cli": map[string]interface{}{ + "ldconfig": "/some/ldconfig/path", + }, + "features": map[string]interface{}{ + "allow-ldconfig-from-container": true, + }, + }, + expectedConfig: func() *Config { + c, _ := GetDefault() + c.NVIDIAContainerCLIConfig.Ldconfig = "/some/ldconfig/path" + c.Features.AllowLDConfigFromContainer = ptr(feature(true)) + return c + }(), + }, } for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { tomlCfg := fromMap(tc.contents) config, err := tomlCfg.Config() - require.NoError(t, err) + require.ErrorIs(t, err, tc.expectedError) require.EqualValues(t, tc.expectedConfig, config) }) } diff --git a/tools/container/toolkit/toolkit_test.go b/tools/container/toolkit/toolkit_test.go index bab94c4a..358d9166 100644 --- a/tools/container/toolkit/toolkit_test.go +++ b/tools/container/toolkit/toolkit_test.go @@ -161,7 +161,7 @@ kind: example.com/class // Ensure that the config file has the required contents. // TODO: Add checks for additional config options. require.Equal(t, "/host/driver/root", cfg.NVIDIAContainerCLIConfig.Root) - require.Equal(t, "@/host/driver/root/sbin/ldconfig", cfg.NVIDIAContainerCLIConfig.Ldconfig) + require.Equal(t, "@/host/driver/root/sbin/ldconfig", string(cfg.NVIDIAContainerCLIConfig.Ldconfig)) require.EqualValues(t, filepath.Join(toolkitRoot, "nvidia-container-cli"), cfg.NVIDIAContainerCLIConfig.Path) require.EqualValues(t, filepath.Join(toolkitRoot, "nvidia-ctk"), cfg.NVIDIACTKConfig.Path)