mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
Run update-ldcache in isolated namespaces
This change uses the reexec package to run the update of the ldcache in a container in a process with isolated namespaces. Since the hook is invoked as a createContainer hook, these namespaces are cloned from the container's namespaces. In the reexec handler, we further isolate the proc filesystem, mount the host ldconfig to a tmpfs, and pivot into the container's root. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
200
cmd/nvidia-cdi-hook/update-ldcache/ldconfig_linux.go
Normal file
200
cmd/nvidia-cdi-hook/update-ldcache/ldconfig_linux.go
Normal file
@@ -0,0 +1,200 @@
|
||||
//go:build linux
|
||||
|
||||
/**
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package ldcache
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"syscall"
|
||||
|
||||
securejoin "github.com/cyphar/filepath-securejoin"
|
||||
|
||||
"github.com/moby/sys/reexec"
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// pivotRoot will call pivot_root such that rootfs becomes the new root
|
||||
// filesystem, and everything else is cleaned up.
|
||||
// This is adapted from the implementation here:
|
||||
//
|
||||
// https://github.com/opencontainers/runc/blob/e89a29929c775025419ab0d218a43588b4c12b9a/libcontainer/rootfs_linux.go#L1056-L1113
|
||||
//
|
||||
// With the `mount` and `unmount` calls changed to direct unix.Mount and unix.Unmount calls.
|
||||
func pivotRoot(rootfs string) error {
|
||||
// While the documentation may claim otherwise, pivot_root(".", ".") is
|
||||
// actually valid. What this results in is / being the new root but
|
||||
// /proc/self/cwd being the old root. Since we can play around with the cwd
|
||||
// with pivot_root this allows us to pivot without creating directories in
|
||||
// the rootfs. Shout-outs to the LXC developers for giving us this idea.
|
||||
|
||||
oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return &os.PathError{Op: "open", Path: "/", Err: err}
|
||||
}
|
||||
defer unix.Close(oldroot) //nolint: errcheck
|
||||
|
||||
newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return &os.PathError{Op: "open", Path: rootfs, Err: err}
|
||||
}
|
||||
defer unix.Close(newroot) //nolint: errcheck
|
||||
|
||||
// Change to the new root so that the pivot_root actually acts on it.
|
||||
if err := unix.Fchdir(newroot); err != nil {
|
||||
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err}
|
||||
}
|
||||
|
||||
if err := unix.PivotRoot(".", "."); err != nil {
|
||||
return &os.PathError{Op: "pivot_root", Path: ".", Err: err}
|
||||
}
|
||||
|
||||
// Currently our "." is oldroot (according to the current kernel code).
|
||||
// However, purely for safety, we will fchdir(oldroot) since there isn't
|
||||
// really any guarantee from the kernel what /proc/self/cwd will be after a
|
||||
// pivot_root(2).
|
||||
|
||||
if err := unix.Fchdir(oldroot); err != nil {
|
||||
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err}
|
||||
}
|
||||
|
||||
// Make oldroot rslave to make sure our unmounts don't propagate to the
|
||||
// host (and thus bork the machine). We don't use rprivate because this is
|
||||
// known to cause issues due to races where we still have a reference to a
|
||||
// mount while a process in the host namespace are trying to operate on
|
||||
// something they think has no mounts (devicemapper in particular).
|
||||
if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
|
||||
return err
|
||||
}
|
||||
// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
|
||||
if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Switch back to our shiny new root.
|
||||
if err := unix.Chdir("/"); err != nil {
|
||||
return &os.PathError{Op: "chdir", Path: "/", Err: err}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// mountLdConfig mounts the host ldconfig to the mount namespace of the hook.
|
||||
// We use WithProcfd to perform the mount operations to ensure that the changes
|
||||
// are persisted across the pivot root.
|
||||
func mountLdConfig(hostLdconfigPath string, containerRootDirPath string) (string, error) {
|
||||
hostLdconfigInfo, err := os.Stat(hostLdconfigPath)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error reading host ldconfig: %w", err)
|
||||
}
|
||||
|
||||
hookScratchDirPath := "/var/run/nvidia-ctk-hook"
|
||||
ldconfigPath := filepath.Join(hookScratchDirPath, "ldconfig")
|
||||
if err := utils.MkdirAllInRoot(containerRootDirPath, hookScratchDirPath, 0755); err != nil {
|
||||
return "", fmt.Errorf("error creating hook scratch folder: %w", err)
|
||||
}
|
||||
|
||||
err = utils.WithProcfd(containerRootDirPath, hookScratchDirPath, func(hookScratchDirFdPath string) error {
|
||||
return createTmpFs(hookScratchDirFdPath, int(hostLdconfigInfo.Size()))
|
||||
|
||||
})
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error creating tmpfs: %w", err)
|
||||
}
|
||||
|
||||
if _, err := createFileInRoot(containerRootDirPath, ldconfigPath, hostLdconfigInfo.Mode()); err != nil {
|
||||
return "", fmt.Errorf("error creating ldconfig: %w", err)
|
||||
}
|
||||
|
||||
err = utils.WithProcfd(containerRootDirPath, ldconfigPath, func(ldconfigFdPath string) error {
|
||||
return unix.Mount(hostLdconfigPath, ldconfigFdPath, "", unix.MS_BIND|unix.MS_RDONLY|unix.MS_NODEV|unix.MS_PRIVATE|unix.MS_NOSYMFOLLOW, "")
|
||||
})
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error bind mounting host ldconfig: %w", err)
|
||||
}
|
||||
|
||||
return ldconfigPath, nil
|
||||
}
|
||||
|
||||
func createFileInRoot(containerRootDirPath string, destinationPath string, mode os.FileMode) (string, error) {
|
||||
dest, err := securejoin.SecureJoin(containerRootDirPath, destinationPath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
// Make the parent directory.
|
||||
destDir, destBase := filepath.Split(dest)
|
||||
destDirFd, err := utils.MkdirAllInRootOpen(containerRootDirPath, destDir, 0755)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("error creating parent dir: %w", err)
|
||||
}
|
||||
defer destDirFd.Close()
|
||||
// Make the target file. We want to avoid opening any file that is
|
||||
// already there because it could be a "bad" file like an invalid
|
||||
// device or hung tty that might cause a DoS, so we use mknodat.
|
||||
// destBase does not contain any "/" components, and mknodat does
|
||||
// not follow trailing symlinks, so we can safely just call mknodat
|
||||
// here.
|
||||
if err := unix.Mknodat(int(destDirFd.Fd()), destBase, unix.S_IFREG|uint32(mode), 0); err != nil {
|
||||
// If we get EEXIST, there was already an inode there and
|
||||
// we can consider that a success.
|
||||
if !errors.Is(err, unix.EEXIST) {
|
||||
return "", fmt.Errorf("error creating empty file: %w", err)
|
||||
}
|
||||
}
|
||||
return dest, nil
|
||||
}
|
||||
|
||||
// mountProc mounts a clean proc filesystem in the new root.
|
||||
func mountProc(newroot string) error {
|
||||
target := filepath.Join(newroot, "/proc")
|
||||
|
||||
if err := os.MkdirAll(target, 0755); err != nil {
|
||||
return fmt.Errorf("error creating directory: %w", err)
|
||||
}
|
||||
return unix.Mount("proc", target, "proc", 0, "")
|
||||
}
|
||||
|
||||
// createTmpFs creates a tmpfs at the specified location with the specified size.
|
||||
func createTmpFs(target string, size int) error {
|
||||
return unix.Mount("tmpfs", target, "tmpfs", 0, fmt.Sprintf("size=%d", size))
|
||||
}
|
||||
|
||||
// createReexecCommand creates a command that can be used to trigger the reexec
|
||||
// initializer.
|
||||
// On linux this command runs in new namespaces.
|
||||
func createReexecCommand(args []string) *exec.Cmd {
|
||||
cmd := reexec.Command(args...)
|
||||
cmd.Stdin = os.Stdin
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||
Cloneflags: syscall.CLONE_NEWNS |
|
||||
syscall.CLONE_NEWUTS |
|
||||
syscall.CLONE_NEWIPC |
|
||||
syscall.CLONE_NEWPID |
|
||||
syscall.CLONE_NEWNET,
|
||||
}
|
||||
|
||||
return cmd
|
||||
}
|
||||
51
cmd/nvidia-cdi-hook/update-ldcache/ldconfig_other.go
Normal file
51
cmd/nvidia-cdi-hook/update-ldcache/ldconfig_other.go
Normal file
@@ -0,0 +1,51 @@
|
||||
//go:build !linux
|
||||
|
||||
/**
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package ldcache
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
|
||||
"github.com/moby/sys/reexec"
|
||||
)
|
||||
|
||||
// notSupportedError returns the error reported by every operation in this
// file that is only implemented on linux.
func notSupportedError() error {
	return fmt.Errorf("not supported")
}

// pivotRoot is not supported on non-linux platforms and always returns an error.
func pivotRoot(newroot string) error {
	return notSupportedError()
}

// mountLdConfig is not supported on non-linux platforms; it always returns an
// empty path and an error.
func mountLdConfig(hostLdconfigPath string, containerRootDirPath string) (string, error) {
	return "", notSupportedError()
}

// mountProc is not supported on non-linux platforms and always returns an error.
func mountProc(newroot string) error {
	return notSupportedError()
}
|
||||
|
||||
// createReexecCommand creates a command that can be used ot trigger the reexec
|
||||
// initializer.
|
||||
func createReexecCommand(args []string) *exec.Cmd {
|
||||
cmd := reexec.Command(args...)
|
||||
cmd.Stdin = os.Stdin
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
|
||||
return cmd
|
||||
}
|
||||
@@ -1,3 +1,5 @@
|
||||
//go:build linux
|
||||
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
@@ -26,10 +28,9 @@ import (
|
||||
)
|
||||
|
||||
// SafeExec attempts to clone the specified binary (as a memfd, for example) before executing it.
|
||||
func (m command) SafeExec(path string, args []string, envv []string) error {
|
||||
func SafeExec(path string, args []string, envv []string) error {
|
||||
safeExe, err := cloneBinary(path)
|
||||
if err != nil {
|
||||
m.logger.Warningf("Failed to clone binary %q: %v; falling back to Exec", path, err)
|
||||
//nolint:gosec // TODO: Can we harden this so that there is less risk of command injection
|
||||
return syscall.Exec(path, args, envv)
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
//go:build !linux
|
||||
// +build !linux
|
||||
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
@@ -23,7 +22,7 @@ import "syscall"
|
||||
|
||||
// SafeExec is not implemented on non-linux systems and forwards directly to the
|
||||
// Exec syscall.
|
||||
func (m *command) SafeExec(path string, args []string, envv []string) error {
|
||||
func SafeExec(path string, args []string, envv []string) error {
|
||||
//nolint:gosec // TODO: Can we harden this so that there is less risk of command injection
|
||||
return syscall.Exec(path, args, envv)
|
||||
}
|
||||
|
||||
@@ -19,10 +19,11 @@ package ldcache
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/moby/sys/reexec"
|
||||
"github.com/urfave/cli/v2"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||
@@ -37,6 +38,8 @@ const (
|
||||
// higher precedence than other libraries on the system, but lower than
|
||||
// the 00-cuda-compat that is included in some containers.
|
||||
ldsoconfdFilenamePattern = "00-nvcr-*.conf"
|
||||
|
||||
reexecUpdateLdCacheCommandName = "reexec-update-ldcache"
|
||||
)
|
||||
|
||||
type command struct {
|
||||
@@ -49,6 +52,13 @@ type options struct {
|
||||
containerSpec string
|
||||
}
|
||||
|
||||
func init() {
|
||||
reexec.Register(reexecUpdateLdCacheCommandName, updateLdCacheHandler)
|
||||
if reexec.Init() {
|
||||
os.Exit(0)
|
||||
}
|
||||
}
|
||||
|
||||
// NewCommand constructs an update-ldcache command with the specified logger
|
||||
func NewCommand(logger logger.Interface) *cli.Command {
|
||||
c := command{
|
||||
@@ -113,11 +123,69 @@ func (m command) run(c *cli.Context, cfg *options) error {
|
||||
return fmt.Errorf("failed to determined container root: %v", err)
|
||||
}
|
||||
|
||||
ldconfigPath := m.resolveLDConfigPath(cfg.ldconfigPath)
|
||||
args := []string{
|
||||
filepath.Base(ldconfigPath),
|
||||
// Run ldconfig in the container root directory on the host.
|
||||
"-r", containerRootDir,
|
||||
reexecUpdateLdCacheCommandName,
|
||||
strings.TrimPrefix(config.NormalizeLDConfigPath("@"+cfg.ldconfigPath), "@"),
|
||||
containerRootDir,
|
||||
}
|
||||
args = append(args, cfg.folders.Value()...)
|
||||
|
||||
cmd := createReexecCommand(args)
|
||||
|
||||
return cmd.Run()
|
||||
}
|
||||
|
||||
// updateLdCacheHandler wraps updateLdCache with error handling.
|
||||
func updateLdCacheHandler() {
|
||||
if err := updateLdCache(os.Args); err != nil {
|
||||
log.Printf("Error updating ldcache: %v", err)
|
||||
os.Exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// updateLdCache is invoked from a reexec'd handler and provides namespace
|
||||
// isolation for the operations performed by this hook.
|
||||
// At the point where this is invoked, we are in a new mount namespace that is
|
||||
// cloned from the parent.
|
||||
//
|
||||
// args[0] is the reexec initializer function name
|
||||
// args[1] is the path of the ldconfig binary on the host
|
||||
// args[2] is the container root directory
|
||||
// The remaining args are folders that need to be added to the ldcache.
|
||||
func updateLdCache(args []string) error {
|
||||
if len(args) < 3 {
|
||||
return fmt.Errorf("incorrect arguments: %v", args)
|
||||
}
|
||||
hostLdconfigPath := args[1]
|
||||
containerRootDirPath := args[2]
|
||||
|
||||
// To prevent leaking the parent proc filesystem, we create a new proc mount
|
||||
// in the container root.
|
||||
if err := mountProc(containerRootDirPath); err != nil {
|
||||
return fmt.Errorf("error mounting /proc: %w", err)
|
||||
}
|
||||
|
||||
// We mount the host ldconfig before we pivot root since host paths are not
|
||||
// visible after the pivot root operation.
|
||||
ldconfigPath, err := mountLdConfig(hostLdconfigPath, containerRootDirPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error mounting host ldconfig: %w", err)
|
||||
}
|
||||
|
||||
// We pivot to the container root for the new process, this further limits
|
||||
// access to the host.
|
||||
if err := pivotRoot(containerRootDirPath); err != nil {
|
||||
return fmt.Errorf("error running pivot_root: %w", err)
|
||||
}
|
||||
|
||||
return runLdconfig(ldconfigPath, args[3:]...)
|
||||
}
|
||||
|
||||
// runLdconfig runs the ldconfig binary and ensures that the specified directories
|
||||
// are processed for the ldcache.
|
||||
func runLdconfig(ldconfigPath string, directories ...string) error {
|
||||
args := []string{
|
||||
"ldconfig",
|
||||
// Explicitly specify using /etc/ld.so.conf since the host's ldconfig may
|
||||
// be configured to use a different config file by default.
|
||||
// Note that since we apply the `-r {{ .containerRootDir }}` argument, /etc/ld.so.conf is
|
||||
@@ -125,48 +193,35 @@ func (m command) run(c *cli.Context, cfg *options) error {
|
||||
"-f", "/etc/ld.so.conf",
|
||||
}
|
||||
|
||||
containerRoot := containerRoot(containerRootDir)
|
||||
containerRoot := containerRoot("/")
|
||||
|
||||
if containerRoot.hasPath("/etc/ld.so.cache") {
|
||||
args = append(args, "-C", "/etc/ld.so.cache")
|
||||
} else {
|
||||
m.logger.Debugf("No ld.so.cache found, skipping update")
|
||||
args = append(args, "-N")
|
||||
}
|
||||
|
||||
folders := cfg.folders.Value()
|
||||
if containerRoot.hasPath("/etc/ld.so.conf.d") {
|
||||
err := m.createLdsoconfdFile(containerRoot, ldsoconfdFilenamePattern, folders...)
|
||||
err := createLdsoconfdFile(ldsoconfdFilenamePattern, directories...)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to update ld.so.conf.d: %v", err)
|
||||
return fmt.Errorf("failed to update ld.so.conf.d: %w", err)
|
||||
}
|
||||
} else {
|
||||
args = append(args, folders...)
|
||||
args = append(args, directories...)
|
||||
}
|
||||
|
||||
return m.SafeExec(ldconfigPath, args, nil)
|
||||
return SafeExec(ldconfigPath, args, nil)
|
||||
}
|
||||
|
||||
// resolveLDConfigPath determines the LDConfig path to use for the system.
|
||||
// On systems such as Ubuntu where `/sbin/ldconfig` is a wrapper around
|
||||
// /sbin/ldconfig.real, the latter is returned.
|
||||
func (m command) resolveLDConfigPath(path string) string {
|
||||
return strings.TrimPrefix(config.NormalizeLDConfigPath("@"+path), "@")
|
||||
}
|
||||
|
||||
// createLdsoconfdFile creates a file at /etc/ld.so.conf.d/ in the specified root.
|
||||
// createLdsoconfdFile creates a file at /etc/ld.so.conf.d/.
|
||||
// The file is created at /etc/ld.so.conf.d/{{ .pattern }} using `CreateTemp` and
|
||||
// contains the specified directories on each line.
|
||||
func (m command) createLdsoconfdFile(in containerRoot, pattern string, dirs ...string) error {
|
||||
func createLdsoconfdFile(pattern string, dirs ...string) error {
|
||||
if len(dirs) == 0 {
|
||||
m.logger.Debugf("No directories to add to /etc/ld.so.conf")
|
||||
return nil
|
||||
}
|
||||
|
||||
ldsoconfdDir, err := in.resolve("/etc/ld.so.conf.d")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
ldsoconfdDir := "/etc/ld.so.conf.d"
|
||||
if err := os.MkdirAll(ldsoconfdDir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create ld.so.conf.d: %w", err)
|
||||
}
|
||||
@@ -175,9 +230,9 @@ func (m command) createLdsoconfdFile(in containerRoot, pattern string, dirs ...s
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create config file: %w", err)
|
||||
}
|
||||
defer configFile.Close()
|
||||
|
||||
m.logger.Debugf("Adding directories %v to %v", dirs, configFile.Name())
|
||||
defer func() {
|
||||
_ = configFile.Close()
|
||||
}()
|
||||
|
||||
added := make(map[string]bool)
|
||||
for _, dir := range dirs {
|
||||
|
||||
Reference in New Issue
Block a user