nvidia-container-toolkit/cmd/nvidia-cdi-hook/update-ldcache/ldconfig_linux.go
Evan Lezar ec29b602c3
Run update-ldcache in isolated namespaces
This change uses the reexec package to run the update of the
ldcache in a container in a process with isolated namespaces.
Since the hook is invoked as a createContainer hook, these
namespaces are cloned from the container's namespaces.

In the reexec handler, we further isolate the proc filesystem,
mount the host ldconfig to a tmpfs, and pivot into the containers
root.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
2025-05-15 12:45:49 +02:00

201 lines
7.1 KiB
Go

//go:build linux
/**
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/
package ldcache
import (
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"syscall"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/moby/sys/reexec"
"github.com/opencontainers/runc/libcontainer/utils"
"golang.org/x/sys/unix"
)
// pivotRoot will call pivot_root such that rootfs becomes the new root
// filesystem, and everything else is cleaned up.
// This is adapted from the implementation here:
//
// https://github.com/opencontainers/runc/blob/e89a29929c775025419ab0d218a43588b4c12b9a/libcontainer/rootfs_linux.go#L1056-L1113
//
// With the `mount` and `unmount` calls changed to direct unix.Mount and unix.Unmount calls.
func pivotRoot(rootfs string) error {
// While the documentation may claim otherwise, pivot_root(".", ".") is
// actually valid. What this results in is / being the new root but
// /proc/self/cwd being the old root. Since we can play around with the cwd
// with pivot_root this allows us to pivot without creating directories in
// the rootfs. Shout-outs to the LXC developers for giving us this idea.
oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return &os.PathError{Op: "open", Path: "/", Err: err}
}
defer unix.Close(oldroot) //nolint: errcheck
newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return &os.PathError{Op: "open", Path: rootfs, Err: err}
}
defer unix.Close(newroot) //nolint: errcheck
// Change to the new root so that the pivot_root actually acts on it.
if err := unix.Fchdir(newroot); err != nil {
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err}
}
if err := unix.PivotRoot(".", "."); err != nil {
return &os.PathError{Op: "pivot_root", Path: ".", Err: err}
}
// Currently our "." is oldroot (according to the current kernel code).
// However, purely for safety, we will fchdir(oldroot) since there isn't
// really any guarantee from the kernel what /proc/self/cwd will be after a
// pivot_root(2).
if err := unix.Fchdir(oldroot); err != nil {
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err}
}
// Make oldroot rslave to make sure our unmounts don't propagate to the
// host (and thus bork the machine). We don't use rprivate because this is
// known to cause issues due to races where we still have a reference to a
// mount while a process in the host namespace are trying to operate on
// something they think has no mounts (devicemapper in particular).
if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
return err
}
// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
return err
}
// Switch back to our shiny new root.
if err := unix.Chdir("/"); err != nil {
return &os.PathError{Op: "chdir", Path: "/", Err: err}
}
return nil
}
// mountLdConfig mounts the host ldconfig to the mount namespace of the hook.
// We use WithProcfd to perform the mount operations to ensure that the changes
// are persisted across the pivot root.
func mountLdConfig(hostLdconfigPath string, containerRootDirPath string) (string, error) {
hostLdconfigInfo, err := os.Stat(hostLdconfigPath)
if err != nil {
return "", fmt.Errorf("error reading host ldconfig: %w", err)
}
hookScratchDirPath := "/var/run/nvidia-ctk-hook"
ldconfigPath := filepath.Join(hookScratchDirPath, "ldconfig")
if err := utils.MkdirAllInRoot(containerRootDirPath, hookScratchDirPath, 0755); err != nil {
return "", fmt.Errorf("error creating hook scratch folder: %w", err)
}
err = utils.WithProcfd(containerRootDirPath, hookScratchDirPath, func(hookScratchDirFdPath string) error {
return createTmpFs(hookScratchDirFdPath, int(hostLdconfigInfo.Size()))
})
if err != nil {
return "", fmt.Errorf("error creating tmpfs: %w", err)
}
if _, err := createFileInRoot(containerRootDirPath, ldconfigPath, hostLdconfigInfo.Mode()); err != nil {
return "", fmt.Errorf("error creating ldconfig: %w", err)
}
err = utils.WithProcfd(containerRootDirPath, ldconfigPath, func(ldconfigFdPath string) error {
return unix.Mount(hostLdconfigPath, ldconfigFdPath, "", unix.MS_BIND|unix.MS_RDONLY|unix.MS_NODEV|unix.MS_PRIVATE|unix.MS_NOSYMFOLLOW, "")
})
if err != nil {
return "", fmt.Errorf("error bind mounting host ldconfig: %w", err)
}
return ldconfigPath, nil
}
func createFileInRoot(containerRootDirPath string, destinationPath string, mode os.FileMode) (string, error) {
dest, err := securejoin.SecureJoin(containerRootDirPath, destinationPath)
if err != nil {
return "", err
}
// Make the parent directory.
destDir, destBase := filepath.Split(dest)
destDirFd, err := utils.MkdirAllInRootOpen(containerRootDirPath, destDir, 0755)
if err != nil {
return "", fmt.Errorf("error creating parent dir: %w", err)
}
defer destDirFd.Close()
// Make the target file. We want to avoid opening any file that is
// already there because it could be a "bad" file like an invalid
// device or hung tty that might cause a DoS, so we use mknodat.
// destBase does not contain any "/" components, and mknodat does
// not follow trailing symlinks, so we can safely just call mknodat
// here.
if err := unix.Mknodat(int(destDirFd.Fd()), destBase, unix.S_IFREG|uint32(mode), 0); err != nil {
// If we get EEXIST, there was already an inode there and
// we can consider that a success.
if !errors.Is(err, unix.EEXIST) {
return "", fmt.Errorf("error creating empty file: %w", err)
}
}
return dest, nil
}
// mountProc mounts a clean proc filesystem in the new root.
func mountProc(newroot string) error {
target := filepath.Join(newroot, "/proc")
if err := os.MkdirAll(target, 0755); err != nil {
return fmt.Errorf("error creating directory: %w", err)
}
return unix.Mount("proc", target, "proc", 0, "")
}
// createTmpFs creates a tmpfs at the specified location with the specified size.
func createTmpFs(target string, size int) error {
return unix.Mount("tmpfs", target, "tmpfs", 0, fmt.Sprintf("size=%d", size))
}
// createReexecCommand creates a command that can be used to trigger the reexec
// initializer.
// On linux this command runs in new namespaces.
func createReexecCommand(args []string) *exec.Cmd {
cmd := reexec.Command(args...)
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
cmd.SysProcAttr = &syscall.SysProcAttr{
Cloneflags: syscall.CLONE_NEWNS |
syscall.CLONE_NEWUTS |
syscall.CLONE_NEWIPC |
syscall.CLONE_NEWPID |
syscall.CLONE_NEWNET,
}
return cmd
}