Use libcontainer execseal to run ldconfig

This change copies ldconfig into a memfd before executing it from the createContainer hook. Signed-off-by: Evan Lezar <elezar@nvidia.com>
2025-06-26 18:18:24 +00:00 · 2025-02-25 16:58:30 +02:00
parent 9429fbac5f
commit 52b9631333
34 changed files with 3939 additions and 6 deletions
--- a/vendor/github.com/opencontainers/runc/libcontainer/dmz/cloned_binary_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/dmz/cloned_binary_linux.go
@@ -0,0 +1,258 @@
+package dmz
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"strconv"
+
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/system"
+)
+
+type SealFunc func(**os.File) error // seals the file held in *f; may swap *f for a new handle
+
+var ( // compile-time assertions that both sealers have the SealFunc signature
+	_ SealFunc = sealMemfd
+	_ SealFunc = sealFile
+)
+
+// isExecutable reports whether f passes an X_OK access check. If neither
+// probe yields a definite answer it assumes the file is executable.
+func isExecutable(f *os.File) bool {
+	// Preferred probe: faccessat(2) on the descriptor itself.
+	err := unix.Faccessat(int(f.Fd()), "", unix.X_OK, unix.AT_EACCESS|unix.AT_EMPTY_PATH)
+	if err != nil && err != unix.EACCES {
+		// Inconclusive, so ask again through the procfs path of the fd.
+		err = unix.Access("/proc/self/fd/"+strconv.Itoa(int(f.Fd())), unix.X_OK)
+	}
+	if err == nil || err == unix.EACCES {
+		return err == nil
+	}
+	// Still inconclusive: be optimistic, a later exec fails if we are wrong.
+	logrus.Debugf("cannot do X_OK check on binary %s -- assuming it's executable", f.Name())
+	return true
+}
+
+const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
+
+// sealMemfd makes the memfd in *f mode 0o511 and then applies the memfd seals.
+func sealMemfd(f **os.File) error {
+	if err := (*f).Chmod(0o511); err != nil {
+		return err
+	}
+	fd := (*f).Fd()
+	// The newer seals are best-effort: errors are ignored so that kernels
+	// without them keep working. F_SEAL_FUTURE_WRITE appeared in Linux 5.1
+	// and F_SEAL_EXEC (0x20) in Linux 6.3.
+	const sealExec = 0x20
+	for _, seal := range []int{unix.F_SEAL_FUTURE_WRITE, sealExec} {
+		_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, seal)
+	}
+	// The original (base) seals are mandatory, so their failure is reported.
+	_, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals)
+	return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
+}
+
+// Memfd opens a sealable executable memfd (memfd_create needs Linux 3.17+) and pairs it with sealMemfd.
+func Memfd(comment string) (*os.File, SealFunc, error) {
+	memfd, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_CLOEXEC|unix.MFD_ALLOW_SEALING)
+	return memfd, sealMemfd, err
+}
+
+// sealFile swaps *f for an O_PATH handle to the same file and closes the
+// original descriptor, dropping the writable handle that was held on it.
+func sealFile(f **os.File) error {
+	procPath := fmt.Sprintf("/proc/self/fd/%d", (*f).Fd())
+	pathOnly, err := os.OpenFile(procPath, unix.O_PATH|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("reopen tmpfile: %w", err)
+	}
+	_ = (*f).Close()
+	*f = pathOnly
+	return nil
+}
+
+// otmpfile opens an anonymous O_TMPFILE file inside dir (O_TMPFILE exists
+// since Linux 3.11) and verifies that it has no links before returning it.
+func otmpfile(dir string) (*os.File, SealFunc, error) {
+	tmp, err := os.OpenFile(dir, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700)
+	if err != nil {
+		return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err)
+	}
+	var st unix.Stat_t
+	if err := unix.Fstat(int(tmp.Fd()), &st); err != nil {
+		tmp.Close()
+		return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err)
+	}
+	if st.Nlink != 0 { // a linked file is not a genuine O_TMPFILE descriptor
+		tmp.Close()
+		return nil, nil, errors.New("O_TMPFILE has non-zero nlink")
+	}
+	return tmp, sealFile, nil
+}
+
+// mktemp creates a classic temporary file in dir, unlinks it and returns the open handle; it is closed on any failure.
+func mktemp(dir string) (*os.File, SealFunc, error) {
+	file, err := os.CreateTemp(dir, "runc.")
+	if err != nil {
+		return nil, nil, err
+	}
+	fail := func(err error) (*os.File, SealFunc, error) { file.Close(); return nil, nil, err } // never leak file
+	if err := os.Remove(file.Name()); err != nil {
+		return fail(fmt.Errorf("unlinking classic tmpfile: %w", err))
+	}
+	if err := file.Chmod(0o511); err != nil {
+		return fail(fmt.Errorf("chmod classic tmpfile: %w", err))
+	}
+	var stat unix.Stat_t
+	if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
+		return fail(fmt.Errorf("cannot fstat classic tmpfile: %w", err))
+	} else if stat.Nlink != 0 {
+		return fail(fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name()))
+	}
+	return file, sealFile, nil
+}
+
+// getSealableFile returns a writable file for holding a cloned binary and the
+// SealFunc that must be applied to it once it has been written. It tries an
+// executable memfd first, then O_TMPFILE and finally a classic unlinked
+// tmpfile in each candidate directory. Directories that turn out to be noexec
+// are skipped and recorded in err, so the final error always has a cause.
+func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) {
+	// First, try an executable memfd (supported since Linux 3.17).
+	file, sealFn, err = Memfd(comment)
+	if err == nil {
+		return
+	}
+	logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
+
+	// The tmpDir here (c.root) might be mounted noexec, so we need a couple of
+	// fallbacks to try. It's possible that none of these are writable and
+	// executable, in which case there's nothing we can practically do (other
+	// than mounting our own executable tmpfs, which would have its own
+	// issues).
+	tmpDirs := []string{tmpDir, os.TempDir(), "/tmp", ".", "/bin", "/"}
+
+	// Try to fallback to O_TMPFILE (supported since Linux 3.11).
+	for _, dir := range tmpDirs {
+		file, sealFn, err = otmpfile(dir)
+		if err != nil {
+			continue
+		}
+		if !isExecutable(file) {
+			logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
+			file.Close()
+			err = fmt.Errorf("tmpdir %s is noexec", dir) // keep err non-nil for the final report
+			continue
+		}
+		return
+	}
+	logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
+	// Finally, try a classic unlinked temporary file.
+	for _, dir := range tmpDirs {
+		file, sealFn, err = mktemp(dir)
+		if err != nil {
+			continue
+		}
+		if !isExecutable(file) {
+			logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
+			file.Close()
+			err = fmt.Errorf("tmpdir %s is noexec", dir) // keep err non-nil for the final report
+			continue
+		}
+		return
+	}
+	return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err)
+}
+
+// CloneBinary copies src into a freshly created sealable file and seals it.
+// The resulting "sealed" clone can be used to thwart attempts by a container
+// process to reach host binaries through procfs magic-links; CVE-2019-5736
+// has the background. It fails unless exactly size bytes were copied.
+func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) {
+	logrus.Debugf("cloning %s binary (%d bytes)", name, size)
+	clone, seal, err := getSealableFile(name, tmpDir)
+	if err != nil {
+		return nil, err
+	}
+	// Every failure from here on must release the half-built clone.
+	abort := func(err error) (*os.File, error) { clone.Close(); return nil, err }
+	switch n, err := system.Copy(clone, src); {
+	case err != nil:
+		return abort(fmt.Errorf("copy binary: %w", err))
+	case n != size:
+		return abort(fmt.Errorf("copied binary size mismatch: %d != %d", n, size))
+	}
+	// Sealing may replace clone with a different handle to the same file.
+	if err := seal(&clone); err != nil {
+		return abort(fmt.Errorf("could not seal fd: %w", err))
+	}
+	return clone, nil
+}
+
+// IsCloned reports whether exe is a memfd that has every base seal applied.
+func IsCloned(exe *os.File) bool {
+	seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0)
+	if err != nil {
+		// Seals could not be queried (most likely exe is not a memfd).
+		logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err)
+		return false
+	}
+	logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals)
+	// Safe only when no base seal is missing from the reported set.
+	return baseMemfdSeals&^seals == 0
+}
+
+// CloneSelfExe produces a sealed stand-in for the running binary, reached via
+// /proc/self/exe. Using that stand-in for "runc init" means the container
+// process can never resolve the original runc binary (see CVE-2019-5736 for
+// why that matters).
+func CloneSelfExe(tmpDir string) (*os.File, error) {
+	const selfExePath = "/proc/self/exe"
+
+	// Preferred: a temporary overlayfs that yields a read-only version of
+	// /proc/self/exe which the container cannot "unwrap". In contrast to
+	// CloneBinary, this needs no extra memory and avoids the (fairly
+	// noticeable) performance impact of copying a large binary into a memfd.
+	//
+	// Basic performance testing showed effectively no overhead for overlayfs
+	// (on par with both MS_BIND+MS_RDONLY and no binary cloning at all), while
+	// memfd copying adds around ~60% overhead during container startup.
+	sealed, overlayErr := sealedOverlayfs(selfExePath, tmpDir)
+	if overlayErr == nil {
+		logrus.Debug("runc-dmz: using overlayfs for sealed /proc/self/exe") // used for tests
+		return sealed, nil
+	}
+	logrus.WithError(overlayErr).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
+
+	// Fallback: copy the running binary into a sealable file.
+	exe, err := os.Open(selfExePath)
+	if err != nil {
+		return nil, fmt.Errorf("opening current binary: %w", err)
+	}
+	defer exe.Close()
+
+	info, err := exe.Stat()
+	if err != nil {
+		return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
+	}
+
+	return CloneBinary(exe, info.Size(), selfExePath, tmpDir)
+}
+
+// IsSelfExeCloned reports whether /proc/self/exe is a cloned binary that is
+// guaranteed to be safe, i.e. a sealed memfd. Other kinds of clone cannot be
+// fully verified, so they are reported as not cloned.
+func IsSelfExeCloned() bool {
+	exe, err := os.Open("/proc/self/exe")
+	if err != nil {
+		logrus.Debugf("open /proc/self/exe failed: %v", err)
+		return false
+	}
+	defer exe.Close()
+	return IsCloned(exe)
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/dmz/overlayfs_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/dmz/overlayfs_linux.go
@@ -0,0 +1,122 @@
+package dmz
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/utils"
+)
+
+// fsopen wraps fsopen(2) for fsName, always adding FSOPEN_CLOEXEC to flags.
+func fsopen(fsName string, flags int) (*os.File, error) {
+	ctxFd, err := unix.Fsopen(fsName, flags|unix.FSOPEN_CLOEXEC)
+	if err != nil {
+		return nil, os.NewSyscallError("fsopen "+fsName, err)
+	}
+	ctx := os.NewFile(uintptr(ctxFd), "fscontext:"+fsName)
+	return ctx, nil
+}
+
+// fsmount wraps fsmount(2) on the context ctx, always adding FSMOUNT_CLOEXEC.
+func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
+	name := ctx.Name()
+	mntFd, err := unix.Fsmount(int(ctx.Fd()), flags|unix.FSMOUNT_CLOEXEC, mountAttrs)
+	runtime.KeepAlive(ctx) // ctx must stay alive while its fd is in use
+	if err != nil {
+		return nil, os.NewSyscallError("fsmount "+name, err)
+	}
+	return os.NewFile(uintptr(mntFd), "fsmount:"+name), nil
+}
+
+// escapeOverlayLowerDir escapes path for use in an overlayfs lowerdir option:
+// every "\" is doubled and every ":" is prefixed with a "\".
+func escapeOverlayLowerDir(path string) string {
+	return strings.NewReplacer(`\`, `\\`, `:`, `\:`).Replace(path)
+}
+
+// sealedOverlayfs creates an internal overlayfs mount using fsopen() whose
+// lowerdirs are the directory containing binPath and tmpDir (no upperdir is
+// configured), then returns an O_PATH handle to the binary inside that mount.
+// There is no way to "unwrap" this (unlike MS_BIND+MS_RDONLY), so the result
+// is a safe zero-copy sealed version of /proc/self/exe. This only works for
+// privileged users and on kernels with overlayfs and fsopen() enabled.
+//
+// TODO: Since Linux 5.11, overlayfs can be created inside user namespaces so
+// it is technically possible to create an overlayfs even for rootless
+// containers. Unfortunately, this would require some ugly manual CGo+fork
+// magic so we can do this later if we feel it's really needed.
+func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) {
+	// Create the filesystem context first, so that we bail out early if this
+	// method cannot be used at all.
+	overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC)
+	if err != nil {
+		return nil, err
+	}
+	defer overlayCtx.Close()
+
+	// binPath is going to be /proc/self/exe, so do a readlink to get the real
+	// path. overlayfs needs the real underlying directory for this protection
+	// mode to work properly. If the readlink fails, binPath is used as given.
+	if realPath, err := os.Readlink(binPath); err == nil {
+		binPath = realPath
+	}
+	binLowerDirPath, binName := filepath.Split(binPath)
+	// Escape any ":"s or "\"s in the path.
+	binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath)
+
+	// Overlayfs requires two lowerdirs in order to run in "lower-only" mode,
+	// where writes are completely blocked. Ideally we would create a dummy
+	// tmpfs for this, but it turns out that overlayfs doesn't allow for
+	// anonymous mountns paths, so tmpDir serves as the second lowerdir.
+	// NOTE: I'm working on a patch to fix this but it won't be backported.
+	dummyLowerDirPath := escapeOverlayLowerDir(tmpDir)
+
+	// Configure the lowerdirs. The binary lowerdir needs to be on the top to
+	// ensure that a file called "runc" (binName) in the dummy lowerdir doesn't
+	// mask the binary.
+	lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath
+	if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil {
+		return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err)
+	}
+
+	// We don't care about xino (Linux 4.17) but it will be auto-enabled on
+	// some systems (if /run/runc and /usr/bin are on different filesystems)
+	// and this produces spurious dmesg log entries. We can safely ignore
+	// errors when disabling this because we don't actually care about the
+	// setting and we're just opportunistically disabling it.
+	_ = unix.FsconfigSetString(int(overlayCtx.Fd()), "xino", "off")
+
+	// Create the superblock, then get a read-only, nodev, nosuid mount of it.
+	if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil {
+		return nil, os.NewSyscallError("fsconfig create overlayfs", err)
+	}
+	overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID)
+	if err != nil {
+		return nil, err
+	}
+	defer overlayFd.Close()
+
+	// Grab an O_PATH handle to the binary through overlayfs, without following symlinks.
+	exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err)
+	}
+	// NOTE: We would like to check that exeFile is the same as /proc/self/exe,
+	// except this is a little difficult. Depending on what filesystems the
+	// layers are on, overlayfs can remap the inode numbers (and it always
+	// creates its own device numbers -- see ovl_map_dev_ino) so we can't do a
+	// basic stat-based check. The only reasonable option would be to hash both
+	// files and compare them, but this would require fully reading both files
+	// which would produce a similar performance overhead to memfd cloning.
+	//
+	// Ultimately, there isn't a real attack to be worried about here. An
+	// attacker would need to be able to modify files in /usr/sbin (or wherever
+	// runc lives), at which point they could just replace the runc binary with
+	// something malicious anyway.
+	return exeFile, nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
@@ -0,0 +1,216 @@
+//go:build linux
+
+package system
+
+import (
+	"fmt"
+	"io"
+	"os"
+	"strconv"
+	"syscall"
+	"unsafe"
+
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+// ParentDeathSignal is a parent death signal value as passed to prctl(2).
+type ParentDeathSignal int
+
+// Restore re-applies p as the parent death signal unless p is zero or the
+// current setting, as read by GetParentDeathSignal, already equals p.
+func (p ParentDeathSignal) Restore() error {
+	if p == 0 {
+		return nil
+	}
+	switch current, err := GetParentDeathSignal(); {
+	case err != nil:
+		return err
+	case current == p:
+		return nil
+	}
+	return p.Set()
+}
+
+// Set installs p as the parent death signal via SetParentDeathSignal.
+func (p ParentDeathSignal) Set() error { return SetParentDeathSignal(uintptr(p)) }
+
+// Exec calls unix.Exec and retries for as long as it fails with EINTR. Any
+// other outcome is returned wrapped in an *os.PathError for cmd.
+func Exec(cmd string, args []string, env []string) error {
+	err := unix.Exec(cmd, args, env)
+	for err == unix.EINTR {
+		err = unix.Exec(cmd, args, env)
+	}
+	return &os.PathError{Op: "exec", Path: cmd, Err: err}
+}
+
+func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error { // raw execveat(2) wrapper
+	pathnamep, err := syscall.BytePtrFromString(pathname)
+	if err != nil {
+		return err
+	}
+	// Convert args to a slice of C-string pointers for the syscall.
+	argvp, err := syscall.SlicePtrFromStrings(args)
+	if err != nil {
+		return err
+	}
+	// Do the same for the environment.
+	envp, err := syscall.SlicePtrFromStrings(env)
+	if err != nil {
+		return err
+	}
+	// Issue execveat(2) directly; a successful exec never returns here.
+	_, _, errno := syscall.Syscall6(
+		unix.SYS_EXECVEAT,
+		fd,
+		uintptr(unsafe.Pointer(pathnamep)),
+		uintptr(unsafe.Pointer(&argvp[0])),
+		uintptr(unsafe.Pointer(&envp[0])),
+		uintptr(flags),
+		0,
+	)
+	return errno // returned as-is; callers compare it against unix.E* values
+}
+
+// Fexecve executes the file open at fd via execveat with AT_EMPTY_PATH,
+// retrying while the call fails with EINTR. If execveat reports ENOSYS it
+// falls back to Exec on the /proc/self/fd path of fd.
+func Fexecve(fd uintptr, args []string, env []string) error {
+	err := execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
+	for err == unix.EINTR { // nolint:errorlint // unix errors are bare
+		err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
+	}
+	if err != unix.ENOSYS { // nolint:errorlint // unix errors are bare
+		return os.NewSyscallError("execveat", err)
+	}
+	procPath := "/proc/self/fd/" + strconv.Itoa(int(fd))
+	return Exec(procPath, args, env)
+}
+
+// SetParentDeathSignal sets the parent death signal to sig (PR_SET_PDEATHSIG).
+func SetParentDeathSignal(sig uintptr) error {
+	return unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0)
+}
+
+// GetParentDeathSignal returns the current PR_GET_PDEATHSIG value, or -1 and
+// an error. prctl(2) stores a C int, so the buffer must be 32 bits wide.
+func GetParentDeathSignal() (ParentDeathSignal, error) {
+	var sig int32
+	if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
+		return -1, err
+	}
+	return ParentDeathSignal(sig), nil
+}
+
+// SetKeepCaps turns the PR_SET_KEEPCAPS setting on by issuing
+// prctl(PR_SET_KEEPCAPS, 1).
+//
+// Any error from prctl is returned unchanged.
+func SetKeepCaps() error {
+	return unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0)
+}
+
+// ClearKeepCaps turns the PR_SET_KEEPCAPS setting off by issuing
+// prctl(PR_SET_KEEPCAPS, 0).
+//
+// Any error from prctl is returned unchanged.
+func ClearKeepCaps() error {
+	return unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0)
+}
+
+// Setctty issues the TIOCSCTTY ioctl on file descriptor 0 with argument 0.
+//
+// Any error from the ioctl is returned unchanged.
+func Setctty() error {
+	return unix.IoctlSetInt(0, unix.TIOCSCTTY, 0)
+}
+
+// SetSubreaper applies i as the subreaper setting of the calling process
+// through prctl(PR_SET_CHILD_SUBREAPER).
+// A non-nil result is the error reported by prctl.
+func SetSubreaper(i int) error { return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) }
+
+// GetSubreaper returns the subreaper setting for the calling process, or -1 and an error if prctl fails.
+func GetSubreaper() (int, error) {
+	// PR_GET_CHILD_SUBREAPER stores a C int through the pointer, so the buffer
+	// must be an int32: a wider variable reads back wrong on big-endian.
+	var i int32
+	if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
+		return -1, err
+	}
+	return int(i), nil
+}
+
+// ExecutableMemfd creates a memfd called comment with the given flags and
+// returns it as an *os.File named "/memfd:"+comment.
+//
+// MFD_EXEC is tried first. Pre-6.3 kernels reject the flag with -EINVAL. On
+// post-6.3 kernels with vm.memfd_noexec=1 it ensures an executable memfd. For
+// vm.memfd_noexec=2 things are more complicated: the original implementation
+// incorrectly and silently allowed MFD_EXEC[1] (this should be fixed in 6.6),
+// and 6.6+ kernels return -EACCES for MFD_EXEC (-EINVAL was the intended
+// return value for 6.3-6.5).
+//
+// The upshot is that a retry without MFD_EXEC is only needed on -EINVAL,
+// because passing MFD_EXEC happens to bypass vm.memfd_noexec=2 on the kernels
+// where -EINVAL is actually a security denial.
+func ExecutableMemfd(comment string, flags int) (*os.File, error) {
+	fd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
+	if err == unix.EINVAL {
+		fd, err = unix.MemfdCreate(comment, flags)
+	}
+	if err == nil {
+		return os.NewFile(uintptr(fd), "/memfd:"+comment), nil
+	}
+	if err == unix.EACCES {
+		logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
+	}
+	return nil, fmt.Errorf("failed to create executable memfd: %w", os.NewSyscallError("memfd_create", err))
+}
+
+// Copy is like io.Copy except it uses sendfile(2) if the source and sink are
+// both (*os.File) as an optimisation. It stops early if the source hits EOF.
+func Copy(dst io.Writer, src io.Reader) (copied int64, err error) {
+	dstFile, _ := dst.(*os.File)
+	srcFile, _ := src.(*os.File)
+
+	if dstFile != nil && srcFile != nil {
+		fi, err := srcFile.Stat()
+		if err != nil {
+			goto fallback
+		}
+		size := fi.Size()
+		for size > 0 {
+			n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size))
+			if n > 0 {
+				size -= int64(n)
+				copied += int64(n)
+			}
+			switch {
+			case err == unix.EINTR:
+				continue
+			case err != nil && copied == 0:
+				// Nothing has been copied yet, so io.Copy can safely take over.
+				goto fallback
+			case err != nil:
+				// A partial copy followed by an error should never happen.
+				return copied, fmt.Errorf("partial sendfile copy: %w", err)
+			case n == 0:
+				// EOF before size bytes: return instead of spinning forever.
+				return copied, nil
+			}
+		}
+		return copied, nil
+	}
+
+fallback:
+	return io.Copy(dst, src)
+}
+
+// SetLinuxPersonality sets the Linux execution personality through the
+// personality syscall; see that syscall's documentation for details. Use
+// getLinuxPersonalityFromStr() from libcontainer/specconv/spec_linux.go for type conversion.
+func SetLinuxPersonality(personality int) error {
+	if _, _, errno := unix.Syscall(unix.SYS_PERSONALITY, uintptr(personality), 0, 0); errno != 0 {
+		return &os.SyscallError{Syscall: "set_personality", Err: errno}
+	}
+	return nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/proc.go
@@ -0,0 +1,127 @@
+package system
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// State is the status of a process. Each value is the single character that
+// identifies the state, which is why the underlying type is rune and the
+// constants below are character literals.
+type State rune
+
+const ( // Only values for Linux 3.14 and later are listed here
+	Dead        State = 'X'
+	DiskSleep   State = 'D'
+	Running     State = 'R'
+	Sleeping    State = 'S'
+	Stopped     State = 'T'
+	TracingStop State = 't'
+	Zombie      State = 'Z'
+	Parked      State = 'P'
+	Idle        State = 'I'
+)
+
+// stateNames maps every known State to its string form, taken from proc(5)'s
+// documentation for /proc/[pid]/status' "State" field. It is a read-only
+// lookup table used by State.String.
+var stateNames = map[State]string{
+	Dead:        "dead",
+	DiskSleep:   "disk sleep",
+	Running:     "running",
+	Sleeping:    "sleeping",
+	Stopped:     "stopped",
+	TracingStop: "tracing stop",
+	Zombie:      "zombie",
+	Parked:      "parked",
+	Idle:        "idle", // kernel thread
+}
+
+// String returns the proc(5) name of s. A value that is not one of the
+// constants declared above is rendered as "unknown (<char>)", where <char> is
+// the state character itself.
+func (s State) String() string {
+	if name, known := stateNames[s]; known {
+		return name
+	}
+	return fmt.Sprintf("unknown (%c)", s)
+}
+
+// Stat_t holds the subset of /proc/[pid]/stat that this package parses, as
+// described in proc(5), with names based on the /proc/[pid]/status
+// fields.
+type Stat_t struct {
+	// Name is the command run by the process (field 2 of the stat line).
+	Name string
+
+	// State is the state of the process (field 3 of the stat line).
+	State State
+
+	// StartTime is the number of clock ticks after system boot (since
+	// Linux 2.6); it is field 22 of the stat line.
+	StartTime uint64
+}
+
+// Stat reads /proc/<pid>/stat and returns the fields parsed from it.
+func Stat(pid int) (stat Stat_t, err error) {
+	raw, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
+	if err == nil {
+		return parseStat(string(raw))
+	}
+	return stat, err
+}
+
+// parseStat extracts Name, State and StartTime from the text of a /proc/[pid]/stat file.
+func parseStat(data string) (stat Stat_t, err error) {
+	// Sample input:
+	// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+	// Fields are space-separated; proc(5) has the full description.
+	//
+	// Only three of them are extracted:
+	//  * field 2, the process name: the one field wrapped in parentheses,
+	//    because it may itself contain spaces (and parentheses);
+	//  * field 3, the process state: a single character (%c);
+	//  * field 22, the process start time: a long unsigned integer (%llu).
+
+	// Step 1: Name is whatever lies between the first '(' and the last ')'.
+	//    At least 20 more fields, with a space after the last, must follow.
+
+	const minTail = 20*2 + 1 // the shortest possible field is '0 '.
+
+	open := strings.IndexByte(data, '(')
+	if open < 0 || open+minTail >= len(data) {
+		return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
+	}
+
+	closing := strings.LastIndexByte(data, ')')
+	if closing <= open || closing+minTail >= len(data) {
+		return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
+	}
+
+	stat.Name = data[open+1 : closing]
+
+	// Step 2: drop fields 1 and 2 plus the space after them; State comes next.
+	rest := data[closing+2:]
+	stat.State = State(rest[0])
+
+	// Step 3: rest begins at field 3 and StartTime is field 22, so advance past 19 spaces.
+	pos := 0
+	for pending := 22 - 3; pending > 0 && pos < len(rest); pos++ {
+		if rest[pos] == ' ' {
+			pending--
+		}
+	}
+	// pos is now the start of StartTime, which runs up to the next space.
+	width := strings.IndexByte(rest[pos:], ' ')
+	if width < 0 {
+		return stat, fmt.Errorf("invalid stat data (too short): %q", rest)
+	}
+	stat.StartTime, err = strconv.ParseUint(rest[pos:pos+width], 10, 64)
+	if err != nil {
+		return stat, fmt.Errorf("invalid stat data (bad start time): %w", err)
+	}
+
+	return stat, nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go
@@ -0,0 +1,15 @@
+//go:build go1.23
+
+package system
+
+import (
+	"syscall"
+)
+
+// ClearRlimitNofileCache clears the go runtime's nofile rlimit cache. lim is
+// the process RLIMIT_NOFILE values. Relies on go.dev/cl/588076.
+func ClearRlimitNofileCache(lim *syscall.Rlimit) {
+	// Only the cache-clearing side effect is wanted, so the result is dropped:
+	// the limit itself is going to be set via unix.Prlimit elsewhere.
+	_ = syscall.Setrlimit(syscall.RLIMIT_NOFILE, lim)
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go
@@ -0,0 +1,27 @@
+//go:build !go1.23
+
+// TODO: remove this file once go 1.22 is no longer supported.
+
+package system
+
+import (
+	"sync/atomic"
+	"syscall"
+	_ "unsafe" // Needed for go:linkname to work.
+)
+
+//go:linkname syscallOrigRlimitNofile syscall.origRlimitNofile
+var syscallOrigRlimitNofile atomic.Pointer[syscall.Rlimit]
+
+// ClearRlimitNofileCache clears go runtime's nofile rlimit cache.
+// The argument (process RLIMIT_NOFILE values) is ignored by this variant.
+func ClearRlimitNofileCache(_ *syscall.Rlimit) {
+	// As reported in issue #4195, the new version of go runtime (since 1.19)
+	// caches rlimit-nofile. Before executing execve, the rlimit-nofile of
+	// the process is restored from that cache. In runc, this causes the
+	// rlimit-nofile set by the parent process for the container to become
+	// invalid. Clearing the cache solves this, but unfortunately the go
+	// stdlib provides no function for it, so we link to the private var
+	// `origRlimitNofile` in package syscall and reset it directly.
+	syscallOrigRlimitNofile.Store(nil)
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
@@ -0,0 +1,119 @@
+package utils
+
+/*
+ * Copyright 2016, 2017 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import (
+	"fmt"
+	"os"
+	"runtime"
+
+	"golang.org/x/sys/unix"
+)
+
+// MaxNameLen is the maximum length of the name of a file descriptor being sent
+// using SendFile. The name of the file handle returned by RecvFile will never be
+// larger than this value.
+const MaxNameLen = 4096
+
+// oobSpace is the size of the oob slice required to store a single FD. Note
+// that unix.UnixRights appears to make the assumption that fd is always int32,
+// so sizeof(fd) = 4.
+var oobSpace = unix.CmsgSpace(4)
+
+// RecvFile waits for a single file descriptor sent over the given AF_UNIX
+// socket and wraps it in an *os.File whose name is the regular (non-ancillary)
+// payload of that message. Anything but exactly one descriptor is an error.
+func RecvFile(socket *os.File) (_ *os.File, Err error) {
+	name := make([]byte, MaxNameLen)
+	oob := make([]byte, oobSpace)
+
+	sockfd := socket.Fd()
+	n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
+	if err != nil {
+		return nil, err
+	}
+	if n >= MaxNameLen || oobn != oobSpace {
+		return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
+	}
+	// Trim both buffers to the number of bytes actually received.
+	name = name[:n]
+	oob = oob[:oobn]
+
+	scms, err := unix.ParseSocketControlMessage(oob)
+	if err != nil {
+		return nil, err
+	}
+
+	// We cannot control how many SCM_RIGHTS we receive, and upon receiving
+	// them all of the descriptors are installed in our fd table, so we need to
+	// parse all of the SCM_RIGHTS we received in order to close all of the
+	// descriptors on error. The deferred function reads the named result Err.
+	var fds []int
+	defer func() {
+		for i, fd := range fds {
+			if i == 0 && Err == nil {
+				// Success: the first fd is handed to the caller, keep it open.
+				continue
+			}
+			// Everything else (all fds on error, extras always) is closed.
+			_ = unix.Close(fd)
+		}
+	}()
+	var lastErr error
+	for _, scm := range scms {
+		if scm.Header.Type == unix.SCM_RIGHTS {
+			scmFds, err := unix.ParseUnixRights(&scm)
+			if err != nil {
+				lastErr = err
+			} else {
+				fds = append(fds, scmFds...)
+			}
+		}
+	}
+	if lastErr != nil {
+		return nil, lastErr
+	}
+
+	// These checks come after collecting the fds so that the deferred cleanup
+	// closes every one of them when an error is returned here.
+	if len(scms) != 1 {
+		return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
+	}
+	if len(fds) != 1 {
+		return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
+	}
+	return os.NewFile(uintptr(fds[0]), string(name)), nil
+}
+
+// SendFile sends file over the given AF_UNIX socket together with
+// file.Name(), so that a peer using RecvFile gets a file carrying the same
+// name information. Names of MaxNameLen bytes or more are rejected.
+func SendFile(socket *os.File, file *os.File) error {
+	name := file.Name()
+	if len(name) >= MaxNameLen {
+		return fmt.Errorf("sendfd: filename too long: %s", name)
+	}
+	// Keep file reachable until its raw descriptor has been sent.
+	defer runtime.KeepAlive(file)
+	return SendRawFd(socket, name, file.Fd())
+}
+
+// SendRawFd sends the file descriptor fd over the given AF_UNIX socket, with
+// msg as the regular payload and the descriptor as unix.UnixRights data.
+func SendRawFd(socket *os.File, msg string, fd uintptr) error {
+	return unix.Sendmsg(int(socket.Fd()), []byte(msg), unix.UnixRights(int(fd)), nil, 0)
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
@@ -0,0 +1,115 @@
+package utils
+
+import (
+	"encoding/json"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"golang.org/x/sys/unix"
+)
+
+const (
+	// exitSignalOffset is added to the signal number to form the exit status
+	// that ExitStatus reports for a process that was terminated by a signal.
+	exitSignalOffset = 128
+)
+
+// ExitStatus converts a wait status into a process exit code. A process that
+// exited normally yields its own exit status; a process killed by a signal
+// yields exitSignalOffset plus the signal number.
+func ExitStatus(status unix.WaitStatus) int {
+	if !status.Signaled() {
+		return status.ExitStatus()
+	}
+	return exitSignalOffset + int(status.Signal())
+}
+
+// WriteJSON marshals v with the standard json package and writes the result
+// to w without a trailing newline. json.Encoder is deliberately not used
+// because there might be a problem in the json decoder in some cases, see:
+// https://github.com/docker/docker/issues/14203#issuecomment-174177790
+//
+// Either the marshaling error or the write error is returned, whichever
+// happens first.
+func WriteJSON(w io.Writer, v interface{}) error {
+	encoded, err := json.Marshal(v)
+	if err != nil {
+		return err
+	}
+	if _, err := w.Write(encoded); err != nil {
+		return err
+	}
+	return nil
+}
+
+// CleanPath makes a path safe for use with filepath.Join. Absolute paths are
+// simply cleaned. Relative paths are cleaned, then anchored at '/', cleaned
+// again (which discards any leading ".." components) and finally made
+// relative once more. As a result, prepending another path to the returned
+// value always yields something that is lexically a subdirectory of that
+// prefix. Everything here is purely lexical, so paths that include symlinks
+// are not made safe by CleanPath.
+//
+// An empty path is returned unchanged.
+func CleanPath(path string) string {
+	if path == "" {
+		return ""
+	}
+
+	// First pass: normalise problematic inputs such as "/../../../../../".
+	cleaned := filepath.Clean(path)
+
+	// Absolute paths must stay absolute, so there is nothing more to do for
+	// them beyond one more cleaning pass for good measure.
+	if filepath.IsAbs(cleaned) {
+		return filepath.Clean(cleaned)
+	}
+
+	// Relative paths such as "../../../../<etc>/some/path" are resolved
+	// against the root so that the leading ".." components are dropped.
+	sep := string(os.PathSeparator)
+	anchored := filepath.Clean(sep + cleaned)
+	// This can't fail, as (by definition) all paths are relative to root.
+	relative, _ := filepath.Rel(sep, anchored)
+	return filepath.Clean(relative)
+}
+
+// stripRoot returns path with the root prefix removed if path is lexically
+// inside root. Both arguments are always treated as absolute and are cleaned
+// before comparison; the returned path is always absolute as well.
+func stripRoot(root, path string) string {
+	// Make the paths clean and absolute.
+	root, path = CleanPath("/"+root), CleanPath("/"+path)
+
+	stripped := path
+	if path == root {
+		// The path is the root itself.
+		stripped = "/"
+	} else if root != "/" {
+		// TrimPrefix leaves the path untouched when it is not under root.
+		stripped = strings.TrimPrefix(path, root+"/")
+	}
+	return CleanPath("/" + stripped)
+}
+
+// SearchLabels looks for key in a list of "key=value" labels. It returns the
+// value of the first label whose key matches, together with true; if no label
+// matches it returns an empty string and false.
+func SearchLabels(labels []string, key string) (string, bool) {
+	prefix := key + "="
+	for i := range labels {
+		if label := labels[i]; strings.HasPrefix(label, prefix) {
+			return strings.TrimPrefix(label, prefix), true
+		}
+	}
+	return "", false
+}
+
+// Annotations splits libcontainer state labels into the bundle path and the
+// user defined annotations. The "bundle" label is added by libcontainer, so
+// it is returned separately rather than as an annotation. Labels without an
+// "=" separator are ignored.
+func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
+	userAnnotations = make(map[string]string)
+	for _, label := range labels {
+		key, val, found := strings.Cut(label, "=")
+		switch {
+		case !found:
+			// Not a key=value pair; skip it.
+		case key == "bundle":
+			bundle = val
+		default:
+			userAnnotations[key] = val
+		}
+	}
+	return bundle, userAnnotations
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
@@ -0,0 +1,360 @@
+//go:build !windows
+
+package utils
+
+import (
+	"fmt"
+	"math"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strconv"
+	"strings"
+	"sync"
+	_ "unsafe" // for go:linkname
+
+	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+)
+
+// EnsureProcHandle verifies that the given file handle is on procfs. It
+// returns nil if the filesystem type reported by fstatfs(2) is procfs, and an
+// error if the fstatfs call fails or the handle is on another filesystem.
+func EnsureProcHandle(fh *os.File) error {
+	var fsInfo unix.Statfs_t
+	err := unix.Fstatfs(int(fh.Fd()), &fsInfo)
+	switch {
+	case err != nil:
+		return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
+	case fsInfo.Type != unix.PROC_SUPER_MAGIC:
+		return fmt.Errorf("%s is not on procfs", fh.Name())
+	}
+	return nil
+}
+
+var (
+	// haveCloseRangeCloexecBool caches the result of the probe done by
+	// haveCloseRangeCloexec. It is only written from inside
+	// haveCloseRangeCloexecOnce.
+	haveCloseRangeCloexecBool bool
+	haveCloseRangeCloexecOnce sync.Once
+)
+
+// haveCloseRangeCloexec reports whether close_range(CLOSE_RANGE_CLOEXEC) is
+// usable. The probe runs at most once; later calls return the cached result.
+func haveCloseRangeCloexec() bool {
+	haveCloseRangeCloexecOnce.Do(func() {
+		// Make sure we're not closing a random file descriptor.
+		// The probe operates on a fresh duplicate of fd 0 instead.
+		tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
+		if err != nil {
+			// Could not create a scratch descriptor; the cached result
+			// stays false.
+			return
+		}
+		defer unix.Close(tmpFd)
+
+		err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
+		// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
+		// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
+		// other potential error would imply that even the most basic close
+		// operation wouldn't work.
+		haveCloseRangeCloexecBool = err == nil
+	})
+	return haveCloseRangeCloexecBool
+}
+
+// fdFunc is the callback type invoked by fdRangeFrom with each matching file
+// descriptor number.
+type fdFunc func(fd int)
+
+// fdRangeFrom calls the passed fdFunc for each file descriptor greater than
+// or equal to minFd that is listed in the fd directory returned by
+// ProcThreadSelf("fd"). The descriptor used to read that directory is
+// skipped.
+//
+// An error is returned if the fd directory cannot be opened, is not on
+// procfs, or cannot be listed. fn itself cannot report errors.
+func fdRangeFrom(minFd int, fn fdFunc) error {
+	procSelfFd, closer := ProcThreadSelf("fd")
+	defer closer()
+
+	fdDir, err := os.Open(procSelfFd)
+	if err != nil {
+		return err
+	}
+	defer fdDir.Close()
+
+	// Refuse to trust the listing unless it really comes from procfs.
+	if err := EnsureProcHandle(fdDir); err != nil {
+		return err
+	}
+
+	// Read the complete listing up front, before fn is run on any entry.
+	fdList, err := fdDir.Readdirnames(-1)
+	if err != nil {
+		return err
+	}
+	for _, fdStr := range fdList {
+		fd, err := strconv.Atoi(fdStr)
+		// Ignore non-numeric file names.
+		if err != nil {
+			continue
+		}
+		// Ignore descriptors lower than our specified minimum.
+		if fd < minFd {
+			continue
+		}
+		// Ignore the file descriptor we used for readdir, as it will be closed
+		// when we return.
+		if uintptr(fd) == fdDir.Fd() {
+			continue
+		}
+		// Run the closure.
+		fn(fd)
+	}
+	return nil
+}
+
+// CloseExecFrom marks every file descriptor in the current process that is
+// greater than or equal to minFd as close-on-exec (O_CLOEXEC).
+func CloseExecFrom(minFd int) error {
+	if !haveCloseRangeCloexec() {
+		// No usable close_range(CLOSE_RANGE_CLOEXEC): walk the open
+		// descriptors one by one instead.
+		return fdRangeFrom(minFd, unix.CloseOnExec)
+	}
+	// Fast path: a single close_range(2) call covers the whole range.
+	err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
+	return os.NewSyscallError("close_range", err)
+}
+
+//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
+
+// runtime_IsPollDescriptor is linked to "internal/poll".IsPollDescriptor via
+// the go:linkname directive above, which is why it has no body here.
+//
+// In order to make sure we do not close the internal epoll descriptors the Go
+// runtime uses, we need to ensure that we skip descriptors that match
+// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
+// unfortunately there's no other way to be sure we're only keeping the file
+// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
+func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
+
+// UnsafeCloseFrom closes every file descriptor in the current process that is
+// greater than or equal to minFd, except for those critical to Go's runtime
+// (such as the netpoll management descriptors).
+//
+// NOTE: This function is incredibly dangerous to use in most Go code, as
+// closing file descriptors from underneath *os.File handles can lead to very
+// bad behaviour (the closed file descriptor can be re-used and then any
+// *os.File operations would apply to the wrong file). This function is only
+// intended to be called from the last stage of runc init.
+func UnsafeCloseFrom(minFd int) error {
+	closeUnlessRuntimeFd := func(fd int) {
+		// Descriptors for which runtime_IsPollDescriptor is true are the Go
+		// runtime's internal netpoll file descriptors. They are operated on
+		// deep in the Go scheduler, and closing them from underneath Go can
+		// result in panics. There is no issue with keeping them because they
+		// are not executable and are not useful to an attacker anyway. Also
+		// we don't have any choice.
+		if !runtime_IsPollDescriptor(uintptr(fd)) {
+			// There's nothing we can do about errors from close(2), and the
+			// only likely error to be seen is EBADF which indicates the fd
+			// was already closed (in which case, we got what we wanted).
+			_ = unix.Close(fd)
+		}
+	}
+	// We cannot use close_range(2) even if it is available, because we must
+	// not close some file descriptors.
+	return fdRangeFrom(minFd, closeUnlessRuntimeFd)
+}
+
+// NewSockPair creates a close-on-exec SOCK_STREAM unix socket pair. The two
+// ends are wrapped as files named name+"-p" (parent) and name+"-c" (child).
+func NewSockPair(name string) (parent, child *os.File, err error) {
+	pair, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
+	if err != nil {
+		return nil, nil, err
+	}
+	// The second descriptor becomes the parent end, the first the child end.
+	parent = os.NewFile(uintptr(pair[1]), name+"-p")
+	child = os.NewFile(uintptr(pair[0]), name+"-c")
+	return parent, child, nil
+}
+
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
+// corresponding to the unsafePath resolved within the root. Before passing the
+// fd, this path is verified to have been inside the root -- so operating on it
+// through the passed fdpath should be safe. Do not access this path through
+// the original path strings, and do not attempt to use the pathname outside of
+// the passed closure (the file handle will be freed once the closure returns).
+//
+// The returned error is either a failure to resolve, open or verify the path,
+// or whatever fn itself returns.
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
+	// Remove the root then forcefully resolve inside the root.
+	unsafePath = stripRoot(root, unsafePath)
+	path, err := securejoin.SecureJoin(root, unsafePath)
+	if err != nil {
+		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
+	}
+
+	// closer must run once we are done with the procfd path, which is only
+	// after fn has returned.
+	procSelfFd, closer := ProcThreadSelf("fd/")
+	defer closer()
+
+	// Open the target path.
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("open o_path procfd: %w", err)
+	}
+	defer fh.Close()
+
+	procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
+	// Double-check the path is the one we expected.
+	// If the link target differs from the resolved path, we did not open the
+	// path that was resolved above, so refuse to continue.
+	if realpath, err := os.Readlink(procfd); err != nil {
+		return fmt.Errorf("procfd verification failed: %w", err)
+	} else if realpath != path {
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
+	}
+
+	return fn(procfd)
+}
+
+// ProcThreadSelfCloser is the cleanup function returned by ProcThreadSelf. It
+// must be called once the caller has finished using the returned path.
+type ProcThreadSelfCloser func()
+
+var (
+	// haveProcThreadSelf records whether /proc/thread-self/ could be stat'd.
+	// It is set at most once, guarded by haveProcThreadSelfOnce.
+	haveProcThreadSelf     bool
+	haveProcThreadSelfOnce sync.Once
+)
+
+// ProcThreadSelf returns a string that is equivalent to
+// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
+// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
+// meaning that the passed string needs to be trusted. The caller _must_ call
+// the returned ProcThreadSelfCloser function (which is runtime.UnlockOSThread)
+// *only once* after it has finished using the returned path string.
+//
+// subpath is appended verbatim to the chosen prefix, which always ends in
+// a '/'.
+func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
+	haveProcThreadSelfOnce.Do(func() {
+		if _, err := os.Stat("/proc/thread-self/"); err == nil {
+			haveProcThreadSelf = true
+		} else {
+			logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
+		}
+	})
+
+	// We need to lock our thread until the caller is done with the path string
+	// because any non-atomic operation on the path (such as opening a file,
+	// then reading it) could be interrupted by the Go runtime where the
+	// underlying thread is swapped out and the original thread is killed,
+	// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
+	// addition, the pre-3.17 fallback makes everything non-atomic because the
+	// same thing could happen between unix.Gettid() and the path operations.
+	//
+	// In theory, we don't need to lock in the atomic user case when using
+	// /proc/thread-self/, but it's better to be safe than sorry (and there are
+	// only one or two truly atomic users of /proc/thread-self/).
+	runtime.LockOSThread()
+
+	threadSelf := "/proc/thread-self/"
+	if !haveProcThreadSelf {
+		// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
+		threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
+		if _, err := os.Stat(threadSelf); err != nil {
+			// Unfortunately, this code is called from rootfs_linux.go where we
+			// are running inside the pid namespace of the container but /proc
+			// is the host's procfs. Unfortunately there is no real way to get
+			// the correct tid to use here (the kernel age means we cannot do
+			// things like set up a private fsopen("proc") -- even scanning
+			// NSpid in all of the tasks in /proc/self/task/*/status requires
+			// Linux 4.1).
+			//
+			// So, we just have to assume that /proc/self is acceptable in this
+			// one specific case.
+			if os.Getpid() == 1 {
+				logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
+			} else {
+				// This should never happen, but the fallback should work in most cases...
+				logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
+			}
+			threadSelf = "/proc/self/"
+		}
+	}
+	// The thread stays locked until the caller invokes the returned closer.
+	return threadSelf + subpath, runtime.UnlockOSThread
+}
+
+// ProcThreadSelfFd is a small wrapper around ProcThreadSelf that makes it
+// easier to create a /proc/thread-self handle for a given file descriptor.
+//
+// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
+// formats the number with strconv to avoid the overhead of fmt.Sprintf.
+func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
+	fdNum := strconv.FormatUint(uint64(fd), 10)
+	return ProcThreadSelf("fd/" + fdNum)
+}
+
+// IsLexicallyInRoot reports whether path is lexically equal to or below root.
+// It is shorthand for strings.HasPrefix(path+"/", root+"/"), except that a
+// path or root of exactly "/" is compared as-is rather than becoming "//".
+//
+// NOTE: The return value only makes sense if the path doesn't contain "..".
+func IsLexicallyInRoot(root, path string) bool {
+	// withTrailingSlash terminates p with a separator, leaving "/" untouched.
+	withTrailingSlash := func(p string) string {
+		if p == "/" {
+			return p
+		}
+		return p + "/"
+	}
+	return strings.HasPrefix(withTrailingSlash(path), withTrailingSlash(root))
+}
+
+// MkdirAllInRootOpen attempts to make
+//
+//	path, _ := securejoin.SecureJoin(root, unsafePath)
+//	os.MkdirAll(path, mode)
+//	os.Open(path)
+//
+// safer against attacks where components in the path are changed between
+// SecureJoin returning and MkdirAll (or Open) being called. In particular, we
+// try to detect any symlink components in the path while we are doing the
+// MkdirAll.
+//
+// NOTE: If unsafePath is a subpath of root, we assume that you have already
+// called SecureJoin and so we use the provided path verbatim without resolving
+// any symlinks (this is done in a way that avoids symlink-exchange races).
+// This means that the path also must not contain ".." elements, otherwise an
+// error will occur.
+//
+// This uses securejoin.MkdirAllHandle under the hood, but it has special
+// handling if unsafePath has already been scoped within the rootfs (this is
+// needed for a lot of runc callers and fixing this would require reworking a
+// lot of path logic).
+//
+// mode may only contain bits within 0o7777; anything else is rejected with an
+// error. Bits outside 0o1777 are logged as ignored and cleared before use.
+func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (_ *os.File, Err error) {
+	// If the path is already "within" the root, get the path relative to the
+	// root and use that as the unsafe path. This is necessary because a lot of
+	// MkdirAllInRootOpen callers have already done SecureJoin, and refactoring
+	// all of them to stop using these SecureJoin'd paths would require a fair
+	// amount of work.
+	// TODO(cyphar): Do the refactor to libpathrs once it's ready.
+	if IsLexicallyInRoot(root, unsafePath) {
+		subPath, err := filepath.Rel(root, unsafePath)
+		if err != nil {
+			return nil, err
+		}
+		unsafePath = subPath
+	}
+
+	// Check for any silly mode bits.
+	if mode&^0o7777 != 0 {
+		return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode)
+	}
+	// Linux (and thus os.MkdirAll) silently ignores the suid and sgid bits if
+	// passed. While it would make sense to return an error in that case (since
+	// the user has asked for a mode that won't be applied), for compatibility
+	// reasons we have to ignore these bits.
+	if ignoredBits := mode &^ 0o1777; ignoredBits != 0 {
+		logrus.Warnf("MkdirAll called with no-op mode bits that are ignored by Linux: 0o%.3o", ignoredBits)
+		mode &= 0o1777
+	}
+
+	// The root handle is only needed for the MkdirAllHandle call below and is
+	// closed when this function returns.
+	rootDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return nil, fmt.Errorf("open root handle: %w", err)
+	}
+	defer rootDir.Close()
+
+	return securejoin.MkdirAllHandle(rootDir, unsafePath, mode)
+}
+
+// MkdirAllInRoot behaves like MkdirAllInRootOpen but closes the returned
+// handle straight away, for callers that only need the directories created.
+func MkdirAllInRoot(root, unsafePath string, mode os.FileMode) error {
+	handle, err := MkdirAllInRootOpen(root, unsafePath, mode)
+	if err != nil {
+		return err
+	}
+	// The caller only wants the directory to exist; a close error on the
+	// handle is not reported.
+	_ = handle.Close()
+	return nil
+}
+
+// Openat is a Go-friendly openat(2) wrapper.
+//
+// path is opened relative to dir, or relative to the current working
+// directory (AT_FDCWD) when dir is nil. O_CLOEXEC is always added to flags.
+// The returned file is named dir.Name()+"/"+path, or just path when dir is
+// nil. On failure the error is an *os.PathError with Op "openat".
+func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
+	// Work out both the dirfd and the name of the resulting file up front, so
+	// that a nil dir (meaning AT_FDCWD) is never dereferenced.
+	dirFd, fullPath := unix.AT_FDCWD, path
+	if dir != nil {
+		dirFd, fullPath = int(dir.Fd()), dir.Name()+"/"+path
+	}
+	flags |= unix.O_CLOEXEC
+
+	fd, err := unix.Openat(dirFd, path, flags, mode)
+	// Keep dir reachable until the syscall has returned, so the raw
+	// descriptor taken from dir.Fd() stays valid for the whole call.
+	runtime.KeepAlive(dir)
+	if err != nil {
+		return nil, &os.PathError{Op: "openat", Path: path, Err: err}
+	}
+	return os.NewFile(uintptr(fd), fullPath), nil
+}