//go:build !windows package utils import ( "fmt" "math" "os" "path/filepath" "runtime" "strconv" "strings" "sync" _ "unsafe" // for go:linkname securejoin "github.com/cyphar/filepath-securejoin" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) // EnsureProcHandle returns whether or not the given file handle is on procfs. func EnsureProcHandle(fh *os.File) error { var buf unix.Statfs_t if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err) } if buf.Type != unix.PROC_SUPER_MAGIC { return fmt.Errorf("%s is not on procfs", fh.Name()) } return nil } var ( haveCloseRangeCloexecBool bool haveCloseRangeCloexecOnce sync.Once ) func haveCloseRangeCloexec() bool { haveCloseRangeCloexecOnce.Do(func() { // Make sure we're not closing a random file descriptor. tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0) if err != nil { return } defer unix.Close(tmpFd) err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC) // Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC). // -ENOSYS and -EINVAL ultimately mean we don't have support, but any // other potential error would imply that even the most basic close // operation wouldn't work. haveCloseRangeCloexecBool = err == nil }) return haveCloseRangeCloexecBool } type fdFunc func(fd int) // fdRangeFrom calls the passed fdFunc for each file descriptor that is open in // the current process. func fdRangeFrom(minFd int, fn fdFunc) error { procSelfFd, closer := ProcThreadSelf("fd") defer closer() fdDir, err := os.Open(procSelfFd) if err != nil { return err } defer fdDir.Close() if err := EnsureProcHandle(fdDir); err != nil { return err } fdList, err := fdDir.Readdirnames(-1) if err != nil { return err } for _, fdStr := range fdList { fd, err := strconv.Atoi(fdStr) // Ignore non-numeric file names. if err != nil { continue } // Ignore descriptors lower than our specified minimum. if fd < minFd { continue } // Ignore the file descriptor we used for readdir, as it will be closed // when we return. if uintptr(fd) == fdDir.Fd() { continue } // Run the closure. fn(fd) } return nil } // CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or // equal to minFd in the current process. func CloseExecFrom(minFd int) error { // Use close_range(CLOSE_RANGE_CLOEXEC) if possible. if haveCloseRangeCloexec() { err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC) return os.NewSyscallError("close_range", err) } // Otherwise, fall back to the standard loop. return fdRangeFrom(minFd, unix.CloseOnExec) } //go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor // In order to make sure we do not close the internal epoll descriptors the Go // runtime uses, we need to ensure that we skip descriptors that match // "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, // unfortunately there's no other way to be sure we're only keeping the file // descriptors the Go runtime needs. Hopefully nothing blows up doing this... func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive // UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the // current process, except for those critical to Go's runtime (such as the // netpoll management descriptors). // // NOTE: That this function is incredibly dangerous to use in most Go code, as // closing file descriptors from underneath *os.File handles can lead to very // bad behaviour (the closed file descriptor can be re-used and then any // *os.File operations would apply to the wrong file). This function is only // intended to be called from the last stage of runc init. func UnsafeCloseFrom(minFd int) error { // We cannot use close_range(2) even if it is available, because we must // not close some file descriptors. return fdRangeFrom(minFd, func(fd int) { if runtime_IsPollDescriptor(uintptr(fd)) { // These are the Go runtimes internal netpoll file descriptors. // These file descriptors are operated on deep in the Go scheduler, // and closing those files from underneath Go can result in panics. // There is no issue with keeping them because they are not // executable and are not useful to an attacker anyway. Also we // don't have any choice. return } // There's nothing we can do about errors from close(2), and the // only likely error to be seen is EBADF which indicates the fd was // already closed (in which case, we got what we wanted). _ = unix.Close(fd) }) } // NewSockPair returns a new SOCK_STREAM unix socket pair. func NewSockPair(name string) (parent, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) if err != nil { return nil, nil, err } return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil } // WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) // corresponding to the unsafePath resolved within the root. Before passing the // fd, this path is verified to have been inside the root -- so operating on it // through the passed fdpath should be safe. Do not access this path through // the original path strings, and do not attempt to use the pathname outside of // the passed closure (the file handle will be freed once the closure returns). func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { // Remove the root then forcefully resolve inside the root. unsafePath = stripRoot(root, unsafePath) path, err := securejoin.SecureJoin(root, unsafePath) if err != nil { return fmt.Errorf("resolving path inside rootfs failed: %w", err) } procSelfFd, closer := ProcThreadSelf("fd/") defer closer() // Open the target path. fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) if err != nil { return fmt.Errorf("open o_path procfd: %w", err) } defer fh.Close() procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd()))) // Double-check the path is the one we expected. if realpath, err := os.Readlink(procfd); err != nil { return fmt.Errorf("procfd verification failed: %w", err) } else if realpath != path { return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) } return fn(procfd) } type ProcThreadSelfCloser func() var ( haveProcThreadSelf bool haveProcThreadSelfOnce sync.Once ) // ProcThreadSelf returns a string that is equivalent to // /proc/thread-self/, with a graceful fallback on older kernels where // /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin, // meaning that the passed string needs to be trusted. The caller _must_ call // the returned procThreadSelfCloser function (which is runtime.UnlockOSThread) // *only once* after it has finished using the returned path string. func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) { haveProcThreadSelfOnce.Do(func() { if _, err := os.Stat("/proc/thread-self/"); err == nil { haveProcThreadSelf = true } else { logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err) } }) // We need to lock our thread until the caller is done with the path string // because any non-atomic operation on the path (such as opening a file, // then reading it) could be interrupted by the Go runtime where the // underlying thread is swapped out and the original thread is killed, // resulting in pull-your-hair-out-hard-to-debug issues in the caller. In // addition, the pre-3.17 fallback makes everything non-atomic because the // same thing could happen between unix.Gettid() and the path operations. // // In theory, we don't need to lock in the atomic user case when using // /proc/thread-self/, but it's better to be safe than sorry (and there are // only one or two truly atomic users of /proc/thread-self/). runtime.LockOSThread() threadSelf := "/proc/thread-self/" if !haveProcThreadSelf { // Pre-3.17 kernels did not have /proc/thread-self, so do it manually. threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/" if _, err := os.Stat(threadSelf); err != nil { // Unfortunately, this code is called from rootfs_linux.go where we // are running inside the pid namespace of the container but /proc // is the host's procfs. Unfortunately there is no real way to get // the correct tid to use here (the kernel age means we cannot do // things like set up a private fsopen("proc") -- even scanning // NSpid in all of the tasks in /proc/self/task/*/status requires // Linux 4.1). // // So, we just have to assume that /proc/self is acceptable in this // one specific case. if os.Getpid() == 1 { logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err) } else { // This should never happen, but the fallback should work in most cases... logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err) } threadSelf = "/proc/self/" } } return threadSelf + subpath, runtime.UnlockOSThread } // ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to // create a /proc/thread-self handle for given file descriptor. // // It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but // without using fmt.Sprintf to avoid unneeded overhead. func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) { return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10)) } // IsLexicallyInRoot is shorthand for strings.HasPrefix(path+"/", root+"/"), // but properly handling the case where path or root are "/". // // NOTE: The return value only make sense if the path doesn't contain "..". func IsLexicallyInRoot(root, path string) bool { if root != "/" { root += "/" } if path != "/" { path += "/" } return strings.HasPrefix(path, root) } // MkdirAllInRootOpen attempts to make // // path, _ := securejoin.SecureJoin(root, unsafePath) // os.MkdirAll(path, mode) // os.Open(path) // // safer against attacks where components in the path are changed between // SecureJoin returning and MkdirAll (or Open) being called. In particular, we // try to detect any symlink components in the path while we are doing the // MkdirAll. // // NOTE: If unsafePath is a subpath of root, we assume that you have already // called SecureJoin and so we use the provided path verbatim without resolving // any symlinks (this is done in a way that avoids symlink-exchange races). // This means that the path also must not contain ".." elements, otherwise an // error will occur. // // This uses securejoin.MkdirAllHandle under the hood, but it has special // handling if unsafePath has already been scoped within the rootfs (this is // needed for a lot of runc callers and fixing this would require reworking a // lot of path logic). func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (_ *os.File, Err error) { // If the path is already "within" the root, get the path relative to the // root and use that as the unsafe path. This is necessary because a lot of // MkdirAllInRootOpen callers have already done SecureJoin, and refactoring // all of them to stop using these SecureJoin'd paths would require a fair // amount of work. // TODO(cyphar): Do the refactor to libpathrs once it's ready. if IsLexicallyInRoot(root, unsafePath) { subPath, err := filepath.Rel(root, unsafePath) if err != nil { return nil, err } unsafePath = subPath } // Check for any silly mode bits. if mode&^0o7777 != 0 { return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode) } // Linux (and thus os.MkdirAll) silently ignores the suid and sgid bits if // passed. While it would make sense to return an error in that case (since // the user has asked for a mode that won't be applied), for compatibility // reasons we have to ignore these bits. if ignoredBits := mode &^ 0o1777; ignoredBits != 0 { logrus.Warnf("MkdirAll called with no-op mode bits that are ignored by Linux: 0o%.3o", ignoredBits) mode &= 0o1777 } rootDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0) if err != nil { return nil, fmt.Errorf("open root handle: %w", err) } defer rootDir.Close() return securejoin.MkdirAllHandle(rootDir, unsafePath, mode) } // MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the // returned handle, for callers that don't need to use it. func MkdirAllInRoot(root, unsafePath string, mode os.FileMode) error { f, err := MkdirAllInRootOpen(root, unsafePath, mode) if err == nil { _ = f.Close() } return err } // Openat is a Go-friendly openat(2) wrapper. func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) { dirFd := unix.AT_FDCWD if dir != nil { dirFd = int(dir.Fd()) } flags |= unix.O_CLOEXEC fd, err := unix.Openat(dirFd, path, flags, mode) if err != nil { return nil, &os.PathError{Op: "openat", Path: path, Err: err} } return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil }