mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
Use libcontainer execseal to run ldconfig
This change copies ldconfig into a memfd before executing it from the createContainer hook. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
191
vendor/github.com/opencontainers/runc/LICENSE
generated
vendored
Normal file
191
vendor/github.com/opencontainers/runc/LICENSE
generated
vendored
Normal file
@@ -0,0 +1,191 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
Copyright 2014 Docker, Inc.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
17
vendor/github.com/opencontainers/runc/NOTICE
generated
vendored
Normal file
17
vendor/github.com/opencontainers/runc/NOTICE
generated
vendored
Normal file
@@ -0,0 +1,17 @@
|
||||
runc
|
||||
|
||||
Copyright 2012-2015 Docker, Inc.
|
||||
|
||||
This product includes software developed at Docker, Inc. (http://www.docker.com).
|
||||
|
||||
The following is courtesy of our legal counsel:
|
||||
|
||||
|
||||
Use and transfer of Docker may be subject to certain restrictions by the
|
||||
United States and other governments.
|
||||
It is your responsibility to ensure that your use and/or transfer does not
|
||||
violate applicable laws.
|
||||
|
||||
For more information, please see http://www.bis.doc.gov
|
||||
|
||||
See also http://www.apache.org/dev/crypto.html and/or seek legal counsel.
|
||||
258
vendor/github.com/opencontainers/runc/libcontainer/dmz/cloned_binary_linux.go
generated
vendored
Normal file
258
vendor/github.com/opencontainers/runc/libcontainer/dmz/cloned_binary_linux.go
generated
vendored
Normal file
@@ -0,0 +1,258 @@
|
||||
package dmz
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strconv"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
)
|
||||
|
||||
type SealFunc func(**os.File) error
|
||||
|
||||
var (
|
||||
_ SealFunc = sealMemfd
|
||||
_ SealFunc = sealFile
|
||||
)
|
||||
|
||||
func isExecutable(f *os.File) bool {
|
||||
if err := unix.Faccessat(int(f.Fd()), "", unix.X_OK, unix.AT_EACCESS|unix.AT_EMPTY_PATH); err == nil {
|
||||
return true
|
||||
} else if err == unix.EACCES {
|
||||
return false
|
||||
}
|
||||
path := "/proc/self/fd/" + strconv.Itoa(int(f.Fd()))
|
||||
if err := unix.Access(path, unix.X_OK); err == nil {
|
||||
return true
|
||||
} else if err == unix.EACCES {
|
||||
return false
|
||||
}
|
||||
// Cannot check -- assume it's executable (if not, exec will fail).
|
||||
logrus.Debugf("cannot do X_OK check on binary %s -- assuming it's executable", f.Name())
|
||||
return true
|
||||
}
|
||||
|
||||
const baseMemfdSeals = unix.F_SEAL_SEAL | unix.F_SEAL_SHRINK | unix.F_SEAL_GROW | unix.F_SEAL_WRITE
|
||||
|
||||
func sealMemfd(f **os.File) error {
|
||||
if err := (*f).Chmod(0o511); err != nil {
|
||||
return err
|
||||
}
|
||||
// Try to set the newer memfd sealing flags, but we ignore
|
||||
// errors because they are not needed and we want to continue
|
||||
// to work on older kernels.
|
||||
fd := (*f).Fd()
|
||||
// F_SEAL_FUTURE_WRITE -- Linux 5.1
|
||||
_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, unix.F_SEAL_FUTURE_WRITE)
|
||||
// F_SEAL_EXEC -- Linux 6.3
|
||||
const F_SEAL_EXEC = 0x20 //nolint:revive // this matches the unix.* name
|
||||
_, _ = unix.FcntlInt(fd, unix.F_ADD_SEALS, F_SEAL_EXEC)
|
||||
// Apply all original memfd seals.
|
||||
_, err := unix.FcntlInt(fd, unix.F_ADD_SEALS, baseMemfdSeals)
|
||||
return os.NewSyscallError("fcntl(F_ADD_SEALS)", err)
|
||||
}
|
||||
|
||||
// Memfd creates a sealable executable memfd (supported since Linux 3.17).
|
||||
func Memfd(comment string) (*os.File, SealFunc, error) {
|
||||
file, err := system.ExecutableMemfd("runc_cloned:"+comment, unix.MFD_ALLOW_SEALING|unix.MFD_CLOEXEC)
|
||||
return file, sealMemfd, err
|
||||
}
|
||||
|
||||
func sealFile(f **os.File) error {
|
||||
// When sealing an O_TMPFILE-style descriptor we need to
|
||||
// re-open the path as O_PATH to clear the existing write
|
||||
// handle we have.
|
||||
opath, err := os.OpenFile(fmt.Sprintf("/proc/self/fd/%d", (*f).Fd()), unix.O_PATH|unix.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("reopen tmpfile: %w", err)
|
||||
}
|
||||
_ = (*f).Close()
|
||||
*f = opath
|
||||
return nil
|
||||
}
|
||||
|
||||
// otmpfile creates an open(O_TMPFILE) file in the given directory (supported
|
||||
// since Linux 3.11).
|
||||
func otmpfile(dir string) (*os.File, SealFunc, error) {
|
||||
file, err := os.OpenFile(dir, unix.O_TMPFILE|unix.O_RDWR|unix.O_EXCL|unix.O_CLOEXEC, 0o700)
|
||||
if err != nil {
|
||||
return nil, nil, fmt.Errorf("O_TMPFILE creation failed: %w", err)
|
||||
}
|
||||
// Make sure we actually got an unlinked O_TMPFILE descriptor.
|
||||
var stat unix.Stat_t
|
||||
if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
|
||||
file.Close()
|
||||
return nil, nil, fmt.Errorf("cannot fstat O_TMPFILE fd: %w", err)
|
||||
} else if stat.Nlink != 0 {
|
||||
file.Close()
|
||||
return nil, nil, errors.New("O_TMPFILE has non-zero nlink")
|
||||
}
|
||||
return file, sealFile, err
|
||||
}
|
||||
|
||||
// mktemp creates a classic unlinked file in the given directory.
|
||||
func mktemp(dir string) (*os.File, SealFunc, error) {
|
||||
file, err := os.CreateTemp(dir, "runc.")
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
// Unlink the file and verify it was unlinked.
|
||||
if err := os.Remove(file.Name()); err != nil {
|
||||
return nil, nil, fmt.Errorf("unlinking classic tmpfile: %w", err)
|
||||
}
|
||||
if err := file.Chmod(0o511); err != nil {
|
||||
return nil, nil, fmt.Errorf("chmod classic tmpfile: %w", err)
|
||||
}
|
||||
var stat unix.Stat_t
|
||||
if err := unix.Fstat(int(file.Fd()), &stat); err != nil {
|
||||
return nil, nil, fmt.Errorf("cannot fstat classic tmpfile: %w", err)
|
||||
} else if stat.Nlink != 0 {
|
||||
return nil, nil, fmt.Errorf("classic tmpfile %s has non-zero nlink after unlink", file.Name())
|
||||
}
|
||||
return file, sealFile, err
|
||||
}
|
||||
|
||||
func getSealableFile(comment, tmpDir string) (file *os.File, sealFn SealFunc, err error) {
|
||||
// First, try an executable memfd (supported since Linux 3.17).
|
||||
file, sealFn, err = Memfd(comment)
|
||||
if err == nil {
|
||||
return
|
||||
}
|
||||
logrus.Debugf("memfd cloned binary failed, falling back to O_TMPFILE: %v", err)
|
||||
|
||||
// The tmpDir here (c.root) might be mounted noexec, so we need a couple of
|
||||
// fallbacks to try. It's possible that none of these are writable and
|
||||
// executable, in which case there's nothing we can practically do (other
|
||||
// than mounting our own executable tmpfs, which would have its own
|
||||
// issues).
|
||||
tmpDirs := []string{
|
||||
tmpDir,
|
||||
os.TempDir(),
|
||||
"/tmp",
|
||||
".",
|
||||
"/bin",
|
||||
"/",
|
||||
}
|
||||
|
||||
// Try to fallback to O_TMPFILE (supported since Linux 3.11).
|
||||
for _, dir := range tmpDirs {
|
||||
file, sealFn, err = otmpfile(dir)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if !isExecutable(file) {
|
||||
logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
|
||||
file.Close()
|
||||
continue
|
||||
}
|
||||
return
|
||||
}
|
||||
logrus.Debugf("O_TMPFILE cloned binary failed, falling back to mktemp(): %v", err)
|
||||
// Finally, try a classic unlinked temporary file.
|
||||
for _, dir := range tmpDirs {
|
||||
file, sealFn, err = mktemp(dir)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if !isExecutable(file) {
|
||||
logrus.Debugf("tmpdir %s is noexec -- trying a different tmpdir", dir)
|
||||
file.Close()
|
||||
continue
|
||||
}
|
||||
return
|
||||
}
|
||||
return nil, nil, fmt.Errorf("could not create sealable file for cloned binary: %w", err)
|
||||
}
|
||||
|
||||
// CloneBinary creates a "sealed" clone of a given binary, which can be used to
|
||||
// thwart attempts by the container process to gain access to host binaries
|
||||
// through procfs magic-link shenanigans. For more details on why this is
|
||||
// necessary, see CVE-2019-5736.
|
||||
func CloneBinary(src io.Reader, size int64, name, tmpDir string) (*os.File, error) {
|
||||
logrus.Debugf("cloning %s binary (%d bytes)", name, size)
|
||||
file, sealFn, err := getSealableFile(name, tmpDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
copied, err := system.Copy(file, src)
|
||||
if err != nil {
|
||||
file.Close()
|
||||
return nil, fmt.Errorf("copy binary: %w", err)
|
||||
} else if copied != size {
|
||||
file.Close()
|
||||
return nil, fmt.Errorf("copied binary size mismatch: %d != %d", copied, size)
|
||||
}
|
||||
if err := sealFn(&file); err != nil {
|
||||
file.Close()
|
||||
return nil, fmt.Errorf("could not seal fd: %w", err)
|
||||
}
|
||||
return file, nil
|
||||
}
|
||||
|
||||
// IsCloned returns whether the given file can be guaranteed to be a safe exe.
|
||||
func IsCloned(exe *os.File) bool {
|
||||
seals, err := unix.FcntlInt(exe.Fd(), unix.F_GET_SEALS, 0)
|
||||
if err != nil {
|
||||
// /proc/self/exe is probably not a memfd
|
||||
logrus.Debugf("F_GET_SEALS on %s failed: %v", exe.Name(), err)
|
||||
return false
|
||||
}
|
||||
// The memfd must have all of the base seals applied.
|
||||
logrus.Debugf("checking %s memfd seals: 0x%x", exe.Name(), seals)
|
||||
return seals&baseMemfdSeals == baseMemfdSeals
|
||||
}
|
||||
|
||||
// CloneSelfExe makes a clone of the current process's binary (through
|
||||
// /proc/self/exe). This binary can then be used for "runc init" in order to
|
||||
// make sure the container process can never resolve the original runc binary.
|
||||
// For more details on why this is necessary, see CVE-2019-5736.
|
||||
func CloneSelfExe(tmpDir string) (*os.File, error) {
|
||||
// Try to create a temporary overlayfs to produce a readonly version of
|
||||
// /proc/self/exe that cannot be "unwrapped" by the container. In contrast
|
||||
// to CloneBinary, this technique does not require any extra memory usage
|
||||
// and does not have the (fairly noticeable) performance impact of copying
|
||||
// a large binary file into a memfd.
|
||||
//
|
||||
// Based on some basic performance testing, the overlayfs approach has
|
||||
// effectively no performance overhead (it is on par with both
|
||||
// MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds
|
||||
// around ~60% overhead during container startup.
|
||||
overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir)
|
||||
if err == nil {
|
||||
logrus.Debug("runc-dmz: using overlayfs for sealed /proc/self/exe") // used for tests
|
||||
return overlayFile, nil
|
||||
}
|
||||
logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
|
||||
|
||||
selfExe, err := os.Open("/proc/self/exe")
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("opening current binary: %w", err)
|
||||
}
|
||||
defer selfExe.Close()
|
||||
|
||||
stat, err := selfExe.Stat()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("checking /proc/self/exe size: %w", err)
|
||||
}
|
||||
size := stat.Size()
|
||||
|
||||
return CloneBinary(selfExe, size, "/proc/self/exe", tmpDir)
|
||||
}
|
||||
|
||||
// IsSelfExeCloned returns whether /proc/self/exe is a cloned binary that can
|
||||
// be guaranteed to be safe. This means that it must be a sealed memfd. Other
|
||||
// types of clones cannot be completely verified as safe.
|
||||
func IsSelfExeCloned() bool {
|
||||
selfExe, err := os.Open("/proc/self/exe")
|
||||
if err != nil {
|
||||
logrus.Debugf("open /proc/self/exe failed: %v", err)
|
||||
return false
|
||||
}
|
||||
defer selfExe.Close()
|
||||
return IsCloned(selfExe)
|
||||
}
|
||||
122
vendor/github.com/opencontainers/runc/libcontainer/dmz/overlayfs_linux.go
generated
vendored
Normal file
122
vendor/github.com/opencontainers/runc/libcontainer/dmz/overlayfs_linux.go
generated
vendored
Normal file
@@ -0,0 +1,122 @@
|
||||
package dmz
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
)
|
||||
|
||||
func fsopen(fsName string, flags int) (*os.File, error) {
|
||||
// Make sure we always set O_CLOEXEC.
|
||||
flags |= unix.FSOPEN_CLOEXEC
|
||||
fd, err := unix.Fsopen(fsName, flags)
|
||||
if err != nil {
|
||||
return nil, os.NewSyscallError("fsopen "+fsName, err)
|
||||
}
|
||||
return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
|
||||
}
|
||||
|
||||
func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
|
||||
// Make sure we always set O_CLOEXEC.
|
||||
flags |= unix.FSMOUNT_CLOEXEC
|
||||
fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
|
||||
if err != nil {
|
||||
return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
|
||||
}
|
||||
runtime.KeepAlive(ctx) // make sure fd is kept alive while it's used
|
||||
return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
|
||||
}
|
||||
|
||||
func escapeOverlayLowerDir(path string) string {
|
||||
// If the lowerdir path contains ":" we need to escape them, and if there
|
||||
// were any escape characters already (\) we need to escape those first.
|
||||
return strings.ReplaceAll(strings.ReplaceAll(path, `\`, `\\`), `:`, `\:`)
|
||||
}
|
||||
|
||||
// sealedOverlayfs will create an internal overlayfs mount using fsopen() that
|
||||
// uses the directory containing the binary as a lowerdir and a temporary tmpfs
|
||||
// as an upperdir. There is no way to "unwrap" this (unlike MS_BIND+MS_RDONLY)
|
||||
// and so we can create a safe zero-copy sealed version of /proc/self/exe.
|
||||
// This only works for privileged users and on kernels with overlayfs and
|
||||
// fsopen() enabled.
|
||||
//
|
||||
// TODO: Since Linux 5.11, overlayfs can be created inside user namespaces so
|
||||
// it is technically possible to create an overlayfs even for rootless
|
||||
// containers. Unfortunately, this would require some ugly manual CGo+fork
|
||||
// magic so we can do this later if we feel it's really needed.
|
||||
func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) {
|
||||
// Try to do the superblock creation first to bail out early if we can't
|
||||
// use this method.
|
||||
overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer overlayCtx.Close()
|
||||
|
||||
// binPath is going to be /proc/self/exe, so do a readlink to get the real
|
||||
// path. overlayfs needs the real underlying directory for this protection
|
||||
// mode to work properly.
|
||||
if realPath, err := os.Readlink(binPath); err == nil {
|
||||
binPath = realPath
|
||||
}
|
||||
binLowerDirPath, binName := filepath.Split(binPath)
|
||||
// Escape any ":"s or "\"s in the path.
|
||||
binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath)
|
||||
|
||||
// Overlayfs requires two lowerdirs in order to run in "lower-only" mode,
|
||||
// where writes are completely blocked. Ideally we would create a dummy
|
||||
// tmpfs for this, but it turns out that overlayfs doesn't allow for
|
||||
// anonymous mountns paths.
|
||||
// NOTE: I'm working on a patch to fix this but it won't be backported.
|
||||
dummyLowerDirPath := escapeOverlayLowerDir(tmpDir)
|
||||
|
||||
// Configure the lowerdirs. The binary lowerdir needs to be on the top to
|
||||
// ensure that a file called "runc" (binName) in the dummy lowerdir doesn't
|
||||
// mask the binary.
|
||||
lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath
|
||||
if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil {
|
||||
return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err)
|
||||
}
|
||||
|
||||
// We don't care about xino (Linux 4.17) but it will be auto-enabled on
|
||||
// some systems (if /run/runc and /usr/bin are on different filesystems)
|
||||
// and this produces spurious dmesg log entries. We can safely ignore
|
||||
// errors when disabling this because we don't actually care about the
|
||||
// setting and we're just opportunistically disabling it.
|
||||
_ = unix.FsconfigSetString(int(overlayCtx.Fd()), "xino", "off")
|
||||
|
||||
// Get an actual handle to the overlayfs.
|
||||
if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil {
|
||||
return nil, os.NewSyscallError("fsconfig create overlayfs", err)
|
||||
}
|
||||
overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer overlayFd.Close()
|
||||
|
||||
// Grab a handle to the binary through overlayfs.
|
||||
exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err)
|
||||
}
|
||||
// NOTE: We would like to check that exeFile is the same as /proc/self/exe,
|
||||
// except this is a little difficult. Depending on what filesystems the
|
||||
// layers are on, overlayfs can remap the inode numbers (and it always
|
||||
// creates its own device numbers -- see ovl_map_dev_ino) so we can't do a
|
||||
// basic stat-based check. The only reasonable option would be to hash both
|
||||
// files and compare them, but this would require fully reading both files
|
||||
// which would produce a similar performance overhead to memfd cloning.
|
||||
//
|
||||
// Ultimately, there isn't a real attack to be worried about here. An
|
||||
// attacker would need to be able to modify files in /usr/sbin (or wherever
|
||||
// runc lives), at which point they could just replace the runc binary with
|
||||
// something malicious anyway.
|
||||
return exeFile, nil
|
||||
}
|
||||
216
vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
generated
vendored
Normal file
216
vendor/github.com/opencontainers/runc/libcontainer/system/linux.go
generated
vendored
Normal file
@@ -0,0 +1,216 @@
|
||||
//go:build linux
|
||||
|
||||
package system
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"strconv"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
type ParentDeathSignal int
|
||||
|
||||
func (p ParentDeathSignal) Restore() error {
|
||||
if p == 0 {
|
||||
return nil
|
||||
}
|
||||
current, err := GetParentDeathSignal()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if p == current {
|
||||
return nil
|
||||
}
|
||||
return p.Set()
|
||||
}
|
||||
|
||||
func (p ParentDeathSignal) Set() error {
|
||||
return SetParentDeathSignal(uintptr(p))
|
||||
}
|
||||
|
||||
func Exec(cmd string, args []string, env []string) error {
|
||||
for {
|
||||
err := unix.Exec(cmd, args, env)
|
||||
if err != unix.EINTR {
|
||||
return &os.PathError{Op: "exec", Path: cmd, Err: err}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error {
|
||||
pathnamep, err := syscall.BytePtrFromString(pathname)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
argvp, err := syscall.SlicePtrFromStrings(args)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
envp, err := syscall.SlicePtrFromStrings(env)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
_, _, errno := syscall.Syscall6(
|
||||
unix.SYS_EXECVEAT,
|
||||
fd,
|
||||
uintptr(unsafe.Pointer(pathnamep)),
|
||||
uintptr(unsafe.Pointer(&argvp[0])),
|
||||
uintptr(unsafe.Pointer(&envp[0])),
|
||||
uintptr(flags),
|
||||
0,
|
||||
)
|
||||
return errno
|
||||
}
|
||||
|
||||
func Fexecve(fd uintptr, args []string, env []string) error {
|
||||
var err error
|
||||
for {
|
||||
err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
|
||||
if err != unix.EINTR { // nolint:errorlint // unix errors are bare
|
||||
break
|
||||
}
|
||||
}
|
||||
if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare
|
||||
// Fallback to classic /proc/self/fd/... exec.
|
||||
return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env)
|
||||
}
|
||||
return os.NewSyscallError("execveat", err)
|
||||
}
|
||||
|
||||
func SetParentDeathSignal(sig uintptr) error {
|
||||
if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func GetParentDeathSignal() (ParentDeathSignal, error) {
|
||||
var sig int
|
||||
if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
return ParentDeathSignal(sig), nil
|
||||
}
|
||||
|
||||
func SetKeepCaps() error {
|
||||
if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func ClearKeepCaps() error {
|
||||
if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func Setctty() error {
|
||||
if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// SetSubreaper sets the value i as the subreaper setting for the calling process
|
||||
func SetSubreaper(i int) error {
|
||||
return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
|
||||
}
|
||||
|
||||
// GetSubreaper returns the subreaper setting for the calling process
|
||||
func GetSubreaper() (int, error) {
|
||||
var i uintptr
|
||||
|
||||
if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
|
||||
return -1, err
|
||||
}
|
||||
|
||||
return int(i), nil
|
||||
}
|
||||
|
||||
func ExecutableMemfd(comment string, flags int) (*os.File, error) {
|
||||
// Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this
|
||||
// flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an
|
||||
// executable memfd. For vm.memfd_noexec=2 this is a bit more complicated.
|
||||
// The original vm.memfd_noexec=2 implementation incorrectly silently
|
||||
// allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer
|
||||
// kernels, we will get -EACCES if we try to use MFD_EXEC with
|
||||
// vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value).
|
||||
//
|
||||
// The upshot is we only need to retry without MFD_EXEC on -EINVAL because
|
||||
// it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on
|
||||
// kernels where -EINVAL is actually a security denial.
|
||||
memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
|
||||
if err == unix.EINVAL {
|
||||
memfd, err = unix.MemfdCreate(comment, flags)
|
||||
}
|
||||
if err != nil {
|
||||
if err == unix.EACCES {
|
||||
logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
|
||||
}
|
||||
err := os.NewSyscallError("memfd_create", err)
|
||||
return nil, fmt.Errorf("failed to create executable memfd: %w", err)
|
||||
}
|
||||
return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
|
||||
}
|
||||
|
||||
// Copy is like io.Copy except it uses sendfile(2) if the source and sink are
|
||||
// both (*os.File) as an optimisation to make copies faster.
|
||||
func Copy(dst io.Writer, src io.Reader) (copied int64, err error) {
|
||||
dstFile, _ := dst.(*os.File)
|
||||
srcFile, _ := src.(*os.File)
|
||||
|
||||
if dstFile != nil && srcFile != nil {
|
||||
fi, err := srcFile.Stat()
|
||||
if err != nil {
|
||||
goto fallback
|
||||
}
|
||||
size := fi.Size()
|
||||
for size > 0 {
|
||||
n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size))
|
||||
if n > 0 {
|
||||
size -= int64(n)
|
||||
copied += int64(n)
|
||||
}
|
||||
if err == unix.EINTR {
|
||||
continue
|
||||
}
|
||||
if err != nil {
|
||||
if copied == 0 {
|
||||
// If we haven't copied anything so far, we can safely just
|
||||
// fallback to io.Copy. We could always do the fallback but
|
||||
// it's safer to error out in the case of a partial copy
|
||||
// followed by an error (which should never happen).
|
||||
goto fallback
|
||||
}
|
||||
return copied, fmt.Errorf("partial sendfile copy: %w", err)
|
||||
}
|
||||
}
|
||||
return copied, nil
|
||||
}
|
||||
|
||||
fallback:
|
||||
return io.Copy(dst, src)
|
||||
}
|
||||
|
||||
// SetLinuxPersonality sets the Linux execution personality. For more information see the personality syscall documentation.
|
||||
// checkout getLinuxPersonalityFromStr() from libcontainer/specconv/spec_linux.go for type conversion.
|
||||
func SetLinuxPersonality(personality int) error {
|
||||
_, _, errno := unix.Syscall(unix.SYS_PERSONALITY, uintptr(personality), 0, 0)
|
||||
if errno != 0 {
|
||||
return &os.SyscallError{Syscall: "set_personality", Err: errno}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
127
vendor/github.com/opencontainers/runc/libcontainer/system/proc.go
generated
vendored
Normal file
127
vendor/github.com/opencontainers/runc/libcontainer/system/proc.go
generated
vendored
Normal file
@@ -0,0 +1,127 @@
|
||||
package system
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// State is the status of a process.
|
||||
type State rune
|
||||
|
||||
const ( // Only values for Linux 3.14 and later are listed here
|
||||
Dead State = 'X'
|
||||
DiskSleep State = 'D'
|
||||
Running State = 'R'
|
||||
Sleeping State = 'S'
|
||||
Stopped State = 'T'
|
||||
TracingStop State = 't'
|
||||
Zombie State = 'Z'
|
||||
Parked State = 'P'
|
||||
Idle State = 'I'
|
||||
)
|
||||
|
||||
// String forms of the state from proc(5)'s documentation for
|
||||
// /proc/[pid]/status' "State" field.
|
||||
func (s State) String() string {
|
||||
switch s {
|
||||
case Dead:
|
||||
return "dead"
|
||||
case DiskSleep:
|
||||
return "disk sleep"
|
||||
case Running:
|
||||
return "running"
|
||||
case Sleeping:
|
||||
return "sleeping"
|
||||
case Stopped:
|
||||
return "stopped"
|
||||
case TracingStop:
|
||||
return "tracing stop"
|
||||
case Zombie:
|
||||
return "zombie"
|
||||
case Parked:
|
||||
return "parked"
|
||||
case Idle:
|
||||
return "idle" // kernel thread
|
||||
default:
|
||||
return fmt.Sprintf("unknown (%c)", s)
|
||||
}
|
||||
}
|
||||
|
||||
// Stat_t represents the information from /proc/[pid]/stat, as
|
||||
// described in proc(5) with names based on the /proc/[pid]/status
|
||||
// fields.
|
||||
type Stat_t struct {
|
||||
// Name is the command run by the process.
|
||||
Name string
|
||||
|
||||
// State is the state of the process.
|
||||
State State
|
||||
|
||||
// StartTime is the number of clock ticks after system boot (since
|
||||
// Linux 2.6).
|
||||
StartTime uint64
|
||||
}
|
||||
|
||||
// Stat returns a Stat_t instance for the specified process.
|
||||
func Stat(pid int) (stat Stat_t, err error) {
|
||||
bytes, err := os.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat"))
|
||||
if err != nil {
|
||||
return stat, err
|
||||
}
|
||||
return parseStat(string(bytes))
|
||||
}
|
||||
|
||||
func parseStat(data string) (stat Stat_t, err error) {
|
||||
// Example:
|
||||
// 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0
|
||||
// The fields are space-separated, see full description in proc(5).
|
||||
//
|
||||
// We are only interested in:
|
||||
// * field 2: process name. It is the only field enclosed into
|
||||
// parenthesis, as it can contain spaces (and parenthesis) inside.
|
||||
// * field 3: process state, a single character (%c)
|
||||
// * field 22: process start time, a long unsigned integer (%llu).
|
||||
|
||||
// 1. Look for the first '(' and the last ')' first, what's in between is Name.
|
||||
// We expect at least 20 fields and a space after the last one.
|
||||
|
||||
const minAfterName = 20*2 + 1 // the min field is '0 '.
|
||||
|
||||
first := strings.IndexByte(data, '(')
|
||||
if first < 0 || first+minAfterName >= len(data) {
|
||||
return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
|
||||
}
|
||||
|
||||
last := strings.LastIndexByte(data, ')')
|
||||
if last <= first || last+minAfterName >= len(data) {
|
||||
return stat, fmt.Errorf("invalid stat data (no comm or too short): %q", data)
|
||||
}
|
||||
|
||||
stat.Name = data[first+1 : last]
|
||||
|
||||
// 2. Remove fields 1 and 2 and a space after. State is right after.
|
||||
data = data[last+2:]
|
||||
stat.State = State(data[0])
|
||||
|
||||
// 3. StartTime is field 22, data is at field 3 now, so we need to skip 19 spaces.
|
||||
skipSpaces := 22 - 3
|
||||
for first = 0; skipSpaces > 0 && first < len(data); first++ {
|
||||
if data[first] == ' ' {
|
||||
skipSpaces--
|
||||
}
|
||||
}
|
||||
// Now first points to StartTime; look for space right after.
|
||||
i := strings.IndexByte(data[first:], ' ')
|
||||
if i < 0 {
|
||||
return stat, fmt.Errorf("invalid stat data (too short): %q", data)
|
||||
}
|
||||
stat.StartTime, err = strconv.ParseUint(data[first:first+i], 10, 64)
|
||||
if err != nil {
|
||||
return stat, fmt.Errorf("invalid stat data (bad start time): %w", err)
|
||||
}
|
||||
|
||||
return stat, nil
|
||||
}
|
||||
15
vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go
generated
vendored
Normal file
15
vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux.go
generated
vendored
Normal file
@@ -0,0 +1,15 @@
|
||||
//go:build go1.23
|
||||
|
||||
package system
|
||||
|
||||
import (
|
||||
"syscall"
|
||||
)
|
||||
|
||||
// ClearRlimitNofileCache clears go runtime's nofile rlimit cache. The argument
|
||||
// is process RLIMIT_NOFILE values. Relies on go.dev/cl/588076.
|
||||
func ClearRlimitNofileCache(lim *syscall.Rlimit) {
|
||||
// Ignore the return values since we only need to clean the cache,
|
||||
// the limit is going to be set via unix.Prlimit elsewhere.
|
||||
_ = syscall.Setrlimit(syscall.RLIMIT_NOFILE, lim)
|
||||
}
|
||||
27
vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go
generated
vendored
Normal file
27
vendor/github.com/opencontainers/runc/libcontainer/system/rlimit_linux_go122.go
generated
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
//go:build !go1.23
|
||||
|
||||
// TODO: remove this file once go 1.22 is no longer supported.
|
||||
|
||||
package system
|
||||
|
||||
import (
|
||||
"sync/atomic"
|
||||
"syscall"
|
||||
_ "unsafe" // Needed for go:linkname to work.
|
||||
)
|
||||
|
||||
//go:linkname syscallOrigRlimitNofile syscall.origRlimitNofile
|
||||
var syscallOrigRlimitNofile atomic.Pointer[syscall.Rlimit]
|
||||
|
||||
// ClearRlimitNofileCache clears go runtime's nofile rlimit cache.
|
||||
// The argument is process RLIMIT_NOFILE values.
|
||||
func ClearRlimitNofileCache(_ *syscall.Rlimit) {
|
||||
// As reported in issue #4195, the new version of go runtime(since 1.19)
|
||||
// will cache rlimit-nofile. Before executing execve, the rlimit-nofile
|
||||
// of the process will be restored with the cache. In runc, this will
|
||||
// cause the rlimit-nofile setting by the parent process for the container
|
||||
// to become invalid. It can be solved by clearing this cache. But
|
||||
// unfortunately, go stdlib doesn't provide such function, so we need to
|
||||
// link to the private var `origRlimitNofile` in package syscall to hack.
|
||||
syscallOrigRlimitNofile.Store(nil)
|
||||
}
|
||||
119
vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
generated
vendored
Normal file
119
vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go
generated
vendored
Normal file
@@ -0,0 +1,119 @@
|
||||
package utils
|
||||
|
||||
/*
|
||||
* Copyright 2016, 2017 SUSE LLC
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// MaxNameLen is the maximum length of the name of a file descriptor being sent
|
||||
// using SendFile. The name of the file handle returned by RecvFile will never be
|
||||
// larger than this value.
|
||||
const MaxNameLen = 4096
|
||||
|
||||
// oobSpace is the size of the oob slice required to store a single FD. Note
|
||||
// that unix.UnixRights appears to make the assumption that fd is always int32,
|
||||
// so sizeof(fd) = 4.
|
||||
var oobSpace = unix.CmsgSpace(4)
|
||||
|
||||
// RecvFile waits for a file descriptor to be sent over the given AF_UNIX
|
||||
// socket. The file name of the remote file descriptor will be recreated
|
||||
// locally (it is sent as non-auxiliary data in the same payload).
|
||||
func RecvFile(socket *os.File) (_ *os.File, Err error) {
|
||||
name := make([]byte, MaxNameLen)
|
||||
oob := make([]byte, oobSpace)
|
||||
|
||||
sockfd := socket.Fd()
|
||||
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, unix.MSG_CMSG_CLOEXEC)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if n >= MaxNameLen || oobn != oobSpace {
|
||||
return nil, fmt.Errorf("recvfile: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
|
||||
}
|
||||
// Truncate.
|
||||
name = name[:n]
|
||||
oob = oob[:oobn]
|
||||
|
||||
scms, err := unix.ParseSocketControlMessage(oob)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// We cannot control how many SCM_RIGHTS we receive, and upon receiving
|
||||
// them all of the descriptors are installed in our fd table, so we need to
|
||||
// parse all of the SCM_RIGHTS we received in order to close all of the
|
||||
// descriptors on error.
|
||||
var fds []int
|
||||
defer func() {
|
||||
for i, fd := range fds {
|
||||
if i == 0 && Err == nil {
|
||||
// Only close the first one on error.
|
||||
continue
|
||||
}
|
||||
// Always close extra ones.
|
||||
_ = unix.Close(fd)
|
||||
}
|
||||
}()
|
||||
var lastErr error
|
||||
for _, scm := range scms {
|
||||
if scm.Header.Type == unix.SCM_RIGHTS {
|
||||
scmFds, err := unix.ParseUnixRights(&scm)
|
||||
if err != nil {
|
||||
lastErr = err
|
||||
} else {
|
||||
fds = append(fds, scmFds...)
|
||||
}
|
||||
}
|
||||
}
|
||||
if lastErr != nil {
|
||||
return nil, lastErr
|
||||
}
|
||||
|
||||
// We do this after collecting the fds to make sure we close them all when
|
||||
// returning an error here.
|
||||
if len(scms) != 1 {
|
||||
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
|
||||
}
|
||||
if len(fds) != 1 {
|
||||
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
|
||||
}
|
||||
return os.NewFile(uintptr(fds[0]), string(name)), nil
|
||||
}
|
||||
|
||||
// SendFile sends a file over the given AF_UNIX socket. file.Name() is also
|
||||
// included so that if the other end uses RecvFile, the file will have the same
|
||||
// name information.
|
||||
func SendFile(socket *os.File, file *os.File) error {
|
||||
name := file.Name()
|
||||
if len(name) >= MaxNameLen {
|
||||
return fmt.Errorf("sendfd: filename too long: %s", name)
|
||||
}
|
||||
err := SendRawFd(socket, name, file.Fd())
|
||||
runtime.KeepAlive(file)
|
||||
return err
|
||||
}
|
||||
|
||||
// SendRawFd sends a specific file descriptor over the given AF_UNIX socket.
|
||||
func SendRawFd(socket *os.File, msg string, fd uintptr) error {
|
||||
oob := unix.UnixRights(int(fd))
|
||||
return unix.Sendmsg(int(socket.Fd()), []byte(msg), oob, nil, 0)
|
||||
}
|
||||
115
vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
generated
vendored
Normal file
115
vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go
generated
vendored
Normal file
@@ -0,0 +1,115 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
const (
|
||||
exitSignalOffset = 128
|
||||
)
|
||||
|
||||
// ExitStatus returns the correct exit status for a process based on if it
|
||||
// was signaled or exited cleanly
|
||||
func ExitStatus(status unix.WaitStatus) int {
|
||||
if status.Signaled() {
|
||||
return exitSignalOffset + int(status.Signal())
|
||||
}
|
||||
return status.ExitStatus()
|
||||
}
|
||||
|
||||
// WriteJSON writes the provided struct v to w using standard json marshaling
|
||||
// without a trailing newline. This is used instead of json.Encoder because
|
||||
// there might be a problem in json decoder in some cases, see:
|
||||
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
|
||||
func WriteJSON(w io.Writer, v interface{}) error {
|
||||
data, err := json.Marshal(v)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
_, err = w.Write(data)
|
||||
return err
|
||||
}
|
||||
|
||||
// CleanPath makes a path safe for use with filepath.Join. This is done by not
|
||||
// only cleaning the path, but also (if the path is relative) adding a leading
|
||||
// '/' and cleaning it (then removing the leading '/'). This ensures that a
|
||||
// path resulting from prepending another path will always resolve to lexically
|
||||
// be a subdirectory of the prefixed path. This is all done lexically, so paths
|
||||
// that include symlinks won't be safe as a result of using CleanPath.
|
||||
func CleanPath(path string) string {
|
||||
// Deal with empty strings nicely.
|
||||
if path == "" {
|
||||
return ""
|
||||
}
|
||||
|
||||
// Ensure that all paths are cleaned (especially problematic ones like
|
||||
// "/../../../../../" which can cause lots of issues).
|
||||
path = filepath.Clean(path)
|
||||
|
||||
// If the path isn't absolute, we need to do more processing to fix paths
|
||||
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
|
||||
// paths to relative ones.
|
||||
if !filepath.IsAbs(path) {
|
||||
path = filepath.Clean(string(os.PathSeparator) + path)
|
||||
// This can't fail, as (by definition) all paths are relative to root.
|
||||
path, _ = filepath.Rel(string(os.PathSeparator), path)
|
||||
}
|
||||
|
||||
// Clean the path again for good measure.
|
||||
return filepath.Clean(path)
|
||||
}
|
||||
|
||||
// stripRoot returns the passed path, stripping the root path if it was
|
||||
// (lexicially) inside it. Note that both passed paths will always be treated
|
||||
// as absolute, and the returned path will also always be absolute. In
|
||||
// addition, the paths are cleaned before stripping the root.
|
||||
func stripRoot(root, path string) string {
|
||||
// Make the paths clean and absolute.
|
||||
root, path = CleanPath("/"+root), CleanPath("/"+path)
|
||||
switch {
|
||||
case path == root:
|
||||
path = "/"
|
||||
case root == "/":
|
||||
// do nothing
|
||||
case strings.HasPrefix(path, root+"/"):
|
||||
path = strings.TrimPrefix(path, root+"/")
|
||||
}
|
||||
return CleanPath("/" + path)
|
||||
}
|
||||
|
||||
// SearchLabels searches through a list of key=value pairs for a given key,
|
||||
// returning its value, and the binary flag telling whether the key exist.
|
||||
func SearchLabels(labels []string, key string) (string, bool) {
|
||||
key += "="
|
||||
for _, s := range labels {
|
||||
if strings.HasPrefix(s, key) {
|
||||
return s[len(key):], true
|
||||
}
|
||||
}
|
||||
return "", false
|
||||
}
|
||||
|
||||
// Annotations returns the bundle path and user defined annotations from the
|
||||
// libcontainer state. We need to remove the bundle because that is a label
|
||||
// added by libcontainer.
|
||||
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
|
||||
userAnnotations = make(map[string]string)
|
||||
for _, l := range labels {
|
||||
name, value, ok := strings.Cut(l, "=")
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if name == "bundle" {
|
||||
bundle = value
|
||||
} else {
|
||||
userAnnotations[name] = value
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
360
vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
generated
vendored
Normal file
360
vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go
generated
vendored
Normal file
@@ -0,0 +1,360 @@
|
||||
//go:build !windows
|
||||
|
||||
package utils
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
_ "unsafe" // for go:linkname
|
||||
|
||||
securejoin "github.com/cyphar/filepath-securejoin"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// EnsureProcHandle returns whether or not the given file handle is on procfs.
|
||||
func EnsureProcHandle(fh *os.File) error {
|
||||
var buf unix.Statfs_t
|
||||
if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
|
||||
return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
|
||||
}
|
||||
if buf.Type != unix.PROC_SUPER_MAGIC {
|
||||
return fmt.Errorf("%s is not on procfs", fh.Name())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
var (
|
||||
haveCloseRangeCloexecBool bool
|
||||
haveCloseRangeCloexecOnce sync.Once
|
||||
)
|
||||
|
||||
func haveCloseRangeCloexec() bool {
|
||||
haveCloseRangeCloexecOnce.Do(func() {
|
||||
// Make sure we're not closing a random file descriptor.
|
||||
tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
defer unix.Close(tmpFd)
|
||||
|
||||
err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
|
||||
// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
|
||||
// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
|
||||
// other potential error would imply that even the most basic close
|
||||
// operation wouldn't work.
|
||||
haveCloseRangeCloexecBool = err == nil
|
||||
})
|
||||
return haveCloseRangeCloexecBool
|
||||
}
|
||||
|
||||
type fdFunc func(fd int)
|
||||
|
||||
// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
|
||||
// the current process.
|
||||
func fdRangeFrom(minFd int, fn fdFunc) error {
|
||||
procSelfFd, closer := ProcThreadSelf("fd")
|
||||
defer closer()
|
||||
|
||||
fdDir, err := os.Open(procSelfFd)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer fdDir.Close()
|
||||
|
||||
if err := EnsureProcHandle(fdDir); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
fdList, err := fdDir.Readdirnames(-1)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
for _, fdStr := range fdList {
|
||||
fd, err := strconv.Atoi(fdStr)
|
||||
// Ignore non-numeric file names.
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
// Ignore descriptors lower than our specified minimum.
|
||||
if fd < minFd {
|
||||
continue
|
||||
}
|
||||
// Ignore the file descriptor we used for readdir, as it will be closed
|
||||
// when we return.
|
||||
if uintptr(fd) == fdDir.Fd() {
|
||||
continue
|
||||
}
|
||||
// Run the closure.
|
||||
fn(fd)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
|
||||
// equal to minFd in the current process.
|
||||
func CloseExecFrom(minFd int) error {
|
||||
// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
|
||||
if haveCloseRangeCloexec() {
|
||||
err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
|
||||
return os.NewSyscallError("close_range", err)
|
||||
}
|
||||
// Otherwise, fall back to the standard loop.
|
||||
return fdRangeFrom(minFd, unix.CloseOnExec)
|
||||
}
|
||||
|
||||
//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor
|
||||
|
||||
// In order to make sure we do not close the internal epoll descriptors the Go
|
||||
// runtime uses, we need to ensure that we skip descriptors that match
|
||||
// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
|
||||
// unfortunately there's no other way to be sure we're only keeping the file
|
||||
// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
|
||||
func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive
|
||||
|
||||
// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
|
||||
// current process, except for those critical to Go's runtime (such as the
|
||||
// netpoll management descriptors).
|
||||
//
|
||||
// NOTE: That this function is incredibly dangerous to use in most Go code, as
|
||||
// closing file descriptors from underneath *os.File handles can lead to very
|
||||
// bad behaviour (the closed file descriptor can be re-used and then any
|
||||
// *os.File operations would apply to the wrong file). This function is only
|
||||
// intended to be called from the last stage of runc init.
|
||||
func UnsafeCloseFrom(minFd int) error {
|
||||
// We cannot use close_range(2) even if it is available, because we must
|
||||
// not close some file descriptors.
|
||||
return fdRangeFrom(minFd, func(fd int) {
|
||||
if runtime_IsPollDescriptor(uintptr(fd)) {
|
||||
// These are the Go runtimes internal netpoll file descriptors.
|
||||
// These file descriptors are operated on deep in the Go scheduler,
|
||||
// and closing those files from underneath Go can result in panics.
|
||||
// There is no issue with keeping them because they are not
|
||||
// executable and are not useful to an attacker anyway. Also we
|
||||
// don't have any choice.
|
||||
return
|
||||
}
|
||||
// There's nothing we can do about errors from close(2), and the
|
||||
// only likely error to be seen is EBADF which indicates the fd was
|
||||
// already closed (in which case, we got what we wanted).
|
||||
_ = unix.Close(fd)
|
||||
})
|
||||
}
|
||||
|
||||
// NewSockPair returns a new SOCK_STREAM unix socket pair.
|
||||
func NewSockPair(name string) (parent, child *os.File, err error) {
|
||||
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
|
||||
}
|
||||
|
||||
// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
|
||||
// corresponding to the unsafePath resolved within the root. Before passing the
|
||||
// fd, this path is verified to have been inside the root -- so operating on it
|
||||
// through the passed fdpath should be safe. Do not access this path through
|
||||
// the original path strings, and do not attempt to use the pathname outside of
|
||||
// the passed closure (the file handle will be freed once the closure returns).
|
||||
func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
|
||||
// Remove the root then forcefully resolve inside the root.
|
||||
unsafePath = stripRoot(root, unsafePath)
|
||||
path, err := securejoin.SecureJoin(root, unsafePath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("resolving path inside rootfs failed: %w", err)
|
||||
}
|
||||
|
||||
procSelfFd, closer := ProcThreadSelf("fd/")
|
||||
defer closer()
|
||||
|
||||
// Open the target path.
|
||||
fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return fmt.Errorf("open o_path procfd: %w", err)
|
||||
}
|
||||
defer fh.Close()
|
||||
|
||||
procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
|
||||
// Double-check the path is the one we expected.
|
||||
if realpath, err := os.Readlink(procfd); err != nil {
|
||||
return fmt.Errorf("procfd verification failed: %w", err)
|
||||
} else if realpath != path {
|
||||
return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
|
||||
}
|
||||
|
||||
return fn(procfd)
|
||||
}
|
||||
|
||||
type ProcThreadSelfCloser func()
|
||||
|
||||
var (
|
||||
haveProcThreadSelf bool
|
||||
haveProcThreadSelfOnce sync.Once
|
||||
)
|
||||
|
||||
// ProcThreadSelf returns a string that is equivalent to
|
||||
// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
|
||||
// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
|
||||
// meaning that the passed string needs to be trusted. The caller _must_ call
|
||||
// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread)
|
||||
// *only once* after it has finished using the returned path string.
|
||||
func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
|
||||
haveProcThreadSelfOnce.Do(func() {
|
||||
if _, err := os.Stat("/proc/thread-self/"); err == nil {
|
||||
haveProcThreadSelf = true
|
||||
} else {
|
||||
logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
|
||||
}
|
||||
})
|
||||
|
||||
// We need to lock our thread until the caller is done with the path string
|
||||
// because any non-atomic operation on the path (such as opening a file,
|
||||
// then reading it) could be interrupted by the Go runtime where the
|
||||
// underlying thread is swapped out and the original thread is killed,
|
||||
// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
|
||||
// addition, the pre-3.17 fallback makes everything non-atomic because the
|
||||
// same thing could happen between unix.Gettid() and the path operations.
|
||||
//
|
||||
// In theory, we don't need to lock in the atomic user case when using
|
||||
// /proc/thread-self/, but it's better to be safe than sorry (and there are
|
||||
// only one or two truly atomic users of /proc/thread-self/).
|
||||
runtime.LockOSThread()
|
||||
|
||||
threadSelf := "/proc/thread-self/"
|
||||
if !haveProcThreadSelf {
|
||||
// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
|
||||
threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
|
||||
if _, err := os.Stat(threadSelf); err != nil {
|
||||
// Unfortunately, this code is called from rootfs_linux.go where we
|
||||
// are running inside the pid namespace of the container but /proc
|
||||
// is the host's procfs. Unfortunately there is no real way to get
|
||||
// the correct tid to use here (the kernel age means we cannot do
|
||||
// things like set up a private fsopen("proc") -- even scanning
|
||||
// NSpid in all of the tasks in /proc/self/task/*/status requires
|
||||
// Linux 4.1).
|
||||
//
|
||||
// So, we just have to assume that /proc/self is acceptable in this
|
||||
// one specific case.
|
||||
if os.Getpid() == 1 {
|
||||
logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
|
||||
} else {
|
||||
// This should never happen, but the fallback should work in most cases...
|
||||
logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
|
||||
}
|
||||
threadSelf = "/proc/self/"
|
||||
}
|
||||
}
|
||||
return threadSelf + subpath, runtime.UnlockOSThread
|
||||
}
|
||||
|
||||
// ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to
|
||||
// create a /proc/thread-self handle for given file descriptor.
|
||||
//
|
||||
// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
|
||||
// without using fmt.Sprintf to avoid unneeded overhead.
|
||||
func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
|
||||
return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
|
||||
}
|
||||
|
||||
// IsLexicallyInRoot is shorthand for strings.HasPrefix(path+"/", root+"/"),
|
||||
// but properly handling the case where path or root are "/".
|
||||
//
|
||||
// NOTE: The return value only make sense if the path doesn't contain "..".
|
||||
func IsLexicallyInRoot(root, path string) bool {
|
||||
if root != "/" {
|
||||
root += "/"
|
||||
}
|
||||
if path != "/" {
|
||||
path += "/"
|
||||
}
|
||||
return strings.HasPrefix(path, root)
|
||||
}
|
||||
|
||||
// MkdirAllInRootOpen attempts to make
|
||||
//
|
||||
// path, _ := securejoin.SecureJoin(root, unsafePath)
|
||||
// os.MkdirAll(path, mode)
|
||||
// os.Open(path)
|
||||
//
|
||||
// safer against attacks where components in the path are changed between
|
||||
// SecureJoin returning and MkdirAll (or Open) being called. In particular, we
|
||||
// try to detect any symlink components in the path while we are doing the
|
||||
// MkdirAll.
|
||||
//
|
||||
// NOTE: If unsafePath is a subpath of root, we assume that you have already
|
||||
// called SecureJoin and so we use the provided path verbatim without resolving
|
||||
// any symlinks (this is done in a way that avoids symlink-exchange races).
|
||||
// This means that the path also must not contain ".." elements, otherwise an
|
||||
// error will occur.
|
||||
//
|
||||
// This uses securejoin.MkdirAllHandle under the hood, but it has special
|
||||
// handling if unsafePath has already been scoped within the rootfs (this is
|
||||
// needed for a lot of runc callers and fixing this would require reworking a
|
||||
// lot of path logic).
|
||||
func MkdirAllInRootOpen(root, unsafePath string, mode os.FileMode) (_ *os.File, Err error) {
|
||||
// If the path is already "within" the root, get the path relative to the
|
||||
// root and use that as the unsafe path. This is necessary because a lot of
|
||||
// MkdirAllInRootOpen callers have already done SecureJoin, and refactoring
|
||||
// all of them to stop using these SecureJoin'd paths would require a fair
|
||||
// amount of work.
|
||||
// TODO(cyphar): Do the refactor to libpathrs once it's ready.
|
||||
if IsLexicallyInRoot(root, unsafePath) {
|
||||
subPath, err := filepath.Rel(root, unsafePath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
unsafePath = subPath
|
||||
}
|
||||
|
||||
// Check for any silly mode bits.
|
||||
if mode&^0o7777 != 0 {
|
||||
return nil, fmt.Errorf("tried to include non-mode bits in MkdirAll mode: 0o%.3o", mode)
|
||||
}
|
||||
// Linux (and thus os.MkdirAll) silently ignores the suid and sgid bits if
|
||||
// passed. While it would make sense to return an error in that case (since
|
||||
// the user has asked for a mode that won't be applied), for compatibility
|
||||
// reasons we have to ignore these bits.
|
||||
if ignoredBits := mode &^ 0o1777; ignoredBits != 0 {
|
||||
logrus.Warnf("MkdirAll called with no-op mode bits that are ignored by Linux: 0o%.3o", ignoredBits)
|
||||
mode &= 0o1777
|
||||
}
|
||||
|
||||
rootDir, err := os.OpenFile(root, unix.O_DIRECTORY|unix.O_CLOEXEC, 0)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("open root handle: %w", err)
|
||||
}
|
||||
defer rootDir.Close()
|
||||
|
||||
return securejoin.MkdirAllHandle(rootDir, unsafePath, mode)
|
||||
}
|
||||
|
||||
// MkdirAllInRoot is a wrapper around MkdirAllInRootOpen which closes the
|
||||
// returned handle, for callers that don't need to use it.
|
||||
func MkdirAllInRoot(root, unsafePath string, mode os.FileMode) error {
|
||||
f, err := MkdirAllInRootOpen(root, unsafePath, mode)
|
||||
if err == nil {
|
||||
_ = f.Close()
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// Openat is a Go-friendly openat(2) wrapper.
|
||||
func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
|
||||
dirFd := unix.AT_FDCWD
|
||||
if dir != nil {
|
||||
dirFd = int(dir.Fd())
|
||||
}
|
||||
flags |= unix.O_CLOEXEC
|
||||
|
||||
fd, err := unix.Openat(dirFd, path, flags, mode)
|
||||
if err != nil {
|
||||
return nil, &os.PathError{Op: "openat", Path: path, Err: err}
|
||||
}
|
||||
return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil
|
||||
}
|
||||
Reference in New Issue
Block a user