mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-03-12 15:09:50 +00:00
This change copies ldconfig into a memfd before executing it from the createContainer hook. Signed-off-by: Evan Lezar <elezar@nvidia.com>
217 lines
5.6 KiB
Go
217 lines
5.6 KiB
Go
//go:build linux
|
|
|
|
package system
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"strconv"
|
|
"syscall"
|
|
"unsafe"
|
|
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// ParentDeathSignal is a signal number as used with the PR_SET_PDEATHSIG and
// PR_GET_PDEATHSIG prctl operations. The zero value is treated by Restore as
// "nothing to restore".
type ParentDeathSignal int
|
|
|
|
func (p ParentDeathSignal) Restore() error {
|
|
if p == 0 {
|
|
return nil
|
|
}
|
|
current, err := GetParentDeathSignal()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if p == current {
|
|
return nil
|
|
}
|
|
return p.Set()
|
|
}
|
|
|
|
func (p ParentDeathSignal) Set() error {
|
|
return SetParentDeathSignal(uintptr(p))
|
|
}
|
|
|
|
func Exec(cmd string, args []string, env []string) error {
|
|
for {
|
|
err := unix.Exec(cmd, args, env)
|
|
if err != unix.EINTR {
|
|
return &os.PathError{Op: "exec", Path: cmd, Err: err}
|
|
}
|
|
}
|
|
}
|
|
|
|
// execveat issues the raw SYS_EXECVEAT system call.
//
// pathname, args and env are converted to NUL-terminated byte pointers with
// the syscall package helpers; a conversion failure is returned before the
// system call is attempted. fd and flags are passed through to the kernel
// as-is.
//
// The returned error is the errno value from syscall.Syscall6, returned
// without a zero check.
func execveat(fd uintptr, pathname string, args []string, env []string, flags int) error {
	pathnamep, err := syscall.BytePtrFromString(pathname)
	if err != nil {
		return err
	}

	argvp, err := syscall.SlicePtrFromStrings(args)
	if err != nil {
		return err
	}

	envp, err := syscall.SlicePtrFromStrings(env)
	if err != nil {
		return err
	}

	// The unsafe.Pointer -> uintptr conversions are written directly in the
	// argument list of the Syscall6 call; do not hoist them into variables.
	// NOTE(review): &argvp[0] / &envp[0] assume SlicePtrFromStrings always
	// returns at least one element (the terminating nil) even for empty
	// input — confirm against the syscall package.
	_, _, errno := syscall.Syscall6(
		unix.SYS_EXECVEAT,
		fd,
		uintptr(unsafe.Pointer(pathnamep)),
		uintptr(unsafe.Pointer(&argvp[0])),
		uintptr(unsafe.Pointer(&envp[0])),
		uintptr(flags),
		0,
	)
	return errno
}
|
|
|
|
func Fexecve(fd uintptr, args []string, env []string) error {
|
|
var err error
|
|
for {
|
|
err = execveat(fd, "", args, env, unix.AT_EMPTY_PATH)
|
|
if err != unix.EINTR { // nolint:errorlint // unix errors are bare
|
|
break
|
|
}
|
|
}
|
|
if err == unix.ENOSYS { // nolint:errorlint // unix errors are bare
|
|
// Fallback to classic /proc/self/fd/... exec.
|
|
return Exec("/proc/self/fd/"+strconv.Itoa(int(fd)), args, env)
|
|
}
|
|
return os.NewSyscallError("execveat", err)
|
|
}
|
|
|
|
func SetParentDeathSignal(sig uintptr) error {
|
|
if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func GetParentDeathSignal() (ParentDeathSignal, error) {
|
|
var sig int
|
|
if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil {
|
|
return -1, err
|
|
}
|
|
return ParentDeathSignal(sig), nil
|
|
}
|
|
|
|
func SetKeepCaps() error {
|
|
if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func ClearKeepCaps() error {
|
|
if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func Setctty() error {
|
|
if err := unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// SetSubreaper sets the value i as the subreaper setting for the calling process
|
|
func SetSubreaper(i int) error {
|
|
return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
|
|
}
|
|
|
|
// GetSubreaper returns the subreaper setting for the calling process
|
|
func GetSubreaper() (int, error) {
|
|
var i uintptr
|
|
|
|
if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
return int(i), nil
|
|
}
|
|
|
|
func ExecutableMemfd(comment string, flags int) (*os.File, error) {
|
|
// Try to use MFD_EXEC first. On pre-6.3 kernels we get -EINVAL for this
|
|
// flag. On post-6.3 kernels, with vm.memfd_noexec=1 this ensures we get an
|
|
// executable memfd. For vm.memfd_noexec=2 this is a bit more complicated.
|
|
// The original vm.memfd_noexec=2 implementation incorrectly silently
|
|
// allowed MFD_EXEC[1] -- this should be fixed in 6.6. On 6.6 and newer
|
|
// kernels, we will get -EACCES if we try to use MFD_EXEC with
|
|
// vm.memfd_noexec=2 (for 6.3-6.5, -EINVAL was the intended return value).
|
|
//
|
|
// The upshot is we only need to retry without MFD_EXEC on -EINVAL because
|
|
// it just so happens that passing MFD_EXEC bypasses vm.memfd_noexec=2 on
|
|
// kernels where -EINVAL is actually a security denial.
|
|
memfd, err := unix.MemfdCreate(comment, flags|unix.MFD_EXEC)
|
|
if err == unix.EINVAL {
|
|
memfd, err = unix.MemfdCreate(comment, flags)
|
|
}
|
|
if err != nil {
|
|
if err == unix.EACCES {
|
|
logrus.Info("memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE")
|
|
}
|
|
err := os.NewSyscallError("memfd_create", err)
|
|
return nil, fmt.Errorf("failed to create executable memfd: %w", err)
|
|
}
|
|
return os.NewFile(uintptr(memfd), "/memfd:"+comment), nil
|
|
}
|
|
|
|
// Copy is like io.Copy except it uses sendfile(2) if the source and sink are
|
|
// both (*os.File) as an optimisation to make copies faster.
|
|
func Copy(dst io.Writer, src io.Reader) (copied int64, err error) {
|
|
dstFile, _ := dst.(*os.File)
|
|
srcFile, _ := src.(*os.File)
|
|
|
|
if dstFile != nil && srcFile != nil {
|
|
fi, err := srcFile.Stat()
|
|
if err != nil {
|
|
goto fallback
|
|
}
|
|
size := fi.Size()
|
|
for size > 0 {
|
|
n, err := unix.Sendfile(int(dstFile.Fd()), int(srcFile.Fd()), nil, int(size))
|
|
if n > 0 {
|
|
size -= int64(n)
|
|
copied += int64(n)
|
|
}
|
|
if err == unix.EINTR {
|
|
continue
|
|
}
|
|
if err != nil {
|
|
if copied == 0 {
|
|
// If we haven't copied anything so far, we can safely just
|
|
// fallback to io.Copy. We could always do the fallback but
|
|
// it's safer to error out in the case of a partial copy
|
|
// followed by an error (which should never happen).
|
|
goto fallback
|
|
}
|
|
return copied, fmt.Errorf("partial sendfile copy: %w", err)
|
|
}
|
|
}
|
|
return copied, nil
|
|
}
|
|
|
|
fallback:
|
|
return io.Copy(dst, src)
|
|
}
|
|
|
|
// SetLinuxPersonality sets the Linux execution personality. For more information see the personality syscall documentation.
|
|
// checkout getLinuxPersonalityFromStr() from libcontainer/specconv/spec_linux.go for type conversion.
|
|
func SetLinuxPersonality(personality int) error {
|
|
_, _, errno := unix.Syscall(unix.SYS_PERSONALITY, uintptr(personality), 0, 0)
|
|
if errno != 0 {
|
|
return &os.SyscallError{Syscall: "set_personality", Err: errno}
|
|
}
|
|
return nil
|
|
}
|