From 4bf7421a80ae054ec1fc92fa71aa31fa6d9ea59b Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 27 Feb 2025 14:38:37 +0200 Subject: [PATCH] Add create-soname-symlinks hook This change adds a create-soname-symlinks hook that can be used to ensure that the soname symlinks for injected libraries exist in a container. This is done by calling ldconfig -n -N for the directories containing the injected libraries. This also ensures that libcuda.so is present in the ldcache when the update-ldcache hook is run. Signed-off-by: Evan Lezar --- cmd/nvidia-cdi-hook/commands/commands.go | 2 + .../create-soname-symlinks/ldconfig_linux.go | 200 ++++++++++++++++++ .../create-soname-symlinks/ldconfig_other.go | 51 +++++ .../create-soname-symlinks/safe-exec_linux.go | 58 +++++ .../create-soname-symlinks/safe-exec_other.go | 28 +++ .../create-soname-symlinks/soname-symlinks.go | 191 +++++++++++++++++ .../toolkit/toolkit_test.go | 9 + cmd/nvidia-ctk/cdi/generate/generate_test.go | 27 +++ internal/discover/hooks.go | 3 + internal/discover/ldconfig.go | 24 +-- internal/discover/ldconfig_test.go | 75 +++++-- pkg/nvcdi/api.go | 3 + tests/e2e/nvidia-container-toolkit_test.go | 22 ++ 13 files changed, 662 insertions(+), 31 deletions(-) create mode 100644 cmd/nvidia-cdi-hook/create-soname-symlinks/ldconfig_linux.go create mode 100644 cmd/nvidia-cdi-hook/create-soname-symlinks/ldconfig_other.go create mode 100644 cmd/nvidia-cdi-hook/create-soname-symlinks/safe-exec_linux.go create mode 100644 cmd/nvidia-cdi-hook/create-soname-symlinks/safe-exec_other.go create mode 100644 cmd/nvidia-cdi-hook/create-soname-symlinks/soname-symlinks.go diff --git a/cmd/nvidia-cdi-hook/commands/commands.go b/cmd/nvidia-cdi-hook/commands/commands.go index 8917c25d..455b2afa 100644 --- a/cmd/nvidia-cdi-hook/commands/commands.go +++ b/cmd/nvidia-cdi-hook/commands/commands.go @@ -20,6 +20,7 @@ import ( "github.com/urfave/cli/v2" "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/chmod" + 
createsonamesymlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-soname-symlinks" symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-symlinks" "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/cudacompat" disabledevicenodemodification "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/disable-device-node-modification" @@ -35,6 +36,7 @@ func New(logger logger.Interface) []*cli.Command { symlinks.NewCommand(logger), chmod.NewCommand(logger), cudacompat.NewCommand(logger), + createsonamesymlinks.NewCommand(logger), disabledevicenodemodification.NewCommand(logger), } } diff --git a/cmd/nvidia-cdi-hook/create-soname-symlinks/ldconfig_linux.go b/cmd/nvidia-cdi-hook/create-soname-symlinks/ldconfig_linux.go new file mode 100644 index 00000000..ffa88b41 --- /dev/null +++ b/cmd/nvidia-cdi-hook/create-soname-symlinks/ldconfig_linux.go @@ -0,0 +1,200 @@ +//go:build linux + +/** +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package create_soname_symlinks + +import ( + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "syscall" + + securejoin "github.com/cyphar/filepath-securejoin" + + "github.com/moby/sys/reexec" + "github.com/opencontainers/runc/libcontainer/utils" + "golang.org/x/sys/unix" +) + +// pivotRoot will call pivot_root such that rootfs becomes the new root +// filesystem, and everything else is cleaned up. +// This is adapted from the implementation here: +// +// https://github.com/opencontainers/runc/blob/e89a29929c775025419ab0d218a43588b4c12b9a/libcontainer/rootfs_linux.go#L1056-L1113 +// +// With the `mount` and `unmount` calls changed to direct unix.Mount and unix.Unmount calls. +func pivotRoot(rootfs string) error { + // While the documentation may claim otherwise, pivot_root(".", ".") is + // actually valid. What this results in is / being the new root but + // /proc/self/cwd being the old root. Since we can play around with the cwd + // with pivot_root this allows us to pivot without creating directories in + // the rootfs. Shout-outs to the LXC developers for giving us this idea. + + oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return &os.PathError{Op: "open", Path: "/", Err: err} + } + defer unix.Close(oldroot) //nolint: errcheck + + newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return &os.PathError{Op: "open", Path: rootfs, Err: err} + } + defer unix.Close(newroot) //nolint: errcheck + + // Change to the new root so that the pivot_root actually acts on it. + if err := unix.Fchdir(newroot); err != nil { + return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err} + } + + if err := unix.PivotRoot(".", "."); err != nil { + return &os.PathError{Op: "pivot_root", Path: ".", Err: err} + } + + // Currently our "." is oldroot (according to the current kernel code). 
+ // However, purely for safety, we will fchdir(oldroot) since there isn't + // really any guarantee from the kernel what /proc/self/cwd will be after a + // pivot_root(2). + + if err := unix.Fchdir(oldroot); err != nil { + return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err} + } + + // Make oldroot rslave to make sure our unmounts don't propagate to the + // host (and thus bork the machine). We don't use rprivate because this is + // known to cause issues due to races where we still have a reference to a + // mount while a process in the host namespace are trying to operate on + // something they think has no mounts (devicemapper in particular). + if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + return err + } + // Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd. + if err := unix.Unmount(".", unix.MNT_DETACH); err != nil { + return err + } + + // Switch back to our shiny new root. + if err := unix.Chdir("/"); err != nil { + return &os.PathError{Op: "chdir", Path: "/", Err: err} + } + return nil +} + +// mountLdConfig mounts the host ldconfig to the mount namespace of the hook. +// We use WithProcfd to perform the mount operations to ensure that the changes +// are persisted across the pivot root. 
+func mountLdConfig(hostLdconfigPath string, containerRootDirPath string) (string, error) { + hostLdconfigInfo, err := os.Stat(hostLdconfigPath) + if err != nil { + return "", fmt.Errorf("error reading host ldconfig: %w", err) + } + + hookScratchDirPath := "/var/run/nvidia-ctk-hook" + ldconfigPath := filepath.Join(hookScratchDirPath, "ldconfig") + if err := utils.MkdirAllInRoot(containerRootDirPath, hookScratchDirPath, 0755); err != nil { + return "", fmt.Errorf("error creating hook scratch folder: %w", err) + } + + err = utils.WithProcfd(containerRootDirPath, hookScratchDirPath, func(hookScratchDirFdPath string) error { + return createTmpFs(hookScratchDirFdPath, int(hostLdconfigInfo.Size())) + + }) + if err != nil { + return "", fmt.Errorf("error creating tmpfs: %w", err) + } + + if _, err := createFileInRoot(containerRootDirPath, ldconfigPath, hostLdconfigInfo.Mode()); err != nil { + return "", fmt.Errorf("error creating ldconfig: %w", err) + } + + err = utils.WithProcfd(containerRootDirPath, ldconfigPath, func(ldconfigFdPath string) error { + return unix.Mount(hostLdconfigPath, ldconfigFdPath, "", unix.MS_BIND|unix.MS_RDONLY|unix.MS_NODEV|unix.MS_PRIVATE|unix.MS_NOSYMFOLLOW, "") + }) + if err != nil { + return "", fmt.Errorf("error bind mounting host ldconfig: %w", err) + } + + return ldconfigPath, nil +} + +func createFileInRoot(containerRootDirPath string, destinationPath string, mode os.FileMode) (string, error) { + dest, err := securejoin.SecureJoin(containerRootDirPath, destinationPath) + if err != nil { + return "", err + } + // Make the parent directory. + destDir, destBase := filepath.Split(dest) + destDirFd, err := utils.MkdirAllInRootOpen(containerRootDirPath, destDir, 0755) + if err != nil { + return "", fmt.Errorf("error creating parent dir: %w", err) + } + defer destDirFd.Close() + // Make the target file. 
We want to avoid opening any file that is + // already there because it could be a "bad" file like an invalid + // device or hung tty that might cause a DoS, so we use mknodat. + // destBase does not contain any "/" components, and mknodat does + // not follow trailing symlinks, so we can safely just call mknodat + // here. + if err := unix.Mknodat(int(destDirFd.Fd()), destBase, unix.S_IFREG|uint32(mode), 0); err != nil { + // If we get EEXIST, there was already an inode there and + // we can consider that a success. + if !errors.Is(err, unix.EEXIST) { + return "", fmt.Errorf("error creating empty file: %w", err) + } + } + return dest, nil +} + +// mountProc mounts a clean proc filesystem in the new root. +func mountProc(newroot string) error { + target := filepath.Join(newroot, "/proc") + + if err := os.MkdirAll(target, 0755); err != nil { + return fmt.Errorf("error creating directory: %w", err) + } + return unix.Mount("proc", target, "proc", 0, "") +} + +// createTmpFs creates a tmpfs at the specified location with the specified size. +func createTmpFs(target string, size int) error { + return unix.Mount("tmpfs", target, "tmpfs", 0, fmt.Sprintf("size=%d", size)) +} + +// createReexecCommand creates a command that can be used to trigger the reexec +// initializer. +// On linux this command runs in new namespaces. +func createReexecCommand(args []string) *exec.Cmd { + cmd := reexec.Command(args...) 
+ cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWNS | + syscall.CLONE_NEWUTS | + syscall.CLONE_NEWIPC | + syscall.CLONE_NEWPID | + syscall.CLONE_NEWNET, + } + + return cmd +} diff --git a/cmd/nvidia-cdi-hook/create-soname-symlinks/ldconfig_other.go b/cmd/nvidia-cdi-hook/create-soname-symlinks/ldconfig_other.go new file mode 100644 index 00000000..86fc44c8 --- /dev/null +++ b/cmd/nvidia-cdi-hook/create-soname-symlinks/ldconfig_other.go @@ -0,0 +1,51 @@ +//go:build !linux + +/** +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package create_soname_symlinks + +import ( + "fmt" + "os" + "os/exec" + + "github.com/moby/sys/reexec" +) + +func pivotRoot(newroot string) error { + return fmt.Errorf("not supported") +} + +func mountLdConfig(hostLdconfigPath string, containerRootDirPath string) (string, error) { + return "", fmt.Errorf("not supported") +} + +func mountProc(newroot string) error { + return fmt.Errorf("not supported") +} + +// createReexecCommand creates a command that can be used to trigger the reexec +// initializer. +func createReexecCommand(args []string) *exec.Cmd { + cmd := reexec.Command(args...) 
+ cmd.Stdin = os.Stdin + cmd.Stdout = os.Stdout + cmd.Stderr = os.Stderr + + return cmd +} diff --git a/cmd/nvidia-cdi-hook/create-soname-symlinks/safe-exec_linux.go b/cmd/nvidia-cdi-hook/create-soname-symlinks/safe-exec_linux.go new file mode 100644 index 00000000..6efb2cc9 --- /dev/null +++ b/cmd/nvidia-cdi-hook/create-soname-symlinks/safe-exec_linux.go @@ -0,0 +1,58 @@ +//go:build linux + +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package create_soname_symlinks + +import ( + "fmt" + "os" + "strconv" + "syscall" + + "github.com/opencontainers/runc/libcontainer/exeseal" +) + +// SafeExec attempts to clone the specified binary (as a memfd, for example) before executing it. 
+func SafeExec(path string, args []string, envv []string) error { + safeExe, err := cloneBinary(path) + if err != nil { + //nolint:gosec // TODO: Can we harden this so that there is less risk of command injection + return syscall.Exec(path, args, envv) + } + defer safeExe.Close() + + exePath := "/proc/self/fd/" + strconv.Itoa(int(safeExe.Fd())) + //nolint:gosec // TODO: Can we harden this so that there is less risk of command injection + return syscall.Exec(exePath, args, envv) +} + +func cloneBinary(path string) (*os.File, error) { + exe, err := os.Open(path) + if err != nil { + return nil, fmt.Errorf("opening current binary: %w", err) + } + defer exe.Close() + + stat, err := exe.Stat() + if err != nil { + return nil, fmt.Errorf("checking %v size: %w", path, err) + } + size := stat.Size() + + return exeseal.CloneBinary(exe, size, path, os.TempDir()) +} diff --git a/cmd/nvidia-cdi-hook/create-soname-symlinks/safe-exec_other.go b/cmd/nvidia-cdi-hook/create-soname-symlinks/safe-exec_other.go new file mode 100644 index 00000000..d997fd8f --- /dev/null +++ b/cmd/nvidia-cdi-hook/create-soname-symlinks/safe-exec_other.go @@ -0,0 +1,28 @@ +//go:build !linux + +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package create_soname_symlinks + +import "syscall" + +// SafeExec is not implemented on non-linux systems and forwards directly to the +// Exec syscall. 
+func SafeExec(path string, args []string, envv []string) error { + //nolint:gosec // TODO: Can we harden this so that there is less risk of command injection + return syscall.Exec(path, args, envv) +} diff --git a/cmd/nvidia-cdi-hook/create-soname-symlinks/soname-symlinks.go b/cmd/nvidia-cdi-hook/create-soname-symlinks/soname-symlinks.go new file mode 100644 index 00000000..524a5536 --- /dev/null +++ b/cmd/nvidia-cdi-hook/create-soname-symlinks/soname-symlinks.go @@ -0,0 +1,191 @@ +/** +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+**/ + +package create_soname_symlinks + +import ( + "errors" + "fmt" + "log" + "os" + "strings" + + "github.com/moby/sys/reexec" + "github.com/urfave/cli/v2" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/config" + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" +) + +const ( + reexecUpdateLdCacheCommandName = "reexec-create-soname-symlinks" +) + +type command struct { + logger logger.Interface +} + +type options struct { + folders cli.StringSlice + ldconfigPath string + containerSpec string +} + +func init() { + reexec.Register(reexecUpdateLdCacheCommandName, createSonameSymlinksHandler) + if reexec.Init() { + os.Exit(0) + } +} + +// NewCommand constructs a create-soname-symlinks command with the specified logger +func NewCommand(logger logger.Interface) *cli.Command { + c := command{ + logger: logger, + } + return c.build() +} + +// build the create-soname-symlinks command +func (m command) build() *cli.Command { + cfg := options{} + + // Create the 'create-soname-symlinks' command + c := cli.Command{ + Name: "create-soname-symlinks", + Usage: "Create soname symlinks libraries in specified directories", + Before: func(c *cli.Context) error { + return m.validateFlags(c, &cfg) + }, + Action: func(c *cli.Context) error { + return m.run(c, &cfg) + }, + } + + c.Flags = []cli.Flag{ + &cli.StringSliceFlag{ + Name: "folder", + Usage: "Specify a directory to generate soname symlinks in. Can be specified multiple times", + Destination: &cfg.folders, + }, + &cli.StringFlag{ + Name: "ldconfig-path", + Usage: "Specify the path to ldconfig on the host", + Destination: &cfg.ldconfigPath, + Value: "/sbin/ldconfig", + }, + &cli.StringFlag{ + Name: "container-spec", + Usage: "Specify the path to the OCI container spec. 
If empty or '-' the spec will be read from STDIN", + Destination: &cfg.containerSpec, + }, + } + + return &c +} + +func (m command) validateFlags(c *cli.Context, cfg *options) error { + if cfg.ldconfigPath == "" { + return errors.New("ldconfig-path must be specified") + } + return nil +} + +func (m command) run(c *cli.Context, cfg *options) error { + s, err := oci.LoadContainerState(cfg.containerSpec) + if err != nil { + return fmt.Errorf("failed to load container state: %v", err) + } + + containerRootDir, err := s.GetContainerRoot() + if err != nil || containerRootDir == "" || containerRootDir == "/" { + return fmt.Errorf("failed to determined container root: %v", err) + } + + args := []string{ + reexecUpdateLdCacheCommandName, + strings.TrimPrefix(config.NormalizeLDConfigPath("@"+cfg.ldconfigPath), "@"), + containerRootDir, + } + args = append(args, cfg.folders.Value()...) + + cmd := createReexecCommand(args) + + return cmd.Run() +} + +// createSonameSymlinksHandler wraps createSonameSymlinks with error handling. +func createSonameSymlinksHandler() { + if err := createSonameSymlinks(os.Args); err != nil { + log.Printf("Error updating ldcache: %v", err) + os.Exit(1) + } +} + +// createSonameSymlinks is invoked from a reexec'd handler and provides namespace +// isolation for the operations performed by this hook. +// At the point where this is invoked, we are in a new mount namespace that is +// cloned from the parent. +// +// args[0] is the reexec initializer function name +// args[1] is the path of the ldconfig binary on the host +// args[2] is the container root directory +// The remaining args are directories in which soname symlinks are created. +func createSonameSymlinks(args []string) error { + if len(args) < 3 { + return fmt.Errorf("incorrect arguments: %v", args) + } + hostLdconfigPath := args[1] + containerRootDirPath := args[2] + + // To prevent leaking the parent proc filesystem, we create a new proc mount + // in the container root. 
+ if err := mountProc(containerRootDirPath); err != nil { + return fmt.Errorf("error mounting /proc: %w", err) + } + + // We mount the host ldconfig before we pivot root since host paths are not + // visible after the pivot root operation. + ldconfigPath, err := mountLdConfig(hostLdconfigPath, containerRootDirPath) + if err != nil { + return fmt.Errorf("error mounting host ldconfig: %w", err) + } + + // We pivot to the container root for the new process, this further limits + // access to the host. + if err := pivotRoot(containerRootDirPath); err != nil { + return fmt.Errorf("error running pivot_root: %w", err) + } + + return runLdconfig(ldconfigPath, args[3:]...) +} + +// runLdconfig runs the ldconfig binary and ensures that soname symlinks are +// created in the specified directories. +func runLdconfig(ldconfigPath string, directories ...string) error { + args := []string{ + "ldconfig", + // Explicitly disable updating the LDCache. + "-N", + // Specify -n to only process the specified directories. + "-n", + } + args = append(args, directories...) 
+ + return SafeExec(ldconfigPath, args, nil) +} diff --git a/cmd/nvidia-ctk-installer/toolkit/toolkit_test.go b/cmd/nvidia-ctk-installer/toolkit/toolkit_test.go index d7246330..ed90c8e3 100644 --- a/cmd/nvidia-ctk-installer/toolkit/toolkit_test.go +++ b/cmd/nvidia-ctk-installer/toolkit/toolkit_test.go @@ -97,6 +97,15 @@ containerEdits: - libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so env: - NVIDIA_CTK_DEBUG=false + - hookName: createContainer + path: {{ .toolkitRoot }}/nvidia-cdi-hook + args: + - nvidia-cdi-hook + - create-soname-symlinks + - --folder + - /lib/x86_64-linux-gnu + env: + - NVIDIA_CTK_DEBUG=false - hookName: createContainer path: {{ .toolkitRoot }}/nvidia-cdi-hook args: diff --git a/cmd/nvidia-ctk/cdi/generate/generate_test.go b/cmd/nvidia-ctk/cdi/generate/generate_test.go index 6f762761..d2e31749 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate_test.go +++ b/cmd/nvidia-ctk/cdi/generate/generate_test.go @@ -102,6 +102,15 @@ containerEdits: - --host-driver-version=999.88.77 env: - NVIDIA_CTK_DEBUG=false + - hookName: createContainer + path: /usr/bin/nvidia-cdi-hook + args: + - nvidia-cdi-hook + - create-soname-symlinks + - --folder + - /lib/x86_64-linux-gnu + env: + - NVIDIA_CTK_DEBUG=false - hookName: createContainer path: /usr/bin/nvidia-cdi-hook args: @@ -178,6 +187,15 @@ containerEdits: - libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so env: - NVIDIA_CTK_DEBUG=false + - hookName: createContainer + path: /usr/bin/nvidia-cdi-hook + args: + - nvidia-cdi-hook + - create-soname-symlinks + - --folder + - /lib/x86_64-linux-gnu + env: + - NVIDIA_CTK_DEBUG=false - hookName: createContainer path: /usr/bin/nvidia-cdi-hook args: @@ -254,6 +272,15 @@ containerEdits: - libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so env: - NVIDIA_CTK_DEBUG=false + - hookName: createContainer + path: /usr/bin/nvidia-cdi-hook + args: + - nvidia-cdi-hook + - create-soname-symlinks + - --folder + - /lib/x86_64-linux-gnu + env: + - NVIDIA_CTK_DEBUG=false - hookName: createContainer 
path: /usr/bin/nvidia-cdi-hook args: diff --git a/internal/discover/hooks.go b/internal/discover/hooks.go index 3f2c9ebb..893c052d 100644 --- a/internal/discover/hooks.go +++ b/internal/discover/hooks.go @@ -46,6 +46,9 @@ const ( // An UpdateLDCacheHook is the hook used to update the ldcache in the // container. This allows injected libraries to be discoverable. UpdateLDCacheHook = HookName("update-ldcache") + // A CreateSonameSymlinksHook is the hook used to ensure that soname symlinks + // for injected libraries exist in the container. + CreateSonameSymlinksHook = HookName("create-soname-symlinks") defaultNvidiaCDIHookPath = "/usr/bin/nvidia-cdi-hook" ) diff --git a/internal/discover/ldconfig.go b/internal/discover/ldconfig.go index eb5ab467..15356de8 100644 --- a/internal/discover/ldconfig.go +++ b/internal/discover/ldconfig.go @@ -51,28 +51,22 @@ func (d ldconfig) Hooks() ([]Hook, error) { return nil, fmt.Errorf("failed to discover mounts for ldcache update: %v", err) } - h := createLDCacheUpdateHook( - d.hookCreator, - d.ldconfigPath, - getLibraryPaths(mounts), - ) - - return h.Hooks() -} - -// createLDCacheUpdateHook locates the NVIDIA Container Toolkit CLI and creates a hook for updating the LD Cache -func createLDCacheUpdateHook(hookCreator HookCreator, ldconfig string, libraries []string) *Hook { var args []string - if ldconfig != "" { - args = append(args, "--ldconfig-path", ldconfig) + if d.ldconfigPath != "" { + args = append(args, "--ldconfig-path", d.ldconfigPath) } - for _, f := range uniqueFolders(libraries) { + for _, f := range uniqueFolders(getLibraryPaths(mounts)) { args = append(args, "--folder", f) } - return hookCreator.Create(UpdateLDCacheHook, args...) 
+ h := Merge( + d.hookCreator.Create(CreateSonameSymlinksHook, args...), + d.hookCreator.Create(UpdateLDCacheHook, args...), + ) + + return h.Hooks() } // getLibraryPaths extracts the library dirs from the specified mounts diff --git a/internal/discover/ldconfig_test.go b/internal/discover/ldconfig_test.go index ff18118d..30576a7b 100644 --- a/internal/discover/ldconfig_test.go +++ b/internal/discover/ldconfig_test.go @@ -39,11 +39,24 @@ func TestLDCacheUpdateHook(t *testing.T) { mounts []Mount mountError error expectedError error - expectedArgs []string + expectedHooks []Hook }{ { - description: "empty mounts", - expectedArgs: []string{"nvidia-cdi-hook", "update-ldcache"}, + description: "empty mounts", + expectedHooks: []Hook{ + { + Lifecycle: "createContainer", + Path: testNvidiaCDIHookPath, + Args: []string{"nvidia-cdi-hook", "create-soname-symlinks"}, + Env: []string{"NVIDIA_CTK_DEBUG=false"}, + }, + { + Lifecycle: "createContainer", + Path: testNvidiaCDIHookPath, + Args: []string{"nvidia-cdi-hook", "update-ldcache"}, + Env: []string{"NVIDIA_CTK_DEBUG=false"}, + }, + }, }, { description: "mount error", @@ -66,7 +79,20 @@ func TestLDCacheUpdateHook(t *testing.T) { Path: "/usr/local/lib/libbar.so", }, }, - expectedArgs: []string{"nvidia-cdi-hook", "update-ldcache", "--folder", "/usr/local/lib", "--folder", "/usr/local/libother"}, + expectedHooks: []Hook{ + { + Lifecycle: "createContainer", + Path: testNvidiaCDIHookPath, + Args: []string{"nvidia-cdi-hook", "create-soname-symlinks", "--folder", "/usr/local/lib", "--folder", "/usr/local/libother"}, + Env: []string{"NVIDIA_CTK_DEBUG=false"}, + }, + { + Lifecycle: "createContainer", + Path: testNvidiaCDIHookPath, + Args: []string{"nvidia-cdi-hook", "update-ldcache", "--folder", "/usr/local/lib", "--folder", "/usr/local/libother"}, + Env: []string{"NVIDIA_CTK_DEBUG=false"}, + }, + }, }, { description: "host paths are ignored", @@ -76,12 +102,38 @@ func TestLDCacheUpdateHook(t *testing.T) { Path: 
"/usr/local/lib/libfoo.so", }, }, - expectedArgs: []string{"nvidia-cdi-hook", "update-ldcache", "--folder", "/usr/local/lib"}, + expectedHooks: []Hook{ + { + Lifecycle: "createContainer", + Path: testNvidiaCDIHookPath, + Args: []string{"nvidia-cdi-hook", "create-soname-symlinks", "--folder", "/usr/local/lib"}, + Env: []string{"NVIDIA_CTK_DEBUG=false"}, + }, + { + Lifecycle: "createContainer", + Path: testNvidiaCDIHookPath, + Args: []string{"nvidia-cdi-hook", "update-ldcache", "--folder", "/usr/local/lib"}, + Env: []string{"NVIDIA_CTK_DEBUG=false"}, + }, + }, }, { description: "explicit ldconfig path is passed", ldconfigPath: testLdconfigPath, - expectedArgs: []string{"nvidia-cdi-hook", "update-ldcache", "--ldconfig-path", testLdconfigPath}, + expectedHooks: []Hook{ + { + Lifecycle: "createContainer", + Path: testNvidiaCDIHookPath, + Args: []string{"nvidia-cdi-hook", "create-soname-symlinks", "--ldconfig-path", testLdconfigPath}, + Env: []string{"NVIDIA_CTK_DEBUG=false"}, + }, + { + Lifecycle: "createContainer", + Path: testNvidiaCDIHookPath, + Args: []string{"nvidia-cdi-hook", "update-ldcache", "--ldconfig-path", testLdconfigPath}, + Env: []string{"NVIDIA_CTK_DEBUG=false"}, + }, + }, }, } @@ -92,13 +144,6 @@ func TestLDCacheUpdateHook(t *testing.T) { return tc.mounts, tc.mountError }, } - expectedHook := Hook{ - Path: testNvidiaCDIHookPath, - Args: tc.expectedArgs, - Lifecycle: "createContainer", - Env: []string{"NVIDIA_CTK_DEBUG=false"}, - } - d, err := NewLDCacheUpdateHook(logger, mountMock, hookCreator, tc.ldconfigPath) require.NoError(t, err) @@ -112,9 +157,7 @@ func TestLDCacheUpdateHook(t *testing.T) { } require.NoError(t, err) - require.Len(t, hooks, 1) - - require.EqualValues(t, hooks[0], expectedHook) + require.EqualValues(t, tc.expectedHooks, hooks) devices, err := d.Devices() require.NoError(t, err) diff --git a/pkg/nvcdi/api.go b/pkg/nvcdi/api.go index 50d5c4ea..8bd0bf12 100644 --- a/pkg/nvcdi/api.go +++ b/pkg/nvcdi/api.go @@ -56,6 +56,9 @@ const ( 
EnableCudaCompatHook = discover.EnableCudaCompatHook // An UpdateLDCacheHook is used to update the ldcache in the container. UpdateLDCacheHook = discover.UpdateLDCacheHook + // A CreateSonameSymlinksHook is the hook used to ensure that soname symlinks + // for injected libraries exist in the container. + CreateSonameSymlinksHook = discover.CreateSonameSymlinksHook // Deprecated: Use CreateSymlinksHook instead. HookCreateSymlinks = CreateSymlinksHook diff --git a/tests/e2e/nvidia-container-toolkit_test.go b/tests/e2e/nvidia-container-toolkit_test.go index f89b79a5..c143ea7d 100644 --- a/tests/e2e/nvidia-container-toolkit_test.go +++ b/tests/e2e/nvidia-container-toolkit_test.go @@ -235,4 +235,26 @@ var _ = Describe("docker", Ordered, ContinueOnFailure, func() { Expect(output).To(Equal("ModifyDeviceFiles: 0\n")) }) }) + + When("A container is run using CDI", Ordered, func() { + BeforeAll(func(ctx context.Context) { + _, _, err := runner.Run("docker pull ubuntu") + Expect(err).ToNot(HaveOccurred()) + }) + + It("should include libcuda.so in the ldcache", func(ctx context.Context) { + ldcacheOutput, _, err := runner.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu bash -c \"ldconfig -p | grep 'libcuda.so'\"") + Expect(err).ToNot(HaveOccurred()) + Expect(ldcacheOutput).ToNot(BeEmpty()) + + ldcacheLines := strings.Split(ldcacheOutput, "\n") + var libs []string + for _, line := range ldcacheLines { + parts := strings.SplitN(line, " (", 2) + libs = append(libs, strings.TrimSpace(parts[0])) + } + + Expect(libs).To(ContainElements([]string{"libcuda.so", "libcuda.so.1"})) + }) + }) })