From fc65d3a784301fd88b46c85034af434ca320f573 Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 6 Feb 2025 17:24:42 +0100 Subject: [PATCH] Add enable-cuda-compat hook to allow compat libs to be discovered This change adds an nvidia-cdi-hook enable-cuda-compat hook that checks the container for cuda compat libs and updates /etc/ld.so.conf.d to include their parent folder if their driver major version is sufficient. This allows CUDA Forward Compatibility to be used when this is not available through the libnvidia-container. Signed-off-by: Evan Lezar --- cmd/nvidia-cdi-hook/commands/commands.go | 2 + .../cudacompat/container-root.go | 76 ++++++ cmd/nvidia-cdi-hook/cudacompat/cudacompat.go | 221 ++++++++++++++++++ .../cudacompat/cudacompat_test.go | 173 ++++++++++++++ 4 files changed, 472 insertions(+) create mode 100644 cmd/nvidia-cdi-hook/cudacompat/container-root.go create mode 100644 cmd/nvidia-cdi-hook/cudacompat/cudacompat.go create mode 100644 cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go diff --git a/cmd/nvidia-cdi-hook/commands/commands.go b/cmd/nvidia-cdi-hook/commands/commands.go index a222acf2..3f80ba9b 100644 --- a/cmd/nvidia-cdi-hook/commands/commands.go +++ b/cmd/nvidia-cdi-hook/commands/commands.go @@ -21,6 +21,7 @@ import ( "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/chmod" symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-symlinks" + "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/cudacompat" ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/update-ldcache" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" ) @@ -32,5 +33,6 @@ func New(logger logger.Interface) []*cli.Command { ldcache.NewCommand(logger), symlinks.NewCommand(logger), chmod.NewCommand(logger), + cudacompat.NewCommand(logger), } } diff --git a/cmd/nvidia-cdi-hook/cudacompat/container-root.go b/cmd/nvidia-cdi-hook/cudacompat/container-root.go new file mode 100644 index 
00000000..8bb3b3c8 --- /dev/null +++ b/cmd/nvidia-cdi-hook/cudacompat/container-root.go @@ -0,0 +1,76 @@ +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package cudacompat + +import ( + "os" + "path/filepath" + + "github.com/moby/sys/symlink" +) + +// A containerRoot is the path to the root filesystem of a container; paths passed to its methods are resolved relative to it. +type containerRoot string + +// hasPath reports whether the specified path exists in the root. It returns false when the path cannot be resolved or os.Stat reports that it does not exist; any other stat error is treated as the path existing. +func (r containerRoot) hasPath(path string) bool { + resolved, err := r.resolve(path) + if err != nil { + return false + } + if _, err := os.Stat(resolved); err != nil && os.IsNotExist(err) { + return false + } + return true +} + +// globFiles returns the paths, including the root prefix, that match the specified glob pattern in the root. +// Matches that are symlinks or directories are skipped; an Lstat failure on any match aborts with that error. +func (r containerRoot) globFiles(pattern string) ([]string, error) { + patternPath, err := r.resolve(pattern) + if err != nil { + return nil, err + } + matches, err := filepath.Glob(patternPath) + if err != nil { + return nil, err + } + var files []string + for _, match := range matches { + info, err := os.Lstat(match) + if err != nil { + return nil, err + } + // Ignore symlinks. + if info.Mode()&os.ModeSymlink != 0 { + continue + } + // Ignore directories. + if info.IsDir() { + continue + } + files = append(files, match) + } + return files, nil +} + +// resolve returns the absolute path including root path.
+// Symlinks are followed with symlink.FollowSymlinkInScope using the root as the scope, so the result stays within the root. +func (r containerRoot) resolve(path string) (string, error) { + absolute := filepath.Clean(filepath.Join(string(r), path)) + return symlink.FollowSymlinkInScope(absolute, string(r)) +} diff --git a/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go b/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go new file mode 100644 index 00000000..0cecd6c1 --- /dev/null +++ b/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go @@ -0,0 +1,221 @@ +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package cudacompat + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/urfave/cli/v2" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" + "github.com/NVIDIA/nvidia-container-toolkit/internal/oci" +) + +const ( + cudaCompatPath = "/usr/local/cuda/compat" // directory, relative to the container root, that is searched for libcuda.so.*.* + // cudaCompatLdsoconfdFilenamePattern specifies the pattern for the filename + // in ld.so.conf.d that includes a reference to the CUDA compat path. + // The 00-compat prefix is chosen to ensure that these libraries have a + // higher precedence than other libraries on the system.
+ cudaCompatLdsoconfdFilenamePattern = "00-compat-*.conf" // passed to os.CreateTemp, which replaces the "*" with a random string +) + +type command struct { + logger logger.Interface +} + +type options struct { + hostDriverVersion string // value of --host-driver-version; when empty the hook is a no-op + containerSpec string // value of the hidden --container-spec flag +} + +// NewCommand constructs the enable-cuda-compat command with the specified logger. +func NewCommand(logger logger.Interface) *cli.Command { + c := command{ + logger: logger, + } + return c.build() +} + +// build creates the enable-cuda-compat CLI command; its flags write into an options value shared by the Before and Action callbacks. +func (m command) build() *cli.Command { + cfg := options{} + + // Create the 'enable-cuda-compat' command + c := cli.Command{ + Name: "enable-cuda-compat", + Usage: "This hook ensures that the folder containing the CUDA compat libraries is added to the ldconfig search path if required.", + Before: func(c *cli.Context) error { + return m.validateFlags(c, &cfg) + }, + Action: func(c *cli.Context) error { + return m.run(c, &cfg) + }, + } + + c.Flags = []cli.Flag{ + &cli.StringFlag{ + Name: "host-driver-version", + Usage: "Specify the host driver version. If the CUDA compat libraries detected in the container do not have a higher MAJOR version, the hook is a no-op.", + Destination: &cfg.hostDriverVersion, + }, + &cli.StringFlag{ + Name: "container-spec", + Hidden: true, + Category: "testing-only", + Usage: "Specify the path to the OCI container spec.
If empty or '-' the spec will be read from STDIN", + Destination: &cfg.containerSpec, + }, + } + + return &c +} + +func (m command) validateFlags(_ *cli.Context, cfg *options) error { + return nil // No flags currently require validation. +} + +func (m command) run(_ *cli.Context, cfg *options) error { // run adds the container's CUDA compat directory to ld.so.conf.d when it is newer than the host driver. + if cfg.hostDriverVersion == "" { // Nothing to compare against: the hook is a no-op. + return nil + } + + s, err := oci.LoadContainerState(cfg.containerSpec) + if err != nil { + return fmt.Errorf("failed to load container state: %w", err) + } + + containerRootDir, err := s.GetContainerRoot() + if err != nil { + return fmt.Errorf("failed to determine container root: %w", err) + } + + containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.hostDriverVersion) + if err != nil { + return fmt.Errorf("failed to get container forward compat directory: %w", err) + } + if containerForwardCompatDir == "" { + return nil + } + + return m.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir) +} + +func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostDriverVersion string) (string, error) { // Returns the in-container compat directory, or "" when forward compatibility does not apply. + if hostDriverVersion == "" { + m.logger.Debugf("Host driver version not specified") + return "", nil + } + if !containerRoot.hasPath(cudaCompatPath) { + m.logger.Debugf("No CUDA forward compatibility libraries directory in container") + return "", nil + } + if !containerRoot.hasPath("/etc/ld.so.cache") { + m.logger.Debugf("The container does not have an LDCache") + return "", nil + } + + libs, err := containerRoot.globFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*")) + if err != nil { + m.logger.Warningf("Failed to find CUDA compat library: %v", err) + return "", nil // Best effort: a glob failure disables the hook rather than failing it. + } + + if len(libs) == 0 { + m.logger.Debugf("No CUDA forward compatibility libraries in container") + return "", nil + } + + if len(libs) != 1 { + m.logger.Warningf("Unexpected number of CUDA compat libraries in container: %v", libs) + return "", nil + } + + compatDriverVersion :=
strings.TrimPrefix(filepath.Base(libs[0]), "libcuda.so.") + compatMajor, err := extractMajorVersion(compatDriverVersion) + if err != nil { + return "", fmt.Errorf("failed to extract major version from %q: %w", compatDriverVersion, err) + } + + driverMajor, err := extractMajorVersion(hostDriverVersion) + if err != nil { + return "", fmt.Errorf("failed to extract major version from %q: %w", hostDriverVersion, err) + } + + if driverMajor >= compatMajor { + m.logger.Debugf("Compat major version is not greater than the host driver major version (%v >= %v)", hostDriverVersion, compatDriverVersion) + return "", nil + } + + resolvedCompatDir := strings.TrimPrefix(filepath.Dir(libs[0]), string(containerRoot)) // Strip the root so the directory is expressed as a path inside the container. + return resolvedCompatDir, nil +} + +// createLdsoconfdFile creates a file at /etc/ld.so.conf.d/ in the specified root. +// The file is created at /etc/ld.so.conf.d/{{ .pattern }} using `CreateTemp` and +// contains each distinct directory in dirs on its own line; if dirs is empty no file is created. +func (m command) createLdsoconfdFile(in containerRoot, pattern string, dirs ...string) error { + if len(dirs) == 0 { + m.logger.Debugf("No directories to add to /etc/ld.so.conf") + return nil + } + + ldsoconfdDir, err := in.resolve("/etc/ld.so.conf.d") + if err != nil { + return fmt.Errorf("failed to resolve ld.so.conf.d: %w", err) + } + if err := os.MkdirAll(ldsoconfdDir, 0755); err != nil { + return fmt.Errorf("failed to create ld.so.conf.d: %w", err) + } + + configFile, err := os.CreateTemp(ldsoconfdDir, pattern) + if err != nil { + return fmt.Errorf("failed to create config file: %w", err) + } + defer configFile.Close() + + m.logger.Debugf("Adding directories %v to %v", dirs, configFile.Name()) + + added := make(map[string]bool) + for _, dir := range dirs { + if added[dir] { + continue + } + _, err = fmt.Fprintf(configFile, "%s\n", dir) + if err != nil { + return fmt.Errorf("failed to update config file: %w", err) + } + added[dir] = true + } + + // The created file needs to be world readable for the cases where the container is run as a non-root
user. + if err := configFile.Chmod(0644); err != nil { + return fmt.Errorf("failed to chmod config file: %w", err) + } + + return nil +} + +// extractMajorVersion returns the part of version before the first "." parsed as an int; it returns an error if that part is not a valid integer. +func extractMajorVersion(version string) (int, error) { + majorString := strings.SplitN(version, ".", 2)[0] + return strconv.Atoi(majorString) +} diff --git a/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go b/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go new file mode 100644 index 00000000..ad8d5695 --- /dev/null +++ b/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go @@ -0,0 +1,173 @@ +/* +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+*/ + +package cudacompat + +import ( + "os" + "path/filepath" + "strings" + "testing" + + testlog "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/require" +) + +func TestCompatLibs(t *testing.T) { + logger, _ := testlog.NewNullLogger() + + testCases := []struct { + description string + contents map[string]string // path under the container root -> file contents; a "symlink=<target>" value creates a symlink instead + hostDriverVersion string + expectedContainerForwardCompatDir string // "" means the hook must not select a compat directory + }{ + { + description: "empty root", + hostDriverVersion: "222.55.66", + }, + { + description: "compat lib is newer; no ldcache", + contents: map[string]string{ + "/usr/local/cuda/compat/libcuda.so.333.88.99": "", + }, + hostDriverVersion: "222.55.66", + }, + { + description: "compat lib is newer; ldcache", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/usr/local/cuda/compat/libcuda.so.333.88.99": "", + }, + hostDriverVersion: "222.55.66", + expectedContainerForwardCompatDir: "/usr/local/cuda/compat", + }, + { + description: "compat lib is older; ldcache", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/usr/local/cuda/compat/libcuda.so.111.88.99": "", + }, + hostDriverVersion: "222.55.66", + expectedContainerForwardCompatDir: "", + }, + { + description: "compat lib has same major version; ldcache", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/usr/local/cuda/compat/libcuda.so.222.88.99": "", + }, + hostDriverVersion: "222.55.66", + expectedContainerForwardCompatDir: "", + }, + { + description: "driver version empty; ldcache", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/usr/local/cuda/compat/libcuda.so.222.88.99": "", + }, + hostDriverVersion: "", + }, + { + description: "symlinks are followed", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/etc/alternatives/cuda/compat/libcuda.so.333.88.99": "", + "/usr/local/cuda": "symlink=/etc/alternatives/cuda", + }, + hostDriverVersion: "222.55.66", + expectedContainerForwardCompatDir: "/etc/alternatives/cuda/compat", + }, + { + description: "symlinks
stay in container", + contents: map[string]string{ + "/etc/ld.so.cache": "", + "/compat/libcuda.so.333.88.99": "", + "/usr/local/cuda": "symlink=../../../../../../", // relative target climbs above the root; resolution must stay inside it + }, + hostDriverVersion: "222.55.66", + expectedContainerForwardCompatDir: "/compat", + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + containerRootDir := t.TempDir() + for name, contents := range tc.contents { + target := filepath.Join(containerRootDir, name) + require.NoError(t, os.MkdirAll(filepath.Dir(target), 0755)) + + if strings.HasPrefix(contents, "symlink=") { // "symlink=<target>" entries are created as symlinks rather than regular files + require.NoError(t, os.Symlink(strings.TrimPrefix(contents, "symlink="), target)) + continue + } + + require.NoError(t, os.WriteFile(target, []byte(contents), 0600)) + } + + c := command{ + logger: logger, + } + containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), tc.hostDriverVersion) + require.NoError(t, err) + require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir) + }) + } +} + +func TestUpdateLdconfig(t *testing.T) { + logger, _ := testlog.NewNullLogger() + testCases := []struct { + description string + folders []string + expectedContents string + }{ + { + description: "no folders; have no contents", + }, + { + description: "single folder is added", + folders: []string{"/usr/local/cuda/compat"}, + expectedContents: "/usr/local/cuda/compat\n", + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + containerRootDir := t.TempDir() + c := command{ + logger: logger, + } + err := c.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, tc.folders...)
+ require.NoError(t, err) + + matches, err := filepath.Glob(filepath.Join(containerRootDir, "/etc/ld.so.conf.d/00-compat-*.conf")) + require.NoError(t, err) + + if tc.expectedContents == "" { // no folders were passed, so no config file may have been created + require.Empty(t, matches) + return + } + + require.Len(t, matches, 1) + contents, err := os.ReadFile(matches[0]) + require.NoError(t, err) + + require.EqualValues(t, tc.expectedContents, string(contents)) + }) + } + +}