mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-03-10 06:10:08 +00:00
Add cuda-compat hook to allow compat libs to be discovered
This change adds an nvidia-cdi-hook cuda-compat hook that checks the container for cuda compat libs and updates /etc/ld.so.conf.d to include their parent folder if their driver major version is sufficient. This allows CUDA Forward Compatibility to be used when this is not available through the libnvidia-container. Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
parent
65b575fa96
commit
11cd8be986
@ -21,6 +21,7 @@ import (
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/chmod"
|
||||
symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-symlinks"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/cudacompat"
|
||||
ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/update-ldcache"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
)
|
||||
@ -32,5 +33,6 @@ func New(logger logger.Interface) []*cli.Command {
|
||||
ldcache.NewCommand(logger),
|
||||
symlinks.NewCommand(logger),
|
||||
chmod.NewCommand(logger),
|
||||
cudacompat.NewCommand(logger),
|
||||
}
|
||||
}
|
||||
|
76
cmd/nvidia-cdi-hook/cudacompat/container-root.go
Normal file
76
cmd/nvidia-cdi-hook/cudacompat/container-root.go
Normal file
@ -0,0 +1,76 @@
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package cudacompat
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/moby/sys/symlink"
|
||||
)
|
||||
|
||||
// A containerRoot represents the root filesystem of a container.
|
||||
type containerRoot string
|
||||
|
||||
// hasPath checks whether the specified path exists in the root.
|
||||
func (r containerRoot) hasPath(path string) bool {
|
||||
resolved, err := r.resolve(path)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
if _, err := os.Stat(resolved); err != nil && os.IsNotExist(err) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// globFiles matches the specified pattern in the root.
|
||||
// The files that match must be regular files.
|
||||
func (r containerRoot) globFiles(pattern string) ([]string, error) {
|
||||
patternPath, err := r.resolve(pattern)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
matches, err := filepath.Glob(patternPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var files []string
|
||||
for _, match := range matches {
|
||||
info, err := os.Lstat(match)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// Ignore symlinks.
|
||||
if info.Mode()&os.ModeSymlink != 0 {
|
||||
continue
|
||||
}
|
||||
// Ignore directories.
|
||||
if info.IsDir() {
|
||||
continue
|
||||
}
|
||||
files = append(files, match)
|
||||
}
|
||||
return files, nil
|
||||
}
|
||||
|
||||
// resolve returns the absolute path including root path.
|
||||
// Symlinks are resolved, but are guaranteed to resolve in the root.
|
||||
func (r containerRoot) resolve(path string) (string, error) {
|
||||
absolute := filepath.Clean(filepath.Join(string(r), path))
|
||||
return symlink.FollowSymlinkInScope(absolute, string(r))
|
||||
}
|
208
cmd/nvidia-cdi-hook/cudacompat/cudacompat.go
Normal file
208
cmd/nvidia-cdi-hook/cudacompat/cudacompat.go
Normal file
@ -0,0 +1,208 @@
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package cudacompat
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/urfave/cli/v2"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
)
|
||||
|
||||
const (
|
||||
cudaCompatPath = "/usr/local/cuda/compat"
|
||||
// cudaCompatLdsoconfdFilenamePattern specifies the pattern for the filename
|
||||
// in ld.so.conf.d that includes a reference to the CUDA compat path.
|
||||
// The 00-compat prefix is chosen to ensure that these libraries have a
|
||||
// higher precedence than other libraries on the system.
|
||||
cudaCompatLdsoconfdFilenamePattern = "00-compat-*.conf"
|
||||
)
|
||||
|
||||
type command struct {
|
||||
logger logger.Interface
|
||||
}
|
||||
|
||||
type options struct {
|
||||
hostDriverVersion string
|
||||
containerSpec string
|
||||
}
|
||||
|
||||
// NewCommand constructs a cuda-compat command with the specified logger
|
||||
func NewCommand(logger logger.Interface) *cli.Command {
|
||||
c := command{
|
||||
logger: logger,
|
||||
}
|
||||
return c.build()
|
||||
}
|
||||
|
||||
// build the cuda-compat command
|
||||
func (m command) build() *cli.Command {
|
||||
cfg := options{}
|
||||
|
||||
// Create the 'cuda-compat' command
|
||||
c := cli.Command{
|
||||
Name: "cuda-compat",
|
||||
Usage: "This hook ensures that the folder containing the CUDA compat libraries is added to the ldconfig search path if required.",
|
||||
Before: func(c *cli.Context) error {
|
||||
return m.validateFlags(c, &cfg)
|
||||
},
|
||||
Action: func(c *cli.Context) error {
|
||||
return m.run(c, &cfg)
|
||||
},
|
||||
}
|
||||
|
||||
c.Flags = []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "host-driver-version",
|
||||
Usage: "Specify the host driver version. If the CUDA compat libraries detected in the container do not have a higher MAJOR version, the hook is a no-op.",
|
||||
Destination: &cfg.hostDriverVersion,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "container-spec",
|
||||
Hidden: true,
|
||||
Category: "testing-only",
|
||||
Usage: "Specify the path to the OCI container spec. If empty or '-' the spec will be read from STDIN",
|
||||
Destination: &cfg.containerSpec,
|
||||
},
|
||||
}
|
||||
|
||||
return &c
|
||||
}
|
||||
|
||||
func (m command) validateFlags(_ *cli.Context, cfg *options) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m command) run(_ *cli.Context, cfg *options) error {
|
||||
if cfg.hostDriverVersion == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
s, err := oci.LoadContainerState(cfg.containerSpec)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load container state: %w", err)
|
||||
}
|
||||
|
||||
containerRootDir, err := s.GetContainerRoot()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to determined container root: %w", err)
|
||||
}
|
||||
|
||||
containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.hostDriverVersion)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get container forward compat directory: %w", err)
|
||||
}
|
||||
if containerForwardCompatDir == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
return m.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir)
|
||||
}
|
||||
|
||||
func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostDriverVersion string) (string, error) {
|
||||
if hostDriverVersion == "" {
|
||||
m.logger.Debugf("Host driver version not specified")
|
||||
return "", nil
|
||||
}
|
||||
if !containerRoot.hasPath(cudaCompatPath) {
|
||||
m.logger.Debugf("No CUDA forward compatibility libraries directory in container")
|
||||
return "", nil
|
||||
}
|
||||
if !containerRoot.hasPath("/etc/ld.so.cache") {
|
||||
m.logger.Debugf("The container does not have an LDCache")
|
||||
return "", nil
|
||||
}
|
||||
|
||||
libs, err := containerRoot.globFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*"))
|
||||
if err != nil {
|
||||
m.logger.Warningf("Failed to find CUDA compat library: %w", err)
|
||||
return "", nil
|
||||
}
|
||||
|
||||
if len(libs) == 0 {
|
||||
m.logger.Debugf("No CUDA forward compatibility libraries container")
|
||||
return "", nil
|
||||
}
|
||||
|
||||
if len(libs) != 1 {
|
||||
m.logger.Warningf("Unexpected number of CUDA compat libraries in container: %v", libs)
|
||||
return "", nil
|
||||
}
|
||||
|
||||
compatDriverVersion := strings.TrimPrefix(filepath.Base(libs[0]), "libcuda.so.")
|
||||
compatMajor := strings.SplitN(compatDriverVersion, ".", 2)[0]
|
||||
|
||||
driverMajor := strings.SplitN(hostDriverVersion, ".", 2)[0]
|
||||
|
||||
if driverMajor >= compatMajor {
|
||||
m.logger.Debugf("Compat major version is not greater than the host driver major version (%v >= %v)", hostDriverVersion, compatDriverVersion)
|
||||
return "", nil
|
||||
}
|
||||
|
||||
resolvedCompatDir := strings.TrimPrefix(filepath.Dir(libs[0]), string(containerRoot))
|
||||
return resolvedCompatDir, nil
|
||||
}
|
||||
|
||||
// createLdsoconfdFile creates a file at /etc/ld.so.conf.d/ in the specified root.
|
||||
// The file is created at /etc/ld.so.conf.d/{{ .pattern }} using `CreateTemp` and
|
||||
// contains the specified directories on each line.
|
||||
func (m command) createLdsoconfdFile(in containerRoot, pattern string, dirs ...string) error {
|
||||
if len(dirs) == 0 {
|
||||
m.logger.Debugf("No directories to add to /etc/ld.so.conf")
|
||||
return nil
|
||||
}
|
||||
|
||||
ldsoconfdDir, err := in.resolve("/etc/ld.so.conf.d")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.MkdirAll(ldsoconfdDir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create ld.so.conf.d: %w", err)
|
||||
}
|
||||
|
||||
configFile, err := os.CreateTemp(ldsoconfdDir, pattern)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create config file: %w", err)
|
||||
}
|
||||
defer configFile.Close()
|
||||
|
||||
m.logger.Debugf("Adding directories %v to %v", dirs, configFile.Name())
|
||||
|
||||
added := make(map[string]bool)
|
||||
for _, dir := range dirs {
|
||||
if added[dir] {
|
||||
continue
|
||||
}
|
||||
_, err = configFile.WriteString(fmt.Sprintf("%s\n", dir))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to update config file: %w", err)
|
||||
}
|
||||
added[dir] = true
|
||||
}
|
||||
|
||||
// The created file needs to be world readable for the cases where the container is run as a non-root user.
|
||||
if err := configFile.Chmod(0644); err != nil {
|
||||
return fmt.Errorf("failed to chmod config file: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
173
cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go
Normal file
173
cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go
Normal file
@ -0,0 +1,173 @@
|
||||
/*
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package cudacompat
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
testlog "github.com/sirupsen/logrus/hooks/test"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestCompatLibs(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
|
||||
testCases := []struct {
|
||||
description string
|
||||
contents map[string]string
|
||||
hostDriverVersion string
|
||||
expectedContainerForwardCompatDir string
|
||||
}{
|
||||
{
|
||||
description: "empty root",
|
||||
hostDriverVersion: "222.55.66",
|
||||
},
|
||||
{
|
||||
description: "compat lib is newer; no ldcache",
|
||||
contents: map[string]string{
|
||||
"/usr/local/cuda/compat/libcuda.so.333.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
},
|
||||
{
|
||||
description: "compat lib is newer; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.333.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "/usr/local/cuda/compat",
|
||||
},
|
||||
{
|
||||
description: "compat lib is older; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.111.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "",
|
||||
},
|
||||
{
|
||||
description: "compat lib has same major version; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "",
|
||||
},
|
||||
{
|
||||
description: "driver version empty; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "",
|
||||
},
|
||||
{
|
||||
description: "symlinks are followed",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/etc/alternatives/cuda/compat/libcuda.so.333.88.99": "",
|
||||
"/usr/local/cuda": "symlink=/etc/alternatives/cuda",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "/etc/alternatives/cuda/compat",
|
||||
},
|
||||
{
|
||||
description: "symlinks stay in container",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/compat/libcuda.so.333.88.99": "",
|
||||
"/usr/local/cuda": "symlink=../../../../../../",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "/compat",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
containerRootDir := t.TempDir()
|
||||
for name, contents := range tc.contents {
|
||||
target := filepath.Join(containerRootDir, name)
|
||||
require.NoError(t, os.MkdirAll(filepath.Dir(target), 0755))
|
||||
|
||||
if strings.HasPrefix(contents, "symlink=") {
|
||||
require.NoError(t, os.Symlink(strings.TrimPrefix(contents, "symlink="), target))
|
||||
continue
|
||||
}
|
||||
|
||||
require.NoError(t, os.WriteFile(target, []byte(contents), 0600))
|
||||
}
|
||||
|
||||
c := command{
|
||||
logger: logger,
|
||||
}
|
||||
containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), tc.hostDriverVersion)
|
||||
require.NoError(t, err)
|
||||
require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateLdconfig(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
testCases := []struct {
|
||||
description string
|
||||
folders []string
|
||||
expectedContents string
|
||||
}{
|
||||
{
|
||||
description: "no folders; have no contents",
|
||||
},
|
||||
{
|
||||
description: "single folder is added",
|
||||
folders: []string{"/usr/local/cuda/compat"},
|
||||
expectedContents: "/usr/local/cuda/compat\n",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
containerRootDir := t.TempDir()
|
||||
c := command{
|
||||
logger: logger,
|
||||
}
|
||||
err := c.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, tc.folders...)
|
||||
require.NoError(t, err)
|
||||
|
||||
matches, err := filepath.Glob(filepath.Join(containerRootDir, "/etc/ld.so.conf.d/00-compat-*.conf"))
|
||||
require.NoError(t, err)
|
||||
|
||||
if tc.expectedContents == "" {
|
||||
require.Empty(t, matches)
|
||||
return
|
||||
}
|
||||
|
||||
require.Len(t, matches, 1)
|
||||
contents, err := os.ReadFile(matches[0])
|
||||
require.NoError(t, err)
|
||||
|
||||
require.EqualValues(t, tc.expectedContents, string(contents))
|
||||
})
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user