Merge pull request #906 from elezar/add-compat-lib-hook

Add CUDA forward compatibility hook
2025-04-20 22:25:22 +00:00 · 2025-02-27 17:25:19 +02:00 · 2025-02-27 17:25:19 +02:00 · 968e2ccca4
commit 968e2ccca4
parent 04e9bf4ac1 aff9301f2e
16 changed files with 1612 additions and 5 deletions
--- a/cmd/nvidia-cdi-hook/commands/commands.go
+++ b/cmd/nvidia-cdi-hook/commands/commands.go
@ -21,6 +21,7 @@ import (

 	"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/chmod"
 	symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-symlinks"
+	"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/cudacompat"
 	ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/update-ldcache"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
 )
@ -32,5 +33,6 @@ func New(logger logger.Interface) []*cli.Command {
 		ldcache.NewCommand(logger),
 		symlinks.NewCommand(logger),
 		chmod.NewCommand(logger),
+		cudacompat.NewCommand(logger),
 	}
 }
--- a/cmd/nvidia-cdi-hook/cudacompat/container-root.go
+++ b/cmd/nvidia-cdi-hook/cudacompat/container-root.go
@ -0,0 +1,76 @@
+/**
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package cudacompat
+
+import (
+	"os"
+	"path/filepath"
+
+	"github.com/moby/sys/symlink"
+)
+
+// A containerRoot represents the root filesystem of a container.
+// The underlying string is the path at which that root is located; the
+// methods of the type resolve container paths against it.
+type containerRoot string
+
+// hasPath reports whether path exists inside the container root.
+// A path that cannot be resolved within the root is reported as missing.
+// Stat errors other than "does not exist" are treated as the path existing.
+func (r containerRoot) hasPath(path string) bool {
+	target, resolveErr := r.resolve(path)
+	if resolveErr != nil {
+		return false
+	}
+	_, statErr := os.Stat(target)
+	return !os.IsNotExist(statErr)
+}
+
+// globFiles expands the specified glob pattern relative to the root.
+// Matches that are symlinks or directories are skipped; the remaining
+// matches are returned as paths that include the root.
+func (r containerRoot) globFiles(pattern string) ([]string, error) {
+	rootedPattern, err := r.resolve(pattern)
+	if err != nil {
+		return nil, err
+	}
+	candidates, err := filepath.Glob(rootedPattern)
+	if err != nil {
+		return nil, err
+	}
+
+	var files []string
+	for _, candidate := range candidates {
+		info, err := os.Lstat(candidate)
+		if err != nil {
+			return nil, err
+		}
+		// Lstat does not follow links, so symlinks and directories can both be
+		// detected from the mode of the match itself.
+		if mode := info.Mode(); mode&os.ModeSymlink != 0 || mode.IsDir() {
+			continue
+		}
+		files = append(files, candidate)
+	}
+	return files, nil
+}
+
+// resolve joins path onto the root and returns the resulting location.
+// Symlinks along the way are followed, scoped so that the result stays
+// within the root.
+func (r containerRoot) resolve(path string) (string, error) {
+	root := string(r)
+	joined := filepath.Join(root, path)
+	return symlink.FollowSymlinkInScope(filepath.Clean(joined), root)
+}
--- a/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go
+++ b/cmd/nvidia-cdi-hook/cudacompat/cudacompat.go
@ -0,0 +1,221 @@
+/**
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package cudacompat
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/urfave/cli/v2"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
+)
+
+const (
+	// cudaCompatPath is the directory in the container that is checked for
+	// CUDA forward compatibility libraries.
+	cudaCompatPath = "/usr/local/cuda/compat"
+	// cudaCompatLdsoconfdFilenamePattern specifies the pattern for the filename
+	// in ld.so.conf.d that includes a reference to the CUDA compat path.
+	// The 00-compat prefix is chosen to ensure that these libraries have a
+	// higher precedence than other libraries on the system.
+	cudaCompatLdsoconfdFilenamePattern = "00-compat-*.conf"
+)
+
+// command implements the enable-cuda-compat hook command.
+type command struct {
+	// logger receives the debug and warning messages emitted by the command.
+	logger logger.Interface
+}
+
+// options holds the values of the command line flags of the command.
+type options struct {
+	// hostDriverVersion is set by the --host-driver-version flag.
+	hostDriverVersion string
+	// containerSpec is set by the hidden --container-spec flag.
+	containerSpec string
+}
+
+// NewCommand returns the enable-cuda-compat CLI command, wired to use the
+// specified logger.
+func NewCommand(logger logger.Interface) *cli.Command {
+	return command{logger: logger}.build()
+}
+
+// build assembles the 'enable-cuda-compat' command. The flags are bound to a
+// single options value that is shared by the Before and Action callbacks.
+func (m command) build() *cli.Command {
+	opts := options{}
+
+	hostDriverVersionFlag := &cli.StringFlag{
+		Name:        "host-driver-version",
+		Usage:       "Specify the host driver version. If the CUDA compat libraries detected in the container do not have a higher MAJOR version, the hook is a no-op.",
+		Destination: &opts.hostDriverVersion,
+	}
+	containerSpecFlag := &cli.StringFlag{
+		Name:        "container-spec",
+		Hidden:      true,
+		Category:    "testing-only",
+		Usage:       "Specify the path to the OCI container spec. If empty or '-' the spec will be read from STDIN",
+		Destination: &opts.containerSpec,
+	}
+
+	return &cli.Command{
+		Name:  "enable-cuda-compat",
+		Usage: "This hook ensures that the folder containing the CUDA compat libraries is added to the ldconfig search path if required.",
+		Flags: []cli.Flag{hostDriverVersionFlag, containerSpecFlag},
+		Before: func(ctx *cli.Context) error {
+			return m.validateFlags(ctx, &opts)
+		},
+		Action: func(ctx *cli.Context) error {
+			return m.run(ctx, &opts)
+		},
+	}
+}
+
+// validateFlags is the Before callback of the command.
+// It currently performs no checks and always returns nil.
+func (m command) validateFlags(_ *cli.Context, cfg *options) error {
+	return nil
+}
+
+// run is the Action callback of the enable-cuda-compat command.
+//
+// It is a no-op if no host driver version was specified. Otherwise the
+// container state is loaded from cfg.containerSpec, the container root is
+// determined from it, and, if a CUDA forward compatibility directory applies
+// to the container, that directory is written to a new file in the
+// container's /etc/ld.so.conf.d.
+//
+// An error is returned if the container state or root cannot be determined,
+// if the compat directory cannot be evaluated, or if the config file cannot
+// be created.
+func (m command) run(_ *cli.Context, cfg *options) error {
+	if cfg.hostDriverVersion == "" {
+		return nil
+	}
+
+	s, err := oci.LoadContainerState(cfg.containerSpec)
+	if err != nil {
+		return fmt.Errorf("failed to load container state: %w", err)
+	}
+
+	containerRootDir, err := s.GetContainerRoot()
+	if err != nil {
+		return fmt.Errorf("failed to determine container root: %w", err)
+	}
+	rootfs := containerRoot(containerRootDir)
+
+	containerForwardCompatDir, err := m.getContainerForwardCompatDir(rootfs, cfg.hostDriverVersion)
+	if err != nil {
+		return fmt.Errorf("failed to get container forward compat directory: %w", err)
+	}
+	// An empty directory means that forward compatibility does not apply.
+	if containerForwardCompatDir == "" {
+		return nil
+	}
+
+	return m.createLdsoconfdFile(rootfs, cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir)
+}
+
+// getContainerForwardCompatDir returns the directory, with the container root
+// prefix removed, that holds the CUDA forward compatibility library that
+// should be used in the container.
+//
+// An empty string and a nil error are returned when no directory applies:
+// the host driver version is empty, the container has no compat directory or
+// no /etc/ld.so.cache, the compat directory does not contain exactly one
+// libcuda.so.*.* file, or the major version of the compat library is not
+// greater than the major version of the host driver.
+//
+// An error is returned only if a major version cannot be parsed.
+func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostDriverVersion string) (string, error) {
+	if hostDriverVersion == "" {
+		m.logger.Debugf("Host driver version not specified")
+		return "", nil
+	}
+	if !containerRoot.hasPath(cudaCompatPath) {
+		m.logger.Debugf("No CUDA forward compatibility libraries directory in container")
+		return "", nil
+	}
+	if !containerRoot.hasPath("/etc/ld.so.cache") {
+		m.logger.Debugf("The container does not have an LDCache")
+		return "", nil
+	}
+
+	libs, err := containerRoot.globFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*"))
+	if err != nil {
+		// %w is only meaningful to fmt.Errorf; a logger must use %v.
+		m.logger.Warningf("Failed to find CUDA compat library: %v", err)
+		return "", nil
+	}
+
+	if len(libs) == 0 {
+		m.logger.Debugf("No CUDA forward compatibility libraries in container")
+		return "", nil
+	}
+
+	if len(libs) != 1 {
+		m.logger.Warningf("Unexpected number of CUDA compat libraries in container: %v", libs)
+		return "", nil
+	}
+
+	// The driver version is encoded in the library filename suffix.
+	compatDriverVersion := strings.TrimPrefix(filepath.Base(libs[0]), "libcuda.so.")
+	compatMajor, err := extractMajorVersion(compatDriverVersion)
+	if err != nil {
+		return "", fmt.Errorf("failed to extract major version from %q: %w", compatDriverVersion, err)
+	}
+
+	driverMajor, err := extractMajorVersion(hostDriverVersion)
+	if err != nil {
+		return "", fmt.Errorf("failed to extract major version from %q: %w", hostDriverVersion, err)
+	}
+
+	if driverMajor >= compatMajor {
+		m.logger.Debugf("Compat major version is not greater than the host driver major version (%v >= %v)", hostDriverVersion, compatDriverVersion)
+		return "", nil
+	}
+
+	// The glob matches include the container root; strip it so that the
+	// returned directory is a path as seen from inside the container.
+	resolvedCompatDir := strings.TrimPrefix(filepath.Dir(libs[0]), string(containerRoot))
+	return resolvedCompatDir, nil
+}
+
+// createLdsoconfdFile creates a file at /etc/ld.so.conf.d/ in the specified root.
+// The file is created at /etc/ld.so.conf.d/{{ .pattern }} using `CreateTemp` and
+// contains the specified directories on each line. Duplicate directories are
+// only written once. If no directories are specified, no file is created.
+//
+// The ld.so.conf.d directory is created if it does not exist, and the file
+// is made world readable. If writing, changing the mode of, or closing the
+// file fails, the file is removed again and the error is returned.
+func (m command) createLdsoconfdFile(in containerRoot, pattern string, dirs ...string) (retErr error) {
+	if len(dirs) == 0 {
+		m.logger.Debugf("No directories to add to /etc/ld.so.conf")
+		return nil
+	}
+
+	ldsoconfdDir, err := in.resolve("/etc/ld.so.conf.d")
+	if err != nil {
+		return err
+	}
+	if err := os.MkdirAll(ldsoconfdDir, 0755); err != nil {
+		return fmt.Errorf("failed to create ld.so.conf.d: %w", err)
+	}
+
+	configFile, err := os.CreateTemp(ldsoconfdDir, pattern)
+	if err != nil {
+		return fmt.Errorf("failed to create config file: %w", err)
+	}
+	defer func() {
+		// A failed close can mean that the contents were not written out.
+		if closeErr := configFile.Close(); closeErr != nil && retErr == nil {
+			retErr = fmt.Errorf("failed to close config file: %w", closeErr)
+		}
+		// Do not leave a partially written file in ld.so.conf.d.
+		if retErr != nil {
+			_ = os.Remove(configFile.Name())
+		}
+	}()
+
+	m.logger.Debugf("Adding directories %v to %v", dirs, configFile.Name())
+
+	added := make(map[string]bool)
+	for _, dir := range dirs {
+		if added[dir] {
+			continue
+		}
+		if _, err := configFile.WriteString(dir + "\n"); err != nil {
+			return fmt.Errorf("failed to update config file: %w", err)
+		}
+		added[dir] = true
+	}
+
+	// The created file needs to be world readable for the cases where the container is run as a non-root user.
+	if err := configFile.Chmod(0644); err != nil {
+		return fmt.Errorf("failed to chmod config file: %w", err)
+	}
+
+	return nil
+}
+
+// extractMajorVersion returns the integer value of the part of version that
+// precedes the first '.' (or of the whole string if it contains no '.').
+// An error is returned if that part is not a valid integer.
+func extractMajorVersion(version string) (int, error) {
+	major := version
+	if dot := strings.IndexByte(version, '.'); dot >= 0 {
+		major = version[:dot]
+	}
+	return strconv.Atoi(major)
+}
--- a/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go
+++ b/cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go
@ -0,0 +1,182 @@
+/*
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+*/
+
+package cudacompat
+
+import (
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	testlog "github.com/sirupsen/logrus/hooks/test"
+	"github.com/stretchr/testify/require"
+)
+
+// TestCompatLibs checks the directory returned by getContainerForwardCompatDir
+// for container roots that are populated in a temporary directory.
+func TestCompatLibs(t *testing.T) {
+	logger, _ := testlog.NewNullLogger()
+
+	testCases := []struct {
+		description string
+		// contents maps a path in the container root to the contents of the
+		// file to create there. A value with a "symlink=" prefix creates a
+		// symlink to the remainder of the value instead.
+		contents                          map[string]string
+		hostDriverVersion                 string
+		expectedContainerForwardCompatDir string
+	}{
+		{
+			description:       "empty root",
+			hostDriverVersion: "222.55.66",
+		},
+		{
+			description: "compat lib is newer; no ldcache",
+			contents: map[string]string{
+				"/usr/local/cuda/compat/libcuda.so.333.88.99": "",
+			},
+			hostDriverVersion: "222.55.66",
+		},
+		{
+			description: "compat lib is newer; ldcache",
+			contents: map[string]string{
+				"/etc/ld.so.cache": "",
+				"/usr/local/cuda/compat/libcuda.so.333.88.99": "",
+			},
+			hostDriverVersion:                 "222.55.66",
+			expectedContainerForwardCompatDir: "/usr/local/cuda/compat",
+		},
+		{
+			description: "compat lib is older; ldcache",
+			contents: map[string]string{
+				"/etc/ld.so.cache": "",
+				"/usr/local/cuda/compat/libcuda.so.111.88.99": "",
+			},
+			hostDriverVersion:                 "222.55.66",
+			expectedContainerForwardCompatDir: "",
+		},
+		{
+			description: "compat lib has same major version; ldcache",
+			contents: map[string]string{
+				"/etc/ld.so.cache": "",
+				"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
+			},
+			hostDriverVersion:                 "222.55.66",
+			expectedContainerForwardCompatDir: "",
+		},
+		{
+			description: "numeric comparison is used; ldcache",
+			contents: map[string]string{
+				"/etc/ld.so.cache": "",
+				"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
+			},
+			hostDriverVersion:                 "99.55.66",
+			expectedContainerForwardCompatDir: "/usr/local/cuda/compat",
+		},
+		{
+			description: "driver version empty; ldcache",
+			contents: map[string]string{
+				"/etc/ld.so.cache": "",
+				"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
+			},
+			hostDriverVersion: "",
+		},
+		{
+			description: "symlinks are followed",
+			contents: map[string]string{
+				"/etc/ld.so.cache": "",
+				"/etc/alternatives/cuda/compat/libcuda.so.333.88.99": "",
+				"/usr/local/cuda": "symlink=/etc/alternatives/cuda",
+			},
+			hostDriverVersion:                 "222.55.66",
+			expectedContainerForwardCompatDir: "/etc/alternatives/cuda/compat",
+		},
+		{
+			description: "symlinks stay in container",
+			contents: map[string]string{
+				"/etc/ld.so.cache":             "",
+				"/compat/libcuda.so.333.88.99": "",
+				"/usr/local/cuda":              "symlink=../../../../../../",
+			},
+			hostDriverVersion:                 "222.55.66",
+			expectedContainerForwardCompatDir: "/compat",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			// Populate a fresh container root with the files and symlinks of
+			// the test case.
+			containerRootDir := t.TempDir()
+			for name, contents := range tc.contents {
+				target := filepath.Join(containerRootDir, name)
+				require.NoError(t, os.MkdirAll(filepath.Dir(target), 0755))
+
+				if strings.HasPrefix(contents, "symlink=") {
+					require.NoError(t, os.Symlink(strings.TrimPrefix(contents, "symlink="), target))
+					continue
+				}
+
+				require.NoError(t, os.WriteFile(target, []byte(contents), 0600))
+			}
+
+			c := command{
+				logger: logger,
+			}
+			containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), tc.hostDriverVersion)
+			require.NoError(t, err)
+			require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir)
+		})
+	}
+}
+
+// TestUpdateLdconfig checks the file that createLdsoconfdFile creates in the
+// /etc/ld.so.conf.d folder of a temporary root for different sets of folders.
+func TestUpdateLdconfig(t *testing.T) {
+	nullLogger, _ := testlog.NewNullLogger()
+	cmd := command{logger: nullLogger}
+
+	type testCase struct {
+		description      string
+		folders          []string
+		expectedContents string
+	}
+	cases := []testCase{
+		{description: "no folders; have no contents"},
+		{
+			description:      "single folder is added",
+			folders:          []string{"/usr/local/cuda/compat"},
+			expectedContents: "/usr/local/cuda/compat\n",
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.description, func(t *testing.T) {
+			rootDir := t.TempDir()
+			require.NoError(t, cmd.createLdsoconfdFile(containerRoot(rootDir), cudaCompatLdsoconfdFilenamePattern, tc.folders...))
+
+			created, err := filepath.Glob(filepath.Join(rootDir, "/etc/ld.so.conf.d/00-compat-*.conf"))
+			require.NoError(t, err)
+
+			// Without expected contents no file may have been created.
+			if tc.expectedContents == "" {
+				require.Empty(t, created)
+				return
+			}
+
+			require.Len(t, created, 1)
+			data, err := os.ReadFile(created[0])
+			require.NoError(t, err)
+
+			require.EqualValues(t, tc.expectedContents, string(data))
+		})
+	}
+}
--- a/cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go
+++ b/cmd/nvidia-ctk-installer/container/toolkit/toolkit_test.go
@ -80,6 +80,12 @@ containerEdits:
    - libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so
    hookName: createContainer
    path: {{ .toolkitRoot }}/nvidia-cdi-hook
+  - args:
+    - nvidia-cdi-hook
+    - enable-cuda-compat
+    - --host-driver-version=999.88.77
+    hookName: createContainer
+    path: {{ .toolkitRoot }}/nvidia-cdi-hook
  - args:
    - nvidia-cdi-hook
    - update-ldcache
--- a/cmd/nvidia-ctk/cdi/generate/generate.go
+++ b/cmd/nvidia-ctk/cdi/generate/generate.go
@ -25,6 +25,8 @@ import (
 	"github.com/urfave/cli/v2"
 	cdi "tags.cncf.io/container-device-interface/pkg/parser"

+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv"
@ -60,6 +62,9 @@ type options struct {
 		files          cli.StringSlice
 		ignorePatterns cli.StringSlice
 	}
+
+	// the following are used for dependency injection during spec generation.
+	nvmllib nvml.Interface
 }

 // NewCommand constructs a generate-cdi command with the specified logger
@ -269,6 +274,8 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) {
 		nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()),
 		nvcdi.WithCSVFiles(opts.csv.files.Value()),
 		nvcdi.WithCSVIgnorePatterns(opts.csv.ignorePatterns.Value()),
+		// We set the following to allow for dependency injection:
+		nvcdi.WithNvmlLib(opts.nvmllib),
 	)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create CDI library: %v", err)
--- a/cmd/nvidia-ctk/cdi/generate/generate_test.go
+++ b/cmd/nvidia-ctk/cdi/generate/generate_test.go
@ -0,0 +1,157 @@
+/**
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package generate
+
+import (
+	"bytes"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
+	testlog "github.com/sirupsen/logrus/hooks/test"
+	"github.com/stretchr/testify/require"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/test"
+)
+
+// TestGenerateSpec generates a CDI specification against a mocked NVML
+// library and a driver root under testdata/lookup/rootfs-1, and compares the
+// serialized specification with the expected YAML.
+func TestGenerateSpec(t *testing.T) {
+	t.Setenv("__NVCT_TESTING_DEVICES_ARE_FILES", "true")
+	moduleRoot, err := test.GetModuleRoot()
+	require.NoError(t, err)
+
+	driverRoot := filepath.Join(moduleRoot, "testdata", "lookup", "rootfs-1")
+
+	logger, _ := testlog.NewNullLogger()
+	testCases := []struct {
+		description           string
+		options               options
+		expectedValidateError error
+		expectedOptions       options
+		expectedError         error
+		// expectedSpec is the expected YAML; occurrences of
+		// "{{ .driverRoot }}" are replaced by driverRoot before comparing.
+		expectedSpec string
+	}{
+		{
+			description: "default",
+			options: options{
+				format:     "yaml",
+				mode:       "nvml",
+				vendor:     "example.com",
+				class:      "device",
+				driverRoot: driverRoot,
+			},
+			expectedOptions: options{
+				format:            "yaml",
+				mode:              "nvml",
+				vendor:            "example.com",
+				class:             "device",
+				nvidiaCDIHookPath: "/usr/bin/nvidia-cdi-hook",
+				driverRoot:        driverRoot,
+			},
+			expectedSpec: `---
+cdiVersion: 0.5.0
+containerEdits:
+  deviceNodes:
+  - hostPath: {{ .driverRoot }}/dev/nvidiactl
+    path: /dev/nvidiactl
+  env:
+  - NVIDIA_VISIBLE_DEVICES=void
+  hooks:
+  - args:
+    - nvidia-cdi-hook
+    - create-symlinks
+    - --link
+    - libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so
+    hookName: createContainer
+    path: /usr/bin/nvidia-cdi-hook
+  - args:
+    - nvidia-cdi-hook
+    - enable-cuda-compat
+    - --host-driver-version=999.88.77
+    hookName: createContainer
+    path: /usr/bin/nvidia-cdi-hook
+  - args:
+    - nvidia-cdi-hook
+    - update-ldcache
+    - --folder
+    - /lib/x86_64-linux-gnu
+    hookName: createContainer
+    path: /usr/bin/nvidia-cdi-hook
+  mounts:
+  - containerPath: /lib/x86_64-linux-gnu/libcuda.so.999.88.77
+    hostPath: {{ .driverRoot }}/lib/x86_64-linux-gnu/libcuda.so.999.88.77
+    options:
+    - ro
+    - nosuid
+    - nodev
+    - bind
+devices:
+- containerEdits:
+    deviceNodes:
+    - hostPath: {{ .driverRoot }}/dev/nvidia0
+      path: /dev/nvidia0
+  name: "0"
+- containerEdits:
+    deviceNodes:
+    - hostPath: {{ .driverRoot }}/dev/nvidia0
+      path: /dev/nvidia0
+  name: all
+kind: example.com/device
+`,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.description, func(t *testing.T) {
+			c := command{
+				logger: logger,
+			}
+
+			// validateFlags is compared against expectedOptions because it
+			// may update the options that are passed in.
+			err := c.validateFlags(nil, &tc.options)
+			require.ErrorIs(t, err, tc.expectedValidateError)
+			require.EqualValues(t, tc.expectedOptions, tc.options)
+
+			// Set up a mock server, reusing the DGX A100 mock.
+			server := dgxa100.New()
+			// Override the driver version to match the version in our mock filesystem.
+			server.SystemGetDriverVersionFunc = func() (string, nvml.Return) {
+				return "999.88.77", nvml.SUCCESS
+			}
+			// Set the device count to 1 explicitly since we only have a single device node.
+			server.DeviceGetCountFunc = func() (int, nvml.Return) {
+				return 1, nvml.SUCCESS
+			}
+			for _, d := range server.Devices {
+				// TODO: This is not implemented in the mock.
+				(d.(*dgxa100.Device)).GetMaxMigDeviceCountFunc = func() (int, nvml.Return) {
+					return 0, nvml.SUCCESS
+				}
+			}
+			// The mock is injected after validation so that it does not take
+			// part in the comparison with expectedOptions above.
+			tc.options.nvmllib = server
+
+			spec, err := c.generateSpec(&tc.options)
+			require.ErrorIs(t, err, tc.expectedError)
+
+			var buf bytes.Buffer
+			_, err = spec.WriteTo(&buf)
+			require.NoError(t, err)
+
+			require.Equal(t, strings.ReplaceAll(tc.expectedSpec, "{{ .driverRoot }}", driverRoot), buf.String())
+		})
+	}
+}
--- a/internal/config/features.go
+++ b/internal/config/features.go
@ -25,6 +25,12 @@ type features struct {
 	// If this feature flag is not set to 'true' only host-rooted config paths
 	// (i.e. paths starting with an '@' are considered valid)
 	AllowLDConfigFromContainer *feature `toml:"allow-ldconfig-from-container,omitempty"`
+	// DisableCUDACompatLibHook, when enabled skips the injection of a specific
+	// hook to process CUDA compatibility libraries.
+	//
+	// Note: Since this mechanism replaces the logic in the `nvidia-container-cli`,
+	// toggling this feature has no effect if `allow-cuda-compat-libs-from-container` is enabled.
+	DisableCUDACompatLibHook *feature `toml:"disable-cuda-compat-lib-hook,omitempty"`
 	// DisableImexChannelCreation ensures that the implicit creation of
 	// requested IMEX channels is skipped when invoking the nvidia-container-cli.
 	DisableImexChannelCreation *feature `toml:"disable-imex-channel-creation,omitempty"`
--- a/internal/discover/compat_libs.go
+++ b/internal/discover/compat_libs.go
@ -0,0 +1,24 @@
+package discover
+
+import (
+	"strings"
+
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
+)
+
+// NewCUDACompatHookDiscoverer creates a discoverer for an enable-cuda-compat hook.
+// The --host-driver-version argument is only passed to the hook if the CUDA
+// version pattern determined for the driver contains no '*' wildcard.
+func NewCUDACompatHookDiscoverer(logger logger.Interface, nvidiaCDIHookPath string, driver *root.Driver) Discover {
+	var hookArgs []string
+	if _, version := getCUDALibRootAndVersionPattern(logger, driver); !strings.Contains(version, "*") {
+		hookArgs = append(hookArgs, "--host-driver-version="+version)
+	}
+
+	return CreateNvidiaCDIHook(nvidiaCDIHookPath, "enable-cuda-compat", hookArgs...)
+}
--- a/internal/modifier/gated.go
+++ b/internal/modifier/gated.go
@ -23,6 +23,7 @@ import (
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
+	"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
 	"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
 )

@ -35,7 +36,7 @@ import (
 //	NVIDIA_GDRCOPY=enabled
 //
 // If not devices are selected, no changes are made.
-func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
+func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver) (oci.SpecModifier, error) {
 	if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 {
 		logger.Infof("No modification required; no devices requested")
 		return nil, nil
@ -78,5 +79,24 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
 		discoverers = append(discoverers, d)
 	}

+	if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
+		compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
+		discoverers = append(discoverers, compatLibHookDiscoverer)
+		// For legacy mode, we also need to inject a hook to update the LDCache
+		// after we have modified the configuration.
+		if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" {
+			ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
+				logger,
+				discover.None{},
+				cfg.NVIDIACTKConfig.Path,
+				"",
+			)
+			if err != nil {
+				return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
+			}
+			discoverers = append(discoverers, ldcacheUpdateHookDiscoverer)
+		}
+	}
+
 	return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
 }
--- a/internal/runtime/runtime_factory.go
+++ b/internal/runtime/runtime_factory.go
@ -75,6 +75,8 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
 	}

 	mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
+	// We update the mode here so that we can continue passing just the config to other functions.
+	cfg.NVIDIAContainerRuntimeConfig.Mode = mode
 	modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image)
 	if err != nil {
 		return nil, err
@ -94,7 +96,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
 			}
 			modifiers = append(modifiers, graphicsModifier)
 		case "feature-gated":
-			featureGatedModifier, err := modifier.NewFeatureGatedModifier(logger, cfg, image)
+			featureGatedModifier, err := modifier.NewFeatureGatedModifier(logger, cfg, image, driver)
 			if err != nil {
 				return nil, err
 			}
@ -126,8 +128,8 @@ func supportedModifierTypes(mode string) []string {
 		return []string{"nvidia-hook-remover", "mode"}
 	case "csv":
 		// For CSV mode we support mode and feature-gated modification.
-		return []string{"nvidia-hook-remover", "mode", "feature-gated"}
+		return []string{"nvidia-hook-remover", "feature-gated", "mode"}
 	default:
-		return []string{"mode", "graphics", "feature-gated"}
+		return []string{"feature-gated", "graphics", "mode"}
 	}
 }
--- a/pkg/nvcdi/driver-nvml.go
+++ b/pkg/nvcdi/driver-nvml.go
@ -97,6 +97,8 @@ func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nv
 		libraryPaths,
 	)

+	// TODO: The following should use the version directly.
+	cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, nvidiaCDIHookPath, driver)
 	updateLDCache, _ := discover.NewLDCacheUpdateHook(logger, libraries, nvidiaCDIHookPath, ldconfigPath)

 	d := discover.Merge(
@ -105,6 +107,7 @@ func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nv
 			version,
 			nvidiaCDIHookPath,
 		),
+		cudaCompatLibHookDiscoverer,
 		updateLDCache,
 	)

--- a/tests/e2e/nvidia-container-toolkit_test.go
+++ b/tests/e2e/nvidia-container-toolkit_test.go
@ -18,13 +18,15 @@ package e2e

 import (
 	"context"
+	"path/filepath"
+	"strconv"
+	"strings"

 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )

 // Integration tests for Docker runtime
-var _ = Describe("docker", Ordered, func() {
+var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
 	var r Runner

 	// Install the NVIDIA Container Toolkit
@ -166,4 +168,51 @@ var _ = Describe("docker", Ordered, func() {
 			Expect(referenceOutput).To(Equal(out4))
 		})
 	})
+
+	Describe("CUDA Forward compatibility", Ordered, func() {
+		BeforeAll(func(ctx context.Context) {
+			_, _, err := r.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
+			Expect(err).ToNot(HaveOccurred())
+		})
+
+		// Skip the forward compatibility tests unless the compat library in
+		// the image has a higher major version than the host driver.
+		BeforeAll(func(ctx context.Context) {
+			compatOutput, _, err := r.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(compatOutput).ToNot(BeEmpty())
+			compatDriverVersion := strings.TrimPrefix(filepath.Base(compatOutput), "libcuda.so.")
+			// Major versions are compared as integers: compared as strings,
+			// "99" would be ordered after "570".
+			compatMajor, err := strconv.Atoi(strings.SplitN(compatDriverVersion, ".", 2)[0])
+			Expect(err).ToNot(HaveOccurred())
+
+			driverOutput, _, err := r.Run("nvidia-smi -q | grep \"Driver Version\"")
+			Expect(err).ToNot(HaveOccurred())
+			parts := strings.SplitN(driverOutput, ":", 2)
+			Expect(parts).To(HaveLen(2))
+
+			hostDriverVersion := strings.TrimSpace(parts[1])
+			Expect(hostDriverVersion).ToNot(BeEmpty())
+			driverMajor, err := strconv.Atoi(strings.SplitN(hostDriverVersion, ".", 2)[0])
+			Expect(err).ToNot(HaveOccurred())
+
+			if driverMajor >= compatMajor {
+				GinkgoLogr.Info("CUDA Forward Compatibility tests require an older driver version", "hostDriverVersion", hostDriverVersion, "compatDriverVersion", compatDriverVersion)
+				Skip("CUDA Forward Compatibility tests require an older driver version")
+			}
+		})
+
+		It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
+			ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
+		})
+
+		It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
+			ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true  --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
+		})
+
+		It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
+			ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
+			Expect(err).ToNot(HaveOccurred())
+			Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
+		})
+	})
 })
--- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go
+++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go
@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package dgxa100
+
+import (
+	"fmt"
+	"sync"
+
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+	"github.com/NVIDIA/go-nvml/pkg/nvml/mock"
+	"github.com/google/uuid"
+)
+
+// Server is a mock NVML server that holds eight devices together with the
+// version values reported by its system-level query functions.
+type Server struct {
+	mock.Interface
+	mock.ExtendedInterface
+	// Devices holds the handles returned by the device lookup functions.
+	Devices [8]nvml.Device
+	// DriverVersion is the value returned by SystemGetDriverVersion.
+	DriverVersion string
+	// NvmlVersion is the value returned by SystemGetNVMLVersion.
+	NvmlVersion string
+	// CudaDriverVersion is the value returned by SystemGetCudaDriverVersion.
+	CudaDriverVersion int
+}
+// Device is a mock NVML device. Its exported fields supply the values that the
+// functions installed by setMockFuncs return; the embedded RWMutex is taken by
+// the functions that read or modify GpuInstances.
+type Device struct {
+	mock.Device
+	sync.RWMutex
+	UUID                  string
+	Name                  string
+	Brand                 nvml.BrandType
+	Architecture          nvml.DeviceArchitecture
+	PciBusID              string
+	Minor                 int
+	Index                 int
+	CudaComputeCapability CudaComputeCapability
+	MigMode               int
+	// GpuInstances is the set of GPU instances currently created on this device.
+	GpuInstances map[*GpuInstance]struct{}
+	// GpuInstanceCounter is the Id assigned to the next created GPU instance.
+	GpuInstanceCounter uint32
+	MemoryInfo         nvml.Memory
+}
+
+// GpuInstance is a mock NVML GPU instance. The embedded RWMutex is taken by the
+// functions that read or modify ComputeInstances.
+type GpuInstance struct {
+	mock.GpuInstance
+	sync.RWMutex
+	// Info is the value returned by GetInfo.
+	Info nvml.GpuInstanceInfo
+	// ComputeInstances is the set of compute instances currently created on this GPU instance.
+	ComputeInstances map[*ComputeInstance]struct{}
+	// ComputeInstanceCounter is the Id assigned to the next created compute instance.
+	ComputeInstanceCounter uint32
+}
+
+// ComputeInstance is a mock NVML compute instance.
+type ComputeInstance struct {
+	mock.ComputeInstance
+	// Info is the value returned by GetInfo.
+	Info nvml.ComputeInstanceInfo
+}
+
+// CudaComputeCapability holds the major and minor numbers that a Device
+// returns from GetCudaComputeCapability.
+type CudaComputeCapability struct {
+	Major int
+	Minor int
+}
+
+// Compile-time checks that the mock types satisfy the corresponding nvml interfaces.
+var _ nvml.Interface = (*Server)(nil)
+var _ nvml.Device = (*Device)(nil)
+var _ nvml.GpuInstance = (*GpuInstance)(nil)
+var _ nvml.ComputeInstance = (*ComputeInstance)(nil)
+
+// New returns a mock server with fixed driver, NVML and CUDA driver versions,
+// eight devices created with indices 0 through 7, and its mock functions installed.
+func New() *Server {
+	s := &Server{
+		DriverVersion:     "550.54.15",
+		NvmlVersion:       "12.550.54.15",
+		CudaDriverVersion: 12040,
+	}
+	// Populate the fixed-size device array in index order.
+	for idx := range s.Devices {
+		s.Devices[idx] = NewDevice(idx)
+	}
+	s.setMockFuncs()
+	return s
+}
+
+// NewDevice returns a mock A100 device for the given index.
+//
+// The index is used as the device's Index and Minor number and is formatted in
+// hexadecimal into the bus component of PciBusID. Each call generates a fresh
+// random UUID. The returned device has no GPU instances and has its mock
+// functions installed.
+func NewDevice(index int) *Device {
+	device := &Device{
+		UUID:         "GPU-" + uuid.New().String(),
+		Name:         "Mock NVIDIA A100-SXM4-40GB",
+		Brand:        nvml.BRAND_NVIDIA,
+		Architecture: nvml.DEVICE_ARCH_AMPERE,
+		PciBusID:     fmt.Sprintf("0000:%02x:00.0", index),
+		Minor:        index,
+		Index:        index,
+		CudaComputeCapability: CudaComputeCapability{
+			Major: 8,
+			Minor: 0,
+		},
+		GpuInstances:       make(map[*GpuInstance]struct{}),
+		GpuInstanceCounter: 0,
+		// Keyed fields keep the literal correct if nvml.Memory gains or reorders
+		// fields. 42949672960 bytes is 40 GiB.
+		MemoryInfo: nvml.Memory{
+			Total: 42949672960,
+			Free:  0,
+			Used:  0,
+		},
+	}
+	device.setMockFuncs()
+	return device
+}
+
+// NewGpuInstance returns a mock GPU instance that reports info from GetInfo,
+// starts with no compute instances, and has its mock functions installed.
+func NewGpuInstance(info nvml.GpuInstanceInfo) *GpuInstance {
+	instance := new(GpuInstance)
+	instance.Info = info
+	instance.ComputeInstances = map[*ComputeInstance]struct{}{}
+	instance.setMockFuncs()
+	return instance
+}
+
+// NewComputeInstance returns a mock compute instance that reports info from
+// GetInfo and has its mock functions installed.
+func NewComputeInstance(info nvml.ComputeInstanceInfo) *ComputeInstance {
+	instance := new(ComputeInstance)
+	instance.Info = info
+	instance.setMockFuncs()
+	return instance
+}
+
+// setMockFuncs installs the server-level mock functions. Every function reads
+// the server's fields at call time, so later changes to those fields are
+// reflected in subsequent calls.
+func (s *Server) setMockFuncs() {
+	s.ExtensionsFunc = func() nvml.ExtendedInterface {
+		return s
+	}
+
+	s.LookupSymbolFunc = func(symbol string) error {
+		return nil
+	}
+
+	s.InitFunc = func() nvml.Return {
+		return nvml.SUCCESS
+	}
+
+	s.ShutdownFunc = func() nvml.Return {
+		return nvml.SUCCESS
+	}
+
+	s.SystemGetDriverVersionFunc = func() (string, nvml.Return) {
+		return s.DriverVersion, nvml.SUCCESS
+	}
+
+	s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) {
+		return s.NvmlVersion, nvml.SUCCESS
+	}
+
+	s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) {
+		return s.CudaDriverVersion, nvml.SUCCESS
+	}
+
+	s.DeviceGetCountFunc = func() (int, nvml.Return) {
+		return len(s.Devices), nvml.SUCCESS
+	}
+
+	s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) {
+		if index < 0 || index >= len(s.Devices) {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		return s.Devices[index], nvml.SUCCESS
+	}
+
+	// Devices is an exported array of interface values, so an entry may be nil or
+	// another nvml.Device implementation. The checked type assertion skips such
+	// entries instead of panicking.
+	s.DeviceGetHandleByUUIDFunc = func(id string) (nvml.Device, nvml.Return) {
+		for _, d := range s.Devices {
+			if dev, ok := d.(*Device); ok && dev.UUID == id {
+				return d, nvml.SUCCESS
+			}
+		}
+		return nil, nvml.ERROR_INVALID_ARGUMENT
+	}
+
+	s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) {
+		for _, d := range s.Devices {
+			if dev, ok := d.(*Device); ok && dev.PciBusID == busID {
+				return d, nvml.SUCCESS
+			}
+		}
+		return nil, nvml.ERROR_INVALID_ARGUMENT
+	}
+}
+
+// setMockFuncs installs the device-level mock functions. The simple getters
+// return the device's fields as they are at call time. The functions that
+// create or list GPU instances take the device's lock while touching
+// GpuInstances. Functions that receive pointer arguments return
+// ERROR_INVALID_ARGUMENT for nil pointers instead of dereferencing them.
+func (d *Device) setMockFuncs() {
+	d.GetMinorNumberFunc = func() (int, nvml.Return) {
+		return d.Minor, nvml.SUCCESS
+	}
+
+	d.GetIndexFunc = func() (int, nvml.Return) {
+		return d.Index, nvml.SUCCESS
+	}
+
+	d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) {
+		return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS
+	}
+
+	d.GetUUIDFunc = func() (string, nvml.Return) {
+		return d.UUID, nvml.SUCCESS
+	}
+
+	d.GetNameFunc = func() (string, nvml.Return) {
+		return d.Name, nvml.SUCCESS
+	}
+
+	d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) {
+		return d.Brand, nvml.SUCCESS
+	}
+
+	d.GetArchitectureFunc = func() (nvml.DeviceArchitecture, nvml.Return) {
+		return d.Architecture, nvml.SUCCESS
+	}
+
+	d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) {
+		return d.MemoryInfo, nvml.SUCCESS
+	}
+
+	d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) {
+		p := nvml.PciInfo{
+			PciDeviceId: 0x20B010DE,
+		}
+		return p, nvml.SUCCESS
+	}
+
+	d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) {
+		d.MigMode = mode
+		return nvml.SUCCESS, nvml.SUCCESS
+	}
+
+	// The stored mode is returned for both integer results.
+	d.GetMigModeFunc = func() (int, int, nvml.Return) {
+		return d.MigMode, d.MigMode, nvml.SUCCESS
+	}
+
+	d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) {
+		if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT {
+			return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT
+		}
+
+		// In-range profile IDs that have no table entry are reported as unsupported.
+		profile, exists := MIGProfiles.GpuInstanceProfiles[giProfileId]
+		if !exists {
+			return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
+		}
+		return profile, nvml.SUCCESS
+	}
+
+	d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) {
+		if info == nil {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS
+	}
+
+	d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) {
+		if info == nil {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		d.Lock()
+		defer d.Unlock()
+		giInfo := nvml.GpuInstanceInfo{
+			Device:    d,
+			Id:        d.GpuInstanceCounter,
+			ProfileId: info.Id,
+		}
+		d.GpuInstanceCounter++
+		gi := NewGpuInstance(giInfo)
+		d.GpuInstances[gi] = struct{}{}
+		return gi, nvml.SUCCESS
+	}
+
+	d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) {
+		if info == nil || placement == nil {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		d.Lock()
+		defer d.Unlock()
+		giInfo := nvml.GpuInstanceInfo{
+			Device:    d,
+			Id:        d.GpuInstanceCounter,
+			ProfileId: info.Id,
+			Placement: *placement,
+		}
+		d.GpuInstanceCounter++
+		gi := NewGpuInstance(giInfo)
+		d.GpuInstances[gi] = struct{}{}
+		return gi, nvml.SUCCESS
+	}
+
+	// Only instances whose profile ID matches info.Id are returned; map
+	// iteration makes the order of the result unspecified.
+	d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) {
+		if info == nil {
+			return nil, nvml.ERROR_INVALID_ARGUMENT
+		}
+		d.RLock()
+		defer d.RUnlock()
+		var gis []nvml.GpuInstance
+		for gi := range d.GpuInstances {
+			if gi.Info.ProfileId == info.Id {
+				gis = append(gis, gi)
+			}
+		}
+		return gis, nvml.SUCCESS
+	}
+}
+
+// setMockFuncs installs the GPU-instance-level mock functions. Profile and
+// placement lookups are keyed by the profile ID of this GPU instance
+// (gi.Info.ProfileId). The functions that create or list compute instances
+// take the GPU instance's lock while touching ComputeInstances.
+func (gi *GpuInstance) setMockFuncs() {
+	gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) {
+		return gi.Info, nvml.SUCCESS
+	}
+
+	gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) {
+		if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT {
+			return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT
+		}
+
+		if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED {
+			return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
+		}
+
+		giProfileId := int(gi.Info.ProfileId)
+
+		// Indexing a missing (nil) inner map yields exists == false, so a single
+		// lookup covers both an unknown GPU instance profile and an unknown
+		// compute instance profile.
+		profile, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]
+		if !exists {
+			return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
+		}
+		return profile, nvml.SUCCESS
+	}
+
+	gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) {
+		// The placements table is keyed by GPU instance profile ID, not by the
+		// per-device instance ID stored in gi.Info.Id.
+		giProfileId := int(gi.Info.ProfileId)
+		return MIGPlacements.ComputeInstancePossiblePlacements[giProfileId][int(info.Id)], nvml.SUCCESS
+	}
+
+	gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) {
+		gi.Lock()
+		defer gi.Unlock()
+		ciInfo := nvml.ComputeInstanceInfo{
+			Device:      gi.Info.Device,
+			GpuInstance: gi,
+			Id:          gi.ComputeInstanceCounter,
+			ProfileId:   info.Id,
+		}
+		gi.ComputeInstanceCounter++
+		ci := NewComputeInstance(ciInfo)
+		gi.ComputeInstances[ci] = struct{}{}
+		return ci, nvml.SUCCESS
+	}
+
+	// Only instances whose profile ID matches info.Id are returned; map
+	// iteration makes the order of the result unspecified.
+	gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) {
+		gi.RLock()
+		defer gi.RUnlock()
+		var cis []nvml.ComputeInstance
+		for ci := range gi.ComputeInstances {
+			if ci.Info.ProfileId == info.Id {
+				cis = append(cis, ci)
+			}
+		}
+		return cis, nvml.SUCCESS
+	}
+
+	// Destroy removes this GPU instance from its parent device's set under the
+	// device's lock.
+	gi.DestroyFunc = func() nvml.Return {
+		d := gi.Info.Device.(*Device)
+		d.Lock()
+		defer d.Unlock()
+		delete(d.GpuInstances, gi)
+		return nvml.SUCCESS
+	}
+}
+
+// setMockFuncs installs the compute-instance-level mock functions.
+func (ci *ComputeInstance) setMockFuncs() {
+	// Destroy removes this compute instance from its parent GPU instance's set
+	// while holding the parent's lock.
+	ci.DestroyFunc = func() nvml.Return {
+		parent := ci.Info.GpuInstance.(*GpuInstance)
+		parent.Lock()
+		defer parent.Unlock()
+		delete(parent.ComputeInstances, ci)
+		return nvml.SUCCESS
+	}
+
+	// GetInfo reports the Info value as it is at call time.
+	ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) {
+		return ci.Info, nvml.SUCCESS
+	}
+}
--- a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go
+++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go
@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package dgxa100
+
+import (
+	"github.com/NVIDIA/go-nvml/pkg/nvml"
+)
+
+// MIGProfiles holds the profile information for GIs (GPU instances) and CIs
+// (compute instances) in this mock server.
+//
+// GpuInstanceProfiles is keyed by GPU instance profile ID.
+// ComputeInstanceProfiles is keyed first by GPU instance profile ID and then by
+// compute instance profile ID.
+//
+// We should consider auto-generating this object in the future.
+var MIGProfiles = struct {
+	GpuInstanceProfiles     map[int]nvml.GpuInstanceProfileInfo
+	ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo
+}{
+	// One entry per GPU instance profile known to this mock.
+	GpuInstanceProfiles: map[int]nvml.GpuInstanceProfileInfo{
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+			Id:                  nvml.GPU_INSTANCE_PROFILE_1_SLICE,
+			IsP2pSupported:      0,
+			SliceCount:          1,
+			InstanceCount:       7,
+			MultiprocessorCount: 14,
+			CopyEngineCount:     1,
+			DecoderCount:        0,
+			EncoderCount:        0,
+			JpegCount:           0,
+			OfaCount:            0,
+			MemorySizeMB:        4864,
+		},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+			Id:                  nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
+			IsP2pSupported:      0,
+			SliceCount:          1,
+			InstanceCount:       1,
+			MultiprocessorCount: 14,
+			CopyEngineCount:     1,
+			DecoderCount:        1,
+			EncoderCount:        0,
+			JpegCount:           1,
+			OfaCount:            1,
+			MemorySizeMB:        4864,
+		},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
+			Id:                  nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2,
+			IsP2pSupported:      0,
+			SliceCount:          1,
+			InstanceCount:       4,
+			MultiprocessorCount: 14,
+			CopyEngineCount:     1,
+			DecoderCount:        1,
+			EncoderCount:        0,
+			JpegCount:           0,
+			OfaCount:            0,
+			MemorySizeMB:        9856,
+		},
+		nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+			Id:                  nvml.GPU_INSTANCE_PROFILE_2_SLICE,
+			IsP2pSupported:      0,
+			SliceCount:          2,
+			InstanceCount:       3,
+			MultiprocessorCount: 28,
+			CopyEngineCount:     2,
+			DecoderCount:        1,
+			EncoderCount:        0,
+			JpegCount:           0,
+			OfaCount:            0,
+			MemorySizeMB:        9856,
+		},
+		nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
+			Id:                  nvml.GPU_INSTANCE_PROFILE_3_SLICE,
+			IsP2pSupported:      0,
+			SliceCount:          3,
+			InstanceCount:       2,
+			MultiprocessorCount: 42,
+			CopyEngineCount:     3,
+			DecoderCount:        2,
+			EncoderCount:        0,
+			JpegCount:           0,
+			OfaCount:            0,
+			MemorySizeMB:        19968,
+		},
+		nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+			Id:                  nvml.GPU_INSTANCE_PROFILE_4_SLICE,
+			IsP2pSupported:      0,
+			SliceCount:          4,
+			InstanceCount:       1,
+			MultiprocessorCount: 56,
+			CopyEngineCount:     4,
+			DecoderCount:        2,
+			EncoderCount:        0,
+			JpegCount:           0,
+			OfaCount:            0,
+			MemorySizeMB:        19968,
+		},
+		nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
+			Id:                  nvml.GPU_INSTANCE_PROFILE_7_SLICE,
+			IsP2pSupported:      0,
+			SliceCount:          7,
+			InstanceCount:       1,
+			MultiprocessorCount: 98,
+			CopyEngineCount:     7,
+			DecoderCount:        5,
+			EncoderCount:        0,
+			JpegCount:           1,
+			OfaCount:            1,
+			MemorySizeMB:        40192,
+		},
+	},
+	// For each GPU instance profile, the compute instance profiles that can be
+	// queried on a GPU instance of that profile.
+	ComputeInstanceProfiles: map[int]map[int]nvml.ComputeInstanceProfileInfo{
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+				SliceCount:            1,
+				InstanceCount:         1,
+				MultiprocessorCount:   14,
+				SharedCopyEngineCount: 1,
+				SharedDecoderCount:    0,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       0,
+				SharedOfaCount:        0,
+			},
+		},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+				SliceCount:            1,
+				InstanceCount:         1,
+				MultiprocessorCount:   14,
+				SharedCopyEngineCount: 1,
+				SharedDecoderCount:    1,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       1,
+				SharedOfaCount:        1,
+			},
+		},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+				SliceCount:            1,
+				InstanceCount:         1,
+				MultiprocessorCount:   14,
+				SharedCopyEngineCount: 1,
+				SharedDecoderCount:    1,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       0,
+				SharedOfaCount:        0,
+			},
+		},
+		nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+				SliceCount:            1,
+				InstanceCount:         2,
+				MultiprocessorCount:   14,
+				SharedCopyEngineCount: 2,
+				SharedDecoderCount:    1,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       0,
+				SharedOfaCount:        0,
+			},
+			nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE,
+				SliceCount:            2,
+				InstanceCount:         1,
+				MultiprocessorCount:   28,
+				SharedCopyEngineCount: 2,
+				SharedDecoderCount:    1,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       0,
+				SharedOfaCount:        0,
+			},
+		},
+		nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+				SliceCount:            1,
+				InstanceCount:         3,
+				MultiprocessorCount:   14,
+				SharedCopyEngineCount: 3,
+				SharedDecoderCount:    2,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       0,
+				SharedOfaCount:        0,
+			},
+			nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE,
+				SliceCount:            2,
+				InstanceCount:         1,
+				MultiprocessorCount:   28,
+				SharedCopyEngineCount: 3,
+				SharedDecoderCount:    2,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       0,
+				SharedOfaCount:        0,
+			},
+			nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE,
+				SliceCount:            3,
+				InstanceCount:         1,
+				MultiprocessorCount:   42,
+				SharedCopyEngineCount: 3,
+				SharedDecoderCount:    2,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       0,
+				SharedOfaCount:        0,
+			},
+		},
+		nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+				SliceCount:            1,
+				InstanceCount:         4,
+				MultiprocessorCount:   14,
+				SharedCopyEngineCount: 4,
+				SharedDecoderCount:    2,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       0,
+				SharedOfaCount:        0,
+			},
+			nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE,
+				SliceCount:            2,
+				InstanceCount:         2,
+				MultiprocessorCount:   28,
+				SharedCopyEngineCount: 4,
+				SharedDecoderCount:    2,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       0,
+				SharedOfaCount:        0,
+			},
+			nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE,
+				SliceCount:            4,
+				InstanceCount:         1,
+				MultiprocessorCount:   56,
+				SharedCopyEngineCount: 4,
+				SharedDecoderCount:    2,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       0,
+				SharedOfaCount:        0,
+			},
+		},
+		nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
+				SliceCount:            1,
+				InstanceCount:         7,
+				MultiprocessorCount:   14,
+				SharedCopyEngineCount: 7,
+				SharedDecoderCount:    5,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       1,
+				SharedOfaCount:        1,
+			},
+			nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE,
+				SliceCount:            2,
+				InstanceCount:         3,
+				MultiprocessorCount:   28,
+				SharedCopyEngineCount: 7,
+				SharedDecoderCount:    5,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       1,
+				SharedOfaCount:        1,
+			},
+			nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE,
+				SliceCount:            3,
+				InstanceCount:         2,
+				MultiprocessorCount:   42,
+				SharedCopyEngineCount: 7,
+				SharedDecoderCount:    5,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       1,
+				SharedOfaCount:        1,
+			},
+			nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE,
+				SliceCount:            4,
+				InstanceCount:         1,
+				MultiprocessorCount:   56,
+				SharedCopyEngineCount: 7,
+				SharedDecoderCount:    5,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       1,
+				SharedOfaCount:        1,
+			},
+			nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {
+				Id:                    nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE,
+				SliceCount:            7,
+				InstanceCount:         1,
+				MultiprocessorCount:   98,
+				SharedCopyEngineCount: 7,
+				SharedDecoderCount:    5,
+				SharedEncoderCount:    0,
+				SharedJpegCount:       1,
+				SharedOfaCount:        1,
+			},
+		},
+	},
+}
+
+// MIGPlacements holds the placement information for GIs (GPU instances) and
+// CIs (compute instances) in this mock server.
+//
+// GpuInstancePossiblePlacements is keyed by GPU instance profile ID.
+// ComputeInstancePossiblePlacements is keyed first by GPU instance profile ID
+// and then by compute instance profile ID.
+//
+// We should consider auto-generating this object in the future.
+var MIGPlacements = struct {
+	GpuInstancePossiblePlacements     map[int][]nvml.GpuInstancePlacement
+	ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement
+}{
+	GpuInstancePossiblePlacements: map[int][]nvml.GpuInstancePlacement{
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+			{Start: 0, Size: 1},
+			{Start: 1, Size: 1},
+			{Start: 2, Size: 1},
+			{Start: 3, Size: 1},
+			{Start: 4, Size: 1},
+			{Start: 5, Size: 1},
+			{Start: 6, Size: 1},
+		},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+			{Start: 0, Size: 1},
+			{Start: 1, Size: 1},
+			{Start: 2, Size: 1},
+			{Start: 3, Size: 1},
+			{Start: 4, Size: 1},
+			{Start: 5, Size: 1},
+			{Start: 6, Size: 1},
+		},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
+			{Start: 0, Size: 2},
+			{Start: 2, Size: 2},
+			{Start: 4, Size: 2},
+			{Start: 6, Size: 2},
+		},
+		nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+			{Start: 0, Size: 2},
+			{Start: 2, Size: 2},
+			{Start: 4, Size: 2},
+		},
+		nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
+			{Start: 0, Size: 4},
+			{Start: 4, Size: 4},
+		},
+		nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+			{Start: 0, Size: 4},
+		},
+		nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
+			{Start: 0, Size: 8},
+		},
+	},
+	// TODO: Fill out ComputeInstancePossiblePlacements
+	// Every combination below is present but currently has no placements.
+	ComputeInstancePossiblePlacements: map[int]map[int][]nvml.ComputeInstancePlacement{
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
+		},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
+		},
+		nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
+		},
+		nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
+			nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {},
+		},
+		nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
+			nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {},
+			nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {},
+		},
+		nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
+			nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {},
+			nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {},
+		},
+		nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
+			nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
+			nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {},
+			nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {},
+			nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {},
+			nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {},
+		},
+	},
+}
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@ -11,6 +11,7 @@ github.com/NVIDIA/go-nvlib/pkg/pciids
 github.com/NVIDIA/go-nvml/pkg/dl
 github.com/NVIDIA/go-nvml/pkg/nvml
 github.com/NVIDIA/go-nvml/pkg/nvml/mock
+github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100
 # github.com/cpuguy83/go-md2man/v2 v2.0.5
 ## explicit; go 1.11
 github.com/cpuguy83/go-md2man/v2/md2man