mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-04-20 22:25:22 +00:00
Merge pull request #906 from elezar/add-compat-lib-hook
Add CUDA forward compatibility hook
This commit is contained in:
commit
968e2ccca4
@ -21,6 +21,7 @@ import (
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/chmod"
|
||||
symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-symlinks"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/cudacompat"
|
||||
ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/update-ldcache"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
)
|
||||
@ -32,5 +33,6 @@ func New(logger logger.Interface) []*cli.Command {
|
||||
ldcache.NewCommand(logger),
|
||||
symlinks.NewCommand(logger),
|
||||
chmod.NewCommand(logger),
|
||||
cudacompat.NewCommand(logger),
|
||||
}
|
||||
}
|
||||
|
76
cmd/nvidia-cdi-hook/cudacompat/container-root.go
Normal file
76
cmd/nvidia-cdi-hook/cudacompat/container-root.go
Normal file
@ -0,0 +1,76 @@
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package cudacompat
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/moby/sys/symlink"
|
||||
)
|
||||
|
||||
// A containerRoot represents the root filesystem of a container.
|
||||
type containerRoot string
|
||||
|
||||
// hasPath checks whether the specified path exists in the root.
|
||||
func (r containerRoot) hasPath(path string) bool {
|
||||
resolved, err := r.resolve(path)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
if _, err := os.Stat(resolved); err != nil && os.IsNotExist(err) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// globFiles matches the specified pattern in the root.
|
||||
// The files that match must be regular files.
|
||||
func (r containerRoot) globFiles(pattern string) ([]string, error) {
|
||||
patternPath, err := r.resolve(pattern)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
matches, err := filepath.Glob(patternPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var files []string
|
||||
for _, match := range matches {
|
||||
info, err := os.Lstat(match)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// Ignore symlinks.
|
||||
if info.Mode()&os.ModeSymlink != 0 {
|
||||
continue
|
||||
}
|
||||
// Ignore directories.
|
||||
if info.IsDir() {
|
||||
continue
|
||||
}
|
||||
files = append(files, match)
|
||||
}
|
||||
return files, nil
|
||||
}
|
||||
|
||||
// resolve returns the absolute path including root path.
|
||||
// Symlinks are resolved, but are guaranteed to resolve in the root.
|
||||
func (r containerRoot) resolve(path string) (string, error) {
|
||||
absolute := filepath.Clean(filepath.Join(string(r), path))
|
||||
return symlink.FollowSymlinkInScope(absolute, string(r))
|
||||
}
|
221
cmd/nvidia-cdi-hook/cudacompat/cudacompat.go
Normal file
221
cmd/nvidia-cdi-hook/cudacompat/cudacompat.go
Normal file
@ -0,0 +1,221 @@
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package cudacompat
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/urfave/cli/v2"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
)
|
||||
|
||||
const (
	// cudaCompatPath is the directory in the container where CUDA forward
	// compatibility libraries are expected to be installed.
	cudaCompatPath = "/usr/local/cuda/compat"
	// cudaCompatLdsoconfdFilenamePattern is the filename pattern for the
	// drop-in file created in ld.so.conf.d that references the CUDA compat
	// path. The 00-compat prefix sorts these entries ahead of other
	// libraries on the system so that they take precedence.
	cudaCompatLdsoconfdFilenamePattern = "00-compat-*.conf"
)
|
||||
|
||||
type command struct {
|
||||
logger logger.Interface
|
||||
}
|
||||
|
||||
type options struct {
|
||||
hostDriverVersion string
|
||||
containerSpec string
|
||||
}
|
||||
|
||||
// NewCommand constructs a cuda-compat command with the specified logger
|
||||
func NewCommand(logger logger.Interface) *cli.Command {
|
||||
c := command{
|
||||
logger: logger,
|
||||
}
|
||||
return c.build()
|
||||
}
|
||||
|
||||
// build the enable-cuda-compat command
|
||||
func (m command) build() *cli.Command {
|
||||
cfg := options{}
|
||||
|
||||
// Create the 'enable-cuda-compat' command
|
||||
c := cli.Command{
|
||||
Name: "enable-cuda-compat",
|
||||
Usage: "This hook ensures that the folder containing the CUDA compat libraries is added to the ldconfig search path if required.",
|
||||
Before: func(c *cli.Context) error {
|
||||
return m.validateFlags(c, &cfg)
|
||||
},
|
||||
Action: func(c *cli.Context) error {
|
||||
return m.run(c, &cfg)
|
||||
},
|
||||
}
|
||||
|
||||
c.Flags = []cli.Flag{
|
||||
&cli.StringFlag{
|
||||
Name: "host-driver-version",
|
||||
Usage: "Specify the host driver version. If the CUDA compat libraries detected in the container do not have a higher MAJOR version, the hook is a no-op.",
|
||||
Destination: &cfg.hostDriverVersion,
|
||||
},
|
||||
&cli.StringFlag{
|
||||
Name: "container-spec",
|
||||
Hidden: true,
|
||||
Category: "testing-only",
|
||||
Usage: "Specify the path to the OCI container spec. If empty or '-' the spec will be read from STDIN",
|
||||
Destination: &cfg.containerSpec,
|
||||
},
|
||||
}
|
||||
|
||||
return &c
|
||||
}
|
||||
|
||||
func (m command) validateFlags(_ *cli.Context, cfg *options) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m command) run(_ *cli.Context, cfg *options) error {
|
||||
if cfg.hostDriverVersion == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
s, err := oci.LoadContainerState(cfg.containerSpec)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to load container state: %w", err)
|
||||
}
|
||||
|
||||
containerRootDir, err := s.GetContainerRoot()
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to determined container root: %w", err)
|
||||
}
|
||||
|
||||
containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.hostDriverVersion)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to get container forward compat directory: %w", err)
|
||||
}
|
||||
if containerForwardCompatDir == "" {
|
||||
return nil
|
||||
}
|
||||
|
||||
return m.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir)
|
||||
}
|
||||
|
||||
func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostDriverVersion string) (string, error) {
|
||||
if hostDriverVersion == "" {
|
||||
m.logger.Debugf("Host driver version not specified")
|
||||
return "", nil
|
||||
}
|
||||
if !containerRoot.hasPath(cudaCompatPath) {
|
||||
m.logger.Debugf("No CUDA forward compatibility libraries directory in container")
|
||||
return "", nil
|
||||
}
|
||||
if !containerRoot.hasPath("/etc/ld.so.cache") {
|
||||
m.logger.Debugf("The container does not have an LDCache")
|
||||
return "", nil
|
||||
}
|
||||
|
||||
libs, err := containerRoot.globFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*"))
|
||||
if err != nil {
|
||||
m.logger.Warningf("Failed to find CUDA compat library: %w", err)
|
||||
return "", nil
|
||||
}
|
||||
|
||||
if len(libs) == 0 {
|
||||
m.logger.Debugf("No CUDA forward compatibility libraries container")
|
||||
return "", nil
|
||||
}
|
||||
|
||||
if len(libs) != 1 {
|
||||
m.logger.Warningf("Unexpected number of CUDA compat libraries in container: %v", libs)
|
||||
return "", nil
|
||||
}
|
||||
|
||||
compatDriverVersion := strings.TrimPrefix(filepath.Base(libs[0]), "libcuda.so.")
|
||||
compatMajor, err := extractMajorVersion(compatDriverVersion)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to extract major version from %q: %v", compatDriverVersion, err)
|
||||
}
|
||||
|
||||
driverMajor, err := extractMajorVersion(hostDriverVersion)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to extract major version from %q: %v", hostDriverVersion, err)
|
||||
}
|
||||
|
||||
if driverMajor >= compatMajor {
|
||||
m.logger.Debugf("Compat major version is not greater than the host driver major version (%v >= %v)", hostDriverVersion, compatDriverVersion)
|
||||
return "", nil
|
||||
}
|
||||
|
||||
resolvedCompatDir := strings.TrimPrefix(filepath.Dir(libs[0]), string(containerRoot))
|
||||
return resolvedCompatDir, nil
|
||||
}
|
||||
|
||||
// createLdsoconfdFile creates a file at /etc/ld.so.conf.d/ in the specified root.
|
||||
// The file is created at /etc/ld.so.conf.d/{{ .pattern }} using `CreateTemp` and
|
||||
// contains the specified directories on each line.
|
||||
func (m command) createLdsoconfdFile(in containerRoot, pattern string, dirs ...string) error {
|
||||
if len(dirs) == 0 {
|
||||
m.logger.Debugf("No directories to add to /etc/ld.so.conf")
|
||||
return nil
|
||||
}
|
||||
|
||||
ldsoconfdDir, err := in.resolve("/etc/ld.so.conf.d")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.MkdirAll(ldsoconfdDir, 0755); err != nil {
|
||||
return fmt.Errorf("failed to create ld.so.conf.d: %w", err)
|
||||
}
|
||||
|
||||
configFile, err := os.CreateTemp(ldsoconfdDir, pattern)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create config file: %w", err)
|
||||
}
|
||||
defer configFile.Close()
|
||||
|
||||
m.logger.Debugf("Adding directories %v to %v", dirs, configFile.Name())
|
||||
|
||||
added := make(map[string]bool)
|
||||
for _, dir := range dirs {
|
||||
if added[dir] {
|
||||
continue
|
||||
}
|
||||
_, err = configFile.WriteString(fmt.Sprintf("%s\n", dir))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to update config file: %w", err)
|
||||
}
|
||||
added[dir] = true
|
||||
}
|
||||
|
||||
// The created file needs to be world readable for the cases where the container is run as a non-root user.
|
||||
if err := configFile.Chmod(0644); err != nil {
|
||||
return fmt.Errorf("failed to chmod config file: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// extractMajorVersion parses a version string of the form MAJOR[.MINOR[...]]
// and returns the major component as an int. An error is returned when the
// leading component is not a valid integer.
func extractMajorVersion(version string) (int, error) {
	major, _, _ := strings.Cut(version, ".")
	return strconv.Atoi(major)
}
|
182
cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go
Normal file
182
cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go
Normal file
@ -0,0 +1,182 @@
|
||||
/*
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package cudacompat
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
testlog "github.com/sirupsen/logrus/hooks/test"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestCompatLibs(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
|
||||
testCases := []struct {
|
||||
description string
|
||||
contents map[string]string
|
||||
hostDriverVersion string
|
||||
expectedContainerForwardCompatDir string
|
||||
}{
|
||||
{
|
||||
description: "empty root",
|
||||
hostDriverVersion: "222.55.66",
|
||||
},
|
||||
{
|
||||
description: "compat lib is newer; no ldcache",
|
||||
contents: map[string]string{
|
||||
"/usr/local/cuda/compat/libcuda.so.333.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
},
|
||||
{
|
||||
description: "compat lib is newer; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.333.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "/usr/local/cuda/compat",
|
||||
},
|
||||
{
|
||||
description: "compat lib is older; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.111.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "",
|
||||
},
|
||||
{
|
||||
description: "compat lib has same major version; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "",
|
||||
},
|
||||
{
|
||||
description: "numeric comparison is used; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "99.55.66",
|
||||
expectedContainerForwardCompatDir: "/usr/local/cuda/compat",
|
||||
},
|
||||
{
|
||||
description: "driver version empty; ldcache",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/usr/local/cuda/compat/libcuda.so.222.88.99": "",
|
||||
},
|
||||
hostDriverVersion: "",
|
||||
},
|
||||
{
|
||||
description: "symlinks are followed",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/etc/alternatives/cuda/compat/libcuda.so.333.88.99": "",
|
||||
"/usr/local/cuda": "symlink=/etc/alternatives/cuda",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "/etc/alternatives/cuda/compat",
|
||||
},
|
||||
{
|
||||
description: "symlinks stay in container",
|
||||
contents: map[string]string{
|
||||
"/etc/ld.so.cache": "",
|
||||
"/compat/libcuda.so.333.88.99": "",
|
||||
"/usr/local/cuda": "symlink=../../../../../../",
|
||||
},
|
||||
hostDriverVersion: "222.55.66",
|
||||
expectedContainerForwardCompatDir: "/compat",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
containerRootDir := t.TempDir()
|
||||
for name, contents := range tc.contents {
|
||||
target := filepath.Join(containerRootDir, name)
|
||||
require.NoError(t, os.MkdirAll(filepath.Dir(target), 0755))
|
||||
|
||||
if strings.HasPrefix(contents, "symlink=") {
|
||||
require.NoError(t, os.Symlink(strings.TrimPrefix(contents, "symlink="), target))
|
||||
continue
|
||||
}
|
||||
|
||||
require.NoError(t, os.WriteFile(target, []byte(contents), 0600))
|
||||
}
|
||||
|
||||
c := command{
|
||||
logger: logger,
|
||||
}
|
||||
containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), tc.hostDriverVersion)
|
||||
require.NoError(t, err)
|
||||
require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestUpdateLdconfig(t *testing.T) {
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
testCases := []struct {
|
||||
description string
|
||||
folders []string
|
||||
expectedContents string
|
||||
}{
|
||||
{
|
||||
description: "no folders; have no contents",
|
||||
},
|
||||
{
|
||||
description: "single folder is added",
|
||||
folders: []string{"/usr/local/cuda/compat"},
|
||||
expectedContents: "/usr/local/cuda/compat\n",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
containerRootDir := t.TempDir()
|
||||
c := command{
|
||||
logger: logger,
|
||||
}
|
||||
err := c.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, tc.folders...)
|
||||
require.NoError(t, err)
|
||||
|
||||
matches, err := filepath.Glob(filepath.Join(containerRootDir, "/etc/ld.so.conf.d/00-compat-*.conf"))
|
||||
require.NoError(t, err)
|
||||
|
||||
if tc.expectedContents == "" {
|
||||
require.Empty(t, matches)
|
||||
return
|
||||
}
|
||||
|
||||
require.Len(t, matches, 1)
|
||||
contents, err := os.ReadFile(matches[0])
|
||||
require.NoError(t, err)
|
||||
|
||||
require.EqualValues(t, tc.expectedContents, string(contents))
|
||||
})
|
||||
}
|
||||
|
||||
}
|
@ -80,6 +80,12 @@ containerEdits:
|
||||
- libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so
|
||||
hookName: createContainer
|
||||
path: {{ .toolkitRoot }}/nvidia-cdi-hook
|
||||
- args:
|
||||
- nvidia-cdi-hook
|
||||
- enable-cuda-compat
|
||||
- --host-driver-version=999.88.77
|
||||
hookName: createContainer
|
||||
path: {{ .toolkitRoot }}/nvidia-cdi-hook
|
||||
- args:
|
||||
- nvidia-cdi-hook
|
||||
- update-ldcache
|
||||
|
@ -25,6 +25,8 @@ import (
|
||||
"github.com/urfave/cli/v2"
|
||||
cdi "tags.cncf.io/container-device-interface/pkg/parser"
|
||||
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv"
|
||||
@ -60,6 +62,9 @@ type options struct {
|
||||
files cli.StringSlice
|
||||
ignorePatterns cli.StringSlice
|
||||
}
|
||||
|
||||
// the following are used for dependency injection during spec generation.
|
||||
nvmllib nvml.Interface
|
||||
}
|
||||
|
||||
// NewCommand constructs a generate-cdi command with the specified logger
|
||||
@ -269,6 +274,8 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) {
|
||||
nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()),
|
||||
nvcdi.WithCSVFiles(opts.csv.files.Value()),
|
||||
nvcdi.WithCSVIgnorePatterns(opts.csv.ignorePatterns.Value()),
|
||||
// We set the following to allow for dependency injection:
|
||||
nvcdi.WithNvmlLib(opts.nvmllib),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create CDI library: %v", err)
|
||||
|
157
cmd/nvidia-ctk/cdi/generate/generate_test.go
Normal file
157
cmd/nvidia-ctk/cdi/generate/generate_test.go
Normal file
@ -0,0 +1,157 @@
|
||||
/**
|
||||
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
**/
|
||||
|
||||
package generate
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100"
|
||||
testlog "github.com/sirupsen/logrus/hooks/test"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/test"
|
||||
)
|
||||
|
||||
func TestGenerateSpec(t *testing.T) {
|
||||
t.Setenv("__NVCT_TESTING_DEVICES_ARE_FILES", "true")
|
||||
moduleRoot, err := test.GetModuleRoot()
|
||||
require.NoError(t, err)
|
||||
|
||||
driverRoot := filepath.Join(moduleRoot, "testdata", "lookup", "rootfs-1")
|
||||
|
||||
logger, _ := testlog.NewNullLogger()
|
||||
testCases := []struct {
|
||||
description string
|
||||
options options
|
||||
expectedValidateError error
|
||||
expectedOptions options
|
||||
expectedError error
|
||||
expectedSpec string
|
||||
}{
|
||||
{
|
||||
description: "default",
|
||||
options: options{
|
||||
format: "yaml",
|
||||
mode: "nvml",
|
||||
vendor: "example.com",
|
||||
class: "device",
|
||||
driverRoot: driverRoot,
|
||||
},
|
||||
expectedOptions: options{
|
||||
format: "yaml",
|
||||
mode: "nvml",
|
||||
vendor: "example.com",
|
||||
class: "device",
|
||||
nvidiaCDIHookPath: "/usr/bin/nvidia-cdi-hook",
|
||||
driverRoot: driverRoot,
|
||||
},
|
||||
expectedSpec: `---
|
||||
cdiVersion: 0.5.0
|
||||
containerEdits:
|
||||
deviceNodes:
|
||||
- hostPath: {{ .driverRoot }}/dev/nvidiactl
|
||||
path: /dev/nvidiactl
|
||||
env:
|
||||
- NVIDIA_VISIBLE_DEVICES=void
|
||||
hooks:
|
||||
- args:
|
||||
- nvidia-cdi-hook
|
||||
- create-symlinks
|
||||
- --link
|
||||
- libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so
|
||||
hookName: createContainer
|
||||
path: /usr/bin/nvidia-cdi-hook
|
||||
- args:
|
||||
- nvidia-cdi-hook
|
||||
- enable-cuda-compat
|
||||
- --host-driver-version=999.88.77
|
||||
hookName: createContainer
|
||||
path: /usr/bin/nvidia-cdi-hook
|
||||
- args:
|
||||
- nvidia-cdi-hook
|
||||
- update-ldcache
|
||||
- --folder
|
||||
- /lib/x86_64-linux-gnu
|
||||
hookName: createContainer
|
||||
path: /usr/bin/nvidia-cdi-hook
|
||||
mounts:
|
||||
- containerPath: /lib/x86_64-linux-gnu/libcuda.so.999.88.77
|
||||
hostPath: {{ .driverRoot }}/lib/x86_64-linux-gnu/libcuda.so.999.88.77
|
||||
options:
|
||||
- ro
|
||||
- nosuid
|
||||
- nodev
|
||||
- bind
|
||||
devices:
|
||||
- containerEdits:
|
||||
deviceNodes:
|
||||
- hostPath: {{ .driverRoot }}/dev/nvidia0
|
||||
path: /dev/nvidia0
|
||||
name: "0"
|
||||
- containerEdits:
|
||||
deviceNodes:
|
||||
- hostPath: {{ .driverRoot }}/dev/nvidia0
|
||||
path: /dev/nvidia0
|
||||
name: all
|
||||
kind: example.com/device
|
||||
`,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.description, func(t *testing.T) {
|
||||
c := command{
|
||||
logger: logger,
|
||||
}
|
||||
|
||||
err := c.validateFlags(nil, &tc.options)
|
||||
require.ErrorIs(t, err, tc.expectedValidateError)
|
||||
require.EqualValues(t, tc.expectedOptions, tc.options)
|
||||
|
||||
// Set up a mock server, reusing the DGX A100 mock.
|
||||
server := dgxa100.New()
|
||||
// Override the driver version to match the version in our mock filesystem.
|
||||
server.SystemGetDriverVersionFunc = func() (string, nvml.Return) {
|
||||
return "999.88.77", nvml.SUCCESS
|
||||
}
|
||||
// Set the device count to 1 explicitly since we only have a single device node.
|
||||
server.DeviceGetCountFunc = func() (int, nvml.Return) {
|
||||
return 1, nvml.SUCCESS
|
||||
}
|
||||
for _, d := range server.Devices {
|
||||
// TODO: This is not implemented in the mock.
|
||||
(d.(*dgxa100.Device)).GetMaxMigDeviceCountFunc = func() (int, nvml.Return) {
|
||||
return 0, nvml.SUCCESS
|
||||
}
|
||||
}
|
||||
tc.options.nvmllib = server
|
||||
|
||||
spec, err := c.generateSpec(&tc.options)
|
||||
require.ErrorIs(t, err, tc.expectedError)
|
||||
|
||||
var buf bytes.Buffer
|
||||
_, err = spec.WriteTo(&buf)
|
||||
require.NoError(t, err)
|
||||
|
||||
require.Equal(t, strings.ReplaceAll(tc.expectedSpec, "{{ .driverRoot }}", driverRoot), buf.String())
|
||||
})
|
||||
}
|
||||
}
|
@ -25,6 +25,12 @@ type features struct {
|
||||
// If this feature flag is not set to 'true' only host-rooted config paths
|
||||
// (i.e. paths starting with an '@' are considered valid)
|
||||
AllowLDConfigFromContainer *feature `toml:"allow-ldconfig-from-container,omitempty"`
|
||||
// DisableCUDACompatLibHook, when enabled skips the injection of a specific
|
||||
// hook to process CUDA compatibility libraries.
|
||||
//
|
||||
// Note: Since this mechanism replaces the logic in the `nvidia-container-cli`,
|
||||
// toggling this feature has no effect if `allow-cuda-compat-libs-from-container` is enabled.
|
||||
DisableCUDACompatLibHook *feature `toml:"disable-cuda-compat-lib-hook,omitempty"`
|
||||
// DisableImexChannelCreation ensures that the implicit creation of
|
||||
// requested IMEX channels is skipped when invoking the nvidia-container-cli.
|
||||
DisableImexChannelCreation *feature `toml:"disable-imex-channel-creation,omitempty"`
|
||||
|
24
internal/discover/compat_libs.go
Normal file
24
internal/discover/compat_libs.go
Normal file
@ -0,0 +1,24 @@
|
||||
package discover
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
|
||||
)
|
||||
|
||||
// NewCUDACompatHookDiscoverer creates a discoverer for a enable-cuda-compat hook.
|
||||
// This hook is responsible for setting up CUDA compatibility in the container and depends on the host driver version.
|
||||
func NewCUDACompatHookDiscoverer(logger logger.Interface, nvidiaCDIHookPath string, driver *root.Driver) Discover {
|
||||
_, cudaVersionPattern := getCUDALibRootAndVersionPattern(logger, driver)
|
||||
var args []string
|
||||
if !strings.Contains(cudaVersionPattern, "*") {
|
||||
args = append(args, "--host-driver-version="+cudaVersionPattern)
|
||||
}
|
||||
|
||||
return CreateNvidiaCDIHook(
|
||||
nvidiaCDIHookPath,
|
||||
"enable-cuda-compat",
|
||||
args...,
|
||||
)
|
||||
}
|
@ -23,6 +23,7 @@ import (
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/discover"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
|
||||
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
|
||||
)
|
||||
|
||||
@ -35,7 +36,7 @@ import (
|
||||
// NVIDIA_GDRCOPY=enabled
|
||||
//
|
||||
// If not devices are selected, no changes are made.
|
||||
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA) (oci.SpecModifier, error) {
|
||||
func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image image.CUDA, driver *root.Driver) (oci.SpecModifier, error) {
|
||||
if devices := image.VisibleDevicesFromEnvVar(); len(devices) == 0 {
|
||||
logger.Infof("No modification required; no devices requested")
|
||||
return nil, nil
|
||||
@ -78,5 +79,24 @@ func NewFeatureGatedModifier(logger logger.Interface, cfg *config.Config, image
|
||||
discoverers = append(discoverers, d)
|
||||
}
|
||||
|
||||
if !cfg.Features.AllowCUDACompatLibsFromContainer.IsEnabled() && !cfg.Features.DisableCUDACompatLibHook.IsEnabled() {
|
||||
compatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, cfg.NVIDIACTKConfig.Path, driver)
|
||||
discoverers = append(discoverers, compatLibHookDiscoverer)
|
||||
// For legacy mode, we also need to inject a hook to update the LDCache
|
||||
// after we have modifed the configuration.
|
||||
if cfg.NVIDIAContainerRuntimeConfig.Mode == "legacy" {
|
||||
ldcacheUpdateHookDiscoverer, err := discover.NewLDCacheUpdateHook(
|
||||
logger,
|
||||
discover.None{},
|
||||
cfg.NVIDIACTKConfig.Path,
|
||||
"",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to construct ldcache update discoverer: %w", err)
|
||||
}
|
||||
discoverers = append(discoverers, ldcacheUpdateHookDiscoverer)
|
||||
}
|
||||
}
|
||||
|
||||
return NewModifierFromDiscoverer(logger, discover.Merge(discoverers...))
|
||||
}
|
||||
|
@ -75,6 +75,8 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
|
||||
}
|
||||
|
||||
mode := info.ResolveAutoMode(logger, cfg.NVIDIAContainerRuntimeConfig.Mode, image)
|
||||
// We update the mode here so that we can continue passing just the config to other functions.
|
||||
cfg.NVIDIAContainerRuntimeConfig.Mode = mode
|
||||
modeModifier, err := newModeModifier(logger, mode, cfg, ociSpec, image)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -94,7 +96,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
|
||||
}
|
||||
modifiers = append(modifiers, graphicsModifier)
|
||||
case "feature-gated":
|
||||
featureGatedModifier, err := modifier.NewFeatureGatedModifier(logger, cfg, image)
|
||||
featureGatedModifier, err := modifier.NewFeatureGatedModifier(logger, cfg, image, driver)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@ -126,8 +128,8 @@ func supportedModifierTypes(mode string) []string {
|
||||
return []string{"nvidia-hook-remover", "mode"}
|
||||
case "csv":
|
||||
// For CSV mode we support mode and feature-gated modification.
|
||||
return []string{"nvidia-hook-remover", "mode", "feature-gated"}
|
||||
return []string{"nvidia-hook-remover", "feature-gated", "mode"}
|
||||
default:
|
||||
return []string{"mode", "graphics", "feature-gated"}
|
||||
return []string{"feature-gated", "graphics", "mode"}
|
||||
}
|
||||
}
|
||||
|
@ -97,6 +97,8 @@ func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nv
|
||||
libraryPaths,
|
||||
)
|
||||
|
||||
// TODO: The following should use the version directly.
|
||||
cudaCompatLibHookDiscoverer := discover.NewCUDACompatHookDiscoverer(logger, nvidiaCDIHookPath, driver)
|
||||
updateLDCache, _ := discover.NewLDCacheUpdateHook(logger, libraries, nvidiaCDIHookPath, ldconfigPath)
|
||||
|
||||
d := discover.Merge(
|
||||
@ -105,6 +107,7 @@ func NewDriverLibraryDiscoverer(logger logger.Interface, driver *root.Driver, nv
|
||||
version,
|
||||
nvidiaCDIHookPath,
|
||||
),
|
||||
cudaCompatLibHookDiscoverer,
|
||||
updateLDCache,
|
||||
)
|
||||
|
||||
|
@ -18,13 +18,15 @@ package e2e
|
||||
|
||||
import (
|
||||
"context"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
// Integration tests for Docker runtime
|
||||
var _ = Describe("docker", Ordered, func() {
|
||||
var _ = Describe("docker", Ordered, ContinueOnFailure, func() {
|
||||
var r Runner
|
||||
|
||||
// Install the NVIDIA Container Toolkit
|
||||
@ -166,4 +168,51 @@ var _ = Describe("docker", Ordered, func() {
|
||||
Expect(referenceOutput).To(Equal(out4))
|
||||
})
|
||||
})
|
||||
|
||||
Describe("CUDA Forward compatibility", Ordered, func() {
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
_, _, err := r.Run("docker pull nvcr.io/nvidia/cuda:12.8.0-base-ubi8")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
})
|
||||
|
||||
BeforeAll(func(ctx context.Context) {
|
||||
compatOutput, _, err := r.Run("docker run --rm -i -e NVIDIA_VISIBLE_DEVICES=void nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ls /usr/local/cuda/compat/libcuda.*.*\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(compatOutput).ToNot(BeEmpty())
|
||||
compatDriverVersion := strings.TrimPrefix(filepath.Base(compatOutput), "libcuda.so.")
|
||||
compatMajor := strings.SplitN(compatDriverVersion, ".", 2)[0]
|
||||
|
||||
driverOutput, _, err := r.Run("nvidia-smi -q | grep \"Driver Version\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
parts := strings.SplitN(driverOutput, ":", 2)
|
||||
Expect(parts).To(HaveLen(2))
|
||||
|
||||
hostDriverVersion := strings.TrimSpace(parts[1])
|
||||
Expect(hostDriverVersion).ToNot(BeEmpty())
|
||||
driverMajor := strings.SplitN(hostDriverVersion, ".", 2)[0]
|
||||
|
||||
if driverMajor >= compatMajor {
|
||||
GinkgoLogr.Info("CUDA Forward Compatibility tests require an older driver version", "hostDriverVersion", hostDriverVersion, "compatDriverVersion", compatDriverVersion)
|
||||
Skip("CUDA Forward Compatibility tests require an older driver version")
|
||||
}
|
||||
})
|
||||
|
||||
It("should work with the nvidia runtime in legacy mode", func(ctx context.Context) {
|
||||
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
|
||||
})
|
||||
|
||||
It("should work with the nvidia runtime in CDI mode", func(ctx context.Context) {
|
||||
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/local/cuda/compat"))
|
||||
})
|
||||
|
||||
It("should NOT work with nvidia-container-runtime-hook", func(ctx context.Context) {
|
||||
ldconfigOut, _, err := r.Run("docker run --rm -i -e NVIDIA_DISABLE_REQUIRE=true --runtime=runc --gpus all nvcr.io/nvidia/cuda:12.8.0-base-ubi8 bash -c \"ldconfig -p | grep libcuda.so.1\"")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(ldconfigOut).To(ContainSubstring("/usr/lib64"))
|
||||
})
|
||||
})
|
||||
})
|
||||
|
380
vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go
generated
vendored
Normal file
380
vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go
generated
vendored
Normal file
@ -0,0 +1,380 @@
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dgxa100
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml/mock"
|
||||
"github.com/google/uuid"
|
||||
)
|
||||
|
||||
type Server struct {
|
||||
mock.Interface
|
||||
mock.ExtendedInterface
|
||||
Devices [8]nvml.Device
|
||||
DriverVersion string
|
||||
NvmlVersion string
|
||||
CudaDriverVersion int
|
||||
}
|
||||
type Device struct {
|
||||
mock.Device
|
||||
sync.RWMutex
|
||||
UUID string
|
||||
Name string
|
||||
Brand nvml.BrandType
|
||||
Architecture nvml.DeviceArchitecture
|
||||
PciBusID string
|
||||
Minor int
|
||||
Index int
|
||||
CudaComputeCapability CudaComputeCapability
|
||||
MigMode int
|
||||
GpuInstances map[*GpuInstance]struct{}
|
||||
GpuInstanceCounter uint32
|
||||
MemoryInfo nvml.Memory
|
||||
}
|
||||
|
||||
type GpuInstance struct {
|
||||
mock.GpuInstance
|
||||
sync.RWMutex
|
||||
Info nvml.GpuInstanceInfo
|
||||
ComputeInstances map[*ComputeInstance]struct{}
|
||||
ComputeInstanceCounter uint32
|
||||
}
|
||||
|
||||
type ComputeInstance struct {
|
||||
mock.ComputeInstance
|
||||
Info nvml.ComputeInstanceInfo
|
||||
}
|
||||
|
||||
type CudaComputeCapability struct {
|
||||
Major int
|
||||
Minor int
|
||||
}
|
||||
|
||||
var _ nvml.Interface = (*Server)(nil)
|
||||
var _ nvml.Device = (*Device)(nil)
|
||||
var _ nvml.GpuInstance = (*GpuInstance)(nil)
|
||||
var _ nvml.ComputeInstance = (*ComputeInstance)(nil)
|
||||
|
||||
func New() *Server {
|
||||
server := &Server{
|
||||
Devices: [8]nvml.Device{
|
||||
NewDevice(0),
|
||||
NewDevice(1),
|
||||
NewDevice(2),
|
||||
NewDevice(3),
|
||||
NewDevice(4),
|
||||
NewDevice(5),
|
||||
NewDevice(6),
|
||||
NewDevice(7),
|
||||
},
|
||||
DriverVersion: "550.54.15",
|
||||
NvmlVersion: "12.550.54.15",
|
||||
CudaDriverVersion: 12040,
|
||||
}
|
||||
server.setMockFuncs()
|
||||
return server
|
||||
}
|
||||
|
||||
func NewDevice(index int) *Device {
|
||||
device := &Device{
|
||||
UUID: "GPU-" + uuid.New().String(),
|
||||
Name: "Mock NVIDIA A100-SXM4-40GB",
|
||||
Brand: nvml.BRAND_NVIDIA,
|
||||
Architecture: nvml.DEVICE_ARCH_AMPERE,
|
||||
PciBusID: fmt.Sprintf("0000:%02x:00.0", index),
|
||||
Minor: index,
|
||||
Index: index,
|
||||
CudaComputeCapability: CudaComputeCapability{
|
||||
Major: 8,
|
||||
Minor: 0,
|
||||
},
|
||||
GpuInstances: make(map[*GpuInstance]struct{}),
|
||||
GpuInstanceCounter: 0,
|
||||
MemoryInfo: nvml.Memory{42949672960, 0, 0},
|
||||
}
|
||||
device.setMockFuncs()
|
||||
return device
|
||||
}
|
||||
|
||||
func NewGpuInstance(info nvml.GpuInstanceInfo) *GpuInstance {
|
||||
gi := &GpuInstance{
|
||||
Info: info,
|
||||
ComputeInstances: make(map[*ComputeInstance]struct{}),
|
||||
ComputeInstanceCounter: 0,
|
||||
}
|
||||
gi.setMockFuncs()
|
||||
return gi
|
||||
}
|
||||
|
||||
func NewComputeInstance(info nvml.ComputeInstanceInfo) *ComputeInstance {
|
||||
ci := &ComputeInstance{
|
||||
Info: info,
|
||||
}
|
||||
ci.setMockFuncs()
|
||||
return ci
|
||||
}
|
||||
|
||||
func (s *Server) setMockFuncs() {
|
||||
s.ExtensionsFunc = func() nvml.ExtendedInterface {
|
||||
return s
|
||||
}
|
||||
|
||||
s.LookupSymbolFunc = func(symbol string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
s.InitFunc = func() nvml.Return {
|
||||
return nvml.SUCCESS
|
||||
}
|
||||
|
||||
s.ShutdownFunc = func() nvml.Return {
|
||||
return nvml.SUCCESS
|
||||
}
|
||||
|
||||
s.SystemGetDriverVersionFunc = func() (string, nvml.Return) {
|
||||
return s.DriverVersion, nvml.SUCCESS
|
||||
}
|
||||
|
||||
s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) {
|
||||
return s.NvmlVersion, nvml.SUCCESS
|
||||
}
|
||||
|
||||
s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) {
|
||||
return s.CudaDriverVersion, nvml.SUCCESS
|
||||
}
|
||||
|
||||
s.DeviceGetCountFunc = func() (int, nvml.Return) {
|
||||
return len(s.Devices), nvml.SUCCESS
|
||||
}
|
||||
|
||||
s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) {
|
||||
if index < 0 || index >= len(s.Devices) {
|
||||
return nil, nvml.ERROR_INVALID_ARGUMENT
|
||||
}
|
||||
return s.Devices[index], nvml.SUCCESS
|
||||
}
|
||||
|
||||
s.DeviceGetHandleByUUIDFunc = func(uuid string) (nvml.Device, nvml.Return) {
|
||||
for _, d := range s.Devices {
|
||||
if uuid == d.(*Device).UUID {
|
||||
return d, nvml.SUCCESS
|
||||
}
|
||||
}
|
||||
return nil, nvml.ERROR_INVALID_ARGUMENT
|
||||
}
|
||||
|
||||
s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) {
|
||||
for _, d := range s.Devices {
|
||||
if busID == d.(*Device).PciBusID {
|
||||
return d, nvml.SUCCESS
|
||||
}
|
||||
}
|
||||
return nil, nvml.ERROR_INVALID_ARGUMENT
|
||||
}
|
||||
}
|
||||
|
||||
func (d *Device) setMockFuncs() {
|
||||
d.GetMinorNumberFunc = func() (int, nvml.Return) {
|
||||
return d.Minor, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetIndexFunc = func() (int, nvml.Return) {
|
||||
return d.Index, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) {
|
||||
return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetUUIDFunc = func() (string, nvml.Return) {
|
||||
return d.UUID, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetNameFunc = func() (string, nvml.Return) {
|
||||
return d.Name, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) {
|
||||
return d.Brand, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetArchitectureFunc = func() (nvml.DeviceArchitecture, nvml.Return) {
|
||||
return d.Architecture, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) {
|
||||
return d.MemoryInfo, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) {
|
||||
p := nvml.PciInfo{
|
||||
PciDeviceId: 0x20B010DE,
|
||||
}
|
||||
return p, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) {
|
||||
d.MigMode = mode
|
||||
return nvml.SUCCESS, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetMigModeFunc = func() (int, int, nvml.Return) {
|
||||
return d.MigMode, d.MigMode, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) {
|
||||
if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT {
|
||||
return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT
|
||||
}
|
||||
|
||||
if _, exists := MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists {
|
||||
return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
|
||||
}
|
||||
|
||||
return MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) {
|
||||
return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) {
|
||||
d.Lock()
|
||||
defer d.Unlock()
|
||||
giInfo := nvml.GpuInstanceInfo{
|
||||
Device: d,
|
||||
Id: d.GpuInstanceCounter,
|
||||
ProfileId: info.Id,
|
||||
}
|
||||
d.GpuInstanceCounter++
|
||||
gi := NewGpuInstance(giInfo)
|
||||
d.GpuInstances[gi] = struct{}{}
|
||||
return gi, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) {
|
||||
d.Lock()
|
||||
defer d.Unlock()
|
||||
giInfo := nvml.GpuInstanceInfo{
|
||||
Device: d,
|
||||
Id: d.GpuInstanceCounter,
|
||||
ProfileId: info.Id,
|
||||
Placement: *placement,
|
||||
}
|
||||
d.GpuInstanceCounter++
|
||||
gi := NewGpuInstance(giInfo)
|
||||
d.GpuInstances[gi] = struct{}{}
|
||||
return gi, nvml.SUCCESS
|
||||
}
|
||||
|
||||
d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) {
|
||||
d.RLock()
|
||||
defer d.RUnlock()
|
||||
var gis []nvml.GpuInstance
|
||||
for gi := range d.GpuInstances {
|
||||
if gi.Info.ProfileId == info.Id {
|
||||
gis = append(gis, gi)
|
||||
}
|
||||
}
|
||||
return gis, nvml.SUCCESS
|
||||
}
|
||||
}
|
||||
|
||||
func (gi *GpuInstance) setMockFuncs() {
|
||||
gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) {
|
||||
return gi.Info, nvml.SUCCESS
|
||||
}
|
||||
|
||||
gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) {
|
||||
if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT {
|
||||
return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT
|
||||
}
|
||||
|
||||
if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED {
|
||||
return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
|
||||
}
|
||||
|
||||
giProfileId := int(gi.Info.ProfileId)
|
||||
|
||||
if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists {
|
||||
return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
|
||||
}
|
||||
|
||||
if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists {
|
||||
return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED
|
||||
}
|
||||
|
||||
return MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS
|
||||
}
|
||||
|
||||
gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) {
|
||||
return MIGPlacements.ComputeInstancePossiblePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS
|
||||
}
|
||||
|
||||
gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) {
|
||||
gi.Lock()
|
||||
defer gi.Unlock()
|
||||
ciInfo := nvml.ComputeInstanceInfo{
|
||||
Device: gi.Info.Device,
|
||||
GpuInstance: gi,
|
||||
Id: gi.ComputeInstanceCounter,
|
||||
ProfileId: info.Id,
|
||||
}
|
||||
gi.ComputeInstanceCounter++
|
||||
ci := NewComputeInstance(ciInfo)
|
||||
gi.ComputeInstances[ci] = struct{}{}
|
||||
return ci, nvml.SUCCESS
|
||||
}
|
||||
|
||||
gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) {
|
||||
gi.RLock()
|
||||
defer gi.RUnlock()
|
||||
var cis []nvml.ComputeInstance
|
||||
for ci := range gi.ComputeInstances {
|
||||
if ci.Info.ProfileId == info.Id {
|
||||
cis = append(cis, ci)
|
||||
}
|
||||
}
|
||||
return cis, nvml.SUCCESS
|
||||
}
|
||||
|
||||
gi.DestroyFunc = func() nvml.Return {
|
||||
d := gi.Info.Device.(*Device)
|
||||
d.Lock()
|
||||
defer d.Unlock()
|
||||
delete(d.GpuInstances, gi)
|
||||
return nvml.SUCCESS
|
||||
}
|
||||
}
|
||||
|
||||
func (ci *ComputeInstance) setMockFuncs() {
|
||||
ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) {
|
||||
return ci.Info, nvml.SUCCESS
|
||||
}
|
||||
|
||||
ci.DestroyFunc = func() nvml.Return {
|
||||
gi := ci.Info.GpuInstance.(*GpuInstance)
|
||||
gi.Lock()
|
||||
defer gi.Unlock()
|
||||
delete(gi.ComputeInstances, ci)
|
||||
return nvml.SUCCESS
|
||||
}
|
||||
}
|
471
vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go
generated
vendored
Normal file
471
vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go
generated
vendored
Normal file
@ -0,0 +1,471 @@
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package dgxa100
|
||||
|
||||
import (
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
)
|
||||
|
||||
// MIGProfiles holds the profile information for GIs and CIs in this mock server.
|
||||
// We should consider auto-generating this object in the future.
|
||||
var MIGProfiles = struct {
|
||||
GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo
|
||||
ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo
|
||||
}{
|
||||
GpuInstanceProfiles: map[int]nvml.GpuInstanceProfileInfo{
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 7,
|
||||
MultiprocessorCount: 14,
|
||||
CopyEngineCount: 1,
|
||||
DecoderCount: 0,
|
||||
EncoderCount: 0,
|
||||
JpegCount: 0,
|
||||
OfaCount: 0,
|
||||
MemorySizeMB: 4864,
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
|
||||
Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 14,
|
||||
CopyEngineCount: 1,
|
||||
DecoderCount: 1,
|
||||
EncoderCount: 0,
|
||||
JpegCount: 1,
|
||||
OfaCount: 1,
|
||||
MemorySizeMB: 4864,
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
|
||||
Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 4,
|
||||
MultiprocessorCount: 14,
|
||||
CopyEngineCount: 1,
|
||||
DecoderCount: 1,
|
||||
EncoderCount: 0,
|
||||
JpegCount: 0,
|
||||
OfaCount: 0,
|
||||
MemorySizeMB: 9856,
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
|
||||
Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 2,
|
||||
InstanceCount: 3,
|
||||
MultiprocessorCount: 28,
|
||||
CopyEngineCount: 2,
|
||||
DecoderCount: 1,
|
||||
EncoderCount: 0,
|
||||
JpegCount: 0,
|
||||
OfaCount: 0,
|
||||
MemorySizeMB: 9856,
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
|
||||
Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 3,
|
||||
InstanceCount: 2,
|
||||
MultiprocessorCount: 42,
|
||||
CopyEngineCount: 3,
|
||||
DecoderCount: 2,
|
||||
EncoderCount: 0,
|
||||
JpegCount: 0,
|
||||
OfaCount: 0,
|
||||
MemorySizeMB: 19968,
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
|
||||
Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 4,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 56,
|
||||
CopyEngineCount: 4,
|
||||
DecoderCount: 2,
|
||||
EncoderCount: 0,
|
||||
JpegCount: 0,
|
||||
OfaCount: 0,
|
||||
MemorySizeMB: 19968,
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
|
||||
Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 7,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 98,
|
||||
CopyEngineCount: 7,
|
||||
DecoderCount: 5,
|
||||
EncoderCount: 0,
|
||||
JpegCount: 1,
|
||||
OfaCount: 1,
|
||||
MemorySizeMB: 40192,
|
||||
},
|
||||
},
|
||||
ComputeInstanceProfiles: map[int]map[int]nvml.ComputeInstanceProfileInfo{
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 14,
|
||||
SharedCopyEngineCount: 1,
|
||||
SharedDecoderCount: 0,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 14,
|
||||
SharedCopyEngineCount: 1,
|
||||
SharedDecoderCount: 1,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 14,
|
||||
SharedCopyEngineCount: 1,
|
||||
SharedDecoderCount: 1,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 2,
|
||||
MultiprocessorCount: 14,
|
||||
SharedCopyEngineCount: 2,
|
||||
SharedDecoderCount: 1,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE,
|
||||
SliceCount: 2,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 28,
|
||||
SharedCopyEngineCount: 2,
|
||||
SharedDecoderCount: 1,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 3,
|
||||
MultiprocessorCount: 14,
|
||||
SharedCopyEngineCount: 3,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE,
|
||||
SliceCount: 2,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 28,
|
||||
SharedCopyEngineCount: 3,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE,
|
||||
SliceCount: 3,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 42,
|
||||
SharedCopyEngineCount: 3,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 4,
|
||||
MultiprocessorCount: 14,
|
||||
SharedCopyEngineCount: 4,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE,
|
||||
SliceCount: 2,
|
||||
InstanceCount: 2,
|
||||
MultiprocessorCount: 28,
|
||||
SharedCopyEngineCount: 4,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE,
|
||||
SliceCount: 4,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 56,
|
||||
SharedCopyEngineCount: 4,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 7,
|
||||
MultiprocessorCount: 14,
|
||||
SharedCopyEngineCount: 7,
|
||||
SharedDecoderCount: 5,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE,
|
||||
SliceCount: 2,
|
||||
InstanceCount: 3,
|
||||
MultiprocessorCount: 28,
|
||||
SharedCopyEngineCount: 7,
|
||||
SharedDecoderCount: 5,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE,
|
||||
SliceCount: 3,
|
||||
InstanceCount: 2,
|
||||
MultiprocessorCount: 42,
|
||||
SharedCopyEngineCount: 7,
|
||||
SharedDecoderCount: 5,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE,
|
||||
SliceCount: 4,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 56,
|
||||
SharedCopyEngineCount: 7,
|
||||
SharedDecoderCount: 5,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {
|
||||
Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE,
|
||||
SliceCount: 7,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 98,
|
||||
SharedCopyEngineCount: 7,
|
||||
SharedDecoderCount: 5,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
// MIGPlacements holds the placement information for GIs and CIs in this mock server.
|
||||
// We should consider auto-generating this object in the future.
|
||||
var MIGPlacements = struct {
|
||||
GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement
|
||||
ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement
|
||||
}{
|
||||
GpuInstancePossiblePlacements: map[int][]nvml.GpuInstancePlacement{
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
|
||||
{
|
||||
Start: 0,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 1,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 2,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 3,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 4,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 5,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 6,
|
||||
Size: 1,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
|
||||
{
|
||||
Start: 0,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 1,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 2,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 3,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 4,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 5,
|
||||
Size: 1,
|
||||
},
|
||||
{
|
||||
Start: 6,
|
||||
Size: 1,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
|
||||
{
|
||||
Start: 0,
|
||||
Size: 2,
|
||||
},
|
||||
{
|
||||
Start: 2,
|
||||
Size: 2,
|
||||
},
|
||||
{
|
||||
Start: 4,
|
||||
Size: 2,
|
||||
},
|
||||
{
|
||||
Start: 6,
|
||||
Size: 2,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
|
||||
{
|
||||
Start: 0,
|
||||
Size: 2,
|
||||
},
|
||||
{
|
||||
Start: 2,
|
||||
Size: 2,
|
||||
},
|
||||
{
|
||||
Start: 4,
|
||||
Size: 2,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
|
||||
{
|
||||
Start: 0,
|
||||
Size: 4,
|
||||
},
|
||||
{
|
||||
Start: 4,
|
||||
Size: 4,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
|
||||
{
|
||||
Start: 0,
|
||||
Size: 4,
|
||||
},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
|
||||
{
|
||||
Start: 0,
|
||||
Size: 8,
|
||||
},
|
||||
},
|
||||
},
|
||||
// TODO: Fill out ComputeInstancePossiblePlacements
|
||||
ComputeInstancePossiblePlacements: map[int]map[int][]nvml.ComputeInstancePlacement{
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_2_SLICE: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_3_SLICE: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_4_SLICE: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {},
|
||||
},
|
||||
nvml.GPU_INSTANCE_PROFILE_7_SLICE: {
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {},
|
||||
nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {},
|
||||
},
|
||||
},
|
||||
}
|
1
vendor/modules.txt
vendored
1
vendor/modules.txt
vendored
@ -11,6 +11,7 @@ github.com/NVIDIA/go-nvlib/pkg/pciids
|
||||
github.com/NVIDIA/go-nvml/pkg/dl
|
||||
github.com/NVIDIA/go-nvml/pkg/nvml
|
||||
github.com/NVIDIA/go-nvml/pkg/nvml/mock
|
||||
github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100
|
||||
# github.com/cpuguy83/go-md2man/v2 v2.0.5
|
||||
## explicit; go 1.11
|
||||
github.com/cpuguy83/go-md2man/v2/md2man
|
||||
|
Loading…
Reference in New Issue
Block a user