Copy files from nvidia-container-toolkit

Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
Evan Lezar
2021-06-30 13:54:16 +02:00
parent 22fcd022f3
commit d3997eceb2
89 changed files with 8351 additions and 0 deletions

239
internal/ldcache/ldcache.go Normal file
View File

@@ -0,0 +1,239 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
// Adapted from https://github.com/rai-project/ldcache
package ldcache
import (
"bytes"
"encoding/binary"
"errors"
"os"
"path/filepath"
"syscall"
"unsafe"
log "github.com/sirupsen/logrus"
)
const ldcachePath = "/etc/ld.so.cache"
const (
magicString1 = "ld.so-1.7.0"
magicString2 = "glibc-ld.so.cache"
magicVersion = "1.1"
)
const (
flagTypeMask = 0x00ff
flagTypeELF = 0x0001
flagArchMask = 0xff00
flagArchI386 = 0x0000
flagArchX8664 = 0x0300
flagArchX32 = 0x0800
flagArchPpc64le = 0x0500
)
var ErrInvalidCache = errors.New("invalid ld.so.cache file")
type Header1 struct {
Magic [len(magicString1) + 1]byte // include null delimiter
NLibs uint32
}
type Entry1 struct {
Flags int32
Key, Value uint32
}
type Header2 struct {
Magic [len(magicString2)]byte
Version [len(magicVersion)]byte
NLibs uint32
TableSize uint32
_ [3]uint32 // unused
_ uint64 // force 8 byte alignment
}
type Entry2 struct {
Flags int32
Key, Value uint32
OSVersion uint32
HWCap uint64
}
type LDCache struct {
*bytes.Reader
data, libs []byte
header Header2
entries []Entry2
root string
logger *log.Logger
}
func NewLDCacheWithLogger(logger *log.Logger, root string) (*LDCache, error) {
return openWithRoot(logger, root)
}
func Open() (*LDCache, error) {
return openWithRoot(log.StandardLogger(), "")
}
func openWithRoot(logger *log.Logger, root string) (*LDCache, error) {
path := filepath.Join(root, ldcachePath)
logger.Debugf("Opening ld.conf at %v", path)
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
fi, err := f.Stat()
if err != nil {
return nil, err
}
d, err := syscall.Mmap(int(f.Fd()), 0, int(fi.Size()),
syscall.PROT_READ, syscall.MAP_PRIVATE)
if err != nil {
return nil, err
}
cache := &LDCache{
data: d,
Reader: bytes.NewReader(d),
root: root,
logger: logger,
}
return cache, cache.parse()
}
func (c *LDCache) Close() error {
return syscall.Munmap(c.data)
}
func (c *LDCache) Magic() string {
return string(c.header.Magic[:])
}
func (c *LDCache) Version() string {
return string(c.header.Version[:])
}
func strn(b []byte, n int) string {
return string(b[:n])
}
func (c *LDCache) parse() error {
var header Header1
// Check for the old format (< glibc-2.2)
if c.Len() <= int(unsafe.Sizeof(header)) {
return ErrInvalidCache
}
if strn(c.data, len(magicString1)) == magicString1 {
if err := binary.Read(c, binary.LittleEndian, &header); err != nil {
return err
}
n := int64(header.NLibs) * int64(unsafe.Sizeof(Entry1{}))
offset, err := c.Seek(n, 1) // skip old entries
if err != nil {
return err
}
n = (-offset) & int64(unsafe.Alignof(c.header)-1)
_, err = c.Seek(n, 1) // skip padding
if err != nil {
return err
}
}
c.libs = c.data[c.Size()-int64(c.Len()):] // kv offsets start here
if err := binary.Read(c, binary.LittleEndian, &c.header); err != nil {
return err
}
if c.Magic() != magicString2 || c.Version() != magicVersion {
return ErrInvalidCache
}
c.entries = make([]Entry2, c.header.NLibs)
if err := binary.Read(c, binary.LittleEndian, &c.entries); err != nil {
return err
}
return nil
}
func (c *LDCache) Lookup(libs ...string) (paths32, paths64 []string) {
c.logger.Debugf("Looking up %v in cache", libs)
type void struct{}
var paths *[]string
set := make(map[string]void)
prefix := make([][]byte, len(libs))
for i := range libs {
prefix[i] = []byte(libs[i])
}
for _, e := range c.entries {
if ((e.Flags & flagTypeMask) & flagTypeELF) == 0 {
continue
}
switch e.Flags & flagArchMask {
case flagArchX8664:
fallthrough
case flagArchPpc64le:
paths = &paths64
case flagArchX32:
fallthrough
case flagArchI386:
paths = &paths32
default:
continue
}
if e.Key > uint32(len(c.libs)) || e.Value > uint32(len(c.libs)) {
continue
}
lib := c.libs[e.Key:]
value := c.libs[e.Value:]
for _, p := range prefix {
if bytes.HasPrefix(lib, p) {
n := bytes.IndexByte(value, 0)
if n < 0 {
break
}
name := filepath.Join(c.root, strn(value, n))
c.logger.Debugf("checking %v", string(name))
path, err := filepath.EvalSymlinks(name)
if err != nil {
c.logger.Debugf("could not resolve symlink for %v", name)
break
}
if _, ok := set[path]; ok {
break
}
set[path] = void{}
*paths = append(*paths, path)
break
}
}
}
return
}

78
internal/lookup/file.go Normal file
View File

@@ -0,0 +1,78 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package lookup
import (
"fmt"
"os"
"path/filepath"
log "github.com/sirupsen/logrus"
)
type file struct {
logger *log.Logger
prefixes []string
filter func(string) error
}
func NewFileLocator(root string) Locator {
return NewFileLocatorWithLogger(log.StandardLogger(), root)
}
func NewFileLocatorWithLogger(logger *log.Logger, root string) Locator {
l := file{
logger: logger,
prefixes: []string{root},
filter: assertFile,
}
return &l
}
var _ Locator = (*file)(nil)
func (p file) Locate(filename string) ([]string, error) {
var filenames []string
for _, prefix := range p.prefixes {
candidate := filepath.Join(prefix, filename)
p.logger.Debugf("Checking candidate '%v'", candidate)
err := p.filter(candidate)
if err != nil {
p.logger.Debugf("Candidate '%v' does not meet requirements: %v", candidate, err)
continue
}
filenames = append(filenames, candidate)
}
if len(filename) == 0 {
return nil, fmt.Errorf("file %v not found", filename)
}
return filenames, nil
}
func assertFile(filename string) error {
info, err := os.Stat(filename)
if err != nil {
return fmt.Errorf("error getting info for %v: %v", filename, err)
}
if info.IsDir() {
return fmt.Errorf("specified path '%v' is a directory", filename)
}
return nil
}

View File

@@ -0,0 +1,65 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package lookup
import (
"fmt"
log "github.com/sirupsen/logrus"
"gitlab.com/nvidia/cloud-native/container-toolkit/internal/ldcache"
)
type library struct {
logger *log.Logger
cache *ldcache.LDCache
}
var _ Locator = (*library)(nil)
// NewLibraryLocatorWithLogger creates a library locator using the standard logger
func NewLibraryLocator(root string) (Locator, error) {
return NewLibraryLocatorWithLogger(log.StandardLogger(), root)
}
// NewLibraryLocatorWithLogger creates a library locator using the specified logger.
func NewLibraryLocatorWithLogger(logger *log.Logger, root string) (Locator, error) {
logger.Infof("Reading ldcache at %v", root)
cache, err := ldcache.NewLDCacheWithLogger(logger, root)
if err != nil {
return nil, fmt.Errorf("error loading ldcache: %v", err)
}
l := library{
logger: logger,
cache: cache,
}
return &l, nil
}
func (l library) Locate(libname string) ([]string, error) {
paths32, paths64 := l.cache.Lookup(libname)
if len(paths32) > 0 {
l.logger.Warnf("Ignoring 32-bit libraries for %v: %v", libname, paths32)
}
if len(paths64) == 0 {
return nil, fmt.Errorf("64-bit library %v not found", libname)
}
return paths64, nil
}

24
internal/lookup/lookup.go Normal file
View File

@@ -0,0 +1,24 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package lookup
//go:generate moq -stub -out lookup_mock.go . Locator
// Locator defines the interface for locating files on a system.
type Locator interface {
Locate(string) ([]string, error)
}

View File

@@ -0,0 +1,77 @@
// Code generated by moq; DO NOT EDIT.
// github.com/matryer/moq
package lookup
import (
"sync"
)
// Ensure, that LocatorMock does implement Locator.
// If this is not the case, regenerate this file with moq.
var _ Locator = &LocatorMock{}
// LocatorMock is a mock implementation of Locator.
//
// func TestSomethingThatUsesLocator(t *testing.T) {
//
// // make and configure a mocked Locator
// mockedLocator := &LocatorMock{
// LocateFunc: func(s string) ([]string, error) {
// panic("mock out the Locate method")
// },
// }
//
// // use mockedLocator in code that requires Locator
// // and then make assertions.
//
// }
type LocatorMock struct {
// LocateFunc mocks the Locate method.
LocateFunc func(s string) ([]string, error)
// calls tracks calls to the methods.
calls struct {
// Locate holds details about calls to the Locate method.
Locate []struct {
// S is the s argument value.
S string
}
}
lockLocate sync.RWMutex
}
// Locate calls LocateFunc.
func (mock *LocatorMock) Locate(s string) ([]string, error) {
callInfo := struct {
S string
}{
S: s,
}
mock.lockLocate.Lock()
mock.calls.Locate = append(mock.calls.Locate, callInfo)
mock.lockLocate.Unlock()
if mock.LocateFunc == nil {
var (
stringsOut []string
errOut error
)
return stringsOut, errOut
}
return mock.LocateFunc(s)
}
// LocateCalls gets all the calls that were made to Locate.
// Check the length with:
// len(mockedLocator.LocateCalls())
func (mock *LocatorMock) LocateCalls() []struct {
S string
} {
var calls []struct {
S string
}
mock.lockLocate.RLock()
calls = mock.calls.Locate
mock.lockLocate.RUnlock()
return calls
}

94
internal/lookup/path.go Normal file
View File

@@ -0,0 +1,94 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package lookup
import (
"fmt"
"os"
"path/filepath"
"strings"
log "github.com/sirupsen/logrus"
)
const (
envPath = "PATH"
)
var defaultPaths = []string{"/usr/local/sbin", "/usr/local/bin", "/usr/sbin", "/usr/bin", "/sbin", "/bin"}
type path struct {
file
}
func NewPathLocator(root string) Locator {
return NewPathLocatorWithLogger(log.StandardLogger(), root)
}
func NewPathLocatorWithLogger(logger *log.Logger, root string) Locator {
pathEnv := os.Getenv(envPath)
paths := filepath.SplitList(pathEnv)
if root != "" {
paths = append(paths, defaultPaths...)
}
var prefixes []string
for _, dir := range paths {
prefixes = append(prefixes, filepath.Join(root, dir))
}
l := path{
file: file{
logger: logger,
prefixes: prefixes,
filter: assertExecutable,
},
}
return &l
}
var _ Locator = (*path)(nil)
func (p path) Locate(filename string) ([]string, error) {
// For absolute paths we ensure that it is executable
if strings.Contains(filename, "/") {
err := assertExecutable(filename)
if err != nil {
return nil, fmt.Errorf("absolute path %v is not an executable file: %v", filename, err)
}
return []string{filename}, nil
}
return p.file.Locate(filename)
}
func assertExecutable(filename string) error {
err := assertFile(filename)
if err != nil {
return err
}
info, err := os.Stat(filename)
if err != nil {
return err
}
if info.Mode()&0111 == 0 {
return fmt.Errorf("specified file '%v' is not executable", filename)
}
return nil
}

141
internal/nvcaps/nvcaps.go Normal file
View File

@@ -0,0 +1,141 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package nvcaps
import (
"bufio"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strconv"
"strings"
)
const (
nvidiaProcDriverPath = "/proc/driver/nvidia"
nvidiaCapabilitiesPath = nvidiaProcDriverPath + "/capabilities"
nvcapsProcDriverPath = "/proc/driver/nvidia-caps"
nvcapsMigMinorsPath = nvcapsProcDriverPath + "/mig-minors"
nvcapsDevicePath = "/dev/nvidia-caps"
)
// MigMinor represents the minor number of a MIG device
type MigMinor int
// MigCap represents the path to a MIG cap file
type MigCap string
// LoadMigMinors loads the MIG minors file and returns its contents as a map
func LoadMigMinors() (map[MigCap]MigMinor, error) {
// Open nvcapsMigMinorsPath for walking.
// If the nvcapsMigMinorsPath does not exist, then we are not on a MIG
// capable machine, so there is nothing to do.
// The format of this file is discussed in:
// https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#unique_1576522674
minorsFile, err := os.Open(nvcapsMigMinorsPath)
if os.IsNotExist(err) {
return nil, nil
}
if err != nil {
return nil, fmt.Errorf("error opening MIG minors file: %v", err)
}
defer minorsFile.Close()
return processMinorsFile(minorsFile), nil
}
func processMinorsFile(minorsFile io.Reader) map[MigCap]MigMinor {
// Walk each line of nvcapsMigMinorsPath and construct a mapping of nvidia
// capabilities path to device minor for that capability
migCaps := make(map[MigCap]MigMinor)
scanner := bufio.NewScanner(minorsFile)
for scanner.Scan() {
cap, minor, err := processMigMinorsLine(scanner.Text())
if err != nil {
log.Printf("Skipping line in MIG minors file: %v", err)
continue
}
migCaps[cap] = minor
}
return migCaps
}
func processMigMinorsLine(line string) (MigCap, MigMinor, error) {
parts := strings.Split(line, " ")
if len(parts) != 2 {
return "", 0, fmt.Errorf("error processing line: %v", line)
}
migCap := MigCap(parts[0])
if !migCap.isValid() {
return "", 0, fmt.Errorf("invalid MIG minors line: '%v'", line)
}
minor, err := strconv.Atoi(parts[1])
if err != nil {
return "", 0, fmt.Errorf("error reading MIG minor from '%v': %v", line, err)
}
return migCap, MigMinor(minor), nil
}
func (m MigCap) isValid() bool {
cap := string(m)
switch cap {
case "config", "monitor":
return true
default:
var gpu int
var gi int
var ci int
// Loog for a CI access file
n, _ := fmt.Sscanf(cap, "gpu%d/gi%d/ci%d/access", &gpu, &gi, &ci)
if n == 3 {
return true
}
// Look for a GI access file
n, _ = fmt.Sscanf(cap, "gpu%d/gi%d/access %d", &gpu, &gi)
if n == 2 {
return true
}
}
return false
}
// ProcPath returns the proc path associated with the MIG capability
func (m MigCap) ProcPath() string {
id := string(m)
var path string
switch id {
case "config", "monitor":
path = "mig/" + id
default:
parts := strings.SplitN(id, "/", 2)
path = strings.Join([]string{parts[0], "mig", parts[1]}, "/")
}
return filepath.Join(nvidiaCapabilitiesPath, path)
}
// DevicePath returns the path for the nvidia-caps device with the specified
// minor number
func (m MigMinor) DevicePath() string {
return fmt.Sprintf(nvcapsDevicePath+"/nvidia-cap%d", m)
}

View File

@@ -0,0 +1,94 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package nvcaps
import (
"strings"
"testing"
"github.com/stretchr/testify/require"
)
func TestProcessMinorsFile(t *testing.T) {
testCases := []struct {
lines []string
expected map[MigCap]MigMinor
}{
{[]string{}, map[MigCap]MigMinor{}},
{[]string{"invalidLine"}, map[MigCap]MigMinor{}},
{[]string{"config 1"}, map[MigCap]MigMinor{"config": 1}},
{[]string{"gpu0/gi0/ci0/access 4"}, map[MigCap]MigMinor{"gpu0/gi0/ci0/access": 4}},
{[]string{"config 1", "invalidLine"}, map[MigCap]MigMinor{"config": 1}},
{[]string{"config 1", "gpu0/gi0/ci0/access 4"}, map[MigCap]MigMinor{"config": 1, "gpu0/gi0/ci0/access": 4}},
}
for _, tc := range testCases {
contents := strings.NewReader(strings.Join(tc.lines, "\n"))
d := processMinorsFile(contents)
require.Equalf(t, tc.expected, d, "testCase: %v", tc)
}
}
func TestProcessMigMinorsLine(t *testing.T) {
testCases := []struct {
line string
cap MigCap
minor MigMinor
err bool
}{
{"config 1", "config", 1, false},
{"monitor 2", "monitor", 2, false},
{"gpu0/gi0/access 3", "gpu0/gi0/access", 3, false},
{"gpu0/gi0/ci0/access 4", "gpu0/gi0/ci0/access", 4, false},
{"notconfig 99", "", 0, true},
{"config notanint", "", 0, true},
{"", "", 0, true},
}
for _, tc := range testCases {
cap, minor, err := processMigMinorsLine(tc.line)
require.Equalf(t, tc.cap, cap, "testCase: %v", tc)
require.Equalf(t, tc.minor, minor, "testCase: %v", tc)
if tc.err {
require.Errorf(t, err, "testCase: %v", tc)
} else {
require.NoErrorf(t, err, "testCase: %v", tc)
}
}
}
func TestMigCapProcPaths(t *testing.T) {
testCases := []struct {
input string
expected string
}{
{"config", "/proc/driver/nvidia/capabilities/mig/config"},
{"monitor", "/proc/driver/nvidia/capabilities/mig/monitor"},
{"gpu0/gi0/access", "/proc/driver/nvidia/capabilities/gpu0/mig/gi0/access"},
{"gpu0/gi0/ci0/access", "/proc/driver/nvidia/capabilities/gpu0/mig/gi0/ci0/access"},
}
for _, tc := range testCases {
m := MigCap(tc.input)
require.Equal(t, tc.expected, m.ProcPath())
}
}
func TestMigMinorDevicePath(t *testing.T) {
m := MigMinor(0)
require.Equal(t, "/dev/nvidia-caps/nvidia-cap0", m.DevicePath())
}

79
internal/nvml/consts.go Normal file
View File

@@ -0,0 +1,79 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nvml
import (
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
const (
SUCCESS = nvml.SUCCESS
ERROR_UNINITIALIZED = nvml.ERROR_UNINITIALIZED
ERROR_INVALID_ARGUMENT = nvml.ERROR_INVALID_ARGUMENT
ERROR_NOT_SUPPORTED = nvml.ERROR_NOT_SUPPORTED
ERROR_NO_PERMISSION = nvml.ERROR_NO_PERMISSION
ERROR_ALREADY_INITIALIZED = nvml.ERROR_ALREADY_INITIALIZED
ERROR_NOT_FOUND = nvml.ERROR_NOT_FOUND
ERROR_INSUFFICIENT_SIZE = nvml.ERROR_INSUFFICIENT_SIZE
ERROR_INSUFFICIENT_POWER = nvml.ERROR_INSUFFICIENT_POWER
ERROR_DRIVER_NOT_LOADED = nvml.ERROR_DRIVER_NOT_LOADED
ERROR_TIMEOUT = nvml.ERROR_TIMEOUT
ERROR_IRQ_ISSUE = nvml.ERROR_IRQ_ISSUE
ERROR_LIBRARY_NOT_FOUND = nvml.ERROR_LIBRARY_NOT_FOUND
ERROR_FUNCTION_NOT_FOUND = nvml.ERROR_FUNCTION_NOT_FOUND
ERROR_CORRUPTED_INFOROM = nvml.ERROR_CORRUPTED_INFOROM
ERROR_GPU_IS_LOST = nvml.ERROR_GPU_IS_LOST
ERROR_RESET_REQUIRED = nvml.ERROR_RESET_REQUIRED
ERROR_OPERATING_SYSTEM = nvml.ERROR_OPERATING_SYSTEM
ERROR_LIB_RM_VERSION_MISMATCH = nvml.ERROR_LIB_RM_VERSION_MISMATCH
ERROR_IN_USE = nvml.ERROR_IN_USE
ERROR_MEMORY = nvml.ERROR_MEMORY
ERROR_NO_DATA = nvml.ERROR_NO_DATA
ERROR_VGPU_ECC_NOT_SUPPORTED = nvml.ERROR_VGPU_ECC_NOT_SUPPORTED
ERROR_INSUFFICIENT_RESOURCES = nvml.ERROR_INSUFFICIENT_RESOURCES
ERROR_UNKNOWN = nvml.ERROR_UNKNOWN
)
const (
DEVICE_MIG_ENABLE = nvml.DEVICE_MIG_ENABLE
DEVICE_MIG_DISABLE = nvml.DEVICE_MIG_DISABLE
)
const (
GPU_INSTANCE_PROFILE_1_SLICE = nvml.GPU_INSTANCE_PROFILE_1_SLICE
GPU_INSTANCE_PROFILE_2_SLICE = nvml.GPU_INSTANCE_PROFILE_2_SLICE
GPU_INSTANCE_PROFILE_3_SLICE = nvml.GPU_INSTANCE_PROFILE_3_SLICE
GPU_INSTANCE_PROFILE_4_SLICE = nvml.GPU_INSTANCE_PROFILE_4_SLICE
GPU_INSTANCE_PROFILE_7_SLICE = nvml.GPU_INSTANCE_PROFILE_7_SLICE
GPU_INSTANCE_PROFILE_8_SLICE = nvml.GPU_INSTANCE_PROFILE_8_SLICE
GPU_INSTANCE_PROFILE_COUNT = nvml.GPU_INSTANCE_PROFILE_COUNT
)
const (
COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT
)
const (
COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED
COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT
)

594
internal/nvml/mock.go Normal file
View File

@@ -0,0 +1,594 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nvml
import "fmt"
type MockServer struct {
Devices []Device
}
type MockLunaServer struct {
MockServer
}
type MockA100Device struct {
Index int
MinorNumber int
MigMode int
GpuInstances map[*MockA100GpuInstance]struct{}
GpuInstanceCounter uint32
}
type MockA100GpuInstance struct {
Info GpuInstanceInfo
ComputeInstances map[*MockA100ComputeInstance]struct{}
ComputeInstanceCounter uint32
}
type MockA100ComputeInstance struct {
Info ComputeInstanceInfo
}
var _ Interface = (*MockLunaServer)(nil)
var _ Device = (*MockA100Device)(nil)
var _ GpuInstance = (*MockA100GpuInstance)(nil)
var _ ComputeInstance = (*MockA100ComputeInstance)(nil)
var MockA100MIGProfiles = struct {
GpuInstanceProfiles map[int]GpuInstanceProfileInfo
ComputeInstanceProfiles map[int]map[int]ComputeInstanceProfileInfo
}{
GpuInstanceProfiles: map[int]GpuInstanceProfileInfo{
GPU_INSTANCE_PROFILE_1_SLICE: {
Id: GPU_INSTANCE_PROFILE_1_SLICE,
IsP2pSupported: 0,
SliceCount: 1,
InstanceCount: 7,
MultiprocessorCount: 1,
CopyEngineCount: 1,
DecoderCount: 0,
EncoderCount: 0,
JpegCount: 0,
OfaCount: 0,
MemorySizeMB: 5120,
},
GPU_INSTANCE_PROFILE_2_SLICE: {
Id: GPU_INSTANCE_PROFILE_2_SLICE,
IsP2pSupported: 0,
SliceCount: 2,
InstanceCount: 3,
MultiprocessorCount: 2,
CopyEngineCount: 2,
DecoderCount: 1,
EncoderCount: 1,
JpegCount: 0,
OfaCount: 0,
MemorySizeMB: 10240,
},
GPU_INSTANCE_PROFILE_3_SLICE: {
Id: GPU_INSTANCE_PROFILE_3_SLICE,
IsP2pSupported: 0,
SliceCount: 3,
InstanceCount: 2,
MultiprocessorCount: 3,
CopyEngineCount: 4,
DecoderCount: 2,
EncoderCount: 2,
JpegCount: 0,
OfaCount: 0,
MemorySizeMB: 20480,
},
GPU_INSTANCE_PROFILE_4_SLICE: {
Id: GPU_INSTANCE_PROFILE_4_SLICE,
IsP2pSupported: 0,
SliceCount: 4,
InstanceCount: 1,
MultiprocessorCount: 4,
CopyEngineCount: 4,
DecoderCount: 2,
EncoderCount: 2,
JpegCount: 0,
OfaCount: 0,
MemorySizeMB: 20480,
},
GPU_INSTANCE_PROFILE_7_SLICE: {
Id: GPU_INSTANCE_PROFILE_7_SLICE,
IsP2pSupported: 0,
SliceCount: 7,
InstanceCount: 1,
MultiprocessorCount: 7,
CopyEngineCount: 8,
DecoderCount: 5,
EncoderCount: 5,
JpegCount: 1,
OfaCount: 1,
MemorySizeMB: 40960,
},
},
ComputeInstanceProfiles: map[int]map[int]ComputeInstanceProfileInfo{
GPU_INSTANCE_PROFILE_1_SLICE: {
COMPUTE_INSTANCE_PROFILE_1_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_1_SLICE,
SliceCount: 1,
InstanceCount: 1,
MultiprocessorCount: 1,
SharedCopyEngineCount: 1,
SharedDecoderCount: 0,
SharedEncoderCount: 0,
SharedJpegCount: 0,
SharedOfaCount: 0,
},
},
GPU_INSTANCE_PROFILE_2_SLICE: {
COMPUTE_INSTANCE_PROFILE_1_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_1_SLICE,
SliceCount: 1,
InstanceCount: 2,
MultiprocessorCount: 1,
SharedCopyEngineCount: 2,
SharedDecoderCount: 1,
SharedEncoderCount: 1,
SharedJpegCount: 0,
SharedOfaCount: 0,
},
COMPUTE_INSTANCE_PROFILE_2_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_2_SLICE,
SliceCount: 2,
InstanceCount: 1,
MultiprocessorCount: 2,
SharedCopyEngineCount: 2,
SharedDecoderCount: 1,
SharedEncoderCount: 1,
SharedJpegCount: 0,
SharedOfaCount: 0,
},
},
GPU_INSTANCE_PROFILE_3_SLICE: {
COMPUTE_INSTANCE_PROFILE_1_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_1_SLICE,
SliceCount: 1,
InstanceCount: 3,
MultiprocessorCount: 1,
SharedCopyEngineCount: 4,
SharedDecoderCount: 2,
SharedEncoderCount: 1,
SharedJpegCount: 0,
SharedOfaCount: 0,
},
COMPUTE_INSTANCE_PROFILE_2_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_2_SLICE,
SliceCount: 2,
InstanceCount: 1,
MultiprocessorCount: 2,
SharedCopyEngineCount: 4,
SharedDecoderCount: 2,
SharedEncoderCount: 2,
SharedJpegCount: 0,
SharedOfaCount: 0,
},
COMPUTE_INSTANCE_PROFILE_3_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_3_SLICE,
SliceCount: 3,
InstanceCount: 1,
MultiprocessorCount: 3,
SharedCopyEngineCount: 4,
SharedDecoderCount: 2,
SharedEncoderCount: 0,
SharedJpegCount: 0,
SharedOfaCount: 0,
},
},
GPU_INSTANCE_PROFILE_4_SLICE: {
COMPUTE_INSTANCE_PROFILE_1_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_1_SLICE,
SliceCount: 1,
InstanceCount: 4,
MultiprocessorCount: 1,
SharedCopyEngineCount: 4,
SharedDecoderCount: 2,
SharedEncoderCount: 2,
SharedJpegCount: 0,
SharedOfaCount: 0,
},
COMPUTE_INSTANCE_PROFILE_2_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_2_SLICE,
SliceCount: 2,
InstanceCount: 2,
MultiprocessorCount: 2,
SharedCopyEngineCount: 4,
SharedDecoderCount: 2,
SharedEncoderCount: 2,
SharedJpegCount: 0,
SharedOfaCount: 0,
},
COMPUTE_INSTANCE_PROFILE_4_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_4_SLICE,
SliceCount: 4,
InstanceCount: 1,
MultiprocessorCount: 4,
SharedCopyEngineCount: 4,
SharedDecoderCount: 2,
SharedEncoderCount: 2,
SharedJpegCount: 0,
SharedOfaCount: 0,
},
},
GPU_INSTANCE_PROFILE_7_SLICE: {
COMPUTE_INSTANCE_PROFILE_1_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_1_SLICE,
SliceCount: 1,
InstanceCount: 7,
MultiprocessorCount: 1,
SharedCopyEngineCount: 8,
SharedDecoderCount: 5,
SharedEncoderCount: 5,
SharedJpegCount: 1,
SharedOfaCount: 1,
},
COMPUTE_INSTANCE_PROFILE_2_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_2_SLICE,
SliceCount: 2,
InstanceCount: 3,
MultiprocessorCount: 2,
SharedCopyEngineCount: 8,
SharedDecoderCount: 5,
SharedEncoderCount: 5,
SharedJpegCount: 1,
SharedOfaCount: 1,
},
COMPUTE_INSTANCE_PROFILE_3_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_3_SLICE,
SliceCount: 3,
InstanceCount: 2,
MultiprocessorCount: 3,
SharedCopyEngineCount: 8,
SharedDecoderCount: 5,
SharedEncoderCount: 5,
SharedJpegCount: 1,
SharedOfaCount: 1,
},
COMPUTE_INSTANCE_PROFILE_4_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_4_SLICE,
SliceCount: 4,
InstanceCount: 1,
MultiprocessorCount: 4,
SharedCopyEngineCount: 8,
SharedDecoderCount: 5,
SharedEncoderCount: 5,
SharedJpegCount: 1,
SharedOfaCount: 1,
},
COMPUTE_INSTANCE_PROFILE_7_SLICE: {
Id: COMPUTE_INSTANCE_PROFILE_7_SLICE,
SliceCount: 7,
InstanceCount: 1,
MultiprocessorCount: 7,
SharedCopyEngineCount: 8,
SharedDecoderCount: 5,
SharedEncoderCount: 5,
SharedJpegCount: 1,
SharedOfaCount: 1,
},
},
},
}
func NewMockNVMLServer(devices ...Device) Interface {
return &MockServer{
Devices: devices,
}
}
func NewMockNVMLOnLunaServer() Interface {
devices := []Device{
NewMockA100Device(0),
NewMockA100Device(1),
NewMockA100Device(2),
NewMockA100Device(3),
NewMockA100Device(4),
NewMockA100Device(5),
NewMockA100Device(6),
NewMockA100Device(7),
}
return NewMockNVMLServer(devices...)
}
func NewMockA100Device(index int) Device {
return &MockA100Device{
Index: index,
GpuInstances: make(map[*MockA100GpuInstance]struct{}),
GpuInstanceCounter: 0,
}
}
func NewMockA100GpuInstance(info GpuInstanceInfo) GpuInstance {
return &MockA100GpuInstance{
Info: info,
ComputeInstances: make(map[*MockA100ComputeInstance]struct{}),
ComputeInstanceCounter: 0,
}
}
func NewMockA100ComputeInstance(info ComputeInstanceInfo) ComputeInstance {
return &MockA100ComputeInstance{
Info: info,
}
}
func (n *MockServer) Init() Return {
return MockReturn(SUCCESS)
}
func (n *MockServer) Shutdown() Return {
return MockReturn(SUCCESS)
}
func (n *MockServer) DeviceGetCount() (int, Return) {
return len(n.Devices), MockReturn(SUCCESS)
}
func (n *MockServer) DeviceGetHandleByIndex(index int) (Device, Return) {
if index < 0 || index >= len(n.Devices) {
return nil, MockReturn(ERROR_INVALID_ARGUMENT)
}
return n.Devices[index], MockReturn(SUCCESS)
}
func (n *MockServer) SystemGetDriverVersion() (string, Return) {
return "999.99", MockReturn(SUCCESS)
}
func (d *MockA100Device) GetIndex() (int, Return) {
return d.Index, MockReturn(SUCCESS)
}
func (d *MockA100Device) GetPciInfo() (PciInfo, Return) {
var busID [32]int8
for i, b := range []byte("0000FFFF:FF:FF.F") {
busID[i] = int8(b)
}
p := PciInfo{
BusId: busID,
PciDeviceId: 0x20B010DE,
}
return p, MockReturn(SUCCESS)
}
func (d *MockA100Device) GetUUID() (string, Return) {
return fmt.Sprintf("GPU-%d", d.Index), MockReturn(SUCCESS)
}
func (d *MockA100Device) GetMinorNumber() (int, Return) {
return d.MinorNumber, MockReturn(SUCCESS)
}
func (d *MockA100Device) SetMigMode(mode int) (Return, Return) {
d.MigMode = mode
return MockReturn(SUCCESS), MockReturn(SUCCESS)
}
func (d *MockA100Device) GetMigMode() (int, int, Return) {
return d.MigMode, d.MigMode, MockReturn(SUCCESS)
}
func (d *MockA100Device) GetGpuInstanceProfileInfo(giProfileId int) (GpuInstanceProfileInfo, Return) {
if giProfileId < 0 || giProfileId >= GPU_INSTANCE_PROFILE_COUNT {
return GpuInstanceProfileInfo{}, MockReturn(ERROR_INVALID_ARGUMENT)
}
if _, exists := MockA100MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists {
return GpuInstanceProfileInfo{}, MockReturn(ERROR_NOT_SUPPORTED)
}
return MockA100MIGProfiles.GpuInstanceProfiles[giProfileId], MockReturn(SUCCESS)
}
func (d *MockA100Device) CreateGpuInstance(info *GpuInstanceProfileInfo) (GpuInstance, Return) {
giInfo := GpuInstanceInfo{
Device: d,
Id: d.GpuInstanceCounter,
ProfileId: info.Id,
}
d.GpuInstanceCounter++
gi := NewMockA100GpuInstance(giInfo)
d.GpuInstances[gi.(*MockA100GpuInstance)] = struct{}{}
return gi, MockReturn(SUCCESS)
}
func (d *MockA100Device) GetGpuInstances(info *GpuInstanceProfileInfo) ([]GpuInstance, Return) {
var gis []GpuInstance
for gi := range d.GpuInstances {
if gi.Info.ProfileId == info.Id {
gis = append(gis, gi)
}
}
return gis, MockReturn(SUCCESS)
}
func (d *MockA100Device) GetMaxMigDeviceCount() (int, Return) {
var count int
for gi := range d.GpuInstances {
count = count + int(gi.ComputeInstanceCounter)
}
return count, MockReturn(SUCCESS)
}
func (d *MockA100Device) GetMigDeviceHandleByIndex(Index int) (Device, Return) {
var count int
for gi := range d.GpuInstances {
if count+int(gi.ComputeInstanceCounter) < Index {
count = count + int(gi.ComputeInstanceCounter)
continue
}
for ci := range gi.ComputeInstances {
if count < Index {
count++
continue
}
return ci, MockReturn(SUCCESS)
}
}
return nil, MockReturn(ERROR_NOT_FOUND)
}
func (d *MockA100Device) GetDeviceHandleFromMigDeviceHandle() (Device, Return) {
return nil, MockReturn(ERROR_NOT_SUPPORTED)
}
func (d *MockA100Device) IsMigDeviceHandle() (bool, Return) {
return false, MockReturn(SUCCESS)
}
func (d *MockA100Device) GetComputeInstanceId() (int, Return) {
panic("Not implemented: GetComputeInstanceId")
}
func (d *MockA100Device) GetGPUInstanceId() (int, Return) {
panic("Not implemented: GetGPUInstanceId")
}
func (gi *MockA100GpuInstance) GetInfo() (GpuInstanceInfo, Return) {
return gi.Info, MockReturn(SUCCESS)
}
func (gi *MockA100GpuInstance) GetComputeInstanceProfileInfo(ciProfileId int, ciEngProfileId int) (ComputeInstanceProfileInfo, Return) {
if ciProfileId < 0 || ciProfileId >= COMPUTE_INSTANCE_PROFILE_COUNT {
return ComputeInstanceProfileInfo{}, MockReturn(ERROR_INVALID_ARGUMENT)
}
if ciEngProfileId != COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED {
return ComputeInstanceProfileInfo{}, MockReturn(ERROR_NOT_SUPPORTED)
}
giProfileId := int(gi.Info.ProfileId)
if _, exists := MockA100MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists {
return ComputeInstanceProfileInfo{}, MockReturn(ERROR_NOT_SUPPORTED)
}
if _, exists := MockA100MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists {
return ComputeInstanceProfileInfo{}, MockReturn(ERROR_NOT_SUPPORTED)
}
return MockA100MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], MockReturn(SUCCESS)
}
func (gi *MockA100GpuInstance) CreateComputeInstance(info *ComputeInstanceProfileInfo) (ComputeInstance, Return) {
ciInfo := ComputeInstanceInfo{
Device: gi.Info.Device,
GpuInstance: gi,
Id: gi.ComputeInstanceCounter,
ProfileId: info.Id,
}
gi.ComputeInstanceCounter++
ci := NewMockA100ComputeInstance(ciInfo)
gi.ComputeInstances[ci.(*MockA100ComputeInstance)] = struct{}{}
return ci, MockReturn(SUCCESS)
}
func (gi *MockA100GpuInstance) GetComputeInstances(info *ComputeInstanceProfileInfo) ([]ComputeInstance, Return) {
var cis []ComputeInstance
for ci := range gi.ComputeInstances {
if ci.Info.ProfileId == info.Id {
cis = append(cis, ci)
}
}
return cis, MockReturn(SUCCESS)
}
func (gi *MockA100GpuInstance) Destroy() Return {
delete(gi.Info.Device.(*MockA100Device).GpuInstances, gi)
return MockReturn(SUCCESS)
}
func (ci *MockA100ComputeInstance) GetInfo() (ComputeInstanceInfo, Return) {
return ci.Info, MockReturn(SUCCESS)
}
func (ci *MockA100ComputeInstance) Destroy() Return {
delete(ci.Info.GpuInstance.(*MockA100GpuInstance).ComputeInstances, ci)
return MockReturn(SUCCESS)
}
// Since a compute instance can be used as a MIG device handle, it must also
// implement the Device interface
var _ Device = (*MockA100ComputeInstance)(nil)
func (c *MockA100ComputeInstance) GetIndex() (int, Return) {
return int(c.Info.Id), MockReturn(SUCCESS)
}
func (c *MockA100ComputeInstance) GetPciInfo() (PciInfo, Return) {
// TODO: How does this behave on an actual MIG system?
panic("Not implemented: GetPciInfo")
}
func (c *MockA100ComputeInstance) GetUUID() (string, Return) {
return fmt.Sprintf("MIG-%d", c.Info.Id), MockReturn(SUCCESS)
}
func (c *MockA100ComputeInstance) GetMinorNumber() (int, Return) {
// TODO: This depends on the content of the mig-minors file and the (gpu, gi, ci) tuple
panic("Not implemented: GetMinorNumber")
}
func (c *MockA100ComputeInstance) SetMigMode(Mode int) (Return, Return) {
panic("Not implemented: SetMigMode")
}
func (c *MockA100ComputeInstance) GetMigMode() (int, int, Return) {
panic("Not implemented: GetMigMode")
}
func (c *MockA100ComputeInstance) GetGpuInstanceProfileInfo(Profile int) (GpuInstanceProfileInfo, Return) {
panic("Not implemented: GetGpuInstanceProfileInfo")
}
func (c *MockA100ComputeInstance) CreateGpuInstance(Info *GpuInstanceProfileInfo) (GpuInstance, Return) {
panic("Not implemented: CreateGpuInstance")
}
func (c *MockA100ComputeInstance) GetGpuInstances(Info *GpuInstanceProfileInfo) ([]GpuInstance, Return) {
panic("Not implemented: GetGpuInstances")
}
func (c *MockA100ComputeInstance) GetMaxMigDeviceCount() (int, Return) {
panic("Not implemented: GetMaxMigDeviceCount")
}
func (c *MockA100ComputeInstance) GetMigDeviceHandleByIndex(Index int) (Device, Return) {
panic("Not implemented: GetMigDeviceHandleByIndex")
}
func (c *MockA100ComputeInstance) GetDeviceHandleFromMigDeviceHandle() (Device, Return) {
return c.Info.Device, MockReturn(SUCCESS)
}
func (c *MockA100ComputeInstance) IsMigDeviceHandle() (bool, Return) {
return true, MockReturn(SUCCESS)
}
func (c *MockA100ComputeInstance) GetComputeInstanceId() (int, Return) {
return int(c.Info.Id), MockReturn(SUCCESS)
}
func (c *MockA100ComputeInstance) GetGPUInstanceId() (int, Return) {
info, r := c.Info.GpuInstance.GetInfo()
if r.Value() != SUCCESS {
return 0, MockReturn(r.Value())
}
return int(info.Id), MockReturn(SUCCESS)
}

188
internal/nvml/nvml.go Normal file
View File

@@ -0,0 +1,188 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nvml
import (
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
type nvmlLib struct{}
type nvmlDevice nvml.Device
type nvmlGpuInstance nvml.GpuInstance
type nvmlComputeInstance nvml.ComputeInstance
var _ Interface = (*nvmlLib)(nil)
var _ Device = (*nvmlDevice)(nil)
var _ GpuInstance = (*nvmlGpuInstance)(nil)
var _ ComputeInstance = (*nvmlComputeInstance)(nil)
func New() Interface {
return &nvmlLib{}
}
func (n *nvmlLib) Init() Return {
return nvmlReturn(nvml.Init())
}
func (n *nvmlLib) Shutdown() Return {
return nvmlReturn(nvml.Shutdown())
}
func (n *nvmlLib) DeviceGetCount() (int, Return) {
c, r := nvml.DeviceGetCount()
return c, nvmlReturn(r)
}
func (n *nvmlLib) DeviceGetHandleByIndex(index int) (Device, Return) {
d, r := nvml.DeviceGetHandleByIndex(index)
return nvmlDevice(d), nvmlReturn(r)
}
func (n *nvmlLib) SystemGetDriverVersion() (string, Return) {
v, r := nvml.SystemGetDriverVersion()
return v, nvmlReturn(r)
}
func (d nvmlDevice) GetIndex() (int, Return) {
i, r := nvml.Device(d).GetIndex()
return i, nvmlReturn(r)
}
func (d nvmlDevice) GetPciInfo() (PciInfo, Return) {
p, r := nvml.Device(d).GetPciInfo()
return PciInfo(p), nvmlReturn(r)
}
func (d nvmlDevice) GetUUID() (string, Return) {
u, r := nvml.Device(d).GetUUID()
return u, nvmlReturn(r)
}
func (d nvmlDevice) GetMinorNumber() (int, Return) {
m, r := nvml.Device(d).GetMinorNumber()
return m, nvmlReturn(r)
}
func (d nvmlDevice) IsMigDeviceHandle() (bool, Return) {
b, r := nvml.Device(d).IsMigDeviceHandle()
return b, nvmlReturn(r)
}
func (d nvmlDevice) GetDeviceHandleFromMigDeviceHandle() (Device, Return) {
p, r := nvml.Device(d).GetDeviceHandleFromMigDeviceHandle()
return nvmlDevice(p), nvmlReturn(r)
}
func (d nvmlDevice) GetGPUInstanceId() (int, Return) {
gi, r := nvml.Device(d).GetGpuInstanceId()
return gi, nvmlReturn(r)
}
func (d nvmlDevice) GetComputeInstanceId() (int, Return) {
ci, r := nvml.Device(d).GetComputeInstanceId()
return ci, nvmlReturn(r)
}
func (d nvmlDevice) SetMigMode(mode int) (Return, Return) {
r1, r2 := nvml.Device(d).SetMigMode(mode)
return nvmlReturn(r1), nvmlReturn(r2)
}
func (d nvmlDevice) GetMigMode() (int, int, Return) {
s1, s2, r := nvml.Device(d).GetMigMode()
return s1, s2, nvmlReturn(r)
}
func (d nvmlDevice) GetGpuInstanceProfileInfo(profile int) (GpuInstanceProfileInfo, Return) {
p, r := nvml.Device(d).GetGpuInstanceProfileInfo(profile)
return GpuInstanceProfileInfo(p), nvmlReturn(r)
}
func (d nvmlDevice) CreateGpuInstance(info *GpuInstanceProfileInfo) (GpuInstance, Return) {
gi, r := nvml.Device(d).CreateGpuInstance((*nvml.GpuInstanceProfileInfo)(info))
return nvmlGpuInstance(gi), nvmlReturn(r)
}
func (d nvmlDevice) GetGpuInstances(info *GpuInstanceProfileInfo) ([]GpuInstance, Return) {
nvmlGis, r := nvml.Device(d).GetGpuInstances((*nvml.GpuInstanceProfileInfo)(info))
var gis []GpuInstance
for _, gi := range nvmlGis {
gis = append(gis, nvmlGpuInstance(gi))
}
return gis, nvmlReturn(r)
}
func (d nvmlDevice) GetMaxMigDeviceCount() (int, Return) {
m, r := nvml.Device(d).GetMaxMigDeviceCount()
return m, nvmlReturn(r)
}
func (d nvmlDevice) GetMigDeviceHandleByIndex(Index int) (Device, Return) {
h, r := nvml.Device(d).GetMigDeviceHandleByIndex(Index)
return nvmlDevice(h), nvmlReturn(r)
}
func (gi nvmlGpuInstance) GetInfo() (GpuInstanceInfo, Return) {
i, r := nvml.GpuInstance(gi).GetInfo()
info := GpuInstanceInfo{
Device: nvmlDevice(i.Device),
Id: i.Id,
ProfileId: i.ProfileId,
Placement: i.Placement,
}
return info, nvmlReturn(r)
}
func (gi nvmlGpuInstance) GetComputeInstanceProfileInfo(profile int, engProfile int) (ComputeInstanceProfileInfo, Return) {
p, r := nvml.GpuInstance(gi).GetComputeInstanceProfileInfo(profile, engProfile)
return ComputeInstanceProfileInfo(p), nvmlReturn(r)
}
func (gi nvmlGpuInstance) CreateComputeInstance(info *ComputeInstanceProfileInfo) (ComputeInstance, Return) {
ci, r := nvml.GpuInstance(gi).CreateComputeInstance((*nvml.ComputeInstanceProfileInfo)(info))
return nvmlComputeInstance(ci), nvmlReturn(r)
}
func (gi nvmlGpuInstance) GetComputeInstances(info *ComputeInstanceProfileInfo) ([]ComputeInstance, Return) {
nvmlCis, r := nvml.GpuInstance(gi).GetComputeInstances((*nvml.ComputeInstanceProfileInfo)(info))
var cis []ComputeInstance
for _, ci := range nvmlCis {
cis = append(cis, nvmlComputeInstance(ci))
}
return cis, nvmlReturn(r)
}
func (gi nvmlGpuInstance) Destroy() Return {
r := nvml.GpuInstance(gi).Destroy()
return nvmlReturn(r)
}
func (ci nvmlComputeInstance) GetInfo() (ComputeInstanceInfo, Return) {
i, r := nvml.ComputeInstance(ci).GetInfo()
info := ComputeInstanceInfo{
Device: nvmlDevice(i.Device),
GpuInstance: nvmlGpuInstance(i.GpuInstance),
Id: i.Id,
ProfileId: i.ProfileId,
}
return info, nvmlReturn(r)
}
func (ci nvmlComputeInstance) Destroy() Return {
r := nvml.ComputeInstance(ci).Destroy()
return nvmlReturn(r)
}

110
internal/nvml/return.go Normal file
View File

@@ -0,0 +1,110 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nvml
import (
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
type Return interface {
Value() nvml.Return
String() string
Error() string
}
type nvmlReturn nvml.Return
type MockReturn nvml.Return
var _ Return = (*nvmlReturn)(nil)
var _ Return = (*MockReturn)(nil)
func (r nvmlReturn) Value() nvml.Return {
return nvml.Return(r)
}
func (r nvmlReturn) String() string {
return r.Error()
}
func (r nvmlReturn) Error() string {
return nvml.ErrorString(nvml.Return(r))
}
func (r MockReturn) Value() nvml.Return {
return nvml.Return(r)
}
func (r MockReturn) String() string {
return r.Error()
}
func (r MockReturn) Error() string {
switch nvml.Return(r) {
case SUCCESS:
return "SUCCESS"
case ERROR_UNINITIALIZED:
return "ERROR_UNINITIALIZED"
case ERROR_INVALID_ARGUMENT:
return "ERROR_INVALID_ARGUMENT"
case ERROR_NOT_SUPPORTED:
return "ERROR_NOT_SUPPORTED"
case ERROR_NO_PERMISSION:
return "ERROR_NO_PERMISSION"
case ERROR_ALREADY_INITIALIZED:
return "ERROR_ALREADY_INITIALIZED"
case ERROR_NOT_FOUND:
return "ERROR_NOT_FOUND"
case ERROR_INSUFFICIENT_SIZE:
return "ERROR_INSUFFICIENT_SIZE"
case ERROR_INSUFFICIENT_POWER:
return "ERROR_INSUFFICIENT_POWER"
case ERROR_DRIVER_NOT_LOADED:
return "ERROR_DRIVER_NOT_LOADED"
case ERROR_TIMEOUT:
return "ERROR_TIMEOUT"
case ERROR_IRQ_ISSUE:
return "ERROR_IRQ_ISSUE"
case ERROR_LIBRARY_NOT_FOUND:
return "ERROR_LIBRARY_NOT_FOUND"
case ERROR_FUNCTION_NOT_FOUND:
return "ERROR_FUNCTION_NOT_FOUND"
case ERROR_CORRUPTED_INFOROM:
return "ERROR_CORRUPTED_INFOROM"
case ERROR_GPU_IS_LOST:
return "ERROR_GPU_IS_LOST"
case ERROR_RESET_REQUIRED:
return "ERROR_RESET_REQUIRED"
case ERROR_OPERATING_SYSTEM:
return "ERROR_OPERATING_SYSTEM"
case ERROR_LIB_RM_VERSION_MISMATCH:
return "ERROR_LIB_RM_VERSION_MISMATCH"
case ERROR_IN_USE:
return "ERROR_IN_USE"
case ERROR_MEMORY:
return "ERROR_MEMORY"
case ERROR_NO_DATA:
return "ERROR_NO_DATA"
case ERROR_VGPU_ECC_NOT_SUPPORTED:
return "ERROR_VGPU_ECC_NOT_SUPPORTED"
case ERROR_INSUFFICIENT_RESOURCES:
return "ERROR_INSUFFICIENT_RESOURCES"
case ERROR_UNKNOWN:
return "ERROR_UNKNOWN"
default:
return "Unknown return value"
}
}

78
internal/nvml/types.go Normal file
View File

@@ -0,0 +1,78 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nvml
import (
"github.com/NVIDIA/go-nvml/pkg/nvml"
)
type Interface interface {
Init() Return
Shutdown() Return
DeviceGetCount() (int, Return)
DeviceGetHandleByIndex(Index int) (Device, Return)
SystemGetDriverVersion() (string, Return)
}
type Device interface {
GetIndex() (int, Return)
GetPciInfo() (PciInfo, Return)
GetUUID() (string, Return)
GetMinorNumber() (int, Return)
IsMigDeviceHandle() (bool, Return)
GetDeviceHandleFromMigDeviceHandle() (Device, Return)
SetMigMode(Mode int) (Return, Return)
GetMigMode() (int, int, Return)
GetGpuInstanceProfileInfo(Profile int) (GpuInstanceProfileInfo, Return)
CreateGpuInstance(Info *GpuInstanceProfileInfo) (GpuInstance, Return)
GetGpuInstances(Info *GpuInstanceProfileInfo) ([]GpuInstance, Return)
GetMaxMigDeviceCount() (int, Return)
GetMigDeviceHandleByIndex(Index int) (Device, Return)
GetGPUInstanceId() (int, Return)
GetComputeInstanceId() (int, Return)
}
type GpuInstance interface {
GetInfo() (GpuInstanceInfo, Return)
GetComputeInstanceProfileInfo(Profile int, EngProfile int) (ComputeInstanceProfileInfo, Return)
CreateComputeInstance(Info *ComputeInstanceProfileInfo) (ComputeInstance, Return)
GetComputeInstances(Info *ComputeInstanceProfileInfo) ([]ComputeInstance, Return)
Destroy() Return
}
type ComputeInstance interface {
GetInfo() (ComputeInstanceInfo, Return)
Destroy() Return
}
type GpuInstanceInfo struct {
Device Device
Id uint32
ProfileId uint32
Placement nvml.GpuInstancePlacement
}
type ComputeInstanceInfo struct {
Device Device
GpuInstance GpuInstance
Id uint32
ProfileId uint32
}
type PciInfo nvml.PciInfo
type GpuInstanceProfileInfo nvml.GpuInstanceProfileInfo
type ComputeInstanceProfileInfo nvml.ComputeInstanceProfileInfo

116
internal/proc/devices.go Normal file
View File

@@ -0,0 +1,116 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package proc
import (
"bufio"
"fmt"
"io"
"log"
"os"
"strings"
)
const (
procDevicesPath = "/proc/devices"
nvidiaDevicePrefix = "nvidia"
)
// Device represents a device as specified under /proc/devices
type Device struct {
Name string
Major int
}
// NvidiaDevices represents the set of nvidia owned devices under /proc/devices
type NvidiaDevices interface {
Exists(name string) bool
Get(name string) (Device, bool)
}
type nvidiaDevices map[string]Device
var _ NvidiaDevices = nvidiaDevices(nil)
// Exists checks if a Device with a given name exists or not
func (d nvidiaDevices) Exists(name string) bool {
_, exists := d[name]
return exists
}
// Get a Device from NvidiaDevices
func (d nvidiaDevices) Get(name string) (Device, bool) {
device, exists := d[name]
return device, exists
}
func (d nvidiaDevices) add(devices ...Device) {
for _, device := range devices {
d[device.Name] = device
}
}
// NewMockNvidiaDevices returns NvidiaDevices populated from the devices passed in
func NewMockNvidiaDevices(devices ...Device) NvidiaDevices {
nvds := make(nvidiaDevices)
nvds.add(devices...)
return nvds
}
// GetNvidiaDevices returns the set of NvidiaDevices on the machine
func GetNvidiaDevices() (NvidiaDevices, error) {
devicesFile, err := os.Open(procDevicesPath)
if os.IsNotExist(err) {
return nil, nil
}
if err != nil {
return nil, fmt.Errorf("error opening devices file: %v", err)
}
defer devicesFile.Close()
return processDeviceFile(devicesFile), nil
}
func processDeviceFile(devicesFile io.Reader) NvidiaDevices {
nvidiaDevices := make(nvidiaDevices)
scanner := bufio.NewScanner(devicesFile)
for scanner.Scan() {
device, major, err := processProcDeviceLine(scanner.Text())
if err != nil {
log.Printf("Skipping line in devices file: %v", err)
continue
}
if strings.HasPrefix(device, nvidiaDevicePrefix) {
nvidiaDevices.add(Device{device, major})
}
}
return nvidiaDevices
}
func processProcDeviceLine(line string) (string, int, error) {
trimmed := strings.TrimSpace(line)
var name string
var major int
n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name)
if n == 2 {
return name, major, nil
}
return "", 0, fmt.Errorf("unparsable line: %v", line)
}

View File

@@ -0,0 +1,92 @@
/*
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/
package proc
import (
"strings"
"testing"
"github.com/stretchr/testify/require"
)
func TestNvidiaDevices(t *testing.T) {
devices := []Device{
{"nvidia-frontend", 195},
{"nvidia-nvlink", 234},
{"nvidia-caps", 235},
{"nvidia-uvm", 510},
{"nvidia-nvswitch", 511},
}
nvidiaDevices := NewMockNvidiaDevices(devices...)
for _, d := range devices {
device, exists := nvidiaDevices.Get(d.Name)
require.True(t, exists, "Unexpected missing device")
require.Equal(t, device.Name, d.Name, "Unexpected device name")
require.Equal(t, device.Major, d.Major, "Unexpected device major")
}
_, exists := nvidiaDevices.Get("bogus")
require.False(t, exists, "Unexpected 'bogus' device found")
}
func TestProcessDeviceFile(t *testing.T) {
testCases := []struct {
lines []string
expected []Device
}{
{[]string{}, []Device{}},
{[]string{"Not a valid line:"}, []Device{}},
{[]string{"195 nvidia-frontend"}, []Device{{"nvidia-frontend", 195}}},
{[]string{"195 nvidia-frontend", "235 nvidia-caps"}, []Device{{"nvidia-frontend", 195}, {"nvidia-caps", 235}}},
{[]string{" 195 nvidia-frontend"}, []Device{{"nvidia-frontend", 195}}},
{[]string{"Not a valid line:", "", "195 nvidia-frontend"}, []Device{{"nvidia-frontend", 195}}},
{[]string{"195 not-nvidia-frontend"}, []Device{}},
}
for _, tc := range testCases {
contents := strings.NewReader(strings.Join(tc.lines, "\n"))
d := processDeviceFile(contents)
require.Equalf(t, NewMockNvidiaDevices(tc.expected...), d, "testCase: %v", tc)
}
}
func TestProcessDeviceFileLine(t *testing.T) {
testCases := []struct {
line string
name string
major int
err bool
}{
{"", "", 0, true},
{"0", "", 0, true},
{"notint nvidia-frontend", "", 0, true},
{"195 nvidia-frontend", "nvidia-frontend", 195, false},
{" 195 nvidia-frontend", "nvidia-frontend", 195, false},
}
for _, tc := range testCases {
name, major, err := processProcDeviceLine(tc.line)
require.Equal(t, tc.name, name)
require.Equal(t, tc.major, major)
if tc.err {
require.Error(t, err)
} else {
require.NoError(t, err)
}
}
}