mirror of
https://github.com/NVIDIA/nvidia-container-toolkit
synced 2025-06-26 18:18:24 +00:00
Copy files from nvidia-container-toolkit
Signed-off-by: Evan Lezar <elezar@nvidia.com>
This commit is contained in:
239
internal/ldcache/ldcache.go
Normal file
239
internal/ldcache/ldcache.go
Normal file
@@ -0,0 +1,239 @@
|
||||
/*
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
// Adapted from https://github.com/rai-project/ldcache
|
||||
|
||||
package ldcache
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// ldcachePath is the location of the dynamic linker cache relative to the
// root that is being inspected.
const ldcachePath = "/etc/ld.so.cache"

// Magic values identifying the supported cache layouts.
const (
	// magicString1 opens a cache written in the old format (< glibc-2.2).
	magicString1 = "ld.so-1.7.0"
	// magicString2 and magicVersion together identify the current format.
	magicString2 = "glibc-ld.so.cache"
	magicVersion = "1.1"
)

// Bit layout of the Flags field of a cache entry.
const (
	// The low byte carries the entry type; only ELF entries are looked up.
	flagTypeMask = 0x00ff
	flagTypeELF  = 0x0001

	// The second byte carries the architecture of the library.
	flagArchMask    = 0xff00
	flagArchI386    = 0x0000
	flagArchX8664   = 0x0300
	flagArchX32     = 0x0800
	flagArchPpc64le = 0x0500
)
|
||||
|
||||
// ErrInvalidCache is returned when the cache is too short to hold a header or
// when its magic string or version do not match the supported format.
var ErrInvalidCache = errors.New("invalid ld.so.cache file")
|
||||
|
||||
type Header1 struct {
|
||||
Magic [len(magicString1) + 1]byte // include null delimiter
|
||||
NLibs uint32
|
||||
}
|
||||
|
||||
// Entry1 is a single record of the old cache format. Only its size is used,
// to step over the old entries when both formats are present.
type Entry1 struct {
	Flags int32
	Key   uint32
	Value uint32
}
|
||||
|
||||
type Header2 struct {
|
||||
Magic [len(magicString2)]byte
|
||||
Version [len(magicVersion)]byte
|
||||
NLibs uint32
|
||||
TableSize uint32
|
||||
_ [3]uint32 // unused
|
||||
_ uint64 // force 8 byte alignment
|
||||
}
|
||||
|
||||
// Entry2 is a single record of the current cache format.
type Entry2 struct {
	// Flags carries the entry type and architecture bits (see the flag*
	// constants).
	Flags int32
	// Key and Value are byte offsets to the library name and to its path.
	Key   uint32
	Value uint32
	// OSVersion and HWCap are decoded but not consulted by this package.
	OSVersion uint32
	HWCap     uint64
}
|
||||
|
||||
type LDCache struct {
|
||||
*bytes.Reader
|
||||
|
||||
data, libs []byte
|
||||
header Header2
|
||||
entries []Entry2
|
||||
|
||||
root string
|
||||
logger *log.Logger
|
||||
}
|
||||
|
||||
func NewLDCacheWithLogger(logger *log.Logger, root string) (*LDCache, error) {
|
||||
return openWithRoot(logger, root)
|
||||
}
|
||||
|
||||
func Open() (*LDCache, error) {
|
||||
return openWithRoot(log.StandardLogger(), "")
|
||||
}
|
||||
|
||||
func openWithRoot(logger *log.Logger, root string) (*LDCache, error) {
|
||||
path := filepath.Join(root, ldcachePath)
|
||||
|
||||
logger.Debugf("Opening ld.conf at %v", path)
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
fi, err := f.Stat()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
d, err := syscall.Mmap(int(f.Fd()), 0, int(fi.Size()),
|
||||
syscall.PROT_READ, syscall.MAP_PRIVATE)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
cache := &LDCache{
|
||||
data: d,
|
||||
Reader: bytes.NewReader(d),
|
||||
root: root,
|
||||
logger: logger,
|
||||
}
|
||||
return cache, cache.parse()
|
||||
}
|
||||
|
||||
func (c *LDCache) Close() error {
|
||||
return syscall.Munmap(c.data)
|
||||
}
|
||||
|
||||
func (c *LDCache) Magic() string {
|
||||
return string(c.header.Magic[:])
|
||||
}
|
||||
|
||||
func (c *LDCache) Version() string {
|
||||
return string(c.header.Version[:])
|
||||
}
|
||||
|
||||
// strn converts the first n bytes of b to a string. It panics if n exceeds
// the capacity of b.
func strn(b []byte, n int) string {
	head := b[:n]
	return string(head)
}
|
||||
|
||||
func (c *LDCache) parse() error {
|
||||
var header Header1
|
||||
|
||||
// Check for the old format (< glibc-2.2)
|
||||
if c.Len() <= int(unsafe.Sizeof(header)) {
|
||||
return ErrInvalidCache
|
||||
}
|
||||
if strn(c.data, len(magicString1)) == magicString1 {
|
||||
if err := binary.Read(c, binary.LittleEndian, &header); err != nil {
|
||||
return err
|
||||
}
|
||||
n := int64(header.NLibs) * int64(unsafe.Sizeof(Entry1{}))
|
||||
offset, err := c.Seek(n, 1) // skip old entries
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
n = (-offset) & int64(unsafe.Alignof(c.header)-1)
|
||||
_, err = c.Seek(n, 1) // skip padding
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
c.libs = c.data[c.Size()-int64(c.Len()):] // kv offsets start here
|
||||
if err := binary.Read(c, binary.LittleEndian, &c.header); err != nil {
|
||||
return err
|
||||
}
|
||||
if c.Magic() != magicString2 || c.Version() != magicVersion {
|
||||
return ErrInvalidCache
|
||||
}
|
||||
c.entries = make([]Entry2, c.header.NLibs)
|
||||
if err := binary.Read(c, binary.LittleEndian, &c.entries); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (c *LDCache) Lookup(libs ...string) (paths32, paths64 []string) {
|
||||
c.logger.Debugf("Looking up %v in cache", libs)
|
||||
type void struct{}
|
||||
var paths *[]string
|
||||
|
||||
set := make(map[string]void)
|
||||
prefix := make([][]byte, len(libs))
|
||||
|
||||
for i := range libs {
|
||||
prefix[i] = []byte(libs[i])
|
||||
}
|
||||
for _, e := range c.entries {
|
||||
if ((e.Flags & flagTypeMask) & flagTypeELF) == 0 {
|
||||
continue
|
||||
}
|
||||
switch e.Flags & flagArchMask {
|
||||
case flagArchX8664:
|
||||
fallthrough
|
||||
case flagArchPpc64le:
|
||||
paths = &paths64
|
||||
case flagArchX32:
|
||||
fallthrough
|
||||
case flagArchI386:
|
||||
paths = &paths32
|
||||
default:
|
||||
continue
|
||||
}
|
||||
if e.Key > uint32(len(c.libs)) || e.Value > uint32(len(c.libs)) {
|
||||
continue
|
||||
}
|
||||
lib := c.libs[e.Key:]
|
||||
value := c.libs[e.Value:]
|
||||
|
||||
for _, p := range prefix {
|
||||
if bytes.HasPrefix(lib, p) {
|
||||
n := bytes.IndexByte(value, 0)
|
||||
if n < 0 {
|
||||
break
|
||||
}
|
||||
|
||||
name := filepath.Join(c.root, strn(value, n))
|
||||
c.logger.Debugf("checking %v", string(name))
|
||||
|
||||
path, err := filepath.EvalSymlinks(name)
|
||||
if err != nil {
|
||||
c.logger.Debugf("could not resolve symlink for %v", name)
|
||||
break
|
||||
}
|
||||
if _, ok := set[path]; ok {
|
||||
break
|
||||
}
|
||||
set[path] = void{}
|
||||
*paths = append(*paths, path)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
78
internal/lookup/file.go
Normal file
78
internal/lookup/file.go
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package lookup
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
type file struct {
|
||||
logger *log.Logger
|
||||
prefixes []string
|
||||
filter func(string) error
|
||||
}
|
||||
|
||||
func NewFileLocator(root string) Locator {
|
||||
return NewFileLocatorWithLogger(log.StandardLogger(), root)
|
||||
}
|
||||
|
||||
func NewFileLocatorWithLogger(logger *log.Logger, root string) Locator {
|
||||
l := file{
|
||||
logger: logger,
|
||||
prefixes: []string{root},
|
||||
filter: assertFile,
|
||||
}
|
||||
|
||||
return &l
|
||||
}
|
||||
|
||||
var _ Locator = (*file)(nil)
|
||||
|
||||
func (p file) Locate(filename string) ([]string, error) {
|
||||
var filenames []string
|
||||
for _, prefix := range p.prefixes {
|
||||
candidate := filepath.Join(prefix, filename)
|
||||
p.logger.Debugf("Checking candidate '%v'", candidate)
|
||||
err := p.filter(candidate)
|
||||
if err != nil {
|
||||
p.logger.Debugf("Candidate '%v' does not meet requirements: %v", candidate, err)
|
||||
continue
|
||||
}
|
||||
filenames = append(filenames, candidate)
|
||||
}
|
||||
if len(filename) == 0 {
|
||||
return nil, fmt.Errorf("file %v not found", filename)
|
||||
}
|
||||
return filenames, nil
|
||||
}
|
||||
|
||||
// assertFile returns nil if filename can be stat'ed and is not a directory,
// and a descriptive error otherwise.
func assertFile(filename string) error {
	info, err := os.Stat(filename)
	switch {
	case err != nil:
		return fmt.Errorf("error getting info for %v: %v", filename, err)
	case info.IsDir():
		return fmt.Errorf("specified path '%v' is a directory", filename)
	default:
		return nil
	}
}
|
||||
65
internal/lookup/library.go
Normal file
65
internal/lookup/library.go
Normal file
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package lookup
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
"gitlab.com/nvidia/cloud-native/container-toolkit/internal/ldcache"
|
||||
)
|
||||
|
||||
type library struct {
|
||||
logger *log.Logger
|
||||
cache *ldcache.LDCache
|
||||
}
|
||||
|
||||
var _ Locator = (*library)(nil)
|
||||
|
||||
// NewLibraryLocatorWithLogger creates a library locator using the standard logger
|
||||
func NewLibraryLocator(root string) (Locator, error) {
|
||||
return NewLibraryLocatorWithLogger(log.StandardLogger(), root)
|
||||
}
|
||||
|
||||
// NewLibraryLocatorWithLogger creates a library locator using the specified logger.
|
||||
func NewLibraryLocatorWithLogger(logger *log.Logger, root string) (Locator, error) {
|
||||
logger.Infof("Reading ldcache at %v", root)
|
||||
cache, err := ldcache.NewLDCacheWithLogger(logger, root)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error loading ldcache: %v", err)
|
||||
}
|
||||
|
||||
l := library{
|
||||
logger: logger,
|
||||
cache: cache,
|
||||
}
|
||||
|
||||
return &l, nil
|
||||
}
|
||||
|
||||
func (l library) Locate(libname string) ([]string, error) {
|
||||
paths32, paths64 := l.cache.Lookup(libname)
|
||||
if len(paths32) > 0 {
|
||||
l.logger.Warnf("Ignoring 32-bit libraries for %v: %v", libname, paths32)
|
||||
}
|
||||
|
||||
if len(paths64) == 0 {
|
||||
return nil, fmt.Errorf("64-bit library %v not found", libname)
|
||||
}
|
||||
|
||||
return paths64, nil
|
||||
}
|
||||
24
internal/lookup/lookup.go
Normal file
24
internal/lookup/lookup.go
Normal file
@@ -0,0 +1,24 @@
|
||||
/*
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package lookup
|
||||
|
||||
//go:generate moq -stub -out lookup_mock.go . Locator
|
||||
|
||||
// Locator is implemented by types that can resolve a name to the matching
// paths on a system.
type Locator interface {
	// Locate returns the paths found for the given name, or an error.
	Locate(string) ([]string, error)
}
|
||||
77
internal/lookup/lookup_mock.go
Normal file
77
internal/lookup/lookup_mock.go
Normal file
@@ -0,0 +1,77 @@
|
||||
// Code generated by moq; DO NOT EDIT.
|
||||
// github.com/matryer/moq
|
||||
|
||||
package lookup
|
||||
|
||||
import (
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Ensure, that LocatorMock does implement Locator.
|
||||
// If this is not the case, regenerate this file with moq.
|
||||
var _ Locator = &LocatorMock{}
|
||||
|
||||
// LocatorMock is a mock implementation of Locator.
|
||||
//
|
||||
// func TestSomethingThatUsesLocator(t *testing.T) {
|
||||
//
|
||||
// // make and configure a mocked Locator
|
||||
// mockedLocator := &LocatorMock{
|
||||
// LocateFunc: func(s string) ([]string, error) {
|
||||
// panic("mock out the Locate method")
|
||||
// },
|
||||
// }
|
||||
//
|
||||
// // use mockedLocator in code that requires Locator
|
||||
// // and then make assertions.
|
||||
//
|
||||
// }
|
||||
type LocatorMock struct {
|
||||
// LocateFunc mocks the Locate method.
|
||||
LocateFunc func(s string) ([]string, error)
|
||||
|
||||
// calls tracks calls to the methods.
|
||||
calls struct {
|
||||
// Locate holds details about calls to the Locate method.
|
||||
Locate []struct {
|
||||
// S is the s argument value.
|
||||
S string
|
||||
}
|
||||
}
|
||||
lockLocate sync.RWMutex
|
||||
}
|
||||
|
||||
// Locate calls LocateFunc.
|
||||
func (mock *LocatorMock) Locate(s string) ([]string, error) {
|
||||
callInfo := struct {
|
||||
S string
|
||||
}{
|
||||
S: s,
|
||||
}
|
||||
mock.lockLocate.Lock()
|
||||
mock.calls.Locate = append(mock.calls.Locate, callInfo)
|
||||
mock.lockLocate.Unlock()
|
||||
if mock.LocateFunc == nil {
|
||||
var (
|
||||
stringsOut []string
|
||||
errOut error
|
||||
)
|
||||
return stringsOut, errOut
|
||||
}
|
||||
return mock.LocateFunc(s)
|
||||
}
|
||||
|
||||
// LocateCalls gets all the calls that were made to Locate.
|
||||
// Check the length with:
|
||||
// len(mockedLocator.LocateCalls())
|
||||
func (mock *LocatorMock) LocateCalls() []struct {
|
||||
S string
|
||||
} {
|
||||
var calls []struct {
|
||||
S string
|
||||
}
|
||||
mock.lockLocate.RLock()
|
||||
calls = mock.calls.Locate
|
||||
mock.lockLocate.RUnlock()
|
||||
return calls
|
||||
}
|
||||
94
internal/lookup/path.go
Normal file
94
internal/lookup/path.go
Normal file
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package lookup
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
log "github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
// envPath is the name of the environment variable holding the search path.
const envPath = "PATH"

// defaultPaths are the directories that are searched in addition to the
// entries of PATH when a root is specified.
var defaultPaths = []string{
	"/usr/local/sbin",
	"/usr/local/bin",
	"/usr/sbin",
	"/usr/bin",
	"/sbin",
	"/bin",
}
|
||||
|
||||
type path struct {
|
||||
file
|
||||
}
|
||||
|
||||
func NewPathLocator(root string) Locator {
|
||||
return NewPathLocatorWithLogger(log.StandardLogger(), root)
|
||||
}
|
||||
|
||||
func NewPathLocatorWithLogger(logger *log.Logger, root string) Locator {
|
||||
pathEnv := os.Getenv(envPath)
|
||||
paths := filepath.SplitList(pathEnv)
|
||||
|
||||
if root != "" {
|
||||
paths = append(paths, defaultPaths...)
|
||||
}
|
||||
|
||||
var prefixes []string
|
||||
for _, dir := range paths {
|
||||
prefixes = append(prefixes, filepath.Join(root, dir))
|
||||
}
|
||||
l := path{
|
||||
file: file{
|
||||
logger: logger,
|
||||
prefixes: prefixes,
|
||||
filter: assertExecutable,
|
||||
},
|
||||
}
|
||||
return &l
|
||||
}
|
||||
|
||||
var _ Locator = (*path)(nil)
|
||||
|
||||
func (p path) Locate(filename string) ([]string, error) {
|
||||
// For absolute paths we ensure that it is executable
|
||||
if strings.Contains(filename, "/") {
|
||||
err := assertExecutable(filename)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("absolute path %v is not an executable file: %v", filename, err)
|
||||
}
|
||||
return []string{filename}, nil
|
||||
}
|
||||
|
||||
return p.file.Locate(filename)
|
||||
}
|
||||
|
||||
func assertExecutable(filename string) error {
|
||||
err := assertFile(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
info, err := os.Stat(filename)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if info.Mode()&0111 == 0 {
|
||||
return fmt.Errorf("specified file '%v' is not executable", filename)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
141
internal/nvcaps/nvcaps.go
Normal file
141
internal/nvcaps/nvcaps.go
Normal file
@@ -0,0 +1,141 @@
|
||||
/*
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package nvcaps
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Locations of the proc and device nodes used for NVIDIA capabilities.
const (
	// nvidiaCapabilitiesPath is the directory below which the proc paths of
	// MIG capabilities are constructed.
	nvidiaProcDriverPath   = "/proc/driver/nvidia"
	nvidiaCapabilitiesPath = nvidiaProcDriverPath + "/capabilities"

	// nvcapsMigMinorsPath is the file mapping MIG capabilities to device
	// minor numbers.
	nvcapsProcDriverPath = "/proc/driver/nvidia-caps"
	nvcapsMigMinorsPath  = nvcapsProcDriverPath + "/mig-minors"
	// nvcapsDevicePath is the directory containing the nvidia-cap devices.
	nvcapsDevicePath = "/dev/nvidia-caps"
)
|
||||
|
||||
type (
	// MigMinor represents the minor number of a MIG device
	MigMinor int

	// MigCap represents the path to a MIG cap file
	MigCap string
)
|
||||
|
||||
// LoadMigMinors loads the MIG minors file and returns its contents as a map
|
||||
func LoadMigMinors() (map[MigCap]MigMinor, error) {
|
||||
// Open nvcapsMigMinorsPath for walking.
|
||||
// If the nvcapsMigMinorsPath does not exist, then we are not on a MIG
|
||||
// capable machine, so there is nothing to do.
|
||||
// The format of this file is discussed in:
|
||||
// https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#unique_1576522674
|
||||
minorsFile, err := os.Open(nvcapsMigMinorsPath)
|
||||
if os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error opening MIG minors file: %v", err)
|
||||
}
|
||||
defer minorsFile.Close()
|
||||
|
||||
return processMinorsFile(minorsFile), nil
|
||||
}
|
||||
|
||||
func processMinorsFile(minorsFile io.Reader) map[MigCap]MigMinor {
|
||||
// Walk each line of nvcapsMigMinorsPath and construct a mapping of nvidia
|
||||
// capabilities path to device minor for that capability
|
||||
migCaps := make(map[MigCap]MigMinor)
|
||||
scanner := bufio.NewScanner(minorsFile)
|
||||
for scanner.Scan() {
|
||||
cap, minor, err := processMigMinorsLine(scanner.Text())
|
||||
if err != nil {
|
||||
log.Printf("Skipping line in MIG minors file: %v", err)
|
||||
continue
|
||||
}
|
||||
migCaps[cap] = minor
|
||||
}
|
||||
return migCaps
|
||||
}
|
||||
|
||||
func processMigMinorsLine(line string) (MigCap, MigMinor, error) {
|
||||
parts := strings.Split(line, " ")
|
||||
if len(parts) != 2 {
|
||||
return "", 0, fmt.Errorf("error processing line: %v", line)
|
||||
}
|
||||
|
||||
migCap := MigCap(parts[0])
|
||||
if !migCap.isValid() {
|
||||
return "", 0, fmt.Errorf("invalid MIG minors line: '%v'", line)
|
||||
}
|
||||
|
||||
minor, err := strconv.Atoi(parts[1])
|
||||
if err != nil {
|
||||
return "", 0, fmt.Errorf("error reading MIG minor from '%v': %v", line, err)
|
||||
}
|
||||
|
||||
return migCap, MigMinor(minor), nil
|
||||
}
|
||||
|
||||
func (m MigCap) isValid() bool {
|
||||
cap := string(m)
|
||||
switch cap {
|
||||
case "config", "monitor":
|
||||
return true
|
||||
default:
|
||||
var gpu int
|
||||
var gi int
|
||||
var ci int
|
||||
// Loog for a CI access file
|
||||
n, _ := fmt.Sscanf(cap, "gpu%d/gi%d/ci%d/access", &gpu, &gi, &ci)
|
||||
if n == 3 {
|
||||
return true
|
||||
}
|
||||
// Look for a GI access file
|
||||
n, _ = fmt.Sscanf(cap, "gpu%d/gi%d/access %d", &gpu, &gi)
|
||||
if n == 2 {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// ProcPath returns the proc path associated with the MIG capability
|
||||
func (m MigCap) ProcPath() string {
|
||||
id := string(m)
|
||||
|
||||
var path string
|
||||
switch id {
|
||||
case "config", "monitor":
|
||||
path = "mig/" + id
|
||||
default:
|
||||
parts := strings.SplitN(id, "/", 2)
|
||||
path = strings.Join([]string{parts[0], "mig", parts[1]}, "/")
|
||||
}
|
||||
return filepath.Join(nvidiaCapabilitiesPath, path)
|
||||
}
|
||||
|
||||
// DevicePath returns the path for the nvidia-caps device with the specified
|
||||
// minor number
|
||||
func (m MigMinor) DevicePath() string {
|
||||
return fmt.Sprintf(nvcapsDevicePath+"/nvidia-cap%d", m)
|
||||
}
|
||||
94
internal/nvcaps/nvcaps_test.go
Normal file
94
internal/nvcaps/nvcaps_test.go
Normal file
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package nvcaps
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestProcessMinorsFile(t *testing.T) {
|
||||
testCases := []struct {
|
||||
lines []string
|
||||
expected map[MigCap]MigMinor
|
||||
}{
|
||||
{[]string{}, map[MigCap]MigMinor{}},
|
||||
{[]string{"invalidLine"}, map[MigCap]MigMinor{}},
|
||||
{[]string{"config 1"}, map[MigCap]MigMinor{"config": 1}},
|
||||
{[]string{"gpu0/gi0/ci0/access 4"}, map[MigCap]MigMinor{"gpu0/gi0/ci0/access": 4}},
|
||||
{[]string{"config 1", "invalidLine"}, map[MigCap]MigMinor{"config": 1}},
|
||||
{[]string{"config 1", "gpu0/gi0/ci0/access 4"}, map[MigCap]MigMinor{"config": 1, "gpu0/gi0/ci0/access": 4}},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
contents := strings.NewReader(strings.Join(tc.lines, "\n"))
|
||||
d := processMinorsFile(contents)
|
||||
require.Equalf(t, tc.expected, d, "testCase: %v", tc)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessMigMinorsLine(t *testing.T) {
|
||||
testCases := []struct {
|
||||
line string
|
||||
cap MigCap
|
||||
minor MigMinor
|
||||
err bool
|
||||
}{
|
||||
{"config 1", "config", 1, false},
|
||||
{"monitor 2", "monitor", 2, false},
|
||||
{"gpu0/gi0/access 3", "gpu0/gi0/access", 3, false},
|
||||
{"gpu0/gi0/ci0/access 4", "gpu0/gi0/ci0/access", 4, false},
|
||||
{"notconfig 99", "", 0, true},
|
||||
{"config notanint", "", 0, true},
|
||||
{"", "", 0, true},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
cap, minor, err := processMigMinorsLine(tc.line)
|
||||
|
||||
require.Equalf(t, tc.cap, cap, "testCase: %v", tc)
|
||||
require.Equalf(t, tc.minor, minor, "testCase: %v", tc)
|
||||
if tc.err {
|
||||
require.Errorf(t, err, "testCase: %v", tc)
|
||||
} else {
|
||||
require.NoErrorf(t, err, "testCase: %v", tc)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
func TestMigCapProcPaths(t *testing.T) {
|
||||
testCases := []struct {
|
||||
input string
|
||||
expected string
|
||||
}{
|
||||
{"config", "/proc/driver/nvidia/capabilities/mig/config"},
|
||||
{"monitor", "/proc/driver/nvidia/capabilities/mig/monitor"},
|
||||
{"gpu0/gi0/access", "/proc/driver/nvidia/capabilities/gpu0/mig/gi0/access"},
|
||||
{"gpu0/gi0/ci0/access", "/proc/driver/nvidia/capabilities/gpu0/mig/gi0/ci0/access"},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
m := MigCap(tc.input)
|
||||
require.Equal(t, tc.expected, m.ProcPath())
|
||||
}
|
||||
}
|
||||
|
||||
func TestMigMinorDevicePath(t *testing.T) {
|
||||
m := MigMinor(0)
|
||||
require.Equal(t, "/dev/nvidia-caps/nvidia-cap0", m.DevicePath())
|
||||
}
|
||||
79
internal/nvml/consts.go
Normal file
79
internal/nvml/consts.go
Normal file
@@ -0,0 +1,79 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package nvml
|
||||
|
||||
import (
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
)
|
||||
|
||||
const (
|
||||
SUCCESS = nvml.SUCCESS
|
||||
ERROR_UNINITIALIZED = nvml.ERROR_UNINITIALIZED
|
||||
ERROR_INVALID_ARGUMENT = nvml.ERROR_INVALID_ARGUMENT
|
||||
ERROR_NOT_SUPPORTED = nvml.ERROR_NOT_SUPPORTED
|
||||
ERROR_NO_PERMISSION = nvml.ERROR_NO_PERMISSION
|
||||
ERROR_ALREADY_INITIALIZED = nvml.ERROR_ALREADY_INITIALIZED
|
||||
ERROR_NOT_FOUND = nvml.ERROR_NOT_FOUND
|
||||
ERROR_INSUFFICIENT_SIZE = nvml.ERROR_INSUFFICIENT_SIZE
|
||||
ERROR_INSUFFICIENT_POWER = nvml.ERROR_INSUFFICIENT_POWER
|
||||
ERROR_DRIVER_NOT_LOADED = nvml.ERROR_DRIVER_NOT_LOADED
|
||||
ERROR_TIMEOUT = nvml.ERROR_TIMEOUT
|
||||
ERROR_IRQ_ISSUE = nvml.ERROR_IRQ_ISSUE
|
||||
ERROR_LIBRARY_NOT_FOUND = nvml.ERROR_LIBRARY_NOT_FOUND
|
||||
ERROR_FUNCTION_NOT_FOUND = nvml.ERROR_FUNCTION_NOT_FOUND
|
||||
ERROR_CORRUPTED_INFOROM = nvml.ERROR_CORRUPTED_INFOROM
|
||||
ERROR_GPU_IS_LOST = nvml.ERROR_GPU_IS_LOST
|
||||
ERROR_RESET_REQUIRED = nvml.ERROR_RESET_REQUIRED
|
||||
ERROR_OPERATING_SYSTEM = nvml.ERROR_OPERATING_SYSTEM
|
||||
ERROR_LIB_RM_VERSION_MISMATCH = nvml.ERROR_LIB_RM_VERSION_MISMATCH
|
||||
ERROR_IN_USE = nvml.ERROR_IN_USE
|
||||
ERROR_MEMORY = nvml.ERROR_MEMORY
|
||||
ERROR_NO_DATA = nvml.ERROR_NO_DATA
|
||||
ERROR_VGPU_ECC_NOT_SUPPORTED = nvml.ERROR_VGPU_ECC_NOT_SUPPORTED
|
||||
ERROR_INSUFFICIENT_RESOURCES = nvml.ERROR_INSUFFICIENT_RESOURCES
|
||||
ERROR_UNKNOWN = nvml.ERROR_UNKNOWN
|
||||
)
|
||||
|
||||
const (
|
||||
DEVICE_MIG_ENABLE = nvml.DEVICE_MIG_ENABLE
|
||||
DEVICE_MIG_DISABLE = nvml.DEVICE_MIG_DISABLE
|
||||
)
|
||||
|
||||
const (
|
||||
GPU_INSTANCE_PROFILE_1_SLICE = nvml.GPU_INSTANCE_PROFILE_1_SLICE
|
||||
GPU_INSTANCE_PROFILE_2_SLICE = nvml.GPU_INSTANCE_PROFILE_2_SLICE
|
||||
GPU_INSTANCE_PROFILE_3_SLICE = nvml.GPU_INSTANCE_PROFILE_3_SLICE
|
||||
GPU_INSTANCE_PROFILE_4_SLICE = nvml.GPU_INSTANCE_PROFILE_4_SLICE
|
||||
GPU_INSTANCE_PROFILE_7_SLICE = nvml.GPU_INSTANCE_PROFILE_7_SLICE
|
||||
GPU_INSTANCE_PROFILE_8_SLICE = nvml.GPU_INSTANCE_PROFILE_8_SLICE
|
||||
GPU_INSTANCE_PROFILE_COUNT = nvml.GPU_INSTANCE_PROFILE_COUNT
|
||||
)
|
||||
|
||||
const (
|
||||
COMPUTE_INSTANCE_PROFILE_1_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_2_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_3_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_4_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_7_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_8_SLICE = nvml.COMPUTE_INSTANCE_PROFILE_8_SLICE
|
||||
COMPUTE_INSTANCE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_PROFILE_COUNT
|
||||
)
|
||||
|
||||
const (
|
||||
COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED = nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED
|
||||
COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT = nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT
|
||||
)
|
||||
594
internal/nvml/mock.go
Normal file
594
internal/nvml/mock.go
Normal file
@@ -0,0 +1,594 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package nvml
|
||||
|
||||
import "fmt"
|
||||
|
||||
type MockServer struct {
|
||||
Devices []Device
|
||||
}
|
||||
type MockLunaServer struct {
|
||||
MockServer
|
||||
}
|
||||
type MockA100Device struct {
|
||||
Index int
|
||||
MinorNumber int
|
||||
MigMode int
|
||||
GpuInstances map[*MockA100GpuInstance]struct{}
|
||||
GpuInstanceCounter uint32
|
||||
}
|
||||
|
||||
type MockA100GpuInstance struct {
|
||||
Info GpuInstanceInfo
|
||||
ComputeInstances map[*MockA100ComputeInstance]struct{}
|
||||
ComputeInstanceCounter uint32
|
||||
}
|
||||
type MockA100ComputeInstance struct {
|
||||
Info ComputeInstanceInfo
|
||||
}
|
||||
|
||||
var _ Interface = (*MockLunaServer)(nil)
|
||||
var _ Device = (*MockA100Device)(nil)
|
||||
var _ GpuInstance = (*MockA100GpuInstance)(nil)
|
||||
var _ ComputeInstance = (*MockA100ComputeInstance)(nil)
|
||||
|
||||
var MockA100MIGProfiles = struct {
|
||||
GpuInstanceProfiles map[int]GpuInstanceProfileInfo
|
||||
ComputeInstanceProfiles map[int]map[int]ComputeInstanceProfileInfo
|
||||
}{
|
||||
GpuInstanceProfiles: map[int]GpuInstanceProfileInfo{
|
||||
GPU_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: GPU_INSTANCE_PROFILE_1_SLICE,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 7,
|
||||
MultiprocessorCount: 1,
|
||||
CopyEngineCount: 1,
|
||||
DecoderCount: 0,
|
||||
EncoderCount: 0,
|
||||
JpegCount: 0,
|
||||
OfaCount: 0,
|
||||
MemorySizeMB: 5120,
|
||||
},
|
||||
GPU_INSTANCE_PROFILE_2_SLICE: {
|
||||
Id: GPU_INSTANCE_PROFILE_2_SLICE,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 2,
|
||||
InstanceCount: 3,
|
||||
MultiprocessorCount: 2,
|
||||
CopyEngineCount: 2,
|
||||
DecoderCount: 1,
|
||||
EncoderCount: 1,
|
||||
JpegCount: 0,
|
||||
OfaCount: 0,
|
||||
MemorySizeMB: 10240,
|
||||
},
|
||||
GPU_INSTANCE_PROFILE_3_SLICE: {
|
||||
Id: GPU_INSTANCE_PROFILE_3_SLICE,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 3,
|
||||
InstanceCount: 2,
|
||||
MultiprocessorCount: 3,
|
||||
CopyEngineCount: 4,
|
||||
DecoderCount: 2,
|
||||
EncoderCount: 2,
|
||||
JpegCount: 0,
|
||||
OfaCount: 0,
|
||||
MemorySizeMB: 20480,
|
||||
},
|
||||
GPU_INSTANCE_PROFILE_4_SLICE: {
|
||||
Id: GPU_INSTANCE_PROFILE_4_SLICE,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 4,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 4,
|
||||
CopyEngineCount: 4,
|
||||
DecoderCount: 2,
|
||||
EncoderCount: 2,
|
||||
JpegCount: 0,
|
||||
OfaCount: 0,
|
||||
MemorySizeMB: 20480,
|
||||
},
|
||||
GPU_INSTANCE_PROFILE_7_SLICE: {
|
||||
Id: GPU_INSTANCE_PROFILE_7_SLICE,
|
||||
IsP2pSupported: 0,
|
||||
SliceCount: 7,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 7,
|
||||
CopyEngineCount: 8,
|
||||
DecoderCount: 5,
|
||||
EncoderCount: 5,
|
||||
JpegCount: 1,
|
||||
OfaCount: 1,
|
||||
MemorySizeMB: 40960,
|
||||
},
|
||||
},
|
||||
ComputeInstanceProfiles: map[int]map[int]ComputeInstanceProfileInfo{
|
||||
GPU_INSTANCE_PROFILE_1_SLICE: {
|
||||
COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 1,
|
||||
SharedCopyEngineCount: 1,
|
||||
SharedDecoderCount: 0,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
},
|
||||
GPU_INSTANCE_PROFILE_2_SLICE: {
|
||||
COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 2,
|
||||
MultiprocessorCount: 1,
|
||||
SharedCopyEngineCount: 2,
|
||||
SharedDecoderCount: 1,
|
||||
SharedEncoderCount: 1,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
COMPUTE_INSTANCE_PROFILE_2_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_2_SLICE,
|
||||
SliceCount: 2,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 2,
|
||||
SharedCopyEngineCount: 2,
|
||||
SharedDecoderCount: 1,
|
||||
SharedEncoderCount: 1,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
},
|
||||
GPU_INSTANCE_PROFILE_3_SLICE: {
|
||||
COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 3,
|
||||
MultiprocessorCount: 1,
|
||||
SharedCopyEngineCount: 4,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 1,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
COMPUTE_INSTANCE_PROFILE_2_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_2_SLICE,
|
||||
SliceCount: 2,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 2,
|
||||
SharedCopyEngineCount: 4,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 2,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
COMPUTE_INSTANCE_PROFILE_3_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_3_SLICE,
|
||||
SliceCount: 3,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 3,
|
||||
SharedCopyEngineCount: 4,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 0,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
},
|
||||
GPU_INSTANCE_PROFILE_4_SLICE: {
|
||||
COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 4,
|
||||
MultiprocessorCount: 1,
|
||||
SharedCopyEngineCount: 4,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 2,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
COMPUTE_INSTANCE_PROFILE_2_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_2_SLICE,
|
||||
SliceCount: 2,
|
||||
InstanceCount: 2,
|
||||
MultiprocessorCount: 2,
|
||||
SharedCopyEngineCount: 4,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 2,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
COMPUTE_INSTANCE_PROFILE_4_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_4_SLICE,
|
||||
SliceCount: 4,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 4,
|
||||
SharedCopyEngineCount: 4,
|
||||
SharedDecoderCount: 2,
|
||||
SharedEncoderCount: 2,
|
||||
SharedJpegCount: 0,
|
||||
SharedOfaCount: 0,
|
||||
},
|
||||
},
|
||||
GPU_INSTANCE_PROFILE_7_SLICE: {
|
||||
COMPUTE_INSTANCE_PROFILE_1_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_1_SLICE,
|
||||
SliceCount: 1,
|
||||
InstanceCount: 7,
|
||||
MultiprocessorCount: 1,
|
||||
SharedCopyEngineCount: 8,
|
||||
SharedDecoderCount: 5,
|
||||
SharedEncoderCount: 5,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
COMPUTE_INSTANCE_PROFILE_2_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_2_SLICE,
|
||||
SliceCount: 2,
|
||||
InstanceCount: 3,
|
||||
MultiprocessorCount: 2,
|
||||
SharedCopyEngineCount: 8,
|
||||
SharedDecoderCount: 5,
|
||||
SharedEncoderCount: 5,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
COMPUTE_INSTANCE_PROFILE_3_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_3_SLICE,
|
||||
SliceCount: 3,
|
||||
InstanceCount: 2,
|
||||
MultiprocessorCount: 3,
|
||||
SharedCopyEngineCount: 8,
|
||||
SharedDecoderCount: 5,
|
||||
SharedEncoderCount: 5,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
COMPUTE_INSTANCE_PROFILE_4_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_4_SLICE,
|
||||
SliceCount: 4,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 4,
|
||||
SharedCopyEngineCount: 8,
|
||||
SharedDecoderCount: 5,
|
||||
SharedEncoderCount: 5,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
COMPUTE_INSTANCE_PROFILE_7_SLICE: {
|
||||
Id: COMPUTE_INSTANCE_PROFILE_7_SLICE,
|
||||
SliceCount: 7,
|
||||
InstanceCount: 1,
|
||||
MultiprocessorCount: 7,
|
||||
SharedCopyEngineCount: 8,
|
||||
SharedDecoderCount: 5,
|
||||
SharedEncoderCount: 5,
|
||||
SharedJpegCount: 1,
|
||||
SharedOfaCount: 1,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
func NewMockNVMLServer(devices ...Device) Interface {
|
||||
return &MockServer{
|
||||
Devices: devices,
|
||||
}
|
||||
}
|
||||
|
||||
func NewMockNVMLOnLunaServer() Interface {
|
||||
devices := []Device{
|
||||
NewMockA100Device(0),
|
||||
NewMockA100Device(1),
|
||||
NewMockA100Device(2),
|
||||
NewMockA100Device(3),
|
||||
NewMockA100Device(4),
|
||||
NewMockA100Device(5),
|
||||
NewMockA100Device(6),
|
||||
NewMockA100Device(7),
|
||||
}
|
||||
return NewMockNVMLServer(devices...)
|
||||
}
|
||||
|
||||
func NewMockA100Device(index int) Device {
|
||||
return &MockA100Device{
|
||||
Index: index,
|
||||
GpuInstances: make(map[*MockA100GpuInstance]struct{}),
|
||||
GpuInstanceCounter: 0,
|
||||
}
|
||||
}
|
||||
|
||||
func NewMockA100GpuInstance(info GpuInstanceInfo) GpuInstance {
|
||||
return &MockA100GpuInstance{
|
||||
Info: info,
|
||||
ComputeInstances: make(map[*MockA100ComputeInstance]struct{}),
|
||||
ComputeInstanceCounter: 0,
|
||||
}
|
||||
}
|
||||
|
||||
func NewMockA100ComputeInstance(info ComputeInstanceInfo) ComputeInstance {
|
||||
return &MockA100ComputeInstance{
|
||||
Info: info,
|
||||
}
|
||||
}
|
||||
|
||||
func (n *MockServer) Init() Return {
|
||||
return MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (n *MockServer) Shutdown() Return {
|
||||
return MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (n *MockServer) DeviceGetCount() (int, Return) {
|
||||
return len(n.Devices), MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (n *MockServer) DeviceGetHandleByIndex(index int) (Device, Return) {
|
||||
if index < 0 || index >= len(n.Devices) {
|
||||
return nil, MockReturn(ERROR_INVALID_ARGUMENT)
|
||||
}
|
||||
return n.Devices[index], MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (n *MockServer) SystemGetDriverVersion() (string, Return) {
|
||||
return "999.99", MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetIndex() (int, Return) {
|
||||
return d.Index, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetPciInfo() (PciInfo, Return) {
|
||||
var busID [32]int8
|
||||
for i, b := range []byte("0000FFFF:FF:FF.F") {
|
||||
busID[i] = int8(b)
|
||||
}
|
||||
p := PciInfo{
|
||||
BusId: busID,
|
||||
PciDeviceId: 0x20B010DE,
|
||||
}
|
||||
return p, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetUUID() (string, Return) {
|
||||
return fmt.Sprintf("GPU-%d", d.Index), MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetMinorNumber() (int, Return) {
|
||||
return d.MinorNumber, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) SetMigMode(mode int) (Return, Return) {
|
||||
d.MigMode = mode
|
||||
return MockReturn(SUCCESS), MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetMigMode() (int, int, Return) {
|
||||
return d.MigMode, d.MigMode, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetGpuInstanceProfileInfo(giProfileId int) (GpuInstanceProfileInfo, Return) {
|
||||
if giProfileId < 0 || giProfileId >= GPU_INSTANCE_PROFILE_COUNT {
|
||||
return GpuInstanceProfileInfo{}, MockReturn(ERROR_INVALID_ARGUMENT)
|
||||
}
|
||||
|
||||
if _, exists := MockA100MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists {
|
||||
return GpuInstanceProfileInfo{}, MockReturn(ERROR_NOT_SUPPORTED)
|
||||
}
|
||||
|
||||
return MockA100MIGProfiles.GpuInstanceProfiles[giProfileId], MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) CreateGpuInstance(info *GpuInstanceProfileInfo) (GpuInstance, Return) {
|
||||
giInfo := GpuInstanceInfo{
|
||||
Device: d,
|
||||
Id: d.GpuInstanceCounter,
|
||||
ProfileId: info.Id,
|
||||
}
|
||||
d.GpuInstanceCounter++
|
||||
gi := NewMockA100GpuInstance(giInfo)
|
||||
d.GpuInstances[gi.(*MockA100GpuInstance)] = struct{}{}
|
||||
return gi, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetGpuInstances(info *GpuInstanceProfileInfo) ([]GpuInstance, Return) {
|
||||
var gis []GpuInstance
|
||||
for gi := range d.GpuInstances {
|
||||
if gi.Info.ProfileId == info.Id {
|
||||
gis = append(gis, gi)
|
||||
}
|
||||
}
|
||||
return gis, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetMaxMigDeviceCount() (int, Return) {
|
||||
var count int
|
||||
for gi := range d.GpuInstances {
|
||||
count = count + int(gi.ComputeInstanceCounter)
|
||||
}
|
||||
return count, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetMigDeviceHandleByIndex(Index int) (Device, Return) {
|
||||
var count int
|
||||
for gi := range d.GpuInstances {
|
||||
if count+int(gi.ComputeInstanceCounter) < Index {
|
||||
count = count + int(gi.ComputeInstanceCounter)
|
||||
continue
|
||||
}
|
||||
for ci := range gi.ComputeInstances {
|
||||
if count < Index {
|
||||
count++
|
||||
continue
|
||||
}
|
||||
|
||||
return ci, MockReturn(SUCCESS)
|
||||
}
|
||||
}
|
||||
return nil, MockReturn(ERROR_NOT_FOUND)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetDeviceHandleFromMigDeviceHandle() (Device, Return) {
|
||||
return nil, MockReturn(ERROR_NOT_SUPPORTED)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) IsMigDeviceHandle() (bool, Return) {
|
||||
return false, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetComputeInstanceId() (int, Return) {
|
||||
panic("Not implemented: GetComputeInstanceId")
|
||||
}
|
||||
|
||||
func (d *MockA100Device) GetGPUInstanceId() (int, Return) {
|
||||
panic("Not implemented: GetGPUInstanceId")
|
||||
}
|
||||
|
||||
func (gi *MockA100GpuInstance) GetInfo() (GpuInstanceInfo, Return) {
|
||||
return gi.Info, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (gi *MockA100GpuInstance) GetComputeInstanceProfileInfo(ciProfileId int, ciEngProfileId int) (ComputeInstanceProfileInfo, Return) {
|
||||
if ciProfileId < 0 || ciProfileId >= COMPUTE_INSTANCE_PROFILE_COUNT {
|
||||
return ComputeInstanceProfileInfo{}, MockReturn(ERROR_INVALID_ARGUMENT)
|
||||
}
|
||||
|
||||
if ciEngProfileId != COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED {
|
||||
return ComputeInstanceProfileInfo{}, MockReturn(ERROR_NOT_SUPPORTED)
|
||||
}
|
||||
|
||||
giProfileId := int(gi.Info.ProfileId)
|
||||
|
||||
if _, exists := MockA100MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists {
|
||||
return ComputeInstanceProfileInfo{}, MockReturn(ERROR_NOT_SUPPORTED)
|
||||
}
|
||||
|
||||
if _, exists := MockA100MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists {
|
||||
return ComputeInstanceProfileInfo{}, MockReturn(ERROR_NOT_SUPPORTED)
|
||||
}
|
||||
|
||||
return MockA100MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (gi *MockA100GpuInstance) CreateComputeInstance(info *ComputeInstanceProfileInfo) (ComputeInstance, Return) {
|
||||
ciInfo := ComputeInstanceInfo{
|
||||
Device: gi.Info.Device,
|
||||
GpuInstance: gi,
|
||||
Id: gi.ComputeInstanceCounter,
|
||||
ProfileId: info.Id,
|
||||
}
|
||||
gi.ComputeInstanceCounter++
|
||||
ci := NewMockA100ComputeInstance(ciInfo)
|
||||
gi.ComputeInstances[ci.(*MockA100ComputeInstance)] = struct{}{}
|
||||
return ci, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (gi *MockA100GpuInstance) GetComputeInstances(info *ComputeInstanceProfileInfo) ([]ComputeInstance, Return) {
|
||||
var cis []ComputeInstance
|
||||
for ci := range gi.ComputeInstances {
|
||||
if ci.Info.ProfileId == info.Id {
|
||||
cis = append(cis, ci)
|
||||
}
|
||||
}
|
||||
return cis, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (gi *MockA100GpuInstance) Destroy() Return {
|
||||
delete(gi.Info.Device.(*MockA100Device).GpuInstances, gi)
|
||||
return MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (ci *MockA100ComputeInstance) GetInfo() (ComputeInstanceInfo, Return) {
|
||||
return ci.Info, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (ci *MockA100ComputeInstance) Destroy() Return {
|
||||
delete(ci.Info.GpuInstance.(*MockA100GpuInstance).ComputeInstances, ci)
|
||||
return MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
// Since a compute instance can be used as a MIG device handle, it must also
|
||||
// implement the Device interface
|
||||
var _ Device = (*MockA100ComputeInstance)(nil)
|
||||
|
||||
func (c *MockA100ComputeInstance) GetIndex() (int, Return) {
|
||||
return int(c.Info.Id), MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetPciInfo() (PciInfo, Return) {
|
||||
// TODO: How does this behave on an actual MIG system?
|
||||
panic("Not implemented: GetPciInfo")
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetUUID() (string, Return) {
|
||||
return fmt.Sprintf("MIG-%d", c.Info.Id), MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetMinorNumber() (int, Return) {
|
||||
// TODO: This depends on the content of the mig-minors file and the (gpu, gi, ci) tuple
|
||||
panic("Not implemented: GetMinorNumber")
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) SetMigMode(Mode int) (Return, Return) {
|
||||
panic("Not implemented: SetMigMode")
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetMigMode() (int, int, Return) {
|
||||
panic("Not implemented: GetMigMode")
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetGpuInstanceProfileInfo(Profile int) (GpuInstanceProfileInfo, Return) {
|
||||
panic("Not implemented: GetGpuInstanceProfileInfo")
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) CreateGpuInstance(Info *GpuInstanceProfileInfo) (GpuInstance, Return) {
|
||||
panic("Not implemented: CreateGpuInstance")
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetGpuInstances(Info *GpuInstanceProfileInfo) ([]GpuInstance, Return) {
|
||||
panic("Not implemented: GetGpuInstances")
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetMaxMigDeviceCount() (int, Return) {
|
||||
panic("Not implemented: GetMaxMigDeviceCount")
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetMigDeviceHandleByIndex(Index int) (Device, Return) {
|
||||
panic("Not implemented: GetMigDeviceHandleByIndex")
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetDeviceHandleFromMigDeviceHandle() (Device, Return) {
|
||||
return c.Info.Device, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) IsMigDeviceHandle() (bool, Return) {
|
||||
return true, MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetComputeInstanceId() (int, Return) {
|
||||
return int(c.Info.Id), MockReturn(SUCCESS)
|
||||
}
|
||||
|
||||
func (c *MockA100ComputeInstance) GetGPUInstanceId() (int, Return) {
|
||||
info, r := c.Info.GpuInstance.GetInfo()
|
||||
if r.Value() != SUCCESS {
|
||||
return 0, MockReturn(r.Value())
|
||||
}
|
||||
return int(info.Id), MockReturn(SUCCESS)
|
||||
}
|
||||
188
internal/nvml/nvml.go
Normal file
188
internal/nvml/nvml.go
Normal file
@@ -0,0 +1,188 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package nvml
|
||||
|
||||
import (
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
)
|
||||
|
||||
type nvmlLib struct{}
|
||||
type nvmlDevice nvml.Device
|
||||
type nvmlGpuInstance nvml.GpuInstance
|
||||
type nvmlComputeInstance nvml.ComputeInstance
|
||||
|
||||
var _ Interface = (*nvmlLib)(nil)
|
||||
var _ Device = (*nvmlDevice)(nil)
|
||||
var _ GpuInstance = (*nvmlGpuInstance)(nil)
|
||||
var _ ComputeInstance = (*nvmlComputeInstance)(nil)
|
||||
|
||||
func New() Interface {
|
||||
return &nvmlLib{}
|
||||
}
|
||||
|
||||
func (n *nvmlLib) Init() Return {
|
||||
return nvmlReturn(nvml.Init())
|
||||
}
|
||||
|
||||
func (n *nvmlLib) Shutdown() Return {
|
||||
return nvmlReturn(nvml.Shutdown())
|
||||
}
|
||||
|
||||
func (n *nvmlLib) DeviceGetCount() (int, Return) {
|
||||
c, r := nvml.DeviceGetCount()
|
||||
return c, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (n *nvmlLib) DeviceGetHandleByIndex(index int) (Device, Return) {
|
||||
d, r := nvml.DeviceGetHandleByIndex(index)
|
||||
return nvmlDevice(d), nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (n *nvmlLib) SystemGetDriverVersion() (string, Return) {
|
||||
v, r := nvml.SystemGetDriverVersion()
|
||||
return v, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetIndex() (int, Return) {
|
||||
i, r := nvml.Device(d).GetIndex()
|
||||
return i, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetPciInfo() (PciInfo, Return) {
|
||||
p, r := nvml.Device(d).GetPciInfo()
|
||||
return PciInfo(p), nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetUUID() (string, Return) {
|
||||
u, r := nvml.Device(d).GetUUID()
|
||||
return u, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetMinorNumber() (int, Return) {
|
||||
m, r := nvml.Device(d).GetMinorNumber()
|
||||
return m, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) IsMigDeviceHandle() (bool, Return) {
|
||||
b, r := nvml.Device(d).IsMigDeviceHandle()
|
||||
return b, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetDeviceHandleFromMigDeviceHandle() (Device, Return) {
|
||||
p, r := nvml.Device(d).GetDeviceHandleFromMigDeviceHandle()
|
||||
return nvmlDevice(p), nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetGPUInstanceId() (int, Return) {
|
||||
gi, r := nvml.Device(d).GetGpuInstanceId()
|
||||
return gi, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetComputeInstanceId() (int, Return) {
|
||||
ci, r := nvml.Device(d).GetComputeInstanceId()
|
||||
return ci, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) SetMigMode(mode int) (Return, Return) {
|
||||
r1, r2 := nvml.Device(d).SetMigMode(mode)
|
||||
return nvmlReturn(r1), nvmlReturn(r2)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetMigMode() (int, int, Return) {
|
||||
s1, s2, r := nvml.Device(d).GetMigMode()
|
||||
return s1, s2, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetGpuInstanceProfileInfo(profile int) (GpuInstanceProfileInfo, Return) {
|
||||
p, r := nvml.Device(d).GetGpuInstanceProfileInfo(profile)
|
||||
return GpuInstanceProfileInfo(p), nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) CreateGpuInstance(info *GpuInstanceProfileInfo) (GpuInstance, Return) {
|
||||
gi, r := nvml.Device(d).CreateGpuInstance((*nvml.GpuInstanceProfileInfo)(info))
|
||||
return nvmlGpuInstance(gi), nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetGpuInstances(info *GpuInstanceProfileInfo) ([]GpuInstance, Return) {
|
||||
nvmlGis, r := nvml.Device(d).GetGpuInstances((*nvml.GpuInstanceProfileInfo)(info))
|
||||
var gis []GpuInstance
|
||||
for _, gi := range nvmlGis {
|
||||
gis = append(gis, nvmlGpuInstance(gi))
|
||||
}
|
||||
return gis, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetMaxMigDeviceCount() (int, Return) {
|
||||
m, r := nvml.Device(d).GetMaxMigDeviceCount()
|
||||
return m, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (d nvmlDevice) GetMigDeviceHandleByIndex(Index int) (Device, Return) {
|
||||
h, r := nvml.Device(d).GetMigDeviceHandleByIndex(Index)
|
||||
return nvmlDevice(h), nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (gi nvmlGpuInstance) GetInfo() (GpuInstanceInfo, Return) {
|
||||
i, r := nvml.GpuInstance(gi).GetInfo()
|
||||
info := GpuInstanceInfo{
|
||||
Device: nvmlDevice(i.Device),
|
||||
Id: i.Id,
|
||||
ProfileId: i.ProfileId,
|
||||
Placement: i.Placement,
|
||||
}
|
||||
return info, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (gi nvmlGpuInstance) GetComputeInstanceProfileInfo(profile int, engProfile int) (ComputeInstanceProfileInfo, Return) {
|
||||
p, r := nvml.GpuInstance(gi).GetComputeInstanceProfileInfo(profile, engProfile)
|
||||
return ComputeInstanceProfileInfo(p), nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (gi nvmlGpuInstance) CreateComputeInstance(info *ComputeInstanceProfileInfo) (ComputeInstance, Return) {
|
||||
ci, r := nvml.GpuInstance(gi).CreateComputeInstance((*nvml.ComputeInstanceProfileInfo)(info))
|
||||
return nvmlComputeInstance(ci), nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (gi nvmlGpuInstance) GetComputeInstances(info *ComputeInstanceProfileInfo) ([]ComputeInstance, Return) {
|
||||
nvmlCis, r := nvml.GpuInstance(gi).GetComputeInstances((*nvml.ComputeInstanceProfileInfo)(info))
|
||||
var cis []ComputeInstance
|
||||
for _, ci := range nvmlCis {
|
||||
cis = append(cis, nvmlComputeInstance(ci))
|
||||
}
|
||||
return cis, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (gi nvmlGpuInstance) Destroy() Return {
|
||||
r := nvml.GpuInstance(gi).Destroy()
|
||||
return nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (ci nvmlComputeInstance) GetInfo() (ComputeInstanceInfo, Return) {
|
||||
i, r := nvml.ComputeInstance(ci).GetInfo()
|
||||
info := ComputeInstanceInfo{
|
||||
Device: nvmlDevice(i.Device),
|
||||
GpuInstance: nvmlGpuInstance(i.GpuInstance),
|
||||
Id: i.Id,
|
||||
ProfileId: i.ProfileId,
|
||||
}
|
||||
return info, nvmlReturn(r)
|
||||
}
|
||||
|
||||
func (ci nvmlComputeInstance) Destroy() Return {
|
||||
r := nvml.ComputeInstance(ci).Destroy()
|
||||
return nvmlReturn(r)
|
||||
}
|
||||
110
internal/nvml/return.go
Normal file
110
internal/nvml/return.go
Normal file
@@ -0,0 +1,110 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package nvml
|
||||
|
||||
import (
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
)
|
||||
|
||||
type Return interface {
|
||||
Value() nvml.Return
|
||||
String() string
|
||||
Error() string
|
||||
}
|
||||
|
||||
type nvmlReturn nvml.Return
|
||||
type MockReturn nvml.Return
|
||||
|
||||
var _ Return = (*nvmlReturn)(nil)
|
||||
var _ Return = (*MockReturn)(nil)
|
||||
|
||||
func (r nvmlReturn) Value() nvml.Return {
|
||||
return nvml.Return(r)
|
||||
}
|
||||
|
||||
func (r nvmlReturn) String() string {
|
||||
return r.Error()
|
||||
}
|
||||
|
||||
func (r nvmlReturn) Error() string {
|
||||
return nvml.ErrorString(nvml.Return(r))
|
||||
}
|
||||
|
||||
func (r MockReturn) Value() nvml.Return {
|
||||
return nvml.Return(r)
|
||||
}
|
||||
|
||||
func (r MockReturn) String() string {
|
||||
return r.Error()
|
||||
}
|
||||
|
||||
func (r MockReturn) Error() string {
|
||||
switch nvml.Return(r) {
|
||||
case SUCCESS:
|
||||
return "SUCCESS"
|
||||
case ERROR_UNINITIALIZED:
|
||||
return "ERROR_UNINITIALIZED"
|
||||
case ERROR_INVALID_ARGUMENT:
|
||||
return "ERROR_INVALID_ARGUMENT"
|
||||
case ERROR_NOT_SUPPORTED:
|
||||
return "ERROR_NOT_SUPPORTED"
|
||||
case ERROR_NO_PERMISSION:
|
||||
return "ERROR_NO_PERMISSION"
|
||||
case ERROR_ALREADY_INITIALIZED:
|
||||
return "ERROR_ALREADY_INITIALIZED"
|
||||
case ERROR_NOT_FOUND:
|
||||
return "ERROR_NOT_FOUND"
|
||||
case ERROR_INSUFFICIENT_SIZE:
|
||||
return "ERROR_INSUFFICIENT_SIZE"
|
||||
case ERROR_INSUFFICIENT_POWER:
|
||||
return "ERROR_INSUFFICIENT_POWER"
|
||||
case ERROR_DRIVER_NOT_LOADED:
|
||||
return "ERROR_DRIVER_NOT_LOADED"
|
||||
case ERROR_TIMEOUT:
|
||||
return "ERROR_TIMEOUT"
|
||||
case ERROR_IRQ_ISSUE:
|
||||
return "ERROR_IRQ_ISSUE"
|
||||
case ERROR_LIBRARY_NOT_FOUND:
|
||||
return "ERROR_LIBRARY_NOT_FOUND"
|
||||
case ERROR_FUNCTION_NOT_FOUND:
|
||||
return "ERROR_FUNCTION_NOT_FOUND"
|
||||
case ERROR_CORRUPTED_INFOROM:
|
||||
return "ERROR_CORRUPTED_INFOROM"
|
||||
case ERROR_GPU_IS_LOST:
|
||||
return "ERROR_GPU_IS_LOST"
|
||||
case ERROR_RESET_REQUIRED:
|
||||
return "ERROR_RESET_REQUIRED"
|
||||
case ERROR_OPERATING_SYSTEM:
|
||||
return "ERROR_OPERATING_SYSTEM"
|
||||
case ERROR_LIB_RM_VERSION_MISMATCH:
|
||||
return "ERROR_LIB_RM_VERSION_MISMATCH"
|
||||
case ERROR_IN_USE:
|
||||
return "ERROR_IN_USE"
|
||||
case ERROR_MEMORY:
|
||||
return "ERROR_MEMORY"
|
||||
case ERROR_NO_DATA:
|
||||
return "ERROR_NO_DATA"
|
||||
case ERROR_VGPU_ECC_NOT_SUPPORTED:
|
||||
return "ERROR_VGPU_ECC_NOT_SUPPORTED"
|
||||
case ERROR_INSUFFICIENT_RESOURCES:
|
||||
return "ERROR_INSUFFICIENT_RESOURCES"
|
||||
case ERROR_UNKNOWN:
|
||||
return "ERROR_UNKNOWN"
|
||||
default:
|
||||
return "Unknown return value"
|
||||
}
|
||||
}
|
||||
78
internal/nvml/types.go
Normal file
78
internal/nvml/types.go
Normal file
@@ -0,0 +1,78 @@
|
||||
/*
|
||||
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package nvml
|
||||
|
||||
import (
|
||||
"github.com/NVIDIA/go-nvml/pkg/nvml"
|
||||
)
|
||||
|
||||
type Interface interface {
|
||||
Init() Return
|
||||
Shutdown() Return
|
||||
DeviceGetCount() (int, Return)
|
||||
DeviceGetHandleByIndex(Index int) (Device, Return)
|
||||
SystemGetDriverVersion() (string, Return)
|
||||
}
|
||||
|
||||
type Device interface {
|
||||
GetIndex() (int, Return)
|
||||
GetPciInfo() (PciInfo, Return)
|
||||
GetUUID() (string, Return)
|
||||
GetMinorNumber() (int, Return)
|
||||
IsMigDeviceHandle() (bool, Return)
|
||||
GetDeviceHandleFromMigDeviceHandle() (Device, Return)
|
||||
SetMigMode(Mode int) (Return, Return)
|
||||
GetMigMode() (int, int, Return)
|
||||
GetGpuInstanceProfileInfo(Profile int) (GpuInstanceProfileInfo, Return)
|
||||
CreateGpuInstance(Info *GpuInstanceProfileInfo) (GpuInstance, Return)
|
||||
GetGpuInstances(Info *GpuInstanceProfileInfo) ([]GpuInstance, Return)
|
||||
GetMaxMigDeviceCount() (int, Return)
|
||||
GetMigDeviceHandleByIndex(Index int) (Device, Return)
|
||||
GetGPUInstanceId() (int, Return)
|
||||
GetComputeInstanceId() (int, Return)
|
||||
}
|
||||
|
||||
type GpuInstance interface {
|
||||
GetInfo() (GpuInstanceInfo, Return)
|
||||
GetComputeInstanceProfileInfo(Profile int, EngProfile int) (ComputeInstanceProfileInfo, Return)
|
||||
CreateComputeInstance(Info *ComputeInstanceProfileInfo) (ComputeInstance, Return)
|
||||
GetComputeInstances(Info *ComputeInstanceProfileInfo) ([]ComputeInstance, Return)
|
||||
Destroy() Return
|
||||
}
|
||||
|
||||
type ComputeInstance interface {
|
||||
GetInfo() (ComputeInstanceInfo, Return)
|
||||
Destroy() Return
|
||||
}
|
||||
|
||||
type GpuInstanceInfo struct {
|
||||
Device Device
|
||||
Id uint32
|
||||
ProfileId uint32
|
||||
Placement nvml.GpuInstancePlacement
|
||||
}
|
||||
|
||||
type ComputeInstanceInfo struct {
|
||||
Device Device
|
||||
GpuInstance GpuInstance
|
||||
Id uint32
|
||||
ProfileId uint32
|
||||
}
|
||||
|
||||
type PciInfo nvml.PciInfo
|
||||
type GpuInstanceProfileInfo nvml.GpuInstanceProfileInfo
|
||||
type ComputeInstanceProfileInfo nvml.ComputeInstanceProfileInfo
|
||||
116
internal/proc/devices.go
Normal file
116
internal/proc/devices.go
Normal file
@@ -0,0 +1,116 @@
|
||||
/*
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package proc
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"log"
|
||||
"os"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// procDevicesPath is the file opened by GetNvidiaDevices to discover devices.
const procDevicesPath = "/proc/devices"

// nvidiaDevicePrefix is the name prefix used by processDeviceFile to decide
// whether a parsed entry is kept.
const nvidiaDevicePrefix = "nvidia"
|
||||
|
||||
// Device is a single entry of /proc/devices: a device name together with
// the major number listed for it.
type Device struct {
	// Name is the device name as it appears in the devices file.
	Name string
	// Major is the major number parsed from the same line.
	Major int
}
|
||||
|
||||
// NvidiaDevices represents the set of nvidia owned devices under /proc/devices
|
||||
type NvidiaDevices interface {
|
||||
Exists(name string) bool
|
||||
Get(name string) (Device, bool)
|
||||
}
|
||||
|
||||
type nvidiaDevices map[string]Device
|
||||
|
||||
var _ NvidiaDevices = nvidiaDevices(nil)
|
||||
|
||||
// Exists checks if a Device with a given name exists or not
|
||||
func (d nvidiaDevices) Exists(name string) bool {
|
||||
_, exists := d[name]
|
||||
return exists
|
||||
}
|
||||
|
||||
// Get a Device from NvidiaDevices
|
||||
func (d nvidiaDevices) Get(name string) (Device, bool) {
|
||||
device, exists := d[name]
|
||||
return device, exists
|
||||
}
|
||||
|
||||
func (d nvidiaDevices) add(devices ...Device) {
|
||||
for _, device := range devices {
|
||||
d[device.Name] = device
|
||||
}
|
||||
}
|
||||
|
||||
// NewMockNvidiaDevices returns NvidiaDevices populated from the devices passed in
|
||||
func NewMockNvidiaDevices(devices ...Device) NvidiaDevices {
|
||||
nvds := make(nvidiaDevices)
|
||||
nvds.add(devices...)
|
||||
return nvds
|
||||
}
|
||||
|
||||
// GetNvidiaDevices returns the set of NvidiaDevices on the machine
|
||||
func GetNvidiaDevices() (NvidiaDevices, error) {
|
||||
devicesFile, err := os.Open(procDevicesPath)
|
||||
if os.IsNotExist(err) {
|
||||
return nil, nil
|
||||
}
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("error opening devices file: %v", err)
|
||||
}
|
||||
defer devicesFile.Close()
|
||||
|
||||
return processDeviceFile(devicesFile), nil
|
||||
}
|
||||
|
||||
func processDeviceFile(devicesFile io.Reader) NvidiaDevices {
|
||||
nvidiaDevices := make(nvidiaDevices)
|
||||
scanner := bufio.NewScanner(devicesFile)
|
||||
for scanner.Scan() {
|
||||
device, major, err := processProcDeviceLine(scanner.Text())
|
||||
if err != nil {
|
||||
log.Printf("Skipping line in devices file: %v", err)
|
||||
continue
|
||||
}
|
||||
if strings.HasPrefix(device, nvidiaDevicePrefix) {
|
||||
nvidiaDevices.add(Device{device, major})
|
||||
}
|
||||
}
|
||||
return nvidiaDevices
|
||||
}
|
||||
|
||||
// processProcDeviceLine parses a single line of the devices file of the form
// "<major> <name>", ignoring surrounding whitespace.
//
// On success it returns the device name and major number. If both fields
// cannot be scanned it returns an empty name, a zero major and an error that
// quotes the original (untrimmed) line.
func processProcDeviceLine(line string) (string, int, error) {
	var (
		major int
		name  string
	)

	// The scan error is deliberately dropped: the count of successfully
	// scanned fields is what decides whether the line is usable.
	if count, _ := fmt.Sscanf(strings.TrimSpace(line), "%d %s", &major, &name); count != 2 {
		return "", 0, fmt.Errorf("unparsable line: %v", line)
	}

	return name, major, nil
}
|
||||
92
internal/proc/devices_test.go
Normal file
92
internal/proc/devices_test.go
Normal file
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
*/
|
||||
|
||||
package proc
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestNvidiaDevices(t *testing.T) {
|
||||
devices := []Device{
|
||||
{"nvidia-frontend", 195},
|
||||
{"nvidia-nvlink", 234},
|
||||
{"nvidia-caps", 235},
|
||||
{"nvidia-uvm", 510},
|
||||
{"nvidia-nvswitch", 511},
|
||||
}
|
||||
|
||||
nvidiaDevices := NewMockNvidiaDevices(devices...)
|
||||
for _, d := range devices {
|
||||
device, exists := nvidiaDevices.Get(d.Name)
|
||||
require.True(t, exists, "Unexpected missing device")
|
||||
require.Equal(t, device.Name, d.Name, "Unexpected device name")
|
||||
require.Equal(t, device.Major, d.Major, "Unexpected device major")
|
||||
}
|
||||
_, exists := nvidiaDevices.Get("bogus")
|
||||
require.False(t, exists, "Unexpected 'bogus' device found")
|
||||
}
|
||||
|
||||
func TestProcessDeviceFile(t *testing.T) {
|
||||
testCases := []struct {
|
||||
lines []string
|
||||
expected []Device
|
||||
}{
|
||||
{[]string{}, []Device{}},
|
||||
{[]string{"Not a valid line:"}, []Device{}},
|
||||
{[]string{"195 nvidia-frontend"}, []Device{{"nvidia-frontend", 195}}},
|
||||
{[]string{"195 nvidia-frontend", "235 nvidia-caps"}, []Device{{"nvidia-frontend", 195}, {"nvidia-caps", 235}}},
|
||||
{[]string{" 195 nvidia-frontend"}, []Device{{"nvidia-frontend", 195}}},
|
||||
{[]string{"Not a valid line:", "", "195 nvidia-frontend"}, []Device{{"nvidia-frontend", 195}}},
|
||||
{[]string{"195 not-nvidia-frontend"}, []Device{}},
|
||||
}
|
||||
for _, tc := range testCases {
|
||||
contents := strings.NewReader(strings.Join(tc.lines, "\n"))
|
||||
d := processDeviceFile(contents)
|
||||
require.Equalf(t, NewMockNvidiaDevices(tc.expected...), d, "testCase: %v", tc)
|
||||
}
|
||||
}
|
||||
|
||||
func TestProcessDeviceFileLine(t *testing.T) {
|
||||
testCases := []struct {
|
||||
line string
|
||||
name string
|
||||
major int
|
||||
err bool
|
||||
}{
|
||||
{"", "", 0, true},
|
||||
{"0", "", 0, true},
|
||||
{"notint nvidia-frontend", "", 0, true},
|
||||
{"195 nvidia-frontend", "nvidia-frontend", 195, false},
|
||||
{" 195 nvidia-frontend", "nvidia-frontend", 195, false},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
name, major, err := processProcDeviceLine(tc.line)
|
||||
|
||||
require.Equal(t, tc.name, name)
|
||||
require.Equal(t, tc.major, major)
|
||||
if tc.err {
|
||||
require.Error(t, err)
|
||||
} else {
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user