nvidia-container-toolkit/internal/nvml/mock.go

595 lines
17 KiB
Go
Raw Normal View History

/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package nvml
import (
	"fmt"
	"sort"
)
// MockServer is an in-memory stand-in for an NVML host. Its methods answer
// device queries from the Devices slice instead of talking to a driver.
type MockServer struct {
	// Devices holds the device handles served by this mock, addressed by
	// their position in the slice.
	Devices []Device
}
// MockLunaServer embeds MockServer and thereby exposes the same mocked
// methods.
type MockLunaServer struct {
	MockServer
}
// MockA100Device is a mock of a single A100 GPU together with the GPU
// instances that have been created on it.
type MockA100Device struct {
	// Index, MinorNumber and MigMode are the values handed back by GetIndex,
	// GetMinorNumber and GetMigMode respectively.
	Index, MinorNumber, MigMode int
	// GpuInstances is the set of GPU instances currently present on the device.
	GpuInstances map[*MockA100GpuInstance]struct{}
	// GpuInstanceCounter supplies the ID for the next GPU instance to create.
	GpuInstanceCounter uint32
}
// MockA100GpuInstance is a mock GPU instance together with the compute
// instances that have been created inside it.
type MockA100GpuInstance struct {
	// Info is the record returned by GetInfo.
	Info GpuInstanceInfo
	// ComputeInstances is the set of compute instances currently present in
	// this GPU instance.
	ComputeInstances map[*MockA100ComputeInstance]struct{}
	// ComputeInstanceCounter supplies the ID for the next compute instance to
	// create.
	ComputeInstanceCounter uint32
}
// MockA100ComputeInstance is a mock compute instance.
type MockA100ComputeInstance struct {
	// Info is the record returned by GetInfo.
	Info ComputeInstanceInfo
}
// Compile-time checks that the mock types satisfy the package interfaces.
var (
	_ Interface       = (*MockLunaServer)(nil)
	_ Device          = (*MockA100Device)(nil)
	_ GpuInstance     = (*MockA100GpuInstance)(nil)
	_ ComputeInstance = (*MockA100ComputeInstance)(nil)
)
// MockA100MIGProfiles holds the MIG profile tables served by the A100 mocks.
//
// GpuInstanceProfiles is keyed by GPU instance profile ID.
// ComputeInstanceProfiles is keyed first by the GPU instance profile ID of the
// parent and then by compute instance profile ID.
//
// The Shared*Count fields of every compute instance profile mirror the engine
// counts of the GPU instance profile it belongs to.
var MockA100MIGProfiles = struct {
	GpuInstanceProfiles     map[int]GpuInstanceProfileInfo
	ComputeInstanceProfiles map[int]map[int]ComputeInstanceProfileInfo
}{
	GpuInstanceProfiles: map[int]GpuInstanceProfileInfo{
		GPU_INSTANCE_PROFILE_1_SLICE: {
			Id:                  GPU_INSTANCE_PROFILE_1_SLICE,
			IsP2pSupported:      0,
			SliceCount:          1,
			InstanceCount:       7,
			MultiprocessorCount: 1,
			CopyEngineCount:     1,
			DecoderCount:        0,
			EncoderCount:        0,
			JpegCount:           0,
			OfaCount:            0,
			MemorySizeMB:        5120,
		},
		GPU_INSTANCE_PROFILE_2_SLICE: {
			Id:                  GPU_INSTANCE_PROFILE_2_SLICE,
			IsP2pSupported:      0,
			SliceCount:          2,
			InstanceCount:       3,
			MultiprocessorCount: 2,
			CopyEngineCount:     2,
			DecoderCount:        1,
			EncoderCount:        1,
			JpegCount:           0,
			OfaCount:            0,
			MemorySizeMB:        10240,
		},
		GPU_INSTANCE_PROFILE_3_SLICE: {
			Id:                  GPU_INSTANCE_PROFILE_3_SLICE,
			IsP2pSupported:      0,
			SliceCount:          3,
			InstanceCount:       2,
			MultiprocessorCount: 3,
			CopyEngineCount:     4,
			DecoderCount:        2,
			EncoderCount:        2,
			JpegCount:           0,
			OfaCount:            0,
			MemorySizeMB:        20480,
		},
		GPU_INSTANCE_PROFILE_4_SLICE: {
			Id:                  GPU_INSTANCE_PROFILE_4_SLICE,
			IsP2pSupported:      0,
			SliceCount:          4,
			InstanceCount:       1,
			MultiprocessorCount: 4,
			CopyEngineCount:     4,
			DecoderCount:        2,
			EncoderCount:        2,
			JpegCount:           0,
			OfaCount:            0,
			MemorySizeMB:        20480,
		},
		GPU_INSTANCE_PROFILE_7_SLICE: {
			Id:                  GPU_INSTANCE_PROFILE_7_SLICE,
			IsP2pSupported:      0,
			SliceCount:          7,
			InstanceCount:       1,
			MultiprocessorCount: 7,
			CopyEngineCount:     8,
			DecoderCount:        5,
			EncoderCount:        5,
			JpegCount:           1,
			OfaCount:            1,
			MemorySizeMB:        40960,
		},
	},
	ComputeInstanceProfiles: map[int]map[int]ComputeInstanceProfileInfo{
		GPU_INSTANCE_PROFILE_1_SLICE: {
			COMPUTE_INSTANCE_PROFILE_1_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_1_SLICE,
				SliceCount:            1,
				InstanceCount:         1,
				MultiprocessorCount:   1,
				SharedCopyEngineCount: 1,
				SharedDecoderCount:    0,
				SharedEncoderCount:    0,
				SharedJpegCount:       0,
				SharedOfaCount:        0,
			},
		},
		GPU_INSTANCE_PROFILE_2_SLICE: {
			COMPUTE_INSTANCE_PROFILE_1_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_1_SLICE,
				SliceCount:            1,
				InstanceCount:         2,
				MultiprocessorCount:   1,
				SharedCopyEngineCount: 2,
				SharedDecoderCount:    1,
				SharedEncoderCount:    1,
				SharedJpegCount:       0,
				SharedOfaCount:        0,
			},
			COMPUTE_INSTANCE_PROFILE_2_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_2_SLICE,
				SliceCount:            2,
				InstanceCount:         1,
				MultiprocessorCount:   2,
				SharedCopyEngineCount: 2,
				SharedDecoderCount:    1,
				SharedEncoderCount:    1,
				SharedJpegCount:       0,
				SharedOfaCount:        0,
			},
		},
		// All compute instances of a 3-slice GPU instance share that GPU
		// instance's two encoders, so SharedEncoderCount is 2 for each of the
		// profiles below, matching the parent's EncoderCount.
		GPU_INSTANCE_PROFILE_3_SLICE: {
			COMPUTE_INSTANCE_PROFILE_1_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_1_SLICE,
				SliceCount:            1,
				InstanceCount:         3,
				MultiprocessorCount:   1,
				SharedCopyEngineCount: 4,
				SharedDecoderCount:    2,
				SharedEncoderCount:    2,
				SharedJpegCount:       0,
				SharedOfaCount:        0,
			},
			COMPUTE_INSTANCE_PROFILE_2_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_2_SLICE,
				SliceCount:            2,
				InstanceCount:         1,
				MultiprocessorCount:   2,
				SharedCopyEngineCount: 4,
				SharedDecoderCount:    2,
				SharedEncoderCount:    2,
				SharedJpegCount:       0,
				SharedOfaCount:        0,
			},
			COMPUTE_INSTANCE_PROFILE_3_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_3_SLICE,
				SliceCount:            3,
				InstanceCount:         1,
				MultiprocessorCount:   3,
				SharedCopyEngineCount: 4,
				SharedDecoderCount:    2,
				SharedEncoderCount:    2,
				SharedJpegCount:       0,
				SharedOfaCount:        0,
			},
		},
		GPU_INSTANCE_PROFILE_4_SLICE: {
			COMPUTE_INSTANCE_PROFILE_1_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_1_SLICE,
				SliceCount:            1,
				InstanceCount:         4,
				MultiprocessorCount:   1,
				SharedCopyEngineCount: 4,
				SharedDecoderCount:    2,
				SharedEncoderCount:    2,
				SharedJpegCount:       0,
				SharedOfaCount:        0,
			},
			COMPUTE_INSTANCE_PROFILE_2_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_2_SLICE,
				SliceCount:            2,
				InstanceCount:         2,
				MultiprocessorCount:   2,
				SharedCopyEngineCount: 4,
				SharedDecoderCount:    2,
				SharedEncoderCount:    2,
				SharedJpegCount:       0,
				SharedOfaCount:        0,
			},
			COMPUTE_INSTANCE_PROFILE_4_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_4_SLICE,
				SliceCount:            4,
				InstanceCount:         1,
				MultiprocessorCount:   4,
				SharedCopyEngineCount: 4,
				SharedDecoderCount:    2,
				SharedEncoderCount:    2,
				SharedJpegCount:       0,
				SharedOfaCount:        0,
			},
		},
		GPU_INSTANCE_PROFILE_7_SLICE: {
			COMPUTE_INSTANCE_PROFILE_1_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_1_SLICE,
				SliceCount:            1,
				InstanceCount:         7,
				MultiprocessorCount:   1,
				SharedCopyEngineCount: 8,
				SharedDecoderCount:    5,
				SharedEncoderCount:    5,
				SharedJpegCount:       1,
				SharedOfaCount:        1,
			},
			COMPUTE_INSTANCE_PROFILE_2_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_2_SLICE,
				SliceCount:            2,
				InstanceCount:         3,
				MultiprocessorCount:   2,
				SharedCopyEngineCount: 8,
				SharedDecoderCount:    5,
				SharedEncoderCount:    5,
				SharedJpegCount:       1,
				SharedOfaCount:        1,
			},
			COMPUTE_INSTANCE_PROFILE_3_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_3_SLICE,
				SliceCount:            3,
				InstanceCount:         2,
				MultiprocessorCount:   3,
				SharedCopyEngineCount: 8,
				SharedDecoderCount:    5,
				SharedEncoderCount:    5,
				SharedJpegCount:       1,
				SharedOfaCount:        1,
			},
			COMPUTE_INSTANCE_PROFILE_4_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_4_SLICE,
				SliceCount:            4,
				InstanceCount:         1,
				MultiprocessorCount:   4,
				SharedCopyEngineCount: 8,
				SharedDecoderCount:    5,
				SharedEncoderCount:    5,
				SharedJpegCount:       1,
				SharedOfaCount:        1,
			},
			COMPUTE_INSTANCE_PROFILE_7_SLICE: {
				Id:                    COMPUTE_INSTANCE_PROFILE_7_SLICE,
				SliceCount:            7,
				InstanceCount:         1,
				MultiprocessorCount:   7,
				SharedCopyEngineCount: 8,
				SharedDecoderCount:    5,
				SharedEncoderCount:    5,
				SharedJpegCount:       1,
				SharedOfaCount:        1,
			},
		},
	},
}
// NewMockNVMLServer builds a MockServer that serves exactly the given devices,
// in the order they were passed.
func NewMockNVMLServer(devices ...Device) Interface {
	server := new(MockServer)
	server.Devices = devices
	return server
}
// NewMockNVMLOnLunaServer builds a mock server populated with eight mock A100
// devices carrying indices 0 through 7.
func NewMockNVMLOnLunaServer() Interface {
	const gpuCount = 8

	devices := make([]Device, gpuCount)
	for i := range devices {
		devices[i] = NewMockA100Device(i)
	}
	return NewMockNVMLServer(devices...)
}
// NewMockA100Device creates a mock A100 with the given index, an empty set of
// GPU instances and a GPU instance counter starting at zero.
func NewMockA100Device(index int) Device {
	dev := new(MockA100Device)
	dev.Index = index
	dev.GpuInstances = map[*MockA100GpuInstance]struct{}{}
	dev.GpuInstanceCounter = 0
	return dev
}
// NewMockA100GpuInstance creates a mock GPU instance described by info, with
// an empty set of compute instances and a compute instance counter of zero.
func NewMockA100GpuInstance(info GpuInstanceInfo) GpuInstance {
	gi := new(MockA100GpuInstance)
	gi.Info = info
	gi.ComputeInstances = map[*MockA100ComputeInstance]struct{}{}
	gi.ComputeInstanceCounter = 0
	return gi
}
// NewMockA100ComputeInstance creates a mock compute instance described by info.
func NewMockA100ComputeInstance(info ComputeInstanceInfo) ComputeInstance {
	ci := new(MockA100ComputeInstance)
	ci.Info = info
	return ci
}
// Init does nothing and always reports SUCCESS.
func (n *MockServer) Init() Return {
	status := MockReturn(SUCCESS)
	return status
}
// Shutdown does nothing and always reports SUCCESS.
func (n *MockServer) Shutdown() Return {
	status := MockReturn(SUCCESS)
	return status
}
// DeviceGetCount reports how many devices the mock server holds.
func (n *MockServer) DeviceGetCount() (int, Return) {
	count := len(n.Devices)
	return count, MockReturn(SUCCESS)
}
// DeviceGetHandleByIndex returns the device stored at position index. An index
// outside [0, len(Devices)) yields a nil device and ERROR_INVALID_ARGUMENT.
func (n *MockServer) DeviceGetHandleByIndex(index int) (Device, Return) {
	if index >= 0 && index < len(n.Devices) {
		return n.Devices[index], MockReturn(SUCCESS)
	}
	return nil, MockReturn(ERROR_INVALID_ARGUMENT)
}
// SystemGetDriverVersion reports the fixed placeholder version "999.99".
func (n *MockServer) SystemGetDriverVersion() (string, Return) {
	const driverVersion = "999.99"
	return driverVersion, MockReturn(SUCCESS)
}
// GetIndex returns the index the device was created with.
func (d *MockA100Device) GetIndex() (int, Return) {
	index := d.Index
	return index, MockReturn(SUCCESS)
}
// GetPciInfo returns fixed PCI details: the bus ID "0000FFFF:FF:FF.F" copied
// into the start of BusId (the remaining bytes stay zero) and the PCI device
// ID 0x20B010DE. The same values are returned for every mock device.
func (d *MockA100Device) GetPciInfo() (PciInfo, Return) {
	const busID = "0000FFFF:FF:FF.F"

	var info PciInfo
	info.PciDeviceId = 0x20B010DE
	for i := 0; i < len(busID); i++ {
		info.BusId[i] = int8(busID[i])
	}
	return info, MockReturn(SUCCESS)
}
// GetUUID returns a synthetic UUID of the form "GPU-<index>".
func (d *MockA100Device) GetUUID() (string, Return) {
	uuid := fmt.Sprintf("GPU-%d", d.Index)
	return uuid, MockReturn(SUCCESS)
}
// GetMinorNumber returns the MinorNumber field of the device.
func (d *MockA100Device) GetMinorNumber() (int, Return) {
	minor := d.MinorNumber
	return minor, MockReturn(SUCCESS)
}
// SetMigMode stores mode on the device; it takes effect immediately, so a
// following GetMigMode reports it. Both returned statuses are SUCCESS.
func (d *MockA100Device) SetMigMode(mode int) (Return, Return) {
	d.MigMode = mode

	first, second := MockReturn(SUCCESS), MockReturn(SUCCESS)
	return first, second
}
// GetMigMode returns the stored MIG mode for both integer results.
// NOTE(review): presumably these are the current and pending modes, as in
// NVML; the mock does not distinguish between them.
func (d *MockA100Device) GetMigMode() (int, int, Return) {
	mode := d.MigMode
	return mode, mode, MockReturn(SUCCESS)
}
// GetGpuInstanceProfileInfo looks up giProfileId in MockA100MIGProfiles.
//
// It returns ERROR_INVALID_ARGUMENT when giProfileId lies outside
// [0, GPU_INSTANCE_PROFILE_COUNT) and ERROR_NOT_SUPPORTED when the ID is in
// range but has no entry in the table. In both cases the returned info is the
// zero value.
func (d *MockA100Device) GetGpuInstanceProfileInfo(giProfileId int) (GpuInstanceProfileInfo, Return) {
	profile, known := MockA100MIGProfiles.GpuInstanceProfiles[giProfileId]
	switch {
	case giProfileId < 0 || giProfileId >= GPU_INSTANCE_PROFILE_COUNT:
		return GpuInstanceProfileInfo{}, MockReturn(ERROR_INVALID_ARGUMENT)
	case !known:
		return GpuInstanceProfileInfo{}, MockReturn(ERROR_NOT_SUPPORTED)
	default:
		return profile, MockReturn(SUCCESS)
	}
}
// CreateGpuInstance creates a GPU instance with the profile ID taken from
// info, registers it on the device and returns it. The new instance receives
// the current value of GpuInstanceCounter as its ID, after which the counter
// is advanced by one.
func (d *MockA100Device) CreateGpuInstance(info *GpuInstanceProfileInfo) (GpuInstance, Return) {
	created := NewMockA100GpuInstance(GpuInstanceInfo{
		Device:    d,
		Id:        d.GpuInstanceCounter,
		ProfileId: info.Id,
	})
	d.GpuInstanceCounter++

	d.GpuInstances[created.(*MockA100GpuInstance)] = struct{}{}
	return created, MockReturn(SUCCESS)
}
// GetGpuInstances returns the GPU instances on the device whose profile ID
// equals info.Id, ordered by ascending instance ID. The result is nil when no
// instance matches.
//
// The instances are kept in a map, whose iteration order is randomized, so the
// matches are sorted before being returned to give callers a stable order.
func (d *MockA100Device) GetGpuInstances(info *GpuInstanceProfileInfo) ([]GpuInstance, Return) {
	type match struct {
		id uint32
		gi GpuInstance
	}

	var matches []match
	for gi := range d.GpuInstances {
		if gi.Info.ProfileId == info.Id {
			matches = append(matches, match{id: gi.Info.Id, gi: gi})
		}
	}
	sort.Slice(matches, func(i, j int) bool { return matches[i].id < matches[j].id })

	// Appending to a nil slice keeps the "no matches" result nil.
	var gis []GpuInstance
	for _, m := range matches {
		gis = append(gis, m.gi)
	}
	return gis, MockReturn(SUCCESS)
}
// GetMaxMigDeviceCount returns the sum of ComputeInstanceCounter over all GPU
// instances currently on the device. The counters only ever grow, so the
// result counts compute instances that were created, including ones that have
// since been destroyed.
func (d *MockA100Device) GetMaxMigDeviceCount() (int, Return) {
	total := 0
	for gi := range d.GpuInstances {
		total += int(gi.ComputeInstanceCounter)
	}
	return total, MockReturn(SUCCESS)
}
// GetMigDeviceHandleByIndex returns the Index-th MIG device (compute instance)
// that currently exists on the device.
//
// MIG devices are numbered densely from zero in ascending order of
// (GPU instance ID, compute instance ID). Only compute instances that are
// still present are counted, so destroyed instances never occupy an index.
// Any Index outside [0, number of MIG devices), including a negative one,
// yields a nil device and ERROR_NOT_FOUND.
//
// Both the GPU instances and their compute instances live in maps, whose
// iteration order is randomized; the explicit sort is what makes the
// index-to-device mapping stable from one call to the next.
func (d *MockA100Device) GetMigDeviceHandleByIndex(Index int) (Device, Return) {
	type migDevice struct {
		giID, ciID uint32
		handle     Device
	}

	var migs []migDevice
	for gi := range d.GpuInstances {
		for ci := range gi.ComputeInstances {
			migs = append(migs, migDevice{giID: gi.Info.Id, ciID: ci.Info.Id, handle: ci})
		}
	}
	if Index < 0 || Index >= len(migs) {
		return nil, MockReturn(ERROR_NOT_FOUND)
	}

	sort.Slice(migs, func(i, j int) bool {
		if migs[i].giID != migs[j].giID {
			return migs[i].giID < migs[j].giID
		}
		return migs[i].ciID < migs[j].ciID
	})
	return migs[Index].handle, MockReturn(SUCCESS)
}
// GetDeviceHandleFromMigDeviceHandle is not supported on a full-GPU mock; it
// always returns a nil device and ERROR_NOT_SUPPORTED.
func (d *MockA100Device) GetDeviceHandleFromMigDeviceHandle() (Device, Return) {
	var parent Device
	return parent, MockReturn(ERROR_NOT_SUPPORTED)
}
// IsMigDeviceHandle always reports false for a MockA100Device.
func (d *MockA100Device) IsMigDeviceHandle() (bool, Return) {
	const isMig = false
	return isMig, MockReturn(SUCCESS)
}
// GetComputeInstanceId is not implemented for MockA100Device and panics when
// called.
func (d *MockA100Device) GetComputeInstanceId() (int, Return) {
	const reason = "Not implemented: GetComputeInstanceId"
	panic(reason)
}
// GetGPUInstanceId is not implemented for MockA100Device and panics when
// called.
func (d *MockA100Device) GetGPUInstanceId() (int, Return) {
	const reason = "Not implemented: GetGPUInstanceId"
	panic(reason)
}
// GetInfo returns a copy of the GPU instance's Info record.
func (gi *MockA100GpuInstance) GetInfo() (GpuInstanceInfo, Return) {
	info := gi.Info
	return info, MockReturn(SUCCESS)
}
// GetComputeInstanceProfileInfo looks up the compute instance profile
// ciProfileId for this GPU instance's own profile in MockA100MIGProfiles.
//
// It returns ERROR_INVALID_ARGUMENT when ciProfileId lies outside
// [0, COMPUTE_INSTANCE_PROFILE_COUNT), and ERROR_NOT_SUPPORTED when
// ciEngProfileId is not COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED or when the
// table has no entry for the (GPU instance profile, compute instance profile)
// pair. On any error the returned info is the zero value.
func (gi *MockA100GpuInstance) GetComputeInstanceProfileInfo(ciProfileId int, ciEngProfileId int) (ComputeInstanceProfileInfo, Return) {
	var none ComputeInstanceProfileInfo

	if ciProfileId < 0 || ciProfileId >= COMPUTE_INSTANCE_PROFILE_COUNT {
		return none, MockReturn(ERROR_INVALID_ARGUMENT)
	}
	if ciEngProfileId != COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED {
		return none, MockReturn(ERROR_NOT_SUPPORTED)
	}

	// Indexing the (nil) inner map of an unknown GPU instance profile is safe
	// and simply reports the entry as absent.
	perGI := MockA100MIGProfiles.ComputeInstanceProfiles[int(gi.Info.ProfileId)]
	profile, known := perGI[ciProfileId]
	if !known {
		return none, MockReturn(ERROR_NOT_SUPPORTED)
	}
	return profile, MockReturn(SUCCESS)
}
// CreateComputeInstance creates a compute instance with the profile ID taken
// from info, registers it in this GPU instance and returns it. The new
// instance receives the current value of ComputeInstanceCounter as its ID,
// after which the counter is advanced by one.
func (gi *MockA100GpuInstance) CreateComputeInstance(info *ComputeInstanceProfileInfo) (ComputeInstance, Return) {
	created := NewMockA100ComputeInstance(ComputeInstanceInfo{
		Device:      gi.Info.Device,
		GpuInstance: gi,
		Id:          gi.ComputeInstanceCounter,
		ProfileId:   info.Id,
	})
	gi.ComputeInstanceCounter++

	gi.ComputeInstances[created.(*MockA100ComputeInstance)] = struct{}{}
	return created, MockReturn(SUCCESS)
}
// GetComputeInstances returns the compute instances in this GPU instance whose
// profile ID equals info.Id, ordered by ascending instance ID. The result is
// nil when no instance matches.
//
// The instances are kept in a map, whose iteration order is randomized, so the
// matches are sorted before being returned to give callers a stable order.
func (gi *MockA100GpuInstance) GetComputeInstances(info *ComputeInstanceProfileInfo) ([]ComputeInstance, Return) {
	type match struct {
		id uint32
		ci ComputeInstance
	}

	var matches []match
	for ci := range gi.ComputeInstances {
		if ci.Info.ProfileId == info.Id {
			matches = append(matches, match{id: ci.Info.Id, ci: ci})
		}
	}
	sort.Slice(matches, func(i, j int) bool { return matches[i].id < matches[j].id })

	// Appending to a nil slice keeps the "no matches" result nil.
	var cis []ComputeInstance
	for _, m := range matches {
		cis = append(cis, m.ci)
	}
	return cis, MockReturn(SUCCESS)
}
// Destroy removes this GPU instance from the set held by its parent device.
// The parent is taken from Info.Device, which must be a *MockA100Device; any
// other value makes the type assertion panic.
func (gi *MockA100GpuInstance) Destroy() Return {
	parent := gi.Info.Device.(*MockA100Device)
	delete(parent.GpuInstances, gi)
	return MockReturn(SUCCESS)
}
// GetInfo returns a copy of the compute instance's Info record.
func (ci *MockA100ComputeInstance) GetInfo() (ComputeInstanceInfo, Return) {
	info := ci.Info
	return info, MockReturn(SUCCESS)
}
// Destroy removes this compute instance from the set held by its parent GPU
// instance. The parent is taken from Info.GpuInstance, which must be a
// *MockA100GpuInstance; any other value makes the type assertion panic.
func (ci *MockA100ComputeInstance) Destroy() Return {
	parent := ci.Info.GpuInstance.(*MockA100GpuInstance)
	delete(parent.ComputeInstances, ci)
	return MockReturn(SUCCESS)
}
// A compute instance doubles as a MIG device handle, so the mock must satisfy
// the Device interface as well; this is checked at compile time.
var (
	_ Device = (*MockA100ComputeInstance)(nil)
)
// GetIndex returns the compute instance ID converted to int.
func (c *MockA100ComputeInstance) GetIndex() (int, Return) {
	index := int(c.Info.Id)
	return index, MockReturn(SUCCESS)
}
// GetPciInfo is not implemented for MIG device handles and panics when called.
func (c *MockA100ComputeInstance) GetPciInfo() (PciInfo, Return) {
	// TODO: How does this behave on an actual MIG system?
	const reason = "Not implemented: GetPciInfo"
	panic(reason)
}
// GetUUID returns a synthetic UUID of the form "MIG-<compute instance ID>".
func (c *MockA100ComputeInstance) GetUUID() (string, Return) {
	uuid := fmt.Sprintf("MIG-%d", c.Info.Id)
	return uuid, MockReturn(SUCCESS)
}
// GetMinorNumber is not implemented for MIG device handles and panics when
// called.
func (c *MockA100ComputeInstance) GetMinorNumber() (int, Return) {
	// TODO: This depends on the content of the mig-minors file and the (gpu, gi, ci) tuple
	const reason = "Not implemented: GetMinorNumber"
	panic(reason)
}
// SetMigMode is not implemented for MIG device handles and panics when called.
func (c *MockA100ComputeInstance) SetMigMode(Mode int) (Return, Return) {
	const reason = "Not implemented: SetMigMode"
	panic(reason)
}
// GetMigMode is not implemented for MIG device handles and panics when called.
func (c *MockA100ComputeInstance) GetMigMode() (int, int, Return) {
	const reason = "Not implemented: GetMigMode"
	panic(reason)
}
// GetGpuInstanceProfileInfo is not implemented for MIG device handles and
// panics when called.
func (c *MockA100ComputeInstance) GetGpuInstanceProfileInfo(Profile int) (GpuInstanceProfileInfo, Return) {
	const reason = "Not implemented: GetGpuInstanceProfileInfo"
	panic(reason)
}
// CreateGpuInstance is not implemented for MIG device handles and panics when
// called.
func (c *MockA100ComputeInstance) CreateGpuInstance(Info *GpuInstanceProfileInfo) (GpuInstance, Return) {
	const reason = "Not implemented: CreateGpuInstance"
	panic(reason)
}
// GetGpuInstances is not implemented for MIG device handles and panics when
// called.
func (c *MockA100ComputeInstance) GetGpuInstances(Info *GpuInstanceProfileInfo) ([]GpuInstance, Return) {
	const reason = "Not implemented: GetGpuInstances"
	panic(reason)
}
// GetMaxMigDeviceCount is not implemented for MIG device handles and panics
// when called.
func (c *MockA100ComputeInstance) GetMaxMigDeviceCount() (int, Return) {
	const reason = "Not implemented: GetMaxMigDeviceCount"
	panic(reason)
}
// GetMigDeviceHandleByIndex is not implemented for MIG device handles and
// panics when called.
func (c *MockA100ComputeInstance) GetMigDeviceHandleByIndex(Index int) (Device, Return) {
	const reason = "Not implemented: GetMigDeviceHandleByIndex"
	panic(reason)
}
// GetDeviceHandleFromMigDeviceHandle returns the device recorded in the
// compute instance's Info.
func (c *MockA100ComputeInstance) GetDeviceHandleFromMigDeviceHandle() (Device, Return) {
	parent := c.Info.Device
	return parent, MockReturn(SUCCESS)
}
// IsMigDeviceHandle always reports true for a MockA100ComputeInstance.
func (c *MockA100ComputeInstance) IsMigDeviceHandle() (bool, Return) {
	const isMig = true
	return isMig, MockReturn(SUCCESS)
}
// GetComputeInstanceId returns the compute instance ID converted to int.
func (c *MockA100ComputeInstance) GetComputeInstanceId() (int, Return) {
	id := int(c.Info.Id)
	return id, MockReturn(SUCCESS)
}
// GetGPUInstanceId returns the ID of the GPU instance this compute instance
// belongs to, obtained through the parent's GetInfo. If GetInfo does not
// report SUCCESS, the result is 0 together with that same status value.
func (c *MockA100ComputeInstance) GetGPUInstanceId() (int, Return) {
	parentInfo, ret := c.Info.GpuInstance.GetInfo()
	if code := ret.Value(); code != SUCCESS {
		return 0, MockReturn(code)
	}
	return int(parentInfo.Id), MockReturn(SUCCESS)
}