From b1f509ebdee6b5b538055d73259712d69ca4079d Mon Sep 17 00:00:00 2001 From: Evan Lezar Date: Thu, 6 Feb 2025 23:05:55 +0100 Subject: [PATCH] [no-relnote] Add basic CDI generate test Signed-off-by: Evan Lezar --- cmd/nvidia-ctk/cdi/generate/generate.go | 7 + cmd/nvidia-ctk/cdi/generate/generate_test.go | 151 ++++++ .../go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go | 380 ++++++++++++++ .../pkg/nvml/mock/dgxa100/mig-profile.go | 471 ++++++++++++++++++ vendor/modules.txt | 1 + 5 files changed, 1010 insertions(+) create mode 100644 cmd/nvidia-ctk/cdi/generate/generate_test.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go create mode 100644 vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go diff --git a/cmd/nvidia-ctk/cdi/generate/generate.go b/cmd/nvidia-ctk/cdi/generate/generate.go index 598a40c1..b187335b 100644 --- a/cmd/nvidia-ctk/cdi/generate/generate.go +++ b/cmd/nvidia-ctk/cdi/generate/generate.go @@ -25,6 +25,8 @@ import ( "github.com/urfave/cli/v2" cdi "tags.cncf.io/container-device-interface/pkg/parser" + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/nvidia-container-toolkit/internal/config" "github.com/NVIDIA/nvidia-container-toolkit/internal/logger" "github.com/NVIDIA/nvidia-container-toolkit/internal/platform-support/tegra/csv" @@ -60,6 +62,9 @@ type options struct { files cli.StringSlice ignorePatterns cli.StringSlice } + + // the following are used for dependency injection during spec generation. + nvmllib nvml.Interface } // NewCommand constructs a generate-cdi command with the specified logger @@ -269,6 +274,8 @@ func (m command) generateSpec(opts *options) (spec.Interface, error) { nvcdi.WithLibrarySearchPaths(opts.librarySearchPaths.Value()), nvcdi.WithCSVFiles(opts.csv.files.Value()), nvcdi.WithCSVIgnorePatterns(opts.csv.ignorePatterns.Value()), + // We set the following to allow for dependency injection: + nvcdi.WithNvmlLib(opts.nvmllib), ) if err != nil { return nil, fmt.Errorf("failed to create CDI library: %v", err) diff --git a/cmd/nvidia-ctk/cdi/generate/generate_test.go b/cmd/nvidia-ctk/cdi/generate/generate_test.go new file mode 100644 index 00000000..b428dca3 --- /dev/null +++ b/cmd/nvidia-ctk/cdi/generate/generate_test.go @@ -0,0 +1,151 @@ +/** +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +**/ + +package generate + +import ( + "bytes" + "path/filepath" + "strings" + "testing" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100" + testlog "github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/nvidia-container-toolkit/internal/test" +) + +func TestGenerateSpec(t *testing.T) { + t.Setenv("__NVCT_TESTING_DEVICES_ARE_FILES", "true") + moduleRoot, err := test.GetModuleRoot() + require.NoError(t, err) + + driverRoot := filepath.Join(moduleRoot, "testdata", "lookup", "rootfs-1") + + logger, _ := testlog.NewNullLogger() + testCases := []struct { + description string + options options + expectedValidateError error + expectedOptions options + expectedError error + expectedSpec string + }{ + { + description: "default", + options: options{ + format: "yaml", + mode: "nvml", + vendor: "example.com", + class: "device", + driverRoot: driverRoot, + }, + expectedOptions: options{ + format: "yaml", + mode: "nvml", + vendor: "example.com", + class: "device", + nvidiaCDIHookPath: "/usr/bin/nvidia-cdi-hook", + driverRoot: driverRoot, + }, + expectedSpec: `--- +cdiVersion: 0.5.0 +containerEdits: + deviceNodes: + - hostPath: {{ .driverRoot }}/dev/nvidiactl + path: /dev/nvidiactl + env: + - NVIDIA_VISIBLE_DEVICES=void + hooks: + - args: + - nvidia-cdi-hook + - create-symlinks + - --link + - libcuda.so.1::/lib/x86_64-linux-gnu/libcuda.so + hookName: createContainer + path: /usr/bin/nvidia-cdi-hook + - args: + - nvidia-cdi-hook + - update-ldcache + - --folder + - /lib/x86_64-linux-gnu + hookName: createContainer + path: /usr/bin/nvidia-cdi-hook + mounts: + - containerPath: /lib/x86_64-linux-gnu/libcuda.so.999.88.77 + hostPath: {{ .driverRoot }}/lib/x86_64-linux-gnu/libcuda.so.999.88.77 + options: + - ro + - nosuid + - nodev + - bind +devices: +- containerEdits: + deviceNodes: + - hostPath: {{ .driverRoot }}/dev/nvidia0 + path: /dev/nvidia0 + name: "0" +- containerEdits: + deviceNodes: + - hostPath: {{ .driverRoot }}/dev/nvidia0 + path: /dev/nvidia0 + name: all +kind: example.com/device +`, + }, + } + + for _, tc := range testCases { + t.Run(tc.description, func(t *testing.T) { + c := command{ + logger: logger, + } + + err := c.validateFlags(nil, &tc.options) + require.ErrorIs(t, err, tc.expectedValidateError) + require.EqualValues(t, tc.expectedOptions, tc.options) + + // Set up a mock server, reusing the DGX A100 mock. + server := dgxa100.New() + // Override the driver version to match the version in our mock filesystem. + server.SystemGetDriverVersionFunc = func() (string, nvml.Return) { + return "999.88.77", nvml.SUCCESS + } + // Set the device count to 1 explicitly since we only have a single device node. + server.DeviceGetCountFunc = func() (int, nvml.Return) { + return 1, nvml.SUCCESS + } + for _, d := range server.Devices { + // TODO: This is not implemented in the mock. + (d.(*dgxa100.Device)).GetMaxMigDeviceCountFunc = func() (int, nvml.Return) { + return 0, nvml.SUCCESS + } + } + tc.options.nvmllib = server + + spec, err := c.generateSpec(&tc.options) + require.ErrorIs(t, err, tc.expectedError) + + var buf bytes.Buffer + _, err = spec.WriteTo(&buf) + require.NoError(t, err) + + require.Equal(t, strings.ReplaceAll(tc.expectedSpec, "{{ .driverRoot }}", driverRoot), buf.String()) + }) + } +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go new file mode 100644 index 00000000..7654dc7d --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/dgxa100.go @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxa100 + +import ( + "fmt" + "sync" + + "github.com/NVIDIA/go-nvml/pkg/nvml" + "github.com/NVIDIA/go-nvml/pkg/nvml/mock" + "github.com/google/uuid" +) + +type Server struct { + mock.Interface + mock.ExtendedInterface + Devices [8]nvml.Device + DriverVersion string + NvmlVersion string + CudaDriverVersion int +} +type Device struct { + mock.Device + sync.RWMutex + UUID string + Name string + Brand nvml.BrandType + Architecture nvml.DeviceArchitecture + PciBusID string + Minor int + Index int + CudaComputeCapability CudaComputeCapability + MigMode int + GpuInstances map[*GpuInstance]struct{} + GpuInstanceCounter uint32 + MemoryInfo nvml.Memory +} + +type GpuInstance struct { + mock.GpuInstance + sync.RWMutex + Info nvml.GpuInstanceInfo + ComputeInstances map[*ComputeInstance]struct{} + ComputeInstanceCounter uint32 +} + +type ComputeInstance struct { + mock.ComputeInstance + Info nvml.ComputeInstanceInfo +} + +type CudaComputeCapability struct { + Major int + Minor int +} + +var _ nvml.Interface = (*Server)(nil) +var _ nvml.Device = (*Device)(nil) +var _ nvml.GpuInstance = (*GpuInstance)(nil) +var _ nvml.ComputeInstance = (*ComputeInstance)(nil) + +func New() *Server { + server := &Server{ + Devices: [8]nvml.Device{ + NewDevice(0), + NewDevice(1), + NewDevice(2), + NewDevice(3), + NewDevice(4), + NewDevice(5), + NewDevice(6), + NewDevice(7), + }, + DriverVersion: "550.54.15", + NvmlVersion: "12.550.54.15", + CudaDriverVersion: 12040, + } + server.setMockFuncs() + return server +} + +func NewDevice(index int) *Device { + device := &Device{ + UUID: "GPU-" + uuid.New().String(), + Name: "Mock NVIDIA A100-SXM4-40GB", + Brand: nvml.BRAND_NVIDIA, + Architecture: nvml.DEVICE_ARCH_AMPERE, + PciBusID: fmt.Sprintf("0000:%02x:00.0", index), + Minor: index, + Index: index, + CudaComputeCapability: CudaComputeCapability{ + Major: 8, + Minor: 0, + }, + GpuInstances: make(map[*GpuInstance]struct{}), + GpuInstanceCounter: 0, + MemoryInfo: nvml.Memory{42949672960, 0, 0}, + } + device.setMockFuncs() + return device +} + +func NewGpuInstance(info nvml.GpuInstanceInfo) *GpuInstance { + gi := &GpuInstance{ + Info: info, + ComputeInstances: make(map[*ComputeInstance]struct{}), + ComputeInstanceCounter: 0, + } + gi.setMockFuncs() + return gi +} + +func NewComputeInstance(info nvml.ComputeInstanceInfo) *ComputeInstance { + ci := &ComputeInstance{ + Info: info, + } + ci.setMockFuncs() + return ci +} + +func (s *Server) setMockFuncs() { + s.ExtensionsFunc = func() nvml.ExtendedInterface { + return s + } + + s.LookupSymbolFunc = func(symbol string) error { + return nil + } + + s.InitFunc = func() nvml.Return { + return nvml.SUCCESS + } + + s.ShutdownFunc = func() nvml.Return { + return nvml.SUCCESS + } + + s.SystemGetDriverVersionFunc = func() (string, nvml.Return) { + return s.DriverVersion, nvml.SUCCESS + } + + s.SystemGetNVMLVersionFunc = func() (string, nvml.Return) { + return s.NvmlVersion, nvml.SUCCESS + } + + s.SystemGetCudaDriverVersionFunc = func() (int, nvml.Return) { + return s.CudaDriverVersion, nvml.SUCCESS + } + + s.DeviceGetCountFunc = func() (int, nvml.Return) { + return len(s.Devices), nvml.SUCCESS + } + + s.DeviceGetHandleByIndexFunc = func(index int) (nvml.Device, nvml.Return) { + if index < 0 || index >= len(s.Devices) { + return nil, nvml.ERROR_INVALID_ARGUMENT + } + return s.Devices[index], nvml.SUCCESS + } + + s.DeviceGetHandleByUUIDFunc = func(uuid string) (nvml.Device, nvml.Return) { + for _, d := range s.Devices { + if uuid == d.(*Device).UUID { + return d, nvml.SUCCESS + } + } + return nil, nvml.ERROR_INVALID_ARGUMENT + } + + s.DeviceGetHandleByPciBusIdFunc = func(busID string) (nvml.Device, nvml.Return) { + for _, d := range s.Devices { + if busID == d.(*Device).PciBusID { + return d, nvml.SUCCESS + } + } + return nil, nvml.ERROR_INVALID_ARGUMENT + } +} + +func (d *Device) setMockFuncs() { + d.GetMinorNumberFunc = func() (int, nvml.Return) { + return d.Minor, nvml.SUCCESS + } + + d.GetIndexFunc = func() (int, nvml.Return) { + return d.Index, nvml.SUCCESS + } + + d.GetCudaComputeCapabilityFunc = func() (int, int, nvml.Return) { + return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS + } + + d.GetUUIDFunc = func() (string, nvml.Return) { + return d.UUID, nvml.SUCCESS + } + + d.GetNameFunc = func() (string, nvml.Return) { + return d.Name, nvml.SUCCESS + } + + d.GetBrandFunc = func() (nvml.BrandType, nvml.Return) { + return d.Brand, nvml.SUCCESS + } + + d.GetArchitectureFunc = func() (nvml.DeviceArchitecture, nvml.Return) { + return d.Architecture, nvml.SUCCESS + } + + d.GetMemoryInfoFunc = func() (nvml.Memory, nvml.Return) { + return d.MemoryInfo, nvml.SUCCESS + } + + d.GetPciInfoFunc = func() (nvml.PciInfo, nvml.Return) { + p := nvml.PciInfo{ + PciDeviceId: 0x20B010DE, + } + return p, nvml.SUCCESS + } + + d.SetMigModeFunc = func(mode int) (nvml.Return, nvml.Return) { + d.MigMode = mode + return nvml.SUCCESS, nvml.SUCCESS + } + + d.GetMigModeFunc = func() (int, int, nvml.Return) { + return d.MigMode, d.MigMode, nvml.SUCCESS + } + + d.GetGpuInstanceProfileInfoFunc = func(giProfileId int) (nvml.GpuInstanceProfileInfo, nvml.Return) { + if giProfileId < 0 || giProfileId >= nvml.GPU_INSTANCE_PROFILE_COUNT { + return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT + } + + if _, exists := MIGProfiles.GpuInstanceProfiles[giProfileId]; !exists { + return nvml.GpuInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + return MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS + } + + d.GetGpuInstancePossiblePlacementsFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) { + return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS + } + + d.CreateGpuInstanceFunc = func(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) { + d.Lock() + defer d.Unlock() + giInfo := nvml.GpuInstanceInfo{ + Device: d, + Id: d.GpuInstanceCounter, + ProfileId: info.Id, + } + d.GpuInstanceCounter++ + gi := NewGpuInstance(giInfo) + d.GpuInstances[gi] = struct{}{} + return gi, nvml.SUCCESS + } + + d.CreateGpuInstanceWithPlacementFunc = func(info *nvml.GpuInstanceProfileInfo, placement *nvml.GpuInstancePlacement) (nvml.GpuInstance, nvml.Return) { + d.Lock() + defer d.Unlock() + giInfo := nvml.GpuInstanceInfo{ + Device: d, + Id: d.GpuInstanceCounter, + ProfileId: info.Id, + Placement: *placement, + } + d.GpuInstanceCounter++ + gi := NewGpuInstance(giInfo) + d.GpuInstances[gi] = struct{}{} + return gi, nvml.SUCCESS + } + + d.GetGpuInstancesFunc = func(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstance, nvml.Return) { + d.RLock() + defer d.RUnlock() + var gis []nvml.GpuInstance + for gi := range d.GpuInstances { + if gi.Info.ProfileId == info.Id { + gis = append(gis, gi) + } + } + return gis, nvml.SUCCESS + } +} + +func (gi *GpuInstance) setMockFuncs() { + gi.GetInfoFunc = func() (nvml.GpuInstanceInfo, nvml.Return) { + return gi.Info, nvml.SUCCESS + } + + gi.GetComputeInstanceProfileInfoFunc = func(ciProfileId int, ciEngProfileId int) (nvml.ComputeInstanceProfileInfo, nvml.Return) { + if ciProfileId < 0 || ciProfileId >= nvml.COMPUTE_INSTANCE_PROFILE_COUNT { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_INVALID_ARGUMENT + } + + if ciEngProfileId != nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_SHARED { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + giProfileId := int(gi.Info.ProfileId) + + if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId]; !exists { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + if _, exists := MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId]; !exists { + return nvml.ComputeInstanceProfileInfo{}, nvml.ERROR_NOT_SUPPORTED + } + + return MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS + } + + gi.GetComputeInstancePossiblePlacementsFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) { + return MIGPlacements.ComputeInstancePossiblePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS + } + + gi.CreateComputeInstanceFunc = func(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) { + gi.Lock() + defer gi.Unlock() + ciInfo := nvml.ComputeInstanceInfo{ + Device: gi.Info.Device, + GpuInstance: gi, + Id: gi.ComputeInstanceCounter, + ProfileId: info.Id, + } + gi.ComputeInstanceCounter++ + ci := NewComputeInstance(ciInfo) + gi.ComputeInstances[ci] = struct{}{} + return ci, nvml.SUCCESS + } + + gi.GetComputeInstancesFunc = func(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstance, nvml.Return) { + gi.RLock() + defer gi.RUnlock() + var cis []nvml.ComputeInstance + for ci := range gi.ComputeInstances { + if ci.Info.ProfileId == info.Id { + cis = append(cis, ci) + } + } + return cis, nvml.SUCCESS + } + + gi.DestroyFunc = func() nvml.Return { + d := gi.Info.Device.(*Device) + d.Lock() + defer d.Unlock() + delete(d.GpuInstances, gi) + return nvml.SUCCESS + } +} + +func (ci *ComputeInstance) setMockFuncs() { + ci.GetInfoFunc = func() (nvml.ComputeInstanceInfo, nvml.Return) { + return ci.Info, nvml.SUCCESS + } + + ci.DestroyFunc = func() nvml.Return { + gi := ci.Info.GpuInstance.(*GpuInstance) + gi.Lock() + defer gi.Unlock() + delete(gi.ComputeInstances, ci) + return nvml.SUCCESS + } +} diff --git a/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go new file mode 100644 index 00000000..c4df4c83 --- /dev/null +++ b/vendor/github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100/mig-profile.go @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dgxa100 + +import ( + "github.com/NVIDIA/go-nvml/pkg/nvml" +) + +// MIGProfiles holds the profile information for GIs and CIs in this mock server. +// We should consider auto-generating this object in the future. +var MIGProfiles = struct { + GpuInstanceProfiles map[int]nvml.GpuInstanceProfileInfo + ComputeInstanceProfiles map[int]map[int]nvml.ComputeInstanceProfileInfo +}{ + GpuInstanceProfiles: map[int]nvml.GpuInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 0, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 4864, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 4864, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + Id: nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2, + IsP2pSupported: 0, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + CopyEngineCount: 1, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_2_SLICE, + IsP2pSupported: 0, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 28, + CopyEngineCount: 2, + DecoderCount: 1, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 9856, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_3_SLICE, + IsP2pSupported: 0, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 42, + CopyEngineCount: 3, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_4_SLICE, + IsP2pSupported: 0, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + CopyEngineCount: 4, + DecoderCount: 2, + EncoderCount: 0, + JpegCount: 0, + OfaCount: 0, + MemorySizeMB: 19968, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.GPU_INSTANCE_PROFILE_7_SLICE, + IsP2pSupported: 0, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 98, + CopyEngineCount: 7, + DecoderCount: 5, + EncoderCount: 0, + JpegCount: 1, + OfaCount: 1, + MemorySizeMB: 40192, + }, + }, + ComputeInstanceProfiles: map[int]map[int]nvml.ComputeInstanceProfileInfo{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + SharedCopyEngineCount: 1, + SharedDecoderCount: 0, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + SharedCopyEngineCount: 1, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 1, + MultiprocessorCount: 14, + SharedCopyEngineCount: 1, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 2, + MultiprocessorCount: 14, + SharedCopyEngineCount: 2, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + SharedCopyEngineCount: 2, + SharedDecoderCount: 1, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 3, + MultiprocessorCount: 14, + SharedCopyEngineCount: 3, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 1, + MultiprocessorCount: 28, + SharedCopyEngineCount: 3, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 1, + MultiprocessorCount: 42, + SharedCopyEngineCount: 3, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 4, + MultiprocessorCount: 14, + SharedCopyEngineCount: 4, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 2, + MultiprocessorCount: 28, + SharedCopyEngineCount: 4, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + SharedCopyEngineCount: 4, + SharedDecoderCount: 2, + SharedEncoderCount: 0, + SharedJpegCount: 0, + SharedOfaCount: 0, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE, + SliceCount: 1, + InstanceCount: 7, + MultiprocessorCount: 14, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE, + SliceCount: 2, + InstanceCount: 3, + MultiprocessorCount: 28, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE, + SliceCount: 3, + InstanceCount: 2, + MultiprocessorCount: 42, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE, + SliceCount: 4, + InstanceCount: 1, + MultiprocessorCount: 56, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: { + Id: nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE, + SliceCount: 7, + InstanceCount: 1, + MultiprocessorCount: 98, + SharedCopyEngineCount: 7, + SharedDecoderCount: 5, + SharedEncoderCount: 0, + SharedJpegCount: 1, + SharedOfaCount: 1, + }, + }, + }, +} + +// MIGPlacements holds the placement information for GIs and CIs in this mock server. +// We should consider auto-generating this object in the future. +var MIGPlacements = struct { + GpuInstancePossiblePlacements map[int][]nvml.GpuInstancePlacement + ComputeInstancePossiblePlacements map[int]map[int][]nvml.ComputeInstancePlacement +}{ + GpuInstancePossiblePlacements: map[int][]nvml.GpuInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + { + Start: 0, + Size: 1, + }, + { + Start: 1, + Size: 1, + }, + { + Start: 2, + Size: 1, + }, + { + Start: 3, + Size: 1, + }, + { + Start: 4, + Size: 1, + }, + { + Start: 5, + Size: 1, + }, + { + Start: 6, + Size: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + { + Start: 0, + Size: 1, + }, + { + Start: 1, + Size: 1, + }, + { + Start: 2, + Size: 1, + }, + { + Start: 3, + Size: 1, + }, + { + Start: 4, + Size: 1, + }, + { + Start: 5, + Size: 1, + }, + { + Start: 6, + Size: 1, + }, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + { + Start: 0, + Size: 2, + }, + { + Start: 2, + Size: 2, + }, + { + Start: 4, + Size: 2, + }, + { + Start: 6, + Size: 2, + }, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + { + Start: 0, + Size: 2, + }, + { + Start: 2, + Size: 2, + }, + { + Start: 4, + Size: 2, + }, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + { + Start: 0, + Size: 4, + }, + { + Start: 4, + Size: 4, + }, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + { + Start: 0, + Size: 4, + }, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + { + Start: 0, + Size: 8, + }, + }, + }, + // TODO: Fill out ComputeInstancePossiblePlacements + ComputeInstancePossiblePlacements: map[int]map[int][]nvml.ComputeInstancePlacement{ + nvml.GPU_INSTANCE_PROFILE_1_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV1: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_1_SLICE_REV2: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_2_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_3_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_4_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, + }, + nvml.GPU_INSTANCE_PROFILE_7_SLICE: { + nvml.COMPUTE_INSTANCE_PROFILE_1_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_2_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_3_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_4_SLICE: {}, + nvml.COMPUTE_INSTANCE_PROFILE_7_SLICE: {}, + }, + }, +} diff --git a/vendor/modules.txt b/vendor/modules.txt index babcea30..85913639 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -11,6 +11,7 @@ github.com/NVIDIA/go-nvlib/pkg/pciids github.com/NVIDIA/go-nvml/pkg/dl github.com/NVIDIA/go-nvml/pkg/nvml github.com/NVIDIA/go-nvml/pkg/nvml/mock +github.com/NVIDIA/go-nvml/pkg/nvml/mock/dgxa100 # github.com/cpuguy83/go-md2man/v2 v2.0.5 ## explicit; go 1.11 github.com/cpuguy83/go-md2man/v2/md2man