go-nvlib/pkg/nvlib/mig/device.go
Kevin Klues e95e3a5e8b Add a MIG package as a subpackage to nvlib
For now this package only has functions to work with MIG profiles. More
functionality will be added here in the future.

Signed-off-by: Kevin Klues <kklues@nvidia.com>
2022-09-15 17:08:00 +00:00

136 lines
3.7 KiB
Go

/*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package mig
import (
"fmt"
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
)
// Device defines the set of extended functions associated with a mig.Device.
// Values of this type are built by NewDevice from an nvml.Device handle.
type Device interface {
// GetProfile returns the MIG profile associated with the MIG device, or an
// error if the profile cannot be determined.
GetProfile() (Profile, error)
}
// device is the package-private implementation of Device. It embeds the
// underlying nvml.Device so that handle's methods are available on it directly.
//
// NOTE: NewDevice initializes this struct with a positional composite literal,
// so the order of the fields below must not change.
type device struct {
// Embedded NVML handle that this device wraps.
nvml.Device
// miglib that created this device; GetProfile uses it to build the Profile.
miglib *miglib
// profile holds the result of the first successful GetProfile call; it is nil
// until then.
profile Profile
}
// Compile-time assertion that *device satisfies the Device interface.
var _ Device = (*device)(nil)
// NewDevice wraps an nvml.Device handle in a mig.Device.
//
// An error is returned if the handle cannot be queried for its MIG status or
// if it does not refer to a MIG device. The returned Device starts without a
// resolved profile.
func (m *miglib) NewDevice(d nvml.Device) (Device, error) {
	migHandle, status := d.IsMigDeviceHandle()
	switch {
	case status != nvml.SUCCESS:
		return nil, fmt.Errorf("error checking if device is a MIG device: %v", status)
	case !migHandle:
		return nil, fmt.Errorf("not a MIG device")
	}

	wrapped := &device{
		Device: d,
		miglib: m,
	}
	return wrapped, nil
}
// GetProfile returns the MIG profile associated with a MIG device.
//
// On the first successful call the profile is resolved through NVML and stored
// on the device; later calls return the stored value without querying NVML
// again.
//
// Resolution works by looking up the GPU Instance and Compute Instance backing
// this MIG device on its parent GPU, then scanning every GPU Instance profile
// index and every Compute Instance (profile, engine profile) index pair until
// the profile IDs reported for those indices match the IDs of the instances.
// The matching indices, together with the MIG device's memory size and the
// parent's total memory, are passed to miglib.NewProfile.
//
// An error is returned if any required NVML query fails, if NewProfile fails,
// or if no combination of profile indices matches.
func (d *device) GetProfile() (Profile, error) {
	if d.profile != nil {
		return d.profile, nil
	}

	parent, ret := d.Device.GetDeviceHandleFromMigDeviceHandle()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting parent device handle: %v", ret)
	}

	parentMemoryInfo, ret := parent.GetMemoryInfo()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting parent memory info: %v", ret)
	}

	attributes, ret := d.Device.GetAttributes()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting MIG device attributes: %v", ret)
	}

	giID, ret := d.Device.GetGpuInstanceId()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting MIG device GPU Instance ID: %v", ret)
	}

	ciID, ret := d.Device.GetComputeInstanceId()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting MIG device Compute Instance ID: %v", ret)
	}

	gi, ret := parent.GetGpuInstanceById(giID)
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting GPU Instance: %v", ret)
	}

	ci, ret := gi.GetComputeInstanceById(ciID)
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting Compute Instance: %v", ret)
	}

	giInfo, ret := gi.GetInfo()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting GPU Instance info: %v", ret)
	}

	ciInfo, ret := ci.GetInfo()
	if ret != nvml.SUCCESS {
		return nil, fmt.Errorf("error getting Compute Instance info: %v", ret)
	}

	for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ {
		giProfileInfo, ret := parent.GetGpuInstanceProfileInfo(i)
		// Not every profile index is offered by every GPU. A profile that is
		// unsupported or not recognized by the driver cannot be the one in
		// use, so skip it instead of aborting the whole search.
		if ret == nvml.ERROR_NOT_SUPPORTED || ret == nvml.ERROR_INVALID_ARGUMENT {
			continue
		}
		if ret != nvml.SUCCESS {
			return nil, fmt.Errorf("error getting GPU Instance profile info: %v", ret)
		}
		if giProfileInfo.Id != giInfo.ProfileId {
			continue
		}

		for j := 0; j < nvml.COMPUTE_INSTANCE_PROFILE_COUNT; j++ {
			for k := 0; k < nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT; k++ {
				ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo(j, k)
				// Same reasoning as for GPU Instance profiles: skip
				// combinations this GPU Instance does not offer.
				if ret == nvml.ERROR_NOT_SUPPORTED || ret == nvml.ERROR_INVALID_ARGUMENT {
					continue
				}
				if ret != nvml.SUCCESS {
					return nil, fmt.Errorf("error getting Compute Instance profile info: %v", ret)
				}
				if ciProfileInfo.Id != ciInfo.ProfileId {
					continue
				}

				p, err := d.miglib.NewProfile(i, j, k, attributes.MemorySizeMB, parentMemoryInfo.Total)
				if err != nil {
					return nil, fmt.Errorf("error creating MIG profile: %w", err)
				}

				// Remember the result so later calls skip the NVML queries.
				d.profile = p
				return p, nil
			}
		}
	}

	return nil, fmt.Errorf("no matching profile IDs found")
}