mirror of
https://github.com/clearml/go-nvlib
synced 2025-02-12 15:34:55 +00:00
136 lines
3.7 KiB
Go
136 lines
3.7 KiB
Go
|
/*
|
||
|
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||
|
*
|
||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
* you may not use this file except in compliance with the License.
|
||
|
* You may obtain a copy of the License at
|
||
|
*
|
||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||
|
*
|
||
|
* Unless required by applicable law or agreed to in writing, software
|
||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
* See the License for the specific language governing permissions and
|
||
|
* limitations under the License.
|
||
|
*/
|
||
|
|
||
|
package mig
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
|
||
|
"gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml"
|
||
|
)
|
||
|
|
||
|
// Device defines the set of extended functions associated with a mig.Device
|
||
|
type Device interface {
|
||
|
GetProfile() (Profile, error)
|
||
|
}
|
||
|
|
||
|
type device struct {
|
||
|
nvml.Device
|
||
|
miglib *miglib
|
||
|
profile Profile
|
||
|
}
|
||
|
|
||
|
var _ Device = &device{}
|
||
|
|
||
|
// NewDevice builds a new Device from an nvml.Device
|
||
|
func (m *miglib) NewDevice(d nvml.Device) (Device, error) {
|
||
|
isMig, ret := d.IsMigDeviceHandle()
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error checking if device is a MIG device: %v", ret)
|
||
|
}
|
||
|
if !isMig {
|
||
|
return nil, fmt.Errorf("not a MIG device")
|
||
|
}
|
||
|
return &device{d, m, nil}, nil
|
||
|
}
|
||
|
|
||
|
// GetProfile returns the MIG profile associated with a MIG device
|
||
|
func (d *device) GetProfile() (Profile, error) {
|
||
|
if d.profile != nil {
|
||
|
return d.profile, nil
|
||
|
}
|
||
|
|
||
|
parent, ret := d.Device.GetDeviceHandleFromMigDeviceHandle()
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting parent device handle: %v", ret)
|
||
|
}
|
||
|
|
||
|
parentMemoryInfo, ret := parent.GetMemoryInfo()
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting parent memory info: %v", ret)
|
||
|
}
|
||
|
|
||
|
attributes, ret := d.Device.GetAttributes()
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting MIG device attributes: %v", ret)
|
||
|
}
|
||
|
|
||
|
giID, ret := d.Device.GetGpuInstanceId()
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting MIG device GPU Instance ID: %v", ret)
|
||
|
}
|
||
|
|
||
|
ciID, ret := d.Device.GetComputeInstanceId()
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting MIG device Compute Instance ID: %v", ret)
|
||
|
}
|
||
|
|
||
|
gi, ret := parent.GetGpuInstanceById(giID)
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting GPU Instance: %v", ret)
|
||
|
}
|
||
|
|
||
|
ci, ret := gi.GetComputeInstanceById(ciID)
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting Compute Instance: %v", ret)
|
||
|
}
|
||
|
|
||
|
giInfo, ret := gi.GetInfo()
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting GPU Instance info: %v", ret)
|
||
|
}
|
||
|
|
||
|
ciInfo, ret := ci.GetInfo()
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting Compute Instance info: %v", ret)
|
||
|
}
|
||
|
|
||
|
for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ {
|
||
|
giProfileInfo, ret := parent.GetGpuInstanceProfileInfo(i)
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting GPU Instance profile info: %v", ret)
|
||
|
}
|
||
|
|
||
|
if giProfileInfo.Id != giInfo.ProfileId {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
for j := 0; j < nvml.COMPUTE_INSTANCE_PROFILE_COUNT; j++ {
|
||
|
for k := 0; k < nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT; k++ {
|
||
|
ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo(j, k)
|
||
|
if ret != nvml.SUCCESS {
|
||
|
return nil, fmt.Errorf("error getting Compute Instance profile info: %v", ret)
|
||
|
|
||
|
}
|
||
|
|
||
|
if ciProfileInfo.Id != ciInfo.ProfileId {
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
p, err := d.miglib.NewProfile(i, j, k, attributes.MemorySizeMB, parentMemoryInfo.Total)
|
||
|
if err != nil {
|
||
|
return nil, fmt.Errorf("error creating MIG profile: %v", err)
|
||
|
}
|
||
|
|
||
|
d.profile = p
|
||
|
return p, nil
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return nil, fmt.Errorf("no matching profile IDs found")
|
||
|
}
|