/* * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package mig import ( "fmt" "gitlab.com/nvidia/cloud-native/go-nvlib/pkg/nvml" ) // Device defines the set of extended functions associated with a mig.Device type Device interface { GetProfile() (Profile, error) } type device struct { nvml.Device miglib *miglib profile Profile } var _ Device = &device{} // NewDevice builds a new Device from an nvml.Device func (m *miglib) NewDevice(d nvml.Device) (Device, error) { isMig, ret := d.IsMigDeviceHandle() if ret != nvml.SUCCESS { return nil, fmt.Errorf("error checking if device is a MIG device: %v", ret) } if !isMig { return nil, fmt.Errorf("not a MIG device") } return &device{d, m, nil}, nil } // GetProfile returns the MIG profile associated with a MIG device func (d *device) GetProfile() (Profile, error) { if d.profile != nil { return d.profile, nil } parent, ret := d.Device.GetDeviceHandleFromMigDeviceHandle() if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting parent device handle: %v", ret) } parentMemoryInfo, ret := parent.GetMemoryInfo() if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting parent memory info: %v", ret) } attributes, ret := d.Device.GetAttributes() if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting MIG device attributes: %v", ret) } giID, ret := d.Device.GetGpuInstanceId() if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting MIG device GPU Instance ID: %v", ret) } ciID, ret := d.Device.GetComputeInstanceId() if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting MIG device Compute Instance ID: %v", ret) } gi, ret := parent.GetGpuInstanceById(giID) if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting GPU Instance: %v", ret) } ci, ret := gi.GetComputeInstanceById(ciID) if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting Compute Instance: %v", ret) } giInfo, ret := gi.GetInfo() if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting GPU Instance info: %v", ret) } ciInfo, ret := ci.GetInfo() if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting Compute Instance info: %v", ret) } for i := 0; i < nvml.GPU_INSTANCE_PROFILE_COUNT; i++ { giProfileInfo, ret := parent.GetGpuInstanceProfileInfo(i) if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting GPU Instance profile info: %v", ret) } if giProfileInfo.Id != giInfo.ProfileId { continue } for j := 0; j < nvml.COMPUTE_INSTANCE_PROFILE_COUNT; j++ { for k := 0; k < nvml.COMPUTE_INSTANCE_ENGINE_PROFILE_COUNT; k++ { ciProfileInfo, ret := gi.GetComputeInstanceProfileInfo(j, k) if ret != nvml.SUCCESS { return nil, fmt.Errorf("error getting Compute Instance profile info: %v", ret) } if ciProfileInfo.Id != ciInfo.ProfileId { continue } p, err := d.miglib.NewProfile(i, j, k, attributes.MemorySizeMB, parentMemoryInfo.Total) if err != nil { return nil, fmt.Errorf("error creating MIG profile: %v", err) } d.profile = p return p, nil } } } return nil, fmt.Errorf("no matching profile IDs found") }