Fix AMD GPU metrics collection

This commit is contained in:
allegroai 2024-05-08 23:41:13 +03:00
parent db4834f37e
commit 2fbd86415c
2 changed files with 800 additions and 1 deletions

View File

@ -17,10 +17,13 @@ from __future__ import unicode_literals
import json
import platform
import sys
import subprocess
from datetime import datetime
from ctypes import c_uint32, byref, c_int64
import psutil
from ..gpu import pynvml as N
from ..gpu import pyrsmi as R
NOT_SUPPORTED = 'Not Supported'
MB = 1024 * 1024
@ -193,7 +196,184 @@ class GPUStatCollection(object):
del GPUStatCollection.global_processes[pid]
@staticmethod
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
def _new_query_amd(shutdown=False, per_process_stats=False, get_driver_info=False):
    """Query the information of all the AMD GPUs on local machine (through ROCm SMI)

    :param shutdown: shut the ROCm SMI interface down once the query is done; this only
        happens when this very call was the one that initialized the interface
    :param per_process_stats: also collect the compute processes and their VRAM usage
    :param get_driver_info: also look up the AMD driver version
    :return: GPUStatCollection holding one GPUStat per device reported by ROCm SMI
    """
    initialized = False
    if not GPUStatCollection._initialized:
        R.smi_initialize()
        GPUStatCollection._initialized = True
        initialized = True

    def get_gpu_info(index):
        def amd_query_processes():
            # the first call passes no buffer and is only used to size the buffer below
            num_procs = c_uint32()
            ret = R.rocm_lib.rsmi_compute_process_info_get(None, byref(num_procs))
            if R.rsmi_ret_ok(ret):
                # presumably the extra 10 slots cover processes started between the two calls
                buff_sz = num_procs.value + 10
                proc_info = (R.rsmi_process_info_t * buff_sz)()
                ret = R.rocm_lib.rsmi_compute_process_info_get(byref(proc_info), byref(num_procs))
                proc_info_list = (
                    [proc_info[i] for i in range(num_procs.value)]
                    if R.rsmi_ret_ok(ret)
                    else []
                )
                result_proc_info_list = []
                # query VRAM usage explicitly, as rsmi_compute_process_info_get
                # doesn't actually return VRAM usage
                for proc in proc_info_list:
                    vram_query_proc_info = R.rsmi_process_info_t()
                    ret = R.rocm_lib.rsmi_compute_process_info_by_pid_get(
                        int(proc.process_id), byref(vram_query_proc_info)
                    )
                    if R.rsmi_ret_ok(ret):
                        proc.vram_usage = vram_query_proc_info.vram_usage
                    result_proc_info_list.append(proc)
                return result_proc_info_list
            return []

        def get_fan_speed():
            # returns the fan level divided by the maximum fan level, or None when unavailable
            fan_level = c_int64()
            fan_max = c_int64()
            sensor_ind = c_uint32(0)

            ret = R.rocm_lib.rsmi_dev_fan_speed_get(index, sensor_ind, byref(fan_level))
            if not R.rsmi_ret_ok(ret, log_error=False):
                return None

            ret = R.rocm_lib.rsmi_dev_fan_speed_max_get(index, sensor_ind, byref(fan_max))
            if not R.rsmi_ret_ok(ret, log_error=False):
                return None

            # compare the wrapped integers, not the ctypes objects: `c_int64 <= int`
            # raises TypeError and would abort the whole GPU query
            if fan_level.value <= 0 or fan_max.value <= 0:
                return None

            return float(fan_level.value) / float(fan_max.value)

        def get_process_info(comp_process):
            process = {}
            pid = comp_process.process_id
            # skip global_processes caching because PID querying seems to be inconsistent atm
            # if pid not in GPUStatCollection.global_processes:
            #     GPUStatCollection.global_processes[pid] = psutil.Process(pid=pid)
            process["pid"] = pid
            try:
                process["gpu_memory_usage"] = comp_process.vram_usage // MB
            except Exception:
                pass
            return process

        # name and device id are looked up once per index and then served from the class cache
        if not GPUStatCollection._gpu_device_info.get(index):
            uuid = R.smi_get_device_id(index)
            name = R.smi_get_device_name(index)
            GPUStatCollection._gpu_device_info[index] = (name, uuid)

        name, uuid = GPUStatCollection._gpu_device_info[index]

        temperature = None  # TODO: fetch temperature. It should be possible
        fan_speed = get_fan_speed()

        try:
            memory_total = R.smi_get_device_memory_total(index)
        except Exception:
            memory_total = None

        try:
            memory_used = R.smi_get_device_memory_used(index)
        except Exception:
            memory_used = None

        try:
            utilization = R.smi_get_device_utilization(index)
        except Exception:
            utilization = None

        try:
            power = R.smi_get_device_average_power(index)
        except Exception:
            power = None

        power_limit = None  # TODO: find a way to fetch this

        processes = []
        if per_process_stats:
            # NOTE: amd_query_processes() does not take the device index into account,
            # so every device entry receives the same process list
            try:
                comp_processes = amd_query_processes()
            except Exception:
                comp_processes = []
            for comp_process in comp_processes:
                try:
                    process = get_process_info(comp_process)
                except psutil.NoSuchProcess:
                    # skip process caching for now
                    pass
                else:
                    processes.append(process)

        gpu_info = {
            "index": index,
            "uuid": uuid,
            "name": name,
            "temperature.gpu": temperature if temperature is not None else 0,
            "fan.speed": fan_speed if fan_speed is not None else 0,
            "utilization.gpu": utilization if utilization is not None else 100,
            "power.draw": power if power is not None else 0,
            "enforced.power.limit": power_limit if power_limit is not None else 0,
            # Convert bytes into MBytes
            "memory.used": memory_used // MB if memory_used is not None else 0,
            "memory.total": memory_total // MB if memory_total is not None else 100,
            "processes": None if (processes and all(p is None for p in processes)) else processes,
        }
        if per_process_stats:
            GPUStatCollection.clean_processes()
        return gpu_info

    gpu_list = []
    if GPUStatCollection._device_count is None:
        GPUStatCollection._device_count = R.smi_get_device_count()

    for index in range(GPUStatCollection._device_count):
        gpu_info = get_gpu_info(index)
        gpu_stat = GPUStat(gpu_info)
        gpu_list.append(gpu_stat)

    # only undo the initialization performed by this call
    if shutdown and initialized:
        R.smi_shutdown()
        GPUStatCollection._initialized = False

    # noinspection PyProtectedMember
    driver_version = GPUStatCollection._get_amd_driver_version() if get_driver_info else None

    return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=None)
@staticmethod
def _get_amd_driver_version():
    """Return the AMD driver version, or None when it cannot be determined.

    The version is first requested from the ``rocm-smi`` command line tool. If that fails
    for any reason, a child Python interpreter queries it through the pyrsmi binding.
    """
    # make sure the program doesn't crash with something like a SEGFAULT when querying the driver version
    try:
        process = subprocess.Popen(["rocm-smi", "--showdriverversion", "--json"], stdout=subprocess.PIPE)
        out, _ = process.communicate()
        return json.loads(out)["system"]["Driver version"]
    except Exception:
        try:
            process = subprocess.Popen(
                [
                    sys.executable,
                    "-c",
                    "from clearml.utilities.gpu.pyrsmi import smi_get_kernel_version, smi_initialize; "
                    + "smi_initialize(); "
                    + "print(smi_get_kernel_version())",
                ],
                # without a pipe communicate() returns None for stdout, so the
                # version printed by the child could never be read back
                stdout=subprocess.PIPE,
            )
            out, _ = process.communicate()
            version = out.decode("utf-8", errors="replace").strip()
            # a failed child or empty output carries no usable version
            if process.returncode != 0 or not version:
                return None
            return version
        except Exception:
            return None
@staticmethod
def _running_in_amd_env():
    """Tell whether the pyrsmi binding can locate the ROCm SMI shared library."""
    # noinspection PyProtectedMember
    rocm_lib_path = R._find_lib_rocm()
    return True if rocm_lib_path else False
@staticmethod
def _new_query_nvidia(shutdown=False, per_process_stats=False, get_driver_info=False):
"""Query the information of all the GPUs on local machine"""
initialized = False
if not GPUStatCollection._initialized:
@ -408,6 +588,20 @@ class GPUStatCollection(object):
return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
@staticmethod
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
    """Query all local GPUs, using the AMD query when running in an AMD environment
    and the NVIDIA query otherwise. All arguments are forwarded unchanged."""
    # noinspection PyProtectedMember
    if GPUStatCollection._running_in_amd_env():
        # noinspection PyProtectedMember
        query = GPUStatCollection._new_query_amd
    else:
        # noinspection PyProtectedMember
        query = GPUStatCollection._new_query_nvidia
    return query(shutdown=shutdown, per_process_stats=per_process_stats, get_driver_info=get_driver_info)
def __len__(self):
    """Return the number of entries in ``self.gpus``."""
    gpu_count = len(self.gpus)
    return gpu_count

View File

@ -0,0 +1,605 @@
# MIT License
#
# Copyright (c) 2023 Advanced Micro Devices, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Python bindings for ROCm-SMI library
from ctypes import *
from os.path import join, realpath, isfile
import os
import logging
import subprocess
import sys
import threading
from enum import IntEnum, auto
def get_device_uuids():
    """Collect the UUIDs of all ROCm GPU devices from the output of ``rocminfo``.

    Returns the text following ``GPU-`` on every output line that contains both
    ``Uuid`` and ``GPU-``. When ``rocminfo`` is missing or exits with an error,
    the error is printed and an empty list is returned.
    """
    command = 'rocminfo'
    try:
        completed = subprocess.run(
            command.split(),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            check=True,
        )
    except (FileNotFoundError, subprocess.CalledProcessError) as err:
        print(' Error => ', str(err))
        return []
    output_lines = completed.stdout.decode('utf-8').split('\n')
    # drops CPU devices from the output
    return [
        line.strip().split('GPU-')[-1]
        for line in output_lines
        if 'Uuid' in line and 'GPU-' in line
    ]
## Error checking
class ROCMLError_NotSupported(Exception):
    """Raised when the current platform is not supported (e.g. Windows)."""
class ROCMLError_FunctionNotFound(Exception):
    """Raised when a requested function is not exported by the loaded ROCm library."""
class ROCMLError_LibraryNotFound(Exception):
    """Raised when the ROCm SMI shared library cannot be loaded."""
class ROCMLError_DriverNotLoaded(Exception):
    """Error type for a driver that is not loaded (not raised anywhere in this module)."""
class ROCMLError_Unknown(Exception):
    """Error type for unclassified failures (not raised anywhere in this module)."""
class ROCMLError_Uninitialized(Exception):
    """Raised when a library function is requested before the ROCm library was loaded."""
class ROCMLState(IntEnum):
    """Initialization states of the PyROCML binding; values are assigned by ``auto()``."""

    # No attempt yet made to initialize PyROCML
    UNINITIALIZED = auto()

    # PyROCML was successfully initialized
    INITIALIZED = auto()

    # PyROCML not installed
    DISABLED_PYROCML_NOT_AVAILABLE = auto()

    # PyROCML diagnostics disabled by ``distributed.diagnostics.rocml`` config setting
    DISABLED_CONFIG = auto()

    # PyROCML available, but ROCML not installed
    DISABLED_LIBRARY_NOT_FOUND = auto()
# File name of the ROCm SMI shared library; _find_lib_rocm() looks for it under <ROCM_PATH>/lib
LIBROCM_NAME = 'librocm_smi64.so'
# Size of the character buffers handed to the RSMI calls that return device/partition names
RSMI_MAX_BUFFER_LENGTH = 256
# Policy enums
# Length of the fixed-size arrays inside rsmi_frequencies_t and rsmi_pcie_bandwidth_t
RSMI_MAX_NUM_FREQUENCIES = 32
class rsmi_status_t(c_int):
    """Status codes of RSMI calls, exposed as plain integer class attributes."""

    RSMI_STATUS_SUCCESS = 0
    RSMI_STATUS_INVALID_ARGS = 1
    RSMI_STATUS_NOT_SUPPORTED = 2
    RSMI_STATUS_FILE_ERROR = 3
    RSMI_STATUS_PERMISSION = 4
    RSMI_STATUS_OUT_OF_RESOURCES = 5
    RSMI_STATUS_INTERNAL_EXCEPTION = 6
    RSMI_STATUS_INPUT_OUT_OF_BOUNDS = 7
    RSMI_STATUS_INIT_ERROR = 8
    # second name for the same code as RSMI_STATUS_INIT_ERROR
    RSMI_INITIALIZATION_ERROR = RSMI_STATUS_INIT_ERROR
    RSMI_STATUS_NOT_YET_IMPLEMENTED = 9
    RSMI_STATUS_NOT_FOUND = 10
    RSMI_STATUS_INSUFFICIENT_SIZE = 11
    RSMI_STATUS_INTERRUPT = 12
    RSMI_STATUS_UNEXPECTED_SIZE = 13
    RSMI_STATUS_NO_DATA = 14
    RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF
# Dictionary mapping rsmi return codes to their verbose descriptions.
# RSMI_INITIALIZATION_ERROR has the same value as RSMI_STATUS_INIT_ERROR, so it is the
# only key present for that code.
rsmi_status_verbose_err_out = {
    rsmi_status_t.RSMI_STATUS_SUCCESS: 'Operation was successful',
    rsmi_status_t.RSMI_STATUS_INVALID_ARGS: 'Invalid arguments provided',
    rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: 'Not supported on the given system',
    rsmi_status_t.RSMI_STATUS_FILE_ERROR: 'Problem accessing a file',
    rsmi_status_t.RSMI_STATUS_PERMISSION: 'Permission denied',
    rsmi_status_t.RSMI_STATUS_OUT_OF_RESOURCES: 'Unable to acquire memory or other resource',
    rsmi_status_t.RSMI_STATUS_INTERNAL_EXCEPTION: 'An internal exception was caught',
    rsmi_status_t.RSMI_STATUS_INPUT_OUT_OF_BOUNDS: 'Provided input is out of allowable or safe range',
    rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occured during rsmi initialization',
    rsmi_status_t.RSMI_STATUS_NOT_YET_IMPLEMENTED: 'Requested function is not implemented on this setup',
    rsmi_status_t.RSMI_STATUS_NOT_FOUND: 'Item searched for but not found',
    rsmi_status_t.RSMI_STATUS_INSUFFICIENT_SIZE: 'Insufficient resources available',
    rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occured during execution',
    rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read',
    rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input',
    rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR: 'Unknown error occured'
}
class rsmi_init_flags_t(c_int):
    """Flag values for RSMI initialization, exposed as integer class attributes."""

    RSMI_INIT_FLAG_ALL_GPUS = 1
class rsmi_memory_type_t(c_int):
    """Memory type identifiers, exposed as integer class attributes."""

    RSMI_MEM_TYPE_VRAM = 0
    RSMI_MEM_TYPE_VIS_VRAM = 1
    RSMI_MEM_TYPE_GTT = 2
    # range markers: first and last valid memory type
    RSMI_MEM_TYPE_FIRST = RSMI_MEM_TYPE_VRAM
    RSMI_MEM_TYPE_LAST = RSMI_MEM_TYPE_GTT
# memory_type_l holds the names matching the values of rsmi_memory_type_t
# Usage example to get corresponding names:
# memory_type_l[rsmi_memory_type_t.RSMI_MEM_TYPE_VRAM] will return string 'VRAM'
# The memory getters use the reverse direction: memory_type_l.index(name) yields the type value
memory_type_l = ['VRAM', 'VIS_VRAM', 'GTT']
class rsmi_retired_page_record_t(Structure):
    """ctypes layout of one reserved (retired) memory page record."""

    _fields_ = [
        ('page_address', c_uint64),
        ('page_size', c_uint64),
        ('status', c_int),
    ]
class rsmi_sw_component_t(c_int):
    """Software component identifiers; the driver is the only component listed."""

    RSMI_SW_COMP_DRIVER = 0
    # range markers: first and last valid component
    RSMI_SW_COMP_FIRST = RSMI_SW_COMP_DRIVER
    RSMI_SW_COMP_LAST = RSMI_SW_COMP_DRIVER
class rsmi_frequencies_t(Structure):
    """ctypes layout of a frequency table with RSMI_MAX_NUM_FREQUENCIES slots."""

    _fields_ = [
        ('num_supported', c_int32),
        ('current', c_uint32),
        ('frequency', c_uint64 * RSMI_MAX_NUM_FREQUENCIES),
    ]
class rsmi_pcie_bandwidth_t(Structure):
    """ctypes layout pairing a transfer-rate frequency table with a lane count per slot."""

    _fields_ = [
        ('transfer_rate', rsmi_frequencies_t),
        ('lanes', c_uint32 * RSMI_MAX_NUM_FREQUENCIES),
    ]
class rsmi_process_info_t(Structure):
    """ctypes layout of the per-process record filled in by the RSMI compute-process queries."""

    _fields_ = [
        ('process_id', c_uint32),
        # NOTE(review): presumably the Process Address Space ID - confirm against rocm_smi.h
        ('pasid', c_uint32),
        ('vram_usage', c_uint64),
        # SDMA: System Direct Memory Access
        ('sdma_usage', c_uint64),
        ('cu_occupancy', c_uint32),
    ]
class rsmi_xgmi_status_t(c_int):
    """XGMI error status values, exposed as integer class attributes."""

    RSMI_XGMI_STATUS_NO_ERRORS = 0x0
    RSMI_XGMI_STATUS_ERROR = 0x1
    RSMI_XGMI_STATUS_MULTIPLE_ERRORS = 0x2
class rsmi_io_link_type(c_int):
    """IO link type identifiers, exposed as integer class attributes."""

    RSMI_IOLINK_TYPE_UNDEFINED = 0x00
    RSMI_IOLINK_TYPE_HYPERTRANSPORT = 0x01
    RSMI_IOLINK_TYPE_PCIEXPRESS = 0x02
    RSMI_IOLINK_TYPE_AMBA = 0x03
    RSMI_IOLINK_TYPE_MIPI = 0x04
    RSMI_IOLINK_TYPE_QPI_1_1 = 0x05
    RSMI_IOLINK_TYPE_RESERVED1 = 0x06
    RSMI_IOLINK_TYPE_RESERVED2 = 0x07
    RSMI_IOLINK_TYPE_RAPID_IO = 0x08
    RSMI_IOLINK_TYPE_INFINIBAND = 0x09
    RSMI_IOLINK_TYPE_RESERVED3 = 0x0A
    RSMI_IOLINK_TYPE_XGMI = 0x0B
    RSMI_IOLINK_TYPE_XGOP = 0x0C
    RSMI_IOLINK_TYPE_GZ = 0x0D
    RSMI_IOLINK_TYPE_ETHERNET_RDMA = 0x0E
    RSMI_IOLINK_TYPE_RDMA_OTHER = 0x0F
    RSMI_IOLINK_TYPE_OTHER = 0x10
    # number of link types listed above
    RSMI_IOLINK_TYPE_NUMIOLINKTYPES = 0x11
    RSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF
## Library loading
# Handle of the loaded ROCm SMI shared library; stays None until _load_rocm_library() succeeds
rocm_lib = None
# Held while loading the library, resolving function pointers and updating the reference count
lib_load_lock = threading.Lock()
# Incremented by smi_initialize() and decremented by smi_shutdown()
_rocm_lib_refcount = 0
## Function access, to prevent lib_load_lock deadlock
_rocml_get_function_ptr_cache = dict()


def _rocml_get_function_ptr(name):
    """Return the library function called *name*, resolving it at most once.

    Resolved functions are kept in ``_rocml_get_function_ptr_cache``; cache hits are
    served without taking ``lib_load_lock``.

    Raises ROCMLError_Uninitialized when the library has not been loaded and
    ROCMLError_FunctionNotFound when the library has no attribute *name*.
    """
    try:
        return _rocml_get_function_ptr_cache[name]
    except KeyError:
        pass

    # the lock is released on every exit path of the with-block
    with lib_load_lock:
        # ensure library was loaded
        if rocm_lib is None:
            raise ROCMLError_Uninitialized
        try:
            function_ptr = getattr(rocm_lib, name)
        except AttributeError:
            raise ROCMLError_FunctionNotFound
        _rocml_get_function_ptr_cache[name] = function_ptr
        return function_ptr
def _load_rocm_library():
    """Load ROCm library if not already loaded

    The loaded handle is stored in the module-level ``rocm_lib``.

    Raises ROCMLError_NotSupported on Windows and ROCMLError_LibraryNotFound when the
    library file cannot be located or loaded.
    """
    global rocm_lib
    if rocm_lib is not None:
        return
    with lib_load_lock:
        # check again under the lock: another thread may have finished loading meanwhile
        if rocm_lib is not None:
            return
        if sys.platform[:3] == 'win':
            raise ROCMLError_NotSupported('Windows platform is not supported yet')
        # assume linux
        path_librocm = _find_lib_rocm()
        # _find_lib_rocm() returns '' when the file is missing; reject that explicitly
        # instead of handing an empty path to the dynamic loader
        if not path_librocm:
            raise ROCMLError_LibraryNotFound('ROCm library not found')
        try:
            rocm_lib = CDLL(path_librocm)
        except OSError:
            raise ROCMLError_LibraryNotFound('ROCm library not found')
def _find_lib_rocm():
    """Look for the ROCm SMI library under ``<ROCM_PATH>/lib``.

    ``ROCM_PATH`` is taken from the environment and defaults to ``/opt/rocm``.
    Returns the library path when the file exists, otherwise an empty string.
    """
    rocm_root = os.environ.get('ROCM_PATH', '/opt/rocm')
    candidate = join(rocm_root, f'lib/{LIBROCM_NAME}')
    if isfile(candidate):
        return candidate
    return ''
def _driver_initialized(initstate_path='/sys/module/amdgpu/initstate'):
    """Return True if the amdgpu kernel module reports the ``live`` init state.

    @param initstate_path: file holding the module init state; defaults to the amdgpu
        entry under /sys/module
    @return: True when the file can be read and contains ``live``, False otherwise
    """
    # read the file directly instead of spawning a shell running `cat | grep`
    try:
        with open(initstate_path) as initstate_file:
            return 'live' in initstate_file.read()
    except (OSError, UnicodeDecodeError):
        # a missing or unreadable file means the module is not loaded
        return False
def smi_initialize():
    """Initialize ROCm binding of SMI

    Loads the library, checks that the driver is initialized, calls ``rsmi_init(0)``
    and finally increments the module reference count. Raises RuntimeError when the
    driver check or the ``rsmi_init`` call fails.
    """
    global _rocm_lib_refcount

    _load_rocm_library()

    if not _driver_initialized():
        raise RuntimeError('ROCm driver initilization failed')

    ret_init = rocm_lib.rsmi_init(0)
    if ret_init != 0:
        logging.error(f'ROCm SMI init returned value {ret_init}')
        raise RuntimeError('ROCm SMI initialization failed')

    # update reference count
    with lib_load_lock:
        _rocm_lib_refcount += 1
def rsmi_ret_ok(my_ret, log_error=False):
    """ Returns true if RSMI call status is 0 (success)

    @param my_ret: Return of RSMI call (rocm_smi_lib API)
    @param log_error: Log the error message
    @return: True when my_ret equals RSMI_STATUS_SUCCESS, False otherwise
    """
    if my_ret == rsmi_status_t.RSMI_STATUS_SUCCESS:
        return True
    if log_error:
        err_str = c_char_p()
        rocm_lib.rsmi_status_string(my_ret, byref(err_str))
        if err_str.value is None:
            # no description was provided; log the raw status instead of
            # failing with AttributeError inside the error-reporting path
            logging.error('RSMI call failed with status %s', my_ret)
        else:
            logging.error(err_str.value.decode())
    return False
def smi_shutdown():
    """Shut the RSMI interface down while leaving the library loaded.

    The status of ``rsmi_shut_down`` is checked but not reported to the caller;
    the module reference count is decremented afterwards.
    """
    global _rocm_lib_refcount

    shutdown_status = rocm_lib.rsmi_shut_down()
    rsmi_ret_ok(shutdown_status)

    # update reference count
    with lib_load_lock:
        _rocm_lib_refcount -= 1
def smi_get_kernel_version():
    """Return the ROCm kernel driver version string, or '' when the RSMI query fails."""
    version_buf = create_string_buffer(256)
    status = rocm_lib.rsmi_version_str_get(rsmi_sw_component_t.RSMI_SW_COMP_DRIVER, version_buf, 256)
    if rsmi_ret_ok(status):
        return version_buf.value.decode()
    return ''
def smi_get_device_id(dev):
    """Return the device id of device *dev* as an integer, or -1 when the RSMI query fails."""
    device_id = c_uint64()
    status = rocm_lib.rsmi_dev_id_get(dev, byref(device_id))
    if not rsmi_ret_ok(status):
        return -1
    return device_id.value
def smi_get_device_count():
    """Return the number of monitored GPU devices, or -1 when the RSMI query fails."""
    device_count = c_uint32(0)
    status = rocm_lib.rsmi_num_monitor_devices(byref(device_count))
    if not rsmi_ret_ok(status):
        return -1
    return device_count.value
def smi_get_device_name(dev):
    """Return the name of GPU device *dev*, or '' when the RSMI query fails."""
    name_buf = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
    status = rocm_lib.rsmi_dev_name_get(dev, name_buf, RSMI_MAX_BUFFER_LENGTH)
    if rsmi_ret_ok(status):
        return name_buf.value.decode()
    return ''
def smi_get_device_unique_id(dev):
    """Return the unique id of device *dev* as a 64bit integer, or -1 when the RSMI query fails."""
    unique_id = c_uint64()
    status = rocm_lib.rsmi_dev_unique_id_get(dev, byref(unique_id))
    if not rsmi_ret_ok(status):
        return -1
    return unique_id.value
def smi_get_device_utilization(dev):
    """Return the busy percent of device *dev*, or -1 when the RSMI query fails."""
    utilization = c_uint32()
    status = rocm_lib.rsmi_dev_busy_percent_get(dev, byref(utilization))
    if not rsmi_ret_ok(status):
        return -1
    return utilization.value
def smi_get_device_memory_used(dev, type='VRAM'):
    """Return the used memory of device *dev* in bytes, or -1 when the RSMI query fails.

    *type* must be one of the names in ``memory_type_l``; any other value raises
    ValueError from the list lookup.
    """
    used_bytes = c_uint64()
    memory_type = memory_type_l.index(type)
    status = rocm_lib.rsmi_dev_memory_usage_get(dev, memory_type, byref(used_bytes))
    if not rsmi_ret_ok(status):
        return -1
    return used_bytes.value
def smi_get_device_memory_total(dev, type='VRAM'):
    """Return the total memory of device *dev* in bytes, or -1 when the RSMI query fails.

    *type* must be one of the names in ``memory_type_l``; any other value raises
    ValueError from the list lookup.
    """
    total_bytes = c_uint64()
    memory_type = memory_type_l.index(type)
    status = rocm_lib.rsmi_dev_memory_total_get(dev, memory_type, byref(total_bytes))
    if not rsmi_ret_ok(status):
        return -1
    return total_bytes.value
def smi_get_device_memory_busy(dev):
    """Return the percentage of time any memory of device *dev* is in use, or -1 on failure."""
    memory_busy = c_uint32()
    status = rocm_lib.rsmi_dev_memory_busy_percent_get(dev, byref(memory_busy))
    if not rsmi_ret_ok(status):
        return -1
    return memory_busy.value
def smi_get_device_memory_reserved_pages(dev):
    """Return ``(number of pages, record)`` describing the reserved memory pages of
    device *dev*, or -1 when the RSMI query fails."""
    # NOTE(review): a single record is allocated whatever the page count - confirm this
    # matches what rsmi_dev_memory_reserved_pages_get expects
    page_record = rsmi_retired_page_record_t()
    page_count = c_uint32()
    status = rocm_lib.rsmi_dev_memory_reserved_pages_get(dev, byref(page_count), byref(page_record))
    if not rsmi_ret_ok(status):
        return -1
    return (page_count.value, page_record)
# PCIE functions
def smi_get_device_pcie_bandwidth(dev):
    """Return the ``rsmi_pcie_bandwidth_t`` structure filled in for device *dev*
    (the possible pcie bandwidths), or -1 when the RSMI query fails."""
    pcie_bandwidth = rsmi_pcie_bandwidth_t()
    status = rocm_lib.rsmi_dev_pci_bandwidth_get(dev, byref(pcie_bandwidth))
    if not rsmi_ret_ok(status):
        return -1
    return pcie_bandwidth
def smi_get_device_pci_id(dev):
    """Return the unique PCI ID of device *dev* as a 64bit integer, or -1 on failure.

    Format of the returned value:
    BDFID = ((DOMAIN & 0xffffffff) << 32) | ((BUS & 0xff) << 8) |
            ((DEVICE & 0x1f) <<3 ) | (FUNCTION & 0x7)
    """
    pci_id = c_uint64()
    status = rocm_lib.rsmi_dev_pci_id_get(dev, byref(pci_id))
    if not rsmi_ret_ok(status):
        return -1
    return pci_id.value
def smi_get_device_topo_numa_affinity(dev):
    """Return the NUMA node associated with device *dev*, or -1 when the RSMI query fails."""
    numa_node = c_uint32()
    # the call must go through the module-level `rocm_lib` handle; the former
    # misspelled name raised NameError on every call
    ret = rocm_lib.rsmi_topo_numa_affinity_get(dev, byref(numa_node))
    return numa_node.value if rsmi_ret_ok(ret) else -1
def smi_get_device_pcie_throughput(dev):
    """Return the measured pcie throughput of device *dev* in bytes/sec, or -1 on failure.

    The value is (received + sent) multiplied by the maximum packet size, all three
    as reported by ``rsmi_dev_pci_throughput_get``.
    """
    sent_count = c_uint64()
    received_count = c_uint64()
    max_packet_size = c_uint64()
    status = rocm_lib.rsmi_dev_pci_throughput_get(
        dev, byref(sent_count), byref(received_count), byref(max_packet_size)
    )
    if not rsmi_ret_ok(status):
        return -1
    return (received_count.value + sent_count.value) * max_packet_size.value
def smi_get_device_pci_replay_counter(dev):
    """Return the PCIe replay counter of device *dev*, or -1 when the RSMI query fails."""
    replay_counter = c_uint64()
    status = rocm_lib.rsmi_dev_pci_replay_counter_get(dev, byref(replay_counter))
    if not rsmi_ret_ok(status):
        return -1
    return replay_counter.value
# Compute partition functions
def smi_get_device_compute_partition(dev):
    """Return the compute partition of device *dev* as a string, or '' on failure."""
    partition_buf = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
    status = rocm_lib.rsmi_dev_compute_partition_get(dev, byref(partition_buf), RSMI_MAX_BUFFER_LENGTH)
    if rsmi_ret_ok(status):
        return partition_buf.value.decode()
    return ''
def smi_set_device_compute_partition(dev, partition):
    """Set the compute partition of device *dev*; return True when the RSMI call succeeded."""
    status = rocm_lib.rsmi_dev_compute_partition_set(dev, partition)
    succeeded = rsmi_ret_ok(status)
    return succeeded
def smi_reset_device_compute_partition(dev):
    """Revert the compute partition of device *dev* to its boot state; return True on success."""
    status = rocm_lib.rsmi_dev_compute_partition_reset(dev)
    succeeded = rsmi_ret_ok(status)
    return succeeded
# Memory partition functions
def smi_get_device_memory_partition(dev):
    """Return the memory partition of device *dev* as a string, or '' on failure."""
    partition_buf = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
    status = rocm_lib.rsmi_dev_memory_partition_get(dev, byref(partition_buf), RSMI_MAX_BUFFER_LENGTH)
    if rsmi_ret_ok(status):
        return partition_buf.value.decode()
    return ''
def smi_set_device_memory_partition(dev, partition):
    """Set the memory partition of device *dev*; return True when the RSMI call succeeded."""
    status = rocm_lib.rsmi_dev_memory_partition_set(dev, partition)
    succeeded = rsmi_ret_ok(status)
    return succeeded
def smi_reset_device_memory_partition(dev):
    """Revert the memory partition of device *dev* to its boot state; return True on success."""
    status = rocm_lib.rsmi_dev_memory_partition_reset(dev)
    succeeded = rsmi_ret_ok(status)
    return succeeded
# Hardware Topology functions
def smi_get_device_topo_numa_node_number(dev):
    """Return the NUMA node number associated with device *dev*, or -1 on failure."""
    node_number = c_uint32()
    status = rocm_lib.rsmi_topo_get_numa_node_number(dev, byref(node_number))
    if not rsmi_ret_ok(status):
        return -1
    return node_number.value
def smi_get_device_topo_link_weight(dev_src, dev_dst):
    """Return the weight of the link between two devices, or -1 when the RSMI query fails."""
    link_weight = c_uint64()
    status = rocm_lib.rsmi_topo_get_link_weight(dev_src, dev_dst, byref(link_weight))
    if not rsmi_ret_ok(status):
        return -1
    return link_weight.value
def smi_get_device_minmax_bandwidth(dev_src, dev_dst):
    """Return ``(min, max)`` io link bandwidth between two devices, or -1 on failure.

    API works if src and dst are connected via XGMI and are 1 hop away.

    Raises AssertionError when the devices are not exactly 1 hop away.
    """
    link_info = smi_get_device_link_type(dev_src, dev_dst)
    if link_info == -1:
        # the link query itself failed; report it like any other failed RSMI call
        # instead of subscripting the -1 sentinel (TypeError)
        return -1
    # raised explicitly so the check is not stripped when running with `python -O`
    if link_info[0] != 1:
        raise AssertionError('Devices must be 1 hop away')
    min_bandwidth = c_uint64()
    max_bandwidth = c_uint64()
    ret = rocm_lib.rsmi_minmax_bandwidth_get(dev_src, dev_dst, byref(min_bandwidth), byref(max_bandwidth))
    return (min_bandwidth.value, max_bandwidth.value) if rsmi_ret_ok(ret) else -1
def smi_get_device_link_type(dev_src, dev_dst):
    """Return ``(hops, link type)`` for the link between two devices, or -1 on failure."""
    hop_count = c_uint64()
    io_link = rsmi_io_link_type()
    status = rocm_lib.rsmi_topo_get_link_type(dev_src, dev_dst, byref(hop_count), byref(io_link))
    if not rsmi_ret_ok(status):
        return -1
    return (hop_count.value, io_link.value)
def smi_is_device_p2p_accessible(dev_src, dev_dst):
    """Return whether the two devices are p2p accessible, or -1 when the RSMI query fails."""
    p2p_flag = c_bool()
    status = rocm_lib.rsmi_is_P2P_accessible(dev_src, dev_dst, byref(p2p_flag))
    if not rsmi_ret_ok(status):
        return -1
    return p2p_flag.value
def smi_get_device_compute_process():
    """Return the ids of the processes running compute on the system.

    An empty list is returned when either RSMI call fails.
    """
    # first call passes no buffer and is only used to size the buffer below
    process_count = c_uint32()
    count_status = rocm_lib.rsmi_compute_process_info_get(None, byref(process_count))
    if not rsmi_ret_ok(count_status):
        return []

    buffer_size = process_count.value + 10
    process_buffer = (rsmi_process_info_t * buffer_size)()
    fill_status = rocm_lib.rsmi_compute_process_info_get(byref(process_buffer), byref(process_count))
    if not rsmi_ret_ok(fill_status):
        return []

    process_ids = []
    for slot in range(process_count.value):
        process_ids.append(process_buffer[slot].process_id)
    return process_ids
def smi_get_device_average_power(dev):
    """Return the average power of device *dev*, or -1 when the RSMI query fails.

    The raw RSMI reading (sensor 0) is scaled by 1e-6 before it is returned.
    """
    # rsmi_dev_power_ave_get is declared with a `uint64_t *power` output in rocm_smi.h,
    # so the receiving buffer must be 64 bits wide
    power = c_uint64()
    ret = rocm_lib.rsmi_dev_power_ave_get(dev, 0, byref(power))
    return power.value * 1e-6 if rsmi_ret_ok(ret) else -1
# XGMI functions
def smi_get_device_xgmi_error_status(dev):
    """Return the XGMI error status of device *dev*, or -1 when the RSMI query fails."""
    xgmi_status = rsmi_xgmi_status_t()
    status = rocm_lib.rsmi_dev_xgmi_error_status(dev, byref(xgmi_status))
    if not rsmi_ret_ok(status):
        return -1
    return xgmi_status.value
def smi_reset_device_xgmi_error(dev):
    """Reset the XGMI error status of device *dev*; return True when the RSMI call succeeded."""
    status = rocm_lib.rsmi_dev_xgmi_error_reset(dev)
    succeeded = rsmi_ret_ok(status)
    return succeeded
def smi_get_device_xgmi_hive_id(dev):
    """Return the XGMI hive ID of device *dev*, or -1 when the RSMI query fails."""
    xgmi_hive = c_uint64()
    status = rocm_lib.rsmi_dev_xgmi_hive_id_get(dev, byref(xgmi_hive))
    if not rsmi_ret_ok(status):
        return -1
    return xgmi_hive.value
# constants for the UUID function
# B1 formats one value as two lowercase hex digits; B2/B4/B6 repeat it 2, 4 and 6 times
B1 = '%02x'
B2 = B1 * 2
B4 = B1 * 4
B6 = B1 * 6
# 'GPU-' followed by groups of 4-2-2-2-6 values (16 in total), joined by dashes
nv_fmt = f'GPU-{B4}-{B2}-{B2}-{B2}-{B6}'
# UUID function
def smi_get_device_uuid(dev, format='roc'):
    """Return the UUID of device *dev*.

    @param dev: index into the UUID list reported by get_device_uuids()
    @param format: 'roc' returns 'GPU-' followed by the UUID string as reported;
        'nv' renders the character codes of the UUID string through ``nv_fmt``
    @return: the formatted UUID string

    Raises AssertionError when *dev* is out of range and ValueError for an unknown *format*.
    """
    # NOTE: the UUID list is rebuilt on every call by running get_device_uuids()
    DEVICE_UUIDS = get_device_uuids()
    assert dev < len(DEVICE_UUIDS), 'Device index out of range'
    u_s = DEVICE_UUIDS[dev]
    if format == 'roc':
        # use hex strings
        return f'GPU-{u_s}'
    elif format == 'nv':
        # break down to ASCII strings according to the format
        # NOTE(review): nv_fmt has 16 placeholders, so this presumably expects a
        # 16-character UUID string - confirm against real rocminfo output
        b_a = bytearray()
        b_a.extend(map(ord, u_s))
        return nv_fmt % tuple(b_a)
    else:
        raise ValueError(f'Invalid format: \'{format}\'; use \'roc\' or \'nv\'')