mirror of
https://github.com/clearml/clearml
synced 2025-05-09 15:10:38 +00:00
Fix AMD GPU metrics collection
This commit is contained in:
parent
db4834f37e
commit
2fbd86415c
@ -17,10 +17,13 @@ from __future__ import unicode_literals
|
|||||||
import json
|
import json
|
||||||
import platform
|
import platform
|
||||||
import sys
|
import sys
|
||||||
|
import subprocess
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from ctypes import c_uint32, byref, c_int64
|
||||||
|
|
||||||
import psutil
|
import psutil
|
||||||
from ..gpu import pynvml as N
|
from ..gpu import pynvml as N
|
||||||
|
from ..gpu import pyrsmi as R
|
||||||
|
|
||||||
NOT_SUPPORTED = 'Not Supported'
|
NOT_SUPPORTED = 'Not Supported'
|
||||||
MB = 1024 * 1024
|
MB = 1024 * 1024
|
||||||
@ -193,7 +196,184 @@ class GPUStatCollection(object):
|
|||||||
del GPUStatCollection.global_processes[pid]
|
del GPUStatCollection.global_processes[pid]
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
|
def _new_query_amd(shutdown=False, per_process_stats=False, get_driver_info=False):
|
||||||
|
initialized = False
|
||||||
|
if not GPUStatCollection._initialized:
|
||||||
|
R.smi_initialize()
|
||||||
|
GPUStatCollection._initialized = True
|
||||||
|
initialized = True
|
||||||
|
|
||||||
|
def get_gpu_info(index):
|
||||||
|
def amd_query_processes():
|
||||||
|
num_procs = c_uint32()
|
||||||
|
ret = R.rocm_lib.rsmi_compute_process_info_get(None, byref(num_procs))
|
||||||
|
if R.rsmi_ret_ok(ret):
|
||||||
|
buff_sz = num_procs.value + 10
|
||||||
|
proc_info = (R.rsmi_process_info_t * buff_sz)()
|
||||||
|
ret = R.rocm_lib.rsmi_compute_process_info_get(byref(proc_info), byref(num_procs))
|
||||||
|
proc_info_list = (
|
||||||
|
[proc_info[i] for i in range(num_procs.value)]
|
||||||
|
if R.rsmi_ret_ok(ret)
|
||||||
|
else []
|
||||||
|
)
|
||||||
|
result_proc_info_list = []
|
||||||
|
# query VRAM usage explicitly, as rsmi_compute_process_info_get
|
||||||
|
# doesn't actually return VRAM usage
|
||||||
|
for proc_info in proc_info_list:
|
||||||
|
vram_query_proc_info = R.rsmi_process_info_t()
|
||||||
|
ret = R.rocm_lib.rsmi_compute_process_info_by_pid_get(
|
||||||
|
int(proc_info.process_id), byref(vram_query_proc_info)
|
||||||
|
)
|
||||||
|
if R.rsmi_ret_ok(ret):
|
||||||
|
proc_info.vram_usage = vram_query_proc_info.vram_usage
|
||||||
|
result_proc_info_list.append(proc_info)
|
||||||
|
return result_proc_info_list
|
||||||
|
return []
|
||||||
|
|
||||||
|
def get_fan_speed():
|
||||||
|
fan_level = c_int64()
|
||||||
|
fan_max = c_int64()
|
||||||
|
sensor_ind = c_uint32(0)
|
||||||
|
|
||||||
|
ret = R.rocm_lib.rsmi_dev_fan_speed_get(index, sensor_ind, byref(fan_level))
|
||||||
|
if not R.rsmi_ret_ok(ret, log_error=False):
|
||||||
|
return None
|
||||||
|
|
||||||
|
ret = R.rocm_lib.rsmi_dev_fan_speed_max_get(index, sensor_ind, byref(fan_max))
|
||||||
|
if not R.rsmi_ret_ok(ret, log_error=False):
|
||||||
|
return None
|
||||||
|
|
||||||
|
if fan_level.value <= 0 or fan_max <= 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return float(fan_level.value) / float(fan_max.value)
|
||||||
|
|
||||||
|
def get_process_info(comp_process):
|
||||||
|
process = {}
|
||||||
|
pid = comp_process.process_id
|
||||||
|
# skip global_processes caching because PID querying seems to be inconsistent atm
|
||||||
|
# if pid not in GPUStatCollection.global_processes:
|
||||||
|
# GPUStatCollection.global_processes[pid] = psutil.Process(pid=pid)
|
||||||
|
process["pid"] = pid
|
||||||
|
try:
|
||||||
|
process["gpu_memory_usage"] = comp_process.vram_usage // MB
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return process
|
||||||
|
|
||||||
|
if not GPUStatCollection._gpu_device_info.get(index):
|
||||||
|
uuid = R.smi_get_device_id(index)
|
||||||
|
name = R.smi_get_device_name(index)
|
||||||
|
GPUStatCollection._gpu_device_info[index] = (name, uuid)
|
||||||
|
|
||||||
|
name, uuid = GPUStatCollection._gpu_device_info[index]
|
||||||
|
|
||||||
|
temperature = None # TODO: fetch temperature. It should be possible
|
||||||
|
fan_speed = get_fan_speed()
|
||||||
|
|
||||||
|
try:
|
||||||
|
memory_total = R.smi_get_device_memory_total(index)
|
||||||
|
except Exception:
|
||||||
|
memory_total = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
memory_used = R.smi_get_device_memory_used(index)
|
||||||
|
except Exception:
|
||||||
|
memory_used = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
utilization = R.smi_get_device_utilization(index)
|
||||||
|
except Exception:
|
||||||
|
utilization = None
|
||||||
|
|
||||||
|
try:
|
||||||
|
power = R.smi_get_device_average_power(index)
|
||||||
|
except Exception:
|
||||||
|
power = None
|
||||||
|
|
||||||
|
power_limit = None # TODO: find a way to fetch this
|
||||||
|
|
||||||
|
processes = []
|
||||||
|
if per_process_stats:
|
||||||
|
try:
|
||||||
|
comp_processes = amd_query_processes()
|
||||||
|
except Exception:
|
||||||
|
comp_processes = []
|
||||||
|
for comp_process in comp_processes:
|
||||||
|
try:
|
||||||
|
process = get_process_info(comp_process)
|
||||||
|
except psutil.NoSuchProcess:
|
||||||
|
# skip process caching for now
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
processes.append(process)
|
||||||
|
|
||||||
|
gpu_info = {
|
||||||
|
"index": index,
|
||||||
|
"uuid": uuid,
|
||||||
|
"name": name,
|
||||||
|
"temperature.gpu": temperature if temperature is not None else 0,
|
||||||
|
"fan.speed": fan_speed if fan_speed is not None else 0,
|
||||||
|
"utilization.gpu": utilization if utilization is not None else 100,
|
||||||
|
"power.draw": power if power is not None else 0,
|
||||||
|
"enforced.power.limit": power_limit if power_limit is not None else 0,
|
||||||
|
# Convert bytes into MBytes
|
||||||
|
"memory.used": memory_used // MB if memory_used is not None else 0,
|
||||||
|
"memory.total": memory_total // MB if memory_total is not None else 100,
|
||||||
|
"processes": None if (processes and all(p is None for p in processes)) else processes,
|
||||||
|
}
|
||||||
|
if per_process_stats:
|
||||||
|
GPUStatCollection.clean_processes()
|
||||||
|
return gpu_info
|
||||||
|
|
||||||
|
gpu_list = []
|
||||||
|
if GPUStatCollection._device_count is None:
|
||||||
|
GPUStatCollection._device_count = R.smi_get_device_count()
|
||||||
|
|
||||||
|
for index in range(GPUStatCollection._device_count):
|
||||||
|
gpu_info = get_gpu_info(index)
|
||||||
|
gpu_stat = GPUStat(gpu_info)
|
||||||
|
gpu_list.append(gpu_stat)
|
||||||
|
|
||||||
|
if shutdown and initialized:
|
||||||
|
R.smi_shutdown()
|
||||||
|
GPUStatCollection._initialized = False
|
||||||
|
|
||||||
|
# noinspection PyProtectedMember
|
||||||
|
driver_version = GPUStatCollection._get_amd_driver_version() if get_driver_info else None
|
||||||
|
|
||||||
|
return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=None)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _get_amd_driver_version():
|
||||||
|
# make sure the program doesn't crash with something like a SEGFAULT when querying the driver version
|
||||||
|
try:
|
||||||
|
process = subprocess.Popen(["rocm-smi", "--showdriverversion", "--json"], stdout=subprocess.PIPE)
|
||||||
|
out, _ = process.communicate()
|
||||||
|
return json.loads(out)["system"]["Driver version"]
|
||||||
|
except Exception:
|
||||||
|
try:
|
||||||
|
process = subprocess.Popen(
|
||||||
|
[
|
||||||
|
sys.executable,
|
||||||
|
"-c",
|
||||||
|
"from clearml.utilities.gpu.pyrsmi import smi_get_kernel_version, smi_initialize; "
|
||||||
|
+ "smi_initialize(); "
|
||||||
|
+ "print(smi_get_kernel_version())",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
out, _ = process.communicate()
|
||||||
|
return out.strip()
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _running_in_amd_env():
|
||||||
|
# noinspection PyProtectedMember
|
||||||
|
return bool(R._find_lib_rocm())
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _new_query_nvidia(shutdown=False, per_process_stats=False, get_driver_info=False):
|
||||||
"""Query the information of all the GPUs on local machine"""
|
"""Query the information of all the GPUs on local machine"""
|
||||||
initialized = False
|
initialized = False
|
||||||
if not GPUStatCollection._initialized:
|
if not GPUStatCollection._initialized:
|
||||||
@ -408,6 +588,20 @@ class GPUStatCollection(object):
|
|||||||
|
|
||||||
return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
|
return GPUStatCollection(gpu_list, driver_version=driver_version, driver_cuda_version=cuda_driver_version)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def new_query(shutdown=False, per_process_stats=False, get_driver_info=False):
|
||||||
|
# noinspection PyProtectedMember
|
||||||
|
if GPUStatCollection._running_in_amd_env():
|
||||||
|
# noinspection PyProtectedMember
|
||||||
|
return GPUStatCollection._new_query_amd(
|
||||||
|
shutdown=shutdown, per_process_stats=per_process_stats, get_driver_info=get_driver_info
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# noinspection PyProtectedMember
|
||||||
|
return GPUStatCollection._new_query_nvidia(
|
||||||
|
shutdown=shutdown, per_process_stats=per_process_stats, get_driver_info=get_driver_info
|
||||||
|
)
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return len(self.gpus)
|
return len(self.gpus)
|
||||||
|
|
||||||
|
605
clearml/utilities/gpu/pyrsmi.py
Normal file
605
clearml/utilities/gpu/pyrsmi.py
Normal file
@ -0,0 +1,605 @@
|
|||||||
|
# MIT License
|
||||||
|
#
|
||||||
|
# Copyright (c) 2023 Advanced Micro Devices, Inc.
|
||||||
|
#
|
||||||
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
# of this software and associated documentation files (the "Software"), to deal
|
||||||
|
# in the Software without restriction, including without limitation the rights
|
||||||
|
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
# copies of the Software, and to permit persons to whom the Software is
|
||||||
|
# furnished to do so, subject to the following conditions:
|
||||||
|
#
|
||||||
|
# The above copyright notice and this permission notice shall be included in all
|
||||||
|
# copies or substantial portions of the Software.
|
||||||
|
#
|
||||||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
# SOFTWARE.
|
||||||
|
|
||||||
|
# Python bindings for ROCm-SMI library
|
||||||
|
from ctypes import *
|
||||||
|
from os.path import join, realpath, isfile
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
from enum import IntEnum, auto
|
||||||
|
|
||||||
|
|
||||||
|
def get_device_uuids():
|
||||||
|
"""Get the UUIDs of all ROCm devices from rocminfo output,
|
||||||
|
according to HSA spec.
|
||||||
|
"""
|
||||||
|
uuids = []
|
||||||
|
check_cmd ='rocminfo'
|
||||||
|
|
||||||
|
try:
|
||||||
|
proc_complete = subprocess.run(
|
||||||
|
check_cmd.split(),
|
||||||
|
stdout=subprocess.PIPE,
|
||||||
|
stderr=subprocess.STDOUT,
|
||||||
|
check=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
for line in proc_complete.stdout.decode('utf-8').split('\n'):
|
||||||
|
# drops CPU devices from the output
|
||||||
|
if 'Uuid' in line and 'GPU-' in line:
|
||||||
|
uuids.append(line.strip().split('GPU-')[-1])
|
||||||
|
|
||||||
|
except (FileNotFoundError, subprocess.CalledProcessError) as err:
|
||||||
|
print(' Error => ', str(err))
|
||||||
|
|
||||||
|
return uuids
|
||||||
|
|
||||||
|
|
||||||
|
## Error checking
|
||||||
|
class ROCMLError_NotSupported(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ROCMLError_FunctionNotFound(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ROCMLError_LibraryNotFound(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ROCMLError_DriverNotLoaded(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ROCMLError_Unknown(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ROCMLError_Uninitialized(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ROCMLState(IntEnum):
|
||||||
|
UNINITIALIZED = auto()
|
||||||
|
"""No attempt yet made to initialize PyROCML"""
|
||||||
|
INITIALIZED = auto()
|
||||||
|
"""PyROCML was successfully initialized"""
|
||||||
|
DISABLED_PYROCML_NOT_AVAILABLE = auto()
|
||||||
|
"""PyROCML not installed"""
|
||||||
|
DISABLED_CONFIG = auto()
|
||||||
|
"""PyROCML diagnostics disabled by ``distributed.diagnostics.rocml`` config setting"""
|
||||||
|
DISABLED_LIBRARY_NOT_FOUND = auto()
|
||||||
|
"""PyROCML available, but ROCML not installed"""
|
||||||
|
|
||||||
|
|
||||||
|
LIBROCM_NAME = 'librocm_smi64.so'
|
||||||
|
RSMI_MAX_BUFFER_LENGTH = 256
|
||||||
|
|
||||||
|
# Policy enums
|
||||||
|
RSMI_MAX_NUM_FREQUENCIES = 32
|
||||||
|
|
||||||
|
|
||||||
|
class rsmi_status_t(c_int):
|
||||||
|
RSMI_STATUS_SUCCESS = 0x0
|
||||||
|
RSMI_STATUS_INVALID_ARGS = 0x1
|
||||||
|
RSMI_STATUS_NOT_SUPPORTED = 0x2
|
||||||
|
RSMI_STATUS_FILE_ERROR = 0x3
|
||||||
|
RSMI_STATUS_PERMISSION = 0x4
|
||||||
|
RSMI_STATUS_OUT_OF_RESOURCES = 0x5
|
||||||
|
RSMI_STATUS_INTERNAL_EXCEPTION = 0x6
|
||||||
|
RSMI_STATUS_INPUT_OUT_OF_BOUNDS = 0x7
|
||||||
|
RSMI_STATUS_INIT_ERROR = 0x8
|
||||||
|
RSMI_INITIALIZATION_ERROR = RSMI_STATUS_INIT_ERROR
|
||||||
|
RSMI_STATUS_NOT_YET_IMPLEMENTED = 0x9
|
||||||
|
RSMI_STATUS_NOT_FOUND = 0xA
|
||||||
|
RSMI_STATUS_INSUFFICIENT_SIZE = 0xB
|
||||||
|
RSMI_STATUS_INTERRUPT = 0xC
|
||||||
|
RSMI_STATUS_UNEXPECTED_SIZE = 0xD
|
||||||
|
RSMI_STATUS_NO_DATA = 0xE
|
||||||
|
RSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF
|
||||||
|
|
||||||
|
|
||||||
|
#Dictionary of rsmi ret codes and it's verbose output
|
||||||
|
rsmi_status_verbose_err_out = {
|
||||||
|
rsmi_status_t.RSMI_STATUS_SUCCESS: 'Operation was successful',
|
||||||
|
rsmi_status_t.RSMI_STATUS_INVALID_ARGS: 'Invalid arguments provided',
|
||||||
|
rsmi_status_t.RSMI_STATUS_NOT_SUPPORTED: 'Not supported on the given system',
|
||||||
|
rsmi_status_t.RSMI_STATUS_FILE_ERROR: 'Problem accessing a file',
|
||||||
|
rsmi_status_t.RSMI_STATUS_PERMISSION: 'Permission denied',
|
||||||
|
rsmi_status_t.RSMI_STATUS_OUT_OF_RESOURCES: 'Unable to acquire memory or other resource',
|
||||||
|
rsmi_status_t.RSMI_STATUS_INTERNAL_EXCEPTION: 'An internal exception was caught',
|
||||||
|
rsmi_status_t.RSMI_STATUS_INPUT_OUT_OF_BOUNDS: 'Provided input is out of allowable or safe range',
|
||||||
|
rsmi_status_t.RSMI_INITIALIZATION_ERROR: 'Error occured during rsmi initialization',
|
||||||
|
rsmi_status_t.RSMI_STATUS_NOT_YET_IMPLEMENTED: 'Requested function is not implemented on this setup',
|
||||||
|
rsmi_status_t.RSMI_STATUS_NOT_FOUND: 'Item searched for but not found',
|
||||||
|
rsmi_status_t.RSMI_STATUS_INSUFFICIENT_SIZE: 'Insufficient resources available',
|
||||||
|
rsmi_status_t.RSMI_STATUS_INTERRUPT: 'Interrupt occured during execution',
|
||||||
|
rsmi_status_t.RSMI_STATUS_UNEXPECTED_SIZE: 'Unexpected amount of data read',
|
||||||
|
rsmi_status_t.RSMI_STATUS_NO_DATA: 'No data found for the given input',
|
||||||
|
rsmi_status_t.RSMI_STATUS_UNKNOWN_ERROR: 'Unknown error occured'
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class rsmi_init_flags_t(c_int):
|
||||||
|
RSMI_INIT_FLAG_ALL_GPUS = 0x1
|
||||||
|
|
||||||
|
|
||||||
|
class rsmi_memory_type_t(c_int):
|
||||||
|
RSMI_MEM_TYPE_FIRST = 0
|
||||||
|
RSMI_MEM_TYPE_VRAM = RSMI_MEM_TYPE_FIRST
|
||||||
|
RSMI_MEM_TYPE_VIS_VRAM = 1
|
||||||
|
RSMI_MEM_TYPE_GTT = 2
|
||||||
|
RSMI_MEM_TYPE_LAST = RSMI_MEM_TYPE_GTT
|
||||||
|
|
||||||
|
|
||||||
|
# memory_type_l includes names for with rsmi_memory_type_t
|
||||||
|
# Usage example to get corresponding names:
|
||||||
|
# memory_type_l[rsmi_memory_type_t.RSMI_MEM_TYPE_VRAM] will return string 'vram'
|
||||||
|
memory_type_l = ['VRAM', 'VIS_VRAM', 'GTT']
|
||||||
|
|
||||||
|
|
||||||
|
class rsmi_retired_page_record_t(Structure):
|
||||||
|
_fields_ = [('page_address', c_uint64),
|
||||||
|
('page_size', c_uint64),
|
||||||
|
('status', c_int)]
|
||||||
|
|
||||||
|
|
||||||
|
class rsmi_sw_component_t(c_int):
|
||||||
|
RSMI_SW_COMP_FIRST = 0x0
|
||||||
|
RSMI_SW_COMP_DRIVER = RSMI_SW_COMP_FIRST
|
||||||
|
RSMI_SW_COMP_LAST = RSMI_SW_COMP_DRIVER
|
||||||
|
|
||||||
|
|
||||||
|
class rsmi_frequencies_t(Structure):
|
||||||
|
_fields_ = [('num_supported', c_int32),
|
||||||
|
('current', c_uint32),
|
||||||
|
('frequency', c_uint64 * RSMI_MAX_NUM_FREQUENCIES)]
|
||||||
|
|
||||||
|
|
||||||
|
class rsmi_pcie_bandwidth_t(Structure):
|
||||||
|
_fields_ = [('transfer_rate', rsmi_frequencies_t),
|
||||||
|
('lanes', c_uint32 * RSMI_MAX_NUM_FREQUENCIES)]
|
||||||
|
|
||||||
|
|
||||||
|
class rsmi_process_info_t(Structure):
|
||||||
|
_fields_ = [('process_id', c_uint32),
|
||||||
|
('pasid', c_uint32), # PSA: Power Spectrum Analysis ?
|
||||||
|
('vram_usage', c_uint64),
|
||||||
|
('sdma_usage', c_uint64), # SDMA: System Direct Memory Access
|
||||||
|
('cu_occupancy', c_uint32)]
|
||||||
|
|
||||||
|
|
||||||
|
class rsmi_xgmi_status_t(c_int):
|
||||||
|
RSMI_XGMI_STATUS_NO_ERRORS = 0
|
||||||
|
RSMI_XGMI_STATUS_ERROR = 1
|
||||||
|
RSMI_XGMI_STATUS_MULTIPLE_ERRORS = 2
|
||||||
|
|
||||||
|
|
||||||
|
class rsmi_io_link_type(c_int):
|
||||||
|
RSMI_IOLINK_TYPE_UNDEFINED = 0
|
||||||
|
RSMI_IOLINK_TYPE_HYPERTRANSPORT = 1
|
||||||
|
RSMI_IOLINK_TYPE_PCIEXPRESS = 2
|
||||||
|
RSMI_IOLINK_TYPE_AMBA = 3
|
||||||
|
RSMI_IOLINK_TYPE_MIPI = 4
|
||||||
|
RSMI_IOLINK_TYPE_QPI_1_1 = 5
|
||||||
|
RSMI_IOLINK_TYPE_RESERVED1 = 6
|
||||||
|
RSMI_IOLINK_TYPE_RESERVED2 = 7
|
||||||
|
RSMI_IOLINK_TYPE_RAPID_IO = 8
|
||||||
|
RSMI_IOLINK_TYPE_INFINIBAND = 9
|
||||||
|
RSMI_IOLINK_TYPE_RESERVED3 = 10
|
||||||
|
RSMI_IOLINK_TYPE_XGMI = 11
|
||||||
|
RSMI_IOLINK_TYPE_XGOP = 12
|
||||||
|
RSMI_IOLINK_TYPE_GZ = 13
|
||||||
|
RSMI_IOLINK_TYPE_ETHERNET_RDMA = 14
|
||||||
|
RSMI_IOLINK_TYPE_RDMA_OTHER = 15
|
||||||
|
RSMI_IOLINK_TYPE_OTHER = 16
|
||||||
|
RSMI_IOLINK_TYPE_NUMIOLINKTYPES = 17
|
||||||
|
RSMI_IOLINK_TYPE_SIZE = 0xFFFFFFFF
|
||||||
|
|
||||||
|
|
||||||
|
## Library loading
|
||||||
|
rocm_lib = None
|
||||||
|
lib_load_lock = threading.Lock()
|
||||||
|
_rocm_lib_refcount = 0
|
||||||
|
|
||||||
|
|
||||||
|
## Function access, to prevent lib_load_lock deadlock
|
||||||
|
_rocml_get_function_ptr_cache = dict()
|
||||||
|
|
||||||
|
def _rocml_get_function_ptr(name):
|
||||||
|
global rocm_lib
|
||||||
|
|
||||||
|
if name in _rocml_get_function_ptr_cache:
|
||||||
|
return _rocml_get_function_ptr_cache[name]
|
||||||
|
|
||||||
|
lib_load_lock.acquire()
|
||||||
|
try:
|
||||||
|
# ensure library was loaded
|
||||||
|
if rocm_lib == None:
|
||||||
|
raise ROCMLError_Uninitialized
|
||||||
|
try:
|
||||||
|
_rocml_get_function_ptr_cache[name] = getattr(rocm_lib, name)
|
||||||
|
return _rocml_get_function_ptr_cache[name]
|
||||||
|
except AttributeError:
|
||||||
|
raise ROCMLError_FunctionNotFound
|
||||||
|
finally:
|
||||||
|
# lock is always freed
|
||||||
|
lib_load_lock.release()
|
||||||
|
|
||||||
|
|
||||||
|
def _load_rocm_library():
|
||||||
|
"""Load ROCm library if not already loaded"""
|
||||||
|
global rocm_lib
|
||||||
|
|
||||||
|
if rocm_lib == None:
|
||||||
|
|
||||||
|
lib_load_lock.acquire()
|
||||||
|
|
||||||
|
try:
|
||||||
|
if rocm_lib == None:
|
||||||
|
try:
|
||||||
|
if sys.platform[:3] == 'win':
|
||||||
|
raise ROCMLError_NotSupported('Windows platform is not supported yet')
|
||||||
|
else:
|
||||||
|
# assume linux
|
||||||
|
path_librocm = _find_lib_rocm()
|
||||||
|
cdll.LoadLibrary(path_librocm)
|
||||||
|
rocm_lib = CDLL(path_librocm)
|
||||||
|
except OSError:
|
||||||
|
raise ROCMLError_LibraryNotFound('ROCm library not found')
|
||||||
|
if rocm_lib == None:
|
||||||
|
raise ROCMLError_LibraryNotFound('ROCm library not found')
|
||||||
|
finally:
|
||||||
|
lib_load_lock.release()
|
||||||
|
|
||||||
|
|
||||||
|
def _find_lib_rocm():
|
||||||
|
"""search for librocm and returns path
|
||||||
|
if search fails, returns empty string
|
||||||
|
"""
|
||||||
|
rocm_path = os.environ.get('ROCM_PATH', '/opt/rocm')
|
||||||
|
rocm_lib_path = join(rocm_path, f'lib/{LIBROCM_NAME}')
|
||||||
|
return rocm_lib_path if isfile(rocm_lib_path) else ''
|
||||||
|
|
||||||
|
|
||||||
|
def _driver_initialized():
|
||||||
|
""" Returns true if amdgpu is found in the list of initialized modules
|
||||||
|
"""
|
||||||
|
initialized = ''
|
||||||
|
try:
|
||||||
|
initialized = str(subprocess.check_output("cat /sys/module/amdgpu/initstate |grep live", shell=True))
|
||||||
|
except subprocess.CalledProcessError:
|
||||||
|
pass
|
||||||
|
return len(initialized) > 0
|
||||||
|
|
||||||
|
|
||||||
|
def smi_initialize():
|
||||||
|
"""Initialize ROCm binding of SMI"""
|
||||||
|
_load_rocm_library()
|
||||||
|
|
||||||
|
if _driver_initialized():
|
||||||
|
ret_init = rocm_lib.rsmi_init(0)
|
||||||
|
if ret_init != 0:
|
||||||
|
logging.error(f'ROCm SMI init returned value {ret_init}')
|
||||||
|
raise RuntimeError('ROCm SMI initialization failed')
|
||||||
|
else:
|
||||||
|
raise RuntimeError('ROCm driver initilization failed')
|
||||||
|
|
||||||
|
# update reference count
|
||||||
|
global _rocm_lib_refcount
|
||||||
|
lib_load_lock.acquire()
|
||||||
|
_rocm_lib_refcount += 1
|
||||||
|
lib_load_lock.release()
|
||||||
|
|
||||||
|
|
||||||
|
def rsmi_ret_ok(my_ret, log_error=False):
|
||||||
|
""" Returns true if RSMI call status is 0 (success)
|
||||||
|
|
||||||
|
@param device: DRM device identifier
|
||||||
|
@param my_ret: Return of RSMI call (rocm_smi_lib API)
|
||||||
|
@param log_error: Log the error message
|
||||||
|
@param metric: Parameter of GPU currently being analyzed
|
||||||
|
"""
|
||||||
|
if my_ret != rsmi_status_t.RSMI_STATUS_SUCCESS:
|
||||||
|
if log_error:
|
||||||
|
err_str = c_char_p()
|
||||||
|
rocm_lib.rsmi_status_string(my_ret, byref(err_str))
|
||||||
|
logging.error(err_str.value.decode())
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def smi_shutdown():
|
||||||
|
"""leave the library loaded, but shutdown the interface"""
|
||||||
|
rsmi_ret_ok(rocm_lib.rsmi_shut_down())
|
||||||
|
|
||||||
|
# update reference count
|
||||||
|
global _rocm_lib_refcount
|
||||||
|
lib_load_lock.acquire()
|
||||||
|
_rocm_lib_refcount -= 1
|
||||||
|
lib_load_lock.release()
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_kernel_version():
|
||||||
|
"""returns ROCm kernerl driver version"""
|
||||||
|
ver_str = create_string_buffer(256)
|
||||||
|
ret = rocm_lib.rsmi_version_str_get(rsmi_sw_component_t.RSMI_SW_COMP_DRIVER, ver_str, 256)
|
||||||
|
return ver_str.value.decode() if rsmi_ret_ok(ret) else ''
|
||||||
|
|
||||||
|
def smi_get_device_id(dev):
|
||||||
|
"""returns device id of the device as 64bit integer"""
|
||||||
|
uid = c_uint64()
|
||||||
|
ret = rocm_lib.rsmi_dev_id_get(dev, byref(uid))
|
||||||
|
return uid.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
def smi_get_device_count():
|
||||||
|
"""returns a list of GPU devices """
|
||||||
|
num_device = c_uint32(0)
|
||||||
|
ret = rocm_lib.rsmi_num_monitor_devices(byref(num_device))
|
||||||
|
return num_device.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_name(dev):
|
||||||
|
"""returns the name of a GPU device"""
|
||||||
|
series = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
|
||||||
|
ret = rocm_lib.rsmi_dev_name_get(dev, series, RSMI_MAX_BUFFER_LENGTH)
|
||||||
|
return series.value.decode() if rsmi_ret_ok(ret) else ''
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_unique_id(dev):
|
||||||
|
"""returns unique id of the device as 64bit integer"""
|
||||||
|
uid = c_uint64()
|
||||||
|
ret = rocm_lib.rsmi_dev_unique_id_get(dev, byref(uid))
|
||||||
|
return uid.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
def smi_get_device_utilization(dev):
|
||||||
|
"""returns GPU device busy percent of device_id dev"""
|
||||||
|
busy_percent = c_uint32()
|
||||||
|
ret = rocm_lib.rsmi_dev_busy_percent_get(dev, byref(busy_percent))
|
||||||
|
return busy_percent.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_memory_used(dev, type='VRAM'):
|
||||||
|
"""returns used memory of device_id dev in bytes"""
|
||||||
|
type_idx = memory_type_l.index(type)
|
||||||
|
used = c_uint64()
|
||||||
|
ret = rocm_lib.rsmi_dev_memory_usage_get(dev, type_idx, byref(used))
|
||||||
|
return used.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_memory_total(dev, type='VRAM'):
|
||||||
|
"""returns total memory of device_id dev in bytes"""
|
||||||
|
type_idx = memory_type_l.index(type)
|
||||||
|
total = c_uint64()
|
||||||
|
ret = rocm_lib.rsmi_dev_memory_total_get(dev, type_idx, byref(total))
|
||||||
|
return total.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_memory_busy(dev):
|
||||||
|
"""returns percentage of time any device memory is being used"""
|
||||||
|
busy_percent = c_uint32()
|
||||||
|
ret = rocm_lib.rsmi_dev_memory_busy_percent_get(dev, byref(busy_percent))
|
||||||
|
return busy_percent.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_memory_reserved_pages(dev):
|
||||||
|
"""returns info about reserved memory pages"""
|
||||||
|
num_pages = c_uint32()
|
||||||
|
records = rsmi_retired_page_record_t()
|
||||||
|
ret = rocm_lib.rsmi_dev_memory_reserved_pages_get(dev, byref(num_pages), byref(records))
|
||||||
|
return (num_pages.value, records) if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
# PCIE functions
|
||||||
|
def smi_get_device_pcie_bandwidth(dev):
|
||||||
|
"""returns list of possible pcie bandwidths for the device in bytes/sec"""
|
||||||
|
bandwidth = rsmi_pcie_bandwidth_t()
|
||||||
|
ret = rocm_lib.rsmi_dev_pci_bandwidth_get(dev, byref(bandwidth))
|
||||||
|
return bandwidth if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_pci_id(dev):
|
||||||
|
"""returns unique PCI ID of the device in 64bit Hex with format:
|
||||||
|
BDFID = ((DOMAIN & 0xffffffff) << 32) | ((BUS & 0xff) << 8) |
|
||||||
|
((DEVICE & 0x1f) <<3 ) | (FUNCTION & 0x7)
|
||||||
|
"""
|
||||||
|
bdfid = c_uint64()
|
||||||
|
ret = rocm_lib.rsmi_dev_pci_id_get(dev, byref(bdfid))
|
||||||
|
return bdfid.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_topo_numa_affinity(dev):
|
||||||
|
"""returns the NUMA node associated with the device"""
|
||||||
|
numa_node = c_uint32()
|
||||||
|
ret = reocm_lib.rsmi_topo_numa_affinity_get(dev, byref(numa_node))
|
||||||
|
return numa_node.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_pcie_throughput(dev):
|
||||||
|
"""returns measured pcie throughput for the device in bytes/sec"""
|
||||||
|
sent = c_uint64()
|
||||||
|
recv = c_uint64()
|
||||||
|
max_pkt_sz = c_uint64()
|
||||||
|
ret = rocm_lib.rsmi_dev_pci_throughput_get(dev, byref(sent), byref(recv), byref(max_pkt_sz))
|
||||||
|
return (recv.value + sent.value) * max_pkt_sz.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_pci_replay_counter(dev):
|
||||||
|
"""return PCIe replay counter of the device"""
|
||||||
|
counter = c_uint64()
|
||||||
|
ret = rocm_lib.rsmi_dev_pci_replay_counter_get(dev, byref(counter))
|
||||||
|
return counter.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
# Compute partition functions
|
||||||
|
def smi_get_device_compute_partition(dev):
|
||||||
|
"""returns the compute partition of the device"""
|
||||||
|
partition = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
|
||||||
|
ret = rocm_lib.rsmi_dev_compute_partition_get(dev, byref(partition), RSMI_MAX_BUFFER_LENGTH)
|
||||||
|
return partition.value.decode() if rsmi_ret_ok(ret) else ''
|
||||||
|
|
||||||
|
|
||||||
|
def smi_set_device_compute_partition(dev, partition):
|
||||||
|
"""modifies the compute partition of the selected device"""
|
||||||
|
ret = rocm_lib.rsmi_dev_compute_partition_set(dev, partition)
|
||||||
|
return rsmi_ret_ok(ret)
|
||||||
|
|
||||||
|
|
||||||
|
def smi_reset_device_compute_partition(dev):
|
||||||
|
"""reverts the compute partition of the selected device to its boot state"""
|
||||||
|
ret = rocm_lib.rsmi_dev_compute_partition_reset(dev)
|
||||||
|
return rsmi_ret_ok(ret)
|
||||||
|
|
||||||
|
|
||||||
|
# Memory partition functions
|
||||||
|
def smi_get_device_memory_partition(dev):
|
||||||
|
"""returns the memory partition of the device"""
|
||||||
|
partition = create_string_buffer(RSMI_MAX_BUFFER_LENGTH)
|
||||||
|
ret = rocm_lib.rsmi_dev_memory_partition_get(dev, byref(partition), RSMI_MAX_BUFFER_LENGTH)
|
||||||
|
return partition.value.decode() if rsmi_ret_ok(ret) else ''
|
||||||
|
|
||||||
|
|
||||||
|
def smi_set_device_memory_partition(dev, partition):
|
||||||
|
"""modifies the memory partition of the selected device"""
|
||||||
|
ret = rocm_lib.rsmi_dev_memory_partition_set(dev, partition)
|
||||||
|
return rsmi_ret_ok(ret)
|
||||||
|
|
||||||
|
|
||||||
|
def smi_reset_device_memory_partition(dev):
|
||||||
|
"""reverts the memory partition of the selected device to its boot state"""
|
||||||
|
ret = rocm_lib.rsmi_dev_memory_partition_reset(dev)
|
||||||
|
return rsmi_ret_ok(ret)
|
||||||
|
|
||||||
|
|
||||||
|
# Hardware Topology functions
|
||||||
|
def smi_get_device_topo_numa_node_number(dev):
|
||||||
|
"""returns the NUMA node associated with the device"""
|
||||||
|
numa_node = c_uint32()
|
||||||
|
ret = rocm_lib.rsmi_topo_get_numa_node_number(dev, byref(numa_node))
|
||||||
|
return numa_node.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_topo_link_weight(dev_src, dev_dst):
|
||||||
|
"""returns the weight of the link between two devices"""
|
||||||
|
weight = c_uint64()
|
||||||
|
ret = rocm_lib.rsmi_topo_get_link_weight(dev_src, dev_dst, byref(weight))
|
||||||
|
return weight.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_minmax_bandwidth(dev_src, dev_dst):
|
||||||
|
"""returns the minimum and maximum io link bandwidth between two devices
|
||||||
|
API works if src and dst are connected via XGMI and are 1 hop away.
|
||||||
|
"""
|
||||||
|
assert smi_get_device_link_type(dev_src, dev_dst)[0] == 1, 'Devices must be 1 hop away'
|
||||||
|
min_bandwidth = c_uint64()
|
||||||
|
max_bandwidth = c_uint64()
|
||||||
|
ret = rocm_lib.rsmi_minmax_bandwidth_get(dev_src, dev_dst, byref(min_bandwidth), byref(max_bandwidth))
|
||||||
|
return (min_bandwidth.value, max_bandwidth.value) if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_link_type(dev_src, dev_dst):
|
||||||
|
"""returns the hops and the type of link between two devices"""
|
||||||
|
hops = c_uint64()
|
||||||
|
link_type = rsmi_io_link_type()
|
||||||
|
ret = rocm_lib.rsmi_topo_get_link_type(dev_src, dev_dst, byref(hops), byref(link_type))
|
||||||
|
return (hops.value, link_type.value) if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_is_device_p2p_accessible(dev_src, dev_dst):
|
||||||
|
"""returns true if two devices are p2p accessible"""
|
||||||
|
accessible = c_bool()
|
||||||
|
ret = rocm_lib.rsmi_is_P2P_accessible(dev_src, dev_dst, byref(accessible))
|
||||||
|
return accessible.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_compute_process():
|
||||||
|
"""returns list of process ids running compute on the system"""
|
||||||
|
num_procs = c_uint32()
|
||||||
|
ret = rocm_lib.rsmi_compute_process_info_get(None, byref(num_procs))
|
||||||
|
if rsmi_ret_ok(ret):
|
||||||
|
buff_sz = num_procs.value + 10
|
||||||
|
proc_info = (rsmi_process_info_t * buff_sz)()
|
||||||
|
ret2 = rocm_lib.rsmi_compute_process_info_get(byref(proc_info), byref(num_procs))
|
||||||
|
|
||||||
|
return [proc_info[i].process_id for i in range(num_procs.value)] if rsmi_ret_ok(ret2) else []
|
||||||
|
else:
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_average_power(dev):
|
||||||
|
"""returns average power of device_id dev"""
|
||||||
|
power = c_uint32()
|
||||||
|
ret = rocm_lib.rsmi_dev_power_ave_get(dev, 0, byref(power))
|
||||||
|
|
||||||
|
return power.value * 1e-6 if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
# XGMI fuctions
|
||||||
|
def smi_get_device_xgmi_error_status(dev):
|
||||||
|
"""returns XGMI error status for a device"""
|
||||||
|
status = rsmi_xgmi_status_t()
|
||||||
|
ret = rocm_lib.rsmi_dev_xgmi_error_status(dev, byref(status))
|
||||||
|
return status.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
def smi_reset_device_xgmi_error(dev):
|
||||||
|
"""resets XGMI error status for a device"""
|
||||||
|
ret = rocm_lib.rsmi_dev_xgmi_error_reset(dev)
|
||||||
|
return rsmi_ret_ok(ret)
|
||||||
|
|
||||||
|
|
||||||
|
def smi_get_device_xgmi_hive_id(dev):
|
||||||
|
"""returns XGMI hive ID for a device"""
|
||||||
|
hive_id = c_uint64()
|
||||||
|
ret = rocm_lib.rsmi_dev_xgmi_hive_id_get(dev, byref(hive_id))
|
||||||
|
return hive_id.value if rsmi_ret_ok(ret) else -1
|
||||||
|
|
||||||
|
|
||||||
|
# constants for the UUID function
|
||||||
|
B1 = '%02x'
|
||||||
|
B2 = B1 * 2
|
||||||
|
B4 = B1 * 4
|
||||||
|
B6 = B1 * 6
|
||||||
|
nv_fmt = f'GPU-{B4}-{B2}-{B2}-{B2}-{B6}'
|
||||||
|
|
||||||
|
# UUID function
|
||||||
|
def smi_get_device_uuid(dev, format='roc'):
|
||||||
|
DEVICE_UUIDS = get_device_uuids()
|
||||||
|
"""returns the UUID of the device"""
|
||||||
|
assert dev < len(DEVICE_UUIDS), 'Device index out of range'
|
||||||
|
|
||||||
|
u_s = DEVICE_UUIDS[dev]
|
||||||
|
|
||||||
|
if format == 'roc':
|
||||||
|
# use hex strings
|
||||||
|
return f'GPU-{u_s}'
|
||||||
|
elif format == 'nv':
|
||||||
|
# break down to ASCII strings according to the format
|
||||||
|
b_a = bytearray()
|
||||||
|
b_a.extend(map(ord, u_s))
|
||||||
|
return nv_fmt % tuple(b_a)
|
||||||
|
else:
|
||||||
|
raise ValueError(f'Invalid format: \'{format}\'; use \'roc\' or \'nv\'')
|
Loading…
Reference in New Issue
Block a user