From 762240d14d61eb779e49ebb76fc05b15c3a9ae63 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Tue, 9 Apr 2024 19:13:16 +0300 Subject: [PATCH] Fix resource monitor fails to get GPU stats in some edge cases --- clearml/utilities/resource_monitor.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/clearml/utilities/resource_monitor.py b/clearml/utilities/resource_monitor.py index 1e022127..88c6b40a 100644 --- a/clearml/utilities/resource_monitor.py +++ b/clearml/utilities/resource_monitor.py @@ -45,6 +45,12 @@ class ResourceMonitor(BackgroundMonitor): self._last_process_id_list = [] self._gpu_memory_per_process = True + # noinspection PyBroadException + try: + self._debug_mode = bool(os.getenv("CLEARML_RESMON_DEBUG", "")) + except Exception: + self._debug_mode = False + if not self._gpustat: self._task.get_logger().report_text('ClearML Monitor: GPU monitoring is not available') else: # if running_remotely(): @@ -247,8 +253,11 @@ class ResourceMonitor(BackgroundMonitor): # something happened and we can't use gpu stats, self._gpustat_fail += 1 if self._gpustat_fail >= 3: - self._task.get_logger().report_text('ClearML Monitor: GPU monitoring failed getting GPU reading, ' - 'switching off GPU monitoring') + msg = 'ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring' + if self._debug_mode: + import traceback + msg += "\n" + traceback.format_exc() + self._task.get_logger().report_text(msg) self._gpustat = None return stats @@ -366,7 +375,7 @@ class ResourceMonitor(BackgroundMonitor): # already in MBs stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024 # use previously sampled process gpu memory, or global if it does not exist - stats["gpu_%d_mem_used_gb" % i] = float(gpu_mem[i] if gpu_mem else g["memory.used"]) / 1024 + stats["gpu_%d_mem_used_gb" % i] = float(gpu_mem[i] if gpu_mem and i in gpu_mem else g["memory.used"]) / 1024 return stats