From 96646dc46a7c5437d30c6029c4c493db63de2dd2 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Mon, 25 Sep 2023 22:16:02 +0300
Subject: [PATCH] Fix GPU memory used reports 0 when memory can not be queried per process (edge case)

---
 clearml/utilities/resource_monitor.py | 45 ++++++++++++++-------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/clearml/utilities/resource_monitor.py b/clearml/utilities/resource_monitor.py
index c0cf6cb5..ac3165c3 100644
--- a/clearml/utilities/resource_monitor.py
+++ b/clearml/utilities/resource_monitor.py
@@ -311,39 +311,40 @@ class ResourceMonitor(BackgroundMonitor):
         # On the rest of the samples we return the previous memory measurement
 
         # update mem used by our process and sub processes
-        if self._process_info and (not self._last_process_pool.get('gpu') or
-                                   (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
+        if self._gpu_memory_per_process and self._process_info and \
+                (not self._last_process_pool.get('gpu') or
+                 (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
             gpu_mem = {}
-            if self._gpu_memory_per_process:
-                # noinspection PyBroadException
-                try:
-                    gpu_stat = self._gpustat.new_query(per_process_stats=True)
-                except Exception:
-                    gpu_stat = self._gpustat.new_query(per_process_stats=False)
+            # noinspection PyBroadException
+            try:
+                gpu_stat = self._gpustat.new_query(per_process_stats=True)
+            except Exception:
+                gpu_stat = self._gpustat.new_query(per_process_stats=False)
 
-                for i, g in enumerate(gpu_stat.gpus):
-                    # if processes is None, that means we can't query GPU memory usage per proces, so we can stop
-                    if g.processes is None:
-                        self._gpu_memory_per_process = False
-                        break
-                    # only monitor the active gpu's, if none were selected, monitor everything
-                    if self._active_gpus and i not in self._active_gpus:
-                        continue
+            for i, g in enumerate(gpu_stat.gpus):
+                # if processes is None, that means we can't query GPU memory usage per proces, so we can stop
+                if g.processes is None:
+                    self._gpu_memory_per_process = False
+                    break
+                # only monitor the active gpu's, if none were selected, monitor everything
+                if self._active_gpus and i not in self._active_gpus:
+                    continue
 
-                    gpu_mem[i] = 0
-                    for p in g.processes:
-                        if p is not None and p['pid'] in self._last_process_id_list:
-                            gpu_mem[i] += p.get('gpu_memory_usage', 0)
+                gpu_mem[i] = 0
+                for p in g.processes:
+                    if p is not None and p['pid'] in self._last_process_id_list:
+                        gpu_mem[i] += p.get('gpu_memory_usage', 0)
             self._last_process_pool['gpu'] = time(), gpu_mem
         else:
             # if we do no need to update the memory usage, run global query
             # if we have no parent process (backward compatibility), return global stats
-            gpu_stat = self._gpustat.new_query()
+            gpu_stat = self._gpustat.new_query(per_process_stats=False)
             gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None
 
         # generate the statistics dict for actual report
         stats = {}
+
         for i, g in enumerate(gpu_stat.gpus):
             # only monitor the active gpu's, if none were selected, monitor everything
             if self._active_gpus and i not in self._active_gpus:
                 continue
@@ -381,7 +382,7 @@ class ResourceMonitor(BackgroundMonitor):
             specs.update(
                 gpu_count=int(len(gpus)),
                 gpu_type=', '.join(g.name for g in gpus),
-                gpu_memory=', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus),
+                gpu_memory=', '.join('{}GB'.format(round(g.memory_total / 1024.0)) for g in gpus),
                 gpu_driver_version=gpu_stat.driver_version or '',
                 gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
             )
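
Note (illustration, not part of the patch): the fix only takes the per-process branch while self._gpu_memory_per_process is still enabled, and clears that flag as soon as a GPU reports processes=None, so later samples fall back to the cached global measurement instead of reporting 0. Below is a minimal sketch of that fallback logic, written against the gpustat-style interface visible in the diff (new_query(per_process_stats=...), gpu.processes, process dicts with 'pid' and 'gpu_memory_usage'); the helper name, its arguments, and the return-None convention are illustrative assumptions, not clearml API.

    def query_gpu_memory_per_process(gpustat_module, tracked_pids, active_gpus=None):
        """Return {gpu_index: memory used by tracked_pids}, or None if per-process data is unavailable."""
        # hypothetical helper mirroring the patched logic, not an actual clearml function
        # noinspection PyBroadException
        try:
            gpu_stat = gpustat_module.new_query(per_process_stats=True)
        except Exception:
            # the per-process query itself failed, retry with a global query
            gpu_stat = gpustat_module.new_query(per_process_stats=False)

        gpu_mem = {}
        for i, g in enumerate(gpu_stat.gpus):
            if g.processes is None:
                # per-process accounting is not supported on this setup -> signal the caller
                return None
            # only monitor the selected GPUs; if none were selected, monitor everything
            if active_gpus and i not in active_gpus:
                continue
            gpu_mem[i] = sum(
                p.get('gpu_memory_usage', 0)
                for p in g.processes
                if p is not None and p['pid'] in tracked_pids
            )
        return gpu_mem

A caller that gets None would disable per-process reporting and reuse the last cached global reading, which is what the reworked condition and the explicit new_query(per_process_stats=False) in the else branch accomplish in the patch.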