Fix GPU memory used reports 0 when memory can not be queried per process (edge case)

2025-06-26 18:16:07 +00:00 · 2023-09-25 22:16:02 +03:00 · 2023-09-25 22:16:02 +03:00 · 96646dc46a
commit 96646dc46a
parent e71c257290
1 changed files with 23 additions and 22 deletions
--- a/clearml/utilities/resource_monitor.py
+++ b/clearml/utilities/resource_monitor.py
@@ -311,39 +311,40 @@ class ResourceMonitor(BackgroundMonitor):
        # On the rest of the samples we return the previous memory measurement
        # update mem used by our process and sub processes
-        if self._process_info and (not self._last_process_pool.get('gpu') or
+        if self._gpu_memory_per_process and self._process_info and \
-                                   (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
+                (not self._last_process_pool.get('gpu') or
+                 (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
            gpu_mem = {}
-            if self._gpu_memory_per_process:
+            # noinspection PyBroadException
-                # noinspection PyBroadException
+            try:
-                try:
+                gpu_stat = self._gpustat.new_query(per_process_stats=True)
-                    gpu_stat = self._gpustat.new_query(per_process_stats=True)
+            except Exception:
-                except Exception:
+                gpu_stat = self._gpustat.new_query(per_process_stats=False)
-                    gpu_stat = self._gpustat.new_query(per_process_stats=False)
-                for i, g in enumerate(gpu_stat.gpus):
+            for i, g in enumerate(gpu_stat.gpus):
-                    # if processes is None, that means we can't query GPU memory usage per proces, so we can stop
+                # if processes is None, that means we can't query GPU memory usage per proces, so we can stop
-                    if g.processes is None:
+                if g.processes is None:
-                        self._gpu_memory_per_process = False
+                    self._gpu_memory_per_process = False
-                        break
+                    break
-                    # only monitor the active gpu's, if none were selected, monitor everything
+                # only monitor the active gpu's, if none were selected, monitor everything
-                    if self._active_gpus and i not in self._active_gpus:
+                if self._active_gpus and i not in self._active_gpus:
-                        continue
+                    continue
-                    gpu_mem[i] = 0
+                gpu_mem[i] = 0
-                    for p in g.processes:
+                for p in g.processes:
-                        if p is not None and p['pid'] in self._last_process_id_list:
+                    if p is not None and p['pid'] in self._last_process_id_list:
-                            gpu_mem[i] += p.get('gpu_memory_usage', 0)
+                        gpu_mem[i] += p.get('gpu_memory_usage', 0)
            self._last_process_pool['gpu'] = time(), gpu_mem
        else:
            # if we do no need to update the memory usage, run global query
            # if we have no parent process (backward compatibility), return global stats
-            gpu_stat = self._gpustat.new_query()
+            gpu_stat = self._gpustat.new_query(per_process_stats=False)
            gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None
        # generate the statistics dict for actual report
        stats = {}
        for i, g in enumerate(gpu_stat.gpus):
            # only monitor the active gpu's, if none were selected, monitor everything
            if self._active_gpus and i not in self._active_gpus:
@@ -381,7 +382,7 @@ class ResourceMonitor(BackgroundMonitor):
                    specs.update(
                        gpu_count=int(len(gpus)),
                        gpu_type=', '.join(g.name for g in gpus),
-                        gpu_memory=', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus),
+                        gpu_memory=', '.join('{}GB'.format(round(g.memory_total / 1024.0)) for g in gpus),
                        gpu_driver_version=gpu_stat.driver_version or '',
                        gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
                    )