mirror of
https://github.com/clearml/clearml
synced 2025-02-07 13:23:40 +00:00
Fix GPU memory usage being reported as 0 when memory cannot be queried per process (edge case)
This commit is contained in:
parent
e71c257290
commit
96646dc46a
@ -311,10 +311,10 @@ class ResourceMonitor(BackgroundMonitor):
|
||||
# On the rest of the samples we return the previous memory measurement
|
||||
|
||||
# update mem used by our process and sub processes
|
||||
if self._process_info and (not self._last_process_pool.get('gpu') or
|
||||
if self._gpu_memory_per_process and self._process_info and \
|
||||
(not self._last_process_pool.get('gpu') or
|
||||
(time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
|
||||
gpu_mem = {}
|
||||
if self._gpu_memory_per_process:
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
gpu_stat = self._gpustat.new_query(per_process_stats=True)
|
||||
@ -339,11 +339,12 @@ class ResourceMonitor(BackgroundMonitor):
|
||||
else:
|
||||
# if we do not need to update the memory usage, run global query
|
||||
# if we have no parent process (backward compatibility), return global stats
|
||||
gpu_stat = self._gpustat.new_query()
|
||||
gpu_stat = self._gpustat.new_query(per_process_stats=False)
|
||||
gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None
|
||||
|
||||
# generate the statistics dict for actual report
|
||||
stats = {}
|
||||
|
||||
for i, g in enumerate(gpu_stat.gpus):
|
||||
# only monitor the active GPUs; if none were selected, monitor everything
|
||||
if self._active_gpus and i not in self._active_gpus:
|
||||
@ -381,7 +382,7 @@ class ResourceMonitor(BackgroundMonitor):
|
||||
specs.update(
|
||||
gpu_count=int(len(gpus)),
|
||||
gpu_type=', '.join(g.name for g in gpus),
|
||||
gpu_memory=', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus),
|
||||
gpu_memory=', '.join('{}GB'.format(round(g.memory_total / 1024.0)) for g in gpus),
|
||||
gpu_driver_version=gpu_stat.driver_version or '',
|
||||
gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
|
||||
)
|
||||
|
Loading…
Reference in New Issue
Block a user