mirror of
https://github.com/clearml/clearml
synced 2025-02-12 07:35:08 +00:00
Fix GPU memory usage being reported as 0 when memory cannot be queried per process (edge case)
This commit is contained in:
parent
e71c257290
commit
96646dc46a
@ -311,39 +311,40 @@ class ResourceMonitor(BackgroundMonitor):
|
|||||||
# On the rest of the samples we return the previous memory measurement
|
# On the rest of the samples we return the previous memory measurement
|
||||||
|
|
||||||
# update mem used by our process and sub processes
|
# update mem used by our process and sub processes
|
||||||
if self._process_info and (not self._last_process_pool.get('gpu') or
|
if self._gpu_memory_per_process and self._process_info and \
|
||||||
(time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
|
(not self._last_process_pool.get('gpu') or
|
||||||
|
(time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
|
||||||
gpu_mem = {}
|
gpu_mem = {}
|
||||||
if self._gpu_memory_per_process:
|
# noinspection PyBroadException
|
||||||
# noinspection PyBroadException
|
try:
|
||||||
try:
|
gpu_stat = self._gpustat.new_query(per_process_stats=True)
|
||||||
gpu_stat = self._gpustat.new_query(per_process_stats=True)
|
except Exception:
|
||||||
except Exception:
|
gpu_stat = self._gpustat.new_query(per_process_stats=False)
|
||||||
gpu_stat = self._gpustat.new_query(per_process_stats=False)
|
|
||||||
|
|
||||||
for i, g in enumerate(gpu_stat.gpus):
|
for i, g in enumerate(gpu_stat.gpus):
|
||||||
# if processes is None, that means we can't query GPU memory usage per process, so we can stop
|
# if processes is None, that means we can't query GPU memory usage per process, so we can stop
|
||||||
if g.processes is None:
|
if g.processes is None:
|
||||||
self._gpu_memory_per_process = False
|
self._gpu_memory_per_process = False
|
||||||
break
|
break
|
||||||
# only monitor the active gpu's, if none were selected, monitor everything
|
# only monitor the active gpu's, if none were selected, monitor everything
|
||||||
if self._active_gpus and i not in self._active_gpus:
|
if self._active_gpus and i not in self._active_gpus:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
gpu_mem[i] = 0
|
gpu_mem[i] = 0
|
||||||
for p in g.processes:
|
for p in g.processes:
|
||||||
if p is not None and p['pid'] in self._last_process_id_list:
|
if p is not None and p['pid'] in self._last_process_id_list:
|
||||||
gpu_mem[i] += p.get('gpu_memory_usage', 0)
|
gpu_mem[i] += p.get('gpu_memory_usage', 0)
|
||||||
|
|
||||||
self._last_process_pool['gpu'] = time(), gpu_mem
|
self._last_process_pool['gpu'] = time(), gpu_mem
|
||||||
else:
|
else:
|
||||||
# if we do not need to update the memory usage, run global query
|
# if we do not need to update the memory usage, run global query
|
||||||
# if we have no parent process (backward compatibility), return global stats
|
# if we have no parent process (backward compatibility), return global stats
|
||||||
gpu_stat = self._gpustat.new_query()
|
gpu_stat = self._gpustat.new_query(per_process_stats=False)
|
||||||
gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None
|
gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None
|
||||||
|
|
||||||
# generate the statistics dict for actual report
|
# generate the statistics dict for actual report
|
||||||
stats = {}
|
stats = {}
|
||||||
|
|
||||||
for i, g in enumerate(gpu_stat.gpus):
|
for i, g in enumerate(gpu_stat.gpus):
|
||||||
# only monitor the active gpu's, if none were selected, monitor everything
|
# only monitor the active gpu's, if none were selected, monitor everything
|
||||||
if self._active_gpus and i not in self._active_gpus:
|
if self._active_gpus and i not in self._active_gpus:
|
||||||
@ -381,7 +382,7 @@ class ResourceMonitor(BackgroundMonitor):
|
|||||||
specs.update(
|
specs.update(
|
||||||
gpu_count=int(len(gpus)),
|
gpu_count=int(len(gpus)),
|
||||||
gpu_type=', '.join(g.name for g in gpus),
|
gpu_type=', '.join(g.name for g in gpus),
|
||||||
gpu_memory=', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus),
|
gpu_memory=', '.join('{}GB'.format(round(g.memory_total / 1024.0)) for g in gpus),
|
||||||
gpu_driver_version=gpu_stat.driver_version or '',
|
gpu_driver_version=gpu_stat.driver_version or '',
|
||||||
gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
|
gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user