Mirror of https://github.com/clearml/clearml (synced 2025-02-07 21:33:25 +00:00)
Fix GPU memory used reports 0 when memory can not be queried per process (edge case)
commit 96646dc46a (parent e71c257290)
@@ -311,10 +311,10 @@ class ResourceMonitor(BackgroundMonitor):
         # On the rest of the samples we return the previous memory measurement

         # update mem used by our process and sub processes
-        if self._process_info and (not self._last_process_pool.get('gpu') or
+        if self._gpu_memory_per_process and self._process_info and \
+                (not self._last_process_pool.get('gpu') or
                 (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
             gpu_mem = {}
-            if self._gpu_memory_per_process:
             # noinspection PyBroadException
             try:
                 gpu_stat = self._gpustat.new_query(per_process_stats=True)
@@ -339,11 +339,12 @@ class ResourceMonitor(BackgroundMonitor):
         else:
             # if we do no need to update the memory usage, run global query
             # if we have no parent process (backward compatibility), return global stats
-            gpu_stat = self._gpustat.new_query()
+            gpu_stat = self._gpustat.new_query(per_process_stats=False)
             gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None

         # generate the statistics dict for actual report
         stats = {}

         for i, g in enumerate(gpu_stat.gpus):
             # only monitor the active gpu's, if none were selected, monitor everything
             if self._active_gpus and i not in self._active_gpus:
@@ -381,7 +382,7 @@ class ResourceMonitor(BackgroundMonitor):
         specs.update(
             gpu_count=int(len(gpus)),
             gpu_type=', '.join(g.name for g in gpus),
-            gpu_memory=', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus),
+            gpu_memory=', '.join('{}GB'.format(round(g.memory_total / 1024.0)) for g in gpus),
             gpu_driver_version=gpu_stat.driver_version or '',
             gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
         )
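A hedged reading of the change: the `self._gpu_memory_per_process` check moves from an inner `if` into the outer condition, so when per-process GPU memory cannot be queried the monitor takes the global-query path (`new_query(per_process_stats=False)`) instead of the per-process path that, per the commit title, previously surfaced as a memory reading of 0. The sketch below is a minimal, self-contained illustration of that fallback, not the repository's code: `query_gpus()`, `gpu_memory_used_mb()` and the hard-coded sample values are hypothetical stand-ins for ClearML's bundled gpustat wrapper (`self._gpustat.new_query(...)`), and the real ResourceMonitor also caches the last sample, respects `self._report_frequency`, tracks child processes and filters by `self._active_gpus`, all omitted here.

import os


def query_gpus(per_process_stats=False):
    # Hypothetical stand-in for self._gpustat.new_query(per_process_stats=...).
    # Returns a tiny hard-coded sample; real values would come from NVML.
    if per_process_stats:
        return [{"memory.used": 4096,
                 "processes": [{"pid": os.getpid(), "gpu_memory_usage": 1024}]}]
    # Edge case this commit handles: per-process stats are unavailable,
    # only the device-level counter is known.
    return [{"memory.used": 4096, "processes": None}]


def gpu_memory_used_mb(gpu_memory_per_process):
    # Mirrors the decision the patched ResourceMonitor makes: only take the
    # per-process path when per-process memory can actually be queried,
    # otherwise fall back to the global per-device figure instead of 0.
    our_pids = {os.getpid()}  # the real monitor also tracks child processes
    if gpu_memory_per_process:
        gpus = query_gpus(per_process_stats=True)
        return {
            i: sum(p.get("gpu_memory_usage", 0)
                   for p in (g.get("processes") or [])
                   if p.get("pid") in our_pids)
            for i, g in enumerate(gpus)
        }
    gpus = query_gpus(per_process_stats=False)
    return {i: g.get("memory.used", 0) for i, g in enumerate(gpus)}


if __name__ == "__main__":
    print(gpu_memory_used_mb(gpu_memory_per_process=True))   # {0: 1024}
    print(gpu_memory_used_mb(gpu_memory_per_process=False))  # {0: 4096} rather than {0: 0}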