Fix GPU memory usage being reported as 0 when memory cannot be queried per process (edge case)

This commit is contained in:
allegroai 2023-09-25 22:16:02 +03:00
parent e71c257290
commit 96646dc46a

View File

@ -311,10 +311,10 @@ class ResourceMonitor(BackgroundMonitor):
# On the rest of the samples we return the previous memory measurement
# update mem used by our process and sub processes
if self._process_info and (not self._last_process_pool.get('gpu') or
if self._gpu_memory_per_process and self._process_info and \
(not self._last_process_pool.get('gpu') or
(time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
gpu_mem = {}
if self._gpu_memory_per_process:
# noinspection PyBroadException
try:
gpu_stat = self._gpustat.new_query(per_process_stats=True)
@ -339,11 +339,12 @@ class ResourceMonitor(BackgroundMonitor):
else:
# if we do not need to update the memory usage, run global query
# if we have no parent process (backward compatibility), return global stats
gpu_stat = self._gpustat.new_query()
gpu_stat = self._gpustat.new_query(per_process_stats=False)
gpu_mem = self._last_process_pool['gpu'][1] if self._last_process_pool.get('gpu') else None
# generate the statistics dict for actual report
stats = {}
for i, g in enumerate(gpu_stat.gpus):
# only monitor the active GPUs; if none were selected, monitor everything
if self._active_gpus and i not in self._active_gpus:
@ -381,7 +382,7 @@ class ResourceMonitor(BackgroundMonitor):
specs.update(
gpu_count=int(len(gpus)),
gpu_type=', '.join(g.name for g in gpus),
gpu_memory=', '.join('{}GB'.format(round(g.memory_total/1024.0)) for g in gpus),
gpu_memory=', '.join('{}GB'.format(round(g.memory_total / 1024.0)) for g in gpus),
gpu_driver_version=gpu_stat.driver_version or '',
gpu_driver_cuda_version=gpu_stat.driver_cuda_version or '',
)