mirror of https://github.com/clearml/clearml
synced 2025-02-12 07:35:08 +00:00

Fix GPU memory used reports 0 when memory can not be queried per process

This commit is contained in:
parent 0442579e23
commit c5675733a0
@@ -285,11 +285,11 @@ class GPUStatCollection(object):
             for nv_process in nv_comp_processes + nv_graphics_processes:
                 try:
                     process = get_process_info(nv_process)
-                    processes.append(process)
                 except psutil.NoSuchProcess:
                     # TODO: add some reminder for NVML broken context
                     # e.g. nvidia-smi reset or reboot the system
-                    pass
+                    process = None
+                processes.append(process)

             # we do not actually use these, so no point in collecting them
             # # TODO: Do not block if full process info is not requested
@@ -313,7 +313,7 @@ class GPUStatCollection(object):
             # Convert bytes into MBytes
             'memory.used': memory.used // MB if memory else None,
             'memory.total': memory.total // MB if memory else None,
-            'processes': processes,
+            'processes': None if (processes and all(p is None for p in processes)) else processes
         }
         if per_process_stats:
             GPUStatCollection.clean_processes()
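Taken together, the two GPUStatCollection hunks keep a None placeholder for any process whose info could not be read, and collapse a list made up entirely of placeholders into None, so callers can tell "per-process data unavailable" apart from "no processes". A minimal standalone sketch of that pattern; the collect_process_stats() helper and its direct use of psutil are illustrative stand-ins, not the library's NVML-backed get_process_info():

import psutil

def collect_process_stats(pids):
    """Sketch: gather per-process info, keeping a None placeholder on failure."""
    processes = []
    for pid in pids:
        try:
            proc = psutil.Process(pid)
            info = {'pid': pid, 'name': proc.name()}
        except psutil.NoSuchProcess:
            # this process could not be queried; keep a placeholder instead of dropping it
            info = None
        processes.append(info)
    # if every entry failed, report None so callers know per-process data is unavailable
    return None if (processes and all(p is None for p in processes)) else processes

# usage: a live PID yields a dict entry; a PID that does not exist yields a None entry,
# and a list made up only of such failures collapses to None
print(collect_process_stats([psutil.Process().pid]))
print(collect_process_stats([2 ** 22 + 12345]))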
@@ -43,6 +43,8 @@ class ResourceMonitor(BackgroundMonitor):
         self._process_info = psutil.Process() if report_mem_used_per_process else None
         self._last_process_pool = {}
         self._last_process_id_list = []
+        self._gpu_memory_per_process = True
+
         if not self._gpustat:
             self._task.get_logger().report_text('ClearML Monitor: GPU monitoring is not available')
         else:  # if running_remotely():
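The new _gpu_memory_per_process attribute is effectively a one-way switch: it starts out assuming per-process GPU memory can be read and is cleared by the polling code in the next hunk the first time that assumption fails, so later polls skip the broken path. A rough sketch of that idea in isolation; the class and callable names are made up for illustration:

class PerProcessLatch:
    """Sketch: remember that an expensive per-process query turned out to be unavailable."""

    def __init__(self):
        # assume per-process stats work until proven otherwise
        self.enabled = True

    def poll(self, query_per_process, query_global):
        stats = query_per_process() if self.enabled else query_global()
        if self.enabled and stats is None:
            # per-process data is not supported here; stop trying on future polls
            self.enabled = False
            stats = query_global()
        return stats

# usage with stand-in callables: the first poll falls back and flips the switch,
# the second goes straight to the global query
latch = PerProcessLatch()
print(latch.poll(lambda: None, lambda: {'memory.used': 1024}))
print(latch.poll(lambda: None, lambda: {'memory.used': 2048}))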
@@ -311,16 +313,28 @@ class ResourceMonitor(BackgroundMonitor):
         # update mem used by our process and sub processes
         if self._process_info and (not self._last_process_pool.get('gpu') or
                 (time() - self._last_process_pool['gpu'][0]) >= self._report_frequency):
-            gpu_stat = self._gpustat.new_query(per_process_stats=True)
             gpu_mem = {}
+            if self._gpu_memory_per_process:
+                # noinspection PyBroadException
+                try:
+                    gpu_stat = self._gpustat.new_query(per_process_stats=True)
+                except Exception:
+                    gpu_stat = self._gpustat.new_query(per_process_stats=False)
+
             for i, g in enumerate(gpu_stat.gpus):
+                # if processes is None, that means we can't query GPU memory usage per proces, so we can stop
+                if g.processes is None:
+                    self._gpu_memory_per_process = False
+                    break
                 # only monitor the active gpu's, if none were selected, monitor everything
                 if self._active_gpus and i not in self._active_gpus:
                     continue
+
                 gpu_mem[i] = 0
                 for p in g.processes:
-                    if p['pid'] in self._last_process_id_list:
+                    if p is not None and p['pid'] in self._last_process_id_list:
                         gpu_mem[i] += p.get('gpu_memory_usage', 0)
+
             self._last_process_pool['gpu'] = time(), gpu_mem
         else:
             # if we do no need to update the memory usage, run global query
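The reworked reporting loop now tolerates both failure modes: the per-process query itself may raise (handled by falling back to a plain query), and individual entries inside g.processes may be None (skipped when summing memory for this task's processes). A condensed, self-contained sketch of that accounting step, using plain dicts in place of gpustat's GPU objects and a made-up tracked_pids list standing in for _last_process_id_list:

# Sketch of the per-GPU memory accounting with None-tolerant entries.
# `gpus` mimics gpustat's per-GPU data; all names here are illustrative.
tracked_pids = [1234, 5678]
gpus = [
    {'index': 0, 'processes': [
        {'pid': 1234, 'gpu_memory_usage': 512},
        None,                                     # process info could not be read
        {'pid': 9999, 'gpu_memory_usage': 256},   # not one of our processes
    ]},
    {'index': 1, 'processes': None},              # per-process query unsupported
]

gpu_mem = {}
per_process_supported = True
for i, g in enumerate(gpus):
    if g['processes'] is None:
        # cannot attribute memory to processes on this driver/setup; stop trying
        per_process_supported = False
        break
    gpu_mem[i] = 0
    for p in g['processes']:
        if p is not None and p['pid'] in tracked_pids:
            gpu_mem[i] += p.get('gpu_memory_usage', 0)

print(gpu_mem, per_process_supported)   # {0: 512} False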