mirror of
https://github.com/clearml/clearml
synced 2025-03-13 07:08:24 +00:00
Fix resource monitor failing to get GPU stats in some edge cases
This commit is contained in:
parent
d4e136307c
commit
762240d14d
@ -45,6 +45,12 @@ class ResourceMonitor(BackgroundMonitor):
|
|||||||
self._last_process_id_list = []
|
self._last_process_id_list = []
|
||||||
self._gpu_memory_per_process = True
|
self._gpu_memory_per_process = True
|
||||||
|
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
self._debug_mode = bool(os.getenv("CLEARML_RESMON_DEBUG", ""))
|
||||||
|
except Exception:
|
||||||
|
self._debug_mode = False
|
||||||
|
|
||||||
if not self._gpustat:
|
if not self._gpustat:
|
||||||
self._task.get_logger().report_text('ClearML Monitor: GPU monitoring is not available')
|
self._task.get_logger().report_text('ClearML Monitor: GPU monitoring is not available')
|
||||||
else: # if running_remotely():
|
else: # if running_remotely():
|
||||||
@ -247,8 +253,11 @@ class ResourceMonitor(BackgroundMonitor):
|
|||||||
# something happened and we can't use gpu stats,
|
# something happened and we can't use gpu stats,
|
||||||
self._gpustat_fail += 1
|
self._gpustat_fail += 1
|
||||||
if self._gpustat_fail >= 3:
|
if self._gpustat_fail >= 3:
|
||||||
self._task.get_logger().report_text('ClearML Monitor: GPU monitoring failed getting GPU reading, '
|
msg = 'ClearML Monitor: GPU monitoring failed getting GPU reading, switching off GPU monitoring'
|
||||||
'switching off GPU monitoring')
|
if self._debug_mode:
|
||||||
|
import traceback
|
||||||
|
msg += "\n" + traceback.format_exc()
|
||||||
|
self._task.get_logger().report_text(msg)
|
||||||
self._gpustat = None
|
self._gpustat = None
|
||||||
|
|
||||||
return stats
|
return stats
|
||||||
@ -366,7 +375,7 @@ class ResourceMonitor(BackgroundMonitor):
|
|||||||
# already in MBs
|
# already in MBs
|
||||||
stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
|
stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
|
||||||
# use previously sampled process gpu memory, or global if it does not exist
|
# use previously sampled process gpu memory, or global if it does not exist
|
||||||
stats["gpu_%d_mem_used_gb" % i] = float(gpu_mem[i] if gpu_mem else g["memory.used"]) / 1024
|
stats["gpu_%d_mem_used_gb" % i] = float(gpu_mem[i] if gpu_mem and i in gpu_mem else g["memory.used"]) / 1024
|
||||||
|
|
||||||
return stats
|
return stats
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user