mirror of
https://github.com/clearml/clearml
synced 2025-02-07 13:23:40 +00:00
Fix: GPU resource monitoring fail-safe mechanism
This commit is contained in:
parent
22a65abdc1
commit
7b1c0d8c21
@ -29,7 +29,9 @@ class ResourceMonitor(object):
|
||||
self._previous_readouts_ts = time()
|
||||
self._thread = None
|
||||
self._exit_event = Event()
|
||||
if not gpustat:
|
||||
self._gpustat_fail = 0
|
||||
self._gpustat = gpustat
|
||||
if not self._gpustat:
|
||||
self._task.get_logger().console('TRAINS Monitor: GPU monitoring is not available, '
|
||||
'run \"pip install gpustat\"')
|
||||
|
||||
@ -134,8 +136,7 @@ class ResourceMonitor(object):
|
||||
self._readouts = {}
|
||||
self._num_readouts = 0
|
||||
|
||||
@staticmethod
|
||||
def _machine_stats():
|
||||
def _machine_stats(self):
|
||||
"""
|
||||
:return: machine stats dictionary, all values expressed in megabytes
|
||||
"""
|
||||
@ -169,14 +170,22 @@ class ResourceMonitor(object):
|
||||
stats["io_write_mbs"] = bytes_to_megabytes(io_stats.write_bytes)
|
||||
|
||||
# check if we can access the gpu statistics
|
||||
if gpustat:
|
||||
gpu_stat = gpustat.new_query()
|
||||
for i, g in enumerate(gpu_stat.gpus):
|
||||
stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
|
||||
stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
|
||||
stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
|
||||
# already in MBs
|
||||
stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
|
||||
stats["gpu_%d_mem_used_gb" % i] = float(g["memory.used"]) / 1024
|
||||
if self._gpustat:
|
||||
try:
|
||||
gpu_stat = self._gpustat.new_query()
|
||||
for i, g in enumerate(gpu_stat.gpus):
|
||||
stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
|
||||
stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
|
||||
stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
|
||||
# already in MBs
|
||||
stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
|
||||
stats["gpu_%d_mem_used_gb" % i] = float(g["memory.used"]) / 1024
|
||||
except Exception:
|
||||
# reading the GPU stats failed; count the failure and switch GPU monitoring off once 3 failures were recorded
|
||||
self._gpustat_fail += 1
|
||||
if self._gpustat_fail >= 3:
|
||||
self._task.get_logger().console('TRAINS Monitor: GPU monitoring failed getting GPU reading, '
|
||||
'switching off GPU monitoring')
|
||||
self._gpustat = None
|
||||
|
||||
return stats
|
||||
|
Loading…
Reference in New Issue
Block a user