Fix: GPU resource monitoring fail-safe mechanism

This commit is contained in:
allegroai 2019-07-06 22:58:36 +03:00
parent 22a65abdc1
commit 7b1c0d8c21

View File

@ -29,7 +29,9 @@ class ResourceMonitor(object):
self._previous_readouts_ts = time()
self._thread = None
self._exit_event = Event()
if not gpustat:
self._gpustat_fail = 0
self._gpustat = gpustat
if not self._gpustat:
self._task.get_logger().console('TRAINS Monitor: GPU monitoring is not available, '
'run \"pip install gpustat\"')
@ -134,8 +136,7 @@ class ResourceMonitor(object):
self._readouts = {}
self._num_readouts = 0
@staticmethod
def _machine_stats():
def _machine_stats(self):
"""
:return: machine stats dictionary, all values expressed in megabytes
"""
@ -169,14 +170,22 @@ class ResourceMonitor(object):
stats["io_write_mbs"] = bytes_to_megabytes(io_stats.write_bytes)
# check if we can access the gpu statistics
if gpustat:
gpu_stat = gpustat.new_query()
for i, g in enumerate(gpu_stat.gpus):
stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
# already in MBs
stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
stats["gpu_%d_mem_used_gb" % i] = float(g["memory.used"]) / 1024
if self._gpustat:
try:
gpu_stat = self._gpustat.new_query()
for i, g in enumerate(gpu_stat.gpus):
stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
stats["gpu_%d_mem_usage" % i] = 100. * float(g["memory.used"]) / float(g["memory.total"])
# already in MBs
stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
stats["gpu_%d_mem_used_gb" % i] = float(g["memory.used"]) / 1024
except Exception:
# something happened and we can't use gpu stats,
self._gpustat_fail += 1
if self._gpustat_fail >= 3:
self._task.get_logger().console('TRAINS Monitor: GPU monitoring failed getting GPU reading, '
'switching off GPU monitoring')
self._gpustat = None
return stats