Fix: add a fail-safe mechanism to GPU resource monitoring

This commit is contained in:
allegroai 2019-07-06 22:58:36 +03:00
parent 22a65abdc1
commit 7b1c0d8c21

View File

@ -29,7 +29,9 @@ class ResourceMonitor(object):
self._previous_readouts_ts = time() self._previous_readouts_ts = time()
self._thread = None self._thread = None
self._exit_event = Event() self._exit_event = Event()
if not gpustat: self._gpustat_fail = 0
self._gpustat = gpustat
if not self._gpustat:
self._task.get_logger().console('TRAINS Monitor: GPU monitoring is not available, ' self._task.get_logger().console('TRAINS Monitor: GPU monitoring is not available, '
'run \"pip install gpustat\"') 'run \"pip install gpustat\"')
@ -134,8 +136,7 @@ class ResourceMonitor(object):
self._readouts = {} self._readouts = {}
self._num_readouts = 0 self._num_readouts = 0
@staticmethod def _machine_stats(self):
def _machine_stats():
""" """
:return: machine stats dictionary, all values expressed in megabytes :return: machine stats dictionary, all values expressed in megabytes
""" """
@ -169,8 +170,9 @@ class ResourceMonitor(object):
stats["io_write_mbs"] = bytes_to_megabytes(io_stats.write_bytes) stats["io_write_mbs"] = bytes_to_megabytes(io_stats.write_bytes)
# check if we can access the gpu statistics # check if we can access the gpu statistics
if gpustat: if self._gpustat:
gpu_stat = gpustat.new_query() try:
gpu_stat = self._gpustat.new_query()
for i, g in enumerate(gpu_stat.gpus): for i, g in enumerate(gpu_stat.gpus):
stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"]) stats["gpu_%d_temperature" % i] = float(g["temperature.gpu"])
stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"]) stats["gpu_%d_utilization" % i] = float(g["utilization.gpu"])
@ -178,5 +180,12 @@ class ResourceMonitor(object):
# already in MBs # already in MBs
stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024 stats["gpu_%d_mem_free_gb" % i] = float(g["memory.total"] - g["memory.used"]) / 1024
stats["gpu_%d_mem_used_gb" % i] = float(g["memory.used"]) / 1024 stats["gpu_%d_mem_used_gb" % i] = float(g["memory.used"]) / 1024
except Exception:
# something happened and we can't read the gpu stats; count the failure and switch off GPU monitoring once 3 failures have accumulated
self._gpustat_fail += 1
if self._gpustat_fail >= 3:
self._task.get_logger().console('TRAINS Monitor: GPU monitoring failed getting GPU reading, '
'switching off GPU monitoring')
self._gpustat = None
return stats return stats