Print error on resource monitor failure

This commit is contained in:
allegroai 2023-05-11 16:18:11 +03:00
parent 307ec9213e
commit b6ca0fa6a5

View File

@@ -139,42 +139,45 @@ class ResourceMonitor(object):
def _daemon(self): def _daemon(self):
seconds_since_started = 0 seconds_since_started = 0
reported = 0 reported = 0
while True: try:
last_report = time() while True:
current_report_frequency = ( last_report = time()
self._report_frequency if reported != 0 else self._first_report_sec current_report_frequency = (
) self._report_frequency if reported != 0 else self._first_report_sec
while (time() - last_report) < current_report_frequency: )
# wait for self._sample_frequency seconds, if event set quit while (time() - last_report) < current_report_frequency:
if self._exit_event.wait(1 / self._sample_frequency): # wait for self._sample_frequency seconds, if event set quit
return if self._exit_event.wait(1 / self._sample_frequency):
# noinspection PyBroadException return
try: # noinspection PyBroadException
self._update_readouts() try:
except Exception as ex: self._update_readouts()
log.warning("failed getting machine stats: %s", report_error(ex)) except Exception as ex:
self._failure() log.warning("failed getting machine stats: %s", report_error(ex))
self._failure()
seconds_since_started += int(round(time() - last_report)) seconds_since_started += int(round(time() - last_report))
# check if we do not report any metric (so it means the last iteration will not be changed) # check if we do not report any metric (so it means the last iteration will not be changed)
# if we do not have last_iteration, we just use seconds as iteration # if we do not have last_iteration, we just use seconds as iteration
# start reporting only when we figured out, if this is seconds based, or iterations based # start reporting only when we figured out, if this is seconds based, or iterations based
average_readouts = self._get_average_readouts() average_readouts = self._get_average_readouts()
stats = { stats = {
# 3 points after the dot # 3 points after the dot
key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value] key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
for key, value in average_readouts.items() for key, value in average_readouts.items()
} }
# send actual report # send actual report
if self.send_report(stats): if self.send_report(stats):
# clear readouts if this is update was sent # clear readouts if this is update was sent
self._clear_readouts() self._clear_readouts()
# count reported iterations # count reported iterations
reported += 1 reported += 1
except Exception as ex:
log.exception("Error reporting monitoring info: %s", str(ex))
def _update_readouts(self): def _update_readouts(self):
readouts = self._machine_stats() readouts = self._machine_stats()