mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Print error on resource monitor failure
This commit is contained in:
parent
307ec9213e
commit
b6ca0fa6a5
@ -139,42 +139,45 @@ class ResourceMonitor(object):
|
|||||||
def _daemon(self):
|
def _daemon(self):
|
||||||
seconds_since_started = 0
|
seconds_since_started = 0
|
||||||
reported = 0
|
reported = 0
|
||||||
while True:
|
try:
|
||||||
last_report = time()
|
while True:
|
||||||
current_report_frequency = (
|
last_report = time()
|
||||||
self._report_frequency if reported != 0 else self._first_report_sec
|
current_report_frequency = (
|
||||||
)
|
self._report_frequency if reported != 0 else self._first_report_sec
|
||||||
while (time() - last_report) < current_report_frequency:
|
)
|
||||||
# wait for 1 / self._sample_frequency seconds (one sampling interval); quit if the exit event is set
|
while (time() - last_report) < current_report_frequency:
|
||||||
if self._exit_event.wait(1 / self._sample_frequency):
|
# wait for 1 / self._sample_frequency seconds (one sampling interval); quit if the exit event is set
|
||||||
return
|
if self._exit_event.wait(1 / self._sample_frequency):
|
||||||
# noinspection PyBroadException
|
return
|
||||||
try:
|
# noinspection PyBroadException
|
||||||
self._update_readouts()
|
try:
|
||||||
except Exception as ex:
|
self._update_readouts()
|
||||||
log.warning("failed getting machine stats: %s", report_error(ex))
|
except Exception as ex:
|
||||||
self._failure()
|
log.warning("failed getting machine stats: %s", report_error(ex))
|
||||||
|
self._failure()
|
||||||
|
|
||||||
seconds_since_started += int(round(time() - last_report))
|
seconds_since_started += int(round(time() - last_report))
|
||||||
# check if we do not report any metric (so it means the last iteration will not be changed)
|
# check if we do not report any metric (so it means the last iteration will not be changed)
|
||||||
|
|
||||||
# if we do not have last_iteration, we just use seconds as iteration
|
# if we do not have last_iteration, we just use seconds as iteration
|
||||||
|
|
||||||
# start reporting only when we figured out, if this is seconds based, or iterations based
|
# start reporting only when we figured out, if this is seconds based, or iterations based
|
||||||
average_readouts = self._get_average_readouts()
|
average_readouts = self._get_average_readouts()
|
||||||
stats = {
|
stats = {
|
||||||
# 3 points after the dot
|
# 3 points after the dot
|
||||||
key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
|
key: round(value, 3) if isinstance(value, float) else [round(v, 3) for v in value]
|
||||||
for key, value in average_readouts.items()
|
for key, value in average_readouts.items()
|
||||||
}
|
}
|
||||||
|
|
||||||
# send actual report
|
# send actual report
|
||||||
if self.send_report(stats):
|
if self.send_report(stats):
|
||||||
# clear readouts if this update was sent
|
# clear readouts if this update was sent
|
||||||
self._clear_readouts()
|
self._clear_readouts()
|
||||||
|
|
||||||
# count reported iterations
|
# count reported iterations
|
||||||
reported += 1
|
reported += 1
|
||||||
|
except Exception as ex:
|
||||||
|
log.exception("Error reporting monitoring info: %s", str(ex))
|
||||||
|
|
||||||
def _update_readouts(self):
|
def _update_readouts(self):
|
||||||
readouts = self._machine_stats()
|
readouts = self._machine_stats()
|
||||||
|
Loading…
Reference in New Issue
Block a user