Fix resource monitor fallback to seconds-from-start

This commit is contained in:
allegroai 2019-07-02 00:00:05 +03:00
parent 4099efc26b
commit 738f00340a

View File

@ -51,7 +51,7 @@ class ResourceMonitor(object):
last_iteration_ts = 0 last_iteration_ts = 0
last_iteration_interval = None last_iteration_interval = None
repeated_iterations = 0 repeated_iterations = 0
fallback_to_sec_as_iterations = 0 fallback_to_sec_as_iterations = None
while True: while True:
last_report = time() last_report = time()
current_report_frequency = self._report_frequency if reported != 0 else self._first_report_sec current_report_frequency = self._report_frequency if reported != 0 else self._first_report_sec
@ -73,6 +73,8 @@ class ResourceMonitor(object):
if IsTensorboardInit.tensorboard_used(): if IsTensorboardInit.tensorboard_used():
fallback_to_sec_as_iterations = False fallback_to_sec_as_iterations = False
elif seconds_since_started >= self._wait_for_first_iteration: elif seconds_since_started >= self._wait_for_first_iteration:
self._task.get_logger().console('TRAINS Monitor: Could not detect iteration reporting, '
'falling back to iterations as seconds-from-start')
fallback_to_sec_as_iterations = True fallback_to_sec_as_iterations = True
# if we do not have last_iteration, we just use seconds as iteration # if we do not have last_iteration, we just use seconds as iteration
@ -95,16 +97,18 @@ class ResourceMonitor(object):
repeated_iterations = 0 repeated_iterations = 0
fallback_to_sec_as_iterations = False fallback_to_sec_as_iterations = False
for k, v in average_readouts.items(): # start reporting only when we figured out, if this is seconds based, or iterations based
# noinspection PyBroadException if fallback_to_sec_as_iterations is not None:
try: for k, v in average_readouts.items():
title = self._title_gpu if k.startswith('gpu_') else self._title_machine # noinspection PyBroadException
# 3 points after the dot try:
value = round(v*1000) / 1000. title = self._title_gpu if k.startswith('gpu_') else self._title_machine
logger.report_scalar(title=title, series=k, iteration=iteration, value=value) # 3 points after the dot
except Exception: value = round(v*1000) / 1000.
pass logger.report_scalar(title=title, series=k, iteration=iteration, value=value)
self._clear_readouts() except Exception:
pass
self._clear_readouts()
def _update_readouts(self): def _update_readouts(self):
readouts = self._machine_stats() readouts = self._machine_stats()