Fix resource monitor fall back to seconds since experiment started, will keep waiting for any reporting for 30min, while sending monitor report after 3min

This commit is contained in:
allegroai 2019-09-27 13:20:41 +03:00
parent ce77734f5d
commit c0ab9a2f52
2 changed files with 39 additions and 9 deletions

View File

@ -39,6 +39,7 @@ class Logger(object):
self._flusher = None
self._report_worker = None
self._task_handler = None
self._graph_titles = {}
StdStreamPatch.patch_std_streams(self)
@ -78,7 +79,7 @@ class Logger(object):
# if task was not started, we have to start it
self._start_task_if_needed()
self._touch_title_series(title, series)
return self._task.reporter.report_scalar(title=title, series=series, value=float(value), iter=iteration)
def report_vector(self, title, series, values, iteration, labels=None, xlabels=None):
@ -92,6 +93,7 @@ class Logger(object):
:param list(str) labels: optional, labels for each bar group.
:param list(str) xlabels: optional label per entry in the vector (bucket in the histogram)
"""
self._touch_title_series(title, series)
return self.report_histogram(title, series, values, iteration, labels=labels, xlabels=xlabels)
def report_histogram(self, title, series, values, iteration, labels=None, xlabels=None):
@ -111,7 +113,7 @@ class Logger(object):
# if task was not started, we have to start it
self._start_task_if_needed()
self._touch_title_series(title, series)
return self._task.reporter.report_histogram(
title=title,
series=series,
@ -140,7 +142,7 @@ class Logger(object):
# if task was not started, we have to start it
self._start_task_if_needed()
self._touch_title_series(title, series[0].name if series else '')
return self._task.reporter.report_line_plot(
title=title,
series=series,
@ -175,7 +177,7 @@ class Logger(object):
# if task was not started, we have to start it
self._start_task_if_needed()
self._touch_title_series(title, series)
return self._task.reporter.report_2d_scatter(
title=title,
series=series,
@ -228,7 +230,7 @@ class Logger(object):
# if task was not started, we have to start it
self._start_task_if_needed()
self._touch_title_series(title, series)
return self._task.reporter.report_3d_scatter(
title=title,
series=series,
@ -258,7 +260,7 @@ class Logger(object):
# if task was not started, we have to start it
self._start_task_if_needed()
self._touch_title_series(title, series)
return self._task.reporter.report_value_matrix(
title=title,
series=series,
@ -281,6 +283,7 @@ class Logger(object):
:param list(str) xlabels: optional label per column of the matrix
:param list(str) ylabels: optional label per row of the matrix
"""
self._touch_title_series(title, series)
return self.report_confusion_matrix(title, series, matrix, iteration, xlabels=xlabels, ylabels=ylabels)
def report_surface(self, title, series, matrix, iteration, xlabels=None, ylabels=None,
@ -306,7 +309,7 @@ class Logger(object):
# if task was not started, we have to start it
self._start_task_if_needed()
self._touch_title_series(title, series)
return self._task.reporter.report_value_surface(
title=title,
series=series,
@ -351,7 +354,7 @@ class Logger(object):
upload_uri = str(upload_uri)
storage = StorageHelper.get(upload_uri)
upload_uri = storage.verify_upload(folder_uri=upload_uri)
self._touch_title_series(title, series)
self._task.reporter.report_image_and_upload(
title=title,
series=series,
@ -606,6 +609,14 @@ class Logger(object):
if self._task_handler and DevWorker.report_stdout:
self._task_handler.flush()
def _touch_title_series(self, title, series):
if title not in self._graph_titles:
self._graph_titles[title] = set()
self._graph_titles[title].add(series)
def _get_used_title_series(self):
return self._graph_titles
@classmethod
def _get_tensorboard_auto_group_scalars(cls):
"""

View File

@ -19,12 +19,14 @@ class ResourceMonitor(object):
_title_gpu = ':monitor:gpu'
def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
first_report_sec=None, wait_for_first_iteration_to_start_sec=180.):
first_report_sec=None, wait_for_first_iteration_to_start_sec=180.0,
max_wait_for_first_iteration_to_start_sec=1800.):
self._task = task
self._sample_frequency = sample_frequency_per_sec
self._report_frequency = report_frequency_sec
self._first_report_sec = first_report_sec or report_frequency_sec
self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec
self._max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
self._num_readouts = 0
self._readouts = {}
self._previous_readouts = {}
@ -78,6 +80,11 @@ class ResourceMonitor(object):
self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, '
'falling back to iterations as seconds-from-start')
fallback_to_sec_as_iterations = True
elif fallback_to_sec_as_iterations is True and seconds_since_started <= self._max_check_first_iteration:
if self._check_logger_reported():
fallback_to_sec_as_iterations = False
self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, '
'reverting back to iteration based reporting')
clear_readouts = True
# if we do not have last_iteration, we just use seconds as iteration
@ -206,3 +213,15 @@ class ResourceMonitor(object):
self._gpustat = None
return stats
def _check_logger_reported(self):
titles = list(self._task.get_logger()._get_used_title_series().keys())
try:
titles.remove(self._title_machine)
except ValueError:
pass
try:
titles.remove(self._title_gpu)
except ValueError:
pass
return len(titles) > 0