From c0ab9a2f525c477952174f0c22afecffa9156b89 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Fri, 27 Sep 2019 13:20:41 +0300 Subject: [PATCH] Fix resource monitor fall back to seconds since experiment started, will keep waiting for any reporting for 30min, while sending monitor report after 3min --- trains/logger.py | 27 +++++++++++++++++++-------- trains/utilities/resource_monitor.py | 21 ++++++++++++++++++++- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/trains/logger.py b/trains/logger.py index 3e472678..e8a3bb5e 100644 --- a/trains/logger.py +++ b/trains/logger.py @@ -39,6 +39,7 @@ class Logger(object): self._flusher = None self._report_worker = None self._task_handler = None + self._graph_titles = {} StdStreamPatch.patch_std_streams(self) @@ -78,7 +79,7 @@ class Logger(object): # if task was not started, we have to start it self._start_task_if_needed() - + self._touch_title_series(title, series) return self._task.reporter.report_scalar(title=title, series=series, value=float(value), iter=iteration) def report_vector(self, title, series, values, iteration, labels=None, xlabels=None): @@ -92,6 +93,7 @@ class Logger(object): :param list(str) labels: optional, labels for each bar group. :param list(str) xlabels: optional label per entry in the vector (bucket in the histogram) """ + self._touch_title_series(title, series) return self.report_histogram(title, series, values, iteration, labels=labels, xlabels=xlabels) def report_histogram(self, title, series, values, iteration, labels=None, xlabels=None): @@ -111,7 +113,7 @@ class Logger(object): # if task was not started, we have to start it self._start_task_if_needed() - + self._touch_title_series(title, series) return self._task.reporter.report_histogram( title=title, series=series, @@ -140,7 +142,7 @@ class Logger(object): # if task was not started, we have to start it self._start_task_if_needed() - + self._touch_title_series(title, series[0].name if series else '') return self._task.reporter.report_line_plot( title=title, series=series, @@ -175,7 +177,7 @@ class Logger(object): # if task was not started, we have to start it self._start_task_if_needed() - + self._touch_title_series(title, series) return self._task.reporter.report_2d_scatter( title=title, series=series, @@ -228,7 +230,7 @@ class Logger(object): # if task was not started, we have to start it self._start_task_if_needed() - + self._touch_title_series(title, series) return self._task.reporter.report_3d_scatter( title=title, series=series, @@ -258,7 +260,7 @@ class Logger(object): # if task was not started, we have to start it self._start_task_if_needed() - + self._touch_title_series(title, series) return self._task.reporter.report_value_matrix( title=title, series=series, @@ -281,6 +283,7 @@ class Logger(object): :param list(str) xlabels: optional label per column of the matrix :param list(str) ylabels: optional label per row of the matrix """ + self._touch_title_series(title, series) return self.report_confusion_matrix(title, series, matrix, iteration, xlabels=xlabels, ylabels=ylabels) def report_surface(self, title, series, matrix, iteration, xlabels=None, ylabels=None, @@ -306,7 +309,7 @@ class Logger(object): # if task was not started, we have to start it self._start_task_if_needed() - + self._touch_title_series(title, series) return self._task.reporter.report_value_surface( title=title, series=series, @@ -351,7 +354,7 @@ class Logger(object): upload_uri = str(upload_uri) storage = StorageHelper.get(upload_uri) upload_uri = storage.verify_upload(folder_uri=upload_uri) - + self._touch_title_series(title, series) self._task.reporter.report_image_and_upload( title=title, series=series, @@ -606,6 +609,14 @@ class Logger(object): if self._task_handler and DevWorker.report_stdout: self._task_handler.flush() + def _touch_title_series(self, title, series): + if title not in self._graph_titles: + self._graph_titles[title] = set() + self._graph_titles[title].add(series) + + def _get_used_title_series(self): + return self._graph_titles + @classmethod def _get_tensorboard_auto_group_scalars(cls): """ diff --git a/trains/utilities/resource_monitor.py b/trains/utilities/resource_monitor.py index a40a790f..c7669b77 100644 --- a/trains/utilities/resource_monitor.py +++ b/trains/utilities/resource_monitor.py @@ -19,12 +19,14 @@ class ResourceMonitor(object): _title_gpu = ':monitor:gpu' def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30., - first_report_sec=None, wait_for_first_iteration_to_start_sec=180.): + first_report_sec=None, wait_for_first_iteration_to_start_sec=180.0, + max_wait_for_first_iteration_to_start_sec=1800.): self._task = task self._sample_frequency = sample_frequency_per_sec self._report_frequency = report_frequency_sec self._first_report_sec = first_report_sec or report_frequency_sec self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec + self._max_check_first_iteration = max_wait_for_first_iteration_to_start_sec self._num_readouts = 0 self._readouts = {} self._previous_readouts = {} @@ -78,6 +80,11 @@ class ResourceMonitor(object): self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, ' 'falling back to iterations as seconds-from-start') fallback_to_sec_as_iterations = True + elif fallback_to_sec_as_iterations is True and seconds_since_started <= self._max_check_first_iteration: + if self._check_logger_reported(): + fallback_to_sec_as_iterations = False + self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, ' + 'reverting back to iteration based reporting') clear_readouts = True # if we do not have last_iteration, we just use seconds as iteration @@ -206,3 +213,15 @@ class ResourceMonitor(object): self._gpustat = None return stats + + def _check_logger_reported(self): + titles = list(self._task.get_logger()._get_used_title_series().keys()) + try: + titles.remove(self._title_machine) + except ValueError: + pass + try: + titles.remove(self._title_gpu) + except ValueError: + pass + return len(titles) > 0