diff --git a/trains/backend_interface/metrics/reporter.py b/trains/backend_interface/metrics/reporter.py index ea5a172b..04e8cd56 100644 --- a/trains/backend_interface/metrics/reporter.py +++ b/trains/backend_interface/metrics/reporter.py @@ -55,6 +55,7 @@ class Reporter(InterfaceBase, AbstractContextManager, SetupUploadMixin, AsyncMan self._thread = Thread(target=self._daemon) self._thread.daemon = True self._thread.start() + self._max_iteration = 0 def _set_storage_uri(self, value): value = '/'.join(x for x in (value.rstrip('/'), self._metrics.storage_key_prefix) if x) @@ -78,6 +79,10 @@ class Reporter(InterfaceBase, AbstractContextManager, SetupUploadMixin, AsyncMan def async_enable(self, value): self._async_enable = bool(value) + @property + def max_iteration(self): + return self._max_iteration + def _daemon(self): while not self._exit_flag: self._flush_event.wait(self._flush_frequency) @@ -92,6 +97,9 @@ class Reporter(InterfaceBase, AbstractContextManager, SetupUploadMixin, AsyncMan self.wait_for_results() def _report(self, ev): + ev_iteration = ev.get_iteration() + if ev_iteration is not None: + self._max_iteration = max(self._max_iteration, ev_iteration) self._events.append(ev) if len(self._events) >= self._flush_threshold: self.flush() diff --git a/trains/backend_interface/task/task.py b/trains/backend_interface/task/task.py index 05c835ab..cdcd8e0e 100644 --- a/trains/backend_interface/task/task.py +++ b/trains/backend_interface/task/task.py @@ -389,7 +389,7 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin): return self._reporter def _get_output_destination_suffix(self, extra_path=None): - return '/'.join(quote(x, safe='[]{}()$^,.; -_+-=') for x in + return '/'.join(quote(x, safe="'[]{}()$^,.; -_+-=") for x in (self.get_project_name(), '%s.%s' % (self.name, self.data.id), extra_path) if x) def _reload(self): diff --git a/trains/binding/frameworks/tensorflow_bind.py b/trains/binding/frameworks/tensorflow_bind.py index ab263bb8..c68637d3 100644 --- 
a/trains/binding/frameworks/tensorflow_bind.py +++ b/trains/binding/frameworks/tensorflow_bind.py @@ -66,6 +66,15 @@ class EventTrainsWriter(object): _title_series_writers_lookup = {} _event_writers_id_to_logdir = {} + # Protect against step (iteration) reuse, for example, + # steps counter inside an epoch, but wrapping around when epoch ends + # i.e. step = 0..100 then epoch ends and again step = 0..100 + # We store the first report per title/series combination, and if wraparound occurs + # we synthetically continue to increase the step/iteration based on the previous epoch counter + # example: _title_series_wraparound_counter[('title', 'series')] = + # {'first_step':None, 'last_step':None, 'adjust_counter':0,} + _title_series_wraparound_counter = {} + @property def variants(self): return self._variants @@ -111,8 +120,8 @@ class EventTrainsWriter(object): org_series = series org_title = title other_logdir = self._event_writers_id_to_logdir[event_writer_id] - split_logddir = self._logdir.split(os.path.sep) - unique_logdir = set(split_logddir) - set(other_logdir.split(os.path.sep)) + split_logddir = self._logdir.split('/') + unique_logdir = set(split_logddir) - set(other_logdir.split('/')) header = '/'.join(s for s in split_logddir if s in unique_logdir) if logdir_header == 'series_last': series = header + ': ' + series @@ -160,6 +169,9 @@ class EventTrainsWriter(object): # We are the events_writer, so that's what we'll pass IsTensorboardInit.set_tensorboard_used() self._logdir = logdir or ('unknown %d' % len(self._event_writers_id_to_logdir)) + # conform directory structure to unix + if os.path.sep == '\\': + self._logdir = self._logdir.replace('\\', '/') self._id = hash(self._logdir) self._event_writers_id_to_logdir[self._id] = self._logdir self.max_keep_images = max_keep_images @@ -220,6 +232,8 @@ class EventTrainsWriter(object): title, series = self.tag_splitter(tag, num_split_parts=3, default_title='Images', logdir_header='title', auto_reduce_num_split=True) + 
step = self._fix_step_counter(title, series, step) + + if img_data_np.dtype != np.uint8: # assume scale 0-1 img_data_np = (img_data_np * 255).astype(np.uint8) @@ -259,6 +273,7 @@ class EventTrainsWriter(object): default_title = tag if not self._logger._get_tensorboard_auto_group_scalars() else 'Scalars' title, series = self.tag_splitter(tag, num_split_parts=1, default_title=default_title, logdir_header='series_last') + step = self._fix_step_counter(title, series, step) # update scalar cache num, value = self._scalar_report_cache.get((title, series), (0, 0)) @@ -310,6 +325,7 @@ class EventTrainsWriter(object): # Z-axis actual value (interpolated 'bucket') title, series = self.tag_splitter(tag, num_split_parts=1, default_title='Histograms', logdir_header='series') + step = self._fix_step_counter(title, series, step) # get histograms from cache hist_list, hist_iters, minmax = self._hist_report_cache.get((title, series), ([], np.array([]), None)) @@ -418,6 +434,23 @@ class EventTrainsWriter(object): except Exception: pass + def _fix_step_counter(self, title, series, step): + key = (title, series) + if key not in EventTrainsWriter._title_series_wraparound_counter: + EventTrainsWriter._title_series_wraparound_counter[key] = {'first_step': step, 'last_step': step, + 'adjust_counter': 0} + return step + wraparound_counter = EventTrainsWriter._title_series_wraparound_counter[key] + # we decide on wrap around if the current step dropped more than 10% below the previous step (step < 0.9 * last_step) + # note: since the counter is an int and we want to avoid rounding errors, the condition below checks both comparisons + if step < wraparound_counter['last_step'] and step < 0.9*wraparound_counter['last_step']: + # adjust step base line + wraparound_counter['adjust_counter'] += wraparound_counter['last_step'] + (1 if step <= 0 else step) + + # return adjusted step + wraparound_counter['last_step'] = step + return step + wraparound_counter['adjust_counter'] + def add_event(self, event, step=None, walltime=None, **kwargs): 
supported_metrics = { 'simpleValue', 'image', 'histo', 'tensor' diff --git a/trains/binding/matplotlib_bind.py b/trains/binding/matplotlib_bind.py index d1507152..f4e083c6 100644 --- a/trains/binding/matplotlib_bind.py +++ b/trains/binding/matplotlib_bind.py @@ -21,6 +21,8 @@ class PatchedMatplotlib: __patched_draw_all_recursion_guard = False _global_plot_counter = -1 _global_image_counter = -1 + _global_image_counter_limit = None + _last_iteration_plot_titles = (-1, []) _current_task = None _support_image_plot = False _matplotlylib = None @@ -125,6 +127,9 @@ class PatchedMatplotlib: def update_current_task(task): if PatchedMatplotlib.patch_matplotlib(): PatchedMatplotlib._current_task = task + if PatchedMatplotlib._global_image_counter_limit is None: + from ..config import config + PatchedMatplotlib._global_image_counter_limit = config.get('metric.matplotlib_untitled_history_size', 100) @staticmethod def patched_imshow(*args, **kw): @@ -310,8 +315,13 @@ class PatchedMatplotlib: # remove borders and size, we should let the web take care of that if plotly_fig: - PatchedMatplotlib._global_plot_counter += 1 - title = plot_title or 'untitled %d' % PatchedMatplotlib._global_plot_counter + last_iteration = PatchedMatplotlib._current_task.get_last_iteration() + if plot_title: + title = PatchedMatplotlib._enforce_unique_title_per_iteration(plot_title, last_iteration) + else: + PatchedMatplotlib._global_plot_counter += 1 + title = 'untitled %d' % PatchedMatplotlib._global_plot_counter + plotly_fig.layout.margin = {} plotly_fig.layout.autosize = True plotly_fig.layout.height = None @@ -321,38 +331,59 @@ class PatchedMatplotlib: if not plotly_dict.get('layout'): plotly_dict['layout'] = {} plotly_dict['layout']['title'] = title - reporter.report_plot(title=title, series='plot', plot=plotly_dict, - iter=PatchedMatplotlib._global_plot_counter if plot_title else 0) + reporter.report_plot(title=title, series='plot', plot=plotly_dict, iter=last_iteration) else: logger = 
PatchedMatplotlib._current_task.get_logger() # this is actually a failed plot, we should put it under plots: # currently disabled if force_save_as_image or not PatchedMatplotlib._support_image_plot: + last_iteration = PatchedMatplotlib._current_task.get_last_iteration() # send the plot as image - PatchedMatplotlib._global_image_counter += 1 - title = plot_title or 'untitled %d' % PatchedMatplotlib._global_image_counter + if plot_title: + title = PatchedMatplotlib._enforce_unique_title_per_iteration(plot_title, last_iteration) + else: + PatchedMatplotlib._global_image_counter += 1 + title = 'untitled %d' % (PatchedMatplotlib._global_image_counter % + PatchedMatplotlib._global_image_counter_limit) logger.report_image(title=title, series='plot image', local_path=image, - delete_after_upload=True, - iteration=PatchedMatplotlib._global_image_counter - if plot_title else 0) + delete_after_upload=True, iteration=last_iteration) else: # send the plot as plotly with embedded image - PatchedMatplotlib._global_plot_counter += 1 - title = plot_title or 'untitled %d' % PatchedMatplotlib._global_plot_counter + last_iteration = PatchedMatplotlib._current_task.get_last_iteration() + if plot_title: + title = PatchedMatplotlib._enforce_unique_title_per_iteration(plot_title, last_iteration) + else: + PatchedMatplotlib._global_plot_counter += 1 + title = 'untitled %d' % (PatchedMatplotlib._global_plot_counter % + PatchedMatplotlib._global_image_counter_limit) logger._report_image_plot_and_upload(title=title, series='plot image', path=image, - delete_after_upload=True, - iteration=PatchedMatplotlib._global_plot_counter - if plot_title else 0) - + delete_after_upload=True, iteration=last_iteration) except Exception: # plotly failed pass return + @staticmethod + def _enforce_unique_title_per_iteration(title, last_iteration): + if last_iteration != PatchedMatplotlib._last_iteration_plot_titles[0]: + PatchedMatplotlib._last_iteration_plot_titles = (last_iteration, [title]) + elif title not 
in PatchedMatplotlib._last_iteration_plot_titles[1]: + PatchedMatplotlib._last_iteration_plot_titles[1].append(title) + else: + base_title = title + counter = 1 + while title in PatchedMatplotlib._last_iteration_plot_titles[1]: + # we already used this title in this iteration, we should change the title + title = base_title + ' %d' % counter + counter += 1 + # store the new title + PatchedMatplotlib._last_iteration_plot_titles[1].append(title) + return title + @staticmethod def _get_output_figures(stored_figure, all_figures): try: diff --git a/trains/config/default/sdk.conf b/trains/config/default/sdk.conf index 96ddf0b4..d181c4e3 100644 --- a/trains/config/default/sdk.conf +++ b/trains/config/default/sdk.conf @@ -21,6 +21,11 @@ # X files are stored in the upload destination for each metric/variant combination. file_history_size: 100 + # Max history size for untitled matplotlib imshow files (figures reported as images without a plot title). + # Such figures are titled 'untitled N', and N is recycled (wraps around modulo X) so that no more than + # X untitled matplotlib images are stored in the upload destination. + matplotlib_untitled_history_size: 100 + # Settings for generated debug images images { format: JPEG