mirror of
https://github.com/clearml/clearml
synced 2025-04-10 07:26:03 +00:00
Fix resource monitor fall back to seconds since experiment started, will keep waiting for any reporting for 30min, while sending monitor report after 3min
This commit is contained in:
parent
ce77734f5d
commit
c0ab9a2f52
@ -39,6 +39,7 @@ class Logger(object):
|
||||
self._flusher = None
|
||||
self._report_worker = None
|
||||
self._task_handler = None
|
||||
self._graph_titles = {}
|
||||
|
||||
StdStreamPatch.patch_std_streams(self)
|
||||
|
||||
@ -78,7 +79,7 @@ class Logger(object):
|
||||
|
||||
# if task was not started, we have to start it
|
||||
self._start_task_if_needed()
|
||||
|
||||
self._touch_title_series(title, series)
|
||||
return self._task.reporter.report_scalar(title=title, series=series, value=float(value), iter=iteration)
|
||||
|
||||
def report_vector(self, title, series, values, iteration, labels=None, xlabels=None):
|
||||
@ -92,6 +93,7 @@ class Logger(object):
|
||||
:param list(str) labels: optional, labels for each bar group.
|
||||
:param list(str) xlabels: optional label per entry in the vector (bucket in the histogram)
|
||||
"""
|
||||
self._touch_title_series(title, series)
|
||||
return self.report_histogram(title, series, values, iteration, labels=labels, xlabels=xlabels)
|
||||
|
||||
def report_histogram(self, title, series, values, iteration, labels=None, xlabels=None):
|
||||
@ -111,7 +113,7 @@ class Logger(object):
|
||||
|
||||
# if task was not started, we have to start it
|
||||
self._start_task_if_needed()
|
||||
|
||||
self._touch_title_series(title, series)
|
||||
return self._task.reporter.report_histogram(
|
||||
title=title,
|
||||
series=series,
|
||||
@ -140,7 +142,7 @@ class Logger(object):
|
||||
|
||||
# if task was not started, we have to start it
|
||||
self._start_task_if_needed()
|
||||
|
||||
self._touch_title_series(title, series[0].name if series else '')
|
||||
return self._task.reporter.report_line_plot(
|
||||
title=title,
|
||||
series=series,
|
||||
@ -175,7 +177,7 @@ class Logger(object):
|
||||
|
||||
# if task was not started, we have to start it
|
||||
self._start_task_if_needed()
|
||||
|
||||
self._touch_title_series(title, series)
|
||||
return self._task.reporter.report_2d_scatter(
|
||||
title=title,
|
||||
series=series,
|
||||
@ -228,7 +230,7 @@ class Logger(object):
|
||||
|
||||
# if task was not started, we have to start it
|
||||
self._start_task_if_needed()
|
||||
|
||||
self._touch_title_series(title, series)
|
||||
return self._task.reporter.report_3d_scatter(
|
||||
title=title,
|
||||
series=series,
|
||||
@ -258,7 +260,7 @@ class Logger(object):
|
||||
|
||||
# if task was not started, we have to start it
|
||||
self._start_task_if_needed()
|
||||
|
||||
self._touch_title_series(title, series)
|
||||
return self._task.reporter.report_value_matrix(
|
||||
title=title,
|
||||
series=series,
|
||||
@ -281,6 +283,7 @@ class Logger(object):
|
||||
:param list(str) xlabels: optional label per column of the matrix
|
||||
:param list(str) ylabels: optional label per row of the matrix
|
||||
"""
|
||||
self._touch_title_series(title, series)
|
||||
return self.report_confusion_matrix(title, series, matrix, iteration, xlabels=xlabels, ylabels=ylabels)
|
||||
|
||||
def report_surface(self, title, series, matrix, iteration, xlabels=None, ylabels=None,
|
||||
@ -306,7 +309,7 @@ class Logger(object):
|
||||
|
||||
# if task was not started, we have to start it
|
||||
self._start_task_if_needed()
|
||||
|
||||
self._touch_title_series(title, series)
|
||||
return self._task.reporter.report_value_surface(
|
||||
title=title,
|
||||
series=series,
|
||||
@ -351,7 +354,7 @@ class Logger(object):
|
||||
upload_uri = str(upload_uri)
|
||||
storage = StorageHelper.get(upload_uri)
|
||||
upload_uri = storage.verify_upload(folder_uri=upload_uri)
|
||||
|
||||
self._touch_title_series(title, series)
|
||||
self._task.reporter.report_image_and_upload(
|
||||
title=title,
|
||||
series=series,
|
||||
@ -606,6 +609,14 @@ class Logger(object):
|
||||
if self._task_handler and DevWorker.report_stdout:
|
||||
self._task_handler.flush()
|
||||
|
||||
def _touch_title_series(self, title, series):
|
||||
if title not in self._graph_titles:
|
||||
self._graph_titles[title] = set()
|
||||
self._graph_titles[title].add(series)
|
||||
|
||||
def _get_used_title_series(self):
|
||||
return self._graph_titles
|
||||
|
||||
@classmethod
|
||||
def _get_tensorboard_auto_group_scalars(cls):
|
||||
"""
|
||||
|
@ -19,12 +19,14 @@ class ResourceMonitor(object):
|
||||
_title_gpu = ':monitor:gpu'
|
||||
|
||||
def __init__(self, task, sample_frequency_per_sec=2., report_frequency_sec=30.,
|
||||
first_report_sec=None, wait_for_first_iteration_to_start_sec=180.):
|
||||
first_report_sec=None, wait_for_first_iteration_to_start_sec=180.0,
|
||||
max_wait_for_first_iteration_to_start_sec=1800.):
|
||||
self._task = task
|
||||
self._sample_frequency = sample_frequency_per_sec
|
||||
self._report_frequency = report_frequency_sec
|
||||
self._first_report_sec = first_report_sec or report_frequency_sec
|
||||
self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec
|
||||
self._max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
|
||||
self._num_readouts = 0
|
||||
self._readouts = {}
|
||||
self._previous_readouts = {}
|
||||
@ -78,6 +80,11 @@ class ResourceMonitor(object):
|
||||
self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, '
|
||||
'falling back to iterations as seconds-from-start')
|
||||
fallback_to_sec_as_iterations = True
|
||||
elif fallback_to_sec_as_iterations is True and seconds_since_started <= self._max_check_first_iteration:
|
||||
if self._check_logger_reported():
|
||||
fallback_to_sec_as_iterations = False
|
||||
self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, '
|
||||
'reverting back to iteration based reporting')
|
||||
|
||||
clear_readouts = True
|
||||
# if we do not have last_iteration, we just use seconds as iteration
|
||||
@ -206,3 +213,15 @@ class ResourceMonitor(object):
|
||||
self._gpustat = None
|
||||
|
||||
return stats
|
||||
|
||||
def _check_logger_reported(self):
|
||||
titles = list(self._task.get_logger()._get_used_title_series().keys())
|
||||
try:
|
||||
titles.remove(self._title_machine)
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
titles.remove(self._title_gpu)
|
||||
except ValueError:
|
||||
pass
|
||||
return len(titles) > 0
|
||||
|
Loading…
Reference in New Issue
Block a user