From 30a08ee937a7e35c3e94269d0aa36d9a5aa6201b Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Tue, 29 Sep 2020 19:19:35 +0300 Subject: [PATCH] Add set ResourceMonitor iteration wait duration timeout (issue #208) --- trains/task.py | 22 +++++++++++++++++++++- trains/utilities/resource_monitor.py | 8 ++++---- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/trains/task.py b/trains/task.py index f122d7e1..e65db58e 100644 --- a/trains/task.py +++ b/trains/task.py @@ -343,6 +343,7 @@ class Task(_Task): - ``True`` - Automatically create resource monitoring plots. (default) - ``False`` - Do not automatically create. + - Class Type - Create ResourceMonitor object of the specified class type. :return: The main execution Task (Task context). """ @@ -502,7 +503,9 @@ class Task(_Task): if is_auto_connect_frameworks_bool or auto_connect_frameworks.get('fastai', True): PatchFastai.update_current_task(task) if auto_resource_monitoring and not is_sub_process_task_id: - task._resource_monitor = ResourceMonitor( + resource_monitor_cls = auto_resource_monitoring \ + if isinstance(auto_resource_monitoring, six.class_types) else ResourceMonitor + task._resource_monitor = resource_monitor_cls( task, report_mem_used_per_process=not config.get( 'development.worker.report_global_mem_used', False)) task._resource_monitor.start() @@ -1519,6 +1522,23 @@ class Task(_Task): super(Task, self).set_base_docker(docker_cmd) + def set_resource_monitor_iteration_timeout(self, seconds_from_start=1800): + # type: (float) -> bool + """ + Set the ResourceMonitor maximum duration (in seconds) to wait until first scalar/plot is reported. + If timeout is reached without any reporting, the ResourceMonitor will start reporting machine statistics based + on seconds from Task start time (instead of based on iteration) + + :param seconds_from_start: Maximum number of seconds to wait for scalar/plot reporting before defaulting + to machine statistics reporting based on seconds from experiment start time + :return: True if success + """ + if not self._resource_monitor: + return False + self._resource_monitor.wait_for_first_iteration = seconds_from_start + self._resource_monitor.max_check_first_iteration = seconds_from_start + return True + def execute_remotely(self, queue_name=None, clone=False, exit_process=True): # type: (Optional[str], bool, bool) -> () """ diff --git a/trains/utilities/resource_monitor.py b/trains/utilities/resource_monitor.py index 46b03f46..15f4d9b2 100644 --- a/trains/utilities/resource_monitor.py +++ b/trains/utilities/resource_monitor.py @@ -26,8 +26,8 @@ class ResourceMonitor(object): self._sample_frequency = sample_frequency_per_sec self._report_frequency = report_frequency_sec self._first_report_sec = first_report_sec or report_frequency_sec - self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec - self._max_check_first_iteration = max_wait_for_first_iteration_to_start_sec + self.wait_for_first_iteration = wait_for_first_iteration_to_start_sec + self.max_check_first_iteration = max_wait_for_first_iteration_to_start_sec self._num_readouts = 0 self._readouts = {} self._previous_readouts = {} @@ -104,11 +104,11 @@ class ResourceMonitor(object): if fallback_to_sec_as_iterations is None: if IsTensorboardInit.tensorboard_used(): fallback_to_sec_as_iterations = False - elif seconds_since_started >= self._wait_for_first_iteration: + elif seconds_since_started >= self.wait_for_first_iteration: self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, ' 'falling back to iterations as seconds-from-start') fallback_to_sec_as_iterations = True - elif fallback_to_sec_as_iterations is True and seconds_since_started <= self._max_check_first_iteration: + elif fallback_to_sec_as_iterations is True and seconds_since_started <= self.max_check_first_iteration: if self._check_logger_reported(): fallback_to_sec_as_iterations = False self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, '