Add set ResourceMonitor iteration wait duration timeout (issue #208)

This commit is contained in:
allegroai 2020-09-29 19:19:35 +03:00
parent 86da29b560
commit 30a08ee937
2 changed files with 25 additions and 5 deletions

View File

@ -343,6 +343,7 @@ class Task(_Task):
- ``True`` - Automatically create resource monitoring plots. (default)
- ``False`` - Do not automatically create.
- Class Type - Create ResourceMonitor object of the specified class type.
:return: The main execution Task (Task context).
"""
@ -502,7 +503,9 @@ class Task(_Task):
if is_auto_connect_frameworks_bool or auto_connect_frameworks.get('fastai', True):
PatchFastai.update_current_task(task)
if auto_resource_monitoring and not is_sub_process_task_id:
task._resource_monitor = ResourceMonitor(
resource_monitor_cls = auto_resource_monitoring \
if isinstance(auto_resource_monitoring, six.class_types) else ResourceMonitor
task._resource_monitor = resource_monitor_cls(
task, report_mem_used_per_process=not config.get(
'development.worker.report_global_mem_used', False))
task._resource_monitor.start()
@ -1519,6 +1522,23 @@ class Task(_Task):
super(Task, self).set_base_docker(docker_cmd)
def set_resource_monitor_iteration_timeout(self, seconds_from_start=1800):
    # type: (float) -> bool
    """
    Set how long (in seconds) the ResourceMonitor waits for the first scalar/plot to be reported.

    If the timeout elapses without any reporting, the ResourceMonitor starts reporting machine
    statistics based on seconds from Task start time (instead of based on iteration).

    :param seconds_from_start: Maximum number of seconds to wait for scalar/plot reporting before
        defaulting to machine statistics reporting based on seconds from experiment start time.
        The same value is applied to both ``wait_for_first_iteration`` and
        ``max_check_first_iteration`` of the monitor.
    :return: True if the timeout was applied, False if no ResourceMonitor is set on this Task.
    """
    monitor = self._resource_monitor
    if monitor:
        # Both thresholds are deliberately driven by the single user-facing value.
        monitor.wait_for_first_iteration = seconds_from_start
        monitor.max_check_first_iteration = seconds_from_start
        return True
    return False
def execute_remotely(self, queue_name=None, clone=False, exit_process=True):
# type: (Optional[str], bool, bool) -> ()
"""

View File

@ -26,8 +26,8 @@ class ResourceMonitor(object):
self._sample_frequency = sample_frequency_per_sec
self._report_frequency = report_frequency_sec
self._first_report_sec = first_report_sec or report_frequency_sec
self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec
self._max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
self.wait_for_first_iteration = wait_for_first_iteration_to_start_sec
self.max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
self._num_readouts = 0
self._readouts = {}
self._previous_readouts = {}
@ -104,11 +104,11 @@ class ResourceMonitor(object):
if fallback_to_sec_as_iterations is None:
if IsTensorboardInit.tensorboard_used():
fallback_to_sec_as_iterations = False
elif seconds_since_started >= self._wait_for_first_iteration:
elif seconds_since_started >= self.wait_for_first_iteration:
self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, '
'falling back to iterations as seconds-from-start')
fallback_to_sec_as_iterations = True
elif fallback_to_sec_as_iterations is True and seconds_since_started <= self._max_check_first_iteration:
elif fallback_to_sec_as_iterations is True and seconds_since_started <= self.max_check_first_iteration:
if self._check_logger_reported():
fallback_to_sec_as_iterations = False
self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, '