mirror of
https://github.com/clearml/clearml
synced 2025-01-31 17:17:00 +00:00
Add set ResourceMonitor iteration wait duration timeout (issue #208)
This commit is contained in:
parent
86da29b560
commit
30a08ee937
@ -343,6 +343,7 @@ class Task(_Task):
|
||||
|
||||
- ``True`` - Automatically create resource monitoring plots. (default)
|
||||
- ``False`` - Do not automatically create.
|
||||
- Class Type - Create ResourceMonitor object of the specified class type.
|
||||
|
||||
:return: The main execution Task (Task context).
|
||||
"""
|
||||
@ -502,7 +503,9 @@ class Task(_Task):
|
||||
if is_auto_connect_frameworks_bool or auto_connect_frameworks.get('fastai', True):
|
||||
PatchFastai.update_current_task(task)
|
||||
if auto_resource_monitoring and not is_sub_process_task_id:
|
||||
task._resource_monitor = ResourceMonitor(
|
||||
resource_monitor_cls = auto_resource_monitoring \
|
||||
if isinstance(auto_resource_monitoring, six.class_types) else ResourceMonitor
|
||||
task._resource_monitor = resource_monitor_cls(
|
||||
task, report_mem_used_per_process=not config.get(
|
||||
'development.worker.report_global_mem_used', False))
|
||||
task._resource_monitor.start()
|
||||
@ -1519,6 +1522,23 @@ class Task(_Task):
|
||||
|
||||
super(Task, self).set_base_docker(docker_cmd)
|
||||
|
||||
def set_resource_monitor_iteration_timeout(self, seconds_from_start=1800):
|
||||
# type: (float) -> bool
|
||||
"""
|
||||
Set the ResourceMonitor maximum duration (in seconds) to wait until first scalar/plot is reported.
|
||||
If timeout is reached without any reporting, the ResourceMonitor will start reporting machine statistics based
|
||||
on seconds from Task start time (instead of based on iteration)
|
||||
|
||||
:param seconds_from_start: Maximum number of seconds to wait for scalar/plot reporting before defaulting
|
||||
to machine statistics reporting based on seconds from experiment start time
|
||||
:return: True if success
|
||||
"""
|
||||
if not self._resource_monitor:
|
||||
return False
|
||||
self._resource_monitor.wait_for_first_iteration = seconds_from_start
|
||||
self._resource_monitor.max_check_first_iteration = seconds_from_start
|
||||
return True
|
||||
|
||||
def execute_remotely(self, queue_name=None, clone=False, exit_process=True):
|
||||
# type: (Optional[str], bool, bool) -> ()
|
||||
"""
|
||||
|
@ -26,8 +26,8 @@ class ResourceMonitor(object):
|
||||
self._sample_frequency = sample_frequency_per_sec
|
||||
self._report_frequency = report_frequency_sec
|
||||
self._first_report_sec = first_report_sec or report_frequency_sec
|
||||
self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec
|
||||
self._max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
|
||||
self.wait_for_first_iteration = wait_for_first_iteration_to_start_sec
|
||||
self.max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
|
||||
self._num_readouts = 0
|
||||
self._readouts = {}
|
||||
self._previous_readouts = {}
|
||||
@ -104,11 +104,11 @@ class ResourceMonitor(object):
|
||||
if fallback_to_sec_as_iterations is None:
|
||||
if IsTensorboardInit.tensorboard_used():
|
||||
fallback_to_sec_as_iterations = False
|
||||
elif seconds_since_started >= self._wait_for_first_iteration:
|
||||
elif seconds_since_started >= self.wait_for_first_iteration:
|
||||
self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, '
|
||||
'falling back to iterations as seconds-from-start')
|
||||
fallback_to_sec_as_iterations = True
|
||||
elif fallback_to_sec_as_iterations is True and seconds_since_started <= self._max_check_first_iteration:
|
||||
elif fallback_to_sec_as_iterations is True and seconds_since_started <= self.max_check_first_iteration:
|
||||
if self._check_logger_reported():
|
||||
fallback_to_sec_as_iterations = False
|
||||
self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, '
|
||||
|
Loading…
Reference in New Issue
Block a user