mirror of
https://github.com/clearml/clearml
synced 2025-02-07 21:33:25 +00:00
Add set ResourceMonitor iteration wait duration timeout (issue #208)
This commit is contained in:
parent
86da29b560
commit
30a08ee937
@ -343,6 +343,7 @@ class Task(_Task):
|
|||||||
|
|
||||||
- ``True`` - Automatically create resource monitoring plots. (default)
|
- ``True`` - Automatically create resource monitoring plots. (default)
|
||||||
- ``False`` - Do not automatically create.
|
- ``False`` - Do not automatically create.
|
||||||
|
- Class Type - Create ResourceMonitor object of the specified class type.
|
||||||
|
|
||||||
:return: The main execution Task (Task context).
|
:return: The main execution Task (Task context).
|
||||||
"""
|
"""
|
||||||
@ -502,7 +503,9 @@ class Task(_Task):
|
|||||||
if is_auto_connect_frameworks_bool or auto_connect_frameworks.get('fastai', True):
|
if is_auto_connect_frameworks_bool or auto_connect_frameworks.get('fastai', True):
|
||||||
PatchFastai.update_current_task(task)
|
PatchFastai.update_current_task(task)
|
||||||
if auto_resource_monitoring and not is_sub_process_task_id:
|
if auto_resource_monitoring and not is_sub_process_task_id:
|
||||||
task._resource_monitor = ResourceMonitor(
|
resource_monitor_cls = auto_resource_monitoring \
|
||||||
|
if isinstance(auto_resource_monitoring, six.class_types) else ResourceMonitor
|
||||||
|
task._resource_monitor = resource_monitor_cls(
|
||||||
task, report_mem_used_per_process=not config.get(
|
task, report_mem_used_per_process=not config.get(
|
||||||
'development.worker.report_global_mem_used', False))
|
'development.worker.report_global_mem_used', False))
|
||||||
task._resource_monitor.start()
|
task._resource_monitor.start()
|
||||||
@ -1519,6 +1522,23 @@ class Task(_Task):
|
|||||||
|
|
||||||
super(Task, self).set_base_docker(docker_cmd)
|
super(Task, self).set_base_docker(docker_cmd)
|
||||||
|
|
||||||
|
def set_resource_monitor_iteration_timeout(self, seconds_from_start=1800):
    # type: (float) -> bool
    """
    Configure how long (in seconds) the ResourceMonitor waits for the first scalar/plot report.

    When the timeout elapses without any reporting, the ResourceMonitor reports machine
    statistics using seconds from Task start time as the iteration axis (instead of iterations).

    :param seconds_from_start: Maximum number of seconds to wait for scalar/plot reporting before
        falling back to machine statistics reporting based on seconds from experiment start time
    :return: True if the timeout was applied, False if the Task has no resource monitor
    """
    monitor = self._resource_monitor
    if monitor:
        # Both thresholds get the same value, assigned in this order:
        # first the initial wait, then the upper bound for the first-iteration check.
        for attribute in ('wait_for_first_iteration', 'max_check_first_iteration'):
            setattr(monitor, attribute, seconds_from_start)
        return True
    return False
|
||||||
|
|
||||||
def execute_remotely(self, queue_name=None, clone=False, exit_process=True):
|
def execute_remotely(self, queue_name=None, clone=False, exit_process=True):
|
||||||
# type: (Optional[str], bool, bool) -> ()
|
# type: (Optional[str], bool, bool) -> ()
|
||||||
"""
|
"""
|
||||||
|
@ -26,8 +26,8 @@ class ResourceMonitor(object):
|
|||||||
self._sample_frequency = sample_frequency_per_sec
|
self._sample_frequency = sample_frequency_per_sec
|
||||||
self._report_frequency = report_frequency_sec
|
self._report_frequency = report_frequency_sec
|
||||||
self._first_report_sec = first_report_sec or report_frequency_sec
|
self._first_report_sec = first_report_sec or report_frequency_sec
|
||||||
self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec
|
self.wait_for_first_iteration = wait_for_first_iteration_to_start_sec
|
||||||
self._max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
|
self.max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
|
||||||
self._num_readouts = 0
|
self._num_readouts = 0
|
||||||
self._readouts = {}
|
self._readouts = {}
|
||||||
self._previous_readouts = {}
|
self._previous_readouts = {}
|
||||||
@ -104,11 +104,11 @@ class ResourceMonitor(object):
|
|||||||
if fallback_to_sec_as_iterations is None:
|
if fallback_to_sec_as_iterations is None:
|
||||||
if IsTensorboardInit.tensorboard_used():
|
if IsTensorboardInit.tensorboard_used():
|
||||||
fallback_to_sec_as_iterations = False
|
fallback_to_sec_as_iterations = False
|
||||||
elif seconds_since_started >= self._wait_for_first_iteration:
|
elif seconds_since_started >= self.wait_for_first_iteration:
|
||||||
self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, '
|
self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, '
|
||||||
'falling back to iterations as seconds-from-start')
|
'falling back to iterations as seconds-from-start')
|
||||||
fallback_to_sec_as_iterations = True
|
fallback_to_sec_as_iterations = True
|
||||||
elif fallback_to_sec_as_iterations is True and seconds_since_started <= self._max_check_first_iteration:
|
elif fallback_to_sec_as_iterations is True and seconds_since_started <= self.max_check_first_iteration:
|
||||||
if self._check_logger_reported():
|
if self._check_logger_reported():
|
||||||
fallback_to_sec_as_iterations = False
|
fallback_to_sec_as_iterations = False
|
||||||
self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, '
|
self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, '
|
||||||
|
Loading…
Reference in New Issue
Block a user