Add set ResourceMonitor iteration wait duration timeout (issue #208)

2025-06-26 18:16:07 +00:00 · 2020-09-29 19:19:35 +03:00 · 2020-09-29 19:19:35 +03:00 · 30a08ee937
commit 30a08ee937
parent 86da29b560
2 changed files with 25 additions and 5 deletions
--- a/trains/task.py
+++ b/trains/task.py
@ -343,6 +343,7 @@ class Task(_Task):

            - ``True`` - Automatically create resource monitoring plots. (default)
            - ``False`` - Do not automatically create.
+            - Class Type - Create ResourceMonitor object of the specified class type.

        :return: The main execution Task (Task context).
        """
@ -502,7 +503,9 @@ class Task(_Task):
                if is_auto_connect_frameworks_bool or auto_connect_frameworks.get('fastai', True):
                    PatchFastai.update_current_task(task)
            if auto_resource_monitoring and not is_sub_process_task_id:
-                task._resource_monitor = ResourceMonitor(
+                resource_monitor_cls = auto_resource_monitoring \
+                    if isinstance(auto_resource_monitoring, six.class_types) else ResourceMonitor
+                task._resource_monitor = resource_monitor_cls(
                    task, report_mem_used_per_process=not config.get(
                        'development.worker.report_global_mem_used', False))
                task._resource_monitor.start()
@ -1519,6 +1522,23 @@ class Task(_Task):

        super(Task, self).set_base_docker(docker_cmd)

+    def set_resource_monitor_iteration_timeout(self, seconds_from_start=1800):
+        # type: (float) -> bool
+        """
+        Set the ResourceMonitor maximum duration (in seconds) to wait until first scalar/plot is reported.
+        If timeout is reached without any reporting, the ResourceMonitor will start reporting machine statistics based
+        on seconds from Task start time (instead of based on iteration)
+
+        :param seconds_from_start: Maximum number of seconds to wait for scalar/plot reporting before defaulting
+            to machine statistics reporting based on seconds from experiment start time
+        :return: True if success
+        """
+        if not self._resource_monitor:
+            return False
+        self._resource_monitor.wait_for_first_iteration = seconds_from_start
+        self._resource_monitor.max_check_first_iteration = seconds_from_start
+        return True
+
    def execute_remotely(self, queue_name=None, clone=False, exit_process=True):
        # type: (Optional[str], bool, bool) -> ()
        """
--- a/trains/utilities/resource_monitor.py
+++ b/trains/utilities/resource_monitor.py
@ -26,8 +26,8 @@ class ResourceMonitor(object):
        self._sample_frequency = sample_frequency_per_sec
        self._report_frequency = report_frequency_sec
        self._first_report_sec = first_report_sec or report_frequency_sec
-        self._wait_for_first_iteration = wait_for_first_iteration_to_start_sec
-        self._max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
+        self.wait_for_first_iteration = wait_for_first_iteration_to_start_sec
+        self.max_check_first_iteration = max_wait_for_first_iteration_to_start_sec
        self._num_readouts = 0
        self._readouts = {}
        self._previous_readouts = {}
@ -104,11 +104,11 @@ class ResourceMonitor(object):
            if fallback_to_sec_as_iterations is None:
                if IsTensorboardInit.tensorboard_used():
                    fallback_to_sec_as_iterations = False
-                elif seconds_since_started >= self._wait_for_first_iteration:
+                elif seconds_since_started >= self.wait_for_first_iteration:
                    self._task.get_logger().report_text('TRAINS Monitor: Could not detect iteration reporting, '
                                                        'falling back to iterations as seconds-from-start')
                    fallback_to_sec_as_iterations = True
-            elif fallback_to_sec_as_iterations is True and seconds_since_started <= self._max_check_first_iteration:
+            elif fallback_to_sec_as_iterations is True and seconds_since_started <= self.max_check_first_iteration:
                if self._check_logger_reported():
                    fallback_to_sec_as_iterations = False
                    self._task.get_logger().report_text('TRAINS Monitor: Reporting detected, '