Add PipelineController.stop() mark_failed and mark_aborted args
Add pipeline decorator run_locally
Add PipelineController.continue_on_fail property
Add PipelineController.__init__() abort_on_failure arg

parent e4ceeb2c11
commit 04797d25d1
@@ -20,7 +20,7 @@ from ..debugging.log import LoggerRoot
 from ..model import BaseModel
 from ..task import Task
 from ..utilities.process.mp import leave_process
-from ..utilities.proxy_object import LazyEvalWrapper
+from ..utilities.proxy_object import LazyEvalWrapper, flatten_dictionary
 
 
 class PipelineController(object):
@@ -57,6 +57,7 @@ class PipelineController(object):
         clone_task = attrib(type=bool, default=True)  # If True clone the base_task_id, then execute the cloned Task
         job = attrib(type=ClearmlJob, default=None)  # ClearmlJob object
         skip_job = attrib(type=bool, default=False)  # if True, this step should be skipped
+        continue_on_fail = attrib(type=bool, default=False)  # if True, the pipeline continues even if the step failed
         cache_executed_step = attrib(type=bool, default=False)  # if True this pipeline step should be cached
         return_artifacts = attrib(type=list, default=[])  # List of artifact names returned by the step
         monitor_metrics = attrib(type=list, default=[])  # List of metric title/series to monitor
@@ -86,6 +87,7 @@ class PipelineController(object):
             add_pipeline_tags=False,  # type: bool
             target_project=None,  # type: Optional[str]
             auto_version_bump=True,  # type: bool
+            abort_on_failure=False,  # type: bool
     ):
         # type: (...) -> None
         """
@@ -102,6 +104,12 @@ class PipelineController(object):
         :param bool auto_version_bump: If True (default), if the same pipeline version already exists
             (with any difference from the current one), the current pipeline version will be bumped to a new version
             version bump examples: 1.0.0 -> 1.0.1 , 1.2 -> 1.3, 10 -> 11 etc.
+        :param bool abort_on_failure: If False (default), failed pipeline steps will not cause the pipeline
+            to stop immediately; instead, any step that is not connected (directly or indirectly) to the failed step
+            will still be executed. Nonetheless the pipeline itself will be marked failed, unless the failed step
+            was specifically defined with "continue_on_fail=True".
+            If True, any failed step will cause the pipeline to immediately abort, stop all running steps,
+            and mark the pipeline as failed.
         """
         self._nodes = {}
         self._running_nodes = []
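For context, a minimal usage sketch of the new abort_on_failure argument (the pipeline name, project, and version below are illustrative, not taken from this commit):

    from clearml.automation.controller import PipelineController

    # fail-fast pipeline: any failed step aborts all running steps
    # and marks the pipeline Task as failed
    pipe = PipelineController(
        name='pipeline demo',    # illustrative
        project='examples',      # illustrative
        version='0.1',
        abort_on_failure=True,   # added by this commit
    )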
@@ -147,6 +155,7 @@ class PipelineController(object):
             self._task.add_tags([self._tag])
 
         self._monitored_nodes = {}  # type: Dict[str, dict]
+        self._abort_running_steps_on_failure = abort_on_failure
 
     def set_default_execution_queue(self, default_execution_queue):
         # type: (Optional[str]) -> None
@@ -183,6 +192,7 @@ class PipelineController(object):
             base_task_project=None,  # type: Optional[str]
             base_task_name=None,  # type: Optional[str]
             clone_base_task=True,  # type: bool
+            continue_on_fail=False,  # type: bool
             pre_execute_callback=None,  # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]]  # noqa
             post_execute_callback=None,  # type: Optional[Callable[[PipelineController, PipelineController.Node], None]]  # noqa
             cache_executed_step=False,  # type: bool
@@ -258,6 +268,9 @@ class PipelineController(object):
             use the base_task_project and base_task_name combination to retrieve the base_task_id to use for the step.
         :param clone_base_task: If True (default) the pipeline will clone the base task, and modify/enqueue
             the cloned Task. If False, the base-task is used directly, notice it has to be in draft-mode (created).
+        :param continue_on_fail: (default False). If True, a failed step will not cause the pipeline to stop
+            (or be marked as failed). Notice that steps that are connected (directly or indirectly)
+            to the failed step will be skipped.
         :param pre_execute_callback: Callback function, called when the step (Task) is created
             and before it is sent for execution. Allows a user to modify the Task before launch.
             Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object.
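A sketch of the matching add_step() call (the step and base-task names are placeholders):

    # this step may fail without failing the whole pipeline;
    # steps that depend on it will be skipped
    pipe.add_step(
        name='process_dataset',                    # placeholder
        parents=['load_dataset'],                  # placeholder
        base_task_project='examples',              # placeholder
        base_task_name='pipeline step 2 process',  # placeholder
        continue_on_fail=True,
    )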
@@ -344,6 +357,9 @@ class PipelineController(object):
             raise ValueError("configuration_overrides must be a dictionary, with all values "
                              "either dicts or strings, got \'{}\' instead".format(configuration_overrides))
 
+        if task_overrides:
+            task_overrides = flatten_dictionary(task_overrides, sep='.')
+
         self._nodes[name] = self.Node(
             name=name, base_task_id=base_task_id, parents=parents or [],
             queue=execution_queue, timeout=time_limit,
@@ -352,6 +368,7 @@ class PipelineController(object):
             clone_task=clone_base_task,
             task_overrides=task_overrides,
             cache_executed_step=cache_executed_step,
+            continue_on_fail=continue_on_fail,
             task_factory_func=base_task_factory,
             monitor_metrics=monitor_metrics or [],
             monitor_artifacts=monitor_artifacts or [],
@@ -386,6 +403,7 @@ class PipelineController(object):
             monitor_artifacts=None,  # type: Optional[List[Union[str, Tuple[str, str]]]]
             monitor_models=None,  # type: Optional[List[Union[str, Tuple[str, str]]]]
             time_limit=None,  # type: Optional[float]
+            continue_on_fail=False,  # type: bool
             pre_execute_callback=None,  # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]]  # noqa
             post_execute_callback=None,  # type: Optional[Callable[[PipelineController, PipelineController.Node], None]]  # noqa
             cache_executed_step=False,  # type: bool
@@ -474,6 +492,9 @@ class PipelineController(object):
             Example: ['model_weights_*', ]
         :param time_limit: Default None, no time limit.
             Step execution time limit, if exceeded the Task is aborted and the pipeline is stopped and marked failed.
+        :param continue_on_fail: (default False). If True, a failed step will not cause the pipeline to stop
+            (or be marked as failed). Notice that steps that are connected (directly or indirectly)
+            to the failed step will be skipped.
         :param pre_execute_callback: Callback function, called when the step (Task) is created
             and before it is sent for execution. Allows a user to modify the Task before launch.
             Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object.
@@ -587,6 +608,7 @@ class PipelineController(object):
             clone_task=False,
             cache_executed_step=cache_executed_step,
             task_factory_func=_create_task,
+            continue_on_fail=continue_on_fail,
             return_artifacts=function_return,
             monitor_artifacts=monitor_artifacts,
             monitor_metrics=monitor_metrics,
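For the function-step variant, a sketch with the same flag (the function body and the '${load.data}' reference are illustrative placeholders):

    def process(data):
        # placeholder step body
        return data

    pipe.add_function_step(
        name='process',
        function=process,
        function_kwargs=dict(data='${load.data}'),  # placeholder reference
        function_return=['processed'],
        continue_on_fail=True,
    )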
@@ -674,9 +696,12 @@ class PipelineController(object):
     def start_locally(self, run_pipeline_steps_locally=False):
         # type: (bool) -> None
         """
-        Start the current pipeline locally, in most cases for debug purposes.
-        By default it will be running the DAG itself locally, as sub-processes.
-        Notice: running the DAG locally assumes the local code execution (i.e. it will not clone & apply git diff)
+        Start the current pipeline locally, meaning the pipeline logic is running on the current machine,
+        instead of on the `services` queue.
+
+        Using run_pipeline_steps_locally=True you can run all the pipeline steps locally as sub-processes.
+        Notice: when running pipeline steps locally, it assumes local code execution
+        (i.e. it is running the local code as is, regardless of the git commit/diff on the pipeline steps Task)
 
         :param run_pipeline_steps_locally: (default False) If True, run the
             pipeline steps themselves locally as a subprocess (use for debugging the pipeline locally,
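A sketch of the local-run flow (assumes the `pipe` controller built above):

    # run the pipeline logic on this machine; the steps themselves
    # also run here as sub-processes (local code, no git clone/diff)
    pipe.start_locally(run_pipeline_steps_locally=True)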
@@ -789,20 +814,31 @@ class PipelineController(object):
             name=name, artifact_object=artifact_object, metadata=metadata, delete_after_upload=delete_after_upload,
             auto_pickle=auto_pickle, preview=preview, wait_on_upload=wait_on_upload)
 
-    def stop(self, timeout=None):
-        # type: (Optional[float]) -> ()
+    def stop(self, timeout=None, mark_failed=False, mark_aborted=False):
+        # type: (Optional[float], bool, bool) -> ()
         """
         Stop the pipeline controller and the optimization thread.
+        If mark_failed and mark_aborted are False (default), the pipeline is marked as completed,
+        unless one of the steps failed, in which case the pipeline is marked as failed.
 
-        :param float timeout: Wait timeout for the optimization thread to exit (minutes).
+        :param timeout: Wait timeout for the optimization thread to exit (minutes).
             The default is ``None``, indicating do not wait, terminate immediately.
+        :param mark_failed: If True, mark the pipeline task as failed. (default False)
+        :param mark_aborted: If True, mark the pipeline task as aborted. (default False)
         """
         self._stop_event.set()
 
         self.wait(timeout=timeout)
-        if self._task and self._pipeline_task_status_failed:
+        if not self._task:
+            return
+
+        self._task.close()
+        if mark_failed:
+            self._task.mark_failed(status_reason='Pipeline aborted and failed', force=True)
+        elif mark_aborted:
+            self._task.mark_aborted(status_reason='Pipeline aborted', force=True)
+        elif self._pipeline_task_status_failed:
             print('Setting pipeline controller Task as failed (due to failed steps) !')
-            self._task.close()
             self._task.mark_failed(status_reason='Pipeline step failed', force=True)
 
     def wait(self, timeout=None):
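A sketch of the new stop() arguments (the timeout value is illustrative):

    # wait up to 5 minutes for the controller thread, then mark the
    # pipeline Task as aborted instead of completed/failed
    pipe.stop(timeout=5.0, mark_aborted=True)

    # or force a failed status regardless of step results:
    # pipe.stop(mark_failed=True)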
@@ -1600,6 +1636,8 @@ class PipelineController(object):
                 return "#bdf5bd"  # lightgreen, pending in queue
             elif node.job.is_completed():
                 return "blue"  # completed job
+            elif node.job.is_failed():
+                return "red"  # failed job
             else:
                 return "green"  # running job
         elif node.skip_job:
@@ -1639,15 +1677,20 @@ class PipelineController(object):
             # if no job ended, continue
             completed_jobs = []
             force_execution_plot_update = False
+            nodes_failed_stop_pipeline = []
             for j in self._running_nodes:
                 node = self._nodes[j]
                 if not node.job:
                     continue
                 if node.job.is_stopped():
                     completed_jobs.append(j)
-                    node.executed = node.job.task_id() if not node.job.is_failed() else False
+                    node_failed = node.job.is_failed()
+                    node.executed = node.job.task_id() if not node_failed else False
                     if j in launched_nodes:
                         launched_nodes.remove(j)
+                    # check if we need to stop all running steps
+                    if node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail:
+                        nodes_failed_stop_pipeline.append(node.name)
                 elif node.timeout:
                     started = node.job.task.data.started
                     if (datetime.now().astimezone(started.tzinfo) - started).total_seconds() > node.timeout:
@@ -1686,6 +1729,12 @@ class PipelineController(object):
                     if self._post_step_callbacks.get(job_node.name):
                         self._post_step_callbacks[job_node.name](self, job_node)
 
+            # check if we need to stop the pipeline, and abort all running steps
+            if nodes_failed_stop_pipeline:
+                print('Aborting pipeline and stopping all running steps, node {} failed'.format(
+                    nodes_failed_stop_pipeline))
+                break
+
             # Pull the next jobs in the pipeline, based on the completed list
             next_nodes = []
             for node in list(self._nodes.values()):
@@ -1725,8 +1774,9 @@ class PipelineController(object):
+
         # stop all currently running jobs:
         for node in list(self._nodes.values()):
-            if node.executed is False:
+            if node.executed is False and not node.continue_on_fail:
                 self._pipeline_task_status_failed = True
 
             if node.job and node.executed and not node.job.is_stopped():
                 node.job.abort()
             elif not node.job and not node.executed:
@@ -2117,6 +2167,7 @@ class PipelineDecorator(PipelineController):
             pool_frequency=0.2,  # type: float
             add_pipeline_tags=False,  # type: bool
             target_project=None,  # type: Optional[str]
+            abort_on_failure=False,  # type: bool
     ):
         # type: (...) -> ()
         """
@@ -2130,6 +2181,12 @@ class PipelineDecorator(PipelineController):
         :param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all
             steps (Tasks) created by this pipeline.
         :param str target_project: If provided, all pipeline steps are cloned into the target project
+        :param bool abort_on_failure: If False (default), failed pipeline steps will not cause the pipeline
+            to stop immediately; instead, any step that is not connected (directly or indirectly) to the failed step
+            will still be executed. Nonetheless the pipeline itself will be marked failed, unless the failed step
+            was specifically defined with "continue_on_fail=True".
+            If True, any failed step will cause the pipeline to immediately abort, stop all running steps,
+            and mark the pipeline as failed.
         """
         super(PipelineDecorator, self).__init__(
             name=name,
@@ -2181,6 +2238,7 @@ class PipelineDecorator(PipelineController):
             # check the state of all current jobs
             # if no job ended, continue
             completed_jobs = []
+            nodes_failed_stop_pipeline = []
             force_execution_plot_update = False
             for j in self._running_nodes:
                 node = self._nodes[j]
@@ -2188,9 +2246,13 @@ class PipelineDecorator(PipelineController):
                     continue
                 if node.job.is_stopped():
                     completed_jobs.append(j)
-                    node.executed = node.job.task_id() if not node.job.is_failed() else False
+                    node_failed = node.job.is_failed()
+                    node.executed = node.job.task_id() if not node_failed else False
                     if j in launched_nodes:
                         launched_nodes.remove(j)
+                    # check if we need to stop all running steps
+                    if node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail:
+                        nodes_failed_stop_pipeline.append(node.name)
                 elif node.timeout:
                     started = node.job.task.data.started
                     if (datetime.now().astimezone(started.tzinfo) - started).total_seconds() > node.timeout:
@@ -2229,6 +2291,12 @@ class PipelineDecorator(PipelineController):
                     if self._post_step_callbacks.get(job_node.name):
                         self._post_step_callbacks[job_node.name](self, job_node)
 
+            # check if we need to stop the pipeline, and abort all running steps
+            if nodes_failed_stop_pipeline:
+                print('Aborting pipeline and stopping all running steps, node {} failed'.format(
+                    nodes_failed_stop_pipeline))
+                break
+
             # update current state (in configuration, so that we could later continue an aborted pipeline)
             self._force_task_configuration_update()
 
@@ -2237,8 +2305,9 @@ class PipelineDecorator(PipelineController):
+
         # stop all currently running jobs (protect against changes while iterating):
         for node in list(self._nodes.values()):
-            if node.executed is False:
+            if node.executed is False and not node.continue_on_fail:
                 self._pipeline_task_status_failed = True
 
             if node.job and node.executed and not node.job.is_stopped():
                 node.job.abort()
             elif not node.job and not node.executed:
@@ -2413,6 +2482,7 @@ class PipelineDecorator(PipelineController):
             packages=None,  # type: Optional[Union[str, Sequence[str]]]
             parents=None,  # type: Optional[List[str]]
             execution_queue=None,  # type: Optional[str]
+            continue_on_fail=False,  # type: bool
             docker=None,  # type: Optional[str]
             docker_args=None,  # type: Optional[str]
             docker_bash_setup_script=None,  # type: Optional[str]
@@ -2446,6 +2516,9 @@ class PipelineDecorator(PipelineController):
             have been executed successfully.
         :param execution_queue: Optional, the queue to use for executing this specific step.
             If not provided, the task will be sent to the default execution queue, as defined on the class
+        :param continue_on_fail: (default False). If True, a failed step will not cause the pipeline to stop
+            (or be marked as failed). Notice that steps that are connected (directly or indirectly)
+            to the failed step will be skipped.
         :param docker: Select the docker image to be executed in by the remote session
         :param docker_args: Add docker arguments, pass a single string
         :param docker_bash_setup_script: Add bash script to be executed
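A sketch of the decorator-based step with the new flag (the function name and body are illustrative):

    from clearml.automation.controller import PipelineDecorator

    @PipelineDecorator.component(return_values=['data'], continue_on_fail=True)
    def load_data(url):
        # placeholder step body; runs as its own Task when executed remotely
        return url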
@@ -2509,6 +2582,7 @@ class PipelineDecorator(PipelineController):
             packages=packages,
             parents=parents,
             execution_queue=execution_queue,
+            continue_on_fail=continue_on_fail,
             docker=docker,
             docker_args=docker_args,
             docker_bash_setup_script=docker_bash_setup_script,
@@ -2668,7 +2742,7 @@ class PipelineDecorator(PipelineController):
         def results_reference(return_name):
             # wait until job is completed
             _node.job.wait(pool_period=0.2)
-            if _node.job.is_failed():
+            if _node.job.is_failed() and not _node.continue_on_fail:
                 raise ValueError(
                     'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id()))
 
@@ -2706,7 +2780,8 @@ class PipelineDecorator(PipelineController):
             pool_frequency=0.2,  # type: float
             add_pipeline_tags=False,  # type: bool
             target_project=None,  # type: Optional[str]
-            pipeline_execution_queue='services'  # type: Optional[str]
+            abort_on_failure=False,  # type: bool
+            pipeline_execution_queue='services',  # type: Optional[str]
     ):
         # type: (...) -> Callable
         """
@@ -2721,6 +2796,12 @@ class PipelineDecorator(PipelineController):
         :param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all
             steps (Tasks) created by this pipeline.
         :param str target_project: If provided, all pipeline steps are cloned into the target project
+        :param bool abort_on_failure: If False (default), failed pipeline steps will not cause the pipeline
+            to stop immediately; instead, any step that is not connected (directly or indirectly) to the failed step
+            will still be executed. Nonetheless the pipeline itself will be marked failed, unless the failed step
+            was specifically defined with "continue_on_fail=True".
+            If True, any failed step will cause the pipeline to immediately abort, stop all running steps,
+            and mark the pipeline as failed.
         :param pipeline_execution_queue: remote pipeline execution queue (default 'services' queue).
             If None is passed, execute the pipeline logic locally (pipeline steps are still executed remotely)
         """
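And the matching pipeline decorator sketch (names and version are illustrative):

    @PipelineDecorator.pipeline(
        name='pipeline demo', project='examples', version='0.1',
        abort_on_failure=True,   # fail fast: first failed step aborts the run
    )
    def pipeline_logic(url):
        data = load_data(url)    # the component sketched above
        return data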
@@ -2759,6 +2840,7 @@ class PipelineDecorator(PipelineController):
                 pool_frequency=pool_frequency,
                 add_pipeline_tags=add_pipeline_tags,
                 target_project=target_project,
+                abort_on_failure=abort_on_failure,
             )
 
             if PipelineDecorator._debug_execute_step_process:
@@ -2786,8 +2868,18 @@ class PipelineDecorator(PipelineController):
                 # when we get here it means we are running remotely
 
                 # this time the pipeline is executed only on the remote machine
-                func(**pipeline_kwargs)
-                LazyEvalWrapper.trigger_all_remote_references()
+                try:
+                    func(**pipeline_kwargs)
+                except Exception:
+                    a_pipeline.stop(mark_failed=True)
+                    raise
+
+                triggered_exception = None
+                try:
+                    LazyEvalWrapper.trigger_all_remote_references()
+                except Exception as ex:
+                    triggered_exception = ex
+
                 # make sure we wait for all nodes to finish
                 waited = True
                 while waited:
@@ -2799,6 +2891,9 @@ class PipelineDecorator(PipelineController):
                         waited = True
                 # now we can stop the pipeline
                 a_pipeline.stop()
+                # now we can raise the exception
+                if triggered_exception:
+                    raise triggered_exception
                 return
 
         return internal_decorator
@@ -2816,15 +2911,35 @@ class PipelineDecorator(PipelineController):
         cls._default_execution_queue = str(default_execution_queue) if default_execution_queue else None
 
     @classmethod
-    def debug_pipeline(cls, execute_steps_as_functions=False):
-        # type: (bool) -> ()
+    def run_locally(cls):
+        # type: () -> ()
         """
-        Set debugging mode, run all functions locally as subprocess or serially as functions
+        Set local mode, run all functions locally as subprocess or serially as functions
+
         Run the full pipeline DAG locally, where steps are executed as sub-processes Tasks
         Notice: running the DAG locally assumes the local code execution (i.e. it will not clone & apply git diff)
-
-        :param execute_steps_as_functions: If True, run the pipeline steps locally
-            as a function (no Task will be created). Default False.
         """
         cls._debug_execute_step_process = True
-        cls._debug_execute_step_function = execute_steps_as_functions
+        cls._debug_execute_step_function = False
+
+    @classmethod
+    def debug_pipeline(cls):
+        # type: () -> ()
+        """
+        Set debugging mode, run all functions locally as subprocess or serially as functions
+        Run the full pipeline DAG locally, where steps are executed as sub-processes Tasks
+        Notice:
+            running the DAG locally assumes the local code execution (i.e. it will not clone & apply git diff)
+            Pipeline steps are executed as functions (no Task will be created), for ease of debugging
+        """
+        cls._debug_execute_step_process = True
+        cls._debug_execute_step_function = True
+
+    @classmethod
+    def get_current_pipeline(cls):
+        # type: () -> "PipelineDecorator"
+        """
+        Return the currently running pipeline instance
+        """
+        return cls._singleton
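Finally, a sketch of the two local-execution modes this commit separates (using the illustrative pipeline function above):

    # run the DAG locally, steps as sub-process Tasks (local code, no git clone/diff)
    PipelineDecorator.run_locally()

    # or run steps serially as plain functions, no Tasks created (for debugging)
    # PipelineDecorator.debug_pipeline()

    pipeline_logic(url='http://example.com/data.csv')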