Fix pipeline DAG

allegroai 2022-07-15 16:19:15 +03:00
parent 0c4555fcef
commit 7ec0691910
3 changed files with 48 additions and 30 deletions


@@ -10,7 +10,7 @@ from datetime import datetime
 from logging import getLogger
 from multiprocessing import Process, Queue
 from multiprocessing.pool import ThreadPool
-from threading import Thread, Event, RLock
+from threading import Thread, Event, RLock, current_thread
 from time import time
 from typing import Sequence, Optional, Mapping, Callable, Any, List, Dict, Union, Tuple
@@ -57,6 +57,8 @@ class PipelineController(object):
     _monitor_node_interval = 5.*60
     _report_plot_execution_flow = dict(title='Pipeline', series='Execution Flow')
     _report_plot_execution_details = dict(title='Pipeline Details', series='Execution Details')
+    _evaluated_return_values = {}  # TID: pipeline_name
+    _add_to_evaluated_return_values = {}  # TID: bool
     valid_job_status = ["failed", "cached", "completed", "aborted", "queued", "running", "skipped", "pending"]
@@ -66,11 +68,11 @@ class PipelineController(object):
         base_task_id = attrib(type=str, default=None)  # base Task ID to be cloned and launched
         task_factory_func = attrib(type=Callable, default=None)  # alternative to base_task_id, function creating a Task
         queue = attrib(type=str, default=None)  # execution queue name to use
-        parents = attrib(type=list, default=[])  # list of parent DAG steps
+        parents = attrib(type=list, default=None)  # list of parent DAG steps
         timeout = attrib(type=float, default=None)  # execution timeout limit
-        parameters = attrib(type=dict, default={})  # Task hyper parameters to change
-        configurations = attrib(type=dict, default={})  # Task configuration objects to change
-        task_overrides = attrib(type=dict, default={})  # Task overrides to change
+        parameters = attrib(type=dict, default=None)  # Task hyper parameters to change
+        configurations = attrib(type=dict, default=None)  # Task configuration objects to change
+        task_overrides = attrib(type=dict, default=None)  # Task overrides to change
         executed = attrib(type=str, default=None)  # The actual executed Task ID (None if not executed yet)
         status = attrib(type=str, default="pending")  # The Node Task status (cached, aborted, etc.)
         clone_task = attrib(type=bool, default=True)  # If True, clone the base_task_id, then execute the cloned Task
@@ -82,10 +84,28 @@ class PipelineController(object):
         skip_job = attrib(type=bool, default=False)  # if True, this step should be skipped
         continue_on_fail = attrib(type=bool, default=False)  # if True, the pipeline continues even if the step failed
         cache_executed_step = attrib(type=bool, default=False)  # if True this pipeline step should be cached
-        return_artifacts = attrib(type=list, default=[])  # List of artifact names returned by the step
-        monitor_metrics = attrib(type=list, default=[])  # List of metric title/series to monitor
-        monitor_artifacts = attrib(type=list, default=[])  # List of artifact names to monitor
-        monitor_models = attrib(type=list, default=[])  # List of models to monitor
+        return_artifacts = attrib(type=list, default=None)  # List of artifact names returned by the step
+        monitor_metrics = attrib(type=list, default=None)  # List of metric title/series to monitor
+        monitor_artifacts = attrib(type=list, default=None)  # List of artifact names to monitor
+        monitor_models = attrib(type=list, default=None)  # List of models to monitor
+
+        def __attrs_post_init__(self):
+            if self.parents is None:
+                self.parents = []
+            if self.parameters is None:
+                self.parameters = {}
+            if self.configurations is None:
+                self.configurations = {}
+            if self.task_overrides is None:
+                self.task_overrides = {}
+            if self.return_artifacts is None:
+                self.return_artifacts = []
+            if self.monitor_metrics is None:
+                self.monitor_metrics = []
+            if self.monitor_artifacts is None:
+                self.monitor_artifacts = []
+            if self.monitor_models is None:
+                self.monitor_models = []
 
         def copy(self):
             # type: () -> PipelineController.Node
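
The `default=[]` / `default={}` arguments replaced above are the root cause of the broken DAG: attrs evaluates such defaults once at class-definition time, so every Node instance shares the same list or dict and parents "leak" between steps. Below is a minimal sketch of the before/after behavior, assuming only the attrs package and an illustrative single-field Node, not the full controller class:

```python
# Minimal sketch (assumes the attrs package): default=None plus
# __attrs_post_init__ gives each instance its own fresh container,
# unlike a class-level default=[] which is shared by all instances.
from attr import attrib, attrs


@attrs
class Node(object):
    parents = attrib(type=list, default=None)

    def __attrs_post_init__(self):
        # normalize None to a per-instance list, mirroring the fix above
        if self.parents is None:
            self.parents = []


a, b = Node(), Node()
a.parents.append("step_one")
print(b.parents)  # [] -- no longer shared between Node instances
```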
@@ -3000,6 +3020,11 @@ class PipelineDecorator(PipelineController):
                for i, v in enumerate(args):
                    kwargs[inspect_func.args[i]] = v

+                # We need to remember when a pipeline step's return value is evaluated by the pipeline
+                # controller, but not when it's done here (as we would remember the step every time).
+                # _add_to_evaluated_return_values protects that
+                tid = current_thread().ident
+                cls._add_to_evaluated_return_values[tid] = False
                kwargs_artifacts.update(
                    {
                        k: walk_nested_dict_tuple_list(
@@ -3011,6 +3036,7 @@ class PipelineDecorator(PipelineController):
                        if isinstance(v, LazyEvalWrapper)
                    }
                )
+                cls._add_to_evaluated_return_values[tid] = True
                kwargs = {k: deepcopy(v) for k, v in kwargs.items() if not isinstance(v, LazyEvalWrapper)}

                # check if we have the singleton
@@ -3050,6 +3076,7 @@ class PipelineDecorator(PipelineController):
                    # if we already launched a JOB on the node, this means we are calling the same function/task
                    # twice inside the pipeline, this means we need to replicate the node.
                    _node = cls._singleton._nodes[_node_name].copy()
+                    _node.parents = []
                    # find a new name
                    counter = 1
                    while _node.name in cls._singleton._nodes:
@@ -3065,7 +3092,7 @@ class PipelineDecorator(PipelineController):
                # The actual launch is a bit slow, we run it in the background
                launch_thread = Thread(
                    target=cls._component_launch,
-                    args=(_node_name, _node, kwargs_artifacts, kwargs))
+                    args=(_node_name, _node, kwargs_artifacts, kwargs, current_thread().ident))

                def results_reference(return_name):
                    # wait until launch is completed
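
Passing `current_thread().ident` into the launch arguments matters because the launch itself runs on a background Thread: calling `current_thread()` inside `_component_launch` would return the worker thread's ident, not the pipeline-logic thread that owns the bookkeeping above. A small standalone illustration (names made up, not part of the commit):

```python
# Demonstrates why the caller's thread ident must be captured before spawning
# the worker: each thread has its own ident while both are alive.
from threading import Thread, current_thread

caller_tid = current_thread().ident


def worker(tid_from_caller):
    # the worker sees a different ident than the caller that spawned it
    assert current_thread().ident != tid_from_caller
    print("caller ident:", tid_from_caller, "worker ident:", current_thread().ident)


t = Thread(target=worker, args=(caller_tid,))
t.start()
t.join()
```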
@@ -3102,6 +3129,11 @@ class PipelineDecorator(PipelineController):
                            'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id()))

                    _node.executed = _node.job.task_id()
+                    tid = current_thread().ident
+                    if cls._add_to_evaluated_return_values.get(tid, True):
+                        if tid not in cls._evaluated_return_values:
+                            cls._evaluated_return_values[tid] = []
+                        cls._evaluated_return_values[tid].append(_node.name)
                    return Task.get_task(_node.job.task_id()).artifacts[return_name].get()

                return_w = [LazyEvalWrapper(
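
The two new class dictionaries act as per-thread bookkeeping: evaluating a step's lazily wrapped return value records that step for the current thread, unless the flag was temporarily switched off (as done while unwrapping `kwargs_artifacts` earlier in this change). A simplified, self-contained sketch of the pattern, with illustrative names rather than the real controller internals:

```python
# Sketch of the per-thread "who evaluated what" bookkeeping (simplified names).
from threading import current_thread

_evaluated_return_values = {}         # thread ident -> list of step names
_add_to_evaluated_return_values = {}  # thread ident -> bool


def record_evaluation(step_name):
    # record the step only when evaluation happens in pipeline-logic code
    tid = current_thread().ident
    if _add_to_evaluated_return_values.get(tid, True):
        _evaluated_return_values.setdefault(tid, []).append(step_name)


tid = current_thread().ident
_add_to_evaluated_return_values[tid] = False
record_evaluation("step_one")   # suppressed: internal unwrapping, not a dependency
_add_to_evaluated_return_values[tid] = True
record_evaluation("step_two")   # recorded: the pipeline logic used this result
print(_evaluated_return_values[tid])  # ['step_two']
```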
@@ -3402,7 +3434,7 @@ class PipelineDecorator(PipelineController):
            return cls._wait_for_multi_pipelines()

    @classmethod
-    def _component_launch(cls, node_name, node, kwargs_artifacts, kwargs):
+    def _component_launch(cls, node_name, node, kwargs_artifacts, kwargs, tid):
        _node_name = node_name
        _node = node
        # update artifacts kwargs
@@ -3414,37 +3446,26 @@
            if v and '.' in str(v):
                parent_id, _ = str(v).split('.', 1)
                # find parent and push it into the _node.parents
-                for n, node in list(cls._singleton._nodes.items()):
+                for n, node in sorted(list(cls._singleton._nodes.items()), reverse=True):
                    if n != _node.name and node.executed and node.executed == parent_id:
                        if n not in _node.parents:
                            _node.parents.append(n)
                        break
+        if kwargs:
+            leaves = cls._singleton._find_executed_node_leaves()
+            _node.parents = (_node.parents or []) + [x for x in cls._evaluated_return_values.get(tid, []) if x in leaves]
        for k, v in kwargs.items():
            if v is None or isinstance(v, (bool, int, float, str)):
                _node.parameters["{}/{}".format(CreateFromFunction.kwargs_section, k)] = v
            elif isinstance(v, (list, tuple)) and all(isinstance(i, (bool, int, float, str)) for i in v):
                _node.parameters["{}/{}".format(CreateFromFunction.kwargs_section, k)] = v
            else:
-                # find parents if we have any
-                arg_parents = []
-                if isinstance(v, (list, tuple, dict)):
-                    walk_nested_dict_tuple_list(
-                        v,
-                        callback=lambda x:
-                        not cls._ref_lazy_loader_id_to_node_name.get(id(x)) or
-                        arg_parents.append(cls._ref_lazy_loader_id_to_node_name[id(x)])
-                    )
                # we need to create an artifact
                artifact_name = 'result_{}_{}'.format(re.sub(r'\W+', '', _node.name), k)
                cls._singleton._task.upload_artifact(
                    name=artifact_name, artifact_object=v, wait_on_upload=True)
                _node.parameters["{}/{}".format(CreateFromFunction.input_artifact_section, k)] = \
                    "{}.{}".format(cls._singleton._task.id, artifact_name)
-        # now add all the executed nodes as parents (only the leaves of the DAG, no need for parents)
-        _node.parents = list(
-            set((_node.parents or []) + cls._singleton._find_executed_node_leaves() + arg_parents)
-            - set(list(_node.name)))

        # verify the new step
        cls._singleton._verify_node(_node)
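
The net effect of the `tid` argument and the new `if kwargs:` branch is that a step whose inputs are plain constants still gets wired after the executed DAG leaves whose return values the calling thread actually evaluated, instead of being blindly attached to every executed node. A hedged sketch of that wiring decision (helper and argument names are illustrative, not the library API):

```python
# Sketch of the parent-inference rule: keep explicit parents, then add only
# those previously evaluated steps that are still leaves of the executed DAG.
def infer_parents(explicit_parents, evaluated_on_thread, executed_leaves):
    parents = list(explicit_parents or [])
    parents += [name for name in evaluated_on_thread if name in executed_leaves]
    return parents


# Example: the new step received only constant kwargs, but the pipeline logic
# evaluated step_two's output and step_two is an executed leaf -> it becomes
# the parent, preserving execution order in the reported DAG.
print(infer_parents([], ["step_two"], ["step_two"]))  # ['step_two']
```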


@@ -708,7 +708,7 @@ class ScriptInfo(object):
        try:
            # Use os.path.relpath as it calculates up dir movements (../)
            entry_point = os.path.relpath(
-                str(script_path), str(cls._get_working_dir(repo_root, return_abs=True)))
+                str(os.path.realpath(script_path)), str(cls._get_working_dir(repo_root, return_abs=True)))
        except ValueError:
            # Working directory not under repository root
            entry_point = script_path.relative_to(repo_root)
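
Resolving the script path with `os.path.realpath` before computing the relative entry point avoids spurious `../` segments when the script is reached through a symlinked directory. An illustrative snippet with made-up paths (not the actual ScriptInfo call chain):

```python
# realpath normalizes and resolves symlinks, so relpath stays inside the repo.
import os

repo_root = os.path.realpath("/home/user/project")
script_path = "/home/user/project/src/train.py"  # may itself be a symlink
entry_point = os.path.relpath(os.path.realpath(script_path), repo_root)
print(entry_point)  # e.g. "src/train.py"
```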


@@ -77,9 +77,6 @@ def executing_pipeline(pickle_url, mock_parameter='mock'):
    print('launch step two')
    processed_data = step_two(data_frame)

-    # Notice we can actually process/modify the returned values inside the pipeline logic context.
-    # This means the modified object will be stored on the pipeline Task.
-    processed_data = [processed_data[0], processed_data[1]*2, processed_data[2], processed_data[3]]

    print('launch step three')
    model = step_three(processed_data)