import atexit import functools import inspect import json import os import re import six import warnings from copy import copy, deepcopy from datetime import datetime from logging import getLogger from multiprocessing import Process, Queue from multiprocessing.pool import ThreadPool from threading import Thread, Event, RLock, current_thread from time import time, sleep from typing import Sequence, Optional, Mapping, Callable, Any, List, Dict, Union, Tuple from attr import attrib, attrs from pathlib2 import Path from .job import LocalClearmlJob, RunningJob, BaseJob from .. import Logger from ..automation import ClearmlJob from ..backend_api import Session from ..backend_interface.task.populate import CreateFromFunction from ..backend_interface.util import get_or_create_project, mutually_exclusive from ..config import get_remote_task_id from ..debugging.log import LoggerRoot from ..errors import UsageError from ..model import BaseModel, OutputModel from ..storage.util import hash_dict from ..task import Task from ..utilities.process.mp import leave_process from ..utilities.proxy_object import LazyEvalWrapper, flatten_dictionary, walk_nested_dict_tuple_list from ..utilities.version import Version class PipelineController(object): """ Pipeline controller. Pipeline is a DAG of base tasks, each task will be cloned (arguments changed as required), executed, and monitored. The pipeline process (task) itself can be executed manually or by the clearml-agent services queue. Notice: The pipeline controller lives as long as the pipeline itself is being executed. """ _tag = 'pipeline' _project_system_tags = ['pipeline', 'hidden'] _node_tag_prefix = 'pipe:' _step_pattern = r"\${[^}]*}" _config_section = 'Pipeline' _state_artifact_name = 'pipeline_state' _args_section = 'Args' _pipeline_section = 'pipeline' _pipeline_step_ref = 'pipeline' _runtime_property_hash = '_pipeline_hash' _relaunch_status_message = "Relaunching pipeline step..." _reserved_pipeline_names = (_pipeline_step_ref, ) _task_project_lookup = {} _clearml_job_class = ClearmlJob _update_execution_plot_interval = 5.*60 _update_progress_interval = 10. 
    _monitor_node_interval = 5.*60
    _report_plot_execution_flow = dict(title='Pipeline', series='Execution Flow')
    _report_plot_execution_details = dict(title='Pipeline Details', series='Execution Details')
    _evaluated_return_values = {}  # TID: pipeline_name
    _add_to_evaluated_return_values = {}  # TID: bool
    _retries = {}  # Node.name: int
    _retries_callbacks = {}  # Node.name: Callable[[PipelineController, PipelineController.Node, int], bool]  # noqa
    _status_change_callbacks = {}  # Node.name: Callable[PipelineController, PipelineController.Node, str]
    _final_failure = {}  # Node.name: bool
    _task_template_header = CreateFromFunction.default_task_template_header
    _default_pipeline_version = "1.0.0"
    _project_section = ".pipelines"
    valid_job_status = ["failed", "cached", "completed", "aborted", "queued", "running", "skipped", "pending"]

    @attrs
    class Node(object):
        name = attrib(type=str)  # pipeline step name
        base_task_id = attrib(type=str, default=None)  # base Task ID to be cloned and launched
        task_factory_func = attrib(type=Callable, default=None)  # alternative to base_task_id, function creating a Task
        queue = attrib(type=str, default=None)  # execution queue name to use
        parents = attrib(type=list, default=None)  # list of parent DAG steps
        timeout = attrib(type=float, default=None)  # execution timeout limit
        parameters = attrib(type=dict, default=None)  # Task hyper-parameters to change
        configurations = attrib(type=dict, default=None)  # Task configuration objects to change
        task_overrides = attrib(type=dict, default=None)  # Task overrides to change
        executed = attrib(type=str, default=None)  # The actual executed Task ID (None if not executed yet)
        status = attrib(type=str, default="pending")  # The Node Task status (cached, aborted, etc.)
        clone_task = attrib(type=bool, default=True)  # If True, clone the base_task_id, then execute the cloned Task
        job = attrib(type=ClearmlJob, default=None)  # ClearmlJob object
        job_type = attrib(type=str, default=None)  # task type (string)
        job_started = attrib(type=float, default=None)  # job start timestamp (epoch ts in seconds)
        job_ended = attrib(type=float, default=None)  # job completion timestamp (epoch ts in seconds)
        job_code_section = attrib(type=str, default=None)  # pipeline code configuration section name
        skip_job = attrib(type=bool, default=False)  # if True, this step should be skipped
        continue_on_fail = attrib(type=bool, default=False)  # if True, the pipeline continues even if the step failed
        cache_executed_step = attrib(type=bool, default=False)  # if True, this pipeline step should be cached
        return_artifacts = attrib(type=list, default=None)  # List of artifact names returned by the step
        monitor_metrics = attrib(type=list, default=None)  # List of metric title/series to monitor
        monitor_artifacts = attrib(type=list, default=None)  # List of artifact names to monitor
        monitor_models = attrib(type=list, default=None)  # List of models to monitor
        explicit_docker_image = attrib(type=str, default=None)  # The Docker image the node uses, specified at creation
        recursively_parse_parameters = attrib(type=bool, default=False)  # if True, recursively parse parameters in
        # lists, dicts, or tuples
        output_uri = attrib(type=Union[bool, str], default=None)  # The default location for output models and other artifacts

        def __attrs_post_init__(self):
            if self.parents is None:
                self.parents = []
            if self.parameters is None:
                self.parameters = {}
            if self.configurations is None:
                self.configurations = {}
            if self.task_overrides is None:
                self.task_overrides = {}
            if self.return_artifacts is None:
self.return_artifacts = [] if self.monitor_metrics is None: self.monitor_metrics = [] if self.monitor_artifacts is None: self.monitor_artifacts = [] if self.monitor_models is None: self.monitor_models = [] def copy(self): # type: () -> PipelineController.Node """ return a copy of the current Node, excluding the `job`, `executed`, fields :return: new Node copy """ new_copy = PipelineController.Node( name=self.name, **dict((k, deepcopy(v)) for k, v in self.__dict__.items() if k not in ('name', 'job', 'executed', 'task_factory_func')) ) new_copy.task_factory_func = self.task_factory_func return new_copy def set_job_ended(self): if self.job_ended: return # noinspection PyBroadException try: self.job.task.reload() self.job_ended = self.job_started + self.job.task.data.active_duration except Exception: pass def set_job_started(self): if self.job_started: return # noinspection PyBroadException try: self.job_started = self.job.task.data.started.timestamp() except Exception: pass def __init__( self, name, # type: str project, # type: str version=None, # type: Optional[str] pool_frequency=0.2, # type: float add_pipeline_tags=False, # type: bool target_project=True, # type: Optional[Union[str, bool]] auto_version_bump=None, # type: Optional[bool] abort_on_failure=False, # type: bool add_run_number=True, # type: bool retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa docker=None, # type: Optional[str] docker_args=None, # type: Optional[str] docker_bash_setup_script=None, # type: Optional[str] packages=None, # type: Optional[Union[str, Sequence[str]]] repo=None, # type: Optional[str] repo_branch=None, # type: Optional[str] repo_commit=None, # type: Optional[str] always_create_from_code=True, # type: bool artifact_serialization_function=None, # type: Optional[Callable[[Any], Union[bytes, bytearray]]] artifact_deserialization_function=None, # type: Optional[Callable[[bytes], Any]] output_uri=None # type: Optional[Union[str, bool]] ): # type: (...) -> None """ Create a new pipeline controller. The newly created object will launch and monitor the new experiments. :param name: Provide pipeline name (if main Task exists it overrides its name) :param project: Provide project storing the pipeline (if main Task exists it overrides its project) :param version: Pipeline version. This version allows to uniquely identify the pipeline template execution. Examples for semantic versions: version='1.0.1' , version='23', version='1.2'. If not set, find the latest version of the pipeline and increment it. If no such version is found, default to '1.0.0' :param float pool_frequency: The pooling frequency (in minutes) for monitoring experiments / states. :param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all steps (Tasks) created by this pipeline. :param str target_project: If provided, all pipeline steps are cloned into the target project. If True, pipeline steps are stored into the pipeline project :param bool auto_version_bump: (Deprecated) If True, if the same pipeline version already exists (with any difference from the current one), the current pipeline version will be bumped to a new version version bump examples: 1.0.0 -> 1.0.1 , 1.2 -> 1.3, 10 -> 11 etc. :param bool abort_on_failure: If False (default), failed pipeline steps will not cause the pipeline to stop immediately, instead any step that is not connected (or indirectly connected) to the failed step, will still be executed. 
            Nonetheless the pipeline itself will be marked failed, unless the failed step
            was specifically defined with "continue_on_fail=True".
            If True, any failed step will cause the pipeline to immediately abort,
            stop all running steps, and mark the pipeline as failed.
        :param add_run_number: If True (default), add the run number of the pipeline to the pipeline name.
            For example, the second time we launch the pipeline "best pipeline", we rename it to "best pipeline #2"
        :param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry
            - Integer: In case of node failure, retry the node the number of times indicated by this parameter.
            - Callable: A function called on node failure. Takes as parameters:
              the PipelineController instance, the PipelineController.Node that failed and an int
              representing the number of previous retries for the node that failed.
              The function must return ``True`` if the node should be retried and ``False`` otherwise.
              If True, the node will be re-queued and the number of retries left will be decremented by 1.
              By default, if this callback is not specified, the node will be retried the number of times
              indicated by `retry_on_failure`.

              .. code-block:: py

                  def example_retry_on_failure_callback(pipeline, node, retries):
                      print(node.name, ' failed')
                      # allow up to 5 retries (total of 6 runs)
                      return retries < 5
        :param docker: Select the docker image to be executed in by the remote session
        :param docker_args: Add docker arguments, pass a single string
        :param docker_bash_setup_script: Add bash script to be executed inside the docker before
            setting up the Task's environment
        :param packages: Manually specify a list of required packages or a local requirements.txt file.
            Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt"
            If not provided, packages are automatically added.
        :param repo: Optional, specify a repository to attach to the pipeline controller, when remotely executing.
            Allows users to execute the controller inside the specified repository, enabling them to load
            modules/script from the repository. Notice the execution work directory will be the repository root folder.
            Supports both git repo url link, and local repository path
            (automatically converted into the remote git/commit as currently checked out).
            Example remote url: 'https://github.com/user/repo.git'
            Example local repo copy: './repo' -> will automatically store the remote
            repo url and commit ID based on the locally cloned copy
            Use empty string ("") to disable any repository auto-detection
        :param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used)
        :param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used)
        :param always_create_from_code: If True (default), the pipeline is always constructed from code.
            If False, the pipeline is generated from the pipeline configuration section on the pipeline Task itself.
            This allows editing (also adding/removing) pipeline steps without changing the original codebase
        :param artifact_serialization_function: A serialization function that takes one
            parameter of any type which is the object to be serialized. The function should return
            a `bytes` or `bytearray` object, which represents the serialized object. All parameter/return
            artifacts uploaded by the pipeline will be serialized using this function.
            All relevant imports must be done in this function. For example: ..
code-block:: py def serialize(obj): import dill return dill.dumps(obj) :param artifact_deserialization_function: A deserialization function that takes one parameter of type `bytes`, which represents the serialized object. This function should return the deserialized object. All parameter/return artifacts fetched by the pipeline will be deserialized using this function. All relevant imports must be done in this function. For example: .. code-block:: py def deserialize(bytes_): import dill return dill.loads(bytes_) :param output_uri: The storage / output url for this pipeline. This is the default location for output models and other artifacts. Check Task.init reference docs for more info (output_uri is a parameter). The `output_uri` of this pipeline's steps will default to this value. """ if auto_version_bump is not None: warnings.warn("PipelineController.auto_version_bump is deprecated. It will be ignored", DeprecationWarning) self._nodes = {} self._running_nodes = [] self._start_time = None self._pipeline_time_limit = None self._default_execution_queue = None self._always_create_from_code = bool(always_create_from_code) self._version = str(version).strip() if version else None if self._version and not Version.is_valid_version_string(self._version): raise ValueError( "Setting non-semantic dataset version '{}'".format(self._version) ) self._pool_frequency = pool_frequency * 60. self._thread = None self._pipeline_args = dict() self._pipeline_args_desc = dict() self._pipeline_args_type = dict() self._args_map = dict() self._stop_event = None self._experiment_created_cb = None self._experiment_completed_cb = None self._pre_step_callbacks = {} self._post_step_callbacks = {} self._target_project = target_project self._add_pipeline_tags = add_pipeline_tags self._task = Task.current_task() self._step_ref_pattern = re.compile(self._step_pattern) self._reporting_lock = RLock() self._pipeline_task_status_failed = None self._mock_execution = False # used for nested pipelines (eager execution) self._pipeline_as_sub_project = bool(Session.check_min_api_server_version("2.17")) self._last_progress_update_time = 0 self._artifact_serialization_function = artifact_serialization_function self._artifact_deserialization_function = artifact_deserialization_function if not self._task: task_name = name or project or '{}'.format(datetime.now()) if self._pipeline_as_sub_project: parent_project = (project + "/" if project else "") + self._pipeline_section project_name = "{}/{}".format(parent_project, task_name) else: parent_project = None project_name = project or 'Pipelines' # if user disabled the auto-repo, we force local script storage (repo="" or repo=False) set_force_local_repo = False if Task.running_locally() and repo is not None and not repo: Task.force_store_standalone_script(force=True) set_force_local_repo = True self._task = Task.init( project_name=project_name, task_name=task_name, task_type=Task.TaskTypes.controller, auto_resource_monitoring=False, reuse_last_task_id=False ) # if user disabled the auto-repo, set it back to False (just in case) if set_force_local_repo: # noinspection PyProtectedMember self._task._wait_for_repo_detection(timeout=300.) 
Task.force_store_standalone_script(force=False) # make sure project is hidden if self._pipeline_as_sub_project: get_or_create_project( self._task.session, project_name=parent_project, system_tags=["hidden"]) get_or_create_project( self._task.session, project_name=project_name, project_id=self._task.project, system_tags=self._project_system_tags) self._task.set_system_tags((self._task.get_system_tags() or []) + [self._tag]) if output_uri is not None: self._task.output_uri = output_uri self._output_uri = output_uri self._task.set_base_docker( docker_image=docker, docker_arguments=docker_args, docker_setup_bash_script=docker_bash_setup_script ) self._task.set_packages(packages) self._task.set_repo(repo, branch=repo_branch, commit=repo_commit) self._auto_connect_task = bool(self._task) # make sure we add to the main Task the pipeline tag if self._task and not self._pipeline_as_sub_project: self._task.add_tags([self._tag]) self._monitored_nodes = {} # type: Dict[str, dict] self._abort_running_steps_on_failure = abort_on_failure self._def_max_retry_on_failure = retry_on_failure if isinstance(retry_on_failure, int) else 0 self._retry_on_failure_callback = retry_on_failure if callable(retry_on_failure) \ else self._default_retry_on_failure_callback # add direct link to the pipeline page if self._pipeline_as_sub_project and self._task: if add_run_number and self._task.running_locally(): self._add_pipeline_name_run_number() # noinspection PyProtectedMember self._task.get_logger().report_text('ClearML pipeline page: {}'.format( '{}/pipelines/{}/experiments/{}'.format( self._task._get_app_server(), self._task.project if self._task.project is not None else '*', self._task.id, )) ) def set_default_execution_queue(self, default_execution_queue): # type: (Optional[str]) -> None """ Set the default execution queue if pipeline step does not specify an execution queue :param default_execution_queue: The execution queue to use if no execution queue is provided """ self._default_execution_queue = str(default_execution_queue) if default_execution_queue else None def set_pipeline_execution_time_limit(self, max_execution_minutes): # type: (Optional[float]) -> None """ Set maximum execution time (minutes) for the entire pipeline. Pass None or 0 to disable execution time limit. :param float max_execution_minutes: The maximum time (minutes) for the entire pipeline process. The default is ``None``, indicating no time limit. """ self._pipeline_time_limit = max_execution_minutes * 60. 
if max_execution_minutes else None def add_step( self, name, # type: str base_task_id=None, # type: Optional[str] parents=None, # type: Optional[Sequence[str]] parameter_override=None, # type: Optional[Mapping[str, Any]] configuration_overrides=None, # type: Optional[Mapping[str, Union[str, Mapping]]] task_overrides=None, # type: Optional[Mapping[str, Any]] execution_queue=None, # type: Optional[str] monitor_metrics=None, # type: Optional[List[Union[Tuple[str, str], Tuple[(str, str), (str, str)]]]] monitor_artifacts=None, # type: Optional[List[Union[str, Tuple[str, str]]]] monitor_models=None, # type: Optional[List[Union[str, Tuple[str, str]]]] time_limit=None, # type: Optional[float] base_task_project=None, # type: Optional[str] base_task_name=None, # type: Optional[str] clone_base_task=True, # type: bool continue_on_fail=False, # type: bool pre_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa post_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa cache_executed_step=False, # type: bool base_task_factory=None, # type: Optional[Callable[[PipelineController.Node], Task]] retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa status_change_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, str], None]] # noqa recursively_parse_parameters=False, # type: bool output_uri=None # type: Optional[Union[str, bool]] ): # type: (...) -> bool """ Add a step to the pipeline execution DAG. Each step must have a unique name (this name will later be used to address the step) :param name: Unique of the step. For example `stage1` :param base_task_id: The Task ID to use for the step. Each time the step is executed, the base Task is cloned, then the cloned task will be sent for execution. :param parents: Optional list of parent nodes in the DAG. The current step in the pipeline will be sent for execution only after all the parent nodes have been executed successfully. :param parameter_override: Optional parameter overriding dictionary. The dict values can reference a previously executed step using the following form '${step_name}'. Examples: - Artifact access ``parameter_override={'Args/input_file': '${<step_name>.artifacts.<artifact_name>.url}' }`` - Model access (last model used) ``parameter_override={'Args/input_file': '${<step_name>.models.output.-1.url}' }`` - Parameter access ``parameter_override={'Args/input_file': '${<step_name>.parameters.Args/input_file}' }`` - Pipeline Task argument (see `Pipeline.add_parameter`) ``parameter_override={'Args/input_file': '${pipeline.<pipeline_parameter>}' }`` - Task ID ``parameter_override={'Args/input_file': '${stage3.id}' }`` :param recursively_parse_parameters: If True, recursively parse parameters from parameter_override in lists, dicts, or tuples. Example: - ``parameter_override={'Args/input_file': ['${<step_name>.artifacts.<artifact_name>.url}', 'file2.txt']}`` will be correctly parsed. - ``parameter_override={'Args/input_file': ('${<step_name_1>.parameters.Args/input_file}', '${<step_name_2>.parameters.Args/input_file}')}`` will be correctly parsed. :param configuration_overrides: Optional, override Task configuration objects. Expected dictionary of configuration object name and configuration object content. 
Examples: {'General': dict(key='value')} {'General': 'configuration file content'} {'OmegaConf': YAML.dumps(full_hydra_dict)} :param task_overrides: Optional task section overriding dictionary. The dict values can reference a previously executed step using the following form '${step_name}'. Examples: - get the latest commit from a specific branch ``task_overrides={'script.version_num': '', 'script.branch': 'main'}`` - match git repository branch to a previous step ``task_overrides={'script.branch': '${stage1.script.branch}', 'script.version_num': ''}`` - change container image ``task_overrides={'container.image': 'nvidia/cuda:11.6.0-devel-ubuntu20.04', 'container.arguments': '--ipc=host'}`` - match container image to a previous step ``task_overrides={'container.image': '${stage1.container.image}'}`` - reset requirements (the agent will use the "requirements.txt" inside the repo) ``task_overrides={'script.requirements.pip': ""}`` :param execution_queue: Optional, the queue to use for executing this specific step. If not provided, the task will be sent to the default execution queue, as defined on the class :param monitor_metrics: Optional, log the step's metrics on the pipeline Task. Format is a list of pairs metric (title, series) to log: [(step_metric_title, step_metric_series), ] Example: [('test', 'accuracy'), ] Or a list of tuple pairs, to specify a different target metric for to use on the pipeline Task: [((step_metric_title, step_metric_series), (target_metric_title, target_metric_series)), ] Example: [[('test', 'accuracy'), ('model', 'accuracy')], ] :param monitor_artifacts: Optional, log the step's artifacts on the pipeline Task. Provided a list of artifact names existing on the step's Task, they will also appear on the Pipeline itself. Example: [('processed_data', 'final_processed_data'), ] Alternatively user can also provide a list of artifacts to monitor (target artifact name will be the same as original artifact name) Example: ['processed_data', ] :param monitor_models: Optional, log the step's output models on the pipeline Task. Provided a list of model names existing on the step's Task, they will also appear on the Pipeline itself. Example: [('model_weights', 'final_model_weights'), ] Alternatively user can also provide a list of models to monitor (target models name will be the same as original model) Example: ['model_weights', ] To select the latest (lexicographic) model use "model_*", or the last created model with just "*" Example: ['model_weights_*', ] :param time_limit: Default None, no time limit. Step execution time limit, if exceeded the Task is aborted and the pipeline is stopped and marked failed. :param base_task_project: If base_task_id is not given, use the base_task_project and base_task_name combination to retrieve the base_task_id to use for the step. :param base_task_name: If base_task_id is not given, use the base_task_project and base_task_name combination to retrieve the base_task_id to use for the step. :param clone_base_task: If True (default), the pipeline will clone the base task, and modify/enqueue the cloned Task. If False, the base-task is used directly, notice it has to be in draft-mode (created). :param continue_on_fail: (default False). If True, failed step will not cause the pipeline to stop (or marked as failed). Notice, that steps that are connected (or indirectly connected) to the failed step will be skipped. :param pre_execute_callback: Callback function, called when the step (Task) is created and before it is sent for execution. 
Allows a user to modify the Task before launch. Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object. `parameters` are the configuration arguments passed to the ClearmlJob. If the callback returned value is `False`, the Node is skipped and so is any node in the DAG that relies on this node. Notice the `parameters` are already parsed, e.g. `${step1.parameters.Args/param}` is replaced with relevant value. .. code-block:: py def step_created_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, parameters, # type: dict ): pass :param post_execute_callback: Callback function, called when a step (Task) is completed and other jobs are executed. Allows a user to modify the Task status after completion. .. code-block:: py def step_completed_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, ): pass :param cache_executed_step: If True, before launching the new step, after updating with the latest configuration, check if an exact Task with the same parameter/code was already executed. If it was found, use it instead of launching a new Task. Default: False, a new cloned copy of base_task is always used. Notice: If the git repo reference does not have a specific commit ID, the Task will never be used. If `clone_base_task` is False there is no cloning, hence the base_task is used. :param base_task_factory: Optional, instead of providing a pre-existing Task, provide a Callable function to create the Task (returns Task object) :param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry - Integer: In case of node failure, retry the node the number of times indicated by this parameter. - Callable: A function called on node failure. Takes as parameters: the PipelineController instance, the PipelineController.Node that failed and an int representing the number of previous retries for the node that failed. The function must return ``True`` if the node should be retried and ``False`` otherwise. If True, the node will be re-queued and the number of retries left will be decremented by 1. By default, if this callback is not specified, the function will be retried the number of times indicated by `retry_on_failure`. .. code-block:: py def example_retry_on_failure_callback(pipeline, node, retries): print(node.name, ' failed') # allow up to 5 retries (total of 6 runs) return retries < 5 :param status_change_callback: Callback function, called when the status of a step (Task) changes. Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object. The signature of the function must look the following way: .. code-block:: py def status_change_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, previous_status # type: str ): pass :param output_uri: The storage / output url for this step. This is the default location for output models and other artifacts. Check Task.init reference docs for more info (output_uri is a parameter). 
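
        A minimal usage sketch (the base Task IDs, queue name, and artifact name below are placeholders,
        not values taken from a real system):

        .. code-block:: py

            pipe = PipelineController(name='pipeline demo', project='examples', version='1.0.0')
            pipe.add_step(
                name='stage_data',
                base_task_id='<data_task_id>',         # placeholder Task ID to clone
                execution_queue='default',             # placeholder queue name
            )
            pipe.add_step(
                name='stage_process',
                parents=['stage_data'],
                base_task_id='<process_task_id>',      # placeholder Task ID to clone
                parameter_override={
                    # reference an artifact produced by the previous step
                    'Args/dataset_url': '${stage_data.artifacts.dataset.url}',
                },
            )
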
:return: True if successful """ # always store callback functions (even when running remotely) if pre_execute_callback: self._pre_step_callbacks[name] = pre_execute_callback if post_execute_callback: self._post_step_callbacks[name] = post_execute_callback self._verify_node_name(name) if not base_task_factory and not base_task_id: if not base_task_project or not base_task_name: raise ValueError('Either base_task_id or base_task_project/base_task_name must be provided') base_task = Task.get_task( project_name=base_task_project, task_name=base_task_name, allow_archived=True, task_filter=dict( status=[str(Task.TaskStatusEnum.created), str(Task.TaskStatusEnum.queued), str(Task.TaskStatusEnum.in_progress), str(Task.TaskStatusEnum.published), str(Task.TaskStatusEnum.stopped), str(Task.TaskStatusEnum.completed), str(Task.TaskStatusEnum.closed)], ) ) if not base_task: raise ValueError('Could not find base_task_project={} base_task_name={}'.format( base_task_project, base_task_name)) if Task.archived_tag in base_task.get_system_tags(): LoggerRoot.get_base_logger().warning( 'Found base_task_project={} base_task_name={} but it is archived'.format( base_task_project, base_task_name)) base_task_id = base_task.id if configuration_overrides is not None: # verify we have a dict or a string on all values if not isinstance(configuration_overrides, dict) or \ not all(isinstance(v, (str, dict)) for v in configuration_overrides.values()): raise ValueError("configuration_overrides must be a dictionary, with all values " "either dicts or strings, got \'{}\' instead".format(configuration_overrides)) if task_overrides: task_overrides = flatten_dictionary(task_overrides, sep='.') self._nodes[name] = self.Node( name=name, base_task_id=base_task_id, parents=parents or [], queue=execution_queue, timeout=time_limit, parameters=parameter_override or {}, recursively_parse_parameters=recursively_parse_parameters, configurations=configuration_overrides, clone_task=clone_base_task, task_overrides=task_overrides, cache_executed_step=cache_executed_step, continue_on_fail=continue_on_fail, task_factory_func=base_task_factory, monitor_metrics=monitor_metrics or [], monitor_artifacts=monitor_artifacts or [], monitor_models=monitor_models or [], output_uri=self._output_uri if output_uri is None else output_uri ) self._retries[name] = 0 self._retries_callbacks[name] = retry_on_failure if callable(retry_on_failure) else \ (functools.partial(self._default_retry_on_failure_callback, max_retries=retry_on_failure) if isinstance(retry_on_failure, int) else self._retry_on_failure_callback) if status_change_callback: self._status_change_callbacks[name] = status_change_callback if self._task and not self._task.running_locally(): self.update_execution_plot() return True def add_function_step( self, name, # type: str function, # type: Callable function_kwargs=None, # type: Optional[Dict[str, Any]] function_return=None, # type: Optional[List[str]] project_name=None, # type: Optional[str] task_name=None, # type: Optional[str] task_type=None, # type: Optional[str] auto_connect_frameworks=None, # type: Optional[dict] auto_connect_arg_parser=None, # type: Optional[dict] packages=None, # type: Optional[Union[str, Sequence[str]]] repo=None, # type: Optional[str] repo_branch=None, # type: Optional[str] repo_commit=None, # type: Optional[str] helper_functions=None, # type: Optional[Sequence[Callable]] docker=None, # type: Optional[str] docker_args=None, # type: Optional[str] docker_bash_setup_script=None, # type: Optional[str] parents=None, # type: 
Optional[Sequence[str]] execution_queue=None, # type: Optional[str] monitor_metrics=None, # type: Optional[List[Union[Tuple[str, str], Tuple[(str, str), (str, str)]]]] monitor_artifacts=None, # type: Optional[List[Union[str, Tuple[str, str]]]] monitor_models=None, # type: Optional[List[Union[str, Tuple[str, str]]]] time_limit=None, # type: Optional[float] continue_on_fail=False, # type: bool pre_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa post_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa cache_executed_step=False, # type: bool retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa status_change_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, str], None]] # noqa tags=None, # type: Optional[Union[str, Sequence[str]]] output_uri=None # type: Optional[Union[str, bool]] ): # type: (...) -> bool """ Create a Task from a function, including wrapping the function input arguments into the hyper-parameter section as kwargs, and storing function results as named artifacts Example: .. code-block:: py def mock_func(a=6, b=9): c = a*b print(a, b, c) return c, c**2 create_task_from_function(mock_func, function_return=['mul', 'square']) Example arguments from other Tasks (artifact): .. code-block:: py def mock_func(matrix_np): c = matrix_np*matrix_np print(matrix_np, c) return c create_task_from_function( mock_func, function_kwargs={'matrix_np': 'aabb1122.previous_matrix'}, function_return=['square_matrix'] ) :param name: Unique of the step. For example `stage1` :param function: A global function to convert into a standalone Task :param function_kwargs: Optional, provide subset of function arguments and default values to expose. If not provided automatically take all function arguments & defaults Optional, pass input arguments to the function from other Tasks' output artifact. Example argument named `numpy_matrix` from Task ID `aabbcc` artifact name `answer`: {'numpy_matrix': 'aabbcc.answer'} :param function_return: Provide a list of names for all the results. If not provided, no results will be stored as artifacts. :param project_name: Set the project name for the task. Required if base_task_id is None. :param task_name: Set the name of the remote task, if not provided use `name` argument. :param task_type: Optional, The task type to be created. Supported values: 'training', 'testing', 'inference', 'data_processing', 'application', 'monitor', 'controller', 'optimizer', 'service', 'qc', 'custom' :param auto_connect_frameworks: Control the frameworks auto connect, see `Task.init` auto_connect_frameworks :param auto_connect_arg_parser: Control the ArgParser auto connect, see `Task.init` auto_connect_arg_parser :param packages: Manually specify a list of required packages or a local requirements.txt file. Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt" If not provided, packages are automatically added based on the imports used in the function. :param repo: Optional, specify a repository to attach to the function, when remotely executing. Allow users to execute the function inside the specified repository, enabling to load modules/script from a repository Notice the execution work directory will be the repository root folder. Supports both git repo url link, and local repository path. 
Example remote url: 'https://github.com/user/repo.git' Example local repo copy: './repo' -> will automatically store the remote repo url and commit ID based on the locally cloned copy :param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used) :param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used) :param helper_functions: Optional, a list of helper functions to make available for the standalone function Task. :param docker: Select the docker image to be executed in by the remote session :param docker_args: Add docker arguments, pass a single string :param docker_bash_setup_script: Add bash script to be executed inside the docker before setting up the Task's environment :param parents: Optional list of parent nodes in the DAG. The current step in the pipeline will be sent for execution only after all the parent nodes have been executed successfully. :param execution_queue: Optional, the queue to use for executing this specific step. If not provided, the task will be sent to the default execution queue, as defined on the class :param monitor_metrics: Optional, log the step's metrics on the pipeline Task. Format is a list of pairs metric (title, series) to log: [(step_metric_title, step_metric_series), ] Example: [('test', 'accuracy'), ] Or a list of tuple pairs, to specify a different target metric for to use on the pipeline Task: [((step_metric_title, step_metric_series), (target_metric_title, target_metric_series)), ] Example: [[('test', 'accuracy'), ('model', 'accuracy')], ] :param monitor_artifacts: Optional, log the step's artifacts on the pipeline Task. Provided a list of artifact names existing on the step's Task, they will also appear on the Pipeline itself. Example: [('processed_data', 'final_processed_data'), ] Alternatively user can also provide a list of artifacts to monitor (target artifact name will be the same as original artifact name) Example: ['processed_data', ] :param monitor_models: Optional, log the step's output models on the pipeline Task. Provided a list of model names existing on the step's Task, they will also appear on the Pipeline itself. Example: [('model_weights', 'final_model_weights'), ] Alternatively user can also provide a list of models to monitor (target models name will be the same as original model) Example: ['model_weights', ] To select the latest (lexicographic) model use "model_*", or the last created model with just "*" Example: ['model_weights_*', ] :param time_limit: Default None, no time limit. Step execution time limit, if exceeded the Task is aborted and the pipeline is stopped and marked failed. :param continue_on_fail: (default False). If True, failed step will not cause the pipeline to stop (or marked as failed). Notice, that steps that are connected (or indirectly connected) to the failed step will be skipped. :param pre_execute_callback: Callback function, called when the step (Task) is created and before it is sent for execution. Allows a user to modify the Task before launch. Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object. `parameters` are the configuration arguments passed to the ClearmlJob. If the callback returned value is `False`, the Node is skipped and so is any node in the DAG that relies on this node. Notice the `parameters` are already parsed, e.g. `${step1.parameters.Args/param}` is replaced with relevant value. .. 
code-block:: py def step_created_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, parameters, # type: dict ): pass :param post_execute_callback: Callback function, called when a step (Task) is completed and other jobs are executed. Allows a user to modify the Task status after completion. .. code-block:: py def step_completed_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, ): pass :param cache_executed_step: If True, before launching the new step, after updating with the latest configuration, check if an exact Task with the same parameter/code was already executed. If it was found, use it instead of launching a new Task. Default: False, a new cloned copy of base_task is always used. Notice: If the git repo reference does not have a specific commit ID, the Task will never be used. :param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry - Integer: In case of node failure, retry the node the number of times indicated by this parameter. - Callable: A function called on node failure. Takes as parameters: the PipelineController instance, the PipelineController.Node that failed and an int representing the number of previous retries for the node that failed. The function must return ``True`` if the node should be retried and ``False`` otherwise. If True, the node will be re-queued and the number of retries left will be decremented by 1. By default, if this callback is not specified, the function will be retried the number of times indicated by `retry_on_failure`. .. code-block:: py def example_retry_on_failure_callback(pipeline, node, retries): print(node.name, ' failed') # allow up to 5 retries (total of 6 runs) return retries < 5 :param status_change_callback: Callback function, called when the status of a step (Task) changes. Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object. The signature of the function must look the following way: .. code-block:: py def status_change_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, previous_status # type: str ): pass :param tags: A list of tags for the specific pipeline step. When executing a Pipeline remotely (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. :param output_uri: The storage / output url for this step. This is the default location for output models and other artifacts. Check Task.init reference docs for more info (output_uri is a parameter). 
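
        A minimal usage sketch (`preprocess` and `train` are example functions defined in user code,
        and the queue name is a placeholder):

        .. code-block:: py

            def preprocess(url):
                # user code: download / clean the data
                return url.lower()

            def train(data):
                # user code: fit a model on the preprocessed data
                return {'trained_on': data}

            pipe.add_function_step(
                name='preprocess',
                function=preprocess,
                function_kwargs=dict(url='https://example.com/data.csv'),
                function_return=['data'],
                execution_queue='default',
            )
            pipe.add_function_step(
                name='train',
                parents=['preprocess'],
                function=train,
                # pass the 'data' return value of the 'preprocess' step as input
                function_kwargs=dict(data='${preprocess.data}'),
                function_return=['model'],
                execution_queue='default',
            )
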
:return: True if successful """ function_kwargs = function_kwargs or {} default_kwargs = inspect.getfullargspec(function) if default_kwargs and default_kwargs.args and default_kwargs.defaults: for key, val in zip(default_kwargs.args[-len(default_kwargs.defaults):], default_kwargs.defaults): function_kwargs.setdefault(key, val) return self._add_function_step( name=name, function=function, function_kwargs=function_kwargs, function_return=function_return, project_name=project_name, task_name=task_name, task_type=task_type, auto_connect_frameworks=auto_connect_frameworks, auto_connect_arg_parser=auto_connect_arg_parser, packages=packages, repo=repo, repo_branch=repo_branch, repo_commit=repo_commit, helper_functions=helper_functions, docker=docker, docker_args=docker_args, docker_bash_setup_script=docker_bash_setup_script, parents=parents, execution_queue=execution_queue, monitor_metrics=monitor_metrics, monitor_artifacts=monitor_artifacts, monitor_models=monitor_models, time_limit=time_limit, continue_on_fail=continue_on_fail, pre_execute_callback=pre_execute_callback, post_execute_callback=post_execute_callback, cache_executed_step=cache_executed_step, retry_on_failure=retry_on_failure, status_change_callback=status_change_callback, tags=tags, output_uri=output_uri ) def start( self, queue='services', step_task_created_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa step_task_completed_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa wait=True, ): # type: (...) -> bool """ Start the current pipeline remotely (on the selected services queue). The current process will be stopped and launched remotely. :param queue: queue name to launch the pipeline on :param Callable step_task_created_callback: Callback function, called when a step (Task) is created and before it is sent for execution. Allows a user to modify the Task before launch. Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object. `parameters` are the configuration arguments passed to the ClearmlJob. If the callback returned value is `False`, the Node is skipped and so is any node in the DAG that relies on this node. Notice the `parameters` are already parsed, e.g. `${step1.parameters.Args/param}` is replaced with relevant value. .. code-block:: py def step_created_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, parameters, # type: dict ): pass :param Callable step_task_completed_callback: Callback function, called when a step (Task) is completed and other jobs are executed. Allows a user to modify the Task status after completion. .. code-block:: py def step_completed_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, ): pass :param wait: If True (default), start the pipeline controller, return only after the pipeline is done (completed/aborted/failed) :return: True, if the controller started. False, if the controller did not start. 
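
        A short usage sketch (the queue names and base Task ID are placeholders):

        .. code-block:: py

            pipe = PipelineController(name='pipeline demo', project='examples', version='1.0.0')
            pipe.set_default_execution_queue('default')
            pipe.add_step(name='stage_data', base_task_id='<data_task_id>')
            # relaunch the pipeline logic on the `services` queue and wait for it to complete
            pipe.start(queue='services')
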
""" if not self._task: raise ValueError( "Could not find main Task, " "PipelineController must be created with `always_create_task=True`") # serialize state only if we are running locally if Task.running_locally() or not self._task.is_main_task(): self._verify() self._serialize_pipeline_task() self.update_execution_plot() # stop current Task and execute remotely or no-op self._task.execute_remotely(queue_name=queue, exit_process=True, clone=False) if not Task.running_locally() and self._task.is_main_task(): self._start( step_task_created_callback=step_task_created_callback, step_task_completed_callback=step_task_completed_callback, wait=wait ) return True def start_locally(self, run_pipeline_steps_locally=False): # type: (bool) -> None """ Start the current pipeline locally, meaning the pipeline logic is running on the current machine, instead of on the `services` queue. Using run_pipeline_steps_locally=True you can run all the pipeline steps locally as sub-processes. Notice: when running pipeline steps locally, it assumes local code execution (i.e. it is running the local code as is, regardless of the git commit/diff on the pipeline steps Task) :param run_pipeline_steps_locally: (default False) If True, run the pipeline steps themselves locally as a subprocess (use for debugging the pipeline locally, notice the pipeline code is expected to be available on the local machine) """ if not self._task: raise ValueError( "Could not find main Task, " "PipelineController must be created with `always_create_task=True`") if run_pipeline_steps_locally: self._clearml_job_class = LocalClearmlJob self._default_execution_queue = self._default_execution_queue or 'mock' # serialize state only if we are running locally if Task.running_locally() or not self._task.is_main_task(): self._verify() self._serialize_pipeline_task() self._start(wait=True) def create_draft(self): # type: () -> None """ Optional, manually create & serialize the Pipeline Task (use with care for manual multi pipeline creation). **Notice** The recommended flow would be to call `pipeline.start(queue=None)` which would have a similar effect and will allow you to clone/enqueue later on. After calling Pipeline.create(), users can edit the pipeline in the UI and enqueue it for execution. Notice: this function should be used to programmatically create pipeline for later usage. To automatically create and launch pipelines, call the `start()` method. """ self._verify() self._serialize_pipeline_task() self._task.close() self._task.reset() def connect_configuration(self, configuration, name=None, description=None): # type: (Union[Mapping, list, Path, str], Optional[str], Optional[str]) -> Union[dict, Path, str] """ Connect a configuration dictionary or configuration file (pathlib.Path / str) to the PipelineController object. This method should be called before reading the configuration file. For example, a local file: .. code-block:: py config_file = pipe.connect_configuration(config_file) my_params = json.load(open(config_file,'rt')) A parameter dictionary/list: .. code-block:: py my_params = pipe.connect_configuration(my_params) :param configuration: The configuration. This is usually the configuration used in the model training process. Specify one of the following: - A dictionary/list - A dictionary containing the configuration. ClearML stores the configuration in the **ClearML Server** (backend), in a HOCON format (JSON-like format) which is editable. - A ``pathlib2.Path`` string - A path to the configuration file. 
ClearML stores the content of the file. A local path must be relative path. When executing a pipeline remotely in a worker, the contents brought from the **ClearML Server** (backend) overwrites the contents of the file. :param str name: Configuration section name. default: 'General' Allowing users to store multiple configuration dicts/files :param str description: Configuration section description (text). default: None :return: If a dictionary is specified, then a dictionary is returned. If pathlib2.Path / string is specified, then a path to a local configuration file is returned. Configuration object. """ return self._task.connect_configuration(configuration, name=name, description=description) @classmethod def get_logger(cls): # type: () -> Logger """ Return a logger connected to the Pipeline Task. The logger can be used by any function/tasks executed by the pipeline, in order to report directly to the pipeline Task itself. It can also be called from the main pipeline control Task. Raise ValueError if main Pipeline task could not be located. :return: Logger object for reporting metrics (scalars, plots, debug samples etc.) """ return cls._get_pipeline_task().get_logger() @classmethod def upload_model(cls, model_name, model_local_path, upload_uri=None): # type: (str, str, Optional[str]) -> OutputModel """ Upload (add) a model to the main Pipeline Task object. This function can be called from any pipeline component to directly add models into the main pipeline Task The model file/path will be uploaded to the Pipeline Task and registered on the model repository. Raise ValueError if main Pipeline task could not be located. :param model_name: Model name as will appear in the model registry (in the pipeline's project) :param model_local_path: Path to the local model file or directory to be uploaded. If a local directory is provided the content of the folder (recursively) will be packaged into a zip file and uploaded :param upload_uri: The URI of the storage destination for model weights upload. The default value is the previously used URI. :return: The uploaded OutputModel """ task = cls._get_pipeline_task() model_name = str(model_name) model_local_path = Path(model_local_path) out_model = OutputModel(task=task, name=model_name) out_model.update_weights(weights_filename=model_local_path.as_posix(), upload_uri=upload_uri) return out_model @classmethod def upload_artifact( cls, name, # type: str artifact_object, # type: Any metadata=None, # type: Optional[Mapping] delete_after_upload=False, # type: bool auto_pickle=True, # type: bool preview=None, # type: Any wait_on_upload=False, # type: bool serialization_function=None # type: Optional[Callable[[Any], Union[bytes, bytearray]]] ): # type: (...) -> bool """ Upload (add) an artifact to the main Pipeline Task object. This function can be called from any pipeline component to directly add artifacts into the main pipeline Task. The artifact can be uploaded by any function/tasks executed by the pipeline, in order to report directly to the pipeline Task itself. It can also be called from the main pipeline control Task. Raise ValueError if main Pipeline task could not be located. The currently supported upload artifact types include: - string / Path - A path to artifact file. If a wildcard or a folder is specified, then ClearML creates and uploads a ZIP file. - dict - ClearML stores a dictionary as ``.json`` file and uploads it. - pandas.DataFrame - ClearML stores a pandas.DataFrame as ``.csv.gz`` (compressed CSV) file and uploads it. 
- numpy.ndarray - ClearML stores a numpy.ndarray as ``.npz`` file and uploads it. - PIL.Image - ClearML stores a PIL.Image as ``.png`` file and uploads it. - Any - If called with auto_pickle=True, the object will be pickled and uploaded. :param str name: The artifact name. .. warning:: If an artifact with the same name was previously uploaded, then it is overwritten. :param object artifact_object: The artifact object. :param dict metadata: A dictionary of key-value pairs for any metadata. This dictionary appears with the experiment in the **ClearML Web-App (UI)**, **ARTIFACTS** tab. :param bool delete_after_upload: After the upload, delete the local copy of the artifact - ``True`` - Delete the local copy of the artifact. - ``False`` - Do not delete. (default) :param bool auto_pickle: If True (default), and the artifact_object is not one of the following types: pathlib2.Path, dict, pandas.DataFrame, numpy.ndarray, PIL.Image, url (string), local_file (string) the artifact_object will be pickled and uploaded as pickle file artifact (with file extension .pkl) :param Any preview: The artifact preview :param bool wait_on_upload: Whether the upload should be synchronous, forcing the upload to complete before continuing. :param Callable[Any, Union[bytes, bytearray]] serialization_function: A serialization function that takes one parameter of any type which is the object to be serialized. The function should return a `bytes` or `bytearray` object, which represents the serialized object. Note that the object will be immediately serialized using this function, thus other serialization methods will not be used (e.g. `pandas.DataFrame.to_csv`), even if possible. To deserialize this artifact when getting it using the `Artifact.get` method, use its `deserialization_function` argument. :return: The status of the upload. - ``True`` - Upload succeeded. - ``False`` - Upload failed. :raise: If the artifact object type is not supported, raise a ``ValueError``. """ task = cls._get_pipeline_task() return task.upload_artifact( name=name, artifact_object=artifact_object, metadata=metadata, delete_after_upload=delete_after_upload, auto_pickle=auto_pickle, preview=preview, wait_on_upload=wait_on_upload, serialization_function=serialization_function ) def stop(self, timeout=None, mark_failed=False, mark_aborted=False): # type: (Optional[float], bool, bool) -> () """ Stop the pipeline controller and the optimization thread. If mark_failed and mark_aborted are False (default) mark the pipeline as completed, unless one of the steps failed, then mark the pipeline as failed. :param timeout: Wait timeout for the optimization thread to exit (minutes). The default is ``None``, indicating do not wait to terminate immediately. :param mark_failed: If True, mark the pipeline task as failed. (default False) :param mark_aborted: If True, mark the pipeline task as aborted. (default False) """ self._stop_event.set() self.wait(timeout=timeout) if not self._task: return # sync pipeline state self.update_execution_plot() self._task.close() if mark_failed: self._task.mark_failed(status_reason='Pipeline aborted and failed', force=True) elif mark_aborted: self._task.mark_stopped(status_message='Pipeline aborted', force=True) elif self._pipeline_task_status_failed: print('Setting pipeline controller Task as failed (due to failed steps) !') self._task.mark_failed(status_reason='Pipeline step failed', force=True) def wait(self, timeout=None): # type: (Optional[float]) -> bool """ Wait for the pipeline to finish. .. 
            note:: This method does not stop the pipeline. Call :meth:`stop` to terminate the pipeline.

        :param float timeout: The timeout to wait for the pipeline to complete (minutes).
            If ``None``, wait until the pipeline completes.

        :return: True, if the pipeline finished. False, if the pipeline timed out.
        """
        if not self.is_running():
            return True

        if timeout is not None:
            timeout *= 60.

        _thread = self._thread

        _thread.join(timeout=timeout)
        if _thread.is_alive():
            return False

        return True

    def is_running(self):
        # type: () -> bool
        """
        Return True if the pipeline controller is running.

        :return: A boolean indicating whether the pipeline controller is active (still running) or stopped.
        """
        return self._thread is not None and self._thread.is_alive()

    def is_successful(self, fail_on_step_fail=True, fail_condition="all"):
        # type: (bool, str) -> bool
        """
        Evaluate whether the pipeline is successful.

        :param fail_on_step_fail: If True (default), evaluate the pipeline steps' status to assess
            if the pipeline is successful. If False, only evaluate the controller
        :param fail_condition: Must be one of the following: 'all' (default), 'failed' or 'aborted'.
            If 'failed', this function will return False if the pipeline failed and True if the pipeline was aborted.
            If 'aborted', this function will return False if the pipeline was aborted and True if the pipeline failed.
            If 'all', this function will return False in both cases.

        :return: A boolean indicating whether the pipeline was successful or not. Note that if the pipeline
            is in a running/pending state, this function will return False
        """
        if fail_condition == "all":
            success_status = [Task.TaskStatusEnum.completed]
        elif fail_condition == "failed":
            success_status = [Task.TaskStatusEnum.completed, Task.TaskStatusEnum.stopped]
        elif fail_condition == "aborted":
            success_status = [Task.TaskStatusEnum.completed, Task.TaskStatusEnum.failed]
        else:
            raise UsageError("fail_condition needs to be one of the following: 'all', 'failed', 'aborted'")

        if self._task.status not in success_status:
            return False

        if not fail_on_step_fail:
            return True

        self._update_nodes_status()
        for node in self._nodes.values():
            if node.status not in success_status:
                return False

        return True

    def elapsed(self):
        # type: () -> float
        """
        Return minutes elapsed from the controller starting timestamp.

        :return: The minutes from controller start time. A negative value means the process has not started yet.
        """
        if self._start_time is None:
            return -1.0
        return (time() - self._start_time) / 60.

    def get_pipeline_dag(self):
        # type: () -> Mapping[str, PipelineController.Node]
        """
        Return the pipeline execution graph, each node in the DAG is PipelineController.Node object.
        Graph itself is a dictionary of Nodes (key based on the Node name),
        each node holds links to its parent Nodes (identified by their unique names)

        :return: execution tree, as a nested dictionary. Example:

        .. code-block:: py

            {
                'stage1' : Node() {
                    name: 'stage1'
                    job: ClearmlJob
                    ...
                },
            }

        """
        return self._nodes

    def get_processed_nodes(self):
        # type: () -> Sequence[PipelineController.Node]
        """
        Return a list of the processed pipeline nodes, each entry in the list is PipelineController.Node object.

        :return: executed (excluding currently executing) nodes list
        """
        return {k: n for k, n in self._nodes.items() if n.executed}

    def get_running_nodes(self):
        # type: () -> Sequence[PipelineController.Node]
        """
        Return a list of the currently running pipeline nodes,
        each entry in the list is PipelineController.Node object.
:return: Currently running nodes, as a dictionary of node name to Node """ return {k: n for k, n in self._nodes.items() if k in self._running_nodes}
def update_execution_plot(self): # type: () -> () """ Update sankey diagram of the current pipeline """ with self._reporting_lock: self._update_execution_plot() # also trigger node monitor scanning self._scan_monitored_nodes()
def add_parameter(self, name, default=None, description=None, param_type=None): # type: (str, Optional[Any], Optional[str], Optional[str]) -> None """ Add a parameter to the pipeline Task. The parameter can be used as an input parameter for any step in the pipeline. Notice: all parameters will appear under the PipelineController Task's Hyper-parameters -> Pipeline section Example: pipeline.add_parameter(name='dataset', description='dataset ID to process the pipeline') Then in one of the steps we can refer to the value of the parameter with '${pipeline.dataset}' :param name: String name of the parameter. :param default: Default value for the parameter (can be changed later in the UI) :param description: String description of the parameter and its usage in the pipeline :param param_type: Optional, parameter type information (to be used as hint for casting and description) """ self._pipeline_args[str(name)] = default if description: self._pipeline_args_desc[str(name)] = str(description) if param_type: self._pipeline_args_type[str(name)] = param_type
def get_parameters(self): # type: () -> dict """ Return the pipeline parameters dictionary :return: Dictionary str -> str """ return self._pipeline_args
@classmethod def enqueue(cls, pipeline_controller, queue_name=None, queue_id=None, force=False): # type: (Union[PipelineController, str], Optional[str], Optional[str], bool) -> Any """ Enqueue a PipelineController for execution, by adding it to an execution queue. .. note:: A worker daemon must be listening at the queue for the worker to fetch the Task and execute it, see `ClearML Agent <../clearml_agent>`_ in the ClearML Documentation. :param pipeline_controller: The PipelineController to enqueue. Specify a PipelineController object or PipelineController ID :param queue_name: The name of the queue. If not specified, then ``queue_id`` must be specified. :param queue_id: The ID of the queue. If not specified, then ``queue_name`` must be specified. :param bool force: If True, reset the PipelineController if necessary before enqueuing it :return: An enqueue JSON response. .. code-block:: javascript { "queued": 1, "updated": 1, "fields": { "status": "queued", "status_reason": "", "status_message": "", "status_changed": "2020-02-24T15:05:35.426770+00:00", "last_update": "2020-02-24T15:05:35.426770+00:00", "execution.queue": "2bd96ab2d9e54b578cc2fb195e52c7cf" } } - ``queued`` - The number of Tasks enqueued (an integer or ``null``). - ``updated`` - The number of Tasks updated (an integer or ``null``). - ``fields`` - ``status`` - The status of the experiment. - ``status_reason`` - The reason for the last status change. - ``status_message`` - Information about the status. - ``status_changed`` - The last status change date and time (ISO 8601 format). - ``last_update`` - The last Task update time, including Task creation, update, change, or events for this task (ISO 8601 format). - ``execution.queue`` - The ID of the queue where the Task is enqueued. ``null`` indicates not enqueued.
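Example (a minimal, illustrative sketch; ``"services"`` is an assumed queue name and ``pipeline_id`` stands for the ID of an existing PipelineController):

.. code-block:: py

    # enqueue an existing pipeline controller onto the assumed "services" queue
    response = PipelineController.enqueue(pipeline_controller=pipeline_id, queue_name="services")
    print(response)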
""" pipeline_controller = ( pipeline_controller if isinstance(pipeline_controller, PipelineController) else cls.get(pipeline_id=pipeline_controller) ) return Task.enqueue(pipeline_controller._task, queue_name=queue_name, queue_id=queue_id, force=force) @classmethod def get( cls, pipeline_id=None, # type: Optional[str] pipeline_project=None, # type: Optional[str] pipeline_name=None, # type: Optional[str] pipeline_version=None, # type: Optional[str] pipeline_tags=None, # type: Optional[Sequence[str]] shallow_search=False # type: bool ): # type: (...) -> "PipelineController" """ Get a specific PipelineController. If multiple pipeline controllers are found, the pipeline controller with the highest semantic version is returned. If no semantic version is found, the most recently updated pipeline controller is returned. This function raises aan Exception if no pipeline controller was found Note: In order to run the pipeline controller returned by this function, use PipelineController.enqueue :param pipeline_id: Requested PipelineController ID :param pipeline_project: Requested PipelineController project :param pipeline_name: Requested PipelineController name :param pipeline_tags: Requested PipelineController tags (list of tag strings) :param shallow_search: If True, search only the first 500 results (first page) """ mutually_exclusive(pipeline_id=pipeline_id, pipeline_project=pipeline_project, _require_at_least_one=False) mutually_exclusive(pipeline_id=pipeline_id, pipeline_name=pipeline_name, _require_at_least_one=False) if not pipeline_id: pipeline_project_hidden = "{}/{}/{}".format(pipeline_project, cls._pipeline_section, pipeline_name) name_with_runtime_number_regex = r"^{}( #[0-9]+)*$".format(re.escape(pipeline_name)) pipelines = Task._query_tasks( pipeline_project=[pipeline_project_hidden], task_name=name_with_runtime_number_regex, fetch_only_first_page=False if not pipeline_version else shallow_search, only_fields=["id"] if not pipeline_version else ["id", "runtime.version"], system_tags=[cls._tag], order_by=["-last_update"], tags=pipeline_tags, search_hidden=True, _allow_extra_fields_=True, ) if pipelines: if not pipeline_version: pipeline_id = pipelines[0].id current_version = None for pipeline in pipelines: if not pipeline.runtime: continue candidate_version = pipeline.runtime.get("version") if not candidate_version or not Version.is_valid_version_string(candidate_version): continue if not current_version or Version(candidate_version) > current_version: current_version = Version(candidate_version) pipeline_id = pipeline.id else: for pipeline in pipelines: if pipeline.runtime.get("version") == pipeline_version: pipeline_id = pipeline.id break if not pipeline_id: error_msg = "Could not find dataset with pipeline_project={}, pipeline_name={}".format(pipeline_project, pipeline_name) if pipeline_version: error_msg += ", pipeline_version={}".format(pipeline_version) raise ValueError(error_msg) pipeline_task = Task.get_task(task_id=pipeline_id) pipeline_object = cls.__new__(cls) pipeline_object._task = pipeline_task pipeline_object._nodes = {} pipeline_object._running_nodes = [] try: pipeline_object._deserialize(pipeline_task._get_configuration_dict(cls._config_section), force=True) except Exception: pass return pipeline_object @property def id(self): # type: () -> str return self._task.id @property def tags(self): # type: () -> List[str] return self._task.get_tags() or [] def add_tags(self, tags): # type: (Union[Sequence[str], str]) -> None """ Add tags to this pipeline. 
Old tags are not deleted. When executing a Pipeline remotely (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. :param tags: A list of tags for this pipeline. """ if not self._task: return # should not actually happen self._task.add_tags(tags) def _create_task_from_function( self, docker, docker_args, docker_bash_setup_script, function, function_input_artifacts, function_kwargs, function_return, auto_connect_frameworks, auto_connect_arg_parser, packages, project_name, task_name, task_type, repo, branch, commit, helper_functions, output_uri=None ): task_definition = CreateFromFunction.create_task_from_function( a_function=function, function_kwargs=function_kwargs or None, function_input_artifacts=function_input_artifacts, function_return=function_return, project_name=project_name, task_name=task_name, task_type=task_type, auto_connect_frameworks=auto_connect_frameworks, auto_connect_arg_parser=auto_connect_arg_parser, repo=repo, branch=branch, commit=commit, packages=packages, docker=docker, docker_args=docker_args, docker_bash_setup_script=docker_bash_setup_script, output_uri=output_uri, helper_functions=helper_functions, dry_run=True, task_template_header=self._task_template_header, artifact_serialization_function=self._artifact_serialization_function, artifact_deserialization_function=self._artifact_deserialization_function ) return task_definition def _start( self, step_task_created_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa step_task_completed_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa wait=True, ): # type: (...) -> bool """ Start the pipeline controller. If the calling process is stopped, then the controller stops as well. :param Callable step_task_created_callback: Callback function, called when a step (Task) is created and before it is sent for execution. Allows a user to modify the Task before launch. Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object. `parameters` are the configuration arguments passed to the ClearmlJob. If the callback returned value is `False`, the Node is skipped and so is any node in the DAG that relies on this node. Notice the `parameters` are already parsed, e.g. `${step1.parameters.Args/param}` is replaced with relevant value. .. code-block:: py def step_created_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, parameters, # type: dict ): pass :param Callable step_task_completed_callback: Callback function, called when a step (Task) is completed and other jobs are executed. Allows a user to modify the Task status after completion. .. code-block:: py def step_completed_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, ): pass :param wait: If True (default), start the pipeline controller, return only after the pipeline is done (completed/aborted/failed) :return: True, if the controller started. False, if the controller did not start. 
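Example (an illustrative sketch only; ``controller`` stands for an existing PipelineController instance, and the two callbacks are hypothetical user functions matching the signatures documented above — in user code the public ``start()`` wrapper is normally used instead of calling this method directly):

.. code-block:: py

    def on_step_created(pipeline, node, parameters):
        # return False to skip this node (and any node in the DAG that depends on it)
        return True

    def on_step_completed(pipeline, node):
        print('step {} finished with status {}'.format(node.name, node.status))

    controller._start(
        step_task_created_callback=on_step_created,
        step_task_completed_callback=on_step_completed,
        wait=True,
    )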
""" if self._thread: return True self._prepare_pipeline(step_task_completed_callback, step_task_created_callback) self._thread = Thread(target=self._daemon) self._thread.daemon = True self._thread.start() if wait: self.wait() self.stop() return True def _prepare_pipeline( self, step_task_created_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa step_task_completed_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa ): # type (...) -> None params, pipeline_dag = self._serialize_pipeline_task() # deserialize back pipeline state if not params['continue_pipeline']: for k in pipeline_dag: pipeline_dag[k]['executed'] = None pipeline_dag[k]['job_started'] = None pipeline_dag[k]['job_ended'] = None self._default_execution_queue = params['default_queue'] self._add_pipeline_tags = params['add_pipeline_tags'] self._target_project = params['target_project'] or '' self._deserialize(pipeline_dag) # if we continue the pipeline, make sure that we re-execute failed tasks if params['continue_pipeline']: for node in list(self._nodes.values()): if node.executed is False: node.executed = None if not self._verify(): raise ValueError("Failed verifying pipeline execution graph, " "it has either inaccessible nodes, or contains cycles") self.update_execution_plot() self._start_time = time() self._stop_event = Event() self._experiment_created_cb = step_task_created_callback self._experiment_completed_cb = step_task_completed_callback def _serialize_pipeline_task(self): # type: () -> (dict, dict) """ Serialize current pipeline state into the main Task :return: params, pipeline_dag """ params = { 'default_queue': self._default_execution_queue, 'add_pipeline_tags': self._add_pipeline_tags, 'target_project': self._target_project, } pipeline_dag = self._serialize() # serialize pipeline state if self._task and self._auto_connect_task: # check if we are either running locally or that we are running remotely, # but we have no configuration, so we need to act as if this is a local run and create everything if self._task.running_locally() or self._task.get_configuration_object(name=self._config_section) is None: # noinspection PyProtectedMember self._task._set_configuration( name=self._config_section, config_type='dictionary', config_text=json.dumps(pipeline_dag, indent=2)) args_map_inversed = {} for section, arg_list in self._args_map.items(): for arg in arg_list: args_map_inversed[arg] = section pipeline_args = flatten_dictionary(self._pipeline_args) # noinspection PyProtectedMember self._task._set_parameters( { "{}/{}".format(args_map_inversed.get(k, self._args_section), k): v for k, v in pipeline_args.items() }, __parameters_descriptions=self._pipeline_args_desc, __parameters_types=self._pipeline_args_type, __update=True, ) self._task.connect(params, name=self._pipeline_section) params['continue_pipeline'] = False # make sure we have a unique version number (auto bump version if needed) # only needed when manually (from code) creating pipelines self._handle_pipeline_version() # noinspection PyProtectedMember pipeline_hash = self._get_task_hash() # noinspection PyProtectedMember self._task._set_runtime_properties({ self._runtime_property_hash: "{}:{}".format(pipeline_hash, self._version), "version": self._version }) self._task.set_user_properties(version=self._version) else: self._task.connect_configuration(pipeline_dag, name=self._config_section) connected_args = set() new_pipeline_args = {} for section, arg_list in 
self._args_map.items(): mutable_dict = {arg: self._pipeline_args.get(arg) for arg in arg_list} self._task.connect(mutable_dict, name=section) new_pipeline_args.update(mutable_dict) connected_args.update(arg_list) mutable_dict = {k: v for k, v in self._pipeline_args.items() if k not in connected_args} self._task.connect( mutable_dict, name=self._args_section ) new_pipeline_args.update(mutable_dict) self._pipeline_args = new_pipeline_args self._task.connect(params, name=self._pipeline_section) # noinspection PyProtectedMember if self._task._get_runtime_properties().get(self._runtime_property_hash): params['continue_pipeline'] = True else: # noinspection PyProtectedMember pipeline_hash = ClearmlJob._create_task_hash(self._task) # noinspection PyProtectedMember self._task._set_runtime_properties({ self._runtime_property_hash: "{}:{}".format(pipeline_hash, self._version), }) params['continue_pipeline'] = False return params, pipeline_dag def _handle_pipeline_version(self): if not self._version: # noinspection PyProtectedMember self._version = self._task._get_runtime_properties().get("version") if not self._version: previous_pipeline_tasks = Task._query_tasks( project=[self._task.project], fetch_only_first_page=True, only_fields=["runtime.version"], order_by=["-last_update"], system_tags=[self._tag], search_hidden=True, _allow_extra_fields_=True ) for previous_pipeline_task in previous_pipeline_tasks: if previous_pipeline_task.runtime.get("version"): self._version = str(Version(previous_pipeline_task.runtime.get("version")).get_next_version()) break self._version = self._version or self._default_pipeline_version def _get_task_hash(self): params_override = dict(**(self._task.get_parameters() or {})) params_override.pop('properties/version', None) # dag state without status / states nodes_items = list(self._nodes.items()) dag = { name: { k: v for k, v in node.__dict__.items() if k not in ( 'job', 'name', 'task_factory_func', 'executed', 'status', 'job_started', 'job_ended', 'skip_job' ) } for name, node in nodes_items } # get all configurations (as dict of strings for hashing) configurations_override = dict(**self._task.get_configuration_objects()) # store as text so we can hash it later configurations_override[self._config_section] = json.dumps(dag) # noinspection PyProtectedMember pipeline_hash = ClearmlJob._create_task_hash( self._task, params_override=params_override, configurations_override=configurations_override, ) return pipeline_hash def _serialize(self): # type: () -> dict """ Store the definition of the pipeline DAG into a dictionary. This dictionary will be used to store the DAG as a configuration on the Task :return: """ nodes_items = list(self._nodes.items()) dag = {name: dict((k, v) for k, v in node.__dict__.items() if k not in ('job', 'name', 'task_factory_func')) for name, node in nodes_items} # update state for presentation only for name, node in nodes_items: dag[name]['job_id'] = node.executed or (node.job.task_id() if node.job else None) return dag def _deserialize(self, dag_dict, force=False): # type: (dict, bool) -> () """ Restore the DAG from a dictionary. This will be used to create the DAG from the dict stored on the Task, when running remotely. :return: """ # if we always want to load the pipeline DAG from code, we are skipping the deserialization step if not force and self._always_create_from_code: return # if we do not clone the Task, only merge the parts we can override. 
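# (i.e. take the queue, parents, timeout, parameters, configurations, task_overrides and execution state from the stored DAG, while keeping the locally defined Task / factory function)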
for name in list(self._nodes.keys()): if not self._nodes[name].clone_task and name in dag_dict and not dag_dict[name].get('clone_task'): for k in ('queue', 'parents', 'timeout', 'parameters', 'configurations', 'task_overrides', 'executed', 'job_started', 'job_ended'): setattr(self._nodes[name], k, dag_dict[name].get(k) or type(getattr(self._nodes[name], k))()) # if we do clone the Task deserialize everything, except the function creating self._nodes = { k: self.Node(name=k, **{kk: vv for kk, vv in v.items() if kk not in ('job_id', )}) if k not in self._nodes or (v.get('base_task_id') and v.get('clone_task')) else self._nodes[k] for k, v in dag_dict.items()} # set the task_factory_func for each cloned node for node in list(self._nodes.values()): if not node.base_task_id and not node.task_factory_func and node.job_code_section: if node.job_code_section in self._nodes: func = self._nodes[node.job_code_section].task_factory_func if func: node.task_factory_func = func def _has_stored_configuration(self): """ Return True if we are running remotely, and we have stored configuration on the Task """ if self._auto_connect_task and self._task and not self._task.running_locally() and self._task.is_main_task(): stored_config = self._task.get_configuration_object(self._config_section) return bool(stored_config) return False def _verify(self): # type: () -> bool """ Verify the DAG, (i.e. no cycles and no missing parents) On error raise ValueError with verification details :return: return True iff DAG has no errors """ # verify nodes for node in list(self._nodes.values()): # raise value error if not verified self._verify_node(node) # check the dag itself if not self._verify_dag(): return False return True def _verify_node(self, node): # type: (PipelineController.Node) -> bool """ Raise ValueError on verification errors :return: Return True iff the specific node is verified """ if not node.base_task_id and not node.task_factory_func: raise ValueError("Node '{}', base_task_id is empty".format(node.name)) if not self._default_execution_queue and not node.queue: raise ValueError("Node '{}' missing execution queue, " "no default queue defined and no specific node queue defined".format(node.name)) task = node.task_factory_func or Task.get_task(task_id=node.base_task_id) if not task: raise ValueError("Node '{}', base_task_id={} is invalid".format(node.name, node.base_task_id)) pattern = self._step_ref_pattern # verify original node parents if node.parents and not all(isinstance(p, str) and p in self._nodes for p in node.parents): raise ValueError("Node '{}', parents={} is invalid".format(node.name, node.parents)) parents = set() for k, v in node.parameters.items(): if isinstance(v, str): for g in pattern.findall(v): ref_step = self.__verify_step_reference(node, g) if ref_step: parents.add(ref_step) # verify we have a section name if '/' not in k: raise ValueError( "Section name is missing in parameter \"{}\", " "parameters should be in the form of " "\"`section-name`/parameter\", example: \"Args/param\"".format(v)) if parents and parents != set(node.parents or []): parents = parents - set(node.parents or []) getLogger('clearml.automation.controller').info( 'Node "{}" missing parent reference, adding: {}'.format(node.name, parents)) node.parents = (node.parents or []) + list(parents) # verify and fix monitoring sections: def _verify_monitors(monitors, monitor_type, nested_pairs=False): if not monitors: return monitors if nested_pairs: if not all(isinstance(x, (list, tuple)) and x for x in monitors): raise 
ValueError("{} should be a list of tuples, found: {}".format(monitor_type, monitors)) # convert single pair into a pair of pairs: conformed_monitors = [ pair if isinstance(pair[0], (list, tuple)) else (pair, pair) for pair in monitors ] # verify the pair of pairs if not all(isinstance(x[0][0], str) and isinstance(x[0][1], str) and isinstance(x[1][0], str) and isinstance(x[1][1], str) for x in conformed_monitors): raise ValueError("{} should be a list of tuples, found: {}".format(monitor_type, monitors)) else: # verify a list of tuples if not all(isinstance(x, (list, tuple, str)) and x for x in monitors): raise ValueError( "{} should be a list of tuples, found: {}".format(monitor_type, monitors)) # convert single str into a pair of pairs: conformed_monitors = [ pair if isinstance(pair, (list, tuple)) else (pair, pair) for pair in monitors ] # verify the pair of pairs if not all(isinstance(x[0], str) and isinstance(x[1], str) for x in conformed_monitors): raise ValueError( "{} should be a list of tuples, found: {}".format(monitor_type, monitors)) return conformed_monitors # verify and fix monitoring sections: node.monitor_metrics = _verify_monitors(node.monitor_metrics, 'monitor_metrics', nested_pairs=True) node.monitor_artifacts = _verify_monitors(node.monitor_artifacts, 'monitor_artifacts') node.monitor_models = _verify_monitors(node.monitor_models, 'monitor_models') return True def _verify_dag(self): # type: () -> bool """ :return: True iff the pipeline dag is fully accessible and contains no cycles """ visited = set() prev_visited = None while prev_visited != visited: prev_visited = copy(visited) for k, node in list(self._nodes.items()): if k in visited: continue if any(p == node.name for p in node.parents or []): # node cannot have itself as parent return False if not all(p in visited for p in node.parents or []): continue visited.add(k) # return False if we did not cover all the nodes return not bool(set(self._nodes.keys()) - visited) def _add_function_step( self, name, # type: str function, # type: Callable function_kwargs=None, # type: Optional[Dict[str, Any]] function_return=None, # type: Optional[List[str]] project_name=None, # type: Optional[str] task_name=None, # type: Optional[str] task_type=None, # type: Optional[str] auto_connect_frameworks=None, # type: Optional[dict] auto_connect_arg_parser=None, # type: Optional[dict] packages=None, # type: Optional[Union[str, Sequence[str]]] repo=None, # type: Optional[str] repo_branch=None, # type: Optional[str] repo_commit=None, # type: Optional[str] helper_functions=None, # type: Optional[Sequence[Callable]] docker=None, # type: Optional[str] docker_args=None, # type: Optional[str] docker_bash_setup_script=None, # type: Optional[str] parents=None, # type: Optional[Sequence[str]] execution_queue=None, # type: Optional[str] monitor_metrics=None, # type: Optional[List[Union[Tuple[str, str], Tuple[(str, str), (str, str)]]]] monitor_artifacts=None, # type: Optional[List[Union[str, Tuple[str, str]]]] monitor_models=None, # type: Optional[List[Union[str, Tuple[str, str]]]] time_limit=None, # type: Optional[float] continue_on_fail=False, # type: bool pre_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa post_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa cache_executed_step=False, # type: bool retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa 
status_change_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, str], None]] # noqa tags=None, # type: Optional[Union[str, Sequence[str]]] output_uri=None # type: Optional[Union[str, bool]] ): # type: (...) -> bool """ Create a Task from a function, including wrapping the function input arguments into the hyperparameter section as kwargs, and storing function results as named artifacts Example: .. code-block:: py def mock_func(a=6, b=9): c = a*b print(a, b, c) return c, c**2 create_task_from_function(mock_func, function_return=['mul', 'square']) Example arguments from other Tasks (artifact): .. code-block:: py def mock_func(matrix_np): c = matrix_np*matrix_np print(matrix_np, c) return c create_task_from_function( mock_func, function_kwargs={'matrix_np': 'aabb1122.previous_matrix'}, function_return=['square_matrix'] ) :param name: Unique name of the step. For example `stage1` :param function: A global function to convert into a standalone Task :param function_kwargs: Optional, provide a subset of function arguments and default values to expose. If not provided, automatically take all function arguments and their defaults. Optionally, pass input arguments to the function from another Task's output artifact. Example argument named `numpy_matrix` from Task ID `aabbcc` artifact name `answer`: {'numpy_matrix': 'aabbcc.answer'} :param function_return: Provide a list of names for all the results. If not provided, no results will be stored as artifacts. :param project_name: Set the project name for the task. Required if base_task_id is None. :param task_name: Set the name of the remote task, if not provided use `name` argument. :param task_type: Optional, the task type to be created. Supported values: 'training', 'testing', 'inference', 'data_processing', 'application', 'monitor', 'controller', 'optimizer', 'service', 'qc', 'custom' :param auto_connect_frameworks: Control the frameworks auto connect, see `Task.init` auto_connect_frameworks :param auto_connect_arg_parser: Control the ArgParser auto connect, see `Task.init` auto_connect_arg_parser :param packages: Manually specify a list of required packages or a local requirements.txt file. Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt" If not provided, packages are automatically added based on the imports used in the function. :param repo: Optional, specify a repository to attach to the function, when remotely executing. Allows users to execute the function inside the specified repository, enabling it to load modules/scripts from the repository. Notice the execution work directory will be the repository root folder. Supports both git repo url link, and local repository path. Example remote url: 'https://github.com/user/repo.git' Example local repo copy: './repo' -> will automatically store the remote repo url and commit ID based on the locally cloned copy :param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used) :param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used) :param helper_functions: Optional, a list of helper functions to make available for the standalone function Task. :param docker: Select the docker image in which the remote session will be executed :param docker_args: Add docker arguments, pass a single string :param docker_bash_setup_script: Add bash script to be executed inside the docker before setting up the Task's environment :param parents: Optional list of parent nodes in the DAG.
The current step in the pipeline will be sent for execution only after all the parent nodes have been executed successfully. :param execution_queue: Optional, the queue to use for executing this specific step. If not provided, the task will be sent to the default execution queue, as defined on the class :param monitor_metrics: Optional, log the step's metrics on the pipeline Task. Format is a list of pairs metric (title, series) to log: [(step_metric_title, step_metric_series), ] Example: [('test', 'accuracy'), ] Or a list of tuple pairs, to specify a different target metric for to use on the pipeline Task: [((step_metric_title, step_metric_series), (target_metric_title, target_metric_series)), ] Example: [[('test', 'accuracy'), ('model', 'accuracy')], ] :param monitor_artifacts: Optional, log the step's artifacts on the pipeline Task. Provided a list of artifact names existing on the step's Task, they will also appear on the Pipeline itself. Example: [('processed_data', 'final_processed_data'), ] Alternatively user can also provide a list of artifacts to monitor (target artifact name will be the same as original artifact name) Example: ['processed_data', ] :param monitor_models: Optional, log the step's output models on the pipeline Task. Provided a list of model names existing on the step's Task, they will also appear on the Pipeline itself. Example: [('model_weights', 'final_model_weights'), ] Alternatively user can also provide a list of models to monitor (target models name will be the same as original model) Example: ['model_weights', ] To select the latest (lexicographic) model use "model_*", or the last created model with just "*" Example: ['model_weights_*', ] :param time_limit: Default None, no time limit. Step execution time limit, if exceeded the Task is aborted and the pipeline is stopped and marked failed. :param continue_on_fail: (default False). If True, failed step will not cause the pipeline to stop (or marked as failed). Notice, that steps that are connected (or indirectly connected) to the failed step will be skipped. :param pre_execute_callback: Callback function, called when the step (Task) is created, and before it is sent for execution. Allows a user to modify the Task before launch. Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object. `parameters` are the configuration arguments passed to the ClearmlJob. If the callback returned value is `False`, the Node is skipped and so is any node in the DAG that relies on this node. Notice the `parameters` are already parsed, e.g. `${step1.parameters.Args/param}` is replaced with relevant value. .. code-block:: py def step_created_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, parameters, # type: dict ): pass :param post_execute_callback: Callback function, called when a step (Task) is completed and other jobs are executed. Allows a user to modify the Task status after completion. .. code-block:: py def step_completed_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, ): pass :param cache_executed_step: If True, before launching the new step, after updating with the latest configuration, check if an exact Task with the same parameter/code was already executed. If it was found, use it instead of launching a new Task. Default: False, a new cloned copy of base_task is always used. Notice: If the git repo reference does not have a specific commit ID, the Task will never be used. 
:param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry - Integer: In case of node failure, retry the node the number of times indicated by this parameter. - Callable: A function called on node failure. Takes as parameters: the PipelineController instance, the PipelineController.Node that failed and an int representing the number of previous retries for the node that failed The function must return a `bool`: True if the node should be retried and False otherwise. If True, the node will be re-queued and the number of retries left will be decremented by 1. By default, if this callback is not specified, the function will be retried the number of times indicated by `retry_on_failure`. .. code-block:: py def example_retry_on_failure_callback(pipeline, node, retries): print(node.name, ' failed') # allow up to 5 retries (total of 6 runs) return retries < 5 :param status_change_callback: Callback function, called when the status of a step (Task) changes. Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object. The signature of the function must look the following way: .. code-block:: py def status_change_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, previous_status # type: str ): pass :param tags: A list of tags for the specific pipeline step. When executing a Pipeline remotely (i.e. launching the pipeline from the UI/enqueuing it), this method has no effect. :param output_uri: The storage / output url for this step. This is the default location for output models and other artifacts. Check Task.init reference docs for more info (output_uri is a parameter). :return: True if successful """ # always store callback functions (even when running remotely) if pre_execute_callback: self._pre_step_callbacks[name] = pre_execute_callback if post_execute_callback: self._post_step_callbacks[name] = post_execute_callback if status_change_callback: self._status_change_callbacks[name] = status_change_callback self._verify_node_name(name) if output_uri is None: output_uri = self._output_uri function_input_artifacts = {} # go over function_kwargs, split it into string and input artifacts for k, v in function_kwargs.items(): if v is None: continue if self._step_ref_pattern.match(str(v)): # check for step artifacts step, _, artifact = v[2:-1].partition('.') if step in self._nodes and artifact in self._nodes[step].return_artifacts: function_input_artifacts[k] = "${{{}.id}}.{}".format(step, artifact) continue # verify the reference only if we are running locally (on remote when we have multiple # steps from tasks the _nodes is till empty, only after deserializing we will have the full DAG) if self._task.running_locally(): self.__verify_step_reference(node=self.Node(name=name), step_ref_string=v) elif not isinstance(v, (float, int, bool, six.string_types)): function_input_artifacts[k] = "{}.{}.{}".format(self._task.id, name, k) self._upload_pipeline_artifact(artifact_name="{}.{}".format(name, k), artifact_object=v) function_kwargs = {k: v for k, v in function_kwargs.items() if k not in function_input_artifacts} parameters = {"{}/{}".format(CreateFromFunction.kwargs_section, k): v for k, v in function_kwargs.items()} if function_input_artifacts: parameters.update( {"{}/{}".format(CreateFromFunction.input_artifact_section, k): str(v) for k, v in function_input_artifacts.items()} ) job_code_section = name task_name = task_name or name or None if self._mock_execution: project_name = 
project_name or self._get_target_project() or self._task.get_project_name() task_definition = self._create_task_from_function( docker, docker_args, docker_bash_setup_script, function, function_input_artifacts, function_kwargs, function_return, auto_connect_frameworks, auto_connect_arg_parser, packages, project_name, task_name, task_type, repo, repo_branch, repo_commit, helper_functions, output_uri=output_uri) elif self._task.running_locally() or self._task.get_configuration_object(name=name) is None: project_name = project_name or self._get_target_project() or self._task.get_project_name() task_definition = self._create_task_from_function( docker, docker_args, docker_bash_setup_script, function, function_input_artifacts, function_kwargs, function_return, auto_connect_frameworks, auto_connect_arg_parser, packages, project_name, task_name, task_type, repo, repo_branch, repo_commit, helper_functions, output_uri=output_uri) # update configuration with the task definitions # noinspection PyProtectedMember self._task._set_configuration( name=name, config_type='json', config_text=json.dumps(task_definition, indent=1) ) else: # load task definition from configuration # noinspection PyProtectedMember config_text = self._task._get_configuration_text(name=name) task_definition = json.loads(config_text) if config_text else dict() def _create_task(_): a_task = Task.create( project_name=project_name, task_name=task_definition.get('name'), task_type=task_definition.get('type'), ) # replace reference a_task.update_task(task_definition) if tags: a_task.add_tags(tags) if output_uri is not None: a_task.output_uri = output_uri return a_task self._nodes[name] = self.Node( name=name, base_task_id=None, parents=parents or [], queue=execution_queue, timeout=time_limit, parameters=parameters, clone_task=False, cache_executed_step=cache_executed_step, task_factory_func=_create_task, continue_on_fail=continue_on_fail, return_artifacts=function_return, monitor_artifacts=monitor_artifacts, monitor_metrics=monitor_metrics, monitor_models=monitor_models, job_code_section=job_code_section, explicit_docker_image=docker, output_uri=output_uri ) self._retries[name] = 0 self._retries_callbacks[name] = retry_on_failure if callable(retry_on_failure) else \ (functools.partial(self._default_retry_on_failure_callback, max_retries=retry_on_failure) if isinstance(retry_on_failure, int) else self._retry_on_failure_callback) return True def _relaunch_node(self, node): if not node.job: getLogger("clearml.automation.controller").warning( "Could not relaunch node {} (job object is missing)".format(node.name) ) return self._retries[node.name] = self._retries.get(node.name, 0) + 1 getLogger("clearml.automation.controller").warning( "Node '{}' failed. Retrying... (this is retry number {})".format(node.name, self._retries[node.name]) ) node.job.task.mark_stopped(force=True, status_message=self._relaunch_status_message) node.job.task.set_progress(0) node.job.task.get_logger().report_text( "\nNode '{}' failed. Retrying... 
(this is retry number {})\n".format(node.name, self._retries[node.name]) ) parsed_queue_name = self._parse_step_ref(node.queue) node.job.launch(queue_name=parsed_queue_name or self._default_execution_queue) def _launch_node(self, node): # type: (PipelineController.Node) -> () """ Launch a single node (create and enqueue a ClearmlJob) :param node: Node to launch :return: Return True if a new job was launched """ # clear state if we are creating a new job if not node.job: node.job_started = None node.job_ended = None node.job_type = None if node.job or node.executed: print('Skipping cached/executed step [{}]'.format(node.name)) return False print('Launching step [{}]'.format(node.name)) updated_hyper_parameters = {} for k, v in node.parameters.items(): updated_hyper_parameters[k] = self._parse_step_ref(v, recursive=node.recursively_parse_parameters) task_overrides = self._parse_task_overrides(node.task_overrides) if node.task_overrides else None extra_args = dict() extra_args["project"] = self._get_target_project(return_project_id=True) or None # set Task name to match job name if self._pipeline_as_sub_project: extra_args["name"] = node.name if node.explicit_docker_image: extra_args["explicit_docker_image"] = node.explicit_docker_image skip_node = None if self._pre_step_callbacks.get(node.name): skip_node = self._pre_step_callbacks[node.name](self, node, updated_hyper_parameters) if skip_node is False: node.skip_job = True return True task_id = node.base_task_id disable_clone_task = not node.clone_task task_factory_func_task = None if node.task_factory_func: # create Task task_factory_func_task = node.task_factory_func(node) task_id = task_factory_func_task.id disable_clone_task = True try: node.job = self._clearml_job_class( base_task_id=task_id, parameter_override=updated_hyper_parameters, configuration_overrides=node.configurations, tags=['{} {}'.format(self._node_tag_prefix, self._task.id)] if self._add_pipeline_tags and self._task else None, parent=self._task.id if self._task else None, disable_clone_task=disable_clone_task, task_overrides=task_overrides, allow_caching=node.cache_executed_step, output_uri=node.output_uri, **extra_args ) except Exception: self._pipeline_task_status_failed = True raise node.job_started = None node.job_ended = None node.job_type = str(node.job.task.task_type) if self._experiment_created_cb: skip_node = self._experiment_created_cb(self, node, updated_hyper_parameters) if skip_node is False: # skipping node getLogger('clearml.automation.controller').warning( 'Skipping node {} on callback request'.format(node)) # delete the job we just created node.job.delete() node.skip_job = True elif node.job.is_cached_task(): node.executed = node.job.task_id() if task_factory_func_task: task_factory_func_task.delete(raise_on_error=False) self._running_nodes.append(node.name) else: self._running_nodes.append(node.name) parsed_queue_name = self._parse_step_ref(node.queue) return node.job.launch(queue_name=parsed_queue_name or self._default_execution_queue) return True def _update_execution_plot(self): # type: () -> () """ Update sankey diagram of the current pipeline Also update the controller Task artifact storing the DAG state (with all the nodes states) """ if not self._task: return nodes = list(self._nodes.values()) self._update_nodes_status() # update the configuration state, so that the UI is presents the correct state self._force_task_configuration_update() sankey_node = dict( label=[], color=[], hovertemplate='%{label}<extra></extra>', # customdata=[], # 
hovertemplate='%{label}<br />Hyper-Parameters:<br />%{customdata}<extra></extra>', ) sankey_link = dict( source=[], target=[], value=[], # hovertemplate='%{target.label}<extra></extra>', hovertemplate='<extra></extra>', ) visited = [] node_params = [] # update colors while nodes: next_nodes = [] for node in nodes: if not all(p in visited for p in node.parents or []): next_nodes.append(node) continue visited.append(node.name) idx = len(visited) - 1 parents = [visited.index(p) for p in node.parents or []] if node.job and node.job.task_parameter_override is not None: node.job.task_parameter_override.update(node.parameters or {}) node_params.append( ( node.job.task_parameter_override if node.job and node.job.task_parameter_override else node.parameters ) or {} ) # sankey_node['label'].append(node.name) # sankey_node['customdata'].append( # '<br />'.join('{}: {}'.format(k, v) for k, v in (node.parameters or {}).items())) sankey_node['label'].append( '{}<br />'.format(node.name) + '<br />'.join('{}: {}'.format(k, v if len(str(v)) < 24 else (str(v)[:24]+' ...')) for k, v in (node.parameters or {}).items())) sankey_node['color'].append(self._get_node_color(node)) for p in parents: sankey_link['source'].append(p) sankey_link['target'].append(idx) sankey_link['value'].append(1) # if nothing changed, we give up if nodes == next_nodes: break nodes = next_nodes # make sure we have no independent (unconnected) nodes single_nodes = [] for i in [n for n in range(len(visited)) if n not in sankey_link['source'] and n not in sankey_link['target']]: single_nodes.append(i) # create the sankey graph dag_flow = dict( link=sankey_link, node=sankey_node, textfont=dict(color='rgba(0,0,0,0)', size=1), type='sankey', orientation='h' ) table_values = self._build_table_report(node_params, visited) # hack, show single node sankey if single_nodes: singles_flow = dict( x=list(range(len(single_nodes))), y=[1] * len(single_nodes), text=[v for i, v in enumerate(sankey_node['label']) if i in single_nodes], mode='markers', hovertemplate="%{text}<extra></extra>", marker=dict( color=[v for i, v in enumerate(sankey_node['color']) if i in single_nodes], size=[40] * len(single_nodes), ), showlegend=False, type='scatter', ) # only single nodes if len(single_nodes) == len(sankey_node['label']): fig = dict(data=[singles_flow], layout={ 'hovermode': 'closest', 'xaxis': {'visible': False}, 'yaxis': {'visible': False}}) else: dag_flow['domain'] = {'x': [0.0, 1.0], 'y': [0.2, 1.0]} fig = dict(data=[dag_flow, singles_flow], layout={'autosize': True, 'hovermode': 'closest', 'xaxis': {'anchor': 'y', 'domain': [0.0, 1.0], 'visible': False}, 'yaxis': {'anchor': 'x', 'domain': [0.0, 0.15], 'visible': False} }) else: # create the sankey plot fig = dict(data=[dag_flow], layout={'xaxis': {'visible': False}, 'yaxis': {'visible': False}}) # report DAG self._task.get_logger().report_plotly( title=self._report_plot_execution_flow['title'], series=self._report_plot_execution_flow['series'], iteration=0, figure=fig) # report detailed table self._task.get_logger().report_table( title=self._report_plot_execution_details['title'], series=self._report_plot_execution_details['series'], iteration=0, table_plot=table_values) def _build_table_report(self, node_params, visited): # type: (List, List) -> List[List] """ Create the detailed table report on all the jobs in the pipeline :param node_params: list of node parameters :param visited: list of nodes :return: Table as a List of a List of strings (cell) """ task_link_template = 
self._task.get_output_log_web_page() \ .replace('/{}/'.format(self._task.project), '/{project}/') \ .replace('/{}/'.format(self._task.id), '/{task}/') table_values = [["Pipeline Step", "Task ID", "Task Name", "Status", "Parameters"]] for name, param in zip(visited, node_params): param_str = str(param) if param else '' if len(param_str) > 3: # remove {} from string param_str = param_str[1:-1] step_name = name if self._nodes[name].base_task_id: step_name += '\n[<a href="{}"> {} </a>]'.format( task_link_template.format(project='*', task=self._nodes[name].base_task_id), 'base task') table_values.append( [step_name, self.__create_task_link(self._nodes[name], task_link_template), self._nodes[name].job.task.name if self._nodes[name].job else '', str(self._nodes[name].status or ""), param_str] ) return table_values def _call_retries_callback(self, node): # if this functions returns True, we should relaunch the node # if False, don't relaunch if node.name not in self._retries_callbacks: return False try: return self._retries_callbacks[node.name](self, node, self._retries.get(node.name, 0)) except Exception as e: getLogger("clearml.automation.controller").warning( "Failed calling the retry callback for node '{}'. Error is '{}'".format(node.name, e) ) return False @classmethod def _get_node_color(cls, node): # type (self.Mode) -> str """ Return the node color based on the node/job state :param node: A node in the pipeline :return: string representing the color of the node (e.g. "red", "green", etc) """ if not node: return "" color_lookup = { "failed": "red", "cached": "darkslateblue", "completed": "blue", "aborted": "royalblue", "queued": "#bdf5bd", "running": "green", "skipped": "gray", "pending": "lightsteelblue", } return color_lookup.get(node.status, "") def _update_nodes_status(self): # type () -> () """ Update the status of all nodes in the pipeline """ jobs = [] previous_status_map = {} # copy to avoid race condition nodes = self._nodes.copy() for name, node in nodes.items(): if not node.job: continue # noinspection PyProtectedMember previous_status_map[name] = node.job._last_status jobs.append(node.job) BaseJob.update_status_batch(jobs) for node in nodes.values(): self._update_node_status(node) def _update_node_status(self, node): # type (self.Node) -> () """ Update the node status entry based on the node/job state :param node: A node in the pipeline """ previous_status = node.status if node.job and node.job.is_running(): node.set_job_started() update_job_ended = node.job_started and not node.job_ended if node.executed is not None: if node.job and node.job.is_failed(): # failed job node.status = "failed" elif node.job and node.job.is_cached_task(): # cached job node.status = "cached" elif not node.job or node.job.is_completed(): # completed job node.status = "completed" else: # aborted job node.status = "aborted" elif node.job: if node.job.is_pending(): # lightgreen, pending in queue node.status = "queued" elif node.job.is_completed(): # completed job node.status = "completed" elif node.job.is_failed(): # failed job node.status = "failed" elif node.job.is_stopped(): # aborted job node.status = "aborted" else: node.status = "running" elif node.skip_job: node.status = "skipped" else: node.status = "pending" if update_job_ended and node.status in ("aborted", "failed", "completed"): node.set_job_ended() if ( previous_status is not None and previous_status != node.status and self._status_change_callbacks.get(node.name) ): # noinspection PyBroadException try: 
self._status_change_callbacks[node.name](self, node, previous_status) except Exception as e: getLogger("clearml.automation.controller").warning( "Failed calling the status change callback for node '{}'. Error is '{}'".format(node.name, e) ) def _update_dag_state_artifact(self): # type: () -> () pipeline_dag = self._serialize() self._task.upload_artifact( name=self._state_artifact_name, artifact_object='', metadata=dict(pipeline=hash_dict(pipeline_dag)), preview=json.dumps(pipeline_dag, indent=1)) def _force_task_configuration_update(self): # type: () -> () pipeline_dag = self._serialize() if self._task: # noinspection PyProtectedMember self._task._set_configuration( name=self._config_section, config_type='dictionary', description="pipeline state: {}".format(hash_dict(pipeline_dag)), config_text=json.dumps(pipeline_dag, indent=2), force=True) def _update_progress(self): # type: () -> () """ Update progress of the pipeline every PipelineController._update_progress_interval seconds. Progress is calculated as the mean of the progress of each step in the pipeline. """ if time() - self._last_progress_update_time < self._update_progress_interval: return # copy to avoid race condition nodes = self._nodes.copy() job_progress = [(node.job.task.get_progress() or 0) if node.job else 0 for node in nodes.values()] if len(job_progress): self._task.set_progress(int(sum(job_progress) / len(job_progress))) self._last_progress_update_time = time() def _daemon(self): # type: () -> () """ The main pipeline execution loop. This loop is executed on its own dedicated thread. :return: """ launch_thread_pool = ThreadPool(16) pooling_counter = 0 launched_nodes = set() last_monitor_report = last_plot_report = time() while self._stop_event: # stop request if self._stop_event.wait(self._pool_frequency if pooling_counter else 0.01): break pooling_counter += 1 # check the pipeline time limit if self._pipeline_time_limit and (time() - self._start_time) > self._pipeline_time_limit: break self._update_progress() self._update_nodes_status() # check the state of all current jobs # if no a job ended, continue completed_jobs = [] force_execution_plot_update = False nodes_failed_stop_pipeline = [] for j in self._running_nodes: node = self._nodes[j] if not node.job: continue if node.job.is_stopped(aborted_nonresponsive_as_running=True): node_failed = node.job.is_failed() if node_failed: if self._call_retries_callback(node): self._relaunch_node(node) continue else: self._final_failure[node.name] = True completed_jobs.append(j) node.executed = node.job.task_id() if not node_failed else False if j in launched_nodes: launched_nodes.remove(j) # check if we need to stop all running steps if node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail: nodes_failed_stop_pipeline.append(node.name) elif node.timeout: started = node.job.task.data.started if (datetime.now().astimezone(started.tzinfo) - started).total_seconds() > node.timeout: node.job.abort() completed_jobs.append(j) node.executed = node.job.task_id() elif j in launched_nodes and node.job.is_running(): # make sure update the execution graph when the job started running # (otherwise it will still be marked queued) launched_nodes.remove(j) force_execution_plot_update = True # update running jobs self._running_nodes = [j for j in self._running_nodes if j not in completed_jobs] # nothing changed, we can sleep if not completed_jobs and self._running_nodes: # force updating the pipeline state (plot) at least every 5 min. 
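# (plot refresh is throttled by _update_execution_plot_interval; monitored nodes are scanned every _monitor_node_interval seconds)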
if force_execution_plot_update or time()-last_plot_report > self._update_execution_plot_interval: last_plot_report = time() last_monitor_report = time() self.update_execution_plot() elif time()-last_monitor_report > self._monitor_node_interval: last_monitor_report = time() self._scan_monitored_nodes() continue # callback on completed jobs if self._experiment_completed_cb or self._post_step_callbacks: for job in completed_jobs: job_node = self._nodes.get(job) if not job_node: continue if self._experiment_completed_cb: self._experiment_completed_cb(self, job_node) if self._post_step_callbacks.get(job_node.name): self._post_step_callbacks[job_node.name](self, job_node) # check if we need to stop the pipeline, and abort all running steps if nodes_failed_stop_pipeline: print('Aborting pipeline and stopping all running steps, node {} failed'.format( nodes_failed_stop_pipeline)) break # Pull the next jobs in the pipeline, based on the completed list next_nodes = [] for node in list(self._nodes.values()): # check if already processed or needs to be skipped if node.job or node.executed or node.skip_job: continue completed_parents = [bool(p in self._nodes and self._nodes[p].executed) for p in node.parents or []] if all(completed_parents): next_nodes.append(node.name) # update the execution graph print('Launching the next {} steps'.format(len(next_nodes))) node_launch_success = launch_thread_pool.map( self._launch_node, [self._nodes[name] for name in next_nodes]) for name, success in zip(next_nodes, node_launch_success): if success and not self._nodes[name].skip_job: if self._nodes[name].job and self._nodes[name].job.task_parameter_override is not None: self._nodes[name].job.task_parameter_override.update(self._nodes[name].parameters or {}) print('Launching step: {}'.format(name)) print('Parameters:\n{}'.format( self._nodes[name].job.task_parameter_override if self._nodes[name].job else self._nodes[name].parameters)) print('Configurations:\n{}'.format(self._nodes[name].configurations)) print('Overrides:\n{}'.format(self._nodes[name].task_overrides)) launched_nodes.add(name) # check if node is cached do not wait for event but run the loop again if self._nodes[name].executed: pooling_counter = 0 else: getLogger('clearml.automation.controller').warning( 'Skipping launching step \'{}\': {}'.format(name, self._nodes[name])) # update current state (in configuration, so that we could later continue an aborted pipeline) # visualize pipeline state (plot) self.update_execution_plot() # quit if all pipelines nodes are fully executed. if not next_nodes and not self._running_nodes: break # stop all currently running jobs: for node in list(self._nodes.values()): if node.executed is False and not node.continue_on_fail: self._pipeline_task_status_failed = True if node.job and not node.job.is_stopped(): node.job.abort() elif not node.job and not node.executed: # mark Node as skipped if it has no Job object and it is not executed node.skip_job = True # visualize pipeline state (plot) self.update_execution_plot() if self._stop_event: # noinspection PyBroadException try: self._stop_event.set() except Exception: pass def _parse_step_ref(self, value, recursive=False): # type: (Any) -> Optional[str] """ Return the step reference. 
For example "${step1.parameters.Args/param}" :param value: string :param recursive: if True, recursively parse all values in the dict, list or tuple :return: """ # look for all the step references pattern = self._step_ref_pattern updated_value = value if isinstance(value, str): for g in pattern.findall(value): # update with actual value new_val = self.__parse_step_reference(g) if not isinstance(new_val, six.string_types): return new_val updated_value = updated_value.replace(g, new_val, 1) # if we have a dict, list or tuple, we need to recursively update the values if recursive: if isinstance(value, dict): updated_value = {} for k, v in value.items(): updated_value[k] = self._parse_step_ref(v, recursive=True) elif isinstance(value, list): updated_value = [self._parse_step_ref(v, recursive=True) for v in value] elif isinstance(value, tuple): updated_value = tuple(self._parse_step_ref(v, recursive=True) for v in value) return updated_value def _parse_task_overrides(self, task_overrides): # type: (dict) -> dict """ Return the step reference. For example "${step1.parameters.Args/param}" :param task_overrides: string :return: """ updated_overrides = {} for k, v in task_overrides.items(): updated_overrides[k] = self._parse_step_ref(v) return updated_overrides def _verify_node_name(self, name): # type: (str) -> None if name in self._nodes: raise ValueError('Node named \'{}\' already exists in the pipeline dag'.format(name)) if name in self._reserved_pipeline_names: raise ValueError('Node named \'{}\' is a reserved keyword, use a different name'.format(name)) def _scan_monitored_nodes(self): # type: () -> None """ Scan all nodes and monitor their metrics/artifacts/models """ for node in list(self._nodes.values()): self._monitor_node(node) def _monitor_node(self, node): # type: (PipelineController.Node) -> None """ If Node is running, put the metrics from the node on the pipeline itself. 
:param node: Node to test """ if not node: return # verify we have the node if node.name not in self._monitored_nodes: self._monitored_nodes[node.name] = {} # if we are done with this node, skip it if self._monitored_nodes[node.name].get('completed'): return if node.job and node.job.task: task = node.job.task elif node.job and node.executed and isinstance(node.executed, str): task = Task.get_task(task_id=node.executed) else: return # update the metrics if node.monitor_metrics: metrics_state = self._monitored_nodes[node.name].get('metrics', {}) logger = self._task.get_logger() scalars = task.get_reported_scalars(x_axis='iter') for (s_title, s_series), (t_title, t_series) in node.monitor_metrics: values = scalars.get(s_title, {}).get(s_series) if values and values.get('x') is not None and values.get('y') is not None: x = values['x'][-1] y = values['y'][-1] last_y = metrics_state.get(s_title, {}).get(s_series) if last_y is None or y > last_y: logger.report_scalar(title=t_title, series=t_series, value=y, iteration=int(x)) last_y = y if not metrics_state.get(s_title): metrics_state[s_title] = {} metrics_state[s_title][s_series] = last_y self._monitored_nodes[node.name]['metrics'] = metrics_state if node.monitor_artifacts: task.reload() artifacts = task.data.execution.artifacts self._task.reload() output_artifacts = [] for s_artifact, t_artifact in node.monitor_artifacts: # find artifact for a in artifacts: if a.key != s_artifact: continue new_a = copy(a) new_a.key = t_artifact output_artifacts.append(new_a) break # update artifacts directly on the Task if output_artifacts: # noinspection PyProtectedMember self._task._add_artifacts(output_artifacts) if node.monitor_models: task.reload() output_models = task.data.models.output self._task.reload() target_models = [] for s_model, t_model in node.monitor_models: # find model for a in output_models: if a.name != s_model: continue new_a = copy(a) new_a.name = t_model target_models.append(new_a) break # update models directly on the Task if target_models: self._task.reload() models = self._task.data.models keys = [a.name for a in target_models] models.output = [a for a in models.output or [] if a.name not in keys] + target_models # noinspection PyProtectedMember self._task._edit(models=models) # update the state (so that we do not scan the node twice) if node.job.is_stopped(aborted_nonresponsive_as_running=True): self._monitored_nodes[node.name]['completed'] = True def _get_target_project(self, return_project_id=False): # type: (bool) -> str """ Return the pipeline components target folder name/id :param return_project_id: if False (default), return target folder name.
If True, return project id :return: project id/name (None if not valid) """ if not self._target_project: return '' if str(self._target_project).lower().strip() == 'true': if not self._task: return '' return self._task.project if return_project_id else self._task.get_project_name() if not return_project_id: return self._target_project return get_or_create_project( session=self._task.session if self._task else Task.default_session, project_name=self._target_project) def _add_pipeline_name_run_number(self): # type: () -> None if not self._task: return # if we were already executed, do not rename (meaning aborted pipeline that was continued) # noinspection PyProtectedMember if self._task._get_runtime_properties().get(self._runtime_property_hash): return # remove the #<num> suffix if we have one: task_name = re.compile(r" #\d+$").split(self._task.name or "", 1)[0] page_size = 100 # find exact name or " #<num>" extension prev_pipelines_ids = self._task.query_tasks( task_name=r"^{}(| #\d+)$".format(task_name), task_filter=dict( project=[self._task.project], system_tags=[self._tag], order_by=['-created'], page_size=page_size, fetch_only_first_page=True, ) ) max_value = len(prev_pipelines_ids) if prev_pipelines_ids else 0 # we hit the limit if max_value == page_size: # make sure that if we get something wrong we do not stop the pipeline, # worst case fail to auto increment try: # we assume we are the latest so let's take a few (last 10) and check the max number last_task_name = self._task.query_tasks( task_filter=dict(task_ids=prev_pipelines_ids[:10], project=[self._task.project]), additional_return_fields=['name'], ) # type: List[Dict] # let's parse the names pattern = re.compile(r" #(?P<key>\d+)$") task_parts = [pattern.split(t.get('name') or "", 1) for t in last_task_name] # find the highest number for parts in task_parts: if len(parts) >= 2: try: max_value = max(max_value, int(parts[1])+1) except (TypeError, ValueError): pass except Exception as ex: getLogger('clearml.automation.controller').warning( 'Pipeline auto run increment failed (skipping): {}'.format(ex)) max_value = 0 if max_value > 1: self._task.set_name(task_name + " #{}".format(max_value)) @classmethod def _get_pipeline_task(cls): # type: () -> Task """ Return the pipeline Task (either the current one, or the parent Task of the currently running Task) Raise ValueError if we could not locate the pipeline Task :return: Pipeline Task """ # get main Task. task = Task.current_task() if str(task.task_type) == str(Task.TaskTypes.controller) and cls._tag in task.get_system_tags(): return task # get the parent Task, it should be the pipeline if not task.parent: raise ValueError("Could not locate parent Pipeline Task") parent = Task.get_task(task_id=task.parent) if str(parent.task_type) == str(Task.TaskTypes.controller) and cls._tag in parent.get_system_tags(): return parent raise ValueError("Could not locate parent Pipeline Task") def __verify_step_reference(self, node, step_ref_string): # type: (PipelineController.Node, str) -> Optional[str] """ Verify the step reference. 
For example "${step1.parameters.Args/param}" Raise ValueError on misconfiguration :param Node node: calling reference node (used for logging) :param str step_ref_string: For example "${step1.parameters.Args/param}" :return: If step reference is used, return the pipeline step name, otherwise return None """ parts = step_ref_string[2:-1].split('.') v = step_ref_string if len(parts) < 2: raise ValueError("Node '{}', parameter '{}' is invalid".format(node.name, v)) prev_step = parts[0] input_type = parts[1] # check if we reference the pipeline arguments themselves if prev_step == self._pipeline_step_ref: if input_type not in self._pipeline_args: raise ValueError("Node '{}', parameter '{}', step name '{}' is invalid".format(node.name, v, prev_step)) return None if prev_step not in self._nodes: raise ValueError("Node '{}', parameter '{}', step name '{}' is invalid".format(node.name, v, prev_step)) if input_type not in ('artifacts', 'parameters', 'models', 'id'): raise ValueError( "Node {}, parameter '{}', input type '{}' is invalid".format(node.name, v, input_type)) if input_type != 'id' and len(parts) < 3: raise ValueError("Node '{}', parameter '{}' is invalid".format(node.name, v)) if input_type == 'models': try: model_type = parts[2].lower() except Exception: raise ValueError( "Node '{}', parameter '{}', input type '{}', model_type is missing {}".format( node.name, v, input_type, parts)) if model_type not in ('input', 'output'): raise ValueError( "Node '{}', parameter '{}', input type '{}', " "model_type is invalid (input/output) found {}".format( node.name, v, input_type, model_type)) if len(parts) < 4: raise ValueError( "Node '{}', parameter '{}', input type '{}', model index is missing".format( node.name, v, input_type)) # check casting try: int(parts[3]) except Exception: raise ValueError( "Node '{}', parameter '{}', input type '{}', model index is missing {}".format( node.name, v, input_type, parts)) if len(parts) < 5: raise ValueError( "Node '{}', parameter '{}', input type '{}', model property is missing".format( node.name, v, input_type)) if not hasattr(BaseModel, parts[4]): raise ValueError( "Node '{}', parameter '{}', input type '{}', model property is invalid {}".format( node.name, v, input_type, parts[4])) return prev_step def __parse_step_reference(self, step_ref_string): """ return the adjusted value for "${step...}" :param step_ref_string: reference string of the form ${step_name.type.value}" :return: str with value """ parts = step_ref_string[2:-1].split('.') if len(parts) < 2: raise ValueError("Could not parse reference '{}'".format(step_ref_string)) prev_step = parts[0] input_type = parts[1].lower() # check if we reference the pipeline arguments themselves if prev_step == self._pipeline_step_ref: if parts[1] not in self._pipeline_args: raise ValueError("Could not parse reference '{}', " "pipeline argument '{}' could not be found".format(step_ref_string, parts[1])) return self._pipeline_args[parts[1]] if prev_step not in self._nodes or ( not self._nodes[prev_step].job and not self._nodes[prev_step].executed and not self._nodes[prev_step].base_task_id ): raise ValueError("Could not parse reference '{}', step '{}' could not be found".format( step_ref_string, prev_step)) if input_type not in ( 'artifacts', 'parameters', 'models', 'id', 'script', 'execution', 'container', 'output', 'comment', 'models', 'tags', 'system_tags', 'project'): raise ValueError("Could not parse reference '{}', type '{}' not valid".format(step_ref_string, input_type)) if input_type != 'id' and len(parts) 
< 3: raise ValueError("Could not parse reference '{}', missing fields in '{}'".format(step_ref_string, parts)) task = self._nodes[prev_step].job.task if self._nodes[prev_step].job \ else Task.get_task(task_id=self._nodes[prev_step].executed or self._nodes[prev_step].base_task_id) task.reload() if input_type == 'artifacts': # fix \. to use . in artifacts artifact_path = ('.'.join(parts[2:])).replace('\\.', '\\_dot_\\') artifact_path = artifact_path.split('.') obj = task.artifacts for p in artifact_path: p = p.replace('\\_dot_\\', '.') if isinstance(obj, dict): obj = obj.get(p) elif hasattr(obj, p): obj = getattr(obj, p) else: raise ValueError("Could not locate artifact {} on previous step {}".format( '.'.join(parts[1:]), prev_step)) return str(obj) elif input_type == 'parameters': step_params = task.get_parameters() param_name = '.'.join(parts[2:]) if param_name not in step_params: raise ValueError("Could not locate parameter {} on previous step {}".format( '.'.join(parts[1:]), prev_step)) return step_params.get(param_name) elif input_type == 'models': model_type = parts[2].lower() if model_type not in ('input', 'output'): raise ValueError("Could not locate model {} on previous step {}".format( '.'.join(parts[1:]), prev_step)) try: model_idx = int(parts[3]) model = task.models[model_type][model_idx] except Exception: raise ValueError("Could not locate model {} on previous step {}, index {} is invalid".format( '.'.join(parts[1:]), prev_step, parts[3])) return str(getattr(model, parts[4])) elif input_type == 'id': return task.id elif input_type in ( 'script', 'execution', 'container', 'output', 'comment', 'models', 'tags', 'system_tags', 'project'): # noinspection PyProtectedMember return task._get_task_property('.'.join(parts[1:])) return None @classmethod def __create_task_link(cls, a_node, task_link_template): # type: (PipelineController.Node, str) -> str if not a_node: return '' # create the detailed parameter table task_id = project_id = None if a_node.job: project_id = a_node.job.task.project task_id = a_node.job.task.id elif a_node.executed: task_id = a_node.executed if cls._task_project_lookup.get(task_id): project_id = cls._task_project_lookup[task_id] else: # noinspection PyBroadException try: project_id = Task.get_task(task_id=task_id).project except Exception: project_id = '*' cls._task_project_lookup[task_id] = project_id if not task_id: return '' return '<a href="{}"> {} </a>'.format(task_link_template.format(project=project_id, task=task_id), task_id) def _default_retry_on_failure_callback(self, _pipeline_controller, _node, retries, max_retries=None): return retries < (self._def_max_retry_on_failure if max_retries is None else max_retries) def _upload_pipeline_artifact(self, artifact_name, artifact_object): self._task.upload_artifact( name=artifact_name, artifact_object=artifact_object, wait_on_upload=True, extension_name=( ".pkl" if isinstance(artifact_object, dict) and not self._artifact_serialization_function else None ), serialization_function=self._artifact_serialization_function ) class PipelineDecorator(PipelineController): _added_decorator = [] # type: List[dict] _ref_lazy_loader_id_to_node_name = {} # type: dict _singleton = None # type: Optional[PipelineDecorator] _eager_step_artifact = 'eager_step' _eager_execution_instance = False _debug_execute_step_process = False _debug_execute_step_function = False _default_execution_queue = None _multi_pipeline_instances = [] _multi_pipeline_call_counter = -1 _atexit_registered = False def __init__( self, name, # type: str 
project, # type: str version=None, # type: Optional[str] pool_frequency=0.2, # type: float add_pipeline_tags=False, # type: bool target_project=None, # type: Optional[str] abort_on_failure=False, # type: bool add_run_number=True, # type: bool retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa docker=None, # type: Optional[str] docker_args=None, # type: Optional[str] docker_bash_setup_script=None, # type: Optional[str] packages=None, # type: Optional[Union[str, Sequence[str]]] repo=None, # type: Optional[str] repo_branch=None, # type: Optional[str] repo_commit=None, # type: Optional[str] artifact_serialization_function=None, # type: Optional[Callable[[Any], Union[bytes, bytearray]]] artifact_deserialization_function=None, # type: Optional[Callable[[bytes], Any]] output_uri=None # type: Optional[Union[str, bool]] ): # type: (...) -> () """ Create a new pipeline controller. The newly created object will launch and monitor the new experiments. :param name: Provide pipeline name (if main Task exists it overrides its name) :param project: Provide project storing the pipeline (if main Task exists it overrides its project) :param version: Pipeline version. This version allows to uniquely identify the pipeline template execution. Examples for semantic versions: version='1.0.1' , version='23', version='1.2'. If not set, find the latest version of the pipeline and increment it. If no such version is found, default to '1.0.0' :param float pool_frequency: The pooling frequency (in minutes) for monitoring experiments / states. :param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all steps (Tasks) created by this pipeline. :param str target_project: If provided, all pipeline steps are cloned into the target project :param bool abort_on_failure: If False (default), failed pipeline steps will not cause the pipeline to stop immediately, instead any step that is not connected (or indirectly connected) to the failed step, will still be executed. Nonetheless, the pipeline itself will be marked failed, unless the failed step was specifically defined with "continue_on_fail=True". If True, any failed step will cause the pipeline to immediately abort, stop all running steps, and mark the pipeline as failed. :param add_run_number: If True (default), add the run number of the pipeline to the pipeline name. Example, the second time we launch the pipeline "best pipeline", we rename it to "best pipeline #2" :param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry - Integer: In case of node failure, retry the node the number of times indicated by this parameter. - Callable: A function called on node failure. Takes as parameters: the PipelineController instance, the PipelineController.Node that failed and an int representing the number of previous retries for the node that failed. The function must return ``True`` if the node should be retried and ``False`` otherwise. If True, the node will be re-queued and the number of retries left will be decremented by 1. By default, if this callback is not specified, the function will be retried the number of times indicated by `retry_on_failure`. .. 
code-block:: py def example_retry_on_failure_callback(pipeline, node, retries): print(node.name, ' failed') # allow up to 5 retries (total of 6 runs) return retries < 5 :param docker: Select the docker image to be executed in by the remote session :param docker_args: Add docker arguments, pass a single string :param docker_bash_setup_script: Add bash script to be executed inside the docker before setting up the Task's environment :param packages: Manually specify a list of required packages or a local requirements.txt file. Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt" If not provided, packages are automatically added. :param repo: Optional, specify a repository to attach to the pipeline controller, when remotely executing. Allow users to execute the controller inside the specified repository, enabling them to load modules/script from the repository. Notice the execution work directory will be the repository root folder. Supports both git repo url link, and local repository path (automatically converted into the remote git/commit as is currently checkout). Example remote url: 'https://github.com/user/repo.git' Example local repo copy: './repo' -> will automatically store the remote repo url and commit ID based on the locally cloned copy Use empty string ("") to disable any repository auto-detection :param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used) :param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used) :param artifact_serialization_function: A serialization function that takes one parameter of any type which is the object to be serialized. The function should return a `bytes` or `bytearray` object, which represents the serialized object. All parameter/return artifacts uploaded by the pipeline will be serialized using this function. All relevant imports must be done in this function. For example: .. code-block:: py def serialize(obj): import dill return dill.dumps(obj) :param artifact_deserialization_function: A deserialization function that takes one parameter of type `bytes`, which represents the serialized object. This function should return the deserialized object. All parameter/return artifacts fetched by the pipeline will be deserialized using this function. All relevant imports must be done in this function. For example: .. code-block:: py def deserialize(bytes_): import dill return dill.loads(bytes_) :param output_uri: The storage / output url for this pipeline. This is the default location for output models and other artifacts. Check Task.init reference docs for more info (output_uri is a parameter). The `output_uri` of this pipeline's steps will default to this value. 
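In practice this class is typically driven through the ``PipelineDecorator.pipeline`` and ``PipelineDecorator.component`` decorators rather than instantiated directly. A minimal usage sketch, where step names, the project name, and argument values are illustrative only:

.. code-block:: py

    from clearml.automation.controller import PipelineDecorator

    @PipelineDecorator.component(return_values=["data"], cache=True)
    def make_data(count=10):
        # runs as its own step (Task); keep any required imports inside the function
        return list(range(count))

    @PipelineDecorator.component(return_values=["total"])
    def reduce_data(data):
        total = sum(data)
        print("total =", total)
        return total

    @PipelineDecorator.pipeline(name="example pipeline", project="examples", version="1.0.0")
    def pipeline_logic(count=10):
        # each component call becomes a pipeline step; return values are passed as artifacts
        data = make_data(count=count)
        reduce_data(data=data)

    if __name__ == "__main__":
        # for this sketch run everything locally; drop this call to launch the steps remotely
        PipelineDecorator.run_locally()
        pipeline_logic(count=5)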
""" super(PipelineDecorator, self).__init__( name=name, project=project, version=version, pool_frequency=pool_frequency, add_pipeline_tags=add_pipeline_tags, target_project=target_project, abort_on_failure=abort_on_failure, add_run_number=add_run_number, retry_on_failure=retry_on_failure, docker=docker, docker_args=docker_args, docker_bash_setup_script=docker_bash_setup_script, packages=packages, repo=repo, repo_branch=repo_branch, repo_commit=repo_commit, always_create_from_code=False, artifact_serialization_function=artifact_serialization_function, artifact_deserialization_function=artifact_deserialization_function, output_uri=output_uri ) # if we are in eager execution, make sure parent class knows it if self._eager_execution_instance: self._mock_execution = True if PipelineDecorator._default_execution_queue: super(PipelineDecorator, self).set_default_execution_queue( PipelineDecorator._default_execution_queue) for n in self._added_decorator: self._add_function_step(**n) self._added_decorator.clear() PipelineDecorator._singleton = self self._reference_callback = [] # store launched nodes, in case we call the same function multiple times, and need renaming: self._launched_step_names = set() # map eager steps task id to the new step name self._eager_steps_task_id = {} # type: Dict[str, str] def _daemon(self): # type: () -> () """ The main pipeline execution loop. This loop is executed on its own dedicated thread. override the daemon function, we only need to update the state :return: """ pooling_counter = 0 launched_nodes = set() last_monitor_report = last_plot_report = time() while self._stop_event: # stop request if self._stop_event.wait(self._pool_frequency if pooling_counter else 0.01): break pooling_counter += 1 # check the pipeline time limit if self._pipeline_time_limit and (time() - self._start_time) > self._pipeline_time_limit: break self._update_progress() self._update_nodes_status() # check the state of all current jobs # if no a job ended, continue completed_jobs = [] nodes_failed_stop_pipeline = [] force_execution_plot_update = False for j in self._running_nodes: node = self._nodes[j] if not node.job: continue if node.job.is_stopped(aborted_nonresponsive_as_running=True): node_failed = node.job.is_failed() if node_failed: if self._call_retries_callback(node): self._relaunch_node(node) continue else: self._final_failure[node.name] = True completed_jobs.append(j) node.executed = node.job.task_id() if not node_failed else False if j in launched_nodes: launched_nodes.remove(j) # check if we need to stop all running steps if node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail: nodes_failed_stop_pipeline.append(node.name) elif node.timeout: started = node.job.task.data.started if (datetime.now().astimezone(started.tzinfo) - started).total_seconds() > node.timeout: node.job.abort() completed_jobs.append(j) node.executed = node.job.task_id() elif j in launched_nodes and node.job.is_running(): # make sure update the execution graph when the job started running # (otherwise it will still be marked queued) launched_nodes.remove(j) force_execution_plot_update = True # update running jobs self._running_nodes = [j for j in self._running_nodes if j not in completed_jobs] # nothing changed, we can sleep if not completed_jobs and self._running_nodes: # force updating the pipeline state (plot) at least every 5 min. 
if force_execution_plot_update or time()-last_plot_report > self._update_execution_plot_interval: last_plot_report = time() last_monitor_report = time() self.update_execution_plot() elif time()-last_monitor_report > self._monitor_node_interval: last_monitor_report = time() self._scan_monitored_nodes() continue # callback on completed jobs if self._experiment_completed_cb or self._post_step_callbacks: for job in completed_jobs: job_node = self._nodes.get(job) if not job_node: continue if self._experiment_completed_cb: self._experiment_completed_cb(self, job_node) if self._post_step_callbacks.get(job_node.name): self._post_step_callbacks[job_node.name](self, job_node) # check if we need to stop the pipeline, and abort all running steps if nodes_failed_stop_pipeline: print('Aborting pipeline and stopping all running steps, node {} failed'.format( nodes_failed_stop_pipeline)) break # update current state (in configuration, so that we could later continue an aborted pipeline) self._force_task_configuration_update() # visualize pipeline state (plot) self.update_execution_plot() # stop all currently running jobs (protect against changes while iterating): for node in list(self._nodes.values()): if node.executed is False and not node.continue_on_fail: self._pipeline_task_status_failed = True if node.job and not node.job.is_stopped(): node.job.abort() elif not node.job and not node.executed: # mark Node as skipped if it has no Job object and it is not executed node.skip_job = True # if this is a standalone node, we need to remove it from the graph if not node.parents: # check if this node is anyone's parent found_parent = False for v in list(self._nodes.values()): if node.name in (v.parents or []): found_parent = True break if not found_parent: self._nodes.pop(node.name, None) # visualize pipeline state (plot) self.update_execution_plot() self._scan_monitored_nodes() if self._stop_event: # noinspection PyBroadException try: self._stop_event.set() except Exception: pass def update_execution_plot(self): # type: () -> () """ Update the Sankey diagram of the current pipeline """ with self._reporting_lock: self._update_eager_generated_steps() super(PipelineDecorator, self).update_execution_plot() def _update_eager_generated_steps(self): # noinspection PyProtectedMember self._task.reload() artifacts = self._task.data.execution.artifacts # check if we have a new step on the DAG eager_artifacts = [] for a in artifacts: if a.key and a.key.startswith('{}:'.format(self._eager_step_artifact)): # expected value: '"eager_step":"parent-node-task-id":"eager-step-task-id' eager_artifacts.append(a) # verify we have the step, if we do not, add it. delete_artifact_keys = [] for artifact in eager_artifacts: _, parent_step_task_id, eager_step_task_id = artifact.key.split(':', 2) # deserialize node definition eager_node_def = json.loads(artifact.type_data.preview) eager_node_name, eager_node_def = list(eager_node_def.items())[0] # verify we do not have any new nodes on the DAG (i.e.
a step generating a Node eagerly) parent_node = None for node in list(self._nodes.values()): if not node.job and not node.executed: continue t_id = node.executed or node.job.task_id if t_id == parent_step_task_id: parent_node = node break if not parent_node: # should not happen continue new_step_node_name = '{}_{}'.format(parent_node.name, eager_node_name) counter = 1 while new_step_node_name in self._nodes: new_step_node_name = '{}_{}'.format(new_step_node_name, counter) counter += 1 eager_node_def['name'] = new_step_node_name eager_node_def['parents'] = [parent_node.name] is_cached = eager_node_def.pop('is_cached', None) self._nodes[new_step_node_name] = self.Node(**eager_node_def) self._nodes[new_step_node_name].job = RunningJob(existing_task=eager_step_task_id) if is_cached: self._nodes[new_step_node_name].job.force_set_is_cached(is_cached) # make sure we will not rescan it. delete_artifact_keys.append(artifact.key) # remove all processed eager step artifacts if delete_artifact_keys: # noinspection PyProtectedMember self._task._delete_artifacts(delete_artifact_keys) self._force_task_configuration_update() def _create_task_from_function( self, docker, docker_args, docker_bash_setup_script, function, function_input_artifacts, function_kwargs, function_return, auto_connect_frameworks, auto_connect_arg_parser, packages, project_name, task_name, task_type, repo, branch, commit, helper_functions, output_uri=None ): def sanitize(function_source): matched = re.match(r"[\s]*@[\w]*.component[\s\\]*\(", function_source) if matched: function_source = function_source[matched.span()[1]:] # find the last ")" open_parenthesis = 0 last_index = -1 for i, c in enumerate(function_source): if not open_parenthesis and c == ')': last_index = i break elif c == ')': open_parenthesis -= 1 elif c == '(': open_parenthesis += 1 if last_index >= 0: function_source = function_source[last_index+1:].lstrip() return function_source task_definition = CreateFromFunction.create_task_from_function( a_function=function, function_kwargs=function_kwargs or None, function_input_artifacts=function_input_artifacts, function_return=function_return, project_name=project_name, task_name=task_name, task_type=task_type, auto_connect_frameworks=auto_connect_frameworks, auto_connect_arg_parser=auto_connect_arg_parser, repo=repo, branch=branch, commit=commit, packages=packages, docker=docker, docker_args=docker_args, docker_bash_setup_script=docker_bash_setup_script, output_uri=output_uri, helper_functions=helper_functions, dry_run=True, task_template_header=self._task_template_header, _sanitize_function=sanitize, artifact_serialization_function=self._artifact_serialization_function, artifact_deserialization_function=self._artifact_deserialization_function ) return task_definition def _find_executed_node_leaves(self): # type: () -> List[PipelineController.Node] all_parents = set([p for n in list(self._nodes.values()) if n.executed for p in n.parents]) executed_leaves = [name for name, n in list(self._nodes.items()) if n.executed and name not in all_parents] return executed_leaves def _adjust_task_hashing(self, task_hash): # type: (dict) -> dict """ Fix the Task hashing so that parameters pointing to the current Task artifact are encoded using the hash content of the artifact, instead of the Task.id :param task_hash: Task representation dict :return: Adjusted Task representation dict """ if task_hash.get('hyper_params'): updated_params = {} for k, v in task_hash['hyper_params'].items(): if 
k.startswith("{}/".format(CreateFromFunction.input_artifact_section)) and \ str(v).startswith("{}.".format(self._task.id)): task_id, artifact_name = str(v).split(".", 1) if artifact_name in self._task.artifacts: updated_params[k] = self._task.artifacts[artifact_name].hash task_hash['hyper_params'].update(updated_params) return task_hash @classmethod def _wait_for_node(cls, node): pool_period = 5.0 if cls._debug_execute_step_process else 20.0 while True: node.job.wait(pool_period=pool_period, aborted_nonresponsive_as_running=True) job_status = str(node.job.status(force=True)) if ( ( job_status == str(Task.TaskStatusEnum.stopped) and node.job.status_message() == cls._relaunch_status_message ) or (job_status == str(Task.TaskStatusEnum.failed) and not cls._final_failure.get(node.name)) or not node.job.is_stopped() ): sleep(pool_period) else: break @classmethod def component( cls, _func=None, *, return_values=('return_object', ), # type: Union[str, Sequence[str]] name=None, # type: Optional[str] cache=False, # type: bool packages=None, # type: Optional[Union[str, Sequence[str]]] parents=None, # type: Optional[List[str]] execution_queue=None, # type: Optional[str] continue_on_fail=False, # type: bool docker=None, # type: Optional[str] docker_args=None, # type: Optional[str] docker_bash_setup_script=None, # type: Optional[str] task_type=None, # type: Optional[str] auto_connect_frameworks=None, # type: Optional[dict] auto_connect_arg_parser=None, # type: Optional[dict] repo=None, # type: Optional[str] repo_branch=None, # type: Optional[str] repo_commit=None, # type: Optional[str] helper_functions=None, # type: Optional[Sequence[Callable]] monitor_metrics=None, # type: Optional[List[Union[Tuple[str, str], Tuple[(str, str), (str, str)]]]] monitor_artifacts=None, # type: Optional[List[Union[str, Tuple[str, str]]]] monitor_models=None, # type: Optional[List[Union[str, Tuple[str, str]]]] retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa pre_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa post_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa status_change_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, str], None]] # noqa tags=None, # type: Optional[Union[str, Sequence[str]]] output_uri=None # type: Optional[Union[str, bool]] ): # type: (...) -> Callable """ pipeline component function to be executed remotely :param _func: wrapper function :param return_values: Provide a list of names for all the results. Notice! If not provided, no results will be stored as artifacts. :param name: Optional, set the name of the pipeline component task. If not provided, the wrapped function name is used as the pipeline component name :param cache: If True, before launching the new step, after updating with the latest configuration, check if an exact Task with the same parameter/code was already executed. If it was found, use it instead of launching a new Task. Default: False :param packages: Manually specify a list of required packages or a local requirements.txt file. Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt" If not provided, packages are automatically added based on the imports used inside the wrapped function. :param parents: Optional list of parent nodes in the DAG. 
The current step in the pipeline will be sent for execution only after all the parent nodes have been executed successfully. :param execution_queue: Optional, the queue to use for executing this specific step. If not provided, the task will be sent to the pipeline's default execution queue :param continue_on_fail: (default False). If True, a failed step will not cause the pipeline to stop (or marked as failed). Notice, that steps that are connected (or indirectly connected) to the failed step will be skipped. :param docker: Specify the docker image to be used when executing the pipeline step remotely :param docker_args: Add docker execution arguments for the remote execution (use single string for all docker arguments). :param docker_bash_setup_script: Add a bash script to be executed inside the docker before setting up the Task's environment :param task_type: Optional, The task type to be created. Supported values: 'training', 'testing', 'inference', 'data_processing', 'application', 'monitor', 'controller', 'optimizer', 'service', 'qc', 'custom' :param auto_connect_frameworks: Control the frameworks auto connect, see `Task.init` auto_connect_frameworks :param auto_connect_arg_parser: Control the ArgParser auto connect, see `Task.init` auto_connect_arg_parser :param repo: Optional, specify a repository to attach to the function, when remotely executing. Allow users to execute the function inside the specified repository, enabling them to load modules/script from the repository. Notice the execution work directory will be the repository root folder. Supports both git repo url link, and local repository path (automatically converted into the remote git/commit as is currently checkout). Example remote url: 'https://github.com/user/repo.git' Example local repo copy: './repo' -> will automatically store the remote repo url and commit ID based on the locally cloned copy :param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used) :param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used) :param helper_functions: Optional, a list of helper functions to make available for the standalone pipeline step function Task. By default the pipeline step function has no access to any of the other functions, by specifying additional functions here, the remote pipeline step could call the additional functions. Example, assuming we have two functions parse_data(), and load_data(): [parse_data, load_data] :param monitor_metrics: Optional, Automatically log the step's reported metrics also on the pipeline Task. The expected format is a list of pairs metric (title, series) to log: [(step_metric_title, step_metric_series), ] Example: [('test', 'accuracy'), ] Or a list of tuple pairs, to specify a different target metric to use on the pipeline Task: [((step_metric_title, step_metric_series), (target_metric_title, target_metric_series)), ] Example: [[('test', 'accuracy'), ('model', 'accuracy')], ] :param monitor_artifacts: Optional, Automatically log the step's artifacts on the pipeline Task. Provided a list of artifact names created by the step function, these artifacts will be logged automatically also on the Pipeline Task itself. 
Example: ['processed_data', ] (target artifact name on the Pipeline Task will have the same name as the original artifact) Alternatively, provide a list of pairs (source_artifact_name, target_artifact_name): where the first string is the artifact name as it appears on the component Task, and the second is the target artifact name to put on the Pipeline Task Example: [('processed_data', 'final_processed_data'), ] :param monitor_models: Optional, Automatically log the step's output models on the pipeline Task. Provided a list of model names created by the step's Task, they will also appear on the Pipeline itself. Example: ['model_weights', ] To select the latest (lexicographic) model use "model_*", or the last created model with just "*" Example: ['model_weights_*', ] Alternatively, provide a list of pairs (source_model_name, target_model_name): where the first string is the model name as it appears on the component Task, and the second is the target model name to put on the Pipeline Task Example: [('model_weights', 'final_model_weights'), ] :param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry - Integer: In case of node failure, retry the node the number of times indicated by this parameter. - Callable: A function called on node failure. Takes as parameters: the PipelineController instance, the PipelineController.Node that failed and an int representing the number of previous retries for the node that failed. The function must return a `bool`: True if the node should be retried and False otherwise. If True, the node will be re-queued and the number of retries left will be decremented by 1. By default, if this callback is not specified, the function will be retried the number of times indicated by `retry_on_failure`. .. code-block:: py def example_retry_on_failure_callback(pipeline, node, retries): print(node.name, ' failed') # allow up to 5 retries (total of 6 runs) return retries < 5 :param pre_execute_callback: Callback function, called when the step (Task) is created, and before it is sent for execution. Allows a user to modify the Task before launch. Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object. `parameters` are the configuration arguments passed to the ClearmlJob. If the callback's return value is `False`, the Node is skipped and so is any node in the DAG that relies on this node. Notice the `parameters` are already parsed, e.g. `${step1.parameters.Args/param}` is replaced with the relevant value. .. code-block:: py def step_created_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, parameters, # type: dict ): pass :param post_execute_callback: Callback function, called when a step (Task) is completed and other jobs are going to be executed. Allows a user to modify the Task status after completion. .. code-block:: py def step_completed_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, ): pass :param status_change_callback: Callback function, called when the status of a step (Task) changes. Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object. The signature of the function must be as follows: .. code-block:: py def status_change_callback( pipeline, # type: PipelineController, node, # type: PipelineController.Node, previous_status # type: str ): pass :param tags: A list of tags for the specific pipeline step. When executing a Pipeline remotely (i.e.
launching the pipeline from the UI/enqueuing it), this method has no effect. :param output_uri: The storage / output url for this step. This is the default location for output models and other artifacts. Check Task.init reference docs for more info (output_uri is a parameter). :return: function wrapper """ def decorator_wrap(func): _name = name or str(func.__name__) function_return = return_values if isinstance(return_values, (tuple, list)) else [return_values] inspect_func = inspect.getfullargspec(func) # add default argument values if inspect_func.args: default_values = list(inspect_func.defaults or []) default_values = ([None] * (len(inspect_func.args)-len(default_values))) + default_values function_kwargs = {k: v for k, v in zip(inspect_func.args, default_values)} else: function_kwargs = dict() add_step_spec = dict( name=_name, function=func, function_kwargs=function_kwargs, function_return=function_return, cache_executed_step=cache, packages=packages, parents=parents, execution_queue=execution_queue, continue_on_fail=continue_on_fail, docker=docker, docker_args=docker_args, docker_bash_setup_script=docker_bash_setup_script, auto_connect_frameworks=auto_connect_frameworks, auto_connect_arg_parser=auto_connect_arg_parser, task_type=task_type, repo=repo, repo_branch=repo_branch, repo_commit=repo_commit, helper_functions=helper_functions, monitor_metrics=monitor_metrics, monitor_models=monitor_models, monitor_artifacts=monitor_artifacts, pre_execute_callback=pre_execute_callback, post_execute_callback=post_execute_callback, status_change_callback=status_change_callback, tags=tags, output_uri=output_uri ) if cls._singleton: cls._singleton._add_function_step(**add_step_spec) else: cls._added_decorator.append(add_step_spec) @functools.wraps(func) def wrapper(*args, **kwargs): if cls._debug_execute_step_function: args = walk_nested_dict_tuple_list( args, lambda x: x._remoteref() if isinstance(x, LazyEvalWrapper) else x) kwargs = walk_nested_dict_tuple_list( kwargs, lambda x: x._remoteref() if isinstance(x, LazyEvalWrapper) else x) func_return = [] def result_wrapper(a_func_return, return_index): if not a_func_return: a_func_return.append(func(*args, **kwargs)) a_func_return = a_func_return[0] return a_func_return if return_index is None else a_func_return[return_index] if len(function_return) == 1: ret_val = LazyEvalWrapper( callback=functools.partial(result_wrapper, func_return, None), remote_reference=functools.partial(result_wrapper, func_return, None)) cls._ref_lazy_loader_id_to_node_name[id(ret_val)] = _name return ret_val else: return_w = [LazyEvalWrapper( callback=functools.partial(result_wrapper, func_return, i), remote_reference=functools.partial(result_wrapper, func_return, i)) for i, _ in enumerate(function_return)] for i in return_w: cls._ref_lazy_loader_id_to_node_name[id(i)] = _name return return_w # resolve all lazy objects if we have any: kwargs_artifacts = {} for i, v in enumerate(args): kwargs[inspect_func.args[i]] = v # We need to remember when a pipeline step's return value is evaluated by the pipeline # controller, but not when it's done here (as we would remember the step every time). 
# _add_to_evaluated_return_values protects that tid = current_thread().ident cls._add_to_evaluated_return_values[tid] = False kwargs_artifacts.update( { k: walk_nested_dict_tuple_list( v, lambda x: x._remoteref() if isinstance(x, LazyEvalWrapper) else x ) for k, v in kwargs.items() if isinstance(v, LazyEvalWrapper) } ) cls._add_to_evaluated_return_values[tid] = True kwargs = {k: deepcopy(v) for k, v in kwargs.items() if not isinstance(v, LazyEvalWrapper)} # check if we have the singleton if not cls._singleton: # todo: somehow make sure the generated tasks list the parent pipeline as parent original_tags = Task.current_task().get_tags(), Task.current_task().get_system_tags() # This is an adhoc pipeline step, PipelineDecorator._eager_execution_instance = True a_pipeline = PipelineDecorator( name=name, project='DevOps', # it will not actually be used version='0.0.0', pool_frequency=111, add_pipeline_tags=False, target_project=None, ) target_queue = \ PipelineDecorator._default_execution_queue or \ Task.current_task().data.execution.queue if target_queue: PipelineDecorator.set_default_execution_queue(target_queue) else: # if we are not running from a queue, we are probably in debug mode a_pipeline._clearml_job_class = LocalClearmlJob a_pipeline._default_execution_queue = 'mock' # restore tags, the pipeline might add a few Task.current_task().set_tags(original_tags[0]) Task.current_task().set_system_tags(original_tags[1]) # get node name _node_name = _name # check if we are launching the same node twice if _node_name in cls._singleton._launched_step_names: # if we already launched a JOB on the node, this means we are calling the same function/task # twice inside the pipeline, this means we need to replicate the node. _node = cls._singleton._nodes[_node_name].copy() _node.parents = [] # find a new name counter = 1 # Use nodes in `_singleton._nodes` that have not been launched. # First check if we launched the node. # If it wasn't launched we also need to check that the new name of `_node` # points to the original code section it was meant to run. 
# Note that for the first iteration (when `_node.name == _node_name`) # we always increment the name, as the name is always in `_launched_step_names` while _node.name in cls._singleton._launched_step_names or ( _node.name in cls._singleton._nodes and cls._singleton._nodes[_node.name].job_code_section != cls._singleton._nodes[_node_name].job_code_section ): _node.name = "{}_{}".format(_node_name, counter) counter += 1 _node_name = _node.name if _node.name not in cls._singleton._nodes: cls._singleton._nodes[_node.name] = _node # get the node and mark it as launched cls._singleton._launched_step_names.add(_node_name) _node = cls._singleton._nodes[_node_name] cls._retries[_node_name] = 0 cls._retries_callbacks[_node_name] = retry_on_failure if callable(retry_on_failure) else \ (functools.partial(cls._singleton._default_retry_on_failure_callback, max_retries=retry_on_failure) if isinstance(retry_on_failure, int) else cls._singleton._retry_on_failure_callback) # The actual launch is a bit slow, we run it in the background launch_thread = Thread( target=cls._component_launch, args=(_node_name, _node, kwargs_artifacts, kwargs, current_thread().ident)) def results_reference(return_name): # wait until launch is completed if launch_thread and launch_thread.is_alive(): try: launch_thread.join() except: # noqa pass cls._wait_for_node(_node) if not _node.job: if not _node.executed: raise ValueError("Job was not created and is also not cached/executed") return "{}.{}".format(_node.executed, return_name) if _node.job.is_failed() and not _node.continue_on_fail: raise ValueError( 'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id())) _node.executed = _node.job.task_id() return "{}.{}".format(_node.job.task_id(), return_name) def result_wrapper(return_name): # wait until launch is completed if launch_thread and launch_thread.is_alive(): try: launch_thread.join() except: # noqa pass cls._wait_for_node(_node) if (_node.job.is_failed() and not _node.continue_on_fail) or _node.job.is_aborted(): raise ValueError( 'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id()) ) _node.executed = _node.job.task_id() # make sure we mark the current state of the DAG execution tree # so that later we can find the "parents" to the current node _tid = current_thread().ident if cls._add_to_evaluated_return_values.get(_tid, True): if _tid not in cls._evaluated_return_values: cls._evaluated_return_values[_tid] = [] cls._evaluated_return_values[_tid].append(_node.name) task = Task.get_task(_node.job.task_id()) if return_name in task.artifacts: return task.artifacts[return_name].get( deserialization_function=cls._singleton._artifact_deserialization_function ) return task.get_parameters(cast=True)[CreateFromFunction.return_section + "/" + return_name] return_w = [LazyEvalWrapper( callback=functools.partial(result_wrapper, n), remote_reference=functools.partial(results_reference, n)) for n in function_return] for i in return_w: cls._ref_lazy_loader_id_to_node_name[id(i)] = _node_name # start the launch thread now launch_thread.start() return return_w[0] if len(return_w) == 1 else return_w return wrapper return decorator_wrap if _func is None else decorator_wrap(_func) @classmethod def pipeline( cls, _func=None, *, # noqa name, # type: str project, # type: str version=None, # type: Optional[str] return_value=None, # type: Optional[str] default_queue=None, # type: Optional[str] pool_frequency=0.2, # type: float add_pipeline_tags=False, # type: bool target_project=None, # type: Optional[str]
abort_on_failure=False, # type: bool pipeline_execution_queue='services', # type: Optional[str] multi_instance_support=False, # type: bool add_run_number=True, # type: bool args_map=None, # type: dict[str, List[str]] start_controller_locally=False, # type: bool retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa docker=None, # type: Optional[str] docker_args=None, # type: Optional[str] docker_bash_setup_script=None, # type: Optional[str] packages=None, # type: Optional[Union[str, Sequence[str]]] repo=None, # type: Optional[str] repo_branch=None, # type: Optional[str] repo_commit=None, # type: Optional[str] artifact_serialization_function=None, # type: Optional[Callable[[Any], Union[bytes, bytearray]]] artifact_deserialization_function=None, # type: Optional[Callable[[bytes], Any]] output_uri=None # type: Optional[Union[str, bool]] ): # type: (...) -> Callable """ Decorate pipeline logic function. :param name: Provide pipeline name (if main Task exists it overrides its name) :param project: Provide project storing the pipeline (if main Task exists it overrides its project) :param version: Pipeline version. This version allows to uniquely identify the pipeline template execution. Examples for semantic versions: version='1.0.1' , version='23', version='1.2'. If not set, find the latest version of the pipeline and increment it. If no such version is found, default to '1.0.0' :param return_value: Optional, Provide an artifact name to store the pipeline function return object Notice, If not provided the pipeline will not store the pipeline function return value. :param default_queue: default pipeline step queue :param float pool_frequency: The pooling frequency (in minutes) for monitoring experiments / states. :param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all steps (Tasks) created by this pipeline. :param str target_project: If provided, all pipeline steps are cloned into the target project :param bool abort_on_failure: If False (default), failed pipeline steps will not cause the pipeline to stop immediately, instead any step that is not connected (or indirectly connected) to the failed step, will still be executed. Nonetheless, the pipeline itself will be marked failed, unless the failed step was specifically defined with "continue_on_fail=True". If True, any failed step will cause the pipeline to immediately abort, stop all running steps, and mark the pipeline as failed. :param pipeline_execution_queue: remote pipeline execution queue (default 'services' queue). If None is passed, execute the pipeline logic locally (pipeline steps are still executed remotely) :param multi_instance_support: If True, allow multiple calls to the same pipeline function, each call creating a new Pipeline Task. Notice it is recommended to create an additional Task on the "main process" acting as a master pipeline, automatically collecting the execution plots. If multi_instance_support=='parallel' then the pipeline calls are executed in parallel, in the `parallel` case the function calls return None, to collect all pipeline results call `PipelineDecorator.wait_for_multi_pipelines()`. Default False, no multi instance pipeline support. :param add_run_number: If True (default), add the run number of the pipeline to the pipeline name. 
Example, the second time we launch the pipeline "best pipeline", we rename it to "best pipeline #2" :param args_map: Map arguments to their specific configuration section. Arguments not included in this map will default to the `Args` section. For example, for the following code: .. code-block:: py @PipelineDecorator.pipeline(args_map={'sectionA': ['paramA'], 'sectionB': ['paramB', 'paramC']}) def executing_pipeline(paramA, paramB, paramC, paramD): pass Parameters would be stored as: - paramA: sectionA/paramA - paramB: sectionB/paramB - paramC: sectionB/paramC - paramD: Args/paramD :param start_controller_locally: If True, start the controller on the local machine. The steps will run remotely if `PipelineDecorator.run_locally` or `PipelineDecorator.debug_pipeline` are not called. Default: False :param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry - Integer: In case of node failure, retry the node the number of times indicated by this parameter. - Callable: A function called on node failure. Takes as parameters: the PipelineController instance, the PipelineController.Node that failed and an int representing the number of previous retries for the node that failed. The function must return ``True`` if the node should be retried and ``False`` otherwise. If True, the node will be re-queued and the number of retries left will be decremented by 1. By default, if this callback is not specified, the function will be retried the number of times indicated by `retry_on_failure`. .. code-block:: py def example_retry_on_failure_callback(pipeline, node, retries): print(node.name, ' failed') # allow up to 5 retries (total of 6 runs) return retries < 5 :param docker: Select the docker image to be executed in by the remote session :param docker_args: Add docker arguments, pass a single string :param docker_bash_setup_script: Add bash script to be executed inside the docker before setting up the Task's environment :param packages: Manually specify a list of required packages or a local requirements.txt file. Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt" If not provided, packages are automatically added based on the imports used in the function. :param repo: Optional, specify a repository to attach to the function, when remotely executing. Allow users to execute the function inside the specified repository, enabling them to load modules/script from the repository. Notice the execution work directory will be the repository root folder. Supports both git repo url link, and local repository path (automatically converted into the remote git/commit as is currently checkout). Example remote url: 'https://github.com/user/repo.git' Example local repo copy: './repo' -> will automatically store the remote repo url and commit ID based on the locally cloned copy Use empty string ("") to disable any repository auto-detection :param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used) :param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used) :param artifact_serialization_function: A serialization function that takes one parameter of any type which is the object to be serialized. The function should return a `bytes` or `bytearray` object, which represents the serialized object. All parameter/return artifacts uploaded by the pipeline will be serialized using this function. All relevant imports must be done in this function. For example: ..
code-block:: py def serialize(obj): import dill return dill.dumps(obj) :param artifact_deserialization_function: A deserialization function that takes one parameter of type `bytes`, which represents the serialized object. This function should return the deserialized object. All parameter/return artifacts fetched by the pipeline will be deserialized using this function. All relevant imports must be done in this function. For example: .. code-block:: py def deserialize(bytes_): import dill return dill.loads(bytes_) :param output_uri: The storage / output url for this pipeline. This is the default location for output models and other artifacts. Check Task.init reference docs for more info (output_uri is a parameter). The `output_uri` of this pipeline's steps will default to this value. """ def decorator_wrap(func): def internal_decorator(*args, **kwargs): pipeline_kwargs = dict(**(kwargs or {})) pipeline_kwargs_types = dict() inspect_func = inspect.getfullargspec(func) if args: if not inspect_func.args: raise ValueError("Could not parse function arguments") pipeline_kwargs.update({inspect_func.args[i]: v for i, v in enumerate(args)}) # add default function arguments if we have defaults for all arguments if inspect_func.args: default_values = list(inspect_func.defaults or []) default_values = ([None] * (len(inspect_func.args) - len(default_values))) + default_values default_kwargs = {k: v for k, v in zip(inspect_func.args, default_values)} default_kwargs.update(pipeline_kwargs) pipeline_kwargs = default_kwargs if inspect_func.annotations: pipeline_kwargs_types = { str(k): inspect_func.annotations[k] for k in inspect_func.annotations} # run the entire pipeline locally, as python functions if cls._debug_execute_step_function: a_pipeline = PipelineDecorator( name=name, project=project, version=version, pool_frequency=pool_frequency, add_pipeline_tags=add_pipeline_tags, target_project=target_project, abort_on_failure=abort_on_failure, add_run_number=add_run_number, retry_on_failure=retry_on_failure, docker=docker, docker_args=docker_args, docker_bash_setup_script=docker_bash_setup_script, packages=packages, repo=repo, repo_branch=repo_branch, repo_commit=repo_commit, artifact_serialization_function=artifact_serialization_function, artifact_deserialization_function=artifact_deserialization_function, output_uri=output_uri ) ret_val = func(**pipeline_kwargs) LazyEvalWrapper.trigger_all_remote_references() a_pipeline._task.close() return ret_val # check if we are in a multi pipeline force_single_multi_pipeline_call = False if multi_instance_support and cls._multi_pipeline_call_counter >= 0: # check if we are running remotely if not Task.running_locally(): # get the main Task property t = Task.get_task(task_id=get_remote_task_id()) if str(t.task_type) == str(Task.TaskTypes.controller): # noinspection PyBroadException try: # noinspection PyProtectedMember multi_pipeline_call_counter = int( t._get_runtime_properties().get('multi_pipeline_counter', None)) # NOTICE! 
                # check if we are in a multi pipeline
                force_single_multi_pipeline_call = False
                if multi_instance_support and cls._multi_pipeline_call_counter >= 0:
                    # check if we are running remotely
                    if not Task.running_locally():
                        # get the main Task property
                        t = Task.get_task(task_id=get_remote_task_id())
                        if str(t.task_type) == str(Task.TaskTypes.controller):
                            # noinspection PyBroadException
                            try:
                                # noinspection PyProtectedMember
                                multi_pipeline_call_counter = int(
                                    t._get_runtime_properties().get('multi_pipeline_counter', None))

                                # NOTICE! if this is not our call we LEAVE immediately
                                # check if this is our call to start, if not we will wait for the next one
                                if multi_pipeline_call_counter != cls._multi_pipeline_call_counter:
                                    return
                            except Exception:
                                # this is not the one, so we should just run the first
                                # instance and leave immediately
                                force_single_multi_pipeline_call = True

                if default_queue:
                    cls.set_default_execution_queue(default_queue)

                a_pipeline = PipelineDecorator(
                    name=name,
                    project=project,
                    version=version,
                    pool_frequency=pool_frequency,
                    add_pipeline_tags=add_pipeline_tags,
                    target_project=target_project,
                    abort_on_failure=abort_on_failure,
                    add_run_number=add_run_number,
                    retry_on_failure=retry_on_failure,
                    docker=docker,
                    docker_args=docker_args,
                    docker_bash_setup_script=docker_bash_setup_script,
                    packages=packages,
                    repo=repo,
                    repo_branch=repo_branch,
                    repo_commit=repo_commit,
                    artifact_serialization_function=artifact_serialization_function,
                    artifact_deserialization_function=artifact_deserialization_function,
                    output_uri=output_uri
                )

                a_pipeline._args_map = args_map or {}

                if PipelineDecorator._debug_execute_step_process:
                    a_pipeline._clearml_job_class = LocalClearmlJob
                    a_pipeline._default_execution_queue = 'mock'

                a_pipeline._clearml_job_class.register_hashing_callback(a_pipeline._adjust_task_hashing)

                # add pipeline arguments
                for k in pipeline_kwargs:
                    a_pipeline.add_parameter(
                        name=k,
                        default=pipeline_kwargs.get(k),
                        param_type=pipeline_kwargs_types.get(k)
                    )

                # sync multi-pipeline call counter (so we know which one to skip)
                if Task.running_locally() and multi_instance_support and cls._multi_pipeline_call_counter >= 0:
                    # noinspection PyProtectedMember
                    a_pipeline._task._set_runtime_properties(
                        dict(multi_pipeline_counter=str(cls._multi_pipeline_call_counter)))

                # run the actual pipeline
                if not start_controller_locally and \
                        not PipelineDecorator._debug_execute_step_process and pipeline_execution_queue:
                    # rerun the pipeline on a remote machine
                    a_pipeline._task.execute_remotely(queue_name=pipeline_execution_queue)
                    # when we get here it means we are running remotely

                # this will also deserialize the pipeline and arguments
                a_pipeline._start(wait=False)

                # sync arguments back (post deserialization and casting back)
                for k in pipeline_kwargs.keys():
                    if k in a_pipeline.get_parameters():
                        pipeline_kwargs[k] = a_pipeline.get_parameters()[k]

                # this time the pipeline is executed only on the remote machine
                try:
                    pipeline_result = func(**pipeline_kwargs)
                except Exception:
                    a_pipeline.stop(mark_failed=True)
                    raise

                triggered_exception = None
                try:
                    LazyEvalWrapper.trigger_all_remote_references()
                except Exception as ex:
                    triggered_exception = ex

                # make sure we wait for all nodes to finish
                waited = True
                while waited:
                    waited = False
                    for node in list(a_pipeline._nodes.values()):
                        if node.executed or not node.job or \
                                node.job.is_stopped(aborted_nonresponsive_as_running=True):
                            continue
                        cls._wait_for_node(node)
                        waited = True

                # store the pipeline result, if we have any:
                if return_value and pipeline_result is not None:
                    a_pipeline._upload_pipeline_artifact(
                        artifact_name=str(return_value), artifact_object=pipeline_result)

                # now we can stop the pipeline
                a_pipeline.stop()
                # now we can raise the exception
                if triggered_exception:
                    raise triggered_exception

                # Make sure that if we do not need to run all pipelines we forcefully leave the process
                if force_single_multi_pipeline_call:
                    leave_process()
                    # we will never get here

                return pipeline_result

            if multi_instance_support:
                return cls._multi_pipeline_wrapper(
                    func=internal_decorator, parallel=bool(multi_instance_support == 'parallel'))

            return internal_decorator

        return decorator_wrap if _func is None else decorator_wrap(_func)
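    # A minimal usage sketch of the decorator defined above (illustrative only; `step_one`, `run_pipeline`,
    # the project/pipeline names and the argument values are assumptions, not part of this module):
    #
    #     @PipelineDecorator.component(return_values=["data"], cache=True)
    #     def step_one(source_url):
    #         return source_url.upper()
    #
    #     @PipelineDecorator.pipeline(name="demo pipeline", project="examples", version="1.0.0")
    #     def run_pipeline(source_url="s3://bucket/file"):
    #         return step_one(source_url)
    #
    #     if __name__ == "__main__":
    #         PipelineDecorator.run_locally()  # execute the steps as local sub-process Tasks
    #         run_pipeline()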
    @classmethod
    def set_default_execution_queue(cls, default_execution_queue):
        # type: (Optional[str]) -> None
        """
        Set the default execution queue, used when a pipeline step does not specify an execution queue

        :param default_execution_queue: The execution queue to use if no execution queue is provided
        """
        cls._default_execution_queue = str(default_execution_queue) if default_execution_queue else None

    @classmethod
    def run_locally(cls):
        # type: () -> ()
        """
        Set local mode, run all functions locally as subprocess

        Run the full pipeline DAG locally, where steps are executed as sub-process Tasks
        Notice: running the DAG locally assumes the local code execution (i.e. it will not clone & apply git diff)
        """
        cls._debug_execute_step_process = True
        cls._debug_execute_step_function = False

    @classmethod
    def debug_pipeline(cls):
        # type: () -> ()
        """
        Set debugging mode, run all functions locally as functions (serially)
        Run the full pipeline DAG locally, where steps are executed as functions

        Notice:
            running the DAG locally assumes the local code execution (i.e. it will not clone & apply git diff)
            Pipeline steps are executed as functions (no Task will be created), for easier debugging
        """
        cls._debug_execute_step_process = True
        cls._debug_execute_step_function = True

    @classmethod
    def get_current_pipeline(cls):
        # type: () -> "PipelineDecorator"
        """
        Return the currently running pipeline instance
        """
        return cls._singleton

    @classmethod
    def wait_for_multi_pipelines(cls):
        # type: () -> List[Any]
        """
        Wait until all background multi pipeline executions are completed.
        Returns all the pipeline results in call order (first pipeline call at index 0)

        :return: List of return values from the executed pipelines, based on call order.
        """
        return cls._wait_for_multi_pipelines()

    @classmethod
    def _component_launch(cls, node_name, node, kwargs_artifacts, kwargs, tid):
        _node_name = node_name
        _node = node

        # update artifacts kwargs
        for k, v in kwargs_artifacts.items():
            if k in kwargs:
                kwargs.pop(k, None)
                _node.parameters.pop("{}/{}".format(CreateFromFunction.kwargs_section, k), None)
            _node.parameters["{}/{}".format(CreateFromFunction.input_artifact_section, k)] = v
            if v and '.' in str(v):
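                # the value is an input-artifact reference of the form "<producing task id>.<artifact name>";
                # extract the producing task id so the step that generated it can be registered as a parent
                # of this node, preserving the DAG ordering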
                parent_id, _ = str(v).split('.', 1)
                # find parent and push it into the _node.parents
                for n, node in sorted(list(cls._singleton._nodes.items()), reverse=True):
                    if n != _node.name and node.executed and node.executed == parent_id:
                        if n not in _node.parents:
                            _node.parents.append(n)
                        break

        if kwargs:
            leaves = cls._singleton._find_executed_node_leaves()
            _node.parents = (_node.parents or []) + [
                x for x in cls._evaluated_return_values.get(tid, []) if x in leaves
            ]
            for k, v in kwargs.items():
                if v is None or isinstance(v, (float, int, bool, six.string_types)):
                    _node.parameters["{}/{}".format(CreateFromFunction.kwargs_section, k)] = v
                else:
                    # we need to create an artifact
                    artifact_name = 'result_{}_{}'.format(re.sub(r'\W+', '', _node.name), k)
                    cls._singleton._upload_pipeline_artifact(artifact_name=artifact_name, artifact_object=v)
                    _node.parameters["{}/{}".format(CreateFromFunction.input_artifact_section, k)] = \
                        "{}.{}".format(cls._singleton._task.id, artifact_name)

        # verify the new step
        cls._singleton._verify_node(_node)
        # launch the new step
        cls._singleton._launch_node(_node)

        # if this pipeline is an eager execution instance, update the parent pipeline with the new eager step
        if PipelineDecorator._eager_execution_instance and _node.job:
            # check if we need to add the pipeline tag on the new node
            pipeline_tags = [t for t in Task.current_task().get_tags() or []
                             if str(t).startswith(cls._node_tag_prefix)]
            if pipeline_tags and _node.job and _node.job.task:
                pipeline_tags = list(set((_node.job.task.get_tags() or []) + pipeline_tags))
                _node.job.task.set_tags(pipeline_tags)
            # force parent task as pipeline
            _node.job.task._edit(parent=Task.current_task().parent)
            # store the new generated node, so we can later serialize it
            pipeline_dag = cls._singleton._serialize()
            # check if node is cached
            if _node.job.is_cached_task():
                pipeline_dag[_node_name]['is_cached'] = True
            # store entire definition on the parent pipeline
            from clearml.backend_api.services import tasks
            artifact = tasks.Artifact(
                key='{}:{}:{}'.format(cls._eager_step_artifact, Task.current_task().id, _node.job.task_id()),
                type="json",
                mode='output',
                type_data=tasks.ArtifactTypeData(
                    preview=json.dumps({_node_name: pipeline_dag[_node_name]}),
                    content_type='application/pipeline')
            )
            req = tasks.AddOrUpdateArtifactsRequest(
                task=Task.current_task().parent, artifacts=[artifact], force=True)
            res = Task.current_task().send(req, raise_on_errors=False)
            if not res or not res.response or not res.response.updated:
                # best effort: if updating the parent pipeline artifact failed, silently ignore it
                pass

        # update pipeline execution graph
        cls._singleton.update_execution_plot()

    @classmethod
    def _multi_pipeline_wrapper(
            cls,
            func=None,  # type: Callable
            parallel=False,  # type: bool
    ):
        # type: (...) -> Callable
        """
        Add support for multiple pipeline function calls,
        enabling the execution of multiple instances of the same pipeline from a single script.

        .. code-block:: py

            @PipelineDecorator.pipeline(
                multi_instance_support=True,
                name="custom pipeline logic", project="examples", version="1.0")
            def pipeline(parameter=1):
                print(f"running with parameter={parameter}")

            # run both pipelines (if multi_instance_support=='parallel', run the pipelines in parallel)
            pipeline(parameter=1)
            pipeline(parameter=2)

        :param parallel: If True, the pipeline is running in the background, which implies calling
            the pipeline twice means running the pipelines in parallel.
            Default: False, pipeline function returns when pipeline completes
        :return: Return wrapped pipeline function.
            Notice the return value of the pipeline wrapped function: if parallel==True, return will be None,
            otherwise expect the return of the pipeline wrapped function
        """
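        # Implementation note: unless in debug mode, each pipeline call is executed in a separate subprocess
        # with the ClearML master-process environment variables removed, so the child process creates its own
        # controller Task. The child's task id and return value are passed back through a multiprocessing Queue;
        # in parallel mode the subprocess is left running in the background and collected later by
        # _wait_for_multi_pipelines().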
        def internal_decorator(*args, **kwargs):
            cls._multi_pipeline_call_counter += 1

            # if this is a debug run just call the function (no parallelization).
            if cls._debug_execute_step_function:
                return func(*args, **kwargs)

            def sanitized_env(a_queue, *a_args, **a_kwargs):
                os.environ.pop('CLEARML_PROC_MASTER_ID', None)
                os.environ.pop('TRAINS_PROC_MASTER_ID', None)
                os.environ.pop('CLEARML_TASK_ID', None)
                os.environ.pop('TRAINS_TASK_ID', None)
                if Task.current_task():
                    # noinspection PyProtectedMember
                    Task.current_task()._reset_current_task_obj()
                a_result = func(*a_args, **a_kwargs)
                if a_queue is not None:
                    task_id = Task.current_task().id if Task.current_task() else None
                    a_queue.put((task_id, a_result))
                return a_result

            queue = Queue()

            p = Process(target=sanitized_env, args=(queue, ) + args, kwargs=kwargs)
            # make sure we wait for the subprocess.
            p.daemon = False
            p.start()
            if parallel and Task.running_locally():
                cls._multi_pipeline_instances.append((p, queue))
                return
            else:
                p.join()
                # noinspection PyBroadException
                try:
                    pipeline_task, result = queue.get_nowait()
                except Exception:
                    return None

                # we should update the master Task plot:
                if pipeline_task and Task.current_task():
                    cls._add_pipeline_plots(pipeline_task)

                return result

        if parallel and not cls._atexit_registered:
            cls._atexit_registered = True
            atexit.register(cls._wait_for_multi_pipelines)

        return internal_decorator

    @classmethod
    def _wait_for_multi_pipelines(cls):
        results = []
        if not cls._multi_pipeline_instances:
            return results
        print('Waiting for background pipelines to finish')
        for p, queue in cls._multi_pipeline_instances:
            try:
                p.join()
            except:  # noqa
                pass
            # noinspection PyBroadException
            try:
                pipeline_task, result = queue.get_nowait()
                results.append(result)
                cls._add_pipeline_plots(pipeline_task)
            except Exception:
                pass
        cls._multi_pipeline_instances = []
        return results

    @classmethod
    def _add_pipeline_plots(cls, pipeline_task_id):
        if not Task.current_task():
            return
        from clearml.backend_api.services import events
        res = Task.current_task().send(
            events.GetTaskPlotsRequest(task=pipeline_task_id, iters=1),
            raise_on_errors=False,
            ignore_errors=True,
        )
        execution_flow = None
        execution_details = None
        for p in res.response.plots:
            try:
                if p['metric'] == cls._report_plot_execution_flow['title'] and \
                        p['variant'] == cls._report_plot_execution_flow['series']:
                    execution_flow = json.loads(p['plot_str'])
                elif p['metric'] == cls._report_plot_execution_details['title'] and \
                        p['variant'] == cls._report_plot_execution_details['series']:
                    execution_details = json.loads(p['plot_str'])
                    execution_details['layout']['name'] += ' - ' + str(pipeline_task_id)
            except Exception as ex:
                getLogger('clearml.automation.controller').warning(
                    'Multi-pipeline plot update failed: {}'.format(ex))

        if execution_flow:
            Task.current_task().get_logger().report_plotly(
                title=cls._report_plot_execution_flow['title'],
                series='{} - {}'.format(cls._report_plot_execution_flow['series'], pipeline_task_id),
                iteration=0, figure=execution_flow)
        if execution_details:
            Task.current_task().get_logger().report_plotly(
                title=cls._report_plot_execution_details['title'],
                series='{} - {}'.format(cls._report_plot_execution_details['series'], pipeline_task_id),
                iteration=0, figure=execution_details)
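    # A minimal multi-instance usage sketch (illustrative only; the pipeline name, project and parameter
    # values below are assumptions, not values defined in this module):
    #
    #     @PipelineDecorator.pipeline(
    #         multi_instance_support='parallel', name="demo", project="examples", version="1.0.0")
    #     def my_pipeline(parameter=1):
    #         return parameter * 2
    #
    #     if __name__ == "__main__":
    #         my_pipeline(parameter=1)   # starts in a background subprocess, returns None
    #         my_pipeline(parameter=2)   # starts a second, parallel pipeline run
    #         results = PipelineDecorator.wait_for_multi_pipelines()  # expected [2, 4], in call order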