import atexit
import functools
import inspect
import json
import os
import re
import six
from copy import copy, deepcopy
from datetime import datetime
from logging import getLogger
from multiprocessing import Process, Queue
from multiprocessing.pool import ThreadPool
from threading import Thread, Event, RLock, current_thread
from time import time, sleep
from typing import Sequence, Optional, Mapping, Callable, Any, List, Dict, Union, Tuple

from attr import attrib, attrs
from pathlib2 import Path

from .job import LocalClearmlJob, RunningJob, BaseJob
from .. import Logger
from ..automation import ClearmlJob
from ..backend_api import Session
from ..backend_interface.task.populate import CreateFromFunction
from ..backend_interface.util import get_or_create_project, exact_match_regex
from ..config import get_remote_task_id
from ..debugging.log import LoggerRoot
from ..model import BaseModel, OutputModel
from ..storage.util import hash_dict
from ..task import Task
from ..utilities.process.mp import leave_process
from ..utilities.proxy_object import LazyEvalWrapper, flatten_dictionary, walk_nested_dict_tuple_list


class PipelineController(object):
    """
    Pipeline controller.
    A pipeline is a DAG of base tasks; each task will be cloned (arguments changed as required), executed, and monitored.
    The pipeline process (task) itself can be executed manually or by the clearml-agent services queue.
    Notice: The pipeline controller lives as long as the pipeline itself is being executed.
    """
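    # Minimal usage sketch (illustrative only, not executed by the library). It assumes a configured
    # ClearML server and an existing Task named "step one task" in a project named "examples":
    #
    #   pipe = PipelineController(name="my pipeline", project="examples", version="1.0.0")
    #   pipe.set_default_execution_queue("default")
    #   pipe.add_step(name="stage1", base_task_project="examples", base_task_name="step one task")
    #   pipe.add_step(
    #       name="stage2", parents=["stage1"],
    #       base_task_project="examples", base_task_name="step one task",
    #       parameter_override={"Args/input_file": "${stage1.artifacts.data.url}"},
    #   )
    #   pipe.start_locally(run_pipeline_steps_locally=True)  # or pipe.start(queue="services")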
    _tag = 'pipeline'
    _project_system_tags = ['pipeline', 'hidden']
    _node_tag_prefix = 'pipe:'
    _step_pattern = r"\${[^}]*}"
    _config_section = 'Pipeline'
    _state_artifact_name = 'pipeline_state'
    _args_section = 'Args'
    _pipeline_section = 'pipeline'
    _pipeline_step_ref = 'pipeline'
    _runtime_property_hash = '_pipeline_hash'
    _relaunch_status_message = "Relaunching pipeline step..."
    _reserved_pipeline_names = (_pipeline_step_ref, )
    _task_project_lookup = {}
    _clearml_job_class = ClearmlJob
    _update_execution_plot_interval = 5.*60
    _update_progress_interval = 10.
    _monitor_node_interval = 5.*60
    _report_plot_execution_flow = dict(title='Pipeline', series='Execution Flow')
    _report_plot_execution_details = dict(title='Pipeline Details', series='Execution Details')
    _evaluated_return_values = {}  # TID: pipeline_name
    _add_to_evaluated_return_values = {}  # TID: bool
    _retries = {}  # Node.name: int
    _retries_callbacks = {}  # Node.name: Callable[[PipelineController, PipelineController.Node, int], bool]  # noqa
    _final_failure = {}  # Node.name: bool
    _task_template_header = CreateFromFunction.default_task_template_header

    valid_job_status = ["failed", "cached", "completed", "aborted", "queued", "running", "skipped", "pending"]

    @attrs
    class Node(object):
        name = attrib(type=str)  # pipeline step name
        base_task_id = attrib(type=str, default=None)  # base Task ID to be cloned and launched
        task_factory_func = attrib(type=Callable, default=None)  # alternative to base_task_id, function creating a Task
        queue = attrib(type=str, default=None)  # execution queue name to use
        parents = attrib(type=list, default=None)  # list of parent DAG steps
        timeout = attrib(type=float, default=None)  # execution timeout limit
        parameters = attrib(type=dict, default=None)  # Task hyper-parameters to change
        configurations = attrib(type=dict, default=None)  # Task configuration objects to change
        task_overrides = attrib(type=dict, default=None)  # Task overrides to change
        executed = attrib(type=str, default=None)  # The actual executed Task ID (None if not executed yet)
        status = attrib(type=str, default="pending")  # The Node Task status (cached, aborted, etc.)
        clone_task = attrib(type=bool, default=True)  # If True, clone the base_task_id, then execute the cloned Task
        job = attrib(type=ClearmlJob, default=None)  # ClearmlJob object
        job_type = attrib(type=str, default=None)  # task type (string)
        job_started = attrib(type=float, default=None)  # job startup timestamp (epoch ts in seconds)
        job_ended = attrib(type=float, default=None)  # job completion timestamp (epoch ts in seconds)
        job_code_section = attrib(type=str, default=None)  # pipeline code configuration section name
        skip_job = attrib(type=bool, default=False)  # if True, this step should be skipped
        continue_on_fail = attrib(type=bool, default=False)  # if True, the pipeline continues even if the step failed
        cache_executed_step = attrib(type=bool, default=False)  # if True, this pipeline step should be cached
        return_artifacts = attrib(type=list, default=None)  # List of artifact names returned by the step
        monitor_metrics = attrib(type=list, default=None)  # List of metric title/series to monitor
        monitor_artifacts = attrib(type=list, default=None)  # List of artifact names to monitor
        monitor_models = attrib(type=list, default=None)  # List of models to monitor

        def __attrs_post_init__(self):
            if self.parents is None:
                self.parents = []
            if self.parameters is None:
                self.parameters = {}
            if self.configurations is None:
                self.configurations = {}
            if self.task_overrides is None:
                self.task_overrides = {}
            if self.return_artifacts is None:
                self.return_artifacts = []
            if self.monitor_metrics is None:
                self.monitor_metrics = []
            if self.monitor_artifacts is None:
                self.monitor_artifacts = []
            if self.monitor_models is None:
                self.monitor_models = []

        def copy(self):
            # type: () -> PipelineController.Node
            """
            Return a copy of the current Node, excluding the `job` and `executed` fields

            :return: new Node copy
            """
            new_copy = PipelineController.Node(
                name=self.name,
                **dict((k, deepcopy(v)) for k, v in self.__dict__.items()
                       if k not in ('name', 'job', 'executed', 'task_factory_func'))
            )
            new_copy.task_factory_func = self.task_factory_func
            return new_copy

    def __init__(
            self,
            name,  # type: str
            project,  # type: str
            version,  # type: str
            pool_frequency=0.2,  # type: float
            add_pipeline_tags=False,  # type: bool
            target_project=True,  # type: Optional[Union[str, bool]]
            auto_version_bump=True,  # type: bool
            abort_on_failure=False,  # type: bool
            add_run_number=True,  # type: bool
            retry_on_failure=None,  # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]]   # noqa
            docker=None,  # type: Optional[str]
            docker_args=None,  # type: Optional[str]
            docker_bash_setup_script=None,  # type: Optional[str]
            packages=None,  # type: Optional[Union[str, Sequence[str]]]
            repo=None,  # type: Optional[str]
            repo_branch=None,  # type: Optional[str]
            repo_commit=None  # type: Optional[str]
    ):
        # type: (...) -> None
        """
        Create a new pipeline controller. The newly created object will launch and monitor the new experiments.

        :param name: Provide pipeline name (if main Task exists it overrides its name)
        :param project: Provide project storing the pipeline (if main Task exists it overrides its project)
        :param version: Must provide pipeline version. This version allows you to uniquely identify the pipeline
            template execution. Examples for semantic versions: version='1.0.1', version='23', version='1.2'
        :param float pool_frequency: The polling frequency (in minutes) for monitoring experiments / states.
        :param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all
            steps (Tasks) created by this pipeline.
        :param str target_project: If provided, all pipeline steps are cloned into the target project.
            If True, pipeline steps are stored into the pipeline project
        :param bool auto_version_bump: If True (default), if the same pipeline version already exists
            (with any difference from the current one), the current pipeline version will be bumped to a new version.
            Version bump examples: 1.0.0 -> 1.0.1 , 1.2 -> 1.3, 10 -> 11 etc.
        :param bool abort_on_failure: If False (default), failed pipeline steps will not cause the pipeline
            to stop immediately; instead, any step that is not connected (or indirectly connected) to the failed step
            will still be executed. Nonetheless, the pipeline itself will be marked failed, unless the failed step
            was specifically defined with "continue_on_fail=True".
            If True, any failed step will cause the pipeline to immediately abort, stop all running steps,
            and mark the pipeline as failed.
        :param add_run_number: If True (default), add the run number of the pipeline to the pipeline name.
            Example, the second time we launch the pipeline "best pipeline", we rename it to "best pipeline #2"
        :param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry

            - Integer: In case of node failure, retry the node the number of times indicated by this parameter.
            - Callable: A function called on node failure. Takes as parameters:
              the PipelineController instance, the PipelineController.Node that failed and an int
              representing the number of previous retries for the node that failed.
              The function must return a `bool`: True if the node should be retried and False otherwise.
              If True, the node will be re-queued and the number of retries left will be decremented by 1.
              By default, if this callback is not specified, the node will be retried the number of
              times indicated by `retry_on_failure`.

              .. code-block:: py

                  def example_retry_on_failure_callback(pipeline, node, retries):
                      print(node.name, ' failed')
                      # allow up to 5 retries (total of 6 runs)
                      return retries < 5

        :param docker: Select the docker image to be executed in by the remote session
        :param docker_args: Add docker arguments, pass a single string
        :param docker_bash_setup_script: Add bash script to be executed
            inside the docker before setting up the Task's environment
        :param packages: Manually specify a list of required packages or a local requirements.txt file.
            Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt"
            If not provided, packages are automatically added.
        :param repo: Optional, specify a repository to attach to the pipeline controller, when remotely executing.
            Allow users to execute the controller inside the specified repository, enabling them to load modules/scripts
            from the repository. Notice the execution working directory will be the repository root folder.
            Supports both git repo url link, and local repository path (automatically converted into the remote
            git/commit as currently checked out).
            Example remote url: 'https://github.com/user/repo.git'
            Example local repo copy: './repo' -> will automatically store the remote
            repo url and commit ID based on the locally cloned copy
        :param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used)
        :param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used)
        """
        self._nodes = {}
        self._running_nodes = []
        self._start_time = None
        self._pipeline_time_limit = None
        self._default_execution_queue = None
        self._version = str(version).strip()
        if not self._version or not all(i and i.isnumeric() for i in self._version.split('.')):
            raise ValueError(
                "Pipeline version has to be in a semantic version form, "
                "examples: version='1.0.1', version='1.2', version='23'")
        self._pool_frequency = pool_frequency * 60.
        self._thread = None
        self._pipeline_args = dict()
        self._pipeline_args_desc = dict()
        self._pipeline_args_type = dict()
        self._args_map = dict()
        self._stop_event = None
        self._experiment_created_cb = None
        self._experiment_completed_cb = None
        self._pre_step_callbacks = {}
        self._post_step_callbacks = {}
        self._target_project = target_project
        self._add_pipeline_tags = add_pipeline_tags
        self._task = Task.current_task()
        self._step_ref_pattern = re.compile(self._step_pattern)
        self._reporting_lock = RLock()
        self._pipeline_task_status_failed = None
        self._auto_version_bump = bool(auto_version_bump)
        self._mock_execution = False  # used for nested pipelines (eager execution)
        self._pipeline_as_sub_project = bool(Session.check_min_api_server_version("2.17"))
        self._last_progress_update_time = 0
        if not self._task:
            task_name = name or project or '{}'.format(datetime.now())
            if self._pipeline_as_sub_project:
                parent_project = "{}.pipelines".format(project + '/' if project else '')
                project_name = "{}/{}".format(parent_project, task_name)
            else:
                parent_project = None
                project_name = project or 'Pipelines'

            self._task = Task.init(
                project_name=project_name,
                task_name=task_name,
                task_type=Task.TaskTypes.controller,
                auto_resource_monitoring=False,
                reuse_last_task_id=False
            )
            # make sure the project is hidden
            if self._pipeline_as_sub_project:
                get_or_create_project(
                    self._task.session, project_name=parent_project, system_tags=["hidden"])
                get_or_create_project(
                    self._task.session, project_name=project_name,
                    project_id=self._task.project, system_tags=self._project_system_tags)

        self._task.set_system_tags((self._task.get_system_tags() or []) + [self._tag])
        self._task.set_user_properties(version=self._version)
        self._task.set_base_docker(
            docker_image=docker, docker_arguments=docker_args, docker_setup_bash_script=docker_bash_setup_script
        )
        self._task.set_packages(packages)
        self._task.set_repo(repo, branch=repo_branch, commit=repo_commit)
        self._auto_connect_task = bool(self._task)
        # make sure we add the pipeline tag to the main Task
        if self._task and not self._pipeline_as_sub_project:
            self._task.add_tags([self._tag])

        self._monitored_nodes = {}  # type: Dict[str, dict]
        self._abort_running_steps_on_failure = abort_on_failure
        self._def_max_retry_on_failure = retry_on_failure if isinstance(retry_on_failure, int) else 0
        self._retry_on_failure_callback = retry_on_failure if callable(retry_on_failure) \
            else self._default_retry_on_failure_callback

        # add a direct link to the pipeline page
        if self._pipeline_as_sub_project and self._task:
            if add_run_number and self._task.running_locally():
                self._add_pipeline_name_run_number()
            # noinspection PyProtectedMember
            self._task.get_logger().report_text('ClearML pipeline page: {}'.format(
                '{}/pipelines/{}/experiments/{}'.format(
                    self._task._get_app_server(),
                    self._task.project if self._task.project is not None else '*',
                    self._task.id,
                ))
            )

    def set_default_execution_queue(self, default_execution_queue):
        # type: (Optional[str]) -> None
        """
        Set the default execution queue to use if a pipeline step does not specify an execution queue

        :param default_execution_queue: The execution queue to use if no execution queue is provided
        """
        self._default_execution_queue = str(default_execution_queue) if default_execution_queue else None

    def set_pipeline_execution_time_limit(self, max_execution_minutes):
        # type: (Optional[float]) -> None
        """
        Set the maximum execution time (minutes) for the entire pipeline. Pass None or 0 to disable the execution time limit.

        :param float max_execution_minutes: The maximum time (minutes) for the entire pipeline process. The
            default is ``None``, indicating no time limit.
        """
        self._pipeline_time_limit = max_execution_minutes * 60. if max_execution_minutes else None

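    # Illustrative configuration sketch (comment only; the queue name "default" and the 6-hour limit
    # are assumptions, not library defaults):
    #
    #   pipe.set_default_execution_queue("default")      # used by steps that do not set execution_queue
    #   pipe.set_pipeline_execution_time_limit(6 * 60.)  # abort the whole pipeline after 6 hours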
    def add_step(
            self,
            name,  # type: str
            base_task_id=None,  # type: Optional[str]
            parents=None,  # type: Optional[Sequence[str]]
            parameter_override=None,  # type: Optional[Mapping[str, Any]]
            configuration_overrides=None,  # type: Optional[Mapping[str, Union[str, Mapping]]]
            task_overrides=None,  # type: Optional[Mapping[str, Any]]
            execution_queue=None,  # type: Optional[str]
            monitor_metrics=None,  # type: Optional[List[Union[Tuple[str, str], Tuple[(str, str), (str, str)]]]]
            monitor_artifacts=None,  # type: Optional[List[Union[str, Tuple[str, str]]]]
            monitor_models=None,  # type: Optional[List[Union[str, Tuple[str, str]]]]
            time_limit=None,  # type: Optional[float]
            base_task_project=None,  # type: Optional[str]
            base_task_name=None,  # type: Optional[str]
            clone_base_task=True,  # type: bool
            continue_on_fail=False,  # type: bool
            pre_execute_callback=None,  # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]]  # noqa
            post_execute_callback=None,  # type: Optional[Callable[[PipelineController, PipelineController.Node], None]]  # noqa
            cache_executed_step=False,  # type: bool
            base_task_factory=None,  # type: Optional[Callable[[PipelineController.Node], Task]]
            retry_on_failure=None,  # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]]  # noqa
    ):
        # type: (...) -> bool
        """
        Add a step to the pipeline execution DAG.
        Each step must have a unique name (this name will later be used to address the step)

        :param name: Unique name of the step. For example `stage1`
        :param base_task_id: The Task ID to use for the step. Each time the step is executed,
            the base Task is cloned, then the cloned task will be sent for execution.
        :param parents: Optional list of parent nodes in the DAG.
            The current step in the pipeline will be sent for execution only after all the parent nodes
            have been executed successfully.
        :param parameter_override: Optional parameter overriding dictionary.
            The dict values can reference a previously executed step using the following form '${step_name}'.
            Examples:

            - Artifact access:
              parameter_override={'Args/input_file': '${<step_name>.artifacts.<artifact_name>.url}' }
            - Model access (last model used):
              parameter_override={'Args/input_file': '${<step_name>.models.output.-1.url}' }
            - Parameter access:
              parameter_override={'Args/input_file': '${<step_name>.parameters.Args/input_file}' }
            - Pipeline Task argument (see `Pipeline.add_parameter`):
              parameter_override={'Args/input_file': '${pipeline.<pipeline_parameter>}' }
            - Task ID:
              parameter_override={'Args/input_file': '${stage3.id}' }
        :param configuration_overrides: Optional, override Task configuration objects.
            Expected dictionary of configuration object name and configuration object content.
            Examples:

                {'General': dict(key='value')}
                {'General': 'configuration file content'}
                {'OmegaConf': YAML.dumps(full_hydra_dict)}
        :param task_overrides: Optional task section overriding dictionary.
            The dict values can reference a previously executed step using the following form '${step_name}'.
            Examples:

            - get the latest commit from a specific branch:
              task_overrides={'script.version_num': '', 'script.branch': 'main'}
            - match git repository branch to a previous step:
              task_overrides={'script.branch': '${stage1.script.branch}', 'script.version_num': ''}
            - change container image:
              task_overrides={'container.image': 'nvidia/cuda:11.6.0-devel-ubuntu20.04',
              'container.arguments': '--ipc=host'}
            - match container image to a previous step:
              task_overrides={'container.image': '${stage1.container.image}'}
            - reset requirements (the agent will use the "requirements.txt" inside the repo):
              task_overrides={'script.requirements.pip': ""}
        :param execution_queue: Optional, the queue to use for executing this specific step.
            If not provided, the task will be sent to the default execution queue, as defined on the class
        :param monitor_metrics: Optional, log the step's metrics on the pipeline Task.
            Format is a list of pairs metric (title, series) to log:
            [(step_metric_title, step_metric_series), ]
            Example: [('test', 'accuracy'), ]
            Or a list of tuple pairs, to specify a different target metric to use on the pipeline Task:
            [((step_metric_title, step_metric_series), (target_metric_title, target_metric_series)), ]
            Example: [[('test', 'accuracy'), ('model', 'accuracy')], ]
        :param monitor_artifacts: Optional, log the step's artifacts on the pipeline Task.
            Provided a list of artifact names existing on the step's Task, they will also appear on the Pipeline itself.
            Example: [('processed_data', 'final_processed_data'), ]
            Alternatively, the user can also provide a list of artifacts to monitor
            (target artifact name will be the same as the original artifact name).
            Example: ['processed_data', ]
        :param monitor_models: Optional, log the step's output models on the pipeline Task.
            Provided a list of model names existing on the step's Task, they will also appear on the Pipeline itself.
            Example: [('model_weights', 'final_model_weights'), ]
            Alternatively, the user can also provide a list of models to monitor
            (target model name will be the same as the original model name).
            Example: ['model_weights', ]
            To select the latest (lexicographic) model use "model_*", or the last created model with just "*".
            Example: ['model_weights_*', ]
        :param time_limit: Default None, no time limit.
            Step execution time limit; if exceeded, the Task is aborted and the pipeline is stopped and marked failed.
        :param base_task_project: If base_task_id is not given,
            use the base_task_project and base_task_name combination to retrieve the base_task_id to use for the step.
        :param base_task_name: If base_task_id is not given,
            use the base_task_project and base_task_name combination to retrieve the base_task_id to use for the step.
        :param clone_base_task: If True (default), the pipeline will clone the base task, and modify/enqueue
            the cloned Task. If False, the base-task is used directly, notice it has to be in draft-mode (created).
        :param continue_on_fail: (default False). If True, a failed step will not cause the pipeline to stop
            (or be marked as failed). Notice, that steps that are connected (or indirectly connected)
            to the failed step will be skipped.
        :param pre_execute_callback: Callback function, called when the step (Task) is created
            and before it is sent for execution. Allows a user to modify the Task before launch.
            Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object.
            `parameters` are the configuration arguments passed to the ClearmlJob.

            If the callback returned value is `False`,
            the Node is skipped and so is any node in the DAG that relies on this node.

            Notice the `parameters` are already parsed,
            e.g. `${step1.parameters.Args/param}` is replaced with the relevant value.

            .. code-block:: py

                def step_created_callback(
                    pipeline,  # type: PipelineController,
                    node,  # type: PipelineController.Node,
                    parameters,  # type: dict
                ):
                    pass

        :param post_execute_callback: Callback function, called when a step (Task) is completed
            and before other jobs are executed. Allows a user to modify the Task status after completion.

            .. code-block:: py

                def step_completed_callback(
                    pipeline,  # type: PipelineController,
                    node,  # type: PipelineController.Node,
                ):
                    pass

        :param cache_executed_step: If True, before launching the new step,
            after updating with the latest configuration, check if an exact Task with the same parameter/code
            was already executed. If it was found, use it instead of launching a new Task.
            Default: False, a new cloned copy of base_task is always used.
            Notice: If the git repo reference does not have a specific commit ID, the Task will never be used.
            If `clone_base_task` is False there is no cloning, hence the base_task is used.
        :param base_task_factory: Optional, instead of providing a pre-existing Task,
            provide a Callable function to create the Task (returns Task object)
        :param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry

            - Integer: In case of node failure, retry the node the number of times indicated by this parameter.
            - Callable: A function called on node failure. Takes as parameters:
              the PipelineController instance, the PipelineController.Node that failed and an int
              representing the number of previous retries for the node that failed.
              The function must return a `bool`: True if the node should be retried and False otherwise.
              If True, the node will be re-queued and the number of retries left will be decremented by 1.
              By default, if this callback is not specified, the node will be retried the number of
              times indicated by `retry_on_failure`.

              .. code-block:: py

                  def example_retry_on_failure_callback(pipeline, node, retries):
                      print(node.name, ' failed')
                      # allow up to 5 retries (total of 6 runs)
                      return retries < 5

        :return: True if successful
        """
        # always store callback functions (even when running remotely)
        if pre_execute_callback:
            self._pre_step_callbacks[name] = pre_execute_callback
        if post_execute_callback:
            self._post_step_callbacks[name] = post_execute_callback

        self._verify_node_name(name)

        if not base_task_factory and not base_task_id:
            if not base_task_project or not base_task_name:
                raise ValueError('Either base_task_id or base_task_project/base_task_name must be provided')
            base_task = Task.get_task(
                project_name=base_task_project,
                task_name=base_task_name,
                allow_archived=True,
                task_filter=dict(
                    status=[str(Task.TaskStatusEnum.created), str(Task.TaskStatusEnum.queued),
                            str(Task.TaskStatusEnum.in_progress), str(Task.TaskStatusEnum.published),
                            str(Task.TaskStatusEnum.stopped), str(Task.TaskStatusEnum.completed),
                            str(Task.TaskStatusEnum.closed)],
                )
            )
            if not base_task:
                raise ValueError('Could not find base_task_project={} base_task_name={}'.format(
                    base_task_project, base_task_name))
            if Task.archived_tag in base_task.get_system_tags():
                LoggerRoot.get_base_logger().warning(
                    'Found base_task_project={} base_task_name={} but it is archived'.format(
                        base_task_project, base_task_name))
            base_task_id = base_task.id

        if configuration_overrides is not None:
            # verify we have a dict or a string on all values
            if not isinstance(configuration_overrides, dict) or \
                    not all(isinstance(v, (str, dict)) for v in configuration_overrides.values()):
                raise ValueError("configuration_overrides must be a dictionary, with all values "
                                 "either dicts or strings, got \'{}\' instead".format(configuration_overrides))

        if task_overrides:
            task_overrides = flatten_dictionary(task_overrides, sep='.')

        self._nodes[name] = self.Node(
            name=name, base_task_id=base_task_id, parents=parents or [],
            queue=execution_queue, timeout=time_limit,
            parameters=parameter_override or {},
            configurations=configuration_overrides,
            clone_task=clone_base_task,
            task_overrides=task_overrides,
            cache_executed_step=cache_executed_step,
            continue_on_fail=continue_on_fail,
            task_factory_func=base_task_factory,
            monitor_metrics=monitor_metrics or [],
            monitor_artifacts=monitor_artifacts or [],
            monitor_models=monitor_models or [],
        )
        self._retries[name] = 0
        self._retries_callbacks[name] = retry_on_failure if callable(retry_on_failure) else \
            (functools.partial(self._default_retry_on_failure_callback, max_retries=retry_on_failure)
             if isinstance(retry_on_failure, int) else self._retry_on_failure_callback)

        if self._task and not self._task.running_locally():
            self.update_execution_plot()

        return True

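    # Illustrative add_step sketch (comment only). The project, task and artifact names below are
    # assumptions used for the example, not values defined by the library:
    #
    #   pipe.add_step(
    #       name="preprocess",
    #       base_task_project="examples", base_task_name="data preprocessing",
    #       parameter_override={"Args/seed": 42},
    #   )
    #   pipe.add_step(
    #       name="train", parents=["preprocess"],
    #       base_task_project="examples", base_task_name="model training",
    #       parameter_override={"Args/train_data": "${preprocess.artifacts.processed_data.url}"},
    #       monitor_metrics=[("test", "accuracy")],
    #       cache_executed_step=True,
    #   )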
    def add_function_step(
            self,
            name,  # type: str
            function,  # type: Callable
            function_kwargs=None,  # type: Optional[Dict[str, Any]]
            function_return=None,  # type: Optional[List[str]]
            project_name=None,  # type: Optional[str]
            task_name=None,  # type: Optional[str]
            task_type=None,  # type: Optional[str]
            auto_connect_frameworks=None,  # type: Optional[dict]
            auto_connect_arg_parser=None,  # type: Optional[dict]
            packages=None,  # type: Optional[Union[str, Sequence[str]]]
            repo=None,  # type: Optional[str]
            repo_branch=None,  # type: Optional[str]
            repo_commit=None,  # type: Optional[str]
            helper_functions=None,  # type: Optional[Sequence[Callable]]
            docker=None,  # type: Optional[str]
            docker_args=None,  # type: Optional[str]
            docker_bash_setup_script=None,  # type: Optional[str]
            parents=None,  # type: Optional[Sequence[str]]
            execution_queue=None,  # type: Optional[str]
            monitor_metrics=None,  # type: Optional[List[Union[Tuple[str, str], Tuple[(str, str), (str, str)]]]]
            monitor_artifacts=None,  # type: Optional[List[Union[str, Tuple[str, str]]]]
            monitor_models=None,  # type: Optional[List[Union[str, Tuple[str, str]]]]
            time_limit=None,  # type: Optional[float]
            continue_on_fail=False,  # type: bool
            pre_execute_callback=None,  # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]]  # noqa
            post_execute_callback=None,  # type: Optional[Callable[[PipelineController, PipelineController.Node], None]]  # noqa
            cache_executed_step=False,  # type: bool
            retry_on_failure=None,  # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]]  # noqa
    ):
        # type: (...) -> bool
        """
        Create a Task from a function, including wrapping the function input arguments
        into the hyper-parameter section as kwargs, and storing function results as named artifacts

        Example:

        .. code-block:: py

            def mock_func(a=6, b=9):
                c = a * b
                print(a, b, c)
                return c, c**2

            add_function_step(name='mock', function=mock_func, function_return=['mul', 'square'])

        Example arguments from other Tasks (artifact):

        .. code-block:: py

            def mock_func(matrix_np):
                c = matrix_np * matrix_np
                print(matrix_np, c)
                return c

            add_function_step(
                name='mock',
                function=mock_func,
                function_kwargs={'matrix_np': 'aabb1122.previous_matrix'},
                function_return=['square_matrix']
            )

        :param name: Unique name of the step. For example `stage1`
        :param function: A global function to convert into a standalone Task
        :param function_kwargs: Optional, provide a subset of the function arguments and default values to expose.
            If not provided, automatically take all function arguments and their defaults.
            Optional, pass input arguments to the function from other Tasks' output artifacts.
            Example argument named `numpy_matrix` from Task ID `aabbcc` artifact name `answer`:
            {'numpy_matrix': 'aabbcc.answer'}
        :param function_return: Provide a list of names for all the results.
            If not provided, no results will be stored as artifacts.
        :param project_name: Set the project name for the task. Required if base_task_id is None.
        :param task_name: Set the name of the remote task, if not provided use `name` argument.
        :param task_type: Optional, The task type to be created. Supported values: 'training', 'testing', 'inference',
            'data_processing', 'application', 'monitor', 'controller', 'optimizer', 'service', 'qc', 'custom'
        :param auto_connect_frameworks: Control the frameworks auto connect, see `Task.init` auto_connect_frameworks
        :param auto_connect_arg_parser: Control the ArgParser auto connect, see `Task.init` auto_connect_arg_parser
        :param packages: Manually specify a list of required packages or a local requirements.txt file.
            Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt"
            If not provided, packages are automatically added based on the imports used in the function.
        :param repo: Optional, specify a repository to attach to the function, when remotely executing.
            Allow users to execute the function inside the specified repository, enabling it to load modules/scripts
            from the repository. Notice the execution working directory will be the repository root folder.
            Supports both git repo url link, and local repository path.
            Example remote url: 'https://github.com/user/repo.git'
            Example local repo copy: './repo' -> will automatically store the remote
            repo url and commit ID based on the locally cloned copy
        :param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used)
        :param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used)
        :param helper_functions: Optional, a list of helper functions to make available
            for the standalone function Task.
        :param docker: Select the docker image to be executed in by the remote session
        :param docker_args: Add docker arguments, pass a single string
        :param docker_bash_setup_script: Add bash script to be executed
            inside the docker before setting up the Task's environment
        :param parents: Optional list of parent nodes in the DAG.
            The current step in the pipeline will be sent for execution only after all the parent nodes
            have been executed successfully.
        :param execution_queue: Optional, the queue to use for executing this specific step.
            If not provided, the task will be sent to the default execution queue, as defined on the class
        :param monitor_metrics: Optional, log the step's metrics on the pipeline Task.
            Format is a list of pairs metric (title, series) to log:
            [(step_metric_title, step_metric_series), ]
            Example: [('test', 'accuracy'), ]
            Or a list of tuple pairs, to specify a different target metric to use on the pipeline Task:
            [((step_metric_title, step_metric_series), (target_metric_title, target_metric_series)), ]
            Example: [[('test', 'accuracy'), ('model', 'accuracy')], ]
        :param monitor_artifacts: Optional, log the step's artifacts on the pipeline Task.
            Provided a list of artifact names existing on the step's Task, they will also appear on the Pipeline itself.
            Example: [('processed_data', 'final_processed_data'), ]
            Alternatively, the user can also provide a list of artifacts to monitor
            (target artifact name will be the same as the original artifact name).
            Example: ['processed_data', ]
        :param monitor_models: Optional, log the step's output models on the pipeline Task.
            Provided a list of model names existing on the step's Task, they will also appear on the Pipeline itself.
            Example: [('model_weights', 'final_model_weights'), ]
            Alternatively, the user can also provide a list of models to monitor
            (target model name will be the same as the original model name).
            Example: ['model_weights', ]
            To select the latest (lexicographic) model use "model_*", or the last created model with just "*".
            Example: ['model_weights_*', ]
        :param time_limit: Default None, no time limit.
            Step execution time limit; if exceeded, the Task is aborted and the pipeline is stopped and marked failed.
        :param continue_on_fail: (default False). If True, a failed step will not cause the pipeline to stop
            (or be marked as failed). Notice, that steps that are connected (or indirectly connected)
            to the failed step will be skipped.
        :param pre_execute_callback: Callback function, called when the step (Task) is created
            and before it is sent for execution. Allows a user to modify the Task before launch.
            Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object.
            `parameters` are the configuration arguments passed to the ClearmlJob.

            If the callback returned value is `False`,
            the Node is skipped and so is any node in the DAG that relies on this node.

            Notice the `parameters` are already parsed,
            e.g. `${step1.parameters.Args/param}` is replaced with the relevant value.

            .. code-block:: py

                def step_created_callback(
                    pipeline,  # type: PipelineController,
                    node,  # type: PipelineController.Node,
                    parameters,  # type: dict
                ):
                    pass

        :param post_execute_callback: Callback function, called when a step (Task) is completed
            and before other jobs are executed. Allows a user to modify the Task status after completion.

            .. code-block:: py

                def step_completed_callback(
                    pipeline,  # type: PipelineController,
                    node,  # type: PipelineController.Node,
                ):
                    pass

        :param cache_executed_step: If True, before launching the new step,
            after updating with the latest configuration, check if an exact Task with the same parameter/code
            was already executed. If it was found, use it instead of launching a new Task.
            Default: False, a new cloned copy of base_task is always used.
            Notice: If the git repo reference does not have a specific commit ID, the Task will never be used.

        :param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry

            - Integer: In case of node failure, retry the node the number of times indicated by this parameter.
            - Callable: A function called on node failure. Takes as parameters:
              the PipelineController instance, the PipelineController.Node that failed and an int
              representing the number of previous retries for the node that failed.
              The function must return a `bool`: True if the node should be retried and False otherwise.
              If True, the node will be re-queued and the number of retries left will be decremented by 1.
              By default, if this callback is not specified, the node will be retried the number of
              times indicated by `retry_on_failure`.

              .. code-block:: py

                  def example_retry_on_failure_callback(pipeline, node, retries):
                      print(node.name, ' failed')
                      # allow up to 5 retries (total of 6 runs)
                      return retries < 5

        :return: True if successful
        """
        function_kwargs = function_kwargs or {}
        default_kwargs = inspect.getfullargspec(function)
        if default_kwargs and default_kwargs.args and default_kwargs.defaults:
            for key, val in zip(default_kwargs.args[-len(default_kwargs.defaults):], default_kwargs.defaults):
                function_kwargs.setdefault(key, val)

        return self._add_function_step(
            name=name,
            function=function,
            function_kwargs=function_kwargs,
            function_return=function_return,
            project_name=project_name,
            task_name=task_name,
            task_type=task_type,
            auto_connect_frameworks=auto_connect_frameworks,
            auto_connect_arg_parser=auto_connect_arg_parser,
            packages=packages,
            repo=repo,
            repo_branch=repo_branch,
            repo_commit=repo_commit,
            helper_functions=helper_functions,
            docker=docker,
            docker_args=docker_args,
            docker_bash_setup_script=docker_bash_setup_script,
            parents=parents,
            execution_queue=execution_queue,
            monitor_metrics=monitor_metrics,
            monitor_artifacts=monitor_artifacts,
            monitor_models=monitor_models,
            time_limit=time_limit,
            continue_on_fail=continue_on_fail,
            pre_execute_callback=pre_execute_callback,
            post_execute_callback=post_execute_callback,
            cache_executed_step=cache_executed_step,
            retry_on_failure=retry_on_failure,
        )

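    # Illustrative function-step sketch (comment only; `preprocess_dataset` and the package pins are
    # assumptions for the example):
    #
    #   def preprocess_dataset(dataset_url, test_ratio=0.2):
    #       # ... load, split and return the processed data ...
    #       return train_set, test_set
    #
    #   pipe.add_function_step(
    #       name="preprocess",
    #       function=preprocess_dataset,
    #       function_kwargs={"dataset_url": "${pipeline.dataset_url}"},
    #       function_return=["train_set", "test_set"],
    #       packages=["pandas", "scikit-learn"],
    #       cache_executed_step=True,
    #   )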
    def start(
            self,
            queue='services',
            step_task_created_callback=None,  # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]]  # noqa
            step_task_completed_callback=None,  # type: Optional[Callable[[PipelineController, PipelineController.Node], None]]  # noqa
            wait=True,
    ):
        # type: (...) -> bool
        """
        Start the current pipeline remotely (on the selected services queue).
        The current process will be stopped and launched remotely.

        :param queue: queue name to launch the pipeline on
        :param Callable step_task_created_callback: Callback function, called when a step (Task) is created
            and before it is sent for execution. Allows a user to modify the Task before launch.
            Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object.
            `parameters` are the configuration arguments passed to the ClearmlJob.

            If the callback returned value is `False`,
            the Node is skipped and so is any node in the DAG that relies on this node.

            Notice the `parameters` are already parsed,
            e.g. `${step1.parameters.Args/param}` is replaced with the relevant value.

            .. code-block:: py

                def step_created_callback(
                    pipeline,  # type: PipelineController,
                    node,  # type: PipelineController.Node,
                    parameters,  # type: dict
                ):
                    pass

        :param Callable step_task_completed_callback: Callback function, called when a step (Task) is completed
            and before other jobs are executed. Allows a user to modify the Task status after completion.

            .. code-block:: py

                def step_completed_callback(
                    pipeline,  # type: PipelineController,
                    node,  # type: PipelineController.Node,
                ):
                    pass

        :param wait: If True (default), start the pipeline controller, return only
            after the pipeline is done (completed/aborted/failed)

        :return: True, if the controller started. False, if the controller did not start.
        """
        if not self._task:
            raise ValueError(
                "Could not find main Task, "
                "PipelineController must be created with `always_create_task=True`")

        # serialize state only if we are running locally
        if Task.running_locally() or not self._task.is_main_task():
            self._verify()
            self._serialize_pipeline_task()
            self.update_execution_plot()

        # stop the current Task and execute remotely, or no-op
        self._task.execute_remotely(queue_name=queue, exit_process=True, clone=False)

        if not Task.running_locally() and self._task.is_main_task():
            self._start(
                step_task_created_callback=step_task_created_callback,
                step_task_completed_callback=step_task_completed_callback,
                wait=wait
            )

        return True

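    # Illustrative launch sketch (comment only; the "services" queue is the documented default and the
    # callback below is a user-supplied assumption):
    #
    #   def on_step_done(pipeline, node):
    #       print("step", node.name, "finished with status", node.status)
    #
    #   pipe.start(queue="services", step_task_completed_callback=on_step_done)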
    def start_locally(self, run_pipeline_steps_locally=False):
        # type: (bool) -> None
        """
        Start the current pipeline locally, meaning the pipeline logic is running on the current machine,
        instead of on the `services` queue.

        Using run_pipeline_steps_locally=True you can run all the pipeline steps locally as sub-processes.
        Notice: when running pipeline steps locally, it assumes local code execution
        (i.e. it is running the local code as is, regardless of the git commit/diff on the pipeline steps Task)

        :param run_pipeline_steps_locally: (default False) If True, run the pipeline steps themselves locally as a
            subprocess (use for debugging the pipeline locally, notice the pipeline code is expected to be available
            on the local machine)
        """
        if not self._task:
            raise ValueError(
                "Could not find main Task, "
                "PipelineController must be created with `always_create_task=True`")

        if run_pipeline_steps_locally:
            self._clearml_job_class = LocalClearmlJob
            self._default_execution_queue = self._default_execution_queue or 'mock'

        # serialize state only if we are running locally
        if Task.running_locally() or not self._task.is_main_task():
            self._verify()
            self._serialize_pipeline_task()

        self._start(wait=True)

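    # Illustrative local-debugging sketch (comment only): run the pipeline logic and every step as
    # sub-processes on the current machine, then check the outcome.
    #
    #   pipe.start_locally(run_pipeline_steps_locally=True)
    #   print("pipeline succeeded:", pipe.is_successful())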
    def create_draft(self):
        # type: () -> None
        """
        Optional, manually create & serialize the Pipeline Task (use with care for manual multi pipeline creation).

        **Notice** The recommended flow would be to call `pipeline.start(queue=None)`
        which would have a similar effect and will allow you to clone/enqueue later on.

        After calling ``create_draft()``, users can edit the pipeline in the UI and enqueue it for execution.

        Notice: this function should be used to programmatically create a pipeline for later use.
        To automatically create and launch pipelines, call the `start()` method.
        """
        self._verify()
        self._serialize_pipeline_task()
        self._task.close()
        self._task.reset()

    @classmethod
    def get_logger(cls):
        # type: () -> Logger
        """
        Return a logger connected to the Pipeline Task.
        The logger can be used by any function/tasks executed by the pipeline, in order to report
        directly to the pipeline Task itself. It can also be called from the main pipeline control Task.

        Raise ValueError if the main Pipeline task could not be located.

        :return: Logger object for reporting metrics (scalars, plots, debug samples etc.)
        """
        return cls._get_pipeline_task().get_logger()

    @classmethod
    def upload_model(cls, model_name, model_local_path):
        # type: (str, str) -> OutputModel
        """
        Upload (add) a model to the main Pipeline Task object.
        This function can be called from any pipeline component to directly add models into the main pipeline Task.

        The model file/path will be uploaded to the Pipeline Task and registered on the model repository.

        Raise ValueError if the main Pipeline task could not be located.

        :param model_name: Model name as will appear in the model registry (in the pipeline's project)
        :param model_local_path: Path to the local model file or directory to be uploaded.
            If a local directory is provided, the content of the folder (recursively) will be
            packaged into a zip file and uploaded
        """
        task = cls._get_pipeline_task()
        model_name = str(model_name)
        model_local_path = Path(model_local_path)
        out_model = OutputModel(task=task, name=model_name)
        out_model.update_weights(weights_filename=model_local_path.as_posix())
        return out_model

    @classmethod
    def upload_artifact(
            cls,
            name,  # type: str
            artifact_object,  # type: Any
            metadata=None,  # type: Optional[Mapping]
            delete_after_upload=False,  # type: bool
            auto_pickle=True,  # type: bool
            preview=None,  # type: Any
            wait_on_upload=False,  # type: bool
    ):
        # type: (...) -> bool
        """
        Upload (add) an artifact to the main Pipeline Task object.
        This function can be called from any pipeline component to directly add artifacts into the main pipeline Task.

        The artifact can be uploaded by any function/tasks executed by the pipeline, in order to report
        directly to the pipeline Task itself. It can also be called from the main pipeline control Task.

        Raise ValueError if the main Pipeline task could not be located.

        The currently supported upload artifact types include:

        - string / Path - A path to an artifact file. If a wildcard or a folder is specified, then ClearML
          creates and uploads a ZIP file.
        - dict - ClearML stores a dictionary as ``.json`` file and uploads it.
        - pandas.DataFrame - ClearML stores a pandas.DataFrame as ``.csv.gz`` (compressed CSV) file and uploads it.
        - numpy.ndarray - ClearML stores a numpy.ndarray as ``.npz`` file and uploads it.
        - PIL.Image - ClearML stores a PIL.Image as ``.png`` file and uploads it.
        - Any - If called with auto_pickle=True, the object will be pickled and uploaded.

        :param str name: The artifact name.

            .. warning::
                If an artifact with the same name was previously uploaded, then it is overwritten.

        :param object artifact_object: The artifact object.
        :param dict metadata: A dictionary of key-value pairs for any metadata. This dictionary appears with the
            experiment in the **ClearML Web-App (UI)**, **ARTIFACTS** tab.
        :param bool delete_after_upload: After the upload, delete the local copy of the artifact

            - ``True`` - Delete the local copy of the artifact.
            - ``False`` - Do not delete. (default)

        :param bool auto_pickle: If True (default), and the artifact_object is not one of the following types:
            pathlib2.Path, dict, pandas.DataFrame, numpy.ndarray, PIL.Image, url (string), local_file (string),
            the artifact_object will be pickled and uploaded as a pickle file artifact (with file extension .pkl)

        :param Any preview: The artifact preview

        :param bool wait_on_upload: Whether the upload should be synchronous, forcing the upload to complete
            before continuing.

        :return: The status of the upload.

            - ``True`` - Upload succeeded.
            - ``False`` - Upload failed.

        :raise: If the artifact object type is not supported, raise a ``ValueError``.
        """
        task = cls._get_pipeline_task()
        return task.upload_artifact(
            name=name, artifact_object=artifact_object, metadata=metadata, delete_after_upload=delete_after_upload,
            auto_pickle=auto_pickle, preview=preview, wait_on_upload=wait_on_upload)

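    # Illustrative reporting sketch (comment only): called from inside any pipeline step to report
    # directly to the main pipeline Task. The metric names and the `results` dict are assumptions.
    #
    #   PipelineController.get_logger().report_scalar(
    #       title="test", series="accuracy", value=0.93, iteration=0)
    #   PipelineController.upload_artifact(name="summary", artifact_object=results)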
    def stop(self, timeout=None, mark_failed=False, mark_aborted=False):
        # type: (Optional[float], bool, bool) -> ()
        """
        Stop the pipeline controller and the optimization thread.
        If mark_failed and mark_aborted are False (default), mark the pipeline as completed,
        unless one of the steps failed, then mark the pipeline as failed.

        :param timeout: Wait timeout for the optimization thread to exit (minutes).
            The default is ``None``, indicating no timeout (wait for the thread to exit).
        :param mark_failed: If True, mark the pipeline task as failed. (default False)
        :param mark_aborted: If True, mark the pipeline task as aborted. (default False)
        """
        self._stop_event.set()

        self.wait(timeout=timeout)
        if not self._task:
            return

        # sync pipeline state
        self.update_execution_plot()

        self._task.close()
        if mark_failed:
            self._task.mark_failed(status_reason='Pipeline aborted and failed', force=True)
        elif mark_aborted:
            self._task.mark_stopped(status_message='Pipeline aborted', force=True)
        elif self._pipeline_task_status_failed:
            print('Setting pipeline controller Task as failed (due to failed steps) !')
            self._task.mark_failed(status_reason='Pipeline step failed', force=True)

    def wait(self, timeout=None):
        # type: (Optional[float]) -> bool
        """
        Wait for the pipeline to finish.

        .. note::
            This method does not stop the pipeline. Call :meth:`stop` to terminate the pipeline.

        :param float timeout: The timeout to wait for the pipeline to complete (minutes).
            If ``None``, wait until the pipeline completes.

        :return: True, if the pipeline finished. False, if the pipeline timed out.
        """
        if not self.is_running():
            return True

        if timeout is not None:
            timeout *= 60.

        _thread = self._thread

        _thread.join(timeout=timeout)
        if _thread.is_alive():
            return False

        return True

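    # Illustrative wait/stop sketch (comment only): poll the controller in 10-minute slices and abort it
    # if an external condition (an assumption here) says so.
    #
    #   while not pipe.wait(timeout=10.0):
    #       if should_abort():  # user-defined condition
    #           pipe.stop(mark_aborted=True)
    #           break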
    def is_running(self):
        # type: () -> bool
        """
        Return True if the pipeline controller is running.

        :return: A boolean indicating whether the pipeline controller is active (still running) or stopped.
        """
        return self._thread is not None and self._thread.is_alive()

    def is_successful(self):
        # type: () -> bool
        """
        Return True if the pipeline controller is fully executed and none of the steps / Tasks failed

        :return: A boolean indicating whether all steps did not fail
        """
        return self._thread and not self.is_running() and not self._pipeline_task_status_failed

    def elapsed(self):
        # type: () -> float
        """
        Return minutes elapsed from the controller starting timestamp.

        :return: The minutes from controller start time. A negative value means the process has not started yet.
        """
        if self._start_time is None:
            return -1.0
        return (time() - self._start_time) / 60.

    def get_pipeline_dag(self):
        # type: () -> Mapping[str, PipelineController.Node]
        """
        Return the pipeline execution graph, each node in the DAG is a PipelineController.Node object.
        The graph itself is a dictionary of Nodes (keyed by the Node name),
        each node holds links to its parent Nodes (identified by their unique names)

        :return: execution tree, as a nested dictionary. Example:

        .. code-block:: py

            {
                'stage1' : Node() {
                    name: 'stage1'
                    job: ClearmlJob
                    ...
                },
            }

        """
        return self._nodes

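    # Illustrative DAG inspection sketch (comment only): walk the nodes after (or while) the pipeline runs.
    #
    #   for step_name, node in pipe.get_pipeline_dag().items():
    #       print(step_name, "parents:", node.parents, "status:", node.status, "executed task:", node.executed)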
    def get_processed_nodes(self):
        # type: () -> Sequence[PipelineController.Node]
        """
        Return the processed pipeline nodes; each entry is a PipelineController.Node object.

        :return: executed (excluding currently executing) nodes
        """
        return {k: n for k, n in self._nodes.items() if n.executed}

    def get_running_nodes(self):
        # type: () -> Sequence[PipelineController.Node]
        """
        Return the currently running pipeline nodes;
        each entry is a PipelineController.Node object.

        :return: Currently running nodes
        """
        return {k: n for k, n in self._nodes.items() if k in self._running_nodes}

    def update_execution_plot(self):
        # type: () -> ()
        """
        Update the Sankey diagram of the current pipeline
        """
        with self._reporting_lock:
            self._update_execution_plot()
        # also trigger node monitor scanning
        self._scan_monitored_nodes()

    def add_parameter(self, name, default=None, description=None, param_type=None):
        # type: (str, Optional[Any], Optional[str], Optional[str]) -> None
        """
        Add a parameter to the pipeline Task.
        The parameter can be used as an input parameter for any step in the pipeline.
        Notice all parameters will appear under the PipelineController Task's Hyper-parameters -> Pipeline section.
        Example: pipeline.add_parameter(name='dataset', description='dataset ID to process the pipeline')
        Then in one of the steps we can refer to the value of the parameter with '${pipeline.dataset}'

        :param name: String name of the parameter.
        :param default: Default value to be put as the default value (can be later changed in the UI)
        :param description: String description of the parameter and its usage in the pipeline
        :param param_type: Optional, parameter type information (to be used as a hint for casting and description)
        """
        self._pipeline_args[str(name)] = default
        if description:
            self._pipeline_args_desc[str(name)] = str(description)
        if param_type:
            self._pipeline_args_type[str(name)] = param_type

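    # Illustrative parameter sketch (comment only; the parameter name, default value and step below are
    # assumptions for the example):
    #
    #   pipe.add_parameter(name="dataset_id", default="aabbcc", description="dataset ID to process")
    #   pipe.add_step(
    #       name="preprocess",
    #       base_task_project="examples", base_task_name="data preprocessing",
    #       parameter_override={"Args/dataset_id": "${pipeline.dataset_id}"},
    #   )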
    def get_parameters(self):
        # type: () -> dict
        """
        Return the pipeline parameters dictionary

        :return: Dictionary str -> str
        """
        return self._pipeline_args

    def _create_task_from_function(
            self, docker, docker_args, docker_bash_setup_script,
            function, function_input_artifacts, function_kwargs, function_return,
            auto_connect_frameworks, auto_connect_arg_parser,
            packages, project_name, task_name, task_type, repo, branch, commit, helper_functions
    ):
        task_definition = CreateFromFunction.create_task_from_function(
            a_function=function,
            function_kwargs=function_kwargs or None,
            function_input_artifacts=function_input_artifacts,
            function_return=function_return,
            project_name=project_name,
            task_name=task_name,
            task_type=task_type,
            auto_connect_frameworks=auto_connect_frameworks,
            auto_connect_arg_parser=auto_connect_arg_parser,
            repo=repo,
            branch=branch,
            commit=commit,
            packages=packages,
            docker=docker,
            docker_args=docker_args,
            docker_bash_setup_script=docker_bash_setup_script,
            output_uri=None,
            helper_functions=helper_functions,
            dry_run=True,
            task_template_header=self._task_template_header
        )
        return task_definition

def _start(
|
|
self,
|
|
step_task_created_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa
|
|
step_task_completed_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa
|
|
wait=True,
|
|
):
|
|
# type: (...) -> bool
|
|
"""
|
|
Start the pipeline controller.
|
|
If the calling process is stopped, then the controller stops as well.
|
|
|
|
:param Callable step_task_created_callback: Callback function, called when a step (Task) is created
|
|
and before it is sent for execution. Allows a user to modify the Task before launch.
|
|
Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object.
|
|
`parameters` are the configuration arguments passed to the ClearmlJob.
|
|
|
|
If the callback returned value is `False`,
|
|
the Node is skipped and so is any node in the DAG that relies on this node.
|
|
|
|
Notice the `parameters` are already parsed,
|
|
e.g. `${step1.parameters.Args/param}` is replaced with relevant value.
|
|
|
|
.. code-block:: py
|
|
|
|
def step_created_callback(
|
|
pipeline, # type: PipelineController,
|
|
node, # type: PipelineController.Node,
|
|
parameters, # type: dict
|
|
):
|
|
pass
|
|
|
|
:param Callable step_task_completed_callback: Callback function, called when a step (Task) is completed
|
|
            and before the following steps (Tasks) are launched. Allows a user to modify the Task status after completion.
|
|
|
|
.. code-block:: py
|
|
|
|
def step_completed_callback(
|
|
pipeline, # type: PipelineController,
|
|
node, # type: PipelineController.Node,
|
|
):
|
|
pass
|
|
        :param wait: If True (default), start the pipeline controller and return only
|
|
after the pipeline is done (completed/aborted/failed)
|
|
|
|
:return: True, if the controller started. False, if the controller did not start.
|
|
|
|
"""
|
|
if self._thread:
|
|
return True
|
|
|
|
self._prepare_pipeline(step_task_completed_callback, step_task_created_callback)
|
|
self._thread = Thread(target=self._daemon)
|
|
self._thread.daemon = True
|
|
self._thread.start()
|
|
|
|
if wait:
|
|
self.wait()
|
|
self.stop()
|
|
|
|
return True
|
|
|
|
def _prepare_pipeline(
|
|
self,
|
|
step_task_created_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa
|
|
step_task_completed_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa
|
|
):
|
|
        # type: (...) -> None
|
|
|
|
params, pipeline_dag = self._serialize_pipeline_task()
|
|
# deserialize back pipeline state
|
|
if not params['continue_pipeline']:
|
|
for k in pipeline_dag:
|
|
pipeline_dag[k]['executed'] = None
|
|
pipeline_dag[k]['job_started'] = None
|
|
pipeline_dag[k]['job_ended'] = None
|
|
self._default_execution_queue = params['default_queue']
|
|
self._add_pipeline_tags = params['add_pipeline_tags']
|
|
self._target_project = params['target_project'] or ''
|
|
self._deserialize(pipeline_dag)
|
|
# if we continue the pipeline, make sure that we re-execute failed tasks
|
|
if params['continue_pipeline']:
|
|
for node in list(self._nodes.values()):
|
|
if node.executed is False:
|
|
node.executed = None
|
|
if not self._verify():
|
|
raise ValueError("Failed verifying pipeline execution graph, "
|
|
"it has either inaccessible nodes, or contains cycles")
|
|
self.update_execution_plot()
|
|
self._start_time = time()
|
|
self._stop_event = Event()
|
|
self._experiment_created_cb = step_task_created_callback
|
|
self._experiment_completed_cb = step_task_completed_callback
|
|
|
|
def _serialize_pipeline_task(self):
|
|
# type: () -> (dict, dict)
|
|
"""
|
|
Serialize current pipeline state into the main Task
|
|
|
|
:return: params, pipeline_dag
|
|
"""
|
|
params = {
|
|
'default_queue': self._default_execution_queue,
|
|
'add_pipeline_tags': self._add_pipeline_tags,
|
|
'target_project': self._target_project,
|
|
}
|
|
pipeline_dag = self._serialize()
|
|
|
|
# serialize pipeline state
|
|
if self._task and self._auto_connect_task:
|
|
# check if we are either running locally or that we are running remotely,
|
|
# but we have no configuration, so we need to act as if this is a local run and create everything
|
|
if self._task.running_locally() or self._task.get_configuration_object(name=self._config_section) is None:
|
|
# noinspection PyProtectedMember
|
|
self._task._set_configuration(
|
|
name=self._config_section, config_type='dictionary',
|
|
config_text=json.dumps(pipeline_dag, indent=2))
|
|
args_map_inversed = {}
|
|
for section, arg_list in self._args_map.items():
|
|
for arg in arg_list:
|
|
args_map_inversed[arg] = section
|
|
pipeline_args = flatten_dictionary(self._pipeline_args)
|
|
# noinspection PyProtectedMember
|
|
self._task._set_parameters(
|
|
{
|
|
"{}/{}".format(args_map_inversed.get(k, self._args_section), k): v
|
|
for k, v in pipeline_args.items()
|
|
},
|
|
__parameters_descriptions=self._pipeline_args_desc,
|
|
__parameters_types=self._pipeline_args_type,
|
|
__update=True,
|
|
)
|
|
self._task.connect(params, name=self._pipeline_section)
|
|
params['continue_pipeline'] = False
|
|
|
|
# make sure we have a unique version number (auto bump version if needed)
|
|
# only needed when manually (from code) creating pipelines
|
|
self._verify_pipeline_version()
|
|
|
|
# noinspection PyProtectedMember
|
|
pipeline_hash = self._get_task_hash()
|
|
|
|
# noinspection PyProtectedMember
|
|
self._task._set_runtime_properties({
|
|
self._runtime_property_hash: "{}:{}".format(pipeline_hash, self._version),
|
|
})
|
|
else:
|
|
self._task.connect_configuration(pipeline_dag, name=self._config_section)
|
|
connected_args = set()
|
|
new_pipeline_args = {}
|
|
for section, arg_list in self._args_map.items():
|
|
mutable_dict = {arg: self._pipeline_args.get(arg) for arg in arg_list}
|
|
self._task.connect(mutable_dict, name=section)
|
|
new_pipeline_args.update(mutable_dict)
|
|
connected_args.update(arg_list)
|
|
mutable_dict = {k: v for k, v in self._pipeline_args.items() if k not in connected_args}
|
|
self._task.connect(
|
|
mutable_dict, name=self._args_section
|
|
)
|
|
new_pipeline_args.update(mutable_dict)
|
|
self._pipeline_args = new_pipeline_args
|
|
self._task.connect(params, name=self._pipeline_section)
|
|
# noinspection PyProtectedMember
|
|
if self._task._get_runtime_properties().get(self._runtime_property_hash):
|
|
params['continue_pipeline'] = True
|
|
else:
|
|
# noinspection PyProtectedMember
|
|
pipeline_hash = ClearmlJob._create_task_hash(self._task)
|
|
# noinspection PyProtectedMember
|
|
self._task._set_runtime_properties({
|
|
self._runtime_property_hash: "{}:{}".format(pipeline_hash, self._version),
|
|
})
|
|
params['continue_pipeline'] = False
|
|
|
|
return params, pipeline_dag
|
|
|
|
def _verify_pipeline_version(self):
|
|
# if no version bump needed, just set the property
|
|
if not self._auto_version_bump:
|
|
self._task.set_user_properties(version=self._version)
|
|
return
|
|
|
|
# check if pipeline version exists, if it does increase version
|
|
pipeline_hash = self._get_task_hash()
|
|
# noinspection PyProtectedMember
|
|
existing_tasks = Task._query_tasks(
|
|
project=[self._task.project], task_name=exact_match_regex(self._task.name),
|
|
type=[str(self._task.task_type)],
|
|
system_tags=["__$all", self._tag, "__$not", Task.archived_tag],
|
|
_all_=dict(fields=['runtime.{}'.format(self._runtime_property_hash)],
|
|
pattern=":{}".format(self._version)),
|
|
only_fields=['id', 'runtime'],
|
|
)
|
|
if existing_tasks:
|
|
            # check if the hash matches the current version.
|
|
matched = True
|
|
for t in existing_tasks:
|
|
h, _, v = t.runtime.get(self._runtime_property_hash, '').partition(':')
|
|
if v == self._version:
|
|
matched = bool(h == pipeline_hash)
|
|
break
|
|
# if hash did not match, look for the highest version
|
|
if not matched:
|
|
# noinspection PyProtectedMember
|
|
existing_tasks = Task._query_tasks(
|
|
project=[self._task.project], task_name=exact_match_regex(self._task.name),
|
|
type=[str(self._task.task_type)],
|
|
system_tags=["__$all", self._tag, "__$not", Task.archived_tag],
|
|
only_fields=['id', 'hyperparams', 'runtime'],
|
|
)
|
|
found_match_version = False
|
|
existing_versions = set([self._version]) # noqa
|
|
for t in existing_tasks:
|
|
# exclude ourselves
|
|
if t.id == self._task.id:
|
|
continue
|
|
if not t.hyperparams:
|
|
continue
|
|
v = t.hyperparams.get('properties', {}).get('version')
|
|
if v:
|
|
existing_versions.add(v.value)
|
|
if t.runtime:
|
|
h, _, _ = t.runtime.get(self._runtime_property_hash, '').partition(':')
|
|
if h == pipeline_hash:
|
|
self._version = v.value
|
|
found_match_version = True
|
|
break
|
|
|
|
# match to the version we found:
|
|
if found_match_version:
|
|
getLogger('clearml.automation.controller').info(
|
|
'Existing Pipeline found, matching version to: {}'.format(self._version))
|
|
else:
|
|
# if we did not find a matched pipeline version, get the max one and bump the version by 1
|
|
while True:
|
|
v = self._version.split('.')
|
|
self._version = '.'.join(v[:-1] + [str(int(v[-1]) + 1)])
|
|
if self._version not in existing_versions:
|
|
break
|
|
|
|
getLogger('clearml.automation.controller').info(
|
|
'No matching Pipelines found, bump new version to: {}'.format(self._version))
|
|
|
|
self._task.set_user_properties(version=self._version)
|
|
|
|
def _get_task_hash(self):
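        # The hash covers the Task parameters and the serialized DAG definition while
        # excluding runtime/job state, so structurally identical pipelines with the same
        # inputs produce the same hash (used for automatic version matching/bumping).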
|
|
params_override = dict(**(self._task.get_parameters() or {}))
|
|
params_override.pop('properties/version', None)
|
|
# dag state without status / states
|
|
nodes_items = list(self._nodes.items())
|
|
dag = {
|
|
name: {
|
|
k: v for k, v in node.__dict__.items()
|
|
if k not in (
|
|
'job', 'name', 'task_factory_func', 'executed', 'status',
|
|
'job_started', 'job_ended', 'skip_job'
|
|
)
|
|
}
|
|
for name, node in nodes_items
|
|
}
|
|
|
|
# get all configurations (as dict of strings for hashing)
|
|
configurations_override = dict(**self._task.get_configuration_objects())
|
|
# store as text so we can hash it later
|
|
configurations_override[self._config_section] = json.dumps(dag)
|
|
|
|
# noinspection PyProtectedMember
|
|
pipeline_hash = ClearmlJob._create_task_hash(
|
|
self._task,
|
|
params_override=params_override,
|
|
configurations_override=configurations_override,
|
|
)
|
|
return pipeline_hash
|
|
|
|
def _serialize(self):
|
|
# type: () -> dict
|
|
"""
|
|
Store the definition of the pipeline DAG into a dictionary.
|
|
This dictionary will be used to store the DAG as a configuration on the Task
|
|
:return:
|
|
"""
|
|
nodes_items = list(self._nodes.items())
|
|
dag = {name: dict((k, v) for k, v in node.__dict__.items()
|
|
if k not in ('job', 'name', 'task_factory_func'))
|
|
for name, node in nodes_items}
|
|
# update state for presentation only
|
|
for name, node in nodes_items:
|
|
dag[name]['job_id'] = node.executed or (node.job.task_id() if node.job else None)
|
|
|
|
return dag
|
|
|
|
def _deserialize(self, dag_dict):
|
|
# type: (dict) -> ()
|
|
"""
|
|
Restore the DAG from a dictionary.
|
|
This will be used to create the DAG from the dict stored on the Task, when running remotely.
|
|
:return:
|
|
"""
|
|
|
|
# if we do not clone the Task, only merge the parts we can override.
|
|
for name in list(self._nodes.keys()):
|
|
if not self._nodes[name].clone_task and name in dag_dict and not dag_dict[name].get('clone_task'):
|
|
for k in ('queue', 'parents', 'timeout', 'parameters', 'configurations', 'task_overrides',
|
|
'executed', 'job_started', 'job_ended'):
|
|
setattr(self._nodes[name], k, dag_dict[name].get(k) or type(getattr(self._nodes[name], k))())
|
|
|
|
# if we do clone the Task deserialize everything, except the function creating
|
|
self._nodes = {
|
|
k: self.Node(name=k, **{kk: vv for kk, vv in v.items() if kk not in ('job_id', )})
|
|
if k not in self._nodes or (v.get('base_task_id') and v.get('clone_task'))
|
|
else self._nodes[k]
|
|
for k, v in dag_dict.items()}
|
|
|
|
# set the task_factory_func for each cloned node
|
|
for node in list(self._nodes.values()):
|
|
if not node.base_task_id and not node.task_factory_func and node.job_code_section:
|
|
if node.job_code_section in self._nodes:
|
|
func = self._nodes[node.job_code_section].task_factory_func
|
|
if func:
|
|
node.task_factory_func = func
|
|
|
|
def _has_stored_configuration(self):
|
|
"""
|
|
Return True if we are running remotely and we have stored configuration on the Task
|
|
"""
|
|
if self._auto_connect_task and self._task and not self._task.running_locally() and self._task.is_main_task():
|
|
stored_config = self._task.get_configuration_object(self._config_section)
|
|
return bool(stored_config)
|
|
|
|
return False
|
|
|
|
def _verify(self):
|
|
# type: () -> bool
|
|
"""
|
|
Verify the DAG, (i.e. no cycles and no missing parents)
|
|
On error raise ValueError with verification details
|
|
|
|
:return: return True iff DAG has no errors
|
|
"""
|
|
# verify nodes
|
|
for node in list(self._nodes.values()):
|
|
# raise value error if not verified
|
|
self._verify_node(node)
|
|
|
|
# check the dag itself
|
|
if not self._verify_dag():
|
|
return False
|
|
|
|
return True
|
|
|
|
def _verify_node(self, node):
|
|
# type: (PipelineController.Node) -> bool
|
|
"""
|
|
Raise ValueError on verification errors
|
|
|
|
:return: Return True iff the specific node is verified
|
|
"""
|
|
if not node.base_task_id and not node.task_factory_func:
|
|
raise ValueError("Node '{}', base_task_id is empty".format(node.name))
|
|
|
|
if not self._default_execution_queue and not node.queue:
|
|
raise ValueError("Node '{}' missing execution queue, "
|
|
"no default queue defined and no specific node queue defined".format(node.name))
|
|
|
|
task = node.task_factory_func or Task.get_task(task_id=node.base_task_id)
|
|
if not task:
|
|
raise ValueError("Node '{}', base_task_id={} is invalid".format(node.name, node.base_task_id))
|
|
|
|
pattern = self._step_ref_pattern
|
|
|
|
# verify original node parents
|
|
if node.parents and not all(isinstance(p, str) and p in self._nodes for p in node.parents):
|
|
raise ValueError("Node '{}', parents={} is invalid".format(node.name, node.parents))
|
|
|
|
parents = set()
|
|
for k, v in node.parameters.items():
|
|
if isinstance(v, str):
|
|
for g in pattern.findall(v):
|
|
ref_step = self.__verify_step_reference(node, g)
|
|
if ref_step:
|
|
parents.add(ref_step)
|
|
# verify we have a section name
|
|
if '/' not in k:
|
|
raise ValueError(
|
|
"Section name is missing in parameter \"{}\", "
|
|
"parameters should be in the form of "
|
|
"\"`section-name`/parameter\", example: \"Args/param\"".format(v))
|
|
|
|
if parents and parents != set(node.parents or []):
|
|
parents = parents - set(node.parents or [])
|
|
getLogger('clearml.automation.controller').info(
|
|
'Node "{}" missing parent reference, adding: {}'.format(node.name, parents))
|
|
node.parents = (node.parents or []) + list(parents)
|
|
|
|
# verify and fix monitoring sections:
|
|
def _verify_monitors(monitors, monitor_type, nested_pairs=False):
|
|
if not monitors:
|
|
return monitors
|
|
|
|
if nested_pairs:
|
|
if not all(isinstance(x, (list, tuple)) and x for x in monitors):
|
|
raise ValueError("{} should be a list of tuples, found: {}".format(monitor_type, monitors))
|
|
# convert single pair into a pair of pairs:
|
|
conformed_monitors = [
|
|
pair if isinstance(pair[0], (list, tuple)) else (pair, pair) for pair in monitors
|
|
]
|
|
# verify pair of pairs
|
|
if not all(isinstance(x[0][0], str) and isinstance(x[0][1], str) and
|
|
isinstance(x[1][0], str) and isinstance(x[1][1], str) for x in conformed_monitors):
|
|
raise ValueError("{} should be a list of tuples, found: {}".format(monitor_type, monitors))
|
|
else:
|
|
# verify a list of tuples
|
|
if not all(isinstance(x, (list, tuple, str)) and x for x in monitors):
|
|
raise ValueError(
|
|
"{} should be a list of tuples, found: {}".format(monitor_type, monitors))
|
|
# convert single str into a pair of pairs:
|
|
conformed_monitors = [
|
|
pair if isinstance(pair, (list, tuple)) else (pair, pair) for pair in monitors
|
|
]
|
|
# verify pair of pairs
|
|
if not all(isinstance(x[0], str) and isinstance(x[1], str) for x in conformed_monitors):
|
|
raise ValueError(
|
|
"{} should be a list of tuples, found: {}".format(monitor_type, monitors))
|
|
|
|
return conformed_monitors
|
|
|
|
# verify and fix monitoring sections:
|
|
node.monitor_metrics = _verify_monitors(node.monitor_metrics, 'monitor_metrics', nested_pairs=True)
|
|
node.monitor_artifacts = _verify_monitors(node.monitor_artifacts, 'monitor_artifacts')
|
|
node.monitor_models = _verify_monitors(node.monitor_models, 'monitor_models')
|
|
|
|
return True
|
|
|
|
def _verify_dag(self):
|
|
# type: () -> bool
|
|
"""
|
|
:return: True iff the pipeline dag is fully accessible and contains no cycles
|
|
"""
|
|
visited = set()
|
|
prev_visited = None
|
|
while prev_visited != visited:
|
|
prev_visited = copy(visited)
|
|
for k, node in list(self._nodes.items()):
|
|
if k in visited:
|
|
continue
|
|
if any(p == node.name for p in node.parents or []):
|
|
# node cannot have itself as parent
|
|
return False
|
|
if not all(p in visited for p in node.parents or []):
|
|
continue
|
|
visited.add(k)
|
|
# return False if we did not cover all the nodes
|
|
return not bool(set(self._nodes.keys()) - visited)
|
|
|
|
def _add_function_step(
|
|
self,
|
|
name, # type: str
|
|
function, # type: Callable
|
|
function_kwargs=None, # type: Optional[Dict[str, Any]]
|
|
function_return=None, # type: Optional[List[str]]
|
|
project_name=None, # type: Optional[str]
|
|
task_name=None, # type: Optional[str]
|
|
task_type=None, # type: Optional[str]
|
|
auto_connect_frameworks=None, # type: Optional[dict]
|
|
auto_connect_arg_parser=None, # type: Optional[dict]
|
|
packages=None, # type: Optional[Union[str, Sequence[str]]]
|
|
repo=None, # type: Optional[str]
|
|
repo_branch=None, # type: Optional[str]
|
|
repo_commit=None, # type: Optional[str]
|
|
helper_functions=None, # type: Optional[Sequence[Callable]]
|
|
docker=None, # type: Optional[str]
|
|
docker_args=None, # type: Optional[str]
|
|
docker_bash_setup_script=None, # type: Optional[str]
|
|
parents=None, # type: Optional[Sequence[str]],
|
|
execution_queue=None, # type: Optional[str]
|
|
monitor_metrics=None, # type: Optional[List[Union[Tuple[str, str], Tuple[(str, str), (str, str)]]]]
|
|
monitor_artifacts=None, # type: Optional[List[Union[str, Tuple[str, str]]]]
|
|
monitor_models=None, # type: Optional[List[Union[str, Tuple[str, str]]]]
|
|
time_limit=None, # type: Optional[float]
|
|
continue_on_fail=False, # type: bool
|
|
pre_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa
|
|
post_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa
|
|
cache_executed_step=False, # type: bool
|
|
retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa
|
|
):
|
|
# type: (...) -> bool
|
|
"""
|
|
Create a Task from a function, including wrapping the function input arguments
|
|
into the hyper-parameter section as kwargs, and storing function results as named artifacts
|
|
|
|
Example:
|
|
|
|
.. code-block:: py
|
|
|
|
def mock_func(a=6, b=9):
|
|
c = a*b
|
|
print(a, b, c)
|
|
return c, c**2
|
|
|
|
create_task_from_function(mock_func, function_return=['mul', 'square'])
|
|
|
|
Example arguments from other Tasks (artifact):
|
|
|
|
.. code-block:: py
|
|
|
|
def mock_func(matrix_np):
|
|
c = matrix_np*matrix_np
|
|
print(matrix_np, c)
|
|
return c
|
|
|
|
create_task_from_function(
|
|
mock_func,
|
|
function_kwargs={'matrix_np': 'aabb1122.previous_matrix'},
|
|
function_return=['square_matrix']
|
|
)
|
|
|
|
        :param name: Unique name of the step. For example `stage1`
|
|
:param function: A global function to convert into a standalone Task
|
|
:param function_kwargs: Optional, provide subset of function arguments and default values to expose.
|
|
            If not provided, automatically take all function arguments & defaults
            Optional, pass input arguments to the function from other Tasks' output artifacts.
|
|
Example argument named `numpy_matrix` from Task ID `aabbcc` artifact name `answer`:
|
|
{'numpy_matrix': 'aabbcc.answer'}
|
|
:param function_return: Provide a list of names for all the results.
|
|
If not provided, no results will be stored as artifacts.
|
|
:param project_name: Set the project name for the task. Required if base_task_id is None.
|
|
:param task_name: Set the name of the remote task, if not provided use `name` argument.
|
|
:param task_type: Optional, The task type to be created. Supported values: 'training', 'testing', 'inference',
|
|
'data_processing', 'application', 'monitor', 'controller', 'optimizer', 'service', 'qc', 'custom'
|
|
:param auto_connect_frameworks: Control the frameworks auto connect, see `Task.init` auto_connect_frameworks
|
|
:param auto_connect_arg_parser: Control the ArgParser auto connect, see `Task.init` auto_connect_arg_parser
|
|
:param packages: Manually specify a list of required packages or a local requirements.txt file.
|
|
Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt"
|
|
If not provided, packages are automatically added based on the imports used in the function.
|
|
:param repo: Optional, specify a repository to attach to the function, when remotely executing.
|
|
            Allow users to execute the function inside the specified repository, enabling them to load modules/scripts
            from the repository. Notice the execution working directory will be the repository root folder.
|
|
Supports both git repo url link, and local repository path.
|
|
Example remote url: 'https://github.com/user/repo.git'
|
|
Example local repo copy: './repo' -> will automatically store the remote
|
|
repo url and commit ID based on the locally cloned copy
|
|
:param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used)
|
|
:param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used)
|
|
:param helper_functions: Optional, a list of helper functions to make available
|
|
for the standalone function Task.
|
|
:param docker: Select the docker image to be executed in by the remote session
|
|
:param docker_args: Add docker arguments, pass a single string
|
|
:param docker_bash_setup_script: Add bash script to be executed
|
|
inside the docker before setting up the Task's environment
|
|
:param parents: Optional list of parent nodes in the DAG.
|
|
The current step in the pipeline will be sent for execution only after all the parent nodes
|
|
have been executed successfully.
|
|
:param execution_queue: Optional, the queue to use for executing this specific step.
|
|
If not provided, the task will be sent to the default execution queue, as defined on the class
|
|
:param monitor_metrics: Optional, log the step's metrics on the pipeline Task.
|
|
Format is a list of pairs metric (title, series) to log:
|
|
[(step_metric_title, step_metric_series), ]
|
|
Example: [('test', 'accuracy'), ]
|
|
            Or a list of tuple pairs, to specify a different target metric to use on the pipeline Task:
|
|
[((step_metric_title, step_metric_series), (target_metric_title, target_metric_series)), ]
|
|
Example: [[('test', 'accuracy'), ('model', 'accuracy')], ]
|
|
:param monitor_artifacts: Optional, log the step's artifacts on the pipeline Task.
|
|
            Provide a list of artifact names that exist on the step's Task; they will also appear on the Pipeline itself.
|
|
Example: [('processed_data', 'final_processed_data'), ]
|
|
Alternatively user can also provide a list of artifacts to monitor
|
|
(target artifact name will be the same as original artifact name)
|
|
Example: ['processed_data', ]
|
|
:param monitor_models: Optional, log the step's output models on the pipeline Task.
|
|
            Provide a list of model names that exist on the step's Task; they will also appear on the Pipeline itself.
|
|
Example: [('model_weights', 'final_model_weights'), ]
|
|
Alternatively user can also provide a list of models to monitor
|
|
            (the target model name will be the same as the original model name)
|
|
Example: ['model_weights', ]
|
|
To select the latest (lexicographic) model use "model_*", or the last created model with just "*"
|
|
Example: ['model_weights_*', ]
|
|
:param time_limit: Default None, no time limit.
|
|
Step execution time limit, if exceeded the Task is aborted and the pipeline is stopped and marked failed.
|
|
        :param continue_on_fail: (default False). If True, a failed step will not cause the pipeline to stop
            (or be marked as failed). Notice that steps that are connected (or indirectly connected)
|
|
to the failed step will be skipped.
|
|
:param pre_execute_callback: Callback function, called when the step (Task) is created
|
|
and before it is sent for execution. Allows a user to modify the Task before launch.
|
|
Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object.
|
|
`parameters` are the configuration arguments passed to the ClearmlJob.
|
|
|
|
If the callback returned value is `False`,
|
|
the Node is skipped and so is any node in the DAG that relies on this node.
|
|
|
|
Notice the `parameters` are already parsed,
|
|
e.g. `${step1.parameters.Args/param}` is replaced with relevant value.
|
|
|
|
.. code-block:: py
|
|
|
|
def step_created_callback(
|
|
pipeline, # type: PipelineController,
|
|
node, # type: PipelineController.Node,
|
|
parameters, # type: dict
|
|
):
|
|
pass
|
|
|
|
:param post_execute_callback: Callback function, called when a step (Task) is completed
|
|
            and before the following steps (Tasks) are launched. Allows a user to modify the Task status after completion.
|
|
|
|
.. code-block:: py
|
|
|
|
def step_completed_callback(
|
|
pipeline, # type: PipelineController,
|
|
node, # type: PipelineController.Node,
|
|
):
|
|
pass
|
|
|
|
:param cache_executed_step: If True, before launching the new step,
|
|
after updating with the latest configuration, check if an exact Task with the same parameter/code
|
|
was already executed. If it was found, use it instead of launching a new Task.
|
|
Default: False, a new cloned copy of base_task is always used.
|
|
Notice: If the git repo reference does not have a specific commit ID, the Task will never be used.
|
|
|
|
:param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry
|
|
- Integer: In case of node failure, retry the node the number of times indicated by this parameter.
|
|
- Callable: A function called on node failure. Takes as parameters:
|
|
the PipelineController instance, the PipelineController.Node that failed and an int
|
|
representing the number of previous retries for the node that failed
|
|
The function must return a `bool`: True if the node should be retried and False otherwise.
|
|
If True, the node will be re-queued and the number of retries left will be decremented by 1.
|
|
By default, if this callback is not specified, the function will be retried the number of
|
|
times indicated by `retry_on_failure`.
|
|
|
|
.. code-block:: py
|
|
|
|
def example_retry_on_failure_callback(pipeline, node, retries):
|
|
print(node.name, ' failed')
|
|
# allow up to 5 retries (total of 6 runs)
|
|
return retries < 5
|
|
|
|
:return: True if successful
|
|
"""
|
|
# always store callback functions (even when running remotely)
|
|
if pre_execute_callback:
|
|
self._pre_step_callbacks[name] = pre_execute_callback
|
|
if post_execute_callback:
|
|
self._post_step_callbacks[name] = post_execute_callback
|
|
|
|
self._verify_node_name(name)
|
|
|
|
function_input_artifacts = {}
|
|
# go over function_kwargs, split it into string and input artifacts
|
|
for k, v in function_kwargs.items():
|
|
if v is None:
|
|
continue
|
|
if self._step_ref_pattern.match(str(v)):
|
|
# check for step artifacts
|
|
step, _, artifact = v[2:-1].partition('.')
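                # e.g. "${train.model}" is stored as "${train.id}.model"; the step reference is
                # later replaced with the executed Task ID, yielding "<task_id>.model"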
|
|
if step in self._nodes and artifact in self._nodes[step].return_artifacts:
|
|
function_input_artifacts[k] = "${{{}.id}}.{}".format(step, artifact)
|
|
continue
|
|
# verify the reference only if we are running locally (on remote when we have multiple
|
|
                # steps from tasks, _nodes is still empty; only after deserializing will we have the full DAG)
|
|
if self._task.running_locally():
|
|
self.__verify_step_reference(node=self.Node(name=name), step_ref_string=v)
|
|
elif not isinstance(v, (float, int, bool, six.string_types)):
|
|
function_input_artifacts[k] = "{}.{}.{}".format(self._task.id, name, k)
|
|
self._upload_pipeline_artifact(artifact_name="{}.{}".format(name, k), artifact_object=v)
|
|
|
|
function_kwargs = {k: v for k, v in function_kwargs.items() if k not in function_input_artifacts}
|
|
parameters = {"{}/{}".format(CreateFromFunction.kwargs_section, k): v for k, v in function_kwargs.items()}
|
|
if function_input_artifacts:
|
|
parameters.update(
|
|
{"{}/{}".format(CreateFromFunction.input_artifact_section, k): str(v)
|
|
for k, v in function_input_artifacts.items()}
|
|
)
|
|
|
|
job_code_section = name
|
|
task_name = task_name or name or None
|
|
|
|
if self._mock_execution:
|
|
project_name = project_name or self._get_target_project() or self._task.get_project_name()
|
|
|
|
task_definition = self._create_task_from_function(
|
|
docker, docker_args, docker_bash_setup_script, function,
|
|
function_input_artifacts, function_kwargs, function_return,
|
|
auto_connect_frameworks, auto_connect_arg_parser,
|
|
packages, project_name, task_name,
|
|
task_type, repo, repo_branch, repo_commit, helper_functions)
|
|
|
|
elif self._task.running_locally() or self._task.get_configuration_object(name=name) is None:
|
|
project_name = project_name or self._get_target_project() or self._task.get_project_name()
|
|
|
|
task_definition = self._create_task_from_function(
|
|
docker, docker_args, docker_bash_setup_script, function,
|
|
function_input_artifacts, function_kwargs, function_return,
|
|
auto_connect_frameworks, auto_connect_arg_parser,
|
|
packages, project_name, task_name,
|
|
task_type, repo, repo_branch, repo_commit, helper_functions)
|
|
# update configuration with the task definitions
|
|
# noinspection PyProtectedMember
|
|
self._task._set_configuration(
|
|
name=name, config_type='json',
|
|
config_text=json.dumps(task_definition, indent=1)
|
|
)
|
|
else:
|
|
# load task definition from configuration
|
|
# noinspection PyProtectedMember
|
|
config_text = self._task._get_configuration_text(name=name)
|
|
task_definition = json.loads(config_text) if config_text else dict()
|
|
|
|
def _create_task(_):
|
|
a_task = Task.create(
|
|
project_name=project_name,
|
|
task_name=task_definition.get('name'),
|
|
task_type=task_definition.get('type'),
|
|
)
|
|
# replace reference
|
|
a_task.update_task(task_definition)
|
|
return a_task
|
|
|
|
self._nodes[name] = self.Node(
|
|
name=name, base_task_id=None, parents=parents or [],
|
|
queue=execution_queue, timeout=time_limit,
|
|
parameters=parameters,
|
|
clone_task=False,
|
|
cache_executed_step=cache_executed_step,
|
|
task_factory_func=_create_task,
|
|
continue_on_fail=continue_on_fail,
|
|
return_artifacts=function_return,
|
|
monitor_artifacts=monitor_artifacts,
|
|
monitor_metrics=monitor_metrics,
|
|
monitor_models=monitor_models,
|
|
job_code_section=job_code_section,
|
|
)
|
|
self._retries[name] = 0
|
|
self._retries_callbacks[name] = retry_on_failure if callable(retry_on_failure) else \
|
|
(functools.partial(self._default_retry_on_failure_callback, max_retries=retry_on_failure)
|
|
if isinstance(retry_on_failure, int) else self._retry_on_failure_callback)
|
|
|
|
return True
|
|
|
|
def _relaunch_node(self, node):
|
|
if not node.job:
|
|
getLogger("clearml.automation.controller").warning(
|
|
"Could not relaunch node {} (job object is missing)".format(node.name)
|
|
)
|
|
return
|
|
self._retries[node.name] = self._retries.get(node.name, 0) + 1
|
|
getLogger("clearml.automation.controller").warning(
|
|
"Node '{}' failed. Retrying... (this is retry number {})".format(node.name, self._retries[node.name])
|
|
)
|
|
node.job.task.mark_stopped(force=True, status_message=self._relaunch_status_message)
|
|
node.job.task.set_progress(0)
|
|
node.job.task.get_logger().report_text(
|
|
"\nNode '{}' failed. Retrying... (this is retry number {})\n".format(node.name, self._retries[node.name])
|
|
)
|
|
parsed_queue_name = self._parse_step_ref(node.queue)
|
|
node.job.launch(queue_name=parsed_queue_name or self._default_execution_queue)
|
|
|
|
def _launch_node(self, node):
|
|
# type: (PipelineController.Node) -> ()
|
|
"""
|
|
Launch a single node (create and enqueue a ClearmlJob)
|
|
|
|
:param node: Node to launch
|
|
:return: Return True if a new job was launched
|
|
"""
|
|
# clear state if we are creating a new job
|
|
if not node.job:
|
|
node.job_started = None
|
|
node.job_ended = None
|
|
node.job_type = None
|
|
|
|
if node.job or node.executed:
|
|
print('Skipping cached/executed step [{}]'.format(node.name))
|
|
return False
|
|
|
|
print('Launching step [{}]'.format(node.name))
|
|
|
|
updated_hyper_parameters = {}
|
|
for k, v in node.parameters.items():
|
|
updated_hyper_parameters[k] = self._parse_step_ref(v)
|
|
|
|
task_overrides = self._parse_task_overrides(node.task_overrides) if node.task_overrides else None
|
|
|
|
extra_args = dict()
|
|
extra_args['project'] = self._get_target_project(return_project_id=True) or None
|
|
# set Task name to match job name
|
|
if self._pipeline_as_sub_project:
|
|
extra_args['name'] = node.name
|
|
|
|
skip_node = None
|
|
if self._pre_step_callbacks.get(node.name):
|
|
skip_node = self._pre_step_callbacks[node.name](self, node, updated_hyper_parameters)
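        # a pre-step callback that explicitly returns False (not None) marks the node as skipped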
|
|
|
|
if skip_node is False:
|
|
node.skip_job = True
|
|
return True
|
|
|
|
task_id = node.base_task_id
|
|
disable_clone_task = not node.clone_task
|
|
task_factory_func_task = None
|
|
if node.task_factory_func:
|
|
# create Task
|
|
task_factory_func_task = node.task_factory_func(node)
|
|
task_id = task_factory_func_task.id
|
|
disable_clone_task = True
|
|
|
|
try:
|
|
node.job = self._clearml_job_class(
|
|
base_task_id=task_id,
|
|
parameter_override=updated_hyper_parameters,
|
|
configuration_overrides=node.configurations,
|
|
tags=['{} {}'.format(self._node_tag_prefix, self._task.id)]
|
|
if self._add_pipeline_tags and self._task else None,
|
|
parent=self._task.id if self._task else None,
|
|
disable_clone_task=disable_clone_task,
|
|
task_overrides=task_overrides,
|
|
allow_caching=node.cache_executed_step,
|
|
**extra_args
|
|
)
|
|
except Exception:
|
|
self._pipeline_task_status_failed = True
|
|
raise
|
|
|
|
node.job_started = time()
|
|
node.job_ended = None
|
|
node.job_type = str(node.job.task.task_type)
|
|
|
|
if self._experiment_created_cb:
|
|
skip_node = self._experiment_created_cb(self, node, updated_hyper_parameters)
|
|
|
|
if skip_node is False:
|
|
# skipping node
|
|
getLogger('clearml.automation.controller').warning(
|
|
'Skipping node {} on callback request'.format(node))
|
|
# delete the job we just created
|
|
node.job.delete()
|
|
node.skip_job = True
|
|
elif node.job.is_cached_task():
|
|
node.executed = node.job.task_id()
|
|
if task_factory_func_task:
|
|
task_factory_func_task.delete(raise_on_error=False)
|
|
self._running_nodes.append(node.name)
|
|
else:
|
|
self._running_nodes.append(node.name)
|
|
|
|
parsed_queue_name = self._parse_step_ref(node.queue)
|
|
return node.job.launch(queue_name=parsed_queue_name or self._default_execution_queue)
|
|
|
|
return True
|
|
|
|
def _update_execution_plot(self):
|
|
# type: () -> ()
|
|
"""
|
|
Update sankey diagram of the current pipeline
|
|
Also update the controller Task artifact storing the DAG state (with all the nodes states)
|
|
"""
|
|
if not self._task:
|
|
return
|
|
|
|
nodes = list(self._nodes.values())
|
|
# update status
|
|
for n in nodes:
|
|
self._update_node_status(n)
|
|
|
|
        # update the configuration state, so that the UI presents the correct state
|
|
self._force_task_configuration_update()
|
|
|
|
sankey_node = dict(
|
|
label=[],
|
|
color=[],
|
|
hovertemplate='%{label}<extra></extra>',
|
|
# customdata=[],
|
|
# hovertemplate='%{label}<br />Hyper-Parameters:<br />%{customdata}<extra></extra>',
|
|
)
|
|
sankey_link = dict(
|
|
source=[],
|
|
target=[],
|
|
value=[],
|
|
# hovertemplate='%{target.label}<extra></extra>',
|
|
hovertemplate='<extra></extra>',
|
|
)
|
|
visited = []
|
|
node_params = []
|
|
# update colors
|
|
while nodes:
|
|
next_nodes = []
|
|
for node in nodes:
|
|
if not all(p in visited for p in node.parents or []):
|
|
next_nodes.append(node)
|
|
continue
|
|
visited.append(node.name)
|
|
idx = len(visited) - 1
|
|
parents = [visited.index(p) for p in node.parents or []]
|
|
if node.job and node.job.task_parameter_override is not None:
|
|
node.job.task_parameter_override.update(node.parameters or {})
|
|
node_params.append(
|
|
(
|
|
node.job.task_parameter_override
|
|
if node.job and node.job.task_parameter_override
|
|
else node.parameters
|
|
)
|
|
or {}
|
|
)
|
|
# sankey_node['label'].append(node.name)
|
|
# sankey_node['customdata'].append(
|
|
# '<br />'.join('{}: {}'.format(k, v) for k, v in (node.parameters or {}).items()))
|
|
sankey_node['label'].append(
|
|
'{}<br />'.format(node.name) +
|
|
'<br />'.join('{}: {}'.format(k, v if len(str(v)) < 24 else (str(v)[:24]+' ...'))
|
|
for k, v in (node.parameters or {}).items()))
|
|
|
|
sankey_node['color'].append(self._get_node_color(node))
|
|
|
|
for p in parents:
|
|
sankey_link['source'].append(p)
|
|
sankey_link['target'].append(idx)
|
|
sankey_link['value'].append(1)
|
|
|
|
# if nothing changed, we give up
|
|
if nodes == next_nodes:
|
|
break
|
|
|
|
nodes = next_nodes
|
|
|
|
        # collect independent (unconnected) nodes so they can be drawn separately
|
|
single_nodes = []
|
|
for i in [n for n in range(len(visited)) if n not in sankey_link['source'] and n not in sankey_link['target']]:
|
|
single_nodes.append(i)
|
|
|
|
# create the sankey graph
|
|
dag_flow = dict(
|
|
link=sankey_link,
|
|
node=sankey_node,
|
|
textfont=dict(color='rgba(0,0,0,0)', size=1),
|
|
type='sankey',
|
|
orientation='h'
|
|
)
|
|
|
|
table_values = self._build_table_report(node_params, visited)
|
|
|
|
# hack, show single node sankey
|
|
if single_nodes:
|
|
singles_flow = dict(
|
|
x=list(range(len(single_nodes))), y=[1] * len(single_nodes),
|
|
text=[v for i, v in enumerate(sankey_node['label']) if i in single_nodes],
|
|
mode='markers',
|
|
hovertemplate="%{text}<extra></extra>",
|
|
marker=dict(
|
|
color=[v for i, v in enumerate(sankey_node['color']) if i in single_nodes],
|
|
size=[40] * len(single_nodes),
|
|
),
|
|
showlegend=False,
|
|
type='scatter',
|
|
)
|
|
# only single nodes
|
|
if len(single_nodes) == len(sankey_node['label']):
|
|
fig = dict(data=[singles_flow], layout={
|
|
'hovermode': 'closest', 'xaxis': {'visible': False}, 'yaxis': {'visible': False}})
|
|
else:
|
|
dag_flow['domain'] = {'x': [0.0, 1.0], 'y': [0.2, 1.0]}
|
|
fig = dict(data=[dag_flow, singles_flow],
|
|
layout={'autosize': True,
|
|
'hovermode': 'closest',
|
|
'xaxis': {'anchor': 'y', 'domain': [0.0, 1.0], 'visible': False},
|
|
'yaxis': {'anchor': 'x', 'domain': [0.0, 0.15], 'visible': False}
|
|
})
|
|
else:
|
|
# create the sankey plot
|
|
fig = dict(data=[dag_flow], layout={'xaxis': {'visible': False}, 'yaxis': {'visible': False}})
|
|
|
|
# report DAG
|
|
self._task.get_logger().report_plotly(
|
|
title=self._report_plot_execution_flow['title'],
|
|
series=self._report_plot_execution_flow['series'],
|
|
iteration=0, figure=fig)
|
|
# report detailed table
|
|
self._task.get_logger().report_table(
|
|
title=self._report_plot_execution_details['title'],
|
|
series=self._report_plot_execution_details['series'],
|
|
iteration=0, table_plot=table_values)
|
|
|
|
def _build_table_report(self, node_params, visited):
|
|
# type: (List, List) -> List[List]
|
|
"""
|
|
Create the detailed table report on all the jobs in the pipeline
|
|
|
|
:param node_params: list of node parameters
|
|
:param visited: list of nodes
|
|
:return: Table as List of List of strings (cell)
|
|
"""
|
|
task_link_template = self._task.get_output_log_web_page() \
|
|
.replace('/{}/'.format(self._task.project), '/{project}/') \
|
|
.replace('/{}/'.format(self._task.id), '/{task}/')
|
|
|
|
table_values = [["Pipeline Step", "Task ID", "Task Name", "Status", "Parameters"]]
|
|
|
|
for name, param in zip(visited, node_params):
|
|
param_str = str(param) if param else ''
|
|
if len(param_str) > 3:
|
|
# remove {} from string
|
|
param_str = param_str[1:-1]
|
|
|
|
step_name = name
|
|
if self._nodes[name].base_task_id:
|
|
step_name += '\n[<a href="{}"> {} </a>]'.format(
|
|
task_link_template.format(project='*', task=self._nodes[name].base_task_id), 'base task')
|
|
|
|
table_values.append(
|
|
[step_name,
|
|
self.__create_task_link(self._nodes[name], task_link_template),
|
|
self._nodes[name].job.task.name if self._nodes[name].job else '',
|
|
str(self._nodes[name].status or ""),
|
|
param_str]
|
|
)
|
|
|
|
return table_values
|
|
|
|
def _call_retries_callback(self, node):
|
|
# if this functions returns True, we should relaunch the node
|
|
# if False, don't relaunch
|
|
if node.name not in self._retries_callbacks:
|
|
return False
|
|
try:
|
|
return self._retries_callbacks[node.name](self, node, self._retries.get(node.name, 0))
|
|
except Exception as e:
|
|
getLogger("clearml.automation.controller").warning(
|
|
"Failed calling the retry callback for node '{}'. Error is '{}'".format(node.name, e)
|
|
)
|
|
return False
|
|
|
|
@classmethod
|
|
def _get_node_color(cls, node):
|
|
        # type: (PipelineController.Node) -> str
|
|
"""
|
|
Return the node color based on the node/job state
|
|
:param node: A node in the pipeline
|
|
:return: string representing the color of the node (e.g. "red", "green", etc)
|
|
"""
|
|
if not node:
|
|
return ""
|
|
|
|
color_lookup = {
|
|
"failed": "red",
|
|
"cached": "darkslateblue",
|
|
"completed": "blue",
|
|
"aborted": "royalblue",
|
|
"queued": "#bdf5bd",
|
|
"running": "green",
|
|
"skipped": "gray",
|
|
"pending": "lightsteelblue",
|
|
}
|
|
return color_lookup.get(node.status, "")
|
|
|
|
@classmethod
|
|
def _update_node_status(cls, node):
|
|
        # type: (PipelineController.Node) -> ()
|
|
"""
|
|
Update the node status entry based on the node/job state
|
|
:param node: A node in the pipeline
|
|
"""
|
|
if not node:
|
|
return
|
|
|
|
# update job ended:
|
|
update_job_ended = node.job_started and not node.job_ended
|
|
|
|
# refresh status
|
|
if node.job and isinstance(node.job, BaseJob):
|
|
node.job.status(force=True)
|
|
|
|
if node.executed is not None:
|
|
if node.job and node.job.is_failed():
|
|
# failed job
|
|
node.status = "failed"
|
|
elif node.job and node.job.is_cached_task():
|
|
# cached job
|
|
node.status = "cached"
|
|
elif not node.job or node.job.is_completed():
|
|
# completed job
|
|
node.status = "completed"
|
|
else:
|
|
# aborted job
|
|
node.status = "aborted"
|
|
elif node.job:
|
|
if node.job.is_pending():
|
|
# lightgreen, pending in queue
|
|
node.status = "queued"
|
|
elif node.job.is_completed():
|
|
# completed job
|
|
node.status = "completed"
|
|
elif node.job.is_failed():
|
|
# failed job
|
|
node.status = "failed"
|
|
elif node.job.is_stopped():
|
|
# aborted job
|
|
node.status = "aborted"
|
|
else:
|
|
node.status = "running"
|
|
elif node.skip_job:
|
|
node.status = "skipped"
|
|
else:
|
|
node.status = "pending"
|
|
|
|
if update_job_ended and node.status in ("aborted", "failed", "completed"):
|
|
node.job_ended = time()
|
|
|
|
assert node.status in cls.valid_job_status
|
|
|
|
def _update_dag_state_artifact(self):
|
|
# type: () -> ()
|
|
pipeline_dag = self._serialize()
|
|
self._task.upload_artifact(
|
|
name=self._state_artifact_name, artifact_object='',
|
|
metadata=dict(pipeline=hash_dict(pipeline_dag)),
|
|
preview=json.dumps(pipeline_dag, indent=1))
|
|
|
|
def _force_task_configuration_update(self):
|
|
# type: () -> ()
|
|
pipeline_dag = self._serialize()
|
|
if self._task:
|
|
# noinspection PyProtectedMember
|
|
self._task._set_configuration(
|
|
name=self._config_section, config_type='dictionary',
|
|
description="pipeline state: {}".format(hash_dict(pipeline_dag)),
|
|
config_text=json.dumps(pipeline_dag, indent=2), force=True)
|
|
|
|
def _update_progress(self):
|
|
# type: () -> ()
|
|
"""
|
|
Update progress of the pipeline every PipelineController._update_progress_interval seconds.
|
|
Progress is calculated as the mean of the progress of each step in the pipeline.
|
|
"""
|
|
if time() - self._last_progress_update_time < self._update_progress_interval:
|
|
return
|
|
job_progress = [(node.job.task.get_progress() or 0) if node.job else 0 for node in self._nodes.values()]
|
|
if len(job_progress):
|
|
self._task.set_progress(int(sum(job_progress) / len(job_progress)))
|
|
self._last_progress_update_time = time()
|
|
|
|
def _daemon(self):
|
|
# type: () -> ()
|
|
"""
|
|
The main pipeline execution loop. This loop is executed on its own dedicated thread.
|
|
:return:
|
|
"""
|
|
launch_thread_pool = ThreadPool(16)
|
|
pooling_counter = 0
|
|
launched_nodes = set()
|
|
last_monitor_report = last_plot_report = time()
|
|
while self._stop_event:
|
|
# stop request
|
|
if self._stop_event.wait(self._pool_frequency if pooling_counter else 0.01):
|
|
break
|
|
|
|
pooling_counter += 1
|
|
|
|
# check the pipeline time limit
|
|
if self._pipeline_time_limit and (time() - self._start_time) > self._pipeline_time_limit:
|
|
break
|
|
|
|
self._update_progress()
|
|
# check the state of all current jobs
|
|
            # if no job ended, continue
|
|
completed_jobs = []
|
|
force_execution_plot_update = False
|
|
nodes_failed_stop_pipeline = []
|
|
for j in self._running_nodes:
|
|
node = self._nodes[j]
|
|
if not node.job:
|
|
continue
|
|
if node.job.is_stopped(aborted_nonresponsive_as_running=True):
|
|
node_failed = node.job.is_failed()
|
|
if node_failed:
|
|
if self._call_retries_callback(node):
|
|
self._relaunch_node(node)
|
|
continue
|
|
else:
|
|
self._final_failure[node.name] = True
|
|
|
|
completed_jobs.append(j)
|
|
node.executed = node.job.task_id() if not node_failed else False
|
|
if j in launched_nodes:
|
|
launched_nodes.remove(j)
|
|
# check if we need to stop all running steps
|
|
if node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail:
|
|
nodes_failed_stop_pipeline.append(node.name)
|
|
elif node.timeout:
|
|
started = node.job.task.data.started
|
|
if (datetime.now().astimezone(started.tzinfo) - started).total_seconds() > node.timeout:
|
|
node.job.abort()
|
|
completed_jobs.append(j)
|
|
node.executed = node.job.task_id()
|
|
elif j in launched_nodes and node.job.is_running():
|
|
                    # make sure to update the execution graph when the job starts running
|
|
# (otherwise it will still be marked queued)
|
|
launched_nodes.remove(j)
|
|
force_execution_plot_update = True
|
|
|
|
# update running jobs
|
|
self._running_nodes = [j for j in self._running_nodes if j not in completed_jobs]
|
|
|
|
# nothing changed, we can sleep
|
|
if not completed_jobs and self._running_nodes:
|
|
# force updating the pipeline state (plot) at least every 5 min.
|
|
if force_execution_plot_update or time()-last_plot_report > self._update_execution_plot_interval:
|
|
last_plot_report = time()
|
|
last_monitor_report = time()
|
|
self.update_execution_plot()
|
|
elif time()-last_monitor_report > self._monitor_node_interval:
|
|
last_monitor_report = time()
|
|
self._scan_monitored_nodes()
|
|
continue
|
|
|
|
# callback on completed jobs
|
|
if self._experiment_completed_cb or self._post_step_callbacks:
|
|
for job in completed_jobs:
|
|
job_node = self._nodes.get(job)
|
|
if not job_node:
|
|
continue
|
|
if self._experiment_completed_cb:
|
|
self._experiment_completed_cb(self, job_node)
|
|
if self._post_step_callbacks.get(job_node.name):
|
|
self._post_step_callbacks[job_node.name](self, job_node)
|
|
|
|
# check if we need to stop the pipeline, and abort all running steps
|
|
if nodes_failed_stop_pipeline:
|
|
print('Aborting pipeline and stopping all running steps, node {} failed'.format(
|
|
nodes_failed_stop_pipeline))
|
|
break
|
|
|
|
# Pull the next jobs in the pipeline, based on the completed list
|
|
next_nodes = []
|
|
for node in list(self._nodes.values()):
|
|
# check if already processed or needs to be skipped
|
|
if node.job or node.executed or node.skip_job:
|
|
continue
|
|
completed_parents = [bool(p in self._nodes and self._nodes[p].executed) for p in node.parents or []]
|
|
if all(completed_parents):
|
|
next_nodes.append(node.name)
|
|
|
|
# update the execution graph
|
|
print('Launching the next {} steps'.format(len(next_nodes)))
|
|
node_launch_success = launch_thread_pool.map(
|
|
self._launch_node, [self._nodes[name] for name in next_nodes])
|
|
for name, success in zip(next_nodes, node_launch_success):
|
|
if success and not self._nodes[name].skip_job:
|
|
if self._nodes[name].job and self._nodes[name].job.task_parameter_override is not None:
|
|
self._nodes[name].job.task_parameter_override.update(self._nodes[name].parameters or {})
|
|
print('Launching step: {}'.format(name))
|
|
print('Parameters:\n{}'.format(
|
|
self._nodes[name].job.task_parameter_override if self._nodes[name].job
|
|
else self._nodes[name].parameters))
|
|
print('Configurations:\n{}'.format(self._nodes[name].configurations))
|
|
print('Overrides:\n{}'.format(self._nodes[name].task_overrides))
|
|
launched_nodes.add(name)
|
|
                    # if the node was cached (already executed), do not wait for the polling interval, run the loop again
|
|
if self._nodes[name].executed:
|
|
pooling_counter = 0
|
|
else:
|
|
getLogger('clearml.automation.controller').warning(
|
|
'Skipping launching step \'{}\': {}'.format(name, self._nodes[name]))
|
|
|
|
# update current state (in configuration, so that we could later continue an aborted pipeline)
|
|
# visualize pipeline state (plot)
|
|
self.update_execution_plot()
|
|
|
|
            # quit if all pipeline nodes are fully executed.
|
|
if not next_nodes and not self._running_nodes:
|
|
break
|
|
|
|
# stop all currently running jobs:
|
|
for node in list(self._nodes.values()):
|
|
if node.executed is False and not node.continue_on_fail:
|
|
self._pipeline_task_status_failed = True
|
|
|
|
if node.job and not node.job.is_stopped():
|
|
node.job.abort()
|
|
elif not node.job and not node.executed:
|
|
# mark Node as skipped if it has no Job object and it is not executed
|
|
node.skip_job = True
|
|
|
|
# visualize pipeline state (plot)
|
|
self.update_execution_plot()
|
|
|
|
if self._stop_event:
|
|
# noinspection PyBroadException
|
|
try:
|
|
self._stop_event.set()
|
|
except Exception:
|
|
pass
|
|
|
|
def _parse_step_ref(self, value):
|
|
# type: (Any) -> Optional[str]
|
|
"""
|
|
        Resolve step references inside a value. For example "${step1.parameters.Args/param}"
        :param value: string (non-string values are returned unchanged)
        :return: the value with every step reference replaced by its actual value
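        Example (illustrative values): "lr=${step1.parameters.Args/lr}" becomes "lr=0.001"
        once 'step1' has executed with that parameter value.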
|
|
"""
|
|
# look for all the step references
|
|
pattern = self._step_ref_pattern
|
|
updated_value = value
|
|
if isinstance(value, str):
|
|
for g in pattern.findall(value):
|
|
# update with actual value
|
|
new_val = self.__parse_step_reference(g)
|
|
if not isinstance(new_val, six.string_types):
|
|
return new_val
|
|
updated_value = updated_value.replace(g, new_val, 1)
|
|
return updated_value
|
|
|
|
def _parse_task_overrides(self, task_overrides):
|
|
# type: (dict) -> dict
|
|
"""
|
|
        Resolve step references (for example "${step1.parameters.Args/param}") in all task_overrides values
        :param task_overrides: dict of task field overrides; values may contain step references
|
|
:return:
|
|
"""
|
|
updated_overrides = {}
|
|
for k, v in task_overrides.items():
|
|
updated_overrides[k] = self._parse_step_ref(v)
|
|
|
|
return updated_overrides
|
|
|
|
def _verify_node_name(self, name):
|
|
# type: (str) -> None
|
|
if name in self._nodes:
|
|
raise ValueError('Node named \'{}\' already exists in the pipeline dag'.format(name))
|
|
if name in self._reserved_pipeline_names:
|
|
raise ValueError('Node named \'{}\' is a reserved keyword, use a different name'.format(name))
|
|
|
|
def _scan_monitored_nodes(self):
|
|
# type: () -> None
|
|
"""
|
|
Scan all nodes and monitor their metrics/artifacts/models
|
|
"""
|
|
for node in list(self._nodes.values()):
|
|
self._monitor_node(node)
|
|
|
|
def _monitor_node(self, node):
|
|
# type: (PipelineController.Node) -> None
|
|
"""
|
|
If Node is running, put the metrics from the node on the pipeline itself.
|
|
:param node: Node to test
|
|
"""
|
|
if not node:
|
|
return
|
|
|
|
# verify we have the node
|
|
if node.name not in self._monitored_nodes:
|
|
self._monitored_nodes[node.name] = {}
|
|
|
|
# if we are done with this node, skip it
|
|
if self._monitored_nodes[node.name].get('completed'):
|
|
return
|
|
|
|
if node.job and node.job.task:
|
|
task = node.job.task
|
|
elif node.job and node.executed and isinstance(node.executed, str):
|
|
task = Task.get_task(task_id=node.executed)
|
|
else:
|
|
return
|
|
|
|
# update the metrics
|
|
if node.monitor_metrics:
|
|
metrics_state = self._monitored_nodes[node.name].get('metrics', {})
|
|
logger = self._task.get_logger()
|
|
scalars = task.get_reported_scalars(x_axis='iter')
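            # forward a step scalar to the pipeline Task only when the latest reported value
            # is new, or higher than the last value already forwarded for that series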
|
|
for (s_title, s_series), (t_title, t_series) in node.monitor_metrics:
|
|
values = scalars.get(s_title, {}).get(s_series)
|
|
if values and values.get('x') is not None and values.get('y') is not None:
|
|
x = values['x'][-1]
|
|
y = values['y'][-1]
|
|
last_y = metrics_state.get(s_title, {}).get(s_series)
|
|
if last_y is None or y > last_y:
|
|
logger.report_scalar(title=t_title, series=t_series, value=y, iteration=int(x))
|
|
last_y = y
|
|
if not metrics_state.get(s_title):
|
|
metrics_state[s_title] = {}
|
|
metrics_state[s_title][s_series] = last_y
|
|
|
|
self._monitored_nodes[node.name]['metrics'] = metrics_state
|
|
|
|
if node.monitor_artifacts:
|
|
task.reload()
|
|
artifacts = task.data.execution.artifacts
|
|
self._task.reload()
|
|
output_artifacts = []
|
|
for s_artifact, t_artifact in node.monitor_artifacts:
|
|
# find artifact
|
|
for a in artifacts:
|
|
if a.key != s_artifact:
|
|
continue
|
|
|
|
new_a = copy(a)
|
|
new_a.key = t_artifact
|
|
output_artifacts.append(new_a)
|
|
break
|
|
|
|
# update artifacts directly on the Task
|
|
if output_artifacts:
|
|
# noinspection PyProtectedMember
|
|
self._task._add_artifacts(output_artifacts)
|
|
|
|
if node.monitor_models:
|
|
task.reload()
|
|
output_models = task.data.models.output
|
|
self._task.reload()
|
|
target_models = []
|
|
for s_model, t_model in node.monitor_models:
|
|
                # find the model
|
|
for a in output_models:
|
|
if a.name != s_model:
|
|
continue
|
|
|
|
new_a = copy(a)
|
|
new_a.name = t_model
|
|
target_models.append(new_a)
|
|
break
|
|
|
|
            # update output models directly on the Task
|
|
if target_models:
|
|
self._task.reload()
|
|
models = self._task.data.models
|
|
keys = [a.name for a in target_models]
|
|
models.output = [a for a in models.output or [] if a.name not in keys] + target_models
|
|
# noinspection PyProtectedMember
|
|
self._task._edit(models=models)
|
|
|
|
# update the state (so that we do not scan the node twice)
|
|
if node.job.is_stopped(aborted_nonresponsive_as_running=True):
|
|
self._monitored_nodes[node.name]['completed'] = True
|
|
|
|
def _get_target_project(self, return_project_id=False):
|
|
# type: (bool) -> str
|
|
"""
|
|
return the pipeline components target folder name/id
|
|
|
|
:param return_project_id: if False (default), return target folder name. If True, return project id
|
|
:return: project id/name (None if not valid)
|
|
"""
|
|
if not self._target_project:
|
|
return ''
|
|
|
|
if str(self._target_project).lower().strip() == 'true':
|
|
if not self._task:
|
|
return ''
|
|
return self._task.project if return_project_id else self._task.get_project_name()
|
|
|
|
if not return_project_id:
|
|
return self._target_project
|
|
|
|
return get_or_create_project(
|
|
session=self._task.session if self._task else Task.default_session,
|
|
project_name=self._target_project)
|
|
|
|
def _add_pipeline_name_run_number(self):
|
|
# type: () -> None
|
|
if not self._task:
|
|
return
|
|
# if we were already executed, do not rename (meaning aborted pipeline that was continued)
|
|
# noinspection PyProtectedMember
|
|
if self._task._get_runtime_properties().get(self._runtime_property_hash):
|
|
return
|
|
|
|
# remove the #<num> suffix if we have one:
|
|
task_name = re.compile(r" #\d+$").split(self._task.name or "", 1)[0]
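        # e.g. "my pipeline #3" -> "my pipeline"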
|
|
page_size = 100
|
|
# find exact name or " #<num>" extension
|
|
prev_pipelines_ids = self._task.query_tasks(
|
|
task_name=r"^{}(| #\d+)$".format(task_name),
|
|
task_filter=dict(
|
|
project=[self._task.project], system_tags=[self._tag],
|
|
order_by=['-created'],
|
|
page_size=page_size,
|
|
fetch_only_first_page=True,
|
|
)
|
|
)
|
|
max_value = len(prev_pipelines_ids) if prev_pipelines_ids else 0
|
|
# we hit the limit
|
|
if max_value == page_size:
|
|
# make sure that if we get something wrong we do not stop the pipeline,
|
|
# worst case fail to auto increment
|
|
try:
|
|
# we assume we are the latest so let's take a few (last 10) and check the max number
|
|
last_task_name = self._task.query_tasks(
|
|
task_filter=dict(task_ids=prev_pipelines_ids[:10], project=[self._task.project]),
|
|
additional_return_fields=['name'],
|
|
) # type: List[Dict]
|
|
# let's parse the names
|
|
pattern = re.compile(r" #(?P<key>\d+)$")
|
|
task_parts = [pattern.split(t.get('name') or "", 1) for t in last_task_name]
|
|
# find the highest number
|
|
for parts in task_parts:
|
|
if len(parts) >= 2:
|
|
try:
|
|
max_value = max(max_value, int(parts[1])+1)
|
|
except (TypeError, ValueError):
|
|
pass
|
|
except Exception as ex:
|
|
getLogger('clearml.automation.controller').warning(
|
|
'Pipeline auto run increment failed (skipping): {}'.format(ex))
|
|
max_value = 0
|
|
|
|
if max_value > 1:
|
|
self._task.set_name(task_name + " #{}".format(max_value))
|
|
|
|
    @classmethod
    def _get_pipeline_task(cls):
        # type: () -> Task
        """
        Return the pipeline Task (either the current one, or the parent Task of the currently running Task)
        Raise ValueError if we could not locate the pipeline Task

        :return: Pipeline Task
        """
        # get main Task.
        task = Task.current_task()
        if str(task.task_type) == str(Task.TaskTypes.controller) and cls._tag in task.get_system_tags():
            return task
        # get the parent Task, it should be the pipeline
        if not task.parent:
            raise ValueError("Could not locate parent Pipeline Task")
        parent = Task.get_task(task_id=task.parent)
        if str(parent.task_type) == str(Task.TaskTypes.controller) and cls._tag in parent.get_system_tags():
            return parent
        raise ValueError("Could not locate parent Pipeline Task")

def __verify_step_reference(self, node, step_ref_string):
|
|
# type: (PipelineController.Node, str) -> Optional[str]
|
|
"""
|
|
Verify the step reference. For example "${step1.parameters.Args/param}"
|
|
Raise ValueError on misconfiguration
|
|
|
|
:param Node node: calling reference node (used for logging)
|
|
:param str step_ref_string: For example "${step1.parameters.Args/param}"
|
|
:return: If step reference is used, return the pipeline step name, otherwise return None
|
|
"""
|
|
parts = step_ref_string[2:-1].split('.')
|
|
v = step_ref_string
|
|
if len(parts) < 2:
|
|
raise ValueError("Node '{}', parameter '{}' is invalid".format(node.name, v))
|
|
prev_step = parts[0]
|
|
input_type = parts[1]
|
|
|
|
# check if we reference the pipeline arguments themselves
|
|
if prev_step == self._pipeline_step_ref:
|
|
if input_type not in self._pipeline_args:
|
|
raise ValueError("Node '{}', parameter '{}', step name '{}' is invalid".format(node.name, v, prev_step))
|
|
return None
|
|
|
|
if prev_step not in self._nodes:
|
|
raise ValueError("Node '{}', parameter '{}', step name '{}' is invalid".format(node.name, v, prev_step))
|
|
if input_type not in ('artifacts', 'parameters', 'models', 'id'):
|
|
raise ValueError(
|
|
"Node {}, parameter '{}', input type '{}' is invalid".format(node.name, v, input_type))
|
|
|
|
if input_type != 'id' and len(parts) < 3:
|
|
raise ValueError("Node '{}', parameter '{}' is invalid".format(node.name, v))
|
|
|
|
if input_type == 'models':
|
|
try:
|
|
model_type = parts[2].lower()
|
|
except Exception:
|
|
raise ValueError(
|
|
"Node '{}', parameter '{}', input type '{}', model_type is missing {}".format(
|
|
node.name, v, input_type, parts))
|
|
if model_type not in ('input', 'output'):
|
|
raise ValueError(
|
|
"Node '{}', parameter '{}', input type '{}', "
|
|
"model_type is invalid (input/output) found {}".format(
|
|
node.name, v, input_type, model_type))
|
|
|
|
if len(parts) < 4:
|
|
raise ValueError(
|
|
"Node '{}', parameter '{}', input type '{}', model index is missing".format(
|
|
node.name, v, input_type))
|
|
|
|
# check casting
|
|
try:
|
|
int(parts[3])
|
|
except Exception:
|
|
raise ValueError(
|
|
"Node '{}', parameter '{}', input type '{}', model index is missing {}".format(
|
|
node.name, v, input_type, parts))
|
|
|
|
if len(parts) < 5:
|
|
raise ValueError(
|
|
"Node '{}', parameter '{}', input type '{}', model property is missing".format(
|
|
node.name, v, input_type))
|
|
|
|
if not hasattr(BaseModel, parts[4]):
|
|
raise ValueError(
|
|
"Node '{}', parameter '{}', input type '{}', model property is invalid {}".format(
|
|
node.name, v, input_type, parts[4]))
|
|
return prev_step
|
|
|
|
def __parse_step_reference(self, step_ref_string):
|
|
"""
|
|
return the adjusted value for "${step...}"
|
|
:param step_ref_string: reference string of the form "${step_name.type.value}"
|
|
:return: str with value
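
Illustrative reference forms (the step, parameter and artifact names below are hypothetical):
    - "${step1.id}"                            -> the step's Task ID
    - "${step1.parameters.Args/batch_size}"    -> a parameter value of the step
    - "${step1.artifacts.processed_data.url}"  -> an attribute of one of the step's artifacts
    - "${step1.models.output.0.id}"            -> a property of the step's first output model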
"""
|
|
parts = step_ref_string[2:-1].split('.')
|
|
if len(parts) < 2:
|
|
raise ValueError("Could not parse reference '{}'".format(step_ref_string))
|
|
prev_step = parts[0]
|
|
input_type = parts[1].lower()
|
|
|
|
# check if we reference the pipeline arguments themselves
|
|
if prev_step == self._pipeline_step_ref:
|
|
if parts[1] not in self._pipeline_args:
|
|
raise ValueError("Could not parse reference '{}', "
|
|
"pipeline argument '{}' could not be found".format(step_ref_string, parts[1]))
|
|
return self._pipeline_args[parts[1]]
|
|
|
|
if prev_step not in self._nodes or (
|
|
not self._nodes[prev_step].job and
|
|
not self._nodes[prev_step].executed and
|
|
not self._nodes[prev_step].base_task_id
|
|
):
|
|
raise ValueError("Could not parse reference '{}', step '{}' could not be found".format(
|
|
step_ref_string, prev_step))
|
|
|
|
if input_type not in (
|
|
'artifacts', 'parameters', 'models', 'id',
|
|
'script', 'execution', 'container', 'output',
|
|
'comment', 'tags', 'system_tags', 'project'):
|
|
raise ValueError("Could not parse reference '{}', type '{}' not valid".format(step_ref_string, input_type))
|
|
if input_type != 'id' and len(parts) < 3:
|
|
raise ValueError("Could not parse reference '{}', missing fields in '{}'".format(step_ref_string, parts))
|
|
|
|
task = self._nodes[prev_step].job.task if self._nodes[prev_step].job \
|
|
else Task.get_task(task_id=self._nodes[prev_step].executed or self._nodes[prev_step].base_task_id)
|
|
task.reload()
|
|
if input_type == 'artifacts':
|
|
# support escaped dots (\.) inside artifact names: swap them out so the split below ignores them
|
|
artifact_path = ('.'.join(parts[2:])).replace('\\.', '\\_dot_\\')
|
|
artifact_path = artifact_path.split('.')
|
|
|
|
obj = task.artifacts
|
|
for p in artifact_path:
|
|
p = p.replace('\\_dot_\\', '.')
|
|
if isinstance(obj, dict):
|
|
obj = obj.get(p)
|
|
elif hasattr(obj, p):
|
|
obj = getattr(obj, p)
|
|
else:
|
|
raise ValueError("Could not locate artifact {} on previous step {}".format(
|
|
'.'.join(parts[1:]), prev_step))
|
|
return str(obj)
|
|
elif input_type == 'parameters':
|
|
step_params = task.get_parameters()
|
|
param_name = '.'.join(parts[2:])
|
|
if param_name not in step_params:
|
|
raise ValueError("Could not locate parameter {} on previous step {}".format(
|
|
'.'.join(parts[1:]), prev_step))
|
|
return step_params.get(param_name)
|
|
elif input_type == 'models':
|
|
model_type = parts[2].lower()
|
|
if model_type not in ('input', 'output'):
|
|
raise ValueError("Could not locate model {} on previous step {}".format(
|
|
'.'.join(parts[1:]), prev_step))
|
|
try:
|
|
model_idx = int(parts[3])
|
|
model = task.models[model_type][model_idx]
|
|
except Exception:
|
|
raise ValueError("Could not locate model {} on previous step {}, index {} is invalid".format(
|
|
'.'.join(parts[1:]), prev_step, parts[3]))
|
|
|
|
return str(getattr(model, parts[4]))
|
|
elif input_type == 'id':
|
|
return task.id
|
|
elif input_type in (
|
|
'script', 'execution', 'container', 'output',
|
|
'comment', 'models', 'tags', 'system_tags', 'project'):
|
|
# noinspection PyProtectedMember
|
|
return task._get_task_property('.'.join(parts[1:]))
|
|
|
|
return None
|
|
|
|
@classmethod
|
|
def __create_task_link(cls, a_node, task_link_template):
|
|
# type: (PipelineController.Node, str) -> str
|
|
if not a_node:
|
|
return ''
|
|
# create the detailed parameter table
|
|
task_id = project_id = None
|
|
if a_node.job:
|
|
project_id = a_node.job.task.project
|
|
task_id = a_node.job.task.id
|
|
elif a_node.executed:
|
|
task_id = a_node.executed
|
|
if cls._task_project_lookup.get(task_id):
|
|
project_id = cls._task_project_lookup[task_id]
|
|
else:
|
|
# noinspection PyBroadException
|
|
try:
|
|
project_id = Task.get_task(task_id=task_id).project
|
|
except Exception:
|
|
project_id = '*'
|
|
cls._task_project_lookup[task_id] = project_id
|
|
|
|
if not task_id:
|
|
return ''
|
|
|
|
return '<a href="{}"> {} </a>'.format(task_link_template.format(project=project_id, task=task_id), task_id)
|
|
|
|
def _default_retry_on_failure_callback(self, _pipeline_controller, _node, retries, max_retries=None):
|
|
return retries < (self._def_max_retry_on_failure if max_retries is None else max_retries)
|
|
|
|
    def _upload_pipeline_artifact(self, artifact_name, artifact_object):
        self._task.upload_artifact(
            name=artifact_name,
            artifact_object=artifact_object,
            wait_on_upload=True,
            extension_name=".pkl" if isinstance(artifact_object, dict) else None,
        )


class PipelineDecorator(PipelineController):
|
|
_added_decorator = [] # type: List[dict]
|
|
_ref_lazy_loader_id_to_node_name = {} # type: dict
|
|
_singleton = None # type: Optional[PipelineDecorator]
|
|
_eager_step_artifact = 'eager_step'
|
|
_eager_execution_instance = False
|
|
_debug_execute_step_process = False
|
|
_debug_execute_step_function = False
|
|
_default_execution_queue = None
|
|
_multi_pipeline_instances = []
|
|
_multi_pipeline_call_counter = -1
|
|
_atexit_registered = False
|
|
|
|
def __init__(
|
|
self,
|
|
name, # type: str
|
|
project, # type: str
|
|
version, # type: str
|
|
pool_frequency=0.2, # type: float
|
|
add_pipeline_tags=False, # type: bool
|
|
target_project=None, # type: Optional[str]
|
|
abort_on_failure=False, # type: bool
|
|
add_run_number=True, # type: bool
|
|
retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa
|
|
docker=None, # type: Optional[str]
|
|
docker_args=None, # type: Optional[str]
|
|
docker_bash_setup_script=None, # type: Optional[str]
|
|
packages=None, # type: Optional[Union[str, Sequence[str]]]
|
|
repo=None, # type: Optional[str]
|
|
repo_branch=None, # type: Optional[str]
|
|
repo_commit=None # type: Optional[str]
|
|
):
|
|
# type: (...) -> ()
|
|
"""
|
|
Create a new pipeline controller. The newly created object will launch and monitor the new experiments.
|
|
|
|
:param name: Provide pipeline name (if main Task exists it overrides its name)
|
|
:param project: Provide project storing the pipeline (if main Task exists it overrides its project)
|
|
:param version: Must provide the pipeline version. This version allows you to uniquely identify the pipeline
|
|
template execution. Examples for semantic versions: version='1.0.1' , version='23', version='1.2'
|
|
:param float pool_frequency: The pooling frequency (in minutes) for monitoring experiments / states.
|
|
:param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all
|
|
steps (Tasks) created by this pipeline.
|
|
:param str target_project: If provided, all pipeline steps are cloned into the target project
|
|
:param bool abort_on_failure: If False (default), failed pipeline steps will not cause the pipeline
|
|
to stop immediately, instead any step that is not connected (or indirectly connected) to the failed step,
|
|
will still be executed. Nonetheless the pipeline itself will be marked failed, unless the failed step
|
|
was specifically defined with "continue_on_fail=True".
|
|
If True, any failed step will cause the pipeline to immediately abort, stop all running steps,
|
|
and mark the pipeline as failed.
|
|
:param add_run_number: If True (default), add the run number of the pipeline to the pipeline name.
|
|
Example, the second time we launch the pipeline "best pipeline", we rename it to "best pipeline #2"
|
|
:param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry
|
|
- Integer: In case of node failure, retry the node the number of times indicated by this parameter.
|
|
- Callable: A function called on node failure. Takes as parameters:
|
|
the PipelineController instance, the PipelineController.Node that failed and an int
|
|
representing the number of previous retries for the node that failed
|
|
The function must return a `bool`: True if the node should be retried and False otherwise.
|
|
If True, the node will be re-queued and the number of retries left will be decremented by 1.
|
|
By default, if this callback is not specified, the function will be retried the number of
|
|
times indicated by `retry_on_failure`.
|
|
|
|
.. code-block:: py
|
|
|
|
def example_retry_on_failure_callback(pipeline, node, retries):
|
|
print(node.name, ' failed')
|
|
# allow up to 5 retries (total of 6 runs)
|
|
return retries < 5
|
|
:param docker: Select the docker image to be executed in by the remote session
|
|
:param docker_args: Add docker arguments, pass a single string
|
|
:param docker_bash_setup_script: Add bash script to be executed
|
|
inside the docker before setting up the Task's environment
|
|
:param packages: Manually specify a list of required packages or a local requirements.txt file.
|
|
Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt"
|
|
If not provided, packages are automatically added.
|
|
:param repo: Optional, specify a repository to attach to the pipeline controller, when remotely executing.
|
|
Allow users to execute the controller inside the specified repository, enabling them to load modules/script
|
|
from the repository. Notice the execution work directory will be the repository root folder.
|
|
Supports both git repo url link, and local repository path (automatically converted into the remote
|
|
git/commit as currently checked out).
|
|
Example remote url: 'https://github.com/user/repo.git'
|
|
Example local repo copy: './repo' -> will automatically store the remote
|
|
repo url and commit ID based on the locally cloned copy
|
|
:param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used)
|
|
:param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used)
|
|
"""
|
|
super(PipelineDecorator, self).__init__(
|
|
name=name,
|
|
project=project,
|
|
version=version,
|
|
pool_frequency=pool_frequency,
|
|
add_pipeline_tags=add_pipeline_tags,
|
|
target_project=target_project,
|
|
abort_on_failure=abort_on_failure,
|
|
add_run_number=add_run_number,
|
|
retry_on_failure=retry_on_failure,
|
|
docker=docker,
|
|
docker_args=docker_args,
|
|
docker_bash_setup_script=docker_bash_setup_script,
|
|
packages=packages,
|
|
repo=repo,
|
|
repo_branch=repo_branch,
|
|
repo_commit=repo_commit
|
|
)
|
|
|
|
# if we are in eager execution, make sure parent class knows it
|
|
if self._eager_execution_instance:
|
|
self._mock_execution = True
|
|
|
|
if PipelineDecorator._default_execution_queue:
|
|
super(PipelineDecorator, self).set_default_execution_queue(
|
|
PipelineDecorator._default_execution_queue)
|
|
|
|
for n in self._added_decorator:
|
|
self._add_function_step(**n)
|
|
self._added_decorator.clear()
|
|
PipelineDecorator._singleton = self
|
|
self._reference_callback = []
|
|
# store launched nodes, in case we call the same function multiple times, and need renaming:
|
|
self._launched_step_names = set()
|
|
# map eager steps task id to the new step name
|
|
self._eager_steps_task_id = {} # type: Dict[str, str]
|
|
|
|
def _daemon(self):
|
|
# type: () -> ()
|
|
"""
|
|
The main pipeline execution loop. This loop is executed on its own dedicated thread.
|
|
Overrides the parent class daemon; here we only need to update the state.
|
|
|
|
:return:
|
|
"""
|
|
pooling_counter = 0
|
|
launched_nodes = set()
|
|
last_monitor_report = last_plot_report = time()
|
|
while self._stop_event:
|
|
# stop request
|
|
if self._stop_event.wait(self._pool_frequency if pooling_counter else 0.01):
|
|
break
|
|
|
|
pooling_counter += 1
|
|
|
|
# check the pipeline time limit
|
|
if self._pipeline_time_limit and (time() - self._start_time) > self._pipeline_time_limit:
|
|
break
|
|
|
|
self._update_progress()
|
|
# check the state of all current jobs
|
|
# if no job ended, continue
|
|
completed_jobs = []
|
|
nodes_failed_stop_pipeline = []
|
|
force_execution_plot_update = False
|
|
for j in self._running_nodes:
|
|
node = self._nodes[j]
|
|
if not node.job:
|
|
continue
|
|
if node.job.is_stopped(aborted_nonresponsive_as_running=True):
|
|
node_failed = node.job.is_failed()
|
|
if node_failed:
|
|
if self._call_retries_callback(node):
|
|
self._relaunch_node(node)
|
|
continue
|
|
else:
|
|
self._final_failure[node.name] = True
|
|
completed_jobs.append(j)
|
|
node.executed = node.job.task_id() if not node_failed else False
|
|
if j in launched_nodes:
|
|
launched_nodes.remove(j)
|
|
# check if we need to stop all running steps
|
|
if node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail:
|
|
nodes_failed_stop_pipeline.append(node.name)
|
|
elif node.timeout:
|
|
started = node.job.task.data.started
|
|
if (datetime.now().astimezone(started.tzinfo) - started).total_seconds() > node.timeout:
|
|
node.job.abort()
|
|
completed_jobs.append(j)
|
|
node.executed = node.job.task_id()
|
|
elif j in launched_nodes and node.job.is_running():
|
|
# make sure to update the execution graph when the job starts running
|
|
# (otherwise it will still be marked queued)
|
|
launched_nodes.remove(j)
|
|
force_execution_plot_update = True
|
|
|
|
# update running jobs
|
|
self._running_nodes = [j for j in self._running_nodes if j not in completed_jobs]
|
|
|
|
# nothing changed, we can sleep
|
|
if not completed_jobs and self._running_nodes:
|
|
# force updating the pipeline state (plot) at least every 5 min.
|
|
if force_execution_plot_update or time()-last_plot_report > self._update_execution_plot_interval:
|
|
last_plot_report = time()
|
|
last_monitor_report = time()
|
|
self.update_execution_plot()
|
|
elif time()-last_monitor_report > self._monitor_node_interval:
|
|
last_monitor_report = time()
|
|
self._scan_monitored_nodes()
|
|
continue
|
|
|
|
# callback on completed jobs
|
|
if self._experiment_completed_cb or self._post_step_callbacks:
|
|
for job in completed_jobs:
|
|
job_node = self._nodes.get(job)
|
|
if not job_node:
|
|
continue
|
|
if self._experiment_completed_cb:
|
|
self._experiment_completed_cb(self, job_node)
|
|
if self._post_step_callbacks.get(job_node.name):
|
|
self._post_step_callbacks[job_node.name](self, job_node)
|
|
|
|
# check if we need to stop the pipeline, and abort all running steps
|
|
if nodes_failed_stop_pipeline:
|
|
print('Aborting pipeline and stopping all running steps, node {} failed'.format(
|
|
nodes_failed_stop_pipeline))
|
|
break
|
|
|
|
# update current state (in configuration, so that we could later continue an aborted pipeline)
|
|
self._force_task_configuration_update()
|
|
|
|
# visualize pipeline state (plot)
|
|
self.update_execution_plot()
|
|
|
|
# stop all currently running jobs (protect against changes while iterating):
|
|
for node in list(self._nodes.values()):
|
|
if node.executed is False and not node.continue_on_fail:
|
|
self._pipeline_task_status_failed = True
|
|
|
|
if node.job and not node.job.is_stopped():
|
|
node.job.abort()
|
|
elif not node.job and not node.executed:
|
|
# mark Node as skipped if it has no Job object and it is not executed
|
|
node.skip_job = True
|
|
# if this is a standalone node, we need to remove it from the graph
|
|
if not node.parents:
|
|
# check if this node is anyone's parent
|
|
found_parent = False
|
|
for v in list(self._nodes.values()):
|
|
if node.name in (v.parents or []):
|
|
found_parent = True
|
|
break
|
|
if not found_parent:
|
|
self._nodes.pop(node.name, None)
|
|
|
|
# visualize pipeline state (plot)
|
|
self.update_execution_plot()
|
|
|
|
if self._stop_event:
|
|
# noinspection PyBroadException
|
|
try:
|
|
self._stop_event.set()
|
|
except Exception:
|
|
pass
|
|
|
|
def update_execution_plot(self):
|
|
# type: () -> ()
|
|
"""
|
|
Update sankey diagram of the current pipeline
|
|
"""
|
|
with self._reporting_lock:
|
|
self._update_eager_generated_steps()
|
|
super(PipelineDecorator, self).update_execution_plot()
|
|
|
|
def _update_eager_generated_steps(self):
|
|
# noinspection PyProtectedMember
|
|
self._task.reload()
|
|
artifacts = self._task.data.execution.artifacts
|
|
# check if we have a new step on the DAG
|
|
eager_artifacts = []
|
|
for a in artifacts:
|
|
if a.key and a.key.startswith('{}:'.format(self._eager_step_artifact)):
|
|
# expected value: 'eager_step:<parent-node-task-id>:<eager-step-task-id>'
|
|
eager_artifacts.append(a)
|
|
|
|
# verify we have the step, if we do not, add it.
|
|
delete_artifact_keys = []
|
|
for artifact in eager_artifacts:
|
|
_, parent_step_task_id, eager_step_task_id = artifact.key.split(':', 2)
|
|
|
|
# deserialize node definition
|
|
eager_node_def = json.loads(artifact.type_data.preview)
|
|
eager_node_name, eager_node_def = list(eager_node_def.items())[0]
|
|
|
|
# verify we do not have any new nodes on the DAG (i.e. a step generating a Node eagerly)
|
|
parent_node = None
|
|
for node in list(self._nodes.values()):
|
|
if not node.job and not node.executed:
|
|
continue
|
|
t_id = node.executed or node.job.task_id
|
|
if t_id == parent_step_task_id:
|
|
parent_node = node
|
|
break
|
|
|
|
if not parent_node:
|
|
# should not happen
|
|
continue
|
|
|
|
new_step_node_name = '{}_{}'.format(parent_node.name, eager_node_name)
|
|
counter = 1
|
|
while new_step_node_name in self._nodes:
|
|
new_step_node_name = '{}_{}'.format(new_step_node_name, counter)
|
|
counter += 1
|
|
|
|
eager_node_def['name'] = new_step_node_name
|
|
eager_node_def['parents'] = [parent_node.name]
|
|
is_cached = eager_node_def.pop('is_cached', None)
|
|
self._nodes[new_step_node_name] = self.Node(**eager_node_def)
|
|
self._nodes[new_step_node_name].job = RunningJob(existing_task=eager_step_task_id)
|
|
if is_cached:
|
|
self._nodes[new_step_node_name].job.force_set_is_cached(is_cached)
|
|
|
|
# make sure we will not rescan it.
|
|
delete_artifact_keys.append(artifact.key)
|
|
|
|
# remove all processed eager step artifacts
|
|
if delete_artifact_keys:
|
|
# noinspection PyProtectedMember
|
|
self._task._delete_artifacts(delete_artifact_keys)
|
|
self._force_task_configuration_update()
|
|
|
|
def _create_task_from_function(
|
|
self, docker, docker_args, docker_bash_setup_script,
|
|
function, function_input_artifacts, function_kwargs, function_return,
|
|
auto_connect_frameworks, auto_connect_arg_parser,
|
|
packages, project_name, task_name, task_type, repo, branch, commit,
|
|
helper_functions
|
|
):
|
|
def sanitize(function_source):
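            # Strip the leading "@<something>.component(...)" decorator call from the captured function
            # source, so the generated standalone step Task contains only the bare function definition.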
matched = re.match(r"[\s]*@[\w]*.component[\s\\]*\(", function_source)
|
|
if matched:
|
|
function_source = function_source[matched.span()[1]:]
|
|
# find the last ")"
|
|
open_parenthesis = 0
|
|
last_index = -1
|
|
for i, c in enumerate(function_source):
|
|
if not open_parenthesis and c == ')':
|
|
last_index = i
|
|
break
|
|
elif c == ')':
|
|
open_parenthesis -= 1
|
|
elif c == '(':
|
|
open_parenthesis += 1
|
|
if last_index >= 0:
|
|
function_source = function_source[last_index+1:].lstrip()
|
|
return function_source
|
|
|
|
task_definition = CreateFromFunction.create_task_from_function(
|
|
a_function=function,
|
|
function_kwargs=function_kwargs or None,
|
|
function_input_artifacts=function_input_artifacts,
|
|
function_return=function_return,
|
|
project_name=project_name,
|
|
task_name=task_name,
|
|
task_type=task_type,
|
|
auto_connect_frameworks=auto_connect_frameworks,
|
|
auto_connect_arg_parser=auto_connect_arg_parser,
|
|
repo=repo,
|
|
branch=branch,
|
|
commit=commit,
|
|
packages=packages,
|
|
docker=docker,
|
|
docker_args=docker_args,
|
|
docker_bash_setup_script=docker_bash_setup_script,
|
|
output_uri=None,
|
|
helper_functions=helper_functions,
|
|
dry_run=True,
|
|
task_template_header=self._task_template_header,
|
|
_sanitize_function=sanitize
|
|
)
|
|
return task_definition
|
|
|
|
def _find_executed_node_leaves(self):
|
|
# type: () -> List[PipelineController.Node]
|
|
all_parents = set([p for n in list(self._nodes.values()) if n.executed for p in n.parents])
|
|
executed_leaves = [name for name, n in list(self._nodes.items()) if n.executed and name not in all_parents]
|
|
return executed_leaves
|
|
|
|
def _adjust_task_hashing(self, task_hash):
|
|
# type: (dict) -> dict
|
|
"""
|
|
Fix the Task hashing so that parameters pointing to the current Task artifact are encoded using the
|
|
hash content of the artifact, instead of the Task.id
|
|
:param task_hash: Task representation dict
|
|
:return: Adjusted Task representation dict
|
|
"""
|
|
if task_hash.get('hyper_params'):
|
|
updated_params = {}
|
|
for k, v in task_hash['hyper_params'].items():
|
|
if k.startswith("{}/".format(CreateFromFunction.input_artifact_section)) and \
|
|
str(v).startswith("{}.".format(self._task.id)):
|
|
task_id, artifact_name = str(v).split(".", 1)
|
|
if artifact_name in self._task.artifacts:
|
|
updated_params[k] = self._task.artifacts[artifact_name].hash
|
|
task_hash['hyper_params'].update(updated_params)
|
|
|
|
return task_hash
|
|
|
|
@classmethod
|
|
def _wait_for_node(cls, node):
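        # Poll the node's job until it reaches a truly final state: a "stopped" status caused by a
        # relaunch, or a failure that may still be retried, is treated as still running and polled again.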
pool_period = 5.0 if cls._debug_execute_step_process else 20.0
|
|
while True:
|
|
node.job.wait(pool_period=pool_period, aborted_nonresponsive_as_running=True)
|
|
job_status = str(node.job.status(force=True))
|
|
if (
|
|
(
|
|
job_status == str(Task.TaskStatusEnum.stopped)
|
|
and node.job.status_message() == cls._relaunch_status_message
|
|
)
|
|
or (job_status == str(Task.TaskStatusEnum.failed) and not cls._final_failure.get(node.name))
|
|
or not node.job.is_stopped()
|
|
):
|
|
sleep(pool_period)
|
|
else:
|
|
break
|
|
|
|
@classmethod
|
|
def component(
|
|
cls,
|
|
_func=None, *,
|
|
return_values=('return_object', ), # type: Union[str, Sequence[str]]
|
|
name=None, # type: Optional[str]
|
|
cache=False, # type: bool
|
|
packages=None, # type: Optional[Union[str, Sequence[str]]]
|
|
parents=None, # type: Optional[List[str]]
|
|
execution_queue=None, # type: Optional[str]
|
|
continue_on_fail=False, # type: bool
|
|
docker=None, # type: Optional[str]
|
|
docker_args=None, # type: Optional[str]
|
|
docker_bash_setup_script=None, # type: Optional[str]
|
|
task_type=None, # type: Optional[str]
|
|
auto_connect_frameworks=None, # type: Optional[dict]
|
|
auto_connect_arg_parser=None, # type: Optional[dict]
|
|
repo=None, # type: Optional[str]
|
|
repo_branch=None, # type: Optional[str]
|
|
repo_commit=None, # type: Optional[str]
|
|
helper_functions=None, # type: Optional[Sequence[Callable]]
|
|
monitor_metrics=None, # type: Optional[List[Union[Tuple[str, str], Tuple[(str, str), (str, str)]]]]
|
|
monitor_artifacts=None, # type: Optional[List[Union[str, Tuple[str, str]]]]
|
|
monitor_models=None, # type: Optional[List[Union[str, Tuple[str, str]]]]
|
|
retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa
|
|
pre_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa
|
|
post_execute_callback=None # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa
|
|
):
|
|
# type: (...) -> Callable
|
|
"""
|
|
Decorate a function as a pipeline component, to be executed remotely as its own pipeline step Task
|
|
|
|
:param _func: wrapper function
|
|
:param return_values: Provide a list of names for all the results.
|
|
Notice! If not provided no results will be stored as artifacts.
|
|
:param name: Optional, set the name of the pipeline component task.
|
|
If not provided, the wrapped function name is used as the pipeline component name
|
|
:param cache: If True, before launching the new step,
|
|
after updating with the latest configuration, check if an exact Task with the same parameter/code
|
|
was already executed. If it was found, use it instead of launching a new Task. Default: False
|
|
:param packages: Manually specify a list of required packages or a local requirements.txt file.
|
|
Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt"
|
|
If not provided, packages are automatically added based on the imports used inside the wrapped function.
|
|
:param parents: Optional list of parent nodes in the DAG.
|
|
The current step in the pipeline will be sent for execution only after all the parent nodes
|
|
have been executed successfully.
|
|
:param execution_queue: Optional, the queue to use for executing this specific step.
|
|
If not provided, the task will be sent to the pipeline's default execution queue
|
|
:param continue_on_fail: (default False). If True, a failed step will not cause the pipeline to stop
|
|
(or marked as failed). Notice, that steps that are connected (or indirectly connected)
|
|
to the failed step will be skipped.
|
|
:param docker: Specify the docker image to be used when executing the pipeline step remotely
|
|
:param docker_args: Add docker execution arguments for the remote execution
|
|
(use single string for all docker arguments).
|
|
:param docker_bash_setup_script: Add a bash script to be executed inside the docker before
|
|
setting up the Task's environment
|
|
:param task_type: Optional, The task type to be created. Supported values: 'training', 'testing', 'inference',
|
|
'data_processing', 'application', 'monitor', 'controller', 'optimizer', 'service', 'qc', 'custom'
|
|
:param auto_connect_frameworks: Control the frameworks auto connect, see `Task.init` auto_connect_frameworks
|
|
:param auto_connect_arg_parser: Control the ArgParser auto connect, see `Task.init` auto_connect_arg_parser
|
|
:param repo: Optional, specify a repository to attach to the function, when remotely executing.
|
|
Allow users to execute the function inside the specified repository, enabling them to load modules/script
|
|
from the repository. Notice the execution work directory will be the repository root folder.
|
|
Supports both git repo url link, and local repository path (automatically converted into the remote
|
|
git/commit as currently checked out).
|
|
Example remote url: 'https://github.com/user/repo.git'
|
|
Example local repo copy: './repo' -> will automatically store the remote
|
|
repo url and commit ID based on the locally cloned copy
|
|
:param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used)
|
|
:param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used)
|
|
:param helper_functions: Optional, a list of helper functions to make available
|
|
for the standalone pipeline step function Task. By default, the pipeline step function has
no access to any of the other functions; by specifying additional functions here, the remote pipeline step
can call them.
|
|
Example, assuming we have two functions parse_data(), and load_data(): [parse_data, load_data]
|
|
:param monitor_metrics: Optional, Automatically log the step's reported metrics also on the pipeline Task.
|
|
The expected format is a list of pairs metric (title, series) to log:
|
|
[(step_metric_title, step_metric_series), ]
|
|
Example: [('test', 'accuracy'), ]
|
|
Or a list of tuple pairs, to specify a different target metric to use on the pipeline Task:
|
|
[((step_metric_title, step_metric_series), (target_metric_title, target_metric_series)), ]
|
|
Example: [[('test', 'accuracy'), ('model', 'accuracy')], ]
|
|
:param monitor_artifacts: Optional, Automatically log the step's artifacts on the pipeline Task.
|
|
Provided a list of artifact names created by the step function, these artifacts will be logged
|
|
automatically also on the Pipeline Task itself.
|
|
Example: ['processed_data', ]
|
|
(the target artifact on the Pipeline Task will have the same name as the original artifact)
|
|
Alternatively, provide a list of pairs (source_artifact_name, target_artifact_name):
|
|
where the first string is the artifact name as it appears on the component Task,
|
|
and the second is the target artifact name to put on the Pipeline Task
|
|
Example: [('processed_data', 'final_processed_data'), ]
|
|
:param monitor_models: Optional, Automatically log the step's output models on the pipeline Task.
|
|
Provided a list of model names created by the step's Task, they will also appear on the Pipeline itself.
|
|
Example: ['model_weights', ]
|
|
To select the latest (lexicographic) model use "model_*", or the last created model with just "*"
|
|
Example: ['model_weights_*', ]
|
|
Alternatively, provide a list of pairs (source_model_name, target_model_name):
|
|
where the first string is the model name as it appears on the component Task,
|
|
and the second is the target model name to put on the Pipeline Task
|
|
Example: [('model_weights', 'final_model_weights'), ]
|
|
:param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry
|
|
- Integer: In case of node failure, retry the node the number of times indicated by this parameter.
|
|
- Callable: A function called on node failure. Takes as parameters:
|
|
the PipelineController instance, the PipelineController.Node that failed and an int
|
|
representing the number of previous retries for the node that failed
|
|
The function must return a `bool`: True if the node should be retried and False otherwise.
|
|
If True, the node will be re-queued and the number of retries left will be decremented by 1.
|
|
By default, if this callback is not specified, the function will be retried the number of
|
|
times indicated by `retry_on_failure`.
|
|
|
|
.. code-block:: py
|
|
|
|
def example_retry_on_failure_callback(pipeline, node, retries):
|
|
print(node.name, ' failed')
|
|
# allow up to 5 retries (total of 6 runs)
|
|
return retries < 5
|
|
|
|
:param pre_execute_callback: Callback function, called when the step (Task) is created
|
|
and before it is sent for execution. Allows a user to modify the Task before launch.
|
|
Use `node.job` to access the ClearmlJob object, or `node.job.task` to directly access the Task object.
|
|
`parameters` are the configuration arguments passed to the ClearmlJob.
|
|
|
|
If the callback returned value is `False`,
|
|
the Node is skipped and so is any node in the DAG that relies on this node.
|
|
|
|
Notice the `parameters` are already parsed,
|
|
e.g. `${step1.parameters.Args/param}` is replaced with relevant value.
|
|
|
|
.. code-block:: py
|
|
|
|
def step_created_callback(
|
|
pipeline, # type: PipelineController,
|
|
node, # type: PipelineController.Node,
|
|
parameters, # type: dict
|
|
):
|
|
pass
|
|
|
|
:param post_execute_callback: Callback function, called when a step (Task) is completed
|
|
and before other jobs are executed. Allows a user to modify the Task status after completion.
|
|
|
|
.. code-block:: py
|
|
|
|
def step_completed_callback(
|
|
pipeline, # type: PipelineController,
|
|
node, # type: PipelineController.Node,
|
|
):
|
|
pass
|
|
|
|
|
|
:return: function wrapper
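
A minimal usage sketch (the step function, its argument, the metric value and the queue name below are
illustrative only, not part of the library API):

.. code-block:: py

    @PipelineDecorator.component(return_values=['accuracy'], cache=True, execution_queue='default')
    def evaluate_model(model_id: str):
        # runs remotely as its own pipeline step Task
        accuracy = 0.95  # placeholder for real evaluation logic
        return accuracy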
"""
|
|
def decorator_wrap(func):
|
|
_name = name or str(func.__name__)
|
|
function_return = return_values if isinstance(return_values, (tuple, list)) else [return_values]
|
|
|
|
inspect_func = inspect.getfullargspec(func)
|
|
# add default argument values
|
|
if inspect_func.args:
|
|
default_values = list(inspect_func.defaults or [])
|
|
default_values = ([None] * (len(inspect_func.args)-len(default_values))) + default_values
|
|
function_kwargs = {k: v for k, v in zip(inspect_func.args, default_values)}
|
|
else:
|
|
function_kwargs = dict()
|
|
|
|
add_step_spec = dict(
|
|
name=_name,
|
|
function=func,
|
|
function_kwargs=function_kwargs,
|
|
function_return=function_return,
|
|
cache_executed_step=cache,
|
|
packages=packages,
|
|
parents=parents,
|
|
execution_queue=execution_queue,
|
|
continue_on_fail=continue_on_fail,
|
|
docker=docker,
|
|
docker_args=docker_args,
|
|
docker_bash_setup_script=docker_bash_setup_script,
|
|
auto_connect_frameworks=auto_connect_frameworks,
|
|
auto_connect_arg_parser=auto_connect_arg_parser,
|
|
task_type=task_type,
|
|
repo=repo,
|
|
repo_branch=repo_branch,
|
|
repo_commit=repo_commit,
|
|
helper_functions=helper_functions,
|
|
monitor_metrics=monitor_metrics,
|
|
monitor_models=monitor_models,
|
|
monitor_artifacts=monitor_artifacts,
|
|
pre_execute_callback=pre_execute_callback,
|
|
post_execute_callback=post_execute_callback
|
|
)
|
|
|
|
if cls._singleton:
|
|
cls._singleton._add_function_step(**add_step_spec)
|
|
else:
|
|
cls._added_decorator.append(add_step_spec)
|
|
|
|
@functools.wraps(func)
|
|
def wrapper(*args, **kwargs):
|
|
if cls._debug_execute_step_function:
|
|
args = walk_nested_dict_tuple_list(
|
|
args, lambda x: x._remoteref() if isinstance(x, LazyEvalWrapper) else x)
|
|
kwargs = walk_nested_dict_tuple_list(
|
|
kwargs, lambda x: x._remoteref() if isinstance(x, LazyEvalWrapper) else x)
|
|
|
|
func_return = []
|
|
|
|
def result_wrapper(a_func_return, return_index):
|
|
if not a_func_return:
|
|
a_func_return.append(func(*args, **kwargs))
|
|
a_func_return = a_func_return[0]
|
|
return a_func_return if return_index is None else a_func_return[return_index]
|
|
|
|
if len(function_return) == 1:
|
|
ret_val = LazyEvalWrapper(
|
|
callback=functools.partial(result_wrapper, func_return, None),
|
|
remote_reference=functools.partial(result_wrapper, func_return, None))
|
|
cls._ref_lazy_loader_id_to_node_name[id(ret_val)] = _name
|
|
return ret_val
|
|
else:
|
|
return_w = [LazyEvalWrapper(
|
|
callback=functools.partial(result_wrapper, func_return, i),
|
|
remote_reference=functools.partial(result_wrapper, func_return, i))
|
|
for i, _ in enumerate(function_return)]
|
|
for i in return_w:
|
|
cls._ref_lazy_loader_id_to_node_name[id(i)] = _name
|
|
return return_w
|
|
|
|
# resolve all lazy objects if we have any:
|
|
kwargs_artifacts = {}
|
|
for i, v in enumerate(args):
|
|
kwargs[inspect_func.args[i]] = v
|
|
|
|
# We need to remember when a pipeline step's return value is evaluated by the pipeline
|
|
# controller, but not when it's done here (as we would remember the step every time).
|
|
# _add_to_evaluated_return_values protects that
|
|
tid = current_thread().ident
|
|
cls._add_to_evaluated_return_values[tid] = False
|
|
kwargs_artifacts.update(
|
|
{
|
|
k: walk_nested_dict_tuple_list(
|
|
v,
|
|
lambda x: x._remoteref() if isinstance(x, LazyEvalWrapper) else x
|
|
)
|
|
for k, v in kwargs.items()
|
|
if isinstance(v, LazyEvalWrapper)
|
|
}
|
|
)
|
|
cls._add_to_evaluated_return_values[tid] = True
|
|
kwargs = {k: deepcopy(v) for k, v in kwargs.items() if not isinstance(v, LazyEvalWrapper)}
|
|
|
|
# check if we have the singleton
|
|
if not cls._singleton:
|
|
# todo: somehow make sure the generated tasks list the parent pipeline as parent
|
|
original_tags = Task.current_task().get_tags(), Task.current_task().get_system_tags()
|
|
# This is an adhoc pipeline step,
|
|
PipelineDecorator._eager_execution_instance = True
|
|
a_pipeline = PipelineDecorator(
|
|
name=name,
|
|
project='DevOps', # it will not actually be used
|
|
version='0.0.0',
|
|
pool_frequency=111,
|
|
add_pipeline_tags=False,
|
|
target_project=None,
|
|
)
|
|
|
|
target_queue = \
|
|
PipelineDecorator._default_execution_queue or \
|
|
Task.current_task().data.execution.queue
|
|
if target_queue:
|
|
PipelineDecorator.set_default_execution_queue(target_queue)
|
|
else:
|
|
# if we are not running from a queue, we are probably in debug mode
|
|
a_pipeline._clearml_job_class = LocalClearmlJob
|
|
a_pipeline._default_execution_queue = 'mock'
|
|
|
|
# restore tags, the pipeline might add a few
|
|
Task.current_task().set_tags(original_tags[0])
|
|
Task.current_task().set_system_tags(original_tags[1])
|
|
|
|
# get node name
|
|
_node_name = _name
|
|
|
|
# check if we are launching the same node twice
|
|
if _node_name in cls._singleton._launched_step_names:
|
|
# if we already launched a JOB on the node, this means we are calling the same function/task
|
|
# twice inside the pipeline, this means we need to replicate the node.
|
|
_node = cls._singleton._nodes[_node_name].copy()
|
|
_node.parents = []
|
|
# find a new name
|
|
counter = 1
|
|
# Use nodes in `_singleton._nodes` that have not been launched.
|
|
# First check if we launched the node.
|
|
# If it wasn't launched we also need to check that the new name of `_node`
|
|
# points to the original code section it was meant to run.
|
|
# Note that for the first iteration (when `_node.name == _node_name`)
|
|
# we always increment the name, as the name is always in `_launched_step_names`
|
|
while _node.name in cls._singleton._launched_step_names or (
|
|
_node.name in cls._singleton._nodes
|
|
and cls._singleton._nodes[_node.name].job_code_section != cls._singleton._nodes[_node_name].job_code_section
|
|
):
|
|
_node.name = "{}_{}".format(_node_name, counter)
|
|
counter += 1
|
|
_node_name = _node.name
|
|
if _node.name not in cls._singleton._nodes:
|
|
cls._singleton._nodes[_node.name] = _node
|
|
|
|
# get the node and mark it as launched
|
|
cls._singleton._launched_step_names.add(_node_name)
|
|
_node = cls._singleton._nodes[_node_name]
|
|
cls._retries[_node_name] = 0
|
|
cls._retries_callbacks[_node_name] = retry_on_failure if callable(retry_on_failure) else \
|
|
(functools.partial(cls._singleton._default_retry_on_failure_callback, max_retries=retry_on_failure)
|
|
if isinstance(retry_on_failure, int) else cls._singleton._retry_on_failure_callback)
|
|
|
|
# The actual launch is a bit slow, we run it in the background
|
|
launch_thread = Thread(
|
|
target=cls._component_launch,
|
|
args=(_node_name, _node, kwargs_artifacts, kwargs, current_thread().ident))
|
|
|
|
def results_reference(return_name):
|
|
# wait until launch is completed
|
|
if launch_thread and launch_thread.is_alive():
|
|
try:
|
|
launch_thread.join()
|
|
except: # noqa
|
|
pass
|
|
|
|
cls._wait_for_node(_node)
|
|
if not _node.job:
|
|
if not _node.executed:
|
|
raise ValueError("Job was not created and is also not cached/executed")
|
|
return "{}.{}".format(_node.executed, return_name)
|
|
|
|
if _node.job.is_failed() and not _node.continue_on_fail:
|
|
raise ValueError(
|
|
'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id()))
|
|
|
|
_node.executed = _node.job.task_id()
|
|
return "{}.{}".format(_node.job.task_id(), return_name)
|
|
|
|
def result_wrapper(return_name):
|
|
# wait until launch is completed
|
|
if launch_thread and launch_thread.is_alive():
|
|
try:
|
|
launch_thread.join()
|
|
except: # noqa
|
|
pass
|
|
|
|
cls._wait_for_node(_node)
|
|
if (_node.job.is_failed() and not _node.continue_on_fail) or _node.job.is_aborted():
|
|
raise ValueError(
|
|
'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id())
|
|
)
|
|
|
|
_node.executed = _node.job.task_id()
|
|
|
|
# make sure we mark the current state of the DAG execution tree
|
|
# so that later we can find the "parents" to the current node
|
|
_tid = current_thread().ident
|
|
if cls._add_to_evaluated_return_values.get(_tid, True):
|
|
if _tid not in cls._evaluated_return_values:
|
|
cls._evaluated_return_values[_tid] = []
|
|
cls._evaluated_return_values[_tid].append(_node.name)
|
|
|
|
task = Task.get_task(_node.job.task_id())
|
|
if return_name in task.artifacts:
|
|
return task.artifacts[return_name].get()
|
|
return task.get_parameters(cast=True)[CreateFromFunction.return_section + "/" + return_name]
|
|
|
|
return_w = [LazyEvalWrapper(
|
|
callback=functools.partial(result_wrapper, n),
|
|
remote_reference=functools.partial(results_reference, n)) for n in function_return]
|
|
for i in return_w:
|
|
cls._ref_lazy_loader_id_to_node_name[id(i)] = _node_name
|
|
|
|
# start the launch thread now
|
|
launch_thread.start()
|
|
|
|
return return_w[0] if len(return_w) == 1 else return_w
|
|
|
|
return wrapper
|
|
|
|
return decorator_wrap if _func is None else decorator_wrap(_func)
|
|
|
|
@classmethod
|
|
def pipeline(
|
|
cls,
|
|
_func=None, *, # noqa
|
|
name, # type: str
|
|
project, # type: str
|
|
version, # type: str
|
|
return_value=None, # type: Optional[str]
|
|
default_queue=None, # type: Optional[str]
|
|
pool_frequency=0.2, # type: float
|
|
add_pipeline_tags=False, # type: bool
|
|
target_project=None, # type: Optional[str]
|
|
abort_on_failure=False, # type: bool
|
|
pipeline_execution_queue='services', # type: Optional[str]
|
|
multi_instance_support=False, # type: bool
|
|
add_run_number=True, # type: bool
|
|
args_map=None, # type: dict[str, List[str]]
|
|
start_controller_locally=False, # type: bool
|
|
retry_on_failure=None, # type: Optional[Union[int, Callable[[PipelineController, PipelineController.Node, int], bool]]] # noqa
|
|
docker=None, # type: Optional[str]
|
|
docker_args=None, # type: Optional[str]
|
|
docker_bash_setup_script=None, # type: Optional[str]
|
|
packages=None, # type: Optional[Union[str, Sequence[str]]]
|
|
repo=None, # type: Optional[str]
|
|
repo_branch=None, # type: Optional[str]
|
|
repo_commit=None # type: Optional[str]
|
|
):
|
|
# type: (...) -> Callable
|
|
"""
|
|
Decorate pipeline logic function.
|
|
|
|
:param name: Provide pipeline name (if main Task exists it overrides its name)
|
|
:param project: Provide project storing the pipeline (if main Task exists it overrides its project)
|
|
:param version: Must provide the pipeline version. This version allows you to uniquely identify the pipeline
|
|
template execution. Examples for semantic versions: version='1.0.1' , version='23', version='1.2'
|
|
:param return_value: Optional, Provide an artifact name to store the pipeline function return object
|
|
Notice, If not provided the pipeline will not store the pipeline function return value.
|
|
:param default_queue: default pipeline step queue
|
|
:param float pool_frequency: The pooling frequency (in minutes) for monitoring experiments / states.
|
|
:param bool add_pipeline_tags: (default: False) if True, add `pipe: <pipeline_task_id>` tag to all
|
|
steps (Tasks) created by this pipeline.
|
|
:param str target_project: If provided, all pipeline steps are cloned into the target project
|
|
:param bool abort_on_failure: If False (default), failed pipeline steps will not cause the pipeline
|
|
to stop immediately, instead any step that is not connected (or indirectly connected) to the failed step,
|
|
will still be executed. Nonetheless the pipeline itself will be marked failed, unless the failed step
|
|
was specifically defined with "continue_on_fail=True".
|
|
If True, any failed step will cause the pipeline to immediately abort, stop all running steps,
|
|
and mark the pipeline as failed.
|
|
:param pipeline_execution_queue: remote pipeline execution queue (default 'services' queue).
|
|
If None is passed, execute the pipeline logic locally (pipeline steps are still executed remotely)
|
|
:param multi_instance_support: If True, allow multiple calls to the same pipeline function,
|
|
each call creating a new Pipeline Task. Notice it is recommended to create an additional Task on the
|
|
"main process" acting as a master pipeline, automatically collecting the execution plots.
|
|
If multi_instance_support=='parallel' then the pipeline calls are executed in parallel,
|
|
in the `parallel` case the function calls return None, to collect all pipeline results call
|
|
`PipelineDecorator.wait_for_multi_pipelines()`.
|
|
Default False, no multi instance pipeline support.
|
|
:param add_run_number: If True (default), add the run number of the pipeline to the pipeline name.
|
|
Example, the second time we launch the pipeline "best pipeline", we rename it to "best pipeline #2"
|
|
:param args_map: Map arguments to their specific configuration section. Arguments not included in this map
|
|
will default to `Args` section. For example, for the following code:
|
|
|
|
.. code-block:: py
|
|
|
|
@PipelineDecorator.pipeline(args_map={'sectionA': ['paramA'], 'sectionB': ['paramB', 'paramC']})
|
|
def executing_pipeline(paramA, paramB, paramC, paramD):
|
|
pass
|
|
|
|
Parameters would be stored as:
|
|
- paramA: sectionA/paramA
|
|
- paramB: sectionB/paramB
|
|
- paramC: sectionB/paramC
|
|
- paramD: Args/paramD
|
|
:param start_controller_locally: If True, start the controller on the local machine. The steps will run
|
|
remotely if `PipelineDecorator.run_locally` or `PipelineDecorator.debug_pipeline` are not called.
|
|
Default: False
|
|
:param retry_on_failure: Integer (number of retries) or Callback function that returns True to allow a retry
|
|
- Integer: In case of node failure, retry the node the number of times indicated by this parameter.
|
|
- Callable: A function called on node failure. Takes as parameters:
|
|
the PipelineController instance, the PipelineController.Node that failed and an int
|
|
representing the number of previous retries for the node that failed
|
|
The function must return a `bool`: True if the node should be retried and False otherwise.
|
|
If True, the node will be re-queued and the number of retries left will be decremented by 1.
|
|
By default, if this callback is not specified, the function will be retried the number of
|
|
times indicated by `retry_on_failure`.
|
|
|
|
.. code-block:: py
|
|
|
|
def example_retry_on_failure_callback(pipeline, node, retries):
|
|
print(node.name, ' failed')
|
|
# allow up to 5 retries (total of 6 runs)
|
|
return retries < 5
|
|
:param docker: Select the docker image to be executed in by the remote session
|
|
:param docker_args: Add docker arguments, pass a single string
|
|
:param docker_bash_setup_script: Add bash script to be executed
|
|
inside the docker before setting up the Task's environment
|
|
:param packages: Manually specify a list of required packages or a local requirements.txt file.
|
|
Example: ["tqdm>=2.1", "scikit-learn"] or "./requirements.txt"
|
|
If not provided, packages are automatically added based on the imports used in the function.
|
|
:param repo: Optional, specify a repository to attach to the function, when remotely executing.
|
|
Allow users to execute the function inside the specified repository, enabling them to load modules/script
|
|
from the repository. Notice the execution work directory will be the repository root folder.
|
|
Supports both git repo url link, and local repository path (automatically converted into the remote
|
|
git/commit as currently checked out).
|
|
Example remote url: 'https://github.com/user/repo.git'
|
|
Example local repo copy: './repo' -> will automatically store the remote
|
|
repo url and commit ID based on the locally cloned copy
|
|
:param repo_branch: Optional, specify the remote repository branch (Ignored, if local repo path is used)
|
|
:param repo_commit: Optional, specify the repository commit ID (Ignored, if local repo path is used)
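
A minimal end-to-end sketch (the pipeline function, its steps and all names below are illustrative only):

.. code-block:: py

    @PipelineDecorator.pipeline(name='demo pipeline', project='examples', version='0.1.0', default_queue='default')
    def run_pipeline(dataset_id: str = '<dataset-id>'):
        processed = preprocess(dataset_id)   # a @PipelineDecorator.component step
        return train_model(processed)        # another component step

    if __name__ == '__main__':
        run_pipeline()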
"""
|
|
def decorator_wrap(func):
|
|
|
|
def internal_decorator(*args, **kwargs):
|
|
pipeline_kwargs = dict(**(kwargs or {}))
|
|
pipeline_kwargs_types = dict()
|
|
inspect_func = inspect.getfullargspec(func)
|
|
if args:
|
|
if not inspect_func.args:
|
|
raise ValueError("Could not parse function arguments")
|
|
|
|
pipeline_kwargs.update({inspect_func.args[i]: v for i, v in enumerate(args)})
|
|
|
|
# add default function arguments if we have defaults for all arguments
|
|
if inspect_func.args:
|
|
default_values = list(inspect_func.defaults or [])
|
|
default_values = ([None] * (len(inspect_func.args) - len(default_values))) + default_values
|
|
default_kwargs = {k: v for k, v in zip(inspect_func.args, default_values)}
|
|
default_kwargs.update(pipeline_kwargs)
|
|
pipeline_kwargs = default_kwargs
|
|
|
|
if inspect_func.annotations:
|
|
pipeline_kwargs_types = {
|
|
str(k): inspect_func.annotations[k] for k in inspect_func.annotations}
|
|
|
|
# run the entire pipeline locally, as python functions
|
|
if cls._debug_execute_step_function:
|
|
a_pipeline = PipelineDecorator(
|
|
name=name,
|
|
project=project,
|
|
version=version,
|
|
pool_frequency=pool_frequency,
|
|
add_pipeline_tags=add_pipeline_tags,
|
|
target_project=target_project,
|
|
abort_on_failure=abort_on_failure,
|
|
add_run_number=add_run_number,
|
|
retry_on_failure=retry_on_failure,
|
|
docker=docker,
|
|
docker_args=docker_args,
|
|
docker_bash_setup_script=docker_bash_setup_script,
|
|
packages=packages,
|
|
repo=repo,
|
|
repo_branch=repo_branch,
|
|
repo_commit=repo_commit
|
|
)
|
|
ret_val = func(**pipeline_kwargs)
|
|
LazyEvalWrapper.trigger_all_remote_references()
|
|
a_pipeline._task.close()
|
|
return ret_val
|
|
|
|
# check if we are in a multi pipeline
|
|
force_single_multi_pipeline_call = False
|
|
if multi_instance_support and cls._multi_pipeline_call_counter >= 0:
|
|
# check if we are running remotely
|
|
if not Task.running_locally():
|
|
# get the main Task property
|
|
t = Task.get_task(task_id=get_remote_task_id())
|
|
if str(t.task_type) == str(Task.TaskTypes.controller):
|
|
# noinspection PyBroadException
|
|
try:
|
|
# noinspection PyProtectedMember
|
|
multi_pipeline_call_counter = int(
|
|
t._get_runtime_properties().get('multi_pipeline_counter', None))
|
|
|
|
# NOTICE! if this is not our call we LEAVE immediately
|
|
# check if this is our call to start, if not we will wait for the next one
|
|
if multi_pipeline_call_counter != cls._multi_pipeline_call_counter:
|
|
return
|
|
except Exception:
|
|
# this is not the one, so we should just run the first
|
|
# instance and leave immediately
|
|
force_single_multi_pipeline_call = True
|
|
|
|
if default_queue:
|
|
cls.set_default_execution_queue(default_queue)
|
|
|
|
a_pipeline = PipelineDecorator(
|
|
name=name,
|
|
project=project,
|
|
version=version,
|
|
pool_frequency=pool_frequency,
|
|
add_pipeline_tags=add_pipeline_tags,
|
|
target_project=target_project,
|
|
abort_on_failure=abort_on_failure,
|
|
add_run_number=add_run_number,
|
|
retry_on_failure=retry_on_failure,
|
|
docker=docker,
|
|
docker_args=docker_args,
|
|
docker_bash_setup_script=docker_bash_setup_script,
|
|
packages=packages,
|
|
repo=repo,
|
|
repo_branch=repo_branch,
|
|
repo_commit=repo_commit
|
|
)
|
|
|
|
a_pipeline._args_map = args_map or {}
|
|
|
|
if PipelineDecorator._debug_execute_step_process:
|
|
a_pipeline._clearml_job_class = LocalClearmlJob
|
|
a_pipeline._default_execution_queue = 'mock'
|
|
|
|
a_pipeline._clearml_job_class.register_hashing_callback(a_pipeline._adjust_task_hashing)
|
|
|
|
# add pipeline arguments
|
|
for k in pipeline_kwargs:
|
|
a_pipeline.add_parameter(
|
|
name=k,
|
|
default=pipeline_kwargs.get(k),
|
|
param_type=pipeline_kwargs_types.get(k)
|
|
)
|
|
|
|
# sync multi-pipeline call counter (so we know which one to skip)
|
|
if Task.running_locally() and multi_instance_support and cls._multi_pipeline_call_counter >= 0:
|
|
# noinspection PyProtectedMember
|
|
a_pipeline._task._set_runtime_properties(
|
|
dict(multi_pipeline_counter=str(cls._multi_pipeline_call_counter)))
|
|
|
|
a_pipeline._start(wait=False)
|
|
|
|
# sync arguments back (post deserialization and casting back)
|
|
for k in pipeline_kwargs.keys():
|
|
if k in a_pipeline.get_parameters():
|
|
pipeline_kwargs[k] = a_pipeline.get_parameters()[k]
|
|
|
|
# run the actual pipeline
|
|
if not start_controller_locally and \
|
|
not PipelineDecorator._debug_execute_step_process and pipeline_execution_queue:
|
|
# rerun the pipeline on a remote machine
|
|
a_pipeline._task.execute_remotely(queue_name=pipeline_execution_queue)
|
|
# when we get here it means we are running remotely
|
|
|
|
# this time the pipeline is executed only on the remote machine
|
|
try:
|
|
pipeline_result = func(**pipeline_kwargs)
|
|
except Exception:
|
|
a_pipeline.stop(mark_failed=True)
|
|
raise
|
|
|
|
triggered_exception = None
|
|
try:
|
|
LazyEvalWrapper.trigger_all_remote_references()
|
|
except Exception as ex:
|
|
triggered_exception = ex
|
|
|
|
# make sure we wait for all nodes to finish
|
|
waited = True
|
|
while waited:
|
|
waited = False
|
|
for node in list(a_pipeline._nodes.values()):
|
|
if node.executed or not node.job or node.job.is_stopped(aborted_nonresponsive_as_running=True):
|
|
continue
|
|
cls._wait_for_node(node)
|
|
waited = True
|
|
# store the pipeline result if we have any:
|
|
if return_value and pipeline_result is not None:
|
|
a_pipeline._upload_pipeline_artifact(
|
|
artifact_name=str(return_value), artifact_object=pipeline_result
|
|
)
|
|
|
|
# now we can stop the pipeline
|
|
a_pipeline.stop()
|
|
# now we can raise the exception
|
|
if triggered_exception:
|
|
raise triggered_exception
|
|
|
|
# Make sure that if we do not need to run all pipelines we forcefully leave the process
|
|
if force_single_multi_pipeline_call:
|
|
leave_process()
|
|
# we will never get here
|
|
|
|
return pipeline_result
|
|
|
|
if multi_instance_support:
|
|
return cls._multi_pipeline_wrapper(
|
|
func=internal_decorator, parallel=bool(multi_instance_support == 'parallel'))
|
|
|
|
return internal_decorator
|
|
|
|
return decorator_wrap if _func is None else decorator_wrap(_func)
|
|
|
|
@classmethod
|
|
def set_default_execution_queue(cls, default_execution_queue):
|
|
# type: (Optional[str]) -> None
|
|
"""
|
|
Set the default execution queue if pipeline step does not specify an execution queue
|
|
|
|
:param default_execution_queue: The execution queue to use if no execution queue is provided
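
Example (the queue name is illustrative):

.. code-block:: py

    PipelineDecorator.set_default_execution_queue('default')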
"""
|
|
cls._default_execution_queue = str(default_execution_queue) if default_execution_queue else None
|
|
|
|
    @classmethod
    def run_locally(cls):
        # type: () -> ()
        """
        Set local mode: run all the pipeline step functions locally as sub-processes.

        Run the full pipeline DAG locally, where each step is executed as a sub-process Task.
        Notice: running the DAG locally assumes local code execution (i.e. it will not clone & apply a git diff)
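
        For example, a minimal sketch, where ``my_pipeline`` stands for a hypothetical function decorated
        with ``PipelineDecorator.pipeline``:

        .. code-block:: py

            # must be called before invoking the pipeline function
            PipelineDecorator.run_locally()
            my_pipeline()
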
        """
        cls._debug_execute_step_process = True
        cls._debug_execute_step_function = False

    @classmethod
    def debug_pipeline(cls):
        # type: () -> ()
        """
        Set debugging mode: run all the pipeline step functions locally as plain functions (serially).
        Run the full pipeline DAG locally, where steps are executed as functions.
        Notice:
            running the DAG locally assumes local code execution (i.e. it will not clone & apply a git diff)
            Pipeline steps are executed as functions (no Task will be created), for ease of debugging
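
        For example, a minimal sketch, where ``my_pipeline`` stands for a hypothetical function decorated
        with ``PipelineDecorator.pipeline``:

        .. code-block:: py

            # must be called before invoking the pipeline function
            PipelineDecorator.debug_pipeline()
            my_pipeline()
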
        """
        cls._debug_execute_step_process = True
        cls._debug_execute_step_function = True

    @classmethod
    def get_current_pipeline(cls):
        # type: () -> "PipelineDecorator"
        """
        Return the currently running pipeline instance
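
        For example, a minimal sketch of accessing the controller from inside the pipeline logic
        (the surrounding pipeline function, decorated with ``PipelineDecorator.pipeline``, is assumed):

        .. code-block:: py

            # inside the pipeline logic function
            pipeline = PipelineDecorator.get_current_pipeline()
            print(pipeline.get_parameters())
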
        """
        return cls._singleton

    @classmethod
    def wait_for_multi_pipelines(cls):
        # type: () -> List[Any]
        """
        Wait until all background multi-pipeline executions are completed.
        Returns all the pipeline results in call order (first pipeline call at index 0)
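
        For example, a minimal sketch, where ``my_pipeline`` stands for a hypothetical function decorated
        with ``PipelineDecorator.pipeline(multi_instance_support='parallel', ...)``:

        .. code-block:: py

            # launch two pipeline instances in the background
            my_pipeline(parameter=1)
            my_pipeline(parameter=2)
            # block until both pipelines finish, and collect their return values
            results = PipelineDecorator.wait_for_multi_pipelines()
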
        :return: List of return values from the executed pipelines, based on call order.
        """
        return cls._wait_for_multi_pipelines()

    @classmethod
    def _component_launch(cls, node_name, node, kwargs_artifacts, kwargs, tid):
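        """
        Launch a pipeline component as a new step of the currently running pipeline:
        map artifact references and plain kwargs onto the node parameters, infer parent steps,
        verify and launch the node, and (for eagerly executed pipelines) report the new step
        back to the parent pipeline task.
        """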
        _node_name = node_name
        _node = node
        # update artifacts kwargs
        for k, v in kwargs_artifacts.items():
            if k in kwargs:
                kwargs.pop(k, None)
            _node.parameters.pop("{}/{}".format(CreateFromFunction.kwargs_section, k), None)
            _node.parameters["{}/{}".format(CreateFromFunction.input_artifact_section, k)] = v
            if v and '.' in str(v):
                parent_id, _ = str(v).split('.', 1)
                # find parent and push it into the _node.parents
                for n, node in sorted(list(cls._singleton._nodes.items()), reverse=True):
                    if n != _node.name and node.executed and node.executed == parent_id:
                        if n not in _node.parents:
                            _node.parents.append(n)
                        break
        if kwargs:
            leaves = cls._singleton._find_executed_node_leaves()
            _node.parents = (_node.parents or []) + [
                x for x in cls._evaluated_return_values.get(tid, []) if x in leaves
            ]
        for k, v in kwargs.items():
            if v is None or isinstance(v, (float, int, bool, six.string_types)):
                _node.parameters["{}/{}".format(CreateFromFunction.kwargs_section, k)] = v
            else:
                # we need to create an artifact
                artifact_name = 'result_{}_{}'.format(re.sub(r'\W+', '', _node.name), k)
                cls._singleton._upload_pipeline_artifact(artifact_name=artifact_name, artifact_object=v)
                _node.parameters["{}/{}".format(CreateFromFunction.input_artifact_section, k)] = \
                    "{}.{}".format(cls._singleton._task.id, artifact_name)

        # verify the new step
        cls._singleton._verify_node(_node)
        # launch the new step
        cls._singleton._launch_node(_node)
        # if this pipeline was generated eagerly, we need to update the new eager step on the parent pipeline
        if PipelineDecorator._eager_execution_instance and _node.job:
            # check if we need to add the pipeline tag on the new node
            pipeline_tags = [t for t in Task.current_task().get_tags() or []
                             if str(t).startswith(cls._node_tag_prefix)]
            if pipeline_tags and _node.job and _node.job.task:
                pipeline_tags = list(set((_node.job.task.get_tags() or []) + pipeline_tags))
                _node.job.task.set_tags(pipeline_tags)
            # force parent task as pipeline
            _node.job.task._edit(parent=Task.current_task().parent)
            # store the new generated node, so we can later serialize it
            pipeline_dag = cls._singleton._serialize()
            # check if node is cached
            if _node.job.is_cached_task():
                pipeline_dag[_node_name]['is_cached'] = True
            # store the entire definition on the parent pipeline
            from clearml.backend_api.services import tasks
            artifact = tasks.Artifact(
                key='{}:{}:{}'.format(cls._eager_step_artifact, Task.current_task().id, _node.job.task_id()),
                type="json",
                mode='output',
                type_data=tasks.ArtifactTypeData(
                    preview=json.dumps({_node_name: pipeline_dag[_node_name]}),
                    content_type='application/pipeline')
            )
            req = tasks.AddOrUpdateArtifactsRequest(
                task=Task.current_task().parent, artifacts=[artifact], force=True)
            res = Task.current_task().send(req, raise_on_errors=False)
            if not res or not res.response or not res.response.updated:
                pass

        # update pipeline execution graph
        cls._singleton.update_execution_plot()

    @classmethod
    def _multi_pipeline_wrapper(
            cls,
            func=None,  # type: Callable
            parallel=False,  # type: bool
    ):
        # type: (...) -> Callable
        """
        Add support for multiple pipeline function calls,
        enabling execution of multiple instances of the same pipeline from a single script.

        .. code-block:: py

            @PipelineDecorator.pipeline(
                multi_instance_support=True, name="custom pipeline logic", project="examples", version="1.0")
            def pipeline(parameter=1):
                print(f"running with parameter={parameter}")

            # run both pipelines (if multi_instance_support=='parallel', run the pipelines in parallel)
            pipeline(parameter=1)
            pipeline(parameter=2)

        :param parallel: If True, the pipeline runs in the background, which implies that calling
            the pipeline twice means running the pipelines in parallel.
            Default: False, the pipeline function returns when the pipeline completes
        :return: Return the wrapped pipeline function.
            Notice the return value of the wrapped pipeline function:
            if parallel==True, the return value will be None, otherwise expect the return value of the
            wrapped pipeline function
        """

        def internal_decorator(*args, **kwargs):
            cls._multi_pipeline_call_counter += 1

            # if this is a debug run just call the function (no parallelization).
            if cls._debug_execute_step_function:
                return func(*args, **kwargs)

            def sanitized_env(a_queue, *a_args, **a_kwargs):
                os.environ.pop('CLEARML_PROC_MASTER_ID', None)
                os.environ.pop('TRAINS_PROC_MASTER_ID', None)
                os.environ.pop('CLEARML_TASK_ID', None)
                os.environ.pop('TRAINS_TASK_ID', None)
                if Task.current_task():
                    # noinspection PyProtectedMember
                    Task.current_task()._reset_current_task_obj()
                a_result = func(*a_args, **a_kwargs)
                if a_queue is not None:
                    task_id = Task.current_task().id if Task.current_task() else None
                    a_queue.put((task_id, a_result))
                return a_result

            queue = Queue()

            p = Process(target=sanitized_env, args=(queue, ) + args, kwargs=kwargs)
            # make sure we wait for the subprocess.
            p.daemon = False
            p.start()
            if parallel and Task.running_locally():
                cls._multi_pipeline_instances.append((p, queue))
                return
            else:
                p.join()
                # noinspection PyBroadException
                try:
                    pipeline_task, result = queue.get_nowait()
                except Exception:
                    return None

                # we should update the master Task plot:
                if pipeline_task and Task.current_task():
                    cls._add_pipeline_plots(pipeline_task)

                return result

        if parallel and not cls._atexit_registered:
            cls._atexit_registered = True
            atexit.register(cls._wait_for_multi_pipelines)

        return internal_decorator

    @classmethod
    def _wait_for_multi_pipelines(cls):
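        """
        Join all background pipeline processes, collect their results (in call order),
        and copy their execution plots onto the current (master) Task.
        """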
        results = []
        if not cls._multi_pipeline_instances:
            return results
        print('Waiting for background pipelines to finish')
        for p, queue in cls._multi_pipeline_instances:
            try:
                p.join()
            except:  # noqa
                pass
            # noinspection PyBroadException
            try:
                pipeline_task, result = queue.get_nowait()
                results.append(result)
                cls._add_pipeline_plots(pipeline_task)
            except Exception:
                pass
        cls._multi_pipeline_instances = []
        return results

    @classmethod
    def _add_pipeline_plots(cls, pipeline_task_id):
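        """
        Fetch the "Execution Flow" / "Execution Details" plots of the given pipeline Task and
        re-report them on the current (master) Task, suffixing the series name with the pipeline Task ID.
        """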
        if not Task.current_task():
            return
        from clearml.backend_api.services import events
        res = Task.current_task().send(
            events.GetTaskPlotsRequest(task=pipeline_task_id, iters=1),
            raise_on_errors=False,
            ignore_errors=True,
        )
        execution_flow = None
        execution_details = None
        for p in res.response.plots:
            try:
                if p['metric'] == cls._report_plot_execution_flow['title'] and \
                        p['variant'] == cls._report_plot_execution_flow['series']:
                    execution_flow = json.loads(p['plot_str'])

                elif p['metric'] == cls._report_plot_execution_details['title'] and \
                        p['variant'] == cls._report_plot_execution_details['series']:
                    execution_details = json.loads(p['plot_str'])
                    execution_details['layout']['name'] += ' - ' + str(pipeline_task_id)
            except Exception as ex:
                getLogger('clearml.automation.controller').warning(
                    'Multi-pipeline plot update failed: {}'.format(ex))

        if execution_flow:
            Task.current_task().get_logger().report_plotly(
                title=cls._report_plot_execution_flow['title'],
                series='{} - {}'.format(cls._report_plot_execution_flow['series'], pipeline_task_id),
                iteration=0, figure=execution_flow)
        if execution_details:
            Task.current_task().get_logger().report_plotly(
                title=cls._report_plot_execution_details['title'],
                series='{} - {}'.format(cls._report_plot_execution_details['series'], pipeline_task_id),
                iteration=0, figure=execution_details)