mirror of
https://github.com/clearml/clearml
synced 2025-04-18 21:34:41 +00:00
Add support for pipelines with spot instances
This commit is contained in:
parent
a42c4b0bd3
commit
157df5dad3
@ -11,7 +11,7 @@ from logging import getLogger
|
|||||||
from multiprocessing import Process, Queue
|
from multiprocessing import Process, Queue
|
||||||
from multiprocessing.pool import ThreadPool
|
from multiprocessing.pool import ThreadPool
|
||||||
from threading import Thread, Event, RLock, current_thread
|
from threading import Thread, Event, RLock, current_thread
|
||||||
from time import time
|
from time import time, sleep
|
||||||
from typing import Sequence, Optional, Mapping, Callable, Any, List, Dict, Union, Tuple
|
from typing import Sequence, Optional, Mapping, Callable, Any, List, Dict, Union, Tuple
|
||||||
|
|
||||||
from attr import attrib, attrs
|
from attr import attrib, attrs
|
||||||
@ -31,6 +31,11 @@ from ..task import Task
|
|||||||
from ..utilities.process.mp import leave_process
|
from ..utilities.process.mp import leave_process
|
||||||
from ..utilities.proxy_object import LazyEvalWrapper, flatten_dictionary, walk_nested_dict_tuple_list
|
from ..utilities.proxy_object import LazyEvalWrapper, flatten_dictionary, walk_nested_dict_tuple_list
|
||||||
|
|
||||||
|
try:
|
||||||
|
import boto3
|
||||||
|
except ImportError:
|
||||||
|
boto3 = None
|
||||||
|
|
||||||
|
|
||||||
class PipelineController(object):
|
class PipelineController(object):
|
||||||
"""
|
"""
|
||||||
@ -57,6 +62,7 @@ class PipelineController(object):
|
|||||||
_monitor_node_interval = 5.*60
|
_monitor_node_interval = 5.*60
|
||||||
_report_plot_execution_flow = dict(title='Pipeline', series='Execution Flow')
|
_report_plot_execution_flow = dict(title='Pipeline', series='Execution Flow')
|
||||||
_report_plot_execution_details = dict(title='Pipeline Details', series='Execution Details')
|
_report_plot_execution_details = dict(title='Pipeline Details', series='Execution Details')
|
||||||
|
_relaunch_check_cache = {}
|
||||||
_evaluated_return_values = {} # TID: pipeline_name
|
_evaluated_return_values = {} # TID: pipeline_name
|
||||||
_add_to_evaluated_return_values = {} # TID: bool
|
_add_to_evaluated_return_values = {} # TID: bool
|
||||||
|
|
||||||
@ -88,6 +94,8 @@ class PipelineController(object):
|
|||||||
monitor_metrics = attrib(type=list, default=None) # List of metric title/series to monitor
|
monitor_metrics = attrib(type=list, default=None) # List of metric title/series to monitor
|
||||||
monitor_artifacts = attrib(type=list, default=None) # List of artifact names to monitor
|
monitor_artifacts = attrib(type=list, default=None) # List of artifact names to monitor
|
||||||
monitor_models = attrib(type=list, default=None) # List of models to monitor
|
monitor_models = attrib(type=list, default=None) # List of models to monitor
|
||||||
|
# If True, relaunch the node if the instance it ran on crashed
|
||||||
|
relaunch_on_instance_failure = attrib(type=bool, default=False)
|
||||||
|
|
||||||
def __attrs_post_init__(self):
|
def __attrs_post_init__(self):
|
||||||
if self.parents is None:
|
if self.parents is None:
|
||||||
@ -132,6 +140,7 @@ class PipelineController(object):
|
|||||||
auto_version_bump=True, # type: bool
|
auto_version_bump=True, # type: bool
|
||||||
abort_on_failure=False, # type: bool
|
abort_on_failure=False, # type: bool
|
||||||
add_run_number=True, # type: bool
|
add_run_number=True, # type: bool
|
||||||
|
relaunch_on_instance_failure=False, # type: bool
|
||||||
):
|
):
|
||||||
# type: (...) -> None
|
# type: (...) -> None
|
||||||
"""
|
"""
|
||||||
@ -157,6 +166,9 @@ class PipelineController(object):
|
|||||||
and mark the pipeline as failed.
|
and mark the pipeline as failed.
|
||||||
:param add_run_number: If True (default), add the run number of the pipeline to the pipeline name.
|
:param add_run_number: If True (default), add the run number of the pipeline to the pipeline name.
|
||||||
Example, the second time we launch the pipeline "best pipeline", we rename it to "best pipeline #2"
|
Example, the second time we launch the pipeline "best pipeline", we rename it to "best pipeline #2"
|
||||||
|
:param relaunch_on_instance_failure: If True, check if the machine a pipeline step ran on
|
||||||
|
was terminated. In case it was, the step will be relaunched. As of now, only AWS instances are supported.
|
||||||
|
Default: False.
|
||||||
"""
|
"""
|
||||||
self._nodes = {}
|
self._nodes = {}
|
||||||
self._running_nodes = []
|
self._running_nodes = []
|
||||||
@ -189,6 +201,7 @@ class PipelineController(object):
|
|||||||
self._mock_execution = False # used for nested pipelines (eager execution)
|
self._mock_execution = False # used for nested pipelines (eager execution)
|
||||||
self._pipeline_as_sub_project = bool(Session.check_min_api_server_version("2.17"))
|
self._pipeline_as_sub_project = bool(Session.check_min_api_server_version("2.17"))
|
||||||
self._last_progress_update_time = 0
|
self._last_progress_update_time = 0
|
||||||
|
self._relaunch_on_instance_failure = relaunch_on_instance_failure
|
||||||
if not self._task:
|
if not self._task:
|
||||||
task_name = name or project or '{}'.format(datetime.now())
|
task_name = name or project or '{}'.format(datetime.now())
|
||||||
if self._pipeline_as_sub_project:
|
if self._pipeline_as_sub_project:
|
||||||
@ -493,6 +506,7 @@ class PipelineController(object):
|
|||||||
pre_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa
|
pre_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node, dict], bool]] # noqa
|
||||||
post_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa
|
post_execute_callback=None, # type: Optional[Callable[[PipelineController, PipelineController.Node], None]] # noqa
|
||||||
cache_executed_step=False, # type: bool
|
cache_executed_step=False, # type: bool
|
||||||
|
relaunch_on_instance_failure=None # type: bool
|
||||||
):
|
):
|
||||||
# type: (...) -> bool
|
# type: (...) -> bool
|
||||||
"""
|
"""
|
||||||
@ -536,6 +550,7 @@ class PipelineController(object):
|
|||||||
If not provided, no results will be stored as artifacts.
|
If not provided, no results will be stored as artifacts.
|
||||||
:param project_name: Set the project name for the task. Required if base_task_id is None.
|
:param project_name: Set the project name for the task. Required if base_task_id is None.
|
||||||
:param task_name: Set the name of the remote task, if not provided use `name` argument.
|
:param task_name: Set the name of the remote task, if not provided use `name` argument.
|
||||||
|
self.task.reload()
|
||||||
:param task_type: Optional, The task type to be created. Supported values: 'training', 'testing', 'inference',
|
:param task_type: Optional, The task type to be created. Supported values: 'training', 'testing', 'inference',
|
||||||
'data_processing', 'application', 'monitor', 'controller', 'optimizer', 'service', 'qc', 'custom'
|
'data_processing', 'application', 'monitor', 'controller', 'optimizer', 'service', 'qc', 'custom'
|
||||||
:param auto_connect_frameworks: Control the frameworks auto connect, see `Task.init` auto_connect_frameworks
|
:param auto_connect_frameworks: Control the frameworks auto connect, see `Task.init` auto_connect_frameworks
|
||||||
@ -626,6 +641,11 @@ class PipelineController(object):
|
|||||||
Default: False, a new cloned copy of base_task is always used.
|
Default: False, a new cloned copy of base_task is always used.
|
||||||
Notice: If the git repo reference does not have a specific commit ID, the Task will never be used.
|
Notice: If the git repo reference does not have a specific commit ID, the Task will never be used.
|
||||||
|
|
||||||
|
:param relaunch_on_instance_failure: If True, check if the machine this step ran on
|
||||||
|
was terminated. In case it was, the step will be relaunched. As of now, only AWS instances are supported.
|
||||||
|
By default, the value of this flag will be the one found in the PipelineController instance
|
||||||
|
this function is called from, which is itself False by default.
|
||||||
|
|
||||||
:return: True if successful
|
:return: True if successful
|
||||||
"""
|
"""
|
||||||
# always store callback functions (even when running remotely)
|
# always store callback functions (even when running remotely)
|
||||||
@ -704,6 +724,8 @@ class PipelineController(object):
|
|||||||
a_task.update_task(task_definition)
|
a_task.update_task(task_definition)
|
||||||
return a_task
|
return a_task
|
||||||
|
|
||||||
|
if relaunch_on_instance_failure is None:
|
||||||
|
relaunch_on_instance_failure = self._relaunch_on_instance_failure
|
||||||
self._nodes[name] = self.Node(
|
self._nodes[name] = self.Node(
|
||||||
name=name, base_task_id=None, parents=parents or [],
|
name=name, base_task_id=None, parents=parents or [],
|
||||||
queue=execution_queue, timeout=time_limit,
|
queue=execution_queue, timeout=time_limit,
|
||||||
@ -717,6 +739,7 @@ class PipelineController(object):
|
|||||||
monitor_metrics=monitor_metrics,
|
monitor_metrics=monitor_metrics,
|
||||||
monitor_models=monitor_models,
|
monitor_models=monitor_models,
|
||||||
job_code_section=job_code_section,
|
job_code_section=job_code_section,
|
||||||
|
relaunch_on_instance_failure=relaunch_on_instance_failure
|
||||||
)
|
)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
@ -1975,11 +1998,20 @@ class PipelineController(object):
|
|||||||
if not node.job:
|
if not node.job:
|
||||||
continue
|
continue
|
||||||
if node.job.is_stopped():
|
if node.job.is_stopped():
|
||||||
completed_jobs.append(j)
|
|
||||||
node_failed = node.job.is_failed()
|
node_failed = node.job.is_failed()
|
||||||
node.executed = node.job.task_id() if not node_failed else False
|
node.executed = node.job.task_id() if not node_failed else False
|
||||||
if j in launched_nodes:
|
if j in launched_nodes:
|
||||||
launched_nodes.remove(j)
|
launched_nodes.remove(j)
|
||||||
|
if (node_failed or node.job.is_aborted()) and self._should_relaunch_node(node):
|
||||||
|
# marking the job as None lets us know that this node has to be requeued
|
||||||
|
self._task.get_logger().report_text(
|
||||||
|
"Relaunching step {} on instance termination".format(node.name)
|
||||||
|
)
|
||||||
|
node.job = None
|
||||||
|
node.executed = None
|
||||||
|
self._running_nodes.remove(j)
|
||||||
|
continue
|
||||||
|
completed_jobs.append(j)
|
||||||
# check if we need to stop all running steps
|
# check if we need to stop all running steps
|
||||||
if node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail:
|
if node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail:
|
||||||
nodes_failed_stop_pipeline.append(node.name)
|
nodes_failed_stop_pipeline.append(node.name)
|
||||||
@ -2502,6 +2534,57 @@ class PipelineController(object):
|
|||||||
|
|
||||||
return '<a href="{}"> {} </a>'.format(task_link_template.format(project=project_id, task=task_id), task_id)
|
return '<a href="{}"> {} </a>'.format(task_link_template.format(project=project_id, task=task_id), task_id)
|
||||||
|
|
||||||
|
def _should_relaunch_node(self, node):
|
||||||
|
# type: ('PipelineController.Node') -> bool
|
||||||
|
"""
|
||||||
|
Check if a node should be relaunched. At the moment, this function returns True only if the AWS instance
|
||||||
|
the node ran on terminated during the run
|
||||||
|
|
||||||
|
:param node: The node to check if it should be relaunched
|
||||||
|
|
||||||
|
:return: True if the node should be relaunched and False otherwise
|
||||||
|
"""
|
||||||
|
if not boto3 or not self._relaunch_on_instance_failure:
|
||||||
|
return False
|
||||||
|
worker = node.job.worker().split(":")[-1]
|
||||||
|
if (worker, node.name) in self._relaunch_check_cache:
|
||||||
|
return self._relaunch_check_cache[(worker, node.name)]
|
||||||
|
# get credentials from all autoscalers (shouldn't be too many)
|
||||||
|
aws_scaler_tasks = Task.get_tasks(
|
||||||
|
tags=["AWS"],
|
||||||
|
task_filter={
|
||||||
|
"status": ["in_progress"],
|
||||||
|
"type": ["application"],
|
||||||
|
"search_hidden": True,
|
||||||
|
"_allow_extra_fields_": True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
used_creds = []
|
||||||
|
for task in aws_scaler_tasks:
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
parameters = task.get_parameters()
|
||||||
|
cred = {
|
||||||
|
"aws_access_key_id": parameters["General/cloud_credentials_key"],
|
||||||
|
"aws_secret_access_key": parameters["General/cloud_credentials_secret"],
|
||||||
|
"region_name": parameters["General/cloud_credentials_region"],
|
||||||
|
}
|
||||||
|
if cred in used_creds:
|
||||||
|
continue
|
||||||
|
used_creds.append(cred)
|
||||||
|
ec2_resource = boto3.resource("ec2", **cred)
|
||||||
|
# check if AWS instance is still running
|
||||||
|
# we would like to requeue if the instance somehow died
|
||||||
|
# (like when a spot instance is terminated)
|
||||||
|
instance = next(iter(ec2_resource.instances.filter(InstanceIds=[worker])))
|
||||||
|
if instance.state["Name"] != "running":
|
||||||
|
self._relaunch_check_cache[(worker, node.name)] = True
|
||||||
|
return True
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
self._relaunch_check_cache[(worker, node.name)] = False
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
class PipelineDecorator(PipelineController):
|
class PipelineDecorator(PipelineController):
|
||||||
_added_decorator = [] # type: List[dict]
|
_added_decorator = [] # type: List[dict]
|
||||||
@ -2526,6 +2609,7 @@ class PipelineDecorator(PipelineController):
|
|||||||
target_project=None, # type: Optional[str]
|
target_project=None, # type: Optional[str]
|
||||||
abort_on_failure=False, # type: bool
|
abort_on_failure=False, # type: bool
|
||||||
add_run_number=True, # type: bool
|
add_run_number=True, # type: bool
|
||||||
|
relaunch_on_instance_failure=False # type: bool
|
||||||
):
|
):
|
||||||
# type: (...) -> ()
|
# type: (...) -> ()
|
||||||
"""
|
"""
|
||||||
@ -2547,6 +2631,9 @@ class PipelineDecorator(PipelineController):
|
|||||||
and mark the pipeline as failed.
|
and mark the pipeline as failed.
|
||||||
:param add_run_number: If True (default), add the run number of the pipeline to the pipeline name.
|
:param add_run_number: If True (default), add the run number of the pipeline to the pipeline name.
|
||||||
Example, the second time we launch the pipeline "best pipeline", we rename it to "best pipeline #2"
|
Example, the second time we launch the pipeline "best pipeline", we rename it to "best pipeline #2"
|
||||||
|
:param relaunch_on_instance_failure: If True, check if the machine a pipeline step ran on
|
||||||
|
was terminated. In case it was, the step will be relaunched. As of now, only AWS instances are supported.
|
||||||
|
Default: False
|
||||||
"""
|
"""
|
||||||
super(PipelineDecorator, self).__init__(
|
super(PipelineDecorator, self).__init__(
|
||||||
name=name,
|
name=name,
|
||||||
@ -2557,6 +2644,7 @@ class PipelineDecorator(PipelineController):
|
|||||||
target_project=target_project,
|
target_project=target_project,
|
||||||
abort_on_failure=abort_on_failure,
|
abort_on_failure=abort_on_failure,
|
||||||
add_run_number=add_run_number,
|
add_run_number=add_run_number,
|
||||||
|
relaunch_on_instance_failure=relaunch_on_instance_failure
|
||||||
)
|
)
|
||||||
|
|
||||||
# if we are in eager execution, make sure parent class knows it
|
# if we are in eager execution, make sure parent class knows it
|
||||||
@ -2610,13 +2698,15 @@ class PipelineDecorator(PipelineController):
|
|||||||
if not node.job:
|
if not node.job:
|
||||||
continue
|
continue
|
||||||
if node.job.is_stopped():
|
if node.job.is_stopped():
|
||||||
completed_jobs.append(j)
|
|
||||||
node_failed = node.job.is_failed()
|
node_failed = node.job.is_failed()
|
||||||
|
if (node_failed or node.job.is_aborted()) and self._should_relaunch_node(node):
|
||||||
|
continue
|
||||||
|
completed_jobs.append(j)
|
||||||
node.executed = node.job.task_id() if not node_failed else False
|
node.executed = node.job.task_id() if not node_failed else False
|
||||||
if j in launched_nodes:
|
if j in launched_nodes:
|
||||||
launched_nodes.remove(j)
|
launched_nodes.remove(j)
|
||||||
# check if we need to stop all running steps
|
# check if we need to stop all running steps
|
||||||
if node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail:
|
elif node_failed and self._abort_running_steps_on_failure and not node.continue_on_fail:
|
||||||
nodes_failed_stop_pipeline.append(node.name)
|
nodes_failed_stop_pipeline.append(node.name)
|
||||||
elif node.timeout:
|
elif node.timeout:
|
||||||
started = node.job.task.data.started
|
started = node.job.task.data.started
|
||||||
@ -2864,7 +2954,8 @@ class PipelineDecorator(PipelineController):
|
|||||||
helper_functions=None, # type: Optional[Sequence[Callable]]
|
helper_functions=None, # type: Optional[Sequence[Callable]]
|
||||||
monitor_metrics=None, # type: Optional[List[Union[Tuple[str, str], Tuple[(str, str), (str, str)]]]]
|
monitor_metrics=None, # type: Optional[List[Union[Tuple[str, str], Tuple[(str, str), (str, str)]]]]
|
||||||
monitor_artifacts=None, # type: Optional[List[Union[str, Tuple[str, str]]]]
|
monitor_artifacts=None, # type: Optional[List[Union[str, Tuple[str, str]]]]
|
||||||
monitor_models=None # type: Optional[List[Union[str, Tuple[str, str]]]]
|
monitor_models=None, # type: Optional[List[Union[str, Tuple[str, str]]]]
|
||||||
|
relaunch_on_instance_failure=None # type: bool
|
||||||
):
|
):
|
||||||
# type: (...) -> Callable
|
# type: (...) -> Callable
|
||||||
"""
|
"""
|
||||||
@ -2938,6 +3029,10 @@ class PipelineDecorator(PipelineController):
|
|||||||
where the first string is the model name as it appears on the component Task,
|
where the first string is the model name as it appears on the component Task,
|
||||||
and the second is the target model name to put on the Pipeline Task
|
and the second is the target model name to put on the Pipeline Task
|
||||||
Example: [('model_weights', 'final_model_weights'), ]
|
Example: [('model_weights', 'final_model_weights'), ]
|
||||||
|
:param relaunch_on_instance_failure: If True, check if the machine this step ran on
|
||||||
|
was terminated. In case it was, the step will be relaunched. As of now, only AWS instances are supported.
|
||||||
|
By default, the value of this flag will be the one passed to the PipelineDecorator.pipeline decorator,
|
||||||
|
which is itself False by default.
|
||||||
|
|
||||||
:return: function wrapper
|
:return: function wrapper
|
||||||
"""
|
"""
|
||||||
@ -2977,6 +3072,7 @@ class PipelineDecorator(PipelineController):
|
|||||||
monitor_metrics=monitor_metrics,
|
monitor_metrics=monitor_metrics,
|
||||||
monitor_models=monitor_models,
|
monitor_models=monitor_models,
|
||||||
monitor_artifacts=monitor_artifacts,
|
monitor_artifacts=monitor_artifacts,
|
||||||
|
relaunch_on_instance_failure=relaunch_on_instance_failure
|
||||||
)
|
)
|
||||||
|
|
||||||
if cls._singleton:
|
if cls._singleton:
|
||||||
@ -2994,7 +3090,7 @@ class PipelineDecorator(PipelineController):
|
|||||||
|
|
||||||
func_return = []
|
func_return = []
|
||||||
|
|
||||||
def result_wrapper(a_func_return, return_index):
|
def result_wrapper_callback(a_func_return, return_index):
|
||||||
if not a_func_return:
|
if not a_func_return:
|
||||||
a_func_return.append(func(*args, **kwargs))
|
a_func_return.append(func(*args, **kwargs))
|
||||||
a_func_return = a_func_return[0]
|
a_func_return = a_func_return[0]
|
||||||
@ -3002,14 +3098,14 @@ class PipelineDecorator(PipelineController):
|
|||||||
|
|
||||||
if len(function_return) == 1:
|
if len(function_return) == 1:
|
||||||
ret_val = LazyEvalWrapper(
|
ret_val = LazyEvalWrapper(
|
||||||
callback=functools.partial(result_wrapper, func_return, None),
|
callback=functools.partial(result_wrapper_callback, func_return, None),
|
||||||
remote_reference=functools.partial(result_wrapper, func_return, None))
|
remote_reference=functools.partial(result_wrapper_callback, func_return, None))
|
||||||
cls._ref_lazy_loader_id_to_node_name[id(ret_val)] = _name
|
cls._ref_lazy_loader_id_to_node_name[id(ret_val)] = _name
|
||||||
return ret_val
|
return ret_val
|
||||||
else:
|
else:
|
||||||
return_w = [LazyEvalWrapper(
|
return_w = [LazyEvalWrapper(
|
||||||
callback=functools.partial(result_wrapper, func_return, i),
|
callback=functools.partial(result_wrapper_callback, func_return, i),
|
||||||
remote_reference=functools.partial(result_wrapper, func_return, i))
|
remote_reference=functools.partial(result_wrapper_callback, func_return, i))
|
||||||
for i, _ in enumerate(function_return)]
|
for i, _ in enumerate(function_return)]
|
||||||
for i in return_w:
|
for i in return_w:
|
||||||
cls._ref_lazy_loader_id_to_node_name[id(i)] = _name
|
cls._ref_lazy_loader_id_to_node_name[id(i)] = _name
|
||||||
@ -3087,6 +3183,10 @@ class PipelineDecorator(PipelineController):
|
|||||||
# get node and mark it as launched
|
# get node and mark it as launched
|
||||||
cls._singleton._launched_step_names.add(_node_name)
|
cls._singleton._launched_step_names.add(_node_name)
|
||||||
_node = cls._singleton._nodes[_node_name]
|
_node = cls._singleton._nodes[_node_name]
|
||||||
|
if relaunch_on_instance_failure is not None:
|
||||||
|
_node.relaunch_on_instance_failure = relaunch_on_instance_failure
|
||||||
|
else:
|
||||||
|
_node.relaunch_on_instance_failure = cls._singleton._relaunch_on_instance_failure
|
||||||
|
|
||||||
# The actual launch is a bit slow, we run it in the background
|
# The actual launch is a bit slow, we run it in the background
|
||||||
launch_thread = Thread(
|
launch_thread = Thread(
|
||||||
@ -3121,11 +3221,34 @@ class PipelineDecorator(PipelineController):
|
|||||||
launch_thread.join()
|
launch_thread.join()
|
||||||
except: # noqa
|
except: # noqa
|
||||||
pass
|
pass
|
||||||
# wait until job is completed
|
while True:
|
||||||
_node.job.wait(pool_period=0.2)
|
# wait until job is completed
|
||||||
if _node.job.is_failed():
|
if _node.job:
|
||||||
raise ValueError(
|
_node.job.wait(pool_period=0.2)
|
||||||
'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id()))
|
else:
|
||||||
|
sleep(0.2)
|
||||||
|
continue
|
||||||
|
if _node.job.is_failed() or _node.job.is_aborted():
|
||||||
|
if cls._singleton._should_relaunch_node(_node):
|
||||||
|
cls._singleton._task.get_logger().report_text(
|
||||||
|
"Relaunching step {} on instance termination".format(_node.name)
|
||||||
|
)
|
||||||
|
_node.job = None
|
||||||
|
_node.executed = None
|
||||||
|
cls._component_launch(
|
||||||
|
_node.name,
|
||||||
|
_node,
|
||||||
|
kwargs_artifacts,
|
||||||
|
kwargs,
|
||||||
|
current_thread().ident
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id())
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
_node.executed = _node.job.task_id()
|
_node.executed = _node.job.task_id()
|
||||||
|
|
||||||
@ -3170,7 +3293,9 @@ class PipelineDecorator(PipelineController):
|
|||||||
pipeline_execution_queue='services', # type: Optional[str]
|
pipeline_execution_queue='services', # type: Optional[str]
|
||||||
multi_instance_support=False, # type: bool
|
multi_instance_support=False, # type: bool
|
||||||
add_run_number=True, # type: bool
|
add_run_number=True, # type: bool
|
||||||
args_map=None # type: dict[str, List[str]]
|
args_map=None, # type: dict[str, List[str]]
|
||||||
|
relaunch_on_instance_failure=False, # type: bool
|
||||||
|
start_controller_locally=False # type: bool
|
||||||
):
|
):
|
||||||
# type: (...) -> Callable
|
# type: (...) -> Callable
|
||||||
"""
|
"""
|
||||||
@ -3217,6 +3342,13 @@ class PipelineDecorator(PipelineController):
|
|||||||
- paramB: sectionB/paramB
|
- paramB: sectionB/paramB
|
||||||
- paramC: sectionB/paramC
|
- paramC: sectionB/paramC
|
||||||
- paramD: Args/paramD
|
- paramD: Args/paramD
|
||||||
|
|
||||||
|
:param relaunch_on_instance_failure: If True, check if the machine a pipeline step ran on
|
||||||
|
was terminated. In case it was, the step will be relaunched. As of now, only AWS instances are supported.
|
||||||
|
Default: False
|
||||||
|
:param start_controller_locally: If True, start the controller on the local machine. The steps will run
|
||||||
|
remotely if `PipelineDecorator.run_locally` or `PipelineDecorator.debug_pipeline` are not called.
|
||||||
|
Default: False
|
||||||
"""
|
"""
|
||||||
def decorator_wrap(func):
|
def decorator_wrap(func):
|
||||||
|
|
||||||
@ -3253,6 +3385,7 @@ class PipelineDecorator(PipelineController):
|
|||||||
target_project=target_project,
|
target_project=target_project,
|
||||||
abort_on_failure=abort_on_failure,
|
abort_on_failure=abort_on_failure,
|
||||||
add_run_number=add_run_number,
|
add_run_number=add_run_number,
|
||||||
|
relaunch_on_instance_failure=relaunch_on_instance_failure
|
||||||
)
|
)
|
||||||
ret_val = func(**pipeline_kwargs)
|
ret_val = func(**pipeline_kwargs)
|
||||||
LazyEvalWrapper.trigger_all_remote_references()
|
LazyEvalWrapper.trigger_all_remote_references()
|
||||||
@ -3294,6 +3427,7 @@ class PipelineDecorator(PipelineController):
|
|||||||
target_project=target_project,
|
target_project=target_project,
|
||||||
abort_on_failure=abort_on_failure,
|
abort_on_failure=abort_on_failure,
|
||||||
add_run_number=add_run_number,
|
add_run_number=add_run_number,
|
||||||
|
relaunch_on_instance_failure=relaunch_on_instance_failure
|
||||||
)
|
)
|
||||||
|
|
||||||
a_pipeline._args_map = args_map or {}
|
a_pipeline._args_map = args_map or {}
|
||||||
@ -3318,7 +3452,6 @@ class PipelineDecorator(PipelineController):
|
|||||||
a_pipeline._task._set_runtime_properties(
|
a_pipeline._task._set_runtime_properties(
|
||||||
dict(multi_pipeline_counter=str(cls._multi_pipeline_call_counter)))
|
dict(multi_pipeline_counter=str(cls._multi_pipeline_call_counter)))
|
||||||
|
|
||||||
# serialize / deserialize state only if we are running locally
|
|
||||||
a_pipeline._start(wait=False)
|
a_pipeline._start(wait=False)
|
||||||
|
|
||||||
# sync arguments back (post deserialization and casting back)
|
# sync arguments back (post deserialization and casting back)
|
||||||
@ -3327,7 +3460,7 @@ class PipelineDecorator(PipelineController):
|
|||||||
pipeline_kwargs[k] = a_pipeline.get_parameters()[k]
|
pipeline_kwargs[k] = a_pipeline.get_parameters()[k]
|
||||||
|
|
||||||
# run the actual pipeline
|
# run the actual pipeline
|
||||||
if not PipelineDecorator._debug_execute_step_process and pipeline_execution_queue:
|
if not start_controller_locally and not PipelineDecorator._debug_execute_step_process and pipeline_execution_queue:
|
||||||
# rerun the pipeline on a remote machine
|
# rerun the pipeline on a remote machine
|
||||||
a_pipeline._task.execute_remotely(queue_name=pipeline_execution_queue)
|
a_pipeline._task.execute_remotely(queue_name=pipeline_execution_queue)
|
||||||
# when we get here it means we are running remotely
|
# when we get here it means we are running remotely
|
||||||
|
@ -202,11 +202,8 @@ class BaseJob(object):
|
|||||||
return self._worker
|
return self._worker
|
||||||
|
|
||||||
if self._worker is None:
|
if self._worker is None:
|
||||||
# the last console outputs will update the worker
|
self.task.reload()
|
||||||
self.get_console_output(number_of_reports=1)
|
self._worker = self.task.last_worker
|
||||||
# if we still do not have it, store empty string
|
|
||||||
if not self._worker:
|
|
||||||
self._worker = ''
|
|
||||||
|
|
||||||
return self._worker
|
return self._worker
|
||||||
|
|
||||||
@ -780,10 +777,6 @@ class _JobStub(object):
|
|||||||
# type: () -> str
|
# type: () -> str
|
||||||
return 'stub'
|
return 'stub'
|
||||||
|
|
||||||
def worker(self):
|
|
||||||
# type: () -> ()
|
|
||||||
return None
|
|
||||||
|
|
||||||
def status(self):
|
def status(self):
|
||||||
# type: () -> str
|
# type: () -> str
|
||||||
return 'in_progress'
|
return 'in_progress'
|
||||||
|
@ -1058,6 +1058,10 @@ class Task(_Task):
|
|||||||
# type: () -> str
|
# type: () -> str
|
||||||
return self.storage_uri
|
return self.storage_uri
|
||||||
|
|
||||||
|
@property
|
||||||
|
def last_worker(self):
|
||||||
|
return self._data.last_worker
|
||||||
|
|
||||||
@output_uri.setter
|
@output_uri.setter
|
||||||
def output_uri(self, value):
|
def output_uri(self, value):
|
||||||
# type: (Union[str, bool]) -> None
|
# type: (Union[str, bool]) -> None
|
||||||
|
Loading…
Reference in New Issue
Block a user