mirror of
https://github.com/clearml/clearml
synced 2025-04-21 23:04:42 +00:00
Fix Pipeline component is treated as aborted if running on instance that was killed externally (e.g. spot instances dying)
This commit is contained in:
parent
1cc87c9a21
commit
5a83aa433d
@ -1997,7 +1997,7 @@ class PipelineController(object):
|
|||||||
node = self._nodes[j]
|
node = self._nodes[j]
|
||||||
if not node.job:
|
if not node.job:
|
||||||
continue
|
continue
|
||||||
if node.job.is_stopped():
|
if node.job.is_stopped(aborted_nonresponsive_as_running=True):
|
||||||
node_failed = node.job.is_failed()
|
node_failed = node.job.is_failed()
|
||||||
node.executed = node.job.task_id() if not node_failed else False
|
node.executed = node.job.task_id() if not node_failed else False
|
||||||
if j in launched_nodes:
|
if j in launched_nodes:
|
||||||
@ -2256,7 +2256,7 @@ class PipelineController(object):
|
|||||||
self._task._edit(models=models)
|
self._task._edit(models=models)
|
||||||
|
|
||||||
# update the state (so that we do not scan the node twice)
|
# update the state (so that we do not scan the node twice)
|
||||||
if node.job.is_stopped():
|
if node.job.is_stopped(aborted_nonresponsive_as_running=True):
|
||||||
self._monitored_nodes[node.name]['completed'] = True
|
self._monitored_nodes[node.name]['completed'] = True
|
||||||
|
|
||||||
def _get_target_project(self, return_project_id=False):
|
def _get_target_project(self, return_project_id=False):
|
||||||
@ -2546,7 +2546,10 @@ class PipelineController(object):
|
|||||||
"""
|
"""
|
||||||
if not boto3 or not self._relaunch_on_instance_failure:
|
if not boto3 or not self._relaunch_on_instance_failure:
|
||||||
return False
|
return False
|
||||||
worker = node.job.worker().split(":")[-1]
|
worker = (node.job.worker() or "").split(":")[-1]
|
||||||
|
if not worker:
|
||||||
|
return False
|
||||||
|
|
||||||
if (worker, node.name) in self._relaunch_check_cache:
|
if (worker, node.name) in self._relaunch_check_cache:
|
||||||
return self._relaunch_check_cache[(worker, node.name)]
|
return self._relaunch_check_cache[(worker, node.name)]
|
||||||
# get credentials from all autoscalers (shouldn't be too many)
|
# get credentials from all autoscalers (shouldn't be too many)
|
||||||
@ -2697,7 +2700,7 @@ class PipelineDecorator(PipelineController):
|
|||||||
node = self._nodes[j]
|
node = self._nodes[j]
|
||||||
if not node.job:
|
if not node.job:
|
||||||
continue
|
continue
|
||||||
if node.job.is_stopped():
|
if node.job.is_stopped(aborted_nonresponsive_as_running=True):
|
||||||
node_failed = node.job.is_failed()
|
node_failed = node.job.is_failed()
|
||||||
if (node_failed or node.job.is_aborted()) and self._should_relaunch_node(node):
|
if (node_failed or node.job.is_aborted()) and self._should_relaunch_node(node):
|
||||||
continue
|
continue
|
||||||
@ -2935,7 +2938,7 @@ class PipelineDecorator(PipelineController):
|
|||||||
def component(
|
def component(
|
||||||
cls,
|
cls,
|
||||||
_func=None, *,
|
_func=None, *,
|
||||||
return_values=('return_object', ), # type: Union[str, List[str]]
|
return_values=('return_object', ), # type: Union[str, Sequence[str]]
|
||||||
name=None, # type: Optional[str]
|
name=None, # type: Optional[str]
|
||||||
cache=False, # type: bool
|
cache=False, # type: bool
|
||||||
packages=None, # type: Optional[Union[str, Sequence[str]]]
|
packages=None, # type: Optional[Union[str, Sequence[str]]]
|
||||||
@ -3206,7 +3209,8 @@ class PipelineDecorator(PipelineController):
|
|||||||
raise ValueError("Job was not created and is also not cached/executed")
|
raise ValueError("Job was not created and is also not cached/executed")
|
||||||
return "{}.{}".format(_node.executed, return_name)
|
return "{}.{}".format(_node.executed, return_name)
|
||||||
|
|
||||||
_node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5)
|
_node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5,
|
||||||
|
aborted_nonresponsive_as_running=True)
|
||||||
if _node.job.is_failed() and not _node.continue_on_fail:
|
if _node.job.is_failed() and not _node.continue_on_fail:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id()))
|
'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id()))
|
||||||
@ -3224,7 +3228,8 @@ class PipelineDecorator(PipelineController):
|
|||||||
while True:
|
while True:
|
||||||
# wait until job is completed
|
# wait until job is completed
|
||||||
if _node.job:
|
if _node.job:
|
||||||
_node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5)
|
_node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5,
|
||||||
|
aborted_nonresponsive_as_running=True)
|
||||||
else:
|
else:
|
||||||
sleep(2)
|
sleep(2)
|
||||||
continue
|
continue
|
||||||
@ -3486,9 +3491,10 @@ class PipelineDecorator(PipelineController):
|
|||||||
while waited:
|
while waited:
|
||||||
waited = False
|
waited = False
|
||||||
for node in list(a_pipeline._nodes.values()):
|
for node in list(a_pipeline._nodes.values()):
|
||||||
if node.executed or not node.job or node.job.is_stopped():
|
if node.executed or not node.job or node.job.is_stopped(aborted_nonresponsive_as_running=True):
|
||||||
continue
|
continue
|
||||||
node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5)
|
node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5,
|
||||||
|
aborted_nonresponsive_as_running=True)
|
||||||
waited = True
|
waited = True
|
||||||
# store the pipeline result of we have any:
|
# store the pipeline result of we have any:
|
||||||
if return_value and pipeline_result is not None:
|
if return_value and pipeline_result is not None:
|
||||||
|
@ -163,22 +163,25 @@ class BaseJob(object):
|
|||||||
self._last_status_ts = time()
|
self._last_status_ts = time()
|
||||||
return self._last_status
|
return self._last_status
|
||||||
|
|
||||||
def wait(self, timeout=None, pool_period=30.):
|
def wait(self, timeout=None, pool_period=30., aborted_nonresponsive_as_running=False):
|
||||||
# type: (Optional[float], float) -> bool
|
# type: (Optional[float], float, bool) -> bool
|
||||||
"""
|
"""
|
||||||
Wait until the task is fully executed (i.e., aborted/completed/failed)
|
Wait until the task is fully executed (i.e., aborted/completed/failed)
|
||||||
|
|
||||||
:param timeout: maximum time (minutes) to wait for Task to finish
|
:param timeout: maximum time (minutes) to wait for Task to finish
|
||||||
:param pool_period: check task status every pool_period seconds
|
:param pool_period: check task status every pool_period seconds
|
||||||
|
:param aborted_nonresponsive_as_running: (default: False) If True, ignore the stopped state if the backend
|
||||||
|
non-responsive watchdog sets this Task to stopped. This scenario could happen if
|
||||||
|
an instance running the job is killed without warning (e.g. spot instances)
|
||||||
:return: True, if Task finished.
|
:return: True, if Task finished.
|
||||||
"""
|
"""
|
||||||
tic = time()
|
tic = time()
|
||||||
while timeout is None or time() - tic < timeout * 60.:
|
while timeout is None or time() - tic < timeout * 60.:
|
||||||
if self.is_stopped():
|
if self.is_stopped(aborted_nonresponsive_as_running=aborted_nonresponsive_as_running):
|
||||||
return True
|
return True
|
||||||
sleep(pool_period)
|
sleep(pool_period)
|
||||||
|
|
||||||
return self.is_stopped()
|
return self.is_stopped(aborted_nonresponsive_as_running=aborted_nonresponsive_as_running)
|
||||||
|
|
||||||
def get_console_output(self, number_of_reports=1):
|
def get_console_output(self, number_of_reports=1):
|
||||||
# type: (int) -> Sequence[str]
|
# type: (int) -> Sequence[str]
|
||||||
@ -192,7 +195,7 @@ class BaseJob(object):
|
|||||||
return self.task.get_reported_console_output(number_of_reports=number_of_reports)
|
return self.task.get_reported_console_output(number_of_reports=number_of_reports)
|
||||||
|
|
||||||
def worker(self):
|
def worker(self):
|
||||||
# type: () -> str
|
# type: () -> Optional[str]
|
||||||
"""
|
"""
|
||||||
Return the current worker id executing this Job. If job is pending, returns None
|
Return the current worker id executing this Job. If job is pending, returns None
|
||||||
|
|
||||||
@ -216,16 +219,35 @@ class BaseJob(object):
|
|||||||
"""
|
"""
|
||||||
return self.status() == Task.TaskStatusEnum.in_progress
|
return self.status() == Task.TaskStatusEnum.in_progress
|
||||||
|
|
||||||
def is_stopped(self):
|
def is_stopped(self, aborted_nonresponsive_as_running=False):
|
||||||
# type: () -> bool
|
# type: (bool) -> bool
|
||||||
"""
|
"""
|
||||||
Return True, if job finished executing (for any reason)
|
Return True, if job finished executing (for any reason)
|
||||||
|
|
||||||
|
:param aborted_nonresponsive_as_running: (default: False) If True, ignore the stopped state if the backend
|
||||||
|
non-responsive watchdog sets this Task to stopped. This scenario could happen if
|
||||||
|
an instance running the job is killed without warning (e.g. spot instances)
|
||||||
|
|
||||||
:return: True the task is currently one of these states, stopped / completed / failed / published.
|
:return: True the task is currently one of these states, stopped / completed / failed / published.
|
||||||
"""
|
"""
|
||||||
return self.status() in (
|
task_status = self.status()
|
||||||
Task.TaskStatusEnum.stopped, Task.TaskStatusEnum.completed,
|
# check if we are Not in any of the non-running states
|
||||||
Task.TaskStatusEnum.failed, Task.TaskStatusEnum.published)
|
if task_status not in (Task.TaskStatusEnum.stopped, Task.TaskStatusEnum.completed,
|
||||||
|
Task.TaskStatusEnum.failed, Task.TaskStatusEnum.published):
|
||||||
|
return False
|
||||||
|
|
||||||
|
# notice the status update also refresh the "status_message" field on the Task
|
||||||
|
|
||||||
|
# if we are stopped but the message says "non-responsive" it means for some reason the
|
||||||
|
# Task's instance was killed, we should ignore it if requested because we assume someone will bring it back
|
||||||
|
if aborted_nonresponsive_as_running and task_status == Task.TaskStatusEnum.stopped and \
|
||||||
|
str(self.task.data.status_message).lower() == "forced stop (non-responsive)":
|
||||||
|
# if we are here it means the state is "stopped" but we should ignore it
|
||||||
|
# because the non-responsive watchdog set it. We assume someone (autoscaler) will relaunch it.
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
# if we do not need to ignore the nonactive state, it means this Task stopped
|
||||||
|
return True
|
||||||
|
|
||||||
def is_failed(self):
|
def is_failed(self):
|
||||||
# type: () -> bool
|
# type: () -> bool
|
||||||
|
@ -1637,16 +1637,18 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin):
|
|||||||
def get_status(self):
|
def get_status(self):
|
||||||
# type: () -> str
|
# type: () -> str
|
||||||
"""
|
"""
|
||||||
Return The task status without refreshing the entire Task object object (only the status property)
|
Return The task status without refreshing the entire Task object (only the status property)
|
||||||
|
|
||||||
TaskStatusEnum: ["created", "in_progress", "stopped", "closed", "failed", "completed",
|
TaskStatusEnum: ["created", "in_progress", "stopped", "closed", "failed", "completed",
|
||||||
"queued", "published", "publishing", "unknown"]
|
"queued", "published", "publishing", "unknown"]
|
||||||
|
|
||||||
:return: str: Task status as string (TaskStatusEnum)
|
:return: str: Task status as string (TaskStatusEnum)
|
||||||
"""
|
"""
|
||||||
status = self._get_status()[0]
|
status, status_message = self._get_status()
|
||||||
if self._data:
|
if self._data:
|
||||||
self._data.status = status
|
self._data.status = status
|
||||||
|
self._data.status_message = str(status_message)
|
||||||
|
|
||||||
return str(status)
|
return str(status)
|
||||||
|
|
||||||
def get_output_log_web_page(self):
|
def get_output_log_web_page(self):
|
||||||
|
Loading…
Reference in New Issue
Block a user