mirror of
				https://github.com/clearml/clearml
				synced 2025-06-26 18:16:07 +00:00 
			
		
		
		
	Fix Pipeline component is treated as aborted if running on instance that was killed externally (e.g. spot instances dying)
This commit is contained in:
		
							parent
							
								
									1cc87c9a21
								
							
						
					
					
						commit
						5a83aa433d
					
				| @ -1997,7 +1997,7 @@ class PipelineController(object): | ||||
|                 node = self._nodes[j] | ||||
|                 if not node.job: | ||||
|                     continue | ||||
|                 if node.job.is_stopped(): | ||||
|                 if node.job.is_stopped(aborted_nonresponsive_as_running=True): | ||||
|                     node_failed = node.job.is_failed() | ||||
|                     node.executed = node.job.task_id() if not node_failed else False | ||||
|                     if j in launched_nodes: | ||||
| @ -2256,7 +2256,7 @@ class PipelineController(object): | ||||
|                 self._task._edit(models=models) | ||||
| 
 | ||||
|         # update the state (so that we do not scan the node twice) | ||||
|         if node.job.is_stopped(): | ||||
|         if node.job.is_stopped(aborted_nonresponsive_as_running=True): | ||||
|             self._monitored_nodes[node.name]['completed'] = True | ||||
| 
 | ||||
|     def _get_target_project(self, return_project_id=False): | ||||
| @ -2546,7 +2546,10 @@ class PipelineController(object): | ||||
|         """ | ||||
|         if not boto3 or not self._relaunch_on_instance_failure: | ||||
|             return False | ||||
|         worker = node.job.worker().split(":")[-1] | ||||
|         worker = (node.job.worker() or "").split(":")[-1] | ||||
|         if not worker: | ||||
|             return False | ||||
| 
 | ||||
|         if (worker, node.name) in self._relaunch_check_cache: | ||||
|             return self._relaunch_check_cache[(worker, node.name)] | ||||
|         # get credentials from all autoscalers (shouldn't be too many) | ||||
| @ -2697,7 +2700,7 @@ class PipelineDecorator(PipelineController): | ||||
|                 node = self._nodes[j] | ||||
|                 if not node.job: | ||||
|                     continue | ||||
|                 if node.job.is_stopped(): | ||||
|                 if node.job.is_stopped(aborted_nonresponsive_as_running=True): | ||||
|                     node_failed = node.job.is_failed() | ||||
|                     if (node_failed or node.job.is_aborted()) and self._should_relaunch_node(node): | ||||
|                         continue | ||||
| @ -2935,7 +2938,7 @@ class PipelineDecorator(PipelineController): | ||||
|     def component( | ||||
|             cls, | ||||
|             _func=None, *, | ||||
|             return_values=('return_object', ),  # type: Union[str, List[str]] | ||||
|             return_values=('return_object', ),  # type: Union[str, Sequence[str]] | ||||
|             name=None,  # type: Optional[str] | ||||
|             cache=False,  # type: bool | ||||
|             packages=None,  # type: Optional[Union[str, Sequence[str]]] | ||||
| @ -3206,7 +3209,8 @@ class PipelineDecorator(PipelineController): | ||||
|                             raise ValueError("Job was not created and is also not cached/executed") | ||||
|                         return "{}.{}".format(_node.executed, return_name) | ||||
| 
 | ||||
|                     _node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5) | ||||
|                     _node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5, | ||||
|                                    aborted_nonresponsive_as_running=True) | ||||
|                     if _node.job.is_failed() and not _node.continue_on_fail: | ||||
|                         raise ValueError( | ||||
|                             'Pipeline step "{}", Task ID={} failed'.format(_node.name, _node.job.task_id())) | ||||
| @ -3224,7 +3228,8 @@ class PipelineDecorator(PipelineController): | ||||
|                     while True: | ||||
|                         # wait until job is completed | ||||
|                         if _node.job: | ||||
|                             _node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5) | ||||
|                             _node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5, | ||||
|                                            aborted_nonresponsive_as_running=True) | ||||
|                         else: | ||||
|                             sleep(2) | ||||
|                             continue | ||||
| @ -3486,9 +3491,10 @@ class PipelineDecorator(PipelineController): | ||||
|                 while waited: | ||||
|                     waited = False | ||||
|                     for node in list(a_pipeline._nodes.values()): | ||||
|                         if node.executed or not node.job or node.job.is_stopped(): | ||||
|                         if node.executed or not node.job or node.job.is_stopped(aborted_nonresponsive_as_running=True): | ||||
|                             continue | ||||
|                         node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5) | ||||
|                         node.job.wait(pool_period=1 if cls._debug_execute_step_process else 5, | ||||
|                                       aborted_nonresponsive_as_running=True) | ||||
|                         waited = True | ||||
|                 # store the pipeline result of we have any: | ||||
|                 if return_value and pipeline_result is not None: | ||||
|  | ||||
| @ -163,22 +163,25 @@ class BaseJob(object): | ||||
|         self._last_status_ts = time() | ||||
|         return self._last_status | ||||
| 
 | ||||
|     def wait(self, timeout=None, pool_period=30.): | ||||
|         # type: (Optional[float], float) -> bool | ||||
|     def wait(self, timeout=None, pool_period=30., aborted_nonresponsive_as_running=False): | ||||
|         # type: (Optional[float], float, bool) -> bool | ||||
|         """ | ||||
|         Wait until the task is fully executed (i.e., aborted/completed/failed) | ||||
| 
 | ||||
|         :param timeout: maximum time (minutes) to wait for Task to finish | ||||
|         :param pool_period: check task status every pool_period seconds | ||||
|         :param aborted_nonresponsive_as_running: (default: False) If True, ignore the stopped state if the backend | ||||
|             non-responsive watchdog sets this Task to stopped. This scenario could happen if | ||||
|             an instance running the job is killed without warning (e.g. spot instances) | ||||
|         :return: True, if Task finished. | ||||
|         """ | ||||
|         tic = time() | ||||
|         while timeout is None or time() - tic < timeout * 60.: | ||||
|             if self.is_stopped(): | ||||
|             if self.is_stopped(aborted_nonresponsive_as_running=aborted_nonresponsive_as_running): | ||||
|                 return True | ||||
|             sleep(pool_period) | ||||
| 
 | ||||
|         return self.is_stopped() | ||||
|         return self.is_stopped(aborted_nonresponsive_as_running=aborted_nonresponsive_as_running) | ||||
| 
 | ||||
|     def get_console_output(self, number_of_reports=1): | ||||
|         # type: (int) -> Sequence[str] | ||||
| @ -192,7 +195,7 @@ class BaseJob(object): | ||||
|         return self.task.get_reported_console_output(number_of_reports=number_of_reports) | ||||
| 
 | ||||
|     def worker(self): | ||||
|         # type: () -> str | ||||
|         # type: () -> Optional[str] | ||||
|         """ | ||||
|         Return the current worker id executing this Job. If job is pending, returns None | ||||
| 
 | ||||
| @ -216,16 +219,35 @@ class BaseJob(object): | ||||
|         """ | ||||
|         return self.status() == Task.TaskStatusEnum.in_progress | ||||
| 
 | ||||
|     def is_stopped(self): | ||||
|         # type: () -> bool | ||||
|     def is_stopped(self, aborted_nonresponsive_as_running=False): | ||||
|         # type: (bool) -> bool | ||||
|         """ | ||||
|         Return True, if job finished executing (for any reason) | ||||
| 
 | ||||
|         :param aborted_nonresponsive_as_running: (default: False) If True, ignore the stopped state if the backend | ||||
|             non-responsive watchdog sets this Task to stopped. This scenario could happen if | ||||
|             an instance running the job is killed without warning (e.g. spot instances) | ||||
| 
 | ||||
|         :return: True the task is currently one of these states, stopped / completed / failed / published. | ||||
|         """ | ||||
|         return self.status() in ( | ||||
|             Task.TaskStatusEnum.stopped, Task.TaskStatusEnum.completed, | ||||
|             Task.TaskStatusEnum.failed, Task.TaskStatusEnum.published) | ||||
|         task_status = self.status() | ||||
|         # check if we are Not in any of the non-running states | ||||
|         if task_status not in (Task.TaskStatusEnum.stopped, Task.TaskStatusEnum.completed, | ||||
|                                Task.TaskStatusEnum.failed, Task.TaskStatusEnum.published): | ||||
|             return False | ||||
| 
 | ||||
|         # notice the status update also refresh the "status_message" field on the Task | ||||
| 
 | ||||
|         # if we are stopped but the message says "non-responsive" it means for some reason the | ||||
|         # Task's instance was killed, we should ignore it if requested because we assume someone will bring it back | ||||
|         if aborted_nonresponsive_as_running and task_status == Task.TaskStatusEnum.stopped and \ | ||||
|                 str(self.task.data.status_message).lower() == "forced stop (non-responsive)": | ||||
|             # if we are here it means the state is "stopped" but we should ignore it | ||||
|             # because the non-responsive watchdog set it. We assume someone (autoscaler) will relaunch it. | ||||
|             return False | ||||
|         else: | ||||
|             # if we do not need to ignore the nonactive state, it means this Task stopped | ||||
|             return True | ||||
| 
 | ||||
|     def is_failed(self): | ||||
|         # type: () -> bool | ||||
|  | ||||
| @ -1637,16 +1637,18 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin): | ||||
|     def get_status(self): | ||||
|         # type: () -> str | ||||
|         """ | ||||
|         Return The task status without refreshing the entire Task object object (only the status property) | ||||
|         Return The task status without refreshing the entire Task object (only the status property) | ||||
| 
 | ||||
|         TaskStatusEnum: ["created", "in_progress", "stopped", "closed", "failed", "completed", | ||||
|                          "queued", "published", "publishing", "unknown"] | ||||
| 
 | ||||
|         :return: str: Task status as string (TaskStatusEnum) | ||||
|         """ | ||||
|         status = self._get_status()[0] | ||||
|         status, status_message = self._get_status() | ||||
|         if self._data: | ||||
|             self._data.status = status | ||||
|             self._data.status_message = str(status_message) | ||||
| 
 | ||||
|         return str(status) | ||||
| 
 | ||||
|     def get_output_log_web_page(self): | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 allegroai
						allegroai