mirror of
				https://github.com/clearml/clearml-agent
				synced 2025-06-26 18:16:15 +00:00 
			
		
		
		
	Fix on_abort bash callback, if main processes leave while on_abort callback is running, wait for the on_abort to complete
This commit is contained in:
		
							parent
							
								
									d30d4e7e61
								
							
						
					
					
						commit
						66494c598d
					
				| @ -578,7 +578,7 @@ class TaskStopSignal(object): | |||||||
|         self._bash_callback = shlex.split(ENV_ABORT_CALLBACK_CMD.get()) |         self._bash_callback = shlex.split(ENV_ABORT_CALLBACK_CMD.get()) | ||||||
|         # make sure we are re-testing in subprocesses |         # make sure we are re-testing in subprocesses | ||||||
|         os.environ[ENV_ABORT_CALLBACK_CMD.vars[0]+"_REGISTERED"] = ENV_ABORT_CALLBACK_CMD.get() |         os.environ[ENV_ABORT_CALLBACK_CMD.vars[0]+"_REGISTERED"] = ENV_ABORT_CALLBACK_CMD.get() | ||||||
|         os.environ.pop(ENV_ABORT_CALLBACK_CMD.vars[0], None) |         ENV_ABORT_CALLBACK_CMD.pop() | ||||||
| 
 | 
 | ||||||
|         # noinspection PyBroadException |         # noinspection PyBroadException | ||||||
|         try: |         try: | ||||||
| @ -611,7 +611,21 @@ class TaskStopSignal(object): | |||||||
|         self._self_monitor_thread.start() |         self._self_monitor_thread.start() | ||||||
|         print("INFO: bash on_abort monitor thread started") |         print("INFO: bash on_abort monitor thread started") | ||||||
| 
 | 
 | ||||||
|     def stop_monitor_thread(self): |     def stop_monitor_thread(self, wait_on_abort_timeout=-1): | ||||||
|  |         # if no on_abort callback was launched, or it's done, nothing to do | ||||||
|  |         if not self._bash_callback_thread or not self._self_monitor_thread: | ||||||
|  |             self._self_monitor_thread = None | ||||||
|  |             return | ||||||
|  | 
 | ||||||
|  |         # we need to wait for the on_abort callback to be done | ||||||
|  |         if wait_on_abort_timeout and wait_on_abort_timeout < 0: | ||||||
|  |             wait_on_abort_timeout = self._abort_callback_max_timeout | ||||||
|  | 
 | ||||||
|  |         if wait_on_abort_timeout: | ||||||
|  |             tic = time() | ||||||
|  |             while self._bash_callback_thread and (time() - tic < wait_on_abort_timeout): | ||||||
|  |                 sleep(1) | ||||||
|  | 
 | ||||||
|         self._self_monitor_thread = None |         self._self_monitor_thread = None | ||||||
| 
 | 
 | ||||||
|     def _monitor_thread_loop(self, polling_interval_sec=10): |     def _monitor_thread_loop(self, polling_interval_sec=10): | ||||||
| @ -620,6 +634,7 @@ class TaskStopSignal(object): | |||||||
|             stop_reason = self.test() |             stop_reason = self.test() | ||||||
|             if stop_reason != TaskStopSignal.default: |             if stop_reason != TaskStopSignal.default: | ||||||
|                 # mark quit loop |                 # mark quit loop | ||||||
|  |                 self._self_monitor_thread = None | ||||||
|                 break |                 break | ||||||
| 
 | 
 | ||||||
|     def _bash_callback_launch_thread(self): |     def _bash_callback_launch_thread(self): | ||||||
| @ -641,6 +656,9 @@ class TaskStopSignal(object): | |||||||
|                               task=self.task_id, runtime=runtime_properties, force=True) |                               task=self.task_id, runtime=runtime_properties, force=True) | ||||||
|         except Exception as ex: |         except Exception as ex: | ||||||
|             print("WARNING: failed updating bash callback completed: {}".format(ex)) |             print("WARNING: failed updating bash callback completed: {}".format(ex)) | ||||||
|  | 
 | ||||||
|  |         # mark we are done | ||||||
|  |         self._bash_callback_thread = None | ||||||
|         return |         return | ||||||
| 
 | 
 | ||||||
|     def test(self): |     def test(self): | ||||||
| @ -3224,6 +3242,9 @@ class Worker(ServiceCommandSection): | |||||||
|                         os.execv(command.argv[0].as_posix(), tuple([command.argv[0].as_posix()])+command.argv[1:]) |                         os.execv(command.argv[0].as_posix(), tuple([command.argv[0].as_posix()])+command.argv[1:]) | ||||||
|                     else: |                     else: | ||||||
|                         exit_code = command.check_call(cwd=script_dir) |                         exit_code = command.check_call(cwd=script_dir) | ||||||
|  |                         # if we have an on_abort callback still running, wait for it | ||||||
|  |                         if stop_signal: | ||||||
|  |                             stop_signal.stop_monitor_thread() | ||||||
|                         exit(exit_code) |                         exit(exit_code) | ||||||
|                 except subprocess.CalledProcessError as ex: |                 except subprocess.CalledProcessError as ex: | ||||||
|                     # non zero return code |                     # non zero return code | ||||||
| @ -3257,6 +3278,10 @@ class Worker(ServiceCommandSection): | |||||||
|             self.log_traceback(e) |             self.log_traceback(e) | ||||||
|             exit_code = -1 |             exit_code = -1 | ||||||
| 
 | 
 | ||||||
|  |         # if we have an on_abort callback still running, wait for it | ||||||
|  |         if stop_signal: | ||||||
|  |             stop_signal.stop_monitor_thread() | ||||||
|  | 
 | ||||||
|         # kill leftover processes |         # kill leftover processes | ||||||
|         kill_all_child_processes() |         kill_all_child_processes() | ||||||
| 
 | 
 | ||||||
| @ -3266,8 +3291,6 @@ class Worker(ServiceCommandSection): | |||||||
|         exit_code = exit_code if exit_code != ExitStatus.interrupted else -1 |         exit_code = exit_code if exit_code != ExitStatus.interrupted else -1 | ||||||
| 
 | 
 | ||||||
|         if not disable_monitoring: |         if not disable_monitoring: | ||||||
|             if stop_signal: |  | ||||||
|                 stop_signal.stop_monitor_thread() |  | ||||||
|             # we need to change task status according to exit code |             # we need to change task status according to exit code | ||||||
|             self.handle_task_termination(current_task.id, exit_code, TaskStopReason.no_stop) |             self.handle_task_termination(current_task.id, exit_code, TaskStopReason.no_stop) | ||||||
|             self.stop_monitor() |             self.stop_monitor() | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 clearml
						clearml