Fix: if the process return code is SIGKILL (-9 or 137) and the abort callback was called, mark the task as aborted instead of failed

This commit is contained in:
allegroai 2023-12-20 17:43:02 +02:00
parent 564f769ff7
commit 030cbb69f1

View File

@ -505,19 +505,7 @@ class TaskStopSignal(object):
return True return True
# check if abort callback is turned on # check if abort callback is turned on
cb_completed = None abort_timeout, poll_timeout, cb_completed = self._get_abort_callback_stat()
# TODO: add retries on network error with timeout
try:
task_info = self.session.get(
service="tasks", action="get_all", version="2.13", id=[self.task_id],
only_fields=["status", "status_message", "runtime._abort_callback_timeout",
"runtime._abort_poll_freq", "runtime._abort_callback_completed"])
abort_timeout = task_info['tasks'][0]['runtime'].get('_abort_callback_timeout', 0)
poll_timeout = task_info['tasks'][0]['runtime'].get('_abort_poll_freq', 0)
cb_completed = task_info['tasks'][0]['runtime'].get('_abort_callback_completed', None)
except: # noqa
abort_timeout = None
poll_timeout = None
if not abort_timeout: if not abort_timeout:
# no callback set we can leave # no callback set we can leave
@ -540,8 +528,39 @@ class TaskStopSignal(object):
self._active_callback_timeout = timeout self._active_callback_timeout = timeout
return bool(cb_completed) return bool(cb_completed)
def was_abort_function_called(self): def _get_abort_callback_stat(self):
return bool(self._active_callback_timestamp) # TODO: add retries on network error with timeout
try:
task_info = self.session.get(
service="tasks", action="get_all", version="2.13", id=[self.task_id],
only_fields=["status", "status_message", "runtime._abort_callback_timeout",
"runtime._abort_poll_freq", "runtime._abort_callback_completed"])
abort_timeout = task_info['tasks'][0]['runtime'].get('_abort_callback_timeout', 0)
poll_timeout = task_info['tasks'][0]['runtime'].get('_abort_poll_freq', 0)
cb_completed = task_info['tasks'][0]['runtime'].get('_abort_callback_completed', None)
except: # noqa
abort_timeout = None
poll_timeout = None
cb_completed = None
return abort_timeout, poll_timeout, cb_completed
def was_abort_function_called(self, process_error_code=None):
# Report whether the task's abort callback was invoked, so the caller can
# treat the run as stopped/aborted.
# process_error_code: optional exit status of the task process; only the
# values -9 and 137 trigger the extra runtime-info lookup further down.
# Returns True or False.
# without callback support there is nothing to check
if not self._support_callback:
return False
# a truthy callback timestamp is taken as proof the abort function was called
if self._active_callback_timestamp:
return True
# if the process error code is SIGKILL (exit code 137) -
# check the runtime info of the Task - it might have killed itself because it was aborted
if process_error_code in (-9, 137):
# check if abort callback is turned on
# only the completion flag is used here; _get_abort_callback_stat returns
# None for it when the lookup fails, which falls through to False
_, _, cb_completed = self._get_abort_callback_stat()
if cb_completed:
return True
return False
def _test(self): def _test(self):
# type: () -> TaskStopReason # type: () -> TaskStopReason
@ -2005,7 +2024,7 @@ class Worker(ServiceCommandSection):
stderr_line_count += report_lines(printed_lines, "stderr") stderr_line_count += report_lines(printed_lines, "stderr")
# make sure that if the abort function was called, the task is marked as aborted # make sure that if the abort function was called, the task is marked as aborted
if stop_signal and stop_signal.was_abort_function_called(): if stop_signal and stop_signal.was_abort_function_called(status):
stop_reason = TaskStopReason.stopped stop_reason = TaskStopReason.stopped
return status, stop_reason return status, stop_reason