Fix a race condition where, in rare cases, a Task that was aborted while being popped from a queue was not set to started before the watchdog killed it. Does not happen in k8s/slurm.

This commit is contained in:
allegroai 2024-07-24 17:59:46 +03:00
parent 0bb267115b
commit aef6aa9fc8

View File

@@ -938,9 +938,9 @@ class Worker(ServiceCommandSection):
# set task status to in_progress so we know it was popped from the queue # set task status to in_progress so we know it was popped from the queue
# noinspection PyBroadException # noinspection PyBroadException
try: try:
task_session.send_api(tasks_api.StartedRequest(task=task_id, status_message="pulled by agent", force=True)) task_session.send_api(tasks_api.StartedRequest(task=task_id, status_message="launch by agent", force=True))
except Exception: except Exception:
print("Warning: Could not start task id '{}', skipping".format(task_id)) print("Warning: Could not set status=in_progress task id '{}', skipping".format(task_id))
return return
# setup console log # setup console log
temp_stdout_name = safe_mkstemp( temp_stdout_name = safe_mkstemp(
@@ -1330,6 +1330,16 @@ class Worker(ServiceCommandSection):
except: except:
pass pass
# set task status to in_progress so we know it was popped from the queue
# next api version we will set the status when pulling from the queue
# noinspection PyBroadException
try:
self._session.send_api(
tasks_api.StartedRequest(task=task_id, status_message="pulled by agent", force=True))
except Exception:
print("Warning: Could not set status=in_progress task id '{}', retrying in a bit".format(task_id))
# check if we need to impersonate
task_session = None task_session = None
if self._impersonate_as_task_owner: if self._impersonate_as_task_owner:
try: try: