From aef6aa9fc89c6a8e5e7e6a52fab39cd67f3e496a Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 24 Jul 2024 17:59:46 +0300 Subject: [PATCH] Fix a race condition where in rare conditions popping a Task from a queue that was aborted did not set it to started before the watchdog killed it. Does not happen in k8s/slurm --- clearml_agent/commands/worker.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index 41c7a3f..c157e5b 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -938,9 +938,9 @@ class Worker(ServiceCommandSection): # set task status to in_progress so we know it was popped from the queue # noinspection PyBroadException try: - task_session.send_api(tasks_api.StartedRequest(task=task_id, status_message="pulled by agent", force=True)) + task_session.send_api(tasks_api.StartedRequest(task=task_id, status_message="launch by agent", force=True)) except Exception: - print("Warning: Could not start task id '{}', skipping".format(task_id)) + print("Warning: Could not set status=in_progress task id '{}', skipping".format(task_id)) return # setup console log temp_stdout_name = safe_mkstemp( @@ -1330,6 +1330,16 @@ class Worker(ServiceCommandSection): except: pass + # set task status to in_progress so we know it was popped from the queue + # next api version we will set the status when pulling from the queue + # noinspection PyBroadException + try: + self._session.send_api( + tasks_api.StartedRequest(task=task_id, status_message="pulled by agent", force=True)) + except Exception: + print("Warning: Could not set status=in_progress task id '{}', retrying in a bit".format(task_id)) + + # check if we need to impersonate task_session = None if self._impersonate_as_task_owner: try: