mirror of
https://github.com/clearml/clearml-agent
synced 2025-01-31 00:56:53 +00:00
Fix a race condition where, in rare cases, popping a Task from a queue that was aborted did not set it to started before the watchdog killed it. This does not happen in k8s/slurm.
This commit is contained in:
parent
0bb267115b
commit
aef6aa9fc8
@ -938,9 +938,9 @@ class Worker(ServiceCommandSection):
|
||||
# set task status to in_progress so we know it was popped from the queue
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
task_session.send_api(tasks_api.StartedRequest(task=task_id, status_message="pulled by agent", force=True))
|
||||
task_session.send_api(tasks_api.StartedRequest(task=task_id, status_message="launch by agent", force=True))
|
||||
except Exception:
|
||||
print("Warning: Could not start task id '{}', skipping".format(task_id))
|
||||
print("Warning: Could not set status=in_progress task id '{}', skipping".format(task_id))
|
||||
return
|
||||
# setup console log
|
||||
temp_stdout_name = safe_mkstemp(
|
||||
@ -1330,6 +1330,16 @@ class Worker(ServiceCommandSection):
|
||||
except:
|
||||
pass
|
||||
|
||||
# set task status to in_progress so we know it was popped from the queue
|
||||
# next api version we will set the status when pulling from the queue
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
self._session.send_api(
|
||||
tasks_api.StartedRequest(task=task_id, status_message="pulled by agent", force=True))
|
||||
except Exception:
|
||||
print("Warning: Could not set status=in_progress task id '{}', retrying in a bit".format(task_id))
|
||||
|
||||
# check if we need to impersonate
|
||||
task_session = None
|
||||
if self._impersonate_as_task_owner:
|
||||
try:
|
||||
|
Loading…
Reference in New Issue
Block a user