mirror of
https://github.com/clearml/clearml-agent
synced 2025-04-07 14:04:16 +00:00
Fix: use a same-state transition when supported by the server (instead of stopping the task before re-enqueueing it)
This commit is contained in:
parent
99e1e54f94
commit
b8c762401b
@ -64,6 +64,7 @@ class Session(TokenManager):
|
|||||||
default_key = "EGRTCO8JMSIGI6S39GTP43NFWXDQOW"
|
default_key = "EGRTCO8JMSIGI6S39GTP43NFWXDQOW"
|
||||||
default_secret = "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"
|
default_secret = "x!XTov_G-#vspE*Y(h$Anm&DIc5Ou-F)jsl$PdOyj5wG1&E!Z8"
|
||||||
force_max_api_version = ENV_FORCE_MAX_API_VERSION.get()
|
force_max_api_version = ENV_FORCE_MAX_API_VERSION.get()
|
||||||
|
server_version = "1.0.0"
|
||||||
|
|
||||||
# TODO: add requests.codes.gateway_timeout once we support async commits
|
# TODO: add requests.codes.gateway_timeout once we support async commits
|
||||||
_retry_codes = [
|
_retry_codes = [
|
||||||
@ -191,6 +192,7 @@ class Session(TokenManager):
|
|||||||
|
|
||||||
Session.api_version = str(api_version)
|
Session.api_version = str(api_version)
|
||||||
Session.feature_set = str(token_dict.get('feature_set', self.feature_set) or "basic")
|
Session.feature_set = str(token_dict.get('feature_set', self.feature_set) or "basic")
|
||||||
|
Session.server_version = token_dict.get('server_version', self.server_version)
|
||||||
except (jwt.DecodeError, ValueError):
|
except (jwt.DecodeError, ValueError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -651,11 +653,14 @@ class Session(TokenManager):
|
|||||||
"""
|
"""
|
||||||
Return True if Session.api_version is greater than or equal to min_api_version
|
Return True if Session.api_version is greater than or equal to min_api_version
|
||||||
"""
|
"""
|
||||||
def version_tuple(v):
|
|
||||||
v = tuple(map(int, (v.split("."))))
|
|
||||||
return v + (0,) * max(0, 3 - len(v))
|
|
||||||
return version_tuple(cls.api_version) >= version_tuple(str(min_api_version))
|
return version_tuple(cls.api_version) >= version_tuple(str(min_api_version))
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def check_min_server_version(cls, min_server_version):
|
||||||
|
"""
|
||||||
|
Return True if Session.server_version is greater than or equal to min_server_version
|
||||||
|
"""
|
||||||
|
return version_tuple(cls.server_version) >= version_tuple(str(min_server_version))
|
||||||
def _do_refresh_token(self, current_token, exp=None):
|
def _do_refresh_token(self, current_token, exp=None):
|
||||||
""" TokenManager abstract method implementation.
|
""" TokenManager abstract method implementation.
|
||||||
Here we ignore the old token and simply obtain a new token.
|
Here we ignore the old token and simply obtain a new token.
|
||||||
@ -733,3 +738,8 @@ class Session(TokenManager):
|
|||||||
def propagate_exceptions_on_send(self, value):
|
def propagate_exceptions_on_send(self, value):
|
||||||
# type: (bool) -> None
|
# type: (bool) -> None
|
||||||
self._propagate_exceptions_on_send = value
|
self._propagate_exceptions_on_send = value
|
||||||
|
|
||||||
|
|
||||||
|
def version_tuple(v):
|
||||||
|
v = tuple(map(int, (v.split("."))))
|
||||||
|
return v + (0,) * max(0, 3 - len(v))
|
||||||
|
@ -194,6 +194,10 @@ class K8sIntegration(Worker):
|
|||||||
self._min_cleanup_interval_per_ns_sec = 1.0
|
self._min_cleanup_interval_per_ns_sec = 1.0
|
||||||
self._last_pod_cleanup_per_ns = defaultdict(lambda: 0.)
|
self._last_pod_cleanup_per_ns = defaultdict(lambda: 0.)
|
||||||
|
|
||||||
|
self._server_supports_same_state_transition = (
|
||||||
|
self._session.feature_set != "basic" and self._session.check_min_server_version("3.22.3")
|
||||||
|
)
|
||||||
|
|
||||||
def _create_daemon_instance(self, cls_, **kwargs):
|
def _create_daemon_instance(self, cls_, **kwargs):
|
||||||
return cls_(agent=self, **kwargs)
|
return cls_(agent=self, **kwargs)
|
||||||
|
|
||||||
@ -435,7 +439,9 @@ class K8sIntegration(Worker):
|
|||||||
if self._is_same_tenant(task_session):
|
if self._is_same_tenant(task_session):
|
||||||
try:
|
try:
|
||||||
print('Pushing task {} into temporary pending queue'.format(task_id))
|
print('Pushing task {} into temporary pending queue'.format(task_id))
|
||||||
_ = session.api_client.tasks.stop(task_id, force=True, status_reason="moving to k8s pending queue")
|
|
||||||
|
if not self._server_supports_same_state_transition:
|
||||||
|
_ = session.api_client.tasks.stop(task_id, force=True, status_reason="moving to k8s pending queue")
|
||||||
|
|
||||||
# Just make sure to clean up in case the task is stuck in the queue (known issue)
|
# Just make sure to clean up in case the task is stuck in the queue (known issue)
|
||||||
self._session.api_client.queues.remove_task(
|
self._session.api_client.queues.remove_task(
|
||||||
@ -956,7 +962,7 @@ class K8sIntegration(Worker):
|
|||||||
result = self._session.get(
|
result = self._session.get(
|
||||||
service='tasks',
|
service='tasks',
|
||||||
action='get_all',
|
action='get_all',
|
||||||
json={"id": task_ids, "status": ["in_progress", "queued"], "only_fields": ["id", "status"]},
|
json={"id": task_ids, "status": ["in_progress", "queued"], "only_fields": ["id", "status", "status_reason"]},
|
||||||
method=Request.def_method,
|
method=Request.def_method,
|
||||||
)
|
)
|
||||||
tasks_to_abort = result["tasks"]
|
tasks_to_abort = result["tasks"]
|
||||||
@ -966,9 +972,13 @@ class K8sIntegration(Worker):
|
|||||||
for task in tasks_to_abort:
|
for task in tasks_to_abort:
|
||||||
task_id = task.get("id")
|
task_id = task.get("id")
|
||||||
status = task.get("status")
|
status = task.get("status")
|
||||||
|
status_reason = (task.get("status_reason") or "").lower()
|
||||||
if not task_id or not status:
|
if not task_id or not status:
|
||||||
self.log.warning('Failed getting task information: id={}, status={}'.format(task_id, status))
|
self.log.warning('Failed getting task information: id={}, status={}'.format(task_id, status))
|
||||||
continue
|
continue
|
||||||
|
if status == "queued" and "pushed back by policy manager" in status_reason:
|
||||||
|
# Task was pushed back to policy queue by policy manager, don't touch it
|
||||||
|
continue
|
||||||
try:
|
try:
|
||||||
if status == "queued":
|
if status == "queued":
|
||||||
self._session.get(
|
self._session.get(
|
||||||
|
Loading…
Reference in New Issue
Block a user