Mirror of https://github.com/clearml/clearml-agent
Support skipping re-enqueue on suspected preempted k8s pods
parent 6fb48a4c6e
commit 10c6629982
@@ -9,3 +9,6 @@ Script will be appended to the specified file.
 ENV_DEFAULT_EXECUTION_AGENT_ARGS = EnvEntry("K8S_GLUE_DEF_EXEC_AGENT_ARGS", default="--full-monitoring --require-queue")
 ENV_POD_AGENT_INSTALL_ARGS = EnvEntry("K8S_GLUE_POD_AGENT_INSTALL_ARGS", default="", lstrip=False)
 ENV_POD_MONITOR_LOG_BATCH_SIZE = EnvEntry("K8S_GLUE_POD_MONITOR_LOG_BATCH_SIZE", default=5, converter=int)
+ENV_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION = EnvEntry(
+    "K8S_GLUE_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION", default=False, converter=bool
+)
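
For illustration, a minimal sketch of how a boolean entry like the one added above is expected to behave once set in the agent's environment. The get_bool_env helper below is a hypothetical stand-in, not the actual EnvEntry implementation from clearml_agent:

import os

# Hypothetical stand-in for EnvEntry's boolean conversion (assumption: the
# real EnvEntry class in clearml_agent is more featureful than this).
def get_bool_env(key, default=False):
    value = os.environ.get(key)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")

# Enabling the new skip behavior for the agent process:
os.environ["K8S_GLUE_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION"] = "true"
assert get_bool_env("K8S_GLUE_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION")
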
@@ -9,6 +9,7 @@ from clearml_agent.helper.process import stringify_bash_output
 from .daemon import K8sDaemon
 from .utilities import get_path
 from .errors import GetPodsError
+from .definitions import ENV_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION
 
 
 class PendingPodsDaemon(K8sDaemon):
@@ -17,16 +18,16 @@ class PendingPodsDaemon(K8sDaemon):
         self._polling_interval = polling_interval
         self._last_tasks_msgs = {}  # last msg updated for every task
 
-    def get_pods(self, pod_name=None):
+    def get_pods(self, pod_name=None, debug_msg="Detecting pending pods: {cmd}"):
         filters = ["status.phase=Pending"]
         if pod_name:
             filters.append(f"metadata.name={pod_name}")
 
         if self._agent.using_jobs:
             return self._agent.get_pods_for_jobs(
-                job_condition="status.active=1", pod_filters=filters, debug_msg="Detecting pending pods: {cmd}"
+                job_condition="status.active=1", pod_filters=filters, debug_msg=debug_msg
             )
-        return self._agent.get_pods(filters=filters, debug_msg="Detecting pending pods: {cmd}")
+        return self._agent.get_pods(filters=filters, debug_msg=debug_msg)
 
     def _get_pod_name(self, pod: dict):
         return get_path(pod, "metadata", "name")
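
The change above threads a caller-supplied debug_msg through get_pods, so the same query helper can be labeled per call site. A minimal standalone sketch of the pattern, with simplified names that are not the actual clearml_agent API:

# Simplified sketch: a default-labeled debug message that call sites may
# override; the message is formatted with the command actually issued.
def get_pods(pod_name=None, debug_msg="Detecting pending pods: {cmd}"):
    filters = ["status.phase=Pending"]
    if pod_name:
        filters.append(f"metadata.name={pod_name}")
    cmd = "kubectl get pods --field-selector " + ",".join(filters) + " -o json"
    print(debug_msg.format(cmd=cmd))  # stands in for self.log.debug(...)
    return []  # stand-in: the real method parses kubectl's JSON output

get_pods()  # prints "Detecting pending pods: ..."
get_pods(pod_name="example-pod", debug_msg="Refreshing pod information: {cmd}")
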
@@ -72,6 +73,11 @@ class PendingPodsDaemon(K8sDaemon):
             if not namespace:
                 continue
 
+            updated_pod = self.get_pods(pod_name=pod_name, debug_msg="Refreshing pod information: {cmd}")
+            if not updated_pod:
+                continue
+            pod = updated_pod[0]
+
             task_id_to_pod[task_id] = pod
 
             msg = None
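
The inserted refresh guards against acting on a stale listing: a pod that left the Pending phase between the initial query and this point is simply skipped. A minimal sketch of that check-then-refresh pattern, using hypothetical helper names rather than the actual daemon code:

# Sketch: re-query each pod by name and skip it if it is no longer pending,
# so per-pod decisions are not based on the stale initial listing.
def process_pending(pods, fetch_pending_by_name, handle):
    for pod in pods:
        name = pod["metadata"]["name"]
        refreshed = fetch_pending_by_name(name)  # [] once the pod leaves Pending
        if not refreshed:
            continue  # pod progressed (or disappeared) since the listing
        handle(refreshed[0])

# Toy usage: the second pod is no longer pending and is skipped.
pods = [{"metadata": {"name": "pod-a"}}, {"metadata": {"name": "pod-b"}}]
still_pending = {
    "pod-a": [{"metadata": {"name": "pod-a"}}],
    "pod-b": [],  # left Pending since the initial listing
}
process_pending(
    pods,
    lambda name: still_pending[name],
    lambda pod: print("handling", pod["metadata"]["name"]),
)
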
@@ -190,32 +196,39 @@ class PendingPodsDaemon(K8sDaemon):
         if not msg or self._last_tasks_msgs.get(task_id, None) == (msg, tags):
             return
         try:
-            # Make sure the task is queued
-            result = self._session.send_request(
-                service='tasks',
-                action='get_all',
-                json={"id": task_id, "only_fields": ["status"]},
-                method=Request.def_method,
-                async_enable=False,
-            )
-            if result.ok:
-                status = get_path(result.json(), 'data', 'tasks', 0, 'status')
-                # if task is in progress, change its status to enqueued
-                if status == "in_progress":
-                    result = self._session.send_request(
-                        service='tasks', action='enqueue',
-                        json={
-                            "task": task_id, "force": True, "queue": self._agent.k8s_pending_queue_id
-                        },
-                        method=Request.def_method,
-                        async_enable=False,
-                    )
-                    if not result.ok:
-                        result_msg = get_path(result.json(), 'meta', 'result_msg')
-                        self.log.debug(
-                            "K8S Glue pods monitor: failed forcing task status change"
-                            " for pending task {}: {}".format(task_id, result_msg)
-                        )
+            if ENV_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION.get():
+                # This disables the option to enqueue the task which is supposed to sync the ClearML task status
+                # in case the pod was preempted. In some cases this does not happen due to preemption but due to
+                # cluster communication lag issues that cause us not to discover the pod is no longer pending and
+                # enqueue the task when it's actually already running, thus essentially killing the task
+                pass
+            else:
+                # Make sure the task is queued
+                result = self._session.send_request(
+                    service='tasks',
+                    action='get_all',
+                    json={"id": task_id, "only_fields": ["status"]},
+                    method=Request.def_method,
+                    async_enable=False,
+                )
+                if result.ok:
+                    status = get_path(result.json(), 'data', 'tasks', 0, 'status')
+                    # if task is in progress, change its status to enqueued
+                    if status == "in_progress":
+                        result = self._session.send_request(
+                            service='tasks', action='enqueue',
+                            json={
+                                "task": task_id, "force": True, "queue": self._agent.k8s_pending_queue_id
+                            },
+                            method=Request.def_method,
+                            async_enable=False,
+                        )
+                        if not result.ok:
+                            result_msg = get_path(result.json(), 'meta', 'result_msg')
+                            self.log.debug(
+                                "K8S Glue pods monitor: failed forcing task status change"
+                                " for pending task {}: {}".format(task_id, result_msg)
+                            )
 
             # Update task status message
             payload = {"task": task_id, "status_message": "K8S glue status: {}".format(msg)}
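
Taken together, the new flag short-circuits both the status check and the forced enqueue, so a suspected-preempted task is left alone rather than pushed back to the queue; this avoids the race the code comment describes, where a task that has actually started running would be re-enqueued and effectively killed. A minimal sketch of the resulting control flow, with get_status and enqueue as hypothetical callables rather than the actual session API:

import os

# Sketch of the gate added above; get_status and enqueue stand in for the
# tasks.get_all / tasks.enqueue API requests made through the session.
def maybe_reenqueue(task_id, get_status, enqueue):
    if os.environ.get(
        "K8S_GLUE_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION", ""
    ).strip().lower() in ("1", "true", "yes", "on"):
        return  # re-enqueue on suspected preemption disabled by configuration
    if get_status(task_id) == "in_progress":
        enqueue(task_id)  # force the task back into the pending queue

# Toy usage: with the variable unset, an in-progress task is re-enqueued.
maybe_reenqueue("task-123", lambda t: "in_progress", lambda t: print("re-enqueued", t))
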