mirror of
https://github.com/clearml/clearml-agent
synced 2025-04-26 17:10:24 +00:00
Support ignoring kubectl errors
This commit is contained in:
parent
26d748a4d8
commit
4fa61dde1f
@ -1,3 +1,5 @@
|
|||||||
|
import shlex
|
||||||
|
|
||||||
from clearml_agent.helper.environment import EnvEntry
|
from clearml_agent.helper.environment import EnvEntry
|
||||||
|
|
||||||
ENV_START_AGENT_SCRIPT_PATH = EnvEntry("CLEARML_K8S_GLUE_START_AGENT_SCRIPT_PATH", default="~/__start_agent__.sh")
|
ENV_START_AGENT_SCRIPT_PATH = EnvEntry("CLEARML_K8S_GLUE_START_AGENT_SCRIPT_PATH", default="~/__start_agent__.sh")
|
||||||
@ -18,3 +20,13 @@ ENV_POD_USE_IMAGE_ENTRYPOINT = EnvEntry("K8S_GLUE_POD_USE_IMAGE_ENTRYPOINT", def
|
|||||||
Do not inject a cmd and args to the container's image when building the k8s template (depend on the built-in image
|
Do not inject a cmd and args to the container's image when building the k8s template (depend on the built-in image
|
||||||
entrypoint)
|
entrypoint)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
ENV_KUBECTL_IGNORE_ERROR = EnvEntry("K8S_GLUE_IGNORE_KUBECTL_ERROR", default=None)
|
||||||
|
"""
|
||||||
|
Ignore kubectl errors matching this string pattern (allows ignoring warnings sent on stderr while
|
||||||
|
kubectl actually works and starts the pod)
|
||||||
|
"""
|
||||||
|
|
||||||
|
ENV_DEFAULT_SCHEDULER_QUEUE_TAGS = EnvEntry(
|
||||||
|
"K8S_GLUE_DEFAULT_SCHEDULER_QUEUE_TAGS", default=["k8s-glue"], converter=shlex.split
|
||||||
|
)
|
||||||
|
@ -42,6 +42,8 @@ from clearml_agent.glue.definitions import (
|
|||||||
ENV_DEFAULT_EXECUTION_AGENT_ARGS,
|
ENV_DEFAULT_EXECUTION_AGENT_ARGS,
|
||||||
ENV_POD_AGENT_INSTALL_ARGS,
|
ENV_POD_AGENT_INSTALL_ARGS,
|
||||||
ENV_POD_USE_IMAGE_ENTRYPOINT,
|
ENV_POD_USE_IMAGE_ENTRYPOINT,
|
||||||
|
ENV_KUBECTL_IGNORE_ERROR,
|
||||||
|
ENV_DEFAULT_SCHEDULER_QUEUE_TAGS,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -208,6 +210,10 @@ class K8sIntegration(Worker):
|
|||||||
self._session.feature_set != "basic" and self._session.check_min_server_version("3.22.3")
|
self._session.feature_set != "basic" and self._session.check_min_server_version("3.22.3")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
self.ignore_kubectl_errors_re = (
|
||||||
|
re.compile(ENV_KUBECTL_IGNORE_ERROR.get()) if ENV_KUBECTL_IGNORE_ERROR.get() else None
|
||||||
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def agent_label(self):
|
def agent_label(self):
|
||||||
return self._get_agent_label()
|
return self._get_agent_label()
|
||||||
@ -466,13 +472,34 @@ class K8sIntegration(Worker):
|
|||||||
queue=self.k8s_pending_queue_id,
|
queue=self.k8s_pending_queue_id,
|
||||||
)
|
)
|
||||||
|
|
||||||
res = self._session.api_client.tasks.enqueue(
|
for attempt in range(2):
|
||||||
task_id,
|
res = self._session.send_request(
|
||||||
queue=self.k8s_pending_queue_id,
|
"tasks",
|
||||||
status_reason='k8s pending scheduler',
|
"enqueue",
|
||||||
)
|
json={
|
||||||
if res.meta.result_code != 200:
|
"task": task_id,
|
||||||
raise Exception(res.meta.result_msg)
|
"queue": self.k8s_pending_queue_id,
|
||||||
|
"status_reason": "k8s pending scheduler",
|
||||||
|
"update_execution_queue": False,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
if res.ok:
|
||||||
|
break
|
||||||
|
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
result_subcode = res.json()["meta"]["result_subcode"]
|
||||||
|
result_msg = res.json()["meta"]["result_msg"]
|
||||||
|
except Exception:
|
||||||
|
result_subcode = None
|
||||||
|
result_msg = res.text
|
||||||
|
|
||||||
|
if attempt == 0 and res.status_code == 400 and result_subcode == 701:
|
||||||
|
# Invalid queue ID, only retry once
|
||||||
|
self._ensure_pending_queue_exists()
|
||||||
|
continue
|
||||||
|
raise Exception(result_msg)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.log.error("ERROR: Could not push back task [{}] to k8s pending queue {} [{}], error: {}".format(
|
self.log.error("ERROR: Could not push back task [{}] to k8s pending queue {} [{}], error: {}".format(
|
||||||
task_id, self.k8s_pending_queue_name, self.k8s_pending_queue_id, e))
|
task_id, self.k8s_pending_queue_name, self.k8s_pending_queue_id, e))
|
||||||
@ -627,18 +654,21 @@ class K8sIntegration(Worker):
|
|||||||
|
|
||||||
print('kubectl output:\n{}\n{}'.format(error, output))
|
print('kubectl output:\n{}\n{}'.format(error, output))
|
||||||
if error:
|
if error:
|
||||||
send_log = "Running kubectl encountered an error: {}".format(error)
|
if self.ignore_kubectl_errors_re and self.ignore_kubectl_errors_re.match(error):
|
||||||
self.log.error(send_log)
|
print(f"Ignoring error due to {ENV_KUBECTL_IGNORE_ERROR.key}")
|
||||||
self.send_logs(task_id, send_log.splitlines())
|
else:
|
||||||
|
send_log = "Running kubectl encountered an error: {}".format(error)
|
||||||
|
self.log.error(send_log)
|
||||||
|
self.send_logs(task_id, send_log.splitlines())
|
||||||
|
|
||||||
# Make sure to remove the task from our k8s pending queue
|
# Make sure to remove the task from our k8s pending queue
|
||||||
self._session.api_client.queues.remove_task(
|
self._session.api_client.queues.remove_task(
|
||||||
task=task_id,
|
task=task_id,
|
||||||
queue=self.k8s_pending_queue_id,
|
queue=self.k8s_pending_queue_id,
|
||||||
)
|
)
|
||||||
# Set task as failed
|
# Set task as failed
|
||||||
session.api_client.tasks.failed(task_id, force=True)
|
session.api_client.tasks.failed(task_id, force=True)
|
||||||
return
|
return
|
||||||
|
|
||||||
if pod_name:
|
if pod_name:
|
||||||
self.resource_applied(
|
self.resource_applied(
|
||||||
@ -1089,6 +1119,18 @@ class K8sIntegration(Worker):
|
|||||||
def check_if_suspended(self) -> bool:
|
def check_if_suspended(self) -> bool:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def _ensure_pending_queue_exists(self):
|
||||||
|
resolved_ids = self._resolve_queue_names(
|
||||||
|
[self.k8s_pending_queue_name],
|
||||||
|
create_if_missing=True,
|
||||||
|
create_system_tags=ENV_DEFAULT_SCHEDULER_QUEUE_TAGS.get()
|
||||||
|
)
|
||||||
|
if not resolved_ids:
|
||||||
|
raise ValueError(
|
||||||
|
"Failed resolving or creating k8s pending queue {}".format(self.k8s_pending_queue_name)
|
||||||
|
)
|
||||||
|
self.k8s_pending_queue_id = resolved_ids[0]
|
||||||
|
|
||||||
def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
|
def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
|
||||||
"""
|
"""
|
||||||
:summary: Pull and run tasks from queues.
|
:summary: Pull and run tasks from queues.
|
||||||
@ -1104,14 +1146,8 @@ class K8sIntegration(Worker):
|
|||||||
|
|
||||||
events_service = self.get_service(Events)
|
events_service = self.get_service(Events)
|
||||||
|
|
||||||
# make sure we have a k8s pending queue
|
|
||||||
if not self.k8s_pending_queue_id:
|
if not self.k8s_pending_queue_id:
|
||||||
resolved_ids = self._resolve_queue_names([self.k8s_pending_queue_name], create_if_missing=True)
|
self._ensure_pending_queue_exists()
|
||||||
if not resolved_ids:
|
|
||||||
raise ValueError(
|
|
||||||
"Failed resolving or creating k8s pending queue {}".format(self.k8s_pending_queue_name)
|
|
||||||
)
|
|
||||||
self.k8s_pending_queue_id = resolved_ids[0]
|
|
||||||
|
|
||||||
_last_machine_update_ts = 0
|
_last_machine_update_ts = 0
|
||||||
while True:
|
while True:
|
||||||
|
Loading…
Reference in New Issue
Block a user