mirror of
https://github.com/clearml/clearml-agent
synced 2025-06-26 18:16:15 +00:00
Detect and delete "stuck" k8s pods k8s glue
This commit is contained in:
parent
8fdb87f1f5
commit
b7f87fb8d3
@ -1,18 +1,21 @@
|
|||||||
from __future__ import print_function, division, unicode_literals
|
from __future__ import print_function, division, unicode_literals
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import functools
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
from pathlib import Path
|
||||||
import yaml
|
from threading import Thread
|
||||||
import json
|
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from typing import Text, List
|
from typing import Text, List
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
from clearml_agent.commands.events import Events
|
from clearml_agent.commands.events import Events
|
||||||
from clearml_agent.commands.worker import Worker
|
from clearml_agent.commands.worker import Worker
|
||||||
from clearml_agent.definitions import ENV_DOCKER_IMAGE
|
from clearml_agent.definitions import ENV_DOCKER_IMAGE
|
||||||
@ -86,8 +89,9 @@ class K8sIntegration(Worker):
|
|||||||
user_props_cb=None,
|
user_props_cb=None,
|
||||||
overrides_yaml=None,
|
overrides_yaml=None,
|
||||||
template_yaml=None,
|
template_yaml=None,
|
||||||
trains_conf_file=None,
|
clearml_conf_file=None,
|
||||||
extra_bash_init_script=None,
|
extra_bash_init_script=None,
|
||||||
|
**kwargs
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Initialize the k8s integration glue layer daemon
|
Initialize the k8s integration glue layer daemon
|
||||||
@ -110,7 +114,7 @@ class K8sIntegration(Worker):
|
|||||||
:param str overrides_yaml: YAML file containing the overrides for the pod (optional)
|
:param str overrides_yaml: YAML file containing the overrides for the pod (optional)
|
||||||
:param str template_yaml: YAML file containing the template for the pod (optional).
|
:param str template_yaml: YAML file containing the template for the pod (optional).
|
||||||
If provided the pod is scheduled with kubectl apply and overrides are ignored, otherwise with kubectl run.
|
If provided the pod is scheduled with kubectl apply and overrides are ignored, otherwise with kubectl run.
|
||||||
:param str trains_conf_file: clearml.conf file to be use by the pod itself (optional)
|
:param str clearml_conf_file: clearml.conf file to be use by the pod itself (optional)
|
||||||
:param str extra_bash_init_script: Additional bash script to run before starting the Task inside the container
|
:param str extra_bash_init_script: Additional bash script to run before starting the Task inside the container
|
||||||
"""
|
"""
|
||||||
super(K8sIntegration, self).__init__()
|
super(K8sIntegration, self).__init__()
|
||||||
@ -127,7 +131,7 @@ class K8sIntegration(Worker):
|
|||||||
self.num_of_services = num_of_services
|
self.num_of_services = num_of_services
|
||||||
self._edit_hyperparams_support = None
|
self._edit_hyperparams_support = None
|
||||||
self._user_props_cb = user_props_cb
|
self._user_props_cb = user_props_cb
|
||||||
self.trains_conf_file = None
|
self.conf_file_content = None
|
||||||
self.overrides_json_string = None
|
self.overrides_json_string = None
|
||||||
self.template_dict = None
|
self.template_dict = None
|
||||||
self.extra_bash_init_script = extra_bash_init_script or None
|
self.extra_bash_init_script = extra_bash_init_script or None
|
||||||
@ -161,11 +165,57 @@ class K8sIntegration(Worker):
|
|||||||
with open(os.path.expandvars(os.path.expanduser(str(template_yaml))), 'rt') as f:
|
with open(os.path.expandvars(os.path.expanduser(str(template_yaml))), 'rt') as f:
|
||||||
self.template_dict = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
|
self.template_dict = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
|
||||||
|
|
||||||
if trains_conf_file:
|
clearml_conf_file = clearml_conf_file or kwargs.get('trains_conf_file')
|
||||||
with open(os.path.expandvars(os.path.expanduser(str(trains_conf_file))), 'rt') as f:
|
|
||||||
self.trains_conf_file = f.read()
|
if clearml_conf_file:
|
||||||
|
with open(os.path.expandvars(os.path.expanduser(str(clearml_conf_file))), 'rt') as f:
|
||||||
|
self.conf_file_content = f.read()
|
||||||
# make sure we use system packages!
|
# make sure we use system packages!
|
||||||
self.trains_conf_file += '\nagent.package_manager.system_site_packages=true\n'
|
self.conf_file_content += '\nagent.package_manager.system_site_packages=true\n'
|
||||||
|
|
||||||
|
self._monitor_hanging_pods()
|
||||||
|
|
||||||
|
def _monitor_hanging_pods(self):
|
||||||
|
_check_pod_thread = Thread(target=self._monitor_hanging_pods_daemon)
|
||||||
|
_check_pod_thread.daemon = True
|
||||||
|
_check_pod_thread.start()
|
||||||
|
|
||||||
|
def _monitor_hanging_pods_daemon(self):
|
||||||
|
while True:
|
||||||
|
output = get_bash_output('kubectl get pods -n clearml -o=JSON')
|
||||||
|
output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
|
||||||
|
try:
|
||||||
|
output_config = json.loads(output)
|
||||||
|
except Exception as ex:
|
||||||
|
self.log.warning('K8S Glue pods monitor: Failed parsing kubectl output:\n{}\nEx: {}'.format(output, ex))
|
||||||
|
sleep(self._polling_interval)
|
||||||
|
continue
|
||||||
|
pods = output_config.get('items', [])
|
||||||
|
for pod in pods:
|
||||||
|
try:
|
||||||
|
reason = functools.reduce(
|
||||||
|
lambda a, b: a[b], ('status', 'containerStatuses', 0, 'state', 'waiting', 'reason'), pod
|
||||||
|
)
|
||||||
|
except (IndexError, KeyError):
|
||||||
|
continue
|
||||||
|
if reason == 'ImagePullBackOff':
|
||||||
|
pod_name = pod.get('metadata', {}).get('name', None)
|
||||||
|
if pod_name:
|
||||||
|
task_id = pod_name.rpartition('-')[-1]
|
||||||
|
delete_pod_cmd = 'kubectl delete pods {} -n clearml'.format(pod_name)
|
||||||
|
get_bash_output(delete_pod_cmd)
|
||||||
|
try:
|
||||||
|
self._session.api_client.tasks.failed(
|
||||||
|
task=task_id,
|
||||||
|
status_reason="K8S glue error due to ImagePullBackOff",
|
||||||
|
status_message="Changed by K8S glue",
|
||||||
|
force=True
|
||||||
|
)
|
||||||
|
except Exception as ex:
|
||||||
|
self.log.warning(
|
||||||
|
'K8S Glue pods monitor: Failed deleting task "{}"\nEX: {}'.format(task_id, ex)
|
||||||
|
)
|
||||||
|
sleep(self._polling_interval)
|
||||||
|
|
||||||
def _set_task_user_properties(self, task_id: str, **properties: str):
|
def _set_task_user_properties(self, task_id: str, **properties: str):
|
||||||
if self._edit_hyperparams_support is not True:
|
if self._edit_hyperparams_support is not True:
|
||||||
@ -225,8 +275,11 @@ class K8sIntegration(Worker):
|
|||||||
|
|
||||||
# get the clearml.conf encoded file
|
# get the clearml.conf encoded file
|
||||||
# noinspection PyProtectedMember
|
# noinspection PyProtectedMember
|
||||||
hocon_config_encoded = (self.trains_conf_file or self._session._config_file).encode('ascii')
|
hocon_config_encoded = (
|
||||||
create_trains_conf = "echo '{}' | base64 --decode >> ~/clearml.conf".format(
|
self.conf_file_content
|
||||||
|
or Path(self._session._config_file).read_text()
|
||||||
|
).encode("ascii")
|
||||||
|
create_clearml_conf = "echo '{}' | base64 --decode >> ~/clearml.conf".format(
|
||||||
base64.b64encode(
|
base64.b64encode(
|
||||||
hocon_config_encoded
|
hocon_config_encoded
|
||||||
).decode('ascii')
|
).decode('ascii')
|
||||||
@ -283,12 +336,12 @@ class K8sIntegration(Worker):
|
|||||||
|
|
||||||
if self.template_dict:
|
if self.template_dict:
|
||||||
output, error = self._kubectl_apply(
|
output, error = self._kubectl_apply(
|
||||||
create_trains_conf=create_trains_conf,
|
create_clearml_conf=create_clearml_conf,
|
||||||
labels=labels, docker_image=docker_image, docker_args=docker_args,
|
labels=labels, docker_image=docker_image, docker_args=docker_args,
|
||||||
task_id=task_id, queue=queue, queue_name=safe_queue_name)
|
task_id=task_id, queue=queue, queue_name=safe_queue_name)
|
||||||
else:
|
else:
|
||||||
output, error = self._kubectl_run(
|
output, error = self._kubectl_run(
|
||||||
create_trains_conf=create_trains_conf,
|
create_clearml_conf=create_clearml_conf,
|
||||||
labels=labels, docker_image=docker_image,
|
labels=labels, docker_image=docker_image,
|
||||||
task_data=task_data,
|
task_data=task_data,
|
||||||
task_id=task_id, queue=queue, queue_name=safe_queue_name)
|
task_id=task_id, queue=queue, queue_name=safe_queue_name)
|
||||||
@ -297,7 +350,9 @@ class K8sIntegration(Worker):
|
|||||||
output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8'))
|
output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8'))
|
||||||
print('kubectl output:\n{}\n{}'.format(error, output))
|
print('kubectl output:\n{}\n{}'.format(error, output))
|
||||||
if error:
|
if error:
|
||||||
self.log.error("Running kubectl encountered an error: {}".format(error))
|
send_log = "Running kubectl encountered an error: {}".format(error)
|
||||||
|
self.log.error(send_log)
|
||||||
|
self.send_logs(task_id, send_log.splitlines())
|
||||||
|
|
||||||
user_props = {"k8s-queue": str(queue_name)}
|
user_props = {"k8s-queue": str(queue_name)}
|
||||||
if self.ports_mode:
|
if self.ports_mode:
|
||||||
@ -330,7 +385,7 @@ class K8sIntegration(Worker):
|
|||||||
self.log.warning('skipping docker argument {} (only -e --env supported)'.format(cmd))
|
self.log.warning('skipping docker argument {} (only -e --env supported)'.format(cmd))
|
||||||
return kube_args
|
return kube_args
|
||||||
|
|
||||||
def _kubectl_apply(self, create_trains_conf, docker_image, docker_args, labels, queue, task_id, queue_name):
|
def _kubectl_apply(self, create_clearml_conf, docker_image, docker_args, labels, queue, task_id, queue_name):
|
||||||
template = deepcopy(self.template_dict)
|
template = deepcopy(self.template_dict)
|
||||||
template.setdefault('apiVersion', 'v1')
|
template.setdefault('apiVersion', 'v1')
|
||||||
template['kind'] = 'Pod'
|
template['kind'] = 'Pod'
|
||||||
@ -364,7 +419,7 @@ class K8sIntegration(Worker):
|
|||||||
container,
|
container,
|
||||||
dict(name=name, image=docker_image,
|
dict(name=name, image=docker_image,
|
||||||
command=['/bin/bash'],
|
command=['/bin/bash'],
|
||||||
args=['-c', '{} ; {}'.format(create_trains_conf, create_init_script)])
|
args=['-c', '{} ; {}'.format(create_clearml_conf, create_init_script)])
|
||||||
)
|
)
|
||||||
|
|
||||||
if template['spec']['containers']:
|
if template['spec']['containers']:
|
||||||
@ -398,7 +453,7 @@ class K8sIntegration(Worker):
|
|||||||
|
|
||||||
return output, error
|
return output, error
|
||||||
|
|
||||||
def _kubectl_run(self, create_trains_conf, docker_image, labels, queue, task_data, task_id, queue_name):
|
def _kubectl_run(self, create_clearml_conf, docker_image, labels, queue, task_data, task_id, queue_name):
|
||||||
if callable(self.kubectl_cmd):
|
if callable(self.kubectl_cmd):
|
||||||
kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data, queue_name)
|
kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data, queue_name)
|
||||||
else:
|
else:
|
||||||
@ -430,7 +485,7 @@ class K8sIntegration(Worker):
|
|||||||
"--",
|
"--",
|
||||||
"/bin/sh",
|
"/bin/sh",
|
||||||
"-c",
|
"-c",
|
||||||
"{} ; {}".format(create_trains_conf, container_bash_script.format(
|
"{} ; {}".format(create_clearml_conf, container_bash_script.format(
|
||||||
extra_bash_init_cmd=self.extra_bash_init_script, task_id=task_id)),
|
extra_bash_init_cmd=self.extra_bash_init_script, task_id=task_id)),
|
||||||
]
|
]
|
||||||
process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
|
@ -64,7 +64,7 @@ def main():
|
|||||||
|
|
||||||
k8s = K8sIntegration(
|
k8s = K8sIntegration(
|
||||||
ports_mode=args.ports_mode, num_of_services=args.num_of_services, user_props_cb=user_props_cb,
|
ports_mode=args.ports_mode, num_of_services=args.num_of_services, user_props_cb=user_props_cb,
|
||||||
overrides_yaml=args.overrides_yaml, trains_conf_file=args.pod_trains_conf, template_yaml=args.template_yaml,
|
overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf, template_yaml=args.template_yaml,
|
||||||
extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
|
extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
|
||||||
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None
|
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None
|
||||||
)
|
)
|
||||||
|
Loading…
Reference in New Issue
Block a user