Improve k8s glue add --template-yaml

commit 293a92f486 (parent 6bad2b5352)

Summary: adds a --template-yaml option to the k8s glue. When a template is
provided, pods are scheduled with "kubectl apply -f" using the template merged
with the agent-generated container spec, and --overrides-yaml is ignored;
otherwise pods are scheduled with "kubectl run" as before, with any container
resources found in the overrides stripped out and forwarded as
--limits/--requests. Three files are touched: the example launcher, the
K8sIntegration glue module, and the trains_agent.helper.dicts helper.
@@ -34,6 +34,11 @@ def parse_args():
         "--overrides-yaml", type=str,
         help="YAML file containing pod overrides to be used when launching a new pod"
     )
+    parser.add_argument(
+        "--template-yaml", type=str,
+        help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
+             "and overrides are ignored, otherwise it will be scheduled with kubectl run"
+    )
     return parser.parse_args()
 
 
@@ -47,7 +52,7 @@ def main():
 
     k8s = K8sIntegration(
         ports_mode=args.ports_mode, num_of_services=args.num_of_services, user_props_cb=user_props_cb,
-        overrides_yaml=args.overrides_yaml, trains_conf_file=args.pod_trains_conf)
+        overrides_yaml=args.overrides_yaml, trains_conf_file=args.pod_trains_conf, template_yaml=args.template_yaml)
     k8s.k8s_daemon(args.queue)
 
 
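For orientation, a minimal sketch of driving the glue directly with the new
argument. The import path, queue name, and YAML path are assumptions for
illustration (inferred from the trains_agent.* imports elsewhere in this diff),
not part of this commit:

    # Hypothetical launcher, mirroring main() above; names and paths invented.
    from trains_agent.glue.k8s import K8sIntegration

    k8s = K8sIntegration(
        ports_mode=False,
        overrides_yaml=None,
        # New in this commit: schedule pods via `kubectl apply -f` on this template
        template_yaml='~/pod_template.yaml',
    )
    k8s.k8s_daemon('k8s_glue_queue')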
@@ -4,6 +4,9 @@ import base64
 import logging
 import os
 import subprocess
+import tempfile
+from copy import deepcopy
+
 import yaml
 import json
 from time import sleep
@@ -14,6 +17,8 @@ from pyhocon import HOCONConverter
 from trains_agent.commands.events import Events
 from trains_agent.commands.worker import Worker
 from trains_agent.errors import APIError
+from trains_agent.helper.base import safe_remove_file
+from trains_agent.helper.dicts import merge_dicts
 from trains_agent.helper.process import get_bash_output
 from trains_agent.helper.resource_monitor import ResourceMonitor
 from trains_agent.interface.base import ObjectID
@@ -22,6 +27,8 @@ from trains_agent.interface.base import ObjectID
 class K8sIntegration(Worker):
     K8S_PENDING_QUEUE = "k8s_scheduler"
 
+    KUBECTL_APPLY_CMD = "kubectl apply -f"
+
     KUBECTL_RUN_CMD = "kubectl run trains-id-{task_id} " \
                       "--image {docker_image} " \
                       "--restart=Never --replicas=1 " \
@@ -58,6 +65,7 @@ class K8sIntegration(Worker):
             num_of_services=20,
             user_props_cb=None,
             overrides_yaml=None,
+            template_yaml=None,
             trains_conf_file=None,
     ):
         """
@@ -76,7 +84,9 @@ class K8sIntegration(Worker):
         :param callable user_props_cb: An Optional callable allowing additional user properties to be specified
             when scheduling a task to run in a pod. Callable can receive an optional pod number and should return
             a dictionary of user properties (name and value). Signature is [[Optional[int]], Dict[str,str]]
-        :param str overrides_yaml: YAML file containing the overides for the pod (optional)
+        :param str overrides_yaml: YAML file containing the overrides for the pod (optional)
+        :param str template_yaml: YAML file containing the template for the pod (optional).
+            If provided the pod is scheduled with kubectl apply and overrides are ignored, otherwise with kubectl run.
         :param str trains_conf_file: trains.conf file to be use by the pod itself (optional)
         """
         super(K8sIntegration, self).__init__()
@@ -95,11 +105,34 @@ class K8sIntegration(Worker):
         self._user_props_cb = user_props_cb
         self.trains_conf_file = None
         self.overrides_json_string = None
+        self.template_dict = None
+        self.pod_limits = []
+        self.pod_requests = []
         if overrides_yaml:
             with open(os.path.expandvars(os.path.expanduser(str(overrides_yaml))), 'rt') as f:
                 overrides = yaml.load(f)
             if overrides:
+                containers = overrides.get('spec', {}).get('containers', [])
+                for c in containers:
+                    resources = {str(k).lower(): v for k, v in c.get('resources', {}).items()}
+                    if not resources:
+                        continue
+                    if resources.get('limits'):
+                        self.pod_limits += ['{}={}'.format(k, v) for k, v in resources['limits'].items()]
+                    if resources.get('requests'):
+                        self.pod_requests += ['{}={}'.format(k, v) for k, v in resources['requests'].items()]
+                # remove double entries
+                self.pod_limits = list(set(self.pod_limits))
+                self.pod_requests = list(set(self.pod_requests))
+                if self.pod_limits or self.pod_requests:
+                    self.log.warning('Found pod container requests={} limits={}'.format(
+                        self.pod_limits, self.pod_requests))
+                if containers:
+                    self.log.warning('Removing containers section: {}'.format(overrides['spec'].pop('containers')))
                 self.overrides_json_string = json.dumps(overrides)
+        if template_yaml:
+            with open(os.path.expandvars(os.path.expanduser(str(template_yaml))), 'rt') as f:
+                self.template_dict = yaml.load(f)
 
         if trains_conf_file:
             with open(os.path.expandvars(os.path.expanduser(str(trains_conf_file))), 'rt') as f:
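To make the new overrides handling concrete, here is a small self-contained
replica of the extraction logic above, run on an invented overrides document:

    # Stand-alone demo of the limits/requests extraction (sample values invented).
    overrides = {'spec': {'containers': [
        {'resources': {'limits': {'cpu': '2', 'memory': '4Gi'},
                       'requests': {'cpu': '1'}}},
    ]}}
    pod_limits, pod_requests = [], []
    for c in overrides.get('spec', {}).get('containers', []):
        resources = {str(k).lower(): v for k, v in c.get('resources', {}).items()}
        if resources.get('limits'):
            pod_limits += ['{}={}'.format(k, v) for k, v in resources['limits'].items()]
        if resources.get('requests'):
            pod_requests += ['{}={}'.format(k, v) for k, v in resources['requests'].items()]
    print(sorted(pod_limits))    # ['cpu=2', 'memory=4Gi']
    print(sorted(pod_requests))  # ['cpu=1']

The containers section is then removed from the overrides JSON; the resources
travel to kubectl run via its --limits/--requests flags instead (see
_kubectl_run below).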
@@ -151,14 +184,17 @@ class K8sIntegration(Worker):
             return
 
         if task_data.execution.docker_cmd:
-            docker_image = task_data.execution.docker_cmd
+            docker_parts = task_data.execution.docker_cmd
         else:
-            docker_image = str(os.environ.get("TRAINS_DOCKER_IMAGE") or
+            docker_parts = str(os.environ.get("TRAINS_DOCKER_IMAGE") or
                                self._session.config.get("agent.default_docker.image", "nvidia/cuda"))
 
         # take the first part, this is the docker image name (not arguments)
-        docker_image = docker_image.split()[0]
+        docker_parts = docker_parts.split()
+        docker_image = docker_parts[0]
+        docker_args = docker_parts[1:] if len(docker_parts) > 1 else []
+
+        # get the trains.conf encoded file
         hocon_config_encoded = (
             self.trains_conf_file or HOCONConverter.to_hocon(self._session.config._config)).encode('ascii')
         create_trains_conf = "echo '{}' | base64 --decode >> ~/trains.conf && ".format(
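The split above separates the configured docker command into an image and
pass-through arguments; for example (values invented):

    docker_cmd = 'nvidia/cuda -e MY_VAR=value'
    docker_parts = docker_cmd.split()
    docker_image = docker_parts[0]                                  # 'nvidia/cuda'
    docker_args = docker_parts[1:] if len(docker_parts) > 1 else []
    # docker_args == ['-e', 'MY_VAR=value'] -> consumed by _parse_docker_args() below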
@@ -167,22 +203,6 @@ class K8sIntegration(Worker):
             ).decode('ascii')
         )
 
-        if callable(self.kubectl_cmd):
-            kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data)
-        else:
-            kubectl_cmd = self.kubectl_cmd.format(
-                task_id=task_id,
-                docker_image=docker_image,
-                queue_id=queue
-            )
-
-        # make sure we provide a list
-        if isinstance(kubectl_cmd, str):
-            kubectl_cmd = kubectl_cmd.split()
-
-        if self.overrides_json_string:
-            kubectl_cmd += ['--overrides=' + self.overrides_json_string]
-
         if self.ports_mode:
             print("Kubernetes looking for available pod to use")
 
@@ -214,26 +234,27 @@ class K8sIntegration(Worker):
                 return
             pod_number += 1
 
-        labels = [self.AGENT_LABEL]
+        labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + [self.AGENT_LABEL]
+
         if self.ports_mode:
-            labels.insert(0, self.LIMIT_POD_LABEL.format(pod_number=pod_number))
             print("Kubernetes scheduling task id={} on pod={}".format(task_id, pod_number))
         else:
             print("Kubernetes scheduling task id={}".format(task_id))
 
-        kubectl_cmd += [
-            "--labels=" + ",".join(labels),
-            "--command",
-            "--",
-            "/bin/sh",
-            "-c",
-            create_trains_conf + self.container_bash_script.format(task_id),
-        ]
-        process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        output, error = process.communicate()
+        if self.template_dict:
+            output, error = self._kubectl_apply(
+                create_trains_conf=create_trains_conf,
+                labels=labels, docker_image=docker_image, docker_args=docker_args,
+                task_id=task_id, queue=queue)
+        else:
+            output, error = self._kubectl_run(
+                create_trains_conf=create_trains_conf,
+                labels=labels, docker_image=docker_image,
+                task_data=task_data,
+                task_id=task_id, queue=queue)
 
-        error = '' if not error else error if isinstance(error, str) else error.decode('utf-8')
-        output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
+        error = '' if not error else (error if isinstance(error, str) else error.decode('utf-8'))
+        output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8'))
         print('kubectl output:\n{}\n{}'.format(error, output))
 
         if error:
@@ -249,9 +270,106 @@ class K8sIntegration(Worker):
                 pass
         self._set_task_user_properties(
             task_id=task_id,
-            **user_props,
+            **user_props
         )
 
+    def _parse_docker_args(self, docker_args):
+        # type: (list) -> dict
+        kube_args = {'env': []}
+        while docker_args:
+            cmd = docker_args.pop(0).strip()
+            if cmd in ('-e', '--env',):
+                env = docker_args.pop(0).strip()
+                key, value = env.split('=', 1)
+                kube_args['env'] += [{'name': key, 'value': value}]
+            else:
+                self.log.warning('skipping docker argument {} (only -e --env supported)'.format(cmd))
+        return kube_args
+
+    def _kubectl_apply(self, create_trains_conf, docker_image, docker_args, labels, queue, task_id):
+        template = deepcopy(self.template_dict)
+        template.setdefault('apiVersion', 'v1')
+        template['kind'] = 'Pod'
+        template.setdefault('metadata', {})
+        name = 'trains-id-{task_id}'.format(task_id=task_id)
+        template['metadata']['name'] = name
+        template.setdefault('spec', {})
+        template['spec'].setdefault('containers', [])
+        if labels:
+            labels_dict = dict(pair.split('=', 1) for pair in labels)
+            template['metadata'].setdefault('labels', {})
+            template['metadata']['labels'].update(labels_dict)
+        container = self._parse_docker_args(docker_args)
+        container = merge_dicts(
+            container,
+            dict(name=name, image=docker_image,
+                 command=['/bin/sh'],
+                 args=['-c', create_trains_conf + self.container_bash_script.format(task_id)]))
+
+        if template['spec']['containers']:
+            template['spec']['containers'][0] = merge_dicts(template['spec']['containers'][0], container)
+        else:
+            template['spec']['containers'].append(container)
+
+        fp, yaml_file = tempfile.mkstemp(prefix='trains_k8stmpl_', suffix='.yml')
+        os.close(fp)
+        with open(yaml_file, 'wt') as f:
+            yaml.dump(template, f)
+
+        kubectl_cmd = self.KUBECTL_APPLY_CMD.format(
+            task_id=task_id,
+            docker_image=docker_image,
+            queue_id=queue,
+        )
+        # make sure we provide a list
+        if isinstance(kubectl_cmd, str):
+            kubectl_cmd = kubectl_cmd.split()
+
+        # add the template file at the end
+        kubectl_cmd += [yaml_file]
+        try:
+            process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            output, error = process.communicate()
+        except Exception as ex:
+            return None, str(ex)
+        finally:
+            safe_remove_file(yaml_file)
+
+        return output, error
+
+    def _kubectl_run(self, create_trains_conf, docker_image, labels, queue, task_data, task_id):
+        if callable(self.kubectl_cmd):
+            kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data)
+        else:
+            kubectl_cmd = self.kubectl_cmd.format(
+                task_id=task_id,
+                docker_image=docker_image,
+                queue_id=queue
+            )
+        # make sure we provide a list
+        if isinstance(kubectl_cmd, str):
+            kubectl_cmd = kubectl_cmd.split()
+
+        if self.overrides_json_string:
+            kubectl_cmd += ['--overrides=' + self.overrides_json_string]
+
+        if self.pod_limits:
+            kubectl_cmd += ['--limits', ",".join(self.pod_limits)]
+        if self.pod_requests:
+            kubectl_cmd += ['--requests', ",".join(self.pod_requests)]
+
+        kubectl_cmd += [
+            "--labels=" + ",".join(labels),
+            "--command",
+            "--",
+            "/bin/sh",
+            "-c",
+            create_trains_conf + self.container_bash_script.format(task_id),
+        ]
+        process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        output, error = process.communicate()
+        return output, error
+
     def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
         """
         :summary: Pull and run tasks from queues.
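As a rough, self-contained sketch of what _kubectl_apply assembles before
writing the temp YAML and shelling out (template content, task id, and image
are invented; merge_dicts is the helper added at the bottom of this commit):

    from copy import deepcopy

    def merge_dicts(dict1, dict2):  # same recursive merge as the dicts helper below
        if not isinstance(dict1, dict) or not isinstance(dict2, dict):
            return dict2
        for k in dict2:
            dict1[k] = merge_dicts(dict1[k], dict2[k]) if k in dict1 else dict2[k]
        return dict1

    template = {'spec': {'nodeSelector': {'gpu': 'true'},
                         'containers': [{'resources': {'limits': {'nvidia.com/gpu': 1}}}]}}
    pod = deepcopy(template)
    pod.setdefault('apiVersion', 'v1')
    pod['kind'] = 'Pod'
    pod.setdefault('metadata', {})['name'] = 'trains-id-1234'
    agent_container = {'name': 'trains-id-1234', 'image': 'nvidia/cuda',
                       'command': ['/bin/sh'], 'args': ['-c', 'echo bootstrap-script']}
    pod['spec']['containers'][0] = merge_dicts(pod['spec']['containers'][0], agent_container)
    # pod now keeps the user's nodeSelector and resources while the agent supplies
    # the container name, image and bootstrap command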
@@ -277,7 +395,7 @@ class K8sIntegration(Worker):
         while True:
             # iterate over queues (priority style, queues[0] is highest)
             for queue in queues:
-                # delete old completed /failed pods
+                # delete old completed / failed pods
                 get_bash_output(self.KUBECTL_DELETE_CMD)
 
                 # get next task in queue
@@ -3,3 +3,15 @@ from typing import Callable, Dict, Any
 
 def filter_keys(filter_, dct):  # type: (Callable[[Any], bool], Dict) -> Dict
     return {key: value for key, value in dct.items() if filter_(key)}
+
+
+def merge_dicts(dict1, dict2):
+    """ Recursively merges dict2 into dict1 """
+    if not isinstance(dict1, dict) or not isinstance(dict2, dict):
+        return dict2
+    for k in dict2:
+        if k in dict1:
+            dict1[k] = merge_dicts(dict1[k], dict2[k])
+        else:
+            dict1[k] = dict2[k]
+    return dict1
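Usage note on merge_dicts: nested dicts merge key by key, dict2 wins on
conflicting scalars (and on lists, which are replaced wholesale rather than
concatenated), and dict1 is mutated in place. A quick check with invented
values:

    base = {'metadata': {'labels': {'app': 'trains'}}, 'kind': 'Pod'}
    extra = {'metadata': {'labels': {'pod': '7'}, 'name': 'x'}, 'kind': 'Job'}
    assert merge_dicts(base, extra) == {
        'metadata': {'labels': {'app': 'trains', 'pod': '7'}, 'name': 'x'},
        'kind': 'Job',
    }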