Mirror of https://github.com/clearml/clearml-agent (synced 2025-03-13 06:58:37 +00:00)
Add K8s glue support for limited number of services exposing ports
parent 89a3020c5e
commit 92fc8e838f
examples/k8s_glue_example.py (new file, 34 lines added)
@@ -0,0 +1,34 @@
+"""
+This example assumes you have preconfigured services with selectors in the form of
+"ai.allegro.agent.serial=pod-<number>" and a targetPort of 10022.
+The K8sIntegration component will label each pod accordingly.
+"""
+from argparse import ArgumentParser
+
+from trains_agent.glue.k8s import K8sIntegration
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--queue", type=str, help="Queue to pull tasks from"
+    )
+    parser.add_argument(
+        "--ports-mode", action='store_true', default=False,
+        help="Ports-mode will add a label to the pod which can be used in services in order to expose ports"
+    )
+    parser.add_argument(
+        "--num-of-services", type=int, default=20,
+        help="Specify the number of k8s services to be used. Use only with ports-mode."
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    k8s = K8sIntegration(ports_mode=args.ports_mode, num_of_services=args.num_of_services)
+    k8s.k8s_daemon(args.queue)
+
+
+if __name__ == "__main__":
+    main()
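The docstring above assumes a set of preexisting services, one per pod-number slot, each selecting the label ai.allegro.agent.serial=pod-<number> with a targetPort of 10022. Below is a minimal sketch of what one such service could look like, written with the official kubernetes Python client. The service name, the NodePort exposure and the node-port values are illustrative assumptions, as is the "trains" namespace (chosen to match the --namespace=trains used by the glue layer in the hunks that follow); only the selector key and the target port come from the docstring.

# Illustrative sketch only -- not part of the commit. Assumes the kubernetes
# Python client is installed and a kubeconfig is available.
from kubernetes import client, config


def create_slot_service(pod_number, node_port):
    """Create a service selecting the pod labeled ai.allegro.agent.serial=pod-<pod_number>."""
    config.load_kube_config()
    service = client.V1Service(
        api_version="v1",
        kind="Service",
        metadata=client.V1ObjectMeta(name="trains-agent-pod-{}".format(pod_number)),
        spec=client.V1ServiceSpec(
            type="NodePort",  # assumption: expose each slot on its own node port
            # Matches the label the K8sIntegration glue adds to the pod in ports-mode
            selector={"ai.allegro.agent.serial": "pod-{}".format(pod_number)},
            ports=[client.V1ServicePort(port=10022, target_port=10022, node_port=node_port)],
        ),
    )
    client.CoreV1Api().create_namespaced_service(namespace="trains", body=service)


if __name__ == "__main__":
    # e.g. 20 slots on node ports 30000-30019 (hypothetical values)
    for slot in range(1, 21):
        create_slot_service(pod_number=slot, node_port=30000 + slot - 1)

With services like these created for each slot (up to --num-of-services), the example can be launched with something along the lines of: python examples/k8s_glue_example.py --queue <queue_name> --ports-mode --num-of-services 20. The remaining hunks change the K8sIntegration glue layer itself.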
@@ -1,5 +1,6 @@
 from __future__ import print_function, division, unicode_literals
 
+import base64
 import logging
 import os
 import subprocess
@@ -12,39 +13,58 @@ from trains_agent.commands.events import Events
 from trains_agent.commands.worker import Worker
 from trains_agent.helper.process import get_bash_output
 from trains_agent.helper.resource_monitor import ResourceMonitor
 from trains_agent.interface.base import ObjectID
 
 
 class K8sIntegration(Worker):
     K8S_PENDING_QUEUE = "k8s_scheduler"
 
-    KUBECTL_RUN_CMD = "kubectl run trains_id_{task_id} " \
+    KUBECTL_RUN_CMD = "kubectl run trains-id-{task_id} " \
                       "--image {docker_image} " \
                       "--restart=Never --replicas=1 " \
-                      "--generator=run-pod/v1"
+                      "--generator=run-pod/v1 " \
+                      "--namespace=trains"
 
     KUBECTL_DELETE_CMD = "kubectl delete pods " \
                          "--selector=TRAINS=agent " \
-                         "--field-selector=status.phase!=Pending,status.phase!=Running"
+                         "--field-selector=status.phase!=Pending,status.phase!=Running " \
+                         "--namespace=trains"
 
     CONTAINER_BASH_SCRIPT = \
         "export DEBIAN_FRONTEND='noninteractive'; " \
        "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean ; " \
        "chown -R root /root/.cache/pip ; " \
        "apt-get update ; " \
        "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0 ; " \
        "(which python3 && python3 -m pip --version) || apt-get install -y python3-pip ; " \
        "python3 -m pip install trains-agent ; " \
-        "python3 -m trains_agent execute --full-monitoring --require-queue --id {}"
+        "python3 -m trains_agent execute --full-monitoring --require-queue --id {} ; "
 
-    def __init__(self, k8s_pending_queue_name=None, kubectl_cmd=None, container_bash_script=None, debug=False):
+    AGENT_LABEL = "TRAINS=agent"
+    LIMIT_POD_LABEL = "ai.allegro.agent.serial=pod-{pod_number}"
+
+    def __init__(
+            self,
+            k8s_pending_queue_name=None,
+            kubectl_cmd=None,
+            container_bash_script=None,
+            debug=False,
+            ports_mode=False,
+            num_of_services=20,
+    ):
         """
         Initialize the k8s integration glue layer daemon
 
         :param str k8s_pending_queue_name: queue name to use when task is pending in the k8s scheduler
-        :param str|callable kubectl_cmd: kubectl command line str, supports formating (default: KUBECTL_RUN_CMD)
+        :param str|callable kubectl_cmd: kubectl command line str, supports formatting (default: KUBECTL_RUN_CMD)
             example: "task={task_id} image={docker_image} queue_id={queue_id}"
             or a callable function: kubectl_cmd(task_id, docker_image, queue_id, task_data)
         :param str container_bash_script: container bash script to be executed in k8s (default: CONTAINER_BASH_SCRIPT)
         :param bool debug: Switch logging on
+        :param bool ports_mode: Adds a label to each pod which can be used in services in order to expose ports.
+            Requires the `num_of_services` parameter.
+        :param int num_of_services: Number of k8s services configured in the cluster. Required if `port_mode` is True.
+            (default: 20)
         """
         super(K8sIntegration, self).__init__()
         self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
@@ -56,12 +76,15 @@ class K8sIntegration(Worker):
         if debug:
             self.log.logger.disabled = False
             self.log.logger.setLevel(logging.INFO)
+        self.ports_mode = ports_mode
+        self.num_of_services = num_of_services
 
-    def run_one_task(self, queue: Text, task_id: Text, worker_args=None):
+    def run_one_task(self, queue: Text, task_id: Text, worker_args=None, **_):
         task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
 
         # push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
         try:
             self._session.api_client.tasks.reset(task_id)
             self._session.api_client.tasks.enqueue(task_id, queue=self.k8s_pending_queue_name,
                                                    status_reason='k8s pending scheduler')
         except Exception as e:
@@ -78,28 +101,74 @@ class K8sIntegration(Worker):
         # take the first part, this is the docker image name (not arguments)
         docker_image = docker_image.split()[0]
 
-        create_trains_conf = "echo '{}' >> ~/trains.conf && ".format(
-            HOCONConverter.to_hocon(self._session.config._config))
+        hocon_config_encoded = HOCONConverter.to_hocon(
+            self._session.config._config
+        ).encode('ascii')
+        create_trains_conf = "echo '{}' | base64 --decode >> ~/trains.conf && ".format(
+            base64.b64encode(
+                hocon_config_encoded
+            ).decode('ascii')
+        )
 
         if callable(self.kubectl_cmd):
             kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data)
         else:
-            kubectl_cmd = self.kubectl_cmd.format(task_id=task_id, docker_image=docker_image, queue_id=queue)
+            kubectl_cmd = self.kubectl_cmd.format(
+                task_id=task_id,
+                docker_image=docker_image,
+                queue_id=queue
+            )
 
-        # make sure we gave a list
+        # Search for a free pod number
+        pod_number = 1
+        while self.ports_mode:
+            kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n trains".format(
+                pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
+                agent_label=self.AGENT_LABEL
+            )
+            process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            output, error = process.communicate()
+            if not output:
+                # No such pod exist so we can use the pod_number we found
+                break
+            if pod_number >= self.num_of_services:
+                # All pod numbers are taken, exit
+                self.log.info(
+                    "All k8s services are in use, task '{}' will be enqueued back to queue '{}'".format(
+                        task_id, queue
+                    )
+                )
+                self._session.api_client.tasks.reset(task_id)
+                self._session.api_client.tasks.enqueue(task_id, queue=queue)
+                return
+            pod_number += 1
+
+        # make sure we provide a list
         if isinstance(kubectl_cmd, str):
             kubectl_cmd = kubectl_cmd.split()
 
-        kubectl_cmd += ["--labels=TRAINS=agent", "--command", "--", "/bin/sh", "-c",
-                        create_trains_conf + self.container_bash_script.format(task_id)]
+        labels = [self.AGENT_LABEL]
+        message = "K8s scheduling experiment task id={}".format(task_id)
+        if self.ports_mode:
+            labels.insert(0, self.LIMIT_POD_LABEL.format(pod_number=pod_number))
+            message += " pod #{}".format(pod_number)
+
+        kubectl_cmd += [
+            "--labels=" + ",".join(labels),
+            "--command",
+            "--",
+            "/bin/sh",
+            "-c",
+            create_trains_conf + self.container_bash_script.format(task_id),
+        ]
         process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         output, error = process.communicate()
-        self.log.info("K8s scheduling experiment task id={}".format(task_id))
+        self.log.info(message)
         if error:
             self.log.error("Running kubectl encountered an error: {}".format(
                 error if isinstance(error, str) else error.decode()))
 
-    def run_tasks_loop(self, queues: List[Text], worker_params):
+    def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
         """
         :summary: Pull and run tasks from queues.
         :description: 1. Go through ``queues`` by order.
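The hunk above also changes how the agent configuration is injected into the pod: instead of echoing the raw HOCON text into ~/trains.conf, it echoes a base64-encoded copy and decodes it inside the container, which avoids shell-quoting issues with the configuration contents. A minimal standalone sketch of that round trip, using a stand-in configuration string in place of the real self._session.config serialization, looks like this:

# Illustrative sketch only -- not part of the commit. The config string below is
# a stand-in; the glue layer serializes self._session.config with HOCONConverter.
import base64

hocon_config = 'api { web_server: "http://localhost:8080" }'  # stand-in config text
encoded = base64.b64encode(hocon_config.encode('ascii')).decode('ascii')

# Mirrors the command the glue layer now prepends to the container bash script:
create_trains_conf = "echo '{}' | base64 --decode >> ~/trains.conf && ".format(encoded)

# Inside the pod, base64 --decode restores the original text unchanged.
assert base64.b64decode(encoded).decode('ascii') == hocon_config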
@@ -160,15 +229,15 @@ class K8sIntegration(Worker):
             if self._session.config["agent.reload_config"]:
                 self.reload_config()
 
-    def k8s_daemon(self, queues):
+    def k8s_daemon(self, queue):
         """
         Start the k8s Glue service.
-        This service will be pulling tasks from *queues* and scheduling them for execution using kubectl.
+        This service will be pulling tasks from *queue* and scheduling them for execution using kubectl.
         Notice all scheduled tasks are pushed back into K8S_PENDING_QUEUE,
         and popped when execution actually starts. This creates full visibility into the k8s scheduler.
         Manually popping a task from the K8S_PENDING_QUEUE,
         will cause the k8s scheduler to skip the execution once the scheduled tasks needs to be executed
 
-        :param list(str) queues: List of queue names to pull from
+        :param list(str) queue: queue name to pull from
         """
-        return self.daemon(queues=queues, log_level=logging.INFO, foreground=True, docker=False)
+        return self.daemon(queues=[ObjectID(name=queue)], log_level=logging.INFO, foreground=True, docker=False)