From 92fc8e838f6b078c348069eb6b6a2feeb88f7c61 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Tue, 20 Oct 2020 14:17:30 +0300
Subject: [PATCH] Add K8s glue support for limited number of services exposing
 ports

---
 examples/k8s_glue_example.py |  34 +++++++++++
 trains_agent/glue/k8s.py     | 107 ++++++++++++++++++++++++++++-------
 2 files changed, 122 insertions(+), 19 deletions(-)
 create mode 100644 examples/k8s_glue_example.py

diff --git a/examples/k8s_glue_example.py b/examples/k8s_glue_example.py
new file mode 100644
index 0000000..6c1b6bf
--- /dev/null
+++ b/examples/k8s_glue_example.py
@@ -0,0 +1,34 @@
+"""
+This example assumes you have preconfigured services with selectors in the form of
+ "ai.allegro.agent.serial=pod-<number>" and a targetPort of 10022.
+The K8sIntegration component will label each pod accordingly.
+"""
+from argparse import ArgumentParser
+
+from trains_agent.glue.k8s import K8sIntegration
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "--queue", type=str, help="Queue to pull tasks from"
+    )
+    parser.add_argument(
+        "--ports-mode", action='store_true', default=False,
+        help="Ports-mode will add a label to the pod which can be used in services in order to expose ports"
+    )
+    parser.add_argument(
+        "--num-of-services", type=int, default=20,
+        help="Specify the number of k8s services to be used. Use only with ports-mode."
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    k8s = K8sIntegration(ports_mode=args.ports_mode, num_of_services=args.num_of_services)
+    k8s.k8s_daemon(args.queue)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/trains_agent/glue/k8s.py b/trains_agent/glue/k8s.py
index 9f0a061..9fea7a9 100644
--- a/trains_agent/glue/k8s.py
+++ b/trains_agent/glue/k8s.py
@@ -1,5 +1,6 @@
 from __future__ import print_function, division, unicode_literals
 
+import base64
 import logging
 import os
 import subprocess
@@ -12,39 +13,58 @@ from trains_agent.commands.events import Events
 from trains_agent.commands.worker import Worker
 from trains_agent.helper.process import get_bash_output
 from trains_agent.helper.resource_monitor import ResourceMonitor
+from trains_agent.interface.base import ObjectID
 
 
 class K8sIntegration(Worker):
     K8S_PENDING_QUEUE = "k8s_scheduler"
 
-    KUBECTL_RUN_CMD = "kubectl run trains_id_{task_id} " \
+    KUBECTL_RUN_CMD = "kubectl run trains-id-{task_id} " \
                       "--image {docker_image} " \
                       "--restart=Never --replicas=1 " \
-                      "--generator=run-pod/v1"
+                      "--generator=run-pod/v1 " \
+                      "--namespace=trains"
 
     KUBECTL_DELETE_CMD = "kubectl delete pods " \
                          "--selector=TRAINS=agent " \
-                         "--field-selector=status.phase!=Pending,status.phase!=Running"
+                         "--field-selector=status.phase!=Pending,status.phase!=Running " \
+                         "--namespace=trains"
 
     CONTAINER_BASH_SCRIPT = \
+        "export DEBIAN_FRONTEND='noninteractive'; " \
         "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean ; " \
         "chown -R root /root/.cache/pip ; " \
         "apt-get update ; " \
         "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0 ; " \
         "(which python3 && python3 -m pip --version) || apt-get install -y python3-pip ; " \
         "python3 -m pip install trains-agent ; " \
-        "python3 -m trains_agent execute --full-monitoring --require-queue --id {}"
+        "python3 -m trains_agent execute --full-monitoring --require-queue --id {} ; "
 
-    def __init__(self, k8s_pending_queue_name=None, kubectl_cmd=None, container_bash_script=None, debug=False):
+    AGENT_LABEL = "TRAINS=agent"
+    LIMIT_POD_LABEL = "ai.allegro.agent.serial=pod-{pod_number}"
+
+    def __init__(
+            self,
+            k8s_pending_queue_name=None,
+            kubectl_cmd=None,
+            container_bash_script=None,
+            debug=False,
+            ports_mode=False,
+            num_of_services=20,
+    ):
         """
         Initialize the k8s integration glue layer daemon
 
         :param str k8s_pending_queue_name: queue name to use when task is pending in the k8s scheduler
-        :param str|callable kubectl_cmd: kubectl command line str, supports formating (default: KUBECTL_RUN_CMD)
+        :param str|callable kubectl_cmd: kubectl command line str, supports formatting (default: KUBECTL_RUN_CMD)
             example: "task={task_id} image={docker_image} queue_id={queue_id}"
             or a callable function: kubectl_cmd(task_id, docker_image, queue_id, task_data)
         :param str container_bash_script: container bash script to be executed in k8s (default: CONTAINER_BASH_SCRIPT)
         :param bool debug: Switch logging on
+        :param bool ports_mode: Adds a label to each pod which can be used in services in order to expose ports.
+            Requires the `num_of_services` parameter.
+        :param int num_of_services: Number of k8s services configured in the cluster. Required if `ports_mode` is True.
+            (default: 20)
         """
         super(K8sIntegration, self).__init__()
         self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
@@ -56,12 +76,15 @@ class K8sIntegration(Worker):
         if debug:
             self.log.logger.disabled = False
             self.log.logger.setLevel(logging.INFO)
+        self.ports_mode = ports_mode
+        self.num_of_services = num_of_services
 
-    def run_one_task(self, queue: Text, task_id: Text, worker_args=None):
+    def run_one_task(self, queue: Text, task_id: Text, worker_args=None, **_):
         task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
 
         # push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
         try:
+            self._session.api_client.tasks.reset(task_id)
             self._session.api_client.tasks.enqueue(task_id, queue=self.k8s_pending_queue_name,
                                                    status_reason='k8s pending scheduler')
         except Exception as e:
@@ -78,28 +101,74 @@ class K8sIntegration(Worker):
         # take the first part, this is the docker image name (not arguments)
         docker_image = docker_image.split()[0]
 
-        create_trains_conf = "echo '{}' >> ~/trains.conf && ".format(
-            HOCONConverter.to_hocon(self._session.config._config))
+        hocon_config_encoded = HOCONConverter.to_hocon(
+                self._session.config._config
+            ).encode('ascii')
+        create_trains_conf = "echo '{}' | base64 --decode >> ~/trains.conf && ".format(
+            base64.b64encode(
+                hocon_config_encoded
+            ).decode('ascii')
+        )
 
         if callable(self.kubectl_cmd):
             kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data)
         else:
-            kubectl_cmd = self.kubectl_cmd.format(task_id=task_id, docker_image=docker_image, queue_id=queue)
+            kubectl_cmd = self.kubectl_cmd.format(
+                task_id=task_id,
+                docker_image=docker_image,
+                queue_id=queue
+            )
 
-        # make sure we gave a list
+        # Search for a free pod number
+        pod_number = 1
+        while self.ports_mode:
+            kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n trains".format(
+                pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
+                agent_label=self.AGENT_LABEL
+            )
+            process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            output, error = process.communicate()
+            if not output:
+                # No such pod exists, so we can use the pod_number we found
+                break
+            if pod_number >= self.num_of_services:
+                # All pod numbers are taken, exit
+                self.log.info(
+                    "All k8s services are in use, task '{}' will be enqueued back to queue '{}'".format(
+                        task_id, queue
+                    )
+                )
+                self._session.api_client.tasks.reset(task_id)
+                self._session.api_client.tasks.enqueue(task_id, queue=queue)
+                return
+            pod_number += 1
+
+        # make sure we provide a list
         if isinstance(kubectl_cmd, str):
             kubectl_cmd = kubectl_cmd.split()
 
-        kubectl_cmd += ["--labels=TRAINS=agent", "--command", "--", "/bin/sh", "-c",
-                        create_trains_conf + self.container_bash_script.format(task_id)]
+        labels = [self.AGENT_LABEL]
+        message = "K8s scheduling experiment task id={}".format(task_id)
+        if self.ports_mode:
+            labels.insert(0, self.LIMIT_POD_LABEL.format(pod_number=pod_number))
+            message += " pod #{}".format(pod_number)
+
+        kubectl_cmd += [
+            "--labels=" + ",".join(labels),
+            "--command",
+            "--",
+            "/bin/sh",
+            "-c",
+            create_trains_conf + self.container_bash_script.format(task_id),
+        ]
         process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         output, error = process.communicate()
-        self.log.info("K8s scheduling experiment task id={}".format(task_id))
+        self.log.info(message)
         if error:
             self.log.error("Running kubectl encountered an error: {}".format(
                 error if isinstance(error, str) else error.decode()))
 
-    def run_tasks_loop(self, queues: List[Text], worker_params):
+    def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
         """
         :summary: Pull and run tasks from queues.
         :description: 1. Go through ``queues`` by order.
@@ -160,15 +229,15 @@ class K8sIntegration(Worker):
             if self._session.config["agent.reload_config"]:
                 self.reload_config()
 
-    def k8s_daemon(self, queues):
+    def k8s_daemon(self, queue):
         """
         Start the k8s Glue service.
-        This service will be pulling tasks from *queues* and scheduling them for execution using kubectl.
+        This service will be pulling tasks from *queue* and scheduling them for execution using kubectl.
         Notice all scheduled tasks are pushed back into K8S_PENDING_QUEUE,
         and popped when execution actually starts. This creates full visibility into the k8s scheduler.
         Manually popping a task from the K8S_PENDING_QUEUE,
         will cause the k8s scheduler to skip the execution once the scheduled tasks needs to be executed
 
-        :param list(str) queues: List of queue names to pull from
+        :param str queue: queue name to pull from
         """
-        return self.daemon(queues=queues, log_level=logging.INFO, foreground=True, docker=False)
+        return self.daemon(queues=[ObjectID(name=queue)], log_level=logging.INFO, foreground=True, docker=False)