From 84706ba66d560896cfcc9f0837696d03bc5d6aa0 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Tue, 3 Aug 2021 11:23:33 +0300
Subject: [PATCH] Add docker example for running the agent k8s glue as a pod in a k8s cluster

---
 docker/k8s-glue/README                      |   8 +
 docker/k8s-glue/glue-build/Dockerfile       |  18 +
 docker/k8s-glue/glue-build/clearml.conf     | 402 ++++++++++++++++++
 docker/k8s-glue/glue-build/entrypoint.sh    |  27 ++
 .../k8s-glue/glue-build/k8s_glue_example.py |  94 ++++
 docker/k8s-glue/glue-build/setup.sh         |  25 ++
 docker/k8s-glue/k8s-glue.yml                |  54 +++
 docker/k8s-glue/pod_template.yml            |  13 +
 8 files changed, 641 insertions(+)
 create mode 100644 docker/k8s-glue/README
 create mode 100644 docker/k8s-glue/glue-build/Dockerfile
 create mode 100644 docker/k8s-glue/glue-build/clearml.conf
 create mode 100644 docker/k8s-glue/glue-build/entrypoint.sh
 create mode 100644 docker/k8s-glue/glue-build/k8s_glue_example.py
 create mode 100644 docker/k8s-glue/glue-build/setup.sh
 create mode 100644 docker/k8s-glue/k8s-glue.yml
 create mode 100644 docker/k8s-glue/pod_template.yml

diff --git a/docker/k8s-glue/README b/docker/k8s-glue/README
new file mode 100644
index 0000000..7f7a8a2
--- /dev/null
+++ b/docker/k8s-glue/README
@@ -0,0 +1,8 @@
+This folder contains an example docker and templates for running the k8s glue as a pod in a k8s cluster
+
+Please note that ClearML credentials and server addresses should either be filled in the clearml.conf file before
+ building the glue docker or provided in the k8s-glue.yml template.
+
+To run, you'll need to:
+* Create a secret from pod_template.yml: `kubectl create secret generic k8s-glue-pod-template --from-file=pod_template.yml`
+* Apply the k8s glue template: `kubectl apply -f k8s-glue.yml`
\ No newline at end of file
diff --git a/docker/k8s-glue/glue-build/Dockerfile b/docker/k8s-glue/glue-build/Dockerfile
new file mode 100644
index 0000000..33560f9
--- /dev/null
+++ b/docker/k8s-glue/glue-build/Dockerfile
@@ -0,0 +1,18 @@
+FROM ubuntu:18.04
+
+USER root
+WORKDIR /root
+
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+ENV PYTHONIOENCODING=UTF-8
+
+COPY ./entrypoint.sh /root/entrypoint.sh
+COPY ./k8s_glue_example.py /root/k8s_glue_example.py
+COPY ./setup.sh /root/setup.sh
+COPY ./clearml.conf /root/clearml.conf
+
+RUN /root/setup.sh
+
+ENTRYPOINT ["/root/entrypoint.sh"]
\ No newline at end of file
diff --git a/docker/k8s-glue/glue-build/clearml.conf b/docker/k8s-glue/glue-build/clearml.conf
new file mode 100644
index 0000000..4959473
--- /dev/null
+++ b/docker/k8s-glue/glue-build/clearml.conf
@@ -0,0 +1,402 @@
+# CLEARML-AGENT configuration file
+api {
+    # Notice: 'host' is the api server (default port 8008), not the web server.
+    api_server: ""
+    web_server: ""
+    files_server: ""
+    # Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
+    credentials {"access_key": "", "secret_key": ""}
+}
+
+# Set GIT user/pass credentials
+# leave blank for GIT SSH credentials
+agent.git_user=""
+agent.git_pass=""
+
+# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
+agent.package_manager.extra_index_url= [
+
+]
+
+agent {
+    # unique name of this worker, if None, created based on hostname:process_id
+    # Override with os environment: CLEARML_WORKER_ID
+    # worker_id: "clearml-agent-machine1:gpu0"
+    worker_id: ""
+
+    # worker name, replaces the hostname when creating a unique name for this worker
+    # Override with os environment: CLEARML_WORKER_NAME
+    # worker_name: "clearml-agent-machine1"
+    worker_name: ""
+
+    # Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
+    # leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
+    # git_user: ""
+    # git_pass: ""
+    # git_host: ""
+
+    # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
+    force_git_ssh_protocol: false
+    # Force a specific SSH port when converting http to ssh links (the domain is kept the same)
+    # force_git_ssh_port: 0
+    # Force a specific SSH username when converting http to ssh links (the default username is 'git')
+    # force_git_ssh_user: git
+
+    # Set the python version to use when creating the virtual environment and launching the experiment
+    # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
+    # The default is the python executing the clearml_agent
+    python_binary: ""
+    # ignore any requested python version (Default: False, if a Task was using a
+    # specific python version and the system supports multiple python the agent will use the requested python version)
+    # ignore_requested_python_version: true
+
+    # select python package manager:
+    # currently supported pip and conda
+    # poetry is used if pip selected and repository contains poetry.lock file
+    package_manager: {
+        # supported options: pip, conda, poetry
+        type: pip,
+
+        # specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
+        pip_version: "<20.2",
+
+        # virtual environment inheres packages from system
+        system_site_packages: false,
+
+        # install with --upgrade
+        force_upgrade: false,
+
+        # additional artifact repositories to use when installing python packages
+        # extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]
+
+        # additional conda channels to use when installing with conda package manager
+        conda_channels: ["pytorch", "conda-forge", "defaults", ]
+
+        # If set to true, Task's "installed packages" are ignored,
+        # and the repository's "requirements.txt" is used instead
+        # force_repo_requirements_txt: false
+
+        # set the priority packages to be installed before the rest of the required packages
+        # priority_packages: ["cython", "numpy", "setuptools", ]
+
+        # set the optional priority packages to be installed before the rest of the required packages,
+        # In case a package installation fails, the package will be ignored,
+        # and the virtual environment process will continue
+        # priority_optional_packages: ["pygobject", ]
+
+        # set the post packages to be installed after all the rest of the required packages
+        # post_packages: ["horovod", ]
+
+        # set the optional post packages to be installed after all the rest of the required packages,
+        # In case a package installation fails, the package will be ignored,
+        # and the virtual environment process will continue
+        # post_optional_packages: []
+
+        # set to True to support torch nightly build installation,
+        # notice: torch nightly builds are ephemeral and are deleted from time to time
+        torch_nightly: false,
+    },
+
+    # target folder for virtual environments builds, created when executing experiment
+    venvs_dir = ~/.clearml/venvs-builds
+
+    # cached virtual environment folder
+    venvs_cache: {
+        # maximum number of cached venvs
+        max_entries: 10
+        # minimum required free space to allow for cache entry, disable by passing 0 or negative value
+        free_space_threshold_gb: 2.0
+        # unmark to enable virtual environment caching
+        # path: ~/.clearml/venvs-cache
+    },
+
+    # cached git clone folder
+    vcs_cache: {
+        enabled: true,
+        path: ~/.clearml/vcs-cache
+    },
+
+    # use venv-update in order to accelerate python virtual environment building
+    # Still in beta, turned off by default
+    venv_update: {
+        enabled: false,
+    },
+
+    # cached folder for specific python package download (used for pytorch package caching)
+    pip_download_cache {
+        enabled: true,
+        path: ~/.clearml/pip-download-cache
+    },
+
+    translate_ssh: true,
+    # reload configuration file every daemon execution
+    reload_config: false,
+
+    # pip cache folder mapped into docker, used for python package caching
+    docker_pip_cache = ~/.clearml/pip-cache
+    # apt cache folder mapped into docker, used for ubuntu package caching
+    docker_apt_cache = ~/.clearml/apt-cache
+
+    # optional arguments to pass to docker image
+    # these are local for this agent and will not be updated in the experiment's docker_cmd section
+    # extra_docker_arguments: ["--ipc=host", ]
+
+    # optional shell script to run in docker when started before the experiment is started
+    # extra_docker_shell_script: ["apt-get install -y bindfs", ]
+
+    # Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
+    # for backwards compatibility reasons, true as default,
+    # change to false to skip installation and decrease docker spin up time
+    # docker_install_opencv_libs: true
+
+    # optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
+    # If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
+    # Outside of the specified time-spans, the agent will be idle.
+    # Defined using a list of items of the format: "<hours> <days>".
+    # hours - use values 0-23, single values would count as start hour and end at midnight.
+    # days - use days in abbreviated format (SUN-SAT)
+    # use '-' for ranges and ',' to separate singular values.
+    # for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
+    # uptime: ["17-20 SUN,TUE"]
+
+    # optional downtime configuration, can be used only when uptime is not used.
+    # If downtime is specified, agent will be idle in the time-spans defined here.
+    # Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
+    # Use the same format as described above for uptime
+    # downtime: []
+
+    # set to true in order to force "docker pull" before running an experiment using a docker image.
+    # This makes sure the docker image is updated.
+    docker_force_pull: false
+
+    default_docker: {
+        # default docker image to use when running in docker mode
+        image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
+
+        # optional arguments to pass to docker image
+        # arguments: ["--ipc=host", ]
+    }
+
+    # set the OS environments based on the Task's Environment section before launching the Task process.
+    enable_task_env: false
+
+    # set the initial bash script to execute at the startup of any docker.
+    # all lines will be executed regardless of their exit code.
+    # {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
+    # docker_init_bash_script = [
+    #     "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
+    #     "chown -R root /root/.cache/pip",
+    #     "apt-get update",
+    #     "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
+    #     "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
+    # ]
+
+    # set the preprocessing bash script to execute at the startup of any docker.
+    # all lines will be executed regardless of their exit code.
+    # docker_preprocess_bash_script = [
+    #     "echo \"starting docker\"",
+    #]
+
+    # If False replace \r with \n and display full console output
+    # default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
+    # suppress_carriage_return: true
+
+    # cuda versions used for solving pytorch wheel packages
+    # should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
+    # cuda_version: 10.1
+    # cudnn_version: 7.6
+
+    # Hide docker environment variables containing secrets when printing out the docker command by replacing their
+    # values with "********". Turning this feature on will hide the following environment variables values:
+    # CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
+    # To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
+    # your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
+    # docker command, set:
+    # extra_keys: ["MY_SPECIAL_PASSWORD"]
+    hide_docker_command_env_vars {
+        enabled: true
+        extra_keys: []
+    }
+}
+
+sdk {
+    # ClearML - default SDK configuration
+
+    storage {
+        cache {
+            # Defaults to system temp folder / cache
+            default_base_dir: "~/.clearml/cache"
+            size {
+                # max_used_bytes = -1
+                min_free_bytes = 10GB
+                # cleanup_margin_percent = 5%
+            }
+        }
+
+        direct_access: [
+            # Objects matching are considered to be available for direct access, i.e. they will not be downloaded
+            # or cached, and any download request will return a direct reference.
+            # Objects are specified in glob format, available for url and content_type.
+            { url: "file://*" }  # file-urls are always directly referenced
+        ]
+    }
+
+    metrics {
+        # History size for debug files per metric/variant. For each metric/variant combination with an attached file
+        # (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
+        # X files are stored in the upload destination for each metric/variant combination.
+        file_history_size: 100
+
+        # Max history size for matplotlib imshow files per plot title.
+        # File names for the uploaded images will be recycled in such a way that no more than
+        # X images are stored in the upload destination for each matplotlib plot title.
+        matplotlib_untitled_history_size: 100
+
+        # Limit the number of digits after the dot in plot reporting (reducing plot report size)
+        # plot_max_num_digits: 5
+
+        # Settings for generated debug images
+        images {
+            format: JPEG
+            quality: 87
+            subsampling: 0
+        }
+
+        # Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
+        tensorboard_single_series_per_graph: false
+    }
+
+    network {
+        metrics {
+            # Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
+            # a specific iteration
+            file_upload_threads: 4
+
+            # Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
+            # being sent for upload
+            file_upload_starvation_warning_sec: 120
+        }
+
+        iteration {
+            # Max number of retries when getting frames if the server returned an error (http code 500)
+            max_retries_on_server_error: 5
+            # Backoff factory for consecutive retry attempts.
+            # SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
+            retry_backoff_factor_sec: 10
+        }
+    }
+    aws {
+        s3 {
+            # S3 credentials, used for read/write access by various SDK elements
+
+            # default, used for any bucket not specified below
+            key: ""
+            secret: ""
+            region: ""
+
+            credentials: [
+                # specifies key/secret credentials to use when handling s3 urls (read or write)
+                # {
+                #     bucket: "my-bucket-name"
+                #     key: "my-access-key"
+                #     secret: "my-secret-key"
+                # },
+                # {
+                #     # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
+                #     host: "my-minio-host:9000"
+                #     key: "12345678"
+                #     secret: "12345678"
+                #     multipart: false
+                #     secure: false
+                # }
+            ]
+        }
+        boto3 {
+            pool_connections: 512
+            max_multipart_concurrency: 16
+        }
+    }
+    google.storage {
+        # # Default project and credentials file
+        # # Will be used when no bucket configuration is found
+        # project: "clearml"
+        # credentials_json: "/path/to/credentials.json"
+
+        # # Specific credentials per bucket and sub directory
+        # credentials = [
+        #     {
+        #         bucket: "my-bucket"
+        #         subdir: "path/in/bucket"  # Not required
+        #         project: "clearml"
+        #         credentials_json: "/path/to/credentials.json"
+        #     },
+        # ]
+    }
+    azure.storage {
+        # containers: [
+        #     {
+        #         account_name: "clearml"
+        #         account_key: "secret"
+        #         # container_name:
+        #     }
+        # ]
+    }
+
+    log {
+        # debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
+        null_log_propagate: false
+        task_log_buffer_capacity: 66
+
+        # disable urllib info and lower levels
+        disable_urllib3_info: true
+    }
+
+    development {
+        # Development-mode options
+
+        # dev task reuse window
+        task_reuse_time_window_in_hours: 72.0
+
+        # Run VCS repository detection asynchronously
+        vcs_repo_detect_async: true
+
+        # Store uncommitted git/hg source code diff in experiment manifest when training in development mode
+        # This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
+        store_uncommitted_code_diff: true
+
+        # Support stopping an experiment in case it was externally stopped, status was changed or task was reset
+        support_stopping: true
+
+        # Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
+        default_output_uri: ""
+
+        # Default auto generated requirements optimize for smaller requirements
+        # If True, analyze the entire repository regardless of the entry point.
+        # If False, first analyze the entry point script, if it does not contain other to local files,
+        # do not analyze the entire repository.
+        force_analyze_entire_repo: false
+
+        # If set to true, *clearml* update message will not be printed to the console
+        # this value can be overwritten with os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
+        suppress_update_message: false
+
+        # If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
+        detect_with_pip_freeze: false
+
+        # Development mode worker
+        worker {
+            # Status report period in seconds
+            report_period_sec: 2
+
+            # ping to the server - check connectivity
+            ping_period_sec: 30
+
+            # Log all stdout & stderr
+            log_stdout: true
+
+            # compatibility feature, report memory usage for the entire machine
+            # default (false), report only on the running process and its sub-processes
+            report_global_mem_used: false
+        }
+    }
+}
diff --git a/docker/k8s-glue/glue-build/entrypoint.sh b/docker/k8s-glue/glue-build/entrypoint.sh
new file mode 100644
index 0000000..f17ea0b
--- /dev/null
+++ b/docker/k8s-glue/glue-build/entrypoint.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -x
+
+export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}
+
+if [ -z "$CLEARML_FILES_HOST" ]; then
+    CLEARML_HOST_IP=${CLEARML_HOST_IP:-${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}}
+fi
+
+export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
+export CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
+export CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}
+
+echo $CLEARML_FILES_HOST $CLEARML_WEB_HOST $CLEARML_API_HOST 1>&2
+
+if [ -z "$CLEARML_AGENT_NO_UPDATE" ]; then
+    if [ -n "$CLEARML_AGENT_UPDATE_REPO" ]; then
+        python3 -m pip install -q -U "$CLEARML_AGENT_UPDATE_REPO"
+    else
+        python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
+    fi
+fi
+
+QUEUE=${K8S_GLUE_QUEUE:-k8s_glue}
+MAX_PODS=${K8S_GLUE_MAX_PODS:-2}
+EXTRA_ARGS=${K8S_GLUE_EXTRA_ARGS:-}
+
+python3 k8s_glue_example.py --queue ${QUEUE} --max-pods ${MAX_PODS} ${EXTRA_ARGS}
diff --git a/docker/k8s-glue/glue-build/k8s_glue_example.py b/docker/k8s-glue/glue-build/k8s_glue_example.py
new file mode 100644
index 0000000..dc69c37
--- /dev/null
+++ b/docker/k8s-glue/glue-build/k8s_glue_example.py
@@ -0,0 +1,94 @@
+"""
+This example assumes you have preconfigured services with selectors in the form of
+ "ai.allegro.agent.serial=pod-<number>" and a targetPort of 10022.
+The K8sIntegration component will label each pod accordingly.
+"""
+from argparse import ArgumentParser
+
+from clearml_agent.glue.k8s import K8sIntegration
+
+
+def parse_args():
+    parser = ArgumentParser()
+    group = parser.add_mutually_exclusive_group()
+
+    parser.add_argument(
+        "--queue", type=str, help="Queue to pull tasks from"
+    )
+    group.add_argument(
+        "--ports-mode", action='store_true', default=False,
+        help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports. "
+             "Should not be used with max-pods"
+    )
+    parser.add_argument(
+        "--num-of-services", type=int, default=20,
+        help="Specify the number of k8s services to be used. Use only with ports-mode."
+    )
+    parser.add_argument(
+        "--base-port", type=int,
+        help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
+             "For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num, "
+             "e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
+    )
+    parser.add_argument(
+        "--base-pod-num", type=int, default=1,
+        help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
+             "service (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--gateway-address", type=str, default=None,
+        help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB"
+    )
+    parser.add_argument(
+        "--pod-clearml-conf", type=str,
+        help="Configuration file to be used by the pod itself (if not provided, current configuration is used)"
+    )
+    parser.add_argument(
+        "--overrides-yaml", type=str,
+        help="YAML file containing pod overrides to be used when launching a new pod"
+    )
+    parser.add_argument(
+        "--template-yaml", type=str,
+        help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
+             "and overrides are ignored, otherwise it will be scheduled with kubectl run"
+    )
+    parser.add_argument(
+        "--ssh-server-port", type=int, default=0,
+        help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
+    )
+    parser.add_argument(
+        "--namespace", type=str,
+        help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
+    )
+    group.add_argument(
+        "--max-pods", type=int,
+        help="Limit the maximum number of pods that this service can run at the same time."
+             " Should not be used with ports-mode"
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    user_props_cb = None
+    if args.ports_mode and args.base_port:
+        def k8s_user_props_cb(pod_number=0):
+            user_prop = {"k8s-pod-port": args.base_port + pod_number}
+            if args.gateway_address:
+                user_prop["k8s-gateway-address"] = args.gateway_address
+            return user_prop
+        user_props_cb = k8s_user_props_cb
+
+    k8s = K8sIntegration(
+        ports_mode=args.ports_mode, num_of_services=args.num_of_services, base_pod_num=args.base_pod_num,
+        user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
+        template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
+            ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
+        namespace=args.namespace, max_pods_limit=args.max_pods or None,
+    )
+    k8s.k8s_daemon(args.queue)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docker/k8s-glue/glue-build/setup.sh b/docker/k8s-glue/glue-build/setup.sh
new file mode 100644
index 0000000..d0459d6
--- /dev/null
+++ b/docker/k8s-glue/glue-build/setup.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+chmod +x /root/entrypoint.sh
+
+apt-get update -y
+apt-get dist-upgrade -y
+apt-get install -y curl unzip less locales
+
+locale-gen en_US.UTF-8
+
+apt-get install -y curl python3-pip git
+python3 -m pip install -U pip
+python3 -m pip install clearml-agent
+python3 -m pip install -U "cryptography>=2.9"
+
+curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+unzip awscliv2.zip
+./aws/install
+
+curl -o kubectl https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl
+chmod +x ./kubectl && mkdir -p $HOME/bin && cp ./kubectl $HOME/bin/kubectl && export PATH=$PATH:$HOME/bin
+
+curl -o aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/aws-iam-authenticator
+chmod +x ./aws-iam-authenticator && mkdir -p $HOME/bin && cp ./aws-iam-authenticator $HOME/bin/aws-iam-authenticator && export PATH=$PATH:$HOME/bin
+echo 'export PATH=$PATH:$HOME/bin' >> ~/.bashrc
diff --git a/docker/k8s-glue/k8s-glue.yml b/docker/k8s-glue/k8s-glue.yml
new file mode 100644
index 0000000..4b3698d
--- /dev/null
+++ b/docker/k8s-glue/k8s-glue.yml
@@ -0,0 +1,54 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: k8s-glue
+spec:
+  containers:
+  - name: k8s-glue-container
+    image: allegroai/clearml-agent-k8s:test
+    imagePullPolicy: Always
+    command: [
+        "/bin/bash",
+        "-c",
+        "echo \"api.credentials.access_key: $CLEARML_API_ACCESS_KEY\" >> ~/clearml.conf \
+        && echo \"api.credentials.secret_key: $CLEARML_API_SECRET_KEY\" >> ~/clearml.conf \
+        && echo \"api.api_server: $CLEARML_API_HOST\" >> ~/clearml.conf \
+        && echo \"api.web_server: $CLEARML_WEB_HOST\" >> ~/clearml.conf \
+        && echo \"api.files_server: $CLEARML_FILES_HOST\" >> ~/clearml.conf \
+        && source /root/.bashrc \
+        && export PATH=$PATH:$HOME/bin \
+        && /root/entrypoint.sh
+        "
+    ]
+    volumeMounts:
+    - name: pod-template
+      mountPath: /root/template
+    env:
+    - name: CLEARML_API_HOST
+      value: ""
+    - name: CLEARML_WEB_HOST
+      value: ""
+    - name: CLEARML_FILES_HOST
+      value: ""
+#    - name: K8S_GLUE_MAX_PODS
+#      value: "2"
+    - name: K8S_GLUE_QUEUE
+      value: "k8s-glue"
+    - name: K8S_GLUE_EXTRA_ARGS
+      value: "--template-yaml /root/template/pod_template.yml"
+    - name: CLEARML_API_ACCESS_KEY
+      value: ""
+    - name: CLEARML_API_SECRET_KEY
+      value: ""
+    - name: CLEARML_WORKER_ID
+      value: "k8s-glue-agent"
+    - name: CLEARML_AGENT_UPDATE_REPO
+      value: ""
+    - name: FORCE_CLEARML_AGENT_REPO
+      value: ""
+    - name: CLEARML_DOCKER_IMAGE
+      value: "ubuntu:18.04"
+  volumes:
+  - name: pod-template
+    secret:
+      secretName: k8s-glue-pod-template
diff --git a/docker/k8s-glue/pod_template.yml b/docker/k8s-glue/pod_template.yml
new file mode 100644
index 0000000..8d6771e
--- /dev/null
+++ b/docker/k8s-glue/pod_template.yml
@@ -0,0 +1,13 @@
+apiVersion: v1
+metadata:
+  namespace: clearml
+spec:
+  containers:
+  - resources:
+      limits:
+        cpu: 1000m
+        memory: 4G
+      requests:
+        cpu: 1000m
+        memory: 4G
+  restartPolicy: Never