From 84706ba66d560896cfcc9f0837696d03bc5d6aa0 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Tue, 3 Aug 2021 11:23:33 +0300
Subject: [PATCH] Add docker example for running the agent k8s glue as a pod in a k8s cluster

---
 docker/k8s-glue/README                      |   8 +
 docker/k8s-glue/glue-build/Dockerfile       |  18 +
 docker/k8s-glue/glue-build/clearml.conf     | 402 ++++++++++++++++++
 docker/k8s-glue/glue-build/entrypoint.sh    |  27 ++
 .../k8s-glue/glue-build/k8s_glue_example.py |  94 ++++
 docker/k8s-glue/glue-build/setup.sh         |  25 ++
 docker/k8s-glue/k8s-glue.yml                |  54 +++
 docker/k8s-glue/pod_template.yml            |  13 +
 8 files changed, 641 insertions(+)
 create mode 100644 docker/k8s-glue/README
 create mode 100644 docker/k8s-glue/glue-build/Dockerfile
 create mode 100644 docker/k8s-glue/glue-build/clearml.conf
 create mode 100644 docker/k8s-glue/glue-build/entrypoint.sh
 create mode 100644 docker/k8s-glue/glue-build/k8s_glue_example.py
 create mode 100644 docker/k8s-glue/glue-build/setup.sh
 create mode 100644 docker/k8s-glue/k8s-glue.yml
 create mode 100644 docker/k8s-glue/pod_template.yml

diff --git a/docker/k8s-glue/README b/docker/k8s-glue/README
new file mode 100644
index 0000000..7f7a8a2
--- /dev/null
+++ b/docker/k8s-glue/README
@@ -0,0 +1,8 @@
+This folder contains an example docker and templates for running the k8s glue as a pod in a k8s cluster
+
+Please note that ClearML credentials and server addresses should either be filled in the clearml.conf file before
+ building the glue docker or provided in the k8s-glue.yml template.
+
+To run, you'll need to:
+* Create a secret from pod_template.yml: `kubectl create secret generic k8s-glue-pod-template --from-file=pod_template.yml`
+* Apply the k8s glue template: `kubectl apply -f k8s-glue.yml`
\ No newline at end of file
diff --git a/docker/k8s-glue/glue-build/Dockerfile b/docker/k8s-glue/glue-build/Dockerfile
new file mode 100644
index 0000000..33560f9
--- /dev/null
+++ b/docker/k8s-glue/glue-build/Dockerfile
@@ -0,0 +1,18 @@
+FROM ubuntu:18.04
+
+USER root
+WORKDIR /root
+
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+ENV PYTHONIOENCODING=UTF-8
+
+COPY ./entrypoint.sh /root/entrypoint.sh
+COPY ./k8s_glue_example.py /root/k8s_glue_example.py
+COPY ./setup.sh /root/setup.sh
+COPY ./clearml.conf /root/clearml.conf
+
+RUN /root/setup.sh
+
+ENTRYPOINT ["/root/entrypoint.sh"]
\ No newline at end of file
diff --git a/docker/k8s-glue/glue-build/clearml.conf b/docker/k8s-glue/glue-build/clearml.conf
new file mode 100644
index 0000000..4959473
--- /dev/null
+++ b/docker/k8s-glue/glue-build/clearml.conf
@@ -0,0 +1,402 @@
+# CLEARML-AGENT configuration file
+api {
+    # Notice: 'host' is the api server (default port 8008), not the web server.
+    api_server: ""
+    web_server: ""
+    files_server: ""
+    # Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
+    credentials {"access_key": "", "secret_key": ""}
+}
+
+# Set GIT user/pass credentials
+# leave blank for GIT SSH credentials
+agent.git_user=""
+agent.git_pass=""
+
+# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
+agent.package_manager.extra_index_url= [
+
+]
+
+agent {
+    # unique name of this worker, if None, created based on hostname:process_id
+    # Override with os environment: CLEARML_WORKER_ID
+    # worker_id: "clearml-agent-machine1:gpu0"
+    worker_id: ""
+
+    # worker name, replaces the hostname when creating a unique name for this worker
+    # Override with os environment: CLEARML_WORKER_NAME
+    # worker_name: "clearml-agent-machine1"
+    worker_name: ""
+
+    # Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
+    # leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
+    # git_user: ""
+    # git_pass: ""
+    # git_host: ""
+
+    # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
+    force_git_ssh_protocol: false
+    # Force a specific SSH port when converting http to ssh links (the domain is kept the same)
+    # force_git_ssh_port: 0
+    # Force a specific SSH username when converting http to ssh links (the default username is 'git')
+    # force_git_ssh_user: git
+
+    # Set the python version to use when creating the virtual environment and launching the experiment
+    # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
+    # The default is the python executing the clearml_agent
+    python_binary: ""
+    # ignore any requested python version (Default: False, if a Task was using a
+    # specific python version and the system supports multiple python the agent will use the requested python version)
+    # ignore_requested_python_version: true
+
+    # select python package manager:
+    # currently supported pip and conda
+    # poetry is used if pip selected and repository contains poetry.lock file
+    package_manager: {
+        # supported options: pip, conda, poetry
+        type: pip,
+
+        # specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
+        pip_version: "<20.2",
+
+        # virtual environment inheres packages from system
+        system_site_packages: false,
+
+        # install with --upgrade
+        force_upgrade: false,
+
+        # additional artifact repositories to use when installing python packages
+        # extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]
+
+        # additional conda channels to use when installing with conda package manager
+        conda_channels: ["pytorch", "conda-forge", "defaults", ]
+
+        # If set to true, Task's "installed packages" are ignored,
+        # and the repository's "requirements.txt" is used instead
+        # force_repo_requirements_txt: false
+
+        # set the priority packages to be installed before the rest of the required packages
+        # priority_packages: ["cython", "numpy", "setuptools", ]
+
+        # set the optional priority packages to be installed before the rest of the required packages,
+        # In case a package installation fails, the package will be ignored,
+        # and the virtual environment process will continue
+        # priority_optional_packages: ["pygobject", ]
+
+        # set the post packages to be installed after all the rest of the required packages
+        # post_packages: ["horovod", ]
+
+        # set the optional post packages to be installed after all the rest of the required packages,
+        # In case a package installation fails, the package will be ignored,
+        # and the virtual environment process will continue
+        # post_optional_packages: []
+
+        # set to True to support torch nightly build installation,
+        # notice: torch nightly builds are ephemeral and are deleted from time to time
+        torch_nightly: false,
+    },
+
+    # target folder for virtual environments builds, created when executing experiment
+    venvs_dir = ~/.clearml/venvs-builds
+
+    # cached virtual environment folder
+    venvs_cache: {
+        # maximum number of cached venvs
+        max_entries: 10
+        # minimum required free space to allow for cache entry, disable by passing 0 or negative value
+        free_space_threshold_gb: 2.0
+        # unmark to enable virtual environment caching
+        # path: ~/.clearml/venvs-cache
+    },
+
+    # cached git clone folder
+    vcs_cache: {
+        enabled: true,
+        path: ~/.clearml/vcs-cache
+    },
+
+    # use venv-update in order to accelerate python virtual environment building
+    # Still in beta, turned off by default
+    venv_update: {
+        enabled: false,
+    },
+
+    # cached folder for specific python package download (used for pytorch package caching)
+    pip_download_cache {
+        enabled: true,
+        path: ~/.clearml/pip-download-cache
+    },
+
+    translate_ssh: true,
+    # reload configuration file every daemon execution
+    reload_config: false,
+
+    # pip cache folder mapped into docker, used for python package caching
+    docker_pip_cache = ~/.clearml/pip-cache
+    # apt cache folder mapped into docker, used for ubuntu package caching
+    docker_apt_cache = ~/.clearml/apt-cache
+
+    # optional arguments to pass to docker image
+    # these are local for this agent and will not be updated in the experiment's docker_cmd section
+    # extra_docker_arguments: ["--ipc=host", ]
+
+    # optional shell script to run in docker when started before the experiment is started
+    # extra_docker_shell_script: ["apt-get install -y bindfs", ]
+
+    # Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
+    # for backwards compatibility reasons, true as default,
+    # change to false to skip installation and decrease docker spin up time
+    # docker_install_opencv_libs: true
+
+    # optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
+    # If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
+    # Outside of the specified time-spans, the agent will be idle.
+    # Defined using a list of items of the format: "<hours> <days>".
+    # hours - use values 0-23, single values would count as start hour and end at midnight.
+    # days - use days in abbreviated format (SUN-SAT)
+    # use '-' for ranges and ',' to separate singular values.
+    # for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
+    # uptime: ["17-20 SUN,TUE"]
+
+    # optional downtime configuration, can be used only when uptime is not used.
+    # If downtime is specified, agent will be idle in the time-spans defined here.
+    # Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
+    # Use the same format as described above for uptime
+    # downtime: []
+
+    # set to true in order to force "docker pull" before running an experiment using a docker image.
+    # This makes sure the docker image is updated.
+    docker_force_pull: false
+
+    default_docker: {
+        # default docker image to use when running in docker mode
+        image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
+
+        # optional arguments to pass to docker image
+        # arguments: ["--ipc=host", ]
+    }
+
+    # set the OS environments based on the Task's Environment section before launching the Task process.
+    enable_task_env: false
+
+    # set the initial bash script to execute at the startup of any docker.
+    # all lines will be executed regardless of their exit code.
+    # {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
+    # docker_init_bash_script = [
+    #     "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
+    #     "chown -R root /root/.cache/pip",
+    #     "apt-get update",
+    #     "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
+    #     "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
+    # ]
+
+    # set the preprocessing bash script to execute at the startup of any docker.
+    # all lines will be executed regardless of their exit code.
+    # docker_preprocess_bash_script = [
+    #     "echo \"starting docker\"",
+    #]
+
+    # If False replace \r with \n and display full console output
+    # default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
+    # suppress_carriage_return: true
+
+    # cuda versions used for solving pytorch wheel packages
+    # should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
+    # cuda_version: 10.1
+    # cudnn_version: 7.6
+
+    # Hide docker environment variables containing secrets when printing out the docker command by replacing their
+    # values with "********". Turning this feature on will hide the following environment variables values:
+    # CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
+    # To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
+    # your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
+    # docker command, set:
+    # extra_keys: ["MY_SPECIAL_PASSWORD"]
+    hide_docker_command_env_vars {
+        enabled: true
+        extra_keys: []
+    }
+}
+
+sdk {
+    # ClearML - default SDK configuration
+
+    storage {
+        cache {
+            # Defaults to system temp folder / cache
+            default_base_dir: "~/.clearml/cache"
+            size {
+                # max_used_bytes = -1
+                min_free_bytes = 10GB
+                # cleanup_margin_percent = 5%
+            }
+        }
+
+        direct_access: [
+            # Objects matching are considered to be available for direct access, i.e. they will not be downloaded
+            # or cached, and any download request will return a direct reference.
+            # Objects are specified in glob format, available for url and content_type.
+            { url: "file://*" }  # file-urls are always directly referenced
+        ]
+    }
+
+    metrics {
+        # History size for debug files per metric/variant. For each metric/variant combination with an attached file
+        # (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
+        # X files are stored in the upload destination for each metric/variant combination.
+        file_history_size: 100
+
+        # Max history size for matplotlib imshow files per plot title.
+        # File names for the uploaded images will be recycled in such a way that no more than
+        # X images are stored in the upload destination for each matplotlib plot title.
+        matplotlib_untitled_history_size: 100
+
+        # Limit the number of digits after the dot in plot reporting (reducing plot report size)
+        # plot_max_num_digits: 5
+
+        # Settings for generated debug images
+        images {
+            format: JPEG
+            quality: 87
+            subsampling: 0
+        }
+
+        # Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
+        tensorboard_single_series_per_graph: false
+    }
+
+    network {
+        metrics {
+            # Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
+            # a specific iteration
+            file_upload_threads: 4
+
+            # Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
+            # being sent for upload
+            file_upload_starvation_warning_sec: 120
+        }
+
+        iteration {
+            # Max number of retries when getting frames if the server returned an error (http code 500)
+            max_retries_on_server_error: 5
+            # Backoff factory for consecutive retry attempts.
+            # SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
+            retry_backoff_factor_sec: 10
+        }
+    }
+    aws {
+        s3 {
+            # S3 credentials, used for read/write access by various SDK elements
+
+            # default, used for any bucket not specified below
+            key: ""
+            secret: ""
+            region: ""
+
+            credentials: [
+                # specifies key/secret credentials to use when handling s3 urls (read or write)
+                # {
+                #     bucket: "my-bucket-name"
+                #     key: "my-access-key"
+                #     secret: "my-secret-key"
+                # },
+                # {
+                #     # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
+                #     host: "my-minio-host:9000"
+                #     key: "12345678"
+                #     secret: "12345678"
+                #     multipart: false
+                #     secure: false
+                # }
+            ]
+        }
+        boto3 {
+            pool_connections: 512
+            max_multipart_concurrency: 16
+        }
+    }
+    google.storage {
+        # # Default project and credentials file
+        # # Will be used when no bucket configuration is found
+        # project: "clearml"
+        # credentials_json: "/path/to/credentials.json"
+
+        # # Specific credentials per bucket and sub directory
+        # credentials = [
+        #     {
+        #         bucket: "my-bucket"
+        #         subdir: "path/in/bucket"  # Not required
+        #         project: "clearml"
+        #         credentials_json: "/path/to/credentials.json"
+        #     },
+        # ]
+    }
+    azure.storage {
+        # containers: [
+        #     {
+        #         account_name: "clearml"
+        #         account_key: "secret"
+        #         # container_name:
+        #     }
+        # ]
+    }
+
+    log {
+        # debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
+        null_log_propagate: false
+        task_log_buffer_capacity: 66
+
+        # disable urllib info and lower levels
+        disable_urllib3_info: true
+    }
+
+    development {
+        # Development-mode options
+
+        # dev task reuse window
+        task_reuse_time_window_in_hours: 72.0
+
+        # Run VCS repository detection asynchronously
+        vcs_repo_detect_async: true
+
+        # Store uncommitted git/hg source code diff in experiment manifest when training in development mode
+        # This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
+        store_uncommitted_code_diff: true
+
+        # Support stopping an experiment in case it was externally stopped, status was changed or task was reset
+        support_stopping: true
+
+        # Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
+        default_output_uri: ""
+
+        # Default auto generated requirements optimize for smaller requirements
+        # If True, analyze the entire repository regardless of the entry point.
+        # If False, first analyze the entry point script, if it does not contain other to local files,
+        # do not analyze the entire repository.
+        force_analyze_entire_repo: false
+
+        # If set to true, *clearml* update message will not be printed to the console
+        # this value can be overwritten with os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
+        suppress_update_message: false
+
+        # If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
+        detect_with_pip_freeze: false
+
+        # Development mode worker
+        worker {
+            # Status report period in seconds
+            report_period_sec: 2
+
+            # ping to the server - check connectivity
+            ping_period_sec: 30
+
+            # Log all stdout & stderr
+            log_stdout: true
+
+            # compatibility feature, report memory usage for the entire machine
+            # default (false), report only on the running process and its sub-processes
+            report_global_mem_used: false
+        }
+    }
+}
diff --git a/docker/k8s-glue/glue-build/entrypoint.sh b/docker/k8s-glue/glue-build/entrypoint.sh
new file mode 100644
index 0000000..f17ea0b
--- /dev/null
+++ b/docker/k8s-glue/glue-build/entrypoint.sh
@@ -0,0 +1,27 @@
+#!/bin/bash -x
+
+export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}
+
+if [ -z "$CLEARML_FILES_HOST" ]; then
+    CLEARML_HOST_IP=${CLEARML_HOST_IP:-${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}}
+fi
+
+export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
+export CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
+export CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}
+
+echo $CLEARML_FILES_HOST $CLEARML_WEB_HOST $CLEARML_API_HOST 1>&2
+
+if [ -z "$CLEARML_AGENT_NO_UPDATE" ]; then
+    if [ -n "$CLEARML_AGENT_UPDATE_REPO" ]; then
+        python3 -m pip install -q -U "$CLEARML_AGENT_UPDATE_REPO"
+    else
+        python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
+    fi
+fi
+
+QUEUE=${K8S_GLUE_QUEUE:-k8s_glue}
+MAX_PODS=${K8S_GLUE_MAX_PODS:-2}
+EXTRA_ARGS=${K8S_GLUE_EXTRA_ARGS:-}
+
+python3 k8s_glue_example.py --queue ${QUEUE} --max-pods ${MAX_PODS} ${EXTRA_ARGS}
diff --git a/docker/k8s-glue/glue-build/k8s_glue_example.py b/docker/k8s-glue/glue-build/k8s_glue_example.py
new file mode 100644
index 0000000..dc69c37
--- /dev/null
+++ b/docker/k8s-glue/glue-build/k8s_glue_example.py
@@ -0,0 +1,94 @@
+"""
+This example assumes you have preconfigured services with selectors in the form of
+ "ai.allegro.agent.serial=pod-<number>" and a targetPort of 10022.
+The K8sIntegration component will label each pod accordingly.
+"""
+from argparse import ArgumentParser
+
+from clearml_agent.glue.k8s import K8sIntegration
+
+
+def parse_args():
+    parser = ArgumentParser()
+    group = parser.add_mutually_exclusive_group()
+
+    parser.add_argument(
+        "--queue", type=str, help="Queue to pull tasks from"
+    )
+    group.add_argument(
+        "--ports-mode", action='store_true', default=False,
+        help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports. "
+             "Should not be used with max-pods"
+    )
+    parser.add_argument(
+        "--num-of-services", type=int, default=20,
+        help="Specify the number of k8s services to be used. Use only with ports-mode."
+    )
+    parser.add_argument(
+        "--base-port", type=int,
+        help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
+             "For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num, "
+             "e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
+    )
+    parser.add_argument(
+        "--base-pod-num", type=int, default=1,
+        help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
+             "service (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--gateway-address", type=str, default=None,
+        help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB"
+    )
+    parser.add_argument(
+        "--pod-clearml-conf", type=str,
+        help="Configuration file to be used by the pod itself (if not provided, current configuration is used)"
+    )
+    parser.add_argument(
+        "--overrides-yaml", type=str,
+        help="YAML file containing pod overrides to be used when launching a new pod"
+    )
+    parser.add_argument(
+        "--template-yaml", type=str,
+        help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
+             "and overrides are ignored, otherwise it will be scheduled with kubectl run"
+    )
+    parser.add_argument(
+        "--ssh-server-port", type=int, default=0,
+        help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
+    )
+    parser.add_argument(
+        "--namespace", type=str,
+        help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
+    )
+    group.add_argument(
+        "--max-pods", type=int,
+        help="Limit the maximum number of pods that this service can run at the same time."
+             " Should not be used with ports-mode"
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+
+    user_props_cb = None
+    if args.ports_mode and args.base_port:
+        def k8s_user_props_cb(pod_number=0):
+            user_prop = {"k8s-pod-port": args.base_port + pod_number}
+            if args.gateway_address:
+                user_prop["k8s-gateway-address"] = args.gateway_address
+            return user_prop
+        user_props_cb = k8s_user_props_cb
+
+    k8s = K8sIntegration(
+        ports_mode=args.ports_mode, num_of_services=args.num_of_services, base_pod_num=args.base_pod_num,
+        user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
+        template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
+            ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
+        namespace=args.namespace, max_pods_limit=args.max_pods or None,
+    )
+    k8s.k8s_daemon(args.queue)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docker/k8s-glue/glue-build/setup.sh b/docker/k8s-glue/glue-build/setup.sh
new file mode 100644
index 0000000..d0459d6
--- /dev/null
+++ b/docker/k8s-glue/glue-build/setup.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+chmod +x /root/entrypoint.sh
+
+apt-get update -y
+apt-get dist-upgrade -y
+apt-get install -y curl unzip less locales
+
+locale-gen en_US.UTF-8
+
+apt-get install -y curl python3-pip git
+python3 -m pip install -U pip
+python3 -m pip install clearml-agent
+python3 -m pip install -U "cryptography>=2.9"
+
+curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+unzip awscliv2.zip
+./aws/install
+
+curl -o kubectl https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl
+chmod +x ./kubectl && mkdir -p $HOME/bin && cp ./kubectl $HOME/bin/kubectl && export PATH=$PATH:$HOME/bin
+
+curl -o aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/aws-iam-authenticator
+chmod +x ./aws-iam-authenticator && mkdir -p $HOME/bin && cp ./aws-iam-authenticator $HOME/bin/aws-iam-authenticator && export PATH=$PATH:$HOME/bin
+echo 'export PATH=$PATH:$HOME/bin' >> ~/.bashrc
diff --git a/docker/k8s-glue/k8s-glue.yml b/docker/k8s-glue/k8s-glue.yml
new file mode 100644
index 0000000..4b3698d
--- /dev/null
+++ b/docker/k8s-glue/k8s-glue.yml
@@ -0,0 +1,54 @@
+apiVersion: v1
+kind: Pod
+metadata:
+  name: k8s-glue
+spec:
+  containers:
+  - name: k8s-glue-container
+    image: allegroai/clearml-agent-k8s:test
+    imagePullPolicy: Always
+    command: [
+        "/bin/bash",
+        "-c",
+        "echo \"api.credentials.access_key: $CLEARML_API_ACCESS_KEY\" >> ~/clearml.conf \
+        && echo \"api.credentials.secret_key: $CLEARML_API_SECRET_KEY\" >> ~/clearml.conf \
+        && echo \"api.api_server: $CLEARML_API_HOST\" >> ~/clearml.conf \
+        && echo \"api.web_server: $CLEARML_WEB_HOST\" >> ~/clearml.conf \
+        && echo \"api.files_server: $CLEARML_FILES_HOST\" >> ~/clearml.conf \
+        && source /root/.bashrc \
+        && export PATH=$PATH:$HOME/bin \
+        && /root/entrypoint.sh
+        "
+    ]
+    volumeMounts:
+    - name: pod-template
+      mountPath: /root/template
+    env:
+    - name: CLEARML_API_HOST
+      value: ""
+    - name: CLEARML_WEB_HOST
+      value: ""
+    - name: CLEARML_FILES_HOST
+      value: ""
+#    - name: K8S_GLUE_MAX_PODS
+#      value: "2"
+    - name: K8S_GLUE_QUEUE
+      value: "k8s-glue"
+    - name: K8S_GLUE_EXTRA_ARGS
+      value: "--template-yaml /root/template/pod_template.yml"
+    - name: CLEARML_API_ACCESS_KEY
+      value: ""
+    - name: CLEARML_API_SECRET_KEY
+      value: ""
+    - name: CLEARML_WORKER_ID
+      value: "k8s-glue-agent"
+    - name: CLEARML_AGENT_UPDATE_REPO
+      value: ""
+    - name: FORCE_CLEARML_AGENT_REPO
+      value: ""
+    - name: CLEARML_DOCKER_IMAGE
+      value: "ubuntu:18.04"
+  volumes:
+  - name: pod-template
+    secret:
+      secretName: k8s-glue-pod-template
diff --git a/docker/k8s-glue/pod_template.yml b/docker/k8s-glue/pod_template.yml
new file mode 100644
index 0000000..8d6771e
--- /dev/null
+++ b/docker/k8s-glue/pod_template.yml
@@ -0,0 +1,13 @@
+apiVersion: v1
+metadata:
+  namespace: clearml
+spec:
+  containers:
+  - resources:
+      limits:
+        cpu: 1000m
+        memory: 4G
+      requests:
+        cpu: 1000m
+        memory: 4G
+  restartPolicy: Never