Add docker example for running the agent k8s glue as a pod in a k8s cluster

allegroai 2021-08-03 11:23:33 +03:00
parent 6b602889a5
commit 84706ba66d
8 changed files with 641 additions and 0 deletions

docker/k8s-glue/README Normal file

@@ -0,0 +1,8 @@
This folder contains an example Docker image and templates for running the k8s glue as a pod in a k8s cluster.
Please note that ClearML credentials and server addresses should either be filled into the clearml.conf file before
building the glue docker image, or provided in the k8s-glue.yml template.
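As a hedged sketch of the build step (the image name and tag below are taken from the example k8s-glue.yml and are only an assumption; use your own registry/tag and update the `image` field in k8s-glue.yml to match):

    docker build -t allegroai/clearml-agent-k8s:test .
    docker push allegroai/clearml-agent-k8s:test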
To run, you'll need to:
* Create a secret from pod_template.yml: `kubectl create secret generic k8s-glue-pod-template --from-file=pod_template.yml`
* Apply the k8s glue template: `kubectl apply -f k8s-glue.yml`

docker/k8s-glue/Dockerfile Normal file

@@ -0,0 +1,18 @@
FROM ubuntu:18.04
USER root
WORKDIR /root
ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8
ENV PYTHONIOENCODING=UTF-8
COPY ./entrypoint.sh /root/entrypoint.sh
COPY ./k8s_glue_example.py /root/k8s_glue_example.py
COPY ./setup.sh /root/setup.sh
COPY ./clearml.conf /root/clearml.conf
RUN /root/setup.sh
ENTRYPOINT ["/root/entrypoint.sh"]

docker/k8s-glue/clearml.conf Normal file

@@ -0,0 +1,402 @@
# CLEARML-AGENT configuration file
api {
# Notice: 'host' is the api server (default port 8008), not the web server.
api_server: ""
web_server: ""
files_server: ""
# Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
credentials {"access_key": "", "secret_key": ""}
}
# Set GIT user/pass credentials
# leave blank for GIT SSH credentials
agent.git_user=""
agent.git_pass=""
# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
agent.package_manager.extra_index_url= [
]
agent {
# unique name of this worker, if None, created based on hostname:process_id
# Override with os environment: CLEARML_WORKER_ID
# worker_id: "clearml-agent-machine1:gpu0"
worker_id: ""
# worker name, replaces the hostname when creating a unique name for this worker
# Override with os environment: CLEARML_WORKER_NAME
# worker_name: "clearml-agent-machine1"
worker_name: ""
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
# git_user: ""
# git_pass: ""
# git_host: ""
# Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
force_git_ssh_protocol: false
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
# force_git_ssh_port: 0
# Force a specific SSH username when converting http to ssh links (the default username is 'git')
# force_git_ssh_user: git
# Set the python version to use when creating the virtual environment and launching the experiment
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
# The default is the python executing the clearml_agent
python_binary: ""
# ignore any requested python version (Default: False; if a Task was using a
# specific python version and the system supports multiple python versions, the agent will use the requested python version)
# ignore_requested_python_version: true
# select python package manager:
# currently supported pip and conda
# poetry is used if pip selected and repository contains poetry.lock file
package_manager: {
# supported options: pip, conda, poetry
type: pip,
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
pip_version: "<20.2",
# virtual environment inherits packages from system
system_site_packages: false,
# install with --upgrade
force_upgrade: false,
# additional artifact repositories to use when installing python packages
# extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]
# additional conda channels to use when installing with conda package manager
conda_channels: ["pytorch", "conda-forge", "defaults", ]
# If set to true, Task's "installed packages" are ignored,
# and the repository's "requirements.txt" is used instead
# force_repo_requirements_txt: false
# set the priority packages to be installed before the rest of the required packages
# priority_packages: ["cython", "numpy", "setuptools", ]
# set the optional priority packages to be installed before the rest of the required packages,
# In case a package installation fails, the package will be ignored,
# and the virtual environment process will continue
# priority_optional_packages: ["pygobject", ]
# set the post packages to be installed after all the rest of the required packages
# post_packages: ["horovod", ]
# set the optional post packages to be installed after all the rest of the required packages,
# In case a package installation fails, the package will be ignored,
# and the virtual environment process will continue
# post_optional_packages: []
# set to True to support torch nightly build installation,
# notice: torch nightly builds are ephemeral and are deleted from time to time
torch_nightly: false,
},
# target folder for virtual environments builds, created when executing experiment
venvs_dir = ~/.clearml/venvs-builds
# cached virtual environment folder
venvs_cache: {
# maximum number of cached venvs
max_entries: 10
# minimum required free space to allow for cache entry, disable by passing 0 or negative value
free_space_threshold_gb: 2.0
# uncomment to enable virtual environment caching
# path: ~/.clearml/venvs-cache
},
# cached git clone folder
vcs_cache: {
enabled: true,
path: ~/.clearml/vcs-cache
},
# use venv-update in order to accelerate python virtual environment building
# Still in beta, turned off by default
venv_update: {
enabled: false,
},
# cached folder for specific python package download (used for pytorch package caching)
pip_download_cache {
enabled: true,
path: ~/.clearml/pip-download-cache
},
translate_ssh: true,
# reload configuration file every daemon execution
reload_config: false,
# pip cache folder mapped into docker, used for python package caching
docker_pip_cache = ~/.clearml/pip-cache
# apt cache folder mapped into docker, used for ubuntu package caching
docker_apt_cache = ~/.clearml/apt-cache
# optional arguments to pass to docker image
# these are local for this agent and will not be updated in the experiment's docker_cmd section
# extra_docker_arguments: ["--ipc=host", ]
# optional shell script to run in docker when started before the experiment is started
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
# Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
# for backwards compatibility reasons, true as default,
# change to false to skip installation and decrease docker spin up time
# docker_install_opencv_libs: true
# optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
# If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
# Outside of the specified time-spans, the agent will be idle.
# Defined using a list of items of the format: "<hours> <days>".
# hours - use values 0-23, single values would count as start hour and end at midnight.
# days - use days in abbreviated format (SUN-SAT)
# use '-' for ranges and ',' to separate singular values.
# for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
# uptime: ["17-20 SUN,TUE"]
# optional downtime configuration, can be used only when uptime is not used.
# If downtime is specified, agent will be idle in the time-spans defined here.
# Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
# Use the same format as described above for uptime
# downtime: []
# set to true in order to force "docker pull" before running an experiment using a docker image.
# This makes sure the docker image is updated.
docker_force_pull: false
default_docker: {
# default docker image to use when running in docker mode
image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
# optional arguments to pass to docker image
# arguments: ["--ipc=host", ]
}
# set the OS environments based on the Task's Environment section before launching the Task process.
enable_task_env: false
# set the initial bash script to execute at the startup of any docker.
# all lines will be executed regardless of their exit code.
# {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
# docker_init_bash_script = [
# "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
# "chown -R root /root/.cache/pip",
# "apt-get update",
# "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
# "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
# ]
# set the preprocessing bash script to execute at the startup of any docker.
# all lines will be executed regardless of their exit code.
# docker_preprocess_bash_script = [
# "echo \"starting docker\"",
#]
# If False, replace \r with \n and display the full console output
# The default is True: report a single \r line in a sequence of consecutive lines, once per 5 seconds.
# suppress_carriage_return: true
# cuda versions used for solving pytorch wheel packages
# should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
# cuda_version: 10.1
# cudnn_version: 7.6
# Hide docker environment variables containing secrets when printing out the docker command by replacing their
# values with "********". Turning this feature on will hide the following environment variables values:
# CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
# To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
# your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
# docker command, set:
# extra_keys: ["MY_SPECIAL_PASSWORD"]
hide_docker_command_env_vars {
enabled: true
extra_keys: []
}
}
sdk {
# ClearML - default SDK configuration
storage {
cache {
# Defaults to system temp folder / cache
default_base_dir: "~/.clearml/cache"
size {
# max_used_bytes = -1
min_free_bytes = 10GB
# cleanup_margin_percent = 5%
}
}
direct_access: [
# Objects matching these patterns are considered to be available for direct access, i.e. they will not be downloaded
# or cached, and any download request will return a direct reference.
# Objects are specified in glob format, available for url and content_type.
{ url: "file://*" } # file-urls are always directly referenced
]
}
metrics {
# History size for debug files per metric/variant. For each metric/variant combination with an attached file
# (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
# X files are stored in the upload destination for each metric/variant combination.
file_history_size: 100
# Max history size for matplotlib imshow files per plot title.
# File names for the uploaded images will be recycled in such a way that no more than
# X images are stored in the upload destination for each matplotlib plot title.
matplotlib_untitled_history_size: 100
# Limit the number of digits after the dot in plot reporting (reducing plot report size)
# plot_max_num_digits: 5
# Settings for generated debug images
images {
format: JPEG
quality: 87
subsampling: 0
}
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
tensorboard_single_series_per_graph: false
}
network {
metrics {
# Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
# a specific iteration
file_upload_threads: 4
# Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
# being sent for upload
file_upload_starvation_warning_sec: 120
}
iteration {
# Max number of retries when getting frames if the server returned an error (http code 500)
max_retries_on_server_error: 5
# Backoff factor for consecutive retry attempts.
# SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
retry_backoff_factor_sec: 10
}
}
aws {
s3 {
# S3 credentials, used for read/write access by various SDK elements
# default, used for any bucket not specified below
key: ""
secret: ""
region: ""
credentials: [
# specifies key/secret credentials to use when handling s3 urls (read or write)
# {
# bucket: "my-bucket-name"
# key: "my-access-key"
# secret: "my-secret-key"
# },
# {
# # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
# host: "my-minio-host:9000"
# key: "12345678"
# secret: "12345678"
# multipart: false
# secure: false
# }
]
}
boto3 {
pool_connections: 512
max_multipart_concurrency: 16
}
}
google.storage {
# # Default project and credentials file
# # Will be used when no bucket configuration is found
# project: "clearml"
# credentials_json: "/path/to/credentials.json"
# # Specific credentials per bucket and sub directory
# credentials = [
# {
# bucket: "my-bucket"
# subdir: "path/in/bucket" # Not required
# project: "clearml"
# credentials_json: "/path/to/credentials.json"
# },
# ]
}
azure.storage {
# containers: [
# {
# account_name: "clearml"
# account_key: "secret"
# # container_name:
# }
# ]
}
log {
# debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
null_log_propagate: false
task_log_buffer_capacity: 66
# disable urllib info and lower levels
disable_urllib3_info: true
}
development {
# Development-mode options
# dev task reuse window
task_reuse_time_window_in_hours: 72.0
# Run VCS repository detection asynchronously
vcs_repo_detect_async: true
# Store uncommitted git/hg source code diff in experiment manifest when training in development mode
# This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
store_uncommitted_code_diff: true
# Support stopping an experiment in case it was externally stopped, status was changed or task was reset
support_stopping: true
# Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
default_output_uri: ""
# Default auto generated requirements optimize for smaller requirements
# If True, analyze the entire repository regardless of the entry point.
# If False, first analyze the entry point script; if it does not reference other local files,
# do not analyze the entire repository.
force_analyze_entire_repo: false
# If set to true, *clearml* update message will not be printed to the console
# this value can be overwritten with os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
suppress_update_message: false
# If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
detect_with_pip_freeze: false
# Development mode worker
worker {
# Status report period in seconds
report_period_sec: 2
# ping to the server - check connectivity
ping_period_sec: 30
# Log all stdout & stderr
log_stdout: true
# compatibility feature, report memory usage for the entire machine
# default (false), report only on the running process and its sub-processes
report_global_mem_used: false
}
}
}
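If you prefer baking the server addresses and credentials into the image (as the README notes) rather than passing them through the k8s-glue.yml environment variables, one hedged option is to append the same HOCON keys that the glue pod writes at runtime; the values below are placeholders:

    echo 'api.api_server: "https://api.example.com"' >> clearml.conf
    echo 'api.web_server: "https://app.example.com"' >> clearml.conf
    echo 'api.files_server: "https://files.example.com"' >> clearml.conf
    echo 'api.credentials.access_key: "YOUR_ACCESS_KEY"' >> clearml.conf
    echo 'api.credentials.secret_key: "YOUR_SECRET_KEY"' >> clearml.conf

Later HOCON definitions override the empty defaults above, mirroring how k8s-glue.yml echoes these keys into ~/clearml.conf at container startup.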

docker/k8s-glue/entrypoint.sh Normal file

@@ -0,0 +1,27 @@
#!/bin/bash -x

export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}

if [ -z "$CLEARML_FILES_HOST" ]; then
    CLEARML_HOST_IP=${CLEARML_HOST_IP:-${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}}
fi

export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
export CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
export CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}

echo $CLEARML_FILES_HOST $CLEARML_WEB_HOST $CLEARML_API_HOST 1>&2

if [ -z "$CLEARML_AGENT_NO_UPDATE" ]; then
    if [ -n "$CLEARML_AGENT_UPDATE_REPO" ]; then
        python3 -m pip install -q -U "$CLEARML_AGENT_UPDATE_REPO"
    else
        python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
    fi
fi

QUEUE=${K8S_GLUE_QUEUE:-k8s_glue}
MAX_PODS=${K8S_GLUE_MAX_PODS:-2}
EXTRA_ARGS=${K8S_GLUE_EXTRA_ARGS:-}

python3 k8s_glue_example.py --queue ${QUEUE} --max-pods ${MAX_PODS} ${EXTRA_ARGS}
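As a hedged illustration of the fallback logic above (all values are made up), running the entrypoint with only a host IP and queue settings would resolve the three server addresses from that IP and launch the glue accordingly:

    # Resolves CLEARML_FILES_HOST/CLEARML_WEB_HOST/CLEARML_API_HOST to http://203.0.113.10:8081 / :8080 / :8008
    # and ends up running: python3 k8s_glue_example.py --queue gpu_queue --max-pods 4
    CLEARML_HOST_IP=203.0.113.10 \
    CLEARML_AGENT_NO_UPDATE=1 \
    K8S_GLUE_QUEUE=gpu_queue \
    K8S_GLUE_MAX_PODS=4 \
    ./entrypoint.sh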

docker/k8s-glue/k8s_glue_example.py Normal file

@@ -0,0 +1,94 @@
"""
This example assumes you have preconfigured services with selectors in the form of
"ai.allegro.agent.serial=pod-<number>" and a targetPort of 10022.
The K8sIntegration component will label each pod accordingly.
"""
from argparse import ArgumentParser
from clearml_agent.glue.k8s import K8sIntegration
def parse_args():
parser = ArgumentParser()
group = parser.add_mutually_exclusive_group()
parser.add_argument(
"--queue", type=str, help="Queue to pull tasks from"
)
group.add_argument(
"--ports-mode", action='store_true', default=False,
help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports"
"Should not be used with max-pods"
)
parser.add_argument(
"--num-of-services", type=int, default=20,
help="Specify the number of k8s services to be used. Use only with ports-mode."
)
parser.add_argument(
"--base-port", type=int,
help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
"For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num"
"e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
)
parser.add_argument(
"--base-pod-num", type=int, default=1,
help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
"service (default: %(default)s)"
)
parser.add_argument(
"--gateway-address", type=str, default=None,
help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB"
)
parser.add_argument(
"--pod-clearml-conf", type=str,
help="Configuration file to be used by the pod itself (if not provided, current configuration is used)"
)
parser.add_argument(
"--overrides-yaml", type=str,
help="YAML file containing pod overrides to be used when launching a new pod"
)
parser.add_argument(
"--template-yaml", type=str,
help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
"and overrides are ignored, otherwise it will be scheduled with kubectl run"
)
parser.add_argument(
"--ssh-server-port", type=int, default=0,
help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
)
parser.add_argument(
"--namespace", type=str,
help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
)
group.add_argument(
"--max-pods", type=int,
help="Limit the maximum number of pods that this service can run at the same time."
"Should not be used with ports-mode"
)
return parser.parse_args()
def main():
args = parse_args()
user_props_cb = None
if args.ports_mode and args.base_port:
def k8s_user_props_cb(pod_number=0):
user_prop = {"k8s-pod-port": args.base_port + pod_number}
if args.gateway_address:
user_prop["k8s-gateway-address"] = args.gateway_address
return user_prop
user_props_cb = k8s_user_props_cb
k8s = K8sIntegration(
ports_mode=args.ports_mode, num_of_services=args.num_of_services, base_pod_num=args.base_pod_num,
user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
namespace=args.namespace, max_pods_limit=args.max_pods or None,
)
k8s.k8s_daemon(args.queue)
if __name__ == "__main__":
main()
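A hedged invocation sketch for ports-mode (all values are illustrative; note that --ports-mode and --max-pods belong to a mutually exclusive group and cannot be combined):

    python3 k8s_glue_example.py --queue k8s_glue --namespace clearml \
        --ports-mode --num-of-services 10 --base-port 20000 --base-pod-num 1 \
        --gateway-address 198.51.100.7 \
        --template-yaml /root/template/pod_template.yml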

docker/k8s-glue/setup.sh Normal file

@@ -0,0 +1,25 @@
#!/bin/bash
chmod +x /root/entrypoint.sh
apt-get update -y
apt-get dist-upgrade -y
apt-get install -y curl unzip less locales
locale-gen en_US.UTF-8
apt-get install -y curl python3-pip git
python3 -m pip install -U pip
python3 -m pip install clearml-agent
python3 -m pip install -U "cryptography>=2.9"
curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
./aws/install
curl -o kubectl https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl
chmod +x ./kubectl && mkdir -p $HOME/bin && cp ./kubectl $HOME/bin/kubectl && export PATH=$PATH:$HOME/bin
curl -o aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/aws-iam-authenticator
chmod +x ./aws-iam-authenticator && mkdir -p $HOME/bin && cp ./aws-iam-authenticator $HOME/bin/aws-iam-authenticator && export PATH=$PATH:$HOME/bin
echo 'export PATH=$PATH:$HOME/bin' >> ~/.bashrc

docker/k8s-glue/k8s-glue.yml Normal file

@@ -0,0 +1,54 @@
apiVersion: v1
kind: Pod
metadata:
  name: k8s-glue
spec:
  containers:
  - name: k8s-glue-container
    image: allegroai/clearml-agent-k8s:test
    imagePullPolicy: Always
    command: [
      "/bin/bash",
      "-c",
      "echo \"api.credentials.access_key: $CLEARML_API_ACCESS_KEY\" >> ~/clearml.conf \
      && echo \"api.credentials.secret_key: $CLEARML_API_SECRET_KEY\" >> ~/clearml.conf \
      && echo \"api.api_server: $CLEARML_API_HOST\" >> ~/clearml.conf \
      && echo \"api.web_server: $CLEARML_WEB_HOST\" >> ~/clearml.conf \
      && echo \"api.files_server: $CLEARML_FILES_HOST\" >> ~/clearml.conf \
      && source /root/.bashrc \
      && export PATH=$PATH:$HOME/bin \
      && /root/entrypoint.sh
      "
    ]
    volumeMounts:
    - name: pod-template
      mountPath: /root/template
    env:
    - name: CLEARML_API_HOST
      value: ""
    - name: CLEARML_WEB_HOST
      value: ""
    - name: CLEARML_FILES_HOST
      value: ""
    # - name: K8S_GLUE_MAX_PODS
    #   value: "2"
    - name: K8S_GLUE_QUEUE
      value: "k8s-glue"
    - name: K8S_GLUE_EXTRA_ARGS
      value: "--template-yaml /root/template/pod_template.yml"
    - name: CLEARML_API_ACCESS_KEY
      value: ""
    - name: CLEARML_API_SECRET_KEY
      value: ""
    - name: CLEARML_WORKER_ID
      value: "k8s-glue-agent"
    - name: CLEARML_AGENT_UPDATE_REPO
      value: ""
    - name: FORCE_CLEARML_AGENT_REPO
      value: ""
    - name: CLEARML_DOCKER_IMAGE
      value: "ubuntu:18.04"
  volumes:
  - name: pod-template
    secret:
      secretName: k8s-glue-pod-template
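Assuming the empty credential and server address values above have been filled in (or baked into clearml.conf before building the image), a typical deployment and verification sequence might look like this (the status/log checks are standard kubectl usage, not part of this example):

    kubectl create secret generic k8s-glue-pod-template --from-file=pod_template.yml
    kubectl apply -f k8s-glue.yml
    kubectl get pod k8s-glue
    kubectl logs -f k8s-glue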

docker/k8s-glue/pod_template.yml Normal file

@@ -0,0 +1,13 @@
apiVersion: v1
metadata:
  namespace: clearml
spec:
  containers:
  - resources:
      limits:
        cpu: 1000m
        memory: 4G
      requests:
        cpu: 1000m
        memory: 4G
  restartPolicy: Never
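If you change this template (for example, adjusting the CPU/memory requests shown above), the secret and the glue pod need to be recreated for the change to take effect; a hedged sketch:

    kubectl delete secret k8s-glue-pod-template
    kubectl create secret generic k8s-glue-pod-template --from-file=pod_template.yml
    kubectl delete pod k8s-glue
    kubectl apply -f k8s-glue.yml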