mirror of
https://github.com/clearml/clearml-agent
synced 2025-03-13 06:58:37 +00:00
Add docker example for running the agent k8s glue as a pod in a k8s cluster
This commit is contained in:
parent
6b602889a5
commit
84706ba66d
8
docker/k8s-glue/README
Normal file
8
docker/k8s-glue/README
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
This folder contains an example Docker image build and templates for running the ClearML k8s glue as a pod in a Kubernetes cluster.
|
||||||
|
|
||||||
|
Please note that ClearML credentials and server addresses should either be filled in the clearml.conf file before
|
||||||
|
building the glue docker image, or provided as environment variables in the k8s-glue.yml template.
|
||||||
|
|
||||||
|
To run, you'll need to:
|
||||||
|
* Create a secret from pod_template.yml: `kubectl create secret generic k8s-glue-pod-template --from-file=pod_template.yml`
|
||||||
|
* Apply the k8s glue template: `kubectl apply -f k8s-glue.yml`
|
18
docker/k8s-glue/glue-build/Dockerfile
Normal file
18
docker/k8s-glue/glue-build/Dockerfile
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# Image that runs the ClearML k8s glue agent.
FROM ubuntu:18.04

USER root
WORKDIR /root

# UTF-8 locale settings and Python I/O encoding for everything run in the image.
ENV LC_ALL=en_US.UTF-8 \
    LANG=en_US.UTF-8 \
    LANGUAGE=en_US.UTF-8 \
    PYTHONIOENCODING=UTF-8

# Glue scripts and the agent configuration file, all placed in /root.
COPY ./entrypoint.sh ./k8s_glue_example.py ./setup.sh ./clearml.conf /root/

# Build-time provisioning script.
RUN /root/setup.sh

ENTRYPOINT ["/root/entrypoint.sh"]
|
402
docker/k8s-glue/glue-build/clearml.conf
Normal file
402
docker/k8s-glue/glue-build/clearml.conf
Normal file
@ -0,0 +1,402 @@
|
|||||||
|
# CLEARML-AGENT configuration file
|
||||||
|
api {
|
||||||
|
# Notice: 'api_server' is the API server (default port 8008), not the web server.
|
||||||
|
api_server: ""
|
||||||
|
web_server: ""
|
||||||
|
files_server: ""
|
||||||
|
# Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
|
||||||
|
credentials {"access_key": "", "secret_key": ""}
|
||||||
|
}
|
||||||
|
|
||||||
|
# Set GIT user/pass credentials
|
||||||
|
# leave blank for GIT SSH credentials
|
||||||
|
agent.git_user=""
|
||||||
|
agent.git_pass=""
|
||||||
|
|
||||||
|
# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
|
||||||
|
agent.package_manager.extra_index_url= [
|
||||||
|
|
||||||
|
]
|
||||||
|
|
||||||
|
agent {
|
||||||
|
# unique name of this worker, if None, created based on hostname:process_id
|
||||||
|
# Override with os environment: CLEARML_WORKER_ID
|
||||||
|
# worker_id: "clearml-agent-machine1:gpu0"
|
||||||
|
worker_id: ""
|
||||||
|
|
||||||
|
# worker name, replaces the hostname when creating a unique name for this worker
|
||||||
|
# Override with os environment: CLEARML_WORKER_NAME
|
||||||
|
# worker_name: "clearml-agent-machine1"
|
||||||
|
worker_name: ""
|
||||||
|
|
||||||
|
# Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
|
||||||
|
# leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
|
||||||
|
# git_user: ""
|
||||||
|
# git_pass: ""
|
||||||
|
# git_host: ""
|
||||||
|
|
||||||
|
# Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
|
||||||
|
force_git_ssh_protocol: false
|
||||||
|
# Force a specific SSH port when converting http to ssh links (the domain is kept the same)
|
||||||
|
# force_git_ssh_port: 0
|
||||||
|
# Force a specific SSH username when converting http to ssh links (the default username is 'git')
|
||||||
|
# force_git_ssh_user: git
|
||||||
|
|
||||||
|
# Set the python version to use when creating the virtual environment and launching the experiment
|
||||||
|
# Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
|
||||||
|
# The default is the python executing the clearml_agent
|
||||||
|
python_binary: ""
|
||||||
|
# ignore any requested python version (Default: False, if a Task was using a
|
||||||
|
# specific python version and the system supports multiple python the agent will use the requested python version)
|
||||||
|
# ignore_requested_python_version: true
|
||||||
|
|
||||||
|
# select python package manager:
|
||||||
|
# currently supported pip and conda
|
||||||
|
# poetry is used if pip selected and repository contains poetry.lock file
|
||||||
|
package_manager: {
|
||||||
|
# supported options: pip, conda, poetry
|
||||||
|
type: pip,
|
||||||
|
|
||||||
|
# specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
|
||||||
|
pip_version: "<20.2",
|
||||||
|
|
||||||
|
# virtual environment inherits packages from system
|
||||||
|
system_site_packages: false,
|
||||||
|
|
||||||
|
# install with --upgrade
|
||||||
|
force_upgrade: false,
|
||||||
|
|
||||||
|
# additional artifact repositories to use when installing python packages
|
||||||
|
# extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]
|
||||||
|
|
||||||
|
# additional conda channels to use when installing with conda package manager
|
||||||
|
conda_channels: ["pytorch", "conda-forge", "defaults", ]
|
||||||
|
|
||||||
|
# If set to true, Task's "installed packages" are ignored,
|
||||||
|
# and the repository's "requirements.txt" is used instead
|
||||||
|
# force_repo_requirements_txt: false
|
||||||
|
|
||||||
|
# set the priority packages to be installed before the rest of the required packages
|
||||||
|
# priority_packages: ["cython", "numpy", "setuptools", ]
|
||||||
|
|
||||||
|
# set the optional priority packages to be installed before the rest of the required packages,
|
||||||
|
# In case a package installation fails, the package will be ignored,
|
||||||
|
# and the virtual environment process will continue
|
||||||
|
# priority_optional_packages: ["pygobject", ]
|
||||||
|
|
||||||
|
# set the post packages to be installed after all the rest of the required packages
|
||||||
|
# post_packages: ["horovod", ]
|
||||||
|
|
||||||
|
# set the optional post packages to be installed after all the rest of the required packages,
|
||||||
|
# In case a package installation fails, the package will be ignored,
|
||||||
|
# and the virtual environment process will continue
|
||||||
|
# post_optional_packages: []
|
||||||
|
|
||||||
|
# set to True to support torch nightly build installation,
|
||||||
|
# notice: torch nightly builds are ephemeral and are deleted from time to time
|
||||||
|
torch_nightly: false,
|
||||||
|
},
|
||||||
|
|
||||||
|
# target folder for virtual environments builds, created when executing experiment
|
||||||
|
venvs_dir = ~/.clearml/venvs-builds
|
||||||
|
|
||||||
|
# cached virtual environment folder
|
||||||
|
venvs_cache: {
|
||||||
|
# maximum number of cached venvs
|
||||||
|
max_entries: 10
|
||||||
|
# minimum required free space to allow for cache entry, disable by passing 0 or negative value
|
||||||
|
free_space_threshold_gb: 2.0
|
||||||
|
# uncomment to enable virtual environment caching
|
||||||
|
# path: ~/.clearml/venvs-cache
|
||||||
|
},
|
||||||
|
|
||||||
|
# cached git clone folder
|
||||||
|
vcs_cache: {
|
||||||
|
enabled: true,
|
||||||
|
path: ~/.clearml/vcs-cache
|
||||||
|
},
|
||||||
|
|
||||||
|
# use venv-update in order to accelerate python virtual environment building
|
||||||
|
# Still in beta, turned off by default
|
||||||
|
venv_update: {
|
||||||
|
enabled: false,
|
||||||
|
},
|
||||||
|
|
||||||
|
# cached folder for specific python package download (used for pytorch package caching)
|
||||||
|
pip_download_cache {
|
||||||
|
enabled: true,
|
||||||
|
path: ~/.clearml/pip-download-cache
|
||||||
|
},
|
||||||
|
|
||||||
|
translate_ssh: true,
|
||||||
|
# reload configuration file every daemon execution
|
||||||
|
reload_config: false,
|
||||||
|
|
||||||
|
# pip cache folder mapped into docker, used for python package caching
|
||||||
|
docker_pip_cache = ~/.clearml/pip-cache
|
||||||
|
# apt cache folder mapped into docker, used for ubuntu package caching
|
||||||
|
docker_apt_cache = ~/.clearml/apt-cache
|
||||||
|
|
||||||
|
# optional arguments to pass to docker image
|
||||||
|
# these are local for this agent and will not be updated in the experiment's docker_cmd section
|
||||||
|
# extra_docker_arguments: ["--ipc=host", ]
|
||||||
|
|
||||||
|
# optional shell script to run in docker when started before the experiment is started
|
||||||
|
# extra_docker_shell_script: ["apt-get install -y bindfs", ]
|
||||||
|
|
||||||
|
# Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
|
||||||
|
# for backwards compatibility reasons, true as default,
|
||||||
|
# change to false to skip installation and decrease docker spin up time
|
||||||
|
# docker_install_opencv_libs: true
|
||||||
|
|
||||||
|
# optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
|
||||||
|
# If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
|
||||||
|
# Outside of the specified time-spans, the agent will be idle.
|
||||||
|
# Defined using a list of items of the format: "<hours> <days>".
|
||||||
|
# hours - use values 0-23, single values would count as start hour and end at midnight.
|
||||||
|
# days - use days in abbreviated format (SUN-SAT)
|
||||||
|
# use '-' for ranges and ',' to separate singular values.
|
||||||
|
# for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
|
||||||
|
# uptime: ["17-20 SUN,TUE"]
|
||||||
|
|
||||||
|
# optional downtime configuration, can be used only when uptime is not used.
|
||||||
|
# If downtime is specified, agent will be idle in the time-spans defined here.
|
||||||
|
# Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
|
||||||
|
# Use the same format as described above for uptime
|
||||||
|
# downtime: []
|
||||||
|
|
||||||
|
# set to true in order to force "docker pull" before running an experiment using a docker image.
|
||||||
|
# This makes sure the docker image is updated.
|
||||||
|
docker_force_pull: false
|
||||||
|
|
||||||
|
default_docker: {
|
||||||
|
# default docker image to use when running in docker mode
|
||||||
|
image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"
|
||||||
|
|
||||||
|
# optional arguments to pass to docker image
|
||||||
|
# arguments: ["--ipc=host", ]
|
||||||
|
}
|
||||||
|
|
||||||
|
# set the OS environments based on the Task's Environment section before launching the Task process.
|
||||||
|
enable_task_env: false
|
||||||
|
|
||||||
|
# set the initial bash script to execute at the startup of any docker.
|
||||||
|
# all lines will be executed regardless of their exit code.
|
||||||
|
# {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
|
||||||
|
# docker_init_bash_script = [
|
||||||
|
# "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
|
||||||
|
# "chown -R root /root/.cache/pip",
|
||||||
|
# "apt-get update",
|
||||||
|
# "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
|
||||||
|
# "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
|
||||||
|
# ]
|
||||||
|
|
||||||
|
# set the preprocessing bash script to execute at the startup of any docker.
|
||||||
|
# all lines will be executed regardless of their exit code.
|
||||||
|
# docker_preprocess_bash_script = [
|
||||||
|
# "echo \"starting docker\"",
|
||||||
|
#]
|
||||||
|
|
||||||
|
# If False replace \r with \n and display full console output
|
||||||
|
# default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
|
||||||
|
# suppress_carriage_return: true
|
||||||
|
|
||||||
|
# cuda versions used for solving pytorch wheel packages
|
||||||
|
# should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
|
||||||
|
# cuda_version: 10.1
|
||||||
|
# cudnn_version: 7.6
|
||||||
|
|
||||||
|
# Hide docker environment variables containing secrets when printing out the docker command by replacing their
|
||||||
|
# values with "********". Turning this feature on will hide the following environment variables values:
|
||||||
|
# CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
|
||||||
|
# To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
|
||||||
|
# your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
|
||||||
|
# docker command, set:
|
||||||
|
# extra_keys: ["MY_SPECIAL_PASSWORD"]
|
||||||
|
hide_docker_command_env_vars {
|
||||||
|
enabled: true
|
||||||
|
extra_keys: []
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sdk {
|
||||||
|
# ClearML - default SDK configuration
|
||||||
|
|
||||||
|
storage {
|
||||||
|
cache {
|
||||||
|
# Defaults to system temp folder / cache
|
||||||
|
default_base_dir: "~/.clearml/cache"
|
||||||
|
size {
|
||||||
|
# max_used_bytes = -1
|
||||||
|
min_free_bytes = 10GB
|
||||||
|
# cleanup_margin_percent = 5%
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
direct_access: [
|
||||||
|
# Objects matching are considered to be available for direct access, i.e. they will not be downloaded
|
||||||
|
# or cached, and any download request will return a direct reference.
|
||||||
|
# Objects are specified in glob format, available for url and content_type.
|
||||||
|
{ url: "file://*" } # file-urls are always directly referenced
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
metrics {
|
||||||
|
# History size for debug files per metric/variant. For each metric/variant combination with an attached file
|
||||||
|
# (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
|
||||||
|
# X files are stored in the upload destination for each metric/variant combination.
|
||||||
|
file_history_size: 100
|
||||||
|
|
||||||
|
# Max history size for matplotlib imshow files per plot title.
|
||||||
|
# File names for the uploaded images will be recycled in such a way that no more than
|
||||||
|
# X images are stored in the upload destination for each matplotlib plot title.
|
||||||
|
matplotlib_untitled_history_size: 100
|
||||||
|
|
||||||
|
# Limit the number of digits after the dot in plot reporting (reducing plot report size)
|
||||||
|
# plot_max_num_digits: 5
|
||||||
|
|
||||||
|
# Settings for generated debug images
|
||||||
|
images {
|
||||||
|
format: JPEG
|
||||||
|
quality: 87
|
||||||
|
subsampling: 0
|
||||||
|
}
|
||||||
|
|
||||||
|
# Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
|
||||||
|
tensorboard_single_series_per_graph: false
|
||||||
|
}
|
||||||
|
|
||||||
|
network {
|
||||||
|
metrics {
|
||||||
|
# Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
|
||||||
|
# a specific iteration
|
||||||
|
file_upload_threads: 4
|
||||||
|
|
||||||
|
# Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
|
||||||
|
# being sent for upload
|
||||||
|
file_upload_starvation_warning_sec: 120
|
||||||
|
}
|
||||||
|
|
||||||
|
iteration {
|
||||||
|
# Max number of retries when getting frames if the server returned an error (http code 500)
|
||||||
|
max_retries_on_server_error: 5
|
||||||
|
# Backoff factor for consecutive retry attempts.
|
||||||
|
# SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
|
||||||
|
retry_backoff_factor_sec: 10
|
||||||
|
}
|
||||||
|
}
|
||||||
|
aws {
|
||||||
|
s3 {
|
||||||
|
# S3 credentials, used for read/write access by various SDK elements
|
||||||
|
|
||||||
|
# default, used for any bucket not specified below
|
||||||
|
key: ""
|
||||||
|
secret: ""
|
||||||
|
region: ""
|
||||||
|
|
||||||
|
credentials: [
|
||||||
|
# specifies key/secret credentials to use when handling s3 urls (read or write)
|
||||||
|
# {
|
||||||
|
# bucket: "my-bucket-name"
|
||||||
|
# key: "my-access-key"
|
||||||
|
# secret: "my-secret-key"
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
|
||||||
|
# host: "my-minio-host:9000"
|
||||||
|
# key: "12345678"
|
||||||
|
# secret: "12345678"
|
||||||
|
# multipart: false
|
||||||
|
# secure: false
|
||||||
|
# }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
boto3 {
|
||||||
|
pool_connections: 512
|
||||||
|
max_multipart_concurrency: 16
|
||||||
|
}
|
||||||
|
}
|
||||||
|
google.storage {
|
||||||
|
# # Default project and credentials file
|
||||||
|
# # Will be used when no bucket configuration is found
|
||||||
|
# project: "clearml"
|
||||||
|
# credentials_json: "/path/to/credentials.json"
|
||||||
|
|
||||||
|
# # Specific credentials per bucket and sub directory
|
||||||
|
# credentials = [
|
||||||
|
# {
|
||||||
|
# bucket: "my-bucket"
|
||||||
|
# subdir: "path/in/bucket" # Not required
|
||||||
|
# project: "clearml"
|
||||||
|
# credentials_json: "/path/to/credentials.json"
|
||||||
|
# },
|
||||||
|
# ]
|
||||||
|
}
|
||||||
|
azure.storage {
|
||||||
|
# containers: [
|
||||||
|
# {
|
||||||
|
# account_name: "clearml"
|
||||||
|
# account_key: "secret"
|
||||||
|
# # container_name:
|
||||||
|
# }
|
||||||
|
# ]
|
||||||
|
}
|
||||||
|
|
||||||
|
log {
|
||||||
|
# debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
|
||||||
|
null_log_propagate: false
|
||||||
|
task_log_buffer_capacity: 66
|
||||||
|
|
||||||
|
# disable urllib info and lower levels
|
||||||
|
disable_urllib3_info: true
|
||||||
|
}
|
||||||
|
|
||||||
|
development {
|
||||||
|
# Development-mode options
|
||||||
|
|
||||||
|
# dev task reuse window
|
||||||
|
task_reuse_time_window_in_hours: 72.0
|
||||||
|
|
||||||
|
# Run VCS repository detection asynchronously
|
||||||
|
vcs_repo_detect_async: true
|
||||||
|
|
||||||
|
# Store uncommitted git/hg source code diff in experiment manifest when training in development mode
|
||||||
|
# This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
|
||||||
|
store_uncommitted_code_diff: true
|
||||||
|
|
||||||
|
# Support stopping an experiment in case it was externally stopped, status was changed or task was reset
|
||||||
|
support_stopping: true
|
||||||
|
|
||||||
|
# Default Task output_uri. if output_uri is not provided to Task.init, default_output_uri will be used instead.
|
||||||
|
default_output_uri: ""
|
||||||
|
|
||||||
|
# Default auto generated requirements optimize for smaller requirements
|
||||||
|
# If True, analyze the entire repository regardless of the entry point.
|
||||||
|
# If False, first analyze the entry point script; if it does not reference other local files,
|
||||||
|
# do not analyze the entire repository.
|
||||||
|
force_analyze_entire_repo: false
|
||||||
|
|
||||||
|
# If set to true, *clearml* update message will not be printed to the console
|
||||||
|
# this value can be overwritten with os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
|
||||||
|
suppress_update_message: false
|
||||||
|
|
||||||
|
# If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
|
||||||
|
detect_with_pip_freeze: false
|
||||||
|
|
||||||
|
# Development mode worker
|
||||||
|
worker {
|
||||||
|
# Status report period in seconds
|
||||||
|
report_period_sec: 2
|
||||||
|
|
||||||
|
# ping to the server - check connectivity
|
||||||
|
ping_period_sec: 30
|
||||||
|
|
||||||
|
# Log all stdout & stderr
|
||||||
|
log_stdout: true
|
||||||
|
|
||||||
|
# compatibility feature, report memory usage for the entire machine
|
||||||
|
# default (false), report only on the running process and its sub-processes
|
||||||
|
report_global_mem_used: false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
27
docker/k8s-glue/glue-build/entrypoint.sh
Normal file
27
docker/k8s-glue/glue-build/entrypoint.sh
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
#!/bin/bash -x
# Entrypoint of the k8s glue image: resolves the ClearML server addresses,
# optionally upgrades clearml-agent, then starts the glue daemon.

# Fall back to the legacy TRAINS_* variable when the CLEARML_* one is unset.
export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}

# With no files server configured, derive the default addresses from a host IP
# (explicitly provided, or the public IP reported by ifconfig.me).
if [ -z "$CLEARML_FILES_HOST" ]; then
    CLEARML_HOST_IP=${CLEARML_HOST_IP:-${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}}
fi

export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
export CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
export CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}

# Log the resolved addresses to stderr (quoted to avoid globbing/word splitting).
echo "$CLEARML_FILES_HOST" "$CLEARML_WEB_HOST" "$CLEARML_API_HOST" 1>&2

# Upgrade the agent on start unless CLEARML_AGENT_NO_UPDATE is set.
if [ -z "$CLEARML_AGENT_NO_UPDATE" ]; then
    if [ -n "$CLEARML_AGENT_UPDATE_REPO" ]; then
        python3 -m pip install -q -U "$CLEARML_AGENT_UPDATE_REPO"
    else
        python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
    fi
fi

QUEUE=${K8S_GLUE_QUEUE:-k8s_glue}
EXTRA_ARGS=${K8S_GLUE_EXTRA_ARGS:-}

GLUE_ARGS=(--queue "${QUEUE}")

# k8s_glue_example.py declares --max-pods and --ports-mode as mutually exclusive,
# so the implicit default of 2 pods must not be injected when ports mode is
# requested through K8S_GLUE_EXTRA_ARGS. An explicitly set K8S_GLUE_MAX_PODS is
# always forwarded, leaving any conflict to be reported by the argument parser.
if [ -n "${K8S_GLUE_MAX_PODS:-}" ]; then
    GLUE_ARGS+=(--max-pods "${K8S_GLUE_MAX_PODS}")
else
    case " ${EXTRA_ARGS} " in
        *" --ports-mode "*) ;;
        *) GLUE_ARGS+=(--max-pods 2) ;;
    esac
fi

# EXTRA_ARGS is intentionally unquoted: it carries several whitespace-separated
# command-line arguments. exec replaces the shell so signals reach the daemon.
# shellcheck disable=SC2086
exec python3 k8s_glue_example.py "${GLUE_ARGS[@]}" ${EXTRA_ARGS}
|
94
docker/k8s-glue/glue-build/k8s_glue_example.py
Normal file
94
docker/k8s-glue/glue-build/k8s_glue_example.py
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
"""
|
||||||
|
This example assumes you have preconfigured services with selectors in the form of
|
||||||
|
"ai.allegro.agent.serial=pod-<number>" and a targetPort of 10022.
|
||||||
|
The K8sIntegration component will label each pod accordingly.
|
||||||
|
"""
|
||||||
|
from argparse import ArgumentParser
|
||||||
|
|
||||||
|
from clearml_agent.glue.k8s import K8sIntegration
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command-line options of the k8s glue example.

    ``--ports-mode`` and ``--max-pods`` belong to a mutually exclusive group,
    so supplying both makes argparse exit with a usage error.

    :return: the ``argparse.Namespace`` produced from ``sys.argv``.
    """
    parser = ArgumentParser()
    group = parser.add_mutually_exclusive_group()

    parser.add_argument(
        "--queue", type=str, help="Queue to pull tasks from"
    )
    # NOTE: adjacent string literals are concatenated verbatim, so every
    # continued help fragment must end with an explicit space.
    group.add_argument(
        "--ports-mode", action='store_true', default=False,
        help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports. "
             "Should not be used with max-pods"
    )
    parser.add_argument(
        "--num-of-services", type=int, default=20,
        help="Specify the number of k8s services to be used. Use only with ports-mode."
    )
    parser.add_argument(
        "--base-port", type=int,
        help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
             "For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num, "
             "e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
    )
    parser.add_argument(
        "--base-pod-num", type=int, default=1,
        help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
             "service (default: %(default)s)"
    )
    parser.add_argument(
        "--gateway-address", type=str, default=None,
        help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB"
    )
    parser.add_argument(
        "--pod-clearml-conf", type=str,
        help="Configuration file to be used by the pod itself (if not provided, current configuration is used)"
    )
    parser.add_argument(
        "--overrides-yaml", type=str,
        help="YAML file containing pod overrides to be used when launching a new pod"
    )
    parser.add_argument(
        "--template-yaml", type=str,
        help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
             "and overrides are ignored, otherwise it will be scheduled with kubectl run"
    )
    parser.add_argument(
        "--ssh-server-port", type=int, default=0,
        help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
    )
    parser.add_argument(
        "--namespace", type=str,
        help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
    )
    group.add_argument(
        "--max-pods", type=int,
        help="Limit the maximum number of pods that this service can run at the same time. "
             "Should not be used with ports-mode"
    )
    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Build a K8sIntegration from the command-line options and run its daemon on the requested queue."""
    options = parse_args()

    def k8s_user_props_cb(pod_number=0):
        # Report the port assigned to this pod (base port + pod number) and,
        # when one was given, the external gateway address.
        properties = {"k8s-pod-port": options.base_port + pod_number}
        if options.gateway_address:
            properties["k8s-gateway-address"] = options.gateway_address
        return properties

    # The callback is only passed on in ports mode with a base port configured.
    report_ports = options.ports_mode and options.base_port

    # An SSH server init script is only requested for a non-zero port.
    ssh_init_script = None
    if options.ssh_server_port:
        ssh_init_script = K8sIntegration.get_ssh_server_bash(ssh_port_number=options.ssh_server_port)

    glue = K8sIntegration(
        ports_mode=options.ports_mode,
        num_of_services=options.num_of_services,
        base_pod_num=options.base_pod_num,
        user_props_cb=k8s_user_props_cb if report_ports else None,
        overrides_yaml=options.overrides_yaml,
        clearml_conf_file=options.pod_clearml_conf,
        template_yaml=options.template_yaml,
        extra_bash_init_script=ssh_init_script,
        namespace=options.namespace,
        max_pods_limit=options.max_pods or None,
    )
    glue.k8s_daemon(options.queue)


if __name__ == "__main__":
    main()
|
25
docker/k8s-glue/glue-build/setup.sh
Normal file
25
docker/k8s-glue/glue-build/setup.sh
Normal file
@ -0,0 +1,25 @@
|
|||||||
|
#!/bin/bash
# Build-time provisioning for the k8s glue image: system packages, locale,
# clearml-agent, AWS CLI v2, kubectl and aws-iam-authenticator.

# Abort the image build on the first failing step instead of silently
# producing an image with missing tools.
set -eo pipefail

# Prevent apt from prompting for input during the (non-interactive) build.
export DEBIAN_FRONTEND=noninteractive

EKS_BIN_URL="https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64"
BIN_DIR="$HOME/bin"

chmod +x /root/entrypoint.sh

apt-get update -y
apt-get dist-upgrade -y
apt-get install -y curl unzip less locales python3-pip git

locale-gen en_US.UTF-8

python3 -m pip install -U pip
python3 -m pip install clearml-agent
python3 -m pip install -U "cryptography>=2.9"

# AWS CLI v2. -f makes curl fail on HTTP errors rather than saving the error
# page; the installer archive and directory are removed once installed.
curl -fsSL "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip -q awscliv2.zip
./aws/install
rm -rf awscliv2.zip ./aws

# kubectl and aws-iam-authenticator from the EKS distribution, placed in $HOME/bin.
mkdir -p "$BIN_DIR"
for tool in kubectl aws-iam-authenticator; do
    curl -fsSL -o "$BIN_DIR/$tool" "$EKS_BIN_URL/$tool"
    chmod +x "$BIN_DIR/$tool"
done

# Make the tools available to shells that source ~/.bashrc.
echo 'export PATH=$PATH:$HOME/bin' >> ~/.bashrc
|
54
docker/k8s-glue/k8s-glue.yml
Normal file
54
docker/k8s-glue/k8s-glue.yml
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
---
# Pod that runs the ClearML k8s glue agent.
apiVersion: v1
kind: Pod
metadata:
  name: k8s-glue
spec:
  containers:
    - name: k8s-glue-container
      image: allegroai/clearml-agent-k8s:test
      imagePullPolicy: Always
      # Appends the credentials and server addresses from the environment
      # variables below to ~/clearml.conf, then starts the entrypoint.
      # The multi-line scalar is folded into a single shell command line.
      command:
        - "/bin/bash"
        - "-c"
        - 'echo "api.credentials.access_key: $CLEARML_API_ACCESS_KEY" >> ~/clearml.conf
          && echo "api.credentials.secret_key: $CLEARML_API_SECRET_KEY" >> ~/clearml.conf
          && echo "api.api_server: $CLEARML_API_HOST" >> ~/clearml.conf
          && echo "api.web_server: $CLEARML_WEB_HOST" >> ~/clearml.conf
          && echo "api.files_server: $CLEARML_FILES_HOST" >> ~/clearml.conf
          && source /root/.bashrc
          && export PATH=$PATH:$HOME/bin
          && /root/entrypoint.sh
          '
      volumeMounts:
        # Exposes the pod template secret as /root/template/pod_template.yml.
        - name: pod-template
          mountPath: /root/template
      env:
        # ClearML server addresses.
        - name: CLEARML_API_HOST
          value: ""
        - name: CLEARML_WEB_HOST
          value: ""
        - name: CLEARML_FILES_HOST
          value: ""
        # Glue settings read by /root/entrypoint.sh.
        # - name: K8S_GLUE_MAX_PODS
        #   value: "2"
        - name: K8S_GLUE_QUEUE
          value: "k8s-glue"
        - name: K8S_GLUE_EXTRA_ARGS
          value: "--template-yaml /root/template/pod_template.yml"
        # ClearML credentials.
        - name: CLEARML_API_ACCESS_KEY
          value: ""
        - name: CLEARML_API_SECRET_KEY
          value: ""
        - name: CLEARML_WORKER_ID
          value: "k8s-glue-agent"
        - name: CLEARML_AGENT_UPDATE_REPO
          value: ""
        - name: FORCE_CLEARML_AGENT_REPO
          value: ""
        - name: CLEARML_DOCKER_IMAGE
          value: "ubuntu:18.04"
  volumes:
    # Secret created with:
    # kubectl create secret generic k8s-glue-pod-template --from-file=pod_template.yml
    - name: pod-template
      secret:
        secretName: k8s-glue-pod-template
|
13
docker/k8s-glue/pod_template.yml
Normal file
13
docker/k8s-glue/pod_template.yml
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
---
# Pod template handed to the k8s glue through --template-yaml
# (mounted from the k8s-glue-pod-template secret).
apiVersion: v1
metadata:
  namespace: clearml
spec:
  restartPolicy: Never
  containers:
    # Requests equal limits: one CPU and 4G of memory.
    - resources:
        requests:
          cpu: 1000m
          memory: 4G
        limits:
          cpu: 1000m
          memory: 4G
|
Loading…
Reference in New Issue
Block a user