Mirror of https://github.com/clearml/clearml-agent (synced 2025-06-26 18:16:15 +00:00)

**Compare commits**
**38 commits** (SHA1): f6f043d1ca, db57441c5d, 31d90be0a1, 5a080798cb, 21c4857795, 4149afa896, b196ab5793, b39b54bbaf, 26d76f52ac, 2fff28845d, 5e4c495d62, 5c5802c089, 06010ef1b7, bd411a1984, 29d24e3eaa, 0fbbe774fa, aede6f4bac, 84706ba66d, 6b602889a5, cd046927f3, 5ed47d2d2c, fd068c0933, 9456e493ac, 3b08a73245, 42606d9247, 499b3dfa66, ca360b7d43, 6470b16b70, 4c9410c5fe, 351f0657c3, 382604e923, b48f25a7f9, b76e4fc02b, 27cf7dd67f, 05ec45352c, 6373237960, 1caf7b104f, 161656d9e4
**README.md** (13 changes)
```diff
@@ -8,7 +8,8 @@ ML-Ops scheduler & orchestration solution supporting Linux, macOS and Windows**
 [](https://img.shields.io/github/license/allegroai/clearml-agent.svg)
 [](https://img.shields.io/pypi/pyversions/clearml-agent.svg)
 [](https://img.shields.io/pypi/v/clearml-agent.svg)
+[](https://artifacthub.io/packages/search?repo=allegroai)
 
 </div>
 
 ---
@@ -21,9 +22,9 @@ ML-Ops scheduler & orchestration solution supporting Linux, macOS and Windows**
 * Implement optimized resource utilization policies
 * Deploy execution environments with either virtualenv or fully docker containerized with zero effort
 * Launch-and-Forget service containers
-* [Cloud autoscaling](https://allegro.ai/clearml/docs/docs/examples/services/aws_autoscaler/aws_autoscaler.html)
-* [Customizable cleanup](https://allegro.ai/clearml/docs/docs/examples/services/cleanup/cleanup_service.html)
-* Advanced [pipeline building and execution](https://allegro.ai/clearml/docs/docs/examples/frameworks/pytorch/notebooks/table/tabular_training_pipeline.html)
+* [Cloud autoscaling](https://clear.ml/docs/latest/docs/guides/services/aws_autoscaler)
+* [Customizable cleanup](https://clear.ml/docs/latest/docs/guides/services/cleanup_service)
+* Advanced [pipeline building and execution](https://clear.ml/docs/latest/docs/guides/frameworks/pytorch/notebooks/table/tabular_training_pipeline)
 
 It is a zero configuration fire-and-forget execution agent, providing a full ML/DL cluster solution.
 
@@ -37,7 +38,7 @@ It is a zero configuration fire-and-forget execution agent, providing a full ML/
 "All the Deep/Machine-Learning DevOps your research needs, and then some... Because ain't nobody got time for that"
 
 **Try ClearML now** [Self Hosted](https://github.com/allegroai/clearml-server) or [Free tier Hosting](https://app.community.clear.ml)
-<a href="https://app.community.clear.ml"><img src="https://github.com/allegroai/clearml/blob/master/docs/webapp_screenshots.gif?raw=true" width="100%"></a>
+<a href="https://app.community.clear.ml"><img src="https://github.com/allegroai/clearml-agent/blob/master/docs/screenshots.gif?raw=true" width="100%"></a>
 
 ### Simple, Flexible Experiment Orchestration
 **The ClearML Agent was built to address the DL/ML R&D DevOps needs:**
@@ -122,7 +123,7 @@ The ClearML Agent executes experiments using the following process:
 
 #### System Design & Flow
 
-<img src="https://allegro.ai/clearml/docs/_images/ClearML_Architecture.png" width="100%" alt="clearml-architecture">
+<img src="https://github.com/allegroai/clearml-agent/blob/master/docs/clearml_architecture.png" width="100%" alt="clearml-architecture">
 
 
 #### Installing the ClearML Agent
```
```diff
@@ -31,8 +31,13 @@
 # ignore_requested_python_version: true
 
 # select python package manager:
-# currently supported pip and conda
-# poetry is used if pip selected and repository contains poetry.lock file
+# currently supported: pip, conda and poetry
+# if "pip" or "conda" are used, the agent installs the required packages
+# based on the "installed packages" section of the Task. If the "installed packages" is empty,
+# it will revert to using `requirements.txt` from the repository's root directory.
+# If Poetry is selected and the root repository contains `poetry.lock` or `pyproject.toml`,
+# the "installed packages" section is ignored, and poetry is used.
+# If Poetry is selected and no lock file is found, it reverts to "pip" package manager behaviour.
 package_manager: {
     # supported options: pip, conda, poetry
     type: pip,
@@ -197,4 +202,23 @@
         enabled: true
         extra_keys: []
     }
+
+    # allow to set internal mount points inside the docker,
+    # especially useful for non-root docker container images.
+    docker_internal_mounts {
+        sdk_cache: "/clearml_agent_cache"
+        apt_cache: "/var/cache/apt/archives"
+        ssh_folder: "/root/.ssh"
+        pip_cache: "/root/.cache/pip"
+        poetry_cache: "/root/.cache/pypoetry"
+        vcs_cache: "/root/.clearml/vcs-cache"
+        venv_build: "/root/.clearml/venvs-builds"
+        pip_download: "/root/.clearml/pip-download-cache"
+    }
+
+    # Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
+    # Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
+    # Note: resulting name must start with an alpha-numeric character and continue with alpha-numeric characters,
+    # underscores (_), dots (.) and/or dashes (-)
+    #docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"
 }
```
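The commented-out `docker_container_name_format` above is a Python `str.format`-style template. A minimal sketch of how such a template expands, assuming the documented variables (`task_id`, `worker_id`, `rand_string`); this is an illustration, not the agent's actual rendering code:

```python
import random
import string

def render_container_name(fmt, task_id, worker_id):
    # rand_string: random lower-case letters, up to 32 characters (per the comment above)
    rand_string = "".join(random.choice(string.ascii_lowercase) for _ in range(32))
    return fmt.format(task_id=task_id, worker_id=worker_id, rand_string=rand_string)

# "{rand_string:.8}" uses format-spec precision to keep only the first 8 characters
print(render_container_name(
    "clearml-id-{task_id}-{rand_string:.8}", task_id="abc123", worker_id="gpu0"
))  # e.g. clearml-id-abc123-qwhzkptj
```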
```diff
@@ -31,7 +31,9 @@
 }
 
 auth {
-    # When creating a request, if token will expire in less than this value, try to refresh the token
-    token_expiration_threshold_sec = 360
+    # When creating a request, if token will expire in less than this value, try to refresh the token. Default 12 hours
+    token_expiration_threshold_sec: 43200
+    # When requesting a token, request specific expiration time. Server default (and maximum) is 30 days
+    # request_token_expiration_sec: 2592000
 }
 }
```
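The threshold default moves from 360 seconds to 12 hours (43200 seconds): a token is considered due for refresh once its remaining lifetime drops below the threshold. A minimal sketch of that rule (it mirrors `_calc_token_valid_period_sec` shown further down, simplified):

```python
from time import time

def needs_refresh(token_exp_epoch, threshold_sec=43200):
    """True when the token's remaining lifetime is below the refresh threshold."""
    return (token_exp_epoch - time()) < threshold_sec

# A token expiring in one hour is refreshed under the new 12-hour default,
# but would not have been under the old 360-second threshold:
print(needs_refresh(time() + 3600))       # True
print(needs_refresh(time() + 3600, 360))  # False
```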
```diff
@@ -1,3 +1,4 @@
+from ...backend_config.converters import safe_text_to_bool
 from ...backend_config.environment import EnvEntry
 
 
@@ -6,6 +7,12 @@ ENV_WEB_HOST = EnvEntry("CLEARML_WEB_HOST", "TRAINS_WEB_HOST")
 ENV_FILES_HOST = EnvEntry("CLEARML_FILES_HOST", "TRAINS_FILES_HOST")
 ENV_ACCESS_KEY = EnvEntry("CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY")
 ENV_SECRET_KEY = EnvEntry("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
+ENV_AUTH_TOKEN = EnvEntry("CLEARML_AUTH_TOKEN")
 ENV_VERBOSE = EnvEntry("CLEARML_API_VERBOSE", "TRAINS_API_VERBOSE", type=bool, default=False)
 ENV_HOST_VERIFY_CERT = EnvEntry("CLEARML_API_HOST_VERIFY_CERT", "TRAINS_API_HOST_VERIFY_CERT", type=bool, default=True)
 ENV_CONDA_ENV_PACKAGE = EnvEntry("CLEARML_CONDA_ENV_PACKAGE", "TRAINS_CONDA_ENV_PACKAGE")
+ENV_NO_DEFAULT_SERVER = EnvEntry("CLEARML_NO_DEFAULT_SERVER", "TRAINS_NO_DEFAULT_SERVER", type=bool, default=True)
+ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type=bool)
+ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
+    'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
+)
```
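Each `EnvEntry` above lists several environment variable names; the `CLEARML_*`-first ordering suggests earlier names take precedence over the legacy `TRAINS_*` aliases. A tiny sketch of that first-match lookup (an illustration, not the actual `EnvEntry` implementation):

```python
import os

def env_entry_get(*names, default=None):
    """Return the value of the first defined environment variable among names."""
    for name in names:
        value = os.environ.get(name)
        if value is not None:
            return value
    return default

os.environ["TRAINS_API_ACCESS_KEY"] = "legacy-key"
os.environ["CLEARML_API_ACCESS_KEY"] = "new-key"
# The CLEARML_* name is listed first, so it wins over the TRAINS_* alias:
print(env_entry_get("CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY"))  # new-key
```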
```diff
@@ -1,17 +1,21 @@
 import json as json_lib
+import os
 import sys
 import types
 from socket import gethostname
-from six.moves.urllib.parse import urlparse, urlunparse
+from typing import Optional
 
 import jwt
 import requests
 import six
-from pyhocon import ConfigTree
+from pyhocon import ConfigTree, ConfigFactory
 from requests.auth import HTTPBasicAuth
+from six.moves.urllib.parse import urlparse, urlunparse
 
 from .callresult import CallResult
-from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST
+from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN, \
+    ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE
 from .request import Request, BatchRequest
 from .token_manager import TokenManager
 from ..config import load
```
```diff
@@ -40,11 +44,12 @@ class Session(TokenManager):
     _session_requests = 0
     _session_initial_timeout = (3.0, 10.)
     _session_timeout = (10.0, 30.)
-    _session_initial_connect_retry = 4
+    _session_initial_retry_connect_override = 4
    _write_session_data_size = 15000
     _write_session_timeout = (30.0, 30.)
 
     api_version = '2.1'
+    feature_set = 'basic'
     default_host = "https://demoapi.demo.clear.ml"
     default_web = "https://demoapp.demo.clear.ml"
     default_files = "https://demofiles.demo.clear.ml"
```
```diff
@@ -99,44 +104,48 @@ class Session(TokenManager):
         if initialize_logging:
             self.config.initialize_logging(debug=kwargs.get('debug', False))
 
-        token_expiration_threshold_sec = self.config.get(
-            "auth.token_expiration_threshold_sec", 60
-        )
-
-        super(Session, self).__init__(
-            token_expiration_threshold_sec=token_expiration_threshold_sec, **kwargs
-        )
+        super(Session, self).__init__(config=config, **kwargs)
 
         self._verbose = verbose if verbose is not None else ENV_VERBOSE.get()
         self._logger = logger
+        self.__auth_token = None
 
-        self.__access_key = api_key or ENV_ACCESS_KEY.get(
-            default=(self.config.get("api.credentials.access_key", None) or self.default_key),
-            value_cb=lambda key, value: print("Using environment access key {}={}".format(key, value))
-        )
-        if not self.access_key:
-            raise ValueError(
-                "Missing access_key. Please set in configuration file or pass in session init."
-            )
+        if ENV_AUTH_TOKEN.get(
+            value_cb=lambda key, value: print("Using environment access token {}=********".format(key))
+        ):
+            self.set_auth_token(ENV_AUTH_TOKEN.get())
+        else:
+            self.__access_key = api_key or ENV_ACCESS_KEY.get(
+                default=(self.config.get("api.credentials.access_key", None) or self.default_key),
+                value_cb=lambda key, value: print("Using environment access key {}={}".format(key, value))
+            )
+            if not self.access_key:
+                raise ValueError(
+                    "Missing access_key. Please set in configuration file or pass in session init."
+                )
 
-        self.__secret_key = secret_key or ENV_SECRET_KEY.get(
-            default=(self.config.get("api.credentials.secret_key", None) or self.default_secret),
-            value_cb=lambda key, value: print("Using environment secret key {}=********".format(key))
-        )
-        if not self.secret_key:
-            raise ValueError(
-                "Missing secret_key. Please set in configuration file or pass in session init."
-            )
+            self.__secret_key = secret_key or ENV_SECRET_KEY.get(
+                default=(self.config.get("api.credentials.secret_key", None) or self.default_secret),
+                value_cb=lambda key, value: print("Using environment secret key {}=********".format(key))
+            )
+            if not self.secret_key:
+                raise ValueError(
+                    "Missing secret_key. Please set in configuration file or pass in session init."
+                )
 
         if self.access_key == self.default_key and self.secret_key == self.default_secret:
             print("Using built-in ClearML default key/secret")
 
         host = host or self.get_api_server_host(config=self.config)
         if not host:
-            raise ValueError("host is required in init or config")
+            raise ValueError(
+                "Could not find host server definition "
+                "(missing `~/clearml.conf` or Environment CLEARML_API_HOST)\n"
+                "To get started with ClearML: setup your own `clearml-server`, "
+                "or create a free account at https://app.community.clear.ml and run `clearml-agent init`"
+            )
 
         self.__host = host.strip("/")
         http_retries_config = http_retries_config or self.config.get(
             "api.http.retries", ConfigTree()
         ).as_plain_ordered_dict()
         http_retries_config["status_forcelist"] = self._retry_codes
 
         self.__worker = worker or gethostname()
```
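The net effect of the new branch is an authentication precedence: a `CLEARML_AUTH_TOKEN` bearer token, when set, short-circuits the access-key/secret-key lookup entirely. A condensed sketch of the decision (names simplified from the diff above):

```python
import os

def pick_auth(api_key=None, secret_key=None):
    """Illustration of the session's credential precedence after this change."""
    token = os.environ.get("CLEARML_AUTH_TOKEN")
    if token:
        # bearer-token mode: key/secret are cleared and requests carry the token
        return {"mode": "token", "token": token}
    if not api_key or not secret_key:
        raise ValueError("Missing access_key/secret_key. "
                         "Please set in configuration file or pass in session init.")
    return {"mode": "basic", "key": api_key, "secret": secret_key}

print(pick_auth(api_key="KEY", secret_key="SECRET")["mode"])  # basic
os.environ["CLEARML_AUTH_TOKEN"] = "eyJhbGciOi..."            # example token value
print(pick_auth()["mode"])                                    # token
```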
```diff
@@ -147,13 +156,15 @@ class Session(TokenManager):
         self.client = client or "api-{}".format(__version__)
 
         # limit the reconnect retries, so we get an error if we are starting the session
-        http_no_retries_config = dict(**http_retries_config)
-        http_no_retries_config['connect'] = self._session_initial_connect_retry
-        self.__http_session = get_http_session_with_retry(**http_no_retries_config)
+        _, self.__http_session = self._setup_session(
+            http_retries_config,
+            initial_session=True,
+            default_initial_connect_override=(False if kwargs.get("command") == "execute" else None)
+        )
         # try to connect with the server
         self.refresh_token()
         # create the default session with many retries
-        self.__http_session = get_http_session_with_retry(**http_retries_config)
+        http_retries_config, self.__http_session = self._setup_session(http_retries_config)
 
         # update api version from server response
         try:
```
```diff
@@ -163,6 +174,7 @@ class Session(TokenManager):
             api_version = '2.2' if token_dict.get('env', '') == 'prod' else Session.api_version
 
             Session.api_version = str(api_version)
+            Session.feature_set = str(token_dict.get('feature_set', self.feature_set) or "basic")
         except (jwt.DecodeError, ValueError):
             pass
 
```
```diff
@@ -171,6 +183,65 @@ class Session(TokenManager):
             # notice: this is across the board warning omission
             urllib_log_warning_setup(total_retries=http_retries_config.get('total', 0), display_warning_after=3)
 
+        self._load_vaults()
+
+    def _setup_session(self, http_retries_config, initial_session=False, default_initial_connect_override=None):
+        # type: (dict, bool, Optional[bool]) -> (dict, requests.Session)
+        http_retries_config = http_retries_config or self.config.get(
+            "api.http.retries", ConfigTree()
+        ).as_plain_ordered_dict()
+        http_retries_config["status_forcelist"] = self._retry_codes
+
+        if initial_session:
+            kwargs = {} if default_initial_connect_override is None else {
+                "default": default_initial_connect_override
+            }
+            if ENV_INITIAL_CONNECT_RETRY_OVERRIDE.get(**kwargs):
+                connect_retries = self._session_initial_retry_connect_override
+                try:
+                    value = ENV_INITIAL_CONNECT_RETRY_OVERRIDE.get(converter=str)
+                    if not isinstance(value, bool):
+                        connect_retries = abs(int(value))
+                except ValueError:
+                    pass
+
+                http_retries_config = dict(**http_retries_config)
+                http_retries_config['connect'] = connect_retries
+
+        return http_retries_config, get_http_session_with_retry(**http_retries_config)
+
+    def _load_vaults(self):
+        if not self.check_min_api_version("2.15") or self.feature_set == "basic":
+            return
+
+        if ENV_DISABLE_VAULT_SUPPORT.get():
+            print("Vault support is disabled")
+            return
+
+        def parse(vault):
+            # noinspection PyBroadException
+            try:
+                d = vault.get('data', None)
+                if d:
+                    r = ConfigFactory.parse_string(d)
+                    if isinstance(r, (ConfigTree, dict)):
+                        return r
+            except Exception as e:
+                print("Failed parsing vault {}: {}".format(vault.get("description", "<unknown>"), e))
+
+        # noinspection PyBroadException
+        try:
+            res = self.send_request("users", "get_vaults", json={"enabled": True, "types": ["config"]})
+            if res.ok:
+                vaults = res.json().get("data", {}).get("vaults", [])
+                data = list(filter(None, map(parse, vaults)))
+                if data:
+                    self.config.set_overrides(*data)
+            elif res.status_code != 404:
+                raise Exception(res.json().get("meta", {}).get("result_msg", res.text))
+        except Exception as ex:
+            print("Failed getting vaults: {}".format(ex))
+
     def _send_request(
         self,
         service,
```
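`_load_vaults` treats each vault's `data` field as a HOCON document and layers it over the local configuration via `set_overrides`. A small illustration of that parse-and-merge step with `pyhocon` (the vault payload below is made up):

```python
from pyhocon import ConfigFactory, ConfigTree

local = ConfigFactory.parse_string(
    'agent { default_docker { image: "nvidia/cuda" } }'
)

# Hypothetical payload, as it might arrive in a vault's "data" field
vault_data = 'agent { default_docker { image: "my-registry/train:latest" } }'
override = ConfigFactory.parse_string(vault_data)

merged = ConfigTree.merge_configs(local, override, copy_trees=True)
print(merged.get("agent.default_docker.image"))  # my-registry/train:latest
```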
```diff
@@ -244,6 +315,10 @@ class Session(TokenManager):
             headers[self._AUTHORIZATION_HEADER] = "Bearer {}".format(self.token)
         return headers
 
+    def set_auth_token(self, auth_token):
+        self.__access_key = self.__secret_key = None
+        self._set_token(auth_token)
+
     def send_request(
         self,
         service,
```
```diff
@@ -441,8 +516,11 @@ class Session(TokenManager):
         if not config:
             return None
 
-        return ENV_HOST.get(default=(config.get("api.api_server", None) or
-                                     config.get("api.host", None) or cls.default_host))
+        default = config.get("api.api_server", None) or config.get("api.host", None)
+        if not ENV_NO_DEFAULT_SERVER.get():
+            default = default or cls.default_host
+
+        return ENV_HOST.get(default=default)
 
     @classmethod
     def get_app_server_host(cls, config=None):
```
```diff
@@ -510,7 +588,7 @@ class Session(TokenManager):
             return v + (0,) * max(0, 3 - len(v))
         return version_tuple(cls.api_version) >= version_tuple(str(min_api_version))
 
-    def _do_refresh_token(self, old_token, exp=None):
+    def _do_refresh_token(self, current_token, exp=None):
         """ TokenManager abstract method implementation.
            Here we ignore the old token and simply obtain a new token.
        """
@@ -522,7 +600,13 @@ class Session(TokenManager):
             )
         )
 
-        auth = HTTPBasicAuth(self.access_key, self.secret_key)
+        auth = None
+        headers = None
+        if self.access_key and self.secret_key:
+            auth = HTTPBasicAuth(self.access_key, self.secret_key)
+        elif current_token:
+            headers = dict(Authorization="Bearer {}".format(current_token))
+
         res = None
         try:
             data = {"expiration_sec": exp} if exp else {}
@@ -531,6 +615,7 @@ class Session(TokenManager):
                 action="login",
                 auth=auth,
                 json=data,
+                headers=headers,
                 refresh_token_if_unauthorized=False,
             )
             try:
@@ -546,7 +631,10 @@ class Session(TokenManager):
             )
             if verbose:
                 self._logger.info("Received new token")
-            return resp["data"]["token"]
+            token = resp["data"]["token"]
+            if ENV_AUTH_TOKEN.get():
+                os.environ[ENV_AUTH_TOKEN.key] = token
+            return token
         except LoginError:
             six.reraise(*sys.exc_info())
         except KeyError as ex:
```
```diff
@@ -9,6 +9,8 @@ import six
 
 @six.add_metaclass(ABCMeta)
 class TokenManager(object):
+    _default_token_exp_threshold_sec = 12 * 60 * 60
+    _default_req_token_expiration_sec = None
 
     @property
     def token_expiration_threshold_sec(self):
@@ -41,17 +43,30 @@ class TokenManager(object):
         return self.__token
 
     def __init__(
-        self,
-        token=None,
-        req_token_expiration_sec=None,
-        token_history=None,
-        token_expiration_threshold_sec=60,
-        **kwargs
+        self,
+        token=None,
+        req_token_expiration_sec=None,
+        token_history=None,
+        token_expiration_threshold_sec=None,
+        config=None,
+        **kwargs
     ):
         super(TokenManager, self).__init__()
         assert isinstance(token_history, (type(None), dict))
-        self.token_expiration_threshold_sec = token_expiration_threshold_sec
-        self.req_token_expiration_sec = req_token_expiration_sec
+        if config:
+            req_token_expiration_sec = req_token_expiration_sec or config.get(
+                "api.auth.request_token_expiration_sec", None
+            )
+            token_expiration_threshold_sec = (
+                token_expiration_threshold_sec
+                or config.get("api.auth.token_expiration_threshold_sec", None)
+            )
+        self.token_expiration_threshold_sec = (
+            token_expiration_threshold_sec or self._default_token_exp_threshold_sec
+        )
+        self.req_token_expiration_sec = (
+            req_token_expiration_sec or self._default_req_token_expiration_sec
+        )
         self._set_token(token)
 
     def _calc_token_valid_period_sec(self, token, exp=None, at_least_sec=None):
@@ -59,7 +74,9 @@ class TokenManager(object):
         try:
             exp = exp or self._get_token_exp(token)
             if at_least_sec:
-                at_least_sec = max(at_least_sec, self.token_expiration_threshold_sec)
+                at_least_sec = max(
+                    at_least_sec, self.token_expiration_threshold_sec
+                )
             else:
                 at_least_sec = self.token_expiration_threshold_sec
             return max(0, (exp - time() - at_least_sec))
@@ -71,14 +88,16 @@ class TokenManager(object):
     def get_decoded_token(cls, token, verify=False):
         """ Get token expiration time. If not present, assume forever """
         return jwt.decode(
-            token, verify=verify,
-            algorithms=get_default_algorithms())
+            token,
+            verify=verify,
+            options=dict(verify_signature=False),
+            algorithms=get_default_algorithms(),
+        )
 
     @classmethod
     def _get_token_exp(cls, token):
         """ Get token expiration time. If not present, assume forever """
-        return cls.get_decoded_token(token).get('exp', sys.maxsize)
+        return cls.get_decoded_token(token).get("exp", sys.maxsize)
 
     def _set_token(self, token):
         if token:
@@ -89,7 +108,9 @@ class TokenManager(object):
         self.__token_expiration_sec = 0
 
     def get_token_valid_period_sec(self):
-        return self._calc_token_valid_period_sec(self.__token, self.token_expiration_sec)
+        return self._calc_token_valid_period_sec(
+            self.__token, self.token_expiration_sec
+        )
 
     def _get_token(self):
         if self.get_token_valid_period_sec() <= 0:
@@ -101,4 +122,6 @@ class TokenManager(object):
         pass
 
     def refresh_token(self):
-        self._set_token(self._do_refresh_token(self.__token, exp=self.req_token_expiration_sec))
+        self._set_token(
+            self._do_refresh_token(self.__token, exp=self.req_token_expiration_sec)
+        )
```
```diff
@@ -4,15 +4,13 @@ import functools
 import json
 import os
-import sys
-import warnings
 from fnmatch import fnmatch
 from os.path import expanduser
 from typing import Any
 
 import pyhocon
 import six
 from pathlib2 import Path
-from pyhocon import ConfigTree
+from pyhocon import ConfigTree, ConfigFactory
 from pyparsing import (
     ParseFatalException,
     ParseException,
@@ -71,6 +69,10 @@ class Config(object):
 
     # used in place of None in Config.get as default value because None is a valid value
     _MISSING = object()
+    extra_config_values_env_key_sep = "__"
+    extra_config_values_env_key_prefix = [
+        "CLEARML_AGENT" + extra_config_values_env_key_sep,
+    ]
 
     def __init__(
         self,
@@ -80,7 +82,7 @@ class Config(object):
         relative_to=None,
         app=None,
         is_server=False,
-        **_
+        **_,
     ):
         self._app = app
         self._verbose = verbose
@@ -90,6 +92,7 @@ class Config(object):
         self._env = env or os.environ.get("TRAINS_ENV", Environment.default)
         self.config_paths = set()
         self.is_server = is_server
+        self._overrides_configs = None
 
         if self._verbose:
             print("Config env:%s" % str(self._env))
@@ -100,6 +103,7 @@ class Config(object):
         )
         if self._env not in get_options(Environment):
             raise ValueError("Invalid environment %s" % env)
+
         if relative_to is not None:
             self.load_relative_to(relative_to)
 
@@ -158,7 +162,9 @@ class Config(object):
         if LOCAL_CONFIG_PATHS:
             config = functools.reduce(
                 lambda cfg, path: ConfigTree.merge_configs(
-                    cfg, self._read_recursive(path, verbose=self._verbose), copy_trees=True
+                    cfg,
+                    self._read_recursive(path, verbose=self._verbose),
+                    copy_trees=True,
                 ),
                 LOCAL_CONFIG_PATHS,
                 config,
@@ -181,9 +187,38 @@ class Config(object):
             config,
         )
 
+        config = ConfigTree.merge_configs(
+            config, self._read_extra_env_config_values(), copy_trees=True
+        )
+
+        if self._overrides_configs:
+            config = functools.reduce(
+                lambda cfg, override: ConfigTree.merge_configs(cfg, override, copy_trees=True),
+                self._overrides_configs,
+                config,
+            )
+
         config["env"] = env
         return config
 
+    def _read_extra_env_config_values(self) -> ConfigTree:
+        """ Loads extra configuration from environment-injected values """
+        result = ConfigTree()
+
+        for prefix in self.extra_config_values_env_key_prefix:
+            keys = sorted(k for k in os.environ if k.startswith(prefix))
+            for key in keys:
+                path = (
+                    key[len(prefix) :]
+                    .replace(self.extra_config_values_env_key_sep, ".")
+                    .lower()
+                )
+                result = ConfigTree.merge_configs(
+                    result, ConfigFactory.parse_string(f"{path}: {os.environ[key]}")
+                )
+
+        return result
+
     def replace(self, config):
         self._config = config
```
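With `_read_extra_env_config_values`, any environment variable prefixed `CLEARML_AGENT__` becomes a configuration override: the prefix is stripped, each `__` separator turns into a `.`, the key is lower-cased, and the value is parsed as HOCON. A runnable illustration of the mapping (the variable below is an example; any config key works the same way):

```python
import os
from pyhocon import ConfigFactory, ConfigTree

os.environ["CLEARML_AGENT__AGENT__DOCKER_FORCE_PULL"] = "true"

prefix, sep = "CLEARML_AGENT__", "__"
result = ConfigTree()
for key in sorted(k for k in os.environ if k.startswith(prefix)):
    path = key[len(prefix):].replace(sep, ".").lower()  # agent.docker_force_pull
    result = ConfigTree.merge_configs(
        result, ConfigFactory.parse_string(f"{path}: {os.environ[key]}")
    )

print(result.get("agent.docker_force_pull"))  # True (parsed as a HOCON boolean)
```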
```diff
@@ -340,3 +375,10 @@ class Config(object):
         except Exception as ex:
             print("Failed loading %s: %s" % (file_path, ex))
             raise
+
+    def set_overrides(self, *dicts):
+        """ Set several override dictionaries or ConfigTree objects which should be merged onto the configuration """
+        self._overrides_configs = [
+            d if isinstance(d, ConfigTree) else pyhocon.ConfigFactory.from_dict(d) for d in dicts
+        ]
+        self.reload()
```
```diff
@@ -24,6 +24,14 @@ def text_to_bool(value):
     return bool(strtobool(value))
 
 
+def safe_text_to_bool(value):
+    # type: (Text) -> bool
+    try:
+        return text_to_bool(value)
+    except ValueError:
+        return bool(value)
+
+
 def any_to_bool(value):
     # type: (Optional[Union[int, float, Text]]) -> bool
     if isinstance(value, six.text_type):
```
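`safe_text_to_bool` wraps `text_to_bool` (a thin layer over `distutils.util.strtobool`) so that unrecognized strings fall back to plain truthiness instead of raising. A quick comparison, assuming the implementations shown above:

```python
from distutils.util import strtobool

def text_to_bool(value):
    return bool(strtobool(value))

def safe_text_to_bool(value):
    try:
        return text_to_bool(value)
    except ValueError:
        return bool(value)

print(text_to_bool("yes"))         # True
print(safe_text_to_bool("maybe"))  # True  (non-empty string, fallback path)
print(safe_text_to_bool(""))       # False (empty string, fallback path)
# text_to_bool("maybe") would raise ValueError: invalid truth value 'maybe'
```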
```diff
@@ -118,11 +118,13 @@ class ServiceCommandSection(BaseCommandSection):
         """ The name of the REST service used by this command """
         pass
 
-    def get(self, endpoint, *args, **kwargs):
-        return self._session.get(service=self.service, action=endpoint, *args, **kwargs)
+    def get(self, endpoint, *args, session=None, **kwargs):
+        session = session or self._session
+        return session.get(service=self.service, action=endpoint, *args, **kwargs)
 
-    def post(self, endpoint, *args, **kwargs):
-        return self._session.post(service=self.service, action=endpoint, *args, **kwargs)
+    def post(self, endpoint, *args, session=None, **kwargs):
+        session = session or self._session
+        return session.post(service=self.service, action=endpoint, *args, **kwargs)
 
     def get_with_act_as(self, endpoint, *args, **kwargs):
         return self._session.get_with_act_as(service=self.service, action=endpoint, *args, **kwargs)
```
```diff
@@ -11,8 +11,8 @@ from clearml_agent.backend_config.defs import LOCAL_CONFIG_FILES
 
 
 description = """
-Please create new clearml credentials through the profile page in your clearml web app (e.g. https://demoapp.demo.clear.ml/profile)
-Or with the free hosted service at https://app.community.clear.ml/profile
+Please create new clearml credentials through the profile page in your `clearml-server` web app,
+or create a free account at https://app.community.clear.ml/profile
 
 In the profile page, press "Create new credentials", then press "Copy to clipboard".
```
```diff
@@ -21,14 +21,16 @@ class Events(ServiceCommandSection):
         """ Events command service endpoint """
         return 'events'
 
-    def send_events(self, list_events):
+    def send_events(self, list_events, session=None):
         def send_packet(jsonlines):
             if not jsonlines:
                 return 0
             num_lines = len(jsonlines)
             jsonlines = '\n'.join(jsonlines)
 
-            new_events = self.post('add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'})
+            new_events = self.post(
+                'add_batch', data=jsonlines, headers={'Content-type': 'application/json-lines'}, session=session
+            )
             if new_events['added'] != num_lines:
                 print('Error (%s) sending events only %d of %d registered' %
                       (new_events['errors'], new_events['added'], num_lines))
@@ -57,7 +59,7 @@ class Events(ServiceCommandSection):
         # print('Sending events done: %d / %d events sent' % (sent_events, len(list_events)))
         return sent_events
 
-    def send_log_events(self, worker_id, task_id, lines, level='DEBUG'):
+    def send_log_events(self, worker_id, task_id, lines, level='DEBUG', session=None):
         log_events = []
         base_timestamp = int(time.time() * 1000)
         base_log_items = {
@@ -94,4 +96,4 @@ class Events(ServiceCommandSection):
             log_events.append(get_event(count))
 
         # now send the events
-        return self.send_events(list_events=log_events)
+        return self.send_events(list_events=log_events, session=session)
```
*File diff suppressed because it is too large.*
```diff
@@ -1,10 +1,10 @@
+import shlex
 from datetime import timedelta
 from distutils.util import strtobool
 from enum import IntEnum
 from os import getenv, environ
 from typing import Text, Optional, Union, Tuple, Any
 
 from furl import furl
 from pathlib2 import Path
 
 import six
@@ -34,6 +34,7 @@ class EnvironmentConfig(object):
     conversions = {
         bool: lambda value: bool(strtobool(value)),
         six.text_type: lambda s: six.text_type(s).strip(),
+        list: lambda s: shlex.split(s.strip()),
     }
 
     def __init__(self, *names, **kwargs):
@@ -63,6 +64,7 @@ class EnvironmentConfig(object):
 
 
 ENV_AGENT_SECRET_KEY = EnvironmentConfig("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
+ENV_AGENT_AUTH_TOKEN = EnvironmentConfig("CLEARML_AUTH_TOKEN")
 ENV_AWS_SECRET_KEY = EnvironmentConfig("AWS_SECRET_ACCESS_KEY")
 ENV_AZURE_ACCOUNT_KEY = EnvironmentConfig("AZURE_STORAGE_KEY")
 
@@ -130,15 +132,20 @@ PIP_EXTRA_INDICES = [
 DEFAULT_PIP_DOWNLOAD_CACHE = normalize_path(CONFIG_DIR, "pip-download-cache")
 ENV_DOCKER_IMAGE = EnvironmentConfig('CLEARML_DOCKER_IMAGE', 'TRAINS_DOCKER_IMAGE')
 ENV_WORKER_ID = EnvironmentConfig('CLEARML_WORKER_ID', 'TRAINS_WORKER_ID')
+ENV_WORKER_TAGS = EnvironmentConfig('CLEARML_WORKER_TAGS')
+ENV_AGENT_SKIP_PIP_VENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PIP_VENV_INSTALL')
 ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', 'TRAINS_DOCKER_SKIP_GPUS_FLAG')
 ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
 ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
 ENV_AGENT_GIT_HOST = EnvironmentConfig('CLEARML_AGENT_GIT_HOST', 'TRAINS_AGENT_GIT_HOST')
 ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig('CLEARML_AGENT_DISABLE_SSH_MOUNT', type=bool)
+ENV_SSH_AUTH_SOCK = EnvironmentConfig('SSH_AUTH_SOCK')
 ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig('CLEARML_AGENT_EXEC_USER', 'TRAINS_AGENT_EXEC_USER')
 ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig('CLEARML_AGENT_EXTRA_PYTHON_PATH', 'TRAINS_AGENT_EXTRA_PYTHON_PATH')
 ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEARML_AGENT_DOCKER_HOST_MOUNT',
                                           'TRAINS_AGENT_K8S_HOST_MOUNT', 'TRAINS_AGENT_DOCKER_HOST_MOUNT')
+ENV_VENV_CACHE_PATH = EnvironmentConfig('CLEARML_AGENT_VENV_CACHE_PATH')
+ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig('CLEARML_AGENT_EXTRA_DOCKER_ARGS', type=list)
 
 
 class FileBuffering(IntEnum):
```
```diff
@@ -31,10 +31,12 @@ def parse(reqstr, cwd=None):
         elif not line or line.startswith('#'):
             # comments are lines that start with # only
             continue
-        elif line.startswith('-r') or line.startswith('--requirement'):
+        elif line.startswith('-r ') or line.startswith('--requirement '):
             _, new_filename = line.split()
             new_file_path = os.path.join(
                 os.path.dirname(filename or '.') if filename or not cwd else cwd, new_filename)
+            if not os.path.exists(new_file_path):
+                continue
             with open(new_file_path) as f:
                 for requirement in parse(f):
                     yield requirement
```
```diff
@@ -2,6 +2,7 @@ from __future__ import print_function, division, unicode_literals
 
 import base64
+import functools
 import hashlib
 import json
 import logging
 import os
@@ -17,7 +18,7 @@ from typing import Text, List, Callable, Any, Collection, Optional, Union
 import yaml
 
 from clearml_agent.commands.events import Events
-from clearml_agent.commands.worker import Worker, get_task_container
+from clearml_agent.commands.worker import Worker, get_task_container, set_task_container
 from clearml_agent.definitions import ENV_DOCKER_IMAGE
 from clearml_agent.errors import APIError
 from clearml_agent.helper.base import safe_remove_file
@@ -184,6 +185,8 @@ class K8sIntegration(Worker):
         # make sure we use system packages!
         self.conf_file_content += '\nagent.package_manager.system_site_packages=true\n'
 
+        self._agent_label = None
+
         self._monitor_hanging_pods()
 
     def _monitor_hanging_pods(self):
```
```diff
@@ -191,7 +194,18 @@ class K8sIntegration(Worker):
         _check_pod_thread.daemon = True
         _check_pod_thread.start()
 
+    @staticmethod
+    def _get_path(d, *path, default=None):
+        try:
+            return functools.reduce(
+                lambda a, b: a[b], path, d
+            )
+        except (IndexError, KeyError):
+            return default
+
     def _monitor_hanging_pods_daemon(self):
+        last_tasks_msgs = {}  # last msg updated for every task
+
         while True:
             output = get_bash_output('kubectl get pods -n {namespace} -o=JSON'.format(
                 namespace=self.namespace
```
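`_get_path` is a small safe-navigation helper over nested dicts and lists; the monitor below uses it to walk the `kubectl get pods` JSON without wrapping every lookup in try/except. A usage sketch against a made-up pod document:

```python
import functools

def get_path(d, *path, default=None):
    """Safely fetch a nested value (mirrors K8sIntegration._get_path)."""
    try:
        return functools.reduce(lambda a, b: a[b], path, d)
    except (IndexError, KeyError):
        return default

pod = {  # abbreviated, hypothetical kubectl JSON
    "status": {
        "phase": "Pending",
        "containerStatuses": [{"state": {"waiting": {"reason": "ImagePullBackOff"}}}],
    }
}
print(get_path(pod, "status", "containerStatuses", 0, "state", "waiting", "reason"))
# ImagePullBackOff
print(get_path(pod, "status", "conditions", 0, default="n/a"))  # n/a (missing key)
```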
```diff
@@ -204,23 +218,44 @@ class K8sIntegration(Worker):
                 sleep(self._polling_interval)
                 continue
             pods = output_config.get('items', [])
+            task_ids = set()
             for pod in pods:
-                try:
-                    reason = functools.reduce(
-                        lambda a, b: a[b], ('status', 'containerStatuses', 0, 'state', 'waiting', 'reason'), pod
-                    )
-                except (IndexError, KeyError):
+                if self._get_path(pod, 'status', 'phase') != "Pending":
                     continue
-                if reason == 'ImagePullBackOff':
-                    pod_name = pod.get('metadata', {}).get('name', None)
-                    if pod_name:
-                        task_id = pod_name.rpartition('-')[-1]
+
+                pod_name = pod.get('metadata', {}).get('name', None)
+                if not pod_name:
+                    continue
+
+                task_id = pod_name.rpartition('-')[-1]
+                if not task_id:
+                    continue
+
+                task_ids.add(task_id)
+
+                msg = None
+
+                waiting = self._get_path(pod, 'status', 'containerStatuses', 0, 'state', 'waiting')
+                if not waiting:
+                    condition = self._get_path(pod, 'status', 'conditions', 0)
+                    if condition:
+                        reason = condition.get('reason')
+                        if reason == 'Unschedulable':
+                            message = condition.get('message')
+                            msg = reason + (" ({})".format(message) if message else "")
+                else:
+                    reason = waiting.get("reason", None)
+                    message = waiting.get("message", None)
+
+                    msg = reason + (" ({})".format(message) if message else "")
+
+                    if reason == 'ImagePullBackOff':
+                        delete_pod_cmd = 'kubectl delete pods {} -n {}'.format(pod_name, self.namespace)
+                        get_bash_output(delete_pod_cmd)
                         try:
                             self._session.api_client.tasks.failed(
                                 task=task_id,
-                                status_reason="K8S glue error due to ImagePullBackOff",
+                                status_reason="K8S glue error: {}".format(msg),
                                 status_message="Changed by K8S glue",
                                 force=True
                             )
@@ -228,6 +263,35 @@ class K8sIntegration(Worker):
                             self.log.warning(
                                 'K8S Glue pods monitor: Failed deleting task "{}"\nEX: {}'.format(task_id, ex)
                             )
+
+                        # clean up any msg for this task
+                        last_tasks_msgs.pop(task_id, None)
+                        continue
+
+                if msg and last_tasks_msgs.get(task_id, None) != msg:
+                    try:
+                        result = self._session.send_request(
+                            service='tasks',
+                            action='update',
+                            json={"task": task_id, "status_message": "K8S glue status: {}".format(msg)},
+                            method='get',
+                            async_enable=False,
+                        )
+                        if not result.ok:
+                            result_msg = self._get_path(result.json(), 'meta', 'result_msg')
+                            raise Exception(result_msg or result.text)
+
+                        # update last msg for this task
+                        last_tasks_msgs[task_id] = msg
+                    except Exception as ex:
+                        self.log.warning(
+                            'K8S Glue pods monitor: Failed setting status message for task "{}"\nEX: {}'.format(
+                                task_id, ex
+                            )
+                        )
+
+            # clean up any last message for a task that wasn't seen as a pod
+            last_tasks_msgs = {k: v for k, v in last_tasks_msgs.items() if k in task_ids}
+
             sleep(self._polling_interval)
 
     def _set_task_user_properties(self, task_id: str, **properties: str):
```
```diff
@@ -260,6 +324,44 @@ class K8sIntegration(Worker):
             if error.code == 404:
                 self._edit_hyperparams_support = self._session.api_version
 
+    def _get_agent_label(self):
+        if not self.worker_id:
+            print('WARNING! no worker ID found!!!')
+            return self.AGENT_LABEL
+
+        if not self._agent_label:
+            h = hashlib.md5()
+            h.update(str(self.worker_id).encode('utf-8'))
+            self._agent_label = '{}-{}'.format(self.AGENT_LABEL, h.hexdigest()[:8])
+
+        return self._agent_label
+
+    def _get_number_used_pods(self):
+        # noinspection PyBroadException
+        try:
+            kubectl_cmd_new = "kubectl get pods -l {agent_label} -n {namespace} -o json".format(
+                agent_label=self._get_agent_label(),
+                namespace=self.namespace,
+            )
+            process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            output, error = process.communicate()
+            output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
+            error = '' if not error else error if isinstance(error, str) else error.decode('utf-8')
+
+            if not output:
+                # No such pod exist so we can use the pod_number we found
+                return 0
+
+            try:
+                current_pod_count = len(json.loads(output).get("items", []))
+            except (ValueError, TypeError) as ex:
+                return -1
+
+            return current_pod_count
+        except Exception as ex:
+            print('Failed getting number of used pods: {}'.format(ex))
+            return -2
+
     def run_one_task(self, queue: Text, task_id: Text, worker_args=None, **_):
         print('Pulling task {} launching on kubernetes cluster'.format(task_id))
         task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
```
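`_get_agent_label` derives a per-agent pod label by hashing the worker ID, so several glue instances in one namespace each select only their own pods. The derivation is just an md5 prefix; a sketch reproducing the logic above (the base label and worker ID here are example values):

```python
import hashlib

AGENT_LABEL = "clearml-agent"  # example base label

def agent_label(worker_id):
    h = hashlib.md5()
    h.update(str(worker_id).encode("utf-8"))
    return "{}-{}".format(AGENT_LABEL, h.hexdigest()[:8])

label = agent_label("k8s-glue-worker-1")
print(label)  # e.g. clearml-agent-5f0e8d2a -- stable for a given worker_id
# used as a pod selector, e.g.: kubectl get pods -l <label> -n <namespace>
```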
```diff
@@ -267,9 +369,14 @@ class K8sIntegration(Worker):
         # push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
         try:
             print('Pushing task {} into temporary pending queue'.format(task_id))
-            self._session.api_client.tasks.reset(task_id)
-            self._session.api_client.tasks.enqueue(task_id, queue=self.k8s_pending_queue_name,
-                                                   status_reason='k8s pending scheduler')
+            res = self._session.api_client.tasks.stop(task_id, force=True)
+            res = self._session.api_client.tasks.enqueue(
+                task_id,
+                queue=self.k8s_pending_queue_name,
+                status_reason='k8s pending scheduler',
+            )
+            if res.meta.result_code != 200:
+                raise Exception(res.meta.result_msg)
         except Exception as e:
             self.log.error("ERROR: Could not push back task [{}] to k8s pending queue [{}], error: {}".format(
                 task_id, self.k8s_pending_queue_name, e))
@@ -281,6 +388,9 @@ class K8sIntegration(Worker):
             ENV_DOCKER_IMAGE.get() or self._session.config.get("agent.default_docker.image", "nvidia/cuda")
         )
         container['arguments'] = self._session.config.get("agent.default_docker.arguments", None)
+        set_task_container(
+            self._session, task_id, docker_image=container['image'], docker_arguments=container['arguments']
+        )
 
         # get the clearml.conf encoded file
         # noinspection PyProtectedMember
@@ -311,12 +421,12 @@ class K8sIntegration(Worker):
         if self.ports_mode:
             kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n {namespace}".format(
                 pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
-                agent_label=self.AGENT_LABEL,
+                agent_label=self._get_agent_label(),
                 namespace=self.namespace,
             )
         else:
             kubectl_cmd_new = "kubectl get pods -l {agent_label} -n {namespace} -o json".format(
-                agent_label=self.AGENT_LABEL,
+                agent_label=self._get_agent_label(),
                 namespace=self.namespace,
             )
         process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -338,7 +448,7 @@ class K8sIntegration(Worker):
                     output, task_id, queue, ex
                 )
             )
-            self._session.api_client.tasks.reset(task_id)
+            self._session.api_client.tasks.stop(task_id, force=True)
             self._session.api_client.tasks.enqueue(task_id, queue=queue, status_reason='kubectl parsing error')
             return
         max_count = self.max_pods_limit
@@ -356,7 +466,7 @@ class K8sIntegration(Worker):
                     task_id, queue
                 )
             )
-            self._session.api_client.tasks.reset(task_id)
+            self._session.api_client.tasks.stop(task_id, force=True)
             self._session.api_client.tasks.enqueue(
                 task_id, queue=queue, status_reason='k8s max pod limit (no free k8s service)')
             return
@@ -365,7 +475,8 @@ class K8sIntegration(Worker):
                 break
             pod_count += 1
 
-        labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + [self.AGENT_LABEL]
+        labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + \
+                 [self._get_agent_label()]
         labels.append("clearml-agent-queue={}".format(self._safe_k8s_label_value(queue)))
         labels.append("clearml-agent-queue-name={}".format(self._safe_k8s_label_value(queue_name)))
 
@@ -608,10 +719,26 @@ class K8sIntegration(Worker):
 
         _last_machine_update_ts = 0
         while True:
+            # check if have pod limit, then check if we hit it.
+            if self.max_pods_limit:
+                current_pods = self._get_number_used_pods()
+                if current_pods >= self.max_pods_limit:
+                    print("Maximum pod limit reached {}/{}, sleeping for {:.1f} seconds".format(
+                        current_pods, self.max_pods_limit, self._polling_interval))
+                    # delete old completed / failed pods
+                    get_bash_output(
+                        self.KUBECTL_DELETE_CMD.format(namespace=self.namespace, selector=self._get_agent_label())
+                    )
+                    # go to sleep
+                    sleep(self._polling_interval)
+                    continue
+
             # iterate over queues (priority style, queues[0] is highest)
             for queue in queues:
                 # delete old completed / failed pods
-                get_bash_output(self.KUBECTL_DELETE_CMD.format(namespace=self.namespace, selector=self.AGENT_LABEL))
+                get_bash_output(
+                    self.KUBECTL_DELETE_CMD.format(namespace=self.namespace, selector=self._get_agent_label())
+                )
 
                 # get next task in queue
                 try:
```
```diff
@@ -3,11 +3,13 @@ from __future__ import unicode_literals
 import abc
 from collections import OrderedDict
 from contextlib import contextmanager
-from typing import Text, Iterable, Union, Optional, Dict, List
-from pathlib2 import Path
 from hashlib import md5
+from typing import Text, Iterable, Union, Optional, Dict, List
 
 import six
+from pathlib2 import Path
 
+from clearml_agent.definitions import ENV_VENV_CACHE_PATH
 from clearml_agent.helper.base import mkstemp, safe_remove_file, join_lines, select_for_platform
 from clearml_agent.helper.console import ensure_binary
 from clearml_agent.helper.os.folder_cache import FolderCache
@@ -239,6 +241,9 @@ class PackageManager(object):
                 if p.strip(strip_chars) and not p.strip(strip_chars).startswith('#')])
             if not pip_reqs and not conda_reqs:
                 continue
+            # do not process "-r" or "--requirement" because we cannot know what we have in the git repo.
+            if any(r.strip().startswith('-r ') or r.strip().startswith('--requirement ') for r in pip_reqs):
+                continue
             hash_text = '{class_type}\n{docker_cmd}\n{cuda_ver}\n{python_version}\n{pip_reqs}\n{conda_reqs}'.format(
                 class_type=str(cls),
                 docker_cmd=str(docker_cmd or ''),
@@ -252,7 +257,7 @@ class PackageManager(object):
 
     def _get_cache_manager(self):
         if not self._cache_manager:
-            cache_folder = self.session.config.get(self._config_cache_folder, None)
+            cache_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
             if not cache_folder:
                 return None
```
```diff
@@ -51,31 +51,7 @@ class ExternalRequirements(SimpleSubstitution):
             except:
                 freeze_base = ''
 
-            req_line = req.tostr(markers=False)
-            if req_line.strip().startswith('-e ') or req_line.strip().startswith('--editable'):
-                req_line = re.sub(r'^(-e|--editable=?)\s*', '', req_line, count=1)
-
-            if req.req.vcs and req_line.startswith('git+'):
-                try:
-                    url_no_frag = furl(req_line)
-                    url_no_frag.set(fragment=None)
-                    # reverse replace
-                    fragment = req_line[::-1].replace(url_no_frag.url[::-1], '', 1)[::-1]
-                    vcs_url = req_line[4:]
-                    # reverse replace
-                    vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
-                    from ..repo import Git
-                    vcs = Git(session=session, url=vcs_url, location=None, revision=None)
-                    vcs._set_ssh_url()
-                    new_req_line = 'git+{}{}'.format(vcs.url_with_auth, fragment)
-                    if new_req_line != req_line:
-                        furl_line = furl(new_req_line)
-                        print('Replacing original pip vcs \'{}\' with \'{}\''.format(
-                            req_line,
-                            furl_line.set(password='xxxxxx').tostr() if furl_line.password else new_req_line))
-                        req_line = new_req_line
-                except Exception:
-                    print('WARNING: Failed parsing pip git install, using original line {}'.format(req_line))
+            req_line = self._add_vcs_credentials(req, session)
 
             # if we have older pip version we have to make sure we replace back the package name with the
             # git repository link. In new versions this is supported and we get "package @ git+https://..."
@@ -95,6 +71,43 @@ class ExternalRequirements(SimpleSubstitution):
             if not PackageManager.out_of_scope_install_package(req_line):
                 raise ValueError("Failed installing GIT/HTTPs package \'{}\'".format(req_line))
 
+    @staticmethod
+    def _add_vcs_credentials(req, session):
+        req_line = req.tostr(markers=False)
+        if req_line.strip().startswith('-e ') or req_line.strip().startswith('--editable'):
+            req_line = re.sub(r'^(-e|--editable=?)\s*', '', req_line, count=1)
+        if req.req.vcs and req_line.startswith('git+'):
+            try:
+                url_no_frag = furl(req_line)
+                url_no_frag.set(fragment=None)
+                # reverse replace
+                fragment = req_line[::-1].replace(url_no_frag.url[::-1], '', 1)[::-1]
+                vcs_url = req_line[4:]
+                # reverse replace
+                vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
+                # remove ssh:// or git:// prefix for git detection and credentials
+                scheme = ''
+                if vcs_url and (vcs_url.startswith('ssh://') or vcs_url.startswith('git://')):
+                    scheme = 'ssh://'  # notice git:// is actually ssh://
+                    vcs_url = vcs_url[6:]
+
+                from ..repo import Git
+                vcs = Git(session=session, url=vcs_url, location=None, revision=None)
+                vcs._set_ssh_url()
+                new_req_line = 'git+{}{}{}'.format(
+                    '' if scheme and '://' in vcs.url else scheme,
+                    vcs.url_with_auth, fragment
+                )
+                if new_req_line != req_line:
+                    furl_line = furl(new_req_line)
+                    print('Replacing original pip vcs \'{}\' with \'{}\''.format(
+                        req_line,
+                        furl_line.set(password='xxxxxx').tostr() if furl_line.password else new_req_line))
+                    req_line = new_req_line
+            except Exception:
+                print('WARNING: Failed parsing pip git install, using original line {}'.format(req_line))
+        return req_line
+
     def replace(self, req):
         """
         Replace a requirement
@@ -139,7 +152,8 @@ class ExternalRequirements(SimpleSubstitution):
         try:
             if not req.name and req.req and not req.req.editable and not req.req.vcs and \
                     req.req.line and req.req.line.strip().split('#')[0] and \
-                    not req.req.line.strip().split('#')[0].lower().endswith('.whl'):
+                    not req.req.line.strip().split('#')[0].lower().endswith('.whl') and \
+                    not (req.req.line.strip().startswith('-r ') or req.req.line.strip().startswith('--requirement ')):
                 return True
         except Exception:
             pass
@@ -151,7 +165,7 @@ class OnlyExternalRequirements(ExternalRequirements):
         super(OnlyExternalRequirements, self).__init__(*args, **kwargs)
 
     def match(self, req):
-        return not super(OnlyExternalRequirements, self).match(req)
+        return True
 
     def replace(self, req):
         """
@@ -160,4 +174,6 @@ class OnlyExternalRequirements(ExternalRequirements):
         """
         # Do not store the skipped requirements
         # mark skip package
+        if super(OnlyExternalRequirements, self).match(req):
+            return self._add_vcs_credentials(req, self._session)
         return Text('')
```
```diff
@@ -1,6 +1,7 @@
 import os
 import sys
 from itertools import chain
+from pathlib import Path
 from typing import Text, Optional
 
 from clearml_agent.definitions import PIP_EXTRA_INDICES, PROGRAM_NAME
@@ -19,7 +20,7 @@ class SystemPip(PackageManager):
         Program interface to the system pip.
         """
         super(SystemPip, self).__init__()
-        self._bin = interpreter or sys.executable
+        self._bin = Path(interpreter or sys.executable)
         self.session = session
 
     @property
```
```diff
@@ -469,16 +469,17 @@ class RequirementsManager(object):
 
     def replace(self, requirements):  # type: (Text) -> Text
         def safe_parse(req_str):
+            # noinspection PyBroadException
             try:
-                return next(parse(req_str, cwd=self._cwd))
+                return list(parse(req_str, cwd=self._cwd))
             except Exception as ex:
-                return Requirement(req_str)
+                return [Requirement(req_str)]
 
         parsed_requirements = tuple(
             map(
                 MarkerRequirement,
-                [safe_parse(line) for line in (requirements.splitlines()
-                                               if isinstance(requirements, six.text_type) else requirements)]
+                [r for line in (requirements.splitlines() if isinstance(requirements, six.text_type) else requirements)
+                 for r in safe_parse(line)]
             )
         )
         if not parsed_requirements:
```
```diff
@@ -274,6 +274,18 @@ class VCS(object):
             url = parsed_url.url
         return url
 
+    @classmethod
+    def rewrite_ssh_url(cls, url, port=None, username=None):
+        # type: (Text, Optional[int], Optional[str]) -> Text
+        """
+        Rewrite SSH URL with custom port and username
+        """
+        parsed_url = furl(url)
+        if parsed_url.scheme == "ssh":
+            parsed_url.username = username or "git"
+            parsed_url.port = port or None
+            return parsed_url.url
+
     def _set_ssh_url(self):
         """
         Replace instance URL with SSH substitution result and report to log.
@@ -297,6 +309,20 @@ class VCS(object):
                     self.url, new_url))
                 self.url = new_url
                 return
+        # rewrite ssh URLs only if either ssh port or ssh user are forced in config
+        if parsed_url.scheme == "ssh" and (
+            self.session.config.get('agent.force_git_ssh_port', None) or
+            self.session.config.get('agent.force_git_ssh_user', None)
+        ):
+            new_url = self.rewrite_ssh_url(
+                self.url,
+                port=self.session.config.get('agent.force_git_ssh_port', None),
+                username=self.session.config.get('agent.force_git_ssh_user', None)
+            )
+            if new_url != self.url:
+                print("Using SSH credentials - ssh url '{}' with ssh url '{}'".format(
+                    self.url, new_url))
+                self.url = new_url
 
         if not self.session.config.agent.translate_ssh:
             return
```
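`rewrite_ssh_url` only touches `ssh://` URLs, forcing the configured username (defaulting to `git`) and port. A usage sketch with `furl` (example URL; for non-ssh schemes this sketch simply returns the URL unchanged, a small simplification of the method above):

```python
from furl import furl

def rewrite_ssh_url(url, port=None, username=None):
    """Rewrite an SSH URL with a custom port and username."""
    parsed_url = furl(url)
    if parsed_url.scheme == "ssh":
        parsed_url.username = username or "git"
        parsed_url.port = port or None  # None removes an explicit port
    return parsed_url.url

print(rewrite_ssh_url("ssh://git@github.com/org/repo.git", port=2222))
# ssh://git@github.com:2222/org/repo.git
```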
```diff
@@ -2,6 +2,7 @@ from __future__ import unicode_literals, division
 
 import logging
 import os
+import shlex
 from collections import deque
 from itertools import starmap
 from threading import Thread, Event
@@ -12,6 +13,7 @@ import attr
 import psutil
 from pathlib2 import Path
 from clearml_agent.session import Session
+from clearml_agent.definitions import ENV_WORKER_TAGS
 
 try:
     from .gpu import gpustat
@@ -59,6 +61,7 @@ class ResourceMonitor(object):
         sample_frequency_per_sec=2.0,
         report_frequency_sec=30.0,
         first_report_sec=None,
+        worker_tags=None,
     ):
         self.session = session
         self.queue = deque(maxlen=1)
@@ -76,6 +79,9 @@ class ResourceMonitor(object):
         self._gpustat_fail = 0
         self._gpustat = gpustat
         self._active_gpus = None
+        if not worker_tags and ENV_WORKER_TAGS.get():
+            worker_tags = shlex.split(ENV_WORKER_TAGS.get())
+        self._worker_tags = worker_tags
         if os.environ.get('NVIDIA_VISIBLE_DEVICES') == 'none':
             # NVIDIA_VISIBLE_DEVICES set to none, marks cpu_only flag
             # active_gpus == False means no GPU reporting
@@ -118,6 +124,7 @@ class ResourceMonitor(object):
             machine_stats=stats,
             timestamp=(int(time()) * 1000),
             worker=self._worker_id,
+            tags=self._worker_tags,
             **self.get_report().to_dict()
         )
         log.debug("sending report: %s", report)
```
```diff
@@ -83,6 +83,12 @@ DAEMON_ARGS = dict({
         'type': int,
         'default': None,
     },
+    '--child-report-tags': {
+        'help': 'List of tags to send with the status reports from the worker that runs a task',
+        'nargs': '+',
+        'type': str,
+        'default': None,
+    },
     '--create-queue': {
         'help': 'Create requested queue if it does not exist already.',
         'action': 'store_true',
@@ -121,6 +127,10 @@ DAEMON_ARGS = dict({
         'help': 'Print the worker\'s schedule (uptime properties, server\'s runtime properties and listening queues)',
         'action': 'store_true',
     },
+    '--use-owner-token': {
+        'help': 'Generate and use task owner token for the execution of the task',
+        'action': 'store_true',
+    }
 }, **WORKER_ARGS)
 
 COMMANDS = {
```
```diff
@@ -1 +1 @@
-__version__ = '1.0.1rc1'
+__version__ = '1.1.0'
```
14
docker/k8s-glue/README.md
Normal file
14
docker/k8s-glue/README.md
Normal file
@@ -0,0 +1,14 @@
|
||||
This folder contains an example Docker image and templates for running the k8s glue as a pod in a k8s cluster.

Please note that ClearML credentials and server addresses should either be filled in the clearml.conf file before
building the glue docker or provided in the k8s-glue.yml template.

To run, you'll need to:
* Create a secret from pod_template.yml:
```bash
kubectl -n clearml create secret generic k8s-glue-pod-template --from-file=pod_template.yml
```
* Apply the k8s glue template:
```bash
kubectl -n clearml apply -f k8s-glue.yml
```
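
For the GCP flavor, the k8s-glue-gcp.yml template below additionally mounts a service-account key from a secret named `k8s-glue-service-acc-key` (see its `volumes` section). A minimal sketch of creating that secret, assuming your key file matches the template's default name `service-account-key.json`:
```bash
kubectl -n clearml create secret generic k8s-glue-service-acc-key --from-file=service-account-key.json
```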
402  docker/k8s-glue/build-resources/clearml.conf  Normal file
@@ -0,0 +1,402 @@
# CLEARML-AGENT configuration file
api {
    # Notice: 'host' is the api server (default port 8008), not the web server.
    api_server: ""
    web_server: ""
    files_server: ""
    # Override with os environment: CLEARML_API_ACCESS_KEY / CLEARML_API_SECRET_KEY
    credentials {"access_key": "", "secret_key": ""}
}

# Set GIT user/pass credentials
# leave blank for GIT SSH credentials
agent.git_user=""
agent.git_pass=""

# extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]
agent.package_manager.extra_index_url= [

]

agent {
    # unique name of this worker, if None, created based on hostname:process_id
    # Override with os environment: CLEARML_WORKER_ID
    # worker_id: "clearml-agent-machine1:gpu0"
    worker_id: ""

    # worker name, replaces the hostname when creating a unique name for this worker
    # Override with os environment: CLEARML_WORKER_NAME
    # worker_name: "clearml-agent-machine1"
    worker_name: ""

    # Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
    # leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
    # git_user: ""
    # git_pass: ""
    # git_host: ""

    # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
    force_git_ssh_protocol: false
    # Force a specific SSH port when converting http to ssh links (the domain is kept the same)
    # force_git_ssh_port: 0
    # Force a specific SSH username when converting http to ssh links (the default username is 'git')
    # force_git_ssh_user: git

    # Set the python version to use when creating the virtual environment and launching the experiment
    # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
    # The default is the python executing the clearml_agent
    python_binary: ""
    # ignore any requested python version (Default: False, if a Task was using a
    # specific python version and the system supports multiple python the agent will use the requested python version)
    # ignore_requested_python_version: true

    # select python package manager:
    # currently supported pip and conda
    # poetry is used if pip selected and repository contains poetry.lock file
    package_manager: {
        # supported options: pip, conda, poetry
        type: pip,

        # specify pip version to use (examples "<20", "==19.3.1", ""; an empty string will install the latest version)
        pip_version: "<20.2",

        # virtual environment inherits packages from system
        system_site_packages: false,

        # install with --upgrade
        force_upgrade: false,

        # additional artifact repositories to use when installing python packages
        # extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]

        # additional conda channels to use when installing with conda package manager
        conda_channels: ["pytorch", "conda-forge", "defaults", ]

        # If set to true, Task's "installed packages" are ignored,
        # and the repository's "requirements.txt" is used instead
        # force_repo_requirements_txt: false

        # set the priority packages to be installed before the rest of the required packages
        # priority_packages: ["cython", "numpy", "setuptools", ]

        # set the optional priority packages to be installed before the rest of the required packages.
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
        # priority_optional_packages: ["pygobject", ]

        # set the post packages to be installed after all the rest of the required packages
        # post_packages: ["horovod", ]

        # set the optional post packages to be installed after all the rest of the required packages.
        # In case a package installation fails, the package will be ignored,
        # and the virtual environment process will continue
        # post_optional_packages: []

        # set to True to support torch nightly build installation;
        # notice: torch nightly builds are ephemeral and are deleted from time to time
        torch_nightly: false,
    },

    # target folder for virtual environments builds, created when executing experiment
    venvs_dir = ~/.clearml/venvs-builds

    # cached virtual environment folder
    venvs_cache: {
        # maximum number of cached venvs
        max_entries: 10
        # minimum required free space to allow for cache entry, disable by passing 0 or negative value
        free_space_threshold_gb: 2.0
        # unmark to enable virtual environment caching
        # path: ~/.clearml/venvs-cache
    },

    # cached git clone folder
    vcs_cache: {
        enabled: true,
        path: ~/.clearml/vcs-cache
    },

    # use venv-update in order to accelerate python virtual environment building
    # Still in beta, turned off by default
    venv_update: {
        enabled: false,
    },

    # cached folder for specific python package download (used for pytorch package caching)
    pip_download_cache {
        enabled: true,
        path: ~/.clearml/pip-download-cache
    },

    translate_ssh: true,
    # reload configuration file every daemon execution
    reload_config: false,

    # pip cache folder mapped into docker, used for python package caching
    docker_pip_cache = ~/.clearml/pip-cache
    # apt cache folder mapped into docker, used for ubuntu package caching
    docker_apt_cache = ~/.clearml/apt-cache

    # optional arguments to pass to docker image
    # these are local for this agent and will not be updated in the experiment's docker_cmd section
    # extra_docker_arguments: ["--ipc=host", ]

    # optional shell script to run in docker when started before the experiment is started
    # extra_docker_shell_script: ["apt-get install -y bindfs", ]

    # Install the required packages for opencv libraries (libsm6 libxext6 libxrender-dev libglib2.0-0),
    # for backwards compatibility reasons, true as default,
    # change to false to skip installation and decrease docker spin up time
    # docker_install_opencv_libs: true

    # optional uptime configuration, make sure to use only one of 'uptime/downtime' and not both.
    # If uptime is specified, agent will actively poll (and execute) tasks in the time-spans defined here.
    # Outside of the specified time-spans, the agent will be idle.
    # Defined using a list of items of the format: "<hours> <days>".
    # hours - use values 0-23, single values would count as start hour and end at midnight.
    # days - use days in abbreviated format (SUN-SAT),
    # use '-' for ranges and ',' to separate singular values.
    # for example, to enable the workers every Sunday and Tuesday between 17:00-20:00 set uptime to:
    # uptime: ["17-20 SUN,TUE"]

    # optional downtime configuration, can be used only when uptime is not used.
    # If downtime is specified, agent will be idle in the time-spans defined here.
    # Outside of the specified time-spans, the agent will actively poll (and execute) tasks.
    # Use the same format as described above for uptime
    # downtime: []

    # set to true in order to force "docker pull" before running an experiment using a docker image.
    # This makes sure the docker image is updated.
    docker_force_pull: false

    default_docker: {
        # default docker image to use when running in docker mode
        image: "nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04"

        # optional arguments to pass to docker image
        # arguments: ["--ipc=host", ]
    }

    # set the OS environments based on the Task's Environment section before launching the Task process.
    enable_task_env: false

    # set the initial bash script to execute at the startup of any docker.
    # all lines will be executed regardless of their exit code.
    # {python_single_digit} is translated to 'python3' or 'python2' according to requested python version
    # docker_init_bash_script = [
    #     "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
    #     "chown -R root /root/.cache/pip",
    #     "apt-get update",
    #     "apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0",
    #     "(which {python_single_digit} && {python_single_digit} -m pip --version) || apt-get install -y {python_single_digit}-pip",
    # ]

    # set the preprocessing bash script to execute at the startup of any docker.
    # all lines will be executed regardless of their exit code.
    # docker_preprocess_bash_script = [
    #     "echo \"starting docker\"",
    # ]

    # If False replace \r with \n and display full console output
    # default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
    # suppress_carriage_return: true

    # cuda versions used for solving pytorch wheel packages
    # should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
    # cuda_version: 10.1
    # cudnn_version: 7.6

    # Hide docker environment variables containing secrets when printing out the docker command by replacing their
    # values with "********". Turning this feature on will hide the following environment variables values:
    # CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
    # To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
    # your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
    # docker command, set:
    # extra_keys: ["MY_SPECIAL_PASSWORD"]
    hide_docker_command_env_vars {
        enabled: true
        extra_keys: []
    }
}

sdk {
    # ClearML - default SDK configuration

    storage {
        cache {
            # Defaults to system temp folder / cache
            default_base_dir: "~/.clearml/cache"
            size {
                # max_used_bytes = -1
                min_free_bytes = 10GB
                # cleanup_margin_percent = 5%
            }
        }

        direct_access: [
            # Objects matching are considered to be available for direct access, i.e. they will not be downloaded
            # or cached, and any download request will return a direct reference.
            # Objects are specified in glob format, available for url and content_type.
            { url: "file://*" }  # file-urls are always directly referenced
        ]
    }

    metrics {
        # History size for debug files per metric/variant. For each metric/variant combination with an attached file
        # (e.g. debug image event), file names for the uploaded files will be recycled in such a way that no more than
        # X files are stored in the upload destination for each metric/variant combination.
        file_history_size: 100

        # Max history size for matplotlib imshow files per plot title.
        # File names for the uploaded images will be recycled in such a way that no more than
        # X images are stored in the upload destination for each matplotlib plot title.
        matplotlib_untitled_history_size: 100

        # Limit the number of digits after the dot in plot reporting (reducing plot report size)
        # plot_max_num_digits: 5

        # Settings for generated debug images
        images {
            format: JPEG
            quality: 87
            subsampling: 0
        }

        # Support plot-per-graph fully matching Tensorboard behavior (i.e. if this is set to true, each series should have its own graph)
        tensorboard_single_series_per_graph: false
    }

    network {
        metrics {
            # Number of threads allocated to uploading files (typically debug images) when transmitting metrics for
            # a specific iteration
            file_upload_threads: 4

            # Warn about upload starvation if no uploads were made in specified period while file-bearing events keep
            # being sent for upload
            file_upload_starvation_warning_sec: 120
        }

        iteration {
            # Max number of retries when getting frames if the server returned an error (http code 500)
            max_retries_on_server_error: 5
            # Backoff factor for consecutive retry attempts.
            # SDK will wait for {backoff factor} * (2 ^ ({number of total retries} - 1)) between retries.
            retry_backoff_factor_sec: 10
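            # e.g. with the default factor of 10, consecutive retries wait 10s, 20s, 40s, 80s and 160s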
        }
    }
    aws {
        s3 {
            # S3 credentials, used for read/write access by various SDK elements

            # default, used for any bucket not specified below
            key: ""
            secret: ""
            region: ""

            credentials: [
                # specifies key/secret credentials to use when handling s3 urls (read or write)
                # {
                #     bucket: "my-bucket-name"
                #     key: "my-access-key"
                #     secret: "my-secret-key"
                # },
                # {
                #     # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
                #     host: "my-minio-host:9000"
                #     key: "12345678"
                #     secret: "12345678"
                #     multipart: false
                #     secure: false
                # }
            ]
        }
        boto3 {
            pool_connections: 512
            max_multipart_concurrency: 16
        }
    }
    google.storage {
        # # Default project and credentials file
        # # Will be used when no bucket configuration is found
        # project: "clearml"
        # credentials_json: "/path/to/credentials.json"

        # # Specific credentials per bucket and sub directory
        # credentials = [
        #     {
        #         bucket: "my-bucket"
        #         subdir: "path/in/bucket"  # Not required
        #         project: "clearml"
        #         credentials_json: "/path/to/credentials.json"
        #     },
        # ]
    }
    azure.storage {
        # containers: [
        #     {
        #         account_name: "clearml"
        #         account_key: "secret"
        #         # container_name:
        #     }
        # ]
    }

    log {
        # debugging feature: set this to true to make null log propagate messages to root logger (so they appear in stdout)
        null_log_propagate: false
        task_log_buffer_capacity: 66

        # disable urllib info and lower levels
        disable_urllib3_info: true
    }

    development {
        # Development-mode options

        # dev task reuse window
        task_reuse_time_window_in_hours: 72.0

        # Run VCS repository detection asynchronously
        vcs_repo_detect_async: true

        # Store uncommitted git/hg source code diff in experiment manifest when training in development mode
        # This stores "git diff" or "hg diff" into the experiment's "script.requirements.diff" section
        store_uncommitted_code_diff: true

        # Support stopping an experiment in case it was externally stopped, status was changed or task was reset
        support_stopping: true

        # Default Task output_uri. If output_uri is not provided to Task.init, default_output_uri will be used instead.
        default_output_uri: ""

        # Default auto-generated requirements optimize for smaller requirements.
        # If True, analyze the entire repository regardless of the entry point.
        # If False, first analyze the entry point script; if it does not reference other local files,
        # do not analyze the entire repository.
        force_analyze_entire_repo: false

        # If set to true, *clearml* update message will not be printed to the console
        # this value can be overwritten with os environment variable CLEARML_SUPPRESS_UPDATE_MESSAGE=1
        suppress_update_message: false

        # If this flag is true (default is false), instead of analyzing the code with Pigar, analyze with `pip freeze`
        detect_with_pip_freeze: false

        # Development mode worker
        worker {
            # Status report period in seconds
            report_period_sec: 2

            # ping to the server - check connectivity
            ping_period_sec: 30

            # Log all stdout & stderr
            log_stdout: true

            # compatibility feature, report memory usage for the entire machine
            # default (false), report only on the running process and its sub-processes
            report_global_mem_used: false
        }
    }
}
36  docker/k8s-glue/build-resources/entrypoint.sh  Normal file
@@ -0,0 +1,36 @@
#!/bin/bash -x

export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}

if [ -z "$CLEARML_FILES_HOST" ]; then
    CLEARML_HOST_IP=${CLEARML_HOST_IP:-${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}}
fi

export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
export CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
export CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}
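
# Illustration (hypothetical address): with only CLEARML_HOST_IP=10.0.0.1 set, the exports
# above resolve to http://10.0.0.1:8081, http://10.0.0.1:8080 and http://10.0.0.1:8008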
echo $CLEARML_FILES_HOST $CLEARML_WEB_HOST $CLEARML_API_HOST 1>&2

if [ -z "$CLEARML_AGENT_NO_UPDATE" ]; then
    if [ -n "$CLEARML_AGENT_UPDATE_REPO" ]; then
        python3 -m pip install -q -U "$CLEARML_AGENT_UPDATE_REPO"
    else
        python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
    fi
fi

QUEUE=${K8S_GLUE_QUEUE:-k8s_glue}
MAX_PODS=${K8S_GLUE_MAX_PODS:-2}
EXTRA_ARGS=${K8S_GLUE_EXTRA_ARGS:-}

# shellcheck disable=SC2129
echo "api.credentials.access_key: ${CLEARML_API_ACCESS_KEY}" >> ~/clearml.conf
echo "api.credentials.secret_key: ${CLEARML_API_SECRET_KEY}" >> ~/clearml.conf
echo "api.api_server: ${CLEARML_API_HOST}" >> ~/clearml.conf
echo "api.web_server: ${CLEARML_WEB_HOST}" >> ~/clearml.conf
echo "api.files_server: ${CLEARML_FILES_HOST}" >> ~/clearml.conf

./provider_entrypoint.sh

python3 k8s_glue_example.py --queue ${QUEUE} --max-pods ${MAX_PODS} ${EXTRA_ARGS}
94  docker/k8s-glue/build-resources/k8s_glue_example.py  Normal file
@@ -0,0 +1,94 @@
"""
|
||||
This example assumes you have preconfigured services with selectors in the form of
|
||||
"ai.allegro.agent.serial=pod-<number>" and a targetPort of 10022.
|
||||
The K8sIntegration component will label each pod accordingly.
|
||||
"""
|
||||
from argparse import ArgumentParser
|
||||
|
||||
from clearml_agent.glue.k8s import K8sIntegration
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = ArgumentParser()
|
||||
group = parser.add_mutually_exclusive_group()
|
||||
|
||||
parser.add_argument(
|
||||
"--queue", type=str, help="Queue to pull tasks from"
|
||||
)
|
||||
group.add_argument(
|
||||
"--ports-mode", action='store_true', default=False,
|
||||
help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports"
|
||||
"Should not be used with max-pods"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num-of-services", type=int, default=20,
|
||||
help="Specify the number of k8s services to be used. Use only with ports-mode."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-port", type=int,
|
||||
help="Used in conjunction with ports-mode, specifies the base port exposed by the services. "
|
||||
"For pod #X, the port will be <base-port>+X. Note that pod number is calculated based on base-pod-num"
|
||||
"e.g. if base-port=20000 and base-pod-num=3, the port for the first pod will be 20003"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--base-pod-num", type=int, default=1,
|
||||
help="Used in conjunction with ports-mode and base-port, specifies the base pod number to be used by the "
|
||||
"service (default: %(default)s)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gateway-address", type=str, default=None,
|
||||
help="Used in conjunction with ports-mode, specify the external address of the k8s ingress / ELB"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pod-clearml-conf", type=str,
|
||||
help="Configuration file to be used by the pod itself (if not provided, current configuration is used)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overrides-yaml", type=str,
|
||||
help="YAML file containing pod overrides to be used when launching a new pod"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--template-yaml", type=str,
|
||||
help="YAML file containing pod template. If provided pod will be scheduled with kubectl apply "
|
||||
"and overrides are ignored, otherwise it will be scheduled with kubectl run"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ssh-server-port", type=int, default=0,
|
||||
help="If non-zero, every pod will also start an SSH server on the selected port (default: zero, not active)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--namespace", type=str,
|
||||
help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
|
||||
)
|
||||
group.add_argument(
|
||||
"--max-pods", type=int,
|
||||
help="Limit the maximum number of pods that this service can run at the same time."
|
||||
"Should not be used with ports-mode"
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
user_props_cb = None
|
||||
if args.ports_mode and args.base_port:
|
||||
def k8s_user_props_cb(pod_number=0):
|
||||
user_prop = {"k8s-pod-port": args.base_port + pod_number}
|
||||
if args.gateway_address:
|
||||
user_prop["k8s-gateway-address"] = args.gateway_address
|
||||
return user_prop
|
||||
user_props_cb = k8s_user_props_cb
|
||||
|
||||
k8s = K8sIntegration(
|
||||
ports_mode=args.ports_mode, num_of_services=args.num_of_services, base_pod_num=args.base_pod_num,
|
||||
user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
|
||||
template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
|
||||
ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
|
||||
namespace=args.namespace, max_pods_limit=args.max_pods or None,
|
||||
)
|
||||
k8s.k8s_daemon(args.queue)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
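
A sketch of a ports-mode invocation of this example; all flag values are illustrative:

```bash
python3 k8s_glue_example.py --queue k8s_glue --ports-mode --num-of-services 10 \
    --base-port 20000 --base-pod-num 3 --gateway-address my-ingress.example.com
```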
14  docker/k8s-glue/build-resources/setup.sh  Normal file
@@ -0,0 +1,14 @@
#!/bin/bash

chmod +x /root/entrypoint.sh

apt-get update -y
apt-get dist-upgrade -y
apt-get install -y curl unzip less locales

locale-gen en_US.UTF-8

apt-get install -y curl python3-pip git
python3 -m pip install -U pip
python3 -m pip install clearml-agent
python3 -m pip install -U "cryptography>=2.9"
22  docker/k8s-glue/glue-build-aws/Dockerfile  Normal file
@@ -0,0 +1,22 @@
FROM ubuntu:18.04

USER root
WORKDIR /root

ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8
ENV PYTHONIOENCODING=UTF-8

COPY ../build-resources/setup.sh /root/setup.sh
RUN /root/setup.sh

COPY ./setup_aws.sh /root/setup_aws.sh
RUN /root/setup_aws.sh

COPY ../build-resources/entrypoint.sh /root/entrypoint.sh
COPY ./provider_entrypoint.sh /root/provider_entrypoint.sh
COPY ./build-resources/k8s_glue_example.py /root/k8s_glue_example.py
COPY ./clearml.conf /root/clearml.conf

ENTRYPOINT ["/root/entrypoint.sh"]
4  docker/k8s-glue/glue-build-aws/provider_entrypoint.sh  Normal file
@@ -0,0 +1,4 @@
#!/bin/bash -x

source /root/.bashrc
export PATH=$PATH:$HOME/bin
14  docker/k8s-glue/glue-build-aws/setup_aws.sh  Normal file
@@ -0,0 +1,14 @@
#!/bin/bash

curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
unzip awscliv2.zip
./aws/install

curl -o kubectl https://amazon-eks.s3-us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/kubectl
#curl -o kubectl https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/kubectl
chmod +x ./kubectl && mkdir -p $HOME/bin && cp ./kubectl $HOME/bin/kubectl && export PATH=$PATH:$HOME/bin

curl -o aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.21.2/2021-07-05/bin/linux/amd64/aws-iam-authenticator
#curl -o aws-iam-authenticator https://amazon-eks.s3.us-west-2.amazonaws.com/1.19.6/2021-01-05/bin/linux/amd64/aws-iam-authenticator
chmod +x ./aws-iam-authenticator && mkdir -p $HOME/bin && cp ./aws-iam-authenticator $HOME/bin/aws-iam-authenticator && export PATH=$PATH:$HOME/bin
echo 'export PATH=$PATH:$HOME/bin' >> ~/.bashrc
22  docker/k8s-glue/glue-build-gcp/Dockerfile  Normal file
@@ -0,0 +1,22 @@
FROM ubuntu:18.04

USER root
WORKDIR /root

ENV LC_ALL=en_US.UTF-8
ENV LANG=en_US.UTF-8
ENV LANGUAGE=en_US.UTF-8
ENV PYTHONIOENCODING=UTF-8

COPY ../build-resources/setup.sh /root/setup.sh
RUN /root/setup.sh

COPY ./setup_gcp.sh /root/setup_gcp.sh
RUN /root/setup_gcp.sh

COPY ../build-resources/entrypoint.sh /root/entrypoint.sh
COPY ./provider_entrypoint.sh /root/provider_entrypoint.sh
COPY ./build-resources/k8s_glue_example.py /root/k8s_glue_example.py
COPY ./clearml.conf /root/clearml.conf

ENTRYPOINT ["/root/entrypoint.sh"]
4  docker/k8s-glue/glue-build-gcp/provider_entrypoint.sh  Normal file
@@ -0,0 +1,4 @@
#!/bin/bash -x

gcloud auth activate-service-account ${CLEARML_SERVICE_ACC} --key-file=/root/keys/${SERVICE_ACC_KEY_JSON}
gcloud container clusters get-credentials ${CLUSTER_CRED}
14  docker/k8s-glue/glue-build-gcp/setup_gcp.sh  Normal file
@@ -0,0 +1,14 @@
#!/bin/bash

curl -LO https://dl.k8s.io/release/v1.21.0/bin/linux/amd64/kubectl

install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl

sudo apt-get install -y apt-transport-https ca-certificates gnupg

echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list

curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -

apt-get update -y
apt-get install -y google-cloud-sdk
47  docker/k8s-glue/k8s-glue-aws.yml  Normal file
@@ -0,0 +1,47 @@
apiVersion: v1
kind: Pod
metadata:
  name: k8s-glue
spec:
  serviceAccountName: ""
  containers:
  - name: k8s-glue-container
    image: allegroai/clearml-agent-k8s:aws-latest-1.21
    imagePullPolicy: Always
    command: [
      "/bin/bash",
      "-c",
      "source /root/.bashrc && /root/entrypoint.sh"
    ]
    volumeMounts:
    - name: pod-template
      mountPath: /root/template
    env:
    - name: CLEARML_API_HOST
      value: ""
    - name: CLEARML_WEB_HOST
      value: ""
    - name: CLEARML_FILES_HOST
      value: ""
    # - name: K8S_GLUE_MAX_PODS
    #   value: "2"
    - name: K8S_GLUE_QUEUE
      value: "k8s-glue"
    - name: K8S_GLUE_EXTRA_ARGS
      value: "--template-yaml /root/template/pod_template.yml"
    - name: CLEARML_API_ACCESS_KEY
      value: ""
    - name: CLEARML_API_SECRET_KEY
      value: ""
    - name: CLEARML_WORKER_ID
      value: "k8s-glue-agent"
    - name: CLEARML_AGENT_UPDATE_REPO
      value: ""
    - name: FORCE_CLEARML_AGENT_REPO
      value: ""
    - name: CLEARML_DOCKER_IMAGE
      value: "ubuntu:18.04"
  volumes:
  - name: pod-template
    secret:
      secretName: k8s-glue-pod-template
58  docker/k8s-glue/k8s-glue-gcp.yml  Normal file
@@ -0,0 +1,58 @@
apiVersion: v1
kind: Pod
metadata:
  name: k8s-glue
spec:
  serviceAccountName: ""
  containers:
  - name: k8s-glue-container
    image: allegroai/clearml-agent-k8s:gcp-latest-1.21
    imagePullPolicy: Always
    command: [
      "/bin/bash",
      "-c",
      "source /root/.bashrc && /root/entrypoint.sh"
    ]
    volumeMounts:
    - name: pod-template
      mountPath: /root/template
    - name: service-acc-key
      mountPath: /root/keys
    env:
    - name: CLEARML_API_HOST
      value: ""
    - name: CLEARML_WEB_HOST
      value: ""
    - name: CLEARML_FILES_HOST
      value: ""
    # - name: K8S_GLUE_MAX_PODS
    #   value: "2"
    - name: K8S_GLUE_QUEUE
      value: "k8s-glue"
    - name: K8S_GLUE_EXTRA_ARGS
      value: "--template-yaml /root/template/pod_template.yml"
    - name: CLEARML_API_ACCESS_KEY
      value: ""
    - name: CLEARML_API_SECRET_KEY
      value: ""
    - name: CLEARML_WORKER_ID
      value: "k8s-glue-agent"
    - name: CLEARML_AGENT_UPDATE_REPO
      value: ""
    - name: FORCE_CLEARML_AGENT_REPO
      value: ""
    - name: CLEARML_DOCKER_IMAGE
      value: "ubuntu:18.04"
    - name: CLEARML_SERVICE_ACC
      value: ""
    - name: SERVICE_ACC_KEY_JSON
      value: service-account-key.json
    - name: CLUSTER_CRED
      value: ""
  volumes:
  - name: pod-template
    secret:
      secretName: k8s-glue-pod-template
  - name: service-acc-key
    secret:
      secretName: k8s-glue-service-acc-key
13  docker/k8s-glue/pod_template.yml  Normal file
@@ -0,0 +1,13 @@
apiVersion: v1
metadata:
  namespace: clearml
spec:
  containers:
  - resources:
      limits:
        cpu: 1000m
        memory: 4G
      requests:
        cpu: 1000m
        memory: 4G
  restartPolicy: Never
7  docker/k8s-glue/task-pod-build/Dockerfile  Normal file
@@ -0,0 +1,7 @@
FROM ubuntu:18.04

USER root
WORKDIR /root
COPY ./setup.sh /root/setup.sh

RUN /root/setup.sh
10  docker/k8s-glue/task-pod-build/setup.sh  Normal file
@@ -0,0 +1,10 @@
echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean
chown -R root /root/.cache/pip

apt-get update -y
apt-get dist-upgrade -y
apt-get install -y git libsm6 libxext6 libxrender-dev libglib2.0-0 curl python3-pip

python3 -m pip install -U pip
python3 -m pip install clearml-agent
python3 -m pip install -U "cryptography>=2.9"
@@ -47,8 +47,13 @@ agent {
     # ignore_requested_python_version: true
 
     # select python package manager:
-    # currently supported pip and conda
-    # poetry is used if pip selected and repository contains poetry.lock file
+    # currently supported: pip, conda and poetry
+    # if "pip" or "conda" are used, the agent installs the required packages
+    # based on the "installed packages" section of the Task. If the "installed packages" is empty,
+    # it will revert to using `requirements.txt` from the repository's root directory.
+    # If Poetry is selected and the root repository contains `poetry.lock` or `pyproject.toml`,
+    # the "installed packages" section is ignored, and poetry is used.
+    # If Poetry is selected and no lock file is found, it reverts to "pip" package manager behaviour.
     package_manager: {
         # supported options: pip, conda, poetry
         type: pip,
@@ -163,6 +168,37 @@ agent {
     # it should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
     # cuda_version: 10.1
     # cudnn_version: 7.6
+
+    # Hide docker environment variables containing secrets when printing out the docker command by replacing their
+    # values with "********". Turning this feature on will hide the following environment variables values:
+    # CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
+    # To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
+    # your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
+    # docker command, set:
+    # extra_keys: ["MY_SPECIAL_PASSWORD"]
+    hide_docker_command_env_vars {
+        enabled: true
+        extra_keys: []
+    }
+
+    # allow to set internal mount points inside the docker,
+    # especially useful for non-root docker container images.
+    # docker_internal_mounts {
+    #     sdk_cache: "/clearml_agent_cache"
+    #     apt_cache: "/var/cache/apt/archives"
+    #     ssh_folder: "/root/.ssh"
+    #     pip_cache: "/root/.cache/pip"
+    #     poetry_cache: "/root/.cache/pypoetry"
+    #     vcs_cache: "/root/.clearml/vcs-cache"
+    #     venv_build: "/root/.clearml/venvs-builds"
+    #     pip_download: "/root/.clearml/pip-download-cache"
+    # }
+
+    # Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
+    # Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
+    # Note: resulting name must start with an alpha-numeric character and
+    # continue with alpha-numeric characters, underscores (_), dots (.) and/or dashes (-)
+    # docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"
 }
 
 sdk {
BIN  docs/clearml_architecture.png  Normal file
Binary file not shown. (After: Size: 123 KiB)

Binary file not shown. (Before: Size: 2.0 MiB | After: Size: 1.1 MiB)