Compare commits

..

13 Commits

14 changed files with 131 additions and 38 deletions

View File

@@ -137,6 +137,12 @@
},
translate_ssh: true,
# set "disable_ssh_mount: true" to disable the automatic mount of ~/.ssh folder into the docker containers
# default is false, automatically mounts ~/.ssh
# Must be set to true if using "clearml-session" with this agent!
# disable_ssh_mount: false
# reload configuration file every daemon execution
reload_config: false,

View File

@@ -28,6 +28,9 @@
pool_maxsize: 512
pool_connections: 512
# Override the default http method, use "put" if working behind GCP load balancer (default: "get")
# default_method: "get"
}
auth {

View File

@@ -8,13 +8,14 @@ from .datamodel import DataModel
from .defs import ENV_API_DEFAULT_REQ_METHOD
if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST"):
# Validate the environment override for the default HTTP method.
# The error message lists every verb accepted by the membership test below; keep the two in sync.
if ENV_API_DEFAULT_REQ_METHOD.get().upper() not in ("GET", "POST", "PUT"):
    raise ValueError(
        "CLEARML_API_DEFAULT_REQ_METHOD environment variable must be 'get', 'post' or 'put' "
        "(any case is allowed)."
    )
class Request(ApiModel):
def_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
_method = ENV_API_DEFAULT_REQ_METHOD.get(default="get")
def __init__(self, **kwargs):

View File

@@ -14,8 +14,9 @@ from requests.auth import HTTPBasicAuth
from six.moves.urllib.parse import urlparse, urlunparse
from .callresult import CallResult
from .defs import ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN, \
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD
from .defs import (
ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN,
ENV_NO_DEFAULT_SERVER, ENV_DISABLE_VAULT_SUPPORT, ENV_INITIAL_CONNECT_RETRY_OVERRIDE, ENV_API_DEFAULT_REQ_METHOD, )
from .request import Request, BatchRequest
from .token_manager import TokenManager
from ..config import load
@@ -110,6 +111,19 @@ class Session(TokenManager):
self._logger = logger
self.__auth_token = None
if ENV_API_DEFAULT_REQ_METHOD.get(default=None):
    # Make sure we update the config object, so we pass it into the new containers when we map them
    self.config["api.http.default_method"] = ENV_API_DEFAULT_REQ_METHOD.get()
    # note: the Request.def_method default was already set from the OS environment
elif self.config.get("api.http.default_method", None):
    # No environment override: fall back to the configuration file value
    def_method = str(self.config.get("api.http.default_method", None)).strip()
    if def_method.upper() not in ("GET", "POST", "PUT"):
        # the message lists every verb accepted by the check above
        raise ValueError(
            "api.http.default_method variable must be 'get', 'post' or 'put' (any case is allowed)."
        )
    # NOTE(review): indentation is not preserved in this view; both assignments are assumed
    # to belong to the configuration-file branch - confirm against the original file
    Request.def_method = def_method
    Request._method = Request.def_method
if ENV_AUTH_TOKEN.get(
value_cb=lambda key, value: print("Using environment access token {}=********".format(key))
):
@@ -251,7 +265,7 @@ class Session(TokenManager):
service,
action,
version=None,
method="get",
method=Request.def_method,
headers=None,
auth=None,
data=None,
@@ -328,7 +342,7 @@ class Session(TokenManager):
service,
action,
version=None,
method="get",
method=Request.def_method,
headers=None,
data=None,
json=None,
@@ -371,7 +385,7 @@ class Session(TokenManager):
headers=None,
data=None,
json=None,
method="get",
method=Request.def_method,
):
"""
Send a raw batch API request. Batch requests always use application/json-lines content type.
@@ -615,7 +629,7 @@ class Session(TokenManager):
try:
data = {"expiration_sec": exp} if exp else {}
res = self._send_request(
method=ENV_API_DEFAULT_REQ_METHOD.get(default="get"),
method=Request.def_method,
service="auth",
action="login",
auth=auth,

View File

@@ -1,6 +1,8 @@
import json
import re
import shlex
from clearml_agent.backend_api.session import Request
from clearml_agent.helper.package.requirements import (
RequirementsManager, MarkerRequirement,
compare_version_rules, )
@@ -26,7 +28,7 @@ def resolve_default_container(session, task_id, container_config):
'script.repository', 'script.branch',
'project', 'container'],
'search_hidden': True},
method='get',
method=Request.def_method,
async_enable=False,
)
try:
@@ -53,7 +55,7 @@ def resolve_default_container(session, task_id, container_config):
'id': [task_info.get('project')],
'only_fields': ['name'],
},
method='get',
method=Request.def_method,
async_enable=False,
)
try:

View File

@@ -38,7 +38,7 @@ from clearml_agent.backend_api.services import auth as auth_api
from clearml_agent.backend_api.services import queues as queues_api
from clearml_agent.backend_api.services import tasks as tasks_api
from clearml_agent.backend_api.services import workers as workers_api
from clearml_agent.backend_api.session import CallResult
from clearml_agent.backend_api.session import CallResult, Request
from clearml_agent.backend_api.session.defs import (
ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
@@ -71,6 +71,8 @@ from clearml_agent.definitions import (
ENV_AGENT_SKIP_PYTHON_ENV_INSTALL,
WORKING_STANDALONE_DIR,
ENV_DEBUG_INFO,
ENV_CHILD_AGENTS_COUNT_CMD,
ENV_DOCKER_ARGS_FILTERS,
)
from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
from clearml_agent.errors import (
@@ -272,7 +274,7 @@ def get_task(session, task_id, **kwargs):
action='get_all',
version='2.14',
json={"id": [task_id], "search_hidden": True, **kwargs},
method='get',
method=Request.def_method,
async_enable=False,
)
result = CallResult.from_result(
@@ -304,7 +306,7 @@ def get_next_task(session, queue, get_task_info=False):
action='get_next_task',
version='2.14',
json=request,
method='get',
method=Request.def_method,
async_enable=False,
)
if not result.ok:
@@ -325,7 +327,7 @@ def get_task_container(session, task_id):
action='get_all',
version='2.14',
json={'id': [task_id], 'only_fields': ['container'], 'search_hidden': True},
method='get',
method=Request.def_method,
async_enable=False,
)
try:
@@ -366,7 +368,7 @@ def set_task_container(session, task_id, docker_image=None, docker_arguments=Non
action='edit',
version='2.13',
json={'task': task_id, 'container': container, 'force': True},
method='get',
method=Request.def_method,
async_enable=False,
)
return result.ok
@@ -685,6 +687,16 @@ class Worker(ServiceCommandSection):
# str - not supported, version string indicates last server version
self._runtime_props_support = None
# allow docker sanitization, needs backend support
if ENV_DOCKER_ARGS_FILTERS.get():
self._docker_args_filters = \
[re.compile(f) for f in shlex.split(ENV_DOCKER_ARGS_FILTERS.get())]
elif self._session.config.get('agent.docker_args_filters', None):
self._docker_args_filters = \
[re.compile(f) for f in self._session.config.get('agent.docker_args_filters', [])]
else:
self._docker_args_filters = []
@classmethod
def _verify_command_states(cls, kwargs):
"""
@@ -3268,6 +3280,11 @@ class Worker(ServiceCommandSection):
first_time=first_time,
)
# print message so users know they can enable cache
if not self.package_api.is_cached_enabled():
print('::: Python virtual environment cache is disabled. '
'To accelerate spin-up time set `agent.venvs_cache.path=~/.clearml/venvs-cache` :::\n')
# check if we have a cached folder
if cached_requirements and not skip_pip_venv_install and self.package_api.get_cached_venv(
requirements=cached_requirements,
@@ -3438,7 +3455,7 @@ class Worker(ServiceCommandSection):
'-v', '{}:{}'.format(ENV_SSH_AUTH_SOCK.get(), ENV_SSH_AUTH_SOCK.get()),
'-e', ssh_auth_sock_env,
]
elif ENV_AGENT_DISABLE_SSH_MOUNT.get():
elif ENV_AGENT_DISABLE_SSH_MOUNT.get() or self._session.config.get("agent.disable_ssh_mount", None):
self._host_ssh_cache = None
else:
self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
@@ -3589,15 +3606,11 @@ class Worker(ServiceCommandSection):
def _get_child_agents_count_for_worker(self):
"""Get the amount of running child agents. In case of any error return 0"""
parent_worker_label = self._parent_worker_label.format(self.worker_id)
cmd = [
'docker',
'ps',
'--filter',
'label={}'.format(parent_worker_label),
'--format',
# get some fields for debugging
'{"ID":"{{ .ID }}", "Image": "{{ .Image }}", "Names":"{{ .Names }}", "Labels":"{{ .Labels }}"}'
]
default_cmd = 'docker ps --filter label={parent_worker_label} --format {{{{.ID}}}}'
child_agents_cmd = ENV_CHILD_AGENTS_COUNT_CMD.get() or default_cmd
cmd = shlex.split(child_agents_cmd.format(parent_worker_label=parent_worker_label))
try:
output = Argv(*cmd).get_output(
stderr=subprocess.STDOUT
@@ -3608,6 +3621,31 @@ class Worker(ServiceCommandSection):
return len(output.splitlines()) if output else 0
def _filter_docker_args(self, docker_args):
    # type: (List[str]) -> List[str]
    """
    Filter docker args matching specific flags.
    Supports list of Regular expressions, e.g self._docker_args_filters = ["^--env$", "^-e$"]

    Only flags matching at least one filter are kept. A kept flag that carries no inline
    value (no "=" in it) also keeps the following argument, unless that argument starts
    with "-" (i.e. looks like another flag). Everything else is dropped.

    :argument docker_args: List of docker argument strings (flags and values)
    :return: The filtered list of (stripped) arguments, or `docker_args` itself, unchanged,
        when it is empty or when no filters are configured
    """
    # if no filtering, do nothing
    if not docker_args or not self._docker_args_filters:
        return docker_args

    results = []
    total = len(docker_args)
    position = 0
    # walk by index instead of repeatedly popping the list head (which is O(n) per pop)
    while position < total:
        flag = docker_args[position].strip()
        position += 1
        if not any(f.match(flag) for f in self._docker_args_filters):
            continue
        results.append(flag)
        # a matched flag without an inline "=value" takes the next argument as its value,
        # as long as there is a next argument and it is not a flag itself
        if "=" not in flag and position < total and not docker_args[position].startswith("-"):
            results.append(docker_args[position].strip())
            position += 1
    return results
def _get_docker_cmd(
self,
worker_id, parent_worker_id,
@@ -3661,6 +3699,7 @@ class Worker(ServiceCommandSection):
if docker_arguments:
docker_arguments = list(docker_arguments) \
if isinstance(docker_arguments, (list, tuple)) else [docker_arguments]
docker_arguments = self._filter_docker_args(docker_arguments)
base_cmd += [a for a in docker_arguments if a]
if extra_docker_arguments:

View File

@@ -149,6 +149,8 @@ ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEAR
ENV_VENV_CACHE_PATH = EnvironmentConfig('CLEARML_AGENT_VENV_CACHE_PATH')
ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig('CLEARML_AGENT_EXTRA_DOCKER_ARGS', type=list)
ENV_DEBUG_INFO = EnvironmentConfig('CLEARML_AGENT_DEBUG_INFO')
ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig('CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD')
ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig('CLEARML_AGENT_DOCKER_ARGS_FILTERS')
ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig('CLEARML_AGENT_CUSTOM_BUILD_SCRIPT')
"""

View File

@@ -27,8 +27,8 @@ from clearml_agent.helper.dicts import merge_dicts
from clearml_agent.helper.process import get_bash_output
from clearml_agent.helper.resource_monitor import ResourceMonitor
from clearml_agent.interface.base import ObjectID
from .definitions import ENV_START_AGENT_SCRIPT_PATH
from clearml_agent.backend_api.session import Request
from clearml_agent.glue.definitions import ENV_START_AGENT_SCRIPT_PATH
class K8sIntegration(Worker):
@@ -299,7 +299,7 @@ class K8sIntegration(Worker):
service='tasks',
action='update',
json={"task": task_id, "status_message": "K8S glue status: {}".format(msg)},
method='get',
method=Request.def_method,
async_enable=False,
)
if not result.ok:

View File

@@ -213,6 +213,13 @@ class PackageManager(object):
return
return self._get_cache_manager().get_last_copied_entry()
def is_cached_enabled(self):
    """
    Tell whether the virtual environment cache is enabled.

    :return: True when a cache manager is already set on this object, or when a cache folder
        is configured (environment variable first, then the configuration entry named by
        `self._config_cache_folder`); False otherwise
    """
    # an existing cache manager means the cache is already in use
    if self._cache_manager:
        return True
    configured_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
    return bool(configured_folder)
@classmethod
def _generate_reqs_hash_keys(cls, requirements_list, docker_cmd, python_version, cuda_version):
# type: (Union[Dict, List[Dict]], Optional[Union[dict, str]], Optional[str], Optional[str]) -> List[str]

View File

@@ -53,17 +53,16 @@ class PytorchWheel(object):
python = attr.ib(type=str, converter=lambda x: str(x).replace(".", ""))
torch_version = attr.ib(type=str, converter=fix_version)
url_template = (
"http://download.pytorch.org/whl/"
"{0.cuda_version}/torch-{0.torch_version}-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl"
)
url_template_prefix = "http://download.pytorch.org/whl/"
url_template = "{0.cuda_version}/torch-{0.torch_version}" \
"-cp{0.python}-cp{0.python}m{0.unicode}-{0.os_name}.whl"
def __attrs_post_init__(self):
    """Set `self.unicode` to "u" when `self.python` starts with "2", otherwise to ""."""
    if self.python.startswith("2"):
        self.unicode = "u"
    else:
        self.unicode = ""
def make_url(self):
# type: () -> Text
return self.url_template.format(self)
return (self.url_template_prefix + self.url_template).format(self)
class PytorchResolutionError(FatalSpecsResolutionError):
@@ -183,6 +182,19 @@ class PytorchRequirement(SimpleSubstitution):
self._fix_setuptools = None
self.exceptions = []
self._original_req = []
# allow override pytorch lookup pages
if self.config.get("agent.package_manager.torch_page", None):
SimplePytorchRequirement.page_lookup_template = \
self.config.get("agent.package_manager.torch_page", None)
if self.config.get("agent.package_manager.torch_nightly_page", None):
SimplePytorchRequirement.nightly_page_lookup_template = \
self.config.get("agent.package_manager.torch_nightly_page", None)
if self.config.get("agent.package_manager.torch_url_template_prefix", None):
PytorchWheel.url_template_prefix = \
self.config.get("agent.package_manager.torch_url_template_prefix", None)
if self.config.get("agent.package_manager.torch_url_template", None):
PytorchWheel.url_template = \
self.config.get("agent.package_manager.torch_url_template", None)
def _init_python_ver_cuda_ver(self):
if self.cuda is None:

View File

@@ -92,9 +92,10 @@ class ResourceMonitor(object):
# None means no filtering, report all gpus
self._active_gpus = None
try:
active_gpus = Session.get_nvidia_visible_env() or ""
if active_gpus:
self._active_gpus = [g.strip() for g in active_gpus.split(',')]
active_gpus = Session.get_nvidia_visible_env()
# None means no filtering, report all gpus
if active_gpus and active_gpus != "all":
self._active_gpus = [g.strip() for g in str(active_gpus).split(',')]
except Exception:
pass

View File

@@ -288,7 +288,7 @@ class Session(_Session):
def get(self, service, action, version=None, headers=None,
data=None, json=None, async_enable=False, **kwargs):
return self._manual_request(service=service, action=action,
version=version, method="get", headers=headers,
version=version, method=Request.def_method, headers=headers,
data=data, async_enable=async_enable,
json=json or kwargs)
@@ -299,7 +299,7 @@ class Session(_Session):
data=data, async_enable=async_enable,
json=json or kwargs)
def _manual_request(self, service, action, version=None, method="get", headers=None,
def _manual_request(self, service, action, version=None, method=Request.def_method, headers=None,
data=None, json=None, async_enable=False, **kwargs):
res = self.send_request(service=service, action=action,

View File

@@ -1 +1 @@
__version__ = '1.3.0'
__version__ = '1.4.1'

View File

@@ -136,6 +136,12 @@ agent {
},
translate_ssh: true,
# set "disable_ssh_mount: true" to disable the automatic mount of ~/.ssh folder into the docker containers
# default is false, automatically mounts ~/.ssh
# Must be set to true if using "clearml-session" with this agent!
# disable_ssh_mount: false
# reload configuration file every daemon execution
reload_config: false,