Version bump to v1.5.2

Add agent.package_manager.poetry_install_extra_args configuration option
Fix git+ssh:// links inside installed packages not being converted properly to https authenticated links
2025-06-26 18:16:15 +00:00 · 2023-03-29 12:49:33 +03:00 · 2023-03-28 14:37:48 +03:00 · 2023-03-28 14:35:51 +03:00 · 2023-03-28 14:35:41 +03:00 · 2023-03-28 14:34:19 +03:00
36 changed files with 3049 additions and 549 deletions
--- a/clearml_agent/backend_api/config/default/agent.conf
+++ b/clearml_agent/backend_api/config/default/agent.conf
@@ -18,6 +18,8 @@
    # https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html
    # git_user: ""
    # git_pass: ""
+    # Limit credentials to a single domain, for example: github.com,
+    # all other domains will use public access (no user/pass). Default: always send user/pass for any VCS domain
    # git_host: ""

    # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
@@ -63,19 +65,20 @@
        # supported options: pip, conda, poetry
        type: pip,

-        # specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
-        pip_version: "<20.2",
+        # specify pip version to use (examples "<20.2", "==19.3.1", "", empty string will install the latest version), or a list of specifiers with environment markers
+        pip_version: ["<20.2 ; python_version < '3.10'", "<22.3 ; python_version >= '3.10'"],
        # specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
        # poetry_version: "<2",
+        # poetry_install_extra_args: ["-v"]

-        # virtual environment inheres packages from system
+        # virtual environment inherits packages from system
        system_site_packages: false,

        # install with --upgrade
        force_upgrade: false,

        # additional artifact repositories to use when installing python packages
-        # extra_index_url: ["https://allegroai.jfrog.io/clearmlai/api/pypi/public/simple"]
+        # extra_index_url: ["https://allegroai.jfrog.io/clearml/api/pypi/public/simple"]

        # additional conda channels to use when installing with conda package manager
        conda_channels: ["pytorch", "conda-forge", "defaults", ]
@@ -103,6 +106,10 @@
        # set to True to support torch nightly build installation,
        # notice: torch nightly builds are ephemeral and are deleted from time to time
        torch_nightly: false,
+
+        # if set to true, the agent will look for the "poetry.lock" file 
+        # in the passed current working directory instead of the repository's root directory.
+        poetry_files_from_repo_working_dir: false
    },

    # target folder for virtual environments builds, created when executing experiment
@@ -115,7 +122,7 @@
        # minimum required free space to allow for cache entry, disable by passing 0 or negative value
        free_space_threshold_gb: 2.0
        # unmark to enable virtual environment caching
-        # path: ~/.clearml/venvs-cache
+        path: ~/.clearml/venvs-cache
    },

    # cached git clone folder
@@ -137,6 +144,12 @@
    },

    translate_ssh: true,
+
+    # set "disable_ssh_mount: true" to disable the automatic mount of ~/.ssh folder into the docker containers
+    # default is false, automatically mounts ~/.ssh
+    # Must be set to True if using "clearml-session" with this agent!
+    # disable_ssh_mount: false
+
    # reload configuration file every daemon execution
    reload_config: false,

@@ -209,8 +222,8 @@
    # default is True, report a single \r line in a sequence of consecutive lines, per 5 seconds.
    # suppress_carriage_return: true

-    # cuda versions used for solving pytorch wheel packages
-    # should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
+    # CUDA versions used for Conda setup & solving PyTorch wheel packages
+    # Should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
    # cuda_version: 10.1
    # cudnn_version: 7.6

@@ -246,9 +259,9 @@

    # Name docker containers created by the daemon using the following string format (supported from Docker 0.6.5)
    # Allowed variables are task_id, worker_id and rand_string (random lower-case letters string, up to 32 characters)
-    # Note: resulting name must start with an alphanumeric character and continue with alphanumeric characters,
-    #  underscores (_), dots (.) and/or dashes (-)
-    #docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"
+    # Note: resulting name must start with an alphanumeric character and
+    #       continue with alphanumeric characters, underscores (_), dots (.) and/or dashes (-)
+    # docker_container_name_format: "clearml-id-{task_id}-{rand_string:.8}"

    # Apply top-level environment section from configuration into os.environ
    apply_environment: true
@@ -319,4 +332,57 @@
    # into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
    # standard flow.
    custom_build_script: ""
+
+    # Crash on exception: by default when encountering an exception while running a task,
+    # the agent will catch the exception, log it and continue running.
+    # Set this to `true` to propagate exceptions and crash the agent.
+    # crash_on_exception: true
+
+    # Disable task docker override. If true, the agent will use the default docker image and ignore any docker image
+    # and arguments specified in the task's container section (setup shell script from the task container section will
+    # be used in any case, if specified).
+    disable_task_docker_override: false
+
+    # Choose the default docker based on the Task properties,
+    # Examples: 'script.requirements', 'script.binary', 'script.repository', 'script.branch', 'project'
+    # Notice: Matching is done via regular expression, for example "^searchme$" will match exactly the "searchme" string
+    #
+    #     "default_docker": {
+    #         "image": "nvidia/cuda:10.2-cudnn7-runtime-ubuntu18.04",
+    #         # optional arguments to pass to docker image
+    #         # arguments: ["--ipc=host", ]
+    #         "match_rules": [
+    #             {
+    #                 "image": "sample_container:tag",
+    #                 "arguments": "-e VALUE=1 --ipc=host",
+    #                 "match": {
+    #                     "script": {
+    #                         "requirements": {
+    #                             "pip": {
+    #                                 "tensorflow": "~=1.6"
+    #                             }
+    #                         },
+    #                         "repository": "",
+    #                         "branch": "master"
+    #                     },
+    #                     "project": "example"
+    #                 }
+    #             },
+    #             {
+    #                 "image": "better_container:tag",
+    #                 "arguments": "",
+    #                 "match": {
+    #                     "container": "replace_me_please"
+    #                 }
+    #             },
+    #             {
+    #                 "image": "another_container:tag",
+    #                 "arguments": "",
+    #                 "match": {
+    #                     "project": "^examples", # anything that starts with "examples", e.g. "examples", "examples/sub_project"
+    #                 }
+    #             }
+    #         ]
+    #     },
+    #
 }
--- a/clearml_agent/backend_api/schema/service.py
+++ b/clearml_agent/backend_api/schema/service.py
@@ -4,7 +4,7 @@ import re
 import attr
 import six

-import pyhocon
+from clearml_agent.external import pyhocon

 from .action import Action

--- a/clearml_agent/backend_api/session/datamodel.py
+++ b/clearml_agent/backend_api/session/datamodel.py
@@ -66,11 +66,16 @@ class DataModel(object):
        }

    def validate(self, schema=None):
-        jsonschema.validate(
-            self.to_dict(),
-            schema or self._schema,
-            types=dict(array=(list, tuple), integer=six.integer_types),
+        schema = schema or self._schema
+        validator = jsonschema.validators.validator_for(schema)
+        validator_cls = jsonschema.validators.extend(
+            validator=validator,
+            type_checker=validator.TYPE_CHECKER.redefine_many({
+                "array": lambda s, instance: isinstance(instance, (list, tuple)),
+                "integer": lambda s, instance: isinstance(instance, six.integer_types),
+            }),
        )
+        jsonschema.validate(self.to_dict(), schema, cls=validator_cls)

    def __repr__(self):
        return '<{}.{}: {}>'.format(
--- a/clearml_agent/backend_api/session/session.py
+++ b/clearml_agent/backend_api/session/session.py
@@ -2,17 +2,21 @@
 import json as json_lib
 import os
 import sys
+import time
 import types
+from random import SystemRandom
 from socket import gethostname
 from typing import Optional

 import jwt
 import requests
 import six
-from pyhocon import ConfigTree, ConfigFactory
+from requests import RequestException
 from requests.auth import HTTPBasicAuth
 from six.moves.urllib.parse import urlparse, urlunparse

+from clearml_agent.external.pyhocon import ConfigTree, ConfigFactory
+
 from .callresult import CallResult
 from .defs import (
    ENV_VERBOSE, ENV_HOST, ENV_ACCESS_KEY, ENV_SECRET_KEY, ENV_WEB_HOST, ENV_FILES_HOST, ENV_AUTH_TOKEN,
@@ -25,6 +29,9 @@ from ...backend_config.environment import backward_compatibility_support
 from ...version import __version__


+sys_random = SystemRandom()
+
+
 class LoginError(Exception):
    pass

@@ -48,6 +55,7 @@ class Session(TokenManager):
    _session_initial_retry_connect_override = 4
    _write_session_data_size = 15000
    _write_session_timeout = (30.0, 30.)
+    _request_exception_retry_timeout = (2.0, 3.0)

    api_version = '2.1'
    feature_set = 'basic'
@@ -110,19 +118,9 @@ class Session(TokenManager):
        self._verbose = verbose if verbose is not None else ENV_VERBOSE.get()
        self._logger = logger
        self.__auth_token = None
+        self._propagate_exceptions_on_send = True

-        if ENV_API_DEFAULT_REQ_METHOD.get(default=None):
-            # Make sure we update the config object, so we pass it into the new containers when we map them
-            self.config["api.http.default_method"] = ENV_API_DEFAULT_REQ_METHOD.get()
-            # notice the default setting of Request.def_method are already set by the OS environment
-        elif self.config.get("api.http.default_method", None):
-            def_method = str(self.config.get("api.http.default_method", None)).strip()
-            if def_method.upper() not in ("GET", "POST", "PUT"):
-                raise ValueError(
-                    "api.http.default_method variable must be 'get' or 'post' (any case is allowed)."
-                )
-            Request.def_method = def_method
-            Request._method = Request.def_method
+        self.update_default_api_method()

        if ENV_AUTH_TOKEN.get(
            value_cb=lambda key, value: print("Using environment access token {}=********".format(key))
@@ -177,6 +175,10 @@ class Session(TokenManager):
        )
        # try to connect with the server
        self.refresh_token()
+
+        # for resilience, from now on we won't allow propagating exceptions when sending requests
+        self._propagate_exceptions_on_send = False
+
        # create the default session with many retries
        http_retries_config, self.__http_session = self._setup_session(http_retries_config)

@@ -222,7 +224,22 @@ class Session(TokenManager):

        return http_retries_config, get_http_session_with_retry(config=self.config or None, **http_retries_config)

+    def update_default_api_method(self):
+        if ENV_API_DEFAULT_REQ_METHOD.get(default=None):
+            # Make sure we update the config object, so we pass it into the new containers when we map them
+            self.config.put("api.http.default_method", ENV_API_DEFAULT_REQ_METHOD.get())
+            # notice the default settings of Request.def_method are already set by the OS environment
+        elif self.config.get("api.http.default_method", None):
+            def_method = str(self.config.get("api.http.default_method", None)).strip()
+            if def_method.upper() not in ("GET", "POST", "PUT"):
+                raise ValueError(
+                    "api.http.default_method variable must be 'get', 'post' or 'put' (any case is allowed)."
+                )
+            Request.def_method = def_method
+            Request._method = Request.def_method
+
    def load_vaults(self):
+        # type: () -> Optional[bool]
        if not self.check_min_api_version("2.15") or self.feature_set == "basic":
            return

@@ -243,12 +260,14 @@ class Session(TokenManager):

        # noinspection PyBroadException
        try:
-            res = self.send_request("users", "get_vaults", json={"enabled": True, "types": ["config"]})
+            # Use params and not data/json otherwise payload might be dropped if we're using GET with a strict firewall
+            res = self.send_request("users", "get_vaults", params="enabled=true&types=config&types=config")
            if res.ok:
                vaults = res.json().get("data", {}).get("vaults", [])
                data = list(filter(None, map(parse, vaults)))
                if data:
                    self.config.set_overrides(*data)
+                    return True
            elif res.status_code != 404:
                raise Exception(res.json().get("meta", {}).get("result_msg", res.text))
        except Exception as ex:
@@ -271,6 +290,7 @@ class Session(TokenManager):
        data=None,
        json=None,
        refresh_token_if_unauthorized=True,
+        params=None,
    ):
        """ Internal implementation for making a raw API request.
            - Constructs the api endpoint name
@@ -294,6 +314,7 @@ class Session(TokenManager):
            if version
            else "{host}/{service}.{action}"
        ).format(**locals())
+
        while True:
            if data and len(data) > self._write_session_data_size:
                timeout = self._write_session_timeout
@@ -301,16 +322,29 @@ class Session(TokenManager):
                timeout = self._session_initial_timeout
            else:
                timeout = self._session_timeout
-            res = self.__http_session.request(
-                method, url, headers=headers, auth=auth, data=data, json=json, timeout=timeout)
+
+            try:
+                res = self.__http_session.request(
+                    method, url, headers=headers, auth=auth, data=data, json=json, timeout=timeout, params=params)
+            except RequestException as ex:
+                if self._propagate_exceptions_on_send:
+                    raise
+                sleep_time = sys_random.uniform(*self._request_exception_retry_timeout)
+                self._logger.error(
+                    "{} exception sending {} {}: {} (retrying in {:.1f}sec)".format(
+                        type(ex).__name__, method.upper(), url, str(ex), sleep_time
+                    )
+                )
+                time.sleep(sleep_time)
+                continue

            if (
                refresh_token_if_unauthorized
                and res.status_code == requests.codes.unauthorized
                and not token_refreshed_on_error
            ):
-                # it seems we're unauthorized, so we'll try to refresh our token once in case permissions changed since
-                # the last time we got the token, and try again
+                # it seems we're unauthorized, so we'll try to refresh our token once in case permissions changed
+                # since the last time we got the token, and try again
                self.refresh_token()
                token_refreshed_on_error = True
                # try again
@@ -347,6 +381,7 @@ class Session(TokenManager):
        data=None,
        json=None,
        async_enable=False,
+        params=None,
    ):
        """
        Send a raw API request.
@@ -359,6 +394,7 @@ class Session(TokenManager):
                     content type will be application/json)
        :param data: Dictionary, bytes, or file-like object to send in the request body
        :param async_enable: whether request is asynchronous
+        :param params: additional query parameters
        :return: requests Response instance
        """
        headers = self.add_auth_headers(
@@ -375,6 +411,7 @@ class Session(TokenManager):
            headers=headers,
            data=data,
            json=json,
+            params=params,
        )

    def send_request_batch(
@@ -627,15 +664,14 @@ class Session(TokenManager):

        res = None
        try:
-            data = {"expiration_sec": exp} if exp else {}
            res = self._send_request(
                method=Request.def_method,
                service="auth",
                action="login",
                auth=auth,
-                json=data,
                headers=headers,
                refresh_token_if_unauthorized=False,
+                params={"expiration_sec": exp} if exp else {},
            )
            try:
                resp = res.json()
@@ -674,3 +710,13 @@ class Session(TokenManager):
        return "{self.__class__.__name__}[{self.host}, {self.access_key}/{secret_key}]".format(
            self=self, secret_key=self.secret_key[:5] + "*" * (len(self.secret_key) - 5)
        )
+
+    @property
+    def propagate_exceptions_on_send(self):
+        # type: () -> bool
+        return self._propagate_exceptions_on_send
+
+    @propagate_exceptions_on_send.setter
+    def propagate_exceptions_on_send(self, value):
+        # type: (bool) -> None
+        self._propagate_exceptions_on_send = value
--- a/clearml_agent/backend_config/config.py
+++ b/clearml_agent/backend_config/config.py
@@ -7,10 +7,8 @@ import sys
 from os.path import expanduser
 from typing import Any

-import pyhocon
 import six
 from pathlib2 import Path
-from pyhocon import ConfigTree, ConfigFactory
 from pyparsing import (
    ParseFatalException,
    ParseException,
@@ -18,6 +16,9 @@ from pyparsing import (
    ParseSyntaxException,
 )

+from clearml_agent.external import pyhocon
+from clearml_agent.external.pyhocon import ConfigTree, ConfigFactory
+
 from .defs import (
    Environment,
    DEFAULT_CONFIG_FOLDER,
@@ -191,16 +192,20 @@ class Config(object):
            config, self._read_extra_env_config_values(), copy_trees=True
        )

-        if self._overrides_configs:
-            config = functools.reduce(
-                lambda cfg, override: ConfigTree.merge_configs(cfg, override, copy_trees=True),
-                self._overrides_configs,
-                config,
-            )
+        config = self.resolve_override_configs(config)

        config["env"] = env
        return config

+    def resolve_override_configs(self, initial=None):
+        if not self._overrides_configs:
+            return initial
+        return functools.reduce(
+            lambda cfg, override: ConfigTree.merge_configs(cfg, override, copy_trees=True),
+            self._overrides_configs,
+            initial or ConfigTree(),
+        )
+
    def _read_extra_env_config_values(self) -> ConfigTree:
        """ Loads extra configuration from environment-injected values """
        result = ConfigTree()
@@ -289,6 +294,9 @@ class Config(object):
            )
        return value

+    def put(self, key, value):
+        self._config.put(key, value)
+
    def to_dict(self):
        return self._config.as_plain_ordered_dict()

--- a/clearml_agent/backend_config/converters.py
+++ b/clearml_agent/backend_config/converters.py
@@ -14,6 +14,14 @@ except ImportError:
 ConverterType = TypeVar("ConverterType", bound=Callable[[Any], Any])


+def text_to_int(value, default=0):
+    # type: (Any, int) -> int
+    try:
+        return int(value)
+    except (ValueError, TypeError):
+        return default
+
+
 def base64_to_text(value):
    # type: (Any) -> Text
    return base64.b64decode(value).decode("utf-8")
--- a/clearml_agent/backend_config/utils.py
+++ b/clearml_agent/backend_config/utils.py
@@ -4,7 +4,7 @@ from os.path import expandvars, expanduser
 from pathlib import Path
 from typing import List, TYPE_CHECKING

-from pyhocon import HOCONConverter, ConfigTree
+from clearml_agent.external.pyhocon import HOCONConverter, ConfigTree

 if TYPE_CHECKING:
    from .config import Config
--- a/clearml_agent/commands/base.py
+++ b/clearml_agent/commands/base.py
@@ -118,13 +118,15 @@ class ServiceCommandSection(BaseCommandSection):
        """ The name of the REST service used by this command """
        pass

-    def get(self, endpoint, *args, session=None, **kwargs):
+    def get(self, endpoint, *args, service=None, session=None, **kwargs):
        session = session or self._session
-        return session.get(service=self.service, action=endpoint, *args, **kwargs)
+        service = service or self.service
+        return session.get(service=service, action=endpoint, *args, **kwargs)

-    def post(self, endpoint, *args, session=None, **kwargs):
+    def post(self, endpoint, *args, service=None, session=None, **kwargs):
        session = session or self._session
-        return session.post(service=self.service, action=endpoint, *args, **kwargs)
+        service = service or self.service
+        return session.post(service=service, action=endpoint, *args, **kwargs)

    def get_with_act_as(self, endpoint, *args, **kwargs):
        return self._session.get_with_act_as(service=self.service, action=endpoint, *args, **kwargs)
--- a/clearml_agent/commands/config.py
+++ b/clearml_agent/commands/config.py
@@ -1,14 +1,15 @@
 from __future__ import print_function

-from six.moves import input
-from pyhocon import ConfigFactory, ConfigMissingException
+from typing import Dict, Optional
+
 from pathlib2 import Path
+from six.moves import input
 from six.moves.urllib.parse import urlparse

 from clearml_agent.backend_api.session import Session
 from clearml_agent.backend_api.session.defs import ENV_HOST
 from clearml_agent.backend_config.defs import LOCAL_CONFIG_FILES
-
+from clearml_agent.external.pyhocon import ConfigFactory, ConfigMissingException

 description = """
 Please create new clearml credentials through the settings page in your `clearml-server` web app, 
@@ -112,6 +113,21 @@ def main():
        print('Exiting setup without creating configuration file')
        return

+    selection = input_options(
+        'Default Output URI (used to automatically store models and artifacts)',
+        {'N': 'None', 'S': 'ClearML Server', 'C': 'Custom'},
+        default='None'
+    )
+    if selection == 'Custom':
+        print('Custom Default Output URI: ', end='')
+        default_output_uri = input().strip()
+    elif selection == "ClearML Server":
+        default_output_uri = files_host
+    else:
+        default_output_uri = None
+
+    print('\nDefault Output URI: {}'.format(default_output_uri if default_output_uri else 'not set'))
+
    # get GIT User/Pass for cloning
    print('Enter git username for repository cloning (leave blank for SSH key authentication): [] ', end='')
    git_user = input()
@@ -179,6 +195,13 @@ def main():
                              'agent.package_manager.extra_index_url= ' \
                              '[\n{}\n]\n\n'.format("\n".join(map("\"{}\"".format, extra_index_urls)))
            f.write(extra_index_str)
+            if default_output_uri:
+                default_output_url_str = '# Default Task output_uri. if output_uri is not provided to Task.init, ' \
+                                         'default_output_uri will be used instead.\n' \
+                                         'sdk.development.default_output_uri="{}"\n' \
+                                         '\n'.format(default_output_uri.strip('"'))
+                f.write(default_output_url_str)
+                default_conf = default_conf.replace('default_output_uri: ""', '# default_output_uri: ""')
            f.write(default_conf)
    except Exception:
        print('Error! Could not write configuration file at: {}'.format(str(conf_file)))
@@ -305,6 +328,25 @@ def input_url(host_type, host=None):
    return host


+def input_options(message, options, default=None):
+    # type: (str, Dict[str, str], Optional[str]) -> str
+    options_msg = "/".join(
+        "".join(('(' + c.upper() + ')') if c == o else c for c in option)
+        for o, option in options.items()
+    )
+    if default:
+        options_msg += " [{}]".format(default)
+    while True:
+        print('{}: {} '.format(message, options_msg), end='')
+        res = input().strip()
+        if not res:
+            return default
+        elif res.lower() in options:
+            return options[res.lower()]
+        elif res.upper() in options:
+            return options[res.upper()]
+
+
 def input_host_port(host_type, parsed_host):
    print('Enter port for {} host '.format(host_type), end='')
    replace_port = input().lower()
--- a/clearml_agent/commands/events.py
+++ b/clearml_agent/commands/events.py
@@ -3,8 +3,6 @@ from __future__ import print_function
 import json
 import time

-from future.builtins import super
-
 from clearml_agent.commands.base import ServiceCommandSection
 from clearml_agent.helper.base import return_list

--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -1,6 +1,7 @@
 from __future__ import print_function, division, unicode_literals

 import errno
+import functools
 import json
 import logging
 import os
@@ -20,20 +21,17 @@ from datetime import datetime
 from distutils.spawn import find_executable
 from distutils.util import strtobool
 from functools import partial
-from itertools import chain
 from os.path import basename
 from tempfile import mkdtemp, NamedTemporaryFile
 from time import sleep, time
 from typing import Text, Optional, Any, Tuple, List

 import attr
-import psutil
 import six
 from pathlib2 import Path
-from pyhocon import ConfigTree, ConfigFactory
 from six.moves.urllib.parse import quote
-from six.moves.urllib.parse import urlparse, urlunparse

+from clearml_agent.external.pyhocon import ConfigTree, ConfigFactory
 from clearml_agent.backend_api.services import auth as auth_api
 from clearml_agent.backend_api.services import queues as queues_api
 from clearml_agent.backend_api.services import tasks as tasks_api
@@ -44,6 +42,7 @@ from clearml_agent.backend_api.session.defs import (
    ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
 from clearml_agent.backend_config.defs import UptimeConf
 from clearml_agent.backend_config.utils import apply_environment, apply_files
+from clearml_agent.backend_config.converters import text_to_int
 from clearml_agent.commands.base import resolve_names, ServiceCommandSection
 from clearml_agent.commands.resolver import resolve_default_container
 from clearml_agent.definitions import (
@@ -59,10 +58,7 @@ from clearml_agent.definitions import (
    ENV_WORKER_ID,
    ENV_WORKER_TAGS,
    ENV_DOCKER_SKIP_GPUS_FLAG,
-    ENV_AGENT_SECRET_KEY,
    ENV_AGENT_AUTH_TOKEN,
-    ENV_AWS_SECRET_KEY,
-    ENV_AZURE_ACCOUNT_KEY,
    ENV_AGENT_DISABLE_SSH_MOUNT,
    ENV_SSH_AUTH_SOCK,
    ENV_AGENT_SKIP_PIP_VENV_INSTALL,
@@ -72,6 +68,9 @@ from clearml_agent.definitions import (
    WORKING_STANDALONE_DIR,
    ENV_DEBUG_INFO,
    ENV_CHILD_AGENTS_COUNT_CMD,
+    ENV_DOCKER_ARGS_FILTERS,
+    ENV_FORCE_SYSTEM_SITE_PACKAGES,
+    ENV_SERVICES_DOCKER_RESTART,
 )
 from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
 from clearml_agent.errors import (
@@ -139,6 +138,7 @@ from clearml_agent.helper.repo import clone_repository_cached, RepoInfo, VCS, fi
 from clearml_agent.helper.resource_monitor import ResourceMonitor
 from clearml_agent.helper.runtime_verification import check_runtime, print_uptime_properties
 from clearml_agent.helper.singleton import Singleton
+from clearml_agent.helper.docker_args import DockerArgsSanitizer
 from clearml_agent.session import Session
 from .events import Events

@@ -635,21 +635,12 @@ class Worker(ServiceCommandSection):
                    self._pip_extra_index_url.insert(0, e)
        except Exception:
            self.log.warning('Failed adding extra-index-url to pip environment: {}'.format(extra_url))
-        # update pip install command
-        pip_install_cmd = ["pip", "install"]
-        if self._pip_extra_index_url:
-            pip_install_cmd.extend(
-                chain.from_iterable(
-                    ("--extra-index-url", x) for x in self._pip_extra_index_url
-                )
-            )
-        self.pip_install_cmd = tuple(pip_install_cmd)
+
        self.worker_id = self._session.config["agent.worker_id"] or "{}:{}".format(
            self._session.config["agent.worker_name"], os.getpid()
        )
        self.parent_worker_id = None  # maybe add os env for overriding
-        self._last_stats = defaultdict(lambda: 0)
-        self._last_report_timestamp = psutil.time.time()
+
        self.temp_config_path = None
        self.queues = ()
        self.venv_folder = None  # type: Optional[Text]
@@ -686,6 +677,20 @@ class Worker(ServiceCommandSection):
        # str - not supported, version string indicates last server version
        self._runtime_props_support = None

+        # allow docker sanitization, needs backend support
+        if ENV_DOCKER_ARGS_FILTERS.get():
+            self._docker_args_filters = \
+                [re.compile(f) for f in shlex.split(ENV_DOCKER_ARGS_FILTERS.get())]
+        elif self._session.config.get('agent.docker_args_filters', None):
+            self._docker_args_filters = \
+                [re.compile(f) for f in self._session.config.get('agent.docker_args_filters', [])]
+        else:
+            self._docker_args_filters = []
+
+        self._task_ping_interval_sec = max(
+            0, text_to_int(self._session.config.get("agent.task_ping_interval_sec", 60.0))
+        )
+
    @classmethod
    def _verify_command_states(cls, kwargs):
        """
@@ -731,8 +736,68 @@ class Worker(ServiceCommandSection):
        except Exception:
            pass

+    def _get_docker_restart_value(self, task_session, task_id: str):
+        try:
+            self._session.verify_feature_set('advanced')
+        except ValueError:
+            return
+
+        restart = (ENV_SERVICES_DOCKER_RESTART.get() or "").strip()
+        if not restart:
+            return
+
+        # Parse value and selector
+        restart_value, _, selector = restart.partition(";")
+
+        if restart_value not in ("unless-stopped", "no", "always") and not restart_value.startswith("on-failure"):
+            self.log.error(
+                "Invalid value \"{}\" provided for {}, ignoring".format(restart, ENV_SERVICES_DOCKER_RESTART.vars[0])
+            )
+            return
+
+        if not selector:
+            return restart_value
+
+        path, _, expected_value = selector.partition("=")
+
+        result = task_session.send_request(
+            service='tasks',
+            action='get_all',
+            json={'id': [task_id], 'only_fields': [path], 'search_hidden': True},
+            method=Request.def_method,
+        )
+        if not result.ok:
+            result_msg = self._get_path(result.json(), 'meta', 'result_msg')
+            self.log.error(
+                "Failed obtaining selector value for restart option \"{}\", ignoring: {}".format(selector, result_msg)
+            )
+            return
+
+        not_found = object()
+        try:
+            value = self._get_path(result.json(), 'data', 'tasks', 0, *path.split("."), default=not_found)
+        except (ValueError, TypeError):
+            return
+
+        if value is not_found:
+            return
+
+        if not expected_value:
+            return restart_value
+
+        # noinspection PyBroadException
+        try:
+            if (
+                (isinstance(value, bool) and value == strtobool(expected_value))  # check first - bool is also an int
+                or (isinstance(value, (int, float)) and value == float(expected_value))
+                or (str(value) == str(expected_value))
+            ):
+                return restart_value
+        except Exception as ex:
+            pass
+
    def run_one_task(self, queue, task_id, worker_args, docker=None, task_session=None):
-        # type: (Text, Text, WorkerParams, Optional[Text]) -> int
+        # type: (Text, Text, WorkerParams, Optional[Text], Optional[Session]) -> Optional[int]
        """
        Run one task pulled from queue.
        :param queue: ID of queue that task was pulled from
@@ -777,10 +842,18 @@ class Worker(ServiceCommandSection):
            except Exception:
                task_container = {}

-            default_docker = not bool(task_container.get('image'))
-            docker_image = task_container.get('image') or self._docker_image
-            docker_arguments = task_container.get(
-                'arguments', self._docker_arguments if default_docker else None)
+            default_docker = (
+                self._session.config.get('agent.disable_task_docker_override', False)
+                or not bool(task_container.get('image'))
+            )
+            if default_docker:
+                docker_image = self._docker_image
+                docker_arguments = self._docker_arguments
+            else:
+                docker_image = task_container.get('image') or self._docker_image
+                docker_arguments = task_container.get(
+                    'arguments', self._docker_arguments if default_docker else None)
+
            docker_setup_script = task_container.get('setup_shell_script')

            self.send_logs(
@@ -788,7 +861,7 @@ class Worker(ServiceCommandSection):
                lines=
                ['Running Task {} inside {}docker: {} arguments: {}\n'.format(
                    task_id, "default " if default_docker else '',
-                    docker_image, self._sanitize_docker_command(docker_arguments or []))]
+                    docker_image, DockerArgsSanitizer.sanitize_docker_command(self._session, docker_arguments or []))]
                + (['custom_setup_bash_script:\n{}'.format(docker_setup_script)] if docker_setup_script else []),
                level="INFO",
                session=task_session,
@@ -799,6 +872,7 @@ class Worker(ServiceCommandSection):
                docker_image=docker_image,
                docker_arguments=docker_arguments,
                docker_bash_setup_script=docker_setup_script,
+                restart=self._get_docker_restart_value(task_session, task_id),
            )
            if self._impersonate_as_task_owner:
                docker_params["auth_token"] = task_session.token
@@ -851,7 +925,7 @@ class Worker(ServiceCommandSection):
                '--standalone-mode' if self._standalone_mode else '',
                task_id)

-            display_docker_command = self._sanitize_docker_command(full_docker_cmd)
+            display_docker_command = DockerArgsSanitizer.sanitize_docker_command(self._session, full_docker_cmd)

            # send the actual used command line to the backend
            self.send_logs(
@@ -958,6 +1032,7 @@ class Worker(ServiceCommandSection):
            if not (result.ok() and result.response):
                return
            new_session = copy(session)
+            new_session.api_client = None
            new_session.set_auth_token(result.response.token)
            return new_session

@@ -1166,7 +1241,7 @@ class Worker(ServiceCommandSection):
                        print("No tasks in Queues, sleeping for {:.1f} seconds".format(self._polling_interval))
                    sleep(self._polling_interval)

-                if self._session.config["agent.reload_config"]:
+                if self._session.config.get("agent.reload_config", False):
                    self.reload_config()
        finally:
            # if we are in dynamic gpus mode, shutdown all active runs
@@ -1197,7 +1272,7 @@ class Worker(ServiceCommandSection):
        except Exception:
            return None

-        worker_name = self._session.config["agent.worker_name"] + ':gpu'
+        worker_name = self._session.config.get("agent.worker_name", "") + ':gpu'
        our_workers = [
            w.id for w in response.workers
            if w.id.startswith(worker_name) and w.id != self.worker_id]
@@ -1548,10 +1623,14 @@ class Worker(ServiceCommandSection):
                        gpu_indexes=gpu_indexes,
                        gpu_queues=dynamic_gpus,
                    )
-                except Exception:
+                except Exception as e:
                    tb = six.text_type(traceback.format_exc())
                    print("FATAL ERROR:")
                    print(tb)
+
+                    if self._session.config.get("agent.crash_on_exception", False):
+                        raise e
+
                    crash_file, name = safe_mkstemp(prefix=".clearml_agent-crash", suffix=".log")
                    try:
                        with crash_file:
@@ -1657,7 +1736,9 @@ class Worker(ServiceCommandSection):

        # noinspection PyBroadException
        try:
-            config_data = self._session.config.as_plain_ordered_dict() if config is None else config.as_plain_ordered_dict()
+            config_data = (
+                self._session.config.as_plain_ordered_dict() if config is None else config.as_plain_ordered_dict()
+            )
            if clean_api_credentials:
                api = config_data.get("api")
                if api:
@@ -1730,6 +1811,7 @@ class Worker(ServiceCommandSection):
        stopping = False
        status = None
        process = None
+        last_task_ping = 0
        try:
            _last_machine_update_ts = time()
            stop_reason = None
@@ -1765,6 +1847,18 @@ class Worker(ServiceCommandSection):
                if stderr:
                    stderr.flush()

+                if not stopping and self._task_ping_interval_sec and \
+                        time() - last_task_ping > self._task_ping_interval_sec:
+                    # noinspection PyBroadException
+                    try:
+                        res = (session or self._session).send(tasks_api.PingRequest(task=task_id))
+                        if not res:
+                            self.log.error("Failed sending ping for task %s: %s", task_id, res.response)
+                    except Exception as ex:
+                        self.log.error("Failed sending ping: %s", str(ex))
+                    finally:
+                        last_task_ping = time()
+
                # get diff from previous poll
                printed_lines, stdout_pos_count = _print_file(stdout_path, stdout_pos_count)
                if self._services_mode and not stopping and status is None:
@@ -1936,6 +2030,11 @@ class Worker(ServiceCommandSection):
            except Exception as ex:
                print("Error: failed applying files from configuration: {}".format(ex))

+        try:
+            self._session.update_default_api_method()
+        except Exception as ex:
+            print("Error: failed updating default API method: {}".format(ex))
+
    @resolve_names
    def build(
        self,
@@ -1950,6 +2049,10 @@ class Worker(ServiceCommandSection):
    ):
        if not task_id:
            raise CommandFailedError("Worker build must have valid task id")
+        
+        if target and not os.path.isabs(target):
+            # Non absolute target path will lead to errors with relative python executable
+            target = os.path.abspath(target)

        self._session.print_configuration()

@@ -2055,7 +2158,10 @@ class Worker(ServiceCommandSection):
            # noinspection PyBroadException
            try:
                task_container = get_task_container(self._session, task_id)
-                if task_container.get('image'):
+                if (
+                    task_container.get('image')
+                    and not self._session.config.get('agent.disable_task_docker_override', False)
+                ):
                    docker_image = task_container.get('image')
                    print('Ignoring default docker image, using task docker image {}'.format(docker_image))
                    docker_arguments = task_container.get('arguments')
@@ -2066,12 +2172,14 @@ class Worker(ServiceCommandSection):
        print('Building Task {} inside docker image: {} {} setup_script={}\n'.format(
            task_id, docker_image, docker_arguments or '', docker_setup_script or ''))
        full_docker_cmd = self.docker_image_func(
-            docker_image=docker_image, docker_arguments=docker_arguments, docker_bash_setup_script=docker_setup_script)
+            docker_image=docker_image, docker_arguments=docker_arguments, docker_bash_setup_script=docker_setup_script
+        )

        end_of_build_marker = "build.done=true"
        docker_cmd_suffix = ' build --id {task_id} --install-globally; ' \
-                            'echo "" >> {conf_file} ; ' \
-                            'echo {end_of_build_marker} >> {conf_file} ; ' \
+                            'ORG=$(stat -c "%u:%g" {conf_file}) ; chown $(whoami):$(whoami) {conf_file} ; ' \
+                            'echo "" >> {conf_file} ; echo {end_of_build_marker} >> {conf_file} ; ' \
+                            'chown $ORG {conf_file} ; ' \
                            'bash'.format(
                                task_id=task_id,
                                end_of_build_marker=end_of_build_marker,
@@ -2090,10 +2198,16 @@ class Worker(ServiceCommandSection):

        # now we need to wait until the line shows on our configuration file.
        while True:
-            while temp_config.stat().st_mtime == base_time_stamp:
-                sleep(5.0)
-            with open(temp_config.as_posix()) as f:
-                lines = [l.strip() for l in f.readlines()]
+            # noinspection PyBroadException
+            try:
+                while temp_config.stat().st_mtime == base_time_stamp:
+                    sleep(5.0)
+                with open(temp_config.as_posix()) as f:
+                    lines = [l.strip() for l in f.readlines()]
+            except Exception as ex:
+                # print("Failed reading status file [{}], retrying in 2 seconds".format(ex))
+                sleep(2.0)
+
            if 'build.done=true' in lines:
                break
            base_time_stamp = temp_config.stat().st_mtime
@@ -2122,6 +2236,8 @@ class Worker(ServiceCommandSection):
        print(commit_docker(container_name=target, docker_id=docker_id, apply_change=change))
        shutdown_docker_process(docker_id=docker_id)

+        safe_remove_file(temp_config.as_posix())
+
        return

    def _get_task_python_version(self, task):
@@ -2592,7 +2708,9 @@ class Worker(ServiceCommandSection):
        print("Executing task id [%s]:" % current_task.id)
        sanitized_execution = attr.evolve(
            execution,
-            docker_cmd=" ".join(self._sanitize_docker_command(shlex.split(execution.docker_cmd or ""))),
+            docker_cmd=" ".join(DockerArgsSanitizer.sanitize_docker_command(
+                self._session, shlex.split(execution.docker_cmd or ""))
+            ),
        )
        for pair in attr.asdict(sanitized_execution).items():
            print("{} = {}".format(*pair))
@@ -2799,8 +2917,8 @@ class Worker(ServiceCommandSection):
        # Todo: add support for poetry caching
        if not self.poetry.enabled:
            # add to cache
-            print('Adding venv into cache: {}'.format(add_venv_folder_cache))
            if add_venv_folder_cache:
+                print('Adding venv into cache: {}'.format(add_venv_folder_cache))
                self.package_api.add_cached_venv(
                    requirements=[freeze, previous_reqs],
                    docker_cmd=execution_info.docker_cmd if execution_info else None,
@@ -2821,19 +2939,27 @@ class Worker(ServiceCommandSection):
            self.log_traceback(e)
        return freeze

-    def _install_poetry_requirements(self, repo_info):
-        # type: (Optional[RepoInfo]) -> Optional[PoetryAPI]
+    def _install_poetry_requirements(self, repo_info, working_dir=None):
+        # type: (Optional[RepoInfo], Optional[str]) -> Optional[PoetryAPI]
+        """
+        Install the task requirements with poetry, when poetry is enabled and usable for the repository.
+
+        Poetry is initialized in the repository root, or in `working_dir` joined onto the repository root
+        when `agent.package_manager.poetry_files_from_repo_working_dir` is set.
+
+        :param repo_info: Repository information (only `root` is read here); nothing is done when empty
+        :param working_dir: Optional task working directory, joined onto the repository root
+        :return: The poetry API object after `install()` was called, otherwise None
+            (no repository, poetry disabled, poetry API not enabled for the folder, or an error - which is logged)
+        """
        if not repo_info:
            return None
+
+        files_from_working_dir = self._session.config.get(
+            "agent.package_manager.poetry_files_from_repo_working_dir", False)
+        lockfile_path = Path(repo_info.root) / ((working_dir or "") if files_from_working_dir else "")
+
        try:
            if not self.poetry.enabled:
                return None
-            self.poetry.initialize(cwd=repo_info.root)
-            api = self.poetry.get_api(repo_info.root)
+
+            self.poetry.initialize(cwd=lockfile_path)
+            api = self.poetry.get_api(lockfile_path)
            if api.enabled:
                print('Poetry Enabled: Ignoring requested python packages, using repository poetry lock file!')
                api.install()
                return api
+            
+            print(f"Could not find pyproject.toml or poetry.lock file in {lockfile_path} \n")
        except Exception as ex:
            self.log.error("failed installing poetry requirements: {}".format(ex))
        return None
@@ -2864,7 +2990,8 @@ class Worker(ServiceCommandSection):
         """
        if package_api:
            package_api.cwd = cwd
-        api = self._install_poetry_requirements(repo_info)
+
+        api = self._install_poetry_requirements(repo_info, execution.working_dir)
        if api:
            # update back the package manager, this hack should be fixed
            if package_api == self.package_api:
@@ -3269,6 +3396,11 @@ class Worker(ServiceCommandSection):
            first_time=first_time,
        )

+        # print message so users know they can enable cache
+        if not self.package_api.is_cached_enabled():
+            print('::: Python virtual environment cache is DISABLED. '
+                  'To accelerate spin-up time set `agent.venvs_cache.path=~/.clearml/venvs-cache` :::\n')
+
        # check if we have a cached folder
        if cached_requirements and not skip_pip_venv_install and self.package_api.get_cached_venv(
            requirements=cached_requirements,
@@ -3402,7 +3534,7 @@ class Worker(ServiceCommandSection):

        print("Running in Docker{} mode (v19.03 and above) - using default docker image: {} {}\n".format(
            ' *standalone*' if self._standalone_mode else '', self._docker_image,
-            self._sanitize_docker_command(self._docker_arguments) or ''))
+            DockerArgsSanitizer.sanitize_docker_command(self._session, self._docker_arguments) or ''))

        temp_config = deepcopy(self._session.config)
        mounted_cache_dir = temp_config.get(
@@ -3416,7 +3548,6 @@ class Worker(ServiceCommandSection):
        temp_config.put("sdk.storage.cache.default_base_dir", mounted_cache_dir)
        temp_config.put("agent.pip_download_cache.path", mounted_pip_dl_dir)
        temp_config.put("agent.vcs_cache.path", mounted_vcs_cache)
-        temp_config.put("agent.package_manager.system_site_packages", True)
        temp_config.put("agent.package_manager.conda_env_as_base_docker", False)
        temp_config.put("agent.default_python", "")
        temp_config.put("agent.python_binary", "")
@@ -3428,6 +3559,11 @@ class Worker(ServiceCommandSection):
        temp_config.put("agent.git_pass", (ENV_AGENT_GIT_PASS.get() or
                                           self._session.config.get("agent.git_pass", None)))

+        force_system_site_packages = ENV_FORCE_SYSTEM_SITE_PACKAGES.get()
+        force_system_site_packages = force_system_site_packages if force_system_site_packages is not None else True
+        if force_system_site_packages:
+            temp_config.put("agent.package_manager.system_site_packages", True)
+
        if temp_config.get("agent.venvs_cache.path", None):
            temp_config.put("agent.venvs_cache.path", '/root/.clearml/venvs-cache')

@@ -3439,7 +3575,7 @@ class Worker(ServiceCommandSection):
                    '-v', '{}:{}'.format(ENV_SSH_AUTH_SOCK.get(), ENV_SSH_AUTH_SOCK.get()),
                    '-e', ssh_auth_sock_env,
                ]
-        elif ENV_AGENT_DISABLE_SSH_MOUNT.get():
+        elif ENV_AGENT_DISABLE_SSH_MOUNT.get() or self._session.config.get("agent.disable_ssh_mount", None):
            self._host_ssh_cache = None
        else:
            self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
@@ -3591,9 +3727,7 @@ class Worker(ServiceCommandSection):
        """Get the amount of running child agents. In case of any error return 0"""
        parent_worker_label = self._parent_worker_label.format(self.worker_id)

-        default_cmd = 'docker ps --filter label={parent_worker_label} --format ' \
-                      '{{"ID":"{{{{ .ID }}}}", "Image": "{{{{ .Image }}}}", ' \
-                      '"Names":"{{{{ .Names }}}}", "Labels":"{{{{ .Labels }}}}"}}'
+        default_cmd = 'docker ps --filter label={parent_worker_label} --format {{{{.ID}}}}'
        child_agents_cmd = ENV_CHILD_AGENTS_COUNT_CMD.get() or default_cmd

        cmd = shlex.split(child_agents_cmd.format(parent_worker_label=parent_worker_label))
@@ -3607,6 +3741,31 @@ class Worker(ServiceCommandSection):

        return len(output.splitlines()) if output else 0

+    def _filter_docker_args(self, docker_args):
+        # type: (List[str]) -> List[str]
+        """
+        Keep only the docker args whose flag matches one of the self._docker_args_filters regular expressions
+        (e.g. "^--env$", "^-e$"). A matched flag without "=" also keeps the next argument, unless that is a flag.
+
+        :argument docker_args: List of docker argument strings (flags and values)
+        :return: Matching flags and their values (docker_args itself when it is empty or no filters are set)
+        """
+        if not docker_args or not self._docker_args_filters:
+            return docker_args
+
+        results = []
+        index, count = 0, len(docker_args)
+        while index < count:
+            cmd = docker_args[index].strip()
+            index += 1
+            if any(f.match(cmd) for f in self._docker_args_filters):
+                results.append(cmd)
+                # without an inline "=" the next argument is the flag's value, unless it is itself a flag
+                if "=" not in cmd and index < count and not docker_args[index].startswith("-"):
+                    results.append(docker_args[index].strip())
+                    index += 1
+        return results
+
    def _get_docker_cmd(
            self,
            worker_id, parent_worker_id,
@@ -3632,11 +3791,20 @@ class Worker(ServiceCommandSection):
            name=None,
            mount_ssh=None, mount_ssh_ro=None, mount_apt_cache=None, mount_pip_cache=None, mount_poetry_cache=None,
            env_task_id=None,
+            restart=None,
    ):
        self.debug("Constructing docker command", context="docker")
        docker = 'docker'

        base_cmd = [docker, 'run', '-t']
+        use_rm = True
+        if restart:
+            if restart in ("unless-stopped", "no", "always") or restart.startswith("on-failure"):
+                base_cmd += ["--restart", restart]
+                use_rm = False
+            else:
+                self.log.error("Invalid restart value \"{}\" , ignoring".format(restart))
+
        update_scheme = ""
        dockers_nvidia_visible_devices = 'all'
        gpu_devices = Session.get_nvidia_visible_env()
@@ -3660,6 +3828,7 @@ class Worker(ServiceCommandSection):
        if docker_arguments:
            docker_arguments = list(docker_arguments) \
                if isinstance(docker_arguments, (list, tuple)) else [docker_arguments]
+            docker_arguments = self._filter_docker_args(docker_arguments)
            base_cmd += [a for a in docker_arguments if a]

        if extra_docker_arguments:
@@ -3761,7 +3930,6 @@ class Worker(ServiceCommandSection):
        except:
            pass

-        agent_install_bash_script = []
        if os.environ.get('FORCE_LOCAL_CLEARML_AGENT_WHEEL'):
            local_wheel = os.path.expanduser(os.environ.get('FORCE_LOCAL_CLEARML_AGENT_WHEEL'))
            docker_wheel = '/tmp/{}'.format(basename(local_wheel))
@@ -3802,9 +3970,6 @@ class Worker(ServiceCommandSection):
            if preprocess_bash_script:
                bash_script = preprocess_bash_script + bash_script

-            if agent_install_bash_script:
-                bash_script += agent_install_bash_script
-
            docker_bash_script = " ; ".join([line for line in bash_script if line]) \
                if not isinstance(bash_script, str) else bash_script

@@ -3813,10 +3978,10 @@ class Worker(ServiceCommandSection):
            update_scheme += (
                    docker_bash_script + " ; " +
                    "[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON={python} ; " +
-                    "$LOCAL_PYTHON -m pip install -U \"pip{pip_version}\" ; " +
+                    "$LOCAL_PYTHON -m pip install -U {pip_version} ; " +
                    "$LOCAL_PYTHON -m pip install -U {clearml_agent_wheel} ; ").format(
                python_single_digit=python_version.split('.')[0],
-                python=python_version, pip_version=PackageManager.get_pip_version(),
+                python=python_version, pip_version=" ".join(PackageManager.get_pip_versions(wrap='\"')),
                clearml_agent_wheel=clearml_agent_wheel,
                mount_ssh_ro=mount_ssh_ro, mount_ssh=mount_ssh,
            )
@@ -3852,7 +4017,8 @@ class Worker(ServiceCommandSection):
            (['-v', host_cache+':'+mounted_cache] if host_cache else []) +
            (['-v', host_vcs_cache+':'+mounted_vcs_cache] if host_vcs_cache else []) +
            (['-v', host_venvs_cache + ':' + mounted_venvs_cache] if host_venvs_cache else []) +
-            ['--rm', docker_image, 'bash', '-c',
+            (['--rm'] if use_rm else []) +
+            [docker_image, 'bash', '-c',
                update_scheme +
                extra_shell_script +
                "cp {} {} ; ".format(DOCKER_ROOT_CONF_FILE, DOCKER_DEFAULT_CONF_FILE) +
@@ -4038,76 +4204,6 @@ class Worker(ServiceCommandSection):
            queue_ids.append(q_id)
        return queue_ids

-    @staticmethod
-    def _sanitize_urls(s: str) -> Tuple[str, bool]:
-        """ Replaces passwords in URLs with asterisks """
-        regex = re.compile("^([^:]*:)[^@]+(.*)$")
-        tokens = re.split(r"\s", s)
-        changed = False
-        for k in range(len(tokens)):
-            if "@" in tokens[k]:
-                res = urlparse(tokens[k])
-                if regex.match(res.netloc):
-                    changed = True
-                    tokens[k] = urlunparse((
-                        res.scheme,
-                        regex.sub("\\1********\\2", res.netloc),
-                        res.path,
-                        res.params,
-                        res.query,
-                        res.fragment
-                    ))
-        return " ".join(tokens) if changed else s, changed
-
-    def _sanitize_docker_command(self, docker_command):
-        # type: (List[str]) -> List[str]
-        if not docker_command:
-            return docker_command
-        if not self._session.config.get('agent.hide_docker_command_env_vars.enabled', False):
-            return docker_command
-
-        keys = set(self._session.config.get('agent.hide_docker_command_env_vars.extra_keys', []))
-        keys.update(
-            ENV_AGENT_GIT_PASS.vars,
-            ENV_AGENT_SECRET_KEY.vars,
-            ENV_AWS_SECRET_KEY.vars,
-            ENV_AZURE_ACCOUNT_KEY.vars,
-            ENV_AGENT_AUTH_TOKEN.vars,
-        )
-
-        parse_embedded_urls = bool(self._session.config.get(
-            'agent.hide_docker_command_env_vars.parse_embedded_urls', True
-        ))
-
-        skip_next = False
-        result = docker_command[:]
-        for i, item in enumerate(docker_command):
-            if skip_next:
-                skip_next = False
-                continue
-            try:
-                if item in ("-e", "--env"):
-                    key, sep, val = result[i + 1].partition("=")
-                    if not sep:
-                        continue
-                    if key in ENV_DOCKER_IMAGE.vars:
-                        # special case - this contains a complete docker command
-                        val = " ".join(self._sanitize_docker_command(re.split(r"\s", val)))
-                    elif key in keys:
-                        val = "********"
-                    elif parse_embedded_urls:
-                        val = self._sanitize_urls(val)[0]
-                    result[i + 1] = "{}={}".format(key, val)
-                    skip_next = True
-                elif parse_embedded_urls and not item.startswith("-"):
-                    item, changed = self._sanitize_urls(item)
-                    if changed:
-                        result[i] = item
-            except (KeyError, TypeError):
-                pass
-
-        return result
-
    @staticmethod
    def _valid_docker_container_name(name):
        # type: (str) -> bool
@@ -4128,6 +4224,15 @@ class Worker(ServiceCommandSection):
                    " found {})".format(role)
                )

+    @staticmethod
+    def _get_path(d, *path, default=None):
+        """Follow `path` (keys/indexes) into `d`; return `default` when any step cannot be resolved."""
+        try:
+            return functools.reduce(lambda a, b: a[b], path, d)
+        except (IndexError, KeyError, TypeError):
+            # TypeError: an intermediate value is None or a scalar, i.e. the path does not exist
+            return default
+

 if __name__ == "__main__":
    pass
--- a/clearml_agent/config.py
+++ b/clearml_agent/config.py
@@ -1,6 +1,6 @@
-from pyhocon import ConfigTree
-
 import six
+
+from clearml_agent.external.pyhocon import ConfigTree
 from clearml_agent.helper.base import Singleton


--- a/clearml_agent/definitions.py
+++ b/clearml_agent/definitions.py
@@ -5,9 +5,9 @@ from enum import IntEnum
 from os import getenv, environ
 from typing import Text, Optional, Union, Tuple, Any

+import six
 from pathlib2 import Path

-import six
 from clearml_agent.helper.base import normalize_path

 PROGRAM_NAME = "clearml-agent"
@@ -69,41 +69,65 @@ ENV_AWS_SECRET_KEY = EnvironmentConfig("AWS_SECRET_ACCESS_KEY")
 ENV_AZURE_ACCOUNT_KEY = EnvironmentConfig("AZURE_STORAGE_KEY")

 ENVIRONMENT_CONFIG = {
-    "api.api_server": EnvironmentConfig("CLEARML_API_HOST", "TRAINS_API_HOST", ),
-    "api.files_server": EnvironmentConfig("CLEARML_FILES_HOST", "TRAINS_FILES_HOST", ),
-    "api.web_server": EnvironmentConfig("CLEARML_WEB_HOST", "TRAINS_WEB_HOST", ),
+    "api.api_server": EnvironmentConfig(
+        "CLEARML_API_HOST",
+        "TRAINS_API_HOST",
+    ),
+    "api.files_server": EnvironmentConfig(
+        "CLEARML_FILES_HOST",
+        "TRAINS_FILES_HOST",
+    ),
+    "api.web_server": EnvironmentConfig(
+        "CLEARML_WEB_HOST",
+        "TRAINS_WEB_HOST",
+    ),
    "api.credentials.access_key": EnvironmentConfig(
-        "CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY",
+        "CLEARML_API_ACCESS_KEY",
+        "TRAINS_API_ACCESS_KEY",
    ),
    "api.credentials.secret_key": ENV_AGENT_SECRET_KEY,
-    "agent.worker_name": EnvironmentConfig("CLEARML_WORKER_NAME", "TRAINS_WORKER_NAME", ),
-    "agent.worker_id": EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID", ),
-    "agent.cuda_version": EnvironmentConfig(
-        "CLEARML_CUDA_VERSION", "TRAINS_CUDA_VERSION", "CUDA_VERSION"
+    "agent.worker_name": EnvironmentConfig(
+        "CLEARML_WORKER_NAME",
+        "TRAINS_WORKER_NAME",
    ),
-    "agent.cudnn_version": EnvironmentConfig(
-        "CLEARML_CUDNN_VERSION", "TRAINS_CUDNN_VERSION", "CUDNN_VERSION"
-    ),
-    "agent.cpu_only": EnvironmentConfig(
-        names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool
+    "agent.worker_id": EnvironmentConfig(
+        "CLEARML_WORKER_ID",
+        "TRAINS_WORKER_ID",
    ),
+    "agent.cuda_version": EnvironmentConfig("CLEARML_CUDA_VERSION", "TRAINS_CUDA_VERSION", "CUDA_VERSION"),
+    "agent.cudnn_version": EnvironmentConfig("CLEARML_CUDNN_VERSION", "TRAINS_CUDNN_VERSION", "CUDNN_VERSION"),
+    "agent.cpu_only": EnvironmentConfig(names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool),
+    "agent.crash_on_exception": EnvironmentConfig(
+        "CLEARML_AGENT_CRASH_ON_EXCEPTION",
+        "CLEAMRL_AGENT_CRASH_ON_EXCEPTION",  # misspelled name, kept as an alias for backward compatibility
+        type=bool,
+    ),
    "sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
    "sdk.aws.s3.secret": ENV_AWS_SECRET_KEY,
    "sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),
-    "sdk.azure.storage.containers.0": {'account_name': EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
-                                       'account_key': ENV_AZURE_ACCOUNT_KEY},
+    "sdk.azure.storage.containers.0": {
+        "account_name": EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
+        "account_key": ENV_AZURE_ACCOUNT_KEY,
+    },
    "sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"),
 }

 ENVIRONMENT_SDK_PARAMS = {
-    "task_id": ("CLEARML_TASK_ID", "TRAINS_TASK_ID", ),
-    "config_file": ("CLEARML_CONFIG_FILE", "TRAINS_CONFIG_FILE", ),
-    "log_level": ("CLEARML_LOG_LEVEL", "TRAINS_LOG_LEVEL", ),
-    "log_to_backend": ("CLEARML_LOG_TASK_TO_BACKEND", "TRAINS_LOG_TASK_TO_BACKEND", ),
+    "task_id": (
+        "CLEARML_TASK_ID",
+        "TRAINS_TASK_ID",
+    ),
+    "config_file": (
+        "CLEARML_CONFIG_FILE",
+        "TRAINS_CONFIG_FILE",
+    ),
+    "log_level": (
+        "CLEARML_LOG_LEVEL",
+        "TRAINS_LOG_LEVEL",
+    ),
+    "log_to_backend": (
+        "CLEARML_LOG_TASK_TO_BACKEND",
+        "TRAINS_LOG_TASK_TO_BACKEND",
+    ),
 }

-ENVIRONMENT_BACKWARD_COMPATIBLE = EnvironmentConfig(
-    names=("CLEARML_AGENT_ALG_ENV", "TRAINS_AGENT_ALG_ENV"), type=bool)
+ENVIRONMENT_BACKWARD_COMPATIBLE = EnvironmentConfig(names=("CLEARML_AGENT_ALG_ENV", "TRAINS_AGENT_ALG_ENV"), type=bool)

 VIRTUAL_ENVIRONMENT_PATH = {
    "python2": normalize_path(CONFIG_DIR, "py2venv"),
@@ -122,36 +146,61 @@ TOKEN_EXPIRATION_SECONDS = int(timedelta(days=2).total_seconds())

 METADATA_EXTENSION = ".json"

-DEFAULT_VENV_UPDATE_URL = (
-    "https://raw.githubusercontent.com/Yelp/venv-update/v3.2.4/venv_update.py"
-)
+DEFAULT_VENV_UPDATE_URL = "https://raw.githubusercontent.com/Yelp/venv-update/v3.2.4/venv_update.py"
 WORKING_REPOSITORY_DIR = "task_repository"
 WORKING_STANDALONE_DIR = "code"
 DEFAULT_VCS_CACHE = normalize_path(CONFIG_DIR, "vcs-cache")
-PIP_EXTRA_INDICES = [
-]
+PIP_EXTRA_INDICES = []
 DEFAULT_PIP_DOWNLOAD_CACHE = normalize_path(CONFIG_DIR, "pip-download-cache")
-ENV_DOCKER_IMAGE = EnvironmentConfig('CLEARML_DOCKER_IMAGE', 'TRAINS_DOCKER_IMAGE')
-ENV_WORKER_ID = EnvironmentConfig('CLEARML_WORKER_ID', 'TRAINS_WORKER_ID')
-ENV_WORKER_TAGS = EnvironmentConfig('CLEARML_WORKER_TAGS')
-ENV_AGENT_SKIP_PIP_VENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PIP_VENV_INSTALL')
-ENV_AGENT_SKIP_PYTHON_ENV_INSTALL = EnvironmentConfig('CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL', type=bool)
-ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', 'TRAINS_DOCKER_SKIP_GPUS_FLAG')
-ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
-ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
-ENV_AGENT_GIT_HOST = EnvironmentConfig('CLEARML_AGENT_GIT_HOST', 'TRAINS_AGENT_GIT_HOST')
-ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig('CLEARML_AGENT_DISABLE_SSH_MOUNT', type=bool)
-ENV_SSH_AUTH_SOCK = EnvironmentConfig('SSH_AUTH_SOCK')
-ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig('CLEARML_AGENT_EXEC_USER', 'TRAINS_AGENT_EXEC_USER')
-ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig('CLEARML_AGENT_EXTRA_PYTHON_PATH', 'TRAINS_AGENT_EXTRA_PYTHON_PATH')
-ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEARML_AGENT_DOCKER_HOST_MOUNT',
-                                          'TRAINS_AGENT_K8S_HOST_MOUNT', 'TRAINS_AGENT_DOCKER_HOST_MOUNT')
-ENV_VENV_CACHE_PATH = EnvironmentConfig('CLEARML_AGENT_VENV_CACHE_PATH')
-ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig('CLEARML_AGENT_EXTRA_DOCKER_ARGS', type=list)
-ENV_DEBUG_INFO = EnvironmentConfig('CLEARML_AGENT_DEBUG_INFO')
-ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig('CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD')
+ENV_DOCKER_IMAGE = EnvironmentConfig("CLEARML_DOCKER_IMAGE", "TRAINS_DOCKER_IMAGE")
+ENV_WORKER_ID = EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID")
+ENV_WORKER_TAGS = EnvironmentConfig("CLEARML_WORKER_TAGS")
+ENV_AGENT_SKIP_PIP_VENV_INSTALL = EnvironmentConfig("CLEARML_AGENT_SKIP_PIP_VENV_INSTALL")
+ENV_AGENT_SKIP_PYTHON_ENV_INSTALL = EnvironmentConfig("CLEARML_AGENT_SKIP_PYTHON_ENV_INSTALL", type=bool)
+ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig("CLEARML_DOCKER_SKIP_GPUS_FLAG", "TRAINS_DOCKER_SKIP_GPUS_FLAG")
+ENV_AGENT_GIT_USER = EnvironmentConfig("CLEARML_AGENT_GIT_USER", "TRAINS_AGENT_GIT_USER")
+ENV_AGENT_GIT_PASS = EnvironmentConfig("CLEARML_AGENT_GIT_PASS", "TRAINS_AGENT_GIT_PASS")
+ENV_AGENT_GIT_HOST = EnvironmentConfig("CLEARML_AGENT_GIT_HOST", "TRAINS_AGENT_GIT_HOST")
+ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig("CLEARML_AGENT_DISABLE_SSH_MOUNT", type=bool)
+ENV_SSH_AUTH_SOCK = EnvironmentConfig("SSH_AUTH_SOCK")
+ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig("CLEARML_AGENT_EXEC_USER", "TRAINS_AGENT_EXEC_USER")
+ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig("CLEARML_AGENT_EXTRA_PYTHON_PATH", "TRAINS_AGENT_EXTRA_PYTHON_PATH")
+ENV_DOCKER_HOST_MOUNT = EnvironmentConfig(
+    "CLEARML_AGENT_K8S_HOST_MOUNT",
+    "CLEARML_AGENT_DOCKER_HOST_MOUNT",
+    "TRAINS_AGENT_K8S_HOST_MOUNT",
+    "TRAINS_AGENT_DOCKER_HOST_MOUNT",
+)
+ENV_VENV_CACHE_PATH = EnvironmentConfig("CLEARML_AGENT_VENV_CACHE_PATH")
+ENV_EXTRA_DOCKER_ARGS = EnvironmentConfig("CLEARML_AGENT_EXTRA_DOCKER_ARGS", type=list)
+ENV_DEBUG_INFO = EnvironmentConfig("CLEARML_AGENT_DEBUG_INFO")
+ENV_CHILD_AGENTS_COUNT_CMD = EnvironmentConfig("CLEARML_AGENT_CHILD_AGENTS_COUNT_CMD")
+ENV_DOCKER_ARGS_FILTERS = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_FILTERS")
+ENV_DOCKER_ARGS_HIDE_ENV = EnvironmentConfig("CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV")

-ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig('CLEARML_AGENT_CUSTOM_BUILD_SCRIPT')
+ENV_SERVICES_DOCKER_RESTART = EnvironmentConfig("CLEARML_AGENT_SERVICES_DOCKER_RESTART")
+"""
+    Specify a restart value for a services agent task containers.
+    Note that when a restart value is provided, task containers will not be run with the '--rm' flag and will
+     not be cleaned up automatically when completed (this will need to be done externally using the
+     'docker container prune' command to free up resources).
+    Value format for this env var is "<restart-value>;<task-selector>", where:
+    - <restart-value> can be any valid restart value for docker-run (see https://docs.docker.com/engine/reference/commandline/run/#restart)
+    - <task-selector> is optional, allowing to restrict this behaviour to specific tasks. The format is:
+        "<path-to-task-field>=<value>" where:
+        * <path-to-task-field> is a dot-separated path to a task field (e.g. "container.image")
+        * <value> is optional. If not provided, the restart policy will be applied for the task container if the
+            path provided exists. If provided, the restart policy will be applied if the value matches the value
+            obtained from the task (value parsing and comparison is based on the type of value obtained from the task) 
+    For example:
+        CLEARML_AGENT_SERVICES_DOCKER_RESTART=unless-stopped
+        CLEARML_AGENT_SERVICES_DOCKER_RESTART=unless-stopped;container.image=some-image
+"""
+
+ENV_FORCE_SYSTEM_SITE_PACKAGES = EnvironmentConfig("CLEARML_AGENT_FORCE_SYSTEM_SITE_PACKAGES", type=bool)
+""" Force system_site_packages: true when running tasks in containers (i.e. docker mode or k8s glue) """
+
+ENV_CUSTOM_BUILD_SCRIPT = EnvironmentConfig("CLEARML_AGENT_CUSTOM_BUILD_SCRIPT")
 """
    Specifies a custom environment setup script to be executed instead of installing a virtual environment.
    If provided, this script is executed following Git cloning. Script command may include environment variable and
--- a/clearml_agent/external/pyhocon/init.py
+++ b/clearml_agent/external/pyhocon/init.py
@@ -0,0 +1,5 @@
+from .config_parser import ConfigParser, ConfigFactory, ConfigMissingException
+from .config_tree import ConfigTree
+from .converter import HOCONConverter
+
+__all__ = ["ConfigParser", "ConfigFactory", "ConfigMissingException", "ConfigTree", "HOCONConverter"]
--- a/clearml_agent/external/pyhocon/config_parser.py
+++ b/clearml_agent/external/pyhocon/config_parser.py
@@ -0,0 +1,762 @@
+import itertools
+import re
+import os
+import socket
+import contextlib
+import codecs
+from datetime import timedelta
+
+from pyparsing import Forward, Keyword, QuotedString, Word, Literal, Suppress, Regex, Optional, SkipTo, ZeroOrMore, \
+    Group, lineno, col, TokenConverter, replaceWith, alphanums, alphas8bit, ParseSyntaxException, StringEnd
+from pyparsing import ParserElement
+from .config_tree import ConfigTree, ConfigSubstitution, ConfigList, ConfigValues, ConfigUnquotedString, \
+    ConfigInclude, NoneValue, ConfigQuotedString
+from .exceptions import ConfigSubstitutionException, ConfigMissingException, ConfigException
+import logging
+import copy
+
+use_urllib2 = False
+try:
+    # For Python 3.0 and later
+    from urllib.request import urlopen
+    from urllib.error import HTTPError, URLError
+except ImportError:  # pragma: no cover
+    # Fall back to Python 2's urllib2
+    from urllib2 import urlopen, HTTPError, URLError
+
+    use_urllib2 = True
+try:
+    basestring
+except NameError:  # pragma: no cover
+    basestring = str
+    unicode = str
+
+logger = logging.getLogger(__name__)
+
+#
+# Substitution Defaults
+#
+
+
+class DEFAULT_SUBSTITUTION(object):
+    """Sentinel used as the default ``unresolved_value``: unresolved substitutions are not accepted by
+    ``ConfigParser.parse`` and no leftover substitution is rewritten afterwards."""
+
+
+class MANDATORY_SUBSTITUTION(object):
+    """Sentinel ``unresolved_value``: when resolving, ``ConfigParser.parse`` raises ConfigSubstitutionException
+    if any substitution is left unresolved."""
+
+
+class NO_SUBSTITUTION(object):
+    """Sentinel ``unresolved_value``: leftover substitutions are kept as they are (no replacement pass is run).
+    Used when loading included files/URLs, which are parsed with ``resolve=False``."""
+
+
+class STR_SUBSTITUTION(object):
+    """Sentinel ``unresolved_value``: every leftover substitution is replaced by its raw textual
+    expression (the result of ``substitution.raw_str()``)."""
+
+
+def period(period_value, period_unit):
+    """Build a duration object for ``period_value`` expressed in ``period_unit``.
+
+    :param period_value: numeric amount
+    :param period_unit: keyword name of the unit (e.g. 'seconds', 'days'); 'nanoseconds' is converted
+        to whole microseconds (value divided by 1000 and truncated)
+    :return: ``datetime.timedelta`` for milliseconds; otherwise ``dateutil.relativedelta.relativedelta``
+        when dateutil can be imported, falling back to ``datetime.timedelta``
+    """
+    try:
+        from dateutil.relativedelta import relativedelta as delta_cls
+    except Exception:
+        from datetime import timedelta as delta_cls
+
+    if period_unit == 'nanoseconds':
+        # neither implementation is given nanoseconds; scale down to microseconds
+        period_value = int(period_value / 1000)
+        period_unit = 'microseconds'
+
+    kwargs = {period_unit: period_value}
+
+    # milliseconds always go through timedelta, regardless of which implementation was imported
+    if period_unit == 'milliseconds':
+        return timedelta(**kwargs)
+    return delta_cls(**kwargs)
+
+
+class ConfigFactory(object):
+    """Entry points for building a configuration from a file, a URL, a string or a plain dictionary."""
+
+    @classmethod
+    def parse_file(cls, filename, encoding='utf-8', required=True, resolve=True, unresolved_value=DEFAULT_SUBSTITUTION):
+        """Parse file
+
+        :param filename: filename
+        :type filename: basestring
+        :param encoding: file encoding
+        :type encoding: basestring
+        :param required: If true, raises an exception if can't load file
+        :type required: boolean
+        :param resolve: if true, resolve substitutions
+        :type resolve: boolean
+        :param unresolved_value: value assigned to unresolved substitutions.
+            If overridden with a default value, it will replace all unresolved values with the default value.
+            If it is set to pyhocon.STR_SUBSTITUTION then it will replace the value by its
+            substitution expression (e.g., ${x})
+        :type unresolved_value: class
+        :return: Config object, or an empty list if the file cannot be read and ``required`` is false
+        :type return: Config or list
+        """
+        try:
+            with codecs.open(filename, 'r', encoding=encoding) as fd:
+                content = fd.read()
+                # the file's directory becomes the base directory for relative includes
+                return cls.parse_string(content, os.path.dirname(filename), resolve, unresolved_value)
+        except IOError as e:
+            if required:
+                raise e
+            # Logger.warn is a deprecated alias of Logger.warning
+            logger.warning('Cannot include file %s. File does not exist or cannot be read.', filename)
+            return []
+
+    @classmethod
+    def parse_URL(cls, url, timeout=None, resolve=True, required=False, unresolved_value=DEFAULT_SUBSTITUTION):
+        """Parse URL
+
+        :param url: url to parse
+        :type url: basestring
+        :param timeout: timeout passed to ``urlopen``; None uses the socket module's global default timeout
+        :param resolve: if true, resolve substitutions
+        :type resolve: boolean
+        :param required: If true, re-raises the HTTPError/URLError when the resource cannot be fetched
+        :type required: boolean
+        :param unresolved_value: value assigned to unresolved substitutions.
+            If overridden with a default value, it will replace all unresolved values with the default value.
+            If it is set to pyhocon.STR_SUBSTITUTION then it will replace the value by
+            its substitution expression (e.g., ${x})
+        :type unresolved_value: class
+        :return: Config object or []
+        :type return: Config or list
+        """
+        socket_timeout = socket._GLOBAL_DEFAULT_TIMEOUT if timeout is None else timeout
+
+        try:
+            with contextlib.closing(urlopen(url, timeout=socket_timeout)) as fd:
+                # the response is decoded as UTF-8 unless the Python 2 urllib2 fallback is in use
+                content = fd.read() if use_urllib2 else fd.read().decode('utf-8')
+                return cls.parse_string(content, os.path.dirname(url), resolve, unresolved_value)
+        except (HTTPError, URLError) as e:
+            # Logger.warn is a deprecated alias of Logger.warning
+            logger.warning('Cannot include url %s. Resource is inaccessible.', url)
+            if required:
+                raise e
+            else:
+                return []
+
+    @classmethod
+    def parse_string(cls, content, basedir=None, resolve=True, unresolved_value=DEFAULT_SUBSTITUTION):
+        """Parse string
+
+        :param content: content to parse
+        :type content: basestring
+        :param basedir: directory that relative ``include`` file paths are joined to (None: use paths as given)
+        :type basedir: basestring
+        :param resolve: if true, resolve substitutions
+        :type resolve: boolean
+        :param unresolved_value: value assigned to unresolved substitutions.
+            If overridden with a default value, it will replace all unresolved values with the default value.
+            If it is set to pyhocon.STR_SUBSTITUTION then it will replace the value by
+            its substitution expression (e.g., ${x})
+        :type unresolved_value: class
+        :return: Config object
+        :type return: Config
+        """
+        return ConfigParser().parse(content, basedir, resolve, unresolved_value)
+
+    @classmethod
+    def from_dict(cls, dictionary, root=False):
+        """Convert dictionary (and ordered dictionary) into a ConfigTree
+        :param dictionary: dictionary to convert
+        :type dictionary: dict
+        :param root: value passed as ``root`` to every ConfigTree that is created (nested ones included)
+        :type root: boolean
+        :return: Config object
+        :type return: Config
+        """
+
+        def create_tree(value):
+            # dicts become ConfigTree instances, lists are converted element-wise, anything else is kept as is
+            if isinstance(value, dict):
+                res = ConfigTree(root=root)
+                for key, child_value in value.items():
+                    res.put(key, create_tree(child_value))
+                return res
+            if isinstance(value, list):
+                return [create_tree(v) for v in value]
+            else:
+                return value
+
+        return create_tree(dictionary)
+
+
+class ConfigParser(object):
+    """
+    Parse HOCON files: https://github.com/typesafehub/config/blob/master/HOCON.md
+    """
+
+    # Escape sequences (a backslash plus one character) and the text each one is replaced with when
+    # string values are normalized in parse(); sequences not listed here are left untouched.
+    REPLACEMENTS = {
+        '\\\\': '\\',
+        '\\\n': '\n',
+        '\\n': '\n',
+        '\\r': '\r',
+        '\\t': '\t',
+        '\\=': '=',
+        '\\#': '#',
+        '\\!': '!',
+        '\\"': '"',
+    }
+
+    # Canonical period unit -> unit spellings accepted in a period value (e.g. "10 ms", "2 days").
+    # These units are always available.
+    period_type_map = {
+        'nanoseconds': ['ns', 'nano', 'nanos', 'nanosecond', 'nanoseconds'],
+
+        'microseconds': ['us', 'micro', 'micros', 'microsecond', 'microseconds'],
+        'milliseconds': ['ms', 'milli', 'millis', 'millisecond', 'milliseconds'],
+        'seconds': ['s', 'second', 'seconds'],
+        'minutes': ['m', 'minute', 'minutes'],
+        'hours': ['h', 'hour', 'hours'],
+        'weeks': ['w', 'week', 'weeks'],
+        'days': ['d', 'day', 'days'],
+
+    }
+
+    # Units that are only enabled when the dateutil package can be imported
+    # (see get_supported_period_type_map).
+    optional_period_type_map = {
+        'months': ['mo', 'month', 'months'],  # 'm' from hocon spec removed. conflicts with minutes syntax.
+        'years': ['y', 'year', 'years']
+    }
+
+    # Lazily built merge of the maps above; None until get_supported_period_type_map() is first called.
+    supported_period_map = None
+
+    @classmethod
+    def get_supported_period_type_map(cls):
+        """Return the mapping of period unit -> accepted spellings, building and caching it on first use.
+
+        The always-available units are included unconditionally; the optional units (months, years)
+        are added only if ``dateutil`` can be imported.
+        """
+        if cls.supported_period_map is not None:
+            return cls.supported_period_map
+
+        cls.supported_period_map = dict(cls.period_type_map)
+        try:
+            from dateutil import relativedelta
+        except Exception:
+            # dateutil is not usable: keep only the basic units
+            return cls.supported_period_map
+
+        if relativedelta is not None:
+            cls.supported_period_map.update(cls.optional_period_type_map)
+        return cls.supported_period_map
+
+    @classmethod
+    def parse(cls, content, basedir=None, resolve=True, unresolved_value=DEFAULT_SUBSTITUTION):
+        """parse a HOCON content
+
+        :param content: HOCON content to parse
+        :type content: basestring
+        :param basedir: directory that relative ``include`` file paths are joined to (None: use paths as given)
+        :type basedir: basestring
+        :param resolve: if true, resolve substitutions
+        :type resolve: boolean
+        :param unresolved_value: value assigned to unresolved substitutions.
+            If overridden with a default value, it will replace all unresolved values with the default value.
+            If it is set to pyhocon.STR_SUBSTITUTION then it will replace the value by
+            its substitution expression (e.g., ${x})
+        :type unresolved_value: class
+        :return: a ConfigTree or a list
+        :raises ConfigSubstitutionException: if ``resolve`` is true, ``unresolved_value`` is
+            MANDATORY_SUBSTITUTION and some substitution could not be resolved
+        """
+
+        unescape_pattern = re.compile(r'\\.')
+
+        def replace_escape_sequence(match):
+            # unknown escape sequences are kept verbatim
+            value = match.group(0)
+            return cls.REPLACEMENTS.get(value, value)
+
+        def norm_string(value):
+            return unescape_pattern.sub(replace_escape_sequence, value)
+
+        def unescape_string(tokens):
+            return ConfigUnquotedString(norm_string(tokens[0]))
+
+        def parse_multi_string(tokens):
+            # remove the first and last 3 "
+            return tokens[0][3: -3]
+
+        def convert_number(tokens):
+            n = tokens[0]
+            try:
+                return int(n, 10)
+            except ValueError:
+                return float(n)
+
+        def safe_convert_number(tokens):
+            # same as convert_number, but falls back to the original text instead of raising
+            n = tokens[0]
+            try:
+                return int(n, 10)
+            except ValueError:
+                try:
+                    return float(n)
+                except ValueError:
+                    return n
+
+        def convert_period(tokens):
+
+            period_value = int(tokens.value)
+            period_identifier = tokens.unit
+
+            # map the spelling found in the text (e.g. "ms") to its canonical unit name
+            period_unit = next((single_unit for single_unit, values
+                                in cls.get_supported_period_type_map().items()
+                                if period_identifier in values))
+
+            return period(period_value, period_unit)
+
+        # ${path} or ${?path} for optional substitution
+        SUBSTITUTION_PATTERN = r"\$\{(?P<optional>\?)?(?P<variable>[^}]+)\}(?P<ws>[ \t]*)"
+
+        def create_substitution(instring, loc, token):
+            # remove the ${ and }
+            match = re.match(SUBSTITUTION_PATTERN, token[0])
+            variable = match.group('variable')
+            ws = match.group('ws')
+            optional = match.group('optional') == '?'
+            substitution = ConfigSubstitution(variable, optional, ws, instring, loc)
+            return substitution
+
+        # double-quoted string (escapes allowed) followed by optional trailing spaces/tabs
+        STRING_PATTERN = '"(?P<value>(?:[^"\\\\]|\\\\.)*)"(?P<ws>[ \t]*)'
+
+        def create_quoted_string(instring, loc, token):
+            # strip the surrounding quotes, unescape the content and remember the trailing whitespace
+            match = re.match(STRING_PATTERN, token[0])
+            value = norm_string(match.group('value'))
+            ws = match.group('ws')
+            return ConfigQuotedString(value, ws, instring, loc)
+
+        def include_config(instring, loc, token):
+            url = None
+            file = None
+            required = False
+
+            if token[0] == 'required':
+                required = True
+                final_tokens = token[1:]
+            else:
+                final_tokens = token
+
+            if len(final_tokens) == 1:  # include "test"
+                value = final_tokens[0].value if isinstance(final_tokens[0], ConfigQuotedString) else final_tokens[0]
+                if value.startswith("http://") or value.startswith("https://") or value.startswith("file://"):
+                    url = value
+                else:
+                    file = value
+            elif len(final_tokens) == 2:  # include url("test") or file("test")
+                # inspect final_tokens (not token): with required(...) token[1] is the url/file keyword,
+                # not the quoted string, and the ConfigQuotedString object itself would be used as the path
+                value = final_tokens[1].value if isinstance(final_tokens[1], ConfigQuotedString) else final_tokens[1]
+                if final_tokens[0] == 'url':
+                    url = value
+                else:
+                    file = value
+
+            if url is not None:
+                logger.debug('Loading config from url %s', url)
+                obj = ConfigFactory.parse_URL(
+                    url,
+                    resolve=False,
+                    required=required,
+                    unresolved_value=NO_SUBSTITUTION
+                )
+            elif file is not None:
+                path = file if basedir is None else os.path.join(basedir, file)
+                logger.debug('Loading config from file %s', path)
+                obj = ConfigFactory.parse_file(
+                    path,
+                    resolve=False,
+                    required=required,
+                    unresolved_value=NO_SUBSTITUTION
+                )
+            else:
+                raise ConfigException('No file or URL specified at: {loc}: {instring}', loc=loc, instring=instring)
+
+            # a failed optional include yields an empty list; otherwise pass on the parsed items
+            return ConfigInclude(obj if isinstance(obj, list) else obj.items())
+
+        @contextlib.contextmanager
+        def set_default_white_spaces():
+            # The default whitespace characters are a process-wide pyparsing setting: always restore
+            # them, even when building the grammar or parsing the content raises.
+            default = ParserElement.DEFAULT_WHITE_CHARS
+            ParserElement.setDefaultWhitespaceChars(' \t')
+            try:
+                yield
+            finally:
+                ParserElement.setDefaultWhitespaceChars(default)
+
+        with set_default_white_spaces():
+            assign_expr = Forward()
+            true_expr = Keyword("true", caseless=True).setParseAction(replaceWith(True))
+            false_expr = Keyword("false", caseless=True).setParseAction(replaceWith(False))
+            null_expr = Keyword("null", caseless=True).setParseAction(replaceWith(NoneValue()))
+            # key = QuotedString('"', escChar='\\', unquoteResults=False) | Word(alphanums + alphas8bit + '._- /')
+            regexp_numbers = r'[+-]?(\d*\.\d+|\d+(\.\d+)?)([eE][+\-]?\d+)?(?=$|[ \t]*([\$\}\],#\n\r]|//))'
+            key = QuotedString('"', escChar='\\', unquoteResults=False) | \
+                Regex(regexp_numbers, re.DOTALL).setParseAction(safe_convert_number) | \
+                Word(alphanums + alphas8bit + '._- /')
+
+            eol = Word('\n\r').suppress()
+            eol_comma = Word('\n\r,').suppress()
+            comment = (Literal('#') | Literal('//')) - SkipTo(eol | StringEnd())
+            comment_eol = Suppress(Optional(eol_comma) + comment)
+            comment_no_comma_eol = (comment | eol).suppress()
+            number_expr = Regex(regexp_numbers, re.DOTALL).setParseAction(convert_number)
+
+            period_types = itertools.chain.from_iterable(cls.get_supported_period_type_map().values())
+            period_expr = Regex(r'(?P<value>\d+)\s*(?P<unit>' + '|'.join(period_types) + ')$'
+                                ).setParseAction(convert_period)
+
+            # multi line string using """
+            # Using fix described in http://pyparsing.wikispaces.com/share/view/3778969
+            multiline_string = Regex('""".*?"*"""', re.DOTALL | re.UNICODE).setParseAction(parse_multi_string)
+            # single quoted line string
+            quoted_string = Regex(r'"(?:[^"\\\n]|\\.)*"[ \t]*', re.UNICODE).setParseAction(create_quoted_string)
+            # unquoted string that takes the rest of the line until an optional comment
+            # we support .properties multiline support which is like this:
+            # line1  \
+            # line2 \
+            # so a backslash precedes the \n
+            unquoted_string = Regex(r'(?:[^^`+?!@*&"\[\{\s\]\}#,=\$\\]|\\.)+[ \t]*',
+                                    re.UNICODE).setParseAction(unescape_string)
+            substitution_expr = Regex(r'[ \t]*\$\{[^\}]+\}[ \t]*').setParseAction(create_substitution)
+            string_expr = multiline_string | quoted_string | unquoted_string
+
+            value_expr = period_expr | number_expr | true_expr | false_expr | null_expr | string_expr
+
+            include_content = (quoted_string | ((Keyword('url') | Keyword(
+                'file')) - Literal('(').suppress() - quoted_string - Literal(')').suppress()))
+            include_expr = (
+                Keyword("include", caseless=True).suppress() + (
+                    include_content | (
+                        Keyword("required") - Literal('(').suppress() - include_content - Literal(')').suppress()
+                    )
+                )
+            ).setParseAction(include_config)
+
+            root_dict_expr = Forward()
+            dict_expr = Forward()
+            list_expr = Forward()
+            multi_value_expr = ZeroOrMore(comment_eol | include_expr | substitution_expr |
+                                          dict_expr | list_expr | value_expr | (Literal('\\') - eol).suppress())
+            # for a dictionary : or = is optional
+            # last zeroOrMore is because we can have t = {a:4} {b: 6} {c: 7} which is dictionary concatenation
+            inside_dict_expr = ConfigTreeParser(ZeroOrMore(comment_eol | include_expr | assign_expr | eol_comma))
+            inside_root_dict_expr = ConfigTreeParser(ZeroOrMore(
+                comment_eol | include_expr | assign_expr | eol_comma), root=True)
+            dict_expr << Suppress('{') - inside_dict_expr - Suppress('}')
+            root_dict_expr << Suppress('{') - inside_root_dict_expr - Suppress('}')
+            list_entry = ConcatenatedValueParser(multi_value_expr)
+            list_expr << Suppress('[') - ListParser(list_entry - ZeroOrMore(eol_comma - list_entry)) - Suppress(']')
+
+            # special case when we have a value assignment where the string can potentially be the remainder of the line
+            assign_expr << Group(key - ZeroOrMore(comment_no_comma_eol) -
+                                 (dict_expr | (Literal('=') | Literal(':') | Literal('+=')) -
+                                  ZeroOrMore(comment_no_comma_eol) - ConcatenatedValueParser(multi_value_expr)))
+
+            # the file can be { ... } where {} can be omitted or []
+            config_expr = ZeroOrMore(comment_eol | eol) + (list_expr | root_dict_expr |
+                                                           inside_root_dict_expr) + ZeroOrMore(comment_eol | eol_comma)
+            config = config_expr.parseString(content, parseAll=True)[0]
+
+            if resolve:
+                # unresolved substitutions are tolerated only for an explicit, non-mandatory unresolved_value
+                allow_unresolved = resolve and unresolved_value is not DEFAULT_SUBSTITUTION and \
+                                   unresolved_value is not MANDATORY_SUBSTITUTION
+                has_unresolved = cls.resolve_substitutions(config, allow_unresolved)
+                if has_unresolved and unresolved_value is MANDATORY_SUBSTITUTION:
+                    raise ConfigSubstitutionException(
+                        'resolve cannot be set to True and unresolved_value to MANDATORY_SUBSTITUTION')
+
+            # replace whatever is still unresolved by the requested placeholder value
+            if unresolved_value is not NO_SUBSTITUTION and unresolved_value is not DEFAULT_SUBSTITUTION:
+                cls.unresolve_substitutions_to_value(config, unresolved_value)
+        return config
+
+    @classmethod
+    def _resolve_variable(cls, config, substitution):
+        """Look up the value a substitution points to.
+
+        The variable is first looked up in ``config``; if it is missing there, the environment
+        variable of the same name is used.
+
+        :param config: configuration to look the variable up in
+        :param substitution: substitution to resolve (its ``variable``, ``optional``, ``loc`` and
+            ``instring`` attributes are read)
+        :return: (is_resolved, resolved_variable); ``(False, None)`` for an optional substitution
+            that is found neither in the configuration nor in the environment
+        :raises ConfigSubstitutionException: if a non-optional variable cannot be found
+        """
+        name = substitution.variable
+        try:
+            return True, config.get(name)
+        except ConfigMissingException:
+            # default to environment variable
+            env_value = os.environ.get(name)
+
+            if env_value is None:
+                if not substitution.optional:
+                    raise ConfigSubstitutionException(
+                        "Cannot resolve variable ${{{variable}}} (line: {line}, col: {col})".format(
+                            variable=name,
+                            line=lineno(substitution.loc, substitution.instring),
+                            col=col(substitution.loc, substitution.instring)))
+                return False, None
+
+            if isinstance(env_value, (ConfigList, ConfigTree)):
+                raise ConfigSubstitutionException(
+                    "Cannot substitute variable ${{{variable}}} because it does not point to a "
+                    "string, int, float, boolean or null {type} (line:{line}, col: {col})".format(
+                        variable=name,
+                        type=env_value.__class__.__name__,
+                        line=lineno(substitution.loc, substitution.instring),
+                        col=col(substitution.loc, substitution.instring)))
+            return True, env_value
+
+    @classmethod
+    def _fixup_self_references(cls, config, accept_unresolved=False):
+        """Substitute references a key makes to itself (e.g. ``a = ${a} x`` or ``a = ${?a}``).
+
+        Does nothing unless ``config`` is a root ConfigTree. For each key, ``config.history[key]`` is
+        walked in order and a self-reference found in an entry is substituted with the entry that
+        precedes it. Keys with a single history entry have nothing to refer back to: their self-references
+        are substituted with the environment variable named like the key, or with None when optional.
+
+        :param config: configuration to fix up (modified through ``_do_substitute``)
+        :param accept_unresolved: if False, a self-reference whose preceding value is still a
+            ConfigValues raises instead of being substituted
+        :raises ConfigSubstitutionException: see ``accept_unresolved``
+        """
+        if isinstance(config, ConfigTree) and config.root:
+            for key in config:  # Traverse history of element
+                history = config.history[key]
+                previous_item = history[0]
+                for current_item in history[1:]:
+                    for substitution in cls._find_substitutions(current_item):
+                        prop_path = ConfigTree.parse_key(substitution.variable)
+                        if len(prop_path) > 1 and config.get(substitution.variable, None) is not None:
+                            continue  # If value is present in latest version, don't do anything
+                        # only substitutions whose first path element is the key itself are self-references
+                        if prop_path[0] == key:
+                            if isinstance(previous_item, ConfigValues) and not accept_unresolved:
+                                # We hit a dead end, we cannot evaluate
+                                raise ConfigSubstitutionException(
+                                    "Property {variable} cannot be substituted. Check for cycles.".format(
+                                        variable=substitution.variable
+                                    )
+                                )
+                            else:
+                                # ${key} takes the whole previous value, ${key.sub.path} a value nested inside it
+                                value = previous_item if len(
+                                    prop_path) == 1 else previous_item.get(".".join(prop_path[1:]))
+                                _, _, current_item = cls._do_substitute(substitution, value)
+                    previous_item = current_item
+
+                if len(history) == 1:
+                    # single assignment: there is no earlier value the key could refer to
+                    for substitution in cls._find_substitutions(previous_item):
+                        prop_path = ConfigTree.parse_key(substitution.variable)
+                        if len(prop_path) > 1 and config.get(substitution.variable, None) is not None:
+                            continue  # If value is present in latest version, don't do anything
+                        # NOTE(review): an optional self-reference is substituted with None here and then
+                        # handled again by the block below - confirm the double substitution is intended
+                        if prop_path[0] == key and substitution.optional:
+                            cls._do_substitute(substitution, None)
+                        if prop_path[0] == key:
+                            # fall back to the environment variable named like the key
+                            value = os.environ.get(key)
+                            if value is not None:
+                                cls._do_substitute(substitution, value)
+                                continue
+                            if substitution.optional:  # special case, when self optional referencing without existing
+                                cls._do_substitute(substitution, None)
+
+    @classmethod
+    def _find_substitutions(cls, item):
+        """Traverse ``item`` and collect all the substitutions it contains.
+
+        :param item: a ConfigValues, a ConfigTree, a list, or any other value
+        :return: for a ConfigValues the result of its ``get_substitutions()``; for a ConfigTree or a
+            list the substitutions found recursively in its values/elements; an empty list otherwise
+        :type return: list
+        """
+        if isinstance(item, ConfigValues):
+            return item.get_substitutions()
+
+        if isinstance(item, ConfigTree):
+            children = item.values()
+        elif isinstance(item, list):
+            children = item
+        else:
+            # plain values cannot hold substitutions
+            children = []
+
+        found = []
+        for child in children:
+            found += cls._find_substitutions(child)
+        return found
+
+    @classmethod
+    def _do_substitute(cls, substitution, resolved_value, is_optional_resolved=True):
+        """Replace ``substitution`` inside its parent ConfigValues by ``resolved_value`` and update the tree.
+
+        :param substitution: substitution token to replace
+        :param resolved_value: value to put in place of the token
+        :param is_optional_resolved: when False and the parent transforms to None, the parent's
+            ``overriden_value`` is used as the result instead
+        :return: (unresolved, new_substitutions, result) - whether substitutions remain, the substitutions
+            found in the new value, and the value now stored under (or removed from) the parent key
+        """
+        unresolved = False
+        new_substitutions = []
+        if isinstance(resolved_value, ConfigValues):
+            resolved_value = resolved_value.transform()
+        if isinstance(resolved_value, ConfigValues):
+            # still a ConfigValues after transform(): report it as unresolved and leave the tree untouched
+            unresolved = True
+            result = resolved_value
+        else:
+            # replace token by substitution
+            config_values = substitution.parent
+            # if it is a string, then add the extra ws that was present in the original string after the substitution
+            # (None, dict/list values and a substitution that is the last token are stored unchanged)
+            formatted_resolved_value = resolved_value \
+                if resolved_value is None \
+                or isinstance(resolved_value, (dict, list)) \
+                or substitution.index == len(config_values.tokens) - 1 \
+                else (str(resolved_value) + substitution.ws)
+            # use a deepcopy of resolved_value to avoid mutation
+            config_values.put(substitution.index, copy.deepcopy(formatted_resolved_value))
+            transformation = config_values.transform()
+            result = config_values.overriden_value \
+                if transformation is None and not is_optional_resolved \
+                else transformation
+
+            # a None result removes the key from the parent; otherwise the result is stored under the key
+            # NOTE(review): a None result for a key that is absent from the parent falls into the else
+            # branch and is stored as None - confirm this is intended
+            if result is None and config_values.key in config_values.parent:
+                del config_values.parent[config_values.key]
+            else:
+                config_values.parent[config_values.key] = result
+                # the stored value may itself contain substitutions that still need resolving
+                s = cls._find_substitutions(result)
+                if s:
+                    new_substitutions = s
+                    unresolved = True
+
+        return (unresolved, new_substitutions, result)
+
+    @classmethod
+    def _final_fixup(cls, item):
+        """Recursively replace every ConfigValues found under ``item`` by its transformed value.
+
+        :param item: ConfigValues, list, ConfigTree or any other value
+        :return: the transformed value for a ConfigValues, a new list for a list,
+            otherwise ``item`` itself (a ConfigTree is updated in place)
+        """
+        if isinstance(item, ConfigValues):
+            return item.transform()
+        if isinstance(item, list):
+            fixed = []
+            for element in item:
+                fixed.append(cls._final_fixup(element))
+            return fixed
+        if isinstance(item, ConfigTree):
+            # snapshot the pairs first: the tree is assigned to while it is being walked
+            for name, element in list(item.items()):
+                item[name] = cls._final_fixup(element)
+        return item
+
+    @classmethod
+    def unresolve_substitutions_to_value(cls, config, unresolved_value=STR_SUBSTITUTION):
+        """Replace every substitution found in ``config`` by a fixed stand-in value, in place.
+
+        :param config: configuration to process
+        :param unresolved_value: ``STR_SUBSTITUTION`` puts each substitution's ``raw_str()`` back,
+            ``None`` puts a ``NoneValue`` instance, anything else is used as is
+        """
+        keep_raw_text = unresolved_value is STR_SUBSTITUTION
+        for pending in cls._find_substitutions(config):
+            if keep_raw_text:
+                replacement = pending.raw_str()
+            elif unresolved_value is None:
+                replacement = NoneValue()
+            else:
+                replacement = unresolved_value
+            cls._do_substitute(pending, replacement, False)
+        cls._final_fixup(config)
+
+    @classmethod
+    def resolve_substitutions(cls, config, accept_unresolved=False):
+        """Resolve the substitutions found in ``config``, modifying it in place.
+
+        :param config: configuration to resolve
+        :param accept_unresolved: when False, a substitution left unresolved raises instead of being reported
+        :type accept_unresolved: bool
+        :return: True when the last substitution processed was left unresolved, False otherwise
+        :type return: bool
+        :raises ConfigSubstitutionException: unresolved substitutions remain and ``accept_unresolved`` is False
+        """
+        has_unresolved = False
+        cls._fixup_self_references(config, accept_unresolved)
+        substitutions = cls._find_substitutions(config)
+        if len(substitutions) > 0:
+            unresolved = True
+            any_unresolved = True
+            # pending substitutions as they were at the start of the previous pass
+            _substitutions = []
+            # resolved ConfigValues -> list made of it followed by its chain of overriden_value ConfigValues
+            cache = {}
+            # keep making passes until nothing is pending or a pass leaves the pending set unchanged
+            while any_unresolved and len(substitutions) > 0 and set(substitutions) != set(_substitutions):
+                unresolved = False
+                # reset to True on every pass, so the any_unresolved test above never ends the loop by itself
+                any_unresolved = True
+                _substitutions = substitutions[:]
+
+                for substitution in _substitutions:
+                    is_optional_resolved, resolved_value = cls._resolve_variable(config, substitution)
+
+                    # if the substitution is optional
+                    if not is_optional_resolved and substitution.optional:
+                        resolved_value = None
+                    if isinstance(resolved_value, ConfigValues):
+                        parents = cache.get(resolved_value)
+                        if parents is None:
+                            # follow overriden_value links for as long as they are ConfigValues
+                            parents = []
+                            link = resolved_value
+                            while isinstance(link, ConfigValues):
+                                parents.append(link)
+                                link = link.overriden_value
+                            cache[resolved_value] = parents
+
+                    # `parents` is only read when resolved_value is a ConfigValues, i.e. after it was set above
+                    if isinstance(resolved_value, ConfigValues) \
+                       and substitution.parent in parents \
+                       and hasattr(substitution.parent, 'overriden_value') \
+                       and substitution.parent.overriden_value:
+
+                        # self resolution, backtrack
+                        resolved_value = substitution.parent.overriden_value
+
+                    unresolved, new_substitutions, result = cls._do_substitute(
+                        substitution, resolved_value, is_optional_resolved)
+                    any_unresolved = unresolved or any_unresolved
+                    substitutions.extend(new_substitutions)
+                    # a substitution stays pending only while its result is still a ConfigValues
+                    if not isinstance(result, ConfigValues):
+                        substitutions.remove(substitution)
+
+            cls._final_fixup(config)
+            if unresolved:
+                has_unresolved = True
+                if not accept_unresolved:
+                    # report every substitution still pending, with its position in the parsed text
+                    raise ConfigSubstitutionException("Cannot resolve {variables}. Check for cycles.".format(
+                        variables=', '.join('${{{variable}}}: (line: {line}, col: {col})'.format(
+                            variable=substitution.variable,
+                            line=lineno(substitution.loc, substitution.instring),
+                            col=col(substitution.loc, substitution.instring)) for substitution in substitutions)))
+
+        cls._final_fixup(config)
+        return has_unresolved
+
+
+class ListParser(TokenConverter):
+    """Token converter producing a ConfigList from a parsed list [elt1, elt2, ...]
+    """
+
+    def __init__(self, expr=None):
+        super(ListParser, self).__init__(expr)
+        self.saveAsList = True
+
+    def postParse(self, instring, loc, token_list):
+        """Build a ConfigList out of the parsed tokens
+
+        Empty-string tokens are dropped and a ConfigInclude token is replaced by its own tokens.
+
+        :param instring: parsed string (unused here)
+        :param loc: location in the parsed string (unused here)
+        :param token_list: tokens of the list
+        :return: single-element list holding the ConfigList
+        """
+        flattened = []
+        for entry in token_list:
+            if entry != '':
+                if isinstance(entry, ConfigInclude):
+                    flattened.extend(entry.tokens)
+                else:
+                    flattened.append(entry)
+        return [ConfigList(flattened)]
+
+
+class ConcatenatedValueParser(TokenConverter):
+    """Token converter wrapping the parsed tokens in a ConfigValues and returning its transformed value
+    """
+
+    def __init__(self, expr=None):
+        super(ConcatenatedValueParser, self).__init__(expr)
+        self.parent = self.key = None
+
+    def postParse(self, instring, loc, token_list):
+        """Return a single-element list holding the transformed ConfigValues built from ``token_list``"""
+        return [ConfigValues(token_list, instring, loc).transform()]
+
+
+class ConfigTreeParser(TokenConverter):
+    """
+    Parse a config tree from tokens
+    """
+
+    def __init__(self, expr=None, root=False):
+        """
+        :param expr: expression forwarded to TokenConverter
+        :param root: passed as ``root`` to every ConfigTree created by postParse
+        """
+        super(ConfigTreeParser, self).__init__(expr)
+        self.root = root
+        self.saveAsList = True
+
+    def postParse(self, instring, loc, token_list):
+        """Create ConfigTree from tokens
+
+        :param instring: parsed string (not used by this method)
+        :param loc: location in the parsed string, handed to the ConfigSubstitution / ConfigValues built for ``+=``
+        :param token_list: one entry per key assignment, or a ConfigInclude carrying such entries
+        :return: the populated ConfigTree
+        :raises ParseSyntaxException: an entry is neither (key, value) nor (key, operator, value)
+        """
+        config_tree = ConfigTree(root=self.root)
+        for element in token_list:
+            # an include contributes its own entries in place of itself
+            expanded_tokens = element.tokens if isinstance(element, ConfigInclude) else [element]
+
+            for tokens in expanded_tokens:
+                # key, value1 (optional), ...
+                key = tokens[0].strip() if isinstance(tokens[0], (unicode, basestring)) else tokens[0]
+                # without an explicit operator the entry is handled like '='
+                operator = '='
+                if len(tokens) == 3 and tokens[1].strip() in [':', '=', '+=']:
+                    operator = tokens[1].strip()
+                    values = tokens[2:]
+                elif len(tokens) == 2:
+                    values = tokens[1:]
+                else:
+                    raise ParseSyntaxException("Unknown tokens {tokens} received".format(tokens=tokens))
+                # empty string
+                if len(values) == 0:
+                    config_tree.put(key, '')
+                else:
+                    value = values[0]
+                    if isinstance(value, list) and operator == "+=":
+                        # list += : rewritten as an optional substitution of the key itself followed by the new list
+                        value = ConfigValues([ConfigSubstitution(key, True, '', False, loc), value], False, loc)
+                        config_tree.put(key, value, False)
+                    elif isinstance(value, unicode) and operator == "+=":
+                        # string += : same rewrite, the new text being appended after a single space
+                        value = ConfigValues([ConfigSubstitution(key, True, '', True, loc), ' ' + value], True, loc)
+                        config_tree.put(key, value, False)
+                    elif isinstance(value, list):
+                        config_tree.put(key, value, False)
+                    else:
+                        existing_value = config_tree.get(key, None)
+                        if isinstance(value, ConfigTree) and not isinstance(existing_value, list):
+                            # Only Tree has to be merged with tree
+                            config_tree.put(key, value, True)
+                        elif isinstance(value, ConfigValues):
+                            conf_value = value
+                            value.parent = config_tree
+                            value.key = key
+                            # append onto an existing list or tree, otherwise override
+                            if isinstance(existing_value, list) or isinstance(existing_value, ConfigTree):
+                                config_tree.put(key, conf_value, True)
+                            else:
+                                config_tree.put(key, conf_value, False)
+                        else:
+                            config_tree.put(key, value, False)
+        return config_tree
--- a/clearml_agent/external/pyhocon/config_tree.py
+++ b/clearml_agent/external/pyhocon/config_tree.py
@@ -0,0 +1,608 @@
+from collections import OrderedDict
+from pyparsing import lineno
+from pyparsing import col
+try:
+    basestring
+except NameError:  # pragma: no cover
+    basestring = str
+    unicode = str
+
+import re
+import copy
+from .exceptions import ConfigException, ConfigWrongTypeException, ConfigMissingException
+
+
+class UndefinedKey(object):
+    """Sentinel class used by the ConfigTree getters as the "no default supplied" marker"""
+    pass
+
+
+class NonExistentKey(object):
+    """Sentinel class passed as the default by ConfigTree.__getattr__ to detect a missing key"""
+    pass
+
+
+class NoneValue(object):
+    """Placeholder for a stored null: ConfigTree._get returns None in place of an instance of it
+
+    The class itself also serves as the sentinel default in ConfigTree.__contains__.
+    """
+    pass
+
+
+class ConfigTree(OrderedDict):
+    KEY_SEP = '.'
+
+    def __init__(self, *args, **kwds):
+        """Create a config tree from the usual OrderedDict arguments
+
+        :param root: optional keyword (default False), removed from ``kwds`` before they reach OrderedDict;
+            a root tree additionally gets an empty ``history`` dict
+        """
+        self.root = kwds.pop('root', False)
+        if self.root:
+            self.history = {}
+        super(ConfigTree, self).__init__(*args, **kwds)
+        for key, value in self.items():
+            if isinstance(value, ConfigValues):
+                value.parent = self
+                # a ConfigValues records where it lives in its container through `key`
+                # (that is what _put, merge_configs and ConfigList set); without it `key` stayed None
+                value.key = key
+                # previously the only attribute written here, kept for backward compatibility
+                value.index = key
+
+    @staticmethod
+    def merge_configs(a, b, copy_trees=False):
+        """Merge config b into a
+
+        :param a: target config
+        :type a: ConfigTree
+        :param b: source config
+        :type b: ConfigTree
+        :param copy_trees: when True, a nested tree of ``a`` is replaced by its ``copy()`` before being merged into
+        :type copy_trees: bool
+        :return: merged config a
+        """
+        for key, value in b.items():
+            # if key is in both a and b and both values are dictionary then merge it otherwise override it
+            if key in a and isinstance(a[key], ConfigTree) and isinstance(b[key], ConfigTree):
+                if copy_trees:
+                    a[key] = a[key].copy()
+                ConfigTree.merge_configs(a[key], b[key], copy_trees=copy_trees)
+            else:
+                if isinstance(value, ConfigValues):
+                    # re-home the value under `a`, remembering what it replaces
+                    value.parent = a
+                    value.key = key
+                    if key in a:
+                        value.overriden_value = a[key]
+                a[key] = value
+                if a.root:
+                    # only root trees carry a history; append b's history for the key when b is a root too
+                    if b.root:
+                        a.history[key] = a.history.get(key, []) + b.history.get(key, [value])
+                    else:
+                        a.history[key] = a.history.get(key, []) + [value]
+
+        return a
+
+    def _put(self, key_path, value, append=False):
+        """Store ``value`` at ``key_path``, creating intermediate trees as needed
+
+        :param key_path: list of path elements, as produced by parse_key
+        :param value: value to store
+        :param append: when True, combine with the value already stored instead of replacing it
+        :raises ConfigWrongTypeException: ``append`` is True and the stored value cannot be combined with ``value``
+        """
+        key_elt = key_path[0]
+        if len(key_path) == 1:
+            # if value to set does not exist, override
+            # if they are both configs then merge
+            # if not then override
+            if key_elt in self and isinstance(self[key_elt], ConfigTree) and isinstance(value, ConfigTree):
+                if self.root:
+                    # on a root tree the merge is done into a fresh tree, which is recorded and then stored
+                    new_value = ConfigTree.merge_configs(ConfigTree(), self[key_elt], copy_trees=True)
+                    new_value = ConfigTree.merge_configs(new_value, value, copy_trees=True)
+                    self._push_history(key_elt, new_value)
+                    self[key_elt] = new_value
+                else:
+                    ConfigTree.merge_configs(self[key_elt], value)
+            elif append:
+                # If we have t=1
+                # and we try to put t.a=5 then t is replaced by {a: 5}
+                l_value = self.get(key_elt, None)
+                if isinstance(l_value, ConfigValues):
+                    # the stored value is still unresolved: add the new value to its tokens
+                    l_value.tokens.append(value)
+                    l_value.recompute()
+                elif isinstance(l_value, ConfigTree) and isinstance(value, ConfigValues):
+                    # the stored tree becomes the first token of the new value
+                    value.overriden_value = l_value
+                    value.tokens.insert(0, l_value)
+                    value.recompute()
+                    value.parent = self
+                    value.key = key_elt
+                    self._push_history(key_elt, value)
+                    self[key_elt] = value
+                elif isinstance(l_value, list) and isinstance(value, ConfigValues):
+                    self._push_history(key_elt, value)
+                    value.overriden_value = l_value
+                    value.parent = self
+                    value.key = key_elt
+                    self[key_elt] = value
+                elif isinstance(l_value, list):
+                    self[key_elt] = l_value + value
+                    # NOTE(review): the previous list, not the concatenated one, is what gets recorded - confirm intent
+                    self._push_history(key_elt, l_value)
+                elif l_value is None:
+                    self._push_history(key_elt, value)
+                    self[key_elt] = value
+
+                else:
+                    raise ConfigWrongTypeException(
+                        u"Cannot concatenate the list {key}: {value} to {prev_value} of {type}".format(
+                            key='.'.join(key_path),
+                            value=value,
+                            prev_value=l_value,
+                            type=l_value.__class__.__name__)
+                    )
+            else:
+                # if there was an override keep overide value
+                if isinstance(value, ConfigValues):
+                    value.parent = self
+                    value.key = key_elt
+                    value.overriden_value = self.get(key_elt, None)
+                self._push_history(key_elt, value)
+                self[key_elt] = value
+        else:
+            # more path elements remain: descend, replacing anything that is not a tree by a new tree
+            next_config_tree = super(ConfigTree, self).get(key_elt)
+            if not isinstance(next_config_tree, ConfigTree):
+                # create a new dictionary or overwrite a previous value
+                next_config_tree = ConfigTree()
+                self._push_history(key_elt, next_config_tree)
+                self[key_elt] = next_config_tree
+            next_config_tree._put(key_path[1:], value, append)
+
+    def _push_history(self, key, value):
+        """Record ``value`` in the history kept for ``key``; does nothing on a tree that is not a root"""
+        if not self.root:
+            return
+        entries = self.history.get(key)
+        if entries is None:
+            entries = []
+            self.history[key] = entries
+        entries.append(value)
+
+    def _get(self, key_path, key_index=0, default=UndefinedKey):
+        """Follow ``key_path`` from position ``key_index`` and return the value found at its end
+
+        :param key_path: list of path elements
+        :param key_index: position in ``key_path`` handled by this tree
+        :param default: returned when the path cannot be followed; ``UndefinedKey`` means raise instead
+        :return: the value found, with NoneValue placeholders (also inside a list) turned into None
+        :raises ConfigMissingException: a path element is missing and no default was given
+        :raises ConfigWrongTypeException: an intermediate element is not a tree and no default was given
+        """
+        segment = key_path[key_index]
+        found = super(ConfigTree, self).get(segment, UndefinedKey)
+        fallback_given = default is not UndefinedKey
+
+        if found is UndefinedKey:
+            if fallback_given:
+                return default
+            raise ConfigMissingException(u"No configuration setting found for key {key}".format(
+                key='.'.join(key_path[: key_index + 1])))
+
+        if key_index != len(key_path) - 1:
+            # more elements remain: only a nested tree can be descended into
+            if isinstance(found, ConfigTree):
+                return found._get(key_path, key_index + 1, default)
+            if fallback_given:
+                return default
+            raise ConfigWrongTypeException(
+                u"{key} has type {type} rather than dict".format(key='.'.join(key_path[:key_index + 1]),
+                                                                 type=type(found).__name__))
+
+        # last element of the path: hide the NoneValue placeholder from callers
+        if isinstance(found, NoneValue):
+            return None
+        if isinstance(found, list):
+            return [None if isinstance(entry, NoneValue) else entry for entry in found]
+        return found
+
+    @staticmethod
+    def parse_key(string):
+        """
+        Break a key into its path elements:
+        - a.b.c => a, b, c
+        - a."b.c" => a, "b.c" (a quoted element keeps its quotes when it holds one of $}[]:=+#`^?!@*&.)
+        - "a" => a
+        - a.b."c" => a, b, c
+        :param string: string key ('.' separates sub-keys), or an int / float used as a single key
+        :return: list of path elements
+        """
+        if isinstance(string, (int, float)):
+            return [string]
+
+        special_characters = '$}[]:=+#`^?!@*&.'
+        pattern = r'"[^"]+"|[^{special_characters}]+'.format(special_characters=re.escape(special_characters))
+
+        path = []
+        for part in re.findall(pattern, string):
+            if any(ch in special_characters for ch in part):
+                # only a quoted element can hold a special character: keep it verbatim
+                path.append(part)
+            else:
+                path.append(part.strip('"'))
+        return path
+
+    def put(self, key, value, append=False):
+        """Store a value in the tree under a dot separated key
+
+        :param key: key to use (dot separated). E.g., a.b.c
+        :type key: basestring
+        :param value: value to store
+        :param append: forwarded to _put; when True the value is combined with the one already stored
+        """
+        path = ConfigTree.parse_key(key)
+        self._put(path, value, append)
+
+    def get(self, key, default=UndefinedKey):
+        """Look up a value in the tree by dot separated key
+
+        :param key: key to use (dot separated). E.g., a.b.c
+        :type key: basestring
+        :param default: value returned when the key is not found
+        :type default: object
+        :return: value stored at key
+        """
+        path = ConfigTree.parse_key(key)
+        return self._get(path, 0, default)
+
+    def get_string(self, key, default=UndefinedKey):
+        """Return the value found at key converted to a string
+
+        A boolean is rendered in lower case; None is returned unchanged.
+
+        :param key: key to use (dot separated). E.g., a.b.c
+        :type key: basestring
+        :param default: value used when the key is not found
+        :type default: basestring
+        :return: string value
+        :type return: basestring
+        """
+        found = self.get(key, default)
+        if found is None:
+            return None
+
+        text = unicode(found)
+        return text.lower() if isinstance(found, bool) else text
+
+    def pop(self, key, default=UndefinedKey):
+        """Remove specified key and return the corresponding value.
+        If key is not found, default is returned if given, otherwise ConfigMissingException is raised
+
+        This method assumes the user wants to remove the last value in the chain so it parses via parse_key
+        and pops the last value out of the dict.
+
+        :param key: key to use (dot separated). E.g., a.b.c
+        :type key: basestring
+        :param default: default value if key not found
+        :type default: object
+        :return: value in the tree located at key
+        """
+        # UndefinedKey is a sentinel: test it by identity so that a default with its own
+        # comparison operators (e.g. an array-like object) is never asked to compare itself
+        if default is not UndefinedKey and key not in self:
+            return default
+
+        value = self.get(key, UndefinedKey)
+        lst = ConfigTree.parse_key(key)
+        parent = self.KEY_SEP.join(lst[0:-1])
+        child = lst[-1]
+
+        # delete from the tree that directly holds the last path element
+        if parent:
+            self.get(parent).__delitem__(child)
+        else:
+            self.__delitem__(child)
+        return value
+
+    def get_int(self, key, default=UndefinedKey):
+        """Return the value found at key converted to an int (None is returned unchanged)
+
+        :param key: key to use (dot separated). E.g., a.b.c
+        :type key: basestring
+        :param default: value used when the key is not found
+        :type default: int
+        :return: int value
+        :type return: int
+        :raises ConfigException: the value cannot be converted to an int
+        """
+        raw = self.get(key, default)
+        if raw is None:
+            return None
+        try:
+            return int(raw)
+        except (TypeError, ValueError):
+            raise ConfigException(
+                u"{key} has type '{type}' rather than 'int'".format(key=key, type=type(raw).__name__))
+
+    def get_float(self, key, default=UndefinedKey):
+        """Return the value found at key converted to a float (None is returned unchanged)
+
+        :param key: key to use (dot separated). E.g., a.b.c
+        :type key: basestring
+        :param default: value used when the key is not found
+        :type default: float
+        :return: float value
+        :type return: float
+        :raises ConfigException: the value cannot be converted to a float
+        """
+        raw = self.get(key, default)
+        if raw is None:
+            return None
+        try:
+            return float(raw)
+        except (TypeError, ValueError):
+            raise ConfigException(
+                u"{key} has type '{type}' rather than 'float'".format(key=key, type=type(raw).__name__))
+
+    def get_bool(self, key, default=UndefinedKey):
+        """Return the value found at key converted to a boolean (None is returned unchanged)
+
+        :param key: key to use (dot separated). E.g., a.b.c
+        :type key: basestring
+        :param default: value used when the key is not found
+        :type default: bool
+        :return: boolean value
+        :type return: bool
+        :raises ConfigException: the value is not one of the recognised boolean words
+        """
+
+        # String conversions as per API-recommendations:
+        # https://github.com/typesafehub/config/blob/master/HOCON.md#automatic-type-conversions
+        lookup = {None: None}
+        lookup.update((word, True) for word in ('true', 'yes', 'on'))
+        lookup.update((word, False) for word in ('false', 'no', 'off'))
+
+        text = self.get_string(key, default)
+        normalized = text if text is None else text.lower()
+        try:
+            return lookup[normalized]
+        except KeyError:
+            raise ConfigException(
+                u"{key} does not translate to a Boolean value".format(key=key))
+
+    def get_list(self, key, default=UndefinedKey):
+        """Return list representation of value found at key
+
+        A tree whose keys are all non-negative integers written without leading zeros
+        ("0", "1", "2", ...) is converted to the list of its values ordered by integer key.
+
+        :param key: key to use (dot separated). E.g., a.b.c
+        :type key: basestring
+        :param default: default value if key not found
+        :type default: list
+        :return: list value
+        :type return: list
+        :raises ConfigException: the value is neither a list nor a numerically-indexed tree
+        """
+        value = self.get(key, default)
+        if isinstance(value, list):
+            return value
+        elif isinstance(value, ConfigTree):
+            indexed = []
+            for k, v in value.items():
+                # the whole key must be an index: the previous pattern '^[1-9][0-9]*$|0'
+                # accepted any key merely starting with "0" (e.g. "0abc" or "01")
+                if not isinstance(k, (unicode, basestring)) or re.match(r'^(?:0|[1-9][0-9]*)$', k) is None:
+                    raise ConfigException(u"{key} does not translate to a list".format(key=key))
+                indexed.append((int(k), v))
+            # order by numeric index; sorting the key strings would place "10" before "2"
+            indexed.sort(key=lambda pair: pair[0])
+            return [v for _, v in indexed]
+        elif value is None:
+            return None
+        else:
+            raise ConfigException(
+                u"{key} has type '{type}' rather than 'list'".format(key=key, type=type(value).__name__))
+
+    def get_config(self, key, default=UndefinedKey):
+        """Return the dict-like value found at key (None is returned unchanged)
+
+        :param key: key to use (dot separated). E.g., a.b.c
+        :type key: basestring
+        :param default: value used when the key is not found
+        :type default: config
+        :return: config value
+        :type return: ConfigTree
+        :raises ConfigException: the value is neither a dict nor None
+        """
+        found = self.get(key, default)
+        if found is None or isinstance(found, dict):
+            return found
+        raise ConfigException(
+            u"{key} has type '{type}' rather than 'config'".format(key=key, type=type(found).__name__))
+
+    def __getitem__(self, item):
+        """Subscript access; ``item`` goes through get(), so it is parsed as a dot separated key"""
+        found = self.get(item)
+        if found is not UndefinedKey:
+            return found
+        raise KeyError(item)
+
+    # Only when `collections` exposes the private `_OrderedDictItemsView` helper, items() is overridden
+    # to return that view built over this tree; otherwise the inherited items() is left untouched.
+    # NOTE(review): presumably this forces the pure-Python items view so iteration goes through
+    # ConfigTree.__getitem__ - confirm against the interpreter versions being supported
+    try:
+        from collections import _OrderedDictItemsView
+    except ImportError:  # pragma: nocover
+        pass
+    else:
+        def items(self):  # pragma: nocover
+            return self._OrderedDictItemsView(self)
+
+    def __getattr__(self, item):
+        """Attribute-style access to keys: ``tree.a`` returns what ``tree.get('a')`` returns
+
+        Only called when regular attribute lookup fails.
+
+        :param item: attribute name, looked up as a key
+        :return: value stored under that key
+        :raises AttributeError: no such key
+        """
+        val = self.get(item, NonExistentKey)
+        if val is NonExistentKey:
+            # OrderedDict defines no __getattr__ to delegate to, so delegating only produced an
+            # AttributeError about the `super` object; raise one naming the missing attribute instead
+            raise AttributeError("'{cls}' object has no attribute '{name}'".format(
+                cls=type(self).__name__, name=item))
+        return val
+
+    def __contains__(self, item):
+        """Membership test; ``item`` is parsed as a dot separated key"""
+        path = self.parse_key(item)
+        # the NoneValue class doubles as the "not found" marker for this lookup
+        return self._get(path, default=NoneValue) is not NoneValue
+
+    def with_fallback(self, config, resolve=True):
+        """
+        build a new config made of this one merged on top of a fallback config
+        :param config: config or filename of the config to fallback on
+        :param resolve: resolve substitutions
+        :return: new config with fallback on config
+        """
+        if isinstance(config, ConfigTree):
+            base = copy.deepcopy(config)
+        else:
+            from . import ConfigFactory
+            base = ConfigFactory.parse_file(config, resolve=False)
+
+        # values of this config override those of the fallback
+        merged = ConfigTree.merge_configs(base, copy.deepcopy(self))
+
+        if resolve:
+            from . import ConfigParser
+            ConfigParser.resolve_substitutions(merged)
+        return merged
+
+    def as_plain_ordered_dict(self):
+        """build a plain OrderedDict out of this config, nested trees and lists included
+
+        The config tree should be fully resolved.
+
+        The returned object has no special key semantics such as path expansion.
+        In particular, surrounding '"' are stripped from the keys of the plain OrderedDict.
+
+        :return: this config as an OrderedDict
+        :type return: OrderedDict
+        :raises ConfigException: an unresolved element is still present
+        """
+        def to_plain(node):
+            if isinstance(node, list):
+                return [to_plain(entry) for entry in node]
+            if isinstance(node, ConfigTree):
+                return node.as_plain_ordered_dict()
+            if isinstance(node, ConfigValues):
+                raise ConfigException("The config tree contains unresolved elements")
+            return node
+
+        plain = OrderedDict()
+        for name, node in self.items():
+            plain_name = name.strip('"') if isinstance(name, (unicode, basestring)) else name
+            plain[plain_name] = to_plain(node)
+        return plain
+
+
+class ConfigList(list):
+    """List that registers itself as parent of the ConfigValues it is created with"""
+
+    def __init__(self, iterable=[]):
+        elements = list(iterable)
+        super(ConfigList, self).__init__(elements)
+        position = 0
+        for element in elements:
+            if isinstance(element, ConfigValues):
+                # let the unresolved value know where it lives
+                element.parent = self
+                element.key = position
+            position += 1
+
+
+class ConfigInclude(object):
+    """Holder for a group of tokens; ListParser and ConfigTreeParser replace it by its ``tokens``"""
+
+    def __init__(self, tokens):
+        # tokens: kept as given, read back through the ``tokens`` attribute
+        self.tokens = tokens
+
+
+class ConfigValues(object):
+    """Sequence of tokens making up one value, possibly still holding substitutions
+
+    ``transform()`` turns the tokens into a final value once no substitution is left.
+    """
+
+    def __init__(self, tokens, instring, loc):
+        """
+        :param tokens: tokens of the value
+        :param instring: handed to pyparsing ``lineno`` / ``col`` when transform() reports an error
+        :param loc: handed to pyparsing ``lineno`` / ``col`` together with ``instring``
+        """
+        self.tokens = tokens
+        # container holding this value and the key (or index) it is stored under; set by the containers
+        self.parent = None
+        self.key = None
+        self._instring = instring
+        self._loc = loc
+        # value previously stored under the same key, if any
+        self.overriden_value = None
+        self.recompute()
+
+    def recompute(self):
+        """Re-link the substitution tokens to this object and normalise the token list"""
+        for index, token in enumerate(self.tokens):
+            if isinstance(token, ConfigSubstitution):
+                token.parent = self
+                token.index = index
+
+        # no value return empty string
+        if len(self.tokens) == 0:
+            self.tokens = ['']
+
+        # if the last token is an unquoted string then right strip it
+        if isinstance(self.tokens[-1], ConfigUnquotedString):
+            # rstrip only whitespaces, not \n\r because they would have been used escaped
+            self.tokens[-1] = self.tokens[-1].rstrip(' \t')
+
+    def has_substitution(self):
+        """Return True when get_substitutions() finds at least one substitution"""
+        return len(self.get_substitutions()) > 0
+
+    def get_substitutions(self):
+        """Return the substitution tokens of this value and of the ConfigValues it overrides
+
+        The chain of ``overriden_value`` is followed for as long as it holds ConfigValues;
+        substitutions of overridden values come first in the returned list.
+        """
+        lst = []
+        node = self
+        while node:
+            # prepend, so that the substitutions of the current node end up after those of the nodes it overrides
+            lst = [token for token in node.tokens if isinstance(token, ConfigSubstitution)] + lst
+            if hasattr(node, 'overriden_value'):
+                node = node.overriden_value
+                if not isinstance(node, ConfigValues):
+                    break
+            else:
+                break
+        return lst
+
+    def transform(self):
+        """Compute the final value of the tokens
+
+        :return: ``self`` while a substitution is left; None when every token is None; otherwise a merged
+            ConfigTree, a concatenated list, or the single token / concatenated string
+        :raises ConfigWrongTypeException: the tokens mix trees, lists and plain values
+        """
+        def determine_type(token):
+            # three categories: tree, list, and everything else (labelled str)
+            return ConfigTree if isinstance(token, ConfigTree) else ConfigList if isinstance(token, list) else str
+
+        def format_str(v, last=False):
+            # a quoted string contributes its value plus its trailing whitespace, except in last position
+            if isinstance(v, ConfigQuotedString):
+                return v.value + ('' if last else v.ws)
+            else:
+                return '' if v is None else unicode(v)
+
+        if self.has_substitution():
+            return self
+
+        # remove None tokens
+        tokens = [token for token in self.tokens if token is not None]
+
+        if not tokens:
+            return None
+
+        # check if all tokens are compatible
+        first_tok_type = determine_type(tokens[0])
+        for index, token in enumerate(tokens[1:]):
+            tok_type = determine_type(token)
+            if first_tok_type is not tok_type:
+                raise ConfigWrongTypeException(
+                    "Token '{token}' of type {tok_type} (index {index}) must be of type {req_tok_type} "
+                    "(line: {line}, col: {col})".format(
+                        token=token,
+                        index=index + 1,
+                        tok_type=tok_type.__name__,
+                        req_tok_type=first_tok_type.__name__,
+                        line=lineno(self._loc, self._instring),
+                        col=col(self._loc, self._instring)))
+
+        if first_tok_type is ConfigTree:
+            # collect the trees this value overrides, nearest first
+            child = []
+            if hasattr(self, 'overriden_value'):
+                node = self.overriden_value
+                while node:
+                    if isinstance(node, ConfigValues):
+                        value = node.transform()
+                        if isinstance(value, ConfigTree):
+                            child.append(value)
+                        else:
+                            break
+                    elif isinstance(node, ConfigTree):
+                        child.append(node)
+                    else:
+                        break
+                    if hasattr(node, 'overriden_value'):
+                        node = node.overriden_value
+                    else:
+                        break
+
+            # merge the overridden trees in reverse order of collection, then the tokens on top of them
+            result = ConfigTree()
+            for conf in reversed(child):
+                ConfigTree.merge_configs(result, conf, copy_trees=True)
+            for token in tokens:
+                ConfigTree.merge_configs(result, token, copy_trees=True)
+            return result
+        elif first_tok_type is ConfigList:
+            # concatenate the sub-lists, re-homing any unresolved element at its position in the result
+            result = []
+            main_index = 0
+            for sublist in tokens:
+                sublist_result = ConfigList()
+                for token in sublist:
+                    if isinstance(token, ConfigValues):
+                        token.parent = result
+                        token.key = main_index
+                    main_index += 1
+                    sublist_result.append(token)
+                result.extend(sublist_result)
+            return result
+        else:
+            if len(tokens) == 1:
+                # a single token keeps its own type; a quoted string is unwrapped to its value
+                if isinstance(tokens[0], ConfigQuotedString):
+                    return tokens[0].value
+                return tokens[0]
+            else:
+                return ''.join(format_str(token) for token in tokens[:-1]) + format_str(tokens[-1], True)
+
+    def put(self, index, value):
+        """Replace the token at position ``index`` by ``value``"""
+        self.tokens[index] = value
+
+    def __repr__(self):  # pragma: no cover
+        return '[ConfigValues: ' + ','.join(str(o) for o in self.tokens) + ']'
+
+
+class ConfigSubstitution(object):
+    """Substitution token: the referenced variable plus where it was found"""
+
+    def __init__(self, variable, optional, ws, instring, loc):
+        self.variable, self.optional, self.ws = variable, optional, ws
+        # position inside the owning ConfigValues and that owner; both filled in by ConfigValues.recompute
+        self.index = self.parent = None
+        self.instring, self.loc = instring, loc
+
+    def __repr__(self):  # pragma: no cover
+        return ''.join(['[ConfigSubstitution: ', self.variable, ']'])
+
+
+class ConfigUnquotedString(unicode):
+    """String subclass marking unquoted text; ConfigValues.recompute right-strips it when it is the last token"""
+
+    def __new__(cls, value):
+        return super(ConfigUnquotedString, cls).__new__(cls, value)
+
+
+class ConfigQuotedString(object):
+    """Quoted string token: its text, the whitespace recorded with it, and where it was found"""
+
+    def __init__(self, value, ws, instring, loc):
+        self.value, self.ws = value, ws
+        self.instring, self.loc = instring, loc
+
+    def __repr__(self):  # pragma: no cover
+        return ''.join(['[ConfigQuotedString: ', self.value, ']'])
--- a/clearml_agent/external/pyhocon/converter.py
+++ b/clearml_agent/external/pyhocon/converter.py
@@ -0,0 +1,329 @@
+import json
+import re
+import sys
+
+from . import ConfigFactory
+from .config_tree import ConfigQuotedString
+from .config_tree import ConfigSubstitution
+from .config_tree import ConfigTree
+from .config_tree import ConfigValues
+from .config_tree import NoneValue
+
+
+try:
+    basestring
+except NameError:
+    basestring = str
+    unicode = str
+
+
+class HOCONConverter(object):
+    _number_re = r'[+-]?(\d*\.\d+|\d+(\.\d+)?)([eE][+\-]?\d+)?(?=$|[ \t]*([\$\}\],#\n\r]|//))'
+    _number_re_matcher = re.compile(_number_re)
+
+    @classmethod
+    def to_json(cls, config, compact=False, indent=2, level=0):
+        """Convert HOCON input into a JSON output (`compact` is not read here, only forwarded to recursive calls)
+
+        :return: JSON string representation
+        :type return: basestring
+        """
+        lines = ""
+        if isinstance(config, ConfigTree):
+            if len(config) == 0:
+                lines += '{}'
+            else:
+                lines += '{\n'
+                bet_lines = []
+                for key, item in config.items():
+                    bet_lines.append('{indent}"{key}": {value}'.format(
+                        indent=''.rjust((level + 1) * indent, ' '),
+                        key=key.strip('"'),  # for dotted keys enclosed with "" to not be interpreted as nested key
+                        value=cls.to_json(item, compact, indent, level + 1))
+                    )
+                lines += ',\n'.join(bet_lines)
+                lines += '\n{indent}}}'.format(indent=''.rjust(level * indent, ' '))
+        elif isinstance(config, list):
+            if len(config) == 0:
+                lines += '[]'
+            else:
+                lines += '[\n'
+                bet_lines = []
+                for item in config:
+                    bet_lines.append('{indent}{value}'.format(
+                        indent=''.rjust((level + 1) * indent, ' '),
+                        value=cls.to_json(item, compact, indent, level + 1))
+                    )
+                lines += ',\n'.join(bet_lines)
+                lines += '\n{indent}]'.format(indent=''.rjust(level * indent, ' '))
+        elif isinstance(config, basestring):
+            lines = json.dumps(config)  # json.dumps adds the quotes and escapes the content
+        elif config is None or isinstance(config, NoneValue):
+            lines = 'null'
+        elif config is True:  # identity checks: only the bool singletons map to true/false
+            lines = 'true'
+        elif config is False:
+            lines = 'false'
+        else:
+            lines = str(config)  # any other value is written using its str() form
+        return lines
+
+    @staticmethod
+    def _auto_indent(lines, section):  # append `section` to `lines`, re-aligning the section's continuation lines
+        # noinspection PyBroadException
+        try:
+            indent = len(lines) - lines.rindex('\n')  # characters after the last newline of `lines`, plus one
+        except Exception:
+            indent = len(lines)  # `lines` has no newline: use its whole length
+        # noinspection PyBroadException
+        try:
+            section_indent = section.index('\n')  # length of the section's first line
+        except Exception:
+            section_indent = len(section)  # single-line section
+        if section_indent < 3:  # first line shorter than 3 chars: append the section unchanged
+            return lines + section
+
+        indent = '\n' + ''.rjust(indent, ' ')  # from here on `indent` is the line separator: newline + padding
+        return lines + indent.join([sec.strip() for sec in section.split('\n')])  # each section line is stripped first
+        # indent = ''.rjust(indent, ' ')
+        # return lines + section.replace('\n', '\n'+indent)
+
+    @classmethod
+    def to_hocon(cls, config, compact=False, indent=2, level=0):
+        """Convert HOCON input into a HOCON output
+
+        :return: HOCON string representation
+        :type return: basestring
+        """
+        lines = ""
+        if isinstance(config, ConfigTree):
+            if len(config) == 0:
+                lines += '{}'
+            else:
+                if level > 0:  # don't display { at root level
+                    lines += '{\n'
+                bet_lines = []
+
+                for key, item in config.items():
+                    if compact:  # collapse chains of single-key subtrees into one dotted key
+                        full_key = key
+                        while isinstance(item, ConfigTree) and len(item) == 1:
+                            key, item = next(iter(item.items()))
+                            full_key += '.' + key
+                    else:
+                        full_key = key
+
+                    if isinstance(full_key, float) or \
+                            (isinstance(full_key, (basestring, unicode)) and cls._number_re_matcher.match(full_key)):
+                        # the key is a float, or a string that starts like a number: make sure we quote it
+                        full_key = '\"{}\"'.format(full_key)
+
+                    bet_line = ('{indent}{key}{assign_sign} '.format(
+                        indent=''.rjust(level * indent, ' '),
+                        key=full_key,
+                        assign_sign='' if isinstance(item, dict) else ' =',)
+                    )
+                    value_line = cls.to_hocon(item, compact, indent, level + 1)
+                    if isinstance(item, (list, tuple)):
+                        bet_lines.append(cls._auto_indent(bet_line, value_line))
+                    else:
+                        bet_lines.append(bet_line + value_line)
+                lines += '\n'.join(bet_lines)
+
+                if level > 0:  # don't display { at root level
+                    lines += '\n{indent}}}'.format(indent=''.rjust((level - 1) * indent, ' '))
+        elif isinstance(config, (list, tuple)):
+            if len(config) == 0:
+                lines += '[]'
+            else:
+                # lines += '[\n'
+                lines += '['
+                bet_lines = []
+                base_len = len(lines)
+                skip_comma = False
+                for i, item in enumerate(config):
+                    if 0 < i and not skip_comma:
+                        # if not isinstance(item, (str, int, float)):
+                        #     lines += ',\n{indent}'.format(indent=''.rjust(level * indent, ' '))
+                        # else:
+                        #     lines += ', '
+                        lines += ', '
+
+                    skip_comma = False
+                    new_line = cls.to_hocon(item, compact, indent, level + 1)
+                    lines += new_line
+                    if '\n' in new_line or len(lines) - base_len > 80:  # wrap after a multi-line item or a long row
+                        if i < len(config) - 1:
+                            lines += ',\n{indent}'.format(indent=''.rjust(level * indent, ' '))
+                        base_len = len(lines)
+                        skip_comma = True
+                    # bet_lines.append('{value}'.format(value=cls.to_hocon(item, compact, indent, level + 1)))
+
+                # lines += '\n'.join(bet_lines)
+                # lines += ', '.join(bet_lines)
+
+                # lines += '\n{indent}]'.format(indent=''.rjust((level - 1) * indent, ' '))
+                lines += ']'
+        elif isinstance(config, basestring):
+            if '\n' in config and len(config) > 1:
+                lines = '"""{value}"""'.format(value=config)  # multilines
+            else:
+                lines = '"{value}"'.format(value=cls.__escape_string(config))
+        elif isinstance(config, ConfigValues):
+            lines = ''.join(cls.to_hocon(o, compact, indent, level) for o in config.tokens)
+        elif isinstance(config, ConfigSubstitution):
+            lines = '${'
+            if config.optional:
+                lines += '?'
+            lines += config.variable + '}' + config.ws
+        elif isinstance(config, ConfigQuotedString):
+            if '\n' in config.value and len(config.value) > 1:
+                lines = '"""{value}"""'.format(value=config.value)  # multilines
+            else:
+                lines = '"{value}"'.format(value=cls.__escape_string(config.value))
+        elif config is None or isinstance(config, NoneValue):
+            lines = 'null'
+        elif config is True:
+            lines = 'true'
+        elif config is False:
+            lines = 'false'
+        else:
+            lines = str(config)
+        return lines
+
+    @classmethod
+    def to_yaml(cls, config, compact=False, indent=2, level=0):
+        """Convert HOCON input into a YAML output (multi-line strings become indented `|` literal blocks)
+
+        :return: YAML string representation
+        :type return: basestring
+        """
+        lines = ""
+        if isinstance(config, ConfigTree):
+            if len(config) > 0:
+                if level > 0:
+                    lines += '\n'
+                bet_lines = []
+                for key, item in config.items():
+                    bet_lines.append('{indent}{key}: {value}'.format(
+                        indent=''.rjust(level * indent, ' '),
+                        key=key.strip('"'),  # for dotted keys enclosed with "" to not be interpreted as nested key,
+                        value=cls.to_yaml(item, compact, indent, level + 1))
+                    )
+                lines += '\n'.join(bet_lines)
+        elif isinstance(config, list):
+            config_list = [line for line in config if line is not None]  # None items are dropped from sequences
+            if len(config_list) == 0:
+                lines += '[]'
+            else:
+                lines += '\n'
+                bet_lines = []
+                for item in config_list:
+                    bet_lines.append('{indent}- {value}'.format(indent=''.rjust(level * indent, ' '),
+                                                                value=cls.to_yaml(item, compact, indent, level + 1)))
+                lines += '\n'.join(bet_lines)
+        elif isinstance(config, basestring):
+            # a string containing a \n is emitted as a literal block scalar ('|')
+            if '\n' not in config:
+                lines = config
+            else:
+                pad = ''.rjust(level * indent, ' ')  # prefix every line; rjust() on the line only pads short lines
+                lines = '|\n' + '\n'.join([pad + line for line in config.split('\n')])
+        elif config is None or isinstance(config, NoneValue):
+            lines = 'null'
+        elif config is True:
+            lines = 'true'
+        elif config is False:
+            lines = 'false'
+        else:
+            lines = str(config)
+        return lines
+
+    @classmethod
+    def to_properties(cls, config, compact=False, indent=2, key_stack=[]):
+        """Convert HOCON input into a .properties output
+
+        :param key_stack: keys leading to `config`; only read (never mutated) and joined with '.' as the name
+        :return: .properties string representation
+        :type return: basestring
+        """
+
+        def escape_value(value):  # backslash-escape '=', '!', '#' and newlines
+            return value.replace('=', '\\=').replace('!', '\\!').replace('#', '\\#').replace('\n', '\\\n')
+
+        stripped_key_stack = [key.strip('"') for key in key_stack]
+        lines = []
+        if isinstance(config, ConfigTree):
+            for key, item in config.items():
+                if item is not None:  # None entries produce no output
+                    lines.append(cls.to_properties(item, compact, indent, stripped_key_stack + [key]))
+        elif isinstance(config, list):
+            for index, item in enumerate(config):
+                if item is not None:
+                    lines.append(cls.to_properties(item, compact, indent, stripped_key_stack + [str(index)]))
+        elif isinstance(config, basestring):
+            lines.append('.'.join(stripped_key_stack) + ' = ' + escape_value(config))
+        elif config is True:
+            lines.append('.'.join(stripped_key_stack) + ' = true')
+        elif config is False:
+            lines.append('.'.join(stripped_key_stack) + ' = false')
+        elif config is None or isinstance(config, NoneValue):
+            pass
+        else:
+            lines.append('.'.join(stripped_key_stack) + ' = ' + str(config))
+        return '\n'.join([line for line in lines if len(line) > 0])  # empty fragments are dropped
+
+    @classmethod
+    def convert(cls, config, output_format='json', indent=2, compact=False):  # dispatch on `output_format`
+        converters = {
+            'json': cls.to_json,
+            'properties': cls.to_properties,
+            'yaml': cls.to_yaml,
+            'hocon': cls.to_hocon,
+        }
+
+        if output_format in converters:
+            return converters[output_format](config, compact, indent)  # converters take (config, compact, indent)
+        else:
+            raise Exception("Invalid format '{format}'. Format must be 'json', 'properties', 'yaml' or 'hocon'".format(
+                format=output_format))
+
+    @classmethod
+    def convert_from_file(cls, input_file=None, output_file=None, output_format='json', indent=2, compact=False):
+        """Convert to json, properties, yaml or hocon
+
+        :param input_file: input file, if not specified stdin
+        :param output_file: output file, if not specified stdout
+        :param output_format: json, properties, yaml or hocon
+        :return: None, the converted text is printed to stdout or written to output_file
+        """
+
+        if input_file is None:
+            content = sys.stdin.read()
+            config = ConfigFactory.parse_string(content)
+        else:
+            config = ConfigFactory.parse_file(input_file)
+
+        res = cls.convert(config, output_format, indent, compact)
+        if output_file is None:
+            print(res)
+        else:
+            with open(output_file, "w") as fd:
+                fd.write(res)
+
+    @classmethod
+    def __escape_match(cls, match):  # re.sub callback: map one matched character to its escape sequence
+        char = match.group(0)
+        return {
+            '\b': r'\b',
+            '\t': r'\t',
+            '\n': r'\n',
+            '\f': r'\f',
+            '\r': r'\r',
+            '"': r'\"',
+            '\\': r'\\',
+        }.get(char) or (r'\u%04x' % ord(char))  # characters without a short form use a 4-digit hex escape
+
+    @classmethod
+    def __escape_string(cls, string):  # escape control characters (0x00-0x1F), double quotes and backslashes
+        return re.sub(r'[\x00-\x1F"\\]', cls.__escape_match, string)
--- a/clearml_agent/external/pyhocon/exceptions.py
+++ b/clearml_agent/external/pyhocon/exceptions.py
@@ -0,0 +1,17 @@
+class ConfigException(Exception):  # base class of the other exceptions defined in this module
+
+    def __init__(self, message, ex=None):
+        super(ConfigException, self).__init__(message)
+        self._exception = ex  # stored as-is; None when no extra exception object is supplied
+
+
+class ConfigMissingException(ConfigException, KeyError):  # also a KeyError, so `except KeyError` catches it
+    pass
+
+
+class ConfigSubstitutionException(ConfigException):
+    pass
+
+
+class ConfigWrongTypeException(ConfigException):
+    pass
--- a/clearml_agent/glue/k8s.py
+++ b/clearml_agent/glue/k8s.py
@@ -9,26 +9,32 @@ import os
 import re
 import subprocess
 import tempfile
+from collections import defaultdict
 from copy import deepcopy
 from pathlib import Path
 from pprint import pformat
 from threading import Thread
-from time import sleep
-from typing import Text, List, Callable, Any, Collection, Optional, Union
+from time import sleep, time
+from typing import Text, List, Callable, Any, Collection, Optional, Union, Iterable, Dict, Tuple, Set

 import yaml

+from clearml_agent.backend_api.session import Request
 from clearml_agent.commands.events import Events
 from clearml_agent.commands.worker import Worker, get_task_container, set_task_container, get_next_task
-from clearml_agent.definitions import ENV_DOCKER_IMAGE
+from clearml_agent.definitions import (
+    ENV_DOCKER_IMAGE,
+    ENV_AGENT_GIT_USER,
+    ENV_AGENT_GIT_PASS,
+    ENV_FORCE_SYSTEM_SITE_PACKAGES,
+)
 from clearml_agent.errors import APIError
+from clearml_agent.glue.definitions import ENV_START_AGENT_SCRIPT_PATH
 from clearml_agent.helper.base import safe_remove_file
 from clearml_agent.helper.dicts import merge_dicts
-from clearml_agent.helper.process import get_bash_output
+from clearml_agent.helper.process import get_bash_output, stringify_bash_output
 from clearml_agent.helper.resource_monitor import ResourceMonitor
 from clearml_agent.interface.base import ObjectID
-from clearml_agent.backend_api.session import Request
-from clearml_agent.glue.definitions import ENV_START_AGENT_SCRIPT_PATH


 class K8sIntegration(Worker):
@@ -36,19 +42,14 @@ class K8sIntegration(Worker):

    K8S_DEFAULT_NAMESPACE = "clearml"
    AGENT_LABEL = "CLEARML=agent"
-    LIMIT_POD_LABEL = "ai.allegro.agent.serial=pod-{pod_number}"

    KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f"

-    KUBECTL_RUN_CMD = "kubectl run clearml-id-{task_id} " \
-                      "--image {docker_image} {docker_args} " \
-                      "--restart=Never " \
-                      "--namespace={namespace}"
-
-    KUBECTL_DELETE_CMD = "kubectl delete pods " \
-                         "--selector={selector} " \
-                         "--field-selector=status.phase!=Pending,status.phase!=Running " \
-                         "--namespace={namespace}"
+    KUBECTL_CLEANUP_DELETE_CMD = "kubectl delete pods " \
+                                 "-l={agent_label} " \
+                                 "--field-selector=status.phase!=Pending,status.phase!=Running " \
+                                 "--namespace={namespace} " \
+                                 "--output name"

    BASH_INSTALL_SSH_CMD = [
        "apt-get update",
@@ -65,6 +66,9 @@ class K8sIntegration(Worker):
        'echo "ldconfig" >> /etc/profile',
        "/usr/sbin/sshd -p {port}"]

+    DEFAULT_EXECUTION_AGENT_ARGS = os.getenv("K8S_GLUE_DEF_EXEC_AGENT_ARGS", "--full-monitoring --require-queue")
+    POD_AGENT_INSTALL_ARGS = os.getenv("K8S_GLUE_POD_AGENT_INSTALL_ARGS", "")
+
    CONTAINER_BASH_SCRIPT = [
        "export DEBIAN_FRONTEND='noninteractive'",
        "echo 'Binary::apt::APT::Keep-Downloaded-Packages \"true\";' > /etc/apt/apt.conf.d/docker-clean",
@@ -77,17 +81,19 @@ class K8sIntegration(Worker):
        "[ ! -z $LOCAL_PYTHON ] || apt-get install -y python3-pip",
        "[ ! -z $LOCAL_PYTHON ] || export LOCAL_PYTHON=python3",
        "{extra_bash_init_cmd}",
-        "$LOCAL_PYTHON -m pip install clearml-agent",
+        "$LOCAL_PYTHON -m pip install clearml-agent{agent_install_args}",
        "{extra_docker_bash_script}",
-        "$LOCAL_PYTHON -m clearml_agent execute --full-monitoring --require-queue --id {task_id}"
+        "$LOCAL_PYTHON -m clearml_agent execute {default_execution_agent_args} --id {task_id}"
    ]

+    DEFAULT_POD_NAME_PREFIX = "clearml-id-"
+    DEFAULT_LIMIT_POD_LABEL = "ai.allegro.agent.serial=pod-{pod_number}"
+
    _edit_hyperparams_version = "2.9"

    def __init__(
            self,
            k8s_pending_queue_name=None,
-            kubectl_cmd=None,
            container_bash_script=None,
            debug=False,
            ports_mode=False,
@@ -100,15 +106,14 @@ class K8sIntegration(Worker):
            extra_bash_init_script=None,
            namespace=None,
            max_pods_limit=None,
+            pod_name_prefix=None,
+            limit_pod_label=None,
            **kwargs
    ):
        """
        Initialize the k8s integration glue layer daemon

        :param str k8s_pending_queue_name: queue name to use when task is pending in the k8s scheduler
-        :param str|callable kubectl_cmd: kubectl command line str, supports formatting (default: KUBECTL_RUN_CMD)
-            example: "task={task_id} image={docker_image} queue_id={queue_id}"
-            or a callable function: kubectl_cmd(task_id, docker_image, docker_args, queue_id, task_data)
        :param str container_bash_script: container bash script to be executed in k8s (default: CONTAINER_BASH_SCRIPT)
            Notice this string will use format() call, if you have curly brackets they should be doubled { -> {{
            Format arguments passed: {task_id} and {extra_bash_init_cmd}
@@ -130,12 +135,16 @@ class K8sIntegration(Worker):
        :param int max_pods_limit: Maximum number of pods that K8S glue can run at the same time
        """
        super(K8sIntegration, self).__init__()
+        self.pod_name_prefix = pod_name_prefix or self.DEFAULT_POD_NAME_PREFIX
+        self.limit_pod_label = limit_pod_label or self.DEFAULT_LIMIT_POD_LABEL
        self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
        self.k8s_pending_queue_id = None
-        self.kubectl_cmd = kubectl_cmd or self.KUBECTL_RUN_CMD
        self.container_bash_script = container_bash_script or self.CONTAINER_BASH_SCRIPT
-        # Always do system packages, because by we will be running inside a docker
-        self._session.config.put("agent.package_manager.system_site_packages", True)
+        force_system_packages = ENV_FORCE_SYSTEM_SITE_PACKAGES.get()
+        self._force_system_site_packages = force_system_packages if force_system_packages is not None else True
+        if self._force_system_site_packages:
+            # Use system packages, because by we will be running inside a docker
+            self._session.config.put("agent.package_manager.system_site_packages", True)
        # Add debug logging
        if debug:
            self.log.logger.disabled = False
@@ -156,27 +165,9 @@ class K8sIntegration(Worker):
        self.pod_limits = []
        self.pod_requests = []
        self.max_pods_limit = max_pods_limit if not self.ports_mode else None
-        if overrides_yaml:
-            overrides = self._load_template_file(overrides_yaml)
-            if overrides:
-                containers = overrides.get('spec', {}).get('containers', [])
-                for c in containers:
-                    resources = {str(k).lower(): v for k, v in c.get('resources', {}).items()}
-                    if not resources:
-                        continue
-                    if resources.get('limits'):
-                        self.pod_limits += ['{}={}'.format(k, v) for k, v in resources['limits'].items()]
-                    if resources.get('requests'):
-                        self.pod_requests += ['{}={}'.format(k, v) for k, v in resources['requests'].items()]
-                # remove double entries
-                self.pod_limits = list(set(self.pod_limits))
-                self.pod_requests = list(set(self.pod_requests))
-                if self.pod_limits or self.pod_requests:
-                    self.log.warning('Found pod container requests={} limits={}'.format(
-                        self.pod_limits, self.pod_requests))
-                if containers:
-                    self.log.warning('Removing containers section: {}'.format(overrides['spec'].pop('containers')))
-                self.overrides_json_string = json.dumps(overrides)
+
+        self._load_overrides_yaml(overrides_yaml)
+
        if template_yaml:
            self.template_dict = self._load_template_file(template_yaml)

@@ -185,13 +176,39 @@ class K8sIntegration(Worker):
        if clearml_conf_file:
            with open(os.path.expandvars(os.path.expanduser(str(clearml_conf_file))), 'rt') as f:
                self.conf_file_content = f.read()
-            # make sure we use system packages!
-            self.conf_file_content += '\nagent.package_manager.system_site_packages=true\n'

        self._agent_label = None

        self._monitor_hanging_pods()

+        self._min_cleanup_interval_per_ns_sec = 1.0
+        self._last_pod_cleanup_per_ns = defaultdict(lambda: 0.)
+
+    def _load_overrides_yaml(self, overrides_yaml):  # load the pod overrides file; no-op when unset or empty
+        if not overrides_yaml:
+            return
+        overrides = self._load_template_file(overrides_yaml)
+        if not overrides:
+            return
+        containers = (overrides.get('spec') or {}).get('containers') or []  # tolerate explicit nulls in the YAML
+        for c in containers:
+            resources = {str(k).lower(): v for k, v in (c.get('resources') or {}).items()}
+            if not resources:
+                continue
+            if resources.get('limits'):
+                self.pod_limits += ['{}={}'.format(k, v) for k, v in resources['limits'].items()]
+            if resources.get('requests'):
+                self.pod_requests += ['{}={}'.format(k, v) for k, v in resources['requests'].items()]
+        # remove double entries
+        self.pod_limits = list(set(self.pod_limits))
+        self.pod_requests = list(set(self.pod_requests))
+        if self.pod_limits or self.pod_requests:
+            self.log.warning('Found pod container requests={} limits={}'.format(
+                self.pod_requests, self.pod_limits))  # argument order follows the placeholders
+        if containers:  # the containers section is removed before the overrides are serialized
+            self.log.warning('Removing containers section: {}'.format(overrides['spec'].pop('containers')))
+        self.overrides_json_string = json.dumps(overrides)
+
    def _monitor_hanging_pods(self):
        _check_pod_thread = Thread(target=self._monitor_hanging_pods_daemon)
        _check_pod_thread.daemon = True
@@ -202,25 +219,22 @@ class K8sIntegration(Worker):
        with open(os.path.expandvars(os.path.expanduser(str(path))), 'rt') as f:
            return yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))

-    @staticmethod
-    def _get_path(d, *path, default=None):
-        try:
-            return functools.reduce(
-                lambda a, b: a[b], path, d
-            )
-        except (IndexError, KeyError):
-            return default
-
-    def _get_kubectl_options(self, command, extra_labels=None):
-        labels = [self._get_agent_label()] + (list(extra_labels) if extra_labels else [])
-        return {
+    def _get_kubectl_options(self, command, extra_labels=None, filters=None, output="json", labels=None):
+        # type: (str, Iterable[str], Iterable[str], str, Iterable[str]) -> Dict
+        if not labels:
+            labels = [self._get_agent_label()]
+        labels = list(labels) + (list(extra_labels) if extra_labels else [])
+        d = {
            "-l": ",".join(labels),
            "-n": str(self.namespace),
-            "-o": "json"
+            "-o": output,
        }
+        if filters:
+            d["--field-selector"] = ",".join(filters)
+        return d

-    def get_kubectl_command(self, command, extra_labels=None):
-        opts = self._get_kubectl_options(command, extra_labels)
+    def get_kubectl_command(self, command, output="json", **args):
+        opts = self._get_kubectl_options(command, output=output, **args)
        return 'kubectl {command} {opts}'.format(
            command=command, opts=" ".join(x for item in opts.items() for x in item)
        )
@@ -229,10 +243,9 @@ class K8sIntegration(Worker):
        last_tasks_msgs = {}  # last msg updated for every task

        while True:
-            kubectl_cmd = self.get_kubectl_command("get pods")
+            kubectl_cmd = self.get_kubectl_command("get pods", filters=["status.phase=Pending"])
            self.log.debug("Detecting hanging pods: {}".format(kubectl_cmd))
-            output = get_bash_output(kubectl_cmd)
-            output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
+            output = stringify_bash_output(get_bash_output(kubectl_cmd))
            try:
                output_config = json.loads(output)
            except Exception as ex:
@@ -240,11 +253,8 @@ class K8sIntegration(Worker):
                sleep(self._polling_interval)
                continue
            pods = output_config.get('items', [])
-            task_ids = set()
+            task_id_to_details = dict()
            for pod in pods:
-                if self._get_path(pod, 'status', 'phase') != "Pending":
-                    continue
-
                pod_name = pod.get('metadata', {}).get('name', None)
                if not pod_name:
                    continue
@@ -257,7 +267,7 @@ class K8sIntegration(Worker):
                if not namespace:
                    continue

-                task_ids.add(task_id)
+                task_id_to_details[task_id] = (pod_name, namespace)

                msg = None

@@ -277,8 +287,10 @@ class K8sIntegration(Worker):

                    if reason == 'ImagePullBackOff':
                        delete_pod_cmd = 'kubectl delete pods {} -n {}'.format(pod_name, namespace)
+                        self.log.debug(" - deleting pod due to ImagePullBackOff: {}".format(delete_pod_cmd))
                        get_bash_output(delete_pod_cmd)
                        try:
+                            self.log.debug(" - Detecting hanging pods: {}".format(kubectl_cmd))
                            self._session.api_client.tasks.failed(
                                task=task_id,
                                status_reason="K8S glue error: {}".format(msg),
@@ -310,17 +322,51 @@ class K8sIntegration(Worker):
                        last_tasks_msgs[task_id] = msg
                    except Exception as ex:
                        self.log.warning(
-                            'K8S Glue pods monitor: Failed setting status message for task "{}"\nEX: {}'.format(
-                                task_id, ex
+                            'K8S Glue pods monitor: Failed setting status message for task "{}"\nMSG: {}\nEX: {}'.format(
+                                task_id, msg, ex
                            )
                        )

+            if task_id_to_details:
+                try:
+                    result = self._session.get(
+                        service='tasks',
+                        action='get_all',
+                        json={"id": list(task_id_to_details), "status": ["stopped"], "only_fields": ["id"]},
+                        method=Request.def_method,
+                        async_enable=False,
+                    )
+                    aborted_task_ids = list(filter(None, (task.get("id") for task in result["tasks"])))
+
+                    for task_id in aborted_task_ids:
+                        pod_name, namespace = task_id_to_details.get(task_id)
+                        if not pod_name:
+                            self.log.error("Failed locating aborted task {} in pending pods list".format(task_id))
+                            continue
+                        self.log.info(
+                            "K8S Glue pods monitor: task {} was aborted by its pod {} is still pending, "
+                            "deleting pod".format(task_id, pod_name)
+                        )
+
+                        kubectl_cmd = "kubectl delete pod {pod_name} --output name {namespace}".format(
+                            namespace=f"--namespace={namespace}" if namespace else "", pod_name=pod_name,
+                        ).strip()
+                        self.log.debug("Deleting aborted task pending pod: {}".format(kubectl_cmd))
+                        output = stringify_bash_output(get_bash_output(kubectl_cmd))
+                        if not output:
+                            self.log.warning("K8S Glue pods monitor: failed deleting pod {}".format(pod_name))
+                except Exception as ex:
+                    self.log.warning(
+                        'K8S Glue pods monitor: failed checking aborted tasks for hanging pods: {}'.format(ex)
+                    )
+
            # clean up any last message for a task that wasn't seen as a pod
-            last_tasks_msgs = {k: v for k, v in last_tasks_msgs.items() if k in task_ids}
+            last_tasks_msgs = {k: v for k, v in last_tasks_msgs.items() if k in task_id_to_details}

            sleep(self._polling_interval)

-    def _set_task_user_properties(self, task_id: str, **properties: str):
+    def _set_task_user_properties(self, task_id: str, task_session=None, **properties: str):
+        session = task_session or self._session
        if self._edit_hyperparams_support is not True:
            # either not supported or never tested
            if self._edit_hyperparams_support == self._session.api_version:
@@ -331,7 +377,7 @@ class K8sIntegration(Worker):
                self._edit_hyperparams_support = self._session.api_version
                return
        try:
-            self._session.get(
+            session.get(
                service="tasks",
                action="edit_hyper_params",
                task=task_id,
@@ -363,68 +409,94 @@ class K8sIntegration(Worker):
        return self._agent_label

    def _get_used_pods(self):
+        # type: () -> Tuple[int, Set[str]]
        # noinspection PyBroadException
        try:
-            kubectl_cmd_new = self.get_kubectl_command("get pods")
-            self.log.debug("Getting used pods: {}".format(kubectl_cmd_new))
-            process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            output, error = process.communicate()
-            output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
-            error = '' if not error else error if isinstance(error, str) else error.decode('utf-8')
+            kubectl_cmd = self.get_kubectl_command(
+                "get pods",
+                output="jsonpath=\"{range .items[*]}{.metadata.name}{' '}{.metadata.namespace}{'\\n'}{end}\""
+            )
+            self.log.debug("Getting used pods: {}".format(kubectl_cmd))
+            output = stringify_bash_output(get_bash_output(kubectl_cmd, raise_error=True))

            if not output:
                # No such pod exist so we can use the pod_number we found
-                return 0, {}
+                return 0, set([])

            try:
-                items = json.loads(output).get("items", [])
+                items = output.splitlines()
                current_pod_count = len(items)
-                namespaces = {item["metadata"]["namespace"] for item in items}
+                namespaces = {item.rpartition(" ")[-1] for item in items}
+                self.log.debug(" - found {} pods in namespaces {}".format(current_pod_count, ", ".join(namespaces)))
            except (KeyError, ValueError, TypeError, AttributeError) as ex:
                print("Failed parsing used pods command response for cleanup: {}".format(ex))
-                return -1, {}
+                return -1, set([])

            return current_pod_count, namespaces
        except Exception as ex:
            print('Failed obtaining used pods information: {}'.format(ex))
-            return -2, {}
+            return -2, set([])
+
+    def _is_same_tenant(self, task_session):
+        """ True if there is no task session, it is our own session or it has our tenant, False if unknown """
+        if not task_session or task_session is self._session:
+            return True
+        try:
+            tenant = self._session.get_decoded_token(self._session.token, verify=False)["tenant"]
+            return tenant == task_session.get_decoded_token(task_session.token, verify=False)["tenant"]
+        except Exception as ex:
+            print("ERROR: Failed getting tenant for task session: {}".format(ex))
+            return False

    def run_one_task(self, queue: Text, task_id: Text, worker_args=None, task_session=None, **_):
        print('Pulling task {} launching on kubernetes cluster'.format(task_id))
-        task_data = self._session.api_client.tasks.get_all(id=[task_id])[0]
+        session = task_session or self._session
+        task_data = session.api_client.tasks.get_all(id=[task_id])[0]

        # push task into the k8s queue, so we have visibility on pending tasks in the k8s scheduler
-        try:
-            print('Pushing task {} into temporary pending queue'.format(task_id))
-            _ = self._session.api_client.tasks.stop(task_id, force=True)
-            res = self._session.api_client.tasks.enqueue(
-                task_id,
-                queue=self.k8s_pending_queue_id,
-                status_reason='k8s pending scheduler',
-            )
-            if res.meta.result_code != 200:
-                raise Exception(res.meta.result_msg)
-        except Exception as e:
-            self.log.error("ERROR: Could not push back task [{}] to k8s pending queue {} [{}], error: {}".format(
-                task_id, self.k8s_pending_queue_name, self.k8s_pending_queue_id, e))
-            return
+        if self._is_same_tenant(task_session):
+            try:
+                print('Pushing task {} into temporary pending queue'.format(task_id))
+                _ = session.api_client.tasks.stop(task_id, force=True)

-        container = get_task_container(self._session, task_id)
+                res = self._session.api_client.tasks.enqueue(
+                    task_id,
+                    queue=self.k8s_pending_queue_id,
+                    status_reason='k8s pending scheduler',
+                )
+                if res.meta.result_code != 200:
+                    raise Exception(res.meta.result_msg)
+            except Exception as e:
+                self.log.error("ERROR: Could not push back task [{}] to k8s pending queue {} [{}], error: {}".format(
+                    task_id, self.k8s_pending_queue_name, self.k8s_pending_queue_id, e))
+                return
+
+        container = get_task_container(session, task_id)
        if not container.get('image'):
            container['image'] = str(
-                ENV_DOCKER_IMAGE.get() or self._session.config.get("agent.default_docker.image", "nvidia/cuda")
+                ENV_DOCKER_IMAGE.get() or session.config.get("agent.default_docker.image", "nvidia/cuda")
            )
-            container['arguments'] = self._session.config.get("agent.default_docker.arguments", None)
+            container['arguments'] = session.config.get("agent.default_docker.arguments", None)
            set_task_container(
-                self._session, task_id, docker_image=container['image'], docker_arguments=container['arguments']
+                session, task_id, docker_image=container['image'], docker_arguments=container['arguments']
            )

-        # get the clearml.conf encoded file
+        # get the clearml.conf encoded file, make sure we use system packages!
+
+        git_user = ENV_AGENT_GIT_USER.get() or self._session.config.get("agent.git_user", None)
+        git_pass = ENV_AGENT_GIT_PASS.get() or self._session.config.get("agent.git_pass", None)
+        extra_config_values = [
+            'agent.package_manager.system_site_packages: true' if self._force_system_site_packages else '',
+            'agent.git_user: "{}"'.format(git_user) if git_user else '',
+            'agent.git_pass: "{}"'.format(git_pass) if git_pass else '',
+        ]
+
        # noinspection PyProtectedMember
-        hocon_config_encoded = (
-            self.conf_file_content
-            or Path(self._session._config_file).read_text()
-        ).encode("ascii")
+        config_content = (
+            self.conf_file_content or (session._config_file and Path(session._config_file).read_text()) or ""
+        ) + '\n{}\n'.format('\n'.join(x for x in extra_config_values if x))
+
+        hocon_config_encoded = config_content.encode("ascii")

        create_clearml_conf = ["echo '{}' | base64 --decode >> ~/clearml.conf".format(
            base64.b64encode(
@@ -456,13 +528,13 @@ class K8sIntegration(Worker):

            kubectl_cmd_new = self.get_kubectl_command(
                "get pods",
-                extra_labels=[self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else None
+                extra_labels=[self.limit_pod_label.format(pod_number=pod_number)] if self.ports_mode else None
            )
            self.log.debug("Looking for a free pod/port: {}".format(kubectl_cmd_new))
            process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            output, error = process.communicate()
-            output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
-            error = '' if not error else error if isinstance(error, str) else error.decode('utf-8')
+            output = stringify_bash_output(output)
+            error = stringify_bash_output(error)

            try:
                items_count = len(json.loads(output).get("items", []))
@@ -473,8 +545,12 @@ class K8sIntegration(Worker):
                        output, task_id, queue, ex
                    )
                )
-                self._session.api_client.tasks.stop(task_id, force=True)
-                self._session.api_client.tasks.enqueue(task_id, queue=queue, status_reason='kubectl parsing error')
+                session.api_client.tasks.stop(task_id, force=True)
+                # noinspection PyBroadException
+                try:
+                    self._session.api_client.tasks.enqueue(task_id, queue=queue, status_reason='kubectl parsing error')
+                except:
+                    self.log.warning("Failed enqueuing task to queue '{}'".format(queue))
                return

            if not items_count:
@@ -498,9 +574,14 @@ class K8sIntegration(Worker):
                        task_id, queue
                    )
                )
-                self._session.api_client.tasks.stop(task_id, force=True)
-                self._session.api_client.tasks.enqueue(
-                    task_id, queue=queue, status_reason='k8s max pod limit (no free k8s service)')
+                session.api_client.tasks.stop(task_id, force=True)
+                # noinspection PyBroadException
+                try:
+                    self._session.api_client.tasks.enqueue(
+                        task_id, queue=queue, status_reason='k8s max pod limit (no free k8s service)'
+                    )
+                except:
+                    self.log.warning("Failed enqueuing task to queue '{}'".format(queue))
                return
            elif self.max_pods_limit:
                # max pods limit hasn't reached yet, so we can create the pod
@@ -509,36 +590,38 @@ class K8sIntegration(Worker):

        labels = self._get_pod_labels(queue, queue_name)
        if self.ports_mode:
-            labels.append(self.LIMIT_POD_LABEL.format(pod_number=pod_number))
+            labels.append(self.limit_pod_label.format(pod_number=pod_number))

        if self.ports_mode:
            print("Kubernetes scheduling task id={} on pod={} (pod_count={})".format(task_id, pod_number, pod_count))
        else:
            print("Kubernetes scheduling task id={}".format(task_id))

-        kubectl_kwargs = dict(
-            create_clearml_conf=create_clearml_conf,
-            labels=labels,
-            docker_image=container['image'],
-            docker_args=container['arguments'],
-            docker_bash=container.get('setup_shell_script'),
-            task_id=task_id,
-            queue=queue
-        )
-
        try:
            template = self._resolve_template(task_session, task_data, queue)
        except Exception as ex:
            print("ERROR: Failed resolving template (skipping): {}".format(ex))
            return

-        if template:
-            output, error = self._kubectl_apply(template=template, **kubectl_kwargs)
-        else:
-            output, error = self._kubectl_run(task_data=task_data, **kubectl_kwargs)
+        try:
+            namespace = template['metadata']['namespace'] or self.namespace
+        except (KeyError, TypeError, AttributeError):
+            namespace = self.namespace
+
+        if template:
+            output, error = self._kubectl_apply(
+                template=template,
+                pod_number=pod_number,
+                create_clearml_conf=create_clearml_conf,
+                labels=labels,
+                docker_image=container['image'],
+                docker_args=container['arguments'],
+                docker_bash=container.get('setup_shell_script'),
+                task_id=task_id,
+                queue=queue,
+                namespace=namespace,
+            )

-        error = '' if not error else (error if isinstance(error, str) else error.decode('utf-8'))
-        output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8'))
        print('kubectl output:\n{}\n{}'.format(error, output))
        if error:
            send_log = "Running kubectl encountered an error: {}".format(error)
@@ -552,6 +635,7 @@ class K8sIntegration(Worker):
                    "k8s-pod-number": pod_number,
                    "k8s-pod-label": labels[0],
                    "k8s-internal-pod-count": pod_count,
+                    "k8s-agent": self._get_agent_label(),
                }
            )

@@ -566,6 +650,7 @@ class K8sIntegration(Worker):
        if user_props:
            self._set_task_user_properties(
                task_id=task_id,
+                task_session=task_session,
                **user_props
            )

@@ -603,19 +688,22 @@ class K8sIntegration(Worker):
        return results

    def _kubectl_apply(
-        self, create_clearml_conf, docker_image, docker_args, docker_bash, labels, queue, task_id, template=None
+        self,
+        create_clearml_conf,
+        docker_image,
+        docker_args,
+        docker_bash,
+        labels,
+        queue,
+        task_id,
+        namespace,
+        template=None,
+        pod_number=None
    ):
-        template = template or deepcopy(self.template_dict)
-
-        try:
-            namespace = template['metadata']['namespace'] or self.namespace
-        except (KeyError, TypeError, AttributeError):
-            namespace = self.namespace
-
        template.setdefault('apiVersion', 'v1')
        template['kind'] = 'Pod'
        template.setdefault('metadata', {})
-        name = 'clearml-id-{task_id}'.format(task_id=task_id)
+        name = self.pod_name_prefix + str(task_id)
        template['metadata']['name'] = name
        template.setdefault('spec', {})
        template['spec'].setdefault('containers', [])
@@ -643,7 +731,9 @@ class K8sIntegration(Worker):
            ['#!/bin/bash', ] +
            [line.format(extra_bash_init_cmd=self.extra_bash_init_script or '',
                         task_id=task_id,
-                         extra_docker_bash_script=extra_docker_bash_script)
+                         extra_docker_bash_script=extra_docker_bash_script,
+                         default_execution_agent_args=self.DEFAULT_EXECUTION_AGENT_ARGS,
+                         agent_install_args=self.POD_AGENT_INSTALL_ARGS)
             for line in container_bash_script])

        extra_bash_commands = list(create_clearml_conf or [])
@@ -703,57 +793,81 @@ class K8sIntegration(Worker):
        finally:
            safe_remove_file(yaml_file)

-        return output, error
+        return stringify_bash_output(output), stringify_bash_output(error)

-    def _kubectl_run(
-        self, create_clearml_conf, docker_image, docker_args, docker_bash, labels, queue, task_data, task_id
-    ):
-        if callable(self.kubectl_cmd):
-            kubectl_cmd = self.kubectl_cmd(task_id, docker_image, docker_args, queue, task_data)
-        else:
-            kubectl_cmd = self.kubectl_cmd.format(
-                task_id=task_id,
-                docker_image=docker_image,
-                docker_args=" ".join(self._get_docker_args(
-                    docker_args, flags={"-e", "--env"}, convert=lambda env: '--env={}'.format(env))
-                ),
-                queue_id=queue,
-                namespace=self.namespace,
+    def _cleanup_old_pods(self, namespaces, extra_msg=None):
+        # type: (Iterable[str], Optional[str]) -> Dict[str, List[str]]
+        self.log.debug("Cleaning up pods")
+        deleted_pods = defaultdict(list)
+        for namespace in namespaces:
+            if time() - self._last_pod_cleanup_per_ns[namespace] < self._min_cleanup_interval_per_ns_sec:
+                # Do not try to cleanup the same namespace too quickly
+                continue
+            kubectl_cmd = self.KUBECTL_CLEANUP_DELETE_CMD.format(
+                namespace=namespace, agent_label=self._get_agent_label()
            )
-        # make sure we provide a list
-        if isinstance(kubectl_cmd, str):
-            kubectl_cmd = kubectl_cmd.split()
+            self.log.debug("Deleting old/failed pods{} for ns {}: {}".format(
+                extra_msg or "", namespace, kubectl_cmd
+            ))
+            try:
+                res = get_bash_output(kubectl_cmd, raise_error=True)
+                lines = [
+                    line for line in
+                    (r.strip().rpartition("/")[-1] for r in res.splitlines())
+                    if line.startswith(self.pod_name_prefix)
+                ]
+                self.log.debug(" - deleted pod(s) %s", ", ".join(lines))
+                deleted_pods[namespace].extend(lines)
+            except Exception as ex:
+                self.log.error("Failed deleting old/failed pods for ns %s: %s", namespace, str(ex))
+            finally:
+                self._last_pod_cleanup_per_ns[namespace] = time()

-        if self.overrides_json_string:
-            kubectl_cmd += ['--overrides=' + self.overrides_json_string]
+        # Locate tasks belonging to deleted pods that are still marked as pending or running
+        tasks_to_abort = []
+        try:
+            task_ids = list(filter(None, (
+                pod_name[len(self.pod_name_prefix):].strip()
+                for pod_names in deleted_pods.values()
+                for pod_name in pod_names
+            )))
+            if task_ids:
+                result = self._session.get(
+                    service='tasks',
+                    action='get_all',
+                    json={"id": task_ids, "status": ["in_progress", "queued"], "only_fields": ["id", "status"]},
+                    method=Request.def_method,
+                )
+                tasks_to_abort = result["tasks"]
+        except Exception as ex:
+            self.log.warning('Failed getting running tasks for deleted pods: {}'.format(ex))

-        if self.pod_limits:
-            kubectl_cmd += ['--limits', ",".join(self.pod_limits)]
-        if self.pod_requests:
-            kubectl_cmd += ['--requests', ",".join(self.pod_requests)]
+        for task in tasks_to_abort:
+            task_id = task.get("id")
+            status = task.get("status")
+            if not task_id or not status:
+                self.log.warning('Failed getting task information: id={}, status={}'.format(task_id, status))
+                continue
+            try:
+                if status == "queued":
+                    self._session.get(
+                        service='tasks',
+                        action='dequeue',
+                        json={"task": task_id, "force": True, "status_reason": "Pod deleted (not pending or running)",
+                              "status_message": "Pod deleted by agent {}".format(self.worker_id or "unknown")},
+                        method=Request.def_method,
+                    )
+                self._session.get(
+                    service='tasks',
+                    action='failed',
+                    json={"task": task_id, "force": True, "status_reason": "Pod deleted (not pending or running)",
+                          "status_message": "Pod deleted by agent {}".format(self.worker_id or "unknown")},
+                    method=Request.def_method,
+                )
+            except Exception as ex:
+                self.log.warning('Failed setting task {} to status "failed": {}'.format(task_id, ex))

-        if self._docker_force_pull and not any(x.startswith("--image-pull-policy=") for x in kubectl_cmd):
-            kubectl_cmd += ["--image-pull-policy='always'"]
-
-        container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \
-            else self.container_bash_script
-        container_bash_script = ' ; '.join(container_bash_script)
-
-        kubectl_cmd += [
-            "--labels=" + ",".join(labels),
-            "--command",
-            "--",
-            "/bin/sh",
-            "-c",
-            "{} ; {}".format(" ; ".join(create_clearml_conf or []), container_bash_script.format(
-                extra_bash_init_cmd=self.extra_bash_init_script or "",
-                extra_docker_bash_script=docker_bash or "",
-                task_id=task_id
-            )),
-        ]
-        process = subprocess.Popen(kubectl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-        output, error = process.communicate()
-        return output, error
+        return deleted_pods

    def run_tasks_loop(self, queues: List[Text], worker_params, **kwargs):
        """
@@ -782,16 +896,16 @@ class K8sIntegration(Worker):
            # Get used pods and namespaces
            current_pods, namespaces = self._get_used_pods()

+            # just in case there are no pods, make sure we look at our base namespace
+            namespaces.add(self.namespace)
+
            # check if have pod limit, then check if we hit it.
            if self.max_pods_limit:
                if current_pods >= self.max_pods_limit:
                    print("Maximum pod limit reached {}/{}, sleeping for {:.1f} seconds".format(
                        current_pods, self.max_pods_limit, self._polling_interval))
                    # delete old completed / failed pods
-                    for namespace in namespaces:
-                        kubectl_cmd = self.KUBECTL_DELETE_CMD.format(namespace=namespace, selector=self._get_agent_label())
-                        self.log.debug("Deleting old/failed pods due to pod limit: {}".format(kubectl_cmd))
-                        get_bash_output(kubectl_cmd)
+                    self._cleanup_old_pods(namespaces, " due to pod limit")
                    # go to sleep
                    sleep(self._polling_interval)
                    continue
@@ -799,10 +913,7 @@ class K8sIntegration(Worker):
            # iterate over queues (priority style, queues[0] is highest)
            for queue in queues:
                # delete old completed / failed pods
-                for namespace in namespaces:
-                    kubectl_cmd = self.KUBECTL_DELETE_CMD.format(namespace=namespace, selector=self._get_agent_label())
-                    self.log.debug("Deleting old/failed pods: {}".format(kubectl_cmd))
-                    get_bash_output(kubectl_cmd)
+                self._cleanup_old_pods(namespaces)

                # get next task in queue
                try:
@@ -818,14 +929,6 @@ class K8sIntegration(Worker):
                    except (KeyError, TypeError, AttributeError):
                        print("No tasks in queue {}".format(queue))
                        continue
-                    events_service.send_log_events(
-                        self.worker_id,
-                        task_id=task_id,
-                        lines="task {} pulled from {} by worker {}".format(
-                            task_id, queue, self.worker_id
-                        ),
-                        level="INFO",
-                    )

                    task_session = None
                    if self._impersonate_as_task_owner:
@@ -845,6 +948,16 @@ class K8sIntegration(Worker):
                            )
                            continue

+                    events_service.send_log_events(
+                        self.worker_id,
+                        task_id=task_id,
+                        lines="task {} pulled from {} by worker {}".format(
+                            task_id, queue, self.worker_id
+                        ),
+                        level="INFO",
+                        session=task_session,
+                    )
+
                    self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
                    self.run_one_task(queue, task_id, worker_params, task_session)
                    self.report_monitor(ResourceMonitor.StatusReport(queues=self.queues))
@@ -907,5 +1020,6 @@ class K8sIntegration(Worker):
        value = re.sub(r'^[^A-Za-z0-9]+', '', value)  # strip leading non-alphanumeric chars
        value = re.sub(r'[^A-Za-z0-9]+$', '', value)  # strip trailing non-alphanumeric chars
        value = re.sub(r'\W+', '-', value)  # allow only word chars (this removed "." which is supported, but nvm)
+        value = re.sub(r'_+', '-', value)  # "_" is not allowed as well
        value = re.sub(r'-+', '-', value)  # don't leave messy "--" after replacing previous chars
        return value[:63]
--- a/clearml_agent/helper/base.py
+++ b/clearml_agent/helper/base.py
@@ -20,13 +20,13 @@ from typing import Text, Dict, Any, Optional, AnyStr, IO, Union

 import attr
 import furl
-import pyhocon
 import yaml
 from attr import fields_dict
 from pathlib2 import Path

 import six
 from six.moves import reduce
+from clearml_agent.external import pyhocon
 from clearml_agent.errors import CommandFailedError
 from clearml_agent.helper.dicts import filter_keys

--- a/clearml_agent/helper/docker_args.py
+++ b/clearml_agent/helper/docker_args.py
@@ -0,0 +1,96 @@
+import re
+import shlex
+from typing import Tuple, List, TYPE_CHECKING
+from urllib.parse import urlunparse, urlparse
+
+from clearml_agent.definitions import (
+    ENV_AGENT_GIT_PASS,
+    ENV_AGENT_SECRET_KEY,
+    ENV_AWS_SECRET_KEY,
+    ENV_AZURE_ACCOUNT_KEY,
+    ENV_AGENT_AUTH_TOKEN,
+    ENV_DOCKER_IMAGE,
+    ENV_DOCKER_ARGS_HIDE_ENV,
+)
+
+if TYPE_CHECKING:
+    from clearml_agent.session import Session
+
+
+class DockerArgsSanitizer:
+    """ Hides secrets (selected env var values and URL passwords) in a docker command line """
+
+    @classmethod
+    def sanitize_docker_command(cls, session, docker_command):
+        # type: (Session, List[str]) -> List[str]
+        """
+        Return docker_command with sensitive values replaced by asterisks
+
+        The command is returned as is when it is empty or when hiding is not enabled, either by
+        agent.hide_docker_command_env_vars.enabled or by the ENV_DOCKER_ARGS_HIDE_ENV entry.
+
+        :param session: session whose configuration controls what is hidden
+        :param docker_command: docker command line, already split into arguments
+        :return: the sanitized argument list; the input list is not modified
+        """
+        if not docker_command:
+            return docker_command
+
+        enabled = session.config.get('agent.hide_docker_command_env_vars.enabled', False) or ENV_DOCKER_ARGS_HIDE_ENV.get()
+        if not enabled:
+            return docker_command
+
+        keys = set(session.config.get('agent.hide_docker_command_env_vars.extra_keys', []))
+        if ENV_DOCKER_ARGS_HIDE_ENV.get():
+            keys.update(shlex.split(ENV_DOCKER_ARGS_HIDE_ENV.get().strip()))
+        for secret_var in (
+            ENV_AGENT_GIT_PASS, ENV_AGENT_SECRET_KEY, ENV_AWS_SECRET_KEY, ENV_AZURE_ACCOUNT_KEY, ENV_AGENT_AUTH_TOKEN
+        ):
+            keys.update(secret_var.vars)
+
+        parse_embedded_urls = bool(session.config.get('agent.hide_docker_command_env_vars.parse_embedded_urls', True))
+
+        skip_next = False
+        result = docker_command[:]
+        for i, item in enumerate(docker_command):
+            if skip_next:
+                skip_next = False
+                continue
+            try:
+                if item in ("-e", "--env"):
+                    key, sep, val = result[i + 1].partition("=")
+                    if not sep:
+                        continue
+                    if key in ENV_DOCKER_IMAGE.vars:
+                        # special case - this contains a complete docker command
+                        val = " ".join(cls.sanitize_docker_command(session, re.split(r"\s", val)))
+                    elif key in keys:
+                        val = "********"
+                    elif parse_embedded_urls:
+                        val = cls._sanitize_urls(val)[0]
+                    result[i + 1] = "{}={}".format(key, val)
+                    skip_next = True
+                elif parse_embedded_urls and not item.startswith("-"):
+                    item, changed = cls._sanitize_urls(item)
+                    if changed:
+                        result[i] = item
+            except (KeyError, TypeError, IndexError):
+                # best effort - IndexError is a trailing "-e"/"--env" with no value following it
+                pass
+
+        return result
+
+    @staticmethod
+    def _sanitize_urls(s: str) -> Tuple[str, bool]:
+        """ Replaces passwords in URLs with asterisks, returns the new string and whether it changed """
+        # match "user:password@host" only: the "@" is mandatory so a bare "host:port" is left alone,
+        # and the greedy middle part covers a password that itself contains "@"
+        regex = re.compile("^([^:]*:).+(@[^@]*)$")
+        tokens = re.split(r"\s", s)
+        changed = False
+        for k in range(len(tokens)):
+            if "@" in tokens[k]:
+                res = urlparse(tokens[k])
+                if regex.match(res.netloc):
+                    changed = True
+                    tokens[k] = urlunparse(res._replace(netloc=regex.sub("\\1********\\2", res.netloc)))
+        return " ".join(tokens) if changed else s, changed
--- a/clearml_agent/helper/package/base.py
+++ b/clearml_agent/helper/package/base.py
@@ -80,7 +80,12 @@ class PackageManager(object):

    def upgrade_pip(self):
        result = self._install(
-            select_for_platform(windows='pip{}', linux='pip{}').format(self.get_pip_version()), "--upgrade")
+            *select_for_platform(
+                windows=self.get_pip_versions(),
+                linux=self.get_pip_versions()
+            ),
+            "--upgrade"
+        )
        packages = self.run_with_env(('list',), output=True).splitlines()
        # p.split is ('pip', 'x.y.z')
        pip = [p.split() for p in packages if len(p.split()) == 2 and p.split()[0] == 'pip']
@@ -157,15 +162,26 @@ class PackageManager(object):
    def set_pip_version(cls, version):
        if not version:
            return
-        version = version.replace(' ', '')
-        if ('=' in version) or ('~' in version) or ('<' in version) or ('>' in version):
-            cls._pip_version = version
+
+        if isinstance(version, (list, tuple)):
+            versions = version
        else:
-            cls._pip_version = "=="+version
+            versions = [version]
+
+        cls._pip_version = []
+        for version in versions:
+            version = version.strip()
+            if ('=' in version) or ('~' in version) or ('<' in version) or ('>' in version):
+                cls._pip_version.append(version)
+            else:
+                cls._pip_version.append("==" + version)

    @classmethod
-    def get_pip_version(cls):
-        return cls._pip_version or ''
+    def get_pip_versions(cls, pip="pip", wrap=''):
+        """ Return one `wrap + pip + <version spec> + wrap` requirement string per configured pip version """
+        # nothing configured: fall back to an empty spec so the result is the bare package name (not pip + pip)
+        versions = cls._pip_version or [""]
+        return [wrap + pip + version + wrap for version in versions]

    def get_cached_venv(self, requirements, docker_cmd, python_version, cuda_version, destination_folder):
        # type: (Dict, Optional[Union[dict, str]], Optional[str], Optional[str], Path) -> Optional[Path]
@@ -176,8 +192,13 @@ class PackageManager(object):
        if not self._get_cache_manager():
            return None

-        keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
-        return self._get_cache_manager().copy_cached_entry(keys, destination_folder)
+        try:
+            keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
+            return self._get_cache_manager().copy_cached_entry(keys, destination_folder)
+        except Exception as ex:
+            print("WARNING: Failed accessing venvs cache at {}: {}".format(destination_folder, ex))
+            print("WARNING: Skipping venv cache - folder not accessible!")
+            return None

    def add_cached_venv(
            self,
@@ -194,9 +215,15 @@ class PackageManager(object):
        """
        if not self._get_cache_manager():
            return
-        keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
-        return self._get_cache_manager().add_entry(
-            keys=keys, source_folder=source_folder, exclude_sub_folders=exclude_sub_folders)
+
+        try:
+            keys = self._generate_reqs_hash_keys(requirements, docker_cmd, python_version, cuda_version)
+            return self._get_cache_manager().add_entry(
+                keys=keys, source_folder=source_folder, exclude_sub_folders=exclude_sub_folders)
+        except Exception as ex:
+            print("WARNING: Failed accessing venvs cache at {}: {}".format(source_folder, ex))
+            print("WARNING: Skipping venv cache - folder not accessible!")
+            return None

    def get_cache_folder(self):
        # type: () -> Optional[Path]
@@ -213,6 +240,13 @@ class PackageManager(object):
            return
        return self._get_cache_manager().get_last_copied_entry()

+    def is_cached_enabled(self):
+        """Tell whether venv caching can be used: a cache manager exists or a cache folder is configured."""
+        if self._cache_manager:
+            return True
+        cache_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
+        return bool(cache_folder)
+
    @classmethod
    def _generate_reqs_hash_keys(cls, requirements_list, docker_cmd, python_version, cuda_version):
        # type: (Union[Dict, List[Dict]], Optional[Union[dict, str]], Optional[str], Optional[str]) -> List[str]
@@ -257,12 +291,19 @@ class PackageManager(object):

    def _get_cache_manager(self):
        if not self._cache_manager:
-            cache_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
-            if not cache_folder:
+            cache_folder = None
+            try:
+                cache_folder = ENV_VENV_CACHE_PATH.get() or self.session.config.get(self._config_cache_folder, None)
+                if not cache_folder:
+                    return None
+
+                max_entries = int(self.session.config.get(self._config_cache_max_entries, 10))
+                free_space_threshold = float(self.session.config.get(self._config_cache_free_space_threshold, 0))
+                self._cache_manager = FolderCache(
+                    cache_folder, max_cache_entries=max_entries, min_free_space_gb=free_space_threshold)
+            except Exception as ex:
+                print("WARNING: Failed accessing venvs cache at {}: {}".format(cache_folder, ex))
+                print("WARNING: Skipping venv cache - folder not accessible!")
                return None

-            max_entries = int(self.session.config.get(self._config_cache_max_entries, 10))
-            free_space_threshold = float(self.session.config.get(self._config_cache_free_space_threshold, 0))
-            self._cache_manager = FolderCache(
-                cache_folder, max_cache_entries=max_entries, min_free_space_gb=free_space_threshold)
        return self._cache_manager
--- a/clearml_agent/helper/package/conda_api.py
+++ b/clearml_agent/helper/package/conda_api.py
@@ -135,7 +135,12 @@ class CondaAPI(PackageManager):
        if self.env_read_only:
            print('Conda environment in read-only mode, skipping pip upgrade.')
            return ''
-        return self._install(select_for_platform(windows='pip{}', linux='pip{}').format(self.pip.get_pip_version()))
+        return self._install(
+            *select_for_platform(
+                windows=self.pip.get_pip_versions(),
+                linux=self.pip.get_pip_versions()
+            )
+        )

    def create(self):
        """
--- a/clearml_agent/helper/package/external_req.py
+++ b/clearml_agent/helper/package/external_req.py
@@ -50,6 +50,14 @@ class ExternalRequirements(SimpleSubstitution):
                print("No need to reinstall \'{}\' from VCS, "
                      "the exact same version is already installed".format(req.name))
                continue
+
+            if not req.pip_new_version:
+                # noinspection PyBroadException
+                try:
+                    freeze_base = PackageManager.out_of_scope_freeze() or dict(pip=[])
+                except Exception:
+                    freeze_base = dict(pip=[])
+
            req_line = self._add_vcs_credentials(req, session)

            # if we have older pip version we have to make sure we replace back the package name with the
@@ -58,14 +66,14 @@ class ExternalRequirements(SimpleSubstitution):
                PackageManager.out_of_scope_install_package(req_line, "--no-deps")
                # noinspection PyBroadException
                try:
-                    freeze_post = PackageManager.out_of_scope_freeze() or ''
+                    freeze_post = PackageManager.out_of_scope_freeze() or dict(pip=[])
                    package_name = list(set(freeze_post['pip']) - set(freeze_base['pip']))
                    if package_name and package_name[0] not in self.post_install_req_lookup:
                        self.post_install_req_lookup[package_name[0]] = req.req.line
                except Exception:
                    pass

-            # no need to force reinstall, pip will always rebuilt if the package comes from git
+            # no need to force reinstall, pip will always rebuild if the package comes from git
            # and make sure the required packages are installed (if they are not it will install them)
            if not PackageManager.out_of_scope_install_package(req_line):
                raise ValueError("Failed installing GIT/HTTPs package \'{}\'".format(req_line))
@@ -86,12 +94,13 @@ class ExternalRequirements(SimpleSubstitution):
                vcs_url = vcs_url[::-1].replace(fragment[::-1], '', 1)[::-1]
                # remove ssh:// or git:// prefix for git detection and credentials
                scheme = ''
+                full_vcs_url = vcs_url
                if vcs_url and (vcs_url.startswith('ssh://') or vcs_url.startswith('git://')):
                    scheme = 'ssh://'  # notice git:// is actually ssh://
                    vcs_url = vcs_url[6:]

                from ..repo import Git
-                vcs = Git(session=session, url=vcs_url, location=None, revision=None)
+                vcs = Git(session=session, url=full_vcs_url, location=None, revision=None)
                vcs._set_ssh_url()
                new_req_line = 'git+{}{}{}'.format(
                    '' if scheme and '://' in vcs.url else scheme,
--- a/clearml_agent/helper/package/poetry_api.py
+++ b/clearml_agent/helper/package/poetry_api.py
@@ -69,6 +69,11 @@ class PoetryConfig:
                path = path.replace(':'+sys.base_prefix, ':'+sys.real_prefix, 1)
                kwargs['env']['PATH'] = path

+        if self.session and self.session.config:
+            extra_args = self.session.config.get("agent.package_manager.poetry_install_extra_args", None)
+            if extra_args:
+                args = args + tuple(extra_args)
+
        if check_if_command_exists("poetry"):
            argv = Argv("poetry", *args)
        else:
--- a/clearml_agent/helper/package/pytorch.py
+++ b/clearml_agent/helper/package/pytorch.py
@@ -13,7 +13,9 @@ import attr
 import requests

 import six
-from .requirements import SimpleSubstitution, FatalSpecsResolutionError, SimpleVersion, MarkerRequirement
+from .requirements import (
+    SimpleSubstitution, FatalSpecsResolutionError, SimpleVersion, MarkerRequirement,
+    compare_version_rules, )
 from ...external.requirements_parser.requirement import Requirement

 OS_TO_WHEEL_NAME = {"linux": "linux_x86_64", "windows": "win_amd64"}
@@ -169,6 +171,10 @@ class PytorchRequirement(SimpleSubstitution):
    name = "torch"
    packages = ("torch", "torchvision", "torchaudio", "torchcsprng", "torchtext")

+    extra_index_url_template = 'https://download.pytorch.org/whl/cu{}/'
+    nightly_extra_index_url_template = 'https://download.pytorch.org/whl/nightly/cu{}/'
+    torch_index_url_lookup = {}
+
    def __init__(self, *args, **kwargs):
        os_name = kwargs.pop("os_override", None)
        super(PytorchRequirement, self).__init__(*args, **kwargs)
@@ -183,6 +189,13 @@ class PytorchRequirement(SimpleSubstitution):
        self.exceptions = []
        self._original_req = []
        # allow override pytorch lookup pages
+        if self.config.get("agent.package_manager.extra_index_url_template", None):
+            self.extra_index_url_template = \
+                self.config.get("agent.package_manager.extra_index_url_template", None)
+        if self.config.get("agent.package_manager.nightly_extra_index_url_template", None):
+            self.nightly_extra_index_url_template = \
+                self.config.get("agent.package_manager.nightly_extra_index_url_template", None)
+        # allow override pytorch lookup pages
        if self.config.get("agent.package_manager.torch_page", None):
            SimplePytorchRequirement.page_lookup_template = \
                self.config.get("agent.package_manager.torch_page", None)
@@ -381,7 +394,8 @@ class PytorchRequirement(SimpleSubstitution):
                print('Trying PyTorch CUDA version {} support'.format(torch_url_key))

        # fix broken pytorch setuptools incompatibility
-        if closest_matched_version and SimpleVersion.compare_versions(closest_matched_version, "<", "1.11.0"):
+        if req.name == "torch" and closest_matched_version and \
+                SimpleVersion.compare_versions(closest_matched_version, "<", "1.11.0"):
            self._fix_setuptools = "setuptools < 59"

        if not url:
@@ -461,6 +475,44 @@ class PytorchRequirement(SimpleSubstitution):
        return self.match_version(req, base).replace(" ", "\n")

    def replace(self, req):
+        # check if package is already installed with system packages
+        self.validate_python_version()
+
+        # try to check if we can just use the new index URL, if we do not we will revert to old method
+        try:
+            extra_index_url = self.get_torch_index_url(self.cuda_version)
+            if extra_index_url:
+                # check if the torch version cannot be above 1.11 , we need to fix setup tools
+                try:
+                    if req.name == "torch" and not compare_version_rules(req.specs, [(">=", "1.11.0")]):
+                        self._fix_setuptools = "setuptools < 59"
+                except Exception:  # noqa
+                    pass
+                # now we just need to add the correct extra index url for the cuda version
+                self.set_add_install_extra_index(extra_index_url[0])
+
+                if req.specs and len(req.specs) == 1 and req.specs[0][0] == "==":
+                    # remove any +cu extension and let pip resolve that
+                    # and add .* if we have 3 parts version to deal with nvidia container 'a' version
+                    # i.e. "1.13.0" -> "1.13.0.*" so it should match preinstalled "1.13.0a0+936e930"
+                    spec_3_parts = req.format_specs(num_parts=3)
+                    spec_max3_parts = req.format_specs(max_num_parts=3)
+                    if spec_3_parts == spec_max3_parts and not spec_max3_parts.endswith("*"):
+                        line = "{} {}.*".format(req.name, spec_max3_parts)
+                    else:
+                        line = "{} {}".format(req.name, spec_max3_parts)
+
+                    if req.marker:
+                        line += " ; {}".format(req.marker)
+                else:
+                    # return the original line
+                    line = req.line
+
+                return line
+
+        except Exception:  # noqa
+            pass
+
        try:
            new_req = self._replace(req)
            if new_req:
@@ -556,6 +608,51 @@ class PytorchRequirement(SimpleSubstitution):
            return MarkerRequirement(Requirement.parse(self._fix_setuptools))
        return None

+    @classmethod
+    def get_torch_index_url(cls, cuda_version, nightly=False):
+        """
+        Resolve the PyTorch pip index page for a CUDA version, probing at most 15 versions downward.
+
+        :param cuda_version: CUDA version as an int or int-like string (anything else is treated as 0)
+        :param nightly: if True probe the nightly index template and skip the cached lookup
+        :return: tuple (index_url, matched_cuda_version), or None when no index page could be resolved
+        """
+        # noinspection PyBroadException
+        try:
+            cuda = int(cuda_version)
+        except Exception:
+            cuda = 0
+
+        # first check if key is valid (nightly requests always probe the remote index)
+        if not nightly and cuda in cls.torch_index_url_lookup:
+            return cls.torch_index_url_lookup[cuda], cuda
+
+        # try the requested cuda version page, then walk down to lower versions
+        template = cls.nightly_extra_index_url_template if nightly else cls.extra_index_url_template
+        for c in range(cuda, max(-1, cuda-15), -1):
+            torch_url = template.format(c)
+            # noinspection PyBroadException
+            try:
+                if requests.get(torch_url, timeout=10).ok:
+                    print('Torch {}CUDA {} index page found'.format('nightly ' if nightly else '', c))
+                    cls.torch_index_url_lookup[c] = torch_url
+                    return cls.torch_index_url_lookup[c], c
+            except Exception:
+                # best effort: an unreachable page just means we try the next version
+                pass
+        if nightly:
+            return None
+
+        # nothing reachable: fall back to the closest already-resolved version at or below the request
+        keys = sorted(cls.torch_index_url_lookup.keys(), reverse=True)
+        for k in keys:
+            if k <= cuda:
+                return cls.torch_index_url_lookup[k], k
+        # default - zero, but only if it was resolved: an empty lookup used to raise KeyError here
+        if 0 in cls.torch_index_url_lookup:
+            return cls.torch_index_url_lookup[0], 0
+        return None
+
    MAP = {
        "windows": {
            "cuda100": {
--- a/clearml_agent/helper/package/requirements.py
+++ b/clearml_agent/helper/package/requirements.py
@@ -11,7 +11,7 @@ from os import path
 from typing import Text, List, Type, Optional, Tuple, Dict

 from pathlib2 import Path
-from pyhocon import ConfigTree
+from clearml_agent.external.pyhocon import ConfigTree

 import six
 from six.moves.urllib.parse import unquote
@@ -100,7 +100,8 @@ class MarkerRequirement(object):
            return ','.join(starmap(operator.add, self.specs))

        op, version = self.specs[0]
-        for v in self._sub_versions_pep440:
+        # noinspection PyProtectedMember
+        for v in SimpleVersion._sub_versions_pep440:
            version = version.replace(v, '.')
        if num_parts:
            version = (version.strip('.').split('.') + ['0'] * num_parts)[:max_num_parts]
@@ -278,6 +279,8 @@ class SimpleVersion:
            return version_a_key > version_b_key
        if op == '<':
            return version_a_key < version_b_key
+        if op == '!=':
+            return version_a_key != version_b_key
        raise ValueError('Unrecognized comparison operator [{}]'.format(op))

    @classmethod
@@ -362,7 +365,7 @@ def compare_version_rules(specs_a, specs_b):
    # specs_a/b are a list of tuples: [('==', '1.2.3'), ] or [('>=', '1.2'), ('<', '1.3')]
    # section definition:
    class Section(object):
-        def __init__(self, left=None, left_eq=False, right=None, right_eq=False):
+        def __init__(self, left="-999999999", left_eq=False, right="999999999", right_eq=False):
            self.left, self.left_eq, self.right, self.right_eq = left, left_eq, right, right_eq
    # first create a list of in/out sections for each spec
    # >, >= are left rule
@@ -434,6 +437,11 @@ class RequirementSubstitution(object):

    _pip_extra_index_url = PIP_EXTRA_INDICES

+    @classmethod
+    def set_add_install_extra_index(cls, extra_index_url):  # add a pip extra index URL to the class-level list, once
+        if extra_index_url not in cls._pip_extra_index_url:
+            cls._pip_extra_index_url.append(extra_index_url)
+
    def __init__(self, session):
        # type: (Session) -> ()
        self._session = session
--- a/clearml_agent/helper/package/translator.py
+++ b/clearml_agent/helper/package/translator.py
@@ -1,3 +1,4 @@
+from tempfile import mkdtemp
 from typing import Text

 from furl import furl
@@ -20,7 +21,16 @@ class RequirementsTranslator(object):
        config = session.config
        self.cache_dir = cache_dir or Path(config["agent.pip_download_cache.path"]).expanduser().as_posix()
        self.enabled = config["agent.pip_download_cache.enabled"]
-        Path(self.cache_dir).mkdir(parents=True, exist_ok=True)
+        # noinspection PyBroadException
+        try:
+            Path(self.cache_dir).mkdir(parents=True, exist_ok=True)
+        except Exception:
+            temp_cache_folder = mkdtemp(prefix='pip_download_cache.')
+            print("Failed creating pip download cache folder at `{}` reverting to `{}`".format(
+                self.cache_dir, temp_cache_folder))
+            self.cache_dir = temp_cache_folder
+            Path(self.cache_dir).mkdir(parents=True, exist_ok=True)
+
        self.config = Config()
        self.pip = SystemPip(interpreter=interpreter, session=self._session)
        self._translate_back = {}
--- a/clearml_agent/helper/process.py
+++ b/clearml_agent/helper/process.py
@@ -16,7 +16,6 @@ from typing import Union, Text, Sequence, Any, TypeVar, Callable

 import psutil
 from furl import furl
-from future.builtins import super
 from pathlib2 import Path

 import six
@@ -26,7 +25,7 @@ from clearml_agent.helper.base import bash_c, is_windows_platform, select_for_pl
 PathLike = Union[Text, Path]


-def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
+def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False, raise_error=False):
    try:
        output = (
            subprocess.check_output(
@@ -38,10 +37,16 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
            .strip()
        )
    except subprocess.CalledProcessError:
+        if raise_error:
+            raise
        output = None
    return output if not strip or not output else output.strip()


+def stringify_bash_output(value):  # normalize command output to str: '' when falsy, utf-8 decode when not a str
+    return (value if isinstance(value, str) else value.decode('utf-8')) if value else ''
+
+
 def terminate_process(pid, timeout=10., ignore_zombie=True, include_children=False):
    # noinspection PyBroadException
    try:
@@ -112,10 +117,11 @@ def terminate_all_child_processes(pid=None, timeout=10., include_parent=True):


 def get_docker_id(docker_cmd_contains):
+    # noinspection PyBroadException
    try:
        containers_running = get_bash_output(cmd='docker ps --no-trunc --format \"{{.ID}}: {{.Command}}\"')
        for docker_line in containers_running.split('\n'):
-            parts = docker_line.split(':')
+            parts = docker_line.split(':', 1)
            if docker_cmd_contains in parts[-1]:
                # we found our docker, return it
                return parts[0]
--- a/clearml_agent/helper/resource_monitor.py
+++ b/clearml_agent/helper/resource_monitor.py
@@ -92,9 +92,10 @@ class ResourceMonitor(object):
            # None means no filtering, report all gpus
            self._active_gpus = None
            try:
-                active_gpus = Session.get_nvidia_visible_env() or ""
-                if active_gpus:
-                    self._active_gpus = [g.strip() for g in active_gpus.split(',')]
+                active_gpus = Session.get_nvidia_visible_env()
+                # None means no filtering, report all gpus
+                if active_gpus and active_gpus != "all":
+                    self._active_gpus = [g.strip() for g in str(active_gpus).split(',')]
            except Exception:
                pass

--- a/clearml_agent/session.py
+++ b/clearml_agent/session.py
@@ -10,8 +10,8 @@ from typing import Any, Callable

 import attr
 from pathlib2 import Path
-from pyhocon import ConfigFactory, HOCONConverter, ConfigTree

+from clearml_agent.external.pyhocon import ConfigFactory, HOCONConverter, ConfigTree
 from clearml_agent.backend_api.session import Session as _Session, Request
 from clearml_agent.backend_api.session.client import APIClient
 from clearml_agent.backend_config.defs import LOCAL_CONFIG_FILE_OVERRIDE_VAR, LOCAL_CONFIG_FILES
@@ -19,6 +19,7 @@ from clearml_agent.definitions import ENVIRONMENT_CONFIG, ENV_TASK_EXECUTE_AS_US
 from clearml_agent.errors import APIError
 from clearml_agent.helper.base import HOCONEncoder
 from clearml_agent.helper.process import Argv
+from clearml_agent.helper.docker_args import DockerArgsSanitizer
 from .version import __version__

 POETRY = "poetry"
@@ -105,7 +106,7 @@ class Session(_Session):
                if os.path.exists(os.path.expanduser(os.path.expandvars(f))):
                    self._config_file = f
                    break
-        self.api_client = APIClient(session=self, api_version="2.5")
+        self._api_client = None
        # HACK make sure we have python version to execute,
        # if nothing was specific, use the one that runs us
        def_python = ConfigValue(self.config, "agent.default_python")
@@ -132,7 +133,7 @@ class Session(_Session):
        # override with environment variables
        # cuda_version & cudnn_version are overridden with os.environ here, and normalized in the next section
        for config_key, env_config in ENVIRONMENT_CONFIG.items():
-            # check if the propery is of a list:
+            # check if the property is of a list:
            if config_key.endswith('.0'):
                if all(not i.get() for i in env_config.values()):
                    continue
@@ -166,6 +167,16 @@ class Session(_Session):
        if not kwargs.get('only_load_config'):
            self.create_cache_folders()

+    @property
+    def api_client(self):  # lazily built: the APIClient is only created on first access
+        if self._api_client is None:
+            self._api_client = APIClient(session=self, api_version="2.5")
+        return self._api_client
+
+    @api_client.setter
+    def api_client(self, value):  # lets callers replace the client instance
+        self._api_client = value
+
    @staticmethod
    def get_logger(name):
        logger = logging.getLogger(name)
@@ -232,7 +243,8 @@ class Session(_Session):
    def print_configuration(
            self,
            remove_secret_keys=("secret", "pass", "token", "account_key", "contents"),
-            skip_value_keys=("environment", )
+            skip_value_keys=("environment", ),
+            docker_args_sanitize_keys=("extra_docker_arguments", ),
    ):
        # remove all the secrets from the print
        def recursive_remove_secrets(dictionary, secret_keys=(), empty_keys=()):
@@ -249,6 +261,8 @@ class Session(_Session):
                if isinstance(dictionary.get(k, None), dict):
                    recursive_remove_secrets(dictionary[k], secret_keys=secret_keys, empty_keys=empty_keys)
                elif isinstance(dictionary.get(k, None), (list, tuple)):
+                    if k in (docker_args_sanitize_keys or []):
+                        dictionary[k] = DockerArgsSanitizer.sanitize_docker_command(self, dictionary[k])
                    for item in dictionary[k]:
                        if isinstance(item, dict):
                            recursive_remove_secrets(item, secret_keys=secret_keys, empty_keys=empty_keys)
@@ -256,7 +270,7 @@ class Session(_Session):
        config = deepcopy(self.config.to_dict())
        # remove the env variable, it's not important
        config.pop('env', None)
-        if remove_secret_keys or skip_value_keys:
+        if remove_secret_keys or skip_value_keys or docker_args_sanitize_keys:
            recursive_remove_secrets(config, secret_keys=remove_secret_keys, empty_keys=skip_value_keys)
        # remove logging.loggers.urllib3.level from the print
        try:
--- a/clearml_agent/version.py
+++ b/clearml_agent/version.py
@@ -1 +1 @@
-__version__ = '1.4.0'
+__version__ = '1.5.2'
--- a/docker/k8s-glue/build-resources/clearml.conf
+++ b/docker/k8s-glue/build-resources/clearml.conf
@@ -57,8 +57,8 @@ agent {
        # supported options: pip, conda, poetry
        type: pip,

-        # specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
-        pip_version: "<20.2",
+        # specify pip version to use (examples "<20.2", "==19.3.1", "", empty string will install the latest version)
+        pip_version: "<21",

        # virtual environment inheres packages from system
        system_site_packages: false,
--- a/docker/services/entrypoint.sh
+++ b/docker/services/entrypoint.sh
@@ -1,16 +1,36 @@
-#!/bin/sh
+#!/bin/bash +x

-CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}
+if [ -n "$SHUTDOWN_IF_NO_ACCESS_KEY" ] && [ -z "$CLEARML_API_ACCESS_KEY" ] && [ -z "$TRAINS_API_ACCESS_KEY" ]; then
+  echo "CLEARML_API_ACCESS_KEY was not provided, service will not be started"
+  exit 0
+fi
+
+export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-$TRAINS_FILES_HOST}

 if [ -z "$CLEARML_FILES_HOST" ]; then
    CLEARML_HOST_IP=${CLEARML_HOST_IP:-${TRAINS_HOST_IP:-$(curl -s https://ifconfig.me/ip)}}
 fi

-CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
-CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
-CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}
+export CLEARML_FILES_HOST=${CLEARML_FILES_HOST:-${TRAINS_FILES_HOST:-"http://$CLEARML_HOST_IP:8081"}}
+export CLEARML_WEB_HOST=${CLEARML_WEB_HOST:-${TRAINS_WEB_HOST:-"http://$CLEARML_HOST_IP:8080"}}
+export CLEARML_API_HOST=${CLEARML_API_HOST:-${TRAINS_API_HOST:-"http://$CLEARML_HOST_IP:8008"}}

 echo $CLEARML_FILES_HOST $CLEARML_WEB_HOST $CLEARML_API_HOST 1>&2

-python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
-clearml-agent daemon --services-mode --queue services --create-queue --docker "${CLEARML_AGENT_DEFAULT_BASE_DOCKER:-$TRAINS_AGENT_DEFAULT_BASE_DOCKER}" --cpu-only ${CLEARML_AGENT_EXTRA_ARGS:-$TRAINS_AGENT_EXTRA_ARGS}
+if [[ "$CLEARML_AGENT_UPDATE_VERSION" =~ ^[0-9]{1,3}\.[0-9]{1,3}(\.[0-9]{1,3}([a-zA-Z]{1,3}[0-9]{1,3})?)?$ ]]
+then
+    CLEARML_AGENT_UPDATE_VERSION="==$CLEARML_AGENT_UPDATE_VERSION"
+fi
+
+DAEMON_OPTIONS=${CLEARML_AGENT_DAEMON_OPTIONS:---services-mode --create-queue}
+QUEUES=${CLEARML_AGENT_QUEUES:-services}
+
+if [ -z "$CLEARML_AGENT_NO_UPDATE" ]; then
+  if [ -n "$CLEARML_AGENT_UPDATE_REPO" ]; then
+    python3 -m pip install -q -U $CLEARML_AGENT_UPDATE_REPO
+  else
+    python3 -m pip install -q -U "clearml-agent${CLEARML_AGENT_UPDATE_VERSION:-$TRAINS_AGENT_UPDATE_VERSION}"
+  fi
+fi
+
+clearml-agent daemon $DAEMON_OPTIONS --queue $QUEUES --docker "${CLEARML_AGENT_DEFAULT_BASE_DOCKER:-$TRAINS_AGENT_DEFAULT_BASE_DOCKER}" --cpu-only ${CLEARML_AGENT_EXTRA_ARGS:-$TRAINS_AGENT_EXTRA_ARGS}
--- a/docs/clearml.conf
+++ b/docs/clearml.conf
@@ -13,6 +13,15 @@ api {
 }

 agent {
+    # unique name of this worker, if None, created based on hostname:process_id
+    # Override with os environment: CLEARML_WORKER_ID
+    # worker_id: "clearml-agent-machine1:gpu0"
+    worker_id: ""
+
+    # worker name, replaces the hostname when creating a unique name for this worker
+    # Override with os environment: CLEARML_WORKER_NAME
+    # worker_name: "clearml-agent-machine1"
+    worker_name: ""
    # Set GIT user/pass credentials (if user/pass are set, GIT protocol will be set to https)
    # leave blank for GIT SSH credentials (set force_git_ssh_protocol=true to force SSH protocol)
    # **Notice**: GitHub personal token is equivalent to password, you can put it directly into `git_pass`
@@ -20,11 +29,11 @@ agent {
    # https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
    # https://support.atlassian.com/bitbucket-cloud/docs/app-passwords/
    # https://docs.gitlab.com/ee/user/profile/personal_access_tokens.html
-    git_user=""
-    git_pass=""
+    # git_user: ""
+    # git_pass: ""
    # Limit credentials to a single domain, for example: github.com,
    # all other domains will use public access (no user/pass). Default: always send user/pass for any VCS domain
-    git_host=""
+    # git_host: ""

    # Force GIT protocol to use SSH regardless of the git url (Assumes GIT user/pass are blank)
    force_git_ssh_protocol: false
@@ -33,16 +42,6 @@ agent {
    # Force a specific SSH username when converting http to ssh links (the default username is 'git')
    # force_git_ssh_user: git

-    # unique name of this worker, if None, created based on hostname:process_id
-    # Overridden with os environment: CLEARML_WORKER_ID
-    # worker_id: "clearml-agent-machine1:gpu0"
-    worker_id: ""
-
-    # worker name, replaces the hostname when creating a unique name for this worker
-    # Overridden with os environment: CLEARML_WORKER_NAME
-    # worker_name: "clearml-agent-machine1"
-    worker_name: ""
-
    # Set the python version to use when creating the virtual environment and launching the experiment
    # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
    # The default is the python executing the clearml_agent
@@ -51,6 +50,22 @@ agent {
    # specific python version and the system supports multiple python the agent will use the requested python version)
    # ignore_requested_python_version: true

+    # Force the root folder of the git repository (instead of the working directory) into the PYTHONPATH
+    # default false, only the working directory will be added to the PYTHONPATH
+    # force_git_root_python_path: false
+
+    # if set, use GIT_ASKPASS to pass user/pass when cloning / fetching repositories
+    # it solves passing user/token to git submodules.
+    # this is a safer way to ensure multiple users using the same repository will
+    # not accidentally leak credentials
+    # Only supported on Linux systems, it will be the default in future releases
+    # enable_git_ask_pass: false
+
+    # in docker mode, if container's entrypoint automatically activated a virtual environment
+    # use the activated virtual environment and install everything there
+    # set to False to disable, and always create a new venv inheriting from the system_site_packages
+    # docker_use_activated_venv: true
+
    # select python package manager:
    # currently supported: pip, conda and poetry
    # if "pip" or "conda" are used, the agent installs the required packages
@@ -63,10 +78,11 @@ agent {
        # supported options: pip, conda, poetry
        type: pip,

-        # specify pip version to use (examples "<20", "==19.3.1", "", empty string will install the latest version)
-        # pip_version: "<20"
+        # specify pip version to use (examples "<20.2", "==19.3.1", "", empty string will install the latest version)
+        # pip_version: ["<20.2 ; python_version < '3.10'",  "<22.3 ; python_version >= '3.10'"]
        # specify poetry version to use (examples "<2", "==1.1.1", "", empty string will install the latest version)
        # poetry_version: "<2",
+        # poetry_install_extra_args: ["-v"]

        # virtual environment inheres packages from system
        system_site_packages: false,
@@ -113,7 +129,7 @@ agent {
        # minimum required free space to allow for cache entry, disable by passing 0 or negative value
        free_space_threshold_gb: 2.0
        # unmark to enable virtual environment caching
-        # path: ~/.clearml/venvs-cache
+        path: ~/.clearml/venvs-cache
    },

    # cached git clone folder
@@ -136,6 +152,12 @@ agent {
    },

    translate_ssh: true,
+
+    # set "disable_ssh_mount: true" to disable the automatic mount of ~/.ssh folder into the docker containers
+    # default is false, automatically mounts ~/.ssh
+    # Must be set to True if using "clearml-session" with this agent!
+    # disable_ssh_mount: false
+
    # reload configuration file every daemon execution
    reload_config: false,

@@ -219,7 +241,7 @@ agent {
    enable_task_env: false

    # CUDA versions used for Conda setup & solving PyTorch wheel packages
-    # it Should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
+    # Should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
    # cuda_version: 10.1
    # cudnn_version: 7.6

@@ -423,42 +445,46 @@ sdk {

    # Apply top-level environment section from configuration into os.environ
    apply_environment: true
-    # Top-level environment section is in the form of:
-    #   environment {
-    #     key: value
-    #     ...
-    #   }
-    # and is applied to the OS environment as `key=value` for each key/value pair
-
    # Apply top-level files section from configuration into local file system
    apply_files: true
-    # Top-level files section allows auto-generating files at designated paths with a predefined contents
-    # and target format. Options include:
-    #  contents: the target file's content, typically a string (or any base type int/float/list/dict etc.)
-    #  format: a custom format for the contents. Currently supported value is `base64` to automatically decode a
-    #          base64-encoded contents string, otherwise ignored
-    #  path: the target file's path, may include ~ and inplace env vars
-    #  target_format: format used to encode contents before writing into the target file. Supported values are json,
-    #                 yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
-    #  overwrite: overwrite the target file in case it exists. Default is true.
-    #
-    # Example:
-    #   files {
-    #     myfile1 {
-    #       contents: "The quick brown fox jumped over the lazy dog"
-    #       path: "/tmp/fox.txt"
-    #     }
-    #     myjsonfile {
-    #       contents: {
-    #         some {
-    #           nested {
-    #             value: [1, 2, 3, 4]
-    #           }
-    #         }
-    #       }
-    #       path: "/tmp/test.json"
-    #       target_format: json
-    #     }
-    #   }
 }

+# Environment section (top-level) is applied to the OS environment as `key=value` for each key/value pair
+# * enable/disable with `agent.apply_environment` OR `sdk.apply_environment`
+# Example:
+#
+#   environment {
+#     key_a: value_a
+#     key_b: value_b
+#   }
+
+# Files section (top-level) allows auto-generating files at designated paths with
+# predefined content and target format.
+# * enable/disable with `agent.apply_files` OR `sdk.apply_files`
+# Files content options include:
+#  contents: the target file's content, typically a string (or any base type int/float/list/dict etc.)
+#  format: a custom format for the contents. Currently supported value is `base64` to automatically decode a
+#          base64-encoded contents string, otherwise ignored
+#  path: the target file's path, may include ~ and inplace env vars
+#  target_format: format used to encode contents before writing into the target file. Supported values are json,
+#                 yaml, yml and bytes (in which case the file will be written in binary mode). Default is text mode.
+#  overwrite: overwrite the target file in case it exists. Default is true.
+#
+# Example:
+#   files {
+#     myfile1 {
+#       contents: "The quick brown fox jumped over the lazy dog"
+#       path: "/tmp/fox.txt"
+#     }
+#     myjsonfile {
+#       contents: {
+#         some {
+#           nested {
+#             value: [1, 2, 3, 4]
+#           }
+#         }
+#       }
+#       path: "/tmp/test.json"
+#       target_format: json
+#     }
+#   }
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,17 +1,15 @@
-attrs>=18.0,<20.4.0
+attrs>=18.0,<23.0.0
 enum34>=0.9,<1.2.0 ; python_version < '3.6'
 furl>=2.0.0,<2.2.0
-future>=0.16.0,<0.19.0
-jsonschema>=2.6.0,<3.3.0
+jsonschema>=2.6.0,<5.0.0
 pathlib2>=2.3.0,<2.4.0
-psutil>=3.4.2,<5.9.0
-pyhocon>=0.3.38,<0.4.0
-pyparsing>=2.0.3,<2.5.0
+psutil>=3.4.2,<5.10.0
+pyparsing>=2.0.3,<3.1.0
 python-dateutil>=2.4.2,<2.9.0
-pyjwt>=2.4.0,<2.5.0
-PyYAML>=3.12,<5.5.0
-requests>=2.20.0,<2.26.0
-six>=1.13.0,<1.16.0
+pyjwt>=2.4.0,<2.7.0
+PyYAML>=3.12,<6.1
+requests>=2.20.0,<2.29.0
+six>=1.13.0,<1.17.0
 typing>=3.6.4,<3.8.0 ; python_version < '3.5'
 urllib3>=1.21.1,<1.27.0
 virtualenv>=16,<21
Author	SHA1	Message	Date
allegroai	3fe92a92ba	Version bump to v1.5.2	2023-03-29 12:49:33 +03:00
allegroai	154db59ce6	Add agent.package_manager.poetry_install_extra_args configuration option	2023-03-28 14:37:48 +03:00
allegroai	afffa83063	Fix git+ssh:// links inside installed packages not being converted properly to https authenticated links	2023-03-28 14:35:51 +03:00
allegroai	787c7d88bb	Fix additional poetry cwd support feature	2023-03-28 14:35:41 +03:00
allegroai	667c2ced3d	Fix very old pip version support (<20)	2023-03-28 14:34:19 +03:00
allegroai	7f5b3c8df4	Fix None config file in session causes k8s agent to raise exception	2023-03-28 14:33:55 +03:00
allegroai	46ded2864d	Fix restart feature should be tested against agent session	2023-03-28 14:33:33 +03:00
allegroai	40456be948	Black formatting Refactor path support	2023-03-05 18:05:00 +02:00
allegroai	8d51aed679	Protect against cache folders without permission	2023-03-05 18:05:00 +02:00
allegroai	bfc4ba38cd	Fix torch inside nvidia containers to use preinstalled version (i.e. ==x.y.z.* matching)	2023-03-05 18:05:00 +02:00
Niels ten Boom	3cedc104df	Add poetry cwd support (#142) Closes #138	2023-03-05 14:19:57 +02:00
Marijan Smetko	b367c80477	Switch entrypoint shell from `sh` to `bash` (#141)	2023-02-28 21:55:16 +02:00
allegroai	262b6d3a00	Update services agent entrypoint	2023-02-05 10:40:02 +02:00
allegroai	95e996bfda	Reintroduce `CLEARML_AGENT_SERVICES_DOCKER_RESTART` accidentally reverted by a previous merge	2023-02-05 10:34:38 +02:00
allegroai	b6d132b226	Fix build fails when target is relative path	2023-02-05 10:33:32 +02:00
allegroai	4f17a2c17d	Fix K8s glue does not delete pending pods if the tasks they represent were aborted	2023-02-05 10:32:16 +02:00
allegroai	00e8e9eb5a	Do not allow request exceptions (only on the initial login call)	2023-02-05 10:30:45 +02:00
allegroai	af6a77918f	Fix `_` is allowed in k8s label names	2023-02-05 10:29:48 +02:00
allegroai	855622fd30	Support custom service on `Worker.get()` calls	2023-02-05 10:29:09 +02:00
allegroai	8cd12810f3	Fix login uses GET with payload which breaks when trying to connect a server running in GCP	2023-02-05 10:28:41 +02:00
achaiah	ebb955187d	Fix agent update version (#132) * Fix agent update version Pip install command is missing the '==' to execute successfully * allow for fuzzy and direct version spec adding logic to allow for flexible version specification * Added regex to parse 1.2.3rc4 patterns	2023-01-08 19:10:26 +02:00
pollfly	85e1fadf9b	Fix typos (#131)	2022-12-28 19:39:59 +02:00
allegroai	249b51a31b	Version bump	2022-12-13 15:29:10 +02:00
allegroai	da19ef26c4	Fix pinging running task (and change default to once a minute)	2022-12-13 15:26:26 +02:00
allegroai	f69e16ea9d	Fix `clearml-agent build --docker` stuck on certain containers	2022-12-13 15:24:32 +02:00
allegroai	efa1f71dac	Version bump to v1.5.1	2022-12-10 22:18:21 +02:00
allegroai	692cb8cf13	Update six requirements	2022-12-10 22:18:10 +02:00
allegroai	ebdc215632	Remove `"` from pip commands in venv	2022-12-10 20:58:30 +02:00
allegroai	b2da639582	Add `CLEARML_AGENT_FORCE_SYSTEM_SITE_PACKAGES` env var (default true) to allow overriding default "system_site_packages: true" behavior when running tasks in containers (docker mode and k8s-glue)	2022-12-10 20:00:46 +02:00
allegroai	71fdb43f10	Version bump to v1.5.1rc0	2022-12-07 22:09:40 +02:00
allegroai	ca2791c65e	Fix pip support allowing multiple pip version constraints (by default, one for <PY3.10 and one for >=PY3.10)	2022-12-07 22:09:25 +02:00
allegroai	dd75cedaab	Upgrade requirements for attrs, jsonschema, pyparsing and pyjwt	2022-12-07 22:08:15 +02:00
allegroai	669fb1a6e5	Fix using deprecated types validator argument raises an error (deprecated even before jsonschema 3.0.0 and unsupported since 4.0.0)	2022-12-07 22:07:53 +02:00
allegroai	5d517c91b5	Add `agent.disable_task_docker_override` configuration option to disable docker override specified in executing tasks	2022-12-07 22:07:11 +02:00
allegroai	6be75abc86	Add default output URI selection to "clearml-agent init"	2022-12-07 22:06:10 +02:00
allegroai	4c777fa2ee	Version bump to v1.5.0	2022-12-05 16:42:44 +02:00
allegroai	dc5e0033c8	Remove support for `kubectl run` Allow customizing pod name prefix and limit pod label Return deleted pods from cleanup Some refactoring	2022-12-05 11:40:19 +02:00
allegroai	3dd5973734	Filter by phase when detecting hanging pods More debug print-outs Use task session when possible Push task into k8s scheduler queue only if running from the same tenant Make sure we pass git_user/pass to the task pod Fix cleanup command not issued when no pods exist in a multi-queue setup	2022-12-05 11:29:59 +02:00
allegroai	53d379205f	Support `raise_error` in `get_bash_output()`	2022-12-05 11:26:40 +02:00
allegroai	57cde21c48	Send `task.ping` for executing tasks every 120 seconds (set using the `agent.task_ping_interval_sec` configuration option)	2022-12-05 11:22:25 +02:00
allegroai	396abf13b6	Fix `get_task_session()` may cause an old copy of the `APIClient` to be used containing a reference to the previous session	2022-12-05 11:20:32 +02:00
allegroai	6e7fb5f331	Fix sending task logs fails when agent is not running in the same tenant	2022-12-05 11:19:14 +02:00
allegroai	1d5c118b70	Fix setting `CLEARML_API_DEFAULT_REQ_METHOD` raises an error	2022-12-05 11:18:12 +02:00
allegroai	18612aac4d	Improve configuration examples	2022-12-05 11:17:27 +02:00
allegroai	76c533a2e8	Fix access to config object	2022-11-11 13:34:17 +02:00
Niels ten Boom	9eee213683	Add option to crash agent on exception using `agent.crash_on_exception` configuration setting (#123)	2022-11-06 17:15:39 +02:00
allegroai	e4861fc0fb	Add missing settings in clearml.conf	2022-11-06 12:36:01 +02:00
allegroai	53ef984065	Update README	2022-11-06 11:53:16 +02:00
allegroai	26e62da1a8	version bump to 1.5.0rc0	2022-10-23 13:04:00 +03:00
allegroai	d2f3614ab0	Add support for CLEARML_AGENT_DOCKER_ARGS_HIDE_ENV environment variable (see `agent.hide_docker_command_env_vars` config option)	2022-10-23 13:04:00 +03:00
allegroai	c6d767bd64	Make venv caching the default behavior	2022-10-23 13:04:00 +03:00
allegroai	efb06891a8	Add support for PyTorch new extra_index_url repo support. We will find the correct index url based on the cuda version, and let pip do the rest.	2022-10-23 13:04:00 +03:00
allegroai	70771b12a9	Remove unused code	2022-10-23 13:04:00 +03:00
allegroai	3f7a4840cc	Add support for operator != in package version (mostly for pytorch resolving)	2022-10-23 13:04:00 +03:00
allegroai	e28048dc25	Change default pip version used to "pip<21" for better Python 3.10 support	2022-10-23 13:04:00 +03:00
allegroai	2ef5d38b32	Remove future (Python 2 is not supported for clearml-agent)	2022-10-23 13:03:59 +03:00
allegroai	d216d70cdf	Upgrade packages for better Python 3.10 support	2022-10-23 13:03:59 +03:00
allegroai	0de10345f7	Moved pyhocon to internal packages	2022-10-23 13:03:59 +03:00
allegroai	a243fa211f	Improve venv cache disabled message	2022-10-23 13:03:59 +03:00
allegroai	d794b047be	Fix system_site_packages is not turned on in k8s glue	2022-10-23 13:03:59 +03:00
allegroai	f0fd62a28f	Fix docker extra args showing up in configuration printout	2022-10-23 13:03:59 +03:00
allegroai	e8493d3807	Refactor override configuration to a method	2022-10-23 13:03:58 +03:00
Allegro AI	5353e9c44d	Update README.md	2022-10-19 02:47:10 +03:00
Allegro AI	75f5814f9f	Update README.md	2022-10-19 02:44:53 +03:00
Allegro AI	94b8b5520d	Update README.md	2022-10-19 02:18:56 +03:00
allegroai	42450dcbc4	Update clearml.conf	2022-10-07 15:33:19 +03:00
allegroai	ef47225d41	Version bump to v1.4.1	2022-10-07 15:27:49 +03:00
allegroai	e61accefb9	PEP8 + refactor	2022-10-07 15:26:31 +03:00
allegroai	5c1543d112	Add `agent.disable_ssh_mount` configuration option (same as `CLEARML_AGENT_DISABLE_SSH_MOUNT` env var)	2022-10-07 15:24:39 +03:00
allegroai	7ff6aee20c	Add warning if venv cache is disabled	2022-10-07 15:23:10 +03:00
allegroai	37ea381d98	Add support for docker args filters	2022-10-07 15:22:42 +03:00
allegroai	67fc884895	Fix `--gpus all` not reporting GPU stats on worker machine	2022-10-07 15:22:13 +03:00
allegroai	1e3646b57c	Fix docker command for monitoring child agents	2022-10-07 15:21:32 +03:00