From e60a6f9d14c0eb2064185b3a80e650ca050c5a60 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Sun, 25 Apr 2021 10:46:43 +0300
Subject: [PATCH 01/21] Fix --stop support for dynamic gpus

---
 clearml_agent/commands/worker.py | 47 ++++++++++++++++++--------------
 clearml_agent/helper/process.py  | 23 ++++++++++++++--
 2 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py
index c08b07f..3de4fb0 100644
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -90,7 +90,7 @@ from clearml_agent.helper.process import (
     get_bash_output,
     shutdown_docker_process,
     get_docker_id,
-    commit_docker, terminate_process, check_if_command_exists,
+    commit_docker, terminate_process, check_if_command_exists, terminate_all_child_processes,
 )
 from clearml_agent.helper.package.priority_req import PriorityPackageRequirement, PackageCollectorRequirement
 from clearml_agent.helper.repo import clone_repository_cached, RepoInfo, VCS, fix_package_import_diff_patch
@@ -685,6 +685,9 @@ class Worker(ServiceCommandSection):
                     shutdown_docker_process(docker_cmd_contains='--id {}\'\"'.format(task_id))
                 safe_remove_file(temp_stdout_name)
                 safe_remove_file(temp_stderr_name)
+                if self._services_mode and status == ExitStatus.interrupted:
+                    # unregister this worker, it was killed
+                    self._unregister()
 
     def run_tasks_loop(self, queues, worker_params, priority_order=True, gpu_indexes=None, gpu_queues=None):
         """
@@ -719,6 +722,7 @@ class Worker(ServiceCommandSection):
 
         # get current running instances
         available_gpus = None
+        dynamic_gpus_worker_id = None
         if gpu_indexes and gpu_queues:
             available_gpus, gpu_queues = self._setup_dynamic_gpus(gpu_queues)
             # multi instance support
@@ -812,7 +816,7 @@ class Worker(ServiceCommandSection):
                         self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id))
 
                         org_gpus = os.environ.get('NVIDIA_VISIBLE_DEVICES')
-                        worker_id = self.worker_id
+                        dynamic_gpus_worker_id = self.worker_id
                         # the following is only executed in dynamic gpus mode
                         if gpu_queues and gpu_queues.get(queue):
                             # pick the first available GPUs
@@ -836,7 +840,7 @@ class Worker(ServiceCommandSection):
                         self.run_one_task(queue, task_id, worker_params)
 
                         if gpu_queues:
-                            self.worker_id = worker_id
+                            self.worker_id = dynamic_gpus_worker_id
                             os.environ['CUDA_VISIBLE_DEVICES'] = \
                                 os.environ['NVIDIA_VISIBLE_DEVICES'] = org_gpus
 
@@ -864,18 +868,18 @@ class Worker(ServiceCommandSection):
                     if shutdown_docker_process(docker_cmd_contains='--id {}\'\"'.format(t_id)):
                         self.handle_task_termination(task_id=t_id, exit_code=0, stop_reason=TaskStopReason.stopped)
             else:
+                # if we are in dynamic gpus / services mode,
+                # we should send termination signal to all child processes
+                if self._services_mode:
+                    terminate_all_child_processes(timeout=120)
+
                 # if we are here, just kill all sub processes
                 kill_all_child_processes()
-                for t_id in set(list_task_gpus_ids.values()):
-                    # check if Task is running,
-                    task_info = get_task(
-                        self._session, t_id, only_fields=["status"]
-                    )
-                    # this is a bit risky we might have rerun it again after it already completed
-                    # basically we are not removing completed tasks from the list, hence the issue
-                    if str(task_info.status) == "in_progress":
-                        self.handle_task_termination(
-                            task_id=t_id, exit_code=0, stop_reason=TaskStopReason.stopped)
+
+            # unregister dynamic GPU worker, if we were terminated while setting up a Task
+            if dynamic_gpus_worker_id:
+                self.worker_id = dynamic_gpus_worker_id
+                self._unregister()
 
     def _dynamic_gpu_get_available(self, gpu_indexes):
         # noinspection PyBroadException
@@ -1792,15 +1796,16 @@ class Worker(ServiceCommandSection):
                 debug=self._session.debug_mode,
                 trace=self._session.trace,
             )
-            self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id))
-            self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
+            try:
+                self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id))
+                self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
+            finally:
+                self.stop_monitor()
+                self._unregister()
 
-            self.stop_monitor()
-            self._unregister()
-
-            if full_monitoring and self.temp_config_path:
-                safe_remove_file(self._session.config_file)
-                Singleton.close_pid_file()
+                if full_monitoring and self.temp_config_path:
+                    safe_remove_file(self._session.config_file)
+                    Singleton.close_pid_file()
             return
 
         self._session.print_configuration()
diff --git a/clearml_agent/helper/process.py b/clearml_agent/helper/process.py
index b8f4ef9..427d9c9 100644
--- a/clearml_agent/helper/process.py
+++ b/clearml_agent/helper/process.py
@@ -42,18 +42,18 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
     return output if not strip or not output else output.strip()
 
 
-def terminate_process(pid, timeout=10.):
+def terminate_process(pid, timeout=10., ignore_zombie=True):
     # noinspection PyBroadException
     try:
         proc = psutil.Process(pid)
         proc.terminate()
         cnt = 0
-        while proc.is_running() and cnt < timeout:
+        while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
             sleep(1.)
             cnt += 1
         proc.terminate()
         cnt = 0
-        while proc.is_running() and cnt < timeout:
+        while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
             sleep(1.)
             cnt += 1
         proc.kill()
@@ -84,6 +84,23 @@ def kill_all_child_processes(pid=None):
         parent.kill()
 
 
+def terminate_all_child_processes(pid=None, timeout=10., include_parent=True):
+    # get current process if pid not provided
+    if not pid:
+        pid = os.getpid()
+        include_parent = False
+    try:
+        parent = psutil.Process(pid)
+    except psutil.Error:
+        # could not find parent process id
+        return
+    for child in parent.children(recursive=False):
+        print('Terminating child process {}'.format(child.pid))
+        terminate_process(child.pid, timeout=timeout, ignore_zombie=False)
+    if include_parent:
+        terminate_process(parent.pid, timeout=timeout, ignore_zombie=False)
+
+
 def get_docker_id(docker_cmd_contains):
     try:
         containers_running = get_bash_output(cmd='docker ps --no-trunc --format \"{{.ID}}: {{.Command}}\"')

From 08ff5e6db7d8efae8f0cd6fff7728032d2851dd6 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Sun, 25 Apr 2021 10:47:49 +0300
Subject: [PATCH 02/21] Add number of pods limit to k8s glue

---
 clearml_agent/glue/k8s.py | 55 +++++++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/clearml_agent/glue/k8s.py b/clearml_agent/glue/k8s.py
index d00abd2..8fdf2be 100644
--- a/clearml_agent/glue/k8s.py
+++ b/clearml_agent/glue/k8s.py
@@ -32,7 +32,7 @@ class K8sIntegration(Worker):
 
     K8S_DEFAULT_NAMESPACE = "clearml"
 
-    KUBECTL_APPLY_CMD = "kubectl apply -f"
+    KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f"
 
     KUBECTL_RUN_CMD = "kubectl run clearml-{queue_name}-id-{task_id} " \
                       "--image {docker_image} " \
@@ -95,6 +95,7 @@ class K8sIntegration(Worker):
             clearml_conf_file=None,
             extra_bash_init_script=None,
             namespace=None,
+            max_pods_limit=None,
             **kwargs
     ):
         """
@@ -122,6 +123,7 @@ class K8sIntegration(Worker):
         :param str clearml_conf_file: clearml.conf file to be use by the pod itself (optional)
         :param str extra_bash_init_script: Additional bash script to run before starting the Task inside the container
         :param str namespace: K8S namespace to be used when creating the new pods (default: clearml)
+        :param int max_pods_limit: Maximum number of pods that K8S glue can run at the same time
         """
         super(K8sIntegration, self).__init__()
         self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE
@@ -147,6 +149,7 @@ class K8sIntegration(Worker):
         self.namespace = namespace or self.K8S_DEFAULT_NAMESPACE
         self.pod_limits = []
         self.pod_requests = []
+        self.max_pods_limit = max_pods_limit if not self.ports_mode else None
         if overrides_yaml:
             with open(os.path.expandvars(os.path.expanduser(str(overrides_yaml))), 'rt') as f:
                 overrides = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None))
@@ -311,13 +314,19 @@ class K8sIntegration(Worker):
         # Search for a free pod number
         pod_count = 0
         pod_number = self.base_pod_num
-        while self.ports_mode:
+        while self.ports_mode or self.max_pods_limit:
             pod_number = self.base_pod_num + pod_count
-            kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n {namespace}".format(
-                pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
-                agent_label=self.AGENT_LABEL,
-                namespace=self.namespace,
-            )
+            if self.ports_mode:
+                kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n {namespace}".format(
+                    pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number),
+                    agent_label=self.AGENT_LABEL,
+                    namespace=self.namespace,
+                )
+            else:
+                kubectl_cmd_new = "kubectl get pods -l {agent_label} -n {namespace} -o json".format(
+                    agent_label=self.AGENT_LABEL,
+                    namespace=self.namespace,
+                )
             process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             output, error = process.communicate()
             output = '' if not output else output if isinstance(output, str) else output.decode('utf-8')
@@ -326,18 +335,42 @@ class K8sIntegration(Worker):
             if not output:
                 # No such pod exist so we can use the pod_number we found
                 break
-            if pod_count >= self.num_of_services - 1:
-                # All pod numbers are taken, exit
+
+            if self.max_pods_limit:
+                try:
+                    current_pod_count = len(json.loads(output).get("items", []))
+                except (ValueError, TypeError) as ex:
+                    self.log.warning(
+                        "K8S Glue pods monitor: Failed parsing kubectl output:\n{}\ntask '{}' "
+                        "will be enqueued back to queue '{}'\nEx: {}".format(
+                            output, task_id, queue, ex
+                        )
+                    )
+                    self._session.api_client.tasks.reset(task_id)
+                    self._session.api_client.tasks.enqueue(task_id, queue=queue, status_reason='kubectl parsing error')
+                    return
+                max_count = self.max_pods_limit
+            else:
+                current_pod_count = pod_count
+                max_count = self.num_of_services - 1
+
+            if current_pod_count >= max_count:
+                # All pods are taken, exit
+                self.log.debug(
+                    "kubectl last result: {}\n{}".format(error, output))
                 self.log.warning(
-                    "kubectl last result: {}\n{}\nAll k8s services are in use, task '{}' "
+                    "All k8s services are in use, task '{}' "
                     "will be enqueued back to queue '{}'".format(
-                        error, output, task_id, queue
+                        task_id, queue
                     )
                 )
                 self._session.api_client.tasks.reset(task_id)
                 self._session.api_client.tasks.enqueue(
                     task_id, queue=queue, status_reason='k8s max pod limit (no free k8s service)')
                 return
+            elif self.max_pods_limit:
+                # max pods limit hasn't reached yet, so we can create the pod
+                break
             pod_count += 1
 
         labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + [self.AGENT_LABEL]

From 24dc59e31fc1814c98b81719db970cdaf675ecb6 Mon Sep 17 00:00:00 2001
From: Revital <revital@allegro.ai>
Date: Thu, 22 Apr 2021 14:14:38 +0300
Subject: [PATCH 03/21] add space to help message

---
 clearml_agent/interface/worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clearml_agent/interface/worker.py b/clearml_agent/interface/worker.py
index 7467840..ad5da38 100644
--- a/clearml_agent/interface/worker.py
+++ b/clearml_agent/interface/worker.py
@@ -50,7 +50,7 @@ DAEMON_ARGS = dict({
     },
     '--docker': {
         'help': 'Run execution task inside a docker (v19.03 and above). Optional args <image> <arguments> or '
-                'specify default docker image in agent.default_docker.image / agent.default_docker.arguments'
+                'specify default docker image in agent.default_docker.image / agent.default_docker.arguments '
                 'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker',
         'nargs': '*',
         'default': False,

From 823b67a3ce539b3d5a919221cd1c41302d6e2c29 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 28 Apr 2021 13:17:37 +0300
Subject: [PATCH 04/21] Deprecate venv_update (replaced by the more robust
 venvs_cache)

---
 docs/clearml.conf | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/clearml.conf b/docs/clearml.conf
index bd0c7eb..859333f 100644
--- a/docs/clearml.conf
+++ b/docs/clearml.conf
@@ -107,11 +107,12 @@ agent {
         path: ~/.clearml/vcs-cache
     },
 
+    # DEPRECATED: please use `venvs_cache` and set `venvs_cache.path`
     # use venv-update in order to accelerate python virtual environment building
     # Still in beta, turned off by default
-    venv_update: {
-        enabled: false,
-    },
+    # venv_update: {
+    #     enabled: false,
+    # },
 
     # cached folder for specific python package download (mostly pytorch versions)
     pip_download_cache {

From 3ec2a3a92e73ee22bec2a51945e54946aab398a4 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 28 Apr 2021 13:19:34 +0300
Subject: [PATCH 05/21] Add k8s pod limit to k8s glue example

---
 examples/k8s_glue_example.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/k8s_glue_example.py b/examples/k8s_glue_example.py
index b368b50..dc69c37 100644
--- a/examples/k8s_glue_example.py
+++ b/examples/k8s_glue_example.py
@@ -10,12 +10,15 @@ from clearml_agent.glue.k8s import K8sIntegration
 
 def parse_args():
     parser = ArgumentParser()
+    group = parser.add_mutually_exclusive_group()
+
     parser.add_argument(
         "--queue", type=str, help="Queue to pull tasks from"
     )
-    parser.add_argument(
+    group.add_argument(
         "--ports-mode", action='store_true', default=False,
         help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports"
+             "Should not be used with max-pods"
     )
     parser.add_argument(
         "--num-of-services", type=int, default=20,
@@ -57,6 +60,11 @@ def parse_args():
         "--namespace", type=str,
         help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml"
     )
+    group.add_argument(
+        "--max-pods", type=int,
+        help="Limit the maximum number of pods that this service can run at the same time."
+             "Should not be used with ports-mode"
+    )
     return parser.parse_args()
 
 
@@ -77,7 +85,7 @@ def main():
         user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf,
         template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash(
             ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None,
-        namespace=args.namespace,
+        namespace=args.namespace, max_pods_limit=args.max_pods or None,
     )
     k8s.k8s_daemon(args.queue)
 

From 4f18bb7ea0600db6ee63ecb5ad0ea8d048a272ca Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 28 Apr 2021 13:20:13 +0300
Subject: [PATCH 06/21] Add k8s glue default restartPolicy=Never to template to
 prevent pods from restarting

---
 clearml_agent/glue/k8s.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clearml_agent/glue/k8s.py b/clearml_agent/glue/k8s.py
index 8fdf2be..cdbeea7 100644
--- a/clearml_agent/glue/k8s.py
+++ b/clearml_agent/glue/k8s.py
@@ -446,6 +446,7 @@ class K8sIntegration(Worker):
         template['metadata']['name'] = name
         template.setdefault('spec', {})
         template['spec'].setdefault('containers', [])
+        template['spec'].setdefault('restartPolicy', 'Never')
         if labels:
             labels_dict = dict(pair.split('=', 1) for pair in labels)
             template['metadata'].setdefault('labels', {})

From cec6420c8f40d92ab1cd6cbe5ca8f24cf351abd8 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Mon, 3 May 2021 18:33:53 +0300
Subject: [PATCH 07/21] Version bump to v1.0.0

---
 clearml_agent/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clearml_agent/version.py b/clearml_agent/version.py
index 84d3488..1f356cc 100644
--- a/clearml_agent/version.py
+++ b/clearml_agent/version.py
@@ -1 +1 @@
-__version__ = '0.17.2'
+__version__ = '1.0.0'

From a2db1f5ab5cbf178840da736afdc370cfff43f0f Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 5 May 2021 11:58:37 +0300
Subject: [PATCH 08/21] Remove queue name from pod name in k8s glue, add queue
 name and ID to pod labels (issue #64)

---
 clearml_agent/glue/k8s.py | 31 +++++++++++++++++++------------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/clearml_agent/glue/k8s.py b/clearml_agent/glue/k8s.py
index cdbeea7..4b5375a 100644
--- a/clearml_agent/glue/k8s.py
+++ b/clearml_agent/glue/k8s.py
@@ -34,7 +34,7 @@ class K8sIntegration(Worker):
 
     KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f"
 
-    KUBECTL_RUN_CMD = "kubectl run clearml-{queue_name}-id-{task_id} " \
+    KUBECTL_RUN_CMD = "kubectl run clearml-id-{task_id} " \
                       "--image {docker_image} " \
                       "--restart=Never " \
                       "--namespace={namespace}"
@@ -307,10 +307,6 @@ class K8sIntegration(Worker):
         except Exception:
             queue_name = 'k8s'
 
-        # conform queue name to k8s standards
-        safe_queue_name = queue_name.lower().strip()
-        safe_queue_name = re.sub(r'\W+', '', safe_queue_name).replace('_', '').replace('-', '')
-
         # Search for a free pod number
         pod_count = 0
         pod_number = self.base_pod_num
@@ -374,6 +370,8 @@ class K8sIntegration(Worker):
             pod_count += 1
 
         labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + [self.AGENT_LABEL]
+        labels.append("clearml-agent-queue={}".format(self._safe_k8s_label_value(queue)))
+        labels.append("clearml-agent-queue-name={}".format(self._safe_k8s_label_value(queue_name)))
 
         if self.ports_mode:
             print("Kubernetes scheduling task id={} on pod={} (pod_count={})".format(task_id, pod_number, pod_count))
@@ -384,13 +382,13 @@ class K8sIntegration(Worker):
             output, error = self._kubectl_apply(
                 create_clearml_conf=create_clearml_conf,
                 labels=labels, docker_image=docker_image, docker_args=docker_args,
-                task_id=task_id, queue=queue, queue_name=safe_queue_name)
+                task_id=task_id, queue=queue)
         else:
             output, error = self._kubectl_run(
                 create_clearml_conf=create_clearml_conf,
                 labels=labels, docker_image=docker_cmd,
                 task_data=task_data,
-                task_id=task_id, queue=queue, queue_name=safe_queue_name)
+                task_id=task_id, queue=queue)
 
         error = '' if not error else (error if isinstance(error, str) else error.decode('utf-8'))
         output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8'))
@@ -437,12 +435,12 @@ class K8sIntegration(Worker):
                 self.log.warning('skipping docker argument {} (only -e --env supported)'.format(cmd))
         return {'env': kube_args} if kube_args else {}
 
-    def _kubectl_apply(self, create_clearml_conf, docker_image, docker_args, labels, queue, task_id, queue_name):
+    def _kubectl_apply(self, create_clearml_conf, docker_image, docker_args, labels, queue, task_id):
         template = deepcopy(self.template_dict)
         template.setdefault('apiVersion', 'v1')
         template['kind'] = 'Pod'
         template.setdefault('metadata', {})
-        name = 'clearml-{queue}-id-{task_id}'.format(queue=queue_name, task_id=task_id)
+        name = 'clearml-id-{task_id}'.format(task_id=task_id)
         template['metadata']['name'] = name
         template.setdefault('spec', {})
         template['spec'].setdefault('containers', [])
@@ -508,12 +506,11 @@ class K8sIntegration(Worker):
 
         return output, error
 
-    def _kubectl_run(self, create_clearml_conf, docker_image, labels, queue, task_data, task_id, queue_name):
+    def _kubectl_run(self, create_clearml_conf, docker_image, labels, queue, task_data, task_id):
         if callable(self.kubectl_cmd):
-            kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data, queue_name)
+            kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data)
         else:
             kubectl_cmd = self.kubectl_cmd.format(
-                queue_name=queue_name,
                 task_id=task_id,
                 docker_image=docker_image,
                 queue_id=queue,
@@ -641,3 +638,13 @@ class K8sIntegration(Worker):
         return merge_dicts(
             c1, c2, custom_merge_func=merge_env
         )
+
+    @staticmethod
+    def _safe_k8s_label_value(value):
+        """ Conform string to k8s standards for a label value """
+        value = value.lower().strip()
+        value = re.sub(r'^[^A-Za-z0-9]+', '', value)  # strip leading non-alphanumeric chars
+        value = re.sub(r'[^A-Za-z0-9]+$', '', value)  # strip trailing non-alphanumeric chars
+        value = re.sub(r'\W+', '-', value)  # allow only word chars (this removed "." which is supported, but nvm)
+        value = re.sub(r'-+', '-', value)  # don't leave messy "--" after replacing previous chars
+        return value[:63]

From ae3d0345313f10f31c83163bab936472577d4e69 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 12 May 2021 15:45:31 +0300
Subject: [PATCH 09/21] Protect against None in execution.repository

---
 clearml_agent/helper/repo.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clearml_agent/helper/repo.py b/clearml_agent/helper/repo.py
index f39a4f8..551c9f3 100644
--- a/clearml_agent/helper/repo.py
+++ b/clearml_agent/helper/repo.py
@@ -591,7 +591,7 @@ def clone_repository_cached(session, execution, destination):
     # mock lock
     repo_lock = Lock()
     repo_lock_timeout_sec = 300
-    repo_url = execution.repository  # type: str
+    repo_url = execution.repository or ''  # type: str
     parsed_url = furl(repo_url)
     no_password_url = parsed_url.copy().remove(password=True).url
 

From 4f7407084d1900a79d455570c573e60f40208742 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 12 May 2021 15:46:25 +0300
Subject: [PATCH 10/21] Fix standalone script with pre-exiting conda venv

---
 clearml_agent/commands/worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py
index 3de4fb0..55db405 100644
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -187,7 +187,7 @@ class LiteralScriptManager(object):
         location = location or (repo_info and repo_info.root)
         if not location:
             location = Path(self.venv_folder, "code")
-            location.mkdir(exist_ok=True)
+            location.mkdir(exist_ok=True, parents=True)
         log.debug("selected execution directory: %s", location)
         return Text(location), self.write(task, location, execution.entry_point)
 

From 1e795beec87be3673dfbd33735763d51f3670cd4 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 19 May 2021 15:20:03 +0300
Subject: [PATCH 11/21] Fix support for spaces in docker arguments (issue #358)

---
 clearml_agent/commands/worker.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py
index 55db405..36cef19 100644
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -11,6 +11,7 @@ import subprocess
 import sys
 import shutil
 import traceback
+import shlex
 from collections import defaultdict
 from copy import deepcopy, copy
 from datetime import datetime
@@ -221,6 +222,9 @@ def get_task(session, task_id, *args, **kwargs):
 
 
 def get_task_container(session, task_id):
+    """
+    Returns dict with Task docker container setup {container: '', arguments: '', setup_shell_script: ''}
+    """
     if session.check_min_api_version("2.13"):
         result = session.send_request(
             service='tasks',
@@ -233,12 +237,12 @@ def get_task_container(session, task_id):
         try:
             container = result.json()['data']['tasks'][0]['container'] if result.ok else {}
             if container.get('arguments'):
-                container['arguments'] = str(container.get('arguments')).split(' ')
+                container['arguments'] = shlex.split(str(container.get('arguments')).strip())
         except (ValueError, TypeError):
             container = {}
     else:
         response = get_task(session, task_id, only_fields=["execution.docker_cmd"])
-        task_docker_cmd_parts = str(response.execution.docker_cmd or '').strip().split(' ')
+        task_docker_cmd_parts = shlex.split(str(response.execution.docker_cmd or '').strip())
         try:
             container = dict(
                 container=task_docker_cmd_parts[0],
@@ -251,11 +255,14 @@ def get_task_container(session, task_id):
 
 
 def set_task_container(session, task_id, docker_image=None, docker_arguments=None, docker_bash_script=None):
+    if docker_arguments and isinstance(docker_arguments, str):
+        docker_arguments = [docker_arguments]
+
     if session.check_min_api_version("2.13"):
         container = dict(
-            image=docker_image or None,
-            arguments=' '.join(docker_arguments) if docker_arguments else None,
-            setup_shell_script=docker_bash_script or None,
+            image=docker_image or '',
+            arguments=' '.join(docker_arguments) if docker_arguments else '',
+            setup_shell_script=docker_bash_script or '',
         )
         result = session.send_request(
             service='tasks',
@@ -1913,7 +1920,6 @@ class Worker(ServiceCommandSection):
             if current_task.script.binary and current_task.script.binary.startswith('python') and \
                     execution.entry_point and execution.entry_point.split()[0].strip() == '-m':
                 # we need to split it
-                import shlex
                 extra.extend(shlex.split(execution.entry_point))
             else:
                 extra.append(execution.entry_point)

From 3c4e976093927ce9b1fc0c14a91c13b71d855e18 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 19 May 2021 15:20:44 +0300
Subject: [PATCH 12/21] Add agent.ignore_requested_python_version to config
 file

---
 clearml_agent/backend_api/config/default/agent.conf | 3 +++
 docs/clearml.conf                                   | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/clearml_agent/backend_api/config/default/agent.conf b/clearml_agent/backend_api/config/default/agent.conf
index de82bf3..a66d675 100644
--- a/clearml_agent/backend_api/config/default/agent.conf
+++ b/clearml_agent/backend_api/config/default/agent.conf
@@ -26,6 +26,9 @@
     # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
     # The default is the python executing the clearml_agent
     python_binary: ""
+    # ignore any requested python version (Default: False, if a Task was using a
+    # specific python version and the system supports multiple python the agent will use the requested python version)
+    # ignore_requested_python_version: true
 
     # select python package manager:
     # currently supported pip and conda
diff --git a/docs/clearml.conf b/docs/clearml.conf
index 859333f..05e0922 100644
--- a/docs/clearml.conf
+++ b/docs/clearml.conf
@@ -42,6 +42,9 @@ agent {
     # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6"
     # The default is the python executing the clearml_agent
     python_binary: ""
+    # ignore any requested python version (Default: False, if a Task was using a
+    # specific python version and the system supports multiple python the agent will use the requested python version)
+    # ignore_requested_python_version: true
 
     # select python package manager:
     # currently supported pip and conda

From e93384b99bdfd72a54cf2b68b3991b145b504b79 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Thu, 20 May 2021 10:58:46 +0300
Subject: [PATCH 13/21] Fix --stop with dynamic gpus

---
 clearml_agent/commands/worker.py | 11 +++++++++--
 clearml_agent/helper/process.py  | 18 ++++++++++++++----
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py
index 36cef19..a93d900 100644
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -878,7 +878,7 @@ class Worker(ServiceCommandSection):
                 # if we are in dynamic gpus / services mode,
                 # we should send termination signal to all child processes
                 if self._services_mode:
-                    terminate_all_child_processes(timeout=120)
+                    terminate_all_child_processes(timeout=20, include_parent=False)
 
                 # if we are here, just kill all sub processes
                 kill_all_child_processes()
@@ -1371,6 +1371,7 @@ class Worker(ServiceCommandSection):
         service_mode_internal_agent_started = None
         stopping = False
         status = None
+        process = None
         try:
             _last_machine_update_ts = time()
             stop_reason = None
@@ -1427,6 +1428,8 @@ class Worker(ServiceCommandSection):
             status = ex.returncode
         except KeyboardInterrupt:
             # so someone else will catch us
+            if process:
+                kill_all_child_processes(process.pid)
             raise
         except Exception:
             # we should not get here, but better safe than sorry
@@ -1438,6 +1441,10 @@ class Worker(ServiceCommandSection):
             stop_reason = TaskStopReason.exception
             status = -1
 
+        # full cleanup (just in case)
+        if process:
+            kill_all_child_processes(process.pid)
+
         # if running in services mode, keep the file open
         # in case the docker was so quick it started and finished, check the stop reason
         if self._services_mode and service_mode_internal_agent_started and stop_reason == 'Service started':
@@ -3091,7 +3098,7 @@ class Worker(ServiceCommandSection):
                     warning('Could not terminate process pid={}'.format(pid))
                 return True
 
-            # wither we have a match for the worker_id or we just pick the first one, and kill it.
+            # either we have a match for the worker_id or we just pick the first one, and kill it.
             if (worker_id and uid == worker_id) or (not worker_id and uid.startswith('{}:'.format(worker_name))):
                 # this is us kill it
                 print('Terminating clearml-agent worker_id={} pid={}'.format(uid, pid))
diff --git a/clearml_agent/helper/process.py b/clearml_agent/helper/process.py
index 427d9c9..c92c3cf 100644
--- a/clearml_agent/helper/process.py
+++ b/clearml_agent/helper/process.py
@@ -42,20 +42,31 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False):
     return output if not strip or not output else output.strip()
 
 
-def terminate_process(pid, timeout=10., ignore_zombie=True):
+def terminate_process(pid, timeout=10., ignore_zombie=True, include_children=False):
     # noinspection PyBroadException
     try:
         proc = psutil.Process(pid)
+        children = proc.children(recursive=True) if include_children else []
         proc.terminate()
         cnt = 0
         while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
             sleep(1.)
             cnt += 1
         proc.terminate()
+
+        # terminate children
+        for c in children:
+            c.terminate()
+
         cnt = 0
         while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout:
             sleep(1.)
             cnt += 1
+
+        # kill children
+        for c in children:
+            c.kill()
+
         proc.kill()
     except Exception:
         pass
@@ -66,9 +77,8 @@ def terminate_process(pid, timeout=10., ignore_zombie=True):
         return True
 
 
-def kill_all_child_processes(pid=None):
+def kill_all_child_processes(pid=None, include_parent=True):
     # get current process if pid not provided
-    include_parent = True
     if not pid:
         pid = os.getpid()
         include_parent = False
@@ -96,7 +106,7 @@ def terminate_all_child_processes(pid=None, timeout=10., include_parent=True):
         return
     for child in parent.children(recursive=False):
         print('Terminating child process {}'.format(child.pid))
-        terminate_process(child.pid, timeout=timeout, ignore_zombie=False)
+        terminate_process(child.pid, timeout=timeout, ignore_zombie=False, include_children=True)
     if include_parent:
         terminate_process(parent.pid, timeout=timeout, ignore_zombie=False)
 

From 742cbf57670815a80a0c502ef61da12521e1e71f Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Tue, 25 May 2021 19:31:45 +0300
Subject: [PATCH 14/21] Add docker environment arguments log masking support
 (issue #67)

---
 .../backend_api/config/default/agent.conf     | 12 ++++
 clearml_agent/commands/worker.py              | 55 ++++++++++++++++---
 clearml_agent/definitions.py                  | 12 ++--
 clearml_agent/helper/process.py               |  5 +-
 4 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/clearml_agent/backend_api/config/default/agent.conf b/clearml_agent/backend_api/config/default/agent.conf
index a66d675..fa37231 100644
--- a/clearml_agent/backend_api/config/default/agent.conf
+++ b/clearml_agent/backend_api/config/default/agent.conf
@@ -185,4 +185,16 @@
     # should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION
     # cuda_version: 10.1
     # cudnn_version: 7.6
+
+    # Hide docker environment variables containing secrets when printing out the docker command by replacing their
+    # values with "********". Turning this feature on will hide the following environment variables values:
+    #   CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY
+    # To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of
+    # your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the
+    # docker command, set:
+    #   extra_keys: ["MY_SPECIAL_PASSWORD"]
+    hide_docker_command_env_vars {
+        enabled: true
+        extra_keys: []
+    }
 }
diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py
index a93d900..8e58f14 100644
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -20,7 +20,7 @@ from functools import partial, cmp_to_key
 from itertools import chain
 from tempfile import mkdtemp, NamedTemporaryFile
 from time import sleep, time
-from typing import Text, Optional, Any, Tuple
+from typing import Text, Optional, Any, Tuple, List
 
 import attr
 import psutil
@@ -44,7 +44,13 @@ from clearml_agent.definitions import (
     ENV_DOCKER_HOST_MOUNT,
     ENV_TASK_EXTRA_PYTHON_PATH,
     ENV_AGENT_GIT_USER,
-    ENV_AGENT_GIT_PASS, ENV_WORKER_ID, ENV_DOCKER_SKIP_GPUS_FLAG, )
+    ENV_AGENT_GIT_PASS,
+    ENV_WORKER_ID,
+    ENV_DOCKER_SKIP_GPUS_FLAG,
+    ENV_AGENT_SECRET_KEY,
+    ENV_AWS_SECRET_KEY,
+    ENV_AZURE_ACCOUNT_KEY,
+)
 from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
 from clearml_agent.errors import APIError, CommandFailedError, Sigterm
 from clearml_agent.helper.base import (
@@ -68,7 +74,9 @@ from clearml_agent.helper.base import (
     get_python_path,
     is_linux_platform,
     rm_file,
-    add_python_path, safe_remove_tree, )
+    add_python_path,
+    safe_remove_tree,
+)
 from clearml_agent.helper.console import ensure_text, print_text, decode_binary_lines
 from clearml_agent.helper.os.daemonize import daemonize_process
 from clearml_agent.helper.package.base import PackageManager
@@ -91,7 +99,10 @@ from clearml_agent.helper.process import (
     get_bash_output,
     shutdown_docker_process,
     get_docker_id,
-    commit_docker, terminate_process, check_if_command_exists, terminate_all_child_processes,
+    commit_docker,
+    terminate_process,
+    check_if_command_exists,
+    terminate_all_child_processes,
 )
 from clearml_agent.helper.package.priority_req import PriorityPackageRequirement, PackageCollectorRequirement
 from clearml_agent.helper.repo import clone_repository_cached, RepoInfo, VCS, fix_package_import_diff_patch
@@ -621,10 +632,13 @@ class Worker(ServiceCommandSection):
                 '--standalone-mode' if self._standalone_mode else '',
                 task_id)
 
-            # send the actual used command line to the backend
-            self.send_logs(task_id=task_id, lines=['Executing: {}\n'.format(full_docker_cmd)], level="INFO")
+            display_docker_command = self._sanitize_docker_command(full_docker_cmd)
+
+            # send the actual used command line to the backend
+            self.send_logs(task_id=task_id, lines=['Executing: {}\n'.format(display_docker_command)], level="INFO")
+
+            cmd = Argv(*full_docker_cmd, display_argv=display_docker_command)
 
-            cmd = Argv(*full_docker_cmd)
             print('Running Docker:\n{}\n'.format(str(cmd)))
         else:
             cmd = worker_args.get_argv_for_command("execute") + (
@@ -3161,6 +3175,33 @@ class Worker(ServiceCommandSection):
             queue_ids.append(q_id)
         return queue_ids
 
+    def _sanitize_docker_command(self, docker_command):
+        # type: (List[str]) -> List[str]
+        if not self._session.config.get('agent.hide_docker_command_env_vars.enabled', False):
+            return docker_command
+
+        keys = set(self._session.config.get('agent.hide_docker_command_env_vars.extra_keys', []))
+        keys.update(
+            ENV_AGENT_GIT_PASS.vars,
+            ENV_AGENT_SECRET_KEY.vars,
+            ENV_AWS_SECRET_KEY.vars,
+            ENV_AZURE_ACCOUNT_KEY.vars
+        )
+
+        result = docker_command[:]
+        for i, item in enumerate(docker_command):
+            try:
+                if item not in ("-e", "--env"):
+                    continue
+                key, sep, _ = result[i + 1].partition("=")
+                if key not in keys or not sep:
+                    continue
+                result[i + 1] = "{}={}".format(key, "********")
+            except KeyError:
+                pass
+
+        return result
+
 
 if __name__ == "__main__":
     pass
diff --git a/clearml_agent/definitions.py b/clearml_agent/definitions.py
index c71db5f..6aa0451 100644
--- a/clearml_agent/definitions.py
+++ b/clearml_agent/definitions.py
@@ -62,6 +62,10 @@ class EnvironmentConfig(object):
         return None
 
 
+ENV_AGENT_SECRET_KEY = EnvironmentConfig("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY")
+ENV_AWS_SECRET_KEY = EnvironmentConfig("AWS_SECRET_ACCESS_KEY")
+ENV_AZURE_ACCOUNT_KEY = EnvironmentConfig("AZURE_STORAGE_KEY")
+
 ENVIRONMENT_CONFIG = {
     "api.api_server": EnvironmentConfig("CLEARML_API_HOST", "TRAINS_API_HOST", ),
     "api.files_server": EnvironmentConfig("CLEARML_FILES_HOST", "TRAINS_FILES_HOST", ),
@@ -69,9 +73,7 @@ ENVIRONMENT_CONFIG = {
     "api.credentials.access_key": EnvironmentConfig(
         "CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY",
     ),
-    "api.credentials.secret_key": EnvironmentConfig(
-        "CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY",
-    ),
+    "api.credentials.secret_key": ENV_AGENT_SECRET_KEY,
     "agent.worker_name": EnvironmentConfig("CLEARML_WORKER_NAME", "TRAINS_WORKER_NAME", ),
     "agent.worker_id": EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID", ),
     "agent.cuda_version": EnvironmentConfig(
@@ -84,10 +86,10 @@ ENVIRONMENT_CONFIG = {
         names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool
     ),
     "sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
-    "sdk.aws.s3.secret": EnvironmentConfig("AWS_SECRET_ACCESS_KEY"),
+    "sdk.aws.s3.secret": ENV_AWS_SECRET_KEY,
     "sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),
     "sdk.azure.storage.containers.0": {'account_name': EnvironmentConfig("AZURE_STORAGE_ACCOUNT"),
-                                       'account_key': EnvironmentConfig("AZURE_STORAGE_KEY")},
+                                       'account_key': ENV_AZURE_ACCOUNT_KEY},
     "sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"),
 }
 
diff --git a/clearml_agent/helper/process.py b/clearml_agent/helper/process.py
index c92c3cf..fc86785 100644
--- a/clearml_agent/helper/process.py
+++ b/clearml_agent/helper/process.py
@@ -221,6 +221,7 @@ class Argv(Executable):
         """
         self.argv = argv
         self._log = kwargs.pop("log", None)
+        self._display_argv = kwargs.pop("display_argv", argv)
         if not self._log:
             self._log = logging.getLogger(__name__)
             self._log.propagate = False
@@ -245,10 +246,10 @@ class Argv(Executable):
         return self.argv
 
     def __repr__(self):
-        return "<Argv{}>".format(self.argv)
+        return "<Argv{}>".format(self._display_argv)
 
     def __str__(self):
-        return "Executing: {}".format(self.argv)
+        return "Executing: {}".format(self._display_argv)
 
     def __iter__(self):
         if is_windows_platform():

From 0694b9e8afcec119aec35fbe7e2908e283ddcc8d Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 26 May 2021 18:33:35 +0300
Subject: [PATCH 15/21] Fix PyYAML supported versions

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index ba96007..5186b3f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,7 @@ pyhocon>=0.3.38,<0.4.0
 pyparsing>=2.0.3,<2.5.0
 python-dateutil>=2.4.2,<2.9.0
 pyjwt>=1.6.4,<2.1.0
-PyYAML>=3.12,<5.4.0
+PyYAML>=3.12,<5.5.0
 requests>=2.20.0,<2.26.0
 six>=1.11.0,<1.16.0
 typing>=3.6.4,<3.8.0

From 3a07bfe1d731aaff7f1f783373a68820c58f55af Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Mon, 31 May 2021 23:19:46 +0300
Subject: [PATCH 16/21] Version bump

---
 clearml_agent/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clearml_agent/version.py b/clearml_agent/version.py
index 1f356cc..e697a0c 100644
--- a/clearml_agent/version.py
+++ b/clearml_agent/version.py
@@ -1 +1 @@
-__version__ = '1.0.0'
+__version__ = '1.0.1rc0'

From 7e90ebd5db1c289a07df67a0d59de5dc861b61f0 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 2 Jun 2021 13:16:17 +0300
Subject: [PATCH 17/21] Fix _dynamic_gpu_get_available worker timeout increase
 to 10 minutes

---
 clearml_agent/commands/worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py
index 8e58f14..a980a6f 100644
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -905,7 +905,7 @@ class Worker(ServiceCommandSection):
     def _dynamic_gpu_get_available(self, gpu_indexes):
         # noinspection PyBroadException
         try:
-            response = self._session.send_api(workers_api.GetAllRequest(last_seen=60))
+            response = self._session.send_api(workers_api.GetAllRequest(last_seen=600))
         except Exception:
             return None
 

From 8c56777125d3b90936f52db3d5ca8de927a810f5 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 2 Jun 2021 13:16:58 +0300
Subject: [PATCH 18/21] Add CLEARML_AGENT_DISABLE_SSH_MOUNT allowing disabling
 the auto .ssh mount into the docker

---
 clearml_agent/commands/worker.py | 45 ++++++++++++++++++++------------
 clearml_agent/definitions.py     |  1 +
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py
index a980a6f..7c45bc8 100644
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -50,6 +50,7 @@ from clearml_agent.definitions import (
     ENV_AGENT_SECRET_KEY,
     ENV_AWS_SECRET_KEY,
     ENV_AZURE_ACCOUNT_KEY,
+    ENV_AGENT_DISABLE_SSH_MOUNT,
 )
 from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES
 from clearml_agent.errors import APIError, CommandFailedError, Sigterm
@@ -2725,8 +2726,11 @@ class Worker(ServiceCommandSection):
         if temp_config.get("agent.venvs_cache.path", None):
             temp_config.put("agent.venvs_cache.path", '/root/.clearml/venvs-cache')
 
-        self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
-        self._temp_cleanup_list.append(self._host_ssh_cache)
+        if ENV_AGENT_DISABLE_SSH_MOUNT.get():
+            self._host_ssh_cache = None
+        else:
+            self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.')
+            self._temp_cleanup_list.append(self._host_ssh_cache)
 
         return temp_config, partial(self._get_docker_config_cmd, temp_config=temp_config)
 
@@ -2748,24 +2752,31 @@ class Worker(ServiceCommandSection):
             "agent.docker_pip_cache", '~/.clearml/pip-cache'))).expanduser().as_posix()
 
         # make sure all folders are valid
-        Path(host_apt_cache).mkdir(parents=True, exist_ok=True)
-        Path(host_pip_cache).mkdir(parents=True, exist_ok=True)
-        Path(host_cache).mkdir(parents=True, exist_ok=True)
-        Path(host_pip_dl).mkdir(parents=True, exist_ok=True)
-        Path(host_vcs_cache).mkdir(parents=True, exist_ok=True)
-        Path(host_ssh_cache).mkdir(parents=True, exist_ok=True)
+        if host_apt_cache:
+            Path(host_apt_cache).mkdir(parents=True, exist_ok=True)
+        if host_pip_cache:
+            Path(host_pip_cache).mkdir(parents=True, exist_ok=True)
+        if host_cache:
+            Path(host_cache).mkdir(parents=True, exist_ok=True)
+        if host_pip_dl:
+            Path(host_pip_dl).mkdir(parents=True, exist_ok=True)
+        if host_vcs_cache:
+            Path(host_vcs_cache).mkdir(parents=True, exist_ok=True)
+        if host_ssh_cache:
+            Path(host_ssh_cache).mkdir(parents=True, exist_ok=True)
         if host_venvs_cache:
             Path(host_venvs_cache).mkdir(parents=True, exist_ok=True)
 
-        # copy the .ssh folder to a temp folder, to be mapped into docker
-        # noinspection PyBroadException
-        try:
-            if Path(host_ssh_cache).is_dir():
-                shutil.rmtree(host_ssh_cache, ignore_errors=True)
-            shutil.copytree(Path('~/.ssh').expanduser().as_posix(), host_ssh_cache)
-        except Exception:
-            host_ssh_cache = None
-            self.log.warning('Failed creating temporary copy of ~/.ssh for git credential')
+        if host_ssh_cache:
+            # copy the .ssh folder to a temp folder, to be mapped into docker
+            # noinspection PyBroadException
+            try:
+                if Path(host_ssh_cache).is_dir():
+                    shutil.rmtree(host_ssh_cache, ignore_errors=True)
+                shutil.copytree(Path('~/.ssh').expanduser().as_posix(), host_ssh_cache)
+            except Exception:
+                host_ssh_cache = None
+                self.log.warning('Failed creating temporary copy of ~/.ssh for git credential')
 
         # check if the .git credentials exist:
         try:
diff --git a/clearml_agent/definitions.py b/clearml_agent/definitions.py
index 6aa0451..6b0d8e2 100644
--- a/clearml_agent/definitions.py
+++ b/clearml_agent/definitions.py
@@ -134,6 +134,7 @@ ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', '
 ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER')
 ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS')
 ENV_AGENT_GIT_HOST = EnvironmentConfig('CLEARML_AGENT_GIT_HOST', 'TRAINS_AGENT_GIT_HOST')
+ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig('CLEARML_AGENT_DISABLE_SSH_MOUNT', type=bool)
 ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig('CLEARML_AGENT_EXEC_USER', 'TRAINS_AGENT_EXEC_USER')
 ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig('CLEARML_AGENT_EXTRA_PYTHON_PATH', 'TRAINS_AGENT_EXTRA_PYTHON_PATH')
 ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEARML_AGENT_DOCKER_HOST_MOUNT',

From eda597dea5d63dd7f8414610c4d6ad3bd9bb9d70 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 2 Jun 2021 13:17:37 +0300
Subject: [PATCH 19/21] Version bump

---
 clearml_agent/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clearml_agent/version.py b/clearml_agent/version.py
index e697a0c..6dada8f 100644
--- a/clearml_agent/version.py
+++ b/clearml_agent/version.py
@@ -1 +1 @@
-__version__ = '1.0.1rc0'
+__version__ = '1.0.1rc1'

From 29bf993be73f37bbcf6aae4e084b14b4a29576a6 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 2 Jun 2021 21:15:48 +0300
Subject: [PATCH 20/21] Add printout when using key/secret from env vars

---
 clearml_agent/backend_api/session/session.py |  6 ++++--
 clearml_agent/backend_config/entry.py        | 17 ++++++++++++-----
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/clearml_agent/backend_api/session/session.py b/clearml_agent/backend_api/session/session.py
index 27fc877..dbf8cf4 100644
--- a/clearml_agent/backend_api/session/session.py
+++ b/clearml_agent/backend_api/session/session.py
@@ -111,7 +111,8 @@ class Session(TokenManager):
         self._logger = logger
 
         self.__access_key = api_key or ENV_ACCESS_KEY.get(
-            default=(self.config.get("api.credentials.access_key", None) or self.default_key)
+            default=(self.config.get("api.credentials.access_key", None) or self.default_key),
+            value_cb=lambda key, value: logger.info("Using environment access key {}={}".format(key, value))
         )
         if not self.access_key:
             raise ValueError(
@@ -119,7 +120,8 @@ class Session(TokenManager):
             )
 
         self.__secret_key = secret_key or ENV_SECRET_KEY.get(
-            default=(self.config.get("api.credentials.secret_key", None) or self.default_secret)
+            default=(self.config.get("api.credentials.secret_key", None) or self.default_secret),
+            value_cb=lambda key, value: logger.info("Using environment secret key {}=********".format(key))
         )
         if not self.secret_key:
             raise ValueError(
diff --git a/clearml_agent/backend_config/entry.py b/clearml_agent/backend_config/entry.py
index 1107563..489d326 100644
--- a/clearml_agent/backend_config/entry.py
+++ b/clearml_agent/backend_config/entry.py
@@ -64,8 +64,8 @@ class Entry(object):
             converter = self.default_conversions().get(self.type, self.type)
         return converter(value)
 
-    def get_pair(self, default=NotSet, converter=None):
-        # type: (Any, Converter) -> Optional[Tuple[Text, Any]]
+    def get_pair(self, default=NotSet, converter=None, value_cb=None):
+        # type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Tuple[Text, Any]]
         for key in self.keys:
             value = self._get(key)
             if value is NotSet:
@@ -75,13 +75,20 @@ class Entry(object):
             except Exception as ex:
                 self.error("invalid value {key}={value}: {ex}".format(**locals()))
                 break
+            # noinspection PyBroadException
+            try:
+                if value_cb:
+                    value_cb(key, value)
+            except Exception:
+                pass
             return key, value
+
         result = self.default if default is NotSet else default
         return self.key, result
 
-    def get(self, default=NotSet, converter=None):
-        # type: (Any, Converter) -> Optional[Any]
-        return self.get_pair(default=default, converter=converter)[1]
+    def get(self, default=NotSet, converter=None, value_cb=None):
+        # type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Any]
+        return self.get_pair(default=default, converter=converter, value_cb=value_cb)[1]
 
     def set(self, value):
         # type: (Any, Any) -> (Text, Any)

From 176b4a4cdec9c4303a946a82e22a579ae22c3355 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Wed, 16 Jun 2021 18:32:29 +0300
Subject: [PATCH 21/21] Fix --services-mode when the execute agent fails when
 starting to run with error code 0

---
 clearml_agent/commands/worker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py
index 7c45bc8..17e0949 100644
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@@ -1423,7 +1423,7 @@ class Worker(ServiceCommandSection):
 
                 # get diff from previous poll
                 printed_lines, stdout_pos_count = _print_file(stdout_path, stdout_pos_count)
-                if self._services_mode and not stopping and not status:
+                if self._services_mode and not stopping and status is None:
                     # if the internal agent started, we stop logging, it will take over logging.
                     # if the internal agent started running the task itself, it will return status==0,
                     # then we can quit the monitoring loop of this process