From e60a6f9d14c0eb2064185b3a80e650ca050c5a60 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Sun, 25 Apr 2021 10:46:43 +0300 Subject: [PATCH 01/21] Fix --stop support for dynamic gpus --- clearml_agent/commands/worker.py | 47 ++++++++++++++++++-------------- clearml_agent/helper/process.py | 23 ++++++++++++++-- 2 files changed, 46 insertions(+), 24 deletions(-) diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index c08b07f..3de4fb0 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -90,7 +90,7 @@ from clearml_agent.helper.process import ( get_bash_output, shutdown_docker_process, get_docker_id, - commit_docker, terminate_process, check_if_command_exists, + commit_docker, terminate_process, check_if_command_exists, terminate_all_child_processes, ) from clearml_agent.helper.package.priority_req import PriorityPackageRequirement, PackageCollectorRequirement from clearml_agent.helper.repo import clone_repository_cached, RepoInfo, VCS, fix_package_import_diff_patch @@ -685,6 +685,9 @@ class Worker(ServiceCommandSection): shutdown_docker_process(docker_cmd_contains='--id {}\'\"'.format(task_id)) safe_remove_file(temp_stdout_name) safe_remove_file(temp_stderr_name) + if self._services_mode and status == ExitStatus.interrupted: + # unregister this worker, it was killed + self._unregister() def run_tasks_loop(self, queues, worker_params, priority_order=True, gpu_indexes=None, gpu_queues=None): """ @@ -719,6 +722,7 @@ class Worker(ServiceCommandSection): # get current running instances available_gpus = None + dynamic_gpus_worker_id = None if gpu_indexes and gpu_queues: available_gpus, gpu_queues = self._setup_dynamic_gpus(gpu_queues) # multi instance support @@ -812,7 +816,7 @@ class Worker(ServiceCommandSection): self.report_monitor(ResourceMonitor.StatusReport(queues=queues, queue=queue, task=task_id)) org_gpus = os.environ.get('NVIDIA_VISIBLE_DEVICES') - worker_id = self.worker_id + dynamic_gpus_worker_id = self.worker_id # the following is only executed in dynamic gpus mode if gpu_queues and gpu_queues.get(queue): # pick the first available GPUs @@ -836,7 +840,7 @@ class Worker(ServiceCommandSection): self.run_one_task(queue, task_id, worker_params) if gpu_queues: - self.worker_id = worker_id + self.worker_id = dynamic_gpus_worker_id os.environ['CUDA_VISIBLE_DEVICES'] = \ os.environ['NVIDIA_VISIBLE_DEVICES'] = org_gpus @@ -864,18 +868,18 @@ class Worker(ServiceCommandSection): if shutdown_docker_process(docker_cmd_contains='--id {}\'\"'.format(t_id)): self.handle_task_termination(task_id=t_id, exit_code=0, stop_reason=TaskStopReason.stopped) else: + # if we are in dynamic gpus / services mode, + # we should send termination signal to all child processes + if self._services_mode: + terminate_all_child_processes(timeout=120) + # if we are here, just kill all sub processes kill_all_child_processes() - for t_id in set(list_task_gpus_ids.values()): - # check if Task is running, - task_info = get_task( - self._session, t_id, only_fields=["status"] - ) - # this is a bit risky we might have rerun it again after it already completed - # basically we are not removing completed tasks from the list, hence the issue - if str(task_info.status) == "in_progress": - self.handle_task_termination( - task_id=t_id, exit_code=0, stop_reason=TaskStopReason.stopped) + + # unregister dynamic GPU worker, if we were terminated while setting up a Task + if dynamic_gpus_worker_id: + self.worker_id = dynamic_gpus_worker_id + self._unregister() def _dynamic_gpu_get_available(self, gpu_indexes): # noinspection PyBroadException @@ -1792,15 +1796,16 @@ class Worker(ServiceCommandSection): debug=self._session.debug_mode, trace=self._session.trace, ) - self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id)) - self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker) + try: + self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id)) + self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker) + finally: + self.stop_monitor() + self._unregister() - self.stop_monitor() - self._unregister() - - if full_monitoring and self.temp_config_path: - safe_remove_file(self._session.config_file) - Singleton.close_pid_file() + if full_monitoring and self.temp_config_path: + safe_remove_file(self._session.config_file) + Singleton.close_pid_file() return self._session.print_configuration() diff --git a/clearml_agent/helper/process.py b/clearml_agent/helper/process.py index b8f4ef9..427d9c9 100644 --- a/clearml_agent/helper/process.py +++ b/clearml_agent/helper/process.py @@ -42,18 +42,18 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False): return output if not strip or not output else output.strip() -def terminate_process(pid, timeout=10.): +def terminate_process(pid, timeout=10., ignore_zombie=True): # noinspection PyBroadException try: proc = psutil.Process(pid) proc.terminate() cnt = 0 - while proc.is_running() and cnt < timeout: + while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout: sleep(1.) cnt += 1 proc.terminate() cnt = 0 - while proc.is_running() and cnt < timeout: + while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout: sleep(1.) cnt += 1 proc.kill() @@ -84,6 +84,23 @@ def kill_all_child_processes(pid=None): parent.kill() +def terminate_all_child_processes(pid=None, timeout=10., include_parent=True): + # get current process if pid not provided + if not pid: + pid = os.getpid() + include_parent = False + try: + parent = psutil.Process(pid) + except psutil.Error: + # could not find parent process id + return + for child in parent.children(recursive=False): + print('Terminating child process {}'.format(child.pid)) + terminate_process(child.pid, timeout=timeout, ignore_zombie=False) + if include_parent: + terminate_process(parent.pid, timeout=timeout, ignore_zombie=False) + + def get_docker_id(docker_cmd_contains): try: containers_running = get_bash_output(cmd='docker ps --no-trunc --format \"{{.ID}}: {{.Command}}\"') From 08ff5e6db7d8efae8f0cd6fff7728032d2851dd6 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Sun, 25 Apr 2021 10:47:49 +0300 Subject: [PATCH 02/21] Add number of pods limit to k8s glue --- clearml_agent/glue/k8s.py | 55 +++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/clearml_agent/glue/k8s.py b/clearml_agent/glue/k8s.py index d00abd2..8fdf2be 100644 --- a/clearml_agent/glue/k8s.py +++ b/clearml_agent/glue/k8s.py @@ -32,7 +32,7 @@ class K8sIntegration(Worker): K8S_DEFAULT_NAMESPACE = "clearml" - KUBECTL_APPLY_CMD = "kubectl apply -f" + KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f" KUBECTL_RUN_CMD = "kubectl run clearml-{queue_name}-id-{task_id} " \ "--image {docker_image} " \ @@ -95,6 +95,7 @@ class K8sIntegration(Worker): clearml_conf_file=None, extra_bash_init_script=None, namespace=None, + max_pods_limit=None, **kwargs ): """ @@ -122,6 +123,7 @@ class K8sIntegration(Worker): :param str clearml_conf_file: clearml.conf file to be use by the pod itself (optional) :param str extra_bash_init_script: Additional bash script to run before starting the Task inside the container :param str namespace: K8S namespace to be used when creating the new pods (default: clearml) + :param int max_pods_limit: Maximum number of pods that K8S glue can run at the same time """ super(K8sIntegration, self).__init__() self.k8s_pending_queue_name = k8s_pending_queue_name or self.K8S_PENDING_QUEUE @@ -147,6 +149,7 @@ class K8sIntegration(Worker): self.namespace = namespace or self.K8S_DEFAULT_NAMESPACE self.pod_limits = [] self.pod_requests = [] + self.max_pods_limit = max_pods_limit if not self.ports_mode else None if overrides_yaml: with open(os.path.expandvars(os.path.expanduser(str(overrides_yaml))), 'rt') as f: overrides = yaml.load(f, Loader=getattr(yaml, 'FullLoader', None)) @@ -311,13 +314,19 @@ class K8sIntegration(Worker): # Search for a free pod number pod_count = 0 pod_number = self.base_pod_num - while self.ports_mode: + while self.ports_mode or self.max_pods_limit: pod_number = self.base_pod_num + pod_count - kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n {namespace}".format( - pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number), - agent_label=self.AGENT_LABEL, - namespace=self.namespace, - ) + if self.ports_mode: + kubectl_cmd_new = "kubectl get pods -l {pod_label},{agent_label} -n {namespace}".format( + pod_label=self.LIMIT_POD_LABEL.format(pod_number=pod_number), + agent_label=self.AGENT_LABEL, + namespace=self.namespace, + ) + else: + kubectl_cmd_new = "kubectl get pods -l {agent_label} -n {namespace} -o json".format( + agent_label=self.AGENT_LABEL, + namespace=self.namespace, + ) process = subprocess.Popen(kubectl_cmd_new.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE) output, error = process.communicate() output = '' if not output else output if isinstance(output, str) else output.decode('utf-8') @@ -326,18 +335,42 @@ class K8sIntegration(Worker): if not output: # No such pod exist so we can use the pod_number we found break - if pod_count >= self.num_of_services - 1: - # All pod numbers are taken, exit + + if self.max_pods_limit: + try: + current_pod_count = len(json.loads(output).get("items", [])) + except (ValueError, TypeError) as ex: + self.log.warning( + "K8S Glue pods monitor: Failed parsing kubectl output:\n{}\ntask '{}' " + "will be enqueued back to queue '{}'\nEx: {}".format( + output, task_id, queue, ex + ) + ) + self._session.api_client.tasks.reset(task_id) + self._session.api_client.tasks.enqueue(task_id, queue=queue, status_reason='kubectl parsing error') + return + max_count = self.max_pods_limit + else: + current_pod_count = pod_count + max_count = self.num_of_services - 1 + + if current_pod_count >= max_count: + # All pods are taken, exit + self.log.debug( + "kubectl last result: {}\n{}".format(error, output)) self.log.warning( - "kubectl last result: {}\n{}\nAll k8s services are in use, task '{}' " + "All k8s services are in use, task '{}' " "will be enqueued back to queue '{}'".format( - error, output, task_id, queue + task_id, queue ) ) self._session.api_client.tasks.reset(task_id) self._session.api_client.tasks.enqueue( task_id, queue=queue, status_reason='k8s max pod limit (no free k8s service)') return + elif self.max_pods_limit: + # max pods limit hasn't reached yet, so we can create the pod + break pod_count += 1 labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + [self.AGENT_LABEL] From 24dc59e31fc1814c98b81719db970cdaf675ecb6 Mon Sep 17 00:00:00 2001 From: Revital Date: Thu, 22 Apr 2021 14:14:38 +0300 Subject: [PATCH 03/21] add space to help message --- clearml_agent/interface/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_agent/interface/worker.py b/clearml_agent/interface/worker.py index 7467840..ad5da38 100644 --- a/clearml_agent/interface/worker.py +++ b/clearml_agent/interface/worker.py @@ -50,7 +50,7 @@ DAEMON_ARGS = dict({ }, '--docker': { 'help': 'Run execution task inside a docker (v19.03 and above). Optional args or ' - 'specify default docker image in agent.default_docker.image / agent.default_docker.arguments' + 'specify default docker image in agent.default_docker.image / agent.default_docker.arguments ' 'use --gpus/--cpu-only (or set NVIDIA_VISIBLE_DEVICES) to limit gpu visibility for docker', 'nargs': '*', 'default': False, From 823b67a3ce539b3d5a919221cd1c41302d6e2c29 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 28 Apr 2021 13:17:37 +0300 Subject: [PATCH 04/21] Deprecate venv_update (replaced by the more robust venvs_cache) --- docs/clearml.conf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/clearml.conf b/docs/clearml.conf index bd0c7eb..859333f 100644 --- a/docs/clearml.conf +++ b/docs/clearml.conf @@ -107,11 +107,12 @@ agent { path: ~/.clearml/vcs-cache }, + # DEPRECATED: please use `venvs_cache` and set `venvs_cache.path` # use venv-update in order to accelerate python virtual environment building # Still in beta, turned off by default - venv_update: { - enabled: false, - }, + # venv_update: { + # enabled: false, + # }, # cached folder for specific python package download (mostly pytorch versions) pip_download_cache { From 3ec2a3a92e73ee22bec2a51945e54946aab398a4 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 28 Apr 2021 13:19:34 +0300 Subject: [PATCH 05/21] Add k8s pod limit to k8s glue example --- examples/k8s_glue_example.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/k8s_glue_example.py b/examples/k8s_glue_example.py index b368b50..dc69c37 100644 --- a/examples/k8s_glue_example.py +++ b/examples/k8s_glue_example.py @@ -10,12 +10,15 @@ from clearml_agent.glue.k8s import K8sIntegration def parse_args(): parser = ArgumentParser() + group = parser.add_mutually_exclusive_group() + parser.add_argument( "--queue", type=str, help="Queue to pull tasks from" ) - parser.add_argument( + group.add_argument( "--ports-mode", action='store_true', default=False, help="Ports-Mode will add a label to the pod which can be used as service, in order to expose ports" + "Should not be used with max-pods" ) parser.add_argument( "--num-of-services", type=int, default=20, @@ -57,6 +60,11 @@ def parse_args(): "--namespace", type=str, help="Specify the namespace in which pods will be created (default: %(default)s)", default="clearml" ) + group.add_argument( + "--max-pods", type=int, + help="Limit the maximum number of pods that this service can run at the same time." + "Should not be used with ports-mode" + ) return parser.parse_args() @@ -77,7 +85,7 @@ def main(): user_props_cb=user_props_cb, overrides_yaml=args.overrides_yaml, clearml_conf_file=args.pod_clearml_conf, template_yaml=args.template_yaml, extra_bash_init_script=K8sIntegration.get_ssh_server_bash( ssh_port_number=args.ssh_server_port) if args.ssh_server_port else None, - namespace=args.namespace, + namespace=args.namespace, max_pods_limit=args.max_pods or None, ) k8s.k8s_daemon(args.queue) From 4f18bb7ea0600db6ee63ecb5ad0ea8d048a272ca Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 28 Apr 2021 13:20:13 +0300 Subject: [PATCH 06/21] Add k8s glue default restartPolicy=Never to template to prevent pods from restarting --- clearml_agent/glue/k8s.py | 1 + 1 file changed, 1 insertion(+) diff --git a/clearml_agent/glue/k8s.py b/clearml_agent/glue/k8s.py index 8fdf2be..cdbeea7 100644 --- a/clearml_agent/glue/k8s.py +++ b/clearml_agent/glue/k8s.py @@ -446,6 +446,7 @@ class K8sIntegration(Worker): template['metadata']['name'] = name template.setdefault('spec', {}) template['spec'].setdefault('containers', []) + template['spec'].setdefault('restartPolicy', 'Never') if labels: labels_dict = dict(pair.split('=', 1) for pair in labels) template['metadata'].setdefault('labels', {}) From cec6420c8f40d92ab1cd6cbe5ca8f24cf351abd8 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Mon, 3 May 2021 18:33:53 +0300 Subject: [PATCH 07/21] Version bump to v1.0.0 --- clearml_agent/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_agent/version.py b/clearml_agent/version.py index 84d3488..1f356cc 100644 --- a/clearml_agent/version.py +++ b/clearml_agent/version.py @@ -1 +1 @@ -__version__ = '0.17.2' +__version__ = '1.0.0' From a2db1f5ab5cbf178840da736afdc370cfff43f0f Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 5 May 2021 11:58:37 +0300 Subject: [PATCH 08/21] Remove queue name from pod name in k8s glue, add queue name and ID to pod labels (issue #64) --- clearml_agent/glue/k8s.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/clearml_agent/glue/k8s.py b/clearml_agent/glue/k8s.py index cdbeea7..4b5375a 100644 --- a/clearml_agent/glue/k8s.py +++ b/clearml_agent/glue/k8s.py @@ -34,7 +34,7 @@ class K8sIntegration(Worker): KUBECTL_APPLY_CMD = "kubectl apply --namespace={namespace} -f" - KUBECTL_RUN_CMD = "kubectl run clearml-{queue_name}-id-{task_id} " \ + KUBECTL_RUN_CMD = "kubectl run clearml-id-{task_id} " \ "--image {docker_image} " \ "--restart=Never " \ "--namespace={namespace}" @@ -307,10 +307,6 @@ class K8sIntegration(Worker): except Exception: queue_name = 'k8s' - # conform queue name to k8s standards - safe_queue_name = queue_name.lower().strip() - safe_queue_name = re.sub(r'\W+', '', safe_queue_name).replace('_', '').replace('-', '') - # Search for a free pod number pod_count = 0 pod_number = self.base_pod_num @@ -374,6 +370,8 @@ class K8sIntegration(Worker): pod_count += 1 labels = ([self.LIMIT_POD_LABEL.format(pod_number=pod_number)] if self.ports_mode else []) + [self.AGENT_LABEL] + labels.append("clearml-agent-queue={}".format(self._safe_k8s_label_value(queue))) + labels.append("clearml-agent-queue-name={}".format(self._safe_k8s_label_value(queue_name))) if self.ports_mode: print("Kubernetes scheduling task id={} on pod={} (pod_count={})".format(task_id, pod_number, pod_count)) @@ -384,13 +382,13 @@ class K8sIntegration(Worker): output, error = self._kubectl_apply( create_clearml_conf=create_clearml_conf, labels=labels, docker_image=docker_image, docker_args=docker_args, - task_id=task_id, queue=queue, queue_name=safe_queue_name) + task_id=task_id, queue=queue) else: output, error = self._kubectl_run( create_clearml_conf=create_clearml_conf, labels=labels, docker_image=docker_cmd, task_data=task_data, - task_id=task_id, queue=queue, queue_name=safe_queue_name) + task_id=task_id, queue=queue) error = '' if not error else (error if isinstance(error, str) else error.decode('utf-8')) output = '' if not output else (output if isinstance(output, str) else output.decode('utf-8')) @@ -437,12 +435,12 @@ class K8sIntegration(Worker): self.log.warning('skipping docker argument {} (only -e --env supported)'.format(cmd)) return {'env': kube_args} if kube_args else {} - def _kubectl_apply(self, create_clearml_conf, docker_image, docker_args, labels, queue, task_id, queue_name): + def _kubectl_apply(self, create_clearml_conf, docker_image, docker_args, labels, queue, task_id): template = deepcopy(self.template_dict) template.setdefault('apiVersion', 'v1') template['kind'] = 'Pod' template.setdefault('metadata', {}) - name = 'clearml-{queue}-id-{task_id}'.format(queue=queue_name, task_id=task_id) + name = 'clearml-id-{task_id}'.format(task_id=task_id) template['metadata']['name'] = name template.setdefault('spec', {}) template['spec'].setdefault('containers', []) @@ -508,12 +506,11 @@ class K8sIntegration(Worker): return output, error - def _kubectl_run(self, create_clearml_conf, docker_image, labels, queue, task_data, task_id, queue_name): + def _kubectl_run(self, create_clearml_conf, docker_image, labels, queue, task_data, task_id): if callable(self.kubectl_cmd): - kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data, queue_name) + kubectl_cmd = self.kubectl_cmd(task_id, docker_image, queue, task_data) else: kubectl_cmd = self.kubectl_cmd.format( - queue_name=queue_name, task_id=task_id, docker_image=docker_image, queue_id=queue, @@ -641,3 +638,13 @@ class K8sIntegration(Worker): return merge_dicts( c1, c2, custom_merge_func=merge_env ) + + @staticmethod + def _safe_k8s_label_value(value): + """ Conform string to k8s standards for a label value """ + value = value.lower().strip() + value = re.sub(r'^[^A-Za-z0-9]+', '', value) # strip leading non-alphanumeric chars + value = re.sub(r'[^A-Za-z0-9]+$', '', value) # strip trailing non-alphanumeric chars + value = re.sub(r'\W+', '-', value) # allow only word chars (this removed "." which is supported, but nvm) + value = re.sub(r'-+', '-', value) # don't leave messy "--" after replacing previous chars + return value[:63] From ae3d0345313f10f31c83163bab936472577d4e69 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 12 May 2021 15:45:31 +0300 Subject: [PATCH 09/21] Protect against None in execution.repository --- clearml_agent/helper/repo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_agent/helper/repo.py b/clearml_agent/helper/repo.py index f39a4f8..551c9f3 100644 --- a/clearml_agent/helper/repo.py +++ b/clearml_agent/helper/repo.py @@ -591,7 +591,7 @@ def clone_repository_cached(session, execution, destination): # mock lock repo_lock = Lock() repo_lock_timeout_sec = 300 - repo_url = execution.repository # type: str + repo_url = execution.repository or '' # type: str parsed_url = furl(repo_url) no_password_url = parsed_url.copy().remove(password=True).url From 4f7407084d1900a79d455570c573e60f40208742 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 12 May 2021 15:46:25 +0300 Subject: [PATCH 10/21] Fix standalone script with pre-exiting conda venv --- clearml_agent/commands/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index 3de4fb0..55db405 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -187,7 +187,7 @@ class LiteralScriptManager(object): location = location or (repo_info and repo_info.root) if not location: location = Path(self.venv_folder, "code") - location.mkdir(exist_ok=True) + location.mkdir(exist_ok=True, parents=True) log.debug("selected execution directory: %s", location) return Text(location), self.write(task, location, execution.entry_point) From 1e795beec87be3673dfbd33735763d51f3670cd4 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 19 May 2021 15:20:03 +0300 Subject: [PATCH 11/21] Fix support for spaces in docker arguments (issue #358) --- clearml_agent/commands/worker.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index 55db405..36cef19 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -11,6 +11,7 @@ import subprocess import sys import shutil import traceback +import shlex from collections import defaultdict from copy import deepcopy, copy from datetime import datetime @@ -221,6 +222,9 @@ def get_task(session, task_id, *args, **kwargs): def get_task_container(session, task_id): + """ + Returns dict with Task docker container setup {container: '', arguments: '', setup_shell_script: ''} + """ if session.check_min_api_version("2.13"): result = session.send_request( service='tasks', @@ -233,12 +237,12 @@ def get_task_container(session, task_id): try: container = result.json()['data']['tasks'][0]['container'] if result.ok else {} if container.get('arguments'): - container['arguments'] = str(container.get('arguments')).split(' ') + container['arguments'] = shlex.split(str(container.get('arguments')).strip()) except (ValueError, TypeError): container = {} else: response = get_task(session, task_id, only_fields=["execution.docker_cmd"]) - task_docker_cmd_parts = str(response.execution.docker_cmd or '').strip().split(' ') + task_docker_cmd_parts = shlex.split(str(response.execution.docker_cmd or '').strip()) try: container = dict( container=task_docker_cmd_parts[0], @@ -251,11 +255,14 @@ def get_task_container(session, task_id): def set_task_container(session, task_id, docker_image=None, docker_arguments=None, docker_bash_script=None): + if docker_arguments and isinstance(docker_arguments, str): + docker_arguments = [docker_arguments] + if session.check_min_api_version("2.13"): container = dict( - image=docker_image or None, - arguments=' '.join(docker_arguments) if docker_arguments else None, - setup_shell_script=docker_bash_script or None, + image=docker_image or '', + arguments=' '.join(docker_arguments) if docker_arguments else '', + setup_shell_script=docker_bash_script or '', ) result = session.send_request( service='tasks', @@ -1913,7 +1920,6 @@ class Worker(ServiceCommandSection): if current_task.script.binary and current_task.script.binary.startswith('python') and \ execution.entry_point and execution.entry_point.split()[0].strip() == '-m': # we need to split it - import shlex extra.extend(shlex.split(execution.entry_point)) else: extra.append(execution.entry_point) From 3c4e976093927ce9b1fc0c14a91c13b71d855e18 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 19 May 2021 15:20:44 +0300 Subject: [PATCH 12/21] Add agent.ignore_requested_python_version to config file --- clearml_agent/backend_api/config/default/agent.conf | 3 +++ docs/clearml.conf | 3 +++ 2 files changed, 6 insertions(+) diff --git a/clearml_agent/backend_api/config/default/agent.conf b/clearml_agent/backend_api/config/default/agent.conf index de82bf3..a66d675 100644 --- a/clearml_agent/backend_api/config/default/agent.conf +++ b/clearml_agent/backend_api/config/default/agent.conf @@ -26,6 +26,9 @@ # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6" # The default is the python executing the clearml_agent python_binary: "" + # ignore any requested python version (Default: False, if a Task was using a + # specific python version and the system supports multiple python the agent will use the requested python version) + # ignore_requested_python_version: true # select python package manager: # currently supported pip and conda diff --git a/docs/clearml.conf b/docs/clearml.conf index 859333f..05e0922 100644 --- a/docs/clearml.conf +++ b/docs/clearml.conf @@ -42,6 +42,9 @@ agent { # Example values: "/usr/bin/python3" or "/usr/local/bin/python3.6" # The default is the python executing the clearml_agent python_binary: "" + # ignore any requested python version (Default: False, if a Task was using a + # specific python version and the system supports multiple python the agent will use the requested python version) + # ignore_requested_python_version: true # select python package manager: # currently supported pip and conda From e93384b99bdfd72a54cf2b68b3991b145b504b79 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Thu, 20 May 2021 10:58:46 +0300 Subject: [PATCH 13/21] Fix --stop with dynamic gpus --- clearml_agent/commands/worker.py | 11 +++++++++-- clearml_agent/helper/process.py | 18 ++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index 36cef19..a93d900 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -878,7 +878,7 @@ class Worker(ServiceCommandSection): # if we are in dynamic gpus / services mode, # we should send termination signal to all child processes if self._services_mode: - terminate_all_child_processes(timeout=120) + terminate_all_child_processes(timeout=20, include_parent=False) # if we are here, just kill all sub processes kill_all_child_processes() @@ -1371,6 +1371,7 @@ class Worker(ServiceCommandSection): service_mode_internal_agent_started = None stopping = False status = None + process = None try: _last_machine_update_ts = time() stop_reason = None @@ -1427,6 +1428,8 @@ class Worker(ServiceCommandSection): status = ex.returncode except KeyboardInterrupt: # so someone else will catch us + if process: + kill_all_child_processes(process.pid) raise except Exception: # we should not get here, but better safe than sorry @@ -1438,6 +1441,10 @@ class Worker(ServiceCommandSection): stop_reason = TaskStopReason.exception status = -1 + # full cleanup (just in case) + if process: + kill_all_child_processes(process.pid) + # if running in services mode, keep the file open # in case the docker was so quick it started and finished, check the stop reason if self._services_mode and service_mode_internal_agent_started and stop_reason == 'Service started': @@ -3091,7 +3098,7 @@ class Worker(ServiceCommandSection): warning('Could not terminate process pid={}'.format(pid)) return True - # wither we have a match for the worker_id or we just pick the first one, and kill it. + # either we have a match for the worker_id or we just pick the first one, and kill it. if (worker_id and uid == worker_id) or (not worker_id and uid.startswith('{}:'.format(worker_name))): # this is us kill it print('Terminating clearml-agent worker_id={} pid={}'.format(uid, pid)) diff --git a/clearml_agent/helper/process.py b/clearml_agent/helper/process.py index 427d9c9..c92c3cf 100644 --- a/clearml_agent/helper/process.py +++ b/clearml_agent/helper/process.py @@ -42,20 +42,31 @@ def get_bash_output(cmd, strip=False, stderr=subprocess.STDOUT, stdin=False): return output if not strip or not output else output.strip() -def terminate_process(pid, timeout=10., ignore_zombie=True): +def terminate_process(pid, timeout=10., ignore_zombie=True, include_children=False): # noinspection PyBroadException try: proc = psutil.Process(pid) + children = proc.children(recursive=True) if include_children else [] proc.terminate() cnt = 0 while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout: sleep(1.) cnt += 1 proc.terminate() + + # terminate children + for c in children: + c.terminate() + cnt = 0 while proc.is_running() and (ignore_zombie or proc.status() != 'zombie') and cnt < timeout: sleep(1.) cnt += 1 + + # kill children + for c in children: + c.kill() + proc.kill() except Exception: pass @@ -66,9 +77,8 @@ def terminate_process(pid, timeout=10., ignore_zombie=True): return True -def kill_all_child_processes(pid=None): +def kill_all_child_processes(pid=None, include_parent=True): # get current process if pid not provided - include_parent = True if not pid: pid = os.getpid() include_parent = False @@ -96,7 +106,7 @@ def terminate_all_child_processes(pid=None, timeout=10., include_parent=True): return for child in parent.children(recursive=False): print('Terminating child process {}'.format(child.pid)) - terminate_process(child.pid, timeout=timeout, ignore_zombie=False) + terminate_process(child.pid, timeout=timeout, ignore_zombie=False, include_children=True) if include_parent: terminate_process(parent.pid, timeout=timeout, ignore_zombie=False) From 742cbf57670815a80a0c502ef61da12521e1e71f Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Tue, 25 May 2021 19:31:45 +0300 Subject: [PATCH 14/21] Add docker environment arguments log masking support (issue #67) --- .../backend_api/config/default/agent.conf | 12 ++++ clearml_agent/commands/worker.py | 55 ++++++++++++++++--- clearml_agent/definitions.py | 12 ++-- clearml_agent/helper/process.py | 5 +- 4 files changed, 70 insertions(+), 14 deletions(-) diff --git a/clearml_agent/backend_api/config/default/agent.conf b/clearml_agent/backend_api/config/default/agent.conf index a66d675..fa37231 100644 --- a/clearml_agent/backend_api/config/default/agent.conf +++ b/clearml_agent/backend_api/config/default/agent.conf @@ -185,4 +185,16 @@ # should be detected automatically. Override with os environment CUDA_VERSION / CUDNN_VERSION # cuda_version: 10.1 # cudnn_version: 7.6 + + # Hide docker environment variables containing secrets when printing out the docker command by replacing their + # values with "********". Turning this feature on will hide the following environment variables values: + # CLEARML_API_SECRET_KEY, CLEARML_AGENT_GIT_PASS, AWS_SECRET_ACCESS_KEY, AZURE_STORAGE_KEY + # To include more environment variables, add their keys to the "extra_keys" list. E.g. to make sure the value of + # your custom environment variable named MY_SPECIAL_PASSWORD will not show in the logs when included in the + # docker command, set: + # extra_keys: ["MY_SPECIAL_PASSWORD"] + hide_docker_command_env_vars { + enabled: true + extra_keys: [] + } } diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index a93d900..8e58f14 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -20,7 +20,7 @@ from functools import partial, cmp_to_key from itertools import chain from tempfile import mkdtemp, NamedTemporaryFile from time import sleep, time -from typing import Text, Optional, Any, Tuple +from typing import Text, Optional, Any, Tuple, List import attr import psutil @@ -44,7 +44,13 @@ from clearml_agent.definitions import ( ENV_DOCKER_HOST_MOUNT, ENV_TASK_EXTRA_PYTHON_PATH, ENV_AGENT_GIT_USER, - ENV_AGENT_GIT_PASS, ENV_WORKER_ID, ENV_DOCKER_SKIP_GPUS_FLAG, ) + ENV_AGENT_GIT_PASS, + ENV_WORKER_ID, + ENV_DOCKER_SKIP_GPUS_FLAG, + ENV_AGENT_SECRET_KEY, + ENV_AWS_SECRET_KEY, + ENV_AZURE_ACCOUNT_KEY, +) from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES from clearml_agent.errors import APIError, CommandFailedError, Sigterm from clearml_agent.helper.base import ( @@ -68,7 +74,9 @@ from clearml_agent.helper.base import ( get_python_path, is_linux_platform, rm_file, - add_python_path, safe_remove_tree, ) + add_python_path, + safe_remove_tree, +) from clearml_agent.helper.console import ensure_text, print_text, decode_binary_lines from clearml_agent.helper.os.daemonize import daemonize_process from clearml_agent.helper.package.base import PackageManager @@ -91,7 +99,10 @@ from clearml_agent.helper.process import ( get_bash_output, shutdown_docker_process, get_docker_id, - commit_docker, terminate_process, check_if_command_exists, terminate_all_child_processes, + commit_docker, + terminate_process, + check_if_command_exists, + terminate_all_child_processes, ) from clearml_agent.helper.package.priority_req import PriorityPackageRequirement, PackageCollectorRequirement from clearml_agent.helper.repo import clone_repository_cached, RepoInfo, VCS, fix_package_import_diff_patch @@ -621,10 +632,13 @@ class Worker(ServiceCommandSection): '--standalone-mode' if self._standalone_mode else '', task_id) - # send the actual used command line to the backend - self.send_logs(task_id=task_id, lines=['Executing: {}\n'.format(full_docker_cmd)], level="INFO") + display_docker_command = self._sanitize_docker_command(full_docker_cmd) + + # send the actual used command line to the backend + self.send_logs(task_id=task_id, lines=['Executing: {}\n'.format(display_docker_command)], level="INFO") + + cmd = Argv(*full_docker_cmd, display_argv=display_docker_command) - cmd = Argv(*full_docker_cmd) print('Running Docker:\n{}\n'.format(str(cmd))) else: cmd = worker_args.get_argv_for_command("execute") + ( @@ -3161,6 +3175,33 @@ class Worker(ServiceCommandSection): queue_ids.append(q_id) return queue_ids + def _sanitize_docker_command(self, docker_command): + # type: (List[str]) -> List[str] + if not self._session.config.get('agent.hide_docker_command_env_vars.enabled', False): + return docker_command + + keys = set(self._session.config.get('agent.hide_docker_command_env_vars.extra_keys', [])) + keys.update( + ENV_AGENT_GIT_PASS.vars, + ENV_AGENT_SECRET_KEY.vars, + ENV_AWS_SECRET_KEY.vars, + ENV_AZURE_ACCOUNT_KEY.vars + ) + + result = docker_command[:] + for i, item in enumerate(docker_command): + try: + if item not in ("-e", "--env"): + continue + key, sep, _ = result[i + 1].partition("=") + if key not in keys or not sep: + continue + result[i + 1] = "{}={}".format(key, "********") + except KeyError: + pass + + return result + if __name__ == "__main__": pass diff --git a/clearml_agent/definitions.py b/clearml_agent/definitions.py index c71db5f..6aa0451 100644 --- a/clearml_agent/definitions.py +++ b/clearml_agent/definitions.py @@ -62,6 +62,10 @@ class EnvironmentConfig(object): return None +ENV_AGENT_SECRET_KEY = EnvironmentConfig("CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY") +ENV_AWS_SECRET_KEY = EnvironmentConfig("AWS_SECRET_ACCESS_KEY") +ENV_AZURE_ACCOUNT_KEY = EnvironmentConfig("AZURE_STORAGE_KEY") + ENVIRONMENT_CONFIG = { "api.api_server": EnvironmentConfig("CLEARML_API_HOST", "TRAINS_API_HOST", ), "api.files_server": EnvironmentConfig("CLEARML_FILES_HOST", "TRAINS_FILES_HOST", ), @@ -69,9 +73,7 @@ ENVIRONMENT_CONFIG = { "api.credentials.access_key": EnvironmentConfig( "CLEARML_API_ACCESS_KEY", "TRAINS_API_ACCESS_KEY", ), - "api.credentials.secret_key": EnvironmentConfig( - "CLEARML_API_SECRET_KEY", "TRAINS_API_SECRET_KEY", - ), + "api.credentials.secret_key": ENV_AGENT_SECRET_KEY, "agent.worker_name": EnvironmentConfig("CLEARML_WORKER_NAME", "TRAINS_WORKER_NAME", ), "agent.worker_id": EnvironmentConfig("CLEARML_WORKER_ID", "TRAINS_WORKER_ID", ), "agent.cuda_version": EnvironmentConfig( @@ -84,10 +86,10 @@ ENVIRONMENT_CONFIG = { names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool ), "sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"), - "sdk.aws.s3.secret": EnvironmentConfig("AWS_SECRET_ACCESS_KEY"), + "sdk.aws.s3.secret": ENV_AWS_SECRET_KEY, "sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"), "sdk.azure.storage.containers.0": {'account_name': EnvironmentConfig("AZURE_STORAGE_ACCOUNT"), - 'account_key': EnvironmentConfig("AZURE_STORAGE_KEY")}, + 'account_key': ENV_AZURE_ACCOUNT_KEY}, "sdk.google.storage.credentials_json": EnvironmentConfig("GOOGLE_APPLICATION_CREDENTIALS"), } diff --git a/clearml_agent/helper/process.py b/clearml_agent/helper/process.py index c92c3cf..fc86785 100644 --- a/clearml_agent/helper/process.py +++ b/clearml_agent/helper/process.py @@ -221,6 +221,7 @@ class Argv(Executable): """ self.argv = argv self._log = kwargs.pop("log", None) + self._display_argv = kwargs.pop("display_argv", argv) if not self._log: self._log = logging.getLogger(__name__) self._log.propagate = False @@ -245,10 +246,10 @@ class Argv(Executable): return self.argv def __repr__(self): - return "".format(self.argv) + return "".format(self._display_argv) def __str__(self): - return "Executing: {}".format(self.argv) + return "Executing: {}".format(self._display_argv) def __iter__(self): if is_windows_platform(): From 0694b9e8afcec119aec35fbe7e2908e283ddcc8d Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 26 May 2021 18:33:35 +0300 Subject: [PATCH 15/21] Fix PyYAML supported versions --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ba96007..5186b3f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ pyhocon>=0.3.38,<0.4.0 pyparsing>=2.0.3,<2.5.0 python-dateutil>=2.4.2,<2.9.0 pyjwt>=1.6.4,<2.1.0 -PyYAML>=3.12,<5.4.0 +PyYAML>=3.12,<5.5.0 requests>=2.20.0,<2.26.0 six>=1.11.0,<1.16.0 typing>=3.6.4,<3.8.0 From 3a07bfe1d731aaff7f1f783373a68820c58f55af Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Mon, 31 May 2021 23:19:46 +0300 Subject: [PATCH 16/21] Version bump --- clearml_agent/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_agent/version.py b/clearml_agent/version.py index 1f356cc..e697a0c 100644 --- a/clearml_agent/version.py +++ b/clearml_agent/version.py @@ -1 +1 @@ -__version__ = '1.0.0' +__version__ = '1.0.1rc0' From 7e90ebd5db1c289a07df67a0d59de5dc861b61f0 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 2 Jun 2021 13:16:17 +0300 Subject: [PATCH 17/21] Fix _dynamic_gpu_get_available worker timeout increase to 10 minutes --- clearml_agent/commands/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index 8e58f14..a980a6f 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -905,7 +905,7 @@ class Worker(ServiceCommandSection): def _dynamic_gpu_get_available(self, gpu_indexes): # noinspection PyBroadException try: - response = self._session.send_api(workers_api.GetAllRequest(last_seen=60)) + response = self._session.send_api(workers_api.GetAllRequest(last_seen=600)) except Exception: return None From 8c56777125d3b90936f52db3d5ca8de927a810f5 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 2 Jun 2021 13:16:58 +0300 Subject: [PATCH 18/21] Add CLEARML_AGENT_DISABLE_SSH_MOUNT allowing disabling the auto .ssh mount into the docker --- clearml_agent/commands/worker.py | 45 ++++++++++++++++++++------------ clearml_agent/definitions.py | 1 + 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index a980a6f..7c45bc8 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -50,6 +50,7 @@ from clearml_agent.definitions import ( ENV_AGENT_SECRET_KEY, ENV_AWS_SECRET_KEY, ENV_AZURE_ACCOUNT_KEY, + ENV_AGENT_DISABLE_SSH_MOUNT, ) from clearml_agent.definitions import WORKING_REPOSITORY_DIR, PIP_EXTRA_INDICES from clearml_agent.errors import APIError, CommandFailedError, Sigterm @@ -2725,8 +2726,11 @@ class Worker(ServiceCommandSection): if temp_config.get("agent.venvs_cache.path", None): temp_config.put("agent.venvs_cache.path", '/root/.clearml/venvs-cache') - self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.') - self._temp_cleanup_list.append(self._host_ssh_cache) + if ENV_AGENT_DISABLE_SSH_MOUNT.get(): + self._host_ssh_cache = None + else: + self._host_ssh_cache = mkdtemp(prefix='clearml_agent.ssh.') + self._temp_cleanup_list.append(self._host_ssh_cache) return temp_config, partial(self._get_docker_config_cmd, temp_config=temp_config) @@ -2748,24 +2752,31 @@ class Worker(ServiceCommandSection): "agent.docker_pip_cache", '~/.clearml/pip-cache'))).expanduser().as_posix() # make sure all folders are valid - Path(host_apt_cache).mkdir(parents=True, exist_ok=True) - Path(host_pip_cache).mkdir(parents=True, exist_ok=True) - Path(host_cache).mkdir(parents=True, exist_ok=True) - Path(host_pip_dl).mkdir(parents=True, exist_ok=True) - Path(host_vcs_cache).mkdir(parents=True, exist_ok=True) - Path(host_ssh_cache).mkdir(parents=True, exist_ok=True) + if host_apt_cache: + Path(host_apt_cache).mkdir(parents=True, exist_ok=True) + if host_pip_cache: + Path(host_pip_cache).mkdir(parents=True, exist_ok=True) + if host_cache: + Path(host_cache).mkdir(parents=True, exist_ok=True) + if host_pip_dl: + Path(host_pip_dl).mkdir(parents=True, exist_ok=True) + if host_vcs_cache: + Path(host_vcs_cache).mkdir(parents=True, exist_ok=True) + if host_ssh_cache: + Path(host_ssh_cache).mkdir(parents=True, exist_ok=True) if host_venvs_cache: Path(host_venvs_cache).mkdir(parents=True, exist_ok=True) - # copy the .ssh folder to a temp folder, to be mapped into docker - # noinspection PyBroadException - try: - if Path(host_ssh_cache).is_dir(): - shutil.rmtree(host_ssh_cache, ignore_errors=True) - shutil.copytree(Path('~/.ssh').expanduser().as_posix(), host_ssh_cache) - except Exception: - host_ssh_cache = None - self.log.warning('Failed creating temporary copy of ~/.ssh for git credential') + if host_ssh_cache: + # copy the .ssh folder to a temp folder, to be mapped into docker + # noinspection PyBroadException + try: + if Path(host_ssh_cache).is_dir(): + shutil.rmtree(host_ssh_cache, ignore_errors=True) + shutil.copytree(Path('~/.ssh').expanduser().as_posix(), host_ssh_cache) + except Exception: + host_ssh_cache = None + self.log.warning('Failed creating temporary copy of ~/.ssh for git credential') # check if the .git credentials exist: try: diff --git a/clearml_agent/definitions.py b/clearml_agent/definitions.py index 6aa0451..6b0d8e2 100644 --- a/clearml_agent/definitions.py +++ b/clearml_agent/definitions.py @@ -134,6 +134,7 @@ ENV_DOCKER_SKIP_GPUS_FLAG = EnvironmentConfig('CLEARML_DOCKER_SKIP_GPUS_FLAG', ' ENV_AGENT_GIT_USER = EnvironmentConfig('CLEARML_AGENT_GIT_USER', 'TRAINS_AGENT_GIT_USER') ENV_AGENT_GIT_PASS = EnvironmentConfig('CLEARML_AGENT_GIT_PASS', 'TRAINS_AGENT_GIT_PASS') ENV_AGENT_GIT_HOST = EnvironmentConfig('CLEARML_AGENT_GIT_HOST', 'TRAINS_AGENT_GIT_HOST') +ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig('CLEARML_AGENT_DISABLE_SSH_MOUNT', type=bool) ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig('CLEARML_AGENT_EXEC_USER', 'TRAINS_AGENT_EXEC_USER') ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig('CLEARML_AGENT_EXTRA_PYTHON_PATH', 'TRAINS_AGENT_EXTRA_PYTHON_PATH') ENV_DOCKER_HOST_MOUNT = EnvironmentConfig('CLEARML_AGENT_K8S_HOST_MOUNT', 'CLEARML_AGENT_DOCKER_HOST_MOUNT', From eda597dea5d63dd7f8414610c4d6ad3bd9bb9d70 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 2 Jun 2021 13:17:37 +0300 Subject: [PATCH 19/21] Version bump --- clearml_agent/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_agent/version.py b/clearml_agent/version.py index e697a0c..6dada8f 100644 --- a/clearml_agent/version.py +++ b/clearml_agent/version.py @@ -1 +1 @@ -__version__ = '1.0.1rc0' +__version__ = '1.0.1rc1' From 29bf993be73f37bbcf6aae4e084b14b4a29576a6 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 2 Jun 2021 21:15:48 +0300 Subject: [PATCH 20/21] Add printout when using key/secret from env vars --- clearml_agent/backend_api/session/session.py | 6 ++++-- clearml_agent/backend_config/entry.py | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/clearml_agent/backend_api/session/session.py b/clearml_agent/backend_api/session/session.py index 27fc877..dbf8cf4 100644 --- a/clearml_agent/backend_api/session/session.py +++ b/clearml_agent/backend_api/session/session.py @@ -111,7 +111,8 @@ class Session(TokenManager): self._logger = logger self.__access_key = api_key or ENV_ACCESS_KEY.get( - default=(self.config.get("api.credentials.access_key", None) or self.default_key) + default=(self.config.get("api.credentials.access_key", None) or self.default_key), + value_cb=lambda key, value: logger.info("Using environment access key {}={}".format(key, value)) ) if not self.access_key: raise ValueError( @@ -119,7 +120,8 @@ class Session(TokenManager): ) self.__secret_key = secret_key or ENV_SECRET_KEY.get( - default=(self.config.get("api.credentials.secret_key", None) or self.default_secret) + default=(self.config.get("api.credentials.secret_key", None) or self.default_secret), + value_cb=lambda key, value: logger.info("Using environment secret key {}=********".format(key)) ) if not self.secret_key: raise ValueError( diff --git a/clearml_agent/backend_config/entry.py b/clearml_agent/backend_config/entry.py index 1107563..489d326 100644 --- a/clearml_agent/backend_config/entry.py +++ b/clearml_agent/backend_config/entry.py @@ -64,8 +64,8 @@ class Entry(object): converter = self.default_conversions().get(self.type, self.type) return converter(value) - def get_pair(self, default=NotSet, converter=None): - # type: (Any, Converter) -> Optional[Tuple[Text, Any]] + def get_pair(self, default=NotSet, converter=None, value_cb=None): + # type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Tuple[Text, Any]] for key in self.keys: value = self._get(key) if value is NotSet: @@ -75,13 +75,20 @@ class Entry(object): except Exception as ex: self.error("invalid value {key}={value}: {ex}".format(**locals())) break + # noinspection PyBroadException + try: + if value_cb: + value_cb(key, value) + except Exception: + pass return key, value + result = self.default if default is NotSet else default return self.key, result - def get(self, default=NotSet, converter=None): - # type: (Any, Converter) -> Optional[Any] - return self.get_pair(default=default, converter=converter)[1] + def get(self, default=NotSet, converter=None, value_cb=None): + # type: (Any, Converter, Callable[[str, Any], None]) -> Optional[Any] + return self.get_pair(default=default, converter=converter, value_cb=value_cb)[1] def set(self, value): # type: (Any, Any) -> (Text, Any) From 176b4a4cdec9c4303a946a82e22a579ae22c3355 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 16 Jun 2021 18:32:29 +0300 Subject: [PATCH 21/21] Fix --services-mode when the execute agent fails when starting to run with error code 0 --- clearml_agent/commands/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index 7c45bc8..17e0949 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -1423,7 +1423,7 @@ class Worker(ServiceCommandSection): # get diff from previous poll printed_lines, stdout_pos_count = _print_file(stdout_path, stdout_pos_count) - if self._services_mode and not stopping and not status: + if self._services_mode and not stopping and status is None: # if the internal agent started, we stop logging, it will take over logging. # if the internal agent started running the task itself, it will return status==0, # then we can quit the monitoring loop of this process