From edc333ba5f55e4418347d7c80f3ce89dac1f18bf Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Wed, 24 Jul 2024 17:46:27 +0300 Subject: [PATCH] Add K8S_GLUE_POD_USE_IMAGE_ENTRYPOINT to allow running images without overriding the entrypoint (useful for agents using prebuilt images in k8s) --- clearml_agent/definitions.py | 1 + clearml_agent/glue/definitions.py | 6 ++++++ clearml_agent/glue/k8s.py | 15 ++++++++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/clearml_agent/definitions.py b/clearml_agent/definitions.py index 182abf7..b99a95c 100644 --- a/clearml_agent/definitions.py +++ b/clearml_agent/definitions.py @@ -167,6 +167,7 @@ ENV_AGENT_GIT_USER = EnvironmentConfig("CLEARML_AGENT_GIT_USER", "TRAINS_AGENT_G ENV_AGENT_GIT_PASS = EnvironmentConfig("CLEARML_AGENT_GIT_PASS", "TRAINS_AGENT_GIT_PASS") ENV_AGENT_GIT_HOST = EnvironmentConfig("CLEARML_AGENT_GIT_HOST", "TRAINS_AGENT_GIT_HOST") ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig("CLEARML_AGENT_DISABLE_SSH_MOUNT", type=bool) +ENV_AGENT_DEBUG_GET_NEXT_TASK = EnvironmentConfig("CLEARML_AGENT_DEBUG_GET_NEXT_TASK", type=bool) ENV_SSH_AUTH_SOCK = EnvironmentConfig("SSH_AUTH_SOCK") ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig("CLEARML_AGENT_EXEC_USER", "TRAINS_AGENT_EXEC_USER") ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig("CLEARML_AGENT_EXTRA_PYTHON_PATH", "TRAINS_AGENT_EXTRA_PYTHON_PATH") diff --git a/clearml_agent/glue/definitions.py b/clearml_agent/glue/definitions.py index ee808e2..c28d422 100644 --- a/clearml_agent/glue/definitions.py +++ b/clearml_agent/glue/definitions.py @@ -12,3 +12,9 @@ ENV_POD_MONITOR_LOG_BATCH_SIZE = EnvEntry("K8S_GLUE_POD_MONITOR_LOG_BATCH_SIZE", ENV_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION = EnvEntry( "K8S_GLUE_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION", default=False, converter=bool ) + +ENV_POD_USE_IMAGE_ENTRYPOINT = EnvEntry("K8S_GLUE_POD_USE_IMAGE_ENTRYPOINT", default=False, converter=bool) +""" +Do not inject a cmd and args to the container's image when building the k8s template (depend on the built-in image +entrypoint) +""" \ No newline at end of file diff --git a/clearml_agent/glue/k8s.py b/clearml_agent/glue/k8s.py index 6620077..2d9c856 100644 --- a/clearml_agent/glue/k8s.py +++ b/clearml_agent/glue/k8s.py @@ -25,6 +25,7 @@ from clearml_agent.definitions import ( ENV_AGENT_GIT_USER, ENV_AGENT_GIT_PASS, ENV_FORCE_SYSTEM_SITE_PACKAGES, + ENV_AGENT_DEBUG_GET_NEXT_TASK, ) from clearml_agent.errors import APIError, UsageError from clearml_agent.glue.errors import GetPodCountError @@ -40,6 +41,7 @@ from clearml_agent.glue.definitions import ( ENV_START_AGENT_SCRIPT_PATH, ENV_DEFAULT_EXECUTION_AGENT_ARGS, ENV_POD_AGENT_INSTALL_ARGS, + ENV_POD_USE_IMAGE_ENTRYPOINT, ) @@ -692,6 +694,13 @@ class K8sIntegration(Worker): if not found_worker_id: container['env'] = env_vars + [{'name': 'CLEARML_WORKER_ID', 'value': task_worker_id}] + if ENV_POD_USE_IMAGE_ENTRYPOINT.get(): + # Don't add a cmd and args, just the image + return self._merge_containers( + container, dict(name=pod_name, image=docker_image) + ) + + # Create bash script for container and container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \ else self.container_bash_script @@ -771,7 +780,7 @@ class K8sIntegration(Worker): spec.setdefault('backoffLimit', 0) spec_template = spec.setdefault('template', {}) if labels: - # Place same labels fro any pod spawned by the job + # Place same labels for any pod spawned by the job place_labels(spec_template.setdefault('metadata', {})) spec = spec_template.setdefault('spec', {}) @@ -992,6 +1001,8 @@ class K8sIntegration(Worker): :param worker_params: Worker command line arguments :type worker_params: ``clearml_agent.helper.process.WorkerParams`` """ + # print("debug> running tasks loop") + events_service = self.get_service(Events) # make sure we have a k8s pending queue @@ -1023,12 +1034,14 @@ class K8sIntegration(Worker): continue # iterate over queues (priority style, queues[0] is highest) + # print("debug> iterating over queues") for queue in queues: # delete old completed / failed pods self._cleanup_old_pods(namespaces, extra_msg="Cleanup cycle {cmd}") # get next task in queue try: + # print(f"debug> getting tasks for queue {queue}") response = self._get_next_task(queue=queue, get_task_info=self._impersonate_as_task_owner) except Exception as e: print("Warning: Could not access task queue [{}], error: {}".format(queue, e))