mirror of
https://github.com/clearml/clearml-agent
synced 2025-01-31 00:56:53 +00:00
Add K8S_GLUE_POD_USE_IMAGE_ENTRYPOINT to allow running images without overriding the entrypoint (useful for agents using prebuilt images in k8s)
This commit is contained in:
parent
2f0553b873
commit
edc333ba5f
@ -167,6 +167,7 @@ ENV_AGENT_GIT_USER = EnvironmentConfig("CLEARML_AGENT_GIT_USER", "TRAINS_AGENT_G
|
|||||||
ENV_AGENT_GIT_PASS = EnvironmentConfig("CLEARML_AGENT_GIT_PASS", "TRAINS_AGENT_GIT_PASS")
|
ENV_AGENT_GIT_PASS = EnvironmentConfig("CLEARML_AGENT_GIT_PASS", "TRAINS_AGENT_GIT_PASS")
|
||||||
ENV_AGENT_GIT_HOST = EnvironmentConfig("CLEARML_AGENT_GIT_HOST", "TRAINS_AGENT_GIT_HOST")
|
ENV_AGENT_GIT_HOST = EnvironmentConfig("CLEARML_AGENT_GIT_HOST", "TRAINS_AGENT_GIT_HOST")
|
||||||
ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig("CLEARML_AGENT_DISABLE_SSH_MOUNT", type=bool)
|
ENV_AGENT_DISABLE_SSH_MOUNT = EnvironmentConfig("CLEARML_AGENT_DISABLE_SSH_MOUNT", type=bool)
|
||||||
|
ENV_AGENT_DEBUG_GET_NEXT_TASK = EnvironmentConfig("CLEARML_AGENT_DEBUG_GET_NEXT_TASK", type=bool)
|
||||||
ENV_SSH_AUTH_SOCK = EnvironmentConfig("SSH_AUTH_SOCK")
|
ENV_SSH_AUTH_SOCK = EnvironmentConfig("SSH_AUTH_SOCK")
|
||||||
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig("CLEARML_AGENT_EXEC_USER", "TRAINS_AGENT_EXEC_USER")
|
ENV_TASK_EXECUTE_AS_USER = EnvironmentConfig("CLEARML_AGENT_EXEC_USER", "TRAINS_AGENT_EXEC_USER")
|
||||||
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig("CLEARML_AGENT_EXTRA_PYTHON_PATH", "TRAINS_AGENT_EXTRA_PYTHON_PATH")
|
ENV_TASK_EXTRA_PYTHON_PATH = EnvironmentConfig("CLEARML_AGENT_EXTRA_PYTHON_PATH", "TRAINS_AGENT_EXTRA_PYTHON_PATH")
|
||||||
|
@ -12,3 +12,9 @@ ENV_POD_MONITOR_LOG_BATCH_SIZE = EnvEntry("K8S_GLUE_POD_MONITOR_LOG_BATCH_SIZE",
|
|||||||
ENV_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION = EnvEntry(
|
ENV_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION = EnvEntry(
|
||||||
"K8S_GLUE_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION", default=False, converter=bool
|
"K8S_GLUE_POD_MONITOR_DISABLE_ENQUEUE_ON_PREEMPTION", default=False, converter=bool
|
||||||
)
|
)
|
||||||
|
|
||||||
|
ENV_POD_USE_IMAGE_ENTRYPOINT = EnvEntry("K8S_GLUE_POD_USE_IMAGE_ENTRYPOINT", default=False, converter=bool)
|
||||||
|
"""
|
||||||
|
Do not inject a cmd and args into the container when building the k8s template (rely on the image's built-in
|
||||||
|
entrypoint)
|
||||||
|
"""
|
@ -25,6 +25,7 @@ from clearml_agent.definitions import (
|
|||||||
ENV_AGENT_GIT_USER,
|
ENV_AGENT_GIT_USER,
|
||||||
ENV_AGENT_GIT_PASS,
|
ENV_AGENT_GIT_PASS,
|
||||||
ENV_FORCE_SYSTEM_SITE_PACKAGES,
|
ENV_FORCE_SYSTEM_SITE_PACKAGES,
|
||||||
|
ENV_AGENT_DEBUG_GET_NEXT_TASK,
|
||||||
)
|
)
|
||||||
from clearml_agent.errors import APIError, UsageError
|
from clearml_agent.errors import APIError, UsageError
|
||||||
from clearml_agent.glue.errors import GetPodCountError
|
from clearml_agent.glue.errors import GetPodCountError
|
||||||
@ -40,6 +41,7 @@ from clearml_agent.glue.definitions import (
|
|||||||
ENV_START_AGENT_SCRIPT_PATH,
|
ENV_START_AGENT_SCRIPT_PATH,
|
||||||
ENV_DEFAULT_EXECUTION_AGENT_ARGS,
|
ENV_DEFAULT_EXECUTION_AGENT_ARGS,
|
||||||
ENV_POD_AGENT_INSTALL_ARGS,
|
ENV_POD_AGENT_INSTALL_ARGS,
|
||||||
|
ENV_POD_USE_IMAGE_ENTRYPOINT,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -692,6 +694,13 @@ class K8sIntegration(Worker):
|
|||||||
if not found_worker_id:
|
if not found_worker_id:
|
||||||
container['env'] = env_vars + [{'name': 'CLEARML_WORKER_ID', 'value': task_worker_id}]
|
container['env'] = env_vars + [{'name': 'CLEARML_WORKER_ID', 'value': task_worker_id}]
|
||||||
|
|
||||||
|
if ENV_POD_USE_IMAGE_ENTRYPOINT.get():
|
||||||
|
# Don't add a cmd and args, just the image
|
||||||
|
return self._merge_containers(
|
||||||
|
container, dict(name=pod_name, image=docker_image)
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create the bash script for the container
|
||||||
container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \
|
container_bash_script = [self.container_bash_script] if isinstance(self.container_bash_script, str) \
|
||||||
else self.container_bash_script
|
else self.container_bash_script
|
||||||
|
|
||||||
@ -771,7 +780,7 @@ class K8sIntegration(Worker):
|
|||||||
spec.setdefault('backoffLimit', 0)
|
spec.setdefault('backoffLimit', 0)
|
||||||
spec_template = spec.setdefault('template', {})
|
spec_template = spec.setdefault('template', {})
|
||||||
if labels:
|
if labels:
|
||||||
# Place same labels fro any pod spawned by the job
|
# Place same labels for any pod spawned by the job
|
||||||
place_labels(spec_template.setdefault('metadata', {}))
|
place_labels(spec_template.setdefault('metadata', {}))
|
||||||
|
|
||||||
spec = spec_template.setdefault('spec', {})
|
spec = spec_template.setdefault('spec', {})
|
||||||
@ -992,6 +1001,8 @@ class K8sIntegration(Worker):
|
|||||||
:param worker_params: Worker command line arguments
|
:param worker_params: Worker command line arguments
|
||||||
:type worker_params: ``clearml_agent.helper.process.WorkerParams``
|
:type worker_params: ``clearml_agent.helper.process.WorkerParams``
|
||||||
"""
|
"""
|
||||||
|
# print("debug> running tasks loop")
|
||||||
|
|
||||||
events_service = self.get_service(Events)
|
events_service = self.get_service(Events)
|
||||||
|
|
||||||
# make sure we have a k8s pending queue
|
# make sure we have a k8s pending queue
|
||||||
@ -1023,12 +1034,14 @@ class K8sIntegration(Worker):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
# iterate over queues (priority style, queues[0] is highest)
|
# iterate over queues (priority style, queues[0] is highest)
|
||||||
|
# print("debug> iterating over queues")
|
||||||
for queue in queues:
|
for queue in queues:
|
||||||
# delete old completed / failed pods
|
# delete old completed / failed pods
|
||||||
self._cleanup_old_pods(namespaces, extra_msg="Cleanup cycle {cmd}")
|
self._cleanup_old_pods(namespaces, extra_msg="Cleanup cycle {cmd}")
|
||||||
|
|
||||||
# get next task in queue
|
# get next task in queue
|
||||||
try:
|
try:
|
||||||
|
# print(f"debug> getting tasks for queue {queue}")
|
||||||
response = self._get_next_task(queue=queue, get_task_info=self._impersonate_as_task_owner)
|
response = self._get_next_task(queue=queue, get_task_info=self._impersonate_as_task_owner)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Warning: Could not access task queue [{}], error: {}".format(queue, e))
|
print("Warning: Could not access task queue [{}], error: {}".format(queue, e))
|
||||||
|
Loading…
Reference in New Issue
Block a user