Add CLEARML_AGENT_PROPAGATE_EXITCODE, set to 1 to let clearml-agent execute retrun a nonzero exit code on failure (notice by default we keep the retrun code 0, the exception is the k8s glue with non-restarting Pods, where users would want to get visibility into failing Tasks, do not use unless you know what to expect from k8s)

This commit is contained in:
allegroai 2022-03-24 22:04:25 +02:00
parent 341086d86a
commit 2432f5bb68
2 changed files with 9 additions and 4 deletions

View File

@ -16,6 +16,7 @@ ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type
ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
ENV_PROPAGATE_EXITCODE = EnvEntry("CLEARML_AGENT_PROPAGATE_EXITCODE", type=bool, default=False)
ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
)

View File

@ -41,7 +41,7 @@ from clearml_agent.backend_api.services import workers as workers_api
from clearml_agent.backend_api.session import CallResult
from clearml_agent.backend_api.session.defs import (
ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
ENV_VENV_CONFIGURED, )
ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
from clearml_agent.backend_config.defs import UptimeConf
from clearml_agent.backend_config.utils import apply_environment, apply_files
from clearml_agent.commands.base import resolve_names, ServiceCommandSection
@ -639,7 +639,7 @@ class Worker(ServiceCommandSection):
pass
def run_one_task(self, queue, task_id, worker_args, docker=None, task_session=None):
# type: (Text, Text, WorkerParams, Optional[Text]) -> ()
# type: (Text, Text, WorkerParams, Optional[Text]) -> int
"""
Run one task pulled from queue.
:param queue: ID of queue that task was pulled from
@ -647,6 +647,8 @@ class Worker(ServiceCommandSection):
:param worker_args: Worker command line arguments
:param task_session: The session for running operations on the passed task
:param docker: Docker image in which the execution task will run
:return: exit code (0 is success)
"""
# start new process and execute task id
# "Running task '{}'".format(task_id)
@ -848,6 +850,8 @@ class Worker(ServiceCommandSection):
# unregister this worker, it was killed
self._unregister()
return status
def get_task_session(self, user, company):
"""
Get task session for the user by cloning the agent session
@ -2098,7 +2102,7 @@ class Worker(ServiceCommandSection):
)
try:
self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id))
self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
status = self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
finally:
self.stop_monitor()
self._unregister()
@ -2106,7 +2110,7 @@ class Worker(ServiceCommandSection):
if full_monitoring and self.temp_config_path:
safe_remove_file(self._session.config_file)
Singleton.close_pid_file()
return
return status if ENV_PROPAGATE_EXITCODE.get() else 0
self._apply_extra_configuration()