Add CLEARML_AGENT_PROPAGATE_EXITCODE: set it to 1 to let `clearml-agent execute` return a nonzero exit code on failure. (Note: by default we keep the return code at 0. The exception is the k8s glue with non-restarting Pods, where users would want visibility into failing Tasks; do not use this unless you know what to expect from k8s.)

2025-06-26 18:16:15 +00:00 · 2022-03-24 22:04:25 +02:00 · 2022-03-24 22:04:25 +02:00 · 2432f5bb68
commit 2432f5bb68
parent 341086d86a
2 changed files with 9 additions and 4 deletions
--- a/clearml_agent/backend_api/session/defs.py
+++ b/clearml_agent/backend_api/session/defs.py
@ -16,6 +16,7 @@ ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type
 ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
 ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
 ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
+ENV_PROPAGATE_EXITCODE = EnvEntry("CLEARML_AGENT_PROPAGATE_EXITCODE", type=bool, default=False)
 ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
    'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
 )
--- a/clearml_agent/commands/worker.py
+++ b/clearml_agent/commands/worker.py
@ -41,7 +41,7 @@ from clearml_agent.backend_api.services import workers as workers_api
 from clearml_agent.backend_api.session import CallResult
 from clearml_agent.backend_api.session.defs import (
    ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
-    ENV_VENV_CONFIGURED, )
+    ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
 from clearml_agent.backend_config.defs import UptimeConf
 from clearml_agent.backend_config.utils import apply_environment, apply_files
 from clearml_agent.commands.base import resolve_names, ServiceCommandSection
@ -639,7 +639,7 @@ class Worker(ServiceCommandSection):
            pass

    def run_one_task(self, queue, task_id, worker_args, docker=None, task_session=None):
-        # type: (Text, Text, WorkerParams, Optional[Text]) -> ()
+        # type: (Text, Text, WorkerParams, Optional[Text]) -> int
        """
        Run one task pulled from queue.
        :param queue: ID of queue that task was pulled from
@ -647,6 +647,8 @@ class Worker(ServiceCommandSection):
        :param worker_args: Worker command line arguments
        :param task_session: The session for running operations on the passed task
        :param docker: Docker image in which the execution task will run
+
+        :return: exit code (0 is success)
        """
        # start new process and execute task id
        # "Running task '{}'".format(task_id)
@ -848,6 +850,8 @@ class Worker(ServiceCommandSection):
                    # unregister this worker, it was killed
                    self._unregister()

+        return status
+
    def get_task_session(self, user, company):
        """
        Get task session for the user by cloning the agent session
@ -2098,7 +2102,7 @@ class Worker(ServiceCommandSection):
            )
            try:
                self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id))
-                self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
+                status = self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
            finally:
                self.stop_monitor()
                self._unregister()
@ -2106,7 +2110,7 @@ class Worker(ServiceCommandSection):
                if full_monitoring and self.temp_config_path:
                    safe_remove_file(self._session.config_file)
                    Singleton.close_pid_file()
-            return
+            return status if ENV_PROPAGATE_EXITCODE.get() else 0

        self._apply_extra_configuration()