From 81de18dbce08229834d9bb0676446a151046e6a7 Mon Sep 17 00:00:00 2001 From: allegroai <> Date: Fri, 15 Apr 2022 19:24:37 +0300 Subject: [PATCH] Fix Hydra tasks never fail and are only set to completed (fix handling return code) --- clearml/binding/hydra_bind.py | 26 +++++++++++++++++++++++++- clearml/task.py | 7 ++++--- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/clearml/binding/hydra_bind.py b/clearml/binding/hydra_bind.py index 8f3fa65b..f3b4ecfb 100644 --- a/clearml/binding/hydra_bind.py +++ b/clearml/binding/hydra_bind.py @@ -90,6 +90,17 @@ class PatchHydra(object): @staticmethod def _patched_run_job(config, task_function, *args, **kwargs): + # noinspection PyBroadException + try: + from hydra.core.utils import JobStatus + + failed_status = JobStatus.FAILED + except Exception: + LoggerRoot.get_base_logger(PatchHydra).warning( + "Could not import JobStatus from Hydra. Failed tasks will be marked as completed" + ) + failed_status = None + # store the config # noinspection PyBroadException try: @@ -121,10 +132,23 @@ class PatchHydra(object): kwargs["config"] = config kwargs["task_function"] = partial(PatchHydra._patched_task_function, task_function,) result = PatchHydra._original_run_job(*args, **kwargs) + # noinspection PyBroadException + try: + result_status = result.status + except Exception: + LoggerRoot.get_base_logger(PatchHydra).warning( + "Could not get Hydra job status. Failed tasks will be marked as completed" + ) + result_status = None # if we have Task.init called inside the App, we close it after the app is done. # This will make sure that hydra run will create multiple Tasks - if not running_remotely() and not pre_app_task_init_call and PatchHydra._current_task: + if ( + not running_remotely() + and not pre_app_task_init_call + and PatchHydra._current_task + and (failed_status is None or result_status is None or result_status != failed_status) + ): PatchHydra._current_task.close() # make sure we do not reuse the Task if we have a multi-run session DEV_TASK_NO_REUSE.set(True) diff --git a/clearml/task.py b/clearml/task.py index 7046aab4..3f7b865c 100644 --- a/clearml/task.py +++ b/clearml/task.py @@ -3245,11 +3245,11 @@ class Task(_Task): is_sub_process = self.__is_subprocess() # noinspection PyBroadException + task_status = None try: wait_for_uploads = True # first thing mark task as stopped, so we will not end up with "running" on lost tasks # if we are running remotely, the daemon will take care of it - task_status = None wait_for_std_log = True if (not running_remotely() or DEBUG_SIMULATE_REMOTE_TASK.get()) \ and self.is_main_task() and not is_sub_process: @@ -3273,7 +3273,7 @@ class Task(_Task): if (is_exception and not isinstance(is_exception, KeyboardInterrupt) and is_exception != KeyboardInterrupt) \ or (not self.__exit_hook.remote_user_aborted and - self.__exit_hook.signal not in (None, 2, 15)): + (self.__exit_hook.signal not in (None, 2, 15) or self.__exit_hook.exit_code)): task_status = ( 'failed', 'Exception {}'.format(is_exception) if is_exception else @@ -3391,7 +3391,8 @@ class Task(_Task): pass self._edit_lock = None - self.set_progress(100) + if task_status and task_status[0] == "completed": + self.set_progress(100) # make sure no one will re-enter the shutdown method self._at_exit_called = True