Fix Hydra tasks never fail and are only set to completed (fix handling return code)

This commit is contained in:
allegroai 2022-04-15 19:24:37 +03:00
parent 11242d4029
commit 81de18dbce
2 changed files with 29 additions and 4 deletions

View File

@ -90,6 +90,17 @@ class PatchHydra(object):
@staticmethod @staticmethod
def _patched_run_job(config, task_function, *args, **kwargs): def _patched_run_job(config, task_function, *args, **kwargs):
# noinspection PyBroadException
try:
from hydra.core.utils import JobStatus
failed_status = JobStatus.FAILED
except Exception:
LoggerRoot.get_base_logger(PatchHydra).warning(
"Could not import JobStatus from Hydra. Failed tasks will be marked as completed"
)
failed_status = None
# store the config # store the config
# noinspection PyBroadException # noinspection PyBroadException
try: try:
@ -121,10 +132,23 @@ class PatchHydra(object):
kwargs["config"] = config kwargs["config"] = config
kwargs["task_function"] = partial(PatchHydra._patched_task_function, task_function,) kwargs["task_function"] = partial(PatchHydra._patched_task_function, task_function,)
result = PatchHydra._original_run_job(*args, **kwargs) result = PatchHydra._original_run_job(*args, **kwargs)
# noinspection PyBroadException
try:
result_status = result.status
except Exception:
LoggerRoot.get_base_logger(PatchHydra).warning(
"Could not get Hydra job status. Failed tasks will be marked as completed"
)
result_status = None
# if we have Task.init called inside the App, we close it after the app is done. # if we have Task.init called inside the App, we close it after the app is done.
# This will make sure that hydra run will create multiple Tasks # This will make sure that hydra run will create multiple Tasks
if not running_remotely() and not pre_app_task_init_call and PatchHydra._current_task: if (
not running_remotely()
and not pre_app_task_init_call
and PatchHydra._current_task
and (failed_status is None or result_status is None or result_status != failed_status)
):
PatchHydra._current_task.close() PatchHydra._current_task.close()
# make sure we do not reuse the Task if we have a multi-run session # make sure we do not reuse the Task if we have a multi-run session
DEV_TASK_NO_REUSE.set(True) DEV_TASK_NO_REUSE.set(True)

View File

@ -3245,11 +3245,11 @@ class Task(_Task):
is_sub_process = self.__is_subprocess() is_sub_process = self.__is_subprocess()
# noinspection PyBroadException # noinspection PyBroadException
task_status = None
try: try:
wait_for_uploads = True wait_for_uploads = True
# first thing mark task as stopped, so we will not end up with "running" on lost tasks # first thing mark task as stopped, so we will not end up with "running" on lost tasks
# if we are running remotely, the daemon will take care of it # if we are running remotely, the daemon will take care of it
task_status = None
wait_for_std_log = True wait_for_std_log = True
if (not running_remotely() or DEBUG_SIMULATE_REMOTE_TASK.get()) \ if (not running_remotely() or DEBUG_SIMULATE_REMOTE_TASK.get()) \
and self.is_main_task() and not is_sub_process: and self.is_main_task() and not is_sub_process:
@ -3273,7 +3273,7 @@ class Task(_Task):
if (is_exception and not isinstance(is_exception, KeyboardInterrupt) if (is_exception and not isinstance(is_exception, KeyboardInterrupt)
and is_exception != KeyboardInterrupt) \ and is_exception != KeyboardInterrupt) \
or (not self.__exit_hook.remote_user_aborted and or (not self.__exit_hook.remote_user_aborted and
self.__exit_hook.signal not in (None, 2, 15)): (self.__exit_hook.signal not in (None, 2, 15) or self.__exit_hook.exit_code)):
task_status = ( task_status = (
'failed', 'failed',
'Exception {}'.format(is_exception) if is_exception else 'Exception {}'.format(is_exception) if is_exception else
@ -3391,7 +3391,8 @@ class Task(_Task):
pass pass
self._edit_lock = None self._edit_lock = None
self.set_progress(100) if task_status and task_status[0] == "completed":
self.set_progress(100)
# make sure no one will re-enter the shutdown method # make sure no one will re-enter the shutdown method
self._at_exit_called = True self._at_exit_called = True