From 9eee213683252cd0bd19aae3f9b2c65939d75ac3 Mon Sep 17 00:00:00 2001 From: Niels ten Boom Date: Sun, 6 Nov 2022 16:15:39 +0100 Subject: [PATCH] Add option to crash agent on exception using `agent.crash_on_exception` configuration setting (#123) --- clearml_agent/backend_api/config/default/agent.conf | 5 +++++ clearml_agent/commands/worker.py | 6 +++++- clearml_agent/definitions.py | 1 + 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/clearml_agent/backend_api/config/default/agent.conf b/clearml_agent/backend_api/config/default/agent.conf index 9941ee2..7f5bcd7 100644 --- a/clearml_agent/backend_api/config/default/agent.conf +++ b/clearml_agent/backend_api/config/default/agent.conf @@ -327,4 +327,9 @@ # into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the # standard flow. custom_build_script: "" + + # Crash on exception: by default when encountering an exception while running a task, + # the agent will catch the exception, log it and continue running. + # Set this to `true` to propagate exceptions and crash the agent. 
+ # crash_on_exception: true } diff --git a/clearml_agent/commands/worker.py b/clearml_agent/commands/worker.py index 24b1450..3067da3 100644 --- a/clearml_agent/commands/worker.py +++ b/clearml_agent/commands/worker.py @@ -1548,10 +1548,14 @@ class Worker(ServiceCommandSection): gpu_indexes=gpu_indexes, gpu_queues=dynamic_gpus, ) - except Exception: + except Exception as e: tb = six.text_type(traceback.format_exc()) print("FATAL ERROR:") print(tb) + + if self._session.config.get("agent.crash_on_exception", False): + raise e + crash_file, name = safe_mkstemp(prefix=".clearml_agent-crash", suffix=".log") try: with crash_file: diff --git a/clearml_agent/definitions.py b/clearml_agent/definitions.py index 8453500..1c4dbd2 100644 --- a/clearml_agent/definitions.py +++ b/clearml_agent/definitions.py @@ -87,6 +87,7 @@ ENVIRONMENT_CONFIG = { "agent.cpu_only": EnvironmentConfig( names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool ), + "agent.crash_on_exception": EnvironmentConfig("CLEARML_AGENT_CRASH_ON_EXCEPTION", type=bool), "sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"), "sdk.aws.s3.secret": ENV_AWS_SECRET_KEY, "sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),