Add option to crash agent on exception using agent.crash_on_exception configuration setting (#123)

This commit is contained in:
Niels ten Boom 2022-11-06 16:15:39 +01:00 committed by GitHub
parent e4861fc0fb
commit 9eee213683
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 11 additions and 1 deletions

View File

@ -327,4 +327,9 @@
# into the file specified in CLEARML_CUSTOM_BUILD_OUTPUT, the agent will emit a warning and continue with the
# standard flow.
custom_build_script: ""
# Crash on exception: by default when encountering an exception while running a task,
# the agent will catch the exception, log it and continue running.
# Set this to `true` to propagate exceptions and crash the agent.
# crash_on_exception: true
}

View File

@ -1548,10 +1548,14 @@ class Worker(ServiceCommandSection):
gpu_indexes=gpu_indexes,
gpu_queues=dynamic_gpus,
)
except Exception:
except Exception as e:
tb = six.text_type(traceback.format_exc())
print("FATAL ERROR:")
print(tb)
if self._session.config.get("agent.crash_on_exception", False):
raise e
crash_file, name = safe_mkstemp(prefix=".clearml_agent-crash", suffix=".log")
try:
with crash_file:

View File

@ -87,6 +87,7 @@ ENVIRONMENT_CONFIG = {
"agent.cpu_only": EnvironmentConfig(
names=("CLEARML_CPU_ONLY", "TRAINS_CPU_ONLY", "CPU_ONLY"), type=bool
),
"agent.crash_on_exception": EnvironmentConfig("CLEAMRL_AGENT_CRASH_ON_EXCEPTION", type=bool),
"sdk.aws.s3.key": EnvironmentConfig("AWS_ACCESS_KEY_ID"),
"sdk.aws.s3.secret": ENV_AWS_SECRET_KEY,
"sdk.aws.s3.region": EnvironmentConfig("AWS_DEFAULT_REGION"),