mirror of
https://github.com/clearml/clearml-agent
synced 2025-01-31 09:06:52 +00:00
Add CLEARML_AGENT_PROPAGATE_EXITCODE
Set to 1 to let clearml-agent execute return a nonzero exit code on failure. (Notice that by default we keep the return code 0; the exception is the k8s glue with non-restarting Pods, where users would want to get visibility into failing Tasks. Do not use unless you know what to expect from k8s.)
This commit is contained in:
parent
341086d86a
commit
2432f5bb68
@ -16,6 +16,7 @@ ENV_DISABLE_VAULT_SUPPORT = EnvEntry('CLEARML_AGENT_DISABLE_VAULT_SUPPORT', type
|
|||||||
ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
|
ENV_ENABLE_ENV_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_ENV_CONFIG_SECTION', type=bool)
|
||||||
ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
|
ENV_ENABLE_FILES_CONFIG_SECTION = EnvEntry('CLEARML_AGENT_ENABLE_FILES_CONFIG_SECTION', type=bool)
|
||||||
ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
|
ENV_VENV_CONFIGURED = EnvEntry('VIRTUAL_ENV', type=str)
|
||||||
|
ENV_PROPAGATE_EXITCODE = EnvEntry("CLEARML_AGENT_PROPAGATE_EXITCODE", type=bool, default=False)
|
||||||
ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
|
ENV_INITIAL_CONNECT_RETRY_OVERRIDE = EnvEntry(
|
||||||
'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
|
'CLEARML_AGENT_INITIAL_CONNECT_RETRY_OVERRIDE', default=True, converter=safe_text_to_bool
|
||||||
)
|
)
|
||||||
|
@ -41,7 +41,7 @@ from clearml_agent.backend_api.services import workers as workers_api
|
|||||||
from clearml_agent.backend_api.session import CallResult
|
from clearml_agent.backend_api.session import CallResult
|
||||||
from clearml_agent.backend_api.session.defs import (
|
from clearml_agent.backend_api.session.defs import (
|
||||||
ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
|
ENV_ENABLE_ENV_CONFIG_SECTION, ENV_ENABLE_FILES_CONFIG_SECTION,
|
||||||
ENV_VENV_CONFIGURED, )
|
ENV_VENV_CONFIGURED, ENV_PROPAGATE_EXITCODE, )
|
||||||
from clearml_agent.backend_config.defs import UptimeConf
|
from clearml_agent.backend_config.defs import UptimeConf
|
||||||
from clearml_agent.backend_config.utils import apply_environment, apply_files
|
from clearml_agent.backend_config.utils import apply_environment, apply_files
|
||||||
from clearml_agent.commands.base import resolve_names, ServiceCommandSection
|
from clearml_agent.commands.base import resolve_names, ServiceCommandSection
|
||||||
@ -639,7 +639,7 @@ class Worker(ServiceCommandSection):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
def run_one_task(self, queue, task_id, worker_args, docker=None, task_session=None):
|
def run_one_task(self, queue, task_id, worker_args, docker=None, task_session=None):
|
||||||
# type: (Text, Text, WorkerParams, Optional[Text]) -> ()
|
# type: (Text, Text, WorkerParams, Optional[Text]) -> int
|
||||||
"""
|
"""
|
||||||
Run one task pulled from queue.
|
Run one task pulled from queue.
|
||||||
:param queue: ID of queue that task was pulled from
|
:param queue: ID of queue that task was pulled from
|
||||||
@ -647,6 +647,8 @@ class Worker(ServiceCommandSection):
|
|||||||
:param worker_args: Worker command line arguments
|
:param worker_args: Worker command line arguments
|
||||||
:param task_session: The session for running operations on the passed task
|
:param task_session: The session for running operations on the passed task
|
||||||
:param docker: Docker image in which the execution task will run
|
:param docker: Docker image in which the execution task will run
|
||||||
|
|
||||||
|
:return: exit code (0 is success)
|
||||||
"""
|
"""
|
||||||
# start new process and execute task id
|
# start new process and execute task id
|
||||||
# "Running task '{}'".format(task_id)
|
# "Running task '{}'".format(task_id)
|
||||||
@ -848,6 +850,8 @@ class Worker(ServiceCommandSection):
|
|||||||
# unregister this worker, it was killed
|
# unregister this worker, it was killed
|
||||||
self._unregister()
|
self._unregister()
|
||||||
|
|
||||||
|
return status
|
||||||
|
|
||||||
def get_task_session(self, user, company):
|
def get_task_session(self, user, company):
|
||||||
"""
|
"""
|
||||||
Get task session for the user by cloning the agent session
|
Get task session for the user by cloning the agent session
|
||||||
@ -2098,7 +2102,7 @@ class Worker(ServiceCommandSection):
|
|||||||
)
|
)
|
||||||
try:
|
try:
|
||||||
self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id))
|
self.report_monitor(ResourceMonitor.StatusReport(task=current_task.id))
|
||||||
self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
|
status = self.run_one_task(queue='', task_id=current_task.id, worker_args=worker_params, docker=docker)
|
||||||
finally:
|
finally:
|
||||||
self.stop_monitor()
|
self.stop_monitor()
|
||||||
self._unregister()
|
self._unregister()
|
||||||
@ -2106,7 +2110,7 @@ class Worker(ServiceCommandSection):
|
|||||||
if full_monitoring and self.temp_config_path:
|
if full_monitoring and self.temp_config_path:
|
||||||
safe_remove_file(self._session.config_file)
|
safe_remove_file(self._session.config_file)
|
||||||
Singleton.close_pid_file()
|
Singleton.close_pid_file()
|
||||||
return
|
return status if ENV_PROPAGATE_EXITCODE.get() else 0
|
||||||
|
|
||||||
self._apply_extra_configuration()
|
self._apply_extra_configuration()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user