mirror of
https://github.com/clearml/clearml
synced 2025-06-26 18:16:07 +00:00
Improve stability and resilience on intermittent network connections
This commit is contained in:
@@ -30,22 +30,22 @@ class TaskStopSignal(object):
|
||||
def test(self):
|
||||
# noinspection PyBroadException
|
||||
try:
|
||||
status = self.task.status
|
||||
status = str(self.task.status)
|
||||
message = self.task.data.status_message
|
||||
|
||||
if status == tasks.TaskStatusEnum.in_progress and "stopping" in message:
|
||||
if status == str(tasks.TaskStatusEnum.in_progress) and "stopping" in message:
|
||||
return TaskStopReason.stopped
|
||||
|
||||
_expected_statuses = (
|
||||
tasks.TaskStatusEnum.created,
|
||||
tasks.TaskStatusEnum.queued,
|
||||
tasks.TaskStatusEnum.in_progress,
|
||||
str(tasks.TaskStatusEnum.created),
|
||||
str(tasks.TaskStatusEnum.queued),
|
||||
str(tasks.TaskStatusEnum.in_progress),
|
||||
)
|
||||
|
||||
if status not in _expected_statuses and "worker" not in message:
|
||||
return TaskStopReason.status_changed
|
||||
|
||||
if status == tasks.TaskStatusEnum.created:
|
||||
if status == str(tasks.TaskStatusEnum.created):
|
||||
self._task_reset_state_counter += 1
|
||||
|
||||
if self._task_reset_state_counter >= self._number_of_consecutive_reset_tests:
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
from socket import gethostname
|
||||
|
||||
import attr
|
||||
from threading import Thread, Event
|
||||
|
||||
@@ -13,9 +11,9 @@ from ....backend_api.services import tasks
|
||||
class DevWorker(object):
|
||||
prefix = attr.ib(type=str, default="MANUAL:")
|
||||
|
||||
report_period = float(config.get('development.worker.report_period_sec', 30.))
|
||||
report_period = float(max(config.get('development.worker.report_period_sec', 30.), 1.))
|
||||
report_stdout = bool(config.get('development.worker.log_stdout', True))
|
||||
ping_period = 30.
|
||||
ping_period = float(max(config.get('development.worker.ping_period_sec', 30.), 1.))
|
||||
|
||||
def __init__(self):
|
||||
self._dev_stop_signal = None
|
||||
@@ -51,20 +49,23 @@ class DevWorker(object):
|
||||
def _daemon(self):
|
||||
last_ping = time()
|
||||
while self._task is not None:
|
||||
if self._exit_event.wait(min(self.ping_period, self.report_period)):
|
||||
return
|
||||
# send ping request
|
||||
if self._support_ping and (time() - last_ping) >= self.ping_period:
|
||||
self.ping()
|
||||
last_ping = time()
|
||||
if self._dev_stop_signal:
|
||||
stop_reason = self._dev_stop_signal.test()
|
||||
if stop_reason and self._task:
|
||||
self._task._dev_mode_stop_task(stop_reason)
|
||||
try:
|
||||
if self._exit_event.wait(min(self.ping_period, self.report_period)):
|
||||
return
|
||||
# send ping request
|
||||
if self._support_ping and (time() - last_ping) >= self.ping_period:
|
||||
self.ping()
|
||||
last_ping = time()
|
||||
if self._dev_stop_signal:
|
||||
stop_reason = self._dev_stop_signal.test()
|
||||
if stop_reason and self._task:
|
||||
self._task._dev_mode_stop_task(stop_reason)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def unregister(self):
|
||||
self._exit_event.set()
|
||||
self._dev_stop_signal = None
|
||||
self._thread = None
|
||||
self._task = None
|
||||
self._thread = None
|
||||
self._exit_event.set()
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user