Improve stability and resilience on intermittent network connection

This commit is contained in:
allegroai
2019-08-19 21:17:53 +03:00
parent 0a8cf706bd
commit 3bc1ec2362
10 changed files with 92 additions and 51 deletions

View File

@@ -30,22 +30,22 @@ class TaskStopSignal(object):
def test(self):
# noinspection PyBroadException
try:
status = self.task.status
status = str(self.task.status)
message = self.task.data.status_message
if status == tasks.TaskStatusEnum.in_progress and "stopping" in message:
if status == str(tasks.TaskStatusEnum.in_progress) and "stopping" in message:
return TaskStopReason.stopped
_expected_statuses = (
tasks.TaskStatusEnum.created,
tasks.TaskStatusEnum.queued,
tasks.TaskStatusEnum.in_progress,
str(tasks.TaskStatusEnum.created),
str(tasks.TaskStatusEnum.queued),
str(tasks.TaskStatusEnum.in_progress),
)
if status not in _expected_statuses and "worker" not in message:
return TaskStopReason.status_changed
if status == tasks.TaskStatusEnum.created:
if status == str(tasks.TaskStatusEnum.created):
self._task_reset_state_counter += 1
if self._task_reset_state_counter >= self._number_of_consecutive_reset_tests:

View File

@@ -1,5 +1,3 @@
from socket import gethostname
import attr
from threading import Thread, Event
@@ -13,9 +11,9 @@ from ....backend_api.services import tasks
class DevWorker(object):
prefix = attr.ib(type=str, default="MANUAL:")
report_period = float(config.get('development.worker.report_period_sec', 30.))
report_period = float(max(config.get('development.worker.report_period_sec', 30.), 1.))
report_stdout = bool(config.get('development.worker.log_stdout', True))
ping_period = 30.
ping_period = float(max(config.get('development.worker.ping_period_sec', 30.), 1.))
def __init__(self):
self._dev_stop_signal = None
@@ -51,20 +49,23 @@ class DevWorker(object):
def _daemon(self):
last_ping = time()
while self._task is not None:
if self._exit_event.wait(min(self.ping_period, self.report_period)):
return
# send ping request
if self._support_ping and (time() - last_ping) >= self.ping_period:
self.ping()
last_ping = time()
if self._dev_stop_signal:
stop_reason = self._dev_stop_signal.test()
if stop_reason and self._task:
self._task._dev_mode_stop_task(stop_reason)
try:
if self._exit_event.wait(min(self.ping_period, self.report_period)):
return
# send ping request
if self._support_ping and (time() - last_ping) >= self.ping_period:
self.ping()
last_ping = time()
if self._dev_stop_signal:
stop_reason = self._dev_stop_signal.test()
if stop_reason and self._task:
self._task._dev_mode_stop_task(stop_reason)
except Exception:
pass
def unregister(self):
self._exit_event.set()
self._dev_stop_signal = None
self._thread = None
self._task = None
self._thread = None
self._exit_event.set()
return True