Extract non-responsive tasks watchdog from main tasks logic

This commit is contained in:
allegroai 2020-06-01 11:31:36 +03:00
parent a1dcdffa53
commit ede5586ccc
5 changed files with 95 additions and 58 deletions
server

View File

@ -0,0 +1,89 @@
from datetime import timedelta, datetime
from time import sleep
from apierrors import errors
from bll.task import ChangeStatusRequest
from config import config
from database.model.task.task import TaskStatus, Task
from utilities.threads_manager import ThreadsManager
log = config.logger(__file__)
class NonResponsiveTasksWatchdog:
threads = ThreadsManager()
class _Settings:
"""
Retrieves watchdog settings from the config file
The properties are not cached so that the updates in
the config file are reflected
"""
_prefix = "services.tasks.non_responsive_tasks_watchdog"
@property
def enabled(self):
return config.get(f"{self._prefix}.enabled", True)
@property
def watch_interval_sec(self):
return config.get(f"{self._prefix}.watch_interval_sec", 900)
@property
def threshold_sec(self):
return config.get(f"{self._prefix}.threshold_sec", 7200)
settings = _Settings()
@classmethod
@threads.register("non_responsive_tasks_watchdog", daemon=True)
def start(cls):
sleep(cls.settings.watch_interval_sec)
while not ThreadsManager.terminating:
watch_interval = cls.settings.watch_interval_sec
if cls.settings.enabled:
try:
stopped = cls.cleanup_tasks(
threshold_sec=cls.settings.threshold_sec
)
log.info(f"{stopped} non-responsive tasks stopped")
except Exception as ex:
log.exception(f"Failed stopping tasks: {str(ex)}")
sleep(watch_interval)
@classmethod
def cleanup_tasks(cls, threshold_sec):
relevant_status = (TaskStatus.in_progress,)
threshold = timedelta(seconds=threshold_sec)
ref_time = datetime.utcnow() - threshold
log.info(
f"Starting cleanup cycle for running tasks last updated before {ref_time}"
)
tasks = list(
Task.objects(status__in=relevant_status, last_update__lt=ref_time).only(
"id", "name", "status", "project", "last_update"
)
)
log.info(f"{len(tasks)} non-responsive tasks found")
if not tasks:
return 0
err_count = 0
for task in tasks:
log.info(
f"Stopping {task.id} ({task.name}), last updated at {task.last_update}"
)
try:
ChangeStatusRequest(
task=task,
new_status=TaskStatus.stopped,
status_reason="Forced stop (non-responsive)",
status_message="Forced stop (non-responsive)",
force=True,
).execute()
except errors.bad_request.FailedChangingTaskStatus:
err_count += 1
return len(tasks) - err_count

View File

@ -1,5 +1,5 @@
from collections import OrderedDict from collections import OrderedDict
from datetime import datetime, timedelta from datetime import datetime
from operator import attrgetter from operator import attrgetter
from random import random from random import random
from time import sleep from time import sleep
@ -32,15 +32,12 @@ from database.utils import get_company_or_none_constraint, id as create_id
from service_repo import APICall from service_repo import APICall
from timing_context import TimingContext from timing_context import TimingContext
from utilities.dicts import deep_merge from utilities.dicts import deep_merge
from utilities.threads_manager import ThreadsManager
from .utils import ChangeStatusRequest, validate_status_change, ParameterKeyEscaper from .utils import ChangeStatusRequest, validate_status_change, ParameterKeyEscaper
log = config.logger(__file__) log = config.logger(__file__)
class TaskBLL(object): class TaskBLL(object):
threads = ThreadsManager("TaskBLL")
def __init__(self, events_es=None): def __init__(self, events_es=None):
self.events_es = ( self.events_es = (
events_es if events_es is not None else es_factory.connect("events") events_es if events_es is not None else es_factory.connect("events")
@ -575,58 +572,6 @@ class TaskBLL(object):
return [a.key for a in added], [a.key for a in updated] return [a.key for a in added], [a.key for a in updated]
@classmethod
@threads.register("non_responsive_tasks_watchdog", daemon=True)
def start_non_responsive_tasks_watchdog(cls):
log = config.logger("non_responsive_tasks_watchdog")
relevant_status = (TaskStatus.in_progress,)
threshold = timedelta(
seconds=config.get(
"services.tasks.non_responsive_tasks_watchdog.threshold_sec", 7200
)
)
watch_interval = config.get(
"services.tasks.non_responsive_tasks_watchdog.watch_interval_sec", 900
)
sleep(watch_interval)
while not ThreadsManager.terminating:
try:
ref_time = datetime.utcnow() - threshold
log.info(
f"Starting cleanup cycle for running tasks last updated before {ref_time}"
)
tasks = list(
Task.objects(
status__in=relevant_status, last_update__lt=ref_time
).only("id", "name", "status", "project", "last_update")
)
if tasks:
log.info(f"Stopping {len(tasks)} non-responsive tasks")
for task in tasks:
log.info(
f"Stopping {task.id} ({task.name}), last updated at {task.last_update}"
)
ChangeStatusRequest(
task=task,
new_status=TaskStatus.stopped,
status_reason="Forced stop (non-responsive)",
status_message="Forced stop (non-responsive)",
force=True,
).execute()
log.info(f"Done")
except Exception as ex:
log.exception(f"Failed stopping tasks: {str(ex)}")
sleep(watch_interval)
@staticmethod @staticmethod
def get_aggregated_project_execution_parameters( def get_aggregated_project_execution_parameters(
company_id, company_id,

View File

@ -1,4 +1,6 @@
non_responsive_tasks_watchdog { non_responsive_tasks_watchdog {
enabled: true
# In-progress tasks older than this value in seconds will be stopped by the watchdog # In-progress tasks older than this value in seconds will be stopped by the watchdog
threshold_sec: 7200 threshold_sec: 7200

View File

@ -39,6 +39,7 @@ from bll.task import (
split_by, split_by,
ParameterKeyEscaper, ParameterKeyEscaper,
) )
from bll.task.non_responsive_tasks_watchdog import NonResponsiveTasksWatchdog
from bll.util import SetFieldsResolver from bll.util import SetFieldsResolver
from database.errors import translate_errors_context from database.errors import translate_errors_context
from database.model.model import Model from database.model.model import Model
@ -70,7 +71,7 @@ event_bll = EventBLL()
queue_bll = QueueBLL() queue_bll = QueueBLL()
TaskBLL.start_non_responsive_tasks_watchdog() NonResponsiveTasksWatchdog.start()
def set_task_status_from_call( def set_task_status_from_call(

View File

@ -10,7 +10,7 @@ class ThreadsManager:
def __init__(self, name=None, **threads): def __init__(self, name=None, **threads):
super(ThreadsManager, self).__init__() super(ThreadsManager, self).__init__()
self.name = name or self.__class__.name self.name = name or self.__class__.__name__
self.objects = {} self.objects = {}
self.lock = Lock() self.lock = Lock()