Fix threaded jobs management (invoke only from AppSequence)

This commit is contained in:
allegroai 2022-09-29 19:13:22 +03:00
parent 4dff163af4
commit efd56e085e
7 changed files with 84 additions and 86 deletions

View File

@ -7,7 +7,7 @@ from elasticsearch import Elasticsearch
from apiserver import database from apiserver import database
from apiserver.es_factory import es_factory from apiserver.es_factory import es_factory
from apiserver.apierrors import errors from apiserver.apierrors import errors
from apiserver.bll.queue.queue_metrics import QueueMetrics, MetricsRefresher from apiserver.bll.queue.queue_metrics import QueueMetrics
from apiserver.bll.workers import WorkerBLL from apiserver.bll.workers import WorkerBLL
from apiserver.config_repo import config from apiserver.config_repo import config
from apiserver.database.errors import translate_errors_context from apiserver.database.errors import translate_errors_context
@ -334,6 +334,3 @@ class QueueBLL(object):
if res is None: if res is None:
raise errors.bad_request.InvalidQueueId(queue_id=queue_id) raise errors.bad_request.InvalidQueueId(queue_id=queue_id)
return int(res.get("count")) return int(res.get("count"))
MetricsRefresher.start(queue_metrics=QueueBLL().metrics)

View File

@ -279,10 +279,14 @@ class MetricsRefresher:
@classmethod @classmethod
@threads.register("queue_metrics_refresh_watchdog", daemon=True) @threads.register("queue_metrics_refresh_watchdog", daemon=True)
def start(cls, queue_metrics: QueueMetrics): def start(cls, queue_metrics: QueueMetrics = None):
if not cls.watch_interval_sec: if not cls.watch_interval_sec:
return return
if not queue_metrics:
from .queue_bll import QueueBLL
queue_metrics = QueueBLL().metrics
sleep(10) sleep(10)
while not ThreadsManager.terminating: while not ThreadsManager.terminating:
try: try:

View File

@ -1,6 +1,6 @@
from datetime import datetime from datetime import datetime
import operator import operator
from threading import Thread, Lock from threading import Lock
from time import sleep from time import sleep
import attr import attr
@ -9,7 +9,9 @@ import psutil
from apiserver.utilities.threads_manager import ThreadsManager from apiserver.utilities.threads_manager import ThreadsManager
class ResourceMonitor(Thread): stat_threads = ThreadsManager("Statistics")
@attr.s(auto_attribs=True) @attr.s(auto_attribs=True)
class Sample: class Sample:
cpu_usage: float = 0.0 cpu_usage: float = 0.0
@ -37,48 +39,53 @@ class ResourceMonitor(Thread):
res = self._apply(lambda x: x / (count + 1), res) res = self._apply(lambda x: x / (count + 1), res)
return res return res
def __init__(self, sample_interval_sec=5):
super(ResourceMonitor, self).__init__(daemon=True)
self.sample_interval_sec = sample_interval_sec
self._lock = Lock()
self._clear()
def _clear(self):
sample = self._get_sample()
self._avg = sample
self._min = sample
self._max = sample
self._clear_time = datetime.utcnow()
self._count = 1
@classmethod @classmethod
def _get_sample(cls) -> Sample: def get_current_sample(cls) -> "Sample":
return cls.Sample( return cls(
cpu_usage=psutil.cpu_percent(), cpu_usage=psutil.cpu_percent(),
mem_used_gb=psutil.virtual_memory().used / (1024 ** 3), mem_used_gb=psutil.virtual_memory().used / (1024 ** 3),
mem_free_gb=psutil.virtual_memory().free / (1024 ** 3), mem_free_gb=psutil.virtual_memory().free / (1024 ** 3),
) )
def run(self):
class ResourceMonitor:
class Accumulator:
def __init__(self):
sample = Sample.get_current_sample()
self.avg = sample
self.min = sample
self.max = sample
self.time = datetime.utcnow()
self.count = 1
def add_sample(self, sample: Sample):
self.min = self.min.min(sample)
self.max = self.max.max(sample)
self.avg = self.avg.avg(sample, self.count)
self.count += 1
sample_interval_sec = 5
_lock = Lock()
accumulator = Accumulator()
@classmethod
@stat_threads.register("resource_monitor", daemon=True)
def start(cls):
while not ThreadsManager.terminating: while not ThreadsManager.terminating:
sleep(self.sample_interval_sec) sleep(cls.sample_interval_sec)
sample = Sample.get_current_sample()
with cls._lock:
cls.accumulator.add_sample(sample)
sample = self._get_sample() @classmethod
def get_stats(cls) -> dict:
with self._lock:
self._min = self._min.min(sample)
self._max = self._max.max(sample)
self._avg = self._avg.avg(sample, self._count)
self._count += 1
def get_stats(self) -> dict:
""" Returns current resource statistics and clears internal resource statistics """ """ Returns current resource statistics and clears internal resource statistics """
with self._lock: with cls._lock:
min_ = attr.asdict(self._min) min_ = attr.asdict(cls.accumulator.min)
max_ = attr.asdict(self._max) max_ = attr.asdict(cls.accumulator.max)
avg = attr.asdict(self._avg) avg = attr.asdict(cls.accumulator.avg)
interval = datetime.utcnow() - self._clear_time interval = datetime.utcnow() - cls.accumulator.time
self._clear() cls.accumulator = cls.Accumulator()
return { return {
"interval_sec": interval.total_seconds(), "interval_sec": interval.total_seconds(),

View File

@ -23,7 +23,7 @@ from apiserver.tools import safe_get
from apiserver.utilities.json import dumps from apiserver.utilities.json import dumps
from apiserver.utilities.threads_manager import ThreadsManager from apiserver.utilities.threads_manager import ThreadsManager
from apiserver.version import __version__ as current_version from apiserver.version import __version__ as current_version
from .resource_monitor import ResourceMonitor from .resource_monitor import ResourceMonitor, stat_threads
log = config.logger(__file__) log = config.logger(__file__)
@ -31,17 +31,19 @@ worker_bll = WorkerBLL()
class StatisticsReporter: class StatisticsReporter:
threads = ThreadsManager("Statistics", resource_monitor=ResourceMonitor)
send_queue = queue.Queue() send_queue = queue.Queue()
supported = config.get("apiserver.statistics.supported", True) supported = config.get("apiserver.statistics.supported", True)
@classmethod @classmethod
def start(cls): def start(cls):
if not cls.supported:
return
ResourceMonitor.start()
cls.start_sender() cls.start_sender()
cls.start_reporter() cls.start_reporter()
@classmethod @classmethod
@threads.register("reporter", daemon=True) @stat_threads.register("reporter", daemon=True)
def start_reporter(cls): def start_reporter(cls):
""" """
Periodically send statistics reports for companies who have opted in. Periodically send statistics reports for companies who have opted in.
@ -68,7 +70,7 @@ class StatisticsReporter:
sleep(report_interval.total_seconds()) sleep(report_interval.total_seconds())
@classmethod @classmethod
@threads.register("sender", daemon=True) @stat_threads.register("sender", daemon=True)
def start_sender(cls): def start_sender(cls):
if not cls.supported: if not cls.supported:
return return
@ -111,7 +113,7 @@ class StatisticsReporter:
"uuid": get_server_uuid(), "uuid": get_server_uuid(),
"queues": {"count": Queue.objects(company=company_id).count()}, "queues": {"count": Queue.objects(company=company_id).count()},
"users": {"count": User.objects(company=company_id).count()}, "users": {"count": User.objects(company=company_id).count()},
"resources": cls.threads.resource_monitor.get_stats(), "resources": ResourceMonitor.get_stats(),
"experiments": next( "experiments": next(
iter(cls._get_experiments_stats(company_id).values()), {} iter(cls._get_experiments_stats(company_id).values()), {}
), ),

View File

@ -6,6 +6,8 @@ from flask_compress import Compress
from flask_cors import CORS from flask_cors import CORS
from packaging.version import Version from packaging.version import Version
from apiserver.bll.queue.queue_metrics import MetricsRefresher
from apiserver.bll.task.non_responsive_tasks_watchdog import NonResponsiveTasksWatchdog
from apiserver.database import db from apiserver.database import db
from apiserver.bll.statistics.stats_reporter import StatisticsReporter from apiserver.bll.statistics.stats_reporter import StatisticsReporter
from apiserver.config import info from apiserver.config import info
@ -119,6 +121,8 @@ class AppSequence:
def _start_worker(self): def _start_worker(self):
check_updates_thread.start() check_updates_thread.start()
StatisticsReporter.start() StatisticsReporter.start()
MetricsRefresher.start()
NonResponsiveTasksWatchdog.start()
def _on_worker_stop(self): def _on_worker_stop(self):
ThreadsManager.terminating = True ThreadsManager.terminating = True

View File

@ -135,8 +135,6 @@ queue_bll = QueueBLL()
org_bll = OrgBLL() org_bll = OrgBLL()
project_bll = ProjectBLL() project_bll = ProjectBLL()
NonResponsiveTasksWatchdog.start()
def set_task_status_from_call( def set_task_status_from_call(
request: UpdateRequest, company_id, new_status=None, **set_fields request: UpdateRequest, company_id, new_status=None, **set_fields

View File

@ -9,22 +9,8 @@ class ThreadsManager:
request_context_creator: ClassVar[Callable] = None request_context_creator: ClassVar[Callable] = None
terminating: ClassVar[bool] = False terminating: ClassVar[bool] = False
def __init__(self, name=None, **threads): def __init__(self, name=None):
self.name = name or self.__class__.__name__ self.name = name or self.__class__.__name__
self.objects = {}
self.lock = Lock()
for thread_name, thread in threads.items():
if issubclass(thread, Thread):
thread = thread()
thread.start()
elif isinstance(thread, Thread):
if not thread.is_alive():
thread.start()
else:
raise Exception(f"Expected thread or thread class ({thread_name}): {thread}")
self.objects[thread_name] = thread
def register(self, thread_name, daemon=True): def register(self, thread_name, daemon=True):
def decorator(f): def decorator(f):