mirror of
https://github.com/clearml/clearml-server
synced 2025-05-02 03:03:58 +00:00
Fix threaded jobs management (invoke only from AppSequence)
This commit is contained in:
parent
4dff163af4
commit
efd56e085e
@ -7,7 +7,7 @@ from elasticsearch import Elasticsearch
|
|||||||
from apiserver import database
|
from apiserver import database
|
||||||
from apiserver.es_factory import es_factory
|
from apiserver.es_factory import es_factory
|
||||||
from apiserver.apierrors import errors
|
from apiserver.apierrors import errors
|
||||||
from apiserver.bll.queue.queue_metrics import QueueMetrics, MetricsRefresher
|
from apiserver.bll.queue.queue_metrics import QueueMetrics
|
||||||
from apiserver.bll.workers import WorkerBLL
|
from apiserver.bll.workers import WorkerBLL
|
||||||
from apiserver.config_repo import config
|
from apiserver.config_repo import config
|
||||||
from apiserver.database.errors import translate_errors_context
|
from apiserver.database.errors import translate_errors_context
|
||||||
@ -334,6 +334,3 @@ class QueueBLL(object):
|
|||||||
if res is None:
|
if res is None:
|
||||||
raise errors.bad_request.InvalidQueueId(queue_id=queue_id)
|
raise errors.bad_request.InvalidQueueId(queue_id=queue_id)
|
||||||
return int(res.get("count"))
|
return int(res.get("count"))
|
||||||
|
|
||||||
|
|
||||||
MetricsRefresher.start(queue_metrics=QueueBLL().metrics)
|
|
||||||
|
@ -279,10 +279,14 @@ class MetricsRefresher:
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@threads.register("queue_metrics_refresh_watchdog", daemon=True)
|
@threads.register("queue_metrics_refresh_watchdog", daemon=True)
|
||||||
def start(cls, queue_metrics: QueueMetrics):
|
def start(cls, queue_metrics: QueueMetrics = None):
|
||||||
if not cls.watch_interval_sec:
|
if not cls.watch_interval_sec:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if not queue_metrics:
|
||||||
|
from .queue_bll import QueueBLL
|
||||||
|
queue_metrics = QueueBLL().metrics
|
||||||
|
|
||||||
sleep(10)
|
sleep(10)
|
||||||
while not ThreadsManager.terminating:
|
while not ThreadsManager.terminating:
|
||||||
try:
|
try:
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import operator
|
import operator
|
||||||
from threading import Thread, Lock
|
from threading import Lock
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
|
||||||
import attr
|
import attr
|
||||||
@ -9,7 +9,9 @@ import psutil
|
|||||||
from apiserver.utilities.threads_manager import ThreadsManager
|
from apiserver.utilities.threads_manager import ThreadsManager
|
||||||
|
|
||||||
|
|
||||||
class ResourceMonitor(Thread):
|
stat_threads = ThreadsManager("Statistics")
|
||||||
|
|
||||||
|
|
||||||
@attr.s(auto_attribs=True)
|
@attr.s(auto_attribs=True)
|
||||||
class Sample:
|
class Sample:
|
||||||
cpu_usage: float = 0.0
|
cpu_usage: float = 0.0
|
||||||
@ -37,48 +39,53 @@ class ResourceMonitor(Thread):
|
|||||||
res = self._apply(lambda x: x / (count + 1), res)
|
res = self._apply(lambda x: x / (count + 1), res)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def __init__(self, sample_interval_sec=5):
|
|
||||||
super(ResourceMonitor, self).__init__(daemon=True)
|
|
||||||
self.sample_interval_sec = sample_interval_sec
|
|
||||||
self._lock = Lock()
|
|
||||||
self._clear()
|
|
||||||
|
|
||||||
def _clear(self):
|
|
||||||
sample = self._get_sample()
|
|
||||||
self._avg = sample
|
|
||||||
self._min = sample
|
|
||||||
self._max = sample
|
|
||||||
self._clear_time = datetime.utcnow()
|
|
||||||
self._count = 1
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _get_sample(cls) -> Sample:
|
def get_current_sample(cls) -> "Sample":
|
||||||
return cls.Sample(
|
return cls(
|
||||||
cpu_usage=psutil.cpu_percent(),
|
cpu_usage=psutil.cpu_percent(),
|
||||||
mem_used_gb=psutil.virtual_memory().used / (1024 ** 3),
|
mem_used_gb=psutil.virtual_memory().used / (1024 ** 3),
|
||||||
mem_free_gb=psutil.virtual_memory().free / (1024 ** 3),
|
mem_free_gb=psutil.virtual_memory().free / (1024 ** 3),
|
||||||
)
|
)
|
||||||
|
|
||||||
def run(self):
|
|
||||||
|
class ResourceMonitor:
|
||||||
|
class Accumulator:
|
||||||
|
def __init__(self):
|
||||||
|
sample = Sample.get_current_sample()
|
||||||
|
self.avg = sample
|
||||||
|
self.min = sample
|
||||||
|
self.max = sample
|
||||||
|
self.time = datetime.utcnow()
|
||||||
|
self.count = 1
|
||||||
|
|
||||||
|
def add_sample(self, sample: Sample):
|
||||||
|
self.min = self.min.min(sample)
|
||||||
|
self.max = self.max.max(sample)
|
||||||
|
self.avg = self.avg.avg(sample, self.count)
|
||||||
|
self.count += 1
|
||||||
|
|
||||||
|
sample_interval_sec = 5
|
||||||
|
_lock = Lock()
|
||||||
|
accumulator = Accumulator()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
@stat_threads.register("resource_monitor", daemon=True)
|
||||||
|
def start(cls):
|
||||||
while not ThreadsManager.terminating:
|
while not ThreadsManager.terminating:
|
||||||
sleep(self.sample_interval_sec)
|
sleep(cls.sample_interval_sec)
|
||||||
|
sample = Sample.get_current_sample()
|
||||||
|
with cls._lock:
|
||||||
|
cls.accumulator.add_sample(sample)
|
||||||
|
|
||||||
sample = self._get_sample()
|
@classmethod
|
||||||
|
def get_stats(cls) -> dict:
|
||||||
with self._lock:
|
|
||||||
self._min = self._min.min(sample)
|
|
||||||
self._max = self._max.max(sample)
|
|
||||||
self._avg = self._avg.avg(sample, self._count)
|
|
||||||
self._count += 1
|
|
||||||
|
|
||||||
def get_stats(self) -> dict:
|
|
||||||
""" Returns current resource statistics and clears internal resource statistics """
|
""" Returns current resource statistics and clears internal resource statistics """
|
||||||
with self._lock:
|
with cls._lock:
|
||||||
min_ = attr.asdict(self._min)
|
min_ = attr.asdict(cls.accumulator.min)
|
||||||
max_ = attr.asdict(self._max)
|
max_ = attr.asdict(cls.accumulator.max)
|
||||||
avg = attr.asdict(self._avg)
|
avg = attr.asdict(cls.accumulator.avg)
|
||||||
interval = datetime.utcnow() - self._clear_time
|
interval = datetime.utcnow() - cls.accumulator.time
|
||||||
self._clear()
|
cls.accumulator = cls.Accumulator()
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"interval_sec": interval.total_seconds(),
|
"interval_sec": interval.total_seconds(),
|
||||||
|
@ -23,7 +23,7 @@ from apiserver.tools import safe_get
|
|||||||
from apiserver.utilities.json import dumps
|
from apiserver.utilities.json import dumps
|
||||||
from apiserver.utilities.threads_manager import ThreadsManager
|
from apiserver.utilities.threads_manager import ThreadsManager
|
||||||
from apiserver.version import __version__ as current_version
|
from apiserver.version import __version__ as current_version
|
||||||
from .resource_monitor import ResourceMonitor
|
from .resource_monitor import ResourceMonitor, stat_threads
|
||||||
|
|
||||||
log = config.logger(__file__)
|
log = config.logger(__file__)
|
||||||
|
|
||||||
@ -31,17 +31,19 @@ worker_bll = WorkerBLL()
|
|||||||
|
|
||||||
|
|
||||||
class StatisticsReporter:
|
class StatisticsReporter:
|
||||||
threads = ThreadsManager("Statistics", resource_monitor=ResourceMonitor)
|
|
||||||
send_queue = queue.Queue()
|
send_queue = queue.Queue()
|
||||||
supported = config.get("apiserver.statistics.supported", True)
|
supported = config.get("apiserver.statistics.supported", True)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def start(cls):
|
def start(cls):
|
||||||
|
if not cls.supported:
|
||||||
|
return
|
||||||
|
ResourceMonitor.start()
|
||||||
cls.start_sender()
|
cls.start_sender()
|
||||||
cls.start_reporter()
|
cls.start_reporter()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@threads.register("reporter", daemon=True)
|
@stat_threads.register("reporter", daemon=True)
|
||||||
def start_reporter(cls):
|
def start_reporter(cls):
|
||||||
"""
|
"""
|
||||||
Periodically send statistics reports for companies who have opted in.
|
Periodically send statistics reports for companies who have opted in.
|
||||||
@ -68,7 +70,7 @@ class StatisticsReporter:
|
|||||||
sleep(report_interval.total_seconds())
|
sleep(report_interval.total_seconds())
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@threads.register("sender", daemon=True)
|
@stat_threads.register("sender", daemon=True)
|
||||||
def start_sender(cls):
|
def start_sender(cls):
|
||||||
if not cls.supported:
|
if not cls.supported:
|
||||||
return
|
return
|
||||||
@ -111,7 +113,7 @@ class StatisticsReporter:
|
|||||||
"uuid": get_server_uuid(),
|
"uuid": get_server_uuid(),
|
||||||
"queues": {"count": Queue.objects(company=company_id).count()},
|
"queues": {"count": Queue.objects(company=company_id).count()},
|
||||||
"users": {"count": User.objects(company=company_id).count()},
|
"users": {"count": User.objects(company=company_id).count()},
|
||||||
"resources": cls.threads.resource_monitor.get_stats(),
|
"resources": ResourceMonitor.get_stats(),
|
||||||
"experiments": next(
|
"experiments": next(
|
||||||
iter(cls._get_experiments_stats(company_id).values()), {}
|
iter(cls._get_experiments_stats(company_id).values()), {}
|
||||||
),
|
),
|
||||||
|
@ -6,6 +6,8 @@ from flask_compress import Compress
|
|||||||
from flask_cors import CORS
|
from flask_cors import CORS
|
||||||
from packaging.version import Version
|
from packaging.version import Version
|
||||||
|
|
||||||
|
from apiserver.bll.queue.queue_metrics import MetricsRefresher
|
||||||
|
from apiserver.bll.task.non_responsive_tasks_watchdog import NonResponsiveTasksWatchdog
|
||||||
from apiserver.database import db
|
from apiserver.database import db
|
||||||
from apiserver.bll.statistics.stats_reporter import StatisticsReporter
|
from apiserver.bll.statistics.stats_reporter import StatisticsReporter
|
||||||
from apiserver.config import info
|
from apiserver.config import info
|
||||||
@ -119,6 +121,8 @@ class AppSequence:
|
|||||||
def _start_worker(self):
|
def _start_worker(self):
|
||||||
check_updates_thread.start()
|
check_updates_thread.start()
|
||||||
StatisticsReporter.start()
|
StatisticsReporter.start()
|
||||||
|
MetricsRefresher.start()
|
||||||
|
NonResponsiveTasksWatchdog.start()
|
||||||
|
|
||||||
def _on_worker_stop(self):
|
def _on_worker_stop(self):
|
||||||
ThreadsManager.terminating = True
|
ThreadsManager.terminating = True
|
||||||
|
@ -135,8 +135,6 @@ queue_bll = QueueBLL()
|
|||||||
org_bll = OrgBLL()
|
org_bll = OrgBLL()
|
||||||
project_bll = ProjectBLL()
|
project_bll = ProjectBLL()
|
||||||
|
|
||||||
NonResponsiveTasksWatchdog.start()
|
|
||||||
|
|
||||||
|
|
||||||
def set_task_status_from_call(
|
def set_task_status_from_call(
|
||||||
request: UpdateRequest, company_id, new_status=None, **set_fields
|
request: UpdateRequest, company_id, new_status=None, **set_fields
|
||||||
|
@ -9,22 +9,8 @@ class ThreadsManager:
|
|||||||
request_context_creator: ClassVar[Callable] = None
|
request_context_creator: ClassVar[Callable] = None
|
||||||
terminating: ClassVar[bool] = False
|
terminating: ClassVar[bool] = False
|
||||||
|
|
||||||
def __init__(self, name=None, **threads):
|
def __init__(self, name=None):
|
||||||
self.name = name or self.__class__.__name__
|
self.name = name or self.__class__.__name__
|
||||||
self.objects = {}
|
|
||||||
self.lock = Lock()
|
|
||||||
|
|
||||||
for thread_name, thread in threads.items():
|
|
||||||
if issubclass(thread, Thread):
|
|
||||||
thread = thread()
|
|
||||||
thread.start()
|
|
||||||
elif isinstance(thread, Thread):
|
|
||||||
if not thread.is_alive():
|
|
||||||
thread.start()
|
|
||||||
else:
|
|
||||||
raise Exception(f"Expected thread or thread class ({thread_name}): {thread}")
|
|
||||||
|
|
||||||
self.objects[thread_name] = thread
|
|
||||||
|
|
||||||
def register(self, thread_name, daemon=True):
|
def register(self, thread_name, daemon=True):
|
||||||
def decorator(f):
|
def decorator(f):
|
||||||
|
Loading…
Reference in New Issue
Block a user