mirror of
https://github.com/clearml/clearml
synced 2025-05-11 16:10:39 +00:00
Add new task state support
This commit is contained in:
parent
b5219d98de
commit
22a65abdc1
@ -1,5 +1,5 @@
|
|||||||
from ....config import config
|
from ....config import config
|
||||||
from ....backend_interface import Task, TaskStatusEnum
|
from ....backend_api.services import tasks
|
||||||
|
|
||||||
|
|
||||||
class TaskStopReason(object):
|
class TaskStopReason(object):
|
||||||
@ -13,14 +13,16 @@ class TaskStopSignal(object):
|
|||||||
|
|
||||||
_number_of_consecutive_reset_tests = 4
|
_number_of_consecutive_reset_tests = 4
|
||||||
|
|
||||||
_unexpected_statuses = (
|
# _unexpected_statuses = (
|
||||||
TaskStatusEnum.closed,
|
# tasks.TaskStatusEnum.closed,
|
||||||
TaskStatusEnum.stopped,
|
# tasks.TaskStatusEnum.stopped,
|
||||||
TaskStatusEnum.failed,
|
# tasks.TaskStatusEnum.failed,
|
||||||
TaskStatusEnum.published,
|
# tasks.TaskStatusEnum.published,
|
||||||
)
|
# tasks.TaskStatusEnum.completed,
|
||||||
|
# )
|
||||||
|
|
||||||
def __init__(self, task):
|
def __init__(self, task):
|
||||||
|
from ....backend_interface import Task
|
||||||
assert isinstance(task, Task)
|
assert isinstance(task, Task)
|
||||||
self.task = task
|
self.task = task
|
||||||
self._task_reset_state_counter = 0
|
self._task_reset_state_counter = 0
|
||||||
@ -29,13 +31,19 @@ class TaskStopSignal(object):
|
|||||||
status = self.task.status
|
status = self.task.status
|
||||||
message = self.task.data.status_message
|
message = self.task.data.status_message
|
||||||
|
|
||||||
if status == TaskStatusEnum.in_progress and "stopping" in message:
|
if status == tasks.TaskStatusEnum.in_progress and "stopping" in message:
|
||||||
return TaskStopReason.stopped
|
return TaskStopReason.stopped
|
||||||
|
|
||||||
if status in self._unexpected_statuses and "worker" not in message:
|
_expected_statuses = (
|
||||||
|
tasks.TaskStatusEnum.created,
|
||||||
|
tasks.TaskStatusEnum.queued,
|
||||||
|
tasks.TaskStatusEnum.in_progress,
|
||||||
|
)
|
||||||
|
|
||||||
|
if status not in _expected_statuses and "worker" not in message:
|
||||||
return TaskStopReason.status_changed
|
return TaskStopReason.status_changed
|
||||||
|
|
||||||
if status == TaskStatusEnum.created:
|
if status == tasks.TaskStatusEnum.created:
|
||||||
self._task_reset_state_counter += 1
|
self._task_reset_state_counter += 1
|
||||||
|
|
||||||
if self._task_reset_state_counter >= self._number_of_consecutive_reset_tests:
|
if self._task_reset_state_counter >= self._number_of_consecutive_reset_tests:
|
||||||
|
@ -1,26 +1,70 @@
|
|||||||
from socket import gethostname
|
from socket import gethostname
|
||||||
|
|
||||||
import attr
|
import attr
|
||||||
|
from threading import Thread, Event
|
||||||
|
|
||||||
from ....config import config, running_remotely, dev_worker_name
|
from time import time
|
||||||
|
|
||||||
|
from ....config import config
|
||||||
|
from ....backend_interface.task.development.stop_signal import TaskStopSignal
|
||||||
|
from ....backend_api.services import tasks
|
||||||
|
|
||||||
|
|
||||||
@attr.s
|
|
||||||
class DevWorker(object):
|
class DevWorker(object):
|
||||||
prefix = attr.ib(type=str, default="MANUAL:")
|
prefix = attr.ib(type=str, default="MANUAL:")
|
||||||
|
|
||||||
report_period = float(config.get('development.worker.report_period_sec', 30.))
|
report_period = float(config.get('development.worker.report_period_sec', 30.))
|
||||||
report_stdout = bool(config.get('development.worker.log_stdout', True))
|
report_stdout = bool(config.get('development.worker.log_stdout', True))
|
||||||
|
ping_period = 30.
|
||||||
|
|
||||||
@classmethod
|
def __init__(self):
|
||||||
def is_enabled(cls, model_updated=False):
|
self._dev_stop_signal = None
|
||||||
return False
|
self._thread = None
|
||||||
|
self._exit_event = Event()
|
||||||
|
self._task = None
|
||||||
|
self._support_ping = False
|
||||||
|
|
||||||
def status_report(self, timestamp=None):
|
def ping(self, timestamp=None):
|
||||||
|
try:
|
||||||
|
if self._task:
|
||||||
|
self._task.send(tasks.PingRequest(self._task.id))
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def register(self):
|
def register(self, task):
|
||||||
|
if self._thread:
|
||||||
|
return True
|
||||||
|
if TaskStopSignal.enabled:
|
||||||
|
self._dev_stop_signal = TaskStopSignal(task=task)
|
||||||
|
self._support_ping = hasattr(tasks, 'PingRequest')
|
||||||
|
# if there is nothing to monitor, leave
|
||||||
|
if not self._support_ping and not self._dev_stop_signal:
|
||||||
|
return
|
||||||
|
self._task = task
|
||||||
|
self._exit_event.clear()
|
||||||
|
self._thread = Thread(target=self._daemon)
|
||||||
|
self._thread.daemon = True
|
||||||
|
self._thread.start()
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
def _daemon(self):
|
||||||
|
last_ping = time()
|
||||||
|
while self._task is not None:
|
||||||
|
if self._exit_event.wait(min(self.ping_period, self.report_period)):
|
||||||
|
return
|
||||||
|
# send ping request
|
||||||
|
if self._support_ping and (time() - last_ping) >= self.ping_period:
|
||||||
|
self.ping()
|
||||||
|
last_ping = time()
|
||||||
|
if self._dev_stop_signal:
|
||||||
|
stop_reason = self._dev_stop_signal.test()
|
||||||
|
if stop_reason and self._task:
|
||||||
|
self._task._dev_mode_stop_task(stop_reason)
|
||||||
|
|
||||||
def unregister(self):
|
def unregister(self):
|
||||||
|
self._exit_event.set()
|
||||||
|
self._dev_stop_signal = None
|
||||||
|
self._thread = None
|
||||||
|
self._task = None
|
||||||
return True
|
return True
|
||||||
|
Loading…
Reference in New Issue
Block a user