mirror of
https://github.com/clearml/clearml
synced 2025-01-31 17:17:00 +00:00
359 lines
12 KiB
Python
359 lines
12 KiB
Python
import hashlib
|
|
from datetime import datetime
|
|
from logging import getLogger
|
|
from time import time, sleep
|
|
from typing import Optional, Mapping, Sequence, Any
|
|
|
|
from ..task import Task
|
|
from ..backend_api.services import tasks as tasks_service
|
|
|
|
|
|
logger = getLogger('trains.automation.job')
|
|
|
|
|
|
class TrainsJob(object):
|
|
def __init__(
|
|
self,
|
|
base_task_id, # type: str
|
|
parameter_override=None, # type: Optional[Mapping[str, str]]
|
|
task_overrides=None, # type: Optional[Mapping[str, str]]
|
|
tags=None, # type: Optional[Sequence[str]]
|
|
parent=None, # type: Optional[str]
|
|
**kwargs # type: Any
|
|
):
|
|
# type: (...) -> ()
|
|
"""
|
|
Create a new Task based in a base_task_id with a different set of parameters
|
|
|
|
:param str base_task_id: base task id to clone from
|
|
:param dict parameter_override: dictionary of parameters and values to set fo the cloned task
|
|
:param dict task_overrides: Task object specific overrides
|
|
:param list tags: additional tags to add to the newly cloned task
|
|
:param str parent: Set newly created Task parent task field, default: base_tak_id.
|
|
:param dict kwargs: additional Task creation parameters
|
|
"""
|
|
self.task = Task.clone(base_task_id, parent=parent or base_task_id, **kwargs)
|
|
if tags:
|
|
self.task.set_tags(list(set(self.task.get_tags()) | set(tags)))
|
|
if parameter_override:
|
|
params = self.task.get_parameters_as_dict()
|
|
params.update(parameter_override)
|
|
self.task.set_parameters_as_dict(params)
|
|
if task_overrides:
|
|
# todo: make sure it works
|
|
# noinspection PyProtectedMember
|
|
self.task._edit(**task_overrides)
|
|
self.task_started = False
|
|
self._worker = None
|
|
|
|
def get_metric(self, title, series):
|
|
# type: (str, str) -> (float, float, float)
|
|
"""
|
|
Retrieve a specific scalar metric from the running Task.
|
|
|
|
:param str title: Graph title (metric)
|
|
:param str series: Series on the specific graph (variant)
|
|
:return: A tuple of min value, max value, last value
|
|
"""
|
|
title = hashlib.md5(str(title).encode('utf-8')).hexdigest()
|
|
series = hashlib.md5(str(series).encode('utf-8')).hexdigest()
|
|
metric = 'last_metrics.{}.{}.'.format(title, series)
|
|
values = ['min_value', 'max_value', 'value']
|
|
metrics = [metric + v for v in values]
|
|
|
|
res = self.task.send(
|
|
tasks_service.GetAllRequest(
|
|
id=[self.task.id],
|
|
page=0,
|
|
page_size=1,
|
|
only_fields=['id', ] + metrics
|
|
)
|
|
)
|
|
response = res.wait()
|
|
|
|
return tuple(response.response_data['tasks'][0]['last_metrics'][title][series][v] for v in values)
|
|
|
|
def launch(self, queue_name=None):
|
|
# type: (str) -> ()
|
|
"""
|
|
Send Job for execution on the requested execution queue
|
|
|
|
:param str queue_name:
|
|
"""
|
|
try:
|
|
Task.enqueue(task=self.task, queue_name=queue_name)
|
|
except Exception as ex:
|
|
logger.warning(ex)
|
|
|
|
def abort(self):
|
|
# type: () -> ()
|
|
"""
|
|
Abort currently running job (can be called multiple times)
|
|
"""
|
|
try:
|
|
self.task.stopped()
|
|
except Exception as ex:
|
|
logger.warning(ex)
|
|
|
|
def elapsed(self):
|
|
# type: () -> float
|
|
"""
|
|
Return the time in seconds since job started. Return -1 if job is still pending
|
|
|
|
:return: Seconds from start.
|
|
"""
|
|
if not self.task_started and str(self.task.status) != Task.TaskStatusEnum.in_progress:
|
|
return -1
|
|
self.task_started = True
|
|
if not self.task.data.started:
|
|
self.task.reload()
|
|
if not self.task.data.started:
|
|
return -1
|
|
return (datetime.now(tz=self.task.data.started.tzinfo) - self.task.data.started).total_seconds()
|
|
|
|
def iterations(self):
|
|
# type: () -> int
|
|
"""
|
|
Return the last iteration value of the current job. -1 if job has not started yet
|
|
|
|
:return: Task last iteration.
|
|
"""
|
|
if not self.task_started and self.task.status != Task.TaskStatusEnum.in_progress:
|
|
return -1
|
|
self.task_started = True
|
|
return self.task.get_last_iteration()
|
|
|
|
def task_id(self):
|
|
# type: () -> str
|
|
"""
|
|
Return the Task id.
|
|
|
|
:return: The Task ID.
|
|
"""
|
|
return self.task.id
|
|
|
|
def status(self):
|
|
# type: () -> str
|
|
"""
|
|
Return the Job Task current status, see Task.TaskStatusEnum
|
|
|
|
:return: Task status Task.TaskStatusEnum in string.
|
|
"""
|
|
return self.task.status
|
|
|
|
def wait(self, timeout=None, pool_period=30.):
|
|
# type: (Optional[float], float) -> bool
|
|
"""
|
|
Wait until the task is fully executed (i.e., aborted/completed/failed)
|
|
|
|
:param timeout: maximum time (minutes) to wait for Task to finish
|
|
:param pool_period: check task status every pool_period seconds
|
|
:return: True, if Task finished.
|
|
"""
|
|
tic = time()
|
|
while timeout is None or time() - tic < timeout * 60.:
|
|
if self.is_stopped():
|
|
return True
|
|
sleep(pool_period)
|
|
|
|
return self.is_stopped()
|
|
|
|
def get_console_output(self, number_of_reports=1):
|
|
# type: (int) -> Sequence[str]
|
|
"""
|
|
Return a list of console outputs reported by the Task.
|
|
Returned console outputs are retrieved from the most updated console outputs.
|
|
|
|
:param int number_of_reports: number of reports to return, default 1, the last (most updated) console output
|
|
:return: List of strings each entry corresponds to one report.
|
|
"""
|
|
return self.task.get_reported_console_output(number_of_reports=number_of_reports)
|
|
|
|
def worker(self):
|
|
# type: () -> str
|
|
"""
|
|
Return the current worker id executing this Job. If job is pending, returns None
|
|
|
|
:return: ID of the worker executing / executed the job, or None if job is still pending.
|
|
"""
|
|
if self.is_pending():
|
|
return self._worker
|
|
|
|
if self._worker is None:
|
|
# the last console outputs will update the worker
|
|
self.get_console_output(number_of_reports=1)
|
|
# if we still do not have it, store empty string
|
|
if not self._worker:
|
|
self._worker = ''
|
|
|
|
return self._worker
|
|
|
|
def is_running(self):
|
|
# type: () -> bool
|
|
"""
|
|
Return True, if job is currently running (pending is considered False)
|
|
|
|
:return: True, if the task is currently in progress.
|
|
"""
|
|
return self.task.status == Task.TaskStatusEnum.in_progress
|
|
|
|
def is_stopped(self):
|
|
# type: () -> bool
|
|
"""
|
|
Return True, if job is has executed and is not any more
|
|
|
|
:return: True the task is currently one of these states, stopped / completed / failed / published.
|
|
"""
|
|
return self.task.status in (
|
|
Task.TaskStatusEnum.stopped, Task.TaskStatusEnum.completed,
|
|
Task.TaskStatusEnum.failed, Task.TaskStatusEnum.published)
|
|
|
|
def is_failed(self):
|
|
# type: () -> bool
|
|
"""
|
|
Return True, if job is has executed and failed
|
|
|
|
:return: True the task is currently in failed state
|
|
"""
|
|
return self.task.status in (Task.TaskStatusEnum.failed, )
|
|
|
|
def is_pending(self):
|
|
# type: () -> bool
|
|
"""
|
|
Return True, if job is waiting for execution
|
|
|
|
:return: True the task is currently is currently queued.
|
|
"""
|
|
return self.task.status in (Task.TaskStatusEnum.queued, Task.TaskStatusEnum.created)
|
|
|
|
def started(self):
|
|
# type: () -> bool
|
|
"""
|
|
Return True, if job already started, or ended. False, if created/pending.
|
|
|
|
:return: False, if the task is currently in draft mode or pending.
|
|
"""
|
|
if not self.task_started and self.task.status in (
|
|
Task.TaskStatusEnum.in_progress, Task.TaskStatusEnum.created):
|
|
return False
|
|
|
|
self.task_started = True
|
|
return True
|
|
|
|
|
|
# noinspection PyMethodMayBeStatic, PyUnusedLocal
|
|
class _JobStub(object):
|
|
"""
|
|
This is a Job Stub, use only for debugging
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
base_task_id, # type: str
|
|
parameter_override=None, # type: Optional[Mapping[str, str]]
|
|
task_overrides=None, # type: Optional[Mapping[str, str]]
|
|
tags=None, # type: Optional[Sequence[str]]
|
|
**kwargs # type: Any
|
|
):
|
|
# type: (...) -> ()
|
|
self.task = None
|
|
self.base_task_id = base_task_id
|
|
self.parameter_override = parameter_override
|
|
self.task_overrides = task_overrides
|
|
self.tags = tags
|
|
self.iteration = -1
|
|
self.task_started = None
|
|
|
|
def launch(self, queue_name=None):
|
|
# type: (str) -> ()
|
|
self.iteration = 0
|
|
self.task_started = time()
|
|
print('launching', self.parameter_override, 'in', queue_name)
|
|
|
|
def abort(self):
|
|
# type: () -> ()
|
|
self.task_started = -1
|
|
|
|
def elapsed(self):
|
|
# type: () -> float
|
|
"""
|
|
Return the time in seconds since job started. Return -1 if job is still pending
|
|
|
|
:return: Seconds from start.
|
|
"""
|
|
if self.task_started is None:
|
|
return -1
|
|
return time() - self.task_started
|
|
|
|
def iterations(self):
|
|
# type: () -> int
|
|
"""
|
|
Return the last iteration value of the current job. -1 if job has not started yet
|
|
:return: Task last iteration.
|
|
"""
|
|
if self.task_started is None:
|
|
return -1
|
|
return self.iteration
|
|
|
|
def get_metric(self, title, series):
|
|
# type: (str, str) -> (float, float, float)
|
|
"""
|
|
Retrieve a specific scalar metric from the running Task.
|
|
|
|
:param str title: Graph title (metric)
|
|
:param str series: Series on the specific graph (variant)
|
|
:return: min value, max value, last value
|
|
"""
|
|
return 0, 1.0, 0.123
|
|
|
|
def task_id(self):
|
|
# type: () -> str
|
|
return 'stub'
|
|
|
|
def worker(self):
|
|
# type: () -> ()
|
|
return None
|
|
|
|
def status(self):
|
|
# type: () -> str
|
|
return 'in_progress'
|
|
|
|
def wait(self, timeout=None, pool_period=30.):
|
|
# type: (Optional[float], float) -> bool
|
|
"""
|
|
Wait for the task to be processed (i.e., aborted/completed/failed)
|
|
|
|
:param timeout: maximum time (minutes) to wait for Task to finish
|
|
:param pool_period: check task status every pool_period seconds
|
|
:return: True, if the Task finished.
|
|
"""
|
|
return True
|
|
|
|
def get_console_output(self, number_of_reports=1):
|
|
# type: (int) -> Sequence[str]
|
|
"""
|
|
Return a list of console outputs reported by the Task.
|
|
Returned console outputs are retrieved from the most updated console outputs.
|
|
|
|
|
|
:param int number_of_reports: number of reports to return, default 1, the last (most updated) console output
|
|
:return: List of strings each entry corresponds to one report.
|
|
"""
|
|
return []
|
|
|
|
def is_running(self):
|
|
# type: () -> bool
|
|
return self.task_started is not None and self.task_started > 0
|
|
|
|
def is_stopped(self):
|
|
# type: () -> bool
|
|
return self.task_started is not None and self.task_started < 0
|
|
|
|
def is_pending(self):
|
|
# type: () -> bool
|
|
return self.task_started is None
|
|
|
|
def started(self):
|
|
# type: () -> bool
|
|
return not self.is_pending()
|