clearml/trains/automation/job.py
2020-07-04 22:52:09 +03:00

359 lines
12 KiB
Python

import hashlib
from datetime import datetime
from logging import getLogger
from time import time, sleep
from typing import Optional, Mapping, Sequence, Any
from ..task import Task
from ..backend_api.services import tasks as tasks_service
logger = getLogger('trains.automation.job')
class TrainsJob(object):
def __init__(
self,
base_task_id, # type: str
parameter_override=None, # type: Optional[Mapping[str, str]]
task_overrides=None, # type: Optional[Mapping[str, str]]
tags=None, # type: Optional[Sequence[str]]
parent=None, # type: Optional[str]
**kwargs # type: Any
):
# type: (...) -> ()
"""
Create a new Task based in a base_task_id with a different set of parameters
:param str base_task_id: base task id to clone from
:param dict parameter_override: dictionary of parameters and values to set fo the cloned task
:param dict task_overrides: Task object specific overrides
:param list tags: additional tags to add to the newly cloned task
:param str parent: Set newly created Task parent task field, default: base_tak_id.
:param dict kwargs: additional Task creation parameters
"""
self.task = Task.clone(base_task_id, parent=parent or base_task_id, **kwargs)
if tags:
self.task.set_tags(list(set(self.task.get_tags()) | set(tags)))
if parameter_override:
params = self.task.get_parameters_as_dict()
params.update(parameter_override)
self.task.set_parameters_as_dict(params)
if task_overrides:
# todo: make sure it works
# noinspection PyProtectedMember
self.task._edit(**task_overrides)
self.task_started = False
self._worker = None
def get_metric(self, title, series):
# type: (str, str) -> (float, float, float)
"""
Retrieve a specific scalar metric from the running Task.
:param str title: Graph title (metric)
:param str series: Series on the specific graph (variant)
:return: A tuple of min value, max value, last value
"""
title = hashlib.md5(str(title).encode('utf-8')).hexdigest()
series = hashlib.md5(str(series).encode('utf-8')).hexdigest()
metric = 'last_metrics.{}.{}.'.format(title, series)
values = ['min_value', 'max_value', 'value']
metrics = [metric + v for v in values]
res = self.task.send(
tasks_service.GetAllRequest(
id=[self.task.id],
page=0,
page_size=1,
only_fields=['id', ] + metrics
)
)
response = res.wait()
return tuple(response.response_data['tasks'][0]['last_metrics'][title][series][v] for v in values)
def launch(self, queue_name=None):
# type: (str) -> ()
"""
Send Job for execution on the requested execution queue
:param str queue_name:
"""
try:
Task.enqueue(task=self.task, queue_name=queue_name)
except Exception as ex:
logger.warning(ex)
def abort(self):
# type: () -> ()
"""
Abort currently running job (can be called multiple times)
"""
try:
self.task.stopped()
except Exception as ex:
logger.warning(ex)
def elapsed(self):
# type: () -> float
"""
Return the time in seconds since job started. Return -1 if job is still pending
:return: Seconds from start.
"""
if not self.task_started and str(self.task.status) != Task.TaskStatusEnum.in_progress:
return -1
self.task_started = True
if not self.task.data.started:
self.task.reload()
if not self.task.data.started:
return -1
return (datetime.now(tz=self.task.data.started.tzinfo) - self.task.data.started).total_seconds()
def iterations(self):
# type: () -> int
"""
Return the last iteration value of the current job. -1 if job has not started yet
:return: Task last iteration.
"""
if not self.task_started and self.task.status != Task.TaskStatusEnum.in_progress:
return -1
self.task_started = True
return self.task.get_last_iteration()
def task_id(self):
# type: () -> str
"""
Return the Task id.
:return: The Task ID.
"""
return self.task.id
def status(self):
# type: () -> str
"""
Return the Job Task current status, see Task.TaskStatusEnum
:return: Task status Task.TaskStatusEnum in string.
"""
return self.task.status
def wait(self, timeout=None, pool_period=30.):
# type: (Optional[float], float) -> bool
"""
Wait until the task is fully executed (i.e., aborted/completed/failed)
:param timeout: maximum time (minutes) to wait for Task to finish
:param pool_period: check task status every pool_period seconds
:return: True, if Task finished.
"""
tic = time()
while timeout is None or time() - tic < timeout * 60.:
if self.is_stopped():
return True
sleep(pool_period)
return self.is_stopped()
def get_console_output(self, number_of_reports=1):
# type: (int) -> Sequence[str]
"""
Return a list of console outputs reported by the Task.
Returned console outputs are retrieved from the most updated console outputs.
:param int number_of_reports: number of reports to return, default 1, the last (most updated) console output
:return: List of strings each entry corresponds to one report.
"""
return self.task.get_reported_console_output(number_of_reports=number_of_reports)
def worker(self):
# type: () -> str
"""
Return the current worker id executing this Job. If job is pending, returns None
:return: ID of the worker executing / executed the job, or None if job is still pending.
"""
if self.is_pending():
return self._worker
if self._worker is None:
# the last console outputs will update the worker
self.get_console_output(number_of_reports=1)
# if we still do not have it, store empty string
if not self._worker:
self._worker = ''
return self._worker
def is_running(self):
# type: () -> bool
"""
Return True, if job is currently running (pending is considered False)
:return: True, if the task is currently in progress.
"""
return self.task.status == Task.TaskStatusEnum.in_progress
def is_stopped(self):
# type: () -> bool
"""
Return True, if job is has executed and is not any more
:return: True the task is currently one of these states, stopped / completed / failed / published.
"""
return self.task.status in (
Task.TaskStatusEnum.stopped, Task.TaskStatusEnum.completed,
Task.TaskStatusEnum.failed, Task.TaskStatusEnum.published)
def is_failed(self):
# type: () -> bool
"""
Return True, if job is has executed and failed
:return: True the task is currently in failed state
"""
return self.task.status in (Task.TaskStatusEnum.failed, )
def is_pending(self):
# type: () -> bool
"""
Return True, if job is waiting for execution
:return: True the task is currently is currently queued.
"""
return self.task.status in (Task.TaskStatusEnum.queued, Task.TaskStatusEnum.created)
def started(self):
# type: () -> bool
"""
Return True, if job already started, or ended. False, if created/pending.
:return: False, if the task is currently in draft mode or pending.
"""
if not self.task_started and self.task.status in (
Task.TaskStatusEnum.in_progress, Task.TaskStatusEnum.created):
return False
self.task_started = True
return True
# noinspection PyMethodMayBeStatic, PyUnusedLocal
class _JobStub(object):
"""
This is a Job Stub, use only for debugging
"""
def __init__(
self,
base_task_id, # type: str
parameter_override=None, # type: Optional[Mapping[str, str]]
task_overrides=None, # type: Optional[Mapping[str, str]]
tags=None, # type: Optional[Sequence[str]]
**kwargs # type: Any
):
# type: (...) -> ()
self.task = None
self.base_task_id = base_task_id
self.parameter_override = parameter_override
self.task_overrides = task_overrides
self.tags = tags
self.iteration = -1
self.task_started = None
def launch(self, queue_name=None):
# type: (str) -> ()
self.iteration = 0
self.task_started = time()
print('launching', self.parameter_override, 'in', queue_name)
def abort(self):
# type: () -> ()
self.task_started = -1
def elapsed(self):
# type: () -> float
"""
Return the time in seconds since job started. Return -1 if job is still pending
:return: Seconds from start.
"""
if self.task_started is None:
return -1
return time() - self.task_started
def iterations(self):
# type: () -> int
"""
Return the last iteration value of the current job. -1 if job has not started yet
:return: Task last iteration.
"""
if self.task_started is None:
return -1
return self.iteration
def get_metric(self, title, series):
# type: (str, str) -> (float, float, float)
"""
Retrieve a specific scalar metric from the running Task.
:param str title: Graph title (metric)
:param str series: Series on the specific graph (variant)
:return: min value, max value, last value
"""
return 0, 1.0, 0.123
def task_id(self):
# type: () -> str
return 'stub'
def worker(self):
# type: () -> ()
return None
def status(self):
# type: () -> str
return 'in_progress'
def wait(self, timeout=None, pool_period=30.):
# type: (Optional[float], float) -> bool
"""
Wait for the task to be processed (i.e., aborted/completed/failed)
:param timeout: maximum time (minutes) to wait for Task to finish
:param pool_period: check task status every pool_period seconds
:return: True, if the Task finished.
"""
return True
def get_console_output(self, number_of_reports=1):
# type: (int) -> Sequence[str]
"""
Return a list of console outputs reported by the Task.
Returned console outputs are retrieved from the most updated console outputs.
:param int number_of_reports: number of reports to return, default 1, the last (most updated) console output
:return: List of strings each entry corresponds to one report.
"""
return []
def is_running(self):
# type: () -> bool
return self.task_started is not None and self.task_started > 0
def is_stopped(self):
# type: () -> bool
return self.task_started is not None and self.task_started < 0
def is_pending(self):
# type: () -> bool
return self.task_started is None
def started(self):
# type: () -> bool
return not self.is_pending()