mirror of
https://github.com/clearml/clearml
synced 2025-05-07 14:24:31 +00:00
Change Task.wait_for_status() defaults & Add TrainsJob.is_failed()
This commit is contained in:
parent
29b4b7338b
commit
2004b64a5b
@ -202,12 +202,21 @@ class TrainsJob(object):
|
|||||||
"""
|
"""
|
||||||
Return True, if job is has executed and is not any more
|
Return True, if job is has executed and is not any more
|
||||||
|
|
||||||
:return: True the task is currently one of these states, stopped / completed / failed.
|
:return: True the task is currently one of these states, stopped / completed / failed / published.
|
||||||
"""
|
"""
|
||||||
return self.task.status in (
|
return self.task.status in (
|
||||||
Task.TaskStatusEnum.stopped, Task.TaskStatusEnum.completed,
|
Task.TaskStatusEnum.stopped, Task.TaskStatusEnum.completed,
|
||||||
Task.TaskStatusEnum.failed, Task.TaskStatusEnum.published)
|
Task.TaskStatusEnum.failed, Task.TaskStatusEnum.published)
|
||||||
|
|
||||||
|
def is_failed(self):
|
||||||
|
# type: () -> bool
|
||||||
|
"""
|
||||||
|
Return True, if job is has executed and failed
|
||||||
|
|
||||||
|
:return: True the task is currently in failed state
|
||||||
|
"""
|
||||||
|
return self.task.status in (Task.TaskStatusEnum.failed, )
|
||||||
|
|
||||||
def is_pending(self):
|
def is_pending(self):
|
||||||
# type: () -> bool
|
# type: () -> bool
|
||||||
"""
|
"""
|
||||||
|
183
trains/task.py
183
trains/task.py
@ -13,7 +13,7 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
from collections import Callable, Sequence as CollectionsSequence
|
from collections import Callable, Sequence as CollectionsSequence
|
||||||
|
|
||||||
from typing import Optional, Union, Mapping, Sequence, Any, Dict, TYPE_CHECKING, Iterable
|
from typing import Optional, Union, Mapping, Sequence, Any, Dict, Iterable, TYPE_CHECKING
|
||||||
|
|
||||||
import psutil
|
import psutil
|
||||||
import six
|
import six
|
||||||
@ -1349,6 +1349,99 @@ class Task(_Task):
|
|||||||
"""
|
"""
|
||||||
self._arguments.copy_from_dict(flatten_dictionary(dictionary))
|
self._arguments.copy_from_dict(flatten_dictionary(dictionary))
|
||||||
|
|
||||||
|
def execute_remotely(self, queue_name=None, clone=False, exit_process=True):
|
||||||
|
# type: (Optional[str], bool, bool) -> ()
|
||||||
|
"""
|
||||||
|
If task is running locally (i.e., not by ``trains-agent``), then clone the Task and enqueue it for remote
|
||||||
|
execution; or, stop the execution of the current Task, reset its state, and enqueue it. If ``exit==True``,
|
||||||
|
*exit* this process.
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
If the task is running remotely (i.e., ``trains-agent`` is executing it), this call is a no-op
|
||||||
|
(i.e., does nothing).
|
||||||
|
|
||||||
|
:param queue_name: The queue name used for enqueueing the task. If ``None``, this call exits the process
|
||||||
|
without enqueuing the task.
|
||||||
|
:param clone: Clone the Task and execute the newly cloned Task?
|
||||||
|
|
||||||
|
The values are:
|
||||||
|
|
||||||
|
- ``True`` - A cloned copy of the Task will be created, and enqueued, instead of this Task.
|
||||||
|
- ``False`` - The Task will be enqueued.
|
||||||
|
|
||||||
|
:param exit_process: The function call will leave the calling process at the end?
|
||||||
|
|
||||||
|
- ``True`` - Exit the process (exit(0)).
|
||||||
|
- ``False`` - Do not exit the process.
|
||||||
|
|
||||||
|
.. warning::
|
||||||
|
|
||||||
|
If ``clone==False``, then ``exit_process`` must be ``True``.
|
||||||
|
"""
|
||||||
|
# do nothing, we are running remotely
|
||||||
|
if running_remotely():
|
||||||
|
return
|
||||||
|
|
||||||
|
if not clone and not exit_process:
|
||||||
|
raise ValueError(
|
||||||
|
"clone==False and exit_process==False is not supported. "
|
||||||
|
"Task enqueuing itself must exit the process afterwards.")
|
||||||
|
|
||||||
|
# make sure we analyze the process
|
||||||
|
if self.status in (Task.TaskStatusEnum.in_progress, ):
|
||||||
|
if clone:
|
||||||
|
# wait for repository detection (5 minutes should be reasonable time to detect all packages)
|
||||||
|
self.flush(wait_for_uploads=True)
|
||||||
|
if self._logger and not self.__is_subprocess():
|
||||||
|
self._wait_for_repo_detection(timeout=300.)
|
||||||
|
else:
|
||||||
|
# close ourselves (it will make sure the repo is updated)
|
||||||
|
self.close()
|
||||||
|
|
||||||
|
# clone / reset Task
|
||||||
|
if clone:
|
||||||
|
task = Task.clone(self)
|
||||||
|
else:
|
||||||
|
task = self
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
# enqueue ourselves
|
||||||
|
if queue_name:
|
||||||
|
Task.enqueue(task, queue_name=queue_name)
|
||||||
|
LoggerRoot.get_base_logger().warning(
|
||||||
|
'Switching to remote execution, output log page {}'.format(task.get_output_log_web_page()))
|
||||||
|
|
||||||
|
# leave this process.
|
||||||
|
if exit_process:
|
||||||
|
LoggerRoot.get_base_logger().warning('Terminating local execution process')
|
||||||
|
exit(0)
|
||||||
|
|
||||||
|
return
|
||||||
|
|
||||||
|
def wait_for_status(
|
||||||
|
self,
|
||||||
|
status=(_Task.TaskStatusEnum.completed, _Task.TaskStatusEnum.stopped, _Task.TaskStatusEnum.closed),
|
||||||
|
raise_on_status=(tasks.TaskStatusEnum.failed,),
|
||||||
|
check_interval_sec=60.,
|
||||||
|
):
|
||||||
|
# type: (Iterable[Task.TaskStatusEnum], Optional[Iterable[Task.TaskStatusEnum]], float) -> ()
|
||||||
|
"""
|
||||||
|
Wait for a task to reach a defined status.
|
||||||
|
|
||||||
|
:param status: Status to wait for. Defaults to ('completed', 'stopped', 'closed', )
|
||||||
|
:param raise_on_status: Raise RuntimeError if the status of the tasks matches one of these values.
|
||||||
|
Defaults to ('failed').
|
||||||
|
:param check_interval_sec: Interval in seconds between two checks. Defaults to 60 seconds.
|
||||||
|
|
||||||
|
:raise: RuntimeError if the status is one of {raise_on_status}.
|
||||||
|
"""
|
||||||
|
stopped_status = list(status) + (list(raise_on_status) if raise_on_status else [])
|
||||||
|
while self.status not in stopped_status:
|
||||||
|
time.sleep(check_interval_sec)
|
||||||
|
|
||||||
|
if raise_on_status and self.status in raise_on_status:
|
||||||
|
raise RuntimeError("Task {} has status: {}.".format(self.task_id, self.status))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def set_credentials(cls, api_host=None, web_host=None, files_host=None, key=None, secret=None, host=None):
|
def set_credentials(cls, api_host=None, web_host=None, files_host=None, key=None, secret=None, host=None):
|
||||||
# type: (Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]) -> ()
|
# type: (Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]) -> ()
|
||||||
@ -2415,91 +2508,3 @@ class Task(_Task):
|
|||||||
|
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def execute_remotely(self, queue_name=None, clone=False, exit_process=True):
|
|
||||||
# type: (Optional[str], bool, bool) -> ()
|
|
||||||
"""
|
|
||||||
If task is running locally (i.e., not by ``trains-agent``), then clone the Task and enqueue it for remote
|
|
||||||
execution; or, stop the execution of the current Task, reset its state, and enqueue it. If ``exit==True``,
|
|
||||||
*exit* this process.
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
If the task is running remotely (i.e., ``trains-agent`` is executing it), this call is a no-op
|
|
||||||
(i.e., does nothing).
|
|
||||||
|
|
||||||
:param queue_name: The queue name used for enqueueing the task. If ``None``, this call exits the process
|
|
||||||
without enqueuing the task.
|
|
||||||
:param clone: Clone the Task and execute the newly cloned Task?
|
|
||||||
|
|
||||||
The values are:
|
|
||||||
|
|
||||||
- ``True`` - A cloned copy of the Task will be created, and enqueued, instead of this Task.
|
|
||||||
- ``False`` - The Task will be enqueued.
|
|
||||||
|
|
||||||
:param exit_process: The function call will leave the calling process at the end?
|
|
||||||
|
|
||||||
- ``True`` - Exit the process (exit(0)).
|
|
||||||
- ``False`` - Do not exit the process.
|
|
||||||
|
|
||||||
.. warning::
|
|
||||||
|
|
||||||
If ``clone==False``, then ``exit_process`` must be ``True``.
|
|
||||||
"""
|
|
||||||
# do nothing, we are running remotely
|
|
||||||
if running_remotely():
|
|
||||||
return
|
|
||||||
|
|
||||||
if not clone and not exit_process:
|
|
||||||
raise ValueError(
|
|
||||||
"clone==False and exit_process==False is not supported. "
|
|
||||||
"Task enqueuing itself must exit the process afterwards.")
|
|
||||||
|
|
||||||
# make sure we analyze the process
|
|
||||||
if self.status in (Task.TaskStatusEnum.in_progress, ):
|
|
||||||
if clone:
|
|
||||||
# wait for repository detection (5 minutes should be reasonable time to detect all packages)
|
|
||||||
self.flush(wait_for_uploads=True)
|
|
||||||
if self._logger and not self.__is_subprocess():
|
|
||||||
self._wait_for_repo_detection(timeout=300.)
|
|
||||||
else:
|
|
||||||
# close ourselves (it will make sure the repo is updated)
|
|
||||||
self.close()
|
|
||||||
|
|
||||||
# clone / reset Task
|
|
||||||
if clone:
|
|
||||||
task = Task.clone(self)
|
|
||||||
else:
|
|
||||||
task = self
|
|
||||||
self.reset()
|
|
||||||
|
|
||||||
# enqueue ourselves
|
|
||||||
if queue_name:
|
|
||||||
Task.enqueue(task, queue_name=queue_name)
|
|
||||||
LoggerRoot.get_base_logger().warning(
|
|
||||||
'Switching to remote execution, output log page {}'.format(task.get_output_log_web_page()))
|
|
||||||
|
|
||||||
# leave this process.
|
|
||||||
if exit_process:
|
|
||||||
LoggerRoot.get_base_logger().warning('Terminating local execution process')
|
|
||||||
exit(0)
|
|
||||||
|
|
||||||
return
|
|
||||||
|
|
||||||
def wait_for_status(self, status=(tasks.TaskStatusEnum.completed),
|
|
||||||
raise_on_status=(tasks.TaskStatusEnum.failed),
|
|
||||||
check_interval_sec=60):
|
|
||||||
# type: (Iterable[tasks.TaskStatusEnum], Iterable[tasks.TaskStatusEnum], int) -> ()
|
|
||||||
"""
|
|
||||||
Wait for a task to reach a defined status.
|
|
||||||
|
|
||||||
:param status: Status to wait for. Defaults to ('completed')
|
|
||||||
:param raise_on_status: Raise RuntimeError if the status of the tasks matches one of these values.
|
|
||||||
Defaults to ('failed').
|
|
||||||
:param check_interval_sec: Interval in seconds between two checks. Defaults to 60 seconds.
|
|
||||||
:raise: RuntimeError if the status is one of {raise_on_status}.
|
|
||||||
"""
|
|
||||||
while self.status not in status or self.status not in raise_on_status:
|
|
||||||
time.sleep(check_interval_sec)
|
|
||||||
|
|
||||||
if self.status in raise_on_status:
|
|
||||||
raise RuntimeError("Task {} has status: {}.".format(self.task_id, self.status))
|
|
||||||
|
Loading…
Reference in New Issue
Block a user