mirror of https://github.com/clearml/clearml (synced 2025-05-03 12:31:00 +00:00)

Add Pipeline controller caching, improve pipeline plot reporting

commit 9d108d855f, parent a9f52a468c
@@ -6,7 +6,7 @@ from threading import Thread, Event, RLock
 from time import time
 
 from attr import attrib, attrs
-from typing import Sequence, Optional, Mapping, Callable, Any, Union
+from typing import Sequence, Optional, Mapping, Callable, Any, Union, List
 
 from ..backend_interface.util import get_or_create_project
 from ..debugging.log import LoggerRoot
@@ -99,6 +99,7 @@ class PipelineController(object):
         self._task = auto_connect_task if isinstance(auto_connect_task, Task) else Task.current_task()
         self._step_ref_pattern = re.compile(self._step_pattern)
         self._reporting_lock = RLock()
+        self._pipeline_task_status_failed = None
         if not self._task and always_create_task:
             self._task = Task.init(
                 project_name=pipeline_project or 'Pipelines',
@@ -381,7 +382,11 @@ class PipelineController(object):
         :param float timeout: Wait timeout for the optimization thread to exit (minutes).
             The default is ``None``, indicating do not wait, terminate immediately.
         """
-        pass
+        self.wait(timeout=timeout)
+        if self._task and self._pipeline_task_status_failed:
+            print('Setting pipeline controller Task as failed (due to failed steps) !')
+            self._task.close()
+            self._task.mark_failed(status_reason='Pipeline step failed', force=True)
 
     def wait(self, timeout=None):
         # type: (Optional[float]) -> bool
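`stop()` previously did nothing (`pass`); it now waits for the controller thread and, when any step failed, closes the controller Task and marks it failed. A minimal usage sketch, assuming the import path and constructor/step arguments of this era of the API (illustrative, not taken from this diff):

    from clearml.automation.controller import PipelineController

    pipe = PipelineController(default_execution_queue='default', pool_frequency=0.5)
    pipe.add_step(name='stage_data', base_task_project='examples', base_task_name='data prep')
    pipe.start()
    pipe.stop()   # now blocks via wait(), then, if any step failed,
                  # closes the controller Task and marks it failed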
@@ -418,7 +423,16 @@ class PipelineController(object):
 
         :return: A boolean indicating whether the pipeline controller is active (still running) or stopped.
         """
-        return self._thread is not None
+        return self._thread is not None and self._thread.is_alive()
+
+    def is_successful(self):
+        # type: () -> bool
+        """
+        Return True if the pipeline controller is fully executed and none of the steps / Tasks failed
+
+        :return: A boolean indicating whether all steps did not fail
+        """
+        return self._thread and not self.is_running() and not self._pipeline_task_status_failed
 
     def elapsed(self):
         # type: () -> float
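The new `is_successful()` complements the tightened `is_running()` (which now also checks `is_alive()` rather than only that a thread object exists). A hedged polling sketch, continuing the `pipe` object from the sketch above:

    import time

    pipe.start()
    while pipe.is_running():        # thread exists and is still alive
        time.sleep(30)
    print('pipeline succeeded' if pipe.is_successful() else 'a step failed')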
@@ -469,6 +483,14 @@ class PipelineController(object):
         """
         return {k: n for k, n in self._nodes.items() if k in self._running_nodes}
 
+    def update_execution_plot(self):
+        # type: () -> ()
+        """
+        Update sankey diagram of the current pipeline
+        """
+        with self._reporting_lock:
+            self._update_execution_plot()
+
     def _serialize_pipeline_task(self):
         # type: () -> (dict, dict)
         """
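This public `update_execution_plot()` is moved up from its previous location (see the removal hunk below) and serializes plot reporting through `self._reporting_lock`, the `RLock` created in the constructor. Because the lock is reentrant, the daemon thread and internal helpers can call back into it without deadlocking. A standalone illustration of that property (the function names are illustrative):

    from threading import RLock

    lock = RLock()

    def update():        # public entry point, takes the lock
        with lock:
            _update()

    def _update():       # may re-acquire the same lock on the same thread
        with lock:
            print('reporting execution plot')

    update()  # no deadlock: RLock allows nested acquisition by one thread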
@@ -645,14 +667,6 @@ class PipelineController(object):
 
         return True
 
-    def update_execution_plot(self):
-        # type: () -> ()
-        """
-        Update sankey diagram of the current pipeline
-        """
-        with self._reporting_lock:
-            self._update_execution_plot()
-
     def _update_execution_plot(self):
         # type: () -> ()
         """
@@ -719,15 +733,7 @@ class PipelineController(object):
                 orientation='h'
             )
 
-        task_link_template = self._task.get_output_log_web_page()\
-            .replace('/{}/'.format(self._task.project), '/{project}/')\
-            .replace('/{}/'.format(self._task.id), '/{task}/')
-
-        table_values = [["Pipeline Step", "Task ID", "Status", "Parameters"]]
-        table_values += [
-            [v, self.__create_task_link(self._nodes[v], task_link_template),
-             self.__get_node_status(self._nodes[v]), str(n)]
-            for v, n in zip(visited, node_params)]
+        table_values = self._build_table_report(node_params, visited)
 
         # hack, show single node sankey
         if single_nodes:
@@ -766,6 +772,42 @@ class PipelineController(object):
             self._task.get_logger().report_table(
                 title='Pipeline Details', series='Execution Details', iteration=0, table_plot=table_values)
 
+    def _build_table_report(self, node_params, visited):
+        # type: (List, List) -> List[List]
+        """
+        Create the detailed table report on all the jobs in the pipeline
+
+        :param node_params: list of node parameters
+        :param visited: list of nodes
+        :return: Table as a List of Lists of strings (cells)
+        """
+        task_link_template = self._task.get_output_log_web_page() \
+            .replace('/{}/'.format(self._task.project), '/{project}/') \
+            .replace('/{}/'.format(self._task.id), '/{task}/')
+
+        table_values = [["Pipeline Step", "Task ID", "Task Name", "Status", "Parameters"]]
+
+        for name, param in zip(visited, node_params):
+            param_str = str(param)
+            if len(param_str) > 3:
+                # remove the {} from the string
+                param_str = param_str[1:-1]
+
+            step_name = name
+            if self._nodes[name].base_task_id:
+                step_name += '\n[<a href="{}"> {} </a>]'.format(
+                    task_link_template.format(project='*', task=self._nodes[name].base_task_id), 'base task')
+
+            table_values.append(
+                [step_name,
+                 self.__create_task_link(self._nodes[name], task_link_template),
+                 self._nodes[name].job.task.name if self._nodes[name].job else '',
+                 self.__get_node_status(self._nodes[name]),
+                 param_str]
+            )
+
+        return table_values
+
     @staticmethod
     def _get_node_color(node):
         # type (self.Mode) -> str
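For a sense of what `_build_table_report` emits: one header row plus one row per visited step, with HTML links embedded in the cells. A hypothetical return value for a two-step pipeline (step names, task IDs, and URLs are made up; note the "Task Name" column added relative to the old inline version removed above):

    [['Pipeline Step', 'Task ID', 'Task Name', 'Status', 'Parameters'],
     ['stage_data\n[<a href="..."> base task </a>]', '<a href="...">aabb1122</a>',
      'data prep', 'completed', "'Args/dataset': 'iris'"],
     ['stage_train', '<a href="...">ccdd3344</a>',
      'train model', 'running', "'Args/epochs': 10"]]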
@@ -788,7 +830,7 @@ class PipelineController(object):
             return "royalblue"  # aborted job
         elif node.job:
             if node.job.is_pending():
-                return "mediumseagreen"  # pending in queue
+                return "#bdf5bd"  # lightgreen, pending in queue
             else:
                 return "green"  # running job
         elif node.skip_job:
@@ -810,10 +852,11 @@ class PipelineController(object):
         :return:
         """
         pooling_counter = 0
+        launched_nodes = set()
+        last_plot_report = time()
         while self._stop_event:
             # stop request
-            if pooling_counter and self._stop_event.wait(self._pool_frequency):
+            if self._stop_event.wait(self._pool_frequency if pooling_counter else 0.01):
                 break
 
             pooling_counter += 1
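Two behavioral notes on this hunk. First, `launched_nodes` and `last_plot_report` back the new plot-refresh logic further down. Second, the reworked condition always consults the stop event: the first iteration now waits ~0.01 s instead of skipping the wait entirely, so a stop requested before the loop starts still breaks out immediately. A standalone sketch of that wait pattern with `threading.Event` (names mirror the diff; values are illustrative):

    from threading import Event

    stop_event = Event()
    pool_frequency = 30.0      # seconds between polls, illustrative
    pooling_counter = 0

    stop_event.set()           # stop requested before the loop spins up

    while True:
        # first pass: near-zero wait; later passes: full pool frequency
        if stop_event.wait(pool_frequency if pooling_counter else 0.01):
            print('stop request honored on the first pass')
            break
        pooling_counter += 1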
@@ -825,6 +868,7 @@ class PipelineController(object):
             # check the state of all current jobs
             # if no job ended, continue
             completed_jobs = []
+            force_execution_plot_update = False
             for j in self._running_nodes:
                 node = self._nodes[j]
                 if not node.job:
@@ -832,18 +876,29 @@ class PipelineController(object):
                 if node.job.is_stopped():
                     completed_jobs.append(j)
                     node.executed = node.job.task_id() if not node.job.is_failed() else False
+                    if j in launched_nodes:
+                        launched_nodes.remove(j)
                 elif node.timeout:
                     started = node.job.task.data.started
                     if (datetime.now().astimezone(started.tzinfo) - started).total_seconds() > node.timeout:
                         node.job.abort()
                         completed_jobs.append(j)
                         node.executed = node.job.task_id()
+                elif j in launched_nodes and node.job.is_running():
+                    # make sure to update the execution graph when the job starts running
+                    # (otherwise it would still be marked queued)
+                    launched_nodes.remove(j)
+                    force_execution_plot_update = True
 
             # update running jobs
             self._running_nodes = [j for j in self._running_nodes if j not in completed_jobs]
 
             # nothing changed, we can sleep
             if not completed_jobs and self._running_nodes:
+                # force updating the pipeline state (plot) at least every 5 min.
+                if force_execution_plot_update or time() - last_plot_report > 5. * 60:
+                    last_plot_report = time()
+                    self.update_execution_plot()
                 continue
 
             # callback on completed jobs
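The net effect: the execution plot is refreshed immediately when a launched job transitions from queued to running (`force_execution_plot_update`), and otherwise at most once every five minutes while jobs are still in flight. A minimal sketch of that throttle pattern (the 5-minute constant comes from the diff; the function is illustrative):

    from time import time

    last_plot_report = time()

    def maybe_report(force=False):
        global last_plot_report
        # refresh on a forced state change, or at most every 5 minutes
        if force or time() - last_plot_report > 5. * 60:
            last_plot_report = time()
            print('updating execution plot')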
@@ -873,6 +928,10 @@ class PipelineController(object):
                     print('Launching step: {}'.format(name))
                     print('Parameters:\n{}'.format(self._nodes[name].job.task_parameter_override))
                     self._running_nodes.append(name)
+                    launched_nodes.add(name)
+                    # if the node is cached, do not wait for the stop event; run the loop again
+                    if self._nodes[name].executed:
+                        pooling_counter = 0
                 else:
                     getLogger('clearml.automation.controller').warning(
                         'Skipping launching step \'{}\': {}'.format(name, self._nodes[name]))
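This is the caching hook from the commit title: when a newly launched step resolves to an already-executed (cached) task, `self._nodes[name].executed` is set, and resetting `pooling_counter` to 0 makes the next `_stop_event.wait` nearly instantaneous (see the wait sketch above). Cached steps therefore propagate through the pipeline DAG without paying a full pool-frequency sleep each.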
@@ -888,19 +947,15 @@ class PipelineController(object):
                 break
 
         # stop all currently running jobs:
-        failing_pipeline = False
         for node in self._nodes.values():
             if node.executed is False:
-                failing_pipeline = True
+                self._pipeline_task_status_failed = True
             if node.job and node.executed and not node.job.is_stopped():
                 node.job.abort()
             elif not node.job and not node.executed:
                 # mark Node as skipped if it has no Job object and it is not executed
                 node.skip_job = True
 
-        if failing_pipeline and self._task:
-            self._task.mark_failed(status_reason='Pipeline step failed')
-
         # visualize pipeline state (plot)
         self.update_execution_plot()
 
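Failure bookkeeping moves out of the daemon: instead of calling `mark_failed` inline while the controller Task may still be running, the daemon now only records `self._pipeline_task_status_failed`, and `stop()` closes the Task and marks it failed afterwards, as shown in the third hunk above.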
@@ -911,6 +966,36 @@ class PipelineController(object):
             except Exception:
                 pass
 
+    def _parse_step_ref(self, value):
+        # type: (Any) -> Optional[str]
+        """
+        Return the step reference. For example "${step1.parameters.Args/param}"
+
+        :param value: string
+        :return:
+        """
+        # look for all the step references
+        pattern = self._step_ref_pattern
+        updated_value = value
+        if isinstance(value, str):
+            for g in pattern.findall(value):
+                # update with the actual value
+                new_val = self.__parse_step_reference(g)
+                updated_value = updated_value.replace(g, new_val, 1)
+        return updated_value
+
+    def _parse_task_overrides(self, task_overrides):
+        # type: (dict) -> dict
+        """
+        Parse the task overrides dict, resolving step references in its values
+
+        :param task_overrides: dict
+        :return:
+        """
+        updated_overrides = {}
+        for k, v in task_overrides.items():
+            updated_overrides[k] = self._parse_step_ref(v)
+
+        return updated_overrides
+
     def __verify_step_reference(self, node, step_ref_string):
         # type: (PipelineController.Node, str) -> bool
         """
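`_parse_step_ref` rewrites `${step.<section>.<name>}`-style references inside parameter values (these two methods are moved up from their previous location; see the removal hunk below). A standalone sketch with an assumed pattern and resolver; the real `self._step_pattern` and `__parse_step_reference` are defined elsewhere in the class:

    import re

    # assumed shape of the step-reference pattern, e.g. "${step1.parameters.Args/param}"
    step_ref_pattern = re.compile(r'\$\{[^}]+\}')

    def parse_step_ref(value, resolve):
        updated_value = value
        if isinstance(value, str):
            for g in step_ref_pattern.findall(value):
                # replace each reference with its resolved value, one at a time
                updated_value = updated_value.replace(g, resolve(g), 1)
        return updated_value

    # hypothetical resolver: pretend step1's Args/param resolved to '0.1'
    print(parse_step_ref('lr=${step1.parameters.Args/param}', lambda ref: '0.1'))
    # -> lr=0.1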
@@ -1047,36 +1132,6 @@ class PipelineController(object):
 
         return None
 
-    def _parse_step_ref(self, value):
-        # type: (Any) -> Optional[str]
-        """
-        Return the step reference. For example "${step1.parameters.Args/param}"
-
-        :param value: string
-        :return:
-        """
-        # look for all the step references
-        pattern = self._step_ref_pattern
-        updated_value = value
-        if isinstance(value, str):
-            for g in pattern.findall(value):
-                # update with actual value
-                new_val = self.__parse_step_reference(g)
-                updated_value = updated_value.replace(g, new_val, 1)
-        return updated_value
-
-    def _parse_task_overrides(self, task_overrides):
-        # type: (dict) -> dict
-        """
-        Return the step reference. For example "${step1.parameters.Args/param}"
-
-        :param task_overrides: string
-        :return:
-        """
-        updated_overrides = {}
-        for k, v in task_overrides.items():
-            updated_overrides[k] = self._parse_step_ref(v)
-
-        return updated_overrides
-
     @classmethod
     def __get_node_status(cls, a_node):
         # type: (PipelineController.Node) -> str