clearml/trains/automation/optimization.py

import hashlib
import json
from copy import copy
from datetime import datetime
from itertools import product
from logging import getLogger
from threading import Thread, Event
from time import time
from typing import Union, Any, Sequence, Optional, Mapping, Callable

from .job import TrainsJob
from .parameters import Parameter
from ..task import Task

logger = getLogger('trains.automation.optimization')


try:
    import pandas as pd
    Task.add_requirements('pandas')
except ImportError:
    pd = None
    logger.warning('Pandas is not installed, summary table reporting will be skipped.')


class Objective(object):
    """
    Objective class to maximize/minimize over all experiments
    Class will sample specific scalar from all experiments, and maximize/minimize
    over single scalar (i.e. title and series combination)

    Used by the SearchStrategy/HyperParameterOptimizer in the strategy search algorithm
    """

    def __init__(self, title, series, order='max', extremum=False):
        # type: (str, str, str, bool) -> ()
        """
        Construct objective object that will return the scalar value for a specific task ID

        :param str title: Scalar graph title to sample from
        :param str series: Scalar series title to sample from
        :param str order: Either "max" or "min" , setting for maximizing/minimizing the objective scalar value
        :param bool extremum: Default False, which will bring the last value reported for a specific Task
            If True, return the global minimum / maximum reported metric value
        """
        self.title = title
        self.series = series
        assert order in ('min', 'max',)
        # normalize value so we always look for the highest objective value
        self.sign = -1 if (isinstance(order, str) and order.lower().strip() == 'min') else +1
        self._metric = None
        self.extremum = extremum

    def get_objective(self, task_id):
        # type: (Union[str, Task, TrainsJob]) -> Optional[float]
        """
        Return a specific task scalar value based on the objective settings (title/series)

        :param str task_id: Task id to retrieve scalar from (or TrainsJob object)
        :return float: scalar value
        """
        # create self._metric
        self._get_last_metrics_encode_field()

        if isinstance(task_id, Task):
            task_id = task_id.id
        elif isinstance(task_id, TrainsJob):
            task_id = task_id.task_id()

        # noinspection PyBroadException, Py
        try:
            # noinspection PyProtectedMember
            task = Task._query_tasks(
                task_ids=[task_id], only_fields=['last_metrics.{}.{}'.format(self._metric[0], self._metric[1])])[0]
        except Exception:
            return None

        metrics = task.last_metrics
        # noinspection PyBroadException
        try:
            values = metrics[self._metric[0]][self._metric[1]]
            if not self.extremum:
                return values['value']

            return values['min_value'] if self.sign < 0 else values['max_value']
        except Exception:
            return None

    def get_current_raw_objective(self, task):
        # type: (Union[TrainsJob, Task]) -> (int, float)
        """
        Return the current raw value (without sign normalization) of the objective

        :param str task: Task or Job to retrieve scalar from (or TrainsJob object)
        :return tuple: (iteration, value) if metric does not exist return None
        """

        if not isinstance(task, Task):
            if hasattr(task, 'task'):
                task = task.task
            if not isinstance(task, Task):
                task = Task.get_task(task_id=str(task))
                if not task:
                    raise ValueError("Task object could not be found")

        # todo: replace with more efficient code
        scalars = task.get_reported_scalars()

        # noinspection PyBroadException
        try:
            return scalars[self.title][self.series]['x'][-1], scalars[self.title][self.series]['y'][-1]
        except Exception:
            return None

    def get_objective_sign(self):
        # type: () -> float
        """
        Return the sign of the objective (i.e. +1 if maximizing, and -1 if minimizing)

        :return float: objective function sign
        """
        return self.sign

    def get_objective_metric(self):
        # type: () -> (str, str)
        """
        Return the metric title, series pair of the objective

        :return (str, str): return (title, series)
        """
        return self.title, self.series

    def get_normalized_objective(self, task_id):
        # type: (Union[str, Task, TrainsJob]) -> Optional[float]
        """
        Return a normalized task scalar value based on the objective settings (title/series)
        I.e. objective is always to maximize the returned value

        :param str task_id: Task id to retrieve scalar from
        :return float: normalized scalar value
        """
        objective = self.get_objective(task_id=task_id)
        if objective is None:
            return None
        # normalize value so we always look for the highest objective value
        return self.sign * objective

    def _get_last_metrics_encode_field(self):
        # type: () -> str
        """
        Return encoded representation of title/series metric

        :return str: string representing the objective title/series
        """
        if not self._metric:
            title = hashlib.md5(str(self.title).encode('utf-8')).hexdigest()
            series = hashlib.md5(str(self.series).encode('utf-8')).hexdigest()
            self._metric = title, series
        return '{}last_metrics.{}.{}.{}'.format(
            '-' if self.sign < 0 else '', self._metric[0], self._metric[1],
            ('min_value' if self.sign < 0 else 'max_value') if self.extremum else 'value')


class Budget(object):
    class Field(object):
        def __init__(self, limit=None):
            # type: (Optional[float]) -> ()
            self.limit = limit
            self.current = {}

        def update(self, uid, value):
            # type: (Union[str, int], float) -> ()
            if value is not None:
                try:
                    self.current[uid] = float(value)
                except (TypeError, ValueError):
                    pass

        @property
        def used(self):
            # type: () -> (Optional[float])
            if self.limit is None or not self.current:
                return None
            return sum(self.current.values())/float(self.limit)

    def __init__(self, jobs_limit, iterations_limit, compute_time_limit):
        # type: (Optional[int], Optional[int], Optional[float]) -> ()
        self.jobs = self.Field(jobs_limit)
        self.iterations = self.Field(iterations_limit)
        self.compute_time = self.Field(compute_time_limit)

    def to_dict(self):
        # type: () -> (Mapping[str, Mapping[str, float]])

        # returned dict is Mapping[Union['jobs', 'iterations', 'compute_time'], Mapping[Union['limit', 'used'], float]]
        current_budget = {}
        jobs = self.jobs.used
        if jobs:
            current_budget['jobs'] = {'limit': self.jobs.limit, 'used': jobs}
        iterations = self.iterations.used
        if iterations:
            current_budget['iterations'] = {'limit': self.iterations.limit, 'used': iterations}
        compute_time = self.compute_time.used
        if compute_time:
            current_budget['compute_time'] = {'limit': self.compute_time.limit, 'used': compute_time}
        return current_budget


class SearchStrategy(object):
    """
    Base Search strategy class, inherit to implement your custom strategy
    """
    _tag = 'optimization'
    _job_class = TrainsJob  # type: TrainsJob

    def __init__(
            self,
            base_task_id,  # type: str
            hyper_parameters,  # type: Sequence[Parameter]
            objective_metric,  # type: Objective
            execution_queue,  # type: str
            num_concurrent_workers,  # type: int
            pool_period_min=2.,  # type: float
            time_limit_per_job=None,  # type: Optional[float]
            max_iteration_per_job=None,  # type: Optional[int]
            total_max_jobs=None,  # type: Optional[int]
            **_  # type: Any
    ):
        # type: (...) -> ()
        """
        Initialize a search strategy optimizer

        :param str base_task_id: Task ID (str)
        :param list hyper_parameters: list of Parameter objects to optimize over
        :param Objective objective_metric: Objective metric to maximize / minimize
        :param str execution_queue: execution queue to use for launching Tasks (experiments).
        :param int num_concurrent_workers: Limit number of concurrent running machines
        :param float pool_period_min: time in minutes between two consecutive pools
        :param float time_limit_per_job: Optional, maximum execution time per single job in minutes,
            when time limit is exceeded job is aborted
        :param int max_iteration_per_job: Optional, maximum iterations (of the objective metric)
            per single job, when exceeded job is aborted.
        :param int total_max_jobs: total maximum jobs for the optimization process. Default None, unlimited
        """
        super(SearchStrategy, self).__init__()
        self._base_task_id = base_task_id
        self._hyper_parameters = hyper_parameters
        self._objective_metric = objective_metric
        self._execution_queue = execution_queue
        self._num_concurrent_workers = num_concurrent_workers
        self.pool_period_minutes = pool_period_min
        self.time_limit_per_job = time_limit_per_job
        self.max_iteration_per_job = max_iteration_per_job
        self.total_max_jobs = total_max_jobs
        self._stop_event = Event()
        self._current_jobs = []
        self._pending_jobs = []
        self._num_jobs = 0
        self._job_parent_id = None
        self._created_jobs_ids = {}
        self._naming_function = None
        self._job_project = {}
        self.budget = Budget(
            jobs_limit=self.total_max_jobs,
            compute_time_limit=self.total_max_jobs * self.time_limit_per_job if
            self.time_limit_per_job and self.total_max_jobs else None,
            iterations_limit=self.total_max_jobs * self.max_iteration_per_job if
            self.max_iteration_per_job and self.total_max_jobs else None
        )
        self._validate_base_task()

    def start(self):
        # type: () -> ()
        """
        Start the Optimizer controller function loop()
        If the calling process is stopped, the controller will stop as well.

        Notice: This function returns only after optimization is completed! or stop() was called.
        """
        counter = 0
        while True:
            logger.debug('optimization loop #{}'.format(counter))
            if not self.process_step():
                break
            if self._stop_event.wait(timeout=self.pool_period_minutes * 60.):
                break
            counter += 1

    def stop(self):
        # type: () -> ()
        """
        Stop the current running optimization loop,
        Called from a different thread than the start()
        """
        self._stop_event.set()

    def process_step(self):
        # type: () -> bool
        """
        Abstract helper function, not a must to implement, default use in start default implementation
        Main optimization loop, called from the daemon thread created by start()
        - Call monitor job on every TrainsJob in jobs:
            - Check the performance or elapsed time, then decide if to kill the jobs
        - Call create_job:
            - Check if we have spare jpb slots
            - If yes: call create a new job based on previous tested experiments

        :return bool: True to continue the optimization and False to immediately stop
        """
        updated_jobs = []
        for job in self._current_jobs:
            if self.monitor_job(job):
                updated_jobs.append(job)

        self._current_jobs = updated_jobs

        pending_jobs = []
        for job in self._pending_jobs:
            if job.is_pending():
                pending_jobs.append(job)
            else:
                self.budget.jobs.update(job.task_id(), 1)

        self._pending_jobs = pending_jobs

        free_workers = self._num_concurrent_workers - len(self._current_jobs)

        # do not create more jobs if we hit the limit
        if self.total_max_jobs and self._num_jobs >= self.total_max_jobs:
            return bool(self._current_jobs)

        # see how many free slots we have and create job
        for i in range(max(0, free_workers)):
            new_job = self.create_job()
            if not new_job:
                break
            self._num_jobs += 1
            new_job.launch(self._execution_queue)
            self._current_jobs.append(new_job)
            self._pending_jobs.append(new_job)

        return bool(self._current_jobs)

    def create_job(self):
        # type: () -> Optional[TrainsJob]
        """
        Abstract helper function, not a must to implement, default use in process_step default implementation
        Create a new job if needed. return the newly created job.
        If no job needs to be created, return None

        :return TrainsJob: newly created TrainsJob object or None if no TrainsJob created
        """
        return None

    def monitor_job(self, job):
        # type: (TrainsJob) -> bool
        """
        Helper function, not a must to implement, default use in process_step default implementation
        Check if the job needs to be aborted or already completed
        if return False, the job was aborted / completed, and should be taken off the current job list

        If there is a budget limitation,
        this call should update self.budget.compute_time.update() / self.budget.iterations.update()

        :param TrainsJob job: a TrainsJob object to monitor
        :return bool: If False, job is no longer relevant
        """
        abort_job = False

        if self.time_limit_per_job:
            elapsed = job.elapsed() / 60.
            if elapsed > 0:
                self.budget.compute_time.update(job.task_id(), elapsed)
                if elapsed > self.time_limit_per_job:
                    abort_job = True

        if self.max_iteration_per_job:
            iterations = self._get_job_iterations(job)
            if iterations > 0:
                self.budget.iterations.update(job.task_id(), iterations)
                if iterations > self.max_iteration_per_job:
                    abort_job = True

        if abort_job:
            job.abort()
            return False

        return not job.is_stopped()

    def get_running_jobs(self):
        # type: () -> Sequence[TrainsJob]
        """
        Return the current running TrainsJobs

        :return list: list of TrainsJob objects
        """
        return self._current_jobs

    def get_created_jobs_ids(self):
        # type: () -> Mapping[str, dict]
        """
        Return a task ids dict created ny this optimizer until now, including completed and running jobs.
        The values of the returned dict are the parameters used in the specific job

        :return dict: dict of task ids (str) as keys, and their parameters dict as value
        """
        return self._created_jobs_ids

    def get_top_experiments(self, top_k):
        # type: (int) -> Sequence[Task]
        """
        Return a list of Tasks of the top performing experiments, based on the controller Objective object

        :param int top_k: Number of Tasks (experiments) to return
        :return list: List of Task objects, ordered by performance, where index 0 is the best performing Task.
        """
        # noinspection PyProtectedMember
        top_tasks = self._get_child_tasks(
            parent_task_id=self._job_parent_id or self._base_task_id,
            order_by=self._objective_metric._get_last_metrics_encode_field(),
            additional_filters={'page_size': int(top_k), 'page': 0})
        return top_tasks

    def get_objective_metric(self):
        # type: () -> (str, str)
        """
        Return the metric title, series pair of the objective

        :return (str, str): return (title, series)
        """
        return self._objective_metric.get_objective_metric()

    def helper_create_job(
            self,
            base_task_id,  # type: str
            parameter_override=None,  # type: Optional[Mapping[str, str]]
            task_overrides=None,  # type: Optional[Mapping[str, str]]
            tags=None,  # type: Optional[Sequence[str]]
            parent=None,  # type: Optional[str]
            **kwargs  # type: Any
    ):
        # type: (...) -> TrainsJob
        """
        Create a Job using the specified arguments, TrainsJob for details

        :return TrainsJob: Returns a newly created Job instance
        """
        if parameter_override:
            param_str = ['{}={}'.format(k, parameter_override[k]) for k in sorted(parameter_override.keys())]
            if self._naming_function:
                name = self._naming_function(self._base_task_name, parameter_override)
            elif self._naming_function is False:
                name = None
            else:
                name = '{}: {}'.format(self._base_task_name, ' '.join(param_str))
            comment = '\n'.join(param_str)
        else:
            name = None
            comment = None
        tags = (tags or []) + [self._tag, 'opt' + (': {}'.format(self._job_parent_id) if self._job_parent_id else '')]
        new_job = self._job_class(
            base_task_id=base_task_id, parameter_override=parameter_override,
            task_overrides=task_overrides, tags=tags, parent=parent or self._job_parent_id,
            name=name, comment=comment, project=self._get_task_project(parent or self._job_parent_id), **kwargs)
        self._created_jobs_ids[new_job.task_id()] = parameter_override
        logger.info('Creating new Task: {}'.format(parameter_override))
        return new_job

    def set_job_class(self, job_class):
        # type: (TrainsJob) -> ()
        """
        Set the class to use for the helper_create_job function

        :param TrainsJob job_class: Job Class type
        """
        self._job_class = job_class

    def set_job_default_parent(self, job_parent_task_id):
        # type: (str) -> ()
        """
        Set the default parent for all Jobs created by the helper_create_job method
        :param str job_parent_task_id: Parent task id
        """
        self._job_parent_id = job_parent_task_id

    def set_job_naming_scheme(self, naming_function):
        # type: (Optional[Callable[[str, dict], str]]) -> ()
        """
        Set the function used to name a newly created job

        :param callable naming_function: naming_functor(base_task_name, argument_dict) -> str
        """
        self._naming_function = naming_function

    def _validate_base_task(self):
        # type: () -> ()
        """
        Check the base task exists and contains the requested objective metric and hyper parameters
        """
        # check if the task exists
        try:
            task = Task.get_task(task_id=self._base_task_id)
            self._base_task_name = task.name
        except ValueError:
            raise ValueError("Could not find base task id {}".format(self._base_task_id))
        # check if the hyper-parameters exist:
        task_parameters = task.get_parameters_as_dict()
        missing_params = [h.name for h in self._hyper_parameters if h.name not in task_parameters]
        if missing_params:
            logger.warning('Could not find requested hyper-parameters {} on base task {}'.format(
                missing_params, self._base_task_id))
        # check if the objective metric exists (i.e. no typos etc)
        if self._objective_metric.get_objective(self._base_task_id) is None:
            logger.warning('Could not find requested metric {} report on base task {}'.format(
                self._objective_metric.get_objective_metric(), self._base_task_id))

    def _get_task_project(self, parent_task_id):
        # type: (str) -> (Optional[str])
        if not parent_task_id:
            return
        if parent_task_id not in self._job_project:
            task = Task.get_task(task_id=parent_task_id)
            self._job_project[parent_task_id] = task.project

        return self._job_project.get(parent_task_id)

    def _get_job_iterations(self, job):
        # type: (Union[TrainsJob, Task]) -> int
        iteration_value = self._objective_metric.get_current_raw_objective(job)
        return iteration_value[0] if iteration_value else -1

    @classmethod
    def _get_child_tasks(
            cls,
            parent_task_id,  # type: str
            status=None,  # type: Optional[Task.TaskStatusEnum]
            order_by=None,  # type: Optional[str]
            additional_filters=None  # type: Optional[dict]
    ):
        # type: (...) -> (Sequence[Task])
        """
        Helper function, return a list of tasks tagged automl with specific status ordered by sort_field

        :param str parent_task_id: Base Task ID (parent)
        :param status: Current status of requested tasks (in_progress, completed etc)
        :param str order_by: Field name to sort results.
            Examples:
                "-last_metrics.title.series.min"
                "last_metrics.title.series.max"
                "last_metrics.title.series.last"
                "execution.parameters.name"
                "updated"
        :param dict additional_filters: Additional task filters
        :return list(Task): List of Task objects
        """
        task_filter = {'parent': parent_task_id,
                       # 'tags': [cls._tag],
                       'system_tags': ['-archived']}
        task_filter.update(additional_filters or {})

        if status:
            task_filter['status'] = status

        if order_by and (order_by.startswith('last_metrics') or order_by.startswith('-last_metrics')):
            parts = order_by.split('.')
            if parts[-1] in ('min', 'max', 'last'):
                title = hashlib.md5(str(parts[1]).encode('utf-8')).hexdigest()
                series = hashlib.md5(str(parts[2]).encode('utf-8')).hexdigest()
                minmax = 'min_value' if 'min' in parts[3] else ('max_value' if 'max' in parts[3] else 'value')
                order_by = '{}last_metrics.'.join(
                    ('-' if order_by and order_by[0] == '-' else '', title, series, minmax))

        if order_by:
            task_filter['order_by'] = [order_by]

        return Task.get_tasks(task_filter=task_filter)


class GridSearch(SearchStrategy):
    """
    Grid search strategy controller.
    Full grid sampling of every hyper-parameter combination
    """

    def __init__(
            self,
            base_task_id,  # type: str
            hyper_parameters,  # type: Sequence[Parameter]
            objective_metric,  # type: Objective
            execution_queue,  # type: str
            num_concurrent_workers,  # type: int
            pool_period_min=2.,  # type: float
            time_limit_per_job=None,  # type: Optional[float]
            max_iteration_per_job=None,  # type: Optional[int]
            total_max_jobs=None,  # type: Optional[int]
            **_  # type: Any
    ):
        # type: (...) -> ()
        """
        Initialize a grid search optimizer

        :param str base_task_id: Task ID (str)
        :param list hyper_parameters: list of Parameter objects to optimize over
        :param Objective objective_metric: Objective metric to maximize / minimize
        :param str execution_queue: execution queue to use for launching Tasks (experiments).
        :param int num_concurrent_workers: Limit number of concurrent running machines
        :param float pool_period_min: time in minutes between two consecutive pools
        :param float time_limit_per_job: Optional, maximum execution time per single job in minutes,
            when time limit is exceeded job is aborted
        :param int max_iteration_per_job: maximum iterations (of the objective metric)
            per single job, when exceeded job is aborted.
        :param int total_max_jobs: total maximum jobs for the optimization process. Default None, unlimited
        """
        super(GridSearch, self).__init__(
            base_task_id=base_task_id, hyper_parameters=hyper_parameters, objective_metric=objective_metric,
            execution_queue=execution_queue, num_concurrent_workers=num_concurrent_workers,
            pool_period_min=pool_period_min, time_limit_per_job=time_limit_per_job,
            max_iteration_per_job=max_iteration_per_job, total_max_jobs=total_max_jobs, **_)
        self._param_iterator = None

    def create_job(self):
        # type: () -> Optional[TrainsJob]
        """
        Create a new job if needed. return the newly created job.
        If no job needs to be created, return None

        :return TrainsJob: newly created TrainsJob object or None if no TrainsJob created
        """
        try:
            parameters = self._next_configuration()
        except StopIteration:
            return None

        return self.helper_create_job(base_task_id=self._base_task_id, parameter_override=parameters)

    def _next_configuration(self):
        # type: () -> Mapping[str, str]
        def param_iterator_fn():
            hyper_params_values = [p.to_list() for p in self._hyper_parameters]
            for state in product(*hyper_params_values):
                yield dict(kv for d in state for kv in d.items())

        if not self._param_iterator:
            self._param_iterator = param_iterator_fn()
        return next(self._param_iterator)


class RandomSearch(SearchStrategy):
    """
    Random search strategy controller.
    Random uniform sampling of hyper-parameters
    """

    # Number of already chosen random samples before assuming we covered the entire hyper-parameter space
    _hp_space_cover_samples = 42

    def __init__(
            self,
            base_task_id,  # type: str
            hyper_parameters,  # type: Sequence[Parameter]
            objective_metric,  # type: Objective
            execution_queue,  # type: str
            num_concurrent_workers,  # type: int
            pool_period_min=2.,  # type: float
            time_limit_per_job=None,  # type: Optional[float]
            max_iteration_per_job=None,  # type: Optional[int]
            total_max_jobs=None,  # type: Optional[int]
            **_  # type: Any
    ):
        # type: (...) -> ()
        """
        Initialize a random search optimizer

        :param str base_task_id: Task ID (str)
        :param list hyper_parameters: list of Parameter objects to optimize over
        :param Objective objective_metric: Objective metric to maximize / minimize
        :param str execution_queue: execution queue to use for launching Tasks (experiments).
        :param int num_concurrent_workers: Limit number of concurrent running machines
        :param float pool_period_min: time in minutes between two consecutive pools
        :param float time_limit_per_job: Optional, maximum execution time per single job in minutes,
            when time limit is exceeded job is aborted
        :param int max_iteration_per_job: maximum iterations (of the objective metric)
            per single job, when exceeded job is aborted.
        :param int total_max_jobs: total maximum jobs for the optimization process. Default None, unlimited
        """
        super(RandomSearch, self).__init__(
            base_task_id=base_task_id, hyper_parameters=hyper_parameters, objective_metric=objective_metric,
            execution_queue=execution_queue, num_concurrent_workers=num_concurrent_workers,
            pool_period_min=pool_period_min, time_limit_per_job=time_limit_per_job,
            max_iteration_per_job=max_iteration_per_job, total_max_jobs=total_max_jobs, **_)
        self._hyper_parameters_collection = set()

    def create_job(self):
        # type: () -> Optional[TrainsJob]
        """
        Create a new job if needed. return the newly created job.
        If no job needs to be created, return None

        :return TrainsJob: newly created TrainsJob object or None if no TrainsJob created
        """
        parameters = None

        # maximum tries to ge a random set that is not already in the collection
        for i in range(self._hp_space_cover_samples):
            parameters = {}
            for p in self._hyper_parameters:
                parameters.update(p.get_value())
            # hash the parameters dictionary
            param_hash = hash(json.dumps(parameters, sort_keys=True))
            # if this is a new set of parameters, use it.
            if param_hash not in self._hyper_parameters_collection:
                self._hyper_parameters_collection.add(param_hash)
                break
            # try again
            parameters = None

        # if we failed to find a random set of parameters, assume we selected all of them
        if not parameters:
            return None

        return self.helper_create_job(base_task_id=self._base_task_id, parameter_override=parameters)


class HyperParameterOptimizer(object):
    """
    Hyper-parameter search controller. Cloning base experiment,
    changing arguments and trying to maximize/minimize the defined objective
    """
    _tag = 'optimization'

    def __init__(
            self,
            base_task_id,  # type: str
            hyper_parameters,  # type: Sequence[Parameter]
            objective_metric_title,  # type: str
            objective_metric_series,  # type: str
            objective_metric_sign='min',  # type: str
            optimizer_class=RandomSearch,  # type: type(SearchStrategy)
            max_number_of_concurrent_tasks=10,  # type: int
            execution_queue='default',  # type: str
            optimization_time_limit=None,  # type: Optional[float]
            auto_connect_task=True,  # type: bool
            always_create_task=False,  # type: bool
            **optimizer_kwargs  # type: Any
    ):
        # type: (...) -> ()
        """
        Create a new hyper-parameter controller. The newly created object will launch and monitor the new experiments.

        :param str base_task_id: Task ID to be used as template experiment to optimize.
        :param list hyper_parameters: list of Parameter objects to optimize over
        :param str objective_metric_title: Objective metric title to maximize / minimize (example: 'validation')
        :param str objective_metric_series: Objective metric series to maximize / minimize (example: 'loss')
        :param str objective_metric_sign: Objective to maximize / minimize.
            Valid options: ['min', 'max', 'min_global', 'max_global']
            'min'/'max': Minimize/Maximize the last reported value for the specified title/series scalar
            'min_global'/'max_global': Minimize/Maximize the min/max value
                of *all* reported values for the specific title/series scalar
        :param class.SearchStrategy optimizer_class: SearchStrategy optimizer to use for the hyper-parameter search
        :param int max_number_of_concurrent_tasks: Maximum number of
            concurrent Tasks (experiment) running at the same time.
        :param str execution_queue: execution queue to use for launching Tasks (experiments).
        :param float optimization_time_limit: Maximum time (minutes) for the entire optimization process.
            Default is None, no time limit,
        :param bool auto_connect_task: If True optimization argument and configuration will be stored on the Task
            All arguments will be under the hyper-parameter section as 'opt/<arg>'
            and the hyper_parameters will stored in the task connect_configuration (see artifacts/hyper-parameter)
        :param bool always_create_task: If True there ts no current Task initialized,
            we create a new task names 'optimization' in the base_task_id project.
            otherwise we use the Task.current_task (if exists) to report statistics
        :param ** optimizer_kwargs: arguments passed directly to the optimizer constructor

            Example:

            .. code-block:: python
                :linenos:
                :caption: Example

                from trains import Task
                from trains.automation import UniformParameterRange, DiscreteParameterRange
                from trains.automation import GridSearch, RandomSearch, HyperParameterOptimizer

                task = Task.init('examples', 'HyperParameterOptimizer example')
                an_optimizer = HyperParameterOptimizer(
                    base_task_id='fa30fa45d95d4927b87c323b5b04dc44',
                    hyper_parameters=[
                        UniformParameterRange('lr', min_value=0.01, max_value=0.3, step_size=0.05),
                        DiscreteParameterRange('network', values=['ResNet18', 'ResNet50', 'ResNet101']),
                    ],
                    objective_metric_title='title',
                    objective_metric_series='series',
                    objective_metric_sign='min',
                    max_number_of_concurrent_tasks=5,
                    optimizer_class=RandomSearch,
                    execution_queue='workers', time_limit_per_job=120, pool_period_min=0.2)

                # This will automatically create and print the optimizer new task id
                # for later use. if a Task was already created, it will use it.
                an_optimizer.set_time_limit(in_minutes=10.)
                an_optimizer.start()
                # we can create a pooling loop if we like
                while not an_optimizer.reached_time_limit():
                    top_exp = an_optimizer.get_top_experiments(top_k=3)
                    print(top_exp)
                # wait until optimization completed or timed-out
                an_optimizer.wait()
                # make sure we stop all jobs
                an_optimizer.stop()


        """

        # create a new Task, if we do not have one already
        self._task = Task.current_task()
        if not self._task and always_create_task:
            base_task = Task.get_task(task_id=self.base_task_id)
            self._task = Task.init(
                project_name=base_task.get_project_name(),
                task_name='Optimizing: {}'.format(base_task.name),
            )  # TODO: add task_type=controller

        opts = dict(
            base_task_id=base_task_id,
            objective_metric_title=objective_metric_title,
            objective_metric_series=objective_metric_series,
            objective_metric_sign=objective_metric_sign,
            max_number_of_concurrent_tasks=max_number_of_concurrent_tasks,
            execution_queue=execution_queue,
            optimization_time_limit=optimization_time_limit,
            optimizer_kwargs=optimizer_kwargs)
        # make sure all the created tasks are our children, as we are creating them
        if self._task:
            self._task.add_tags([self._tag])
            if auto_connect_task:
                optimizer_class, hyper_parameters, opts = self._connect_args(
                    optimizer_class=optimizer_class, hyper_param_configuration=hyper_parameters, **opts)

        self.base_task_id = opts['base_task_id']
        self.hyper_parameters = hyper_parameters
        self.max_number_of_concurrent_tasks = opts['max_number_of_concurrent_tasks']
        self.execution_queue = opts['execution_queue']
        self.objective_metric = Objective(
            title=opts['objective_metric_title'], series=opts['objective_metric_series'],
            order='min' if opts['objective_metric_sign'] in ('min', 'min_global') else 'max',
            extremum=opts['objective_metric_sign'].endswith('_global'))
        # if optimizer_class is an instance, use it as is.
        if type(optimizer_class) != type:
            self.optimizer = optimizer_class
        else:
            self.optimizer = optimizer_class(
                base_task_id=opts['base_task_id'], hyper_parameters=hyper_parameters,
                objective_metric=self.objective_metric, execution_queue=opts['execution_queue'],
                num_concurrent_workers=opts['max_number_of_concurrent_tasks'], **opts.get('optimizer_kwargs', {}))
        self.optimization_timeout = None
        self.optimization_start_time = None
        self._thread = None
        self._stop_event = None
        self._report_period_min = 5.
        self._thread_reporter = None
        self._experiment_completed_cb = None
        if self._task:
            self.optimizer.set_job_default_parent(self._task.id)
        self.set_time_limit(in_minutes=opts['optimization_time_limit'])

    def get_num_active_experiments(self):
        # type: () -> int
        """
        Return the number of current active experiments

        :return int: number of active experiments
        """
        if not self.optimizer:
            return 0
        return len(self.optimizer.get_running_jobs())

    def get_active_experiments(self):
        # type: () -> Sequence[Task]
        """
        Return a list of Tasks of the current active experiments

        :return list: List of Task objects, representing the current active experiments
        """
        if not self.optimizer:
            return []
        return [j.task for j in self.optimizer.get_running_jobs()]

    def start(self, job_complete_callback=None):
        # type: (Optional[Callable[[str, float, int, dict, str], None]]) -> bool
        """
        Start the HyperParameterOptimizer controller.
        If the calling process is stopped, the controller will stop as well.

        :param Callable job_complete_callback: callback function, called when a job is completed.
            def job_complete_callback(
                job_id,                 # type: str
                objective_value,        # type: float
                objective_iteration,    # type: int
                job_parameters,         # type: dict
                top_performance_job_id  # type: str
            ):
                pass
        :return bool: If True the controller started
        """
        if not self.optimizer:
            return False

        if self._thread:
            return True

        self.optimization_start_time = time()
        self._experiment_completed_cb = job_complete_callback
        self._stop_event = Event()
        self._thread = Thread(target=self._daemon)
        self._thread.daemon = True
        self._thread.start()
        self._thread_reporter = Thread(target=self._report_daemon)
        self._thread_reporter.daemon = True
        self._thread_reporter.start()
        return True

    def stop(self, timeout=None):
        # type: (Optional[float]) -> ()
        """
        Stop the HyperParameterOptimizer controller and  optimization thread,

        :param float timeout: Wait timeout in minutes for the optimization thread to exit.
            Default None, do not wait terminate immediately.
        """
        if not self._thread or not self._stop_event or not self.optimizer:
            return

        _thread = self._thread
        self._stop_event.set()
        self.optimizer.stop()

        # wait for optimizer thread
        if timeout is not None:
            _thread.join(timeout=timeout * 60.)

        # stop all running tasks:
        for j in self.optimizer.get_running_jobs():
            j.abort()

        # clear thread
        self._thread = None
        # wait for reporter to flush
        self._thread_reporter.join()

    def is_active(self):
        # type: () -> bool
        """
        Return True if the optimization procedure is still running
        Note, if the daemon thread has not yet started, it will still return True

        :return bool: If False the optimization procedure stopped
        """
        return self._stop_event is None or self._thread is not None

    def is_running(self):
        # type: () -> bool
        """
        Return True if the optimization controller is running

        :return bool: If True if optimization procedure is active
        """
        return self._thread is not None

    def wait(self, timeout=None):
        # type: (Optional[float]) -> bool
        """
        Wait for the optimizer to finish.
        It will not stop the optimizer in any case. Call stop() to terminate the optimizer.

        :param float timeout: Timeout in minutes to wait for the optimization to complete
            If None, wait until we reached the timeout, or optimization completed.
        :return bool: True if optimization finished, False if timeout.
        """
        if not self.is_running():
            return True

        if timeout is not None:
            timeout *= 60.
        else:
            timeout = max(0, self.optimization_timeout - self.optimization_start_time) \
                if self.optimization_timeout else None

        _thread = self._thread

        _thread.join(timeout=timeout)
        if _thread.is_alive():
            return False

        return True

    def set_time_limit(self, in_minutes=None, specific_time=None):
        # type: (Optional[float], Optional[datetime]) -> ()
        """
        Set a time limit for the HyperParameterOptimizer controller,
        i.e. if we reached the time limit, stop the optimization process

        :param float in_minutes: Set maximum processing time in minutes from current time
        :param datetime specific_time: Set specific date/time limit
        """
        if specific_time:
            self.optimization_timeout = specific_time.timestamp()
        else:
            self.optimization_timeout = (in_minutes * 60.) + time() if in_minutes else None

    def get_time_limit(self):
        # type: () -> datetime
        """
        Return the controller optimization time limit.

        :return datetime: Absolute datetime limit of the controller optimization process
        """
        return datetime.fromtimestamp(self.optimization_timeout)

    def elapsed(self):
        # type: () -> float
        """
        Return minutes elapsed from controller stating time stamp

        :return float: minutes from controller start time, negative value means the process has not started yet.
        """
        if self.optimization_start_time is None:
            return -1.0
        return (time() - self.optimization_start_time) / 60.

    def reached_time_limit(self):
        # type: () -> bool
        """
        Return True if we passed the time limit. Function returns immediately, it does not wait for the optimizer.

        :return bool: Return True, if optimizer is running and we passed the time limit, otherwise returns False.
        """
        if self.optimization_start_time is None:
            return False
        if not self.is_running():
            return False

        return time() > self.optimization_timeout

    def get_top_experiments(self, top_k):
        # type: (int) -> Sequence[Task]
        """
        Return a list of Tasks of the top performing experiments, based on the controller Objective object

        :param int top_k: Number of Tasks (experiments) to return
        :return list: List of Task objects, ordered by performance, where index 0 is the best performing Task.
        """
        if not self.optimizer:
            return []
        return self.optimizer.get_top_experiments(top_k=top_k)

    def get_optimizer(self):
        # type: () -> SearchStrategy
        """
        Return the currently used optimizer object

        :return SearchStrategy: Used SearchStrategy object
        """
        return self.optimizer

    def set_default_job_class(self, job_class):
        # type: (TrainsJob) -> ()
        """
        Set the Job class to use when the optimizer spawns new Jobs

        :param TrainsJob job_class: Job Class type
        """
        self.optimizer.set_job_class(job_class)

    def set_report_period(self, report_period_minutes):
        # type: (float) -> ()
        """
        Set reporting period in minutes, for the accumulated objective report
        This report is sent on the Optimizer Task, and collects objective metric from all running jobs.

        :param float report_period_minutes: Reporting period in minutes. Default once every 10 minutes.
        """
        self._report_period_min = float(report_period_minutes)

    def _connect_args(self, optimizer_class=None, hyper_param_configuration=None, **kwargs):
        # type: (SearchStrategy, dict, Any) -> (SearchStrategy, list, dict)
        if not self._task:
            logger.warning('Auto Connect turned on but no Task was found, '
                           'hyper-parameter optimization argument logging disabled')
            return optimizer_class, hyper_param_configuration, kwargs

        configuration_dict = {'parameter_optimization_space': [c.to_dict() for c in hyper_param_configuration]}
        self._task.connect_configuration(configuration_dict)
        # this is the conversion back magic:
        configuration_dict = {'parameter_optimization_space': [
            Parameter.from_dict(c) for c in configuration_dict['parameter_optimization_space']]}

        arguments = {'opt': kwargs}
        if type(optimizer_class) != type:
            logger.warning('Auto Connect optimizer_class disabled, {} is already instantiated'.format(optimizer_class))
            self._task.connect(arguments)
        else:
            arguments['opt']['optimizer_class'] = str(optimizer_class).split('.')[-1][:-2] \
                if not isinstance(optimizer_class, str) else optimizer_class
            self._task.connect(arguments)
            # this is the conversion back magic:
            original_class = optimizer_class
            optimizer_class = arguments['opt'].pop('optimizer_class', None)
            if optimizer_class == 'RandomSearch':
                optimizer_class = RandomSearch
            elif optimizer_class == 'GridSearch':
                optimizer_class = GridSearch
            elif optimizer_class == 'OptimizerBOHB':
                from .hpbandster import OptimizerBOHB
                optimizer_class = OptimizerBOHB
            else:
                logger.warning("Could not resolve optimizer_class {} reverting to original class {}".format(
                    optimizer_class, original_class))
                optimizer_class = original_class

        return optimizer_class, configuration_dict['parameter_optimization_space'], arguments['opt']

    def _daemon(self):
        # type: () -> ()
        """
        implement the main pooling thread, calling loop every self.pool_period_minutes minutes
        """
        self.optimizer.start()
        self._thread = None

    def _report_daemon(self):
        # type: () -> ()
        worker_to_series = {}
        title, series = self.objective_metric.get_objective_metric()
        title = '{}/{}'.format(title, series)
        series = 'machine:'
        counter = 0
        completed_jobs = dict()
        best_experiment = float('-inf'), None

        while self._thread is not None:
            timeout = self.optimization_timeout - time() if self.optimization_timeout else 0.

            if timeout >= 0:
                timeout = min(self._report_period_min * 60., timeout if timeout else self._report_period_min * 60.)
                # make sure that we have the first report fired before we actually go to sleep, wait for 15 sec.
                if counter <= 0:
                    timeout = 15
                print('Progress report #{} completed, sleeping for {} minutes'.format(counter, timeout / 60.))
                if self._stop_event.wait(timeout=timeout):
                    # wait for one last report
                    timeout = -1

            counter += 1

            # get task to report on.
            if self._task or Task.current_task():
                task_logger = (self._task or Task.current_task()).get_logger()

                # do some reporting

                # running objective, per machine
                running_job_ids = set()
                for j in self.optimizer.get_running_jobs():
                    worker = j.worker()
                    running_job_ids.add(j.task_id())
                    if worker not in worker_to_series:
                        worker_to_series[worker] = len(worker_to_series) + 1
                    machine_id = worker_to_series[worker]
                    value = self.objective_metric.get_objective(j)
                    if value is not None:
                        task_logger.report_scalar(
                            title=title, series='{}{}'.format(series, machine_id),
                            iteration=counter, value=value)

                # noinspection PyBroadException
                try:
                    budget = self.optimizer.budget.to_dict()
                except Exception:
                    budget = {}

                # report remaining budget
                for budget_part, value in budget.items():
                    task_logger.report_scalar(
                        title='remaining budget', series='{} %'.format(budget_part),
                        iteration=counter, value=round(100 - value['used'] * 100., ndigits=1))
                if self.optimization_timeout and self.optimization_start_time:
                    task_logger.report_scalar(
                        title='remaining budget', series='time %',
                        iteration=counter,
                        value=round(100 - (100. * (time() - self.optimization_start_time) /
                                           (self.optimization_timeout - self.optimization_start_time)), ndigits=1)
                    )

                # collect a summary of all the jobs and their final objective values
                cur_completed_jobs = set(self.optimizer.get_created_jobs_ids().keys()) - running_job_ids
                if cur_completed_jobs != set(completed_jobs.keys()):
                    pairs = []
                    labels = []
                    created_jobs = copy(self.optimizer.get_created_jobs_ids())
                    for i, (job_id, params) in enumerate(created_jobs.items()):
                        if job_id in completed_jobs:
                            pairs.append((i, completed_jobs[job_id][0]))
                            labels.append(str(completed_jobs[job_id][2])[1:-1])
                        else:
                            value = self.objective_metric.get_objective(job_id)
                            if value is not None:
                                pairs.append((i, value))
                                labels.append(str(params)[1:-1])
                                iteration_value = self.objective_metric.get_current_raw_objective(job_id)
                                completed_jobs[job_id] = (
                                    value, iteration_value[0] if iteration_value else -1, copy(params))
                                # callback new experiment completed
                                if self._experiment_completed_cb:
                                    normalized_value = self.objective_metric.get_normalized_objective(job_id)
                                    if normalized_value is not None and normalized_value > best_experiment[0]:
                                        best_experiment = normalized_value, job_id
                                    c = completed_jobs[job_id]
                                    self._experiment_completed_cb(job_id, c[0], c[1], c[2], best_experiment[1])

                    if pairs:
                        print('Updating job performance summary plot/table')

                        # update scatter plot
                        task_logger.report_scatter2d(
                            title='optimization', series=title,
                            scatter=pairs, iteration=0, labels=labels,
                            mode='markers', xaxis='job #', yaxis='objective')

                        # update summary table
                        if pd:
                            index = list(completed_jobs.keys())
                            table = {'objective': [completed_jobs[i][0] for i in index],
                                     'iteration': [completed_jobs[i][1] for i in index]}
                            columns = set([c for k, v in completed_jobs.items() for c in v[2].keys()])
                            for c in sorted(columns):
                                table.update({c: [completed_jobs[i][2].get(c, '') for i in index]})

                            df = pd.DataFrame(table, index=index)
                            df.sort_values(by='objective', ascending=bool(self.objective_metric.sign < 0), inplace=True)
                            df.index.name = 'task id'
                            task_logger.report_table("summary", "job", 0, table_plot=df)

            # if we should leave, stop everything now.
            if timeout < 0:
                # we should leave
                self.stop()
                return