Change HyperParameterOptimizer spawn_task_project to spawn_project

Add HyperParameterOptimizer parallel coordinates (issue #279)
allegroai 2021-01-12 20:22:05 +02:00
parent 54132e2505
commit 794d97ca7c
2 changed files with 58 additions and 26 deletions


@@ -913,7 +913,7 @@ class HyperParameterOptimizer(object):
             compute_time_limit=None,  # type: Optional[float]
             auto_connect_task=True,  # type: Union[bool, Task]
             always_create_task=False,  # type: bool
-            spawn_task_project=None,  # type: Optional[str]
+            spawn_project=None,  # type: Optional[str]
             save_top_k_tasks_only=None,  # type: Optional[int]
             **optimizer_kwargs  # type: Any
     ):

@@ -962,8 +962,8 @@ class HyperParameterOptimizer(object):
             - ``False`` - Use the :py:meth:`task.Task.current_task` (if exists) to report statistics.
-        :param str spawn_task_project: If project name is specified, create all optimization Jobs (Tasks) in the
-            specified project, instead of the original base_task_id project.
+        :param str spawn_project: If project name is specified, create all optimization Jobs (Tasks) in the
+            specified project instead of the original base_task_id project.
         :param int save_top_k_tasks_only: If specified and above 0, keep only the top_k performing Tasks,
             and archive the rest of the created Tasks. Default: -1 keep everything, nothing will be archived.
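
For callers this is a pure keyword rename: after this commit, passing the old spawn_task_project keyword raises a TypeError. A minimal usage sketch of the renamed argument (the base task id, queue name, and parameter range are placeholders for illustration, not part of the commit):

    from clearml.automation import HyperParameterOptimizer, RandomSearch, UniformParameterRange

    optimizer = HyperParameterOptimizer(
        base_task_id='<base-task-id>',  # placeholder: the template Task to clone
        hyper_parameters=[
            UniformParameterRange('General/lr', min_value=1e-4, max_value=1e-1)],
        objective_metric_title='validation',
        objective_metric_series='accuracy',
        objective_metric_sign='max',
        optimizer_class=RandomSearch,
        execution_queue='default',
        spawn_project='HPO spawn project',  # was: spawn_task_project='HPO spawn project'
    )
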
@@ -1063,7 +1063,7 @@ class HyperParameterOptimizer(object):
         self._experiment_completed_cb = None
         self._save_top_k_tasks_only = max(0, save_top_k_tasks_only or 0)
         self.optimizer.set_job_default_parent(
-            self._task.id if self._task else None, project_name=spawn_task_project or None)
+            self._task.id if self._task else None, project_name=spawn_project or None)
         self.set_time_limit(in_minutes=opts['optimization_time_limit'])

     def get_num_active_experiments(self):
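
Note the spawn_project or None guard on the changed line: it normalizes a falsy value (e.g. an empty string) to None, so only a non-empty project name overrides where the spawned Tasks are created. A two-line illustration:

    print('' or None)                   # None -> jobs stay in the base_task_id project
    print('HPO spawn project' or None)  # 'HPO spawn project' -> jobs are created there
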
@@ -1538,30 +1538,62 @@ class HyperParameterOptimizer(object):
         # update scatter plot
         task_logger.report_scatter2d(
-            title='optimization', series=title,
+            title='Optimization Objective', series=title,
             scatter=pairs, iteration=0, labels=labels,
             mode='markers', xaxis='job #', yaxis='objective')

         # update summary table
         job_ids = list(completed_jobs.keys())
         job_ids_sorted_by_objective = sorted(
             job_ids, key=lambda x: completed_jobs[x][0], reverse=bool(self.objective_metric.sign >= 0))
         # sort the columns except for 'objective', 'iteration'
         columns = list(sorted(set([c for k, v in completed_jobs.items() for c in v[2].keys()])))
         # add the index column (task id) and the first two columns 'objective', 'iteration' then the rest
         table_values = [['task id', 'objective', 'iteration'] + columns]
         table_values += \
             [([job, completed_jobs[job][0], completed_jobs[job][1]] +
               [completed_jobs[job][2].get(c, '') for c in columns]) for job in job_ids_sorted_by_objective]
         task_logger.report_table(
             "summary", "job", 0, table_plot=table_values,
             extra_layout={"title": "objective: {}".format(title)})

+        # Build parallel Coordinates: convert to columns, and reorder accordingly
+        if len(table_values) > 1:
+            table_values_columns = [[row[i] for row in table_values] for i in range(len(table_values[0]))]
+            table_values_columns = \
+                [[table_values_columns[0][0]] + [c[:6] + '...' for c in table_values_columns[0][1:]]] + \
+                table_values_columns[2:-1] + [[title] + table_values_columns[1][1:]]
+            pcc_dims = []
+            for col in table_values_columns:
+                # test if all values are numbers:
+                try:
+                    # try to cast all values to float
+                    values = [float(v) for v in col[1:]]
+                    d = dict(label=col[0], values=values)
+                except (ValueError, TypeError):
+                    values = list(range(len(col[1:])))
+                    ticks = col[1:]
+                    d = dict(label=col[0], values=values, tickvals=values, ticktext=ticks)
+                pcc_dims.append(d)
+            # report parallel coordinates
+            plotly_pcc = dict(
+                data=[dict(
+                    type='parcoords',
+                    line=dict(colorscale='Viridis',
+                              reversescale=bool(self.objective_metric.sign < 0),
+                              color=table_values_columns[-1][1:]),
+                    dimensions=pcc_dims)],
+                layout={})
+            task_logger.report_plotly(
+                title='Parallel Coordinates', series='',
+                iteration=0, figure=plotly_pcc)
+
         # upload summary as artifact
         if force:
             task = self._task or Task.current_task()
             if task:
                 task.upload_artifact(name='summary', artifact_object={'table': table_values})

     def _report_remaining_budget(self, task_logger, counter):
         # noinspection PyBroadException
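
The new block reuses the summary-table rows: it transposes them into columns, truncates each task id to a six-character tick label, and moves the objective column to the end so its values can drive the line color (reversescale keyed to the objective sign keeps "better" at the same end of the Viridis scale for both minimization and maximization). Each column becomes one parcoords dimension: a column whose values all cast to float becomes a numeric axis, and anything else becomes a categorical axis that plots index positions but shows the original strings as tick text. A standalone sketch of the same dimension-building technique using plotly directly (the column data is invented for illustration; only the plotly package is assumed):

    import plotly.graph_objects as go

    # toy columns: one categorical (task id) and two numeric
    columns = [
        ('task id', ['3f9a1c...', '77be02...', '0d41aa...']),
        ('General/lr', [0.001, 0.01, 0.1]),
        ('accuracy', [0.91, 0.94, 0.89]),  # objective, also used for coloring
    ]

    dims = []
    for label, col in columns:
        try:
            values = [float(v) for v in col]  # numeric axis
            dims.append(dict(label=label, values=values))
        except (ValueError, TypeError):
            # categorical axis: plot positions, label ticks with the original strings
            values = list(range(len(col)))
            dims.append(dict(label=label, values=values, tickvals=values, ticktext=col))

    fig = go.Figure(go.Parcoords(
        line=dict(color=columns[-1][1], colorscale='Viridis'),
        dimensions=dims))
    fig.show()
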


@@ -86,8 +86,8 @@ an_optimizer = HyperParameterOptimizer(
     optimizer_class=aSearchStrategy,
     # Select an execution queue to schedule the experiments for execution
     execution_queue=execution_queue,
-    # If specified all Tasks created by the HPO process will be under the `spawned_tasks_project` project
-    spawn_task_project=None,  # 'HPO spawn project',
+    # If specified all Tasks created by the HPO process will be created under the `spawned_project` project
+    spawn_project=None,  # 'HPO spawn project',
     # If specified only the top K performing Tasks will be kept, the others will be automatically archived
     save_top_k_tasks_only=None,  # 5,
     # Optional: Limit the execution time of a single experiment, in minutes.
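
With the optimizer configured, the rest of the example script (unchanged by this commit) drives the run; a minimal sketch of the typical calls, assuming the an_optimizer object from the snippet above:

    an_optimizer.set_report_period(2.0)  # refresh the table/scatter/parcoords plots every 2 minutes
    an_optimizer.start()                 # begin cloning and enqueuing experiments
    an_optimizer.wait()                  # block until the optimization budget is exhausted
    top_tasks = an_optimizer.get_top_experiments(top_k=3)  # best Tasks by the objective
    an_optimizer.stop()                  # make sure any still-running jobs are aborted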