Add include_archived to optionally exclude archived datasets from Dataset.list_datasets() (#1069)

Addresses #1067

* feat(datasets.py): Added the option `include_archived` to the `Dataset.list_datasets()` class method which, if False, excludes archived datasets from the list. Defaults to True to preserve the method's current behavior.

While it's possible to filter datasets using the `get()` class method, it wasn't possible to do the same with `list_datasets()`. This feature seems useful to include, and it's a simple, non-breaking change.
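A minimal usage sketch of the new flag (the project name below is a placeholder, not something from this PR):

```python
from clearml import Dataset

# List datasets in a project, skipping any that were archived in the UI.
datasets = Dataset.list_datasets(
    dataset_project="examples",   # placeholder project name
    include_archived=False,       # new flag; defaults to True (the previous behavior)
)
for ds in datasets:
    # each entry is a dict: {'name': ..., 'project': ..., 'id': ..., 'created': ...}
    print(ds["name"], ds["id"])
```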

* style(datasets.py): 🚨 Updated formatting (whitespace, spaces after commas, line breaks) in accordance with the flake8 formatting checks for dataset.py.

As suggested in the contribution guidelines, flake8 was used to validate the formatting. Several pre-existing errors from earlier commits were found and fixed.
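The same check can also be run from Python via flake8's public API; a hedged sketch (the line-length option and the file path are assumptions, not taken from this PR):

```python
from flake8.api import legacy as flake8

# max_line_length=120 is an assumption; clearml's real flake8 options live in its repo config.
style_guide = flake8.get_style_guide(max_line_length=120)
# Path assumed from the clearml repo layout.
report = style_guide.check_files(["clearml/datasets/dataset.py"])
print(report.get_statistics("E"), report.get_statistics("W"))  # error/warning summaries
```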
natephysics 2023-07-10 19:59:49 +02:00 committed by GitHub
parent 5aa80267d6
commit 29227cade1


@@ -122,7 +122,7 @@ class Dataset(object):
__hyperparams_section = "Datasets"
__datasets_runtime_prop = "datasets"
__orig_datasets_runtime_prop_prefix = "orig_datasets"
__preview_media_max_file_size = deferred_config("dataset.preview.media.max_file_size", 5 * 1024 * 1024, transform=int)
__preview_tabular_table_count = deferred_config("dataset.preview.tabular.table_count", 10, transform=int)
__preview_tabular_row_count = deferred_config("dataset.preview.tabular.row_count", 10, transform=int)
__preview_media_image_count = deferred_config("dataset.preview.media.image_count", 10, transform=int)
@@ -1877,6 +1877,7 @@ class Dataset(object):
ids=None, # type: Optional[Sequence[str]]
only_completed=True, # type: bool
recursive_project_search=True, # type: bool
include_archived=True, # type: bool
):
# type: (...) -> List[dict]
"""
@@ -1890,9 +1891,16 @@ class Dataset(object):
:param recursive_project_search: If True and the `dataset_project` argument is set,
search inside subprojects as well.
If False, don't search inside subprojects (except for the special `.datasets` subproject)
:param include_archived: If True, include archived datasets as well.
:return: List of dictionaries with dataset information
Example: [{'name': name, 'project': project name, 'id': dataset_id, 'created': date_created},]
"""
# if include_archived is False, we need to add the system tag __$not:archived to filter out archived datasets
if not include_archived:
system_tags = ["__$all", cls.__tag, "__$not", "archived"]
else:
system_tags = [cls.__tag]
if dataset_project:
if not recursive_project_search:
dataset_projects = [
@@ -1903,12 +1911,13 @@ class Dataset(object):
dataset_projects = [exact_match_regex(dataset_project), "^{}/.*".format(re.escape(dataset_project))]
else:
dataset_projects = None
# noinspection PyProtectedMember
datasets = Task._query_tasks(
task_ids=ids or None,
project_name=dataset_projects,
task_name=partial_name,
system_tags=[cls.__tag],
system_tags=system_tags,
type=[str(Task.TaskTypes.data_processing)],
tags=tags or None,
status=["stopped", "published", "completed", "closed"] if only_completed else None,
@@ -2278,14 +2287,15 @@ class Dataset(object):
ds = Dataset.get(dependency)
links.update(ds._dataset_link_entries)
links.update(self._dataset_link_entries)
def _download_link(link,target_path):
def _download_link(link, target_path):
if os.path.exists(target_path):
LoggerRoot.get_base_logger().info(
"{} already exists. Skipping downloading {}".format(
target_path, link
)
)
return
ok = False
error = None
try:
@@ -2310,16 +2320,12 @@ class Dataset(object):
if not max_workers:
for relative_path, link in links.items():
target_path = os.path.join(target_folder, relative_path)
_download_link(link,target_path)
_download_link(link, target_path)
else:
with ThreadPoolExecutor(max_workers=max_workers) as pool:
for relative_path, link in links.items():
target_path = os.path.join(target_folder, relative_path)
pool.submit(_download_link,link,target_path)
pool.submit(_download_link, link, target_path)
def _extract_dataset_archive(
self,
@@ -2720,7 +2726,7 @@ class Dataset(object):
dataset._task.mark_completed()
return id
def _log_dataset_page(self):
if bool(Session.check_min_api_server_version(self.__min_api_version)):
self._task.get_logger().report_text(
@@ -2732,6 +2738,7 @@ class Dataset(object):
)
)
)
def _build_dependency_chunk_lookup(self):
# type: () -> Dict[str, int]
"""