diff --git a/clearml/backend_interface/task/task.py b/clearml/backend_interface/task/task.py
index ab61cdd8..db538c3d 100644
--- a/clearml/backend_interface/task/task.py
+++ b/clearml/backend_interface/task/task.py
@@ -2435,6 +2435,26 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin):
             return None
         return res.response.project.name
 
+    @classmethod
+    def _get_project_names(cls, project_ids):
+        # type: (Sequence[str]) -> Dict[str, str]
+        page = -1
+        page_size = 500
+        all_responses = []
+        res = None
+        while True:
+            page += 1
+            res = cls._send(
+                cls._get_default_session(),
+                projects.GetAllRequest(id=list(project_ids), page=page, page_size=page_size),
+                raise_on_errors=False,
+            )
+            if res and res.response and res.response.projects:
+                all_responses.extend(res.response.projects)
+            else:
+                break
+        return {p.id: p.name for p in all_responses}
+
     def _get_all_events(
         self, max_events=100, batch_size=500, order='asc', event_type=None, unique_selector=itemgetter("url")
     ):
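The new `_get_project_names` helper replaces per-project `_get_project_name` calls with a single paginated `projects.GetAllRequest` query, requesting successive pages until one comes back empty. A minimal standalone sketch of the same loop, where `fetch_page` is a hypothetical stand-in for the `cls._send(...)` call above:

```python
def fetch_all_pages(fetch_page, page_size=500):
    # fetch_page(page, page_size) -> list of results for that page (hypothetical callable)
    results = []
    page = 0
    while True:
        batch = fetch_page(page, page_size)
        if not batch:
            break  # an empty page means the query is drained
        results.extend(batch)
        page += 1
    return results
```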
diff --git a/clearml/cli/data/__main__.py b/clearml/cli/data/__main__.py
index aa68d96f..59c34ba7 100644
--- a/clearml/cli/data/__main__.py
+++ b/clearml/cli/data/__main__.py
@@ -270,11 +270,20 @@ def cli():
                         help='Verbose report all file changes (instead of summary)')
     squash.set_defaults(func=ds_squash)
 
-    search = subparsers.add_parser('search', help='Search datasets in the system (sorted by creation time)')
-    search.add_argument('--ids', type=str, nargs='*', help='Specify list of dataset IDs')
-    search.add_argument('--project', type=str, help='Specify datasets project name')
-    search.add_argument('--name', type=str, help='Specify datasets partial name matching')
-    search.add_argument('--tags', type=str, nargs='*', help='Specify list of dataset user tags')
+    search = subparsers.add_parser("search", help="Search datasets in the system (sorted by creation time)")
+    search.add_argument("--ids", type=str, nargs="*", help="Specify list of dataset IDs")
+    search.add_argument("--project", type=str, help="Specify datasets project name")
+    search.add_argument("--name", type=str, help="Specify datasets partial name matching")
+    search.add_argument("--tags", type=str, nargs="*", help="Specify list of dataset user tags")
+    search.add_argument(
+        "--not-only-completed",
+        action="store_true",
+        default=False,
+        help="If set, return datasets that are still in progress as well",
+    )
+    search.add_argument(
+        "--non-recursive-project-search", action="store_true", default=False, help="Don't search inside subprojects"
+    )
     search.set_defaults(func=ds_search)
 
     verify = subparsers.add_parser('verify', help='Verify local dataset content')
@@ -446,17 +455,28 @@ def ds_list(args):
         dataset_name=args.name or None,
         dataset_version=args.version,
     )
-    print('Listing dataset content')
-    formatting = '{:64} | {:10,} | {:64}'
-    print(formatting.replace(',', '').format('file name', 'size', 'hash'))
-    print('-' * len(formatting.replace(',', '').format('-', '-', '-')))
     filters = args.filter if args.filter else [None]
     file_entries = ds.file_entries_dict
     link_entries = ds.link_entries_dict
-    num_files = 0
-    total_size = 0
+    file_name_max_len, size_max_len, hash_max_len = 64, 10, 64
+    files_cache = []
     for mask in filters:
         files = ds.list_files(dataset_path=mask, dataset_id=ds.id if args.modified else None)
+        files_cache.append(files)
+        for f in files:
+            e = link_entries.get(f)
+            if file_entries.get(f):
+                e = file_entries[f]
+            file_name_max_len = max(file_name_max_len, len(e.relative_path))
+            size_max_len = max(size_max_len, len(str(e.size)))
+            hash_max_len = max(hash_max_len, len(str(e.hash)))
+    print('Listing dataset content')
+    formatting = "{:" + str(file_name_max_len) + "} | {:" + str(size_max_len) + ",} | {:" + str(hash_max_len) + "}"
+    print(formatting.replace(",", "").format("file name", "size", "hash"))
+    print("-" * len(formatting.replace(",", "").format("-", "-", "-")))
+    num_files = 0
+    total_size = 0
+    for files in files_cache:
         num_files += len(files)
         for f in files:
             e = link_entries.get(f)
@@ -480,15 +500,41 @@ def ds_search(args):
     print('Search datasets')
     print_args(args)
     datasets = Dataset.list_datasets(
-        dataset_project=args.project or None, partial_name=args.name or None,
-        tags=args.tags or None, ids=args.ids or None
+        dataset_project=args.project or None,
+        partial_name=args.name or None,
+        tags=args.tags or None,
+        ids=args.ids or None,
+        only_completed=not args.not_only_completed,
+        recursive_project_search=not args.non_recursive_project_search,
     )
-    formatting = '{:16} | {:32} | {:19} | {:19} | {:32}'
-    print(formatting.format('project', 'name', 'tags', 'created', 'id'))
-    print('-' * len(formatting.format('-', '-', '-', '-', '-')))
+    projects_col_len, name_col_len, tags_col_len, created_col_len, id_col_len = 16, 32, 19, 19, 32
     for d in datasets:
-        print(formatting.format(
-            d['project'], d['name'], str(d['tags'] or [])[1:-1], str(d['created']).split('.')[0], d['id']))
+        projects_col_len = max(projects_col_len, len(d["project"]))
+        name_col_len = max(name_col_len, len(d["name"]))
+        tags_col_len = max(tags_col_len, len(str(d["tags"] or [])[1:-1]))
+        created_col_len = max(created_col_len, len(str(d["created"]).split(".")[0]))
+        id_col_len = max(id_col_len, len(d["id"]))
+    formatting = (
+        "{:"
+        + str(projects_col_len)
+        + "} | {:"
+        + str(name_col_len)
+        + "} | {:"
+        + str(tags_col_len)
+        + "} | {:"
+        + str(created_col_len)
+        + "} | {:"
+        + str(id_col_len)
+        + "}"
+    )
+    print(formatting.format("project", "name", "tags", "created", "id"))
+    print("-" * len(formatting.format("-", "-", "-", "-", "-")))
+    for d in datasets:
+        print(
+            formatting.format(
+                d["project"], d["name"], str(d["tags"] or [])[1:-1], str(d["created"]).split(".")[0], d["id"]
+            )
+        )
     return 0
 
 
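Both `ds list` and `ds search` now size their report columns from the data instead of truncating long values to fixed widths: one pass records the maximum length per column, then the `str.format` template is built from those widths. A minimal sketch of the same technique, with made-up rows:

```python
rows = [
    ("examples/datasets", "cifar-10", "7f3b2a"),
    ("research/vision/datasets", "imagenet-21k-winter", "c9d8e1"),
]
headers = ("project", "name", "id")

# measure every column, starting from the header width
widths = [max(len(h), *(len(row[i]) for row in rows)) for i, h in enumerate(headers)]
formatting = " | ".join("{:" + str(w) + "}" for w in widths)

print(formatting.format(*headers))
print("-" * len(formatting.format(*["-"] * len(headers))))
for row in rows:
    print(formatting.format(*row))
```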
diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py
index f81c09b7..b84f91cb 100644
--- a/clearml/datasets/dataset.py
+++ b/clearml/datasets/dataset.py
@@ -4,6 +4,7 @@ import os
 import shutil
 import psutil
 import mimetypes
+import re
 from copy import deepcopy, copy
 from multiprocessing.pool import ThreadPool
 from concurrent.futures import ThreadPoolExecutor
@@ -1736,8 +1737,16 @@ class Dataset(object):
         return squashed_ds
 
     @classmethod
-    def list_datasets(cls, dataset_project=None, partial_name=None, tags=None, ids=None, only_completed=True):
-        # type: (Optional[str], Optional[str], Optional[Sequence[str]], Optional[Sequence[str]], bool) -> List[dict]
+    def list_datasets(
+        cls,
+        dataset_project=None,
+        partial_name=None,
+        tags=None,
+        ids=None,
+        only_completed=True,
+        recursive_project_search=True,
+    ):
+        # type: (Optional[str], Optional[str], Optional[Sequence[str]], Optional[Sequence[str]], bool, bool) -> List[dict]
         """
         Query list of dataset in the system
 
@@ -1746,30 +1755,47 @@ def list_datasets(cls, dataset_project=None, partial_name=None, tags=None, ids=None, only_completed=True):
         :param tags: Specify user tags
         :param ids: List specific dataset based on IDs list
         :param only_completed: If False return dataset that are still in progress (uploading/edited etc.)
+        :param recursive_project_search: If True and the `dataset_project` argument is set,
+            search inside subprojects as well.
+            If False, don't search inside subprojects (except for the special `.datasets` subproject)
 
         :return: List of dictionaries with dataset information
         Example: [{'name': name, 'project': project name, 'id': dataset_id, 'created': date_created},]
         """
+        if dataset_project:
+            if not recursive_project_search:
+                dataset_projects = [
+                    exact_match_regex(dataset_project),
+                    "^{}/\\.datasets/.*".format(re.escape(dataset_project)),
+                ]
+            else:
+                dataset_projects = [exact_match_regex(dataset_project), "^{}/.*".format(re.escape(dataset_project))]
+        else:
+            dataset_projects = None
         # noinspection PyProtectedMember
         datasets = Task._query_tasks(
-            task_ids=ids or None, project_name=dataset_project or None,
+            task_ids=ids or None,
+            project_name=dataset_projects,
             task_name=partial_name,
             system_tags=[cls.__tag],
            type=[str(Task.TaskTypes.data_processing)],
             tags=tags or None,
-            status=['stopped', 'published', 'completed', 'closed'] if only_completed else None,
-            only_fields=['created', 'id', 'name', 'project', 'tags'],
+            status=["stopped", "published", "completed", "closed"] if only_completed else None,
+            only_fields=["created", "id", "name", "project", "tags"],
             search_hidden=True,
-            _allow_extra_fields_=True
+            exact_match_regex_flag=False,
+            _allow_extra_fields_=True,
         )
         project_ids = {d.project for d in datasets}
         # noinspection PyProtectedMember
-        project_id_lookup = {d: Task._get_project_name(d) for d in project_ids}
+        project_id_lookup = Task._get_project_names(project_ids)
         return [
-            {'name': d.name,
-             'created': d.created,
-             'project': project_id_lookup[d.project],
-             'id': d.id,
-             'tags': d.tags}
+            {
+                "name": d.name,
+                "created": d.created,
+                "project": cls._remove_hidden_part_from_dataset_project(project_id_lookup[d.project]),
+                "id": d.id,
+                "tags": d.tags,
+            }
             for d in datasets
         ]
diff --git a/clearml/task.py b/clearml/task.py
index af9053c7..a6ad107d 100644
--- a/clearml/task.py
+++ b/clearml/task.py
@@ -3938,7 +3938,15 @@ class Task(_Task):
         return [cls(private=cls.__create_protection, task_id=task.id, log_to_backend=False) for task in queried_tasks]
 
     @classmethod
-    def _query_tasks(cls, task_ids=None, project_name=None, task_name=None, fetch_only_first_page=False, **kwargs):
+    def _query_tasks(
+        cls,
+        task_ids=None,
+        project_name=None,
+        task_name=None,
+        fetch_only_first_page=False,
+        exact_match_regex_flag=True,
+        **kwargs
+    ):
         res = None
         if not task_ids:
             task_ids = None
@@ -3960,13 +3968,12 @@ class Task(_Task):
             res = cls._send(
                 cls._get_default_session(),
                 projects.GetAllRequest(
-                    name=exact_match_regex(name),
+                    name=exact_match_regex(name) if exact_match_regex_flag else name,
                     **aux_kwargs
                 )
             )
-            project = get_single_result(entity='project', query=name, results=res.response.projects)
-            if project:
-                project_ids.append(project.id)
+            if res.response and res.response.projects:
+                project_ids.extend([project.id for project in res.response.projects])
 
         session = cls._get_default_session()
         system_tags = 'system_tags' if hasattr(tasks.Task, 'system_tags') else 'tags'
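On the API side, `list_datasets` now translates `dataset_project` into a list of project-name regexes instead of a single exact name, which is why `_query_tasks` grew the `exact_match_regex_flag` escape hatch. Assuming `exact_match_regex` anchors and escapes its input, the filters for a project named `MyProject` would come out roughly as:

```python
import re

dataset_project = "MyProject"

# recursive_project_search=True: the project itself plus any subproject
recursive = ["^{}$".format(re.escape(dataset_project)), "^{}/.*".format(re.escape(dataset_project))]

# recursive_project_search=False: the project itself plus only its hidden `.datasets` subproject
non_recursive = ["^{}$".format(re.escape(dataset_project)), "^{}/\\.datasets/.*".format(re.escape(dataset_project))]
```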
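A quick usage sketch against the new signature (the project name here is a made-up value):

```python
from clearml import Dataset

# exact project match only (the hidden `.datasets` subproject is still included),
# also returning datasets that are still being uploaded or edited
datasets = Dataset.list_datasets(
    dataset_project="MyProject",
    only_completed=False,
    recursive_project_search=False,
)
for d in datasets:
    print(d["project"], d["name"], d["id"], d["created"])
```

The equivalent CLI call would be `clearml-data search --project MyProject --not-only-completed --non-recursive-project-search`.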