Repository: https://github.com/clearml/clearml
Fix Dataset.list_datasets() returning an empty list
Commit: c0bbab75b8 (parent: fb644fe9ec)
clearml/backend_interface/task/task.py
@@ -2435,6 +2435,26 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin):
             return None
         return res.response.project.name
 
+    @classmethod
+    def _get_project_names(cls, project_ids):
+        # type: (Sequence[str]) -> Dict[str, str]
+        page = -1
+        page_size = 500
+        all_responses = []
+        res = None
+        while True:
+            page += 1
+            res = cls._send(
+                cls._get_default_session(),
+                projects.GetAllRequest(id=list(project_ids), page=page, page_size=page_size),
+                raise_on_errors=False,
+            )
+            if res and res.response and res.response.projects:
+                all_responses.extend(res.response.projects)
+            else:
+                break
+        return {p.id: p.name for p in all_responses}
+
     def _get_all_events(
         self, max_events=100, batch_size=500, order='asc', event_type=None, unique_selector=itemgetter("url")
     ):
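
The new _get_project_names helper replaces per-id lookups with one paged query, looping until the server returns an empty page. A self-contained sketch of the same page-until-empty pattern (the fetch_page callable is a stand-in, not a ClearML API):

    def fetch_all_pages(fetch_page, page_size=500):
        # Accumulate results page by page until a fetch comes back empty,
        # mirroring the while-loop in _get_project_names above.
        page, results = 0, []
        while True:
            batch = fetch_page(page=page, page_size=page_size)
            if not batch:
                break
            results.extend(batch)
            page += 1
        return results

    # Toy usage: pages over a local list instead of the backend.
    data = list(range(1234))
    fetch = lambda page, page_size: data[page * page_size:(page + 1) * page_size]
    assert fetch_all_pages(fetch) == data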
clearml/cli/data/__main__.py
@@ -270,11 +270,20 @@ def cli():
                         help='Verbose report all file changes (instead of summary)')
     squash.set_defaults(func=ds_squash)
 
-    search = subparsers.add_parser('search', help='Search datasets in the system (sorted by creation time)')
-    search.add_argument('--ids', type=str, nargs='*', help='Specify list of dataset IDs')
-    search.add_argument('--project', type=str, help='Specify datasets project name')
-    search.add_argument('--name', type=str, help='Specify datasets partial name matching')
-    search.add_argument('--tags', type=str, nargs='*', help='Specify list of dataset user tags')
+    search = subparsers.add_parser("search", help="Search datasets in the system (sorted by creation time)")
+    search.add_argument("--ids", type=str, nargs="*", help="Specify list of dataset IDs")
+    search.add_argument("--project", type=str, help="Specify datasets project name")
+    search.add_argument("--name", type=str, help="Specify datasets partial name matching")
+    search.add_argument("--tags", type=str, nargs="*", help="Specify list of dataset user tags")
+    search.add_argument(
+        "--not-only-completed",
+        action="store_true",
+        default=False,
+        help="If set, return datasets that are still in progress as well",
+    )
+    search.add_argument(
+        "--non-recursive-project-search", action="store_true", default=False, help="Don't search inside subprojects"
+    )
     search.set_defaults(func=ds_search)
 
     verify = subparsers.add_parser('verify', help='Verify local dataset content')
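
Both new flags are store_true negations, so their argparse dest names follow the usual hyphen-to-underscore conversion, and ds_search (later in this file) inverts them before calling the API. A standalone sketch of just the flag behavior:

    import argparse

    # Minimal reproduction of the two new flags, not the full clearml-data parser.
    p = argparse.ArgumentParser()
    p.add_argument("--not-only-completed", action="store_true", default=False)
    p.add_argument("--non-recursive-project-search", action="store_true", default=False)

    args = p.parse_args(["--not-only-completed"])
    # ds_search passes the inverted values on:
    #   only_completed=not args.not_only_completed                      -> False here
    #   recursive_project_search=not args.non_recursive_project_search  -> True here
    assert args.not_only_completed and not args.non_recursive_project_search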
@@ -446,17 +455,28 @@ def ds_list(args):
         dataset_name=args.name or None,
         dataset_version=args.version,
     )
-    print('Listing dataset content')
-    formatting = '{:64} | {:10,} | {:64}'
-    print(formatting.replace(',', '').format('file name', 'size', 'hash'))
-    print('-' * len(formatting.replace(',', '').format('-', '-', '-')))
     filters = args.filter if args.filter else [None]
     file_entries = ds.file_entries_dict
     link_entries = ds.link_entries_dict
-    num_files = 0
-    total_size = 0
+    file_name_max_len, size_max_len, hash_max_len = 64, 10, 64
+    files_cache = []
     for mask in filters:
         files = ds.list_files(dataset_path=mask, dataset_id=ds.id if args.modified else None)
+        files_cache.append(files)
+        for f in files:
+            e = link_entries.get(f)
+            if file_entries.get(f):
+                e = file_entries[f]
+            file_name_max_len = max(file_name_max_len, len(e.relative_path))
+            size_max_len = max(size_max_len, len(str(e.size)))
+            hash_max_len = max(hash_max_len, len(str(e.hash)))
+    print('Listing dataset content')
+    formatting = "{:" + str(file_name_max_len) + "} | {:" + str(size_max_len) + ",} | {:" + str(hash_max_len) + "}"
+    print(formatting.replace(",", "").format("file name", "size", "hash"))
+    print("-" * len(formatting.replace(",", "").format("-", "-", "-")))
+    num_files = 0
+    total_size = 0
+    for files in files_cache:
         num_files += len(files)
         for f in files:
             e = link_entries.get(f)
@@ -480,15 +500,41 @@ def ds_search(args):
     print('Search datasets')
     print_args(args)
     datasets = Dataset.list_datasets(
-        dataset_project=args.project or None, partial_name=args.name or None,
-        tags=args.tags or None, ids=args.ids or None
+        dataset_project=args.project or None,
+        partial_name=args.name or None,
+        tags=args.tags or None,
+        ids=args.ids or None,
+        only_completed=not args.not_only_completed,
+        recursive_project_search=not args.non_recursive_project_search,
     )
-    formatting = '{:16} | {:32} | {:19} | {:19} | {:32}'
-    print(formatting.format('project', 'name', 'tags', 'created', 'id'))
-    print('-' * len(formatting.format('-', '-', '-', '-', '-')))
+    projects_col_len, name_col_len, tags_col_len, created_col_len, id_col_len = 16, 32, 19, 19, 32
     for d in datasets:
-        print(formatting.format(
-            d['project'], d['name'], str(d['tags'] or [])[1:-1], str(d['created']).split('.')[0], d['id']))
+        projects_col_len = max(projects_col_len, len(d["project"]))
+        name_col_len = max(name_col_len, len(d["name"]))
+        tags_col_len = max(tags_col_len, len(str(d["tags"] or [])[1:-1]))
+        created_col_len = max(created_col_len, len(str(d["created"]).split(".")[0]))
+        id_col_len = max(id_col_len, len(d["id"]))
+    formatting = (
+        "{:"
+        + str(projects_col_len)
+        + "} | {:"
+        + str(name_col_len)
+        + "} | {:"
+        + str(tags_col_len)
+        + "} | {:"
+        + str(created_col_len)
+        + "} | {:"
+        + str(id_col_len)
+        + "}"
+    )
+    print(formatting.format("project", "name", "tags", "created", "id"))
+    print("-" * len(formatting.format("-", "-", "-", "-", "-")))
+    for d in datasets:
+        print(
+            formatting.format(
+                d["project"], d["name"], str(d["tags"] or [])[1:-1], str(d["created"]).split(".")[0], d["id"]
+            )
+        )
     return 0
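
Both ds_list and ds_search now format their tables in two passes: first measure the widest value per column (seeded with the old fixed widths as minimums), then bake the widths into the format spec. The "," option is stripped for the header and divider rows because Python's format mini-language rejects a thousands separator on strings. A minimal sketch of the technique with made-up rows:

    rows = [("data/a.txt", 123, "9f2c"), ("data/long/nested/path.bin", 4567890, "deadbeef")]

    # Pass 1: measure column widths, keeping fixed minimums like the CLI does.
    name_w = max(10, *(len(r[0]) for r in rows))
    size_w = max(6, *(len(str(r[1])) for r in rows))
    hash_w = max(8, *(len(r[2]) for r in rows))

    # Pass 2: build the format string; "," renders thousands separators for ints.
    fmt = "{:" + str(name_w) + "} | {:" + str(size_w) + ",} | {:" + str(hash_w) + "}"
    print(fmt.replace(",", "").format("file name", "size", "hash"))  # str rejects ","
    print("-" * len(fmt.replace(",", "").format("-", "-", "-")))
    for name, size, digest in rows:
        print(fmt.format(name, size, digest))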

clearml/datasets/dataset.py
@@ -4,6 +4,7 @@ import os
 import shutil
 import psutil
 import mimetypes
+import re
 from copy import deepcopy, copy
 from multiprocessing.pool import ThreadPool
 from concurrent.futures import ThreadPoolExecutor
@@ -1736,8 +1737,16 @@ class Dataset(object):
         return squashed_ds
 
     @classmethod
-    def list_datasets(cls, dataset_project=None, partial_name=None, tags=None, ids=None, only_completed=True):
-        # type: (Optional[str], Optional[str], Optional[Sequence[str]], Optional[Sequence[str]], bool) -> List[dict]
+    def list_datasets(
+        cls,
+        dataset_project=None,
+        partial_name=None,
+        tags=None,
+        ids=None,
+        only_completed=True,
+        recursive_project_search=True,
+    ):
+        # type: (Optional[str], Optional[str], Optional[Sequence[str]], Optional[Sequence[str]], bool, bool) -> List[dict]
         """
         Query list of dataset in the system
 
@@ -1746,30 +1755,47 @@ class Dataset(object):
         :param tags: Specify user tags
         :param ids: List specific dataset based on IDs list
         :param only_completed: If False return dataset that are still in progress (uploading/edited etc.)
+        :param recursive_project_search: If True and the `dataset_project` argument is set,
+            search inside subprojects as well.
+            If False, don't search inside subprojects (except for the special `.datasets` subproject)
         :return: List of dictionaries with dataset information
             Example: [{'name': name, 'project': project name, 'id': dataset_id, 'created': date_created},]
         """
+        if dataset_project:
+            if not recursive_project_search:
+                dataset_projects = [
+                    exact_match_regex(dataset_project),
+                    "^{}/\\.datasets/.*".format(re.escape(dataset_project)),
+                ]
+            else:
+                dataset_projects = [exact_match_regex(dataset_project), "^{}/.*".format(re.escape(dataset_project))]
+        else:
+            dataset_projects = None
         # noinspection PyProtectedMember
         datasets = Task._query_tasks(
-            task_ids=ids or None, project_name=dataset_project or None,
+            task_ids=ids or None,
+            project_name=dataset_projects,
             task_name=partial_name,
             system_tags=[cls.__tag],
             type=[str(Task.TaskTypes.data_processing)],
             tags=tags or None,
-            status=['stopped', 'published', 'completed', 'closed'] if only_completed else None,
-            only_fields=['created', 'id', 'name', 'project', 'tags'],
+            status=["stopped", "published", "completed", "closed"] if only_completed else None,
+            only_fields=["created", "id", "name", "project", "tags"],
             search_hidden=True,
-            _allow_extra_fields_=True
+            exact_match_regex_flag=False,
+            _allow_extra_fields_=True,
         )
         project_ids = {d.project for d in datasets}
         # noinspection PyProtectedMember
-        project_id_lookup = {d: Task._get_project_name(d) for d in project_ids}
+        project_id_lookup = Task._get_project_names(project_ids)
         return [
-            {'name': d.name,
-             'created': d.created,
-             'project': project_id_lookup[d.project],
-             'id': d.id,
-             'tags': d.tags}
+            {
+                "name": d.name,
+                "created": d.created,
+                "project": cls._remove_hidden_part_from_dataset_project(project_id_lookup[d.project]),
+                "id": d.id,
+                "tags": d.tags,
+            }
             for d in datasets
         ]
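
This hunk is the heart of the fix: dataset tasks live under hidden `.datasets` subprojects (hence search_hidden=True and _remove_hidden_part_from_dataset_project), so a single exact project-name match finds nothing; the query now sends a list of regexes. A sketch of the two pattern sets, assuming exact_match_regex escapes its argument and anchors it at both ends (as its use alongside re.escape suggests):

    import re

    def exact_match_regex(name):
        # Assumption: equivalent to the clearml helper of the same name.
        return "^{}$".format(re.escape(name))

    project = "my_project"
    recursive = [exact_match_regex(project), "^{}/.*".format(re.escape(project))]
    non_recursive = [exact_match_regex(project), r"^{}/\.datasets/.*".format(re.escape(project))]

    # Recursive search matches any subproject; non-recursive still admits the
    # hidden .datasets subproject where the dataset tasks actually live.
    assert re.match(recursive[1], "my_project/team_a/experiments")
    assert not re.match(non_recursive[1], "my_project/team_a/experiments")
    assert re.match(non_recursive[1], "my_project/.datasets/my_dataset")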

clearml/task.py
@@ -3938,7 +3938,15 @@ class Task(_Task):
         return [cls(private=cls.__create_protection, task_id=task.id, log_to_backend=False) for task in queried_tasks]
 
     @classmethod
-    def _query_tasks(cls, task_ids=None, project_name=None, task_name=None, fetch_only_first_page=False, **kwargs):
+    def _query_tasks(
+        cls,
+        task_ids=None,
+        project_name=None,
+        task_name=None,
+        fetch_only_first_page=False,
+        exact_match_regex_flag=True,
+        **kwargs
+    ):
         res = None
         if not task_ids:
             task_ids = None

@@ -3960,13 +3968,12 @@ class Task(_Task):
                 res = cls._send(
                     cls._get_default_session(),
                     projects.GetAllRequest(
-                        name=exact_match_regex(name),
+                        name=exact_match_regex(name) if exact_match_regex_flag else name,
                         **aux_kwargs
                     )
                 )
-                project = get_single_result(entity='project', query=name, results=res.response.projects)
-                if project:
-                    project_ids.append(project.id)
+                if res.response and res.response.projects:
+                    project_ids.extend([project.id for project in res.response.projects])
 
         session = cls._get_default_session()
         system_tags = 'system_tags' if hasattr(tasks.Task, 'system_tags') else 'tags'
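
Previously each resolved name went through get_single_result, which (as its name suggests) expects exactly one match, while the regex patterns above can legitimately match several projects (a parent plus its subprojects); keeping at most one id is presumably how list_datasets() ended up empty. The rewrite keeps every match. A toy illustration with stand-in objects rather than the real API:

    class Project:
        def __init__(self, id, name):
            self.id, self.name = id, name

    matched = [Project("p-1", "my_project"), Project("p-2", "my_project/.datasets/cifar")]

    # Old logic demanded one result; new logic keeps the id of every match.
    project_ids = [p.id for p in matched]
    assert project_ids == ["p-1", "p-2"]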