Fix Dataset.list_datasets() returns an empty list

allegroai 2022-09-13 14:57:21 +03:00
parent fb644fe9ec
commit c0bbab75b8
4 changed files with 134 additions and 35 deletions

View File

@@ -2435,6 +2435,26 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin):
             return None
         return res.response.project.name
 
+    @classmethod
+    def _get_project_names(cls, project_ids):
+        # type: (Sequence[str]) -> Dict[str, str]
+        page = -1
+        page_size = 500
+        all_responses = []
+        res = None
+        while True:
+            page += 1
+            res = cls._send(
+                cls._get_default_session(),
+                projects.GetAllRequest(id=list(project_ids), page=page, page_size=page_size),
+                raise_on_errors=False,
+            )
+            if res and res.response and res.response.projects:
+                all_responses.extend(res.response.projects)
+            else:
+                break
+        return {p.id: p.name for p in all_responses}
+
     def _get_all_events(
         self, max_events=100, batch_size=500, order='asc', event_type=None, unique_selector=itemgetter("url")
     ):
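
The new _get_project_names helper resolves many project IDs with a single paged projects.get_all query instead of one request per project. A minimal sketch of the same accumulate-until-the-page-is-empty pattern, with a hypothetical fetch_page standing in for the ClearML API call:

def collect_all(fetch_page, page_size=500):
    # Request successive pages until the backend returns an empty batch.
    results = []
    page = 0
    while True:
        batch = fetch_page(page=page, page_size=page_size)
        if not batch:
            break
        results.extend(batch)
        page += 1
    return results

# fetch_page would wrap something like projects.GetAllRequest(id=[...], page=page, page_size=page_size)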

View File

@ -270,11 +270,20 @@ def cli():
help='Verbose report all file changes (instead of summary)') help='Verbose report all file changes (instead of summary)')
squash.set_defaults(func=ds_squash) squash.set_defaults(func=ds_squash)
search = subparsers.add_parser('search', help='Search datasets in the system (sorted by creation time)') search = subparsers.add_parser("search", help="Search datasets in the system (sorted by creation time)")
search.add_argument('--ids', type=str, nargs='*', help='Specify list of dataset IDs') search.add_argument("--ids", type=str, nargs="*", help="Specify list of dataset IDs")
search.add_argument('--project', type=str, help='Specify datasets project name') search.add_argument("--project", type=str, help="Specify datasets project name")
search.add_argument('--name', type=str, help='Specify datasets partial name matching') search.add_argument("--name", type=str, help="Specify datasets partial name matching")
search.add_argument('--tags', type=str, nargs='*', help='Specify list of dataset user tags') search.add_argument("--tags", type=str, nargs="*", help="Specify list of dataset user tags")
search.add_argument(
"--not-only-completed",
action="store_true",
default=False,
help="If set, return datasets that are still in progress as well",
)
search.add_argument(
"--non-recursive-project-search", action="store_true", default=False, help="Don't search inside subprojects"
)
search.set_defaults(func=ds_search) search.set_defaults(func=ds_search)
verify = subparsers.add_parser('verify', help='Verify local dataset content') verify = subparsers.add_parser('verify', help='Verify local dataset content')
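
With the two new flags wired into the search subcommand, a CLI query could look like this (project and tag names are illustrative):

clearml-data search --project "my project" --not-only-completed
clearml-data search --project "my project" --non-recursive-project-search --tags training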
@@ -446,17 +455,28 @@ def ds_list(args):
         dataset_name=args.name or None,
         dataset_version=args.version,
     )
-    print('Listing dataset content')
-    formatting = '{:64} | {:10,} | {:64}'
-    print(formatting.replace(',', '').format('file name', 'size', 'hash'))
-    print('-' * len(formatting.replace(',', '').format('-', '-', '-')))
     filters = args.filter if args.filter else [None]
     file_entries = ds.file_entries_dict
     link_entries = ds.link_entries_dict
-    num_files = 0
-    total_size = 0
+    file_name_max_len, size_max_len, hash_max_len = 64, 10, 64
+    files_cache = []
     for mask in filters:
         files = ds.list_files(dataset_path=mask, dataset_id=ds.id if args.modified else None)
+        files_cache.append(files)
+        for f in files:
+            e = link_entries.get(f)
+            if file_entries.get(f):
+                e = file_entries[f]
+            file_name_max_len = max(file_name_max_len, len(e.relative_path))
+            size_max_len = max(size_max_len, len(str(e.size)))
+            hash_max_len = max(hash_max_len, len(str(e.hash)))
+    print('Listing dataset content')
+    formatting = "{:" + str(file_name_max_len) + "} | {:" + str(size_max_len) + ",} | {:" + str(hash_max_len) + "}"
+    print(formatting.replace(",", "").format("file name", "size", "hash"))
+    print("-" * len(formatting.replace(",", "").format("-", "-", "-")))
+    num_files = 0
+    total_size = 0
+    for files in files_cache:
         num_files += len(files)
         for f in files:
             e = link_entries.get(f)
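
The listing now runs two passes: the first measures the widest file name, size, and hash across all matched files, the second prints the cached results with columns sized to fit. The same idea in isolation (the sample rows are made up):

rows = [("a.txt", 120, "abc123"), ("models/checkpoint_final.bin", 1048576, "9f2d77c0")]

# First pass: measure column widths.
name_w = max(len(name) for name, _, _ in rows)
size_w = max(len(str(size)) for _, size, _ in rows)
hash_w = max(len(h) for _, _, h in rows)

# Second pass: build the format string from the measured widths and print.
fmt = "{:" + str(name_w) + "} | {:" + str(size_w) + ",} | {:" + str(hash_w) + "}"
print(fmt.replace(",", "").format("file name", "size", "hash"))  # strings can't use the "," spec
for name, size, h in rows:
    print(fmt.format(name, size, h))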
@@ -480,15 +500,41 @@ def ds_search(args):
     print('Search datasets')
     print_args(args)
     datasets = Dataset.list_datasets(
-        dataset_project=args.project or None, partial_name=args.name or None,
-        tags=args.tags or None, ids=args.ids or None
+        dataset_project=args.project or None,
+        partial_name=args.name or None,
+        tags=args.tags or None,
+        ids=args.ids or None,
+        only_completed=not args.not_only_completed,
+        recursive_project_search=not args.non_recursive_project_search,
     )
-    formatting = '{:16} | {:32} | {:19} | {:19} | {:32}'
-    print(formatting.format('project', 'name', 'tags', 'created', 'id'))
-    print('-' * len(formatting.format('-', '-', '-', '-', '-')))
-    for d in datasets:
-        print(formatting.format(
-            d['project'], d['name'], str(d['tags'] or [])[1:-1], str(d['created']).split('.')[0], d['id']))
+    projects_col_len, name_col_len, tags_col_len, created_col_len, id_col_len = 16, 32, 19, 19, 32
+    for d in datasets:
+        projects_col_len = max(projects_col_len, len(d["project"]))
+        name_col_len = max(name_col_len, len(d["name"]))
+        tags_col_len = max(tags_col_len, len(str(d["tags"] or [])[1:-1]))
+        created_col_len = max(created_col_len, len(str(d["created"]).split(".")[0]))
+        id_col_len = max(id_col_len, len(d["id"]))
+    formatting = (
+        "{:"
+        + str(projects_col_len)
+        + "} | {:"
+        + str(name_col_len)
+        + "} | {:"
+        + str(tags_col_len)
+        + "} | {:"
+        + str(created_col_len)
+        + "} | {:"
+        + str(id_col_len)
+        + "}"
+    )
+    print(formatting.format("project", "name", "tags", "created", "id"))
+    print("-" * len(formatting.format("-", "-", "-", "-", "-")))
+    for d in datasets:
+        print(
+            formatting.format(
+                d["project"], d["name"], str(d["tags"] or [])[1:-1], str(d["created"]).split(".")[0], d["id"]
+            )
+        )
     return 0
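
For reference, the tag and created columns are derived as in this small illustration (values are made up):

>>> from datetime import datetime
>>> str(["training", "v2"] or [])[1:-1]        # list repr without the brackets
"'training', 'v2'"
>>> str(datetime(2022, 9, 13, 14, 57, 21, 123456)).split(".")[0]   # drop microseconds
'2022-09-13 14:57:21'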

View File

@@ -4,6 +4,7 @@ import os
 import shutil
 import psutil
 import mimetypes
+import re
 from copy import deepcopy, copy
 from multiprocessing.pool import ThreadPool
 from concurrent.futures import ThreadPoolExecutor
@@ -1736,8 +1737,16 @@ class Dataset(object):
         return squashed_ds
 
     @classmethod
-    def list_datasets(cls, dataset_project=None, partial_name=None, tags=None, ids=None, only_completed=True):
-        # type: (Optional[str], Optional[str], Optional[Sequence[str]], Optional[Sequence[str]], bool) -> List[dict]
+    def list_datasets(
+        cls,
+        dataset_project=None,
+        partial_name=None,
+        tags=None,
+        ids=None,
+        only_completed=True,
+        recursive_project_search=True,
+    ):
+        # type: (Optional[str], Optional[str], Optional[Sequence[str]], Optional[Sequence[str]], bool, bool) -> List[dict]
         """
         Query list of dataset in the system
@@ -1746,30 +1755,47 @@
         :param tags: Specify user tags
         :param ids: List specific dataset based on IDs list
         :param only_completed: If False return dataset that are still in progress (uploading/edited etc.)
+        :param recursive_project_search: If True and the `dataset_project` argument is set,
+            search inside subprojects as well.
+            If False, don't search inside subprojects (except for the special `.datasets` subproject)
         :return: List of dictionaries with dataset information
             Example: [{'name': name, 'project': project name, 'id': dataset_id, 'created': date_created},]
         """
+        if dataset_project:
+            if not recursive_project_search:
+                dataset_projects = [
+                    exact_match_regex(dataset_project),
+                    "^{}/\\.datasets/.*".format(re.escape(dataset_project)),
+                ]
+            else:
+                dataset_projects = [exact_match_regex(dataset_project), "^{}/.*".format(re.escape(dataset_project))]
+        else:
+            dataset_projects = None
         # noinspection PyProtectedMember
         datasets = Task._query_tasks(
-            task_ids=ids or None, project_name=dataset_project or None,
+            task_ids=ids or None,
+            project_name=dataset_projects,
             task_name=partial_name,
             system_tags=[cls.__tag],
             type=[str(Task.TaskTypes.data_processing)],
             tags=tags or None,
-            status=['stopped', 'published', 'completed', 'closed'] if only_completed else None,
-            only_fields=['created', 'id', 'name', 'project', 'tags'],
+            status=["stopped", "published", "completed", "closed"] if only_completed else None,
+            only_fields=["created", "id", "name", "project", "tags"],
             search_hidden=True,
-            _allow_extra_fields_=True
+            exact_match_regex_flag=False,
+            _allow_extra_fields_=True,
         )
         project_ids = {d.project for d in datasets}
         # noinspection PyProtectedMember
-        project_id_lookup = {d: Task._get_project_name(d) for d in project_ids}
+        project_id_lookup = Task._get_project_names(project_ids)
         return [
-            {'name': d.name,
-             'created': d.created,
-             'project': project_id_lookup[d.project],
-             'id': d.id,
-             'tags': d.tags}
+            {
+                "name": d.name,
+                "created": d.created,
+                "project": cls._remove_hidden_part_from_dataset_project(project_id_lookup[d.project]),
+                "id": d.id,
+                "tags": d.tags,
+            }
             for d in datasets
         ]
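
project_name is now a list of regular expressions rather than a single exact name: one pattern for the project itself and one for its children (every subproject when recursive_project_search=True, only the hidden `.datasets` subproject otherwise). A quick illustration of what the two pattern sets match, assuming exact_match_regex simply anchors the escaped name with ^ and $:

import re

project = "my project"
recursive = ["^{}$".format(re.escape(project)), "^{}/.*".format(re.escape(project))]
non_recursive = ["^{}$".format(re.escape(project)), "^{}/\\.datasets/.*".format(re.escape(project))]

for candidate in ["my project", "my project/sub", "my project/.datasets/data v1", "other project"]:
    print(candidate, any(re.match(p, candidate) for p in recursive), any(re.match(p, candidate) for p in non_recursive))
# my project                    True   True
# my project/sub                True   False
# my project/.datasets/data v1  True   True
# other project                 False  False

Per the docstring above, `.datasets` is treated as a special subproject, which is why dataset tasks are still found even when subproject search is turned off.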

View File

@@ -3938,7 +3938,15 @@ class Task(_Task):
         return [cls(private=cls.__create_protection, task_id=task.id, log_to_backend=False) for task in queried_tasks]
 
     @classmethod
-    def _query_tasks(cls, task_ids=None, project_name=None, task_name=None, fetch_only_first_page=False, **kwargs):
+    def _query_tasks(
+        cls,
+        task_ids=None,
+        project_name=None,
+        task_name=None,
+        fetch_only_first_page=False,
+        exact_match_regex_flag=True,
+        **kwargs
+    ):
         res = None
         if not task_ids:
             task_ids = None
@@ -3960,13 +3968,12 @@
             res = cls._send(
                 cls._get_default_session(),
                 projects.GetAllRequest(
-                    name=exact_match_regex(name),
+                    name=exact_match_regex(name) if exact_match_regex_flag else name,
                     **aux_kwargs
                 )
             )
-            project = get_single_result(entity='project', query=name, results=res.response.projects)
-            if project:
-                project_ids.append(project.id)
+            if res.response and res.response.projects:
+                project_ids.extend([project.id for project in res.response.projects])
 
         session = cls._get_default_session()
         system_tags = 'system_tags' if hasattr(tasks.Task, 'system_tags') else 'tags'
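
Taken together, Dataset.list_datasets() now builds the project regex list itself and _query_tasks passes it through unmodified (exact_match_regex_flag=False), collecting every matching project ID instead of requiring a single exact match. From the Python API the fixed call looks like this (argument values are illustrative):

from clearml import Dataset

datasets = Dataset.list_datasets(
    dataset_project="my project",        # also matches "my project/.datasets/..."
    only_completed=False,                # include datasets that are still in progress
    recursive_project_search=False,      # don't descend into other subprojects
)
for d in datasets:
    print(d["project"], d["name"], d["id"])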