Fix datasets can't be queried by project/name alone

This commit is contained in:
allegroai 2022-07-21 17:21:22 +03:00
parent 103f68e3e1
commit a42c4b0bd3

View File

@@ -165,6 +165,16 @@ class Dataset(object):
for key in {'files added', 'files removed', 'files modified'}} for key in {'files added', 'files removed', 'files modified'}}
else: else:
self.changed_files = {'files added': 0, 'files removed': 0, 'files modified': 0} self.changed_files = {'files added': 0, 'files removed': 0, 'files modified': 0}
dataset_project, parent_project = self._build_hidden_project_name(task.get_project_name(), task.name)
task.move_to_project(new_project_name=dataset_project)
if bool(Session.check_min_api_server_version(Dataset.__min_api_version)):
get_or_create_project(task.session, project_name=parent_project, system_tags=[self.__hidden_tag])
get_or_create_project(
task.session,
project_name=dataset_project,
project_id=task.project,
system_tags=[self.__hidden_tag, self.__tag],
)
else: else:
self._created_task = True self._created_task = True
dataset_project, parent_project = self._build_hidden_project_name(dataset_project, dataset_name) dataset_project, parent_project = self._build_hidden_project_name(dataset_project, dataset_name)
@@ -214,7 +224,7 @@ class Dataset(object):
# noinspection PyProtectedMember # noinspection PyProtectedMember
self._dataset_version = self._task._get_runtime_properties().get("version") self._dataset_version = self._task._get_runtime_properties().get("version")
if not self._dataset_version: if not self._dataset_version:
_, latest_version = self._get_dataset_id_by_version(self.project, self.name) _, latest_version = self._get_dataset_id(self.project, self.name)
if latest_version is not None: if latest_version is not None:
# noinspection PyBroadException # noinspection PyBroadException
try: try:
@@ -1234,14 +1244,33 @@ class Dataset(object):
@classmethod @classmethod
def _get_dataset_ids_respecting_params( def _get_dataset_ids_respecting_params(
cls, cls,
dataset_id=None, dataset_id=None, # Optional[str]
dataset_project=None, dataset_project=None, # Optional[str]
dataset_name=None, dataset_name=None, # Optional[str]
force=False, force=False, # bool
dataset_version=None, dataset_version=None, # Optional[str]
entire_dataset=None, entire_dataset=False, # bool
action=None action=None, # Optional[str]
shallow_search=True, # bool
): ):
# type: (...) -> List[str]
"""
Get datasets IDs based on certain criteria, like the dataset_project, dataset_name etc.
:param dataset_id: If set, only this ID is returned
:param dataset_project: Corresponding dataset project
:param dataset_name: Corresponding dataset name
:param force: If True, get the dataset(s) even when being used. Also required to be set to
True when `entire_dataset` is set.
:param dataset_version: The version of the corresponding dataset. If set to `None` (default),
then get the dataset with the latest version
:param entire_dataset: If True, get all datasets that match the given `dataset_project`,
`dataset_name`, `dataset_version`. Note that `force` has to be True if this parameter is True
:param action: Corresponding action, used for logging/building error texts
:param shallow_search: If True, search only the first 500 results (first page)
:return: A list of datasets that matched the parameters
"""
if dataset_id: if dataset_id:
return [dataset_id] return [dataset_id]
if entire_dataset: if entire_dataset:
@@ -1260,8 +1289,12 @@ class Dataset(object):
_allow_extra_fields_=True, _allow_extra_fields_=True,
) )
return [d.id for d in datasets] return [d.id for d in datasets]
dataset_id = cls._find_dataset_id( dataset_id, _ = cls._get_dataset_id(
dataset_project=dataset_project, dataset_name=dataset_name, dataset_version=dataset_version dataset_project=dataset_project,
dataset_name=dataset_name,
dataset_version=dataset_version,
raise_on_multiple=True,
shallow_search=shallow_search
) )
if not dataset_id: if not dataset_id:
raise ValueError( raise ValueError(
@@ -1283,6 +1316,7 @@ class Dataset(object):
force=False, # bool force=False, # bool
dataset_version=None, # Optional[str] dataset_version=None, # Optional[str]
entire_dataset=False, # bool entire_dataset=False, # bool
shallow_search=True # bool
): ):
# type: (...) -> () # type: (...) -> ()
""" """
@@ -1295,8 +1329,9 @@ class Dataset(object):
:param force: If True, deleted the dataset(s) even when being used. Also required to be set to :param force: If True, deleted the dataset(s) even when being used. Also required to be set to
True when `entire_dataset` is set. True when `entire_dataset` is set.
:param dataset_version: The version of the dataset(s) to be deleted :param dataset_version: The version of the dataset(s) to be deleted
:param entire_dataset: If True, deleted all all datasets that match the given `dataset_project`, :param entire_dataset: If True, delete all datasets that match the given `dataset_project`,
`dataset_name`, `dataset_version`. Note that `force` has to be True if this parameter is True `dataset_name`, `dataset_version`. Note that `force` has to be True if this parameter is True
:param shallow_search: If True, search only the first 500 results (first page)
""" """
if not any([dataset_id, dataset_project, dataset_name]): if not any([dataset_id, dataset_project, dataset_name]):
raise ValueError("Dataset deletion criteria not met. Didn't provide id/name/project correctly.") raise ValueError("Dataset deletion criteria not met. Didn't provide id/name/project correctly.")
@@ -1313,6 +1348,7 @@ class Dataset(object):
force=force, force=force,
dataset_version=dataset_version, dataset_version=dataset_version,
entire_dataset=entire_dataset, entire_dataset=entire_dataset,
shallow_search=shallow_search,
action="delete", action="delete",
) )
except Exception as e: except Exception as e:
@@ -1410,6 +1446,7 @@ class Dataset(object):
dataset_project=dataset_project, dataset_project=dataset_project,
dataset_name=dataset_name, dataset_name=dataset_name,
entire_dataset=True, entire_dataset=True,
shallow_search=False,
force=True, force=True,
action="move", action="move",
) )
@@ -1441,11 +1478,12 @@ class Dataset(object):
dataset_version=None, # type: Optional[str] dataset_version=None, # type: Optional[str]
alias=None, # type: Optional[str] alias=None, # type: Optional[str]
overridable=False, # type: bool overridable=False, # type: bool
shallow_search=True, # type: bool
**kwargs **kwargs
): ):
# type: (...) -> "Dataset" # type: (...) -> "Dataset"
""" """
Get a specific Dataset. If multiple datasets are found, the most recent one is returned Get a specific Dataset. If multiple datasets are found, the dataset with the highest version is returned
:param dataset_id: Requested dataset ID :param dataset_id: Requested dataset ID
:param dataset_project: Requested dataset project name :param dataset_project: Requested dataset project name
@@ -1462,6 +1500,7 @@ class Dataset(object):
:param overridable: If True, allow overriding the dataset ID with a given alias in the :param overridable: If True, allow overriding the dataset ID with a given alias in the
hyperparameters section. Useful when one wants to change the dataset used when running hyperparameters section. Useful when one wants to change the dataset used when running
a task remotely. If the alias parameter is not set, this parameter has no effect a task remotely. If the alias parameter is not set, this parameter has no effect
:param shallow_search: If True, search only the first 500 results (first page)
:return: Dataset object :return: Dataset object
""" """
@@ -1534,24 +1573,21 @@ class Dataset(object):
return dataset return dataset
if not dataset_id: if not dataset_id:
dataset_id = cls._find_dataset_id( dataset_id, _ = cls._get_dataset_id(
dataset_project=dataset_project, dataset_project=dataset_project,
dataset_name=dataset_name, dataset_name=dataset_name,
dataset_version=dataset_version, dataset_version=dataset_version,
raise_on_error=False,
dataset_tags=dataset_tags,
dataset_filter=dict( dataset_filter=dict(
tags=dataset_tags,
system_tags=[cls.__tag, "-archived"], system_tags=[cls.__tag, "-archived"],
order_by=["-created"],
type=[str(Task.TaskTypes.data_processing)], type=[str(Task.TaskTypes.data_processing)],
page_size=1,
page=0,
status=["published"] status=["published"]
if only_published if only_published
else ["published", "completed", "closed"] else ["published", "completed", "closed"]
if only_completed if only_completed
else None, else None,
), ),
shallow_search=shallow_search
) )
if not dataset_id and not auto_create: if not dataset_id and not auto_create:
raise ValueError( raise ValueError(
@@ -2880,103 +2916,80 @@ class Dataset(object):
return chunk_selection return chunk_selection
@classmethod @classmethod
def _get_dataset_id_by_version(cls, dataset_project, dataset_name, dataset_version="latest"): def _get_dataset_id(
cls,
dataset_project,
dataset_name,
dataset_version=None,
dataset_filter=None,
raise_on_multiple=False,
shallow_search=True,
):
# type: (str, str, Optional[str]) -> Tuple[str, str] # type: (str, str, Optional[str]) -> Tuple[str, str]
""" """
Gets the dataset ID that matches a project, name and a version. Gets the dataset ID that matches a project, name and a version.
:param dataset_project: Corresponding dataset project :param dataset_project: Corresponding dataset project
:param dataset_name: Corresponding dataset name :param dataset_name: Corresponding dataset name
:param dataset_version: The version of the corresponding dataset. If set to 'latest', :param dataset_version: The version of the corresponding dataset. If set to `None` (default),
then get the dataset with the latest version then get the dataset with the latest version
:param dataset_filter: Filter the found datasets based on the criteria present in this dict.
Has the same behaviour as `task_filter` parameter in Task.get_tasks. If None,
the filter will have parameters set specific to datasets
:param raise_on_multiple: If True and more than 1 dataset is found raise an Exception
:param shallow_search: If True, search only the first 500 results (first page)
:return: A tuple containing 2 strings: the dataset ID and the version of that dataset :return: A tuple containing 2 strings: the dataset ID and the version of that dataset
""" """
dataset_filter = dataset_filter or {}
unmodifiable_params = ["project_name", "task_name", "only_fields", "search_hidden", "_allow_extra_fields_"]
for unmodifiable_param in unmodifiable_params:
if unmodifiable_param in dataset_filter:
del dataset_filter[unmodifiable_param]
dataset_filter.setdefault("system_tags", [cls.__tag])
# dataset_filter.setdefault("type", [str(Task.TaskTypes.data_processing)])
dataset_filter.setdefault("order_by", ["-last_update"])
# making sure we have the right project name here # making sure we have the right project name here
hidden_dataset_project, _ = cls._build_hidden_project_name(dataset_project, dataset_name) hidden_dataset_project, _ = cls._build_hidden_project_name(dataset_project, dataset_name)
# noinspection PyProtectedMember # noinspection PyProtectedMember
datasets = Task._query_tasks( datasets = Task._query_tasks(
project_name=[hidden_dataset_project], project_name=[hidden_dataset_project] if hidden_dataset_project else None,
task_name=exact_match_regex(dataset_name) if dataset_name else None, task_name=exact_match_regex(dataset_name) if dataset_name else None,
system_tags=[cls.__tag], fetch_only_first_page=shallow_search,
only_fields=["id", "runtime.version"], only_fields=["id", "runtime.version"],
search_hidden=True, search_hidden=True,
_allow_extra_fields_=True _allow_extra_fields_=True,
**dataset_filter,
) )
if raise_on_multiple and len(datasets) > 1:
raise ValueError(
"Multiple datasets found with dataset_project={}, dataset_name={}, dataset_version={}".format(
dataset_project, dataset_name, dataset_version
)
)
result_dataset = None result_dataset = None
for dataset in datasets: for dataset in datasets:
current_version = dataset.runtime.get("version") current_version = dataset.runtime.get("version")
if not current_version: if not current_version:
continue continue
if dataset_version == "latest" and ( if dataset_version is None and (
not result_dataset or Version(result_dataset.runtime["version"]) < Version(current_version) not result_dataset or Version(result_dataset.runtime["version"]) < Version(current_version)
): ):
result_dataset = dataset result_dataset = dataset
elif dataset_version == current_version: elif dataset_version == current_version:
if result_dataset and raise_on_multiple:
raise ValueError(
"Multiple datasets found with dataset_project={}, dataset_name={}, dataset_version={}".format(
dataset_project, dataset_name, dataset_version
)
)
result_dataset = dataset result_dataset = dataset
break if not raise_on_multiple:
break
if not result_dataset: if not result_dataset:
return None, None return None, None
return result_dataset.id, result_dataset.runtime.get("version") return result_dataset.id, result_dataset.runtime.get("version")
@classmethod
def _find_dataset_id(
cls,
dataset_project=None, # Optional[str]
dataset_name=None, # Optional[str]
dataset_version=None, # Optional[str]
dataset_tags=None, # Optional[Sequence[str]]
dataset_filter=None, # Optional[dict]
raise_on_error=True, # bool
):
# type: (...) -> Optional[str]
"""
Find a dataset ID based on the given parameters
:param dataset_project: Project of the dataset searched
:param dataset_name: Name of the dataset searched
:param dataset_version: Version of the dataset searched
:param dataset_tags: List of tags of the dataset searched
:param dataset_filter: Filter the found datasets based on the criteria present in this dict.
Has the same behaviour as `task_filter` parameter in Task.get_tasks. If None,
the filter will have parameters set specific to datasets.
:param raise_on_error: If True and no dataset is found or more than 1 dataset is found,
raise an Exception.
"""
if not dataset_version:
if dataset_filter is None:
dataset_filter = dict(
system_tags=[cls.__tag],
type=[str(Task.TaskTypes.data_processing)],
page_size=2,
page=0,
)
dataset_filter["search_hidden"] = True
dataset_filter["_allow_extra_fields_"] = True
hidden_dataset_project, _ = cls._build_hidden_project_name(dataset_project, dataset_name)
tasks = Task.get_tasks(
project_name=hidden_dataset_project,
task_name=exact_match_regex(dataset_name) if dataset_name else None,
tags=dataset_tags,
task_filter=dataset_filter,
)
if not tasks and raise_on_error:
raise ValueError("Dataset project={} name={} could not be found".format(dataset_project, dataset_name))
if len(tasks) > 1 and raise_on_error:
raise ValueError("Too many datasets matching project={} name={}".format(dataset_project, dataset_name))
dataset_id = tasks[0].id
else:
dataset_id, _ = cls._get_dataset_id_by_version(
dataset_project, dataset_name, dataset_version=dataset_version
)
if not dataset_id and raise_on_error:
raise ValueError(
"Dataset project={} name={} version={} could not be found".format(
dataset_project, dataset_name, dataset_version
)
)
return dataset_id
@classmethod @classmethod
def _build_hidden_project_name(cls, dataset_project, dataset_name): def _build_hidden_project_name(cls, dataset_project, dataset_name):
# type: (str, str) -> Tuple[str, str] # type: (str, str) -> Tuple[str, str]
@@ -2990,10 +3003,13 @@ class Dataset(object):
:return: Tuple of 2 strings, one is the corresponding hidden dataset project and one :return: Tuple of 2 strings, one is the corresponding hidden dataset project and one
is the parent project is the parent project
""" """
dataset_project = cls._remove_hidden_part_from_dataset_project(dataset_project) if not dataset_project:
return None, None
project_name = cls._remove_hidden_part_from_dataset_project(dataset_project)
if bool(Session.check_min_api_server_version(cls.__min_api_version)): if bool(Session.check_min_api_server_version(cls.__min_api_version)):
parent_project = "{}.datasets".format(dataset_project + "/" if dataset_project else "") parent_project = "{}.datasets".format(dataset_project + "/" if dataset_project else "")
project_name = "{}/{}".format(parent_project, dataset_name) if dataset_name:
project_name = "{}/{}".format(parent_project, dataset_name)
else: else:
parent_project = None parent_project = None
project_name = dataset_project or "Datasets" project_name = dataset_project or "Datasets"