mirror of https://github.com/clearml/clearml (synced 2025-06-23 01:55:38 +00:00)
Fix too many parts causing the preview to increase the Task object beyond the 16MB limit; set a total preview limit of 320KB
This commit is contained in:
parent
8e9f422ec5
commit
0e283dd514
@@ -117,6 +117,7 @@ class Dataset(object):
     __dataset_folder_template = CacheManager.set_context_folder_lookup(__cache_context, "{0}_archive_{1}")
     __preview_max_file_entries = 15000
     __preview_max_size = 32 * 1024
+    __preview_total_max_size = 320 * 1024
     __min_api_version = "2.20"
     __hyperparams_section = "Datasets"
     __datasets_runtime_prop = "datasets"
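For scale: the existing per-artifact cap alone does not bound the Task object, because every uploaded chunk carries its own preview. A rough worked example using the constants above (the chunk count is hypothetical):

PREVIEW_MAX_SIZE = 32 * 1024         # existing per-chunk preview cap
PREVIEW_TOTAL_MAX_SIZE = 320 * 1024  # new cap across all chunks of a version

chunks = 1000                            # hypothetical many-part dataset
print(chunks * PREVIEW_MAX_SIZE)         # 32768000 bytes (~31.25MB), past the 16MB Task limit
print(PREVIEW_TOTAL_MAX_SIZE)            # 327680 bytes (~320KB) worst case after this change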
@@ -131,15 +132,15 @@ class Dataset(object):
 
     def __init__(
         self,
-        _private,
-        task=None,
-        dataset_project=None,
-        dataset_name=None,
-        dataset_tags=None,
-        dataset_version=None,
-        description=None,
+        _private,  # type: int
+        task=None,  # type: Optional[Task]
+        dataset_project=None,  # type: Optional[str]
+        dataset_name=None,  # type: Optional[str]
+        dataset_tags=None,  # type: Optional[Sequence[str]]
+        dataset_version=None,  # type: Optional[str]
+        description=None,  # type: Optional[str]
     ):
-        # type: (int, Optional[Task], Optional[str], Optional[str], Optional[Sequence[str]], Optional[str], Optional[str]) -> ()
+        # type: (...) -> ()
         """
         Do not use directly! Use Dataset.create(...) or Dataset.get(...) instead.
         """
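Both signature styles above are valid PEP 484 type comments for Python 2-compatible code; the commit switches from one signature-level comment to per-argument comments plus a `(...)` stub. A minimal sketch with made-up names:

from typing import Optional

def make_name(base, suffix=None):
    # type: (str, Optional[str]) -> str
    return base + (suffix or "")

def make_name_per_arg(
    base,         # type: str
    suffix=None,  # type: Optional[str]
):
    # type: (...) -> str
    return base + (suffix or "")

The per-argument form keeps each annotation next to its parameter, which scales better as signatures grow.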
@@ -220,7 +221,8 @@ class Dataset(object):
         # generate the script section
         script = (
             "from clearml import Dataset\n\n"
-            "ds = Dataset.create(dataset_project='{dataset_project}', dataset_name='{dataset_name}', dataset_version='{dataset_version}')\n".format(
+            "ds = Dataset.create(dataset_project='{dataset_project}', dataset_name='{dataset_name}', "
+            "dataset_version='{dataset_version}')\n".format(
                 dataset_project=dataset_project, dataset_name=dataset_name, dataset_version=dataset_version
             )
         )
@@ -620,6 +622,7 @@ class Dataset(object):
 
         total_size = 0
         chunks_count = 0
+        total_preview_size = 0
         keep_as_file_entry = set()
         chunk_size = int(self._dataset_chunk_size_mb if not chunk_size else chunk_size)
 
@@ -649,7 +652,9 @@ class Dataset(object):
                 self._data_artifact_name = self._get_next_data_artifact_name(self._data_artifact_name)
             self._task.get_logger().report_text(
                 "Uploading dataset changes ({} files compressed to {}) to {}".format(
-                    zip_.count, format_size(zip_.size, binary=True, use_b_instead_of_bytes=True), self.get_default_storage()
+                    zip_.count,
+                    format_size(zip_.size, binary=True, use_b_instead_of_bytes=True),
+                    self.get_default_storage()
                 )
             )
             total_size += zip_.size
@@ -659,21 +664,27 @@ class Dataset(object):
             truncated_message = "...\ntruncated (too many files to preview)"
             for preview_entry in zip_.archive_preview[:Dataset.__preview_max_file_entries]:
                 truncated_preview += preview_entry + "\n"
-                if len(truncated_preview) > Dataset.__preview_max_size:
+                if len(truncated_preview) > Dataset.__preview_max_size or \
+                        len(truncated_preview) + total_preview_size > Dataset.__preview_total_max_size:
                     add_truncated_message = True
                     break
             if len(zip_.archive_preview) > Dataset.__preview_max_file_entries:
                 add_truncated_message = True
 
+            preview = truncated_preview + (truncated_message if add_truncated_message else "")
+            total_preview_size += len(preview)
+
             pool.submit(
                 self._task.upload_artifact,
                 name=artifact_name,
                 artifact_object=Path(zip_path),
-                preview=truncated_preview + (truncated_message if add_truncated_message else ""),
+                preview=preview,
                 delete_after_upload=True,
                 wait_on_upload=True,
             )
             for file_entry in self._dataset_file_entries.values():
-                if file_entry.local_path is not None and Path(file_entry.local_path).as_posix() in zip_.files_zipped:
+                if file_entry.local_path is not None and \
+                        Path(file_entry.local_path).as_posix() in zip_.files_zipped:
                     keep_as_file_entry.add(file_entry.relative_path)
                     file_entry.artifact_name = artifact_name
                     if file_entry.parent_dataset_id == self._id:
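The hunk above is the heart of the fix: truncation now also stops when the running total across previously built previews would exceed the new cap. A self-contained sketch of that logic, with ad hoc names (build_preview and the constants are illustrative, not clearml API):

PREVIEW_MAX_FILE_ENTRIES = 15000
PREVIEW_MAX_SIZE = 32 * 1024         # per-artifact cap
PREVIEW_TOTAL_MAX_SIZE = 320 * 1024  # cap across all artifacts
TRUNCATED_MESSAGE = "...\ntruncated (too many files to preview)"

def build_preview(entries, total_so_far):
    # type: (list, int) -> str
    """Concatenate file entries until the per-artifact or the total cap is hit."""
    truncated = ""
    add_message = len(entries) > PREVIEW_MAX_FILE_ENTRIES
    for entry in entries[:PREVIEW_MAX_FILE_ENTRIES]:
        truncated += entry + "\n"
        if len(truncated) > PREVIEW_MAX_SIZE or \
                len(truncated) + total_so_far > PREVIEW_TOTAL_MAX_SIZE:
            add_message = True
            break
    return truncated + (TRUNCATED_MESSAGE if add_message else "")

total_preview_size = 0
for chunk in [["a.txt", "b.txt"], ["c.txt"]]:  # stand-in for zip_.archive_preview per chunk
    preview = build_preview(chunk, total_preview_size)
    total_preview_size += len(preview)  # this running total is what stays under 320KB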
@@ -684,7 +695,8 @@ class Dataset(object):
             "File compression and upload completed: total size {}, {} chunk(s) stored (average size {})".format(
                 format_size(total_size, binary=True, use_b_instead_of_bytes=True),
                 chunks_count,
-                format_size(0 if chunks_count == 0 else total_size / chunks_count, binary=True, use_b_instead_of_bytes=True),
+                format_size(0 if chunks_count == 0 else total_size / chunks_count,
+                            binary=True, use_b_instead_of_bytes=True),
             )
         )
         self._ds_total_size_compressed = total_size + self._get_total_size_compressed_parents()
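The reformatted call above keeps the guard against dividing by a zero chunk count; a trivially runnable equivalent (names are ad hoc):

def average_chunk_size(total_size, chunks_count):
    # Avoid ZeroDivisionError when no chunk was uploaded in this call
    return 0 if chunks_count == 0 else total_size / chunks_count

assert average_chunk_size(0, 0) == 0
assert average_chunk_size(100.0, 4) == 25.0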
@@ -1209,7 +1221,7 @@ class Dataset(object):
         return instance
 
     def _get_total_size_compressed_parents(self):
-        # type: () -> (int)
+        # type: () -> int
         """
         :return: the compressed size of the files contained in the parent datasets
         """
@@ -1733,14 +1745,14 @@ class Dataset(object):
     @classmethod
     def list_datasets(
         cls,
-        dataset_project=None,
-        partial_name=None,
-        tags=None,
-        ids=None,
-        only_completed=True,
-        recursive_project_search=True,
+        dataset_project=None,  # type: Optional[str]
+        partial_name=None,  # type: Optional[str]
+        tags=None,  # type: Optional[Sequence[str]]
+        ids=None,  # type: Optional[Sequence[str]]
+        only_completed=True,  # type: bool
+        recursive_project_search=True,  # type: bool
     ):
-        # type: (Optional[str], Optional[str], Optional[Sequence[str]], Optional[Sequence[str]], bool, bool) -> List[dict]
+        # type: (...) -> List[dict]
         """
         Query list of dataset in the system
 
@@ -1781,7 +1793,7 @@ class Dataset(object):
         )
         project_ids = {d.project for d in datasets}
         # noinspection PyProtectedMember
-        project_id_lookup = Task._get_project_names(project_ids)
+        project_id_lookup = Task._get_project_names(list(project_ids))
        return [
            {
                "name": d.name,
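One plausible reason for the list(...) wrapper above: project_ids is built with a set comprehension, and sets are not JSON serializable, so passing the raw set could fail once the lookup serializes it for an API request. That motivation is an assumption, not stated in the diff:

import json

project_ids = {"4ad6f1c0", "9bc2e3aa"}    # made-up ids; result of a set comprehension
# json.dumps(project_ids)                 # would raise TypeError: not JSON serializable
payload = json.dumps(list(project_ids))   # fine once converted to a list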
@@ -1941,7 +1953,7 @@ class Dataset(object):
         removed_files_count = 0
         removed_files_size = 0
         parent_datasets_ids = self._dependency_graph[self._id]
-        parent_file_entries = {}
+        parent_file_entries = dict()  # type: Dict[str, FileEntry]
         for parent_dataset_id in parent_datasets_ids:
             if parent_dataset_id == self._id:
                 continue
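dict() with a trailing type comment is the Python 2-compatible way to annotate an empty container. A small sketch of both spellings, using int in place of the real FileEntry type:

from typing import Dict

# Comment-style annotation, as used in the hunk above:
legacy = dict()  # type: Dict[str, int]

# Python 3 variable-annotation equivalent:
modern: Dict[str, int] = {}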
@@ -2767,7 +2779,8 @@ class Dataset(object):
             if file_extension in compression_extensions:
                 compression = file_extension
                 _, file_extension = os.path.splitext(file_path[: -len(file_extension)])
-            if file_extension in tabular_extensions and self.__preview_tables_count >= self.__preview_tabular_table_count:
+            if file_extension in tabular_extensions and \
+                    self.__preview_tables_count >= self.__preview_tabular_table_count:
                 continue
             artifact = convert_to_tabular_artifact(file_path, file_extension, compression)
             if artifact is not None:
@@ -3079,7 +3092,7 @@ class Dataset(object):
         raise_on_multiple=False,
         shallow_search=True,
     ):
-        # type: (str, str, Optional[str]) -> Tuple[str, str]
+        # type: (str, str, Optional[str], Optional[str], bool, bool) -> Tuple[str, str]
         """
         Gets the dataset ID that matches a project, name and a version.
 
@@ -3194,7 +3207,7 @@ class Dataset(object):
 
     @classmethod
     def _remove_hidden_part_from_dataset_project(cls, dataset_project):
-        # type: (str, str) -> str
+        # type: (str) -> str
         """
         The project name contains the '.datasets' part, as well as the dataset_name.
         Remove those parts and return the project used when creating the dataset.
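The corrected comment drops the extra leading str: comment-style signatures list only the real arguments, so cls (like self) is omitted. A hedged sketch with a hypothetical method, not clearml's actual implementation:

class Example(object):
    @classmethod
    def strip_hidden_part(cls, dataset_project):
        # type: (str) -> str
        # cls is not listed in the type comment; only dataset_project is.
        return dataset_project.split("/.datasets/")[0]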