From 1a62d4b6e17c004ab7ceee6e0800b2c9328293f7 Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Mon, 23 Jan 2023 14:07:02 +0200
Subject: [PATCH] Fix `StorageManager.list()` does not return size metadata
 (#865)

---
 clearml/datasets/dataset.py | 53 ++++++++++++++++++++++++++-----------
 clearml/storage/helper.py   | 14 +++++-----
 2 files changed, 44 insertions(+), 23 deletions(-)

diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py
index 9f78473d..59006347 100644
--- a/clearml/datasets/dataset.py
+++ b/clearml/datasets/dataset.py
@@ -2030,30 +2030,53 @@ class Dataset(object):
         modified_files_size = 0
         removed_files_count = 0
         removed_files_size = 0
+
+        def update_changes(entries, parent_entries):
+            nonlocal total_size
+            nonlocal modified_files_count
+            nonlocal modified_files_size
+            nonlocal added_files_count
+            nonlocal added_files_size
+            nonlocal removed_files_count
+            nonlocal removed_files_size
+
+            for file in entries.values():
+                # noinspection PyBroadException
+                try:
+                    total_size += file.size
+                    if file.parent_dataset_id == self._id:
+                        if file.relative_path in parent_file_entries:
+                            modified_files_count += 1
+                            modified_files_size += file.size - parent_file_entries[file.relative_path].size
+                        else:
+                            added_files_count += 1
+                            added_files_size += file.size
+                except Exception:
+                    pass
+            for parent_entry_key, parent_entry_value in parent_entries.items():
+                # noinspection PyBroadException
+                try:
+                    if parent_entry_key not in entries:
+                        removed_files_count += 1
+                        removed_files_size -= parent_entry_value.size
+                except Exception:
+                    pass
+
         parent_datasets_ids = self._dependency_graph[self._id]
         parent_file_entries = dict()  # type: Dict[str, FileEntry]
+        parent_link_entries = dict()  # type: Dict[str, LinkEntry]
         for parent_dataset_id in parent_datasets_ids:
             if parent_dataset_id == self._id:
                 continue
             parent_dataset = self.get(parent_dataset_id)
             parent_file_entries.update(parent_dataset._dataset_file_entries)
+            parent_link_entries.update(parent_dataset._dataset_link_entries)
         # we have to do this after we update the parent_file_entries because we might
         # have duplicate file entries
-        for parent_file_entry_key, parent_file_entry_value in parent_file_entries.items():
-            if parent_file_entry_key not in self._dataset_file_entries:
-                removed_files_count += 1
-                removed_files_size -= parent_file_entry_value.size
-        for file in self._dataset_file_entries.values():
-            total_size += file.size
-            if file.parent_dataset_id == self._id:
-                if file.relative_path in parent_file_entries:
-                    modified_files_count += 1
-                    modified_files_size += file.size - parent_file_entries[file.relative_path].size
-                else:
-                    added_files_count += 1
-                    added_files_size += file.size
+        update_changes(self._dataset_file_entries, parent_file_entries)
+        update_changes(self._dataset_link_entries, parent_link_entries)
         state = dict(
-            file_count=len(self._dataset_file_entries),
+            file_count=len(self._dataset_file_entries) + len(self._dataset_link_entries),
             total_size=total_size,
             dataset_file_entries=[f.as_dict() for f in self._dataset_file_entries.values()],
             dataset_link_entries=[link.as_dict() for link in self._dataset_link_entries.values()],
@@ -2743,7 +2766,7 @@ class Dataset(object):
                 file_name = file.link
             dataset_details += "{}, {}, {}".format(
                 file_name,
-                file.size if file.size is not None and not hasattr(file, "link") else "",
+                file.size if file.size is not None else "",
                 file.hash if file.hash else "",
             )
             preview_index += 1
diff --git a/clearml/storage/helper.py b/clearml/storage/helper.py
index 50ae09c9..5087f542 100644
--- a/clearml/storage/helper.py
+++ b/clearml/storage/helper.py
@@ -619,13 +619,6 @@ class StorageHelper(object):
             if isinstance(self._driver, _HttpDriver) and obj:
                 obj = self._driver._get_download_object(obj)  # noqa
                 size = int(obj.headers.get("Content-Length", 0))
-            elif isinstance(self._driver, _Boto3Driver) and obj:
-                # noinspection PyBroadException
-                try:
-                    # To catch botocore exceptions
-                    size = obj.content_length  # noqa
-                except Exception:
-                    pass
             elif hasattr(obj, "size"):
                 size = obj.size
                 # Google storage has the option to reload the object to get the size
@@ -633,7 +626,12 @@ class StorageHelper(object):
                     obj.reload()
                     size = obj.size
             elif hasattr(obj, "content_length"):
-                size = obj.content_length
+                # noinspection PyBroadException
+                try:
+                    # To catch botocore exceptions
+                    size = obj.content_length  # noqa
+                except Exception:
+                    pass
         except (ValueError, AttributeError, KeyError):
             pass
         return size
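
Reviewer note (not part of the patch): a minimal usage sketch of the behavior this
change targets. It assumes `StorageManager.list()` accepts a `with_metadata`
argument and returns dict entries carrying a "size" field (parameter name and
return shape may vary across clearml versions); the bucket/prefix URL below is a
placeholder.

    from clearml import StorageManager

    # With the _Boto3Driver special case removed, S3 objects now reach the
    # generic hasattr(obj, "content_length") fallback in helper.py, so size
    # metadata should be populated for S3 listings as well.
    entries = StorageManager.list("s3://example-bucket/some/prefix", with_metadata=True)
    for entry in entries:
        print(entry.get("name"), entry.get("size"))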