mirror of https://github.com/clearml/clearml (synced 2025-06-26 18:16:07 +00:00)
Limit number of _serialize requests when adding list of links with add_external_files() (#813)

parent: 8340d4b86c
commit: b793f2dfc6
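
For context, a minimal usage sketch of the API this commit changes: passing a list of links to add_external_files() now fans the work out over one thread pool instead of re-entering the method (and re-serializing the dataset) once per link. The project name, dataset name and URLs below are illustrative, not part of the commit:

from clearml import Dataset

# Illustrative names/URLs; any dataset project works the same way
ds = Dataset.create(dataset_name="external_links_demo", dataset_project="examples")

links = [
    "s3://my-bucket/images/train/",      # folder listing, filtered by wildcard/recursive
    "s3://my-bucket/labels/train.csv",   # single file
    "https://example.com/extra/readme.txt",  # http(s) links are now accepted as well
]

# max_workers is new in this commit; it defaults to the number of logical cores
num_added = ds.add_external_files(source_url=links, dataset_path="raw", max_workers=8)
print("added {} external file links".format(num_added))

ds.upload()
ds.finalize()
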
@@ -116,7 +116,7 @@ class Dataset(object):
     __default_dataset_version = "1.0.0"
     __dataset_folder_template = CacheManager.set_context_folder_lookup(__cache_context, "{0}_archive_{1}")
     __preview_max_file_entries = 15000
-    __preview_max_size = 5 * 1024 * 1024
+    __preview_max_size = 32 * 1024
     __min_api_version = "2.20"
     __hyperparams_section = "Datasets"
     __datasets_runtime_prop = "datasets"
@@ -409,11 +409,13 @@ class Dataset(object):
         dataset_path=None,  # type: Optional[str]
         recursive=True,  # type: bool
         verbose=False,  # type: bool
+        max_workers=None  # type: Optional[int]
     ):
-        # type: (...) -> ()
+        # type: (...) -> int
         """
-        Adds an external file or a folder to the current dataset.
-        External file links can be from cloud storage (s3://, gs://, azure://) or local / network storage (file://).
+        Adds external files or folders to the current dataset.
+        External file links can be from cloud storage (s3://, gs://, azure://), local / network storage (file://)
+        or http(s)// files.
         Calculates file size for each file and compares against parent.

         A few examples:
@@ -436,92 +438,32 @@ class Dataset(object):
             'image.jpg' will be downloaded to 's3_files/image.jpg' (relative path to the dataset)
         :param recursive: If True match all wildcard files recursively
         :param verbose: If True print to console files added/modified
-        :return: number of file links added
+        :param max_workers: The number of threads to add the external files with. Useful when `source_url` is
+            a sequence. Defaults to the number of logical cores
+        :return: Number of file links added
         """
-        num_added = 0
         self._dirty = True
-        if not isinstance(source_url, str):
-            for source_url_ in source_url:
-                num_added += self.add_external_files(
-                    source_url_,
-                    wildcard=wildcard,
-                    dataset_path=dataset_path,
-                    recursive=recursive,
-                    verbose=verbose
-                )
-            return num_added
-        if dataset_path:
-            dataset_path = dataset_path.lstrip("/")
-        # noinspection PyBroadException
-        try:
-            if StorageManager.exists_file(source_url):
-                links = [source_url]
-            else:
-                if source_url[-1] != "/":
-                    source_url = source_url + "/"
-                links = StorageManager.list(source_url, return_full_path=True)
-        except Exception:
-            self._task.get_logger().report_text(
-                "Could not list/find remote file(s) when adding {}".format(source_url)
-            )
-            return 0
-        num_modified = 0
-        for link in links:
-            relative_path = link[len(source_url):]
-            if not relative_path:
-                relative_path = source_url.split("/")[-1]
-            if not matches_any_wildcard(relative_path, wildcard, recursive=recursive):
-                continue
-            try:
-                relative_path = Path(os.path.join(dataset_path or ".", relative_path)).as_posix()
-                size = StorageManager.get_file_size_bytes(link, silence_errors=True)
-                already_added_file = self._dataset_file_entries.get(relative_path)
-                if relative_path not in self._dataset_link_entries:
-                    if verbose:
-                        self._task.get_logger().report_text(
-                            "External file {} added".format(link),
-                            print_console=False,
-                        )
-                    self._dataset_link_entries[relative_path] = LinkEntry(
-                        link=link, relative_path=relative_path, parent_dataset_id=self._id, size=size
-                    )
-                    num_added += 1
-                elif already_added_file and already_added_file.size != size:
-                    if verbose:
-                        self._task.get_logger().report_text(
-                            "External file {} modified".format(link),
-                            print_console=False,
-                        )
-                    del self._dataset_file_entries[relative_path]
-                    self._dataset_link_entries[relative_path] = LinkEntry(
-                        link=link, relative_path=relative_path, parent_dataset_id=self._id, size=size
-                    )
-                    num_modified += 1
-                elif (
-                    relative_path in self._dataset_link_entries
-                    and self._dataset_link_entries[relative_path].size != size
-                ):
-                    if verbose:
-                        self._task.get_logger().report_text(
-                            "External file {} modified".format(link),
-                            print_console=False,
-                        )
-                    self._dataset_link_entries[relative_path] = LinkEntry(
-                        link=link, relative_path=relative_path, parent_dataset_id=self._id, size=size
-                    )
-                    num_modified += 1
-                else:
-                    if verbose:
-                        self._task.get_logger().report_text(
-                            "External file {} skipped as it was not modified".format(link),
-                            print_console=False,
-                        )
-            except Exception as e:
-                if verbose:
-                    self._task.get_logger().report_text(
-                        "Error '{}' encountered trying to add external file {}".format(e, link),
-                        print_console=False,
-                    )
+        num_added = 0
+        num_modified = 0
+        source_url_list = source_url if not isinstance(source_url, str) else [source_url]
+        max_workers = max_workers or psutil.cpu_count()
+        futures_ = []
+        with ThreadPoolExecutor(max_workers=max_workers) as tp:
+            for source_url_ in source_url_list:
+                futures_.append(
+                    tp.submit(
+                        self._add_external_files,
+                        source_url_,
+                        wildcard=wildcard,
+                        dataset_path=dataset_path,
+                        recursive=recursive,
+                        verbose=verbose,
+                    )
+                )
+        for future_ in futures_:
+            num_added_this_call, num_modified_this_call = future_.result()
+            num_added += num_added_this_call
+            num_modified += num_modified_this_call
         self._task.add_tags([self.__external_files_tag])
         self._add_script_call(
             "add_external_files",
@@ -712,11 +654,21 @@ class Dataset(object):
                 )
                 total_size += zip_.size
                 chunks_count += 1
+                truncated_preview = ""
+                add_truncated_message = False
+                truncated_message = "...\ntruncated (too many files to preview)"
+                for preview_entry in zip_.archive_preview[: Dataset.__preview_max_file_entries]:
+                    truncated_preview += preview_entry + "\n"
+                    if len(truncated_preview) > Dataset.__preview_max_size:
+                        add_truncated_message = True
+                        break
+                if len(zip_.archive_preview) > Dataset.__preview_max_file_entries:
+                    add_truncated_message = True
                 pool.submit(
                     self._task.upload_artifact,
                     name=artifact_name,
                     artifact_object=Path(zip_path),
-                    preview=zip_.archive_preview,
+                    preview=truncated_preview + (truncated_message if add_truncated_message else ""),
                     delete_after_upload=True,
                     wait_on_upload=True,
                 )
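
A standalone restatement of the truncation rule introduced in the hunk above, using the limits defined earlier in this diff (15000 preview entries, 32 KB of preview text); the helper name below is illustrative and not part of the commit:

def build_truncated_preview(entries, max_entries=15000, max_size=32 * 1024):
    # entries is a list of "path - size" strings (see the ParallelZipper.archive_preview change below)
    preview = ""
    truncated = False
    for entry in entries[:max_entries]:
        preview += entry + "\n"
        if len(preview) > max_size:
            truncated = True
            break
    if len(entries) > max_entries:
        truncated = True
    return preview + ("...\ntruncated (too many files to preview)" if truncated else "")
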
@@ -2972,6 +2924,110 @@ class Dataset(object):
             self._dependency_chunk_lookup = self._build_dependency_chunk_lookup()
         return self._dependency_chunk_lookup

+    def _add_external_files(
+        self,
+        source_url,  # type: str
+        wildcard=None,  # type: Optional[Union[str, Sequence[str]]]
+        dataset_path=None,  # type: Optional[str]
+        recursive=True,  # type: bool
+        verbose=False,  # type: bool
+    ):
+        # type: (...) -> Tuple[int, int]
+        """
+        Auxiliary function for `add_external_files`
+        Adds an external file or a folder to the current dataset.
+        External file links can be from cloud storage (s3://, gs://, azure://) or local / network storage (file://).
+        Calculates file size for each file and compares against parent.
+
+        :param source_url: Source url link (e.g. s3://bucket/folder/path)
+        :param wildcard: add only specific set of files.
+            Wildcard matching, can be a single string or a list of wildcards.
+        :param dataset_path: The location in the dataset where the file will be downloaded into.
+            e.g: for source_url='s3://bucket/remote_folder/image.jpg' and dataset_path='s3_files',
+            'image.jpg' will be downloaded to 's3_files/image.jpg' (relative path to the dataset)
+        :param recursive: If True match all wildcard files recursively
+        :param verbose: If True print to console files added/modified
+        :return: Number of file links added and modified
+        """
+        if dataset_path:
+            dataset_path = dataset_path.lstrip("/")
+        remote_objects = None
+        # noinspection PyBroadException
+        try:
+            if StorageManager.exists_file(source_url):
+                remote_objects = [StorageManager.get_metadata(source_url)]
+            elif not source_url.startswith(("http://", "https://")):
+                if source_url[-1] != "/":
+                    source_url = source_url + "/"
+                remote_objects = StorageManager.list(source_url, with_metadata=True, return_full_path=True)
+        except Exception:
+            pass
+        if not remote_objects:
+            self._task.get_logger().report_text(
+                "Could not list/find remote file(s) when adding {}".format(source_url)
+            )
+            return 0, 0
+        num_added = 0
+        num_modified = 0
+        for remote_object in remote_objects:
+            link = remote_object.get("name")
+            relative_path = link[len(source_url):]
+            if not relative_path:
+                relative_path = source_url.split("/")[-1]
+            if not matches_any_wildcard(relative_path, wildcard, recursive=recursive):
+                continue
+            try:
+                relative_path = Path(os.path.join(dataset_path or ".", relative_path)).as_posix()
+                size = remote_object.get("size")
+                already_added_file = self._dataset_file_entries.get(relative_path)
+                if relative_path not in self._dataset_link_entries:
+                    if verbose:
+                        self._task.get_logger().report_text(
+                            "External file {} added".format(link),
+                            print_console=False,
+                        )
+                    self._dataset_link_entries[relative_path] = LinkEntry(
+                        link=link, relative_path=relative_path, parent_dataset_id=self._id, size=size
+                    )
+                    num_added += 1
+                elif already_added_file and already_added_file.size != size:
+                    if verbose:
+                        self._task.get_logger().report_text(
+                            "External file {} modified".format(link),
+                            print_console=False,
+                        )
+                    del self._dataset_file_entries[relative_path]
+                    self._dataset_link_entries[relative_path] = LinkEntry(
+                        link=link, relative_path=relative_path, parent_dataset_id=self._id, size=size
+                    )
+                    num_modified += 1
+                elif (
+                    relative_path in self._dataset_link_entries
+                    and self._dataset_link_entries[relative_path].size != size
+                ):
+                    if verbose:
+                        self._task.get_logger().report_text(
+                            "External file {} modified".format(link),
+                            print_console=False,
+                        )
+                    self._dataset_link_entries[relative_path] = LinkEntry(
+                        link=link, relative_path=relative_path, parent_dataset_id=self._id, size=size
+                    )
+                    num_modified += 1
+                else:
+                    if verbose:
+                        self._task.get_logger().report_text(
+                            "External file {} skipped as it was not modified".format(link),
+                            print_console=False,
+                        )
+            except Exception as e:
+                if verbose:
+                    self._task.get_logger().report_text(
+                        "Error '{}' encountered trying to add external file {}".format(e, link),
+                        print_console=False,
+                    )
+        return num_added, num_modified
+
     def _build_chunk_selection(self, part, num_parts):
         # type: (int, int) -> Dict[str, int]
         """
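
The worker above iterates over metadata dictionaries rather than plain object names, which is what removes the per-link size lookups; a small sketch of the shape it expects (the bucket, paths and sizes are illustrative):

# Each entry comes from StorageManager.get_metadata() or StorageManager.list(..., with_metadata=True)
remote_objects = [
    {"name": "s3://my-bucket/images/train/0001.jpg", "size": 524288},
    {"name": "s3://my-bucket/images/train/0002.jpg", "size": None},  # size may be None on error
]
for remote_object in remote_objects:
    link = remote_object.get("name")
    size = remote_object.get("size")
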
@@ -595,8 +595,20 @@ class StorageHelper(object):
         :return: The size of the file in bytes.
             None if the file could not be found or an error occurred.
         """
-        size = None
         obj = self.get_object(remote_url, silence_errors=silence_errors)
+        return self._get_object_size_bytes(obj)
+
+    def _get_object_size_bytes(self, obj):
+        # type: (object, bool) -> [int, None]
+        """
+        Auxiliary function for `get_object_size_bytes`.
+        Get size of the remote object in bytes.
+
+        :param object obj: The remote object
+
+        :return: The size of the object in bytes.
+            None if an error occurred.
+        """
         if not obj:
             return None
         try:
@@ -615,6 +627,21 @@ class StorageHelper(object):
             pass
         return size

+    def get_object_metadata(self, obj):
+        # type: (object) -> dict
+        """
+        Get the metadata of the a remote object.
+        The metadata is a dict containing the following keys: `name`, `size`.
+
+        :param object obj: The remote object
+
+        :return: A dict containing the metadata of the remote object
+        """
+        return {
+            "name": obj.name if hasattr(obj, "name") else obj.url if hasattr(obj, "url") else None,
+            "size": self._get_object_size_bytes(obj),
+        }
+
     def verify_upload(self, folder_uri='', raise_on_error=True, log_on_error=True):
         """
         Verify that this helper can upload files to a folder.
@@ -716,12 +743,13 @@ class StorageHelper(object):
             res = quote_url(res)
         return res

-    def list(self, prefix=None):
+    def list(self, prefix=None, with_metadata=False):
         """
         List entries in the helper base path.

-        Return a list of names inside this helper base path. The base path is
-        determined at creation time and is specific for each storage medium.
+        Return a list of names inside this helper base path or a list of dictionaries containing
+        the objects' metadata. The base path is determined at creation time and is specific
+        for each storage medium.
         For Google Storage and S3 it is the bucket of the path.
         For local files it is the root directory.

@@ -731,11 +759,14 @@ class StorageHelper(object):
             must be a string - the path of a sub directory under the base path.
             the returned list will include only objects under that subdir.

-        :return: The paths of all the objects in the storage base
-            path under prefix. Listed relative to the base path.
+        :param with_metadata: Instead of returning just the names of the objects, return a list of dictionaries
+            containing the name and metadata of the remote file. Thus, each dictionary will contain the following
+            keys: `name`, `size`.
+
+        :return: The paths of all the objects in the storage base path under prefix or
+            a list of dictionaries containing the objects' metadata.
+            Listed relative to the base path.
         """

         if prefix:
             if prefix.startswith(self._base_url):
                 prefix = prefix[len(self.base_url):].lstrip("/")
@@ -746,15 +777,22 @@ class StorageHelper(object):
             res = self._driver.list_container_objects(self._container)

             result = [
-                obj.name
+                obj.name if not with_metadata else self.get_object_metadata(obj)
                 for obj in res
                 if (obj.name.startswith(prefix) or self._base_url == "file://") and obj.name != prefix
             ]
             if self._base_url == "file://":
-                result = [Path(f).as_posix() for f in result]
+                if not with_metadata:
+                    result = [Path(f).as_posix() for f in result]
+                else:
+                    for metadata_entry in result:
+                        metadata_entry["name"] = Path(metadata_entry["name"]).as_posix()
             return result
         else:
-            return [obj.name for obj in self._driver.list_container_objects(self._container)]
+            return [
+                obj.name if not with_metadata else self.get_object_metadata(obj)
+                for obj in self._driver.list_container_objects(self._container)
+            ]

     def download_to_file(
         self,
@@ -314,15 +314,21 @@ class StorageManager(object):

         :return: True is the remote_url stores a file and False otherwise
         """
-        if remote_url.startswith("file://"):
-            return os.path.isfile(remote_url[len("file://"):])
-        if remote_url.startswith("http://") or remote_url.startswith("https://"):
-            return requests.head(remote_url).status_code == requests.codes.ok
-        helper = StorageHelper.get(remote_url)
-        obj = helper.get_object(remote_url)
-        if not obj:
+        # noinspection PyBroadException
+        try:
+            if remote_url.endswith("/"):
+                return False
+            if remote_url.startswith("file://"):
+                return os.path.isfile(remote_url[len("file://"):])
+            if remote_url.startswith(("http://", "https://")):
+                return requests.head(remote_url).ok
+            helper = StorageHelper.get(remote_url)
+            obj = helper.get_object(remote_url)
+            if not obj:
+                return False
+            return True
+        except Exception:
             return False
-        return len(StorageManager.list(remote_url)) == 0

     @classmethod
     def get_file_size_bytes(cls, remote_url, silence_errors=False):
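
A short usage sketch of the reworked check (URLs are illustrative): exists_file() now answers False for directory-style URLs ending in "/", uses an HTTP HEAD request for http(s) links, and swallows storage errors instead of raising.

from clearml import StorageManager

StorageManager.exists_file("s3://my-bucket/data/train.csv")    # True if the object exists
StorageManager.exists_file("s3://my-bucket/data/")              # False: trailing "/" is treated as a folder
StorageManager.exists_file("https://example.com/missing.bin")   # False if the HEAD request is not ok
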
@@ -419,10 +425,11 @@ class StorageManager(object):
         return local_folder

     @classmethod
-    def list(cls, remote_url, return_full_path=False):
-        # type: (str, bool) -> Optional[List[str]]
+    def list(cls, remote_url, return_full_path=False, with_metadata=False):
+        # type: (str, bool) -> Optional[List[Union[str, dict]]]
         """
-        Return a list of object names inside the base path
+        Return a list of object names inside the base path or dictionaries containing the corresponding
+        objects' metadata (in case `with_metadata` is True)

         :param str remote_url: The base path.
             For Google Storage, Azure and S3 it is the bucket of the path, for local files it is the root directory.
@@ -431,17 +438,49 @@ class StorageManager(object):
             Azure blob storage: `azure://bucket/folder_` and also file system listing: `/mnt/share/folder_`
         :param bool return_full_path: If True, return a list of full object paths, otherwise return a list of
             relative object paths (default False).
+        :param with_metadata: Instead of returning just the names of the objects, return a list of dictionaries
+            containing the name and metadata of the remote file. Thus, each dictionary will contain the following
+            keys: `name`, `size`.
+            `return_full_path` will modify the name of each dictionary entry to the full path.

-        :return: The paths of all the objects in the storage base path under prefix, relative to the base path.
+        :return: The paths of all the objects the storage base path under prefix or the dictionaries containing the objects' metadata, relative to the base path.
             None in case of list operation is not supported (http and https protocols for example)
         """
         helper = StorageHelper.get(remote_url)
         try:
-            names_list = helper.list(prefix=remote_url)
+            helper_list_result = helper.list(prefix=remote_url, with_metadata=with_metadata)
         except Exception as ex:
             LoggerRoot.get_base_logger().warning("Can not list files for '{}' - {}".format(remote_url, ex))
-            names_list = None
+            return None

-        if helper.base_url == 'file://':
-            return ["{}/{}".format(remote_url.rstrip('/'), name) for name in names_list] if return_full_path else names_list
-        return ["{}/{}".format(helper.base_url, name) for name in names_list] if return_full_path else names_list
+        prefix = remote_url.rstrip("/") if helper.base_url == "file://" else helper.base_url
+        if not with_metadata:
+            return (
+                ["{}/{}".format(prefix, name) for name in helper_list_result]
+                if return_full_path
+                else helper_list_result
+            )
+        else:
+            if return_full_path:
+                for obj in helper_list_result:
+                    obj["name"] = "{}/{}".format(prefix, obj.get("name"))
+            return helper_list_result
+
+    @classmethod
+    def get_metadata(cls, remote_url):
+        # type: (str) -> Optional[dict]
+        """
+        Get the metadata of the a remote object.
+        The metadata is a dict containing the following keys: `name`, `size`.
+
+        :param str remote_url: Source remote storage location, tree structure of `remote_url` will
+            be created under the target local_folder. Supports S3/GS/Azure, shared filesystem and http(s).
+            Example: 's3://bucket/data/'
+
+        :return: A dict containing the metadata of the remote object. In case of an error, `None` is returned
+        """
+        helper = StorageHelper.get(remote_url)
+        obj = helper.get_object(remote_url)
+        if not obj:
+            return None
+        return helper.get_object_metadata(obj)
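
A usage sketch of the two StorageManager entry points this hunk adds or extends (the bucket and paths are illustrative):

from clearml import StorageManager

# Names only (previous behaviour, still the default)
names = StorageManager.list("s3://my-bucket/images/", return_full_path=True)

# With metadata: a list of {"name": ..., "size": ...} dicts; return_full_path
# rewrites each "name" to the full object path
objects = StorageManager.list("s3://my-bucket/images/", return_full_path=True, with_metadata=True)

# Single-object metadata, or None if the object cannot be resolved
meta = StorageManager.get_metadata("s3://my-bucket/images/0001.jpg")
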
@@ -238,7 +238,7 @@ class ParallelZipper(object):
             self.fd, self.zip_path = mkstemp(prefix=zip_prefix, suffix=zip_suffix)
             self.zip_path = Path(self.zip_path)
             self.zip_file = ZipFile(self.zip_path.as_posix(), "w", allowZip64=allow_zip_64, compression=compression)
-            self.archive_preview = ""
+            self.archive_preview = []
             self.count = 0
             self.files_zipped = set()

@@ -259,7 +259,7 @@ class ParallelZipper(object):
             preview_path = arcname
             if not preview_path:
                 preview_path = file_path
-            self.archive_preview += "{} - {}\n".format(preview_path, format_size(self.size))
+            self.archive_preview.append("{} - {}".format(preview_path, format_size(self.size)))
             self.files_zipped.add(Path(file_path).as_posix())
             if self._chunk_size <= 0 or self.size < self._chunk_size:
                 self._zipper_queue.put(self)
@@ -294,7 +294,7 @@ class ParallelZipper(object):
             parent_zip.writestr(child_name, child_zip.open(child_name).read())
         self.files_zipped |= other.files_zipped
         self.count += other.count
-        self.archive_preview += other.archive_preview
+        self.archive_preview.extend(other.archive_preview)

     def close(self):
         # type: () -> ()