mirror of
https://github.com/clearml/clearml
synced 2025-02-07 05:18:50 +00:00
Don't download external files from parent datasets if they have been modified/removed in the child dataset
This commit is contained in:
parent
65c18798f4
commit
749a80a70a
@ -552,7 +552,7 @@ class Dataset(object):
|
|||||||
k: v
|
k: v
|
||||||
for k, v in self._dataset_link_entries.items()
|
for k, v in self._dataset_link_entries.items()
|
||||||
if not matches_any_wildcard(k, dataset_path, recursive=recursive)
|
if not matches_any_wildcard(k, dataset_path, recursive=recursive)
|
||||||
and not matches_any_wildcard(v.link, dataset_path, recursive=recursive)
|
and not (matches_any_wildcard(v.link, dataset_path, recursive=recursive) or v.link == dataset_path)
|
||||||
}
|
}
|
||||||
|
|
||||||
removed = 0
|
removed = 0
|
||||||
@ -2263,14 +2263,14 @@ class Dataset(object):
|
|||||||
|
|
||||||
def _get_dataset_files(
|
def _get_dataset_files(
|
||||||
self,
|
self,
|
||||||
force=False,
|
force=False, # type: bool
|
||||||
selected_chunks=None,
|
selected_chunks=None, # type: Optional[List[int]]
|
||||||
lock_target_folder=False,
|
lock_target_folder=False, # type: bool
|
||||||
cleanup_target_folder=True,
|
cleanup_target_folder=True, # type: bool
|
||||||
target_folder=None,
|
target_folder=None, # type: Optional[Path]
|
||||||
max_workers=None
|
max_workers=None, # type: Optional[int]
|
||||||
|
link_entries_of_interest=None # type: Optional[Dict[str, LinkEntry]]
|
||||||
):
|
):
|
||||||
# type: (bool, Optional[List[int]], bool, bool, Optional[Path], Optional[int]) -> str
|
|
||||||
"""
|
"""
|
||||||
First, extracts the archive present on the ClearML server containing this dataset's files.
|
First, extracts the archive present on the ClearML server containing this dataset's files.
|
||||||
Then, download the remote files. Note that if a remote file was added to the ClearML server, then
|
Then, download the remote files. Note that if a remote file was added to the ClearML server, then
|
||||||
@ -2287,6 +2287,8 @@ class Dataset(object):
|
|||||||
:param target_folder: If provided use the specified target folder, default, auto generate from Dataset ID.
|
:param target_folder: If provided use the specified target folder, default, auto generate from Dataset ID.
|
||||||
:param max_workers: Number of threads to be spawned when getting dataset files. Defaults
|
:param max_workers: Number of threads to be spawned when getting dataset files. Defaults
|
||||||
to the number of virtual cores.
|
to the number of virtual cores.
|
||||||
|
:param link_entries_of_interest: Download only the external files in this dictionary.
|
||||||
|
Useful when one doesn't want to download all the files in a parent dataset, as some files might be removed
|
||||||
|
|
||||||
:return: Path to the local storage where the data was downloaded
|
:return: Path to the local storage where the data was downloaded
|
||||||
"""
|
"""
|
||||||
@ -2300,14 +2302,21 @@ class Dataset(object):
|
|||||||
max_workers=max_workers
|
max_workers=max_workers
|
||||||
)
|
)
|
||||||
self._download_external_files(
|
self._download_external_files(
|
||||||
target_folder=target_folder, lock_target_folder=lock_target_folder, max_workers=max_workers
|
target_folder=target_folder,
|
||||||
|
lock_target_folder=lock_target_folder,
|
||||||
|
max_workers=max_workers,
|
||||||
|
link_entries_of_interest=link_entries_of_interest,
|
||||||
)
|
)
|
||||||
return local_folder
|
return local_folder
|
||||||
|
|
||||||
def _download_external_files(
|
def _download_external_files(
|
||||||
self, target_folder=None, lock_target_folder=False, max_workers=None
|
self,
|
||||||
|
target_folder=None,
|
||||||
|
lock_target_folder=False,
|
||||||
|
max_workers=None,
|
||||||
|
link_entries_of_interest=None
|
||||||
):
|
):
|
||||||
# (Union(Path, str), bool) -> None
|
# (Union(Path, str), bool, Optional[int], Optional[Dict[str, LinkEntry]]) -> None
|
||||||
"""
|
"""
|
||||||
Downloads external files in the dataset. These files will be downloaded
|
Downloads external files in the dataset. These files will be downloaded
|
||||||
at relative_path (the path relative to the target_folder). Note that
|
at relative_path (the path relative to the target_folder). Note that
|
||||||
@ -2318,6 +2327,8 @@ class Dataset(object):
|
|||||||
:param lock_target_folder: If True, local the target folder so the next cleanup will not delete
|
:param lock_target_folder: If True, local the target folder so the next cleanup will not delete
|
||||||
Notice you should unlock it manually, or wait for the process to finish for auto unlocking.
|
Notice you should unlock it manually, or wait for the process to finish for auto unlocking.
|
||||||
:param max_workers: Number of threads to be spawned when getting dataset files. Defaults to no multi-threading.
|
:param max_workers: Number of threads to be spawned when getting dataset files. Defaults to no multi-threading.
|
||||||
|
:param link_entries_of_interest: Download only the external files in this dictionary.
|
||||||
|
Useful when one doesn't want to download all the files in a parent dataset, as some files might be removed
|
||||||
"""
|
"""
|
||||||
def _download_link(link, target_path):
|
def _download_link(link, target_path):
|
||||||
if os.path.exists(target_path):
|
if os.path.exists(target_path):
|
||||||
@ -2370,12 +2381,13 @@ class Dataset(object):
|
|||||||
)[0]
|
)[0]
|
||||||
).as_posix()
|
).as_posix()
|
||||||
|
|
||||||
|
link_entries_of_interest = link_entries_of_interest or self._dataset_link_entries
|
||||||
if not max_workers:
|
if not max_workers:
|
||||||
for relative_path, link in self._dataset_link_entries.items():
|
for relative_path, link in link_entries_of_interest.items():
|
||||||
_submit_download_link(relative_path, link, target_folder)
|
_submit_download_link(relative_path, link, target_folder)
|
||||||
else:
|
else:
|
||||||
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
||||||
for relative_path, link in self._dataset_link_entries.items():
|
for relative_path, link in link_entries_of_interest.items():
|
||||||
_submit_download_link(relative_path, link, target_folder, pool=pool)
|
_submit_download_link(relative_path, link, target_folder, pool=pool)
|
||||||
|
|
||||||
def _extract_dataset_archive(
|
def _extract_dataset_archive(
|
||||||
@ -3224,7 +3236,8 @@ class Dataset(object):
|
|||||||
force=force,
|
force=force,
|
||||||
lock_target_folder=True,
|
lock_target_folder=True,
|
||||||
cleanup_target_folder=False,
|
cleanup_target_folder=False,
|
||||||
max_workers=max_workers
|
max_workers=max_workers,
|
||||||
|
link_entries_of_interest=self._dataset_link_entries
|
||||||
))
|
))
|
||||||
ds_base_folder.touch()
|
ds_base_folder.touch()
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user