mirror of
https://github.com/clearml/clearml
synced 2025-05-11 08:00:51 +00:00
Fix: adding a dataset folder that contains modified files uploaded all of its files instead of only the modified ones
This commit is contained in:
parent
330abbf9c0
commit
aa2f7b0e3e
@ -694,6 +694,8 @@ class Dataset(object):
|
|||||||
chunk_size = int(self._dataset_chunk_size_mb if not chunk_size else chunk_size)
|
chunk_size = int(self._dataset_chunk_size_mb if not chunk_size else chunk_size)
|
||||||
upload_futures = []
|
upload_futures = []
|
||||||
|
|
||||||
|
self._fix_dataset_files_parents()
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
with ThreadPoolExecutor(max_workers=max_workers) as pool:
|
||||||
parallel_zipper = ParallelZipper(
|
parallel_zipper = ParallelZipper(
|
||||||
chunk_size,
|
chunk_size,
|
||||||
@ -1341,6 +1343,20 @@ class Dataset(object):
|
|||||||
cls._set_project_system_tags(instance._task)
|
cls._set_project_system_tags(instance._task)
|
||||||
return instance
|
return instance
|
||||||
|
|
||||||
|
def _fix_dataset_files_parents(self):
    # type: () -> ()
    """
    Re-link this dataset's file entries to the dataset they match.

    Needed when someone removes and then adds the same file again: the
    parent data of that file entry is lost. For every dataset id listed
    under this dataset's own id in ``self._dependency_graph`` (presumably
    its parent datasets -- confirm against where the graph is built), the
    dataset is loaded with ``self.get(dataset_id=...)`` and each of its
    file entries is compared with the entry stored under the same key in
    ``self._dataset_file_entries``. When both entries have an equal
    ``hash``, this dataset's entry gets ``parent_dataset_id`` set to the
    id of the loaded dataset. If several listed datasets match the same
    file, the last one in iteration order wins.

    Nothing is loaded when this dataset has no file entries, and a missing
    dependency-graph record for this dataset is treated as "nothing to fix"
    instead of raising ``KeyError``.

    :return: None. Matching entries are updated in place.
    """
    own_entries = self._dataset_file_entries
    if not own_entries:
        # No entries could possibly match, so skip loading other datasets.
        return

    try:
        related_ids = self._dependency_graph[self._id]
    except KeyError:
        # No dependency record for this dataset: nothing to compare against.
        return

    for ds_id in related_ids:
        dataset = self.get(dataset_id=ds_id)
        for file_key, parent_entry in dataset._dataset_file_entries.items():
            # Single lookup; files this dataset does not hold are ignored.
            own_entry = own_entries.get(file_key)
            if own_entry is None:
                continue
            # Equal hash means the content is unchanged relative to ds_id.
            if parent_entry.hash == own_entry.hash:
                own_entry.parent_dataset_id = ds_id
|
||||||
|
|
||||||
def _get_total_size_compressed_parents(self):
|
def _get_total_size_compressed_parents(self):
|
||||||
# type: () -> int
|
# type: () -> int
|
||||||
"""
|
"""
|
||||||
|
Loading…
Reference in New Issue
Block a user