Mirror of https://github.com/clearml/clearml (synced 2025-06-26 18:16:07 +00:00)

Commit 844c01e15b (parent 0dd9ba8adc)
Add clearml-Data (Datasets) multi-chunk support
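The diff below adds chunked (multi-part) storage for dataset artifacts: a --chunk-size flag on the clearml-data sync, upload and finalize commands, a matching chunk_size argument on Dataset.upload(), and --part / --num-parts (part / num_parts in the SDK) for fetching a partial local copy. As a rough usage sketch of the new options (not part of the commit; it assumes a draft dataset with pending files and uses '<dataset-id>' as a placeholder):

from clearml import Dataset

# Assumed: '<dataset-id>' refers to a dataset still in draft state with pending files.
ds = Dataset.get(dataset_id='<dataset-id>')

# New in this commit: split the compressed change-set into ~512 MB zip artifacts
# instead of one monolithic zip (chunk_size=-1 keeps the old single-zip behaviour).
# CLI equivalent: clearml-data upload --id <dataset-id> --chunk-size 512
ds.upload(show_progress=True, chunk_size=512)
ds.finalize()

# New in this commit: fetch only part 0 of 4 of the finalized dataset,
# e.g. on one worker of a multi-node job.
# CLI equivalent: clearml-data get --id <dataset-id> --part 0 --num-parts 4
local_folder = Dataset.get(dataset_id='<dataset-id>').get_local_copy(part=0, num_parts=4)
print(local_folder)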
@@ -179,8 +179,8 @@ class PrintPatchLogger(object):
     cr_flush_period = None

     def __init__(self, stream, logger=None, level=logging.INFO):
-        if self.__class__.cr_flush_period is None:
-            self.__class__.cr_flush_period = config.get("development.worker.console_cr_flush_period", 0)
+        if PrintPatchLogger.cr_flush_period is None:
+            PrintPatchLogger.cr_flush_period = config.get("development.worker.console_cr_flush_period", 0)
         PrintPatchLogger.patched = True
         self._terminal = stream
         self._log = logger
@@ -1266,6 +1266,34 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin):
             self._edit(execution=execution)
         return self.data.execution.artifacts or []

+    def _delete_artifacts(self, artifact_names):
+        # type: (Sequence[str]) -> bool
+        """
+        Delete a list of artifacts, by artifact name, from the Task.
+
+        :param list artifact_names: list of artifact names
+        :return: True if successful
+        """
+        if not Session.check_min_api_version('2.3'):
+            return False
+        if not isinstance(artifact_names, (list, tuple)):
+            raise ValueError('Expected artifact names as List[str]')
+
+        with self._edit_lock:
+            if Session.check_min_api_version("2.13") and not self._offline_mode:
+                req = tasks.DeleteArtifactsRequest(
+                    task=self.task_id, artifacts=[{"key": n, "mode": "output"} for n in artifact_names], force=True)
+                res = self.send(req, raise_on_errors=False)
+                if not res or not res.response or not res.response.deleted:
+                    return False
+                self.reload()
+            else:
+                self.reload()
+                execution = self.data.execution
+                execution.artifacts = [a for a in execution.artifacts or [] if a.key not in artifact_names]
+                self._edit(execution=execution)
+        return self.data.execution.artifacts or []
+
     def _set_model_design(self, design=None):
         # type: (str) -> ()
         with self._edit_lock:
@@ -85,7 +85,7 @@ def get_epoch_beginning_of_time(timezone_info=None):
     return datetime(1970, 1, 1).replace(tzinfo=timezone_info if timezone_info else utc_timezone)


-def get_single_result(entity, query, results, log=None, show_results=10, raise_on_error=True, sort_by_date=True):
+def get_single_result(entity, query, results, log=None, show_results=1, raise_on_error=True, sort_by_date=True):
     if not results:
         if not raise_on_error:
             return None
@@ -96,8 +96,12 @@ def get_single_result(entity, query, results, log=None, show_results=10, raise_o
     if show_results:
         if not log:
             log = get_logger()
-        log.warning('More than one {entity} found when searching for `{query}`'
-                    ' (showing first {show_results} {entity}s follow)'.format(**locals()))
+        if show_results > 1:
+            log.warning('{num} {entity} found when searching for `{query}`'
+                        ' (showing first {show_results} {entity}s follow)'.format(num=len(results), **locals()))
+        else:
+            log.warning('{num} {entity} found when searching for `{query}`'.format(num=len(results), **locals()))
+
     if sort_by_date:
         relative_time = get_epoch_beginning_of_time()
         # sort results based on timestamp and return the newest one
@@ -70,9 +70,7 @@ def cli():
     subparsers = parser.add_subparsers(help='Dataset actions', dest='command')

     create = subparsers.add_parser('create', help='Create a new dataset')
-    create.add_argument('--parents', type=str, nargs='*',
-                        help='[Optional] Specify dataset parents IDs (i.e. merge all parents). '
-                             'Example: a17b4fID1 f0ee5ID2 a17b4f09eID3')
+    create.add_argument('--parents', type=str, nargs='*', help='Specify dataset parents IDs (i.e. merge all parents)')
     create.add_argument('--project', type=str, required=False, default=None, help='Dataset project name')
     create.add_argument('--name', type=str, required=True, default=None, help='Dataset name')
     create.add_argument('--tags', type=str, nargs='*', help='Dataset user Tags')
@@ -100,20 +98,22 @@ def cli():
                       help='Local folder to sync (support for wildcard selection). '
                            'Example: ~/data/*.jpg')
     sync.add_argument('--parents', type=str, nargs='*',
-                      help='[Optional] Specify dataset parents IDs (i.e. merge all parents). '
-                           'Example: a17b4fID1 f0ee5ID2 a17b4f09eID3')
+                      help='[Optional - Create new dataset] Specify dataset parents IDs (i.e. merge all parents)')
     sync.add_argument('--project', type=str, required=False, default=None,
-                      help='[Optional] Dataset project name')
+                      help='[Optional - Create new dataset] Dataset project name')
     sync.add_argument('--name', type=str, required=False, default=None,
-                      help='[Optional] Dataset project name')
+                      help='[Optional - Create new dataset] Dataset project name')
     sync.add_argument('--tags', type=str, nargs='*',
-                      help='[Optional] Dataset user Tags')
+                      help='[Optional - Create new dataset] Dataset user Tags')
     sync.add_argument('--storage', type=str, default=None,
                       help='Remote storage to use for the dataset files (default: files_server). '
                            'Examples: \'s3://bucket/data\', \'gs://bucket/data\', \'azure://bucket/data\', '
                            '\'/mnt/shared/folder/data\'')
     sync.add_argument('--skip-close', action='store_true', default=False,
                       help='Do not auto close dataset after syncing folders')
+    sync.add_argument('--chunk-size', default=-1, type=int,
+                      help='Set dataset artifact chunk size in MB. Default -1, unlimited size. '
+                           'Example: 512, dataset will be split and uploaded in 512mb chunks.')
     sync.add_argument('--verbose', action='store_true', default=False, help='Verbose reporting')
     sync.set_defaults(func=ds_sync)

@@ -136,6 +136,9 @@ def cli():
                         help='Remote storage to use for the dataset files (default: files_server). '
                              'Examples: \'s3://bucket/data\', \'gs://bucket/data\', \'azure://bucket/data\', '
                              '\'/mnt/shared/folder/data\'')
+    upload.add_argument('--chunk-size', default=-1, type=int,
+                        help='Set dataset artifact chunk size in MB. Default -1, unlimited size. '
+                             'Example: 512, dataset will be split and uploaded in 512mb chunks.')
     upload.add_argument('--verbose', default=False, action='store_true', help='Verbose reporting')
     upload.set_defaults(func=ds_upload)

@@ -148,6 +151,9 @@ def cli():
                               '\'/mnt/shared/folder/data\'')
     finalize.add_argument('--disable-upload', action='store_true', default=False,
                           help='Disable automatic upload when closing the dataset')
+    finalize.add_argument('--chunk-size', default=-1, type=int,
+                          help='Set dataset artifact chunk size in MB. Default -1, unlimited size. '
+                               'Example: 512, dataset will be split and uploaded in 512mb chunks.')
     finalize.add_argument('--verbose', action='store_true', default=False, help='Verbose reporting')
     finalize.set_defaults(func=ds_close)

@@ -216,6 +222,14 @@ def cli():
     get.add_argument('--link', type=str, default=None,
                      help='Create a soft link (not supported on Windows) to a '
                           'read-only cached folder containing the dataset')
+    get.add_argument('--part', type=int, default=None,
+                     help='Retrieve a partial copy of the dataset. '
+                          'Part number (0 to `num-parts`-1) of total parts --num-parts.')
+    get.add_argument('--num-parts', type=int, default=None,
+                     help='Total number of parts to divide the dataset to. '
+                          'Notice minimum retrieved part is a single chunk in a dataset (or its parents).'
+                          'Example: Dataset gen4, with 3 parents, each with a single chunk, '
+                          'can be divided into 4 parts')
     get.add_argument('--overwrite', action='store_true', default=False, help='If True, overwrite the target folder')
     get.add_argument('--verbose', action='store_true', default=False, help='Verbose reporting')
     get.set_defaults(func=ds_get)
@@ -274,7 +288,7 @@ def ds_get(args):
             pass
     if args.copy:
         ds_folder = args.copy
-        ds.get_mutable_local_copy(target_folder=ds_folder)
+        ds.get_mutable_local_copy(target_folder=ds_folder, part=args.part, num_parts=args.num_parts)
     else:
         if args.link:
             Path(args.link).mkdir(parents=True, exist_ok=True)
@@ -286,7 +300,7 @@ def ds_get(args):
                 Path(args.link).unlink()
             except Exception:
                 raise ValueError("Target directory {} is not empty. Use --overwrite.".format(args.link))
-        ds_folder = ds.get_local_copy()
+        ds_folder = ds.get_local_copy(part=args.part, num_parts=args.num_parts)
         if args.link:
             os.symlink(ds_folder, args.link)
             ds_folder = args.link
@@ -372,7 +386,10 @@ def ds_close(args):
             raise ValueError("Pending uploads, cannot finalize dataset. run `clearml-data upload`")
         # upload the files
         print("Pending uploads, starting dataset upload to {}".format(args.storage or ds.get_default_storage()))
-        ds.upload(show_progress=True, verbose=args.verbose, output_url=args.storage or None)
+        ds.upload(show_progress=True,
+                  verbose=args.verbose,
+                  output_url=args.storage or None,
+                  chunk_size=args.chunk_size or -1,)

     ds.finalize()
     print('Dataset closed and finalized')
@@ -399,7 +416,7 @@ def ds_upload(args):
     check_null_id(args)
     print_args(args)
     ds = Dataset.get(dataset_id=args.id)
-    ds.upload(verbose=args.verbose, output_url=args.storage or None)
+    ds.upload(verbose=args.verbose, output_url=args.storage or None, chunk_size=args.chunk_size or -1)
     print('Dataset upload completed')
     return 0

@@ -443,7 +460,10 @@ def ds_sync(args):
     if ds.is_dirty():
         # upload the files
         print("Pending uploads, starting dataset upload to {}".format(args.storage or ds.get_default_storage()))
-        ds.upload(show_progress=True, verbose=args.verbose, output_url=args.storage or None)
+        ds.upload(show_progress=True,
+                  verbose=args.verbose,
+                  output_url=args.storage or None,
+                  chunk_size=args.chunk_size or -1, )

     ds.finalize()
     print('Dataset closed and finalized')
@@ -16,6 +16,7 @@ from .. import Task, StorageManager, Logger
 from ..backend_api.session.client import APIClient
 from ..backend_interface.task.development.worker import DevWorker
 from ..backend_interface.util import mutually_exclusive, exact_match_regex
+from ..config import deferred_config
 from ..debugging.log import LoggerRoot
 from ..storage.helper import StorageHelper
 from ..storage.cache import CacheManager
@@ -33,6 +34,8 @@ class FileEntry(object):
     hash = attrib(default=None, type=str)
     parent_dataset_id = attrib(default=None, type=str)
     size = attrib(default=None, type=int)
+    # support multi part artifact storage
+    artifact_name = attrib(default=None, type=str)
     # cleared when file is uploaded.
     local_path = attrib(default=None, type=str)

@@ -40,6 +43,7 @@ class FileEntry(object):
         # type: () -> Dict
         state = dict(relative_path=self.relative_path, hash=self.hash,
                      parent_dataset_id=self.parent_dataset_id, size=self.size,
+                     artifact_name=self.artifact_name,
                      **dict([('local_path', self.local_path)] if self.local_path else ()))
         return state

@@ -47,13 +51,15 @@ class FileEntry(object):
 class Dataset(object):
     __private_magic = 42 * 1337
     __state_entry_name = 'state'
-    __data_entry_name = 'data'
+    __default_data_entry_name = 'data'
+    __data_entry_name_prefix = 'data_'
     __cache_context = 'datasets'
     __tag = 'dataset'
     __cache_folder_prefix = 'ds_'
     __dataset_folder_template = CacheManager.set_context_folder_lookup(__cache_context, "{0}_archive_{1}")
     __preview_max_file_entries = 15000
     __preview_max_size = 5 * 1024 * 1024
+    _dataset_chunk_size_mb = deferred_config("storage.dataset_chunk_size_mb", 512, transform=int)

     def __init__(self, _private, task=None, dataset_project=None, dataset_name=None, dataset_tags=None):
         # type: (int, Optional[Task], Optional[str], Optional[str], Optional[Sequence[str]]) -> ()
@@ -68,8 +74,14 @@ class Dataset(object):
         if task:
             self._task_pinger = None
             self._created_task = False
+            task_status = task.data.status
+            # if we are continuing aborted Task, force the state
+            if str(task_status) == 'stopped':
+                task.mark_started(force=True)
+                task_status = 'in_progress'
+
             # If we are reusing the main current Task, make sure we set its type to data_processing
-            if str(task.data.status) in ('created', 'in_progress'):
+            if str(task_status) in ('created', 'in_progress'):
                 if str(task.task_type) != str(Task.TaskTypes.data_processing):
                     task.set_task_type(task_type=Task.TaskTypes.data_processing)
                     task.set_system_tags((task.get_system_tags() or []) + [self.__tag])
@@ -114,6 +126,11 @@ class Dataset(object):
         # dirty flag, set True by any function call changing the dataset (regardless of weather it did anything)
         self._dirty = False
         self._using_current_task = False
+        # set current artifact name to be used (support for multiple upload sessions)
+        self._data_artifact_name = self._get_next_data_artifact_name()
+        # store a cached lookup of the number of chunks each parent dataset has.
+        # this will help with verifying we have n up-to-date partial local copy
+        self._dependency_chunk_lookup = None  # type: Optional[Dict[str, int]]

     @property
     def id(self):
@@ -247,8 +264,8 @@ class Dataset(object):
     def sync_folder(self, local_path, dataset_path=None, verbose=False):
         # type: (Union[Path, _Path, str], Union[Path, _Path, str], bool) -> (int, int)
         """
-        Synchronize the dataset with a local folder. The dataset is synchronized recursively from the `local_path` into
-        the `dataset_path` (default: dataset root).
+        Synchronize the dataset with a local folder. The dataset is synchronized from the
+        relative_base_folder (default: dataset root) and deeper with the specified local path.

         :param local_path: Local folder to sync (assumes all files and recursive)
         :param dataset_path: Target dataset path to sync with (default the root of the dataset)
@@ -294,8 +311,8 @@ class Dataset(object):
         self._serialize()
         return removed_files, added_files

-    def upload(self, show_progress=True, verbose=False, output_url=None, compression=None):
-        # type: (bool, bool, Optional[str], Optional[str]) -> ()
+    def upload(self, show_progress=True, verbose=False, output_url=None, compression=None, chunk_size=None):
+        # type: (bool, bool, Optional[str], Optional[str], int) -> ()
         """
         Start file uploading, the function returns when all files are uploaded.

@@ -304,6 +321,9 @@ class Dataset(object):
         :param output_url: Target storage for the compressed dataset (default: file server)
             Examples: `s3://bucket/data`, `gs://bucket/data` , `azure://bucket/data` , `/mnt/share/data`
         :param compression: Compression algorithm for the Zipped dataset file (default: ZIP_DEFLATED)
+        :param chunk_size: Artifact chunk size (MB) for the compressed dataset,
+            if not provided (None) use the default chunk size (512mb).
+            If -1 is provided, use a single zip artifact for the entire dataset change-set (old behaviour)
         """
         # set output_url
         if output_url:
@@ -314,16 +334,34 @@ class Dataset(object):
                 dict(show_progress=show_progress, verbose=verbose, output_url=output_url, compression=compression)),
             print_console=False)

+        list_zipped_artifacts = []  # List[Tuple[Path, int, str, str]]
+        list_file_entries = list(self._dataset_file_entries.values())
+        total_size = 0
+        chunk_size = int(self._dataset_chunk_size_mb if not chunk_size else chunk_size)
+        try:
+            from tqdm import tqdm  # noqa
+            a_tqdm = tqdm(total=len(list_file_entries))
+        except ImportError:
+            a_tqdm = None
+
+        while list_file_entries:
             fd, zip_file = mkstemp(
                 prefix='dataset.{}.'.format(self._id), suffix='.zip'
             )
             archive_preview = ''
             count = 0
+            processed = 0
+            zip_file = Path(zip_file)
+            print('{}Compressing local files, chunk {} [remaining {} files]'.format(
+                '\n' if a_tqdm else '', 1+len(list_zipped_artifacts), len(list_file_entries)))
             try:
-                with ZipFile(zip_file, 'w', allowZip64=True, compression=compression or ZIP_DEFLATED) as zf:
-                    for file_entry in self._dataset_file_entries.values():
+                with ZipFile(zip_file.as_posix(), 'w', allowZip64=True, compression=compression or ZIP_DEFLATED) as zf:
+                    for file_entry in list_file_entries:
+                        processed += 1
+                        if a_tqdm:
+                            a_tqdm.update()
                         if not file_entry.local_path:
-                            # file is located in a different version
+                            # file is already in an uploaded artifact
                             continue
                         filename = Path(file_entry.local_path)
                         if not filename.is_file():
@@ -339,8 +377,12 @@ class Dataset(object):
                         zf.write(filename.as_posix(), arcname=relative_file_name)
                         archive_preview += '{} - {}\n'.format(
                             relative_file_name, format_size(filename.stat().st_size))
-                        file_entry.local_path = None
+                        file_entry.artifact_name = self._data_artifact_name
                         count += 1
+
+                        # limit the size of a single artifact
+                        if chunk_size > 0 and zip_file.stat().st_size >= chunk_size * (1024**2):
+                            break
             except Exception as e:
                 # failed uploading folder:
                 LoggerRoot.get_base_logger().warning(
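For reference, the chunk-size handling added above works in two steps: chunk_size falls back to the storage.dataset_chunk_size_mb configuration default (512) when not given, and the current zip is closed once it reaches chunk_size * 1024**2 bytes on disk. A minimal self-contained sketch of that arithmetic, with illustrative names that are not part of the commit:

DEFAULT_DATASET_CHUNK_SIZE_MB = 512  # mirrors the storage.dataset_chunk_size_mb default above

def effective_chunk_size_mb(chunk_size=None):
    # None (or 0) -> configured default; -1 -> unlimited, i.e. the old single-zip behaviour
    return int(DEFAULT_DATASET_CHUNK_SIZE_MB if not chunk_size else chunk_size)

def should_close_chunk(zip_size_bytes, chunk_size_mb):
    # matches: chunk_size > 0 and zip_file.stat().st_size >= chunk_size * (1024**2)
    return chunk_size_mb > 0 and zip_size_bytes >= chunk_size_mb * (1024 ** 2)

assert effective_chunk_size_mb() == 512
assert effective_chunk_size_mb(-1) == -1                 # never splits
assert should_close_chunk(513 * 1024 ** 2, 512) is True  # 513 MB zip with 512 MB chunks
assert should_close_chunk(10 * 1024 ** 2, -1) is False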
@@ -349,17 +391,35 @@ class Dataset(object):
             finally:
                 os.close(fd)

-        zip_file = Path(zip_file)
-
             if not count:
                 zip_file.unlink()
+
+            total_size += zip_file.stat().st_size
+            # let's see what's left
+            list_file_entries = list_file_entries[processed:]
+            # update the artifact preview
+            archive_preview = 'Dataset archive content [{} files]:\n'.format(count) + archive_preview
+            # add into the list
+            list_zipped_artifacts += [(zip_file, count, archive_preview, self._data_artifact_name)]
+            # next artifact name to use
+            self._data_artifact_name = self._get_next_data_artifact_name(self._data_artifact_name)
+
+        if a_tqdm:
+            a_tqdm.close()
+
+        self._task.get_logger().report_text(
+            'File compression completed: total size {}, {} chunked stored (average size {})'.format(
+                format_size(total_size),
+                len(list_zipped_artifacts),
+                format_size(total_size / len(list_zipped_artifacts))))
+
+        if not list_zipped_artifacts:
             LoggerRoot.get_base_logger().info('No pending files, skipping upload.')
             self._dirty = False
             self._serialize()
             return True

-        archive_preview = 'Dataset archive content [{} files]:\n'.format(count) + archive_preview
-
+        for i, (zip_file, count, archive_preview, artifact_name) in enumerate(list_zipped_artifacts):
             # noinspection PyBroadException
             try:
                 # let's try to rename it
@@ -368,18 +428,29 @@ class Dataset(object):
                 zip_file = new_zip_file
             except Exception:
                 pass
-        # remove files that could not be zipped, containing Null relative Path
-        self._dataset_file_entries = {k: v for k, v in self._dataset_file_entries.items()
-                                      if v.relative_path is not None}
             # start upload
             zip_file_size = format_size(Path(zip_file).stat().st_size)
             self._task.get_logger().report_text(
-                'Uploading compressed dataset changes ({} files, total {}) to {}'.format(
-                    count, zip_file_size, self.get_default_storage()))
+                'Uploading compressed dataset changes {}/{} ({} files {}) to {}'.format(
+                    i+1, len(list_zipped_artifacts), count, zip_file_size, self.get_default_storage()))
+
             self._task.upload_artifact(
-                name=self.__data_entry_name, artifact_object=Path(zip_file), preview=archive_preview,
+                name=artifact_name, artifact_object=Path(zip_file), preview=archive_preview,
                 delete_after_upload=True, wait_on_upload=True)
-        self._task.get_logger().report_text('Upload completed ({})'.format(zip_file_size))
+            # mark as upload completed and serialize
+            for file_entry in self._dataset_file_entries.values():
+                if file_entry.parent_dataset_id == self._id and file_entry.artifact_name == artifact_name:
+                    file_entry.local_path = None
+            # serialize current state
+            self._serialize()
+
+        # remove files that could not be zipped, containing Null relative Path
+        self._dataset_file_entries = {
+            k: v for k, v in self._dataset_file_entries.items() if v.relative_path is not None}
+        # report upload completed
+        self._task.get_logger().report_text('Upload completed ({})'.format(format_size(total_size)))

         self._add_script_call(
             'upload', show_progress=show_progress, verbose=verbose, output_url=output_url, compression=compression)
@@ -409,7 +480,7 @@ class Dataset(object):
         self._task.get_logger().report_text('Finalizing dataset', print_console=False)

         # make sure we have no redundant parent versions
-        self._serialize()
+        self._serialize(update_dependency_chunk_lookup=True)
         self._add_script_call('finalize')
         if verbose:
             print('Updating statistics and genealogy')
@@ -453,14 +524,26 @@ class Dataset(object):
         return self._task.get_status() not in (
             Task.TaskStatusEnum.in_progress, Task.TaskStatusEnum.created, Task.TaskStatusEnum.failed)

-    def get_local_copy(self, use_soft_links=None, raise_on_error=True):
-        # type: (bool, bool) -> str
+    def get_local_copy(self, use_soft_links=None, part=None, num_parts=None, raise_on_error=True):
+        # type: (bool, Optional[int], Optional[int], bool) -> str
         """
         return a base folder with a read-only (immutable) local copy of the entire dataset
         download and copy / soft-link, files from all the parent dataset versions

         :param use_soft_links: If True use soft links, default False on windows True on Posix systems
+        :param part: Optional, if provided only download the selected part (index) of the Dataset.
+            First part number is `0` and last part is `num_parts-1`
+            Notice, if `num_parts` is not provided, number of parts will be equal to the total number of chunks
+            (i.e. sum over all chunks from the specified Dataset including all parent Datasets).
+            This argument is passed to parent datasets, as well as the implicit `num_parts`,
+            allowing users to get a partial copy of the entire dataset, for multi node/step processing.
+        :param num_parts: Optional, If specified normalize the number of chunks stored to the
+            requested number of parts. Notice that the actual chunks used per part are rounded down.
+            Example: Assuming total 8 chunks for this dataset (including parent datasets),
+            and `num_parts=5`, the chunk index used per parts would be:
+            part=0 -> chunks[0,5], part=1 -> chunks[1,6], part=2 -> chunks[2,7], part=3 -> chunks[3, ]
         :param raise_on_error: If True raise exception if dataset merging failed on any file

         :return: A base folder for the entire dataset
         """
         assert self._id
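The part / num_parts docstring above is easiest to read alongside the worked example it quotes: with 8 chunks and num_parts=5, each part takes every fifth chunk starting at its own index, and the chunks per part are rounded down. A small sketch reproducing that mapping (an illustration of the documented behaviour, not the internal _build_chunk_selection implementation):

def chunks_for_part(part, num_parts, total_chunks):
    # part=0..num_parts-1; each part takes every num_parts-th chunk starting at its index
    return list(range(part, total_chunks, num_parts))

# 8 chunks, num_parts=5 reproduces the docstring example:
# part=0 -> [0, 5], part=1 -> [1, 6], part=2 -> [2, 7], part=3 -> [3], part=4 -> [4]
assert [chunks_for_part(p, 5, 8) for p in range(5)] == [[0, 5], [1, 6], [2, 7], [3], [4]]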
@@ -468,11 +551,12 @@ class Dataset(object):
             self._task = Task.get_task(task_id=self._id)

         # now let's merge the parents
-        target_folder = self._merge_datasets(use_soft_links=use_soft_links, raise_on_error=raise_on_error)
+        target_folder = self._merge_datasets(
+            use_soft_links=use_soft_links, raise_on_error=raise_on_error, part=part, num_parts=num_parts)
         return target_folder

-    def get_mutable_local_copy(self, target_folder, overwrite=False, raise_on_error=True):
-        # type: (Union[Path, _Path, str], bool, bool) -> Optional[str]
+    def get_mutable_local_copy(self, target_folder, overwrite=False, part=None, num_parts=None, raise_on_error=True):
+        # type: (Union[Path, _Path, str], bool, Optional[int], Optional[int], bool) -> Optional[str]
         """
         return a base folder with a writable (mutable) local copy of the entire dataset
         download and copy / soft-link, files from all the parent dataset versions
@@ -480,7 +564,19 @@ class Dataset(object):
         :param target_folder: Target folder for the writable copy
         :param overwrite: If True, recursively delete the target folder before creating a copy.
             If False (default) and target folder contains files, raise exception or return None
+        :param part: Optional, if provided only download the selected part (index) of the Dataset.
+            First part number is `0` and last part is `num_parts-1`
+            Notice, if `num_parts` is not provided, number of parts will be equal to the total number of chunks
+            (i.e. sum over all chunks from the specified Dataset including all parent Datasets).
+            This argument is passed to parent datasets, as well as the implicit `num_parts`,
+            allowing users to get a partial copy of the entire dataset, for multi node/step processing.
+        :param num_parts: Optional, If specified normalize the number of chunks stored to the
+            requested number of parts. Notice that the actual chunks used per part are rounded down.
+            Example: Assuming total 8 chunks for this dataset (including parent datasets),
+            and `num_parts=5`, the chunk index used per parts would be:
+            part=0 -> chunks[0,5], part=1 -> chunks[1,6], part=2 -> chunks[2,7], part=3 -> chunks[3, ]
         :param raise_on_error: If True raise exception if dataset merging failed on any file

         :return: A the target folder containing the entire dataset
         """
         assert self._id
@@ -497,8 +593,8 @@ class Dataset(object):
                 return None
             shutil.rmtree(target_folder.as_posix())

-        ro_folder = self.get_local_copy(raise_on_error=raise_on_error)
-        shutil.copytree(ro_folder, target_folder.as_posix())
+        ro_folder = self.get_local_copy(part=part, num_parts=num_parts, raise_on_error=raise_on_error)
+        shutil.copytree(ro_folder, target_folder.as_posix(), symlinks=False)
         return target_folder.as_posix()

     def list_files(self, dataset_path=None, recursive=True, dataset_id=None):
@@ -680,7 +776,7 @@ class Dataset(object):
             If True, the dataset is created on the current Task.
         :return: Newly created Dataset object
         """
-        parent_datasets = [cls.get(dataset_id=p) if isinstance(p, str) else p for p in (parent_datasets or [])]
+        parent_datasets = [cls.get(dataset_id=p) if not isinstance(p, Dataset) else p for p in (parent_datasets or [])]
         if any(not p.is_final() for p in parent_datasets):
             raise ValueError("Cannot inherit from a parent that was not finalized/closed")

@@ -831,15 +927,19 @@ class Dataset(object):
         if task.status == 'created':
             raise ValueError('Dataset id={} is in draft mode, delete and recreate it'.format(task.id))
         force_download = False if task.status in ('stopped', 'published', 'closed', 'completed') else True
+        if cls.__state_entry_name in task.artifacts:
             local_state_file = StorageManager.get_local_copy(
                 remote_url=task.artifacts[cls.__state_entry_name].url, cache_context=cls.__cache_context,
                 extract_archive=False, name=task.id, force_download=force_download)
             if not local_state_file:
                 raise ValueError('Could not load Dataset id={} state'.format(task.id))
+        else:
+            # we could not find the serialized state, start empty
+            local_state_file = {}

         instance = cls._deserialize(local_state_file, task)
         # remove the artifact, just in case
-        if force_download:
+        if force_download and local_state_file:
             os.unlink(local_state_file)

         return instance
@@ -853,6 +953,23 @@ class Dataset(object):
         """
         return self._task.get_logger()

+    def get_num_chunks(self, include_parents=True):
+        # type: (bool) -> int
+        """
+        Return the number of chunks stored on this dataset
+        (it does not imply on the number of chunks parent versions store)
+
+        :param include_parents: If True (default),
+            return the total number of chunks from this version and all parent versions.
+            If False, only return the number of chunks we stored on this specific version.
+
+        :return: Number of chunks stored on the dataset.
+        """
+        if not include_parents:
+            return len(self._get_data_artifact_names())
+
+        return sum(self._get_dependency_chunk_lookup().values())
+
     @classmethod
     def squash(cls, dataset_name, dataset_ids=None, dataset_project_name_pairs=None, output_url=None):
         # type: (str, Optional[Sequence[Union[str, Dataset]]],Optional[Sequence[(str, str)]], Optional[str]) -> Dataset
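A possible way to combine the new get_num_chunks() with partial copies is to cap the number of parts at the number of available chunks, since the docstrings above note that the minimum retrieved part is a single chunk. This is a hedged sketch, not an API recommendation from the commit; worker_rank and worker_count are caller-supplied placeholders:

from clearml import Dataset

def fetch_my_share(dataset_id, worker_rank, worker_count):
    ds = Dataset.get(dataset_id=dataset_id)
    # total chunks across this version and all parents (include_parents=True is the default)
    total_chunks = ds.get_num_chunks()
    # do not request more parts than there are chunks; each part maps to at least one chunk
    num_parts = min(worker_count, total_chunks)
    if worker_rank >= num_parts:
        return None  # more workers than chunks: nothing left for this worker
    return ds.get_local_copy(part=worker_rank, num_parts=num_parts)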
@@ -987,7 +1104,7 @@ class Dataset(object):
         self._task.get_logger().report_text('Generating SHA2 hash for {} files'.format(len(file_entries)))
         pool = ThreadPool(cpu_count() * 2)
         try:
-            import tqdm
+            import tqdm  # noqa
             for _ in tqdm.tqdm(pool.imap_unordered(self._calc_file_hash, file_entries), total=len(file_entries)):
                 pass
         except ImportError:
|
|||||||
self._dataset_file_entries[f.relative_path] = f
|
self._dataset_file_entries[f.relative_path] = f
|
||||||
count += 1
|
count += 1
|
||||||
elif f.parent_dataset_id == self._id and ds_cur_f.parent_dataset_id == self._id:
|
elif f.parent_dataset_id == self._id and ds_cur_f.parent_dataset_id == self._id:
|
||||||
|
# check if we have the file in an already uploaded chunk
|
||||||
|
if ds_cur_f.local_path is None:
|
||||||
|
# skipping, already uploaded.
|
||||||
|
if verbose:
|
||||||
|
self._task.get_logger().report_text('Skipping {}'.format(f.relative_path))
|
||||||
|
else:
|
||||||
|
# if we never uploaded it, mark for upload
|
||||||
if verbose:
|
if verbose:
|
||||||
self._task.get_logger().report_text('Re-Added {}'.format(f.relative_path))
|
self._task.get_logger().report_text('Re-Added {}'.format(f.relative_path))
|
||||||
self._dataset_file_entries[f.relative_path] = f
|
self._dataset_file_entries[f.relative_path] = f
|
||||||
@@ -1028,17 +1152,22 @@ class Dataset(object):
         # collect all dataset versions
         used_dataset_versions = set(f.parent_dataset_id for f in self._dataset_file_entries.values())
         used_dataset_versions.add(self._id)
-        current_parents = self._dependency_graph.get(self._id)
+        current_parents = self._dependency_graph.get(self._id) or []
         # remove parent versions we no longer need from the main version list
         # per version, remove unnecessary parent versions, if we do not need them
-        self._dependency_graph = {k: [p for p in parents if p in used_dataset_versions]
+        self._dependency_graph = {
+            k: [p for p in parents or [] if p in used_dataset_versions]
             for k, parents in self._dependency_graph.items() if k in used_dataset_versions}
         # make sure we do not remove our parents, for geology sake
         self._dependency_graph[self._id] = current_parents

-    def _serialize(self):
+    def _serialize(self, update_dependency_chunk_lookup=False):
+        # type: (bool) -> ()
         """
         store current state of the Dataset for later use

+        :param update_dependency_chunk_lookup: If True, update the parent versions number of chunks
+
         :return: object to be used for later deserialization
         """
         self._update_dependency_graph()
|
|||||||
id=self._id,
|
id=self._id,
|
||||||
dirty=self._dirty,
|
dirty=self._dirty,
|
||||||
)
|
)
|
||||||
|
if update_dependency_chunk_lookup:
|
||||||
|
state['dependency_chunk_lookup'] = self._build_dependency_chunk_lookup()
|
||||||
|
|
||||||
modified_files = [f['size'] for f in state['dataset_file_entries'] if f.get('parent_dataset_id') == self._id]
|
modified_files = [f['size'] for f in state['dataset_file_entries'] if f.get('parent_dataset_id') == self._id]
|
||||||
preview = \
|
preview = \
|
||||||
'Dataset state\n' \
|
'Dataset state\n' \
|
||||||
@ -1060,130 +1192,210 @@ class Dataset(object):
|
|||||||
self._task.upload_artifact(
|
self._task.upload_artifact(
|
||||||
name=self.__state_entry_name, artifact_object=state, preview=preview, wait_on_upload=True)
|
name=self.__state_entry_name, artifact_object=state, preview=preview, wait_on_upload=True)
|
||||||
|
|
||||||
def _download_dataset_archive(self):
|
def _download_dataset_archives(self):
|
||||||
"""
|
"""
|
||||||
Download the dataset archive, return a link to locally stored zip file
|
Download the dataset archive, return a link to locally stored zip file
|
||||||
:return: Path to locally stored zip file
|
:return: List of paths to locally stored zip files
|
||||||
"""
|
"""
|
||||||
pass # TODO: implement
|
pass # TODO: implement
|
||||||
|
|
||||||
def _extract_dataset_archive(self):
|
def _extract_dataset_archive(
|
||||||
|
self,
|
||||||
|
force=False,
|
||||||
|
selected_chunks=None,
|
||||||
|
lock_target_folder=False,
|
||||||
|
cleanup_target_folder=True,
|
||||||
|
target_folder=None,
|
||||||
|
):
|
||||||
|
# type: (bool, Optional[List[int]], bool, bool, Optional[Path]) -> str
|
||||||
"""
|
"""
|
||||||
Download the dataset archive, and extract the zip content to a cached folder.
|
Download the dataset archive, and extract the zip content to a cached folder.
|
||||||
Notice no merging is done.
|
Notice no merging is done.
|
||||||
|
|
||||||
|
:param force: If True extract dataset content even if target folder exists and is not empty
|
||||||
|
:param selected_chunks: Optional, if provided only download the selected chunks (index) of the Dataset.
|
||||||
|
Example: Assuming 8 chunks on this version
|
||||||
|
selected_chunks=[0,1,2]
|
||||||
|
:param lock_target_folder: If True, local the target folder so the next cleanup will not delete
|
||||||
|
Notice you should unlock it manually, or wait fro the process to fnish for auto unlocking.
|
||||||
|
:param cleanup_target_folder: If True remove target folder recursively
|
||||||
|
:param target_folder: If provided use the specified target folder, default, auto generate from Dataset ID.
|
||||||
|
|
||||||
:return: Path to a local storage extracted archive
|
:return: Path to a local storage extracted archive
|
||||||
"""
|
"""
|
||||||
|
assert selected_chunks is None or isinstance(selected_chunks, (list, tuple))
|
||||||
|
|
||||||
if not self._task:
|
if not self._task:
|
||||||
self._task = Task.get_task(task_id=self._id)
|
self._task = Task.get_task(task_id=self._id)
|
||||||
|
|
||||||
|
data_artifact_entries = self._get_data_artifact_names()
|
||||||
|
|
||||||
|
if selected_chunks is not None and data_artifact_entries:
|
||||||
|
data_artifact_entries = [
|
||||||
|
d for d in data_artifact_entries
|
||||||
|
if self._get_chunk_idx_from_artifact_name(d) in selected_chunks]
|
||||||
|
|
||||||
|
# get cache manager
|
||||||
|
local_folder = Path(target_folder) if target_folder else \
|
||||||
|
self._create_ds_target_folder(lock_target_folder=lock_target_folder)
|
||||||
|
|
||||||
# check if we have a dataset with empty change set
|
# check if we have a dataset with empty change set
|
||||||
if not self._task.artifacts.get(self.__data_entry_name):
|
if not data_artifact_entries:
|
||||||
cache = CacheManager.get_cache_manager(cache_context=self.__cache_context)
|
|
||||||
local_folder = Path(cache.get_cache_folder()) / self._get_cache_folder_name()
|
|
||||||
local_folder.mkdir(parents=True, exist_ok=True)
|
|
||||||
return local_folder.as_posix()
|
return local_folder.as_posix()
|
||||||
|
|
||||||
|
# check if target folder is not empty
|
||||||
|
if not force and next(local_folder.glob('*'), None):
|
||||||
|
return local_folder.as_posix()
|
||||||
|
|
||||||
|
# if we got here, we need to clear the target folder
|
||||||
|
local_folder = local_folder.as_posix()
|
||||||
|
if cleanup_target_folder:
|
||||||
|
shutil.rmtree(local_folder, ignore_errors=True)
|
||||||
|
# verify target folder exists
|
||||||
|
Path(local_folder).mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
def _download_part(data_artifact_name):
|
||||||
# download the dataset zip
|
# download the dataset zip
|
||||||
local_zip = StorageManager.get_local_copy(
|
local_zip = StorageManager.get_local_copy(
|
||||||
remote_url=self._task.artifacts[self.__data_entry_name].url, cache_context=self.__cache_context,
|
remote_url=self._task.artifacts[data_artifact_name].url, cache_context=self.__cache_context,
|
||||||
extract_archive=False, name=self._id)
|
extract_archive=False, name=self._id)
|
||||||
if not local_zip:
|
if not local_zip:
|
||||||
raise ValueError("Could not download dataset id={}".format(self._id))
|
raise ValueError("Could not download dataset id={} entry={}".format(self._id, data_artifact_name))
|
||||||
local_folder = (Path(local_zip).parent / self._get_cache_folder_name()).as_posix()
|
|
||||||
# if we got here, we need to clear the target folder
|
|
||||||
shutil.rmtree(local_folder, ignore_errors=True)
|
|
||||||
# noinspection PyProtectedMember
|
# noinspection PyProtectedMember
|
||||||
local_folder = StorageManager._extract_to_cache(
|
StorageManager._extract_to_cache(
|
||||||
cached_file=local_zip, name=self._id,
|
cached_file=local_zip, name=self._id,
|
||||||
cache_context=self.__cache_context, target_folder=local_folder)
|
cache_context=self.__cache_context, target_folder=local_folder, force=True)
|
||||||
|
|
||||||
|
# download al parts in parallel
|
||||||
|
# if len(data_artifact_entries) > 1:
|
||||||
|
# pool = ThreadPool()
|
||||||
|
# pool.map(_download_part, data_artifact_entries)
|
||||||
|
# pool.close()
|
||||||
|
# else:
|
||||||
|
# _download_part(data_artifact_entries[0])
|
||||||
|
for d in data_artifact_entries:
|
||||||
|
_download_part(d)
|
||||||
|
|
||||||
return local_folder
|
return local_folder
|
||||||
|
|
||||||
def _merge_datasets(self, use_soft_links=None, raise_on_error=True):
|
def _create_ds_target_folder(self, part=None, num_parts=None, lock_target_folder=True):
|
||||||
# type: (bool, bool) -> str
|
# type: (Optional[int], Optional[int], bool) -> Path
|
||||||
|
cache = CacheManager.get_cache_manager(cache_context=self.__cache_context)
|
||||||
|
local_folder = Path(cache.get_cache_folder()) / self._get_cache_folder_name(part=part, num_parts=num_parts)
|
||||||
|
if lock_target_folder:
|
||||||
|
cache.lock_cache_folder(local_folder)
|
||||||
|
local_folder.mkdir(parents=True, exist_ok=True)
|
||||||
|
return local_folder
|
||||||
|
|
||||||
|
def _get_data_artifact_names(self):
|
||||||
|
# type: () -> List[str]
|
||||||
|
data_artifact_entries = [
|
||||||
|
a for a in self._task.artifacts
|
||||||
|
if a and (a == self.__default_data_entry_name or str(a).startswith(self.__data_entry_name_prefix))]
|
||||||
|
return data_artifact_entries
|
||||||
|
|
||||||
|
def _get_next_data_artifact_name(self, last_artifact_name=None):
|
||||||
|
# type: (Optional[str]) -> str
|
||||||
|
if not last_artifact_name:
|
||||||
|
data_artifact_entries = self._get_data_artifact_names()
|
||||||
|
if len(data_artifact_entries) < 1:
|
||||||
|
return self.__default_data_entry_name
|
||||||
|
else:
|
||||||
|
data_artifact_entries = [last_artifact_name]
|
||||||
|
prefix = self.__data_entry_name_prefix
|
||||||
|
prefix_len = len(prefix)
|
||||||
|
numbers = sorted([int(a[prefix_len:]) for a in data_artifact_entries if a.startswith(prefix)])
|
||||||
|
return '{}{:03d}'.format(prefix, numbers[-1]+1 if numbers else 1)
|
||||||
|
|
||||||
|
def _merge_datasets(self, use_soft_links=None, raise_on_error=True, part=None, num_parts=None):
|
||||||
|
# type: (bool, bool, Optional[int], Optional[int]) -> str
|
||||||
"""
|
"""
|
||||||
download and copy / soft-link, files from all the parent dataset versions
|
download and copy / soft-link, files from all the parent dataset versions
|
||||||
:param use_soft_links: If True use soft links, default False on windows True on Posix systems
|
:param use_soft_links: If True use soft links, default False on windows True on Posix systems
|
||||||
:param raise_on_error: If True raise exception if dataset merging failed on any file
|
:param raise_on_error: If True raise exception if dataset merging failed on any file
|
||||||
|
:param part: Optional, if provided only download the selected part (index) of the Dataset.
|
||||||
|
Notice, if `num_parts` is not provided, number of parts will be equal to the number of chunks.
|
||||||
|
This argument is passed to parent versions, as well as the implicit `num_parts`,
|
||||||
|
allowing users to get a partial copy of the entire dataset, for multi node/step processing.
|
||||||
|
:param num_parts: Optional, If specified normalize the number of chunks stored to the
|
||||||
|
requested number of parts. Notice that the actual chunks used per part are rounded down.
|
||||||
|
Example: Assuming 8 chunks on this version, and `num_parts=5`, the chunk index used per parts would be:
|
||||||
|
part=0 -> chunks[0,5], part=1 -> chunks[1,6], part=2 -> chunks[2,7], part=3 -> chunks[3, ]
|
||||||
|
|
||||||
:return: the target folder
|
:return: the target folder
|
||||||
"""
|
"""
|
||||||
|
assert part is None or (isinstance(part, int) and part >= 0)
|
||||||
|
assert num_parts is None or (isinstance(num_parts, int) and num_parts >= 1)
|
||||||
|
|
||||||
if use_soft_links is None:
|
if use_soft_links is None:
|
||||||
use_soft_links = False if is_windows() else True
|
use_soft_links = False if is_windows() else True
|
||||||
|
|
||||||
-        # check if we already have everything
-        target_base_folder, target_base_size = CacheManager.get_cache_manager(
-            cache_context=self.__cache_context).get_cache_file(local_filename=self._get_cache_folder_name())
-        if target_base_folder and target_base_size is not None:
-            target_base_folder = Path(target_base_folder)
-            # check dataset file size, if we have a full match no need for parent dataset download / merge
-            verified = True
-            # noinspection PyBroadException
-            try:
-                for f in self._dataset_file_entries.values():
-                    if (target_base_folder / f.relative_path).stat().st_size != f.size:
-                        verified = False
-                        break
-            except Exception:
-                verified = False
-
-            if verified:
+        if part is not None and not num_parts:
+            num_parts = self.get_num_chunks()
+
+        # just create the dataset target folder
+        target_base_folder = self._create_ds_target_folder(
+            part=part, num_parts=num_parts, lock_target_folder=True)
+
+        # select specific chunks if `part` was passed
+        chunk_selection = None if part is None else self._build_chunk_selection(part=part, num_parts=num_parts)
+
+        # check if target folder is not empty, see if it contains everything we need
+        if target_base_folder and next(target_base_folder.iterdir(), None):
+            if self._verify_dataset_folder(target_base_folder, part, chunk_selection):
+                target_base_folder.touch()
                 return target_base_folder.as_posix()
             else:
                 LoggerRoot.get_base_logger().info('Dataset needs refreshing, fetching all parent datasets')
+                # we should delete the entire cache folder
+                shutil.rmtree(target_base_folder.as_posix())
+                # make sure we recreate the dataset target folder
+                target_base_folder.mkdir(parents=True, exist_ok=True)
+
+        # get the dataset dependencies (if `part` was passed, only selected the ones in the selected part)
+        dependencies_by_order = self._get_dependencies_by_order(include_unused=False, include_current=True) \
+            if chunk_selection is None else list(chunk_selection.keys())

         # first get our dataset
-        target_base_folder = Path(self._extract_dataset_archive())
+        if self._id in dependencies_by_order:
+            self._extract_dataset_archive(
+                force=True,
+                selected_chunks=chunk_selection.get(self._id) if chunk_selection else None,
+                cleanup_target_folder=True,
+                target_folder=target_base_folder,
+            )
+            dependencies_by_order.remove(self._id)

+        # update target folder timestamp
         target_base_folder.touch()

-        # create thread pool
-        pool = ThreadPool(cpu_count() * 2)
-        for dataset_version_id in self._get_dependencies_by_order():
-            # make sure we skip over empty dependencies
-            if dataset_version_id not in self._dependency_graph:
-                continue
-
-            ds = Dataset.get(dataset_id=dataset_version_id)
-            ds_base_folder = Path(ds._extract_dataset_archive())
-            ds_base_folder.touch()
-
-            def copy_file(file_entry):
-                if file_entry.parent_dataset_id != dataset_version_id:
-                    return
-                source = (ds_base_folder / file_entry.relative_path).as_posix()
-                target = (target_base_folder / file_entry.relative_path).as_posix()
-                try:
-                    # make sure we can overwrite the target file
-                    # noinspection PyBroadException
-                    try:
-                        os.unlink(target)
-                    except Exception:
-                        Path(target).parent.mkdir(parents=True, exist_ok=True)
-
-                    # copy / link
-                    if use_soft_links:
-                        if not os.path.isfile(source):
-                            raise ValueError("Extracted file missing {}".format(source))
-                        os.symlink(source, target)
-                    else:
-                        shutil.copy2(source, target, follow_symlinks=True)
-                except Exception as ex:
-                    LoggerRoot.get_base_logger().warning('{}\nFailed {} file {} to {}'.format(
-                        ex, 'linking' if use_soft_links else 'copying', source, target))
-                    return ex
-
-                return None
-
-            errors = pool.map(copy_file, self._dataset_file_entries.values())
-            if raise_on_error and any(errors):
-                raise ValueError("Dataset merging failed: {}".format([e for e in errors if e is not None]))
-
-        pool.close()
-        return target_base_folder.absolute().as_posix()
-
-    def _get_dependencies_by_order(self, include_unused=False):
-        # type: (bool) -> List[str]
+        # if we have no dependencies, we can just return now
+        if not dependencies_by_order:
+            return target_base_folder.absolute().as_posix()
+
+        # extract parent datasets
+        self._extract_parent_datasets(
+            target_base_folder=target_base_folder, dependencies_by_order=dependencies_by_order,
+            chunk_selection=chunk_selection, use_soft_links=use_soft_links,
+            raise_on_error=False, force=False)
+
+        # verify entire dataset (if failed, force downloading parent datasets)
+        if not self._verify_dataset_folder(target_base_folder, part, chunk_selection):
+            LoggerRoot.get_base_logger().info('Dataset parents need refreshing, re-fetching all parent datasets')
+            # we should delete the entire cache folder
+            self._extract_parent_datasets(
+                target_base_folder=target_base_folder, dependencies_by_order=dependencies_by_order,
+                chunk_selection=chunk_selection, use_soft_links=use_soft_links,
+                raise_on_error=raise_on_error, force=True)
+
+        return target_base_folder.absolute().as_posix()
+
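For context, here is a minimal multi-node usage sketch of the part/num_parts flow implemented above. It assumes the public Dataset.get_local_copy() gained matching part/num_parts arguments in this commit and uses a placeholder dataset ID:

# Hypothetical usage sketch; the dataset ID and the get_local_copy() arguments
# shown here are assumptions, not verbatim from this diff.
from clearml import Dataset

ds = Dataset.get(dataset_id='<dataset-id>')
# worker 0 of 4 fetches only its own share of the stored chunks
local_folder = ds.get_local_copy(part=0, num_parts=4)
print(local_folder)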
+    def _get_dependencies_by_order(self, include_unused=False, include_current=True):
+        # type: (bool, bool) -> List[str]
         """
         Return the dataset dependencies by order of application (from the last to the current)
-        :param bool include_unused: If True include unused datasets in the dependencies
+        :param include_unused: If True include unused datasets in the dependencies
+        :param include_current: If True include the current dataset ID as the last ID in the list
         :return: list of str representing the datasets id
         """
         roots = [self._id]
@@ -1196,10 +1408,10 @@ class Dataset(object):
             # add the parents of the current node, only if the parents are in the general graph node list
             if include_unused and r not in self._dependency_graph:
                 roots.extend(list(reversed(
-                    [p for p in self.get(dataset_id=r)._get_parents() if p not in roots])))
+                    [p for p in (self.get(dataset_id=r)._get_parents() or []) if p not in roots])))
             else:
                 roots.extend(list(reversed(
-                    [p for p in self._dependency_graph.get(r, [])
+                    [p for p in (self._dependency_graph.get(r) or [])
                      if p not in roots and (include_unused or (p in self._dependency_graph))])))

             # make sure we cover leftovers
@@ -1214,14 +1426,15 @@ class Dataset(object):
             # add the parents of the current node, only if the parents are in the general graph node list
             if include_unused and r not in self._dependency_graph:
                 roots.extend(list(reversed(
-                    [p for p in self.get(dataset_id=r)._get_parents() if p not in roots])))
+                    [p for p in (self.get(dataset_id=r)._get_parents() or []) if p not in roots])))
             else:
                 roots.extend(list(reversed(
-                    [p for p in self._dependency_graph.get(r, [])
+                    [p for p in (self._dependency_graph.get(r) or [])
                      if p not in roots and (include_unused or (p in self._dependency_graph))])))

         # skip our id
-        return list(reversed(dependencies[1:]))
+        dependencies = list(reversed(dependencies[1:]))
+        return (dependencies + [self._id]) if include_current else dependencies

     def _get_parents(self):
         # type: () -> Sequence[str]
@@ -1248,10 +1461,24 @@ class Dataset(object):

         instance = cls(_private=cls.__private_magic, task=task)
         # assert instance._id == stored_state['id']  # They should match
-        instance._dependency_graph = stored_state['dependency_graph']
+        instance._dependency_graph = stored_state.get('dependency_graph', {})
         instance._dirty = stored_state.get('dirty', False)
         instance._dataset_file_entries = {
-            s['relative_path']: FileEntry(**s) for s in stored_state['dataset_file_entries']}
+            s['relative_path']: FileEntry(**s) for s in stored_state.get('dataset_file_entries', [])}
+        if stored_state.get('dependency_chunk_lookup') is not None:
+            instance._dependency_chunk_lookup = stored_state.get('dependency_chunk_lookup')
+
+        # update the last used artifact (remove the ones we never serialized, they are considered broken)
+        if task.status in ('in_progress', 'created', 'stopped'):
+            artifact_names = set([
+                a.artifact_name for a in instance._dataset_file_entries.values()
+                if a.artifact_name and a.parent_dataset_id == instance._id])
+            missing_artifact_name = set(instance._get_data_artifact_names()) - artifact_names
+            if missing_artifact_name:
+                instance._task._delete_artifacts(list(missing_artifact_name))
+                # if we removed any data artifact, update the next data artifact name
+                instance._data_artifact_name = instance._get_next_data_artifact_name()
+
         return instance

     @staticmethod
@@ -1272,8 +1499,27 @@ class Dataset(object):
         """
         return 'dsh{}'.format(md5text(dataset_id))

-    def _get_cache_folder_name(self):
-        return '{}{}'.format(self.__cache_folder_prefix, self._id)
+    def _build_dependency_chunk_lookup(self):
+        # type: () -> Dict[str, int]
+        """
+        Build the dependency dataset id to number-of-chunks lookup table
+        :return: lookup dictionary from dataset-id to number of chunks
+        """
+        # with ThreadPool() as pool:
+        #     chunks_lookup = pool.map(
+        #         lambda d: (d, Dataset.get(dataset_id=d).get_num_chunks()),
+        #         self._dependency_graph.keys())
+        #     return dict(chunks_lookup)
+        chunks_lookup = map(
+            lambda d: (d, Dataset.get(dataset_id=d).get_num_chunks()),
+            self._dependency_graph.keys())
+        return dict(chunks_lookup)
+
+    def _get_cache_folder_name(self, part=None, num_parts=None):
+        # type: (Optional[int], Optional[int]) -> str
+        if part is None:
+            return '{}{}'.format(self.__cache_folder_prefix, self._id)
+        return '{}{}_{}_{}'.format(self.__cache_folder_prefix, self._id, part, num_parts)

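The part-aware cache folder name above simply appends the part selection, so partial copies never collide with full copies. A small sketch (the 'ds_' prefix is an assumption; only the name shape is taken from the diff):

# Sketch of the cache-folder naming; 'ds_' as the prefix is an assumption.
def cache_folder_name(dataset_id, part=None, num_parts=None, prefix='ds_'):
    if part is None:
        return '{}{}'.format(prefix, dataset_id)
    return '{}{}_{}_{}'.format(prefix, dataset_id, part, num_parts)

print(cache_folder_name('abcd1234'))                       # -> ds_abcd1234
print(cache_folder_name('abcd1234', part=2, num_parts=5))  # -> ds_abcd1234_2_5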
     def _add_script_call(self, func_name, **kwargs):
         # type: (str, **Any) -> ()
@@ -1306,7 +1552,7 @@ class Dataset(object):
             hovertemplate='<extra></extra>',
         )
         # get DAG nodes
-        nodes = self._get_dependencies_by_order(include_unused=True) + [self.id]
+        nodes = self._get_dependencies_by_order(include_unused=True, include_current=True)
         # dataset name lookup
         # noinspection PyProtectedMember
         node_names = {t.id: t.name for t in Task._query_tasks(task_ids=nodes, only_fields=['id', 'name'])}
@@ -1393,14 +1639,24 @@ class Dataset(object):
         # report the detailed content of the dataset as configuration,
         # this allows for easy version comparison in the UI
         dataset_details = None
+        dataset_details_header = None
+        dataset_details_header_template = 'File Name ({} files) - File Size (total {}) - Hash (SHA2)\n'
         if len(self._dataset_file_entries) < self.__preview_max_file_entries:
             file_entries = sorted(self._dataset_file_entries.values(), key=lambda x: x.relative_path)
-            dataset_details = \
-                'File Name - File Size - Hash (SHA2)\n' +\
+            dataset_details_header = dataset_details_header_template.format(
+                len(file_entries), format_size(sum(f.size for f in file_entries))
+            )
+            dataset_details = dataset_details_header + \
                 '\n'.join('{} - {} - {}'.format(f.relative_path, f.size, f.hash) for f in file_entries)

         # too large to store
         if not dataset_details or len(dataset_details) > self.__preview_max_size:
-            dataset_details = 'Dataset content is too large to preview'
+            if not dataset_details_header:
+                dataset_details_header = dataset_details_header_template.format(
+                    len(self._dataset_file_entries),
+                    format_size(sum(f.size for f in self._dataset_file_entries.values()))
+                )
+            dataset_details = dataset_details_header + 'Dataset content is too large to preview'

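The new preview header reports the file count and total size up front; a minimal sketch of what it renders (format_size below is a simple stand-in for ClearML's own helper):

# Sketch only; format_size here is a stand-in for the real helper.
def format_size(num_bytes):
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if num_bytes < 1024 or unit == 'TB':
            return '{:.2f} {}'.format(num_bytes, unit)
        num_bytes /= 1024.0

header_template = 'File Name ({} files) - File Size (total {}) - Hash (SHA2)\n'
print(header_template.format(3, format_size(3 * 1024 * 1024)))
# -> File Name (3 files) - File Size (total 3.00 MB) - Hash (SHA2)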
         # noinspection PyProtectedMember
         self._task._set_configuration(
@@ -1428,3 +1684,154 @@ class Dataset(object):
         :return: True means the dataset has pending uploads, call 'upload' to start an upload process.
         """
         return self._dirty

+    def _extract_parent_datasets(
+            self,
+            target_base_folder,
+            dependencies_by_order,
+            chunk_selection,
+            use_soft_links,
+            raise_on_error,
+            force
+    ):
+        # type: (Path, List[str], Optional[dict], bool, bool, bool) -> ()
+        # create thread pool, for creating soft-links / copying
+        # todo: parallelize by parent datasets
+        pool = ThreadPool(cpu_count() * 2)
+        for dataset_version_id in dependencies_by_order:
+            # make sure we skip over empty dependencies
+            if dataset_version_id not in self._dependency_graph:
+                continue
+            selected_chunks = chunk_selection.get(dataset_version_id) if chunk_selection else None
+
+            ds = Dataset.get(dataset_id=dataset_version_id)
+            ds_base_folder = Path(ds._extract_dataset_archive(
+                selected_chunks=selected_chunks,
+                force=force,
+                lock_target_folder=True,
+                cleanup_target_folder=False,
+            ))
+            ds_base_folder.touch()
+
+            def copy_file(file_entry):
+                if file_entry.parent_dataset_id != dataset_version_id or \
+                        (selected_chunks is not None and
+                         self._get_chunk_idx_from_artifact_name(file_entry.artifact_name) not in selected_chunks):
+                    return
+                source = (ds_base_folder / file_entry.relative_path).as_posix()
+                target = (target_base_folder / file_entry.relative_path).as_posix()
+                try:
+                    # make sure we can overwrite the target file
+                    # noinspection PyBroadException
+                    try:
+                        os.unlink(target)
+                    except Exception:
+                        Path(target).parent.mkdir(parents=True, exist_ok=True)
+
+                    # copy / link
+                    if use_soft_links:
+                        if not os.path.isfile(source):
+                            raise ValueError("Extracted file missing {}".format(source))
+                        os.symlink(source, target)
+                    else:
+                        shutil.copy2(source, target, follow_symlinks=True)
+                except Exception as ex:
+                    LoggerRoot.get_base_logger().warning('{}\nFailed {} file {} to {}'.format(
+                        ex, 'linking' if use_soft_links else 'copying', source, target))
+                    return ex
+
+                return None
+
+            errors = pool.map(copy_file, self._dataset_file_entries.values())
+
+            CacheManager.get_cache_manager(cache_context=self.__cache_context).unlock_cache_folder(
+                ds_base_folder.as_posix())
+
+            if raise_on_error and any(errors):
+                raise ValueError("Dataset merging failed: {}".format([e for e in errors if e is not None]))
+        pool.close()
+    def _verify_dataset_folder(self, target_base_folder, part, chunk_selection):
+        # type: (Path, Optional[int], Optional[dict]) -> bool
+        target_base_folder = Path(target_base_folder)
+        # check dataset file size, if we have a full match no need for parent dataset download / merge
+        verified = True
+        # noinspection PyBroadException
+        try:
+            for f in self._dataset_file_entries.values():
+                # check if we need it for the current part
+                if part is not None:
+                    f_parts = chunk_selection.get(f.parent_dataset_id, [])
+                    # this is not in our current part, no need to check it.
+                    if self._get_chunk_idx_from_artifact_name(f.artifact_name) not in f_parts:
+                        continue
+
+                # check if the local size and the stored size match (faster than comparing hash)
+                if (target_base_folder / f.relative_path).stat().st_size != f.size:
+                    verified = False
+                    break
+
+        except Exception:
+            verified = False
+
+        return verified
+
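The verification above intentionally compares file sizes rather than hashes, trading accuracy for speed. A stripped-down sketch of the same idea over a plain folder (paths and sizes are made up):

import os

# Toy version of the size check; `expected` maps relative path -> size in bytes.
def verify_folder(base_folder, expected):
    try:
        return all(
            os.stat(os.path.join(base_folder, rel_path)).st_size == size
            for rel_path, size in expected.items())
    except OSError:
        # a missing file simply fails verification
        return False

print(verify_folder('/tmp/does-not-exist', {'a.txt': 10}))  # -> False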
+    def _get_dependency_chunk_lookup(self):
+        # type: () -> Dict[str, int]
+        """
+        Return the parent dataset ID to number-of-chunks lookup table
+        :return: Dict key is dataset ID, value is total number of chunks for the specific dataset version.
+        """
+        if self._dependency_chunk_lookup is None:
+            self._dependency_chunk_lookup = self._build_dependency_chunk_lookup()
+        return self._dependency_chunk_lookup
+
+    def _build_chunk_selection(self, part, num_parts):
+        # type: (int, int) -> Dict[str, List[int]]
+        """
+        Build the selected chunks from each parent version, based on the current selection.
+        Notice that for a specific part, one can only get the chunks from parent versions (not including this one)
+        :param part: Current part index (between 0 and num_parts-1)
+        :param num_parts: Total number of parts to divide the dataset into
+        :return: Dict of Dataset ID and their respective chunks used for this part number
+        """
+        # get the chunk dependencies
+        dependency_chunk_lookup = self._get_dependency_chunk_lookup()
+
+        # first collect the total number of chunks
+        total_chunks = sum(dependency_chunk_lookup.values())
+
+        avg_chunk_per_part = total_chunks // num_parts
+        leftover_chunks = total_chunks % num_parts
+
+        dependencies = self._get_dependencies_by_order(include_unused=False, include_current=True)
+        # create the part lookup
+        ds_id_chunk_list = [(d, i) for d in dependencies for i in range(dependency_chunk_lookup.get(d, 1))]
+
+        # select the chunks for this part
+        if part < leftover_chunks:
+            indexes = ds_id_chunk_list[part*(avg_chunk_per_part+1):(part+1)*(avg_chunk_per_part+1)]
+        else:
+            ds_id_chunk_list = ds_id_chunk_list[leftover_chunks*(avg_chunk_per_part+1):]
+            indexes = ds_id_chunk_list[(part-leftover_chunks)*avg_chunk_per_part:
+                                       (part-leftover_chunks+1)*avg_chunk_per_part]

+        # convert to lookup
+        chunk_selection = {}
+        for d, i in indexes:
+            chunk_selection[d] = chunk_selection.get(d, []) + [i]
+
+        return chunk_selection
+
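To make the rounding behaviour concrete, here is a self-contained sketch of the selection arithmetic exactly as written in the method above, applied to 8 chunks split into 5 parts (the parent dataset IDs are invented):

# Standalone sketch of the chunk-selection slicing above; 'parent_a' / 'parent_b' are made-up IDs.
def build_chunk_selection(part, num_parts, chunk_lookup, dependencies):
    total_chunks = sum(chunk_lookup.values())
    avg_chunk_per_part = total_chunks // num_parts
    leftover_chunks = total_chunks % num_parts
    ds_id_chunk_list = [(d, i) for d in dependencies for i in range(chunk_lookup.get(d, 1))]
    if part < leftover_chunks:
        indexes = ds_id_chunk_list[part * (avg_chunk_per_part + 1):(part + 1) * (avg_chunk_per_part + 1)]
    else:
        ds_id_chunk_list = ds_id_chunk_list[leftover_chunks * (avg_chunk_per_part + 1):]
        indexes = ds_id_chunk_list[(part - leftover_chunks) * avg_chunk_per_part:
                                   (part - leftover_chunks + 1) * avg_chunk_per_part]
    selection = {}
    for d, i in indexes:
        selection.setdefault(d, []).append(i)
    return selection

lookup = {'parent_a': 5, 'parent_b': 3}   # 8 chunks in total
deps = ['parent_a', 'parent_b']
for p in range(5):
    print(p, build_chunk_selection(p, 5, lookup, deps))
# parts 0-2 get two chunks each, parts 3-4 get one chunk each (rounded down)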
+    @classmethod
+    def _get_chunk_idx_from_artifact_name(cls, artifact_name):
+        # type: (str) -> int
+        if not artifact_name:
+            return -1
+        artifact_name = str(artifact_name)
+
+        if artifact_name == cls.__default_data_entry_name:
+            return 0
+        if artifact_name.startswith(cls.__data_entry_name_prefix):
+            return int(artifact_name[len(cls.__data_entry_name_prefix):])
+        return -1
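Together with the naming helper earlier in this diff, the chunk index round-trips through the artifact name. A quick sketch, again assuming the default 'data' / 'data_' names:

# Sketch of the reverse mapping; the 'data' / 'data_' defaults are assumptions.
def chunk_idx_from_artifact_name(artifact_name, default_name='data', prefix='data_'):
    if not artifact_name:
        return -1
    artifact_name = str(artifact_name)
    if artifact_name == default_name:
        return 0
    if artifact_name.startswith(prefix):
        return int(artifact_name[len(prefix):])
    return -1

print(chunk_idx_from_artifact_name('data'))      # -> 0
print(chunk_idx_from_artifact_name('data_003'))  # -> 3
print(chunk_idx_from_artifact_name('weights'))   # -> -1 (not a data chunk)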
@@ -1,13 +1,20 @@
+import atexit
 import hashlib
+import os
 import shutil

 from collections import OrderedDict
+from threading import RLock
+from typing import Union, Optional, Tuple, Dict

 from pathlib2 import Path

 from .helper import StorageHelper
 from .util import quote_url
 from ..config import get_cache_dir, deferred_config
 from ..debugging.log import LoggerRoot
+from ..utilities.locks.utils import Lock as FileLock
+from ..utilities.locks.exceptions import LockException


 class CacheManager(object):
@@ -19,17 +26,26 @@ class CacheManager(object):
     __local_to_remote_url_lookup_max_size = 1024
     _context_to_folder_lookup = dict()
     _default_context_folder_template = "{0}_artifacts_archive_{1}"
+    _lockfile_prefix = '.lock.'
+    _lockfile_suffix = '.clearml'

     class CacheContext(object):
+        _folder_locks = dict()  # type: Dict[str, FileLock]
+        _lockfile_at_exit_cb = None
+
         def __init__(self, cache_context, default_cache_file_limit=10):
+            # type: (str, int) -> None
             self._context = str(cache_context)
             self._file_limit = int(default_cache_file_limit)
+            self._rlock = RLock()

         def set_cache_limit(self, cache_file_limit):
+            # type: (int) -> int
             self._file_limit = max(self._file_limit, int(cache_file_limit))
             return self._file_limit

         def get_local_copy(self, remote_url, force_download):
+            # type: (str, bool) -> Optional[str]
             helper = StorageHelper.get(remote_url)
             if not helper:
                 raise ValueError("Storage access failed: {}".format(remote_url))
@@ -59,6 +75,7 @@ class CacheManager(object):

         @staticmethod
         def upload_file(local_file, remote_url, wait_for_upload=True, retries=1):
+            # type: (str, str, bool, int) -> Optional[str]
             helper = StorageHelper.get(remote_url)
             result = helper.upload(
                 local_file, remote_url, async_enable=not wait_for_upload, retries=retries,
@@ -68,11 +85,13 @@ class CacheManager(object):

         @classmethod
         def get_hashed_url_file(cls, url):
+            # type: (str) -> str
             str_hash = hashlib.md5(url.encode()).hexdigest()
             filename = url.split("/")[-1]
             return "{}.{}".format(str_hash, quote_url(filename))

         def get_cache_folder(self):
+            # type: () -> str
             """
             :return: full path to the current context's cache folder
             """
@@ -82,6 +101,7 @@ class CacheManager(object):
             return folder.as_posix()

         def get_cache_file(self, remote_url=None, local_filename=None):
+            # type: (Optional[str], Optional[str]) -> Tuple[str, Optional[int]]
             """
             :param remote_url: check if we have the remote url in our cache
             :param local_filename: if local_file is given, search for the local file/directory in the cache folder
@@ -123,10 +143,52 @@ class CacheManager(object):
                 except Exception:
                     pass

+            # first exclude lock files
+            lock_files = dict()
+            files = []
+            for f in sorted(folder.iterdir(), reverse=True, key=sort_max_access_time):
+                if f.name.startswith(CacheManager._lockfile_prefix) and f.name.endswith(CacheManager._lockfile_suffix):
+                    # parse the lock filename
+                    name = f.name[len(CacheManager._lockfile_prefix):-len(CacheManager._lockfile_suffix)]
+                    num, _, name = name.partition('.')
+                    lock_files[name] = lock_files.get(name, []) + [f.as_posix()]
+                else:
+                    files.append(f)
+
+            # remove new lock files from the list (we will delete them when time comes)
+            for f in files[:self._file_limit]:
+                lock_files.pop(f.name, None)

             # delete old files
-            files = sorted(folder.iterdir(), reverse=True, key=sort_max_access_time)
             files = files[self._file_limit:]
-            for f in files:
+            for i, f in enumerate(files):
+                if i < self._file_limit:
+                    continue
+
+                # check if the file is in the lock folder list:
+                folder_lock = self._folder_locks.get(f.absolute().as_posix())
+                if folder_lock:
+                    # pop from lock files
+                    lock_files.pop(f.name, None)
+                    continue
+
+                # check if someone else holds the lock file
+                locks = lock_files.get(f.name, [])
+                for l in locks:
+                    try:
+                        a_lock = FileLock(filename=l)
+                        a_lock.acquire(timeout=0)
+                        a_lock.release()
+                        a_lock.delete_lock_file()
+                        del a_lock
+                    except LockException:
+                        # someone has the lock, skip the file
+                        continue
+
+                # if we got here we need to pop from the lock_files, later we will delete the leftover entries
+                lock_files.pop(f.name, None)
+
+                # if we are here we can delete the file
                 if not f.is_dir():
                     # noinspection PyBroadException
                     try:
@@ -135,23 +197,93 @@ class CacheManager(object):
                         pass
                 else:
                     try:
-                        shutil.rmtree(f)
+                        shutil.rmtree(f.as_posix())
                     except Exception as e:
                         # failed deleting folder
                         LoggerRoot.get_base_logger().debug(
                             "Exception {}\nFailed deleting folder {}".format(e, f)
                         )

+            # cleanup old lock files
+            for lock_files in lock_files.values():
+                for f in lock_files:
+                    # noinspection PyBroadException
+                    try:
+                        os.unlink(f)
+                    except BaseException:
+                        pass
+
             # if file doesn't exist, return file size None
             # noinspection PyBroadException
             try:
                 size = new_file.stat().st_size if new_file_exists else None
             except Exception:
                 size = None

             return new_file.as_posix(), size

def lock_cache_folder(self, local_path):
|
||||||
|
# type: (Union[str, Path]) -> ()
|
||||||
|
"""
|
||||||
|
Lock a specific cache folder, making sure it will not be deleted in the next
|
||||||
|
cache cleanup round
|
||||||
|
:param local_path: Path (str/Path) to a sub-folder inside the instance cache folder
|
||||||
|
"""
|
||||||
|
local_path = Path(local_path).absolute()
|
||||||
|
self._rlock.acquire()
|
||||||
|
if self._lockfile_at_exit_cb is None:
|
||||||
|
self._lockfile_at_exit_cb = True
|
||||||
|
atexit.register(self._lock_file_cleanup_callback)
|
||||||
|
|
||||||
|
lock = self._folder_locks.get(local_path.as_posix())
|
||||||
|
i = 0
|
||||||
|
# try to create a lock if we do not already have one (if we do, we assume it is locked)
|
||||||
|
while not lock:
|
||||||
|
lock_path = local_path.parent / '{}{:03d}.{}{}'.format(
|
||||||
|
CacheManager._lockfile_prefix, i, local_path.name, CacheManager._lockfile_suffix)
|
||||||
|
lock = FileLock(filename=lock_path)
|
||||||
|
|
||||||
|
# try to lock folder (if we failed to create lock, try nex number)
|
||||||
|
try:
|
||||||
|
lock.acquire(timeout=0)
|
||||||
|
break
|
||||||
|
except LockException:
|
||||||
|
# failed locking, maybe someone else already locked it.
|
||||||
|
del lock
|
||||||
|
lock = None
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
# store lock
|
||||||
|
self._folder_locks[local_path.as_posix()] = lock
|
||||||
|
self._rlock.release()
|
||||||
|
|
||||||
|
def unlock_cache_folder(self, local_path):
|
||||||
|
# type: (Union[str, Path]) -> ()
|
||||||
|
"""
|
||||||
|
Lock a specific cache folder, making sure it will not be deleted in the next
|
||||||
|
cache cleanup round
|
||||||
|
:param local_path: Path (str/Path) to a sub-folder inside the instance cache folder
|
||||||
|
"""
|
||||||
|
local_path = Path(local_path).absolute()
|
||||||
|
self._rlock.acquire()
|
||||||
|
# pop lock
|
||||||
|
lock = self._folder_locks.pop(local_path.as_posix(), None)
|
||||||
|
if lock:
|
||||||
|
lock.release()
|
||||||
|
lock.delete_lock_file()
|
||||||
|
del lock
|
||||||
|
|
||||||
|
self._rlock.release()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _lock_file_cleanup_callback(cls):
|
||||||
|
for lock in cls._folder_locks.values():
|
||||||
|
lock.release()
|
||||||
|
lock.delete_lock_file()
|
||||||
|
|
||||||
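A minimal usage sketch of the new folder-level locking (the cache context name and folder name are placeholders; this mirrors how the dataset code above locks a parent dataset folder while copying out of it):

# Hypothetical usage sketch; 'datasets' and 'ds_abcd1234' are placeholders.
from clearml.storage.cache import CacheManager

cache = CacheManager.get_cache_manager(cache_context='datasets')
folder, size = cache.get_cache_file(local_filename='ds_abcd1234')
cache.lock_cache_folder(folder)        # survives the next cleanup round
try:
    pass                               # ... read / copy files out of `folder` ...
finally:
    cache.unlock_cache_folder(folder)  # eligible for cleanup again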
     @classmethod
     def get_cache_manager(cls, cache_context=None, cache_file_limit=None):
+        # type: (Optional[str], Optional[int]) -> CacheManager.CacheContext
         cache_context = cache_context or cls._default_context
         if cache_context not in cls.__cache_managers:
             cls.__cache_managers[cache_context] = cls.CacheContext(
@@ -165,6 +297,7 @@ class CacheManager(object):

     @staticmethod
     def get_remote_url(local_copy_path):
+        # type: (str) -> str
         if not CacheManager._local_to_remote_url_lookup:
             return local_copy_path

@@ -178,6 +311,7 @@ class CacheManager(object):

     @staticmethod
     def _add_remote_url(remote_url, local_copy_path):
+        # type: (str, str) -> ()
         # so that we can disable the cache lookup altogether
         if CacheManager._local_to_remote_url_lookup is None:
             return
@@ -206,11 +340,13 @@ class CacheManager(object):

     @classmethod
     def set_context_folder_lookup(cls, context, name_template):
+        # type: (str, str) -> str
         cls._context_to_folder_lookup[str(context)] = str(name_template)
         return str(name_template)

     @classmethod
     def get_context_folder_lookup(cls, context):
+        # type: (Optional[str]) -> str
         if not context:
             return cls._default_context_folder_template
         return cls._context_to_folder_lookup.get(str(context), cls._default_context_folder_template)
@@ -97,8 +97,16 @@ class StorageManager(object):
         ).set_cache_limit(cache_file_limit)

     @classmethod
-    def _extract_to_cache(cls, cached_file, name, cache_context=None, target_folder=None, cache_path_encoding=None):
-        # type: (str, str, Optional[str], Optional[str], Optional[str]) -> str
+    def _extract_to_cache(
+            cls,
+            cached_file,  # type: str
+            name,  # type: str
+            cache_context=None,  # type: Optional[str]
+            target_folder=None,  # type: Optional[str]
+            cache_path_encoding=None,  # type: Optional[str]
+            force=False,  # type: bool
+    ):
+        # type: (...) -> str
         """
         Extract cached file to cache folder
         :param str cached_file: local copy of archive file
@@ -108,6 +116,7 @@ class StorageManager(object):
         :param str cache_path_encoding: specify representation of the local path of the cached files,
             this will always point to local cache folder, even if we have direct access file.
             Used for extracting the cached archive based on cache_path_encoding
+        :param bool force: Force archive extraction even if target folder exists
         :return: cached folder containing the extracted archive content
         """
         if not cached_file:
@@ -133,7 +142,7 @@ class StorageManager(object):
         target_folder = cache_folder / CacheManager.get_context_folder_lookup(
             cache_context).format(archive_suffix, name)

-        if target_folder.is_dir():
+        if target_folder.is_dir() and not force:
             # noinspection PyBroadException
             try:
                 target_folder.touch(exist_ok=True)
@@ -143,9 +152,14 @@ class StorageManager(object):

         base_logger = LoggerRoot.get_base_logger()
         try:
+            # if the target folder already exists, this is a forced extraction, so extract directly into it
+            if target_folder.is_dir():
+                temp_target_folder = target_folder
+            else:
                 temp_target_folder = cache_folder / "{0}_{1}_{2}".format(
                     target_folder.name, time() * 1000, str(random()).replace('.', ''))
                 temp_target_folder.mkdir(parents=True, exist_ok=True)

             if suffix == ".zip":
                 ZipFile(cached_file.as_posix()).extractall(path=temp_target_folder.as_posix())
             elif suffix == ".tar.gz":
@@ -155,6 +169,7 @@ class StorageManager(object):
                 with tarfile.open(cached_file.as_posix(), mode='r:gz') as file:
                     file.extractall(temp_target_folder.as_posix())

+            if temp_target_folder != target_folder:
                 # we assume we will have such folder if we already extract the file
                 # noinspection PyBroadException
                 try:
@@ -179,6 +179,22 @@ class Lock(object):
                 pass
             self.fh = None

+    def delete_lock_file(self):
+        # type: () -> bool
+        """
+        Remove the local file used for locking (fails if the file is currently locked)
+
+        :return: True if successful
+        """
+        if self.fh:
+            return False
+        # noinspection PyBroadException
+        try:
+            os.unlink(path=self.filename)
+        except BaseException:
+            return False
+        return True
+
     def _get_fh(self):
         '''Get a new filehandle'''
         return open(self.filename, self.mode, **self.file_open_kwargs)
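A short usage sketch of the new helper (the lock file path is a placeholder; Lock is the class this hunk extends, imported as FileLock in the cache module above):

# Hypothetical usage sketch; '/tmp/example.lock' is a placeholder path.
from clearml.utilities.locks.utils import Lock
from clearml.utilities.locks.exceptions import LockException

lock = Lock(filename='/tmp/example.lock')
try:
    lock.acquire(timeout=0)         # raises LockException if another process holds it
    # ... protected work ...
except LockException:
    print('already locked by someone else')
else:
    lock.release()
    print(lock.delete_lock_file())  # True once the file handle is released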