Add clearml-Data (Datasets) multi-chunk support

allegroai 2021-09-25 23:07:49 +03:00
parent 0dd9ba8adc
commit 844c01e15b
8 changed files with 848 additions and 222 deletions

View File

@@ -179,8 +179,8 @@ class PrintPatchLogger(object):
     cr_flush_period = None

     def __init__(self, stream, logger=None, level=logging.INFO):
-        if self.__class__.cr_flush_period is None:
-            self.__class__.cr_flush_period = config.get("development.worker.console_cr_flush_period", 0)
+        if PrintPatchLogger.cr_flush_period is None:
+            PrintPatchLogger.cr_flush_period = config.get("development.worker.console_cr_flush_period", 0)
         PrintPatchLogger.patched = True
         self._terminal = stream
         self._log = logger
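
The change above stops writing the configured flush period through self.__class__ and assigns it to PrintPatchLogger explicitly, so the value always lands on the base class instead of on whichever subclass happens to be constructed first. A minimal sketch of the difference (class names here are illustrative, not from the commit):

    class Base(object):
        period = None

    class Child(Base):
        pass

    c = Child()
    # Assigning through self.__class__ on a Child instance creates a new
    # attribute on Child only; Base.period stays None for everyone else.
    c.__class__.period = 5
    print(Base.period)   # None
    # Assigning through the base class updates the attribute all subclasses inherit.
    Base.period = 5
    print(Child.period)  # 5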

View File

@@ -1266,6 +1266,34 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin):
             self._edit(execution=execution)
         return self.data.execution.artifacts or []

+    def _delete_artifacts(self, artifact_names):
+        # type: (Sequence[str]) -> bool
+        """
+        Delete a list of artifacts, by artifact name, from the Task.
+
+        :param list artifact_names: list of artifact names
+        :return: True if successful
+        """
+        if not Session.check_min_api_version('2.3'):
+            return False
+        if not isinstance(artifact_names, (list, tuple)):
+            raise ValueError('Expected artifact names as List[str]')
+
+        with self._edit_lock:
+            if Session.check_min_api_version("2.13") and not self._offline_mode:
+                req = tasks.DeleteArtifactsRequest(
+                    task=self.task_id, artifacts=[{"key": n, "mode": "output"} for n in artifact_names], force=True)
+                res = self.send(req, raise_on_errors=False)
+                if not res or not res.response or not res.response.deleted:
+                    return False
+                self.reload()
+            else:
+                self.reload()
+                execution = self.data.execution
+                execution.artifacts = [a for a in execution.artifacts or [] if a.key not in artifact_names]
+                self._edit(execution=execution)
+        return self.data.execution.artifacts or []
+
     def _set_model_design(self, design=None):
         # type: (str) -> ()
         with self._edit_lock:
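
The new Task._delete_artifacts() helper removes output artifacts by name: on API 2.13 and newer it issues a server-side tasks.DeleteArtifactsRequest, otherwise it falls back to reloading the task, filtering execution.artifacts locally and pushing the edit. A hedged usage sketch, assuming the helper is reachable from the public Task class and using hypothetical artifact names:

    from clearml import Task

    task = Task.get_task(task_id='<task-id>')  # placeholder task ID
    # hypothetical artifact names; the method expects a list/tuple of names
    task._delete_artifacts(['data_chunk_01', 'data_chunk_02'])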

View File

@@ -85,7 +85,7 @@ def get_epoch_beginning_of_time(timezone_info=None):
     return datetime(1970, 1, 1).replace(tzinfo=timezone_info if timezone_info else utc_timezone)


-def get_single_result(entity, query, results, log=None, show_results=10, raise_on_error=True, sort_by_date=True):
+def get_single_result(entity, query, results, log=None, show_results=1, raise_on_error=True, sort_by_date=True):
     if not results:
         if not raise_on_error:
             return None
@@ -96,8 +96,12 @@ def get_single_result(entity, query, results, log=None, show_results=10, raise_o
     if show_results:
         if not log:
             log = get_logger()
-        log.warning('More than one {entity} found when searching for `{query}`'
-                    ' (showing first {show_results} {entity}s follow)'.format(**locals()))
+        if show_results > 1:
+            log.warning('{num} {entity} found when searching for `{query}`'
+                        ' (showing first {show_results} {entity}s follow)'.format(num=len(results), **locals()))
+        else:
+            log.warning('{num} {entity} found when searching for `{query}`'.format(num=len(results), **locals()))
     if sort_by_date:
         relative_time = get_epoch_beginning_of_time()
         # sort results based on timestamp and return the newest one
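
The warning path above now reports the actual number of matches and only appends the '(showing first ...)' suffix when show_results is greater than one. A rough, self-contained illustration of the two messages (the entity, query and result values are made up):

    entity, query, show_results = 'task', 'my experiment', 1
    results = ['id1', 'id2', 'id3']

    if show_results > 1:
        msg = ('{num} {entity} found when searching for `{query}`'
               ' (showing first {show_results} {entity}s follow)'.format(num=len(results), **locals()))
    else:
        msg = '{num} {entity} found when searching for `{query}`'.format(num=len(results), **locals())
    print(msg)  # -> 3 task found when searching for `my experiment`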

View File

@@ -70,9 +70,7 @@ def cli():
     subparsers = parser.add_subparsers(help='Dataset actions', dest='command')

     create = subparsers.add_parser('create', help='Create a new dataset')
-    create.add_argument('--parents', type=str, nargs='*',
-                        help='[Optional] Specify dataset parents IDs (i.e. merge all parents). '
-                             'Example: a17b4fID1 f0ee5ID2 a17b4f09eID3')
+    create.add_argument('--parents', type=str, nargs='*', help='Specify dataset parents IDs (i.e. merge all parents)')
     create.add_argument('--project', type=str, required=False, default=None, help='Dataset project name')
     create.add_argument('--name', type=str, required=True, default=None, help='Dataset name')
     create.add_argument('--tags', type=str, nargs='*', help='Dataset user Tags')
@@ -100,20 +98,22 @@ def cli():
                       help='Local folder to sync (support for wildcard selection). '
                            'Example: ~/data/*.jpg')
     sync.add_argument('--parents', type=str, nargs='*',
-                      help='[Optional] Specify dataset parents IDs (i.e. merge all parents). '
-                           'Example: a17b4fID1 f0ee5ID2 a17b4f09eID3')
+                      help='[Optional - Create new dataset] Specify dataset parents IDs (i.e. merge all parents)')
     sync.add_argument('--project', type=str, required=False, default=None,
-                      help='[Optional] Dataset project name')
+                      help='[Optional - Create new dataset] Dataset project name')
     sync.add_argument('--name', type=str, required=False, default=None,
-                      help='[Optional] Dataset project name')
+                      help='[Optional - Create new dataset] Dataset name')
     sync.add_argument('--tags', type=str, nargs='*',
-                      help='[Optional] Dataset user Tags')
+                      help='[Optional - Create new dataset] Dataset user Tags')
     sync.add_argument('--storage', type=str, default=None,
                       help='Remote storage to use for the dataset files (default: files_server). '
                            'Examples: \'s3://bucket/data\', \'gs://bucket/data\', \'azure://bucket/data\', '
                            '\'/mnt/shared/folder/data\'')
     sync.add_argument('--skip-close', action='store_true', default=False,
                       help='Do not auto close dataset after syncing folders')
+    sync.add_argument('--chunk-size', default=-1, type=int,
+                      help='Set dataset artifact chunk size in MB. Default -1, unlimited size. '
+                           'Example: 512, dataset will be split and uploaded in 512mb chunks.')
     sync.add_argument('--verbose', action='store_true', default=False, help='Verbose reporting')
     sync.set_defaults(func=ds_sync)

@@ -136,6 +136,9 @@ def cli():
                         help='Remote storage to use for the dataset files (default: files_server). '
                              'Examples: \'s3://bucket/data\', \'gs://bucket/data\', \'azure://bucket/data\', '
                              '\'/mnt/shared/folder/data\'')
+    upload.add_argument('--chunk-size', default=-1, type=int,
+                        help='Set dataset artifact chunk size in MB. Default -1, unlimited size. '
+                             'Example: 512, dataset will be split and uploaded in 512mb chunks.')
     upload.add_argument('--verbose', default=False, action='store_true', help='Verbose reporting')
     upload.set_defaults(func=ds_upload)

@@ -148,6 +151,9 @@ def cli():
                               '\'/mnt/shared/folder/data\'')
     finalize.add_argument('--disable-upload', action='store_true', default=False,
                           help='Disable automatic upload when closing the dataset')
+    finalize.add_argument('--chunk-size', default=-1, type=int,
+                          help='Set dataset artifact chunk size in MB. Default -1, unlimited size. '
+                               'Example: 512, dataset will be split and uploaded in 512mb chunks.')
     finalize.add_argument('--verbose', action='store_true', default=False, help='Verbose reporting')
     finalize.set_defaults(func=ds_close)

@@ -216,6 +222,14 @@ def cli():
     get.add_argument('--link', type=str, default=None,
                      help='Create a soft link (not supported on Windows) to a '
                           'read-only cached folder containing the dataset')
+    get.add_argument('--part', type=int, default=None,
+                     help='Retrieve a partial copy of the dataset. '
+                          'Part number (0 to `num-parts`-1) of total parts --num-parts.')
+    get.add_argument('--num-parts', type=int, default=None,
+                     help='Total number of parts to divide the dataset to. '
+                          'Notice minimum retrieved part is a single chunk in a dataset (or its parents). '
+                          'Example: Dataset gen4, with 3 parents, each with a single chunk, '
+                          'can be divided into 4 parts')
     get.add_argument('--overwrite', action='store_true', default=False, help='If True, overwrite the target folder')
     get.add_argument('--verbose', action='store_true', default=False, help='Verbose reporting')
     get.set_defaults(func=ds_get)
@@ -274,7 +288,7 @@ def ds_get(args):
            pass

     if args.copy:
         ds_folder = args.copy
-        ds.get_mutable_local_copy(target_folder=ds_folder)
+        ds.get_mutable_local_copy(target_folder=ds_folder, part=args.part, num_parts=args.num_parts)
     else:
         if args.link:
             Path(args.link).mkdir(parents=True, exist_ok=True)
@@ -286,7 +300,7 @@ def ds_get(args):
                    Path(args.link).unlink()
                except Exception:
                    raise ValueError("Target directory {} is not empty. Use --overwrite.".format(args.link))
         ds_folder = ds.get_local_copy(part=args.part, num_parts=args.num_parts)
-        ds_folder = ds.get_local_copy()
+        ds_folder = ds.get_local_copy(part=args.part, num_parts=args.num_parts)
         if args.link:
             os.symlink(ds_folder, args.link)
             ds_folder = args.link
@@ -372,7 +386,10 @@ def ds_close(args):
             raise ValueError("Pending uploads, cannot finalize dataset. run `clearml-data upload`")
         # upload the files
         print("Pending uploads, starting dataset upload to {}".format(args.storage or ds.get_default_storage()))
-        ds.upload(show_progress=True, verbose=args.verbose, output_url=args.storage or None)
+        ds.upload(show_progress=True,
+                  verbose=args.verbose,
+                  output_url=args.storage or None,
+                  chunk_size=args.chunk_size or -1,)

     ds.finalize()
     print('Dataset closed and finalized')
@@ -399,7 +416,7 @@ def ds_upload(args):
     check_null_id(args)
     print_args(args)
     ds = Dataset.get(dataset_id=args.id)
-    ds.upload(verbose=args.verbose, output_url=args.storage or None)
+    ds.upload(verbose=args.verbose, output_url=args.storage or None, chunk_size=args.chunk_size or -1)
     print('Dataset upload completed')
     return 0

@@ -443,7 +460,10 @@ def ds_sync(args):
     if ds.is_dirty():
         # upload the files
         print("Pending uploads, starting dataset upload to {}".format(args.storage or ds.get_default_storage()))
-        ds.upload(show_progress=True, verbose=args.verbose, output_url=args.storage or None)
+        ds.upload(show_progress=True,
+                  verbose=args.verbose,
+                  output_url=args.storage or None,
+                  chunk_size=args.chunk_size or -1, )

     ds.finalize()
     print('Dataset closed and finalized')
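
The new --chunk-size, --part and --num-parts switches map directly onto the Dataset keyword arguments visible in the calls above (upload(chunk_size=...), get_local_copy(part=..., num_parts=...), get_mutable_local_copy(...)). A minimal SDK-side sketch of the same flow; the project, name and path values are placeholders, and only the keyword arguments shown in this diff are assumed:

    from clearml import Dataset

    # Create a dataset, add files and upload it in ~512 MB chunks
    # (mirrors `clearml-data upload --chunk-size 512`).
    ds = Dataset.create(dataset_name='my_dataset', dataset_project='data')  # placeholder names
    ds.add_files(path='/path/to/local/folder')                              # placeholder path
    ds.upload(show_progress=True, chunk_size=512)
    ds.finalize()

    # Later, fetch only part 0 out of 4 parts of the (possibly multi-chunk) dataset
    # (mirrors `clearml-data get --part 0 --num-parts 4`).
    ds = Dataset.get(dataset_id=ds.id)
    local_folder = ds.get_local_copy(part=0, num_parts=4)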

File diff suppressed because it is too large

View File

@@ -1,13 +1,20 @@
+import atexit
 import hashlib
+import os
 import shutil
 from collections import OrderedDict
+from threading import RLock
+from typing import Union, Optional, Tuple, Dict

 from pathlib2 import Path

 from .helper import StorageHelper
 from .util import quote_url
 from ..config import get_cache_dir, deferred_config
 from ..debugging.log import LoggerRoot
+from ..utilities.locks.utils import Lock as FileLock
+from ..utilities.locks.exceptions import LockException


 class CacheManager(object):
@@ -19,17 +26,26 @@ class CacheManager(object):
     __local_to_remote_url_lookup_max_size = 1024
     _context_to_folder_lookup = dict()
     _default_context_folder_template = "{0}_artifacts_archive_{1}"
+    _lockfile_prefix = '.lock.'
+    _lockfile_suffix = '.clearml'

     class CacheContext(object):
+        _folder_locks = dict()  # type: Dict[str, FileLock]
+        _lockfile_at_exit_cb = None
+
         def __init__(self, cache_context, default_cache_file_limit=10):
+            # type: (str, int) -> None
             self._context = str(cache_context)
             self._file_limit = int(default_cache_file_limit)
+            self._rlock = RLock()

         def set_cache_limit(self, cache_file_limit):
+            # type: (int) -> int
             self._file_limit = max(self._file_limit, int(cache_file_limit))
             return self._file_limit

         def get_local_copy(self, remote_url, force_download):
+            # type: (str, bool) -> Optional[str]
             helper = StorageHelper.get(remote_url)
             if not helper:
                 raise ValueError("Storage access failed: {}".format(remote_url))
@@ -59,6 +75,7 @@ class CacheManager(object):

         @staticmethod
         def upload_file(local_file, remote_url, wait_for_upload=True, retries=1):
+            # type: (str, str, bool, int) -> Optional[str]
             helper = StorageHelper.get(remote_url)
             result = helper.upload(
                 local_file, remote_url, async_enable=not wait_for_upload, retries=retries,
@@ -68,11 +85,13 @@ class CacheManager(object):

         @classmethod
         def get_hashed_url_file(cls, url):
+            # type: (str) -> str
             str_hash = hashlib.md5(url.encode()).hexdigest()
             filename = url.split("/")[-1]
             return "{}.{}".format(str_hash, quote_url(filename))

         def get_cache_folder(self):
+            # type: () -> str
             """
             :return: full path to current contexts cache folder
             """
@@ -82,6 +101,7 @@ class CacheManager(object):
             return folder.as_posix()

         def get_cache_file(self, remote_url=None, local_filename=None):
+            # type: (Optional[str], Optional[str]) -> Tuple[str, Optional[int]]
             """
             :param remote_url: check if we have the remote url in our cache
             :param local_filename: if local_file is given, search for the local file/directory in the cache folder
@@ -123,10 +143,52 @@ class CacheManager(object):
                 except Exception:
                     pass

+            # first exclude lock files
+            lock_files = dict()
+            files = []
+            for f in sorted(folder.iterdir(), reverse=True, key=sort_max_access_time):
+                if f.name.startswith(CacheManager._lockfile_prefix) and f.name.endswith(CacheManager._lockfile_suffix):
+                    # parse the lock filename
+                    name = f.name[len(CacheManager._lockfile_prefix):-len(CacheManager._lockfile_suffix)]
+                    num, _, name = name.partition('.')
+                    lock_files[name] = lock_files.get(name, []) + [f.as_posix()]
+                else:
+                    files.append(f)
+
+            # remove new lock files from the list (we will delete them when time comes)
+            for f in files[:self._file_limit]:
+                lock_files.pop(f.name, None)
+
             # delete old files
-            files = sorted(folder.iterdir(), reverse=True, key=sort_max_access_time)
             files = files[self._file_limit:]
-            for f in files:
+            for i, f in enumerate(files):
+                if i < self._file_limit:
+                    continue
+
+                # check if the file is in the lock folder list:
+                folder_lock = self._folder_locks.get(f.absolute().as_posix())
+                if folder_lock:
+                    # pop from lock files
+                    lock_files.pop(f.name, None)
+                    continue
+
+                # check if someone else holds the lock file
+                locks = lock_files.get(f.name, [])
+                for l in locks:
+                    try:
+                        a_lock = FileLock(filename=l)
+                        a_lock.acquire(timeout=0)
+                        a_lock.release()
+                        a_lock.delete_lock_file()
+                        del a_lock
+                    except LockException:
+                        # someone have the lock skip the file
+                        continue
+
+                # if we got here we need to pop from the lock_files, later we will delete the leftover entries
+                lock_files.pop(f.name, None)
+
+                # if we are here we can delete the file
                 if not f.is_dir():
                     # noinspection PyBroadException
                     try:
@@ -135,23 +197,93 @@ class CacheManager(object):
                         pass
                 else:
                     try:
-                        shutil.rmtree(f)
+                        shutil.rmtree(f.as_posix())
                     except Exception as e:
                         # failed deleting folder
                         LoggerRoot.get_base_logger().debug(
                             "Exception {}\nFailed deleting folder {}".format(e, f)
                         )

+            # cleanup old lock files
+            for lock_files in lock_files.values():
+                for f in lock_files:
+                    # noinspection PyBroadException
+                    try:
+                        os.unlink(f)
+                    except BaseException:
+                        pass
+
             # if file doesn't exist, return file size None
             # noinspection PyBroadException
             try:
                 size = new_file.stat().st_size if new_file_exists else None
             except Exception:
                 size = None

             return new_file.as_posix(), size

+        def lock_cache_folder(self, local_path):
+            # type: (Union[str, Path]) -> ()
+            """
+            Lock a specific cache folder, making sure it will not be deleted in the next
+            cache cleanup round
+
+            :param local_path: Path (str/Path) to a sub-folder inside the instance cache folder
+            """
+            local_path = Path(local_path).absolute()
+            self._rlock.acquire()
+            if self._lockfile_at_exit_cb is None:
+                self._lockfile_at_exit_cb = True
+                atexit.register(self._lock_file_cleanup_callback)
+
+            lock = self._folder_locks.get(local_path.as_posix())
+            i = 0
+            # try to create a lock if we do not already have one (if we do, we assume it is locked)
+            while not lock:
+                lock_path = local_path.parent / '{}{:03d}.{}{}'.format(
+                    CacheManager._lockfile_prefix, i, local_path.name, CacheManager._lockfile_suffix)
+                lock = FileLock(filename=lock_path)
+
+                # try to lock folder (if we failed to create lock, try nex number)
+                try:
+                    lock.acquire(timeout=0)
+                    break
+                except LockException:
+                    # failed locking, maybe someone else already locked it.
+                    del lock
+                    lock = None
+                    i += 1
+
+            # store lock
+            self._folder_locks[local_path.as_posix()] = lock
+            self._rlock.release()
+
+        def unlock_cache_folder(self, local_path):
+            # type: (Union[str, Path]) -> ()
+            """
+            Lock a specific cache folder, making sure it will not be deleted in the next
+            cache cleanup round
+
+            :param local_path: Path (str/Path) to a sub-folder inside the instance cache folder
+            """
+            local_path = Path(local_path).absolute()
+            self._rlock.acquire()
+            # pop lock
+            lock = self._folder_locks.pop(local_path.as_posix(), None)
+            if lock:
+                lock.release()
+                lock.delete_lock_file()
+                del lock
+
+            self._rlock.release()
+
+        @classmethod
+        def _lock_file_cleanup_callback(cls):
+            for lock in cls._folder_locks.values():
+                lock.release()
+                lock.delete_lock_file()
+
     @classmethod
     def get_cache_manager(cls, cache_context=None, cache_file_limit=None):
+        # type: (Optional[str], Optional[int]) -> CacheManager.CacheContext
         cache_context = cache_context or cls._default_context
         if cache_context not in cls.__cache_managers:
             cls.__cache_managers[cache_context] = cls.CacheContext(
@@ -165,6 +297,7 @@ class CacheManager(object):

     @staticmethod
     def get_remote_url(local_copy_path):
+        # type: (str) -> str
         if not CacheManager._local_to_remote_url_lookup:
             return local_copy_path
@@ -178,6 +311,7 @@ class CacheManager(object):

     @staticmethod
     def _add_remote_url(remote_url, local_copy_path):
+        # type: (str, str) -> ()
         # so that we can disable the cache lookup altogether
         if CacheManager._local_to_remote_url_lookup is None:
             return
@@ -206,11 +340,13 @@ class CacheManager(object):

     @classmethod
     def set_context_folder_lookup(cls, context, name_template):
+        # type: (str, str) -> str
         cls._context_to_folder_lookup[str(context)] = str(name_template)
         return str(name_template)

     @classmethod
     def get_context_folder_lookup(cls, context):
+        # type: (Optional[str]) -> str
         if not context:
             return cls._default_context_folder_template
         return cls._context_to_folder_lookup.get(str(context), cls._default_context_folder_template)
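
The new lock_cache_folder() / unlock_cache_folder() pair lets a caller pin a sub-folder of the context cache so the LRU cleanup in get_cache_file() will skip it; the lock is backed by a '.lock.NNN.<folder>.clearml' file created next to the folder, and leftover locks are released by an atexit callback. A hedged usage sketch, assuming the module path clearml.storage.cache and an illustrative context name and sub-folder:

    from clearml.storage.cache import CacheManager

    cache = CacheManager.get_cache_manager(cache_context='datasets')  # illustrative context
    target = cache.get_cache_folder() + '/my_dataset_parts'           # illustrative sub-folder

    cache.lock_cache_folder(target)
    try:
        # ... download / extract files into `target` while it is protected
        #     from the next cache cleanup round ...
        pass
    finally:
        cache.unlock_cache_folder(target)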

View File

@@ -97,8 +97,16 @@ class StorageManager(object):
         ).set_cache_limit(cache_file_limit)

     @classmethod
-    def _extract_to_cache(cls, cached_file, name, cache_context=None, target_folder=None, cache_path_encoding=None):
-        # type: (str, str, Optional[str], Optional[str], Optional[str]) -> str
+    def _extract_to_cache(
+            cls,
+            cached_file,  # type: str
+            name,  # type: str
+            cache_context=None,  # type: Optional[str]
+            target_folder=None,  # type: Optional[str]
+            cache_path_encoding=None,  # type: Optional[str]
+            force=False,  # type: bool
+    ):
+        # type: (...) -> str
         """
         Extract cached file to cache folder
         :param str cached_file: local copy of archive file
@@ -108,6 +116,7 @@ class StorageManager(object):
         :param str cache_path_encoding: specify representation of the local path of the cached files,
             this will always point to local cache folder, even if we have direct access file.
            Used for extracting the cached archived based on cache_path_encoding
+        :param bool force: Force archive extraction even if target folder exists
         :return: cached folder containing the extracted archive content
         """
         if not cached_file:
@@ -133,7 +142,7 @@ class StorageManager(object):
         target_folder = cache_folder / CacheManager.get_context_folder_lookup(
             cache_context).format(archive_suffix, name)

-        if target_folder.is_dir():
+        if target_folder.is_dir() and not force:
             # noinspection PyBroadException
             try:
                 target_folder.touch(exist_ok=True)
@@ -143,9 +152,14 @@ class StorageManager(object):
         base_logger = LoggerRoot.get_base_logger()
         try:
-            temp_target_folder = cache_folder / "{0}_{1}_{2}".format(
-                target_folder.name, time() * 1000, str(random()).replace('.', ''))
-            temp_target_folder.mkdir(parents=True, exist_ok=True)
+            # if target folder exists, meaning this is forced ao we extract directly into target folder
+            if target_folder.is_dir():
+                temp_target_folder = target_folder
+            else:
+                temp_target_folder = cache_folder / "{0}_{1}_{2}".format(
+                    target_folder.name, time() * 1000, str(random()).replace('.', ''))
+                temp_target_folder.mkdir(parents=True, exist_ok=True)
+
             if suffix == ".zip":
                 ZipFile(cached_file.as_posix()).extractall(path=temp_target_folder.as_posix())
             elif suffix == ".tar.gz":
@@ -155,23 +169,24 @@ class StorageManager(object):
                 with tarfile.open(cached_file.as_posix(), mode='r:gz') as file:
                     file.extractall(temp_target_folder.as_posix())

-            # we assume we will have such folder if we already extract the file
-            # noinspection PyBroadException
-            try:
-                # if rename fails, it means that someone else already manged to extract the file, delete the current
-                # folder and return the already existing cached zip folder
-                shutil.move(temp_target_folder.as_posix(), target_folder.as_posix())
-            except Exception:
-                if target_folder.exists():
-                    target_folder.touch(exist_ok=True)
-                else:
-                    base_logger.warning(
-                        "Failed renaming {0} to {1}".format(temp_target_folder.as_posix(), target_folder.as_posix()))
-                try:
-                    shutil.rmtree(temp_target_folder.as_posix())
-                except Exception as ex:
-                    base_logger.warning(
-                        "Exception {}\nFailed deleting folder {}".format(ex, temp_target_folder.as_posix()))
+            if temp_target_folder != target_folder:
+                # we assume we will have such folder if we already extract the file
+                # noinspection PyBroadException
+                try:
+                    # if rename fails, it means that someone else already manged to extract the file, delete the current
+                    # folder and return the already existing cached zip folder
+                    shutil.move(temp_target_folder.as_posix(), target_folder.as_posix())
+                except Exception:
+                    if target_folder.exists():
+                        target_folder.touch(exist_ok=True)
+                    else:
+                        base_logger.warning(
+                            "Failed renaming {0} to {1}".format(temp_target_folder.as_posix(), target_folder.as_posix()))
+                    try:
+                        shutil.rmtree(temp_target_folder.as_posix())
+                    except Exception as ex:
+                        base_logger.warning(
+                            "Exception {}\nFailed deleting folder {}".format(ex, temp_target_folder.as_posix()))
         except Exception as ex:
             # failed extracting the file:
             base_logger.warning(
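
The new force flag makes _extract_to_cache() unpack the archive even when the target cache folder already exists, extracting straight into that folder instead of going through a temporary folder and a rename; this is what a multi-chunk download needs when several chunk archives are merged into one dataset folder. A hedged sketch of the internal call, assuming the module path clearml.storage.manager; the archive path, name and context are placeholders:

    from clearml.storage.manager import StorageManager

    # Internal helper (leading underscore): re-extract into the cache even if
    # the destination folder is already present.
    folder = StorageManager._extract_to_cache(
        cached_file='/tmp/dataset_chunk_0.zip',  # placeholder local archive
        name='dataset_chunk_0',                  # placeholder cache entry name
        cache_context='datasets',                # placeholder cache context
        force=True,                              # new flag from this diff
    )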

View File

@@ -179,6 +179,22 @@ class Lock(object):
                 pass
             self.fh = None

+    def delete_lock_file(self):
+        # type: () -> bool
+        """
+        Remove the local file used for locking (fail if file is locked)
+
+        :return: True if successful
+        """
+        if self.fh:
+            return False
+        # noinspection PyBroadException
+        try:
+            os.unlink(path=self.filename)
+        except BaseException:
+            return False
+        return True
+
     def _get_fh(self):
         '''Get a new filehandle'''
         return open(self.filename, self.mode, **self.file_open_kwargs)
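
delete_lock_file() refuses to remove the file while the handle is still open (self.fh), which matches how the cache cleanup above uses it: acquire with timeout=0, release, then delete. A small sketch using the import path the cache module itself uses; the lock-file path is a placeholder:

    from clearml.utilities.locks.utils import Lock as FileLock
    from clearml.utilities.locks.exceptions import LockException

    lock = FileLock(filename='/tmp/.lock.000.example.clearml')  # placeholder path
    try:
        lock.acquire(timeout=0)      # non-blocking; raises LockException if held elsewhere
    except LockException:
        print('lock is held by another process, leaving the file in place')
    else:
        lock.release()               # close the file handle first ...
        lock.delete_lock_file()      # ... then the lock file itself can be removed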