Mirror of https://github.com/clearml/clearml (synced 2025-06-23 01:55:38 +00:00)

commit 844c01e15b: Add clearml-Data (Datasets) multi-chunk support
parent 0dd9ba8adc
@@ -179,8 +179,8 @@ class PrintPatchLogger(object):
     cr_flush_period = None

     def __init__(self, stream, logger=None, level=logging.INFO):
-        if self.__class__.cr_flush_period is None:
-            self.__class__.cr_flush_period = config.get("development.worker.console_cr_flush_period", 0)
+        if PrintPatchLogger.cr_flush_period is None:
+            PrintPatchLogger.cr_flush_period = config.get("development.worker.console_cr_flush_period", 0)
         PrintPatchLogger.patched = True
         self._terminal = stream
         self._log = logger
@@ -1266,6 +1266,34 @@ class Task(IdObjectBase, AccessMixin, SetupUploadMixin):
             self._edit(execution=execution)
         return self.data.execution.artifacts or []

+    def _delete_artifacts(self, artifact_names):
+        # type: (Sequence[str]) -> bool
+        """
+        Delete a list of artifacts, by artifact name, from the Task.
+
+        :param list artifact_names: list of artifact names
+        :return: True if successful
+        """
+        if not Session.check_min_api_version('2.3'):
+            return False
+        if not isinstance(artifact_names, (list, tuple)):
+            raise ValueError('Expected artifact names as List[str]')
+
+        with self._edit_lock:
+            if Session.check_min_api_version("2.13") and not self._offline_mode:
+                req = tasks.DeleteArtifactsRequest(
+                    task=self.task_id, artifacts=[{"key": n, "mode": "output"} for n in artifact_names], force=True)
+                res = self.send(req, raise_on_errors=False)
+                if not res or not res.response or not res.response.deleted:
+                    return False
+                self.reload()
+            else:
+                self.reload()
+                execution = self.data.execution
+                execution.artifacts = [a for a in execution.artifacts or [] if a.key not in artifact_names]
+                self._edit(execution=execution)
+        return self.data.execution.artifacts or []
+
     def _set_model_design(self, design=None):
         # type: (str) -> ()
         with self._edit_lock:
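Not part of the diff: a hypothetical sketch of how the new internal helper above could be exercised from user code, assuming a configured ClearML server. The project, task, and artifact names are placeholders, and _delete_artifacts() remains a private API.

from clearml import Task

# placeholder project/task names; Task.init assumes a reachable ClearML server
task = Task.init(project_name="examples", task_name="artifact cleanup")
task.upload_artifact("chunk_000", artifact_object={"rows": 100})

# remove the output artifact by key, using the private helper added in this hunk
task._delete_artifacts(["chunk_000"])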
@@ -85,7 +85,7 @@ def get_epoch_beginning_of_time(timezone_info=None):
     return datetime(1970, 1, 1).replace(tzinfo=timezone_info if timezone_info else utc_timezone)


-def get_single_result(entity, query, results, log=None, show_results=10, raise_on_error=True, sort_by_date=True):
+def get_single_result(entity, query, results, log=None, show_results=1, raise_on_error=True, sort_by_date=True):
     if not results:
         if not raise_on_error:
             return None
@@ -96,8 +96,12 @@ def get_single_result(entity, query, results, log=None, show_results=10, raise_o
         if show_results:
             if not log:
                 log = get_logger()
-            log.warning('More than one {entity} found when searching for `{query}`'
-                        ' (showing first {show_results} {entity}s follow)'.format(**locals()))
+            if show_results > 1:
+                log.warning('{num} {entity} found when searching for `{query}`'
+                            ' (showing first {show_results} {entity}s follow)'.format(num=len(results), **locals()))
+            else:
+                log.warning('{num} {entity} found when searching for `{query}`'.format(num=len(results), **locals()))

         if sort_by_date:
             relative_time = get_epoch_beginning_of_time()
             # sort results based on timestamp and return the newest one
@@ -70,9 +70,7 @@ def cli():
     subparsers = parser.add_subparsers(help='Dataset actions', dest='command')

     create = subparsers.add_parser('create', help='Create a new dataset')
-    create.add_argument('--parents', type=str, nargs='*',
-                        help='[Optional] Specify dataset parents IDs (i.e. merge all parents). '
-                             'Example: a17b4fID1 f0ee5ID2 a17b4f09eID3')
+    create.add_argument('--parents', type=str, nargs='*', help='Specify dataset parents IDs (i.e. merge all parents)')
     create.add_argument('--project', type=str, required=False, default=None, help='Dataset project name')
     create.add_argument('--name', type=str, required=True, default=None, help='Dataset name')
     create.add_argument('--tags', type=str, nargs='*', help='Dataset user Tags')
@@ -100,20 +98,22 @@ def cli():
                       help='Local folder to sync (support for wildcard selection). '
                            'Example: ~/data/*.jpg')
     sync.add_argument('--parents', type=str, nargs='*',
-                      help='[Optional] Specify dataset parents IDs (i.e. merge all parents). '
-                           'Example: a17b4fID1 f0ee5ID2 a17b4f09eID3')
+                      help='[Optional - Create new dataset] Specify dataset parents IDs (i.e. merge all parents)')
     sync.add_argument('--project', type=str, required=False, default=None,
-                      help='[Optional] Dataset project name')
+                      help='[Optional - Create new dataset] Dataset project name')
     sync.add_argument('--name', type=str, required=False, default=None,
-                      help='[Optional] Dataset project name')
+                      help='[Optional - Create new dataset] Dataset project name')
     sync.add_argument('--tags', type=str, nargs='*',
-                      help='[Optional] Dataset user Tags')
+                      help='[Optional - Create new dataset] Dataset user Tags')
     sync.add_argument('--storage', type=str, default=None,
                       help='Remote storage to use for the dataset files (default: files_server). '
                            'Examples: \'s3://bucket/data\', \'gs://bucket/data\', \'azure://bucket/data\', '
                            '\'/mnt/shared/folder/data\'')
     sync.add_argument('--skip-close', action='store_true', default=False,
                       help='Do not auto close dataset after syncing folders')
+    sync.add_argument('--chunk-size', default=-1, type=int,
+                      help='Set dataset artifact chunk size in MB. Default -1, unlimited size. '
+                           'Example: 512, dataset will be split and uploaded in 512mb chunks.')
     sync.add_argument('--verbose', action='store_true', default=False, help='Verbose reporting')
     sync.set_defaults(func=ds_sync)

@@ -136,6 +136,9 @@ def cli():
                         help='Remote storage to use for the dataset files (default: files_server). '
                              'Examples: \'s3://bucket/data\', \'gs://bucket/data\', \'azure://bucket/data\', '
                              '\'/mnt/shared/folder/data\'')
+    upload.add_argument('--chunk-size', default=-1, type=int,
+                        help='Set dataset artifact chunk size in MB. Default -1, unlimited size. '
+                             'Example: 512, dataset will be split and uploaded in 512mb chunks.')
     upload.add_argument('--verbose', default=False, action='store_true', help='Verbose reporting')
     upload.set_defaults(func=ds_upload)

@@ -148,6 +151,9 @@ def cli():
                               '\'/mnt/shared/folder/data\'')
     finalize.add_argument('--disable-upload', action='store_true', default=False,
                           help='Disable automatic upload when closing the dataset')
+    finalize.add_argument('--chunk-size', default=-1, type=int,
+                          help='Set dataset artifact chunk size in MB. Default -1, unlimited size. '
+                               'Example: 512, dataset will be split and uploaded in 512mb chunks.')
     finalize.add_argument('--verbose', action='store_true', default=False, help='Verbose reporting')
     finalize.set_defaults(func=ds_close)

@@ -216,6 +222,14 @@ def cli():
     get.add_argument('--link', type=str, default=None,
                      help='Create a soft link (not supported on Windows) to a '
                           'read-only cached folder containing the dataset')
+    get.add_argument('--part', type=int, default=None,
+                     help='Retrieve a partial copy of the dataset. '
+                          'Part number (0 to `num-parts`-1) of total parts --num-parts.')
+    get.add_argument('--num-parts', type=int, default=None,
+                     help='Total number of parts to divide the dataset to. '
+                          'Notice minimum retrieved part is a single chunk in a dataset (or its parents).'
+                          'Example: Dataset gen4, with 3 parents, each with a single chunk, '
+                          'can be divided into 4 parts')
     get.add_argument('--overwrite', action='store_true', default=False, help='If True, overwrite the target folder')
     get.add_argument('--verbose', action='store_true', default=False, help='Verbose reporting')
     get.set_defaults(func=ds_get)
@@ -274,7 +288,7 @@ def ds_get(args):
             pass
     if args.copy:
         ds_folder = args.copy
-        ds.get_mutable_local_copy(target_folder=ds_folder)
+        ds.get_mutable_local_copy(target_folder=ds_folder, part=args.part, num_parts=args.num_parts)
     else:
         if args.link:
             Path(args.link).mkdir(parents=True, exist_ok=True)
@@ -286,7 +300,7 @@ def ds_get(args):
                 Path(args.link).unlink()
             except Exception:
                 raise ValueError("Target directory {} is not empty. Use --overwrite.".format(args.link))
-        ds_folder = ds.get_local_copy()
+        ds_folder = ds.get_local_copy(part=args.part, num_parts=args.num_parts)
         if args.link:
             os.symlink(ds_folder, args.link)
             ds_folder = args.link
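For orientation (not part of the diff), a minimal sketch of the SDK calls the new --part/--num-parts flags feed into through ds_get above; the dataset ID is a placeholder.

from clearml import Dataset

ds = Dataset.get(dataset_id="<dataset-id>")  # same lookup ds_get performs
# retrieve only part 0 of 4; the smallest retrievable unit is a single chunk
local_folder = ds.get_local_copy(part=0, num_parts=4)
print(local_folder)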
@@ -372,7 +386,10 @@ def ds_close(args):
             raise ValueError("Pending uploads, cannot finalize dataset. run `clearml-data upload`")
         # upload the files
         print("Pending uploads, starting dataset upload to {}".format(args.storage or ds.get_default_storage()))
-        ds.upload(show_progress=True, verbose=args.verbose, output_url=args.storage or None)
+        ds.upload(show_progress=True,
+                  verbose=args.verbose,
+                  output_url=args.storage or None,
+                  chunk_size=args.chunk_size or -1,)

     ds.finalize()
     print('Dataset closed and finalized')
@@ -399,7 +416,7 @@ def ds_upload(args):
     check_null_id(args)
     print_args(args)
     ds = Dataset.get(dataset_id=args.id)
-    ds.upload(verbose=args.verbose, output_url=args.storage or None)
+    ds.upload(verbose=args.verbose, output_url=args.storage or None, chunk_size=args.chunk_size or -1)
     print('Dataset upload completed')
     return 0

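For orientation (not part of the diff), a sketch of the equivalent SDK flow behind `clearml-data upload --chunk-size 512`; project, name, and path values are placeholders.

from clearml import Dataset

ds = Dataset.create(dataset_project="examples", dataset_name="multi_chunk_demo")
ds.add_files(path="/path/to/local/data")       # stage local files
ds.upload(show_progress=True, chunk_size=512)  # split dataset artifacts into ~512 MB chunks
ds.finalize()                                  # same close step ds_close performs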
@@ -443,7 +460,10 @@ def ds_sync(args):
     if ds.is_dirty():
         # upload the files
         print("Pending uploads, starting dataset upload to {}".format(args.storage or ds.get_default_storage()))
-        ds.upload(show_progress=True, verbose=args.verbose, output_url=args.storage or None)
+        ds.upload(show_progress=True,
+                  verbose=args.verbose,
+                  output_url=args.storage or None,
+                  chunk_size=args.chunk_size or -1, )

     ds.finalize()
     print('Dataset closed and finalized')
(File diff suppressed because it is too large)
@@ -1,13 +1,20 @@
+import atexit
 import hashlib
+import os
 import shutil

 from collections import OrderedDict
+from threading import RLock
+from typing import Union, Optional, Tuple, Dict

 from pathlib2 import Path

 from .helper import StorageHelper
 from .util import quote_url
 from ..config import get_cache_dir, deferred_config
 from ..debugging.log import LoggerRoot
+from ..utilities.locks.utils import Lock as FileLock
+from ..utilities.locks.exceptions import LockException


 class CacheManager(object):
@@ -19,17 +26,26 @@ class CacheManager(object):
     __local_to_remote_url_lookup_max_size = 1024
     _context_to_folder_lookup = dict()
     _default_context_folder_template = "{0}_artifacts_archive_{1}"
+    _lockfile_prefix = '.lock.'
+    _lockfile_suffix = '.clearml'

     class CacheContext(object):
+        _folder_locks = dict()  # type: Dict[str, FileLock]
+        _lockfile_at_exit_cb = None
+
         def __init__(self, cache_context, default_cache_file_limit=10):
+            # type: (str, int) -> None
             self._context = str(cache_context)
             self._file_limit = int(default_cache_file_limit)
+            self._rlock = RLock()

         def set_cache_limit(self, cache_file_limit):
+            # type: (int) -> int
             self._file_limit = max(self._file_limit, int(cache_file_limit))
             return self._file_limit

         def get_local_copy(self, remote_url, force_download):
+            # type: (str, bool) -> Optional[str]
             helper = StorageHelper.get(remote_url)
             if not helper:
                 raise ValueError("Storage access failed: {}".format(remote_url))
@@ -59,6 +75,7 @@ class CacheManager(object):

         @staticmethod
         def upload_file(local_file, remote_url, wait_for_upload=True, retries=1):
+            # type: (str, str, bool, int) -> Optional[str]
             helper = StorageHelper.get(remote_url)
             result = helper.upload(
                 local_file, remote_url, async_enable=not wait_for_upload, retries=retries,
@@ -68,11 +85,13 @@ class CacheManager(object):

         @classmethod
         def get_hashed_url_file(cls, url):
+            # type: (str) -> str
             str_hash = hashlib.md5(url.encode()).hexdigest()
             filename = url.split("/")[-1]
             return "{}.{}".format(str_hash, quote_url(filename))

         def get_cache_folder(self):
+            # type: () -> str
             """
             :return: full path to current contexts cache folder
             """
@@ -82,6 +101,7 @@ class CacheManager(object):
             return folder.as_posix()

         def get_cache_file(self, remote_url=None, local_filename=None):
+            # type: (Optional[str], Optional[str]) -> Tuple[str, Optional[int]]
             """
             :param remote_url: check if we have the remote url in our cache
             :param local_filename: if local_file is given, search for the local file/directory in the cache folder
@@ -123,10 +143,52 @@ class CacheManager(object):
             except Exception:
                 pass

+            # first exclude lock files
+            lock_files = dict()
+            files = []
+            for f in sorted(folder.iterdir(), reverse=True, key=sort_max_access_time):
+                if f.name.startswith(CacheManager._lockfile_prefix) and f.name.endswith(CacheManager._lockfile_suffix):
+                    # parse the lock filename
+                    name = f.name[len(CacheManager._lockfile_prefix):-len(CacheManager._lockfile_suffix)]
+                    num, _, name = name.partition('.')
+                    lock_files[name] = lock_files.get(name, []) + [f.as_posix()]
+                else:
+                    files.append(f)
+
+            # remove new lock files from the list (we will delete them when time comes)
+            for f in files[:self._file_limit]:
+                lock_files.pop(f.name, None)
+
             # delete old files
-            files = sorted(folder.iterdir(), reverse=True, key=sort_max_access_time)
             files = files[self._file_limit:]
-            for f in files:
+            for i, f in enumerate(files):
+                if i < self._file_limit:
+                    continue
+
+                # check if the file is in the lock folder list:
+                folder_lock = self._folder_locks.get(f.absolute().as_posix())
+                if folder_lock:
+                    # pop from lock files
+                    lock_files.pop(f.name, None)
+                    continue
+
+                # check if someone else holds the lock file
+                locks = lock_files.get(f.name, [])
+                for l in locks:
+                    try:
+                        a_lock = FileLock(filename=l)
+                        a_lock.acquire(timeout=0)
+                        a_lock.release()
+                        a_lock.delete_lock_file()
+                        del a_lock
+                    except LockException:
+                        # someone have the lock skip the file
+                        continue
+
+                # if we got here we need to pop from the lock_files, later we will delete the leftover entries
+                lock_files.pop(f.name, None)
+
+                # if we are here we can delete the file
                 if not f.is_dir():
                     # noinspection PyBroadException
                     try:
@@ -135,23 +197,93 @@ class CacheManager(object):
                         pass
                 else:
                     try:
-                        shutil.rmtree(f)
+                        shutil.rmtree(f.as_posix())
                     except Exception as e:
                         # failed deleting folder
                         LoggerRoot.get_base_logger().debug(
                             "Exception {}\nFailed deleting folder {}".format(e, f)
                         )

+            # cleanup old lock files
+            for lock_files in lock_files.values():
+                for f in lock_files:
+                    # noinspection PyBroadException
+                    try:
+                        os.unlink(f)
+                    except BaseException:
+                        pass
+
             # if file doesn't exist, return file size None
             # noinspection PyBroadException
             try:
                 size = new_file.stat().st_size if new_file_exists else None
             except Exception:
                 size = None

             return new_file.as_posix(), size

+        def lock_cache_folder(self, local_path):
+            # type: (Union[str, Path]) -> ()
+            """
+            Lock a specific cache folder, making sure it will not be deleted in the next
+            cache cleanup round
+            :param local_path: Path (str/Path) to a sub-folder inside the instance cache folder
+            """
+            local_path = Path(local_path).absolute()
+            self._rlock.acquire()
+            if self._lockfile_at_exit_cb is None:
+                self._lockfile_at_exit_cb = True
+                atexit.register(self._lock_file_cleanup_callback)
+
+            lock = self._folder_locks.get(local_path.as_posix())
+            i = 0
+            # try to create a lock if we do not already have one (if we do, we assume it is locked)
+            while not lock:
+                lock_path = local_path.parent / '{}{:03d}.{}{}'.format(
+                    CacheManager._lockfile_prefix, i, local_path.name, CacheManager._lockfile_suffix)
+                lock = FileLock(filename=lock_path)
+
+                # try to lock folder (if we failed to create lock, try nex number)
+                try:
+                    lock.acquire(timeout=0)
+                    break
+                except LockException:
+                    # failed locking, maybe someone else already locked it.
+                    del lock
+                    lock = None
+                    i += 1
+
+            # store lock
+            self._folder_locks[local_path.as_posix()] = lock
+            self._rlock.release()
+
+        def unlock_cache_folder(self, local_path):
+            # type: (Union[str, Path]) -> ()
+            """
+            Lock a specific cache folder, making sure it will not be deleted in the next
+            cache cleanup round
+            :param local_path: Path (str/Path) to a sub-folder inside the instance cache folder
+            """
+            local_path = Path(local_path).absolute()
+            self._rlock.acquire()
+            # pop lock
+            lock = self._folder_locks.pop(local_path.as_posix(), None)
+            if lock:
+                lock.release()
+                lock.delete_lock_file()
+                del lock
+
+            self._rlock.release()
+
+        @classmethod
+        def _lock_file_cleanup_callback(cls):
+            for lock in cls._folder_locks.values():
+                lock.release()
+                lock.delete_lock_file()
+
     @classmethod
     def get_cache_manager(cls, cache_context=None, cache_file_limit=None):
+        # type: (Optional[str], Optional[int]) -> CacheManager.CacheContext
         cache_context = cache_context or cls._default_context
         if cache_context not in cls.__cache_managers:
             cls.__cache_managers[cache_context] = cls.CacheContext(
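A rough sketch (not part of the diff) of how the new cache-folder locks might be used around a cached copy. CacheManager.CacheContext is an internal class, the module path is assumed to be clearml.storage.cache, and the sub-folder name is a placeholder.

from pathlib2 import Path

from clearml.storage.cache import CacheManager

cache = CacheManager.get_cache_manager(cache_context="datasets")
copy_folder = Path(cache.get_cache_folder()) / "my_dataset_copy"  # placeholder sub-folder

cache.lock_cache_folder(copy_folder)   # creates a .lock.NNN.<name>.clearml file next to it
try:
    pass  # consume the cached copy here; the cleanup in get_cache_file() will skip it
finally:
    cache.unlock_cache_folder(copy_folder)  # release and delete the lock file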
@@ -165,6 +297,7 @@ class CacheManager(object):

     @staticmethod
     def get_remote_url(local_copy_path):
+        # type: (str) -> str
         if not CacheManager._local_to_remote_url_lookup:
             return local_copy_path

@@ -178,6 +311,7 @@ class CacheManager(object):

     @staticmethod
     def _add_remote_url(remote_url, local_copy_path):
+        # type: (str, str) -> ()
         # so that we can disable the cache lookup altogether
         if CacheManager._local_to_remote_url_lookup is None:
             return
@@ -206,11 +340,13 @@ class CacheManager(object):

     @classmethod
     def set_context_folder_lookup(cls, context, name_template):
+        # type: (str, str) -> str
         cls._context_to_folder_lookup[str(context)] = str(name_template)
         return str(name_template)

     @classmethod
     def get_context_folder_lookup(cls, context):
+        # type: (Optional[str]) -> str
         if not context:
             return cls._default_context_folder_template
         return cls._context_to_folder_lookup.get(str(context), cls._default_context_folder_template)
@@ -97,8 +97,16 @@ class StorageManager(object):
         ).set_cache_limit(cache_file_limit)

     @classmethod
-    def _extract_to_cache(cls, cached_file, name, cache_context=None, target_folder=None, cache_path_encoding=None):
-        # type: (str, str, Optional[str], Optional[str], Optional[str]) -> str
+    def _extract_to_cache(
+            cls,
+            cached_file,  # type: str
+            name,  # type: str
+            cache_context=None,  # type: Optional[str]
+            target_folder=None,  # type: Optional[str]
+            cache_path_encoding=None,  # type: Optional[str]
+            force=False,  # type: bool
+    ):
+        # type: (...) -> str
         """
         Extract cached file to cache folder
         :param str cached_file: local copy of archive file
@@ -108,6 +116,7 @@ class StorageManager(object):
         :param str cache_path_encoding: specify representation of the local path of the cached files,
             this will always point to local cache folder, even if we have direct access file.
             Used for extracting the cached archived based on cache_path_encoding
+        :param bool force: Force archive extraction even if target folder exists
         :return: cached folder containing the extracted archive content
         """
         if not cached_file:
@@ -133,7 +142,7 @@ class StorageManager(object):
         target_folder = cache_folder / CacheManager.get_context_folder_lookup(
             cache_context).format(archive_suffix, name)

-        if target_folder.is_dir():
+        if target_folder.is_dir() and not force:
             # noinspection PyBroadException
             try:
                 target_folder.touch(exist_ok=True)
@@ -143,9 +152,14 @@ class StorageManager(object):

         base_logger = LoggerRoot.get_base_logger()
         try:
-            temp_target_folder = cache_folder / "{0}_{1}_{2}".format(
-                target_folder.name, time() * 1000, str(random()).replace('.', ''))
-            temp_target_folder.mkdir(parents=True, exist_ok=True)
+            # if target folder exists, meaning this is forced ao we extract directly into target folder
+            if target_folder.is_dir():
+                temp_target_folder = target_folder
+            else:
+                temp_target_folder = cache_folder / "{0}_{1}_{2}".format(
+                    target_folder.name, time() * 1000, str(random()).replace('.', ''))
+                temp_target_folder.mkdir(parents=True, exist_ok=True)

             if suffix == ".zip":
                 ZipFile(cached_file.as_posix()).extractall(path=temp_target_folder.as_posix())
             elif suffix == ".tar.gz":
@@ -155,23 +169,24 @@ class StorageManager(object):
                 with tarfile.open(cached_file.as_posix(), mode='r:gz') as file:
                     file.extractall(temp_target_folder.as_posix())

-            # we assume we will have such folder if we already extract the file
-            # noinspection PyBroadException
-            try:
-                # if rename fails, it means that someone else already manged to extract the file, delete the current
-                # folder and return the already existing cached zip folder
-                shutil.move(temp_target_folder.as_posix(), target_folder.as_posix())
-            except Exception:
-                if target_folder.exists():
-                    target_folder.touch(exist_ok=True)
-                else:
-                    base_logger.warning(
-                        "Failed renaming {0} to {1}".format(temp_target_folder.as_posix(), target_folder.as_posix()))
-                try:
-                    shutil.rmtree(temp_target_folder.as_posix())
-                except Exception as ex:
-                    base_logger.warning(
-                        "Exception {}\nFailed deleting folder {}".format(ex, temp_target_folder.as_posix()))
+            if temp_target_folder != target_folder:
+                # we assume we will have such folder if we already extract the file
+                # noinspection PyBroadException
+                try:
+                    # if rename fails, it means that someone else already manged to extract the file, delete the current
+                    # folder and return the already existing cached zip folder
+                    shutil.move(temp_target_folder.as_posix(), target_folder.as_posix())
+                except Exception:
+                    if target_folder.exists():
+                        target_folder.touch(exist_ok=True)
+                    else:
+                        base_logger.warning(
+                            "Failed renaming {0} to {1}".format(temp_target_folder.as_posix(), target_folder.as_posix()))
+                    try:
+                        shutil.rmtree(temp_target_folder.as_posix())
+                    except Exception as ex:
+                        base_logger.warning(
+                            "Exception {}\nFailed deleting folder {}".format(ex, temp_target_folder.as_posix()))
         except Exception as ex:
             # failed extracting the file:
             base_logger.warning(
@@ -179,6 +179,22 @@ class Lock(object):
                 pass
             self.fh = None

+    def delete_lock_file(self):
+        # type: () -> bool
+        """
+        Remove the local file used for locking (fail if file is locked)
+
+        :return: True is successful
+        """
+        if self.fh:
+            return False
+        # noinspection PyBroadException
+        try:
+            os.unlink(path=self.filename)
+        except BaseException:
+            return False
+        return True
+
     def _get_fh(self):
         '''Get a new filehandle'''
         return open(self.filename, self.mode, **self.file_open_kwargs)
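A small sketch (not part of the diff) of the lock lifecycle the cache cleanup above relies on: acquire with timeout=0, release, then delete the on-disk lock file. The lock-file path is a placeholder; the import paths mirror the ones added in the cache.py hunk.

from clearml.utilities.locks.utils import Lock as FileLock
from clearml.utilities.locks.exceptions import LockException

lock = FileLock(filename="/tmp/.lock.000.example.clearml")
try:
    lock.acquire(timeout=0)   # fail immediately if another process holds the lock
except LockException:
    print("lock is held by another process, skipping")
else:
    lock.release()
    lock.delete_lock_file()   # new helper added above: remove the lock file from disk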