Support controlling the naming of the sub-folder created by StorageManager/CacheManager

This commit is contained in:
allegroai 2020-11-25 10:58:01 +02:00
parent 8f65f28d58
commit d78ee6c669
4 changed files with 91 additions and 20 deletions

View File

@ -17,6 +17,8 @@ class CacheManager(object):
_default_context = "global" _default_context = "global"
_local_to_remote_url_lookup = OrderedDict() _local_to_remote_url_lookup = OrderedDict()
__local_to_remote_url_lookup_max_size = 1024 __local_to_remote_url_lookup_max_size = 1024
_context_to_folder_lookup = dict()
_default_context_folder_template = "{0}_artifacts_archive_{1}"
class CacheContext(object): class CacheContext(object):
def __init__(self, cache_context, default_cache_file_limit=10): def __init__(self, cache_context, default_cache_file_limit=10):
@ -43,7 +45,7 @@ class CacheManager(object):
return direct_access return direct_access
# check if we already have the file in our cache # check if we already have the file in our cache
cached_file, cached_size = self._get_cache_file(remote_url) cached_file, cached_size = self.get_cache_file(remote_url)
if cached_size is not None and not force_download: if cached_size is not None and not force_download:
CacheManager._add_remote_url(remote_url, cached_file) CacheManager._add_remote_url(remote_url, cached_file)
return cached_file return cached_file
@ -70,9 +72,19 @@ class CacheManager(object):
filename = url.split("/")[-1] filename = url.split("/")[-1]
return "{}.{}".format(str_hash, quote_url(filename)) return "{}.{}".format(str_hash, quote_url(filename))
def _get_cache_file(self, remote_url): def get_cache_folder(self):
"""
:return: full path to current contexts cache folder
"""
folder = Path(
get_cache_dir() / CacheManager._storage_manager_folder / self._context
)
return folder.as_posix()
def get_cache_file(self, remote_url=None, local_filename=None):
""" """
:param remote_url: check if we have the remote url in our cache :param remote_url: check if we have the remote url in our cache
:param local_filename: if local_file is given, search for the local file/directory in the cache folder
:return: full path to file name, current file size or None :return: full path to file name, current file size or None
""" """
def safe_time(x): def safe_time(x):
@ -101,7 +113,7 @@ class CacheManager(object):
get_cache_dir() / CacheManager._storage_manager_folder / self._context get_cache_dir() / CacheManager._storage_manager_folder / self._context
) )
folder.mkdir(parents=True, exist_ok=True) folder.mkdir(parents=True, exist_ok=True)
local_filename = self._get_hashed_url_file(remote_url) local_filename = local_filename or self._get_hashed_url_file(remote_url)
new_file = folder / local_filename new_file = folder / local_filename
new_file_exists = new_file.exists() new_file_exists = new_file.exists()
if new_file_exists: if new_file_exists:
@ -190,3 +202,14 @@ class CacheManager(object):
if len(CacheManager._local_to_remote_url_lookup) > CacheManager.__local_to_remote_url_lookup_max_size: if len(CacheManager._local_to_remote_url_lookup) > CacheManager.__local_to_remote_url_lookup_max_size:
# pop the first item (FIFO) # pop the first item (FIFO)
CacheManager._local_to_remote_url_lookup.popitem(last=False) CacheManager._local_to_remote_url_lookup.popitem(last=False)
@classmethod
def set_context_folder_lookup(cls, context, name_template):
cls._context_to_folder_lookup[str(context)] = str(name_template)
return str(name_template)
@classmethod
def get_context_folder_lookup(cls, context):
if not context:
return cls._default_context_folder_template
return cls._context_to_folder_lookup.get(str(context), cls._default_context_folder_template)

View File

@ -691,7 +691,7 @@ class StorageHelper(object):
except (ValueError, AttributeError, KeyError): except (ValueError, AttributeError, KeyError):
pass pass
# if driver supports download with call back, use it (it might be faster) # if driver supports download with callback, use it (it might be faster)
if hasattr(self._driver, 'download_object'): if hasattr(self._driver, 'download_object'):
# callback # callback
cb = _DownloadProgressReport(total_size_mb, verbose, cb = _DownloadProgressReport(total_size_mb, verbose,

View File

@ -1,3 +1,4 @@
import os
import shutil import shutil
import tarfile import tarfile
from random import random from random import random
@ -7,9 +8,9 @@ from zipfile import ZipFile
from pathlib2 import Path from pathlib2 import Path
from .cache import CacheManager
from .util import encode_string_to_filename from .util import encode_string_to_filename
from ..debugging.log import LoggerRoot from ..debugging.log import LoggerRoot
from .cache import CacheManager
class StorageManager(object): class StorageManager(object):
@ -42,7 +43,7 @@ class StorageManager(object):
cache_context=cache_context cache_context=cache_context
).get_local_copy(remote_url=remote_url, force_download=force_download) ).get_local_copy(remote_url=remote_url, force_download=force_download)
if extract_archive and cached_file: if extract_archive and cached_file:
return cls._extract_to_cache(cached_file, name) return cls._extract_to_cache(cached_file, name, cache_context)
return cached_file return cached_file
@ -89,11 +90,14 @@ class StorageManager(object):
).set_cache_limit(cache_file_limit) ).set_cache_limit(cache_file_limit)
@classmethod @classmethod
def _extract_to_cache(cls, cached_file, name): def _extract_to_cache(cls, cached_file, name, cache_context=None, target_folder=None):
# type: (str, str, Optional[str], Optional[str]) -> str
""" """
Extract cached file to cache folder Extract cached file to cache folder
:param str cached_file: local copy of archive file :param str cached_file: local copy of archive file
:param str name: cache context :param str name: name of the target file
:param str cache_context: cache context id
:param str target_folder: specify target path to use for archive extraction
:return: cached folder containing the extracted archive content :return: cached folder containing the extracted archive content
""" """
if not cached_file: if not cached_file:
@ -102,21 +106,24 @@ class StorageManager(object):
cached_file = Path(cached_file) cached_file = Path(cached_file)
# we support zip and tar.gz files auto-extraction # we support zip and tar.gz files auto-extraction
if ( suffix = cached_file.suffix.lower()
not cached_file.suffix == ".zip" if suffix == '.gz':
and not cached_file.suffixes[-2:] == [".tar", ".gz"] suffix = ''.join(a.lower() for a in cached_file.suffixes[-2:])
):
if suffix not in (".zip", ".tgz", ".tar.gz"):
return str(cached_file) return str(cached_file)
cached_folder = cached_file.parent cached_folder = Path(cached_file).parent
archive_suffix = cached_file.name[:-len(suffix)]
name = encode_string_to_filename(name)
target_folder = Path(
target_folder or CacheManager.get_context_folder_lookup(cache_context).format(archive_suffix, name))
name = encode_string_to_filename(name) if name else name
target_folder = Path("{0}/{1}_artifacts_archive_{2}".format(cached_folder, cached_file.stem, name))
if target_folder.exists(): if target_folder.exists():
# noinspection PyBroadException # noinspection PyBroadException
try: try:
target_folder.touch(exist_ok=True) target_folder.touch(exist_ok=True)
return target_folder return target_folder.as_posix()
except Exception: except Exception:
pass pass
@ -125,11 +132,14 @@ class StorageManager(object):
temp_target_folder = cached_folder / "{0}_{1}_{2}".format( temp_target_folder = cached_folder / "{0}_{1}_{2}".format(
target_folder.name, time() * 1000, str(random()).replace('.', '')) target_folder.name, time() * 1000, str(random()).replace('.', ''))
temp_target_folder.mkdir(parents=True, exist_ok=True) temp_target_folder.mkdir(parents=True, exist_ok=True)
if cached_file.suffix == ".zip": if suffix == ".zip":
ZipFile(cached_file).extractall(path=temp_target_folder.as_posix()) ZipFile(cached_file).extractall(path=temp_target_folder.as_posix())
elif cached_file.suffixes[-2:] == [".tar", ".gz"]: elif suffix == ".tar.gz":
with tarfile.open(cached_file) as file: with tarfile.open(cached_file) as file:
file.extractall(temp_target_folder) file.extractall(temp_target_folder.as_posix())
elif suffix == ".tgz":
with tarfile.open(cached_file, mode='r:gz') as file:
file.extractall(temp_target_folder.as_posix())
# we assume we will have such folder if we already extract the file # we assume we will have such folder if we already extract the file
# noinspection PyBroadException # noinspection PyBroadException
@ -165,7 +175,7 @@ class StorageManager(object):
except Exception: except Exception:
pass pass
return cached_file return cached_file
return target_folder return target_folder.as_posix()
@classmethod @classmethod
def get_files_server(cls): def get_files_server(cls):

View File

@ -1,7 +1,13 @@
import hashlib
import sys
from typing import Optional
from six.moves.urllib.parse import quote, urlparse, urlunparse from six.moves.urllib.parse import quote, urlparse, urlunparse
import six import six
import fnmatch import fnmatch
from ..debugging.log import LoggerRoot
def get_config_object_matcher(**patterns): def get_config_object_matcher(**patterns):
unsupported = {k: v for k, v in patterns.items() if not isinstance(v, six.string_types)} unsupported = {k: v for k, v in patterns.items() if not isinstance(v, six.string_types)}
@ -39,3 +45,35 @@ def quote_url(url):
def encode_string_to_filename(text):
    """
    Percent-encode a string so it can be used as a file name.

    :param text: string to encode
    :return: the result of ``quote(text, safe=" ")``; a space is the only
        character explicitly marked as safe, so e.g. "/" is escaped
    """
    encoded = quote(text, safe=" ")
    return encoded
def sha256sum(filename, skip_header=0, block_size=65536):
    # type: (str, int, int) -> (Optional[str], Optional[str])
    """
    Compute SHA-256 digests of a file, optionally setting aside a leading header.

    The header is excluded from the first digest so that files differing only
    in their header still produce the same content digest.

    :param filename: path of the file to hash
    :param skip_header: number of leading bytes left out of the first digest;
        0 means the first digest covers everything that is read
    :param block_size: size in bytes of the reusable read buffer
    :return: tuple of (hex digest of the data read after the header,
        hex digest of the header bytes followed by that data, or None when
        ``skip_header`` is 0). Returns (None, None) if reading raises; the
        error is logged as a warning.
    """
    content_digest = hashlib.sha256()
    full_digest = hashlib.sha256()
    # one buffer is reused for every read to avoid a fresh allocation per chunk
    view = memoryview(bytearray(block_size))
    try:
        with open(filename, 'rb', buffering=0) as stream:
            if skip_header:
                # the header only feeds the second (header + content) digest
                full_digest.update(stream.read(skip_header))
            while True:
                # noinspection PyUnresolvedReferences
                count = stream.readinto(view)
                if count == 0:
                    break
                chunk = view[:count]
                content_digest.update(chunk)
                if skip_header:
                    full_digest.update(chunk)
    except Exception as ex:
        LoggerRoot.get_base_logger().warning(str(ex))
        return None, None

    if not skip_header:
        return content_digest.hexdigest(), None
    return content_digest.hexdigest(), full_digest.hexdigest()
def is_windows():
    """
    Tell whether the interpreter reports a Windows platform.

    :return: True when ``sys.platform`` equals 'win32', False otherwise
    """
    platform_name = sys.platform
    return platform_name == 'win32'