Support controlling the naming of the sub-folder created by StorageManager/CacheManager

This commit is contained in:
allegroai 2020-11-25 10:58:01 +02:00
parent 8f65f28d58
commit d78ee6c669
4 changed files with 91 additions and 20 deletions

View File

@ -17,6 +17,8 @@ class CacheManager(object):
_default_context = "global"
_local_to_remote_url_lookup = OrderedDict()
__local_to_remote_url_lookup_max_size = 1024
_context_to_folder_lookup = dict()
_default_context_folder_template = "{0}_artifacts_archive_{1}"
class CacheContext(object):
def __init__(self, cache_context, default_cache_file_limit=10):
@ -43,7 +45,7 @@ class CacheManager(object):
return direct_access
# check if we already have the file in our cache
cached_file, cached_size = self._get_cache_file(remote_url)
cached_file, cached_size = self.get_cache_file(remote_url)
if cached_size is not None and not force_download:
CacheManager._add_remote_url(remote_url, cached_file)
return cached_file
@ -70,9 +72,19 @@ class CacheManager(object):
filename = url.split("/")[-1]
return "{}.{}".format(str_hash, quote_url(filename))
def _get_cache_file(self, remote_url):
def get_cache_folder(self):
    """
    Resolve the cache folder used by this context.

    :return: full path (posix string) to the current context's cache folder
    """
    context_folder = get_cache_dir() / CacheManager._storage_manager_folder / self._context
    return Path(context_folder).as_posix()
def get_cache_file(self, remote_url=None, local_filename=None):
"""
:param remote_url: check if we have the remote url in our cache
:param local_filename: if local_filename is given, search for the local file/directory in the cache folder
:return: full path to file name, current file size or None
"""
def safe_time(x):
@ -101,7 +113,7 @@ class CacheManager(object):
get_cache_dir() / CacheManager._storage_manager_folder / self._context
)
folder.mkdir(parents=True, exist_ok=True)
local_filename = self._get_hashed_url_file(remote_url)
local_filename = local_filename or self._get_hashed_url_file(remote_url)
new_file = folder / local_filename
new_file_exists = new_file.exists()
if new_file_exists:
@ -190,3 +202,14 @@ class CacheManager(object):
if len(CacheManager._local_to_remote_url_lookup) > CacheManager.__local_to_remote_url_lookup_max_size:
# pop the first item (FIFO)
CacheManager._local_to_remote_url_lookup.popitem(last=False)
@classmethod
def set_context_folder_lookup(cls, context, name_template):
    """
    Register a folder-name template for the given cache context.

    :param context: cache context identifier (stored by its str() form)
    :param name_template: template string used when naming the context's sub-folder
    :return: the registered template, as a str
    """
    template = str(name_template)
    cls._context_to_folder_lookup[str(context)] = template
    return template
@classmethod
def get_context_folder_lookup(cls, context):
    """
    Look up the folder-name template registered for a cache context.

    Falls back to the default template when the context is falsy or has
    no registered template.

    :param context: cache context identifier
    :return: the folder-name template string
    """
    if context:
        return cls._context_to_folder_lookup.get(
            str(context), cls._default_context_folder_template)
    return cls._default_context_folder_template

View File

@ -691,7 +691,7 @@ class StorageHelper(object):
except (ValueError, AttributeError, KeyError):
pass
# if driver supports download with call back, use it (it might be faster)
# if driver supports download with callback, use it (it might be faster)
if hasattr(self._driver, 'download_object'):
# callback
cb = _DownloadProgressReport(total_size_mb, verbose,

View File

@ -1,3 +1,4 @@
import os
import shutil
import tarfile
from random import random
@ -7,9 +8,9 @@ from zipfile import ZipFile
from pathlib2 import Path
from .cache import CacheManager
from .util import encode_string_to_filename
from ..debugging.log import LoggerRoot
from .cache import CacheManager
class StorageManager(object):
@ -42,7 +43,7 @@ class StorageManager(object):
cache_context=cache_context
).get_local_copy(remote_url=remote_url, force_download=force_download)
if extract_archive and cached_file:
return cls._extract_to_cache(cached_file, name)
return cls._extract_to_cache(cached_file, name, cache_context)
return cached_file
@ -89,11 +90,14 @@ class StorageManager(object):
).set_cache_limit(cache_file_limit)
@classmethod
def _extract_to_cache(cls, cached_file, name):
def _extract_to_cache(cls, cached_file, name, cache_context=None, target_folder=None):
# type: (str, str, Optional[str], Optional[str]) -> str
"""
Extract cached file to cache folder
:param str cached_file: local copy of archive file
:param str name: cache context
:param str name: name of the target file
:param str cache_context: cache context id
:param str target_folder: specify target path to use for archive extraction
:return: cached folder containing the extracted archive content
"""
if not cached_file:
@ -102,21 +106,24 @@ class StorageManager(object):
cached_file = Path(cached_file)
# we support zip and tar.gz files auto-extraction
if (
not cached_file.suffix == ".zip"
and not cached_file.suffixes[-2:] == [".tar", ".gz"]
):
suffix = cached_file.suffix.lower()
if suffix == '.gz':
suffix = ''.join(a.lower() for a in cached_file.suffixes[-2:])
if suffix not in (".zip", ".tgz", ".tar.gz"):
return str(cached_file)
cached_folder = cached_file.parent
cached_folder = Path(cached_file).parent
archive_suffix = cached_file.name[:-len(suffix)]
name = encode_string_to_filename(name)
target_folder = Path(
target_folder or CacheManager.get_context_folder_lookup(cache_context).format(archive_suffix, name))
name = encode_string_to_filename(name) if name else name
target_folder = Path("{0}/{1}_artifacts_archive_{2}".format(cached_folder, cached_file.stem, name))
if target_folder.exists():
# noinspection PyBroadException
try:
target_folder.touch(exist_ok=True)
return target_folder
return target_folder.as_posix()
except Exception:
pass
@ -125,11 +132,14 @@ class StorageManager(object):
temp_target_folder = cached_folder / "{0}_{1}_{2}".format(
target_folder.name, time() * 1000, str(random()).replace('.', ''))
temp_target_folder.mkdir(parents=True, exist_ok=True)
if cached_file.suffix == ".zip":
if suffix == ".zip":
ZipFile(cached_file).extractall(path=temp_target_folder.as_posix())
elif cached_file.suffixes[-2:] == [".tar", ".gz"]:
elif suffix == ".tar.gz":
with tarfile.open(cached_file) as file:
file.extractall(temp_target_folder)
file.extractall(temp_target_folder.as_posix())
elif suffix == ".tgz":
with tarfile.open(cached_file, mode='r:gz') as file:
file.extractall(temp_target_folder.as_posix())
# we assume we will have such folder if we already extract the file
# noinspection PyBroadException
@ -165,7 +175,7 @@ class StorageManager(object):
except Exception:
pass
return cached_file
return target_folder
return target_folder.as_posix()
@classmethod
def get_files_server(cls):

View File

@ -1,7 +1,13 @@
import hashlib
import sys
from typing import Optional
from six.moves.urllib.parse import quote, urlparse, urlunparse
import six
import fnmatch
from ..debugging.log import LoggerRoot
def get_config_object_matcher(**patterns):
unsupported = {k: v for k, v in patterns.items() if not isinstance(v, six.string_types)}
@ -39,3 +45,35 @@ def quote_url(url):
def encode_string_to_filename(text):
    """
    Percent-encode *text* so it is safe to use as a filename.

    Spaces are kept literal; every other reserved character (including '/')
    is percent-encoded.

    :param str text: arbitrary string to encode
    :return: encoded string
    """
    encoded = quote(text, safe=" ")
    return encoded
def sha256sum(filename, skip_header=0, block_size=65536):
    # type: (str, int, int) -> (Optional[str], Optional[str])
    """
    Compute SHA-256 digests of a file in a single pass.

    Two digests are produced: one that excludes the first `skip_header`
    bytes (useful when only a volatile header changes between files),
    and one covering the entire file.

    :param str filename: path of the file to hash
    :param int skip_header: number of leading bytes excluded from the
        first digest (0 hashes the whole file)
    :param int block_size: read chunk size in bytes
    :return: tuple of hex digests (without-header, full-file); the second
        element is None when skip_header is 0. On any read error a warning
        is logged and (None, None) is returned.
    """
    partial_digest = hashlib.sha256()
    full_digest = hashlib.sha256()
    read_buffer = bytearray(block_size)
    view = memoryview(read_buffer)
    try:
        with open(filename, 'rb', buffering=0) as source:
            if skip_header:
                # full digest covers the header bytes; partial digest skips them
                full_digest.update(source.read(skip_header))
            # noinspection PyUnresolvedReferences
            for read_size in iter(lambda: source.readinto(view), 0):
                chunk = view[:read_size]
                partial_digest.update(chunk)
                if skip_header:
                    full_digest.update(chunk)
    except Exception as ex:
        # best-effort: report the failure but never raise to the caller
        LoggerRoot.get_base_logger().warning(str(ex))
        return None, None
    return partial_digest.hexdigest(), (full_digest.hexdigest() if skip_header else None)
def is_windows():
    """
    Check the host operating system.

    :return: True if currently running on windows OS
    """
    running_on_windows = (sys.platform == 'win32')
    return running_on_windows