diff --git a/trains/storage/cache.py b/trains/storage/cache.py index 7b1043e5..3e1af1e1 100644 --- a/trains/storage/cache.py +++ b/trains/storage/cache.py @@ -17,6 +17,8 @@ class CacheManager(object): _default_context = "global" _local_to_remote_url_lookup = OrderedDict() __local_to_remote_url_lookup_max_size = 1024 + _context_to_folder_lookup = dict() + _default_context_folder_template = "{0}_artifacts_archive_{1}" class CacheContext(object): def __init__(self, cache_context, default_cache_file_limit=10): @@ -43,7 +45,7 @@ class CacheManager(object): return direct_access # check if we already have the file in our cache - cached_file, cached_size = self._get_cache_file(remote_url) + cached_file, cached_size = self.get_cache_file(remote_url) if cached_size is not None and not force_download: CacheManager._add_remote_url(remote_url, cached_file) return cached_file @@ -70,9 +72,19 @@ class CacheManager(object): filename = url.split("/")[-1] return "{}.{}".format(str_hash, quote_url(filename)) - def _get_cache_file(self, remote_url): + def get_cache_folder(self): + """ + :return: full path to the current context's cache folder + """ + folder = Path( + get_cache_dir() / CacheManager._storage_manager_folder / self._context + ) + return folder.as_posix() + + def get_cache_file(self, remote_url=None, local_filename=None): """ :param remote_url: check if we have the remote url in our cache + :param local_filename: if local_filename is given, search for that local file/directory in the cache folder instead of the hashed remote_url name :return: full path to file name, current file size or None """ def safe_time(x): @@ -101,7 +113,7 @@ class CacheManager(object): get_cache_dir() / CacheManager._storage_manager_folder / self._context ) folder.mkdir(parents=True, exist_ok=True) - local_filename = self._get_hashed_url_file(remote_url) + local_filename = local_filename or self._get_hashed_url_file(remote_url) new_file = folder / local_filename new_file_exists = new_file.exists() if new_file_exists: @@ -190,3 +202,14 @@ class 
CacheManager(object): if len(CacheManager._local_to_remote_url_lookup) > CacheManager.__local_to_remote_url_lookup_max_size: # pop the first item (FIFO) CacheManager._local_to_remote_url_lookup.popitem(last=False) + + @classmethod + def set_context_folder_lookup(cls, context, name_template): + cls._context_to_folder_lookup[str(context)] = str(name_template) + return str(name_template) + + @classmethod + def get_context_folder_lookup(cls, context): + if not context: + return cls._default_context_folder_template + return cls._context_to_folder_lookup.get(str(context), cls._default_context_folder_template) diff --git a/trains/storage/helper.py b/trains/storage/helper.py index 277dcf86..a588a981 100644 --- a/trains/storage/helper.py +++ b/trains/storage/helper.py @@ -691,7 +691,7 @@ class StorageHelper(object): except (ValueError, AttributeError, KeyError): pass - # if driver supports download with call back, use it (it might be faster) + # if driver supports download with callback, use it (it might be faster) if hasattr(self._driver, 'download_object'): # callback cb = _DownloadProgressReport(total_size_mb, verbose, diff --git a/trains/storage/manager.py b/trains/storage/manager.py index b9599063..fdf2da74 100644 --- a/trains/storage/manager.py +++ b/trains/storage/manager.py @@ -1,3 +1,4 @@ +import os import shutil import tarfile from random import random @@ -7,9 +8,9 @@ from zipfile import ZipFile from pathlib2 import Path -from .cache import CacheManager from .util import encode_string_to_filename from ..debugging.log import LoggerRoot +from .cache import CacheManager class StorageManager(object): @@ -42,7 +43,7 @@ class StorageManager(object): cache_context=cache_context ).get_local_copy(remote_url=remote_url, force_download=force_download) if extract_archive and cached_file: - return cls._extract_to_cache(cached_file, name) + return cls._extract_to_cache(cached_file, name, cache_context) return cached_file @@ -89,11 +90,14 @@ class StorageManager(object): 
).set_cache_limit(cache_file_limit) @classmethod - def _extract_to_cache(cls, cached_file, name): + def _extract_to_cache(cls, cached_file, name, cache_context=None, target_folder=None): + # type: (str, str, Optional[str], Optional[str]) -> str """ Extract cached file to cache folder :param str cached_file: local copy of archive file - :param str name: cache context + :param str name: name of the target file + :param str cache_context: cache context id + :param str target_folder: specify target path to use for archive extraction :return: cached folder containing the extracted archive content """ if not cached_file: @@ -102,21 +106,24 @@ class StorageManager(object): cached_file = Path(cached_file) # we support zip and tar.gz files auto-extraction - if ( - not cached_file.suffix == ".zip" - and not cached_file.suffixes[-2:] == [".tar", ".gz"] - ): + suffix = cached_file.suffix.lower() + if suffix == '.gz': + suffix = ''.join(a.lower() for a in cached_file.suffixes[-2:]) + + if suffix not in (".zip", ".tgz", ".tar.gz"): return str(cached_file) - cached_folder = cached_file.parent + cached_folder = Path(cached_file).parent + archive_suffix = cached_file.name[:-len(suffix)] + name = encode_string_to_filename(name) + target_folder = Path( + target_folder or CacheManager.get_context_folder_lookup(cache_context).format(archive_suffix, name)) - name = encode_string_to_filename(name) if name else name - target_folder = Path("{0}/{1}_artifacts_archive_{2}".format(cached_folder, cached_file.stem, name)) if target_folder.exists(): # noinspection PyBroadException try: target_folder.touch(exist_ok=True) - return target_folder + return target_folder.as_posix() except Exception: pass @@ -125,11 +132,14 @@ class StorageManager(object): temp_target_folder = cached_folder / "{0}_{1}_{2}".format( target_folder.name, time() * 1000, str(random()).replace('.', '')) temp_target_folder.mkdir(parents=True, exist_ok=True) - if cached_file.suffix == ".zip": + if suffix == ".zip": 
ZipFile(cached_file).extractall(path=temp_target_folder.as_posix()) - elif cached_file.suffixes[-2:] == [".tar", ".gz"]: + elif suffix == ".tar.gz": with tarfile.open(cached_file) as file: - file.extractall(temp_target_folder) + file.extractall(temp_target_folder.as_posix()) + elif suffix == ".tgz": + with tarfile.open(cached_file, mode='r:gz') as file: + file.extractall(temp_target_folder.as_posix()) # we assume we will have such folder if we already extract the file # noinspection PyBroadException @@ -165,7 +175,7 @@ class StorageManager(object): except Exception: pass return cached_file - return target_folder + return target_folder.as_posix() @classmethod def get_files_server(cls): diff --git a/trains/storage/util.py b/trains/storage/util.py index 0bde4e96..ce53889b 100644 --- a/trains/storage/util.py +++ b/trains/storage/util.py @@ -1,7 +1,13 @@ +import hashlib +import sys +from typing import Optional + from six.moves.urllib.parse import quote, urlparse, urlunparse import six import fnmatch +from ..debugging.log import LoggerRoot + def get_config_object_matcher(**patterns): unsupported = {k: v for k, v in patterns.items() if not isinstance(v, six.string_types)} @@ -39,3 +45,35 @@ def quote_url(url): def encode_string_to_filename(text): return quote(text, safe=" ") + + +def sha256sum(filename, skip_header=0, block_size=65536): + # type: (str, int, int) -> (Optional[str], Optional[str]) + # create sha256 of the file content after the first skip_header bytes (the file header, e.g. 32 bytes), + # because sometimes the header is the only change; if skip_header is set, also return the sha256 of the entire file + h = hashlib.sha256() + file_hash = hashlib.sha256() + b = bytearray(block_size) + mv = memoryview(b) + try: + with open(filename, 'rb', buffering=0) as f: + # skip header + if skip_header: + file_hash.update(f.read(skip_header)) + # noinspection PyUnresolvedReferences + for n in iter(lambda: f.readinto(mv), 0): + h.update(mv[:n]) + if skip_header: + file_hash.update(mv[:n]) + except Exception as e: + LoggerRoot.get_base_logger().warning(str(e)) + return 
None, None + + return h.hexdigest(), file_hash.hexdigest() if skip_header else None + + +def is_windows(): + """ + :return: True if currently running on Windows + """ + return sys.platform == 'win32' \ No newline at end of file