mirror of
https://github.com/clearml/clearml
synced 2025-04-10 07:26:03 +00:00
Support caching of extracted zip artifacts
This commit is contained in:
parent
6ff3cc0ee4
commit
0a7a32f2eb
@ -167,6 +167,10 @@ sdk {
|
|||||||
|
|
||||||
# Log all stdout & stderr
|
# Log all stdout & stderr
|
||||||
log_stdout: true
|
log_stdout: true
|
||||||
|
|
||||||
|
# compatibility feature, report memory usage for the entire machine
|
||||||
|
# default (false), report only on the running process and its sub-processes
|
||||||
|
report_global_mem_used: false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -157,22 +157,11 @@ class Artifact(object):
|
|||||||
:return: a local path to a downloaded copy of the artifact
|
:return: a local path to a downloaded copy of the artifact
|
||||||
"""
|
"""
|
||||||
from trains.storage import StorageManager
|
from trains.storage import StorageManager
|
||||||
local_path = StorageManager.get_local_copy(self.url)
|
return StorageManager.get_local_copy(
|
||||||
if local_path and extract_archive and self.type == 'archive':
|
remote_url=self.url,
|
||||||
temp_folder = None
|
extract_archive=extract_archive and self.type == 'archive',
|
||||||
try:
|
name=self.name
|
||||||
temp_folder = mkdtemp(prefix='artifact_', suffix='.archive_'+self.name)
|
)
|
||||||
ZipFile(local_path).extractall(path=temp_folder)
|
|
||||||
except Exception:
|
|
||||||
try:
|
|
||||||
if temp_folder:
|
|
||||||
Path(temp_folder).rmdir()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return local_path
|
|
||||||
return temp_folder
|
|
||||||
|
|
||||||
return local_path
|
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return str({'name': self.name, 'size': self.size, 'type': self.type, 'mode': self.mode, 'url': self.url,
|
return str({'name': self.name, 'size': self.size, 'type': self.type, 'mode': self.mode, 'url': self.url,
|
||||||
|
@ -1,17 +1,19 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
|
import shutil
|
||||||
|
|
||||||
from pathlib2 import Path
|
from pathlib2 import Path
|
||||||
|
|
||||||
from .helper import StorageHelper
|
from .helper import StorageHelper
|
||||||
from .util import quote_url
|
from .util import quote_url
|
||||||
from ..config import get_cache_dir
|
from ..config import get_cache_dir
|
||||||
|
from ..debugging.log import LoggerRoot
|
||||||
|
|
||||||
|
|
||||||
class CacheManager(object):
|
class CacheManager(object):
|
||||||
__cache_managers = {}
|
__cache_managers = {}
|
||||||
_default_cache_file_limit = 100
|
_default_cache_file_limit = 100
|
||||||
_storage_manager_folder = 'storage_manager'
|
_storage_manager_folder = "storage_manager"
|
||||||
_default_context = 'global'
|
_default_context = "global"
|
||||||
|
|
||||||
class CacheContext(object):
|
class CacheContext(object):
|
||||||
def __init__(self, cache_context, default_cache_file_limit=10):
|
def __init__(self, cache_context, default_cache_file_limit=10):
|
||||||
@ -44,20 +46,24 @@ class CacheManager(object):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def upload_file(local_file, remote_url, wait_for_upload=True):
|
def upload_file(local_file, remote_url, wait_for_upload=True):
|
||||||
helper = StorageHelper.get(remote_url)
|
helper = StorageHelper.get(remote_url)
|
||||||
return helper.upload(local_file, remote_url, async_enable=not wait_for_upload)
|
return helper.upload(
|
||||||
|
local_file, remote_url, async_enable=not wait_for_upload
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def _get_hashed_url_file(cls, url):
|
def _get_hashed_url_file(cls, url):
|
||||||
str_hash = hashlib.md5(url.encode()).hexdigest()
|
str_hash = hashlib.md5(url.encode()).hexdigest()
|
||||||
filename = url.split('/')[-1]
|
filename = url.split("/")[-1]
|
||||||
return '{}.{}'.format(str_hash, quote_url(filename))
|
return "{}.{}".format(str_hash, quote_url(filename))
|
||||||
|
|
||||||
def _get_cache_file(self, remote_url):
|
def _get_cache_file(self, remote_url):
|
||||||
"""
|
"""
|
||||||
:param remote_url: check if we have the remote url in our cache
|
:param remote_url: check if we have the remote url in our cache
|
||||||
:return: full path to file name, current file size or None
|
:return: full path to file name, current file size or None
|
||||||
"""
|
"""
|
||||||
folder = Path(get_cache_dir() / CacheManager._storage_manager_folder / self._context)
|
folder = Path(
|
||||||
|
get_cache_dir() / CacheManager._storage_manager_folder / self._context
|
||||||
|
)
|
||||||
folder.mkdir(parents=True, exist_ok=True)
|
folder.mkdir(parents=True, exist_ok=True)
|
||||||
local_filename = self._get_hashed_url_file(remote_url)
|
local_filename = self._get_hashed_url_file(remote_url)
|
||||||
new_file = folder / local_filename
|
new_file = folder / local_filename
|
||||||
@ -65,20 +71,48 @@ class CacheManager(object):
|
|||||||
new_file.touch(exist_ok=True)
|
new_file.touch(exist_ok=True)
|
||||||
|
|
||||||
# delete old files
|
# delete old files
|
||||||
files = sorted(folder.iterdir(), reverse=True, key=lambda x: x.stat().st_atime)
|
def sort_max_access_time(x):
|
||||||
|
atime = x.stat().st_atime
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
if x.is_dir():
|
||||||
|
dir_files = list(x.iterdir())
|
||||||
|
atime = (
|
||||||
|
max(atime, max(s.stat().st_atime for s in dir_files))
|
||||||
|
if dir_files
|
||||||
|
else atime
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return atime
|
||||||
|
|
||||||
|
files = sorted(folder.iterdir(), reverse=True, key=sort_max_access_time)
|
||||||
files = files[self._file_limit:]
|
files = files[self._file_limit:]
|
||||||
for f in files:
|
for f in files:
|
||||||
f.unlink()
|
if not f.is_dir():
|
||||||
|
f.unlink()
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
shutil.rmtree(f)
|
||||||
|
except Exception as e:
|
||||||
|
# failed deleting folder
|
||||||
|
LoggerRoot.get_base_logger().warning(
|
||||||
|
"Exception {}\nFailed deleting folder {}".format(e, f)
|
||||||
|
)
|
||||||
|
|
||||||
# if file doesn't exist, return file size None
|
# if file doesn't exist, return file size None
|
||||||
return new_file.as_posix(), new_file.stat().st_size if new_file.exists() else None
|
return (
|
||||||
|
new_file.as_posix(),
|
||||||
|
new_file.stat().st_size if new_file.exists() else None,
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_cache_manager(cls, cache_context=None, cache_file_limit=None):
|
def get_cache_manager(cls, cache_context=None, cache_file_limit=None):
|
||||||
cache_context = cache_context or cls._default_context
|
cache_context = cache_context or cls._default_context
|
||||||
if cache_context not in cls.__cache_managers:
|
if cache_context not in cls.__cache_managers:
|
||||||
cls.__cache_managers[cache_context] = cls.CacheContext(
|
cls.__cache_managers[cache_context] = cls.CacheContext(
|
||||||
cache_context, cache_file_limit or cls._default_cache_file_limit)
|
cache_context, cache_file_limit or cls._default_cache_file_limit
|
||||||
|
)
|
||||||
if cache_file_limit:
|
if cache_file_limit:
|
||||||
cls.__cache_managers[cache_context].set_cache_limit(cache_file_limit)
|
cls.__cache_managers[cache_context].set_cache_limit(cache_file_limit)
|
||||||
|
|
||||||
|
@ -1,5 +1,12 @@
|
|||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
from time import time
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
from zipfile import ZipFile
|
||||||
|
|
||||||
|
from pathlib2 import Path
|
||||||
|
|
||||||
|
from ..debugging.log import LoggerRoot
|
||||||
from .cache import CacheManager
|
from .cache import CacheManager
|
||||||
|
|
||||||
|
|
||||||
@ -11,7 +18,10 @@ class StorageManager(object):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_local_copy(cls, remote_url, cache_context=None): # type: (str, Optional[str]) -> str
|
def get_local_copy(
|
||||||
|
cls, remote_url, cache_context=None, extract_archive=True, name=None
|
||||||
|
):
|
||||||
|
# type: (str, Optional[str], Optional[bool], Optional[str]) -> str
|
||||||
"""
|
"""
|
||||||
Get a local copy of the remote file. If the remote URL is a direct file access,
|
Get a local copy of the remote file. If the remote URL is a direct file access,
|
||||||
the returned link is the same, otherwise a link to a local copy of the url file is returned.
|
the returned link is the same, otherwise a link to a local copy of the url file is returned.
|
||||||
@ -20,12 +30,63 @@ class StorageManager(object):
|
|||||||
|
|
||||||
:param str remote_url: remote url link (string)
|
:param str remote_url: remote url link (string)
|
||||||
:param str cache_context: Optional caching context identifier (string), default context 'global'
|
:param str cache_context: Optional caching context identifier (string), default context 'global'
|
||||||
|
:param bool extract_archive: if True returned path will be a cached folder containing the archive's content,
|
||||||
|
currently only zip files are supported.
|
||||||
|
:param name: name of artifact.
|
||||||
:return str: full path to local copy of the requested url. Return None on Error.
|
:return str: full path to local copy of the requested url. Return None on Error.
|
||||||
"""
|
"""
|
||||||
return CacheManager.get_cache_manager(cache_context=cache_context).get_local_copy(remote_url=remote_url)
|
cached_file = CacheManager.get_cache_manager(
|
||||||
|
cache_context=cache_context
|
||||||
|
).get_local_copy(remote_url=remote_url)
|
||||||
|
if not extract_archive or not cached_file:
|
||||||
|
return cached_file
|
||||||
|
archive_suffix = cached_file.rpartition(".")[0]
|
||||||
|
target_folder = Path("{0}_artifact_archive_{1}".format(archive_suffix, name))
|
||||||
|
base_logger = LoggerRoot.get_base_logger()
|
||||||
|
try:
|
||||||
|
temp_target_folder = "{0}_{1}".format(target_folder.name, time() * 1000)
|
||||||
|
os.mkdir(path=temp_target_folder)
|
||||||
|
ZipFile(cached_file).extractall(path=temp_target_folder)
|
||||||
|
# we assume we will have such folder if we already extract the zip file
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
# if rename fails, it means that someone else already manged to extract the zip, delete the current
|
||||||
|
# folder and return the already existing cached zip folder
|
||||||
|
os.rename(temp_target_folder, str(target_folder))
|
||||||
|
except Exception:
|
||||||
|
if target_folder.exists():
|
||||||
|
target_folder.touch(exist_ok=True)
|
||||||
|
else:
|
||||||
|
base_logger.warning(
|
||||||
|
"Failed renaming {0} to {1}".format(
|
||||||
|
temp_target_folder, target_folder
|
||||||
|
)
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
shutil.rmtree(temp_target_folder)
|
||||||
|
except Exception as ex:
|
||||||
|
base_logger.warning(
|
||||||
|
"Exception {}\nFailed deleting folder {}".format(
|
||||||
|
ex, temp_target_folder
|
||||||
|
)
|
||||||
|
)
|
||||||
|
except Exception as ex:
|
||||||
|
# failed extracting zip file:
|
||||||
|
base_logger.warning(
|
||||||
|
"Exception {}\nFailed extracting zip file {}".format(ex, cached_file)
|
||||||
|
)
|
||||||
|
# noinspection PyBroadException
|
||||||
|
try:
|
||||||
|
target_folder.rmdir()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return cached_file
|
||||||
|
return target_folder
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def upload_file(cls, local_file, remote_url, wait_for_upload=True): # type: (str, str, bool) -> str
|
def upload_file(
|
||||||
|
cls, local_file, remote_url, wait_for_upload=True
|
||||||
|
): # type: (str, str, bool) -> str
|
||||||
"""
|
"""
|
||||||
Upload a local file to a remote location.
|
Upload a local file to a remote location.
|
||||||
remote url is the finale destination of the uploaded file.
|
remote url is the finale destination of the uploaded file.
|
||||||
@ -40,10 +101,15 @@ class StorageManager(object):
|
|||||||
:return str: Newly uploaded remote url
|
:return str: Newly uploaded remote url
|
||||||
"""
|
"""
|
||||||
return CacheManager.get_cache_manager().upload_file(
|
return CacheManager.get_cache_manager().upload_file(
|
||||||
local_file=local_file, remote_url=remote_url, wait_for_upload=wait_for_upload)
|
local_file=local_file,
|
||||||
|
remote_url=remote_url,
|
||||||
|
wait_for_upload=wait_for_upload,
|
||||||
|
)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def set_cache_file_limit(cls, cache_file_limit, cache_context=None): # type: (int, Optional[str]) -> int
|
def set_cache_file_limit(
|
||||||
|
cls, cache_file_limit, cache_context=None
|
||||||
|
): # type: (int, Optional[str]) -> int
|
||||||
"""
|
"""
|
||||||
Set the cache context file limit. File limit is the maximum number of files the specific cache context holds.
|
Set the cache context file limit. File limit is the maximum number of files the specific cache context holds.
|
||||||
Notice, there is no limit on the size of these files, only the total number of cached files.
|
Notice, there is no limit on the size of these files, only the total number of cached files.
|
||||||
@ -53,4 +119,5 @@ class StorageManager(object):
|
|||||||
:return int: Return new cache context file limit
|
:return int: Return new cache context file limit
|
||||||
"""
|
"""
|
||||||
return CacheManager.get_cache_manager(
|
return CacheManager.get_cache_manager(
|
||||||
cache_context=cache_context, cache_file_limit=cache_file_limit).set_cache_limit(cache_file_limit)
|
cache_context=cache_context, cache_file_limit=cache_file_limit
|
||||||
|
).set_cache_limit(cache_file_limit)
|
||||||
|
Loading…
Reference in New Issue
Block a user