From b84963fea0d964207f10ef09985161673528742c Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Tue, 28 Jun 2022 21:22:51 +0300
Subject: [PATCH] Add support for renaming very long filenames to avoid file system errors when downloading files

---
Review notes (this section is ignored by "git am"):
 - _conform_filename: a long name without an extension no longer collapses to
   an empty string (file_name[:-0] == ""); the base name is now split off
   before the extension is right-stripped; names are left untouched when the
   limit leaves no room to shorten them.
 - get_filename_max_length: handle sys.platform == "linux2" (Python 2) and
   pass a str to os.statvfs (path-like objects need Python 3.6+).
 - The patch was revised after export: the commit id and the blob ids on the
   "index" lines still refer to the original export, and the whitespace of the
   context lines was restored by hand.

 clearml/storage/cache.py   | 56 ++++++++++++++++++++++++++++++++++++++
 clearml/utilities/files.py | 30 ++++++++++++++++++++
 2 files changed, 86 insertions(+)
 create mode 100644 clearml/utilities/files.py

diff --git a/clearml/storage/cache.py b/clearml/storage/cache.py
index f10e32b3..5af8053e 100644
--- a/clearml/storage/cache.py
+++ b/clearml/storage/cache.py
@@ -15,6 +15,7 @@ from ..config import get_cache_dir, deferred_config
 from ..debugging.log import LoggerRoot
 from ..utilities.locks.utils import Lock as FileLock
 from ..utilities.locks.exceptions import LockException
+from ..utilities.files import get_filename_max_length
 
 
 class CacheManager(object):
@@ -40,6 +41,7 @@ class CacheManager(object):
             self._context = str(cache_context)
             self._file_limit = int(default_cache_file_limit)
             self._rlock = RLock()
+            self._max_file_name_length = None
 
         def set_cache_limit(self, cache_file_limit):
             # type: (int) -> int
@@ -108,6 +110,59 @@ class CacheManager(object):
             filename = url.split("/")[-1]
             return "{}.{}".format(str_hash, quote_url(filename))
 
+        def _conform_filename(self, file_name):
+            # type: (str) -> str
+            """
+            Shorten a file name that exceeds the file system name length limit.
+
+            Characters are removed from the end of the base name; the extension
+            (at most the last two suffixes, e.g. ".tar.gz") is kept when possible.
+
+            :param file_name: base file name
+            :return: new file name (if the original is too long) or the original
+            """
+            if self._max_file_name_length is None:
+                self._max_file_name_length = get_filename_max_length(self.get_cache_folder())
+
+            # Maximum character supported for filename
+            # (FS limit) - (32 for temporary file name addition)
+            allowed_length = self._max_file_name_length - 32
+
+            if len(file_name) <= allowed_length:
+                return file_name  # File name size is in limit
+
+            if allowed_length < 2:
+                # Not enough room for a "." plus one character; leave the name as is
+                return file_name
+
+            # Keep at most the last two suffixes (e.g. ".tar.gz") as the extension
+            file_ext = "".join(Path(file_name).suffixes[-2:])
+
+            # Slice by explicit length: file_name[:-len(file_ext)] would return an
+            # empty base name when there is no extension (since [:-0] == [:0])
+            file_basename = file_name[:len(file_name) - len(file_ext)].strip()
+            file_ext = file_ext.rstrip(" ")
+
+            # Omit characters from the extension
+            if len(file_ext) > allowed_length:
+                file_ext = file_ext[-(allowed_length - 1):]
+                file_ext = "." + file_ext.lstrip(".")
+
+            # Updating maximum character length
+            allowed_length -= len(file_ext)
+
+            # Omit characters from filename (without extension)
+            if len(file_basename) > allowed_length:
+                file_basename = file_basename[:allowed_length].strip()
+
+            new_file_name = file_basename + file_ext
+
+            LoggerRoot.get_base_logger().warning(
+                'Renaming file to "{}" due to filename length limit'.format(new_file_name)
+            )
+
+            return new_file_name
+
         def get_cache_folder(self):
             # type: () -> str
             """
@@ -153,6 +208,7 @@ class CacheManager(object):
             )
             folder.mkdir(parents=True, exist_ok=True)
             local_filename = local_filename or self.get_hashed_url_file(remote_url)
+            local_filename = self._conform_filename(local_filename)
             new_file = folder / local_filename
             new_file_exists = new_file.exists()
             if new_file_exists:
diff --git a/clearml/utilities/files.py b/clearml/utilities/files.py
new file mode 100644
index 00000000..0b02c0e3
--- /dev/null
+++ b/clearml/utilities/files.py
@@ -0,0 +1,30 @@
+import os
+from sys import platform
+
+import pathlib2
+import psutil
+
+
+def get_filename_max_length(dir_path):
+    # type: (str) -> int
+    """
+    Get the maximum file name length of the file system holding a folder.
+
+    :param dir_path: folder located on the file system to query
+    :return: maximum file name length, 255 if it could not be detected
+    """
+    try:
+        dir_path = pathlib2.Path(os.path.abspath(dir_path))
+        if platform == "win32":
+            dir_drive = dir_path.drive
+            for drv in psutil.disk_partitions():
+                if drv.device.startswith(dir_drive):
+                    return drv.maxfile
+        elif platform.startswith("linux") or platform == "darwin":
+            # sys.platform is "linux2" on Python 2, hence startswith()
+            # os.statvfs accepts path-like objects only on Python 3.6+, hence str()
+            return os.statvfs(str(dir_path)).f_namemax
+    except Exception as err:
+        print(err)
+
+    return 255  # Common filesystems like NTFS, EXT4 and HFS+ limited with 255