mirror of
https://github.com/clearml/clearml
synced 2025-06-23 01:55:38 +00:00
307 lines
11 KiB
Python
307 lines
11 KiB
Python
import hashlib
|
|
import json
|
|
import re
|
|
import sys
|
|
from zlib import crc32
|
|
from typing import Optional, Union, Sequence, Dict
|
|
from pathlib2 import Path
|
|
|
|
from six.moves.urllib.parse import quote, urlparse, urlunparse
|
|
import six
|
|
import fnmatch
|
|
|
|
from ..debugging.log import LoggerRoot
|
|
|
|
|
|
def get_config_object_matcher(**patterns):
    """
    Build a matcher callable for keyword/wildcard pattern pairs.

    Each keyword argument maps a field name to a string pattern in
    :mod:`fnmatch` syntax. The returned callable accepts the same keyword
    names with concrete string values and returns True if any provided
    value matches its field's pattern (falsy values are skipped).

    :param patterns: mapping of field name to wildcard pattern string
    :return: matcher callable accepting the same keyword names
    :raises ValueError: if any pattern is not a string
    """
    unsupported = {k: v for k, v in patterns.items() if not isinstance(v, six.string_types)}
    if unsupported:
        raise ValueError('Unsupported object matcher (expecting string): %s'
                         % ', '.join('%s=%s' % (k, v) for k, v in unsupported.items()))

    # optimize simple patterns: a pattern whose only wildcards are a trailing
    # run of '*' is equivalent to a plain prefix test.
    # NOTE: the original filtered with `v not in starts_with`, comparing
    # pattern *values* against the dict *keys* — it worked only by accident.
    # Partition explicitly by key instead.
    starts_with = {}
    wildcards = {}
    for key, pattern in patterns.items():
        prefix = pattern.rstrip('*')
        if prefix and '*' not in prefix and '?' not in pattern:
            starts_with[key] = prefix
        else:
            wildcards[key] = pattern

    def _matcher(**kwargs):
        for key, value in kwargs.items():
            if not value:
                continue
            prefix = starts_with.get(key)
            if prefix:
                if value.startswith(prefix):
                    return True
            else:
                pattern = wildcards.get(key)
                if pattern and fnmatch.fnmatch(value, pattern):
                    return True

    return _matcher
|
|
|
|
|
|
def quote_url(url):
    """
    Percent-encode the path component of an http/https URL.

    URLs with any other scheme (e.g. s3://, file://) are returned unchanged.

    :param url: URL string
    :return: URL with its path quoted; scheme, netloc, query and fragment
        are left as-is
    """
    parts = urlparse(url)
    if parts.scheme in ('http', 'https'):
        return urlunparse(parts._replace(path=quote(parts.path)))
    return url
|
|
|
|
|
|
def encode_string_to_filename(text):
    """
    Percent-encode a string so it is safe to use as a filename.

    Spaces are kept as-is; every other unsafe character (including '/')
    is percent-encoded.

    :param text: string to encode
    :return: encoded string
    """
    return quote(text, safe=" ")
|
|
|
|
|
|
def sha256sum(filename, skip_header=0, block_size=65536):
    # type: (str, int, int) -> (Optional[str], Optional[str])
    """
    Compute SHA256 digests of a file.

    Two digests are produced: one that skips the first ``skip_header``
    bytes (useful when only a fixed-size header differs between files)
    and one over the entire file contents.

    :param filename: path of the file to hash
    :param skip_header: number of leading bytes excluded from the first digest
    :param block_size: read chunk size in bytes
    :return: tuple of (header-skipped digest hex, full-file digest hex);
        the second element is None when skip_header is 0.
        On read failure returns (None, None) and logs a warning.
    """
    digest_no_header = hashlib.sha256()
    digest_full = hashlib.sha256()
    read_buffer = bytearray(block_size)
    view = memoryview(read_buffer)
    try:
        with open(filename, 'rb', buffering=0) as fd:
            # consume the header into the full-file digest only
            if skip_header:
                digest_full.update(fd.read(skip_header))
            # noinspection PyUnresolvedReferences
            for read_bytes in iter(lambda: fd.readinto(view), 0):
                digest_no_header.update(view[:read_bytes])
                if skip_header:
                    digest_full.update(view[:read_bytes])
    except Exception as ex:
        LoggerRoot.get_base_logger().warning(str(ex))
        return None, None

    return digest_no_header.hexdigest(), digest_full.hexdigest() if skip_header else None
|
|
|
|
|
|
def md5text(text, seed=1337):
    # type: (str, Union[int, str]) -> str
    """
    Return the md5 hex digest of a string (prefixed with ``seed``).

    Not suitable for security purposes; use something stronger like SHA2
    when security matters.

    :param text: string to hash
    :param seed: prefix seed mixed into the hash
    :return: md5 hex digest string
    """
    return hash_text(text=text, seed=seed, hash_func='md5')
|
|
|
|
|
|
def crc32text(text, seed=1337):
    # type: (str, Union[int, str]) -> str
    """
    Return the crc32 checksum of a string (prefixed with ``seed``).

    Not suitable for security purposes; use something stronger like SHA2
    when security matters.

    :param text: string to hash
    :param seed: prefix seed mixed into the checksum
    :return: crc32 as 8 hex characters (32 bits)
    """
    payload = (str(seed) + str(text)).encode('utf-8')
    return '{:08x}'.format(crc32(payload))
|
|
|
|
|
|
def hash_text(text, seed=1337, hash_func='md5'):
    # type: (str, Union[int, str], str) -> str
    """
    Return hash_func (md5/sha1/sha256/sha384/sha512) hash of a string

    :param text: string to hash
    :param seed: use prefix seed for hashing
    :param hash_func: hashing function name, one of
        md5 / sha1 / sha256 / sha384 / sha512
    :return: hashed string
    """
    # original check listed 'sha256' twice and omitted 'sha1' even though
    # the docstring advertises it
    assert hash_func in ('md5', 'sha1', 'sha256', 'sha384', 'sha512')
    h = getattr(hashlib, hash_func)()
    h.update((str(seed) + str(text)).encode('utf-8'))
    return h.hexdigest()
|
|
|
|
|
|
def hash_dict(a_dict, seed=1337, hash_func='md5'):
    # type: (Dict, Union[int, str], str) -> str
    """
    Return hash_func (crc32/md5/sha1/sha256/sha384/sha512) hash of the dict values
    (dict must be JSON serializable)

    :param a_dict: a dictionary to hash
    :param seed: use prefix seed for hashing
    :param hash_func: hashing function name, one of
        crc32 / md5 / sha1 / sha256 / sha384 / sha512
    :return: hashed string
    """
    # original check listed 'sha256' twice and omitted 'sha1' even though
    # the docstring advertises it
    assert hash_func in ('crc32', 'md5', 'sha1', 'sha256', 'sha384', 'sha512')
    # sort_keys makes the representation (and hence the hash) order-independent
    repr_string = json.dumps(a_dict, sort_keys=True)
    if hash_func == 'crc32':
        return crc32text(repr_string, seed=seed)
    else:
        return hash_text(repr_string, seed=seed, hash_func=hash_func)
|
|
|
|
|
|
def is_windows():
    """
    Check whether the current interpreter is running on Windows.

    :return: True if currently running on windows OS
    """
    return 'win32' == sys.platform
|
|
|
|
|
|
def format_size(size_in_bytes, binary=False):
    # type: (Union[int, float], bool) -> str
    """
    Render a byte count as a human readable string
    (output matches humanfriendly.format_size)

    :param size_in_bytes: number of bytes
    :param binary: if True, 1 KiB equals 1024 bytes; if False (default)
        1 KB equals 1000 bytes
    :return: human readable string (bytes, KB, MB, GB, TB, PB / binary variants)

    >>> format_size(0)
    '0 bytes'
    >>> format_size(1)
    '1 byte'
    >>> format_size(5)
    '5 bytes'
    >>> format_size(1000 ** 3 * 4)
    '4 GB'
    """
    size = float(size_in_bytes)
    # exactly one byte takes the singular form
    if size == 1:
        return '{} byte'.format(int(size))
    step = 1024 if binary else 1000
    suffixes = ('bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB') if binary \
        else ('bytes', 'KB', 'MB', 'GB', 'TB', 'PB')
    for exponent, suffix in enumerate(suffixes):
        # stop at the first unit large enough, or clamp to the largest one
        if size < step ** (exponent + 1) or exponent == len(suffixes) - 1:
            if exponent == 0:
                return '{} {}'.format(int(size), suffix)
            # two decimals, then strip trailing zeros and a dangling dot
            scaled = '{:.2f}'.format(size / (step ** exponent)).rstrip('0').rstrip('.')
            return '{} {}'.format(scaled, suffix)
    # unreachable: the loop always returns on the last suffix
    return '{} {}'.format(int(size), suffixes[0])
|
|
|
|
|
|
def parse_size(size, binary=False):
    """
    Parse a human readable data size and return the number of bytes.
    Match humanfriendly.parse_size

    :param size: The human readable file size to parse (a string).
    :param binary: :data:`True` to use binary multiples of bytes (base-2) for
                   ambiguous unit symbols and names, :data:`False` to use
                   decimal multiples of bytes (base-10).
    :returns: The corresponding size in bytes (an integer).
    :raises: :exc:`ValueError` when the input can't be parsed.

    This function knows how to parse sizes in bytes, kilobytes, megabytes,
    gigabytes, terabytes and petabytes. Some examples:
    >>> parse_size('42')
    42
    >>> parse_size('13b')
    13
    >>> parse_size('5 bytes')
    5
    >>> parse_size('1 KB')
    1000
    >>> parse_size('1 kilobyte')
    1000
    >>> parse_size('1 KiB')
    1024
    >>> parse_size('1 KB', binary=True)
    1024
    >>> parse_size('1.5 GB')
    1500000000
    >>> parse_size('1.5 GB', binary=True)
    1610612736
    """
    def tokenize(text):
        # Split on numbers (the capturing group keeps them in the result),
        # producing an alternating list of numeric and text tokens,
        # e.g. '1.5 GB' -> [1.5, 'GB']
        tokenized_input = []
        for token in re.split(r'(\d+(?:\.\d+)?)', text):
            token = token.strip()
            if re.match(r'\d+\.\d+', token):
                tokenized_input.append(float(token))
            elif token.isdigit():
                tokenized_input.append(int(token))
            elif token:
                tokenized_input.append(token)
        return tokenized_input

    tokens = tokenize(str(size))
    # a parse is only attempted when the input starts with a number
    if tokens and isinstance(tokens[0], (int, float)):
        # (symbol, name) pairs for binary (base-2) and decimal units
        disk_size_units_b = \
            (('B', 'bytes'), ('KiB', 'kibibyte'), ('MiB', 'mebibyte'), ('GiB', 'gibibyte'),
             ('TiB', 'tebibyte'), ('PiB', 'pebibyte'))
        disk_size_units_d = \
            (('B', 'bytes'), ('KB', 'kilobyte'), ('MB', 'megabyte'), ('GB', 'gigabyte'),
             ('TB', 'terabyte'), ('PB', 'petabyte'))
        # expand each entry to (multiplier, symbol, name); KiB etc. are always 1024-based
        disk_size_units_b = [(1024 ** i, s[0], s[1]) for i, s in enumerate(disk_size_units_b)]
        # ambiguous decimal symbols (KB, MB, ...) resolve per the caller's preference
        k = 1024 if binary else 1000
        disk_size_units_d = [(k ** i, s[0], s[1]) for i, s in enumerate(disk_size_units_d)]
        # list order decides which interpretation wins for ambiguous prefixes:
        # the preferred family is searched first
        disk_size_units = (disk_size_units_b + disk_size_units_d) \
            if binary else (disk_size_units_d + disk_size_units_b)

        # Get the normalized unit (if any) from the tokenized input.
        normalized_unit = tokens[1].lower() if len(tokens) == 2 and isinstance(tokens[1], str) else ''
        # If the input contains only a number, it's assumed to be the number of
        # bytes. The second token can also explicitly reference the unit bytes.
        if len(tokens) == 1 or normalized_unit.startswith('b'):
            return int(tokens[0])
        # Otherwise we expect two tokens: A number and a unit.
        if normalized_unit:
            # Convert plural units to singular units, for details:
            # https://github.com/xolox/python-humanfriendly/issues/26
            normalized_unit = normalized_unit.rstrip('s')
            # NOTE: `k` is re-bound here as the loop's multiplier, shadowing
            # the base chosen above (the expanded lists already captured it)
            for k, low, high in disk_size_units:
                # First we check for unambiguous symbols (KiB, MiB, GiB, etc)
                # and names (kibibyte, mebibyte, gibibyte, etc) because their
                # handling is always the same.
                if normalized_unit in (low.lower(), high.lower()):
                    return int(tokens[0] * k)
                # Now we will deal with ambiguous prefixes (K, M, G, etc),
                # symbols (KB, MB, GB, etc) and names (kilobyte, megabyte,
                # gigabyte, etc) according to the caller's preference.
                # NOTE(review): the membership part of this test duplicates the
                # check above and can never fire here; only the startswith()
                # clause adds anything.
                if (normalized_unit in (low.lower(), high.lower()) or
                        normalized_unit.startswith(low.lower())):
                    return int(tokens[0] * k)

    raise ValueError("Failed to parse size! (input {} was tokenized as {})".format(size, tokens))
|
|
|
|
|
|
def get_common_path(list_of_files):
    # type: (Sequence[Union[str, Path]]) -> Optional[str]
    """
    Return the common path shared by a list of files.

    :param list_of_files: list of files (str or Path objects)
    :return: Common path string (always absolute), or None if the files
        share no common path (or the list is empty)
    """
    if not list_of_files:
        return None

    # a single file: its parent folder is the common path
    if len(list_of_files) == 1:
        return Path(list_of_files[0]).absolute().parent.as_posix()

    # shrink the running common prefix against every file in turn
    common_parts = Path(list_of_files[0]).absolute().parts
    for a_file in list_of_files:
        file_parts = Path(a_file).absolute().parts
        depth = min(len(file_parts), len(common_parts))
        if file_parts[:depth] == common_parts[:depth]:
            # shorter path fully agrees: just truncate the prefix
            common_parts = common_parts[:depth]
            continue
        # first index where the two paths diverge (-1 defensively means none)
        mismatches = [idx for idx, (lhs, rhs) in enumerate(zip(common_parts[:depth], file_parts[:depth]))
                      if lhs != rhs]
        depth = min(mismatches or [-1])
        if depth < 0:
            # no common path at all
            common_parts = []
            break
        common_parts = common_parts[:depth]

    if not common_parts:
        return None

    # rebuild the path from its surviving parts
    joined = Path()
    for part in common_parts:
        joined /= part
    return joined.as_posix()
|