From 0f401545b88d31f04d3a646c44260786b485939c Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Sun, 24 Jan 2021 09:22:56 +0200
Subject: [PATCH] Replace humanfriendly with utility functions

---
 .../backend_interface/metrics/interface.py |   6 +-
 clearml/binding/artifacts.py               |   7 +-
 clearml/datasets/dataset.py                |  13 +-
 clearml/storage/util.py                    | 122 ++++++++++++++++++
 clearml/utilities/config.py                |   4 +-
 5 files changed, 135 insertions(+), 17 deletions(-)

diff --git a/clearml/backend_interface/metrics/interface.py b/clearml/backend_interface/metrics/interface.py
index 6733f2f2..9cf8d1de 100644
--- a/clearml/backend_interface/metrics/interface.py
+++ b/clearml/backend_interface/metrics/interface.py
@@ -3,10 +3,8 @@ import os
 from functools import partial
 from logging import warning
 from multiprocessing.pool import ThreadPool
-from multiprocessing import Lock
 from time import time
 
-from humanfriendly import format_timespan
 from pathlib2 import Path
 
 from ...backend_api.services import events as api_events
@@ -198,8 +196,8 @@ class Metrics(InterfaceBase):
         t_f, t_u, t_ref = \
             (self._file_related_event_time, self._file_upload_time, self._file_upload_starvation_warning_sec)
         if t_f and t_u and t_ref and (t_f - t_u) > t_ref:
-            log.warning('Possible metrics file upload starvation: files were not uploaded for %s' %
-                        format_timespan(t_ref))
+            log.warning('Possible metrics file upload starvation: '
+                        'files were not uploaded for {} seconds'.format(t_ref))
 
         # send the events in a batched request
         good_events = [ev for ev in events if ev.upload_exception is None]
diff --git a/clearml/binding/artifacts.py b/clearml/binding/artifacts.py
index ffa043f5..0ea68419 100644
--- a/clearml/binding/artifacts.py
+++ b/clearml/binding/artifacts.py
@@ -12,7 +12,6 @@ from threading import Thread
 from time import time
 from zipfile import ZipFile, ZIP_DEFLATED
 
-import humanfriendly
 import six
 from PIL import Image
 from pathlib2 import Path
@@ -24,7 +23,7 @@ from ..backend_api.services import tasks
 from ..backend_interface.metrics.events import UploadEvent
 from ..debugging.log import LoggerRoot
 from ..storage.helper import remote_driver_schemes
-from ..storage.util import sha256sum
+from ..storage.util import sha256sum, format_size
 
 try:
     import pandas as pd
@@ -428,7 +427,7 @@ class Artifacts(object):
                         if filename.is_file():
                             relative_file_name = filename.relative_to(folder).as_posix()
                             archive_preview += '{} - {}\n'.format(
-                                relative_file_name, humanfriendly.format_size(filename.stat().st_size))
+                                relative_file_name, format_size(filename.stat().st_size))
                             zf.write(filename.as_posix(), arcname=relative_file_name)
         except Exception as e:
             # failed uploading folder:
@@ -449,7 +448,7 @@ class Artifacts(object):
                 override_filename_in_uri = artifact_object.parts[-1]
             artifact_type_data.preview = preview or '{} - {}\n'.format(
-                artifact_object, humanfriendly.format_size(artifact_object.stat().st_size))
+                artifact_object, format_size(artifact_object.stat().st_size))
             artifact_object = artifact_object.as_posix()
             artifact_type = 'custom'
             artifact_type_data.content_type = mimetypes.guess_type(artifact_object)[0]
diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py
index b99c5e2a..159068c5 100644
--- a/clearml/datasets/dataset.py
+++ b/clearml/datasets/dataset.py
@@ -9,7 +9,6 @@ from tempfile import mkstemp, mkdtemp
 from typing import Union, Optional, Sequence, List, Dict, Any, Mapping
 from zipfile import ZipFile, ZIP_DEFLATED
 
-import humanfriendly
 from attr import attrs, attrib
 from pathlib2 import Path
 
@@ -20,7 +19,7 @@ from ..backend_interface.util import mutually_exclusive, exact_match_regex
 from ..debugging.log import LoggerRoot
 from ..storage.helper import StorageHelper
 from ..storage.cache import CacheManager
-from ..storage.util import sha256sum, is_windows, md5text
+from ..storage.util import sha256sum, is_windows, md5text, format_size
 
 try:
     from pathlib import Path as _Path  # noqa
@@ -324,7 +323,7 @@ class Dataset(object):
                         relative_file_name = file_entry.relative_path
                         zf.write(filename.as_posix(), arcname=relative_file_name)
                         archive_preview += '{} - {}\n'.format(
-                            relative_file_name, humanfriendly.format_size(filename.stat().st_size))
+                            relative_file_name, format_size(filename.stat().st_size))
                         file_entry.local_path = None
                         count += 1
                     except Exception as e:
@@ -358,7 +357,7 @@ class Dataset(object):
         self._dataset_file_entries = {k: v for k, v in self._dataset_file_entries.items()
                                       if v.relative_path is not None}
         # start upload
-        zip_file_size = humanfriendly.format_size(Path(zip_file).stat().st_size)
+        zip_file_size = format_size(Path(zip_file).stat().st_size)
         self._task.get_logger().report_text(
             'Uploading compressed dataset changes ({} files, total {}) to {}'.format(
                 count, zip_file_size, self.get_default_storage()))
@@ -966,7 +965,7 @@ class Dataset(object):
             'Dataset state\n' \
             'Files added/modified: {0} - total size {1}\n' \
             'Current dependency graph: {2}\n'.format(
-                len(modified_files), humanfriendly.format_size(sum(modified_files)),
+                len(modified_files), format_size(sum(modified_files)),
                 json.dumps(self._dependency_graph, indent=2, sort_keys=True))
         # store as artifact of the Task.
         self._task.upload_artifact(
@@ -1230,8 +1229,8 @@ class Dataset(object):
             removed = len(self.list_removed_files(node))
             modified = len(self.list_modified_files(node))
             table_values += [[node, node_names.get(node, ''),
-                              removed, modified, count-modified, humanfriendly.format_size(size)]]
-            node_details[node] = [removed, modified, count-modified, humanfriendly.format_size(size)]
+                              removed, modified, count-modified, format_size(size)]]
+            node_details[node] = [removed, modified, count-modified, format_size(size)]
 
         # create DAG
         visited = []
diff --git a/clearml/storage/util.py b/clearml/storage/util.py
index 9875e0d2..70f25fc0 100644
--- a/clearml/storage/util.py
+++ b/clearml/storage/util.py
@@ -1,4 +1,5 @@
 import hashlib
+import re
 import sys
 from typing import Optional, Union
 
@@ -92,3 +93,124 @@ def is_windows():
     :return: True if currently running on windows OS
     """
     return sys.platform == 'win32'
+
+
+def format_size(size_in_bytes, binary=False):
+    # type: (Union[int, float], bool) -> str
+    """
+    Return the size in human readable format (string),
+    matching humanfriendly.format_size outputs
+
+    :param size_in_bytes: number of bytes
+    :param binary: If `True`, use binary multiples (1 KiB = 1024 bytes);
+        if `False` (default), use decimal multiples (1 KB = 1000 bytes)
+    :return: string representation of the number of bytes
+        (bytes, KB, MB, GB, TB, PB, or their binary counterparts)
+
+    >>> format_size(0)
+    '0 bytes'
+    >>> format_size(1)
+    '1 byte'
+    >>> format_size(5)
+    '5 bytes'
+    >>> format_size(1000)
+    '1 KB'
+    >>> format_size(1024, binary=True)
+    '1 KiB'
+    >>> format_size(1000 ** 3 * 4)
+    '4 GB'
+    """
+    size = float(size_in_bytes)
+    # single byte is the exception here
+    if size == 1:
+        return '{} byte'.format(int(size))
+    k = 1024 if binary else 1000
+    scale = ('bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB') if binary else ('bytes', 'KB', 'MB', 'GB', 'TB', 'PB')
+    for i, m in enumerate(scale):
+        if size < k**(i+1) or i == len(scale)-1:
+            return ('{:.2f}'.format(size/(k**i)).rstrip('0').rstrip('.')
+                    if i > 0 else '{}'.format(int(size))) + ' ' + m
+    # we should never get here
+    return '{} {}'.format(int(size), scale[0])
+
+
+def parse_size(size, binary=False):
+    # type: (Union[str, int, float], bool) -> int
+    """
+    Parse a human readable data size and return the number of bytes,
+    matching humanfriendly.parse_size behaviour
+
+    :param size: The human readable file size to parse (a string; a plain
+        number is also accepted and treated as a number of bytes).
+    :param binary: :data:`True` to use binary multiples of bytes (base-2) for
+                   ambiguous unit symbols and names, :data:`False` to use
+                   decimal multiples of bytes (base-10).
+    :returns: The corresponding size in bytes (an integer).
+    :raises: :exc:`ValueError` when the input can't be parsed.
+
+    This function knows how to parse sizes in bytes, kilobytes, megabytes,
+    gigabytes, terabytes and petabytes. Some examples:
+
+    >>> parse_size('42')
+    42
+    >>> parse_size('13b')
+    13
+    >>> parse_size('5 bytes')
+    5
+    >>> parse_size('1 KB')
+    1000
+    >>> parse_size('1 kilobyte')
+    1000
+    >>> parse_size('1 KiB')
+    1024
+    >>> parse_size('1 KB', binary=True)
+    1024
+    >>> parse_size('1.5 GB')
+    1500000000
+    >>> parse_size('1.5 GB', binary=True)
+    1610612736
+    """
+    def tokenize(text):
+        tokenized_input = []
+        for token in re.split(r'(\d+(?:\.\d+)?)', text):
+            token = token.strip()
+            if re.match(r'\d+\.\d+', token):
+                tokenized_input.append(float(token))
+            elif token.isdigit():
+                tokenized_input.append(int(token))
+            elif token:
+                tokenized_input.append(token)
+        return tokenized_input
+
+    tokens = tokenize(str(size))
+    if tokens and isinstance(tokens[0], (int, float)):
+        disk_size_units_b = \
+            (('B', 'bytes'), ('KiB', 'kibibyte'), ('MiB', 'mebibyte'), ('GiB', 'gibibyte'),
+             ('TiB', 'tebibyte'), ('PiB', 'pebibyte'))
+        disk_size_units_d = \
+            (('B', 'bytes'), ('KB', 'kilobyte'), ('MB', 'megabyte'), ('GB', 'gigabyte'),
+             ('TB', 'terabyte'), ('PB', 'petabyte'))
+        disk_size_units_b = [(1024 ** i, s[0], s[1]) for i, s in enumerate(disk_size_units_b)]
+        k = 1024 if binary else 1000
+        disk_size_units_d = [(k ** i, s[0], s[1]) for i, s in enumerate(disk_size_units_d)]
+        disk_size_units = (disk_size_units_b + disk_size_units_d) \
+            if binary else (disk_size_units_d + disk_size_units_b)
+
+        # Get the normalized unit (if any) from the tokenized input.
+        normalized_unit = tokens[1].lower() if len(tokens) == 2 and isinstance(tokens[1], str) else ''
+        # If the input contains only a number, it's assumed to be the number of
+        # bytes. The second token can also explicitly reference the unit bytes.
+        if len(tokens) == 1 or normalized_unit.startswith('b'):
+            return int(tokens[0])
+        # Otherwise we expect two tokens: A number and a unit.
+        if normalized_unit:
+            # Convert plural units to singular units, for details:
+            # https://github.com/xolox/python-humanfriendly/issues/26
+            normalized_unit = normalized_unit.rstrip('s')
+            # First check for an exact match against a unit symbol (KiB, KB, MiB,
+            # MB, etc) or name (kibibyte, kilobyte, etc); the ordering of the
+            # combined list resolves ambiguous units (KB, kilobyte, ...)
+            # according to the caller's preference.
+            for factor, low, high in disk_size_units:
+                if normalized_unit in (low.lower(), high.lower()):
+                    return int(tokens[0] * factor)
+            # Otherwise deal with bare/ambiguous prefixes (K, M, G, etc) by
+            # matching on the first letter of the unit symbol, the way
+            # humanfriendly does.
+            for factor, low, high in disk_size_units:
+                if normalized_unit.startswith(low.lower()[0]):
+                    return int(tokens[0] * factor)
+
+    raise ValueError("Failed to parse size! (input {} was tokenized as {})".format(size, tokens))
diff --git a/clearml/utilities/config.py b/clearml/utilities/config.py
index b2d9c42d..50b2db26 100644
--- a/clearml/utilities/config.py
+++ b/clearml/utilities/config.py
@@ -3,14 +3,14 @@ from __future__ import division
 import json
 
 import six
-import humanfriendly
 import pyparsing
 
 from .pyhocon import ConfigFactory, HOCONConverter
+from ..storage.util import parse_size
 
 
 def parse_human_size(value):
     if isinstance(value, six.string_types):
-        return humanfriendly.parse_size(value)
+        return parse_size(value)
     return value
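
A quick sanity check of the two replacement helpers: a minimal sketch, assuming the patch above is applied and the patched clearml checkout is importable; the expected values mirror the doctests in the new docstrings.

    # sanity_check.py -- exercises format_size/parse_size from the patch above
    from clearml.storage.util import format_size, parse_size

    assert format_size(0) == '0 bytes'
    assert format_size(1) == '1 byte'                   # single byte is special-cased
    assert format_size(1000) == '1 KB'                  # decimal scale by default
    assert format_size(1024, binary=True) == '1 KiB'    # binary scale on request
    assert format_size(1000 ** 3 * 4) == '4 GB'

    assert parse_size('1 KB') == 1000
    assert parse_size('1 KiB') == 1024                  # unambiguous binary symbol
    assert parse_size('1 KB', binary=True) == 1024      # ambiguous symbol, caller preference
    assert parse_size('1.5 GB') == 1500000000

    # round-trip holds for exact multiples of the scale
    assert parse_size(format_size(4 * 1000 ** 3)) == 4 * 1000 ** 3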