From 0f401545b88d31f04d3a646c44260786b485939c Mon Sep 17 00:00:00 2001
From: allegroai <>
Date: Sun, 24 Jan 2021 09:22:56 +0200
Subject: [PATCH] Replace humanfriendly with utility functions

---
 .../backend_interface/metrics/interface.py |   6 +-
 clearml/binding/artifacts.py               |   7 +-
 clearml/datasets/dataset.py                |  13 +-
 clearml/storage/util.py                    | 122 ++++++++++++++++++
 clearml/utilities/config.py                |   4 +-
 5 files changed, 135 insertions(+), 17 deletions(-)

diff --git a/clearml/backend_interface/metrics/interface.py b/clearml/backend_interface/metrics/interface.py
index 6733f2f2..9cf8d1de 100644
--- a/clearml/backend_interface/metrics/interface.py
+++ b/clearml/backend_interface/metrics/interface.py
@@ -3,10 +3,8 @@ import os
 from functools import partial
 from logging import warning
 from multiprocessing.pool import ThreadPool
-from multiprocessing import Lock
 from time import time
 
-from humanfriendly import format_timespan
 from pathlib2 import Path
 
 from ...backend_api.services import events as api_events
@@ -198,8 +196,8 @@ class Metrics(InterfaceBase):
         t_f, t_u, t_ref = \
             (self._file_related_event_time, self._file_upload_time, self._file_upload_starvation_warning_sec)
         if t_f and t_u and t_ref and (t_f - t_u) > t_ref:
-            log.warning('Possible metrics file upload starvation: files were not uploaded for %s' %
-                        format_timespan(t_ref))
+            log.warning('Possible metrics file upload starvation: '
+                        'files were not uploaded for {} seconds'.format(t_ref))
 
         # send the events in a batched request
         good_events = [ev for ev in events if ev.upload_exception is None]
diff --git a/clearml/binding/artifacts.py b/clearml/binding/artifacts.py
index ffa043f5..0ea68419 100644
--- a/clearml/binding/artifacts.py
+++ b/clearml/binding/artifacts.py
@@ -12,7 +12,6 @@ from threading import Thread
 from time import time
 from zipfile import ZipFile, ZIP_DEFLATED
 
-import humanfriendly
 import six
 from PIL import Image
 from pathlib2 import Path
@@ -24,7 +23,7 @@ from ..backend_api.services import tasks
 from ..backend_interface.metrics.events import UploadEvent
 from ..debugging.log import LoggerRoot
 from ..storage.helper import remote_driver_schemes
-from ..storage.util import sha256sum
+from ..storage.util import sha256sum, format_size
 
 try:
     import pandas as pd
@@ -428,7 +427,7 @@ class Artifacts(object):
                         if filename.is_file():
                             relative_file_name = filename.relative_to(folder).as_posix()
                             archive_preview += '{} - {}\n'.format(
-                                relative_file_name, humanfriendly.format_size(filename.stat().st_size))
+                                relative_file_name, format_size(filename.stat().st_size))
                             zf.write(filename.as_posix(), arcname=relative_file_name)
         except Exception as e:
             # failed uploading folder:
@@ -449,7 +448,7 @@ class Artifacts(object):
                 override_filename_in_uri = artifact_object.parts[-1]
             artifact_type_data.preview = preview or '{} - {}\n'.format(
-                artifact_object, humanfriendly.format_size(artifact_object.stat().st_size))
+                artifact_object, format_size(artifact_object.stat().st_size))
             artifact_object = artifact_object.as_posix()
             artifact_type = 'custom'
             artifact_type_data.content_type = mimetypes.guess_type(artifact_object)[0]
diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py
index b99c5e2a..159068c5 100644
--- a/clearml/datasets/dataset.py
+++ b/clearml/datasets/dataset.py
@@ -9,7 +9,6 @@ from tempfile import mkstemp, mkdtemp
 from typing import Union, Optional, Sequence, List, Dict, Any, Mapping
 from zipfile import ZipFile, ZIP_DEFLATED
 
-import humanfriendly
 from attr import attrs, attrib
 from pathlib2 import Path
 
@@ -20,7 +19,7 @@ from ..backend_interface.util import mutually_exclusive, exact_match_regex
 from ..debugging.log import LoggerRoot
 from ..storage.helper import StorageHelper
 from ..storage.cache import CacheManager
-from ..storage.util import sha256sum, is_windows, md5text
+from ..storage.util import sha256sum, is_windows, md5text, format_size
 
 try:
     from pathlib import Path as _Path  # noqa
@@ -324,7 +323,7 @@ class Dataset(object):
                         relative_file_name = file_entry.relative_path
                         zf.write(filename.as_posix(), arcname=relative_file_name)
                         archive_preview += '{} - {}\n'.format(
-                            relative_file_name, humanfriendly.format_size(filename.stat().st_size))
+                            relative_file_name, format_size(filename.stat().st_size))
                         file_entry.local_path = None
                         count += 1
                     except Exception as e:
@@ -358,7 +357,7 @@ class Dataset(object):
         self._dataset_file_entries = {k: v for k, v in self._dataset_file_entries.items()
                                       if v.relative_path is not None}
         # start upload
-        zip_file_size = humanfriendly.format_size(Path(zip_file).stat().st_size)
+        zip_file_size = format_size(Path(zip_file).stat().st_size)
         self._task.get_logger().report_text(
             'Uploading compressed dataset changes ({} files, total {}) to {}'.format(
                 count, zip_file_size, self.get_default_storage()))
@@ -966,7 +965,7 @@ class Dataset(object):
             'Dataset state\n' \
             'Files added/modified: {0} - total size {1}\n' \
             'Current dependency graph: {2}\n'.format(
-                len(modified_files), humanfriendly.format_size(sum(modified_files)),
+                len(modified_files), format_size(sum(modified_files)),
                 json.dumps(self._dependency_graph, indent=2, sort_keys=True))
         # store as artifact of the Task.
         self._task.upload_artifact(
@@ -1230,8 +1229,8 @@ class Dataset(object):
             removed = len(self.list_removed_files(node))
             modified = len(self.list_modified_files(node))
             table_values += [[node, node_names.get(node, ''),
-                              removed, modified, count-modified, humanfriendly.format_size(size)]]
-            node_details[node] = [removed, modified, count-modified, humanfriendly.format_size(size)]
+                              removed, modified, count-modified, format_size(size)]]
+            node_details[node] = [removed, modified, count-modified, format_size(size)]
 
         # create DAG
         visited = []
diff --git a/clearml/storage/util.py b/clearml/storage/util.py
index 9875e0d2..70f25fc0 100644
--- a/clearml/storage/util.py
+++ b/clearml/storage/util.py
@@ -1,4 +1,5 @@
 import hashlib
+import re
 import sys
 from typing import Optional, Union
 
@@ -92,3 +93,124 @@ def is_windows():
     :return: True if currently running on windows OS
     """
     return sys.platform == 'win32'
+
+
+def format_size(size_in_bytes, binary=False):
+    # type: (Union[int, float], bool) -> str
+    """
+    Return the size in human readable format (string),
+    matching humanfriendly.format_size outputs
+
+    :param size_in_bytes: number of bytes
+    :param binary: If `True`, use binary multiples (1 KiB = 1024 bytes);
+        if `False` (default), use decimal multiples (1 KB = 1000 bytes)
+    :return: string representation of the number of bytes
+        (bytes, KB, MB, GB, TB, PB, or their binary counterparts)
+
+    >>> format_size(0)
+    '0 bytes'
+    >>> format_size(1)
+    '1 byte'
+    >>> format_size(5)
+    '5 bytes'
+    >>> format_size(1000)
+    '1 KB'
+    >>> format_size(1024, binary=True)
+    '1 KiB'
+    >>> format_size(1000 ** 3 * 4)
+    '4 GB'
+    """
+    size = float(size_in_bytes)
+    # single byte is the exception here
+    if size == 1:
+        return '{} byte'.format(int(size))
+    k = 1024 if binary else 1000
+    scale = ('bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB') if binary else ('bytes', 'KB', 'MB', 'GB', 'TB', 'PB')
+    for i, m in enumerate(scale):
+        if size < k**(i+1) or i == len(scale)-1:
+            return ('{:.2f}'.format(size/(k**i)).rstrip('0').rstrip('.')
+                    if i > 0 else '{}'.format(int(size))) + ' ' + m
+    # we should never get here
+    return '{} {}'.format(int(size), scale[0])
+
+
+def parse_size(size, binary=False):
+    # type: (Union[str, int, float], bool) -> int
+    """
+    Parse a human readable data size and return the number of bytes,
+    matching humanfriendly.parse_size behaviour
+
+    :param size: The human readable file size to parse (a string; a plain
+        number is also accepted and treated as a number of bytes).
+    :param binary: :data:`True` to use binary multiples of bytes (base-2) for
+                   ambiguous unit symbols and names, :data:`False` to use
+                   decimal multiples of bytes (base-10).
+    :returns: The corresponding size in bytes (an integer).
+    :raises: :exc:`ValueError` when the input can't be parsed.
+
+    This function knows how to parse sizes in bytes, kilobytes, megabytes,
+    gigabytes, terabytes and petabytes. Some examples:
+
+    >>> parse_size('42')
+    42
+    >>> parse_size('13b')
+    13
+    >>> parse_size('5 bytes')
+    5
+    >>> parse_size('1 KB')
+    1000
+    >>> parse_size('1 kilobyte')
+    1000
+    >>> parse_size('1 KiB')
+    1024
+    >>> parse_size('1 KB', binary=True)
+    1024
+    >>> parse_size('1.5 GB')
+    1500000000
+    >>> parse_size('1.5 GB', binary=True)
+    1610612736
+    """
+    def tokenize(text):
+        tokenized_input = []
+        for token in re.split(r'(\d+(?:\.\d+)?)', text):
+            token = token.strip()
+            if re.match(r'\d+\.\d+', token):
+                tokenized_input.append(float(token))
+            elif token.isdigit():
+                tokenized_input.append(int(token))
+            elif token:
+                tokenized_input.append(token)
+        return tokenized_input
+
+    tokens = tokenize(str(size))
+    if tokens and isinstance(tokens[0], (int, float)):
+        disk_size_units_b = \
+            (('B', 'bytes'), ('KiB', 'kibibyte'), ('MiB', 'mebibyte'), ('GiB', 'gibibyte'),
+             ('TiB', 'tebibyte'), ('PiB', 'pebibyte'))
+        disk_size_units_d = \
+            (('B', 'bytes'), ('KB', 'kilobyte'), ('MB', 'megabyte'), ('GB', 'gigabyte'),
+             ('TB', 'terabyte'), ('PB', 'petabyte'))
+        disk_size_units_b = [(1024 ** i, s[0], s[1]) for i, s in enumerate(disk_size_units_b)]
+        k = 1024 if binary else 1000
+        disk_size_units_d = [(k ** i, s[0], s[1]) for i, s in enumerate(disk_size_units_d)]
+        disk_size_units = (disk_size_units_b + disk_size_units_d) \
+            if binary else (disk_size_units_d + disk_size_units_b)
+
+        # Get the normalized unit (if any) from the tokenized input.
+        normalized_unit = tokens[1].lower() if len(tokens) == 2 and isinstance(tokens[1], str) else ''
+        # If the input contains only a number, it's assumed to be the number of
+        # bytes. The second token can also explicitly reference the unit bytes.
+        if len(tokens) == 1 or normalized_unit.startswith('b'):
+            return int(tokens[0])
+        # Otherwise we expect two tokens: A number and a unit.
+        if normalized_unit:
+            # Convert plural units to singular units, for details:
+            # https://github.com/xolox/python-humanfriendly/issues/26
+            normalized_unit = normalized_unit.rstrip('s')
+            # First check for an exact match against a unit symbol (KiB, KB, MiB,
+            # MB, etc) or name (kibibyte, kilobyte, etc); the ordering of the
+            # combined list resolves ambiguous units (KB, kilobyte, ...)
+            # according to the caller's preference.
+            for factor, low, high in disk_size_units:
+                if normalized_unit in (low.lower(), high.lower()):
+                    return int(tokens[0] * factor)
+            # Otherwise deal with bare/ambiguous prefixes (K, M, G, etc) by
+            # matching on the first letter of the unit symbol, the way
+            # humanfriendly does.
+            for factor, low, high in disk_size_units:
+                if normalized_unit.startswith(low.lower()[0]):
+                    return int(tokens[0] * factor)
+
+    raise ValueError("Failed to parse size! (input {} was tokenized as {})".format(size, tokens))
diff --git a/clearml/utilities/config.py b/clearml/utilities/config.py
index b2d9c42d..50b2db26 100644
--- a/clearml/utilities/config.py
+++ b/clearml/utilities/config.py
@@ -3,14 +3,14 @@ from __future__ import division
 import json
 
 import six
-import humanfriendly
 import pyparsing
 
 from .pyhocon import ConfigFactory, HOCONConverter
+from ..storage.util import parse_size
 
 
 def parse_human_size(value):
     if isinstance(value, six.string_types):
-        return humanfriendly.parse_size(value)
+        return parse_size(value)
     return value
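
A quick sanity check of the two replacement helpers: a minimal sketch, assuming the patch above is applied and the patched clearml checkout is importable; the expected values mirror the doctests in the new docstrings.

    # sanity_check.py -- exercises format_size/parse_size from the patch above
    from clearml.storage.util import format_size, parse_size

    assert format_size(0) == '0 bytes'
    assert format_size(1) == '1 byte'                   # single byte is special-cased
    assert format_size(1000) == '1 KB'                  # decimal scale by default
    assert format_size(1024, binary=True) == '1 KiB'    # binary scale on request
    assert format_size(1000 ** 3 * 4) == '4 GB'

    assert parse_size('1 KB') == 1000
    assert parse_size('1 KiB') == 1024                  # unambiguous binary symbol
    assert parse_size('1 KB', binary=True) == 1024      # ambiguous symbol, caller preference
    assert parse_size('1.5 GB') == 1500000000

    # round-trip holds for exact multiples of the scale
    assert parse_size(format_size(4 * 1000 ** 3)) == 4 * 1000 ** 3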