Replace humanfriendly with utility functions

allegroai 2021-01-24 09:22:56 +02:00
parent 1c84b31056
commit 0f401545b8
5 changed files with 135 additions and 17 deletions
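This commit swaps the third-party humanfriendly dependency for two small in-tree helpers, format_size() and parse_size(), written to reproduce humanfriendly's outputs for the call sites below. A minimal equivalence sketch (the clearml.storage.util import path is an assumption based on the relative imports in the diff; humanfriendly is installed here only for the comparison):

    import humanfriendly
    from clearml.storage.util import format_size, parse_size  # path assumed, not shown in the diff

    # the reimplementations are meant to agree with humanfriendly on these inputs
    for n in (0, 1, 5, 1000, 1024, 4 * 1000 ** 3):
        assert format_size(n) == humanfriendly.format_size(n)
    for s in ('42', '5 bytes', '1 KB', '1 KiB', '1.5 GB'):
        assert parse_size(s) == humanfriendly.parse_size(s)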

View File

@@ -3,10 +3,8 @@ import os
 from functools import partial
 from logging import warning
 from multiprocessing.pool import ThreadPool
-from multiprocessing import Lock
 from time import time
-from humanfriendly import format_timespan
 from pathlib2 import Path
 from ...backend_api.services import events as api_events
@@ -198,8 +196,8 @@ class Metrics(InterfaceBase):
         t_f, t_u, t_ref = \
             (self._file_related_event_time, self._file_upload_time, self._file_upload_starvation_warning_sec)
         if t_f and t_u and t_ref and (t_f - t_u) > t_ref:
-            log.warning('Possible metrics file upload starvation: files were not uploaded for %s' %
-                        format_timespan(t_ref))
+            log.warning('Possible metrics file upload starvation: '
+                        'files were not uploaded for {} seconds'.format(t_ref))
         # send the events in a batched request
         good_events = [ev for ev in events if ev.upload_exception is None]
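Note the behavioral change in the warning: humanfriendly.format_timespan rendered the threshold as a friendly span, while the new message logs the raw value, which is already held in seconds. A hypothetical threshold of 1800 illustrates the difference:

    # before: 'Possible metrics file upload starvation: files were not uploaded for 30 minutes'
    # after:  'Possible metrics file upload starvation: files were not uploaded for 1800 seconds'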

View File

@@ -12,7 +12,6 @@ from threading import Thread
 from time import time
 from zipfile import ZipFile, ZIP_DEFLATED

-import humanfriendly
 import six
 from PIL import Image
 from pathlib2 import Path
@@ -24,7 +23,7 @@ from ..backend_api.services import tasks
 from ..backend_interface.metrics.events import UploadEvent
 from ..debugging.log import LoggerRoot
 from ..storage.helper import remote_driver_schemes
-from ..storage.util import sha256sum
+from ..storage.util import sha256sum, format_size

 try:
     import pandas as pd
@@ -428,7 +427,7 @@ class Artifacts(object):
                     if filename.is_file():
                         relative_file_name = filename.relative_to(folder).as_posix()
                         archive_preview += '{} - {}\n'.format(
-                            relative_file_name, humanfriendly.format_size(filename.stat().st_size))
+                            relative_file_name, format_size(filename.stat().st_size))
                         zf.write(filename.as_posix(), arcname=relative_file_name)
         except Exception as e:
             # failed uploading folder:
@@ -449,7 +448,7 @@ class Artifacts(object):
             override_filename_in_uri = artifact_object.parts[-1]
             artifact_type_data.preview = preview or '{} - {}\n'.format(
-                artifact_object, humanfriendly.format_size(artifact_object.stat().st_size))
+                artifact_object, format_size(artifact_object.stat().st_size))
             artifact_object = artifact_object.as_posix()
             artifact_type = 'custom'
             artifact_type_data.content_type = mimetypes.guess_type(artifact_object)[0]
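For a concrete picture of the previews built above, the new helper yields strings like these (file names and sizes here are illustrative only):

    format_size(12400)    # -> '12.4 KB'
    format_size(1536000)  # -> '1.54 MB'
    # archive_preview therefore accumulates lines such as:
    #   data/train.csv - 1.54 MB
    #   data/labels.json - 12.4 KB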

View File

@@ -9,7 +9,6 @@ from tempfile import mkstemp, mkdtemp
 from typing import Union, Optional, Sequence, List, Dict, Any, Mapping
 from zipfile import ZipFile, ZIP_DEFLATED

-import humanfriendly
 from attr import attrs, attrib
 from pathlib2 import Path
@@ -20,7 +19,7 @@ from ..backend_interface.util import mutually_exclusive, exact_match_regex
 from ..debugging.log import LoggerRoot
 from ..storage.helper import StorageHelper
 from ..storage.cache import CacheManager
-from ..storage.util import sha256sum, is_windows, md5text
+from ..storage.util import sha256sum, is_windows, md5text, format_size

 try:
     from pathlib import Path as _Path  # noqa
@@ -324,7 +323,7 @@ class Dataset(object):
                     relative_file_name = file_entry.relative_path
                     zf.write(filename.as_posix(), arcname=relative_file_name)
                     archive_preview += '{} - {}\n'.format(
-                        relative_file_name, humanfriendly.format_size(filename.stat().st_size))
+                        relative_file_name, format_size(filename.stat().st_size))
                     file_entry.local_path = None
                     count += 1
         except Exception as e:
@@ -358,7 +357,7 @@ class Dataset(object):
         self._dataset_file_entries = {k: v for k, v in self._dataset_file_entries.items()
                                       if v.relative_path is not None}
         # start upload
-        zip_file_size = humanfriendly.format_size(Path(zip_file).stat().st_size)
+        zip_file_size = format_size(Path(zip_file).stat().st_size)
         self._task.get_logger().report_text(
             'Uploading compressed dataset changes ({} files, total {}) to {}'.format(
                 count, zip_file_size, self.get_default_storage()))
@@ -966,7 +965,7 @@ class Dataset(object):
             'Dataset state\n' \
             'Files added/modified: {0} - total size {1}\n' \
             'Current dependency graph: {2}\n'.format(
-                len(modified_files), humanfriendly.format_size(sum(modified_files)),
+                len(modified_files), format_size(sum(modified_files)),
                 json.dumps(self._dependency_graph, indent=2, sort_keys=True))
         # store as artifact of the Task.
         self._task.upload_artifact(
@@ -1230,8 +1229,8 @@ class Dataset(object):
             removed = len(self.list_removed_files(node))
             modified = len(self.list_modified_files(node))
             table_values += [[node, node_names.get(node, ''),
-                              removed, modified, count-modified, humanfriendly.format_size(size)]]
-            node_details[node] = [removed, modified, count-modified, humanfriendly.format_size(size)]
+                              removed, modified, count-modified, format_size(size)]]
+            node_details[node] = [removed, modified, count-modified, format_size(size)]
         # create DAG
         visited = []

View File

@@ -1,4 +1,5 @@
 import hashlib
+import re
 import sys

 from typing import Optional, Union
@@ -92,3 +93,124 @@ def is_windows():
     :return: True if currently running on windows OS
     """
     return sys.platform == 'win32'
+
+
+def format_size(size_in_bytes, binary=False):
+    # type: (Union[int, float], bool) -> str
+    """
+    Return the size in human-readable format (string), matching
+    humanfriendly.format_size outputs.
+
+    :param size_in_bytes: number of bytes
+    :param binary: If True, 1 KiB = 1024 bytes; if False (default), 1 KB = 1000 bytes
+    :return: string representation of the number of bytes (bytes, KB, MB, GB, TB, PB)
+
+    >>> format_size(0)
+    '0 bytes'
+    >>> format_size(1)
+    '1 byte'
+    >>> format_size(5)
+    '5 bytes'
+    >>> format_size(1000)
+    '1 KB'
+    >>> format_size(1024, binary=True)
+    '1 KiB'
+    >>> format_size(1000 ** 3 * 4)
+    '4 GB'
+    """
+    size = float(size_in_bytes)
+    # a single byte is the exception here
+    if size == 1:
+        return '{} byte'.format(int(size))
+    k = 1024 if binary else 1000
+    scale = ('bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB') if binary else ('bytes', 'KB', 'MB', 'GB', 'TB', 'PB')
+    for i, m in enumerate(scale):
+        # use this unit if the size fits under the next one, or it is the largest unit we have
+        if size < k ** (i + 1) or i == len(scale) - 1:
+            # bytes are printed as integers, larger units with up to two decimals (trailing zeros trimmed)
+            return ('{:.2f}'.format(size / (k ** i)).rstrip('0').rstrip('.')
+                    if i > 0 else '{}'.format(int(size))) + ' ' + m
+    # we should never get here
+    return '{} {}'.format(int(size), scale[0])
+
+
+def parse_size(size, binary=False):
+    # type: (Union[str, int, float], bool) -> int
+    """
+    Parse a human-readable data size and return the number of bytes,
+    matching humanfriendly.parse_size.
+
+    :param size: The human-readable file size to parse (a string).
+    :param binary: True to use binary multiples of bytes (base-2) for
+        ambiguous unit symbols and names, False to use
+        decimal multiples of bytes (base-10).
+    :returns: The corresponding size in bytes (an integer).
+    :raises: ValueError when the input can't be parsed.
+
+    This function knows how to parse sizes in bytes, kilobytes, megabytes,
+    gigabytes, terabytes and petabytes. Some examples:
+
+    >>> parse_size('42')
+    42
+    >>> parse_size('13b')
+    13
+    >>> parse_size('5 bytes')
+    5
+    >>> parse_size('1 KB')
+    1000
+    >>> parse_size('1 kilobyte')
+    1000
+    >>> parse_size('1 KiB')
+    1024
+    >>> parse_size('1 KB', binary=True)
+    1024
+    >>> parse_size('1.5 GB')
+    1500000000
+    >>> parse_size('1.5 GB', binary=True)
+    1610612736
+    """
+    def tokenize(text):
+        # split into alternating number / unit tokens, e.g. '1.5 GB' -> [1.5, 'GB']
+        tokenized_input = []
+        for token in re.split(r'(\d+(?:\.\d+)?)', text):
+            token = token.strip()
+            if re.match(r'\d+\.\d+', token):
+                tokenized_input.append(float(token))
+            elif token.isdigit():
+                tokenized_input.append(int(token))
+            elif token:
+                tokenized_input.append(token)
+        return tokenized_input
+
+    tokens = tokenize(str(size))
+    if tokens and isinstance(tokens[0], (int, float)):
+        disk_size_units_b = \
+            (('B', 'bytes'), ('KiB', 'kibibyte'), ('MiB', 'mebibyte'), ('GiB', 'gibibyte'),
+             ('TiB', 'tebibyte'), ('PiB', 'pebibyte'))
+        disk_size_units_d = \
+            (('B', 'bytes'), ('KB', 'kilobyte'), ('MB', 'megabyte'), ('GB', 'gigabyte'),
+             ('TB', 'terabyte'), ('PB', 'petabyte'))
+        disk_size_units_b = [(1024 ** i, s[0], s[1]) for i, s in enumerate(disk_size_units_b)]
+        k = 1024 if binary else 1000
+        disk_size_units_d = [(k ** i, s[0], s[1]) for i, s in enumerate(disk_size_units_d)]
+        disk_size_units = (disk_size_units_b + disk_size_units_d) \
+            if binary else (disk_size_units_d + disk_size_units_b)
+        # Get the normalized unit (if any) from the tokenized input.
+        normalized_unit = tokens[1].lower() if len(tokens) == 2 and isinstance(tokens[1], str) else ''
+        # If the input contains only a number, it's assumed to be the number of
+        # bytes. The second token can also explicitly reference the unit bytes.
+        if len(tokens) == 1 or normalized_unit.startswith('b'):
+            return int(tokens[0])
+        # Otherwise we expect two tokens: a number and a unit.
+        if normalized_unit:
+            # Convert plural units to singular units, for details:
+            # https://github.com/xolox/python-humanfriendly/issues/26
+            normalized_unit = normalized_unit.rstrip('s')
+        for k, low, high in disk_size_units:
+            # First we check for unambiguous symbols (KiB, MiB, GiB, etc)
+            # and names (kibibyte, mebibyte, gibibyte, etc) because their
+            # handling is always the same.
+            if normalized_unit in (low.lower(), high.lower()):
+                return int(tokens[0] * k)
+            # Now we will deal with ambiguous prefixes (K, M, G, etc),
+            # symbols (KB, MB, GB, etc) and names (kilobyte, megabyte,
+            # gigabyte, etc) according to the caller's preference.
+            if (normalized_unit in (low.lower(), high.lower()) or
+                    normalized_unit.startswith(low.lower())):
+                return int(tokens[0] * k)
+    raise ValueError("Failed to parse size! (input {} was tokenized as {})".format(size, tokens))
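A few behaviors of the reimplementations are worth calling out; the expected values below follow directly from the code above:

    format_size(1536)                # -> '1.54 KB' (two decimals, trailing zeros trimmed)
    format_size(1024)                # -> '1.02 KB' (decimal scale unless binary=True)
    parse_size('1 KB')               # -> 1000 (ambiguous unit, decimal by default)
    parse_size('1 KB', binary=True)  # -> 1024 (ambiguous unit resolved as binary)
    parse_size('1 KiB')              # -> 1024 (unambiguous, regardless of the flag)
    parse_size('2 kilobytes')        # -> 2000 (plural unit names accepted)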

View File

@@ -3,14 +3,14 @@ from __future__ import division
 import json

 import six
-import humanfriendly
 import pyparsing

 from .pyhocon import ConfigFactory, HOCONConverter
+from ..storage.util import parse_size


 def parse_human_size(value):
     if isinstance(value, six.string_types):
-        return humanfriendly.parse_size(value)
+        return parse_size(value)
     return value
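The wrapper's behavior is unchanged for callers; only the backing implementation moves in-tree. A short sketch, following the code above:

    parse_human_size('10 MB')   # -> 10000000
    parse_human_size('128KiB')  # -> 131072
    parse_human_size(4096)      # -> 4096 (non-string values pass through unchanged)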