diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py index 588e1f6a..f81c09b7 100644 --- a/clearml/datasets/dataset.py +++ b/clearml/datasets/dataset.py @@ -373,7 +373,7 @@ class Dataset(object): def add_external_files( self, - source_url, # type: str + source_url, # type: Union[str, Sequence[str]] wildcard=None, # type: Optional[Union[str, Sequence[str]]] dataset_path=None, # type: Optional[str] recursive=True, # type: bool @@ -396,8 +396,8 @@ class Dataset(object): - Add the local file "/folder/local_file.jpg" to the dataset. add_external_files(source_url="file:///folder/local_file.jpg", dataset_path="/my_dataset/new_folder/") - :param source_url: Source url link to add to the dataset, - e.g. s3://bucket/folder/path, s3://bucket/folder/file.csv + :param source_url: Source url link (e.g. s3://bucket/folder/path) or list/tuple of links to add to + the dataset (e.g. [s3://bucket/folder/file.csv, http://web.com/file.txt]) :param wildcard: add only specific set of files. Wildcard matching, can be a single string or a list of wildcards. :param dataset_path: The location in the dataset where the file will be downloaded into. @@ -407,7 +407,18 @@ class Dataset(object): :param verbose: If True print to console files added/modified :return: number of file links added """ + num_added = 0 self._dirty = True + if not isinstance(source_url, str): + for source_url_ in source_url: + num_added += self.add_external_files( + source_url_, + wildcard=wildcard, + dataset_path=dataset_path, + recursive=recursive, + verbose=verbose + ) + return num_added if dataset_path: dataset_path = dataset_path.lstrip("/") # noinspection PyBroadException @@ -419,11 +430,10 @@ class Dataset(object): source_url = source_url + "/" links = StorageManager.list(source_url, return_full_path=True) except Exception: - self._task.get_logger().warning( - "Could not list remote file(s) when adding {}".format(source_url) + self._task.get_logger().report_text( + "Could not list/find remote file(s) when adding {}".format(source_url) ) return 0 - num_added = 0 num_modified = 0 for link in links: relative_path = link[len(source_url):] diff --git a/clearml/storage/helper.py b/clearml/storage/helper.py index acbe401f..d84c501c 100644 --- a/clearml/storage/helper.py +++ b/clearml/storage/helper.py @@ -518,7 +518,7 @@ class StorageHelper(object): try: if isinstance(self._driver, _HttpDriver) and obj: obj = self._driver._get_download_object(obj) # noqa - size = obj.headers.get("Content-Length", 0) + size = int(obj.headers.get("Content-Length", 0)) elif hasattr(obj, "size"): size = obj.size # Google storage has the option to reload the object to get the size diff --git a/clearml/storage/manager.py b/clearml/storage/manager.py index efcc1192..029c5c70 100644 --- a/clearml/storage/manager.py +++ b/clearml/storage/manager.py @@ -9,6 +9,7 @@ from typing import List, Optional from zipfile import ZipFile from six.moves.urllib.parse import urlparse +import requests from pathlib2 import Path from .cache import CacheManager @@ -313,8 +314,10 @@ class StorageManager(object): :return: True is the remote_url stores a file and False otherwise """ - if remote_url.startswith('file://'): - return os.path.isfile(remote_url[len('file://'):]) + if remote_url.startswith("file://"): + return os.path.isfile(remote_url[len("file://"):]) + if remote_url.startswith("http://") or remote_url.startswith("https://"): + return requests.head(remote_url).status_code == requests.codes.ok helper = StorageHelper.get(remote_url) obj = helper.get_object(remote_url) if not obj: