Support providing list of links in clearml-data

2025-06-26 18:16:07 +00:00 · 2022-09-02 23:33:33 +03:00 · 2022-09-02 23:33:33 +03:00 · c2b4f728f4
commit c2b4f728f4
parent a6104347f2
3 changed files with 22 additions and 9 deletions
--- a/clearml/datasets/dataset.py
+++ b/clearml/datasets/dataset.py
@ -373,7 +373,7 @@ class Dataset(object):

    def add_external_files(
        self,
-        source_url,  # type: str
+        source_url,  # type: Union[str, Sequence[str]]
        wildcard=None,  # type: Optional[Union[str, Sequence[str]]]
        dataset_path=None,  # type: Optional[str]
        recursive=True,  # type: bool
@ -396,8 +396,8 @@ class Dataset(object):
        - Add the local file "/folder/local_file.jpg" to the dataset.
        add_external_files(source_url="file:///folder/local_file.jpg", dataset_path="/my_dataset/new_folder/")

-        :param source_url: Source url link to add to the dataset,
-            e.g. s3://bucket/folder/path, s3://bucket/folder/file.csv
+        :param source_url: Source url link (e.g. s3://bucket/folder/path) or list/tuple of links to add to
+            the dataset (e.g. [s3://bucket/folder/file.csv, http://web.com/file.txt])
        :param wildcard: add only specific set of files.
            Wildcard matching, can be a single string or a list of wildcards.
        :param dataset_path: The location in the dataset where the file will be downloaded into.
@ -407,7 +407,18 @@ class Dataset(object):
        :param verbose: If True print to console files added/modified
        :return: number of file links added
        """
+        num_added = 0
        self._dirty = True
+        if not isinstance(source_url, str):
+            for source_url_ in source_url:
+                num_added += self.add_external_files(
+                        source_url_,
+                        wildcard=wildcard,
+                        dataset_path=dataset_path,
+                        recursive=recursive,
+                        verbose=verbose
+                )
+            return num_added
        if dataset_path:
            dataset_path = dataset_path.lstrip("/")
        # noinspection PyBroadException
@ -419,11 +430,10 @@ class Dataset(object):
                    source_url = source_url + "/"
                links = StorageManager.list(source_url, return_full_path=True)
        except Exception:
-            self._task.get_logger().warning(
-                "Could not list remote file(s) when adding {}".format(source_url)
+            self._task.get_logger().report_text(
+                "Could not list/find remote file(s) when adding {}".format(source_url)
            )
            return 0
-        num_added = 0
        num_modified = 0
        for link in links:
            relative_path = link[len(source_url):]
--- a/clearml/storage/helper.py
+++ b/clearml/storage/helper.py
@ -518,7 +518,7 @@ class StorageHelper(object):
        try:
            if isinstance(self._driver, _HttpDriver) and obj:
                obj = self._driver._get_download_object(obj)  # noqa
-                size = obj.headers.get("Content-Length", 0)
+                size = int(obj.headers.get("Content-Length", 0))
            elif hasattr(obj, "size"):
                size = obj.size
                # Google storage has the option to reload the object to get the size
--- a/clearml/storage/manager.py
+++ b/clearml/storage/manager.py
@ -9,6 +9,7 @@ from typing import List, Optional
 from zipfile import ZipFile
 from six.moves.urllib.parse import urlparse

+import requests
 from pathlib2 import Path

 from .cache import CacheManager
@ -313,8 +314,10 @@ class StorageManager(object):

        :return: True if the remote_url stores a file and False otherwise
        """
-        if remote_url.startswith('file://'):
-            return os.path.isfile(remote_url[len('file://'):])
+        if remote_url.startswith("file://"):
+            return os.path.isfile(remote_url[len("file://"):])
+        if remote_url.startswith("http://") or remote_url.startswith("https://"):
+            return requests.head(remote_url).status_code == requests.codes.ok
        helper = StorageHelper.get(remote_url)
        obj = helper.get_object(remote_url)
        if not obj: