Support providing list of links in clearml-data

allegroai 2022-09-02 23:33:33 +03:00
parent a6104347f2
commit c2b4f728f4
3 changed files with 22 additions and 9 deletions

@@ -373,7 +373,7 @@ class Dataset(object):
     def add_external_files(
         self,
-        source_url,  # type: str
+        source_url,  # type: Union[str, Sequence[str]]
         wildcard=None,  # type: Optional[Union[str, Sequence[str]]]
         dataset_path=None,  # type: Optional[str]
         recursive=True,  # type: bool
@@ -396,8 +396,8 @@ class Dataset(object):
         - Add the local file "/folder/local_file.jpg" to the dataset.
             add_external_files(source_url="file:///folder/local_file.jpg", dataset_path="/my_dataset/new_folder/")
-        :param source_url: Source url link to add to the dataset,
-            e.g. s3://bucket/folder/path, s3://bucket/folder/file.csv
+        :param source_url: Source url link (e.g. s3://bucket/folder/path) or list/tuple of links to add to
+            the dataset (e.g. [s3://bucket/folder/file.csv, http://web.com/file.txt])
         :param wildcard: add only specific set of files.
             Wildcard matching, can be a single string or a list of wildcards.
         :param dataset_path: The location in the dataset where the file will be downloaded into.
@@ -407,7 +407,18 @@ class Dataset(object):
         :param verbose: If True print to console files added/modified
         :return: number of file links added
         """
+        num_added = 0
         self._dirty = True
+        if not isinstance(source_url, str):
+            for source_url_ in source_url:
+                num_added += self.add_external_files(
+                    source_url_,
+                    wildcard=wildcard,
+                    dataset_path=dataset_path,
+                    recursive=recursive,
+                    verbose=verbose
+                )
+            return num_added
         if dataset_path:
             dataset_path = dataset_path.lstrip("/")
         # noinspection PyBroadException
@@ -419,11 +430,10 @@ class Dataset(object):
                     source_url = source_url + "/"
                 links = StorageManager.list(source_url, return_full_path=True)
         except Exception:
-            self._task.get_logger().warning(
-                "Could not list remote file(s) when adding {}".format(source_url)
+            self._task.get_logger().report_text(
+                "Could not list/find remote file(s) when adding {}".format(source_url)
             )
             return 0
-        num_added = 0
         num_modified = 0
         for link in links:
             relative_path = link[len(source_url):]
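With this change, add_external_files accepts either a single link or a list/tuple of links; each entry is handled by a recursive call and the per-call counts are summed into the returned total. A minimal usage sketch, assuming the standard clearml Dataset flow (the project/dataset names below are placeholders, and the links are the examples from the docstring, not values from this commit):

    from clearml import Dataset

    # Placeholder project/dataset names -- illustrative only.
    ds = Dataset.create(dataset_project="examples", dataset_name="external-links-demo")

    # Previously source_url had to be a single string; a list/tuple now works as well.
    num_links = ds.add_external_files(
        source_url=[
            "s3://bucket/folder/file.csv",
            "http://web.com/file.txt",
        ],
        dataset_path="/my_dataset/new_folder/",
    )
    print("links added:", num_links)

    # Usual finalization flow; external links add no local content to upload.
    ds.upload()
    ds.finalize()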

@@ -518,7 +518,7 @@ class StorageHelper(object):
         try:
             if isinstance(self._driver, _HttpDriver) and obj:
                 obj = self._driver._get_download_object(obj)  # noqa
-                size = obj.headers.get("Content-Length", 0)
+                size = int(obj.headers.get("Content-Length", 0))
             elif hasattr(obj, "size"):
                 size = obj.size
             # Google storage has the option to reload the object to get the size
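The added int() cast matters because requests returns HTTP header values as strings, so without it the size would propagate as e.g. "1024" rather than 1024. A small stand-alone illustration of that behavior, not the helper's own code (the URL is a placeholder):

    import requests

    # Placeholder URL -- illustrative only.
    resp = requests.head("http://web.com/file.txt", allow_redirects=True)

    raw = resp.headers.get("Content-Length", 0)   # a str such as "1024" when the header is present (int 0 default otherwise)
    size = int(raw)                               # normalize to an int before comparing or accumulating sizes

    print(type(raw), type(size))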

@@ -9,6 +9,7 @@ from typing import List, Optional
 from zipfile import ZipFile
 
 from six.moves.urllib.parse import urlparse
+import requests
 from pathlib2 import Path
 
 from .cache import CacheManager
@@ -313,8 +314,10 @@ class StorageManager(object):
         :return: True is the remote_url stores a file and False otherwise
         """
-        if remote_url.startswith('file://'):
-            return os.path.isfile(remote_url[len('file://'):])
+        if remote_url.startswith("file://"):
+            return os.path.isfile(remote_url[len("file://"):])
+        if remote_url.startswith("http://") or remote_url.startswith("https://"):
+            return requests.head(remote_url).status_code == requests.codes.ok
         helper = StorageHelper.get(remote_url)
         obj = helper.get_object(remote_url)
         if not obj:
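For plain http:// and https:// links there is no storage driver object to inspect, so the new branch falls back to an HTTP HEAD request and treats a 200 OK response as "the link points to a file". A rough stand-alone sketch of that logic, with a hypothetical helper name and a placeholder URL (not this module's API):

    import requests

    def url_points_to_file(remote_url):
        # Hypothetical helper mirroring the new http/https branch above:
        # HEAD avoids downloading the body; 200 OK means the link resolves to a file.
        if remote_url.startswith("http://") or remote_url.startswith("https://"):
            return requests.head(remote_url).status_code == requests.codes.ok
        raise ValueError("only http/https links are handled in this sketch")

    print(url_points_to_file("http://web.com/file.txt"))  # placeholder URL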