Support providing list of links in clearml-data

This commit is contained in:
allegroai 2022-09-02 23:33:33 +03:00
parent a6104347f2
commit c2b4f728f4
3 changed files with 22 additions and 9 deletions

View File

@ -373,7 +373,7 @@ class Dataset(object):
def add_external_files(
self,
source_url, # type: str
source_url, # type: Union[str, Sequence[str]]
wildcard=None, # type: Optional[Union[str, Sequence[str]]]
dataset_path=None, # type: Optional[str]
recursive=True, # type: bool
@ -396,8 +396,8 @@ class Dataset(object):
- Add the local file "/folder/local_file.jpg" to the dataset.
add_external_files(source_url="file:///folder/local_file.jpg", dataset_path="/my_dataset/new_folder/")
:param source_url: Source url link to add to the dataset,
e.g. s3://bucket/folder/path, s3://bucket/folder/file.csv
:param source_url: Source url link (e.g. s3://bucket/folder/path) or list/tuple of links to add to
the dataset (e.g. [s3://bucket/folder/file.csv, http://web.com/file.txt])
:param wildcard: add only specific set of files.
Wildcard matching, can be a single string or a list of wildcards.
:param dataset_path: The location in the dataset where the file will be downloaded into.
@ -407,7 +407,18 @@ class Dataset(object):
:param verbose: If True print to console files added/modified
:return: number of file links added
"""
num_added = 0
self._dirty = True
if not isinstance(source_url, str):
for source_url_ in source_url:
num_added += self.add_external_files(
source_url_,
wildcard=wildcard,
dataset_path=dataset_path,
recursive=recursive,
verbose=verbose
)
return num_added
if dataset_path:
dataset_path = dataset_path.lstrip("/")
# noinspection PyBroadException
@ -419,11 +430,10 @@ class Dataset(object):
source_url = source_url + "/"
links = StorageManager.list(source_url, return_full_path=True)
except Exception:
self._task.get_logger().warning(
"Could not list remote file(s) when adding {}".format(source_url)
self._task.get_logger().report_text(
"Could not list/find remote file(s) when adding {}".format(source_url)
)
return 0
num_added = 0
num_modified = 0
for link in links:
relative_path = link[len(source_url):]

View File

@ -518,7 +518,7 @@ class StorageHelper(object):
try:
if isinstance(self._driver, _HttpDriver) and obj:
obj = self._driver._get_download_object(obj) # noqa
size = obj.headers.get("Content-Length", 0)
size = int(obj.headers.get("Content-Length", 0))
elif hasattr(obj, "size"):
size = obj.size
# Google storage has the option to reload the object to get the size

View File

@ -9,6 +9,7 @@ from typing import List, Optional
from zipfile import ZipFile
from six.moves.urllib.parse import urlparse
import requests
from pathlib2 import Path
from .cache import CacheManager
@ -313,8 +314,10 @@ class StorageManager(object):
:return: True if the remote_url stores a file and False otherwise
"""
if remote_url.startswith('file://'):
return os.path.isfile(remote_url[len('file://'):])
if remote_url.startswith("file://"):
return os.path.isfile(remote_url[len("file://"):])
if remote_url.startswith("http://") or remote_url.startswith("https://"):
return requests.head(remote_url).status_code == requests.codes.ok
helper = StorageHelper.get(remote_url)
obj = helper.get_object(remote_url)
if not obj: