mirror of
https://github.com/clearml/clearml
synced 2025-04-16 21:42:10 +00:00
Support providing list of links in clearml-data
This commit is contained in:
parent
a6104347f2
commit
c2b4f728f4
@ -373,7 +373,7 @@ class Dataset(object):
|
||||
|
||||
def add_external_files(
|
||||
self,
|
||||
source_url, # type: str
|
||||
source_url, # type: Union[str, Sequence[str]]
|
||||
wildcard=None, # type: Optional[Union[str, Sequence[str]]]
|
||||
dataset_path=None, # type: Optional[str]
|
||||
recursive=True, # type: bool
|
||||
@ -396,8 +396,8 @@ class Dataset(object):
|
||||
- Add the local file "/folder/local_file.jpg" to the dataset.
|
||||
add_external_files(source_url="file:///folder/local_file.jpg", dataset_path="/my_dataset/new_folder/")
|
||||
|
||||
:param source_url: Source url link to add to the dataset,
|
||||
e.g. s3://bucket/folder/path, s3://bucket/folder/file.csv
|
||||
:param source_url: Source url link (e.g. s3://bucket/folder/path) or list/tuple of links to add to
|
||||
the dataset (e.g. [s3://bucket/folder/file.csv, http://web.com/file.txt])
|
||||
:param wildcard: add only specific set of files.
|
||||
Wildcard matching, can be a single string or a list of wildcards.
|
||||
:param dataset_path: The location in the dataset where the file will be downloaded into.
|
||||
@ -407,7 +407,18 @@ class Dataset(object):
|
||||
:param verbose: If True, print the files added/modified to the console
|
||||
:return: number of file links added
|
||||
"""
|
||||
num_added = 0
|
||||
self._dirty = True
|
||||
if not isinstance(source_url, str):
|
||||
for source_url_ in source_url:
|
||||
num_added += self.add_external_files(
|
||||
source_url_,
|
||||
wildcard=wildcard,
|
||||
dataset_path=dataset_path,
|
||||
recursive=recursive,
|
||||
verbose=verbose
|
||||
)
|
||||
return num_added
|
||||
if dataset_path:
|
||||
dataset_path = dataset_path.lstrip("/")
|
||||
# noinspection PyBroadException
|
||||
@ -419,11 +430,10 @@ class Dataset(object):
|
||||
source_url = source_url + "/"
|
||||
links = StorageManager.list(source_url, return_full_path=True)
|
||||
except Exception:
|
||||
self._task.get_logger().warning(
|
||||
"Could not list remote file(s) when adding {}".format(source_url)
|
||||
self._task.get_logger().report_text(
|
||||
"Could not list/find remote file(s) when adding {}".format(source_url)
|
||||
)
|
||||
return 0
|
||||
num_added = 0
|
||||
num_modified = 0
|
||||
for link in links:
|
||||
relative_path = link[len(source_url):]
|
||||
|
@ -518,7 +518,7 @@ class StorageHelper(object):
|
||||
try:
|
||||
if isinstance(self._driver, _HttpDriver) and obj:
|
||||
obj = self._driver._get_download_object(obj) # noqa
|
||||
size = obj.headers.get("Content-Length", 0)
|
||||
size = int(obj.headers.get("Content-Length", 0))
|
||||
elif hasattr(obj, "size"):
|
||||
size = obj.size
|
||||
# Google storage has the option to reload the object to get the size
|
||||
|
@ -9,6 +9,7 @@ from typing import List, Optional
|
||||
from zipfile import ZipFile
|
||||
from six.moves.urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from pathlib2 import Path
|
||||
|
||||
from .cache import CacheManager
|
||||
@ -313,8 +314,10 @@ class StorageManager(object):
|
||||
|
||||
:return: True if the remote_url stores a file and False otherwise
|
||||
"""
|
||||
if remote_url.startswith('file://'):
|
||||
return os.path.isfile(remote_url[len('file://'):])
|
||||
if remote_url.startswith("file://"):
|
||||
return os.path.isfile(remote_url[len("file://"):])
|
||||
if remote_url.startswith("http://") or remote_url.startswith("https://"):
|
||||
return requests.head(remote_url).status_code == requests.codes.ok
|
||||
helper = StorageHelper.get(remote_url)
|
||||
obj = helper.get_object(remote_url)
|
||||
if not obj:
|
||||
|
Loading…
Reference in New Issue
Block a user