Mirror of https://github.com/clearml/clearml
Support providing list of links in clearml-data
parent a6104347f2
commit c2b4f728f4
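For context, this commit lets add_external_files() accept a list or tuple of links in addition to a single URL. A minimal usage sketch under that change (dataset/project names and URLs are placeholders, not from the diff):

    from clearml import Dataset

    # Create a new dataset version (names are hypothetical)
    dataset = Dataset.create(dataset_name="my_dataset", dataset_project="examples")

    # With this commit a list/tuple of links is accepted directly;
    # the call returns the total number of file links added.
    num_added = dataset.add_external_files(
        source_url=[
            "s3://bucket/folder/file.csv",
            "http://web.com/file.txt",
        ],
        dataset_path="/my_dataset/new_folder/",
    )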
@@ -373,7 +373,7 @@ class Dataset(object):
 
     def add_external_files(
         self,
-        source_url,  # type: str
+        source_url,  # type: Union[str, Sequence[str]]
         wildcard=None,  # type: Optional[Union[str, Sequence[str]]]
         dataset_path=None,  # type: Optional[str]
         recursive=True,  # type: bool

@@ -396,8 +396,8 @@ class Dataset(object):
         - Add the local file "/folder/local_file.jpg" to the dataset.
             add_external_files(source_url="file:///folder/local_file.jpg", dataset_path="/my_dataset/new_folder/")
 
-        :param source_url: Source url link to add to the dataset,
-            e.g. s3://bucket/folder/path, s3://bucket/folder/file.csv
+        :param source_url: Source url link (e.g. s3://bucket/folder/path) or list/tuple of links to add to
+            the dataset (e.g. [s3://bucket/folder/file.csv, http://web.com/file.txt])
         :param wildcard: add only specific set of files.
             Wildcard matching, can be a single string or a list of wildcards.
         :param dataset_path: The location in the dataset where the file will be downloaded into.

@@ -407,7 +407,18 @@ class Dataset(object):
         :param verbose: If True print to console files added/modified
         :return: number of file links added
         """
+        num_added = 0
         self._dirty = True
+        if not isinstance(source_url, str):
+            for source_url_ in source_url:
+                num_added += self.add_external_files(
+                    source_url_,
+                    wildcard=wildcard,
+                    dataset_path=dataset_path,
+                    recursive=recursive,
+                    verbose=verbose
+                )
+            return num_added
         if dataset_path:
             dataset_path = dataset_path.lstrip("/")
         # noinspection PyBroadException
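The block added above handles sequence input by recursing once per link and summing the per-link counts, so the single-URL path below stays untouched. The same dispatch pattern in isolation, as a hedged standalone sketch (add_links is a toy stand-in, not part of clearml):

    from typing import Sequence, Union  # noqa: F401 (used in the type comment)

    def add_links(source_url):
        # type: (Union[str, Sequence[str]]) -> int
        """Toy stand-in for add_external_files: returns the number of links added."""
        if not isinstance(source_url, str):
            # A sequence was passed in: recurse once per link and sum the counts
            return sum(add_links(url) for url in source_url)
        print("adding {}".format(source_url))
        return 1

    assert add_links(["s3://bucket/a.csv", "s3://bucket/b.csv"]) == 2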

@@ -419,11 +430,10 @@ class Dataset(object):
                     source_url = source_url + "/"
                 links = StorageManager.list(source_url, return_full_path=True)
         except Exception:
-            self._task.get_logger().warning(
-                "Could not list remote file(s) when adding {}".format(source_url)
+            self._task.get_logger().report_text(
+                "Could not list/find remote file(s) when adding {}".format(source_url)
             )
             return 0
-        num_added = 0
         num_modified = 0
         for link in links:
             relative_path = link[len(source_url):]
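The exception handler now goes through the ClearML logger's report_text() instead of warning(). A minimal hedged sketch of that call outside the Dataset class (project/task names and the URL are placeholders):

    from clearml import Task

    task = Task.init(project_name="examples", task_name="report-text-demo")
    # report_text() prints to the console and records the line in the task's log
    task.get_logger().report_text("Could not list/find remote file(s) when adding s3://bucket/missing/")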
@@ -518,7 +518,7 @@ class StorageHelper(object):
         try:
             if isinstance(self._driver, _HttpDriver) and obj:
                 obj = self._driver._get_download_object(obj)  # noqa
-                size = obj.headers.get("Content-Length", 0)
+                size = int(obj.headers.get("Content-Length", 0))
             elif hasattr(obj, "size"):
                 size = obj.size
             # Google storage has the option to reload the object to get the size
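The int() cast above is needed because HTTP header values are strings: requests returns Content-Length as str, so using it as a size requires conversion. A quick hedged illustration (URL is a placeholder):

    import requests

    resp = requests.head("https://example.com/file.bin", allow_redirects=True)
    size = resp.headers.get("Content-Length", 0)
    print(type(size))      # <class 'str'> whenever the header is present
    print(int(size or 0))  # cast before using it as a number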
@@ -9,6 +9,7 @@ from typing import List, Optional
 from zipfile import ZipFile
 from six.moves.urllib.parse import urlparse
 
+import requests
 from pathlib2 import Path
 
 from .cache import CacheManager

@@ -313,8 +314,10 @@ class StorageManager(object):
 
         :return: True is the remote_url stores a file and False otherwise
         """
-        if remote_url.startswith('file://'):
-            return os.path.isfile(remote_url[len('file://'):])
+        if remote_url.startswith("file://"):
+            return os.path.isfile(remote_url[len("file://"):])
+        if remote_url.startswith("http://") or remote_url.startswith("https://"):
+            return requests.head(remote_url).status_code == requests.codes.ok
         helper = StorageHelper.get(remote_url)
         obj = helper.get_object(remote_url)
         if not obj:
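With the new branch above, StorageManager.exists_file() checks http/https links with a single HEAD request instead of going through a storage driver. A hedged usage sketch (URLs are placeholders):

    from clearml import StorageManager

    # http/https links are now answered by a HEAD request returning 200 OK
    print(StorageManager.exists_file("https://example.com/data/file.csv"))
    # file:// links are resolved against the local filesystem
    print(StorageManager.exists_file("file:///tmp/local_file.csv"))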