This commit is contained in:
AuricResin 2025-02-22 16:30:06 -08:00 committed by GitHub
commit 3d84b712f1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -380,8 +380,18 @@ class Dataset(object):
:param tags: A list of tags which describe the Task to add. :param tags: A list of tags which describe the Task to add.
""" """
self._task.add_tags(tags) self._task.add_tags(tags)
import re
def is_url(path):
    # type: (Union[str, Path, _Path]) -> bool
    """
    Check whether ``path`` starts with an external URL scheme.

    Recognized schemes are ``http``, ``https``, ``ftp``, ``s3``, ``gs`` and ``azure``,
    each followed by ``://``.

    :param path: The value to test. Callers may pass a string or a path-like object.
    :return: True if ``path`` is a string starting with one of the recognized
        schemes, False otherwise (including for non-string input).
    """
    # NOTE(review): indentation is not visible in this view. If this helper lives in the
    # class body, the bare call `is_url(path)` from a method will not resolve - confirm
    # it is defined at module level (or made a @staticmethod and called via the class).
    try:
        # `https?` so that https:// links are detected too; plain `http` followed
        # by `://` never matches an https URL.
        return re.match(r"(?:https?|ftp|s3|gs|azure)://", path) is not None
    except TypeError:
        # Path-like objects (and any other non-string value) cannot be matched by a
        # str pattern; they are treated as "not a URL" instead of crashing the caller.
        return False
def add_files(
self, self,
path, # type: Union[str, Path, _Path] path, # type: Union[str, Path, _Path]
wildcard=None, # type: Optional[Union[str, Sequence[str]]] wildcard=None, # type: Optional[Union[str, Sequence[str]]]
@ -390,7 +400,7 @@ class Dataset(object):
recursive=True, # type: bool recursive=True, # type: bool
verbose=False, # type: bool verbose=False, # type: bool
max_workers=None, # type: Optional[int] max_workers=None, # type: Optional[int]
): ):
# type: (...) -> () # type: (...) -> ()
""" """
Add a folder into the current dataset. calculate file hash, Add a folder into the current dataset. calculate file hash,
@ -406,6 +416,13 @@ class Dataset(object):
:param max_workers: The number of threads to add the files with. Defaults to the number of logical cores :param max_workers: The number of threads to add the files with. Defaults to the number of logical cores
:return: number of files added :return: number of files added
""" """
# Check if the path provided is a URL, if so, raise an error and suggest using add_external_files
if is_url(path):
raise ValueError(
"The path provided seems to be an external URL (e.g., s3://, http://). "
"Please use `add_external_files()` to add external files to the dataset."
)
max_workers = max_workers or psutil.cpu_count() max_workers = max_workers or psutil.cpu_count()
self._dirty = True self._dirty = True
self._task.get_logger().report_text( self._task.get_logger().report_text(