mirror of
https://github.com/clearml/clearml
synced 2025-04-06 13:45:17 +00:00
Merge 50ad7b09a1
into 342e1b35f8
This commit is contained in:
commit
3d84b712f1
@ -380,58 +380,75 @@ class Dataset(object):
|
|||||||
:param tags: A list of tags which describe the Task to add.
|
:param tags: A list of tags which describe the Task to add.
|
||||||
"""
|
"""
|
||||||
self._task.add_tags(tags)
|
self._task.add_tags(tags)
|
||||||
|
import re


def is_url(path):
    """
    Return True if the provided path is an external storage URL
    (e.g. ``s3://``, ``http://``, ``https://``), False otherwise.

    :param path: Path-like or string to test. Coerced with ``str()`` so that
        ``pathlib.Path`` / ``_Path`` inputs do not raise ``TypeError`` in ``re``.
    :return: bool, True when the path starts with a recognized remote scheme
    """
    # Recognized external-storage schemes. Note `https?` — a plain `http`
    # alternative would NOT match "https://" (the literal `://` must follow
    # the alternation), silently letting https URLs through the guard.
    url_regex = re.compile(r'^(?:https?|ftp|s3|gs|azure)://')
    return url_regex.match(str(path)) is not None
||||||
def add_files(
        self,
        path,  # type: Union[str, Path, _Path]
        wildcard=None,  # type: Optional[Union[str, Sequence[str]]]
        local_base_folder=None,  # type: Optional[str]
        dataset_path=None,  # type: Optional[str]
        recursive=True,  # type: bool
        verbose=False,  # type: bool
        max_workers=None,  # type: Optional[int]
):
    # type: (...) -> ()
    """
    Add a folder into the current dataset. calculate file hash,
    and compare against parent, mark files to be uploaded

    :param path: Add a folder/file to the dataset
    :param wildcard: add only specific set of files.
        Wildcard matching, can be a single string or a list of wildcards.
    :param local_base_folder: files will be located based on their relative path from local_base_folder
    :param dataset_path: where in the dataset the folder/files should be located
    :param recursive: If True, match all wildcard files recursively
    :param verbose: If True, print to console files added/modified
    :param max_workers: The number of threads to add the files with. Defaults to the number of logical cores
    :return: number of files added
    :raises ValueError: if `path` is a remote-storage URL (use `add_external_files()` for those)
    """
    # Check if the path provided is a URL, if so, raise an error and suggest using add_external_files
    if is_url(path):
        raise ValueError(
            "The path provided seems to be an external URL (e.g., s3://, http://). "
            "Please use `add_external_files()` to add external files to the dataset."
        )

    # Default to one worker thread per logical core.
    max_workers = max_workers or psutil.cpu_count()
    self._dirty = True
    # Log the call arguments to the task (backend only, not the console).
    self._task.get_logger().report_text(
        'Adding files to dataset: {}'.format(
            dict(path=path, wildcard=wildcard, local_base_folder=local_base_folder,
                 dataset_path=dataset_path, recursive=recursive, verbose=verbose)),
        print_console=False)

    # Hash files and diff against the parent dataset; marks new/changed files for upload.
    num_added, num_modified = self._add_files(
        path=path,
        wildcard=wildcard,
        local_base_folder=local_base_folder,
        dataset_path=dataset_path,
        recursive=recursive,
        verbose=verbose,
        max_workers=max_workers,
    )

    # update the task script
    self._add_script_call(
        'add_files', path=path, wildcard=wildcard, local_base_folder=local_base_folder,
        dataset_path=dataset_path, recursive=recursive)

    # Persist the updated dataset state to the backing task.
    self._serialize()

    return num_added
def add_external_files(
|
def add_external_files(
|
||||||
self,
|
self,
|
||||||
|
Loading…
Reference in New Issue
Block a user