# Mirror of https://github.com/clearml/clearml (synced 2025-04-08 14:46:07 +00:00)
import json
import os
import shutil
from copy import deepcopy, copy
from fnmatch import fnmatch
from multiprocessing import cpu_count
from multiprocessing.pool import ThreadPool
from tempfile import mkstemp, mkdtemp
from typing import Union, Optional, Sequence, List, Dict, Any, Mapping
from zipfile import ZipFile, ZIP_DEFLATED

import humanfriendly
from attr import attrs, attrib
from pathlib2 import Path

from .. import Task, StorageManager
from ..backend_api.session.client import APIClient
from ..backend_interface.task.development.worker import DevWorker
from ..backend_interface.util import mutually_exclusive, exact_match_regex
from ..debugging.log import LoggerRoot
from ..storage.helper import StorageHelper
from ..storage.cache import CacheManager
from ..storage.util import sha256sum, is_windows, md5text

try:
    from pathlib import Path as _Path  # noqa
except ImportError:
    _Path = None


@attrs
class FileEntry(object):
    relative_path = attrib(default=None, type=str)
    hash = attrib(default=None, type=str)
    parent_dataset_id = attrib(default=None, type=str)
    size = attrib(default=None, type=int)
    # cleared when file is uploaded.
    local_path = attrib(default=None, type=str)

    def as_dict(self):
        # type: () -> Dict
        state = dict(relative_path=self.relative_path, hash=self.hash,
                     parent_dataset_id=self.parent_dataset_id, size=self.size,
                     **dict([('local_path', self.local_path)] if self.local_path else ()))
        return state


class Dataset(object):
    __private_magic = 42 * 1337
    __state_entry_name = 'state'
    __data_entry_name = 'data'
    __cache_context = 'datasets'
    __tag = 'dataset'
    __cache_folder_prefix = 'ds_'
    __dataset_folder_template = CacheManager.set_context_folder_lookup(__cache_context, "{0}_archive_{1}")
    __preview_max_file_entries = 15000
    __preview_max_size = 5 * 1024 * 1024

def __init__(self, _private, task=None, dataset_project=None, dataset_name=None):
|
|
# type: (int, Optional[Task], Optional[str], Optional[str]) -> ()
|
|
"""
|
|
Do not use directly! Use Dataset.create(...) or Dataset.get(...) instead.
|
|
"""
|
|
assert _private == self.__private_magic
|
|
# key for the dataset file entries are the relative path within the data
|
|
self._dataset_file_entries = {} # type: Dict[str, FileEntry]
|
|
        # this will create a graph of all the dependencies we have, each entry lists its own direct parents
|
|
self._dependency_graph = {} # type: Dict[str, List[str]]
|
|
if not task:
|
|
task = Task.create(
|
|
project_name=dataset_project, task_name=dataset_name, task_type=Task.TaskTypes.data_processing)
|
|
task.set_system_tags((task.get_system_tags() or []) + [self.__tag])
|
|
task.mark_started()
|
|
# generate the script section
|
|
script = \
|
|
'from clearml import Dataset\n\n' \
|
|
'ds = Dataset.create(dataset_project=\'{dataset_project}\', dataset_name=\'{dataset_name}\')\n'.format(
|
|
dataset_project=dataset_project, dataset_name=dataset_name)
|
|
task.data.script.diff = script
|
|
task.data.script.working_dir = '.'
|
|
task.data.script.entry_point = 'register_dataset.py'
|
|
from clearml import __version__
|
|
task.data.script.requirements = {'pip': 'clearml == {}\n'.format(__version__)}
|
|
# noinspection PyProtectedMember
|
|
task._edit(script=task.data.script)
|
|
|
|
# if the task is running make sure we ping to the server so it will not be aborted by a watchdog
|
|
if task.status in ('created', 'in_progress'):
|
|
self._task_pinger = DevWorker()
|
|
self._task_pinger.register(task, stop_signal_support=False)
|
|
else:
|
|
self._task_pinger = None
|
|
|
|
# store current dataset Task
|
|
self._task = task
|
|
# store current dataset id
|
|
self._id = task.id
|
|
# store the folder where the dataset was downloaded to
|
|
self._local_base_folder = None # type: Optional[Path]
|
|
        # dirty flag, set True by any function call changing the dataset (regardless of whether it did anything)
|
|
self._dirty = False
|
|
|
|
@property
|
|
def id(self):
|
|
# type: () -> str
|
|
return self._id
|
|
|
|
@property
|
|
def file_entries(self):
|
|
# type: () -> List[FileEntry]
|
|
return list(self._dataset_file_entries.values())
|
|
|
|
@property
|
|
def file_entries_dict(self):
|
|
# type: () -> Mapping[str, FileEntry]
|
|
"""
|
|
Notice this call returns an internal representation, do not modify!
|
|
:return: dict with relative file path as key, and FileEntry as value
|
|
"""
|
|
return self._dataset_file_entries
|
|
|
|
@property
|
|
def project(self):
|
|
# type: () -> str
|
|
return self._task.get_project_name()
|
|
|
|
@property
|
|
def name(self):
|
|
# type: () -> str
|
|
return self._task.name
|
|
|
|
@property
|
|
def tags(self):
|
|
# type: () -> List[str]
|
|
return self._task.get_tags() or []
|
|
|
|
@tags.setter
|
|
def tags(self, values):
|
|
# type: (List[str]) -> ()
|
|
self._task.set_tags(values or [])
|
|
|
|
def add_files(self,
|
|
path, # type: Union[str, Path, _Path]
|
|
wildcard=None, # type: Optional[Union[str, Sequence[str]]]
|
|
local_base_folder=None, # type: Optional[str]
|
|
dataset_path=None, # type: Optional[str]
|
|
recursive=True, # type: bool
|
|
verbose=False # type: bool
|
|
):
|
|
        # type: (...) -> int
|
|
"""
|
|
Add a folder into the current dataset. calculate file hash,
|
|
and compare against parent, mark files to be uploaded
|
|
|
|
:param path: Add a folder/file to the dataset
|
|
:param wildcard: add only specific set of files.
|
|
Wildcard matching, can be a single string or a list of wildcards)
|
|
:param local_base_folder: files will be located based on their relative path from local_base_folder
|
|
:param dataset_path: where in the dataset the folder/files should be located
|
|
:param recursive: If True match all wildcard files recursively
|
|
:param verbose: If True print to console files added/modified
|
|
:return: number of files added
|
|
"""
|
|
self._dirty = True
|
|
self._task.get_logger().report_text(
|
|
'Adding files to dataset: {}'.format(
|
|
dict(path=path, wildcard=wildcard, local_base_folder=local_base_folder,
|
|
dataset_path=dataset_path, recursive=recursive, verbose=verbose)),
|
|
print_console=False)
|
|
|
|
num_added = self._add_files(
|
|
path=path, wildcard=wildcard, local_base_folder=local_base_folder,
|
|
dataset_path=dataset_path, recursive=recursive, verbose=verbose)
|
|
|
|
# update the task script
|
|
self._add_script_call(
|
|
'add_files', path=path, wildcard=wildcard, local_base_folder=local_base_folder,
|
|
dataset_path=dataset_path, recursive=recursive)
|
|
|
|
self._serialize()
|
|
|
|
return num_added
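    # Usage sketch (illustrative; the dataset name, project and local path are placeholders):
    #   ds = Dataset.create(dataset_name='my_dataset', dataset_project='examples')
    #   ds.add_files(path='/data/images', wildcard='*.jpg', dataset_path='images', verbose=True)
    #   # files are hashed and queued; call ds.upload() to actually store the new/changed files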
|
|
|
|
def remove_files(self, dataset_path=None, recursive=True, verbose=False):
|
|
# type: (Optional[str], bool, bool) -> int
|
|
"""
|
|
Add a folder into the current dataset. calculate file hash,
|
|
and compare against parent, mark files to be uploaded
|
|
|
|
:param dataset_path: Remove files from the dataset.
|
|
The path is always relative to the dataset (e.g 'folder/file.bin')
|
|
:param recursive: If True match all wildcard files recursively
|
|
:param verbose: If True print to console files removed
|
|
:return: Number of files removed
|
|
"""
|
|
self._dirty = True
|
|
self._task.get_logger().report_text(
|
|
'Removing files from dataset: {}'.format(
|
|
dict(dataset_path=dataset_path, recursive=recursive, verbose=verbose)),
|
|
print_console=False)
|
|
|
|
if dataset_path and dataset_path.startswith('/'):
|
|
dataset_path = dataset_path[1:]
|
|
|
|
num_files = len(self._dataset_file_entries)
|
|
org_files = list(self._dataset_file_entries.keys()) if verbose else None
|
|
|
|
if not recursive:
|
|
self._dataset_file_entries = {
|
|
k: v for k, v in self._dataset_file_entries.items()
|
|
if not fnmatch(k + '/', dataset_path + '/')}
|
|
else:
|
|
wildcard = dataset_path.split('/')[-1]
|
|
            path = dataset_path[:-len(wildcard)] + '*'
|
|
|
|
self._dataset_file_entries = {
|
|
k: v for k, v in self._dataset_file_entries.items()
|
|
if not (fnmatch(k, path) and fnmatch(k, '*/' + wildcard))}
|
|
|
|
if verbose and org_files:
|
|
for f in org_files:
|
|
if f not in self._dataset_file_entries:
|
|
self._task.get_logger().report_text('Remove {}'.format(f))
|
|
|
|
# update the task script
|
|
self._add_script_call(
|
|
'remove_files', dataset_path=dataset_path, recursive=recursive)
|
|
|
|
self._serialize()
|
|
|
|
return num_files - len(self._dataset_file_entries)
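    # Usage sketch (illustrative; the wildcard path is a placeholder):
    #   removed = ds.remove_files(dataset_path='images/*.tmp', verbose=True)
    #   print('{} files removed'.format(removed))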
|
|
|
|
def sync_folder(self, local_path, dataset_path=None, verbose=False):
|
|
# type: (Union[Path, _Path, str], Union[Path, _Path, str], bool) -> (int, int)
|
|
"""
|
|
Synchronize the dataset with a local folder. The dataset is synchronized from the
|
|
relative_base_folder (default: dataset root) and deeper with the specified local path.
|
|
|
|
:param local_path: Local folder to sync (assumes all files and recursive)
|
|
:param dataset_path: Target dataset path to sync with (default the root of the dataset)
|
|
:param verbose: If true print to console files added/modified/removed
|
|
:return: number of files removed, number of files modified/added
|
|
"""
|
|
def filter_f(f):
|
|
keep = (not f.relative_path.startswith(relative_prefix) or
|
|
(local_path / f.relative_path[len(relative_prefix):]).is_file())
|
|
if not keep and verbose:
|
|
self._task.get_logger().report_text('Remove {}'.format(f.relative_path))
|
|
return keep
|
|
|
|
self._task.get_logger().report_text(
|
|
'Syncing local copy with dataset: {}'.format(
|
|
dict(local_path=local_path, dataset_path=dataset_path, verbose=verbose)),
|
|
print_console=False)
|
|
|
|
self._dirty = True
|
|
local_path = Path(local_path)
|
|
|
|
# Path().as_posix() will never end with /
|
|
relative_prefix = (Path(dataset_path).as_posix() + '/') if dataset_path else ''
|
|
|
|
# remove files
|
|
num_files = len(self._dataset_file_entries)
|
|
self._dataset_file_entries = {
|
|
k: f for k, f in self._dataset_file_entries.items() if filter_f(f)}
|
|
removed_files = num_files - len(self._dataset_file_entries)
|
|
|
|
# add remaining files
|
|
added_files = self._add_files(path=local_path, dataset_path=dataset_path, recursive=True, verbose=verbose)
|
|
|
|
if verbose:
|
|
self._task.get_logger().report_text(
|
|
'Syncing folder {} : {} files removed, {} added / modified'.format(
|
|
local_path.as_posix(), removed_files, added_files))
|
|
|
|
# update the task script
|
|
self._add_script_call(
|
|
'sync_folder', local_path=local_path, dataset_path=dataset_path)
|
|
|
|
self._serialize()
|
|
return removed_files, added_files
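    # Usage sketch (illustrative; the local folder is a placeholder):
    #   removed, added = ds.sync_folder(local_path='/data/images', dataset_path='images')
    #   # files deleted locally are removed from the dataset, new/changed files are re-added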
|
|
|
|
def upload(self, show_progress=True, verbose=False, output_url=None, compression=None):
|
|
# type: (bool, bool, Optional[str], Optional[str]) -> ()
|
|
"""
|
|
Start file uploading, the function returns when all files are uploaded.
|
|
|
|
:param show_progress: If True show upload progress bar
|
|
:param verbose: If True print verbose progress report
|
|
:param output_url: Target storage for the compressed dataset (default: file server)
|
|
Examples: `s3://bucket/data`, `gs://bucket/data` , `azure://bucket/data` , `/mnt/share/data`
|
|
:param compression: Compression algorithm for the Zipped dataset file (default: ZIP_DEFLATED)
|
|
"""
|
|
# set output_url
|
|
if output_url:
|
|
self._task.output_uri = output_url
|
|
|
|
self._task.get_logger().report_text(
|
|
'Uploading dataset files: {}'.format(
|
|
dict(show_progress=show_progress, verbose=verbose, output_url=output_url, compression=compression)),
|
|
print_console=False)
|
|
|
|
fd, zip_file = mkstemp(
|
|
prefix='dataset.{}.'.format(self._id), suffix='.zip'
|
|
)
|
|
archive_preview = ''
|
|
count = 0
|
|
try:
|
|
with ZipFile(zip_file, 'w', allowZip64=True, compression=compression or ZIP_DEFLATED) as zf:
|
|
for file_entry in self._dataset_file_entries.values():
|
|
if not file_entry.local_path:
|
|
# file is located in a different version
|
|
continue
|
|
filename = Path(file_entry.local_path)
|
|
if not filename.is_file():
|
|
LoggerRoot.get_base_logger().warning(
|
|
"Could not store dataset file {}. File skipped".format(file_entry.local_path))
|
|
# mark for removal
|
|
file_entry.relative_path = None
|
|
continue
|
|
if verbose:
|
|
self._task.get_logger().report_text('Compressing {}'.format(filename.as_posix()))
|
|
|
|
relative_file_name = file_entry.relative_path
|
|
zf.write(filename.as_posix(), arcname=relative_file_name)
|
|
archive_preview += '{} - {}\n'.format(
|
|
relative_file_name, humanfriendly.format_size(filename.stat().st_size))
|
|
file_entry.local_path = None
|
|
count += 1
|
|
except Exception as e:
|
|
# failed uploading folder:
|
|
LoggerRoot.get_base_logger().warning(
|
|
'Exception {}\nFailed zipping dataset.'.format(e))
|
|
return False
|
|
finally:
|
|
os.close(fd)
|
|
|
|
zip_file = Path(zip_file)
|
|
|
|
if not count:
|
|
zip_file.unlink()
|
|
LoggerRoot.get_base_logger().info('No pending files, skipping upload.')
|
|
self._dirty = False
|
|
self._serialize()
|
|
return True
|
|
|
|
archive_preview = 'Dataset archive content [{} files]:\n'.format(count) + archive_preview
|
|
|
|
# noinspection PyBroadException
|
|
try:
|
|
# let's try to rename it
|
|
new_zip_file = zip_file.parent / 'dataset.{}.zip'.format(self._id)
|
|
zip_file.rename(new_zip_file)
|
|
zip_file = new_zip_file
|
|
except Exception:
|
|
pass
|
|
        # remove files that could not be zipped (their relative_path was set to None above)
|
|
self._dataset_file_entries = {k: v for k, v in self._dataset_file_entries.items()
|
|
if v.relative_path is not None}
|
|
# start upload
|
|
zip_file_size = humanfriendly.format_size(Path(zip_file).stat().st_size)
|
|
self._task.get_logger().report_text(
|
|
'Uploading compressed dataset changes ({} files, total {}) to {}'.format(
|
|
count, zip_file_size, self.get_default_storage()))
|
|
self._task.upload_artifact(
|
|
name=self.__data_entry_name, artifact_object=Path(zip_file), preview=archive_preview,
|
|
delete_after_upload=True, wait_on_upload=True)
|
|
self._task.get_logger().report_text('Upload completed ({})'.format(zip_file_size))
|
|
|
|
self._add_script_call(
|
|
'upload', show_progress=show_progress, verbose=verbose, output_url=output_url, compression=compression)
|
|
|
|
self._dirty = False
|
|
self._serialize()
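    # Usage sketch (illustrative; the bucket URL is a placeholder):
    #   ds.upload(show_progress=True, output_url='s3://my-bucket/datasets')
    #   # a single zip artifact holding only the new/changed files is uploaded to the target storage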
|
|
|
|
def finalize(self, verbose=False, raise_on_error=True):
|
|
# type: (bool, bool) -> bool
|
|
"""
|
|
Finalize the dataset (if upload was not called, it will be called automatically) publish dataset Task.
|
|
If files need to be uploaded, throw exception (or return False)
|
|
|
|
:param verbose: If True print verbose progress report
|
|
:param raise_on_error: If True raise exception if dataset finalizing failed
|
|
"""
|
|
# check we do not have files waiting for upload.
|
|
if self._dirty:
|
|
if raise_on_error:
|
|
raise ValueError("Cannot finalize dataset, pending uploads. Call Dataset.upload(...)")
|
|
return False
|
|
|
|
self._task.get_logger().report_text('Finalizing dataset', print_console=False)
|
|
|
|
# make sure we have no redundant parent versions
|
|
self._serialize()
|
|
self._add_script_call('finalize')
|
|
if verbose:
|
|
print('Updating statistics and genealogy')
|
|
self._report_dataset_genealogy()
|
|
hashed_nodes = [self._get_dataset_id_hash(k) for k in self._dependency_graph.keys()]
|
|
self._task.comment = 'Dependencies: {}\n'.format(hashed_nodes)
|
|
self._task.close()
|
|
self._task.completed()
|
|
|
|
if self._task_pinger:
|
|
self._task_pinger.unregister()
|
|
self._task_pinger = None
|
|
|
|
return True
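    # Typical lifecycle sketch (illustrative; names and paths are placeholders):
    #   ds = Dataset.create(dataset_name='my_dataset', dataset_project='examples')
    #   ds.add_files('/data/images')
    #   ds.upload()
    #   ds.finalize()  # raises if upload() was not called and files are still pending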
|
|
|
|
def is_final(self):
|
|
# type: () -> bool
|
|
"""
|
|
Return True if the dataset was finalized and cannot be changed any more.
|
|
|
|
        :return: True if the dataset is final
|
|
"""
|
|
return self._task.get_status() not in (
|
|
Task.TaskStatusEnum.in_progress, Task.TaskStatusEnum.created, Task.TaskStatusEnum.failed)
|
|
|
|
def get_local_copy(self, use_soft_links=None, raise_on_error=True):
|
|
# type: (bool, bool) -> str
|
|
"""
|
|
return a base folder with a read-only (immutable) local copy of the entire dataset
|
|
download and copy / soft-link, files from all the parent dataset versions
|
|
|
|
:param use_soft_links: If True use soft links, default False on windows True on Posix systems
|
|
:param raise_on_error: If True raise exception if dataset merging failed on any file
|
|
:return: A base folder for the entire dataset
|
|
"""
|
|
assert self._id
|
|
if not self._task:
|
|
self._task = Task.get_task(task_id=self._id)
|
|
|
|
# now let's merge the parents
|
|
target_folder = self._merge_datasets(use_soft_links=use_soft_links, raise_on_error=raise_on_error)
|
|
return target_folder
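    # Usage sketch (illustrative; the dataset id is a placeholder):
    #   ds = Dataset.get(dataset_id='aabbcc')
    #   folder = ds.get_local_copy()  # cached, read-only merged copy of this version and its parents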
|
|
|
|
def get_mutable_local_copy(self, target_folder, overwrite=False, raise_on_error=True):
|
|
# type: (Union[Path, _Path, str], bool, bool) -> Optional[str]
|
|
"""
|
|
return a base folder with a writable (mutable) local copy of the entire dataset
|
|
download and copy / soft-link, files from all the parent dataset versions
|
|
|
|
:param target_folder: Target folder for the writable copy
|
|
:param overwrite: If True, recursively delete the target folder before creating a copy.
|
|
If False (default) and target folder contains files, raise exception or return None
|
|
:param raise_on_error: If True raise exception if dataset merging failed on any file
|
|
:return: A the target folder containing the entire dataset
|
|
"""
|
|
assert self._id
|
|
target_folder = Path(target_folder)
|
|
target_folder.mkdir(parents=True, exist_ok=True)
|
|
# noinspection PyBroadException
|
|
try:
|
|
target_folder.rmdir()
|
|
except Exception:
|
|
if not overwrite:
|
|
if raise_on_error:
|
|
raise ValueError("Target folder {} already contains files".format(target_folder.as_posix()))
|
|
else:
|
|
return None
|
|
shutil.rmtree(target_folder.as_posix())
|
|
|
|
ro_folder = self.get_local_copy(raise_on_error=raise_on_error)
|
|
shutil.copytree(ro_folder, target_folder.as_posix())
|
|
return target_folder.as_posix()
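    # Usage sketch (illustrative; the target folder is a placeholder):
    #   folder = ds.get_mutable_local_copy(target_folder='/tmp/my_dataset_copy', overwrite=True)
    #   # the returned folder is a private, writable copy; modifying it does not affect the dataset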
|
|
|
|
def list_files(self, dataset_path=None, recursive=True, dataset_id=None):
|
|
# type: (Optional[str], bool, Optional[str]) -> List[str]
|
|
"""
|
|
returns a list of files in the current dataset
|
|
If dataset_id is give, return a list of files that remained unchanged since the specified dataset_version
|
|
|
|
:param dataset_path: Only match files matching the dataset_path (including wildcards).
|
|
Example: folder/sub/*.json
|
|
:param recursive: If True (default) matching dataset_path recursively
|
|
:param dataset_id: Filter list based on the dataset id containing the latest version of the file.
|
|
Default: None, do not filter files based on parent dataset.
|
|
:return: List of files with relative path
|
|
(files might not be available locally until get_local_copy() is called)
|
|
"""
|
|
files = self._dataset_file_entries.keys() if not dataset_id else \
|
|
[k for k, v in self._dataset_file_entries.items() if v.parent_dataset_id == dataset_id]
|
|
|
|
if not dataset_path:
|
|
return sorted(files)
|
|
|
|
if dataset_path.startswith('/'):
|
|
dataset_path = dataset_path[1:]
|
|
|
|
if not recursive:
|
|
return sorted([k for k in files if fnmatch(k + '/', dataset_path + '/')])
|
|
|
|
wildcard = dataset_path.split('/')[-1]
|
|
path = dataset_path[:-len(wildcard)] + '*'
|
|
return sorted([k for k in files if fnmatch(k, path) and fnmatch(k, '*/' + wildcard)])
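    # Usage sketch (illustrative; the wildcard is a placeholder):
    #   for relative_path in ds.list_files(dataset_path='images/*.jpg'):
    #       print(relative_path)  # paths are relative to the dataset root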
|
|
|
|
def list_removed_files(self, dataset_id=None):
|
|
# type: (str) -> List[str]
|
|
"""
|
|
return a list of files removed when comparing to a specific dataset_version
|
|
|
|
:param dataset_id: dataset id (str) to compare against, if None is given compare against the parents datasets
|
|
:return: List of files with relative path
|
|
(files might not be available locally until get_local_copy() is called)
|
|
"""
|
|
datasets = self._dependency_graph[self._id] if not dataset_id or dataset_id == self._id else [dataset_id]
|
|
unified_list = set()
|
|
for ds_id in datasets:
|
|
dataset = self.get(dataset_id=ds_id)
|
|
unified_list |= set(dataset._dataset_file_entries.keys())
|
|
|
|
removed_list = [f for f in unified_list if f not in self._dataset_file_entries]
|
|
return sorted(removed_list)
|
|
|
|
def list_modified_files(self, dataset_id=None):
|
|
# type: (str) -> List[str]
|
|
"""
|
|
return a list of files removed when comparing to a specific dataset_version
|
|
|
|
:param dataset_id: dataset id (str) to compare against, if None is given compare against the parents datasets
|
|
:return: List of files with relative path
|
|
(files might not be available locally until get_local_copy() is called)
|
|
"""
|
|
datasets = self._dependency_graph[self._id] if not dataset_id or dataset_id == self._id else [dataset_id]
|
|
unified_list = dict()
|
|
for ds_id in datasets:
|
|
dataset = self.get(dataset_id=ds_id)
|
|
unified_list.update(dict((k, v.hash) for k, v in dataset._dataset_file_entries.items()))
|
|
|
|
modified_list = [k for k, v in self._dataset_file_entries.items()
|
|
if k in unified_list and v.hash != unified_list[k]]
|
|
return sorted(modified_list)
|
|
|
|
def list_added_files(self, dataset_id=None):
|
|
# type: (str) -> List[str]
|
|
"""
|
|
return a list of files removed when comparing to a specific dataset_version
|
|
|
|
:param dataset_id: dataset id (str) to compare against, if None is given compare against the parents datasets
|
|
:return: List of files with relative path
|
|
(files might not be available locally until get_local_copy() is called)
|
|
"""
|
|
datasets = self._dependency_graph[self._id] if not dataset_id or dataset_id == self._id else [dataset_id]
|
|
unified_list = set()
|
|
for ds_id in datasets:
|
|
dataset = self.get(dataset_id=ds_id)
|
|
unified_list |= set(dataset._dataset_file_entries.keys())
|
|
|
|
added_list = [f for f in self._dataset_file_entries.keys() if f not in unified_list]
|
|
return sorted(added_list)
|
|
|
|
def get_dependency_graph(self):
|
|
"""
|
|
return the DAG of the dataset dependencies (all previous dataset version and their parents/
|
|
Example:
|
|
{
|
|
'current_dataset_id': ['parent_1_id', 'parent_2_id'],
|
|
'parent_2_id': ['parent_1_id'],
|
|
'parent_1_id': [],
|
|
}
|
|
|
|
:return: dict representing the genealogy dag graph of the current dataset
|
|
"""
|
|
return deepcopy(self._dependency_graph)
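    # Usage sketch: walking the returned DAG (keys are dataset ids, values are lists of parent ids):
    #   graph = ds.get_dependency_graph()
    #   for dataset_id, parent_ids in graph.items():
    #       print(dataset_id, '<-', parent_ids)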
|
|
|
|
def verify_dataset_hash(self, local_copy_path=None, skip_hash=False, verbose=False):
|
|
# type: (Optional[str], bool, bool) -> List[str]
|
|
"""
|
|
Verify the current copy of the dataset against the stored hash
|
|
|
|
:param local_copy_path: Specify local path containing a copy of the dataset,
|
|
If not provide use the cached folder
|
|
:param skip_hash: If True, skip hash checks and verify file size only
|
|
:param verbose: If True print errors while testing dataset files hash
|
|
:return: List of files with unmatched hashes
|
|
"""
|
|
local_path = local_copy_path or self.get_local_copy()
|
|
|
|
def compare(file_entry):
|
|
file_entry_copy = copy(file_entry)
|
|
file_entry_copy.local_path = (Path(local_path) / file_entry.relative_path).as_posix()
|
|
if skip_hash:
|
|
file_entry_copy.size = Path(file_entry_copy.local_path).stat().st_size
|
|
if file_entry_copy.size != file_entry.size:
|
|
if verbose:
|
|
print('Error: file size mismatch {} expected size {} current {}'.format(
|
|
file_entry.relative_path, file_entry.size, file_entry_copy.size))
|
|
return file_entry
|
|
else:
|
|
self._calc_file_hash(file_entry_copy)
|
|
if file_entry_copy.hash != file_entry.hash:
|
|
if verbose:
|
|
print('Error: hash mismatch {} expected size/hash {}/{} recalculated {}/{}'.format(
|
|
file_entry.relative_path,
|
|
file_entry.size, file_entry.hash,
|
|
file_entry_copy.size, file_entry_copy.hash))
|
|
return file_entry
|
|
|
|
return None
|
|
|
|
pool = ThreadPool(cpu_count() * 2)
|
|
matching_errors = pool.map(compare, self._dataset_file_entries.values())
|
|
pool.close()
|
|
return [f.relative_path for f in matching_errors if f is not None]
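    # Usage sketch (illustrative):
    #   bad_files = ds.verify_dataset_hash(verbose=True)
    #   assert not bad_files, 'Corrupted local copies: {}'.format(bad_files)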
|
|
|
|
def get_default_storage(self):
|
|
# type: () -> Optional[str]
|
|
"""
|
|
Return the default storage location of the dataset
|
|
|
|
:return: URL for the default storage location
|
|
"""
|
|
if not self._task:
|
|
return None
|
|
return self._task.output_uri or self._task.get_logger().get_default_upload_destination()
|
|
|
|
@classmethod
|
|
def create(cls, dataset_name, dataset_project=None, parent_datasets=None):
|
|
# type: (str, Optional[str], Optional[Sequence[Union[str, Dataset]]]) -> Dataset
|
|
"""
|
|
Create a new dataset. Multiple dataset parents are supported.
|
|
Merging of parent datasets is done based on the order,
|
|
where each one can override overlapping files in the previous parent
|
|
|
|
:param dataset_name: Naming the new dataset
|
|
:param dataset_project: Project containing the dataset.
|
|
If not specified, infer project name form parent datasets
|
|
:param parent_datasets: Expand a parent dataset by adding/removing files
|
|
:return: Newly created Dataset object
|
|
"""
|
|
parent_datasets = [cls.get(dataset_id=p) if isinstance(p, str) else p for p in (parent_datasets or [])]
|
|
if any(not p.is_final() for p in parent_datasets):
|
|
raise ValueError("Cannot inherit from a parent that was not finalized/closed")
|
|
|
|
# get project name
|
|
if not dataset_project:
|
|
if not parent_datasets:
|
|
raise ValueError("Missing dataset project name. Could not infer project name from parent dataset.")
|
|
# get project name from parent dataset
|
|
dataset_project = parent_datasets[-1]._task.get_project_name()
|
|
|
|
# merge datasets according to order
|
|
dataset_file_entries = {}
|
|
dependency_graph = {}
|
|
for p in parent_datasets:
|
|
dataset_file_entries.update(deepcopy(p._dataset_file_entries))
|
|
dependency_graph.update(deepcopy(p._dependency_graph))
|
|
instance = cls(_private=cls.__private_magic, dataset_project=dataset_project, dataset_name=dataset_name)
|
|
instance._task.get_logger().report_text('Dataset created', print_console=False)
|
|
instance._dataset_file_entries = dataset_file_entries
|
|
instance._dependency_graph = dependency_graph
|
|
instance._dependency_graph[instance._id] = [p._id for p in parent_datasets]
|
|
instance._serialize()
|
|
instance._task.flush(wait_for_uploads=True)
|
|
cls._set_project_system_tags(instance._task)
|
|
return instance
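    # Usage sketch (illustrative; names and the parent id are placeholders):
    #   child = Dataset.create(
    #       dataset_name='my_dataset_v2', dataset_project='examples', parent_datasets=['parent_dataset_id'])
    #   # the child starts with all the parent file entries and only stores the delta on upload()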
|
|
|
|
@classmethod
|
|
def delete(cls, dataset_id=None, dataset_project=None, dataset_name=None, force=False):
|
|
# type: (Optional[str], Optional[str], Optional[str], bool) -> ()
|
|
"""
|
|
Delete a dataset, raise exception if dataset is used by other dataset versions.
|
|
Use force=True to forcefully delete the dataset
|
|
|
|
:param dataset_id: Dataset id to delete
|
|
:param dataset_project: Project containing the dataset
|
|
:param dataset_name: Naming the new dataset
|
|
:param force: If True delete even if other datasets depend on the specified dataset version
|
|
"""
|
|
mutually_exclusive(dataset_id=dataset_id, dataset_project=dataset_project)
|
|
mutually_exclusive(dataset_id=dataset_id, dataset_name=dataset_name)
|
|
if not dataset_id:
|
|
tasks = Task.get_tasks(
|
|
project_name=dataset_project,
|
|
task_name=exact_match_regex(dataset_name) if dataset_name else None,
|
|
task_filter=dict(
|
|
system_tags=[cls.__tag],
|
|
type=[str(Task.TaskTypes.data_processing)],
|
|
page_size=2, page=0,)
|
|
)
|
|
if not tasks:
|
|
raise ValueError("Dataset project={} name={} could not be found".format(dataset_project, dataset_name))
|
|
if len(tasks) > 1:
|
|
raise ValueError("Too many datasets matching project={} name={}".format(dataset_project, dataset_name))
|
|
dataset_id = tasks[0].id
|
|
|
|
# check if someone is using the datasets
|
|
if not force:
|
|
# noinspection PyProtectedMember
|
|
dependencies = Task._query_tasks(
|
|
system_tags=[cls.__tag],
|
|
type=[str(Task.TaskTypes.data_processing)],
|
|
only_fields=['created', 'id', 'name'],
|
|
search_text='{}'.format(cls._get_dataset_id_hash(dataset_id))
|
|
)
|
|
# filter us out
|
|
if dependencies:
|
|
dependencies = [d for d in dependencies if d.id != dataset_id]
|
|
if dependencies:
|
|
raise ValueError("Dataset id={} is used by datasets: {}".format(
|
|
dataset_id, [d.id for d in dependencies]))
|
|
|
|
client = APIClient()
|
|
# notice the force here is a must, since the state is never draft
|
|
# noinspection PyBroadException
|
|
try:
|
|
t = client.tasks.get_by_id(dataset_id)
|
|
except Exception:
|
|
t = None
|
|
if not t:
|
|
raise ValueError("Dataset id={} could not be found".format(dataset_id))
|
|
if str(t.type) != str(Task.TaskTypes.data_processing) or cls.__tag not in t.system_tags:
|
|
raise ValueError("Dataset id={} is not of type Dataset".format(dataset_id))
|
|
|
|
task = Task.get_task(task_id=dataset_id)
|
|
# first delete all the artifacts from the dataset
|
|
for artifact in task.artifacts.values():
|
|
h = StorageHelper.get(artifact.url)
|
|
# noinspection PyBroadException
|
|
try:
|
|
h.delete(artifact.url)
|
|
except Exception as ex:
|
|
LoggerRoot.get_base_logger().warning('Failed deleting remote file \'{}\': {}'.format(
|
|
artifact.url, ex))
|
|
|
|
# now delete the actual task
|
|
client.tasks.delete(task=dataset_id, force=True)
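    # Usage sketch (illustrative; the dataset id is a placeholder):
    #   Dataset.delete(dataset_id='aabbcc', force=False)  # raises if other datasets depend on it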
|
|
|
|
@classmethod
|
|
def get(cls, dataset_id=None, dataset_project=None, dataset_name=None, only_completed=False, only_published=False):
|
|
# type: (Optional[str], Optional[str], Optional[str], bool , bool) -> Dataset
|
|
"""
|
|
Get a specific Dataset. If only dataset_project is given, return the last Dataset in the Dataset project
|
|
|
|
:param dataset_id: Requested Dataset ID
|
|
:param dataset_project: Requested Dataset project name
|
|
:param dataset_name: Requested Dataset name
|
|
:param only_completed: Return only if the requested dataset is completed or published
|
|
:param only_published: Return only if the requested dataset is published
|
|
:return: Dataset object
|
|
"""
|
|
mutually_exclusive(dataset_id=dataset_id, dataset_project=dataset_project)
|
|
mutually_exclusive(dataset_id=dataset_id, dataset_name=dataset_name)
|
|
tasks = Task.get_tasks(
|
|
task_ids=[dataset_id] if dataset_id else None,
|
|
project_name=dataset_project,
|
|
task_name=exact_match_regex(dataset_name) if dataset_name else None,
|
|
task_filter=dict(
|
|
system_tags=[cls.__tag, '-archived'], order_by=['-created'],
|
|
type=[str(Task.TaskTypes.data_processing)],
|
|
page_size=1, page=0,
|
|
status=['publish'] if only_published else
|
|
['publish', 'stopped', 'completed', 'closed'] if only_completed else None)
|
|
)
|
|
if not tasks:
|
|
raise ValueError('Could not find Dataset {} {}'.format(
|
|
'id' if dataset_id else 'project/name',
|
|
dataset_id if dataset_id else (dataset_project, dataset_name)))
|
|
task = tasks[0]
|
|
if task.status == 'created':
|
|
raise ValueError('Dataset id={} is in draft mode, delete and recreate it'.format(task.id))
|
|
force_download = False if task.status in ('stopped', 'published', 'closed', 'completed') else True
|
|
local_state_file = StorageManager.get_local_copy(
|
|
remote_url=task.artifacts[cls.__state_entry_name].url, cache_context=cls.__cache_context,
|
|
extract_archive=False, name=task.id, force_download=force_download)
|
|
if not local_state_file:
|
|
raise ValueError('Could not load Dataset id={} state'.format(task.id))
|
|
|
|
instance = cls._deserialize(local_state_file, task)
|
|
        # remove the downloaded local state file copy, just in case
|
|
if force_download:
|
|
os.unlink(local_state_file)
|
|
|
|
return instance
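    # Usage sketch (illustrative; project/name are placeholders):
    #   ds = Dataset.get(dataset_project='examples', dataset_name='my_dataset', only_completed=True)
    #   print(ds.id, list(ds.file_entries_dict.keys()))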
|
|
|
|
@classmethod
|
|
def squash(cls, dataset_name, dataset_ids=None, dataset_project_name_pairs=None, output_url=None):
|
|
# type: (str, Optional[Sequence[Union[str, Dataset]]],Optional[Sequence[(str, str)]], Optional[str]) -> Dataset
|
|
"""
|
|
Generate a new dataset from the squashed set of dataset versions.
|
|
If a single version is given it will squash to the root (i.e. create single standalone version)
|
|
If a set of versions are given it will squash the versions diff into a single version
|
|
|
|
:param dataset_name: Target name for the newly generated squashed dataset
|
|
:param dataset_ids: List of dataset Ids (or objects) to squash. Notice order does matter.
|
|
The versions are merged from first to last.
|
|
:param dataset_project_name_pairs: List of pairs (project_name, dataset_name) to squash.
|
|
Notice order does matter. The versions are merged from first to last.
|
|
:param output_url: Target storage for the compressed dataset (default: file server)
|
|
Examples: `s3://bucket/data`, `gs://bucket/data` , `azure://bucket/data` , `/mnt/share/data`
|
|
:return: Newly created dataset object.
|
|
"""
|
|
mutually_exclusive(dataset_ids=dataset_ids, dataset_project_name_pairs=dataset_project_name_pairs)
|
|
datasets = [cls.get(dataset_id=d) for d in dataset_ids] if dataset_ids else \
|
|
[cls.get(dataset_project=pair[0], dataset_name=pair[1]) for pair in dataset_project_name_pairs]
|
|
# single dataset to squash, squash it all.
|
|
if len(datasets) == 1:
|
|
temp_folder = datasets[0].get_local_copy()
|
|
parents = set()
|
|
else:
|
|
parents = None
|
|
temp_folder = Path(mkdtemp(prefix='squash-datasets.'))
|
|
pool = ThreadPool()
|
|
for ds in datasets:
|
|
base_folder = Path(ds._extract_dataset_archive())
|
|
files = [f.relative_path for f in ds.file_entries if f.parent_dataset_id == ds.id]
|
|
pool.map(
|
|
lambda x:
|
|
(temp_folder / x).parent.mkdir(parents=True, exist_ok=True) or
|
|
shutil.copy((base_folder / x).as_posix(), (temp_folder / x).as_posix(), follow_symlinks=True),
|
|
files)
|
|
parents = set(ds._get_parents()) if parents is None else (parents & set(ds._get_parents()))
|
|
pool.close()
|
|
|
|
squashed_ds = cls.create(
|
|
dataset_project=datasets[0].project, dataset_name=dataset_name, parent_datasets=list(parents))
|
|
squashed_ds._task.get_logger().report_text('Squashing dataset', print_console=False)
|
|
squashed_ds.add_files(temp_folder)
|
|
squashed_ds.upload(output_url=output_url)
|
|
squashed_ds.finalize()
|
|
return squashed_ds
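    # Usage sketch (illustrative; the version ids are placeholders):
    #   flat = Dataset.squash(dataset_name='my_dataset_flat', dataset_ids=['version_1_id', 'version_2_id'])
    #   # the squashed dataset merges the content of the given versions into a single new version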
|
|
|
|
@classmethod
|
|
def list_datasets(cls, dataset_project=None, partial_name=None, tags=None, ids=None, only_completed=True):
|
|
# type: (Optional[str], Optional[str], Optional[Sequence[str]], Optional[Sequence[str]], bool) -> List[dict]
|
|
"""
|
|
Query list of dataset in the system
|
|
|
|
:param dataset_project: Specify dataset project name
|
|
:param partial_name: Specify partial match to a dataset name
|
|
:param tags: Specify user tags
|
|
:param ids: List specific dataset based on IDs list
|
|
:param only_completed: If False return dataset that are still in progress (uploading/edited etc.)
|
|
:return: List of dictionaries with dataset information
|
|
Example: [{'name': name, 'project': project name, 'id': dataset_id, 'created': date_created},]
|
|
"""
|
|
# noinspection PyProtectedMember
|
|
datasets = Task._query_tasks(
|
|
task_ids=ids or None, project_name=dataset_project or None,
|
|
task_name=partial_name,
|
|
system_tags=[cls.__tag],
|
|
type=[str(Task.TaskTypes.data_processing)],
|
|
tags=tags or None,
|
|
status=['stopped', 'published', 'completed', 'closed'] if only_completed else None,
|
|
only_fields=['created', 'id', 'name', 'project']
|
|
)
|
|
project_ids = {d.project for d in datasets}
|
|
# noinspection PyProtectedMember
|
|
project_id_lookup = {d: Task._get_project_name(d) for d in project_ids}
|
|
return [
|
|
{'name': d.name, 'created': d.created, 'project': project_id_lookup[d.project], 'id': d.id}
|
|
for d in datasets
|
|
]
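    # Usage sketch (illustrative; the project name is a placeholder):
    #   for d in Dataset.list_datasets(dataset_project='examples', partial_name='my_'):
    #       print(d['id'], d['name'], d['project'], d['created'])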
|
|
|
|
def _add_files(self,
|
|
path, # type: Union[str, Path, _Path]
|
|
wildcard=None, # type: Optional[Union[str, Sequence[str]]]
|
|
local_base_folder=None, # type: Optional[str]
|
|
dataset_path=None, # type: Optional[str]
|
|
recursive=True, # type: bool
|
|
verbose=False # type: bool
|
|
):
|
|
# type: (...) -> int
|
|
"""
|
|
Add a folder into the current dataset. calculate file hash,
|
|
and compare against parent, mark files to be uploaded
|
|
|
|
:param path: Add a folder/file to the dataset
|
|
:param wildcard: add only specific set of files.
|
|
Wildcard matching, can be a single string or a list of wildcards)
|
|
:param local_base_folder: files will be located based on their relative path from local_base_folder
|
|
:param dataset_path: where in the dataset the folder/files should be located
|
|
:param recursive: If True match all wildcard files recursively
|
|
:param verbose: If True print to console added files
|
|
"""
|
|
if dataset_path and dataset_path.startswith('/'):
|
|
dataset_path = dataset_path[1:]
|
|
path = Path(path)
|
|
local_base_folder = Path(local_base_folder or path)
|
|
wildcard = wildcard or '*'
|
|
# single file, no need for threading
|
|
if path.is_file():
|
|
if not local_base_folder.is_dir():
|
|
local_base_folder = local_base_folder.parent
|
|
file_entry = self._calc_file_hash(
|
|
FileEntry(local_path=path.absolute().as_posix(),
|
|
relative_path=(Path(dataset_path or '.') / path.relative_to(local_base_folder)).as_posix(),
|
|
parent_dataset_id=self._id))
|
|
file_entries = [file_entry]
|
|
else:
|
|
# if not a folder raise exception
|
|
if not path.is_dir():
|
|
raise ValueError("Could not find file/folder \'{}\'", path.as_posix())
|
|
|
|
# prepare a list of files
|
|
files = list(path.rglob(wildcard)) if recursive else list(path.glob(wildcard))
|
|
file_entries = [
|
|
FileEntry(
|
|
parent_dataset_id=self._id,
|
|
local_path=f.absolute().as_posix(),
|
|
relative_path=(Path(dataset_path or '.') / f.relative_to(local_base_folder)).as_posix())
|
|
for f in files if f.is_file()]
|
|
self._task.get_logger().report_text('Generating SHA2 hash for {} files'.format(len(file_entries)))
|
|
pool = ThreadPool(cpu_count() * 2)
|
|
pool.map(self._calc_file_hash, file_entries)
|
|
pool.close()
|
|
self._task.get_logger().report_text('Hash generation completed')
|
|
|
|
# merge back into the dataset
|
|
count = 0
|
|
for f in file_entries:
|
|
ds_cur_f = self._dataset_file_entries.get(f.relative_path)
|
|
if not ds_cur_f:
|
|
if verbose:
|
|
self._task.get_logger().report_text('Add {}'.format(f.relative_path))
|
|
self._dataset_file_entries[f.relative_path] = f
|
|
count += 1
|
|
elif ds_cur_f.hash != f.hash:
|
|
if verbose:
|
|
self._task.get_logger().report_text('Modified {}'.format(f.relative_path))
|
|
self._dataset_file_entries[f.relative_path] = f
|
|
count += 1
|
|
elif f.parent_dataset_id == self._id and ds_cur_f.parent_dataset_id == self._id:
|
|
if verbose:
|
|
self._task.get_logger().report_text('Re-Added {}'.format(f.relative_path))
|
|
self._dataset_file_entries[f.relative_path] = f
|
|
count += 1
|
|
else:
|
|
if verbose:
|
|
self._task.get_logger().report_text('Unchanged {}'.format(f.relative_path))
|
|
|
|
return count
|
|
|
|
def _update_dependency_graph(self):
|
|
"""
|
|
Update the dependency graph based on the current self._dataset_file_entries state
|
|
:return:
|
|
"""
|
|
# collect all dataset versions
|
|
used_dataset_versions = set(f.parent_dataset_id for f in self._dataset_file_entries.values())
|
|
used_dataset_versions.add(self._id)
|
|
current_parents = self._dependency_graph.get(self._id)
|
|
        # remove parent versions we no longer need from the main version list,
        # and per version, drop parent versions that are no longer referenced
|
|
self._dependency_graph = {k: [p for p in parents if p in used_dataset_versions]
|
|
for k, parents in self._dependency_graph.items() if k in used_dataset_versions}
|
|
        # make sure we do not remove our own parents, for genealogy's sake
|
|
self._dependency_graph[self._id] = current_parents
|
|
|
|
def _serialize(self):
|
|
"""
|
|
store current state of the Dataset for later use
|
|
:return: object to be used for later deserialization
|
|
"""
|
|
self._update_dependency_graph()
|
|
|
|
state = dict(
|
|
dataset_file_entries=[f.as_dict() for f in self._dataset_file_entries.values()],
|
|
dependency_graph=self._dependency_graph,
|
|
id=self._id,
|
|
dirty=self._dirty,
|
|
)
|
|
modified_files = [f['size'] for f in state['dataset_file_entries'] if f.get('parent_dataset_id') == self._id]
|
|
preview = \
|
|
'Dataset state\n' \
|
|
'Files added/modified: {0} - total size {1}\n' \
|
|
'Current dependency graph: {2}\n'.format(
|
|
len(modified_files), humanfriendly.format_size(sum(modified_files)),
|
|
json.dumps(self._dependency_graph, indent=2, sort_keys=True))
|
|
# store as artifact of the Task.
|
|
self._task.upload_artifact(
|
|
name=self.__state_entry_name, artifact_object=state, preview=preview, wait_on_upload=True)
|
|
|
|
def _download_dataset_archive(self):
|
|
"""
|
|
Download the dataset archive, return a link to locally stored zip file
|
|
:return: Path to locally stored zip file
|
|
"""
|
|
pass # TODO: implement
|
|
|
|
def _extract_dataset_archive(self):
|
|
"""
|
|
Download the dataset archive, and extract the zip content to a cached folder.
|
|
Notice no merging is done.
|
|
|
|
:return: Path to a local storage extracted archive
|
|
"""
|
|
if not self._task:
|
|
self._task = Task.get_task(task_id=self._id)
|
|
# check if we have a dataset with empty change set
|
|
if not self._task.artifacts.get(self.__data_entry_name):
|
|
cache = CacheManager.get_cache_manager(cache_context=self.__cache_context)
|
|
local_folder = Path(cache.get_cache_folder()) / self._get_cache_folder_name()
|
|
local_folder.mkdir(parents=True, exist_ok=True)
|
|
return local_folder.as_posix()
|
|
|
|
# download the dataset zip
|
|
local_zip = StorageManager.get_local_copy(
|
|
remote_url=self._task.artifacts[self.__data_entry_name].url, cache_context=self.__cache_context,
|
|
extract_archive=False, name=self._id)
|
|
if not local_zip:
|
|
raise ValueError("Could not download dataset id={}".format(self._id))
|
|
local_folder = (Path(local_zip).parent / self._get_cache_folder_name()).as_posix()
|
|
# if we got here, we need to clear the target folder
|
|
shutil.rmtree(local_folder, ignore_errors=True)
|
|
# noinspection PyProtectedMember
|
|
local_folder = StorageManager._extract_to_cache(
|
|
cached_file=local_zip, name=self._id,
|
|
cache_context=self.__cache_context, target_folder=local_folder)
|
|
return local_folder
|
|
|
|
def _merge_datasets(self, use_soft_links=None, raise_on_error=True):
|
|
# type: (bool, bool) -> str
|
|
"""
|
|
download and copy / soft-link, files from all the parent dataset versions
|
|
:param use_soft_links: If True use soft links, default False on windows True on Posix systems
|
|
:param raise_on_error: If True raise exception if dataset merging failed on any file
|
|
:return: the target folder
|
|
"""
|
|
if use_soft_links is None:
|
|
use_soft_links = False if is_windows() else True
|
|
|
|
# check if we already have everything
|
|
target_base_folder, target_base_size = CacheManager.get_cache_manager(
|
|
cache_context=self.__cache_context).get_cache_file(local_filename=self._get_cache_folder_name())
|
|
if target_base_folder and target_base_size is not None:
|
|
target_base_folder = Path(target_base_folder)
|
|
# check dataset file size, if we have a full match no need for parent dataset download / merge
|
|
verified = True
|
|
# noinspection PyBroadException
|
|
try:
|
|
for f in self._dataset_file_entries.values():
|
|
if (target_base_folder / f.relative_path).stat().st_size != f.size:
|
|
verified = False
|
|
break
|
|
except Exception:
|
|
verified = False
|
|
|
|
if verified:
|
|
return target_base_folder.as_posix()
|
|
else:
|
|
LoggerRoot.get_base_logger().info('Dataset needs refreshing, fetching all parent datasets')
|
|
|
|
# first get our dataset
|
|
target_base_folder = Path(self._extract_dataset_archive())
|
|
target_base_folder.touch()
|
|
|
|
# create thread pool
|
|
pool = ThreadPool(cpu_count() * 2)
|
|
for dataset_version_id in self._get_dependencies_by_order():
|
|
# make sure we skip over empty dependencies
|
|
if dataset_version_id not in self._dependency_graph:
|
|
continue
|
|
|
|
ds = Dataset.get(dataset_id=dataset_version_id)
|
|
ds_base_folder = Path(ds._extract_dataset_archive())
|
|
ds_base_folder.touch()
|
|
|
|
def copy_file(file_entry):
|
|
if file_entry.parent_dataset_id != dataset_version_id:
|
|
return
|
|
source = (ds_base_folder / file_entry.relative_path).as_posix()
|
|
target = (target_base_folder / file_entry.relative_path).as_posix()
|
|
try:
|
|
                # make sure we can overwrite the target file
|
|
# noinspection PyBroadException
|
|
try:
|
|
os.unlink(target)
|
|
except Exception:
|
|
Path(target).parent.mkdir(parents=True, exist_ok=True)
|
|
|
|
# copy / link
|
|
if use_soft_links:
|
|
if not os.path.isfile(source):
|
|
raise ValueError("Extracted file missing {}".format(source))
|
|
os.symlink(source, target)
|
|
else:
|
|
shutil.copy2(source, target, follow_symlinks=True)
|
|
except Exception as ex:
|
|
LoggerRoot.get_base_logger().warning('{}\nFailed {} file {} to {}'.format(
|
|
ex, 'linking' if use_soft_links else 'copying', source, target))
|
|
return ex
|
|
|
|
return None
|
|
|
|
errors = pool.map(copy_file, self._dataset_file_entries.values())
|
|
if raise_on_error and any(errors):
|
|
raise ValueError("Dataset merging failed: {}".format([e for e in errors if e is not None]))
|
|
|
|
pool.close()
|
|
return target_base_folder.as_posix()
|
|
|
|
def _get_dependencies_by_order(self, include_unused=False):
|
|
# type: (bool) -> List[str]
|
|
"""
|
|
Return the dataset dependencies by order of application (from the last to the current)
|
|
:param bool include_unused: If True include unused datasets in the dependencies
|
|
:return: list of str representing the datasets id
|
|
"""
|
|
roots = [self._id]
|
|
dependencies = []
|
|
# noinspection DuplicatedCode
|
|
while roots:
|
|
r = roots.pop(0)
|
|
if r not in dependencies:
|
|
dependencies.append(r)
|
|
# add the parents of the current node, only if the parents are in the general graph node list
|
|
if include_unused and r not in self._dependency_graph:
|
|
roots.extend(list(reversed(
|
|
[p for p in self.get(dataset_id=r)._get_parents() if p not in roots])))
|
|
else:
|
|
roots.extend(list(reversed(
|
|
[p for p in self._dependency_graph.get(r, [])
|
|
if p not in roots and (include_unused or (p in self._dependency_graph))])))
|
|
|
|
# make sure we cover leftovers
|
|
leftovers = set(self._dependency_graph.keys()) - set(dependencies)
|
|
if leftovers:
|
|
roots = list(leftovers)
|
|
# noinspection DuplicatedCode
|
|
while roots:
|
|
r = roots.pop(0)
|
|
if r not in dependencies:
|
|
dependencies.append(r)
|
|
# add the parents of the current node, only if the parents are in the general graph node list
|
|
if include_unused and r not in self._dependency_graph:
|
|
roots.extend(list(reversed(
|
|
[p for p in self.get(dataset_id=r)._get_parents() if p not in roots])))
|
|
else:
|
|
roots.extend(list(reversed(
|
|
[p for p in self._dependency_graph.get(r, [])
|
|
if p not in roots and (include_unused or (p in self._dependency_graph))])))
|
|
|
|
# skip our id
|
|
return list(reversed(dependencies[1:]))
|
|
|
|
def _get_parents(self):
|
|
# type: () -> Sequence[str]
|
|
"""
|
|
Return a list of direct parent datasets (str)
|
|
:return: list of dataset ids
|
|
"""
|
|
return self._dependency_graph[self.id]
|
|
|
|
@classmethod
|
|
def _deserialize(cls, stored_state, task):
|
|
# type: (Union[dict, str, Path, _Path], Task) -> Dataset
|
|
"""
|
|
reload a dataset state from the stored_state object
|
|
:param task: Task object associated with the dataset
|
|
:return: A Dataset object
|
|
"""
|
|
assert isinstance(stored_state, (dict, str, Path, _Path))
|
|
|
|
if isinstance(stored_state, (str, Path, _Path)):
|
|
stored_state_file = Path(stored_state).as_posix()
|
|
with open(stored_state_file, 'rt') as f:
|
|
stored_state = json.load(f)
|
|
|
|
instance = cls(_private=cls.__private_magic, task=task)
|
|
# assert instance._id == stored_state['id'] # They should match
|
|
instance._dependency_graph = stored_state['dependency_graph']
|
|
instance._dirty = stored_state.get('dirty', False)
|
|
instance._dataset_file_entries = {
|
|
s['relative_path']: FileEntry(**s) for s in stored_state['dataset_file_entries']}
|
|
return instance
|
|
|
|
@staticmethod
|
|
def _calc_file_hash(file_entry):
|
|
# calculate hash
|
|
file_entry.hash, _ = sha256sum(file_entry.local_path)
|
|
file_entry.size = Path(file_entry.local_path).stat().st_size
|
|
return file_entry
|
|
|
|
@classmethod
|
|
def _get_dataset_id_hash(cls, dataset_id):
|
|
# type: (str) -> str
|
|
"""
|
|
Return hash used to search for the dataset id in text fields.
|
|
        This is not a strong hash, and is used only for defining dependencies.
|
|
:param dataset_id:
|
|
:return:
|
|
"""
|
|
return 'dsh{}'.format(md5text(dataset_id))
|
|
|
|
def _get_cache_folder_name(self):
|
|
return '{}{}'.format(self.__cache_folder_prefix, self._id)
|
|
|
|
def _add_script_call(self, func_name, **kwargs):
|
|
# type: (str, **Any) -> ()
|
|
args = ', '.join('\n {}={}'.format(k, '\''+str(v)+'\'' if isinstance(v, (str, Path, _Path)) else v)
|
|
for k, v in kwargs.items())
|
|
if args:
|
|
args += '\n'
|
|
line = 'ds.{}({})\n'.format(func_name, args)
|
|
self._task.data.script.diff += line
|
|
# noinspection PyProtectedMember
|
|
self._task._edit(script=self._task.data.script)
|
|
|
|
def _report_dataset_genealogy(self):
|
|
sankey_node = dict(
|
|
label=[],
|
|
color=[],
|
|
customdata=[],
|
|
hovertemplate='%{customdata}<extra></extra>',
|
|
hoverlabel={"align": "left"},
|
|
)
|
|
sankey_link = dict(
|
|
source=[],
|
|
target=[],
|
|
value=[],
|
|
hovertemplate='<extra></extra>',
|
|
)
|
|
# get DAG nodes
|
|
nodes = self._get_dependencies_by_order(include_unused=True) + [self.id]
|
|
# dataset name lookup
|
|
# noinspection PyProtectedMember
|
|
node_names = {t.id: t.name for t in Task._query_tasks(task_ids=nodes, only_fields=['id', 'name'])}
|
|
node_details = {}
|
|
# Generate table and details
|
|
table_values = [["Dataset id", "name", "removed", "modified", "added", "size"]]
|
|
for node in nodes:
|
|
count = 0
|
|
size = 0
|
|
for f in self._dataset_file_entries.values():
|
|
if f.parent_dataset_id == node:
|
|
count += 1
|
|
size += f.size
|
|
removed = len(self.list_removed_files(node))
|
|
modified = len(self.list_modified_files(node))
|
|
table_values += [[node, node_names.get(node, ''),
|
|
removed, modified, count-modified, humanfriendly.format_size(size)]]
|
|
node_details[node] = [removed, modified, count-modified, humanfriendly.format_size(size)]
|
|
|
|
# create DAG
|
|
visited = []
|
|
# add nodes
|
|
for idx, node in enumerate(nodes):
|
|
visited.append(node)
|
|
sankey_node['color'].append("mediumpurple" if node == self.id else "lightblue")
|
|
sankey_node['label'].append('{}'.format(node))
|
|
sankey_node['customdata'].append(
|
|
"name {}<br />removed {}<br />modified {}<br />added {}<br />size {}".format(
|
|
node_names.get(node, ''), *node_details[node]))
|
|
|
|
# add edges
|
|
for idx, node in enumerate(nodes):
|
|
if node in self._dependency_graph:
|
|
parents = [visited.index(p) for p in self._dependency_graph[node] or [] if p in visited]
|
|
else:
|
|
parents = [visited.index(p) for p in self.get(dataset_id=node)._get_parents() or [] if p in visited]
|
|
|
|
for p in parents:
|
|
sankey_link['source'].append(p)
|
|
sankey_link['target'].append(idx)
|
|
sankey_link['value'].append(max(1, node_details[visited[p]][-2]))
|
|
|
|
# create the sankey graph
|
|
dag_flow = dict(
|
|
link=sankey_link,
|
|
node=sankey_node,
|
|
textfont=dict(color='rgba(0,0,0,255)', size=10),
|
|
type='sankey',
|
|
orientation='h'
|
|
)
|
|
fig = dict(data=[dag_flow], layout={'xaxis': {'visible': False}, 'yaxis': {'visible': False}})
|
|
|
|
if len(nodes) > 1:
|
|
self._task.get_logger().report_plotly(
|
|
title='Dataset Genealogy', series='', iteration=0, figure=fig)
|
|
|
|
# report detailed table
|
|
self._task.get_logger().report_table(
|
|
title='Dataset Summary', series='Details', iteration=0, table_plot=table_values,
|
|
extra_layout={"title": "Files by parent dataset id"})
|
|
|
|
# report the detailed content of the dataset as configuration,
|
|
# this allows for easy version comparison in the UI
|
|
dataset_details = None
|
|
if len(self._dataset_file_entries) < self.__preview_max_file_entries:
|
|
file_entries = sorted(self._dataset_file_entries.values(), key=lambda x: x.relative_path)
|
|
dataset_details = \
|
|
'File Name - File Size - Hash (SHA2)\n' +\
|
|
'\n'.join('{} - {} - {}'.format(f.relative_path, f.size, f.hash) for f in file_entries)
|
|
# too large to store
|
|
if not dataset_details or len(dataset_details) > self.__preview_max_size:
|
|
dataset_details = 'Dataset content is too large to preview'
|
|
|
|
# noinspection PyProtectedMember
|
|
self._task._set_configuration(
|
|
name='Dataset Content', description='Dataset content preview',
|
|
config_type='read-only',
|
|
config_text=dataset_details
|
|
)
|
|
|
|
@classmethod
|
|
def _set_project_system_tags(cls, task):
|
|
from ..backend_api.services import projects
|
|
res = task.send(projects.GetByIdRequest(project=task.project), raise_on_errors=False)
|
|
if not res or not res.response or not res.response.project:
|
|
return
|
|
system_tags = res.response.project.system_tags or []
|
|
if cls.__tag not in system_tags:
|
|
system_tags += [cls.__tag]
|
|
task.send(projects.UpdateRequest(project=task.project, system_tags=system_tags), raise_on_errors=False)
|
|
|
|
def is_dirty(self):
|
|
# type: () -> bool
|
|
"""
|
|
Return True if the dataset has pending uploads (i.e. we cannot finalize it)
|
|
|
|
        :return: True if the dataset has pending uploads; call 'upload' to start the upload process
|
|
"""
|
|
return self._dirty
|