# clearml/clearml/datasets/dataset.py
import calendar
import json
import os
import shutil
import psutil
import mimetypes
import re
import logging
from copy import deepcopy, copy
from multiprocessing.pool import ThreadPool
from concurrent.futures import ThreadPoolExecutor
from tempfile import mkdtemp
from typing import Union, Optional, Sequence, List, Dict, Any, Mapping, Tuple
from zipfile import ZIP_DEFLATED
from attr import attrs, attrib
from pathlib2 import Path
from .. import Task, StorageManager, Logger
from ..backend_api import Session
from ..backend_interface.task.development.worker import DevWorker
from ..backend_interface.util import mutually_exclusive, exact_match_regex, get_or_create_project, rename_project
from ..config import deferred_config, running_remotely, get_remote_task_id
from ..debugging.log import LoggerRoot
from ..storage.helper import StorageHelper, cloud_driver_schemes
from ..storage.cache import CacheManager
from ..storage.util import sha256sum, is_windows, md5text, format_size
from ..utilities.matching import matches_any_wildcard
from ..utilities.parallel import ParallelZipper
from ..utilities.version import Version
try:
from pathlib import Path as _Path # noqa
except ImportError:
_Path = None
try:
import numpy as np
except ImportError:
np = None
try:
import pandas as pd
except ImportError:
pd = None
except Exception as e:
logging.warning("ClearML Dataset failed importing pandas: {}".format(e))
pd = None
try:
import pyarrow # noqa
except ImportError:
pyarrow = None
except Exception as e:
logging.warning("ClearML Dataset failed importing pyarrow: {}".format(e))
pyarrow = None
try:
import fastparquet # noqa
except ImportError:
fastparquet = None
except Exception as e:
logging.warning("ClearML Dataset failed importing fastparquet: {}".format(e))
fastparquet = None
@attrs
class FileEntry(object):
relative_path = attrib(default=None, type=str)
hash = attrib(default=None, type=str)
parent_dataset_id = attrib(default=None, type=str)
size = attrib(default=None, type=int)
# support multi part artifact storage
artifact_name = attrib(default=None, type=str)
# cleared when file is uploaded.
local_path = attrib(default=None, type=str)
def as_dict(self):
# type: () -> Dict
state = dict(relative_path=self.relative_path, hash=self.hash,
parent_dataset_id=self.parent_dataset_id, size=self.size,
artifact_name=self.artifact_name,
**dict([('local_path', self.local_path)] if self.local_path else ()))
return state
@attrs
class LinkEntry(object):
link = attrib(default=None, type=str)
relative_path = attrib(default=None, type=str)
parent_dataset_id = attrib(default=None, type=str)
size = attrib(default=None, type=int)
hash = attrib(default=None, type=str)
def as_dict(self):
# type: () -> Dict
return dict(
link=self.link,
relative_path=self.relative_path,
parent_dataset_id=self.parent_dataset_id,
size=self.size,
)
class Dataset(object):
__private_magic = 42 * 1337
__state_entry_name = 'state'
__default_data_entry_name = 'data'
__data_entry_name_prefix = 'data_'
__cache_context = 'datasets'
__tag = 'dataset'
__hidden_tag = "hidden"
__external_files_tag = "external files"
__cache_folder_prefix = "ds_"
__default_dataset_version = "1.0.0"
__dataset_folder_template = CacheManager.set_context_folder_lookup(__cache_context, "{0}_archive_{1}")
__preview_max_file_entries = 15000
__preview_max_size = 32 * 1024
__preview_total_max_size = 320 * 1024
__min_api_version = "2.20"
__hyperparams_section = "Datasets"
__datasets_runtime_prop = "datasets"
__orig_datasets_runtime_prop_prefix = "orig_datasets"
__preview_tabular_table_count = deferred_config("dataset.preview.tabular.table_count", 10, transform=int)
__preview_tabular_row_count = deferred_config("dataset.preview.tabular.row_count", 10, transform=int)
__preview_media_image_count = deferred_config("dataset.preview.media.image_count", 10, transform=int)
__preview_media_video_count = deferred_config("dataset.preview.media.video_count", 10, transform=int)
__preview_media_audio_count = deferred_config("dataset.preview.media.audio_count", 10, transform=int)
__preview_media_html_count = deferred_config("dataset.preview.media.html_count", 10, transform=int)
_dataset_chunk_size_mb = deferred_config("storage.dataset_chunk_size_mb", 512, transform=int)
def __init__(
self,
_private, # type: int
task=None, # type: Optional[Task]
dataset_project=None, # type: Optional[str]
dataset_name=None, # type: Optional[str]
dataset_tags=None, # type: Optional[Sequence[str]]
dataset_version=None, # type: Optional[str]
description=None, # type: Optional[str]
):
# type: (...) -> ()
"""
Do not use directly! Use Dataset.create(...) or Dataset.get(...) instead.
"""
assert _private == self.__private_magic
# key for the dataset file entries are the relative path within the data
self._dataset_file_entries = {} # type: Dict[str, FileEntry]
self._dataset_link_entries = {} # type: Dict[str, LinkEntry]
        # this will create a graph of all the dependencies we have, each entry lists its own direct parents
self._dependency_graph = {} # type: Dict[str, List[str]]
self._dataset_version = None
if dataset_version:
self._dataset_version = str(dataset_version).strip()
if not Version.is_valid_version_string(self._dataset_version):
LoggerRoot.get_base_logger().warning(
"Setting non-semantic dataset version '{}'".format(self._dataset_version)
)
if task:
self._task_pinger = None
self._created_task = False
task_status = task.data.status
# if we are continuing aborted Task, force the state
if str(task_status) == 'stopped':
# print warning that we are opening a stopped dataset:
LoggerRoot.get_base_logger().warning(
'Reopening aborted Dataset, any change will clear and overwrite current state')
task.mark_started(force=True)
task_status = 'in_progress'
# If we are reusing the main current Task, make sure we set its type to data_processing
if str(task_status) in ('created', 'in_progress'):
if str(task.task_type) != str(Task.TaskTypes.data_processing):
task.set_task_type(task_type=Task.TaskTypes.data_processing)
task_system_tags = task.get_system_tags() or []
if self.__tag not in task_system_tags:
task.set_system_tags(task_system_tags + [self.__tag])
if dataset_tags:
task.set_tags((task.get_tags() or []) + list(dataset_tags))
# Keep track of modified files (added, removed, modified)
# We also load the metadata from the existing task into this one, so we can add when
# e.g. add_files is called multiple times
task_state = task.artifacts.get('state')
if task_state:
self.changed_files = {key: int(task_state.metadata.get(key, 0))
for key in {'files added', 'files removed', 'files modified'}}
else:
self.changed_files = {'files added': 0, 'files removed': 0, 'files modified': 0}
if "/.datasets/" not in task.get_project_name() or "":
dataset_project, parent_project = self._build_hidden_project_name(task.get_project_name(), task.name)
task.move_to_project(new_project_name=dataset_project)
if bool(Session.check_min_api_server_version(Dataset.__min_api_version)):
get_or_create_project(task.session, project_name=parent_project, system_tags=[self.__hidden_tag])
get_or_create_project(
task.session,
project_name=dataset_project,
project_id=task.project,
system_tags=[self.__hidden_tag, self.__tag],
)
else:
self._created_task = True
dataset_project, parent_project = self._build_hidden_project_name(dataset_project, dataset_name)
task = Task.create(
project_name=dataset_project, task_name=dataset_name, task_type=Task.TaskTypes.data_processing)
if bool(Session.check_min_api_server_version(Dataset.__min_api_version)):
get_or_create_project(task.session, project_name=parent_project, system_tags=[self.__hidden_tag])
get_or_create_project(
task.session,
project_name=dataset_project,
project_id=task.project,
system_tags=[self.__hidden_tag, self.__tag],
)
# set default output_uri
task.output_uri = True
task.set_system_tags((task.get_system_tags() or []) + [self.__tag])
if dataset_tags:
task.set_tags((task.get_tags() or []) + list(dataset_tags))
task.mark_started()
# generate the script section
script = (
"from clearml import Dataset\n\n"
"ds = Dataset.create(dataset_project='{dataset_project}', dataset_name='{dataset_name}', "
"dataset_version='{dataset_version}')\n".format(
dataset_project=dataset_project, dataset_name=dataset_name, dataset_version=dataset_version
)
)
task.data.script.diff = script
task.data.script.working_dir = '.'
task.data.script.entry_point = 'register_dataset.py'
from clearml import __version__
task.data.script.requirements = {'pip': 'clearml == {}\n'.format(__version__)}
# noinspection PyProtectedMember
task._edit(script=task.data.script)
# if the task is running make sure we ping to the server so it will not be aborted by a watchdog
self._task_pinger = DevWorker()
self._task_pinger.register(task, stop_signal_support=False)
            # set the newly created Dataset parent to the current Task, so we know who created it.
if Task.current_task() and Task.current_task().id != task.id:
task.set_parent(Task.current_task())
# Set the modified files to empty on dataset creation
self.changed_files = {'files added': 0, 'files removed': 0, 'files modified': 0}
# store current dataset Task
self._task = task
if not self._dataset_version:
# noinspection PyProtectedMember
self._dataset_version = self._task._get_runtime_properties().get("version")
if not self._dataset_version:
_, latest_version = self._get_dataset_id(self.project, self.name)
if latest_version is not None:
# noinspection PyBroadException
try:
self._dataset_version = str(Version(latest_version).get_next_version())
except Exception:
LoggerRoot.get_base_logger().warning(
"Could not auto-increment version {} of dataset with ID {}".format(
latest_version, self._task.id
)
)
# store current dataset id
self._id = task.id
# store the folder where the dataset was downloaded to
self._local_base_folder = None # type: Optional[Path]
        # dirty flag, set True by any function call changing the dataset (regardless of whether it did anything)
self._dirty = False
self._using_current_task = False
# set current artifact name to be used (support for multiple upload sessions)
self._data_artifact_name = self._get_next_data_artifact_name()
        # store a cached lookup of the number of chunks each parent dataset has.
        # this will help with verifying we have an up-to-date partial local copy
self._dependency_chunk_lookup = None # type: Optional[Dict[str, int]]
self._ds_total_size = None
self._ds_total_size_compressed = None
self.__preview_tables_count = 0
self.__preview_image_count = 0
self.__preview_video_count = 0
self.__preview_audio_count = 0
self.__preview_html_count = 0
@property
def id(self):
# type: () -> str
return self._id
@property
def file_entries(self):
# type: () -> List[FileEntry]
return list(self._dataset_file_entries.values())
@property
def link_entries(self):
# type: () -> List[LinkEntry]
return list(self._dataset_link_entries.values())
@property
def file_entries_dict(self):
# type: () -> Mapping[str, FileEntry]
"""
Notice this call returns an internal representation, do not modify!
:return: dict with relative file path as key, and FileEntry as value
"""
return self._dataset_file_entries
@property
def link_entries_dict(self):
# type: () -> Mapping[str, LinkEntry]
"""
Notice this call returns an internal representation, do not modify!
:return: dict with relative file path as key, and LinkEntry as value
"""
return self._dataset_link_entries
@property
def project(self):
# type: () -> str
return self._remove_hidden_part_from_dataset_project(self._task.get_project_name())
@property
def name(self):
# type: () -> str
if bool(Session.check_min_api_server_version(Dataset.__min_api_version)):
return self._task.get_project_name().partition("/.datasets/")[-1]
return self._task.name
@property
def version(self):
# type: () -> Optional[str]
return self._dataset_version
@version.setter
def version(self, version):
# type: (str) -> ()
version = str(version).strip()
self._dataset_version = version
if not Version.is_valid_version_string(version):
LoggerRoot.get_base_logger().warning("Setting non-semantic dataset version '{}'".format(version))
# noinspection PyProtectedMember
self._task._set_runtime_properties({"version": version})
self._task.set_user_properties(version=version)
@property
def tags(self):
# type: () -> List[str]
return self._task.get_tags() or []
@tags.setter
def tags(self, values):
# type: (List[str]) -> ()
self._task.set_tags(values or [])
def add_tags(self, tags):
# type: (Union[Sequence[str], str]) -> None
"""
Add Tags to this dataset. Old tags are not deleted. When executing a Task (experiment) remotely,
this method has no effect.
:param tags: A list of tags which describe the Task to add.
"""
self._task.add_tags(tags)
def add_files(
self,
path, # type: Union[str, Path, _Path]
wildcard=None, # type: Optional[Union[str, Sequence[str]]]
local_base_folder=None, # type: Optional[str]
dataset_path=None, # type: Optional[str]
recursive=True, # type: bool
verbose=False, # type: bool
max_workers=None, # type: Optional[int]
):
        # type: (...) -> int
"""
        Add a folder into the current dataset. Calculate the hash of each file,
        compare it against the parent dataset, and mark files to be uploaded
:param path: Add a folder/file to the dataset
:param wildcard: add only specific set of files.
Wildcard matching, can be a single string or a list of wildcards.
:param local_base_folder: files will be located based on their relative path from local_base_folder
:param dataset_path: where in the dataset the folder/files should be located
:param recursive: If True, match all wildcard files recursively
:param verbose: If True, print to console files added/modified
:param max_workers: The number of threads to add the files with. Defaults to the number of logical cores
:return: number of files added
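        Example, a minimal sketch (the folder and wildcard below are placeholders):

        .. code-block:: py

            ds = Dataset.create(dataset_project="data", dataset_name="images")
            # add all jpg files under ./images, keeping their relative paths
            ds.add_files(path="./images", wildcard="*.jpg", recursive=True)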
"""
max_workers = max_workers or psutil.cpu_count()
self._dirty = True
self._task.get_logger().report_text(
'Adding files to dataset: {}'.format(
dict(path=path, wildcard=wildcard, local_base_folder=local_base_folder,
dataset_path=dataset_path, recursive=recursive, verbose=verbose)),
print_console=False)
num_added, num_modified = self._add_files(
path=path,
wildcard=wildcard,
local_base_folder=local_base_folder,
dataset_path=dataset_path,
recursive=recursive,
verbose=verbose,
max_workers=max_workers,
)
# update the task script
self._add_script_call(
'add_files', path=path, wildcard=wildcard, local_base_folder=local_base_folder,
dataset_path=dataset_path, recursive=recursive)
self._serialize()
return num_added
def add_external_files(
self,
source_url, # type: Union[str, Sequence[str]]
wildcard=None, # type: Optional[Union[str, Sequence[str]]]
dataset_path=None, # type: Optional[str]
recursive=True, # type: bool
verbose=False, # type: bool
max_workers=None # type: Optional[int]
):
# type: (...) -> int
"""
Adds external files or folders to the current dataset.
External file links can be from cloud storage (s3://, gs://, azure://), local / network storage (file://)
        or http(s):// files.
Calculates file size for each file and compares against parent.
A few examples:
        - Add file.jpg to the dataset. When retrieving a copy of the entire dataset (see dataset.get_local_copy()),
        this file will be located in "./my_dataset/new_folder/file.jpg".
add_external_files(source_url="s3://my_bucket/stuff/file.jpg", dataset_path="/my_dataset/new_folder/")
- Add all jpg files located in s3 bucket called "my_bucket" to the dataset.
add_external_files(source_url="s3://my/bucket/", wildcard = "*.jpg", dataset_path="/my_dataset/new_folder/")
- Add the entire content of "remote_folder" to the dataset.
add_external_files(source_url="s3://bucket/remote_folder/", dataset_path="/my_dataset/new_folder/")
- Add the local file "/folder/local_file.jpg" to the dataset.
add_external_files(source_url="file:///folder/local_file.jpg", dataset_path="/my_dataset/new_folder/")
:param source_url: Source url link (e.g. s3://bucket/folder/path) or list/tuple of links to add to
the dataset (e.g. [s3://bucket/folder/file.csv, http://web.com/file.txt])
:param wildcard: add only specific set of files.
Wildcard matching, can be a single string or a list of wildcards.
:param dataset_path: The location in the dataset where the file will be downloaded into.
e.g: for source_url='s3://bucket/remote_folder/image.jpg' and dataset_path='s3_files',
'image.jpg' will be downloaded to 's3_files/image.jpg' (relative path to the dataset)
:param recursive: If True, match all wildcard files recursively
:param verbose: If True, print to console files added/modified
:param max_workers: The number of threads to add the external files with. Useful when `source_url` is
a sequence. Defaults to the number of logical cores
:return: Number of file links added
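        Example, a minimal sketch (the bucket and paths below are placeholders):

        .. code-block:: py

            ds = Dataset.create(dataset_project="data", dataset_name="external")
            # register the links only; the files stay in the external storage
            ds.add_external_files(
                source_url="s3://my_bucket/stuff/", wildcard="*.jpg", dataset_path="s3_files/"
            )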
"""
self._dirty = True
num_added = 0
num_modified = 0
source_url_list = source_url if not isinstance(source_url, str) else [source_url]
max_workers = max_workers or psutil.cpu_count()
futures_ = []
with ThreadPoolExecutor(max_workers=max_workers) as tp:
for source_url_ in source_url_list:
futures_.append(
tp.submit(
self._add_external_files,
source_url_,
wildcard=wildcard,
dataset_path=dataset_path,
recursive=recursive,
verbose=verbose,
)
)
for future_ in futures_:
num_added_this_call, num_modified_this_call = future_.result()
num_added += num_added_this_call
num_modified += num_modified_this_call
self._task.add_tags([self.__external_files_tag])
self._add_script_call(
"add_external_files",
source_url=source_url,
wildcard=wildcard,
dataset_path=dataset_path,
recursive=recursive,
verbose=verbose,
)
self.update_changed_files(num_files_added=num_added, num_files_modified=num_modified)
self._serialize()
return num_added
def remove_files(self, dataset_path=None, recursive=True, verbose=False):
# type: (Optional[str], bool, bool) -> int
"""
Remove files from the current dataset
:param dataset_path: Remove files from the dataset.
The path is always relative to the dataset (e.g 'folder/file.bin').
External files can also be removed by their links (e.g. 's3://bucket/file')
:param recursive: If True, match all wildcard files recursively
:param verbose: If True, print to console files removed
:return: Number of files removed
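        Example, a minimal sketch (the wildcard path below is a placeholder):

        .. code-block:: py

            # remove every json file under 'folder/' from the dataset
            num_removed = ds.remove_files(dataset_path="folder/*.json", verbose=True)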
"""
self._task.get_logger().report_text(
'Removing files from dataset: {}'.format(
dict(dataset_path=dataset_path, recursive=recursive, verbose=verbose)),
print_console=False)
if dataset_path and dataset_path.startswith('/'):
dataset_path = dataset_path[1:]
org_files = list(self._dataset_file_entries.keys()) + list(self._dataset_link_entries.keys())
self._dataset_file_entries = {
k: v
for k, v in self._dataset_file_entries.items()
if not matches_any_wildcard(k, dataset_path, recursive=recursive)
}
self._dataset_link_entries = {
k: v
for k, v in self._dataset_link_entries.items()
if not matches_any_wildcard(k, dataset_path, recursive=recursive)
and not matches_any_wildcard(v.link, dataset_path, recursive=recursive)
}
removed = 0
for f in org_files:
if f not in self._dataset_file_entries and f not in self._dataset_link_entries:
if verbose:
self._task.get_logger().report_text('Remove {}'.format(f))
removed += 1
# update the task script
self._add_script_call(
'remove_files', dataset_path=dataset_path, recursive=recursive)
self._serialize()
# Update state
self.update_changed_files(num_files_removed=removed)
return removed
def sync_folder(self, local_path, dataset_path=None, verbose=False):
# type: (Union[Path, _Path, str], Union[Path, _Path, str], bool) -> (int, int, int)
"""
        Synchronize the dataset with a local folder. The dataset is synchronized from the
        dataset_path (default: dataset root) and deeper with the specified local path.
Note that if a remote file is identified as being modified when syncing, it will
be added as a FileEntry, ready to be uploaded to the ClearML server. This version of the
file is considered "newer" and it will be downloaded instead of the one stored at its
remote address when calling Dataset.get_local_copy().
:param local_path: Local folder to sync (assumes all files and recursive)
:param dataset_path: Target dataset path to sync with (default the root of the dataset)
:param verbose: If True, print to console files added/modified/removed
        :return: number of files removed, number of files added, number of files modified
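        Example, a minimal sketch (the local folder below is a placeholder):

        .. code-block:: py

            # mirror ./data_dir into the dataset root: new/changed files are added,
            # files no longer present locally are removed
            num_removed, num_added, num_modified = ds.sync_folder(local_path="./data_dir")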
"""
def filter_f(f):
keep = (not f.relative_path.startswith(relative_prefix) or
(local_path / f.relative_path[len(relative_prefix):]).is_file())
if not keep and verbose:
self._task.get_logger().report_text('Remove {}'.format(f.relative_path))
return keep
self._task.get_logger().report_text(
'Syncing local copy with dataset: {}'.format(
dict(local_path=local_path, dataset_path=dataset_path, verbose=verbose)),
print_console=False)
self._dirty = True
local_path = Path(local_path)
# Path().as_posix() will never end with /
relative_prefix = (Path(dataset_path).as_posix() + '/') if dataset_path else ''
# remove files
num_files = len(self._dataset_file_entries)
self._dataset_file_entries = {
k: f for k, f in self._dataset_file_entries.items() if filter_f(f)}
num_removed = num_files - len(self._dataset_file_entries)
# Update the internal state
self.update_changed_files(num_files_removed=num_removed)
# add remaining files, state is updated in _add_files
num_added, num_modified = self._add_files(path=local_path, dataset_path=dataset_path,
recursive=True, verbose=verbose)
# How many of the files were modified? AKA have the same name but a different hash
if verbose:
self._task.get_logger().report_text(
'Syncing folder {} : {} files removed, {} added / modified'.format(
local_path.as_posix(), num_removed, num_added + num_modified))
# update the task script
self._add_script_call(
'sync_folder', local_path=local_path, dataset_path=dataset_path)
return num_removed, num_added, num_modified
def upload(
self,
show_progress=True,
verbose=False,
output_url=None,
compression=None,
chunk_size=None,
max_workers=None,
retries=3,
):
        # type: (bool, bool, Optional[str], Optional[str], Optional[int], Optional[int], int) -> ()
"""
Start file uploading, the function returns when all files are uploaded.
:param show_progress: If True, show upload progress bar
:param verbose: If True, print verbose progress report
:param output_url: Target storage for the compressed dataset (default: file server)
Examples: `s3://bucket/data`, `gs://bucket/data` , `azure://bucket/data` , `/mnt/share/data`
:param compression: Compression algorithm for the Zipped dataset file (default: ZIP_DEFLATED)
:param chunk_size: Artifact chunk size (MB) for the compressed dataset,
            if not provided (None) use the default chunk size (512 MB).
If -1 is provided, use a single zip artifact for the entire dataset change-set (old behaviour)
:param max_workers: Numbers of threads to be spawned when zipping and uploading the files.
If None (default) it will be set to:
- 1: if the upload destination is a cloud provider ('s3', 'gs', 'azure')
- number of logical cores: otherwise
:param int retries: Number of retries before failing to upload each zip. If 0, the upload is not retried.
:raise: If the upload failed (i.e. at least one zip failed to upload), raise a `ValueError`
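        Example, a minimal sketch (the target bucket below is a placeholder):

        .. code-block:: py

            # compress and upload the pending changes in 256 MB chunks
            ds.upload(output_url="s3://bucket/data", chunk_size=256, retries=3)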
"""
self._report_dataset_preview()
# set output_url
if output_url:
self._task.output_uri = output_url
self._task.get_logger().set_default_upload_destination(output_url)
if not max_workers:
max_workers = 1 if self._task.output_uri.startswith(tuple(cloud_driver_schemes)) else psutil.cpu_count()
self._task.get_logger().report_text(
"Uploading dataset files: {}".format(
dict(show_progress=show_progress, verbose=verbose, output_url=output_url, compression=compression)
),
print_console=False,
)
total_size = 0
chunks_count = 0
total_preview_size = 0
keep_as_file_entry = set()
chunk_size = int(self._dataset_chunk_size_mb if not chunk_size else chunk_size)
upload_futures = []
with ThreadPoolExecutor(max_workers=max_workers) as pool:
parallel_zipper = ParallelZipper(
chunk_size,
max_workers,
allow_zip_64=True,
compression=ZIP_DEFLATED if compression is None else compression,
zip_prefix="dataset.{}.".format(self._id),
zip_suffix=".zip",
verbose=verbose,
task=self._task,
pool=pool,
)
file_paths = []
arcnames = {}
for f in self._dataset_file_entries.values():
if not f.local_path:
keep_as_file_entry.add(f.relative_path)
continue
file_paths.append(f.local_path)
arcnames[f.local_path] = f.relative_path
for zip_ in parallel_zipper.zip_iter(file_paths, arcnames=arcnames):
running_futures = []
for upload_future in upload_futures:
if upload_future.running():
running_futures.append(upload_future)
else:
if not upload_future.result():
raise ValueError("Failed uploading dataset with ID {}".format(self._id))
upload_futures = running_futures
zip_path = Path(zip_.zip_path)
artifact_name = self._data_artifact_name
self._data_artifact_name = self._get_next_data_artifact_name(self._data_artifact_name)
self._task.get_logger().report_text(
"Uploading dataset changes ({} files compressed to {}) to {}".format(
zip_.count,
format_size(zip_.size, binary=True, use_b_instead_of_bytes=True),
self.get_default_storage()
)
)
total_size += zip_.size
chunks_count += 1
truncated_preview = ""
add_truncated_message = False
truncated_message = "...\ntruncated (too many files to preview)"
for preview_entry in zip_.archive_preview[:Dataset.__preview_max_file_entries]:
truncated_preview += preview_entry + "\n"
if len(truncated_preview) > Dataset.__preview_max_size or \
len(truncated_preview) + total_preview_size > Dataset.__preview_total_max_size:
add_truncated_message = True
break
if len(zip_.archive_preview) > Dataset.__preview_max_file_entries:
add_truncated_message = True
preview = truncated_preview + (truncated_message if add_truncated_message else "")
total_preview_size += len(preview)
upload_futures.append(
pool.submit(
self._task.upload_artifact,
name=artifact_name,
artifact_object=Path(zip_path),
preview=preview,
delete_after_upload=True,
wait_on_upload=True,
retries=retries
)
)
for file_entry in self._dataset_file_entries.values():
if file_entry.local_path is not None and \
Path(file_entry.local_path).as_posix() in zip_.files_zipped:
keep_as_file_entry.add(file_entry.relative_path)
file_entry.artifact_name = artifact_name
if file_entry.parent_dataset_id == self._id:
file_entry.local_path = None
self._serialize()
self._task.get_logger().report_text(
"File compression and upload completed: total size {}, {} chunk(s) stored (average size {})".format(
format_size(total_size, binary=True, use_b_instead_of_bytes=True),
chunks_count,
format_size(0 if chunks_count == 0 else total_size / chunks_count,
binary=True, use_b_instead_of_bytes=True),
)
)
self._ds_total_size_compressed = total_size + self._get_total_size_compressed_parents()
if chunks_count == 0:
LoggerRoot.get_base_logger().info("No pending files, skipping upload.")
self._dirty = False
self._serialize()
return True
# remove files that could not be zipped
self._dataset_file_entries = {
k: v for k, v in self._dataset_file_entries.items() if v.relative_path in keep_as_file_entry
}
# report upload completed
self._add_script_call(
"upload", show_progress=show_progress, verbose=verbose, output_url=output_url, compression=compression
)
self._dirty = False
self._serialize()
def finalize(self, verbose=False, raise_on_error=True, auto_upload=False):
# type: (bool, bool, bool) -> bool
"""
        Finalize the dataset and publish the dataset Task. Upload must first be called to verify that there are no pending uploads.
        If files do need to be uploaded, it raises an exception (or returns False)
:param verbose: If True, print verbose progress report
:param raise_on_error: If True, raise exception if dataset finalizing failed
        :param auto_upload: Automatically upload the dataset if it was not uploaded yet; will upload to the default location.
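        Example, a minimal sketch:

        .. code-block:: py

            # upload any pending files and close the dataset in one call
            ds.finalize(auto_upload=True)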
"""
# check we do not have files waiting for upload.
if self._dirty:
if auto_upload:
self._task.get_logger().report_text("Pending uploads, starting dataset upload to {}"
.format(self.get_default_storage()))
self.upload()
elif raise_on_error:
raise ValueError("Cannot finalize dataset, pending uploads. Call Dataset.upload(...)")
else:
return False
status = self._task.get_status()
if status not in ('in_progress', 'created'):
raise ValueError("Cannot finalize dataset, status '{}' is not valid".format(status))
self._task.get_logger().report_text('Finalizing dataset', print_console=False)
# make sure we have no redundant parent versions
self._serialize(update_dependency_chunk_lookup=True)
self._add_script_call('finalize')
if verbose:
print('Updating statistics and genealogy')
self._report_dataset_struct()
self._report_dataset_genealogy()
if self._using_current_task:
self._task.flush(wait_for_uploads=True)
else:
self._task.close()
self._task.mark_completed()
if self._task_pinger:
self._task_pinger.unregister()
self._task_pinger = None
return True
def set_metadata(self, metadata, metadata_name='metadata', ui_visible=True):
        # type: (Union[numpy.ndarray, pd.DataFrame, Dict[str, Any]], str, bool) -> () # noqa: F821
"""
        Attach user-defined metadata to the dataset. Check `Task.upload_artifact` for supported types.
        If the metadata is a pandas DataFrame, it can optionally be displayed as a table in the UI.
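        Example, a minimal sketch (assumes pandas is installed; the column names are placeholders):

        .. code-block:: py

            import pandas as pd

            df = pd.DataFrame({"file": ["a.jpg", "b.jpg"], "label": [0, 1]})
            ds.set_metadata(df, metadata_name="labels")
            # later, retrieve it in its original format
            labels = ds.get_metadata(metadata_name="labels")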
"""
if metadata_name.startswith(self.__data_entry_name_prefix):
raise ValueError("metadata_name can not start with '{}'".format(self.__data_entry_name_prefix))
self._task.upload_artifact(name=metadata_name, artifact_object=metadata)
if ui_visible:
if pd and isinstance(metadata, pd.DataFrame):
self.get_logger().report_table(
title='Dataset Metadata',
series='Dataset Metadata',
table_plot=metadata
)
else:
self._task.get_logger().report_text(
"Displaying metadata in the UI is only supported for pandas Dataframes for now. Skipping!",
print_console=True,
)
def get_metadata(self, metadata_name='metadata'):
        # type: (str) -> Optional[Union[numpy.ndarray, pd.DataFrame, dict, str, bool]] # noqa: F821
"""
Get attached metadata back in its original format. Will return None if none was found.
"""
metadata = self._task.artifacts.get(metadata_name)
if metadata is None:
self._task.get_logger().report_text(
"Cannot find metadata on this task, are you sure it has the correct name?",
print_console=True,
)
return None
return metadata.get()
def set_description(self, description):
# type: (str) -> ()
"""
Set description of the dataset
:param description: Description to be set
"""
self._task.comment = description
def publish(self, raise_on_error=True):
# type: (bool) -> bool
"""
Publish the dataset
        If the dataset is not finalized, raise an exception
:param raise_on_error: If True, raise exception if dataset publishing failed
"""
# check we can publish this dataset
if not self.is_final():
raise ValueError("Cannot publish dataset, dataset in status {}.".format(self._task.get_status()))
self._task.publish(ignore_errors=raise_on_error)
return True
def is_final(self):
# type: () -> bool
"""
Return True if the dataset was finalized and cannot be changed any more.
        :return: True if the dataset is final
"""
return self._task.get_status() not in (
Task.TaskStatusEnum.in_progress, Task.TaskStatusEnum.created, Task.TaskStatusEnum.failed)
def get_local_copy(self, use_soft_links=None, part=None, num_parts=None, raise_on_error=True, max_workers=None):
# type: (bool, Optional[int], Optional[int], bool, Optional[int]) -> str
"""
        Return a base folder with a read-only (immutable) local copy of the entire dataset;
        download and copy / soft-link files from all the parent dataset versions. The dataset needs to be finalized
:param use_soft_links: If True, use soft links, default False on windows True on Posix systems
:param part: Optional, if provided only download the selected part (index) of the Dataset.
First part number is `0` and last part is `num_parts-1`
Notice, if `num_parts` is not provided, number of parts will be equal to the total number of chunks
(i.e. sum over all chunks from the specified Dataset including all parent Datasets).
This argument is passed to parent datasets, as well as the implicit `num_parts`,
allowing users to get a partial copy of the entire dataset, for multi node/step processing.
:param num_parts: Optional, if specified, normalize the number of chunks stored to the
requested number of parts. Notice that the actual chunks used per part are rounded down.
Example: Assuming total 8 chunks for this dataset (including parent datasets),
and `num_parts=5`, the chunk index used per parts would be:
part=0 -> chunks[0,5], part=1 -> chunks[1,6], part=2 -> chunks[2,7], part=3 -> chunks[3, ]
:param raise_on_error: If True, raise exception if dataset merging failed on any file
:param max_workers: Number of threads to be spawned when getting the dataset copy. Defaults
to the number of logical cores.
:return: A base folder for the entire dataset
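        Example, a minimal sketch (project/name below are placeholders):

        .. code-block:: py

            ds = Dataset.get(dataset_project="data", dataset_name="images")
            folder = ds.get_local_copy()  # read-only cached copy
            # fetch only the first of two parts, e.g. for multi-node processing
            part_folder = ds.get_local_copy(part=0, num_parts=2)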
"""
assert self._id
if not self._task:
self._task = Task.get_task(task_id=self._id)
if not self.is_final():
raise ValueError("Cannot get a local copy of a dataset that was not finalized/closed")
max_workers = max_workers or psutil.cpu_count()
# now let's merge the parents
target_folder = self._merge_datasets(
use_soft_links=use_soft_links,
raise_on_error=raise_on_error,
part=part,
num_parts=num_parts,
max_workers=max_workers,
)
return target_folder
def get_mutable_local_copy(
self, target_folder, overwrite=False, part=None, num_parts=None, raise_on_error=True, max_workers=None
):
# type: (Union[Path, _Path, str], bool, Optional[int], Optional[int], bool, Optional[int]) -> Optional[str]
"""
        Return a base folder with a writable (mutable) local copy of the entire dataset;
        download and copy / soft-link files from all the parent dataset versions
:param target_folder: Target folder for the writable copy
:param overwrite: If True, recursively delete the target folder before creating a copy.
If False (default) and target folder contains files, raise exception or return None
:param part: Optional, if provided only download the selected part (index) of the Dataset.
First part number is `0` and last part is `num_parts-1`
Notice, if `num_parts` is not provided, number of parts will be equal to the total number of chunks
(i.e. sum over all chunks from the specified Dataset including all parent Datasets).
This argument is passed to parent datasets, as well as the implicit `num_parts`,
allowing users to get a partial copy of the entire dataset, for multi node/step processing.
:param num_parts: Optional, if specified, normalize the number of chunks stored to the
requested number of parts. Notice that the actual chunks used per part are rounded down.
Example: Assuming total 8 chunks for this dataset (including parent datasets),
and `num_parts=5`, the chunk index used per parts would be:
part=0 -> chunks[0,5], part=1 -> chunks[1,6], part=2 -> chunks[2,7], part=3 -> chunks[3, ]
:param raise_on_error: If True, raise exception if dataset merging failed on any file
:param max_workers: Number of threads to be spawned when getting the dataset copy. Defaults
to the number of logical cores.
:return: The target folder containing the entire dataset
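        Example, a minimal sketch (the target folder below is a placeholder):

        .. code-block:: py

            # copy the dataset into a folder we are allowed to modify
            folder = ds.get_mutable_local_copy(target_folder="./work_copy", overwrite=True)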
"""
assert self._id
max_workers = max_workers or psutil.cpu_count()
target_folder = Path(target_folder).absolute()
target_folder.mkdir(parents=True, exist_ok=True)
# noinspection PyBroadException
try:
target_folder.rmdir()
except Exception:
if not overwrite:
if raise_on_error:
raise ValueError("Target folder {} already contains files".format(target_folder.as_posix()))
else:
return None
shutil.rmtree(target_folder.as_posix())
ro_folder = self.get_local_copy(
part=part, num_parts=num_parts, raise_on_error=raise_on_error, max_workers=max_workers
)
shutil.copytree(ro_folder, target_folder.as_posix(), symlinks=False)
return target_folder.as_posix()
def list_files(self, dataset_path=None, recursive=True, dataset_id=None):
# type: (Optional[str], bool, Optional[str]) -> List[str]
"""
        Return a list of files in the current dataset.
        If dataset_id is provided, return a list of files that remained unchanged since the specified dataset_id
:param dataset_path: Only match files matching the dataset_path (including wildcards).
Example: 'folder/sub/*.json'
:param recursive: If True (default), matching dataset_path recursively
:param dataset_id: Filter list based on the dataset ID containing the latest version of the file.
Default: None, do not filter files based on parent dataset.
:return: List of files with relative path
(files might not be available locally until get_local_copy() is called)
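        Example, a minimal sketch (the wildcard path below is a placeholder):

        .. code-block:: py

            # list all json files under 'folder/', relative to the dataset root
            json_files = ds.list_files(dataset_path="folder/*.json")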
"""
files = (
list(self._dataset_file_entries.keys())
if not dataset_id
else [
k
for k, v in self._dataset_file_entries.items()
if v.parent_dataset_id == dataset_id
]
)
files.extend(
list(self._dataset_link_entries.keys())
if not dataset_id
else [
k
for k, v in self._dataset_link_entries.items()
if v.parent_dataset_id == dataset_id
]
)
files = list(set(files))
if not dataset_path:
return sorted(files)
if dataset_path.startswith("/"):
dataset_path = dataset_path[1:]
return sorted(
[
f
for f in files
if matches_any_wildcard(f, dataset_path, recursive=recursive)
]
)
def list_removed_files(self, dataset_id=None):
# type: (str) -> List[str]
"""
return a list of files removed when comparing to a specific dataset_id
        :param dataset_id: dataset ID (str) to compare against. If None is given, compare against the parent datasets
:return: List of files with relative path
(files might not be available locally until get_local_copy() is called)
"""
datasets = self._dependency_graph[self._id] if not dataset_id or dataset_id == self._id else [dataset_id]
unified_list = set()
for ds_id in datasets:
dataset = self.get(dataset_id=ds_id)
unified_list |= set(dataset._dataset_file_entries.keys())
unified_list |= set(dataset._dataset_link_entries.keys())
removed_list = [
f for f in unified_list if f not in self._dataset_file_entries and f not in self._dataset_link_entries
]
return sorted(removed_list)
def list_modified_files(self, dataset_id=None):
# type: (str) -> List[str]
"""
return a list of files modified when comparing to a specific dataset_id
        :param dataset_id: dataset ID (str) to compare against. If None is given, compare against the parent datasets
:return: List of files with relative path
(files might not be available locally until get_local_copy() is called)
"""
datasets = self._dependency_graph[self._id] if not dataset_id or dataset_id == self._id else [dataset_id]
unified_list = dict()
for ds_id in datasets:
dataset = self.get(dataset_id=ds_id)
unified_list.update(dict((k, v.hash) for k, v in dataset._dataset_file_entries.items()))
modified_list = [k for k, v in self._dataset_file_entries.items()
if k in unified_list and v.hash != unified_list[k]]
unified_list_sizes = dict()
for ds_id in datasets:
dataset = self.get(dataset_id=ds_id)
for k, v in dataset._dataset_link_entries.items():
unified_list_sizes[k] = v.size
if k in dataset._dataset_file_entries:
unified_list_sizes[k] = dataset._dataset_file_entries[k].size
for k, v in self._dataset_link_entries.items():
if k not in unified_list_sizes:
continue
size = v.size
if k in self._dataset_file_entries:
size = self._dataset_file_entries[k].size
if size != unified_list_sizes[k]:
modified_list.append(k)
return sorted(list(set(modified_list)))
def list_added_files(self, dataset_id=None):
# type: (str) -> List[str]
"""
return a list of files added when comparing to a specific dataset_id
        :param dataset_id: dataset ID (str) to compare against. If None is given, compare against the parent datasets
:return: List of files with relative path
(files might not be available locally until get_local_copy() is called)
"""
datasets = self._dependency_graph[self._id] if not dataset_id or dataset_id == self._id else [dataset_id]
unified_list = set()
for ds_id in datasets:
dataset = self.get(dataset_id=ds_id)
unified_list |= set(dataset._dataset_file_entries.keys())
unified_list |= set(dataset._dataset_link_entries.keys())
added_list = [
f
for f in list(self._dataset_file_entries.keys()) + list(self._dataset_link_entries.keys())
if f not in unified_list
]
return sorted(list(set(added_list)))
def get_dependency_graph(self):
"""
return the DAG of the dataset dependencies (all previous dataset version and their parents)
Example:
.. code-block:: py
{
'current_dataset_id': ['parent_1_id', 'parent_2_id'],
'parent_2_id': ['parent_1_id'],
'parent_1_id': [],
}
:return: dict representing the genealogy dag graph of the current dataset
"""
return deepcopy(self._dependency_graph)
def verify_dataset_hash(self, local_copy_path=None, skip_hash=False, verbose=False):
# type: (Optional[str], bool, bool) -> List[str]
"""
Verify the current copy of the dataset against the stored hash
:param local_copy_path: Specify local path containing a copy of the dataset,
            If not provided, use the cached folder
:param skip_hash: If True, skip hash checks and verify file size only
:param verbose: If True, print errors while testing dataset files hash
:return: List of files with unmatched hashes
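        Example, a minimal sketch:

        .. code-block:: py

            # re-hash the cached local copy and report any mismatching files
            bad_files = ds.verify_dataset_hash(verbose=True)
            assert not bad_files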
"""
local_path = local_copy_path or self.get_local_copy()
def compare(file_entry):
file_entry_copy = copy(file_entry)
file_entry_copy.local_path = (Path(local_path) / file_entry.relative_path).as_posix()
if skip_hash:
file_entry_copy.size = Path(file_entry_copy.local_path).stat().st_size
if file_entry_copy.size != file_entry.size:
if verbose:
print('Error: file size mismatch {} expected size {} current {}'.format(
file_entry.relative_path, file_entry.size, file_entry_copy.size))
return file_entry
else:
self._calc_file_hash(file_entry_copy)
if file_entry_copy.hash != file_entry.hash:
if verbose:
print('Error: hash mismatch {} expected size/hash {}/{} recalculated {}/{}'.format(
file_entry.relative_path,
file_entry.size, file_entry.hash,
file_entry_copy.size, file_entry_copy.hash))
return file_entry
return None
pool = ThreadPool(psutil.cpu_count())
matching_errors = pool.map(compare, self._dataset_file_entries.values())
pool.close()
return [f.relative_path for f in matching_errors if f is not None]
def get_default_storage(self):
# type: () -> Optional[str]
"""
Return the default storage location of the dataset
:return: URL for the default storage location
"""
if not self._task:
return None
return self._task.output_uri or self._task.get_logger().get_default_upload_destination()
@classmethod
def create(
cls,
dataset_name=None, # type: Optional[str]
dataset_project=None, # type: Optional[str]
dataset_tags=None, # type: Optional[Sequence[str]]
parent_datasets=None, # type: Optional[Sequence[Union[str, Dataset]]]
use_current_task=False, # type: bool
dataset_version=None, # type: Optional[str]
output_uri=None, # type: Optional[str]
description=None # type: Optional[str]
):
# type: (...) -> "Dataset"
"""
Create a new dataset. Multiple dataset parents are supported.
Merging of parent datasets is done based on the order,
where each one can override overlapping files in the previous parent
        :param dataset_name: Name of the new dataset
        :param dataset_project: Project containing the dataset.
            If not specified, infer the project name from the parent datasets
:param dataset_tags: Optional, list of tags (strings) to attach to the newly created Dataset
:param parent_datasets: Expand a parent dataset by adding/removing files
:param use_current_task: False (default), a new Dataset task is created.
If True, the dataset is created on the current Task.
:param dataset_version: Version of the new dataset. If not set, try to find the latest version
of the dataset with given `dataset_name` and `dataset_project` and auto-increment it.
:param output_uri: Location to upload the datasets file to, including preview samples.
The following are examples of ``output_uri`` values for the supported locations:
- A shared folder: ``/mnt/share/folder``
- S3: ``s3://bucket/folder``
- Google Cloud Storage: ``gs://bucket-name/folder``
- Azure Storage: ``azure://company.blob.core.windows.net/folder/``
- Default file server: None
:param description: Description of the dataset
:return: Newly created Dataset object
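        Example, a minimal sketch (project/name/URI below are placeholders):

        .. code-block:: py

            ds = Dataset.create(
                dataset_project="data", dataset_name="images",
                dataset_version="1.1.0", output_uri="s3://bucket/datasets",
            )
            ds.add_files(path="./images")
            ds.upload()
            ds.finalize()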
"""
if not Session.check_min_api_server_version("2.13"):
raise NotImplementedError("Datasets are not supported with your current ClearML server version. Please update your server.")
parent_datasets = [cls.get(dataset_id=p) if not isinstance(p, Dataset) else p for p in (parent_datasets or [])]
if any(not p.is_final() for p in parent_datasets):
raise ValueError("Cannot inherit from a parent that was not finalized/closed")
if dataset_name and not dataset_project and Task.current_task():
LoggerRoot.get_base_logger().info("Dataset project not provided, using Current Task's project")
dataset_project = Task.current_task().get_project_name()
# if dataset name + project are None, default to use current_task
if dataset_project is None and dataset_name is None and not use_current_task:
LoggerRoot.get_base_logger().info("New dataset project/name not provided, storing on Current Task")
use_current_task = True
# get project name
if not dataset_project and not use_current_task:
if not parent_datasets:
raise ValueError("Missing dataset project name. Could not infer project name from parent dataset.")
# get project name from parent dataset
# noinspection PyProtectedMember
dataset_project = parent_datasets[-1]._task.get_project_name()
# merge datasets according to order
dataset_file_entries = {}
dataset_link_entries = {}
dependency_graph = {}
for p in parent_datasets:
# noinspection PyProtectedMember
dataset_file_entries.update(deepcopy(p._dataset_file_entries))
# noinspection PyProtectedMember
dataset_link_entries.update(deepcopy(p._dataset_link_entries))
# noinspection PyProtectedMember
dependency_graph.update(deepcopy(p._dependency_graph))
instance = cls(_private=cls.__private_magic,
dataset_project=dataset_project,
dataset_name=dataset_name,
dataset_tags=dataset_tags,
task=Task.current_task() if use_current_task else None,
dataset_version=dataset_version,
description=description)
runtime_props = {
"orig_dataset_name": instance._task._get_runtime_properties().get(
"orig_dataset_name", instance._task.name
), # noqa
"orig_dataset_id": instance._task._get_runtime_properties().get(
"orig_dataset_id", instance._task.id
), # noqa
}
if not instance._dataset_version:
instance._dataset_version = cls.__default_dataset_version
runtime_props["version"] = instance._dataset_version
# noinspection PyProtectedMember
instance._task.set_user_properties(version=instance._dataset_version)
# noinspection PyProtectedMember
instance._task._set_runtime_properties(runtime_props)
if description:
instance.set_description(description)
# noinspection PyProtectedMember
if output_uri and not Task._offline_mode:
# noinspection PyProtectedMember
instance._task.output_uri = output_uri
# noinspection PyProtectedMember
instance._task.get_logger().set_default_upload_destination(output_uri)
# noinspection PyProtectedMember
instance._using_current_task = use_current_task
# noinspection PyProtectedMember
instance._dataset_file_entries = dataset_file_entries
# noinspection PyProtectedMember
instance._dataset_link_entries = dataset_link_entries
# noinspection PyProtectedMember
instance._dependency_graph = dependency_graph
# noinspection PyProtectedMember
instance._dependency_graph[instance._id] = [p._id for p in parent_datasets]
# noinspection PyProtectedMember
instance._serialize()
# noinspection PyProtectedMember
instance._report_dataset_struct()
# noinspection PyProtectedMember
instance._task.get_logger().report_text(
"ClearML results page: {}".format(instance._task.get_output_log_web_page())
)
if bool(Session.check_min_api_server_version(cls.__min_api_version)):
instance._task.get_logger().report_text( # noqa
"ClearML dataset page: {}".format(
"{}/datasets/simple/{}/experiments/{}".format(
instance._task._get_app_server(), # noqa
instance._task.project if instance._task.project is not None else "*", # noqa
instance._task.id, # noqa
)
)
)
# noinspection PyProtectedMember
instance._task.flush(wait_for_uploads=True)
# noinspection PyProtectedMember
cls._set_project_system_tags(instance._task)
return instance
def _get_total_size_compressed_parents(self):
# type: () -> int
"""
:return: the compressed size of the files contained in the parent datasets
"""
parents = self._get_parents()
if not parents:
return 0
runtime_tasks = Task._query_tasks(
task_ids=parents,
only_fields=["runtime.ds_total_size_compressed"],
search_hidden=True,
_allow_extra_fields_=True,
)
compressed_size = 0
for runtime_task in runtime_tasks:
try:
compressed_size += int(runtime_task.runtime.get("ds_total_size_compressed") or 0)
except (TypeError, ValueError):
pass
return compressed_size
@classmethod
def _raise_on_dataset_used(cls, dataset_id):
# type: (str) -> ()
"""
Raise an exception if the given dataset is being used
:param dataset_id: ID of the dataset potentially being used
"""
# noinspection PyProtectedMember
dependencies = Task._query_tasks(
system_tags=[cls.__tag],
type=[str(Task.TaskTypes.data_processing)],
only_fields=["created", "id", "name"],
search_text="{}".format(cls._get_dataset_id_hash(dataset_id)),
search_hidden=True,
_allow_extra_fields_=True,
)
if dependencies:
dependencies = [d for d in dependencies if d.id != dataset_id]
if dependencies:
raise ValueError("Dataset id={} is used by datasets: {}".format(dataset_id, [d.id for d in dependencies]))
@classmethod
def _get_dataset_ids_respecting_params(
cls,
dataset_id=None, # Optional[str]
dataset_project=None, # Optional[str]
dataset_name=None, # Optional[str]
force=False, # bool
dataset_version=None, # Optional[str]
entire_dataset=False, # bool
action=None, # Optional[str]
shallow_search=False, # bool
):
# type: (...) -> List[str]
"""
Get datasets IDs based on certain criteria, like the dataset_project, dataset_name etc.
:param dataset_id: If set, only this ID is returned
:param dataset_project: Corresponding dataset project
:param dataset_name: Corresponding dataset name
        :param force: If True, get the dataset(s) even when they are being used. Also required to be set to
True when `entire_dataset` is set.
:param dataset_version: The version of the corresponding dataset. If set to `None` (default),
then get the dataset with the latest version
:param entire_dataset: If True, get all datasets that match the given `dataset_project`,
`dataset_name`, `dataset_version`. Note that `force` has to be True if this parameter is True
:param action: Corresponding action, used for logging/building error texts
:param shallow_search: If True, search only the first 500 results (first page)
:return: A list of datasets that matched the parameters
"""
if dataset_id:
return [dataset_id]
if entire_dataset:
if not force:
if action:
raise ValueError("Can only {} entire dataset if force is True".format(action))
raise ValueError("Could not fetch ids for requested datasets")
hidden_dataset_project, _ = cls._build_hidden_project_name(dataset_project, dataset_name)
# noinspection PyProtectedMember
datasets = Task._query_tasks(
project_name=[hidden_dataset_project],
task_name=exact_match_regex(dataset_name) if dataset_name else None,
system_tags=[cls.__tag],
only_fields=["id"],
search_hidden=True,
_allow_extra_fields_=True,
)
return [d.id for d in datasets]
dataset_id, _ = cls._get_dataset_id(
dataset_project=dataset_project,
dataset_name=dataset_name,
dataset_version=dataset_version,
raise_on_multiple=True,
shallow_search=shallow_search
)
if not dataset_id:
            raise ValueError(
                "Could not find dataset to {} with project={} name={} version={}".format(
                    action or "fetch", dataset_project, dataset_name, dataset_version
                )
            )
# check if someone is using the datasets
if not force:
cls._raise_on_dataset_used(dataset_id)
return [dataset_id]
@classmethod
def delete(
cls,
dataset_id=None, # Optional[str]
dataset_project=None, # Optional[str]
dataset_name=None, # Optional[str]
force=False, # bool
dataset_version=None, # Optional[str]
entire_dataset=False, # bool
shallow_search=False, # bool
delete_files=True, # bool
delete_external_files=False # bool
):
# type: (...) -> ()
"""
        Delete the dataset(s). If multiple datasets match the parameters,
        raise an Exception or delete the entire dataset if `entire_dataset` is True and `force` is True
:param dataset_id: The ID of the dataset(s) to be deleted
:param dataset_project: The project the dataset(s) to be deleted belong(s) to
:param dataset_name: The name of the dataset(s) to be deleted
        :param force: If True, delete the dataset(s) even when they are being used. Also required to be set to
True when `entire_dataset` is set.
:param dataset_version: The version of the dataset(s) to be deleted
:param entire_dataset: If True, delete all datasets that match the given `dataset_project`,
`dataset_name`, `dataset_version`. Note that `force` has to be True if this parameter is True
:param shallow_search: If True, search only the first 500 results (first page)
:param delete_files: Delete all local files in the dataset (from the ClearML file server), as well as
all artifacts related to the dataset.
:param delete_external_files: Delete all external files in the dataset (from their external storage)
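        Example, a minimal sketch (project/name below are placeholders):

        .. code-block:: py

            # delete every version of the dataset, including its files on the file server
            Dataset.delete(
                dataset_project="data", dataset_name="images",
                entire_dataset=True, force=True, delete_files=True,
            )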
"""
if not any([dataset_id, dataset_project, dataset_name]):
raise ValueError("Dataset deletion criteria not met. Didn't provide id/name/project correctly.")
mutually_exclusive(dataset_id=dataset_id, dataset_project=dataset_project)
mutually_exclusive(dataset_id=dataset_id, dataset_name=dataset_name)
# noinspection PyBroadException
try:
dataset_ids = cls._get_dataset_ids_respecting_params(
dataset_id=dataset_id,
dataset_project=dataset_project,
dataset_name=dataset_name,
force=force,
dataset_version=dataset_version,
entire_dataset=entire_dataset,
shallow_search=shallow_search,
action="delete",
)
except Exception as e:
LoggerRoot.get_base_logger().warning("Failed deleting dataset: {}".format(str(e)))
return
for dataset_id in dataset_ids:
try:
dataset = Dataset.get(dataset_id=dataset_id)
except Exception as e:
LoggerRoot.get_base_logger().warning("Could not get dataset with ID {}: {}".format(dataset_id, str(e)))
continue
# noinspection PyProtectedMember
dataset._task.delete(delete_artifacts_and_models=delete_files)
if delete_external_files:
for external_file in dataset.link_entries:
if external_file.parent_dataset_id == dataset_id:
try:
helper = StorageHelper.get(external_file.link)
helper.delete(external_file.link)
except Exception as ex:
LoggerRoot.get_base_logger().warning(
"Failed deleting remote file '{}': {}".format(external_file.link, ex)
)
@classmethod
def rename(
cls,
new_dataset_name, # str
dataset_project, # str
dataset_name, # str
):
# type: (...) -> ()
"""
Rename the dataset.
:param new_dataset_name: The new name of the datasets to be renamed
:param dataset_project: The project the datasets to be renamed belongs to
:param dataset_name: The name of the datasets (before renaming)
"""
if not bool(Session.check_min_api_server_version(cls.__min_api_version)):
LoggerRoot.get_base_logger().warning(
"Could not rename dataset because API version < {}".format(cls.__min_api_version)
)
return
project, _ = cls._build_hidden_project_name(dataset_project, dataset_name)
new_project, _ = cls._build_hidden_project_name(dataset_project, new_dataset_name)
# noinspection PyProtectedMember
result = rename_project(Task._get_default_session(), project, new_project)
if not result:
LoggerRoot.get_base_logger().warning(
"Could not rename dataset with dataset_project={} dataset_name={}".format(dataset_project, dataset_name)
)
@classmethod
def _move_to_project_aux(cls, task, new_project, dataset_name):
"""
Move a task to another project. Helper function, useful when the task and name of
the corresponding dataset are known.
:param task: A dataset's task
:param new_project: New project to move the dataset to
:param dataset_name: Name of the dataset
:return: True if the dataset was moved and False otherwise
"""
hidden_dataset_project_, parent_project = cls._build_hidden_project_name(new_project, dataset_name)
get_or_create_project(task.session, project_name=parent_project, system_tags=[cls.__hidden_tag])
return task.move_to_project(new_project_name=hidden_dataset_project_, system_tags=[cls.__hidden_tag, cls.__tag])
@classmethod
def move_to_project(
cls,
new_dataset_project, # str
dataset_project, # str
dataset_name, # str
):
# type: (...) -> ()
"""
Move the dataset to another project.
:param new_dataset_project: New project to move the dataset(s) to
:param dataset_project: Project of the dataset(s) to move to new project
:param dataset_name: Name of the dataset(s) to move to new project
"""
if not bool(Session.check_min_api_server_version(cls.__min_api_version)):
LoggerRoot.get_base_logger().warning(
"Could not move dataset to another project because API version < {}".format(cls.__min_api_version)
)
return
# noinspection PyBroadException
try:
dataset_ids = cls._get_dataset_ids_respecting_params(
dataset_project=dataset_project,
dataset_name=dataset_name,
entire_dataset=True,
shallow_search=False,
force=True,
action="move",
)
except Exception as e:
LoggerRoot.get_base_logger().warning("Error: {}".format(str(e)))
return
for dataset_id in dataset_ids:
# noinspection PyBroadException
try:
dataset = cls.get(dataset_id=dataset_id, _dont_propulate_runtime_props=True)
except Exception:
dataset = None
if not dataset:
LoggerRoot.get_base_logger().warning("Could not find dataset to move to another project")
continue
cls._move_to_project_aux(dataset._task, new_dataset_project, dataset.name)
@classmethod
def get(
cls,
dataset_id=None, # type: Optional[str]
dataset_project=None, # type: Optional[str]
dataset_name=None, # type: Optional[str]
dataset_tags=None, # type: Optional[Sequence[str]]
only_completed=False, # type: bool
only_published=False, # type: bool
include_archived=False, # type: bool
auto_create=False, # type: bool
writable_copy=False, # type: bool
dataset_version=None, # type: Optional[str]
alias=None, # type: Optional[str]
overridable=False, # type: bool
shallow_search=False, # type: bool
**kwargs
):
# type: (...) -> "Dataset"
"""
Get a specific Dataset. If multiple datasets are found, the dataset with the
highest semantic version is returned. If no semantic version is found, the most recently
updated dataset is returned. This function raises an exception if no dataset
can be found and the ``auto_create=True`` flag is not set
:param dataset_id: Requested dataset ID
:param dataset_project: Requested dataset project name
:param dataset_name: Requested dataset name
:param dataset_tags: Requested dataset tags (list of tag strings)
:param only_completed: Return only if the requested dataset is completed or published
:param only_published: Return only if the requested dataset is published
:param include_archived: If True, include archived datasets as well
:param auto_create: Create a new dataset if it does not exist yet
:param writable_copy: Get a newly created mutable dataset with the current one as its parent,
so new files can be added to the instance.
:param dataset_version: Requested version of the Dataset
:param alias: Alias of the dataset. If set, the 'alias : dataset ID' key-value pair
will be set under the hyperparameters section 'Datasets'
:param overridable: If True, allow overriding the dataset ID with a given alias in the
hyperparameters section. Useful when one wants to change the dataset used when running
a task remotely. If the alias parameter is not set, this parameter has no effect
:param shallow_search: If True, search only the first 500 results (first page)
:return: Dataset object
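Example (a minimal usage sketch; the project, name and alias are placeholders)::

    ds = Dataset.get(
        dataset_project="data",
        dataset_name="mnist_clean",
        alias="training_data",
    )
    local_path = ds.get_local_copy()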
"""
system_tags = ["__$all", cls.__tag]
if not include_archived:
system_tags = ["__$all", cls.__tag, "__$not", "archived"]
if not any([dataset_id, dataset_project, dataset_name, dataset_tags]):
raise ValueError("Dataset selection criteria not met. Didn't provide id/name/project/tags correctly.")
current_task = Task.current_task()
if not alias and current_task:
LoggerRoot.get_base_logger().info(
"Dataset.get() did not specify alias. Dataset information "
"will not be automatically logged in ClearML Server.")
mutually_exclusive(dataset_id=dataset_id, dataset_project=dataset_project, _require_at_least_one=False)
mutually_exclusive(dataset_id=dataset_id, dataset_name=dataset_name, _require_at_least_one=False)
invalid_kwargs = [kwarg for kwarg in kwargs.keys() if not kwarg.startswith("_")]
if invalid_kwargs:
raise ValueError("Invalid 'Dataset.get' arguments: {}".format(invalid_kwargs))
def get_instance(dataset_id_):
task = Task.get_task(task_id=dataset_id_)
if task.status == "created":
raise ValueError("Dataset id={} is in draft mode, delete and recreate it".format(task.id))
force_download = task.status not in ("stopped", "published", "closed", "completed")
if cls.__state_entry_name in task.artifacts:
local_state_file = StorageManager.get_local_copy(
remote_url=task.artifacts[cls.__state_entry_name].url,
cache_context=cls.__cache_context,
extract_archive=False,
name=task.id,
force_download=force_download,
)
if not local_state_file:
raise ValueError("Could not load Dataset id={} state".format(task.id))
else:
# we could not find the serialized state, start empty
local_state_file = {}
instance_ = cls._deserialize(local_state_file, task)
# remove the local copy of the state file, just in case
if force_download and local_state_file:
os.unlink(local_state_file)
return instance_
def finish_dataset_get(dataset, orig_dataset_id):
# noinspection PyProtectedMember
dataset_id_ = dataset._id
if not current_task or kwargs.get("_dont_populate_runtime_props"):
return dataset
if alias:
# noinspection PyProtectedMember
current_task._set_parameters(
{"{}/{}".format(cls.__hyperparams_section, alias): dataset_id_}, __update=True
)
# noinspection PyProtectedMember
runtime_props = current_task._get_runtime_properties()
used_datasets = list(runtime_props.get(cls.__datasets_runtime_prop, []))
runtime_props_to_set = {}
if dataset_id_ not in used_datasets:
used_datasets.append(dataset_id_)
runtime_props_to_set.update({cls.__datasets_runtime_prop: used_datasets})
orig_dataset = get_instance(orig_dataset_id)
# noinspection PyProtectedMember
if orig_dataset._dataset_version:
runtime_props_to_set.update(
{
"{}.{}/{}".format(
cls.__orig_datasets_runtime_prop_prefix, orig_dataset.name, orig_dataset._dataset_version
): orig_dataset_id
}
)
else:
runtime_props_to_set.update(
{"{}.{}".format(cls.__orig_datasets_runtime_prop_prefix, orig_dataset.name): orig_dataset_id}
)
if runtime_props_to_set:
# noinspection PyProtectedMember
current_task._set_runtime_properties(runtime_props_to_set)
return dataset
if not dataset_id:
dataset_id, _ = cls._get_dataset_id(
dataset_project=dataset_project,
dataset_name=dataset_name,
dataset_version=dataset_version,
dataset_filter=dict(
tags=dataset_tags,
system_tags=system_tags,
type=[str(Task.TaskTypes.data_processing)],
status=["published"]
if only_published
else ["published", "completed", "closed"]
if only_completed
else None,
),
shallow_search=shallow_search
)
if not dataset_id and not auto_create:
raise ValueError(
"Could not find Dataset {} {}".format(
"id" if dataset_id else "project/name/version",
dataset_id if dataset_id else (dataset_project, dataset_name, dataset_version),
)
)
orig_dataset_id_ = dataset_id
if alias and overridable and running_remotely():
remote_task = Task.get_task(task_id=get_remote_task_id())
dataset_id = remote_task.get_parameter("{}/{}".format(cls.__hyperparams_section, alias))
if not dataset_id:
if not auto_create:
raise ValueError(
"Could not find Dataset {} {}".format(
"id" if dataset_id else "project/name/version",
dataset_id if dataset_id else (dataset_project, dataset_name, dataset_version),
)
)
instance = Dataset.create(
dataset_name=dataset_name, dataset_project=dataset_project, dataset_tags=dataset_tags
)
return finish_dataset_get(instance, instance._id)
instance = get_instance(dataset_id)
# Now we have the requested dataset, but if we want a mutable copy instead, we create a new dataset with the
# current one as its parent. So one can add files to it and finalize as a new version.
if writable_copy:
writeable_instance = Dataset.create(
dataset_name=instance.name,
dataset_project=instance.project,
dataset_tags=instance.tags,
parent_datasets=[instance.id],
)
return finish_dataset_get(writeable_instance, writeable_instance._id)
return finish_dataset_get(instance, orig_dataset_id_)
def get_logger(self):
# type: () -> Logger
"""
Return a Logger object for the Dataset, allowing users to report statistics metrics
and debug samples on the Dataset itself
:return: Logger object
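Example (an illustrative sketch; the reported title, series and value are placeholders)::

    logger = ds.get_logger()
    logger.report_scalar(title="stats", series="row_count", value=10000, iteration=0)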
"""
return self._task.get_logger()
def get_num_chunks(self, include_parents=True):
# type: (bool) -> int
"""
Return the number of chunks stored on this dataset
(see ``include_parents`` for whether chunks stored by parent versions are counted)
:param include_parents: If True (default),
return the total number of chunks from this version and all parent versions.
If False, only return the number of chunks stored on this specific version.
:return: Number of chunks stored on the dataset.
"""
if not include_parents:
return len(self._get_data_artifact_names())
return sum(self._get_dependency_chunk_lookup().values())
@classmethod
def squash(
cls,
dataset_name, # type: str
dataset_ids=None, # type: Optional[Sequence[Union[str, Dataset]]]
dataset_project_name_pairs=None, # type: Optional[Sequence[(str, str)]]
output_url=None, # type: Optional[str]
):
# type: (...) -> "Dataset"
"""
Generate a new dataset from the squashed set of dataset versions.
If a single version is given, it will squash to the root (i.e. create a single standalone version).
If a set of versions is given, it will squash the versions' diff into a single version.
:param dataset_name: Target name for the newly generated squashed dataset
:param dataset_ids: List of dataset IDs (or objects) to squash. Notice order does matter.
The versions are merged from first to last.
:param dataset_project_name_pairs: List of pairs (project_name, dataset_name) to squash.
Notice order does matter. The versions are merged from first to last.
:param output_url: Target storage for the compressed dataset (default: file server)
Examples: `s3://bucket/data`, `gs://bucket/data` , `azure://bucket/data` , `/mnt/share/data`
:return: Newly created dataset object.
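Example (an illustrative sketch; the dataset IDs are placeholders)::

    squashed_ds = Dataset.squash(
        dataset_name="mnist_squashed",
        dataset_ids=["<dataset_id_1>", "<dataset_id_2>"],
    )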
"""
mutually_exclusive(dataset_ids=dataset_ids, dataset_project_name_pairs=dataset_project_name_pairs)
datasets = [cls.get(dataset_id=d) for d in dataset_ids] if dataset_ids else \
[cls.get(dataset_project=pair[0], dataset_name=pair[1]) for pair in dataset_project_name_pairs]
# single dataset to squash, squash it all.
if len(datasets) == 1:
temp_folder = datasets[0].get_local_copy()
parents = set()
else:
parents = None
temp_folder = Path(mkdtemp(prefix='squash-datasets.'))
pool = ThreadPool()
for ds in datasets:
base_folder = Path(ds._get_dataset_files())
files = [f.relative_path for f in ds.file_entries if f.parent_dataset_id == ds.id]
pool.map(
lambda x:
(temp_folder / x).parent.mkdir(parents=True, exist_ok=True) or
shutil.copy((base_folder / x).as_posix(), (temp_folder / x).as_posix(), follow_symlinks=True),
files)
parents = set(ds._get_parents()) if parents is None else (parents & set(ds._get_parents()))
pool.close()
squashed_ds = cls.create(
dataset_project=datasets[0].project, dataset_name=dataset_name, parent_datasets=list(parents))
squashed_ds._task.get_logger().report_text('Squashing dataset', print_console=False)
squashed_ds.add_files(temp_folder)
for ds in datasets:
squashed_ds._dataset_link_entries.update(ds._dataset_link_entries)
squashed_ds.upload(output_url=output_url)
squashed_ds.finalize()
return squashed_ds
@classmethod
def list_datasets(
cls,
dataset_project=None, # type: Optional[str]
partial_name=None, # type: Optional[str]
tags=None, # type: Optional[Sequence[str]]
ids=None, # type: Optional[Sequence[str]]
only_completed=True, # type: bool
recursive_project_search=True, # type: bool
):
# type: (...) -> List[dict]
"""
Query the list of datasets in the system
:param dataset_project: Specify dataset project name
:param partial_name: Specify partial match to a dataset name
:param tags: Specify user tags
:param ids: List specific dataset based on IDs list
:param only_completed: If False, return datasets that are still in progress (uploading/edited etc.)
:param recursive_project_search: If True and the `dataset_project` argument is set,
search inside subprojects as well.
If False, don't search inside subprojects (except for the special `.datasets` subproject)
:return: List of dictionaries with dataset information
Example: [{'name': name, 'project': project name, 'id': dataset_id, 'created': date_created},]
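A minimal query sketch (the project name is a placeholder)::

    for ds_info in Dataset.list_datasets(dataset_project="data", only_completed=True):
        print(ds_info["id"], ds_info["name"], ds_info["created"])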
"""
if dataset_project:
if not recursive_project_search:
dataset_projects = [
exact_match_regex(dataset_project),
"^{}/\\.datasets/.*".format(re.escape(dataset_project)),
]
else:
dataset_projects = [exact_match_regex(dataset_project), "^{}/.*".format(re.escape(dataset_project))]
else:
dataset_projects = None
# noinspection PyProtectedMember
datasets = Task._query_tasks(
task_ids=ids or None,
project_name=dataset_projects,
task_name=partial_name,
system_tags=[cls.__tag],
type=[str(Task.TaskTypes.data_processing)],
tags=tags or None,
status=["stopped", "published", "completed", "closed"] if only_completed else None,
only_fields=["created", "id", "name", "project", "tags"],
search_hidden=True,
exact_match_regex_flag=False,
_allow_extra_fields_=True,
)
project_ids = {d.project for d in datasets}
# noinspection PyProtectedMember
project_id_lookup = Task._get_project_names(list(project_ids))
return [
{
"name": d.name,
"created": d.created,
"project": cls._remove_hidden_part_from_dataset_project(project_id_lookup[d.project]),
"id": d.id,
"tags": d.tags,
}
for d in datasets
]
def _add_files(
self,
path, # type: Union[str, Path, _Path]
wildcard=None, # type: Optional[Union[str, Sequence[str]]]
local_base_folder=None, # type: Optional[str]
dataset_path=None, # type: Optional[str]
recursive=True, # type: bool
verbose=False, # type: bool
max_workers=None, # type: Optional[int]
):
# type: (...) -> tuple[int, int]
"""
Add a folder to the current dataset. Calculate the file hashes,
compare them against the parent dataset, and mark files to be uploaded
:param path: Add a folder/file to the dataset
:param wildcard: add only a specific set of files.
Wildcard matching; can be a single string or a list of wildcards
:param local_base_folder: files will be located based on their relative path from local_base_folder
:param dataset_path: where in the dataset the folder/files should be located
:param recursive: If True, match all wildcard files recursively
:param verbose: If True, print to console added files
:param max_workers: The number of threads to add the files with. Defaults to the number of logical cores
"""
max_workers = max_workers or psutil.cpu_count()
if dataset_path:
dataset_path = dataset_path.lstrip("/")
path = Path(path)
local_base_folder = Path(local_base_folder or path)
wildcard = wildcard or ["*"]
if isinstance(wildcard, str):
wildcard = [wildcard]
# single file, no need for threading
if path.is_file():
if not local_base_folder.is_dir():
local_base_folder = local_base_folder.parent
file_entry = self._calc_file_hash(
FileEntry(local_path=path.absolute().as_posix(),
relative_path=(Path(dataset_path or '.') / path.relative_to(local_base_folder)).as_posix(),
parent_dataset_id=self._id))
file_entries = [file_entry]
else:
# if not a folder raise exception
if not path.is_dir():
raise ValueError("Could not find file/folder \'{}\'", path.as_posix())
# prepare a list of files
file_entries = []
for w in wildcard:
files = list(path.rglob(w)) if recursive else list(path.glob(w))
file_entries.extend([f for f in files if f.is_file()])
file_entries = list(set(file_entries))
file_entries = [
FileEntry(
parent_dataset_id=self._id,
local_path=f.absolute().as_posix(),
relative_path=(Path(dataset_path or ".") / f.relative_to(local_base_folder)).as_posix(),
)
for f in file_entries
]
self._task.get_logger().report_text('Generating SHA2 hash for {} files'.format(len(file_entries)))
pool = ThreadPool(max_workers)
try:
import tqdm # noqa
for _ in tqdm.tqdm(pool.imap_unordered(self._calc_file_hash, file_entries), total=len(file_entries)):
pass
except ImportError:
pool.map(self._calc_file_hash, file_entries)
pool.close()
self._task.get_logger().report_text('Hash generation completed')
# Get modified files, files with the same filename but a different hash
filename_hash_dict = {fe.relative_path: fe.hash for fe in file_entries}
modified_count = len([k for k, v in self._dataset_file_entries.items()
if k in filename_hash_dict and v.hash != filename_hash_dict[k]])
# merge back into the dataset
count = 0
for f in file_entries:
ds_cur_f = self._dataset_file_entries.get(f.relative_path)
if not ds_cur_f:
if (
f.relative_path in self._dataset_link_entries
and f.size == self._dataset_link_entries[f.relative_path].size
):
continue
if verbose:
self._task.get_logger().report_text('Add {}'.format(f.relative_path))
self._dataset_file_entries[f.relative_path] = f
if f.relative_path not in self._dataset_link_entries:
count += 1
elif ds_cur_f.hash != f.hash:
if verbose:
self._task.get_logger().report_text('Modified {}'.format(f.relative_path))
self._dataset_file_entries[f.relative_path] = f
count += 1
elif f.parent_dataset_id == self._id and ds_cur_f.parent_dataset_id == self._id:
# check if we have the file in an already uploaded chunk
if ds_cur_f.local_path is None:
# skipping, already uploaded.
if verbose:
self._task.get_logger().report_text('Skipping {}'.format(f.relative_path))
else:
# if we never uploaded it, mark for upload
if verbose:
self._task.get_logger().report_text('Re-Added {}'.format(f.relative_path))
self._dataset_file_entries[f.relative_path] = f
count += 1
else:
if verbose:
self._task.get_logger().report_text('Unchanged {}'.format(f.relative_path))
# We don't count the modified files as added files
self.update_changed_files(num_files_added=count - modified_count, num_files_modified=modified_count)
return count - modified_count, modified_count
def _update_dependency_graph(self):
"""
Update the dependency graph based on the current self._dataset_file_entries state
:return:
"""
# collect all dataset versions
used_dataset_versions = set(f.parent_dataset_id for f in self._dataset_file_entries.values())
used_dataset_versions.add(self._id)
current_parents = self._dependency_graph.get(self._id) or []
# remove parent versions we no longer need from the graph;
# per version, drop parent versions that are no longer referenced
self._dependency_graph = {
k: [p for p in parents or [] if p in used_dataset_versions]
for k, parents in self._dependency_graph.items() if k in used_dataset_versions}
# make sure we do not remove our parents, for genealogy's sake
self._dependency_graph[self._id] = current_parents
def _serialize(self, update_dependency_chunk_lookup=False):
# type: (bool) -> ()
"""
Store the current state of the Dataset as a task artifact, for later deserialization
:param update_dependency_chunk_lookup: If True, update the parent versions' number of chunks
"""
self._update_dependency_graph()
total_size = 0
added_files_count = 0
added_files_size = 0
modified_files_count = 0
modified_files_size = 0
removed_files_count = 0
removed_files_size = 0
def update_changes(entries, parent_entries):
nonlocal total_size
nonlocal modified_files_count
nonlocal modified_files_size
nonlocal added_files_count
nonlocal added_files_size
nonlocal removed_files_count
nonlocal removed_files_size
for file in entries.values():
# noinspection PyBroadException
try:
total_size += file.size
if file.parent_dataset_id == self._id:
if file.relative_path in parent_entries:
modified_files_count += 1
modified_files_size += file.size - parent_entries[file.relative_path].size
else:
added_files_count += 1
added_files_size += file.size
except Exception:
pass
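# files that appear in a parent version but not in this one were removed;
# note that removed_files_size accumulates a negative delta, so the final
# "ds_change_size" runtime property reflects the net size change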
for parent_entry_key, parent_entry_value in parent_entries.items():
# noinspection PyBroadException
try:
if parent_entry_key not in entries:
removed_files_count += 1
removed_files_size -= parent_entry_value.size
except Exception:
pass
parent_datasets_ids = self._dependency_graph[self._id]
parent_file_entries = dict() # type: Dict[str, FileEntry]
parent_link_entries = dict() # type: Dict[str, LinkEntry]
for parent_dataset_id in parent_datasets_ids:
if parent_dataset_id == self._id:
continue
parent_dataset = self.get(parent_dataset_id)
parent_file_entries.update(parent_dataset._dataset_file_entries)
parent_link_entries.update(parent_dataset._dataset_link_entries)
# we have to do this after we update the parent_file_entries because we might
# have duplicate file entries
update_changes(self._dataset_file_entries, parent_file_entries)
update_changes(self._dataset_link_entries, parent_link_entries)
state = dict(
file_count=len(self._dataset_file_entries) + len(self._dataset_link_entries),
total_size=total_size,
dataset_file_entries=[f.as_dict() for f in self._dataset_file_entries.values()],
dataset_link_entries=[link.as_dict() for link in self._dataset_link_entries.values()],
dependency_graph=self._dependency_graph,
id=self._id,
dirty=self._dirty,
)
if update_dependency_chunk_lookup:
state["dependency_chunk_lookup"] = self._build_dependency_chunk_lookup()
preview = (
"Dataset state\n"
"Files added/modified: {0} - total size {1}\n"
"Current dependency graph: {2}\n".format(
modified_files_count + added_files_count,
format_size(
added_files_size + modified_files_size,
binary=True,
use_nonbinary_notation=True,
use_b_instead_of_bytes=True,
),
json.dumps(self._dependency_graph, indent=2, sort_keys=True),
)
)
# store as artifact of the Task and add the amount of files added or removed as metadata, so we can use those
# later to create the table
self._task.upload_artifact(
name=self.__state_entry_name,
artifact_object=state,
preview=preview,
wait_on_upload=True,
metadata=self.changed_files,
)
self._ds_total_size = total_size
# noinspection PyProtectedMember
self._task._set_runtime_properties(
{
"ds_file_count": len(self._dataset_file_entries),
"ds_link_count": len(self._dataset_link_entries),
"ds_total_size": self._ds_total_size,
"ds_total_size_compressed": self._ds_total_size_compressed,
"ds_change_add": added_files_count,
"ds_change_remove": removed_files_count,
"ds_change_modify": modified_files_count,
"ds_change_size": added_files_size + modified_files_size + removed_files_size,
}
)
def update_changed_files(self, num_files_added=None, num_files_modified=None, num_files_removed=None):
"""
Update the internal state keeping track of added, modified and removed files.
:param num_files_added: Number of files added when compared to the parent dataset
:param num_files_modified: Number of files with the same name but a different hash when
compared to the parent dataset
:param num_files_removed: Number of files removed when compared to the parent dataset
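Example (an illustrative sketch, e.g. after manually accounting for synced files)::

    ds.update_changed_files(num_files_added=1, num_files_modified=2)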
"""
if num_files_added:
self.changed_files["files added"] += num_files_added
if num_files_removed:
self.changed_files["files removed"] += num_files_removed
if num_files_modified:
self.changed_files["files modified"] += num_files_modified
def _download_dataset_archives(self):
"""
Download the dataset archives, and return the paths to the locally stored zip files
:return: List of paths to locally stored zip files
"""
pass # TODO: implement
def _get_dataset_files(
self,
force=False,
selected_chunks=None,
lock_target_folder=False,
cleanup_target_folder=True,
target_folder=None,
max_workers=None
):
# type: (bool, Optional[List[int]], bool, bool, Optional[Path], Optional[int]) -> str
"""
First, extracts the archive present on the ClearML server containing this dataset's files.
Then, downloads the remote files. Note that if a remote file was added to the ClearML server, then
it won't be downloaded from the remote storage unless it is added again using
Dataset.add_external_files().
:param force: If True, extract dataset content even if target folder exists and is not empty
:param selected_chunks: Optional, if provided only download the selected chunks (index) of the Dataset.
Example: Assuming 8 chunks on this version
selected_chunks=[0,1,2]
:param lock_target_folder: If True, lock the target folder so the next cleanup will not delete it.
Notice you should unlock it manually, or wait for the process to finish for auto unlocking.
:param cleanup_target_folder: If True, remove target folder recursively
:param target_folder: If provided, use the specified target folder; by default, auto-generate one from the Dataset ID.
:param max_workers: Number of threads to be spawned when getting dataset files. Defaults
to the number of logical cores.
:return: Path to the local storage where the data was downloaded
"""
max_workers = max_workers or psutil.cpu_count()
local_folder = self._extract_dataset_archive(
force=force,
selected_chunks=selected_chunks,
lock_target_folder=lock_target_folder,
cleanup_target_folder=cleanup_target_folder,
target_folder=target_folder,
max_workers=max_workers
)
self._download_external_files(
target_folder=target_folder, lock_target_folder=lock_target_folder
)
return local_folder
def _download_external_files(
self, target_folder=None, lock_target_folder=False
):
# type: (Optional[Union[Path, str]], bool) -> None
"""
Downloads external files in the dataset. These files will be downloaded
at relative_path (the path relative to the target_folder). Note that
the download will not overwrite any existing files. Hence, if the file
was already downloaded from the ClearML server, it will not be overwritten.
:param target_folder: If provided, use the specified target folder; by default, auto-generate one from the Dataset ID.
:param lock_target_folder: If True, lock the target folder so the next cleanup will not delete it.
Notice you should unlock it manually, or wait for the process to finish for auto unlocking.
"""
target_folder = (
Path(target_folder)
if target_folder
else self._create_ds_target_folder(
lock_target_folder=lock_target_folder
)[0]
).as_posix()
dependencies = self._get_dependencies_by_order(
include_unused=False, include_current=True
)
links = {}
for dependency in dependencies:
ds = Dataset.get(dependency)
links.update(ds._dataset_link_entries)
links.update(self._dataset_link_entries)
for relative_path, link in links.items():
target_path = os.path.join(target_folder, relative_path)
if os.path.exists(target_path):
LoggerRoot.get_base_logger().info(
"{} already exists. Skipping downloading {}".format(
target_path, link
)
)
continue
ok = False
error = None
try:
helper = StorageHelper.get(link.link)
ok = helper.download_to_file(
link.link,
target_path,
overwrite_existing=False,
verbose=False,
direct_access=False,
silence_errors=True
)
except Exception as e:
error = e
if not ok:
log_string = "Failed downloading {}".format(link.link)
if error:
log_string += " Error is '{}'".format(error)
LoggerRoot.get_base_logger().info(log_string)
else:
link.size = Path(target_path).stat().st_size
def _extract_dataset_archive(
self,
force=False,
selected_chunks=None,
lock_target_folder=False,
cleanup_target_folder=True,
target_folder=None,
max_workers=None
):
# type: (bool, Optional[List[int]], bool, bool, Optional[Path], Optional[int]) -> str
"""
Download the dataset archive, and extract the zip content to a cached folder.
Notice no merging is done.
:param force: If True, extract dataset content even if target folder exists and is not empty
:param selected_chunks: Optional, if provided only download the selected chunks (index) of the Dataset.
Example: Assuming 8 chunks on this version
selected_chunks=[0,1,2]
:param lock_target_folder: If True, lock the target folder so the next cleanup will not delete it.
Notice you should unlock it manually, or wait for the process to finish for auto unlocking.
:param cleanup_target_folder: If True, remove target folder recursively
:param target_folder: If provided, use the specified target folder; by default, auto-generate one from the Dataset ID.
:param max_workers: Number of threads to be spawned when downloading and extracting the archives
:return: Path to a local storage extracted archive
"""
assert selected_chunks is None or isinstance(selected_chunks, (list, tuple))
if not self._task:
self._task = Task.get_task(task_id=self._id)
max_workers = max_workers or psutil.cpu_count()
data_artifact_entries = self._get_data_artifact_names()
if selected_chunks is not None and data_artifact_entries:
data_artifact_entries = [
d for d in data_artifact_entries
if self._get_chunk_idx_from_artifact_name(d) in selected_chunks]
# get the target folder (from the cache manager, unless explicitly provided)
local_folder = Path(target_folder) if target_folder else \
self._create_ds_target_folder(lock_target_folder=lock_target_folder)[0]
# check if we have a dataset with empty change set
if not data_artifact_entries:
return local_folder.as_posix()
# check if target folder is not empty
if not force and next(local_folder.glob('*'), None):
return local_folder.as_posix()
# if we got here, we need to clear the target folder
local_folder = local_folder.as_posix()
if cleanup_target_folder:
shutil.rmtree(local_folder, ignore_errors=True)
# verify target folder exists
Path(local_folder).mkdir(parents=True, exist_ok=True)
def _download_part(data_artifact_name):
# download the dataset zip
local_zip = StorageManager.get_local_copy(
remote_url=self._task.artifacts[data_artifact_name].url, cache_context=self.__cache_context,
extract_archive=False, name=self._id)
if not local_zip:
raise ValueError("Could not download dataset id={} entry={}".format(self._id, data_artifact_name))
return local_zip
def _extract_part(local_zip, data_artifact_name):
# noinspection PyProtectedMember
StorageManager._extract_to_cache(
cached_file=local_zip, name=self._id,
cache_context=self.__cache_context, target_folder=local_folder, force=True)
# noinspection PyBroadException
try:
# do not delete files we accessed directly
url = self._task.artifacts[data_artifact_name].url
helper = StorageHelper.get(url)
if helper.get_driver_direct_access(url) is None:
Path(local_zip).unlink()
except Exception:
pass
with ThreadPoolExecutor(max_workers=max_workers) as pool:
for d in data_artifact_entries:
local_zip = _download_part(d)
pool.submit(_extract_part, local_zip, d)
return local_folder
def _create_ds_target_folder(self, part=None, num_parts=None, lock_target_folder=True):
# type: (Optional[int], Optional[int], bool) -> Tuple[Path, CacheManager.CacheContext]
cache = CacheManager.get_cache_manager(cache_context=self.__cache_context)
local_folder = Path(cache.get_cache_folder()) / self._get_cache_folder_name(part=part, num_parts=num_parts)
if lock_target_folder:
cache.lock_cache_folder(local_folder)
local_folder.mkdir(parents=True, exist_ok=True)
return local_folder, cache
def _release_lock_ds_target_folder(self, target_folder):
# type: (Union[str, Path]) -> None
cache = CacheManager.get_cache_manager(cache_context=self.__cache_context)
cache.unlock_cache_folder(target_folder)
def _get_data_artifact_names(self):
# type: () -> List[str]
data_artifact_entries = [
a for a in self._task.artifacts
if a and (a == self.__default_data_entry_name or str(a).startswith(self.__data_entry_name_prefix))]
return data_artifact_entries
def _get_next_data_artifact_name(self, last_artifact_name=None):
# type: (Optional[str]) -> str
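# data artifacts are named 'data' for the first chunk, then 'data_001', 'data_002', etc.
# given the existing artifact names (or the last one used), return the next name in the sequence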
if not last_artifact_name:
data_artifact_entries = self._get_data_artifact_names()
if len(data_artifact_entries) < 1:
return self.__default_data_entry_name
else:
data_artifact_entries = [last_artifact_name]
prefix = self.__data_entry_name_prefix
prefix_len = len(prefix)
numbers = sorted([int(a[prefix_len:]) for a in data_artifact_entries if a.startswith(prefix)])
return '{}{:03d}'.format(prefix, numbers[-1]+1 if numbers else 1)
def _merge_datasets(self, use_soft_links=None, raise_on_error=True, part=None, num_parts=None, max_workers=None):
# type: (bool, bool, Optional[int], Optional[int], Optional[int]) -> str
"""
download and copy / soft-link, files from all the parent dataset versions
:param use_soft_links: If True, use soft links; default is False on Windows and True on POSIX systems
:param raise_on_error: If True raise exception if dataset merging failed on any file
:param part: Optional, if provided only download the selected part (index) of the Dataset.
Notice, if `num_parts` is not provided, number of parts will be equal to the number of chunks.
This argument is passed to parent versions, as well as the implicit `num_parts`,
allowing users to get a partial copy of the entire dataset, for multi node/step processing.
:param num_parts: Optional, if specified, normalize the number of chunks stored to the
requested number of parts. Notice that the actual chunks used per part are rounded down.
Example: Assuming 8 chunks on this version and `num_parts=5`, the chunk indexes used per part would be:
part=0 -> chunks[0,1], part=1 -> chunks[2,3], part=2 -> chunks[4,5], part=3 -> chunks[6], part=4 -> chunks[7]
:param max_workers: Number of threads to be spawned when merging datasets. Defaults to the number
of logical cores.
:return: the target folder
"""
assert part is None or (isinstance(part, int) and part >= 0)
assert num_parts is None or (isinstance(num_parts, int) and num_parts >= 1)
max_workers = max_workers or psutil.cpu_count()
if use_soft_links is None:
use_soft_links = False if is_windows() else True
if part is not None and not num_parts:
num_parts = self.get_num_chunks()
# just create the dataset target folder
target_base_folder, _ = self._create_ds_target_folder(
part=part, num_parts=num_parts, lock_target_folder=True)
# select specific chunks if `part` was passed
chunk_selection = None if part is None else self._build_chunk_selection(part=part, num_parts=num_parts)
# check if target folder is not empty, see if it contains everything we need
if target_base_folder and next(target_base_folder.iterdir(), None):
if self._verify_dataset_folder(target_base_folder, part, chunk_selection):
target_base_folder.touch()
self._release_lock_ds_target_folder(target_base_folder)
return target_base_folder.as_posix()
else:
LoggerRoot.get_base_logger().info('Dataset needs refreshing, fetching all parent datasets')
# we should delete the entire cache folder
shutil.rmtree(target_base_folder.as_posix())
# make sure we recreate the dataset target folder
target_base_folder.mkdir(parents=True, exist_ok=True)
# get the dataset dependencies (if `part` was passed, only select the ones in the selected part)
dependencies_by_order = self._get_dependencies_by_order(include_unused=False, include_current=True) \
if chunk_selection is None else list(chunk_selection.keys())
# first get our dataset
if self._id in dependencies_by_order:
self._get_dataset_files(
force=True,
selected_chunks=chunk_selection.get(self._id) if chunk_selection else None,
cleanup_target_folder=True,
target_folder=target_base_folder,
max_workers=max_workers
)
dependencies_by_order.remove(self._id)
# update target folder timestamp
target_base_folder.touch()
# if we have no dependencies, we can just return now
if not dependencies_by_order:
self._release_lock_ds_target_folder(target_base_folder)
return target_base_folder.absolute().as_posix()
# extract parent datasets
self._extract_parent_datasets(
target_base_folder=target_base_folder, dependencies_by_order=dependencies_by_order,
chunk_selection=chunk_selection, use_soft_links=use_soft_links,
raise_on_error=False, force=False)
# verify entire dataset (if failed, force downloading parent datasets)
if not self._verify_dataset_folder(target_base_folder, part, chunk_selection):
LoggerRoot.get_base_logger().info('Dataset parents need refreshing, re-fetching all parent datasets')
# we should delete the entire cache folder
self._extract_parent_datasets(
target_base_folder=target_base_folder, dependencies_by_order=dependencies_by_order,
chunk_selection=chunk_selection, use_soft_links=use_soft_links,
raise_on_error=raise_on_error, force=True)
self._release_lock_ds_target_folder(target_base_folder)
return target_base_folder.absolute().as_posix()
def _get_dependencies_by_order(self, include_unused=False, include_current=True):
# type: (bool, bool) -> List[str]
"""
Return the dataset dependencies in order of application (from the earliest parent to the current version)
:param include_unused: If True include unused datasets in the dependencies
:param include_current: If True include the current dataset ID as the last ID in the list
:return: List of dataset IDs (str)
"""
roots = [self._id]
dependencies = []
# noinspection DuplicatedCode
while roots:
r = roots.pop(0)
if r not in dependencies:
dependencies.append(r)
# add the parents of the current node, only if the parents are in the general graph node list
if include_unused and r not in self._dependency_graph:
roots.extend(list(reversed(
[p for p in (self.get(dataset_id=r)._get_parents() or []) if p not in roots])))
else:
roots.extend(list(reversed(
[p for p in (self._dependency_graph.get(r) or [])
if p not in roots and (include_unused or (p in self._dependency_graph))])))
# make sure we cover leftovers
leftovers = set(self._dependency_graph.keys()) - set(dependencies)
if leftovers:
roots = list(leftovers)
# noinspection DuplicatedCode
while roots:
r = roots.pop(0)
if r not in dependencies:
dependencies.append(r)
# add the parents of the current node, only if the parents are in the general graph node list
if include_unused and r not in self._dependency_graph:
roots.extend(list(reversed(
[p for p in (self.get(dataset_id=r)._get_parents() or []) if p not in roots])))
else:
roots.extend(list(reversed(
[p for p in (self._dependency_graph.get(r) or [])
if p not in roots and (include_unused or (p in self._dependency_graph))])))
# skip our id
dependencies = list(reversed(dependencies[1:]))
return (dependencies + [self._id]) if include_current else dependencies
def _get_parents(self):
# type: () -> Sequence[str]
"""
Return a list of direct parent datasets (str)
:return: list of dataset ids
"""
return self._dependency_graph[self.id]
@classmethod
def _deserialize(cls, stored_state, task):
# type: (Union[dict, str, Path, _Path], Task) -> "Dataset"
"""
Reload a dataset state from the stored_state object
:param stored_state: Serialized state dict, or a path to a JSON file containing it
:param task: Task object associated with the dataset
:return: A Dataset object
"""
assert isinstance(stored_state, (dict, str, Path, _Path))
if isinstance(stored_state, (str, Path, _Path)):
stored_state_file = Path(stored_state).as_posix()
with open(stored_state_file, 'rt') as f:
stored_state = json.load(f)
instance = cls(_private=cls.__private_magic, task=task)
# assert instance._id == stored_state['id'] # They should match
instance._dependency_graph = stored_state.get('dependency_graph', {})
instance._dirty = stored_state.get('dirty', False)
instance._dataset_file_entries = {
s["relative_path"]: FileEntry(**s) for s in stored_state.get("dataset_file_entries", [])
}
instance._dataset_link_entries = {
s["relative_path"]: LinkEntry(**s) for s in stored_state.get("dataset_link_entries", [])
}
if stored_state.get('dependency_chunk_lookup') is not None:
instance._dependency_chunk_lookup = stored_state.get('dependency_chunk_lookup')
# update the last used artifact (remove the ones we never serialized, they are considered broken)
if task.status in ('in_progress', 'created', 'stopped'):
artifact_names = set([
a.artifact_name for a in instance._dataset_file_entries.values()
if a.artifact_name and a.parent_dataset_id == instance._id])
missing_artifact_name = set(instance._get_data_artifact_names()) - artifact_names
if missing_artifact_name:
instance._task._delete_artifacts(list(missing_artifact_name))
# if we removed any data artifact, update the next data artifact name
instance._data_artifact_name = instance._get_next_data_artifact_name()
return instance
@staticmethod
def _calc_file_hash(file_entry):
# calculate hash
file_entry.hash, _ = sha256sum(file_entry.local_path)
file_entry.size = Path(file_entry.local_path).stat().st_size
return file_entry
@classmethod
def _get_dataset_id_hash(cls, dataset_id):
# type: (str) -> str
"""
Return hash used to search for the dataset id in text fields.
This is not a strong hash and is used only for defining dependencies.
:param dataset_id:
:return:
"""
return 'dsh{}'.format(md5text(dataset_id))
def _build_dependency_chunk_lookup(self):
# type: () -> Dict[str, int]
"""
Build the dependency dataset id to number-of-chunks, lookup table
:return: lookup dictionary from dataset-id to number of chunks
"""
# with ThreadPool() as pool:
# chunks_lookup = pool.map(
# lambda d: (d, Dataset.get(dataset_id=d).get_num_chunks()),
# self._dependency_graph.keys())
# return dict(chunks_lookup)
chunks_lookup = map(
lambda d: (d, (self if d == self.id else Dataset.get(dataset_id=d)).get_num_chunks(include_parents=False)),
self._dependency_graph.keys())
return dict(chunks_lookup)
def _get_cache_folder_name(self, part=None, num_parts=None):
# type: (Optional[int], Optional[int]) -> str
if part is None:
return '{}{}'.format(self.__cache_folder_prefix, self._id)
return '{}{}_{}_{}'.format(self.__cache_folder_prefix, self._id, part, num_parts)
def _add_script_call(self, func_name, **kwargs):
# type: (str, **Any) -> ()
# if we never created the Task, we should not add the script calls
if not self._created_task:
return
args = ', '.join('\n {}={}'.format(k, '\''+str(v)+'\'' if isinstance(v, (str, Path, _Path)) else v)
for k, v in kwargs.items())
if args:
args += '\n'
line = 'ds.{}({})\n'.format(func_name, args)
self._task.data.script.diff += line
# noinspection PyProtectedMember
self._task._edit(script=self._task.data.script)
def _report_dataset_genealogy(self):
sankey_node = dict(
label=[],
color=[],
customdata=[],
hovertemplate='%{customdata}<extra></extra>',
hoverlabel={"align": "left"},
)
sankey_link = dict(
source=[],
target=[],
value=[],
hovertemplate='<extra></extra>',
)
# get DAG nodes
nodes = self._get_dependencies_by_order(include_unused=True, include_current=True)
# dataset name lookup
# noinspection PyProtectedMember
node_names = {
t.id: t.name
for t in Task._query_tasks(
task_ids=nodes, only_fields=["id", "name"], search_hidden=True, _allow_extra_fields_=True
)
}
node_details = {}
# Generate table and details
table_values = [["Dataset id", "name", "removed", "modified", "added", "size"]]
for node in nodes:
count = 0
size = 0
for f in list(self._dataset_file_entries.values()) + list(self._dataset_link_entries.values()):
if f.parent_dataset_id == node:
count += 1
size += f.size
# State is of type clearml.binding.artifacts.Artifact
node_task = Task.get_task(task_id=node)
node_state_metadata = node_task.artifacts.get('state').metadata
# Backwards compatibility, if the task was made before the new table change, just use the old system
if not node_state_metadata:
node_dataset = Dataset.get(dataset_id=node)
removed = len(node_dataset.list_removed_files())
added = len(node_dataset.list_added_files())
modified = len(node_dataset.list_modified_files())
else:
# TODO: if new system is prevalent, get rid of old system
removed = int(node_state_metadata.get('files removed', 0))
added = int(node_state_metadata.get('files added', 0))
modified = int(node_state_metadata.get('files modified', 0))
table_values += [
[
node,
node_names.get(node, ""),
removed,
modified,
added,
format_size(size, binary=True, use_nonbinary_notation=True, use_b_instead_of_bytes=True),
]
]
node_details[node] = [
removed,
modified,
added,
format_size(size, binary=True, use_nonbinary_notation=True, use_b_instead_of_bytes=True),
]
# create DAG
visited = []
# add nodes
for idx, node in enumerate(nodes):
visited.append(node)
sankey_node['color'].append("mediumpurple" if node == self.id else "lightblue")
sankey_node['label'].append('{}'.format(node))
sankey_node['customdata'].append(
"name {}<br />removed {}<br />modified {}<br />added {}<br />size {}".format(
node_names.get(node, ''), *node_details[node]))
# add edges
for idx, node in enumerate(nodes):
if node in self._dependency_graph:
parents = [visited.index(p) for p in self._dependency_graph[node] or [] if p in visited]
else:
parents = [visited.index(p) for p in self.get(dataset_id=node)._get_parents() or [] if p in visited]
for p in parents:
sankey_link['source'].append(p)
sankey_link['target'].append(idx)
sankey_link['value'].append(max(1, node_details[visited[p]][-2]))
if len(nodes) > 1:
# create the sankey graph
dag_flow = dict(
link=sankey_link,
node=sankey_node,
textfont=dict(color='rgba(0,0,0,255)', size=10),
type='sankey',
orientation='h'
)
fig = dict(data=[dag_flow], layout={'xaxis': {'visible': False}, 'yaxis': {'visible': False}})
elif len(nodes) == 1:
# hack, show single node sankey
singles_flow = dict(
x=list(range(len(nodes))), y=[1] * len(nodes),
text=sankey_node['label'],
customdata=sankey_node['customdata'],
mode='markers',
hovertemplate='%{customdata}<extra></extra>',
marker=dict(
color=sankey_node['color'],
size=[40] * len(nodes),
),
showlegend=False,
type='scatter',
)
# only single nodes
fig = dict(data=[singles_flow], layout={
'hovermode': 'closest', 'xaxis': {'visible': False}, 'yaxis': {'visible': False}})
else:
fig = None
# report genealogy
if fig:
self._task.get_logger().report_plotly(title="__Dataset Genealogy", series="", iteration=0, figure=fig)
# report detailed table
self._task.get_logger().report_table(
title="__Dataset Summary",
series="Details",
iteration=0,
table_plot=table_values,
extra_layout={"title": "Files by parent dataset id"},
)
# report the detailed content of the dataset as configuration,
# this allows for easy version comparison in the UI
dataset_details = ""
preview_index = 0
file_entries = sorted(list(self._dataset_file_entries.values())) + sorted(
list(self._dataset_link_entries.values()), key=lambda x: x.link
)
while preview_index < self.__preview_max_file_entries and preview_index < len(file_entries):
file = file_entries[preview_index]
if dataset_details:
dataset_details += "\n"
file_name = file.relative_path
if hasattr(file, "link"):
file_name = file.link
dataset_details += "{}, {}, {}".format(
file_name,
file.size if file.size is not None else "",
file.hash if file.hash else "",
)
preview_index += 1
if not self._ds_total_size:
self._report_dataset_struct()
if not self._dataset_link_entries:
dataset_details = (
"File Name ({} files), File Size (total {}), Hash (SHA2)\n".format(
len(self._dataset_file_entries),
format_size(
self._ds_total_size, binary=True, use_nonbinary_notation=True, use_b_instead_of_bytes=True
),
)
+ dataset_details
)
else:
dataset_details = (
"File Name ({} files + {} links), File Size (total {}), Hash (SHA2)\n".format(
len(self._dataset_file_entries),
len(self._dataset_link_entries),
format_size(
self._ds_total_size, binary=True, use_nonbinary_notation=True, use_b_instead_of_bytes=True
),
)
+ dataset_details
)
# noinspection PyProtectedMember
self._task._set_configuration(
name="Dataset Content",
description="Dataset content preview",
config_type="CSV",
config_text=dataset_details,
)
def _report_dataset_struct(self):
self._update_dependency_graph()
current_index = 0
dataset_struct = {}
indices = {}
dependency_graph_ex_copy = deepcopy(self._dependency_graph)
# Make sure that if we reference a node as a parent, it exists on the DAG itself
for parents in self._dependency_graph.values():
for parent in parents:
if parent not in self._dependency_graph:
dependency_graph_ex_copy[parent] = []
# get data from the parent versions
dependency_graph_ex = {}
while dependency_graph_ex_copy:
id_, parents = dependency_graph_ex_copy.popitem()
dependency_graph_ex[id_] = parents
task = Task.get_task(task_id=id_)
dataset_struct_entry = {"job_id": id_, "status": task.status}
# noinspection PyProtectedMember
last_update = task._get_last_update()
if last_update:
last_update = calendar.timegm(last_update.timetuple())
# fetch the parents of this version (task) based on what we have on the Task itself.
# noinspection PyBroadException
try:
dataset_version_node = task.get_configuration_object_as_dict("Dataset Struct")
# find the one that is us
for node in dataset_version_node.values():
if node["job_id"] != id_:
continue
for parent in node.get("parents", []):
parent_id = dataset_version_node[parent]["job_id"]
if parent_id not in dependency_graph_ex_copy and parent_id not in dependency_graph_ex:
# add the parent to the graph copy, so it will be processed as well
dependency_graph_ex_copy[parent_id] = []
if parent_id not in parents:
parents.append(parent_id)
break
except Exception:
pass
dataset_struct_entry["last_update"] = last_update
dataset_struct_entry["parents"] = parents
# noinspection PyProtectedMember
dataset_struct_entry["job_size"] = task._get_runtime_properties().get("ds_total_size")
dataset_struct_entry["name"] = task.name
# noinspection PyProtectedMember
dataset_struct_entry["version"] = task._get_runtime_properties().get("version")
dataset_struct[str(current_index)] = dataset_struct_entry
indices[id_] = str(current_index)
current_index += 1
for id_, parents in dependency_graph_ex.items():
dataset_struct[indices[id_]]["parents"] = [indices[p] for p in parents]
# noinspection PyProtectedMember
self._task._set_configuration(
name="Dataset Struct",
description="Structure of the dataset",
config_type="json",
config_text=json.dumps(dataset_struct, indent=2),
)
def _report_dataset_preview(self):
self.__preview_tabular_row_count = int(self.__preview_tabular_row_count)
def convert_to_tabular_artifact(file_path_, file_extension_, compression_=None):
# noinspection PyBroadException
try:
if file_extension_ == ".csv" and pd:
return pd.read_csv(
file_path_,
nrows=self.__preview_tabular_row_count,
compression=compression_.lstrip(".") if compression_ else None,
)
elif file_extension_ == ".tsv" and pd:
return pd.read_csv(
file_path_,
sep='\t',
nrows=self.__preview_tabular_row_count,
compression=compression_.lstrip(".") if compression_ else None,
)
elif file_extension_ == ".parquet" or file_extension_ == ".parq":
if pyarrow:
pf = pyarrow.parquet.ParquetFile(file_path_)
preview_rows = next(pf.iter_batches(batch_size=self.__preview_tabular_row_count))
return pyarrow.Table.from_batches([preview_rows]).to_pandas()
elif fastparquet:
return fastparquet.ParquetFile(file_path_).head(self.__preview_tabular_row_count).to_pandas()
elif (file_extension_ == ".npz" or file_extension_ == ".npy") and np:
return pd.DataFrame(np.loadtxt(file_path_, max_rows=self.__preview_tabular_row_count))
except Exception:
pass
return None
compression_extensions = {".gz", ".bz2", ".zip", ".xz", ".zst"}
tabular_extensions = {".csv", ".tsv", ".parquet", ".parq", ".npz", ".npy"}
for file in self._dataset_file_entries.values():
if file.local_path:
file_path = file.local_path
else:
file_path = file.relative_path
if not os.path.isfile(file_path):
continue
file_name = os.path.basename(file_path)
_, file_extension = os.path.splitext(file_path)
compression = None
if file_extension in compression_extensions:
compression = file_extension
_, file_extension = os.path.splitext(file_path[: -len(file_extension)])
if file_extension in tabular_extensions and \
self.__preview_tables_count >= self.__preview_tabular_table_count:
continue
artifact = convert_to_tabular_artifact(file_path, file_extension, compression)
if artifact is not None:
# noinspection PyBroadException
try:
# only use report_table if the default upload destination is the file server
# (i.e. no custom destination is set), since report_table keeps the sample on
# the ClearML server. otherwise use report_media, which uploads the preview to
# the configured destination, so no data leaks outside it
if (
isinstance(artifact, pd.DataFrame)
and self._task.get_logger().get_default_upload_destination() == Session.get_files_server_host()
):
self._task.get_logger().report_table("Tables", "summary", table_plot=artifact)
else:
self._task.get_logger().report_media(
"Tables", file_name, stream=artifact.to_csv(index=False), file_extension=".txt"
)
self.__preview_tables_count += 1
except Exception:
pass
continue
if compression:
continue
guessed_type = mimetypes.guess_type(file_path)
if not guessed_type or not guessed_type[0]:
continue
guessed_type = guessed_type[0]
if guessed_type.startswith("image") and self.__preview_image_count < self.__preview_media_image_count:
self._task.get_logger().report_media("Images", file_name, local_path=file_path)
self.__preview_image_count += 1
elif guessed_type.startswith("video") and self.__preview_video_count < self.__preview_media_video_count:
self._task.get_logger().report_media("Videos", file_name, local_path=file_path)
self.__preview_video_count += 1
elif guessed_type.startswith("audio") and self.__preview_audio_count < self.__preview_media_audio_count:
self._task.get_logger().report_media("Audio", file_name, local_path=file_path)
self.__preview_audio_count += 1
elif guessed_type == "text/html" and self.__preview_html_count < self.__preview_media_html_count:
self._task.get_logger().report_media("HTML", file_name, local_path=file_path)
self.__preview_html_count += 1
@classmethod
def _set_project_system_tags(cls, task):
from ..backend_api.services import projects
res = task.send(projects.GetByIdRequest(project=task.project), raise_on_errors=False)
if not res or not res.response or not res.response.project:
return
system_tags = res.response.project.system_tags or []
if cls.__tag not in system_tags:
system_tags += [cls.__tag]
task.send(projects.UpdateRequest(project=task.project, system_tags=system_tags), raise_on_errors=False)
def is_dirty(self):
# type: () -> bool
"""
Return True if the dataset has pending uploads (i.e. we cannot finalize it)
:return: True if the dataset has pending uploads; call 'upload' to start the upload process.
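Example (a minimal sketch; ``ds`` is an existing Dataset instance)::

    if ds.is_dirty():
        # pending uploads, push the data before finalizing
        ds.upload()
    ds.finalize()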
"""
return self._dirty
def _extract_parent_datasets(
self,
target_base_folder,
dependencies_by_order,
chunk_selection,
use_soft_links,
raise_on_error,
force,
max_workers=None
):
# type: (Path, List[str], dict, bool, bool, bool, Optional[int]) -> ()
# create thread pool, for creating soft-links / copying
max_workers = max_workers or psutil.cpu_count()
pool = ThreadPool(max_workers)
for dataset_version_id in dependencies_by_order:
# make sure we skip over empty dependencies
if dataset_version_id not in self._dependency_graph:
continue
selected_chunks = chunk_selection.get(dataset_version_id) if chunk_selection else None
ds = Dataset.get(dataset_id=dataset_version_id)
ds_base_folder = Path(ds._get_dataset_files(
selected_chunks=selected_chunks,
force=force,
lock_target_folder=True,
cleanup_target_folder=False,
max_workers=max_workers
))
ds_base_folder.touch()
def copy_file(file_entry):
if file_entry.parent_dataset_id != dataset_version_id or \
(selected_chunks is not None and
self._get_chunk_idx_from_artifact_name(file_entry.artifact_name) not in selected_chunks):
return
source = (ds_base_folder / file_entry.relative_path).as_posix()
target = (target_base_folder / file_entry.relative_path).as_posix()
try:
# make sure we can overwrite the target file
# noinspection PyBroadException
try:
os.unlink(target)
except Exception:
Path(target).parent.mkdir(parents=True, exist_ok=True)
# copy / link
if use_soft_links:
if not os.path.isfile(source):
raise ValueError("Extracted file missing {}".format(source))
os.symlink(source, target)
else:
shutil.copy2(source, target, follow_symlinks=True)
except Exception as ex:
LoggerRoot.get_base_logger().warning('{}\nFailed {} file {} to {}'.format(
ex, 'linking' if use_soft_links else 'copying', source, target))
return ex
return None
errors = pool.map(copy_file, self._dataset_file_entries.values())
CacheManager.get_cache_manager(cache_context=self.__cache_context).unlock_cache_folder(
ds_base_folder.as_posix())
if raise_on_error and any(errors):
raise ValueError("Dataset merging failed: {}".format([e for e in errors if e is not None]))
pool.close()
def _verify_dataset_folder(self, target_base_folder, part, chunk_selection):
# type: (Path, Optional[int], Optional[dict]) -> bool
target_base_folder = Path(target_base_folder)
# check dataset file size, if we have a full match no need for parent dataset download / merge
verified = True
# noinspection PyBroadException
try:
for f in self._dataset_file_entries.values():
# check if we need it for the current part
if part is not None:
f_parts = chunk_selection.get(f.parent_dataset_id, [])
# this is not in our current part, no need to check it.
if self._get_chunk_idx_from_artifact_name(f.artifact_name) not in f_parts:
continue
# check if the local size and the stored size match (faster than comparing hash)
if (target_base_folder / f.relative_path).stat().st_size != f.size:
verified = False
break
for f in self._dataset_link_entries.values():
if (target_base_folder / f.relative_path).stat().st_size != f.size:
verified = False
break
except Exception:
verified = False
return verified
def _get_dependency_chunk_lookup(self):
# type: () -> Dict[str, int]
"""
Return the parent dataset ID to number-of-chunks lookup table
:return: Dict key is dataset ID, value is total number of chunks for the specific dataset version.
"""
if self._dependency_chunk_lookup is None:
self._dependency_chunk_lookup = self._build_dependency_chunk_lookup()
return self._dependency_chunk_lookup
def _add_external_files(
self,
source_url, # type: str
wildcard=None, # type: Optional[Union[str, Sequence[str]]]
dataset_path=None, # type: Optional[str]
recursive=True, # type: bool
verbose=False, # type: bool
):
# type: (...) -> Tuple[int, int]
"""
Auxiliary function for `add_external_files`
Adds an external file or a folder to the current dataset.
External file links can be from cloud storage (s3://, gs://, azure://) or local / network storage (file://).
Calculates file size for each file and compares against parent.
:param source_url: Source url link (e.g. s3://bucket/folder/path)
:param wildcard: add only specific set of files.
Wildcard matching, can be a single string or a list of wildcards.
:param dataset_path: The location in the dataset where the file will be downloaded into.
e.g: for source_url='s3://bucket/remote_folder/image.jpg' and dataset_path='s3_files',
'image.jpg' will be downloaded to 's3_files/image.jpg' (relative path to the dataset)
:param recursive: If True match all wildcard files recursively
:param verbose: If True print to console files added/modified
:return: Number of file links added and modified
"""
if dataset_path:
dataset_path = dataset_path.lstrip("/")
remote_objects = None
# noinspection PyBroadException
try:
if StorageManager.exists_file(source_url):
remote_objects = [StorageManager.get_metadata(source_url, return_full_path=True)]
elif not source_url.startswith(("http://", "https://")):
if source_url[-1] != "/":
source_url = source_url + "/"
remote_objects = StorageManager.list(source_url, with_metadata=True, return_full_path=True)
except Exception:
pass
if not remote_objects:
self._task.get_logger().report_text(
"Could not list/find remote file(s) when adding {}".format(source_url)
)
return 0, 0
num_added = 0
num_modified = 0
for remote_object in remote_objects:
link = remote_object.get("name")
relative_path = link[len(source_url):]
if not relative_path:
relative_path = source_url.split("/")[-1]
if not matches_any_wildcard(relative_path, wildcard, recursive=recursive):
continue
try:
relative_path = Path(os.path.join(dataset_path or ".", relative_path)).as_posix()
size = remote_object.get("size")
already_added_file = self._dataset_file_entries.get(relative_path)
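                # three possible outcomes below: add a brand-new link entry, replace a
                # same-path file entry whose size changed, or refresh an existing link
                # entry whose size changed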
if relative_path not in self._dataset_link_entries:
if verbose:
self._task.get_logger().report_text(
"External file {} added".format(link),
print_console=False,
)
self._dataset_link_entries[relative_path] = LinkEntry(
link=link, relative_path=relative_path, parent_dataset_id=self._id, size=size
)
num_added += 1
elif already_added_file and already_added_file.size != size:
if verbose:
self._task.get_logger().report_text(
"External file {} modified".format(link),
print_console=False,
)
del self._dataset_file_entries[relative_path]
self._dataset_link_entries[relative_path] = LinkEntry(
link=link, relative_path=relative_path, parent_dataset_id=self._id, size=size
)
num_modified += 1
elif (
relative_path in self._dataset_link_entries
and self._dataset_link_entries[relative_path].size != size
):
if verbose:
self._task.get_logger().report_text(
"External file {} modified".format(link),
print_console=False,
)
self._dataset_link_entries[relative_path] = LinkEntry(
link=link, relative_path=relative_path, parent_dataset_id=self._id, size=size
)
num_modified += 1
else:
if verbose:
self._task.get_logger().report_text(
"External file {} skipped as it was not modified".format(link),
print_console=False,
)
except Exception as e:
if verbose:
self._task.get_logger().report_text(
"Error '{}' encountered trying to add external file {}".format(e, link),
print_console=False,
)
return num_added, num_modified
def _build_chunk_selection(self, part, num_parts):
        # type: (int, int) -> Dict[str, List[int]]
        """
        Build the selected chunks from each parent version, based on the current selection.
        Notice that for a specific part, one can only get the chunks from parent versions (not including this one)

        :param part: Current part index (between 0 and num_parts-1)
        :param num_parts: Total number of parts to divide the dataset into
        :return: Dict mapping each dataset ID to the list of chunk indexes used for this part number
        """
# get the chunk dependencies
dependency_chunk_lookup = self._get_dependency_chunk_lookup()
# first collect the total number of chunks
total_chunks = sum(dependency_chunk_lookup.values())
avg_chunk_per_part = total_chunks // num_parts
leftover_chunks = total_chunks % num_parts
dependencies = self._get_dependencies_by_order(include_unused=False, include_current=True)
        # create the part lookup: one (dataset_id, chunk_index) tuple per chunk
ds_id_chunk_list = [(d, i) for d in dependencies for i in range(dependency_chunk_lookup.get(d, 1))]
# select the chunks for this part
if part < leftover_chunks:
indexes = ds_id_chunk_list[part*(avg_chunk_per_part+1):(part+1)*(avg_chunk_per_part+1)]
else:
ds_id_chunk_list = ds_id_chunk_list[leftover_chunks*(avg_chunk_per_part+1):]
indexes = ds_id_chunk_list[(part-leftover_chunks)*avg_chunk_per_part:
(part-leftover_chunks+1)*avg_chunk_per_part]
# convert to lookup
chunk_selection = {}
for d, i in indexes:
chunk_selection[d] = chunk_selection.get(d, []) + [i]
return chunk_selection
@classmethod
def _get_dataset_id(
cls,
dataset_project,
dataset_name,
dataset_version=None,
dataset_filter=None,
raise_on_multiple=False,
shallow_search=True,
):
# type: (str, str, Optional[str], Optional[str], bool, bool) -> Tuple[Optional[str], Optional[str]]
"""
Gets the dataset ID that matches a project, name and a version.
:param dataset_project: Corresponding dataset project
:param dataset_name: Corresponding dataset name
:param dataset_version: The version of the corresponding dataset. If set to `None` (default),
then get the dataset with the latest version
        :param dataset_filter: Filter the found datasets based on the criteria present in this dict.
            Has the same behaviour as the `task_filter` parameter in `Task.get_tasks`. If None,
            the filter defaults to dataset-specific parameters
:param raise_on_multiple: If True and more than 1 dataset is found raise an Exception
:param shallow_search: If True, search only the first 500 results (first page)
:return: A tuple containing 2 strings: the dataset ID and the version of that dataset
"""
dataset_filter = dataset_filter or {}
unmodifiable_params = ["project_name", "task_name", "only_fields", "search_hidden", "_allow_extra_fields_"]
for unmodifiable_param in unmodifiable_params:
if unmodifiable_param in dataset_filter:
del dataset_filter[unmodifiable_param]
dataset_filter.setdefault("system_tags", [cls.__tag])
# dataset_filter.setdefault("type", [str(Task.TaskTypes.data_processing)])
dataset_filter.setdefault("order_by", ["-last_update"])
# making sure we have the right project name here
hidden_dataset_project, _ = cls._build_hidden_project_name(dataset_project, dataset_name)
# noinspection PyBroadException
try:
# noinspection PyProtectedMember
datasets = Task._query_tasks(
project_name=[hidden_dataset_project] if hidden_dataset_project else None,
task_name=exact_match_regex(dataset_name) if dataset_name else None,
fetch_only_first_page=shallow_search,
only_fields=["id", "runtime.version"],
search_hidden=True,
_allow_extra_fields_=True,
**dataset_filter
)
except Exception:
datasets = []
if raise_on_multiple and len(datasets) > 1:
raise ValueError(
"Multiple datasets found with dataset_project={}, dataset_name={}, dataset_version={}".format(
dataset_project, dataset_name, dataset_version
)
)
result_dataset = None
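        # pick the best match: when no version is requested, prefer the candidate with the
        # highest valid semantic version, falling back to the first (most recently updated) one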
for dataset in datasets:
candidate_dataset_version = dataset.runtime.get("version")
if not dataset_version:
if not result_dataset:
result_dataset = dataset
else:
# noinspection PyBroadException
try:
if (
candidate_dataset_version
and Version.is_valid_version_string(candidate_dataset_version)
and (
(
not result_dataset.runtime.get("version")
or not Version.is_valid_version_string(result_dataset.runtime.get("version"))
)
or (
result_dataset.runtime.get("version")
and Version(result_dataset.runtime.get("version"))
< Version(candidate_dataset_version)
)
)
):
result_dataset = dataset
except Exception:
pass
elif dataset_version == candidate_dataset_version:
if result_dataset and raise_on_multiple:
raise ValueError(
"Multiple datasets found with dataset_project={}, dataset_name={}, dataset_version={}".format(
dataset_project, dataset_name, dataset_version
)
)
result_dataset = dataset
if not raise_on_multiple:
break
if not result_dataset:
return None, None
return result_dataset.id, result_dataset.runtime.get("version")
@classmethod
def _build_hidden_project_name(cls, dataset_project, dataset_name):
# type: (str, str) -> Tuple[Optional[str], Optional[str]]
"""
Build the corresponding hidden name of a dataset, given its `dataset_project`
and `dataset_name`
:param dataset_project: Dataset's project
:param dataset_name: Dataset name passed by the user
        :return: A tuple of 2 strings: the corresponding hidden dataset project, and its
            parent project
"""
if not dataset_project:
return None, None
project_name = cls._remove_hidden_part_from_dataset_project(dataset_project)
        if Session.check_min_api_server_version(cls.__min_api_version):
parent_project = "{}.datasets".format(dataset_project + "/" if dataset_project else "")
if dataset_name:
project_name = "{}/{}".format(parent_project, dataset_name)
else:
parent_project = None
project_name = dataset_project or "Datasets"
return project_name, parent_project
@classmethod
def _remove_hidden_part_from_dataset_project(cls, dataset_project):
# type: (str) -> str
"""
The project name contains the '.datasets' part, as well as the dataset_name.
Remove those parts and return the project used when creating the dataset.
:param dataset_project: Current project name
:return: The project name without the '.datasets' part
"""
return dataset_project.partition("/.datasets/")[0]
@classmethod
def _get_chunk_idx_from_artifact_name(cls, artifact_name):
# type: (str) -> int
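        # map a data artifact name to its chunk index,
        # e.g. "data" -> 0, "data_3" -> 3; missing or unrecognized names -> -1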
if not artifact_name:
return -1
artifact_name = str(artifact_name)
if artifact_name == cls.__default_data_entry_name:
return 0
if artifact_name.startswith(cls.__data_entry_name_prefix):
return int(artifact_name[len(cls.__data_entry_name_prefix):])
return -1