From 067c817f308c8d475e18fc37cf98cd43b6052382 Mon Sep 17 00:00:00 2001 From: Alex Burlacu Date: Thu, 23 Mar 2023 18:03:40 +0200 Subject: [PATCH] Allow deleting files when deleting datasets stored with clearml-data --- clearml/datasets/dataset.py | 43 ++++++++++++++++++++----------------- clearml/storage/helper.py | 5 ++++- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/clearml/datasets/dataset.py b/clearml/datasets/dataset.py index 31e7705a..e84ed439 100644 --- a/clearml/datasets/dataset.py +++ b/clearml/datasets/dataset.py @@ -1410,7 +1410,9 @@ class Dataset(object): force=False, # bool dataset_version=None, # Optional[str] entire_dataset=False, # bool - shallow_search=False # bool + shallow_search=False, # bool + delete_files=True, # bool + delete_external_files=False # bool ): # type: (...) -> () """ @@ -1426,6 +1428,9 @@ class Dataset(object): :param entire_dataset: If True, delete all datasets that match the given `dataset_project`, `dataset_name`, `dataset_version`. Note that `force` has to be True if this parameter is True :param shallow_search: If True, search only the first 500 results (first page) + :param delete_files: Delete all local files in the dataset (from the ClearML file server), as well as + all artifacts related to the dataset. + :param delete_external_files: Delete all external files in the dataset (from their external storage) """ if not any([dataset_id, dataset_project, dataset_name]): raise ValueError("Dataset deletion criteria not met. Didn't provide id/name/project correctly.") @@ -1446,28 +1451,26 @@ class Dataset(object): action="delete", ) except Exception as e: - LoggerRoot.get_base_logger().warning("Error: {}".format(str(e))) + LoggerRoot.get_base_logger().warning("Failed deleting dataset: {}".format(str(e))) return - client = APIClient() for dataset_id in dataset_ids: - task = Task.get_task(task_id=dataset_id) - if str(task.task_type) != str(Task.TaskTypes.data_processing) or cls.__tag not in ( - task.get_system_tags() or [] - ): - LoggerRoot.get_base_logger().warning("Task id={} is not of type Dataset".format(dataset_id)) + try: + dataset = Dataset.get(dataset_id=dataset_id) + except Exception as e: + LoggerRoot.get_base_logger().warning("Could not get dataset with ID {}: {}".format(dataset_id, str(e))) continue - for artifact in task.artifacts.values(): - h = StorageHelper.get(artifact.url) - # noinspection PyBroadException - try: - h.delete(artifact.url) - except Exception as ex: - LoggerRoot.get_base_logger().warning( - "Failed deleting remote file '{}': {}".format(artifact.url, ex) - ) - # this force is different than the force passed in Dataset.delete - # it indicated that we want delete a non-draft task - client.tasks.delete(task=dataset_id, force=True) + # noinspection PyProtectedMember + dataset._task.delete(delete_artifacts_and_models=delete_files) + if delete_external_files: + for external_file in dataset.link_entries: + if external_file.parent_dataset_id == dataset_id: + try: + helper = StorageHelper.get(external_file.link) + helper.delete(external_file.link) + except Exception as ex: + LoggerRoot.get_base_logger().warning( + "Failed deleting remote file '{}': {}".format(external_file.link, ex) + ) @classmethod def rename( diff --git a/clearml/storage/helper.py b/clearml/storage/helper.py index 6594026e..83fb68c6 100644 --- a/clearml/storage/helper.py +++ b/clearml/storage/helper.py @@ -1471,7 +1471,10 @@ class _HttpDriver(_Driver): def exists_file(self, container_name, object_name): # noinspection PyBroadException try: - return requests.head(container_name + object_name, allow_redirects=True).ok + container = self.get_container(container_name) + url = container_name + object_name + + return container.session.head(url, allow_redirects=True, headers=container.get_headers(url)).ok except Exception: return False