Add server-side support for deleting files from fileserver on task delete

This commit is contained in:
allegroai 2022-09-29 19:34:24 +03:00
parent 6c49e96ff0
commit 0c9e2f92ee
11 changed files with 449 additions and 6 deletions

View File

@ -103,6 +103,7 @@ class DeleteRequest(UpdateRequest):
move_to_trash = BoolField(default=True)
return_file_urls = BoolField(default=False)
delete_output_models = BoolField(default=True)
delete_external_artifacts = BoolField(default=True)
class SetRequirementsRequest(TaskRequest):
@ -180,6 +181,7 @@ class ResetRequest(UpdateRequest):
clear_all = BoolField(default=False)
return_file_urls = BoolField(default=False)
delete_output_models = BoolField(default=True)
delete_external_artifacts = BoolField(default=True)
class MultiTaskRequest(models.Base):
@ -280,6 +282,7 @@ class DeleteManyRequest(TaskBatchRequest):
return_file_urls = BoolField(default=False)
delete_output_models = BoolField(default=True)
force = BoolField(default=False)
delete_external_artifacts = BoolField(default=True)
class ResetManyRequest(TaskBatchRequest):
@ -287,6 +290,7 @@ class ResetManyRequest(TaskBatchRequest):
return_file_urls = BoolField(default=False)
delete_output_models = BoolField(default=True)
force = BoolField(default=False)
delete_external_artifacts = BoolField(default=True)
class PublishManyRequest(TaskBatchRequest):

View File

@ -1,17 +1,23 @@
from datetime import datetime
from itertools import chain
from operator import attrgetter
from typing import Sequence, Set, Tuple
import attr
from boltons.iterutils import partition, bucketize, first
from mongoengine import NotUniqueError
from pymongo.errors import DuplicateKeyError
from apiserver.apierrors import errors
from apiserver.bll.event import EventBLL
from apiserver.bll.event.event_bll import PlotFields
from apiserver.bll.task.utils import deleted_prefix
from apiserver.config_repo import config
from apiserver.database.model.model import Model
from apiserver.database.model.task.task import Task, TaskStatus, ArtifactModes
from apiserver.database.model.url_to_delete import StorageType, UrlToDelete, FileType, DeletionStatus
from apiserver.timing_context import TimingContext
from apiserver.database.utils import id as db_id
event_bll = EventBLL()
@ -94,12 +100,76 @@ def collect_debug_image_urls(company: str, task: str) -> Set[str]:
return urls
supported_storage_types = {
"https://files": StorageType.fileserver,
}
def _schedule_for_delete(
company: str, user: str, task_id: str, urls: Set[str], can_delete_folders: bool,
) -> Set[str]:
urls_per_storage = bucketize(
urls,
key=lambda u: first(
type_
for prefix, type_ in supported_storage_types.items()
if u.startswith(prefix)
),
)
urls_per_storage.pop(None, None)
processed_urls = set()
for storage_type, storage_urls in urls_per_storage.items():
delete_folders = (storage_type == StorageType.fileserver) and can_delete_folders
scheduled_to_delete = set()
for url in storage_urls:
folder = None
if delete_folders:
folder, _, _ = url.rpartition("/")
to_delete = folder or url
if to_delete in scheduled_to_delete:
processed_urls.add(url)
continue
try:
UrlToDelete(
id=db_id(),
company=company,
user=user,
url=to_delete,
task=task_id,
created=datetime.utcnow(),
storage_type=storage_type,
type=FileType.folder if folder else FileType.file,
).save()
except (DuplicateKeyError, NotUniqueError):
existing = UrlToDelete.objects(company=company, url=to_delete).first()
if existing:
existing.update(
user=user,
task=task_id,
created=datetime.utcnow(),
retry_count=0,
unset__last_failure_time=1,
unset__last_failure_reason=1,
status=DeletionStatus.created,
)
processed_urls.add(url)
scheduled_to_delete.add(to_delete)
return processed_urls
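A small illustrative sketch of the folder-collapsing behavior above: when folders may be deleted (fileserver storage and no in-use or published models), sibling file URLs under the same parent are reduced to a single folder entry. The URLs below are made up for illustration:

# Illustrative only - hypothetical fileserver URLs for one task's artifacts
urls = {
    "https://files.example.com/proj/my_task/artifacts/model.pkl",
    "https://files.example.com/proj/my_task/artifacts/data.csv",
}
# With can_delete_folders=True both URLs collapse to their parent folder,
# so only one UrlToDelete document (type=folder) is scheduled:
folders = {url.rpartition("/")[0] for url in urls}
assert folders == {"https://files.example.com/proj/my_task/artifacts"}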
def cleanup_task(
company: str,
user: str,
task: Task,
force: bool = False,
update_children=True,
return_file_urls=False,
delete_output_models=True,
delete_external_artifacts=True,
) -> CleanupResult:
"""
Validate task deletion and delete/modify all its output.
@ -155,6 +225,19 @@ def cleanup_task(
event_bll.delete_task_events(task.company, task.id, allow_locked=force)
if delete_external_artifacts and config.get(
"services.async_urls_delete.enabled", False
):
scheduled = _schedule_for_delete(
task_id=task.id,
company=company,
user=user,
urls=event_urls | model_urls | artifact_urls,
can_delete_folders=not in_use_model_ids and not published_models,
)
for urls in (event_urls, model_urls, artifact_urls):
urls.difference_update(scheduled)
return CleanupResult(
deleted_models=deleted_models,
updated_children=updated_children,

View File

@ -191,12 +191,14 @@ def move_tasks_to_trash(tasks: Sequence[str]) -> int:
def delete_task(
task_id: str,
company_id: str,
user_id: str,
move_to_trash: bool,
force: bool,
return_file_urls: bool,
delete_output_models: bool,
status_message: str,
status_reason: str,
delete_external_artifacts: bool,
) -> Tuple[int, Task, CleanupResult]:
task = TaskBLL.get_task_with_access(
task_id, company_id=company_id, requires_write_access=True
@ -226,10 +228,13 @@ def delete_task(
pass
cleanup_res = cleanup_task(
company=company_id,
user=user_id,
task=task,
force=force,
return_file_urls=return_file_urls,
delete_output_models=delete_output_models,
delete_external_artifacts=delete_external_artifacts,
)
if move_to_trash:
@ -246,10 +251,12 @@ def delete_task(
def reset_task(
task_id: str,
company_id: str,
user_id: str,
force: bool,
return_file_urls: bool,
delete_output_models: bool,
clear_all: bool,
delete_external_artifacts: bool,
) -> Tuple[dict, CleanupResult, dict]:
task = TaskBLL.get_task_with_access(
task_id, company_id=company_id, requires_write_access=True
@ -268,11 +275,14 @@ def reset_task(
pass
cleaned_up = cleanup_task(
company=company_id,
user=user_id,
task=task,
force=force,
update_children=False,
return_file_urls=return_file_urls,
delete_output_models=delete_output_models,
delete_external_artifacts=delete_external_artifacts,
)
updates.update(

View File

@ -1,3 +1,5 @@
fileserver = "http://localhost:8081"
elastic {
events {
hosts: [{host: "127.0.0.1", port: 9200}]

View File

@ -0,0 +1,5 @@
# if set to true then on task delete/reset, external file URLs for known storage types are scheduled for async deletion
# otherwise they are returned to the client for client-side deletion
enabled: false
max_retries: 3
retry_timeout_sec: 60
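Note that "known storage types" currently means fileserver URLs only: a URL is scheduled for async deletion when it matches one of the prefixes in supported_storage_types (see the BLL change above); anything else is still left for the client to handle. A minimal sketch with made-up URLs:

# Sketch only: mirrors the prefix check used by _schedule_for_delete
supported_prefixes = ("https://files",)

urls = [
    "https://files.example.com/proj/task/artifacts/model.pkl",  # scheduled for async delete
    "s3://my-bucket/proj/task/artifacts/data.csv",               # returned to the client
]
scheduled = [u for u in urls if u.startswith(supported_prefixes)]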

View File

@ -0,0 +1,51 @@
from enum import Enum
from mongoengine import StringField, DateTimeField, IntField, EnumField
from apiserver.database import Database, strict
from apiserver.database.model import AttributedDocument
class StorageType(str, Enum):
fileserver = "fileserver"
unknown = "unknown"
class FileType(str, Enum):
file = "file"
folder = "folder"
class DeletionStatus(str, Enum):
created = "created"
retrying = "retrying"
failed = "failed"
class UrlToDelete(AttributedDocument):
_field_collation_overrides = {
"url": AttributedDocument._numeric_locale,
}
meta = {
"db_alias": Database.backend,
"strict": strict,
"indexes": [
("company", "user", "task"),
"storage_type",
"created",
"retry_count",
"type",
],
}
id = StringField(primary_key=True)
url = StringField(required=True, unique_with="company")
task = StringField(required=True)
created = DateTimeField(required=True)
storage_type = EnumField(StorageType, default=StorageType.unknown)
type = EnumField(FileType, default=FileType.file)
retry_count = IntField(default=0)
last_failure_time = DateTimeField()
last_failure_reason = StringField()
status = EnumField(DeletionStatus, default=DeletionStatus.created)

View File

@ -0,0 +1,174 @@
from argparse import ArgumentParser
from collections import defaultdict
from datetime import datetime, timedelta
from functools import partial
from itertools import chain
from pathlib import Path
from time import sleep
from typing import Sequence
import requests
from furl import furl
from mongoengine import Q
from apiserver.config_repo import config
from apiserver.database import db
from apiserver.database.model.url_to_delete import UrlToDelete, DeletionStatus, StorageType
log = config.logger(f"JOB-{Path(__file__).name}")
conf = config.get("services.async_urls_delete")
max_retries = conf.get("max_retries", 3)
retry_timeout = timedelta(seconds=conf.get("retry_timeout_sec", 60))
token_expiration_sec = 600
def validate_fileserver_access(fileserver_host: str) -> str:
fileserver_host = fileserver_host or config.get("hosts.fileserver", None)
if not fileserver_host:
log.error(f"Fileserver host not configured")
exit(1)
res = requests.get(
url=fileserver_host
)
res.raise_for_status()
return fileserver_host
def mark_retry_failed(ids: Sequence[str], reason: str):
UrlToDelete.objects(id__in=ids).update(
last_failure_time=datetime.utcnow(),
last_failure_reason=reason,
inc__retry_count=1,
)
UrlToDelete.objects(id__in=ids, retry_count__gte=max_retries).update(
status=DeletionStatus.failed
)
def mark_failed(query: Q, reason: str):
UrlToDelete.objects(query).update(
status=DeletionStatus.failed,
last_failure_time=datetime.utcnow(),
last_failure_reason=reason,
)
def delete_fileserver_urls(urls_query: Q, fileserver_host: str):
to_delete = list(UrlToDelete.objects(urls_query).limit(10000))
if not to_delete:
return
paths = set()
path_to_id_mapping = defaultdict(list)
for url in to_delete:
try:
path = str(furl(url.url).path)
path = path.strip("/")
if not path:
raise ValueError("Empty path")
except Exception as ex:
err = str(ex)
log.warn(f"Error getting path for {url.url}: {err}")
mark_failed(Q(id=url.id), err)
continue
paths.add(path)
path_to_id_mapping[path].append(url.id)
if not paths:
return
ids_to_delete = set(chain.from_iterable(path_to_id_mapping.values()))
try:
res = requests.post(
url=furl(fileserver_host).add(path="delete_many").url,
json={"files": list(paths)},
)
res.raise_for_status()
except Exception as ex:
err = str(ex)
log.warn(f"Error deleting {len(paths)} files from fileserver: {err}")
mark_failed(Q(id__in=list(ids_to_delete)), err)
return
res_data = res.json()
deleted_ids = set(
chain.from_iterable(
path_to_id_mapping.get(path, []) for path in list(res_data.get("deleted", {}))
)
)
if deleted_ids:
UrlToDelete.objects(id__in=list(deleted_ids)).delete()
log.info(f"{len(deleted_ids)} files deleted from the fileserver")
failed_ids = set()
for err, error_ids in res_data.get("errors", {}).items():
error_ids = list(
chain.from_iterable(path_to_id_mapping.get(path, []) for path in error_ids)
)
mark_retry_failed(error_ids, err)
log.warning(
f"Failed to delete {len(error_ids)} files from the fileserver due to: {err}"
)
failed_ids.update(error_ids)
missing_ids = ids_to_delete - deleted_ids - failed_ids
if missing_ids:
mark_retry_failed(list(missing_ids), "Not succeeded")
def run_delete_loop(fileserver_host: str):
fileserver_host = validate_fileserver_access(fileserver_host)
storage_delete_funcs = {
StorageType.fileserver: partial(
delete_fileserver_urls, fileserver_host=fileserver_host
),
}
while True:
now = datetime.utcnow()
urls_query = (
Q(status__ne=DeletionStatus.failed)
& Q(retry_count__lt=max_retries)
& (
Q(last_failure_time__exists=False)
| Q(last_failure_time__lt=now - retry_timeout)
)
)
url_to_delete: UrlToDelete = UrlToDelete.objects(
urls_query & Q(storage_type__in=list(storage_delete_funcs))
).order_by("retry_count").limit(1).first()
if not url_to_delete:
sleep(10)
continue
company = url_to_delete.company
user = url_to_delete.user
storage_type = url_to_delete.storage_type
log.info(
f"Deleting {storage_type} objects for company: {company}, user: {user}"
)
company_storage_urls_query = urls_query & Q(
company=company, storage_type=storage_type,
)
storage_delete_funcs[storage_type](
urls_query=company_storage_urls_query
)
def main():
parser = ArgumentParser(description=__doc__)
parser.add_argument(
"--fileserver-host", "-fh", help="Fileserver host address", type=str,
)
args = parser.parse_args()
db.initialize()
run_delete_loop(args.fileserver_host)
if __name__ == "__main__":
main()

View File

@ -1489,6 +1489,13 @@ reset {
}
}
}
"2.21": ${reset."2.13"} {
request.properties.delete_external_artifacts {
description: "If set to 'true' then BE will try to delete the extenal artifacts associated with the task from the fileserver (if configured to do so)"
type: boolean
default: true
}
}
}
reset_many {
"2.13": ${_definitions.batch_operation} {
@ -1541,6 +1548,13 @@ reset_many {
}
}
}
"2.21": ${reset_many."2.13"} {
request.properties.delete_external_artifacts {
description: "If set to 'true' then BE will try to delete the extenal artifacts associated with the tasks from the fileserver (if configured to do so)"
type: boolean
default: true
}
}
}
delete_many {
"2.13": ${_definitions.batch_operation} {
@ -1591,6 +1605,13 @@ delete_many {
}
}
}
"2.21": ${delete_many."2.13"} {
request.properties.delete_external_artifacts {
description: "If set to 'true' then BE will try to delete the extenal artifacts associated with the tasks from the fileserver (if configured to do so)"
type: boolean
default: true
}
}
}
delete {
"2.1" {
@ -1655,6 +1676,13 @@ delete {
}
}
}
"2.21": ${delete."2.13"} {
request.properties.delete_external_artifacts {
description: "If set to 'true' then BE will try to delete the extenal artifacts associated with the task from the fileserver (if configured to do so)"
type: boolean
default: true
}
}
}
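For illustration, a client could pass the new flag directly on the raw API; the host, task ID, and auth below are placeholders (the ClearML SDK normally wraps this call), so treat this as a sketch rather than the canonical client flow:

import requests

# Hypothetical endpoint and credentials - adjust to the actual deployment
api_server = "http://localhost:8008"
resp = requests.post(
    f"{api_server}/tasks.delete",
    json={
        "task": "<task-id>",
        "force": True,
        "delete_external_artifacts": True,  # new in API version 2.21
    },
    headers={"Authorization": "Bearer <session-token>"},  # placeholder auth
)
resp.raise_for_status()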
archive {
"2.12" {

View File

@ -81,7 +81,6 @@ from apiserver.bll.task.artifacts import (
Artifacts,
)
from apiserver.bll.task.hyperparams import HyperParams
from apiserver.bll.task.non_responsive_tasks_watchdog import NonResponsiveTasksWatchdog
from apiserver.bll.task.param_utils import (
params_prepare_for_save,
params_unprepare_from_saved,
@ -942,10 +941,12 @@ def reset(call: APICall, company_id, request: ResetRequest):
dequeued, cleanup_res, updates = reset_task(
task_id=request.task,
company_id=company_id,
user_id=call.identity.user,
force=request.force,
return_file_urls=request.return_file_urls,
delete_output_models=request.delete_output_models,
clear_all=request.clear_all,
delete_external_artifacts=request.delete_external_artifacts,
)
res = ResetResponse(**updates, dequeued=dequeued)
@ -968,10 +969,12 @@ def reset_many(call: APICall, company_id, request: ResetManyRequest):
func=partial(
reset_task,
company_id=company_id,
user_id=call.identity.user,
force=request.force,
return_file_urls=request.return_file_urls,
delete_output_models=request.delete_output_models,
clear_all=request.clear_all,
delete_external_artifacts=request.delete_external_artifacts,
),
ids=request.ids,
)
@ -1069,12 +1072,14 @@ def delete(call: APICall, company_id, request: DeleteRequest):
deleted, task, cleanup_res = delete_task(
task_id=request.task,
company_id=company_id,
user_id=call.identity.user,
move_to_trash=request.move_to_trash,
force=request.force,
return_file_urls=request.return_file_urls,
delete_output_models=request.delete_output_models,
status_message=request.status_message,
status_reason=request.status_reason,
delete_external_artifacts=request.delete_external_artifacts,
)
if deleted:
if request.move_to_trash:
@ -1089,12 +1094,14 @@ def delete_many(call: APICall, company_id, request: DeleteManyRequest):
func=partial(
delete_task,
company_id=company_id,
user_id=call.identity.user,
move_to_trash=request.move_to_trash,
force=request.force,
return_file_urls=request.return_file_urls,
delete_output_models=request.delete_output_models,
status_message=request.status_message,
status_reason=request.status_reason,
delete_external_artifacts=request.delete_external_artifacts,
),
ids=request.ids,
)

View File

@ -6,6 +6,10 @@ download {
cache_timeout_sec: 300
}
delete {
allow_batch: false
}
cors {
origins: "*"
}

View File

@ -2,7 +2,9 @@
import json
import mimetypes
import os
import shutil
from argparse import ArgumentParser
from collections import defaultdict
from pathlib import Path
from boltons.iterutils import first
@ -15,6 +17,7 @@ from werkzeug.security import safe_join
from config import config
from utils import get_env_bool
log = config.logger(__file__)
DEFAULT_UPLOAD_FOLDER = "/mnt/fileserver"
app = Flask(__name__)
@ -32,6 +35,11 @@ app.config["SEND_FILE_MAX_AGE_DEFAULT"] = config.get(
)
@app.route("/", methods=["GET"])
def ping():
return "OK", 200
@app.before_request
def before_request():
if request.content_encoding:
@ -60,6 +68,8 @@ def upload():
target.parent.mkdir(parents=True, exist_ok=True)
file.save(str(target))
results.append(file_path)
log.info(f"Uploaded {len(results)} files")
return json.dumps(results), 200
@ -83,19 +93,84 @@ def download(path):
headers["Cache-control"] = "no-cache"
headers["Pragma"] = "no-cache"
headers["Expires"] = "0"
log.info(f"Downloaded file {str(path)}")
return response
def _get_full_path(path: str) -> Path:
return Path(safe_join(os.fspath(app.config["UPLOAD_FOLDER"]), os.fspath(path)))
@app.route("/<path:path>", methods=["DELETE"])
def delete(path):
real_path = _get_full_path(path)
if not real_path.exists() or not real_path.is_file():
abort(Response(f"File {str(path)} not found", 404))
log.error(f"Error deleting file {str(real_path)}. Not found or not a file")
abort(Response(f"File {str(real_path)} not found", 404))
real_path.unlink()
log.info(f"Deleted file {str(real_path)}")
return json.dumps(str(path)), 200
def batch_delete():
body = request.get_json(force=True, silent=False)
if not body:
abort(Response("Json payload is missing", 400))
files = body.get("files")
if not files:
abort(Response("files are missing", 400))
deleted = {}
errors = defaultdict(list)
log_errors = defaultdict(list)
def record_error(msg: str, file_: str, path_: Path):
errors[msg].append(file_)
log_errors[msg].append(str(path_))
for file in files:
if not file or not file.strip("/"):
# empty path may result in deleting all company data. Too dangerous
record_error("Empty path not allowed", file, file)
continue
path = _get_full_path(file)
if not path.exists():
record_error("Not found", file, path)
continue
try:
if path.is_file():
path.unlink()
elif path.is_dir():
shutil.rmtree(path)
else:
record_error("Not a file or folder", file, path)
continue
except OSError as ex:
record_error(ex.strerror, file, path)
continue
except Exception as ex:
record_error(str(ex).replace(str(path), ""), file, path)
continue
deleted[file] = str(path)
for error, paths in log_errors.items():
log.error(f"{len(paths)} files/folders cannot be deleted due to the {error}")
log.info(f"Deleted {len(deleted)} files/folders")
return json.dumps({"deleted": deleted, "errors": errors}), 200
if config.get("fileserver.delete.allow_batch"):
app.route("/delete_many", methods=["POST"])(batch_delete)
def main():
parser = ArgumentParser(description=__doc__)
parser.add_argument(