mirror of https://github.com/clearml/clearml-server (synced 2025-06-26 23:15:47 +00:00)
Add support for server-side delete for AWS S3, Google Storage and Azure Blob Storage
This commit is contained in:
parent 0ffde24dc2
commit e66257761a
@@ -9,6 +9,7 @@ from operator import attrgetter
 from typing import Sequence, Set, Tuple, Optional, List, Mapping, Union
 
 import elasticsearch
+from boltons.iterutils import chunked_iter
 from elasticsearch.helpers import BulkIndexError
 from mongoengine import Q
 from nested_dict import nested_dict
@@ -1188,8 +1189,10 @@ class EventBLL(object):
         Delete mutliple task events. No check is done for tasks write access
         so it should be checked by the calling code
         """
-        es_req = {"query": {"terms": {"task": task_ids}}}
+        deleted = 0
         with translate_errors_context():
-            es_res = delete_company_events(
-                es=self.es,
-                company_id=company_id,
+            for tasks in chunked_iter(task_ids, 100):
+                es_req = {"query": {"terms": {"task": tasks}}}
+                es_res = delete_company_events(
+                    es=self.es,
+                    company_id=company_id,
@@ -1197,6 +1200,8 @@ class EventBLL(object):
-                body=es_req,
-                **self._get_events_deletion_params(async_delete),
-            )
+                    body=es_req,
+                    **self._get_events_deletion_params(async_delete),
+                )
+                if not async_delete:
+                    deleted += es_res.get("deleted", 0)
 
         if not async_delete:
             return es_res.get("deleted", 0)
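For reference, a standalone sketch of the batching idiom introduced above (boltons' chunked_iter caps each Elasticsearch delete-by-query at 100 task ids; the ids here are made up):

from boltons.iterutils import chunked_iter

task_ids = [f"task_{i}" for i in range(250)]  # hypothetical task ids
for tasks in chunked_iter(task_ids, 100):
    # one terms query per chunk of at most 100 ids, mirroring the change above
    es_req = {"query": {"terms": {"task": tasks}}}
    print(len(tasks))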
48  apiserver/bll/storage/__init__.py  (new file)
@@ -0,0 +1,48 @@
+from copy import copy
+
+from boltons.cacheutils import cachedproperty
+from clearml.backend_config.bucket_config import (
+    S3BucketConfigurations,
+    AzureContainerConfigurations,
+    GSBucketConfigurations,
+)
+
+from apiserver.config_repo import config
+
+
+log = config.logger(__file__)
+
+
+class StorageBLL:
+    default_aws_configs: S3BucketConfigurations = None
+    conf = config.get("services.storage_credentials")
+
+    @cachedproperty
+    def _default_aws_configs(self) -> S3BucketConfigurations:
+        return S3BucketConfigurations.from_config(self.conf.get("aws.s3"))
+
+    @cachedproperty
+    def _default_azure_configs(self) -> AzureContainerConfigurations:
+        return AzureContainerConfigurations.from_config(self.conf.get("azure.storage"))
+
+    @cachedproperty
+    def _default_gs_configs(self) -> GSBucketConfigurations:
+        return GSBucketConfigurations.from_config(self.conf.get("google.storage"))
+
+    def get_azure_settings_for_company(
+        self,
+        company_id: str,
+    ) -> AzureContainerConfigurations:
+        return copy(self._default_azure_configs)
+
+    def get_gs_settings_for_company(
+        self,
+        company_id: str,
+    ) -> GSBucketConfigurations:
+        return copy(self._default_gs_configs)
+
+    def get_aws_settings_for_company(
+        self,
+        company_id: str,
+    ) -> S3BucketConfigurations:
+        return copy(self._default_aws_configs)
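A minimal usage sketch of the new StorageBLL (only meaningful inside the apiserver code base; the company id and URL below are made up, and the per-company getters currently just copy the server-wide defaults):

from apiserver.bll.storage import StorageBLL

storage_bll = StorageBLL()
aws_configs = storage_bll.get_aws_settings_for_company("some-company-id")
# pick the bucket configuration matching a concrete URL (as the deletion job does later)
bucket_cfg = aws_configs.get_config_by_uri("s3://my-bucket/artifacts/model.bin")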
@@ -5,6 +5,7 @@ from typing import Sequence, Set, Tuple
 
 import attr
 from boltons.iterutils import partition, bucketize, first
+from furl import furl
 from mongoengine import NotUniqueError
 from pymongo.errors import DuplicateKeyError
 
@@ -106,6 +107,9 @@ def collect_debug_image_urls(company: str, task_or_model: str) -> Set[str]:
 supported_storage_types = {
     "https://": StorageType.fileserver,
     "http://": StorageType.fileserver,
+    "s3://": StorageType.s3,
+    "azure://": StorageType.azure,
+    "gs://": StorageType.gs,
 }
 
 
@@ -129,7 +133,14 @@ def _schedule_for_delete(
     for url in storage_urls:
         folder = None
         if delete_folders:
-            folder, _, _ = url.rpartition("/")
+            try:
+                parsed = furl(url)
+                if parsed.path and len(parsed.path.segments) > 1:
+                    folder = parsed.remove(
+                        args=True, fragment=True, path=parsed.path.segments[-1]
+                    ).url.rstrip("/")
+            except Exception as ex:
+                pass
 
         to_delete = folder or url
         if to_delete in scheduled_to_delete:
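For illustration, a standalone run of the new furl-based folder extraction (the URL is made up; with a single path segment no folder is produced, which appears to be the point of the len(segments) > 1 guard):

from furl import furl

url = "s3://my-bucket/artifacts/run_1/output.bin"
parsed = furl(url)
folder = None
if parsed.path and len(parsed.path.segments) > 1:
    folder = parsed.remove(
        args=True, fragment=True, path=parsed.path.segments[-1]
    ).url.rstrip("/")
print(folder)  # expected: something like s3://my-bucket/artifacts/run_1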
@@ -1,6 +1,6 @@
 # if set to True then on task delete/reset external file urls for know storage types are scheduled for async delete
 # otherwise they are returned to a client for the client side delete
-enabled: false
+enabled: true
 max_retries: 3
 retry_timeout_sec: 60
 
53  apiserver/config/default/services/storage_credentials.conf  (new file)
@@ -0,0 +1,53 @@
+aws {
+    s3 {
+        # S3 credentials, used for read/write access by various SDK elements
+        # default, used for any bucket not specified below
+        key: ""
+        secret: ""
+        region: ""
+        use_credentials_chain: false
+        # Additional ExtraArgs passed to boto3 when uploading files. Can also be set per-bucket under "credentials".
+        extra_args: {}
+        credentials: [
+            # specifies key/secret credentials to use when handling s3 urls (read or write)
+            # {
+            #     bucket: "my-bucket-name"
+            #     key: "my-access-key"
+            #     secret: "my-secret-key"
+            # },
+            {
+                # This will apply to all buckets in this host (unless key/value is specifically provided for a given bucket)
+                host: "localhost:9000"
+                key: "evg_user"
+                secret: "evg_pass"
+                multipart: false
+                secure: false
+            }
+        ]
+    }
+}
+google.storage {
+    # Default project and credentials file
+    # Will be used when no bucket configuration is found
+    // project: "clearml"
+    // credentials_json: "/path/to/credentials.json"
+    //
+    // # Specific credentials per bucket and sub directory
+    // credentials = [
+    //   {
+    //     bucket: "my-bucket"
+    //     subdir: "path/in/bucket" # Not required
+    //     project: "clearml"
+    //     credentials_json: "/path/to/credentials.json"
+    //   },
+    // ]
+}
+azure.storage {
+    # containers: [
+    #     {
+    #         account_name: "clearml"
+    #         account_key: "secret"
+    #         # container_name:
+    #     }
+    # ]
+}
@@ -8,6 +8,9 @@ from apiserver.database.model import AttributedDocument
 
 class StorageType(str, Enum):
     fileserver = "fileserver"
+    s3 = "s3"
+    azure = "azure"
+    gs = "gs"
     unknown = "unknown"
 
 
@@ -32,10 +35,8 @@ class UrlToDelete(AttributedDocument):
         "strict": strict,
         "indexes": [
             ("company", "user", "task"),
-            "storage_type",
-            "created",
-            "retry_count",
-            "type",
+            ("company", "storage_type", "url"),
+            ("status", "retry_count", "storage_type"),
         ],
     }
 
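A sketch of the lookup shape the reworked indexes appear to serve; the deletion job (below) filters on storage_type and orders by retry_count (status filter omitted here, values illustrative):

from mongoengine import Q
from apiserver.database.model.url_to_delete import UrlToDelete, StorageType

url_to_delete = (
    UrlToDelete.objects(
        Q(storage_type__in=[StorageType.s3, StorageType.azure, StorageType.gs])
    )
    .order_by("retry_count")
    .limit(1)
    .first()
)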
@@ -1,3 +1,5 @@
+import os
+from abc import ABC, ABCMeta, abstractmethod
 from argparse import ArgumentParser
 from collections import defaultdict
 from datetime import datetime, timedelta
@@ -5,38 +7,28 @@ from functools import partial
 from itertools import chain
 from pathlib import Path
 from time import sleep
-from typing import Sequence, Tuple
+from typing import Sequence, Optional, Tuple, Mapping, TypeVar, Hashable, Generic
+from urllib.parse import urlparse
 
+import boto3
 import requests
+from azure.storage.blob import ContainerClient, PartialBatchErrorException
+from boltons.iterutils import bucketize, chunked_iter
 from furl import furl
+from google.cloud import storage as google_storage
 from mongoengine import Q
+from mypy_boto3_s3.service_resource import Bucket as AWSBucket
 
+from apiserver.bll.storage import StorageBLL
 from apiserver.config_repo import config
 from apiserver.database import db
-from apiserver.database.model.url_to_delete import (
-    UrlToDelete,
-    DeletionStatus,
-    StorageType,
-)
+from apiserver.database.model.url_to_delete import UrlToDelete, StorageType, DeletionStatus
 
 log = config.logger(f"JOB-{Path(__file__).name}")
 conf = config.get("services.async_urls_delete")
 max_retries = conf.get("max_retries", 3)
 retry_timeout = timedelta(seconds=conf.get("retry_timeout_sec", 60))
-fileserver_timeout = conf.get("fileserver.timeout_sec", 300)
-UrlPrefix = Tuple[str, str]
+storage_bll = StorageBLL()
 
 
-def validate_fileserver_access(fileserver_host: str) -> str:
-    fileserver_host = fileserver_host or config.get("hosts.fileserver", None)
-    if not fileserver_host:
-        log.error(f"Fileserver host not configured")
-        exit(1)
-
-    res = requests.get(url=fileserver_host)
-    res.raise_for_status()
-
-    return fileserver_host
-
-
 def mark_retry_failed(ids: Sequence[str], reason: str):
@@ -58,90 +50,163 @@ def mark_failed(query: Q, reason: str):
     )
 
 
-def delete_fileserver_urls(
-    urls_query: Q, fileserver_host: str, url_prefixes: Sequence[UrlPrefix]
-):
-    to_delete = list(UrlToDelete.objects(urls_query).limit(10000))
+def scheme_prefix(scheme: str) -> str:
+    return str(furl(scheme=scheme, netloc=""))
+
+
+T = TypeVar("T", bound=Hashable)
+
+
+class Storage(Generic[T], metaclass=ABCMeta):
+    class Client(ABC):
+        @property
+        @abstractmethod
+        def chunk_size(self) -> int:
+            pass
+
+        def get_path(self, url: UrlToDelete) -> str:
+            pass
+
+        def delete_many(
+            self, paths: Sequence[str]
+        ) -> Tuple[Sequence[str], Mapping[str, Sequence[str]]]:
+            pass
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        pass
+
+    def group_urls(
+        self, urls: Sequence[UrlToDelete]
+    ) -> Mapping[T, Sequence[UrlToDelete]]:
+        pass
+
+    def get_client(self, base: T, urls: Sequence[UrlToDelete]) -> Client:
+        pass
+
+
+def delete_urls(urls_query: Q, storage: Storage):
+    to_delete = list(UrlToDelete.objects(urls_query).order_by("url").limit(10000))
     if not to_delete:
         return
 
-    def resolve_path(url_: UrlToDelete) -> str:
-        parsed = furl(url_.url)
-        url_host = f"{parsed.scheme}://{parsed.netloc}" if parsed.scheme else None
-        url_path = str(parsed.path)
-
-        for host, path_prefix in url_prefixes:
-            if host and url_host != host:
-                continue
-            if path_prefix and not url_path.startswith(path_prefix + "/"):
-                continue
-            return url_path[len(path_prefix or ""):]
-
-        raise ValueError("could not map path")
-
-    paths = set()
-    path_to_id_mapping = defaultdict(list)
-    for url in to_delete:
-        try:
-            path = resolve_path(url)
-            path = path.strip("/")
-            if not path:
-                raise ValueError("Empty path")
-        except Exception as ex:
-            err = str(ex)
-            log.warn(f"Error getting path for {url.url}: {err}")
-            mark_failed(Q(id=url.id), err)
-            continue
-
-        paths.add(path)
-        path_to_id_mapping[path].append(url.id)
-
-    if not paths:
-        return
-
-    ids_to_delete = set(chain.from_iterable(path_to_id_mapping.values()))
-    try:
-        res = requests.post(
-            url=furl(fileserver_host).add(path="delete_many").url,
-            json={"files": list(paths)},
-            timeout=fileserver_timeout,
-        )
-        res.raise_for_status()
-    except Exception as ex:
-        err = str(ex)
-        log.warn(f"Error deleting {len(paths)} files from fileserver: {err}")
-        mark_retry_failed(list(ids_to_delete), err)
-        return
-
-    res_data = res.json()
-    deleted_ids = set(
-        chain.from_iterable(
-            path_to_id_mapping.get(path, [])
-            for path in list(res_data.get("deleted", {}))
-        )
-    )
-    if deleted_ids:
-        UrlToDelete.objects(id__in=list(deleted_ids)).delete()
-        log.info(f"{len(deleted_ids)} files deleted from the fileserver")
-
-    failed_ids = set()
-    for err, error_ids in res_data.get("errors", {}).items():
-        error_ids = list(
-            chain.from_iterable(path_to_id_mapping.get(path, []) for path in error_ids)
-        )
-        mark_retry_failed(error_ids, err)
-        log.warning(
-            f"Failed to delete {len(error_ids)} files from the fileserver due to: {err}"
-        )
-        failed_ids.update(error_ids)
-
-    missing_ids = ids_to_delete - deleted_ids - failed_ids
-    if missing_ids:
-        mark_retry_failed(list(missing_ids), "Not succeeded")
+    grouped_urls = storage.group_urls(to_delete)
+    for base, urls in grouped_urls.items():
+        if not base:
+            msg = f"Invalid {storage.name} url or missing {storage.name} configuration for account"
+            mark_failed(
+                Q(id__in=[url.id for url in urls]), msg,
+            )
+            log.warning(
+                f"Failed to delete {len(urls)} files from {storage.name} due to: {msg}"
+            )
+            continue
+
+        try:
+            client = storage.get_client(base, urls)
+        except Exception as ex:
+            failed = [url.id for url in urls]
+            mark_retry_failed(failed, reason=str(ex))
+            log.warning(
+                f"Failed to delete {len(failed)} files from {storage.name} due to: {str(ex)}"
+            )
+            continue
+
+        for chunk in chunked_iter(urls, client.chunk_size):
+            paths = []
+            path_to_id_mapping = defaultdict(list)
+            ids_to_delete = set()
+            for url in chunk:
+                try:
+                    path = client.get_path(url)
+                except Exception as ex:
+                    err = str(ex)
+                    mark_failed(Q(id=url.id), err)
+                    log.warning(f"Error getting path for {url.url}: {err}")
+                    continue
+
+                paths.append(path)
+                path_to_id_mapping[path].append(url.id)
+                ids_to_delete.add(url.id)
+
+            if not paths:
+                continue
+
+            try:
+                deleted_paths, errors = client.delete_many(paths)
+            except Exception as ex:
+                mark_retry_failed([url.id for url in urls], str(ex))
+                log.warning(
+                    f"Error deleting {len(paths)} files from {storage.name}: {str(ex)}"
+                )
+                continue
+
+            failed_ids = set()
+            for reason, err_paths in errors.items():
+                error_ids = set(
+                    chain.from_iterable(
+                        path_to_id_mapping.get(p, []) for p in err_paths
+                    )
+                )
+                mark_retry_failed(list(error_ids), reason)
+                log.warning(
+                    f"Failed to delete {len(error_ids)} files from {storage.name} storage due to: {reason}"
+                )
+                failed_ids.update(error_ids)
+
+            deleted_ids = set(
+                chain.from_iterable(
+                    path_to_id_mapping.get(p, []) for p in deleted_paths
+                )
+            )
+            if deleted_ids:
+                UrlToDelete.objects(id__in=list(deleted_ids)).delete()
+                log.info(
+                    f"{len(deleted_ids)} files deleted from {storage.name} storage"
+                )
+
+            missing_ids = ids_to_delete - deleted_ids - failed_ids
+            if missing_ids:
+                mark_retry_failed(list(missing_ids), "Not succeeded")
 
 
-def _get_fileserver_url_prefixes(fileserver_host: str) -> Sequence[UrlPrefix]:
-    def _parse_url_prefix(prefix) -> UrlPrefix:
+class FileserverStorage(Storage):
+    class Client(Storage.Client):
+        timeout = conf.get("fileserver.timeout_sec", 300)
+
+        def __init__(self, session: requests.Session, host: str):
+            self.session = session
+            self.delete_url = furl(host).add(path="delete_many").url
+
+        @property
+        def chunk_size(self) -> int:
+            return 10000
+
+        def get_path(self, url: UrlToDelete) -> str:
+            path = url.url.strip("/")
+            if not path:
+                raise ValueError("Empty path")
+
+            return path
+
+        def delete_many(
+            self, paths: Sequence[str]
+        ) -> Tuple[Sequence[str], Mapping[str, Sequence[str]]]:
+            res = self.session.post(
+                url=self.delete_url, json={"files": list(paths)}, timeout=self.timeout
+            )
+            res.raise_for_status()
+            res_data = res.json()
+            return list(res_data.get("deleted", {})), res_data.get("errors", {})
+
+    def __init__(self, company: str, fileserver_host: str = None):
+        fileserver_host = fileserver_host or config.get("hosts.fileserver", None)
+        self.host = fileserver_host.rstrip("/")
+        if not self.host:
+            log.warning(f"Fileserver host not configured")
+
+        def _parse_url_prefix(prefix) -> Tuple[str, str]:
             url = furl(prefix)
             host = f"{url.scheme}://{url.netloc}" if url.scheme else None
             return host, str(url.path).rstrip("/")
@@ -149,21 +214,353 @@ def _get_fileserver_url_prefixes(fileserver_host: str) -> Sequence[UrlPrefix]:
         url_prefixes = [
             _parse_url_prefix(p) for p in conf.get("fileserver.url_prefixes", [])
         ]
-    if not any(fileserver_host == host for host, _ in url_prefixes):
-        url_prefixes.append((fileserver_host, ""))
-
-    return url_prefixes
+        if not any(self.host == host for host, _ in url_prefixes):
+            url_prefixes.append((self.host, ""))
+        self.url_prefixes = url_prefixes
+
+        self.company = company
+
+    # @classmethod
+    # def validate_fileserver_access(cls, fileserver_host: str):
+    #     res = requests.get(
+    #         url=fileserver_host
+    #     )
+    #     res.raise_for_status()
+
+    @property
+    def name(self) -> str:
+        return "Fileserver"
+
+    def _resolve_base_url(self, url: UrlToDelete) -> Optional[str]:
+        """
+        For the url return the base_url containing schema, optional host and bucket name
+        """
+        if not url.url:
+            return None
+
+        try:
+            parsed = furl(url.url)
+            url_host = f"{parsed.scheme}://{parsed.netloc}" if parsed.scheme else None
+            url_path = str(parsed.path)
+        except Exception:
+            return None
+
+        for host, path_prefix in self.url_prefixes:
+            if host and url_host != host:
+                continue
+            if path_prefix and not url_path.startswith(path_prefix + "/"):
+                continue
+            url.url = url_path[len(path_prefix or "") :]
+            return self.host
+
+    def group_urls(
+        self, urls: Sequence[UrlToDelete]
+    ) -> Mapping[str, Sequence[UrlToDelete]]:
+        return bucketize(urls, key=self._resolve_base_url)
+
+    def get_client(self, base: str, urls: Sequence[UrlToDelete]) -> Client:
+        host = base
+        session = requests.session()
+        res = session.get(url=host, timeout=self.Client.timeout)
+        res.raise_for_status()
+
+        return self.Client(session, host)
+
+
+class AzureStorage(Storage):
+    class Client(Storage.Client):
+        def __init__(self, container: ContainerClient):
+            self.container = container
+
+        @property
+        def chunk_size(self) -> int:
+            return 256
+
+        def get_path(self, url: UrlToDelete) -> str:
+            parsed = furl(url.url)
+            if (
+                not parsed.path
+                or not parsed.path.segments
+                or len(parsed.path.segments) <= 1
+            ):
+                raise ValueError("No path found following container name")
+
+            return os.path.join(*parsed.path.segments[1:])
+
+        @staticmethod
+        def _path_from_request_url(request_url: str) -> str:
+            try:
+                return furl(request_url).path.segments[-1]
+            except Exception:
+                return request_url
+
+        def delete_many(
+            self, paths: Sequence[str]
+        ) -> Tuple[Sequence[str], Mapping[str, Sequence[str]]]:
+            try:
+                res = self.container.delete_blobs(*paths)
+            except PartialBatchErrorException as pex:
+                deleted = []
+                errors = defaultdict(list)
+                for part in pex.parts:
+                    if 300 >= part.status_code >= 200:
+                        deleted.append(self._path_from_request_url(part.request.url))
+                    else:
+                        errors[part.reason].append(
+                            self._path_from_request_url(part.request.url)
+                        )
+                return deleted, errors
+
+            return [self._path_from_request_url(part.request.url) for part in res], {}
+
+    def __init__(self, company: str):
+        self.configs = storage_bll.get_azure_settings_for_company(company)
+        self.scheme = "azure"
+
+    @property
+    def name(self) -> str:
+        return "Azure"
+
+    def _resolve_base_url(self, url: UrlToDelete) -> Optional[Tuple]:
+        """
+        For the url return the base_url containing schema, optional host and bucket name
+        """
+        try:
+            parsed = urlparse(url.url)
+            if parsed.scheme != self.scheme:
+                return None
+
+            azure_conf = self.configs.get_config_by_uri(url.url)
+            if azure_conf is None:
+                return None
+
+            account_url = parsed.netloc
+            return account_url, azure_conf.container_name
+        except Exception as ex:
+            log.warning(f"Error resolving base url for {url.url}: " + str(ex))
+            return None
+
+    def group_urls(
+        self, urls: Sequence[UrlToDelete]
+    ) -> Mapping[Tuple, Sequence[UrlToDelete]]:
+        return bucketize(urls, key=self._resolve_base_url)
+
+    def get_client(self, base: Tuple, urls: Sequence[UrlToDelete]) -> Client:
+        account_url, container_name = base
+        sample_url = urls[0].url
+        cfg = self.configs.get_config_by_uri(sample_url)
+        if not cfg or not cfg.account_name or not cfg.account_key:
+            raise ValueError(
+                f"Missing account name or key for Azure Blob Storage "
+                f"account: {account_url}, container: {container_name}"
+            )
+
+        return self.Client(
+            ContainerClient(
+                account_url=account_url,
+                container_name=cfg.container_name,
+                credential={
+                    "account_name": cfg.account_name,
+                    "account_key": cfg.account_key,
+                },
+            )
+        )
+
+
+class AWSStorage(Storage):
+    class Client(Storage.Client):
+        def __init__(self, base_url: str, container: AWSBucket):
+            self.container = container
+            self.base_url = base_url
+
+        @property
+        def chunk_size(self) -> int:
+            return 1000
+
+        def get_path(self, url: UrlToDelete) -> str:
+            """ Normalize remote path. Remove any prefix that is already handled by the container """
+            path = url.url
+            if path.startswith(self.base_url):
+                path = path[len(self.base_url) :]
+            path = path.lstrip("/")
+            return path
+
+        @staticmethod
+        def _path_from_request_url(request_url: str) -> str:
+            try:
+                return furl(request_url).path.segments[-1]
+            except Exception:
+                return request_url
+
+        def delete_many(
+            self, paths: Sequence[str]
+        ) -> Tuple[Sequence[str], Mapping[str, Sequence[str]]]:
+            res = self.container.delete_objects(
+                Delete={"Objects": [{"Key": p} for p in paths]}
+            )
+            errors = defaultdict(list)
+            for err in res.get("Errors", []):
+                msg = err.get("Message", "")
+                errors[msg].append(err.get("Key"))
+
+            return [d.get("Key") for d in res.get("Deleted", [])], errors
+
+    def __init__(self, company: str):
+        self.configs = storage_bll.get_aws_settings_for_company(company)
+        self.scheme = "s3"
+
+    @property
+    def name(self) -> str:
+        return "AWS"
+
+    def _resolve_base_url(self, url: UrlToDelete) -> Optional[str]:
+        """
+        For the url return the base_url containing schema, optional host and bucket name
+        """
+        try:
+            parsed = urlparse(url.url)
+            if parsed.scheme != self.scheme:
+                return None
+
+            s3_conf = self.configs.get_config_by_uri(url.url)
+            if s3_conf is None:
+                return None
+
+            s3_bucket = s3_conf.bucket
+            if not s3_bucket:
+                parts = Path(parsed.path.strip("/")).parts
+                if parts:
+                    s3_bucket = parts[0]
+            return "/".join(filter(None, ("s3:/", s3_conf.host, s3_bucket)))
+        except Exception as ex:
+            log.warning(f"Error resolving base url for {url.url}: " + str(ex))
+            return None
+
+    def group_urls(
+        self, urls: Sequence[UrlToDelete]
+    ) -> Mapping[str, Sequence[UrlToDelete]]:
+        return bucketize(urls, key=self._resolve_base_url)
+
+    def get_client(self, base: str, urls: Sequence[UrlToDelete]) -> Client:
+        sample_url = urls[0].url
+        cfg = self.configs.get_config_by_uri(sample_url)
+        boto_kwargs = {
+            "endpoint_url": (("https://" if cfg.secure else "http://") + cfg.host)
+            if cfg.host
+            else None,
+            "use_ssl": cfg.secure,
+            "verify": cfg.verify,
+        }
+        name = base[len(scheme_prefix(self.scheme)) :]
+        bucket_name = name[len(cfg.host) + 1 :] if cfg.host else name
+        if not cfg.use_credentials_chain:
+            if not cfg.key or not cfg.secret:
+                raise ValueError(
+                    f"Missing key or secret for AWS S3 host: {cfg.host}, bucket: {str(bucket_name)}"
+                )
+
+            boto_kwargs["aws_access_key_id"] = cfg.key
+            boto_kwargs["aws_secret_access_key"] = cfg.secret
+            if cfg.token:
+                boto_kwargs["aws_session_token"] = cfg.token
+
+        return self.Client(
+            base, boto3.resource("s3", **boto_kwargs).Bucket(bucket_name)
+        )
+
+
+class GoogleCloudStorage(Storage):
+    class Client(Storage.Client):
+        def __init__(self, base_url: str, container: google_storage.Bucket):
+            self.container = container
+            self.base_url = base_url
+
+        @property
+        def chunk_size(self) -> int:
+            return 100
+
+        def get_path(self, url: UrlToDelete) -> str:
+            """ Normalize remote path. Remove any prefix that is already handled by the container """
+            path = url.url
+            if path.startswith(self.base_url):
+                path = path[len(self.base_url) :]
+            path = path.lstrip("/")
+            return path
+
+        def delete_many(
+            self, paths: Sequence[str]
+        ) -> Tuple[Sequence[str], Mapping[str, Sequence[str]]]:
+            not_found = set()
+
+            def error_callback(blob: google_storage.Blob):
+                not_found.add(blob.name)
+
+            self.container.delete_blobs(
+                [self.container.blob(p) for p in paths], on_error=error_callback,
+            )
+            errors = {"Not found": list(not_found)} if not_found else {}
+            return list(set(paths) - not_found), errors
+
+    def __init__(self, company: str):
+        self.configs = storage_bll.get_gs_settings_for_company(company)
+        self.scheme = "gs"
+
+    @property
+    def name(self) -> str:
+        return "Google Storage"
+
+    def _resolve_base_url(self, url: UrlToDelete) -> Optional[str]:
+        """
+        For the url return the base_url containing schema, optional host and bucket name
+        """
+        try:
+            parsed = urlparse(url.url)
+            if parsed.scheme != self.scheme:
+                return None
+
+            gs_conf = self.configs.get_config_by_uri(url.url)
+            if gs_conf is None:
+                return None
+
+            return str(furl(scheme=parsed.scheme, netloc=gs_conf.bucket))
+        except Exception as ex:
+            log.warning(f"Error resolving base url for {url.url}: " + str(ex))
+            return None
+
+    def group_urls(
+        self, urls: Sequence[UrlToDelete]
+    ) -> Mapping[str, Sequence[UrlToDelete]]:
+        return bucketize(urls, key=self._resolve_base_url)
+
+    def get_client(self, base: str, urls: Sequence[UrlToDelete]) -> Client:
+        sample_url = urls[0].url
+        cfg = self.configs.get_config_by_uri(sample_url)
+        if cfg.credentials_json:
+            from google.oauth2 import service_account
+
+            credentials = service_account.Credentials.from_service_account_file(
+                cfg.credentials_json
+            )
+        else:
+            credentials = None
+
+        bucket_name = base[len(scheme_prefix(self.scheme)) :]
+        return self.Client(
+            base,
+            google_storage.Client(project=cfg.project, credentials=credentials).bucket(
+                bucket_name
+            ),
+        )
 
 
 def run_delete_loop(fileserver_host: str):
-    fileserver_host = validate_fileserver_access(fileserver_host)
-
-    storage_delete_funcs = {
+    storage_helpers = {
         StorageType.fileserver: partial(
-            delete_fileserver_urls,
-            fileserver_host=fileserver_host,
-            url_prefixes=_get_fileserver_url_prefixes(fileserver_host),
+            FileserverStorage, fileserver_host=fileserver_host
         ),
+        StorageType.s3: AWSStorage,
+        StorageType.azure: AzureStorage,
+        StorageType.gs: GoogleCloudStorage,
     }
     while True:
         now = datetime.utcnow()
@@ -177,7 +574,7 @@ def run_delete_loop(fileserver_host: str):
             )
 
         url_to_delete: UrlToDelete = UrlToDelete.objects(
-            urls_query & Q(storage_type__in=list(storage_delete_funcs))
+            urls_query & Q(storage_type__in=list(storage_helpers))
         ).order_by("retry_count").limit(1).first()
         if not url_to_delete:
            sleep(10)
@@ -192,7 +589,10 @@ def run_delete_loop(fileserver_host: str):
         company_storage_urls_query = urls_query & Q(
             company=company, storage_type=storage_type,
         )
-        storage_delete_funcs[storage_type](urls_query=company_storage_urls_query)
+        delete_urls(
+            urls_query=company_storage_urls_query,
+            storage=storage_helpers[storage_type](company=company),
+        )
 
 
 def main():
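For reference, a standalone sketch of the batched S3 delete that AWSStorage.Client.delete_many wraps above (bucket name and keys are made up; valid AWS credentials are assumed):

import boto3
from boltons.iterutils import chunked_iter

bucket = boto3.resource("s3").Bucket("my-example-bucket")  # hypothetical bucket
keys = [f"artifacts/file_{i}.bin" for i in range(2500)]    # made-up object keys

for chunk in chunked_iter(keys, 1000):  # S3 DeleteObjects accepts at most 1000 keys per call
    res = bucket.delete_objects(Delete={"Objects": [{"Key": k} for k in chunk]})
    print(len(res.get("Deleted", [])), "deleted,", len(res.get("Errors", [])), "errors")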
@@ -1,7 +1,10 @@
 attrs>=22.1.0
+azure-storage-blob>=12.13.1
 bcrypt>=3.1.4
 boltons>=19.1.0
 boto3==1.14.13
+boto3-stubs[s3]>=1.24.35
+clearml>=1.6.0,<1.7.0
 dpath>=1.4.2,<2.0
 elasticsearch==7.13.3
 fastjsonschema>=2.8
@@ -10,6 +13,8 @@ flask-cors>=3.0.5
 flask>=0.12.2
 funcsigs==1.0.2
 furl>=2.0.0
+google-cloud-storage==2.0.0
+protobuf==3.19.5
 gunicorn>=19.7.1
 humanfriendly==4.18
 jinja2==2.11.3
@@ -178,9 +178,10 @@ class TestTasksResetDelete(TestService):
         return {url1, url2}
 
     def send_debug_image_events(self, task) -> Set[str]:
+        url_pattern = "url_{num}.txt"
         events = [
             self.create_event(
-                task, "training_debug_image", iteration, url=f"url_{iteration}"
+                task, "training_debug_image", iteration, url=url_pattern.format(num=iteration)
             )
             for iteration in range(5)
         ]
@@ -7,7 +7,7 @@ download {
 }
 
 delete {
-    allow_batch: false
+    allow_batch: true
 }
 
 cors {
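Enabling allow_batch turns on the fileserver endpoint the deletion job relies on; a minimal sketch of the request FileserverStorage.Client.delete_many issues (host and file path are made up):

import requests

res = requests.post(
    "http://localhost:8081/delete_many",  # hypothetical fileserver host
    json={"files": ["company_id/task_id/metrics/url_0.txt"]},  # made-up relative paths
    timeout=300,
)
res.raise_for_status()
data = res.json()
print(data.get("deleted", {}), data.get("errors", {}))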