From ee9f45ea613dbd2e8edc955e1fd28ddceeb6cfd4 Mon Sep 17 00:00:00 2001 From: clearml <> Date: Thu, 5 Dec 2024 22:17:13 +0200 Subject: [PATCH] Optimize MongoDB indices usage for large dbs --- apiserver/bll/model/__init__.py | 2 +- apiserver/bll/organization/tags_cache.py | 4 ++-- apiserver/bll/project/project_bll.py | 6 +++--- apiserver/bll/project/project_queries.py | 2 +- apiserver/bll/queue/queue_bll.py | 2 +- apiserver/bll/task/hyperparams.py | 2 +- apiserver/database/model/base.py | 2 +- apiserver/database/model/model.py | 12 ++++++++++-- apiserver/database/model/task/task.py | 14 ++++++++++++-- apiserver/database/utils.py | 4 ++-- apiserver/services/models.py | 16 +++++++++++++--- 11 files changed, 47 insertions(+), 19 deletions(-) diff --git a/apiserver/bll/model/__init__.py b/apiserver/bll/model/__init__.py index b3951f6..5a4367b 100644 --- a/apiserver/bll/model/__init__.py +++ b/apiserver/bll/model/__init__.py @@ -186,7 +186,7 @@ class ModelBLL: [ { "$match": { - "company": {"$in": [None, "", company]}, + "company": {"$in": ["", company]}, "_id": {"$in": model_ids}, } }, diff --git a/apiserver/bll/organization/tags_cache.py b/apiserver/bll/organization/tags_cache.py index 8196290..3fd6abe 100644 --- a/apiserver/bll/organization/tags_cache.py +++ b/apiserver/bll/organization/tags_cache.py @@ -43,8 +43,8 @@ class _TagsCache: query &= GetMixin.get_list_field_query(name, vals) if project: query &= Q(project__in=project_ids_with_children([project])) - else: - query &= Q(system_tags__nin=[EntityVisibility.hidden.value]) + # else: + # query &= Q(system_tags__nin=[EntityVisibility.hidden.value]) return self.db_cls.objects(query).distinct(field) diff --git a/apiserver/bll/project/project_bll.py b/apiserver/bll/project/project_bll.py index a500c19..4bc3a68 100644 --- a/apiserver/bll/project/project_bll.py +++ b/apiserver/bll/project/project_bll.py @@ -1015,8 +1015,8 @@ class ProjectBLL: if include_subprojects: projects = _ids_with_children(projects) query &= Q(project__in=projects) - else: - query &= Q(system_tags__nin=[EntityVisibility.hidden.value]) + # else: + # query &= Q(system_tags__nin=[EntityVisibility.hidden.value]) if state == EntityVisibility.archived: query &= Q(system_tags__in=[EntityVisibility.archived.value]) @@ -1101,7 +1101,7 @@ class ProjectBLL: project_field: str = "project", ): conditions = { - "company": {"$in": [None, "", company]}, + "company": {"$in": ["", company]}, project_field: {"$in": project_ids}, } if users: diff --git a/apiserver/bll/project/project_queries.py b/apiserver/bll/project/project_queries.py index 5fd05b9..b4c96d2 100644 --- a/apiserver/bll/project/project_queries.py +++ b/apiserver/bll/project/project_queries.py @@ -47,7 +47,7 @@ class ProjectQueries: @staticmethod def _get_company_constraint(company_id: str, allow_public: bool = True) -> dict: if allow_public: - return {"company": {"$in": [None, "", company_id]}} + return {"company": {"$in": ["", company_id]}} return {"company": company_id} diff --git a/apiserver/bll/queue/queue_bll.py b/apiserver/bll/queue/queue_bll.py index 1e67ad0..fc5aac6 100644 --- a/apiserver/bll/queue/queue_bll.py +++ b/apiserver/bll/queue/queue_bll.py @@ -525,7 +525,7 @@ class QueueBLL(object): [ { "$match": { - "company": {"$in": [None, "", company]}, + "company": {"$in": ["", company]}, "_id": queue_id, } }, diff --git a/apiserver/bll/task/hyperparams.py b/apiserver/bll/task/hyperparams.py index eae25b2..de7b71e 100644 --- a/apiserver/bll/task/hyperparams.py +++ b/apiserver/bll/task/hyperparams.py @@ -193,7 +193,7 @@ class HyperParams: pipeline = [ { "$match": { - "company": {"$in": [None, "", company_id]}, + "company": {"$in": ["", company_id]}, "_id": {"$in": task_ids}, } }, diff --git a/apiserver/database/model/base.py b/apiserver/database/model/base.py index 238b53c..3443e4b 100644 --- a/apiserver/database/model/base.py +++ b/apiserver/database/model/base.py @@ -1394,7 +1394,7 @@ class DbModelMixin(GetMixin, ProperDictMixin, UpdateMixin): else: items = list( cls.objects( - id__in=ids, company__in=(None, ""), company_origin=company_id + id__in=ids, company="", company_origin=company_id ).only("id") ) update: dict = dict(set__company=company_id, unset__company_origin=1) diff --git a/apiserver/database/model/model.py b/apiserver/database/model/model.py index 7516312..9cf3bd1 100644 --- a/apiserver/database/model/model.py +++ b/apiserver/database/model/model.py @@ -37,10 +37,18 @@ class Model(AttributedDocument): "project", "task", "last_update", - ("company", "framework"), + ("company", "last_update"), ("company", "name"), - ("company", "user"), ("company", "uri"), + # distinct queries support + ("company", "tags"), + ("company", "system_tags"), + ("company", "project", "tags"), + ("company", "project", "system_tags"), + ("company", "user"), + ("company", "project", "user"), + ("company", "framework"), + ("company", "project", "framework"), { "name": "%s.model.main_text_index" % Database.backend, "fields": ["$name", "$id", "$comment", "$parent", "$task", "$project"], diff --git a/apiserver/database/model/task/task.py b/apiserver/database/model/task/task.py index 14f5c16..15b87f0 100644 --- a/apiserver/database/model/task/task.py +++ b/apiserver/database/model/task/task.py @@ -183,9 +183,8 @@ class Task(AttributedDocument): "status_changed", "models.input.model", ("company", "name"), - ("company", "user"), ("company", "status", "type"), - ("company", "system_tags", "last_update"), + ("company", "last_update", "system_tags"), ("company", "type", "system_tags", "status"), ("company", "project", "type", "system_tags", "status"), ("status", "last_update"), # for maintenance tasks @@ -193,6 +192,17 @@ class Task(AttributedDocument): "fields": ["company", "project"], "collation": AttributedDocument._numeric_locale, }, + # distinct queries support + ("company", "tags"), + ("company", "system_tags"), + ("company", "project", "tags"), + ("company", "project", "system_tags"), + ("company", "user"), + ("company", "project", "user"), + ("company", "parent"), + ("company", "project", "parent"), + ("company", "type"), + ("company", "project", "type"), { "name": "%s.task.main_text_index" % Database.backend, "fields": [ diff --git a/apiserver/database/utils.py b/apiserver/database/utils.py index 7ee28c7..de4e57e 100644 --- a/apiserver/database/utils.py +++ b/apiserver/database/utils.py @@ -121,8 +121,8 @@ def init_cls_from_base(cls, instance): ) -def get_company_or_none_constraint(company=None): - return Q(company__in=(company, None, "")) | Q(company__exists=False) +def get_company_or_none_constraint(company=""): + return Q(company__in=list({company, ""})) def field_does_not_exist(field: str, empty_value=None, is_list=False) -> Q: diff --git a/apiserver/services/models.py b/apiserver/services/models.py index af7b63a..436773a 100644 --- a/apiserver/services/models.py +++ b/apiserver/services/models.py @@ -188,7 +188,12 @@ def get_all(call: APICall, company_id, _): def get_frameworks(call: APICall, company_id, request: GetFrameworksRequest): call.result.data = { "frameworks": sorted( - project_bll.get_model_frameworks(company_id, project_ids=request.projects) + filter( + None, + project_bll.get_model_frameworks( + company_id, project_ids=request.projects + ), + ) ) } @@ -590,7 +595,10 @@ def _delete_model_events( ) event_urls = delete_task_events_and_collect_urls( - company=company_id, task_ids=model_ids, model=True, wait_for_delete=sync_delete + company=company_id, + task_ids=model_ids, + model=True, + wait_for_delete=sync_delete, ) if event_urls: schedule_for_delete( @@ -601,7 +609,9 @@ def _delete_model_events( can_delete_folders=False, ) - event_bll.delete_task_events(company_id, model_ids, model=True, wait_for_delete=sync_delete) + event_bll.delete_task_events( + company_id, model_ids, model=True, wait_for_delete=sync_delete + ) @endpoint("models.delete", request_data_model=DeleteModelRequest)