Add ElasticSearch 8.x support

Author: allegroai
Date: 2024-03-18 15:37:44 +02:00
parent 6a1fc04d1e
commit 88abf28287
20 changed files with 249 additions and 55 deletions

View File

@@ -44,7 +44,6 @@ from apiserver.database.errors import translate_errors_context
 from apiserver.database.model.task.task import Task, TaskStatus
 from apiserver.redis_manager import redman
 from apiserver.service_repo.auth import Identity
-from apiserver.tools import safe_get
 from apiserver.utilities.dicts import nested_get
 from apiserver.utilities.json import loads
@@ -661,8 +660,8 @@ class EventBLL(object):
         Return events and next scroll id from the scrolled query
         Release the scroll once it is exhausted
         """
-        total_events = safe_get(es_res, "hits/total/value", default=0)
-        events = [doc["_source"] for doc in safe_get(es_res, "hits/hits", default=[])]
+        total_events = nested_get(es_res, ("hits", "total", "value"), default=0)
+        events = [doc["_source"] for doc in nested_get(es_res, ("hits", "hits"), default=[])]
         next_scroll_id = es_res.get("_scroll_id")
         if next_scroll_id and not events:
             self.clear_scroll(next_scroll_id)
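Both changed lines swap the dpath-style safe_get helper for nested_get, which takes an explicit key tuple instead of a slash-delimited path string. As a rough sketch only (not the actual implementation in apiserver/utilities/dicts.py), the lookup behaves roughly like:

    from typing import Any, Mapping, Sequence

    def nested_get(dct: Mapping, path: Sequence[str], default: Any = None) -> Any:
        # Walk the mapping one key at a time; fall back to `default`
        # as soon as any level is missing or is not a mapping.
        node = dct
        for key in path:
            if not isinstance(node, Mapping) or key not in node:
                return default
            node = node[key]
        return node

    # Before: safe_get(es_res, "hits/total/value", default=0)
    # After:  nested_get(es_res, ("hits", "total", "value"), default=0)
    print(nested_get({"hits": {"total": {"value": 7}}}, ("hits", "total", "value"), default=0))  # 7

The same substitution repeats across the files below, so only this one example is shown.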

View File

@@ -9,7 +9,7 @@ from elasticsearch import Elasticsearch
 from apiserver.config_repo import config
 from apiserver.database.errors import translate_errors_context
 from apiserver.database.model.task.task import Task
-from apiserver.tools import safe_get
+from apiserver.utilities.dicts import nested_get
 class EventType(Enum):
@@ -123,8 +123,8 @@ def get_max_metric_and_variant_counts(
         es, company_id=company_id, event_type=event_type, body=es_req, **kwargs,
     )
-    metrics_count = safe_get(
-        es_res, "aggregations/metrics_count/value", max_metrics_count
+    metrics_count = nested_get(
+        es_res, ("aggregations", "metrics_count", "value"), max_metrics_count
     )
     if not metrics_count:
         return max_metrics_count, max_variants_count

View File

@@ -24,7 +24,7 @@ from apiserver.bll.event.scalar_key import ScalarKey, ScalarKeyEnum
 from apiserver.bll.query import Builder as QueryBuilder
 from apiserver.config_repo import config
 from apiserver.database.errors import translate_errors_context
-from apiserver.tools import safe_get
+from apiserver.utilities.dicts import nested_get
 log = config.logger(__file__)
@@ -342,12 +342,12 @@ class EventMetrics:
         total amount of intervals does not exceeds the samples
         Return the interval and resulting amount of intervals
         """
-        count = safe_get(data, "count/value", default=0)
+        count = nested_get(data, ("count", "value"), default=0)
         if count < samples:
             return metric, variant, 1, count
-        min_index = safe_get(data, "min_index/value", default=0)
-        max_index = safe_get(data, "max_index/value", default=min_index)
+        min_index = nested_get(data, ("min_index", "value"), default=0)
+        max_index = nested_get(data, ("max_index", "value"), default=min_index)
         index_range = max_index - min_index + 1
         interval = max(1, math.ceil(float(index_range) / samples))
         max_samples = math.ceil(float(index_range) / interval)
@@ -592,5 +592,5 @@ class EventMetrics:
         return [
             metric["key"]
-            for metric in safe_get(es_res, "aggregations/metrics/buckets", default=[])
+            for metric in nested_get(es_res, ("aggregations", "metrics", "buckets"), default=[])
         ]

View File

@@ -6,7 +6,6 @@ from operator import itemgetter
 from typing import Sequence, Tuple, Optional, Mapping, Callable
 import attr
-import dpath
 from boltons.iterutils import first
 from elasticsearch import Elasticsearch
 from jsonmodels.fields import StringField, ListField, IntField
@@ -27,6 +26,7 @@ from apiserver.config_repo import config
 from apiserver.database.errors import translate_errors_context
 from apiserver.database.model.task.metrics import MetricEventStats
 from apiserver.database.model.task.task import Task
+from apiserver.utilities.dicts import nested_get
 class VariantState(Base):
@@ -305,13 +305,13 @@ class MetricEventsIterator:
         return [
             MetricState(
                 metric=metric["key"],
-                timestamp=dpath.get(metric, "last_event_timestamp/value"),
+                timestamp=nested_get(metric, ("last_event_timestamp", "value")),
                 variants=[
                     init_variant_state(variant)
-                    for variant in dpath.get(metric, "variants/buckets")
+                    for variant in nested_get(metric, ("variants", "buckets"))
                 ],
             )
-            for metric in dpath.get(es_res, "aggregations/metrics/buckets")
+            for metric in nested_get(es_res, ("aggregations", "metrics", "buckets"))
         ]
     @abc.abstractmethod
@@ -430,14 +430,14 @@ class MetricEventsIterator:
         def get_iteration_events(it_: dict) -> Sequence:
             return [
                 self._process_event(ev["_source"])
-                for m in dpath.get(it_, "metrics/buckets")
-                for v in dpath.get(m, "variants/buckets")
-                for ev in dpath.get(v, "events/hits/hits")
+                for m in nested_get(it_, ("metrics", "buckets"))
+                for v in nested_get(m, ("variants", "buckets"))
+                for ev in nested_get(v, ("events", "hits", "hits"))
                 if is_valid_event(ev["_source"])
             ]
         iterations = []
-        for it in dpath.get(es_res, "aggregations/iters/buckets"):
+        for it in nested_get(es_res, ("aggregations", "iters", "buckets")):
            events = get_iteration_events(it)
            if events:
                iterations.append({"iter": it["key"], "events": events})

View File

@@ -18,7 +18,7 @@ from apiserver.config.info import get_deployment_type
 from apiserver.database.model import Company, User
 from apiserver.database.model.queue import Queue
 from apiserver.database.model.task.task import Task
-from apiserver.tools import safe_get
+from apiserver.utilities.dicts import nested_get
 from apiserver.utilities.json import dumps
 from apiserver.version import __version__ as current_version
 from .resource_monitor import ResourceMonitor, stat_threads
@@ -162,7 +162,7 @@ class StatisticsReporter:
         def _get_cardinality_fields(categories: Sequence[dict]) -> dict:
             names = {"cpu": "num_cores"}
             return {
-                names[c["key"]]: safe_get(c, "count/value")
+                names[c["key"]]: nested_get(c, ("count", "value"))
                 for c in categories
                 if c["key"] in names
             }
@@ -175,21 +175,21 @@ class StatisticsReporter:
             }
             return {
                 names[m["key"]]: {
-                    "min": safe_get(m, "min/value"),
-                    "max": safe_get(m, "max/value"),
-                    "avg": safe_get(m, "avg/value"),
+                    "min": nested_get(m, ("min", "value")),
+                    "max": nested_get(m, ("max", "value")),
+                    "avg": nested_get(m, ("avg", "value")),
                 }
                 for m in metrics
                 if m["key"] in names
             }
-        buckets = safe_get(res, "aggregations/workers/buckets", default=[])
+        buckets = nested_get(res, ("aggregations", "workers", "buckets"), default=[])
         return {
             b["key"]: {
                 key: {
                     "interval_sec": agent_resource_threshold_sec,
-                    **_get_cardinality_fields(safe_get(b, "categories/buckets", [])),
-                    **_get_metric_fields(safe_get(b, "metrics/buckets", [])),
+                    **_get_cardinality_fields(nested_get(b, ("categories", "buckets"), [])),
+                    **_get_metric_fields(nested_get(b, ("metrics", "buckets"), [])),
                 }
             }
             for b in buckets
@@ -227,7 +227,7 @@ class StatisticsReporter:
             },
         }
         res = cls._run_worker_stats_query(company_id, es_req)
-        buckets = safe_get(res, "aggregations/workers/buckets", default=[])
+        buckets = nested_get(res, ("aggregations", "workers", "buckets"), default=[])
         return {
             b["key"]: {"last_activity_time": b["last_activity_time"]["value"]}
             for b in buckets

View File

@@ -27,10 +27,9 @@ from apiserver.database.model.project import Project
 from apiserver.database.model.queue import Queue
 from apiserver.database.model.task.task import Task
 from apiserver.redis_manager import redman
-from apiserver.tools import safe_get
+from apiserver.utilities.dicts import nested_get
 from .stats import WorkerStats
 log = config.logger(__file__)
@@ -287,7 +286,7 @@ class WorkerBLL:
                 filter(
                     None,
                     (
-                        safe_get(info, "next_entry/task")
+                        nested_get(info, ("next_entry", "task"))
                         for info in queues_info.values()
                     ),
                 )
@@ -311,7 +310,7 @@ class WorkerBLL:
                 continue
             entry.name = info.get("name", None)
             entry.num_tasks = info.get("num_entries", 0)
-            task_id = safe_get(info, "next_entry/task")
+            task_id = nested_get(info, ("next_entry", "task"))
            if task_id:
                task = tasks_info.get(task_id, None)
                entry.next_task = IdNameEntry(

View File

@@ -2,10 +2,9 @@ fileserver = "http://localhost:8081"
 elastic {
     events {
-        hosts: [{host: "127.0.0.1", port: 9200}]
+        hosts: [{host: "127.0.0.1", port: 9200, scheme: http}]
         args {
             timeout: 60
-            dead_timeout: 10
             max_retries: 3
             retry_on_timeout: true
         }
@@ -13,10 +12,9 @@ elastic {
     }
     workers {
-        hosts: [{host:"127.0.0.1", port:9200}]
+        hosts: [{host:"127.0.0.1", port:9200, scheme: http}]
         args {
             timeout: 60
-            dead_timeout: 10
             max_retries: 3
             retry_on_timeout: true
         }
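Two things change in the default connection settings: every host entry now carries an explicit scheme, which the 8.x Python client requires in node definitions, and the 7.x-era dead_timeout pool option is dropped because the new elastic_transport layer no longer recognizes it. A minimal sketch of the client such a config produces (values mirror the defaults above, not necessarily a real deployment):

    from elasticsearch import Elasticsearch

    # Illustrative values; the apiserver builds this from the HOCON config above.
    es = Elasticsearch(
        hosts=[{"host": "127.0.0.1", "port": 9200, "scheme": "http"}],
        request_timeout=60,   # the config's `timeout` corresponds to the request timeout
        max_retries=3,
        retry_on_timeout=True,
    )
    print(es.ping())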

View File

@@ -5,7 +5,7 @@ from textwrap import shorten
 import dpath
 from dpath.exceptions import InvalidKeyName
-from elasticsearch import ElasticsearchException
+from elastic_transport import TransportError, ApiError
 from elasticsearch.helpers import BulkIndexError
 from jsonmodels.errors import ValidationError as JsonschemaValidationError
 from mongoengine.errors import (
@@ -210,9 +210,9 @@ def translate_errors_context(message=None, **kwargs):
         raise errors.bad_request.ValidationError(e.args[0])
     except BulkIndexError as e:
         ElasticErrorsHandler.bulk_error(e, message, **kwargs)
-    except ElasticsearchException as e:
+    except (TransportError, ApiError) as e:
         raise errors.server_error.DataError(e, message, **kwargs)
     except InvalidKeyName:
         raise errors.server_error.DataError("invalid empty key encountered in data")
-    except Exception as ex:
+    except Exception:
         raise
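The 7.x ElasticsearchException catch-all no longer exists in the 8.x client, so the handler now catches the two 8.x base classes instead: TransportError for connection, timeout, and serialization failures, and ApiError for error responses returned by the cluster. A rough usage sketch, assuming a cluster on localhost:

    from elastic_transport import ApiError, TransportError
    from elasticsearch import Elasticsearch

    # API errors are HTTP responses from the cluster (4xx/5xx, e.g. a missing index);
    # transport errors are connection/timeout/serialization failures.
    es = Elasticsearch("http://127.0.0.1:9200")
    try:
        es.indices.get(index="no-such-index")
    except ApiError as e:
        print("api error:", e.meta.status)
    except TransportError as e:
        print("transport error:", e)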

View File

@@ -4,12 +4,15 @@ Apply elasticsearch mappings to given hosts.
 """
 import argparse
 import json
+import logging
 from pathlib import Path
 from typing import Optional, Sequence, Tuple
 from elasticsearch import Elasticsearch
 HERE = Path(__file__).resolve().parent
+logging.getLogger('elasticsearch').setLevel(logging.WARNING)
+logging.getLogger('elastic_transport').setLevel(logging.WARNING)
@@ -17,6 +20,20 @@ def apply_mappings_to_cluster(
 ):
     """Hosts maybe a sequence of strings or dicts in the form {"host": <host>, "port": <port>}"""
+    def _send_component_template(ct_file):
+        with ct_file.open() as json_data:
+            body = json.load(json_data)
+            template_name = f"{ct_file.stem}"
+            res = es.cluster.put_component_template(name=template_name, body=body)
+            return {"component_template": template_name, "result": res}
+    def _send_index_template(it_file):
+        with it_file.open() as json_data:
+            body = json.load(json_data)
+            template_name = f"{it_file.stem}"
+            res = es.indices.put_index_template(name=template_name, body=body)
+            return {"index_template": template_name, "result": res}
     def _send_template(f):
         with f.open() as json_data:
             data = json.load(json_data)
@@ -24,14 +41,30 @@ def apply_mappings_to_cluster(
             res = es.indices.put_template(name=template_name, body=data)
             return {"mapping": template_name, "result": res}
-    p = HERE / "mappings"
-    if key:
-        files = (p / key).glob("*.json")
-    else:
-        files = p.glob("**/*.json")
     es = Elasticsearch(hosts=hosts, http_auth=http_auth, **(es_args or {}))
-    return [_send_template(f) for f in files]
+    p = HERE / "index_templates"
+    if key:
+        folders = [p / key]
+    else:
+        folders = [
+            f for f in p.iterdir() if f.is_dir()
+        ]
+    ret = []
+    for f in folders:
+        for ct in (f / "component_templates").glob("*.json"):
+            ret.append(_send_component_template(ct))
+        for it in f.glob("*.json"):
+            ret.append(_send_index_template(it))
+    return ret
+    # p = HERE / "mappings"
+    # if key:
+    #     files = (p / key).glob("*.json")
+    # else:
+    #     files = p.glob("**/*.json")
+    #
+    # return [_send_template(f) for f in files]
 def parse_args():
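Instead of pushing legacy put_template mappings from a mappings/ folder, the script now walks sub-folders of index_templates/, registering each JSON file under component_templates/ as a component template and each top-level JSON file as an index template; the file stem becomes the template name, and component templates are sent first so that the index templates composing them can be applied. A hedged usage sketch (keyword names are inferred from the function body above, and "events" is a hypothetical sub-folder name):

    from apiserver.elastic.apply_mappings import apply_mappings_to_cluster

    # Hosts mirror the default config; "events" stands in for one of the
    # sub-folders under index_templates/.
    results = apply_mappings_to_cluster(
        hosts=[{"host": "127.0.0.1", "port": 9200, "scheme": "http"}],
        key="events",
    )
    for r in results:
        print(r)  # {"component_template": ..., "result": ...} or {"index_template": ..., "result": ...}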

View File

@@ -0,0 +1,48 @@
{
  "template": {
    "settings": {
      "number_of_replicas": 0,
      "number_of_shards": 1
    },
    "mappings": {
      "_source": {
        "enabled": true
      },
      "properties": {
        "@timestamp": {
          "type": "date"
        },
        "task": {
          "type": "keyword"
        },
        "type": {
          "type": "keyword"
        },
        "worker": {
          "type": "keyword"
        },
        "timestamp": {
          "type": "date"
        },
        "iter": {
          "type": "long"
        },
        "metric": {
          "type": "keyword"
        },
        "variant": {
          "type": "keyword"
        },
        "value": {
          "type": "float"
        },
        "company_id": {
          "type": "keyword"
        },
        "model_event": {
          "type": "boolean"
        }
      }
    }
  }
}

View File

@@ -0,0 +1,18 @@
{
  "index_patterns": "events-log-*",
  "template": {
    "mappings": {
      "properties": {
        "msg": {
          "type": "text",
          "index": false
        },
        "level": {
          "type": "keyword"
        }
      }
    }
  },
  "priority": 500,
  "composed_of": ["events_common"]
}

View File

@@ -0,0 +1,18 @@
{
  "index_patterns": "events-plot-*",
  "template": {
    "mappings": {
      "properties": {
        "plot_str": {
          "type": "text",
          "index": false
        },
        "plot_data": {
          "type": "binary"
        }
      }
    }
  },
  "priority": 500,
  "composed_of": ["events_common"]
}

View File

@@ -0,0 +1,17 @@
{
  "index_patterns": "events-training_debug_image-*",
  "template": {
    "mappings": {
      "properties": {
        "key": {
          "type": "keyword"
        },
        "url": {
          "type": "keyword"
        }
      }
    }
  },
  "priority": 500,
  "composed_of": ["events_common"]
}
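Each of the event index templates above declares "composed_of": ["events_common"], so any new index matching its pattern receives the shared fields (task, metric, variant, value, timestamps, and so on) from the common component template, presumably the 48-line template added earlier in this commit, merged with the per-type properties. A sketch of checking the merged mapping on a running cluster; the index name is illustrative:

    from elasticsearch import Elasticsearch

    # Assumes the templates above were already applied (e.g. via apply_mappings_to_cluster).
    es = Elasticsearch("http://127.0.0.1:9200")

    # Creating an index that matches "events-log-*" pulls in the composed mapping.
    es.indices.create(index="events-log-company123")
    mapping = es.indices.get_mapping(index="events-log-company123")
    props = mapping["events-log-company123"]["mappings"]["properties"]
    print(sorted(props))  # expected to list the common fields plus "msg" and "level"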

View File

@@ -0,0 +1,26 @@
{
  "template": {
    "mappings": {
      "_source": {
        "enabled": true
      },
      "properties": {
        "timestamp": {
          "type": "date"
        },
        "queue": {
          "type": "keyword"
        },
        "average_waiting_time": {
          "type": "float"
        },
        "queue_length": {
          "type": "integer"
        },
        "company_id": {
          "type": "keyword"
        }
      }
    }
  }
}

View File

@@ -0,0 +1,38 @@
{
  "template": {
    "mappings": {
      "_source": {
        "enabled": true
      },
      "properties": {
        "timestamp": {
          "type": "date"
        },
        "worker": {
          "type": "keyword"
        },
        "category": {
          "type": "keyword"
        },
        "metric": {
          "type": "keyword"
        },
        "variant": {
          "type": "keyword"
        },
        "value": {
          "type": "float"
        },
        "unit": {
          "type": "keyword"
        },
        "task": {
          "type": "keyword"
        },
        "company_id": {
          "type": "keyword"
        }
      }
    }
  }
}

View File

@@ -10,6 +10,8 @@ from apiserver.config_repo import config
 from apiserver.elastic.apply_mappings import apply_mappings_to_cluster
 log = config.logger(__file__)
+logging.getLogger('elasticsearch').setLevel(logging.WARNING)
+logging.getLogger('elastic_transport').setLevel(logging.WARNING)
 class MissingElasticConfiguration(Exception):

View File

@@ -1,3 +1,4 @@
+import logging
 from datetime import datetime
 from functools import lru_cache
 from os import getenv
@@ -9,6 +10,8 @@ from elasticsearch import Elasticsearch
 from apiserver.config_repo import config
 log = config.logger(__file__)
+logging.getLogger('elasticsearch').setLevel(logging.WARNING)
+logging.getLogger('elastic_transport').setLevel(logging.WARNING)
 OVERRIDE_HOST_ENV_KEY = (
     "CLEARML_ELASTIC_SERVICE_HOST",
@@ -32,6 +35,7 @@ if OVERRIDE_HOST:
 OVERRIDE_PORT = first(filter(None, map(getenv, OVERRIDE_PORT_ENV_KEY)))
 if OVERRIDE_PORT:
+    OVERRIDE_PORT = int(OVERRIDE_PORT)
     log.info(f"Using override elastic port {OVERRIDE_PORT}")
 OVERRIDE_USERNAME = first(filter(None, map(getenv, OVERRIDE_USERNAME_ENV_KEY)))
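The added int() conversion matters because environment variables always arrive as strings, and the 8.x client is stricter about node definitions, so the port should be an integer by the time it lands in the host dict. A tiny sketch of the pattern (the environment variable name here is hypothetical, not necessarily the server's actual key):

    from os import getenv
    from elasticsearch import Elasticsearch

    # Env vars are strings; convert before building the 8.x host dict.
    port = getenv("ELASTIC_SERVICE_PORT", "9200")  # hypothetical variable name
    es = Elasticsearch(hosts=[{"host": "127.0.0.1", "port": int(port), "scheme": "http"}])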

View File

@@ -6,7 +6,7 @@ boto3>=1.26
 boto3-stubs[s3]>=1.26
 clearml>=1.10.3
 dpath>=1.4.2,<2.0
-elasticsearch==7.17.9
+elasticsearch==8.12.0
 fastjsonschema>=2.8
 flask-compress>=1.4.0
 flask-cors>=3.0.5
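With the client pinned to 8.12.0, the cluster it talks to is expected to be on an 8.x release as well. A quick sanity check after upgrading (host is illustrative):

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://127.0.0.1:9200")
    print("client expects 8.x, server reports:", es.info()["version"]["number"])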

View File

@@ -49,13 +49,10 @@ services:
       cluster.routing.allocation.disk.watermark.low: 500mb
       cluster.routing.allocation.disk.watermark.high: 500mb
       cluster.routing.allocation.disk.watermark.flood_stage: 500mb
-      discovery.zen.minimum_master_nodes: "1"
       discovery.type: "single-node"
       http.compression_level: "7"
-      node.ingest: "true"
       node.name: clearml
-      reindex.remote.whitelist: '*.*'
-      xpack.monitoring.enabled: "false"
+      reindex.remote.whitelist: "'*.*'"
       xpack.security.enabled: "false"
     ulimits:
       memlock:

View File

@@ -49,13 +49,10 @@ services:
       cluster.routing.allocation.disk.watermark.low: 500mb
       cluster.routing.allocation.disk.watermark.high: 500mb
       cluster.routing.allocation.disk.watermark.flood_stage: 500mb
-      discovery.zen.minimum_master_nodes: "1"
       discovery.type: "single-node"
       http.compression_level: "7"
-      node.ingest: "true"
       node.name: clearml
-      reindex.remote.whitelist: '*.*'
-      xpack.monitoring.enabled: "false"
+      reindex.remote.whitelist: "'*.*'"
       xpack.security.enabled: "false"
     ulimits:
       memlock: