Mirror of https://github.com/clearml/clearml-server (synced 2025-06-26 23:15:47 +00:00)

Support chart series per single resource in workers.get_stats

Parent: 1983b22157
Commit: f3c67ac3fd
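
The commit adds a split_by_resource flag to workers.get_stats: for GPU related metric keys ("gpu_*") the response carries a per-GPU series next to the aggregated one, and the dates/values arrays move from the metric level into each aggregation entry (a new "2.32" schema version also marks split_by_variant as obsolete). A sketch of the resulting request and response shapes, with illustrative values (field names are from this commit):

# Illustrative request/response shapes for workers.get_stats after this commit.
request = {
    "items": [{"key": "gpu_usage", "aggregation": "avg"}],
    "from_date": 1700000000,
    "to_date": 1700003600,
    "interval": 60,
    "split_by_resource": True,  # new in this commit; only affects "gpu_*" keys
}
response = {
    "workers": [
        {
            "worker": "worker-1",
            "metrics": [
                {
                    "metric": "gpu_usage",
                    "stats": [
                        {
                            "aggregation": "avg",
                            "dates": [1700000000, 1700000060],  # moved here from the metric level
                            "values": [55.0, 52.5],             # aggregated over all GPUs
                            "resource_series": [                # per-GPU, only when requested
                                {"name": "0", "values": [60.0, 55.0]},
                                {"name": "1", "values": [50.0, 50.0]},
                            ],
                        }
                    ],
                }
            ],
        }
    ]
}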
--- a/apiserver/apimodels/workers.py
+++ b/apiserver/apimodels/workers.py
@@ -12,7 +12,7 @@ from jsonmodels.fields import (
 )
 from jsonmodels.models import Base
 
-from apiserver.apimodels import ListField, EnumField, JsonSerializableMixin
+from apiserver.apimodels import ListField, EnumField, JsonSerializableMixin, ActualEnumField
 from apiserver.config_repo import config
 
 
@@ -130,7 +130,7 @@ class AggregationType(Enum):
 
 class StatItem(Base):
     key = StringField(required=True)
-    aggregation = EnumField(AggregationType, default=AggregationType.avg)
+    aggregation = ActualEnumField(AggregationType, default=AggregationType.avg)
 
 
 class GetStatsRequest(StatsReportBase):
@@ -138,17 +138,24 @@ class GetStatsRequest(StatsReportBase):
         StatItem, required=True, validators=validators.Length(minimum_value=1)
     )
     split_by_variant = BoolField(default=False)
+    split_by_resource = BoolField(default=False)
 
 
+class MetricResourceSeries(Base):
+    name = StringField()
+    values = ListField(float)
+
+
 class AggregationStats(Base):
     aggregation = EnumField(AggregationType)
+    dates = ListField(int)
     values = ListField(float)
+    resource_series = ListField(MetricResourceSeries)
 
 
 class MetricStats(Base):
     metric = StringField()
     variant = StringField()
-    dates = ListField(int)
     stats = ListField(AggregationStats)
 
 
--- a/apiserver/bll/workers/stats.py
+++ b/apiserver/bll/workers/stats.py
@@ -1,8 +1,9 @@
-from operator import attrgetter
+from collections import defaultdict
+from concurrent.futures.thread import ThreadPoolExecutor
+from functools import partial
 from typing import Optional, Sequence
 
-from boltons.iterutils import bucketize
-
+from apiserver.apierrors import errors
 from apiserver.apierrors.errors import bad_request
 from apiserver.apimodels.workers import AggregationType, GetStatsRequest, StatItem
 from apiserver.bll.query import Builder as QueryBuilder
@@ -14,6 +15,7 @@ log = config.logger(__file__)
 
 class WorkerStats:
     min_chart_interval = config.get("services.workers.min_chart_interval_sec", 40)
+    _max_metrics_concurrency = config.get("services.events.events_retrieval.max_metrics_concurrency", 4)
 
     def __init__(self, es):
         self.es = es
@@ -23,7 +25,7 @@ class WorkerStats:
         """Returns the es index prefix for the company"""
         return f"worker_stats_{company_id.lower()}_"
 
-    def _search_company_stats(self, company_id: str, es_req: dict) -> dict:
+    def search_company_stats(self, company_id: str, es_req: dict) -> dict:
         return self.es.search(
             index=f"{self.worker_stats_prefix_for_company(company_id)}*",
             body=es_req,
@@ -51,7 +53,7 @@ class WorkerStats:
         if worker_ids:
             es_req["query"] = QueryBuilder.terms("worker", worker_ids)
 
-        res = self._search_company_stats(company_id, es_req)
+        res = self.search_company_stats(company_id, es_req)
 
         if not res["hits"]["total"]["value"]:
             raise bad_request.WorkerStatsNotFound(
@@ -65,6 +67,75 @@ class WorkerStats:
             for category in res["aggregations"]["categories"]["buckets"]
         }
 
+    def _get_worker_stats_per_metric(
+        self,
+        metric_item: StatItem,
+        company_id: str,
+        from_date: float,
+        to_date: float,
+        interval: int,
+        split_by_resource: bool,
+        worker_ids: Sequence[str],
+    ):
+        agg_types_to_es = {
+            AggregationType.avg: "avg",
+            AggregationType.min: "min",
+            AggregationType.max: "max",
+        }
+        agg = {
+            metric_item.aggregation.value: {
+                agg_types_to_es[metric_item.aggregation]: {"field": "value", "missing": 0.0}
+            }
+        }
+        split_by_resource = split_by_resource and metric_item.key.startswith("gpu_")
+        if split_by_resource:
+            split_aggs = {"split": {"terms": {"field": "variant"}, "aggs": agg}}
+        else:
+            split_aggs = {}
+
+        es_req = {
+            "size": 0,
+            "aggs": {
+                "workers": {
+                    "terms": {"field": "worker"},
+                    "aggs": {
+                        "dates": {
+                            "date_histogram": {
+                                "field": "timestamp",
+                                "fixed_interval": f"{interval}s",
+                                "extended_bounds": {
+                                    "min": int(from_date) * 1000,
+                                    "max": int(to_date) * 1000,
+                                },
+                            },
+                            "aggs": {
+                                **agg,
+                                **split_aggs,
+                            },
+                        }
+                    },
+                }
+            },
+        }
+
+        query_terms = [
+            QueryBuilder.dates_range(from_date, to_date),
+            QueryBuilder.term("metric", metric_item.key),
+        ]
+        if worker_ids:
+            query_terms.append(QueryBuilder.terms("worker", worker_ids))
+        es_req["query"] = {"bool": {"must": query_terms}}
+
+        with translate_errors_context():
+            data = self.search_company_stats(company_id, es_req)
+
+        cutoff_date = (
+            to_date - 0.9 * interval
+        ) * 1000  # do not return the point for the incomplete last interval
+        return self._extract_results(
+            data, metric_item, split_by_resource, cutoff_date
+        )
+
     def get_worker_stats(self, company_id: str, request: GetStatsRequest) -> dict:
         """
         Get statistics for company workers metrics in the specified time range
@@ -76,123 +147,90 @@ class WorkerStats:
         from_date = request.from_date
         to_date = request.to_date
         if from_date >= to_date:
-            raise bad_request.FieldsValueError("from_date must be less than to_date")
-        interval = max(request.interval, self.min_chart_interval)
-
-        def get_dates_agg() -> dict:
-            es_to_agg_types = (
-                ("avg", AggregationType.avg.value),
-                ("min", AggregationType.min.value),
-                ("max", AggregationType.max.value),
-            )
-
-            return {
-                "dates": {
-                    "date_histogram": {
-                        "field": "timestamp",
-                        "fixed_interval": f"{interval}s",
-                        "extended_bounds": {
-                            "min": int(from_date) * 1000,
-                            "max": int(to_date) * 1000,
-                        }
-                    },
-                    "aggs": {
-                        agg_type: {es_agg: {"field": "value"}}
-                        for es_agg, agg_type in es_to_agg_types
-                    },
-                }
-            }
-
-        def get_variants_agg() -> dict:
-            return {
-                "variants": {"terms": {"field": "variant"}, "aggs": get_dates_agg()}
-            }
-
-        es_req = {
-            "size": 0,
-            "aggs": {
-                "workers": {
-                    "terms": {"field": "worker"},
-                    "aggs": {
-                        "metrics": {
-                            "terms": {"field": "metric"},
-                            "aggs": get_variants_agg()
-                            if request.split_by_variant
-                            else get_dates_agg(),
-                        }
-                    },
-                }
-            },
-        }
-
-        query_terms = [
-            QueryBuilder.dates_range(from_date, to_date),
-            QueryBuilder.terms("metric", {item.key for item in request.items}),
-        ]
-        if request.worker_ids:
-            query_terms.append(QueryBuilder.terms("worker", request.worker_ids))
-        es_req["query"] = {"bool": {"must": query_terms}}
-
-        with translate_errors_context():
-            data = self._search_company_stats(company_id, es_req)
-
-        cutoff_date = (to_date - 0.9 * interval) * 1000  # do not return the point for the incomplete last interval
-        return self._extract_results(data, request.items, request.split_by_variant, cutoff_date)
+            raise errors.bad_request.FieldsValueError(
+                "from_date must be less than to_date"
+            )
+
+        interval = max(request.interval, self.min_chart_interval)
+        with ThreadPoolExecutor(self._max_metrics_concurrency) as pool:
+            res = list(
+                pool.map(
+                    partial(
+                        self._get_worker_stats_per_metric,
+                        company_id=company_id,
+                        from_date=from_date,
+                        to_date=to_date,
+                        interval=interval,
+                        split_by_resource=request.split_by_resource,
+                        worker_ids=request.worker_ids,
+                    ),
+                    request.items,
+                )
+            )
+
+        ret = defaultdict(lambda: defaultdict(dict))
+        for workers in res:
+            for worker, metrics in workers.items():
+                for metric, stats in metrics.items():
+                    ret[worker][metric].update(stats)
+
+        return ret
 
     @staticmethod
     def _extract_results(
-        data: dict, request_items: Sequence[StatItem], split_by_variant: bool, cutoff_date
+        data: dict,
+        metric_item: StatItem,
+        split_by_resource: bool,
+        cutoff_date,
     ) -> dict:
         """
         Clean results returned from elastic search (remove "aggregations", "buckets" etc.),
         leave only aggregation types requested by the user and return a clean dictionary
         :param data: aggregation data retrieved from ES
-        :param request_items: aggs types requested by the user
-        :param split_by_variant: if False then aggregate by metric type, otherwise metric type + variant
         """
         if "aggregations" not in data:
             return {}
 
-        items_by_key = bucketize(request_items, key=attrgetter("key"))
-        aggs_per_metric = {
-            key: [item.aggregation for item in items]
-            for key, items in items_by_key.items()
-        }
-
-        def extract_date_stats(date: dict, metric_key) -> dict:
+        def extract_metric_results(metric: dict) -> dict:
+            aggregation = metric_item.aggregation.value
+            date_buckets = metric["dates"]["buckets"]
+            length = len(date_buckets)
+            while length > 0 and date_buckets[length - 1]["key"] >= cutoff_date:
+                length -= 1
+
+            dates = [None] * length
+            agg_values = [0.0] * length
+            resource_series = defaultdict(lambda: [0.0] * length)
+
+            for idx in range(0, length):
+                date = date_buckets[idx]
+                dates[idx] = date["key"]
+                if aggregation in date:
+                    agg_values[idx] = date[aggregation]["value"] or 0.0
+
+                if split_by_resource and "split" in date:
+                    for resource in date["split"]["buckets"]:
+                        series = resource_series[resource["key"]]
+                        if aggregation in resource:
+                            series[idx] = resource[aggregation]["value"] or 0.0
+
+            if len(resource_series) == 1:
+                resource_series = {}
+
             return {
-                "date": date["key"],
-                "count": date["doc_count"],
-                **{agg: date[agg]["value"] or 0.0 for agg in aggs_per_metric[metric_key]},
+                "dates": dates,
+                "values": agg_values,
+                **(
+                    {"resource_series": resource_series} if resource_series else {}
+                ),
             }
 
-        def extract_metric_results(
-            metric_or_variant: dict, metric_key: str
-        ) -> Sequence[dict]:
-            return [
-                extract_date_stats(date, metric_key)
-                for date in metric_or_variant["dates"]["buckets"]
-                if date["key"] <= cutoff_date
-            ]
-
-        def extract_variant_results(metric: dict) -> dict:
-            metric_key = metric["key"]
-            return {
-                variant["key"]: extract_metric_results(variant, metric_key)
-                for variant in metric["variants"]["buckets"]
-            }
-
-        def extract_worker_results(worker: dict) -> dict:
-            return {
-                metric["key"]: extract_variant_results(metric)
-                if split_by_variant
-                else extract_metric_results(metric, metric["key"])
-                for metric in worker["metrics"]["buckets"]
-            }
-
         return {
-            worker["key"]: extract_worker_results(worker)
+            worker["key"]: {
+                metric_item.key: {
+                    metric_item.aggregation.value: extract_metric_results(worker)
+                }
+            }
             for worker in data["aggregations"]["workers"]["buckets"]
         }
 
@@ -237,7 +275,7 @@ class WorkerStats:
         }
 
         with translate_errors_context():
-            data = self._search_company_stats(company_id, es_req)
+            data = self.search_company_stats(company_id, es_req)
 
         if "aggregations" not in data:
             return {}
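
With this refactoring, get_worker_stats issues one Elasticsearch query per requested stat item (up to _max_metrics_concurrency of them in parallel) and merges the per-metric results by worker. A sketch of the merged mapping it returns, with illustrative values (the shape follows the ret[worker][metric][aggregation] nesting built above):

# Sketch of the dict returned by WorkerStats.get_worker_stats (illustrative values).
# "resource_series" is present only for "gpu_*" keys when split_by_resource is set
# and more than one resource reported; internally it maps resource name -> values.
ret = {
    "worker-1": {
        "gpu_usage": {
            "avg": {
                "dates": [1700000000000, 1700000060000],  # ES date_histogram keys (ms)
                "values": [55.0, 52.5],
                "resource_series": {"0": [60.0, 55.0], "1": [50.0, 50.0]},
            }
        }
    }
}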
--- a/apiserver/config/default/services/workers.conf
+++ b/apiserver/config/default/services/workers.conf
@@ -3,3 +3,7 @@ default_cluster_timeout_sec: 600
 
 # The minimal sampling interval for resource dashboard and worker activity charts
 min_chart_interval_sec: 40
+
+stats {
+    max_metrics_concurrency: 4
+}
--- a/apiserver/schema/services/workers.conf
+++ b/apiserver/schema/services/workers.conf
@@ -15,6 +15,26 @@ _definitions {
             }
         }
     }
+    worker_stat_key {
+        type: string
+        enum: [
+            cpu_usage
+            cpu_temperature
+            memory_used
+            memory_free
+            gpu_usage
+            gpu_temperature
+            gpu_fraction
+            gpu_memory_free
+            gpu_memory_used
+            network_tx
+            network_rx
+            disk_free_home
+            disk_free_temp
+            disk_read
+            disk_write
+        ]
+    }
     aggregation_type {
         type: string
         enum: [ avg, min, max ]
@@ -23,8 +43,7 @@ _definitions {
     stat_item {
         type: object
         properties {
-            key {
-                type: string
+            key: ${_definitions.worker_stat_key} {
                 description: "Name of a metric"
             }
             category {
@@ -38,6 +57,30 @@ _definitions {
             aggregation {
                 "$ref": "#/definitions/aggregation_type"
             }
+            dates {
+                type: array
+                description: "List of timestamps (in seconds from epoch) in the acceding order. The timestamps are separated by the requested interval. Timestamps where no workers activity was recorded are omitted."
+                items { type: integer }
+            }
+            values {
+                type: array
+                description: "List of values corresponding to the dates in metric statistics"
+                items { type: number }
+            }
+            resource_series {
+                type: array
+                description: "Metric data per single resource. Return only if split_by_resource request parameter is set to True"
+                items {"$ref": "#/definitions/metric_resource_series"}
+            }
+        }
+    }
+    metric_resource_series {
+        type: object
+        properties {
+            name {
+                type: string
+                description: Resource name
+            }
             values {
                 type: array
                 description: "List of values corresponding to the dates in metric statistics"
@@ -56,11 +99,6 @@ _definitions {
                 type: string
                 description: "Name of the metric component. Set only if 'split_by_variant' was set in the request"
             }
-            dates {
-                type: array
-                description: "List of timestamps (in seconds from epoch) in the acceding order. The timestamps are separated by the requested interval. Timestamps where no workers activity was recorded are omitted."
-                items { type: integer }
-            }
             stats {
                 type: array
                 description: "Statistics data by type"
@@ -482,6 +520,20 @@ get_stats {
             }
         }
     }
+    "2.32": ${get_stats."2.4"} {
+        request.properties {
+            split_by_variant {
+                description: "Obsolete, please do not use"
+                type: boolean
+                default: false
+            }
+            split_by_resource {
+                type: boolean
+                default: false
+                description: If set then for GPU related keys return the per GPU charts in addition to the aggregated one
+            }
+        }
+    }
 }
 get_activity_report {
     "2.4" {
--- a/apiserver/services/workers.py
+++ b/apiserver/services/workers.py
@@ -1,9 +1,3 @@
-import itertools
-from operator import attrgetter
-from typing import Optional, Sequence, Union
-
-from boltons.iterutils import bucketize
-
 from apiserver.apierrors.errors import bad_request
 from apiserver.apimodels.workers import (
     WorkerRequest,
@@ -23,6 +17,7 @@ from apiserver.apimodels.workers import (
     GetActivityReportResponse,
     ActivityReportSeries,
     GetCountRequest,
+    MetricResourceSeries,
 )
 from apiserver.bll.workers import WorkerBLL
 from apiserver.config_repo import config
@@ -163,71 +158,47 @@ def get_activity_report(
 @endpoint(
     "workers.get_stats",
     min_version="2.4",
-    request_data_model=GetStatsRequest,
     response_data_model=GetStatsResponse,
     validate_schema=True,
 )
 def get_stats(call: APICall, company_id, request: GetStatsRequest):
     ret = worker_bll.stats.get_worker_stats(company_id, request)
 
-    def _get_variant_metric_stats(
-        metric: str,
-        agg_names: Sequence[str],
-        stats: Sequence[dict],
-        variant: Optional[str] = None,
-    ) -> MetricStats:
-        stat_by_name = extract_properties_to_lists(agg_names, stats)
-        return MetricStats(
-            metric=metric,
-            variant=variant,
-            dates=stat_by_name["date"],
-            stats=[
-                AggregationStats(aggregation=name, values=aggs)
-                for name, aggs in stat_by_name.items()
-                if name != "date"
-            ],
-        )
-
-    def _get_metric_stats(
-        metric: str, stats: Union[dict, Sequence[dict]], agg_types: Sequence[str]
-    ) -> Sequence[MetricStats]:
-        """
-        Return statistics for a certain metric or a list of statistic for
-        metric variants if break_by_variant was requested
-        """
-        agg_names = ["date"] + list(set(agg_types))
-        if not isinstance(stats, dict):
-            # no variants were requested
-            return [_get_variant_metric_stats(metric, agg_names, stats)]
-
-        return [
-            _get_variant_metric_stats(metric, agg_names, variant_stats, variant)
-            for variant, variant_stats in stats.items()
-        ]
-
-    def _get_worker_metrics(stats: dict) -> Sequence[MetricStats]:
-        """
-        Convert the worker statistics data from the internal format of lists of structs
-        to a more "compact" format for json transfer (arrays of dates and arrays of values)
-        """
-        # removed metrics that were requested but for some reason
-        # do not exist in stats data
-        metrics = [metric for metric in request.items if metric.key in stats]
-
-        aggs_by_metric = bucketize(
-            metrics, key=attrgetter("key"), value_transform=attrgetter("aggregation")
-        )
-
-        return list(
-            itertools.chain.from_iterable(
-                _get_metric_stats(metric, metric_stats, aggs_by_metric[metric])
-                for metric, metric_stats in stats.items()
-            )
-        )
+    def _get_agg_stats(
+        aggregation: str,
+        stats: dict,
+    ) -> AggregationStats:
+        resource_series = []
+        if "resource_series" in stats:
+            for name, values in stats["resource_series"].items():
+                resource_series.append(
+                    MetricResourceSeries(
+                        name=name,
+                        values=values
+                    )
+                )
+        return AggregationStats(
+            aggregation=aggregation,
+            dates=stats["dates"],
+            values=stats["values"],
+            resource_series=resource_series,
+        )
 
     return GetStatsResponse(
         workers=[
-            WorkerStatistics(worker=worker, metrics=_get_worker_metrics(stats))
-            for worker, stats in ret.items()
+            WorkerStatistics(
+                worker=worker,
+                metrics=[
+                    MetricStats(
+                        metric=metric,
+                        stats=[
+                            _get_agg_stats(aggregation, a_stats)
+                            for aggregation, a_stats in m_stats.items()
+                        ]
+                    )
+                    for metric, m_stats in w_stats.items()
+                ],
+            )
+            for worker, w_stats in ret.items()
         ]
     )
--- a/apiserver/tests/automated/test_workers.py
+++ b/apiserver/tests/automated/test_workers.py
@@ -1,3 +1,4 @@
+import statistics
 import time
 from uuid import uuid4
 from typing import Sequence
@@ -83,7 +84,7 @@ class TestWorkersService(TestService):
         self._check_exists(test_worker, False, tags=["test"])
         self._check_exists(test_worker, False, tags=["-application"])
 
-    def _simulate_workers(self, start: int) -> Sequence[str]:
+    def _simulate_workers(self, start: int, with_gpu: bool = False) -> dict:
         """
         Two workers writing the same metrics. One for 4 seconds. Another one for 2
         The first worker reports a task
@@ -93,20 +94,25 @@ class TestWorkersService(TestService):
         task_id = self._create_running_task(task_name="task-1")
 
         workers = [f"test_{uuid4().hex}", f"test_{uuid4().hex}"]
-        workers_stats = [
+        if with_gpu:
+            gpu_usage = [dict(gpu_usage=[60, 70]), dict(gpu_usage=[40])]
+        else:
+            gpu_usage = [{}, {}]
+
+        worker_stats = [
             (
-                dict(cpu_usage=[10, 20], memory_used=50),
-                dict(cpu_usage=[5], memory_used=30),
+                dict(cpu_usage=[10, 20], memory_used=50, **gpu_usage[0]),
+                dict(cpu_usage=[5], memory_used=30, **gpu_usage[1]),
             )
         ] * 4
-        workers_activity = [
+        worker_activity = [
             (workers[0], workers[1]),
             (workers[0], workers[1]),
             (workers[0],),
             (workers[0],),
         ]
         timestamp = start * 1000
-        for ws, stats in zip(workers_activity, workers_stats):
+        for ws, stats in zip(worker_activity, worker_stats):
             for w, s in zip(ws, stats):
                 data = dict(
                     worker=w,
@@ -118,7 +124,10 @@ class TestWorkersService(TestService):
                 self.api.workers.status_report(**data)
                 timestamp += 60*1000
 
-        return workers
+        return {
+            w: s
+            for w, s in zip(workers, worker_stats[0])
+        }
 
     def _create_running_task(self, task_name):
         task_input = dict(name=task_name, type="testing")
@@ -131,7 +140,7 @@ class TestWorkersService(TestService):
     def test_get_keys(self):
         workers = self._simulate_workers(int(time.time()))
         time.sleep(5)  # give to es time to refresh
-        res = self.api.workers.get_metric_keys(worker_ids=workers)
+        res = self.api.workers.get_metric_keys(worker_ids=list(workers))
         assert {"cpu", "memory"} == set(c.name for c in res["categories"])
         assert all(
             c.metric_keys == ["cpu_usage"] for c in res["categories"] if c.name == "cpu"
@@ -147,7 +156,7 @@ class TestWorkersService(TestService):
 
     def test_get_stats(self):
         start = int(time.time())
-        workers = self._simulate_workers(start)
+        workers = self._simulate_workers(start, with_gpu=True)
 
         time.sleep(5)  # give to ES time to refresh
         from_date = start
@@ -157,49 +166,72 @@ class TestWorkersService(TestService):
             items=[
                 dict(key="cpu_usage", aggregation="avg"),
                 dict(key="cpu_usage", aggregation="max"),
+                dict(key="gpu_usage", aggregation="avg"),
+                dict(key="gpu_usage", aggregation="max"),
                 dict(key="memory_used", aggregation="max"),
-                dict(key="memory_used", aggregation="min"),
             ],
             from_date=from_date,
             to_date=to_date,
             # split_by_variant=True,
             interval=1,
-            worker_ids=workers,
+            worker_ids=list(workers),
         )
-        self.assertWorkersInStats(workers, res.workers)
+        self.assertWorkersInStats(list(workers), res.workers)
         for worker in res.workers:
             self.assertEqual(
                 set(metric.metric for metric in worker.metrics),
-                {"cpu_usage", "memory_used"},
+                {"cpu_usage", "gpu_usage", "memory_used"},
             )
 
         for worker in res.workers:
+            worker_id = worker.worker
             for metric, metric_stats in zip(
-                worker.metrics, ({"avg", "max"}, {"max", "min"})
+                worker.metrics, ({"avg", "max"}, {"avg", "max"}, {"max"})
             ):
+                metric_name = metric.metric
                 self.assertEqual(
                     set(stat.aggregation for stat in metric.stats), metric_stats
                 )
-                self.assertTrue(11 >= len(metric.dates) >= 10)
+                for stat in metric.stats:
+                    expected = workers[worker_id][metric_name]
+                    self.assertTrue(11 >= len(stat.dates) >= 10)
+                    self.assertFalse(stat.get("resource_series"))
+                    agg = stat.aggregation
+                    if isinstance(expected, list):
+                        if agg == "avg":
+                            val = statistics.mean(expected)
+                        elif agg == "min":
+                            val = min(expected)
+                        else:
+                            val = max(expected)
+                    else:
+                        val = expected
+                    self.assertEqual(set(stat["values"]), {val, 0})
 
-        # split by variants
+        # split by resources
         res = self.api.workers.get_stats(
-            items=[dict(key="cpu_usage", aggregation="avg")],
+            items=[dict(key="gpu_usage", aggregation="avg")],
            from_date=from_date,
            to_date=to_date,
-            split_by_variant=True,
+            split_by_resource=True,
            interval=1,
-            worker_ids=workers,
+            worker_ids=list(workers),
        )
-        self.assertWorkersInStats(workers, res.workers)
+        self.assertWorkersInStats(list(workers), res.workers)
 
         for worker in res.workers:
+            worker_id = worker.worker
             for metric in worker.metrics:
-                self.assertEqual(
-                    set(metric.variant for metric in worker.metrics),
-                    {"0", "1"} if worker.worker == workers[0] else {"0"},
-                )
-                self.assertTrue(11 >= len(metric.dates) >= 10)
+                metric_name = metric.metric
+                for stat in metric.stats:
+                    expected = workers[worker_id][metric_name]
+                    if metric_name.startswith("gpu") and len(expected) > 1:
+                        resource_series = stat.get("resource_series")
+                        self.assertEqual(len(resource_series), len(expected))
+                        for rs, value in zip(resource_series, expected):
+                            self.assertEqual(set(rs["values"]), {value, 0})
+                    else:
+                        self.assertEqual(stat.get("resource_series"), [])
 
         res = self.api.workers.get_stats(
             items=[dict(key="cpu_usage", aggregation="avg")],
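
A minimal client-side sketch of consuming the new series; it assumes the ClearML Python SDK's APIClient and a server carrying this commit (the client call is illustrative, not part of the change):

# Query per-GPU usage series through the updated workers.get_stats endpoint.
import time
from clearml.backend_api.session.client import APIClient

client = APIClient()
to_date = int(time.time())
res = client.workers.get_stats(
    items=[dict(key="gpu_usage", aggregation="avg")],
    from_date=to_date - 3600,
    to_date=to_date,
    interval=60,
    split_by_resource=True,  # request the per-GPU split added by this commit
)
for worker in res.workers:
    for metric in worker.metrics:
        for stat in metric.stats:
            # the aggregated series
            print(worker.worker, metric.metric, stat.aggregation, stat.values)
            # per-GPU series, returned only for "gpu_*" keys when requested
            for series in getattr(stat, "resource_series", None) or []:
                print("  gpu", series.name, series.values)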