mirror of
https://github.com/clearml/clearml-server
synced 2025-06-26 23:15:47 +00:00
Support chart series per single resource in workers.get_stats
This commit is contained in:
parent
1983b22157
commit
f3c67ac3fd
@@ -12,7 +12,7 @@ from jsonmodels.fields import (
 )
 from jsonmodels.models import Base

-from apiserver.apimodels import ListField, EnumField, JsonSerializableMixin
+from apiserver.apimodels import ListField, EnumField, JsonSerializableMixin, ActualEnumField
 from apiserver.config_repo import config
@@ -130,7 +130,7 @@ class AggregationType(Enum):

 class StatItem(Base):
     key = StringField(required=True)
-    aggregation = EnumField(AggregationType, default=AggregationType.avg)
+    aggregation = ActualEnumField(AggregationType, default=AggregationType.avg)


 class GetStatsRequest(StatsReportBase):
@@ -138,17 +138,24 @@ class GetStatsRequest(StatsReportBase):
         StatItem, required=True, validators=validators.Length(minimum_value=1)
     )
     split_by_variant = BoolField(default=False)
+    split_by_resource = BoolField(default=False)
+
+
+class MetricResourceSeries(Base):
+    name = StringField()
+    values = ListField(float)


 class AggregationStats(Base):
     aggregation = EnumField(AggregationType)
     dates = ListField(int)
     values = ListField(float)
+    resource_series = ListField(MetricResourceSeries)


 class MetricStats(Base):
     metric = StringField()
     variant = StringField()
     dates = ListField(int)
     stats = ListField(AggregationStats)
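Taken together, the request model now accepts a split_by_resource flag alongside the existing split_by_variant, and the response models gain a per-resource series. A minimal sketch of a client payload exercising the new flag (worker ids, timestamps, and the transport are assumptions, not part of this commit):

# Illustrative workers.get_stats payload matching the request model above.
# All concrete values (worker ids, dates) are hypothetical.
get_stats_payload = {
    "worker_ids": ["test_worker_a", "test_worker_b"],
    "from_date": 1700000000,
    "to_date": 1700003600,
    "interval": 60,
    "split_by_resource": True,  # new field: per-GPU series for gpu_* keys
    "items": [
        {"key": "gpu_usage", "aggregation": "avg"},
        {"key": "gpu_usage", "aggregation": "max"},
    ],
}

if __name__ == "__main__":
    import json

    print(json.dumps(get_stats_payload, indent=2))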
@@ -1,8 +1,9 @@
-from operator import attrgetter
+from collections import defaultdict
+from concurrent.futures.thread import ThreadPoolExecutor
+from functools import partial
 from typing import Optional, Sequence

-from boltons.iterutils import bucketize
-
+from apiserver.apierrors import errors
 from apiserver.apierrors.errors import bad_request
 from apiserver.apimodels.workers import AggregationType, GetStatsRequest, StatItem
 from apiserver.bll.query import Builder as QueryBuilder
@@ -14,6 +15,7 @@ log = config.logger(__file__)

 class WorkerStats:
     min_chart_interval = config.get("services.workers.min_chart_interval_sec", 40)
+    _max_metrics_concurrency = config.get("services.workers.stats.max_metrics_concurrency", 4)

     def __init__(self, es):
         self.es = es
@@ -23,7 +25,7 @@ class WorkerStats:
         """Returns the es index prefix for the company"""
         return f"worker_stats_{company_id.lower()}_"

-    def _search_company_stats(self, company_id: str, es_req: dict) -> dict:
+    def search_company_stats(self, company_id: str, es_req: dict) -> dict:
         return self.es.search(
             index=f"{self.worker_stats_prefix_for_company(company_id)}*",
             body=es_req,
@@ -51,7 +53,7 @@ class WorkerStats:
         if worker_ids:
             es_req["query"] = QueryBuilder.terms("worker", worker_ids)

-        res = self._search_company_stats(company_id, es_req)
+        res = self.search_company_stats(company_id, es_req)

         if not res["hits"]["total"]["value"]:
             raise bad_request.WorkerStatsNotFound(
@@ -65,6 +67,75 @@ class WorkerStats:
             for category in res["aggregations"]["categories"]["buckets"]
         }

+    def _get_worker_stats_per_metric(
+        self,
+        metric_item: StatItem,
+        company_id: str,
+        from_date: float,
+        to_date: float,
+        interval: int,
+        split_by_resource: bool,
+        worker_ids: Sequence[str],
+    ):
+        agg_types_to_es = {
+            AggregationType.avg: "avg",
+            AggregationType.min: "min",
+            AggregationType.max: "max",
+        }
+        agg = {
+            metric_item.aggregation.value: {
+                agg_types_to_es[metric_item.aggregation]: {"field": "value", "missing": 0.0}
+            }
+        }
+        split_by_resource = split_by_resource and metric_item.key.startswith("gpu_")
+        if split_by_resource:
+            split_aggs = {"split": {"terms": {"field": "variant"}, "aggs": agg}}
+        else:
+            split_aggs = {}
+
+        es_req = {
+            "size": 0,
+            "aggs": {
+                "workers": {
+                    "terms": {"field": "worker"},
+                    "aggs": {
+                        "dates": {
+                            "date_histogram": {
+                                "field": "timestamp",
+                                "fixed_interval": f"{interval}s",
+                                "extended_bounds": {
+                                    "min": int(from_date) * 1000,
+                                    "max": int(to_date) * 1000,
+                                },
+                            },
+                            "aggs": {
+                                **agg,
+                                **split_aggs,
+                            },
+                        }
+                    },
+                }
+            },
+        }
+
+        query_terms = [
+            QueryBuilder.dates_range(from_date, to_date),
+            QueryBuilder.term("metric", metric_item.key),
+        ]
+        if worker_ids:
+            query_terms.append(QueryBuilder.terms("worker", worker_ids))
+        es_req["query"] = {"bool": {"must": query_terms}}
+
+        with translate_errors_context():
+            data = self.search_company_stats(company_id, es_req)
+
+        cutoff_date = (
+            to_date - 0.9 * interval
+        ) * 1000  # do not return the point for the incomplete last interval
+        return self._extract_results(
+            data, metric_item, split_by_resource, cutoff_date
+        )
+
     def get_worker_stats(self, company_id: str, request: GetStatsRequest) -> dict:
         """
         Get statistics for company workers metrics in the specified time range
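The new per-metric query keeps the aggregated series and the optional per-resource split side by side under the same date bucket, merged with dict unpacking. A self-contained sketch of the aggregation body this produces for a hypothetical gpu_usage/avg item with split_by_resource enabled (the 60-second interval is invented; the query and extended_bounds sections are omitted for brevity):

# Standalone reconstruction of the aggregation body built by
# _get_worker_stats_per_metric for a single metric item (illustrative values).
interval = 60  # hypothetical chart interval, in seconds
agg = {"avg": {"avg": {"field": "value", "missing": 0.0}}}
split_aggs = {"split": {"terms": {"field": "variant"}, "aggs": agg}}

es_req = {
    "size": 0,
    "aggs": {
        "workers": {
            "terms": {"field": "worker"},
            "aggs": {
                "dates": {
                    "date_histogram": {
                        "field": "timestamp",
                        "fixed_interval": f"{interval}s",
                    },
                    # the aggregated series and the per-resource split live
                    # under the same date bucket
                    "aggs": {**agg, **split_aggs},
                }
            },
        }
    },
}

if __name__ == "__main__":
    import json

    print(json.dumps(es_req, indent=2))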
@@ -76,123 +147,90 @@ class WorkerStats:
         from_date = request.from_date
         to_date = request.to_date
         if from_date >= to_date:
-            raise bad_request.FieldsValueError("from_date must be less than to_date")
-
-        interval = max(request.interval, self.min_chart_interval)
-
-        def get_dates_agg() -> dict:
-            es_to_agg_types = (
-                ("avg", AggregationType.avg.value),
-                ("min", AggregationType.min.value),
-                ("max", AggregationType.max.value),
-            )
-
-            return {
-                "dates": {
-                    "date_histogram": {
-                        "field": "timestamp",
-                        "fixed_interval": f"{interval}s",
-                        "extended_bounds": {
-                            "min": int(from_date) * 1000,
-                            "max": int(to_date) * 1000,
-                        }
-                    },
-                    "aggs": {
-                        agg_type: {es_agg: {"field": "value"}}
-                        for es_agg, agg_type in es_to_agg_types
-                    },
-                }
-            }
-
-        def get_variants_agg() -> dict:
-            return {
-                "variants": {"terms": {"field": "variant"}, "aggs": get_dates_agg()}
-            }
-
-        es_req = {
-            "size": 0,
-            "aggs": {
-                "workers": {
-                    "terms": {"field": "worker"},
-                    "aggs": {
-                        "metrics": {
-                            "terms": {"field": "metric"},
-                            "aggs": get_variants_agg()
-                            if request.split_by_variant
-                            else get_dates_agg(),
-                        }
-                    },
-                }
-            },
-        }
-
-        query_terms = [
-            QueryBuilder.dates_range(from_date, to_date),
-            QueryBuilder.terms("metric", {item.key for item in request.items}),
-        ]
-        if request.worker_ids:
-            query_terms.append(QueryBuilder.terms("worker", request.worker_ids))
-        es_req["query"] = {"bool": {"must": query_terms}}
-
-        with translate_errors_context():
-            data = self._search_company_stats(company_id, es_req)
-
-        cutoff_date = (to_date - 0.9 * interval) * 1000  # do not return the point for the incomplete last interval
-        return self._extract_results(data, request.items, request.split_by_variant, cutoff_date)
+            raise errors.bad_request.FieldsValueError(
+                "from_date must be less than to_date"
+            )
+
+        interval = max(request.interval, self.min_chart_interval)
+        with ThreadPoolExecutor(self._max_metrics_concurrency) as pool:
+            res = list(
+                pool.map(
+                    partial(
+                        self._get_worker_stats_per_metric,
+                        company_id=company_id,
+                        from_date=from_date,
+                        to_date=to_date,
+                        interval=interval,
+                        split_by_resource=request.split_by_resource,
+                        worker_ids=request.worker_ids,
+                    ),
+                    request.items,
+                )
+            )
+
+        ret = defaultdict(lambda: defaultdict(dict))
+        for workers in res:
+            for worker, metrics in workers.items():
+                for metric, stats in metrics.items():
+                    ret[worker][metric].update(stats)
+
+        return ret

     @staticmethod
     def _extract_results(
-        data: dict, request_items: Sequence[StatItem], split_by_variant: bool, cutoff_date
+        data: dict,
+        metric_item: StatItem,
+        split_by_resource: bool,
+        cutoff_date,
     ) -> dict:
         """
         Clean results returned from elastic search (remove "aggregations", "buckets" etc.),
         leave only aggregation types requested by the user and return a clean dictionary
         :param data: aggregation data retrieved from ES
-        :param request_items: aggs types requested by the user
-        :param split_by_variant: if False then aggregate by metric type, otherwise metric type + variant
         """
         if "aggregations" not in data:
             return {}

-        items_by_key = bucketize(request_items, key=attrgetter("key"))
-        aggs_per_metric = {
-            key: [item.aggregation for item in items]
-            for key, items in items_by_key.items()
-        }
-
-        def extract_date_stats(date: dict, metric_key) -> dict:
-            return {
-                "date": date["key"],
-                "count": date["doc_count"],
-                **{agg: date[agg]["value"] or 0.0 for agg in aggs_per_metric[metric_key]},
-            }
-
-        def extract_metric_results(
-            metric_or_variant: dict, metric_key: str
-        ) -> Sequence[dict]:
-            return [
-                extract_date_stats(date, metric_key)
-                for date in metric_or_variant["dates"]["buckets"]
-                if date["key"] <= cutoff_date
-            ]
-
-        def extract_variant_results(metric: dict) -> dict:
-            metric_key = metric["key"]
-            return {
-                variant["key"]: extract_metric_results(variant, metric_key)
-                for variant in metric["variants"]["buckets"]
-            }
-
-        def extract_worker_results(worker: dict) -> dict:
-            return {
-                metric["key"]: extract_variant_results(metric)
-                if split_by_variant
-                else extract_metric_results(metric, metric["key"])
-                for metric in worker["metrics"]["buckets"]
-            }
+        def extract_metric_results(metric: dict) -> dict:
+            aggregation = metric_item.aggregation.value
+            date_buckets = metric["dates"]["buckets"]
+            length = len(date_buckets)
+            while length > 0 and date_buckets[length - 1]["key"] >= cutoff_date:
+                length -= 1
+
+            dates = [None] * length
+            agg_values = [0.0] * length
+            resource_series = defaultdict(lambda: [0.0] * length)
+
+            for idx in range(0, length):
+                date = date_buckets[idx]
+                dates[idx] = date["key"]
+                if aggregation in date:
+                    agg_values[idx] = date[aggregation]["value"] or 0.0
+
+                if split_by_resource and "split" in date:
+                    for resource in date["split"]["buckets"]:
+                        series = resource_series[resource["key"]]
+                        if aggregation in resource:
+                            series[idx] = resource[aggregation]["value"] or 0.0
+
+            if len(resource_series) == 1:
+                resource_series = {}
+
+            return {
+                "dates": dates,
+                "values": agg_values,
+                **(
+                    {"resource_series": resource_series} if resource_series else {}
+                ),
+            }

         return {
-            worker["key"]: extract_worker_results(worker)
+            worker["key"]: {
+                metric_item.key: {
+                    metric_item.aggregation.value: extract_metric_results(worker)
+                }
+            }
             for worker in data["aggregations"]["workers"]["buckets"]
         }
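Each entry of res above holds the result for one requested metric item, keyed worker -> metric -> aggregation; get_worker_stats folds them into a single nested mapping. A minimal, runnable illustration of that fold with invented sample data:

from collections import defaultdict

# Two per-item results, shaped as _get_worker_stats_per_metric returns them
# (sample data; dates and values are invented and shortened).
res = [
    {"worker-1": {"cpu_usage": {"avg": {"dates": [1, 2], "values": [10.0, 20.0]}}}},
    {"worker-1": {"cpu_usage": {"max": {"dates": [1, 2], "values": [15.0, 25.0]}}}},
]

ret = defaultdict(lambda: defaultdict(dict))
for workers in res:
    for worker, metrics in workers.items():
        for metric, stats in metrics.items():
            # different aggregations of the same worker/metric merge into one dict
            ret[worker][metric].update(stats)

assert set(ret["worker-1"]["cpu_usage"]) == {"avg", "max"}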
@@ -237,7 +275,7 @@ class WorkerStats:
         }

         with translate_errors_context():
-            data = self._search_company_stats(company_id, es_req)
+            data = self.search_company_stats(company_id, es_req)

         if "aggregations" not in data:
             return {}
@@ -3,3 +3,7 @@ default_cluster_timeout_sec: 600

# The minimal sampling interval for resource dashboard and worker activity charts
min_chart_interval_sec: 40
+
+stats {
+    max_metrics_concurrency: 4
+}
@@ -15,6 +15,26 @@ _definitions {
             }
         }
     }
+    worker_stat_key {
+        type: string
+        enum: [
+            cpu_usage
+            cpu_temperature
+            memory_used
+            memory_free
+            gpu_usage
+            gpu_temperature
+            gpu_fraction
+            gpu_memory_free
+            gpu_memory_used
+            network_tx
+            network_rx
+            disk_free_home
+            disk_free_temp
+            disk_read
+            disk_write
+        ]
+    }
     aggregation_type {
         type: string
         enum: [ avg, min, max ]
@@ -23,8 +43,7 @@ _definitions {
     stat_item {
         type: object
         properties {
-            key {
-                type: string
+            key: ${_definitions.worker_stat_key} {
                 description: "Name of a metric"
             }
             category {
@@ -38,6 +57,30 @@ _definitions {
             aggregation {
                 "$ref": "#/definitions/aggregation_type"
             }
+            dates {
+                type: array
+                description: "List of timestamps (in seconds from epoch) in ascending order. The timestamps are separated by the requested interval. Timestamps where no worker activity was recorded are omitted."
+                items { type: integer }
+            }
+            values {
+                type: array
+                description: "List of values corresponding to the dates in the metric statistics"
+                items { type: number }
+            }
+            resource_series {
+                type: array
+                description: "Metric data per single resource. Returned only if the split_by_resource request parameter is set to true"
+                items { "$ref": "#/definitions/metric_resource_series" }
+            }
+        }
+    }
+    metric_resource_series {
+        type: object
+        properties {
+            name {
+                type: string
+                description: "Resource name"
+            }
+            values {
+                type: array
+                description: "List of values corresponding to the dates in the metric statistics"
@@ -56,11 +99,6 @@ _definitions {
                 type: string
                 description: "Name of the metric component. Set only if 'split_by_variant' was set in the request"
             }
-            dates {
-                type: array
-                description: "List of timestamps (in seconds from epoch) in ascending order. The timestamps are separated by the requested interval. Timestamps where no worker activity was recorded are omitted."
-                items { type: integer }
-            }
             stats {
                 type: array
                 description: "Statistics data by type"
@@ -482,6 +520,20 @@ get_stats {
             }
         }
     }
+    "2.32": ${get_stats."2.4"} {
+        request.properties {
+            split_by_variant {
+                description: "Obsolete, please do not use"
+                type: boolean
+                default: false
+            }
+            split_by_resource {
+                type: boolean
+                default: false
+                description: "If set then for GPU-related keys return the per-GPU charts in addition to the aggregated one"
+            }
+        }
+    }
 }
 get_activity_report {
     "2.4" {
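Per the 2.32 definitions above, a split-by-resource response carries the aggregated series plus one series per GPU; the resource names come from the variant field, which for GPU metrics holds the GPU index. A sketch of a response fragment (every value is invented):

# Hypothetical workers.get_stats response fragment with split_by_resource=true.
# Shape follows the metric_stats / aggregation_stats / metric_resource_series
# definitions above; all numbers are invented.
response_fragment = {
    "workers": [
        {
            "worker": "test_worker_a",
            "metrics": [
                {
                    "metric": "gpu_usage",
                    "stats": [
                        {
                            "aggregation": "avg",
                            "dates": [1700000000, 1700000060],
                            "values": [55.0, 0.0],
                            "resource_series": [
                                {"name": "0", "values": [60.0, 0.0]},
                                {"name": "1", "values": [50.0, 0.0]},
                            ],
                        }
                    ],
                }
            ],
        }
    ]
}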
@@ -1,9 +1,3 @@
-import itertools
-from operator import attrgetter
-from typing import Optional, Sequence, Union
-
-from boltons.iterutils import bucketize
-
 from apiserver.apierrors.errors import bad_request
 from apiserver.apimodels.workers import (
     WorkerRequest,
@@ -23,6 +17,7 @@ from apiserver.apimodels.workers import (
     GetActivityReportResponse,
     ActivityReportSeries,
     GetCountRequest,
+    MetricResourceSeries,
 )
 from apiserver.bll.workers import WorkerBLL
 from apiserver.config_repo import config
@@ -163,71 +158,47 @@ def get_activity_report(
 @endpoint(
     "workers.get_stats",
     min_version="2.4",
     request_data_model=GetStatsRequest,
     response_data_model=GetStatsResponse,
     validate_schema=True,
 )
 def get_stats(call: APICall, company_id, request: GetStatsRequest):
     ret = worker_bll.stats.get_worker_stats(company_id, request)

-    def _get_variant_metric_stats(
-        metric: str,
-        agg_names: Sequence[str],
-        stats: Sequence[dict],
-        variant: Optional[str] = None,
-    ) -> MetricStats:
-        stat_by_name = extract_properties_to_lists(agg_names, stats)
-        return MetricStats(
-            metric=metric,
-            variant=variant,
-            dates=stat_by_name["date"],
-            stats=[
-                AggregationStats(aggregation=name, values=aggs)
-                for name, aggs in stat_by_name.items()
-                if name != "date"
-            ],
-        )
-
-    def _get_metric_stats(
-        metric: str, stats: Union[dict, Sequence[dict]], agg_types: Sequence[str]
-    ) -> Sequence[MetricStats]:
-        """
-        Return statistics for a certain metric or a list of statistics for
-        metric variants if break_by_variant was requested
-        """
-        agg_names = ["date"] + list(set(agg_types))
-        if not isinstance(stats, dict):
-            # no variants were requested
-            return [_get_variant_metric_stats(metric, agg_names, stats)]
-
-        return [
-            _get_variant_metric_stats(metric, agg_names, variant_stats, variant)
-            for variant, variant_stats in stats.items()
-        ]
-
-    def _get_worker_metrics(stats: dict) -> Sequence[MetricStats]:
-        """
-        Convert the worker statistics data from the internal format of lists of structs
-        to a more "compact" format for json transfer (arrays of dates and arrays of values)
-        """
-        # remove metrics that were requested but for some reason
-        # do not exist in the stats data
-        metrics = [metric for metric in request.items if metric.key in stats]
-
-        aggs_by_metric = bucketize(
-            metrics, key=attrgetter("key"), value_transform=attrgetter("aggregation")
-        )
-
-        return list(
-            itertools.chain.from_iterable(
-                _get_metric_stats(metric, metric_stats, aggs_by_metric[metric])
-                for metric, metric_stats in stats.items()
-            )
-        )
+    def _get_agg_stats(
+        aggregation: str,
+        stats: dict,
+    ) -> AggregationStats:
+        resource_series = []
+        if "resource_series" in stats:
+            for name, values in stats["resource_series"].items():
+                resource_series.append(
+                    MetricResourceSeries(
+                        name=name,
+                        values=values,
+                    )
+                )
+        return AggregationStats(
+            aggregation=aggregation,
+            dates=stats["dates"],
+            values=stats["values"],
+            resource_series=resource_series,
+        )

     return GetStatsResponse(
         workers=[
-            WorkerStatistics(worker=worker, metrics=_get_worker_metrics(stats))
-            for worker, stats in ret.items()
+            WorkerStatistics(
+                worker=worker,
+                metrics=[
+                    MetricStats(
+                        metric=metric,
+                        stats=[
+                            _get_agg_stats(aggregation, a_stats)
+                            for aggregation, a_stats in m_stats.items()
+                        ],
+                    )
+                    for metric, m_stats in w_stats.items()
+                ],
+            )
+            for worker, w_stats in ret.items()
         ]
     )
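The endpoint is now a thin reshaping of the nested dict returned by the BLL into response models. A dependency-free sketch of that reshaping, with plain dicts standing in for the jsonmodels classes and invented sample data:

# Pure-dict rendition of the get_stats response shaping above
# (MetricStats/AggregationStats/MetricResourceSeries replaced by dicts).
ret = {
    "worker-1": {
        "gpu_usage": {
            "avg": {
                "dates": [1, 2],
                "values": [55.0, 0.0],
                "resource_series": {"0": [60.0, 0.0], "1": [50.0, 0.0]},
            }
        }
    }
}

workers = [
    {
        "worker": worker,
        "metrics": [
            {
                "metric": metric,
                "stats": [
                    {
                        "aggregation": aggregation,
                        "dates": a_stats["dates"],
                        "values": a_stats["values"],
                        "resource_series": [
                            {"name": name, "values": values}
                            for name, values in a_stats.get("resource_series", {}).items()
                        ],
                    }
                    for aggregation, a_stats in m_stats.items()
                ],
            }
            for metric, m_stats in w_stats.items()
        ],
    }
    for worker, w_stats in ret.items()
]

assert workers[0]["metrics"][0]["stats"][0]["aggregation"] == "avg"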
|
@ -1,3 +1,4 @@
|
||||
import statistics
|
||||
import time
|
||||
from uuid import uuid4
|
||||
from typing import Sequence
|
||||
@@ -83,7 +84,7 @@ class TestWorkersService(TestService):
         self._check_exists(test_worker, False, tags=["test"])
         self._check_exists(test_worker, False, tags=["-application"])

-    def _simulate_workers(self, start: int) -> Sequence[str]:
+    def _simulate_workers(self, start: int, with_gpu: bool = False) -> dict:
         """
         Two workers writing the same metrics. One for 4 seconds. Another one for 2
         The first worker reports a task
@@ -93,20 +94,25 @@ class TestWorkersService(TestService):
         task_id = self._create_running_task(task_name="task-1")

         workers = [f"test_{uuid4().hex}", f"test_{uuid4().hex}"]
-        workers_stats = [
+        if with_gpu:
+            gpu_usage = [dict(gpu_usage=[60, 70]), dict(gpu_usage=[40])]
+        else:
+            gpu_usage = [{}, {}]
+
+        worker_stats = [
             (
-                dict(cpu_usage=[10, 20], memory_used=50),
-                dict(cpu_usage=[5], memory_used=30),
+                dict(cpu_usage=[10, 20], memory_used=50, **gpu_usage[0]),
+                dict(cpu_usage=[5], memory_used=30, **gpu_usage[1]),
             )
         ] * 4
-        workers_activity = [
+        worker_activity = [
             (workers[0], workers[1]),
             (workers[0], workers[1]),
             (workers[0],),
             (workers[0],),
         ]
         timestamp = start * 1000
-        for ws, stats in zip(workers_activity, workers_stats):
+        for ws, stats in zip(worker_activity, worker_stats):
             for w, s in zip(ws, stats):
                 data = dict(
                     worker=w,
@@ -118,7 +124,10 @@ class TestWorkersService(TestService):
                 self.api.workers.status_report(**data)
             timestamp += 60*1000

-        return workers
+        return {
+            w: s
+            for w, s in zip(workers, worker_stats[0])
+        }

     def _create_running_task(self, task_name):
         task_input = dict(name=task_name, type="testing")
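_simulate_workers now returns a mapping from worker id to the stats that worker reported, which is presumably why the tests below wrap it in list(...) wherever plain worker ids are needed. Roughly (ids shortened; stats mirror the fixtures above):

# Shape of the mapping now returned by _simulate_workers (illustrative ids).
simulated = {
    "test_aaaa": dict(cpu_usage=[10, 20], memory_used=50, gpu_usage=[60, 70]),
    "test_bbbb": dict(cpu_usage=[5], memory_used=30, gpu_usage=[40]),
}
assert list(simulated) == ["test_aaaa", "test_bbbb"]  # iteration yields worker ids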
@@ -131,7 +140,7 @@ class TestWorkersService(TestService):
     def test_get_keys(self):
         workers = self._simulate_workers(int(time.time()))
         time.sleep(5)  # give ES time to refresh
-        res = self.api.workers.get_metric_keys(worker_ids=workers)
+        res = self.api.workers.get_metric_keys(worker_ids=list(workers))
         assert {"cpu", "memory"} == set(c.name for c in res["categories"])
         assert all(
             c.metric_keys == ["cpu_usage"] for c in res["categories"] if c.name == "cpu"
@@ -147,7 +156,7 @@ class TestWorkersService(TestService):

     def test_get_stats(self):
         start = int(time.time())
-        workers = self._simulate_workers(start)
+        workers = self._simulate_workers(start, with_gpu=True)

         time.sleep(5)  # give ES time to refresh
         from_date = start
@@ -157,49 +166,72 @@ class TestWorkersService(TestService):
             items=[
                 dict(key="cpu_usage", aggregation="avg"),
                 dict(key="cpu_usage", aggregation="max"),
+                dict(key="gpu_usage", aggregation="avg"),
+                dict(key="gpu_usage", aggregation="max"),
                 dict(key="memory_used", aggregation="max"),
-                dict(key="memory_used", aggregation="min"),
             ],
             from_date=from_date,
             to_date=to_date,
-            # split_by_variant=True,
             interval=1,
-            worker_ids=workers,
+            worker_ids=list(workers),
         )
-        self.assertWorkersInStats(workers, res.workers)
+        self.assertWorkersInStats(list(workers), res.workers)
         for worker in res.workers:
             self.assertEqual(
                 set(metric.metric for metric in worker.metrics),
-                {"cpu_usage", "memory_used"},
+                {"cpu_usage", "gpu_usage", "memory_used"},
             )

         for worker in res.workers:
+            worker_id = worker.worker
             for metric, metric_stats in zip(
-                worker.metrics, ({"avg", "max"}, {"max", "min"})
+                worker.metrics, ({"avg", "max"}, {"avg", "max"}, {"max"})
             ):
+                metric_name = metric.metric
                 self.assertEqual(
                     set(stat.aggregation for stat in metric.stats), metric_stats
                 )
-                self.assertTrue(11 >= len(metric.dates) >= 10)
                 for stat in metric.stats:
+                    expected = workers[worker_id][metric_name]
+                    self.assertTrue(11 >= len(stat.dates) >= 10)
+                    self.assertFalse(stat.get("resource_series"))
+                    agg = stat.aggregation
+                    if isinstance(expected, list):
+                        if agg == "avg":
+                            val = statistics.mean(expected)
+                        elif agg == "min":
+                            val = min(expected)
+                        else:
+                            val = max(expected)
+                    else:
+                        val = expected
+                    self.assertEqual(set(stat["values"]), {val, 0})

-        # split by variants
+        # split by resources
         res = self.api.workers.get_stats(
-            items=[dict(key="cpu_usage", aggregation="avg")],
+            items=[dict(key="gpu_usage", aggregation="avg")],
             from_date=from_date,
             to_date=to_date,
-            split_by_variant=True,
+            split_by_resource=True,
             interval=1,
-            worker_ids=workers,
+            worker_ids=list(workers),
         )
-        self.assertWorkersInStats(workers, res.workers)
+        self.assertWorkersInStats(list(workers), res.workers)

         for worker in res.workers:
+            worker_id = worker.worker
             for metric in worker.metrics:
-                self.assertEqual(
-                    set(metric.variant for metric in worker.metrics),
-                    {"0", "1"} if worker.worker == workers[0] else {"0"},
-                )
-                self.assertTrue(11 >= len(metric.dates) >= 10)
+                metric_name = metric.metric
+                for stat in metric.stats:
+                    expected = workers[worker_id][metric_name]
+                    if metric_name.startswith("gpu") and len(expected) > 1:
+                        resource_series = stat.get("resource_series")
+                        self.assertEqual(len(resource_series), len(expected))
+                        for rs, value in zip(resource_series, expected):
+                            self.assertEqual(set(rs["values"]), {value, 0})
+                    else:
+                        self.assertEqual(stat.get("resource_series"), [])

         res = self.api.workers.get_stats(
             items=[dict(key="cpu_usage", aggregation="avg")],