Add memory-used charts and CPU/GPU counts to model endpoint instance details

For the number-of-requests serving charts, always take the max value from each sampling interval
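A minimal sketch of the rationale (not part of the commit): the per-container request counts are point-in-time gauge readings, so averaging them inside a histogram bucket flattens short-lived peaks, while taking the bucket max preserves the peak concurrency the chart should show.

    # Hypothetical gauge readings collected within one chart interval
    samples = [0, 0, 8, 1]
    avg = sum(samples) / len(samples)  # 2.25, suggests the endpoint is nearly idle
    peak = max(samples)                # 8, the actual peak number of requests served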
clearml 2024-12-05 22:31:45 +02:00
parent 17fcaba2cb
commit ed60a27d1a
6 changed files with 96 additions and 27 deletions


@@ -86,8 +86,10 @@ class MetricType(Enum):
     cpu_util = "cpu_util"
     gpu_util = "gpu_util"
     ram_total = "ram_total"
+    ram_used = "ram_used"
     ram_free = "ram_free"
     gpu_ram_total = "gpu_ram_total"
+    gpu_ram_used = "gpu_ram_used"
     gpu_ram_free = "gpu_ram_free"
     network_rx = "network_rx"
     network_tx = "network_tx"


@@ -13,11 +13,13 @@ from apiserver.apimodels.serving import (
     RegisterRequest,
     StatusReportRequest,
 )
+from apiserver.apimodels.workers import MachineStats
 from apiserver.apierrors import errors
 from apiserver.config_repo import config
 from apiserver.redis_manager import redman
 from .stats import ServingStats

 log = config.logger(__file__)
@@ -329,6 +331,21 @@ class ServingBLL:
             }
         )

+        def get_machine_stats_data(machine_stats: MachineStats) -> dict:
+            ret = {"cpu_count": 0, "gpu_count": 0}
+            if not machine_stats:
+                return ret
+            for value, field in (
+                (machine_stats.cpu_usage, "cpu_count"),
+                (machine_stats.gpu_usage, "gpu_count"),
+            ):
+                if value is None:
+                    continue
+                ret[field] = len(value) if isinstance(value, (list, tuple)) else 1
+            return ret
+
         first_entry = entries[0]
         return {
             "endpoint": first_entry.endpoint_name,
@@ -352,6 +369,7 @@
                     "reference": [ref.to_struct() for ref in entry.reference]
                     if isinstance(entry.reference, list)
                     else entry.reference,
+                    **get_machine_stats_data(entry.machine_stats),
                 }
                 for entry in entries
             ],
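For reference, a sketch of what the new helper yields for the inputs used in the tests further below (the SimpleNamespace stand-in for MachineStats is an assumption, for illustration only):

    from types import SimpleNamespace

    # Two CPU usage readings and no GPU data, as reported in the test file below
    stats = SimpleNamespace(cpu_usage=[10, 20], gpu_usage=None)
    # len() of the list gives the device count; missing values leave the count at 0
    # get_machine_stats_data(stats) -> {"cpu_count": 2, "gpu_count": 0}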


@@ -94,7 +94,7 @@ class ServingStats:
             {
                 f"{category}_free": free,
                 f"{category}_used": used,
-                f"{category}_total": (free or 0) + (used or 0),
+                f"{category}_total": round((free or 0) + (used or 0), 3),
             }
         )
@@ -110,58 +110,90 @@
         return 1

     @staticmethod
-    def round_series(values: Sequence, koeff=1.0) -> list:
+    def round_series(values: Sequence, koeff) -> list:
         return [round(v * koeff, 2) if v else 0 for v in values]

+    _mb_to_gb = 1 / 1024
+
     agg_fields = {
         MetricType.requests: (
             "requests_num",
             "Number of Requests",
             _AggregationType.sum,
+            None,
         ),
         MetricType.requests_min: (
             "requests_min",
             "Requests per Minute",
             _AggregationType.sum,
+            None,
         ),
         MetricType.latency_ms: (
             "latency_ms",
             "Average Latency (ms)",
             _AggregationType.avg,
+            None,
         ),
-        MetricType.cpu_count: ("cpu_num", "CPU Count", _AggregationType.sum),
-        MetricType.gpu_count: ("gpu_num", "GPU Count", _AggregationType.sum),
+        MetricType.cpu_count: ("cpu_num", "CPU Count", _AggregationType.sum, None),
+        MetricType.gpu_count: ("gpu_num", "GPU Count", _AggregationType.sum, None),
         MetricType.cpu_util: (
             "cpu_usage",
             "Average CPU Load (%)",
             _AggregationType.avg,
+            None,
         ),
         MetricType.gpu_util: (
             "gpu_usage",
             "Average GPU Utilization (%)",
             _AggregationType.avg,
+            None,
         ),
-        MetricType.ram_total: ("memory_total", "RAM Total (GB)", _AggregationType.sum),
-        MetricType.ram_free: ("memory_free", "RAM Free (GB)", _AggregationType.sum),
+        MetricType.ram_total: (
+            "memory_total",
+            "RAM Total (GB)",
+            _AggregationType.sum,
+            _mb_to_gb,
+        ),
+        MetricType.ram_used: (
+            "memory_used",
+            "RAM Used (GB)",
+            _AggregationType.sum,
+            _mb_to_gb,
+        ),
+        MetricType.ram_free: (
+            "memory_free",
+            "RAM Free (GB)",
+            _AggregationType.sum,
+            _mb_to_gb,
+        ),
         MetricType.gpu_ram_total: (
             "gpu_memory_total",
             "GPU RAM Total (GB)",
             _AggregationType.sum,
+            _mb_to_gb,
         ),
+        MetricType.gpu_ram_used: (
+            "gpu_memory_used",
+            "GPU RAM Used (GB)",
+            _AggregationType.sum,
+            _mb_to_gb,
+        ),
         MetricType.gpu_ram_free: (
             "gpu_memory_free",
             "GPU RAM Free (GB)",
             _AggregationType.sum,
+            _mb_to_gb,
         ),
         MetricType.network_rx: (
             "network_rx",
             "Network Throughput RX (MBps)",
             _AggregationType.sum,
+            None,
         ),
         MetricType.network_tx: (
             "network_tx",
             "Network Throughput TX (MBps)",
             _AggregationType.sum,
+            None,
         ),
     }
@@ -183,7 +215,7 @@
         if not agg_data:
             raise NotImplemented(f"Charts for {metric_type} not implemented")

-        agg_field, title, agg_type = agg_data
+        agg_field, title, agg_type, multiplier = agg_data
         if agg_type == _AggregationType.sum:
             instance_sum_type = "sum_bucket"
         else:
@@ -220,7 +252,7 @@
         instance_keys = {ib["key"] for ib in instance_buckets}
         must_conditions.append(QueryBuilder.terms("container_id", instance_keys))
         query = {"bool": {"must": must_conditions}}
+        sample_func = "avg" if metric_type != MetricType.requests else "max"
         aggs = {
             "instances": {
                 "terms": {
@@ -228,13 +260,13 @@
                     "size": max(len(instance_keys), 10),
                 },
                 "aggs": {
-                    "average": {"avg": {"field": agg_field}},
+                    "sample": {sample_func: {"field": agg_field}},
                 },
             },
             "total_instances": {
                 instance_sum_type: {
                     "gap_policy": "insert_zeros",
-                    "buckets_path": "instances>average",
+                    "buckets_path": "instances>sample",
                 }
             },
         }
@@ -282,16 +314,21 @@
             found_keys = set()
             for instance in nested_get(point, ("instances", "buckets"), []):
                 instances[instance["key"]].append(
-                    nested_get(instance, ("average", "value"), 0)
+                    nested_get(instance, ("sample", "value"), 0)
                 )
                 found_keys.add(instance["key"])
             for missing_key in instance_keys - found_keys:
                 instances[missing_key].append(0)

+        koeff = multiplier if multiplier else 1.0
         hist_ret["total"]["dates"] = dates_
-        hist_ret["total"]["values"] = cls.round_series(total)
+        hist_ret["total"]["values"] = cls.round_series(total, koeff)
         hist_ret["instances"] = {
-            key: {"title": key, "dates": dates_, "values": cls.round_series(values)}
+            key: {
+                "title": key,
+                "dates": dates_,
+                "values": cls.round_series(values, koeff),
+            }
             for key, values in sorted(instances.items(), key=lambda p: p[0])
         }
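The new fourth tuple element feeds round_series as the unit multiplier; a quick check of the MB-to-GB arithmetic (machine stats report MBs, the charts display GBs):

    def round_series(values, koeff):  # as defined in the diff above
        return [round(v * koeff, 2) if v else 0 for v in values]

    _mb_to_gb = 1 / 1024
    print(round_series([51200, 0, 102400], _mb_to_gb))  # -> [50.0, 0, 100.0]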


@@ -13,45 +13,45 @@ machine_stats {
     }
     memory_used {
         description: "Used memory MBs"
-        type: integer
+        type: number
     }
     memory_free {
         description: "Free memory MBs"
-        type: integer
+        type: number
     }
     gpu_memory_free {
         description: "GPU free memory MBs"
         type: array
-        items { type: integer }
+        items { type: number }
     }
     gpu_memory_used {
         description: "GPU used memory MBs"
         type: array
-        items { type: integer }
+        items { type: number }
     }
     network_tx {
         description: "Mbytes per second"
-        type: integer
+        type: number
     }
     network_rx {
         description: "Mbytes per second"
-        type: integer
+        type: number
     }
     disk_free_home {
         description: "Free space in % of /home drive"
-        type: integer
+        type: number
     }
     disk_free_temp {
         description: "Free space in % of /tmp drive"
-        type: integer
+        type: number
     }
     disk_read {
         description: "Mbytes read per second"
-        type: integer
+        type: number
     }
     disk_write {
         description: "Mbytes write per second"
-        type: integer
+        type: number
     }
     cpu_temperature {
         description: "CPU temperature"

@@ -134,6 +134,14 @@ _definitions {
         format: "date-time"
         description: The latest time when the container instance sent update
     }
+    cpu_count {
+        type: integer
+        description: CPU Count
+    }
+    gpu_count {
+        type: integer
+        description: GPU Count
+    }
     reference: ${_definitions.reference}
 }
@@ -390,8 +398,10 @@ get_endpoint_metrics_history {
                 cpu_util
                 gpu_util
                 ram_total
+                ram_used
                 ram_free
                 gpu_ram_total
+                gpu_ram_used
                 gpu_ram_free
                 network_rx
                 network_tx
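With the schema extended, the new values can be requested like any other metric type; a hedged example using the test client from the file below (request fields other than endpoint_url and metric_type are omitted here):

    res = self.api.serving.get_endpoint_metrics_history(
        endpoint_url=url,
        metric_type="ram_used",  # "gpu_ram_used" is now accepted as well
    )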


@@ -49,7 +49,7 @@ class TestServing(TestService):
             latency_ms=100 * mul,  # average latency
             machine_stats={  # the same structure here as used by worker status_reports
                 "cpu_usage": [10, 20],
-                "memory_used": 50,
+                "memory_used": 50 * 1024,
             },
         )
@@ -68,14 +68,16 @@
                         "requests",
                         "requests_min",
                         "latency_ms",
+                        "cpu_count",
+                        "gpu_count",
                         "reference",
                     )
                 ]
                 for inst in details.instances
             },
             {
-                "container_1": [1000, 1000, 5, 100, reference],
-                "container_2": [2000, 2000, 10, 200, []],
+                "container_1": [1000, 1000, 5, 100, 2, 0, reference],
+                "container_2": [2000, 2000, 10, 200, 2, 0, []],
             },
         )
         # make sure that the first call did not invalidate anything
@@ -92,7 +94,7 @@
             ("latency_ms", "Average Latency (ms)", 150),
             ("cpu_count", "CPU Count", 4),
             ("cpu_util", "Average CPU Load (%)", 15),
-            ("ram_total", "RAM Total (GB)", 100),
+            ("ram_used", "RAM Used (GB)", 100.0),
         ):
             res = self.api.serving.get_endpoint_metrics_history(
                 endpoint_url=url,