Add memory-used charts and CPU/GPU counts to model endpoint instance details

For the number-of-requests serving charts, always take the max value from the interval
This commit is contained in:
clearml 2024-12-05 22:31:45 +02:00
parent 17fcaba2cb
commit ed60a27d1a
6 changed files with 96 additions and 27 deletions

View File

@@ -86,8 +86,10 @@ class MetricType(Enum):
cpu_util = "cpu_util" cpu_util = "cpu_util"
gpu_util = "gpu_util" gpu_util = "gpu_util"
ram_total = "ram_total" ram_total = "ram_total"
ram_used = "ram_used"
ram_free = "ram_free" ram_free = "ram_free"
gpu_ram_total = "gpu_ram_total" gpu_ram_total = "gpu_ram_total"
gpu_ram_used = "gpu_ram_used"
gpu_ram_free = "gpu_ram_free" gpu_ram_free = "gpu_ram_free"
network_rx = "network_rx" network_rx = "network_rx"
network_tx = "network_tx" network_tx = "network_tx"

View File

@@ -13,11 +13,13 @@ from apiserver.apimodels.serving import (
RegisterRequest, RegisterRequest,
StatusReportRequest, StatusReportRequest,
) )
from apiserver.apimodels.workers import MachineStats
from apiserver.apierrors import errors from apiserver.apierrors import errors
from apiserver.config_repo import config from apiserver.config_repo import config
from apiserver.redis_manager import redman from apiserver.redis_manager import redman
from .stats import ServingStats from .stats import ServingStats
log = config.logger(__file__) log = config.logger(__file__)
@@ -329,6 +331,21 @@ class ServingBLL:
} }
) )
def get_machine_stats_data(machine_stats: MachineStats) -> dict:
ret = {"cpu_count": 0, "gpu_count": 0}
if not machine_stats:
return ret
for value, field in (
(machine_stats.cpu_usage, "cpu_count"),
(machine_stats.gpu_usage, "gpu_count"),
):
if value is None:
continue
ret[field] = len(value) if isinstance(value, (list, tuple)) else 1
return ret
first_entry = entries[0] first_entry = entries[0]
return { return {
"endpoint": first_entry.endpoint_name, "endpoint": first_entry.endpoint_name,
@@ -352,6 +369,7 @@ class ServingBLL:
"reference": [ref.to_struct() for ref in entry.reference] "reference": [ref.to_struct() for ref in entry.reference]
if isinstance(entry.reference, list) if isinstance(entry.reference, list)
else entry.reference, else entry.reference,
**get_machine_stats_data(entry.machine_stats),
} }
for entry in entries for entry in entries
], ],

View File

@@ -94,7 +94,7 @@ class ServingStats:
{ {
f"{category}_free": free, f"{category}_free": free,
f"{category}_used": used, f"{category}_used": used,
f"{category}_total": (free or 0) + (used or 0), f"{category}_total": round((free or 0) + (used or 0), 3),
} }
) )
@@ -110,58 +110,90 @@ class ServingStats:
return 1 return 1
@staticmethod @staticmethod
def round_series(values: Sequence, koeff=1.0) -> list: def round_series(values: Sequence, koeff) -> list:
return [round(v * koeff, 2) if v else 0 for v in values] return [round(v * koeff, 2) if v else 0 for v in values]
_mb_to_gb = 1 / 1024
agg_fields = { agg_fields = {
MetricType.requests: ( MetricType.requests: (
"requests_num", "requests_num",
"Number of Requests", "Number of Requests",
_AggregationType.sum, _AggregationType.sum,
None,
), ),
MetricType.requests_min: ( MetricType.requests_min: (
"requests_min", "requests_min",
"Requests per Minute", "Requests per Minute",
_AggregationType.sum, _AggregationType.sum,
None,
), ),
MetricType.latency_ms: ( MetricType.latency_ms: (
"latency_ms", "latency_ms",
"Average Latency (ms)", "Average Latency (ms)",
_AggregationType.avg, _AggregationType.avg,
None,
), ),
MetricType.cpu_count: ("cpu_num", "CPU Count", _AggregationType.sum), MetricType.cpu_count: ("cpu_num", "CPU Count", _AggregationType.sum, None),
MetricType.gpu_count: ("gpu_num", "GPU Count", _AggregationType.sum), MetricType.gpu_count: ("gpu_num", "GPU Count", _AggregationType.sum, None),
MetricType.cpu_util: ( MetricType.cpu_util: (
"cpu_usage", "cpu_usage",
"Average CPU Load (%)", "Average CPU Load (%)",
_AggregationType.avg, _AggregationType.avg,
None,
), ),
MetricType.gpu_util: ( MetricType.gpu_util: (
"gpu_usage", "gpu_usage",
"Average GPU Utilization (%)", "Average GPU Utilization (%)",
_AggregationType.avg, _AggregationType.avg,
None,
),
MetricType.ram_total: (
"memory_total",
"RAM Total (GB)",
_AggregationType.sum,
_mb_to_gb,
),
MetricType.ram_used: (
"memory_used",
"RAM Used (GB)",
_AggregationType.sum,
_mb_to_gb,
),
MetricType.ram_free: (
"memory_free",
"RAM Free (GB)",
_AggregationType.sum,
_mb_to_gb,
), ),
MetricType.ram_total: ("memory_total", "RAM Total (GB)", _AggregationType.sum),
MetricType.ram_free: ("memory_free", "RAM Free (GB)", _AggregationType.sum),
MetricType.gpu_ram_total: ( MetricType.gpu_ram_total: (
"gpu_memory_total", "gpu_memory_total",
"GPU RAM Total (GB)", "GPU RAM Total (GB)",
_AggregationType.sum, _AggregationType.sum,
_mb_to_gb,
),
MetricType.gpu_ram_used: (
"gpu_memory_used",
"GPU RAM Used (GB)",
_AggregationType.sum,
_mb_to_gb,
), ),
MetricType.gpu_ram_free: ( MetricType.gpu_ram_free: (
"gpu_memory_free", "gpu_memory_free",
"GPU RAM Free (GB)", "GPU RAM Free (GB)",
_AggregationType.sum, _AggregationType.sum,
_mb_to_gb,
), ),
MetricType.network_rx: ( MetricType.network_rx: (
"network_rx", "network_rx",
"Network Throughput RX (MBps)", "Network Throughput RX (MBps)",
_AggregationType.sum, _AggregationType.sum,
None,
), ),
MetricType.network_tx: ( MetricType.network_tx: (
"network_tx", "network_tx",
"Network Throughput TX (MBps)", "Network Throughput TX (MBps)",
_AggregationType.sum, _AggregationType.sum,
None,
), ),
} }
@@ -183,7 +215,7 @@ class ServingStats:
if not agg_data: if not agg_data:
raise NotImplemented(f"Charts for {metric_type} not implemented") raise NotImplemented(f"Charts for {metric_type} not implemented")
agg_field, title, agg_type = agg_data agg_field, title, agg_type, multiplier = agg_data
if agg_type == _AggregationType.sum: if agg_type == _AggregationType.sum:
instance_sum_type = "sum_bucket" instance_sum_type = "sum_bucket"
else: else:
@@ -220,7 +252,7 @@ class ServingStats:
instance_keys = {ib["key"] for ib in instance_buckets} instance_keys = {ib["key"] for ib in instance_buckets}
must_conditions.append(QueryBuilder.terms("container_id", instance_keys)) must_conditions.append(QueryBuilder.terms("container_id", instance_keys))
query = {"bool": {"must": must_conditions}} query = {"bool": {"must": must_conditions}}
sample_func = "avg" if metric_type != MetricType.requests else "max"
aggs = { aggs = {
"instances": { "instances": {
"terms": { "terms": {
@@ -228,13 +260,13 @@ class ServingStats:
"size": max(len(instance_keys), 10), "size": max(len(instance_keys), 10),
}, },
"aggs": { "aggs": {
"average": {"avg": {"field": agg_field}}, "sample": {sample_func: {"field": agg_field}},
}, },
}, },
"total_instances": { "total_instances": {
instance_sum_type: { instance_sum_type: {
"gap_policy": "insert_zeros", "gap_policy": "insert_zeros",
"buckets_path": "instances>average", "buckets_path": "instances>sample",
} }
}, },
} }
@@ -282,16 +314,21 @@ class ServingStats:
found_keys = set() found_keys = set()
for instance in nested_get(point, ("instances", "buckets"), []): for instance in nested_get(point, ("instances", "buckets"), []):
instances[instance["key"]].append( instances[instance["key"]].append(
nested_get(instance, ("average", "value"), 0) nested_get(instance, ("sample", "value"), 0)
) )
found_keys.add(instance["key"]) found_keys.add(instance["key"])
for missing_key in instance_keys - found_keys: for missing_key in instance_keys - found_keys:
instances[missing_key].append(0) instances[missing_key].append(0)
koeff = multiplier if multiplier else 1.0
hist_ret["total"]["dates"] = dates_ hist_ret["total"]["dates"] = dates_
hist_ret["total"]["values"] = cls.round_series(total) hist_ret["total"]["values"] = cls.round_series(total, koeff)
hist_ret["instances"] = { hist_ret["instances"] = {
key: {"title": key, "dates": dates_, "values": cls.round_series(values)} key: {
"title": key,
"dates": dates_,
"values": cls.round_series(values, koeff),
}
for key, values in sorted(instances.items(), key=lambda p: p[0]) for key, values in sorted(instances.items(), key=lambda p: p[0])
} }

View File

@@ -13,45 +13,45 @@ machine_stats {
} }
memory_used { memory_used {
description: "Used memory MBs" description: "Used memory MBs"
type: integer type: number
} }
memory_free { memory_free {
description: "Free memory MBs" description: "Free memory MBs"
type: integer type: number
} }
gpu_memory_free { gpu_memory_free {
description: "GPU free memory MBs" description: "GPU free memory MBs"
type: array type: array
items { type: integer } items { type: number }
} }
gpu_memory_used { gpu_memory_used {
description: "GPU used memory MBs" description: "GPU used memory MBs"
type: array type: array
items { type: integer } items { type: number }
} }
network_tx { network_tx {
description: "Mbytes per second" description: "Mbytes per second"
type: integer type: number
} }
network_rx { network_rx {
description: "Mbytes per second" description: "Mbytes per second"
type: integer type: number
} }
disk_free_home { disk_free_home {
description: "Free space in % of /home drive" description: "Free space in % of /home drive"
type: integer type: number
} }
disk_free_temp { disk_free_temp {
description: "Free space in % of /tmp drive" description: "Free space in % of /tmp drive"
type: integer type: number
} }
disk_read { disk_read {
description: "Mbytes read per second" description: "Mbytes read per second"
type: integer type: number
} }
disk_write { disk_write {
description: "Mbytes write per second" description: "Mbytes write per second"
type: integer type: number
} }
cpu_temperature { cpu_temperature {
description: "CPU temperature" description: "CPU temperature"

View File

@@ -134,6 +134,14 @@ _definitions {
format: "date-time" format: "date-time"
description: The latest time when the container instance sent update description: The latest time when the container instance sent update
} }
cpu_count {
type: integer
description: CPU Count
}
gpu_count {
type: integer
description: GPU Count
}
reference: ${_definitions.reference} reference: ${_definitions.reference}
} }
@@ -390,8 +398,10 @@ get_endpoint_metrics_history {
cpu_util cpu_util
gpu_util gpu_util
ram_total ram_total
ram_used
ram_free ram_free
gpu_ram_total gpu_ram_total
gpu_ram_used
gpu_ram_free gpu_ram_free
network_rx network_rx
network_tx network_tx

View File

@@ -49,7 +49,7 @@ class TestServing(TestService):
latency_ms=100 * mul, # average latency latency_ms=100 * mul, # average latency
machine_stats={ # the same structure here as used by worker status_reports machine_stats={ # the same structure here as used by worker status_reports
"cpu_usage": [10, 20], "cpu_usage": [10, 20],
"memory_used": 50, "memory_used": 50 * 1024,
}, },
) )
@@ -68,14 +68,16 @@ class TestServing(TestService):
"requests", "requests",
"requests_min", "requests_min",
"latency_ms", "latency_ms",
"cpu_count",
"gpu_count",
"reference", "reference",
) )
] ]
for inst in details.instances for inst in details.instances
}, },
{ {
"container_1": [1000, 1000, 5, 100, reference], "container_1": [1000, 1000, 5, 100, 2, 0, reference],
"container_2": [2000, 2000, 10, 200, []], "container_2": [2000, 2000, 10, 200, 2, 0, []],
}, },
) )
# make sure that the first call did not invalidate anything # make sure that the first call did not invalidate anything
@@ -92,7 +94,7 @@ class TestServing(TestService):
("latency_ms", "Average Latency (ms)", 150), ("latency_ms", "Average Latency (ms)", 150),
("cpu_count", "CPU Count", 4), ("cpu_count", "CPU Count", 4),
("cpu_util", "Average CPU Load (%)", 15), ("cpu_util", "Average CPU Load (%)", 15),
("ram_total", "RAM Total (GB)", 100), ("ram_used", "RAM Used (GB)", 100.0),
): ):
res = self.api.serving.get_endpoint_metrics_history( res = self.api.serving.get_endpoint_metrics_history(
endpoint_url=url, endpoint_url=url,