Mirror of https://github.com/clearml/clearml-server
Add memory-used charts and CPU/GPU counts to the model endpoint instance details.
For the number-of-requests serving charts, always take the max value from the interval.
This commit is contained in: parent 17fcaba2cb, commit ed60a27d1a
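A note on the second change: the chart sampling for the requests metric switches from averaging to taking the max per histogram bucket, presumably because the reported requests counter grows within a reporting interval, so its last (highest) sample is the meaningful one. A minimal sketch with hypothetical sample values, not taken from the repo:

# Hypothetical status-report samples landing in a single chart bucket. Each report
# carries the number of requests served so far in the interval, so the last (highest)
# sample is the real total and averaging the samples would understate it.
samples = [120, 340, 560, 990]

avg_value = sum(samples) / len(samples)  # 502.5 -- what the old "avg" sampling produced
max_value = max(samples)                 # 990   -- what the chart now shows

print(avg_value, max_value)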
@@ -86,8 +86,10 @@ class MetricType(Enum):
     cpu_util = "cpu_util"
     gpu_util = "gpu_util"
     ram_total = "ram_total"
+    ram_used = "ram_used"
     ram_free = "ram_free"
     gpu_ram_total = "gpu_ram_total"
+    gpu_ram_used = "gpu_ram_used"
     gpu_ram_free = "gpu_ram_free"
     network_rx = "network_rx"
     network_tx = "network_tx"
@@ -13,11 +13,13 @@ from apiserver.apimodels.serving import (
     RegisterRequest,
     StatusReportRequest,
 )
+from apiserver.apimodels.workers import MachineStats
 from apiserver.apierrors import errors
 from apiserver.config_repo import config
 from apiserver.redis_manager import redman
 from .stats import ServingStats
 
 
 log = config.logger(__file__)
 
 
@@ -329,6 +331,21 @@ class ServingBLL:
             }
         )
 
+        def get_machine_stats_data(machine_stats: MachineStats) -> dict:
+            ret = {"cpu_count": 0, "gpu_count": 0}
+            if not machine_stats:
+                return ret
+
+            for value, field in (
+                (machine_stats.cpu_usage, "cpu_count"),
+                (machine_stats.gpu_usage, "gpu_count"),
+            ):
+                if value is None:
+                    continue
+                ret[field] = len(value) if isinstance(value, (list, tuple)) else 1
+
+            return ret
+
         first_entry = entries[0]
         return {
             "endpoint": first_entry.endpoint_name,
@@ -352,6 +369,7 @@ class ServingBLL:
                     "reference": [ref.to_struct() for ref in entry.reference]
                     if isinstance(entry.reference, list)
                     else entry.reference,
+                    **get_machine_stats_data(entry.machine_stats),
                 }
                 for entry in entries
             ],
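A quick sanity check of the new counts, using a stand-in for the MachineStats payload sent by the tests further down (cpu_usage reported as a per-core list, no GPU stats); FakeStats and machine_counts are hypothetical names that just mirror the helper added above:

# Stand-in for the tests' machine_stats payload; FakeStats is hypothetical.
class FakeStats:
    cpu_usage = [10, 20]  # one entry per CPU core -> cpu_count == 2
    gpu_usage = None      # no GPU stats reported   -> gpu_count stays 0


def machine_counts(stats) -> dict:
    # Same logic as get_machine_stats_data() in the hunk above.
    ret = {"cpu_count": 0, "gpu_count": 0}
    if not stats:
        return ret
    for value, field in ((stats.cpu_usage, "cpu_count"), (stats.gpu_usage, "gpu_count")):
        if value is None:
            continue
        ret[field] = len(value) if isinstance(value, (list, tuple)) else 1
    return ret


print(machine_counts(FakeStats()))  # {'cpu_count': 2, 'gpu_count': 0}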
@@ -94,7 +94,7 @@ class ServingStats:
             {
                 f"{category}_free": free,
                 f"{category}_used": used,
-                f"{category}_total": (free or 0) + (used or 0),
+                f"{category}_total": round((free or 0) + (used or 0), 3),
             }
         )
 
@@ -110,58 +110,90 @@ class ServingStats:
         return 1
 
     @staticmethod
-    def round_series(values: Sequence, koeff=1.0) -> list:
+    def round_series(values: Sequence, koeff) -> list:
         return [round(v * koeff, 2) if v else 0 for v in values]
 
+    _mb_to_gb = 1 / 1024
     agg_fields = {
         MetricType.requests: (
             "requests_num",
             "Number of Requests",
             _AggregationType.sum,
+            None,
         ),
         MetricType.requests_min: (
             "requests_min",
             "Requests per Minute",
             _AggregationType.sum,
+            None,
         ),
         MetricType.latency_ms: (
             "latency_ms",
             "Average Latency (ms)",
             _AggregationType.avg,
+            None,
         ),
-        MetricType.cpu_count: ("cpu_num", "CPU Count", _AggregationType.sum),
-        MetricType.gpu_count: ("gpu_num", "GPU Count", _AggregationType.sum),
+        MetricType.cpu_count: ("cpu_num", "CPU Count", _AggregationType.sum, None),
+        MetricType.gpu_count: ("gpu_num", "GPU Count", _AggregationType.sum, None),
         MetricType.cpu_util: (
            "cpu_usage",
            "Average CPU Load (%)",
            _AggregationType.avg,
+           None,
        ),
        MetricType.gpu_util: (
            "gpu_usage",
            "Average GPU Utilization (%)",
            _AggregationType.avg,
+           None,
+       ),
+       MetricType.ram_total: (
+           "memory_total",
+           "RAM Total (GB)",
+           _AggregationType.sum,
+           _mb_to_gb,
+       ),
+       MetricType.ram_used: (
+           "memory_used",
+           "RAM Used (GB)",
+           _AggregationType.sum,
+           _mb_to_gb,
+       ),
+       MetricType.ram_free: (
+           "memory_free",
+           "RAM Free (GB)",
+           _AggregationType.sum,
+           _mb_to_gb,
        ),
-       MetricType.ram_total: ("memory_total", "RAM Total (GB)", _AggregationType.sum),
-       MetricType.ram_free: ("memory_free", "RAM Free (GB)", _AggregationType.sum),
        MetricType.gpu_ram_total: (
            "gpu_memory_total",
            "GPU RAM Total (GB)",
            _AggregationType.sum,
+           _mb_to_gb,
+       ),
+       MetricType.gpu_ram_used: (
+           "gpu_memory_used",
+           "GPU RAM Used (GB)",
+           _AggregationType.sum,
+           _mb_to_gb,
        ),
        MetricType.gpu_ram_free: (
            "gpu_memory_free",
            "GPU RAM Free (GB)",
            _AggregationType.sum,
+           _mb_to_gb,
        ),
        MetricType.network_rx: (
            "network_rx",
            "Network Throughput RX (MBps)",
            _AggregationType.sum,
+           None,
        ),
        MetricType.network_tx: (
            "network_tx",
            "Network Throughput TX (MBps)",
            _AggregationType.sum,
+           None,
        ),
    }
 
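Each agg_fields entry is now a 4-tuple of (ES field, chart title, aggregation type, unit multiplier); the multiplier is applied when the series is rounded, which is how the MB values stored for the memory metrics come out as GB in the charts. A small sketch of that conversion, mirroring the round_series logic shown above with example MB values:

# Mirrors the round_series() helper above: scale every point, round to 2 digits.
def round_series(values, koeff):
    return [round(v * koeff, 2) if v else 0 for v in values]


_mb_to_gb = 1 / 1024  # same multiplier the memory entries in agg_fields now carry

# e.g. two chart buckets whose summed memory_used values arrived in MB
print(round_series([51200, 102400], _mb_to_gb))  # [50.0, 100.0]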
@@ -183,7 +215,7 @@ class ServingStats:
         if not agg_data:
             raise NotImplemented(f"Charts for {metric_type} not implemented")
 
-        agg_field, title, agg_type = agg_data
+        agg_field, title, agg_type, multiplier = agg_data
         if agg_type == _AggregationType.sum:
             instance_sum_type = "sum_bucket"
         else:
@@ -220,7 +252,7 @@ class ServingStats:
         instance_keys = {ib["key"] for ib in instance_buckets}
         must_conditions.append(QueryBuilder.terms("container_id", instance_keys))
         query = {"bool": {"must": must_conditions}}
+        sample_func = "avg" if metric_type != MetricType.requests else "max"
         aggs = {
             "instances": {
                 "terms": {
@@ -228,13 +260,13 @@ class ServingStats:
                     "size": max(len(instance_keys), 10),
                 },
                 "aggs": {
-                    "average": {"avg": {"field": agg_field}},
+                    "sample": {sample_func: {"field": agg_field}},
                 },
             },
             "total_instances": {
                 instance_sum_type: {
                     "gap_policy": "insert_zeros",
-                    "buckets_path": "instances>average",
+                    "buckets_path": "instances>sample",
                 }
             },
         }
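For illustration, roughly what the aggregation body resolves to for the requests chart once sample_func is "max"; a sketch only: the terms field name is assumed to be container_id (matching the filter above), and the surrounding date-histogram and query parts are omitted:

# Sketch of the per-date "aggs" body when metric_type == MetricType.requests.
aggs = {
    "instances": {
        # grouping field assumed to be "container_id"; size is max(len(instance_keys), 10)
        "terms": {"field": "container_id", "size": 10},
        "aggs": {"sample": {"max": {"field": "requests_num"}}},
    },
    "total_instances": {
        "sum_bucket": {
            "gap_policy": "insert_zeros",
            "buckets_path": "instances>sample",
        }
    },
}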
@@ -282,16 +314,21 @@ class ServingStats:
             found_keys = set()
             for instance in nested_get(point, ("instances", "buckets"), []):
                 instances[instance["key"]].append(
-                    nested_get(instance, ("average", "value"), 0)
+                    nested_get(instance, ("sample", "value"), 0)
                 )
                 found_keys.add(instance["key"])
             for missing_key in instance_keys - found_keys:
                 instances[missing_key].append(0)
 
+        koeff = multiplier if multiplier else 1.0
         hist_ret["total"]["dates"] = dates_
-        hist_ret["total"]["values"] = cls.round_series(total)
+        hist_ret["total"]["values"] = cls.round_series(total, koeff)
         hist_ret["instances"] = {
-            key: {"title": key, "dates": dates_, "values": cls.round_series(values)}
+            key: {
+                "title": key,
+                "dates": dates_,
+                "values": cls.round_series(values, koeff),
+            }
             for key, values in sorted(instances.items(), key=lambda p: p[0])
         }
 
@@ -13,45 +13,45 @@ machine_stats {
        }
        memory_used {
            description: "Used memory MBs"
-           type: integer
+           type: number
        }
        memory_free {
            description: "Free memory MBs"
-           type: integer
+           type: number
        }
        gpu_memory_free {
            description: "GPU free memory MBs"
            type: array
-           items { type: integer }
+           items { type: number }
        }
        gpu_memory_used {
            description: "GPU used memory MBs"
            type: array
-           items { type: integer }
+           items { type: number }
        }
        network_tx {
            description: "Mbytes per second"
-           type: integer
+           type: number
        }
        network_rx {
            description: "Mbytes per second"
-           type: integer
+           type: number
        }
        disk_free_home {
            description: "Free space in % of /home drive"
-           type: integer
+           type: number
        }
        disk_free_temp {
            description: "Free space in % of /tmp drive"
-           type: integer
+           type: number
        }
        disk_read {
            description: "Mbytes read per second"
-           type: integer
+           type: number
        }
        disk_write {
            description: "Mbytes write per second"
-           type: integer
+           type: number
        }
        cpu_temperature {
            description: "CPU temperature"
@@ -134,6 +134,14 @@ _definitions {
                format: "date-time"
                description: The latest time when the container instance sent update
            }
+           cpu_count {
+               type: integer
+               description: CPU Count
+           }
+           gpu_count {
+               type: integer
+               description: GPU Count
+           }
            reference: ${_definitions.reference}
 
        }
@@ -390,8 +398,10 @@ get_endpoint_metrics_history {
                    cpu_util
                    gpu_util
                    ram_total
+                   ram_used
                    ram_free
                    gpu_ram_total
+                   gpu_ram_used
                    gpu_ram_free
                    network_rx
                    network_tx
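With ram_used and gpu_ram_used added to the allowed metric types, the new memory-used chart can be requested the same way as the existing ones. A sketch in the style of the test below; the metric_type parameter name is assumed from this schema section, and other request fields are unchanged:

# Sketch only: request the new memory-used chart for an endpoint
# (mirrors the get_endpoint_metrics_history call in the test further down).
res = self.api.serving.get_endpoint_metrics_history(
    endpoint_url=url,
    metric_type="ram_used",  # assumed field name for the enum values listed above
)
# The returned series would be titled "RAM Used (GB)", already scaled from MB to GB.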
@@ -49,7 +49,7 @@ class TestServing(TestService):
            latency_ms=100 * mul,  # average latency
            machine_stats={  # the same structure here as used by worker status_reports
                "cpu_usage": [10, 20],
-               "memory_used": 50,
+               "memory_used": 50 * 1024,
            },
        )
 
@@ -68,14 +68,16 @@ class TestServing(TestService):
                    "requests",
                    "requests_min",
                    "latency_ms",
+                   "cpu_count",
+                   "gpu_count",
                    "reference",
                )
            ]
            for inst in details.instances
        },
        {
-           "container_1": [1000, 1000, 5, 100, reference],
-           "container_2": [2000, 2000, 10, 200, []],
+           "container_1": [1000, 1000, 5, 100, 2, 0, reference],
+           "container_2": [2000, 2000, 10, 200, 2, 0, []],
        },
    )
    # make sure that the first call did not invalidate anything
@@ -92,7 +94,7 @@ class TestServing(TestService):
            ("latency_ms", "Average Latency (ms)", 150),
            ("cpu_count", "CPU Count", 4),
            ("cpu_util", "Average CPU Load (%)", 15),
-           ("ram_total", "RAM Total (GB)", 100),
+           ("ram_used", "RAM Used (GB)", 100.0),
        ):
            res = self.api.serving.get_endpoint_metrics_history(
                endpoint_url=url,
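The updated expectation for the RAM Used chart follows from the test payload above, assuming both registered containers report the same memory_used value:

# Two instances each report memory_used = 50 * 1024 MB in their status reports;
# the RAM Used chart sums the instances and converts MB -> GB with the 1/1024 multiplier.
memory_used_mb = 50 * 1024            # per container, from the payload above
total_mb = 2 * memory_used_mb         # container_1 + container_2
total_gb = round(total_mb / 1024, 2)
print(total_gb)                       # 100.0 -- the value asserted for "RAM Used (GB)"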