mirror of
https://github.com/clearml/clearml-server
synced 2025-01-31 10:56:48 +00:00
538 lines
17 KiB
Plaintext
538 lines
17 KiB
Plaintext
# Service-level summary of the workers API.
_description = "Provides an API for worker machines, allowing workers to report status and get tasks for execution"
|
|
_definitions {
|
|
# A named metrics category together with the metric keys that belong to it.
metrics_category {
    type: object
    properties {
        name {
            description: "Name of the metrics category."
            type: string
        }
        metric_keys {
            description: "The names of the metrics in the category."
            type: array
            items {
                type: string
            }
        }
    }
}
|
|
# Enumeration of the aggregation functions that can be requested for a metric.
aggregation_type {
    description: "Metric aggregation type"
    type: string
    enum: [
        avg
        min
        max
    ]
}
|
|
# One requested statistic: a metric key paired with an aggregation type.
# NOTE(review): the aggregation type is carried by a property named `category`;
# confirm against the service implementation that this name is intentional.
stat_item {
    type: object
    properties {
        key {
            description: "Name of a metric"
            type: string
        }
        category {
            "$ref": "#/definitions/aggregation_type"
        }
    }
}
|
|
# Values of a single aggregation (see aggregation_type) for one metric.
aggregation_stats {
    type: object
    properties {
        aggregation {
            "$ref": "#/definitions/aggregation_type"
        }
        values {
            description: "List of values corresponding to the dates in metric statistics"
            type: array
            items {
                type: number
            }
        }
    }
}
|
|
# Statistics of one metric (optionally one variant of it): the sampled dates
# and, per aggregation type, the values that correspond to those dates.
metric_stats {
    type: object
    properties {
        metric {
            type: string
            # Single quotes are used around the example names: unescaped double
            # quotes inside a double-quoted HOCON string terminate the string.
            description: "Name of the metric ('cpu_usage', 'memory_used' etc.)"
        }
        variant {
            type: string
            description: "Name of the metric component. Set only if 'split_by_variant' was set in the request"
        }
        dates {
            type: array
            description: "List of timestamps (in seconds from epoch) in the ascending order. The timestamps are separated by the requested interval. Timestamps where no worker activity was recorded are omitted."
            items { type: integer }
        }
        stats {
            type: array
            description: "Statistics data by type"
            items { "$ref": "#/definitions/aggregation_stats" }
        }
    }
}
|
|
# Per-worker container: the worker ID plus the statistics of each of its metrics.
worker_stats {
    type: object
    properties {
        worker {
            description: "ID of the worker"
            type: string
        }
        metrics {
            description: "List of the metrics statistics for the worker"
            type: array
            items {
                "$ref": "#/definitions/metric_stats"
            }
        }
    }
}
|
|
# A time series of worker counts: parallel `dates` and `counts` lists.
activity_series {
    type: object
    properties {
        dates {
            type: array
            description: "List of timestamps (in seconds from epoch) in the ascending order. The timestamps are separated by the requested interval."
            items { type: integer }
        }
        counts {
            type: array
            # NOTE(review): the description mentions None values while the item
            # type is integer only; confirm whether nulls should be allowed here.
            description: "List of worker counts corresponding to the timestamps in the dates list. None values are returned for the dates with no workers."
            items { type: integer }
        }
    }
}
|
|
# Full description of a registered worker: identity, timing information,
# the task/project/queue it is currently serving, the queues it listens on
# and its tags.
worker {
    type: object
    properties {
        id {
            description: "Worker ID"
            type: string
        }
        user {
            description: "Associated user (whose credentials are used by the worker daemon)"
            "$ref": "#/definitions/id_name_entry"
        }
        company {
            description: "Associated company"
            "$ref": "#/definitions/id_name_entry"
        }
        ip {
            description: "IP of the worker"
            type: string
        }
        register_time {
            description: "Registration time"
            type: string
            format: "date-time"
        }
        last_activity_time {
            description: "Last activity time (even if an error occurred)"
            type: string
            format: "date-time"
        }
        last_report_time {
            description: "Last successful report time"
            type: string
            format: "date-time"
        }
        task {
            description: "Task currently being run by the worker"
            "$ref": "#/definitions/current_task_entry"
        }
        project {
            description: "Project in which currently executing task resides"
            "$ref": "#/definitions/id_name_entry"
        }
        queue {
            description: "Queue from which running task was taken"
            "$ref": "#/definitions/queue_entry"
        }
        queues {
            description: "List of queues on which the worker is listening"
            type: array
            items { "$ref": "#/definitions/queue_entry" }
        }
        tags {
            description: "User tags for the worker"
            type: array
            items { type: string }
        }
        system_tags {
            description: "System tags for the worker"
            type: array
            items { type: string }
        }
        key {
            description: "Worker entry key"
            type: string
        }
    }
}
|
|
|
|
# Generic ID/name pair. It is referenced for users, companies, projects and
# next-in-queue tasks, and is the base of current_task_entry and queue_entry,
# so the descriptions are kept entity-neutral.
id_name_entry {
    type: object
    properties {
        id {
            description: "ID"
            type: string
        }
        name {
            description: "Name"
            type: string
        }
    }
}
|
|
|
|
# id_name_entry extended with properties of the task being run.
# The substitution is merged with the object that follows it.
current_task_entry = ${_definitions.id_name_entry} {
    properties {
        running_time {
            type: integer
            description: "Task running time"
        }
        last_iteration {
            type: integer
            description: "Last task iteration"
        }
    }
}
|
|
|
|
# id_name_entry extended with queue information: the next task and the
# number of task entries in the queue.
queue_entry = ${_definitions.id_name_entry} {
    properties {
        next_task {
            "$ref": "#/definitions/id_name_entry"
            description: "Next task in the queue"
        }
        num_tasks {
            type: integer
            description: "Number of task entries in the queue"
        }
    }
}
|
|
|
|
# Machine statistics payload reported by a worker (referenced by the
# status_report request). Array-valued properties hold one entry per
# core / GPU card as stated in their descriptions.
machine_stats {
    type: object
    properties {
        # --- processor / GPU load ---
        cpu_usage {
            type: array
            items { type: number }
            description: "Average CPU usage per core"
        }
        gpu_usage {
            type: array
            items { type: number }
            description: "Average GPU usage per GPU card"
        }

        # --- memory ---
        memory_used {
            type: integer
            description: "Used memory MBs"
        }
        memory_free {
            type: integer
            description: "Free memory MBs"
        }
        gpu_memory_free {
            type: array
            items { type: integer }
            description: "GPU free memory MBs"
        }
        gpu_memory_used {
            type: array
            items { type: integer }
            description: "GPU used memory MBs"
        }

        # --- network ---
        network_tx {
            type: integer
            description: "Mbytes per second"
        }
        network_rx {
            type: integer
            description: "Mbytes per second"
        }

        # --- disk ---
        disk_free_home {
            type: integer
            description: "Mbytes free space of /home drive"
        }
        disk_free_temp {
            type: integer
            description: "Mbytes free space of /tmp drive"
        }
        disk_read {
            type: integer
            description: "Mbytes read per second"
        }
        disk_write {
            type: integer
            description: "Mbytes write per second"
        }

        # --- temperatures ---
        cpu_temperature {
            type: array
            items { type: number }
            description: "CPU temperature"
        }
        gpu_temperature {
            type: array
            items { type: number }
            description: "GPU temperature"
        }
    }
}
|
|
}
|
|
# workers.get_all: list the registered workers.
# Each later version extends the previous one through substitution + merge:
#   2.4  - base call with the `last_seen` filter
#   2.20 - adds the `tags` request filter
#   2.22 - adds the `system_tags` request filter
get_all {
    "2.4" {
        description: "Returns information on all registered workers."
        request {
            type: object
            properties {
                last_seen {
                    description: """Filter out workers not active for more than last_seen seconds.
                            A value of 0 or 'none' will disable the filter."""
                    type: integer
                    default: 3600
                }
            }
        }
        response {
            type: object
            properties {
                workers {
                    type: array
                    items { "$ref": "#/definitions/worker" }
                }
            }
        }
    }
    "2.20": ${get_all."2.4"} {
        request.properties.tags {
            # Quoted like every other description in this file.
            description: "The list of allowed worker tags. Prepend tag value with '-' in order to exclude"
            type: array
            items { type: string }
        }
    }
    "2.22": ${get_all."2.20"} {
        request.properties.system_tags {
            description: "The list of allowed worker system tags. Prepend tag value with '-' in order to exclude"
            type: array
            items { type: string }
        }
    }
}
|
|
# workers.register: register a worker. Only `worker` is required.
#   2.4  - base call
#   2.22 - adds the `system_tags` request property
register {
    "2.4" {
        description: "Register a worker in the system. Called by the Worker Daemon."
        request {
            type: object
            required: [
                worker
            ]
            properties {
                worker {
                    type: string
                    description: "Worker id. Must be unique in company."
                }
                timeout {
                    type: integer
                    default: 600
                    description: "Registration timeout in seconds. If timeout seconds have passed since the worker's last call to register or status_report, the worker is automatically removed from the list of registered workers."
                }
                queues {
                    type: array
                    items { type: string }
                    description: "List of queue IDs on which the worker is listening."
                }
                tags {
                    type: array
                    items { type: string }
                    description: "User tags for the worker"
                }
            }
        }
        # The response carries no properties.
        response {
            type: object
            properties {}
        }
    }
    "2.22": ${register."2.4"} {
        request.properties.system_tags {
            type: array
            items { type: string }
            description: "System tags for the worker"
        }
    }
}
|
|
# workers.unregister: remove a worker registration. `worker` is required.
unregister {
    "2.4" {
        description: "Unregister a worker in the system. Called by the Worker Daemon."
        request {
            type: object
            required: [
                worker
            ]
            properties {
                worker {
                    type: string
                    description: "Worker id. Must be unique in company."
                }
            }
        }
        # The response carries no properties.
        response {
            type: object
            properties {}
        }
    }
}
|
|
# workers.status_report: periodic status report sent by the worker daemon.
# `worker` and `timestamp` are required.
#   2.4  - base call
#   2.22 - adds the `system_tags` request property
status_report {
    "2.4" {
        description: "Called periodically by the worker daemon to report machine status"
        request {
            type: object
            required: [ worker, timestamp ]
            properties {
                worker {
                    type: string
                    description: "Worker id."
                }
                task {
                    type: string
                    description: "ID of a task currently being run by the worker. If no task is sent, the worker's task field will be cleared."
                }
                queue {
                    type: string
                    description: "ID of the queue from which task was received. If no queue is sent, the worker's queue field will be cleared."
                }
                queues {
                    type: array
                    items { type: string }
                    description: "List of queue IDs on which the worker is listening. If null, the worker's queues list will not be updated."
                }
                timestamp {
                    type: integer
                    description: "UNIX time in seconds since epoch."
                }
                machine_stats {
                    "$ref": "#/definitions/machine_stats"
                    description: "The machine statistics."
                }
                tags {
                    type: array
                    items { type: string }
                    description: "New user tags for the worker"
                }
            }
        }
        # The response carries no properties.
        response {
            type: object
            properties {}
        }
    }
    "2.22": ${status_report."2.4"} {
        request.properties.system_tags {
            type: array
            items { type: string }
            description: "New system tags for the worker"
        }
    }
}
|
|
# workers.get_metric_keys: list the metric keys, grouped by category,
# for the requested workers.
get_metric_keys {
    "2.4" {
        description: "Returns worker statistics metric keys grouped by categories."
        request {
            type: object
            properties {
                worker_ids {
                    type: array
                    items { type: string }
                    description: "List of worker ids to collect metrics for. If not provided or empty then all the company workers metrics are analyzed."
                }
            }
        }
        response {
            type: object
            properties {
                categories {
                    description: "List of unique metric categories found in the statistics of the requested workers."
                    type: array
                    items {
                        "$ref": "#/definitions/metrics_category"
                    }
                }
            }
        }
    }
}
|
|
# workers.get_stats: worker statistics over a time range, aggregated by
# fixed-size intervals. The time range, interval and requested items are
# all required; worker_ids and split_by_variant are optional.
get_stats {
    "2.4" {
        description: "Returns statistics for the selected workers and time range aggregated by date intervals."
        request {
            type: object
            required: [
                from_date
                to_date
                interval
                items
            ]
            properties {
                worker_ids {
                    type: array
                    items { type: string }
                    description: "List of worker ids to collect metrics for. If not provided or empty then all the company workers metrics are analyzed."
                }
                from_date {
                    type: number
                    description: "Starting time (in seconds from epoch) for collecting statistics"
                }
                to_date {
                    type: number
                    description: "Ending time (in seconds from epoch) for collecting statistics"
                }
                interval {
                    type: integer
                    description: "Time interval in seconds for a single statistics point. The minimal value is 1"
                }
                # Request property named `items`; the nested `items` key below
                # is the array item schema.
                items {
                    type: array
                    items { "$ref": "#/definitions/stat_item" }
                    description: "List of metric keys and requested statistics"
                }
                split_by_variant {
                    type: boolean
                    default: false
                    description: "If true then break statistics by hardware sub types"
                }
            }
        }
        response {
            type: object
            properties {
                workers {
                    description: "List of the requested workers with their statistics"
                    type: array
                    items {
                        "$ref": "#/definitions/worker_stats"
                    }
                }
            }
        }
    }
}
|
|
# workers.get_activity_report: counts of workers over a time range.
# The response holds two series: `total` (all reporting workers) and
# `active` (workers that worked on a task).
get_activity_report {
    "2.4" {
        description: "Returns count of active company workers in the selected time range."
        request {
            type: object
            required: [
                from_date
                to_date
                interval
            ]
            properties {
                from_date {
                    type: number
                    description: "Starting time (in seconds from epoch) for collecting statistics"
                }
                to_date {
                    type: number
                    description: "Ending time (in seconds from epoch) for collecting statistics"
                }
                interval {
                    type: integer
                    description: "Time interval in seconds for a single statistics point. The minimal value is 1"
                }
            }
        }
        response {
            type: object
            properties {
                total {
                    "$ref": "#/definitions/activity_series"
                    description: "Activity series that include all the workers that sent reports in the given time interval."
                }
                active {
                    "$ref": "#/definitions/activity_series"
                    description: "Activity series that include only workers that worked on a task in the given time interval."
                }
            }
        }
    }
}
|