mirror of
https://github.com/clearml/clearml-server
synced 2025-02-08 05:53:12 +00:00
![allegroai](/assets/img/avatar_default.png)
Add initial support for project ordering. Add support for sortable task duration (used by the UI in the experiment's table). Add support for project name in worker's current task info. Add support for results and artifacts in pre-populated examples. Add demo server features.
488 lines
17 KiB
Plaintext
488 lines
17 KiB
Plaintext
{
|
|
# Service-level description string for this schema file.
_description: "Provides an API for worker machines, allowing workers to report status and get tasks for execution"
|
|
_definitions {
|
|
# A named category together with the names of the metrics that belong to it.
# Referenced by the get_metric_keys response.
metrics_category {
    type: object
    properties {
        # Category name
        name {
            type: string
            description: "Name of the metrics category."
        }
        # Metric names inside the category
        metric_keys {
            type: array
            items {
                type: string
            }
            description: "The names of the metrics in the category."
        }
    }
}
|
|
# Enumeration of the supported aggregation functions: avg, min, max.
aggregation_type {
    type: string
    enum: [avg, min, max]
    description: "Metric aggregation type"
}
|
|
# One requested statistic: a metric key plus an aggregation_type value.
# Used as the element type of the get_stats request "items" list.
stat_item {
    type: object
    properties {
        # Metric name
        key {
            type: string
            description: "Name of a metric"
        }
        # NOTE(review): this property is named "category" but its schema is the
        # aggregation_type enum (avg/min/max) - confirm the intended meaning
        # against the request handler before renaming anything.
        category {
            "$ref": "#/definitions/aggregation_type"
        }
    }
}
|
|
# A series of numeric values for a single aggregation type.
# Used as the element type of metric_stats.stats.
aggregation_stats {
    type: object
    properties {
        # Which aggregation the values below were computed with
        aggregation {
            "$ref": "#/definitions/aggregation_type"
        }
        # Numeric series
        values {
            type: array
            description: "List of values corresponding to the dates in metric statistics"
            items {
                type: number
            }
        }
    }
}
|
|
# Statistics of a single metric (optionally a single variant of it):
# a list of timestamps plus one aggregation_stats series per aggregation type.
metric_stats {
    type: object
    properties {
        metric {
            type: string
            # Inner names use single quotes: unescaped double quotes inside a
            # double-quoted HOCON string terminate the string early, so the
            # value is split into concatenated fragments and the quotes are lost.
            description: "Name of the metric ('cpu_usage', 'memory_used' etc.)"
        }
        variant {
            type: string
            description: "Name of the metric component. Set only if 'split_by_variant' was set in the request"
        }
        dates {
            type: array
            description: "List of timestamps (in seconds from epoch) in the ascending order. The timestamps are separated by the requested interval. Timestamps where no workers activity was recorded are omitted."
            items {
                type: integer
            }
        }
        stats {
            type: array
            description: "Statistics data by type"
            items {
                "$ref": "#/definitions/aggregation_stats"
            }
        }
    }
}
|
|
# Per-worker statistics: the worker ID and a list of metric_stats entries.
# Used as the element type of the get_stats response "workers" list.
worker_stats {
    type: object
    properties {
        # Worker identifier
        worker {
            type: string
            description: "ID of the worker"
        }
        # One metric_stats entry per metric
        metrics {
            type: array
            description: "List of the metrics statistics for the worker"
            items {
                "$ref": "#/definitions/metric_stats"
            }
        }
    }
}
|
|
# Two parallel lists: timestamps and the worker count at each timestamp.
# Used by both fields of the get_activity_report response.
activity_series {
    type: object
    properties {
        dates {
            type: array
            description: "List of timestamps (in seconds from epoch) in the ascending order. The timestamps are separated by the requested interval."
            items {
                type: integer
            }
        }
        counts {
            type: array
            # NOTE(review): the description mentions None values while the item
            # type is integer - confirm whether nulls pass response validation.
            description: "List of worker counts corresponding to the timestamps in the dates list. None values are returned for the dates with no workers."
            items {
                type: integer
            }
        }
    }
}
|
|
# Full description of a registered worker, as returned by get_all.
# Combines identity (id, user, company, ip), timestamps, the current
# task/project/queue, and the list of queues the worker listens on.
worker {
    type: object
    properties {
        id {
            description: "Worker ID"
            type: string
        }
        user {
            description: "Associated user (whose credentials are used by the worker daemon)"
            "$ref": "#/definitions/id_name_entry"
        }
        company {
            description: "Associated company"
            "$ref": "#/definitions/id_name_entry"
        }
        ip {
            description: "IP of the worker"
            type: string
        }
        register_time {
            description: "Registration time"
            type: string
            format: "date-time"
        }
        last_activity_time {
            description: "Last activity time (even if an error occurred)"
            type: string
            format: "date-time"
        }
        last_report_time {
            description: "Last successful report time"
            type: string
            format: "date-time"
        }
        task {
            description: "Task currently being run by the worker"
            "$ref": "#/definitions/current_task_entry"
        }
        project {
            description: "Project in which currently executing task resides"
            "$ref": "#/definitions/id_name_entry"
        }
        queue {
            description: "Queue from which running task was taken"
            "$ref": "#/definitions/queue_entry"
        }
        queues {
            description: "List of queues on which the worker is listening"
            type: array
            items {
                "$ref": "#/definitions/queue_entry"
            }
        }
    }
}
|
|
|
|
# Generic (id, name) pair. Referenced via "$ref" by several definitions and
# used as the substitution base of current_task_entry and queue_entry.
id_name_entry {
    type: object
    properties {
        # Identifier
        id {
            description: "ID"
            type: string
        }
        # Display name
        name {
            description: "Name"
            type: string
        }
    }
}
|
|
|
|
# id_name_entry extended with task progress fields. The substitution followed
# by an object literal merges the two objects, so the properties below are
# added to those of id_name_entry.
current_task_entry = ${_definitions.id_name_entry} {
    properties {
        # Running time of the task
        running_time {
            description: "Task running time"
            type: integer
        }
        # Most recent iteration of the task
        last_iteration {
            description: "Last task iteration"
            type: integer
        }
    }
}
|
|
|
|
# id_name_entry extended with queue content information. The substitution
# followed by an object literal merges the two objects, so the properties
# below are added to those of id_name_entry.
queue_entry = ${_definitions.id_name_entry} {
    properties {
        # Task at the head of the queue, as an (id, name) pair
        next_task {
            description: "Next task in the queue"
            "$ref": "#/definitions/id_name_entry"
        }
        # Queue length
        num_tasks {
            description: "Number of task entries in the queue"
            type: integer
        }
    }
}
|
|
|
|
# Machine statistics payload sent by a worker in status_report.
# Array-typed fields hold one entry per core / GPU card where the
# description says so; the remaining fields are single integers.
machine_stats {
    type: object
    properties {
        # --- CPU / GPU utilisation ---
        cpu_usage {
            description: "Average CPU usage per core"
            type: array
            items {
                type: number
            }
        }
        gpu_usage {
            description: "Average GPU usage per GPU card"
            type: array
            items {
                type: number
            }
        }

        # --- Memory ---
        memory_used {
            description: "Used memory MBs"
            type: integer
        }
        memory_free {
            description: "Free memory MBs"
            type: integer
        }
        gpu_memory_free {
            description: "GPU free memory MBs"
            type: array
            items {
                type: integer
            }
        }
        gpu_memory_used {
            description: "GPU used memory MBs"
            type: array
            items {
                type: integer
            }
        }

        # --- Network ---
        network_tx {
            description: "Mbytes per second"
            type: integer
        }
        network_rx {
            description: "Mbytes per second"
            type: integer
        }

        # --- Disk ---
        disk_free_home {
            description: "Mbytes free space of /home drive"
            type: integer
        }
        disk_free_temp {
            description: "Mbytes free space of /tmp drive"
            type: integer
        }
        disk_read {
            description: "Mbytes read per second"
            type: integer
        }
        disk_write {
            description: "Mbytes write per second"
            type: integer
        }

        # --- Temperature ---
        cpu_temperature {
            description: "CPU temperature"
            type: array
            items {
                type: number
            }
        }
        gpu_temperature {
            description: "GPU temperature"
            type: array
            items {
                type: number
            }
        }
    }
}
|
|
}
|
|
# workers.get_all - list registered workers, optionally filtered by how
# recently they were active.
get_all {
    "2.4" {
        description: "Returns information on all registered workers."
        request {
            type: object
            properties {
                # Activity window in seconds; defaults to 3600.
                last_seen {
                    description: """Filter out workers not active for more than last_seen seconds.
                        A value of 0 or 'none' will disable the filter."""
                    type: integer
                    default: 3600
                }
            }
        }
        response {
            type: object
            properties {
                # One "worker" definition object per registered worker
                workers {
                    type: array
                    items {
                        "$ref": "#/definitions/worker"
                    }
                }
            }
        }
    }
}
|
|
# workers.register - registers a worker; only "worker" is required.
# The response carries no properties.
register {
    "2.4" {
        description: "Register a worker in the system. Called by the Worker Daemon."
        request {
            required: [worker]
            type: object
            properties {
                # Worker identifier (required)
                worker {
                    description: "Worker id. Must be unique in company."
                    type: string
                }
                # Registration expiry in seconds; defaults to 600
                timeout {
                    description: "Registration timeout in seconds. If timeout seconds have passed since the worker's last call to register or status_report, the worker is automatically removed from the list of registered workers."
                    type: integer
                    default: 600
                }
                # Queue IDs the worker listens on
                queues {
                    description: "List of queue IDs on which the worker is listening."
                    type: array
                    items {
                        type: string
                    }
                }
            }
        }
        response {
            type: object
            properties {}
        }
    }
}
|
|
# workers.unregister - removes a worker registration; "worker" is required.
# The response carries no properties.
unregister {
    "2.4" {
        description: "Unregister a worker in the system. Called by the Worker Daemon."
        request {
            required: [worker]
            type: object
            properties {
                # Worker identifier (required)
                worker {
                    description: "Worker id. Must be unique in company."
                    type: string
                }
            }
        }
        response {
            type: object
            properties {}
        }
    }
}
|
|
# workers.status_report - periodic status report from a worker.
# "worker" and "timestamp" are required; the response carries no properties.
status_report {
    "2.4" {
        description: "Called periodically by the worker daemon to report machine status"
        request {
            required: [worker, timestamp]
            type: object
            properties {
                # Reporting worker (required)
                worker {
                    description: "Worker id."
                    type: string
                }
                # Current task ID
                task {
                    description: "ID of a task currently being run by the worker. If no task is sent, the worker's task field will be cleared."
                    type: string
                }
                # Queue the current task came from
                queue {
                    description: "ID of the queue from which task was received. If no queue is sent, the worker's queue field will be cleared."
                    type: string
                }
                # Queue IDs the worker listens on
                queues {
                    description: "List of queue IDs on which the worker is listening. If null, the worker's queues list will not be updated."
                    type: array
                    items {
                        type: string
                    }
                }
                # Report time (required)
                timestamp {
                    description: "UNIX time in seconds since epoch."
                    type: integer
                }
                # Machine statistics, see the machine_stats definition
                machine_stats {
                    description: "The machine statistics."
                    "$ref": "#/definitions/machine_stats"
                }
            }
        }
        response {
            type: object
            properties {}
        }
    }
}
|
|
# workers.get_metric_keys - returns metric keys grouped into categories,
# optionally restricted to a list of worker IDs.
get_metric_keys {
    "2.4" {
        description: "Returns worker statistics metric keys grouped by categories."
        request {
            type: object
            properties {
                # Optional worker filter
                worker_ids {
                    description: "List of worker ids to collect metrics for. If not provided or empty then all the company workers metrics are analyzed."
                    type: array
                    items {
                        type: string
                    }
                }
            }
        }
        response {
            type: object
            properties {
                # One metrics_category object per category
                categories {
                    type: array
                    description: "List of unique metric categories found in the statistics of the requested workers."
                    items {
                        "$ref": "#/definitions/metrics_category"
                    }
                }
            }
        }
    }
}
|
|
# workers.get_stats - aggregated worker statistics over a time range.
# from_date, to_date, interval and items are required.
get_stats {
    "2.4" {
        description: "Returns statistics for the selected workers and time range aggregated by date intervals."
        request {
            type: object
            required: [from_date, to_date, interval, items]
            properties {
                # Optional worker filter
                worker_ids {
                    description: "List of worker ids to collect metrics for. If not provided or empty then all the company workers metrics are analyzed."
                    type: array
                    items {
                        type: string
                    }
                }
                # Range start (required)
                from_date {
                    description: "Starting time (in seconds from epoch) for collecting statistics"
                    type: number
                }
                # Range end (required)
                to_date {
                    description: "Ending time (in seconds from epoch) for collecting statistics"
                    type: number
                }
                # Bucket size in seconds (required)
                interval {
                    description: "Time interval in seconds for a single statistics point. The minimal value is 1"
                    type: integer
                }
                # Requested statistics (required); each entry is a stat_item
                items {
                    description: "List of metric keys and requested statistics"
                    type: array
                    items {
                        "$ref": "#/definitions/stat_item"
                    }
                }
                # Off by default
                split_by_variant {
                    description: "If true then break statistics by hardware sub types"
                    type: boolean
                    default: false
                }
            }
        }
        response {
            type: object
            properties {
                # One worker_stats object per worker
                workers {
                    type: array
                    description: "List of the requested workers with their statistics"
                    items {
                        "$ref": "#/definitions/worker_stats"
                    }
                }
            }
        }
    }
}
|
|
# workers.get_activity_report - worker counts over a time range.
# from_date, to_date and interval are required; the response holds two
# activity_series objects ("total" and "active").
get_activity_report {
    "2.4" {
        description: "Returns count of active company workers in the selected time range."
        request {
            type: object
            required: [from_date, to_date, interval]
            properties {
                # Range start (required)
                from_date {
                    description: "Starting time (in seconds from epoch) for collecting statistics"
                    type: number
                }
                # Range end (required)
                to_date {
                    description: "Ending time (in seconds from epoch) for collecting statistics"
                    type: number
                }
                # Bucket size in seconds (required)
                interval {
                    description: "Time interval in seconds for a single statistics point. The minimal value is 1"
                    type: integer
                }
            }
        }
        response {
            type: object
            properties {
                # Series covering every reporting worker
                total {
                    description: "Activity series that include all the workers that sent reports in the given time interval."
                    "$ref": "#/definitions/activity_series"
                }
                # Series covering only workers that worked on a task
                active {
                    description: "Activity series that include only workers that worked on a task in the given time interval."
                    "$ref": "#/definitions/activity_series"
                }
            }
        }
    }
}
|
|
}
|