- Integrate apaw_agent_model_research_v3.html as standalone dashboard
- Add model-benchmarks.json with 32 agents, 11 scored models, 11 recommendations
- Add build-research-dashboard.ts: inject live data into template → standalone HTML
- Add rebuild-template.cjs: regenerate template from v3.html source
- Add sync-benchmarks-from-yaml.cjs: sync YAML → JSON round-trip
- Add sync-model-research.ts: apply recommendation matrix to config files
- Add model-benchmarks.schema.json and model-research.schema.json for validation
- Add bidirectional-data-flow.md architecture documentation
- Add log-execution.cjs pipeline hook
- Update capability-index.yaml: add fallback_models, failover_strategy
- Update kilo-meta.json, kilo.jsonc, KILO_SPEC.md with synced models
- Update evolution.md / research.md / self-evolution.md / evolutionary-sync.md docs
- Fix security-auditor.md: quote YAML color (#DC2626)
- Fix orchestrator.md: remove duplicate devops-engineer key
- Build research-dashboard.html (106KB standalone) + dated archive
553 lines
13 KiB
JSON
{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://app.kilo.ai/model-benchmarks.schema.json",
  "title": "APAW Model Benchmarks Data",
  "description": "Schema for static model benchmarks extracted from HTML sources",
  "type": "object",
  "required": [
    "version",
    "generated",
    "source",
    "metadata",
    "models",
    "groq_models",
    "agent_model_scores",
    "if_scores",
    "agent_current_config",
    "recommendations",
    "impact_data",
    "benchmark_comparison"
  ],
  "properties": {
    "version": {
      "type": "string",
      "const": "1.0.0"
    },
    "generated": {
      "type": "string",
      "format": "date-time"
    },
    "source": {
      "type": "string",
      "description": "Source of benchmark data (e.g. HTML scraping, API, manual entry)"
    },
    "metadata": {
      "type": "object",
      "properties": {
        "scrape_date": {
          "type": "string",
          "format": "date-time"
        },
        "source_urls": {
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "notes": {
          "type": "string"
        },
        "data_quality": {
          "type": "string",
          "enum": [
            "high",
            "medium",
            "low",
            "estimated"
          ]
        }
      }
    },
    "models": {
      "type": "array",
      "description": "All benchmarked models from various providers",
      "items": {
        "type": "object",
        "required": [
          "id",
          "name",
          "provider",
          "category"
        ],
        "properties": {
          "id": {
            "type": "string",
            "description": "Model identifier"
          },
          "name": {
            "type": "string"
          },
          "organization": {
            "type": "string"
          },
          "provider": {
            "type": "string",
            "enum": [
              "ollama",
              "ollama-cloud",
              "openrouter",
              "groq",
              "anthropic",
              "openai",
              "meta",
              "cohere",
              "google",
              "microsoft",
              "unknown"
            ]
          },
          "category": {
            "type": "string",
            "enum": [
              "big",
              "medium",
              "small",
              "coder",
              "reasoning",
              "creative"
            ]
          },
          "parameters": {
            "type": "string"
          },
          "benchmarks": {
            "type": "object",
            "properties": {
              "swe_bench": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "swe_bench_pro": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "terminal_bench": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "live_codebench": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "gpqa": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "hle": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "browse_comp": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "m_mlu": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "m_mlu_pro": {
                "type": [
                  "number",
                  "null"
                ]
              }
            }
          },
          "description": {
            "type": "string"
          },
          "availability": {
            "type": "object",
            "properties": {
              "rpm": {
                "type": [
                  "integer",
                  "null"
                ]
              },
              "rpd": {
                "type": [
                  "integer",
                  "string",
                  "null"
                ]
              },
              "tpm": {
                "type": [
                  "integer",
                  "string",
                  "null"
                ]
              },
              "tpd": {
                "type": [
                  "integer",
                  "string",
                  "null"
                ]
              }
            }
          },
          "free": {
            "type": "boolean"
          },
          "cost_per_1m_input": {
            "type": [
              "number",
              "string",
              "null"
            ]
          },
          "tier": {
            "type": "string",
            "enum": [
              "free",
              "trial",
              "paid",
              "enterprise"
            ]
          }
        }
      }
    },
    "groq_models": {
      "type": "array",
      "description": "Groq-specific models with performance data",
      "items": {
        "type": "object",
        "required": [
          "id",
          "name",
          "speed_tps",
          "provider"
        ],
        "properties": {
          "id": {
            "type": "string"
          },
          "name": {
            "type": "string"
          },
          "speed_tps": {
            "type": [
              "number",
              "string"
            ]
          },
          "provider": {
            "type": "string",
            "const": "groq"
          },
          "benchmarks": {
            "type": "object"
          },
          "availability": {
            "type": "object"
          }
        }
      }
    },
    "agent_model_scores": {
      "type": "array",
      "description": "Agent × Model compatibility scoring matrices",
      "items": {
        "type": "object",
        "required": [
          "agent",
          "model_id",
          "score",
          "category"
        ],
        "properties": {
          "agent": {
            "type": "string"
          },
          "model_id": {
            "type": "string"
          },
          "score": {
            "type": "number",
            "minimum": 0,
            "maximum": 100
          },
          "category": {
            "type": "string",
            "enum": [
              "performance",
              "instruction_following",
              "creativity",
              "code_generation"
            ]
          },
          "reason": {
            "type": "string"
          },
          "timestamp": {
            "type": "string",
            "format": "date-time"
          },
          "current_model_id": {
            "type": "string",
            "description": "Current model ID string (replaces index)"
          }
        }
      }
    },
    "if_scores": {
      "type": "object",
      "description": "Instruction Following scores mapping",
      "additionalProperties": {
        "type": "number",
        "minimum": 0,
        "maximum": 100
      }
    },
    "agent_current_config": {
      "type": "array",
      "description": "Current agent model configurations",
      "items": {
        "type": "object",
        "required": [
          "agent",
          "model",
          "provider",
          "status"
        ],
        "properties": {
          "agent": {
            "type": "string"
          },
          "model": {
            "type": "string"
          },
          "provider": {
            "type": "string"
          },
          "status": {
            "type": "string",
            "enum": [
              "active",
              "testing",
              "deprecated",
              "pending"
            ]
          },
          "reasoning_effort": {
            "type": "string",
            "enum": [
              "L",
              "M",
              "H"
            ]
          },
          "fit_score": {
            "type": "number"
          },
          "date_applied": {
            "type": "string",
            "format": "date-time"
          }
        }
      }
    },
    "recommendations": {
      "type": "array",
      "description": "Model change recommendations based on benchmarks",
      "items": {
        "type": "object",
        "required": [
          "agent",
          "action",
          "current_model",
          "recommended_model",
          "impact"
        ],
        "properties": {
          "agent": {
            "type": "string"
          },
          "action": {
            "type": "string",
            "enum": [
              "update_model",
              "confirm_model",
              "add_fallback",
              "redesign_agent"
            ]
          },
          "current_model": {
            "type": "string"
          },
          "current_provider": {
            "type": "string"
          },
          "recommended_model": {
            "type": "string"
          },
          "recommended_provider": {
            "type": "string"
          },
          "impact": {
            "type": "string",
            "enum": [
              "critical",
              "high",
              "medium",
              "low"
            ]
          },
          "rationale": {
            "type": "string"
          },
          "expected_improvement": {
            "type": "object"
          },
          "applied": {
            "type": "boolean"
          }
        }
      }
    },
    "impact_data": {
      "type": "array",
      "description": "Impact analysis of model changes",
      "items": {
        "type": "object",
        "required": [
          "agent",
          "model_change",
          "impact_score"
        ],
        "properties": {
          "agent": {
            "type": "string"
          },
          "model_change": {
            "type": "string"
          },
          "impact_score": {
            "type": "number",
            "minimum": 0,
            "maximum": 100,
            "description": "Impact score 0-100"
          }
        }
      }
    },
    "benchmark_comparison": {
      "type": "object",
      "description": "APAW vs closed-source benchmark comparison",
      "properties": {
        "benchmarks": {
          "type": "array",
          "description": "Benchmark names used for comparison",
          "items": {
            "type": "string"
          }
        },
        "closed_source_models": {
          "type": "array",
          "description": "Closed-source models included in comparison",
          "items": {
            "type": "object",
            "properties": {
              "name": {
                "type": "string"
              },
              "provider": {
                "type": "string"
              },
              "benchmarks": {
                "type": "object"
              }
            }
          }
        },
        "apaw_models": {
          "type": "array",
          "description": "APAW pipeline models included in comparison",
          "items": {
            "type": "object",
            "properties": {
              "name": {
                "type": "string"
              },
              "provider": {
                "type": "string"
              },
              "benchmarks": {
                "type": "object"
              }
            }
          }
        },
        "apaw_best": {
          "type": "object",
          "description": "Best APAW model per benchmark",
          "additionalProperties": {
            "type": "object",
            "properties": {
              "model": {
                "type": "string"
              },
              "score": {
                "type": "number"
              },
              "gap_to_closed": {
                "type": [
                  "number",
                  "string"
                ]
              }
            }
          }
        },
        "closed_best": {
          "type": "object",
          "description": "Best closed-source model per benchmark",
          "additionalProperties": {
            "type": "object",
            "properties": {
              "model": {
                "type": "string"
              },
              "score": {
                "type": "number"
              }
            }
          }
        },
        "summary": {
          "type": "object",
          "properties": {
            "apaw_avg_score": {
              "type": "number"
            },
            "closed_avg_score": {
              "type": "number"
            },
            "coverage_gap": {
              "type": "string"
            }
          }
        }
      }
    }
  }
}