feat: bidirectional research dashboard + agent config fixes

- Integrate apaw_agent_model_research_v3.html as standalone dashboard
- Add model-benchmarks.json with 32 agents, 11 scored models, 11 recommendations
- Add build-research-dashboard.ts: inject live data into template → standalone HTML
- Add rebuild-template.cjs: regenerate template from v3.html source
- Add sync-benchmarks-from-yaml.cjs: sync YAML → JSON round-trip
- Add sync-model-research.ts: apply recommendation matrix to config files
- Add model-benchmarks.schema.json and model-research.schema.json for validation
- Add bidirectional-data-flow.md architecture documentation
- Add log-execution.cjs pipeline hook
- Update capability-index.yaml: add fallback_models, failover_strategy
- Update kilo-meta.json, kilo.jsonc, KILO_SPEC.md with synced models
- Update evolution.md / research.md / self-evolution.md / evolutionary-sync.md docs
- Fix security-auditor.md: quote YAML color (#DC2626)
- Fix orchestrator.md: remove duplicate devops-engineer key
- Build research-dashboard.html (106KB standalone) + dated archive
This commit is contained in:
NW
2026-04-29 21:04:22 +01:00
parent 2ae7789802
commit 3badb259cc
29 changed files with 13779 additions and 992 deletions

View File

@@ -0,0 +1,553 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://app.kilo.ai/model-benchmarks.schema.json",
"title": "APAW Model Benchmarks Data",
"description": "Schema for static model benchmarks extracted from HTML sources",
"type": "object",
"required": [
"version",
"generated",
"source",
"metadata",
"models",
"groq_models",
"agent_model_scores",
"if_scores",
"agent_current_config",
"recommendations",
"impact_data",
"benchmark_comparison"
],
"properties": {
"version": {
"type": "string",
"const": "1.0.0"
},
"generated": {
"type": "string",
"format": "date-time"
},
"source": {
"type": "string",
"description": "Source of benchmark data (e.g. HTML scraping, API, manual entry)"
},
"metadata": {
"type": "object",
"properties": {
"scrape_date": {
"type": "string",
"format": "date-time"
},
"source_urls": {
"type": "array",
"items": {
"type": "string"
}
},
"notes": {
"type": "string"
},
"data_quality": {
"type": "string",
"enum": [
"high",
"medium",
"low",
"estimated"
]
}
}
},
"models": {
"type": "array",
"description": "All benchmarked models from various providers",
"items": {
"type": "object",
"required": [
"id",
"name",
"provider",
"category"
],
"properties": {
"id": {
"type": "string",
"description": "Model identifier"
},
"name": {
"type": "string"
},
"organization": {
"type": "string"
},
"provider": {
"type": "string",
"enum": [
"ollama",
"ollama-cloud",
"openrouter",
"groq",
"anthropic",
"openai",
"meta",
"cohere",
"google",
"microsoft",
"unknown"
]
},
"category": {
"type": "string",
"enum": [
"big",
"medium",
"small",
"coder",
"reasoning",
"creative"
]
},
"parameters": {
"type": "string"
},
"benchmarks": {
"type": "object",
"properties": {
"swe_bench": {
"type": [
"number",
"null"
]
},
"swe_bench_pro": {
"type": [
"number",
"null"
]
},
"terminal_bench": {
"type": [
"number",
"null"
]
},
"live_codebench": {
"type": [
"number",
"null"
]
},
"gpqa": {
"type": [
"number",
"null"
]
},
"hle": {
"type": [
"number",
"null"
]
},
"browse_comp": {
"type": [
"number",
"null"
]
},
"m_mlu": {
"type": [
"number",
"null"
]
},
"m_mlu_pro": {
"type": [
"number",
"null"
]
}
}
},
"description": {
"type": "string"
},
"availability": {
"type": "object",
"properties": {
"rpm": {
"type": [
"integer",
"null"
]
},
"rpd": {
"type": [
"integer",
"string",
"null"
]
},
"tpm": {
"type": [
"integer",
"string",
"null"
]
},
"tpd": {
"type": [
"integer",
"string",
"null"
]
}
}
},
"free": {
"type": "boolean"
},
"cost_per_1m_input": {
"type": [
"number",
"string",
"null"
]
},
"tier": {
"type": "string",
"enum": [
"free",
"trial",
"paid",
"enterprise"
]
}
}
}
},
"groq_models": {
"type": "array",
"description": "Groq-specific models with performance data",
"items": {
"type": "object",
"required": [
"id",
"name",
"speed_tps",
"provider"
],
"properties": {
"id": {
"type": "string"
},
"name": {
"type": "string"
},
"speed_tps": {
"type": [
"number",
"string"
]
},
"provider": {
"type": "string",
"const": "groq"
},
"benchmarks": {
"type": "object"
},
"availability": {
"type": "object"
}
}
}
},
"agent_model_scores": {
"type": "array",
"description": "Agent × Model compatibility scoring matrices",
"items": {
"type": "object",
"required": [
"agent",
"model_id",
"score",
"category"
],
"properties": {
"agent": {
"type": "string"
},
"model_id": {
"type": "string"
},
"score": {
"type": "number",
"minimum": 0,
"maximum": 100
},
"category": {
"type": "string",
"enum": [
"performance",
"instruction_following",
"creativity",
"code_generation"
]
},
"reason": {
"type": "string"
},
"timestamp": {
"type": "string",
"format": "date-time"
},
"current_model_id": {
"type": "string",
"description": "Current model ID string (replaces index)"
}
}
}
},
"if_scores": {
"type": "object",
"description": "Instruction Following scores mapping",
"additionalProperties": {
"type": "number",
"minimum": 0,
"maximum": 100
}
},
"agent_current_config": {
"type": "array",
"description": "Current agent model configurations",
"items": {
"type": "object",
"required": [
"agent",
"model",
"provider",
"status"
],
"properties": {
"agent": {
"type": "string"
},
"model": {
"type": "string"
},
"provider": {
"type": "string"
},
"status": {
"type": "string",
"enum": [
"active",
"testing",
"deprecated",
"pending"
]
},
"reasoning_effort": {
"type": "string",
"enum": [
"L",
"M",
"H"
]
},
"fit_score": {
"type": "number"
},
"date_applied": {
"type": "string",
"format": "date-time"
}
}
}
},
"recommendations": {
"type": "array",
"description": "Model change recommendations based on benchmarks",
"items": {
"type": "object",
"required": [
"agent",
"action",
"current_model",
"recommended_model",
"impact"
],
"properties": {
"agent": {
"type": "string"
},
"action": {
"type": "string",
"enum": [
"update_model",
"confirm_model",
"add_fallback",
"redesign_agent"
]
},
"current_model": {
"type": "string"
},
"current_provider": {
"type": "string"
},
"recommended_model": {
"type": "string"
},
"recommended_provider": {
"type": "string"
},
"impact": {
"type": "string",
"enum": [
"critical",
"high",
"medium",
"low"
]
},
"rationale": {
"type": "string"
},
"expected_improvement": {
"type": "object"
},
"applied": {
"type": "boolean"
}
}
}
},
"impact_data": {
"type": "array",
"description": "Impact analysis of model changes",
"items": {
"type": "object",
"required": [
"agent",
"model_change",
"impact_score"
],
"properties": {
"agent": {
"type": "string"
},
"model_change": {
"type": "string"
},
"impact_score": {
"type": "number",
"minimum": 0,
"maximum": 100,
"description": "Impact score 0-100"
}
}
}
},
"benchmark_comparison": {
"type": "object",
"description": "APAW vs closed-source benchmark comparison",
"properties": {
"benchmarks": {
"type": "array",
"description": "Benchmark names used for comparison",
"items": {
"type": "string"
}
},
"closed_source_models": {
"type": "array",
"description": "Closed-source models included in comparison",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"provider": {
"type": "string"
},
"benchmarks": {
"type": "object"
}
}
}
},
"apaw_models": {
"type": "array",
"description": "APAW pipeline models included in comparison",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"provider": {
"type": "string"
},
"benchmarks": {
"type": "object"
}
}
}
},
"apaw_best": {
"type": "object",
"description": "Best APAW model per benchmark",
"additionalProperties": {
"type": "object",
"properties": {
"model": {
"type": "string"
},
"score": {
"type": "number"
},
"gap_to_closed": {
"type": [
"number",
"string"
]
}
}
}
},
"closed_best": {
"type": "object",
"description": "Best closed-source model per benchmark",
"additionalProperties": {
"type": "object",
"properties": {
"model": {
"type": "string"
},
"score": {
"type": "number"
}
}
}
},
"summary": {
"type": "object",
"properties": {
"apaw_avg_score": {
"type": "number"
},
"closed_avg_score": {
"type": "number"
},
"coverage_gap": {
"type": "string"
}
}
}
}
}
}
}