feat: bidirectional research dashboard + agent config fixes

- Integrate apaw_agent_model_research_v3.html as standalone dashboard
- Add model-benchmarks.json with 32 agents, 11 scored models, 11 recommendations
- Add build-research-dashboard.ts: inject live data into template → standalone HTML
- Add rebuild-template.cjs: regenerate template from v3.html source
- Add sync-benchmarks-from-yaml.cjs: sync YAML → JSON round-trip
- Add sync-model-research.ts: apply recommendation matrix to config files
- Add model-benchmarks.schema.json and model-research.schema.json for validation
- Add bidirectional-data-flow.md architecture documentation
- Add log-execution.cjs pipeline hook
- Update capability-index.yaml: add fallback_models, failover_strategy
- Update kilo-meta.json, kilo.jsonc, KILO_SPEC.md with synced models
- Update evolution.md / research.md / self-evolution.md / evolutionary-sync.md docs
- Fix security-auditor.md: quote YAML color (#DC2626)
- Fix orchestrator.md: remove duplicate devops-engineer key
- Build research-dashboard.html (106KB standalone) + dated archive
This commit is contained in:
¨NW¨
2026-04-29 21:04:22 +01:00
parent 2ae7789802
commit 3badb259cc
29 changed files with 13779 additions and 992 deletions

View File

@@ -1,12 +1,12 @@
{
"version": "1.0.0",
"lastUpdated": "2026-04-23T06:24:32.543Z",
"lastUpdated": "2026-04-27T20:28:58.592Z",
"agents": {
"lead-developer": {
"current": {
"description": "Primary code writer for backend and core logic. Writes implementation to pass tests",
"mode": "subagent",
"model": "ollama-cloud/qwen3-coder:480b",
"model": "ollama-cloud/nemotron-3-super",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#DC2626\"",
@@ -27,6 +27,24 @@
"to": "ollama-cloud/qwen3-coder:480b",
"reason": "Initial configuration from capability-index.yaml",
"source": "git"
},
{
"date": "2026-04-27T16:56:09.013Z",
"commit": "model-research-sync",
"type": "model_change",
"from": "ollama-cloud/qwen3-coder:480b",
"to": "ollama-cloud/nemotron-3-super",
"reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.",
"source": "research"
},
{
"date": "2026-04-27T20:28:58.592Z",
"commit": "model-research-sync",
"type": "model_change",
"from": "ollama-cloud/qwen3-coder:480b",
"to": "ollama-cloud/nemotron-3-super",
"reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.",
"source": "research"
}
],
"performance_log": []
@@ -255,7 +273,7 @@
"current": {
"description": "Designs technical specifications, data schemas, and API contracts before implementation",
"mode": "subagent",
"model": "ollama-cloud/glm-5.1",
"model": "ollama-cloud/nemotron-3-super",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#0891B2\"",
@@ -285,6 +303,15 @@
"to": "ollama-cloud/glm-5.1",
"reason": "Model update from sync",
"source": "git"
},
{
"date": "2026-04-27T16:59:52.825Z",
"commit": "model-research-sync",
"type": "model_change",
"from": "ollama-cloud/glm-5.1",
"to": "ollama-cloud/nemotron-3-super",
"reason": "Test recommendation for model research sync script",
"source": "research"
}
],
"performance_log": []

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,553 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://app.kilo.ai/model-benchmarks.schema.json",
"title": "APAW Model Benchmarks Data",
"description": "Schema for static model benchmarks extracted from HTML sources",
"type": "object",
"required": [
"version",
"generated",
"source",
"metadata",
"models",
"groq_models",
"agent_model_scores",
"if_scores",
"agent_current_config",
"recommendations",
"impact_data",
"benchmark_comparison"
],
"properties": {
"version": {
"type": "string",
"const": "1.0.0"
},
"generated": {
"type": "string",
"format": "date-time"
},
"source": {
"type": "string",
"description": "Source of benchmark data (e.g. HTML scraping, API, manual entry)"
},
"metadata": {
"type": "object",
"properties": {
"scrape_date": {
"type": "string",
"format": "date-time"
},
"source_urls": {
"type": "array",
"items": {
"type": "string"
}
},
"notes": {
"type": "string"
},
"data_quality": {
"type": "string",
"enum": [
"high",
"medium",
"low",
"estimated"
]
}
}
},
"models": {
"type": "array",
"description": "All benchmarked models from various providers",
"items": {
"type": "object",
"required": [
"id",
"name",
"provider",
"category"
],
"properties": {
"id": {
"type": "string",
"description": "Model identifier"
},
"name": {
"type": "string"
},
"organization": {
"type": "string"
},
"provider": {
"type": "string",
"enum": [
"ollama",
"ollama-cloud",
"openrouter",
"groq",
"anthropic",
"openai",
"meta",
"cohere",
"google",
"microsoft",
"unknown"
]
},
"category": {
"type": "string",
"enum": [
"big",
"medium",
"small",
"coder",
"reasoning",
"creative"
]
},
"parameters": {
"type": "string"
},
"benchmarks": {
"type": "object",
"properties": {
"swe_bench": {
"type": [
"number",
"null"
]
},
"swe_bench_pro": {
"type": [
"number",
"null"
]
},
"terminal_bench": {
"type": [
"number",
"null"
]
},
"live_codebench": {
"type": [
"number",
"null"
]
},
"gpqa": {
"type": [
"number",
"null"
]
},
"hle": {
"type": [
"number",
"null"
]
},
"browse_comp": {
"type": [
"number",
"null"
]
},
"m_mlu": {
"type": [
"number",
"null"
]
},
"m_mlu_pro": {
"type": [
"number",
"null"
]
}
}
},
"description": {
"type": "string"
},
"availability": {
"type": "object",
"properties": {
"rpm": {
"type": [
"integer",
"null"
]
},
"rpd": {
"type": [
"integer",
"string",
"null"
]
},
"tpm": {
"type": [
"integer",
"string",
"null"
]
},
"tpd": {
"type": [
"integer",
"string",
"null"
]
}
}
},
"free": {
"type": "boolean"
},
"cost_per_1m_input": {
"type": [
"number",
"string",
"null"
]
},
"tier": {
"type": "string",
"enum": [
"free",
"trial",
"paid",
"enterprise"
]
}
}
}
},
"groq_models": {
"type": "array",
"description": "Groq-specific models with performance data",
"items": {
"type": "object",
"required": [
"id",
"name",
"speed_tps",
"provider"
],
"properties": {
"id": {
"type": "string"
},
"name": {
"type": "string"
},
"speed_tps": {
"type": [
"number",
"string"
]
},
"provider": {
"type": "string",
"const": "groq"
},
"benchmarks": {
"type": "object"
},
"availability": {
"type": "object"
}
}
}
},
"agent_model_scores": {
"type": "array",
"description": "Agent × Model compatibility scoring matrices",
"items": {
"type": "object",
"required": [
"agent",
"model_id",
"score",
"category"
],
"properties": {
"agent": {
"type": "string"
},
"model_id": {
"type": "string"
},
"score": {
"type": "number",
"minimum": 0,
"maximum": 100
},
"category": {
"type": "string",
"enum": [
"performance",
"instruction_following",
"creativity",
"code_generation"
]
},
"reason": {
"type": "string"
},
"timestamp": {
"type": "string",
"format": "date-time"
},
"current_model_id": {
"type": "string",
"description": "Current model ID string (replaces index)"
}
}
}
},
"if_scores": {
"type": "object",
"description": "Instruction Following scores mapping",
"additionalProperties": {
"type": "number",
"minimum": 0,
"maximum": 100
}
},
"agent_current_config": {
"type": "array",
"description": "Current agent model configurations",
"items": {
"type": "object",
"required": [
"agent",
"model",
"provider",
"status"
],
"properties": {
"agent": {
"type": "string"
},
"model": {
"type": "string"
},
"provider": {
"type": "string"
},
"status": {
"type": "string",
"enum": [
"active",
"testing",
"deprecated",
"pending"
]
},
"reasoning_effort": {
"type": "string",
"enum": [
"L",
"M",
"H"
]
},
"fit_score": {
"type": "number"
},
"date_applied": {
"type": "string",
"format": "date-time"
}
}
}
},
"recommendations": {
"type": "array",
"description": "Model change recommendations based on benchmarks",
"items": {
"type": "object",
"required": [
"agent",
"action",
"current_model",
"recommended_model",
"impact"
],
"properties": {
"agent": {
"type": "string"
},
"action": {
"type": "string",
"enum": [
"update_model",
"confirm_model",
"add_fallback",
"redesign_agent"
]
},
"current_model": {
"type": "string"
},
"current_provider": {
"type": "string"
},
"recommended_model": {
"type": "string"
},
"recommended_provider": {
"type": "string"
},
"impact": {
"type": "string",
"enum": [
"critical",
"high",
"medium",
"low"
]
},
"rationale": {
"type": "string"
},
"expected_improvement": {
"type": "object"
},
"applied": {
"type": "boolean"
}
}
}
},
"impact_data": {
"type": "array",
"description": "Impact analysis of model changes",
"items": {
"type": "object",
"required": [
"agent",
"model_change",
"impact_score"
],
"properties": {
"agent": {
"type": "string"
},
"model_change": {
"type": "string"
},
"impact_score": {
"type": "number",
"minimum": 0,
"maximum": 100,
"description": "Impact score 0-100"
}
}
}
},
"benchmark_comparison": {
"type": "object",
"description": "APAW vs closed-source benchmark comparison",
"properties": {
"benchmarks": {
"type": "array",
"description": "Benchmark names used for comparison",
"items": {
"type": "string"
}
},
"closed_source_models": {
"type": "array",
"description": "Closed-source models included in comparison",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"provider": {
"type": "string"
},
"benchmarks": {
"type": "object"
}
}
}
},
"apaw_models": {
"type": "array",
"description": "APAW pipeline models included in comparison",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"provider": {
"type": "string"
},
"benchmarks": {
"type": "object"
}
}
}
},
"apaw_best": {
"type": "object",
"description": "Best APAW model per benchmark",
"additionalProperties": {
"type": "object",
"properties": {
"model": {
"type": "string"
},
"score": {
"type": "number"
},
"gap_to_closed": {
"type": [
"number",
"string"
]
}
}
}
},
"closed_best": {
"type": "object",
"description": "Best closed-source model per benchmark",
"additionalProperties": {
"type": "object",
"properties": {
"model": {
"type": "string"
},
"score": {
"type": "number"
}
}
}
},
"summary": {
"type": "object",
"properties": {
"apaw_avg_score": {
"type": "number"
},
"closed_avg_score": {
"type": "number"
},
"coverage_gap": {
"type": "string"
}
}
}
}
}
}
}

View File

@@ -0,0 +1,59 @@
{
"version": "1.0.0",
"generated": "2026-04-27T17:51:36.000Z",
"source": "/research model-optimization",
"models": [],
"recommendations": [
{
"agent": "lead-developer",
"action": "update_model",
"current_model": "ollama-cloud/qwen3-coder:480b",
"current_provider": "ollama-cloud",
"recommended_model": "ollama-cloud/nemotron-3-super",
"recommended_provider": "ollama-cloud",
"impact": "high",
"expected_improvement": {
"quality": "+15%",
"speed": "+20%",
"context_window": "1M→1M"
},
"score_before": 85,
"score_after": 92,
"score_delta": 7,
"rationale": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.",
"applied": false,
"applied_date": null
},
{
"agent": "devops-engineer",
"action": "confirm_model",
"current_model": "ollama-cloud/nemotron-3-super",
"current_provider": "ollama-cloud",
"recommended_model": "ollama-cloud/nemotron-3-super",
"recommended_provider": "ollama-cloud",
"impact": "low",
"expected_improvement": {
"quality": "0%",
"speed": "0%",
"context_window": "1M→1M"
},
"score_before": 88,
"score_after": 88,
"score_delta": 0,
"rationale": "Current model already optimal for DevOps tasks. Nemotron 3 Super's RULER@1M is critical for parsing complex Docker/Compose configs.",
"applied": false,
"applied_date": null
}
],
"heatmap": {
"models": [],
"agents": []
},
"closed_source_comparison": {},
"capability_index_patch": [],
"summary": {
"avg_quality_improvement": "+7.5%",
"providers_used": ["ollama-cloud"],
"key_models": ["nemotron-3-super"],
"total_recommendations": 2,
"applied_count": 0,
"pending_count": 2
}
}

View File

@@ -0,0 +1,331 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://app.kilo.ai/model-research.schema.json",
"title": "APAW Model Research Output",
"description": "Schema for automated model research and recommendation output",
"type": "object",
"required": ["version", "generated", "source", "models", "recommendations", "heatmap"],
"properties": {
"version": {
"type": "string",
"const": "1.0.0"
},
"generated": {
"type": "string",
"format": "date-time"
},
"source": {
"type": "string",
"description": "What triggered this research (e.g. /evolution, /research, manual)"
},
"trigger": {
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": ["evolution_cycle", "manual_research", "fitness_below_threshold", "scheduled"]
},
"issue": {
"type": "integer"
},
"fitness_score": {
"type": "number"
},
"reason": {
"type": "string"
}
}
},
"models": {
"type": "array",
"items": {
"type": "object",
"required": ["id", "name", "organization", "if_score", "provider"],
"properties": {
"id": {
"type": "string",
"description": "Full model ID like ollama-cloud/qwen3-coder:480b"
},
"name": {
"type": "string"
},
"organization": {
"type": "string"
},
"parameters": {
"type": "string"
},
"context_window": {
"type": "string"
},
"swe_bench": {
"type": ["number", "null"]
},
"swe_bench_pro": {
"type": ["number", "null"]
},
"terminal_bench": {
"type": ["number", "null"]
},
"live_codebench": {
"type": ["number", "null"]
},
"gpqa": {
"type": ["number", "null"]
},
"hle": {
"type": ["number", "null"]
},
"browse_comp": {
"type": ["number", "null"]
},
"if_score": {
"type": "number",
"minimum": 0,
"maximum": 100,
"description": "Instruction Following composite score (IFEval + IFBench)"
},
"categories": {
"type": "array",
"items": {
"type": "string"
}
},
"tags": {
"type": "array",
"items": {
"type": "string"
}
},
"provider": {
"type": "string",
"enum": ["ollama", "ollama-cloud", "openrouter", "groq", "hybrid"]
},
"free": {
"type": "boolean"
},
"cost_per_1m_input": {
"type": ["number", "string", "null"]
},
"description": {
"type": "string"
},
"availability": {
"type": "object",
"properties": {
"rpm": {
"type": ["integer", "null"]
},
"rpd": {
"type": ["integer", "string", "null"]
},
"tpm": {
"type": ["integer", "string", "null"]
},
"tpd": {
"type": ["integer", "string", "null"]
}
}
},
"speed_tps": {
"type": ["number", "string", "null"]
}
}
}
},
"recommendations": {
"type": "array",
"items": {
"type": "object",
"required": ["agent", "action", "current_model", "recommended_model", "impact", "rationale"],
"properties": {
"agent": {
"type": "string"
},
"action": {
"type": "string",
"enum": ["update_model", "confirm_model", "add_fallback", "redesign_agent"]
},
"current_model": {
"type": "string"
},
"current_provider": {
"type": "string"
},
"recommended_model": {
"type": "string"
},
"recommended_provider": {
"type": "string"
},
"fallback_model": {
"type": "string"
},
"fallback_strategy": {
"type": "string"
},
"impact": {
"type": "string",
"enum": ["critical", "high", "medium", "low"]
},
"expected_improvement": {
"type": "object",
"properties": {
"quality": {
"type": "string"
},
"speed": {
"type": "string"
},
"context_window": {
"type": "string"
}
}
},
"score_before": {
"type": "number"
},
"score_after": {
"type": "number"
},
"score_delta": {
"type": "number"
},
"rationale": {
"type": "string"
},
"applied": {
"type": "boolean",
"default": false
},
"applied_date": {
"type": ["string", "null"],
"format": "date-time"
}
}
}
},
"heatmap": {
"type": "object",
"description": "Agent × Model compatibility matrix with IF adjustment",
"required": ["models", "agents"],
"properties": {
"models": {
"type": "array",
"items": {
"type": "object",
"required": ["id", "if_score"],
"properties": {
"id": {
"type": "string"
},
"display_name": {
"type": "string"
},
"provider": {
"type": "string"
},
"if_score": {
"type": "number"
}
}
}
},
"agents": {
"type": "array",
"items": {
"type": "object",
"required": ["agent", "reasoning_effort", "scores"],
"properties": {
"agent": {
"type": "string"
},
"current_model": {
"type": "string"
},
"reasoning_effort": {
"type": "string",
"enum": ["L", "M", "H"]
},
"scores": {
"type": "object",
"additionalProperties": {
"type": "number"
},
"description": "Model ID → compatibility score (0-100, IF-adjusted)"
}
}
}
},
"if_adjustment_formula": {
"type": "string",
"default": "score * (0.7 + 0.3 * IF/100)"
}
}
},
"closed_source_comparison": {
"type": "object",
"description": "APAW pipeline models vs top closed-source models",
"properties": {
"benchmarks": {
"type": "array"
},
"models": {
"type": "array"
},
"apaw_best_per_benchmark": {
"type": "object"
},
"closed_best_per_benchmark": {
"type": "object"
}
}
},
"capability_index_patch": {
"type": "array",
"description": "Ready-to-apply patches to capability-index.yaml",
"items": {
"type": "object",
"required": ["agent", "set"],
"properties": {
"agent": {
"type": "string"
},
"set": {
"type": "object",
"additionalProperties": true
}
}
}
},
"summary": {
"type": "object",
"properties": {
"avg_quality_improvement": {
"type": "string"
},
"providers_used": {
"type": "array",
"items": {
"type": "string"
}
},
"key_models": {
"type": "array",
"items": {
"type": "string"
}
},
"total_recommendations": {
"type": "integer"
},
"applied_count": {
"type": "integer"
},
"pending_count": {
"type": "integer"
}
}
}
}
}