feat: bidirectional research dashboard + agent config fixes

- Integrate apaw_agent_model_research_v3.html as standalone dashboard
- Add model-benchmarks.json with 32 agents, 11 scored models, 11 recommendations
- Add build-research-dashboard.ts: inject live data into template → standalone HTML
- Add rebuild-template.cjs: regenerate template from v3.html source
- Add sync-benchmarks-from-yaml.cjs: sync YAML → JSON round-trip
- Add sync-model-research.ts: apply recommendation matrix to config files
- Add model-benchmarks.schema.json and model-research.schema.json for validation
- Add bidirectional-data-flow.md architecture documentation
- Add log-execution.cjs pipeline hook
- Update capability-index.yaml: add fallback_models, failover_strategy
- Update kilo-meta.json, kilo.jsonc, KILO_SPEC.md with synced models
- Update evolution.md / research.md / self-evolution.md / evolutionary-sync.md docs
- Fix security-auditor.md: quote YAML color (#DC2626)
- Fix orchestrator.md: remove duplicate devops-engineer key
- Build research-dashboard.html (106KB standalone) + dated archive
2026-04-29 21:04:22 +01:00
parent 2ae7789802
commit 3badb259cc
29 changed files with 13779 additions and 992 deletions
--- a/agent-evolution/data/agent-versions.json
+++ b/agent-evolution/data/agent-versions.json
@@ -1,12 +1,12 @@
 {
  "version": "1.0.0",
-  "lastUpdated": "2026-04-23T06:24:32.543Z",
+  "lastUpdated": "2026-04-27T20:28:58.592Z",
  "agents": {
    "lead-developer": {
      "current": {
        "description": "Primary code writer for backend and core logic. Writes implementation to pass tests",
        "mode": "subagent",
-        "model": "ollama-cloud/qwen3-coder:480b",
+        "model": "ollama-cloud/nemotron-3-super",
        "provider": "Ollama",
        "variant": "thinking",
        "color": "\"#DC2626\"",
@@ -27,6 +27,24 @@
          "to": "ollama-cloud/qwen3-coder:480b",
          "reason": "Initial configuration from capability-index.yaml",
          "source": "git"
+        },
+        {
+          "date": "2026-04-27T16:56:09.013Z",
+          "commit": "model-research-sync",
+          "type": "model_change",
+          "from": "ollama-cloud/qwen3-coder:480b",
+          "to": "ollama-cloud/nemotron-3-super",
+          "reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.",
+          "source": "research"
+        },
+        {
+          "date": "2026-04-27T20:28:58.592Z",
+          "commit": "model-research-sync",
+          "type": "model_change",
+          "from": "ollama-cloud/qwen3-coder:480b",
+          "to": "ollama-cloud/nemotron-3-super",
+          "reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.",
+          "source": "research"
        }
      ],
      "performance_log": []
@@ -255,7 +273,7 @@
      "current": {
        "description": "Designs technical specifications, data schemas, and API contracts before implementation",
        "mode": "subagent",
-        "model": "ollama-cloud/glm-5.1",
+        "model": "ollama-cloud/nemotron-3-super",
        "provider": "Ollama",
        "variant": "thinking",
        "color": "\"#0891B2\"",
@@ -285,6 +303,15 @@
          "to": "ollama-cloud/glm-5.1",
          "reason": "Model update from sync",
          "source": "git"
+        },
+        {
+          "date": "2026-04-27T16:59:52.825Z",
+          "commit": "model-research-sync",
+          "type": "model_change",
+          "from": "ollama-cloud/glm-5.1",
+          "to": "ollama-cloud/nemotron-3-super",
+          "reason": "Test recommendation for model research sync script",
+          "source": "research"
        }
      ],
      "performance_log": []
--- a/agent-evolution/data/model-benchmarks.json
+++ b/agent-evolution/data/model-benchmarks.json
--- a/agent-evolution/data/model-benchmarks.schema.json
+++ b/agent-evolution/data/model-benchmarks.schema.json
@@ -0,0 +1,553 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://app.kilo.ai/model-benchmarks.schema.json",
+  "title": "APAW Model Benchmarks Data",
+  "description": "Schema for static model benchmarks extracted from HTML sources",
+  "type": "object",
+  "required": [
+    "version",
+    "generated",
+    "source",
+    "metadata",
+    "models",
+    "groq_models",
+    "agent_model_scores",
+    "if_scores",
+    "agent_current_config",
+    "recommendations",
+    "impact_data",
+    "benchmark_comparison"
+  ],
+  "properties": {
+    "version": {
+      "type": "string",
+      "const": "1.0.0"
+    },
+    "generated": {
+      "type": "string",
+      "format": "date-time"
+    },
+    "source": {
+      "type": "string",
+      "description": "Source of benchmark data (e.g. HTML scraping, API, manual entry)"
+    },
+    "metadata": {
+      "type": "object",
+      "properties": {
+        "scrape_date": {
+          "type": "string",
+          "format": "date-time"
+        },
+        "source_urls": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "notes": {
+          "type": "string"
+        },
+        "data_quality": {
+          "type": "string",
+          "enum": [
+            "high",
+            "medium",
+            "low",
+            "estimated"
+          ]
+        }
+      }
+    },
+    "models": {
+      "type": "array",
+      "description": "All benchmarked models from various providers",
+      "items": {
+        "type": "object",
+        "required": [
+          "id",
+          "name",
+          "provider",
+          "category"
+        ],
+        "properties": {
+          "id": {
+            "type": "string",
+            "description": "Model identifier"
+          },
+          "name": {
+            "type": "string"
+          },
+          "organization": {
+            "type": "string"
+          },
+          "provider": {
+            "type": "string",
+            "enum": [
+              "ollama",
+              "ollama-cloud",
+              "openrouter",
+              "groq",
+              "anthropic",
+              "openai",
+              "meta",
+              "cohere",
+              "google",
+              "microsoft",
+              "unknown"
+            ]
+          },
+          "category": {
+            "type": "string",
+            "enum": [
+              "big",
+              "medium",
+              "small",
+              "coder",
+              "reasoning",
+              "creative"
+            ]
+          },
+          "parameters": {
+            "type": "string"
+          },
+          "benchmarks": {
+            "type": "object",
+            "properties": {
+              "swe_bench": {
+                "type": [
+                  "number",
+                  "null"
+                ]
+              },
+              "swe_bench_pro": {
+                "type": [
+                  "number",
+                  "null"
+                ]
+              },
+              "terminal_bench": {
+                "type": [
+                  "number",
+                  "null"
+                ]
+              },
+              "live_codebench": {
+                "type": [
+                  "number",
+                  "null"
+                ]
+              },
+              "gpqa": {
+                "type": [
+                  "number",
+                  "null"
+                ]
+              },
+              "hle": {
+                "type": [
+                  "number",
+                  "null"
+                ]
+              },
+              "browse_comp": {
+                "type": [
+                  "number",
+                  "null"
+                ]
+              },
+              "m_mlu": {
+                "type": [
+                  "number",
+                  "null"
+                ]
+              },
+              "m_mlu_pro": {
+                "type": [
+                  "number",
+                  "null"
+                ]
+              }
+            }
+          },
+          "description": {
+            "type": "string"
+          },
+          "availability": {
+            "type": "object",
+            "properties": {
+              "rpm": {
+                "type": [
+                  "integer",
+                  "null"
+                ]
+              },
+              "rpd": {
+                "type": [
+                  "integer",
+                  "string",
+                  "null"
+                ]
+              },
+              "tpm": {
+                "type": [
+                  "integer",
+                  "string",
+                  "null"
+                ]
+              },
+              "tpd": {
+                "type": [
+                  "integer",
+                  "string",
+                  "null"
+                ]
+              }
+            }
+          },
+          "free": {
+            "type": "boolean"
+          },
+          "cost_per_1m_input": {
+            "type": [
+              "number",
+              "string",
+              "null"
+            ]
+          },
+          "tier": {
+            "type": "string",
+            "enum": [
+              "free",
+              "trial",
+              "paid",
+              "enterprise"
+            ]
+          }
+        }
+      }
+    },
+    "groq_models": {
+      "type": "array",
+      "description": "Groq-specific models with performance data",
+      "items": {
+        "type": "object",
+        "required": [
+          "id",
+          "name",
+          "speed_tps",
+          "provider"
+        ],
+        "properties": {
+          "id": {
+            "type": "string"
+          },
+          "name": {
+            "type": "string"
+          },
+          "speed_tps": {
+            "type": [
+              "number",
+              "string"
+            ]
+          },
+          "provider": {
+            "type": "string",
+            "const": "groq"
+          },
+          "benchmarks": {
+            "type": "object"
+          },
+          "availability": {
+            "type": "object"
+          }
+        }
+      }
+    },
+    "agent_model_scores": {
+      "type": "array",
+      "description": "Agent × Model compatibility scoring matrices",
+      "items": {
+        "type": "object",
+        "required": [
+          "agent",
+          "model_id",
+          "score",
+          "category"
+        ],
+        "properties": {
+          "agent": {
+            "type": "string"
+          },
+          "model_id": {
+            "type": "string"
+          },
+          "score": {
+            "type": "number",
+            "minimum": 0,
+            "maximum": 100
+          },
+          "category": {
+            "type": "string",
+            "enum": [
+              "performance",
+              "instruction_following",
+              "creativity",
+              "code_generation"
+            ]
+          },
+          "reason": {
+            "type": "string"
+          },
+          "timestamp": {
+            "type": "string",
+            "format": "date-time"
+          },
+          "current_model_id": {
+            "type": "string",
+            "description": "Current model ID string (replaces index)"
+          }
+        }
+      }
+    },
+    "if_scores": {
+      "type": "object",
+      "description": "Instruction Following scores mapping",
+      "additionalProperties": {
+        "type": "number",
+        "minimum": 0,
+        "maximum": 100
+      }
+    },
+    "agent_current_config": {
+      "type": "array",
+      "description": "Current agent model configurations",
+      "items": {
+        "type": "object",
+        "required": [
+          "agent",
+          "model",
+          "provider",
+          "status"
+        ],
+        "properties": {
+          "agent": {
+            "type": "string"
+          },
+          "model": {
+            "type": "string"
+          },
+          "provider": {
+            "type": "string"
+          },
+          "status": {
+            "type": "string",
+            "enum": [
+              "active",
+              "testing",
+              "deprecated",
+              "pending"
+            ]
+          },
+          "reasoning_effort": {
+            "type": "string",
+            "enum": [
+              "L",
+              "M",
+              "H"
+            ]
+          },
+          "fit_score": {
+            "type": "number"
+          },
+          "date_applied": {
+            "type": "string",
+            "format": "date-time"
+          }
+        }
+      }
+    },
+    "recommendations": {
+      "type": "array",
+      "description": "Model change recommendations based on benchmarks",
+      "items": {
+        "type": "object",
+        "required": [
+          "agent",
+          "action",
+          "current_model",
+          "recommended_model",
+          "impact"
+        ],
+        "properties": {
+          "agent": {
+            "type": "string"
+          },
+          "action": {
+            "type": "string",
+            "enum": [
+              "update_model",
+              "confirm_model",
+              "add_fallback",
+              "redesign_agent"
+            ]
+          },
+          "current_model": {
+            "type": "string"
+          },
+          "current_provider": {
+            "type": "string"
+          },
+          "recommended_model": {
+            "type": "string"
+          },
+          "recommended_provider": {
+            "type": "string"
+          },
+          "impact": {
+            "type": "string",
+            "enum": [
+              "critical",
+              "high",
+              "medium",
+              "low"
+            ]
+          },
+          "rationale": {
+            "type": "string"
+          },
+          "expected_improvement": {
+            "type": "object"
+          },
+          "applied": {
+            "type": "boolean"
+          }
+        }
+      }
+    },
+    "impact_data": {
+      "type": "array",
+      "description": "Impact analysis of model changes",
+      "items": {
+        "type": "object",
+        "required": [
+          "agent",
+          "model_change",
+          "impact_score"
+        ],
+        "properties": {
+          "agent": {
+            "type": "string"
+          },
+          "model_change": {
+            "type": "string"
+          },
+          "impact_score": {
+            "type": "number",
+            "minimum": 0,
+            "maximum": 100,
+            "description": "Impact score 0-100"
+          }
+        }
+      }
+    },
+    "benchmark_comparison": {
+      "type": "object",
+      "description": "APAW vs closed-source benchmark comparison",
+      "properties": {
+        "benchmarks": {
+          "type": "array",
+          "description": "Benchmark names used for comparison",
+          "items": {
+            "type": "string"
+          }
+        },
+        "closed_source_models": {
+          "type": "array",
+          "description": "Closed-source models included in comparison",
+          "items": {
+            "type": "object",
+            "properties": {
+              "name": {
+                "type": "string"
+              },
+              "provider": {
+                "type": "string"
+              },
+              "benchmarks": {
+                "type": "object"
+              }
+            }
+          }
+        },
+        "apaw_models": {
+          "type": "array",
+          "description": "APAW pipeline models included in comparison",
+          "items": {
+            "type": "object",
+            "properties": {
+              "name": {
+                "type": "string"
+              },
+              "provider": {
+                "type": "string"
+              },
+              "benchmarks": {
+                "type": "object"
+              }
+            }
+          }
+        },
+        "apaw_best": {
+          "type": "object",
+          "description": "Best APAW model per benchmark",
+          "additionalProperties": {
+            "type": "object",
+            "properties": {
+              "model": {
+                "type": "string"
+              },
+              "score": {
+                "type": "number"
+              },
+              "gap_to_closed": {
+                "type": [
+                  "number",
+                  "string"
+                ]
+              }
+            }
+          }
+        },
+        "closed_best": {
+          "type": "object",
+          "description": "Best closed-source model per benchmark",
+          "additionalProperties": {
+            "type": "object",
+            "properties": {
+              "model": {
+                "type": "string"
+              },
+              "score": {
+                "type": "number"
+              }
+            }
+          }
+        },
+        "summary": {
+          "type": "object",
+          "properties": {
+            "apaw_avg_score": {
+              "type": "number"
+            },
+            "closed_avg_score": {
+              "type": "number"
+            },
+            "coverage_gap": {
+              "type": "string"
+            }
+          }
+        }
+      }
+    }
+  }
+}
--- a/agent-evolution/data/model-research-latest.json
+++ b/agent-evolution/data/model-research-latest.json
@@ -0,0 +1,59 @@
+{
+  "version": "1.0.0",
+  "generated": "2026-04-27T17:51:36.000Z",
+  "source": "/research model-optimization",
+  "models": [],
+  "recommendations": [
+    {
+      "agent": "lead-developer",
+      "action": "update_model",
+      "current_model": "ollama-cloud/qwen3-coder:480b",
+      "current_provider": "ollama-cloud",
+      "recommended_model": "ollama-cloud/nemotron-3-super",
+      "recommended_provider": "ollama-cloud",
+      "impact": "high",
+      "expected_improvement": {
+        "quality": "+15%",
+        "speed": "+20%",
+        "context_window": "1M→1M"
+      },
+      "score_before": 85,
+      "score_after": 92,
+      "score_delta": 7,
+      "rationale": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.",
+      "applied": false,
+      "applied_date": null
+    },
+    {
+      "agent": "devops-engineer",
+      "action": "confirm_model",
+      "current_model": "ollama-cloud/nemotron-3-super",
+      "current_provider": "ollama-cloud",
+      "recommended_model": "ollama-cloud/nemotron-3-super",
+      "recommended_provider": "ollama-cloud",
+      "impact": "low",
+      "expected_improvement": {
+        "quality": "0%",
+        "speed": "0%",
+        "context_window": "1M→1M"
+      },
+      "score_before": 88,
+      "score_after": 88,
+      "score_delta": 0,
+      "rationale": "Current model already optimal for DevOps tasks. Nemotron 3 Super's RULER@1M is critical for parsing complex Docker/Compose configs.",
+      "applied": false,
+      "applied_date": null
+    }
+  ],
+  "heatmap": {"models": [], "agents": []},
+  "closed_source_comparison": {},
+  "capability_index_patch": [],
+  "summary": {
+    "avg_quality_improvement": "+7.5%",
+    "providers_used": ["ollama-cloud"],
+    "key_models": ["nemotron-3-super"],
+    "total_recommendations": 2,
+    "applied_count": 0,
+    "pending_count": 2
+  }
+}
--- a/agent-evolution/data/model-research.schema.json
+++ b/agent-evolution/data/model-research.schema.json
@@ -0,0 +1,331 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://app.kilo.ai/model-research.schema.json",
+  "title": "APAW Model Research Output",
+  "description": "Schema for automated model research and recommendation output",
+  "type": "object",
+  "required": ["version", "generated", "source", "models", "recommendations", "heatmap"],
+  "properties": {
+    "version": {
+      "type": "string",
+      "const": "1.0.0"
+    },
+    "generated": {
+      "type": "string",
+      "format": "date-time"
+    },
+    "source": {
+      "type": "string",
+      "description": "What triggered this research (e.g. /evolution, /research, manual)"
+    },
+    "trigger": {
+      "type": "object",
+      "properties": {
+        "type": {
+          "type": "string",
+          "enum": ["evolution_cycle", "manual_research", "fitness_below_threshold", "scheduled"]
+        },
+        "issue": {
+          "type": "integer"
+        },
+        "fitness_score": {
+          "type": "number"
+        },
+        "reason": {
+          "type": "string"
+        }
+      }
+    },
+    "models": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["id", "name", "organization", "if_score", "provider"],
+        "properties": {
+          "id": {
+            "type": "string",
+            "description": "Full model ID like ollama-cloud/qwen3-coder:480b"
+          },
+          "name": {
+            "type": "string"
+          },
+          "organization": {
+            "type": "string"
+          },
+          "parameters": {
+            "type": "string"
+          },
+          "context_window": {
+            "type": "string"
+          },
+          "swe_bench": {
+            "type": ["number", "null"]
+          },
+          "swe_bench_pro": {
+            "type": ["number", "null"]
+          },
+          "terminal_bench": {
+            "type": ["number", "null"]
+          },
+          "live_codebench": {
+            "type": ["number", "null"]
+          },
+          "gpqa": {
+            "type": ["number", "null"]
+          },
+          "hle": {
+            "type": ["number", "null"]
+          },
+          "browse_comp": {
+            "type": ["number", "null"]
+          },
+          "if_score": {
+            "type": "number",
+            "minimum": 0,
+            "maximum": 100,
+            "description": "Instruction Following composite score (IFEval + IFBench)"
+          },
+          "categories": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          },
+          "tags": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          },
+          "provider": {
+            "type": "string",
+            "enum": ["ollama", "ollama-cloud", "openrouter", "groq", "hybrid"]
+          },
+          "free": {
+            "type": "boolean"
+          },
+          "cost_per_1m_input": {
+            "type": ["number", "string", "null"]
+          },
+          "description": {
+            "type": "string"
+          },
+          "availability": {
+            "type": "object",
+            "properties": {
+              "rpm": {
+                "type": ["integer", "null"]
+              },
+              "rpd": {
+                "type": ["integer", "string", "null"]
+              },
+              "tpm": {
+                "type": ["integer", "string", "null"]
+              },
+              "tpd": {
+                "type": ["integer", "string", "null"]
+              }
+            }
+          },
+          "speed_tps": {
+            "type": ["number", "string", "null"]
+          }
+        }
+      }
+    },
+    "recommendations": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "required": ["agent", "action", "current_model", "recommended_model", "impact", "rationale"],
+        "properties": {
+          "agent": {
+            "type": "string"
+          },
+          "action": {
+            "type": "string",
+            "enum": ["update_model", "confirm_model", "add_fallback", "redesign_agent"]
+          },
+          "current_model": {
+            "type": "string"
+          },
+          "current_provider": {
+            "type": "string"
+          },
+          "recommended_model": {
+            "type": "string"
+          },
+          "recommended_provider": {
+            "type": "string"
+          },
+          "fallback_model": {
+            "type": "string"
+          },
+          "fallback_strategy": {
+            "type": "string"
+          },
+          "impact": {
+            "type": "string",
+            "enum": ["critical", "high", "medium", "low"]
+          },
+          "expected_improvement": {
+            "type": "object",
+            "properties": {
+              "quality": {
+                "type": "string"
+              },
+              "speed": {
+                "type": "string"
+              },
+              "context_window": {
+                "type": "string"
+              }
+            }
+          },
+          "score_before": {
+            "type": "number"
+          },
+          "score_after": {
+            "type": "number"
+          },
+          "score_delta": {
+            "type": "number"
+          },
+          "rationale": {
+            "type": "string"
+          },
+          "applied": {
+            "type": "boolean",
+            "default": false
+          },
+          "applied_date": {
+            "type": ["string", "null"],
+            "format": "date-time"
+          }
+        }
+      }
+    },
+    "heatmap": {
+      "type": "object",
+      "description": "Agent × Model compatibility matrix with IF adjustment",
+      "required": ["models", "agents"],
+      "properties": {
+        "models": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "required": ["id", "if_score"],
+            "properties": {
+              "id": {
+                "type": "string"
+              },
+              "display_name": {
+                "type": "string"
+              },
+              "provider": {
+                "type": "string"
+              },
+              "if_score": {
+                "type": "number"
+              }
+            }
+          }
+        },
+        "agents": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "required": ["agent", "reasoning_effort", "scores"],
+            "properties": {
+              "agent": {
+                "type": "string"
+              },
+              "current_model": {
+                "type": "string"
+              },
+              "reasoning_effort": {
+                "type": "string",
+                "enum": ["L", "M", "H"]
+              },
+              "scores": {
+                "type": "object",
+                "additionalProperties": {
+                  "type": "number"
+                },
+                "description": "Model ID → compatibility score (0-100, IF-adjusted)"
+              }
+            }
+          }
+        },
+        "if_adjustment_formula": {
+          "type": "string",
+          "default": "score * (0.7 + 0.3 * IF/100)"
+        }
+      }
+    },
+    "closed_source_comparison": {
+      "type": "object",
+      "description": "APAW pipeline models vs top closed-source models",
+      "properties": {
+        "benchmarks": {
+          "type": "array"
+        },
+        "models": {
+          "type": "array"
+        },
+        "apaw_best_per_benchmark": {
+          "type": "object"
+        },
+        "closed_best_per_benchmark": {
+          "type": "object"
+        }
+      }
+    },
+    "capability_index_patch": {
+      "type": "array",
+      "description": "Ready-to-apply patches to capability-index.yaml",
+      "items": {
+        "type": "object",
+        "required": ["agent", "set"],
+        "properties": {
+          "agent": {
+            "type": "string"
+          },
+          "set": {
+            "type": "object",
+            "additionalProperties": true
+          }
+        }
+      }
+    },
+    "summary": {
+      "type": "object",
+      "properties": {
+        "avg_quality_improvement": {
+          "type": "string"
+        },
+        "providers_used": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "key_models": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "total_recommendations": {
+          "type": "integer"
+        },
+        "applied_count": {
+          "type": "integer"
+        },
+        "pending_count": {
+          "type": "integer"
+        }
+      }
+    }
+  }
+}