fix: restore optimal v3 models + add fitness gate protection

- Restore all 30 agents to the optimal models from the v3.html heatmap:
  * frontend-developer: qwen3-coder -> minimax-m2.5 (92★)
  * devops-engineer: nemotron-3-super -> kimi-k2.6:cloud (88★)
  * browser-automation: qwen3-coder -> kimi-k2.6:cloud (86★)
  * agent-architect: glm-5.1 -> kimi-k2.6:cloud (86★)
- Add Model Evolution Guard system:
  * agent-evolution/scripts/lib/fitness-gate.cjs
  * Rejects downgrades of more than 3 points or below a score of 75
  * Produces detailed diff report before any file modifications
  * Normalizes model IDs for lookup (v3.html uses ':' where JSON uses '-')
- Update sync-benchmarks-from-yaml.cjs with fitness gate
- Update model-benchmarks.json with v3 optimal assignments
- Rebuild research-dashboard.html (104KB, 30 agents, 11 models)
- Add model-evolution-guard.md architecture documentation
- Add v3-optimal-models.json as source-of-truth reference

Fixes the regression introduced by commit 3badb25, in which models were
silently downgraded from their heatmap-optimal assignments to inferior ones.
This commit is contained in:
¨NW¨
2026-04-29 23:19:16 +01:00
parent d1516f4856
commit 9e48a4960e
14 changed files with 2850 additions and 2049 deletions

View File

@@ -255,7 +255,7 @@
<div class="container">
<div class="header">
<h1>APAW Agent Model Research v2</h1>
<div class="sub">Live dashboard • 15 models × 32 agents • 2026-04-29</div>
<div class="sub">Live dashboard • 15 models × 30 agents • 2026-04-29</div>
</div>
<div class="tabs" id="tabBar">
@@ -419,12 +419,12 @@
<script>
// BENCHMARK_DATA_PLACEHOLDER - REPLACED BY BUILD SCRIPT
// Generated from model-benchmarks.json on 2026-04-29T19:58:05.244Z
// Generated from model-benchmarks.json on 2026-04-29T22:15:07.925Z
const EMBEDDED_DATA = {
"version": "1.0.0",
"generated": "2026-04-29T19:56:51.418Z",
"source": ".kilo/capability-index.yaml (synced v2)",
"total_agents": 32,
"generated": "2026-04-29T21:47:05.339Z",
"source": ".kilo/capability-index.yaml (synced v3 + fitness-gate)",
"total_agents": 30,
"total_models_tracked": 11,
"providers": [
"ollama",
@@ -800,8 +800,8 @@ const EMBEDDED_DATA = {
"agent_model_scores": [
{
"agent": "lead-developer",
"current_model_index": 6,
"current_model_id": "nemotron-3-super",
"current_model_index": 0,
"current_model_id": "qwen3-coder-480b",
"reasoning_effort": "H",
"scores": {
"qwen3-coder-480b": 92,
@@ -818,8 +818,8 @@ const EMBEDDED_DATA = {
},
{
"agent": "frontend-developer",
"current_model_index": -1,
"current_model_id": "qwen3-coder:480b",
"current_model_index": 1,
"current_model_id": "minimax-m2.5",
"reasoning_effort": "M",
"scores": {
"qwen3-coder-480b": 86,
@@ -836,8 +836,8 @@ const EMBEDDED_DATA = {
},
{
"agent": "php-developer",
"current_model_index": -1,
"current_model_id": "qwen3-coder:480b",
"current_model_index": 0,
"current_model_id": "qwen3-coder-480b",
"reasoning_effort": "H",
"scores": {
"qwen3-coder-480b": 87,
@@ -854,8 +854,8 @@ const EMBEDDED_DATA = {
},
{
"agent": "python-developer",
"current_model_index": -1,
"current_model_id": "qwen3-coder:480b",
"current_model_index": 0,
"current_model_id": "qwen3-coder-480b",
"reasoning_effort": "H",
"scores": {
"qwen3-coder-480b": 90,
@@ -872,8 +872,8 @@ const EMBEDDED_DATA = {
},
{
"agent": "backend-developer",
"current_model_index": -1,
"current_model_id": "qwen3-coder:480b",
"current_model_index": 0,
"current_model_id": "qwen3-coder-480b",
"reasoning_effort": "M",
"scores": {
"qwen3-coder-480b": 91,
@@ -890,8 +890,8 @@ const EMBEDDED_DATA = {
},
{
"agent": "go-developer",
"current_model_index": -1,
"current_model_id": "qwen3-coder:480b",
"current_model_index": 0,
"current_model_id": "qwen3-coder-480b",
"reasoning_effort": "M",
"scores": {
"qwen3-coder-480b": 85,
@@ -908,8 +908,8 @@ const EMBEDDED_DATA = {
},
{
"agent": "flutter-developer",
"current_model_index": -1,
"current_model_id": "qwen3-coder:480b",
"current_model_index": 0,
"current_model_id": "qwen3-coder-480b",
"reasoning_effort": "M",
"scores": {
"qwen3-coder-480b": 86,
@@ -926,8 +926,8 @@ const EMBEDDED_DATA = {
},
{
"agent": "devops-engineer",
"current_model_index": 6,
"current_model_id": "nemotron-3-super",
"current_model_index": -1,
"current_model_id": "kimi-k2.6",
"reasoning_effort": "M",
"scores": {
"qwen3-coder-480b": 66,
@@ -944,8 +944,8 @@ const EMBEDDED_DATA = {
},
{
"agent": "sdet-engineer",
"current_model_index": -1,
"current_model_id": "qwen3-coder:480b",
"current_model_index": 0,
"current_model_id": "qwen3-coder-480b",
"reasoning_effort": "H",
"scores": {
"qwen3-coder-480b": 88,
@@ -1035,7 +1035,7 @@ const EMBEDDED_DATA = {
{
"agent": "browser-automation",
"current_model_index": -1,
"current_model_id": "qwen3-coder:480b",
"current_model_id": "kimi-k2.6",
"reasoning_effort": "M",
"scores": {
"qwen3-coder-480b": 87,
@@ -1052,8 +1052,8 @@ const EMBEDDED_DATA = {
},
{
"agent": "visual-tester",
"current_model_index": -1,
"current_model_id": "qwen3-coder:480b",
"current_model_index": 0,
"current_model_id": "qwen3-coder-480b",
"reasoning_effort": "M",
"scores": {
"qwen3-coder-480b": 82,
@@ -1070,9 +1070,9 @@ const EMBEDDED_DATA = {
},
{
"agent": "system-analyst",
"current_model_index": 6,
"current_model_id": "nemotron-3-super",
"reasoning_effort": "H",
"current_model_index": 7,
"current_model_id": "glm-5.1",
"reasoning_effort": "M",
"scores": {
"qwen3-coder-480b": 70,
"minimax-m2.5": 66,
@@ -1086,42 +1086,6 @@ const EMBEDDED_DATA = {
"kimi-k2-6": 86
}
},
{
"agent": "requirement-refiner",
"current_model_index": 7,
"current_model_id": "glm-5.1",
"reasoning_effort": "H",
"scores": {
"qwen3-coder-480b": 66,
"minimax-m2.5": 62,
"minimax-m2.7": 60,
"nemotron-3-super": 72,
"glm-5.1": 80,
"deepseek-v4-pro-max": 82,
"qwen3-5-122b": 74,
"qwen3-coder-next": 54,
"qwen3-6-plus": 78,
"kimi-k2-6": 82
}
},
{
"agent": "history-miner",
"current_model_index": 6,
"current_model_id": "nemotron-3-super",
"reasoning_effort": "M",
"scores": {
"qwen3-coder-480b": 68,
"minimax-m2.5": 60,
"minimax-m2.7": 56,
"nemotron-3-super": 85,
"glm-5.1": 78,
"deepseek-v4-pro-max": 86,
"qwen3-5-122b": 72,
"qwen3-coder-next": 56,
"qwen3-6-plus": 84,
"kimi-k2-6": 82
}
},
{
"agent": "capability-analyst",
"current_model_index": 7,
@@ -1143,7 +1107,7 @@ const EMBEDDED_DATA = {
{
"agent": "orchestrator",
"current_model_index": -1,
"current_model_id": "kimi-k2.6:cloud",
"current_model_id": "kimi-k2.6",
"reasoning_effort": "H",
"scores": {
"qwen3-coder-480b": 74,
@@ -1286,8 +1250,8 @@ const EMBEDDED_DATA = {
},
{
"agent": "agent-architect",
"current_model_index": 7,
"current_model_id": "glm-5.1",
"current_model_index": -1,
"current_model_id": "kimi-k2.6",
"reasoning_effort": "H",
"scores": {
"qwen3-coder-480b": 78,
@@ -1391,17 +1355,17 @@ const EMBEDDED_DATA = {
"agent_current_config": [
{
"agent": "lead-developer",
"model": "ollama-cloud/nemotron-3-super",
"model": "ollama-cloud/qwen3-coder:480b",
"provider": "Ollama Cloud",
"category": "Process",
"badge_type": "nemotron",
"badge_type": "qwen",
"fit_score": 0,
"status": "good",
"previous_model": null
},
{
"agent": "frontend-developer",
"model": "ollama-cloud/qwen3-coder:480b",
"model": "ollama-cloud/minimax-m2.5",
"provider": "Ollama Cloud",
"category": "Process",
"badge_type": "qwen",
@@ -1461,7 +1425,7 @@ const EMBEDDED_DATA = {
},
{
"agent": "devops-engineer",
"model": "ollama-cloud/nemotron-3-super",
"model": "ollama-cloud/kimi-k2.6",
"provider": "Ollama Cloud",
"category": "Process",
"badge_type": "nemotron",
@@ -1521,7 +1485,7 @@ const EMBEDDED_DATA = {
},
{
"agent": "browser-automation",
"model": "ollama-cloud/qwen3-coder:480b",
"model": "ollama-cloud/kimi-k2.6",
"provider": "Ollama Cloud",
"category": "Process",
"badge_type": "qwen",
@@ -1541,16 +1505,6 @@ const EMBEDDED_DATA = {
},
{
"agent": "system-analyst",
"model": "ollama-cloud/nemotron-3-super",
"provider": "Ollama Cloud",
"category": "Process",
"badge_type": "nemotron",
"fit_score": 0,
"status": "good",
"previous_model": null
},
{
"agent": "requirement-refiner",
"model": "ollama-cloud/glm-5.1",
"provider": "Ollama Cloud",
"category": "Process",
@@ -1559,16 +1513,6 @@ const EMBEDDED_DATA = {
"status": "good",
"previous_model": null
},
{
"agent": "history-miner",
"model": "ollama-cloud/nemotron-3-super",
"provider": "Ollama Cloud",
"category": "Process",
"badge_type": "nemotron",
"fit_score": 0,
"status": "good",
"previous_model": null
},
{
"agent": "capability-analyst",
"model": "ollama-cloud/glm-5.1",
@@ -1581,7 +1525,7 @@ const EMBEDDED_DATA = {
},
{
"agent": "orchestrator",
"model": "ollama-cloud/kimi-k2.6:cloud",
"model": "ollama-cloud/kimi-k2.6",
"provider": "Ollama Cloud",
"category": "Process",
"badge_type": "kimi",
@@ -1661,7 +1605,7 @@ const EMBEDDED_DATA = {
},
{
"agent": "agent-architect",
"model": "ollama-cloud/glm-5.1",
"model": "ollama-cloud/kimi-k2.6",
"provider": "Ollama Cloud",
"category": "Process",
"badge_type": "glm",