fix: restore optimal v3 models + add fitness gate protection

- Restore all 30 agents to v3.html heatmap optimal models:
  * frontend-developer: qwen3-coder -> minimax-m2.5 (92★)
  * devops-engineer: nemotron-3-super -> kimi-k2.6:cloud (88★)
  * browser-automation: qwen3-coder -> kimi-k2.6:cloud (86★)
  * agent-architect: glm-5.1 -> kimi-k2.6:cloud (86★)
- Add Model Evolution Guard system:
  * agent-evolution/scripts/lib/fitness-gate.cjs
  * Rejects downgrades of more than 3 points or to a score below 75
  * Produces detailed diff report before any file modifications
  * Normalizes model IDs for lookup (v3.html uses ':' where the JSON uses '-')
- Update sync-benchmarks-from-yaml.cjs with fitness gate
- Update model-benchmarks.json with v3 optimal assignments
- Rebuild research-dashboard.html (104KB, 30 agents, 11 models)
- Add model-evolution-guard.md architecture documentation
- Add v3-optimal-models.json as source-of-truth reference

Fixes a regression introduced by commit 3badb25, in which models were
silently downgraded from the heatmap-optimal assignments to inferior ones.
This commit is contained in:
¨NW¨
2026-04-29 23:19:16 +01:00
parent d1516f4856
commit 9e48a4960e
14 changed files with 2850 additions and 2049 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,610 @@
{
"lead-developer": {
"model": "qwen3-coder:480b",
"c": 0,
"score": 92,
"best": 92,
"scores": [
92,
86,
82,
70,
68,
75,
88,
66,
80,
88,
90
]
},
"frontend-developer": {
"model": "minimax-m2.5",
"c": 1,
"score": 92,
"best": 92,
"scores": [
86,
92,
88,
62,
56,
64,
82,
60,
76,
88,
86
]
},
"backend-developer": {
"model": "qwen3-coder:480b",
"c": 0,
"score": 91,
"best": 91,
"scores": [
91,
84,
80,
68,
63,
72,
86,
62,
78,
87,
90
]
},
"go-developer": {
"model": "qwen3-coder:480b",
"c": 0,
"score": 85,
"best": 88,
"scores": [
85,
78,
74,
66,
58,
68,
88,
58,
74,
82,
86
]
},
"flutter-developer": {
"model": "qwen3-coder:480b",
"c": 0,
"score": 86,
"best": 86,
"scores": [
86,
70,
66,
60,
53,
62,
78,
58,
74,
82,
84
]
},
"php-developer": {
"model": "qwen3-coder:480b",
"c": 0,
"score": 87,
"best": 87,
"scores": [
87,
76,
72,
64,
56,
66,
74,
60,
76,
84,
86
]
},
"python-developer": {
"model": "qwen3-coder:480b",
"c": 0,
"score": 90,
"best": 90,
"scores": [
90,
82,
78,
66,
60,
70,
78,
64,
78,
88,
88
]
},
"sdet-engineer": {
"model": "qwen3-coder:480b",
"c": 0,
"score": 88,
"best": 88,
"scores": [
88,
84,
80,
70,
63,
72,
84,
64,
78,
84,
87
]
},
"orchestrator": {
"model": "kimi-k2.6",
"c": 10,
"score": 92,
"best": 92,
"scores": [
74,
70,
68,
80,
82,
90,
86,
78,
62,
84,
92
]
},
"evaluator": {
"model": "glm-5.1",
"c": 5,
"score": 86,
"best": 86,
"scores": [
70,
73,
70,
78,
78,
86,
84,
76,
58,
81,
84
]
},
"capability-analyst": {
"model": "glm-5.1",
"c": 5,
"score": 85,
"best": 85,
"scores": [
72,
68,
66,
76,
78,
85,
82,
75,
60,
79,
82
]
},
"architect-indexer": {
"model": "glm-5.1",
"c": 5,
"score": 88,
"best": 88,
"scores": [
70,
64,
62,
74,
80,
88,
78,
76,
58,
80,
84
]
},
"pipeline-judge": {
"model": "glm-5.1",
"c": 5,
"score": 86,
"best": 86,
"scores": [
64,
68,
65,
78,
76,
86,
82,
74,
56,
80,
84
]
},
"release-manager": {
"model": "glm-5.1",
"c": 5,
"score": 82,
"best": 82,
"scores": [
72,
66,
64,
74,
76,
82,
78,
72,
60,
76,
78
]
},
"requirement-refiner": {
"model": "glm-5.1",
"c": 5,
"score": 88,
"best": 88,
"scores": [
66,
62,
60,
72,
80,
88,
82,
74,
54,
78,
82
]
},
"workflow-architect": {
"model": "glm-5.1",
"c": 5,
"score": 84,
"best": 84,
"scores": [
68,
62,
60,
76,
76,
84,
80,
72,
56,
80,
82
]
},
"agent-architect": {
"model": "kimi-k2.6",
"c": 10,
"score": 86,
"best": 86,
"scores": [
78,
72,
70,
78,
76,
84,
82,
76,
66,
82,
86
]
},
"security-auditor": {
"model": "nemotron-3-super",
"c": 3,
"score": 76,
"best": 80,
"scores": [
76,
74,
68,
76,
68,
78,
80,
72,
64,
75,
80
]
},
"performance-engineer": {
"model": "nemotron-3-super",
"c": 3,
"score": 78,
"best": 84,
"scores": [
78,
75,
70,
78,
74,
82,
84,
70,
67,
76,
82
]
},
"history-miner": {
"model": "nemotron-3-super",
"c": 3,
"score": 85,
"best": 88,
"scores": [
68,
60,
56,
85,
78,
88,
86,
72,
56,
84,
82
]
},
"memory-manager": {
"model": "nemotron-3-super",
"c": 3,
"score": 86,
"best": 87,
"scores": [
63,
58,
56,
86,
72,
84,
86,
70,
50,
87,
84
]
},
"planner": {
"model": "nemotron-3-super",
"c": 3,
"score": 80,
"best": 88,
"scores": [
72,
68,
66,
80,
78,
85,
88,
78,
60,
85,
86
]
},
"reflector": {
"model": "nemotron-3-super",
"c": 3,
"score": 78,
"best": 84,
"scores": [
68,
66,
64,
78,
76,
82,
84,
76,
56,
82,
80
]
},
"browser-automation": {
"model": "kimi-k2.6",
"c": 10,
"score": 86,
"best": 87,
"scores": [
87,
72,
68,
61,
53,
64,
82,
56,
72,
82,
86
]
},
"product-owner": {
"model": "glm-5.1",
"c": 5,
"score": 84,
"best": 84,
"scores": [
60,
56,
54,
74,
78,
84,
76,
74,
48,
78,
76
]
},
"visual-tester": {
"model": "qwen3-coder:480b",
"c": 0,
"score": 82,
"best": 82,
"scores": [
82,
68,
64,
55,
48,
58,
76,
54,
66,
76,
78
]
},
"prompt-optimizer": {
"model": "glm-5.1",
"c": 5,
"score": 82,
"best": 83,
"scores": [
76,
74,
72,
76,
75,
82,
80,
74,
64,
83,
82
]
},
"system-analyst": {
"model": "glm-5.1",
"c": 5,
"score": 90,
"best": 90,
"scores": [
70,
66,
63,
74,
82,
90,
88,
76,
58,
80,
86
]
},
"code-skeptic": {
"model": "minimax-m2.5",
"c": 1,
"score": 85,
"best": 85,
"scores": [
82,
85,
80,
73,
72,
78,
82,
70,
72,
80,
82
]
},
"the-fixer": {
"model": "minimax-m2.5",
"c": 1,
"score": 88,
"best": 90,
"scores": [
89,
88,
84,
71,
64,
74,
88,
64,
82,
86,
90
]
},
"devops-engineer": {
"model": "kimi-k2.6",
"c": 10,
"score": 88,
"best": 88,
"scores": [
66,
53,
48,
78,
75,
84,
86,
70,
54,
76,
88
]
},
"[built-in] debug": {
"model": "glm-5.1",
"c": 5,
"score": 88,
"best": 90,
"scores": [
78,
80,
76,
72,
64,
88,
90,
68,
76,
85,
90
]
}
}