- Restore all 30 agents to their heatmap-optimal models from v3.html:
  * frontend-developer: qwen3-coder -> minimax-m2.5 (92★)
  * devops-engineer: nemotron-3-super -> kimi-k2.6:cloud (88★)
  * browser-automation: qwen3-coder -> kimi-k2.6:cloud (86★)
  * agent-architect: glm-5.1 -> kimi-k2.6:cloud (86★)
- Add Model Evolution Guard system:
  * agent-evolution/scripts/lib/fitness-gate.cjs
  * Rejects a model change that lowers an agent's score by more than 3 points or leaves it below a score of 75
  * Produces a detailed diff report before any files are modified
  * Normalizes model IDs before lookup (v3.html uses ':' where the JSON uses '-')
- Update sync-benchmarks-from-yaml.cjs with fitness gate
- Update model-benchmarks.json with v3 optimal assignments
- Rebuild research-dashboard.html (104KB, 30 agents, 11 models)
- Add model-evolution-guard.md architecture documentation
- Add v3-optimal-models.json as source-of-truth reference

Fixes the regression introduced by commit 3badb25, in which models were
silently downgraded from their heatmap-optimal assignments to inferior ones.
610 lines · 7.9 KiB · JSON
{
|
|
"lead-developer": {
|
|
"model": "qwen3-coder:480b",
|
|
"c": 0,
|
|
"score": 92,
|
|
"best": 92,
|
|
"scores": [
|
|
92,
|
|
86,
|
|
82,
|
|
70,
|
|
68,
|
|
75,
|
|
88,
|
|
66,
|
|
80,
|
|
88,
|
|
90
|
|
]
|
|
},
|
|
"frontend-developer": {
|
|
"model": "minimax-m2.5",
|
|
"c": 1,
|
|
"score": 92,
|
|
"best": 92,
|
|
"scores": [
|
|
86,
|
|
92,
|
|
88,
|
|
62,
|
|
56,
|
|
64,
|
|
82,
|
|
60,
|
|
76,
|
|
88,
|
|
86
|
|
]
|
|
},
|
|
"backend-developer": {
|
|
"model": "qwen3-coder:480b",
|
|
"c": 0,
|
|
"score": 91,
|
|
"best": 91,
|
|
"scores": [
|
|
91,
|
|
84,
|
|
80,
|
|
68,
|
|
63,
|
|
72,
|
|
86,
|
|
62,
|
|
78,
|
|
87,
|
|
90
|
|
]
|
|
},
|
|
"go-developer": {
|
|
"model": "qwen3-coder:480b",
|
|
"c": 0,
|
|
"score": 85,
|
|
"best": 88,
|
|
"scores": [
|
|
85,
|
|
78,
|
|
74,
|
|
66,
|
|
58,
|
|
68,
|
|
88,
|
|
58,
|
|
74,
|
|
82,
|
|
86
|
|
]
|
|
},
|
|
"flutter-developer": {
|
|
"model": "qwen3-coder:480b",
|
|
"c": 0,
|
|
"score": 86,
|
|
"best": 86,
|
|
"scores": [
|
|
86,
|
|
70,
|
|
66,
|
|
60,
|
|
53,
|
|
62,
|
|
78,
|
|
58,
|
|
74,
|
|
82,
|
|
84
|
|
]
|
|
},
|
|
"php-developer": {
|
|
"model": "qwen3-coder:480b",
|
|
"c": 0,
|
|
"score": 87,
|
|
"best": 87,
|
|
"scores": [
|
|
87,
|
|
76,
|
|
72,
|
|
64,
|
|
56,
|
|
66,
|
|
74,
|
|
60,
|
|
76,
|
|
84,
|
|
86
|
|
]
|
|
},
|
|
"python-developer": {
|
|
"model": "qwen3-coder:480b",
|
|
"c": 0,
|
|
"score": 90,
|
|
"best": 90,
|
|
"scores": [
|
|
90,
|
|
82,
|
|
78,
|
|
66,
|
|
60,
|
|
70,
|
|
78,
|
|
64,
|
|
78,
|
|
88,
|
|
88
|
|
]
|
|
},
|
|
"sdet-engineer": {
|
|
"model": "qwen3-coder:480b",
|
|
"c": 0,
|
|
"score": 88,
|
|
"best": 88,
|
|
"scores": [
|
|
88,
|
|
84,
|
|
80,
|
|
70,
|
|
63,
|
|
72,
|
|
84,
|
|
64,
|
|
78,
|
|
84,
|
|
87
|
|
]
|
|
},
|
|
"orchestrator": {
|
|
"model": "kimi-k2.6",
|
|
"c": 10,
|
|
"score": 92,
|
|
"best": 92,
|
|
"scores": [
|
|
74,
|
|
70,
|
|
68,
|
|
80,
|
|
82,
|
|
90,
|
|
86,
|
|
78,
|
|
62,
|
|
84,
|
|
92
|
|
]
|
|
},
|
|
"evaluator": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 86,
|
|
"best": 86,
|
|
"scores": [
|
|
70,
|
|
73,
|
|
70,
|
|
78,
|
|
78,
|
|
86,
|
|
84,
|
|
76,
|
|
58,
|
|
81,
|
|
84
|
|
]
|
|
},
|
|
"capability-analyst": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 85,
|
|
"best": 85,
|
|
"scores": [
|
|
72,
|
|
68,
|
|
66,
|
|
76,
|
|
78,
|
|
85,
|
|
82,
|
|
75,
|
|
60,
|
|
79,
|
|
82
|
|
]
|
|
},
|
|
"architect-indexer": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 88,
|
|
"best": 88,
|
|
"scores": [
|
|
70,
|
|
64,
|
|
62,
|
|
74,
|
|
80,
|
|
88,
|
|
78,
|
|
76,
|
|
58,
|
|
80,
|
|
84
|
|
]
|
|
},
|
|
"pipeline-judge": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 86,
|
|
"best": 86,
|
|
"scores": [
|
|
64,
|
|
68,
|
|
65,
|
|
78,
|
|
76,
|
|
86,
|
|
82,
|
|
74,
|
|
56,
|
|
80,
|
|
84
|
|
]
|
|
},
|
|
"release-manager": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 82,
|
|
"best": 82,
|
|
"scores": [
|
|
72,
|
|
66,
|
|
64,
|
|
74,
|
|
76,
|
|
82,
|
|
78,
|
|
72,
|
|
60,
|
|
76,
|
|
78
|
|
]
|
|
},
|
|
"requirement-refiner": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 88,
|
|
"best": 88,
|
|
"scores": [
|
|
66,
|
|
62,
|
|
60,
|
|
72,
|
|
80,
|
|
88,
|
|
82,
|
|
74,
|
|
54,
|
|
78,
|
|
82
|
|
]
|
|
},
|
|
"workflow-architect": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 84,
|
|
"best": 84,
|
|
"scores": [
|
|
68,
|
|
62,
|
|
60,
|
|
76,
|
|
76,
|
|
84,
|
|
80,
|
|
72,
|
|
56,
|
|
80,
|
|
82
|
|
]
|
|
},
|
|
"agent-architect": {
|
|
"model": "kimi-k2.6",
|
|
"c": 10,
|
|
"score": 86,
|
|
"best": 86,
|
|
"scores": [
|
|
78,
|
|
72,
|
|
70,
|
|
78,
|
|
76,
|
|
84,
|
|
82,
|
|
76,
|
|
66,
|
|
82,
|
|
86
|
|
]
|
|
},
|
|
"security-auditor": {
|
|
"model": "nemotron-3-super",
|
|
"c": 3,
|
|
"score": 76,
|
|
"best": 80,
|
|
"scores": [
|
|
76,
|
|
74,
|
|
68,
|
|
76,
|
|
68,
|
|
78,
|
|
80,
|
|
72,
|
|
64,
|
|
75,
|
|
80
|
|
]
|
|
},
|
|
"performance-engineer": {
|
|
"model": "nemotron-3-super",
|
|
"c": 3,
|
|
"score": 78,
|
|
"best": 84,
|
|
"scores": [
|
|
78,
|
|
75,
|
|
70,
|
|
78,
|
|
74,
|
|
82,
|
|
84,
|
|
70,
|
|
67,
|
|
76,
|
|
82
|
|
]
|
|
},
|
|
"history-miner": {
|
|
"model": "nemotron-3-super",
|
|
"c": 3,
|
|
"score": 85,
|
|
"best": 88,
|
|
"scores": [
|
|
68,
|
|
60,
|
|
56,
|
|
85,
|
|
78,
|
|
88,
|
|
86,
|
|
72,
|
|
56,
|
|
84,
|
|
82
|
|
]
|
|
},
|
|
"memory-manager": {
|
|
"model": "nemotron-3-super",
|
|
"c": 3,
|
|
"score": 86,
|
|
"best": 87,
|
|
"scores": [
|
|
63,
|
|
58,
|
|
56,
|
|
86,
|
|
72,
|
|
84,
|
|
86,
|
|
70,
|
|
50,
|
|
87,
|
|
84
|
|
]
|
|
},
|
|
"planner": {
|
|
"model": "nemotron-3-super",
|
|
"c": 3,
|
|
"score": 80,
|
|
"best": 88,
|
|
"scores": [
|
|
72,
|
|
68,
|
|
66,
|
|
80,
|
|
78,
|
|
85,
|
|
88,
|
|
78,
|
|
60,
|
|
85,
|
|
86
|
|
]
|
|
},
|
|
"reflector": {
|
|
"model": "nemotron-3-super",
|
|
"c": 3,
|
|
"score": 78,
|
|
"best": 84,
|
|
"scores": [
|
|
68,
|
|
66,
|
|
64,
|
|
78,
|
|
76,
|
|
82,
|
|
84,
|
|
76,
|
|
56,
|
|
82,
|
|
80
|
|
]
|
|
},
|
|
"browser-automation": {
|
|
"model": "kimi-k2.6",
|
|
"c": 10,
|
|
"score": 86,
|
|
"best": 87,
|
|
"scores": [
|
|
87,
|
|
72,
|
|
68,
|
|
61,
|
|
53,
|
|
64,
|
|
82,
|
|
56,
|
|
72,
|
|
82,
|
|
86
|
|
]
|
|
},
|
|
"product-owner": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 84,
|
|
"best": 84,
|
|
"scores": [
|
|
60,
|
|
56,
|
|
54,
|
|
74,
|
|
78,
|
|
84,
|
|
76,
|
|
74,
|
|
48,
|
|
78,
|
|
76
|
|
]
|
|
},
|
|
"visual-tester": {
|
|
"model": "qwen3-coder:480b",
|
|
"c": 0,
|
|
"score": 82,
|
|
"best": 82,
|
|
"scores": [
|
|
82,
|
|
68,
|
|
64,
|
|
55,
|
|
48,
|
|
58,
|
|
76,
|
|
54,
|
|
66,
|
|
76,
|
|
78
|
|
]
|
|
},
|
|
"prompt-optimizer": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 82,
|
|
"best": 83,
|
|
"scores": [
|
|
76,
|
|
74,
|
|
72,
|
|
76,
|
|
75,
|
|
82,
|
|
80,
|
|
74,
|
|
64,
|
|
83,
|
|
82
|
|
]
|
|
},
|
|
"system-analyst": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 90,
|
|
"best": 90,
|
|
"scores": [
|
|
70,
|
|
66,
|
|
63,
|
|
74,
|
|
82,
|
|
90,
|
|
88,
|
|
76,
|
|
58,
|
|
80,
|
|
86
|
|
]
|
|
},
|
|
"code-skeptic": {
|
|
"model": "minimax-m2.5",
|
|
"c": 1,
|
|
"score": 85,
|
|
"best": 85,
|
|
"scores": [
|
|
82,
|
|
85,
|
|
80,
|
|
73,
|
|
72,
|
|
78,
|
|
82,
|
|
70,
|
|
72,
|
|
80,
|
|
82
|
|
]
|
|
},
|
|
"the-fixer": {
|
|
"model": "minimax-m2.5",
|
|
"c": 1,
|
|
"score": 88,
|
|
"best": 90,
|
|
"scores": [
|
|
89,
|
|
88,
|
|
84,
|
|
71,
|
|
64,
|
|
74,
|
|
88,
|
|
64,
|
|
82,
|
|
86,
|
|
90
|
|
]
|
|
},
|
|
"devops-engineer": {
|
|
"model": "kimi-k2.6",
|
|
"c": 10,
|
|
"score": 88,
|
|
"best": 88,
|
|
"scores": [
|
|
66,
|
|
53,
|
|
48,
|
|
78,
|
|
75,
|
|
84,
|
|
86,
|
|
70,
|
|
54,
|
|
76,
|
|
88
|
|
]
|
|
},
|
|
"[built-in] debug": {
|
|
"model": "glm-5.1",
|
|
"c": 5,
|
|
"score": 88,
|
|
"best": 90,
|
|
"scores": [
|
|
78,
|
|
80,
|
|
76,
|
|
72,
|
|
64,
|
|
88,
|
|
90,
|
|
68,
|
|
76,
|
|
85,
|
|
90
|
|
]
|
|
}
|
|
} |