fix: restore optimal v3 models + add fitness gate protection

- Restore all 30 agents to v3.html heatmap optimal models:
  * frontend-developer: qwen3-coder -> minimax-m2.5 (92★)
  * devops-engineer: nemotron-3-super -> kimi-k2.6:cloud (88★)
  * browser-automation: qwen3-coder -> kimi-k2.6:cloud (86★)
  * agent-architect: glm-5.1 -> kimi-k2.6:cloud (86★)
- Add Model Evolution Guard system:
  * agent-evolution/scripts/lib/fitness-gate.cjs
  * Rejects downgrades >3 points or below score 75
  * Produces detailed diff report before any file modifications
  * Normalized model ID lookup (v3.html ':' vs JSON '-')
- Update sync-benchmarks-from-yaml.cjs with fitness gate
- Update model-benchmarks.json with v3 optimal assignments
- Rebuild research-dashboard.html (104KB, 30 agents, 11 models)
- Add model-evolution-guard.md architecture documentation
- Add v3-optimal-models.json as source-of-truth reference

Fixes regression introduced by commit 3badb25 where models were
silently downgraded from heatmap optimal to inferior assignments.
This commit is contained in:
¨NW¨
2026-04-29 23:19:16 +01:00
parent d1516f4856
commit 9e48a4960e
14 changed files with 2850 additions and 2049 deletions

View File

@@ -0,0 +1,171 @@
/**
* Model Evolution Fitness Gate
*
* Validates any model assignment change against heatmap-derived scores.
* Rejects changes that would downgrade agents beyond the regression threshold.
*
* Usage:
* const { FitnessGate, runGate } = require('./fitness-gate');
* runGate(require('../../data/model-benchmarks.json'));
*/
const fs = require('fs');
const path = require('path');
// Default location of the benchmark data, resolved relative to this script
// (agent-evolution/scripts/lib -> agent-evolution/data).
const BENCHMARKS_PATH = path.join(__dirname, '../../data/model-benchmarks.json');
// Gate thresholds: a proposed assignment must score at least DEFAULT_MIN_SCORE,
// and may not drop more than DEFAULT_MAX_REGRESSION points below the current one.
const DEFAULT_MIN_SCORE = 75;
const DEFAULT_MAX_REGRESSION = 3;
/**
 * Model Evolution Fitness Gate.
 *
 * Indexes heatmap-derived benchmark scores and validates proposed
 * agent/model assignment changes against two thresholds:
 *   - minScore:      a new assignment must score at least this value
 *   - maxRegression: a new assignment may not drop more than this many
 *                    points below the current assignment's score
 */
class FitnessGate {
  /**
   * @param {object} benchmarks - parsed model-benchmarks.json (expects
   *   `agent_model_scores` and `models` arrays).
   * @param {object} [options]
   * @param {number} [options.minScore] - global score floor (default 75).
   * @param {number} [options.maxRegression] - max allowed point drop (default 3).
   */
  constructor(benchmarks, options = {}) {
    this.benchmarks = benchmarks;
    this.agents = this._buildAgentIndex(benchmarks);
    this.models = this._buildModelIndex(benchmarks);
    this.minScore = options.minScore ?? DEFAULT_MIN_SCORE;
    this.maxRegression = options.maxRegression ?? DEFAULT_MAX_REGRESSION;
  }

  /** Maps agent name -> agent score record. */
  _buildAgentIndex(data) {
    const map = {};
    (data.agent_model_scores || []).forEach(a => {
      map[a.agent] = a;
    });
    return map;
  }

  /** Maps model id -> model record (original array position kept as `idx`). */
  _buildModelIndex(data) {
    const map = {};
    (data.models || []).forEach((m, i) => {
      map[m.id] = { ...m, idx: i };
    });
    return map;
  }

  /**
   * Looks up the benchmark score for an agent/model pair.
   *
   * Model ids differ between sources: v3.html uses ":" separators
   * (e.g. "kimi-k2.6:cloud") while the JSON score keys use "-", and the
   * JSON may store the bare name ("kimi-k2.6") without any cloud suffix.
   * Several candidate keys are tried in order.
   * (The previous `.replace(/--cloud$/, '-2.6')` step was dead code: a
   * colon->dash conversion produces a single dash, so "--cloud" could
   * never match — and the bare-name fallback it was meant to provide was
   * missing. Both are fixed here.)
   *
   * @param {string} agentName
   * @param {string} modelId
   * @returns {number|null} score, or null when agent or model is unknown.
   */
  getScore(agentName, modelId) {
    const agent = this.agents[agentName];
    if (!agent || !modelId) return null;
    const dashed = modelId.replace(/:/g, '-');
    const tryKeys = [
      dashed,                        // colon -> dash form ("kimi-k2.6-cloud")
      modelId,                       // raw id exactly as given
      modelId + '-cloud',            // bare name, JSON keyed with suffix
      dashed.replace(/-cloud$/, ''), // suffixed id, JSON keyed bare
    ];
    for (const key of tryKeys) {
      if (agent.scores?.[key] !== undefined) return agent.scores[key];
    }
    return null;
  }

  /**
   * Validates a single proposed model change for one agent.
   *
   * Rejects when either score is unknown, when the new score is below the
   * global minimum, or when the drop exceeds the max allowed regression.
   *
   * @param {string} agentName
   * @param {string} fromModel - current model id.
   * @param {string} toModel - proposed model id.
   * @returns {{acceptable: boolean, reason?: string, oldScore?: number,
   *   newScore?: number, delta?: number, status?: string}}
   */
  validateChange(agentName, fromModel, toModel) {
    const agent = this.agents[agentName];
    if (!agent) return { acceptable: false, reason: `Agent "${agentName}" not found in benchmarks` };
    const oldScore = this.getScore(agentName, fromModel);
    const newScore = this.getScore(agentName, toModel);
    if (oldScore === null) {
      return { acceptable: false, reason: `No score for "${fromModel}" on agent "${agentName}"` };
    }
    if (newScore === null) {
      return { acceptable: false, reason: `No score for "${toModel}" on agent "${agentName}"` };
    }
    if (newScore < this.minScore) {
      return {
        acceptable: false,
        reason: `Score ${newScore} below global minimum ${this.minScore}`,
        oldScore, newScore, delta: newScore - oldScore
      };
    }
    if (newScore < oldScore - this.maxRegression) {
      return {
        acceptable: false,
        reason: `Regression ${oldScore} -> ${newScore} (delta ${newScore - oldScore}) exceeds max allowed regression of ${this.maxRegression}`,
        oldScore, newScore, delta: newScore - oldScore
      };
    }
    return {
      acceptable: true,
      oldScore, newScore, delta: newScore - oldScore,
      status: newScore > oldScore ? 'upgrade' : newScore === oldScore ? 'same' : 'minor_regression'
    };
  }

  /**
   * Validates a batch of changes.
   *
   * @param {Array<{agent: string, from: string, to: string}>} changes
   * @returns {{results: Array, rejections: Array, passed: boolean}}
   */
  validateAllChanges(changes) {
    const results = [];
    const rejections = [];
    for (const change of changes) {
      const result = this.validateChange(change.agent, change.from, change.to);
      const entry = { ...change, ...result };
      results.push(entry);
      // Push the merged entry (not the bare result) so rejections keep the
      // agent/from/to context — printDiff prints `r.agent` for each one,
      // which was previously always undefined.
      if (!result.acceptable) rejections.push(entry);
    }
    return { results, rejections, passed: rejections.length === 0 };
  }

  /**
   * Prints a human-readable diff table plus a summary line, and a rejection
   * list when the gate blocked anything. Console output only; no mutation.
   *
   * @param {{results: Array, rejections: Array, passed: boolean}} report
   */
  printDiff(report) {
    console.log('\n=== Model Change Diff Report ===');
    console.log(
      'Agent'.padEnd(25),
      'Old Model'.padEnd(25),
      'Old Score'.padEnd(10),
      'New Model'.padEnd(25),
      'New Score'.padEnd(10),
      'Status'
    );
    console.log('-'.repeat(115));
    for (const r of report.results) {
      const status = r.acceptable
        ? r.delta > 0 ? '✅ UPGRADE'
          : r.delta === 0 ? '  SAME'
          : `⚠️ MINOR (${r.delta})`
        : `⛔ REJECTED: ${r.reason}`;
      console.log(
        r.agent.padEnd(25),
        (r.from || '-').padEnd(25),
        (r.oldScore ?? '-').toString().padEnd(10),
        (r.to || '-').padEnd(25),
        (r.newScore ?? '-').toString().padEnd(10),
        status
      );
    }
    console.log('-'.repeat(115));
    const upgrades = report.results.filter(r => r.delta > 0).length;
    const downgrades = report.results.filter(r => r.delta < 0 && r.acceptable).length;
    const same = report.results.filter(r => r.delta === 0).length;
    const rejected = report.rejections.length;
    console.log(`Upgrades: ${upgrades} | Minor regressions: ${downgrades} | Same: ${same} | Rejected: ${rejected}`);
    if (rejected > 0) {
      console.log('\n⛔ REJECTIONS (sync blocked):');
      for (const r of report.rejections) {
        console.log(`  - ${r.agent}: ${r.reason}`);
      }
      console.log('\nNo files were modified. Fix the source data or adjust thresholds (not recommended).');
    }
  }
}
/**
 * Builds a FitnessGate from the benchmark JSON at the default path.
 *
 * @param {object} [options] - threshold overrides forwarded to FitnessGate.
 * @returns {FitnessGate}
 */
function loadGate(options = {}) {
  const raw = fs.readFileSync(BENCHMARKS_PATH, 'utf8');
  const benchmarks = JSON.parse(raw);
  return new FitnessGate(benchmarks, options);
}
/**
 * One-shot convenience: load the gate from the default benchmarks file,
 * validate every proposed change, and print the diff report to stdout.
 *
 * @param {Array<{agent: string, from: string, to: string}>} changes
 * @param {object} [options] - threshold overrides forwarded to FitnessGate.
 * @returns {{results: Array, rejections: Array, passed: boolean}}
 */
function runGate(changes, options = {}) {
  const fitnessGate = loadGate(options);
  const diffReport = fitnessGate.validateAllChanges(changes);
  fitnessGate.printDiff(diffReport);
  return diffReport;
}
module.exports = { FitnessGate, loadGate, runGate };