Files
APAW/agent-evolution/scripts/lib/fitness-gate.cjs
¨NW¨ 9e48a4960e fix: restore optimal v3 models + add fitness gate protection
- Restore all 30 agents to v3.html heatmap optimal models:
  * frontend-developer: qwen3-coder -> minimax-m2.5 (92★)
  * devops-engineer: nemotron-3-super -> kimi-k2.6:cloud (88★)
  * browser-automation: qwen3-coder -> kimi-k2.6:cloud (86★)
  * agent-architect: glm-5.1 -> kimi-k2.6:cloud (86★)
- Add Model Evolution Guard system:
  * agent-evolution/scripts/lib/fitness-gate.cjs
  * Rejects downgrades >3 points or below score 75
  * Produces detailed diff report before any file modifications
  * Normalized model ID lookup (v3.html ':' vs JSON '-')
- Update sync-benchmarks-from-yaml.cjs with fitness gate
- Update model-benchmarks.json with v3 optimal assignments
- Rebuild research-dashboard.html (104KB, 30 agents, 11 models)
- Add model-evolution-guard.md architecture documentation
- Add v3-optimal-models.json as source-of-truth reference

Fixes regression introduced by commit 3badb25 where models were
silently downgraded from heatmap optimal to inferior assignments.
2026-04-29 23:19:16 +01:00

172 lines
5.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Model Evolution Fitness Gate
*
* Validates any model assignment change against heatmap-derived scores.
* Rejects changes that would downgrade agents beyond the regression threshold.
*
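* Usage:
* const { FitnessGate, runGate } = require('./fitness-gate');
* // runGate loads data/model-benchmarks.json itself; pass it the proposed changes.
* const report = runGate([{ agent: 'frontend-developer', from: 'qwen3-coder', to: 'minimax-m2.5' }]);
* // report.passed is false when at least one change was rejected.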
*/
const fs = require('fs');
const path = require('path');
// Location of the benchmark JSON read by loadGate(), resolved relative to this script's directory.
const BENCHMARKS_PATH = path.join(__dirname, '../../data/model-benchmarks.json');
// Fallback for options.minScore: a change whose new score is below this value is rejected.
const DEFAULT_MIN_SCORE = 75;
// Fallback for options.maxRegression: a change is rejected when the new score is more than
// this many points below the old score.
const DEFAULT_MAX_REGRESSION = 3;
/**
 * Validates proposed agent -> model reassignments against per-agent benchmark scores.
 *
 * A change is rejected when either score cannot be found, when the new score is
 * below `minScore`, or when the new score is more than `maxRegression` points
 * below the old score.
 */
class FitnessGate {
  /**
   * @param {object} benchmarks - Parsed benchmark data. Reads `agent_model_scores`
   *   (entries with `agent` and `scores`) and `models` (entries with `id`); both
   *   lists are optional.
   * @param {object} [options]
   * @param {number} [options.minScore] - Lowest acceptable new score.
   * @param {number} [options.maxRegression] - Largest acceptable score drop.
   */
  constructor(benchmarks, options = {}) {
    this.benchmarks = benchmarks;
    this.agents = this._buildAgentIndex(benchmarks);
    this.models = this._buildModelIndex(benchmarks);
    this.minScore = options.minScore ?? DEFAULT_MIN_SCORE;
    this.maxRegression = options.maxRegression ?? DEFAULT_MAX_REGRESSION;
  }

  /**
   * Index `agent_model_scores` entries by their `agent` name.
   * @param {object} data - Benchmark data.
   * @returns {object} Map of agent name -> entry (later duplicates win).
   */
  _buildAgentIndex(data) {
    const map = {};
    for (const entry of data.agent_model_scores || []) {
      map[entry.agent] = entry;
    }
    return map;
  }

  /**
   * Index `models` entries by `id`, recording each entry's position as `idx`.
   * @param {object} data - Benchmark data.
   * @returns {object} Map of model id -> shallow copy of the entry plus `idx`.
   */
  _buildModelIndex(data) {
    const map = {};
    (data.models || []).forEach((model, idx) => {
      map[model.id] = { ...model, idx };
    });
    return map;
  }

  /**
   * Look up the score of a model for an agent, tolerating ID spelling differences
   * between sources (e.g. "kimi-k2.6:cloud" vs "kimi-k2.6-cloud" vs "kimi-k2.6").
   *
   * @param {string} agentName - Agent to look up.
   * @param {string} modelId - Model identifier in any of the known spellings.
   * @returns {*} The stored score, or `null` when the agent, its scores, or the
   *   model cannot be found (including when `modelId` is not a string).
   */
  getScore(agentName, modelId) {
    const agent = this.agents[agentName];
    if (!agent) return null;
    // A change without a `from`/`to` model must be reported as "no score",
    // not crash the whole gate with a TypeError on `.replace`.
    if (typeof modelId !== 'string') return null;
    const scores = agent.scores;
    if (!scores) return null;

    // NOTE(review): the second replace only fires for IDs that end in "--cloud"
    // after colon replacement; it is kept unchanged so existing lookups resolve
    // exactly as before — confirm whether it is still needed.
    const legacyNormalized = modelId.replace(/:/g, '-').replace(/--cloud$/, '-2.6');
    const withoutCloud = modelId.replace(/[:-]cloud$/, '');

    // Original candidates first (same priority as before), then the additional
    // spellings: suffix stripped, and the ":cloud" form.
    const candidates = new Set([
      legacyNormalized,
      modelId,
      `${modelId}-cloud`,
      withoutCloud,
      `${withoutCloud}:cloud`,
    ]);
    for (const key of candidates) {
      if (scores[key] !== undefined) return scores[key];
    }
    return null;
  }

  /**
   * Validate a single reassignment.
   *
   * @param {string} agentName - Agent being reassigned.
   * @param {string} fromModel - Currently assigned model.
   * @param {string} toModel - Proposed model.
   * @returns {object} `{ acceptable, reason? , oldScore?, newScore?, delta?, status? }`.
   *   `status` ('upgrade' | 'same' | 'minor_regression') is set only when acceptable;
   *   `reason` is set only when rejected.
   */
  validateChange(agentName, fromModel, toModel) {
    const agent = this.agents[agentName];
    if (!agent) {
      return { acceptable: false, reason: `Agent "${agentName}" not found in benchmarks` };
    }

    const oldScore = this.getScore(agentName, fromModel);
    const newScore = this.getScore(agentName, toModel);
    if (oldScore === null) {
      return { acceptable: false, reason: `No score for "${fromModel}" on agent "${agentName}"` };
    }
    if (newScore === null) {
      return { acceptable: false, reason: `No score for "${toModel}" on agent "${agentName}"` };
    }

    const delta = newScore - oldScore;
    if (newScore < this.minScore) {
      return {
        acceptable: false,
        reason: `Score ${newScore} below global minimum ${this.minScore}`,
        oldScore, newScore, delta,
      };
    }
    if (newScore < oldScore - this.maxRegression) {
      return {
        acceptable: false,
        reason: `Regression ${oldScore} -> ${newScore} (delta ${delta}) exceeds max allowed regression of ${this.maxRegression}`,
        oldScore, newScore, delta,
      };
    }

    let status = 'minor_regression';
    if (newScore > oldScore) status = 'upgrade';
    else if (newScore === oldScore) status = 'same';
    return { acceptable: true, oldScore, newScore, delta, status };
  }

  /**
   * Validate a list of reassignments.
   *
   * @param {Array<{agent: string, from: string, to: string}>} changes
   * @returns {{results: object[], rejections: object[], passed: boolean}}
   *   Every entry (in both lists) is the change merged with its validation
   *   result, so rejections carry `agent`, `from` and `to` for reporting.
   */
  validateAllChanges(changes) {
    const results = [];
    const rejections = [];
    for (const change of changes) {
      const entry = { ...change, ...this.validateChange(change.agent, change.from, change.to) };
      results.push(entry);
      // Push the merged entry, not the bare result: the bare result has no
      // `agent`, which made the rejection list print "undefined: <reason>".
      if (!entry.acceptable) rejections.push(entry);
    }
    return { results, rejections, passed: rejections.length === 0 };
  }

  /**
   * Print a table of all validated changes, a summary line, and the list of
   * rejections (if any) to the console.
   *
   * @param {{results: object[], rejections: object[]}} report - Output of
   *   `validateAllChanges`.
   */
  printDiff(report) {
    const describe = (r) => {
      if (!r.acceptable) return `⛔ REJECTED: ${r.reason}`;
      if (r.delta > 0) return '✅ UPGRADE';
      if (r.delta === 0) return ' SAME';
      return `⚠️ MINOR (${r.delta})`;
    };

    console.log('\n=== Model Change Diff Report ===');
    console.log(
      'Agent'.padEnd(25),
      'Old Model'.padEnd(25),
      'Old Score'.padEnd(10),
      'New Model'.padEnd(25),
      'New Score'.padEnd(10),
      'Status'
    );
    console.log('-'.repeat(115));
    for (const r of report.results) {
      console.log(
        String(r.agent ?? '-').padEnd(25),
        (r.from || '-').padEnd(25),
        String(r.oldScore ?? '-').padEnd(10),
        (r.to || '-').padEnd(25),
        String(r.newScore ?? '-').padEnd(10),
        describe(r)
      );
    }
    console.log('-'.repeat(115));

    // Only accepted changes belong in the upgrade/same/regression tallies; a
    // rejected change can still have a positive or zero delta (e.g. 60 -> 70
    // with a minimum of 75) and must be counted as rejected only.
    const accepted = report.results.filter((r) => r.acceptable);
    const upgrades = accepted.filter((r) => r.delta > 0).length;
    const downgrades = accepted.filter((r) => r.delta < 0).length;
    const same = accepted.filter((r) => r.delta === 0).length;
    const rejected = report.rejections.length;
    console.log(`Upgrades: ${upgrades} | Minor regressions: ${downgrades} | Same: ${same} | Rejected: ${rejected}`);

    if (rejected > 0) {
      console.log('\n⛔ REJECTIONS (sync blocked):');
      for (const r of report.rejections) {
        console.log(` - ${r.agent}: ${r.reason}`);
      }
      console.log('\nNo files were modified. Fix the source data or adjust thresholds (not recommended).');
    }
  }
}
/**
 * Build a FitnessGate from the benchmark JSON stored at BENCHMARKS_PATH.
 *
 * @param {object} [options] - Passed through to the FitnessGate constructor.
 * @returns {FitnessGate} Gate backed by the parsed benchmark file.
 */
function loadGate(options = {}) {
  const rawJson = fs.readFileSync(BENCHMARKS_PATH, 'utf8');
  const benchmarks = JSON.parse(rawJson);
  return new FitnessGate(benchmarks, options);
}
/**
 * One-call helper: load the default gate, validate every change, print the
 * diff report, and hand the report back to the caller.
 *
 * @param {Array<object>} changes - Changes forwarded to validateAllChanges.
 * @param {object} [options] - Passed through to loadGate.
 * @returns {object} The report produced by validateAllChanges.
 */
function runGate(changes, options = {}) {
  const fitnessGate = loadGate(options);
  const outcome = fitnessGate.validateAllChanges(changes);
  fitnessGate.printDiff(outcome);
  return outcome;
}
// Public API: the gate class plus the two convenience helpers defined in this file.
module.exports = { FitnessGate, loadGate, runGate };