fix: restore optimal v3 models + add fitness gate protection
- Restore all 30 agents to v3.html heatmap optimal models:
* frontend-developer: qwen3-coder -> minimax-m2.5 (92★)
* devops-engineer: nemotron-3-super -> kimi-k2.6:cloud (88★)
* browser-automation: qwen3-coder -> kimi-k2.6:cloud (86★)
* agent-architect: glm-5.1 -> kimi-k2.6:cloud (86★)
- Add Model Evolution Guard system:
* agent-evolution/scripts/lib/fitness-gate.cjs
* Rejects downgrades >3 points or below score 75
* Produces detailed diff report before any file modifications
* Normalized model ID lookup (v3.html ':' vs JSON '-')
- Update sync-benchmarks-from-yaml.cjs with fitness gate
- Update model-benchmarks.json with v3 optimal assignments
- Rebuild research-dashboard.html (104KB, 30 agents, 11 models)
- Add model-evolution-guard.md architecture documentation
- Add v3-optimal-models.json as source-of-truth reference
Fixes regression introduced by commit 3badb25 where models were
silently downgraded from heatmap optimal to inferior assignments.
This commit is contained in:
171
agent-evolution/scripts/lib/fitness-gate.cjs
Normal file
171
agent-evolution/scripts/lib/fitness-gate.cjs
Normal file
@@ -0,0 +1,171 @@
|
||||
/**
|
||||
* Model Evolution Fitness Gate
|
||||
*
|
||||
* Validates any model assignment change against heatmap-derived scores.
|
||||
* Rejects changes that would downgrade agents beyond the regression threshold.
|
||||
*
|
||||
* Usage:
|
||||
* const { FitnessGate, runGate } = require('./fitness-gate.cjs');
|
||||
* runGate([{ agent: 'frontend-developer', from: 'qwen3-coder', to: 'minimax-m2.5' }]);
|
||||
*/
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
const BENCHMARKS_PATH = path.join(__dirname, '../../data/model-benchmarks.json');
|
||||
const DEFAULT_MIN_SCORE = 75;
|
||||
const DEFAULT_MAX_REGRESSION = 3;
|
||||
|
||||
/**
 * Validates proposed agent -> model reassignments against per-agent benchmark
 * scores and rejects changes that lower the score too far.
 *
 * A change is rejected when either model has no score for the agent, when the
 * new score is below `minScore`, or when the new score is more than
 * `maxRegression` points below the old score.
 */
class FitnessGate {
  /**
   * @param {object} benchmarks - Parsed benchmarks data. The `agent_model_scores`
   *   and `models` arrays are read; missing arrays are treated as empty.
   * @param {object} [options]
   * @param {number} [options.minScore] - Lowest acceptable score for the new
   *   model. Falls back to DEFAULT_MIN_SCORE when null/undefined.
   * @param {number} [options.maxRegression] - Largest tolerated score drop.
   *   Falls back to DEFAULT_MAX_REGRESSION when null/undefined.
   */
  constructor(benchmarks, options = {}) {
    this.benchmarks = benchmarks;
    this.agents = this._buildAgentIndex(benchmarks);
    this.models = this._buildModelIndex(benchmarks);
    this.minScore = options?.minScore ?? DEFAULT_MIN_SCORE;
    this.maxRegression = options?.maxRegression ?? DEFAULT_MAX_REGRESSION;
  }

  /**
   * Index the `agent_model_scores` entries by their `agent` name.
   *
   * @param {object} data - Benchmarks data.
   * @returns {object} Map of agent name -> entry.
   */
  _buildAgentIndex(data) {
    const map = {};
    for (const entry of data?.agent_model_scores || []) {
      map[entry.agent] = entry;
    }
    return map;
  }

  /**
   * Index the `models` entries by `id`, recording each entry's array position
   * as `idx` on a shallow copy.
   *
   * @param {object} data - Benchmarks data.
   * @returns {object} Map of model id -> `{ ...model, idx }`.
   */
  _buildModelIndex(data) {
    const map = {};
    (data?.models || []).forEach((model, idx) => {
      map[model.id] = { ...model, idx };
    });
    return map;
  }

  /**
   * Look up the score of a model for an agent.
   *
   * Model IDs are spelled differently across sources: the heatmap uses ':'
   * (e.g. "kimi-k2.6:cloud") while the JSON may use '-' ("kimi-k2.6-cloud") or
   * omit the suffix altogether ("kimi-k2.6"). Several spellings are therefore
   * tried, exact-ish forms first and the suffix-less form last.
   *
   * @param {string} agentName
   * @param {string} modelId
   * @returns {*} The stored score, or null when the agent, its scores, or the
   *   model is unknown (or `modelId` is not a non-empty string).
   */
  getScore(agentName, modelId) {
    const scores = this.agents[agentName]?.scores;
    if (!scores || typeof modelId !== 'string' || modelId === '') return null;

    const dashed = modelId.replace(/:/g, '-');
    const withoutCloudSuffix = modelId.replace(/[:-]cloud$/, '');
    const candidates = [dashed, modelId, `${modelId}-cloud`, withoutCloudSuffix];

    for (const key of candidates) {
      // Own-property check so IDs such as "constructor" never hit Object.prototype.
      if (Object.prototype.hasOwnProperty.call(scores, key) && scores[key] !== undefined) {
        return scores[key];
      }
    }
    return null;
  }

  /**
   * Decide whether moving `agentName` from `fromModel` to `toModel` is allowed.
   *
   * @param {string} agentName
   * @param {string} fromModel
   * @param {string} toModel
   * @returns {object} `{ acceptable, reason?, oldScore?, newScore?, delta?, status? }`.
   *   `status` ('upgrade' | 'same' | 'minor_regression') is set only on accepted changes;
   *   `reason` only on rejected ones.
   */
  validateChange(agentName, fromModel, toModel) {
    if (!this.agents[agentName]) {
      return { acceptable: false, reason: `Agent "${agentName}" not found in benchmarks` };
    }

    const oldScore = this.getScore(agentName, fromModel);
    const newScore = this.getScore(agentName, toModel);

    if (oldScore === null) {
      return { acceptable: false, reason: `No score for "${fromModel}" on agent "${agentName}"` };
    }
    if (newScore === null) {
      return { acceptable: false, reason: `No score for "${toModel}" on agent "${agentName}"` };
    }

    const delta = newScore - oldScore;

    if (newScore < this.minScore) {
      return {
        acceptable: false,
        reason: `Score ${newScore} below global minimum ${this.minScore}`,
        oldScore, newScore, delta
      };
    }

    if (newScore < oldScore - this.maxRegression) {
      return {
        acceptable: false,
        reason: `Regression ${oldScore} -> ${newScore} (delta ${delta}) exceeds max allowed regression of ${this.maxRegression}`,
        oldScore, newScore, delta
      };
    }

    let status = 'minor_regression';
    if (newScore > oldScore) status = 'upgrade';
    else if (newScore === oldScore) status = 'same';

    return { acceptable: true, oldScore, newScore, delta, status };
  }

  /**
   * Validate a list of changes.
   *
   * @param {Array<{agent: string, from: string, to: string}>} changes
   * @returns {{results: object[], rejections: object[], passed: boolean}}
   *   Every entry (in both arrays) is the change merged with its validation
   *   result, so rejections keep their `agent`/`from`/`to` for reporting.
   */
  validateAllChanges(changes) {
    const results = [];
    const rejections = [];

    for (const change of changes ?? []) {
      const entry = { ...change, ...this.validateChange(change.agent, change.from, change.to) };
      results.push(entry);
      if (!entry.acceptable) rejections.push(entry);
    }

    return { results, rejections, passed: rejections.length === 0 };
  }

  /**
   * Print a table of all validated changes, a summary line, and (if any) the
   * list of rejections to stdout.
   *
   * @param {{results: object[], rejections: object[]}} report - As returned by
   *   `validateAllChanges`.
   */
  printDiff(report) {
    const rule = '-'.repeat(115);

    console.log('\n=== Model Change Diff Report ===');
    console.log(
      'Agent'.padEnd(25),
      'Old Model'.padEnd(25),
      'Old Score'.padEnd(10),
      'New Model'.padEnd(25),
      'New Score'.padEnd(10),
      'Status'
    );
    console.log(rule);

    for (const r of report.results) {
      let status;
      if (!r.acceptable) status = `⛔ REJECTED: ${r.reason}`;
      else if (r.delta > 0) status = '✅ UPGRADE';
      else if (r.delta === 0) status = '➖ SAME';
      else status = `⚠️ MINOR (${r.delta})`;

      console.log(
        String(r.agent ?? '-').padEnd(25),
        String(r.from || '-').padEnd(25),
        String(r.oldScore ?? '-').padEnd(10),
        String(r.to || '-').padEnd(25),
        String(r.newScore ?? '-').padEnd(10),
        status
      );
    }

    console.log(rule);

    // Only accepted changes are bucketed by delta; rejected ones are counted
    // once, under "Rejected", regardless of their delta.
    const accepted = report.results.filter(r => r.acceptable);
    const upgrades = accepted.filter(r => r.delta > 0).length;
    const downgrades = accepted.filter(r => r.delta < 0).length;
    const same = accepted.filter(r => r.delta === 0).length;
    const rejected = report.rejections.length;

    console.log(`Upgrades: ${upgrades} | Minor regressions: ${downgrades} | Same: ${same} | Rejected: ${rejected}`);

    if (rejected > 0) {
      console.log('\n⛔ REJECTIONS (sync blocked):');
      for (const r of report.rejections) {
        console.log(` - ${r.agent ?? '(unknown agent)'}: ${r.reason}`);
      }
      console.log('\nNo files were modified. Fix the source data or adjust thresholds (not recommended).');
    }
  }
}
|
||||
|
||||
/**
 * Read the benchmarks JSON at BENCHMARKS_PATH and wrap it in a FitnessGate.
 *
 * @param {object} [options] - Passed through unchanged to the FitnessGate constructor.
 * @returns {FitnessGate} Gate built from the file's parsed contents.
 */
function loadGate(options = {}) {
  const rawJson = fs.readFileSync(BENCHMARKS_PATH, 'utf8');
  const benchmarks = JSON.parse(rawJson);
  return new FitnessGate(benchmarks, options);
}
|
||||
|
||||
/**
 * One-call helper: build a gate via loadGate, validate the given changes,
 * print the diff report, and hand the report back to the caller.
 *
 * @param {Array<object>} changes - Changes forwarded to `validateAllChanges`.
 * @param {object} [options] - Forwarded to `loadGate`.
 * @returns {object} The report produced by `validateAllChanges`.
 */
function runGate(changes, options = {}) {
  const fitnessGate = loadGate(options);
  const outcome = fitnessGate.validateAllChanges(changes);
  fitnessGate.printDiff(outcome);
  return outcome;
}
|
||||
|
||||
module.exports = { FitnessGate, loadGate, runGate };
|
||||
@@ -1,4 +1,5 @@
|
||||
const fs = require('fs');
|
||||
const { runGate } = require('./lib/fitness-gate.cjs');
|
||||
|
||||
// Parse simple YAML structure with 2-space indentation
|
||||
function parseCapabilityIndex(text) {
|
||||
@@ -6,21 +7,19 @@ function parseCapabilityIndex(text) {
|
||||
const agents = {};
|
||||
let currentAgent = '';
|
||||
let currentList = '';
|
||||
|
||||
|
||||
for (const line of lines) {
|
||||
const indent = line.length - line.trimStart().length;
|
||||
const trimmed = line.trim();
|
||||
|
||||
|
||||
if (indent === 2 && trimmed.endsWith(':') && !trimmed.startsWith('-')) {
|
||||
// Agent name
|
||||
currentAgent = trimmed.slice(0, -1);
|
||||
agents[currentAgent] = {};
|
||||
currentList = '';
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
if (indent === 4 && trimmed.endsWith(':') && !trimmed.startsWith('-')) {
|
||||
// Scalar property or list start
|
||||
const key = trimmed.slice(0, -1);
|
||||
currentList = key;
|
||||
if (!Array.isArray(agents[currentAgent][key])) {
|
||||
@@ -28,18 +27,16 @@ function parseCapabilityIndex(text) {
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
if (indent === 4 && trimmed.includes(':') && !trimmed.startsWith('-')) {
|
||||
// key: value
|
||||
const [key, ...rest] = trimmed.split(':');
|
||||
const value = rest.join(':').trim();
|
||||
agents[currentAgent][key.trim()] = value;
|
||||
currentList = '';
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
if (indent >= 6 && trimmed.startsWith('- ')) {
|
||||
// List item
|
||||
const value = trimmed.slice(2).trim();
|
||||
if (currentList) {
|
||||
if (!agents[currentAgent][currentList]) agents[currentAgent][currentList] = [];
|
||||
@@ -47,21 +44,19 @@ function parseCapabilityIndex(text) {
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Reset list context on unknown indentation
|
||||
|
||||
if (indent < 4) {
|
||||
currentList = '';
|
||||
}
|
||||
}
|
||||
|
||||
// Filter out non-agent entries (flat sections like capability_routing, etc.)
|
||||
|
||||
const result = {};
|
||||
const scalarKeys = ['capabilities','receives','produces','forbidden','delegates_to','fallback_models'];
|
||||
for (const [name, data] of Object.entries(agents)) {
|
||||
const hasAgentProps = scalarKeys.some(k => k in data) || 'model' in data;
|
||||
if (hasAgentProps) result[name] = data;
|
||||
}
|
||||
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -72,6 +67,38 @@ console.log('Parsed agents:', Object.keys(parsed).length);
|
||||
// Read existing benchmarks
|
||||
const bench = JSON.parse(fs.readFileSync('agent-evolution/data/model-benchmarks.json', 'utf8'));
|
||||
|
||||
// === FITNESS GATE: validate model changes ===
// Snapshot of the model recorded for each agent in the existing benchmarks file.
const oldConfig = {};
for (const entry of bench.agent_current_config || []) {
  oldConfig[entry.agent] = entry.model;
}

// One { agent, from, to } record per agent whose parsed model differs from the
// snapshot. Agents with no previously recorded model are not gated.
const changes = [];
Object.entries(parsed).forEach(([agent, data]) => {
  const previous = oldConfig[agent];
  const next = data.model || '';
  if (!previous || previous === next) return;

  changes.push({
    agent,
    from: previous.replace('ollama-cloud/', ''),
    to: next.replace('ollama-cloud/', '')
  });
});

if (changes.length > 0) {
  console.log('\nDetected model changes:', changes.length);
  const report = runGate(changes);

  // Abort before anything is written when the gate rejects a change.
  if (!report.passed) {
    console.error('\n⛔ FITNESS GATE REJECTED the sync. No files modified.');
    console.error('If you intend to downgrade, update the source scores in model-benchmarks.json first.');
    process.exit(1);
  }

  console.log('\n✅ All model changes passed fitness gate. Proceeding...');
}
|
||||
|
||||
// Update agent_current_config
|
||||
bench.agent_current_config = Object.entries(parsed).map(([agent, data]) => {
|
||||
const rawModel = data.model || '';
|
||||
@@ -104,7 +131,6 @@ bench.agent_model_scores = Object.entries(parsed).map(([agent, data]) => {
|
||||
const rawModel = data.model || '';
|
||||
const modelId = rawModel.replace('ollama-cloud/', '');
|
||||
const currentIndex = bench.models.findIndex(m => m.id === modelId);
|
||||
// Preserve existing scores or empty
|
||||
const scores = existingScores[agent] || {};
|
||||
return {
|
||||
agent,
|
||||
@@ -117,11 +143,11 @@ bench.agent_model_scores = Object.entries(parsed).map(([agent, data]) => {
|
||||
|
||||
// Update metadata
|
||||
bench.generated = new Date().toISOString();
|
||||
bench.source = '.kilo/capability-index.yaml (synced v2)';
|
||||
bench.source = '.kilo/capability-index.yaml (synced v3 + fitness-gate)';
|
||||
bench.total_agents = bench.agent_current_config.length;
|
||||
|
||||
fs.writeFileSync('agent-evolution/data/model-benchmarks.json', JSON.stringify(bench, null, 2));
|
||||
console.log('Synced', bench.agent_current_config.length, 'agents');
|
||||
console.log('\nSynced', bench.agent_current_config.length, 'agents');
|
||||
console.log('Generated:', bench.generated);
|
||||
|
||||
// Verify
|
||||
@@ -134,3 +160,4 @@ bench.agent_current_config.forEach(c => {
|
||||
}
|
||||
});
|
||||
console.log('Mismatches:', mismatches);
|
||||
console.log('\n💡 Tip: If fitness gate rejected changes, verify that model-benchmarks.json has correct heatmap scores before syncing from YAML.');
|
||||
|
||||
Reference in New Issue
Block a user