fix: restore optimal v3 models + add fitness gate protection

- Restore all 30 agents to v3.html heatmap optimal models:
  * frontend-developer: qwen3-coder -> minimax-m2.5 (92★)
  * devops-engineer: nemotron-3-super -> kimi-k2.6:cloud (88★)
  * browser-automation: qwen3-coder -> kimi-k2.6:cloud (86★)
  * agent-architect: glm-5.1 -> kimi-k2.6:cloud (86★)
- Add Model Evolution Guard system:
  * agent-evolution/scripts/lib/fitness-gate.cjs
  * Rejects downgrades >3 points or below score 75
  * Produces detailed diff report before any file modifications
  * Normalized model ID lookup (v3.html ':' vs JSON '-')
- Update sync-benchmarks-from-yaml.cjs with fitness gate
- Update model-benchmarks.json with v3 optimal assignments
- Rebuild research-dashboard.html (104KB, 30 agents, 11 models)
- Add model-evolution-guard.md architecture documentation
- Add v3-optimal-models.json as source-of-truth reference

Fixes regression introduced by commit 3badb25 where models were
silently downgraded from heatmap optimal to inferior assignments.
This commit is contained in:
¨NW¨
2026-04-29 23:19:16 +01:00
parent d1516f4856
commit 9e48a4960e
14 changed files with 2850 additions and 2049 deletions

View File

@@ -1,4 +1,5 @@
const fs = require('fs');
const { runGate } = require('./lib/fitness-gate.cjs');
// Parse simple YAML structure with 2-space indentation
function parseCapabilityIndex(text) {
@@ -6,21 +7,19 @@ function parseCapabilityIndex(text) {
const agents = {};
let currentAgent = '';
let currentList = '';
for (const line of lines) {
const indent = line.length - line.trimStart().length;
const trimmed = line.trim();
if (indent === 2 && trimmed.endsWith(':') && !trimmed.startsWith('-')) {
// Agent name
currentAgent = trimmed.slice(0, -1);
agents[currentAgent] = {};
currentList = '';
continue;
}
if (indent === 4 && trimmed.endsWith(':') && !trimmed.startsWith('-')) {
// Scalar property or list start
const key = trimmed.slice(0, -1);
currentList = key;
if (!Array.isArray(agents[currentAgent][key])) {
@@ -28,18 +27,16 @@ function parseCapabilityIndex(text) {
}
continue;
}
if (indent === 4 && trimmed.includes(':') && !trimmed.startsWith('-')) {
// key: value
const [key, ...rest] = trimmed.split(':');
const value = rest.join(':').trim();
agents[currentAgent][key.trim()] = value;
currentList = '';
continue;
}
if (indent >= 6 && trimmed.startsWith('- ')) {
// List item
const value = trimmed.slice(2).trim();
if (currentList) {
if (!agents[currentAgent][currentList]) agents[currentAgent][currentList] = [];
@@ -47,21 +44,19 @@ function parseCapabilityIndex(text) {
}
continue;
}
// Reset list context on unknown indentation
if (indent < 4) {
currentList = '';
}
}
// Filter out non-agent entries (flat sections like capability_routing, etc.)
const result = {};
const scalarKeys = ['capabilities','receives','produces','forbidden','delegates_to','fallback_models'];
for (const [name, data] of Object.entries(agents)) {
const hasAgentProps = scalarKeys.some(k => k in data) || 'model' in data;
if (hasAgentProps) result[name] = data;
}
return result;
}
@@ -72,6 +67,38 @@ console.log('Parsed agents:', Object.keys(parsed).length);
// Read existing benchmarks
const bench = JSON.parse(fs.readFileSync('agent-evolution/data/model-benchmarks.json', 'utf8'));
// === FITNESS GATE: validate model changes ===
// Snapshot of the model recorded for each agent in the benchmarks file that
// was just read (an absent agent_current_config is treated as an empty list).
const oldConfig = {};
(bench.agent_current_config || []).forEach(({ agent, model }) => {
  oldConfig[agent] = model;
});

// One entry per agent whose parsed model differs from the recorded one.
// Agents with no recorded model are skipped; the 'ollama-cloud/' prefix is
// stripped from both sides before they are handed to the gate.
const changes = Object.entries(parsed)
  .map(([agent, data]) => ({
    agent,
    previous: oldConfig[agent],
    next: data.model || '',
  }))
  .filter(({ previous, next }) => previous && previous !== next)
  .map(({ agent, previous, next }) => ({
    agent,
    from: previous.replace('ollama-cloud/', ''),
    to: next.replace('ollama-cloud/', ''),
  }));

const changeCount = changes.length;
if (changeCount > 0) {
  console.log('\nDetected model changes:', changeCount);
  const { passed } = runGate(changes);
  if (!passed) {
    // Abort with a non-zero exit code here, ahead of the statements below
    // that mutate `bench`.
    console.error('\n⛔ FITNESS GATE REJECTED the sync. No files modified.');
    console.error('If you intend to downgrade, update the source scores in model-benchmarks.json first.');
    process.exit(1);
  }
  console.log('\n✅ All model changes passed fitness gate. Proceeding...');
}
// Update agent_current_config
bench.agent_current_config = Object.entries(parsed).map(([agent, data]) => {
const rawModel = data.model || '';
@@ -104,7 +131,6 @@ bench.agent_model_scores = Object.entries(parsed).map(([agent, data]) => {
const rawModel = data.model || '';
const modelId = rawModel.replace('ollama-cloud/', '');
const currentIndex = bench.models.findIndex(m => m.id === modelId);
// Preserve existing scores or empty
const scores = existingScores[agent] || {};
return {
agent,
@@ -117,11 +143,11 @@ bench.agent_model_scores = Object.entries(parsed).map(([agent, data]) => {
// Update metadata
bench.generated = new Date().toISOString();
bench.source = '.kilo/capability-index.yaml (synced v2)';
bench.source = '.kilo/capability-index.yaml (synced v3 + fitness-gate)';
bench.total_agents = bench.agent_current_config.length;
fs.writeFileSync('agent-evolution/data/model-benchmarks.json', JSON.stringify(bench, null, 2));
console.log('Synced', bench.agent_current_config.length, 'agents');
console.log('\nSynced', bench.agent_current_config.length, 'agents');
console.log('Generated:', bench.generated);
// Verify
@@ -134,3 +160,4 @@ bench.agent_current_config.forEach(c => {
}
});
console.log('Mismatches:', mismatches);
console.log('\n💡 Tip: If fitness gate rejected changes, verify that model-benchmarks.json has correct heatmap scores before syncing from YAML.');