fix: restore optimal v3 models + add fitness gate protection

- Restore all 30 agents to the v3.html heatmap optimal models:
  * frontend-developer: qwen3-coder -> minimax-m2.5 (92★)
  * devops-engineer: nemotron-3-super -> kimi-k2.6:cloud (88★)
  * browser-automation: qwen3-coder -> kimi-k2.6:cloud (86★)
  * agent-architect: glm-5.1 -> kimi-k2.6:cloud (86★)
- Add Model Evolution Guard system:
  * agent-evolution/scripts/lib/fitness-gate.cjs
  * Rejects downgrades of more than 3 points or to a score below 75
  * Produces a detailed diff report before any file modifications
  * Normalized model ID lookup (v3.html ':' vs JSON '-')
- Update sync-benchmarks-from-yaml.cjs with the fitness gate
- Update model-benchmarks.json with the v3 optimal assignments
- Rebuild research-dashboard.html (104KB, 30 agents, 11 models)
- Add model-evolution-guard.md architecture documentation
- Add v3-optimal-models.json as the source-of-truth reference

Fixes the regression introduced by commit 3badb25, where models were silently downgraded from the heatmap optimal assignments to inferior ones.
2026-04-29 23:19:16 +01:00
parent d1516f4856
commit 9e48a4960e
14 changed files with 2850 additions and 2049 deletions
--- a/agent-evolution/scripts/sync-benchmarks-from-yaml.cjs
+++ b/agent-evolution/scripts/sync-benchmarks-from-yaml.cjs
@@ -1,4 +1,5 @@
 const fs = require('fs');
+const { runGate } = require('./lib/fitness-gate.cjs');

 // Parse simple YAML structure with 2-space indentation
 function parseCapabilityIndex(text) {
@@ -6,21 +7,19 @@ function parseCapabilityIndex(text) {
  const agents = {};
  let currentAgent = '';
  let currentList = '';
-  
+
  for (const line of lines) {
    const indent = line.length - line.trimStart().length;
    const trimmed = line.trim();
-    
+
    if (indent === 2 && trimmed.endsWith(':') && !trimmed.startsWith('-')) {
-      // Agent name
      currentAgent = trimmed.slice(0, -1);
      agents[currentAgent] = {};
      currentList = '';
      continue;
    }
-    
+
    if (indent === 4 && trimmed.endsWith(':') && !trimmed.startsWith('-')) {
-      // Scalar property or list start
      const key = trimmed.slice(0, -1);
      currentList = key;
      if (!Array.isArray(agents[currentAgent][key])) {
@@ -28,18 +27,16 @@ function parseCapabilityIndex(text) {
      }
      continue;
    }
-    
+
    if (indent === 4 && trimmed.includes(':') && !trimmed.startsWith('-')) {
-      // key: value
      const [key, ...rest] = trimmed.split(':');
      const value = rest.join(':').trim();
      agents[currentAgent][key.trim()] = value;
      currentList = '';
      continue;
    }
-    
+
    if (indent >= 6 && trimmed.startsWith('- ')) {
-      // List item
      const value = trimmed.slice(2).trim();
      if (currentList) {
        if (!agents[currentAgent][currentList]) agents[currentAgent][currentList] = [];
@@ -47,21 +44,19 @@ function parseCapabilityIndex(text) {
      }
      continue;
    }
-    
-    // Reset list context on unknown indentation
+
    if (indent < 4) {
      currentList = '';
    }
  }
-  
-  // Filter out non-agent entries (flat sections like capability_routing, etc.)
+
  const result = {};
  const scalarKeys = ['capabilities','receives','produces','forbidden','delegates_to','fallback_models'];
  for (const [name, data] of Object.entries(agents)) {
    const hasAgentProps = scalarKeys.some(k => k in data) || 'model' in data;
    if (hasAgentProps) result[name] = data;
  }
-  
+
  return result;
 }

@@ -72,6 +67,38 @@ console.log('Parsed agents:', Object.keys(parsed).length);
 // Read existing benchmarks
 const bench = JSON.parse(fs.readFileSync('agent-evolution/data/model-benchmarks.json', 'utf8'));

+// === FITNESS GATE: diff the YAML-parsed models against the stored config and abort (exit 1) if runGate rejects the changes ===
+const oldConfig = {}; // agent name -> model string currently stored in bench.agent_current_config
+(bench.agent_current_config || []).forEach(c => { // tolerate a benchmarks file with no agent_current_config
+  oldConfig[c.agent] = c.model;
+});
+
+const changes = []; // one { agent, from, to } entry per agent whose model string differs
+for (const [agent, data] of Object.entries(parsed)) {
+  const newModel = data.model || ''; // an agent without a model in the YAML is treated as ''
+  const oldModel = oldConfig[agent]; // undefined for agents that have no stored entry
+  if (oldModel && oldModel !== newModel) { // agents with no previous model are never reported as changes
+    // NOTE(review): the comparison above uses the unstripped strings, while from/to below drop the
+    // 'ollama-cloud/' prefix — if stored models omit the prefix this reports from === to; confirm.
+    changes.push({
+      agent,
+      from: oldModel.replace('ollama-cloud/', ''), // string pattern: only the first occurrence is removed
+      to: newModel.replace('ollama-cloud/', '')
+    });
+  }
+}
+
+if (changes.length > 0) { // the gate is only invoked when at least one change was detected
+  console.log('\nDetected model changes:', changes.length);
+  const report = runGate(changes); // only report.passed is read here
+
+  if (!report.passed) {
+    console.error('\n⛔ FITNESS GATE REJECTED the sync. No files modified.');
+    console.error('If you intend to downgrade, update the source scores in model-benchmarks.json first.');
+    process.exit(1); // stop here, before the writeFileSync of model-benchmarks.json further down this script
+  }
+
+  console.log('\n✅ All model changes passed fitness gate. Proceeding...');
+}
+
 // Update agent_current_config
 bench.agent_current_config = Object.entries(parsed).map(([agent, data]) => {
  const rawModel = data.model || '';
@@ -104,7 +131,6 @@ bench.agent_model_scores = Object.entries(parsed).map(([agent, data]) => {
  const rawModel = data.model || '';
  const modelId = rawModel.replace('ollama-cloud/', '');
  const currentIndex = bench.models.findIndex(m => m.id === modelId);
-  // Preserve existing scores or empty
  const scores = existingScores[agent] || {};
  return {
    agent,
@@ -117,11 +143,11 @@ bench.agent_model_scores = Object.entries(parsed).map(([agent, data]) => {

 // Update metadata
 bench.generated = new Date().toISOString();
-bench.source = '.kilo/capability-index.yaml (synced v2)';
+bench.source = '.kilo/capability-index.yaml (synced v3 + fitness-gate)';
 bench.total_agents = bench.agent_current_config.length;

 fs.writeFileSync('agent-evolution/data/model-benchmarks.json', JSON.stringify(bench, null, 2));
-console.log('Synced', bench.agent_current_config.length, 'agents');
+console.log('\nSynced', bench.agent_current_config.length, 'agents');
 console.log('Generated:', bench.generated);

 // Verify
@@ -134,3 +160,4 @@ bench.agent_current_config.forEach(c => {
  }
 });
 console.log('Mismatches:', mismatches);
+console.log('\n💡 Tip: If fitness gate rejected changes, verify that model-benchmarks.json has correct heatmap scores before syncing from YAML.');