fix: restore optimal v3 models + add fitness gate protection

- Restore all 30 agents to v3.html heatmap optimal models:
  * frontend-developer: qwen3-coder -> minimax-m2.5 (92★)
  * devops-engineer: nemotron-3-super -> kimi-k2.6:cloud (88★)
  * browser-automation: qwen3-coder -> kimi-k2.6:cloud (86★)
  * agent-architect: glm-5.1 -> kimi-k2.6:cloud (86★)
- Add Model Evolution Guard system:
  * agent-evolution/scripts/lib/fitness-gate.cjs
  * Rejects downgrades >3 points or below score 75
  * Produces detailed diff report before any file modifications
  * Normalized model ID lookup (v3.html ':' vs JSON '-')
- Update sync-benchmarks-from-yaml.cjs with fitness gate
- Update model-benchmarks.json with v3 optimal assignments
- Rebuild research-dashboard.html (104KB, 30 agents, 11 models)
- Add model-evolution-guard.md architecture documentation
- Add v3-optimal-models.json as source-of-truth reference

Fixes regression introduced by commit 3badb25 where models were silently downgraded from heatmap optimal to inferior assignments.
2026-04-29 23:19:16 +01:00
parent d1516f4856
commit 9e48a4960e
14 changed files with 2850 additions and 2049 deletions
--- a/agent-evolution/scripts/lib/fitness-gate.cjs
+++ b/agent-evolution/scripts/lib/fitness-gate.cjs
@@ -0,0 +1,171 @@
+/**
+ * Model Evolution Fitness Gate
+ *
+ * Validates any model assignment change against heatmap-derived scores.
+ * Rejects changes that would downgrade agents beyond the regression threshold.
+ *
+ * Usage:
+ *   const { FitnessGate, runGate } = require('./fitness-gate.cjs');
+ *   runGate([{ agent: 'frontend-developer', from: 'qwen3-coder', to: 'minimax-m2.5' }]);
+ */
+
+const fs = require('fs');
+const path = require('path');
+
+// Location of the benchmarks JSON, resolved relative to this file's directory.
+const BENCHMARKS_PATH = path.join(__dirname, '../../data/model-benchmarks.json');
+// Fallback for the gate's `minScore` option: a new score below this is rejected.
+const DEFAULT_MIN_SCORE = 75;
+// Fallback for the gate's `maxRegression` option: the largest score drop still accepted.
+const DEFAULT_MAX_REGRESSION = 3;
+
+/**
+ * Validates proposed agent -> model reassignments against per-agent scores.
+ *
+ * A change is rejected when either model has no score for the agent, when the
+ * new score is below `minScore`, or when the new score is more than
+ * `maxRegression` points lower than the old one.
+ */
+class FitnessGate {
+  /**
+   * @param {object} benchmarks - Parsed benchmarks data. The `agent_model_scores`
+   *   and `models` arrays are indexed when present.
+   * @param {object} [options]
+   * @param {number} [options.minScore] - Lowest acceptable new score
+   *   (defaults to DEFAULT_MIN_SCORE).
+   * @param {number} [options.maxRegression] - Largest acceptable score drop
+   *   (defaults to DEFAULT_MAX_REGRESSION).
+   */
+  constructor(benchmarks, options = {}) {
+    this.benchmarks = benchmarks;
+    this.agents = this._buildAgentIndex(benchmarks);
+    this.models = this._buildModelIndex(benchmarks);
+    this.minScore = options.minScore ?? DEFAULT_MIN_SCORE;
+    this.maxRegression = options.maxRegression ?? DEFAULT_MAX_REGRESSION;
+  }
+
+  /**
+   * Index `agent_model_scores` entries by their `agent` name.
+   * @param {object} data - Benchmarks data.
+   * @returns {object} Map of agent name -> score entry.
+   */
+  _buildAgentIndex(data) {
+    const map = {};
+    (data.agent_model_scores || []).forEach(a => {
+      map[a.agent] = a;
+    });
+    return map;
+  }
+
+  /**
+   * Index `models` entries by `id`, recording each entry's array position as `idx`.
+   * @param {object} data - Benchmarks data.
+   * @returns {object} Map of model id -> model entry (copied, with `idx` added).
+   */
+  _buildModelIndex(data) {
+    const map = {};
+    (data.models || []).forEach((m, i) => {
+      map[m.id] = { ...m, idx: i };
+    });
+    return map;
+  }
+
+  /**
+   * Look up the score of a model for an agent.
+   *
+   * Model IDs are spelled differently across sources ("kimi-k2.6:cloud",
+   * "kimi-k2.6-cloud" or plain "kimi-k2.6"), so the exact ID is tried first,
+   * followed by the equivalent spellings.
+   *
+   * @param {string} agentName
+   * @param {string} modelId
+   * @returns {*} The stored score, or null when the agent or model is unknown.
+   */
+  getScore(agentName, modelId) {
+    const agent = this.agents[agentName];
+    if (!agent || typeof modelId !== 'string') return null;
+
+    // ID with any trailing ":cloud" / "-cloud" suffix removed.
+    const base = modelId.replace(/[:-]cloud$/, '');
+    const candidates = [
+      modelId,
+      modelId.replace(/:/g, '-'),
+      base,
+      `${base}-cloud`,
+      `${base}:cloud`,
+    ];
+    for (const key of candidates) {
+      const score = agent.scores?.[key];
+      if (score !== undefined && score !== null) return score;
+    }
+    return null;
+  }
+
+  /**
+   * Decide whether moving an agent from one model to another is acceptable.
+   *
+   * @param {string} agentName
+   * @param {string} fromModel - Currently assigned model ID.
+   * @param {string} toModel - Proposed model ID.
+   * @returns {object} `{ acceptable, reason?, oldScore?, newScore?, delta?, status? }`.
+   *   `reason` is set on rejections. `status` is set on accepted changes and
+   *   is one of 'upgrade', 'same' or 'minor_regression'.
+   */
+  validateChange(agentName, fromModel, toModel) {
+    const agent = this.agents[agentName];
+    if (!agent) return { acceptable: false, reason: `Agent "${agentName}" not found in benchmarks` };
+
+    const oldScore = this.getScore(agentName, fromModel);
+    const newScore = this.getScore(agentName, toModel);
+
+    if (oldScore === null) {
+      return { acceptable: false, reason: `No score for "${fromModel}" on agent "${agentName}"` };
+    }
+    if (newScore === null) {
+      return { acceptable: false, reason: `No score for "${toModel}" on agent "${agentName}"` };
+    }
+
+    const delta = newScore - oldScore;
+
+    if (newScore < this.minScore) {
+      return {
+        acceptable: false,
+        reason: `Score ${newScore} below global minimum ${this.minScore}`,
+        oldScore, newScore, delta
+      };
+    }
+
+    if (newScore < oldScore - this.maxRegression) {
+      return {
+        acceptable: false,
+        reason: `Regression ${oldScore} -> ${newScore} (delta ${delta}) exceeds max allowed regression of ${this.maxRegression}`,
+        oldScore, newScore, delta
+      };
+    }
+
+    return {
+      acceptable: true,
+      oldScore, newScore, delta,
+      status: newScore > oldScore ? 'upgrade' : newScore === oldScore ? 'same' : 'minor_regression'
+    };
+  }
+
+  /**
+   * Validate a list of changes.
+   *
+   * @param {Array<{agent: string, from: string, to: string}>} changes
+   * @returns {{results: object[], rejections: object[], passed: boolean}}
+   *   Every entry, in `rejections` as well as `results`, carries the fields of
+   *   the originating change merged with its validation result.
+   */
+  validateAllChanges(changes) {
+    const results = [];
+    const rejections = [];
+
+    for (const change of changes) {
+      const entry = { ...change, ...this.validateChange(change.agent, change.from, change.to) };
+      results.push(entry);
+      // Push the merged entry, not the bare result, so the rejection keeps its agent name.
+      if (!entry.acceptable) rejections.push(entry);
+    }
+
+    return { results, rejections, passed: rejections.length === 0 };
+  }
+
+  /**
+   * Print a table of all validated changes, a summary line and any rejections.
+   *
+   * @param {{results: object[], rejections: object[]}} report - As returned by
+   *   validateAllChanges().
+   */
+  printDiff(report) {
+    console.log('\n=== Model Change Diff Report ===');
+    console.log(
+      'Agent'.padEnd(25),
+      'Old Model'.padEnd(25),
+      'Old Score'.padEnd(10),
+      'New Model'.padEnd(25),
+      'New Score'.padEnd(10),
+      'Status'
+    );
+    console.log('-'.repeat(115));
+
+    for (const r of report.results) {
+      let status;
+      if (!r.acceptable) {
+        status = `⛔ REJECTED: ${r.reason}`;
+      } else if (r.delta > 0) {
+        status = '✅ UPGRADE';
+      } else if (r.delta === 0) {
+        status = '➖ SAME';
+      } else {
+        status = `⚠️ MINOR (${r.delta})`;
+      }
+
+      console.log(
+        String(r.agent ?? '-').padEnd(25),
+        (r.from || '-').padEnd(25),
+        (r.oldScore ?? '-').toString().padEnd(10),
+        (r.to || '-').padEnd(25),
+        (r.newScore ?? '-').toString().padEnd(10),
+        status
+      );
+    }
+
+    console.log('-'.repeat(115));
+    // Count categories among accepted rows only, so a rejected row is not
+    // also reported as an upgrade or as unchanged.
+    const accepted = report.results.filter(r => r.acceptable);
+    const upgrades = accepted.filter(r => r.delta > 0).length;
+    const downgrades = accepted.filter(r => r.delta < 0).length;
+    const same = accepted.filter(r => r.delta === 0).length;
+    const rejected = report.rejections.length;
+
+    console.log(`Upgrades: ${upgrades} | Minor regressions: ${downgrades} | Same: ${same} | Rejected: ${rejected}`);
+
+    if (rejected > 0) {
+      console.log('\n⛔ REJECTIONS (sync blocked):');
+      for (const r of report.rejections) {
+        console.log(`  - ${r.agent}: ${r.reason}`);
+      }
+      console.log('\nNo files were modified. Fix the source data or adjust thresholds (not recommended).');
+    }
+  }
+}
+
+/**
+ * Build a FitnessGate from the benchmarks JSON stored at BENCHMARKS_PATH.
+ *
+ * @param {object} [options] - Passed through unchanged to the FitnessGate constructor.
+ * @returns {FitnessGate} Gate backed by the parsed file contents.
+ */
+function loadGate(options = {}) {
+  const raw = fs.readFileSync(BENCHMARKS_PATH, 'utf8');
+  const benchmarks = JSON.parse(raw);
+  return new FitnessGate(benchmarks, options);
+}
+
+/**
+ * Validate a batch of model changes with a freshly loaded gate and print the
+ * diff report for them.
+ *
+ * @param {Array<{agent: string, from: string, to: string}>} changes - Proposed changes.
+ * @param {object} [options] - Forwarded to loadGate().
+ * @returns {object} The report produced by FitnessGate#validateAllChanges.
+ */
+function runGate(changes, options = {}) {
+  const fitnessGate = loadGate(options);
+  const outcome = fitnessGate.validateAllChanges(changes);
+
+  fitnessGate.printDiff(outcome);
+
+  return outcome;
+}
+
+// Exported API: the gate class plus the two convenience helpers defined in this file.
+module.exports = { FitnessGate, loadGate, runGate };
--- a/agent-evolution/scripts/sync-benchmarks-from-yaml.cjs
+++ b/agent-evolution/scripts/sync-benchmarks-from-yaml.cjs
@@ -1,4 +1,5 @@
 const fs = require('fs');
+const { runGate } = require('./lib/fitness-gate.cjs');

 // Parse simple YAML structure with 2-space indentation
 function parseCapabilityIndex(text) {
@@ -6,21 +7,19 @@ function parseCapabilityIndex(text) {
  const agents = {};
  let currentAgent = '';
  let currentList = '';
-  
+
  for (const line of lines) {
    const indent = line.length - line.trimStart().length;
    const trimmed = line.trim();
-    
+
    if (indent === 2 && trimmed.endsWith(':') && !trimmed.startsWith('-')) {
-      // Agent name
      currentAgent = trimmed.slice(0, -1);
      agents[currentAgent] = {};
      currentList = '';
      continue;
    }
-    
+
    if (indent === 4 && trimmed.endsWith(':') && !trimmed.startsWith('-')) {
-      // Scalar property or list start
      const key = trimmed.slice(0, -1);
      currentList = key;
      if (!Array.isArray(agents[currentAgent][key])) {
@@ -28,18 +27,16 @@ function parseCapabilityIndex(text) {
      }
      continue;
    }
-    
+
    if (indent === 4 && trimmed.includes(':') && !trimmed.startsWith('-')) {
-      // key: value
      const [key, ...rest] = trimmed.split(':');
      const value = rest.join(':').trim();
      agents[currentAgent][key.trim()] = value;
      currentList = '';
      continue;
    }
-    
+
    if (indent >= 6 && trimmed.startsWith('- ')) {
-      // List item
      const value = trimmed.slice(2).trim();
      if (currentList) {
        if (!agents[currentAgent][currentList]) agents[currentAgent][currentList] = [];
@@ -47,21 +44,19 @@ function parseCapabilityIndex(text) {
      }
      continue;
    }
-    
-    // Reset list context on unknown indentation
+
    if (indent < 4) {
      currentList = '';
    }
  }
-  
-  // Filter out non-agent entries (flat sections like capability_routing, etc.)
+
  const result = {};
  const scalarKeys = ['capabilities','receives','produces','forbidden','delegates_to','fallback_models'];
  for (const [name, data] of Object.entries(agents)) {
    const hasAgentProps = scalarKeys.some(k => k in data) || 'model' in data;
    if (hasAgentProps) result[name] = data;
  }
-  
+
  return result;
 }

@@ -72,6 +67,38 @@ console.log('Parsed agents:', Object.keys(parsed).length);
 // Read existing benchmarks
 const bench = JSON.parse(fs.readFileSync('agent-evolution/data/model-benchmarks.json', 'utf8'));

+// === FITNESS GATE: validate model changes ===
+// Old and new model IDs are compared with the 'ollama-cloud/' prefix removed,
+// so a prefix-only difference between the stored config and the YAML is not
+// reported (and gated) as a model change.
+const oldConfig = {};
+(bench.agent_current_config || []).forEach(c => {
+  oldConfig[c.agent] = (c.model || '').replace('ollama-cloud/', '');
+});
+
+const changes = [];
+for (const [agent, data] of Object.entries(parsed)) {
+  const newModel = (data.model || '').replace('ollama-cloud/', '');
+  const oldModel = oldConfig[agent];
+  // Agents with no previously recorded model have nothing to regress from.
+  if (oldModel && oldModel !== newModel) {
+    changes.push({ agent, from: oldModel, to: newModel });
+  }
+}
+
+if (changes.length > 0) {
+  console.log('\nDetected model changes:', changes.length);
+  const report = runGate(changes);
+
+  if (!report.passed) {
+    console.error('\n⛔ FITNESS GATE REJECTED the sync. No files modified.');
+    console.error('If you intend to downgrade, update the source scores in model-benchmarks.json first.');
+    // Exit before the write further down so the benchmarks file stays untouched.
+    process.exit(1);
+  }
+
+  console.log('\n✅ All model changes passed fitness gate. Proceeding...');
+}
+
 // Update agent_current_config
 bench.agent_current_config = Object.entries(parsed).map(([agent, data]) => {
  const rawModel = data.model || '';
@@ -104,7 +131,6 @@ bench.agent_model_scores = Object.entries(parsed).map(([agent, data]) => {
  const rawModel = data.model || '';
  const modelId = rawModel.replace('ollama-cloud/', '');
  const currentIndex = bench.models.findIndex(m => m.id === modelId);
-  // Preserve existing scores or empty
  const scores = existingScores[agent] || {};
  return {
    agent,
@@ -117,11 +143,11 @@ bench.agent_model_scores = Object.entries(parsed).map(([agent, data]) => {

 // Update metadata
 bench.generated = new Date().toISOString();
-bench.source = '.kilo/capability-index.yaml (synced v2)';
+bench.source = '.kilo/capability-index.yaml (synced v3 + fitness-gate)';
 bench.total_agents = bench.agent_current_config.length;

 fs.writeFileSync('agent-evolution/data/model-benchmarks.json', JSON.stringify(bench, null, 2));
-console.log('Synced', bench.agent_current_config.length, 'agents');
+console.log('\nSynced', bench.agent_current_config.length, 'agents');
 console.log('Generated:', bench.generated);

 // Verify
@@ -134,3 +160,4 @@ bench.agent_current_config.forEach(c => {
  }
 });
 console.log('Mismatches:', mismatches);
+console.log('\n💡 Tip: If fitness gate rejected changes, verify that model-benchmarks.json has correct heatmap scores before syncing from YAML.');