// APAW/agent-evolution/scripts/lib/fitness-gate.cjs

/**
 * Model Evolution Fitness Gate
 *
 * Validates any model assignment change against heatmap-derived scores.
 * Rejects changes that would downgrade agents beyond the regression threshold.
 *
 * Usage:
 *   const { FitnessGate, runGate } = require('./fitness-gate');
 *   const report = runGate([{ agent: 'agent-name', from: 'old-model-id', to: 'new-model-id' }]);
 */

const fs = require('fs');
const path = require('path');

// Location of the benchmark data file, resolved relative to this module's directory.
const BENCHMARKS_PATH = path.join(__dirname, '../../data/model-benchmarks.json');
// Default for FitnessGate's `minScore` option: a target score below this value is rejected.
const DEFAULT_MIN_SCORE = 75;
// Default for FitnessGate's `maxRegression` option: a drop larger than this many points is rejected.
const DEFAULT_MAX_REGRESSION = 3;

/**
 * Validates model assignment changes against per-agent benchmark scores.
 *
 * A change is rejected when the target model has no score for the agent, when
 * its score is below `minScore`, or when it is more than `maxRegression`
 * points lower than the score of the model it replaces.
 */
class FitnessGate {
  /**
   * @param {object} benchmarks - Parsed benchmark data. Reads the optional
   *   `agent_model_scores` (entries with `agent` and `scores`) and `models`
   *   (entries with `id`) arrays.
   * @param {object} [options]
   * @param {number} [options.minScore] - Lowest acceptable score for the new model.
   * @param {number} [options.maxRegression] - Largest acceptable score drop.
   */
  constructor(benchmarks, options = {}) {
    this.benchmarks = benchmarks;
    this.agents = this._buildAgentIndex(benchmarks);
    this.models = this._buildModelIndex(benchmarks);
    this.minScore = options.minScore ?? DEFAULT_MIN_SCORE;
    this.maxRegression = options.maxRegression ?? DEFAULT_MAX_REGRESSION;
  }

  /**
   * Index `agent_model_scores` entries by agent name.
   * @param {object} data - Benchmark data.
   * @returns {Object<string, object>} Map of agent name to its entry.
   */
  _buildAgentIndex(data) {
    const map = {};
    for (const entry of data.agent_model_scores || []) {
      map[entry.agent] = entry;
    }
    return map;
  }

  /**
   * Index `models` entries by id, recording each model's position as `idx`.
   * @param {object} data - Benchmark data.
   * @returns {Object<string, object>} Map of model id to a copy of its entry plus `idx`.
   */
  _buildModelIndex(data) {
    const map = {};
    (data.models || []).forEach((model, idx) => {
      map[model.id] = { ...model, idx };
    });
    return map;
  }

  /**
   * Look up the score of a model for an agent, tolerating ID spelling variants.
   *
   * The benchmark JSON may key a model as "kimi-k2.6" while callers pass
   * "kimi-k2.6:cloud" (or the reverse), so several spellings are tried.
   *
   * @param {string} agentName - Agent to look up.
   * @param {string} modelId - Model identifier as used by the caller.
   * @returns {*} The stored score, or `null` when the agent is unknown, the
   *   model id is not a string, or no spelling of the id has a score.
   */
  getScore(agentName, modelId) {
    const agent = this.agents[agentName];
    if (!agent) return null;
    // A change may omit `from`/`to`; report "no score" instead of throwing.
    if (typeof modelId !== 'string') return null;

    const dashed = modelId.replace(/:/g, '-');
    // NOTE(review): this "--cloud" -> "-2.6" rewrite looks like a leftover for one
    // specific model ID; kept so existing lookups behave as before — confirm whether
    // it can be removed.
    const legacy = dashed.replace(/--cloud$/, '-2.6');
    // The ID without a trailing ":cloud" / "-cloud" suffix.
    const bare = modelId.replace(/[:-]cloud$/, '');

    // Original candidates first (same order as before), then the extra spellings.
    const candidates = new Set([
      legacy,
      modelId,
      `${modelId}-cloud`,
      dashed,
      bare,
      `${bare}:cloud`,
      `${bare}-cloud`,
    ]);
    for (const key of candidates) {
      const score = agent.scores?.[key];
      if (score !== undefined) return score;
    }
    return null;
  }

  /**
   * Validate a single model change for one agent.
   *
   * @param {string} agentName - Agent whose model is changing.
   * @param {string} fromModel - Currently assigned model id.
   * @param {string} toModel - Proposed model id.
   * @returns {object} `{ acceptable, reason?, oldScore?, newScore?, delta?, status? }`.
   *   `status` ('upgrade' | 'same' | 'minor_regression') is set only when acceptable;
   *   `reason` only when rejected.
   */
  validateChange(agentName, fromModel, toModel) {
    const agent = this.agents[agentName];
    if (!agent) return { acceptable: false, reason: `Agent "${agentName}" not found in benchmarks` };

    const oldScore = this.getScore(agentName, fromModel);
    const newScore = this.getScore(agentName, toModel);

    if (oldScore === null) {
      return { acceptable: false, reason: `No score for "${fromModel}" on agent "${agentName}"` };
    }
    if (newScore === null) {
      return { acceptable: false, reason: `No score for "${toModel}" on agent "${agentName}"` };
    }

    const delta = newScore - oldScore;

    if (newScore < this.minScore) {
      return {
        acceptable: false,
        reason: `Score ${newScore} below global minimum ${this.minScore}`,
        oldScore, newScore, delta
      };
    }

    if (newScore < oldScore - this.maxRegression) {
      return {
        acceptable: false,
        reason: `Regression ${oldScore} -> ${newScore} (delta ${delta}) exceeds max allowed regression of ${this.maxRegression}`,
        oldScore, newScore, delta
      };
    }

    let status = 'minor_regression';
    if (newScore > oldScore) status = 'upgrade';
    else if (newScore === oldScore) status = 'same';

    return { acceptable: true, oldScore, newScore, delta, status };
  }

  /**
   * Validate a list of changes.
   *
   * @param {Array<{agent: string, from: string, to: string}>} changes
   * @returns {{results: object[], rejections: object[], passed: boolean}}
   *   Every entry (in both lists) is the original change merged with its
   *   validation result, so rejections still carry `agent`, `from` and `to`.
   */
  validateAllChanges(changes) {
    const results = [];
    const rejections = [];

    for (const change of changes) {
      const entry = { ...change, ...this.validateChange(change.agent, change.from, change.to) };
      results.push(entry);
      // Push the merged entry (not the bare result) so the agent name is available to printDiff.
      if (!entry.acceptable) rejections.push(entry);
    }

    return { results, rejections, passed: rejections.length === 0 };
  }

  /**
   * Print a table of all changes, a summary line, and the list of rejections.
   *
   * @param {{results: object[], rejections: object[]}} report - Output of validateAllChanges.
   */
  printDiff(report) {
    console.log('\n=== Model Change Diff Report ===');
    console.log(
      'Agent'.padEnd(25),
      'Old Model'.padEnd(25),
      'Old Score'.padEnd(10),
      'New Model'.padEnd(25),
      'New Score'.padEnd(10),
      'Status'
    );
    console.log('-'.repeat(115));

    for (const r of report.results) {
      let status;
      if (!r.acceptable) status = `⛔ REJECTED: ${r.reason}`;
      else if (r.delta > 0) status = '✅ UPGRADE';
      else if (r.delta === 0) status = '➖ SAME';
      else status = `⚠️ MINOR (${r.delta})`;

      console.log(
        String(r.agent ?? '-').padEnd(25),
        (r.from || '-').padEnd(25),
        (r.oldScore ?? '-').toString().padEnd(10),
        (r.to || '-').padEnd(25),
        (r.newScore ?? '-').toString().padEnd(10),
        status
      );
    }

    console.log('-'.repeat(115));
    // Count only accepted entries here; rejected ones are reported separately,
    // so the four totals add up to the number of changes.
    const accepted = report.results.filter(r => r.acceptable);
    const upgrades = accepted.filter(r => r.delta > 0).length;
    const downgrades = accepted.filter(r => r.delta < 0).length;
    const same = accepted.filter(r => r.delta === 0).length;
    const rejected = report.rejections.length;

    console.log(`Upgrades: ${upgrades} | Minor regressions: ${downgrades} | Same: ${same} | Rejected: ${rejected}`);

    if (rejected > 0) {
      console.log('\n⛔ REJECTIONS (sync blocked):');
      for (const r of report.rejections) {
        console.log(`  - ${r.agent}: ${r.reason}`);
      }
      console.log('\nNo files were modified. Fix the source data or adjust thresholds (not recommended).');
    }
  }
}

/**
 * Build a FitnessGate from the benchmark file at BENCHMARKS_PATH.
 *
 * @param {object} [options] - Passed through unchanged to the FitnessGate constructor.
 * @returns {FitnessGate} Gate backed by the parsed benchmark data.
 */
function loadGate(options = {}) {
  const rawJson = fs.readFileSync(BENCHMARKS_PATH, 'utf8');
  const benchmarks = JSON.parse(rawJson);
  return new FitnessGate(benchmarks, options);
}

/**
 * One-call helper: load the gate, validate every change, and print the diff report.
 *
 * @param {Array<object>} changes - Changes handed to FitnessGate#validateAllChanges.
 * @param {object} [options] - Passed through to loadGate.
 * @returns {object} The report produced by validateAllChanges.
 */
function runGate(changes, options = {}) {
  const fitnessGate = loadGate(options);
  const outcome = fitnessGate.validateAllChanges(changes);
  fitnessGate.printDiff(outcome);
  return outcome;
}

// Public API: the gate class plus the two convenience helpers.
module.exports = { FitnessGate, loadGate, runGate };