/**
 * Model Evolution Fitness Gate
 *
 * Validates any model assignment change against heatmap-derived scores.
 * Rejects changes that would downgrade agents beyond the regression threshold.
 *
 * Usage:
 *   const { FitnessGate, runGate } = require('./fitness-gate');
 *   // runGate takes the list of proposed changes (NOT the benchmark data);
 *   // benchmarks are loaded from BENCHMARKS_PATH by loadGate().
 *   const report = runGate([
 *     { agent: 'some-agent', from: 'old-model-id', to: 'new-model-id' },
 *   ]);
 *   // report.passed is true only when no change was rejected.
 */
const fs = require('fs');
const path = require('path');

const BENCHMARKS_PATH = path.join(__dirname, '../../data/model-benchmarks.json');
const DEFAULT_MIN_SCORE = 75;
const DEFAULT_MAX_REGRESSION = 3;

/**
 * Build the status cell shown by printDiff for one result row.
 *
 * @param {object} row - A merged change/result entry from validateAllChanges.
 * @returns {string} Human-readable status label.
 */
function formatStatus(row) {
  if (!row.acceptable) return `⛔ REJECTED: ${row.reason}`;
  if (row.delta > 0) return '✅ UPGRADE';
  if (row.delta === 0) return '➖ SAME';
  return `⚠️ MINOR (${row.delta})`;
}

class FitnessGate {
  /**
   * @param {object} benchmarks - Parsed benchmark data. Reads
   *   `agent_model_scores` (entries with `agent` and `scores`) and
   *   `models` (entries with `id`); either list may be absent.
   * @param {object} [options]
   * @param {number} [options.minScore=75] - Lowest score a new model may have.
   * @param {number} [options.maxRegression=3] - Largest tolerated score drop.
   */
  constructor(benchmarks, options = {}) {
    this.benchmarks = benchmarks;
    this.agents = this._buildAgentIndex(benchmarks);
    this.models = this._buildModelIndex(benchmarks);
    this.minScore = options.minScore ?? DEFAULT_MIN_SCORE;
    this.maxRegression = options.maxRegression ?? DEFAULT_MAX_REGRESSION;
  }

  /**
   * Index `data.agent_model_scores` by agent name.
   *
   * @param {object} data - Benchmark data.
   * @returns {object} Map of agent name -> score entry (later duplicates win).
   */
  _buildAgentIndex(data) {
    const map = {};
    (data.agent_model_scores || []).forEach((entry) => {
      map[entry.agent] = entry;
    });
    return map;
  }

  /**
   * Index `data.models` by model id, recording each model's list position.
   *
   * @param {object} data - Benchmark data.
   * @returns {object} Map of model id -> `{ ...model, idx }`.
   */
  _buildModelIndex(data) {
    const map = {};
    (data.models || []).forEach((model, idx) => {
      map[model.id] = { ...model, idx };
    });
    return map;
  }

  /**
   * Look up the score of `modelId` for `agentName`.
   *
   * Callers and the benchmark JSON do not always agree on the model id
   * spelling (e.g. "kimi-k2.6:cloud" vs "kimi-k2.6" vs "kimi-k2.6-cloud"),
   * so several spellings are tried in order; the first one present wins.
   *
   * @param {string} agentName
   * @param {string} modelId
   * @returns {*} The stored score, or null when the agent is unknown, the
   *   model id is not a string, or no spelling of the id has a score.
   */
  getScore(agentName, modelId) {
    const agent = this.agents[agentName];
    if (!agent) return null;
    // A missing/non-string id cannot match any key; report "no score"
    // instead of throwing on .replace().
    if (typeof modelId !== 'string') return null;

    const dashed = modelId.replace(/:/g, '-');
    // Legacy rewrite kept so previously matching ids still resolve.
    // NOTE(review): it only fires for ids ending in "--cloud" after dashing;
    // its intent is unclear - confirm whether it can be dropped.
    const legacy = dashed.replace(/--cloud$/, '-2.6');
    // Id with a trailing ":cloud" / "-cloud" removed, for benchmark files
    // that store the bare model name.
    const base = modelId.replace(/[:-]cloud$/, '');

    const candidates = new Set([
      // Original lookup order first, so existing matches are unchanged.
      legacy,
      modelId,
      `${modelId}-cloud`,
      // Additional spellings.
      dashed,
      base,
      `${base}:cloud`,
      `${base}-cloud`,
    ]);

    for (const key of candidates) {
      const score = agent.scores?.[key];
      if (score !== undefined) return score;
    }
    return null;
  }

  /**
   * Decide whether moving `agentName` from `fromModel` to `toModel` is allowed.
   *
   * A change is rejected when the agent is unknown, either model has no
   * score, the new score is below `minScore`, or the score drops by more
   * than `maxRegression`.
   *
   * @param {string} agentName
   * @param {string} fromModel
   * @param {string} toModel
   * @returns {{acceptable: boolean, reason?: string, oldScore?: number,
   *   newScore?: number, delta?: number, status?: string}} `status` is
   *   'upgrade', 'same' or 'minor_regression' on acceptable changes.
   */
  validateChange(agentName, fromModel, toModel) {
    const agent = this.agents[agentName];
    if (!agent) {
      return { acceptable: false, reason: `Agent "${agentName}" not found in benchmarks` };
    }

    const oldScore = this.getScore(agentName, fromModel);
    const newScore = this.getScore(agentName, toModel);

    if (oldScore === null) {
      return { acceptable: false, reason: `No score for "${fromModel}" on agent "${agentName}"` };
    }
    if (newScore === null) {
      return { acceptable: false, reason: `No score for "${toModel}" on agent "${agentName}"` };
    }

    const delta = newScore - oldScore;

    if (newScore < this.minScore) {
      return {
        acceptable: false,
        reason: `Score ${newScore} below global minimum ${this.minScore}`,
        oldScore,
        newScore,
        delta,
      };
    }

    if (newScore < oldScore - this.maxRegression) {
      return {
        acceptable: false,
        reason: `Regression ${oldScore} -> ${newScore} (delta ${delta}) exceeds max allowed regression of ${this.maxRegression}`,
        oldScore,
        newScore,
        delta,
      };
    }

    let status = 'minor_regression';
    if (newScore > oldScore) status = 'upgrade';
    else if (newScore === oldScore) status = 'same';

    return { acceptable: true, oldScore, newScore, delta, status };
  }

  /**
   * Validate a batch of changes.
   *
   * @param {Iterable<{agent: string, from: string, to: string}>} changes
   * @returns {{results: object[], rejections: object[], passed: boolean}}
   *   `results` holds every change merged with its validation result;
   *   `rejections` holds the same merged entries for rejected changes
   *   (so each rejection carries `agent`, `from` and `to`); `passed` is
   *   true when nothing was rejected.
   */
  validateAllChanges(changes) {
    const results = [];
    const rejections = [];
    for (const change of changes) {
      const verdict = this.validateChange(change.agent, change.from, change.to);
      const entry = { ...change, ...verdict };
      results.push(entry);
      // Push the merged entry, not the bare verdict: printDiff needs `agent`.
      if (!verdict.acceptable) rejections.push(entry);
    }
    return { results, rejections, passed: rejections.length === 0 };
  }

  /**
   * Print a table of all results plus a summary line to the console.
   *
   * Summary buckets are mutually exclusive: a rejected change is counted
   * only under "Rejected", never also as an upgrade or "same".
   *
   * @param {{results: object[], rejections: object[]}} report - As returned
   *   by validateAllChanges.
   * @returns {void}
   */
  printDiff(report) {
    console.log('\n=== Model Change Diff Report ===');
    console.log(
      'Agent'.padEnd(25),
      'Old Model'.padEnd(25),
      'Old Score'.padEnd(10),
      'New Model'.padEnd(25),
      'New Score'.padEnd(10),
      'Status'
    );
    console.log('-'.repeat(115));

    for (const row of report.results) {
      console.log(
        // String(...) so a row without an agent name cannot crash the report.
        String(row.agent ?? '-').padEnd(25),
        (row.from || '-').padEnd(25),
        (row.oldScore ?? '-').toString().padEnd(10),
        (row.to || '-').padEnd(25),
        (row.newScore ?? '-').toString().padEnd(10),
        formatStatus(row)
      );
    }
    console.log('-'.repeat(115));

    const accepted = report.results.filter((row) => row.acceptable);
    const upgrades = accepted.filter((row) => row.delta > 0).length;
    const downgrades = accepted.filter((row) => row.delta < 0).length;
    const same = accepted.filter((row) => row.delta === 0).length;
    const rejected = report.rejections.length;

    console.log(`Upgrades: ${upgrades} | Minor regressions: ${downgrades} | Same: ${same} | Rejected: ${rejected}`);

    if (rejected > 0) {
      console.log('\n⛔ REJECTIONS (sync blocked):');
      for (const row of report.rejections) {
        console.log(` - ${row.agent ?? '(unknown agent)'}: ${row.reason}`);
      }
      console.log('\nNo files were modified. Fix the source data or adjust thresholds (not recommended).');
    }
  }
}

/**
 * Convenience: load benchmarks from the default path and create a gate.
 *
 * @param {object} [options] - Forwarded to the FitnessGate constructor.
 * @returns {FitnessGate}
 * @throws If the benchmark file cannot be read or is not valid JSON.
 */
function loadGate(options = {}) {
  const data = JSON.parse(fs.readFileSync(BENCHMARKS_PATH, 'utf8'));
  return new FitnessGate(data, options);
}

/**
 * Convenience: validate + print diff in one call.
 *
 * @param {Iterable<{agent: string, from: string, to: string}>} changes
 * @param {object} [options] - Forwarded to the FitnessGate constructor.
 * @returns {{results: object[], rejections: object[], passed: boolean}}
 */
function runGate(changes, options = {}) {
  const gate = loadGate(options);
  const report = gate.validateAllChanges(changes);
  gate.printDiff(report);
  return report;
}

module.exports = { FitnessGate, loadGate, runGate };