- Restore all 30 agents to the optimal models from the v3.html heatmap:
* frontend-developer: qwen3-coder -> minimax-m2.5 (92★)
* devops-engineer: nemotron-3-super -> kimi-k2.6:cloud (88★)
* browser-automation: qwen3-coder -> kimi-k2.6:cloud (86★)
* agent-architect: glm-5.1 -> kimi-k2.6:cloud (86★)
- Add Model Evolution Guard system:
* agent-evolution/scripts/lib/fitness-gate.cjs
* Rejects downgrades >3 points or below score 75
* Produces detailed diff report before any file modifications
* Normalized model ID lookup (v3.html ':' vs JSON '-')
- Update sync-benchmarks-from-yaml.cjs with fitness gate
- Update model-benchmarks.json with v3 optimal assignments
- Rebuild research-dashboard.html (104KB, 30 agents, 11 models)
- Add model-evolution-guard.md architecture documentation
- Add v3-optimal-models.json as source-of-truth reference
Fixes the regression introduced by commit 3badb25, in which models were
silently downgraded from their heatmap-optimal assignments to inferior ones.
164 lines
5.3 KiB
JavaScript
164 lines
5.3 KiB
JavaScript
const fs = require('fs');
|
|
const { runGate } = require('./lib/fitness-gate.cjs');
|
|
|
|
/**
 * Parse the capability index: a small YAML subset with 2-space indentation.
 *
 * Recognised line shapes:
 *   - indent 2, `name:`       starts a new section (candidate agent)
 *   - indent 4, `key:`        opens a list on the current section
 *   - indent 4, `key: value`  stores a string property on the current section
 *   - indent >= 6, `- item`   appends to the most recently opened list
 *
 * Blank lines and `#` comment lines are skipped, so they neither terminate an
 * open list nor get mistaken for sections or properties. Lines at indent 4 or
 * deeper that appear before any section header are ignored instead of
 * throwing.
 *
 * @param {string} text - Raw YAML text (LF or CRLF line endings).
 * @returns {Object<string, Object>} Sections keyed by name, restricted to
 *   those that carry `model` or at least one of the known agent list keys.
 */
function parseCapabilityIndex(text) {
  const agentKeys = [
    'model',
    'capabilities',
    'receives',
    'produces',
    'forbidden',
    'delegates_to',
    'fallback_models',
  ];

  const sections = {};
  let current = null; // section object currently being filled
  let openList = null; // array that receives subsequent `- item` lines

  for (const line of text.split(/\r?\n/)) {
    const trimmed = line.trim();
    // Blank and comment lines carry no structure; skipping them keeps an
    // open list alive across visual spacing.
    if (trimmed === '' || trimmed.startsWith('#')) continue;

    const indent = line.length - line.trimStart().length;
    const isKeyLine = !trimmed.startsWith('-');

    if (indent === 2 && isKeyLine && trimmed.endsWith(':')) {
      current = {};
      sections[trimmed.slice(0, -1)] = current;
      openList = null;
      continue;
    }

    if (indent === 4 && isKeyLine && trimmed.includes(':')) {
      // A property with no owning section has nowhere to go.
      if (current === null) continue;

      if (trimmed.endsWith(':')) {
        const key = trimmed.slice(0, -1);
        if (!Array.isArray(current[key])) current[key] = [];
        openList = current[key];
      } else {
        // Split on the first colon only, so values may contain ':' themselves
        // (e.g. model ids such as `kimi-k2.6:cloud`).
        const separator = trimmed.indexOf(':');
        const key = trimmed.slice(0, separator).trim();
        current[key] = trimmed.slice(separator + 1).trim();
        openList = null;
      }
      continue;
    }

    if (indent >= 6 && trimmed.startsWith('- ')) {
      if (openList !== null) openList.push(trimmed.slice(2).trim());
      continue;
    }

    // Any other shallow line closes the open list.
    if (indent < 4) openList = null;
  }

  const result = {};
  for (const [name, data] of Object.entries(sections)) {
    if (agentKeys.some((key) => key in data)) result[name] = data;
  }
  return result;
}
|
|
|
|
// Load the agent definitions from the capability index (path is relative to
// the current working directory).
const capabilityIndexText = fs.readFileSync('.kilo/capability-index.yaml', 'utf8');
const parsed = parseCapabilityIndex(capabilityIndexText);
console.log('Parsed agents:', Object.keys(parsed).length);

// Read existing benchmarks
const benchmarksText = fs.readFileSync('agent-evolution/data/model-benchmarks.json', 'utf8');
const bench = JSON.parse(benchmarksText);
|
|
|
|
// === FITNESS GATE: validate model changes ===
// Snapshot the model each agent had in the existing benchmarks file. A Map is
// used so agent names that collide with Object.prototype members (e.g.
// "constructor") cannot produce a bogus "previous model".
const previousModelByAgent = new Map(
  (bench.agent_current_config || []).map((entry) => [entry.agent, entry.model]),
);

// Collect agents whose model differs between the old config and the YAML.
// Both sides are compared with the provider prefix stripped, i.e. in the same
// form that is handed to the gate, so a prefix-only difference is not reported
// as a change from a model to itself. Agents absent from the old config are
// not gated.
const modelChanges = [];
for (const [agent, data] of Object.entries(parsed)) {
  const previousModel = previousModelByAgent.get(agent);
  if (!previousModel) continue;

  const from = previousModel.replace('ollama-cloud/', '');
  const to = (data.model || '').replace('ollama-cloud/', '');
  if (from !== to) modelChanges.push({ agent, from, to });
}

if (modelChanges.length > 0) {
  console.log('\nDetected model changes:', modelChanges.length);
  const report = runGate(modelChanges);

  // Abort before anything is written when the gate does not pass.
  if (!report.passed) {
    console.error('\n⛔ FITNESS GATE REJECTED the sync. No files modified.');
    console.error('If you intend to downgrade, update the source scores in model-benchmarks.json first.');
    process.exit(1);
  }

  console.log('\n✅ All model changes passed fitness gate. Proceeding...');
}
|
|
|
|
// Update agent_current_config
// Badge families as [substring, badge] pairs, checked in order; the first
// substring found in the model id wins and anything unmatched falls back to
// 'groq'.
const BADGE_FAMILIES = [
  ['qwen3', 'qwen'],
  ['minimax', 'minimax'],
  ['nemotron', 'nemotron'],
  ['glm', 'glm'],
  ['kimi', 'kimi'],
  ['deepseek', 'deepseek'],
];

bench.agent_current_config = Object.entries(parsed).map((entry) => {
  const [agent, data] = entry;
  const model = data.model || '';
  const modelId = model.replace('ollama-cloud/', '');

  const family = BADGE_FAMILIES.find(([needle]) => modelId.includes(needle));
  const badgeType = family ? family[1] : 'groq';

  // mode "all" or an explicit cloud prefix both map to the cloud provider.
  let provider = 'Ollama';
  if (data.mode === 'all' || model.startsWith('ollama-cloud/')) {
    provider = 'Ollama Cloud';
  }

  return {
    agent,
    model,
    provider,
    category: 'Process',
    badge_type: badgeType,
    fit_score: 0,
    status: 'good',
    previous_model: null,
  };
});
|
|
|
|
// Update agent_model_scores — preserve existing scores, fix current_model_id
// Index the previously stored scores by agent. A Map avoids picking up
// inherited Object.prototype members for unusual agent names; when an agent
// appears more than once the last entry wins.
const priorScoresByAgent = new Map(
  (bench.agent_model_scores || []).map((entry) => [entry.agent, entry.scores || {}]),
);

// Tolerate a benchmarks file without a `models` array, matching the `|| []`
// guards used for the other bench collections.
const knownModels = bench.models || [];

bench.agent_model_scores = Object.entries(parsed).map(([agent, data]) => {
  const modelId = (data.model || '').replace('ollama-cloud/', '');
  return {
    agent,
    // findIndex already yields -1 when the id is not among the known models.
    current_model_index: knownModels.findIndex((model) => model.id === modelId),
    current_model_id: modelId,
    reasoning_effort: data.variant === 'thinking' ? 'H' : 'M',
    scores: priorScoresByAgent.get(agent) || {},
  };
});
|
|
|
|
// Update metadata: stamp the sync time, its source, and the agent count.
Object.assign(bench, {
  generated: new Date().toISOString(),
  source: '.kilo/capability-index.yaml (synced v3 + fitness-gate)',
  total_agents: bench.agent_current_config.length,
});

// Persist the merged benchmarks as 2-space-indented JSON.
const serializedBench = JSON.stringify(bench, null, 2);
fs.writeFileSync('agent-evolution/data/model-benchmarks.json', serializedBench);

console.log('\nSynced', bench.agent_current_config.length, 'agents');
console.log('Generated:', bench.generated);
|
|
|
|
// Verify: each config entry must agree with its score entry on the model id
// (compared with the provider prefix stripped).
const scoreEntryByAgent = new Map();
for (const entry of bench.agent_model_scores) {
  // Keep the first entry per agent, mirroring a first-match search.
  if (!scoreEntryByAgent.has(entry.agent)) {
    scoreEntryByAgent.set(entry.agent, entry);
  }
}

let mismatchCount = 0;
for (const config of bench.agent_current_config) {
  const scoreEntry = scoreEntryByAgent.get(config.agent);
  if (!scoreEntry) continue;

  const expectedId = config.model.replace('ollama-cloud/', '');
  if (scoreEntry.current_model_id === expectedId) continue;

  console.log(' MISMATCH:', config.agent, scoreEntry.current_model_id, '->', config.model);
  mismatchCount += 1;
}

console.log('Mismatches:', mismatchCount);
console.log('\n💡 Tip: If fitness gate rejected changes, verify that model-benchmarks.json has correct heatmap scores before syncing from YAML.');
|