feat(evolution): add real-fit dashboard, API, report builder, and docker compose

- real-fit.html: API-driven research dashboard with agent/model heatmap, detail modal with score breakdown and evaluator commentary
- api.py: FastAPI backend serving /api/real-fit-report (dynamic from SQLite), /api/research, /api/evolve-agent/start
- rebuild-report.py: generates real-fit-report.json from SQLite DB for static fallback
- docker-compose.yml: add evolution-api service (Python 3.12, uvicorn) for research endpoints
- index.standalone.html: sync with dashboard data updates
- archive/index.html: standalone dashboard snapshot (263KB)
- .gitignore: exclude *.db, research-jobs.json from tracking
2026-05-28 11:55:49 +01:00
parent dbbf4c32e1
commit b95fd41587
13 changed files with 8886 additions and 353 deletions
--- a/agent-evolution/scripts/audit-system.cjs
+++ b/agent-evolution/scripts/audit-system.cjs
@@ -0,0 +1,138 @@
+const fs = require('fs');
+
+// Parse `---`-fenced frontmatter into a flat {key: value} map of strings; null when a fence is missing.
+function parseFrontmatter(content) {
+  if (!content.startsWith('---')) return null;
+  const end = content.indexOf('---', 3);
+  if (end === -1) return null;
+  const data = {};
+  for (const line of content.slice(3, end).trim().split(/\r?\n/)) { // CRLF-safe: `.` never matches '\r'
+    const m = line.match(/^(\w+):\s*(.+)$/);
+    if (m) data[m[1]] = m[2].trim();
+  }
+  return data;
+}
+
+function stripComments(str) {
+  // Drop `//` and `/* */` comments; string literals are matched first and kept, so "https://…" survives
+  return str.replace(/("(?:[^"\\]|\\.)*")|\/\/.*$|\/\*[\s\S]*?\*\//gm, (match, literal) => literal ?? '');
+}
+
+// Model mismatches between config sources, collected by the later checks
+const issues = [];
+
+/**
+ * Collect one record per `.md` file in `dir` whose frontmatter declares a model.
+ * Files without frontmatter, or without a `model` key, are skipped.
+ * `fallbackMode` is used when the frontmatter has no `mode`.
+ * The record name is the file name with its first '.md' removed.
+ * Returns an array of { name, model, mode, source, description } objects.
+ */
+function collectDefinitions(dir, fallbackMode) {
+  const found = [];
+  const markdownFiles = fs.readdirSync(dir).filter(file => file.endsWith('.md'));
+  for (const file of markdownFiles) {
+    const source = dir + '/' + file;
+    const frontmatter = parseFrontmatter(fs.readFileSync(source, 'utf8'));
+    if (!frontmatter || !frontmatter.model) continue;
+    found.push({
+      name: file.replace('.md', ''),
+      model: frontmatter.model,
+      mode: frontmatter.mode || fallbackMode,
+      source,
+      description: frontmatter.description || ''
+    });
+  }
+  return found;
+}
+
+// 1. Parse agent .md files
+const agents = collectDefinitions('.kilo/agents', 'subagent');
+
+// 2. Parse command .md files
+const commands = collectDefinitions('.kilo/commands', 'command');
+
+// 3. Parse kilo-meta.json
+const meta = JSON.parse(fs.readFileSync('kilo-meta.json', 'utf8'));
+
+// For every agent/command that kilo-meta.json also lists, remember the meta model on the
+// entry (`metaModel`) and report an issue when it differs from the .md frontmatter model.
+const metaSections = [[agents, 'agents', 'AGENT'], [commands, 'commands', 'CMD']];
+for (const [entries, section, label] of metaSections) {
+  for (const entry of entries) {
+    const listed = meta[section]?.[entry.name];
+    if (!listed) continue;
+    entry.metaModel = listed.model;
+    if (entry.model !== listed.model) {
+      issues.push(`${label} ${entry.name}: .md=${entry.model} vs meta=${listed.model}`);
+    }
+  }
+}
+
+// Cross-check the per-name `model` values under a parsed config's `agent` key against
+// `entries`: store the config model on the matching entry as `field`, and record an
+// issue (prefixed `label`, naming `origin`) when it differs from the .md model.
+function crossCheckConfig(config, entries, field, label, origin) {
+  for (const [name, cfg] of Object.entries(config.agent || {})) {
+    if (!cfg.model) continue;
+    const entry = entries.find(e => e.name === name);
+    if (!entry) continue;
+    entry[field] = cfg.model;
+    if (entry.model !== cfg.model) issues.push(`${label} ${name}: .md=${entry.model} vs ${origin}=${cfg.model}`);
+  }
+}
+
+// 4. Parse .kilo/kilo.jsonc
+const dotKiloRaw = stripComments(fs.readFileSync('.kilo/kilo.jsonc', 'utf8'));
+const dotKilo = JSON.parse(dotKiloRaw);
+crossCheckConfig(dotKilo, agents, 'kiloModel', 'AGENT', '.kilo/kilo.jsonc');
+
+// 5. Parse root kilo.jsonc
+// NOTE(review): reads the `agent` section but matches command names — confirm this is intended
+const rootKiloRaw = stripComments(fs.readFileSync('kilo.jsonc', 'utf8'));
+const rootKilo = JSON.parse(rootKiloRaw);
+crossCheckConfig(rootKilo, commands, 'rootModel', 'CMD', 'kilo.jsonc');
+
+// 6. Check non-ollama: entries whose model id lacks the `ollama-cloud/` prefix
+const outsideOllama = entry => !entry.model.startsWith('ollama-cloud/');
+const nonOllama = [
+  ...agents.filter(outsideOllama).map(a => ({ type: 'agent', name: a.name, model: a.model })),
+  ...commands.filter(outsideOllama).map(c => ({ type: 'command', name: c.name, model: c.model })),
+];
+
+// 7. Summary by model: number of agents plus commands using each model id
+const modelStats = {};
+for (const { model } of [...agents, ...commands]) modelStats[model] = (modelStats[model] || 0) + 1;
+
+const state = {
+  generated: new Date().toISOString(),
+  totalAgents: agents.length,
+  totalCommands: commands.length,
+  allOllama: nonOllama.length === 0,
+  modelDistribution: modelStats,
+  agents: agents.sort((a, b) => a.name.localeCompare(b.name)), // Array#sort reorders `agents` in place
+  commands: commands.sort((a, b) => a.name.localeCompare(b.name)),
+  issues, nonOllama,
+};
+
+fs.writeFileSync('agent-evolution/data/real-state.json', JSON.stringify(state, null, 2) + '\n');
+
+// Console report of the state that was just written to real-state.json
+console.log('=== REAL SYSTEM STATE ===');
+console.log('Generated:', state.generated);
+console.log('Agents:', state.totalAgents);
+console.log('Commands:', state.totalCommands);
+console.log('All ollama-cloud/:', state.allOllama ? 'YES' : 'NO (' + nonOllama.length + ' exceptions)');
+console.log('\n=== MODEL DISTRIBUTION ===');
+for (const [m, c] of Object.entries(modelStats).sort((a,b) => b[1]-a[1])) { // most used model first
+  console.log(`  ${m}: ${c}`);
+}
+if (issues.length > 0) {
+  console.log('\n=== ISSUES ===');
+  issues.forEach(i => console.log('  ⚠️', i));
+}
+if (nonOllama.length > 0) {
+  console.log('\n=== NON-OLLAMA ===');
+  nonOllama.forEach(n => console.log('  ❌', n.type, n.name, n.model));
+}
+console.log('\n✅ State written to agent-evolution/data/real-state.json');
--- a/agent-evolution/scripts/merge-real-fit.cjs
+++ b/agent-evolution/scripts/merge-real-fit.cjs
@@ -0,0 +1,29 @@
+const fs = require('fs');
+const path = require('path');
+
+// Input files; the merged result overwrites the dashboard data file (OUT is the same path as DASH)
+const DASH = path.join(__dirname, '../data/dashboard-data.json');
+const REAL = path.join(__dirname, '../data/real-fit-report.json');
+const OUT  = DASH;
+const readJson = file => JSON.parse(fs.readFileSync(file, 'utf-8'));
+const dash = readJson(DASH);
+const real = readJson(REAL);
+
+// Inject real_evaluations into each agent
+for (const agent of dash.agents) {
+    const measured = real.agents?.[agent.name];
+    if (!measured || !measured.evaluations) {
+        agent.real_evaluations = {};
+        continue;
+    }
+    agent.real_evaluations = measured.evaluations;
+    agent.real_best_model = measured.best_model;
+    agent.real_best_score = measured.best_score;
+}
+
+// Add metadata
+Object.assign(dash, { real_fit_generated: real.generated, real_fit_source: real.source });
+
+fs.writeFileSync(OUT, JSON.stringify(dash, null, 2));
+console.log('Merged real-fit data into ' + OUT);
+console.log('Agents with real evals:', dash.agents.filter(a => Object.keys(a.real_evaluations||{}).length > 0).length);
--- a/agent-evolution/scripts/patch-heatmap.js
+++ b/agent-evolution/scripts/patch-heatmap.js
@@ -0,0 +1,98 @@
+const fs = require('fs');
+const path = require('path');
+
+const INDEX = path.join(__dirname, '../index.standalone.html'); // dashboard page that this script reads and rewrites
+
+// 1. New renderHeatmap (browser source kept as a string, not run here): fills #hmTable from window.dashboardData plus window.realFitData
+const newRenderHeatmap = `function renderHeatmap() {
+    const esc = str => (str || '').replace(/[&<>"']/g, m => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[m]));
+    const dd = window.dashboardData;
+
+    // Merge real-fit if loaded
+    const rf = window.realFitData || {};
+    const realAgents = rf.agents || {};
+
+    if (!dd || !dd.agents) {
+        document.getElementById('hmTable').innerHTML = '<tr><td style="color:var(--text-secondary);padding:20px;text-align:center;">⚠️ No data. Run analysis.</td></tr>';
+        return;
+    }
+
+    // Build model list from real-fit (cross-model) + current dashboard data
+    const modelsSeen = new Set();
+    dd.agents.forEach(a => { modelsSeen.add(a.model_short); });
+    Object.values(realAgents).forEach(a => { Object.keys(a.evaluations || {}).forEach(m => modelsSeen.add(m)); });
+    const modelList = Array.from(modelsSeen).filter(m => m && m !== 'code-skeptic');
+
+    const t = document.getElementById('hmTable');
+    let h = '<thead><tr><th class="hm-role">Agent</th>';
+    modelList.forEach(m => {
+        h += '<th style="writing-mode:vertical-lr;transform:rotate(180deg);max-width:32px;font-size:.56em;padding:3px 1px;">' + esc(m) + '</th>';
+    });
+    h += '<th>Best</th><th>Score</th></tr></thead><tbody>';
+
+    dd.agents.forEach(a => {
+        const realAgent = realAgents[a.name];
+        h += '<tr><td class="hm-r">' + esc(a.name) + '</td>';
+        modelList.forEach(m => {
+            let score = 0;
+            if (realAgent && realAgent.evaluations && realAgent.evaluations[m] > 0) {
+                score = Math.round(realAgent.evaluations[m]);
+            }
+            const isCurrent = a.model_short === m;
+            let cls = 'na';
+            if (score >= 90) cls = 'high';
+            else if (score >= 75) cls = 'good';
+            else if (score >= 50) cls = 'med';
+            else if (score > 0) cls = 'low';
+            const display = score > 0 ? score : (isCurrent ? Math.round(a.fit_score || 0) : '·');
+            const curStyle = isCurrent ? 'box-shadow:inset 0 0 0 2px var(--accent-cyan)' : '';
+            h += '<td class="score ' + cls + '" style="' + curStyle + '">' + display + '</td>';
+        });
+        const bestModel = realAgent ? (realAgent.best_model || a.model_short) : a.model_short;
+        const bestScore = realAgent ? Math.round(realAgent.best_score || 0) : Math.round(a.fit_score || 0);
+        h += '<td>' + esc(bestModel) + '</td><td style="font-weight:700">' + bestScore + '</td></tr>';
+    });
+    t.innerHTML = h + '</tbody>';
+}`;
+
+// 2. loadRealFitData: snippet spliced in after the dashboard load; it fetches data/real-fit-report.json into window.realFitData (a failed fetch only logs a warning)
+const loadRealFitData = `
+        // Load real-fit report for cross-model evaluation
+        try {
+            const rfRes = await fetch('data/real-fit-report.json');
+            if (rfRes.ok) window.realFitData = await rfRes.json();
+        } catch(e) { console.warn('real-fit-report.json not loaded:', e.message); }
+`;
+
+let html = fs.readFileSync(INDEX, 'utf-8');
+// Patch A: replace renderHeatmap. The legacy regex is only tried on a page that does not
+// carry MARKER yet: on an already patched page it would stop at the function's second `}`.
+const MARKER = '// Render Heatmap (real-fit enabled)\n';
+const oldPattern = /\/\/ Render Heatmap[\s\S]*?function renderHeatmap\(\)\s*\{[^}]*\{[^}]*\}[^}]*\}/;
+const oldMatch = html.includes(MARKER) ? null : html.match(oldPattern);
+if (html.includes(newRenderHeatmap)) {
+    console.log('renderHeatmap already patched');
+} else if (oldMatch) {
+    html = html.substring(0, oldMatch.index) + MARKER + newRenderHeatmap + html.substring(oldMatch.index + oldMatch[0].length);
+    console.log('Patched renderHeatmap');
+} else {
+    console.log('Pattern A not found, trying fallback...');
+    const start = html.indexOf('function renderHeatmap() {');
+    let brace = 0, end = -1; // end stays -1 when the function is missing or never closes
+    for (let i = start; start !== -1 && i < html.length; i++) {
+        if (html[i] === '{') brace++;
+        else if (html[i] === '}' && --brace === 0) { end = i + 1; break; }
+    }
+    if (end !== -1) html = html.substring(0, start) + newRenderHeatmap + '\n' + html.substring(end);
+    console.log(end !== -1 ? 'Patched renderHeatmap (fallback)' : 'renderHeatmap not found, left unchanged');
+}
+
+// Patch B: insert the real-fit loader after the dashboard load, unless it is already there
+const dashLoadPattern = /window\.dashboardData = await dashRes\.json\(\);/;
+if (!html.includes(loadRealFitData.trim()) && dashLoadPattern.test(html)) {
+    html = html.replace(dashLoadPattern, found => found + '\n' + loadRealFitData.trim());
+    console.log('Patched init() to load real-fit data');
+}
+
+fs.writeFileSync(INDEX, html);
+console.log('Done — ' + (fs.statSync(INDEX).size / 1024).toFixed(1) + ' KB');
--- a/agent-evolution/scripts/rebuild-report.py
+++ b/agent-evolution/scripts/rebuild-report.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""
+Rebuild real-fit-report.json from SQLite DB.
+
+Usage:
+    python3 rebuild-report.py
+    python3 rebuild-report.py --db /path/to/real-fit.db --report /path/to/real-fit-report.json
+"""
+
+import argparse
+import json
+import sqlite3
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+
+def _sync_agents_from_meta(db_path: Path) -> None:
+    """Import any missing agents from kilo-meta.json into the DB agents table.
+
+    kilo-meta.json is looked up at ``db_path.parent.parent.parent``; nothing
+    happens when it is absent. Agents already present in the DB are left as is.
+    """
+    meta_path = db_path.parent.parent.parent / "kilo-meta.json"
+    if not meta_path.exists():
+        return
+    with open(meta_path, encoding="utf-8") as f:
+        meta = json.load(f)
+
+    conn = sqlite3.connect(str(db_path))
+    try:  # always release the connection, even when a query raises
+        cursor = conn.cursor()
+        cursor.execute("SELECT name FROM agents")
+        existing = {r[0] for r in cursor.fetchall()}
+        added = datetime.now(timezone.utc).isoformat()
+        for name, info in meta.get("agents", {}).items():
+            if name in existing:
+                continue
+            cursor.execute(
+                "INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
+                (name, info.get("description", ""), info.get("category", "meta"),
+                 info.get("model", ""), info.get("color", "#6B7280"), added),
+            )
+        conn.commit()
+    finally:
+        conn.close()
+
+
+def build_report(db_path: Path) -> dict:
+    """Build the real-fit report dictionary from the SQLite DB at ``db_path``.
+
+    Agents missing from the DB are first imported from kilo-meta.json. One score is
+    kept per (agent, model) pair: evaluations with a non-positive score, a
+    ``rubric_v1`` evaluator, or an empty / ``[HTTP ...`` response are ignored; among
+    the rest ``evolution-skeptic`` beats ``rubric_v2`` beats any other evaluator,
+    and the highest score wins within the same evaluator.
+
+    Returns:
+        dict with keys ``generated`` (UTC timestamp string), ``source``,
+        ``total_evaluations`` (number of kept agent/model pairs), ``agents``
+        (one entry per row of the agents table) and ``fit_scores`` (the
+        best-scoring kept model for each agent that has kept evaluations).
+    """
+    _sync_agents_from_meta(db_path)
+    conn = sqlite3.connect(str(db_path))
+    conn.row_factory = sqlite3.Row
+    try:  # release the connection even when a query raises
+        cursor = conn.cursor()
+
+        cursor.execute("""
+            SELECT name, description, category, current_model
+            FROM agents
+        """)
+        agents_meta = {row["name"]: dict(row) for row in cursor.fetchall()}
+
+        # Skip failed runs (empty or '[HTTP ' responses) and rubric_v1 rows; order the rest so
+        # the preferred evaluator, then the highest score, comes first per agent/model.
+        cursor.execute("""
+            SELECT agent_name, model, total_score, evaluator, response
+            FROM evaluations
+            WHERE total_score > 0
+              AND evaluator NOT LIKE '%rubric_v1%'
+              AND (response IS NULL
+                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
+            ORDER BY agent_name, model,
+                CASE evaluator
+                    WHEN 'evolution-skeptic' THEN 0
+                    WHEN 'rubric_v2' THEN 1
+                    ELSE 2
+                END,
+                total_score DESC
+        """)
+        # Rows arrive best-first, so the first one seen per agent/model is the one to keep
+        best_evals = {}
+        for row in cursor.fetchall():
+            agent_scores = best_evals.setdefault(row["agent_name"], {})
+            agent_scores.setdefault(row["model"], row["total_score"])
+    finally:
+        conn.close()
+
+    # fit_scores holds exactly one entry per agent: its top-scoring kept model. It is derived
+    # from best_evals so it always agrees with best_model/best_score in the per-agent report
+    # (writing one entry per agent/model row would leave whichever model happened to come last).
+    fit_scores = {}
+    for agent_name, evals in best_evals.items():
+        top_model = max(evals, key=evals.get)
+        fit_scores[agent_name] = {
+            "model": top_model,
+            "fit": evals[top_model],
+            "explanation": (
+                f"Best model for {agent_name} is {top_model} "
+                f"with score {evals[top_model]:.1f}. "
+                "Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
+            ),
+        }
+
+    agents_report = {}
+    for agent_name, meta in agents_meta.items():
+        evals = best_evals.get(agent_name, {})
+        if evals:
+            best_model = max(evals, key=evals.get)
+            best_score = evals[best_model]
+        else:
+            best_model = ""
+            best_score = 0.0
+        agents_report[agent_name] = {
+            "name": agent_name,
+            "evaluations": evals,
+            "info": [
+                meta.get("description") or "",
+                meta.get("category") or "",
+                meta.get("current_model") or "",
+            ],
+            "best_model": best_model,
+            "best_score": best_score,
+        }
+
+    return {
+        "generated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "source": "real-fit-engine-db-filtered",
+        "total_evaluations": sum(len(evals) for evals in best_evals.values()),
+        "agents": agents_report,
+        "fit_scores": fit_scores,
+    }
+
+
+def main():
+    """Command-line entry point: rebuild the report from the DB and write it as JSON.
+
+    ``--db`` and ``--report`` default to ``real-fit.db`` and
+    ``real-fit-report.json`` inside the ``data`` directory that sits beside
+    this script's own directory.
+    """
+    data_dir = Path(__file__).parent.parent / "data"
+    parser = argparse.ArgumentParser(description="Rebuild real-fit-report.json from DB")
+    parser.add_argument("--db", type=Path, default=data_dir / "real-fit.db", help="Path to SQLite DB")
+    parser.add_argument(
+        "--report", type=Path, default=data_dir / "real-fit-report.json", help="Path to report JSON output"
+    )
+    options = parser.parse_args()
+
+    report = build_report(options.db)
+    target = options.report
+    target.parent.mkdir(parents=True, exist_ok=True)  # create the output directory on demand
+    with open(target, "w", encoding="utf-8") as out:
+        json.dump(report, out, indent=2)
+
+    print(f"Report rebuilt: {target}")
+    print(f"Agents: {len(report['agents'])}, Evaluations: {report['total_evaluations']}")
+
+
+if __name__ == "__main__":
+    main()