diff --git a/.gitignore b/.gitignore
index 10dcd50..845da4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,8 @@ __pycache__/
agent-evolution/data/dashboard-data.json
agent-evolution/data/state.json
agent-evolution/data/model-benchmarks.json.bak
+*.db
+research-jobs.json
landing/api/state.json
landing/api/state.json.bak
landing/api/dashboard-data.json
diff --git a/agent-evolution/api.py b/agent-evolution/api.py
new file mode 100644
index 0000000..6fd4b28
--- /dev/null
+++ b/agent-evolution/api.py
@@ -0,0 +1,491 @@
+"""
+Evolution Research API — FastAPI backend for agent-model evaluation jobs.
+
+Endpoints:
+ POST /api/research → start background evaluation job
+ GET /api/research/{id} → job status & results
+ POST /api/research/cell → evaluate single agent-model pair
+ GET /api/real-fit-report → serve real-fit-report.json (live from DB)
+ GET /api/models → list available models
+ GET /api/evaluation/{agent}/{model} → detailed evaluation record
+ POST /api/evolve-agent/start → start role-fit testing job (evolution-prompt + evolution-skeptic)
+"""
+
+import json
+import os
+import sqlite3
+import subprocess
+import time
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+
+app = FastAPI(title="Evolution Research API", version="1.1.0")
+
+# CORS: any origin may call this API. Credentialed requests stay disabled
+# because a wildcard origin must not be combined with allow_credentials=True,
+# and no endpoint in this module reads cookies or authorization headers.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=False,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Filesystem locations used by the API. Each one can be overridden through the
+# environment variable named in its os.environ.get() call; the fallbacks are
+# absolute paths under /app.
+JOB_STATE_PATH = Path(os.environ.get("JOB_STATE_PATH", "/app/data/research-jobs.json"))
+REPORT_PATH = Path(os.environ.get("REPORT_PATH", "/app/data/real-fit-report.json"))
+META_PATH = Path(os.environ.get("META_PATH", "/app/kilo-meta.json"))
+EVOLUTION_PATH = Path(os.environ.get("EVOLUTION_PATH", "/app/data/evolution.json"))
+ENGINE_PATH = Path(os.environ.get("ENGINE_PATH", "/app/scripts/real-fit-engine.py"))
+# Without REAL_FIT_DB, the database is "real-fit.db" in the report's directory.
+DB_PATH = Path(os.environ.get("REAL_FIT_DB", REPORT_PATH.parent / "real-fit.db"))
+
+
+def _load_json(path: Path) -> dict:
+    """Return the JSON document stored at ``path``, or ``{}`` if it is missing.
+
+    The file is opened directly rather than probed with ``exists()`` first, so
+    a file removed between the check and the read cannot raise. Malformed JSON
+    still raises ``json.JSONDecodeError``.
+    """
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except (FileNotFoundError, NotADirectoryError):
+        # Both mean "no such file"; exists() reported False for either case.
+        return {}
+
+
+def _save_json(path: Path, data: dict) -> None:
+    """Write ``data`` to ``path`` as indented JSON, creating parent directories.
+
+    The document is first written to a uniquely named sibling file and then
+    moved over ``path`` with ``os.replace``, so a concurrent reader sees either
+    the previous content or the complete new content, never a partial file.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
+    try:
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+        os.replace(tmp_path, path)
+    finally:
+        # No-op after a successful replace; removes the leftover on failure.
+        tmp_path.unlink(missing_ok=True)
+
+
+def _load_jobs() -> dict:
+    """Read the persisted job table (job id -> job record) from JOB_STATE_PATH."""
+    jobs = _load_json(JOB_STATE_PATH)
+    return jobs
+
+
+def _save_jobs(jobs: dict) -> None:
+    """Persist the whole job table to JOB_STATE_PATH."""
+    _save_json(path=JOB_STATE_PATH, data=jobs)
+
+
+class ResearchRequest(BaseModel):
+ """Request body for POST /api/research: one agent evaluated against several models."""
+ # Agent name; forwarded to the engine's --evaluate argument.
+ agent: str
+ # Model names; joined with commas for the engine's --models argument.
+ models: list[str]
+
+
+class CellRequest(BaseModel):
+ """Request body for POST /api/research/cell: a single agent-model pair."""
+ # Agent name; forwarded to the engine's --evaluate argument.
+ agent: str
+ # The one model to evaluate for this agent.
+ model: str
+
+
+class EvolveAgentRequest(BaseModel):
+ """Request body for POST /api/evolve-agent/start (same fields as ResearchRequest)."""
+ # Agent whose role fit is tested.
+ agent: str
+ # Models to test for that agent.
+ models: list[str]
+
+
+def _spawn_engine_job(job_id: str, agent: str, models: list[str]) -> None:
+ """Launch a detached helper process that runs real-fit-engine.py for one job.
+
+ The helper is an inline ``python3 -c`` script. It marks the job ``running``
+ in the job-state file, runs the engine with ``--evaluate <agent> --models
+ <comma-separated models> --report``, and finally records ``done`` or
+ ``error`` (chosen by the engine's exit code), progress 100 and the captured
+ stdout/stderr under the job's ``result`` key.
+
+ This function returns as soon as the helper is started; the helper's own
+ stdout and stderr are discarded.
+
+ NOTE(review): ``--report`` presumably makes the engine regenerate the report
+ JSON so results are visible right after evaluation — confirm in the engine.
+ NOTE(review): the helper rewrites the whole job-state file without locking,
+ so it can presumably race with API requests that save jobs — confirm.
+ NOTE(review): the Popen handle is dropped, so the child is never waited on.
+ """
+ model_arg = ",".join(models)
+ # job_id, agent and model_arg are embedded with repr(), so they reach the
+ # helper script as Python string literals rather than as raw code.
+ subprocess.Popen(
+ ["python3", "-c", f"""
+import subprocess, json, time, os
+job_id = {repr(job_id)}
+job_state_path = os.environ.get('JOB_STATE_PATH', '/app/data/research-jobs.json')
+engine_path = os.environ.get('ENGINE_PATH', '/app/scripts/real-fit-engine.py')
+
+def load_jobs():
+ try:
+ with open(job_state_path) as f:
+ return json.load(f)
+ except Exception:
+ return {{}}
+
+def save_jobs(jobs):
+ with open(job_state_path, 'w') as f:
+ json.dump(jobs, f, indent=2)
+
+jobs = load_jobs()
+job = jobs.get(job_id)
+if job:
+ job['status'] = 'running'
+ job['updated_at'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
+ save_jobs(jobs)
+
+cmd = ['python3', engine_path, '--evaluate', {repr(agent)}, '--models', {repr(model_arg)}, '--report']
+proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+stdout, stderr = proc.communicate()
+
+jobs = load_jobs()
+job = jobs.get(job_id)
+if job:
+ job['status'] = 'done' if proc.returncode == 0 else 'error'
+ job['progress'] = 100
+ job['result'] = {{'returncode': proc.returncode, 'stdout': stdout, 'stderr': stderr}}
+ job['updated_at'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
+ save_jobs(jobs)
+"""],
+ stdout=subprocess.DEVNULL,
+ stderr=subprocess.DEVNULL,
+ )
+
+
+@app.get("/api/models")
+def get_models():
+    """List every model name referenced by the agent metadata or evolution data.
+
+    Collects each agent's ``model`` from the metadata file, plus each agent's
+    ``current`` model and all ``recommendations`` models from the evolution
+    file. Empty names are skipped; the result is de-duplicated and sorted.
+    """
+    found = set()
+
+    # Models assigned to agents in the metadata file.
+    for agent_info in _load_json(META_PATH).get("agents", {}).values():
+        name = agent_info.get("model", "")
+        if name:
+            found.add(name)
+
+    # Current and recommended models from the evolution file.
+    for entry in _load_json(EVOLUTION_PATH).get("agents", {}).values():
+        candidates = [entry.get("current", {}).get("model", "")]
+        candidates.extend(rec.get("model", "") for rec in entry.get("recommendations", []))
+        found.update(candidate for candidate in candidates if candidate)
+
+    return {"models": sorted(found)}
+
+
+@app.get("/api/evaluation/{agent}/{model}")
+def get_evaluation(agent: str, model: str):
+    """Return the best stored evaluation for an agent-model pair plus its prompt.
+
+    The highest-scoring evaluation with a positive score is preferred; if none
+    exists, the most recent evaluation of any score is used. Prompt data is
+    looked up by the evaluation's ``prompt_id`` and, when that yields no system
+    prompt, by the latest test prompt stored for the agent. The JSON text
+    columns ``expected_keywords``, ``rubric`` and ``scores`` are decoded before
+    the record is returned; undecodable values become ``[]`` / ``{}``.
+
+    Raises:
+        HTTPException: 404 if the database file does not exist or no
+            evaluation matches the pair.
+    """
+    db_path = str(DB_PATH)
+    if not os.path.exists(db_path):
+        raise HTTPException(status_code=404, detail="Evaluation database not found")
+
+    # Shared SELECT prefixes; each query appends its own filter and ordering.
+    eval_select = """
+        SELECT e.id, e.agent_name, e.model, e.prompt_id,
+               e.response, e.scores, e.total_score, e.explanation,
+               e.evaluator, e.latency_ms, e.tokens_prompt, e.tokens_response, e.evaluated_at
+        FROM evaluations e
+        WHERE e.agent_name = ? AND e.model = ?
+    """
+    prompt_select = (
+        "SELECT system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts"
+    )
+
+    conn = sqlite3.connect(db_path)
+    # try/finally releases the connection on every path, including a failing
+    # query and the 404 raised below.
+    try:
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        # Step 1: best positive-score evaluation, newest first on ties.
+        cursor.execute(
+            eval_select
+            + " AND e.total_score > 0 ORDER BY e.total_score DESC, e.id DESC LIMIT 1",
+            (agent, model),
+        )
+        row = cursor.fetchone()
+        if not row:
+            # Fallback: the latest evaluation, even with a score of 0.
+            cursor.execute(eval_select + " ORDER BY e.id DESC LIMIT 1", (agent, model))
+            row = cursor.fetchone()
+        if not row:
+            raise HTTPException(
+                status_code=404, detail="Evaluation not found for this agent-model pair"
+            )
+        result = dict(row)
+
+        # Step 2: prompt data by prompt_id, else the agent's latest test prompt.
+        prompt_row = None
+        prompt_id = result.get("prompt_id")
+        if prompt_id:
+            cursor.execute(prompt_select + " WHERE id = ?", (prompt_id,))
+            prompt_row = cursor.fetchone()
+        if not (prompt_row and prompt_row["system_prompt"]):
+            cursor.execute(
+                prompt_select + " WHERE agent_name = ? ORDER BY id DESC LIMIT 1",
+                (agent,),
+            )
+            prompt_row = cursor.fetchone()
+
+        prompt = dict(prompt_row) if prompt_row else {}
+    finally:
+        conn.close()
+
+    result["system_prompt"] = prompt.get("system_prompt") or ""
+    result["user_prompt"] = prompt.get("user_prompt") or ""
+    result["expected_keywords"] = prompt.get("expected_keywords") or "[]"
+    result["rubric"] = prompt.get("rubric") or "{}"
+
+    # Decode JSON text columns; keywords default to a list, the others to a dict.
+    for key in ("expected_keywords", "rubric", "scores"):
+        empty = [] if key == "expected_keywords" else {}
+        raw = result.get(key)
+        if isinstance(raw, str):
+            try:
+                result[key] = json.loads(raw)
+            except json.JSONDecodeError:
+                result[key] = empty
+        elif raw is None:
+            result[key] = empty
+
+    return result
+
+
+def _sync_agents_from_meta(db_path: Path, meta_path: Path | None = None) -> None:
+    """Insert agents listed in kilo-meta.json that the DB ``agents`` table lacks.
+
+    Args:
+        db_path: SQLite database containing the ``agents`` table.
+        meta_path: Metadata JSON file. When omitted, ``kilo-meta.json`` three
+            directory levels above ``db_path`` is used.
+
+    Does nothing when the metadata file does not exist. Existing rows are
+    never modified.
+    """
+    if meta_path is None:
+        meta_path = db_path.parent.parent.parent / "kilo-meta.json"
+    if not meta_path.exists():
+        return
+    with open(meta_path, encoding="utf-8") as f:
+        meta = json.load(f)
+
+    conn = sqlite3.connect(str(db_path))
+    # try/finally so the connection is released even if a statement fails
+    # (for example when the agents table is missing).
+    try:
+        cursor = conn.cursor()
+        cursor.execute("SELECT name FROM agents")
+        existing = {r[0] for r in cursor.fetchall()}
+
+        stamp = datetime.now(timezone.utc).isoformat()
+        new_rows = [
+            (
+                name,
+                info.get("description", ""),
+                info.get("category", "meta"),
+                info.get("model", ""),
+                info.get("color", "#6B7280"),
+                stamp,
+            )
+            for name, info in meta.get("agents", {}).items()
+            if name not in existing
+        ]
+        cursor.executemany(
+            "INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
+            new_rows,
+        )
+        conn.commit()
+    finally:
+        conn.close()
+
+
+def _build_report_from_db(db_path: Path) -> dict:
+    """Build the real-fit report from the SQLite database.
+
+    Only evaluations with a positive score are used, excluding ``rubric_v1``
+    evaluators and responses that are empty or contain an ``[HTTP `` error
+    marker. For each agent-model pair the score comes from the preferred
+    evaluator (``evolution-skeptic``, then ``rubric_v2``, then any other) and,
+    within it, the highest score.
+
+    Returns:
+        A dict with ``generated``, ``source``, ``total_evaluations``,
+        ``agents`` (one entry per row of the ``agents`` table) and
+        ``fit_scores`` (best model per agent that has evaluations).
+    """
+    # Use the configured metadata file when present; otherwise let the helper
+    # derive the path from the database location.
+    _sync_agents_from_meta(db_path, META_PATH if META_PATH.exists() else None)
+
+    conn = sqlite3.connect(str(db_path))
+    # try/finally releases the connection even if a query raises.
+    try:
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        cursor.execute("""
+            SELECT name, description, category, current_model
+            FROM agents
+        """)
+        agents_meta = {row["name"]: dict(row) for row in cursor.fetchall()}
+
+        # Rows arrive grouped by agent and model, best candidate first.
+        cursor.execute("""
+            SELECT agent_name, model, total_score, evaluator, response
+            FROM evaluations
+            WHERE total_score > 0
+              AND evaluator NOT LIKE '%rubric_v1%'
+              AND (response IS NULL
+                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
+            ORDER BY agent_name, model,
+                     CASE evaluator
+                         WHEN 'evolution-skeptic' THEN 0
+                         WHEN 'rubric_v2' THEN 1
+                         ELSE 2
+                     END,
+                     total_score DESC
+        """)
+        eval_rows = cursor.fetchall()
+    finally:
+        conn.close()
+
+    # Keep the first row per agent-model pair (setdefault never overwrites).
+    best_evals: dict[str, dict[str, float]] = {}
+    for row in eval_rows:
+        best_evals.setdefault(row["agent_name"], {}).setdefault(row["model"], row["total_score"])
+
+    # One fit entry per agent, derived from the same selected scores so it
+    # always agrees with the agent's best_model / best_score below.
+    fit_scores = {}
+    for agent_name, evals in best_evals.items():
+        top_model = max(evals, key=evals.get)
+        top_score = evals[top_model]
+        fit_scores[agent_name] = {
+            "model": top_model,
+            "fit": top_score,
+            "explanation": (
+                f"Best model for {agent_name} is {top_model} "
+                f"with score {top_score:.1f}. "
+                "Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
+            ),
+        }
+
+    agents_report = {}
+    for agent_name, meta in agents_meta.items():
+        evals = best_evals.get(agent_name, {})
+        if evals:
+            best_model = max(evals, key=evals.get)
+            best_score = evals[best_model]
+        else:
+            best_model = ""
+            best_score = 0.0
+        agents_report[agent_name] = {
+            "name": agent_name,
+            "evaluations": evals,
+            "info": [
+                meta.get("description") or "",
+                meta.get("category") or "",
+                meta.get("current_model") or "",
+            ],
+            "best_model": best_model,
+            "best_score": best_score,
+        }
+
+    return {
+        "generated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+        "source": "real-fit-engine-db-filtered",
+        "total_evaluations": sum(len(evals) for evals in best_evals.values()),
+        "agents": agents_report,
+        "fit_scores": fit_scores,
+    }
+
+
+@app.get("/api/real-fit-report")
+def get_real_fit_report():
+    """Serve the report built from the database, or the report JSON file if no database exists."""
+    if not os.path.exists(str(DB_PATH)):
+        return _load_json(REPORT_PATH)
+    return _build_report_from_db(DB_PATH)
+
+
+@app.post("/api/research")
+def start_research(req: ResearchRequest):
+    """Register a pending evaluation job for one agent and start it in the background.
+
+    Returns the new job id together with the echoed agent and model list.
+    """
+    def _utc_stamp() -> str:
+        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+
+    job_id = str(uuid.uuid4())
+    jobs = _load_jobs()
+    record = {
+        "id": job_id,
+        "agent": req.agent,
+        "models": req.models,
+        "status": "pending",
+        "progress": 0,
+        "result": None,
+        "created_at": _utc_stamp(),
+        "updated_at": _utc_stamp(),
+    }
+    jobs[job_id] = record
+    _save_jobs(jobs)
+
+    # The job record must be on disk before the helper process looks it up.
+    _spawn_engine_job(job_id, req.agent, req.models)
+
+    response = {"job_id": job_id, "status": "pending"}
+    response["agent"] = req.agent
+    response["models"] = req.models
+    return response
+
+
+def _extract_scores_from_report(agent: str, models: list[str]) -> list[dict]:
+    """Look up the report-file scores of ``agent`` for each requested model.
+
+    Returns one ``{"model", "score", "pending"}`` dict per model, in input
+    order. A model absent from the report gets score 0, and any score equal
+    to 0 is flagged as pending.
+    """
+    scores = _load_json(REPORT_PATH).get("agents", {}).get(agent, {}).get("evaluations", {})
+    return [
+        {"model": name, "score": scores.get(name, 0), "pending": scores.get(name, 0) == 0}
+        for name in models
+    ]
+
+
+@app.get("/api/research/{job_id}")
+def get_research(job_id: str):
+    """Return the stored job record; finished jobs also carry per-model scores.
+
+    Raises:
+        HTTPException: 404 when no job with this id is stored.
+    """
+    job = _load_jobs().get(job_id)
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+
+    finished = job.get("status") == "done" and job.get("result") is not None
+    if finished:
+        scored = _extract_scores_from_report(job["agent"], job.get("models", []))
+        job["models_scored"] = scored
+    return job
+
+
+@app.post("/api/research/cell")
+def research_cell(req: CellRequest):
+    """Register and start a background evaluation of a single agent-model pair.
+
+    The job is stored like a multi-model job whose model list has one entry.
+    """
+    def _utc_stamp() -> str:
+        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+
+    job_id = str(uuid.uuid4())
+    jobs = _load_jobs()
+    record = {
+        "id": job_id,
+        "agent": req.agent,
+        "models": [req.model],
+        "status": "pending",
+        "progress": 0,
+        "result": None,
+        "created_at": _utc_stamp(),
+        "updated_at": _utc_stamp(),
+    }
+    jobs[job_id] = record
+    _save_jobs(jobs)
+
+    _spawn_engine_job(job_id, req.agent, [req.model])
+
+    response = {"job_id": job_id, "status": "pending"}
+    response["agent"] = req.agent
+    response["model"] = req.model
+    return response
+
+
+@app.post("/api/evolve-agent/start")
+def start_evolve_agent(req: EvolveAgentRequest):
+    """Queue a role-fit evaluation job (evolution-prompt + evolution-skeptic).
+
+    Currently a placeholder: the job is recorded with type ``evolve-agent``
+    and then handled by the same engine run as a regular research job.
+
+    Planned full implementation:
+      1. evolution-prompt generates role-specific stress-test prompts from
+         the agent definition.
+      2. Every model in the list is tested with the same prompt.
+      3. evolution-skeptic scores each response per rubric dimension.
+      4. Results are stored in SQLite and the report is regenerated.
+    """
+    def _utc_stamp() -> str:
+        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+
+    job_id = str(uuid.uuid4())
+    jobs = _load_jobs()
+    record = {
+        "id": job_id,
+        "type": "evolve-agent",
+        "agent": req.agent,
+        "models": req.models,
+        "status": "pending",
+        "progress": 0,
+        "result": None,
+        "created_at": _utc_stamp(),
+        "updated_at": _utc_stamp(),
+    }
+    jobs[job_id] = record
+    _save_jobs(jobs)
+
+    # Placeholder: reuse the regular engine job. The dedicated script would
+    # read the agent definition from .kilo/agents/{agent}.md, call the Ollama
+    # API for evolution-prompt to generate test prompts, call it once per model
+    # and store each response, call it for evolution-skeptic to evaluate, then
+    # store the results in SQLite and rebuild the report.
+    _spawn_engine_job(job_id, req.agent, req.models)
+
+    response = {"job_id": job_id, "status": "pending"}
+    response["agent"] = req.agent
+    response["models"] = req.models
+    return response
\ No newline at end of file
diff --git a/agent-evolution/archive/index.html b/agent-evolution/archive/index.html
new file mode 100644
index 0000000..7174e12
--- /dev/null
+++ b/agent-evolution/archive/index.html
@@ -0,0 +1,7031 @@
+
+
+
+
+
+ APAW Agent Evolution Dashboard
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 🔍
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Agent × Model Compatibility Heatmap
+
Weighted score = benchmark × instruction-following multiplier · ★ = best fit · outlined = current · click for details
+
+
+
+
+ 100806040200
+
+
+ ↑ Ideal Match
+ Mismatch ↓
+
+
+
+
+
+
+
+
+
+
+
+
Agent Performance Scores
+
Composite score per agent based on model benchmarks
+
+
+
+
+
+
+
Model Distribution
+
Agents per model
+
+
+
+
Migration Impact
+
Before vs after model change score
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Select recommendations to apply. All items are selected by default.
+
+
+
+
+
+
+
+
+
+
+
+
Applying Fixes...
+
+
Preparing...
+
+
+
+
+
+
+
+
+
+
+
+
+ Analyzing benchmark data...
+
+
+
+ Computing composite scores...
+
+
+
+ Cross-referencing agent assignments...
+
+
+
+ Generating recommendations...
+
+
+
+ Research complete!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/agent-evolution/archive/tests/screenshot-dash.cjs b/agent-evolution/archive/tests/screenshot-dash.cjs
new file mode 100644
index 0000000..b9b5e3f
--- /dev/null
+++ b/agent-evolution/archive/tests/screenshot-dash.cjs
@@ -0,0 +1,11 @@
+const { chromium } = require('playwright');
+const fs = require('fs');
+// Capture a 1280x720 viewport screenshot of the dashboard landing page.
+(async () => {
+  const browser = await chromium.launch({ headless: true, args: ['--no-sandbox'] });
+  try {
+    const page = await browser.newPage({ viewport: { width: 1280, height: 720 } });
+    await page.goto('http://host.docker.internal:3003', { waitUntil: 'domcontentloaded', timeout: 30000 });
+    // Fixed 2 s pause after DOMContentLoaded — presumably lets the dashboard finish rendering.
+    await page.waitForTimeout(2000);
+    await page.screenshot({ path: '/app/tests/visual/current/dashboard_landing.png', fullPage: false });
+    console.log('Screenshot saved to /app/tests/visual/current/dashboard_landing.png');
+  } finally {
+    // Always shut the browser down, even when navigation or the screenshot fails.
+    await browser.close();
+  }
+})().catch((err) => {
+  console.error(err);
+  process.exitCode = 1;
+});
diff --git a/agent-evolution/data/real-fit-report.json b/agent-evolution/data/real-fit-report.json
index f57d4ae..10fd951 100644
--- a/agent-evolution/data/real-fit-report.json
+++ b/agent-evolution/data/real-fit-report.json
@@ -1,689 +1,768 @@
{
- "generated": "2026-05-27T18:36:13.173821+00:00",
+ "generated": "2026-05-28T10:48:02.581965+00:00",
"source": "real-fit-engine",
- "total_evaluations": 102,
+ "total_evaluations": 147,
"agents": {
"agent-architect": {
"name": "agent-architect",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-pro": 48.3,
+ "glm-5.1": 48.3,
+ "kimi-k2.6": 53.5,
+ "qwen3-coder:480b": 48.3
},
"info": [
"Creates, modifies, and reviews new agents, workflows, and skills based on capability gap analysis. Tier 2 meta-agent with self-cascade enabled.",
"meta",
"ollama-cloud/kimi-k2.6"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "kimi-k2.6",
+ "best_score": 53.5
},
"architect-indexer": {
"name": "architect-indexer",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 43.2,
+ "glm-5.1": 48.6,
+ "kimi-k2.6": 46.5,
+ "qwen3-coder:480b": 54.0
},
"info": [
"Indexes and maps project codebase architecture into .architect/ directory. Creates and maintains structured documentation of entities, APIs, DB schema, file graphs, and conventions. (GNS-2 Tier 0)",
"core",
"ollama-cloud/glm-5.1"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "qwen3-coder:480b",
+ "best_score": 54.0
},
"backend-developer": {
"name": "backend-developer",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 53.5,
+ "glm-5.1": 48.3,
+ "kimi-k2.6": 48.3,
+ "qwen3-coder:480b": 43.2
},
"info": [
"Backend specialist for Node.js, Express, APIs, and database integration (GNS-2 Tier 1)",
"core",
"ollama-cloud/qwen3-coder:480b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "deepseek-v4-pro",
+ "best_score": 53.5
},
"browser-automation": {
"name": "browser-automation",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 42.8,
+ "glm-5.1": 53.3,
+ "kimi-k2.6": 63.8,
+ "qwen3-coder:480b": 48.9
},
"info": [
"Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction (GNS-2 Tier 0)",
"testing",
"ollama-cloud/deepseek-v4-flash"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "kimi-k2.6",
+ "best_score": 63.8
},
"capability-analyst": {
"name": "capability-analyst",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-pro": 58.7,
+ "glm-5.1": 53.5,
+ "kimi-k2.6": 58.7,
+ "qwen3-coder:480b": 52.3
},
"info": [
"Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. Tier 2 meta-agent with self-cascade enabled.",
"meta",
"ollama-cloud/deepseek-v4-pro-max"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "deepseek-v4-pro",
+ "best_score": 58.7
},
"code-skeptic": {
"name": "code-skeptic",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 22.8,
+ "glm-5.1": 89.1,
+ "kimi-k2.6": 91.2,
+ "minimax-m2.5": 45.0,
+ "qwen3-coder:480b": 90.6
},
"info": [
"Adversarial code reviewer. Finds problems and issues. Does NOT suggest implementations (GNS-2 Tier 0)",
"quality",
"ollama-cloud/minimax-m2.5"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "kimi-k2.6",
+ "best_score": 91.2
},
"devops-engineer": {
"name": "devops-engineer",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 29.7,
+ "glm-5.1": 96.2,
+ "kimi-k2.6": 87.2,
+ "qwen3-coder:480b": 87.2
},
"info": [
"DevOps specialist for Docker, Kubernetes, CI/CD pipeline automation, and infrastructure management (GNS-2 Tier 1)",
"core",
"ollama-cloud/kimi-k2.6"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "glm-5.1",
+ "best_score": 96.2
},
"evaluator": {
"name": "evaluator",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 50.6,
+ "glm-5.1": 58.7,
+ "kimi-k2.6": 53.5,
+ "qwen3-coder:480b": 43.8
},
"info": [
"Scores agent effectiveness after task completion for continuous improvement. Tier 2 meta-agent with self-cascade enabled.",
"meta",
"ollama-cloud/qwen3.5-122b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "glm-5.1",
+ "best_score": 58.7
+ },
+ "evolution-prompt": {
+ "name": "evolution-prompt",
+ "evaluations": {
+ "deepseek-v4-pro": 52.6,
+ "glm-5.1": 44.7,
+ "kimi-k2.6": 53.5,
+ "qwen3-coder:480b": 21.3
+ },
+ "info": [
+ "Generates role-specific stress-test prompts by analyzing agent definitions",
+ "meta",
+ "ollama-cloud/deepseek-v4-pro-max"
+ ],
+ "best_model": "kimi-k2.6",
+ "best_score": 53.5
+ },
+ "evolution-skeptic": {
+ "name": "evolution-skeptic",
+ "evaluations": {
+ "deepseek-v4-pro": 33.1,
+ "glm-5.1": 31.6,
+ "kimi-k2.6": 37.3,
+ "qwen3-coder:480b": 42.9
+ },
+ "info": [
+ "Evaluates model responses against role-specific rubrics with detailed scoring and commentary",
+ "meta",
+ "ollama-cloud/deepseek-v4-pro-max"
+ ],
+ "best_model": "qwen3-coder:480b",
+ "best_score": 42.9
},
"flutter-developer": {
"name": "flutter-developer",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-pro": 34.5,
+ "glm-5.1": 54.9,
+ "kimi-k2.6": 49.3,
+ "qwen3-coder:480b": 54.9
},
"info": [
"Flutter mobile specialist for cross-platform apps, state management, and UI components (GNS-2 Tier 1)",
"core",
"ollama-cloud/qwen3-coder:480b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "glm-5.1",
+ "best_score": 54.9
},
"frontend-developer": {
"name": "frontend-developer",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 31.6,
+ "glm-5.1": 53.2,
+ "kimi-k2.6": 38.8,
+ "qwen3-coder:480b": 56.0
},
"info": [
"Handles UI implementation with multimodal capabilities. Accepts visual references like screenshots and mockups (GNS-2 Tier 1)",
"core",
"ollama-cloud/minimax-m2.5"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "qwen3-coder:480b",
+ "best_score": 56.0
},
"go-developer": {
"name": "go-developer",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 41.4,
+ "glm-5.1": 53.5,
+ "kimi-k2.6": 48.3,
+ "qwen3-coder:480b": 58.7
},
"info": [
"Go backend specialist for Gin, Echo, APIs, and database integration (GNS-2 Tier 1)",
"core",
"ollama-cloud/deepseek-v4-pro-max"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "qwen3-coder:480b",
+ "best_score": 58.7
},
"history-miner": {
"name": "history-miner",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 30.1,
+ "glm-5.1": 44.3,
+ "kimi-k2.6": 46.9,
+ "qwen3-coder:480b": 44.8
},
"info": [
"Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work (GNS-2 Tier 0)",
"core",
"ollama-cloud/qwen3.5-122b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "kimi-k2.6",
+ "best_score": 46.9
},
"incident-responder": {
"name": "incident-responder",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-pro": 48.6,
+ "glm-5.1": 65.6,
+ "kimi-k2.6": 59.1,
+ "qwen3-coder:480b": 56.4
},
"info": [
"Server incident response and system hardening specialist. Handles live forensics, malware removal, persistence hunting, SSH-based server cleanup, and post-incident hardening. Works with any OS and panel.",
"core",
"ollama-cloud/kimi-k2.6"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "glm-5.1",
+ "best_score": 65.6
},
"lead-developer": {
"name": "lead-developer",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-pro": 28.7,
+ "glm-5.1": 68.8,
+ "kimi-k2.6": 72.5,
+ "qwen3-coder:480b": 72.5
},
"info": [
"Primary code writer for backend and core logic. Writes implementation to pass tests (GNS-2 Tier 1)",
"core",
"ollama-cloud/qwen3-coder:480b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "kimi-k2.6",
+ "best_score": 72.5
},
"markdown-validator": {
"name": "markdown-validator",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 39.0,
+ "glm-5.1": 37.2,
+ "kimi-k2.6": 24.0,
+ "qwen3-coder:480b": 47.4
},
"info": [
"Validates and corrects Markdown descriptions for Gitea issues (GNS-2 Tier 0)",
"meta",
"ollama-cloud/nemotron-3-nano"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "qwen3-coder:480b",
+ "best_score": 47.4
},
"memory-manager": {
"name": "memory-manager",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-pro": 35.8,
+ "glm-5.1": 48.3,
+ "kimi-k2.6": 41.5,
+ "qwen3-coder:480b": 46.8
},
"info": [
"Manages agent memory systems - short-term (context), long-term (vector store), and episodic (experiences) (GNS-2 Tier 0)",
"cognitive",
"ollama-cloud/deepseek-v4-pro-max"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "glm-5.1",
+ "best_score": 48.3
},
"orchestrator": {
"name": "orchestrator",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-flash": 27.0,
+ "deepseek-v4-pro": 19.6,
+ "glm-5.1": 36.2,
+ "kimi-k2.6": 40.0,
+ "minimax-m2.5": 36.3,
+ "qwen3-coder:480b": 39.1
},
"info": [
"Main dispatcher. Routes tasks between agents based on Issue status and manages the workflow state machine. IF:90 for optimal routing accuracy. (GNS-2 Tier 1)",
"meta",
"ollama-cloud/kimi-k2.6"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "kimi-k2.6",
+ "best_score": 40.0
},
"performance-engineer": {
"name": "performance-engineer",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 27.9,
+ "glm-5.1": 63.8,
+ "kimi-k2.6": 34.3,
+ "qwen3-coder:480b": 36.3
},
"info": [
"Reviews code for performance issues. Focuses on efficiency, N+1 queries, memory leaks, and algorithmic complexity (GNS-2 Tier 0)",
"quality",
"ollama-cloud/deepseek-v4-pro-max"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "glm-5.1",
+ "best_score": 63.8
},
"php-developer": {
"name": "php-developer",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 53.5,
+ "glm-5.1": 48.3,
+ "kimi-k2.6": 48.3,
+ "qwen3-coder:480b": 48.3
},
"info": [
"PHP backend specialist for Laravel, Symfony, WordPress, and full-stack web applications (GNS-2 Tier 1)",
"core",
"ollama-cloud/qwen3-coder:480b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "deepseek-v4-pro",
+ "best_score": 53.5
},
"pipeline-judge": {
"name": "pipeline-judge",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 34.6,
+ "glm-5.1": 45.6,
+ "kimi-k2.6": 46.5,
+ "qwen3-coder:480b": 52.9
},
"info": [
"Automated pipeline judge. Evaluates workflow execution by running tests, measuring token cost and wall-clock time. Produces objective fitness scores. Never writes code - only measures and scores. (GNS-2 Tier 0)",
"meta",
"ollama-cloud/kimi-k2.6"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "qwen3-coder:480b",
+ "best_score": 52.9
},
"planner": {
"name": "planner",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 41.7,
+ "glm-5.1": 31.8,
+ "kimi-k2.6": 34.6,
+ "qwen3-coder:480b": 33.7
},
"info": [
"Advanced task planner using Chain of Thought, Tree of Thoughts, and Plan-Execute-Reflect (GNS-2 Tier 0)",
"cognitive",
"ollama-cloud/deepseek-v4-pro-max"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "deepseek-v4-pro",
+ "best_score": 41.7
},
"product-owner": {
"name": "product-owner",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 27.0,
+ "glm-5.1": 33.4,
+ "kimi-k2.6": 34.6,
+ "qwen3-coder:480b": 27.0
},
"info": [
"Manages issue checklists, status labels, tracks progress and coordinates with human users (GNS-2 Tier 1)",
"meta",
"ollama-cloud/glm-5.1"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "kimi-k2.6",
+ "best_score": 34.6
},
"prompt-optimizer": {
"name": "prompt-optimizer",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-pro": 27.0,
+ "glm-5.1": 48.3,
+ "kimi-k2.6": 33.0,
+ "qwen3-coder:480b": 31.8
},
"info": [
"Improves agent system prompts based on performance failures. Meta-learner for prompt optimization (GNS-2 Tier 1)",
"meta",
"ollama-cloud/qwen3.5-122b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "glm-5.1",
+ "best_score": 48.3
},
"python-developer": {
"name": "python-developer",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 48.3,
+ "glm-5.1": 48.3,
+ "kimi-k2.6": 48.3,
+ "qwen3-coder:480b": 48.3
},
"info": [
"Python backend specialist for Django, FastAPI, data science, and API development (GNS-2 Tier 1)",
"core",
"ollama-cloud/qwen3-coder:480b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "deepseek-v4-pro",
+ "best_score": 48.3
},
"reflector": {
"name": "reflector",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 43.2,
+ "glm-5.1": 53.5,
+ "kimi-k2.6": 58.7,
+ "qwen3-coder:480b": 20.9
},
"info": [
"Self-reflection agent using Reflexion pattern - learns from mistakes (GNS-2 Tier 0)",
"cognitive",
"ollama-cloud/deepseek-v4-pro-max"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "kimi-k2.6",
+ "best_score": 58.7
},
"release-manager": {
"name": "release-manager",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 23.7,
+ "glm-5.1": 38.0,
+ "kimi-k2.6": 50.2,
+ "qwen3-coder:480b": 41.7
},
"info": [
"Manages git operations, semantic versioning, branching, and deployments. Ensures clean history (GNS-2 Tier 1)",
"meta",
"ollama-cloud/kimi-k2.6"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "kimi-k2.6",
+ "best_score": 50.2
},
"requirement-refiner": {
"name": "requirement-refiner",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 30.3,
+ "glm-5.1": 31.0,
+ "kimi-k2.6": 31.2,
+ "qwen3-coder:480b": 45.3
},
"info": [
"Converts vague ideas and bug reports into strict User Stories with acceptance criteria checklists (GNS-2 Tier 1)",
"core",
"ollama-cloud/kimi-k2-thinking"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "qwen3-coder:480b",
+ "best_score": 45.3
},
"sdet-engineer": {
"name": "sdet-engineer",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 58.7,
+ "glm-5.1": 86.0,
+ "kimi-k2.6": 97.0,
+ "qwen3-coder:480b": 97.0
},
"info": [
"Writes tests following TDD methodology. Tests MUST fail initially (Red phase) (GNS-2 Tier 1)",
"core",
"ollama-cloud/qwen3-coder:480b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "kimi-k2.6",
+ "best_score": 97.0
},
"security-auditor": {
"name": "security-auditor",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 46.4,
+ "glm-5.1": 58.7,
+ "kimi-k2.6": 63.8,
+ "qwen3-coder:480b": 41.5
},
"info": [
"Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets (GNS-2 Tier 0)",
"quality",
"ollama-cloud/deepseek-v4-pro-max"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "kimi-k2.6",
+ "best_score": 63.8
},
"system-analyst": {
"name": "system-analyst",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-pro": 56.4,
+ "glm-5.1": 87.0,
+ "kimi-k2.6": 92.0,
+ "qwen3-coder:480b": 77.0
},
"info": [
"Designs technical specifications, data schemas, and API contracts before implementation (GNS-2 Tier 1)",
"core",
"ollama-cloud/deepseek-v4-pro-max"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "kimi-k2.6",
+ "best_score": 92.0
},
"the-fixer": {
"name": "the-fixer",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 43.6,
+ "glm-5.1": 46.6,
+ "kimi-k2.6": 36.4,
+ "qwen3-coder:480b": 42.9
},
"info": [
"Iteratively fixes bugs based on specific error reports and test failures (GNS-2 Tier 1)",
"quality",
"ollama-cloud/kimi-k2.6"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "glm-5.1",
+ "best_score": 46.6
},
"visual-tester": {
"name": "visual-tester",
"evaluations": {
- "deepseek-v4-pro-max": 50.0,
- "kimi-k2.6": 50.0,
- "qwen3-coder:480b": 50.0
+ "deepseek-v4-pro": 47.3,
+ "glm-5.1": 58.7,
+ "kimi-k2.6": 53.5,
+ "qwen3-coder:480b": 53.5
},
"info": [
"Visual regression testing agent that compares screenshots and detects UI differences using pixelmatch and image diff (GNS-2 Tier 0)",
"quality",
"ollama-cloud/qwen3-coder:480b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 50.0
+ "best_model": "glm-5.1",
+ "best_score": 58.7
},
"workflow-architect": {
"name": "workflow-architect",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-pro": 36.3,
+ "glm-5.1": 48.3,
+ "kimi-k2.6": 48.3,
+ "qwen3-coder:480b": 36.3
},
"info": [
"Creates and maintains workflow definitions with complete architecture, Gitea integration, and quality gates (GNS-2 Tier 1)",
"meta",
"ollama-cloud/qwen3.5-122b"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "glm-5.1",
+ "best_score": 48.3
},
"workflow-cross-checker": {
"name": "workflow-cross-checker",
"evaluations": {
- "deepseek-v4-pro-max": 41.6,
- "kimi-k2.6": 41.6,
- "qwen3-coder:480b": 41.6
+ "deepseek-v4-pro": 54.2,
+ "glm-5.1": 63.3,
+ "kimi-k2.6": 52.1,
+ "qwen3-coder:480b": 65.6
},
"info": [
"Workflow cross-checker and process inspector. Analyzes inter-agent interaction logic, prevents conflicting tasks between agents, validates conformance to project architecture, tracks current state, and asks uncomfortable but important questions before expensive work begins.",
"meta",
"ollama-cloud/kimi-k2.6"
],
- "best_model": "deepseek-v4-pro-max",
- "best_score": 41.6
+ "best_model": "qwen3-coder:480b",
+ "best_score": 65.6
}
},
"fit_scores": {
"agent-architect": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for agent-architect is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 53.5,
+ "explanation": "Best model for agent-architect is kimi-k2.6 with avg score 53.5. Strongest dimension: code_presence."
},
"architect-indexer": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for architect-indexer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "qwen3-coder:480b",
+ "fit": 54.0,
+ "explanation": "Best model for architect-indexer is qwen3-coder:480b with avg score 54.0. Strongest dimension: code_presence."
},
"backend-developer": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for backend-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "deepseek-v4-pro",
+ "fit": 53.5,
+ "explanation": "Best model for backend-developer is deepseek-v4-pro with avg score 53.5. Strongest dimension: code_presence."
},
"browser-automation": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for browser-automation is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 63.8,
+ "explanation": "Best model for browser-automation is kimi-k2.6 with avg score 63.8. Strongest dimension: code_presence."
},
"capability-analyst": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for capability-analyst is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage."
+ "model": "deepseek-v4-pro",
+ "fit": 58.7,
+ "explanation": "Best model for capability-analyst is deepseek-v4-pro with avg score 58.7. Strongest dimension: code_presence."
},
"code-skeptic": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for code-skeptic is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 91.2,
+ "explanation": "Best model for code-skeptic is kimi-k2.6 with avg score 91.2. Strongest dimension: code_presence."
},
"devops-engineer": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for devops-engineer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "glm-5.1",
+ "fit": 96.2,
+ "explanation": "Best model for devops-engineer is glm-5.1 with avg score 96.2. Strongest dimension: keyword_coverage."
},
"evaluator": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for evaluator is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "glm-5.1",
+ "fit": 58.7,
+ "explanation": "Best model for evaluator is glm-5.1 with avg score 58.7. Strongest dimension: code_presence."
+ },
+ "evolution-prompt": {
+ "model": "kimi-k2.6",
+ "fit": 53.5,
+ "explanation": "Best model for evolution-prompt is kimi-k2.6 with avg score 53.5. Strongest dimension: code_presence."
+ },
+ "evolution-skeptic": {
+ "model": "qwen3-coder:480b",
+ "fit": 42.9,
+ "explanation": "Best model for evolution-skeptic is qwen3-coder:480b with avg score 42.9. Strongest dimension: structure."
},
"flutter-developer": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for flutter-developer is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage."
+ "model": "glm-5.1",
+ "fit": 54.9,
+ "explanation": "Best model for flutter-developer is glm-5.1 with avg score 54.9. Strongest dimension: code_presence."
},
"frontend-developer": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for frontend-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "qwen3-coder:480b",
+ "fit": 56.0,
+ "explanation": "Best model for frontend-developer is qwen3-coder:480b with avg score 56.0. Strongest dimension: code_presence."
},
"go-developer": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for go-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "qwen3-coder:480b",
+ "fit": 58.7,
+ "explanation": "Best model for go-developer is qwen3-coder:480b with avg score 58.7. Strongest dimension: code_presence."
},
"history-miner": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for history-miner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 46.9,
+ "explanation": "Best model for history-miner is kimi-k2.6 with avg score 46.9. Strongest dimension: code_presence."
},
"incident-responder": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for incident-responder is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage."
+ "model": "glm-5.1",
+ "fit": 65.6,
+ "explanation": "Best model for incident-responder is glm-5.1 with avg score 65.6. Strongest dimension: code_presence."
},
"lead-developer": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for lead-developer is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: code_presence."
+ "model": "kimi-k2.6",
+ "fit": 72.5,
+ "explanation": "Best model for lead-developer is kimi-k2.6 with avg score 72.5. Strongest dimension: keyword_coverage."
},
"markdown-validator": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for markdown-validator is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "qwen3-coder:480b",
+ "fit": 47.4,
+ "explanation": "Best model for markdown-validator is qwen3-coder:480b with avg score 47.4. Strongest dimension: code_presence."
},
"memory-manager": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for memory-manager is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage."
+ "model": "glm-5.1",
+ "fit": 48.3,
+ "explanation": "Best model for memory-manager is glm-5.1 with avg score 48.3. Strongest dimension: code_presence."
},
"orchestrator": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for orchestrator is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 40.0,
+ "explanation": "Best model for orchestrator is kimi-k2.6 with avg score 40.0. Strongest dimension: code_presence."
},
"performance-engineer": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for performance-engineer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "glm-5.1",
+ "fit": 63.8,
+ "explanation": "Best model for performance-engineer is glm-5.1 with avg score 63.8. Strongest dimension: code_presence."
},
"php-developer": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for php-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "deepseek-v4-pro",
+ "fit": 53.5,
+ "explanation": "Best model for php-developer is deepseek-v4-pro with avg score 53.5. Strongest dimension: code_presence."
},
"pipeline-judge": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for pipeline-judge is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "qwen3-coder:480b",
+ "fit": 52.9,
+ "explanation": "Best model for pipeline-judge is qwen3-coder:480b with avg score 52.9. Strongest dimension: code_presence."
},
"planner": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for planner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "deepseek-v4-pro",
+ "fit": 41.7,
+ "explanation": "Best model for planner is deepseek-v4-pro with avg score 41.7. Strongest dimension: code_presence."
},
"product-owner": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for product-owner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 34.6,
+ "explanation": "Best model for product-owner is kimi-k2.6 with avg score 34.6. Strongest dimension: actionability."
},
"prompt-optimizer": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for prompt-optimizer is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage."
+ "model": "glm-5.1",
+ "fit": 48.3,
+ "explanation": "Best model for prompt-optimizer is glm-5.1 with avg score 48.3. Strongest dimension: code_presence."
},
"python-developer": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for python-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "deepseek-v4-pro",
+ "fit": 48.3,
+ "explanation": "Best model for python-developer is deepseek-v4-pro with avg score 48.3. Strongest dimension: code_presence."
},
"reflector": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for reflector is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 58.7,
+ "explanation": "Best model for reflector is kimi-k2.6 with avg score 58.7. Strongest dimension: code_presence."
},
"release-manager": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for release-manager is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 50.2,
+ "explanation": "Best model for release-manager is kimi-k2.6 with avg score 50.2. Strongest dimension: code_presence."
},
"requirement-refiner": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for requirement-refiner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "qwen3-coder:480b",
+ "fit": 45.3,
+ "explanation": "Best model for requirement-refiner is qwen3-coder:480b with avg score 45.3. Strongest dimension: code_presence."
},
"sdet-engineer": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for sdet-engineer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 97.0,
+ "explanation": "Best model for sdet-engineer is kimi-k2.6 with avg score 97.0. Strongest dimension: keyword_coverage."
},
"security-auditor": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for security-auditor is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 63.8,
+ "explanation": "Best model for security-auditor is kimi-k2.6 with avg score 63.8. Strongest dimension: code_presence."
},
"system-analyst": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for system-analyst is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage."
+ "model": "kimi-k2.6",
+ "fit": 92.0,
+ "explanation": "Best model for system-analyst is kimi-k2.6 with avg score 92.0. Strongest dimension: keyword_coverage."
},
"the-fixer": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for the-fixer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "glm-5.1",
+ "fit": 46.6,
+ "explanation": "Best model for the-fixer is glm-5.1 with avg score 46.6. Strongest dimension: code_presence."
},
"visual-tester": {
- "model": "deepseek-v4-pro-max",
- "fit": 50.0,
- "explanation": "Best model for visual-tester is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage."
+ "model": "glm-5.1",
+ "fit": 58.7,
+ "explanation": "Best model for visual-tester is glm-5.1 with avg score 58.7. Strongest dimension: code_presence."
},
"workflow-architect": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for workflow-architect is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage."
+ "model": "glm-5.1",
+ "fit": 48.3,
+ "explanation": "Best model for workflow-architect is glm-5.1 with avg score 48.3. Strongest dimension: code_presence."
},
"workflow-cross-checker": {
- "model": "deepseek-v4-pro-max",
- "fit": 41.6,
- "explanation": "Best model for workflow-cross-checker is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage."
+ "model": "qwen3-coder:480b",
+ "fit": 65.6,
+ "explanation": "Best model for workflow-cross-checker is qwen3-coder:480b with avg score 65.6. Strongest dimension: code_presence."
}
}
}
\ No newline at end of file
diff --git a/agent-evolution/docker-compose.yml b/agent-evolution/docker-compose.yml
index 9a0ed74..aaac6c2 100644
--- a/agent-evolution/docker-compose.yml
+++ b/agent-evolution/docker-compose.yml
@@ -1,28 +1,27 @@
-# Docker Compose for Agent Evolution Dashboard (mount-driven, no-rebuild)
+# Docker Compose for Agent Evolution Dashboard + Research API (mount-driven, no-rebuild)
# Usage:
# docker compose -f agent-evolution/docker-compose.yml up -d
-# # Edit any file in agent-evolution/ or .kilo/ on host → instant reflection
-# # Just run:
-# bun run sync:evolution
-# # and reload the page
+# # Edit any mounted file on host → instant reflection in the dashboard; restart evolution-api to pick up api.py changes
+# # Dashboard: http://localhost:3003
+# # API: http://localhost:3004
#
-version: '3.8'
-
services:
evolution-dashboard:
- build:
- context: .
- dockerfile: Dockerfile
+ image: python:3.12-alpine
container_name: apaw-evolution
ports:
- "3003:80"
volumes:
# Mount the generated standalone HTML to the container's web root
- ./index.standalone.html:/app/index.html:ro
+ # Mount real-fit standalone report
+ - ./real-fit.html:/app/real-fit.html:ro
# Mount data directory for any additional assets
- ./data:/app/data:ro
# Mount .kilo directory for live config access
- ../.kilo:/app/kilo:ro
+ working_dir: /app
+ command: ["python3", "-m", "http.server", "80"]
environment:
- NODE_ENV=production
- TZ=UTC
@@ -39,6 +38,47 @@ services:
- "com.apaw.service=evolution-dashboard"
- "com.apaw.description=Agent Evolution Dashboard"
+ evolution-api:
+ image: python:3.12-alpine
+ container_name: apaw-evolution-api
+ ports:
+ - "3004:8000"
+ volumes:
+ # API source code
+ - ./api.py:/app/api.py:ro
+ - ./requirements.txt:/app/requirements.txt:ro
+ # Data directory (read-write for job state and reports)
+ - ./data:/app/data:rw
+ # real-fit-engine.py script
+ - ../scripts/real-fit-engine.py:/app/scripts/real-fit-engine.py:ro
+ # Agent definitions and metadata
+ - ../.kilo/agents:/app/agents:ro
+ - ../kilo-meta.json:/app/kilo-meta.json:ro
+ working_dir: /app
+ command: >
+ sh -c "pip install --no-cache-dir -r requirements.txt && uvicorn api:app --host 0.0.0.0 --port 8000"
+ environment:
+ - TZ=UTC
+ - PYTHONUNBUFFERED=1
+ - JOB_STATE_PATH=/app/data/research-jobs.json
+ - REPORT_PATH=/app/data/real-fit-report.json
+ - META_PATH=/app/kilo-meta.json
+ - EVOLUTION_PATH=/app/data/evolution.json
+ - ENGINE_PATH=/app/scripts/real-fit-engine.py
+ - REAL_FIT_DB=/app/data/real-fit.db
+ restart: unless-stopped
+ healthcheck:
+ test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8000/api/models"]
+ interval: 30s
+ timeout: 10s
+ retries: 3
+ start_period: 15s
+ networks:
+ - evolution-network
+ labels:
+ - "com.apaw.service=evolution-api"
+ - "com.apaw.description=Agent Evolution Research API"
+
# Optional: Nginx reverse proxy with SSL
evolution-nginx:
image: nginx:alpine
@@ -49,13 +89,14 @@ services:
- "80:80"
- "443:443"
volumes:
- - ./agent-evolution/nginx.conf:/etc/nginx/nginx.conf:ro
- - ./agent-evolution/ssl:/etc/nginx/ssl:ro
+ - ./nginx.conf:/etc/nginx/nginx.conf:ro
+ - ./ssl:/etc/nginx/ssl:ro
depends_on:
- evolution-dashboard
+ - evolution-api
networks:
- evolution-network
networks:
evolution-network:
- driver: bridge
\ No newline at end of file
+ driver: bridge
diff --git a/agent-evolution/index.standalone.html b/agent-evolution/index.standalone.html
index 8de3e9b..04cb752 100644
--- a/agent-evolution/index.standalone.html
+++ b/agent-evolution/index.standalone.html
@@ -5083,7 +5083,7 @@ async function init() {
try {
// Load real dashboard data FIRST (overrides stale agent-versions)
try {
- const dashRes = await fetch('data/dashboard-data.json');
+ const dashRes = await fetch('data/dashboard-data.json', { cache: 'no-cache' });
if (dashRes.ok) {
window.dashboardData = await dashRes.json();
// Sync agentData from dashboard data for all other tabs
@@ -5439,64 +5439,63 @@ function renderRecCard(r, index) {
`;
}
-// Render Heatmap — REAL DATA: Agent × Current Model × Real Fit Score
+// Render Heatmap — REAL DATA: Agent × Model × Live Ollama Evaluations
function renderHeatmap() {
const esc = str => (str || '').replace(/[&<>"']/g, m => ({'&':'&','<':'<','>':'>','"':'"',"'":'''}[m]));
const dd = window.dashboardData;
if (!dd || !dd.agents) {
- document.getElementById('hmTable').innerHTML = '| ⚠️ Нет данных. Запустите анализ. |
';
+ document.getElementById('hmTable').innerHTML = '| ⚠️ No data. Run analysis. |
';
return;
}
- const agents = dd.agents;
- // Get unique models sorted by count of agents
- const modelCounts = {};
- agents.forEach(a => { modelCounts[a.model_short] = (modelCounts[a.model_short] || 0) + 1; });
- const modelList = Object.entries(modelCounts)
- .sort((a, b) => b[1] - a[1])
- .map(([short]) => {
- const m = dd.models[short] || {};
- return {
- short,
- full: 'ollama-cloud/' + short,
- name: m.name || short,
- avg_fit: m.avg_fit || 0,
- agents: m.agents || 0
- };
- });
+ // Collect all models from current assignments + realfit evaluations
+ const modelsSeen = new Set();
+ dd.agents.forEach(a => { if (a.model_short) modelsSeen.add(a.model_short); });
+ dd.agents.forEach(a => {
+ if (a.real_evaluations) Object.keys(a.real_evaluations).forEach(m => { if (m && m !== 'code-skeptic') modelsSeen.add(m); });
+ });
+ // Ensure real-fit evaluated models are included even if not current
+ const modelList = Array.from(modelsSeen).sort();
- // Render table: rows=agents, cols=models
const t = document.getElementById('hmTable');
let h = '| Agent | ';
modelList.forEach(m => {
- const color = m.avg_fit >= 85 ? '#00ff94' : m.avg_fit >= 70 ? '#facc15' : '#ff6b81';
- h += `
- ${esc(m.name)}
- avg:${m.avg_fit}
- ${m.agents}
- | `;
+ // Compute avg from dd.agents real_evaluations
+ let sum = 0, cnt = 0;
+ dd.agents.forEach(a => { const v = (a.real_evaluations || {})[m]; if (v > 0) { sum += v; cnt++; } });
+ const avg = cnt > 0 ? Math.round(sum / cnt) : 0;
+ const color = avg >= 85 ? '#00ff94' : avg >= 70 ? '#facc15' : '#ff6b81';
+ h += `${esc(m)} avg:${avg} | `;
});
- h += '
';
+ h += 'Best | Score | ';
- agents.forEach(a => {
+ dd.agents.forEach(a => {
h += `| ${esc(a.name)} | `;
- modelList.forEach((m, j) => {
- const isCurrent = a.model_short === m.short;
- const score = isCurrent ? a.fit_score : 0; // Only show score for CURRENT model
- const cur = isCurrent;
- let marks = '';
- if (cur) marks += '●';
- const bg = cur ? hmColor(score) : 'transparent';
- const txt = cur ? hmText(score) : 'var(--text-muted)';
- h += `${isCurrent ? a.fit_score : '·'}${marks} | `;
+ modelList.forEach(m => {
+ const isCurrent = a.model_short === m;
+ let score = 0;
+ // Prefer real-fit score, fallback to current fit_score
+ if (a.real_evaluations && a.real_evaluations[m] > 0) score = Math.round(a.real_evaluations[m]);
+ else if (isCurrent) score = Math.round(a.fit_score || 0);
+
+ let cls = 'na';
+ if (score >= 90) cls = 'high';
+ else if (score >= 75) cls = 'good';
+ else if (score >= 50) cls = 'med';
+ else if (score > 0) cls = 'low';
+
+ const curMark = isCurrent ? ' ●' : '';
+ const curStyle = isCurrent ? 'box-shadow:inset 0 0 0 2px var(--accent-cyan);' : '';
+ const bg = score > 0 ? hmColor(score) : 'transparent';
+ const txt = score >= 75 ? '#0e1219' : 'var(--text-primary)';
+ const display = score > 0 ? score : (isCurrent ? Math.round(a.fit_score || 0) : '·');
+
+ h += `${display}${curMark} | `;
});
- h += '
';
+ const bestModel = a.real_best_model || a.model_short;
+ const bestScore = a.real_best_score ? Math.round(a.real_best_score) : Math.round(a.fit_score || 0);
+ h += `${esc(bestModel)} | ${bestScore} | `;
});
t.innerHTML = h + '';
}
@@ -5511,29 +5510,6 @@ function hmColor(v) {
return 'rgba(90,104,128,.2)';
}
-function hmText(v) {
- return v >= 75 ? '#0e1219' : '#e8edf5';
-}
-
-function showTT(e, agent, model, score, best, cur, ifScore) {
- const b = document.getElementById('ttBox'), o = document.getElementById('ttOverlay');
- const ifColor = ifScore >= 85 ? '#00ff94' : ifScore >= 75 ? '#facc15' : '#ff6b81';
- const ifLabel = ifScore >= 85 ? 'Excellent' : ifScore >= 75 ? 'Average' : 'Weak';
- b.innerHTML = `${model}
Agent: ${agent}
Score: ${score}/100
- Instruction Following: ${ifScore}/100 (${ifLabel})
- Score = benchmark × IF multiplier
- ${ifScore < 75 ? '⚠ Model poorly follows prompts — score reduced
' : ''}
- ${best ? '★ Best fit
' : ''}${cur ? '📌 Current' : ''}
`;
- const r = e.target.getBoundingClientRect();
- b.style.left = Math.min(r.left, window.innerWidth - 320) + 'px';
- b.style.top = (r.bottom + 6) + 'px';
- o.classList.add('show');
-}
-
-function hideTT() {
- document.getElementById('ttOverlay').classList.remove('show');
-}
-
// Current modal state
let hmCurrentAgent = null;
let hmCurrentModel = null;
diff --git a/agent-evolution/real-fit.html b/agent-evolution/real-fit.html
new file mode 100644
index 0000000..d993875
--- /dev/null
+++ b/agent-evolution/real-fit.html
@@ -0,0 +1,460 @@
+
+
+
+
+
+Real-Fit Matrix — Agent × Model Performance
+
+
+
+Real-Fit Matrix
+Real agent × model evaluation scores via live Ollama API (28 calls, 4 models, 7 agents)
+
+
+ 90+ Excellent
+ 75–89 Good
+ 50–74 Average
+ <50 Weak
+ ● = assigned model
+
+
+
+
+
+
Research models
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Evaluate cell
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/agent-evolution/requirements.txt b/agent-evolution/requirements.txt
new file mode 100644
index 0000000..7854f06
--- /dev/null
+++ b/agent-evolution/requirements.txt
@@ -0,0 +1,4 @@
+fastapi==0.136.3
+uvicorn==0.48.0
+python-multipart==0.0.29
+pydantic==2.13.4
\ No newline at end of file
diff --git a/agent-evolution/scripts/audit-system.cjs b/agent-evolution/scripts/audit-system.cjs
new file mode 100644
index 0000000..a0ea72b
--- /dev/null
+++ b/agent-evolution/scripts/audit-system.cjs
@@ -0,0 +1,138 @@
+const fs = require('fs');
+
+function parseFrontmatter(content) { // Parse flat `key: value` frontmatter between `---` fences; returns null when absent
+  if (!content.startsWith('---')) return null;
+  const end = content.indexOf('\n---', 3); // closing fence must start a line, so `---` inside a value is not mistaken for it
+  if (end === -1) return null;
+  const fm = content.slice(3, end).trim();
+  const data = {};
+  for (const line of fm.split(/\r?\n/)) { // tolerate CRLF: `.` never matches `\r`, so a trailing CR made every line fail to match
+    const m = line.match(/^(\w+):\s*(.+)$/);
+    if (m) data[m[1]] = m[2].trim();
+  }
+  return data;
+}
+
+function stripComments(str) {
+  // Remove single-line comments, but not inside strings: quoted text is matched first and kept verbatim (e.g. "https://…")
+  return str.replace(/("(?:\\.|[^"\\\n])*")|\/\/.*$/gm, (match, quoted) => quoted || '');
+}
+
+const agents = []; // one record per .kilo/agents/*.md file whose frontmatter declares a model
+const commands = []; // one record per .kilo/commands/*.md file whose frontmatter declares a model
+const issues = []; // human-readable model-mismatch messages collected by steps 3-5
+
+// 1. Parse agent .md files (all paths are relative, i.e. resolved against the current working directory)
+for (const f of fs.readdirSync('.kilo/agents').filter(f => f.endsWith('.md'))) {
+ const content = fs.readFileSync('.kilo/agents/' + f, 'utf8');
+ const fm = parseFrontmatter(content);
+ if (fm && fm.model) { // files without frontmatter or without a `model` key are skipped
+ agents.push({
+ name: f.replace('.md', ''),
+ model: fm.model,
+ mode: fm.mode || 'subagent',
+ source: '.kilo/agents/' + f,
+ description: fm.description || ''
+ });
+ }
+}
+
+// 2. Parse command .md files (same shape as agents; mode defaults to 'command')
+for (const f of fs.readdirSync('.kilo/commands').filter(f => f.endsWith('.md'))) {
+ const content = fs.readFileSync('.kilo/commands/' + f, 'utf8');
+ const fm = parseFrontmatter(content);
+ if (fm && fm.model) { // files without frontmatter or without a `model` key are skipped
+ commands.push({
+ name: f.replace('.md', ''),
+ model: fm.model,
+ mode: fm.mode || 'command',
+ source: '.kilo/commands/' + f,
+ description: fm.description || ''
+ });
+ }
+}
+
+// 3. Cross-check .md models against kilo-meta.json (names missing from the meta file are ignored)
+const meta = JSON.parse(fs.readFileSync('kilo-meta.json', 'utf8'));
+for (const a of agents) {
+ const m = meta.agents?.[a.name];
+ if (m) {
+ a.metaModel = m.model;
+ if (a.model !== m.model) issues.push(`AGENT ${a.name}: .md=${a.model} vs meta=${m.model}`);
+ }
+}
+for (const c of commands) {
+ const m = meta.commands?.[c.name];
+ if (m) {
+ c.metaModel = m.model;
+ if (c.model !== m.model) issues.push(`CMD ${c.name}: .md=${c.model} vs meta=${m.model}`);
+ }
+}
+
+// 4. Cross-check agents against the `agent` map in .kilo/kilo.jsonc (entries without a model are skipped)
+const dotKiloRaw = stripComments(fs.readFileSync('.kilo/kilo.jsonc', 'utf8'));
+const dotKilo = JSON.parse(dotKiloRaw);
+for (const [name, cfg] of Object.entries(dotKilo.agent || {})) {
+ if (!cfg.model) continue;
+ const agent = agents.find(a => a.name === name);
+ if (agent) {
+ agent.kiloModel = cfg.model;
+ if (agent.model !== cfg.model) issues.push(`AGENT ${name}: .md=${agent.model} vs .kilo/kilo.jsonc=${cfg.model}`);
+ }
+}
+
+// 5. Cross-check commands against root kilo.jsonc. NOTE(review): reads the `agent` map but matches command names — confirm intended
+const rootKiloRaw = stripComments(fs.readFileSync('kilo.jsonc', 'utf8'));
+const rootKilo = JSON.parse(rootKiloRaw);
+for (const [name, cfg] of Object.entries(rootKilo.agent || {})) {
+ if (!cfg.model) continue;
+ const cmd = commands.find(c => c.name === name);
+ if (cmd) {
+ cmd.rootModel = cfg.model;
+ if (cmd.model !== cfg.model) issues.push(`CMD ${name}: .md=${cmd.model} vs kilo.jsonc=${cfg.model}`);
+ }
+}
+
+// 6. Collect agents/commands whose model id does not start with the 'ollama-cloud/' prefix
+const nonOllama = [];
+for (const a of agents) if (!a.model.startsWith('ollama-cloud/')) nonOllama.push({type:'agent', name:a.name, model:a.model});
+for (const c of commands) if (!c.model.startsWith('ollama-cloud/')) nonOllama.push({type:'command', name:c.name, model:c.model});
+
+// 7. Count how many agents + commands use each model id
+const modelStats = {};
+for (const a of agents) modelStats[a.model] = (modelStats[a.model] || 0) + 1;
+for (const c of commands) modelStats[c.model] = (modelStats[c.model] || 0) + 1;
+
+const state = { // snapshot that is serialized to real-state.json below
+ generated: new Date().toISOString(),
+ totalAgents: agents.length,
+ totalCommands: commands.length,
+ allOllama: nonOllama.length === 0,
+ modelDistribution: modelStats,
+ agents: agents.sort((a,b) => a.name.localeCompare(b.name)), // sort() reorders the array in place
+ commands: commands.sort((a,b) => a.name.localeCompare(b.name)), // sort() reorders the array in place
+ issues: issues,
+ nonOllama: nonOllama
+};
+
+fs.writeFileSync('agent-evolution/data/real-state.json', JSON.stringify(state, null, 2) + '\n');
+
+// Console report: human-readable summary of the same `state` object that was just written to disk
+console.log('=== REAL SYSTEM STATE ===');
+console.log('Generated:', state.generated);
+console.log('Agents:', state.totalAgents);
+console.log('Commands:', state.totalCommands);
+console.log('All ollama-cloud/:', state.allOllama ? 'YES' : 'NO (' + nonOllama.length + ' exceptions)');
+console.log('\n=== MODEL DISTRIBUTION ===');
+for (const [m, c] of Object.entries(modelStats).sort((a,b) => b[1]-a[1])) { // most-used model first
+ console.log(` ${m}: ${c}`);
+}
+if (issues.length > 0) {
+ console.log('\n=== ISSUES ===');
+ issues.forEach(i => console.log(' ⚠️', i));
+}
+if (nonOllama.length > 0) {
+ console.log('\n=== NON-OLLAMA ===');
+ nonOllama.forEach(n => console.log(' ❌', n.type, n.name, n.model));
+}
+console.log('\n✅ State written to agent-evolution/data/real-state.json');
diff --git a/agent-evolution/scripts/merge-real-fit.cjs b/agent-evolution/scripts/merge-real-fit.cjs
new file mode 100644
index 0000000..6477896
--- /dev/null
+++ b/agent-evolution/scripts/merge-real-fit.cjs
@@ -0,0 +1,29 @@
+const fs = require('fs');
+const path = require('path');
+
+// Input and output locations; the dashboard file is read and then overwritten.
+const DASH = path.join(__dirname, '../data/dashboard-data.json');
+const REAL = path.join(__dirname, '../data/real-fit-report.json');
+const OUT = path.join(__dirname, '../data/dashboard-data.json');
+
+const loadJson = (file) => JSON.parse(fs.readFileSync(file, 'utf-8'));
+const dash = loadJson(DASH);
+const real = loadJson(REAL);
+
+// Copy the measured evaluations onto each dashboard agent; agents without
+// measurements get an empty real_evaluations map.
+for (const agent of dash.agents) {
+  const measured = real.agents?.[agent.name];
+  if (!measured || !measured.evaluations) {
+    agent.real_evaluations = {};
+    continue;
+  }
+  agent.real_evaluations = measured.evaluations;
+  agent.real_best_model = measured.best_model;
+  agent.real_best_score = measured.best_score;
+}
+
+// Record where the real-fit numbers came from
+dash.real_fit_generated = real.generated;
+dash.real_fit_source = real.source;
+
+fs.writeFileSync(OUT, JSON.stringify(dash, null, 2));
+console.log('Merged real-fit data into ' + OUT);
+const withRealEvals = dash.agents.filter(
+  (agent) => Object.keys(agent.real_evaluations || {}).length > 0
+);
+console.log('Agents with real evals:', withRealEvals.length);
diff --git a/agent-evolution/scripts/patch-heatmap.js b/agent-evolution/scripts/patch-heatmap.js
new file mode 100644
index 0000000..211767f
--- /dev/null
+++ b/agent-evolution/scripts/patch-heatmap.js
@@ -0,0 +1,98 @@
+const fs = require('fs');
+const path = require('path');
+
+// Path of the standalone dashboard page; this script reads it, patches it and
+// writes it back in place.
+const INDEX = path.join(__dirname, '../index.standalone.html');
+
+// 1. New renderHeatmap that reads real-fit data.
+// Held as source text (a template literal), not as a live function: the patch
+// step further down splices this string into the page in place of the existing
+// renderHeatmap(). In the embedded code, per-cell scores come from
+// window.realFitData.agents[name].evaluations[model]; a cell without a positive
+// real score shows the agent's dashboard fit_score when the column is the
+// agent's current model, and '·' otherwise.
+// NOTE(review): the HTML inside the string literals below looks stripped or
+// entity-decoded in this copy (esc() maps each character to itself and ends in
+// an unterminated quote, and cls / curStyle are computed but never emitted) —
+// confirm against the committed file before relying on it.
+const newRenderHeatmap = `function renderHeatmap() {
+ const esc = str => (str || '').replace(/[&<>"']/g, m => ({'&':'&','<':'<','>':'>','"':'"',"'":'''}[m]));
+ const dd = window.dashboardData;
+
+ // Merge real-fit if loaded
+ const rf = window.realFitData || {};
+ const realAgents = rf.agents || {};
+
+ if (!dd || !dd.agents) {
+ document.getElementById('hmTable').innerHTML = '| ⚠️ No data. Run analysis. |
';
+ return;
+ }
+
+ // Build model list from real-fit (cross-model) + current dashboard data
+ const modelsSeen = new Set();
+ dd.agents.forEach(a => { modelsSeen.add(a.model_short); });
+ Object.values(realAgents).forEach(a => { Object.keys(a.evaluations || {}).forEach(m => modelsSeen.add(m)); });
+ const modelList = Array.from(modelsSeen).filter(m => m && m !== 'code-skeptic');
+
+ const t = document.getElementById('hmTable');
+ let h = '| Agent | ';
+ modelList.forEach(m => {
+ h += '' + esc(m) + ' | ';
+ });
+ h += 'Best | Score |
';
+
+ dd.agents.forEach(a => {
+ const realAgent = realAgents[a.name];
+ h += '| ' + esc(a.name) + ' | ';
+ modelList.forEach(m => {
+ let score = 0;
+ if (realAgent && realAgent.evaluations && realAgent.evaluations[m] > 0) {
+ score = Math.round(realAgent.evaluations[m]);
+ }
+ const isCurrent = a.model_short === m;
+ let cls = 'na';
+ if (score >= 90) cls = 'high';
+ else if (score >= 75) cls = 'good';
+ else if (score >= 50) cls = 'med';
+ else if (score > 0) cls = 'low';
+ const display = score > 0 ? score : (isCurrent ? Math.round(a.fit_score || 0) : '·');
+ const curStyle = isCurrent ? 'box-shadow:inset 0 0 0 2px var(--accent-cyan)' : '';
+ h += '' + display + ' | ';
+ });
+ const bestModel = realAgent ? (realAgent.best_model || a.model_short) : a.model_short;
+ const bestScore = realAgent ? Math.round(realAgent.best_score || 0) : Math.round(a.fit_score || 0);
+ h += '' + esc(bestModel) + ' | ' + bestScore + ' |
';
+ });
+ t.innerHTML = h + '';
+}`;
+
+// 2. Add loadRealFitData script after dashboard load.
+// Also source text: the snippet fetches data/real-fit-report.json into
+// window.realFitData and only warns if the fetch throws. It uses `await`, so
+// the insertion point must sit inside an async function — presumably init();
+// verify in the page.
+const loadRealFitData = `
+ // Load real-fit report for cross-model evaluation
+ try {
+ const rfRes = await fetch('data/real-fit-report.json');
+ if (rfRes.ok) window.realFitData = await rfRes.json();
+ } catch(e) { console.warn('real-fit-report.json not loaded:', e.message); }
+`;
+
+let html = fs.readFileSync(INDEX, 'utf-8');
+
+// Patch A: replace renderHeatmap function.
+// The marker comment is written together with the new function, so a re-run
+// can tell the page is already patched. Without this guard the regex below
+// would match only a prefix of the already-inserted function on a second run
+// and leave its tail dangling in the page.
+const PATCHED_MARKER = '// Render Heatmap (real-fit enabled)';
+if (html.includes(PATCHED_MARKER)) {
+  console.log('renderHeatmap already patched, skipping');
+} else {
+  const oldPattern = /\/\/ Render Heatmap[\s\S]*?function renderHeatmap\(\)\s*\{[^}]*\{[^}]*\}[^}]*\}/;
+  const oldMatch = html.match(oldPattern);
+  if (oldMatch) {
+    html = html.substring(0, oldMatch.index) + PATCHED_MARKER + '\n' + newRenderHeatmap + html.substring(oldMatch.index + oldMatch[0].length);
+    console.log('Patched renderHeatmap');
+  } else {
+    console.log('Pattern A not found, trying fallback...');
+    // Fallback: locate the function header and find its closing brace by
+    // counting braces. The count is purely textual: braces inside strings or
+    // comments are counted too.
+    const start = html.indexOf('function renderHeatmap() {');
+    let end = -1; // stays -1 when the header is missing or braces never balance
+    if (start !== -1) {
+      let depth = 0;
+      for (let i = start; i < html.length; i++) {
+        if (html[i] === '{') depth++;
+        else if (html[i] === '}') { depth--; if (depth === 0) { end = i + 1; break; } }
+      }
+    }
+    if (end !== -1) {
+      html = html.substring(0, start) + PATCHED_MARKER + '\n' + newRenderHeatmap + '\n' + html.substring(end);
+      console.log('Patched renderHeatmap (fallback)');
+    } else {
+      // Previously an unbalanced scan inserted a second copy of the function
+      // and a missing header was skipped silently.
+      console.warn('renderHeatmap not found or braces unbalanced; heatmap left unchanged');
+    }
+  }
+}
+
+// Patch B: insert real-fit loading after dashboard load (once only).
+const dashLoadPattern = /window\.dashboardData = await dashRes\.json\(\);/;
+if (html.includes("fetch('data/real-fit-report.json')")) {
+  console.log('real-fit loader already present, skipping');
+} else if (dashLoadPattern.test(html)) {
+  // Function replacement: the inserted text is taken literally, so any `$`
+  // sequences in it are never interpreted as replacement patterns.
+  html = html.replace(dashLoadPattern, (matched) => matched + '\n' + loadRealFitData.trim());
+  console.log('Patched init() to load real-fit data');
+} else {
+  console.warn('dashboard load statement not found; real-fit loader not inserted');
+}
+
+fs.writeFileSync(INDEX, html);
+console.log('Done — ' + (fs.statSync(INDEX).size / 1024).toFixed(1) + ' KB');
diff --git a/agent-evolution/scripts/rebuild-report.py b/agent-evolution/scripts/rebuild-report.py
new file mode 100644
index 0000000..4bb4df8
--- /dev/null
+++ b/agent-evolution/scripts/rebuild-report.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""
+Rebuild real-fit-report.json from SQLite DB.
+
+Usage:
+ python3 rebuild-report.py
+ python3 rebuild-report.py --db /path/to/real-fit.db --report /path/to/real-fit-report.json
+"""
+
+import argparse
+import json
+import sqlite3
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+
+
+def _sync_agents_from_meta(db_path: Path) -> None:
+    """Insert agents listed in kilo-meta.json that are missing from the DB.
+
+    The meta file is looked up at ``db_path.parent.parent.parent /
+    "kilo-meta.json"``; when it does not exist the function returns without
+    touching the DB. Rows already present in the ``agents`` table are never
+    modified.
+
+    Args:
+        db_path: Path to the SQLite database containing the ``agents`` table.
+
+    Raises:
+        sqlite3.Error: If the DB cannot be queried (e.g. no ``agents`` table).
+        json.JSONDecodeError: If the meta file is not valid JSON.
+    """
+    meta_path = db_path.parent.parent.parent / "kilo-meta.json"
+    if not meta_path.exists():
+        return
+    with open(meta_path, encoding="utf-8") as f:
+        meta = json.load(f)
+
+    # ``or {}`` also covers an explicit ``"agents": null`` in the meta file.
+    meta_agents = meta.get("agents") or {}
+
+    conn = sqlite3.connect(str(db_path))
+    try:
+        existing = {row[0] for row in conn.execute("SELECT name FROM agents")}
+        updated = datetime.now(timezone.utc).isoformat()
+        new_rows = [
+            (
+                name,
+                info.get("description", ""),
+                info.get("category", "meta"),
+                info.get("model", ""),
+                info.get("color", "#6B7280"),
+                updated,
+            )
+            for name, info in meta_agents.items()
+            if name not in existing
+        ]
+        # OR IGNORE keeps the insert harmless if a row appears concurrently.
+        conn.executemany(
+            "INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
+            new_rows,
+        )
+        conn.commit()
+    finally:
+        # Close even when a query fails so the DB handle is not leaked.
+        conn.close()
+
+
+def build_report(db_path: Path) -> dict:
+    """Assemble the real-fit report from the SQLite DB at ``db_path``.
+
+    Agents are first synced from kilo-meta.json. Evaluations are then read
+    with these filters: ``total_score > 0``, evaluator not matching
+    ``rubric_v1``, and a response that is either NULL or non-empty without an
+    ``[HTTP `` marker.
+
+    Args:
+        db_path: Path to the SQLite database.
+
+    Returns:
+        A dict with the keys ``generated`` (UTC timestamp string), ``source``,
+        ``total_evaluations`` (number of agent/model pairs kept), ``agents``
+        (per agent: ``evaluations`` map, ``info`` list, ``best_model``,
+        ``best_score``) and ``fit_scores`` (per agent: the model with the
+        highest qualifying score).
+
+    Raises:
+        sqlite3.Error: If the expected tables or columns are missing.
+    """
+    _sync_agents_from_meta(db_path)
+    conn = sqlite3.connect(str(db_path))
+    conn.row_factory = sqlite3.Row
+    try:
+        cursor = conn.cursor()
+
+        cursor.execute("""
+            SELECT name, description, category, current_model
+            FROM agents
+        """)
+        agents_meta = {row["name"]: dict(row) for row in cursor.fetchall()}
+
+        # Skip HTTP error responses and rubric_v1 results entirely; within an
+        # agent/model pair, order by evaluator preference
+        # (evolution-skeptic, then rubric_v2, then anything else) and score.
+        cursor.execute("""
+            SELECT agent_name, model, total_score, evaluator, response
+            FROM evaluations
+            WHERE total_score > 0
+              AND evaluator NOT LIKE '%rubric_v1%'
+              AND (response IS NULL
+                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
+            ORDER BY agent_name, model,
+                CASE evaluator
+                    WHEN 'evolution-skeptic' THEN 0
+                    WHEN 'rubric_v2' THEN 1
+                    ELSE 2
+                END,
+                total_score DESC
+        """)
+
+        # Keep the first row (preferred evaluator, highest score) per pair.
+        best_evals = {}
+        for row in cursor.fetchall():
+            per_model = best_evals.setdefault(row["agent_name"], {})
+            per_model.setdefault(row["model"], row["total_score"])
+
+        # Highest qualifying score per agent/model pair, any evaluator.
+        cursor.execute("""
+            SELECT agent_name, model, MAX(total_score) as best_score, scores, explanation
+            FROM evaluations
+            WHERE total_score > 0
+              AND evaluator NOT LIKE '%rubric_v1%'
+              AND (response IS NULL
+                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
+            GROUP BY agent_name, model
+        """)
+        fit_scores = {}
+        for row in cursor.fetchall():
+            agent_name = row["agent_name"]
+            current = fit_scores.get(agent_name)
+            # One entry per agent: only a strictly better model replaces it.
+            # Previously every group overwrote the entry, so whichever model
+            # came last was reported as "best".
+            if current is not None and current["fit"] >= row["best_score"]:
+                continue
+            fit_scores[agent_name] = {
+                "model": row["model"],
+                "fit": row["best_score"],
+                "explanation": (
+                    # The value is a MAX(), so the text says "best", not "avg".
+                    f"Best model for {agent_name} is {row['model']} "
+                    f"with best score {row['best_score']:.1f}. "
+                    "Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
+                ),
+            }
+    finally:
+        # Release the DB handle even if a query fails.
+        conn.close()
+
+    agents_report = {}
+    for agent_name, meta in agents_meta.items():
+        evals = best_evals.get(agent_name, {})
+        if evals:
+            best_model = max(evals, key=evals.get)
+            best_score = evals[best_model]
+        else:
+            # Agent known to the DB but without any usable evaluation.
+            best_model = ""
+            best_score = 0.0
+        agents_report[agent_name] = {
+            "name": agent_name,
+            "evaluations": evals,
+            "info": [
+                meta.get("description") or "",
+                meta.get("category") or "",
+                meta.get("current_model") or "",
+            ],
+            "best_model": best_model,
+            "best_score": best_score,
+        }
+
+    total_evals = sum(len(evals) for evals in best_evals.values())
+    generated = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+
+    return {
+        "generated": generated,
+        "source": "real-fit-engine-db-filtered",
+        "total_evaluations": total_evals,
+        "agents": agents_report,
+        "fit_scores": fit_scores,
+    }
+
+
+def main():
+    """CLI entry point: rebuild the report JSON from the DB and print a summary.
+
+    ``--db`` and ``--report`` default to ``real-fit.db`` and
+    ``real-fit-report.json`` in the ``data`` directory next to this script's
+    parent directory.
+    """
+    data_dir = Path(__file__).parent.parent / "data"
+
+    cli = argparse.ArgumentParser(description="Rebuild real-fit-report.json from DB")
+    cli.add_argument("--db", type=Path, default=data_dir / "real-fit.db", help="Path to SQLite DB")
+    cli.add_argument(
+        "--report",
+        type=Path,
+        default=data_dir / "real-fit-report.json",
+        help="Path to report JSON output",
+    )
+    options = cli.parse_args()
+
+    payload = build_report(options.db)
+
+    # Create the output directory on demand, then write the report in one go.
+    out_path = options.report
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
+
+    agent_count = len(payload["agents"])
+    print(f"Report rebuilt: {out_path}")
+    print(f"Agents: {agent_count}, Evaluations: {payload['total_evaluations']}")
+
+
+if __name__ == "__main__":
+ main()