diff --git a/.gitignore b/.gitignore
index 10dcd50..845da4e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,8 @@ __pycache__/
 agent-evolution/data/dashboard-data.json
 agent-evolution/data/state.json
 agent-evolution/data/model-benchmarks.json.bak
+*.db
+research-jobs.json
 landing/api/state.json
 landing/api/state.json.bak
 landing/api/dashboard-data.json
diff --git a/agent-evolution/api.py b/agent-evolution/api.py
new file mode 100644
index 0000000..6fd4b28
--- /dev/null
+++ b/agent-evolution/api.py
@@ -0,0 +1,473 @@
+"""
+Evolution Research API — FastAPI backend for agent-model evaluation jobs.
+
+Endpoints:
+  POST /api/research → start background evaluation job
+  GET /api/research/{id} → job status & results
+  POST /api/research/cell → evaluate single agent-model pair
+  GET /api/real-fit-report → serve real-fit-report.json (live from DB)
+  GET /api/models → list available models
+  GET /api/evaluation/{agent}/{model} → detailed evaluation record
+  POST /api/evolve-agent/start → start role-fit testing job (evolution-prompt + evolution-skeptic)
+"""
+
+import json
+import os
+import sqlite3
+import subprocess
+import time
+import uuid
+from contextlib import closing
+from datetime import datetime, timezone
+from pathlib import Path
+
+from fastapi import FastAPI, HTTPException
+from fastapi.responses import JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+
+app = FastAPI(title="Evolution Research API", version="1.1.0")
+
+# NOTE(review): wildcard origins together with allow_credentials=True is a very
+# permissive CORS policy — confirm it is intended before adding authentication.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+JOB_STATE_PATH = Path(os.environ.get("JOB_STATE_PATH", "/app/data/research-jobs.json"))
+REPORT_PATH = Path(os.environ.get("REPORT_PATH", "/app/data/real-fit-report.json"))
+META_PATH = Path(os.environ.get("META_PATH", "/app/kilo-meta.json"))
+EVOLUTION_PATH = Path(os.environ.get("EVOLUTION_PATH", "/app/data/evolution.json"))
+ENGINE_PATH = Path(os.environ.get("ENGINE_PATH", "/app/scripts/real-fit-engine.py"))
+DB_PATH = Path(os.environ.get("REAL_FIT_DB", REPORT_PATH.parent / "real-fit.db"))
+
+# Shared SELECT prefix for the evaluation lookups in get_evaluation().
+_EVALUATION_SELECT = """
+    SELECT e.id, e.agent_name, e.model, e.prompt_id,
+           e.response, e.scores, e.total_score, e.explanation,
+           e.evaluator, e.latency_ms, e.tokens_prompt, e.tokens_response, e.evaluated_at
+    FROM evaluations e
+    WHERE e.agent_name = ? AND e.model = ?
+"""
+
+# Shared SELECT prefix for the prompt lookups in get_evaluation().
+_PROMPT_SELECT = "SELECT system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts"
+
+# Script run by the detached job-runner process. It receives the job id, the
+# job-state path, the engine path, the agent and the model list via sys.argv.
+_JOB_RUNNER_SCRIPT = """
+import json, os, subprocess, sys, time, uuid
+
+job_id, job_state_path, engine_path, agent, model_arg = sys.argv[1:6]
+
+def load_jobs():
+    try:
+        with open(job_state_path, encoding='utf-8') as f:
+            return json.load(f)
+    except Exception:
+        return {}
+
+def save_jobs(jobs):
+    # Write-then-rename so the API never reads a half-written file.
+    tmp = job_state_path + '.' + uuid.uuid4().hex + '.tmp'
+    with open(tmp, 'w', encoding='utf-8') as f:
+        json.dump(jobs, f, indent=2)
+    os.replace(tmp, job_state_path)
+
+def update_job(**fields):
+    jobs = load_jobs()
+    job = jobs.get(job_id)
+    if job:
+        job.update(fields)
+        job['updated_at'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
+        save_jobs(jobs)
+
+update_job(status='running')
+
+cmd = ['python3', engine_path, '--evaluate', agent, '--models', model_arg, '--report']
+try:
+    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+    returncode, stdout, stderr = proc.returncode, proc.stdout, proc.stderr
+except OSError as exc:
+    # The engine could not be started; record that instead of leaving the job 'running'.
+    returncode, stdout, stderr = -1, '', str(exc)
+
+update_job(
+    status='done' if returncode == 0 else 'error',
+    progress=100,
+    result={'returncode': returncode, 'stdout': stdout, 'stderr': stderr},
+)
+"""
+
+
+def _utc_timestamp() -> str:
+    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
+    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
+
+
+def _load_json(path: Path) -> dict:
+    """Return the JSON document at *path*, or ``{}`` when the file does not exist."""
+    if not path.exists():
+        return {}
+    with open(path, "r", encoding="utf-8") as f:
+        return json.load(f)
+
+
+def _save_json(path: Path, data: dict) -> None:
+    """Write *data* to *path* as indented JSON, replacing the file atomically.
+
+    The document is written to a uniquely named sibling file and renamed over
+    *path*, so the job poller and the background job runner, which both read
+    and rewrite the job-state file, never observe a half-written document.
+    """
+    path.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
+    try:
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+        os.replace(tmp_path, path)
+    except BaseException:
+        tmp_path.unlink(missing_ok=True)
+        raise
+
+
+def _load_jobs() -> dict:
+    """Return the persisted job table keyed by job id."""
+    return _load_json(JOB_STATE_PATH)
+
+
+def _save_jobs(jobs: dict) -> None:
+    """Persist the job table to JOB_STATE_PATH."""
+    _save_json(JOB_STATE_PATH, jobs)
+
+
+class ResearchRequest(BaseModel):
+    """Body of POST /api/research."""
+
+    agent: str
+    models: list[str]
+
+
+class CellRequest(BaseModel):
+    """Body of POST /api/research/cell."""
+
+    agent: str
+    model: str
+
+
+class EvolveAgentRequest(BaseModel):
+    """Body of POST /api/evolve-agent/start."""
+
+    agent: str
+    models: list[str]
+
+
+def _spawn_engine_job(job_id: str, agent: str, models: list[str]) -> None:
+    """Spawn real-fit-engine.py as a background subprocess to evaluate models.
+
+    The detached runner marks the job ``running``, invokes the engine with
+    ``--evaluate AGENT --models M1,M2 --report`` and then stores the return
+    code and captured output on the job record.
+    """
+    subprocess.Popen(
+        [
+            "python3",
+            "-c",
+            _JOB_RUNNER_SCRIPT,
+            job_id,
+            str(JOB_STATE_PATH),
+            str(ENGINE_PATH),
+            agent,
+            ",".join(models),
+        ],
+        stdout=subprocess.DEVNULL,
+        stderr=subprocess.DEVNULL,
+    )
+
+
+def _create_job(agent: str, models: list[str], job_type: str | None = None) -> str:
+    """Persist a new pending job, start its background runner and return its id."""
+    job_id = str(uuid.uuid4())
+    now = _utc_timestamp()
+    job = {"id": job_id}
+    if job_type is not None:
+        job["type"] = job_type
+    job.update(
+        {
+            "agent": agent,
+            "models": models,
+            "status": "pending",
+            "progress": 0,
+            "result": None,
+            "created_at": now,
+            "updated_at": now,
+        }
+    )
+    jobs = _load_jobs()
+    jobs[job_id] = job
+    _save_jobs(jobs)
+    _spawn_engine_job(job_id, agent, models)
+    return job_id
+
+
+@app.get("/api/models")
+def get_models():
+    """List every model named in kilo-meta.json or in the evolution data."""
+    models = set()
+    for agent in _load_json(META_PATH).get("agents", {}).values():
+        models.add(agent.get("model", ""))
+    for agent_data in _load_json(EVOLUTION_PATH).get("agents", {}).values():
+        models.add(agent_data.get("current", {}).get("model", ""))
+        models.update(rec.get("model", "") for rec in agent_data.get("recommendations", []))
+    # Skip agents that have no model configured.
+    return {"models": sorted(m for m in models if m)}
+
+
+@app.get("/api/evaluation/{agent}/{model}")
+def get_evaluation(agent: str, model: str):
+    """Return the best evaluation for an agent-model pair together with its prompt.
+
+    The highest-scoring evaluation is preferred; when none has a positive score
+    the most recent evaluation is returned instead. Raises a 404 HTTPException
+    when the database or the evaluation is missing.
+    """
+    if not DB_PATH.exists():
+        raise HTTPException(status_code=404, detail="Evaluation database not found")
+
+    # closing() releases the connection even when a query or the 404 below raises.
+    with closing(sqlite3.connect(str(DB_PATH))) as conn:
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        # Step 1: best-scoring evaluation, falling back to the latest one of any score.
+        cursor.execute(
+            _EVALUATION_SELECT
+            + " AND e.total_score > 0 ORDER BY e.total_score DESC, e.id DESC LIMIT 1",
+            (agent, model),
+        )
+        row = cursor.fetchone()
+        if row is None:
+            cursor.execute(_EVALUATION_SELECT + " ORDER BY e.id DESC LIMIT 1", (agent, model))
+            row = cursor.fetchone()
+        if row is None:
+            raise HTTPException(status_code=404, detail="Evaluation not found for this agent-model pair")
+        result = dict(row)
+
+        # Step 2: prompt by prompt_id; fall back to the agent's newest prompt when
+        # the id is missing or its row has no system prompt.
+        prompt = None
+        if result.get("prompt_id"):
+            cursor.execute(_PROMPT_SELECT + " WHERE id = ?", (result["prompt_id"],))
+            prompt = cursor.fetchone()
+            if prompt is not None and not prompt["system_prompt"]:
+                prompt = None
+        if prompt is None:
+            cursor.execute(
+                _PROMPT_SELECT + " WHERE agent_name = ? ORDER BY id DESC LIMIT 1",
+                (agent,),
+            )
+            prompt = cursor.fetchone()
+
+    result["system_prompt"] = (prompt["system_prompt"] or "") if prompt is not None else ""
+    result["user_prompt"] = (prompt["user_prompt"] or "") if prompt is not None else ""
+    result["expected_keywords"] = (prompt["expected_keywords"] or "[]") if prompt is not None else "[]"
+    result["rubric"] = (prompt["rubric"] or "{}") if prompt is not None else "{}"
+
+    # JSON columns are stored as text; decode them, defaulting on bad or missing data.
+    for key in ("expected_keywords", "rubric", "scores"):
+        default = [] if key == "expected_keywords" else {}
+        raw = result.get(key)
+        if raw is None:
+            result[key] = default
+        elif isinstance(raw, str):
+            try:
+                result[key] = json.loads(raw)
+            except json.JSONDecodeError:
+                result[key] = default
+
+    return result
+
+
+def _sync_agents_from_meta(db_path: Path, meta_path: Path | None = None) -> None:
+    """Import any missing agents from kilo-meta.json into the DB agents table.
+
+    When *meta_path* is omitted it defaults to ``db_path.parent.parent.parent /
+    "kilo-meta.json"``. Nothing happens if the file does not exist.
+    """
+    if meta_path is None:
+        meta_path = db_path.parent.parent.parent / "kilo-meta.json"
+    if not meta_path.exists():
+        return
+    with open(meta_path, encoding="utf-8") as f:
+        meta = json.load(f)
+
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        cursor = conn.cursor()
+        cursor.execute("SELECT name FROM agents")
+        existing = {r[0] for r in cursor.fetchall()}
+        updated = datetime.now(timezone.utc).isoformat()
+        new_rows = [
+            (
+                name,
+                info.get("description", ""),
+                info.get("category", "meta"),
+                info.get("model", ""),
+                info.get("color", "#6B7280"),
+                updated,
+            )
+            for name, info in meta.get("agents", {}).items()
+            if name not in existing
+        ]
+        cursor.executemany(
+            "INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
+            new_rows,
+        )
+        conn.commit()
+
+
+def _build_report_from_db(db_path: Path) -> dict:
+    """Build real-fit report dynamically from SQLite DB (filtered, objective).
+
+    Evaluations with a non-positive score, the rubric_v1 evaluator, or an empty
+    or HTTP-error response are ignored. For each agent-model pair the score of
+    the most preferred evaluator (evolution-skeptic, then rubric_v2, then any
+    other) is used, taking the highest score from that evaluator.
+    """
+    # Prefer the configured kilo-meta.json; otherwise keep the lookup relative to the DB.
+    _sync_agents_from_meta(db_path, META_PATH if META_PATH.exists() else None)
+
+    with closing(sqlite3.connect(str(db_path))) as conn:
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        cursor.execute("SELECT name, description, category, current_model FROM agents")
+        agents_meta = {row["name"]: dict(row) for row in cursor.fetchall()}
+
+        cursor.execute("""
+            SELECT agent_name, model, total_score
+            FROM evaluations
+            WHERE total_score > 0
+              AND evaluator NOT LIKE '%rubric_v1%'
+              AND (response IS NULL
+                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
+            ORDER BY agent_name, model,
+                     CASE evaluator
+                         WHEN 'evolution-skeptic' THEN 0
+                         WHEN 'rubric_v2' THEN 1
+                         ELSE 2
+                     END,
+                     total_score DESC
+        """)
+        # Rows are ordered best-first within each pair, so the first one seen wins.
+        best_evals = {}
+        for row in cursor.fetchall():
+            best_evals.setdefault(row["agent_name"], {}).setdefault(row["model"], row["total_score"])
+
+    # Best model per agent, derived from the same selection as the report body so
+    # fit_scores can never disagree with agents[...]["best_model"].
+    fit_scores = {}
+    for agent_name, evals in best_evals.items():
+        top_model = max(evals, key=evals.get)
+        fit_scores[agent_name] = {
+            "model": top_model,
+            "fit": evals[top_model],
+            "explanation": (
+                f"Best model for {agent_name} is {top_model} "
+                f"with best score {evals[top_model]:.1f}. "
+                "Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
+            ),
+        }
+
+    agents_report = {}
+    for agent_name, meta in agents_meta.items():
+        evals = best_evals.get(agent_name, {})
+        best_model = max(evals, key=evals.get) if evals else ""
+        agents_report[agent_name] = {
+            "name": agent_name,
+            "evaluations": evals,
+            "info": [
+                meta.get("description") or "",
+                meta.get("category") or "",
+                meta.get("current_model") or "",
+            ],
+            "best_model": best_model,
+            "best_score": evals[best_model] if evals else 0.0,
+        }
+
+    return {
+        "generated": _utc_timestamp(),
+        "source": "real-fit-engine-db-filtered",
+        "total_evaluations": sum(len(evals) for evals in best_evals.values()),
+        "agents": agents_report,
+        "fit_scores": fit_scores,
+    }
+
+
+@app.get("/api/real-fit-report")
+def get_real_fit_report():
+    """Serve the report built live from the DB, or the JSON file when no DB exists."""
+    if DB_PATH.exists():
+        return _build_report_from_db(DB_PATH)
+    return _load_json(REPORT_PATH)
+
+
+@app.post("/api/research")
+def start_research(req: ResearchRequest):
+    """Queue a background evaluation of one agent against several models."""
+    job_id = _create_job(req.agent, req.models)
+    return {"job_id": job_id, "status": "pending", "agent": req.agent, "models": req.models}
+
+
+def _extract_scores_from_report(agent: str, models: list[str]) -> list[dict]:
+    """Read real-fit-report.json and return scores for agent x models."""
+    evaluations = _load_json(REPORT_PATH).get("agents", {}).get(agent, {}).get("evaluations", {})
+    results = []
+    for m in models:
+        score = evaluations.get(m, 0)
+        # Models absent from the report default to 0 and are flagged as pending.
+        results.append({"model": m, "score": score, "pending": score == 0})
+    return results
+
+
+@app.get("/api/research/{job_id}")
+def get_research(job_id: str):
+    """Return a job record; finished jobs also carry per-model scores from the report."""
+    job = _load_jobs().get(job_id)
+    if not job:
+        raise HTTPException(status_code=404, detail="Job not found")
+    if job.get("status") == "done" and job.get("result") is not None:
+        job["models_scored"] = _extract_scores_from_report(job["agent"], job.get("models", []))
+    return job
+
+
+@app.post("/api/research/cell")
+def research_cell(req: CellRequest):
+    """Queue a background evaluation of a single agent-model pair."""
+    job_id = _create_job(req.agent, [req.model])
+    return {"job_id": job_id, "status": "pending", "agent": req.agent, "model": req.model}
+
+
+@app.post("/api/evolve-agent/start")
+def start_evolve_agent(req: EvolveAgentRequest):
+    """Start a role-fit evaluation job using evolution-prompt and evolution-skeptic.
+
+    For now, this places a job in the queue that will be picked up by the real-fit-engine.
+    In the full implementation:
+    1. evolution-prompt generates role-specific stress-test prompts from agent definition
+    2. Each model in models list is tested with the same prompt
+    3. evolution-skeptic evaluates each response with per-dimension rubric scoring
+    4. Results are stored in SQLite and report is regenerated
+    """
+    # Placeholder: spawn the same engine job with evolve-agent type
+    # In full implementation, this would spawn a script that:
+    # 1. Reads agent definition from .kilo/agents/{agent}.md
+    # 2. Calls Ollama API for evolution-prompt to generate test prompts
+    # 3. For each model: calls Ollama API, stores response
+    # 4. Calls Ollama API for evolution-skeptic to evaluate
+    # 5. Stores results in SQLite, rebuilds report
+    job_id = _create_job(req.agent, req.models, job_type="evolve-agent")
+    return {"job_id": job_id, "status": "pending", "agent": req.agent, "models": req.models}
\ No newline at end of file
diff --git a/agent-evolution/archive/index.html b/agent-evolution/archive/index.html
new file mode 100644
index 0000000..7174e12
--- /dev/null
+++ b/agent-evolution/archive/index.html
@@ -0,0 +1,7031 @@
+ + + + + + APAW Agent Evolution Dashboard + + + + + + +
+
+

APAW Agent Evolution

+
Real-time agent model & performance tracking
+
+ Loading... + + 0 agents + + 0 with history +
+
+ +
+ + + + + + +
+ + +
+
+ +
+
+

Recent Changes

+ 0 +
+
+
+
+
+ +
+
+

Pending Recommendations

+ 0 +
+
+
+
+ + +
+ +
+ + + + + + + +
+
+
+ + +
+
+

Evolution Timeline

+
+
+
+ + +
+
+ + + +
+
+
+ + +
+
+
Agent × Model Compatibility Heatmap
+
Weighted score = benchmark × instruction-following multiplier · ★ = best fit · outlined = current · click for details
+
+
+
+
+ 100806040200 +
+
+ ↑ Ideal Match + Mismatch ↓ +
+
+
+
+ + +
+
+ + +
+
Agent Performance Scores
+
Composite score per agent based on model benchmarks
+
+
+ + +
+
+
Model Distribution
+
Agents per model
+
+
+
+
Migration Impact
+
Before vs after model change score
+
+
+
+
+
+ + + + + + + + +
+
+
Applying Fixes...
+
+
+
+
Preparing...
+
+

+ +
+
+
+ + + + + +
+ + + + + + + + + + \ No newline at end of file diff --git a/agent-evolution/archive/tests/screenshot-dash.cjs b/agent-evolution/archive/tests/screenshot-dash.cjs new file mode 100644 index 0000000..b9b5e3f --- /dev/null +++ b/agent-evolution/archive/tests/screenshot-dash.cjs @@ -0,0 +1,11 @@ +const { chromium } = require('playwright'); +const fs = require('fs'); +(async () => { + const browser = await chromium.launch({ headless: true, args: ['--no-sandbox'] }); + const page = await browser.newPage({ viewport: { width: 1280, height: 720 } }); + await page.goto('http://host.docker.internal:3003', { waitUntil: 'domcontentloaded', timeout: 30000 }); + await page.waitForTimeout(2000); + await page.screenshot({ path: '/app/tests/visual/current/dashboard_landing.png', fullPage: false }); + await browser.close(); + console.log('Screenshot saved to /app/tests/visual/current/dashboard_landing.png'); +})(); diff --git a/agent-evolution/data/real-fit-report.json b/agent-evolution/data/real-fit-report.json index f57d4ae..10fd951 100644 --- a/agent-evolution/data/real-fit-report.json +++ b/agent-evolution/data/real-fit-report.json @@ -1,689 +1,768 @@ { - "generated": "2026-05-27T18:36:13.173821+00:00", + "generated": "2026-05-28T10:48:02.581965+00:00", "source": "real-fit-engine", - "total_evaluations": 102, + "total_evaluations": 147, "agents": { "agent-architect": { "name": "agent-architect", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - "qwen3-coder:480b": 41.6 + "deepseek-v4-pro": 48.3, + "glm-5.1": 48.3, + "kimi-k2.6": 53.5, + "qwen3-coder:480b": 48.3 }, "info": [ "Creates, modifies, and reviews new agents, workflows, and skills based on capability gap analysis. 
Tier 2 meta-agent with self-cascade enabled.", "meta", "ollama-cloud/kimi-k2.6" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "kimi-k2.6", + "best_score": 53.5 }, "architect-indexer": { "name": "architect-indexer", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 43.2, + "glm-5.1": 48.6, + "kimi-k2.6": 46.5, + "qwen3-coder:480b": 54.0 }, "info": [ "Indexes and maps project codebase architecture into .architect/ directory. Creates and maintains structured documentation of entities, APIs, DB schema, file graphs, and conventions. (GNS-2 Tier 0)", "core", "ollama-cloud/glm-5.1" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "qwen3-coder:480b", + "best_score": 54.0 }, "backend-developer": { "name": "backend-developer", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 53.5, + "glm-5.1": 48.3, + "kimi-k2.6": 48.3, + "qwen3-coder:480b": 43.2 }, "info": [ "Backend specialist for Node.js, Express, APIs, and database integration (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "deepseek-v4-pro", + "best_score": 53.5 }, "browser-automation": { "name": "browser-automation", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 42.8, + "glm-5.1": 53.3, + "kimi-k2.6": 63.8, + "qwen3-coder:480b": 48.9 }, "info": [ "Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction (GNS-2 Tier 0)", "testing", "ollama-cloud/deepseek-v4-flash" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "kimi-k2.6", + "best_score": 63.8 }, "capability-analyst": { "name": "capability-analyst", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - 
"qwen3-coder:480b": 41.6 + "deepseek-v4-pro": 58.7, + "glm-5.1": 53.5, + "kimi-k2.6": 58.7, + "qwen3-coder:480b": 52.3 }, "info": [ "Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. Tier 2 meta-agent with self-cascade enabled.", "meta", "ollama-cloud/deepseek-v4-pro-max" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "deepseek-v4-pro", + "best_score": 58.7 }, "code-skeptic": { "name": "code-skeptic", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 22.8, + "glm-5.1": 89.1, + "kimi-k2.6": 91.2, + "minimax-m2.5": 45.0, + "qwen3-coder:480b": 90.6 }, "info": [ "Adversarial code reviewer. Finds problems and issues. Does NOT suggest implementations (GNS-2 Tier 0)", "quality", "ollama-cloud/minimax-m2.5" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "kimi-k2.6", + "best_score": 91.2 }, "devops-engineer": { "name": "devops-engineer", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 29.7, + "glm-5.1": 96.2, + "kimi-k2.6": 87.2, + "qwen3-coder:480b": 87.2 }, "info": [ "DevOps specialist for Docker, Kubernetes, CI/CD pipeline automation, and infrastructure management (GNS-2 Tier 1)", "core", "ollama-cloud/kimi-k2.6" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "glm-5.1", + "best_score": 96.2 }, "evaluator": { "name": "evaluator", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 50.6, + "glm-5.1": 58.7, + "kimi-k2.6": 53.5, + "qwen3-coder:480b": 43.8 }, "info": [ "Scores agent effectiveness after task completion for continuous improvement. 
Tier 2 meta-agent with self-cascade enabled.", "meta", "ollama-cloud/qwen3.5-122b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "glm-5.1", + "best_score": 58.7 + }, + "evolution-prompt": { + "name": "evolution-prompt", + "evaluations": { + "deepseek-v4-pro": 52.6, + "glm-5.1": 44.7, + "kimi-k2.6": 53.5, + "qwen3-coder:480b": 21.3 + }, + "info": [ + "Generates role-specific stress-test prompts by analyzing agent definitions", + "meta", + "ollama-cloud/deepseek-v4-pro-max" + ], + "best_model": "kimi-k2.6", + "best_score": 53.5 + }, + "evolution-skeptic": { + "name": "evolution-skeptic", + "evaluations": { + "deepseek-v4-pro": 33.1, + "glm-5.1": 31.6, + "kimi-k2.6": 37.3, + "qwen3-coder:480b": 42.9 + }, + "info": [ + "Evaluates model responses against role-specific rubrics with detailed scoring and commentary", + "meta", + "ollama-cloud/deepseek-v4-pro-max" + ], + "best_model": "qwen3-coder:480b", + "best_score": 42.9 }, "flutter-developer": { "name": "flutter-developer", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - "qwen3-coder:480b": 41.6 + "deepseek-v4-pro": 34.5, + "glm-5.1": 54.9, + "kimi-k2.6": 49.3, + "qwen3-coder:480b": 54.9 }, "info": [ "Flutter mobile specialist for cross-platform apps, state management, and UI components (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "glm-5.1", + "best_score": 54.9 }, "frontend-developer": { "name": "frontend-developer", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 31.6, + "glm-5.1": 53.2, + "kimi-k2.6": 38.8, + "qwen3-coder:480b": 56.0 }, "info": [ "Handles UI implementation with multimodal capabilities. 
Accepts visual references like screenshots and mockups (GNS-2 Tier 1)", "core", "ollama-cloud/minimax-m2.5" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "qwen3-coder:480b", + "best_score": 56.0 }, "go-developer": { "name": "go-developer", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 41.4, + "glm-5.1": 53.5, + "kimi-k2.6": 48.3, + "qwen3-coder:480b": 58.7 }, "info": [ "Go backend specialist for Gin, Echo, APIs, and database integration (GNS-2 Tier 1)", "core", "ollama-cloud/deepseek-v4-pro-max" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "qwen3-coder:480b", + "best_score": 58.7 }, "history-miner": { "name": "history-miner", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 30.1, + "glm-5.1": 44.3, + "kimi-k2.6": 46.9, + "qwen3-coder:480b": 44.8 }, "info": [ "Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work (GNS-2 Tier 0)", "core", "ollama-cloud/qwen3.5-122b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "kimi-k2.6", + "best_score": 46.9 }, "incident-responder": { "name": "incident-responder", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - "qwen3-coder:480b": 41.6 + "deepseek-v4-pro": 48.6, + "glm-5.1": 65.6, + "kimi-k2.6": 59.1, + "qwen3-coder:480b": 56.4 }, "info": [ "Server incident response and system hardening specialist. Handles live forensics, malware removal, persistence hunting, SSH-based server cleanup, and post-incident hardening. 
Works with any OS and panel.", "core", "ollama-cloud/kimi-k2.6" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "glm-5.1", + "best_score": 65.6 }, "lead-developer": { "name": "lead-developer", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - "qwen3-coder:480b": 41.6 + "deepseek-v4-pro": 28.7, + "glm-5.1": 68.8, + "kimi-k2.6": 72.5, + "qwen3-coder:480b": 72.5 }, "info": [ "Primary code writer for backend and core logic. Writes implementation to pass tests (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "kimi-k2.6", + "best_score": 72.5 }, "markdown-validator": { "name": "markdown-validator", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 39.0, + "glm-5.1": 37.2, + "kimi-k2.6": 24.0, + "qwen3-coder:480b": 47.4 }, "info": [ "Validates and corrects Markdown descriptions for Gitea issues (GNS-2 Tier 0)", "meta", "ollama-cloud/nemotron-3-nano" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "qwen3-coder:480b", + "best_score": 47.4 }, "memory-manager": { "name": "memory-manager", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - "qwen3-coder:480b": 41.6 + "deepseek-v4-pro": 35.8, + "glm-5.1": 48.3, + "kimi-k2.6": 41.5, + "qwen3-coder:480b": 46.8 }, "info": [ "Manages agent memory systems - short-term (context), long-term (vector store), and episodic (experiences) (GNS-2 Tier 0)", "cognitive", "ollama-cloud/deepseek-v4-pro-max" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "glm-5.1", + "best_score": 48.3 }, "orchestrator": { "name": "orchestrator", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - "qwen3-coder:480b": 41.6 + "deepseek-v4-flash": 27.0, + "deepseek-v4-pro": 19.6, + "glm-5.1": 36.2, + "kimi-k2.6": 40.0, + "minimax-m2.5": 36.3, + "qwen3-coder:480b": 
39.1 }, "info": [ "Main dispatcher. Routes tasks between agents based on Issue status and manages the workflow state machine. IF:90 for optimal routing accuracy. (GNS-2 Tier 1)", "meta", "ollama-cloud/kimi-k2.6" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "kimi-k2.6", + "best_score": 40.0 }, "performance-engineer": { "name": "performance-engineer", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 27.9, + "glm-5.1": 63.8, + "kimi-k2.6": 34.3, + "qwen3-coder:480b": 36.3 }, "info": [ "Reviews code for performance issues. Focuses on efficiency, N+1 queries, memory leaks, and algorithmic complexity (GNS-2 Tier 0)", "quality", "ollama-cloud/deepseek-v4-pro-max" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "glm-5.1", + "best_score": 63.8 }, "php-developer": { "name": "php-developer", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 53.5, + "glm-5.1": 48.3, + "kimi-k2.6": 48.3, + "qwen3-coder:480b": 48.3 }, "info": [ "PHP backend specialist for Laravel, Symfony, WordPress, and full-stack web applications (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "deepseek-v4-pro", + "best_score": 53.5 }, "pipeline-judge": { "name": "pipeline-judge", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 34.6, + "glm-5.1": 45.6, + "kimi-k2.6": 46.5, + "qwen3-coder:480b": 52.9 }, "info": [ "Automated pipeline judge. Evaluates workflow execution by running tests, measuring token cost and wall-clock time. Produces objective fitness scores. Never writes code - only measures and scores. 
(GNS-2 Tier 0)", "meta", "ollama-cloud/kimi-k2.6" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "qwen3-coder:480b", + "best_score": 52.9 }, "planner": { "name": "planner", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 41.7, + "glm-5.1": 31.8, + "kimi-k2.6": 34.6, + "qwen3-coder:480b": 33.7 }, "info": [ "Advanced task planner using Chain of Thought, Tree of Thoughts, and Plan-Execute-Reflect (GNS-2 Tier 0)", "cognitive", "ollama-cloud/deepseek-v4-pro-max" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "deepseek-v4-pro", + "best_score": 41.7 }, "product-owner": { "name": "product-owner", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 27.0, + "glm-5.1": 33.4, + "kimi-k2.6": 34.6, + "qwen3-coder:480b": 27.0 }, "info": [ "Manages issue checklists, status labels, tracks progress and coordinates with human users (GNS-2 Tier 1)", "meta", "ollama-cloud/glm-5.1" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "kimi-k2.6", + "best_score": 34.6 }, "prompt-optimizer": { "name": "prompt-optimizer", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - "qwen3-coder:480b": 41.6 + "deepseek-v4-pro": 27.0, + "glm-5.1": 48.3, + "kimi-k2.6": 33.0, + "qwen3-coder:480b": 31.8 }, "info": [ "Improves agent system prompts based on performance failures. 
Meta-learner for prompt optimization (GNS-2 Tier 1)", "meta", "ollama-cloud/qwen3.5-122b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "glm-5.1", + "best_score": 48.3 }, "python-developer": { "name": "python-developer", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 48.3, + "glm-5.1": 48.3, + "kimi-k2.6": 48.3, + "qwen3-coder:480b": 48.3 }, "info": [ "Python backend specialist for Django, FastAPI, data science, and API development (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "deepseek-v4-pro", + "best_score": 48.3 }, "reflector": { "name": "reflector", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 43.2, + "glm-5.1": 53.5, + "kimi-k2.6": 58.7, + "qwen3-coder:480b": 20.9 }, "info": [ "Self-reflection agent using Reflexion pattern - learns from mistakes (GNS-2 Tier 0)", "cognitive", "ollama-cloud/deepseek-v4-pro-max" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "kimi-k2.6", + "best_score": 58.7 }, "release-manager": { "name": "release-manager", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 23.7, + "glm-5.1": 38.0, + "kimi-k2.6": 50.2, + "qwen3-coder:480b": 41.7 }, "info": [ "Manages git operations, semantic versioning, branching, and deployments. 
Ensures clean history (GNS-2 Tier 1)", "meta", "ollama-cloud/kimi-k2.6" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "kimi-k2.6", + "best_score": 50.2 }, "requirement-refiner": { "name": "requirement-refiner", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 30.3, + "glm-5.1": 31.0, + "kimi-k2.6": 31.2, + "qwen3-coder:480b": 45.3 }, "info": [ "Converts vague ideas and bug reports into strict User Stories with acceptance criteria checklists (GNS-2 Tier 1)", "core", "ollama-cloud/kimi-k2-thinking" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "qwen3-coder:480b", + "best_score": 45.3 }, "sdet-engineer": { "name": "sdet-engineer", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 58.7, + "glm-5.1": 86.0, + "kimi-k2.6": 97.0, + "qwen3-coder:480b": 97.0 }, "info": [ "Writes tests following TDD methodology. 
Tests MUST fail initially (Red phase) (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "kimi-k2.6", + "best_score": 97.0 }, "security-auditor": { "name": "security-auditor", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 46.4, + "glm-5.1": 58.7, + "kimi-k2.6": 63.8, + "qwen3-coder:480b": 41.5 }, "info": [ "Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets (GNS-2 Tier 0)", "quality", "ollama-cloud/deepseek-v4-pro-max" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "kimi-k2.6", + "best_score": 63.8 }, "system-analyst": { "name": "system-analyst", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - "qwen3-coder:480b": 41.6 + "deepseek-v4-pro": 56.4, + "glm-5.1": 87.0, + "kimi-k2.6": 92.0, + "qwen3-coder:480b": 77.0 }, "info": [ "Designs technical specifications, data schemas, and API contracts before implementation (GNS-2 Tier 1)", "core", "ollama-cloud/deepseek-v4-pro-max" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "kimi-k2.6", + "best_score": 92.0 }, "the-fixer": { "name": "the-fixer", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 43.6, + "glm-5.1": 46.6, + "kimi-k2.6": 36.4, + "qwen3-coder:480b": 42.9 }, "info": [ "Iteratively fixes bugs based on specific error reports and test failures (GNS-2 Tier 1)", "quality", "ollama-cloud/kimi-k2.6" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "glm-5.1", + "best_score": 46.6 }, "visual-tester": { "name": "visual-tester", "evaluations": { - "deepseek-v4-pro-max": 50.0, - "kimi-k2.6": 50.0, - "qwen3-coder:480b": 50.0 + "deepseek-v4-pro": 47.3, + "glm-5.1": 58.7, + "kimi-k2.6": 53.5, + "qwen3-coder:480b": 53.5 }, "info": [ "Visual regression 
testing agent that compares screenshots and detects UI differences using pixelmatch and image diff (GNS-2 Tier 0)", "quality", "ollama-cloud/qwen3-coder:480b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 50.0 + "best_model": "glm-5.1", + "best_score": 58.7 }, "workflow-architect": { "name": "workflow-architect", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - "qwen3-coder:480b": 41.6 + "deepseek-v4-pro": 36.3, + "glm-5.1": 48.3, + "kimi-k2.6": 48.3, + "qwen3-coder:480b": 36.3 }, "info": [ "Creates and maintains workflow definitions with complete architecture, Gitea integration, and quality gates (GNS-2 Tier 1)", "meta", "ollama-cloud/qwen3.5-122b" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "glm-5.1", + "best_score": 48.3 }, "workflow-cross-checker": { "name": "workflow-cross-checker", "evaluations": { - "deepseek-v4-pro-max": 41.6, - "kimi-k2.6": 41.6, - "qwen3-coder:480b": 41.6 + "deepseek-v4-pro": 54.2, + "glm-5.1": 63.3, + "kimi-k2.6": 52.1, + "qwen3-coder:480b": 65.6 }, "info": [ "Workflow cross-checker and process inspector. Analyzes inter-agent interaction logic, prevents conflicting tasks between agents, validates conformance to project architecture, tracks current state, and asks uncomfortable but important questions before expensive work begins.", "meta", "ollama-cloud/kimi-k2.6" ], - "best_model": "deepseek-v4-pro-max", - "best_score": 41.6 + "best_model": "qwen3-coder:480b", + "best_score": 65.6 } }, "fit_scores": { "agent-architect": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for agent-architect is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 53.5, + "explanation": "Best model for agent-architect is kimi-k2.6 with avg score 53.5. Strongest dimension: code_presence." 
}, "architect-indexer": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for architect-indexer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "qwen3-coder:480b", + "fit": 54.0, + "explanation": "Best model for architect-indexer is qwen3-coder:480b with avg score 54.0. Strongest dimension: code_presence." }, "backend-developer": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for backend-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "deepseek-v4-pro", + "fit": 53.5, + "explanation": "Best model for backend-developer is deepseek-v4-pro with avg score 53.5. Strongest dimension: code_presence." }, "browser-automation": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for browser-automation is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 63.8, + "explanation": "Best model for browser-automation is kimi-k2.6 with avg score 63.8. Strongest dimension: code_presence." }, "capability-analyst": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for capability-analyst is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + "model": "deepseek-v4-pro", + "fit": 58.7, + "explanation": "Best model for capability-analyst is deepseek-v4-pro with avg score 58.7. Strongest dimension: code_presence." }, "code-skeptic": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for code-skeptic is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 91.2, + "explanation": "Best model for code-skeptic is kimi-k2.6 with avg score 91.2. Strongest dimension: code_presence." 
}, "devops-engineer": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for devops-engineer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "glm-5.1", + "fit": 96.2, + "explanation": "Best model for devops-engineer is glm-5.1 with avg score 96.2. Strongest dimension: keyword_coverage." }, "evaluator": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for evaluator is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "glm-5.1", + "fit": 58.7, + "explanation": "Best model for evaluator is glm-5.1 with avg score 58.7. Strongest dimension: code_presence." + }, + "evolution-prompt": { + "model": "kimi-k2.6", + "fit": 53.5, + "explanation": "Best model for evolution-prompt is kimi-k2.6 with avg score 53.5. Strongest dimension: code_presence." + }, + "evolution-skeptic": { + "model": "qwen3-coder:480b", + "fit": 42.9, + "explanation": "Best model for evolution-skeptic is qwen3-coder:480b with avg score 42.9. Strongest dimension: structure." }, "flutter-developer": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for flutter-developer is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + "model": "glm-5.1", + "fit": 54.9, + "explanation": "Best model for flutter-developer is glm-5.1 with avg score 54.9. Strongest dimension: code_presence." }, "frontend-developer": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for frontend-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "qwen3-coder:480b", + "fit": 56.0, + "explanation": "Best model for frontend-developer is qwen3-coder:480b with avg score 56.0. Strongest dimension: code_presence." 
}, "go-developer": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for go-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "qwen3-coder:480b", + "fit": 58.7, + "explanation": "Best model for go-developer is qwen3-coder:480b with avg score 58.7. Strongest dimension: code_presence." }, "history-miner": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for history-miner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 46.9, + "explanation": "Best model for history-miner is kimi-k2.6 with avg score 46.9. Strongest dimension: code_presence." }, "incident-responder": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for incident-responder is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + "model": "glm-5.1", + "fit": 65.6, + "explanation": "Best model for incident-responder is glm-5.1 with avg score 65.6. Strongest dimension: code_presence." }, "lead-developer": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for lead-developer is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: code_presence." + "model": "kimi-k2.6", + "fit": 72.5, + "explanation": "Best model for lead-developer is kimi-k2.6 with avg score 72.5. Strongest dimension: keyword_coverage." }, "markdown-validator": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for markdown-validator is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "qwen3-coder:480b", + "fit": 47.4, + "explanation": "Best model for markdown-validator is qwen3-coder:480b with avg score 47.4. Strongest dimension: code_presence." 
}, "memory-manager": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for memory-manager is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + "model": "glm-5.1", + "fit": 48.3, + "explanation": "Best model for memory-manager is glm-5.1 with avg score 48.3. Strongest dimension: code_presence." }, "orchestrator": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for orchestrator is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 40.0, + "explanation": "Best model for orchestrator is kimi-k2.6 with avg score 40.0. Strongest dimension: code_presence." }, "performance-engineer": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for performance-engineer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "glm-5.1", + "fit": 63.8, + "explanation": "Best model for performance-engineer is glm-5.1 with avg score 63.8. Strongest dimension: code_presence." }, "php-developer": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for php-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "deepseek-v4-pro", + "fit": 53.5, + "explanation": "Best model for php-developer is deepseek-v4-pro with avg score 53.5. Strongest dimension: code_presence." }, "pipeline-judge": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for pipeline-judge is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "qwen3-coder:480b", + "fit": 52.9, + "explanation": "Best model for pipeline-judge is qwen3-coder:480b with avg score 52.9. Strongest dimension: code_presence." }, "planner": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for planner is deepseek-v4-pro-max with avg score 50.0. 
Strongest dimension: keyword_coverage." + "model": "deepseek-v4-pro", + "fit": 41.7, + "explanation": "Best model for planner is deepseek-v4-pro with avg score 41.7. Strongest dimension: code_presence." }, "product-owner": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for product-owner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 34.6, + "explanation": "Best model for product-owner is kimi-k2.6 with avg score 34.6. Strongest dimension: actionability." }, "prompt-optimizer": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for prompt-optimizer is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + "model": "glm-5.1", + "fit": 48.3, + "explanation": "Best model for prompt-optimizer is glm-5.1 with avg score 48.3. Strongest dimension: code_presence." }, "python-developer": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for python-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "deepseek-v4-pro", + "fit": 48.3, + "explanation": "Best model for python-developer is deepseek-v4-pro with avg score 48.3. Strongest dimension: code_presence." }, "reflector": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for reflector is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 58.7, + "explanation": "Best model for reflector is kimi-k2.6 with avg score 58.7. Strongest dimension: code_presence." }, "release-manager": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for release-manager is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 50.2, + "explanation": "Best model for release-manager is kimi-k2.6 with avg score 50.2. 
Strongest dimension: code_presence." }, "requirement-refiner": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for requirement-refiner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "qwen3-coder:480b", + "fit": 45.3, + "explanation": "Best model for requirement-refiner is qwen3-coder:480b with avg score 45.3. Strongest dimension: code_presence." }, "sdet-engineer": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for sdet-engineer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 97.0, + "explanation": "Best model for sdet-engineer is kimi-k2.6 with avg score 97.0. Strongest dimension: keyword_coverage." }, "security-auditor": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for security-auditor is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 63.8, + "explanation": "Best model for security-auditor is kimi-k2.6 with avg score 63.8. Strongest dimension: code_presence." }, "system-analyst": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for system-analyst is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + "model": "kimi-k2.6", + "fit": 92.0, + "explanation": "Best model for system-analyst is kimi-k2.6 with avg score 92.0. Strongest dimension: keyword_coverage." }, "the-fixer": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for the-fixer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "glm-5.1", + "fit": 46.6, + "explanation": "Best model for the-fixer is glm-5.1 with avg score 46.6. Strongest dimension: code_presence." 
}, "visual-tester": { - "model": "deepseek-v4-pro-max", - "fit": 50.0, - "explanation": "Best model for visual-tester is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + "model": "glm-5.1", + "fit": 58.7, + "explanation": "Best model for visual-tester is glm-5.1 with avg score 58.7. Strongest dimension: code_presence." }, "workflow-architect": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for workflow-architect is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + "model": "glm-5.1", + "fit": 48.3, + "explanation": "Best model for workflow-architect is glm-5.1 with avg score 48.3. Strongest dimension: code_presence." }, "workflow-cross-checker": { - "model": "deepseek-v4-pro-max", - "fit": 41.6, - "explanation": "Best model for workflow-cross-checker is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + "model": "qwen3-coder:480b", + "fit": 65.6, + "explanation": "Best model for workflow-cross-checker is qwen3-coder:480b with avg score 65.6. Strongest dimension: code_presence." } } } \ No newline at end of file diff --git a/agent-evolution/docker-compose.yml b/agent-evolution/docker-compose.yml index 9a0ed74..aaac6c2 100644 --- a/agent-evolution/docker-compose.yml +++ b/agent-evolution/docker-compose.yml @@ -1,28 +1,27 @@ -# Docker Compose for Agent Evolution Dashboard (mount-driven, no-rebuild) +# Docker Compose for Agent Evolution Dashboard + Research API (mount-driven, no-rebuild) # Usage: # docker compose -f agent-evolution/docker-compose.yml up -d -# # Edit any file in agent-evolution/ or .kilo/ on host → instant reflection -# # Just run: -# bun run sync:evolution -# # and reload the page +# # Edit any file on host → instant reflection in containers +# # Dashboard: http://localhost:3003 +# # API: http://localhost:3004 # -version: '3.8' - services: evolution-dashboard: - build: - context: . 
- dockerfile: Dockerfile + image: python:3.12-alpine container_name: apaw-evolution ports: - "3003:80" volumes: # Mount the generated standalone HTML to the container's web root - ./index.standalone.html:/app/index.html:ro + # Mount real-fit standalone report + - ./real-fit.html:/app/real-fit.html:ro # Mount data directory for any additional assets - ./data:/app/data:ro # Mount .kilo directory for live config access - ../.kilo:/app/kilo:ro + working_dir: /app + command: ["python3", "-m", "http.server", "80"] environment: - NODE_ENV=production - TZ=UTC @@ -39,6 +38,47 @@ services: - "com.apaw.service=evolution-dashboard" - "com.apaw.description=Agent Evolution Dashboard" + evolution-api: + image: python:3.12-alpine + container_name: apaw-evolution-api + ports: + - "3004:8000" + volumes: + # API source code + - ./api.py:/app/api.py:ro + - ./requirements.txt:/app/requirements.txt:ro + # Data directory (read-write for job state and reports) + - ./data:/app/data:rw + # real-fit-engine.py script + - ../scripts/real-fit-engine.py:/app/scripts/real-fit-engine.py:ro + # Agent definitions and metadata + - ../.kilo/agents:/app/agents:ro + - ../kilo-meta.json:/app/kilo-meta.json:ro + working_dir: /app + command: > + sh -c "pip install --no-cache-dir -r requirements.txt && uvicorn api:app --host 0.0.0.0 --port 8000" + environment: + - TZ=UTC + - PYTHONUNBUFFERED=1 + - JOB_STATE_PATH=/app/data/research-jobs.json + - REPORT_PATH=/app/data/real-fit-report.json + - META_PATH=/app/kilo-meta.json + - EVOLUTION_PATH=/app/data/evolution.json + - ENGINE_PATH=/app/scripts/real-fit-engine.py + - REAL_FIT_DB=/app/data/real-fit.db + restart: unless-stopped + healthcheck: + test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8000/api/models"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 15s + networks: + - evolution-network + labels: + - "com.apaw.service=evolution-api" + - "com.apaw.description=Agent Evolution Research API" + # Optional: Nginx 
reverse proxy with SSL evolution-nginx: image: nginx:alpine @@ -49,13 +89,14 @@ services: - "80:80" - "443:443" volumes: - - ./agent-evolution/nginx.conf:/etc/nginx/nginx.conf:ro - - ./agent-evolution/ssl:/etc/nginx/ssl:ro + - ./nginx.conf:/etc/nginx/nginx.conf:ro + - ./ssl:/etc/nginx/ssl:ro depends_on: - evolution-dashboard + - evolution-api networks: - evolution-network networks: evolution-network: - driver: bridge \ No newline at end of file + driver: bridge diff --git a/agent-evolution/index.standalone.html b/agent-evolution/index.standalone.html index 8de3e9b..04cb752 100644 --- a/agent-evolution/index.standalone.html +++ b/agent-evolution/index.standalone.html @@ -5083,7 +5083,7 @@ async function init() { try { // Load real dashboard data FIRST (overrides stale agent-versions) try { - const dashRes = await fetch('data/dashboard-data.json'); + const dashRes = await fetch('data/dashboard-data.json', { cache: 'no-cache' }); if (dashRes.ok) { window.dashboardData = await dashRes.json(); // Sync agentData from dashboard data for all other tabs @@ -5439,64 +5439,63 @@ function renderRecCard(r, index) { `; } -// Render Heatmap — REAL DATA: Agent × Current Model × Real Fit Score +// Render Heatmap — REAL DATA: Agent × Model × Live Ollama Evaluations function renderHeatmap() { const esc = str => (str || '').replace(/[&<>"']/g, m => ({'&':'&','<':'<','>':'>','"':'"',"'":'''}[m])); const dd = window.dashboardData; if (!dd || !dd.agents) { - document.getElementById('hmTable').innerHTML = '⚠️ Нет данных. Запустите анализ.'; + document.getElementById('hmTable').innerHTML = '⚠️ No data. 
Run analysis.'; return; } - const agents = dd.agents; - // Get unique models sorted by count of agents - const modelCounts = {}; - agents.forEach(a => { modelCounts[a.model_short] = (modelCounts[a.model_short] || 0) + 1; }); - const modelList = Object.entries(modelCounts) - .sort((a, b) => b[1] - a[1]) - .map(([short]) => { - const m = dd.models[short] || {}; - return { - short, - full: 'ollama-cloud/' + short, - name: m.name || short, - avg_fit: m.avg_fit || 0, - agents: m.agents || 0 - }; - }); + // Collect all models from current assignments + realfit evaluations + const modelsSeen = new Set(); + dd.agents.forEach(a => { if (a.model_short) modelsSeen.add(a.model_short); }); + dd.agents.forEach(a => { + if (a.real_evaluations) Object.keys(a.real_evaluations).forEach(m => { if (m && m !== 'code-skeptic') modelsSeen.add(m); }); + }); + // Ensure real-fit evaluated models are included even if not current + const modelList = Array.from(modelsSeen).sort(); - // Render table: rows=agents, cols=models const t = document.getElementById('hmTable'); let h = 'Agent'; modelList.forEach(m => { - const color = m.avg_fit >= 85 ? '#00ff94' : m.avg_fit >= 70 ? '#facc15' : '#ff6b81'; - h += ` - ${esc(m.name)}
- avg:${m.avg_fit}
- ${m.agents} - `; + // Compute avg from dd.agents real_evaluations + let sum = 0, cnt = 0; + dd.agents.forEach(a => { const v = (a.real_evaluations || {})[m]; if (v > 0) { sum += v; cnt++; } }); + const avg = cnt > 0 ? Math.round(sum / cnt) : 0; + const color = avg >= 85 ? '#00ff94' : avg >= 70 ? '#facc15' : '#ff6b81'; + h += `${esc(m)}
avg:${avg}`; }); - h += ''; + h += 'BestScore'; - agents.forEach(a => { + dd.agents.forEach(a => { h += `${esc(a.name)}`; - modelList.forEach((m, j) => { - const isCurrent = a.model_short === m.short; - const score = isCurrent ? a.fit_score : 0; // Only show score for CURRENT model - const cur = isCurrent; - let marks = ''; - if (cur) marks += ''; - const bg = cur ? hmColor(score) : 'transparent'; - const txt = cur ? hmText(score) : 'var(--text-muted)'; - h += `${isCurrent ? a.fit_score : '·'}${marks}`; + modelList.forEach(m => { + const isCurrent = a.model_short === m; + let score = 0; + // Prefer real-fit score, fallback to current fit_score + if (a.real_evaluations && a.real_evaluations[m] > 0) score = Math.round(a.real_evaluations[m]); + else if (isCurrent) score = Math.round(a.fit_score || 0); + + let cls = 'na'; + if (score >= 90) cls = 'high'; + else if (score >= 75) cls = 'good'; + else if (score >= 50) cls = 'med'; + else if (score > 0) cls = 'low'; + + const curMark = isCurrent ? ' ●' : ''; + const curStyle = isCurrent ? 'box-shadow:inset 0 0 0 2px var(--accent-cyan);' : ''; + const bg = score > 0 ? hmColor(score) : 'transparent'; + const txt = score >= 75 ? '#0e1219' : 'var(--text-primary)'; + const display = score > 0 ? score : (isCurrent ? Math.round(a.fit_score || 0) : '·'); + + h += `${display}${curMark}`; }); - h += ''; + const bestModel = a.real_best_model || a.model_short; + const bestScore = a.real_best_score ? Math.round(a.real_best_score) : Math.round(a.fit_score || 0); + h += `${esc(bestModel)}${bestScore}`; }); t.innerHTML = h + ''; } @@ -5511,29 +5510,6 @@ function hmColor(v) { return 'rgba(90,104,128,.2)'; } -function hmText(v) { - return v >= 75 ? '#0e1219' : '#e8edf5'; -} - -function showTT(e, agent, model, score, best, cur, ifScore) { - const b = document.getElementById('ttBox'), o = document.getElementById('ttOverlay'); - const ifColor = ifScore >= 85 ? '#00ff94' : ifScore >= 75 ? '#facc15' : '#ff6b81'; - const ifLabel = ifScore >= 85 ? 
'Excellent' : ifScore >= 75 ? 'Average' : 'Weak'; - b.innerHTML = `

${model}

Agent: ${agent}
Score: ${score}/100
- Instruction Following: ${ifScore}/100 (${ifLabel})
- Score = benchmark × IF multiplier
- ${ifScore < 75 ? '⚠ Model poorly follows prompts — score reduced
' : ''} - ${best ? '★ Best fit
' : ''}${cur ? '📌 Current' : ''}

`; - const r = e.target.getBoundingClientRect(); - b.style.left = Math.min(r.left, window.innerWidth - 320) + 'px'; - b.style.top = (r.bottom + 6) + 'px'; - o.classList.add('show'); -} - -function hideTT() { - document.getElementById('ttOverlay').classList.remove('show'); -} - // Current modal state let hmCurrentAgent = null; let hmCurrentModel = null; diff --git a/agent-evolution/real-fit.html b/agent-evolution/real-fit.html new file mode 100644 index 0000000..d993875 --- /dev/null +++ b/agent-evolution/real-fit.html @@ -0,0 +1,460 @@ + + + + + +Real-Fit Matrix — Agent × Model Performance + + + +

Real-Fit Matrix

+
Real agent × model evaluation scores via live Ollama API (28 calls, 4 models, 7 agents)
+
+
+ 90+ Excellent + 75–89 Good + 50–74 Average + <50 Weak + ● = assigned model +
+
Data source: real-fit-report.json | Updated:
+ + + + + + + + + + diff --git a/agent-evolution/requirements.txt b/agent-evolution/requirements.txt new file mode 100644 index 0000000..7854f06 --- /dev/null +++ b/agent-evolution/requirements.txt @@ -0,0 +1,4 @@ +fastapi==0.136.3 +uvicorn==0.48.0 +python-multipart==0.0.29 +pydantic==2.13.4 \ No newline at end of file diff --git a/agent-evolution/scripts/audit-system.cjs b/agent-evolution/scripts/audit-system.cjs new file mode 100644 index 0000000..a0ea72b --- /dev/null +++ b/agent-evolution/scripts/audit-system.cjs @@ -0,0 +1,138 @@ +const fs = require('fs'); + +function parseFrontmatter(content) { + if (!content.startsWith('---')) return null; + const end = content.indexOf('---', 3); + if (end === -1) return null; + const fm = content.slice(3, end).trim(); + const data = {}; + for (const line of fm.split('\n')) { + const m = line.match(/^(\w+):\s*(.+)$/); + if (m) data[m[1]] = m[2].trim(); + } + return data; +} + +function stripComments(str) { + // Remove single-line comments, but not inside strings + return str.replace(/\/\/.*$/gm, ''); +} + +const agents = []; +const commands = []; +const issues = []; + +// 1. Parse agent .md files +for (const f of fs.readdirSync('.kilo/agents').filter(f => f.endsWith('.md'))) { + const content = fs.readFileSync('.kilo/agents/' + f, 'utf8'); + const fm = parseFrontmatter(content); + if (fm && fm.model) { + agents.push({ + name: f.replace('.md', ''), + model: fm.model, + mode: fm.mode || 'subagent', + source: '.kilo/agents/' + f, + description: fm.description || '' + }); + } +} + +// 2. Parse command .md files +for (const f of fs.readdirSync('.kilo/commands').filter(f => f.endsWith('.md'))) { + const content = fs.readFileSync('.kilo/commands/' + f, 'utf8'); + const fm = parseFrontmatter(content); + if (fm && fm.model) { + commands.push({ + name: f.replace('.md', ''), + model: fm.model, + mode: fm.mode || 'command', + source: '.kilo/commands/' + f, + description: fm.description || '' + }); + } +} + +// 3. 
Parse kilo-meta.json +const meta = JSON.parse(fs.readFileSync('kilo-meta.json', 'utf8')); +for (const a of agents) { + const m = meta.agents?.[a.name]; + if (m) { + a.metaModel = m.model; + if (a.model !== m.model) issues.push(`AGENT ${a.name}: .md=${a.model} vs meta=${m.model}`); + } +} +for (const c of commands) { + const m = meta.commands?.[c.name]; + if (m) { + c.metaModel = m.model; + if (c.model !== m.model) issues.push(`CMD ${c.name}: .md=${c.model} vs meta=${m.model}`); + } +} + +// 4. Parse .kilo/kilo.jsonc +const dotKiloRaw = stripComments(fs.readFileSync('.kilo/kilo.jsonc', 'utf8')); +const dotKilo = JSON.parse(dotKiloRaw); +for (const [name, cfg] of Object.entries(dotKilo.agent || {})) { + if (!cfg.model) continue; + const agent = agents.find(a => a.name === name); + if (agent) { + agent.kiloModel = cfg.model; + if (agent.model !== cfg.model) issues.push(`AGENT ${name}: .md=${agent.model} vs .kilo/kilo.jsonc=${cfg.model}`); + } +} + +// 5. Parse root kilo.jsonc +const rootKiloRaw = stripComments(fs.readFileSync('kilo.jsonc', 'utf8')); +const rootKilo = JSON.parse(rootKiloRaw); +for (const [name, cfg] of Object.entries(rootKilo.agent || {})) { + if (!cfg.model) continue; + const cmd = commands.find(c => c.name === name); + if (cmd) { + cmd.rootModel = cfg.model; + if (cmd.model !== cfg.model) issues.push(`CMD ${name}: .md=${cmd.model} vs kilo.jsonc=${cfg.model}`); + } +} + +// 6. Check non-ollama +const nonOllama = []; +for (const a of agents) if (!a.model.startsWith('ollama-cloud/')) nonOllama.push({type:'agent', name:a.name, model:a.model}); +for (const c of commands) if (!c.model.startsWith('ollama-cloud/')) nonOllama.push({type:'command', name:c.name, model:c.model}); + +// 7. 
Summary by model +const modelStats = {}; +for (const a of agents) modelStats[a.model] = (modelStats[a.model] || 0) + 1; +for (const c of commands) modelStats[c.model] = (modelStats[c.model] || 0) + 1; + +const state = { + generated: new Date().toISOString(), + totalAgents: agents.length, + totalCommands: commands.length, + allOllama: nonOllama.length === 0, + modelDistribution: modelStats, + agents: agents.sort((a,b) => a.name.localeCompare(b.name)), + commands: commands.sort((a,b) => a.name.localeCompare(b.name)), + issues: issues, + nonOllama: nonOllama +}; + +fs.writeFileSync('agent-evolution/data/real-state.json', JSON.stringify(state, null, 2) + '\n'); + +// Console report +console.log('=== REAL SYSTEM STATE ==='); +console.log('Generated:', state.generated); +console.log('Agents:', state.totalAgents); +console.log('Commands:', state.totalCommands); +console.log('All ollama-cloud/:', state.allOllama ? 'YES' : 'NO (' + nonOllama.length + ' exceptions)'); +console.log('\n=== MODEL DISTRIBUTION ==='); +for (const [m, c] of Object.entries(modelStats).sort((a,b) => b[1]-a[1])) { + console.log(` ${m}: ${c}`); +} +if (issues.length > 0) { + console.log('\n=== ISSUES ==='); + issues.forEach(i => console.log(' ⚠️', i)); +} +if (nonOllama.length > 0) { + console.log('\n=== NON-OLLOMA ==='); + nonOllama.forEach(n => console.log(' ❌', n.type, n.name, n.model)); +} +console.log('\n✅ State written to agent-evolution/data/real-state.json'); diff --git a/agent-evolution/scripts/merge-real-fit.cjs b/agent-evolution/scripts/merge-real-fit.cjs new file mode 100644 index 0000000..6477896 --- /dev/null +++ b/agent-evolution/scripts/merge-real-fit.cjs @@ -0,0 +1,29 @@ +const fs = require('fs'); +const path = require('path'); + +const DASH = path.join(__dirname, '../data/dashboard-data.json'); +const REAL = path.join(__dirname, '../data/real-fit-report.json'); +const OUT = path.join(__dirname, '../data/dashboard-data.json'); + +const dash = JSON.parse(fs.readFileSync(DASH, 
'utf-8')); +const real = JSON.parse(fs.readFileSync(REAL, 'utf-8')); + +// Inject real_evaluations into each agent +dash.agents.forEach(a => { + const r = real.agents?.[a.name]; + if (r && r.evaluations) { + a.real_evaluations = r.evaluations; + a.real_best_model = r.best_model; + a.real_best_score = r.best_score; + } else { + a.real_evaluations = {}; + } +}); + +// Add metadata +dash.real_fit_generated = real.generated; +dash.real_fit_source = real.source; + +fs.writeFileSync(OUT, JSON.stringify(dash, null, 2)); +console.log('Merged real-fit data into ' + OUT); +console.log('Agents with real evals:', dash.agents.filter(a => Object.keys(a.real_evaluations||{}).length > 0).length); diff --git a/agent-evolution/scripts/patch-heatmap.js b/agent-evolution/scripts/patch-heatmap.js new file mode 100644 index 0000000..211767f --- /dev/null +++ b/agent-evolution/scripts/patch-heatmap.js @@ -0,0 +1,98 @@ +const fs = require('fs'); +const path = require('path'); + +const INDEX = path.join(__dirname, '../index.standalone.html'); + +// 1. New renderHeatmap that reads real-fit data +const newRenderHeatmap = `function renderHeatmap() { + const esc = str => (str || '').replace(/[&<>"']/g, m => ({'&':'&','<':'<','>':'>','"':'"',"'":'''}[m])); + const dd = window.dashboardData; + + // Merge real-fit if loaded + const rf = window.realFitData || {}; + const realAgents = rf.agents || {}; + + if (!dd || !dd.agents) { + document.getElementById('hmTable').innerHTML = '⚠️ No data. 
Run analysis.'; + return; + } + + // Build model list from real-fit (cross-model) + current dashboard data + const modelsSeen = new Set(); + dd.agents.forEach(a => { modelsSeen.add(a.model_short); }); + Object.values(realAgents).forEach(a => { Object.keys(a.evaluations || {}).forEach(m => modelsSeen.add(m)); }); + const modelList = Array.from(modelsSeen).filter(m => m && m !== 'code-skeptic'); + + const t = document.getElementById('hmTable'); + let h = 'Agent'; + modelList.forEach(m => { + h += '' + esc(m) + ''; + }); + h += 'BestScore'; + + dd.agents.forEach(a => { + const realAgent = realAgents[a.name]; + h += '' + esc(a.name) + ''; + modelList.forEach(m => { + let score = 0; + if (realAgent && realAgent.evaluations && realAgent.evaluations[m] > 0) { + score = Math.round(realAgent.evaluations[m]); + } + const isCurrent = a.model_short === m; + let cls = 'na'; + if (score >= 90) cls = 'high'; + else if (score >= 75) cls = 'good'; + else if (score >= 50) cls = 'med'; + else if (score > 0) cls = 'low'; + const display = score > 0 ? score : (isCurrent ? Math.round(a.fit_score || 0) : '·'); + const curStyle = isCurrent ? 'box-shadow:inset 0 0 0 2px var(--accent-cyan)' : ''; + h += '' + display + ''; + }); + const bestModel = realAgent ? (realAgent.best_model || a.model_short) : a.model_short; + const bestScore = realAgent ? Math.round(realAgent.best_score || 0) : Math.round(a.fit_score || 0); + h += '' + esc(bestModel) + '' + bestScore + ''; + }); + t.innerHTML = h + ''; +}`; + +// 2. 
Add loadRealFitData script after dashboard load +const loadRealFitData = ` + // Load real-fit report for cross-model evaluation + try { + const rfRes = await fetch('data/real-fit-report.json'); + if (rfRes.ok) window.realFitData = await rfRes.json(); + } catch(e) { console.warn('real-fit-report.json not loaded:', e.message); } +`; + +let html = fs.readFileSync(INDEX, 'utf-8'); + +// Patch A: replace renderHeatmap function +const oldPattern = /\/\/ Render Heatmap[\s\S]*?function renderHeatmap\(\)\s*\{[^}]*\{[^}]*\}[^}]*\}/; +const oldMatch = html.match(oldPattern); +if (oldMatch) { + html = html.substring(0, oldMatch.index) + '// Render Heatmap (real-fit enabled)\n' + newRenderHeatmap + html.substring(oldMatch.index + oldMatch[0].length); + console.log('Patched renderHeatmap'); +} else { + console.log('Pattern A not found, trying fallback...'); + // Fallback: find and replace the specific renderHeatmap block + const start = html.indexOf('function renderHeatmap() {'); + if (start !== -1) { + let brace = 0, end = start; + for (let i = start; i < html.length; i++) { + if (html[i] === '{') brace++; + else if (html[i] === '}') { brace--; if (brace === 0) { end = i + 1; break; } } + } + html = html.substring(0, start) + newRenderHeatmap + '\n' + html.substring(end); + console.log('Patched renderHeatmap (fallback)'); + } +} + +// Patch B: insert real-fit loading after dashboard load +const dashLoadPattern = /window\.dashboardData = await dashRes\.json\(\);/; +if (dashLoadPattern.test(html)) { + html = html.replace(dashLoadPattern, 'window.dashboardData = await dashRes.json();\n' + loadRealFitData.trim()); + console.log('Patched init() to load real-fit data'); +} + +fs.writeFileSync(INDEX, html); +console.log('Done — ' + (fs.statSync(INDEX).size / 1024).toFixed(1) + ' KB'); diff --git a/agent-evolution/scripts/rebuild-report.py b/agent-evolution/scripts/rebuild-report.py new file mode 100644 index 0000000..4bb4df8 --- /dev/null +++ 
#!/usr/bin/env python3
"""
Rebuild real-fit-report.json from SQLite DB.

Usage:
    python3 rebuild-report.py
    python3 rebuild-report.py --db /path/to/real-fit.db --report /path/to/real-fit-report.json
"""

import argparse
import json
import sqlite3
import time
from contextlib import closing
from datetime import datetime, timezone
from pathlib import Path

# Evaluations eligible for the report: positive score, not produced by the
# rubric_v1 evaluator, and whose stored response is not an HTTP error marker
# or an empty string. Rows are ordered so that, for every (agent, model)
# pair, the row from the most preferred evaluator (and, within it, the
# highest score) comes first.
_SELECTED_EVALUATIONS_SQL = """
    SELECT agent_name, model, total_score, evaluator, response
    FROM evaluations
    WHERE total_score > 0
      AND evaluator NOT LIKE '%rubric_v1%'
      AND (response IS NULL
           OR (response NOT LIKE '%[HTTP %' AND response != ''))
    ORDER BY agent_name, model,
      CASE evaluator
        WHEN 'evolution-skeptic' THEN 0
        WHEN 'rubric_v2' THEN 1
        ELSE 2
      END,
      total_score DESC
"""


def _sync_agents_from_meta(db_path: Path) -> None:
    """Import any missing agents from kilo-meta.json into the DB agents table.

    The meta file is looked up three directory levels above the DB file
    (``db_path.parent.parent.parent / "kilo-meta.json"``). When it does not
    exist the function returns without touching the DB.
    """
    meta_path = db_path.parent.parent.parent / "kilo-meta.json"
    if not meta_path.exists():
        return
    with open(meta_path, encoding="utf-8") as f:
        meta = json.load(f)

    # closing() guarantees the connection is released even when a query raises
    # (sqlite3's own context manager only manages the transaction).
    with closing(sqlite3.connect(str(db_path))) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM agents")
        existing = {r[0] for r in cursor.fetchall()}

        new_rows = [
            (
                name,
                info.get("description", ""),
                info.get("category", "meta"),
                info.get("model", ""),
                info.get("color", "#6B7280"),
                datetime.now(timezone.utc).isoformat(),
            )
            for name, info in meta.get("agents", {}).items()
            if name not in existing
        ]
        cursor.executemany(
            "INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
            new_rows,
        )
        conn.commit()


def _select_best_evaluations(cursor: sqlite3.Cursor) -> dict:
    """Return ``{agent: {model: score}}`` using the preferred evaluator per pair."""
    cursor.execute(_SELECTED_EVALUATIONS_SQL)
    best_evals: dict = {}
    for row in cursor.fetchall():
        # Rows arrive best-first per (agent, model); setdefault keeps the first.
        best_evals.setdefault(row["agent_name"], {}).setdefault(
            row["model"], row["total_score"]
        )
    return best_evals


def build_report(db_path: Path) -> dict:
    """Build the real-fit report dictionary from the SQLite DB at *db_path*.

    Agents missing from the DB are first imported from kilo-meta.json. The
    returned dict has the keys ``generated``, ``source``,
    ``total_evaluations``, ``agents`` and ``fit_scores``.

    ``fit_scores`` holds exactly one entry per agent that has at least one
    selected evaluation, and that entry describes the agent's best model.
    It is derived from the same selection as ``agents[...]["evaluations"]``
    so both sections always agree (previously a separate per-model query
    overwrote the entry once per model, leaving an arbitrary model in place).

    Raises:
        sqlite3.Error: if the DB cannot be read or lacks the expected tables.
    """
    _sync_agents_from_meta(db_path)

    with closing(sqlite3.connect(str(db_path))) as conn:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        cursor.execute("""
            SELECT name, description, category, current_model
            FROM agents
        """)
        agents_meta = {row["name"]: dict(row) for row in cursor.fetchall()}

        best_evals = _select_best_evaluations(cursor)

    fit_scores = {}
    for agent_name, evals in best_evals.items():
        top_model = max(evals, key=evals.get)
        top_score = evals[top_model]
        fit_scores[agent_name] = {
            "model": top_model,
            "fit": top_score,
            # NOTE: wording kept unchanged for downstream consumers; the
            # number is the selected evaluation's score, not an average.
            "explanation": (
                f"Best model for {agent_name} is {top_model} "
                f"with avg score {top_score:.1f}. "
                "Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
            ),
        }

    agents_report = {}
    for agent_name, meta in agents_meta.items():
        evals = best_evals.get(agent_name, {})
        if evals:
            best_model = max(evals, key=evals.get)
            best_score = evals[best_model]
        else:
            best_model = ""
            best_score = 0.0
        agents_report[agent_name] = {
            "name": agent_name,
            "evaluations": evals,
            "info": [
                meta.get("description") or "",
                meta.get("category") or "",
                meta.get("current_model") or "",
            ],
            "best_model": best_model,
            "best_score": best_score,
        }

    total_evals = sum(len(evals) for evals in best_evals.values())
    generated = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    return {
        "generated": generated,
        "source": "real-fit-engine-db-filtered",
        "total_evaluations": total_evals,
        "agents": agents_report,
        "fit_scores": fit_scores,
    }


def main():
    """Parse CLI arguments, rebuild the report and write it as JSON."""
    parser = argparse.ArgumentParser(description="Rebuild real-fit-report.json from DB")
    parser.add_argument(
        "--db",
        type=Path,
        default=Path(__file__).parent.parent / "data" / "real-fit.db",
        help="Path to SQLite DB",
    )
    parser.add_argument(
        "--report",
        type=Path,
        default=Path(__file__).parent.parent / "data" / "real-fit-report.json",
        help="Path to report JSON output",
    )
    args = parser.parse_args()

    # sqlite3.connect() would silently create an empty DB file here and the
    # first query would then fail with "no such table"; report it up front.
    if not args.db.exists():
        parser.error(f"DB not found: {args.db}")

    report = build_report(args.db)
    args.report.parent.mkdir(parents=True, exist_ok=True)
    with open(args.report, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)

    print(f"Report rebuilt: {args.report}")
    print(f"Agents: {len(report['agents'])}, Evaluations: {report['total_evaluations']}")


if __name__ == "__main__":
    main()