Files
APAW/agent-evolution/api.py
Deploy Bot b95fd41587 feat(evolution): add real-fit dashboard, API, report builder, and docker compose
- real-fit.html: API-driven research dashboard with agent/model heatmap, detail modal with score breakdown and evaluator commentary
- api.py: FastAPI backend serving /api/real-fit-report (dynamic from SQLite), /api/research, /api/evolve-agent/start
- rebuild-report.py: generates real-fit-report.json from SQLite DB for static fallback
- docker-compose.yml: add evolution-api service (Python 3.12, uvicorn) for research endpoints
- index.standalone.html: sync with dashboard data updates
- archive/index.html: standalone dashboard snapshot (263KB)
- .gitignore: exclude *.db, research-jobs.json from tracking
2026-05-28 11:55:49 +01:00

491 lines
16 KiB
Python

"""
Evolution Research API — FastAPI backend for agent-model evaluation jobs.
Endpoints:
POST /api/research → start background evaluation job
GET /api/research/{id} → job status & results
POST /api/research/cell → evaluate single agent-model pair
GET /api/real-fit-report → serve real-fit-report.json (live from DB)
GET /api/models → list available models
GET /api/evaluation/{agent}/{model} → detailed evaluation record
POST /api/evolve-agent/start → start role-fit testing job (evolution-prompt + evolution-skeptic)
"""
import json
import os
import sqlite3
import subprocess
import time
import uuid
from datetime import datetime, timezone
from pathlib import Path
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
# ASGI application object; every route decorator in this module registers on it.
app = FastAPI(title="Evolution Research API", version="1.1.0")

# CORS is fully open: any origin, method and header is accepted.
# NOTE(review): wildcard origins are combined with allow_credentials=True here.
# The FastAPI CORS documentation says origins must be listed explicitly when
# credentials are allowed -- confirm whether the dashboard really needs
# credentialed requests before tightening this.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Filesystem locations used by the API. Each one can be overridden through the
# environment variable of the same name (REAL_FIT_DB for the database).
JOB_STATE_PATH = Path(os.getenv("JOB_STATE_PATH", "/app/data/research-jobs.json"))
REPORT_PATH = Path(os.getenv("REPORT_PATH", "/app/data/real-fit-report.json"))
META_PATH = Path(os.getenv("META_PATH", "/app/kilo-meta.json"))
EVOLUTION_PATH = Path(os.getenv("EVOLUTION_PATH", "/app/data/evolution.json"))
ENGINE_PATH = Path(os.getenv("ENGINE_PATH", "/app/scripts/real-fit-engine.py"))
# Unless REAL_FIT_DB is set, the database sits next to the report file.
DB_PATH = Path(os.getenv("REAL_FIT_DB", REPORT_PATH.parent / "real-fit.db"))
def _load_json(path: Path) -> dict:
    """Return the parsed JSON content of *path*, or ``{}`` if the file is absent.

    Malformed JSON is not handled here: ``json.load`` errors propagate to the
    caller.
    """
    if not path.exists():
        return {}
    with path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
def _save_json(path: Path, data: dict) -> None:
    """Serialize *data* as indented JSON to *path*, replacing it atomically.

    The parent directory is created when missing. The payload is first written
    to a uniquely named temporary file in the same directory and then moved
    over *path* with ``os.replace``. A reader therefore sees either the old or
    the new file, never a half-written one, and a failed serialization leaves
    the previous content untouched.

    Raises:
        TypeError / ValueError: if *data* is not JSON-serializable.
        OSError: if the directory or file cannot be written.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so os.replace stays on one filesystem.
    tmp_path = path.with_name(f".{path.name}.{uuid.uuid4().hex}.tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave stray temp files behind when the write fails.
        tmp_path.unlink(missing_ok=True)
        raise
def _load_jobs() -> dict:
    """Read the job table stored at ``JOB_STATE_PATH`` (empty dict if absent)."""
    jobs = _load_json(JOB_STATE_PATH)
    return jobs
def _save_jobs(jobs: dict) -> None:
    """Persist the whole job table *jobs* to ``JOB_STATE_PATH``."""
    _save_json(path=JOB_STATE_PATH, data=jobs)
class ResearchRequest(BaseModel):
    """Request body naming one agent and the list of models to evaluate it on."""

    # Name of the agent under evaluation.
    agent: str
    # Model identifiers to run the agent against.
    models: list[str]
class CellRequest(BaseModel):
    """Request body naming a single agent/model pair to evaluate."""

    # Name of the agent under evaluation.
    agent: str
    # The one model identifier to run the agent against.
    model: str
class EvolveAgentRequest(BaseModel):
    """Request body for a role-fit job: one agent and the models to test it on."""

    # Name of the agent whose role fit is tested.
    agent: str
    # Model identifiers included in the role-fit run.
    models: list[str]
# Worker program run through ``python3 -c``. It takes the job id, the agent name
# and the comma-separated model list as argv[1:4], so no request data is ever
# interpolated into source code. It marks the job as running, runs the engine,
# then records the outcome in the shared job-state file.
_ENGINE_JOB_SCRIPT = """
import json
import os
import subprocess
import sys
import time

job_id, agent, model_arg = sys.argv[1:4]
job_state_path = os.environ.get('JOB_STATE_PATH', '/app/data/research-jobs.json')
engine_path = os.environ.get('ENGINE_PATH', '/app/scripts/real-fit-engine.py')


def load_jobs():
    # Best effort: a missing or unreadable state file is treated as "no jobs".
    try:
        with open(job_state_path, encoding='utf-8') as f:
            return json.load(f)
    except Exception:
        return {}


def save_jobs(jobs):
    # Write to a sibling temp file and rename it so that the API process never
    # reads a half-written job table.
    tmp_path = '%s.%d.tmp' % (job_state_path, os.getpid())
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(jobs, f, indent=2)
    os.replace(tmp_path, job_state_path)


def update_job(**fields):
    # Re-read the table on every update so changes made by other processes in
    # the meantime are kept; unknown job ids are silently skipped.
    jobs = load_jobs()
    job = jobs.get(job_id)
    if job:
        job.update(fields)
        job['updated_at'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
        save_jobs(jobs)


update_job(status='running')
cmd = ['python3', engine_path, '--evaluate', agent, '--models', model_arg, '--report']
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
stdout, stderr = proc.communicate()
update_job(
    status='done' if proc.returncode == 0 else 'error',
    progress=100,
    result={'returncode': proc.returncode, 'stdout': stdout, 'stderr': stderr},
)
"""


def _spawn_engine_job(job_id: str, agent: str, models: list[str]) -> None:
    """Start a detached worker that evaluates *agent* on *models*.

    The worker is a separate ``python3`` process running
    ``_ENGINE_JOB_SCRIPT``. It sets the job's status to ``running``, invokes
    the engine script with ``--evaluate <agent> --models <a,b,c> --report``
    and finally stores ``done``/``error``, ``progress`` 100 and the engine's
    return code, stdout and stderr under the job id in the job-state file.
    The ``--report`` flag presumably makes the engine regenerate the report
    JSON; that cannot be confirmed from this module.

    This function returns as soon as the worker is launched and does not wait
    for it. The worker's own output is discarded.

    Args:
        job_id: Key of an existing entry in the job-state file.
        agent: Agent name forwarded to the engine.
        models: Model identifiers, joined with commas for the engine.
    """
    subprocess.Popen(
        ["python3", "-c", _ENGINE_JOB_SCRIPT, job_id, agent, ",".join(models)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
@app.get("/api/models")
def get_models():
    """List every model name mentioned in the meta file or the evolution file.

    Names are collected from each agent's ``model`` entry in the meta file and
    from each agent's ``current.model`` and ``recommendations[*].model`` in
    the evolution file. Empty values are skipped and the result is sorted.
    """
    configured = (
        entry.get("model", "")
        for entry in _load_json(META_PATH).get("agents", {}).values()
    )
    found = {name for name in configured if name}

    for entry in _load_json(EVOLUTION_PATH).get("agents", {}).values():
        candidates = [entry.get("current", {}).get("model", "")]
        candidates.extend(
            rec.get("model", "") for rec in entry.get("recommendations", [])
        )
        found.update(name for name in candidates if name)

    return {"models": sorted(found)}
def _parse_json_column(raw, fallback):
    """Decode a JSON text column.

    ``None`` and undecodable text yield *fallback*; values that are not
    strings are returned unchanged.
    """
    if raw is None:
        return fallback
    if not isinstance(raw, str):
        return raw
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return fallback


@app.get("/api/evaluation/{agent}/{model}")
def get_evaluation(agent: str, model: str):
    """Return the detailed evaluation record for one agent/model pair.

    The record is the highest-scoring evaluation with ``total_score > 0``
    (ties broken by the newest id). When none exists, the newest evaluation of
    any score is used. The test prompt is looked up by the evaluation's
    ``prompt_id``; if that yields no system prompt, the newest prompt stored
    for the agent is used instead. ``scores``, ``rubric`` and
    ``expected_keywords`` are returned decoded from their JSON text.

    Raises:
        HTTPException: 404 when the database file does not exist or no
            evaluation is stored for the pair.
    """
    db_path = str(DB_PATH)
    if not os.path.exists(db_path):
        raise HTTPException(status_code=404, detail="Evaluation database not found")

    conn = sqlite3.connect(db_path)
    # try/finally so the connection is released even when a query raises.
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        # Step 1: best positive-score evaluation for this agent-model pair.
        cursor.execute(
            """
            SELECT e.id, e.agent_name, e.model, e.prompt_id,
                   e.response, e.scores, e.total_score, e.explanation,
                   e.evaluator, e.latency_ms, e.tokens_prompt, e.tokens_response, e.evaluated_at
            FROM evaluations e
            WHERE e.agent_name = ? AND e.model = ? AND e.total_score > 0
            ORDER BY e.total_score DESC, e.id DESC
            LIMIT 1
            """,
            (agent, model),
        )
        row = cursor.fetchone()
        if row is None:
            # Fallback: newest evaluation, even with a score of 0.
            cursor.execute(
                """
                SELECT e.id, e.agent_name, e.model, e.prompt_id,
                       e.response, e.scores, e.total_score, e.explanation,
                       e.evaluator, e.latency_ms, e.tokens_prompt, e.tokens_response, e.evaluated_at
                FROM evaluations e
                WHERE e.agent_name = ? AND e.model = ?
                ORDER BY e.id DESC LIMIT 1
                """,
                (agent, model),
            )
            row = cursor.fetchone()
        if row is None:
            raise HTTPException(
                status_code=404,
                detail="Evaluation not found for this agent-model pair",
            )
        result = dict(row)

        # Step 2: prompt data, by prompt_id first. A prompt row is only
        # accepted when it carries a non-empty system prompt.
        prompt = None
        prompt_id = result.get("prompt_id")
        if prompt_id:
            cursor.execute(
                "SELECT system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts WHERE id = ?",
                (prompt_id,),
            )
            candidate = cursor.fetchone()
            if candidate is not None and candidate["system_prompt"]:
                prompt = candidate
        if prompt is None:
            # Fallback: newest prompt recorded for the agent.
            cursor.execute(
                "SELECT system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts WHERE agent_name = ? ORDER BY id DESC LIMIT 1",
                (agent,),
            )
            prompt = cursor.fetchone()
    finally:
        conn.close()

    if prompt is None:
        result["system_prompt"] = ""
        result["user_prompt"] = ""
        result["expected_keywords"] = "[]"
        result["rubric"] = "{}"
    else:
        result["system_prompt"] = prompt["system_prompt"] or ""
        result["user_prompt"] = prompt["user_prompt"] or ""
        result["expected_keywords"] = prompt["expected_keywords"] or "[]"
        result["rubric"] = prompt["rubric"] or "{}"

    # JSON text columns are handed to the client as structured data.
    result["expected_keywords"] = _parse_json_column(result.get("expected_keywords"), [])
    result["rubric"] = _parse_json_column(result.get("rubric"), {})
    result["scores"] = _parse_json_column(result.get("scores"), {})
    return result
def _sync_agents_from_meta(db_path: Path, meta_path: Path | None = None) -> None:
"""Import any missing agents from kilo-meta.json into the DB agents table."""
if meta_path is None:
meta_path = db_path.parent.parent.parent / "kilo-meta.json"
if not meta_path.exists():
return
with open(meta_path) as f:
meta = json.load(f)
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
cursor.execute("SELECT name FROM agents")
existing = {r[0] for r in cursor.fetchall()}
for name, info in meta.get("agents", {}).items():
if name in existing:
continue
cursor.execute(
"INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
(
name,
info.get("description", ""),
info.get("category", "meta"),
info.get("model", ""),
info.get("color", "#6B7280"),
datetime.now(timezone.utc).isoformat(),
),
)
conn.commit()
conn.close()
def _build_report_from_db(db_path: Path) -> dict:
    """Build the real-fit report from the SQLite database.

    Agents missing from the ``agents`` table are first imported from the meta
    file. Evaluations are then filtered: ``total_score`` must be positive, the
    evaluator must not contain ``rubric_v1`` and the response must not be
    empty or contain ``[HTTP ``. For every agent/model pair one score is kept,
    preferring evaluator ``evolution-skeptic``, then ``rubric_v2``, then any
    other, and within the same evaluator the highest score.

    Returns:
        A dict with ``generated`` (UTC timestamp), ``source``,
        ``total_evaluations`` (number of agent/model pairs kept), ``agents``
        (one entry per row of the ``agents`` table) and ``fit_scores`` (best
        model per agent that has at least one kept evaluation).

    Raises:
        sqlite3.Error: if the expected tables cannot be queried.
    """
    _sync_agents_from_meta(db_path)

    conn = sqlite3.connect(str(db_path))
    # try/finally so the connection is released even when a query raises.
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        cursor.execute("""
            SELECT name, description, category, current_model
            FROM agents
        """)
        agents_meta = {row["name"]: dict(row) for row in cursor.fetchall()}

        # Skip HTTP-error / empty responses and rubric_v1 results; order rows
        # so the preferred evaluator and then the highest score come first.
        cursor.execute("""
            SELECT agent_name, model, total_score, evaluator, response
            FROM evaluations
            WHERE total_score > 0
              AND evaluator NOT LIKE '%rubric_v1%'
              AND (response IS NULL
                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
            ORDER BY agent_name, model,
                     CASE evaluator
                         WHEN 'evolution-skeptic' THEN 0
                         WHEN 'rubric_v2' THEN 1
                         ELSE 2
                     END,
                     total_score DESC
        """)
        # The first row seen for an agent-model pair is the preferred one.
        best_evals: dict = {}
        for row in cursor.fetchall():
            best_evals.setdefault(row["agent_name"], {}).setdefault(
                row["model"], row["total_score"]
            )
    finally:
        conn.close()

    # fit_scores is derived from the same selected evaluations as the
    # per-agent section, so both always name the same best model. Writing one
    # entry per (agent, model) row instead would let a later model overwrite a
    # better earlier one.
    fit_scores = {}
    for agent_name, evals in best_evals.items():
        top_model = max(evals, key=evals.get)
        top_score = evals[top_model]
        fit_scores[agent_name] = {
            "model": top_model,
            "fit": top_score,
            "explanation": (
                f"Best model for {agent_name} is {top_model} "
                f"with best score {top_score:.1f}. "
                "Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
            ),
        }

    agents_report = {}
    for agent_name, meta in agents_meta.items():
        evals = best_evals.get(agent_name, {})
        if evals:
            best_model = max(evals, key=evals.get)
            best_score = evals[best_model]
        else:
            best_model = ""
            best_score = 0.0
        agents_report[agent_name] = {
            "name": agent_name,
            "evaluations": evals,
            "info": [
                meta.get("description") or "",
                meta.get("category") or "",
                meta.get("current_model") or "",
            ],
            "best_model": best_model,
            "best_score": best_score,
        }

    return {
        "generated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "source": "real-fit-engine-db-filtered",
        "total_evaluations": sum(len(evals) for evals in best_evals.values()),
        "agents": agents_report,
        "fit_scores": fit_scores,
    }
@app.get("/api/real-fit-report")
def get_real_fit_report():
    """Serve the real-fit report.

    The report is built from the database when the database file exists;
    otherwise the static report JSON is returned.
    """
    if not os.path.exists(str(DB_PATH)):
        return _load_json(REPORT_PATH)
    return _build_report_from_db(DB_PATH)
@app.post("/api/research")
def start_research(req: ResearchRequest):
    """Register a pending evaluation job for one agent and launch its worker.

    Returns the new job id together with the submitted agent and models.
    """
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"
    job_id = str(uuid.uuid4())
    jobs = _load_jobs()
    record = {
        "id": job_id,
        "agent": req.agent,
        "models": req.models,
        "status": "pending",
        "progress": 0,
        "result": None,
        "created_at": time.strftime(stamp_format, time.gmtime()),
        "updated_at": time.strftime(stamp_format, time.gmtime()),
    }
    jobs[job_id] = record
    # The job must be on disk before the worker starts, because the worker
    # looks it up by id in the job-state file.
    _save_jobs(jobs)
    _spawn_engine_job(job_id, req.agent, req.models)
    return {
        "job_id": job_id,
        "status": "pending",
        "agent": req.agent,
        "models": req.models,
    }
def _extract_scores_from_report(agent: str, models: list[str]) -> list[dict]:
    """Look up *agent*'s score for each of *models* in the report JSON file.

    A model without an entry gets score 0. ``pending`` is true exactly when
    the score equals 0.
    """
    recorded = (
        _load_json(REPORT_PATH)
        .get("agents", {})
        .get(agent, {})
        .get("evaluations", {})
    )

    def describe(model_name: str) -> dict:
        value = recorded.get(model_name, 0)
        return {"model": model_name, "score": value, "pending": value == 0}

    return [describe(model_name) for model_name in models]
@app.get("/api/research/{job_id}")
def get_research(job_id: str):
    """Return the stored state of job *job_id*.

    Once a job is ``done`` and has a result, per-model scores read from the
    report file are attached under ``models_scored``.

    Raises:
        HTTPException: 404 when no job with this id is stored.
    """
    job = _load_jobs().get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    finished = job.get("status") == "done" and job.get("result") is not None
    if finished:
        scored = _extract_scores_from_report(job["agent"], job.get("models", []))
        job["models_scored"] = scored
    return job
@app.post("/api/research/cell")
def research_cell(req: CellRequest):
    """Register a pending job for a single agent/model pair and launch its worker.

    The job is stored with a one-element model list. The response echoes the
    model as ``model``.
    """
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"
    job_id = str(uuid.uuid4())
    jobs = _load_jobs()
    record = {
        "id": job_id,
        "agent": req.agent,
        "models": [req.model],
        "status": "pending",
        "progress": 0,
        "result": None,
        "created_at": time.strftime(stamp_format, time.gmtime()),
        "updated_at": time.strftime(stamp_format, time.gmtime()),
    }
    jobs[job_id] = record
    # The job must be on disk before the worker starts, because the worker
    # looks it up by id in the job-state file.
    _save_jobs(jobs)
    _spawn_engine_job(job_id, req.agent, [req.model])
    return {
        "job_id": job_id,
        "status": "pending",
        "agent": req.agent,
        "model": req.model,
    }
@app.post("/api/evolve-agent/start")
def start_evolve_agent(req: EvolveAgentRequest):
    """Register a role-fit testing job of type ``evolve-agent`` and launch it.

    Currently a placeholder: the job is stored with ``type`` set to
    ``"evolve-agent"`` but is run by the same engine worker as a regular
    research job. The planned full flow is:
      1. evolution-prompt generates role-specific stress-test prompts from
         the agent definition
      2. every model in the list is tested with the same prompt
      3. evolution-skeptic scores each response per rubric dimension
      4. results are stored in SQLite and the report is regenerated
    """
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"
    job_id = str(uuid.uuid4())
    jobs = _load_jobs()
    record = {
        "id": job_id,
        "type": "evolve-agent",
        "agent": req.agent,
        "models": req.models,
        "status": "pending",
        "progress": 0,
        "result": None,
        "created_at": time.strftime(stamp_format, time.gmtime()),
        "updated_at": time.strftime(stamp_format, time.gmtime()),
    }
    jobs[job_id] = record
    _save_jobs(jobs)
    # Placeholder: the regular engine job is spawned. The full implementation
    # is meant to spawn a script that:
    #   1. reads the agent definition from .kilo/agents/{agent}.md
    #   2. calls the Ollama API for evolution-prompt to generate test prompts
    #   3. calls the Ollama API for each model and stores the response
    #   4. calls the Ollama API for evolution-skeptic to evaluate
    #   5. stores the results in SQLite and rebuilds the report
    _spawn_engine_job(job_id, req.agent, req.models)
    return {
        "job_id": job_id,
        "status": "pending",
        "agent": req.agent,
        "models": req.models,
    }