- real-fit.html: API-driven research dashboard with agent/model heatmap, detail modal with score breakdown and evaluator commentary - api.py: FastAPI backend serving /api/real-fit-report (dynamic from SQLite), /api/research, /api/evolve-agent/start - rebuild-report.py: generates real-fit-report.json from SQLite DB for static fallback - docker-compose.yml: add evolution-api service (Python 3.12, uvicorn) for research endpoints - index.standalone.html: sync with dashboard data updates - archive/index.html: standalone dashboard snapshot (263KB) - .gitignore: exclude *.db, research-jobs.json from tracking
491 lines
16 KiB
Python
491 lines
16 KiB
Python
"""
|
|
Evolution Research API — FastAPI backend for agent-model evaluation jobs.
|
|
|
|
Endpoints:
|
|
POST /api/research → start background evaluation job
|
|
GET /api/research/{id} → job status & results
|
|
POST /api/research/cell → evaluate single agent-model pair
|
|
GET /api/real-fit-report → serve real-fit-report.json (live from DB)
|
|
GET /api/models → list available models
|
|
GET /api/evaluation/{agent}/{model} → detailed evaluation record
|
|
POST /api/evolve-agent/start → start role-fit testing job (evolution-prompt + evolution-skeptic)
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import sqlite3
|
|
import subprocess
|
|
import time
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
from fastapi.responses import JSONResponse
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from pydantic import BaseModel
|
|
|
|
# ASGI application object; the route decorators in this module register on it.
app = FastAPI(title="Evolution Research API", version="1.1.0")

# Permit cross-origin browser access from any origin, with any method and header.
# NOTE(review): FastAPI's CORS documentation advises against combining wildcard
# origins/methods/headers with allow_credentials=True — confirm whether
# credentialed requests are really needed, or list the allowed origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
# Filesystem locations used by the API. Every one of them can be overridden
# through the environment variable named in the lookup; the literals are the
# defaults used when the variable is unset.
JOB_STATE_PATH = Path(os.getenv("JOB_STATE_PATH", "/app/data/research-jobs.json"))
REPORT_PATH = Path(os.getenv("REPORT_PATH", "/app/data/real-fit-report.json"))
META_PATH = Path(os.getenv("META_PATH", "/app/kilo-meta.json"))
EVOLUTION_PATH = Path(os.getenv("EVOLUTION_PATH", "/app/data/evolution.json"))
ENGINE_PATH = Path(os.getenv("ENGINE_PATH", "/app/scripts/real-fit-engine.py"))
# The SQLite database defaults to a file sitting next to the static report.
DB_PATH = Path(os.getenv("REAL_FIT_DB", REPORT_PATH.parent / "real-fit.db"))
|
|
|
|
|
|
def _load_json(path: Path) -> dict:
    """Read and decode the JSON document stored at *path*.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded document, or an empty dict when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    # Open directly rather than testing exists() first: the file may disappear
    # between the check and the open, which would otherwise raise instead of
    # yielding the documented empty result.
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
|
|
|
|
|
|
def _save_json(path: Path, data: dict) -> None:
    """Serialize *data* as indented JSON to *path*, creating parent directories.

    The document is first written to a uniquely named sibling file and then
    moved over the target with ``os.replace``. A reader therefore sees either
    the previous complete document or the new one, never a truncated file, and
    a failed serialization leaves the previous content untouched.

    Args:
        path: Destination file.
        data: JSON-serializable document.

    Raises:
        TypeError: If *data* contains values ``json`` cannot serialize.
        OSError: If the file cannot be written or moved into place.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so the final rename stays on one filesystem.
    tmp_path = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_path, path)
    except BaseException:
        # Never leave a stray temporary file behind on failure.
        tmp_path.unlink(missing_ok=True)
        raise
|
|
|
|
|
|
def _load_jobs() -> dict:
    """Return the job registry stored at JOB_STATE_PATH (empty if absent)."""
    job_registry = _load_json(JOB_STATE_PATH)
    return job_registry
|
|
|
|
|
|
def _save_jobs(jobs: dict) -> None:
    """Write the complete job registry back to JOB_STATE_PATH."""
    _save_json(path=JOB_STATE_PATH, data=jobs)
|
|
|
|
|
|
class ResearchRequest(BaseModel):
    # Request body of POST /api/research: one agent evaluated against several models.
    agent: str
    models: list[str]
|
|
|
|
|
|
class CellRequest(BaseModel):
    # Request body of POST /api/research/cell: a single agent-model pair.
    agent: str
    model: str
|
|
|
|
|
|
class EvolveAgentRequest(BaseModel):
    # Request body of POST /api/evolve-agent/start: one agent tested against several models.
    agent: str
    models: list[str]
|
|
|
|
|
|
def _spawn_engine_job(job_id: str, agent: str, models: list[str]) -> None:
    """Start a detached worker process that runs real-fit-engine.py for one job.

    The worker is an inline ``python3 -c`` script. It marks the job as
    ``running`` in the job-state file, runs the engine with
    ``--evaluate <agent> --models <comma-separated models> --report``, and then
    stores the outcome (``done``/``error``, return code, captured stdout and
    stderr) back into the job record. The original docstring states that the
    engine regenerates the report JSON afterwards — presumably via ``--report``;
    confirm against the engine script.

    If the engine cannot be launched at all, the job is now recorded as
    ``error`` with return code ``-1`` instead of staying ``running`` forever.

    Args:
        job_id: Key of the job record inside the job-state file.
        agent: Agent name forwarded to the engine.
        models: Model names; joined with commas for the engine's ``--models``.

    Returns:
        None. The worker is not awaited; its own output is discarded.
    """
    # Values reach the worker through sys.argv rather than being interpolated
    # into the script source, so no quoting or escaping of user input is needed.
    worker_script = """
import json, os, subprocess, sys, time

job_id, agent, model_arg = sys.argv[1:4]
job_state_path = os.environ.get('JOB_STATE_PATH', '/app/data/research-jobs.json')
engine_path = os.environ.get('ENGINE_PATH', '/app/scripts/real-fit-engine.py')


def load_jobs():
    try:
        with open(job_state_path) as f:
            return json.load(f)
    except Exception:
        return {}


def save_jobs(jobs):
    with open(job_state_path, 'w') as f:
        json.dump(jobs, f, indent=2)


def update_job(**fields):
    jobs = load_jobs()
    job = jobs.get(job_id)
    if job:
        job.update(fields)
        job['updated_at'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
        save_jobs(jobs)


update_job(status='running')

cmd = ['python3', engine_path, '--evaluate', agent, '--models', model_arg, '--report']
try:
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    returncode, stdout, stderr = proc.returncode, proc.stdout, proc.stderr
except OSError as exc:
    returncode, stdout, stderr = -1, '', 'failed to launch engine: ' + str(exc)

update_job(
    status='done' if returncode == 0 else 'error',
    progress=100,
    result={'returncode': returncode, 'stdout': stdout, 'stderr': stderr},
)
"""
    subprocess.Popen(
        ["python3", "-c", worker_script, job_id, agent, ",".join(models)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
|
|
|
|
|
|
@app.get("/api/models")
def get_models():
    """List every model name mentioned in the metadata and evolution files.

    Collects the ``model`` of each agent in META_PATH, plus the ``current``
    model and all recommendation models of each agent in EVOLUTION_PATH.

    Returns:
        ``{"models": [...]}`` with the distinct, non-empty names sorted.
    """
    meta_agents = _load_json(META_PATH).get("agents", {})
    candidates = [entry.get("model", "") for entry in meta_agents.values()]

    evolution_agents = _load_json(EVOLUTION_PATH).get("agents", {})
    for entry in evolution_agents.values():
        candidates.append(entry.get("current", {}).get("model", ""))
        candidates.extend(
            rec.get("model", "") for rec in entry.get("recommendations", [])
        )

    # Empty/missing names are dropped; the set removes duplicates.
    return {"models": sorted({name for name in candidates if name})}
|
|
|
|
|
|
@app.get("/api/evaluation/{agent}/{model}")
def get_evaluation(agent: str, model: str):
    """Return the detailed evaluation record for one agent-model pair.

    The highest-scoring evaluation with ``total_score > 0`` is preferred (ties
    broken by newest id); if none exists, the newest evaluation of any score is
    used. The record is enriched with the test prompt it was produced from,
    looked up by ``prompt_id`` and, failing that, by the newest prompt stored
    for the agent. ``expected_keywords``, ``rubric`` and ``scores`` are decoded
    from their JSON text; undecodable or missing values become ``[]`` (keywords)
    or ``{}``.

    Args:
        agent: Agent name as stored in ``evaluations.agent_name``.
        model: Model name as stored in ``evaluations.model``.

    Returns:
        A dict with the evaluation columns plus ``system_prompt``,
        ``user_prompt``, ``expected_keywords`` and ``rubric``.

    Raises:
        HTTPException: 404 when the database file or a matching evaluation
            does not exist.
    """
    db_path = str(DB_PATH)
    if not os.path.exists(db_path):
        raise HTTPException(status_code=404, detail="Evaluation database not found")

    conn = sqlite3.connect(db_path)
    # try/finally releases the connection on every path, including query
    # failures (e.g. a missing table) that previously leaked it.
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        # Step 1: best positive-score evaluation for this agent-model pair.
        cursor.execute(
            """
            SELECT e.id, e.agent_name, e.model, e.prompt_id,
                   e.response, e.scores, e.total_score, e.explanation,
                   e.evaluator, e.latency_ms, e.tokens_prompt, e.tokens_response, e.evaluated_at
            FROM evaluations e
            WHERE e.agent_name = ? AND e.model = ? AND e.total_score > 0
            ORDER BY e.total_score DESC, e.id DESC
            LIMIT 1
            """,
            (agent, model),
        )
        row = cursor.fetchone()

        if row is None:
            # Fallback: accept any evaluation, even one scored 0.
            cursor.execute(
                """
                SELECT e.id, e.agent_name, e.model, e.prompt_id,
                       e.response, e.scores, e.total_score, e.explanation,
                       e.evaluator, e.latency_ms, e.tokens_prompt, e.tokens_response, e.evaluated_at
                FROM evaluations e
                WHERE e.agent_name = ? AND e.model = ?
                ORDER BY e.id DESC LIMIT 1
                """,
                (agent, model),
            )
            row = cursor.fetchone()

        if row is None:
            raise HTTPException(status_code=404, detail="Evaluation not found for this agent-model pair")

        result = dict(row)
        prompt_id = result.get("prompt_id")

        # Step 2: prompt data — by prompt_id first, then by agent name.
        system_prompt = ""
        user_prompt = ""
        expected_keywords_raw = "[]"
        rubric_raw = "{}"

        if prompt_id:
            cursor.execute(
                "SELECT system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts WHERE id = ?",
                (prompt_id,),
            )
            tp = cursor.fetchone()
            if tp is not None and tp["system_prompt"]:
                system_prompt = tp["system_prompt"]
                user_prompt = tp["user_prompt"] or ""
                expected_keywords_raw = tp["expected_keywords"] or "[]"
                rubric_raw = tp["rubric"] or "{}"

        # Fallback: the prompt_id lookup produced no system prompt, so use the
        # newest prompt stored for this agent.
        if not system_prompt:
            cursor.execute(
                "SELECT system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts WHERE agent_name = ? ORDER BY id DESC LIMIT 1",
                (agent,),
            )
            tp = cursor.fetchone()
            if tp is not None:
                system_prompt = tp["system_prompt"] or ""
                user_prompt = tp["user_prompt"] or ""
                expected_keywords_raw = tp["expected_keywords"] or "[]"
                rubric_raw = tp["rubric"] or "{}"
    finally:
        conn.close()

    result["system_prompt"] = system_prompt
    result["user_prompt"] = user_prompt
    result["expected_keywords"] = expected_keywords_raw
    result["rubric"] = rubric_raw

    # Decode the JSON-text columns; fall back to an empty container of the
    # matching kind when the text is missing or malformed.
    for key in ("expected_keywords", "rubric", "scores"):
        raw = result.get(key)
        empty = [] if key == "expected_keywords" else {}
        if isinstance(raw, str):
            try:
                result[key] = json.loads(raw)
            except json.JSONDecodeError:
                result[key] = empty
        elif raw is None:
            result[key] = empty

    return result
|
|
|
|
|
|
def _sync_agents_from_meta(db_path: Path, meta_path: Path | None = None) -> None:
|
|
"""Import any missing agents from kilo-meta.json into the DB agents table."""
|
|
if meta_path is None:
|
|
meta_path = db_path.parent.parent.parent / "kilo-meta.json"
|
|
if not meta_path.exists():
|
|
return
|
|
with open(meta_path) as f:
|
|
meta = json.load(f)
|
|
|
|
conn = sqlite3.connect(str(db_path))
|
|
cursor = conn.cursor()
|
|
cursor.execute("SELECT name FROM agents")
|
|
existing = {r[0] for r in cursor.fetchall()}
|
|
|
|
for name, info in meta.get("agents", {}).items():
|
|
if name in existing:
|
|
continue
|
|
cursor.execute(
|
|
"INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
|
|
(
|
|
name,
|
|
info.get("description", ""),
|
|
info.get("category", "meta"),
|
|
info.get("model", ""),
|
|
info.get("color", "#6B7280"),
|
|
datetime.now(timezone.utc).isoformat(),
|
|
),
|
|
)
|
|
conn.commit()
|
|
conn.close()
|
|
|
|
|
|
def _build_report_from_db(db_path: Path) -> dict:
    """Build the real-fit report directly from the SQLite database.

    Agents missing from the DB are first imported via
    ``_sync_agents_from_meta``. Evaluations are then filtered: only rows with
    ``total_score > 0``, an evaluator not matching ``rubric_v1``, and a
    response that is NULL or neither empty nor containing ``[HTTP `` are
    considered. For every agent-model pair a single score is selected: the
    highest score from the most preferred evaluator
    (``evolution-skeptic``, then ``rubric_v2``, then anything else).

    ``fit_scores`` is derived from those selected scores, one entry per agent
    naming its best model. Previously it was filled once per agent-model group
    into a dict keyed by agent only, so an arbitrary model overwrote the others.

    Args:
        db_path: SQLite database with ``agents`` and ``evaluations`` tables.

    Returns:
        A dict with ``generated`` (UTC timestamp), ``source``,
        ``total_evaluations`` (number of scored agent-model pairs), ``agents``
        (per-agent scores, info and best model) and ``fit_scores``.
    """
    _sync_agents_from_meta(db_path)

    conn = sqlite3.connect(str(db_path))
    # try/finally so a failing query cannot leak the connection.
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        cursor.execute("""
            SELECT name, description, category, current_model
            FROM agents
        """)
        agents_meta = {row["name"]: dict(row) for row in cursor.fetchall()}

        # Rows arrive ordered by evaluator preference, then by score, so the
        # first row seen for an agent-model pair is the one to keep.
        cursor.execute("""
            SELECT agent_name, model, total_score
            FROM evaluations
            WHERE total_score > 0
              AND evaluator NOT LIKE '%rubric_v1%'
              AND (response IS NULL
                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
            ORDER BY agent_name, model,
                     CASE evaluator
                         WHEN 'evolution-skeptic' THEN 0
                         WHEN 'rubric_v2' THEN 1
                         ELSE 2
                     END,
                     total_score DESC
        """)
        best_evals: dict = {}
        for row in cursor.fetchall():
            per_model = best_evals.setdefault(row["agent_name"], {})
            per_model.setdefault(row["model"], row["total_score"])
    finally:
        conn.close()

    # One fit entry per agent: its top-scoring model among the selected scores.
    fit_scores = {}
    for agent_name, evals in best_evals.items():
        top_model = max(evals, key=evals.get)
        top_score = evals[top_model]
        fit_scores[agent_name] = {
            "model": top_model,
            "fit": top_score,
            "explanation": (
                f"Best model for {agent_name} is {top_model} "
                f"with score {top_score:.1f}. "
                "Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
            ),
        }

    agents_report = {}
    for agent_name, meta in agents_meta.items():
        evals = best_evals.get(agent_name, {})
        if evals:
            best_model = max(evals, key=evals.get)
            best_score = evals[best_model]
        else:
            # Agent known to the DB but without any usable evaluation yet.
            best_model = ""
            best_score = 0.0
        agents_report[agent_name] = {
            "name": agent_name,
            "evaluations": evals,
            "info": [
                meta.get("description") or "",
                meta.get("category") or "",
                meta.get("current_model") or "",
            ],
            "best_model": best_model,
            "best_score": best_score,
        }

    total_evals = sum(len(evals) for evals in best_evals.values())

    return {
        "generated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "source": "real-fit-engine-db-filtered",
        "total_evaluations": total_evals,
        "agents": agents_report,
        "fit_scores": fit_scores,
    }
|
|
|
|
|
|
@app.get("/api/real-fit-report")
def get_real_fit_report():
    """Serve the real-fit report.

    The report is built live from the SQLite database when the database file
    exists; otherwise the static JSON report at REPORT_PATH is returned.
    """
    if not os.path.exists(str(DB_PATH)):
        # No database available: fall back to the pre-generated report file.
        return _load_json(REPORT_PATH)
    return _build_report_from_db(DB_PATH)
|
|
|
|
|
|
@app.post("/api/research")
def start_research(req: ResearchRequest):
    """Register a new evaluation job and launch its background worker.

    A job record with status ``pending`` is stored in the job registry under a
    fresh UUID, then the engine worker is spawned for the requested agent and
    models.

    Returns:
        The job id, its initial status, and the requested agent and models.
    """
    def utc_stamp() -> str:
        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    job_id = str(uuid.uuid4())
    registry = _load_jobs()
    registry[job_id] = dict(
        id=job_id,
        agent=req.agent,
        models=req.models,
        status="pending",
        progress=0,
        result=None,
        created_at=utc_stamp(),
        updated_at=utc_stamp(),
    )
    _save_jobs(registry)

    _spawn_engine_job(job_id, req.agent, req.models)

    response = {"job_id": job_id, "status": "pending"}
    response["agent"] = req.agent
    response["models"] = req.models
    return response
|
|
|
|
|
|
def _extract_scores_from_report(agent: str, models: list[str]) -> list[dict]:
    """Look up the scores of *agent* for each of *models* in the static report.

    Reads the JSON report at REPORT_PATH. A model without an entry gets score
    0; an entry is flagged ``pending`` exactly when its score equals 0.

    Returns:
        One ``{"model", "score", "pending"}`` dict per requested model, in the
        order given.
    """
    per_model = (
        _load_json(REPORT_PATH)
        .get("agents", {})
        .get(agent, {})
        .get("evaluations", {})
    )

    def describe(name: str) -> dict:
        value = per_model.get(name, 0)
        return {"model": name, "score": value, "pending": value == 0}

    return [describe(name) for name in models]
|
|
|
|
|
|
@app.get("/api/research/{job_id}")
def get_research(job_id: str):
    """Return the stored record of one job.

    For a job whose status is ``done`` and that carries a result, the record is
    extended with ``models_scored``, taken from the static report.

    Raises:
        HTTPException: 404 when no job with this id is registered.
    """
    job = _load_jobs().get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    finished_with_result = job.get("status") == "done" and job.get("result") is not None
    if finished_with_result:
        requested_models = job.get("models", [])
        job["models_scored"] = _extract_scores_from_report(job["agent"], requested_models)
    return job
|
|
|
|
|
|
@app.post("/api/research/cell")
def research_cell(req: CellRequest):
    """Register and launch an evaluation job for a single agent-model pair.

    The job record is stored with status ``pending`` and a one-element model
    list, then the engine worker is spawned for that pair.

    Returns:
        The job id, its initial status, and the requested agent and model.
    """
    def utc_stamp() -> str:
        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    job_id = str(uuid.uuid4())
    registry = _load_jobs()
    registry[job_id] = dict(
        id=job_id,
        agent=req.agent,
        models=[req.model],
        status="pending",
        progress=0,
        result=None,
        created_at=utc_stamp(),
        updated_at=utc_stamp(),
    )
    _save_jobs(registry)

    _spawn_engine_job(job_id, req.agent, [req.model])

    response = {"job_id": job_id, "status": "pending"}
    response["agent"] = req.agent
    response["model"] = req.model
    return response
|
|
|
|
|
|
@app.post("/api/evolve-agent/start")
def start_evolve_agent(req: EvolveAgentRequest):
    """Queue a role-fit testing job (evolution-prompt + evolution-skeptic).

    Currently a placeholder: the job is recorded with type ``evolve-agent`` and
    handed to the same engine worker used by the research endpoints.

    Planned full flow:
    1. evolution-prompt derives role-specific stress-test prompts from the
       agent definition.
    2. Every model in the list is tested with the same prompt.
    3. evolution-skeptic scores each response per rubric dimension.
    4. Results are stored in SQLite and the report is regenerated.

    Returns:
        The job id, its initial status, and the requested agent and models.
    """
    def utc_stamp() -> str:
        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    job_id = str(uuid.uuid4())
    registry = _load_jobs()
    registry[job_id] = dict(
        [
            ("id", job_id),
            ("type", "evolve-agent"),
            ("agent", req.agent),
            ("models", req.models),
            ("status", "pending"),
            ("progress", 0),
            ("result", None),
            ("created_at", utc_stamp()),
            ("updated_at", utc_stamp()),
        ]
    )
    _save_jobs(registry)

    # Placeholder: reuse the generic engine worker. The dedicated script is
    # meant to read the agent definition from .kilo/agents/{agent}.md, call the
    # Ollama API for evolution-prompt, run each model, call evolution-skeptic
    # to evaluate, then store results in SQLite and rebuild the report.
    _spawn_engine_job(job_id, req.agent, req.models)

    response = {"job_id": job_id, "status": "pending"}
    response["agent"] = req.agent
    response["models"] = req.models
    return response