feat(evolution): add real-fit dashboard, API, report builder, and docker compose
- real-fit.html: API-driven research dashboard with agent/model heatmap, detail modal with score breakdown and evaluator commentary - api.py: FastAPI backend serving /api/real-fit-report (dynamic from SQLite), /api/research, /api/evolve-agent/start - rebuild-report.py: generates real-fit-report.json from SQLite DB for static fallback - docker-compose.yml: add evolution-api service (Python 3.12, uvicorn) for research endpoints - index.standalone.html: sync with dashboard data updates - archive/index.html: standalone dashboard snapshot (263KB) - .gitignore: exclude *.db, research-jobs.json from tracking
This commit is contained in:
491
agent-evolution/api.py
Normal file
491
agent-evolution/api.py
Normal file
@@ -0,0 +1,491 @@
|
||||
"""
|
||||
Evolution Research API — FastAPI backend for agent-model evaluation jobs.
|
||||
|
||||
Endpoints:
|
||||
POST /api/research → start background evaluation job
|
||||
GET /api/research/{id} → job status & results
|
||||
POST /api/research/cell → evaluate single agent-model pair
|
||||
GET /api/real-fit-report → serve real-fit-report.json (live from DB)
|
||||
GET /api/models → list available models
|
||||
GET /api/evaluation/{agent}/{model} → detailed evaluation record
|
||||
POST /api/evolve-agent/start → start role-fit testing job (evolution-prompt + evolution-skeptic)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.responses import JSONResponse
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel
|
||||
|
||||
# FastAPI application object; the route decorators in this module register on it.
app = FastAPI(title="Evolution Research API", version="1.1.0")

# CORS is fully open: every origin, method and header is allowed.
# NOTE(review): a wildcard origin together with allow_credentials=True permits
# credentialed cross-origin requests from any site — confirm this is intended,
# otherwise list the dashboard origins explicitly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# Filesystem locations used by the API. Each one can be overridden through the
# environment variable named in its os.environ.get() call; the defaults are
# paths under /app.

# JSON file holding the state of all research jobs.
JOB_STATE_PATH = Path(os.environ.get("JOB_STATE_PATH", "/app/data/research-jobs.json"))
# Static real-fit report, served when no database is available.
REPORT_PATH = Path(os.environ.get("REPORT_PATH", "/app/data/real-fit-report.json"))
# Agent metadata file (read for its "agents" mapping).
META_PATH = Path(os.environ.get("META_PATH", "/app/kilo-meta.json"))
# Evolution data file (read for current and recommended models).
EVOLUTION_PATH = Path(os.environ.get("EVOLUTION_PATH", "/app/data/evolution.json"))
# Evaluation engine script.
ENGINE_PATH = Path(os.environ.get("ENGINE_PATH", "/app/scripts/real-fit-engine.py"))
# SQLite database; defaults to "real-fit.db" next to the report file.
DB_PATH = Path(os.environ.get("REAL_FIT_DB", REPORT_PATH.parent / "real-fit.db"))
|
||||
|
||||
|
||||
def _load_json(path: Path) -> dict:
|
||||
if path.exists():
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
return {}
|
||||
|
||||
|
||||
def _save_json(path: Path, data: dict) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
json.dump(data, f, indent=2)
|
||||
|
||||
|
||||
def _load_jobs() -> dict:
    """Return the job registry read from ``JOB_STATE_PATH``."""
    registry = _load_json(JOB_STATE_PATH)
    return registry
|
||||
|
||||
|
||||
def _save_jobs(jobs: dict) -> None:
    """Persist the job registry *jobs* to ``JOB_STATE_PATH``."""
    _save_json(path=JOB_STATE_PATH, data=jobs)
|
||||
|
||||
|
||||
class ResearchRequest(BaseModel):
    """Request body of ``POST /api/research``."""

    # Name of the agent to evaluate.
    agent: str
    # Models the agent is evaluated against.
    models: list[str]
|
||||
|
||||
|
||||
class CellRequest(BaseModel):
    """Request body of ``POST /api/research/cell`` (one agent-model pair)."""

    # Name of the agent to evaluate.
    agent: str
    # The single model to evaluate the agent against.
    model: str
|
||||
|
||||
|
||||
class EvolveAgentRequest(BaseModel):
    """Request body of ``POST /api/evolve-agent/start``."""

    # Name of the agent to test.
    agent: str
    # Models the agent is tested against.
    models: list[str]
|
||||
|
||||
|
||||
def _spawn_engine_job(job_id: str, agent: str, models: list[str]) -> None:
|
||||
"""Spawn real-fit-engine.py as a background subprocess to evaluate models.
|
||||
|
||||
After evaluation, regenerates the report JSON so results are immediately visible.
|
||||
"""
|
||||
model_arg = ",".join(models)
|
||||
subprocess.Popen(
|
||||
["python3", "-c", f"""
|
||||
import subprocess, json, time, os
|
||||
job_id = {repr(job_id)}
|
||||
job_state_path = os.environ.get('JOB_STATE_PATH', '/app/data/research-jobs.json')
|
||||
engine_path = os.environ.get('ENGINE_PATH', '/app/scripts/real-fit-engine.py')
|
||||
|
||||
def load_jobs():
|
||||
try:
|
||||
with open(job_state_path) as f:
|
||||
return json.load(f)
|
||||
except Exception:
|
||||
return {{}}
|
||||
|
||||
def save_jobs(jobs):
|
||||
with open(job_state_path, 'w') as f:
|
||||
json.dump(jobs, f, indent=2)
|
||||
|
||||
jobs = load_jobs()
|
||||
job = jobs.get(job_id)
|
||||
if job:
|
||||
job['status'] = 'running'
|
||||
job['updated_at'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
|
||||
save_jobs(jobs)
|
||||
|
||||
cmd = ['python3', engine_path, '--evaluate', {repr(agent)}, '--models', {repr(model_arg)}, '--report']
|
||||
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
|
||||
stdout, stderr = proc.communicate()
|
||||
|
||||
jobs = load_jobs()
|
||||
job = jobs.get(job_id)
|
||||
if job:
|
||||
job['status'] = 'done' if proc.returncode == 0 else 'error'
|
||||
job['progress'] = 100
|
||||
job['result'] = {{'returncode': proc.returncode, 'stdout': stdout, 'stderr': stderr}}
|
||||
job['updated_at'] = time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())
|
||||
save_jobs(jobs)
|
||||
"""],
|
||||
stdout=subprocess.DEVNULL,
|
||||
stderr=subprocess.DEVNULL,
|
||||
)
|
||||
|
||||
|
||||
@app.get("/api/models")
def get_models():
    """List every model name known from the metadata and evolution files.

    Names are collected from each agent's ``model`` in the metadata file and,
    in the evolution file, from each agent's ``current.model`` and from the
    ``model`` of every recommendation. Empty values are skipped.

    Returns:
        ``{"models": [...]}`` with the unique names in sorted order.
    """
    found = set()

    meta_agents = _load_json(META_PATH).get("agents", {})
    for entry in meta_agents.values():
        name = entry.get("model", "")
        if name:
            found.add(name)

    evolution_agents = _load_json(EVOLUTION_PATH).get("agents", {})
    for agent_data in evolution_agents.values():
        candidates = [agent_data.get("current", {}).get("model", "")]
        candidates.extend(
            rec.get("model", "") for rec in agent_data.get("recommendations", [])
        )
        found.update(name for name in candidates if name)

    return {"models": sorted(found)}
|
||||
|
||||
|
||||
@app.get("/api/evaluation/{agent}/{model}")
def get_evaluation(agent: str, model: str):
    """Return the detailed evaluation record for one agent-model pair.

    The record is the highest-scoring evaluation with ``total_score > 0``
    (ties broken by the newest id); if none exists, the newest evaluation of
    any score is used. The matching test prompt is attached: first looked up
    by the evaluation's ``prompt_id``, and, when that yields no system prompt,
    by the newest prompt registered for the agent.

    Args:
        agent: Agent name (``evaluations.agent_name``).
        model: Model name (``evaluations.model``).

    Returns:
        A dict of the evaluation columns plus ``system_prompt``,
        ``user_prompt``, ``expected_keywords`` and ``rubric``. The JSON text
        columns ``expected_keywords``, ``rubric`` and ``scores`` are decoded;
        undecodable or missing values become ``[]`` / ``{}``.

    Raises:
        HTTPException: 404 when the database file or the evaluation is missing.
    """
    db_path = str(DB_PATH)
    if not os.path.exists(db_path):
        raise HTTPException(status_code=404, detail="Evaluation database not found")

    select_eval = (
        "SELECT e.id, e.agent_name, e.model, e.prompt_id, "
        "e.response, e.scores, e.total_score, e.explanation, "
        "e.evaluator, e.latency_ms, e.tokens_prompt, e.tokens_response, e.evaluated_at "
        "FROM evaluations e "
    )
    select_prompt = (
        "SELECT system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts "
    )

    conn = sqlite3.connect(db_path)
    # try/finally so the connection is released even when a query raises
    # (e.g. a missing table) or the 404 below is thrown.
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        # Step 1: best positive-score evaluation for this agent-model pair.
        cursor.execute(
            select_eval
            + "WHERE e.agent_name = ? AND e.model = ? AND e.total_score > 0 "
            + "ORDER BY e.total_score DESC, e.id DESC LIMIT 1",
            (agent, model),
        )
        row = cursor.fetchone()

        if not row:
            # Fallback: accept any evaluation, even with score 0.
            cursor.execute(
                select_eval
                + "WHERE e.agent_name = ? AND e.model = ? "
                + "ORDER BY e.id DESC LIMIT 1",
                (agent, model),
            )
            row = cursor.fetchone()

        if not row:
            raise HTTPException(
                status_code=404,
                detail="Evaluation not found for this agent-model pair",
            )

        result = dict(row)
        prompt_id = result.get("prompt_id")

        # Step 2: prompt data — by prompt_id first, then by agent name.
        prompt_row = None
        if prompt_id:
            cursor.execute(select_prompt + "WHERE id = ?", (prompt_id,))
            candidate = cursor.fetchone()
            # A prompt without a system prompt is treated as not found.
            if candidate and candidate["system_prompt"]:
                prompt_row = candidate

        if prompt_row is None:
            cursor.execute(
                select_prompt + "WHERE agent_name = ? ORDER BY id DESC LIMIT 1",
                (agent,),
            )
            prompt_row = cursor.fetchone()

        if prompt_row:
            result["system_prompt"] = prompt_row["system_prompt"] or ""
            result["user_prompt"] = prompt_row["user_prompt"] or ""
            result["expected_keywords"] = prompt_row["expected_keywords"] or "[]"
            result["rubric"] = prompt_row["rubric"] or "{}"
        else:
            result["system_prompt"] = ""
            result["user_prompt"] = ""
            result["expected_keywords"] = "[]"
            result["rubric"] = "{}"
    finally:
        conn.close()

    # Decode the JSON text columns; fall back to an empty container.
    for key in ("expected_keywords", "rubric", "scores"):
        empty = [] if key == "expected_keywords" else {}
        raw = result.get(key)
        if isinstance(raw, str):
            try:
                result[key] = json.loads(raw)
            except json.JSONDecodeError:
                result[key] = empty
        elif raw is None:
            result[key] = empty

    return result
|
||||
|
||||
|
||||
def _sync_agents_from_meta(db_path: Path, meta_path: Path | None = None) -> None:
|
||||
"""Import any missing agents from kilo-meta.json into the DB agents table."""
|
||||
if meta_path is None:
|
||||
meta_path = db_path.parent.parent.parent / "kilo-meta.json"
|
||||
if not meta_path.exists():
|
||||
return
|
||||
with open(meta_path) as f:
|
||||
meta = json.load(f)
|
||||
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT name FROM agents")
|
||||
existing = {r[0] for r in cursor.fetchall()}
|
||||
|
||||
for name, info in meta.get("agents", {}).items():
|
||||
if name in existing:
|
||||
continue
|
||||
cursor.execute(
|
||||
"INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
|
||||
(
|
||||
name,
|
||||
info.get("description", ""),
|
||||
info.get("category", "meta"),
|
||||
info.get("model", ""),
|
||||
info.get("color", "#6B7280"),
|
||||
datetime.now(timezone.utc).isoformat(),
|
||||
),
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
|
||||
def _build_report_from_db(db_path: Path) -> dict:
    """Build the real-fit report from the SQLite database.

    Agents missing from the DB are first imported from the metadata file.
    Evaluations are then filtered: only rows with ``total_score > 0`` whose
    evaluator does not contain ``rubric_v1`` and whose response is either NULL
    or a non-empty text without an ``[HTTP `` marker are considered. For every
    agent-model pair one score is selected: evaluator ``evolution-skeptic`` is
    preferred over ``rubric_v2`` over anything else, and within the same
    evaluator the highest score wins.

    ``fit_scores`` is derived from those selected scores, so each agent's
    entry names its best model and agrees with ``best_model`` / ``best_score``
    in the ``agents`` section. (It was previously filled once per agent-model
    group, which left whichever model happened to be iterated last.)

    Args:
        db_path: SQLite database with the ``agents`` and ``evaluations`` tables.

    Returns:
        A dict with ``generated`` (UTC timestamp), ``source``,
        ``total_evaluations`` (number of agent-model pairs with a selected
        score), ``agents`` (one entry per row of the ``agents`` table) and
        ``fit_scores`` (one entry per agent that has a selected score).

    Raises:
        sqlite3.Error: If a required table is missing or unreadable.
    """
    _sync_agents_from_meta(db_path)

    conn = sqlite3.connect(str(db_path))
    # try/finally so the connection is released even when a query raises.
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        cursor.execute("""
            SELECT name, description, category, current_model
            FROM agents
        """)
        agents_meta = {row["name"]: dict(row) for row in cursor.fetchall()}

        # Skip HTTP error responses and rubric_v1; order so that the first row
        # seen for an agent-model pair is the preferred one.
        cursor.execute("""
            SELECT agent_name, model, total_score
            FROM evaluations
            WHERE total_score > 0
              AND evaluator NOT LIKE '%rubric_v1%'
              AND (response IS NULL
                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
            ORDER BY agent_name, model,
                     CASE evaluator
                         WHEN 'evolution-skeptic' THEN 0
                         WHEN 'rubric_v2' THEN 1
                         ELSE 2
                     END,
                     total_score DESC
        """)

        best_evals: dict = {}
        for row in cursor.fetchall():
            # setdefault keeps the first (preferred) score per agent-model pair.
            best_evals.setdefault(row["agent_name"], {}).setdefault(
                row["model"], row["total_score"]
            )
    finally:
        conn.close()

    def _top(evals: dict) -> tuple:
        """Return (model, score) of the highest selected score, or ("", 0.0)."""
        if not evals:
            return "", 0.0
        model = max(evals, key=evals.get)
        return model, evals[model]

    fit_scores = {}
    for agent_name, evals in best_evals.items():
        top_model, top_score = _top(evals)
        fit_scores[agent_name] = {
            "model": top_model,
            "fit": top_score,
            "explanation": (
                f"Best model for {agent_name} is {top_model} "
                f"with score {top_score:.1f}. "
                "Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
            ),
        }

    agents_report = {}
    for agent_name, meta in agents_meta.items():
        evals = best_evals.get(agent_name, {})
        best_model, best_score = _top(evals)
        agents_report[agent_name] = {
            "name": agent_name,
            "evaluations": evals,
            "info": [
                meta.get("description") or "",
                meta.get("category") or "",
                meta.get("current_model") or "",
            ],
            "best_model": best_model,
            "best_score": best_score,
        }

    total_evals = sum(len(evals) for evals in best_evals.values())

    return {
        "generated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "source": "real-fit-engine-db-filtered",
        "total_evaluations": total_evals,
        "agents": agents_report,
        "fit_scores": fit_scores,
    }
|
||||
|
||||
|
||||
@app.get("/api/real-fit-report")
def get_real_fit_report():
    """Serve the real-fit report.

    The report is built live from the database when the database file exists;
    otherwise the static report JSON file is returned (empty dict if absent).
    """
    if not os.path.exists(str(DB_PATH)):
        return _load_json(REPORT_PATH)
    return _build_report_from_db(DB_PATH)
|
||||
|
||||
|
||||
@app.post("/api/research")
def start_research(req: ResearchRequest):
    """Register a new evaluation job and launch it in the background.

    A job entry with status ``pending`` is stored in the job registry, then the
    engine worker is spawned for the requested agent and models.

    Returns:
        The new job id together with the echoed agent and models.
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"
    job_id = str(uuid.uuid4())

    registry = _load_jobs()
    registry[job_id] = dict(
        id=job_id,
        agent=req.agent,
        models=req.models,
        status="pending",
        progress=0,
        result=None,
        created_at=time.strftime(time_format, time.gmtime()),
        updated_at=time.strftime(time_format, time.gmtime()),
    )
    _save_jobs(registry)

    _spawn_engine_job(job_id, req.agent, req.models)

    response = {"job_id": job_id, "status": "pending"}
    response["agent"] = req.agent
    response["models"] = req.models
    return response
|
||||
|
||||
|
||||
def _extract_scores_from_report(agent: str, models: list[str]) -> list[dict]:
    """Look up the scores of *agent* for each of *models* in the report file.

    Returns:
        One dict per requested model, in request order, with ``model``,
        ``score`` (0 when the report has no entry) and ``pending`` (true when
        the score equals 0).
    """
    evaluations = (
        _load_json(REPORT_PATH).get("agents", {}).get(agent, {}).get("evaluations", {})
    )

    def _entry(model_name: str) -> dict:
        value = evaluations.get(model_name, 0)
        return {"model": model_name, "score": value, "pending": value == 0}

    return [_entry(model_name) for model_name in models]
|
||||
|
||||
|
||||
@app.get("/api/research/{job_id}")
def get_research(job_id: str):
    """Return the stored state of one job.

    For a job that is ``done`` and has a result, the per-model scores read
    from the report file are attached under ``models_scored``.

    Raises:
        HTTPException: 404 when no job with this id exists.
    """
    job = _load_jobs().get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    finished = job.get("status") == "done" and job.get("result") is not None
    if finished:
        requested_models = job.get("models", [])
        job["models_scored"] = _extract_scores_from_report(job["agent"], requested_models)
    return job
|
||||
|
||||
|
||||
@app.post("/api/research/cell")
def research_cell(req: CellRequest):
    """Register and launch an evaluation job for a single agent-model pair.

    The job is stored with a one-element ``models`` list and status
    ``pending``, then the engine worker is spawned.

    Returns:
        The new job id together with the echoed agent and model.
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"
    job_id = str(uuid.uuid4())

    registry = _load_jobs()
    registry[job_id] = dict(
        id=job_id,
        agent=req.agent,
        models=[req.model],
        status="pending",
        progress=0,
        result=None,
        created_at=time.strftime(time_format, time.gmtime()),
        updated_at=time.strftime(time_format, time.gmtime()),
    )
    _save_jobs(registry)

    _spawn_engine_job(job_id, req.agent, [req.model])

    response = {"job_id": job_id, "status": "pending"}
    response["agent"] = req.agent
    response["model"] = req.model
    return response
|
||||
|
||||
|
||||
@app.post("/api/evolve-agent/start")
def start_evolve_agent(req: EvolveAgentRequest):
    """Start a role-fit evaluation job using evolution-prompt and evolution-skeptic.

    Currently a placeholder: the job is stored with type ``evolve-agent`` and
    handed to the same engine worker used by the research endpoints.

    Planned full implementation:
    1. evolution-prompt generates role-specific stress-test prompts from the
       agent definition.
    2. Each model in the models list is tested with the same prompt.
    3. evolution-skeptic evaluates each response with per-dimension rubric
       scoring.
    4. Results are stored in SQLite and the report is regenerated.
    """
    time_format = "%Y-%m-%dT%H:%M:%SZ"
    job_id = str(uuid.uuid4())

    registry = _load_jobs()
    registry[job_id] = dict(
        id=job_id,
        type="evolve-agent",
        agent=req.agent,
        models=req.models,
        status="pending",
        progress=0,
        result=None,
        created_at=time.strftime(time_format, time.gmtime()),
        updated_at=time.strftime(time_format, time.gmtime()),
    )
    _save_jobs(registry)

    # Placeholder: reuse the generic engine job. The full implementation would
    # spawn a script that:
    #   1. reads the agent definition from .kilo/agents/{agent}.md
    #   2. calls the Ollama API for evolution-prompt to generate test prompts
    #   3. for each model, calls the Ollama API and stores the response
    #   4. calls the Ollama API for evolution-skeptic to evaluate
    #   5. stores the results in SQLite and rebuilds the report
    _spawn_engine_job(job_id, req.agent, req.models)

    response = {"job_id": job_id, "status": "pending"}
    response["agent"] = req.agent
    response["models"] = req.models
    return response
|
||||
7031
agent-evolution/archive/index.html
Normal file
7031
agent-evolution/archive/index.html
Normal file
File diff suppressed because it is too large
Load Diff
11
agent-evolution/archive/tests/screenshot-dash.cjs
Normal file
11
agent-evolution/archive/tests/screenshot-dash.cjs
Normal file
@@ -0,0 +1,11 @@
|
||||
const { chromium } = require('playwright');
|
||||
const fs = require('fs');
|
||||
// Capture a 1280x720 screenshot of the dashboard landing page.
// The browser is closed in `finally` so a failed navigation or screenshot does
// not leave a Chromium process behind, and any failure sets a non-zero exit code.
(async () => {
  const outputPath = '/app/tests/visual/current/dashboard_landing.png';
  const browser = await chromium.launch({ headless: true, args: ['--no-sandbox'] });
  try {
    const page = await browser.newPage({ viewport: { width: 1280, height: 720 } });
    await page.goto('http://host.docker.internal:3003', { waitUntil: 'domcontentloaded', timeout: 30000 });
    // Give client-side rendering time to settle before capturing.
    await page.waitForTimeout(2000);
    await page.screenshot({ path: outputPath, fullPage: false });
    console.log('Screenshot saved to ' + outputPath);
  } finally {
    await browser.close();
  }
})().catch((err) => {
  console.error('Screenshot failed:', err);
  process.exitCode = 1;
});
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,28 +1,27 @@
|
||||
# Docker Compose for Agent Evolution Dashboard (mount-driven, no-rebuild)
|
||||
# Docker Compose for Agent Evolution Dashboard + Research API (mount-driven, no-rebuild)
|
||||
# Usage:
|
||||
# docker compose -f agent-evolution/docker-compose.yml up -d
|
||||
# # Edit any file in agent-evolution/ or .kilo/ on host → instant reflection
|
||||
# # Just run:
|
||||
# bun run sync:evolution
|
||||
# # and reload the page
|
||||
# # Edit any file on host → instant reflection in containers
|
||||
# # Dashboard: http://localhost:3003
|
||||
# # API: http://localhost:3004
|
||||
#
|
||||
version: '3.8'
|
||||
|
||||
services:
|
||||
evolution-dashboard:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
image: python:3.12-alpine
|
||||
container_name: apaw-evolution
|
||||
ports:
|
||||
- "3003:80"
|
||||
volumes:
|
||||
# Mount the generated standalone HTML to the container's web root
|
||||
- ./index.standalone.html:/app/index.html:ro
|
||||
# Mount real-fit standalone report
|
||||
- ./real-fit.html:/app/real-fit.html:ro
|
||||
# Mount data directory for any additional assets
|
||||
- ./data:/app/data:ro
|
||||
# Mount .kilo directory for live config access
|
||||
- ../.kilo:/app/kilo:ro
|
||||
working_dir: /app
|
||||
command: ["python3", "-m", "http.server", "80"]
|
||||
environment:
|
||||
- NODE_ENV=production
|
||||
- TZ=UTC
|
||||
@@ -39,6 +38,47 @@ services:
|
||||
- "com.apaw.service=evolution-dashboard"
|
||||
- "com.apaw.description=Agent Evolution Dashboard"
|
||||
|
||||
evolution-api:
|
||||
image: python:3.12-alpine
|
||||
container_name: apaw-evolution-api
|
||||
ports:
|
||||
- "3004:8000"
|
||||
volumes:
|
||||
# API source code
|
||||
- ./api.py:/app/api.py:ro
|
||||
- ./requirements.txt:/app/requirements.txt:ro
|
||||
# Data directory (read-write for job state and reports)
|
||||
- ./data:/app/data:rw
|
||||
# real-fit-engine.py script
|
||||
- ../scripts/real-fit-engine.py:/app/scripts/real-fit-engine.py:ro
|
||||
# Agent definitions and metadata
|
||||
- ../.kilo/agents:/app/agents:ro
|
||||
- ../kilo-meta.json:/app/kilo-meta.json:ro
|
||||
working_dir: /app
|
||||
command: >
|
||||
sh -c "pip install --no-cache-dir -r requirements.txt && uvicorn api:app --host 0.0.0.0 --port 8000"
|
||||
environment:
|
||||
- TZ=UTC
|
||||
- PYTHONUNBUFFERED=1
|
||||
- JOB_STATE_PATH=/app/data/research-jobs.json
|
||||
- REPORT_PATH=/app/data/real-fit-report.json
|
||||
- META_PATH=/app/kilo-meta.json
|
||||
- EVOLUTION_PATH=/app/data/evolution.json
|
||||
- ENGINE_PATH=/app/scripts/real-fit-engine.py
|
||||
- REAL_FIT_DB=/app/data/real-fit.db
|
||||
restart: unless-stopped
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8000/api/models"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 15s
|
||||
networks:
|
||||
- evolution-network
|
||||
labels:
|
||||
- "com.apaw.service=evolution-api"
|
||||
- "com.apaw.description=Agent Evolution Research API"
|
||||
|
||||
# Optional: Nginx reverse proxy with SSL
|
||||
evolution-nginx:
|
||||
image: nginx:alpine
|
||||
@@ -49,13 +89,14 @@ services:
|
||||
- "80:80"
|
||||
- "443:443"
|
||||
volumes:
|
||||
- ./agent-evolution/nginx.conf:/etc/nginx/nginx.conf:ro
|
||||
- ./agent-evolution/ssl:/etc/nginx/ssl:ro
|
||||
- ./nginx.conf:/etc/nginx/nginx.conf:ro
|
||||
- ./ssl:/etc/nginx/ssl:ro
|
||||
depends_on:
|
||||
- evolution-dashboard
|
||||
- evolution-api
|
||||
networks:
|
||||
- evolution-network
|
||||
|
||||
networks:
|
||||
evolution-network:
|
||||
driver: bridge
|
||||
driver: bridge
|
||||
|
||||
@@ -5083,7 +5083,7 @@ async function init() {
|
||||
try {
|
||||
// Load real dashboard data FIRST (overrides stale agent-versions)
|
||||
try {
|
||||
const dashRes = await fetch('data/dashboard-data.json');
|
||||
const dashRes = await fetch('data/dashboard-data.json', { cache: 'no-cache' });
|
||||
if (dashRes.ok) {
|
||||
window.dashboardData = await dashRes.json();
|
||||
// Sync agentData from dashboard data for all other tabs
|
||||
@@ -5439,64 +5439,63 @@ function renderRecCard(r, index) {
|
||||
`;
|
||||
}
|
||||
|
||||
// Render Heatmap — REAL DATA: Agent × Current Model × Real Fit Score
|
||||
// Render Heatmap — REAL DATA: Agent × Model × Live Ollama Evaluations
|
||||
function renderHeatmap() {
|
||||
const esc = str => (str || '').replace(/[&<>"']/g, m => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[m]));
|
||||
const dd = window.dashboardData;
|
||||
|
||||
if (!dd || !dd.agents) {
|
||||
document.getElementById('hmTable').innerHTML = '<tr><td style="color:var(--text-secondary);padding:20px;text-align:center;">⚠️ Нет данных. Запустите анализ.</td></tr>';
|
||||
document.getElementById('hmTable').innerHTML = '<tr><td style="color:var(--text-secondary);padding:20px;text-align:center;">⚠️ No data. Run analysis.</td></tr>';
|
||||
return;
|
||||
}
|
||||
|
||||
const agents = dd.agents;
|
||||
// Get unique models sorted by count of agents
|
||||
const modelCounts = {};
|
||||
agents.forEach(a => { modelCounts[a.model_short] = (modelCounts[a.model_short] || 0) + 1; });
|
||||
const modelList = Object.entries(modelCounts)
|
||||
.sort((a, b) => b[1] - a[1])
|
||||
.map(([short]) => {
|
||||
const m = dd.models[short] || {};
|
||||
return {
|
||||
short,
|
||||
full: 'ollama-cloud/' + short,
|
||||
name: m.name || short,
|
||||
avg_fit: m.avg_fit || 0,
|
||||
agents: m.agents || 0
|
||||
};
|
||||
});
|
||||
// Collect all models from current assignments + realfit evaluations
|
||||
const modelsSeen = new Set();
|
||||
dd.agents.forEach(a => { if (a.model_short) modelsSeen.add(a.model_short); });
|
||||
dd.agents.forEach(a => {
|
||||
if (a.real_evaluations) Object.keys(a.real_evaluations).forEach(m => { if (m && m !== 'code-skeptic') modelsSeen.add(m); });
|
||||
});
|
||||
// Ensure real-fit evaluated models are included even if not current
|
||||
const modelList = Array.from(modelsSeen).sort();
|
||||
|
||||
// Render table: rows=agents, cols=models
|
||||
const t = document.getElementById('hmTable');
|
||||
let h = '<thead><tr><th class="hm-role">Agent</th>';
|
||||
modelList.forEach(m => {
|
||||
const color = m.avg_fit >= 85 ? '#00ff94' : m.avg_fit >= 70 ? '#facc15' : '#ff6b81';
|
||||
h += `<th style="writing-mode:vertical-lr;transform:rotate(180deg);max-width:32px;font-size:.56em;padding:3px 1px;">
|
||||
${esc(m.name)}<br>
|
||||
<span style="color:${color};font-size:.9em;font-weight:700">avg:${m.avg_fit}</span><br>
|
||||
<span style="color:var(--text-muted);font-size:.8em">${m.agents}</span>
|
||||
</th>`;
|
||||
// Compute avg from dd.agents real_evaluations
|
||||
let sum = 0, cnt = 0;
|
||||
dd.agents.forEach(a => { const v = (a.real_evaluations || {})[m]; if (v > 0) { sum += v; cnt++; } });
|
||||
const avg = cnt > 0 ? Math.round(sum / cnt) : 0;
|
||||
const color = avg >= 85 ? '#00ff94' : avg >= 70 ? '#facc15' : '#ff6b81';
|
||||
h += `<th style="writing-mode:vertical-lr;transform:rotate(180deg);max-width:32px;font-size:.56em;padding:3px 1px;">${esc(m)}<br><span style="color:${color};font-size:.9em;font-weight:700">avg:${avg}</span></th>`;
|
||||
});
|
||||
h += '</tr></thead><tbody>';
|
||||
h += '<th>Best</th><th>Score</th></tr></thead><tbody>';
|
||||
|
||||
agents.forEach(a => {
|
||||
dd.agents.forEach(a => {
|
||||
h += `<tr><td class="hm-r">${esc(a.name)}</td>`;
|
||||
modelList.forEach((m, j) => {
|
||||
const isCurrent = a.model_short === m.short;
|
||||
const score = isCurrent ? a.fit_score : 0; // Only show score for CURRENT model
|
||||
const cur = isCurrent;
|
||||
let marks = '';
|
||||
if (cur) marks += '<span style="border:1px solid var(--accent-cyan);border-radius:50%;padding:1px 3px;font-size:8px">●</span>';
|
||||
const bg = cur ? hmColor(score) : 'transparent';
|
||||
const txt = cur ? hmText(score) : 'var(--text-muted)';
|
||||
h += `<td style="background:${bg};color:${txt};cursor:pointer${cur ? ';box-shadow:inset 0 0 0 2px var(--accent-cyan)' : ''}" class="${cur ? 'hm-cur' : ''}"
|
||||
title="${esc(a.name)} → ${esc(m.name)}: ${isCurrent ? 'fit=' + a.fit_score + ', if=' + a.instruction_following : 'не использует этот модель'}"
|
||||
onmouseover="showTT(event,'${esc(a.name)}','${esc(m.name)}',${isCurrent ? a.fit_score : 0},${isCurrent},${cur},${isCurrent ? a.instruction_following : 0})"
|
||||
onmouseout="hideTT()"
|
||||
onclick="openHmModal(event, '${esc(a.name)}', '${esc(m.name)}', ${isCurrent ? a.fit_score : 0}, ${isCurrent ? a.instruction_following : 0})"
|
||||
>${isCurrent ? a.fit_score : '·'}${marks}</td>`;
|
||||
modelList.forEach(m => {
|
||||
const isCurrent = a.model_short === m;
|
||||
let score = 0;
|
||||
// Prefer real-fit score, fallback to current fit_score
|
||||
if (a.real_evaluations && a.real_evaluations[m] > 0) score = Math.round(a.real_evaluations[m]);
|
||||
else if (isCurrent) score = Math.round(a.fit_score || 0);
|
||||
|
||||
let cls = 'na';
|
||||
if (score >= 90) cls = 'high';
|
||||
else if (score >= 75) cls = 'good';
|
||||
else if (score >= 50) cls = 'med';
|
||||
else if (score > 0) cls = 'low';
|
||||
|
||||
const curMark = isCurrent ? ' ●' : '';
|
||||
const curStyle = isCurrent ? 'box-shadow:inset 0 0 0 2px var(--accent-cyan);' : '';
|
||||
const bg = score > 0 ? hmColor(score) : 'transparent';
|
||||
const txt = score >= 75 ? '#0e1219' : 'var(--text-primary)';
|
||||
const display = score > 0 ? score : (isCurrent ? Math.round(a.fit_score || 0) : '·');
|
||||
|
||||
h += `<td class="score ${cls}" style="background:${bg};color:${txt};${curStyle}cursor:pointer" title="${esc(a.name)} → ${esc(m)}: ${score > 0 ? 'real fit=' + score : (isCurrent ? 'fit=' + a.fit_score : 'no data')}" onclick="openHmModal(event,'${esc(a.name)}','${esc(m)}',${score},${a.instruction_following || 0})">${display}${curMark}</td>`;
|
||||
});
|
||||
h += '</tr>';
|
||||
const bestModel = a.real_best_model || a.model_short;
|
||||
const bestScore = a.real_best_score ? Math.round(a.real_best_score) : Math.round(a.fit_score || 0);
|
||||
h += `<td>${esc(bestModel)}</td><td style="font-weight:700">${bestScore}</td></tr>`;
|
||||
});
|
||||
t.innerHTML = h + '</tbody>';
|
||||
}
|
||||
@@ -5511,29 +5510,6 @@ function hmColor(v) {
|
||||
return 'rgba(90,104,128,.2)';
|
||||
}
|
||||
|
||||
function hmText(v) {
|
||||
return v >= 75 ? '#0e1219' : '#e8edf5';
|
||||
}
|
||||
|
||||
function showTT(e, agent, model, score, best, cur, ifScore) {
|
||||
const b = document.getElementById('ttBox'), o = document.getElementById('ttOverlay');
|
||||
const ifColor = ifScore >= 85 ? '#00ff94' : ifScore >= 75 ? '#facc15' : '#ff6b81';
|
||||
const ifLabel = ifScore >= 85 ? 'Excellent' : ifScore >= 75 ? 'Average' : 'Weak';
|
||||
b.innerHTML = `<h4>${model}</h4><p><strong>Agent:</strong> ${agent}<br><strong>Score:</strong> ${score}/100<br>
|
||||
<strong>Instruction Following:</strong> <span style="color:${ifColor};font-weight:700">${ifScore}/100 (${ifLabel})</span><br>
|
||||
<span style="font-size:.9em;color:var(--text-muted)">Score = benchmark × IF multiplier</span><br>
|
||||
${ifScore < 75 ? '<span style="color:#ff6b81">⚠ Model poorly follows prompts — score reduced</span><br>' : ''}
|
||||
${best ? '★ <strong>Best fit</strong><br>' : ''}${cur ? '📌 <strong>Current</strong>' : ''}</p>`;
|
||||
const r = e.target.getBoundingClientRect();
|
||||
b.style.left = Math.min(r.left, window.innerWidth - 320) + 'px';
|
||||
b.style.top = (r.bottom + 6) + 'px';
|
||||
o.classList.add('show');
|
||||
}
|
||||
|
||||
function hideTT() {
|
||||
document.getElementById('ttOverlay').classList.remove('show');
|
||||
}
|
||||
|
||||
// Current modal state
|
||||
let hmCurrentAgent = null;
|
||||
let hmCurrentModel = null;
|
||||
|
||||
460
agent-evolution/real-fit.html
Normal file
460
agent-evolution/real-fit.html
Normal file
@@ -0,0 +1,460 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="ru">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Real-Fit Matrix — Agent × Model Performance</title>
|
||||
<style>
|
||||
:root{--bg:#0a0f1a;--bg2:#0f1525;--bg3:#141c2e;--bdr:#1e2d45;--txt:#e8f1ff;--txt2:#8ba3c0;--cyan:#00d4ff;--green:#00ff94;--red:#ff4757;--orange:#ff9f43;--purple:#a855f7;}
|
||||
*{margin:0;padding:0;box-sizing:border-box}
|
||||
body{font-family:system-ui,-apple-system,sans-serif;background:var(--bg);color:var(--txt);min-height:100vh;padding:24px}
|
||||
h1{font-size:1.6rem;background:linear-gradient(90deg,var(--cyan),var(--green));-webkit-background-clip:text;-webkit-text-fill-color:transparent;margin-bottom:8px}
|
||||
.sub{color:var(--txt2);font-size:.85rem;margin-bottom:20px}
|
||||
table{width:100%;border-collapse:collapse;font-size:.82rem}
|
||||
th,td{padding:8px 10px;border:1px solid var(--bdr);text-align:center}
|
||||
th{background:var(--bg2);color:var(--txt2);font-size:.72rem;text-transform:uppercase;letter-spacing:.5px;position:sticky;top:0}
|
||||
td:first-child{text-align:left;font-weight:700;white-space:nowrap}
|
||||
td.score{font-weight:700;font-family:monospace}
|
||||
.hm-cur{box-shadow:inset 0 0 0 2px var(--cyan)}
|
||||
.high{background:rgba(0,255,148,.18);color:var(--green)}
|
||||
.good{background:rgba(0,212,255,.14);color:var(--cyan)}
|
||||
.med{background:rgba(168,85,247,.15);color:var(--purple)}
|
||||
.low{background:rgba(255,71,87,.1);color:var(--red)}
|
||||
.na{background:transparent;color:var(--txt2);font-size:.9rem;cursor:pointer}
|
||||
.na:hover{background:rgba(0,212,255,.08)}
|
||||
.legend{display:flex;gap:12px;flex-wrap:wrap;margin-top:16px;font-size:.78rem;color:var(--txt2)}
|
||||
.legend span{display:flex;align-items:center;gap:4px}
|
||||
.dot{width:14px;height:14px;border-radius:3px}
|
||||
.meta{font-size:.72rem;color:var(--txt2);margin-top:12px}
|
||||
a{color:var(--cyan);text-decoration:none}
|
||||
.btn-research{font-size:.9rem;background:none;border:none;cursor:pointer;margin-left:4px;opacity:.7}
|
||||
.btn-research:hover{opacity:1}
|
||||
.modal{position:fixed;inset:0;background:rgba(0,0,0,.7);display:flex;align-items:center;justify-content:center;z-index:1000;padding:16px}
|
||||
.modal.hidden{display:none}
|
||||
.modal-panel{background:var(--bg2);border:1px solid var(--cyan);border-radius:12px;max-width:90vw;width:480px;max-height:90vh;overflow-y:auto;padding:20px;position:relative}
|
||||
.modal-panel.wide{width:640px}
|
||||
.modal-title{font-size:1.1rem;margin-bottom:12px}
|
||||
.modal-list{text-align:left;margin:12px 0;max-height:40vh;overflow-y:auto}
|
||||
.modal-list label{display:flex;align-items:center;gap:8px;padding:6px 0;cursor:pointer;border-bottom:1px solid var(--bdr)}
|
||||
.modal-list input{margin:0}
|
||||
.modal-actions{display:flex;gap:8px;justify-content:flex-end;margin-top:16px}
|
||||
.btn{padding:6px 14px;border-radius:6px;border:1px solid var(--bdr);background:var(--bg3);color:var(--txt);cursor:pointer;font-size:.85rem}
|
||||
.btn.primary{background:linear-gradient(90deg,var(--green),var(--cyan));color:#000;border:none;font-weight:700}
|
||||
.progress{margin-top:12px}
|
||||
.progress-bar{height:8px;background:var(--bg3);border-radius:4px;overflow:hidden}
|
||||
.progress-fill{height:100%;width:0%;background:linear-gradient(90deg,var(--green),var(--cyan));transition:width .3s}
|
||||
.progress-text{font-size:.8rem;color:var(--txt2);margin-top:6px;text-align:center}
|
||||
.result-table{width:100%;margin-top:12px;font-size:.82rem;border-collapse:collapse}
|
||||
.result-table th,.result-table td{padding:6px;border:1px solid var(--bdr)}
|
||||
.result-table .best{background:rgba(0,255,148,.25);color:var(--green);font-weight:700}
|
||||
.result-table tbody tr{cursor:pointer}
|
||||
.result-table tbody tr:hover{background:rgba(0,212,255,.06)}
|
||||
.detail-row{margin-bottom:12px}
|
||||
.detail-label{font-size:.72rem;color:var(--txt2);text-transform:uppercase;letter-spacing:.5px;margin-bottom:4px}
|
||||
.detail-val{font-size:.85rem;white-space:pre-wrap;word-break:break-word}
|
||||
.detail-pills{display:flex;flex-wrap:wrap;gap:6px;margin-top:4px}
|
||||
.pill{font-size:.72rem;padding:2px 8px;border-radius:4px;background:var(--bg3);border:1px solid var(--bdr);color:var(--txt2)}
|
||||
.score-big{font-size:2rem;font-weight:700;margin:4px 0}
|
||||
.toggle{color:var(--cyan);cursor:pointer;font-size:.78rem}
|
||||
.toggle:hover{text-decoration:underline}
|
||||
.dim-bar{display:flex;align-items:center;gap:8px;margin:4px 0}
|
||||
.dim-bar>span:first-child{width:120px;font-size:.75rem;color:var(--txt2);white-space:nowrap;overflow:hidden;text-overflow:ellipsis}
|
||||
.dim-track{flex:1;height:8px;background:var(--bg3);border-radius:4px;overflow:hidden}
|
||||
.dim-fill{height:100%;border-radius:4px}
|
||||
.dim-num{width:30px;text-align:right;font-size:.78rem;font-weight:700}
|
||||
.v-pass{color:var(--green)}
|
||||
.v-marginal{color:var(--orange)}
|
||||
.v-fail{color:var(--red)}
|
||||
.commentary{font-size:.85rem;padding:10px 12px;background:rgba(0,212,255,.08);border-left:3px solid var(--cyan);border-radius:0 6px 6px 0;color:var(--txt);white-space:pre-wrap;word-break:break-word}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Real-Fit Matrix</h1>
|
||||
<div class="sub">Real agent × model evaluation scores via live Ollama API (28 calls, 4 models, 7 agents)</div>
|
||||
<div id="matrix"></div>
|
||||
<div class="legend">
|
||||
<span><span class="dot high"></span> 90+ Excellent</span>
|
||||
<span><span class="dot good"></span> 75–89 Good</span>
|
||||
<span><span class="dot med"></span> 50–74 Average</span>
|
||||
<span><span class="dot low"></span> &lt;50 Weak</span>
|
||||
<span style="margin-left:auto">● = assigned model</span>
|
||||
</div>
|
||||
<div class="meta">Data source: <a href="data/real-fit-report.json" target="_blank">real-fit-report.json</a> | Updated: <span id="updated"></span></div>
|
||||
|
||||
<div id="researchAgentModal" class="modal hidden">
|
||||
<div class="modal-panel">
|
||||
<div class="modal-title" id="agentModalTitle">Research models</div>
|
||||
<div class="modal-list" id="agentModalList"></div>
|
||||
<div class="progress hidden" id="agentProgress">
|
||||
<div class="progress-bar"><div class="progress-fill" id="agentProgressFill"></div></div>
|
||||
<div class="progress-text" id="agentProgressText"></div>
|
||||
</div>
|
||||
<div class="modal-actions">
|
||||
<button class="btn" onclick="closeModal('researchAgentModal')">Close</button>
|
||||
<button class="btn" id="evolveAgentBtn" onclick="startEvolveAgent()">Run Role-Fit Test</button>
|
||||
<button class="btn primary" id="agentStartBtn" onclick="startAgentResearch()">Start Research</button>
|
||||
</div>
|
||||
<div id="agentResults"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="researchCellModal" class="modal hidden">
|
||||
<div class="modal-panel">
|
||||
<div class="modal-title" id="cellModalTitle">Evaluate cell</div>
|
||||
<div class="modal-list" id="cellModalList"></div>
|
||||
<div class="progress hidden" id="cellProgress">
|
||||
<div class="progress-bar"><div class="progress-fill" id="cellProgressFill"></div></div>
|
||||
<div class="progress-text" id="cellProgressText"></div>
|
||||
</div>
|
||||
<div class="modal-actions">
|
||||
<button class="btn" onclick="closeModal('researchCellModal')">Close</button>
|
||||
<button class="btn primary" id="cellStartBtn" onclick="startCellResearch()">Evaluate</button>
|
||||
</div>
|
||||
<div id="cellResults"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="detailModal" class="modal hidden">
|
||||
<div class="modal-panel wide">
|
||||
<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:12px">
|
||||
<span style="font-size:1rem;font-weight:700" id="detailTitle"></span>
|
||||
<button class="btn" onclick="closeModal('detailModal')">Close</button>
|
||||
</div>
|
||||
<div id="detailContent"></div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Page-level state, populated by load().
let reportData;              // real-fit report (API response or static JSON fallback)
let evoData;                 // contents of data/evolution.json
let allModels = [];          // model names used as matrix columns
let allAvailableModels = []; // model ids offered in the research dialog

// Base URL of the research API.
const API_BASE = 'http://localhost:3004';

// Shorthand for document.getElementById.
const $ = elementId => document.getElementById(elementId);
|
||||
|
||||
// Per-model benchmark numbers keyed by short model name. The research dialog
// shows them next to each model, and the key list doubles as the model list
// when /api/models cannot be reached (so key order matters).
const MODEL_BENCHMARKS = {
  "qwen3.5-122b": 91,
  "qwen3-coder-480b": 88,
  "deepseek-v4-pro-max": 89,
  "deepseek-v4-flash": 86,
  "kimi-k2.6": 91,
  "kimi-k2.5": 90,
  "minimax-m2.5": 82,
  "minimax-m2.7": 80,
  "glm-5.1": 90,
  "glm-5": 90,
  "nemotron-3-super": 78,
  "nemotron-3-nano": 68,
  "gemma4-27b": 85,
  "devstral-2": 80,
  "devstral-small-2": 75
};
|
||||
// CSS classes for a score cell: 90+ high, 75+ good, 50+ med, otherwise low.
function clsFor(s){
  const tiers=[[90,'score high'],[75,'score good'],[50,'score med']];
  for(const [floor,cls] of tiers){
    if(s>=floor) return cls;
  }
  return 'score low';
}
|
||||
// CSS colour variable for a score, using the same 90 / 75 / 50 tiers as the cell classes.
function scoreColor(s){
  const tiers=[[90,'var(--green)'],[75,'var(--cyan)'],[50,'var(--purple)']];
  for(const [floor,color] of tiers){
    if(s>=floor) return color;
  }
  return 'var(--red)';
}
|
||||
// Hide the modal with the given element id by adding the "hidden" class.
function closeModal(id){
  const modal=$(id);
  modal.classList.add('hidden');
}
|
||||
|
||||
// Load everything the page needs and render the matrix.
//  - model list: /api/models, falling back to the MODEL_BENCHMARKS keys
//  - report:     /api/real-fit-report, falling back to data/real-fit-report.json
//  - evolution:  data/evolution.json (optional)
// Rejects when neither report source yields a usable report.
async function load(){
  // fetch() resolves on HTTP errors too, so reject explicitly on non-2xx;
  // otherwise an error body would be parsed and used as if it were data.
  const fetchJson=async url=>{
    const res=await fetch(url);
    if(!res.ok) throw new Error('HTTP '+res.status+' for '+url);
    return res.json();
  };
  const hasAgents=r=>Boolean(r)&&typeof r.agents==='object'&&r.agents!==null;

  try{
    allAvailableModels=(await fetchJson(`${API_BASE}/api/models`)).models||[];
  }catch(e){
    allAvailableModels=Object.keys(MODEL_BENCHMARKS);
  }

  try{
    const live=await fetchJson(`${API_BASE}/api/real-fit-report`);
    if(!hasAgents(live)) throw new Error('API report has no agents');
    reportData=live;
  }catch(e){
    reportData=await fetchJson('data/real-fit-report.json');
  }
  if(!hasAgents(reportData)) throw new Error('real-fit report has no "agents" section');

  // evoData is not read anywhere on this page, so a missing file must not
  // prevent the matrix from rendering.
  try{
    evoData=await fetchJson('data/evolution.json');
  }catch(e){
    console.warn('data/evolution.json unavailable',e);
    evoData=null;
  }

  reportData.generated=reportData.generated||new Date().toISOString();
  // API data is no longer mixed with the local cache: stale cached scores
  // would otherwise override the live values coming from the database.
  if(reportData.source&&reportData.source.includes('db')){
    try{ localStorage.removeItem('__researchResults'); }catch(e){}
  }
  $('updated').textContent=new Date(reportData.generated).toLocaleString('ru-RU');

  // Columns = every model that has a positive total across the agents shown.
  const agents=Object.values(reportData.agents).filter(a=>Object.values(a.evaluations||{}).some(s=>s>0));
  const modelScores={};
  agents.forEach(a=>{
    for(const [m,s] of Object.entries(a.evaluations||{})){
      modelScores[m]=(modelScores[m]||0)+s;
    }
  });
  allModels=Object.keys(modelScores).filter(m=>modelScores[m]>0).sort();

  mergeCachedResults();
  renderTable();
}
|
||||
|
||||
// Short name of the model recorded for an agent: the text after the last "/"
// of the third entry of the agent's `info` array, or '' when there is none.
function currentModel(agentName){
  const record=reportData.agents[agentName];
  const info=(record==null?undefined:record.info)||[];
  const assigned=info[2]||'';
  const parts=assigned.split('/');
  return parts[parts.length-1];
}
|
||||
|
||||
// Remove the first occurrence of the "ollama-cloud/" provider prefix from a model id.
function modelShort(full){
  const providerPrefix='ollama-cloud/';
  return full.replace(providerPrefix,'');
}
|
||||
|
||||
// Open the "research models" dialog for one agent. Lists every available
// model as a checkbox (the agent's current model pre-checked), resets the
// progress/result areas and remembers the agent in window.__activeAgent.
function openAgentModal(agent){
  $('agentModalTitle').textContent='Research models for '+agent;
  const cur=currentModel(agent);
  const list=$('agentModalList');
  list.innerHTML='';

  // Rows are built with DOM APIs rather than an HTML string so that model ids
  // coming from the API can never be interpreted as markup or break out of
  // the value attribute.
  allAvailableModels.forEach(full=>{
    const m=modelShort(full);
    // Names may use ":" where the other side uses "-", so accept both spellings.
    const isCurrent=m===cur||cur.replace(':','-')===m||m.replace('-',':')===cur;

    const box=document.createElement('input');
    box.type='checkbox';
    box.value=m;
    box.checked=isCurrent;

    const name=document.createElement('span');
    name.textContent=m;

    const bench=document.createElement('span');
    bench.style.color='var(--txt2)';
    bench.style.marginLeft='auto';
    bench.textContent='IF '+(MODEL_BENCHMARKS[m]||'—');

    const row=document.createElement('label');
    row.append(box,' ',name,' ',bench);
    list.appendChild(row);
  });
  if(!list.children.length){
    list.innerHTML='<p style="color:var(--txt2)">No model data</p>';
  }

  $('agentResults').innerHTML='';
  $('agentProgress').classList.add('hidden');
  $('agentStartBtn').disabled=false;
  $('evolveAgentBtn').disabled=false;
  $('researchAgentModal').classList.remove('hidden');
  window.__activeAgent=agent;
}
|
||||
|
||||
// Open the single-cell evaluation dialog for an agent/model pair that has no
// score yet, and remember the pair in window.__activeAgent / __activeModel.
function openCellModal(agent,model){
  $('cellModalTitle').textContent='Evaluate '+agent+' × '+model;

  // Built with DOM APIs so the model name is always treated as text.
  const box=document.createElement('input');
  box.type='checkbox';
  box.value=model;
  box.checked=true;
  const name=document.createElement('span');
  name.textContent=model;
  const row=document.createElement('label');
  row.append(box,' ',name);
  const list=$('cellModalList');
  list.innerHTML='';
  list.appendChild(row);

  $('cellResults').innerHTML='';
  $('cellProgress').classList.add('hidden');
  $('cellStartBtn').disabled=false;
  $('researchCellModal').classList.remove('hidden');
  window.__activeAgent=agent;
  window.__activeModel=model;
}
|
||||
|
||||
// Open the detail dialog for one agent/model pair and fill it from
// GET /api/evaluation/{agent}/{model}. Shows total score, verdict, per-dimension
// bars, task/system prompts, the (collapsible) model response, evaluator
// commentary, rubric weights, latency, token counts and expected keywords.
// On any fetch/parse failure a "no data" notice is shown instead.
async function openDetail(agent,model){
  // Everything interpolated into the HTML below comes from the API (including
  // raw model output), so it is escaped first and rendered as plain text.
  const esc=v=>String(v??'').replace(/[&<>"']/g,c=>(
    {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c]
  ));
  const RESPONSE_PREVIEW=500; // characters shown before "Show more"

  $('detailTitle').textContent=agent+' × '+model;
  $('detailContent').innerHTML='<p style="color:var(--txt2)">Loading...</p>';
  $('detailModal').classList.remove('hidden');

  let data;
  try{
    const res=await fetch(`${API_BASE}/api/evaluation/${encodeURIComponent(agent)}/${encodeURIComponent(model)}`);
    if(!res.ok) throw new Error(res.status);
    data=await res.json();
    if(!data||typeof data!=='object') throw new Error('empty evaluation record');
  }catch(e){
    $('detailContent').innerHTML='<p style="color:var(--red);margin-top:12px">No detailed evaluation data available for this combination. Run research first.</p>';
    return;
  }
  const total=data.total_score??data.score??0;

  const verdict=String(data.verdict||'').toUpperCase();
  let verdictClass='';
  if(verdict==='PASS') verdictClass='v-pass';
  else if(verdict==='MARGINAL') verdictClass='v-marginal';
  else if(verdict==='FAIL') verdictClass='v-fail';
  const verdictHtml=verdict
    ?`<span class="${verdictClass}" style="font-size:.85rem;font-weight:700;border:1px solid currentColor;padding:2px 8px;border-radius:4px;margin-left:8px">${esc(verdict)}</span>`
    :'';

  let scoresHtml='';
  if(data.scores){
    scoresHtml='<div class="detail-row"><div class="detail-label">Score Breakdown</div>';
    for(const [dim,raw] of Object.entries(data.scores)){
      const num=typeof raw==='number'?raw:Number(raw)||0;
      // Clamp only the bar width; the printed number stays as reported.
      const width=Math.max(0,Math.min(100,num));
      scoresHtml+=`<div class="dim-bar"><span>${esc(dim)}</span><div class="dim-track"><div class="dim-fill" style="width:${width}%;background:${scoreColor(num)}"></div></div><span class="dim-num">${Math.round(num)}</span></div>`;
    }
    scoresHtml+='</div>';
  }

  let commentaryHtml='';
  if(data.explanation){
    commentaryHtml=`<div class="detail-row"><div class="detail-label">Evaluator Commentary</div><div class="commentary">${esc(data.explanation)}</div></div>`;
  }

  let rubricHtml='';
  if(data.rubric){
    rubricHtml='<div class="detail-row"><div class="detail-label">Rubric Weights</div><div class="detail-pills">';
    for(const [k,v] of Object.entries(data.rubric)){
      rubricHtml+=`<span class="pill">${esc(k)}: ${esc(v)}</span>`;
    }
    rubricHtml+='</div></div>';
  }

  let kwHtml='';
  if(Array.isArray(data.expected_keywords)&&data.expected_keywords.length){
    kwHtml='<div class="detail-pills">'+data.expected_keywords.map(k=>`<span class="pill">${esc(k)}</span>`).join('')+'</div>';
  }

  const response=(data.response||'').toString();
  const hasMore=response.length>RESPONSE_PREVIEW;
  const preview=hasMore?response.slice(0,RESPONSE_PREVIEW)+'...':response;
  // The response text itself is filled in via textContent after the markup is
  // in place, so both the preview and the expanded view are rendered as text.
  let respHtml='<div class="detail-val" data-role="response"></div>';
  if(hasMore) respHtml+='<span class="toggle" data-role="response-toggle">Show more</span>';

  const lat=data.latency_ms;
  const latTxt=typeof lat==='number'?(lat>=1000?(lat/1000).toFixed(1)+'s':lat+'ms'):'—';
  const evaluatedAt=data.evaluated_at?new Date(data.evaluated_at).toLocaleString('ru-RU'):'—';

  const content=$('detailContent');
  content.innerHTML=`
<div class="detail-row"><div class="detail-label">Agent × Model</div><div class="detail-val">${esc(agent)} × ${esc(model)}${verdictHtml}</div></div>
<div class="detail-row"><div class="detail-label">Total Score</div><div class="score-big" style="color:${scoreColor(total)}">${Math.round(total)}</div></div>
${scoresHtml}
<div class="detail-row"><div class="detail-label">Task</div><div class="detail-val">${esc(data.user_prompt||'—')}</div></div>
<div class="detail-row"><div class="detail-label">System Role</div><div class="detail-val">${esc(data.system_prompt||'—')}</div></div>
<div class="detail-row"><div class="detail-label">Model Response</div>${respHtml}</div>
${commentaryHtml}
${rubricHtml}
<div class="detail-row"><div class="detail-label">Evaluator</div><div class="detail-val">${esc(data.evaluator||'—')}</div></div>
<div class="detail-row"><div class="detail-label">Latency</div><div class="detail-val">${esc(latTxt)}</div></div>
<div class="detail-row"><div class="detail-label">Tokens</div><div class="detail-val">Prompt: ${esc(data.tokens_prompt??0)} / Response: ${esc(data.tokens_response??0)}</div></div>
<div class="detail-row"><div class="detail-label">Expected Keywords</div>${kwHtml||'<div class="detail-val">—</div>'}</div>
<div class="detail-row"><div class="detail-label">Evaluated At</div><div class="detail-val">${esc(evaluatedAt)}</div></div>
`;

  const respEl=content.querySelector('[data-role="response"]');
  respEl.textContent=preview;
  if(hasMore){
    const toggle=content.querySelector('[data-role="response-toggle"]');
    let expanded=false;
    toggle.onclick=()=>{
      expanded=!expanded;
      respEl.textContent=expanded?response:preview;
      toggle.textContent=expanded?'Show less':'Show more';
    };
  }
}
|
||||
|
||||
// Animate the progress bar `pid` from 0% to 100% over `ms` milliseconds while
// showing `label`. Resolves once the animation time has elapsed and the
// default short transition has been restored.
async function animateProgress(pid,label,ms){
  const pause=delay=>new Promise(resolve=>setTimeout(resolve,delay));
  const fill=$(pid+'Fill');
  const caption=$(pid+'Text');
  const container=$(pid);

  container.classList.remove('hidden');
  caption.textContent=label;
  fill.style.width='0%';

  // Two short pauses: one before switching the transition, one before
  // setting the target width.
  await pause(50);
  fill.style.transition=`width ${ms}ms linear`;
  await pause(50);
  fill.style.width='100%';

  await pause(ms);
  fill.style.transition='width .3s';
}
|
||||
|
||||
// Reveal progress widget `pid` and set its caption and fill width (in percent).
function setProgress(pid,percent,label){
  const fill=$(pid+'Fill');
  const caption=$(pid+'Text');
  const container=$(pid);
  container.classList.remove('hidden');
  caption.textContent=label;
  fill.style.width=percent+'%';
}
|
||||
|
||||
// Overlay scores cached in localStorage ('__researchResults') onto reportData
// and make sure every cached model has a column in allModels.
function mergeCachedResults(){
  try{
    const cached=JSON.parse(localStorage.getItem('__researchResults')||'{}');
    for(const [agentName,entry] of Object.entries(cached)){
      const known=reportData.agents[agentName];
      if(known){
        for(const item of (entry.models||[])){
          reportData.agents[agentName].evaluations[item.model]=item.score;
          if(!allModels.includes(item.model)){
            allModels.push(item.model);
          }
        }
      }
    }
    allModels.sort();
  }catch(e){
    // Best effort: any error while reading or applying the cache is ignored.
  }
}
|
||||
|
||||
// Render the agent × model matrix into #matrix.
// One row per agent that has at least one positive score; one column per
// entry of allModels, plus the report's best model/score. Scored cells open
// the detail dialog, empty cells open the single-cell evaluation dialog and
// the 🔬 button opens the per-agent research dialog.
function renderTable(){
  // Agent and model names come from the report/API, so escape them before
  // they are placed into markup or attribute values.
  const esc=v=>String(v??'').replace(/[&<>"']/g,c=>(
    {'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c]
  ));
  const agents=Object.values(reportData.agents).filter(a=>Object.values(a.evaluations).some(s=>s>0));

  let html='<table><thead><tr><th>Agent</th>';
  allModels.forEach(m=>{ html+=`<th>${esc(m)}</th>`; });
  html+='<th>Best</th><th>Score</th></tr></thead><tbody>';

  agents.forEach(a=>{
    // Work out which column is the agent's assigned model. A plain substring
    // test would mark both "glm-5" and "glm-5.1" for an agent assigned to
    // "…/glm-5.1", so prefer an exact match on the short name (accepting the
    // ":" / "-" spelling variants) and only fall back to the substring test
    // when no column matches exactly.
    const assigned=String((a.info&&a.info[2])||'');
    const assignedShort=assigned.split('/').pop();
    const exact=allModels.filter(m=>
      m===assignedShort||assignedShort.replace(':','-')===m||m.replace('-',':')===assignedShort);
    const isCurrent=m=>exact.length?exact.includes(m):(assigned!==''&&assigned.includes(m));

    const name=esc(a.name);
    html+=`<tr><td>${name} <button class="btn-research" data-action="agent" data-agent="${name}" title="Research models">🔬</button></td>`;
    allModels.forEach(m=>{
      const score=a.evaluations[m];
      const scored=score!==undefined&&score>0;
      const cls=scored?clsFor(score):'na';
      const text=scored?Math.round(score):'—';
      const cur=isCurrent(m);
      html+=`<td class="${cls}${cur?' hm-cur':''}" data-action="${scored?'detail':'cell'}" data-agent="${name}" data-model="${esc(m)}">${text}${cur?' ●':''}</td>`;
    });
    html+=`<td>${esc(a.best_model??'—')}</td><td style="font-weight:700">${Math.round(a.best_score)}</td></tr>`;
  });
  html+='</tbody></table>';

  const host=$('matrix');
  host.innerHTML=html;
  // One delegated handler instead of inline onclick strings: names are read
  // back from data-* attributes, so quotes in a name cannot break the handler.
  // Assigning .onclick replaces the previous handler on every re-render.
  host.onclick=ev=>{
    const el=ev.target.closest('[data-action]');
    if(!el||!host.contains(el)) return;
    const {action,agent,model}=el.dataset;
    if(action==='agent') openAgentModal(agent);
    else if(action==='detail') openDetail(agent,model);
    else openCellModal(agent,model);
  };
}
|
||||
|
||||
// Record a new score for agent × model, add the model as a column if it is
// not one yet, and re-render the matrix.
// NOTE(review): the agent's best_model / best_score are not touched here, so
// the "Best" columns keep the values from the loaded report — confirm intended.
function updateCell(agent,model,score){
  const record=reportData.agents[agent];
  if(record){
    record.evaluations[model]=score;
  }
  const alreadyListed=allModels.includes(model);
  if(!alreadyListed){
    allModels.push(model);
    allModels.sort();
  }
  renderTable();
}
|
||||
|
||||
// Poll GET /api/research/{jobId} until the job reports 'done' or 'error',
// mirroring the state in progress widget `pid`.
//   jobId   - id returned when the job was submitted
//   pid     - id of the progress widget to update
//   options - optional { attempts, intervalMs }; defaults (60 × 2000 ms, i.e.
//             about two minutes) match the previously hard-coded values
// Resolves with the job object, or {status:'timeout'} when attempts run out.
// Non-2xx responses and network errors are retried on the next tick.
async function pollJob(jobId,pid,{attempts=60,intervalMs=2000}={}){
  for(let attempt=0;attempt<attempts;attempt++){
    await new Promise(resolve=>setTimeout(resolve,intervalMs));
    try{
      const res=await fetch(`${API_BASE}/api/research/${jobId}`);
      if(!res.ok) continue;
      const job=await res.json();
      if(job.status==='pending') setProgress(pid,25,'Waiting in queue...');
      else if(job.status==='running') setProgress(pid,75,'Running evaluation...');
      else if(job.status==='done'){ setProgress(pid,100,'Done!'); return job; }
      else if(job.status==='error'){ setProgress(pid,100,'Error!'); return job; }
    }catch(e){
      console.warn('poll error',e);
    }
  }
  setProgress(pid,100,'Timeout');
  return {status:'timeout'};
}
|
||||
|
||||
// Submit a research job for the active agent and every checked model, wait
// for it, then show the per-model scores, update the matrix and cache the
// results in localStorage. Does nothing when no model is checked.
async function startAgentResearch(){
  const agent=window.__activeAgent;
  const models=[...$('agentModalList').querySelectorAll('input:checked')].map(i=>i.value);
  if(!models.length) return;

  const startBtn=$('agentStartBtn');
  const evolveBtn=$('evolveAgentBtn');
  const resultsEl=$('agentResults');
  // Messages are added as text nodes so the agent name is never parsed as HTML.
  const showError=msg=>{
    const p=document.createElement('p');
    p.style.color='var(--red)';
    p.style.marginTop='12px';
    p.textContent=msg;
    resultsEl.innerHTML='';
    resultsEl.appendChild(p);
  };

  startBtn.disabled=true;
  evolveBtn.disabled=true;
  setProgress('agentProgress',10,'Submitting job...');
  try{
    let job;
    try{
      const res=await fetch(`${API_BASE}/api/research`,{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({agent,models})});
      if(!res.ok) throw new Error('API error '+res.status);
      const submitted=await res.json();
      job=await pollJob(submitted.job_id,'agentProgress');
    }catch(e){
      showError('API unavailable — run real-fit-engine.py to evaluate '+agent);
      $('agentProgressText').textContent='Error: API unavailable';
      return;
    }

    const results=job.models_scored||[];
    if(!results.length){
      // A failed or timed-out job has nothing to show; do not report it as
      // "Done" and do not overwrite previously cached results with nothing.
      showError('Research produced no scores (job status: '+job.status+').');
      $('agentProgressText').textContent='No results';
      return;
    }

    let best=-1;
    results.forEach(r=>{ if(r.score>best) best=r.score; });

    const table=document.createElement('table');
    table.className='result-table';
    table.innerHTML='<thead><tr><th>Model</th><th>Score</th></tr></thead>';
    const tbody=table.createTBody();
    results.forEach(r=>{
      const row=tbody.insertRow();
      // Anything within 0.1 of the top score is highlighted as best.
      if(r.score>=best-0.1) row.className='best';
      row.onclick=()=>openDetail(agent,r.model);
      row.insertCell().textContent=r.model;
      row.insertCell().textContent=Math.round(r.score);
      updateCell(agent,r.model,r.score);
    });
    resultsEl.innerHTML='';
    resultsEl.appendChild(table);

    $('agentProgressText').textContent=job.status==='done'
      ?'Done! Best score: '+Math.round(best)
      :'Finished with status "'+job.status+'" — best score: '+Math.round(best);

    // The cache is a convenience only; a storage failure must not break the UI.
    try{
      const store=JSON.parse(localStorage.getItem('__researchResults')||'{}');
      store[agent]={models:results,ts:Date.now()};
      localStorage.setItem('__researchResults',JSON.stringify(store));
    }catch(e){
      console.warn('could not cache research results',e);
    }
  }finally{
    // Re-enable on every exit path, including unexpected errors.
    startBtn.disabled=false;
    evolveBtn.disabled=false;
  }
}
|
||||
|
||||
// Start a role-fit test (POST /api/evolve-agent/start) for the active agent
// and the checked models, wait for it to finish, then reload the report and
// close the dialog. Does nothing when no model is checked. Failures are shown
// in the progress caption and leave the dialog open.
async function startEvolveAgent(){
  const agent=window.__activeAgent;
  const models=[...$('agentModalList').querySelectorAll('input:checked')].map(i=>i.value);
  if(!models.length) return;

  const evolveBtn=$('evolveAgentBtn');
  const startBtn=$('agentStartBtn');
  evolveBtn.disabled=true;
  startBtn.disabled=true;
  setProgress('agentProgress',10,'Submitting evolve-agent job...');
  try{
    const res=await fetch(`${API_BASE}/api/evolve-agent/start`,{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({agent,models})});
    if(!res.ok) throw new Error('API error '+res.status);
    setProgress('agentProgress',50,'Running role-fit test...');
    const result=await res.json();
    if(result.job_id){
      const job=await pollJob(result.job_id,'agentProgress');
      // pollJob also resolves for 'error' and 'timeout'; those are failures.
      if(job.status!=='done') throw new Error('job ended with status '+job.status);
    }else{
      await animateProgress('agentProgress','Processing...',2000);
    }
    setProgress('agentProgress',100,'Done!');
    await load();
    closeModal('researchAgentModal');
  }catch(e){
    console.error('evolve-agent error',e);
    setProgress('agentProgress',100,'Error: '+e.message);
  }finally{
    // Always hand the buttons back, whatever happened above.
    evolveBtn.disabled=false;
    startBtn.disabled=false;
  }
}
|
||||
|
||||
// Evaluate the active agent × model pair (POST /api/research/cell), wait for
// the job, then show the score, update the matrix cell and cache the result
// in localStorage.
async function startCellResearch(){
  const agent=window.__activeAgent, model=window.__activeModel;
  const startBtn=$('cellStartBtn');
  const resultsEl=$('cellResults');
  // Messages are added as text nodes so names are never parsed as HTML.
  const showError=msg=>{
    const p=document.createElement('p');
    p.style.color='var(--red)';
    p.style.marginTop='12px';
    p.textContent=msg;
    resultsEl.innerHTML='';
    resultsEl.appendChild(p);
  };

  startBtn.disabled=true;
  setProgress('cellProgress',10,'Submitting...');
  try{
    let job;
    try{
      const res=await fetch(`${API_BASE}/api/research/cell`,{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({agent,model})});
      if(!res.ok) throw new Error('API error '+res.status);
      const submitted=await res.json();
      job=await pollJob(submitted.job_id,'cellProgress');
    }catch(e){
      showError('API unavailable — run real-fit-engine.py to evaluate '+agent);
      $('cellProgressText').textContent='Error: API unavailable';
      return;
    }

    const result=(job.models_scored||[])[0];
    if(job.status!=='done'||!result){
      // Do not invent a score of 0 for a failed or timed-out job: it would be
      // shown as "Done" and written to the cache as if it were a real result.
      showError('Evaluation did not complete (job status: '+job.status+').');
      $('cellProgressText').textContent='No score produced';
      return;
    }

    updateCell(agent,result.model,result.score);

    const table=document.createElement('table');
    table.className='result-table';
    const row=table.createTBody().insertRow();
    row.onclick=()=>openDetail(agent,result.model);
    row.insertCell().textContent=result.model;
    row.insertCell().textContent=Math.round(result.score);
    resultsEl.innerHTML='';
    resultsEl.appendChild(table);
    $('cellProgressText').textContent='Done! Score: '+Math.round(result.score);

    // The cache is a convenience only; a storage failure must not break the UI.
    try{
      const store=JSON.parse(localStorage.getItem('__researchResults')||'{}');
      if(!store[agent]) store[agent]={models:[],ts:Date.now()};
      store[agent].models=store[agent].models.filter(m=>m.model!==result.model);
      store[agent].models.push(result);
      localStorage.setItem('__researchResults',JSON.stringify(store));
    }catch(e){
      console.warn('could not cache cell result',e);
    }
  }finally{
    // Re-enable on every exit path so the evaluation can be run again.
    startBtn.disabled=false;
  }
}
|
||||
|
||||
// Kick off the initial load; show a failure as plain text (never as HTML).
load().catch(e=>{ $('matrix').textContent='Error: '+e; });
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
4
agent-evolution/requirements.txt
Normal file
4
agent-evolution/requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
fastapi==0.136.3
|
||||
uvicorn==0.48.0
|
||||
python-multipart==0.0.29
|
||||
pydantic==2.13.4
|
||||
138
agent-evolution/scripts/audit-system.cjs
Normal file
138
agent-evolution/scripts/audit-system.cjs
Normal file
@@ -0,0 +1,138 @@
|
||||
const fs = require('fs');
|
||||
|
||||
// Parse the leading `---` … `---` frontmatter of a markdown file into a flat
// object of `key: value` strings.
// Returns null when the content does not start with `---` or the closing
// `---` is missing. Only lines of the form `word: value` are picked up;
// anything else (lists, nested keys, blank lines) is ignored.
function parseFrontmatter(content) {
  if (!content.startsWith('---')) return null;
  const end = content.indexOf('---', 3);
  if (end === -1) return null;

  const data = {};
  // Split on LF or CRLF: with a plain '\n' split, lines from CRLF files keep a
  // trailing '\r', which `.` does not match, so they would all be dropped.
  for (const line of content.slice(3, end).trim().split(/\r?\n/)) {
    const m = line.match(/^(\w+):\s*(.+)$/);
    if (m) data[m[1]] = m[2].trim();
  }
  return data;
}
|
||||
|
||||
// Remove `// line` and `/* block */` comments from JSONC text so it can be
// handed to JSON.parse. Double-quoted strings are copied through untouched,
// so values such as "https://example.com" keep their slashes.
function stripComments(str) {
  const n = str.length;
  let out = '';
  let i = 0;
  while (i < n) {
    const ch = str[i];
    if (ch === '"') {
      // Copy the whole string literal, stepping over backslash escapes so an
      // escaped quote does not end it early.
      let j = i + 1;
      while (j < n && str[j] !== '"') {
        j += str[j] === '\\' ? 2 : 1;
      }
      out += str.slice(i, j + 1);
      i = j + 1;
    } else if (ch === '/' && str[i + 1] === '/') {
      // Line comment: drop everything up to (not including) the line break.
      while (i < n && str[i] !== '\n' && str[i] !== '\r') i++;
    } else if (ch === '/' && str[i + 1] === '*') {
      // Block comment: drop through the closing */ (or to the end if unclosed).
      const close = str.indexOf('*/', i + 2);
      i = close === -1 ? n : close + 2;
    } else {
      out += ch;
      i++;
    }
  }
  return out;
}
|
||||
|
||||
const agents = [];
const commands = [];
const issues = [];

// Read every .md file in `dir` and return one record per file whose
// frontmatter declares a model; files without frontmatter or model are skipped.
// `defaultMode` is used when the frontmatter has no `mode`.
function collectEntries(dir, defaultMode) {
  const entries = [];
  for (const f of fs.readdirSync(dir)) {
    if (!f.endsWith('.md')) continue;
    const source = dir + '/' + f;
    const fm = parseFrontmatter(fs.readFileSync(source, 'utf8'));
    if (!fm || !fm.model) continue;
    entries.push({
      // Drop only the trailing ".md"; replace('.md', '') would remove the
      // first occurrence and mangle names such as "notes.mdx.md".
      name: f.slice(0, -'.md'.length),
      model: fm.model,
      mode: fm.mode || defaultMode,
      source,
      description: fm.description || ''
    });
  }
  return entries;
}

// 1. Parse agent .md files
agents.push(...collectEntries('.kilo/agents', 'subagent'));

// 2. Parse command .md files
commands.push(...collectEntries('.kilo/commands', 'command'));
|
||||
|
||||
// 3. Parse kilo-meta.json
|
||||
// Compare each agent/command model from its .md frontmatter with
// kilo-meta.json, recording the meta model and any mismatch.
const meta = JSON.parse(fs.readFileSync('kilo-meta.json', 'utf8'));
agents.forEach(a => {
  const entry = meta.agents?.[a.name];
  if (!entry) return;
  a.metaModel = entry.model;
  if (a.model !== entry.model) {
    issues.push(`AGENT ${a.name}: .md=${a.model} vs meta=${entry.model}`);
  }
});
commands.forEach(c => {
  const entry = meta.commands?.[c.name];
  if (!entry) return;
  c.metaModel = entry.model;
  if (c.model !== entry.model) {
    issues.push(`CMD ${c.name}: .md=${c.model} vs meta=${entry.model}`);
  }
});

// 4. Parse .kilo/kilo.jsonc — per-agent model overrides are checked against agents.
const dotKiloRaw = stripComments(fs.readFileSync('.kilo/kilo.jsonc', 'utf8'));
const dotKilo = JSON.parse(dotKiloRaw);
Object.entries(dotKilo.agent || {}).forEach(([name, cfg]) => {
  if (!cfg.model) return;
  const agent = agents.find(a => a.name === name);
  if (!agent) return;
  agent.kiloModel = cfg.model;
  if (agent.model !== cfg.model) {
    issues.push(`AGENT ${name}: .md=${agent.model} vs .kilo/kilo.jsonc=${cfg.model}`);
  }
});

// 5. Parse root kilo.jsonc
// NOTE(review): this reads the `agent` section but matches names against
// commands — confirm that is intended.
const rootKiloRaw = stripComments(fs.readFileSync('kilo.jsonc', 'utf8'));
const rootKilo = JSON.parse(rootKiloRaw);
Object.entries(rootKilo.agent || {}).forEach(([name, cfg]) => {
  if (!cfg.model) return;
  const cmd = commands.find(c => c.name === name);
  if (!cmd) return;
  cmd.rootModel = cfg.model;
  if (cmd.model !== cfg.model) {
    issues.push(`CMD ${name}: .md=${cmd.model} vs kilo.jsonc=${cfg.model}`);
  }
});
|
||||
|
||||
// 6. Check non-ollama
|
||||
// Entries whose model is not served through the ollama-cloud/ provider
// (agents first, then commands).
const nonOllama = [
  ...agents
    .filter(a => !a.model.startsWith('ollama-cloud/'))
    .map(a => ({type:'agent', name:a.name, model:a.model})),
  ...commands
    .filter(c => !c.model.startsWith('ollama-cloud/'))
    .map(c => ({type:'command', name:c.name, model:c.model}))
];

// 7. Summary by model: how many agents + commands use each model.
const modelStats = {};
[...agents, ...commands].forEach(entry => {
  modelStats[entry.model] = (modelStats[entry.model] || 0) + 1;
});

// Both lists are sorted by name in place before being written out.
const byName = (a, b) => a.name.localeCompare(b.name);
agents.sort(byName);
commands.sort(byName);

const state = {
  generated: new Date().toISOString(),
  totalAgents: agents.length,
  totalCommands: commands.length,
  allOllama: nonOllama.length === 0,
  modelDistribution: modelStats,
  agents: agents,
  commands: commands,
  issues: issues,
  nonOllama: nonOllama
};

fs.writeFileSync('agent-evolution/data/real-state.json', JSON.stringify(state, null, 2) + '\n');
|
||||
|
||||
// Console report
|
||||
// Human-readable summary of the state that was just written to disk.
console.log('=== REAL SYSTEM STATE ===');
console.log('Generated:', state.generated);
console.log('Agents:', state.totalAgents);
console.log('Commands:', state.totalCommands);
console.log('All ollama-cloud/:', state.allOllama ? 'YES' : 'NO (' + nonOllama.length + ' exceptions)');

// Models ordered by how many agents/commands use them, most used first.
console.log('\n=== MODEL DISTRIBUTION ===');
for (const [m, c] of Object.entries(modelStats).sort((a,b) => b[1]-a[1])) {
  console.log(`  ${m}: ${c}`);
}

if (issues.length > 0) {
  console.log('\n=== ISSUES ===');
  issues.forEach(i => console.log('  ⚠️', i));
}

if (nonOllama.length > 0) {
  // Heading previously read "NON-OLLOMA" (typo).
  console.log('\n=== NON-OLLAMA ===');
  nonOllama.forEach(n => console.log('  ❌', n.type, n.name, n.model));
}

console.log('\n✅ State written to agent-evolution/data/real-state.json');
|
||||
29
agent-evolution/scripts/merge-real-fit.cjs
Normal file
29
agent-evolution/scripts/merge-real-fit.cjs
Normal file
@@ -0,0 +1,29 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// Merge real-fit evaluation results into the dashboard data file.
// dashboard-data.json is read, enriched and written back in place
// (DASH and OUT point at the same file).
const DASH = path.join(__dirname, '../data/dashboard-data.json');
const REAL = path.join(__dirname, '../data/real-fit-report.json');
const OUT = path.join(__dirname, '../data/dashboard-data.json');

const dash = JSON.parse(fs.readFileSync(DASH, 'utf-8'));
const real = JSON.parse(fs.readFileSync(REAL, 'utf-8'));

// Inject real_evaluations into each agent
dash.agents.forEach(a => {
  const r = real.agents?.[a.name];
  if (r && r.evaluations) {
    a.real_evaluations = r.evaluations;
    a.real_best_model = r.best_model;
    a.real_best_score = r.best_score;
  } else {
    a.real_evaluations = {};
    // The output file is also the input of the next run, so a best
    // model/score written by an earlier merge would otherwise linger for an
    // agent that no longer has any real evaluations.
    delete a.real_best_model;
    delete a.real_best_score;
  }
});

// Add metadata
dash.real_fit_generated = real.generated;
dash.real_fit_source = real.source;

fs.writeFileSync(OUT, JSON.stringify(dash, null, 2));
console.log('Merged real-fit data into ' + OUT);
console.log('Agents with real evals:', dash.agents.filter(a => Object.keys(a.real_evaluations||{}).length > 0).length);
|
||||
98
agent-evolution/scripts/patch-heatmap.js
Normal file
98
agent-evolution/scripts/patch-heatmap.js
Normal file
@@ -0,0 +1,98 @@
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
const INDEX = path.join(__dirname, '../index.standalone.html');
|
||||
|
||||
// 1. New renderHeatmap that reads real-fit data
// This is the *source text* of the replacement function: it is spliced into
// index.standalone.html as a string, so it must be valid browser JavaScript
// on its own. It reads window.dashboardData and, when present,
// window.realFitData.
const newRenderHeatmap = `function renderHeatmap() {
  // HTML-escape text before it is concatenated into markup.
  const esc = str => (str || '').replace(/[&<>"']/g, m => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[m]));
  const dd = window.dashboardData;

  // Merge real-fit if loaded
  const rf = window.realFitData || {};
  const realAgents = rf.agents || {};

  if (!dd || !dd.agents) {
    document.getElementById('hmTable').innerHTML = '<tr><td style="color:var(--text-secondary);padding:20px;text-align:center;">⚠️ No data. Run analysis.</td></tr>';
    return;
  }

  // Build model list from real-fit (cross-model) + current dashboard data
  const modelsSeen = new Set();
  dd.agents.forEach(a => { modelsSeen.add(a.model_short); });
  Object.values(realAgents).forEach(a => { Object.keys(a.evaluations || {}).forEach(m => modelsSeen.add(m)); });
  const modelList = Array.from(modelsSeen).filter(m => m && m !== 'code-skeptic');

  const t = document.getElementById('hmTable');
  let h = '<thead><tr><th class="hm-role">Agent</th>';
  modelList.forEach(m => {
    h += '<th style="writing-mode:vertical-lr;transform:rotate(180deg);max-width:32px;font-size:.56em;padding:3px 1px;">' + esc(m) + '</th>';
  });
  h += '<th>Best</th><th>Score</th></tr></thead><tbody>';

  dd.agents.forEach(a => {
    const realAgent = realAgents[a.name];
    h += '<tr><td class="hm-r">' + esc(a.name) + '</td>';
    modelList.forEach(m => {
      let score = 0;
      if (realAgent && realAgent.evaluations && realAgent.evaluations[m] > 0) {
        score = Math.round(realAgent.evaluations[m]);
      }
      const isCurrent = a.model_short === m;
      let cls = 'na';
      if (score >= 90) cls = 'high';
      else if (score >= 75) cls = 'good';
      else if (score >= 50) cls = 'med';
      else if (score > 0) cls = 'low';
      const display = score > 0 ? score : (isCurrent ? Math.round(a.fit_score || 0) : '·');
      const curStyle = isCurrent ? 'box-shadow:inset 0 0 0 2px var(--accent-cyan)' : '';
      h += '<td class="score ' + cls + '" style="' + curStyle + '">' + display + '</td>';
    });
    const bestModel = realAgent ? (realAgent.best_model || a.model_short) : a.model_short;
    const bestScore = realAgent ? Math.round(realAgent.best_score || 0) : Math.round(a.fit_score || 0);
    h += '<td>' + esc(bestModel) + '</td><td style="font-weight:700">' + bestScore + '</td></tr>';
  });
  t.innerHTML = h + '</tbody>';
}`;
|
||||
|
||||
// 2. Add loadRealFitData script after dashboard load
// Source text of a snippet that fetches data/real-fit-report.json and stores
// the parsed JSON on window.realFitData; a failed fetch only logs a warning.
// The snippet uses `await`, so the code it is inserted into must be async —
// presumably the page's init(); verify against index.standalone.html.
const loadRealFitData = `
// Load real-fit report for cross-model evaluation
try {
const rfRes = await fetch('data/real-fit-report.json');
if (rfRes.ok) window.realFitData = await rfRes.json();
} catch(e) { console.warn('real-fit-report.json not loaded:', e.message); }
`;
|
||||
|
||||
let html = fs.readFileSync(INDEX, 'utf-8');

/**
 * Locate the existing `function renderHeatmap() { ... }` definition in `src`.
 *
 * The end of the function is found by counting braces from the function's
 * opening `{` until the depth returns to zero, so functions with any number
 * of nested blocks are covered in full. A `// Render Heatmap ...` comment
 * line directly in front of the function is included in the span so that
 * re-running the script does not stack up comment lines.
 *
 * Limitation: braces inside string literals, regex literals or comments are
 * counted too; unbalanced ones there would make the scan fail (null).
 *
 * @param {string} src  Page source to search.
 * @returns {{start: number, end: number} | null}  Half-open span, or null
 *   when the function is missing or its braces never balance.
 */
function findRenderHeatmapSpan(src) {
  const sig = /function renderHeatmap\(\)\s*\{/.exec(src);
  if (!sig) return null;

  let end = -1;
  let depth = 0;
  for (let i = sig.index; i < src.length; i++) {
    if (src[i] === '{') {
      depth++;
    } else if (src[i] === '}' && --depth === 0) {
      end = i + 1;
      break;
    }
  }
  if (end === -1) return null;

  // Only absorb the marker comment when nothing but whitespace separates it
  // from the function, so unrelated code is never swallowed.
  const leadIn = /\/\/ Render Heatmap[^\n]*\n\s*$/.exec(src.slice(0, sig.index));
  return { start: leadIn ? leadIn.index : sig.index, end };
}

// Patch A: replace renderHeatmap function
const heatmapSpan = findRenderHeatmapSpan(html);
if (heatmapSpan) {
  html = html.substring(0, heatmapSpan.start)
    + '// Render Heatmap (real-fit enabled)\n'
    + newRenderHeatmap
    + html.substring(heatmapSpan.end);
  console.log('Patched renderHeatmap');
} else {
  console.warn('renderHeatmap() not found (or its braces are unbalanced); heatmap left unpatched');
}

// Patch B: insert real-fit loading after dashboard load
const dashLoadPattern = /window\.dashboardData = await dashRes\.json\(\);/;
const realFitLoader = loadRealFitData.trim();
if (html.includes(realFitLoader)) {
  // A previous run already inserted the loader; inserting again would fetch twice.
  console.log('Real-fit loader already present, skipping');
} else if (dashLoadPattern.test(html)) {
  // Function replacer: the inserted text is taken literally, never as a `$` pattern.
  html = html.replace(dashLoadPattern, matched => matched + '\n' + realFitLoader);
  console.log('Patched init() to load real-fit data');
} else {
  console.warn('Dashboard load statement not found; real-fit loader not inserted');
}

fs.writeFileSync(INDEX, html);
console.log('Done — ' + (fs.statSync(INDEX).size / 1024).toFixed(1) + ' KB');
|
||||
173
agent-evolution/scripts/rebuild-report.py
Normal file
173
agent-evolution/scripts/rebuild-report.py
Normal file
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Rebuild real-fit-report.json from SQLite DB.
|
||||
|
||||
Usage:
|
||||
python3 rebuild-report.py
|
||||
python3 rebuild-report.py --db /path/to/real-fit.db --report /path/to/real-fit-report.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sqlite3
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _sync_agents_from_meta(db_path: Path) -> None:
    """Import any missing agents from kilo-meta.json into the DB agents table.

    The metadata file is looked up three directory levels above ``db_path``
    (``db_path.parent.parent.parent / "kilo-meta.json"``). When it does not
    exist the function returns without opening the database. Agents whose
    name is already in the ``agents`` table are left untouched; new ones are
    inserted with defaults for any field the metadata omits.

    Args:
        db_path: Path to the SQLite database that holds the ``agents`` table.

    Raises:
        sqlite3.Error: If the ``agents`` table cannot be read or written.
        json.JSONDecodeError: If kilo-meta.json is not valid JSON.
    """
    meta_path = db_path.parent.parent.parent / "kilo-meta.json"
    if not meta_path.exists():
        return
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(meta_path, encoding="utf-8") as f:
        meta = json.load(f)

    conn = sqlite3.connect(str(db_path))
    try:
        existing = {row[0] for row in conn.execute("SELECT name FROM agents")}
        updated = datetime.now(timezone.utc).isoformat()
        missing = [
            (
                name,
                info.get("description", ""),
                info.get("category", "meta"),
                info.get("model", ""),
                info.get("color", "#6B7280"),
                updated,
            )
            for name, info in meta.get("agents", {}).items()
            if name not in existing
        ]
        conn.executemany(
            "INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
            missing,
        )
        conn.commit()
    finally:
        # Close even when a query fails so the DB handle is never leaked.
        conn.close()
|
||||
|
||||
|
||||
def build_report(db_path: Path) -> dict:
    """Build the real-fit report dictionary from the SQLite database.

    Steps:
      1. Sync agents from kilo-meta.json into the ``agents`` table.
      2. Read agent metadata and the usable evaluations. An evaluation is
         usable when ``total_score > 0``, its evaluator name does not contain
         ``rubric_v1``, and its response is NULL or a non-empty string that
         does not contain ``[HTTP ``.
      3. Keep one score per (agent, model): evaluator ``evolution-skeptic``
         wins over ``rubric_v2``, which wins over any other evaluator; within
         the same evaluator rank the highest score wins.
      4. Derive each agent's best model from those selected scores.

    Args:
        db_path: Path to the SQLite database with ``agents`` and
            ``evaluations`` tables.

    Returns:
        A dict with keys ``generated`` (UTC timestamp string), ``source``,
        ``total_evaluations`` (number of selected agent/model pairs),
        ``agents`` (per-agent evaluations, info and best model/score) and
        ``fit_scores`` (best model and score per evaluated agent).
    """
    _sync_agents_from_meta(db_path)

    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row
    try:
        agents_meta = {
            row["name"]: dict(row)
            for row in conn.execute(
                """
                SELECT name, description, category, current_model
                FROM agents
                """
            )
        }

        # Only take evaluations that are NOT HTTP error responses and were
        # not produced by rubric_v1; order so the preferred row comes first.
        eval_rows = conn.execute(
            """
            SELECT agent_name, model, total_score
            FROM evaluations
            WHERE total_score > 0
              AND evaluator NOT LIKE '%rubric_v1%'
              AND (response IS NULL
                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
            ORDER BY agent_name, model,
                     CASE evaluator
                         WHEN 'evolution-skeptic' THEN 0
                         WHEN 'rubric_v2' THEN 1
                         ELSE 2
                     END,
                     total_score DESC
            """
        ).fetchall()
    finally:
        # Close even when a query fails so the DB handle is never leaked.
        conn.close()

    # Take the first (best preferred evaluator, highest score) per agent-model
    best_evals = {}
    for row in eval_rows:
        best_evals.setdefault(row["agent_name"], {}).setdefault(
            row["model"], row["total_score"]
        )

    # fit_scores comes from the selected evaluations only, one entry per
    # agent, so it always names the same best model as the "agents" section.
    # (Previously it was filled from rows grouped by agent AND model, so the
    # last model in the result set silently overwrote the others.)
    fit_scores = {}
    for agent_name, evals in best_evals.items():
        top_model = max(evals, key=evals.get)
        top_score = evals[top_model]
        fit_scores[agent_name] = {
            "model": top_model,
            "fit": top_score,
            "explanation": (
                f"Best model for {agent_name} is {top_model} "
                f"with score {top_score:.1f}. "
                "Evaluator preference: evolution-skeptic > rubric_v2 > other "
                "(rubric_v1 and HTTP-error responses excluded)."
            ),
        }

    agents_report = {}
    for agent_name, meta in agents_meta.items():
        evals = best_evals.get(agent_name, {})
        if evals:
            best_model = max(evals, key=evals.get)
            best_score = evals[best_model]
        else:
            # Agent known to the DB but never (usably) evaluated.
            best_model = ""
            best_score = 0.0
        agents_report[agent_name] = {
            "name": agent_name,
            "evaluations": evals,
            "info": [
                meta.get("description") or "",
                meta.get("category") or "",
                meta.get("current_model") or "",
            ],
            "best_model": best_model,
            "best_score": best_score,
        }

    total_evals = sum(len(evals) for evals in best_evals.values())
    generated = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    return {
        "generated": generated,
        "source": "real-fit-engine-db-filtered",
        "total_evaluations": total_evals,
        "agents": agents_report,
        "fit_scores": fit_scores,
    }
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: rebuild the report JSON from the database.

    Accepts ``--db`` and ``--report``; both default to files in the ``data``
    directory next to this script's parent directory. The report's parent
    directory is created when missing, and a short summary is printed.
    """
    data_dir = Path(__file__).parent.parent / "data"

    cli = argparse.ArgumentParser(description="Rebuild real-fit-report.json from DB")
    cli.add_argument("--db", type=Path, default=data_dir / "real-fit.db", help="Path to SQLite DB")
    cli.add_argument(
        "--report",
        type=Path,
        default=data_dir / "real-fit-report.json",
        help="Path to report JSON output",
    )
    opts = cli.parse_args()

    report = build_report(opts.db)

    out_path = opts.report
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(report, indent=2), encoding="utf-8")

    agent_count = len(report["agents"])
    print(f"Report rebuilt: {out_path}")
    print(f"Agents: {agent_count}, Evaluations: {report['total_evaluations']}")
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user