feat(evolution): add real-fit dashboard, API, report builder, and docker compose

- real-fit.html: API-driven research dashboard with agent/model heatmap, detail modal with score breakdown and evaluator commentary
- api.py: FastAPI backend serving /api/real-fit-report (dynamic from SQLite), /api/research, /api/evolve-agent/start
- rebuild-report.py: generates real-fit-report.json from SQLite DB for static fallback
- docker-compose.yml: add evolution-api service (Python 3.12, uvicorn) for research endpoints
- index.standalone.html: sync with dashboard data updates
- archive/index.html: standalone dashboard snapshot (263KB)
- .gitignore: exclude *.db, research-jobs.json from tracking
This commit is contained in:
Deploy Bot
2026-05-28 11:55:49 +01:00
parent dbbf4c32e1
commit b95fd41587
13 changed files with 8886 additions and 353 deletions

491
agent-evolution/api.py Normal file
View File

@@ -0,0 +1,491 @@
"""
Evolution Research API — FastAPI backend for agent-model evaluation jobs.
Endpoints:
POST /api/research → start background evaluation job
GET /api/research/{id} → job status & results
POST /api/research/cell → evaluate single agent-model pair
GET /api/real-fit-report → serve real-fit-report.json (live from DB)
GET /api/models → list available models
GET /api/evaluation/{agent}/{model} → detailed evaluation record
POST /api/evolve-agent/start → start role-fit testing job (evolution-prompt + evolution-skeptic)
"""
import json
import os
import sqlite3
import subprocess
import time
import uuid
from datetime import datetime, timezone
from pathlib import Path
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
# ASGI application object; the route handlers below register themselves on it.
app = FastAPI(title="Evolution Research API", version="1.1.0")
# CORS is fully open: every origin, method and header is accepted.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is very
# permissive — confirm this is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Filesystem locations used by the handlers. Each one can be overridden through
# the environment variable named in its os.environ.get() call; the defaults
# point into /app.
JOB_STATE_PATH = Path(os.environ.get("JOB_STATE_PATH", "/app/data/research-jobs.json"))
REPORT_PATH = Path(os.environ.get("REPORT_PATH", "/app/data/real-fit-report.json"))
META_PATH = Path(os.environ.get("META_PATH", "/app/kilo-meta.json"))
EVOLUTION_PATH = Path(os.environ.get("EVOLUTION_PATH", "/app/data/evolution.json"))
ENGINE_PATH = Path(os.environ.get("ENGINE_PATH", "/app/scripts/real-fit-engine.py"))
# Unless REAL_FIT_DB is set, the SQLite database sits next to the report file.
DB_PATH = Path(os.environ.get("REAL_FIT_DB", REPORT_PATH.parent / "real-fit.db"))
def _load_json(path: Path) -> dict:
if path.exists():
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
return {}
def _save_json(path: Path, data: dict) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2)
def _load_jobs() -> dict:
    """Read the persisted research-job registry from ``JOB_STATE_PATH``."""
    registry = _load_json(JOB_STATE_PATH)
    return registry
def _save_jobs(jobs: dict) -> None:
    """Persist the research-job registry ``jobs`` to ``JOB_STATE_PATH``."""
    _save_json(path=JOB_STATE_PATH, data=jobs)
class ResearchRequest(BaseModel):
    """Request body of ``POST /api/research``: one agent evaluated against several models."""

    # Name of the agent to evaluate.
    agent: str
    # Model identifiers to evaluate the agent with.
    models: list[str]
class CellRequest(BaseModel):
    """Request body of ``POST /api/research/cell``: a single agent-model pair."""

    # Name of the agent to evaluate.
    agent: str
    # The one model identifier to evaluate the agent with.
    model: str
class EvolveAgentRequest(BaseModel):
    """Request body of ``POST /api/evolve-agent/start``; same fields as ``ResearchRequest``."""

    # Name of the agent to run the role-fit test for.
    agent: str
    # Model identifiers to include in the role-fit test.
    models: list[str]
def _spawn_engine_job(job_id: str, agent: str, models: list[str]) -> None:
    """Start a detached worker process that runs real-fit-engine.py for one job.

    The worker marks the job ``running`` in the job-state file, runs the engine
    as ``<engine> --evaluate <agent> --models <m1,m2,...> --report`` and then
    stores ``done`` (return code 0) or ``error`` together with the engine's
    return code, stdout and stderr. According to the original author's note,
    the ``--report`` flag makes the engine regenerate the report JSON.

    This function returns as soon as the worker has been started; it does not
    wait for the evaluation to finish.

    Compared with building the worker source by string interpolation, all
    values are handed over as command-line arguments, so no caller-supplied
    text ever becomes part of the executed source. The worker also replaces
    the job-state file atomically and records an ``error`` result when the
    engine process cannot be launched at all.

    Args:
        job_id: Key of the job inside the job-state file.
        agent: Agent name forwarded to the engine's ``--evaluate`` option.
        models: Model identifiers; joined with commas for ``--models``.
    """
    import sys  # local import: only needed to locate the running interpreter

    # Source of the worker. Deliberately a plain (non-f) string: parameters
    # arrive through sys.argv, see the Popen call below.
    worker_source = """
import json
import os
import subprocess
import sys
import time

job_id, agent, model_arg, job_state_path, engine_path = sys.argv[1:6]


def now():
    return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())


def load_jobs():
    try:
        with open(job_state_path) as f:
            return json.load(f)
    except Exception:
        return {}


def save_jobs(jobs):
    tmp_path = job_state_path + '.' + str(os.getpid()) + '.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(jobs, f, indent=2)
    os.replace(tmp_path, job_state_path)


def update_job(**fields):
    jobs = load_jobs()
    job = jobs.get(job_id)
    if job:
        job.update(fields)
        job['updated_at'] = now()
        save_jobs(jobs)


update_job(status='running')
cmd = [sys.executable or 'python3', engine_path,
       '--evaluate', agent, '--models', model_arg, '--report']
try:
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    result = {'returncode': proc.returncode, 'stdout': proc.stdout, 'stderr': proc.stderr}
except OSError as exc:
    result = {'returncode': None, 'stdout': '', 'stderr': str(exc)}
update_job(
    status='done' if result['returncode'] == 0 else 'error',
    progress=100,
    result=result,
)
"""
    # Run the worker under the interpreter that serves the API so that it sees
    # the same environment; fall back to PATH lookup if that is unknown.
    interpreter = sys.executable or "python3"
    subprocess.Popen(
        [
            interpreter,
            "-c",
            worker_source,
            job_id,
            agent,
            ",".join(models),
            str(JOB_STATE_PATH),
            str(ENGINE_PATH),
        ],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
@app.get("/api/models")
def get_models():
    """List every model name mentioned in kilo-meta.json or evolution.json.

    Collected from each agent's ``model`` in the meta file and, in the
    evolution file, from each agent's ``current.model`` and the ``model`` of
    every recommendation. Empty values are ignored; the result is sorted and
    free of duplicates.
    """
    found = set()

    def remember(candidate):
        # Skip missing/empty model names.
        if candidate:
            found.add(candidate)

    for entry in _load_json(META_PATH).get("agents", {}).values():
        remember(entry.get("model", ""))

    for agent_data in _load_json(EVOLUTION_PATH).get("agents", {}).values():
        remember(agent_data.get("current", {}).get("model", ""))
        for recommendation in agent_data.get("recommendations", []):
            remember(recommendation.get("model", ""))

    return {"models": sorted(found)}
@app.get("/api/evaluation/{agent}/{model}")
def get_evaluation(agent: str, model: str):
    """Return the detailed evaluation record for one agent-model pair.

    The highest-scoring evaluation with ``total_score > 0`` is preferred (ties
    resolved by the highest id); if none exists, the most recent evaluation of
    any score is used. The record is enriched with the test prompt it was run
    against: looked up by ``prompt_id`` first and, when that yields no system
    prompt, by the newest ``test_prompts`` row for the agent.

    ``expected_keywords``, ``rubric`` and ``scores`` are returned as decoded
    JSON; values that are missing or not valid JSON become ``[]``
    (``expected_keywords``) or ``{}`` (the other two).

    Raises:
        HTTPException: 404 when the database file does not exist or no
            evaluation is stored for the pair.
    """
    db_path = str(DB_PATH)
    if not os.path.exists(db_path):
        raise HTTPException(status_code=404, detail="Evaluation database not found")

    eval_columns = (
        "e.id, e.agent_name, e.model, e.prompt_id, "
        "e.response, e.scores, e.total_score, e.explanation, "
        "e.evaluator, e.latency_ms, e.tokens_prompt, e.tokens_response, e.evaluated_at"
    )
    prompt_columns = "system_prompt, user_prompt, expected_keywords, rubric"

    conn = sqlite3.connect(db_path)
    # try/finally so the connection is released even when a query raises
    # (for example because a table is missing).
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        # Step 1: best positive-score evaluation for this agent-model pair.
        cursor.execute(
            f"""
            SELECT {eval_columns}
            FROM evaluations e
            WHERE e.agent_name = ? AND e.model = ? AND e.total_score > 0
            ORDER BY e.total_score DESC, e.id DESC
            LIMIT 1
            """,
            (agent, model),
        )
        row = cursor.fetchone()
        if not row:
            # Fallback: the newest evaluation, even with a score of 0.
            cursor.execute(
                f"""
                SELECT {eval_columns}
                FROM evaluations e
                WHERE e.agent_name = ? AND e.model = ?
                ORDER BY e.id DESC LIMIT 1
                """,
                (agent, model),
            )
            row = cursor.fetchone()
        if not row:
            raise HTTPException(
                status_code=404,
                detail="Evaluation not found for this agent-model pair",
            )
        result = dict(row)

        # Step 2: prompt data — by prompt_id first, then by agent name.
        prompt = None
        prompt_id = result.get("prompt_id")
        if prompt_id:
            cursor.execute(
                f"SELECT {prompt_columns} FROM test_prompts WHERE id = ?",
                (prompt_id,),
            )
            candidate = cursor.fetchone()
            # A row without a system prompt counts as "not found".
            if candidate is not None and candidate["system_prompt"]:
                prompt = candidate
        if prompt is None:
            cursor.execute(
                f"SELECT {prompt_columns} FROM test_prompts "
                "WHERE agent_name = ? ORDER BY id DESC LIMIT 1",
                (agent,),
            )
            prompt = cursor.fetchone()
    finally:
        conn.close()

    def prompt_field(column, default):
        # Missing prompt row or empty column -> default.
        if prompt is None:
            return default
        return prompt[column] or default

    def decode(raw, fallback):
        # Only strings are decoded; other non-None values pass through as-is.
        if raw is None:
            return fallback
        if not isinstance(raw, str):
            return raw
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return fallback

    result["system_prompt"] = prompt_field("system_prompt", "")
    result["user_prompt"] = prompt_field("user_prompt", "")
    result["expected_keywords"] = decode(prompt_field("expected_keywords", "[]"), [])
    result["rubric"] = decode(prompt_field("rubric", "{}"), {})
    result["scores"] = decode(result.get("scores"), {})
    return result
def _sync_agents_from_meta(db_path: Path, meta_path: Path | None = None) -> None:
"""Import any missing agents from kilo-meta.json into the DB agents table."""
if meta_path is None:
meta_path = db_path.parent.parent.parent / "kilo-meta.json"
if not meta_path.exists():
return
with open(meta_path) as f:
meta = json.load(f)
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
cursor.execute("SELECT name FROM agents")
existing = {r[0] for r in cursor.fetchall()}
for name, info in meta.get("agents", {}).items():
if name in existing:
continue
cursor.execute(
"INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
(
name,
info.get("description", ""),
info.get("category", "meta"),
info.get("model", ""),
info.get("color", "#6B7280"),
datetime.now(timezone.utc).isoformat(),
),
)
conn.commit()
conn.close()
def _build_report_from_db(db_path: Path) -> dict:
    """Build the real-fit report from the SQLite database.

    Steps:
      1. Import agents that are missing from the ``agents`` table.
      2. Select, per agent-model pair, one evaluation score. Rows with a
         non-positive score, an evaluator matching ``rubric_v1`` or a response
         that is empty or contains ``[HTTP `` are excluded. Among the rest the
         evaluator preference is evolution-skeptic, then rubric_v2, then
         anything else, with the highest score winning inside one evaluator.
      3. Derive each agent's best model from those selected scores.

    ``fit_scores`` is derived from the same selected scores as
    ``agents[...]["evaluations"]``, so both always name the same best model.
    (Filling it once per (agent, model) row would let whichever model came
    last overwrite the entry, regardless of score.)

    Args:
        db_path: SQLite database with ``agents`` and ``evaluations`` tables.

    Returns:
        Report dict with the keys ``generated``, ``source``,
        ``total_evaluations``, ``agents`` and ``fit_scores``.

    Raises:
        sqlite3.Error: If a required table is missing or unreadable.
    """
    _sync_agents_from_meta(db_path)

    conn = sqlite3.connect(str(db_path))
    # try/finally: release the connection even if a query fails.
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        cursor.execute("""
            SELECT name, description, category, current_model
            FROM agents
        """)
        agents_meta = {row["name"]: dict(row) for row in cursor.fetchall()}

        # Skip HTTP-error responses and rubric_v1 results entirely; order the
        # remaining rows so that the preferred one comes first per pair.
        cursor.execute("""
            SELECT agent_name, model, total_score, evaluator, response
            FROM evaluations
            WHERE total_score > 0
              AND evaluator NOT LIKE '%rubric_v1%'
              AND (response IS NULL
                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
            ORDER BY agent_name, model,
                     CASE evaluator
                         WHEN 'evolution-skeptic' THEN 0
                         WHEN 'rubric_v2' THEN 1
                         ELSE 2
                     END,
                     total_score DESC
        """)
        # First row per (agent, model) wins; later rows are ignored.
        best_evals: dict = {}
        for row in cursor.fetchall():
            per_agent = best_evals.setdefault(row["agent_name"], {})
            per_agent.setdefault(row["model"], row["total_score"])
    finally:
        conn.close()

    # Best model per agent, computed from the selected evaluations only.
    fit_scores = {}
    for agent_name, evals in best_evals.items():
        top_model = max(evals, key=evals.get)
        top_score = evals[top_model]
        fit_scores[agent_name] = {
            "model": top_model,
            "fit": top_score,
            "explanation": (
                f"Best model for {agent_name} is {top_model} "
                f"with avg score {top_score:.1f}. "
                "Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
            ),
        }

    agents_report = {}
    for agent_name, meta in agents_meta.items():
        evals = best_evals.get(agent_name, {})
        if evals:
            best_model = max(evals, key=evals.get)
            best_score = evals[best_model]
        else:
            best_model = ""
            best_score = 0.0
        agents_report[agent_name] = {
            "name": agent_name,
            "evaluations": evals,
            "info": [
                meta.get("description") or "",
                meta.get("category") or "",
                meta.get("current_model") or "",
            ],
            "best_model": best_model,
            "best_score": best_score,
        }

    return {
        "generated": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "source": "real-fit-engine-db-filtered",
        "total_evaluations": sum(len(evals) for evals in best_evals.values()),
        "agents": agents_report,
        "fit_scores": fit_scores,
    }
@app.get("/api/real-fit-report")
def get_real_fit_report():
    """Serve the real-fit report: built live from the DB when it exists, else the JSON file."""
    if not os.path.exists(str(DB_PATH)):
        # No database yet: fall back to the static report file.
        return _load_json(REPORT_PATH)
    return _build_report_from_db(DB_PATH)
@app.post("/api/research")
def start_research(req: ResearchRequest):
    """Register a pending evaluation job for one agent and start its background worker.

    Returns the new job id together with the echoed agent and model list.
    """
    job_id = str(uuid.uuid4())
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"

    registry = _load_jobs()
    registry[job_id] = dict(
        id=job_id,
        agent=req.agent,
        models=req.models,
        status="pending",
        progress=0,
        result=None,
        created_at=time.strftime(stamp_format, time.gmtime()),
        updated_at=time.strftime(stamp_format, time.gmtime()),
    )
    _save_jobs(registry)

    _spawn_engine_job(job_id, req.agent, req.models)
    return {
        "job_id": job_id,
        "status": "pending",
        "agent": req.agent,
        "models": req.models,
    }
def _extract_scores_from_report(agent: str, models: list[str]) -> list[dict]:
    """Look up ``agent``'s score for each of ``models`` in real-fit-report.json.

    A model without an entry gets score 0; ``pending`` is true exactly when
    the score equals 0.
    """
    evaluations = (
        _load_json(REPORT_PATH)
        .get("agents", {})
        .get(agent, {})
        .get("evaluations", {})
    )
    return [
        {"model": name, "score": value, "pending": value == 0}
        for name, value in ((m, evaluations.get(m, 0)) for m in models)
    ]
@app.get("/api/research/{job_id}")
def get_research(job_id: str):
    """Return the stored state of one job; a finished job also carries ``models_scored``.

    Raises:
        HTTPException: 404 when no job with this id is stored.
    """
    job = _load_jobs().get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    finished = job.get("status") == "done" and job.get("result") is not None
    if finished:
        # Attach the per-model scores read from the report file.
        job["models_scored"] = _extract_scores_from_report(
            job["agent"], job.get("models", [])
        )
    return job
@app.post("/api/research/cell")
def research_cell(req: CellRequest):
    """Register a pending job for a single agent-model pair and start its background worker.

    Returns the new job id together with the echoed agent and model.
    """
    job_id = str(uuid.uuid4())
    single_model = [req.model]

    def utc_stamp():
        return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    registry = _load_jobs()
    record = {"id": job_id, "agent": req.agent, "models": single_model}
    record.update(status="pending", progress=0, result=None)
    record["created_at"] = utc_stamp()
    record["updated_at"] = utc_stamp()
    registry[job_id] = record
    _save_jobs(registry)

    _spawn_engine_job(job_id, req.agent, [req.model])
    return {
        "job_id": job_id,
        "status": "pending",
        "agent": req.agent,
        "model": req.model,
    }
@app.post("/api/evolve-agent/start")
def start_evolve_agent(req: EvolveAgentRequest):
    """Queue a role-fit evaluation job (evolution-prompt + evolution-skeptic).

    Currently a placeholder: the job is stored with type ``evolve-agent`` and
    handed to the same engine worker as a regular research job.

    Planned full implementation:
      1. evolution-prompt generates role-specific stress-test prompts from the
         agent definition.
      2. Every model in the list is tested with the same prompt.
      3. evolution-skeptic scores each response per rubric dimension.
      4. Results are stored in SQLite and the report is regenerated.
    """
    job_id = str(uuid.uuid4())
    iso_utc = "%Y-%m-%dT%H:%M:%SZ"

    registry = _load_jobs()
    registry[job_id] = dict(
        id=job_id,
        type="evolve-agent",
        agent=req.agent,
        models=req.models,
        status="pending",
        progress=0,
        result=None,
        created_at=time.strftime(iso_utc, time.gmtime()),
        updated_at=time.strftime(iso_utc, time.gmtime()),
    )
    _save_jobs(registry)

    # Placeholder: reuse the regular engine job. The dedicated script is
    # planned to:
    #   1. read the agent definition from .kilo/agents/{agent}.md
    #   2. call the Ollama API for evolution-prompt to generate test prompts
    #   3. call the Ollama API for each model and store the response
    #   4. call the Ollama API for evolution-skeptic to evaluate
    #   5. store the results in SQLite and rebuild the report
    _spawn_engine_job(job_id, req.agent, req.models)

    response = {"job_id": job_id, "status": "pending"}
    response["agent"] = req.agent
    response["models"] = req.models
    return response

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,11 @@
const { chromium } = require('playwright');
const fs = require('fs');
// Capture a 1280x720 screenshot of the dashboard landing page.
// The browser is closed in a `finally` block so that a navigation timeout or
// a failed screenshot does not leave a headless Chromium process running.
(async () => {
  const browser = await chromium.launch({ headless: true, args: ['--no-sandbox'] });
  try {
    const page = await browser.newPage({ viewport: { width: 1280, height: 720 } });
    await page.goto('http://host.docker.internal:3003', { waitUntil: 'domcontentloaded', timeout: 30000 });
    // Give client-side rendering a moment to settle before capturing.
    await page.waitForTimeout(2000);
    await page.screenshot({ path: '/app/tests/visual/current/dashboard_landing.png', fullPage: false });
  } finally {
    await browser.close();
  }
  console.log('Screenshot saved to /app/tests/visual/current/dashboard_landing.png');
})().catch((err) => {
  // Report the failure and exit non-zero instead of an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});

File diff suppressed because it is too large Load Diff

View File

@@ -1,28 +1,27 @@
# Docker Compose for Agent Evolution Dashboard (mount-driven, no-rebuild)
# Docker Compose for Agent Evolution Dashboard + Research API (mount-driven, no-rebuild)
# Usage:
# docker compose -f agent-evolution/docker-compose.yml up -d
# # Edit any file in agent-evolution/ or .kilo/ on host → instant reflection
# # Just run:
# bun run sync:evolution
# # and reload the page
# # Edit any file on host → instant reflection in containers
# # Dashboard: http://localhost:3003
# # API: http://localhost:3004
#
version: '3.8'
services:
evolution-dashboard:
build:
context: .
dockerfile: Dockerfile
image: python:3.12-alpine
container_name: apaw-evolution
ports:
- "3003:80"
volumes:
# Mount the generated standalone HTML to the container's web root
- ./index.standalone.html:/app/index.html:ro
# Mount real-fit standalone report
- ./real-fit.html:/app/real-fit.html:ro
# Mount data directory for any additional assets
- ./data:/app/data:ro
# Mount .kilo directory for live config access
- ../.kilo:/app/kilo:ro
working_dir: /app
command: ["python3", "-m", "http.server", "80"]
environment:
- NODE_ENV=production
- TZ=UTC
@@ -39,6 +38,47 @@ services:
- "com.apaw.service=evolution-dashboard"
- "com.apaw.description=Agent Evolution Dashboard"
evolution-api:
image: python:3.12-alpine
container_name: apaw-evolution-api
ports:
- "3004:8000"
volumes:
# API source code
- ./api.py:/app/api.py:ro
- ./requirements.txt:/app/requirements.txt:ro
# Data directory (read-write for job state and reports)
- ./data:/app/data:rw
# real-fit-engine.py script
- ../scripts/real-fit-engine.py:/app/scripts/real-fit-engine.py:ro
# Agent definitions and metadata
- ../.kilo/agents:/app/agents:ro
- ../kilo-meta.json:/app/kilo-meta.json:ro
working_dir: /app
command: >
sh -c "pip install --no-cache-dir -r requirements.txt && uvicorn api:app --host 0.0.0.0 --port 8000"
environment:
- TZ=UTC
- PYTHONUNBUFFERED=1
- JOB_STATE_PATH=/app/data/research-jobs.json
- REPORT_PATH=/app/data/real-fit-report.json
- META_PATH=/app/kilo-meta.json
- EVOLUTION_PATH=/app/data/evolution.json
- ENGINE_PATH=/app/scripts/real-fit-engine.py
- REAL_FIT_DB=/app/data/real-fit.db
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:8000/api/models"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
networks:
- evolution-network
labels:
- "com.apaw.service=evolution-api"
- "com.apaw.description=Agent Evolution Research API"
# Optional: Nginx reverse proxy with SSL
evolution-nginx:
image: nginx:alpine
@@ -49,13 +89,14 @@ services:
- "80:80"
- "443:443"
volumes:
- ./agent-evolution/nginx.conf:/etc/nginx/nginx.conf:ro
- ./agent-evolution/ssl:/etc/nginx/ssl:ro
- ./nginx.conf:/etc/nginx/nginx.conf:ro
- ./ssl:/etc/nginx/ssl:ro
depends_on:
- evolution-dashboard
- evolution-api
networks:
- evolution-network
networks:
evolution-network:
driver: bridge
driver: bridge

View File

@@ -5083,7 +5083,7 @@ async function init() {
try {
// Load real dashboard data FIRST (overrides stale agent-versions)
try {
const dashRes = await fetch('data/dashboard-data.json');
const dashRes = await fetch('data/dashboard-data.json', { cache: 'no-cache' });
if (dashRes.ok) {
window.dashboardData = await dashRes.json();
// Sync agentData from dashboard data for all other tabs
@@ -5439,64 +5439,63 @@ function renderRecCard(r, index) {
`;
}
// Render Heatmap — REAL DATA: Agent × Current Model × Real Fit Score
// Render Heatmap — REAL DATA: Agent × Model × Live Ollama Evaluations
function renderHeatmap() {
const esc = str => (str || '').replace(/[&<>"']/g, m => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[m]));
const dd = window.dashboardData;
if (!dd || !dd.agents) {
document.getElementById('hmTable').innerHTML = '<tr><td style="color:var(--text-secondary);padding:20px;text-align:center;">⚠️ Нет данных. Запустите анализ.</td></tr>';
document.getElementById('hmTable').innerHTML = '<tr><td style="color:var(--text-secondary);padding:20px;text-align:center;">⚠️ No data. Run analysis.</td></tr>';
return;
}
const agents = dd.agents;
// Get unique models sorted by count of agents
const modelCounts = {};
agents.forEach(a => { modelCounts[a.model_short] = (modelCounts[a.model_short] || 0) + 1; });
const modelList = Object.entries(modelCounts)
.sort((a, b) => b[1] - a[1])
.map(([short]) => {
const m = dd.models[short] || {};
return {
short,
full: 'ollama-cloud/' + short,
name: m.name || short,
avg_fit: m.avg_fit || 0,
agents: m.agents || 0
};
});
// Collect all models from current assignments + realfit evaluations
const modelsSeen = new Set();
dd.agents.forEach(a => { if (a.model_short) modelsSeen.add(a.model_short); });
dd.agents.forEach(a => {
if (a.real_evaluations) Object.keys(a.real_evaluations).forEach(m => { if (m && m !== 'code-skeptic') modelsSeen.add(m); });
});
// Ensure real-fit evaluated models are included even if not current
const modelList = Array.from(modelsSeen).sort();
// Render table: rows=agents, cols=models
const t = document.getElementById('hmTable');
let h = '<thead><tr><th class="hm-role">Agent</th>';
modelList.forEach(m => {
const color = m.avg_fit >= 85 ? '#00ff94' : m.avg_fit >= 70 ? '#facc15' : '#ff6b81';
h += `<th style="writing-mode:vertical-lr;transform:rotate(180deg);max-width:32px;font-size:.56em;padding:3px 1px;">
${esc(m.name)}<br>
<span style="color:${color};font-size:.9em;font-weight:700">avg:${m.avg_fit}</span><br>
<span style="color:var(--text-muted);font-size:.8em">${m.agents}</span>
</th>`;
// Compute avg from dd.agents real_evaluations
let sum = 0, cnt = 0;
dd.agents.forEach(a => { const v = (a.real_evaluations || {})[m]; if (v > 0) { sum += v; cnt++; } });
const avg = cnt > 0 ? Math.round(sum / cnt) : 0;
const color = avg >= 85 ? '#00ff94' : avg >= 70 ? '#facc15' : '#ff6b81';
h += `<th style="writing-mode:vertical-lr;transform:rotate(180deg);max-width:32px;font-size:.56em;padding:3px 1px;">${esc(m)}<br><span style="color:${color};font-size:.9em;font-weight:700">avg:${avg}</span></th>`;
});
h += '</tr></thead><tbody>';
h += '<th>Best</th><th>Score</th></tr></thead><tbody>';
agents.forEach(a => {
dd.agents.forEach(a => {
h += `<tr><td class="hm-r">${esc(a.name)}</td>`;
modelList.forEach((m, j) => {
const isCurrent = a.model_short === m.short;
const score = isCurrent ? a.fit_score : 0; // Only show score for CURRENT model
const cur = isCurrent;
let marks = '';
if (cur) marks += '<span style="border:1px solid var(--accent-cyan);border-radius:50%;padding:1px 3px;font-size:8px">●</span>';
const bg = cur ? hmColor(score) : 'transparent';
const txt = cur ? hmText(score) : 'var(--text-muted)';
h += `<td style="background:${bg};color:${txt};cursor:pointer${cur ? ';box-shadow:inset 0 0 0 2px var(--accent-cyan)' : ''}" class="${cur ? 'hm-cur' : ''}"
title="${esc(a.name)}${esc(m.name)}: ${isCurrent ? 'fit=' + a.fit_score + ', if=' + a.instruction_following : 'не использует этот модель'}"
onmouseover="showTT(event,'${esc(a.name)}','${esc(m.name)}',${isCurrent ? a.fit_score : 0},${isCurrent},${cur},${isCurrent ? a.instruction_following : 0})"
onmouseout="hideTT()"
onclick="openHmModal(event, '${esc(a.name)}', '${esc(m.name)}', ${isCurrent ? a.fit_score : 0}, ${isCurrent ? a.instruction_following : 0})"
>${isCurrent ? a.fit_score : '·'}${marks}</td>`;
modelList.forEach(m => {
const isCurrent = a.model_short === m;
let score = 0;
// Prefer real-fit score, fallback to current fit_score
if (a.real_evaluations && a.real_evaluations[m] > 0) score = Math.round(a.real_evaluations[m]);
else if (isCurrent) score = Math.round(a.fit_score || 0);
let cls = 'na';
if (score >= 90) cls = 'high';
else if (score >= 75) cls = 'good';
else if (score >= 50) cls = 'med';
else if (score > 0) cls = 'low';
const curMark = isCurrent ? ' ●' : '';
const curStyle = isCurrent ? 'box-shadow:inset 0 0 0 2px var(--accent-cyan);' : '';
const bg = score > 0 ? hmColor(score) : 'transparent';
const txt = score >= 75 ? '#0e1219' : 'var(--text-primary)';
const display = score > 0 ? score : (isCurrent ? Math.round(a.fit_score || 0) : '·');
h += `<td class="score ${cls}" style="background:${bg};color:${txt};${curStyle}cursor:pointer" title="${esc(a.name)}${esc(m)}: ${score > 0 ? 'real fit=' + score : (isCurrent ? 'fit=' + a.fit_score : 'no data')}" onclick="openHmModal(event,'${esc(a.name)}','${esc(m)}',${score},${a.instruction_following || 0})">${display}${curMark}</td>`;
});
h += '</tr>';
const bestModel = a.real_best_model || a.model_short;
const bestScore = a.real_best_score ? Math.round(a.real_best_score) : Math.round(a.fit_score || 0);
h += `<td>${esc(bestModel)}</td><td style="font-weight:700">${bestScore}</td></tr>`;
});
t.innerHTML = h + '</tbody>';
}
@@ -5511,29 +5510,6 @@ function hmColor(v) {
return 'rgba(90,104,128,.2)';
}
function hmText(v) {
return v >= 75 ? '#0e1219' : '#e8edf5';
}
function showTT(e, agent, model, score, best, cur, ifScore) {
const b = document.getElementById('ttBox'), o = document.getElementById('ttOverlay');
const ifColor = ifScore >= 85 ? '#00ff94' : ifScore >= 75 ? '#facc15' : '#ff6b81';
const ifLabel = ifScore >= 85 ? 'Excellent' : ifScore >= 75 ? 'Average' : 'Weak';
b.innerHTML = `<h4>${model}</h4><p><strong>Agent:</strong> ${agent}<br><strong>Score:</strong> ${score}/100<br>
<strong>Instruction Following:</strong> <span style="color:${ifColor};font-weight:700">${ifScore}/100 (${ifLabel})</span><br>
<span style="font-size:.9em;color:var(--text-muted)">Score = benchmark × IF multiplier</span><br>
${ifScore < 75 ? '<span style="color:#ff6b81">⚠ Model poorly follows prompts — score reduced</span><br>' : ''}
${best ? '★ <strong>Best fit</strong><br>' : ''}${cur ? '📌 <strong>Current</strong>' : ''}</p>`;
const r = e.target.getBoundingClientRect();
b.style.left = Math.min(r.left, window.innerWidth - 320) + 'px';
b.style.top = (r.bottom + 6) + 'px';
o.classList.add('show');
}
function hideTT() {
document.getElementById('ttOverlay').classList.remove('show');
}
// Current modal state
let hmCurrentAgent = null;
let hmCurrentModel = null;

View File

@@ -0,0 +1,460 @@
<!DOCTYPE html>
<html lang="ru">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Real-Fit Matrix — Agent × Model Performance</title>
<style>
:root{--bg:#0a0f1a;--bg2:#0f1525;--bg3:#141c2e;--bdr:#1e2d45;--txt:#e8f1ff;--txt2:#8ba3c0;--cyan:#00d4ff;--green:#00ff94;--red:#ff4757;--orange:#ff9f43;--purple:#a855f7;}
*{margin:0;padding:0;box-sizing:border-box}
body{font-family:system-ui,-apple-system,sans-serif;background:var(--bg);color:var(--txt);min-height:100vh;padding:24px}
h1{font-size:1.6rem;background:linear-gradient(90deg,var(--cyan),var(--green));-webkit-background-clip:text;-webkit-text-fill-color:transparent;margin-bottom:8px}
.sub{color:var(--txt2);font-size:.85rem;margin-bottom:20px}
table{width:100%;border-collapse:collapse;font-size:.82rem}
th,td{padding:8px 10px;border:1px solid var(--bdr);text-align:center}
th{background:var(--bg2);color:var(--txt2);font-size:.72rem;text-transform:uppercase;letter-spacing:.5px;position:sticky;top:0}
td:first-child{text-align:left;font-weight:700;white-space:nowrap}
td.score{font-weight:700;font-family:monospace}
.hm-cur{box-shadow:inset 0 0 0 2px var(--cyan)}
.high{background:rgba(0,255,148,.18);color:var(--green)}
.good{background:rgba(0,212,255,.14);color:var(--cyan)}
.med{background:rgba(168,85,247,.15);color:var(--purple)}
.low{background:rgba(255,71,87,.1);color:var(--red)}
.na{background:transparent;color:var(--txt2);font-size:.9rem;cursor:pointer}
.na:hover{background:rgba(0,212,255,.08)}
.legend{display:flex;gap:12px;flex-wrap:wrap;margin-top:16px;font-size:.78rem;color:var(--txt2)}
.legend span{display:flex;align-items:center;gap:4px}
.dot{width:14px;height:14px;border-radius:3px}
.meta{font-size:.72rem;color:var(--txt2);margin-top:12px}
a{color:var(--cyan);text-decoration:none}
.btn-research{font-size:.9rem;background:none;border:none;cursor:pointer;margin-left:4px;opacity:.7}
.btn-research:hover{opacity:1}
.modal{position:fixed;inset:0;background:rgba(0,0,0,.7);display:flex;align-items:center;justify-content:center;z-index:1000;padding:16px}
.modal.hidden{display:none}
.modal-panel{background:var(--bg2);border:1px solid var(--cyan);border-radius:12px;max-width:90vw;width:480px;max-height:90vh;overflow-y:auto;padding:20px;position:relative}
.modal-panel.wide{width:640px}
.modal-title{font-size:1.1rem;margin-bottom:12px}
.modal-list{text-align:left;margin:12px 0;max-height:40vh;overflow-y:auto}
.modal-list label{display:flex;align-items:center;gap:8px;padding:6px 0;cursor:pointer;border-bottom:1px solid var(--bdr)}
.modal-list input{margin:0}
.modal-actions{display:flex;gap:8px;justify-content:flex-end;margin-top:16px}
.btn{padding:6px 14px;border-radius:6px;border:1px solid var(--bdr);background:var(--bg3);color:var(--txt);cursor:pointer;font-size:.85rem}
.btn.primary{background:linear-gradient(90deg,var(--green),var(--cyan));color:#000;border:none;font-weight:700}
.progress{margin-top:12px}
.progress-bar{height:8px;background:var(--bg3);border-radius:4px;overflow:hidden}
.progress-fill{height:100%;width:0%;background:linear-gradient(90deg,var(--green),var(--cyan));transition:width .3s}
.progress-text{font-size:.8rem;color:var(--txt2);margin-top:6px;text-align:center}
.result-table{width:100%;margin-top:12px;font-size:.82rem;border-collapse:collapse}
.result-table th,.result-table td{padding:6px;border:1px solid var(--bdr)}
.result-table .best{background:rgba(0,255,148,.25);color:var(--green);font-weight:700}
.result-table tbody tr{cursor:pointer}
.result-table tbody tr:hover{background:rgba(0,212,255,.06)}
.detail-row{margin-bottom:12px}
.detail-label{font-size:.72rem;color:var(--txt2);text-transform:uppercase;letter-spacing:.5px;margin-bottom:4px}
.detail-val{font-size:.85rem;white-space:pre-wrap;word-break:break-word}
.detail-pills{display:flex;flex-wrap:wrap;gap:6px;margin-top:4px}
.pill{font-size:.72rem;padding:2px 8px;border-radius:4px;background:var(--bg3);border:1px solid var(--bdr);color:var(--txt2)}
.score-big{font-size:2rem;font-weight:700;margin:4px 0}
.toggle{color:var(--cyan);cursor:pointer;font-size:.78rem}
.toggle:hover{text-decoration:underline}
.dim-bar{display:flex;align-items:center;gap:8px;margin:4px 0}
.dim-bar>span:first-child{width:120px;font-size:.75rem;color:var(--txt2);white-space:nowrap;overflow:hidden;text-overflow:ellipsis}
.dim-track{flex:1;height:8px;background:var(--bg3);border-radius:4px;overflow:hidden}
.dim-fill{height:100%;border-radius:4px}
.dim-num{width:30px;text-align:right;font-size:.78rem;font-weight:700}
.v-pass{color:var(--green)}
.v-marginal{color:var(--orange)}
.v-fail{color:var(--red)}
.commentary{font-size:.85rem;padding:10px 12px;background:rgba(0,212,255,.08);border-left:3px solid var(--cyan);border-radius:0 6px 6px 0;color:var(--txt);white-space:pre-wrap;word-break:break-word}
</style>
</head>
<body>
<h1>Real-Fit Matrix</h1>
<div class="sub">Real agent × model evaluation scores via live Ollama API (28 calls, 4 models, 7 agents)</div>
<div id="matrix"></div>
<div class="legend">
<span><span class="dot high"></span> 90+ Excellent</span>
<span><span class="dot good"></span> 75–89 Good</span>
<span><span class="dot med"></span> 50–74 Average</span>
<span><span class="dot low"></span> &lt;50 Weak</span>
<span style="margin-left:auto">● = assigned model</span>
</div>
<div class="meta">Data source: <a href="data/real-fit-report.json" target="_blank">real-fit-report.json</a> | Updated: <span id="updated"></span></div>
<div id="researchAgentModal" class="modal hidden">
<div class="modal-panel">
<div class="modal-title" id="agentModalTitle">Research models</div>
<div class="modal-list" id="agentModalList"></div>
<div class="progress hidden" id="agentProgress">
<div class="progress-bar"><div class="progress-fill" id="agentProgressFill"></div></div>
<div class="progress-text" id="agentProgressText"></div>
</div>
<div class="modal-actions">
<button class="btn" onclick="closeModal('researchAgentModal')">Close</button>
<button class="btn" id="evolveAgentBtn" onclick="startEvolveAgent()">Run Role-Fit Test</button>
<button class="btn primary" id="agentStartBtn" onclick="startAgentResearch()">Start Research</button>
</div>
<div id="agentResults"></div>
</div>
</div>
<div id="researchCellModal" class="modal hidden">
<div class="modal-panel">
<div class="modal-title" id="cellModalTitle">Evaluate cell</div>
<div class="modal-list" id="cellModalList"></div>
<div class="progress hidden" id="cellProgress">
<div class="progress-bar"><div class="progress-fill" id="cellProgressFill"></div></div>
<div class="progress-text" id="cellProgressText"></div>
</div>
<div class="modal-actions">
<button class="btn" onclick="closeModal('researchCellModal')">Close</button>
<button class="btn primary" id="cellStartBtn" onclick="startCellResearch()">Evaluate</button>
</div>
<div id="cellResults"></div>
</div>
</div>
<div id="detailModal" class="modal hidden">
<div class="modal-panel wide">
<div style="display:flex;justify-content:space-between;align-items:center;margin-bottom:12px">
<span style="font-size:1rem;font-weight:700" id="detailTitle"></span>
<button class="btn" onclick="closeModal('detailModal')">Close</button>
</div>
<div id="detailContent"></div>
</div>
</div>
<script>
// Page-level state:
//   reportData         - real-fit report object (filled by load())
//   evoData            - parsed data/evolution.json (filled by load())
//   allModels          - model names shown as matrix columns
//   allAvailableModels - model ids offered in the "Research models" dialog
let reportData, evoData, allModels=[], allAvailableModels=[];
// Base URL of the research API. NOTE(review): hard-coded to localhost, so the
// page only reaches the API when opened on the machine that runs it.
const API_BASE='http://localhost:3004';
// Shorthand for document.getElementById.
const $=id=>document.getElementById(id);
// Static per-model numbers displayed as "IF <n>" in the agent dialog; the keys
// double as the fallback model list when /api/models cannot be fetched.
// NOTE(review): presumably an instruction-following benchmark score — confirm.
const MODEL_BENCHMARKS={
"qwen3.5-122b":91,"qwen3-coder-480b":88,"deepseek-v4-pro-max":89,
"deepseek-v4-flash":86,"kimi-k2.6":91,"kimi-k2.5":90,
"minimax-m2.5":82,"minimax-m2.7":80,"glm-5.1":90,
"glm-5":90,"nemotron-3-super":78,"nemotron-3-nano":68,
"gemma4-27b":85,"devstral-2":80,"devstral-small-2":75
};
// Map a numeric score to the CSS class string of its heatmap band
// (>=90 high, >=75 good, >=50 med, anything else low).
function clsFor(s){
  const bands=[[90,'score high'],[75,'score good'],[50,'score med']];
  for(const [floor,cls] of bands){
    if(s>=floor) return cls;
  }
  return 'score low';
}
// Map a numeric score to the CSS colour variable of its band
// (same thresholds as the heatmap classes: 90 / 75 / 50).
function scoreColor(s){
  const palette=[[90,'var(--green)'],[75,'var(--cyan)'],[50,'var(--purple)']];
  const hit=palette.find(([floor])=>s>=floor);
  return hit?hit[1]:'var(--red)';
}
// Hide the modal with the given element id by adding the "hidden" class.
function closeModal(id){
  const modal=$(id);
  modal.classList.add('hidden');
}
/**
 * Load everything the page needs and render the matrix.
 *
 * 1. Model list from the API; falls back to the MODEL_BENCHMARKS keys.
 * 2. Real-fit report from the API; falls back to the static JSON file.
 * 3. data/evolution.json (best effort).
 * 4. Builds the column list from models with a positive score sum, merges
 *    locally cached research results and renders the table.
 *
 * Rejects when neither report source yields an object with an `agents` map.
 */
async function load(){
  // fetch() resolves on HTTP errors too, so a 4xx/5xx must be turned into a
  // rejection explicitly or the error body would be treated as real data.
  const fetchJson=async url=>{
    const res=await fetch(url);
    if(!res.ok) throw new Error(url+' -> HTTP '+res.status);
    return res.json();
  };
  const hasAgents=r=>!!r&&typeof r.agents==='object'&&r.agents!==null;

  try{
    allAvailableModels=(await fetchJson(`${API_BASE}/api/models`)).models||[];
  }catch(e){
    allAvailableModels=Object.keys(MODEL_BENCHMARKS);
  }

  try{
    const live=await fetchJson(`${API_BASE}/api/real-fit-report`);
    if(!hasAgents(live)) throw new Error('API report has no agents section');
    reportData=live;
  }catch(e){
    // Static fallback generated by rebuild-report.py.
    reportData=await fetchJson('data/real-fit-report.json');
    if(!hasAgents(reportData)) throw new Error('real-fit-report.json has no agents section');
  }

  // Nothing in this script reads evoData after load(), so a missing file must
  // not prevent the heatmap from rendering.
  try{
    evoData=await fetchJson('data/evolution.json');
  }catch(e){
    console.warn('evolution.json not loaded:',e.message);
    evoData=null;
  }

  reportData.generated = reportData.generated || new Date().toISOString();
  // API data is no longer mixed with the local cache: stale cached scores
  // would otherwise override the live values coming from the database.
  if (typeof reportData.source==='string' && reportData.source.includes('db')) {
    try { localStorage.removeItem('__researchResults'); } catch(e) {}
  }
  $('updated').textContent=new Date(reportData.generated).toLocaleString('ru-RU');

  // Only agents with at least one positive score contribute columns.
  const agents=Object.values(reportData.agents).filter(a=>Object.values(a.evaluations).some(s=>s>0));
  const modelScores={};
  agents.forEach(a=>{for(const[m,s] of Object.entries(a.evaluations)){modelScores[m]=(modelScores[m]||0)+s;}});
  allModels=Object.keys(modelScores).filter(m=>modelScores[m]>0).sort();
  mergeCachedResults();
  renderTable();
}
// Return the short name (last "/"-separated segment) of the model stored in
// info[2] of the given agent's report entry, or '' when there is none.
function currentModel(agentName){
  const entry=reportData.agents[agentName];
  const info=(entry==null?undefined:entry.info)||[];
  const assigned=info[2]||'';
  const segments=assigned.split('/');
  return segments[segments.length-1];
}
// Drop the first occurrence of the "ollama-cloud/" provider prefix from a model id.
function modelShort(full){
  const prefix='ollama-cloud/';
  const at=full.indexOf(prefix);
  if(at===-1) return full;
  return full.slice(0,at)+full.slice(at+prefix.length);
}
// Open the "Research models" dialog for one agent: list every available model
// as a checkbox (the agent's current model pre-checked), reset results and
// progress, re-enable both action buttons and remember the active agent.
function openAgentModal(agent){
  $('agentModalTitle').textContent='Research models for '+agent;
  const cur=currentModel(agent);
  const rows=allAvailableModels.map(full=>{
    const m=modelShort(full);
    // Accept the exact name as well as the ":" / "-" spelling variants.
    const isCurrent=m===cur||cur.replace(':','-')===m||m.replace('-',':')===cur;
    const checked=isCurrent?'checked':'';
    const ifs=MODEL_BENCHMARKS[m]||'—';
    return `<label><input type="checkbox" value="${m}" ${checked}> <span>${m}</span> <span style="color:var(--txt2);margin-left:auto">IF ${ifs}</span></label>`;
  });
  const markup=rows.join('');
  $('agentModalList').innerHTML=markup||'<p style="color:var(--txt2)">No model data</p>';
  $('agentResults').innerHTML='';
  $('agentProgress').classList.add('hidden');
  for(const btnId of ['agentStartBtn','evolveAgentBtn']){
    $(btnId).disabled=false;
  }
  $('researchAgentModal').classList.remove('hidden');
  window.__activeAgent=agent;
}
// Open the single-cell evaluation dialog for an agent/model pair and remember
// the pair for startCellResearch().
function openCellModal(agent,model){
  window.__activeAgent=agent;
  window.__activeModel=model;
  const title=$('cellModalTitle');
  title.textContent='Evaluate '+agent+' × '+model;
  const list=$('cellModalList');
  list.innerHTML=`<label><input type="checkbox" value="${model}" checked> <span>${model}</span></label>`;
  $('cellResults').innerHTML='';
  $('cellProgress').classList.add('hidden');
  $('cellStartBtn').disabled=false;
  $('researchCellModal').classList.remove('hidden');
}
/**
 * Show the detail modal for one agent/model evaluation.
 *
 * Fetches GET /api/evaluation/{agent}/{model} and renders score breakdown,
 * prompts, model response, evaluator commentary, rubric and metadata.
 * On any fetch/HTTP failure a "no data" message is shown instead.
 *
 * Every value taken from the API (prompts, model output, commentary, keys) is
 * HTML-escaped before being placed into innerHTML: model responses routinely
 * contain "<", "&" or markup, which would otherwise corrupt the layout or
 * execute as HTML.
 */
async function openDetail(agent,model){
  const esc=v=>String(v).replace(/[&<>"']/g,c=>({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[c]));
  $('detailTitle').textContent=agent+' × '+model;
  $('detailContent').innerHTML='<p style="color:var(--txt2)">Loading...</p>';
  $('detailModal').classList.remove('hidden');
  let data;
  try{
    const res=await fetch(`${API_BASE}/api/evaluation/${encodeURIComponent(agent)}/${encodeURIComponent(model)}`);
    if(!res.ok) throw new Error(res.status);
    data=await res.json();
  }catch(e){
    $('detailContent').innerHTML='<p style="color:var(--red);margin-top:12px">No detailed evaluation data available for this combination. Run research first.</p>';
    return;
  }
  const s=data.total_score??data.score??0;
  const verdict=String(data.verdict||'').toUpperCase();
  let vClass='';
  if(verdict==='PASS') vClass='v-pass';
  else if(verdict==='MARGINAL') vClass='v-marginal';
  else if(verdict==='FAIL') vClass='v-fail';
  const verdictHtml=verdict?`<span class="${vClass}" style="font-size:.85rem;font-weight:700;border:1px solid currentColor;padding:2px 8px;border-radius:4px;margin-left:8px">${esc(verdict)}</span>`:'';
  let scoresHtml='';
  if(data.scores){
    scoresHtml='<div class="detail-row"><div class="detail-label">Score Breakdown</div>';
    for(const [k,v] of Object.entries(data.scores)){
      const num=typeof v==='number'?v:Number(v)||0;
      // Keep the bar inside its track even for out-of-range values.
      const width=Math.max(0,Math.min(100,num));
      scoresHtml+=`<div class="dim-bar"><span>${esc(k)}</span><div class="dim-track"><div class="dim-fill" style="width:${width}%;background:${scoreColor(num)}"></div></div><span class="dim-num">${Math.round(num)}</span></div>`;
    }
    scoresHtml+='</div>';
  }
  let commentaryHtml='';
  if(data.explanation){
    commentaryHtml=`<div class="detail-row"><div class="detail-label">Evaluator Commentary</div><div class="commentary">${esc(data.explanation)}</div></div>`;
  }
  let rubricHtml='';
  if(data.rubric){
    rubricHtml='<div class="detail-row"><div class="detail-label">Rubric Weights</div><div class="detail-pills">';
    for(const [k,v] of Object.entries(data.rubric)){
      rubricHtml+=`<span class="pill">${esc(k)}: ${esc(v)}</span>`;
    }
    rubricHtml+='</div></div>';
  }
  let kwHtml='';
  if(data.expected_keywords?.length){ kwHtml='<div class="detail-pills">'+data.expected_keywords.map(k=>`<span class="pill">${esc(k)}</span>`).join('')+'</div>'; }
  // Long responses are shown truncated; the toggle swaps in the full text via
  // textContent, so the initial render must be escaped to match.
  const resp=(data.response||'').toString();
  const trunc=resp.length>500?resp.slice(0,500)+'...':resp;
  const more=resp.length>500;
  const rid='r'+Math.random().toString(36).slice(2);
  window.__respCache=window.__respCache||{};
  window.__respCache[rid]={full:resp,trunc:trunc};
  let respHtml=`<div class="detail-val" id="${rid}">${esc(trunc)}</div>`;
  if(more) respHtml+=`<span class="toggle" onclick="const c=window.__respCache['${rid}'];const el=$('${rid}');const isFull=el.dataset.f==='1';el.textContent=isFull?c.trunc:c.full;el.dataset.f=isFull?'0':'1';this.textContent=isFull?'Show more':'Show less'">Show more</span>`;
  const lat=data.latency_ms;
  const latTxt=typeof lat==='number'?(lat>=1000?(lat/1000).toFixed(1)+'s':lat+'ms'):'—';
  $('detailContent').innerHTML=`
    <div class="detail-row"><div class="detail-label">Agent × Model</div><div class="detail-val">${esc(agent)} × ${esc(model)}${verdictHtml}</div></div>
    <div class="detail-row"><div class="detail-label">Total Score</div><div class="score-big" style="color:${scoreColor(s)}">${Math.round(s)}</div></div>
    ${scoresHtml}
    <div class="detail-row"><div class="detail-label">Task</div><div class="detail-val">${esc(data.user_prompt||'—')}</div></div>
    <div class="detail-row"><div class="detail-label">System Role</div><div class="detail-val">${esc(data.system_prompt||'—')}</div></div>
    <div class="detail-row"><div class="detail-label">Model Response</div>${respHtml}</div>
    ${commentaryHtml}
    ${rubricHtml}
    <div class="detail-row"><div class="detail-label">Evaluator</div><div class="detail-val">${esc(data.evaluator||'—')}</div></div>
    <div class="detail-row"><div class="detail-label">Latency</div><div class="detail-val">${latTxt}</div></div>
    <div class="detail-row"><div class="detail-label">Tokens</div><div class="detail-val">Prompt: ${esc(data.tokens_prompt??0)} / Response: ${esc(data.tokens_response??0)}</div></div>
    <div class="detail-row"><div class="detail-label">Expected Keywords</div>${kwHtml||'<div class="detail-val">—</div>'}</div>
    <div class="detail-row"><div class="detail-label">Evaluated At</div><div class="detail-val">${data.evaluated_at?new Date(data.evaluated_at).toLocaleString('ru-RU'):'—'}</div></div>
  `;
}
// Show progress widget `pid` with `label` and animate its fill bar linearly
// from 0% to 100% over `ms` milliseconds; resolves when the animation time is up.
async function animateProgress(pid,label,ms){
  const pause=t=>new Promise(done=>setTimeout(done,t));
  const wrap=$(pid);
  const fill=$(pid+'Fill');
  const caption=$(pid+'Text');
  wrap.classList.remove('hidden');
  caption.textContent=label;
  fill.style.width='0%';
  // Short pauses separate the style writes before and after the transition is set.
  await pause(50);
  fill.style.transition=`width ${ms}ms linear`;
  await pause(50);
  fill.style.width='100%';
  await pause(ms);
  fill.style.transition='width .3s';
}
// Reveal progress widget `pid`, set its caption to `label` and its fill width to `percent`%.
function setProgress(pid,percent,label){
  const wrap=$(pid);
  const fill=$(pid+'Fill');
  const caption=$(pid+'Text');
  wrap.classList.remove('hidden');
  caption.textContent=label;
  fill.style.width=percent+'%';
}
// Overlay research results cached in localStorage ("__researchResults") onto
// reportData and add any new model names to the column list.
// Best effort: any storage or parse error is ignored.
function mergeCachedResults(){
  try{
    const raw=localStorage.getItem('__researchResults')||'{}';
    const cached=JSON.parse(raw);
    Object.entries(cached).forEach(([agent,rec])=>{
      // Cached agents that are not part of the current report are skipped.
      if(!reportData.agents[agent]) return;
      (rec.models||[]).forEach(r=>{
        reportData.agents[agent].evaluations[r.model]=r.score;
        if(!allModels.includes(r.model)) allModels.push(r.model);
      });
    });
    allModels.sort();
  }catch(e){}
}
/**
 * Render the agent × model score matrix into #matrix.
 *
 * Rows: agents with at least one positive score. Columns: allModels.
 * A scored cell opens the detail view; an empty cell opens the single-cell
 * evaluation dialog. The cell of the agent's assigned model gets the
 * "hm-cur" class and a "●" marker.
 */
function renderTable(){
  const agents=Object.values(reportData.agents).filter(a=>Object.values(a.evaluations).some(s=>s>0));
  // The assigned model is matched by name (last path segment of info[2]), not
  // by substring: with includes() an agent on "glm-5.1" also flagged the
  // "glm-5" column. The ":" / "-" variants mirror openAgentModal().
  const assignedTo=a=>((a.info&&a.info[2])||'').split('/').pop();
  const sameModel=(cur,m)=>!!cur&&(m===cur||cur.replace(':','-')===m||m.replace('-',':')===cur);
  let html='<table><thead><tr><th>Agent</th>';
  allModels.forEach(m=>html+=`<th>${m}</th>`);
  html+='<th>Best</th><th>Score</th></tr></thead><tbody>';
  agents.forEach(a=>{
    const cur=assignedTo(a);
    html+=`<tr><td>${a.name} <button class="btn-research" onclick="openAgentModal('${a.name}')" title="Research models">🔬</button></td>`;
    allModels.forEach(m=>{
      const score=a.evaluations[m];
      const isCur=sameModel(cur,m);
      let cls='na',text='—',click=`onclick="openCellModal('${a.name}','${m}')"`;
      if(score!==undefined&&score>0){cls=clsFor(score);text=Math.round(score);click=`onclick="openDetail('${a.name}','${m}')"`;}
      const curCls=isCur?' hm-cur':'';
      html+=`<td class="${cls}${curCls}" data-model="${m}" ${click}>${text}${isCur?' ●':''}</td>`;
    });
    html+=`<td>${a.best_model}</td><td style="font-weight:700">${Math.round(a.best_score)}</td></tr>`;
  });
  html+='</tbody></table>';
  $('matrix').innerHTML=html;
}
// Store a fresh score for agent/model in reportData, make sure the model has a
// column, and re-render the matrix.
function updateCell(agent,model,score){
  const entry=reportData.agents[agent];
  if(entry){
    entry.evaluations[model]=score;
  }
  const known=allModels.indexOf(model)!==-1;
  if(!known){
    allModels.push(model);
    allModels.sort();
  }
  renderTable();
}
// Poll GET /api/research/{jobId} every 2 s, at most 60 times, mirroring the
// job status in progress widget `pid`.
// Resolves with the job object once its status is "done" or "error", or with
// {status:'timeout'} when the attempts run out. Non-OK responses and network
// errors just lead to the next attempt.
async function pollJob(jobId,pid){
  const MAX_ATTEMPTS=60;
  const INTERVAL_MS=2000;
  let attempt=0;
  while(attempt<MAX_ATTEMPTS){
    attempt+=1;
    await new Promise(resolve=>setTimeout(resolve,INTERVAL_MS));
    try{
      const res=await fetch(`${API_BASE}/api/research/${jobId}`);
      if(res.ok){
        const job=await res.json();
        switch(job.status){
          case 'pending':
            setProgress(pid,25,'Waiting in queue...');
            break;
          case 'running':
            setProgress(pid,75,'Running evaluation...');
            break;
          case 'done':
            setProgress(pid,100,'Done!');
            return job;
          case 'error':
            setProgress(pid,100,'Error!');
            return job;
        }
      }
    }catch(e){
      console.warn('poll error',e);
    }
  }
  setProgress(pid,100,'Timeout');
  return {status:'timeout'};
}
/**
 * Submit a research job for the active agent and every checked model, wait
 * for it, then show the scores, update the matrix and cache the results.
 *
 * A job that ends with status "error" or "timeout" is reported as such; it is
 * no longer rendered as "Done! Best score: -1" and no longer replaces the
 * cached results with an empty list. Both action buttons are re-enabled on
 * every exit path.
 */
async function startAgentResearch(){
  const agent=window.__activeAgent;
  const models=[...$('agentModalList').querySelectorAll('input:checked')].map(i=>i.value);
  if(!models.length)return;
  const setBusy=busy=>{
    $('agentStartBtn').disabled=busy;
    $('evolveAgentBtn').disabled=busy;
  };
  setBusy(true);
  setProgress('agentProgress',10,'Submitting job...');
  try{
    let job;
    try{
      const res=await fetch(`${API_BASE}/api/research`,{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({agent,models})});
      if(!res.ok) throw new Error('API error '+res.status);
      job=await res.json();
      job=await pollJob(job.job_id,'agentProgress');
    }catch(e){
      $('agentResults').innerHTML='<p style="color:var(--red);margin-top:12px">API unavailable — run real-fit-engine.py to evaluate '+agent+'</p>';
      $('agentProgressText').textContent='Error: API unavailable';
      return;
    }
    // pollJob only returns "done", "error" or "timeout"; it has already put
    // "Error!" / "Timeout" into the progress caption for the failure cases.
    if(job.status!=='done'){
      const why=job.status==='timeout'?'timed out':'failed';
      $('agentResults').innerHTML='<p style="color:var(--red);margin-top:12px">Research job '+why+' — no scores were recorded.</p>';
      return;
    }
    const results=job.models_scored||[];
    if(!results.length){
      $('agentResults').innerHTML='<p style="color:var(--txt2);margin-top:12px">Job finished but returned no scores.</p>';
      $('agentProgressText').textContent='Done — no scores returned';
      return;
    }
    let html='<table class="result-table"><thead><tr><th>Model</th><th>Score</th></tr></thead><tbody>';
    let best=-1;
    results.forEach(r=>{if(r.score>best)best=r.score;});
    results.forEach(r=>{
      // Rows within 0.1 of the top score are highlighted as "best".
      const b=r.score>=best-0.1?'best':'';
      html+=`<tr class="${b}" onclick="openDetail('${agent}','${r.model}')"><td>${r.model}</td><td>${Math.round(r.score)}</td></tr>`;
      updateCell(agent,r.model,r.score);
    });
    html+='</tbody></table>';
    $('agentResults').innerHTML=html;
    $('agentProgressText').textContent='Done! Best score: '+Math.round(best);
    // The cache is a convenience; storage being unavailable or full must not
    // break the flow.
    try{
      const store=JSON.parse(localStorage.getItem('__researchResults')||'{}');
      store[agent]={models:results,ts:Date.now()};
      localStorage.setItem('__researchResults',JSON.stringify(store));
    }catch(e){
      console.warn('could not cache research results',e);
    }
  }finally{
    setBusy(false);
  }
}
/**
 * Start a role-fit test (POST /api/evolve-agent/start) for the active agent
 * and the checked models, wait for completion, then reload the report and
 * close the dialog.
 *
 * The polled job status is checked: an "error" or "timeout" job is shown as
 * an error instead of "Done!", and the dialog stays open. Both buttons are
 * re-enabled on every exit path, including a failing reload.
 */
async function startEvolveAgent(){
  const agent=window.__activeAgent;
  const models=[...$('agentModalList').querySelectorAll('input:checked')].map(i=>i.value);
  if(!models.length) return;
  const setBusy=busy=>{
    $('evolveAgentBtn').disabled=busy;
    $('agentStartBtn').disabled=busy;
  };
  setBusy(true);
  setProgress('agentProgress',10,'Submitting evolve-agent job...');
  try{
    const res=await fetch(`${API_BASE}/api/evolve-agent/start`,{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({agent,models})});
    if(!res.ok) throw new Error('API error '+res.status);
    setProgress('agentProgress',50,'Running role-fit test...');
    const result=await res.json();
    if(result.job_id){
      const job=await pollJob(result.job_id,'agentProgress');
      if(job.status!=='done'){
        throw new Error(job.status==='timeout'?'job timed out':'job failed');
      }
    }else{
      // No job id in the response: nothing to poll, just show a short animation.
      await animateProgress('agentProgress','Processing...',2000);
    }
    setProgress('agentProgress',100,'Done!');
    await load();
    closeModal('researchAgentModal');
  }catch(e){
    console.error('evolve-agent error',e);
    setProgress('agentProgress',100,'Error: '+e.message);
  }finally{
    setBusy(false);
  }
}
/**
 * Evaluate the active agent/model pair (POST /api/research/cell), wait for
 * the job, then show the score, update the matrix cell and cache the result.
 *
 * A failed, timed-out or score-less job no longer produces a fabricated
 * score of 0 that is written into the matrix and the cache. The Evaluate
 * button is re-enabled on every exit path.
 */
async function startCellResearch(){
  const agent=window.__activeAgent, model=window.__activeModel;
  $('cellStartBtn').disabled=true;
  setProgress('cellProgress',10,'Submitting...');
  try{
    let job;
    try{
      const res=await fetch(`${API_BASE}/api/research/cell`,{method:'POST',headers:{'Content-Type':'application/json'},body:JSON.stringify({agent,model})});
      if(!res.ok) throw new Error('API error '+res.status);
      job=await res.json();
      job=await pollJob(job.job_id,'cellProgress');
    }catch(e){
      $('cellResults').innerHTML='<p style="color:var(--red);margin-top:12px">API unavailable — run real-fit-engine.py to evaluate '+agent+'</p>';
      $('cellProgressText').textContent='Error: API unavailable';
      return;
    }
    // pollJob has already put "Error!" / "Timeout" into the progress caption.
    if(job.status!=='done'){
      const why=job.status==='timeout'?'timed out':'failed';
      $('cellResults').innerHTML='<p style="color:var(--red);margin-top:12px">Evaluation '+why+' — no score was recorded.</p>';
      return;
    }
    const result=(job.models_scored||[])[0];
    if(!result){
      $('cellResults').innerHTML='<p style="color:var(--txt2);margin-top:12px">Job finished but returned no score.</p>';
      $('cellProgressText').textContent='Done — no score returned';
      return;
    }
    updateCell(agent,result.model,result.score);
    $('cellResults').innerHTML=`<table class="result-table"><tbody><tr onclick="openDetail('${agent}','${result.model}')"><td>${result.model}</td><td>${Math.round(result.score)}</td></tr></tbody></table>`;
    $('cellProgressText').textContent='Done! Score: '+Math.round(result.score);
    // Replace any older cached entry for the same model; caching is best effort.
    try{
      const store=JSON.parse(localStorage.getItem('__researchResults')||'{}');
      if(!store[agent]) store[agent]={models:[],ts:Date.now()};
      store[agent].models=store[agent].models.filter(m=>m.model!==result.model);
      store[agent].models.push(result);
      localStorage.setItem('__researchResults',JSON.stringify(store));
    }catch(e){
      console.warn('could not cache research result',e);
    }
  }finally{
    $('cellStartBtn').disabled=false;
  }
}
// Kick off the initial load; a failure is shown in place of the matrix.
load().catch(function(err){
  $('matrix').innerHTML='Error: '+err;
});
</script>
</body>
</html>

View File

@@ -0,0 +1,4 @@
fastapi==0.136.3
uvicorn==0.48.0
python-multipart==0.0.29
pydantic==2.13.4

View File

@@ -0,0 +1,138 @@
const fs = require('fs');
/**
 * Parse the leading `---` … `---` frontmatter of a markdown file into a flat
 * object of `key: value` string pairs.
 *
 * Only single-line `word: value` entries are recognised; values are trimmed
 * but otherwise kept verbatim (surrounding quotes are not removed).
 *
 * @param {string} content full file text
 * @returns {Object<string,string>|null} parsed pairs, or null when the text
 *   does not start with `---` or has no closing delimiter
 */
function parseFrontmatter(content) {
  if (!content.startsWith('---')) return null;
  // The closing delimiter must start a line; a bare indexOf('---') would also
  // stop at a "---" inside a value and truncate the frontmatter.
  const end = content.indexOf('\n---', 3);
  if (end === -1) return null;
  const fm = content.slice(3, end).trim();
  const data = {};
  // Split on LF and CRLF: with a plain '\n' split every line of a CRLF file
  // kept a trailing '\r', which `.` does not match, so the key was dropped.
  for (const line of fm.split(/\r?\n/)) {
    const m = line.match(/^(\w+):\s*(.+)$/);
    if (m) data[m[1]] = m[2].trim();
  }
  return data;
}
/**
 * Remove `//` line comments and `/* … *​/` block comments from JSONC text so
 * it can be handed to JSON.parse.
 *
 * Double-quoted string literals are copied verbatim, so values such as
 * "https://example.com" survive. (The previous regex removed everything after
 * any `//`, including inside strings, which corrupted such values.)
 * Line terminators after a line comment are kept.
 *
 * @param {string} str JSONC source text
 * @returns {string} the text without comments
 */
function stripComments(str) {
  const n = str.length;
  let out = '';
  let i = 0;
  while (i < n) {
    const ch = str[i];
    const next = str[i + 1];
    if (ch === '"') {
      // Copy the whole string literal, honouring backslash escapes.
      let j = i + 1;
      while (j < n && str[j] !== '"') {
        if (str[j] === '\\') j++;
        j++;
      }
      out += str.slice(i, j + 1);
      i = j + 1;
    } else if (ch === '/' && next === '/') {
      // Line comment: skip up to (not including) the line terminator.
      while (i < n && str[i] !== '\n' && str[i] !== '\r') i++;
    } else if (ch === '/' && next === '*') {
      // Block comment: skip through the closing marker (or to end of input).
      const close = str.indexOf('*/', i + 2);
      i = close === -1 ? n : close + 2;
    } else {
      out += ch;
      i++;
    }
  }
  return out;
}
/**
 * Read every `.md` file in `dir` and return one record per file whose
 * frontmatter declares a `model`.
 *
 * @param {string} dir directory to scan (relative to the working directory)
 * @param {string} defaultMode value for `mode` when the frontmatter has none
 * @returns {Array<{name:string,model:string,mode:string,source:string,description:string}>}
 */
function loadDefinitions(dir, defaultMode) {
  const defs = [];
  for (const f of fs.readdirSync(dir).filter(f => f.endsWith('.md'))) {
    const source = dir + '/' + f;
    const fm = parseFrontmatter(fs.readFileSync(source, 'utf8'));
    if (fm && fm.model) {
      defs.push({
        // Strip the extension only; replace('.md', '') removed the first
        // ".md" anywhere in the file name.
        name: f.slice(0, -'.md'.length),
        model: fm.model,
        mode: fm.mode || defaultMode,
        source,
        description: fm.description || ''
      });
    }
  }
  return defs;
}

// 1. Parse agent .md files
const agents = loadDefinitions('.kilo/agents', 'subagent');

// 2. Parse command .md files
const commands = loadDefinitions('.kilo/commands', 'command');

// Human-readable descriptions of every model mismatch found below.
const issues = [];

// 3. Parse kilo-meta.json and compare its models with the frontmatter models
const meta = JSON.parse(fs.readFileSync('kilo-meta.json', 'utf8'));
for (const a of agents) {
  const m = meta.agents?.[a.name];
  if (m) {
    a.metaModel = m.model;
    if (a.model !== m.model) issues.push(`AGENT ${a.name}: .md=${a.model} vs meta=${m.model}`);
  }
}
for (const c of commands) {
  const m = meta.commands?.[c.name];
  if (m) {
    c.metaModel = m.model;
    if (c.model !== m.model) issues.push(`CMD ${c.name}: .md=${c.model} vs meta=${m.model}`);
  }
}

// 4. Parse .kilo/kilo.jsonc and compare agent models
const dotKiloRaw = stripComments(fs.readFileSync('.kilo/kilo.jsonc', 'utf8'));
const dotKilo = JSON.parse(dotKiloRaw);
for (const [name, cfg] of Object.entries(dotKilo.agent || {})) {
  if (!cfg.model) continue;
  const agent = agents.find(a => a.name === name);
  if (agent) {
    agent.kiloModel = cfg.model;
    if (agent.model !== cfg.model) issues.push(`AGENT ${name}: .md=${agent.model} vs .kilo/kilo.jsonc=${cfg.model}`);
  }
}

// 5. Parse root kilo.jsonc
// NOTE(review): this looks up the entries of the `agent` section among the
// *commands* — confirm that is intended and not meant to read a command section.
const rootKiloRaw = stripComments(fs.readFileSync('kilo.jsonc', 'utf8'));
const rootKilo = JSON.parse(rootKiloRaw);
for (const [name, cfg] of Object.entries(rootKilo.agent || {})) {
  if (!cfg.model) continue;
  const cmd = commands.find(c => c.name === name);
  if (cmd) {
    cmd.rootModel = cfg.model;
    if (cmd.model !== cfg.model) issues.push(`CMD ${name}: .md=${cmd.model} vs kilo.jsonc=${cfg.model}`);
  }
}

// 6. Collect every model that is not served through the ollama-cloud/ provider
const nonOllama = [];
for (const a of agents) if (!a.model.startsWith('ollama-cloud/')) nonOllama.push({type:'agent', name:a.name, model:a.model});
for (const c of commands) if (!c.model.startsWith('ollama-cloud/')) nonOllama.push({type:'command', name:c.name, model:c.model});

// 7. Summary by model: how many agents + commands use each model
const modelStats = {};
for (const a of agents) modelStats[a.model] = (modelStats[a.model] || 0) + 1;
for (const c of commands) modelStats[c.model] = (modelStats[c.model] || 0) + 1;

const state = {
  generated: new Date().toISOString(),
  totalAgents: agents.length,
  totalCommands: commands.length,
  allOllama: nonOllama.length === 0,
  modelDistribution: modelStats,
  agents: agents.sort((a,b) => a.name.localeCompare(b.name)),
  commands: commands.sort((a,b) => a.name.localeCompare(b.name)),
  issues: issues,
  nonOllama: nonOllama
};
fs.writeFileSync('agent-evolution/data/real-state.json', JSON.stringify(state, null, 2) + '\n');

// Console report
console.log('=== REAL SYSTEM STATE ===');
console.log('Generated:', state.generated);
console.log('Agents:', state.totalAgents);
console.log('Commands:', state.totalCommands);
console.log('All ollama-cloud/:', state.allOllama ? 'YES' : 'NO (' + nonOllama.length + ' exceptions)');
console.log('\n=== MODEL DISTRIBUTION ===');
for (const [m, c] of Object.entries(modelStats).sort((a,b) => b[1]-a[1])) {
  console.log(`  ${m}: ${c}`);
}
if (issues.length > 0) {
  console.log('\n=== ISSUES ===');
  issues.forEach(i => console.log('  ⚠️', i));
}
if (nonOllama.length > 0) {
  // Heading previously misspelled as "NON-OLLOMA".
  console.log('\n=== NON-OLLAMA ===');
  nonOllama.forEach(n => console.log('  ❌', n.type, n.name, n.model));
}
console.log('\n✅ State written to agent-evolution/data/real-state.json');

View File

@@ -0,0 +1,29 @@
const fs = require('fs');
const path = require('path');
// Merge the cross-model "real fit" evaluations into the dashboard data file.
// Input and output are the same file: dashboard-data.json is rewritten in place.
const DASH = path.join(__dirname, '../data/dashboard-data.json');
const REAL = path.join(__dirname, '../data/real-fit-report.json');
const OUT = path.join(__dirname, '../data/dashboard-data.json');

const readJson = file => JSON.parse(fs.readFileSync(file, 'utf-8'));
const dash = readJson(DASH);
const real = readJson(REAL);

// Inject real_evaluations (plus best model/score when available) into each agent;
// agents without real-fit data get an empty evaluations map.
const realAgents = real.agents;
for (const a of dash.agents) {
  const r = realAgents == null ? undefined : realAgents[a.name];
  if (!(r && r.evaluations)) {
    a.real_evaluations = {};
    continue;
  }
  a.real_evaluations = r.evaluations;
  a.real_best_model = r.best_model;
  a.real_best_score = r.best_score;
}

// Add metadata describing where the real-fit data came from
dash.real_fit_generated = real.generated;
dash.real_fit_source = real.source;

fs.writeFileSync(OUT, JSON.stringify(dash, null, 2));
console.log('Merged real-fit data into ' + OUT);

let withEvals = 0;
for (const a of dash.agents) {
  if (Object.keys(a.real_evaluations||{}).length > 0) withEvals += 1;
}
console.log('Agents with real evals:', withEvals);

View File

@@ -0,0 +1,98 @@
const fs = require('fs');
const path = require('path');
// One-off patcher: rewrites ../index.standalone.html in place so that its
// heatmap reads the real-fit report.
// NOTE(review): the script does not look idempotent. Patch B's anchor is still
// present after patching, so a second run inserts the loader snippet again;
// Patch A's pattern also still matches the already-patched text — confirm
// before re-running on a patched file.
const INDEX = path.join(__dirname, '../index.standalone.html');
// 1. New renderHeatmap that reads real-fit data.
// The template literal below is written into the HTML file verbatim; it is
// browser code and is never executed by this script.
const newRenderHeatmap = `function renderHeatmap() {
const esc = str => (str || '').replace(/[&<>"']/g, m => ({'&':'&amp;','<':'&lt;','>':'&gt;','"':'&quot;',"'":'&#39;'}[m]));
const dd = window.dashboardData;
// Merge real-fit if loaded
const rf = window.realFitData || {};
const realAgents = rf.agents || {};
if (!dd || !dd.agents) {
document.getElementById('hmTable').innerHTML = '<tr><td style="color:var(--text-secondary);padding:20px;text-align:center;">⚠️ No data. Run analysis.</td></tr>';
return;
}
// Build model list from real-fit (cross-model) + current dashboard data
const modelsSeen = new Set();
dd.agents.forEach(a => { modelsSeen.add(a.model_short); });
Object.values(realAgents).forEach(a => { Object.keys(a.evaluations || {}).forEach(m => modelsSeen.add(m)); });
const modelList = Array.from(modelsSeen).filter(m => m && m !== 'code-skeptic');
const t = document.getElementById('hmTable');
let h = '<thead><tr><th class="hm-role">Agent</th>';
modelList.forEach(m => {
h += '<th style="writing-mode:vertical-lr;transform:rotate(180deg);max-width:32px;font-size:.56em;padding:3px 1px;">' + esc(m) + '</th>';
});
h += '<th>Best</th><th>Score</th></tr></thead><tbody>';
dd.agents.forEach(a => {
const realAgent = realAgents[a.name];
h += '<tr><td class="hm-r">' + esc(a.name) + '</td>';
modelList.forEach(m => {
let score = 0;
if (realAgent && realAgent.evaluations && realAgent.evaluations[m] > 0) {
score = Math.round(realAgent.evaluations[m]);
}
const isCurrent = a.model_short === m;
let cls = 'na';
if (score >= 90) cls = 'high';
else if (score >= 75) cls = 'good';
else if (score >= 50) cls = 'med';
else if (score > 0) cls = 'low';
const display = score > 0 ? score : (isCurrent ? Math.round(a.fit_score || 0) : '·');
const curStyle = isCurrent ? 'box-shadow:inset 0 0 0 2px var(--accent-cyan)' : '';
h += '<td class="score ' + cls + '" style="' + curStyle + '">' + display + '</td>';
});
const bestModel = realAgent ? (realAgent.best_model || a.model_short) : a.model_short;
const bestScore = realAgent ? Math.round(realAgent.best_score || 0) : Math.round(a.fit_score || 0);
h += '<td>' + esc(bestModel) + '</td><td style="font-weight:700">' + bestScore + '</td></tr>';
});
t.innerHTML = h + '</tbody>';
}`;
// 2. Add loadRealFitData script after dashboard load.
// Snippet (also inserted verbatim) that fetches data/real-fit-report.json into
// window.realFitData and only warns when the fetch fails.
const loadRealFitData = `
// Load real-fit report for cross-model evaluation
try {
const rfRes = await fetch('data/real-fit-report.json');
if (rfRes.ok) window.realFitData = await rfRes.json();
} catch(e) { console.warn('real-fit-report.json not loaded:', e.message); }
`;
let html = fs.readFileSync(INDEX, 'utf-8');
// Patch A: replace renderHeatmap function
// The pattern starts at a "// Render Heatmap" comment, lazily reaches the
// function header and then ends at the second "}" that follows the function's
// opening brace (with at least one "{" before the first "}").
// NOTE(review): this only covers the whole function when the existing body has
// exactly that brace shape — confirm against the target file.
const oldPattern = /\/\/ Render Heatmap[\s\S]*?function renderHeatmap\(\)\s*\{[^}]*\{[^}]*\}[^}]*\}/;
const oldMatch = html.match(oldPattern);
if (oldMatch) {
html = html.substring(0, oldMatch.index) + '// Render Heatmap (real-fit enabled)\n' + newRenderHeatmap + html.substring(oldMatch.index + oldMatch[0].length);
console.log('Patched renderHeatmap');
} else {
console.log('Pattern A not found, trying fallback...');
// Fallback: find and replace the specific renderHeatmap block
// by counting braces from the function header to its matching "}".
// Every "{" / "}" character is counted, including any inside strings or comments.
const start = html.indexOf('function renderHeatmap() {');
if (start !== -1) {
let brace = 0, end = start;
for (let i = start; i < html.length; i++) {
if (html[i] === '{') brace++;
else if (html[i] === '}') { brace--; if (brace === 0) { end = i + 1; break; } }
}
html = html.substring(0, start) + newRenderHeatmap + '\n' + html.substring(end);
console.log('Patched renderHeatmap (fallback)');
}
}
// Patch B: insert real-fit loading after dashboard load
// (anchored on the first statement that assigns window.dashboardData; nothing
// is logged or changed when the anchor is absent).
const dashLoadPattern = /window\.dashboardData = await dashRes\.json\(\);/;
if (dashLoadPattern.test(html)) {
html = html.replace(dashLoadPattern, 'window.dashboardData = await dashRes.json();\n' + loadRealFitData.trim());
console.log('Patched init() to load real-fit data');
}
// Write the result back even when neither patch matched.
fs.writeFileSync(INDEX, html);
console.log('Done — ' + (fs.statSync(INDEX).size / 1024).toFixed(1) + ' KB');

View File

@@ -0,0 +1,173 @@
#!/usr/bin/env python3
"""
Rebuild real-fit-report.json from SQLite DB.
Usage:
python3 rebuild-report.py
python3 rebuild-report.py --db /path/to/real-fit.db --report /path/to/real-fit-report.json
"""
import argparse
import json
import sqlite3
import time
from datetime import datetime, timezone
from pathlib import Path
def _sync_agents_from_meta(db_path: Path) -> None:
    """Import any agents missing from the DB ``agents`` table from kilo-meta.json.

    The meta file is looked up three directory levels above ``db_path``
    (``db_path.parent.parent.parent / "kilo-meta.json"``). When it does not
    exist the function does nothing. Agents already present in the table are
    left untouched; new ones are inserted with defaults for missing fields
    (category ``"meta"``, colour ``"#6B7280"``, empty description/model).

    Args:
        db_path: Path to the SQLite database containing the ``agents`` table.

    Raises:
        sqlite3.Error: If the database cannot be read or written (for example
            when the ``agents`` table is missing).
        json.JSONDecodeError: If kilo-meta.json is not valid JSON.
    """
    meta_path = db_path.parent.parent.parent / "kilo-meta.json"
    if not meta_path.exists():
        return
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(meta_path, encoding="utf-8") as f:
        meta = json.load(f)

    conn = sqlite3.connect(str(db_path))
    try:
        existing = {row[0] for row in conn.execute("SELECT name FROM agents")}
        now = datetime.now(timezone.utc).isoformat()
        new_rows = [
            (
                name,
                info.get("description", ""),
                info.get("category", "meta"),
                info.get("model", ""),
                info.get("color", "#6B7280"),
                now,
            )
            for name, info in meta.get("agents", {}).items()
            if name not in existing
        ]
        if new_rows:
            conn.executemany(
                "INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
                new_rows,
            )
        conn.commit()
    finally:
        # Always release the connection, even when a query fails.
        conn.close()
def build_report(db_path: Path) -> dict:
    """Build the real-fit report dictionary from the SQLite database.

    Steps:
      1. Sync missing agents from kilo-meta.json into the ``agents`` table.
      2. Select one score per (agent, model) pair from ``evaluations``,
         ignoring non-positive scores, ``rubric_v1`` evaluators and responses
         that are empty or contain ``[HTTP ``. Within a pair the evaluator
         preference is evolution-skeptic, then rubric_v2, then anything else;
         ties are broken by the highest score.
      3. Derive the best model per agent from those selected scores.

    ``fit_scores`` is computed from the same selected scores as the per-agent
    ``best_model`` / ``best_score``. It was previously filled from a separate
    GROUP BY query and overwritten once per model, so it ended up describing
    whichever model the query returned last rather than the best one.

    Args:
        db_path: Path to the SQLite database.

    Returns:
        A dict with the keys ``generated``, ``source``, ``total_evaluations``,
        ``agents`` and ``fit_scores``.

    Raises:
        sqlite3.Error: If a query fails (for example a missing table).
    """
    _sync_agents_from_meta(db_path)

    conn = sqlite3.connect(str(db_path))
    conn.row_factory = sqlite3.Row
    try:
        agents_meta = {
            row["name"]: dict(row)
            for row in conn.execute("""
                SELECT name, description, category, current_model
                FROM agents
            """)
        }

        # Rows arrive ordered so that the first row of every (agent, model)
        # pair is the one to keep.
        eval_rows = conn.execute("""
            SELECT agent_name, model, total_score, evaluator, response
            FROM evaluations
            WHERE total_score > 0
              AND evaluator NOT LIKE '%rubric_v1%'
              AND (response IS NULL
                   OR (response NOT LIKE '%[HTTP %' AND response != ''))
            ORDER BY agent_name, model,
                CASE evaluator
                    WHEN 'evolution-skeptic' THEN 0
                    WHEN 'rubric_v2' THEN 1
                    ELSE 2
                END,
                total_score DESC
        """).fetchall()
    finally:
        # Release the connection even when a query raises.
        conn.close()

    # First row per agent-model wins (preferred evaluator, highest score).
    best_evals = {}
    for row in eval_rows:
        best_evals.setdefault(row["agent_name"], {}).setdefault(
            row["model"], row["total_score"]
        )

    # Best model per agent, based on the selected evaluations only.
    fit_scores = {}
    for agent_name, evals in best_evals.items():
        top_model = max(evals, key=evals.get)
        top_score = evals[top_model]
        fit_scores[agent_name] = {
            "model": top_model,
            "fit": top_score,
            "explanation": (
                f"Best model for {agent_name} is {top_model} "
                f"with score {top_score:.1f}. "
                "Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
            ),
        }

    agents_report = {}
    for agent_name, meta in agents_meta.items():
        evals = best_evals.get(agent_name, {})
        if evals:
            best_model = max(evals, key=evals.get)
            best_score = evals[best_model]
        else:
            best_model = ""
            best_score = 0.0
        agents_report[agent_name] = {
            "name": agent_name,
            "evaluations": evals,
            # Positional: [description, category, current model]; the
            # dashboard reads the model from index 2.
            "info": [
                meta.get("description") or "",
                meta.get("category") or "",
                meta.get("current_model") or "",
            ],
            "best_model": best_model,
            "best_score": best_score,
        }

    total_evals = sum(len(evals) for evals in best_evals.values())
    generated = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    return {
        "generated": generated,
        "source": "real-fit-engine-db-filtered",
        "total_evaluations": total_evals,
        "agents": agents_report,
        "fit_scores": fit_scores,
    }
def main():
    """Parse the command line, rebuild the report and write it as JSON.

    ``--db`` and ``--report`` default to ``real-fit.db`` and
    ``real-fit-report.json`` in the ``data`` directory next to this script's
    parent directory. The output directory is created when missing.
    """
    data_dir = Path(__file__).parent.parent / "data"

    parser = argparse.ArgumentParser(description="Rebuild real-fit-report.json from DB")
    parser.add_argument(
        "--db",
        type=Path,
        default=data_dir / "real-fit.db",
        help="Path to SQLite DB",
    )
    parser.add_argument(
        "--report",
        type=Path,
        default=data_dir / "real-fit-report.json",
        help="Path to report JSON output",
    )
    options = parser.parse_args()

    report = build_report(options.db)

    target = options.report
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w", encoding="utf-8") as out:
        json.dump(report, out, indent=2)

    agent_count = len(report["agents"])
    print(f"Report rebuilt: {target}")
    print(f"Agents: {agent_count}, Evaluations: {report['total_evaluations']}")
if __name__ == "__main__":
main()