Files
APAW/scripts/run-focused-eval.py
Deploy Bot 4071551476 feat(scripts): add real-fit evaluation engine and supporting test scripts
- real-fit-engine.py: refactored to support --from-report, improved Ollama v1/chat/completions compatibility, agent name normalization
- run-focused-eval.py: run evaluations for specific agent/model pairs from CLI
- test_ollama_minimal.py/test_real_api.py: Ollama API connectivity tests
- real-fit-architecture.md: architecture overview document
- tests/scripts/: E2E landing test, analytics capture, evolution heatmap verification
- Remove real-fit-recalc.py (superseded by --from-report flag)
2026-05-28 11:57:46 +01:00

132 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Focused Real-Fit Eval Runner v2
Evaluates key agents × models using real-fit-engine.py (the fixed version).
"""
import sqlite3, json, os, sys, importlib.util
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
# SECURITY: an API key used to be hard-coded here as the default value of
# OLLAMA_KEY. Credentials must not live in source control, so the key now has
# to be supplied through the environment. The previously committed key should
# be treated as leaked and revoked.
if not os.environ.get("OLLAMA_KEY"):
    print(
        "[focused] WARNING: OLLAMA_KEY is not set; export it before running evaluations",
        file=sys.stderr,
    )
os.environ.setdefault("OLLAMA_HOST", "https://ollama.com/v1")

# real-fit-engine.py has dashes in its file name, so it cannot be imported
# with a plain `import` statement; load it from its path via importlib.
# The environment defaults above are set before the module body executes.
_engine_path = os.path.join(os.path.dirname(__file__), "real-fit-engine.py")
_spec = importlib.util.spec_from_file_location("rfe", _engine_path)
_rfe = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_rfe)

# Re-export the engine entry points used by this runner.
call_ollama = _rfe.call_ollama
evaluate_response = _rfe.evaluate_response
compute_aggregates = _rfe.compute_aggregates
generate_report = _rfe.generate_report
DB_PATH = _rfe.DB_PATH
# Agent names this focused run is restricted to; they are matched against the
# agent_name column when selecting prompts and cleaning old results.
AGENTS = [
    "code-skeptic",
    "lead-developer",
    "system-analyst",
    "sdet-engineer",
    "orchestrator",
    "devops-engineer",
    "workflow-cross-checker",
]

# Model identifiers; every selected prompt is evaluated once per model.
MODELS = [
    "kimi-k2.6",
    "deepseek-v4-pro-max",
    "qwen3-coder:480b",
    "glm-5.1",
]
def fetch_agent_tasks():
    """Load the test prompts belonging to the agents listed in ``AGENTS``.

    Returns:
        A list of row tuples ``(id, agent_name, system_prompt, user_prompt,
        expected_keywords, rubric)`` read from the ``test_prompts`` table.
    """
    # One "?" placeholder per agent so the names are bound as parameters.
    placeholders = ','.join('?' * len(AGENTS))
    conn = sqlite3.connect(DB_PATH)
    try:
        return conn.execute(f"""
SELECT id, agent_name, system_prompt, user_prompt, expected_keywords, rubric
FROM test_prompts WHERE agent_name IN ({placeholders})
""", tuple(AGENTS)).fetchall()
    finally:
        # Close the connection even when the query raises.
        conn.close()
def eval_single(agent_name, model, prompt_id, system, user, expected_json, rubric_json):
    """Run one prompt against one model and score the reply.

    Calls ``call_ollama`` with the model and both prompts, passes the reply to
    ``evaluate_response`` together with the expected keywords and rubric, and
    returns a flat dict describing the evaluation. The stored response text is
    truncated to its first 3000 characters.
    """
    reply, elapsed, usage = call_ollama(model, system, user)
    verdict = evaluate_response(reply, expected_json, rubric_json)
    record = dict(
        agent=agent_name,
        model=model,
        prompt_id=prompt_id,
        response_text=reply[:3000],
        latency_ms=elapsed,
        tokens_prompt=usage['prompt'],
        tokens_response=usage['response'],
        total_score=verdict['total'],
        scores_json=json.dumps(verdict['scores']),
        explanation=verdict['explanation'],
        evaluated_at=datetime.now(timezone.utc).isoformat(),
        evaluator='rubric_v2',
    )
    return record
def save_single(res):
    """Insert one evaluation result into the ``evaluations`` table and log it.

    Args:
        res: Result dict with the keys produced by ``eval_single`` (``agent``,
            ``model``, ``prompt_id``, ``response_text``, ``latency_ms``,
            ``tokens_prompt``, ``tokens_response``, ``scores_json``,
            ``total_score``, ``explanation``, ``evaluated_at``, ``evaluator``).

    Raises:
        sqlite3.Error: If the insert or commit fails; the transaction is rolled
            back and the connection is still closed.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        # The connection context manager commits on success and rolls back on
        # error; it does not close the connection, hence the finally below.
        with conn:
            conn.execute("""
INSERT INTO evaluations
(agent_name, model, prompt_id, response, latency_ms, tokens_prompt, tokens_response,
scores, total_score, explanation, evaluated_at, evaluator)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
""", (res['agent'], res['model'], res['prompt_id'], res['response_text'], res['latency_ms'],
      res['tokens_prompt'], res['tokens_response'],
      res['scores_json'], res['total_score'], res['explanation'],
      res['evaluated_at'], res['evaluator']))
    finally:
        conn.close()
    print(f" [{res['agent']}] x [{res['model']}] score={res['total_score']:.1f} lat={res['latency_ms']}ms len={len(res['response_text'])}")
def run_focused_eval(max_workers=4):
    """Evaluate every selected prompt against every model in ``MODELS``.

    Prompts are fetched with ``fetch_agent_tasks``; each (prompt, model) pair is
    submitted to a thread pool running ``eval_single``. Results are saved with
    ``save_single`` in the calling thread as they complete. A failing pair is
    reported with its agent, model and prompt id plus a traceback, counted as
    an error, and does not stop the run. ``compute_aggregates`` is called once
    all pairs have finished.

    Args:
        max_workers: Size of the thread pool used for the model calls.
    """
    import traceback

    tasks = fetch_agent_tasks()
    print(f"[focused] Agents: {len(AGENTS)} | Models: {len(MODELS)} | Prompts: {len(tasks)}")
    print(f"[focused] Total evaluations: {len(tasks) * len(MODELS)}")

    # Cross product of prompts and models, in eval_single's argument order.
    work_items = [
        (aname, model, pid, system, user, exp_json, rub_json)
        for pid, aname, system, user, exp_json, rub_json in tasks
        for model in MODELS
    ]

    completed = 0
    errors = 0
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = {ex.submit(eval_single, *w): w for w in work_items}
        for future in as_completed(futures):
            try:
                res = future.result()
                save_single(res)
            except Exception:
                # Best-effort run: name the failing pair, log it and carry on.
                errors += 1
                aname, model, pid = futures[future][:3]
                print(f"[focused] FAILED [{aname}] x [{model}] prompt_id={pid}", file=sys.stderr)
                traceback.print_exc()
                continue
            completed += 1
            if completed % 4 == 0:
                print(f"[focused] Progress: {completed}/{len(work_items)}")

    print(f"[focused] Completed {completed}/{len(work_items)} (errs={errors})")
    compute_aggregates()
if __name__ == '__main__':
    # Script entry point: print a banner, wipe earlier results for the
    # selected agents, run the focused evaluation and generate the report.
    print("="*60)
    print("FOCUSED REAL-FIT EVALUATION v2")
    print(f"Models: {', '.join(MODELS)}")
    print(f"Agents: {', '.join(AGENTS)}")
    print(f"API: {os.environ['OLLAMA_HOST']}")
    print("="*60)

    # Clean old evaluations. Agent names are bound as parameters instead of
    # being formatted into the SQL text, and both deletes share a single
    # transaction so a failure cannot leave the tables half-cleaned.
    placeholders = ','.join('?' * len(AGENTS))
    conn = sqlite3.connect(DB_PATH)
    try:
        with conn:
            conn.execute(f"DELETE FROM evaluations WHERE agent_name IN ({placeholders})", tuple(AGENTS))
            conn.execute(f"DELETE FROM fit_scores WHERE agent_name IN ({placeholders})", tuple(AGENTS))
    finally:
        conn.close()
    print("[focused] Cleaned old evaluations")

    run_focused_eval(max_workers=4)
    report = generate_report()
    print(f"[focused] Report generated with {len(report.get('agents',{}))} agents")