- real-fit-engine.py: refactored to support --from-report, improved Ollama v1/chat/completions compatibility, agent name normalization - run-focused-eval.py: run evaluations for specific agent/model pairs from CLI - test_ollama_minimal.py/test_real_api.py: Ollama API connectivity tests - real-fit-architecture.md: architecture overview document - tests/scripts/: E2E landing test, analytics capture, evolution heatmap verification - Remove real-fit-recalc.py (superseded by --from-report flag)
132 lines
4.7 KiB
Python
132 lines
4.7 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Focused Real-Fit Eval Runner v2
|
||
Evaluates key agents × models using real-fit-engine.py (the fixed version).
|
||
"""
|
||
import sqlite3, json, os, sys, importlib.util
|
||
from datetime import datetime, timezone
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
|
||
# SECURITY: API credentials must never be committed to source control.
# OLLAMA_KEY has to be supplied by the environment (shell export, secret
# manager, CI variable). The key that used to be hard-coded here as a default
# must be treated as leaked and rotated.
if not os.environ.get("OLLAMA_KEY"):
    # Warn instead of aborting so importing this module stays side-effect light;
    # the API calls themselves will surface the authentication failure.
    print("[focused] WARNING: OLLAMA_KEY is not set; Ollama API calls may fail to authenticate",
          file=sys.stderr)

# The endpoint is not secret, so a default is fine; an existing value wins.
os.environ.setdefault("OLLAMA_HOST", "https://ollama.com/v1")
|
||
|
||
# real-fit-engine.py has a dash in its file name, so a plain ``import``
# statement cannot name it. Load it by path (it sits next to this script)
# and register it under the module name "rfe".
_ENGINE_PATH = os.path.join(os.path.dirname(__file__), "real-fit-engine.py")
_spec = importlib.util.spec_from_file_location("rfe", _ENGINE_PATH)
_rfe = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_rfe)  # runs the engine module's top-level code

# Short local aliases for the engine entry points this script uses.
call_ollama = _rfe.call_ollama
evaluate_response = _rfe.evaluate_response
compute_aggregates = _rfe.compute_aggregates
generate_report = _rfe.generate_report
DB_PATH = _rfe.DB_PATH
|
||
|
||
# Agent names whose prompts are evaluated; used to filter ``test_prompts``
# rows and to select which old evaluation rows are cleaned before a run.
AGENTS = [
    "code-skeptic",
    "lead-developer",
    "system-analyst",
    "sdet-engineer",
    "orchestrator",
    "devops-engineer",
    "workflow-cross-checker",
]
|
||
|
||
# Model identifiers passed to ``call_ollama``; every selected prompt is run
# once against each of these.
MODELS = [
    "kimi-k2.6",
    "deepseek-v4-pro-max",
    "qwen3-coder:480b",
    "glm-5.1",
]
|
||
|
||
def fetch_agent_tasks():
    """Load the test prompts that belong to the agents listed in ``AGENTS``.

    Returns:
        A list of row tuples ``(id, agent_name, system_prompt, user_prompt,
        expected_keywords, rubric)`` read from the ``test_prompts`` table of
        the SQLite database at ``DB_PATH``.

    Raises:
        sqlite3.Error: if the database cannot be opened or queried.
    """
    # One "?" per agent: names are bound as parameters, never spliced into SQL.
    placeholders = ','.join('?' * len(AGENTS))
    conn = sqlite3.connect(DB_PATH)
    try:
        return conn.execute(f"""
            SELECT id, agent_name, system_prompt, user_prompt, expected_keywords, rubric
            FROM test_prompts WHERE agent_name IN ({placeholders})
        """, tuple(AGENTS)).fetchall()
    finally:
        # Close even when the query raises so the handle is not leaked.
        conn.close()
|
||
|
||
def eval_single(agent_name, model, prompt_id, system, user, expected_json, rubric_json):
    """Send one prompt to one model and score the reply.

    Calls ``call_ollama(model, system, user)`` and passes the reply together
    with ``expected_json`` and ``rubric_json`` to ``evaluate_response``.

    Returns:
        A dict with the keys ``agent``, ``model``, ``prompt_id``,
        ``response_text`` (reply cut to its first 3000 characters),
        ``latency_ms``, ``tokens_prompt``, ``tokens_response``,
        ``total_score``, ``scores_json`` (JSON-encoded scores),
        ``explanation``, ``evaluated_at`` (UTC ISO-8601 timestamp) and
        ``evaluator`` (always ``'rubric_v2'``).
    """
    reply, elapsed, token_counts = call_ollama(model, system, user)
    verdict = evaluate_response(reply, expected_json, rubric_json)

    record = {
        "agent": agent_name,
        "model": model,
        "prompt_id": prompt_id,
        # Only the first 3000 characters of the reply are kept.
        "response_text": reply[:3000],
        "latency_ms": elapsed,
        "tokens_prompt": token_counts["prompt"],
        "tokens_response": token_counts["response"],
        "total_score": verdict["total"],
        "scores_json": json.dumps(verdict["scores"]),
        "explanation": verdict["explanation"],
        "evaluated_at": datetime.now(timezone.utc).isoformat(),
        "evaluator": "rubric_v2",
    }
    return record
|
||
|
||
def save_single(res):
    """Insert one evaluation record into ``evaluations`` and print a summary.

    Args:
        res: Mapping with the keys ``agent``, ``model``, ``prompt_id``,
            ``response_text``, ``latency_ms``, ``tokens_prompt``,
            ``tokens_response``, ``scores_json``, ``total_score``,
            ``explanation``, ``evaluated_at`` and ``evaluator`` (the shape
            returned by ``eval_single``).

    Raises:
        KeyError: if ``res`` lacks one of the keys above.
        sqlite3.Error: if the INSERT fails; the transaction is rolled back.
    """
    # Build the row first so a malformed record fails before a connection opens.
    row = (
        res['agent'], res['model'], res['prompt_id'], res['response_text'], res['latency_ms'],
        res['tokens_prompt'], res['tokens_response'],
        res['scores_json'], res['total_score'], res['explanation'],
        res['evaluated_at'], res['evaluator'],
    )
    conn = sqlite3.connect(DB_PATH)
    try:
        # ``with conn`` commits on success and rolls back if the INSERT raises.
        with conn:
            conn.execute("""
                INSERT INTO evaluations
                (agent_name, model, prompt_id, response, latency_ms, tokens_prompt, tokens_response,
                scores, total_score, explanation, evaluated_at, evaluator)
                VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
            """, row)
    finally:
        # Always release the handle, including on failure.
        conn.close()
    print(f" [{res['agent']}] x [{res['model']}] score={res['total_score']:.1f} lat={res['latency_ms']}ms len={len(res['response_text'])}")
|
||
|
||
def run_focused_eval(max_workers=4):
    """Evaluate every selected prompt against every model, then aggregate.

    Fetches the prompts with ``fetch_agent_tasks``, submits one
    ``eval_single`` call per (prompt, model) pair to a thread pool, stores
    each finished result with ``save_single`` from the calling thread, and
    finally calls ``compute_aggregates``.

    Args:
        max_workers: Size of the thread pool used for the model calls.

    A failure in one evaluation (or in saving it) is reported together with
    its agent/model/prompt id and counted; it does not stop the run.
    """
    # Function-scope import: this module does not import traceback at top level.
    import traceback

    tasks = fetch_agent_tasks()
    print(f"[focused] Agents: {len(AGENTS)} | Models: {len(MODELS)} | Prompts: {len(tasks)}")
    print(f"[focused] Total evaluations: {len(tasks) * len(MODELS)}")

    # Cross product of prompts and models, in eval_single's argument order.
    work_items = [
        (aname, model, pid, system, user, exp_json, rub_json)
        for pid, aname, system, user, exp_json, rub_json in tasks
        for model in MODELS
    ]

    completed = 0
    errors = 0
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = {ex.submit(eval_single, *w): w for w in work_items}
        for future in as_completed(futures):
            aname, model, pid = futures[future][:3]
            try:
                res = future.result()
                save_single(res)
            except Exception:
                # Best-effort batch: name the failing pair, log, and carry on.
                print(f"[focused] FAILED [{aname}] x [{model}] prompt_id={pid}", file=sys.stderr)
                traceback.print_exc()
                errors += 1
                continue
            completed += 1
            if completed % 4 == 0:
                print(f"[focused] Progress: {completed}/{len(work_items)}")

    print(f"[focused] Completed {completed}/{len(work_items)} (errs={errors})")
    compute_aggregates()
|
||
|
||
if __name__ == '__main__':
    # Script entry point: print a banner, wipe previous results for the
    # selected agents, run the evaluation and generate the report.
    print("="*60)
    print("FOCUSED REAL-FIT EVALUATION v2")
    print(f"Models: {', '.join(MODELS)}")
    print(f"Agents: {', '.join(AGENTS)}")
    print(f"API: {os.environ['OLLAMA_HOST']}")
    print("="*60)

    # Clean old evaluations. Agent names are bound as parameters (same scheme
    # as fetch_agent_tasks) rather than quoted into the SQL text.
    placeholders = ','.join('?' * len(AGENTS))
    conn = sqlite3.connect(DB_PATH)
    try:
        # One transaction: both tables are cleaned, or neither is.
        with conn:
            conn.execute(f"DELETE FROM evaluations WHERE agent_name IN ({placeholders})", tuple(AGENTS))
            conn.execute(f"DELETE FROM fit_scores WHERE agent_name IN ({placeholders})", tuple(AGENTS))
    finally:
        conn.close()
    print("[focused] Cleaned old evaluations")

    run_focused_eval(max_workers=4)
    report = generate_report()
    print(f"[focused] Report generated with {len(report.get('agents',{}))} agents")
|