Files
APAW/scripts/run-focused-eval.py
Deploy Bot 26362c7359 fix: milestone 78 — remove USE_MOCK and hardcoded API key
- Remove _DEFAULT_KEY from real-fit-engine.py, run-focused-eval.py, test_real_api.py
- Replace USE_MOCK env var with DRY_RUN --dry-run CLI flag
- Create tests/mocks/ollama_mock.py for isolated mock testing
- Remove stale glm-5.1 from DEFAULT_MODELS and docs
- Update .env.example with OLLAMA_HOST and OLLAMA_KEY

Issues: #123
2026-06-01 12:25:50 +01:00

134 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Focused Real-Fit Eval Runner v2
Evaluates key agents × models using real-fit-engine.py (the fixed version).
"""
import sqlite3, json, os, sys, importlib.util
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
# The API key has to come from the environment (or a .env file) and is never
# hardcoded here; without it the script aborts before doing any work.
if not os.environ.get("OLLAMA_KEY"):
    print(
        "[FATAL] OLLAMA_KEY not set. Set it via environment variable or .env file.",
        file=sys.stderr,
    )
    sys.exit(1)

# Fall back to the default endpoint only when the caller has not chosen one.
if "OLLAMA_HOST" not in os.environ:
    os.environ["OLLAMA_HOST"] = "https://ollama.com/v1"

# "real-fit-engine.py" contains dashes, so a regular `import` statement cannot
# name it; load it from the directory of this script through importlib instead.
_spec = importlib.util.spec_from_file_location(
    "rfe",
    os.path.join(os.path.dirname(__file__), "real-fit-engine.py"),
)
_rfe = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_rfe)

# Re-export the engine helpers this runner relies on as module-level names.
(
    call_ollama,
    evaluate_response,
    compute_aggregates,
    generate_report,
    DB_PATH,
) = (
    _rfe.call_ollama,
    _rfe.evaluate_response,
    _rfe.compute_aggregates,
    _rfe.generate_report,
    _rfe.DB_PATH,
)
# Agent names whose rows are selected from `test_prompts` and whose previous
# results are cleared before a run.
AGENTS = [
    "code-skeptic",
    "lead-developer",
    "system-analyst",
    "sdet-engineer",
    "orchestrator",
    "devops-engineer",
    "workflow-cross-checker",
]

# Model identifiers; every fetched prompt is evaluated once per entry.
MODELS = [
    "kimi-k2.6",
    "deepseek-v4-pro-max",
    "qwen3-coder:480b",
]
def fetch_agent_tasks():
    """Return the stored test prompts for every agent listed in ``AGENTS``.

    Opens a short-lived connection to the SQLite database at ``DB_PATH`` and
    reads from the ``test_prompts`` table.

    Returns:
        A list of ``(id, agent_name, system_prompt, user_prompt,
        expected_keywords, rubric)`` row tuples, one per matching prompt.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    # One "?" per agent so the names are bound as parameters, not interpolated.
    placeholders = ','.join('?' * len(AGENTS))
    conn = sqlite3.connect(DB_PATH)
    try:
        return conn.execute(f"""
            SELECT id, agent_name, system_prompt, user_prompt, expected_keywords, rubric
            FROM test_prompts WHERE agent_name IN ({placeholders})
        """, tuple(AGENTS)).fetchall()
    finally:
        # Close even when the query raises so the connection is never leaked.
        conn.close()
def eval_single(agent_name, model, prompt_id, system, user, expected_json, rubric_json):
    """Run one prompt against one model and score the reply.

    Sends ``system``/``user`` to ``call_ollama`` for ``model``, then passes the
    reply together with ``expected_json`` and ``rubric_json`` to
    ``evaluate_response``.

    Returns:
        A dict with the keys ``save_single`` persists: ``agent``, ``model``,
        ``prompt_id``, ``response_text`` (the reply cut to its first 3000
        items), ``latency_ms``, ``tokens_prompt``, ``tokens_response``,
        ``total_score``, ``scores_json`` (JSON-encoded scores),
        ``explanation``, ``evaluated_at`` (UTC ISO-8601 timestamp) and
        ``evaluator`` (always ``'rubric_v2'``).
    """
    reply, latency, token_usage = call_ollama(model, system, user)
    verdict = evaluate_response(reply, expected_json, rubric_json)

    # Fields are filled in the same order the row is later written out.
    record = {'agent': agent_name, 'model': model, 'prompt_id': prompt_id}
    record['response_text'] = reply[:3000]  # cap the stored reply size
    record['latency_ms'] = latency
    record['tokens_prompt'] = token_usage['prompt']
    record['tokens_response'] = token_usage['response']
    record['total_score'] = verdict['total']
    record['scores_json'] = json.dumps(verdict['scores'])
    record['explanation'] = verdict['explanation']
    record['evaluated_at'] = datetime.now(timezone.utc).isoformat()
    record['evaluator'] = 'rubric_v2'
    return record
def save_single(res):
    """Persist one evaluation result and print a one-line summary.

    Inserts ``res`` as a new row of the ``evaluations`` table in the SQLite
    database at ``DB_PATH``, using a connection opened for this call only.

    Args:
        res: Result dict as produced by ``eval_single``; the keys ``agent``,
            ``model``, ``prompt_id``, ``response_text``, ``latency_ms``,
            ``tokens_prompt``, ``tokens_response``, ``scores_json``,
            ``total_score``, ``explanation``, ``evaluated_at`` and
            ``evaluator`` are read.

    Raises:
        KeyError: If ``res`` lacks one of the keys above.
        sqlite3.Error: If the insert fails; the transaction is rolled back.
    """
    row = (
        res['agent'], res['model'], res['prompt_id'], res['response_text'], res['latency_ms'],
        res['tokens_prompt'], res['tokens_response'],
        res['scores_json'], res['total_score'], res['explanation'],
        res['evaluated_at'], res['evaluator'],
    )
    conn = sqlite3.connect(DB_PATH)
    try:
        # `with conn` commits on success and rolls back on error; it does not
        # close the connection, hence the surrounding try/finally.
        with conn:
            conn.execute("""
                INSERT INTO evaluations
                (agent_name, model, prompt_id, response, latency_ms, tokens_prompt, tokens_response,
                scores, total_score, explanation, evaluated_at, evaluator)
                VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
            """, row)
    finally:
        conn.close()
    print(f" [{res['agent']}] x [{res['model']}] score={res['total_score']:.1f} lat={res['latency_ms']}ms len={len(res['response_text'])}")
def run_focused_eval(max_workers=4):
    """Evaluate every fetched prompt against every model in ``MODELS``.

    Builds one work item per (prompt, model) pair, runs ``eval_single`` for
    them on a thread pool, stores each result with ``save_single`` as it
    completes, and finally calls ``compute_aggregates``.

    A failure in a single evaluation or save is reported on stderr together
    with the agent, model and prompt id it belongs to, counted, and does not
    stop the remaining work.

    Args:
        max_workers: Size of the thread pool used for the model calls.
    """
    import traceback  # only needed for failure reporting

    tasks = fetch_agent_tasks()
    print(f"[focused] Agents: {len(AGENTS)} | Models: {len(MODELS)} | Prompts: {len(tasks)}")
    print(f"[focused] Total evaluations: {len(tasks) * len(MODELS)}")

    # Argument tuples in the positional order eval_single expects.
    work_items = []
    for pid, aname, system, user, exp_json, rub_json in tasks:
        for model in MODELS:
            work_items.append((aname, model, pid, system, user, exp_json, rub_json))

    total = len(work_items)
    completed = 0
    errors = 0
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        # Map each future back to its work item so failures can be attributed.
        futures = {ex.submit(eval_single, *w): w for w in work_items}
        for future in as_completed(futures):
            aname, model, pid = futures[future][:3]
            try:
                res = future.result()
                save_single(res)
            except Exception:
                # Deliberate best-effort boundary: log, count and keep going.
                print(
                    f"[focused] FAILED [{aname}] x [{model}] prompt_id={pid}",
                    file=sys.stderr,
                )
                traceback.print_exc()
                errors += 1
            else:
                completed += 1
                if completed % 4 == 0:
                    print(f"[focused] Progress: {completed}/{total}")
    print(f"[focused] Completed {completed}/{total} (errs={errors})")
    compute_aggregates()
if __name__ == '__main__':
    # Script entry point: print a banner, clear earlier results for the
    # focused agents, run the evaluation, then build the report.
    print("="*60)
    print("FOCUSED REAL-FIT EVALUATION v2")
    print(f"Models: {', '.join(MODELS)}")
    print(f"Agents: {', '.join(AGENTS)}")
    print(f"API: {os.environ['OLLAMA_HOST']}")
    print("="*60)

    # Clean old evaluations. Agent names are bound as parameters (the same
    # approach fetch_agent_tasks uses) instead of being spliced into the SQL.
    placeholders = ','.join('?' * len(AGENTS))
    conn = sqlite3.connect(DB_PATH)
    try:
        # `with conn` commits both deletes together or rolls both back.
        with conn:
            # Table names are fixed literals, so formatting them in is safe.
            for table in ("evaluations", "fit_scores"):
                conn.execute(
                    f"DELETE FROM {table} WHERE agent_name IN ({placeholders})",
                    tuple(AGENTS),
                )
    finally:
        # Close even if a delete fails so the connection is not leaked.
        conn.close()
    print("[focused] Cleaned old evaluations")

    run_focused_eval(max_workers=4)
    report = generate_report()
    print(f"[focused] Report generated with {len(report.get('agents',{}))} agents")