#!/usr/bin/env python3
"""
Focused Real-Fit Eval Runner v2

Evaluates key agents × models using real-fit-engine.py (the fixed version).

For every stored test prompt belonging to one of AGENTS, each model in MODELS
is queried through the engine's ``call_ollama``, the answer is scored with the
engine's ``evaluate_response``, and one row per (prompt, model) pair is written
to the ``evaluations`` table of the engine's SQLite database.
"""
import importlib.util
import json
import os
import sqlite3
import sys
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from contextlib import closing
from datetime import datetime, timezone

# SECURITY(review): a live-looking API credential is hard-coded here as the
# fallback value. It is kept only so that existing invocations without
# OLLAMA_KEY in the environment keep working; rotate this key and supply it
# via the environment instead of source control.
# The defaults are installed before the engine module is executed below
# (presumably the engine reads them -- confirm in real-fit-engine.py).
os.environ.setdefault("OLLAMA_KEY", "feaa56e2dff045af989346ca74cb33a6.xzJ-plOVSgTL1FbmL8PZZ3Wx")
os.environ.setdefault("OLLAMA_HOST", "https://ollama.com/v1")

# Import the dash-named real-fit-engine.py via importlib (a plain ``import``
# statement cannot name a module whose file name contains dashes).
_spec = importlib.util.spec_from_file_location(
    "rfe",
    os.path.join(os.path.dirname(__file__), "real-fit-engine.py"),
)
_rfe = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(_rfe)

call_ollama = _rfe.call_ollama
evaluate_response = _rfe.evaluate_response
compute_aggregates = _rfe.compute_aggregates
generate_report = _rfe.generate_report
DB_PATH = _rfe.DB_PATH

AGENTS = [
    'code-skeptic',
    'lead-developer',
    'system-analyst',
    'sdet-engineer',
    'orchestrator',
    'devops-engineer',
    'workflow-cross-checker',
]

MODELS = [
    'kimi-k2.6',
    'deepseek-v4-pro-max',
    'qwen3-coder:480b',
    'glm-5.1',
]


def _agent_filter():
    """Return ``(placeholders, params)`` for an ``agent_name IN (...)`` clause.

    ``placeholders`` is a comma-separated run of ``?`` markers, one per entry
    in AGENTS; ``params`` is the matching tuple of agent names to bind.
    """
    return ','.join('?' * len(AGENTS)), tuple(AGENTS)


def fetch_agent_tasks():
    """Load the test prompts of all agents listed in AGENTS.

    Returns:
        A list of row tuples
        ``(id, agent_name, system_prompt, user_prompt, expected_keywords, rubric)``
        read from the ``test_prompts`` table.
    """
    placeholders, params = _agent_filter()
    # closing() guarantees the connection is released even if the query fails.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        return conn.execute(
            f"""
            SELECT id, agent_name, system_prompt, user_prompt, expected_keywords, rubric
            FROM test_prompts
            WHERE agent_name IN ({placeholders})
            """,
            params,
        ).fetchall()


def eval_single(agent_name, model, prompt_id, system, user, expected_json, rubric_json):
    """Query one model with one prompt and score the answer.

    Args:
        agent_name: Agent the prompt belongs to.
        model: Model identifier passed to ``call_ollama``.
        prompt_id: Id of the ``test_prompts`` row being evaluated.
        system: System prompt text.
        user: User prompt text.
        expected_json: Stored ``expected_keywords`` value, handed unchanged to
            ``evaluate_response``.
        rubric_json: Stored ``rubric`` value, handed unchanged to
            ``evaluate_response``.

    Returns:
        A dict with the fields ``save_single`` writes to the ``evaluations``
        table. The response text is truncated to its first 3000 characters.
    """
    resp, latency, tokens = call_ollama(model, system, user)
    ev = evaluate_response(resp, expected_json, rubric_json)
    return {
        'agent': agent_name,
        'model': model,
        'prompt_id': prompt_id,
        'response_text': resp[:3000],
        'latency_ms': latency,
        'tokens_prompt': tokens['prompt'],
        'tokens_response': tokens['response'],
        'total_score': ev['total'],
        'scores_json': json.dumps(ev['scores']),
        'explanation': ev['explanation'],
        'evaluated_at': datetime.now(timezone.utc).isoformat(),
        'evaluator': 'rubric_v2',
    }


def save_single(res):
    """Insert one evaluation result into ``evaluations`` and print a summary.

    Args:
        res: Result dict as produced by ``eval_single``.
    """
    row = (
        res['agent'], res['model'], res['prompt_id'], res['response_text'],
        res['latency_ms'], res['tokens_prompt'], res['tokens_response'],
        res['scores_json'], res['total_score'], res['explanation'],
        res['evaluated_at'], res['evaluator'],
    )
    # closing() releases the connection even when the INSERT raises.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        conn.execute(
            """
            INSERT INTO evaluations
                (agent_name, model, prompt_id, response, latency_ms,
                 tokens_prompt, tokens_response, scores, total_score,
                 explanation, evaluated_at, evaluator)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
            """,
            row,
        )
        conn.commit()
    print(f"  [{res['agent']}] x [{res['model']}] score={res['total_score']:.1f} lat={res['latency_ms']}ms len={len(res['response_text'])}")


def run_focused_eval(max_workers=4):
    """Evaluate every (prompt, model) pair and then recompute aggregates.

    Model calls run in a thread pool; results are saved from the calling
    thread as each future completes. A failure in a model call, in scoring,
    or in saving is printed with its traceback and counted as an error
    without stopping the remaining work.

    Args:
        max_workers: Size of the thread pool.
    """
    tasks = fetch_agent_tasks()
    print(f"[focused] Agents: {len(AGENTS)} | Models: {len(MODELS)} | Prompts: {len(tasks)}")
    print(f"[focused] Total evaluations: {len(tasks) * len(MODELS)}")

    # Rows are (id, agent_name, ...) while eval_single expects
    # (agent_name, model, prompt_id, ...), hence the reordering.
    work_items = [
        (aname, model, pid, system, user, exp_json, rub_json)
        for pid, aname, system, user, exp_json, rub_json in tasks
        for model in MODELS
    ]

    completed = 0
    errors = 0
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = {ex.submit(eval_single, *w): w for w in work_items}
        for future in as_completed(futures):
            try:
                res = future.result()
                save_single(res)
                completed += 1
                if completed % 4 == 0:
                    print(f"[focused] Progress: {completed}/{len(work_items)}")
            except Exception:
                # Best-effort run: report the failure and carry on.
                traceback.print_exc()
                errors += 1

    print(f"[focused] Completed {completed}/{len(work_items)} (errs={errors})")
    compute_aggregates()


def clean_old_evaluations():
    """Delete earlier ``evaluations`` and ``fit_scores`` rows for AGENTS."""
    placeholders, params = _agent_filter()
    with closing(sqlite3.connect(DB_PATH)) as conn:
        # Bound parameters instead of string-built SQL, matching
        # fetch_agent_tasks.
        conn.execute(f"DELETE FROM evaluations WHERE agent_name IN ({placeholders})", params)
        conn.execute(f"DELETE FROM fit_scores WHERE agent_name IN ({placeholders})", params)
        conn.commit()


def main():
    """Print the run banner, clear old results, run the eval and report."""
    print("="*60)
    print("FOCUSED REAL-FIT EVALUATION v2")
    print(f"Models: {', '.join(MODELS)}")
    print(f"Agents: {', '.join(AGENTS)}")
    print(f"API: {os.environ['OLLAMA_HOST']}")
    print("="*60)

    # Clean old evaluations
    clean_old_evaluations()
    print("[focused] Cleaned old evaluations")

    run_focused_eval(max_workers=4)
    report = generate_report()
    print(f"[focused] Report generated with {len(report.get('agents',{}))} agents")


if __name__ == '__main__':
    main()