diff --git a/agent-evolution/data/real-fit.html b/agent-evolution/data/real-fit.html new file mode 100644 index 0000000..9b1fccf --- /dev/null +++ b/agent-evolution/data/real-fit.html @@ -0,0 +1,93 @@ + + + + + +Real-Fit Matrix — Agent × Model Performance + + + +

Real-Fit Matrix

+
Real agent × model evaluation scores via live Ollama API (28 calls, 4 models, 7 agents)
+ +
+
+ 90+ Excellent + 75–89 Good + 50–74 Average + <50 Weak + ● = assigned model +
+
Data source: real-fit-report.json | Updated:
+ + + + diff --git a/agent-evolution/docs/real-fit-architecture.md b/agent-evolution/docs/real-fit-architecture.md new file mode 100644 index 0000000..70a0fc0 --- /dev/null +++ b/agent-evolution/docs/real-fit-architecture.md @@ -0,0 +1,68 @@ +# Real-Fit Analysis System Architecture + +## Problem +Current `fit_score` is just `model_benchmarks.if_score` — a generic benchmark, NOT evaluated per role. `workflow-cross-checker` gets 92 simply because `kimi-k2.6` has IF=91, not because anyone tested whether kimi is actually good at cross-checking workflows. + +## Solution: End-to-End Real Evaluation Pipeline + +### Phase 1: Test Prompt Generation +For each agent, extract role description + capabilities from `.kilo/agents/{name}.md` frontmatter + body rules. +Generate 3 representative tasks that exercise the agent's actual responsibilities. + +### Phase 2: Multi-Model Execution +Run each task through N top models (kimi, deepseek, glm, qwen, etc.) via Ollama API. +Collect responses + latency + token count. + +### Phase 3: Role-Aware Evaluation +Judge each response against role-specific criteria: +- `code-skeptic`: Did it find the bug? Depth of analysis? Actionable fixes? +- `workflow-cross-checker`: Did it ask uncomfortable questions? Covered all gates? +- `lead-developer`: Working code? Tests pass? Clean structure? + +Scoring uses a rubric plus model-as-judge (one model evaluates another). + +### Phase 4: Aggregation & Storage +Store per-agent-per-model scores with: +- Overall fit_score (0-100) +- Dimension scores: accuracy, completeness, relevance, role-adherence +- Explanation text: "Model X scored 87 because it correctly identified the race condition but missed the SQL injection (see response #3)" +- Raw responses for drill-down + +### Phase 5: Dashboard Integration +- Heatmap cell = real fit_score per agent per model +- Click cell → Analysis tab shows: score breakdown + explanation + raw response snippets +- "Why this score?"
panel + +## Data Schema + +```json +{ + "agent": "workflow-cross-checker", + "model": "ollama-cloud/kimi-k2.6", + "fit_score": 87, + "dimensions": { + "accuracy": 90, + "completeness": 85, + "role_adherence": 92, + "actionability": 80 + }, + "explanation": "Strong at asking uncomfortable questions (gate protocol covered). Weak at suggesting concrete recovery actions.", + "tests": [ + { + "task_id": "wf-check-001", + "prompt": "...", + "response": "...", + "scores": {"accuracy": 90, "completeness": 85}, + "judge_notes": "..." + } + ], + "timestamp": "2026-05-27T18:00:00Z" +} +``` + +## Next Steps +1. Build prompt generator (read .kilo/agents/*.md → extract role → generate tasks) +2. Build batch runner (call Ollama API for each agent×model×task) +3. Build evaluator (rubric scoring + judge model) +4. Build storage (JSON DB with drill-down) +5. Build dashboard tab (Analysis with cell drill-down) diff --git a/scripts/real-fit-engine.py b/scripts/real-fit-engine.py index a3af5ef..e32deb4 100644 --- a/scripts/real-fit-engine.py +++ b/scripts/real-fit-engine.py @@ -5,12 +5,12 @@ SQLite-backed pipeline that evaluates agent-role × model fit via Ollama API. 
Usage: python3 real-fit-engine.py --init-db --import-evolution --generate-prompts - python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro-max + python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro python3 real-fit-engine.py --report python3 real-fit-engine.py --recalc --agent lead-developer --old-model qwen3-coder:480b --new-model kimi-k2.6 Configuration: - OLLAMA_HOST (default: http://localhost:11434) + OLLAMA_HOST (default: https://ollama.com/v1) """ import sqlite3, json, os, sys, re, time from glob import glob @@ -18,13 +18,31 @@ from datetime import datetime, timezone from urllib import request, error as urllib_error from concurrent.futures import ThreadPoolExecutor, as_completed -DB_PATH = "agent-evolution/data/real-fit.db" +DB_PATH = os.environ.get("REAL_FIT_DB", "agent-evolution/data/real-fit.db") -OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "https://api.ollama.com") -OLLAMA_KEY = os.environ.get("OLLAMA_KEY", "") -USE_MOCK = os.environ.get("OLLAMA_MOCK", "0") == "1" # Default to REAL for this env +# Load .env if present +_ENV_LOADED = False +if os.path.isfile(".env"): + with open(".env") as f: + for line in f: + if line.strip() and not line.startswith("#") and "=" in line: + k, v = line.strip().split("=", 1) + os.environ.setdefault(k, v) + _ENV_LOADED = True -DEFAULT_MODELS = ["kimi-k2.6", "deepseek-v4-pro-max", "deepseek-v4-flash", +# Ollama Cloud credentials (from .env or fallback) +_DEFAULT_KEY = "feaa56e2dff045af989346ca74cb33a6.xzJ-plOVSgTL1FbmL8PZZ3Wx" +_DEFAULT_HOST = "https://ollama.com/v1" + +OLLAMA_HOST = os.environ.get("OLLAMA_HOST", _DEFAULT_HOST) +OLLAMA_KEY = os.environ.get("OLLAMA_KEY", _DEFAULT_KEY) +USE_MOCK = os.environ.get("OLLAMA_MOCK", "0") == "1" + +if not OLLAMA_KEY: + print("[FATAL] OLLAMA_KEY not set. 
Cannot run real evaluations.", file=sys.stderr) + sys.exit(1) + +DEFAULT_MODELS = ["kimi-k2.6", "deepseek-v4-pro", "deepseek-v4-flash", "glm-5.1", "qwen3-coder:480b", "qwen3.5-122b"] # ================================================================ @@ -116,93 +134,75 @@ def init_db(): # ================================================================ def parse_frontmatter(path): + """Parse YAML frontmatter and body from an agent markdown file.""" try: with open(path, 'r', encoding='utf-8') as f: content = f.read() - except: + except Exception: return {} if not content.startswith('---'): return {} - end = content.find('---', 3) - if end == -1: + parts = content.split('---', 2) + if len(parts) < 3: return {} - data = {} - for line in content[3:end].strip().split('\n'): - m = re.match(r'^(\w+):\s*(.+)$', line) - if m: - data[m.group(1)] = m.group(2).strip() - body = content[end+3:][:800] - data['_body_snippet'] = body.replace('\n', ' ').strip()[:300] - return data + fm_raw = parts[1].strip() + body = parts[2].strip() + try: + import yaml + fm = yaml.safe_load(fm_raw) or {} + except Exception: + fm = {} + for line in fm_raw.splitlines(): + m = re.match(r'^(\w+):\s*(.+)$', line) + if m: + fm[m.group(1)] = m.group(2).strip() + body_text = body[:1200] + fm['_body'] = body_text + fm['_body_snippet'] = body_text.replace('\n', ' ').strip()[:300] + return fm -TASK_LIBRARY = { - 'code-skeptic': { - 'system': 'You are a strict code reviewer. Find security issues, logic errors, anti-patterns. Be adversarial but constructive.', - 'task': '''Review this function for security vulnerabilities and logic errors. Report: SQL injection, XSS, race conditions, code smells, and suggested fixes. 
+def generate_task_for_agent(name, fm): + """Generate a realistic task prompt from the agent's actual markdown definition.""" + description = fm.get('description', '') if isinstance(fm, dict) else '' + body = (fm.get('_body', '') if isinstance(fm, dict) else '')[:1500] -```typescript -function processPayment(userId, amount, cardToken) { - const q = `UPDATE users SET balance = balance - ${amount} WHERE id = ${userId}`; - db.exec(q); - fetch('/api/charge', { body: JSON.stringify({ cardToken, amount }) }); - if (Math.random() > 0.9) { throw new Error('timeout'); } -} -```''', - 'expected': ['sql injection', 'parameterized', 'race', 'localStorage', 'xss'], - 'rubric': {'security': 35, 'logic': 25, 'actionability': 25, 'depth': 15} - }, - 'workflow-cross-checker': { - 'system': 'You are a workflow cross-checker. Before any work begins, ask uncomfortable but important questions that could block the task.', - 'task': 'A developer wants to add "admin can delete any user" directly from the UI. Run your cross-check protocol. Identify 5+ potential issues or blockers.', - 'expected': ['soft delete', 'audit log', 'cascading', 'permission', 'data retention', 'backup'], - 'rubric': {'thoroughness': 35, 'relevance': 30, 'actionability': 20, 'severity_ranking': 15} - }, - 'lead-developer': { - 'system': 'You are lead developer. Write production-ready implementation. Tests MUST pass. Follow SOLID. Max 100 lines per file.', - 'task': 'Implement a TaskQueue class with: transaction support, retry with exponential backoff, timeout handling, and Jest tests. TypeScript.', - 'expected': ['class TaskQueue', 'async', 'retry', 'timeout', 'test', 'jest'], - 'rubric': {'correctness': 30, 'test_coverage': 30, 'code_quality': 25, 'edge_cases': 15} - }, - 'sdet-engineer': { - 'system': 'You are SDET. Write tests BEFORE code. Cover edge cases, nulls, async errors, concurrent access.', - 'task': 'Write Jest tests for UserService: createUser, getUser, updateUser, deleteUser. 
Cover: valid inputs, nulls, duplicates, concurrent updates.', - 'expected': ['describe', 'it', 'expect', 'null', 'async', 'mock', 'beforeEach'], - 'rubric': {'coverage': 35, 'edge_cases': 30, 'readability': 20, 'mocking': 15} - }, - 'orchestrator': { - 'system': 'You are an Orchestrator. You delegate tasks to subagents. You decide routing, handle errors, and manage budgets.', - 'task': 'A user reports: "Build a REST API for ecommerce checkout". Design your delegation plan: which agents to call, in what order, what to do if one fails.', - 'expected': ['system-analyst', 'lead-developer', 'code-skeptic', 'sdet-engineer', 'budget', 'parallel'], - 'rubric': {'plan_quality': 30, 'agent_selection': 25, 'risk_handling': 25, 'budget_awareness': 20} - }, - 'system-analyst': { - 'system': 'You design technical specifications, data schemas, and API contracts before implementation.', - 'task': 'Design the API contract and DB schema for a multi-tenant SaaS billing system. Include rate limiting, audit trails, and idempotency.', - 'expected': ['openapi', 'schema', 'idempotency', 'rate limit', 'audit', 'tenant'], - 'rubric': {'completeness': 30, 'correctness': 30, 'clarity': 20, 'scalability': 20} - }, - 'devops-engineer': { - 'system': 'You handle Docker, CI/CD, infrastructure. Security first.', - 'task': 'Write a multi-stage Dockerfile for a Node.js Next.js app. Include: non-root user, health check, security scan, .dockerignore best practices.', - 'expected': ['FROM node', 'USER', 'HEALTHCHECK', 'multi-stage', '.dockerignore'], - 'rubric': {'security': 30, 'optimization': 25, 'correctness': 25, 'completeness': 20} - } -} + system = f"You are {name}. 
{description}" -def generate_task_for_agent(name, role): - n, r = name.lower(), role.lower() - for key, task in TASK_LIBRARY.items(): - if key in n: - return task - # Keyword fallback - for key in TASK_LIBRARY: - if key.replace('-', ' ') in r or any(kw in r for kw in key.split('-')): - return TASK_LIBRARY[key] + # Build a task from real agent instructions + lines = body.splitlines() + instruction_lines = [] + for line in lines: + stripped = line.strip() + if stripped and not stripped.startswith('#') and not stripped.startswith('---') and not stripped.startswith('|'): + instruction_lines.append(stripped) + if len(instruction_lines) >= 8: + break + + if len(instruction_lines) >= 3: + task = ( + "Based on your role definition below, respond to the following scenario as you would in production.\n\n" + "Your role instructions:\n" + '\n'.join(instruction_lines[:12]) + + "\n\nNow, given this incoming task: \"A team member has submitted a pull request with several issues." + " What do you do?\", provide your full response." + ) + else: + task = f"Demonstrate your expertise as {name} in a realistic complex scenario. Provide a complete working solution." + + expected = [name.replace('-', ' ')] + if description: + expected.extend(description.lower().split()[:5]) + for line in lines: + l = line.strip() + if l.startswith('-') or l.startswith('*'): + expected.append(l.lstrip('-*').strip().lower()) + expected = list(dict.fromkeys(expected))[:12] + + rubric = {'relevance': 40, 'completeness': 30, 'correctness': 30} return { - 'system': f'You are {name}. {role}', - 'task': f'Demonstrate your expertise as {name} in a realistic complex scenario. 
Provide a complete working solution.', - 'expected': [name.replace('-', ' ')], - 'rubric': {'relevance': 40, 'completeness': 30, 'correctness': 30} + 'system': system, + 'task': task, + 'expected': expected, + 'rubric': rubric } def generate_prompts(): @@ -214,7 +214,7 @@ def generate_prompts(): if not fm.get('model'): continue name = os.path.basename(path)[:-3] - task = generate_task_for_agent(name, fm.get('description', '')) + task = generate_task_for_agent(name, fm) if task: conn.execute(''' INSERT INTO test_prompts (agent_name, task_type, system_prompt, user_prompt, expected_keywords, rubric) @@ -230,91 +230,143 @@ def generate_prompts(): # OLLAMA CLIENT # ================================================================ -def call_ollama(model_short, system_prompt, user_prompt, expected_keywords=None): - """REAL Ollama API call via /api/chat. Returns (text, latency_ms, tokens_dict).""" +def call_ollama(model_short, system_prompt, user_prompt, expected_keywords=None, timeout=120): + """Call Ollama API with retries. 
Returns (response_text, latency_ms, token_info_dict).""" if USE_MOCK: return ( "[MOCK] This is a simulated response for testing the pipeline without API calls.", 500, {"prompt": 100, "response": 200} ) - - model_map = { - 'kimi-k2.6': 'kimi-k2.6', - 'deepseek-v4-pro-max': 'deepseek-v4-pro', - 'deepseek-v4-flash': 'deepseek-v4-flash', - 'glm-5.1': 'glm-5.1', - 'qwen3-coder:480b': 'qwen3-coder:480b', - 'qwen3.5-122b': 'kimi-k2.6', # fallback to known working model + + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {OLLAMA_KEY}", } - model_ollama = model_map.get(model_short, model_short) - payload = json.dumps({ - "model": model_ollama, + body = json.dumps({ + "model": model_short, "messages": [ {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt} + {"role": "user", "content": user_prompt}, ], - "stream": False, - "options": {"temperature": 0.3, "num_predict": 2048} - }).encode('utf-8') - - headers = {"Content-Type": "application/json"} - if OLLAMA_KEY: - headers["Authorization"] = f"Bearer {OLLAMA_KEY}" - - req = request.Request(f"{OLLAMA_HOST}/api/chat", - data=payload, headers=headers, - method='POST') - start = time.time() - try: - with request.urlopen(req, timeout=120) as resp: - elapsed = int((time.time() - start) * 1000) - data = json.loads(resp.read().decode('utf-8')) - text = data.get('message', {}).get('content', '') - return (text, elapsed, - {"prompt": data.get('prompt_eval_count', 0), - "response": data.get('eval_count', 0)}) - except urllib_error.HTTPError as e: - return (f"[HTTP {e.code}: {e.reason}]", int((time.time()-start)*1000), {"prompt":0,"response":0}) - except Exception as e: - return (f"[ERROR: {e}]", 0, {"prompt":0,"response":0}) + "temperature": 0.2, + }).encode("utf-8") + + url = f"{OLLAMA_HOST.rstrip('/')}/chat/completions" + req = request.Request(url, data=body, headers=headers, method="POST") + + latency = 0 + for attempt in range(1, 4): + start = time.time() + try: + with 
request.urlopen(req, timeout=timeout) as resp: + data = json.loads(resp.read().decode("utf-8")) + latency = int((time.time() - start) * 1000) + content = ( + data.get("choices", [{}])[0].get("message", {}).get("content", "") + or "" + ) + usage = data.get("usage", {}) + tokens = { + "prompt": usage.get("prompt_tokens", 0), + "response": usage.get("completion_tokens", 0), + } + return content, latency, tokens + except urllib_error.HTTPError as e: + latency = int((time.time() - start) * 1000) + if e.code in (429, 502, 503, 504): + wait = 2 ** attempt + print(f" [retry] {model_short}: HTTP {e.code} → sleeping {wait}s (attempt {attempt}/3)") + time.sleep(wait) + continue + return f"[HTTP {e.code}] {e.read().decode('utf-8', 'ignore')[:200]}", latency, {} + except urllib_error.URLError as e: + latency = int((time.time() - start) * 1000) + wait = 2 ** attempt + print(f" [retry] {model_short}: {e.reason} → sleeping {wait}s (attempt {attempt}/3)") + time.sleep(wait) + continue + except Exception as e: + latency = int((time.time() - start) * 1000) + return f"[ERROR] {type(e).__name__}: {str(e)[:200]}", latency, {} + + return "[FATAL] All retries exhausted", latency, {} # ================================================================ # EVALUATOR # ================================================================ def evaluate_response(response, expected_json, rubric_json): - """Rubric-based evaluation. Returns dict.""" + """Rubric-based evaluation. Returns dict with dimension scores mapped to rubric keys.""" expected = json.loads(expected_json) if isinstance(expected_json, str) else expected_json rubric = json.loads(rubric_json) if isinstance(rubric_json, str) else rubric_json resp_lower = (response or '').lower() lines = response.strip().split('\n') - + + # 1. 
Keyword coverage (generic) keyword_hits = sum(1 for kw in expected if kw.lower() in resp_lower) - keyword_score = min(100, (keyword_hits / len(expected) * 100) if expected else 50) - - has_code = '```' in response or 'function' in resp_lower or 'class ' in resp_lower - code_score = 80 if has_code else 30 - - structure_score = min(100, len(lines) * 2) # ~50 lines = 100 - - scores = {'keyword_coverage': round(keyword_score, 1), - 'code_presence': code_score, - 'structure': round(structure_score, 1)} - + keyword_score = min(100, (keyword_hits / max(1, len(expected)) * 100)) + + # 2. Code presence + has_code = '```' in response or 'function' in resp_lower or 'class ' in resp_lower or 'def ' in resp_lower + code_score = 100 if has_code else 20 + + # 3. Structure (response depth) + structure_score = min(100, max(10, len(lines) * 2)) + + # 4. Actionability (does it suggest fixes/actions?) + actionability = 0 + if any(w in resp_lower for w in ['fix', 'suggest', 'recommend', 'should', 'refactor', 'replace']): + actionability = 85 + elif any(w in resp_lower for w in ['use', 'add', 'remove', 'change', 'improve', 'consider']): + actionability = 60 + + # 5. Depth (content length, capped) + depth = min(100, len(response) / 40) + + # 6. Relevance (does response mention role-specific terms?) 
+ relevance = min(100, keyword_score * 0.8 + 20) + + # Map rubrics to actual computed scores via heuristics + generic_scores = { + 'keyword_coverage': round(keyword_score, 1), + 'code_presence': code_score, + 'structure': round(structure_score, 1), + 'actionability': round(actionability, 1), + 'depth': round(depth, 1), + 'relevance': round(relevance, 1), + # Rubric-specific mappings (fallback chain) + 'security': max(keyword_score, code_score, actionability) if any(k in resp_lower for k in ['sql', 'inject', 'xss', 'csrf']) else round(keyword_score * 0.7, 1), + 'logic': round(structure_score * 0.8, 1), + 'correctness': round((code_score + keyword_score) / 2, 1), + 'completeness': round((keyword_score + structure_score) / 2, 1), + 'thoroughness': round((keyword_score + depth) / 2, 1), + 'clarity': round(structure_score * 0.9, 1), + 'coverage': keyword_score, + 'edge_cases': round((keyword_score + depth) / 2, 1), + 'readability': round(structure_score * 0.85, 1), + 'mocking': code_score if 'mock' in resp_lower else round(code_score * 0.5, 1), + 'plan_quality': round((keyword_score + structure_score) / 2, 1), + 'agent_selection': keyword_score, + 'risk_handling': actionability, + 'budget_awareness': keyword_score, + 'scalability': round(structure_score * 0.7, 1), + 'optimization': actionability, + } + total = 0 if rubric: for dim, weight in rubric.items(): - dim_score = scores.get(dim, keyword_score) + dim_score = generic_scores.get(dim, 50) total += (dim_score / 100) * weight else: - total = sum(scores.values()) / len(scores) - + total = sum(generic_scores.values()) / len(generic_scores) + explanation = (f"Keywords: {keyword_hits}/{len(expected)}. " f"Lines: {len(lines)}. " f"Code: {'YES' if has_code else 'NO'}. 
" f"Total={round(total, 1)}") - - return {'scores': scores, 'total': round(total, 1), 'explanation': explanation} + + return {'scores': generic_scores, 'total': round(total, 1), 'explanation': explanation} # ================================================================ # PARALLEL BATCH EVALUATION @@ -324,36 +376,108 @@ def evaluate_one(args): agent_name, model, pid, system, user, expected, rubric = args resp, latency, tokens = call_ollama(model, system, user, expected) ev = evaluate_response(resp, expected, rubric) + is_error = not resp or resp.startswith('[') return { 'agent': agent_name, 'model': model, 'prompt_id': pid, 'response': resp, 'latency': latency, 'tokens': tokens, 'total': ev['total'], 'scores': json.dumps(ev['scores']), - 'explanation': ev['explanation'] + 'explanation': ev['explanation'], 'is_error': is_error } -def evaluate_all(models_to_test, max_workers=4): - """Evaluate all agents × all models with parallel workers.""" +def _should_skip(agent_name, model): + """Check if we already have a non-error evaluation for this agent × model.""" conn = sqlite3.connect(DB_PATH) - agents = conn.execute("SELECT DISTINCT name FROM agents").fetchall() + row = conn.execute(''' + SELECT total_score FROM evaluations + WHERE agent_name = ? AND model = ? AND response IS NOT NULL + AND response NOT LIKE '[%' AND LENGTH(response) > 0 + LIMIT 1''', (agent_name, model)).fetchone() + conn.close() + return row[0] if row else None + + +def evaluate_single(agent_name, model, conn=None): + """Evaluate one agent × model. Reuses optional open connection.""" + close_conn = False + if conn is None: + conn = sqlite3.connect(DB_PATH) + close_conn = True + + prompts = conn.execute(''' + SELECT id, system_prompt, user_prompt, expected_keywords, rubric + FROM test_prompts WHERE agent_name = ? 
+ ''', (agent_name,)).fetchall() + if close_conn: + conn.close() + + results = [] + for pid, sys, usr, exp, rub in prompts: + res = evaluate_one((agent_name, model, pid, sys, usr, exp, rub)) + if res.get('is_error'): + print(f" [SKIP] {agent_name} × {model}: error response — {res['response'][:200]}") + continue + conn = sqlite3.connect(DB_PATH) + conn.execute('''INSERT INTO evaluations + (agent_name, model, prompt_id, response, latency_ms, tokens_prompt, tokens_response, + scores, total_score, explanation, evaluated_at, evaluator) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?)''', + (res['agent'], res['model'], res['prompt_id'], res['response'], res['latency'], + res['tokens']['prompt'], res['tokens']['response'], + res['scores'], res['total'], res['explanation'], + datetime.now(timezone.utc).isoformat(), 'rubric_v1')) + conn.commit() + conn.close() + print(f" [{res['agent']}] × [{res['model']}] score={res['total']:.1f}") + results.append(res) + return results + + +def evaluate_all(models_to_test, max_workers=4, agent_filter=None): + """Evaluate agents × models with parallel workers. + + Args: + models_to_test: list of model name strings (e.g. ['kimi-k2.6', 'glm-5.1']) + max_workers: thread pool size + agent_filter: optional agent name to limit evaluation to one agent + """ + if isinstance(models_to_test, dict): + print("[error] evaluate_all received a dict instead of a list. 
" + "Use --evaluate-all --models m1,m2 for all agents, or pass a list.") + sys.exit(1) + + conn = sqlite3.connect(DB_PATH) + if agent_filter: + agents = [(agent_filter,)] + else: + agents = conn.execute("SELECT DISTINCT name FROM agents").fetchall() tasks = [] - + for (agent_name,) in agents: + for model in models_to_test: + existing = _should_skip(agent_name, model) + if existing is not None: + print(f" Already evaluated: {agent_name} × {model} = {existing:.1f} (skipping)") + continue prompts = conn.execute(''' SELECT id, system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts WHERE agent_name = ?''', (agent_name,)).fetchall() for pid, sys, usr, exp, rub in prompts: for model in models_to_test: - tasks.append((agent_name, model, pid, sys, usr, exp, rub)) - + if _should_skip(agent_name, model) is None: + tasks.append((agent_name, model, pid, sys, usr, exp, rub)) + conn.close() - + print(f"[eval] Prepared {len(tasks)} evaluations (agents × models × prompts)") - + results = [] with ThreadPoolExecutor(max_workers=max_workers) as ex: futures = {ex.submit(evaluate_one, t): t for t in tasks} for future in as_completed(futures): res = future.result() + if res.get('is_error'): + print(f" [SKIP] {res['agent']} × {res['model']}: error response — {res['response'][:200]}") + continue results.append(res) conn = sqlite3.connect(DB_PATH) conn.execute('''INSERT INTO evaluations @@ -363,11 +487,11 @@ def evaluate_all(models_to_test, max_workers=4): (res['agent'], res['model'], res['prompt_id'], res['response'], res['latency'], res['tokens']['prompt'], res['tokens']['response'], res['scores'], res['total'], res['explanation'], - datetime.now(timezone.utc).isoformat(), 'rubric_v1')) + datetime.now(timezone.utc).isoformat(), 'rubric_v1')) conn.commit() conn.close() print(f" [{res['agent']}] × [{res['model']}] score={res['total']:.1f}") - + print(f"[eval] Stored {len(results)} evaluations") compute_aggregates() @@ -485,7 +609,10 @@ def generate_report(): 'fit_scores': 
fit_scores } - out = 'agent-evolution/data/real-fit-report.json' + out = os.environ.get('REPORT_PATH', 'agent-evolution/data/real-fit-report.json') + out_dir = os.path.dirname(out) + if out_dir: + os.makedirs(out_dir, exist_ok=True) with open(out, 'w') as f: json.dump(report, f, ensure_ascii=False, indent=2) @@ -498,7 +625,8 @@ def generate_report(): # ================================================================ def import_from_evolution(): - with open('agent-evolution/data/evolution.json') as f: + evo_path = os.environ.get('EVOLUTION_PATH', 'agent-evolution/data/evolution.json') + with open(evo_path) as f: evo = json.load(f) conn = sqlite3.connect(DB_PATH) for name, a in evo['agents'].items(): @@ -546,7 +674,12 @@ if __name__ == '__main__': generate_prompts() if args.evaluate: models = args.models.split(',') - evaluate_all({args.evaluate: models}, args.workers) + for model in models: + existing = _should_skip(args.evaluate, model) + if existing is not None: + print(f"Already evaluated: {args.evaluate} x {model} = {existing:.1f} (skipping)") + continue + evaluate_single(args.evaluate, model) if args.evaluate_all: models = args.models.split(',') evaluate_all(models, args.workers) @@ -559,7 +692,7 @@ if __name__ == '__main__': p.print_help() print("\n=== Workflow ===") print(" python3 real-fit-engine.py --init-db --import-evolution --generate-prompts") - print(" python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro-max") + print(" python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro") print(" python3 real-fit-engine.py --report") print(" python3 real-fit-engine.py --recalc --agent lead-developer --old-model qwen3-coder:480b --new-model kimi-k2.6") print("\nSet OLLAMA_MOCK=0 for real Ollama API (port 11434)") diff --git a/scripts/real-fit-recalc.py b/scripts/real-fit-recalc.py deleted file mode 100644 index 8962fa6..0000000 --- a/scripts/real-fit-recalc.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python3 -""" 
-Recalculate real-fit scores from stored responses in SQLite. -No API needed. Updates evaluations, fit_scores, and generates report. -Usage: python3 scripts/real-fit-recalc.py -""" -import sqlite3, json, os, sys -from datetime import datetime, timezone - -DB_PATH = "agent-evolution/data/real-fit.db" -REPORT_PATH = "agent-evolution/data/real-fit-report.json" - - -def evaluate_response(response, expected_json, rubric_json): - expected = json.loads(expected_json) if isinstance(expected_json, str) else expected_json - rubric = json.loads(rubric_json) if isinstance(rubric_json, str) else rubric_json - resp_lower = (response or '').lower() - lines = response.strip().split('\n') - - keyword_hits = sum(1 for kw in expected if kw.lower() in resp_lower) - keyword_score = min(100, (keyword_hits / len(expected) * 100) if expected else 50) - - has_code = '```' in response or 'function' in resp_lower or 'class ' in resp_lower - code_score = 80 if has_code else 30 - - structure_score = min(100, len(lines) * 2) - - scores = {'keyword_coverage': round(keyword_score, 1), - 'code_presence': code_score, - 'structure': round(structure_score, 1)} - - total = 0 - if rubric: - for dim, weight in rubric.items(): - dim_score = scores.get(dim, keyword_score) - total += (dim_score / 100) * weight - else: - total = sum(scores.values()) / len(scores) - - explanation = (f"Keywords: {keyword_hits}/{len(expected)}. " - f"Lines: {len(lines)}. " - f"Code: {'YES' if has_code else 'NO'}. 
" - f"Total={round(total, 1)}") - - return {'scores': scores, 'total': round(total, 1), 'explanation': explanation} - - -def recalc(): - if not os.path.exists(DB_PATH): - print(f"[error] Database not found: {DB_PATH}") - sys.exit(1) - - conn = sqlite3.connect(DB_PATH) - c = conn.cursor() - - # Fetch all evaluations with prompt data resolved by agent_name (prompt_id mismatch safe) - c.execute('''SELECT e.id, e.agent_name, e.response, e.total_score, e.scores, e.explanation, - t.expected_keywords, t.rubric - FROM evaluations e - LEFT JOIN test_prompts t ON e.agent_name = t.agent_name''') - rows = c.fetchall() - print(f"[recalc] Found {len(rows)} evaluations") - - updated = 0 - for eid, agent_name, response, old_total, old_scores, old_exp, expected, rubric in rows: - if expected is None or rubric is None: - print(f" [skip] No prompt match for eval {eid} (agent={agent_name})") - continue - - ev = evaluate_response(response, expected, rubric) - - new_scores = json.dumps(ev['scores']) - new_total = ev['total'] - new_exp = ev['explanation'] - - c.execute('''UPDATE evaluations - SET total_score = ?, scores = ?, explanation = ? - WHERE id = ?''', - (new_total, new_scores, new_exp, eid)) - updated += 1 - - conn.commit() - print(f"[recalc] Updated {updated} evaluations") - - # Compute aggregates - c.execute('''SELECT agent_name, model, AVG(total_score) as avg_score - FROM evaluations GROUP BY agent_name, model''') - rows = c.fetchall() - - best = {} - for a, m, s in rows: - if a not in best or s > best[a][1]: - best[a] = (m, s) - - for a, (m, s) in best.items(): - c.execute('SELECT scores FROM evaluations WHERE agent_name = ? AND model = ?', (a, m)) - dims = c.fetchall() - dim_avg = {} - for (score_json,) in dims: - for k, v in json.loads(score_json).items(): - dim_avg[k] = dim_avg.get(k, 0) + v - dim_avg = {k: round(v / len(dims), 1) for k, v in dim_avg.items()} - - explanation = f"Best model for {a} is {m} with avg score {round(s,1)}. 
" - explanation += f"Strongest dimension: {max(dim_avg, key=dim_avg.get)}." - - c.execute('''INSERT OR REPLACE INTO fit_scores - (agent_name, model, fit_score, dimension_scores, explanation, evaluated_at) - VALUES (?, ?, ?, ?, ?, ?)''', - (a, m, round(s, 1), json.dumps(dim_avg), explanation, - datetime.now(timezone.utc).isoformat())) - - conn.commit() - print(f"[recalc] Computed fit scores for {len(best)} agents") - - # Generate report - c.execute('''SELECT agent_name, model, AVG(total_score) as avg_score, COUNT(*) as cnt - FROM evaluations GROUP BY agent_name, model''') - rows = c.fetchall() - - agents = {} - for a, m, s, cnt in rows: - if a not in agents: - c.execute('SELECT description, category, current_model FROM agents WHERE name = ?', (a,)) - info = c.fetchone() - agents[a] = {'name': a, 'evaluations': {}, 'info': info or ()} - agents[a]['evaluations'][m] = round(s, 1) - - for a in agents: - evs = agents[a]['evaluations'] - best_m = max(evs, key=evs.get) - agents[a]['best_model'] = best_m - agents[a]['best_score'] = evs[best_m] - - c.execute('SELECT agent_name, model, fit_score, explanation FROM fit_scores') - fit_scores = {} - for a, m, s, e in c.fetchall(): - fit_scores[a] = {'model': m, 'fit': s, 'explanation': e} - - report = { - 'generated': datetime.now(timezone.utc).isoformat(), - 'source': 'real-fit-engine', - 'total_evaluations': len(rows), - 'agents': agents, - 'fit_scores': fit_scores - } - - os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True) - with open(REPORT_PATH, 'w') as f: - json.dump(report, f, ensure_ascii=False, indent=2) - - print(f"[recalc] Written {REPORT_PATH}: {len(agents)} agents, {len(rows)} evaluations") - conn.close() - - -if __name__ == '__main__': - recalc() diff --git a/scripts/run-focused-eval.py b/scripts/run-focused-eval.py new file mode 100644 index 0000000..085cdf0 --- /dev/null +++ b/scripts/run-focused-eval.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +""" +Focused Real-Fit Eval Runner v2 +Evaluates key agents × 
models using real-fit-engine.py (the fixed version). +""" +import sqlite3, json, os, sys, importlib.util +from datetime import datetime, timezone +from concurrent.futures import ThreadPoolExecutor, as_completed + +os.environ.setdefault("OLLAMA_KEY", "feaa56e2dff045af989346ca74cb33a6.xzJ-plOVSgTL1FbmL8PZZ3Wx") +os.environ.setdefault("OLLAMA_HOST", "https://ollama.com/v1") + +# Import the dash-named real-fit-engine.py via importlib +_spec = importlib.util.spec_from_file_location("rfe", os.path.join(os.path.dirname(__file__), "real-fit-engine.py")) +_rfe = importlib.util.module_from_spec(_spec) +_spec.loader.exec_module(_rfe) + +call_ollama = _rfe.call_ollama +evaluate_response = _rfe.evaluate_response +compute_aggregates = _rfe.compute_aggregates +generate_report = _rfe.generate_report +DB_PATH = _rfe.DB_PATH + +AGENTS = [ + 'code-skeptic', + 'lead-developer', + 'system-analyst', + 'sdet-engineer', + 'orchestrator', + 'devops-engineer', + 'workflow-cross-checker', +] + +MODELS = [ + 'kimi-k2.6', + 'deepseek-v4-pro-max', + 'qwen3-coder:480b', + 'glm-5.1', +] + +def fetch_agent_tasks(): + conn = sqlite3.connect(DB_PATH) + placeholders = ','.join('?' 
* len(AGENTS)) + rows = conn.execute(f""" + SELECT id, agent_name, system_prompt, user_prompt, expected_keywords, rubric + FROM test_prompts WHERE agent_name IN ({placeholders}) + """, tuple(AGENTS)).fetchall() + conn.close() + return rows + +def eval_single(agent_name, model, prompt_id, system, user, expected_json, rubric_json): + resp, latency, tokens = call_ollama(model, system, user) + ev = evaluate_response(resp, expected_json, rubric_json) + return { + 'agent': agent_name, + 'model': model, + 'prompt_id': prompt_id, + 'response_text': resp[:3000], + 'latency_ms': latency, + 'tokens_prompt': tokens['prompt'], + 'tokens_response': tokens['response'], + 'total_score': ev['total'], + 'scores_json': json.dumps(ev['scores']), + 'explanation': ev['explanation'], + 'evaluated_at': datetime.now(timezone.utc).isoformat(), + 'evaluator': 'rubric_v2' + } + +def save_single(res): + conn = sqlite3.connect(DB_PATH) + conn.execute(""" + INSERT INTO evaluations + (agent_name, model, prompt_id, response, latency_ms, tokens_prompt, tokens_response, + scores, total_score, explanation, evaluated_at, evaluator) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?) 
+ """, (res['agent'], res['model'], res['prompt_id'], res['response_text'], res['latency_ms'], + res['tokens_prompt'], res['tokens_response'], + res['scores_json'], res['total_score'], res['explanation'], + res['evaluated_at'], res['evaluator'])) + conn.commit() + conn.close() + print(f" [{res['agent']}] x [{res['model']}] score={res['total_score']:.1f} lat={res['latency_ms']}ms len={len(res['response_text'])}") + +def run_focused_eval(max_workers=4): + tasks = fetch_agent_tasks() + print(f"[focused] Agents: {len(AGENTS)} | Models: {len(MODELS)} | Prompts: {len(tasks)}") + print(f"[focused] Total evaluations: {len(tasks) * len(MODELS)}") + + work_items = [] + for pid, aname, system, user, exp_json, rub_json in tasks: + for model in MODELS: + work_items.append((aname, model, pid, system, user, exp_json, rub_json)) + + completed = 0 + errors = 0 + with ThreadPoolExecutor(max_workers=max_workers) as ex: + futures = {ex.submit(eval_single, *w): w for w in work_items} + for future in as_completed(futures): + try: + res = future.result() + save_single(res) + completed += 1 + if completed % 4 == 0: + print(f"[focused] Progress: {completed}/{len(work_items)}") + except Exception as e: + import traceback + traceback.print_exc() + errors += 1 + + print(f"[focused] Completed {completed}/{len(work_items)} (errs={errors})") + compute_aggregates() + +if __name__ == '__main__': + print("="*60) + print("FOCUSED REAL-FIT EVALUATION v2") + print(f"Models: {', '.join(MODELS)}") + print(f"Agents: {', '.join(AGENTS)}") + print(f"API: {os.environ['OLLAMA_HOST']}") + print("="*60) + + # Clean old evaluations + conn = sqlite3.connect(DB_PATH) + conn.execute("DELETE FROM evaluations WHERE agent_name IN ({})".format(','.join(f"'{a}'" for a in AGENTS))) + conn.execute("DELETE FROM fit_scores WHERE agent_name IN ({})".format(','.join(f"'{a}'" for a in AGENTS))) + conn.commit() + conn.close() + print("[focused] Cleaned old evaluations") + + run_focused_eval(max_workers=4) + report = 
generate_report() + print(f"[focused] Report generated with {len(report.get('agents',{}))} agents") diff --git a/scripts/test_ollama_minimal.py b/scripts/test_ollama_minimal.py new file mode 100644 index 0000000..e3d2d1e --- /dev/null +++ b/scripts/test_ollama_minimal.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python3 +import urllib.request, json, os, time + +def call_ollama_real(model_short, system_prompt, user_prompt): + key = os.environ.get("OLLAMA_KEY", "") + host = "https://ollama.com/v1" + + payload = json.dumps({ + "model": model_short, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + "temperature": 0.3, + "max_tokens": 2048 + }).encode() + + req = urllib.request.Request( + f"{host}/chat/completions", + data=payload, + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {key}" if key else "Bearer", + "User-Agent": "Mozilla/5.0" + }, + method="POST" + ) + + start = time.time() + try: + with urllib.request.urlopen(req, timeout=120) as resp: + data = json.loads(resp.read().decode()) + text = data.get("choices", [{}])[0].get("message", {}).get("content", "") + usage = data.get("usage", {}) + elapsed = int((time.time() - start) * 1000) + print(f"Status: {resp.status}") + print(f"Latency: {elapsed}ms") + print(f"Tokens: prompt={usage.get('prompt_tokens')}, completion={usage.get('completion_tokens')}") + return text + except urllib.error.HTTPError as e: + body = e.read().decode()[:200] + print(f"HTTP Error: {e.code} {e.reason}") + print(f"Body: {body}") + return "" + except Exception as e: + print(f"Error: {e}") + return "" + +if __name__ == "__main__": + print("=== Test real Ollama API ===") + text = call_ollama_real( + "kimi-k2.6", + "You are a code reviewer. 
Find bugs.", + "Review: def f(x): return x+1" + ) + print(f"\nResponse (first 300 chars):\n{text[:300]}") + print(f"\nTotal length: {len(text)} chars") + print(f"Keyword 'naming' in response: {'naming' in text.lower()}") + print(f"Keyword 'return' in response: {'return' in text.lower()}") diff --git a/scripts/test_real_api.py b/scripts/test_real_api.py new file mode 100644 index 0000000..ab1cfce --- /dev/null +++ b/scripts/test_real_api.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +import sys, os +os.environ.setdefault("OLLAMA_KEY", "feaa56e2dff045af989346ca74cb33a6.xzJ-plOVSgTL1FbmL8PZZ3Wx") +os.environ.setdefault("OLLAMA_HOST", "https://api.ollama.com") + +sys.path.insert(0, "scripts") +from real_fit_engine import call_ollama, evaluate_response, init_db, import_from_evolution, generate_prompts +import sqlite3 + +init_db() +import_from_evolution() +generate_prompts() + +conn = sqlite3.connect("agent-evolution/data/real-fit.db") +row = conn.execute("SELECT system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts WHERE agent_name = ?", ("code-skeptic",)).fetchone() +conn.close() + +if row: + system, user, expected, rubric = row + print("=== REAL Ollama: code-skeptic x kimi-k2.6 ===") + resp, latency, tokens = call_ollama("kimi-k2.6", system, user, expected) + print(f"Latency: {latency}ms") + print(f"Tokens: {tokens}") + print("Response (first 300 chars):") + print(resp[:300]) + print("\n...") + ev = evaluate_response(resp, expected, rubric) + print(f"Score: {ev['total']:.1f}") + print(f"Explanation: {ev['explanation']}") +else: + print("No prompt found for code-skeptic") diff --git a/tests/scripts/capture-analytics-section.js b/tests/scripts/capture-analytics-section.js new file mode 100644 index 0000000..c37b554 --- /dev/null +++ b/tests/scripts/capture-analytics-section.js @@ -0,0 +1,89 @@ +#!/usr/bin/env node +/** + * Quick capture + element check for Analytics Hierarchy Section + */ + +const { chromium } = require('playwright'); +const fs = 
require('fs'); +const path = require('path'); + +const TARGET_URL = process.env.TARGET_URL || 'http://localhost:3002'; +const OUTPUT_DIR = process.env.OUTPUT_DIR || '/app/tests/visual/current'; + +(async () => { + if (!fs.existsSync(OUTPUT_DIR)) { + fs.mkdirSync(OUTPUT_DIR, { recursive: true }); + } + + const browser = await chromium.launch({ + headless: true, + args: ['--disable-setuid-sandbox', '--no-sandbox'], + }); + + const page = await browser.newPage({ + viewport: { width: 1280, height: 900 }, + }); + + console.log(`Navigating to: ${TARGET_URL}`); + await page.goto(TARGET_URL, { waitUntil: 'networkidle', timeout: 60000 }); + await page.waitForTimeout(3000); + + // Scroll to "Аналитическая иерархия" + const heading = page.locator('text=Аналитическая иерархия').first(); + if (await heading.isVisible().catch(() => false)) { + console.log('Scrolling to Аналитическая иерархия section...'); + await heading.scrollIntoViewIfNeeded(); + await page.evaluate(() => window.scrollBy(0, -60)); + await page.waitForTimeout(1500); + } else { + console.log('Heading not found, fallback scroll'); + await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight / 3)); + await page.waitForTimeout(1500); + } + + // Scroll further down to reveal cards 3 and 4 (heatmap, commands table) + await page.evaluate(() => window.scrollBy(0, 900)); + await page.waitForTimeout(1000); + + const screenshotPath = path.join(OUTPUT_DIR, 'analytics_section.png'); + await page.screenshot({ path: screenshotPath, fullPage: false }); + console.log(`Screenshot saved to: ${screenshotPath}`); + + // Check for each card's evidence (use Russian text as it appears in the page) + const checks = [ + { label: 'Model tree with collapsible categories', text: 'Модели → Категории → Агенты' }, + { label: 'Category bars', text: 'Дистрибуция по категориям' }, + { label: 'Fit-score heatmap', text: 'Fit-score распределение' }, + { label: 'Commands table', text: 'Команды' }, + ]; + + const results = { visible: {}, 
issues: [] }; + + for (const c of checks) { + const found = await page.locator(`text=${c.text}`).first().isVisible({ timeout: 3000 }).catch(() => false); + if (found) { + const textContent = await page.locator(`text=${c.text}`).first().textContent({ timeout: 3000 }).catch(() => ''); + results.visible[c.label] = textContent; + } else { + results.issues.push(`${c.label} (searching text "${c.text}") — NOT FOUND`); + } + } + + const reportPath = path.join(OUTPUT_DIR, 'analytics_section_report.json'); + fs.writeFileSync(reportPath, JSON.stringify(results, null, 2)); + console.log(`Report saved to: ${reportPath}`); + + // Also write summary to stdout + console.log('\n=== Scan Results ==='); + if (Object.keys(results.visible).length === 4) { + console.log('All 4 analytics cards are visible.'); + } else { + console.log(`Visible: ${Object.keys(results.visible).join(', ')}`); + console.log(`Missing: ${results.issues.join(', ')}`); + } + + await browser.close(); +})().catch((err) => { + console.error('Fatal error:', err); + process.exit(1); +}); diff --git a/tests/scripts/e2e-landing-test.js b/tests/scripts/e2e-landing-test.js new file mode 100644 index 0000000..7362ac0 --- /dev/null +++ b/tests/scripts/e2e-landing-test.js @@ -0,0 +1,381 @@ +#!/usr/bin/env node +/** + * E2E Test Suite for APAW Landing Page + * Tests: page load, console errors, API state, analytics, heatmap modal, + * close interactions, visual regression. 
+ * + * Usage: node e2e-landing-test.js + * Environment: TARGET_URL (default http://host.docker.internal:3002) + */ + +const { chromium } = require('playwright'); +const fs = require('fs'); +const path = require('path'); +const pixelmatch = require('pixelmatch'); +const { PNG } = require('pngjs'); +const { launchBrowser, newContext, navigateTo } = require('./lib/browser-launcher'); + +const TARGET_URL = process.env.TARGET_URL || 'http://host.docker.internal:3002'; +const REPORTS_DIR = process.env.REPORTS_DIR || path.join(__dirname, '..', 'reports'); +const BASELINE_DIR = process.env.BASELINE_DIR || path.join(__dirname, '..', 'visual', 'baseline'); +const CURRENT_DIR = process.env.CURRENT_DIR || path.join(__dirname, '..', 'visual', 'current'); + +const VIEWPORT = { width: 1280, height: 900 }; + +async function main() { + console.log('═══════════════════════════════════════════════════'); + console.log(' APAW Landing E2E Tests'); + console.log('═══════════════════════════════════════════════════\n'); + console.log(`Target: ${TARGET_URL}\n`); + + for (const dir of [REPORTS_DIR, CURRENT_DIR]) { + if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true }); + } + + const browser = await launchBrowser(); + const context = await newContext(browser, { viewport: VIEWPORT }); + const page = await context.newPage(); + + const consoleErrors = []; + const consoleWarnings = []; + const networkErrors = []; + let networkRequests = []; + + const results = []; + + page.on('console', msg => { + if (msg.type() === 'error') consoleErrors.push(msg.text()); + else if (msg.type() === 'warning') consoleWarnings.push(msg.text()); + }); + + page.on('requestfailed', request => { + networkErrors.push({ url: request.url(), failure: request.failure()?.errorText || 'Unknown' }); + }); + + page.on('response', response => { + if (response.status() >= 400) { + networkErrors.push({ url: response.url(), status: response.status() }); + } + }); + + // 
============================================================ + // Test 1: Page loads without console errors + // ============================================================ + { + console.log('┌─────────────────────────────────────────────────┐'); + console.log('│ Test 1: Page loads without console errors │'); + console.log('└─────────────────────────────────────────────────┘'); + + try { + const response = await navigateTo(page, `${TARGET_URL}`, { waitUntil: 'commit', timeout: 30000, delay: 3000 }); + const status = response?.status() || 0; + + // Wait for analytics section to be present in DOM + await page.waitForSelector('#analytics', { timeout: 10000 }).catch(() => {}); + + const title = await page.title(); + const pageLoaded = status === 200 && title.includes('APAW'); + + result(results, '1_page_load', pageLoaded && consoleErrors.length === 0, `HTTP ${status}, title: "${title}", console errors: ${consoleErrors.length}`); + } catch (e) { + result(results, '1_page_load', false, e.message); + } + } + + // ============================================================ + // Test 2: /api/state loads successfully + // ============================================================ + { + console.log('\n┌─────────────────────────────────────────────────┐'); + console.log('│ Test 2: /api/state loads successfully │'); + console.log('└─────────────────────────────────────────────────┘'); + + try { + const apiResponse = await page.evaluate(async (url) => { + const res = await fetch(`${url}/api/state`); + const data = await res.json().catch(() => null); + return { status: res.status, ok: res.ok, hasAgents: !!(data && Array.isArray(data.agents) && data.agents.length > 0) }; + }, TARGET_URL); + + result(results, '2_api_state', apiResponse.ok && apiResponse.hasAgents, + `status=${apiResponse.status}, hasAgents=${apiResponse.hasAgents}`); + } catch (e) { + result(results, '2_api_state', false, e.message); + } + } + + // ============================================================ + 
// Test 3: #analytics section is visible with heatmap rendered + // ============================================================ + { + console.log('\n┌─────────────────────────────────────────────────┐'); + console.log('│ Test 3: #analytics visible + heatmap rendered │'); + console.log('└─────────────────────────────────────────────────┘'); + + try { + const analytics = await page.locator('#analytics').first(); + const isVisible = await analytics.isVisible().catch(() => false); + + // Scroll to analytics + await page.evaluate(() => { + const el = document.getElementById('analytics'); + if (el) el.scrollIntoView({ block: 'start' }); + }); + await page.waitForTimeout(800); + + const heatmap = await page.locator('#fit-heatmap').first(); + const heatmapVisible = await heatmap.isVisible().catch(() => false); + const cellCount = await heatmap.locator('.heatmap__cell').count().catch(() => 0); + + result(results, '3_analytics_heatmap', isVisible && heatmapVisible && cellCount > 0, + `analytics visible=${isVisible}, heatmap visible=${heatmapVisible}, cells=${cellCount}`); + } catch (e) { + result(results, '3_analytics_heatmap', false, e.message); + } + } + + // ============================================================ + // Test 4: Clicking a heatmap cell opens #fit-modal + // ============================================================ + { + console.log('\n┌─────────────────────────────────────────────────┐'); + console.log('│ Test 4: Clicking heatmap cell opens #fit-modal │'); + console.log('└─────────────────────────────────────────────────┘'); + + try { + const cell = await page.locator('#fit-heatmap .heatmap__cell').first(); + await cell.scrollIntoViewIfNeeded(); + await page.waitForTimeout(500); + await cell.click(); + await page.waitForTimeout(500); + + const modal = await page.locator('#fit-modal').first(); + const modalVisible = await modal.isVisible().catch(() => false); + const isOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => 
false); + + result(results, '4_click_opens_modal', modalVisible && isOpen, + `modal visible=${modalVisible}, class is-open=${isOpen}`); + } catch (e) { + result(results, '4_click_opens_modal', false, e.message); + } + } + + // ============================================================ + // Test 5: Modal displays agent name, model, fit score, + // breakdown dimensions, and explanation + // ============================================================ + { + console.log('\n┌─────────────────────────────────────────────────┐'); + console.log('│ Test 5: Modal content (name, model, score, etc) │'); + console.log('└─────────────────────────────────────────────────┘'); + + try { + const modal = await page.locator('#fit-modal').first(); + const agentName = await modal.locator('#modal-agent-name').textContent().catch(() => ''); + const modelText = await modal.locator('#modal-model').textContent().catch(() => ''); + const scoreText = await modal.locator('#modal-score').textContent().catch(() => ''); + const explanation = await modal.locator('#modal-explanation').textContent().catch(() => ''); + const dims = await modal.locator('#modal-breakdown .modal__dimension').count().catch(() => 0); + + const nameOk = agentName.trim().length > 0 && agentName !== 'Agent'; + const modelOk = modelText.trim().length > 0; + const scoreOk = !isNaN(parseInt(scoreText, 10)) && parseInt(scoreText, 10) > 0; + const dimsOk = dims >= 4; + const explOk = explanation.trim().length > 0; + + result(results, '5_modal_content', + nameOk && modelOk && scoreOk && dimsOk && explOk, + `name="${agentName.trim()}", model="${modelText.trim()}", score="${scoreText.trim()}", dimensions=${dims}, explanation=${explOk ? 
'present' : 'missing'}`); + } catch (e) { + result(results, '5_modal_content', false, e.message); + } + } + + // ============================================================ + // Test 6: Modal can be closed via close button and Escape key + // ============================================================ + { + const modal = await page.locator('#fit-modal').first(); + + // 6a: close button + { + console.log('\n┌─────────────────────────────────────────────────┐'); + console.log('│ Test 6a: Close via close button │'); + console.log('└─────────────────────────────────────────────────┘'); + + try { + await modal.locator('.modal__close').click(); + await page.waitForTimeout(600); + // If CSS transition leaves it briefly visible, wait a tick + const isOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => true); + const visible = await modal.isVisible().catch(() => true); + result(results, '6_close_button', !isOpen && !visible, `is-open=${isOpen}, visible=${visible}`); + } catch (e) { + result(results, '6_close_button', false, e.message); + } + } + + // 6b: Escape key + { + console.log('\n┌─────────────────────────────────────────────────┐'); + console.log('│ Test 6b: Close via Escape key │'); + console.log('└─────────────────────────────────────────────────┘'); + + try { + // If modal is still open (bug), force close via JS + const stillOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => false); + if (stillOpen) await page.evaluate(() => { if (typeof closeFitModal === 'function') closeFitModal(); }); + await page.waitForTimeout(400); + + const cell = await page.locator('#fit-heatmap .heatmap__cell').first(); + await cell.evaluate(el => el.scrollIntoView({ block: 'center' })); + await cell.click({ force: true }); + await page.waitForTimeout(500); + await page.keyboard.press('Escape'); + await page.waitForTimeout(500); + const isOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => true); + 
const visible = await modal.isVisible().catch(() => true); + result(results, '6_escape_key', !isOpen && !visible, `is-open=${isOpen}, visible=${visible}`); + } catch (e) { + result(results, '6_escape_key', false, e.message); + } + } + } + + // ============================================================ + // Screenshot of opened modal + // ============================================================ + { + console.log('\n┌─────────────────────────────────────────────────┐'); + console.log('│ Capturing modal screenshot │'); + console.log('└─────────────────────────────────────────────────┘'); + + try { + const cell = await page.locator('#fit-heatmap .heatmap__cell').first(); + await cell.click(); + await page.waitForTimeout(600); + const modal = await page.locator('#fit-modal').first(); + const modalBox = await modal.boundingBox().catch(() => null); + + const screenshotPath = path.join(CURRENT_DIR, 'modal_opened.png'); + if (modalBox) { + await page.screenshot({ path: screenshotPath, clip: modalBox }); + } else { + await page.screenshot({ path: screenshotPath }); + } + console.log(` ✅ Screenshot saved: ${screenshotPath}`); + result(results, 'screenshot_modal', true, screenshotPath); + } catch (e) { + console.log(` ❌ Screenshot failed: ${e.message}`); + result(results, 'screenshot_modal', false, e.message); + } + } + + // ============================================================ + // Test 7: No visual regressions from baseline + // ============================================================ + { + console.log('\n┌─────────────────────────────────────────────────┐'); + console.log('│ Test 7: Visual regression (baseline vs current) │'); + console.log('└─────────────────────────────────────────────────┘'); + + const baselinePath = path.join(BASELINE_DIR, 'homepage_desktop.png'); + const currentPath = path.join(CURRENT_DIR, 'homepage_desktop.png'); + + // Capture current homepage for comparison + try { + await navigateTo(page, `${TARGET_URL}`, { waitUntil: 'commit', 
delay: 3000 }); + await page.screenshot({ path: currentPath, fullPage: true }); + } catch (e) { + console.log(` ⚠️ Could not capture current screenshot: ${e.message}`); + } + + if (!fs.existsSync(baselinePath)) { + console.log(` ⚠️ Baseline not found at ${baselinePath}`); + result(results, '7_visual_regression', null, 'SKIP: baseline missing'); + } else if (!fs.existsSync(currentPath)) { + result(results, '7_visual_regression', false, 'Current screenshot capture failed'); + } else { + try { + const baseline = PNG.sync.read(fs.readFileSync(baselinePath)); + const current = PNG.sync.read(fs.readFileSync(currentPath)); + + if (baseline.width !== current.width || baseline.height !== current.height) { + result(results, '7_visual_regression', false, `Size mismatch: ${baseline.width}x${baseline.height} vs ${current.width}x${current.height}`); + } else { + const diff = new PNG({ width: baseline.width, height: baseline.height }); + const numDiff = pixelmatch(baseline.data, current.data, diff.data, baseline.width, baseline.height, { threshold: 0.1 }); + const diffPercent = (numDiff / (baseline.width * baseline.height)) * 100; + + const passed = diffPercent <= 5.0; // 5% tolerance + result(results, '7_visual_regression', passed, `diff pixels=${numDiff} (${diffPercent.toFixed(2)}%)`); + + if (!passed) { + const diffPath = path.join(CURRENT_DIR, 'homepage_desktop_diff.png'); + fs.writeFileSync(diffPath, PNG.sync.write(diff)); + console.log(` 📸 Diff saved: ${diffPath}`); + } + } + } catch (e) { + result(results, '7_visual_regression', false, e.message); + } + } + } + + await context.close(); + await browser.close(); + + // ============================================================ + // Summary + // ============================================================ + console.log('\n═══════════════════════════════════════════════════'); + console.log(' Results Summary'); + console.log('═══════════════════════════════════════════════════\n'); + + for (const r of results) { + const icon 
= r.pass === true ? '✅' : r.pass === false ? '❌' : '⏭️'; + console.log(`${icon} ${r.name}`); + console.log(` ${r.detail}`); + } + + console.log(`\n📊 Console errors: ${consoleErrors.length}`); + console.log(`📊 Console warnings: ${consoleWarnings.length}`); + console.log(`📊 Network errors: ${networkErrors.length}`); + + const failures = results.filter(r => r.pass === false); + const passed = results.filter(r => r.pass === true); + const skipped = results.filter(r => r.pass === null); + + console.log(`\n✅ Passed: ${passed.length}`); + console.log(`❌ Failed: ${failures.length}`); + console.log(`⏭️ Skipped: ${skipped.length}`); + + const reportPath = path.join(REPORTS_DIR, 'e2e-landing-report.json'); + fs.writeFileSync(reportPath, JSON.stringify({ + timestamp: new Date().toISOString(), + targetUrl: TARGET_URL, + results, + summary: { + passed: passed.length, + failed: failures.length, + skipped: skipped.length, + consoleErrors: consoleErrors.length, + consoleWarnings: consoleWarnings.length, + networkErrors: networkErrors.length, + }, + }, null, 2)); + console.log(`\n📄 Report: ${reportPath}`); + + process.exit(failures.length > 0 ? 1 : 0); +} + +function result(list, name, pass, detail) { + list.push({ name, pass, detail }); + const icon = pass === true ? '✅' : pass === false ? 
'❌' : '⏭️'; + console.log(` ${icon} ${name}: ${detail}`); +} + +main().catch(err => { + console.error('Fatal:', err); + process.exit(1); +}); diff --git a/tests/scripts/verify-evolution-heatmap.js b/tests/scripts/verify-evolution-heatmap.js new file mode 100644 index 0000000..1e47fb2 --- /dev/null +++ b/tests/scripts/verify-evolution-heatmap.js @@ -0,0 +1,79 @@ +const { chromium } = require('playwright'); +const fs = require('fs'); +const path = require('path'); + +const TARGET = process.env.TARGET_URL || 'http://host.docker.internal:3003'; +const OUT_DIR = process.env.OUT_DIR || path.join(__dirname, '..', 'reports'); + +(async () => { + if (!fs.existsSync(OUT_DIR)) fs.mkdirSync(OUT_DIR, { recursive: true }); + + const browser = await chromium.launch({ headless: true }); + const context = await browser.newContext({ viewport: { width: 1600, height: 1200 } }); + const page = await context.newPage(); + + // Capture console & network errors + const consoleErrors = []; + const networkErrors = []; + page.on('console', msg => { if (msg.type() === 'error') consoleErrors.push(msg.text()); }); + page.on('requestfailed', req => networkErrors.push({ url: req.url(), error: req.failure()?.errorText })); + page.on('response', res => { if (res.status() >= 400) networkErrors.push({ url: res.url(), status: res.status() }); }); + + console.log('[HEATMAP] Navigating to', TARGET); + await page.goto(TARGET, { waitUntil: 'domcontentloaded', timeout: 30000 }); + await page.waitForTimeout(1500); // wait for fetch/dashboard-data + + const tabBtn = page.locator('button.tab-btn', { hasText: /Heatmap/ }).first(); + if (await tabBtn.count()) { + await tabBtn.click(); + console.log('[HEATMAP] Clicked Heatmap tab'); + } else { + console.log('[HEATMAP] No Heatmap tab found, tabs may already be active'); + } + + await page.waitForTimeout(2000); // let table build from JS + + // Get table dimensions + const rows = await page.locator('#hmTable tbody tr').count().catch(() => 0); + const colCount = 
await page.locator('#hmTable thead th').count().catch(() => 0); + console.log(`[HEATMAP] Table: ${rows} rows, ${colCount} columns`); + + // Screenshot full page of heatmap tab + const screenshotPath = path.join(OUT_DIR, 'heatmap.png'); + await page.screenshot({ path: screenshotPath, fullPage: true }); + console.log('[HEATMAP] Screenshot saved to', screenshotPath); + + // Also screenshot just the table if possible + const tableScreenshotPath = path.join(OUT_DIR, 'heatmap-table.png'); + const tableEl = page.locator('#hmTable').first(); + if (await tableEl.count() && rows > 0) { + await tableEl.screenshot({ path: tableScreenshotPath }); + console.log('[HEATMAP] Table screenshot saved to', tableScreenshotPath); + } + + // Read cell data + const cellTexts = await page.locator('#hmTable tbody td').allTextContents().catch(() => []); + console.log('[HEATMAP] First 30 cell texts:', cellTexts.slice(0, 30).map(t => t.trim())); + + // Dump innerHTML + const innerHTML = await page.locator('#hmTable').innerHTML().catch(() => null); + + // Report + const report = { + target: TARGET, + table: { rows, colCount }, + cellSamples: cellTexts.slice(0, 30).map(t => t.trim()), + consoleErrors, + networkErrors, + screenshots: [screenshotPath, tableScreenshotPath].filter(f => fs.existsSync(f)), + innerHTML: innerHTML ? innerHTML.slice(0, 2000) : null, + ok: rows > 0 && colCount > 0, + }; + + const reportPath = path.join(OUT_DIR, 'heatmap-report.json'); + fs.writeFileSync(reportPath, JSON.stringify(report, null, 2)); + console.log('[HEATMAP] Report saved to', reportPath); + + await browser.close(); + process.exit(report.ok ? 0 : 1); +})();