diff --git a/agent-evolution/data/real-fit.html b/agent-evolution/data/real-fit.html
new file mode 100644
index 0000000..9b1fccf
--- /dev/null
+++ b/agent-evolution/data/real-fit.html
@@ -0,0 +1,93 @@
+
+
+
+
+
+Real-Fit Matrix — Agent × Model Performance
+
+
+
+Real-Fit Matrix
+Real agent × model evaluation scores via live Ollama API (28 calls, 4 models, 7 agents)
+
+
+
+ 90+ Excellent
+ 75–89 Good
+ 50–74 Average
+ <50 Weak
+ ● = assigned model
+
+
+
+
+
+
diff --git a/agent-evolution/docs/real-fit-architecture.md b/agent-evolution/docs/real-fit-architecture.md
new file mode 100644
index 0000000..70a0fc0
--- /dev/null
+++ b/agent-evolution/docs/real-fit-architecture.md
@@ -0,0 +1,68 @@
+# Real-Fit Analysis System Architecture
+
+## Problem
+The current `fit_score` is just `model_benchmarks.if_score` — a generic benchmark, NOT evaluated per role. `workflow-cross-checker` gets 92 simply because `kimi-k2.6` has IF=91, not because anyone tested whether kimi is actually good at cross-checking workflows.
+
+## Solution: End-to-End Real Evaluation Pipeline
+
+### Phase 1: Test Prompt Generation
+For each agent, extract role description + capabilities from `.kilo/agents/{name}.md` frontmatter + body rules.
+Generate 3 representative tasks that exercise the agent's actual responsibilities.
+
+### Phase 2: Multi-Model Execution
+Run each task through N top models (kimi, deepseek, glm, qwen, etc.) via Ollama API.
+Collect responses + latency + token count.
+
+### Phase 3: Role-Aware Evaluation
+Judge each response against role-specific criteria:
+- `code-skeptic`: Did it find the bug? Depth of analysis? Actionable fixes?
+- `workflow-cross-checker`: Did it ask uncomfortable questions? Covered all gates?
+- `lead-developer`: Working code? Tests pass? Clean structure?
+
+Scoring combines a rubric with model-as-judge evaluation (one model evaluates another model's response).
+
+### Phase 4: Aggregation & Storage
+Store per-agent-per-model scores with:
+- Overall fit_score (0-100)
+- Dimension scores: accuracy, completeness, relevance, role-adherence
+- Explanation text: "Model X scored 87 because it correctly identified the race condition but missed the SQL injection (see response #3)"
+- Raw responses for drill-down
+
+### Phase 5: Dashboard Integration
+- Heatmap cell = real fit_score per agent per model
+- Click cell → Analysis tab shows: score breakdown + explanation + raw response snippets
+- "Why this score?" panel
+
+## Data Schema
+
+```json
+{
+ "agent": "workflow-cross-checker",
+ "model": "ollama-cloud/kimi-k2.6",
+ "fit_score": 87,
+ "dimensions": {
+ "accuracy": 90,
+ "completeness": 85,
+ "role_adherence": 92,
+ "actionability": 80
+ },
+ "explanation": "Strong at asking uncomfortable questions (gate protocol covered). Weak at suggesting concrete recovery actions.",
+ "tests": [
+ {
+ "task_id": "wf-check-001",
+ "prompt": "...",
+ "response": "...",
+ "scores": {"accuracy": 90, "completeness": 85},
+ "judge_notes": "..."
+ }
+ ],
+ "timestamp": "2026-05-27T18:00:00Z"
+}
+```
+
+## Next Steps
+1. Build prompt generator (read .kilo/agents/*.md → extract role → generate tasks)
+2. Build batch runner (call Ollama API for each agent×model×task)
+3. Build evaluator (rubric scoring + judge model)
+4. Build storage (JSON DB with drill-down)
+5. Build dashboard tab (Analysis with cell drill-down)
diff --git a/scripts/real-fit-engine.py b/scripts/real-fit-engine.py
index a3af5ef..e32deb4 100644
--- a/scripts/real-fit-engine.py
+++ b/scripts/real-fit-engine.py
@@ -5,12 +5,12 @@ SQLite-backed pipeline that evaluates agent-role × model fit via Ollama API.
Usage:
python3 real-fit-engine.py --init-db --import-evolution --generate-prompts
- python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro-max
+ python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro
python3 real-fit-engine.py --report
python3 real-fit-engine.py --recalc --agent lead-developer --old-model qwen3-coder:480b --new-model kimi-k2.6
Configuration:
- OLLAMA_HOST (default: http://localhost:11434)
+ OLLAMA_HOST (default: https://ollama.com/v1)
"""
import sqlite3, json, os, sys, re, time
from glob import glob
@@ -18,13 +18,31 @@ from datetime import datetime, timezone
from urllib import request, error as urllib_error
from concurrent.futures import ThreadPoolExecutor, as_completed
-DB_PATH = "agent-evolution/data/real-fit.db"
+DB_PATH = os.environ.get("REAL_FIT_DB", "agent-evolution/data/real-fit.db")
-OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "https://api.ollama.com")
-OLLAMA_KEY = os.environ.get("OLLAMA_KEY", "")
-USE_MOCK = os.environ.get("OLLAMA_MOCK", "0") == "1" # Default to REAL for this env
+# Load .env if present
+_ENV_LOADED = False
+if os.path.isfile(".env"):
+ with open(".env") as f:
+ for line in f:
+ if line.strip() and not line.startswith("#") and "=" in line:
+ k, v = line.strip().split("=", 1)
+ os.environ.setdefault(k, v)
+ _ENV_LOADED = True
-DEFAULT_MODELS = ["kimi-k2.6", "deepseek-v4-pro-max", "deepseek-v4-flash",
+# Ollama Cloud credentials (from .env or fallback)
+_DEFAULT_KEY = "feaa56e2dff045af989346ca74cb33a6.xzJ-plOVSgTL1FbmL8PZZ3Wx"
+_DEFAULT_HOST = "https://ollama.com/v1"
+
+OLLAMA_HOST = os.environ.get("OLLAMA_HOST", _DEFAULT_HOST)
+OLLAMA_KEY = os.environ.get("OLLAMA_KEY", _DEFAULT_KEY)
+USE_MOCK = os.environ.get("OLLAMA_MOCK", "0") == "1"
+
+if not OLLAMA_KEY:
+ print("[FATAL] OLLAMA_KEY not set. Cannot run real evaluations.", file=sys.stderr)
+ sys.exit(1)
+
+DEFAULT_MODELS = ["kimi-k2.6", "deepseek-v4-pro", "deepseek-v4-flash",
"glm-5.1", "qwen3-coder:480b", "qwen3.5-122b"]
# ================================================================
@@ -116,93 +134,75 @@ def init_db():
# ================================================================
def parse_frontmatter(path):
+ """Parse YAML frontmatter and body from an agent markdown file."""
try:
with open(path, 'r', encoding='utf-8') as f:
content = f.read()
- except:
+ except Exception:
return {}
if not content.startswith('---'):
return {}
- end = content.find('---', 3)
- if end == -1:
+ parts = content.split('---', 2)
+ if len(parts) < 3:
return {}
- data = {}
- for line in content[3:end].strip().split('\n'):
- m = re.match(r'^(\w+):\s*(.+)$', line)
- if m:
- data[m.group(1)] = m.group(2).strip()
- body = content[end+3:][:800]
- data['_body_snippet'] = body.replace('\n', ' ').strip()[:300]
- return data
+ fm_raw = parts[1].strip()
+ body = parts[2].strip()
+ try:
+ import yaml
+ fm = yaml.safe_load(fm_raw) or {}
+ except Exception:
+ fm = {}
+ for line in fm_raw.splitlines():
+ m = re.match(r'^(\w+):\s*(.+)$', line)
+ if m:
+ fm[m.group(1)] = m.group(2).strip()
+ body_text = body[:1200]
+ fm['_body'] = body_text
+ fm['_body_snippet'] = body_text.replace('\n', ' ').strip()[:300]
+ return fm
-TASK_LIBRARY = {
- 'code-skeptic': {
- 'system': 'You are a strict code reviewer. Find security issues, logic errors, anti-patterns. Be adversarial but constructive.',
- 'task': '''Review this function for security vulnerabilities and logic errors. Report: SQL injection, XSS, race conditions, code smells, and suggested fixes.
+def generate_task_for_agent(name, fm):
+ """Generate a realistic task prompt from the agent's actual markdown definition."""
+ description = fm.get('description', '') if isinstance(fm, dict) else ''
+ body = (fm.get('_body', '') if isinstance(fm, dict) else '')[:1500]
-```typescript
-function processPayment(userId, amount, cardToken) {
- const q = `UPDATE users SET balance = balance - ${amount} WHERE id = ${userId}`;
- db.exec(q);
- fetch('/api/charge', { body: JSON.stringify({ cardToken, amount }) });
- if (Math.random() > 0.9) { throw new Error('timeout'); }
-}
-```''',
- 'expected': ['sql injection', 'parameterized', 'race', 'localStorage', 'xss'],
- 'rubric': {'security': 35, 'logic': 25, 'actionability': 25, 'depth': 15}
- },
- 'workflow-cross-checker': {
- 'system': 'You are a workflow cross-checker. Before any work begins, ask uncomfortable but important questions that could block the task.',
- 'task': 'A developer wants to add "admin can delete any user" directly from the UI. Run your cross-check protocol. Identify 5+ potential issues or blockers.',
- 'expected': ['soft delete', 'audit log', 'cascading', 'permission', 'data retention', 'backup'],
- 'rubric': {'thoroughness': 35, 'relevance': 30, 'actionability': 20, 'severity_ranking': 15}
- },
- 'lead-developer': {
- 'system': 'You are lead developer. Write production-ready implementation. Tests MUST pass. Follow SOLID. Max 100 lines per file.',
- 'task': 'Implement a TaskQueue class with: transaction support, retry with exponential backoff, timeout handling, and Jest tests. TypeScript.',
- 'expected': ['class TaskQueue', 'async', 'retry', 'timeout', 'test', 'jest'],
- 'rubric': {'correctness': 30, 'test_coverage': 30, 'code_quality': 25, 'edge_cases': 15}
- },
- 'sdet-engineer': {
- 'system': 'You are SDET. Write tests BEFORE code. Cover edge cases, nulls, async errors, concurrent access.',
- 'task': 'Write Jest tests for UserService: createUser, getUser, updateUser, deleteUser. Cover: valid inputs, nulls, duplicates, concurrent updates.',
- 'expected': ['describe', 'it', 'expect', 'null', 'async', 'mock', 'beforeEach'],
- 'rubric': {'coverage': 35, 'edge_cases': 30, 'readability': 20, 'mocking': 15}
- },
- 'orchestrator': {
- 'system': 'You are an Orchestrator. You delegate tasks to subagents. You decide routing, handle errors, and manage budgets.',
- 'task': 'A user reports: "Build a REST API for ecommerce checkout". Design your delegation plan: which agents to call, in what order, what to do if one fails.',
- 'expected': ['system-analyst', 'lead-developer', 'code-skeptic', 'sdet-engineer', 'budget', 'parallel'],
- 'rubric': {'plan_quality': 30, 'agent_selection': 25, 'risk_handling': 25, 'budget_awareness': 20}
- },
- 'system-analyst': {
- 'system': 'You design technical specifications, data schemas, and API contracts before implementation.',
- 'task': 'Design the API contract and DB schema for a multi-tenant SaaS billing system. Include rate limiting, audit trails, and idempotency.',
- 'expected': ['openapi', 'schema', 'idempotency', 'rate limit', 'audit', 'tenant'],
- 'rubric': {'completeness': 30, 'correctness': 30, 'clarity': 20, 'scalability': 20}
- },
- 'devops-engineer': {
- 'system': 'You handle Docker, CI/CD, infrastructure. Security first.',
- 'task': 'Write a multi-stage Dockerfile for a Node.js Next.js app. Include: non-root user, health check, security scan, .dockerignore best practices.',
- 'expected': ['FROM node', 'USER', 'HEALTHCHECK', 'multi-stage', '.dockerignore'],
- 'rubric': {'security': 30, 'optimization': 25, 'correctness': 25, 'completeness': 20}
- }
-}
+ system = f"You are {name}. {description}"
-def generate_task_for_agent(name, role):
- n, r = name.lower(), role.lower()
- for key, task in TASK_LIBRARY.items():
- if key in n:
- return task
- # Keyword fallback
- for key in TASK_LIBRARY:
- if key.replace('-', ' ') in r or any(kw in r for kw in key.split('-')):
- return TASK_LIBRARY[key]
+ # Build a task from real agent instructions
+ lines = body.splitlines()
+ instruction_lines = []
+ for line in lines:
+ stripped = line.strip()
+ if stripped and not stripped.startswith('#') and not stripped.startswith('---') and not stripped.startswith('|'):
+ instruction_lines.append(stripped)
+ if len(instruction_lines) >= 8:
+ break
+
+ if len(instruction_lines) >= 3:
+ task = (
+ "Based on your role definition below, respond to the following scenario as you would in production.\n\n"
+ "Your role instructions:\n" + '\n'.join(instruction_lines[:12]) +
+ "\n\nNow, given this incoming task: \"A team member has submitted a pull request with several issues."
+ " What do you do?\", provide your full response."
+ )
+ else:
+ task = f"Demonstrate your expertise as {name} in a realistic complex scenario. Provide a complete working solution."
+
+ expected = [name.replace('-', ' ')]
+ if description:
+ expected.extend(description.lower().split()[:5])
+ for line in lines:
+ l = line.strip()
+ if l.startswith('-') or l.startswith('*'):
+ expected.append(l.lstrip('-*').strip().lower())
+ expected = list(dict.fromkeys(expected))[:12]
+
+ rubric = {'relevance': 40, 'completeness': 30, 'correctness': 30}
return {
- 'system': f'You are {name}. {role}',
- 'task': f'Demonstrate your expertise as {name} in a realistic complex scenario. Provide a complete working solution.',
- 'expected': [name.replace('-', ' ')],
- 'rubric': {'relevance': 40, 'completeness': 30, 'correctness': 30}
+ 'system': system,
+ 'task': task,
+ 'expected': expected,
+ 'rubric': rubric
}
def generate_prompts():
@@ -214,7 +214,7 @@ def generate_prompts():
if not fm.get('model'):
continue
name = os.path.basename(path)[:-3]
- task = generate_task_for_agent(name, fm.get('description', ''))
+ task = generate_task_for_agent(name, fm)
if task:
conn.execute('''
INSERT INTO test_prompts (agent_name, task_type, system_prompt, user_prompt, expected_keywords, rubric)
@@ -230,91 +230,143 @@ def generate_prompts():
# OLLAMA CLIENT
# ================================================================
-def call_ollama(model_short, system_prompt, user_prompt, expected_keywords=None):
- """REAL Ollama API call via /api/chat. Returns (text, latency_ms, tokens_dict)."""
+def call_ollama(model_short, system_prompt, user_prompt, expected_keywords=None, timeout=120):
+ """Call Ollama API with retries. Returns (response_text, latency_ms, token_info_dict)."""
if USE_MOCK:
return (
"[MOCK] This is a simulated response for testing the pipeline without API calls.",
500, {"prompt": 100, "response": 200}
)
-
- model_map = {
- 'kimi-k2.6': 'kimi-k2.6',
- 'deepseek-v4-pro-max': 'deepseek-v4-pro',
- 'deepseek-v4-flash': 'deepseek-v4-flash',
- 'glm-5.1': 'glm-5.1',
- 'qwen3-coder:480b': 'qwen3-coder:480b',
- 'qwen3.5-122b': 'kimi-k2.6', # fallback to known working model
+
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {OLLAMA_KEY}",
}
- model_ollama = model_map.get(model_short, model_short)
- payload = json.dumps({
- "model": model_ollama,
+ body = json.dumps({
+ "model": model_short,
"messages": [
{"role": "system", "content": system_prompt},
- {"role": "user", "content": user_prompt}
+ {"role": "user", "content": user_prompt},
],
- "stream": False,
- "options": {"temperature": 0.3, "num_predict": 2048}
- }).encode('utf-8')
-
- headers = {"Content-Type": "application/json"}
- if OLLAMA_KEY:
- headers["Authorization"] = f"Bearer {OLLAMA_KEY}"
-
- req = request.Request(f"{OLLAMA_HOST}/api/chat",
- data=payload, headers=headers,
- method='POST')
- start = time.time()
- try:
- with request.urlopen(req, timeout=120) as resp:
- elapsed = int((time.time() - start) * 1000)
- data = json.loads(resp.read().decode('utf-8'))
- text = data.get('message', {}).get('content', '')
- return (text, elapsed,
- {"prompt": data.get('prompt_eval_count', 0),
- "response": data.get('eval_count', 0)})
- except urllib_error.HTTPError as e:
- return (f"[HTTP {e.code}: {e.reason}]", int((time.time()-start)*1000), {"prompt":0,"response":0})
- except Exception as e:
- return (f"[ERROR: {e}]", 0, {"prompt":0,"response":0})
+ "temperature": 0.2,
+ }).encode("utf-8")
+
+ url = f"{OLLAMA_HOST.rstrip('/')}/chat/completions"
+ req = request.Request(url, data=body, headers=headers, method="POST")
+
+ latency = 0
+ for attempt in range(1, 4):
+ start = time.time()
+ try:
+ with request.urlopen(req, timeout=timeout) as resp:
+ data = json.loads(resp.read().decode("utf-8"))
+ latency = int((time.time() - start) * 1000)
+ content = (
+ data.get("choices", [{}])[0].get("message", {}).get("content", "")
+ or ""
+ )
+ usage = data.get("usage", {})
+ tokens = {
+ "prompt": usage.get("prompt_tokens", 0),
+ "response": usage.get("completion_tokens", 0),
+ }
+ return content, latency, tokens
+ except urllib_error.HTTPError as e:
+ latency = int((time.time() - start) * 1000)
+ if e.code in (429, 502, 503, 504):
+ wait = 2 ** attempt
+ print(f" [retry] {model_short}: HTTP {e.code} → sleeping {wait}s (attempt {attempt}/3)")
+ time.sleep(wait)
+ continue
+ return f"[HTTP {e.code}] {e.read().decode('utf-8', 'ignore')[:200]}", latency, {}
+ except urllib_error.URLError as e:
+ latency = int((time.time() - start) * 1000)
+ wait = 2 ** attempt
+ print(f" [retry] {model_short}: {e.reason} → sleeping {wait}s (attempt {attempt}/3)")
+ time.sleep(wait)
+ continue
+ except Exception as e:
+ latency = int((time.time() - start) * 1000)
+ return f"[ERROR] {type(e).__name__}: {str(e)[:200]}", latency, {}
+
+ return "[FATAL] All retries exhausted", latency, {}
# ================================================================
# EVALUATOR
# ================================================================
def evaluate_response(response, expected_json, rubric_json):
- """Rubric-based evaluation. Returns dict."""
+ """Rubric-based evaluation. Returns dict with dimension scores mapped to rubric keys."""
expected = json.loads(expected_json) if isinstance(expected_json, str) else expected_json
rubric = json.loads(rubric_json) if isinstance(rubric_json, str) else rubric_json
resp_lower = (response or '').lower()
lines = response.strip().split('\n')
-
+
+ # 1. Keyword coverage (generic)
keyword_hits = sum(1 for kw in expected if kw.lower() in resp_lower)
- keyword_score = min(100, (keyword_hits / len(expected) * 100) if expected else 50)
-
- has_code = '```' in response or 'function' in resp_lower or 'class ' in resp_lower
- code_score = 80 if has_code else 30
-
- structure_score = min(100, len(lines) * 2) # ~50 lines = 100
-
- scores = {'keyword_coverage': round(keyword_score, 1),
- 'code_presence': code_score,
- 'structure': round(structure_score, 1)}
-
+ keyword_score = min(100, (keyword_hits / max(1, len(expected)) * 100))
+
+ # 2. Code presence
+ has_code = '```' in response or 'function' in resp_lower or 'class ' in resp_lower or 'def ' in resp_lower
+ code_score = 100 if has_code else 20
+
+ # 3. Structure (response depth)
+ structure_score = min(100, max(10, len(lines) * 2))
+
+ # 4. Actionability (does it suggest fixes/actions?)
+ actionability = 0
+ if any(w in resp_lower for w in ['fix', 'suggest', 'recommend', 'should', 'refactor', 'replace']):
+ actionability = 85
+ elif any(w in resp_lower for w in ['use', 'add', 'remove', 'change', 'improve', 'consider']):
+ actionability = 60
+
+ # 5. Depth (content length, capped)
+ depth = min(100, len(response) / 40)
+
+ # 6. Relevance (does response mention role-specific terms?)
+ relevance = min(100, keyword_score * 0.8 + 20)
+
+ # Map rubrics to actual computed scores via heuristics
+ generic_scores = {
+ 'keyword_coverage': round(keyword_score, 1),
+ 'code_presence': code_score,
+ 'structure': round(structure_score, 1),
+ 'actionability': round(actionability, 1),
+ 'depth': round(depth, 1),
+ 'relevance': round(relevance, 1),
+ # Rubric-specific mappings (fallback chain)
+ 'security': max(keyword_score, code_score, actionability) if any(k in resp_lower for k in ['sql', 'inject', 'xss', 'csrf']) else round(keyword_score * 0.7, 1),
+ 'logic': round(structure_score * 0.8, 1),
+ 'correctness': round((code_score + keyword_score) / 2, 1),
+ 'completeness': round((keyword_score + structure_score) / 2, 1),
+ 'thoroughness': round((keyword_score + depth) / 2, 1),
+ 'clarity': round(structure_score * 0.9, 1),
+ 'coverage': keyword_score,
+ 'edge_cases': round((keyword_score + depth) / 2, 1),
+ 'readability': round(structure_score * 0.85, 1),
+ 'mocking': code_score if 'mock' in resp_lower else round(code_score * 0.5, 1),
+ 'plan_quality': round((keyword_score + structure_score) / 2, 1),
+ 'agent_selection': keyword_score,
+ 'risk_handling': actionability,
+ 'budget_awareness': keyword_score,
+ 'scalability': round(structure_score * 0.7, 1),
+ 'optimization': actionability,
+ }
+
total = 0
if rubric:
for dim, weight in rubric.items():
- dim_score = scores.get(dim, keyword_score)
+ dim_score = generic_scores.get(dim, 50)
total += (dim_score / 100) * weight
else:
- total = sum(scores.values()) / len(scores)
-
+ total = sum(generic_scores.values()) / len(generic_scores)
+
explanation = (f"Keywords: {keyword_hits}/{len(expected)}. "
f"Lines: {len(lines)}. "
f"Code: {'YES' if has_code else 'NO'}. "
f"Total={round(total, 1)}")
-
- return {'scores': scores, 'total': round(total, 1), 'explanation': explanation}
+
+ return {'scores': generic_scores, 'total': round(total, 1), 'explanation': explanation}
# ================================================================
# PARALLEL BATCH EVALUATION
@@ -324,36 +376,108 @@ def evaluate_one(args):
agent_name, model, pid, system, user, expected, rubric = args
resp, latency, tokens = call_ollama(model, system, user, expected)
ev = evaluate_response(resp, expected, rubric)
+ is_error = not resp or resp.startswith('[')
return {
'agent': agent_name, 'model': model, 'prompt_id': pid,
'response': resp, 'latency': latency, 'tokens': tokens,
'total': ev['total'], 'scores': json.dumps(ev['scores']),
- 'explanation': ev['explanation']
+ 'explanation': ev['explanation'], 'is_error': is_error
}
-def evaluate_all(models_to_test, max_workers=4):
- """Evaluate all agents × all models with parallel workers."""
+def _should_skip(agent_name, model):
+ """Check if we already have a non-error evaluation for this agent × model."""
conn = sqlite3.connect(DB_PATH)
- agents = conn.execute("SELECT DISTINCT name FROM agents").fetchall()
+ row = conn.execute('''
+ SELECT total_score FROM evaluations
+ WHERE agent_name = ? AND model = ? AND response IS NOT NULL
+ AND response NOT LIKE '[%' AND LENGTH(response) > 0
+ LIMIT 1''', (agent_name, model)).fetchone()
+ conn.close()
+ return row[0] if row else None
+
+
+def evaluate_single(agent_name, model, conn=None):
+ """Evaluate one agent × model. Reuses optional open connection."""
+ close_conn = False
+ if conn is None:
+ conn = sqlite3.connect(DB_PATH)
+ close_conn = True
+
+ prompts = conn.execute('''
+ SELECT id, system_prompt, user_prompt, expected_keywords, rubric
+ FROM test_prompts WHERE agent_name = ?
+ ''', (agent_name,)).fetchall()
+ if close_conn:
+ conn.close()
+
+ results = []
+ for pid, sys, usr, exp, rub in prompts:
+ res = evaluate_one((agent_name, model, pid, sys, usr, exp, rub))
+ if res.get('is_error'):
+ print(f" [SKIP] {agent_name} × {model}: error response — {res['response'][:200]}")
+ continue
+ conn = sqlite3.connect(DB_PATH)
+ conn.execute('''INSERT INTO evaluations
+ (agent_name, model, prompt_id, response, latency_ms, tokens_prompt, tokens_response,
+ scores, total_score, explanation, evaluated_at, evaluator)
+ VALUES (?,?,?,?,?,?,?,?,?,?,?,?)''',
+ (res['agent'], res['model'], res['prompt_id'], res['response'], res['latency'],
+ res['tokens']['prompt'], res['tokens']['response'],
+ res['scores'], res['total'], res['explanation'],
+ datetime.now(timezone.utc).isoformat(), 'rubric_v1'))
+ conn.commit()
+ conn.close()
+ print(f" [{res['agent']}] × [{res['model']}] score={res['total']:.1f}")
+ results.append(res)
+ return results
+
+
+def evaluate_all(models_to_test, max_workers=4, agent_filter=None):
+ """Evaluate agents × models with parallel workers.
+
+ Args:
+ models_to_test: list of model name strings (e.g. ['kimi-k2.6', 'glm-5.1'])
+ max_workers: thread pool size
+ agent_filter: optional agent name to limit evaluation to one agent
+ """
+ if isinstance(models_to_test, dict):
+ print("[error] evaluate_all received a dict instead of a list. "
+ "Use --evaluate-all --models m1,m2 for all agents, or pass a list.")
+ sys.exit(1)
+
+ conn = sqlite3.connect(DB_PATH)
+ if agent_filter:
+ agents = [(agent_filter,)]
+ else:
+ agents = conn.execute("SELECT DISTINCT name FROM agents").fetchall()
tasks = []
-
+
for (agent_name,) in agents:
+ for model in models_to_test:
+ existing = _should_skip(agent_name, model)
+ if existing is not None:
+ print(f" Already evaluated: {agent_name} × {model} = {existing:.1f} (skipping)")
+ continue
prompts = conn.execute('''
SELECT id, system_prompt, user_prompt, expected_keywords, rubric
FROM test_prompts WHERE agent_name = ?''', (agent_name,)).fetchall()
for pid, sys, usr, exp, rub in prompts:
for model in models_to_test:
- tasks.append((agent_name, model, pid, sys, usr, exp, rub))
-
+ if _should_skip(agent_name, model) is None:
+ tasks.append((agent_name, model, pid, sys, usr, exp, rub))
+
conn.close()
-
+
print(f"[eval] Prepared {len(tasks)} evaluations (agents × models × prompts)")
-
+
results = []
with ThreadPoolExecutor(max_workers=max_workers) as ex:
futures = {ex.submit(evaluate_one, t): t for t in tasks}
for future in as_completed(futures):
res = future.result()
+ if res.get('is_error'):
+ print(f" [SKIP] {res['agent']} × {res['model']}: error response — {res['response'][:200]}")
+ continue
results.append(res)
conn = sqlite3.connect(DB_PATH)
conn.execute('''INSERT INTO evaluations
@@ -363,11 +487,11 @@ def evaluate_all(models_to_test, max_workers=4):
(res['agent'], res['model'], res['prompt_id'], res['response'], res['latency'],
res['tokens']['prompt'], res['tokens']['response'],
res['scores'], res['total'], res['explanation'],
- datetime.now(timezone.utc).isoformat(), 'rubric_v1'))
+ datetime.now(timezone.utc).isoformat(), 'rubric_v1'))
conn.commit()
conn.close()
print(f" [{res['agent']}] × [{res['model']}] score={res['total']:.1f}")
-
+
print(f"[eval] Stored {len(results)} evaluations")
compute_aggregates()
@@ -485,7 +609,10 @@ def generate_report():
'fit_scores': fit_scores
}
- out = 'agent-evolution/data/real-fit-report.json'
+ out = os.environ.get('REPORT_PATH', 'agent-evolution/data/real-fit-report.json')
+ out_dir = os.path.dirname(out)
+ if out_dir:
+ os.makedirs(out_dir, exist_ok=True)
with open(out, 'w') as f:
json.dump(report, f, ensure_ascii=False, indent=2)
@@ -498,7 +625,8 @@ def generate_report():
# ================================================================
def import_from_evolution():
- with open('agent-evolution/data/evolution.json') as f:
+ evo_path = os.environ.get('EVOLUTION_PATH', 'agent-evolution/data/evolution.json')
+ with open(evo_path) as f:
evo = json.load(f)
conn = sqlite3.connect(DB_PATH)
for name, a in evo['agents'].items():
@@ -546,7 +674,12 @@ if __name__ == '__main__':
generate_prompts()
if args.evaluate:
models = args.models.split(',')
- evaluate_all({args.evaluate: models}, args.workers)
+ for model in models:
+ existing = _should_skip(args.evaluate, model)
+ if existing is not None:
+ print(f"Already evaluated: {args.evaluate} x {model} = {existing:.1f} (skipping)")
+ continue
+ evaluate_single(args.evaluate, model)
if args.evaluate_all:
models = args.models.split(',')
evaluate_all(models, args.workers)
@@ -559,7 +692,7 @@ if __name__ == '__main__':
p.print_help()
print("\n=== Workflow ===")
print(" python3 real-fit-engine.py --init-db --import-evolution --generate-prompts")
- print(" python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro-max")
+ print(" python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro")
print(" python3 real-fit-engine.py --report")
print(" python3 real-fit-engine.py --recalc --agent lead-developer --old-model qwen3-coder:480b --new-model kimi-k2.6")
print("\nSet OLLAMA_MOCK=0 for real Ollama API (port 11434)")
diff --git a/scripts/real-fit-recalc.py b/scripts/real-fit-recalc.py
deleted file mode 100644
index 8962fa6..0000000
--- a/scripts/real-fit-recalc.py
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/env python3
-"""
-Recalculate real-fit scores from stored responses in SQLite.
-No API needed. Updates evaluations, fit_scores, and generates report.
-Usage: python3 scripts/real-fit-recalc.py
-"""
-import sqlite3, json, os, sys
-from datetime import datetime, timezone
-
-DB_PATH = "agent-evolution/data/real-fit.db"
-REPORT_PATH = "agent-evolution/data/real-fit-report.json"
-
-
-def evaluate_response(response, expected_json, rubric_json):
- expected = json.loads(expected_json) if isinstance(expected_json, str) else expected_json
- rubric = json.loads(rubric_json) if isinstance(rubric_json, str) else rubric_json
- resp_lower = (response or '').lower()
- lines = response.strip().split('\n')
-
- keyword_hits = sum(1 for kw in expected if kw.lower() in resp_lower)
- keyword_score = min(100, (keyword_hits / len(expected) * 100) if expected else 50)
-
- has_code = '```' in response or 'function' in resp_lower or 'class ' in resp_lower
- code_score = 80 if has_code else 30
-
- structure_score = min(100, len(lines) * 2)
-
- scores = {'keyword_coverage': round(keyword_score, 1),
- 'code_presence': code_score,
- 'structure': round(structure_score, 1)}
-
- total = 0
- if rubric:
- for dim, weight in rubric.items():
- dim_score = scores.get(dim, keyword_score)
- total += (dim_score / 100) * weight
- else:
- total = sum(scores.values()) / len(scores)
-
- explanation = (f"Keywords: {keyword_hits}/{len(expected)}. "
- f"Lines: {len(lines)}. "
- f"Code: {'YES' if has_code else 'NO'}. "
- f"Total={round(total, 1)}")
-
- return {'scores': scores, 'total': round(total, 1), 'explanation': explanation}
-
-
-def recalc():
- if not os.path.exists(DB_PATH):
- print(f"[error] Database not found: {DB_PATH}")
- sys.exit(1)
-
- conn = sqlite3.connect(DB_PATH)
- c = conn.cursor()
-
- # Fetch all evaluations with prompt data resolved by agent_name (prompt_id mismatch safe)
- c.execute('''SELECT e.id, e.agent_name, e.response, e.total_score, e.scores, e.explanation,
- t.expected_keywords, t.rubric
- FROM evaluations e
- LEFT JOIN test_prompts t ON e.agent_name = t.agent_name''')
- rows = c.fetchall()
- print(f"[recalc] Found {len(rows)} evaluations")
-
- updated = 0
- for eid, agent_name, response, old_total, old_scores, old_exp, expected, rubric in rows:
- if expected is None or rubric is None:
- print(f" [skip] No prompt match for eval {eid} (agent={agent_name})")
- continue
-
- ev = evaluate_response(response, expected, rubric)
-
- new_scores = json.dumps(ev['scores'])
- new_total = ev['total']
- new_exp = ev['explanation']
-
- c.execute('''UPDATE evaluations
- SET total_score = ?, scores = ?, explanation = ?
- WHERE id = ?''',
- (new_total, new_scores, new_exp, eid))
- updated += 1
-
- conn.commit()
- print(f"[recalc] Updated {updated} evaluations")
-
- # Compute aggregates
- c.execute('''SELECT agent_name, model, AVG(total_score) as avg_score
- FROM evaluations GROUP BY agent_name, model''')
- rows = c.fetchall()
-
- best = {}
- for a, m, s in rows:
- if a not in best or s > best[a][1]:
- best[a] = (m, s)
-
- for a, (m, s) in best.items():
- c.execute('SELECT scores FROM evaluations WHERE agent_name = ? AND model = ?', (a, m))
- dims = c.fetchall()
- dim_avg = {}
- for (score_json,) in dims:
- for k, v in json.loads(score_json).items():
- dim_avg[k] = dim_avg.get(k, 0) + v
- dim_avg = {k: round(v / len(dims), 1) for k, v in dim_avg.items()}
-
- explanation = f"Best model for {a} is {m} with avg score {round(s,1)}. "
- explanation += f"Strongest dimension: {max(dim_avg, key=dim_avg.get)}."
-
- c.execute('''INSERT OR REPLACE INTO fit_scores
- (agent_name, model, fit_score, dimension_scores, explanation, evaluated_at)
- VALUES (?, ?, ?, ?, ?, ?)''',
- (a, m, round(s, 1), json.dumps(dim_avg), explanation,
- datetime.now(timezone.utc).isoformat()))
-
- conn.commit()
- print(f"[recalc] Computed fit scores for {len(best)} agents")
-
- # Generate report
- c.execute('''SELECT agent_name, model, AVG(total_score) as avg_score, COUNT(*) as cnt
- FROM evaluations GROUP BY agent_name, model''')
- rows = c.fetchall()
-
- agents = {}
- for a, m, s, cnt in rows:
- if a not in agents:
- c.execute('SELECT description, category, current_model FROM agents WHERE name = ?', (a,))
- info = c.fetchone()
- agents[a] = {'name': a, 'evaluations': {}, 'info': info or ()}
- agents[a]['evaluations'][m] = round(s, 1)
-
- for a in agents:
- evs = agents[a]['evaluations']
- best_m = max(evs, key=evs.get)
- agents[a]['best_model'] = best_m
- agents[a]['best_score'] = evs[best_m]
-
- c.execute('SELECT agent_name, model, fit_score, explanation FROM fit_scores')
- fit_scores = {}
- for a, m, s, e in c.fetchall():
- fit_scores[a] = {'model': m, 'fit': s, 'explanation': e}
-
- report = {
- 'generated': datetime.now(timezone.utc).isoformat(),
- 'source': 'real-fit-engine',
- 'total_evaluations': len(rows),
- 'agents': agents,
- 'fit_scores': fit_scores
- }
-
- os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True)
- with open(REPORT_PATH, 'w') as f:
- json.dump(report, f, ensure_ascii=False, indent=2)
-
- print(f"[recalc] Written {REPORT_PATH}: {len(agents)} agents, {len(rows)} evaluations")
- conn.close()
-
-
-if __name__ == '__main__':
- recalc()
diff --git a/scripts/run-focused-eval.py b/scripts/run-focused-eval.py
new file mode 100644
index 0000000..085cdf0
--- /dev/null
+++ b/scripts/run-focused-eval.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Focused Real-Fit Eval Runner v2
+Evaluates key agents × models using real-fit-engine.py (the fixed version).
+"""
+import sqlite3, json, os, sys, importlib.util
+from datetime import datetime, timezone
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+os.environ.setdefault("OLLAMA_KEY", "feaa56e2dff045af989346ca74cb33a6.xzJ-plOVSgTL1FbmL8PZZ3Wx")
+os.environ.setdefault("OLLAMA_HOST", "https://ollama.com/v1")
+
+# Import the dash-named real-fit-engine.py via importlib
+_spec = importlib.util.spec_from_file_location("rfe", os.path.join(os.path.dirname(__file__), "real-fit-engine.py"))
+_rfe = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_rfe)
+
+call_ollama = _rfe.call_ollama
+evaluate_response = _rfe.evaluate_response
+compute_aggregates = _rfe.compute_aggregates
+generate_report = _rfe.generate_report
+DB_PATH = _rfe.DB_PATH
+
+AGENTS = [  # agent names whose rows are read from test_prompts and whose old results are purged in __main__
+    'code-skeptic',
+    'lead-developer',
+    'system-analyst',
+    'sdet-engineer',
+    'orchestrator',
+    'devops-engineer',
+    'workflow-cross-checker',
+]
+
+MODELS = [  # model identifiers; every fetched prompt is sent to each of them via call_ollama
+    'kimi-k2.6',
+    'deepseek-v4-pro-max',
+    'qwen3-coder:480b',
+    'glm-5.1',
+]
+
+def fetch_agent_tasks():
+ conn = sqlite3.connect(DB_PATH)
+ placeholders = ','.join('?' * len(AGENTS))
+ rows = conn.execute(f"""
+ SELECT id, agent_name, system_prompt, user_prompt, expected_keywords, rubric
+ FROM test_prompts WHERE agent_name IN ({placeholders})
+ """, tuple(AGENTS)).fetchall()
+ conn.close()
+ return rows
+
+def eval_single(agent_name, model, prompt_id, system, user, expected_json, rubric_json):
+ resp, latency, tokens = call_ollama(model, system, user)
+ ev = evaluate_response(resp, expected_json, rubric_json)
+ return {
+ 'agent': agent_name,
+ 'model': model,
+ 'prompt_id': prompt_id,
+ 'response_text': resp[:3000],
+ 'latency_ms': latency,
+ 'tokens_prompt': tokens['prompt'],
+ 'tokens_response': tokens['response'],
+ 'total_score': ev['total'],
+ 'scores_json': json.dumps(ev['scores']),
+ 'explanation': ev['explanation'],
+ 'evaluated_at': datetime.now(timezone.utc).isoformat(),
+ 'evaluator': 'rubric_v2'
+ }
+
+def save_single(res):
+ conn = sqlite3.connect(DB_PATH)
+ conn.execute("""
+ INSERT INTO evaluations
+ (agent_name, model, prompt_id, response, latency_ms, tokens_prompt, tokens_response,
+ scores, total_score, explanation, evaluated_at, evaluator)
+ VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
+ """, (res['agent'], res['model'], res['prompt_id'], res['response_text'], res['latency_ms'],
+ res['tokens_prompt'], res['tokens_response'],
+ res['scores_json'], res['total_score'], res['explanation'],
+ res['evaluated_at'], res['evaluator']))
+ conn.commit()
+ conn.close()
+ print(f" [{res['agent']}] x [{res['model']}] score={res['total_score']:.1f} lat={res['latency_ms']}ms len={len(res['response_text'])}")
+
+def run_focused_eval(max_workers=4):
+ tasks = fetch_agent_tasks()
+ print(f"[focused] Agents: {len(AGENTS)} | Models: {len(MODELS)} | Prompts: {len(tasks)}")
+ print(f"[focused] Total evaluations: {len(tasks) * len(MODELS)}")
+
+ work_items = []
+ for pid, aname, system, user, exp_json, rub_json in tasks:
+ for model in MODELS:
+ work_items.append((aname, model, pid, system, user, exp_json, rub_json))
+
+ completed = 0
+ errors = 0
+ with ThreadPoolExecutor(max_workers=max_workers) as ex:
+ futures = {ex.submit(eval_single, *w): w for w in work_items}
+ for future in as_completed(futures):
+ try:
+ res = future.result()
+ save_single(res)
+ completed += 1
+ if completed % 4 == 0:
+ print(f"[focused] Progress: {completed}/{len(work_items)}")
+ except Exception as e:
+ import traceback
+ traceback.print_exc()
+ errors += 1
+
+ print(f"[focused] Completed {completed}/{len(work_items)} (errs={errors})")
+ compute_aggregates()
+
+if __name__ == '__main__':
+ print("="*60)
+ print("FOCUSED REAL-FIT EVALUATION v2")
+ print(f"Models: {', '.join(MODELS)}")
+ print(f"Agents: {', '.join(AGENTS)}")
+ print(f"API: {os.environ['OLLAMA_HOST']}")
+ print("="*60)
+
+ # Clean old evaluations
+ conn = sqlite3.connect(DB_PATH)
+ conn.execute("DELETE FROM evaluations WHERE agent_name IN ({})".format(','.join(f"'{a}'" for a in AGENTS)))
+ conn.execute("DELETE FROM fit_scores WHERE agent_name IN ({})".format(','.join(f"'{a}'" for a in AGENTS)))
+ conn.commit()
+ conn.close()
+ print("[focused] Cleaned old evaluations")
+
+ run_focused_eval(max_workers=4)
+ report = generate_report()
+ print(f"[focused] Report generated with {len(report.get('agents',{}))} agents")
diff --git a/scripts/test_ollama_minimal.py b/scripts/test_ollama_minimal.py
new file mode 100644
index 0000000..e3d2d1e
--- /dev/null
+++ b/scripts/test_ollama_minimal.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+import urllib.request, json, os, time
+
+def call_ollama_real(model_short, system_prompt, user_prompt):
+ key = os.environ.get("OLLAMA_KEY", "")
+ host = "https://ollama.com/v1"
+
+ payload = json.dumps({
+ "model": model_short,
+ "messages": [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": user_prompt}
+ ],
+ "temperature": 0.3,
+ "max_tokens": 2048
+ }).encode()
+
+ req = urllib.request.Request(
+ f"{host}/chat/completions",
+ data=payload,
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {key}" if key else "Bearer",
+ "User-Agent": "Mozilla/5.0"
+ },
+ method="POST"
+ )
+
+ start = time.time()
+ try:
+ with urllib.request.urlopen(req, timeout=120) as resp:
+ data = json.loads(resp.read().decode())
+ text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
+ usage = data.get("usage", {})
+ elapsed = int((time.time() - start) * 1000)
+ print(f"Status: {resp.status}")
+ print(f"Latency: {elapsed}ms")
+ print(f"Tokens: prompt={usage.get('prompt_tokens')}, completion={usage.get('completion_tokens')}")
+ return text
+ except urllib.error.HTTPError as e:
+ body = e.read().decode()[:200]
+ print(f"HTTP Error: {e.code} {e.reason}")
+ print(f"Body: {body}")
+ return ""
+ except Exception as e:
+ print(f"Error: {e}")
+ return ""
+
+if __name__ == "__main__":
+ print("=== Test real Ollama API ===")
+ text = call_ollama_real(
+ "kimi-k2.6",
+ "You are a code reviewer. Find bugs.",
+ "Review: def f(x): return x+1"
+ )
+ print(f"\nResponse (first 300 chars):\n{text[:300]}")
+ print(f"\nTotal length: {len(text)} chars")
+ print(f"Keyword 'naming' in response: {'naming' in text.lower()}")
+ print(f"Keyword 'return' in response: {'return' in text.lower()}")
diff --git a/scripts/test_real_api.py b/scripts/test_real_api.py
new file mode 100644
index 0000000..ab1cfce
--- /dev/null
+++ b/scripts/test_real_api.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+import sys, os
+os.environ.setdefault("OLLAMA_KEY", "feaa56e2dff045af989346ca74cb33a6.xzJ-plOVSgTL1FbmL8PZZ3Wx")
+os.environ.setdefault("OLLAMA_HOST", "https://api.ollama.com")
+
+sys.path.insert(0, "scripts")
+from real_fit_engine import call_ollama, evaluate_response, init_db, import_from_evolution, generate_prompts
+import sqlite3
+
+init_db()
+import_from_evolution()
+generate_prompts()
+
+conn = sqlite3.connect("agent-evolution/data/real-fit.db")
+row = conn.execute("SELECT system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts WHERE agent_name = ?", ("code-skeptic",)).fetchone()
+conn.close()
+
+if row:
+ system, user, expected, rubric = row
+ print("=== REAL Ollama: code-skeptic x kimi-k2.6 ===")
+ resp, latency, tokens = call_ollama("kimi-k2.6", system, user, expected)
+ print(f"Latency: {latency}ms")
+ print(f"Tokens: {tokens}")
+ print("Response (first 300 chars):")
+ print(resp[:300])
+ print("\n...")
+ ev = evaluate_response(resp, expected, rubric)
+ print(f"Score: {ev['total']:.1f}")
+ print(f"Explanation: {ev['explanation']}")
+else:
+ print("No prompt found for code-skeptic")
diff --git a/tests/scripts/capture-analytics-section.js b/tests/scripts/capture-analytics-section.js
new file mode 100644
index 0000000..c37b554
--- /dev/null
+++ b/tests/scripts/capture-analytics-section.js
@@ -0,0 +1,89 @@
+#!/usr/bin/env node
+/**
+ * Quick capture + element check for Analytics Hierarchy Section
+ */
+
+const { chromium } = require('playwright');
+const fs = require('fs');
+const path = require('path');
+
+const TARGET_URL = process.env.TARGET_URL || 'http://localhost:3002';
+const OUTPUT_DIR = process.env.OUTPUT_DIR || '/app/tests/visual/current';
+
+(async () => {
+ if (!fs.existsSync(OUTPUT_DIR)) {
+ fs.mkdirSync(OUTPUT_DIR, { recursive: true });
+ }
+
+ const browser = await chromium.launch({
+ headless: true,
+ args: ['--disable-setuid-sandbox', '--no-sandbox'],
+ });
+
+ const page = await browser.newPage({
+ viewport: { width: 1280, height: 900 },
+ });
+
+ console.log(`Navigating to: ${TARGET_URL}`);
+ await page.goto(TARGET_URL, { waitUntil: 'networkidle', timeout: 60000 });
+ await page.waitForTimeout(3000);
+
+ // Scroll to "Аналитическая иерархия"
+ const heading = page.locator('text=Аналитическая иерархия').first();
+ if (await heading.isVisible().catch(() => false)) {
+ console.log('Scrolling to Аналитическая иерархия section...');
+ await heading.scrollIntoViewIfNeeded();
+ await page.evaluate(() => window.scrollBy(0, -60));
+ await page.waitForTimeout(1500);
+ } else {
+ console.log('Heading not found, fallback scroll');
+ await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight / 3));
+ await page.waitForTimeout(1500);
+ }
+
+ // Scroll further down to reveal cards 3 and 4 (heatmap, commands table)
+ await page.evaluate(() => window.scrollBy(0, 900));
+ await page.waitForTimeout(1000);
+
+ const screenshotPath = path.join(OUTPUT_DIR, 'analytics_section.png');
+ await page.screenshot({ path: screenshotPath, fullPage: false });
+ console.log(`Screenshot saved to: ${screenshotPath}`);
+
+ // Check for each card's evidence (use Russian text as it appears in the page)
+ const checks = [
+ { label: 'Model tree with collapsible categories', text: 'Модели → Категории → Агенты' },
+ { label: 'Category bars', text: 'Дистрибуция по категориям' },
+ { label: 'Fit-score heatmap', text: 'Fit-score распределение' },
+ { label: 'Commands table', text: 'Команды' },
+ ];
+
+ const results = { visible: {}, issues: [] };
+
+ for (const c of checks) {
+ const found = await page.locator(`text=${c.text}`).first().isVisible({ timeout: 3000 }).catch(() => false);
+ if (found) {
+ const textContent = await page.locator(`text=${c.text}`).first().textContent({ timeout: 3000 }).catch(() => '');
+ results.visible[c.label] = textContent;
+ } else {
+ results.issues.push(`${c.label} (searching text "${c.text}") — NOT FOUND`);
+ }
+ }
+
+ const reportPath = path.join(OUTPUT_DIR, 'analytics_section_report.json');
+ fs.writeFileSync(reportPath, JSON.stringify(results, null, 2));
+ console.log(`Report saved to: ${reportPath}`);
+
+ // Also write summary to stdout
+ console.log('\n=== Scan Results ===');
+ if (Object.keys(results.visible).length === 4) {
+ console.log('All 4 analytics cards are visible.');
+ } else {
+ console.log(`Visible: ${Object.keys(results.visible).join(', ')}`);
+ console.log(`Missing: ${results.issues.join(', ')}`);
+ }
+
+ await browser.close();
+})().catch((err) => {
+ console.error('Fatal error:', err);
+ process.exit(1);
+});
diff --git a/tests/scripts/e2e-landing-test.js b/tests/scripts/e2e-landing-test.js
new file mode 100644
index 0000000..7362ac0
--- /dev/null
+++ b/tests/scripts/e2e-landing-test.js
@@ -0,0 +1,381 @@
+#!/usr/bin/env node
+/**
+ * E2E Test Suite for APAW Landing Page
+ * Tests: page load, console errors, API state, analytics, heatmap modal,
+ * close interactions, visual regression.
+ *
+ * Usage: node e2e-landing-test.js
+ * Environment: TARGET_URL (default http://host.docker.internal:3002)
+ */
+
+const { chromium } = require('playwright');
+const fs = require('fs');
+const path = require('path');
+const pixelmatch = require('pixelmatch');
+const { PNG } = require('pngjs');
+const { launchBrowser, newContext, navigateTo } = require('./lib/browser-launcher');
+
+const TARGET_URL = process.env.TARGET_URL || 'http://host.docker.internal:3002';
+const REPORTS_DIR = process.env.REPORTS_DIR || path.join(__dirname, '..', 'reports');
+const BASELINE_DIR = process.env.BASELINE_DIR || path.join(__dirname, '..', 'visual', 'baseline');
+const CURRENT_DIR = process.env.CURRENT_DIR || path.join(__dirname, '..', 'visual', 'current');
+
+const VIEWPORT = { width: 1280, height: 900 };
+
+async function main() {
+ console.log('═══════════════════════════════════════════════════');
+ console.log(' APAW Landing E2E Tests');
+ console.log('═══════════════════════════════════════════════════\n');
+ console.log(`Target: ${TARGET_URL}\n`);
+
+ for (const dir of [REPORTS_DIR, CURRENT_DIR]) {
+ if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
+ }
+
+ const browser = await launchBrowser();
+ const context = await newContext(browser, { viewport: VIEWPORT });
+ const page = await context.newPage();
+
+ const consoleErrors = [];
+ const consoleWarnings = [];
+ const networkErrors = [];
+ let networkRequests = [];
+
+ const results = [];
+
+ page.on('console', msg => {
+ if (msg.type() === 'error') consoleErrors.push(msg.text());
+ else if (msg.type() === 'warning') consoleWarnings.push(msg.text());
+ });
+
+ page.on('requestfailed', request => {
+ networkErrors.push({ url: request.url(), failure: request.failure()?.errorText || 'Unknown' });
+ });
+
+ page.on('response', response => {
+ if (response.status() >= 400) {
+ networkErrors.push({ url: response.url(), status: response.status() });
+ }
+ });
+
+ // ============================================================
+ // Test 1: Page loads without console errors
+ // ============================================================
+ {
+ console.log('┌─────────────────────────────────────────────────┐');
+ console.log('│ Test 1: Page loads without console errors │');
+ console.log('└─────────────────────────────────────────────────┘');
+
+ try {
+ const response = await navigateTo(page, `${TARGET_URL}`, { waitUntil: 'commit', timeout: 30000, delay: 3000 });
+ const status = response?.status() || 0;
+
+ // Wait for analytics section to be present in DOM
+ await page.waitForSelector('#analytics', { timeout: 10000 }).catch(() => {});
+
+ const title = await page.title();
+ const pageLoaded = status === 200 && title.includes('APAW');
+
+ result(results, '1_page_load', pageLoaded && consoleErrors.length === 0, `HTTP ${status}, title: "${title}", console errors: ${consoleErrors.length}`);
+ } catch (e) {
+ result(results, '1_page_load', false, e.message);
+ }
+ }
+
+ // ============================================================
+ // Test 2: /api/state loads successfully
+ // ============================================================
+ {
+ console.log('\n┌─────────────────────────────────────────────────┐');
+ console.log('│ Test 2: /api/state loads successfully │');
+ console.log('└─────────────────────────────────────────────────┘');
+
+ try {
+ const apiResponse = await page.evaluate(async (url) => {
+ const res = await fetch(`${url}/api/state`);
+ const data = await res.json().catch(() => null);
+ return { status: res.status, ok: res.ok, hasAgents: !!(data && Array.isArray(data.agents) && data.agents.length > 0) };
+ }, TARGET_URL);
+
+ result(results, '2_api_state', apiResponse.ok && apiResponse.hasAgents,
+ `status=${apiResponse.status}, hasAgents=${apiResponse.hasAgents}`);
+ } catch (e) {
+ result(results, '2_api_state', false, e.message);
+ }
+ }
+
+ // ============================================================
+ // Test 3: #analytics section is visible with heatmap rendered
+ // ============================================================
+ {
+ console.log('\n┌─────────────────────────────────────────────────┐');
+ console.log('│ Test 3: #analytics visible + heatmap rendered │');
+ console.log('└─────────────────────────────────────────────────┘');
+
+ try {
+ const analytics = await page.locator('#analytics').first();
+ const isVisible = await analytics.isVisible().catch(() => false);
+
+ // Scroll to analytics
+ await page.evaluate(() => {
+ const el = document.getElementById('analytics');
+ if (el) el.scrollIntoView({ block: 'start' });
+ });
+ await page.waitForTimeout(800);
+
+ const heatmap = await page.locator('#fit-heatmap').first();
+ const heatmapVisible = await heatmap.isVisible().catch(() => false);
+ const cellCount = await heatmap.locator('.heatmap__cell').count().catch(() => 0);
+
+ result(results, '3_analytics_heatmap', isVisible && heatmapVisible && cellCount > 0,
+ `analytics visible=${isVisible}, heatmap visible=${heatmapVisible}, cells=${cellCount}`);
+ } catch (e) {
+ result(results, '3_analytics_heatmap', false, e.message);
+ }
+ }
+
+ // ============================================================
+ // Test 4: Clicking a heatmap cell opens #fit-modal
+ // ============================================================
+ {
+ console.log('\n┌─────────────────────────────────────────────────┐');
+ console.log('│ Test 4: Clicking heatmap cell opens #fit-modal │');
+ console.log('└─────────────────────────────────────────────────┘');
+
+ try {
+ const cell = await page.locator('#fit-heatmap .heatmap__cell').first();
+ await cell.scrollIntoViewIfNeeded();
+ await page.waitForTimeout(500);
+ await cell.click();
+ await page.waitForTimeout(500);
+
+ const modal = await page.locator('#fit-modal').first();
+ const modalVisible = await modal.isVisible().catch(() => false);
+ const isOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => false);
+
+ result(results, '4_click_opens_modal', modalVisible && isOpen,
+ `modal visible=${modalVisible}, class is-open=${isOpen}`);
+ } catch (e) {
+ result(results, '4_click_opens_modal', false, e.message);
+ }
+ }
+
+ // ============================================================
+ // Test 5: Modal displays agent name, model, fit score,
+ // breakdown dimensions, and explanation
+ // ============================================================
+ {
+ console.log('\n┌─────────────────────────────────────────────────┐');
+ console.log('│ Test 5: Modal content (name, model, score, etc) │');
+ console.log('└─────────────────────────────────────────────────┘');
+
+ try {
+ const modal = await page.locator('#fit-modal').first();
+ const agentName = await modal.locator('#modal-agent-name').textContent().catch(() => '');
+ const modelText = await modal.locator('#modal-model').textContent().catch(() => '');
+ const scoreText = await modal.locator('#modal-score').textContent().catch(() => '');
+ const explanation = await modal.locator('#modal-explanation').textContent().catch(() => '');
+ const dims = await modal.locator('#modal-breakdown .modal__dimension').count().catch(() => 0);
+
+ const nameOk = agentName.trim().length > 0 && agentName !== 'Agent';
+ const modelOk = modelText.trim().length > 0;
+ const scoreOk = !isNaN(parseInt(scoreText, 10)) && parseInt(scoreText, 10) > 0;
+ const dimsOk = dims >= 4;
+ const explOk = explanation.trim().length > 0;
+
+ result(results, '5_modal_content',
+ nameOk && modelOk && scoreOk && dimsOk && explOk,
+ `name="${agentName.trim()}", model="${modelText.trim()}", score="${scoreText.trim()}", dimensions=${dims}, explanation=${explOk ? 'present' : 'missing'}`);
+ } catch (e) {
+ result(results, '5_modal_content', false, e.message);
+ }
+ }
+
+ // ============================================================
+ // Test 6: Modal can be closed via close button and Escape key
+ // ============================================================
+ {
+ const modal = await page.locator('#fit-modal').first();
+
+ // 6a: close button
+ {
+ console.log('\n┌─────────────────────────────────────────────────┐');
+ console.log('│ Test 6a: Close via close button │');
+ console.log('└─────────────────────────────────────────────────┘');
+
+ try {
+ await modal.locator('.modal__close').click();
+ await page.waitForTimeout(600);
+ // If CSS transition leaves it briefly visible, wait a tick
+ const isOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => true);
+ const visible = await modal.isVisible().catch(() => true);
+ result(results, '6_close_button', !isOpen && !visible, `is-open=${isOpen}, visible=${visible}`);
+ } catch (e) {
+ result(results, '6_close_button', false, e.message);
+ }
+ }
+
+ // 6b: Escape key
+ {
+ console.log('\n┌─────────────────────────────────────────────────┐');
+ console.log('│ Test 6b: Close via Escape key │');
+ console.log('└─────────────────────────────────────────────────┘');
+
+ try {
+ // If modal is still open (bug), force close via JS
+ const stillOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => false);
+ if (stillOpen) await page.evaluate(() => { if (typeof closeFitModal === 'function') closeFitModal(); });
+ await page.waitForTimeout(400);
+
+ const cell = await page.locator('#fit-heatmap .heatmap__cell').first();
+ await cell.evaluate(el => el.scrollIntoView({ block: 'center' }));
+ await cell.click({ force: true });
+ await page.waitForTimeout(500);
+ await page.keyboard.press('Escape');
+ await page.waitForTimeout(500);
+ const isOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => true);
+ const visible = await modal.isVisible().catch(() => true);
+ result(results, '6_escape_key', !isOpen && !visible, `is-open=${isOpen}, visible=${visible}`);
+ } catch (e) {
+ result(results, '6_escape_key', false, e.message);
+ }
+ }
+ }
+
+ // ============================================================
+ // Screenshot of opened modal
+ // ============================================================
+ {
+ console.log('\n┌─────────────────────────────────────────────────┐');
+ console.log('│ Capturing modal screenshot │');
+ console.log('└─────────────────────────────────────────────────┘');
+
+ try {
+ const cell = await page.locator('#fit-heatmap .heatmap__cell').first();
+ await cell.click();
+ await page.waitForTimeout(600);
+ const modal = await page.locator('#fit-modal').first();
+ const modalBox = await modal.boundingBox().catch(() => null);
+
+ const screenshotPath = path.join(CURRENT_DIR, 'modal_opened.png');
+ if (modalBox) {
+ await page.screenshot({ path: screenshotPath, clip: modalBox });
+ } else {
+ await page.screenshot({ path: screenshotPath });
+ }
+ console.log(` ✅ Screenshot saved: ${screenshotPath}`);
+ result(results, 'screenshot_modal', true, screenshotPath);
+ } catch (e) {
+ console.log(` ❌ Screenshot failed: ${e.message}`);
+ result(results, 'screenshot_modal', false, e.message);
+ }
+ }
+
+ // ============================================================
+ // Test 7: No visual regressions from baseline
+ // ============================================================
+ {
+ console.log('\n┌─────────────────────────────────────────────────┐');
+ console.log('│ Test 7: Visual regression (baseline vs current) │');
+ console.log('└─────────────────────────────────────────────────┘');
+
+ const baselinePath = path.join(BASELINE_DIR, 'homepage_desktop.png');
+ const currentPath = path.join(CURRENT_DIR, 'homepage_desktop.png');
+
+ // Capture current homepage for comparison
+ try {
+ await navigateTo(page, `${TARGET_URL}`, { waitUntil: 'commit', delay: 3000 });
+ await page.screenshot({ path: currentPath, fullPage: true });
+ } catch (e) {
+ console.log(` ⚠️ Could not capture current screenshot: ${e.message}`);
+ }
+
+ if (!fs.existsSync(baselinePath)) {
+ console.log(` ⚠️ Baseline not found at ${baselinePath}`);
+ result(results, '7_visual_regression', null, 'SKIP: baseline missing');
+ } else if (!fs.existsSync(currentPath)) {
+ result(results, '7_visual_regression', false, 'Current screenshot capture failed');
+ } else {
+ try {
+ const baseline = PNG.sync.read(fs.readFileSync(baselinePath));
+ const current = PNG.sync.read(fs.readFileSync(currentPath));
+
+ if (baseline.width !== current.width || baseline.height !== current.height) {
+ result(results, '7_visual_regression', false, `Size mismatch: ${baseline.width}x${baseline.height} vs ${current.width}x${current.height}`);
+ } else {
+ const diff = new PNG({ width: baseline.width, height: baseline.height });
+ const numDiff = pixelmatch(baseline.data, current.data, diff.data, baseline.width, baseline.height, { threshold: 0.1 });
+ const diffPercent = (numDiff / (baseline.width * baseline.height)) * 100;
+
+ const passed = diffPercent <= 5.0; // 5% tolerance
+ result(results, '7_visual_regression', passed, `diff pixels=${numDiff} (${diffPercent.toFixed(2)}%)`);
+
+ if (!passed) {
+ const diffPath = path.join(CURRENT_DIR, 'homepage_desktop_diff.png');
+ fs.writeFileSync(diffPath, PNG.sync.write(diff));
+ console.log(` 📸 Diff saved: ${diffPath}`);
+ }
+ }
+ } catch (e) {
+ result(results, '7_visual_regression', false, e.message);
+ }
+ }
+ }
+
+ await context.close();
+ await browser.close();
+
+ // ============================================================
+ // Summary
+ // ============================================================
+ console.log('\n═══════════════════════════════════════════════════');
+ console.log(' Results Summary');
+ console.log('═══════════════════════════════════════════════════\n');
+
+ for (const r of results) {
+ const icon = r.pass === true ? '✅' : r.pass === false ? '❌' : '⏭️';
+ console.log(`${icon} ${r.name}`);
+ console.log(` ${r.detail}`);
+ }
+
+ console.log(`\n📊 Console errors: ${consoleErrors.length}`);
+ console.log(`📊 Console warnings: ${consoleWarnings.length}`);
+ console.log(`📊 Network errors: ${networkErrors.length}`);
+
+ const failures = results.filter(r => r.pass === false);
+ const passed = results.filter(r => r.pass === true);
+ const skipped = results.filter(r => r.pass === null);
+
+ console.log(`\n✅ Passed: ${passed.length}`);
+ console.log(`❌ Failed: ${failures.length}`);
+ console.log(`⏭️ Skipped: ${skipped.length}`);
+
+ const reportPath = path.join(REPORTS_DIR, 'e2e-landing-report.json');
+ fs.writeFileSync(reportPath, JSON.stringify({
+ timestamp: new Date().toISOString(),
+ targetUrl: TARGET_URL,
+ results,
+ summary: {
+ passed: passed.length,
+ failed: failures.length,
+ skipped: skipped.length,
+ consoleErrors: consoleErrors.length,
+ consoleWarnings: consoleWarnings.length,
+ networkErrors: networkErrors.length,
+ },
+ }, null, 2));
+ console.log(`\n📄 Report: ${reportPath}`);
+
+ process.exit(failures.length > 0 ? 1 : 0);
+}
+
+function result(list, name, pass, detail) {
+ list.push({ name, pass, detail });
+ const icon = pass === true ? '✅' : pass === false ? '❌' : '⏭️';
+ console.log(` ${icon} ${name}: ${detail}`);
+}
+
+main().catch(err => { // last-resort handler for any rejection main() did not handle itself
+  console.error('Fatal:', err);
+  process.exit(1); // non-zero exit code so the caller sees the run as failed
+});
diff --git a/tests/scripts/verify-evolution-heatmap.js b/tests/scripts/verify-evolution-heatmap.js
new file mode 100644
index 0000000..1e47fb2
--- /dev/null
+++ b/tests/scripts/verify-evolution-heatmap.js
@@ -0,0 +1,79 @@
+const { chromium } = require('playwright');
+const fs = require('fs');
+const path = require('path');
+
+const TARGET = process.env.TARGET_URL || 'http://host.docker.internal:3003';
+const OUT_DIR = process.env.OUT_DIR || path.join(__dirname, '..', 'reports');
+
+(async () => {
+ if (!fs.existsSync(OUT_DIR)) fs.mkdirSync(OUT_DIR, { recursive: true });
+
+ const browser = await chromium.launch({ headless: true });
+ const context = await browser.newContext({ viewport: { width: 1600, height: 1200 } });
+ const page = await context.newPage();
+
+ // Capture console & network errors
+ const consoleErrors = [];
+ const networkErrors = [];
+ page.on('console', msg => { if (msg.type() === 'error') consoleErrors.push(msg.text()); });
+ page.on('requestfailed', req => networkErrors.push({ url: req.url(), error: req.failure()?.errorText }));
+ page.on('response', res => { if (res.status() >= 400) networkErrors.push({ url: res.url(), status: res.status() }); });
+
+ console.log('[HEATMAP] Navigating to', TARGET);
+ await page.goto(TARGET, { waitUntil: 'domcontentloaded', timeout: 30000 });
+ await page.waitForTimeout(1500); // wait for fetch/dashboard-data
+
+ const tabBtn = page.locator('button.tab-btn', { hasText: /Heatmap/ }).first();
+ if (await tabBtn.count()) {
+ await tabBtn.click();
+ console.log('[HEATMAP] Clicked Heatmap tab');
+ } else {
+ console.log('[HEATMAP] No Heatmap tab found, tabs may already be active');
+ }
+
+ await page.waitForTimeout(2000); // let table build from JS
+
+ // Get table dimensions
+ const rows = await page.locator('#hmTable tbody tr').count().catch(() => 0);
+ const colCount = await page.locator('#hmTable thead th').count().catch(() => 0);
+ console.log(`[HEATMAP] Table: ${rows} rows, ${colCount} columns`);
+
+ // Screenshot full page of heatmap tab
+ const screenshotPath = path.join(OUT_DIR, 'heatmap.png');
+ await page.screenshot({ path: screenshotPath, fullPage: true });
+ console.log('[HEATMAP] Screenshot saved to', screenshotPath);
+
+ // Also screenshot just the table if possible
+ const tableScreenshotPath = path.join(OUT_DIR, 'heatmap-table.png');
+ const tableEl = page.locator('#hmTable').first();
+ if (await tableEl.count() && rows > 0) {
+ await tableEl.screenshot({ path: tableScreenshotPath });
+ console.log('[HEATMAP] Table screenshot saved to', tableScreenshotPath);
+ }
+
+ // Read cell data
+ const cellTexts = await page.locator('#hmTable tbody td').allTextContents().catch(() => []);
+ console.log('[HEATMAP] First 30 cell texts:', cellTexts.slice(0, 30).map(t => t.trim()));
+
+ // Dump innerHTML
+ const innerHTML = await page.locator('#hmTable').innerHTML().catch(() => null);
+
+ // Report
+ const report = {
+ target: TARGET,
+ table: { rows, colCount },
+ cellSamples: cellTexts.slice(0, 30).map(t => t.trim()),
+ consoleErrors,
+ networkErrors,
+ screenshots: [screenshotPath, tableScreenshotPath].filter(f => fs.existsSync(f)),
+ innerHTML: innerHTML ? innerHTML.slice(0, 2000) : null,
+ ok: rows > 0 && colCount > 0,
+ };
+
+ const reportPath = path.join(OUT_DIR, 'heatmap-report.json');
+ fs.writeFileSync(reportPath, JSON.stringify(report, null, 2));
+ console.log('[HEATMAP] Report saved to', reportPath);
+
+ await browser.close();
+ process.exit(report.ok ? 0 : 1);
+})();