diff --git a/agent-evolution/data/real-fit.html b/agent-evolution/data/real-fit.html
new file mode 100644
index 0000000..9b1fccf
--- /dev/null
+++ b/agent-evolution/data/real-fit.html
@@ -0,0 +1,93 @@
+<!DOCTYPE html>
+<html lang="ru">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Real-Fit Matrix — Agent × Model Performance</title>
+<style>
+:root{--bg:#0a0f1a;--bg2:#0f1525;--bg3:#141c2e;--bdr:#1e2d45;--txt:#e8f1ff;--txt2:#8ba3c0;--cyan:#00d4ff;--green:#00ff94;--red:#ff4757;--orange:#ff9f43;--purple:#a855f7;}
+*{margin:0;padding:0;box-sizing:border-box}
+body{font-family:system-ui,-apple-system,sans-serif;background:var(--bg);color:var(--txt);min-height:100vh;padding:24px}
+h1{font-size:1.6rem;background:linear-gradient(90deg,var(--cyan),var(--green));-webkit-background-clip:text;-webkit-text-fill-color:transparent;margin-bottom:8px}
+.sub{color:var(--txt2);font-size:.85rem;margin-bottom:20px}
+table{width:100%;border-collapse:collapse;font-size:.82rem}
+th,td{padding:8px 10px;border:1px solid var(--bdr);text-align:center}
+th{background:var(--bg2);color:var(--txt2);font-size:.72rem;text-transform:uppercase;letter-spacing:.5px;position:sticky;top:0}
+td:first-child{text-align:left;font-weight:700;white-space:nowrap}
+td.score{font-weight:700;font-family:monospace}
+.hm-cur{box-shadow:inset 0 0 0 2px var(--cyan)}
+.high{background:rgba(0,255,148,.18);color:var(--green)}
+.good{background:rgba(0,212,255,.14);color:var(--cyan)}
+.med{background:rgba(168,85,247,.15);color:var(--purple)}
+.low{background:rgba(255,71,87,.1);color:var(--red)}
+.na{background:transparent;color:var(--txt2);font-size:.9rem}
+.legend{display:flex;gap:12px;flex-wrap:wrap;margin-top:16px;font-size:.78rem;color:var(--txt2)}
+.legend span{display:flex;align-items:center;gap:4px}
+.dot{width:14px;height:14px;border-radius:3px}
+.meta{font-size:.72rem;color:var(--txt2);margin-top:12px}
+a{color:var(--cyan);text-decoration:none}
+</style>
+</head>
+<body>
+<h1>Real-Fit Matrix</h1>
+<div class="sub">Real agent × model evaluation scores via live Ollama API (28 calls, 4 models, 7 agents)</div>
+
+<div id="matrix"></div>
+<div class="legend">
+  <span><span class="dot high"></span> 90+ Excellent</span>
+  <span><span class="dot good"></span> 75–89 Good</span>
+  <span><span class="dot med"></span> 50–74 Average</span>
+  <span><span class="dot low"></span> &lt;50 Weak</span>
+  <span style="margin-left:auto">● = assigned model</span>
+</div>
+<div class="meta">Data source: <a href="data/real-fit-report.json" target="_blank">real-fit-report.json</a> | Updated: <span id="updated"></span></div>
+
+<script>
+/** Fetch the report JSON and render the agent × model score table into #matrix. */
+async function load() {
+  // Report strings end up inside innerHTML, so escape HTML metacharacters first.
+  const esc = (v) => String(v ?? '—').replace(/[&<>"']/g, (c) => `&#${c.charCodeAt(0)};`);
+  const res = await fetch('data/real-fit-report.json');
+  if (!res.ok) throw new Error(`HTTP ${res.status} while loading real-fit-report.json`);
+  const data = await res.json();
+  document.getElementById('updated').textContent = new Date(data.generated).toLocaleString('ru-RU');
+
+  // Keep only agents that have at least one positive evaluation score.
+  const agents = Object.values(data.agents ?? {})
+    .filter((a) => Object.values(a.evaluations ?? {}).some((s) => s > 0));
+
+  // Columns: sorted union of the model names seen across the kept agents.
+  const models = new Set();
+  agents.forEach((a) => Object.keys(a.evaluations).forEach((m) => models.add(m)));
+  const modelList = Array.from(models).sort();
+
+  let html = '<table><thead><tr><th>Agent</th>';
+  modelList.forEach((m) => { html += `<th>${esc(m)}</th>`; });
+  html += '<th>Best</th><th>Score</th></tr></thead><tbody>';
+
+  agents.forEach((a) => {
+    html += `<tr><td>${esc(a.name)}</td>`;
+    modelList.forEach((m) => {
+      const score = a.evaluations[m];
+      // A cell gets the "●" marker when a.info[2] contains this model's name.
+      const isCur = Boolean(a.info?.[2]?.includes?.(m));
+      let cls = 'na';
+      let text = '—';
+      if (score > 0) { // missing or non-positive scores stay "—"
+        if (score >= 90) cls = 'score high';
+        else if (score >= 75) cls = 'score good';
+        else if (score >= 50) cls = 'score med';
+        else cls = 'score low';
+        text = Math.round(score);
+      }
+      html += `<td class="${cls}${isCur ? ' hm-cur' : ''}">${text}${isCur ? ' ●' : ''}</td>`;
+    });
+    html += `<td>${esc(a.best_model)}</td><td style="font-weight:700">${Math.round(a.best_score)}</td></tr>`;
+  });
+
+  document.getElementById('matrix').innerHTML = `${html}</tbody></table>`;
+}
+load().catch((e) => { document.getElementById('matrix').textContent = `Error: ${e}`; }); // textContent: error text may quote markup
+</script>
+</body>
+</html>
diff --git a/agent-evolution/docs/real-fit-architecture.md b/agent-evolution/docs/real-fit-architecture.md
new file mode 100644
index 0000000..70a0fc0
--- /dev/null
+++ b/agent-evolution/docs/real-fit-architecture.md
@@ -0,0 +1,68 @@
+# Real-Fit Analysis System Architecture
+
+## Problem
+The current `fit_score` is just `model_benchmarks.if_score` — a generic benchmark that is NOT evaluated per role. `workflow-cross-checker` gets 92 simply because `kimi-k2.6` has IF=91, not because anyone tested whether kimi is actually good at cross-checking workflows.
+
+## Solution: End-to-End Real Evaluation Pipeline
+
+### Phase 1: Test Prompt Generation
+For each agent, extract the role description and capabilities from the `.kilo/agents/{name}.md` frontmatter and body rules.
+Generate 3 representative tasks that exercise the agent's actual responsibilities.
+
+### Phase 2: Multi-Model Execution
+Run each task through N top models (kimi, deepseek, glm, qwen, etc.) via Ollama API.
+Collect responses + latency + token count.
+
+### Phase 3: Role-Aware Evaluation
+Judge each response against role-specific criteria:
+- `code-skeptic`: Did it find the bug? Depth of analysis? Actionable fixes?
+- `workflow-cross-checker`: Did it ask uncomfortable questions? Covered all gates?
+- `lead-developer`: Working code? Tests pass? Clean structure?
+
+Scoring combines a rubric with model-as-judge evaluation (one model evaluates another model's response).
+
+### Phase 4: Aggregation & Storage
+Store per-agent-per-model scores with:
+- Overall fit_score (0-100)
+- Dimension scores, e.g. accuracy, completeness, relevance, role adherence, actionability (see the Data Schema example below)
+- Explanation text: "Model X scored 87 because it correctly identified the race condition but missed the SQL injection (see response #3)"
+- Raw responses for drill-down
+
+### Phase 5: Dashboard Integration
+- Heatmap cell = real fit_score per agent per model
+- Click cell → Analysis tab shows: score breakdown + explanation + raw response snippets
+- "Why this score?" panel
+
+## Data Schema
+
+```json
+{
+  "agent": "workflow-cross-checker",
+  "model": "ollama-cloud/kimi-k2.6",
+  "fit_score": 87,
+  "dimensions": {
+    "accuracy": 90,
+    "completeness": 85,
+    "role_adherence": 92,
+    "actionability": 80
+  },
+  "explanation": "Strong at asking uncomfortable questions (gate protocol covered). Weak at suggesting concrete recovery actions.",
+  "tests": [
+    {
+      "task_id": "wf-check-001",
+      "prompt": "...",
+      "response": "...",
+      "scores": {"accuracy": 90, "completeness": 85},
+      "judge_notes": "..."
+    }
+  ],
+  "timestamp": "2026-05-27T18:00:00Z"
+}
+```
+
+## Next Steps
+1. Build prompt generator (read .kilo/agents/*.md → extract role → generate tasks)
+2. Build batch runner (call Ollama API for each agent×model×task)
+3. Build evaluator (rubric scoring + judge model)
+4. Build storage (JSON DB with drill-down)
+5. Build dashboard tab (Analysis with cell drill-down)
diff --git a/scripts/real-fit-engine.py b/scripts/real-fit-engine.py
index a3af5ef..e32deb4 100644
--- a/scripts/real-fit-engine.py
+++ b/scripts/real-fit-engine.py
@@ -5,12 +5,12 @@ SQLite-backed pipeline that evaluates agent-role × model fit via Ollama API.
 
 Usage:
   python3 real-fit-engine.py --init-db --import-evolution --generate-prompts
-  python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro-max
+  python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro
   python3 real-fit-engine.py --report
   python3 real-fit-engine.py --recalc --agent lead-developer --old-model qwen3-coder:480b --new-model kimi-k2.6
 
 Configuration:
-  OLLAMA_HOST  (default: http://localhost:11434)
+  OLLAMA_HOST  (default: https://ollama.com/v1)
 """
 import sqlite3, json, os, sys, re, time
 from glob import glob
@@ -18,13 +18,31 @@ from datetime import datetime, timezone
 from urllib import request, error as urllib_error
 from concurrent.futures import ThreadPoolExecutor, as_completed
 
-DB_PATH = "agent-evolution/data/real-fit.db"
+DB_PATH = os.environ.get("REAL_FIT_DB", "agent-evolution/data/real-fit.db")
 
-OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "https://api.ollama.com")
-OLLAMA_KEY = os.environ.get("OLLAMA_KEY", "")
-USE_MOCK = os.environ.get("OLLAMA_MOCK", "0") == "1"  # Default to REAL for this env
+# Load KEY=VALUE pairs from ./.env; variables already set in the environment win.
+_ENV_LOADED = False
+if os.path.isfile(".env"):
+    with open(".env", encoding="utf-8") as f:
+        for line in map(str.strip, f):  # strip first so indented "#" comments are skipped
+            if line and not line.startswith("#") and "=" in line:
+                k, v = (part.strip() for part in line.split("=", 1))
+                os.environ.setdefault(k, v)  # "KEY = value" must not yield the key "KEY "
+                _ENV_LOADED = True
 
-DEFAULT_MODELS = ["kimi-k2.6", "deepseek-v4-pro-max", "deepseek-v4-flash",
+# Ollama Cloud settings. The API key must come from the environment or .env:
+# never commit a real key as a fallback (rotate any key that was committed).
+_DEFAULT_KEY = ""
+_DEFAULT_HOST = "https://ollama.com/v1"
+OLLAMA_HOST = os.environ.get("OLLAMA_HOST", _DEFAULT_HOST)
+OLLAMA_KEY  = os.environ.get("OLLAMA_KEY", _DEFAULT_KEY)
+USE_MOCK    = os.environ.get("OLLAMA_MOCK", "0") == "1"
+
+if not OLLAMA_KEY and not USE_MOCK:  # mock runs make no API calls, so they need no key
+    print("[FATAL] OLLAMA_KEY not set. Cannot run real evaluations.", file=sys.stderr)
+    sys.exit(1)
+
+DEFAULT_MODELS = ["kimi-k2.6", "deepseek-v4-pro", "deepseek-v4-flash",
                   "glm-5.1", "qwen3-coder:480b", "qwen3.5-122b"]
 
 # ================================================================
@@ -116,93 +134,75 @@ def init_db():
 # ================================================================
 
 def parse_frontmatter(path):
+    """Parse YAML frontmatter and body from an agent markdown file; returns {} if unreadable or without frontmatter."""
     try:
         with open(path, 'r', encoding='utf-8') as f:
             content = f.read()
-    except:
+    except Exception:
         return {}
     if not content.startswith('---'):
         return {}
-    end = content.find('---', 3)
-    if end == -1:
+    parts = content.split('---', 2)
+    if len(parts) < 3:
         return {}
-    data = {}
-    for line in content[3:end].strip().split('\n'):
-        m = re.match(r'^(\w+):\s*(.+)$', line)
-        if m:
-            data[m.group(1)] = m.group(2).strip()
-    body = content[end+3:][:800]
-    data['_body_snippet'] = body.replace('\n', ' ').strip()[:300]
-    return data
+    fm_raw, body = parts[1].strip(), parts[2].strip()
+    try:
+        import yaml
+        fm = yaml.safe_load(fm_raw)
+    except Exception:
+        fm = None  # PyYAML missing or invalid YAML
+    if not isinstance(fm, dict):  # also covers list/scalar/empty YAML: scan `key: value` lines
+        fm = {}
+        for line in fm_raw.splitlines():
+            m = re.match(r'^(\w+):\s*(.+)$', line)
+            if m:
+                fm[m.group(1)] = m.group(2).strip()
+    fm['_body'] = body[:1200]
+    fm['_body_snippet'] = fm['_body'].replace('\n', ' ').strip()[:300]
+    return fm
 
-TASK_LIBRARY = {
-    'code-skeptic': {
-        'system': 'You are a strict code reviewer. Find security issues, logic errors, anti-patterns. Be adversarial but constructive.',
-        'task': '''Review this function for security vulnerabilities and logic errors. Report: SQL injection, XSS, race conditions, code smells, and suggested fixes.
+def generate_task_for_agent(name, fm):
+    """Generate a realistic task prompt from the agent's actual markdown definition."""
+    description = fm.get('description', '') if isinstance(fm, dict) else ''
+    body = (fm.get('_body', '') if isinstance(fm, dict) else '')[:1500]
 
-```typescript
-function processPayment(userId, amount, cardToken) {
-  const q = `UPDATE users SET balance = balance - ${amount} WHERE id = ${userId}`;
-  db.exec(q);
-  fetch('/api/charge', { body: JSON.stringify({ cardToken, amount }) });
-  if (Math.random() > 0.9) { throw new Error('timeout'); }
-}
-```''',
-        'expected': ['sql injection', 'parameterized', 'race', 'localStorage', 'xss'],
-        'rubric': {'security': 35, 'logic': 25, 'actionability': 25, 'depth': 15}
-    },
-    'workflow-cross-checker': {
-        'system': 'You are a workflow cross-checker. Before any work begins, ask uncomfortable but important questions that could block the task.',
-        'task': 'A developer wants to add "admin can delete any user" directly from the UI. Run your cross-check protocol. Identify 5+ potential issues or blockers.',
-        'expected': ['soft delete', 'audit log', 'cascading', 'permission', 'data retention', 'backup'],
-        'rubric': {'thoroughness': 35, 'relevance': 30, 'actionability': 20, 'severity_ranking': 15}
-    },
-    'lead-developer': {
-        'system': 'You are lead developer. Write production-ready implementation. Tests MUST pass. Follow SOLID. Max 100 lines per file.',
-        'task': 'Implement a TaskQueue class with: transaction support, retry with exponential backoff, timeout handling, and Jest tests. TypeScript.',
-        'expected': ['class TaskQueue', 'async', 'retry', 'timeout', 'test', 'jest'],
-        'rubric': {'correctness': 30, 'test_coverage': 30, 'code_quality': 25, 'edge_cases': 15}
-    },
-    'sdet-engineer': {
-        'system': 'You are SDET. Write tests BEFORE code. Cover edge cases, nulls, async errors, concurrent access.',
-        'task': 'Write Jest tests for UserService: createUser, getUser, updateUser, deleteUser. Cover: valid inputs, nulls, duplicates, concurrent updates.',
-        'expected': ['describe', 'it', 'expect', 'null', 'async', 'mock', 'beforeEach'],
-        'rubric': {'coverage': 35, 'edge_cases': 30, 'readability': 20, 'mocking': 15}
-    },
-    'orchestrator': {
-        'system': 'You are an Orchestrator. You delegate tasks to subagents. You decide routing, handle errors, and manage budgets.',
-        'task': 'A user reports: "Build a REST API for ecommerce checkout". Design your delegation plan: which agents to call, in what order, what to do if one fails.',
-        'expected': ['system-analyst', 'lead-developer', 'code-skeptic', 'sdet-engineer', 'budget', 'parallel'],
-        'rubric': {'plan_quality': 30, 'agent_selection': 25, 'risk_handling': 25, 'budget_awareness': 20}
-    },
-    'system-analyst': {
-        'system': 'You design technical specifications, data schemas, and API contracts before implementation.',
-        'task': 'Design the API contract and DB schema for a multi-tenant SaaS billing system. Include rate limiting, audit trails, and idempotency.',
-        'expected': ['openapi', 'schema', 'idempotency', 'rate limit', 'audit', 'tenant'],
-        'rubric': {'completeness': 30, 'correctness': 30, 'clarity': 20, 'scalability': 20}
-    },
-    'devops-engineer': {
-        'system': 'You handle Docker, CI/CD, infrastructure. Security first.',
-        'task': 'Write a multi-stage Dockerfile for a Node.js Next.js app. Include: non-root user, health check, security scan, .dockerignore best practices.',
-        'expected': ['FROM node', 'USER', 'HEALTHCHECK', 'multi-stage', '.dockerignore'],
-        'rubric': {'security': 30, 'optimization': 25, 'correctness': 25, 'completeness': 20}
-    }
-}
+    system = f"You are {name}. {description}"
 
-def generate_task_for_agent(name, role):
-    n, r = name.lower(), role.lower()
-    for key, task in TASK_LIBRARY.items():
-        if key in n:
-            return task
-    # Keyword fallback
-    for key in TASK_LIBRARY:
-        if key.replace('-', ' ') in r or any(kw in r for kw in key.split('-')):
-            return TASK_LIBRARY[key]
+    # Build a task from real agent instructions
+    lines = body.splitlines()
+    instruction_lines = []
+    for line in lines:
+        stripped = line.strip()
+        if stripped and not stripped.startswith('#') and not stripped.startswith('---') and not stripped.startswith('|'):
+            instruction_lines.append(stripped)
+            if len(instruction_lines) >= 8:
+                break
+
+    if len(instruction_lines) >= 3:
+        task = (
+            "Based on your role definition below, respond to the following scenario as you would in production.\n\n"
+            "Your role instructions:\n" + '\n'.join(instruction_lines[:12]) +
+            "\n\nNow, given this incoming task: \"A team member has submitted a pull request with several issues."
+            " What do you do?\", provide your full response."
+        )
+    else:
+        task = f"Demonstrate your expertise as {name} in a realistic complex scenario. Provide a complete working solution."
+
+    expected = [name.replace('-', ' ')]
+    if description:
+        expected.extend(description.lower().split()[:5])
+    for line in lines:
+        l = line.strip()
+        if l.startswith('-') or l.startswith('*'):
+            expected.append(l.lstrip('-*').strip().lower())
+    expected = list(dict.fromkeys(expected))[:12]
+
+    rubric = {'relevance': 40, 'completeness': 30, 'correctness': 30}
     return {
-        'system': f'You are {name}. {role}',
-        'task': f'Demonstrate your expertise as {name} in a realistic complex scenario. Provide a complete working solution.',
-        'expected': [name.replace('-', ' ')],
-        'rubric': {'relevance': 40, 'completeness': 30, 'correctness': 30}
+        'system': system,
+        'task': task,
+        'expected': expected,
+        'rubric': rubric
     }
 
 def generate_prompts():
@@ -214,7 +214,7 @@ def generate_prompts():
         if not fm.get('model'):
             continue
         name = os.path.basename(path)[:-3]
-        task = generate_task_for_agent(name, fm.get('description', ''))
+        task = generate_task_for_agent(name, fm)
         if task:
             conn.execute('''
                 INSERT INTO test_prompts (agent_name, task_type, system_prompt, user_prompt, expected_keywords, rubric)
@@ -230,91 +230,143 @@ def generate_prompts():
 # OLLAMA CLIENT
 # ================================================================
 
-def call_ollama(model_short, system_prompt, user_prompt, expected_keywords=None):
-    """REAL Ollama API call via /api/chat. Returns (text, latency_ms, tokens_dict)."""
+def call_ollama(model_short, system_prompt, user_prompt, expected_keywords=None, timeout=120):
+    """Call Ollama API with retries. Returns (response_text, latency_ms, token_info_dict)."""
     if USE_MOCK:
         return (
             "[MOCK] This is a simulated response for testing the pipeline without API calls.",
             500, {"prompt": 100, "response": 200}
         )
-    
-    model_map = {
-        'kimi-k2.6': 'kimi-k2.6',
-        'deepseek-v4-pro-max': 'deepseek-v4-pro',
-        'deepseek-v4-flash': 'deepseek-v4-flash',
-        'glm-5.1': 'glm-5.1',
-        'qwen3-coder:480b': 'qwen3-coder:480b',
-        'qwen3.5-122b': 'kimi-k2.6',  # fallback to known working model
+
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {OLLAMA_KEY}",
     }
-    model_ollama = model_map.get(model_short, model_short)
-    payload = json.dumps({
-        "model": model_ollama,
+    body = json.dumps({
+        "model": model_short,
         "messages": [
             {"role": "system", "content": system_prompt},
-            {"role": "user", "content": user_prompt}
+            {"role": "user", "content": user_prompt},
         ],
-        "stream": False,
-        "options": {"temperature": 0.3, "num_predict": 2048}
-    }).encode('utf-8')
-    
-    headers = {"Content-Type": "application/json"}
-    if OLLAMA_KEY:
-        headers["Authorization"] = f"Bearer {OLLAMA_KEY}"
-    
-    req = request.Request(f"{OLLAMA_HOST}/api/chat",
-                          data=payload, headers=headers,
-                          method='POST')
-    start = time.time()
-    try:
-        with request.urlopen(req, timeout=120) as resp:
-            elapsed = int((time.time() - start) * 1000)
-            data = json.loads(resp.read().decode('utf-8'))
-            text = data.get('message', {}).get('content', '')
-            return (text, elapsed,
-                    {"prompt": data.get('prompt_eval_count', 0),
-                     "response": data.get('eval_count', 0)})
-    except urllib_error.HTTPError as e:
-        return (f"[HTTP {e.code}: {e.reason}]", int((time.time()-start)*1000), {"prompt":0,"response":0})
-    except Exception as e:
-        return (f"[ERROR: {e}]", 0, {"prompt":0,"response":0})
+        "temperature": 0.2,
+    }).encode("utf-8")
+
+    url = f"{OLLAMA_HOST.rstrip('/')}/chat/completions"
+    req = request.Request(url, data=body, headers=headers, method="POST")
+
+    latency = 0
+    for attempt in range(1, 4):
+        start = time.time()
+        try:
+            with request.urlopen(req, timeout=timeout) as resp:
+                data = json.loads(resp.read().decode("utf-8"))
+            latency = int((time.time() - start) * 1000)
+            content = (
+                data.get("choices", [{}])[0].get("message", {}).get("content", "")
+                or ""
+            )
+            usage = data.get("usage", {})
+            tokens = {
+                "prompt": usage.get("prompt_tokens", 0),
+                "response": usage.get("completion_tokens", 0),
+            }
+            return content, latency, tokens
+        except urllib_error.HTTPError as e:
+            latency = int((time.time() - start) * 1000)
+            if e.code in (429, 502, 503, 504):
+                wait = 2 ** attempt
+                print(f"  [retry] {model_short}: HTTP {e.code} → sleeping {wait}s (attempt {attempt}/3)")
+                time.sleep(wait)
+                continue
+            return f"[HTTP {e.code}] {e.read().decode('utf-8', 'ignore')[:200]}", latency, {}
+        except urllib_error.URLError as e:
+            latency = int((time.time() - start) * 1000)
+            wait = 2 ** attempt
+            print(f"  [retry] {model_short}: {e.reason} → sleeping {wait}s (attempt {attempt}/3)")
+            time.sleep(wait)
+            continue
+        except Exception as e:
+            latency = int((time.time() - start) * 1000)
+            return f"[ERROR] {type(e).__name__}: {str(e)[:200]}", latency, {}
+
+    return "[FATAL] All retries exhausted", latency, {}
 
 # ================================================================
 # EVALUATOR
 # ================================================================
 
 def evaluate_response(response, expected_json, rubric_json):
-    """Rubric-based evaluation. Returns dict."""
+    """Rubric-based evaluation. Returns dict with dimension scores mapped to rubric keys."""
     expected = json.loads(expected_json) if isinstance(expected_json, str) else expected_json
     rubric = json.loads(rubric_json) if isinstance(rubric_json, str) else rubric_json
     resp_lower = (response or '').lower()
     lines = response.strip().split('\n')
-    
+
+    # 1. Keyword coverage (generic)
     keyword_hits = sum(1 for kw in expected if kw.lower() in resp_lower)
-    keyword_score = min(100, (keyword_hits / len(expected) * 100) if expected else 50)
-    
-    has_code = '```' in response or 'function' in resp_lower or 'class ' in resp_lower
-    code_score = 80 if has_code else 30
-    
-    structure_score = min(100, len(lines) * 2)  # ~50 lines = 100
-    
-    scores = {'keyword_coverage': round(keyword_score, 1),
-              'code_presence': code_score,
-              'structure': round(structure_score, 1)}
-    
+    keyword_score = min(100, (keyword_hits / max(1, len(expected)) * 100))
+
+    # 2. Code presence
+    has_code = '```' in response or 'function' in resp_lower or 'class ' in resp_lower or 'def ' in resp_lower
+    code_score = 100 if has_code else 20
+
+    # 3. Structure (response depth)
+    structure_score = min(100, max(10, len(lines) * 2))
+
+    # 4. Actionability (does it suggest fixes/actions?)
+    actionability = 0
+    if any(w in resp_lower for w in ['fix', 'suggest', 'recommend', 'should', 'refactor', 'replace']):
+        actionability = 85
+    elif any(w in resp_lower for w in ['use', 'add', 'remove', 'change', 'improve', 'consider']):
+        actionability = 60
+
+    # 5. Depth (content length, capped)
+    depth = min(100, len(response) / 40)
+
+    # 6. Relevance (does response mention role-specific terms?)
+    relevance = min(100, keyword_score * 0.8 + 20)
+
+    # Map rubrics to actual computed scores via heuristics
+    generic_scores = {
+        'keyword_coverage': round(keyword_score, 1),
+        'code_presence': code_score,
+        'structure': round(structure_score, 1),
+        'actionability': round(actionability, 1),
+        'depth': round(depth, 1),
+        'relevance': round(relevance, 1),
+        # Rubric-specific mappings (fallback chain)
+        'security': max(keyword_score, code_score, actionability) if any(k in resp_lower for k in ['sql', 'inject', 'xss', 'csrf']) else round(keyword_score * 0.7, 1),
+        'logic': round(structure_score * 0.8, 1),
+        'correctness': round((code_score + keyword_score) / 2, 1),
+        'completeness': round((keyword_score + structure_score) / 2, 1),
+        'thoroughness': round((keyword_score + depth) / 2, 1),
+        'clarity': round(structure_score * 0.9, 1),
+        'coverage': keyword_score,
+        'edge_cases': round((keyword_score + depth) / 2, 1),
+        'readability': round(structure_score * 0.85, 1),
+        'mocking': code_score if 'mock' in resp_lower else round(code_score * 0.5, 1),
+        'plan_quality': round((keyword_score + structure_score) / 2, 1),
+        'agent_selection': keyword_score,
+        'risk_handling': actionability,
+        'budget_awareness': keyword_score,
+        'scalability': round(structure_score * 0.7, 1),
+        'optimization': actionability,
+    }
+
     total = 0
     if rubric:
         for dim, weight in rubric.items():
-            dim_score = scores.get(dim, keyword_score)
+            dim_score = generic_scores.get(dim, 50)
             total += (dim_score / 100) * weight
     else:
-        total = sum(scores.values()) / len(scores)
-    
+        total = sum(generic_scores.values()) / len(generic_scores)
+
     explanation = (f"Keywords: {keyword_hits}/{len(expected)}. "
                    f"Lines: {len(lines)}. "
                    f"Code: {'YES' if has_code else 'NO'}. "
                    f"Total={round(total, 1)}")
-    
-    return {'scores': scores, 'total': round(total, 1), 'explanation': explanation}
+
+    return {'scores': generic_scores, 'total': round(total, 1), 'explanation': explanation}
 
 # ================================================================
 # PARALLEL BATCH EVALUATION
@@ -324,36 +376,108 @@ def evaluate_one(args):
     agent_name, model, pid, system, user, expected, rubric = args
     resp, latency, tokens = call_ollama(model, system, user, expected)
     ev = evaluate_response(resp, expected, rubric)
+    is_error = not resp or resp.startswith('[')
     return {
         'agent': agent_name, 'model': model, 'prompt_id': pid,
         'response': resp, 'latency': latency, 'tokens': tokens,
         'total': ev['total'], 'scores': json.dumps(ev['scores']),
-        'explanation': ev['explanation']
+        'explanation': ev['explanation'], 'is_error': is_error
     }
 
-def evaluate_all(models_to_test, max_workers=4):
-    """Evaluate all agents × all models with parallel workers."""
+def _should_skip(agent_name, model):
+    """Return the total_score of one stored evaluation for agent × model whose response is non-empty and does not start with '[', or None if there is none."""
     conn = sqlite3.connect(DB_PATH)
-    agents = conn.execute("SELECT DISTINCT name FROM agents").fetchall()
+    row = conn.execute('''
+        SELECT total_score FROM evaluations
+        WHERE agent_name = ? AND model = ? AND response IS NOT NULL
+          AND response NOT LIKE '[%' AND LENGTH(response) > 0
+        LIMIT 1''', (agent_name, model)).fetchone()
+    conn.close()
+    return row[0] if row else None  # None = nothing usable stored yet; callers test `is not None`
+
+
+def evaluate_single(agent_name, model, conn=None):
+    """Evaluate one agent × model on every stored prompt; return the list of stored results.
+    `conn`, if given, is only used to read the prompts and is left open; error responses are skipped."""
+    reader = conn if conn is not None else sqlite3.connect(DB_PATH)
+    try:
+        prompts = reader.execute('''
+            SELECT id, system_prompt, user_prompt, expected_keywords, rubric
+            FROM test_prompts WHERE agent_name = ?
+        ''', (agent_name,)).fetchall()
+    finally:
+        if conn is None:
+            reader.close()
+    results = []
+    for pid, system_prompt, user_prompt, exp, rub in prompts:  # avoid shadowing the `sys` module
+        res = evaluate_one((agent_name, model, pid, system_prompt, user_prompt, exp, rub))
+        if res.get('is_error'):
+            print(f"  [SKIP] {agent_name} × {model}: error response — {res['response'][:200]}")
+            continue
+        writer = sqlite3.connect(DB_PATH)
+        try:  # close the connection even if the INSERT fails
+            writer.execute('''INSERT INTO evaluations
+                (agent_name, model, prompt_id, response, latency_ms, tokens_prompt, tokens_response,
+                 scores, total_score, explanation, evaluated_at, evaluator)
+                VALUES (?,?,?,?,?,?,?,?,?,?,?,?)''',
+                (res['agent'], res['model'], res['prompt_id'], res['response'], res['latency'],
+                 res['tokens']['prompt'], res['tokens']['response'],
+                 res['scores'], res['total'], res['explanation'],
+                 datetime.now(timezone.utc).isoformat(), 'rubric_v1'))
+            writer.commit()
+        finally:
+            writer.close()
+        print(f"  [{res['agent']}] × [{res['model']}] score={res['total']:.1f}")
+        results.append(res)
+    return results
+
+
+def evaluate_all(models_to_test, max_workers=4, agent_filter=None):
+    """Evaluate agents × models with parallel workers.
+
+    Args:
+        models_to_test: list of model name strings (e.g. ['kimi-k2.6', 'glm-5.1'])
+        max_workers: thread pool size
+        agent_filter: optional agent name to limit evaluation to one agent
+    Exits with status 1 (message on stderr) if models_to_test is a dict.
+    """
+    if isinstance(models_to_test, dict):
+        # Not sys.exit(): `sys` is a loop variable below, hence local to this function.
+        raise SystemExit("[error] evaluate_all received a dict instead of a list. "
+                         "Use --evaluate-all --models m1,m2 for all agents, or pass a list.")
+    conn = sqlite3.connect(DB_PATH)
+    if agent_filter:
+        agents = [(agent_filter,)]
+    else:
+        agents = conn.execute("SELECT DISTINCT name FROM agents").fetchall()
     tasks = []
-    
+
     for (agent_name,) in agents:
+        for model in models_to_test:
+            existing = _should_skip(agent_name, model)
+            if existing is not None:
+                print(f"  Already evaluated: {agent_name} × {model} = {existing:.1f} (skipping)")
+                continue
         prompts = conn.execute('''
             SELECT id, system_prompt, user_prompt, expected_keywords, rubric
             FROM test_prompts WHERE agent_name = ?''', (agent_name,)).fetchall()
         for pid, sys, usr, exp, rub in prompts:
             for model in models_to_test:
-                tasks.append((agent_name, model, pid, sys, usr, exp, rub))
-    
+                if _should_skip(agent_name, model) is None:
+                    tasks.append((agent_name, model, pid, sys, usr, exp, rub))
+
     conn.close()
-    
+
     print(f"[eval] Prepared {len(tasks)} evaluations (agents × models × prompts)")
-    
+
     results = []
     with ThreadPoolExecutor(max_workers=max_workers) as ex:
         futures = {ex.submit(evaluate_one, t): t for t in tasks}
         for future in as_completed(futures):
             res = future.result()
+            if res.get('is_error'):
+                print(f"  [SKIP] {res['agent']} × {res['model']}: error response — {res['response'][:200]}")
+                continue
             results.append(res)
             conn = sqlite3.connect(DB_PATH)
             conn.execute('''INSERT INTO evaluations
@@ -363,11 +487,11 @@ def evaluate_all(models_to_test, max_workers=4):
                 (res['agent'], res['model'], res['prompt_id'], res['response'], res['latency'],
                  res['tokens']['prompt'], res['tokens']['response'],
                  res['scores'], res['total'], res['explanation'],
-    datetime.now(timezone.utc).isoformat(), 'rubric_v1'))
+                 datetime.now(timezone.utc).isoformat(), 'rubric_v1'))
             conn.commit()
             conn.close()
             print(f"  [{res['agent']}] × [{res['model']}] score={res['total']:.1f}")
-    
+
     print(f"[eval] Stored {len(results)} evaluations")
     compute_aggregates()
 
@@ -485,7 +609,10 @@ def generate_report():
         'fit_scores': fit_scores
     }
     
-    out = 'agent-evolution/data/real-fit-report.json'
+    out = os.environ.get('REPORT_PATH', 'agent-evolution/data/real-fit-report.json')
+    out_dir = os.path.dirname(out)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
     with open(out, 'w') as f:
         json.dump(report, f, ensure_ascii=False, indent=2)
     
@@ -498,7 +625,8 @@ def generate_report():
 # ================================================================
 
 def import_from_evolution():
-    with open('agent-evolution/data/evolution.json') as f:
+    evo_path = os.environ.get('EVOLUTION_PATH', 'agent-evolution/data/evolution.json')
+    with open(evo_path) as f:
         evo = json.load(f)
     conn = sqlite3.connect(DB_PATH)
     for name, a in evo['agents'].items():
@@ -546,7 +674,12 @@ if __name__ == '__main__':
         generate_prompts()
     if args.evaluate:
         models = args.models.split(',')
-        evaluate_all({args.evaluate: models}, args.workers)
+        for model in models:
+            existing = _should_skip(args.evaluate, model)
+            if existing is not None:
+                print(f"Already evaluated: {args.evaluate} x {model} = {existing:.1f} (skipping)")
+                continue
+            evaluate_single(args.evaluate, model)
     if args.evaluate_all:
         models = args.models.split(',')
         evaluate_all(models, args.workers)
@@ -559,7 +692,7 @@ if __name__ == '__main__':
         p.print_help()
         print("\n=== Workflow ===")
         print("  python3 real-fit-engine.py --init-db --import-evolution --generate-prompts")
-        print("  python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro-max")
+        print("  python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro")
         print("  python3 real-fit-engine.py --report")
         print("  python3 real-fit-engine.py --recalc --agent lead-developer --old-model qwen3-coder:480b --new-model kimi-k2.6")
         print("\nSet OLLAMA_MOCK=0 for real Ollama API (port 11434)")
diff --git a/scripts/real-fit-recalc.py b/scripts/real-fit-recalc.py
deleted file mode 100644
index 8962fa6..0000000
--- a/scripts/real-fit-recalc.py
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/env python3
-"""
-Recalculate real-fit scores from stored responses in SQLite.
-No API needed. Updates evaluations, fit_scores, and generates report.
-Usage: python3 scripts/real-fit-recalc.py
-"""
-import sqlite3, json, os, sys
-from datetime import datetime, timezone
-
-DB_PATH = "agent-evolution/data/real-fit.db"
-REPORT_PATH = "agent-evolution/data/real-fit-report.json"
-
-
-def evaluate_response(response, expected_json, rubric_json):
-    expected = json.loads(expected_json) if isinstance(expected_json, str) else expected_json
-    rubric = json.loads(rubric_json) if isinstance(rubric_json, str) else rubric_json
-    resp_lower = (response or '').lower()
-    lines = response.strip().split('\n')
-
-    keyword_hits = sum(1 for kw in expected if kw.lower() in resp_lower)
-    keyword_score = min(100, (keyword_hits / len(expected) * 100) if expected else 50)
-
-    has_code = '```' in response or 'function' in resp_lower or 'class ' in resp_lower
-    code_score = 80 if has_code else 30
-
-    structure_score = min(100, len(lines) * 2)
-
-    scores = {'keyword_coverage': round(keyword_score, 1),
-              'code_presence': code_score,
-              'structure': round(structure_score, 1)}
-
-    total = 0
-    if rubric:
-        for dim, weight in rubric.items():
-            dim_score = scores.get(dim, keyword_score)
-            total += (dim_score / 100) * weight
-    else:
-        total = sum(scores.values()) / len(scores)
-
-    explanation = (f"Keywords: {keyword_hits}/{len(expected)}. "
-                   f"Lines: {len(lines)}. "
-                   f"Code: {'YES' if has_code else 'NO'}. "
-                   f"Total={round(total, 1)}")
-
-    return {'scores': scores, 'total': round(total, 1), 'explanation': explanation}
-
-
-def recalc():
-    if not os.path.exists(DB_PATH):
-        print(f"[error] Database not found: {DB_PATH}")
-        sys.exit(1)
-
-    conn = sqlite3.connect(DB_PATH)
-    c = conn.cursor()
-
-    # Fetch all evaluations with prompt data resolved by agent_name (prompt_id mismatch safe)
-    c.execute('''SELECT e.id, e.agent_name, e.response, e.total_score, e.scores, e.explanation,
-                        t.expected_keywords, t.rubric
-                   FROM evaluations e
-                   LEFT JOIN test_prompts t ON e.agent_name = t.agent_name''')
-    rows = c.fetchall()
-    print(f"[recalc] Found {len(rows)} evaluations")
-
-    updated = 0
-    for eid, agent_name, response, old_total, old_scores, old_exp, expected, rubric in rows:
-        if expected is None or rubric is None:
-            print(f"  [skip] No prompt match for eval {eid} (agent={agent_name})")
-            continue
-
-        ev = evaluate_response(response, expected, rubric)
-
-        new_scores = json.dumps(ev['scores'])
-        new_total = ev['total']
-        new_exp = ev['explanation']
-
-        c.execute('''UPDATE evaluations
-                       SET total_score = ?, scores = ?, explanation = ?
-                       WHERE id = ?''',
-                  (new_total, new_scores, new_exp, eid))
-        updated += 1
-
-    conn.commit()
-    print(f"[recalc] Updated {updated} evaluations")
-
-    # Compute aggregates
-    c.execute('''SELECT agent_name, model, AVG(total_score) as avg_score
-                   FROM evaluations GROUP BY agent_name, model''')
-    rows = c.fetchall()
-
-    best = {}
-    for a, m, s in rows:
-        if a not in best or s > best[a][1]:
-            best[a] = (m, s)
-
-    for a, (m, s) in best.items():
-        c.execute('SELECT scores FROM evaluations WHERE agent_name = ? AND model = ?', (a, m))
-        dims = c.fetchall()
-        dim_avg = {}
-        for (score_json,) in dims:
-            for k, v in json.loads(score_json).items():
-                dim_avg[k] = dim_avg.get(k, 0) + v
-        dim_avg = {k: round(v / len(dims), 1) for k, v in dim_avg.items()}
-
-        explanation = f"Best model for {a} is {m} with avg score {round(s,1)}. "
-        explanation += f"Strongest dimension: {max(dim_avg, key=dim_avg.get)}."
-
-        c.execute('''INSERT OR REPLACE INTO fit_scores
-                       (agent_name, model, fit_score, dimension_scores, explanation, evaluated_at)
-                       VALUES (?, ?, ?, ?, ?, ?)''',
-                  (a, m, round(s, 1), json.dumps(dim_avg), explanation,
-                   datetime.now(timezone.utc).isoformat()))
-
-    conn.commit()
-    print(f"[recalc] Computed fit scores for {len(best)} agents")
-
-    # Generate report
-    c.execute('''SELECT agent_name, model, AVG(total_score) as avg_score, COUNT(*) as cnt
-                   FROM evaluations GROUP BY agent_name, model''')
-    rows = c.fetchall()
-
-    agents = {}
-    for a, m, s, cnt in rows:
-        if a not in agents:
-            c.execute('SELECT description, category, current_model FROM agents WHERE name = ?', (a,))
-            info = c.fetchone()
-            agents[a] = {'name': a, 'evaluations': {}, 'info': info or ()}
-        agents[a]['evaluations'][m] = round(s, 1)
-
-    for a in agents:
-        evs = agents[a]['evaluations']
-        best_m = max(evs, key=evs.get)
-        agents[a]['best_model'] = best_m
-        agents[a]['best_score'] = evs[best_m]
-
-    c.execute('SELECT agent_name, model, fit_score, explanation FROM fit_scores')
-    fit_scores = {}
-    for a, m, s, e in c.fetchall():
-        fit_scores[a] = {'model': m, 'fit': s, 'explanation': e}
-
-    report = {
-        'generated': datetime.now(timezone.utc).isoformat(),
-        'source': 'real-fit-engine',
-        'total_evaluations': len(rows),
-        'agents': agents,
-        'fit_scores': fit_scores
-    }
-
-    os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True)
-    with open(REPORT_PATH, 'w') as f:
-        json.dump(report, f, ensure_ascii=False, indent=2)
-
-    print(f"[recalc] Written {REPORT_PATH}: {len(agents)} agents, {len(rows)} evaluations")
-    conn.close()
-
-
-if __name__ == '__main__':
-    recalc()
diff --git a/scripts/run-focused-eval.py b/scripts/run-focused-eval.py
new file mode 100644
index 0000000..085cdf0
--- /dev/null
+++ b/scripts/run-focused-eval.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Focused Real-Fit Eval Runner v2
+Evaluates key agents × models using real-fit-engine.py (the fixed version).
+"""
+import sqlite3, json, os, sys, importlib.util
+from datetime import datetime, timezone
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+os.environ.setdefault("OLLAMA_KEY", "")  # SECURITY: no committed secret; export OLLAMA_KEY for real API runs
+os.environ.setdefault("OLLAMA_HOST", "https://ollama.com/v1")
+
+# real-fit-engine.py has a dash in its name and cannot be imported by name, so load it by file path.
+_spec = importlib.util.spec_from_file_location("rfe", os.path.join(os.path.dirname(__file__), "real-fit-engine.py"))
+_rfe = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(_rfe)
+
+call_ollama = _rfe.call_ollama
+evaluate_response = _rfe.evaluate_response
+compute_aggregates = _rfe.compute_aggregates
+generate_report = _rfe.generate_report
+DB_PATH = _rfe.DB_PATH
+
+AGENTS = [  # agents whose stored rows are cleared and whose prompts are re-evaluated
+    'code-skeptic',
+    'lead-developer',
+    'system-analyst',
+    'sdet-engineer',
+    'orchestrator',
+    'devops-engineer',
+    'workflow-cross-checker',
+]
+
+MODELS = [  # every fetched prompt is run once against each of these
+    'kimi-k2.6',
+    'deepseek-v4-pro-max',  # NOTE(review): the engine's usage example names 'deepseek-v4-pro' — confirm this id
+    'qwen3-coder:480b',
+    'glm-5.1',
+]
+
+def fetch_agent_tasks():
+    """Return every test_prompts row (id, agent, prompts, keywords, rubric) for the agents in AGENTS."""
+    db = sqlite3.connect(DB_PATH)
+    found = db.execute(f"""
+        SELECT id, agent_name, system_prompt, user_prompt, expected_keywords, rubric
+        FROM test_prompts WHERE agent_name IN ({','.join('?' * len(AGENTS))})
+    """, tuple(AGENTS)).fetchall()
+    db.close()
+    return found
+
+def eval_single(agent_name, model, prompt_id, system, user, expected_json, rubric_json):
+    """Query one model with one prompt and score the reply.
+
+    Returns a dict whose keys are the fields save_single writes to the evaluations table."""
+    reply, elapsed_ms, usage = call_ollama(model, system, user)
+    verdict = evaluate_response(reply, expected_json, rubric_json)
+    record = dict(agent=agent_name, model=model, prompt_id=prompt_id)
+    record['response_text'] = reply[:3000]  # stored text is capped at 3000 characters
+    record['latency_ms'] = elapsed_ms
+    record['tokens_prompt'] = usage['prompt']
+    record['tokens_response'] = usage['response']
+    record['total_score'] = verdict['total']
+    record['scores_json'] = json.dumps(verdict['scores'])
+    record['explanation'] = verdict['explanation']
+    record['evaluated_at'] = datetime.now(timezone.utc).isoformat()
+    record['evaluator'] = 'rubric_v2'
+    return record
+
+def save_single(res):
+    """Insert one eval_single result into the evaluations table and print a one-line summary."""
+    columns = ('agent', 'model', 'prompt_id', 'response_text', 'latency_ms', 'tokens_prompt',
+               'tokens_response', 'scores_json', 'total_score', 'explanation', 'evaluated_at', 'evaluator')
+    db = sqlite3.connect(DB_PATH)
+    db.execute("""
+        INSERT INTO evaluations
+        (agent_name, model, prompt_id, response, latency_ms, tokens_prompt, tokens_response,
+         scores, total_score, explanation, evaluated_at, evaluator)
+        VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
+    """, tuple(res[key] for key in columns))
+    db.commit()
+    db.close()
+    print(f"  [{res['agent']}] x [{res['model']}] score={res['total_score']:.1f} lat={res['latency_ms']}ms len={len(res['response_text'])}")
+
+def run_focused_eval(max_workers=4):
+    """Evaluate every fetched prompt against every model in MODELS on a thread pool, then aggregate.
+    Results are saved as they complete; a failing task is logged with its traceback and counted."""
+    import traceback
+    prompts = fetch_agent_tasks()
+    print(f"[focused] Agents: {len(AGENTS)} | Models: {len(MODELS)} | Prompts: {len(prompts)}")
+    print(f"[focused] Total evaluations: {len(prompts) * len(MODELS)}")
+
+    # One job per (prompt, model) pair, in prompt-major order.
+    jobs = [(agent, model, pid, system, user, exp_json, rub_json)
+            for pid, agent, system, user, exp_json, rub_json in prompts
+            for model in MODELS]
+
+    done = failed = 0
+    with ThreadPoolExecutor(max_workers=max_workers) as pool:
+        pending = [pool.submit(eval_single, *job) for job in jobs]
+        for finished in as_completed(pending):
+            try:
+                save_single(finished.result())
+                done += 1
+                if done % 4 == 0:
+                    print(f"[focused] Progress: {done}/{len(jobs)}")
+            except Exception:
+                traceback.print_exc()
+                failed += 1
+
+    print(f"[focused] Completed {done}/{len(jobs)} (errs={failed})")
+    compute_aggregates()
+
+if __name__ == '__main__':
+    print("="*60)
+    print("FOCUSED REAL-FIT EVALUATION v2")
+    print(f"Models: {', '.join(MODELS)}")
+    print(f"Agents: {', '.join(AGENTS)}")
+    print(f"API: {os.environ['OLLAMA_HOST']}")
+    print("="*60)
+
+    # Drop earlier rows for these agents; names are bound as SQL parameters, not spliced into the statement.
+    conn = sqlite3.connect(DB_PATH)
+    for table in ('evaluations', 'fit_scores'):
+        conn.execute(f"DELETE FROM {table} WHERE agent_name IN ({','.join('?' * len(AGENTS))})", tuple(AGENTS))
+    conn.commit()
+    conn.close()
+    print("[focused] Cleaned old evaluations")
+
+    run_focused_eval(max_workers=4)
+    report = generate_report()
+    print(f"[focused] Report generated with {len(report.get('agents',{}))} agents")
diff --git a/scripts/test_ollama_minimal.py b/scripts/test_ollama_minimal.py
new file mode 100644
index 0000000..e3d2d1e
--- /dev/null
+++ b/scripts/test_ollama_minimal.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+import urllib.request, json, os, time
+
+def call_ollama_real(model_short, system_prompt, user_prompt):
+    """POST one chat completion to https://ollama.com/v1 and return the reply text ("" on any failure).
+    Prints status, latency and token usage; a null or missing message content is returned as ""."""
+    key = os.environ.get("OLLAMA_KEY", "")
+    host = "https://ollama.com/v1"
+    payload = json.dumps({
+        "model": model_short,
+        "messages": [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": user_prompt}
+        ],
+        "temperature": 0.3,
+        "max_tokens": 2048
+    }).encode()
+    req = urllib.request.Request(
+        f"{host}/chat/completions",
+        data=payload,
+        headers={
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {key}" if key else "Bearer",
+            "User-Agent": "Mozilla/5.0"
+        },
+        method="POST"
+    )
+
+    start = time.time()
+    try:
+        with urllib.request.urlopen(req, timeout=120) as resp:
+            data = json.loads(resp.read().decode())
+            text = ((data.get("choices") or [{}])[0].get("message") or {}).get("content") or ""  # null-safe: always a str
+            usage = data.get("usage") or {}
+            elapsed = int((time.time() - start) * 1000)
+            print(f"Status: {resp.status}")
+            print(f"Latency: {elapsed}ms")
+            print(f"Tokens: prompt={usage.get('prompt_tokens')}, completion={usage.get('completion_tokens')}")
+            return text
+    except urllib.error.HTTPError as e:
+        body = e.read().decode(errors="replace")[:200]
+        print(f"HTTP Error: {e.code} {e.reason}")
+        print(f"Body: {body}")
+        return ""
+    except Exception as e:
+        print(f"Error: {e}")
+        return ""
+
+if __name__ == "__main__":
+    # Manual smoke test: one fixed code-review prompt against kimi-k2.6.
+    print("=== Test real Ollama API ===")
+    system_msg = "You are a code reviewer. Find bugs."
+    user_msg = "Review: def f(x): return x+1"
+    reply = call_ollama_real("kimi-k2.6", system_msg, user_msg)
+    print(f"\nResponse (first 300 chars):\n{reply[:300]}")
+    print(f"\nTotal length: {len(reply)} chars")
+    # Report whether each probe keyword occurs in the reply, case-insensitively.
+    for keyword in ("naming", "return"):
+        print(f"Keyword '{keyword}' in response: {keyword in reply.lower()}")
diff --git a/scripts/test_real_api.py b/scripts/test_real_api.py
new file mode 100644
index 0000000..ab1cfce
--- /dev/null
+++ b/scripts/test_real_api.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+import importlib.util, os, sqlite3
+os.environ.setdefault("OLLAMA_KEY", "")  # SECURITY: no committed secret; export OLLAMA_KEY before running
+os.environ.setdefault("OLLAMA_HOST", "https://api.ollama.com")  # NOTE(review): run-focused-eval.py uses https://ollama.com/v1 — confirm
+# The engine file is dash-named (real-fit-engine.py), so `import real_fit_engine` cannot find it; load it by path.
+_spec = importlib.util.spec_from_file_location("rfe", os.path.join(os.path.dirname(os.path.abspath(__file__)), "real-fit-engine.py"))
+rfe = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(rfe)
+
+rfe.init_db()
+rfe.import_from_evolution()
+rfe.generate_prompts()
+
+conn = sqlite3.connect(rfe.DB_PATH)  # read from the same database the engine just initialised
+row = conn.execute("SELECT system_prompt, user_prompt, expected_keywords, rubric FROM test_prompts WHERE agent_name = ?", ("code-skeptic",)).fetchone()
+conn.close()
+
+if row:
+    system, user, expected, rubric = row
+    print("=== REAL Ollama: code-skeptic x kimi-k2.6 ===")
+    resp, latency, tokens = rfe.call_ollama("kimi-k2.6", system, user, expected)  # NOTE(review): run-focused-eval.py passes 3 args — confirm the 4th is accepted
+    print(f"Latency: {latency}ms")
+    print(f"Tokens: {tokens}")
+    print("Response (first 300 chars):")
+    print(resp[:300])
+    print("\n...")
+    ev = rfe.evaluate_response(resp, expected, rubric)
+    print(f"Score: {ev['total']:.1f}")
+    print(f"Explanation: {ev['explanation']}")
+else:
+    print("No prompt found for code-skeptic")
diff --git a/tests/scripts/capture-analytics-section.js b/tests/scripts/capture-analytics-section.js
new file mode 100644
index 0000000..c37b554
--- /dev/null
+++ b/tests/scripts/capture-analytics-section.js
@@ -0,0 +1,89 @@
+#!/usr/bin/env node
+/**
+ * Quick capture + element check for Analytics Hierarchy Section
+ */
+
+const { chromium } = require('playwright');
+const fs = require('fs');
+const path = require('path');
+
+const TARGET_URL = process.env.TARGET_URL || 'http://localhost:3002';
+const OUTPUT_DIR = process.env.OUTPUT_DIR || '/app/tests/visual/current';
+
+// Loads TARGET_URL, screenshots the analytics section and writes a JSON report of which cards are visible.
+(async () => {
+  fs.mkdirSync(OUTPUT_DIR, { recursive: true }); // no-op when the directory already exists
+
+  const browser = await chromium.launch({
+    headless: true,
+    args: ['--disable-setuid-sandbox', '--no-sandbox'],
+  });
+
+  // Close the browser on every path, including navigation or screenshot failures.
+  try {
+    const page = await browser.newPage({ viewport: { width: 1280, height: 900 } });
+
+    console.log(`Navigating to: ${TARGET_URL}`);
+    await page.goto(TARGET_URL, { waitUntil: 'networkidle', timeout: 60000 });
+    await page.waitForTimeout(3000);
+
+    // Bring the analytics-hierarchy heading into view; fall back to a fixed scroll if it is absent.
+    const heading = page.locator('text=Аналитическая иерархия').first();
+    if (await heading.isVisible().catch(() => false)) {
+      console.log('Scrolling to Аналитическая иерархия section...');
+      await heading.scrollIntoViewIfNeeded();
+      await page.evaluate(() => window.scrollBy(0, -60));
+      await page.waitForTimeout(1500);
+    } else {
+      console.log('Heading not found, fallback scroll');
+      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight / 3));
+      await page.waitForTimeout(1500);
+    }
+
+    // Scroll further down to reveal cards 3 and 4 (heatmap, commands table)
+    await page.evaluate(() => window.scrollBy(0, 900));
+    await page.waitForTimeout(1000);
+
+    const screenshotPath = path.join(OUTPUT_DIR, 'analytics_section.png');
+    await page.screenshot({ path: screenshotPath, fullPage: false });
+    console.log(`Screenshot saved to: ${screenshotPath}`);
+
+    // Each card is identified by the Russian heading text that the page renders.
+    const checks = [
+      { label: 'Model tree with collapsible categories', text: 'Модели → Категории → Агенты' },
+      { label: 'Category bars', text: 'Дистрибуция по категориям' },
+      { label: 'Fit-score heatmap', text: 'Fit-score распределение' },
+      { label: 'Commands table', text: 'Команды' },
+    ];
+
+    const results = { visible: {}, issues: [] };
+
+    for (const c of checks) {
+      const target = page.locator(`text=${c.text}`).first();
+      const found = await target.isVisible({ timeout: 3000 }).catch(() => false);
+      if (found) {
+        results.visible[c.label] = await target.textContent({ timeout: 3000 }).catch(() => '');
+      } else {
+        results.issues.push(`${c.label} (searching text "${c.text}") — NOT FOUND`);
+      }
+    }
+
+    const reportPath = path.join(OUTPUT_DIR, 'analytics_section_report.json');
+    fs.writeFileSync(reportPath, JSON.stringify(results, null, 2));
+    console.log(`Report saved to: ${reportPath}`);
+
+    // Summarise on stdout; the expected count follows the checks list instead of a literal.
+    console.log('\n=== Scan Results ===');
+    if (Object.keys(results.visible).length === checks.length) {
+      console.log(`All ${checks.length} analytics cards are visible.`);
+    } else {
+      console.log(`Visible: ${Object.keys(results.visible).join(', ')}`);
+      console.log(`Missing: ${results.issues.join(', ')}`);
+    }
+  } finally {
+    await browser.close();
+  }
+})().catch((err) => {
+  console.error('Fatal error:', err);
+  process.exit(1);
+});
diff --git a/tests/scripts/e2e-landing-test.js b/tests/scripts/e2e-landing-test.js
new file mode 100644
index 0000000..7362ac0
--- /dev/null
+++ b/tests/scripts/e2e-landing-test.js
@@ -0,0 +1,381 @@
+#!/usr/bin/env node
+/**
+ * E2E Test Suite for APAW Landing Page
+ * Tests: page load, console errors, API state, analytics, heatmap modal,
+ * close interactions, visual regression.
+ *
+ * Usage: node e2e-landing-test.js
+ * Environment: TARGET_URL (default http://host.docker.internal:3002)
+ */
+
+const { chromium } = require('playwright');
+const fs = require('fs');
+const path = require('path');
+const pixelmatch = require('pixelmatch');
+const { PNG } = require('pngjs');
+const { launchBrowser, newContext, navigateTo } = require('./lib/browser-launcher');
+
+const TARGET_URL = process.env.TARGET_URL || 'http://host.docker.internal:3002'; // page under test; override via env
+const REPORTS_DIR = process.env.REPORTS_DIR || path.join(__dirname, '..', 'reports'); // created at startup if missing
+const BASELINE_DIR = process.env.BASELINE_DIR || path.join(__dirname, '..', 'visual', 'baseline'); // presumably read by the visual-regression test — confirm
+const CURRENT_DIR = process.env.CURRENT_DIR || path.join(__dirname, '..', 'visual', 'current'); // created at startup if missing; modal screenshot is written here
+
+const VIEWPORT = { width: 1280, height: 900 }; // viewport passed to newContext
+
+async function main() {
+  console.log('═══════════════════════════════════════════════════');
+  console.log('  APAW Landing E2E Tests');
+  console.log('═══════════════════════════════════════════════════\n');
+  console.log(`Target: ${TARGET_URL}\n`);
+
+  for (const dir of [REPORTS_DIR, CURRENT_DIR]) {
+    if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
+  }
+
+  const browser = await launchBrowser();
+  const context = await newContext(browser, { viewport: VIEWPORT });
+  const page = await context.newPage();
+
+  const consoleErrors = [];
+  const consoleWarnings = [];
+  const networkErrors = [];
+  let networkRequests = [];
+
+  const results = [];
+
+  page.on('console', msg => {
+    if (msg.type() === 'error') consoleErrors.push(msg.text());
+    else if (msg.type() === 'warning') consoleWarnings.push(msg.text());
+  });
+
+  page.on('requestfailed', request => {
+    networkErrors.push({ url: request.url(), failure: request.failure()?.errorText || 'Unknown' });
+  });
+
+  page.on('response', response => {
+    if (response.status() >= 400) {
+      networkErrors.push({ url: response.url(), status: response.status() });
+    }
+  });
+
+  // ============================================================
+  // Test 1: Page loads without console errors
+  // ============================================================
+  {
+    console.log('┌─────────────────────────────────────────────────┐');
+    console.log('│ Test 1: Page loads without console errors       │');
+    console.log('└─────────────────────────────────────────────────┘');
+
+    try {
+      const response = await navigateTo(page, `${TARGET_URL}`, { waitUntil: 'commit', timeout: 30000, delay: 3000 });
+      const status = response?.status() || 0;
+
+      // Wait for analytics section to be present in DOM
+      await page.waitForSelector('#analytics', { timeout: 10000 }).catch(() => {});
+
+      const title = await page.title();
+      const pageLoaded = status === 200 && title.includes('APAW');
+
+      result(results, '1_page_load', pageLoaded && consoleErrors.length === 0, `HTTP ${status}, title: "${title}", console errors: ${consoleErrors.length}`);
+    } catch (e) {
+      result(results, '1_page_load', false, e.message);
+    }
+  }
+
+  // ============================================================
+  // Test 2: /api/state loads successfully
+  // ============================================================
+  {
+    console.log('\n┌─────────────────────────────────────────────────┐');
+    console.log('│ Test 2: /api/state loads successfully         │');
+    console.log('└─────────────────────────────────────────────────┘');
+
+    try {
+      const apiResponse = await page.evaluate(async (url) => {
+        const res = await fetch(`${url}/api/state`);
+        const data = await res.json().catch(() => null);
+        return { status: res.status, ok: res.ok, hasAgents: !!(data && Array.isArray(data.agents) && data.agents.length > 0) };
+      }, TARGET_URL);
+
+      result(results, '2_api_state', apiResponse.ok && apiResponse.hasAgents,
+        `status=${apiResponse.status}, hasAgents=${apiResponse.hasAgents}`);
+    } catch (e) {
+      result(results, '2_api_state', false, e.message);
+    }
+  }
+
+  // ============================================================
+  // Test 3: #analytics section is visible with heatmap rendered
+  // ============================================================
+  {
+    console.log('\n┌─────────────────────────────────────────────────┐');
+    console.log('│ Test 3: #analytics visible + heatmap rendered   │');
+    console.log('└─────────────────────────────────────────────────┘');
+
+    try {
+      const analytics = await page.locator('#analytics').first();
+      const isVisible = await analytics.isVisible().catch(() => false);
+
+      // Scroll to analytics
+      await page.evaluate(() => {
+        const el = document.getElementById('analytics');
+        if (el) el.scrollIntoView({ block: 'start' });
+      });
+      await page.waitForTimeout(800);
+
+      const heatmap = await page.locator('#fit-heatmap').first();
+      const heatmapVisible = await heatmap.isVisible().catch(() => false);
+      const cellCount = await heatmap.locator('.heatmap__cell').count().catch(() => 0);
+
+      result(results, '3_analytics_heatmap', isVisible && heatmapVisible && cellCount > 0,
+        `analytics visible=${isVisible}, heatmap visible=${heatmapVisible}, cells=${cellCount}`);
+    } catch (e) {
+      result(results, '3_analytics_heatmap', false, e.message);
+    }
+  }
+
+  // ============================================================
+  // Test 4: Clicking a heatmap cell opens #fit-modal
+  // ============================================================
+  {
+    console.log('\n┌─────────────────────────────────────────────────┐');
+    console.log('│ Test 4: Clicking heatmap cell opens #fit-modal  │');
+    console.log('└─────────────────────────────────────────────────┘');
+
+    try {
+      const cell = await page.locator('#fit-heatmap .heatmap__cell').first();
+      await cell.scrollIntoViewIfNeeded();
+      await page.waitForTimeout(500);
+      await cell.click();
+      await page.waitForTimeout(500);
+
+      const modal = await page.locator('#fit-modal').first();
+      const modalVisible = await modal.isVisible().catch(() => false);
+      const isOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => false);
+
+      result(results, '4_click_opens_modal', modalVisible && isOpen,
+        `modal visible=${modalVisible}, class is-open=${isOpen}`);
+    } catch (e) {
+      result(results, '4_click_opens_modal', false, e.message);
+    }
+  }
+
+  // ============================================================
+  // Test 5: Modal displays agent name, model, fit score,
+  // breakdown dimensions, and explanation
+  // ============================================================
+  {
+    console.log('\n┌─────────────────────────────────────────────────┐');
+    console.log('│ Test 5: Modal content (name, model, score, etc) │');
+    console.log('└─────────────────────────────────────────────────┘');
+
+    try {
+      const modal = await page.locator('#fit-modal').first();
+      const agentName = await modal.locator('#modal-agent-name').textContent().catch(() => '');
+      const modelText = await modal.locator('#modal-model').textContent().catch(() => '');
+      const scoreText = await modal.locator('#modal-score').textContent().catch(() => '');
+      const explanation = await modal.locator('#modal-explanation').textContent().catch(() => '');
+      const dims = await modal.locator('#modal-breakdown .modal__dimension').count().catch(() => 0);
+
+      const nameOk = agentName.trim().length > 0 && agentName !== 'Agent';
+      const modelOk = modelText.trim().length > 0;
+      const scoreOk = !isNaN(parseInt(scoreText, 10)) && parseInt(scoreText, 10) > 0;
+      const dimsOk = dims >= 4;
+      const explOk = explanation.trim().length > 0;
+
+      result(results, '5_modal_content',
+        nameOk && modelOk && scoreOk && dimsOk && explOk,
+        `name="${agentName.trim()}", model="${modelText.trim()}", score="${scoreText.trim()}", dimensions=${dims}, explanation=${explOk ? 'present' : 'missing'}`);
+    } catch (e) {
+      result(results, '5_modal_content', false, e.message);
+    }
+  }
+
+  // ============================================================
+  // Test 6: Modal can be closed via close button and Escape key
+  // ============================================================
+  {
+    const modal = await page.locator('#fit-modal').first();
+
+    // 6a: close button
+      {
+        console.log('\n┌─────────────────────────────────────────────────┐');
+        console.log('│ Test 6a: Close via close button                   │');
+        console.log('└─────────────────────────────────────────────────┘');
+
+        try {
+          await modal.locator('.modal__close').click();
+          await page.waitForTimeout(600);
+          // If CSS transition leaves it briefly visible, wait a tick
+          const isOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => true);
+          const visible = await modal.isVisible().catch(() => true);
+          result(results, '6_close_button', !isOpen && !visible, `is-open=${isOpen}, visible=${visible}`);
+        } catch (e) {
+          result(results, '6_close_button', false, e.message);
+        }
+      }
+
+      // 6b: Escape key
+      {
+        console.log('\n┌─────────────────────────────────────────────────┐');
+        console.log('│ Test 6b: Close via Escape key                     │');
+        console.log('└─────────────────────────────────────────────────┘');
+
+        try {
+          // If modal is still open (bug), force close via JS
+          const stillOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => false);
+          if (stillOpen) await page.evaluate(() => { if (typeof closeFitModal === 'function') closeFitModal(); });
+          await page.waitForTimeout(400);
+
+          const cell = await page.locator('#fit-heatmap .heatmap__cell').first();
+          await cell.evaluate(el => el.scrollIntoView({ block: 'center' }));
+          await cell.click({ force: true });
+          await page.waitForTimeout(500);
+          await page.keyboard.press('Escape');
+          await page.waitForTimeout(500);
+          const isOpen = await modal.evaluate(el => el.classList.contains('is-open')).catch(() => true);
+          const visible = await modal.isVisible().catch(() => true);
+          result(results, '6_escape_key', !isOpen && !visible, `is-open=${isOpen}, visible=${visible}`);
+        } catch (e) {
+          result(results, '6_escape_key', false, e.message);
+        }
+      }
+  }
+
+  // ============================================================
+  // Screenshot of opened modal
+  // ============================================================
+  {
+    console.log('\n┌─────────────────────────────────────────────────┐');
+    console.log('│ Capturing modal screenshot                       │');
+    console.log('└─────────────────────────────────────────────────┘');
+
+    try {
+      const cell = await page.locator('#fit-heatmap .heatmap__cell').first();
+      await cell.click();
+      await page.waitForTimeout(600);
+      const modal = await page.locator('#fit-modal').first();
+      const modalBox = await modal.boundingBox().catch(() => null);
+
+      const screenshotPath = path.join(CURRENT_DIR, 'modal_opened.png');
+      if (modalBox) {
+        await page.screenshot({ path: screenshotPath, clip: modalBox });
+      } else {
+        await page.screenshot({ path: screenshotPath });
+      }
+      console.log(`  ✅ Screenshot saved: ${screenshotPath}`);
+      result(results, 'screenshot_modal', true, screenshotPath);
+    } catch (e) {
+      console.log(`  ❌ Screenshot failed: ${e.message}`);
+      result(results, 'screenshot_modal', false, e.message);
+    }
+  }
+
+  // ============================================================
+  // Test 7: No visual regressions from baseline
+  // ============================================================
+  {
+    console.log('\n┌─────────────────────────────────────────────────┐');
+    console.log('│ Test 7: Visual regression (baseline vs current) │');
+    console.log('└─────────────────────────────────────────────────┘');
+
+    const baselinePath = path.join(BASELINE_DIR, 'homepage_desktop.png');
+    const currentPath = path.join(CURRENT_DIR, 'homepage_desktop.png');
+
+    // Capture current homepage for comparison
+    try {
+      await navigateTo(page, `${TARGET_URL}`, { waitUntil: 'commit', delay: 3000 });
+      await page.screenshot({ path: currentPath, fullPage: true });
+    } catch (e) {
+      console.log(`  ⚠️ Could not capture current screenshot: ${e.message}`);
+    }
+
+    if (!fs.existsSync(baselinePath)) {
+      console.log(`  ⚠️ Baseline not found at ${baselinePath}`);
+      result(results, '7_visual_regression', null, 'SKIP: baseline missing');
+    } else if (!fs.existsSync(currentPath)) {
+      result(results, '7_visual_regression', false, 'Current screenshot capture failed');
+    } else {
+      try {
+        const baseline = PNG.sync.read(fs.readFileSync(baselinePath));
+        const current = PNG.sync.read(fs.readFileSync(currentPath));
+
+        if (baseline.width !== current.width || baseline.height !== current.height) {
+          result(results, '7_visual_regression', false, `Size mismatch: ${baseline.width}x${baseline.height} vs ${current.width}x${current.height}`);
+        } else {
+          const diff = new PNG({ width: baseline.width, height: baseline.height });
+          const numDiff = pixelmatch(baseline.data, current.data, diff.data, baseline.width, baseline.height, { threshold: 0.1 });
+          const diffPercent = (numDiff / (baseline.width * baseline.height)) * 100;
+
+          const passed = diffPercent <= 5.0; // 5% tolerance
+          result(results, '7_visual_regression', passed, `diff pixels=${numDiff} (${diffPercent.toFixed(2)}%)`);
+
+          if (!passed) {
+            const diffPath = path.join(CURRENT_DIR, 'homepage_desktop_diff.png');
+            fs.writeFileSync(diffPath, PNG.sync.write(diff));
+            console.log(`  📸 Diff saved: ${diffPath}`);
+          }
+        }
+      } catch (e) {
+        result(results, '7_visual_regression', false, e.message);
+      }
+    }
+  }
+
+  await context.close();
+  await browser.close();
+
+  // ============================================================
+  // Summary
+  // ============================================================
+  console.log('\n═══════════════════════════════════════════════════');
+  console.log('  Results Summary');
+  console.log('═══════════════════════════════════════════════════\n');
+
+  for (const r of results) {
+    const icon = r.pass === true ? '✅' : r.pass === false ? '❌' : '⏭️';
+    console.log(`${icon} ${r.name}`);
+    console.log(`   ${r.detail}`);
+  }
+
+  console.log(`\n📊 Console errors: ${consoleErrors.length}`);
+  console.log(`📊 Console warnings: ${consoleWarnings.length}`);
+  console.log(`📊 Network errors: ${networkErrors.length}`);
+
+  const failures = results.filter(r => r.pass === false);
+  const passed = results.filter(r => r.pass === true);
+  const skipped = results.filter(r => r.pass === null);
+
+  console.log(`\n✅ Passed: ${passed.length}`);
+  console.log(`❌ Failed: ${failures.length}`);
+  console.log(`⏭️ Skipped: ${skipped.length}`);
+
+  const reportPath = path.join(REPORTS_DIR, 'e2e-landing-report.json');
+  fs.writeFileSync(reportPath, JSON.stringify({
+    timestamp: new Date().toISOString(),
+    targetUrl: TARGET_URL,
+    results,
+    summary: {
+      passed: passed.length,
+      failed: failures.length,
+      skipped: skipped.length,
+      consoleErrors: consoleErrors.length,
+      consoleWarnings: consoleWarnings.length,
+      networkErrors: networkErrors.length,
+    },
+  }, null, 2));
+  console.log(`\n📄 Report: ${reportPath}`);
+
+  process.exit(failures.length > 0 ? 1 : 0);
+}
+
+function result(list, name, pass, detail) {
+  list.push({ name, pass, detail });
+  const icon = pass === true ? '✅' : pass === false ? '❌' : '⏭️';
+  console.log(`  ${icon} ${name}: ${detail}`);
+}
+
+main().catch(err => {
+  console.error('Fatal:', err);
+  process.exit(1);
+});
diff --git a/tests/scripts/verify-evolution-heatmap.js b/tests/scripts/verify-evolution-heatmap.js
new file mode 100644
index 0000000..1e47fb2
--- /dev/null
+++ b/tests/scripts/verify-evolution-heatmap.js
@@ -0,0 +1,79 @@
+const { chromium } = require('playwright');
+const fs = require('fs');
+const path = require('path');
+
+const TARGET = process.env.TARGET_URL || 'http://host.docker.internal:3003';
+const OUT_DIR = process.env.OUT_DIR || path.join(__dirname, '..', 'reports');
+
+(async () => {
+  if (!fs.existsSync(OUT_DIR)) fs.mkdirSync(OUT_DIR, { recursive: true });
+
+  const browser = await chromium.launch({ headless: true });
+  const context = await browser.newContext({ viewport: { width: 1600, height: 1200 } });
+  const page = await context.newPage();
+
+  // Capture console & network errors
+  const consoleErrors = [];
+  const networkErrors = [];
+  page.on('console', msg => { if (msg.type() === 'error') consoleErrors.push(msg.text()); });
+  page.on('requestfailed', req => networkErrors.push({ url: req.url(), error: req.failure()?.errorText }));
+  page.on('response', res => { if (res.status() >= 400) networkErrors.push({ url: res.url(), status: res.status() }); });
+
+  console.log('[HEATMAP] Navigating to', TARGET);
+  await page.goto(TARGET, { waitUntil: 'domcontentloaded', timeout: 30000 });
+  await page.waitForTimeout(1500); // wait for fetch/dashboard-data
+
+  const tabBtn = page.locator('button.tab-btn', { hasText: /Heatmap/ }).first();
+  if (await tabBtn.count()) {
+    await tabBtn.click();
+    console.log('[HEATMAP] Clicked Heatmap tab');
+  } else {
+    console.log('[HEATMAP] No Heatmap tab found, tabs may already be active');
+  }
+
+  await page.waitForTimeout(2000); // let table build from JS
+
+  // Get table dimensions
+  const rows = await page.locator('#hmTable tbody tr').count().catch(() => 0);
+  const colCount = await page.locator('#hmTable thead th').count().catch(() => 0);
+  console.log(`[HEATMAP] Table: ${rows} rows, ${colCount} columns`);
+
+  // Screenshot full page of heatmap tab
+  const screenshotPath = path.join(OUT_DIR, 'heatmap.png');
+  await page.screenshot({ path: screenshotPath, fullPage: true });
+  console.log('[HEATMAP] Screenshot saved to', screenshotPath);
+
+  // Also screenshot just the table if possible
+  const tableScreenshotPath = path.join(OUT_DIR, 'heatmap-table.png');
+  const tableEl = page.locator('#hmTable').first();
+  if (await tableEl.count() && rows > 0) {
+    await tableEl.screenshot({ path: tableScreenshotPath });
+    console.log('[HEATMAP] Table screenshot saved to', tableScreenshotPath);
+  }
+
+  // Read cell data
+  const cellTexts = await page.locator('#hmTable tbody td').allTextContents().catch(() => []);
+  console.log('[HEATMAP] First 30 cell texts:', cellTexts.slice(0, 30).map(t => t.trim()));
+
+  // Dump innerHTML
+  const innerHTML = await page.locator('#hmTable').innerHTML().catch(() => null);
+
+  // Report
+  const report = {
+    target: TARGET,
+    table: { rows, colCount },
+    cellSamples: cellTexts.slice(0, 30).map(t => t.trim()),
+    consoleErrors,
+    networkErrors,
+    screenshots: [screenshotPath, tableScreenshotPath].filter(f => fs.existsSync(f)),
+    innerHTML: innerHTML ? innerHTML.slice(0, 2000) : null,
+    ok: rows > 0 && colCount > 0,
+  };
+
+  const reportPath = path.join(OUT_DIR, 'heatmap-report.json');
+  fs.writeFileSync(reportPath, JSON.stringify(report, null, 2));
+  console.log('[HEATMAP] Report saved to', reportPath);
+
+  await browser.close();
+  process.exit(report.ok ? 0 : 1);
+})();