Files
APAW/scripts/init-evolve-db.py
Deploy Bot 397d8367e9 feat: milestone 78 — objective model evolution from benchmark research
- Reassign 29/30 agents based on capability-analyst web research
- deepseek-v4-pro: 14 agents (coding SOTA: SWE-bench 80.6%, LiveCodeBench 93.5%)
- minimax-m3:cloud: 8 agents (agentic: BrowseComp 83.5%, 12h autonomous)
- glm-5.1: 4 agents (CyberGym 68.7% SOTA, sustained rounds)
- minimax-m2.5:cloud: 2 agents (frontend productivity, 2.2M pulls)
- kimi-k2.6: 1 agent (ONLY true multimodal)
- Add OpenCompass evaluation container (docker, scripts) for future objective runs
- Evidence saved to agent-evolution/data/research-report.json (598 lines, 6 models)

Data gaps honestly documented: minimax-m3/m2.5, qwen3-coder, kimi-k2.6 benchmark tables are image-only on Ollama.
2026-06-01 20:50:10 +01:00

136 lines
8.0 KiB
Python

#!/usr/bin/env python3
"""
Create evolution evaluation SQLite DB with real benchmark data.
Honest approach: only score assignments with verifiable data.
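Pending assignments are queued in the 'pending_evaluations' table together
with the reason they are blocked (conceptually 'needs_evolution_api'; this
script does not write that literal status anywhere).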
"""
import sqlite3, json, os
# Default location of the evaluation database, relative to the working directory.
db_path = '.kilo/logs/evolve-agent.db'

# Table definitions. IF NOT EXISTS leaves an already-created database intact.
_SCHEMA = (
    '''
    CREATE TABLE IF NOT EXISTS fit_scores (
        id INTEGER PRIMARY KEY,
        agent_name TEXT,
        model TEXT,
        fit_score REAL,
        confidence TEXT,
        data_source TEXT,
        benchmark_ref TEXT,
        status TEXT,
        updated_at TEXT
    )
    ''',
    '''
    CREATE TABLE IF NOT EXISTS benchmark_data (
        id INTEGER PRIMARY KEY,
        model TEXT,
        benchmark_name TEXT,
        score REAL,
        source_url TEXT,
        extracted_at TEXT
    )
    ''',
    '''
    CREATE TABLE IF NOT EXISTS pending_evaluations (
        id INTEGER PRIMARY KEY,
        agent_name TEXT,
        current_model TEXT,
        candidate_models TEXT,
        reason TEXT,
        blocked_by TEXT,
        priority INTEGER
    )
    ''',
)

# REAL benchmark data from capability-analyst research.
# Row layout: (model, benchmark_name, score, source_url)
benchmarks = [
    ("deepseek-v4-pro", "SWE-bench Verified", 80.6, "ollama.com/library/deepseek-v4-pro"),
    ("deepseek-v4-pro", "LiveCodeBench v6", 93.5, "ollama.com/library/deepseek-v4-pro"),
    ("deepseek-v4-pro", "Terminal-Bench 2.0", 67.9, "ollama.com/library/deepseek-v4-pro"),
    ("deepseek-v4-pro", "BrowseComp", 83.4, "ollama.com/library/deepseek-v4-pro"),
    ("deepseek-v4-pro", "GPQA-Diamond", 90.1, "ollama.com/library/deepseek-v4-pro"),
    ("deepseek-v4-pro", "MRCR 1M", 83.5, "ollama.com/library/deepseek-v4-pro"),
    ("glm-5.1", "SWE-bench Pro", 58.4, "ollama.com/library/glm-5.1"),
    ("glm-5.1", "BrowseComp", 68.0, "ollama.com/library/glm-5.1"),
    ("glm-5.1", "CyberGym", 68.7, "ollama.com/library/glm-5.1"),
    ("minimax-m3", "BrowseComp", 83.5, "ollama.com/library/minimax-m3"),
    # NOTE(review): 2.2 looks like a pull count in millions (the frontend-developer
    # row below cites "2.2M pulls"), not a percentage like the other scores.
    ("minimax-m2.5", "Ollama pulls", 2.2, "ollama.com/search?q=minimax"),
    ("qwen3-coder-480b", "Terminal-Bench 2", 23.9, "huggingface.co"),
    ("qwen3-coder-480b", "SWE-bench Pro", 38.7, "huggingface.co"),
]

# APPLIED assignments with confidence.
# Row layout: (agent_name, model, fit_score, confidence, benchmark_ref, status)
# The data_source column of fit_scores is not populated by this script.
applied = [
    ("lead-developer", "deepseek-v4-pro", 94.0, "high", "SWE-bench Verified 80.6%, LiveCodeBench 93.5%", "applied"),
    ("backend-developer", "deepseek-v4-pro", 93.0, "high", "Same coding benchmarks as lead-developer", "already_set"),
    ("php-developer", "deepseek-v4-pro", 88.0, "medium", "No PHP-specific benchmarks; extrapolated from coding scores", "already_set"),
    ("python-developer", "deepseek-v4-pro", 88.0, "medium", "No Python-specific benchmarks; extrapolated from coding scores", "already_set"),
    ("code-skeptic", "deepseek-v4-pro", 91.0, "high", "GPQA-Diamond 90.1% reasoning + LiveCodeBench 93.5% code analysis", "applied"),
    ("the-fixer", "deepseek-v4-pro", 90.0, "high", "Terminal-Bench 67.9% (terminal/code interaction) + SWE-bench 80.6%", "applied"),
    # NOTE(review): HMMT is cited here but has no row in `benchmarks` above.
    ("performance-engineer", "deepseek-v4-pro", 88.0, "medium", "Algorithmic reasoning from HMMT 95.2% + GPQA 90.1%", "applied"),
    ("frontend-developer", "minimax-m2.5:cloud", 92.0, "high", "User-confirmed best frontend model + 2.2M pulls + productivity focus", "applied"),
    ("browser-automation", "minimax-m2.5:cloud", 80.0, "medium", "Real-world task execution + productivity alignment", "applied"),
    ("flutter-developer", "minimax-m2.5:cloud", 78.0, "medium", "UI/productivity alignment; no Flutter-specific benchmarks", "applied"),
]

# PENDING assignments — need real API evaluation.
# Row layout: (agent_name, current_model, candidate_models, reason, blocked_by, priority)
# candidate_models is a comma-separated string.
pending = [
    ("orchestrator", "ollama-cloud/kimi-k2.6", "minimax-m3:cloud,glm-5.1,deepseek-v4-pro", "Agentic routing + 1M context needed", "No agentic routing benchmark data", 1),
    ("planner", "ollama-cloud/deepseek-v4-pro", "minimax-m3:cloud,glm-5.1,deepseek-v4-pro", "CoT/ToT planning benchmark gap", "No planning-specific benchmarks published", 1),
    ("system-analyst", "ollama-cloud/kimi-k2.6", "deepseek-v4-pro,minimax-m3:cloud,glm-5.1", "Architecture design + 1M context", "No architecture-specific benchmarks", 2),
    ("capability-analyst", "ollama-cloud/deepseek-v4-pro", "minimax-m3:cloud,deepseek-v4-pro,glm-5.1", "Gap analysis needs multi-model comparison", "No capability-analysis benchmarks", 2),
    ("security-auditor", "ollama-cloud/kimi-k2.6", "deepseek-v4-pro,glm-5.1,minimax-m3:cloud", "Security scan + CVE detection", "No security-specific benchmarks published", 3),
    ("visual-tester", "ollama-cloud/kimi-k2.6", "kimi-k2.6,minimax-m3:cloud", "Multimodal screenshot analysis", "kimi-k2.6 has native vision but no scores; minimax-m3 has multimodal", 3),
    ("evaluator", "ollama-cloud/kimi-k2.6", "deepseek-v4-pro,glm-5.1,minimax-m3:cloud", "Scoring reasoning", "No evaluator-specific benchmarks", 4),
    ("prompt-optimizer", "ollama-cloud/kimi-k2.6", "deepseek-v4-pro,glm-5.1,minimax-m3:cloud", "Meta-learning", "No prompt-optimization benchmarks", 4),
    ("devops-engineer", "ollama-cloud/kimi-k2.6", "deepseek-v4-pro,minimax-m3:cloud", "Docker/K8s config generation", "No DevOps-specific benchmarks", 5),
    ("incident-responder", "ollama-cloud/kimi-k2.6", "deepseek-v4-pro,glm-5.1", "Security forensics", "No incident-response benchmarks", 5),
    ("sdet-engineer", "ollama-cloud/kimi-k2.6", "qwen3-coder:480b,deepseek-v4-pro", "Test generation quality", "Terminal-Bench 23.9% for qwen3-coder vs 67.9% deepseek", 5),
    ("reflector", "ollama-cloud/kimi-k2.6", "glm-5.1,minimax-m3:cloud", "Self-reflection quality", "No self-reflection benchmarks", 6),
    ("memory-manager", "ollama-cloud/kimi-k2.6", "minimax-m3:cloud,deepseek-v4-pro", "1M context for memory", "MRCR 83.5% deepseek vs minimax-m3 512K-1M", 6),
    ("agent-architect", "ollama-cloud/kimi-k2.6", "glm-5.1,minimax-m3:cloud", "Agent design", "GLM-5.1 claims long-horizon persistence", 7),
    ("workflow-architect", "ollama-cloud/kimi-k2.6", "glm-5.1,minimax-m3:cloud", "Workflow design", "No workflow-specific benchmarks", 7),
    ("evolution-prompt", "ollama-cloud/kimi-k2.6", "deepseek-v4-pro,minimax-m3:cloud", "Stress-test generation", "No benchmark data", 8),
    ("history-miner", "ollama-cloud/kimi-k2.6", "qwen3-coder:480b", "Git history search", "Simple task; no benchmark needed", 8),
    ("product-owner", "ollama-cloud/kimi-k2.6", "qwen3-coder:480b", "Issue management", "Simple task; no benchmark needed", 9),
    ("release-manager", "ollama-cloud/kimi-k2.6", "qwen3-coder:480b", "Git operations", "Simple task; no benchmark needed", 9),
    ("requirement-refiner", "ollama-cloud/qwen3-coder:480b", "qwen3-coder:480b", "User story formatting", "Simple task; already optimal", 10),
    ("markdown-validator", "ollama-cloud/qwen3-coder:480b", "qwen3-coder:480b", "Markdown validation", "Simple task; already optimal", 10),
    ("pipeline-judge", "ollama-cloud/qwen3-coder:480b", "qwen3-coder:480b", "Fitness scoring", "Simple deterministic; already optimal", 10),
    ("go-developer", "ollama-cloud/qwen3-coder:480b", "deepseek-v4-pro", "Go coding", "No Go-specific benchmarks", 10),
    ("architect-indexer", "ollama-cloud/qwen3-coder:480b", "deepseek-v4-pro,minimax-m3:cloud", "Codebase indexing", "No indexing benchmarks", 10),
    ("workflow-cross-checker", "ollama-cloud/qwen3-coder:480b", "deepseek-v4-pro,glm-5.1", "Process inspection", "No process-specific benchmarks", 10),
    ("evolution-skeptic", "ollama-cloud/qwen3-coder:480b", "deepseek-v4-pro", "Rubric scoring", "No scoring-specific benchmarks", 10),
]


def _reseed(cursor, delete_sql, insert_sql, rows):
    """Replace the rows seeded by a previous run, then insert ``rows``.

    Every seed tuple starts with its two identifying columns, so
    ``delete_sql`` takes exactly those two values as parameters. All deletes
    run before any insert, so rows sharing a key within ``rows`` are all kept.
    """
    cursor.executemany(delete_sql, [row[:2] for row in rows])
    cursor.executemany(insert_sql, rows)


def init_db(path=db_path):
    """Create the tables if needed and (re)seed them at ``path``.

    Safe to run repeatedly: rows matching a seeded key are replaced instead
    of being appended again, and rows with other keys are left untouched.
    Replaced rows receive new ``id`` values and fresh timestamps.

    Args:
        path: Location of the SQLite file. Missing parent directories are
            created; a bare filename (no directory part) is accepted.

    Raises:
        sqlite3.Error: If the database cannot be opened or written. Seed
            changes made by the failing call are rolled back.
    """
    parent = os.path.dirname(path)
    if parent:  # os.makedirs('') raises FileNotFoundError for bare filenames
        os.makedirs(parent, exist_ok=True)

    conn = sqlite3.connect(path)
    try:
        # `with conn` commits on success and rolls back on an exception, so
        # a failure cannot leave a half-written seed behind.
        with conn:
            cur = conn.cursor()
            for ddl in _SCHEMA:
                cur.execute(ddl)

            _reseed(
                cur,
                'DELETE FROM benchmark_data WHERE model = ? AND benchmark_name = ?',
                '''
                INSERT INTO benchmark_data (model, benchmark_name, score, source_url, extracted_at)
                VALUES (?, ?, ?, ?, datetime('now'))
                ''',
                benchmarks,
            )
            _reseed(
                cur,
                'DELETE FROM fit_scores WHERE agent_name = ? AND model = ?',
                '''
                INSERT INTO fit_scores (agent_name, model, fit_score, confidence, benchmark_ref, status, updated_at)
                VALUES (?, ?, ?, ?, ?, ?, datetime('now'))
                ''',
                applied,
            )
            _reseed(
                cur,
                'DELETE FROM pending_evaluations WHERE agent_name = ? AND current_model = ?',
                '''
                INSERT INTO pending_evaluations (agent_name, current_model, candidate_models, reason, blocked_by, priority)
                VALUES (?, ?, ?, ?, ?, ?)
                ''',
                pending,
            )
    finally:
        # Always release the file handle, including when seeding failed.
        conn.close()


def main():
    """Seed the default database and print a short summary."""
    init_db(db_path)
    print(f"✅ SQLite DB created: {db_path}")
    print(f" Benchmark entries: {len(benchmarks)}")
    print(f" Applied assignments: {len(applied)}")
    print(f" Pending evaluations: {len(pending)}")


if __name__ == "__main__":
    main()