Files
APAW/agent-evolution/data/model-benchmarks-verified.json
Deploy Bot 9b0f160587 feat(dashboard): unified data pipeline, verified benchmarks, and browser testing
- build-standalone-fixed.cjs: reads from 4 real sources (agents md, kilo-meta.json, model-benchmarks-verified.json, agent-versions.json); computes recommendations dynamically
- build-standalone-direct.cjs: direct data export + HTML embed pipeline
- dashboard-smoke-test.ts: Playwright E2E smoke test covering all 6 tabs
- model-benchmarks-verified.json: verified IF scores from artificialanalysis.ai for 15 models (SWE-bench unverifiable → null)
- agent-versions.json: 347 git history entries extracted for 34 agents
- kilo-meta.json: prompt-optimizer → qwen3.5-122b, memory-manager → deepseek-v4-pro-max
- index.html: Recommendations tab rendering updated for dynamic data
- Dockerfile + docker-compose.yml: mount-driven build, no image rebuild for data changes
- README.md: updated dashboard docs and verified benchmark sources
2026-05-25 21:05:14 +01:00

307 lines
11 KiB
JSON

{
"version": "2.0.0",
"generated": "2026-05-25T16:58:00Z",
"source_note": "IF scores verified against Artificial Analysis IFBench component (where available). SWE-bench scores removed — NONE of the 15 models appear on the official SWE-bench leaderboard (swebench.com). All SWE-bench claims were unverifiable vendor/proprietary scores.",
"sources_checked": [
{
"name": "artificialanalysis.ai",
"url": "https://artificialanalysis.ai/",
"date": "2026-05-25",
"data": "IFBench component extracted from Intelligence Index v4.0"
},
{
"name": "swebench.com",
"url": "https://www.swebench.com/",
"date": "2026-05-25",
"data": "0 of 15 models found on Verified/Lite/Full leaderboards"
},
{
"name": "aider.chat",
"url": "https://aider.chat/docs/leaderboards/",
"date": "2026-05-25",
"data": "Aider polyglot: Kimi K2 = 59.1%, DeepSeek V3.2 = 74.2%. The exact Ollama Cloud models are not benchmarked."
}
],
"models": [
{
"id": "deepseek-v4-pro-max",
"name": "DeepSeek V4-Pro Max",
"organization": "DeepSeek",
"parameters": "1.6T/49B active MoE",
"context_window": 1000,
"context_window_str": "1M",
"if_score": 89,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.6 removed.",
"categories": ["coding", "agent", "reasoning"],
"provider": "ollama-cloud",
"updated": "2026-05-03"
},
{
"id": "deepseek-v4-flash",
"name": "DeepSeek V4-Flash",
"organization": "DeepSeek",
"parameters": "284B/13B active MoE",
"context_window": 1000,
"context_window_str": "1M",
"if_score": 86,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 79 removed.",
"categories": ["coding", "efficient", "agent"],
"provider": "ollama-cloud",
"updated": "2026-05-03"
},
{
"id": "kimi-k2.6",
"name": "Kimi K2.6",
"organization": "Moonshot AI",
"parameters": "1T/32B active MoE",
"context_window": 1000,
"context_window_str": "256K→1M",
"if_score": 91,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.2 removed. Aider polyglot lists Kimi K2 (not K2.6) at 59.1%.",
"categories": ["coding", "agent", "multimodal", "vision"],
"provider": "ollama-cloud",
"updated": "2026-04-24"
},
{
"id": "kimi-k2.5",
"name": "Kimi K2.5",
"organization": "Moonshot AI",
"parameters": "1T/32B active MoE",
"context_window": 256,
"context_window_str": "256K",
"if_score": 90,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 78 removed.",
"categories": ["coding", "agent", "multimodal", "vision"],
"provider": "ollama-cloud",
"updated": "2026-02-24"
},
{
"id": "qwen3-coder-480b",
"name": "Qwen3-Coder 480B",
"organization": "Qwen",
"parameters": "480B/35B active",
"context_window": 1000,
"context_window_str": "256K→1M",
"if_score": 88,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component (legacy model, superseded by Qwen3.5)",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 66.5 removed.",
"categories": ["coding", "agent"],
"provider": "ollama-cloud",
"updated": "2026-02-24"
},
{
"id": "qwen3.5-122b",
"name": "Qwen 3.5 122B",
"organization": "Qwen",
"parameters": "122B/10B active",
"context_window": 128,
"context_window_str": "128K",
"if_score": 92,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Brand new model (May 2026). No SWE-bench data yet.",
"categories": ["reasoning", "efficient", "vision", "tools"],
"provider": "ollama-cloud",
"updated": "2026-05-22"
},
{
"id": "gemma4-27b",
"name": "Gemma 4 (27B)",
"organization": "Google",
"parameters": "27B",
"context_window": 128,
"context_window_str": "128K",
"if_score": 85,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Brand new model (May 2026). No SWE-bench data yet.",
"categories": ["coding", "agent", "reasoning", "vision", "audio"],
"provider": "ollama-cloud",
"updated": "2026-05-22"
},
{
"id": "minimax-m2.5",
"name": "MiniMax M2.5",
"organization": "MiniMax",
"parameters": "MoE undisclosed",
"context_window": 128,
"context_window_str": "128K",
"if_score": 82,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.2 removed.",
"categories": ["coding", "agent"],
"provider": "ollama-cloud",
"updated": "2026-02-24"
},
{
"id": "minimax-m2.7",
"name": "MiniMax M2.7",
"organization": "MiniMax",
"parameters": "~10B active",
"context_window": 128,
"context_window_str": "128K",
"if_score": 80,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 78 removed.",
"categories": ["coding", "agent", "efficient"],
"provider": "ollama-cloud",
"updated": "2026-03-24"
},
{
"id": "glm-5.1",
"name": "GLM-5.1",
"organization": "Z.ai",
"parameters": "744B/40B active",
"context_window": 128,
"context_window_str": "128K",
"if_score": 90,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of SWE-Bench Pro SOTA removed. 8 agents assigned to GLM-5.1 — highest risk.",
"categories": ["reasoning", "agent"],
"provider": "ollama-cloud",
"updated": "2026-04-24"
},
{
"id": "glm-5",
"name": "GLM-5",
"organization": "Z.ai",
"parameters": "744B/40B active",
"context_window": 128,
"context_window_str": "128K",
"if_score": 90,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Superseded by GLM-5.1.",
"categories": ["reasoning", "agent"],
"provider": "ollama-cloud",
"updated": "2026-02-24"
},
{
"id": "nemotron-3-super",
"name": "Nemotron 3 Super",
"organization": "NVIDIA",
"parameters": "120B/12B active",
"context_window": 1000,
"context_window_str": "1M",
"if_score": 78,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 60.5 removed.",
"categories": ["agent", "reasoning", "efficient"],
"provider": "ollama-cloud",
"updated": "2026-03-24"
},
{
"id": "nemotron-3-nano",
"name": "Nemotron 3 Nano",
"organization": "NVIDIA",
"parameters": "30B/4B active",
"context_window": 128,
"context_window_str": "128K",
"if_score": 68,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Lightweight model with the lowest IF score in the fleet.",
"categories": ["agent", "efficient"],
"provider": "ollama-cloud",
"updated": "2026-03-24"
},
{
"id": "devstral-2",
"name": "Devstral 2",
"organization": "Mistral / Devstral",
"parameters": "123B",
"context_window": 128,
"context_window_str": "128K",
"if_score": 80,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard. Code model without verified code benchmark.",
"categories": ["coding", "agent"],
"provider": "ollama-cloud",
"updated": "2026-02-24"
},
{
"id": "devstral-small-2",
"name": "Devstral Small 2",
"organization": "Mistral / Devstral",
"parameters": "24B",
"context_window": 128,
"context_window_str": "128K",
"if_score": 75,
"if_score_verified": true,
"if_source": "artificialanalysis.ai IFBench component",
"swe_bench": null,
"swe_bench_verified": false,
"swe_bench_note": "Not on swebench.com leaderboard.",
"categories": ["coding", "agent"],
"provider": "ollama-cloud",
"updated": "2026-02-24"
}
],
"if_scores": {
"deepseek-v4-pro-max": 89,
"deepseek-v4-flash": 86,
"kimi-k2.6": 91,
"kimi-k2.5": 90,
"qwen3-coder-480b": 88,
"qwen3.5-122b": 92,
"gemma4-27b": 85,
"minimax-m2.5": 82,
"minimax-m2.7": 80,
"glm-5.1": 90,
"glm-5": 90,
"nemotron-3-super": 78,
"nemotron-3-nano": 68,
"devstral-2": 80,
"devstral-small-2": 75
},
"data_quality_summary": {
"if_scores_verified": 15,
"if_scores_unverified": 0,
"swe_bench_verified": 0,
"swe_bench_unverified": 15,
"recommendation": "Since all SWE-bench scores have been removed (unable to verify), the dashboard scoring formula should rely primarily on IF scores + context window bonus. Consider running SWE-bench Verified locally for glm-5.1 and kimi-k2.6 before assigning them to coding-heavy agents."
}
}