- build-standalone-fixed.cjs: reads from 4 real sources (agents md, kilo-meta.json, model-benchmarks-verified.json, agent-versions.json); computes recommendations dynamically - build-standalone-direct.cjs: direct data export + HTML embed pipeline - dashboard-smoke-test.ts: Playwright E2E smoke test covering all 6 tabs - model-benchmarks-verified.json: verified IF scores from artificialanalysis.ai for 15 models (SWE-bench unverifiable → null) - agent-versions.json: 347 git history entries extracted for 34 agents - kilo-meta.json: prompt-optimizer → qwen3.5-122b, memory-manager → deepseek-v4-pro-max - index.html: Recommendations tab rendering updated for dynamic data - Dockerfile + docker-compose.yml: mount-driven build, no image rebuild for data changes - README.md: updated dashboard docs and verified benchmark sources
307 lines
11 KiB
JSON
307 lines
11 KiB
JSON
{
|
|
"version": "2.0.0",
|
|
"generated": "2026-05-25T16:58:00Z",
|
|
"source_note": "IF scores verified against Artificial Analysis IFBench component (where available). SWE-bench scores removed — NONE of the 15 models appear on the official SWE-bench leaderboard (swebench.com). All SWE-bench claims were unverifiable vendor/proprietary scores.",
|
|
"sources_checked": [
|
|
{
|
|
"name": "artificialanalysis.ai",
|
|
"url": "https://artificialanalysis.ai/",
|
|
"date": "2026-05-25",
|
|
"data": "IFBench component extracted from Intelligence Index v4.0"
|
|
},
|
|
{
|
|
"name": "swebench.com",
|
|
"url": "https://www.swebench.com/",
|
|
"date": "2026-05-25",
|
|
"data": "0 of 15 models found on Verified/Lite/Full leaderboards"
|
|
},
|
|
{
|
|
"name": "aider.chat",
|
|
"url": "https://aider.chat/docs/leaderboards/",
|
|
"date": "2026-05-25",
|
|
"data": "Kimi K2=59.1%, DeepSeek V3.2=74.2%. Exact Ollama Cloud models not benchmarked."
|
|
}
|
|
],
|
|
"models": [
|
|
{
|
|
"id": "deepseek-v4-pro-max",
|
|
"name": "DeepSeek V4-Pro Max",
|
|
"organization": "DeepSeek",
|
|
"parameters": "1.6T/49B active MoE",
|
|
"context_window": 1000,
|
|
"context_window_str": "1M",
|
|
"if_score": 89,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.6 removed.",
|
|
"categories": ["coding", "agent", "reasoning"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-05-03"
|
|
},
|
|
{
|
|
"id": "deepseek-v4-flash",
|
|
"name": "DeepSeek V4-Flash",
|
|
"organization": "DeepSeek",
|
|
"parameters": "284B/13B active MoE",
|
|
"context_window": 1000,
|
|
"context_window_str": "1M",
|
|
"if_score": 86,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 79 removed.",
|
|
"categories": ["coding", "efficient", "agent"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-05-03"
|
|
},
|
|
{
|
|
"id": "kimi-k2.6",
|
|
"name": "Kimi K2.6",
|
|
"organization": "Moonshot AI",
|
|
"parameters": "1T/32B active MoE",
|
|
"context_window": 1000,
|
|
"context_window_str": "256K→1M",
|
|
"if_score": 91,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.2 removed. Aider polyglot: Kimi K2 = 59.1%.",
|
|
"categories": ["coding", "agent", "multimodal", "vision"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-04-24"
|
|
},
|
|
{
|
|
"id": "kimi-k2.5",
|
|
"name": "Kimi K2.5",
|
|
"organization": "Moonshot AI",
|
|
"parameters": "1T/32B active MoE",
|
|
"context_window": 256,
|
|
"context_window_str": "256K",
|
|
"if_score": 90,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 78 removed.",
|
|
"categories": ["coding", "agent", "multimodal", "vision"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-02-24"
|
|
},
|
|
{
|
|
"id": "qwen3-coder-480b",
|
|
"name": "Qwen3-Coder 480B",
|
|
"organization": "Qwen",
|
|
"parameters": "480B/35B active",
|
|
"context_window": 1000,
|
|
"context_window_str": "256K→1M",
|
|
"if_score": 88,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component (legacy model, superseded by Qwen3.5)",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 66.5 removed.",
|
|
"categories": ["coding", "agent"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-02-24"
|
|
},
|
|
{
|
|
"id": "qwen3.5-122b",
|
|
"name": "Qwen 3.5 122B",
|
|
"organization": "Qwen",
|
|
"parameters": "122B/10B active",
|
|
"context_window": 128,
|
|
"context_window_str": "128K",
|
|
"if_score": 92,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Brand new model (May 2026). No SWE-bench data yet.",
|
|
"categories": ["reasoning", "efficient", "vision", "tools"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-05-22"
|
|
},
|
|
{
|
|
"id": "gemma4-27b",
|
|
"name": "Gemma 4 (27B)",
|
|
"organization": "Google",
|
|
"parameters": "27B",
|
|
"context_window": 128,
|
|
"context_window_str": "128K",
|
|
"if_score": 85,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Brand new model (May 2026). No SWE-bench data yet.",
|
|
"categories": ["coding", "agent", "reasoning", "vision", "audio"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-05-22"
|
|
},
|
|
{
|
|
"id": "minimax-m2.5",
|
|
"name": "MiniMax M2.5",
|
|
"organization": "MiniMax",
|
|
"parameters": "MoE undisclosed",
|
|
"context_window": 128,
|
|
"context_window_str": "128K",
|
|
"if_score": 82,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.2 removed.",
|
|
"categories": ["coding", "agent"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-02-24"
|
|
},
|
|
{
|
|
"id": "minimax-m2.7",
|
|
"name": "MiniMax M2.7",
|
|
"organization": "MiniMax",
|
|
"parameters": "~10B active",
|
|
"context_window": 128,
|
|
"context_window_str": "128K",
|
|
"if_score": 80,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 78 removed.",
|
|
"categories": ["coding", "agent", "efficient"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-03-24"
|
|
},
|
|
{
|
|
"id": "glm-5.1",
|
|
"name": "GLM-5.1",
|
|
"organization": "Z.ai",
|
|
"parameters": "744B/40B active",
|
|
"context_window": 128,
|
|
"context_window_str": "128K",
|
|
"if_score": 90,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of SWE-Bench Pro SOTA removed. 8 agents assigned to GLM-5.1 — highest risk.",
|
|
"categories": ["reasoning", "agent"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-04-24"
|
|
},
|
|
{
|
|
"id": "glm-5",
|
|
"name": "GLM-5",
|
|
"organization": "Z.ai",
|
|
"parameters": "744B/40B active",
|
|
"context_window": 128,
|
|
"context_window_str": "128K",
|
|
"if_score": 90,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Superseded by GLM-5.1.",
|
|
"categories": ["reasoning", "agent"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-02-24"
|
|
},
|
|
{
|
|
"id": "nemotron-3-super",
|
|
"name": "Nemotron 3 Super",
|
|
"organization": "NVIDIA",
|
|
"parameters": "120B/12B active",
|
|
"context_window": 1000,
|
|
"context_window_str": "1M",
|
|
"if_score": 78,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 60.5 removed.",
|
|
"categories": ["agent", "reasoning", "efficient"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-03-24"
|
|
},
|
|
{
|
|
"id": "nemotron-3-nano",
|
|
"name": "Nemotron 3 Nano",
|
|
"organization": "NVIDIA",
|
|
"parameters": "30B/4B",
|
|
"context_window": 128,
|
|
"context_window_str": "128K",
|
|
"if_score": 68,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Lightweight model with lowest IF in fleet.",
|
|
"categories": ["agent", "efficient"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-03-24"
|
|
},
|
|
{
|
|
"id": "devstral-2",
|
|
"name": "Devstral 2",
|
|
"organization": "Mistral / Devstral",
|
|
"parameters": "123B",
|
|
"context_window": 128,
|
|
"context_window_str": "128K",
|
|
"if_score": 80,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard. Code model without verified code benchmark.",
|
|
"categories": ["coding", "agent"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-02-24"
|
|
},
|
|
{
|
|
"id": "devstral-small-2",
|
|
"name": "Devstral Small 2",
|
|
"organization": "Mistral / Devstral",
|
|
"parameters": "24B",
|
|
"context_window": 128,
|
|
"context_window_str": "128K",
|
|
"if_score": 75,
|
|
"if_score_verified": true,
|
|
"if_source": "artificialanalysis.ai IFBench component",
|
|
"swe_bench": null,
|
|
"swe_bench_verified": false,
|
|
"swe_bench_note": "Not on swebench.com leaderboard.",
|
|
"categories": ["coding", "agent"],
|
|
"provider": "ollama-cloud",
|
|
"updated": "2026-02-24"
|
|
}
|
|
],
|
|
"if_scores": {
|
|
"deepseek-v4-pro-max": 89,
|
|
"deepseek-v4-flash": 86,
|
|
"kimi-k2.6": 91,
|
|
"kimi-k2.5": 90,
|
|
"qwen3-coder-480b": 88,
|
|
"qwen3.5-122b": 92,
|
|
"gemma4-27b": 85,
|
|
"minimax-m2.5": 82,
|
|
"minimax-m2.7": 80,
|
|
"glm-5.1": 90,
|
|
"glm-5": 90,
|
|
"nemotron-3-super": 78,
|
|
"nemotron-3-nano": 68,
|
|
"devstral-2": 80,
|
|
"devstral-small-2": 75
|
|
},
|
|
"data_quality_summary": {
|
|
"if_scores_verified": 15,
|
|
"if_scores_unverified": 0,
|
|
"swe_bench_verified": 0,
|
|
"swe_bench_unverified": 15,
|
|
"recommendation": "Since all SWE-bench scores have been removed (unable to verify), the dashboard scoring formula should rely primarily on IF scores + context window bonus. Consider running SWE-bench Verified locally for glm-5.1 and kimi-k2.6 before assigning them to coding-heavy agents."
|
|
}
|
|
}
|