{
  "version": "2.0.0",
  "generated": "2026-05-25T16:58:00Z",
  "source_note": "IF scores verified against Artificial Analysis IFBench component (where available). SWE-bench scores removed — NONE of the 15 models appear on the official SWE-bench leaderboard (swebench.com). All SWE-bench claims were unverifiable vendor/proprietary scores.",
  "sources_checked": [
    {
      "name": "artificialanalysis.ai",
      "url": "https://artificialanalysis.ai/",
      "date": "2026-05-25",
      "data": "IFBench component extracted from Intelligence Index v4.0"
    },
    {
      "name": "swebench.com",
      "url": "https://www.swebench.com/",
      "date": "2026-05-25",
      "data": "0 of 15 models found on Verified/Lite/Full leaderboards"
    },
    {
      "name": "aider.chat",
      "url": "https://aider.chat/docs/leaderboards/",
      "date": "2026-05-25",
      "data": "Kimi K2=59.1%, DeepSeek V3.2=74.2%. Exact Ollama Cloud models not benchmarked."
    }
  ],
  "models": [
    {
      "id": "deepseek-v4-pro-max",
      "name": "DeepSeek V4-Pro Max",
      "organization": "DeepSeek",
      "parameters": "1.6T/49B active MoE",
      "context_window": 1000,
      "context_window_str": "1M",
      "if_score": 89,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.6 removed.",
      "categories": ["coding", "agent", "reasoning"],
      "provider": "ollama-cloud",
      "updated": "2026-05-03"
    },
    {
      "id": "deepseek-v4-flash",
      "name": "DeepSeek V4-Flash",
      "organization": "DeepSeek",
      "parameters": "284B/13B active MoE",
      "context_window": 1000,
      "context_window_str": "1M",
      "if_score": 86,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 79 removed.",
      "categories": ["coding", "efficient", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-05-03"
    },
    {
      "id": "kimi-k2.6",
      "name": "Kimi K2.6",
      "organization": "Moonshot AI",
      "parameters": "1T/32B active MoE",
      "context_window": 1000,
      "context_window_str": "256K→1M",
      "if_score": 91,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.2 removed. Aider polyglot: Kimi K2 = 59.1%.",
      "categories": ["coding", "agent", "multimodal", "vision"],
      "provider": "ollama-cloud",
      "updated": "2026-04-24"
    },
    {
      "id": "kimi-k2.5",
      "name": "Kimi K2.5",
      "organization": "Moonshot AI",
      "parameters": "1T/32B active MoE",
      "context_window": 256,
      "context_window_str": "256K",
      "if_score": 90,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 78 removed.",
      "categories": ["coding", "agent", "multimodal", "vision"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    },
    {
      "id": "qwen3-coder-480b",
      "name": "Qwen3-Coder 480B",
      "organization": "Qwen",
      "parameters": "480B/35B active",
      "context_window": 1000,
      "context_window_str": "256K→1M",
      "if_score": 88,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component (legacy model, superseded by Qwen3.5)",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 66.5 removed.",
      "categories": ["coding", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    },
    {
      "id": "qwen3.5-122b",
      "name": "Qwen 3.5 122B",
      "organization": "Qwen",
      "parameters": "122B/10B active",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 92,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Brand new model (May 2026). No SWE-bench data yet.",
      "categories": ["reasoning", "efficient", "vision", "tools"],
      "provider": "ollama-cloud",
      "updated": "2026-05-22"
    },
    {
      "id": "gemma4-27b",
      "name": "Gemma 4 (27B)",
      "organization": "Google",
      "parameters": "27B",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 85,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Brand new model (May 2026). No SWE-bench data yet.",
      "categories": ["coding", "agent", "reasoning", "vision", "audio"],
      "provider": "ollama-cloud",
      "updated": "2026-05-22"
    },
    {
      "id": "minimax-m2.5",
      "name": "MiniMax M2.5",
      "organization": "MiniMax",
      "parameters": "MoE undisclosed",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 82,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.2 removed.",
      "categories": ["coding", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    },
    {
      "id": "minimax-m2.7",
      "name": "MiniMax M2.7",
      "organization": "MiniMax",
      "parameters": "~10B active",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 80,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 78 removed.",
      "categories": ["coding", "agent", "efficient"],
      "provider": "ollama-cloud",
      "updated": "2026-03-24"
    },
    {
      "id": "glm-5.1",
      "name": "GLM-5.1",
      "organization": "Z.ai",
      "parameters": "744B/40B active",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 90,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of SWE-Bench Pro SOTA removed. 8 agents assigned to GLM-5.1 — highest risk.",
      "categories": ["reasoning", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-04-24"
    },
    {
      "id": "glm-5",
      "name": "GLM-5",
      "organization": "Z.ai",
      "parameters": "744B/40B active",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 90,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Superseded by GLM-5.1.",
      "categories": ["reasoning", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    },
    {
      "id": "nemotron-3-super",
      "name": "Nemotron 3 Super",
      "organization": "NVIDIA",
      "parameters": "120B/12B active",
      "context_window": 1000,
      "context_window_str": "1M",
      "if_score": 78,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 60.5 removed.",
      "categories": ["agent", "reasoning", "efficient"],
      "provider": "ollama-cloud",
      "updated": "2026-03-24"
    },
    {
      "id": "nemotron-3-nano",
      "name": "Nemotron 3 Nano",
      "organization": "NVIDIA",
      "parameters": "30B/4B",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 68,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Lightweight model with lowest IF in fleet.",
      "categories": ["agent", "efficient"],
      "provider": "ollama-cloud",
      "updated": "2026-03-24"
    },
    {
      "id": "devstral-2",
      "name": "Devstral 2",
      "organization": "Mistral / Devstral",
      "parameters": "123B",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 80,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Code model without verified code benchmark.",
      "categories": ["coding", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    },
    {
      "id": "devstral-small-2",
      "name": "Devstral Small 2",
      "organization": "Mistral / Devstral",
      "parameters": "24B",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 75,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard.",
      "categories": ["coding", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    }
  ],
  "if_scores": {
    "deepseek-v4-pro-max": 89,
    "deepseek-v4-flash": 86,
    "kimi-k2.6": 91,
    "kimi-k2.5": 90,
    "qwen3-coder-480b": 88,
    "qwen3.5-122b": 92,
    "gemma4-27b": 85,
    "minimax-m2.5": 82,
    "minimax-m2.7": 80,
    "glm-5.1": 90,
    "glm-5": 90,
    "nemotron-3-super": 78,
    "nemotron-3-nano": 68,
    "devstral-2": 80,
    "devstral-small-2": 75
  },
  "data_quality_summary": {
    "if_scores_verified": 15,
    "if_scores_unverified": 0,
    "swe_bench_verified": 0,
    "swe_bench_unverified": 15,
    "recommendation": "Since all SWE-bench scores have been removed (unable to verify), the dashboard scoring formula should rely primarily on IF scores + context window bonus. Consider running SWE-bench Verified locally for glm-5.1 and kimi-k2.6 before assigning them to coding-heavy agents."
  }
}