APAW/agent-evolution/data/model-benchmarks-verified.json

{
  "version": "2.0.0",
  "generated": "2026-05-25T16:58:00Z",
  "source_note": "Instruction-following (IF) scores verified against the Artificial Analysis IFBench component (where available). SWE-bench scores removed — none of the 15 models appears on the official SWE-bench leaderboard (swebench.com), so all previous SWE-bench claims were unverifiable vendor/proprietary scores.",
  "sources_checked": [
    {
      "name": "artificialanalysis.ai",
      "url": "https://artificialanalysis.ai/",
      "date": "2026-05-25",
      "data": "IFBench component extracted from Intelligence Index v4.0"
    },
    {
      "name": "swebench.com",
      "url": "https://www.swebench.com/",
      "date": "2026-05-25",
      "data": "0 of 15 models found on Verified/Lite/Full leaderboards"
    },
    {
      "name": "aider.chat",
      "url": "https://aider.chat/docs/leaderboards/",
      "date": "2026-05-25",
      "data": "Aider polyglot leaderboard: Kimi K2 = 59.1%, DeepSeek V3.2 = 74.2%. The exact Ollama Cloud models listed in this file are not benchmarked there."
    }
  ],
  "models": [
    {
      "id": "deepseek-v4-pro-max",
      "name": "DeepSeek V4-Pro Max",
      "organization": "DeepSeek",
      "parameters": "1.6T/49B active MoE",
      "context_window": 1000,
      "context_window_str": "1M",
      "if_score": 89,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.6 removed.",
      "categories": ["coding", "agent", "reasoning"],
      "provider": "ollama-cloud",
      "updated": "2026-05-03"
    },
    {
      "id": "deepseek-v4-flash",
      "name": "DeepSeek V4-Flash",
      "organization": "DeepSeek",
      "parameters": "284B/13B active MoE",
      "context_window": 1000,
      "context_window_str": "1M",
      "if_score": 86,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 79 removed.",
      "categories": ["coding", "efficient", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-05-03"
    },
    {
      "id": "kimi-k2.6",
      "name": "Kimi K2.6",
      "organization": "Moonshot AI",
      "parameters": "1T/32B active MoE",
      "context_window": 1000,
      "context_window_str": "256K→1M",
      "if_score": 91,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.2 removed. Aider polyglot lists only Kimi K2 (59.1%), not Kimi K2.6.",
      "categories": ["coding", "agent", "multimodal", "vision"],
      "provider": "ollama-cloud",
      "updated": "2026-04-24"
    },
    {
      "id": "kimi-k2.5",
      "name": "Kimi K2.5",
      "organization": "Moonshot AI",
      "parameters": "1T/32B active MoE",
      "context_window": 256,
      "context_window_str": "256K",
      "if_score": 90,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 78 removed.",
      "categories": ["coding", "agent", "multimodal", "vision"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    },
    {
      "id": "qwen3-coder-480b",
      "name": "Qwen3-Coder 480B",
      "organization": "Qwen",
      "parameters": "480B/35B active",
      "context_window": 1000,
      "context_window_str": "256K→1M",
      "if_score": 88,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component (legacy model, superseded by Qwen3.5)",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 66.5 removed.",
      "categories": ["coding", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    },
    {
      "id": "qwen3.5-122b",
      "name": "Qwen 3.5 122B",
      "organization": "Qwen",
      "parameters": "122B/10B active",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 92,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Brand new model (May 2026). No SWE-bench data yet.",
      "categories": ["reasoning", "efficient", "vision", "tools"],
      "provider": "ollama-cloud",
      "updated": "2026-05-22"
    },
    {
      "id": "gemma4-27b",
      "name": "Gemma 4 (27B)",
      "organization": "Google",
      "parameters": "27B",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 85,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Brand new model (May 2026). No SWE-bench data yet.",
      "categories": ["coding", "agent", "reasoning", "vision", "audio"],
      "provider": "ollama-cloud",
      "updated": "2026-05-22"
    },
    {
      "id": "minimax-m2.5",
      "name": "MiniMax M2.5",
      "organization": "MiniMax",
      "parameters": "MoE undisclosed",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 82,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 80.2 removed.",
      "categories": ["coding", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    },
    {
      "id": "minimax-m2.7",
      "name": "MiniMax M2.7",
      "organization": "MiniMax",
      "parameters": "~10B active",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 80,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 78 removed.",
      "categories": ["coding", "agent", "efficient"],
      "provider": "ollama-cloud",
      "updated": "2026-03-24"
    },
    {
      "id": "glm-5.1",
      "name": "GLM-5.1",
      "organization": "Z.ai",
      "parameters": "744B/40B active",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 90,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of SWE-Bench Pro SOTA removed. 8 agents are assigned to GLM-5.1, making it the highest-risk model.",
      "categories": ["reasoning", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-04-24"
    },
    {
      "id": "glm-5",
      "name": "GLM-5",
      "organization": "Z.ai",
      "parameters": "744B/40B active",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 90,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Superseded by GLM-5.1.",
      "categories": ["reasoning", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    },
    {
      "id": "nemotron-3-super",
      "name": "Nemotron 3 Super",
      "organization": "NVIDIA",
      "parameters": "120B/12B active",
      "context_window": 1000,
      "context_window_str": "1M",
      "if_score": 78,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Previous claim of 60.5 removed.",
      "categories": ["agent", "reasoning", "efficient"],
      "provider": "ollama-cloud",
      "updated": "2026-03-24"
    },
    {
      "id": "nemotron-3-nano",
      "name": "Nemotron 3 Nano",
      "organization": "NVIDIA",
      "parameters": "30B/4B",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 68,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Lightweight model with the lowest IF score (68) of the 15 models listed.",
      "categories": ["agent", "efficient"],
      "provider": "ollama-cloud",
      "updated": "2026-03-24"
    },
    {
      "id": "devstral-2",
      "name": "Devstral 2",
      "organization": "Mistral / Devstral",
      "parameters": "123B",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 80,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard. Coding-focused model without a verified coding benchmark.",
      "categories": ["coding", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    },
    {
      "id": "devstral-small-2",
      "name": "Devstral Small 2",
      "organization": "Mistral / Devstral",
      "parameters": "24B",
      "context_window": 128,
      "context_window_str": "128K",
      "if_score": 75,
      "if_score_verified": true,
      "if_source": "artificialanalysis.ai IFBench component",
      "swe_bench": null,
      "swe_bench_verified": false,
      "swe_bench_note": "Not on swebench.com leaderboard.",
      "categories": ["coding", "agent"],
      "provider": "ollama-cloud",
      "updated": "2026-02-24"
    }
  ],
  "if_scores": {
    "deepseek-v4-pro-max": 89,
    "deepseek-v4-flash": 86,
    "kimi-k2.6": 91,
    "kimi-k2.5": 90,
    "qwen3-coder-480b": 88,
    "qwen3.5-122b": 92,
    "gemma4-27b": 85,
    "minimax-m2.5": 82,
    "minimax-m2.7": 80,
    "glm-5.1": 90,
    "glm-5": 90,
    "nemotron-3-super": 78,
    "nemotron-3-nano": 68,
    "devstral-2": 80,
    "devstral-small-2": 75
  },
  "data_quality_summary": {
    "if_scores_verified": 15,
    "if_scores_unverified": 0,
    "swe_bench_verified": 0,
    "swe_bench_unverified": 15,
    "recommendation": "Because all SWE-bench scores were removed as unverifiable, the dashboard scoring formula should rely primarily on IF scores plus the context-window bonus. Consider running SWE-bench Verified locally for glm-5.1 and kimi-k2.6 before assigning them to coding-heavy agents."
  }
}