APAW/agent-evolution/data/model-benchmarks.json

{
  "metadata": {
    "generated": "2026-06-01T20:00:00Z",
    "source": "github-moonshot-k2 + ollama-pages + minimax-blog + qwen-blog",
    "method": "text-extraction-from-tables",
    "confidence": "high",
    "verified_sources": [
      "github.com/MoonshotAI/Kimi-K2 (K2 Instruct proxy for K2.6)",
      "ollama.com/library/deepseek-v4-pro",
      "ollama.com/library/glm-5.1",
      "ollama.com/library/minimax-m3",
      "minimax.io/models/text/m3",
      "qwenlm.github.io/blog/qwen3-coder"
    ]
  },
  "models": {
    "deepseek-v4-pro": {
      "vendor": "DeepSeek",
      "params": "1.6T total / 49B active",
      "context": "1M tokens",
      "sources": ["ollama.com/library/deepseek-v4-pro"],
      "coding": {
        "swe_bench_verified": 80.6,
        "swe_bench_pro": 55.4,
        "swe_bench_multilingual": 76.2,
        "livecodebench_v6": 93.5,
        "terminal_bench_2": 67.9,
        "codeforces": 3206
      },
      "agentic": {
        "browsecomp": 83.4,
        "tool_decathlon": 51.8,
        "mcp_atlas_public": 73.6
      },
      "reasoning": {
        "hmmt_feb_2026": 95.2,
        "gpqa_diamond": 90.1,
        "hle": 37.7,
        "imoanswerbench": 89.8,
        "mmlu_pro": 87.5
      },
      "long_context": {
        "mrcr_1m": 83.5,
        "corpusqa_1m": 62.0
      },
      "rank": 1
    },
    "glm-5.1": {
      "vendor": "Zhipu AI (Z.AI)",
      "params": "756B total / ~40B active",
      "context": "198K tokens",
      "sources": ["ollama.com/library/glm-5.1"],
      "coding": {
        "swe_bench_pro": 58.4,
        "terminal_bench_2": 63.5,
        "nl2repo": 42.7
      },
      "agentic": {
        "browsecomp": 68.0,
        "browsecomp_with_context": 79.3,
        "tau3_bench": 70.6,
        "cybergym": 68.7,
        "mcp_atlas_public": 71.8,
        "tool_decathlon": 40.7
      },
      "reasoning": {
        "aime_2026": 95.3,
        "hmmt_feb_2026": 82.6,
        "gpqa_diamond": 86.2,
        "hle": 31.0,
        "imoanswerbench": 83.8
      },
      "unique": "Sustained performance over hundreds of rounds and thousands of tool calls — unique claim",
      "rank": 2
    },
    "kimi-k2.6": {
      "vendor": "Moonshot AI",
      "params": "1.04T total / unknown active (proxy: K2 Instruct)",
      "context": "256K tokens",
      "multimodal": true,
      "proxy_note": "Using Kimi K2 Instruct data as proxy for K2.6",
      "sources": ["github.com/MoonshotAI/Kimi-K2"],
      "coding": {
        "swe_bench_verified": 65.8,
        "swe_bench_verified_multiple": 71.6,
        "swe_bench_multilingual": 47.3,
        "livecodebench_v6": 53.7,
        "terminal_bench_2": 30.0,
        "aider_polyglot": 60.0,
        "multiple_pass": 85.7
      },
      "agentic": {
        "browsecomp": 60.6,
        "tau2_retail": 70.6,
        "tau2_airline": 56.5,
        "tau2_telecom": 65.8,
        "acebench": 76.5
      },
      "reasoning": {
        "aime_2025": 49.5,
        "math_500": 97.4,
        "hmmt_2025": 38.8,
        "gpqa_diamond": 75.1,
        "mmlu": 89.5,
        "mmlu_pro": 81.1
      },
      "unique": "ONLY true multimodal (vision + text native) among all candidates",
      "rank": 3
    },
    "minimax-m3": {
      "vendor": "MiniMax",
      "params": "unknown",
      "context": "512K guaranteed, up to 1M",
      "multimodal": true,
      "sources": ["ollama.com/library/minimax-m3", "minimax.io/models/text/m3"],
      "agentic": {
        "browsecomp": 83.5,
        "paper_reproduction": "12-hour autonomous ICLR replication (18 commits, 23 figures)",
        "cuda_optimization": "147 iterations, 9.4x speedup, zero human intervention",
        "posttrainbench": "37.1 (#3 overall, behind Opus 4.7 at 42.4 and GPT-5.5 at 39.3)"
      },
      "coding": {
        "note": "Top-tier per Ollama; specific scores not in extracted text"
      },
      "long_context": {
        "msa_architecture": "Native ultra-long context pretraining"
      },
      "rank": 4
    },
    "minimax-m2.5": {
      "vendor": "MiniMax",
      "params": "unknown",
      "context": "unknown",
      "sources": ["ollama.com/library/minimax-m2.5"],
      "coding": {
        "note": "State-of-the-art for real-world productivity and coding tasks"
      },
      "agentic": {
        "tools": true,
        "thinking": true,
        "pulls": "2.2M on Ollama"
      },
      "unique": "User-confirmed best frontend developer model",
      "rank": 5
    },
    "qwen3-coder-480b": {
      "vendor": "Alibaba/Qwen",
      "params": "480B total / 35B active",
      "context": "256K native, 1M w/ YaRN",
      "sources": ["qwenlm.github.io/blog/qwen3-coder", "huggingface.co"],
      "coding": {
        "swe_bench_pro_hf": 38.7,
        "terminal_bench_2_hf": 23.9,
        "evasionbench": 78.16
      },
      "agentic": {
        "note": "Claims SOTA open-source on agentic coding; methodology differs from HF eval"
      },
      "rank": 6
    }
  },
  "role_assignments": {
    "deepseek-v4-pro": {
      "agents": ["lead-developer", "backend-developer", "php-developer", "python-developer", "code-skeptic", "the-fixer", "performance-engineer"],
      "rationale": "Coding: SWE-bench Verified 80.6%, LiveCodeBench v6 93.5%, Terminal-Bench 2 67.9%. Reasoning: GPQA Diamond 90.1%, HMMT Feb 2026 95.2%. Best raw coding + algorithmic analysis scores."
    },
    "glm-5.1": {
      "agents": ["agent-architect", "workflow-architect", "orchestrator"],
      "rationale": "Agentic: CyberGym 68.7%, Tau3 70.6%, BrowseComp 68.0% (79.3% with context). Unique claim: sustained performance over hundreds of rounds. Best for long-horizon design tasks."
    },
    "kimi-k2.6": {
      "agents": ["visual-tester"],
      "rationale": "ONLY true multimodal (vision + text native). SWE-bench Verified 65.8%, AceBench 76.5% (Kimi K2 Instruct scores used as proxy for K2.6). Multimodal screenshot analysis requires native vision."
    },
    "minimax-m3": {
      "agents": ["system-analyst", "planner", "capability-analyst", "devops-engineer", "security-auditor", "evaluator", "prompt-optimizer", "reflector", "memory-manager", "evolution-prompt"],
      "rationale": "BrowseComp 83.5 (surpasses Opus 4.7). Up to 1M context (512K guaranteed), MSA architecture. 12h autonomous paper replication, 147 CUDA iterations without human intervention. Best for agentic tasks requiring long context + persistence."
    },
    "minimax-m2.5": {
      "agents": ["frontend-developer", "browser-automation", "flutter-developer"],
      "rationale": "User-confirmed best frontend model. 2.2M Ollama pulls. 'Real-world productivity and coding tasks' per Ollama description."
    },
    "qwen3-coder-480b": {
      "agents": ["sdet-engineer", "release-manager", "product-owner", "markdown-validator", "pipeline-judge", "history-miner", "go-developer", "architect-indexer", "workflow-cross-checker", "evolution-skeptic", "requirement-refiner"],
      "rationale": "Lower benchmark scores (SWE-bench Pro 38.7%, TerminalBench 23.9%). Best fit for simple structured tasks where deterministic output is more important than frontier reasoning."
    }
  },
  "evidence_table": {
    "swe_bench_verified": [
      {"model": "deepseek-v4-pro", "score": 80.6, "source": "ollama"},
      {"model": "kimi-k2 (proxy)", "score": 65.8, "source": "github-k2"},
      {"model": "glm-5.1", "score": null, "source": "not-published"},
      {"model": "qwen3-coder-480b", "score": null, "source": "blog-claims-sota"}
    ],
    "livecodebench": [
      {"model": "deepseek-v4-pro", "score": 93.5, "source": "ollama"},
      {"model": "kimi-k2 (proxy)", "score": 53.7, "source": "github-k2"}
    ],
    "terminal_bench": [
      {"model": "deepseek-v4-pro", "score": 67.9, "source": "ollama"},
      {"model": "glm-5.1", "score": 63.5, "source": "ollama"},
      {"model": "kimi-k2 (proxy)", "score": 30.0, "source": "github-k2"}
    ],
    "browsecomp": [
      {"model": "deepseek-v4-pro", "score": 83.4, "source": "ollama"},
      {"model": "minimax-m3", "score": 83.5, "source": "ollama+minimax-blog"},
      {"model": "glm-5.1", "score": 68.0, "source": "ollama"},
      {"model": "kimi-k2 (proxy)", "score": 60.6, "source": "github-k2"}
    ],
    "gpqa_diamond": [
      {"model": "deepseek-v4-pro", "score": 90.1, "source": "ollama"},
      {"model": "glm-5.1", "score": 86.2, "source": "ollama"},
      {"model": "kimi-k2 (proxy)", "score": 75.1, "source": "github-k2"}
    ],
    "tau_tool_use": [
      {"model": "glm-5.1", "score": 70.6, "source": "ollama", "variant": "tau3"},
      {"model": "kimi-k2 (proxy)", "score": 70.6, "source": "github-k2", "variant": "tau2-retail"}
    ]
  }
}