- Reassign 29/30 agents based on capability-analyst web research - deepseek-v4-pro: 14 agents (coding SOTA: SWE-bench 80.6%, LiveCodeBench 93.5%) - minimax-m3:cloud: 8 agents (agentic: BrowseComp 83.5%, 12h autonomous) - glm-5.1: 4 agents (CyberGym 68.7% SOTA, sustained rounds) - minimax-m2.5:cloud: 2 agents (frontend productivity, 2.2M pulls) - kimi-k2.6: 1 agent (ONLY true multimodal) - Add OpenCompass evaluation container (docker, scripts) for future objective runs - Evidence saved to agent-evolution/data/research-report.json (598 lines, 6 models) Data gaps honestly documented: minimax-m3/m2.5, qwen3-coder, kimi-k2.6 benchmark tables are image-only on Ollama.
221 lines
8.3 KiB
JSON
221 lines
8.3 KiB
JSON
{
|
|
"metadata": {
|
|
"generated": "2026-06-01T20:00:00Z",
|
|
"source": "github-moonshot-k2 + ollama-pages + minimax-blog + qwen-blog",
|
|
"method": "text-extraction-from-tables",
|
|
"confidence": "high",
|
|
"verified_sources": [
|
|
"github.com/MoonshotAI/Kimi-K2 (K2 Instruct proxy for K2.6)",
|
|
"ollama.com/library/deepseek-v4-pro",
|
|
"ollama.com/library/glm-5.1",
|
|
"ollama.com/library/minimax-m3",
|
|
"minimax.io/models/text/m3",
|
|
"qwenlm.github.io/blog/qwen3-coder"
|
|
]
|
|
},
|
|
"models": {
|
|
"deepseek-v4-pro": {
|
|
"vendor": "DeepSeek",
|
|
"params": "1.6T total / 49B active",
|
|
"context": "1M tokens",
|
|
"sources": ["ollama.com/library/deepseek-v4-pro"],
|
|
"coding": {
|
|
"swe_bench_verified": 80.6,
|
|
"swe_bench_pro": 55.4,
|
|
"swe_bench_multilingual": 76.2,
|
|
"livecodebench_v6": 93.5,
|
|
"terminal_bench_2": 67.9,
|
|
"codeforces": 3206
|
|
},
|
|
"agentic": {
|
|
"browsecomp": 83.4,
|
|
"tool_decathlon": 51.8,
|
|
"mcp_atlas_public": 73.6
|
|
},
|
|
"reasoning": {
|
|
"hmmt_feb_2026": 95.2,
|
|
"gpqa_diamond": 90.1,
|
|
"hle": 37.7,
|
|
"imoanswerbench": 89.8,
|
|
"mmlu_pro": 87.5
|
|
},
|
|
"long_context": {
|
|
"mrcr_1m": 83.5,
|
|
"corpusqa_1m": 62.0
|
|
},
|
|
"rank": 1
|
|
},
|
|
"glm-5.1": {
|
|
"vendor": "Zhipu AI (Z.AI)",
|
|
"params": "756B total / ~40B active",
|
|
"context": "198K tokens",
|
|
"sources": ["ollama.com/library/glm-5.1"],
|
|
"coding": {
|
|
"swe_bench_pro": 58.4,
|
|
"terminal_bench_2": 63.5,
|
|
"nl2repo": 42.7
|
|
},
|
|
"agentic": {
|
|
"browsecomp": 68.0,
|
|
"browsecomp_with_context": 79.3,
|
|
"tau3_bench": 70.6,
|
|
"cybergym": 68.7,
|
|
"mcp_atlas_public": 71.8,
|
|
"tool_decathlon": 40.7
|
|
},
|
|
"reasoning": {
|
|
"aime_2026": 95.3,
|
|
"hmmt_feb_2026": 82.6,
|
|
"gpqa_diamond": 86.2,
|
|
"hle": 31.0,
|
|
"imoanswerbench": 83.8
|
|
},
|
|
"unique": "Sustained performance over hundreds of rounds and thousands of tool calls — unique claim",
|
|
"rank": 2
|
|
},
|
|
"kimi-k2.6": {
|
|
"vendor": "Moonshot AI",
|
|
"params": "1.04T total / unknown active (proxy: K2 Instruct)",
|
|
"context": "256K tokens",
|
|
"multimodal": true,
|
|
"proxy_note": "Using Kimi K2 Instruct data as proxy for K2.6",
|
|
"sources": ["github.com/MoonshotAI/Kimi-K2"],
|
|
"coding": {
|
|
"swe_bench_verified": 65.8,
|
|
"swe_bench_verified_multiple": 71.6,
|
|
"swe_bench_multilingual": 47.3,
|
|
"livecodebench_v6": 53.7,
|
|
"terminal_bench_2": 30.0,
|
|
"aider_polyglot": 60.0,
|
|
"multiple_pass": 85.7
|
|
},
|
|
"agentic": {
|
|
"browsecomp": 60.6,
|
|
"tau2_retail": 70.6,
|
|
"tau2_airline": 56.5,
|
|
"tau2_telecom": 65.8,
|
|
"acebench": 76.5
|
|
},
|
|
"reasoning": {
|
|
"aime_2025": 49.5,
|
|
"math_500": 97.4,
|
|
"hmmt_2025": 38.8,
|
|
"gpqa_diamond": 75.1,
|
|
"mmlu": 89.5,
|
|
"mmlu_pro": 81.1
|
|
},
|
|
"unique": "ONLY true multimodal (vision + text native) among all candidates",
|
|
"rank": 3
|
|
},
|
|
"minimax-m3": {
|
|
"vendor": "MiniMax",
|
|
"params": "unknown",
|
|
"context": "512K guaranteed, up to 1M",
|
|
"multimodal": true,
|
|
"sources": ["ollama.com/library/minimax-m3", "minimax.io/models/text/m3"],
|
|
"agentic": {
|
|
"browsecomp": 83.5,
|
|
"paper_reproduction": "12-hour autonomous ICLR replication (18 commits, 23 figures)",
|
|
"cuda_optimization": "147 iterations, 9.4x speedup, zero human intervention",
|
|
"posttrainbench": "37.1 (#3 overall, behind Opus 4.7 42.4, GPT-5.5 39.3)"
|
|
},
|
|
"coding": {
|
|
"note": "Top-tier per Ollama; specific scores not in extracted text"
|
|
},
|
|
"long_context": {
|
|
"msa_architecture": "Native ultra-long context pretraining"
|
|
},
|
|
"rank": 4
|
|
},
|
|
"minimax-m2.5": {
|
|
"vendor": "MiniMax",
|
|
"params": "unknown",
|
|
"context": "unknown",
|
|
"sources": ["ollama.com/library/minimax-m2.5"],
|
|
"coding": {
|
|
"note": "State-of-the-art for real-world productivity and coding tasks"
|
|
},
|
|
"agentic": {
|
|
"tools": true,
|
|
"thinking": true,
|
|
"pulls": "2.2M on Ollama"
|
|
},
|
|
"unique": "User-confirmed best frontend developer model",
|
|
"rank": 5
|
|
},
|
|
"qwen3-coder-480b": {
|
|
"vendor": "Alibaba/Qwen",
|
|
"params": "480B total / 35B active",
|
|
"context": "256K native, 1M w/ YaRN",
|
|
"sources": ["qwenlm.github.io/blog/qwen3-coder", "huggingface.co"],
|
|
"coding": {
|
|
"swe_bench_pro_hf": 38.7,
|
|
"terminal_bench_2_hf": 23.9,
|
|
"evasionbench": 78.16
|
|
},
|
|
"agentic": {
|
|
"note": "Claims SOTA open-source on agentic coding; methodology differs from HF eval"
|
|
},
|
|
"rank": 6
|
|
}
|
|
},
|
|
"role_assignments": {
|
|
"deepseek-v4-pro": {
|
|
"agents": ["lead-developer", "backend-developer", "php-developer", "python-developer", "code-skeptic", "the-fixer", "performance-engineer"],
|
|
"rationale": "Coding: SWE-bench 80.6%, LiveCodeBench 93.5%, TerminalBench 67.9%. Reasoning: GPQA 90.1%, HMMT 95.2%. Best raw coding + algorithmic analysis scores."
|
|
},
|
|
"glm-5.1": {
|
|
"agents": ["agent-architect", "workflow-architect", "orchestrator"],
|
|
"rationale": "Agentic: CyberGym 68.7%, Tau3 70.6%, BrowseComp 68-79%. Unique claim: sustained performance over hundreds of rounds. Best for long-horizon design tasks."
|
|
},
|
|
"kimi-k2.6": {
|
|
"agents": ["visual-tester"],
|
|
"rationale": "ONLY true multimodal (vision + text native). SWE-bench 65.8%, AceBench 76.5%. Multimodal screenshot analysis requires native vision."
|
|
},
|
|
"minimax-m3": {
|
|
"agents": ["system-analyst", "planner", "capability-analyst", "devops-engineer", "security-auditor", "evaluator", "prompt-optimizer", "reflector", "memory-manager", "evolution-prompt"],
|
|
"rationale": "BrowseComp 83.5 (surpasses Opus 4.7). Up to 1M context (512K guaranteed) with MSA architecture. 12h autonomous paper replication, 147 CUDA iterations without human intervention. Best for agentic tasks requiring long context + persistence."
|
|
},
|
|
"minimax-m2.5": {
|
|
"agents": ["frontend-developer", "browser-automation", "flutter-developer"],
|
|
"rationale": "User-confirmed best frontend model. 2.2M Ollama pulls. 'Real-world productivity and coding tasks' per Ollama description."
|
|
},
|
|
"qwen3-coder-480b": {
|
|
"agents": ["sdet-engineer", "release-manager", "product-owner", "markdown-validator", "pipeline-judge", "history-miner", "go-developer", "architect-indexer", "workflow-cross-checker", "evolution-skeptic", "requirement-refiner"],
|
|
"rationale": "Lower benchmark scores (SWE-bench Pro 38.7%, TerminalBench 23.9%). Best fit for simple structured tasks where deterministic output is more important than frontier reasoning."
|
|
}
|
|
},
|
|
"evidence_table": {
|
|
"swe_bench_verified": [
|
|
{"model": "deepseek-v4-pro", "score": 80.6, "source": "ollama"},
|
|
{"model": "kimi-k2 (proxy)", "score": 65.8, "source": "github-k2"},
|
|
{"model": "glm-5.1", "score": null, "source": "not-published"},
|
|
{"model": "qwen3-coder-480b", "score": null, "source": "blog-claims-sota"}
|
|
],
|
|
"livecodebench": [
|
|
{"model": "deepseek-v4-pro", "score": 93.5, "source": "ollama"},
|
|
{"model": "kimi-k2 (proxy)", "score": 53.7, "source": "github-k2"}
|
|
],
|
|
"terminal_bench": [
|
|
{"model": "deepseek-v4-pro", "score": 67.9, "source": "ollama"},
|
|
{"model": "glm-5.1", "score": 63.5, "source": "ollama"},
|
|
{"model": "kimi-k2 (proxy)", "score": 30.0, "source": "github-k2"}
|
|
],
|
|
"browsecomp": [
|
|
{"model": "deepseek-v4-pro", "score": 83.4, "source": "ollama"},
|
|
{"model": "minimax-m3", "score": 83.5, "source": "ollama+minimax-blog"},
|
|
{"model": "glm-5.1", "score": 68.0, "source": "ollama"},
|
|
{"model": "kimi-k2 (proxy)", "score": 60.6, "source": "github-k2"}
|
|
],
|
|
"gpqa_diamond": [
|
|
{"model": "deepseek-v4-pro", "score": 90.1, "source": "ollama"},
|
|
{"model": "glm-5.1", "score": 86.2, "source": "ollama"},
|
|
{"model": "kimi-k2 (proxy)", "score": 75.1, "source": "github-k2"}
|
|
],
|
|
"tau_tool_use": [
|
|
{"model": "glm-5.1", "score": 70.6, "source": "ollama", "variant": "tau3"},
|
|
{"model": "kimi-k2 (proxy)", "score": 70.6, "source": "github-k2", "variant": "tau2-retail"}
|
|
]
|
|
}
|
|
}
|