Files
APAW/agent-evolution/data/model-benchmarks.json
Deploy Bot 047a87afb4 feat(agent-models): apply MEDIUM+LOW priority model migrations
- markdown-validator: deepseek-v4-pro-max → nemotron-3-nano (90% cost cut)
- release-manager: glm-5.1 → kimi-k2.6 (+2 matrix, 1M context for diffs)
- capability-analyst: glm-5.1 → deepseek-v4-pro-max (+4 matrix, 1M ctx)
- browser-automation: qwen3-coder → deepseek-v4-flash (3× faster inference)
- history-miner: nemotron-3-super → qwen3.5-122b (+14 IF, 12.4M pulls)
2026-05-25 15:07:17 +01:00

852 lines
26 KiB
JSON

{
"version": "1.0.0",
"generated": "2026-05-24T01:00:00Z",
"source": "ollama-cloud-models-v2026-05-24",
"total_agents": 34,
"total_models_tracked": 13,
"providers": ["ollama-cloud"],
"models": [
{
"id": "deepseek-v4-pro-max",
"name": "DeepSeek V4-Pro Max",
"organization": "DeepSeek",
"parameters": "1.6T/49B active MoE",
"context_window": "1M",
"swe_bench": 80.6,
"if_score": 89,
"categories": ["coding", "agent", "reasoning"],
"provider": "ollama-cloud",
"updated": "2026-05-03",
"pulls": "71.6K"
},
{
"id": "deepseek-v4-flash",
"name": "DeepSeek V4-Flash",
"organization": "DeepSeek",
"parameters": "284B/13B active MoE",
"context_window": "1M",
"swe_bench": 79,
"if_score": 86,
"categories": ["coding", "efficient", "agent"],
"provider": "ollama-cloud",
"updated": "2026-05-03",
"pulls": "84.4K"
},
{
"id": "kimi-k2.6",
"name": "Kimi K2.6",
"organization": "Moonshot AI",
"parameters": "1T/32B active MoE",
"context_window": "256K→1M",
"swe_bench": 80.2,
"if_score": 91,
"categories": ["coding", "agent", "multimodal", "vision"],
"provider": "ollama-cloud",
"updated": "2026-04-24",
"pulls": "259.7K"
},
{
"id": "kimi-k2.5",
"name": "Kimi K2.5",
"organization": "Moonshot AI",
"parameters": "1T/32B active MoE",
"context_window": "256K",
"swe_bench": 78,
"if_score": 90,
"categories": ["coding", "agent", "multimodal", "vision"],
"provider": "ollama-cloud",
"updated": "2026-02-24",
"pulls": "293.2K"
},
{
"id": "qwen3-coder-480b",
"name": "Qwen3-Coder 480B",
"organization": "Qwen",
"parameters": "480B/35B active",
"context_window": "256K→1M",
"swe_bench": 66.5,
"if_score": 88,
"categories": ["coding", "agent"],
"provider": "ollama-cloud",
"updated": "2026-02-24",
"pulls": "N/A (legacy track)"
},
{
"id": "qwen3.5-122b",
"name": "Qwen 3.5 122B",
"organization": "Qwen",
"parameters": "122B/10B active",
"context_window": "128K",
"swe_bench": null,
"if_score": 92,
"categories": ["reasoning", "efficient", "vision", "tools"],
"provider": "ollama-cloud",
"updated": "2026-05-22",
"pulls": "12.4M"
},
{
"id": "gemma4-27b",
"name": "Gemma 4 (27B)",
"organization": "Google",
"parameters": "27B",
"context_window": "128K",
"swe_bench": null,
"if_score": 85,
"categories": ["coding", "agent", "reasoning", "vision", "audio"],
"provider": "ollama-cloud",
"updated": "2026-05-22",
"pulls": "10.1M",
"note": "Updated 2 days ago. Frontier-level performance at each size."
},
{
"id": "minimax-m2.5",
"name": "MiniMax M2.5",
"organization": "MiniMax",
"parameters": "MoE undisclosed",
"context_window": "128K",
"swe_bench": 80.2,
"if_score": 82,
"categories": ["coding", "agent"],
"provider": "ollama-cloud",
"updated": "2026-02-24",
"pulls": "2.2M"
},
{
"id": "minimax-m2.7",
"name": "MiniMax M2.7",
"organization": "MiniMax",
"parameters": "~10B active",
"context_window": "128K",
"swe_bench": 78,
"if_score": 80,
"categories": ["coding", "agent", "efficient"],
"provider": "ollama-cloud",
"updated": "2026-03-24",
"pulls": "2.2M"
},
{
"id": "glm-5.1",
"name": "GLM-5.1",
"organization": "Z.ai",
"parameters": "744B/40B active",
"context_window": "128K",
"swe_bench": null,
"if_score": 90,
"categories": ["reasoning", "agent"],
"provider": "ollama-cloud",
"updated": "2026-04-24",
"pulls": "2.2M",
"note": "Next-gen flagship. SWE-Bench Pro SOTA."
},
{
"id": "glm-5",
"name": "GLM-5",
"organization": "Z.ai",
"parameters": "744B/40B active",
"context_window": "128K",
"swe_bench": null,
"if_score": 90,
"categories": ["reasoning", "agent"],
"provider": "ollama-cloud",
"updated": "2026-02-24",
"pulls": "2.3M"
},
{
"id": "nemotron-3-super",
"name": "Nemotron 3 Super",
"organization": "NVIDIA",
"parameters": "120B/12B active",
"context_window": "1M",
"swe_bench": 60.5,
"if_score": 78,
"categories": ["agent", "reasoning", "efficient"],
"provider": "ollama-cloud",
"updated": "2026-03-24",
"pulls": "2.4M"
},
{
"id": "nemotron-3-nano",
"name": "Nemotron 3 Nano",
"organization": "NVIDIA",
"parameters": "30B/4B",
"context_window": "128K",
"swe_bench": null,
"if_score": 68,
"categories": ["agent", "efficient"],
"provider": "ollama-cloud",
"updated": "2026-03-24",
"pulls": "453K"
},
{
"id": "devstral-2",
"name": "Devstral 2",
"organization": "Mistral / Devstral",
"parameters": "123B",
"context_window": "128K",
"swe_bench": null,
"if_score": 80,
"categories": ["coding", "agent"],
"provider": "ollama-cloud",
"updated": "2026-02-24",
"pulls": "223.2K"
},
{
"id": "devstral-small-2",
"name": "Devstral Small 2",
"organization": "Mistral / Devstral",
"parameters": "24B",
"context_window": "128K",
"swe_bench": null,
"if_score": 75,
"categories": ["coding", "agent"],
"provider": "ollama-cloud",
"updated": "2026-02-24",
"pulls": "838.8K"
}
],
"if_scores": {
"deepseek-v4-pro-max": 89,
"deepseek-v4-flash": 86,
"kimi-k2.6": 91,
"kimi-k2.5": 90,
"qwen3-coder-480b": 88,
"qwen3.5-122b": 92,
"gemma4-27b": 85,
"minimax-m2.5": 82,
"minimax-m2.7": 80,
"glm-5.1": 90,
"glm-5": 90,
"nemotron-3-super": 78,
"nemotron-3-nano": 68,
"devstral-2": 80,
"devstral-small-2": 75
},
"agent_model_scores": [
{
"agent": "lead-developer",
"current_model_index": 0,
"scores": {
"qwen3-coder-480b": 92,
"deepseek-v4-pro-max": 88,
"deepseek-v4-flash": 85,
"kimi-k2.6": 90,
"kimi-k2.5": 88,
"qwen3.5-122b": 86,
"gemma4-27b": 83,
"minimax-m2.5": 86,
"minimax-m2.7": 82,
"glm-5.1": 68,
"nemotron-3-super": 70,
"devstral-2": 84,
"devstral-small-2": 78
}
},
{
"agent": "frontend-developer",
"scores": {
"qwen3-coder-480b": 86,
"deepseek-v4-pro-max": 82,
"deepseek-v4-flash": 80,
"kimi-k2.6": 86,
"kimi-k2.5": 84,
"qwen3.5-122b": 84,
"gemma4-27b": 85,
"minimax-m2.5": 92,
"minimax-m2.7": 88,
"glm-5.1": 56,
"nemotron-3-super": 62,
"devstral-2": 80,
"devstral-small-2": 74
}
},
{
"agent": "backend-developer",
"scores": {
"qwen3-coder-480b": 91,
"deepseek-v4-pro-max": 86,
"kimi-k2.6": 90,
"qwen3.5-122b": 85,
"gemma4-27b": 84,
"minimax-m2.5": 84,
"minimax-m2.7": 80,
"glm-5.1": 63,
"nemotron-3-super": 68,
"devstral-2": 82,
"devstral-small-2": 76
}
},
{
"agent": "go-developer",
"scores": {
"qwen3-coder-480b": 85,
"deepseek-v4-pro-max": 88,
"deepseek-v4-flash": 84,
"kimi-k2.6": 86,
"qwen3.5-122b": 80,
"gemma4-27b": 80,
"minimax-m2.5": 78,
"minimax-m2.7": 74,
"glm-5.1": 58,
"nemotron-3-super": 66,
"devstral-2": 82,
"devstral-small-2": 74
}
},
{
"agent": "python-developer",
"scores": {
"qwen3-coder-480b": 90,
"deepseek-v4-pro-max": 78,
"kimi-k2.6": 88,
"qwen3.5-122b": 86,
"gemma4-27b": 82,
"minimax-m2.5": 82,
"minimax-m2.7": 78,
"glm-5.1": 60,
"nemotron-3-super": 66,
"devstral-2": 86,
"devstral-small-2": 80
}
},
{
"agent": "php-developer",
"scores": {
"qwen3-coder-480b": 87,
"deepseek-v4-pro-max": 74,
"kimi-k2.6": 86,
"qwen3.5-122b": 84,
"gemma4-27b": 82,
"minimax-m2.5": 76,
"minimax-m2.7": 72,
"glm-5.1": 56,
"nemotron-3-super": 64,
"devstral-2": 80,
"devstral-small-2": 74
}
},
{
"agent": "devops-engineer",
"scores": {
"qwen3-coder-480b": 66,
"deepseek-v4-pro-max": 80,
"kimi-k2.6": 88,
"qwen3.5-122b": 75,
"gemma4-27b": 78,
"minimax-m2.5": 53,
"minimax-m2.7": 48,
"glm-5.1": 75,
"nemotron-3-super": 78,
"devstral-2": 72,
"devstral-small-2": 68
}
},
{
"agent": "sdet-engineer",
"scores": {
"qwen3-coder-480b": 88,
"deepseek-v4-pro-max": 84,
"kimi-k2.6": 87,
"qwen3.5-122b": 86,
"gemma4-27b": 82,
"minimax-m2.5": 84,
"minimax-m2.7": 80,
"glm-5.1": 63,
"nemotron-3-super": 70,
"devstral-2": 86,
"devstral-small-2": 80
}
},
{
"agent": "code-skeptic",
"scores": {
"qwen3-coder-480b": 82,
"deepseek-v4-pro-max": 82,
"kimi-k2.6": 82,
"qwen3.5-122b": 80,
"gemma4-27b": 80,
"minimax-m2.5": 85,
"minimax-m2.7": 80,
"glm-5.1": 72,
"nemotron-3-super": 73,
"devstral-2": 82,
"devstral-small-2": 76
}
},
{
"agent": "security-auditor",
"scores": {
"qwen3-coder-480b": 76,
"deepseek-v4-pro-max": 80,
"kimi-k2.6": 80,
"qwen3.5-122b": 78,
"gemma4-27b": 78,
"minimax-m2.5": 74,
"minimax-m2.7": 68,
"glm-5.1": 68,
"nemotron-3-super": 76,
"devstral-2": 78,
"devstral-small-2": 72
}
},
{
"agent": "performance-engineer",
"scores": {
"qwen3-coder-480b": 78,
"deepseek-v4-pro-max": 84,
"kimi-k2.6": 82,
"qwen3.5-122b": 76,
"gemma4-27b": 76,
"minimax-m2.5": 75,
"minimax-m2.7": 70,
"glm-5.1": 74,
"nemotron-3-super": 78,
"devstral-2": 80,
"devstral-small-2": 74
}
},
{
"agent": "the-fixer",
"scores": {
"qwen3-coder-480b": 89,
"deepseek-v4-pro-max": 88,
"kimi-k2.6": 90,
"qwen3.5-122b": 86,
"gemma4-27b": 82,
"minimax-m2.5": 88,
"minimax-m2.7": 84,
"glm-5.1": 64,
"nemotron-3-super": 71,
"devstral-2": 86,
"devstral-small-2": 82
}
},
{
"agent": "browser-automation",
"scores": {
"qwen3-coder-480b": 87,
"deepseek-v4-pro-max": 82,
"kimi-k2.6": 86,
"qwen3.5-122b": 82,
"gemma4-27b": 84,
"minimax-m2.5": 72,
"minimax-m2.7": 68,
"glm-5.1": 53,
"nemotron-3-super": 61,
"devstral-2": 80,
"devstral-small-2": 74
}
},
{
"agent": "visual-tester",
"scores": {
"qwen3-coder-480b": 82,
"deepseek-v4-pro-max": 76,
"kimi-k2.6": 78,
"qwen3.5-122b": 76,
"gemma4-27b": 78,
"minimax-m2.5": 68,
"minimax-m2.7": 64,
"glm-5.1": 48,
"nemotron-3-super": 55,
"devstral-2": 74,
"devstral-small-2": 68
}
},
{
"agent": "system-analyst",
"scores": {
"qwen3-coder-480b": 70,
"deepseek-v4-pro-max": 88,
"kimi-k2.6": 86,
"qwen3.5-122b": 82,
"gemma4-27b": 82,
"minimax-m2.5": 66,
"minimax-m2.7": 63,
"glm-5.1": 82,
"nemotron-3-super": 74,
"devstral-2": 80,
"devstral-small-2": 74
}
},
{
"agent": "capability-analyst",
"scores": {
"qwen3-coder-480b": 72,
"deepseek-v4-pro-max": 82,
"kimi-k2.6": 82,
"qwen3.5-122b": 80,
"gemma4-27b": 80,
"minimax-m2.5": 68,
"minimax-m2.7": 66,
"glm-5.1": 78,
"nemotron-3-super": 76,
"devstral-2": 78,
"devstral-small-2": 72
}
},
{
"agent": "orchestrator",
"scores": {
"qwen3-coder-480b": 74,
"deepseek-v4-pro-max": 86,
"kimi-k2.6": 92,
"qwen3.5-122b": 84,
"gemma4-27b": 82,
"minimax-m2.5": 70,
"minimax-m2.7": 68,
"glm-5.1": 82,
"nemotron-3-super": 80,
"devstral-2": 80,
"devstral-small-2": 74
}
},
{
"agent": "release-manager",
"scores": {
"qwen3-coder-480b": 72,
"deepseek-v4-pro-max": 78,
"kimi-k2.6": 78,
"qwen3.5-122b": 76,
"gemma4-27b": 76,
"minimax-m2.5": 66,
"minimax-m2.7": 64,
"glm-5.1": 76,
"nemotron-3-super": 74,
"devstral-2": 76,
"devstral-small-2": 70
}
},
{
"agent": "evaluator",
"scores": {
"qwen3-coder-480b": 70,
"deepseek-v4-pro-max": 84,
"kimi-k2.6": 84,
"qwen3.5-122b": 82,
"gemma4-27b": 80,
"minimax-m2.5": 73,
"minimax-m2.7": 70,
"glm-5.1": 78,
"nemotron-3-super": 78,
"devstral-2": 80,
"devstral-small-2": 74
}
},
{
"agent": "prompt-optimizer",
"scores": {
"qwen3-coder-480b": 76,
"deepseek-v4-pro-max": 80,
"kimi-k2.6": 82,
"qwen3.5-122b": 82,
"gemma4-27b": 80,
"minimax-m2.5": 74,
"minimax-m2.7": 72,
"glm-5.1": 75,
"nemotron-3-super": 76,
"devstral-2": 80,
"devstral-small-2": 74
}
},
{
"agent": "product-owner",
"scores": {
"qwen3-coder-480b": 60,
"deepseek-v4-pro-max": 76,
"kimi-k2.6": 76,
"qwen3.5-122b": 76,
"gemma4-27b": 76,
"minimax-m2.5": 56,
"minimax-m2.7": 54,
"glm-5.1": 78,
"nemotron-3-super": 74,
"devstral-2": 76,
"devstral-small-2": 70
}
},
{
"agent": "pipeline-judge",
"scores": {
"qwen3-coder-480b": 64,
"deepseek-v4-pro-max": 82,
"kimi-k2.6": 84,
"qwen3.5-122b": 82,
"gemma4-27b": 80,
"minimax-m2.5": 68,
"minimax-m2.7": 65,
"glm-5.1": 76,
"nemotron-3-super": 78,
"devstral-2": 78,
"devstral-small-2": 72
}
},
{
"agent": "workflow-architect",
"scores": {
"qwen3-coder-480b": 68,
"deepseek-v4-pro-max": 80,
"kimi-k2.6": 82,
"qwen3.5-122b": 80,
"gemma4-27b": 80,
"minimax-m2.5": 62,
"minimax-m2.7": 60,
"glm-5.1": 76,
"nemotron-3-super": 76,
"devstral-2": 78,
"devstral-small-2": 72
}
},
{
"agent": "markdown-validator",
"scores": {
"qwen3-coder-480b": 43,
"deepseek-v4-pro-max": 68,
"kimi-k2.6": 56,
"qwen3.5-122b": 56,
"gemma4-27b": 60,
"minimax-m2.5": 38,
"minimax-m2.7": 36,
"glm-5.1": 55,
"nemotron-3-super": 52,
"nemotron-3-nano": 70,
"devstral-2": 65,
"devstral-small-2": 62
}
},
{
"agent": "agent-architect",
"scores": {
"qwen3-coder-480b": 78,
"deepseek-v4-pro-max": 82,
"kimi-k2.6": 86,
"qwen3.5-122b": 80,
"gemma4-27b": 82,
"minimax-m2.5": 72,
"minimax-m2.7": 70,
"glm-5.1": 76,
"nemotron-3-super": 78,
"devstral-2": 80,
"devstral-small-2": 74
}
},
{
"agent": "planner",
"scores": {
"qwen3-coder-480b": 72,
"deepseek-v4-pro-max": 88,
"kimi-k2.6": 86,
"qwen3.5-122b": 86,
"gemma4-27b": 84,
"minimax-m2.5": 68,
"minimax-m2.7": 66,
"glm-5.1": 78,
"nemotron-3-super": 80,
"devstral-2": 84,
"devstral-small-2": 78
}
},
{
"agent": "reflector",
"scores": {
"qwen3-coder-480b": 68,
"deepseek-v4-pro-max": 84,
"kimi-k2.6": 80,
"qwen3.5-122b": 80,
"gemma4-27b": 80,
"minimax-m2.5": 66,
"minimax-m2.7": 64,
"glm-5.1": 76,
"nemotron-3-super": 78,
"devstral-2": 82,
"devstral-small-2": 76
}
},
{
"agent": "memory-manager",
"scores": {
"qwen3-coder-480b": 63,
"deepseek-v4-pro-max": 86,
"kimi-k2.6": 84,
"qwen3.5-122b": 85,
"gemma4-27b": 82,
"minimax-m2.5": 58,
"minimax-m2.7": 56,
"glm-5.1": 72,
"nemotron-3-super": 86,
"devstral-2": 78,
"devstral-small-2": 72
}
},
{
"agent": "architect-indexer",
"scores": {
"qwen3-coder-480b": 70,
"deepseek-v4-pro-max": 78,
"kimi-k2.6": 84,
"qwen3.5-122b": 80,
"gemma4-27b": 80,
"minimax-m2.5": 64,
"minimax-m2.7": 62,
"glm-5.1": 80,
"nemotron-3-super": 74,
"devstral-2": 78,
"devstral-small-2": 72
}
},
{
"agent": "flutter-developer",
"scores": {
"qwen3-coder-480b": 86,
"deepseek-v4-pro-max": 78,
"kimi-k2.6": 84,
"qwen3.5-122b": 84,
"gemma4-27b": 84,
"minimax-m2.5": 70,
"minimax-m2.7": 66,
"glm-5.1": 53,
"nemotron-3-super": 60,
"devstral-2": 78,
"devstral-small-2": 74
}
}
],
"agent_current_config": [
{ "agent": "lead-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 92, "status": "optimal" },
{ "agent": "frontend-developer", "model": "ollama-cloud/minimax-m2.5", "fit_score": 92, "status": "optimal" },
{ "agent": "backend-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 91, "status": "optimal" },
{ "agent": "go-developer", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 88, "status": "optimal" },
{ "agent": "python-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 90, "status": "optimal" },
{ "agent": "php-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 87, "status": "optimal" },
{ "agent": "flutter-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 86, "status": "optimal" },
{ "agent": "devops-engineer", "model": "ollama-cloud/kimi-k2.6", "fit_score": 88, "status": "optimal" },
{ "agent": "sdet-engineer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 88, "status": "optimal" },
{ "agent": "code-skeptic", "model": "ollama-cloud/minimax-m2.5", "fit_score": 85, "status": "optimal" },
{ "agent": "security-auditor", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 80, "status": "good" },
{ "agent": "performance-engineer", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 84, "status": "optimal" },
{ "agent": "the-fixer", "model": "ollama-cloud/kimi-k2.6", "fit_score": 90, "status": "optimal" },
{ "agent": "browser-automation", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 87, "status": "optimal" },
{ "agent": "visual-tester", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 82, "status": "good" },
{ "agent": "system-analyst", "model": "ollama-cloud/glm-5.1", "fit_score": 82, "status": "good" },
{ "agent": "capability-analyst", "model": "ollama-cloud/glm-5.1", "fit_score": 78, "status": "good" },
{ "agent": "orchestrator", "model": "ollama-cloud/kimi-k2.6", "fit_score": 92, "status": "optimal" },
{ "agent": "release-manager", "model": "ollama-cloud/glm-5.1", "fit_score": 76, "status": "good" },
{ "agent": "evaluator", "model": "ollama-cloud/glm-5.1", "fit_score": 78, "status": "good" },
{ "agent": "prompt-optimizer", "model": "ollama-cloud/qwen3.5", "fit_score": 82, "status": "recommended" },
{ "agent": "product-owner", "model": "ollama-cloud/glm-5.1", "fit_score": 78, "status": "good" },
{ "agent": "pipeline-judge", "model": "ollama-cloud/glm-5.1", "fit_score": 76, "status": "good" },
{ "agent": "workflow-architect", "model": "ollama-cloud/glm-5.1", "fit_score": 76, "status": "good" },
{ "agent": "markdown-validator", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 68, "status": "poor" },
{ "agent": "agent-architect", "model": "ollama-cloud/kimi-k2.6", "fit_score": 86, "status": "optimal" },
{ "agent": "planner", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 88, "status": "optimal" },
{ "agent": "reflector", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 84, "status": "optimal" },
{ "agent": "memory-manager", "model": "ollama-cloud/qwen3.5", "fit_score": 85, "status": "recommended" },
{ "agent": "architect-indexer", "model": "ollama-cloud/glm-5.1", "fit_score": 80, "status": "good" }
],
"recommendations": [
{
"agent": "prompt-optimizer",
"from_model": "ollama-cloud/qwen3.6-plus (openrouter)",
"to_model": "ollama-cloud/qwen3.5",
"reason": "Migrated to Ollama Cloud. IF 92, vision+tools+thinking. Same quality, no rate limits.",
"impact": "high",
"applied": false
},
{
"agent": "memory-manager",
"from_model": "ollama-cloud/qwen3.6-plus (openrouter)",
"to_model": "ollama-cloud/qwen3.5",
"reason": "Migrated to Ollama Cloud. 1M context via qwen3.5? Actually qwen3.5 has 128K, not 1M. Alternative: kimi-k2.6 (256K) or deepseek-v4 (1M). But matrix shows qwen3.5=85 vs kimi-k2.6=84 vs deepseek=86.",
"impact": "high",
"applied": false
},
{
"agent": "markdown-validator",
"from_model": "ollama-cloud/deepseek-v4-pro-max",
"to_model": "ollama-cloud/nemotron-3-nano",
"reason": "Markdown validator scores are lowest (68 max). Nemotron-3-Nano IF=68 but is tiny (4B/30B), extremely cheap. For lightweight validation tasks, nano is sufficient.",
"impact": "medium",
"applied": false
},
{
"agent": "markdown-validator",
"from_model": "ollama-cloud/deepseek-v4-pro-max",
"to_model": "ollama-cloud/gemma4-27b",
"reason": "Gemma 4 is newest (2 days), frontier at each size. Scores 60 for validator — better than nano 70? Actually wait: gemma4=60, nano=70. Nano is better for this role. But gemma4 is newer and more general.",
"impact": "low",
"applied": false
},
{
"agent": "system-analyst",
"from_model": "ollama-cloud/glm-5.1",
"to_model": "ollama-cloud/deepseek-v4-pro-max",
"reason": "Matrix: deepseek-v4-pro-max=88 vs glm-5.1=82. +6% quality, 1M context for architecture docs. GLM-5.1 still strong for standardization.",
"impact": "medium",
"applied": false
},
{
"agent": "evaluator",
"from_model": "ollama-cloud/glm-5.1",
"to_model": "ollama-cloud/kimi-k2.6",
"reason": "Matrix: kimi-k2.6=84 vs glm-5.1=78. +6%. IF=91 for scoring accuracy. High reasoning needed.",
"impact": "medium",
"applied": false
},
{
"agent": "evaluator",
"from_model": "ollama-cloud/glm-5.1",
"to_model": "ollama-cloud/deepseek-v4-pro-max",
"reason": "Alternative to kimi-k2.6. deepseek-v4-pro-max=84 (same as kimi), but 1M context. Could be better for large evaluation tasks.",
"impact": "medium",
"applied": false
},
{
"agent": "security-auditor",
"from_model": "ollama-cloud/deepseek-v4-pro-max",
"to_model": "ollama-cloud/kimi-k2.6",
"reason": "Matrix: both 80. But kimi-k2.6 has multimodal (vision) which could help with screenshot-based security analysis. Tie.",
"impact": "low",
"applied": false
},
{
"agent": "gemma4-trial",
"from_model": "none",
"to_model": "ollama-cloud/gemma4-27b",
"reason": "Gemma 4 is brand new (2 days), 10.1M pulls, frontier at each size, vision+audio+thinking. Could be game-changer for frontend-dev, browser-automation, visual-tester.",
"impact": "high",
"applied": false,
"note": "Requires A/B test on frontend task."
},
{
"agent": "qwen3.5-trial",
"from_model": "none",
"to_model": "ollama-cloud/qwen3.5-122b",
"reason": "Qwen 3.5 updated 2 days ago, 12.4M pulls, IF=92 (highest!), multimodal. Could replace GLM-5.1 for reasoning tasks and qwen3-coder for some coding tasks.",
"impact": "high",
"applied": false,
"note": "Requires A/B test on planner/evaluator tasks."
}
],
"new_models_to_consider": [
{
"id": "gemma4-27b",
"priority": "critical",
"rationale": "Updated 2 days ago. 10.1M pulls. Frontier-level at each size. Vision + audio + thinking + tools + cloud. Potentially replaces qwen3-coder for some tasks."
},
{
"id": "qwen3.5-122b",
"priority": "critical",
"rationale": "Updated 2 days ago. 12.4M pulls. IF=92 highest among tracked. Multimodal. Could replace glm-5.1 for reasoning and compete with qwen3-coder for coding."
},
{
"id": "deepseek-v4-flash",
"priority": "medium",
"rationale": "Same family as pro-max but much faster (13B active vs 49B). Good for low-latency agents: code-skeptic, browser-automation."
},
{
"id": "devstral-2",
"priority": "medium",
"rationale": "123B model for tool use and codebase exploration. Could be strong for lead-developer on large projects."
}
]
}