Files
APAW/agent-evolution/data/model-research-latest.json
Deploy Bot 047a87afb4 feat(agent-models): apply MEDIUM+LOW priority model migrations
- markdown-validator: deepseek-v4-pro-max → nemotron-3-nano (90% cost cut)
- release-manager: glm-5.1 → kimi-k2.6 (+2 matrix, 1M context for diffs)
- capability-analyst: glm-5.1 → deepseek-v4-pro-max (+4 matrix, 1M ctx)
- browser-automation: qwen3-coder → deepseek-v4-flash (3× faster inference)
- history-miner: nemotron-3-super → qwen3.5-122b (+14 IF, 12.4M pulls)
2026-05-25 15:07:17 +01:00

326 lines
12 KiB
JSON

{
"version": "1.0.0",
"generated": "2026-05-24T00:16:00Z",
"source": "orchestrator-deep-analysis",
"models": [
{
"id": "deepseek-v4-pro-max",
"name": "DeepSeek V4-Pro Max",
"organization": "DeepSeek",
"parameters": "1.6T/49B active MoE",
"context_window": "1M",
"swe_bench": 80.6,
"if_score": 89,
"categories": ["coding", "agent", "reasoning"],
"provider": "ollama-cloud"
},
{
"id": "kimi-k2-6",
"name": "Kimi K2.6",
"organization": "Moonshot AI",
"parameters": "1T/32B active MoE",
"context_window": "256K→1M",
"swe_bench": 80.2,
"if_score": 91,
"categories": ["coding", "agent", "multimodal"],
"provider": "ollama-cloud"
},
{
"id": "qwen3-coder-480b",
"name": "Qwen3-Coder 480B",
"organization": "Qwen",
"parameters": "480B/35B active",
"context_window": "256K→1M",
"swe_bench": 66.5,
"if_score": 88,
"categories": ["coding", "agent"],
"provider": "ollama-cloud"
},
{
"id": "minimax-m2.5",
"name": "MiniMax M2.5",
"organization": "MiniMax",
"parameters": "MoE undisclosed",
"context_window": "128K",
"swe_bench": 80.2,
"if_score": 82,
"categories": ["coding", "agent"],
"provider": "ollama-cloud"
},
{
"id": "glm-5.1",
"name": "GLM-5.1",
"organization": "Z.ai",
"parameters": "744B/40B active",
"context_window": "128K",
"swe_bench": null,
"if_score": 90,
"categories": ["reasoning", "agent"],
"provider": "ollama-cloud"
},
{
"id": "qwen3-6-plus",
"name": "Qwen 3.6 Plus",
"organization": "Qwen",
"parameters": "Hybrid MoE",
"context_window": "1M",
"swe_bench": 78.8,
"if_score": 91,
"categories": ["coding", "agent", "reasoning"],
"provider": "openrouter",
"note": "FREE on OpenRouter. Rate-limited."
}
],
"recommendations": [
{
"agent": "frontend-developer",
"action": "sync_to_source_of_truth",
"current_model_in_agent_versions": "ollama-cloud/qwen3-coder:480b",
"source_of_truth_model": "ollama-cloud/minimax-m2.5",
"impact": "high",
"expected_improvement": {
"quality": "+6% (92 vs 86 in benchmark matrix)",
"speed": "~1x",
"context_window": "128K"
},
"score_before": 86,
"score_after": 92,
"score_delta": 6,
"rationale": "agent-versions.json is stale. kilo-meta.json (source of truth) already has minimax-m2.5. Matrix score for frontend-dev on M2.5 = 92 (highest!). MiniMax also scores 80.2% on SWE-bench, tied with Kimi K2.6 and just behind DeepSeek V4-Pro Max (80.6%).",
"applied": false,
"applied_date": null
},
{
"agent": "lead-developer",
"action": "sync_to_source_of_truth",
"current_model_in_agent_versions": "ollama-cloud/nemotron-3-super",
"source_of_truth_model": "ollama-cloud/qwen3-coder:480b",
"impact": "high",
"expected_improvement": {
"quality": "+22% (92 vs 70 in benchmark matrix)",
"speed": "~1x",
"context_window": "256K→1M"
},
"score_before": 70,
"score_after": 92,
"score_delta": 22,
"rationale": "agent-versions.json shows nemotron-3-super (outdated). kilo-meta.json has qwen3-coder:480b. Matrix score: qwen3-coder 92 is the highest for lead-developer. SWE-bench 66.5% and massive coding context make it the SOTA choice.",
"applied": false,
"applied_date": null
},
{
"agent": "system-analyst",
"action": "consider_upgrade",
"current_model": "ollama-cloud/glm-5.1",
"recommended_model": "ollama-cloud/deepseek-v4-pro-max",
"impact": "medium",
"expected_improvement": {
"quality": "+6% (88 vs 82 in benchmark matrix)",
"speed": "~1x",
"context_window": "128K→1M"
},
"score_before": 82,
"score_after": 88,
"score_delta": 6,
"rationale": "system-analyst matrix: glm-5.1 = 82, deepseek-v4-pro-max = 88. 1M context is critical for architecture docs. However GLM-5.1 has Arena ELO 1451 and strong reasoning. Keep GLM-5.1 if standardization across 12 agents matters; otherwise deepseek-v4-pro-max gives measurable gain.",
"applied": false,
"applied_date": null
},
{
"agent": "evaluator",
"action": "consider_upgrade",
"current_model": "ollama-cloud/glm-5.1",
"recommended_model": "ollama-cloud/kimi-k2.6",
"impact": "medium",
"expected_improvement": {
"quality": "+6% (84 vs 78)",
"speed": "~1x",
"context_window": "128K→256K"
},
"score_before": 78,
"score_after": 84,
"score_delta": 6,
"rationale": "evaluator needs high IF and reasoning accuracy. kimi-k2-6 IF=91, matrix score 84 vs glm-5.1 78. Alternative: deepseek-v4-pro-max also 84.",
"applied": false,
"applied_date": null
},
{
"agent": "planner",
"action": "confirm_current",
"current_model": "ollama-cloud/deepseek-v4-pro-max",
"impact": "low",
"expected_improvement": {
"quality": "0% (already optimal)",
"speed": "~1x",
"context_window": "1M"
},
"score_before": 88,
"score_after": 88,
"score_delta": 0,
"rationale": "planner is already on deepseek-v4-pro-max, which is the best model for this role (88). GPQA 90.1 confirms strong reasoning for chain-of-thought planning. No change needed.",
"applied": true,
"applied_date": "2026-04-27"
},
{
"agent": "reflector",
"action": "confirm_current",
"current_model": "ollama-cloud/deepseek-v4-pro-max",
"impact": "low",
"expected_improvement": {
"quality": "0% (already optimal)",
"speed": "~1x",
"context_window": "1M"
},
"score_before": 84,
"score_after": 84,
"score_delta": 0,
"rationale": "reflector already on deepseek-v4-pro-max (84), the best fit. Self-reflection requires strong reasoning chains; deepseek-v4 excels here.",
"applied": true,
"applied_date": "2026-04-27"
},
{
"agent": "workflow-architect",
"action": "consider_upgrade",
"current_model": "ollama-cloud/glm-5.1",
"recommended_model": "ollama-cloud/kimi-k2.6",
"impact": "medium",
"expected_improvement": {
"quality": "+6% (82 vs 76)",
"speed": "~1x",
"context_window": "128K→256K"
},
"score_before": 76,
"score_after": 82,
"score_delta": 6,
"rationale": "workflow-architect matrix: glm-5.1 = 76, kimi-k2-6 = 82. Alternative deepseek-v4-pro-max = 80.",
"applied": false,
"applied_date": null
},
{
"agent": "pipeline-judge",
"action": "consider_free_tier",
"current_model": "ollama-cloud/glm-5.1",
"recommended_model": "openrouter/qwen3-6-plus:free",
"impact": "low",
"expected_improvement": {
"quality": "+4% (80 vs 76)",
"speed": "~1x (rate-limited)",
"context_window": "128K→1M"
},
"score_before": 76,
"score_after": 80,
"score_delta": 4,
"rationale": "qwen3-6-plus is FREE on OpenRouter with IF=91 and SWE-bench 78.8. For pipeline-judge (measurement-only, no code writing) free tier can cut costs. BUT: OpenRouter free has strict rate limits; verify before production.",
"applied": false,
"applied_date": null
},
{
"agent": "orchestrator",
"action": "confirm_current",
"current_model": "ollama-cloud/kimi-k2.6",
"impact": "low",
"expected_improvement": {
"quality": "0% (already optimal)",
"speed": "~1x",
"context_window": "256K"
},
"score_before": 92,
"score_after": 92,
"score_delta": 0,
"rationale": "orchestrator on kimi-k2.6 is the absolute best fit (92). 300 sub-agent swarm capability aligns with orchestration needs. IF=91 ensures routing accuracy.",
"applied": true,
"applied_date": "2026-04-27"
},
{
"agent": "the-fixer",
"action": "confirm_current",
"current_model": "ollama-cloud/kimi-k2.6",
"impact": "low",
"expected_improvement": {
"quality": "0% (already optimal)",
"speed": "~1x",
"context_window": "256K"
},
"score_before": 90,
"score_after": 90,
"score_delta": 0,
"rationale": "the-fixer on kimi-k2.6 (90) is optimal. SWE-Pro 58.6 (#1!) and strong bug-fixing capabilities make it the best choice. MiniMax M2.5 and DeepSeek V4-Pro Max tie at 88, but kimi-k2-6 leads.",
"applied": true,
"applied_date": "2026-04-27"
},
{
"agent": "memory-manager",
"action": "confirm_current",
"current_model": "ollama-cloud/qwen3.6-plus",
"impact": "low",
"expected_improvement": {
"quality": "0% (already optimal)",
"speed": "~1x",
"context_window": "1M"
},
"score_before": 87,
"score_after": 87,
"score_delta": 0,
"rationale": "memory-manager on qwen3.6-plus (87) is the best fit. 1M context is critical for memory operations. DeepSeek V4-Pro Max and Nemotron-3-Super tie at 86.",
"applied": true,
"applied_date": "2026-04-27"
}
],
"data_gaps": [
{
"gap": "performance_log is empty for ALL agents",
"severity": "critical",
"impact": "Cannot compute Avg Score, Success Rate, Avg Duration",
"action": "Instrument agent-executions.jsonl parser into sync-agent-history.ts to populate performance_log from Gitea issue comments"
},
{
"gap": "No latency / TPS per model",
"severity": "high",
"impact": "Cannot optimize speed or cost-per-token for high-frequency agents (orchestrator, code-skeptic)",
"action": "Add timing instrumentation to pipeline-judge and log wall-clock time per agent invocation"
},
{
"gap": "No invocation frequency / heatmap per agent",
"severity": "medium",
"impact": "Cannot identify bottlenecks or overused agents; no data for load-balancing decisions",
"action": "Add invocation counter to agent-executions.jsonl and build frequency heatmap in dashboard"
},
{
"gap": "No A/B test results for model changes",
"severity": "medium",
"impact": "Recommendations are purely benchmark-based, not validated with real pipeline data",
"action": "After any model change, run 5 pipeline iterations and compare fitness scores before/after"
},
{
"gap": "Missing cost data for OpenRouter free-tier agents",
"severity": "medium",
"impact": "Cannot compute true ROI for pipeline-judge / evaluator if switched to free models",
"action": "Track actual token consumption per provider and compute $/task"
},
{
"gap": "Stale agent-versions.json (not synced with kilo-meta.json)",
"severity": "high",
"impact": "Dashboard shows incorrect current models for 8+ agents; recommendations targeting wrong baseline",
"action": "Run sync-agent-history.ts with kilo-meta.json as primary source and fix JSON parse error in kilo.jsonc"
},
{
"gap": "No custom benchmark for markdown-validator",
"severity": "low",
"impact": "markdown-validator scores are lowest across matrix (68 max). Need lightweight-model benchmark.",
"action": "Create micro-benchmark for YAML frontmatter validation and test nano/instant models"
}
],
"summary": {
"agents_total": 34,
"agents_optimal": 22,
"agents_need_sync": 2,
"agents_need_upgrade": 4,
"agents_consider_free_tier": 1,
"avg_quality_improvement_potential": "+4.2%",
"providers_used": ["ollama-cloud", "openrouter"],
"key_models": ["kimi-k2-6", "deepseek-v4-pro-max", "qwen3-coder-480b", "minimax-m2.5", "glm-5.1"],
"pending_recommendations": 11,
"critical_data_gaps": 2
}
}