Files
APAW/agent-evolution/data/evolution-summary.json
Deploy Bot c1e50495a9 evolution: objective model assignments from benchmark research + code-skeptic review
- 6 model mismatches fixed: product-owner, incident-responder, history-miner,
  architect-indexer, pipeline-judge, workflow-cross-checker
- incident-responder capabilities corrected (was copy-pasted from workflow-cross-checker)
- history-miner entry added to capability-index.yaml
- LLM Stats API metadata ingested into research-report.json
- planner rationale corrected (removed false minimax-m3 "300-agent swarm" claim)
- capability-index.yaml: stale qwen3-coder:480b fallback_models removed (4 agents)
- capability-index.yaml: duplicate kimi-k2.6 entry removed (evolution-prompt)
- sync-agents.cjs --check: zero violations
2026-06-01 22:13:49 +01:00

60 lines
3.3 KiB
JSON

{
"ts": "2026-06-01T21:30:00Z",
"event": "evolution_complete_report",
"trigger": "user_request_objective_evolution",
"methodology": "capability-analyst_research_report + deterministic_sync + code_skeptic_review",
"agents_changed": 32,
"model_distribution": {
"deepseek-v4-pro": 16,
"minimax-m3:cloud": 10,
"glm-5.1": 5,
"kimi-k2.6": 5,
"minimax-m2.5:cloud": 3
},
"evidence_file": "agent-evolution/data/research-report.json",
"evidence_sources": [
"github.com/MoonshotAI/Kimi-K2",
"ollama.com/library/deepseek-v4-pro",
"ollama.com/library/glm-5.1",
"ollama.com/library/kimi-k2.6",
"ollama.com/library/minimax-m3",
"ollama.com/library/minimax-m2.5",
"minimax.io/models/text/m3",
"minimax.io/news/minimax-m25",
"qwenlm.github.io/blog/qwen3-coder",
"api.llm-stats.com/v1/ (pricing/provider metadata only, no benchmark scores)"
],
"code_skeptic_findings": {
"issues_fixed": [
"incident-responder in capability-index.yaml had copy-pasted workflow-cross-checker capabilities; replaced with correct incident_response capabilities",
"removed orphaned 'workflow-cross-checker: null' field and unjustified 'variant: thinking' from incident-responder",
"added missing history-miner entry to capability-index.yaml",
"3 model mismatches fixed: product-owner (kimi-k2.6 → minimax-m2.5:cloud), incident-responder (deepseek-v4-pro → glm-5.1), history-miner (qwen3-coder:480b → deepseek-v4-pro)",
"3 additional mismatches fixed: architect-indexer, pipeline-judge, workflow-cross-checker (all qwen3-coder:480b → deepseek-v4-pro)"
],
"total_model_mismatches_fixed": 6
},
"opencompass_container": {
"files": ["docker/docker-compose.opencompass.yml", "docker/Dockerfile.opencompass", "scripts/opencompass-eval.sh", "scripts/opencompass-setup.sh"],
"status": "config_complete_build_blocked_network",
"note": "Docker build requires internet access for pip install. Files validated and ready. The container is not needed — no benchmark endpoint is available."
},
"llm_stats_api": {
"status": "pricing_registry_only",
"benchmarks_available": false,
"models_with_metadata": ["deepseek-v4-pro-max", "glm-5.1", "kimi-k2.6", "minimax-m2.5", "minimax-m2.7"],
"models_not_found": ["minimax-m3", "qwen3-coder-480b"],
"finding": "LLM Stats API (api.llm-stats.com/v1/) provides model registry, pricing, provider metadata, and param_count but has NO benchmark score endpoints. Manual research remains the sole source of benchmark data."
},
"data_gaps": [
"minimax-m3: Not found in LLM Stats API. ALL benchmark tables on ollama.com and minimax.io are IMAGE-ONLY. Specific coding scores unavailable.",
"qwen3-coder-480b: Not found in LLM Stats API. ALL benchmarks image-only. No longer assigned to any agent.",
"kimi-k2.6: Ollama page image-only. Using K2 Instruct as proxy (likely understates performance). API provides pricing/providers.",
"minimax-m2.5: Ollama images + partial blog text. Reasoning benchmarks missing. API provides pricing/providers but reports a different context length (manual research said 198K, API shows 1M).",
"minimax-m2.7: Not in manual research. Found in API with release_date 2026-03-18. param_count null in API. SWE-Pro 56.22% from API description."
],
"verification": "scripts/sync-agents.cjs --check PASSED",
"total_agents_assigned": 36,
"zero_unassigned": true
}