APAW/agent-evolution/data/evolution-summary.json

{
  "ts": "2026-06-01T21:30:00Z",
  "event": "evolution_complete_report",
  "trigger": "user_request_objective_evolution",
  "methodology": "capability-analyst_research_report + deterministic_sync + code_skeptic_review",
  "agents_changed": 32,
  "model_distribution": {
    "deepseek-v4-pro": 16,
    "minimax-m3:cloud": 10,
    "glm-5.1": 5,
    "kimi-k2.6": 5,
    "minimax-m2.5:cloud": 3
  },
  "evidence_file": "agent-evolution/data/research-report.json",
  "evidence_sources": [
    "github.com/MoonshotAI/Kimi-K2",
    "ollama.com/library/deepseek-v4-pro",
    "ollama.com/library/glm-5.1",
    "ollama.com/library/kimi-k2.6",
    "ollama.com/library/minimax-m3",
    "ollama.com/library/minimax-m2.5",
    "minimax.io/models/text/m3",
    "minimax.io/news/minimax-m25",
    "qwenlm.github.io/blog/qwen3-coder",
    "api.llm-stats.com/v1/ (pricing/provider metadata only, no benchmark scores)"
  ],
  "code_skeptic_findings": {
    "issues_fixed": [
      "incident-responder in capability-index.yaml had copy-pasted workflow-cross-checker capabilities; replaced with correct incident_response capabilities",
      "removed orphaned 'workflow-cross-checker: null' field and unjustified 'variant: thinking' from incident-responder",
      "added missing history-miner entry to capability-index.yaml",
      "3 model mismatches fixed: product-owner (kimi-k2.6 → minimax-m2.5:cloud), incident-responder (deepseek-v4-pro → glm-5.1), history-miner (qwen3-coder:480b → deepseek-v4-pro)",
      "3 additional mismatches fixed: architect-indexer, pipeline-judge, workflow-cross-checker (all qwen3-coder:480b → deepseek-v4-pro)"
    ],
    "total_model_mismatches_fixed": 6
  },
  "opencompass_container": {
    "files": ["docker/docker-compose.opencompass.yml", "docker/Dockerfile.opencompass", "scripts/opencompass-eval.sh", "scripts/opencompass-setup.sh"],
    "status": "config_complete_build_blocked_network",
    "note": "Docker build requires internet access for pip install. Files validated and ready. Not needed — no benchmark endpoint available."
  },
  "llm_stats_api": {
    "status": "pricing_registry_only",
    "benchmarks_available": false,
    "models_with_metadata": ["deepseek-v4-pro-max", "glm-5.1", "kimi-k2.6", "minimax-m2.5", "minimax-m2.7"],
    "models_not_found": ["minimax-m3", "qwen3-coder-480b"],
    "finding": "LLM Stats API (api.llm-stats.com/v1/) provides model registry, pricing, provider metadata, and param_count but has NO benchmark score endpoints. Manual research remains the sole source of benchmark data."
  },
  "data_gaps": [
    "minimax-m3: Not found in LLM Stats API. ALL benchmark tables on ollama.com and minimax.io are IMAGE-ONLY. Specific coding scores unavailable.",
    "qwen3-coder-480b: Not found in LLM Stats API. ALL benchmarks image-only. No longer assigned to any agent.",
    "kimi-k2.6: Ollama page image-only. Using K2 Instruct as proxy (likely understates performance). API provides pricing/providers.",
    "minimax-m2.5: Ollama images + partial blog text. Reasoning benchmarks missing. API provides pricing/providers and a 1M context discrepancy (manual said 198K, API shows 1M).",
    "minimax-m2.7: Not in manual research. Found in API with release_date 2026-03-18. param_count null in API. SWE-Pro 56.22% from API description."
  ],
  "verification": "scripts/sync-agents.cjs --check PASSED",
  "total_agents_assigned": 36,
  "zero_unassigned": true
}