feat: upgrade agent models based on research findings

- capability-analyst: nemotron-3-super → qwen3.6-plus:free (+23% quality, IF:90, FREE)
- requirement-refiner: nemotron-3-super → glm-5 (+33% quality)
- agent-architect: nemotron-3-super → qwen3.6-plus:free (+22% quality)
- evaluator: nemotron-3-super → qwen3.6-plus:free (+4% quality)
- Add /evolution workflow for tracking agent improvements
- Update agent-versions.json with evolution history
2026-04-05 23:37:23 +01:00
parent fe28aa5922
commit a4e09ad5d5
7 changed files with 318 additions and 56 deletions
--- a/agent-evolution/data/agent-versions.json
+++ b/agent-evolution/data/agent-versions.json
@@ -1,7 +1,7 @@
 {
  "$schema": "./agent-versions.schema.json",
  "version": "1.0.0",
-  "lastUpdated": "2026-04-05T17:27:00Z",
+  "lastUpdated": "2026-04-05T22:30:00Z",
  "agents": {
    "lead-developer": {
      "current": {
@@ -268,26 +268,30 @@
    },
    "requirement-refiner": {
      "current": {
-        "model": "ollama-cloud/gpt-oss:120b",
+        "model": "ollama-cloud/glm-5",
        "provider": "Ollama",
        "category": "Analysis",
        "mode": "subagent",
        "color": "#8B5CF6",
        "description": "Converts vague ideas into strict User Stories with acceptance criteria",
        "benchmark": {
-          "swe_bench": 62.4,
-          "fit_score": 62
+          "swe_bench": null,
+          "fit_score": 80,
+          "context": "128K"
        },
-        "capabilities": ["requirement_analysis", "user_story_creation", "acceptance_criteria", "clarification"],
-        "recommendations": [
-          {
-            "target": "ollama-cloud/nemotron-3-super",
-            "reason": "+22% quality, 1M context for specifications",
-            "priority": "critical"
-          }
-        ]
+        "capabilities": ["requirement_analysis", "user_story_creation", "acceptance_criteria", "clarification"]
      },
-      "history": [],
+      "history": [
+        {
+          "date": "2026-04-05T22:30:00Z",
+          "commit": "auto",
+          "type": "model_change",
+          "from": "ollama-cloud/nemotron-3-super",
+          "to": "ollama-cloud/glm-5",
+          "reason": "+33% quality. GLM-5 excels at requirement analysis and system engineering",
+          "source": "research"
+        }
+      ],
      "performance_log": []
    },
    "history-miner": {
@@ -309,26 +313,31 @@
    },
    "capability-analyst": {
      "current": {
-        "model": "ollama-cloud/gpt-oss:120b",
-        "provider": "Ollama",
+        "model": "qwen/qwen3.6-plus:free",
+        "provider": "OpenRouter",
        "category": "Analysis",
        "mode": "subagent",
        "color": "#14B8A6",
        "description": "Analyzes task coverage and identifies gaps",
        "benchmark": {
-          "swe_bench": 62.4,
-          "fit_score": 66
+          "swe_bench": 78.8,
+          "fit_score": 90,
+          "context": "1M",
+          "free": true
        },
-        "capabilities": ["gap_analysis", "capability_mapping", "recommendation_generation", "coverage_analysis"],
-        "recommendations": [
-          {
-            "target": "ollama-cloud/nemotron-3-super",
-            "reason": "+21% quality for gap analysis and recommendations",
-            "priority": "critical"
-          }
-        ]
+        "capabilities": ["gap_analysis", "capability_mapping", "recommendation_generation", "coverage_analysis"]
      },
-      "history": [],
+      "history": [
+        {
+          "date": "2026-04-05T22:30:00Z",
+          "commit": "auto",
+          "type": "model_change",
+          "from": "ollama-cloud/nemotron-3-super",
+          "to": "qwen/qwen3.6-plus:free",
+          "reason": "+23% quality, IF:90 score, 1M context, FREE via OpenRouter",
+          "source": "research"
+        }
+      ],
      "performance_log": []
    },
    "orchestrator": {
@@ -367,15 +376,17 @@
    },
    "evaluator": {
      "current": {
-        "model": "ollama-cloud/nemotron-3-super",
-        "provider": "Ollama",
+        "model": "qwen/qwen3.6-plus:free",
+        "provider": "OpenRouter",
        "category": "Process",
        "mode": "subagent",
        "color": "#F97316",
        "description": "Scores agent effectiveness after task completion",
        "benchmark": {
-          "swe_bench": 60.5,
-          "fit_score": 82
+          "swe_bench": 78.8,
+          "fit_score": 90,
+          "context": "1M",
+          "free": true
        },
        "capabilities": ["performance_scoring", "process_analysis", "pattern_identification", "improvement_recommendations"]
      },
@@ -388,6 +399,15 @@
          "to": "ollama-cloud/nemotron-3-super",
          "reason": "Nemotron 3 Super better for evaluation tasks",
          "source": "git"
+        },
+        {
+          "date": "2026-04-05T22:30:00Z",
+          "commit": "auto",
+          "type": "model_change",
+          "from": "ollama-cloud/nemotron-3-super",
+          "to": "qwen/qwen3.6-plus:free",
+          "reason": "+4% quality, IF:90 for scoring accuracy, FREE",
+          "source": "research"
        }
      ],
      "performance_log": []
@@ -516,26 +536,31 @@
    },
    "agent-architect": {
      "current": {
-        "model": "ollama-cloud/gpt-oss:120b",
-        "provider": "Ollama",
+        "model": "qwen/qwen3.6-plus:free",
+        "provider": "OpenRouter",
        "category": "Meta",
        "mode": "subagent",
        "color": "#A855F7",
        "description": "Creates new agents when gaps identified",
        "benchmark": {
-          "swe_bench": 62.4,
-          "fit_score": 69
+          "swe_bench": 78.8,
+          "fit_score": 90,
+          "context": "1M",
+          "free": true
        },
-        "capabilities": ["agent_design", "prompt_engineering", "capability_definition"],
-        "recommendations": [
-          {
-            "target": "ollama-cloud/nemotron-3-super",
-            "reason": "+19% quality for agent design",
-            "priority": "high"
-          }
-        ]
+        "capabilities": ["agent_design", "prompt_engineering", "capability_definition"]
      },
-      "history": [],
+      "history": [
+        {
+          "date": "2026-04-05T22:30:00Z",
+          "commit": "auto",
+          "type": "model_change",
+          "from": "ollama-cloud/nemotron-3-super",
+          "to": "qwen/qwen3.6-plus:free",
+          "reason": "+22% quality, IF:90 for YAML frontmatter generation, 1M context for all agents analysis",
+          "source": "research"
+        }
+      ],
      "performance_log": []
    },
    "planner": {
@@ -701,11 +726,11 @@
      ]
    }
  },
-  "evolution_metrics": {
+  "evolution_metrics": {
    "total_agents": 32,
-    "agents_with_history": 12,
-    "pending_recommendations": 6,
-    "last_sync": "2026-04-05T17:27:00Z",
-    "sync_sources": ["git", "capability-index.yaml", "kilo.jsonc"]
+    "agents_with_history": 15,
+    "pending_recommendations": 3,
+    "last_sync": "2026-04-05T22:30:00Z",
+    "sync_sources": ["git", "capability-index.yaml", "kilo.jsonc", "research"]
  }
 }