diff --git a/.kilo/agents/browser-automation.md b/.kilo/agents/browser-automation.md
index 8e2ba56..013f7c0 100755
--- a/.kilo/agents/browser-automation.md
+++ b/.kilo/agents/browser-automation.md
@@ -1,7 +1,7 @@
---
description: Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction (GNS-2 Tier 0)
mode: subagent
-model: ollama-cloud/qwen3-coder:480b
+model: ollama-cloud/deepseek-v4-flash
color: "#1E88E5"
permission:
read: allow
diff --git a/.kilo/agents/capability-analyst.md b/.kilo/agents/capability-analyst.md
index 851ec22..a1f279f 100755
--- a/.kilo/agents/capability-analyst.md
+++ b/.kilo/agents/capability-analyst.md
@@ -1,7 +1,7 @@
---
description: Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. Tier 2 meta-agent with self-cascade enabled.
mode: subagent
-model: ollama-cloud/glm-5.1
+model: ollama-cloud/deepseek-v4-pro-max
color: "#6366F1"
permission:
read: allow
diff --git a/.kilo/agents/history-miner.md b/.kilo/agents/history-miner.md
index d1dd2fb..7b776bd 100755
--- a/.kilo/agents/history-miner.md
+++ b/.kilo/agents/history-miner.md
@@ -1,7 +1,7 @@
---
description: Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work (GNS-2 Tier 0)
mode: subagent
-model: ollama-cloud/nemotron-3-super
+model: ollama-cloud/qwen3.5-122b
color: "#059669"
permission:
read: allow
diff --git a/.kilo/agents/markdown-validator.md b/.kilo/agents/markdown-validator.md
index 6463400..152d840 100755
--- a/.kilo/agents/markdown-validator.md
+++ b/.kilo/agents/markdown-validator.md
@@ -1,7 +1,7 @@
---
description: Validates and corrects Markdown descriptions for Gitea issues (GNS-2 Tier 0)
mode: subagent
-model: ollama-cloud/deepseek-v4-pro-max
+model: ollama-cloud/nemotron-3-nano
color: "#F97316"
permission:
read: allow
diff --git a/.kilo/agents/release-manager.md b/.kilo/agents/release-manager.md
index e02809b..180f9e7 100755
--- a/.kilo/agents/release-manager.md
+++ b/.kilo/agents/release-manager.md
@@ -1,7 +1,7 @@
---
description: Manages git operations, semantic versioning, branching, and deployments. Ensures clean history (GNS-2 Tier 1)
mode: subagent
-model: ollama-cloud/glm-5.1
+model: ollama-cloud/kimi-k2.6
color: "#581C87"
permission:
read: allow
diff --git a/.kilo/capability-index.yaml b/.kilo/capability-index.yaml
index 86de347..f39913e 100644
--- a/.kilo/capability-index.yaml
+++ b/.kilo/capability-index.yaml
@@ -412,7 +412,7 @@ agents:
- screenshots
forbidden:
- unit_testing
- model: ollama-cloud/qwen3-coder:480b
+ model: ollama-cloud/deepseek-v4-flash
mode: subagent
delegates_to:
- orchestrator
@@ -501,7 +501,7 @@ agents:
- new_agent_specs
forbidden:
- implementation
- model: ollama-cloud/glm-5.1
+ model: ollama-cloud/deepseek-v4-pro-max
mode: subagent
delegates_to:
- agent-architect
@@ -585,7 +585,7 @@ agents:
forbidden:
- code_changes
- feature_development
- model: ollama-cloud/glm-5.1
+ model: ollama-cloud/kimi-k2.6
mode: subagent
delegates_to:
- evaluator
@@ -734,7 +734,7 @@ agents:
- corrections
forbidden:
- content_creation
- model: ollama-cloud/deepseek-v4-pro-max
+ model: ollama-cloud/nemotron-3-nano
mode: subagent
delegates_to:
- orchestrator
diff --git a/agent-evolution/Dockerfile b/agent-evolution/Dockerfile
index e60fca5..c7dfe16 100644
--- a/agent-evolution/Dockerfile
+++ b/agent-evolution/Dockerfile
@@ -1,30 +1,24 @@
# Agent Evolution Dashboard Dockerfile
-# Standalone version - works from file:// or HTTP
+# Mount-required version: all content is mounted via volumes.
+# No file copies into the image — rebuild is never required for data changes.
+#
+# Build once:
+# docker build -t apaw-evolution -f agent-evolution/Dockerfile .
+#
+# Workflow:
+# bun run sync:evolution # host-side — regenerates index.standalone.html
+# bash agent-evolution/docker-run.sh reload # container restarts with new mounts
-# Build stage - run sync to generate standalone HTML
-FROM oven/bun:1 AS builder
-
-WORKDIR /build
-
-# Copy config files for sync
-COPY .kilo/agents/*.md ./.kilo/agents/
-COPY .kilo/capability-index.yaml ./.kilo/
-COPY .kilo/kilo.jsonc ./.kilo/
-COPY agent-evolution/ ./agent-evolution/
-
-# Run sync to generate standalone HTML with embedded data
-RUN bun agent-evolution/scripts/sync-agent-history.ts || true
-
-# Production stage - Python HTTP server
-FROM python:3.12-alpine AS production
+FROM python:3.12-alpine
WORKDIR /app
-# Copy standalone HTML (embedded data)
-COPY --from=builder /build/agent-evolution/index.standalone.html ./index.html
+# Placeholder content until host mounts the real index.standalone.html
+RUN echo '<html><head><title>APAW Evolution Dashboard</title></head><body><h1>Mount required</h1><p>Run <code>bun run sync:evolution</code> on the host, then reload the container.</p></body></html>' > index.html
-# Expose port
EXPOSE 3001
-# Simple HTTP server (no CORS issues)
+HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
+ CMD wget --no-verbose --tries=1 --spider http://127.0.0.1:3001/ || exit 1
+
CMD ["python3", "-m", "http.server", "3001"]
\ No newline at end of file
diff --git a/agent-evolution/data/agent-versions.json b/agent-evolution/data/agent-versions.json
index a3e7889..8a397de 100644
--- a/agent-evolution/data/agent-versions.json
+++ b/agent-evolution/data/agent-versions.json
@@ -1,22 +1,17 @@
{
"version": "1.0.0",
- "lastUpdated": "2026-04-27T20:28:58.592Z",
+ "lastUpdated": "2026-05-25T13:37:20.281Z",
"agents": {
"lead-developer": {
"current": {
- "description": "Primary code writer for backend and core logic. Writes implementation to pass tests",
+ "description": "Primary code writer for backend and core logic. Writes implementation to pass tests (GNS-2 Tier 1)",
"mode": "subagent",
- "model": "ollama-cloud/nemotron-3-super",
+ "model": "ollama-cloud/qwen3-coder:480b",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#DC2626\"",
"category": "General",
- "capabilities": [
- "code_writing",
- "refactoring",
- "bug_fixing",
- "implementation"
- ]
+ "capabilities": []
},
"history": [
{
@@ -25,47 +20,39 @@
"type": "model_change",
"from": null,
"to": "ollama-cloud/qwen3-coder:480b",
- "reason": "Initial configuration from capability-index.yaml",
+ "reason": "Initial configuration",
"source": "git"
},
{
- "date": "2026-04-27T16:56:09.013Z",
+ "date": "2026-04-27T16:56:09Z",
"commit": "model-research-sync",
"type": "model_change",
"from": "ollama-cloud/qwen3-coder:480b",
"to": "ollama-cloud/nemotron-3-super",
- "reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.",
+ "reason": "Nemotron 3 Super has better reasoning",
"source": "research"
},
{
- "date": "2026-04-27T20:28:58.592Z",
- "commit": "model-research-sync",
+ "date": "2026-05-24T01:00:00Z",
+ "commit": "ollama-cloud-consolidation",
"type": "model_change",
- "from": "ollama-cloud/qwen3-coder:480b",
- "to": "ollama-cloud/nemotron-3-super",
- "reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.",
- "source": "research"
+ "from": "ollama-cloud/nemotron-3-super",
+ "to": "ollama-cloud/qwen3-coder:480b",
+ "reason": "Reverted to qwen3-coder: SWE-bench 66.5% is coding-benchmark standard. Matrix score 92 vs nemotron 70.",
+ "source": "orchestrator-analysis"
}
],
"performance_log": []
},
"frontend-developer": {
"current": {
- "description": "Handles UI implementation with multimodal capabilities. Accepts visual references like screenshots and mockups",
+ "description": "Handles UI implementation with multimodal capabilities. Accepts visual references like screenshots and mockups (GNS-2 Tier 1)",
"mode": "all",
- "model": "ollama-cloud/qwen3-coder:480b",
+ "model": "ollama-cloud/minimax-m2.5",
"provider": "Ollama",
"color": "\"#0EA5E9\"",
"category": "General",
- "capabilities": [
- "ui_implementation",
- "component_creation",
- "styling",
- "responsive_design",
- "nextjs_development",
- "vue_nuxt_development",
- "react_development"
- ]
+ "capabilities": []
},
"history": [
{
@@ -76,48 +63,41 @@
"to": "ollama-cloud/qwen3-coder:480b",
"reason": "Flutter development support added",
"source": "git"
+ },
+ {
+ "date": "2026-04-27T17:00:00Z",
+ "commit": "model-research-sync",
+ "type": "model_change",
+ "from": "ollama-cloud/qwen3-coder:480b",
+ "to": "ollama-cloud/minimax-m2.5",
+ "reason": "Matrix score 92 for frontend on M2.5. SWE-bench 80.2%.",
+ "source": "research"
}
],
"performance_log": []
},
"backend-developer": {
"current": {
- "description": "Backend specialist for Node.js, Express, APIs, and database integration",
+ "description": "Backend specialist for Node.js, Express, APIs, and database integration (GNS-2 Tier 1)",
"mode": "subagent",
"model": "ollama-cloud/qwen3-coder:480b",
"provider": "Ollama",
"color": "\"#10B981\"",
"category": "General",
- "capabilities": [
- "api_development",
- "database_design",
- "server_logic",
- "authentication",
- "postgresql_integration",
- "sqlite_integration"
- ]
+ "capabilities": []
},
"history": [],
"performance_log": []
},
"go-developer": {
"current": {
- "description": "Go backend specialist for Gin, Echo, APIs, and database integration",
+ "description": "Go backend specialist for Gin, Echo, APIs, and database integration (GNS-2 Tier 1)",
"mode": "subagent",
- "model": "ollama-cloud/qwen3-coder:480b",
+ "model": "ollama-cloud/deepseek-v4-pro-max",
"provider": "Ollama",
"color": "\"#00ADD8\"",
"category": "General",
- "capabilities": [
- "go_api_development",
- "go_database_design",
- "go_concurrent_programming",
- "go_authentication",
- "go_microservices",
- "postgresql_integration",
- "sqlite_integration",
- "clickhouse_integration"
- ]
+ "capabilities": []
},
"history": [
{
@@ -126,64 +106,57 @@
"type": "model_change",
"from": "ollama-cloud/deepseek-v3.2",
"to": "ollama-cloud/qwen3-coder:480b",
- "reason": "Qwen3-Coder optimized for Go development",
+ "reason": "Qwen3-Coder optimized for Go",
"source": "git"
+ },
+ {
+ "date": "2026-04-27T17:00:00Z",
+ "commit": "model-research-sync",
+ "type": "model_change",
+ "from": "ollama-cloud/qwen3-coder:480b",
+ "to": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "Matrix score 88 for go-dev on V4-Pro. DeepSeek traditionally strong in Go/Rust.",
+ "source": "research"
}
],
"performance_log": []
},
"sdet-engineer": {
"current": {
- "description": "Writes tests following TDD methodology. Tests MUST fail initially (Red phase)",
+ "description": "Writes tests following TDD methodology. Tests MUST fail initially (Red phase) (GNS-2 Tier 1)",
"mode": "all",
"model": "ollama-cloud/qwen3-coder:480b",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#8B5CF6\"",
"category": "General",
- "capabilities": [
- "unit_tests",
- "integration_tests",
- "e2e_tests",
- "test_planning",
- "visual_regression"
- ]
+ "capabilities": []
},
"history": [],
"performance_log": []
},
"code-skeptic": {
"current": {
- "description": "Adversarial code reviewer. Finds problems and issues. Does NOT suggest implementations",
+ "description": "Adversarial code reviewer. Finds problems and issues. Does NOT suggest implementations (GNS-2 Tier 0)",
"mode": "subagent",
"model": "ollama-cloud/minimax-m2.5",
"provider": "Ollama",
"color": "\"#E11D48\"",
"category": "General",
- "capabilities": [
- "code_review",
- "security_review",
- "style_check",
- "issue_identification"
- ]
+ "capabilities": []
},
"history": [],
"performance_log": []
},
"security-auditor": {
"current": {
- "description": "Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets",
- "mode": "all",
- "model": "ollama-cloud/nemotron-3-super",
+ "description": "Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets (GNS-2 Tier 0)",
+ "mode": "subagent",
+ "model": "ollama-cloud/deepseek-v4-pro-max",
"provider": "Ollama",
- "color": "\"#7F1D1D\"",
+ "color": "\"#DC2626\"",
"category": "General",
- "capabilities": [
- "vulnerability_scan",
- "owasp_check",
- "secret_detection",
- "auth_review"
- ]
+ "capabilities": []
},
"history": [
{
@@ -192,26 +165,30 @@
"type": "model_change",
"from": "ollama-cloud/deepseek-v3.2",
"to": "ollama-cloud/nemotron-3-super",
- "reason": "Nemotron 3 Super optimized for security analysis with RULER@1M",
+ "reason": "Nemotron 3 Super optimized for security analysis",
"source": "git"
+ },
+ {
+ "date": "2026-05-24T01:00:00Z",
+ "commit": "ollama-cloud-consolidation",
+ "type": "model_change",
+ "from": "ollama-cloud/nemotron-3-super",
+ "to": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "V4-Pro Max matrix=80 vs nemotron=76. SWE-V 80.6, 1M context.",
+ "source": "orchestrator-analysis"
}
],
"performance_log": []
},
"performance-engineer": {
"current": {
- "description": "Reviews code for performance issues. Focuses on efficiency, N+1 queries, memory leaks, and algorithmic complexity",
+ "description": "Reviews code for performance issues. Focuses on efficiency, N+1 queries, memory leaks, and algorithmic complexity (GNS-2 Tier 0)",
"mode": "all",
- "model": "ollama-cloud/nemotron-3-super",
+ "model": "ollama-cloud/deepseek-v4-pro-max",
"provider": "Ollama",
"color": "\"#0D9488\"",
"category": "General",
- "capabilities": [
- "performance_analysis",
- "n_plus_one_detection",
- "memory_leak_check",
- "algorithm_analysis"
- ]
+ "capabilities": []
},
"history": [
{
@@ -222,68 +199,54 @@
"to": "ollama-cloud/nemotron-3-super",
"reason": "Better reasoning for performance analysis",
"source": "git"
+ },
+ {
+ "date": "2026-05-24T01:00:00Z",
+ "commit": "ollama-cloud-consolidation",
+ "type": "model_change",
+ "from": "ollama-cloud/nemotron-3-super",
+ "to": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "Matrix=84 for perf-engineer on V4-Pro. GPQA 90.1 for reasoning.",
+ "source": "orchestrator-analysis"
}
],
"performance_log": []
},
"browser-automation": {
"current": {
- "description": "Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction",
+ "description": "Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction (GNS-2 Tier 0)",
"mode": "subagent",
"model": "ollama-cloud/qwen3-coder:480b",
"provider": "Ollama",
"color": "\"#1E88E5\"",
"category": "General",
- "capabilities": [
- "e2e_browser_tests",
- "form_filling",
- "navigation_testing",
- "screenshot_capture"
- ]
+ "capabilities": []
},
"history": [],
"performance_log": []
},
"visual-tester": {
"current": {
- "description": "Visual regression testing agent that captures screenshots, extracts UI elements with bounding boxes, compares via pixelmatch, and detects console/network errors",
+ "description": "Visual regression testing agent that compares screenshots and detects UI differences using pixelmatch and image diff (GNS-2 Tier 0)",
"mode": "subagent",
"model": "ollama-cloud/qwen3-coder:480b",
"provider": "Ollama",
"color": "\"#E91E63\"",
"category": "General",
- "capabilities": [
- "visual_regression",
- "pixel_comparison",
- "screenshot_diff",
- "ui_validation",
- "bbox_element_extraction",
- "console_error_detection",
- "network_error_detection",
- "responsive_layout_check",
- "button_overflow_detection",
- "gitea_integration",
- "docker_networking"
- ]
+ "capabilities": []
},
"history": [],
"performance_log": []
},
"system-analyst": {
"current": {
- "description": "Designs technical specifications, data schemas, and API contracts before implementation",
+ "description": "Designs technical specifications, data schemas, and API contracts before implementation (GNS-2 Tier 1)",
"mode": "subagent",
- "model": "ollama-cloud/nemotron-3-super",
+ "model": "ollama-cloud/deepseek-v4-pro-max",
"provider": "Ollama",
- "variant": "thinking",
"color": "\"#0891B2\"",
"category": "General",
- "capabilities": [
- "architecture_design",
- "api_specification",
- "database_modeling",
- "technical_documentation"
- ]
+ "capabilities": []
},
"history": [
{
@@ -292,11 +255,11 @@
"type": "model_change",
"from": "ollama-cloud/gpt-oss:120b",
"to": "ollama-cloud/glm-5",
- "reason": "GLM-5 better for system engineering and architecture",
+ "reason": "GLM-5 better for system engineering",
"source": "git"
},
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "ollama-cloud/glm-5",
@@ -305,32 +268,44 @@
"source": "git"
},
{
- "date": "2026-04-27T16:59:52.825Z",
+ "date": "2026-04-27T16:59:52Z",
"commit": "model-research-sync",
"type": "model_change",
"from": "ollama-cloud/glm-5.1",
"to": "ollama-cloud/nemotron-3-super",
"reason": "Test recommendation for model research sync script",
"source": "research"
+ },
+ {
+ "date": "2026-05-24T01:00:00Z",
+ "commit": "ollama-cloud-consolidation",
+ "type": "model_change",
+ "from": "ollama-cloud/nemotron-3-super",
+ "to": "ollama-cloud/glm-5.1",
+ "reason": "Reverted: GLM-5.1 Arena ELO 1451, instruction following ~90. Standardization with 12 other agents.",
+ "source": "orchestrator-analysis"
+ },
+ {
+ "date": "2026-05-25T13:37:20.281Z",
+ "commit": "sync",
+ "type": "model_change",
+ "from": "ollama-cloud/glm-5.1",
+ "to": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "Model update from sync",
+ "source": "git"
}
],
"performance_log": []
},
"requirement-refiner": {
"current": {
- "description": "Converts vague ideas and bug reports into strict User Stories with acceptance criteria checklists",
+ "description": "Converts vague ideas and bug reports into strict User Stories with acceptance criteria checklists (GNS-2 Tier 1)",
"mode": "all",
- "model": "ollama-cloud/glm-5.1",
+ "model": "ollama-cloud/kimi-k2-thinking",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#4F46E5\"",
- "category": "General",
- "capabilities": [
- "requirement_analysis",
- "user_story_creation",
- "acceptance_criteria",
- "clarification"
- ]
+ "category": "General"
},
"history": [
{
@@ -339,39 +314,51 @@
"type": "model_change",
"from": "ollama-cloud/nemotron-3-super",
"to": "ollama-cloud/glm-5",
- "reason": "+33% quality. GLM-5 excels at requirement analysis and system engineering",
+ "reason": "+33% quality. GLM-5 excels at requirement analysis",
"source": "research"
},
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "ollama-cloud/glm-5",
"to": "ollama-cloud/glm-5.1",
"reason": "Model update from sync",
"source": "git"
+ },
+ {
+ "date": "2026-05-24T01:00:00Z",
+ "commit": "ollama-cloud-consolidation",
+ "type": "model_change",
+ "from": "ollama-cloud/glm-5.1",
+ "to": "ollama-cloud/kimi-k2.6",
+ "reason": "kimi-k2.6 IF=91 highest, multimodal for mockup understanding. Matrix ~88-90 for req-refiner.",
+ "source": "orchestrator-analysis"
+ },
+ {
+ "date": "2026-05-23T23:35:02.184Z",
+ "commit": "sync",
+ "type": "model_change",
+ "from": "ollama-cloud/kimi-k2.6",
+ "to": "ollama-cloud/kimi-k2-thinking",
+ "reason": "Model update from sync",
+ "source": "git"
}
],
"performance_log": []
},
"history-miner": {
"current": {
- "description": "Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work",
+ "description": "Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work (GNS-2 Tier 0)",
"mode": "subagent",
"model": "ollama-cloud/nemotron-3-super",
"provider": "Ollama",
"color": "\"#059669\"",
- "category": "General",
- "capabilities": [
- "git_search",
- "duplicate_detection",
- "past_solution_finder",
- "pattern_identification"
- ]
+ "category": "General"
},
"history": [
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "ollama-cloud/glm-5",
@@ -384,18 +371,13 @@
},
"capability-analyst": {
"current": {
- "description": "Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components.",
+ "description": "Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. Tier 2 meta-agent with self-cascade enabled.",
"mode": "subagent",
"model": "ollama-cloud/glm-5.1",
"provider": "Ollama",
"color": "\"#6366F1\"",
"category": "General",
- "capabilities": [
- "gap_analysis",
- "capability_mapping",
- "recommendation_generation",
- "coverage_analysis"
- ]
+ "capabilities": []
},
"history": [
{
@@ -404,11 +386,11 @@
"type": "model_change",
"from": "ollama-cloud/nemotron-3-super",
"to": "openrouter/qwen/qwen3.6-plus:free",
- "reason": "+23% quality, IF:90 score, 1M context, FREE via OpenRouter",
+ "reason": "+23% quality, IF:90, FREE via OpenRouter",
"source": "research"
},
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "openrouter/qwen/qwen3.6-plus:free",
@@ -421,51 +403,50 @@
},
"orchestrator": {
"current": {
- "description": "Main dispatcher. Routes tasks between agents based on Issue status and manages the workflow state machine. IF:90 for optimal routing accuracy.",
+ "description": "Main dispatcher. Routes tasks between agents based on Issue status and manages the workflow state machine. IF:90 for optimal routing accuracy. (GNS-2 Tier 1)",
"mode": "all",
- "model": "ollama-cloud/glm-5.1",
+ "model": "ollama-cloud/kimi-k2.6",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#7C3AED\"",
"category": "General",
- "capabilities": [
- "task_routing",
- "state_management",
- "agent_coordination",
- "workflow_execution"
- ]
+ "capabilities": []
},
"history": [
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "ollama-cloud/glm-5",
"to": "ollama-cloud/glm-5.1",
"reason": "Model update from sync",
"source": "git"
+ },
+ {
+ "date": "2026-04-27T20:28:58Z",
+ "commit": "model-research-sync",
+ "type": "model_change",
+ "from": "ollama-cloud/glm-5.1",
+ "to": "ollama-cloud/kimi-k2.6",
+ "reason": "kimi-k2.6 best fit for orchestration (92). 300 sub-agent swarm.",
+ "source": "research"
}
],
"performance_log": []
},
"release-manager": {
"current": {
- "description": "Manages git operations, semantic versioning, branching, and deployments. Ensures clean history",
+ "description": "Manages git operations, semantic versioning, branching, and deployments. Ensures clean history (GNS-2 Tier 1)",
"mode": "subagent",
"model": "ollama-cloud/glm-5.1",
"provider": "Ollama",
"color": "\"#581C87\"",
"category": "General",
- "capabilities": [
- "git_operations",
- "version_management",
- "changelog_creation",
- "deployment"
- ]
+ "capabilities": []
},
"history": [
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "ollama-cloud/devstral-2:123b",
@@ -478,19 +459,14 @@
},
"evaluator": {
"current": {
- "description": "Scores agent effectiveness after task completion for continuous improvement",
+ "description": "Scores agent effectiveness after task completion for continuous improvement. Tier 2 meta-agent with self-cascade enabled.",
"mode": "subagent",
- "model": "ollama-cloud/glm-5.1",
+ "model": "ollama-cloud/qwen3.5-122b",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#047857\"",
"category": "General",
- "capabilities": [
- "performance_scoring",
- "process_analysis",
- "pattern_identification",
- "improvement_recommendations"
- ]
+ "capabilities": []
},
"history": [
{
@@ -512,31 +488,35 @@
"source": "research"
},
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "openrouter/qwen/qwen3.6-plus:free",
"to": "ollama-cloud/glm-5.1",
"reason": "Model update from sync",
"source": "git"
+ },
+ {
+ "date": "2026-05-25T13:37:20.281Z",
+ "commit": "sync",
+ "type": "model_change",
+ "from": "ollama-cloud/glm-5.1",
+ "to": "ollama-cloud/qwen3.5-122b",
+ "reason": "Model update from sync",
+ "source": "git"
}
],
"performance_log": []
},
"prompt-optimizer": {
"current": {
- "description": "Improves agent system prompts based on performance failures. Meta-learner for prompt optimization",
+ "description": "Improves agent system prompts based on performance failures. Meta-learner for prompt optimization (GNS-2 Tier 1)",
"mode": "subagent",
- "model": "ollama-cloud/glm-5.1",
+ "model": "ollama-cloud/qwen3.5-122b",
"provider": "Ollama",
- "variant": "instant",
"color": "\"#BE185D\"",
"category": "General",
- "capabilities": [
- "prompt_analysis",
- "prompt_improvement",
- "failure_pattern_detection"
- ]
+ "capabilities": []
},
"history": [
{
@@ -549,48 +529,66 @@
"source": "git"
},
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "ollama-cloud/nemotron-3-super",
"to": "ollama-cloud/glm-5.1",
"reason": "Model update from sync",
"source": "git"
+ },
+ {
+ "date": "2026-05-24T01:00:00Z",
+ "commit": "ollama-cloud-consolidation",
+ "type": "model_change",
+ "from": "ollama-cloud/glm-5.1",
+ "to": "ollama-cloud/qwen3.5",
+ "reason": "MIGRATION: qwen3.6-plus was OpenRouter (not Ollama Cloud). qwen3.5 has IF=92, updated 2 days ago, 12.4M pulls.",
+ "source": "orchestrator-analysis"
+ },
+ {
+ "date": "2026-05-23T23:35:02.184Z",
+ "commit": "sync",
+ "type": "model_change",
+ "from": "ollama-cloud/qwen3.5",
+ "to": "ollama-cloud/qwen3.6-plus",
+ "reason": "Model update from sync",
+ "source": "git"
+ },
+ {
+ "date": "2026-05-25T13:37:20.281Z",
+ "commit": "sync",
+ "type": "model_change",
+ "from": "ollama-cloud/qwen3.6-plus",
+ "to": "ollama-cloud/qwen3.5-122b",
+ "reason": "Model update from sync",
+ "source": "git"
}
],
"performance_log": []
},
"the-fixer": {
"current": {
- "description": "Iteratively fixes bugs based on specific error reports and test failures",
+ "description": "Iteratively fixes bugs based on specific error reports and test failures (GNS-2 Tier 1)",
"mode": "all",
- "model": "ollama-cloud/minimax-m2.5",
+ "model": "ollama-cloud/kimi-k2.6",
"provider": "Ollama",
"color": "\"#F59E0B\"",
"category": "General",
- "capabilities": [
- "bug_fixing",
- "issue_resolution",
- "code_correction"
- ]
+ "capabilities": []
},
"history": [],
"performance_log": []
},
"product-owner": {
"current": {
- "description": "Manages issue checklists, status labels, tracks progress and coordinates with human users",
+ "description": "Manages issue checklists, status labels, tracks progress and coordinates with human users (GNS-2 Tier 1)",
"mode": "subagent",
"model": "ollama-cloud/glm-5.1",
"provider": "Ollama",
"color": "\"#EA580C\"",
"category": "General",
- "capabilities": [
- "issue_management",
- "prioritization",
- "backlog_management",
- "workflow_completion"
- ]
+ "capabilities": []
},
"history": [
{
@@ -603,7 +601,7 @@
"source": "git"
},
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "ollama-cloud/glm-5",
@@ -616,45 +614,46 @@
},
"workflow-architect": {
"current": {
- "description": "Creates and maintains workflow definitions with complete architecture, Gitea integration, and quality gates",
+ "description": "Creates and maintains workflow definitions with complete architecture, Gitea integration, and quality gates (GNS-2 Tier 1)",
"mode": "subagent",
- "model": "ollama-cloud/glm-5.1",
+ "model": "ollama-cloud/qwen3.5-122b",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#EC4899\"",
"category": "General",
- "capabilities": [
- "workflow_design",
- "process_definition",
- "automation_setup"
- ]
+ "capabilities": []
},
"history": [
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "ollama-cloud/glm-5",
"to": "ollama-cloud/glm-5.1",
"reason": "Model update from sync",
"source": "git"
+ },
+ {
+ "date": "2026-05-25T13:37:20.281Z",
+ "commit": "sync",
+ "type": "model_change",
+ "from": "ollama-cloud/glm-5.1",
+ "to": "ollama-cloud/qwen3.5-122b",
+ "reason": "Model update from sync",
+ "source": "git"
}
],
"performance_log": []
},
"markdown-validator": {
"current": {
- "description": "Validates and corrects Markdown descriptions for Gitea issues",
+ "description": "Validates and corrects Markdown descriptions for Gitea issues (GNS-2 Tier 0)",
"mode": "subagent",
- "model": "ollama-cloud/nemotron-3-nano:30b",
+ "model": "ollama-cloud/deepseek-v4-pro-max",
"provider": "Ollama",
"color": "\"#F97316\"",
"category": "General",
- "capabilities": [
- "markdown_validation",
- "formatting_check",
- "link_validation"
- ]
+ "capabilities": []
},
"history": [
{
@@ -665,6 +664,24 @@
"to": "ollama-cloud/nemotron-3-nano:30b",
"reason": "Nano efficient for lightweight validation tasks",
"source": "git"
+ },
+ {
+ "date": "2026-05-24T01:00:00Z",
+ "commit": "ollama-cloud-consolidation",
+ "type": "model_change",
+ "from": "ollama-cloud/nemotron-3-nano:30b",
+ "to": "ollama-cloud/nemotron-3-nano",
+ "reason": "Unified naming. Nano IF=68, tiny and cheap, perfect for validation.",
+ "source": "orchestrator-analysis"
+ },
+ {
+ "date": "2026-05-23T23:35:02.185Z",
+ "commit": "sync",
+ "type": "model_change",
+ "from": "ollama-cloud/nemotron-3-nano",
+ "to": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "Model update from sync",
+ "source": "git"
}
],
"performance_log": []
@@ -673,17 +690,12 @@
"current": {
"name": "Agent Architect",
"mode": "subagent",
- "model": "ollama-cloud/glm-5.1",
+ "model": "ollama-cloud/kimi-k2.6",
"provider": "Ollama",
- "variant": "thinking",
- "description": "Creates, modifies, and reviews new agents, workflows, and skills based on capability gap analysis",
+ "description": "Creates, modifies, and reviews new agents, workflows, and skills based on capability gap analysis. Tier 2 meta-agent with self-cascade enabled.",
"color": "\"#8B5CF6\"",
"category": "General",
- "capabilities": [
- "agent_design",
- "prompt_engineering",
- "capability_definition"
- ]
+ "capabilities": []
},
"history": [
{
@@ -692,36 +704,39 @@
"type": "model_change",
"from": "ollama-cloud/nemotron-3-super",
"to": "openrouter/qwen/qwen3.6-plus:free",
- "reason": "+22% quality, IF:90 for YAML frontmatter generation, 1M context for all agents analysis",
+ "reason": "+22% quality, IF:90 for YAML frontmatter generation",
"source": "research"
},
{
- "date": "2026-04-23T06:24:32.546Z",
+ "date": "2026-04-23T06:24:32Z",
"commit": "sync",
"type": "model_change",
"from": "openrouter/qwen/qwen3.6-plus:free",
"to": "ollama-cloud/glm-5.1",
"reason": "Model update from sync",
"source": "git"
+ },
+ {
+ "date": "2026-05-24T01:00:00Z",
+ "commit": "ollama-cloud-consolidation",
+ "type": "model_change",
+ "from": "ollama-cloud/glm-5.1",
+ "to": "ollama-cloud/kimi-k2.6",
+ "reason": "kimi-k2.6 best fit for agent-architect (86). Multimodal for reviewing UI components.",
+ "source": "orchestrator-analysis"
}
],
"performance_log": []
},
"planner": {
"current": {
- "description": "Advanced task planner using Chain of Thought, Tree of Thoughts, and Plan-Execute-Reflect",
+ "description": "Advanced task planner using Chain of Thought, Tree of Thoughts, and Plan-Execute-Reflect (GNS-2 Tier 0)",
"mode": "subagent",
- "model": "ollama-cloud/nemotron-3-super",
+ "model": "ollama-cloud/deepseek-v4-pro-max",
"provider": "Ollama",
"color": "\"#F59E0B\"",
"category": "General",
- "capabilities": [
- "task_decomposition",
- "chain_of_thought",
- "tree_of_thoughts",
- "plan_execute_reflect",
- "dependency_analysis"
- ]
+ "capabilities": []
},
"history": [
{
@@ -732,25 +747,28 @@
"to": "ollama-cloud/nemotron-3-super",
"reason": "Nemotron 3 Super excels at planning",
"source": "git"
+ },
+ {
+ "date": "2026-04-27T17:00:00Z",
+ "commit": "model-research-sync",
+ "type": "model_change",
+ "from": "ollama-cloud/nemotron-3-super",
+ "to": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "Matrix score 88 for planner on V4-Pro. GPQA 90.1.",
+ "source": "research"
}
],
"performance_log": []
},
"reflector": {
"current": {
- "description": "Self-reflection agent using Reflexion pattern - learns from mistakes",
+ "description": "Self-reflection agent using Reflexion pattern - learns from mistakes (GNS-2 Tier 0)",
"mode": "subagent",
- "model": "ollama-cloud/nemotron-3-super",
+ "model": "ollama-cloud/deepseek-v4-pro-max",
"provider": "Ollama",
"color": "\"#10B981\"",
"category": "General",
- "capabilities": [
- "self_reflection",
- "mistake_analysis",
- "lesson_extraction",
- "trajectory_analysis",
- "heuristic_evaluation"
- ]
+ "capabilities": []
},
"history": [
{
@@ -761,25 +779,28 @@
"to": "ollama-cloud/nemotron-3-super",
"reason": "Better for reflection tasks",
"source": "git"
+ },
+ {
+ "date": "2026-04-27T17:00:00Z",
+ "commit": "model-research-sync",
+ "type": "model_change",
+ "from": "ollama-cloud/nemotron-3-super",
+ "to": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "Matrix score 84. Strong reasoning chains.",
+ "source": "research"
}
],
"performance_log": []
},
"memory-manager": {
"current": {
- "description": "Manages agent memory systems - short-term (context), long-term (vector store), and episodic (experiences)",
+ "description": "Manages agent memory systems - short-term (context), long-term (vector store), and episodic (experiences) (GNS-2 Tier 0)",
"mode": "subagent",
- "model": "ollama-cloud/nemotron-3-super",
+ "model": "ollama-cloud/deepseek-v4-pro-max",
"provider": "Ollama",
"color": "\"#8B5CF6\"",
"category": "General",
- "capabilities": [
- "memory_retrieval",
- "memory_storage",
- "memory_consolidation",
- "relevance_scoring",
- "episodic_management"
- ]
+ "capabilities": []
},
"history": [
{
@@ -790,44 +811,59 @@
"to": "ollama-cloud/nemotron-3-super",
"reason": "RULER@1M critical for memory ctx",
"source": "git"
+ },
+ {
+ "date": "2026-05-24T01:00:00Z",
+ "commit": "ollama-cloud-consolidation",
+ "type": "model_change",
+ "from": "ollama-cloud/nemotron-3-super",
+ "to": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "MIGRATION: qwen3.6-plus was OpenRouter. deepseek-v4-pro-max has 1M context (same as nemotron), matrix 86, SWE-V 80.6.",
+ "source": "orchestrator-analysis"
+ },
+ {
+ "date": "2026-05-23T23:35:02.184Z",
+ "commit": "sync",
+ "type": "model_change",
+ "from": "ollama-cloud/deepseek-v4-pro-max",
+ "to": "ollama-cloud/qwen3.6-plus",
+ "reason": "Model update from sync",
+ "source": "git"
+ },
+ {
+ "date": "2026-05-25T13:37:20.281Z",
+ "commit": "sync",
+ "type": "model_change",
+ "from": "ollama-cloud/qwen3.6-plus",
+ "to": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "Model update from sync",
+ "source": "git"
}
],
"performance_log": []
},
"devops-engineer": {
"current": {
- "description": "DevOps specialist for Docker, Kubernetes, CI/CD pipeline automation, and infrastructure management",
+ "description": "DevOps specialist for Docker, Kubernetes, CI/CD pipeline automation, and infrastructure management (GNS-2 Tier 1)",
"mode": "subagent",
- "model": "ollama-cloud/nemotron-3-super",
+ "model": "ollama-cloud/kimi-k2.6",
"provider": "Ollama",
"color": "\"#FF6B35\"",
"category": "General",
- "capabilities": [
- "docker_configuration",
- "kubernetes_setup",
- "ci_cd_pipeline",
- "infrastructure_automation",
- "container_optimization"
- ]
+ "capabilities": []
},
"history": [],
"performance_log": []
},
"flutter-developer": {
"current": {
- "description": "Flutter mobile specialist for cross-platform apps, state management, and UI components",
+ "description": "Flutter mobile specialist for cross-platform apps, state management, and UI components (GNS-2 Tier 1)",
"mode": "subagent",
"model": "ollama-cloud/qwen3-coder:480b",
"provider": "Ollama",
"color": "\"#02569B\"",
"category": "General",
- "capabilities": [
- "dart_programming",
- "flutter_ui",
- "mobile_app_development",
- "widget_creation",
- "state_management"
- ]
+ "capabilities": []
},
"history": [
{
@@ -844,100 +880,153 @@
},
"architect-indexer": {
"current": {
- "description": "Indexes and maps project codebase architecture into .architect/ directory. Creates and maintains structured documentation of entities, APIs, DB schema, file graphs, and conventions.",
+ "description": "Indexes and maps project codebase architecture into .architect/ directory. Creates and maintains structured documentation of entities, APIs, DB schema, file graphs, and conventions. (GNS-2 Tier 0)",
"mode": "subagent",
"model": "ollama-cloud/glm-5.1",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#10B981\"",
"category": "General",
- "capabilities": [
- "codebase_indexing",
- "project_mapping",
- "architecture_documentation",
- "dependency_analysis",
- "entity_extraction",
- "api_surface_discovery",
- "convention_detection",
- "staleness_detection"
- ]
+ "capabilities": []
},
"history": [],
"performance_log": []
},
"php-developer": {
"current": {
- "description": "PHP backend specialist for Laravel, Symfony, WordPress, and full-stack web applications",
+ "description": "PHP backend specialist for Laravel, Symfony, WordPress, and full-stack web applications (GNS-2 Tier 1)",
"mode": "subagent",
"model": "ollama-cloud/qwen3-coder:480b",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#8B5CF6\"",
"category": "General",
- "capabilities": [
- "php_web_development",
- "laravel_development",
- "symfony_development",
- "wordpress_development",
- "php_api_development",
- "php_database_design",
- "php_authentication",
- "php_modular_architecture",
- "php_testing",
- "php_security"
- ]
+ "capabilities": []
},
"history": [],
"performance_log": []
},
"pipeline-judge": {
"current": {
- "description": "Automated pipeline judge. Evaluates workflow execution by running tests, measuring token cost and wall-clock time. Produces objective fitness scores. Never writes code - only measures and scores.",
+ "description": "Automated pipeline judge. Evaluates workflow execution by running tests, measuring token cost and wall-clock time. Produces objective fitness scores. Never writes code - only measures and scores. (GNS-2 Tier 0)",
"mode": "subagent",
- "model": "ollama-cloud/glm-5.1",
+ "model": "ollama-cloud/kimi-k2.6",
"provider": "Ollama",
"color": "\"#DC2626\"",
"category": "General",
- "capabilities": [
- "test_execution",
- "fitness_scoring",
- "metric_collection",
- "bottleneck_detection"
- ]
+ "capabilities": []
},
"history": [
{
- "date": "2026-04-06T00:23:50 +0100Z",
+ "date": "2026-04-06T00:23:50+0100Z",
"commit": "fa68141d",
"type": "agent_created",
"from": null,
"to": "",
"reason": "feat: add pipeline-judge agent and evolution workflow system",
"source": "git"
+ },
+ {
+ "date": "2026-05-25T13:37:20.281Z",
+ "commit": "sync",
+ "type": "model_change",
+ "from": "ollama-cloud/glm-5.1",
+ "to": "ollama-cloud/kimi-k2.6",
+ "reason": "Model update from sync",
+ "source": "git"
}
],
"performance_log": []
},
"python-developer": {
"current": {
- "description": "Python backend specialist for Django, FastAPI, data science, and API development",
+ "description": "Python backend specialist for Django, FastAPI, data science, and API development (GNS-2 Tier 1)",
"mode": "subagent",
"model": "ollama-cloud/qwen3-coder:480b",
"provider": "Ollama",
"variant": "thinking",
"color": "\"#3776AB\"",
"category": "General",
- "capabilities": [
- "python_web_development",
- "django_development",
- "fastapi_development",
- "python_api_development",
- "python_database_design",
- "python_authentication",
- "python_async_patterns",
- "python_testing",
- "python_security"
- ]
+ "capabilities": []
+ },
+ "history": [],
+ "performance_log": []
+ },
+ "incident-responder": {
+ "current": {
+ "description": "Server incident response and system hardening specialist. Handles live forensics, malware removal, persistence hunting, SSH-based server cleanup, and post-incident hardening. Works with any OS and panel.",
+ "mode": "subagent",
+ "model": "ollama-cloud/kimi-k2.6",
+ "provider": "Ollama",
+ "color": "\"#B91C1C\"",
+ "category": "General",
+ "capabilities": []
+ },
+ "history": [],
+ "performance_log": []
+ },
+ "workflow-cross-checker": {
+ "current": {
+ "description": "Workflow cross-checker and process inspector. Analyzes inter-agent interaction logic, prevents conflicting tasks between agents, validates conformance to project architecture, tracks current state, and asks uncomfortable but important questions before expensive work begins.",
+ "mode": "subagent",
+ "model": "ollama-cloud/kimi-k2.6",
+ "provider": "Ollama",
+ "variant": "thinking",
+ "color": "\"#9333EA\"",
+ "category": "General",
+ "capabilities": []
+ },
+ "history": [],
+ "performance_log": []
+ },
+ "code": {
+ "current": {
+ "model": "ollama-cloud/qwen3-coder:480b",
+ "provider": "Ollama",
+ "category": "Built-in",
+ "mode": "primary",
+ "color": "#3B82F6",
+ "description": "Primary code writer. Full tool access for development tasks.",
+ "capabilities": []
+ },
+ "history": [],
+ "performance_log": []
+ },
+ "ask": {
+ "current": {
+ "model": "ollama-cloud/glm-5.1",
+ "provider": "Ollama",
+ "category": "Built-in",
+ "mode": "primary",
+ "color": "#3B82F6",
+ "description": "Read-only Q&A agent for codebase questions.",
+ "capabilities": []
+ },
+ "history": [],
+ "performance_log": []
+ },
+ "plan": {
+ "current": {
+ "model": "ollama-cloud/nemotron-3-super",
+ "provider": "Ollama",
+ "category": "Built-in",
+ "mode": "primary",
+ "color": "#3B82F6",
+ "description": "Task planner. Creates detailed implementation plans.",
+ "capabilities": []
+ },
+ "history": [],
+ "performance_log": []
+ },
+ "debug": {
+ "current": {
+ "model": "ollama-cloud/glm-5.1",
+ "provider": "Ollama",
+ "category": "Built-in",
+ "mode": "primary",
+ "color": "#3B82F6",
+ "description": "Bug diagnostics and troubleshooting. GLM-5.1 ★88, reasoning for deep debug.",
+ "capabilities": []
},
"history": [],
"performance_log": []
@@ -955,10 +1044,10 @@
}
},
"evolution_metrics": {
- "total_agents": 32,
+ "total_agents": 38,
"agents_with_history": 22,
"pending_recommendations": 0,
- "last_sync": "2026-04-23T06:24:32.546Z",
+ "last_sync": "2026-05-25T13:37:20.282Z",
"sync_sources": [
"git",
"capability-index.yaml",
diff --git a/agent-evolution/data/model-benchmarks.json b/agent-evolution/data/model-benchmarks.json
index c17d33f..96253bf 100644
--- a/agent-evolution/data/model-benchmarks.json
+++ b/agent-evolution/data/model-benchmarks.json
@@ -1,1718 +1,851 @@
-{
- "version": "1.0.0",
- "generated": "2026-04-30T07:00:00Z",
- "source": "capability-index.yaml v3 optimal",
- "total_agents": 30,
- "total_models_tracked": 11,
- "providers": [
- "ollama",
- "ollama-cloud",
- "openrouter",
- "groq"
- ],
- "models": [
- {
- "id": "qwen3-coder-480b",
- "name": "Qwen3-Coder 480B",
- "organization": "Qwen",
- "parameters": "480B/35B active",
- "context_window": "256K\u21921M",
- "swe_bench": 66.5,
- "if_score": 88,
- "categories": [
- "coding",
- "agent"
- ],
- "description": "SOTA open-source \u043a\u043e\u0434\u0438\u043d\u0433. \u0421\u0440\u0430\u0432\u043d\u0438\u043c \u0441 Claude Sonnet 4.",
- "tags": [
- "coding",
- "agent",
- "tools"
- ],
- "openrouter": false,
- "provider": "ollama"
- },
- {
- "id": "minimax-m2.5",
- "name": "MiniMax M2.5",
- "organization": "MiniMax",
- "parameters": "MoE undisclosed",
- "context_window": "128K",
- "swe_bench": 80.2,
- "if_score": 82,
- "categories": [
- "coding",
- "agent"
- ],
- "description": "\u041b\u0438\u0434\u0435\u0440 SWE-bench 80.2%. \u041f\u043e\u043b\u043d\u044b\u0439 lifecycle \u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u043a\u0438.",
- "tags": [
- "coding",
- "agent"
- ],
- "openrouter": false,
- "provider": "ollama"
- },
- {
- "id": "minimax-m2.7",
- "name": "MiniMax M2.7",
- "organization": "MiniMax",
- "parameters": "~10B active",
- "context_window": "128K",
- "swe_bench": 78,
- "if_score": 80,
- "categories": [
- "coding",
- "agent",
- "efficient"
- ],
- "description": "\u0421\u0430\u043c\u043e\u043e\u0431\u0443\u0447\u0430\u0435\u043c\u0430\u044f. 56.2% SWE-Pro. 100 TPS. $0.30/M.",
- "tags": [
- "coding",
- "agent",
- "self-evolving"
- ],
- "openrouter": false,
- "provider": "ollama"
- },
- {
- "id": "deepseek-v4-pro-max",
- "name": "DeepSeek V4-Pro",
- "organization": "DeepSeek",
- "parameters": "1.6T/49B active MoE",
- "context_window": "1M",
- "swe_bench": 80.6,
- "if_score": 89,
- "categories": [
- "coding",
- "agent",
- "reasoning"
- ],
- "description": "SWE-V 80.6, LiveCodeBench 93.5(#1!), Terminal-Bench 67.9, Codeforces 3206, 1M ctx, 27% FLOPs vs V3.2. MIT.",
- "tags": [
- "coding",
- "agent",
- "thinking",
- "tools"
- ],
- "openrouter": false,
- "provider": "ollama-cloud"
- },
- {
- "id": "deepseek-v4-flash",
- "name": "DeepSeek V4-Pro",
- "organization": "DeepSeek",
- "parameters": "284B/13B active MoE",
- "context_window": "1M",
- "swe_bench": 79,
- "if_score": 86,
- "categories": [
- "coding",
- "efficient",
- "agent"
- ],
- "description": "SWE-V ~79%, Flash Max = Pro \u0443\u0440\u043e\u0432\u0435\u043d\u044c reasoning. 13B active = \u0443\u043b\u044c\u0442\u0440\u0430\u0431\u044b\u0441\u0442\u0440\u044b\u0439. 1M ctx. FP4+FP8. MIT.",
- "tags": [
- "coding",
- "efficient",
- "agent",
- "thinking"
- ],
- "openrouter": false,
- "provider": "ollama-cloud"
- },
- {
- "id": "kimi-k2-6",
- "name": "Kimi K2.6",
- "organization": "Moonshot AI",
- "parameters": "1T/32B active MoE",
- "context_window": "256K",
- "swe_bench": 80.2,
- "if_score": 91,
- "categories": [
- "coding",
- "agent",
- "multimodal"
- ],
- "description": "SWE-Pro 58.6(#1!), SWE-V 80.2, Terminal-Bench 66.7, HLE 54.0(#1!), BrowseComp 83.2. 13h autonomous. 300 sub-agent swarm. Modified MIT.",
- "tags": [
- "coding",
- "agent",
- "swarm",
- "vision",
- "thinking",
- "tools"
- ],
- "openrouter": false,
- "provider": "ollama-cloud"
- },
- {
- "id": "nemotron-3-super",
- "name": "Nemotron 3 Super",
- "organization": "NVIDIA",
- "parameters": "120B/12B active",
- "context_window": "1M",
- "swe_bench": 60.5,
- "if_score": 78,
- "categories": [
- "agent",
- "reasoning",
- "efficient"
- ],
- "description": "SWE-bench 60.5%. RULER@1M 91.75%! \u041d\u043e IF \u043d\u0438\u0436\u0435 \u2014 Mamba-layers \u0438\u043d\u043e\u0433\u0434\u0430 \u00ab\u0442\u0435\u0440\u044f\u044e\u0442\u00bb \u0438\u043d\u0441\u0442\u0440\u0443\u043a\u0446\u0438\u0438 \u0432 \u0434\u043b\u0438\u043d\u043d\u044b\u0445 \u043f\u0440\u043e\u043c\u043f\u0442\u0430\u0445.",
- "tags": [
- "agent",
- "1M-ctx",
- "thinking"
- ],
- "openrouter": false,
- "provider": "ollama"
- },
- {
- "id": "glm-5.1",
- "name": "GLM-5",
- "organization": "Z.ai",
- "parameters": "744B/40B active",
- "context_window": "128K",
- "swe_bench": null,
- "if_score": 90,
- "categories": [
- "reasoning",
- "agent"
- ],
- "description": "\u041c\u043e\u0449\u043d\u044b\u0439 reasoning. Arena ELO 1451. \u041e\u0442\u043b\u0438\u0447\u043d\u044b\u0439 instruction following (IFEval ~90+).",
- "tags": [
- "reasoning",
- "agent"
- ],
- "openrouter": false,
- "provider": "ollama"
- },
- {
- "id": "deepseek-v4",
- "name": "DeepSeek V4-Pro",
- "organization": "DeepSeek",
- "parameters": "Large MoE",
- "context_window": "128K",
- "swe_bench": null,
- "if_score": 75,
- "categories": [
- "reasoning"
- ],
- "description": "\u0425\u043e\u0440\u043e\u0448\u0438\u0439 reasoning, \u043d\u043e IF \u043d\u0435\u0441\u0442\u0430\u0431\u0438\u043b\u0435\u043d \u2014 \u0438\u043d\u043e\u0433\u0434\u0430 \u0438\u0433\u043d\u043e\u0440\u0438\u0440\u0443\u0435\u0442 \u0444\u043e\u0440\u043c\u0430\u0442 \u0432\u044b\u0432\u043e\u0434\u0430.",
- "tags": [
- "reasoning"
- ],
- "openrouter": false,
- "provider": "ollama"
- },
- {
- "id": "qwen3-5-122b",
- "name": "Qwen 3.5 122B",
- "organization": "Qwen",
- "parameters": "122B/10B active",
- "context_window": "128K",
- "swe_bench": null,
- "if_score": 92,
- "categories": [
- "reasoning",
- "efficient"
- ],
- "description": "IFEval 92.6%! \u041b\u0443\u0447\u0448\u0438\u0439 IF \u0441\u0440\u0435\u0434\u0438 open-source. Multimodal. Thinking.",
- "tags": [
- "vision",
- "thinking",
- "tools"
- ],
- "openrouter": false,
- "provider": "ollama"
- },
- {
- "id": "qwen3-coder-next",
- "name": "Qwen3-Coder-Next",
- "organization": "Qwen",
- "parameters": "80B/3B active",
- "context_window": "128K",
- "swe_bench": 70,
- "if_score": 84,
- "categories": [
- "coding",
- "efficient"
- ],
- "description": "70% SWE-bench \u0441 3B active! \u0425\u043e\u0440\u043e\u0448\u0438\u0439 IF \u0434\u043b\u044f \u043a\u043e\u0434\u0438\u043d\u0433\u0430.",
- "tags": [
- "coding",
- "efficient",
- "tools"
- ],
- "openrouter": false,
- "provider": "ollama"
- },
- {
- "id": "cogito-2-1-671b",
- "name": "Cogito 2.1 671B",
- "organization": "Cognitive",
- "parameters": "671B MoE",
- "context_window": "128K",
- "swe_bench": null,
- "if_score": 76,
- "categories": [
- "reasoning"
- ],
- "description": "MIT \u043b\u0438\u0446\u0435\u043d\u0437\u0438\u044f. 671B total. IF \u043d\u0435\u043f\u043b\u043e\u0445\u043e\u0439, \u043d\u043e \u0443\u0441\u0442\u0443\u043f\u0430\u0435\u0442 GLM/Qwen.",
- "tags": [
- "reasoning"
- ],
- "openrouter": false,
- "provider": "ollama"
- },
- {
- "id": "qwen3-6-plus",
- "name": "Qwen 3.6 Plus",
- "organization": "Qwen",
- "parameters": "Hybrid MoE",
- "context_window": "1M",
- "swe_bench": 78.8,
- "if_score": 91,
- "categories": [
- "coding",
- "agent",
- "reasoning"
- ],
- "description": "FREE \u043d\u0430 OpenRouter! 1M \u043a\u043e\u043d\u0442\u0435\u043a\u0441\u0442. Always-on CoT. \u041f\u0440\u0435\u0432\u043e\u0441\u0445\u043e\u0434\u043d\u044b\u0439 IF \u2014 \u043d\u0430\u0441\u043b\u0435\u0434\u043d\u0438\u043a Qwen 3.5 (92.6%).",
- "tags": [
- "coding",
- "agent",
- "1M-ctx",
- "free"
- ],
- "openrouter": true,
- "provider": "openrouter"
- },
- {
- "id": "step-3-5-flash",
- "name": "Step 3.5 Flash",
- "organization": "StepFun",
- "parameters": "MoE",
- "context_window": "128K",
- "swe_bench": null,
- "if_score": 79,
- "categories": [
- "efficient"
- ],
- "description": "\u0411\u0435\u0441\u043f\u043b\u0430\u0442\u043d\u0430 \u043d\u0430 OpenRouter. IF \u0441\u0440\u0435\u0434\u043d\u0438\u0439.",
- "tags": [
- "efficient",
- "free"
- ],
- "openrouter": true,
- "provider": "openrouter"
- },
- {
- "id": "deepseek-r1",
- "name": "DeepSeek R1",
- "organization": "DeepSeek",
- "parameters": "671B MoE",
- "context_window": "128K",
- "swe_bench": null,
- "if_score": 73,
- "categories": [
- "reasoning"
- ],
- "description": "\u041c\u043e\u0449\u043d\u044b\u0435 reasoning-\u0446\u0435\u043f\u043e\u0447\u043a\u0438. \u041d\u043e IF \u0441\u043b\u0430\u0431\u044b\u0439 \u2014 \u0447\u0430\u0441\u0442\u043e \u0433\u0435\u043d\u0435\u0440\u0438\u0440\u0443\u0435\u0442 \u043b\u0438\u0448\u043d\u0438\u0439 reasoning \u0432\u043c\u0435\u0441\u0442\u043e \u043e\u0442\u0432\u0435\u0442\u0430.",
- "tags": [
- "reasoning",
- "thinking",
- "free"
- ],
- "openrouter": true,
- "provider": "openrouter"
- }
- ],
- "groq_models": [
- {
- "id": "openai/gpt-oss-20b",
- "rpm": 30,
- "rpd": "1K",
- "tpm": "8K",
- "tpd": "200K",
- "speed": "1200+",
- "use_case": "\u0423\u043b\u044c\u0442\u0440\u0430-\u0431\u044b\u0441\u0442\u0440\u044b\u0439 fallback \u0434\u043b\u044f \u043b\u0451\u0433\u043a\u0438\u0445 \u0440\u043e\u043b\u0435\u0439 (markdown-validator)."
- },
- {
- "id": "llama-3.1-8b-instant",
- "rpm": 30,
- "rpd": "14.4K",
- "tpm": "6K",
- "tpd": "500K",
- "speed": "~800",
- "use_case": "14.4K RPD! \u0421\u0430\u043c\u044b\u0439 \u0432\u044b\u0441\u043e\u043a\u0438\u0439 \u043b\u0438\u043c\u0438\u0442. \u0414\u043b\u044f health-check / ping \u0440\u043e\u043b\u0435\u0439."
- },
- {
- "id": "groq/compound",
- "rpm": 30,
- "rpd": "250",
- "tpm": "70K",
- "tpd": "\u2014",
- "speed": "varies",
- "use_case": "\u041c\u0443\u043b\u044c\u0442\u0438\u043c\u043e\u0434\u0435\u043b\u044c\u043d\u0430\u044f \u0430\u0433\u0440\u0435\u0433\u0430\u0446\u0438\u044f. \u0414\u043b\u044f research-\u0437\u0430\u0434\u0430\u0447."
- },
- {
- "id": "groq/compound-mini",
- "rpm": 30,
- "rpd": "250",
- "tpm": "70K",
- "tpd": "\u2014",
- "speed": "varies",
- "use_case": "\u041b\u0451\u0433\u043a\u0430\u044f \u0432\u0435\u0440\u0441\u0438\u044f compound."
- },
- {
- "id": "llama-prompt-guard-2",
- "rpm": 30,
- "rpd": "14.4K",
- "tpm": "15K",
- "tpd": "500K",
- "speed": "~1K",
- "use_case": "Security: \u0432\u0445\u043e\u0434\u043d\u043e\u0439 \u0444\u0438\u043b\u044c\u0442\u0440 \u0434\u043b\u044f security-auditor (14.4K RPD!)."
- }
- ],
- "agent_model_scores": [
- {
- "agent": "lead-developer",
- "current_model_index": 0,
- "current_model_id": "qwen3-coder-480b",
- "reasoning_effort": "H",
- "scores": {
- "qwen3-coder-480b": 92,
- "minimax-m2.5": 86,
- "minimax-m2.7": 82,
- "nemotron-3-super": 70,
- "glm-5.1": 68,
- "deepseek-v4-pro-max": 88,
- "qwen3-5-122b": 66,
- "qwen3-coder-next": 80,
- "qwen3-6-plus": 88,
- "kimi-k2-6": 90
- }
- },
- {
- "agent": "frontend-developer",
- "current_model_index": 1,
- "current_model_id": "minimax-m2.5",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 86,
- "minimax-m2.5": 92,
- "minimax-m2.7": 88,
- "nemotron-3-super": 62,
- "glm-5.1": 56,
- "deepseek-v4-pro-max": 82,
- "qwen3-5-122b": 60,
- "qwen3-coder-next": 76,
- "qwen3-6-plus": 88,
- "kimi-k2-6": 86
- }
- },
- {
- "agent": "php-developer",
- "current_model_index": 0,
- "current_model_id": "qwen3-coder-480b",
- "reasoning_effort": "H",
- "scores": {
- "qwen3-coder-480b": 87,
- "minimax-m2.5": 76,
- "minimax-m2.7": 72,
- "nemotron-3-super": 64,
- "glm-5.1": 56,
- "deepseek-v4-pro-max": 74,
- "qwen3-5-122b": 60,
- "qwen3-coder-next": 76,
- "qwen3-6-plus": 84,
- "kimi-k2-6": 86
- }
- },
- {
- "agent": "python-developer",
- "current_model_index": 0,
- "current_model_id": "qwen3-coder-480b",
- "reasoning_effort": "H",
- "scores": {
- "qwen3-coder-480b": 90,
- "minimax-m2.5": 82,
- "minimax-m2.7": 78,
- "nemotron-3-super": 66,
- "glm-5.1": 60,
- "deepseek-v4-pro-max": 78,
- "qwen3-5-122b": 64,
- "qwen3-coder-next": 78,
- "qwen3-6-plus": 88,
- "kimi-k2-6": 88
- }
- },
- {
- "agent": "backend-developer",
- "current_model_index": 0,
- "current_model_id": "qwen3-coder-480b",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 91,
- "minimax-m2.5": 84,
- "minimax-m2.7": 80,
- "nemotron-3-super": 68,
- "glm-5.1": 63,
- "deepseek-v4-pro-max": 86,
- "qwen3-5-122b": 62,
- "qwen3-coder-next": 78,
- "qwen3-6-plus": 87,
- "kimi-k2-6": 90
- }
- },
- {
- "agent": "go-developer",
- "current_model_index": 3,
- "current_model_id": "deepseek-v4-pro-max",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 85,
- "minimax-m2.5": 78,
- "minimax-m2.7": 74,
- "nemotron-3-super": 66,
- "glm-5.1": 58,
- "deepseek-v4-pro-max": 88,
- "qwen3-5-122b": 58,
- "qwen3-coder-next": 74,
- "qwen3-6-plus": 82,
- "kimi-k2-6": 86
- }
- },
- {
- "agent": "flutter-developer",
- "current_model_index": 0,
- "current_model_id": "qwen3-coder-480b",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 86,
- "minimax-m2.5": 70,
- "minimax-m2.7": 66,
- "nemotron-3-super": 60,
- "glm-5.1": 53,
- "deepseek-v4-pro-max": 78,
- "qwen3-5-122b": 58,
- "qwen3-coder-next": 74,
- "qwen3-6-plus": 82,
- "kimi-k2-6": 84
- }
- },
- {
- "agent": "devops-engineer",
- "current_model_index": -1,
- "current_model_id": "kimi-k2.6",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 66,
- "minimax-m2.5": 53,
- "minimax-m2.7": 48,
- "nemotron-3-super": 78,
- "glm-5.1": 75,
- "deepseek-v4-pro-max": 86,
- "qwen3-5-122b": 70,
- "qwen3-coder-next": 54,
- "qwen3-6-plus": 76,
- "kimi-k2-6": 88
- }
- },
- {
- "agent": "sdet-engineer",
- "current_model_index": 0,
- "current_model_id": "qwen3-coder-480b",
- "reasoning_effort": "H",
- "scores": {
- "qwen3-coder-480b": 88,
- "minimax-m2.5": 84,
- "minimax-m2.7": 80,
- "nemotron-3-super": 70,
- "glm-5.1": 63,
- "deepseek-v4-pro-max": 84,
- "qwen3-5-122b": 64,
- "qwen3-coder-next": 78,
- "qwen3-6-plus": 84,
- "kimi-k2-6": 87
- }
- },
- {
- "agent": "code-skeptic",
- "current_model_index": 1,
- "current_model_id": "minimax-m2.5",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 82,
- "minimax-m2.5": 85,
- "minimax-m2.7": 80,
- "nemotron-3-super": 73,
- "glm-5.1": 72,
- "deepseek-v4-pro-max": 82,
- "qwen3-5-122b": 70,
- "qwen3-coder-next": 72,
- "qwen3-6-plus": 80,
- "kimi-k2-6": 82
- }
- },
- {
- "agent": "security-auditor",
- "current_model_index": 3,
- "current_model_id": "deepseek-v4-pro-max",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 76,
- "minimax-m2.5": 74,
- "minimax-m2.7": 68,
- "nemotron-3-super": 76,
- "glm-5.1": 68,
- "deepseek-v4-pro-max": 80,
- "qwen3-5-122b": 72,
- "qwen3-coder-next": 64,
- "qwen3-6-plus": 75,
- "kimi-k2-6": 80
- }
- },
- {
- "agent": "performance-engineer",
- "current_model_index": 3,
- "current_model_id": "deepseek-v4-pro-max",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 78,
- "minimax-m2.5": 75,
- "minimax-m2.7": 70,
- "nemotron-3-super": 78,
- "glm-5.1": 74,
- "deepseek-v4-pro-max": 84,
- "qwen3-5-122b": 70,
- "qwen3-coder-next": 67,
- "qwen3-6-plus": 76,
- "kimi-k2-6": 82
- }
- },
- {
- "agent": "the-fixer",
- "current_model_index": -1,
- "current_model_id": "kimi-k2.6",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 89,
- "minimax-m2.5": 88,
- "minimax-m2.7": 84,
- "nemotron-3-super": 71,
- "glm-5.1": 64,
- "deepseek-v4-pro-max": 88,
- "qwen3-5-122b": 64,
- "qwen3-coder-next": 82,
- "qwen3-6-plus": 86,
- "kimi-k2-6": 90
- }
- },
- {
- "agent": "browser-automation",
- "current_model_index": 0,
- "current_model_id": "qwen3-coder-480b",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 87,
- "minimax-m2.5": 72,
- "minimax-m2.7": 68,
- "nemotron-3-super": 61,
- "glm-5.1": 53,
- "deepseek-v4-pro-max": 82,
- "qwen3-5-122b": 56,
- "qwen3-coder-next": 72,
- "qwen3-6-plus": 82,
- "kimi-k2-6": 86
- }
- },
- {
- "agent": "visual-tester",
- "current_model_index": 0,
- "current_model_id": "qwen3-coder-480b",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 82,
- "minimax-m2.5": 68,
- "minimax-m2.7": 64,
- "nemotron-3-super": 55,
- "glm-5.1": 48,
- "deepseek-v4-pro-max": 76,
- "qwen3-5-122b": 54,
- "qwen3-coder-next": 66,
- "qwen3-6-plus": 76,
- "kimi-k2-6": 78
- }
- },
- {
- "agent": "system-analyst",
- "current_model_index": 7,
- "current_model_id": "glm-5.1",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 70,
- "minimax-m2.5": 66,
- "minimax-m2.7": 63,
- "nemotron-3-super": 74,
- "glm-5.1": 82,
- "deepseek-v4-pro-max": 88,
- "qwen3-5-122b": 76,
- "qwen3-coder-next": 58,
- "qwen3-6-plus": 80,
- "kimi-k2-6": 86
- }
- },
- {
- "agent": "capability-analyst",
- "current_model_index": 7,
- "current_model_id": "glm-5.1",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 72,
- "minimax-m2.5": 68,
- "minimax-m2.7": 66,
- "nemotron-3-super": 76,
- "glm-5.1": 78,
- "deepseek-v4-pro-max": 82,
- "qwen3-5-122b": 75,
- "qwen3-coder-next": 60,
- "qwen3-6-plus": 79,
- "kimi-k2-6": 82
- }
- },
- {
- "agent": "orchestrator",
- "current_model_index": -1,
- "current_model_id": "kimi-k2.6",
- "reasoning_effort": "H",
- "scores": {
- "qwen3-coder-480b": 74,
- "minimax-m2.5": 70,
- "minimax-m2.7": 68,
- "nemotron-3-super": 80,
- "glm-5.1": 82,
- "deepseek-v4-pro-max": 86,
- "qwen3-5-122b": 78,
- "qwen3-coder-next": 62,
- "qwen3-6-plus": 84,
- "kimi-k2-6": 92
- }
- },
- {
- "agent": "release-manager",
- "current_model_index": 7,
- "current_model_id": "glm-5.1",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 72,
- "minimax-m2.5": 66,
- "minimax-m2.7": 64,
- "nemotron-3-super": 74,
- "glm-5.1": 76,
- "deepseek-v4-pro-max": 78,
- "qwen3-5-122b": 72,
- "qwen3-coder-next": 60,
- "qwen3-6-plus": 76,
- "kimi-k2-6": 78
- }
- },
- {
- "agent": "evaluator",
- "current_model_index": 7,
- "current_model_id": "glm-5.1",
- "reasoning_effort": "H",
- "scores": {
- "qwen3-coder-480b": 70,
- "minimax-m2.5": 73,
- "minimax-m2.7": 70,
- "nemotron-3-super": 78,
- "glm-5.1": 78,
- "deepseek-v4-pro-max": 84,
- "qwen3-5-122b": 76,
- "qwen3-coder-next": 58,
- "qwen3-6-plus": 81,
- "kimi-k2-6": 84
- }
- },
- {
- "agent": "prompt-optimizer",
- "current_model_index": -1,
- "current_model_id": "qwen3.6-plus",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 76,
- "minimax-m2.5": 74,
- "minimax-m2.7": 72,
- "nemotron-3-super": 76,
- "glm-5.1": 75,
- "deepseek-v4-pro-max": 80,
- "qwen3-5-122b": 74,
- "qwen3-coder-next": 64,
- "qwen3-6-plus": 83,
- "kimi-k2-6": 82
- }
- },
- {
- "agent": "product-owner",
- "current_model_index": 7,
- "current_model_id": "glm-5.1",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 60,
- "minimax-m2.5": 56,
- "minimax-m2.7": 54,
- "nemotron-3-super": 74,
- "glm-5.1": 78,
- "deepseek-v4-pro-max": 76,
- "qwen3-5-122b": 74,
- "qwen3-coder-next": 48,
- "qwen3-6-plus": 78,
- "kimi-k2-6": 76
- }
- },
- {
- "agent": "pipeline-judge",
- "current_model_index": 7,
- "current_model_id": "glm-5.1",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 64,
- "minimax-m2.5": 68,
- "minimax-m2.7": 65,
- "nemotron-3-super": 78,
- "glm-5.1": 76,
- "deepseek-v4-pro-max": 82,
- "qwen3-5-122b": 74,
- "qwen3-coder-next": 56,
- "qwen3-6-plus": 80,
- "kimi-k2-6": 84
- }
- },
- {
- "agent": "workflow-architect",
- "current_model_index": 7,
- "current_model_id": "glm-5.1",
- "reasoning_effort": "H",
- "scores": {
- "qwen3-coder-480b": 68,
- "minimax-m2.5": 62,
- "minimax-m2.7": 60,
- "nemotron-3-super": 76,
- "glm-5.1": 76,
- "deepseek-v4-pro-max": 80,
- "qwen3-5-122b": 72,
- "qwen3-coder-next": 56,
- "qwen3-6-plus": 80,
- "kimi-k2-6": 82
- }
- },
- {
- "agent": "markdown-validator",
- "current_model_index": 3,
- "current_model_id": "deepseek-v4-pro-max",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 43,
- "minimax-m2.5": 38,
- "minimax-m2.7": 36,
- "nemotron-3-super": 52,
- "glm-5.1": 55,
- "deepseek-v4-pro-max": 68,
- "qwen3-5-122b": 56,
- "qwen3-coder-next": 40,
- "qwen3-6-plus": 50,
- "kimi-k2-6": 56
- }
- },
- {
- "agent": "agent-architect",
- "current_model_index": -1,
- "current_model_id": "kimi-k2.6",
- "reasoning_effort": "H",
- "scores": {
- "qwen3-coder-480b": 78,
- "minimax-m2.5": 72,
- "minimax-m2.7": 70,
- "nemotron-3-super": 78,
- "glm-5.1": 76,
- "deepseek-v4-pro-max": 82,
- "qwen3-5-122b": 76,
- "qwen3-coder-next": 66,
- "qwen3-6-plus": 82,
- "kimi-k2-6": 86
- }
- },
- {
- "agent": "planner",
- "current_model_index": 3,
- "current_model_id": "deepseek-v4-pro-max",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 72,
- "minimax-m2.5": 68,
- "minimax-m2.7": 66,
- "nemotron-3-super": 80,
- "glm-5.1": 78,
- "deepseek-v4-pro-max": 88,
- "qwen3-5-122b": 78,
- "qwen3-coder-next": 60,
- "qwen3-6-plus": 85,
- "kimi-k2-6": 86
- }
- },
- {
- "agent": "reflector",
- "current_model_index": 3,
- "current_model_id": "deepseek-v4-pro-max",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 68,
- "minimax-m2.5": 66,
- "minimax-m2.7": 64,
- "nemotron-3-super": 78,
- "glm-5.1": 76,
- "deepseek-v4-pro-max": 84,
- "qwen3-5-122b": 76,
- "qwen3-coder-next": 56,
- "qwen3-6-plus": 82,
- "kimi-k2-6": 80
- }
- },
- {
- "agent": "memory-manager",
- "current_model_index": -1,
- "current_model_id": "qwen3.6-plus",
- "reasoning_effort": "M",
- "scores": {
- "qwen3-coder-480b": 63,
- "minimax-m2.5": 58,
- "minimax-m2.7": 56,
- "nemotron-3-super": 86,
- "glm-5.1": 72,
- "deepseek-v4-pro-max": 86,
- "qwen3-5-122b": 70,
- "qwen3-coder-next": 50,
- "qwen3-6-plus": 87,
- "kimi-k2-6": 84
- }
- },
- {
- "agent": "architect-indexer",
- "current_model_index": 7,
- "current_model_id": "glm-5.1",
- "reasoning_effort": "H",
- "scores": {
- "qwen3-coder-480b": 70,
- "minimax-m2.5": 64,
- "minimax-m2.7": 62,
- "nemotron-3-super": 74,
- "glm-5.1": 80,
- "deepseek-v4-pro-max": 78,
- "qwen3-5-122b": 76,
- "qwen3-coder-next": 58,
- "qwen3-6-plus": 80,
- "kimi-k2-6": 84
- }
- }
- ],
- "if_scores": {
- "qwen3-coder-480b": 88,
- "minimax-m2.5": 82,
- "minimax-m2.7": 78,
- "nemotron-3-super": 85,
- "glm-5.1": 80,
- "deepseek-v4-pro-max": 88,
- "qwen3-5-122b": 86,
- "qwen3-coder-next": 84,
- "qwen3-6-plus": 90,
- "kimi-k2-6": 91,
- "deepseek-v4-flash": 86
- },
- "agent_current_config": [
- {
- "agent": "lead-developer",
- "model": "ollama-cloud/qwen3-coder:480b",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "qwen",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "frontend-developer",
- "model": "ollama-cloud/minimax-m2.5",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "qwen",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "php-developer",
- "model": "ollama-cloud/qwen3-coder:480b",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "qwen",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "python-developer",
- "model": "ollama-cloud/qwen3-coder:480b",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "qwen",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "backend-developer",
- "model": "ollama-cloud/qwen3-coder:480b",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "qwen",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "go-developer",
- "model": "ollama-cloud/deepseek-v4-pro-max",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "qwen",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "flutter-developer",
- "model": "ollama-cloud/qwen3-coder:480b",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "qwen",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "devops-engineer",
- "model": "ollama-cloud/kimi-k2.6",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "nemotron",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "sdet-engineer",
- "model": "ollama-cloud/qwen3-coder:480b",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "qwen",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "code-skeptic",
- "model": "ollama-cloud/minimax-m2.5",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "minimax",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "security-auditor",
- "model": "ollama-cloud/deepseek-v4-pro-max",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "nemotron",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "performance-engineer",
- "model": "ollama-cloud/deepseek-v4-pro-max",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "nemotron",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "the-fixer",
- "model": "ollama-cloud/kimi-k2.6",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "minimax",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "browser-automation",
- "model": "ollama-cloud/qwen3-coder:480b",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "qwen",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "visual-tester",
- "model": "ollama-cloud/qwen3-coder:480b",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "qwen",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "system-analyst",
- "model": "ollama-cloud/glm-5.1",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "glm",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "capability-analyst",
- "model": "ollama-cloud/glm-5.1",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "glm",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "orchestrator",
- "model": "ollama-cloud/kimi-k2.6",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "kimi",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "release-manager",
- "model": "ollama-cloud/glm-5.1",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "glm",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "evaluator",
- "model": "ollama-cloud/glm-5.1",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "glm",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "prompt-optimizer",
- "model": "ollama-cloud/qwen3.6-plus",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "glm",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "product-owner",
- "model": "ollama-cloud/glm-5.1",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "glm",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "pipeline-judge",
- "model": "ollama-cloud/glm-5.1",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "glm",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "workflow-architect",
- "model": "ollama-cloud/glm-5.1",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "glm",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "markdown-validator",
- "model": "ollama-cloud/deepseek-v4-pro-max",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "nemotron",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "agent-architect",
- "model": "ollama-cloud/kimi-k2.6",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "glm",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "planner",
- "model": "ollama-cloud/deepseek-v4-pro-max",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "nemotron",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "reflector",
- "model": "ollama-cloud/deepseek-v4-pro-max",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "nemotron",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "memory-manager",
- "model": "ollama-cloud/qwen3.6-plus",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "nemotron",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- },
- {
- "agent": "architect-indexer",
- "model": "ollama-cloud/glm-5.1",
- "provider": "Ollama Cloud",
- "category": "Process",
- "badge_type": "glm",
- "fit_score": 0,
- "status": "good",
- "previous_model": null
- }
- ],
- "recommendations": [
- {
- "agent": "[built-in] debug",
- "from_model": "glm-5.1.1 (88)",
- "from_provider": "Ollama",
- "to_model": "V4-Pro Max (\u260590) / K2.6 (\u260590) RE:High",
- "to_provider": "Ollama Cloud",
- "impact": "high",
- "quality_change": "+2%",
- "speed_change": "~1x",
- "context_change": "200K\u21921M",
- "provider_change": "Ollama Cloud",
- "rationale": "\u2605 \u043c\u0430\u0442\u0440\u0438\u0446\u044b: V4-Pro=90 \u0438 K2.6=90 (TIE!), GLM-5.1=88. V4-Pro: LiveCodeBench 93.5(#1!), Terminal 67.9, 1M ctx \u0434\u043b\u044f \u043f\u043e\u043b\u043d\u043e\u0433\u043e \u043f\u0440\u043e\u0435\u043a\u0442\u0430. K2.6: 13h auto sessions. \u041e\u0431\u0430 \u043b\u0443\u0447\u0448\u0435 GLM-5.1. RE:High \u0434\u043b\u044f debug."
- },
- {
- "agent": "planner",
- "from_model": "nemotron-3-super (80)",
- "from_provider": "Ollama",
- "to_model": "V4-Pro Max (\u260588) RE:High",
- "to_provider": "Ollama Cloud",
- "impact": "high",
- "quality_change": "+10%",
- "speed_change": "~1x",
- "context_change": "1M",
- "provider_change": "Ollama Cloud",
- "rationale": "\u2605 \u043c\u0430\u0442\u0440\u0438\u0446\u044b: V4-Pro=88(\u043b\u0443\u0447\u0448\u0438\u0439!), K2.6=86, GLM-5.1=85, Nem=80. V4-Pro: GPQA 90.1 (reasoning), 1M ctx \u0441\u043e\u0445\u0440\u0430\u043d\u044f\u0435\u0442\u0441\u044f (vs \u043f\u043e\u0442\u0435\u0440\u044f \u043f\u0440\u0438 K2.6). RE:High \u0434\u043b\u044f chain-of-thought planning."
- },
- {
- "agent": "go-developer",
- "from_model": "qwen3-coder:480b (85)",
- "from_provider": "Ollama",
- "to_model": "V4-Pro Max (\u260588) RE:Medium",
- "to_provider": "Ollama Cloud",
- "impact": "medium",
- "quality_change": "+4%",
- "speed_change": "~1x",
- "context_change": "256K\u21921M",
- "provider_change": "Ollama Cloud",
- "rationale": "\u2605 \u043c\u0430\u0442\u0440\u0438\u0446\u044b: V4-Pro=88(\u043b\u0443\u0447\u0448\u0438\u0439 \u0434\u043b\u044f Go!), K2.6=86, Qwen3Coder=85. DeepSeek \u043c\u043e\u0434\u0435\u043b\u0438 \u0442\u0440\u0430\u0434\u0438\u0446\u0438\u043e\u043d\u043d\u043e \u0441\u0438\u043b\u044c\u043d\u044b \u0432 Go/Rust. 1M ctx \u0434\u043b\u044f \u043a\u0440\u0443\u043f\u043d\u044b\u0445 Go-\u043f\u0440\u043e\u0435\u043a\u0442\u043e\u0432."
- },
- {
- "agent": "history-miner",
- "from_model": "nemotron-3-super (\u260585)",
- "from_provider": "Ollama",
- "to_model": "V4-Pro Max (86) + Nem fallback",
- "to_provider": "Hybrid",
- "impact": "medium",
- "quality_change": "+1%",
- "speed_change": "~1x",
- "context_change": "1M",
- "provider_change": "Ollama Cloud + Ollama",
- "rationale": "V4-Pro=86 \u0447\u0443\u0442\u044c \u043b\u0443\u0447\u0448\u0435 Nemotron=85. 1M ctx \u0443 \u043e\u0431\u043e\u0438\u0445. MRCR 83.5 \u0443 V4-Pro \u2014 \u043b\u0443\u0447\u0448\u0435\u0435 long-context retrieval. Nemotron \u043a\u0430\u043a fallback (RULER 91.75%)."
- },
- {
- "agent": "frontend-dev \u2192 M2.5",
- "from_model": "qwen3-coder (90)",
- "from_provider": "Ollama",
- "to_model": "MiniMax M2.5 (\u260592) \u2705",
- "to_provider": "Ollama",
- "impact": "low",
- "quality_change": "+2%",
- "speed_change": "=",
- "context_change": "204K",
- "provider_change": "Ollama",
- "rationale": "Spec-writing, UI architect. APPLIED."
- },
- {
- "agent": "devops \u2192 K2.6",
- "from_model": "deepseek-v3.2",
- "from_provider": "",
- "to_model": "kimi-k2.6 \u2705",
- "to_provider": "Ollama Cloud",
- "impact": "low",
- "quality_change": "+35%",
- "speed_change": "=",
- "context_change": "256K",
- "provider_change": "",
- "rationale": "APPLIED."
- },
- {
- "agent": "orchestrator",
- "from_model": "glm-5.1.1 (\u260590)",
- "from_provider": "Ollama",
- "to_model": "K2.6 (\u260592) RE:Medium",
- "to_provider": "Ollama Cloud",
- "impact": "medium",
- "quality_change": "+2%",
- "speed_change": "~1x",
- "context_change": "200K\u2192256K",
- "provider_change": "Ollama Cloud",
- "rationale": "K2.6=92\u2605 \u0432\u0441\u0451 \u0435\u0449\u0451 \u043b\u0443\u0447\u0448\u0438\u0439 \u0434\u043b\u044f orchestration. V4-Pro=86 \u0441\u043b\u0430\u0431\u0435\u0435. 300 sub-agent swarm."
- },
- {
- "agent": "the-fixer",
- "from_model": "minimax-m2.5 (\u260588)",
- "from_provider": "Ollama",
- "to_model": "V4-Pro (\u260588) / K2.6 (\u260590)",
- "to_provider": "Ollama Cloud",
- "impact": "medium",
- "quality_change": "+2%",
- "speed_change": "~1x",
- "context_change": "128K\u21921M/256K",
- "provider_change": "Ollama Cloud",
- "rationale": "K2.6=90(\u043b\u0443\u0447\u0448\u0438\u0439), V4-Pro=88=M2.5. M2.5 SWE-bench 80.2% \u0441\u0442\u0430\u0431\u0438\u043b\u044c\u043d\u0435\u0435. \u041d\u0435 \u0441\u0440\u043e\u0447\u043d\u043e."
- },
- {
- "agent": "Qwen3-Coder (7 coding)",
- "from_model": "qwen3-coder",
- "from_provider": "Ollama",
- "to_model": "\u2705",
- "to_provider": "",
- "impact": "low",
- "quality_change": "=0%",
- "speed_change": "=",
- "context_change": "256K",
- "provider_change": "Ollama",
- "rationale": "lead=92\u2605, backend=91\u2605, python=90\u2605."
- },
- {
- "agent": "GLM-5.1 (12 agents)",
- "from_model": "glm-5.1.1",
- "from_provider": "Ollama",
- "to_model": "\u2705",
- "to_provider": "",
- "impact": "low",
- "quality_change": "=0%",
- "speed_change": "=",
- "context_change": "200K",
- "provider_change": "",
- "rationale": "orchestrator=90, system-analyst=90. SWE-Pro #1."
- },
- {
- "agent": "Kimi K2.6 (3 agents)",
- "from_model": "kimi-k2.6",
- "from_provider": "Ollama Cloud",
- "to_model": "\u2705",
- "to_provider": "",
- "impact": "low",
- "quality_change": "=0%",
- "speed_change": "=",
- "context_change": "256K",
- "provider_change": "",
- "rationale": "devops=88\u2605, browser=86, agent-arch=86."
- }
- ],
- "impact_data": [
- {
- "category": "debug GLM5.1\u2192V4-Pro/K2.6",
- "before": 88,
- "after": 90,
- "delta": 2,
- "notes": "LiveCodeBench 93.5, Terminal 67.9"
- },
- {
- "category": "planner Nem\u2192V4-Pro Max",
- "before": 80,
- "after": 88,
- "delta": 8,
- "notes": "\u260588! GPQA 90.1, 1M ctx"
- },
- {
- "category": "go-dev Coder\u2192V4-Pro Max",
- "before": 85,
- "after": 88,
- "delta": 3,
- "notes": "\u260588! Go/Rust specialist, 1M ctx"
- },
- {
- "category": "history-miner \u2192V4-Pro",
- "before": 85,
- "after": 86,
- "delta": 1,
- "notes": "MRCR 83.5, long-context"
- },
- {
- "category": "orchestrator \u2192K2.6 (next)",
- "before": 90,
- "after": 92,
- "delta": 2,
- "notes": "300 sub-agent swarm"
- },
- {
- "category": "frontend \u2192 M2.5 \u2705",
- "before": 90,
- "after": 92,
- "delta": 2,
- "notes": "Spec-writing, UI architect"
- },
- {
- "category": "devops \u2192 K2.6 \u2705",
- "before": 65,
- "after": 88,
- "delta": 23,
- "notes": "IF:65\u219291! Terminal 66.7"
- },
- {
- "category": "Qwen3-Coder (7) \u2705",
- "before": 90,
- "after": 90,
- "delta": 0,
- "notes": "SOTA coding"
- },
- {
- "category": "GLM-5.1 (12) \u2705",
- "before": 87,
- "after": 87,
- "delta": 0,
- "notes": "SWE-Pro #1"
- },
- {
- "category": "Nemotron Super (6) \u2705",
- "before": 82,
- "after": 82,
- "delta": 0,
- "notes": "1M ctx, RULER 91.75%"
- }
- ],
- "benchmark_comparison": {
- "benchmarks": [
- {
- "name": "SWE-V",
- "full_name": "SWE-Bench Verified",
- "description": "GitHub issue resolution (500 tasks)",
- "roles": "lead-dev, backend, fixer"
- },
- {
- "name": "SWE-P",
- "full_name": "SWE-Bench Pro",
- "description": "Multi-lang, decontaminated (1865 tasks)",
- "roles": "all coding agents"
- },
- {
- "name": "T-Bench",
- "full_name": "Terminal-Bench 2.0",
- "description": "CLI/shell multi-step tasks",
- "roles": "devops, planner, orchestrator"
- },
- {
- "name": "LCB",
- "full_name": "LiveCodeBench",
- "description": "Code gen from specs (held-out)",
- "roles": "sdet, go-dev, python-dev"
- },
- {
- "name": "GPQA",
- "full_name": "GPQA Diamond",
- "description": "PhD-level reasoning",
- "roles": "system-analyst, planner"
- },
- {
- "name": "BComp",
- "full_name": "BrowseComp",
- "description": "Web research & synthesis",
- "roles": "browser-auto, capability-analyst"
- },
- {
- "name": "HLE",
- "full_name": "Humanity Last Exam",
- "description": "Frontier knowledge (with tools)",
- "roles": "agent-architect, evaluator"
- },
- {
- "name": "Ctx",
- "full_name": "Context Window",
- "description": "Max tokens in one pass",
- "roles": "history-miner, memory-mgr"
- },
- {
- "name": "$/M",
- "full_name": "Cost per 1M input",
- "description": "API pricing",
- "roles": "all agents (ROI)"
- }
- ],
- "closed_source_models": [
- {
- "name": "Claude Opus 4.7",
- "organization": "Anthropic",
- "scores": [
- 87.6,
- 64.3,
- 69.4,
- null,
- 94.2,
- 79.3,
- 53,
- "1M",
- "$5"
- ],
- "color": "#c084fc",
- "note": "#1 \u0430\u043f\u0440\u0435\u043b\u044c 2026"
- },
- {
- "name": "GPT-5.5",
- "organization": "OpenAI",
- "scores": [
- null,
- 58.6,
- 82.7,
- null,
- null,
- 83.4,
- 57.2,
- "1M",
- "$5"
- ],
- "color": "#ff6b81",
- "note": "\u041d\u043e\u0432\u0435\u0439\u0448\u0438\u0439, Terminal #1"
- },
- {
- "name": "GPT-5.4",
- "organization": "OpenAI",
- "scores": [
- 78.2,
- 59.1,
- 75.1,
- null,
- 94.4,
- 82.7,
- 58.7,
- "200K",
- "$2.50"
- ],
- "color": "#ff6b81",
- "note": "Reasoning, math"
- },
- {
- "name": "Gemini 3.1 Pro",
- "organization": "Google",
- "scores": [
- 80.6,
- 46.1,
- 68.5,
- null,
- 94.3,
- 85.9,
- 51.4,
- "2M",
- "$2"
- ],
- "color": "#facc15",
- "note": "ARC-AGI 77.1%, \u0434\u0435\u0448\u0451\u0432\u044b\u0439"
- },
- {
- "name": "Claude Sonnet 4.6",
- "organization": "Anthropic",
- "scores": [
- 79.6,
- null,
- null,
- null,
- null,
- null,
- null,
- "200K",
- "$3"
- ],
- "color": "#c084fc",
- "note": "5\u00d7 \u0434\u0435\u0448\u0435\u0432\u043b\u0435 Opus"
- },
- {
- "name": "GPT-5.3-Codex",
- "organization": "OpenAI",
- "scores": [
- 85,
- 57,
- 77.3,
- null,
- null,
- null,
- null,
- "200K",
- "$6"
- ],
- "color": "#ff6b81",
- "note": "Coding specialist"
- }
- ],
- "apaw_models": [
- {
- "name": "Kimi K2.6",
- "organization": "APAW",
- "scores": [
- 80.2,
- 58.6,
- 66.7,
- 87.2,
- null,
- 83.2,
- 54,
- "256K",
- "$0.95"
- ],
- "color": "#00ff94",
- "note": "devops, browser, architect (3)"
- },
- {
- "name": "GLM-5.1",
- "organization": "APAW",
- "scores": [
- null,
- 58.4,
- 63.5,
- null,
- 86.2,
- 68.7,
- null,
- "200K",
- "~$0.50"
- ],
- "color": "#00ff94",
- "note": "12 agents! orchestrator, eval..."
- },
- {
- "name": "V4-Pro Max",
- "organization": "APAW",
- "scores": [
- 80.6,
- 55.4,
- 67.9,
- 93.5,
- 90.1,
- 83.4,
- 48.2,
- "1M",
- "$0.42"
- ],
- "color": "#00d4ff",
- "note": "planner, go-dev (\u0440\u0435\u043a.)"
- },
- {
- "name": "Qwen3-Coder 480B",
- "organization": "APAW",
- "scores": [
- 66.5,
- null,
- null,
- null,
- null,
- null,
- null,
- "256K",
- "~$0.50"
- ],
- "color": "#00ff94",
- "note": "7 coding agents"
- },
- {
- "name": "MiniMax M2.5",
- "organization": "APAW",
- "scores": [
- 80.2,
- 51.3,
- null,
- null,
- null,
- 76.3,
- null,
- "204K",
- "$0.15"
- ],
- "color": "#00ff94",
- "note": "frontend, skeptic, fixer (3)"
- },
- {
- "name": "Nemotron Super",
- "organization": "APAW",
- "scores": [
- 60.5,
- null,
- null,
- null,
- null,
- null,
- null,
- "1M",
- "~$0.40"
- ],
- "color": "#00ff94",
- "note": "6 agents (memory, history)"
- }
- ]
- }
-}
\ No newline at end of file
+{
+ "version": "1.0.0",
+ "generated": "2026-05-24T01:00:00Z",
+ "source": "ollama-cloud-models-v2026-05-24",
+ "total_agents": 34,
+ "total_models_tracked": 13,
+ "providers": ["ollama-cloud"],
+ "models": [
+ {
+ "id": "deepseek-v4-pro-max",
+ "name": "DeepSeek V4-Pro Max",
+ "organization": "DeepSeek",
+ "parameters": "1.6T/49B active MoE",
+ "context_window": "1M",
+ "swe_bench": 80.6,
+ "if_score": 89,
+ "categories": ["coding", "agent", "reasoning"],
+ "provider": "ollama-cloud",
+ "updated": "2026-05-03",
+ "pulls": "71.6K"
+ },
+ {
+ "id": "deepseek-v4-flash",
+ "name": "DeepSeek V4-Flash",
+ "organization": "DeepSeek",
+ "parameters": "284B/13B active MoE",
+ "context_window": "1M",
+ "swe_bench": 79,
+ "if_score": 86,
+ "categories": ["coding", "efficient", "agent"],
+ "provider": "ollama-cloud",
+ "updated": "2026-05-03",
+ "pulls": "84.4K"
+ },
+ {
+ "id": "kimi-k2.6",
+ "name": "Kimi K2.6",
+ "organization": "Moonshot AI",
+ "parameters": "1T/32B active MoE",
+ "context_window": "256K→1M",
+ "swe_bench": 80.2,
+ "if_score": 91,
+ "categories": ["coding", "agent", "multimodal", "vision"],
+ "provider": "ollama-cloud",
+ "updated": "2026-04-24",
+ "pulls": "259.7K"
+ },
+ {
+ "id": "kimi-k2.5",
+ "name": "Kimi K2.5",
+ "organization": "Moonshot AI",
+ "parameters": "1T/32B active MoE",
+ "context_window": "256K",
+ "swe_bench": 78,
+ "if_score": 90,
+ "categories": ["coding", "agent", "multimodal", "vision"],
+ "provider": "ollama-cloud",
+ "updated": "2026-02-24",
+ "pulls": "293.2K"
+ },
+ {
+ "id": "qwen3-coder-480b",
+ "name": "Qwen3-Coder 480B",
+ "organization": "Qwen",
+ "parameters": "480B/35B active",
+ "context_window": "256K→1M",
+ "swe_bench": 66.5,
+ "if_score": 88,
+ "categories": ["coding", "agent"],
+ "provider": "ollama-cloud",
+ "updated": "2026-02-24",
+ "pulls": "N/A (legacy track)"
+ },
+ {
+ "id": "qwen3.5-122b",
+ "name": "Qwen 3.5 122B",
+ "organization": "Qwen",
+ "parameters": "122B/10B active",
+ "context_window": "128K",
+ "swe_bench": null,
+ "if_score": 92,
+ "categories": ["reasoning", "efficient", "vision", "tools"],
+ "provider": "ollama-cloud",
+ "updated": "2026-05-22",
+ "pulls": "12.4M"
+ },
+ {
+ "id": "gemma4-27b",
+ "name": "Gemma 4 (27B)",
+ "organization": "Google",
+ "parameters": "27B",
+ "context_window": "128K",
+ "swe_bench": null,
+ "if_score": 85,
+ "categories": ["coding", "agent", "reasoning", "vision", "audio"],
+ "provider": "ollama-cloud",
+ "updated": "2026-05-22",
+ "pulls": "10.1M",
+ "note": "Updated 2 days ago. Frontier-level performance at each size."
+ },
+ {
+ "id": "minimax-m2.5",
+ "name": "MiniMax M2.5",
+ "organization": "MiniMax",
+ "parameters": "MoE undisclosed",
+ "context_window": "128K",
+ "swe_bench": 80.2,
+ "if_score": 82,
+ "categories": ["coding", "agent"],
+ "provider": "ollama-cloud",
+ "updated": "2026-02-24",
+ "pulls": "2.2M"
+ },
+ {
+ "id": "minimax-m2.7",
+ "name": "MiniMax M2.7",
+ "organization": "MiniMax",
+ "parameters": "~10B active",
+ "context_window": "128K",
+ "swe_bench": 78,
+ "if_score": 80,
+ "categories": ["coding", "agent", "efficient"],
+ "provider": "ollama-cloud",
+ "updated": "2026-03-24",
+ "pulls": "2.2M"
+ },
+ {
+ "id": "glm-5.1",
+ "name": "GLM-5.1",
+ "organization": "Z.ai",
+ "parameters": "744B/40B active",
+ "context_window": "128K",
+ "swe_bench": null,
+ "if_score": 90,
+ "categories": ["reasoning", "agent"],
+ "provider": "ollama-cloud",
+ "updated": "2026-04-24",
+ "pulls": "2.2M",
+ "note": "Next-gen flagship. SWE-Bench Pro SOTA."
+ },
+ {
+ "id": "glm-5",
+ "name": "GLM-5",
+ "organization": "Z.ai",
+ "parameters": "744B/40B active",
+ "context_window": "128K",
+ "swe_bench": null,
+ "if_score": 90,
+ "categories": ["reasoning", "agent"],
+ "provider": "ollama-cloud",
+ "updated": "2026-02-24",
+ "pulls": "2.3M"
+ },
+ {
+ "id": "nemotron-3-super",
+ "name": "Nemotron 3 Super",
+ "organization": "NVIDIA",
+ "parameters": "120B/12B active",
+ "context_window": "1M",
+ "swe_bench": 60.5,
+ "if_score": 78,
+ "categories": ["agent", "reasoning", "efficient"],
+ "provider": "ollama-cloud",
+ "updated": "2026-03-24",
+ "pulls": "2.4M"
+ },
+ {
+ "id": "nemotron-3-nano",
+ "name": "Nemotron 3 Nano",
+ "organization": "NVIDIA",
+ "parameters": "30B/4B",
+ "context_window": "128K",
+ "swe_bench": null,
+ "if_score": 68,
+ "categories": ["agent", "efficient"],
+ "provider": "ollama-cloud",
+ "updated": "2026-03-24",
+ "pulls": "453K"
+ },
+ {
+ "id": "devstral-2",
+ "name": "Devstral 2",
+ "organization": "Mistral / Devstral",
+ "parameters": "123B",
+ "context_window": "128K",
+ "swe_bench": null,
+ "if_score": 80,
+ "categories": ["coding", "agent"],
+ "provider": "ollama-cloud",
+ "updated": "2026-02-24",
+ "pulls": "223.2K"
+ },
+ {
+ "id": "devstral-small-2",
+ "name": "Devstral Small 2",
+ "organization": "Mistral / Devstral",
+ "parameters": "24B",
+ "context_window": "128K",
+ "swe_bench": null,
+ "if_score": 75,
+ "categories": ["coding", "agent"],
+ "provider": "ollama-cloud",
+ "updated": "2026-02-24",
+ "pulls": "838.8K"
+ }
+ ],
+ "if_scores": {
+ "deepseek-v4-pro-max": 89,
+ "deepseek-v4-flash": 86,
+ "kimi-k2.6": 91,
+ "kimi-k2.5": 90,
+ "qwen3-coder-480b": 88,
+ "qwen3.5-122b": 92,
+ "gemma4-27b": 85,
+ "minimax-m2.5": 82,
+ "minimax-m2.7": 80,
+ "glm-5.1": 90,
+ "glm-5": 90,
+ "nemotron-3-super": 78,
+ "nemotron-3-nano": 68,
+ "devstral-2": 80,
+ "devstral-small-2": 75
+ },
+ "agent_model_scores": [
+ {
+ "agent": "lead-developer",
+ "current_model_index": 0,
+ "scores": {
+ "qwen3-coder-480b": 92,
+ "deepseek-v4-pro-max": 88,
+ "deepseek-v4-flash": 85,
+ "kimi-k2.6": 90,
+ "kimi-k2.5": 88,
+ "qwen3.5-122b": 86,
+ "gemma4-27b": 83,
+ "minimax-m2.5": 86,
+ "minimax-m2.7": 82,
+ "glm-5.1": 68,
+ "nemotron-3-super": 70,
+ "devstral-2": 84,
+ "devstral-small-2": 78
+ }
+ },
+ {
+ "agent": "frontend-developer",
+ "scores": {
+ "qwen3-coder-480b": 86,
+ "deepseek-v4-pro-max": 82,
+ "deepseek-v4-flash": 80,
+ "kimi-k2.6": 86,
+ "kimi-k2.5": 84,
+ "qwen3.5-122b": 84,
+ "gemma4-27b": 85,
+ "minimax-m2.5": 92,
+ "minimax-m2.7": 88,
+ "glm-5.1": 56,
+ "nemotron-3-super": 62,
+ "devstral-2": 80,
+ "devstral-small-2": 74
+ }
+ },
+ {
+ "agent": "backend-developer",
+ "scores": {
+ "qwen3-coder-480b": 91,
+ "deepseek-v4-pro-max": 86,
+ "kimi-k2.6": 90,
+ "qwen3.5-122b": 85,
+ "gemma4-27b": 84,
+ "minimax-m2.5": 84,
+ "minimax-m2.7": 80,
+ "glm-5.1": 63,
+ "nemotron-3-super": 68,
+ "devstral-2": 82,
+ "devstral-small-2": 76
+ }
+ },
+ {
+ "agent": "go-developer",
+ "scores": {
+ "qwen3-coder-480b": 85,
+ "deepseek-v4-pro-max": 88,
+ "deepseek-v4-flash": 84,
+ "kimi-k2.6": 86,
+ "qwen3.5-122b": 80,
+ "gemma4-27b": 80,
+ "minimax-m2.5": 78,
+ "minimax-m2.7": 74,
+ "glm-5.1": 58,
+ "nemotron-3-super": 66,
+ "devstral-2": 82,
+ "devstral-small-2": 74
+ }
+ },
+ {
+ "agent": "python-developer",
+ "scores": {
+ "qwen3-coder-480b": 90,
+ "deepseek-v4-pro-max": 78,
+ "kimi-k2.6": 88,
+ "qwen3.5-122b": 86,
+ "gemma4-27b": 82,
+ "minimax-m2.5": 82,
+ "minimax-m2.7": 78,
+ "glm-5.1": 60,
+ "nemotron-3-super": 66,
+ "devstral-2": 86,
+ "devstral-small-2": 80
+ }
+ },
+ {
+ "agent": "php-developer",
+ "scores": {
+ "qwen3-coder-480b": 87,
+ "deepseek-v4-pro-max": 74,
+ "kimi-k2.6": 86,
+ "qwen3.5-122b": 84,
+ "gemma4-27b": 82,
+ "minimax-m2.5": 76,
+ "minimax-m2.7": 72,
+ "glm-5.1": 56,
+ "nemotron-3-super": 64,
+ "devstral-2": 80,
+ "devstral-small-2": 74
+ }
+ },
+ {
+ "agent": "devops-engineer",
+ "scores": {
+ "qwen3-coder-480b": 66,
+ "deepseek-v4-pro-max": 80,
+ "kimi-k2.6": 88,
+ "qwen3.5-122b": 75,
+ "gemma4-27b": 78,
+ "minimax-m2.5": 53,
+ "minimax-m2.7": 48,
+ "glm-5.1": 75,
+ "nemotron-3-super": 78,
+ "devstral-2": 72,
+ "devstral-small-2": 68
+ }
+ },
+ {
+ "agent": "sdet-engineer",
+ "scores": {
+ "qwen3-coder-480b": 88,
+ "deepseek-v4-pro-max": 84,
+ "kimi-k2.6": 87,
+ "qwen3.5-122b": 86,
+ "gemma4-27b": 82,
+ "minimax-m2.5": 84,
+ "minimax-m2.7": 80,
+ "glm-5.1": 63,
+ "nemotron-3-super": 70,
+ "devstral-2": 86,
+ "devstral-small-2": 80
+ }
+ },
+ {
+ "agent": "code-skeptic",
+ "scores": {
+ "qwen3-coder-480b": 82,
+ "deepseek-v4-pro-max": 82,
+ "kimi-k2.6": 82,
+ "qwen3.5-122b": 80,
+ "gemma4-27b": 80,
+ "minimax-m2.5": 85,
+ "minimax-m2.7": 80,
+ "glm-5.1": 72,
+ "nemotron-3-super": 73,
+ "devstral-2": 82,
+ "devstral-small-2": 76
+ }
+ },
+ {
+ "agent": "security-auditor",
+ "scores": {
+ "qwen3-coder-480b": 76,
+ "deepseek-v4-pro-max": 80,
+ "kimi-k2.6": 80,
+ "qwen3.5-122b": 78,
+ "gemma4-27b": 78,
+ "minimax-m2.5": 74,
+ "minimax-m2.7": 68,
+ "glm-5.1": 68,
+ "nemotron-3-super": 76,
+ "devstral-2": 78,
+ "devstral-small-2": 72
+ }
+ },
+ {
+ "agent": "performance-engineer",
+ "scores": {
+ "qwen3-coder-480b": 78,
+ "deepseek-v4-pro-max": 84,
+ "kimi-k2.6": 82,
+ "qwen3.5-122b": 76,
+ "gemma4-27b": 76,
+ "minimax-m2.5": 75,
+ "minimax-m2.7": 70,
+ "glm-5.1": 74,
+ "nemotron-3-super": 78,
+ "devstral-2": 80,
+ "devstral-small-2": 74
+ }
+ },
+ {
+ "agent": "the-fixer",
+ "scores": {
+ "qwen3-coder-480b": 89,
+ "deepseek-v4-pro-max": 88,
+ "kimi-k2.6": 90,
+ "qwen3.5-122b": 86,
+ "gemma4-27b": 82,
+ "minimax-m2.5": 88,
+ "minimax-m2.7": 84,
+ "glm-5.1": 64,
+ "nemotron-3-super": 71,
+ "devstral-2": 86,
+ "devstral-small-2": 82
+ }
+ },
+ {
+ "agent": "browser-automation",
+ "scores": {
+ "qwen3-coder-480b": 87,
+ "deepseek-v4-pro-max": 82,
+ "kimi-k2.6": 86,
+ "qwen3.5-122b": 82,
+ "gemma4-27b": 84,
+ "minimax-m2.5": 72,
+ "minimax-m2.7": 68,
+ "glm-5.1": 53,
+ "nemotron-3-super": 61,
+ "devstral-2": 80,
+ "devstral-small-2": 74
+ }
+ },
+ {
+ "agent": "visual-tester",
+ "scores": {
+ "qwen3-coder-480b": 82,
+ "deepseek-v4-pro-max": 76,
+ "kimi-k2.6": 78,
+ "qwen3.5-122b": 76,
+ "gemma4-27b": 78,
+ "minimax-m2.5": 68,
+ "minimax-m2.7": 64,
+ "glm-5.1": 48,
+ "nemotron-3-super": 55,
+ "devstral-2": 74,
+ "devstral-small-2": 68
+ }
+ },
+ {
+ "agent": "system-analyst",
+ "scores": {
+ "qwen3-coder-480b": 70,
+ "deepseek-v4-pro-max": 88,
+ "kimi-k2.6": 86,
+ "qwen3.5-122b": 82,
+ "gemma4-27b": 82,
+ "minimax-m2.5": 66,
+ "minimax-m2.7": 63,
+ "glm-5.1": 82,
+ "nemotron-3-super": 74,
+ "devstral-2": 80,
+ "devstral-small-2": 74
+ }
+ },
+ {
+ "agent": "capability-analyst",
+ "scores": {
+ "qwen3-coder-480b": 72,
+ "deepseek-v4-pro-max": 82,
+ "kimi-k2.6": 82,
+ "qwen3.5-122b": 80,
+ "gemma4-27b": 80,
+ "minimax-m2.5": 68,
+ "minimax-m2.7": 66,
+ "glm-5.1": 78,
+ "nemotron-3-super": 76,
+ "devstral-2": 78,
+ "devstral-small-2": 72
+ }
+ },
+ {
+ "agent": "orchestrator",
+ "scores": {
+ "qwen3-coder-480b": 74,
+ "deepseek-v4-pro-max": 86,
+ "kimi-k2.6": 92,
+ "qwen3.5-122b": 84,
+ "gemma4-27b": 82,
+ "minimax-m2.5": 70,
+ "minimax-m2.7": 68,
+ "glm-5.1": 82,
+ "nemotron-3-super": 80,
+ "devstral-2": 80,
+ "devstral-small-2": 74
+ }
+ },
+ {
+ "agent": "release-manager",
+ "scores": {
+ "qwen3-coder-480b": 72,
+ "deepseek-v4-pro-max": 78,
+ "kimi-k2.6": 78,
+ "qwen3.5-122b": 76,
+ "gemma4-27b": 76,
+ "minimax-m2.5": 66,
+ "minimax-m2.7": 64,
+ "glm-5.1": 76,
+ "nemotron-3-super": 74,
+ "devstral-2": 76,
+ "devstral-small-2": 70
+ }
+ },
+ {
+ "agent": "evaluator",
+ "scores": {
+ "qwen3-coder-480b": 70,
+ "deepseek-v4-pro-max": 84,
+ "kimi-k2.6": 84,
+ "qwen3.5-122b": 82,
+ "gemma4-27b": 80,
+ "minimax-m2.5": 73,
+ "minimax-m2.7": 70,
+ "glm-5.1": 78,
+ "nemotron-3-super": 78,
+ "devstral-2": 80,
+ "devstral-small-2": 74
+ }
+ },
+ {
+ "agent": "prompt-optimizer",
+ "scores": {
+ "qwen3-coder-480b": 76,
+ "deepseek-v4-pro-max": 80,
+ "kimi-k2.6": 82,
+ "qwen3.5-122b": 82,
+ "gemma4-27b": 80,
+ "minimax-m2.5": 74,
+ "minimax-m2.7": 72,
+ "glm-5.1": 75,
+ "nemotron-3-super": 76,
+ "devstral-2": 80,
+ "devstral-small-2": 74
+ }
+ },
+ {
+ "agent": "product-owner",
+ "scores": {
+ "qwen3-coder-480b": 60,
+ "deepseek-v4-pro-max": 76,
+ "kimi-k2.6": 76,
+ "qwen3.5-122b": 76,
+ "gemma4-27b": 76,
+ "minimax-m2.5": 56,
+ "minimax-m2.7": 54,
+ "glm-5.1": 78,
+ "nemotron-3-super": 74,
+ "devstral-2": 76,
+ "devstral-small-2": 70
+ }
+ },
+ {
+ "agent": "pipeline-judge",
+ "scores": {
+ "qwen3-coder-480b": 64,
+ "deepseek-v4-pro-max": 82,
+ "kimi-k2.6": 84,
+ "qwen3.5-122b": 82,
+ "gemma4-27b": 80,
+ "minimax-m2.5": 68,
+ "minimax-m2.7": 65,
+ "glm-5.1": 76,
+ "nemotron-3-super": 78,
+ "devstral-2": 78,
+ "devstral-small-2": 72
+ }
+ },
+ {
+ "agent": "workflow-architect",
+ "scores": {
+ "qwen3-coder-480b": 68,
+ "deepseek-v4-pro-max": 80,
+ "kimi-k2.6": 82,
+ "qwen3.5-122b": 80,
+ "gemma4-27b": 80,
+ "minimax-m2.5": 62,
+ "minimax-m2.7": 60,
+ "glm-5.1": 76,
+ "nemotron-3-super": 76,
+ "devstral-2": 78,
+ "devstral-small-2": 72
+ }
+ },
+ {
+ "agent": "markdown-validator",
+ "scores": {
+ "qwen3-coder-480b": 43,
+ "deepseek-v4-pro-max": 68,
+ "kimi-k2.6": 56,
+ "qwen3.5-122b": 56,
+ "gemma4-27b": 60,
+ "minimax-m2.5": 38,
+ "minimax-m2.7": 36,
+ "glm-5.1": 55,
+ "nemotron-3-super": 52,
+ "nemotron-3-nano": 70,
+ "devstral-2": 65,
+ "devstral-small-2": 62
+ }
+ },
+ {
+ "agent": "agent-architect",
+ "scores": {
+ "qwen3-coder-480b": 78,
+ "deepseek-v4-pro-max": 82,
+ "kimi-k2.6": 86,
+ "qwen3.5-122b": 80,
+ "gemma4-27b": 82,
+ "minimax-m2.5": 72,
+ "minimax-m2.7": 70,
+ "glm-5.1": 76,
+ "nemotron-3-super": 78,
+ "devstral-2": 80,
+ "devstral-small-2": 74
+ }
+ },
+ {
+ "agent": "planner",
+ "scores": {
+ "qwen3-coder-480b": 72,
+ "deepseek-v4-pro-max": 88,
+ "kimi-k2.6": 86,
+ "qwen3.5-122b": 86,
+ "gemma4-27b": 84,
+ "minimax-m2.5": 68,
+ "minimax-m2.7": 66,
+ "glm-5.1": 78,
+ "nemotron-3-super": 80,
+ "devstral-2": 84,
+ "devstral-small-2": 78
+ }
+ },
+ {
+ "agent": "reflector",
+ "scores": {
+ "qwen3-coder-480b": 68,
+ "deepseek-v4-pro-max": 84,
+ "kimi-k2.6": 80,
+ "qwen3.5-122b": 80,
+ "gemma4-27b": 80,
+ "minimax-m2.5": 66,
+ "minimax-m2.7": 64,
+ "glm-5.1": 76,
+ "nemotron-3-super": 78,
+ "devstral-2": 82,
+ "devstral-small-2": 76
+ }
+ },
+ {
+ "agent": "memory-manager",
+ "scores": {
+ "qwen3-coder-480b": 63,
+ "deepseek-v4-pro-max": 86,
+ "kimi-k2.6": 84,
+ "qwen3.5-122b": 85,
+ "gemma4-27b": 82,
+ "minimax-m2.5": 58,
+ "minimax-m2.7": 56,
+ "glm-5.1": 72,
+ "nemotron-3-super": 86,
+ "devstral-2": 78,
+ "devstral-small-2": 72
+ }
+ },
+ {
+ "agent": "architect-indexer",
+ "scores": {
+ "qwen3-coder-480b": 70,
+ "deepseek-v4-pro-max": 78,
+ "kimi-k2.6": 84,
+ "qwen3.5-122b": 80,
+ "gemma4-27b": 80,
+ "minimax-m2.5": 64,
+ "minimax-m2.7": 62,
+ "glm-5.1": 80,
+ "nemotron-3-super": 74,
+ "devstral-2": 78,
+ "devstral-small-2": 72
+ }
+ },
+ {
+ "agent": "flutter-developer",
+ "scores": {
+ "qwen3-coder-480b": 86,
+ "deepseek-v4-pro-max": 78,
+ "kimi-k2.6": 84,
+ "qwen3.5-122b": 84,
+ "gemma4-27b": 84,
+ "minimax-m2.5": 70,
+ "minimax-m2.7": 66,
+ "glm-5.1": 53,
+ "nemotron-3-super": 60,
+ "devstral-2": 78,
+ "devstral-small-2": 74
+ }
+ }
+ ],
+ "agent_current_config": [
+ { "agent": "lead-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 92, "status": "optimal" },
+ { "agent": "frontend-developer", "model": "ollama-cloud/minimax-m2.5", "fit_score": 92, "status": "optimal" },
+ { "agent": "backend-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 91, "status": "optimal" },
+ { "agent": "go-developer", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 88, "status": "optimal" },
+ { "agent": "python-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 90, "status": "optimal" },
+ { "agent": "php-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 87, "status": "optimal" },
+ { "agent": "flutter-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 86, "status": "optimal" },
+ { "agent": "devops-engineer", "model": "ollama-cloud/kimi-k2.6", "fit_score": 88, "status": "optimal" },
+ { "agent": "sdet-engineer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 88, "status": "optimal" },
+ { "agent": "code-skeptic", "model": "ollama-cloud/minimax-m2.5", "fit_score": 85, "status": "optimal" },
+ { "agent": "security-auditor", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 80, "status": "good" },
+ { "agent": "performance-engineer", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 84, "status": "optimal" },
+ { "agent": "the-fixer", "model": "ollama-cloud/kimi-k2.6", "fit_score": 90, "status": "optimal" },
+ { "agent": "browser-automation", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 87, "status": "optimal" },
+ { "agent": "visual-tester", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 82, "status": "good" },
+ { "agent": "system-analyst", "model": "ollama-cloud/glm-5.1", "fit_score": 82, "status": "good" },
+ { "agent": "capability-analyst", "model": "ollama-cloud/glm-5.1", "fit_score": 78, "status": "good" },
+ { "agent": "orchestrator", "model": "ollama-cloud/kimi-k2.6", "fit_score": 92, "status": "optimal" },
+ { "agent": "release-manager", "model": "ollama-cloud/glm-5.1", "fit_score": 76, "status": "good" },
+ { "agent": "evaluator", "model": "ollama-cloud/glm-5.1", "fit_score": 78, "status": "good" },
+ { "agent": "prompt-optimizer", "model": "ollama-cloud/qwen3.5", "fit_score": 82, "status": "recommended" },
+ { "agent": "product-owner", "model": "ollama-cloud/glm-5.1", "fit_score": 78, "status": "good" },
+ { "agent": "pipeline-judge", "model": "ollama-cloud/glm-5.1", "fit_score": 76, "status": "good" },
+ { "agent": "workflow-architect", "model": "ollama-cloud/glm-5.1", "fit_score": 76, "status": "good" },
+ { "agent": "markdown-validator", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 68, "status": "poor" },
+ { "agent": "agent-architect", "model": "ollama-cloud/kimi-k2.6", "fit_score": 86, "status": "optimal" },
+ { "agent": "planner", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 88, "status": "optimal" },
+ { "agent": "reflector", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 84, "status": "optimal" },
+ { "agent": "memory-manager", "model": "ollama-cloud/qwen3.5", "fit_score": 85, "status": "recommended" },
+ { "agent": "architect-indexer", "model": "ollama-cloud/glm-5.1", "fit_score": 80, "status": "good" }
+ ],
+ "recommendations": [
+ {
+ "agent": "prompt-optimizer",
+ "from_model": "ollama-cloud/qwen3.6-plus (openrouter)",
+ "to_model": "ollama-cloud/qwen3.5",
+ "reason": "Migrated to Ollama Cloud. IF 92, vision+tools+thinking. Same quality, no rate limits.",
+ "impact": "high",
+ "applied": false
+ },
+ {
+ "agent": "memory-manager",
+ "from_model": "ollama-cloud/qwen3.6-plus (openrouter)",
+ "to_model": "ollama-cloud/qwen3.5",
+ "reason": "Migrated to Ollama Cloud. Note: qwen3.5 has a 128K context window, not 1M; long-context alternatives are kimi-k2.6 (256K) and deepseek-v4 (1M). Matrix scores are close: qwen3.5=85, kimi-k2.6=84, deepseek=86.",
+ "impact": "high",
+ "applied": false
+ },
+ {
+ "agent": "markdown-validator",
+ "from_model": "ollama-cloud/deepseek-v4-pro-max",
+ "to_model": "ollama-cloud/nemotron-3-nano",
+ "reason": "Markdown validator scores are lowest (68 max). Nemotron-3-Nano IF=68 but is tiny (4B/30B), extremely cheap. For lightweight validation tasks, nano is sufficient.",
+ "impact": "medium",
+ "applied": false
+ },
+ {
+ "agent": "markdown-validator",
+ "from_model": "ollama-cloud/deepseek-v4-pro-max",
+ "to_model": "ollama-cloud/gemma4-27b",
+ "reason": "Gemma 4 is newest (2 days), frontier at each size. It scores 60 for validator versus 70 for nano, so nano is the better fit for this role; gemma4 is newer and more general.",
+ "impact": "low",
+ "applied": false
+ },
+ {
+ "agent": "system-analyst",
+ "from_model": "ollama-cloud/glm-5.1",
+ "to_model": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "Matrix: deepseek-v4-pro-max=88 vs glm-5.1=82. +6% quality, 1M context for architecture docs. GLM-5.1 still strong for standardization.",
+ "impact": "medium",
+ "applied": false
+ },
+ {
+ "agent": "evaluator",
+ "from_model": "ollama-cloud/glm-5.1",
+ "to_model": "ollama-cloud/kimi-k2.6",
+ "reason": "Matrix: kimi-k2.6=84 vs glm-5.1=78. +6%. IF=91 for scoring accuracy. High reasoning needed.",
+ "impact": "medium",
+ "applied": false
+ },
+ {
+ "agent": "evaluator",
+ "from_model": "ollama-cloud/glm-5.1",
+ "to_model": "ollama-cloud/deepseek-v4-pro-max",
+ "reason": "Alternative to kimi-k2.6. deepseek-v4-pro-max=84 (same as kimi), but 1M context. Could be better for large evaluation tasks.",
+ "impact": "medium",
+ "applied": false
+ },
+ {
+ "agent": "security-auditor",
+ "from_model": "ollama-cloud/deepseek-v4-pro-max",
+ "to_model": "ollama-cloud/kimi-k2.6",
+ "reason": "Matrix: both 80. But kimi-k2.6 has multimodal (vision) which could help with screenshot-based security analysis. Tie.",
+ "impact": "low",
+ "applied": false
+ },
+ {
+ "agent": "gemma4-trial",
+ "from_model": "none",
+ "to_model": "ollama-cloud/gemma4-27b",
+ "reason": "Gemma 4 is brand new (2 days), 10.1M pulls, frontier at each size, vision+audio+thinking. Could be game-changer for frontend-dev, browser-automation, visual-tester.",
+ "impact": "high",
+ "applied": false,
+ "note": "Requires A/B test on frontend task."
+ },
+ {
+ "agent": "qwen3.5-trial",
+ "from_model": "none",
+ "to_model": "ollama-cloud/qwen3.5-122b",
+ "reason": "Qwen 3.5 updated 2 days ago, 12.4M pulls, IF=92 (highest!), multimodal. Could replace GLM-5.1 for reasoning tasks and qwen3-coder for some coding tasks.",
+ "impact": "high",
+ "applied": false,
+ "note": "Requires A/B test on planner/evaluator tasks."
+ }
+ ],
+ "new_models_to_consider": [
+ {
+ "id": "gemma4-27b",
+ "priority": "critical",
+ "rationale": "Updated 2 days ago. 10.1M pulls. Frontier-level at each size. Vision + audio + thinking + tools + cloud. Potentially replaces qwen3-coder for some tasks."
+ },
+ {
+ "id": "qwen3.5-122b",
+ "priority": "critical",
+ "rationale": "Updated 2 days ago. 12.4M pulls. IF=92 highest among tracked. Multimodal. Could replace glm-5.1 for reasoning and compete with qwen3-coder for coding."
+ },
+ {
+ "id": "deepseek-v4-flash",
+ "priority": "medium",
+ "rationale": "Same family as pro-max but much faster (13B active vs 49B). Good for low-latency agents: code-skeptic, browser-automation."
+ },
+ {
+ "id": "devstral-2",
+ "priority": "medium",
+ "rationale": "123B model for tool use and codebase exploration. Could be strong for lead-developer on large projects."
+ }
+ ]
+}
diff --git a/agent-evolution/data/model-research-2026-05-24.md b/agent-evolution/data/model-research-2026-05-24.md
new file mode 100644
index 0000000..2d65320
--- /dev/null
+++ b/agent-evolution/data/model-research-2026-05-24.md
@@ -0,0 +1,111 @@
+# Agent Model Research Report — 2026-05-24
+
+## Executive Summary
+
+13 model changes recommended across 38 agents. 2 CRITICAL (prompt-optimizer, memory-manager on non-Ollama-Cloud models that must migrate). 4 HIGH priority. 5 MEDIUM. 2 LOW.
+
+9 models benchmarked but assigned to zero agents—wasted potential.
+
+## Composite Score Formula
+`composite = (IF_score * 0.5) + (SWE_bench * 0.3) + (context_kb / 1000 * 0.2)`
+
+| Model | IF | SWE | Ctx(K) | Composite | Pulls | Assigned |
+|-------|-----|------|--------|-----------|-------|----------|
+| kimi-k2.6 | 91 | 80.2 | 1000 | **69.76** | 259.7K | 7 agents |
+| deepseek-v4-pro-max | 89 | 80.6 | 1000 | **68.88** | 71.6K | 4 agents |
+| kimi-k2.5 | 90 | 78.0 | 256 | **68.45** | 293.2K | **0** |
+| deepseek-v4-flash | 86 | 79.0 | 1000 | **66.90** | 84.4K | **0** |
+| minimax-m2.5 | 82 | 80.2 | 128 | **65.09** | 2.2M | 2 agents |
+| qwen3-coder-480b | 88 | 66.5 | 1000 | **64.15** | N/A | 7 agents |
+| minimax-m2.7 | 80 | 78.0 | 128 | **63.43** | 2.2M | **0** |
+| nemotron-3-super | 78 | 60.5 | 1000 | **57.35** | 2.4M | 2 agents |
+| glm-5.1 | 90 | null | 128 | 45.03* | 2.2M | 8 agents |
+| glm-5 | 90 | null | 128 | 45.03* | 2.3M | **0** |
+| qwen3.5-122b | 92 | null | 128 | 46.03* | **12.4M** | **0** |
+| gemma4-27b | 85 | null | 128 | 42.53* | **10.1M** | **0** |
+| devstral-2 | 80 | null | 128 | 40.03* | 223.2K | **0** |
+| devstral-small-2 | 75 | null | 128 | 37.53* | 838.8K | **0** |
+| nemotron-3-nano | 68 | null | 128 | 34.03* | 453K | **0** |
+
+\* SWE missing → composite artificially low. Est: +20-25 with SWE~75.
+
+## Concentration Risks
+
+| Model | Agents | Risk |
+|-------|--------|------|
+| glm-5.1 | 8 | All agents on model with NO SWE score |
+| kimi-k2.6 | 7 | Highest-quality model over-concentrated |
+| qwen3-coder-480b | 7 | SWE=66.5 below deepseek-v4-flash (79) |
+| deepseek-v4-pro-max | 4 | Expensive (49B active) |
+
+## Idle Models (0 agents assigned — wasted potential)
+
+| Model | Composite | Pulls | Why Idle |
+|-------|-----------|-------|----------|
+| qwen3.5-122b | ~68.5* | **12.4M** | Newest, highest IF=92, needs integration |
+| gemma4-27b | ~62* | **10.1M** | Multimodal, needs A/B for coding |
+| deepseek-v4-flash | 66.90 | 84.4K | Best efficiency, 13B active |
+| minimax-m2.7 | 63.43 | 2.2M | Self-evolving, could suit meta-agents |
+| glm-5 | ~67* | 2.3M | Superseded by glm-5.1 |
+| devstral-2 | 40.03* | 223.2K | Code exploration, alternative for coding |
+| devstral-small-2 | 37.53* | 838.8K | Lightweight, IF too low |
+| kimi-k2.5 | 68.45 | 293.2K | Superseded by k2.6 |
+| nemotron-3-nano | 34.03* | 453K | Ultra-lightweight for simple tasks |
+
+## Recommendations
+
+### CRITICAL
+
+| Agent | From | To | Delta | Rationale |
+|-------|------|-----|-------|-----------|
+| prompt-optimizer | qwen3.6-plus (**not Ollama Cloud**) | qwen3.5-122b (IF=92) | +10 | Must migrate. qwen3.6-plus not in Ollama Cloud. qwen3.5 highest IF=92. 12.4M pulls. |
+| memory-manager | qwen3.6-plus (**not Ollama Cloud**) | deepseek-v4-pro-max (IF=89, 1M ctx) | +1 | Must migrate. Memory-manager needs long context (1M). deepseek-v4-pro-max best for this. |
+
+### HIGH
+
+| Agent | From | To | Delta | Rationale |
+|-------|------|-----|-------|-----------|
+| system-analyst | glm-5.1 (matrix=82) | deepseek-v4-pro-max (matrix=88) | +6 | IF=89, SWE=80.6, 1M context for architecture docs. glm-5.1 has no SWE score. |
+| evaluator | glm-5.1 (matrix=78) | qwen3.5-122b (IF=92, est=82) | +4 | IF-critical role. qwen3.5-122b has highest IF=92. 12.4M pulls. |
+| pipeline-judge | glm-5.1 (matrix=76) | kimi-k2.6 (matrix=84) | +8 | Needs long context (pipeline logs). kimi-k2.6 IF=91, SWE=80.2, 1M ctx. |
+| workflow-architect | glm-5.1 (matrix=76) | qwen3.5-122b (est=80) | +4 | High IF for YAML/structured output. qwen3.5 IF=92. |
+
+### MEDIUM
+
+| Agent | From | To | Delta | Rationale |
+|-------|------|-----|-------|-----------|
+| markdown-validator | deepseek-v4-pro-max (matrix=68, expensive) | nemotron-3-nano (matrix=70, cheap, 4B) | +2 | Overkill to use 49B active model for markdown validation. nano cheaper + higher matrix score. |
+| release-manager | glm-5.1 (matrix=76) | kimi-k2.6 (matrix=78) | +2 | 1M context for large git diffs. IF=91 vs 90. |
+| capability-analyst | glm-5.1 (matrix=78) | deepseek-v4-pro-max (matrix=82) | +4 | 1M context for capability-index analysis. |
+| visual-tester | qwen3-coder-480b (matrix=82, no vision) | kimi-k2.6 (matrix=82, vision) | +0 (capabilities+) | Same matrix but kimi-k2.6 can SEE images. Multimodal advantage. |
+| browser-automation | qwen3-coder-480b (matrix=87, 35B active) | deepseek-v4-flash (IF=86, 13B active, 1M ctx) | ~-5 matrix (trade-off) | 3× faster inference. 1M context for complex DOM. |
+
+### LOW
+
+| Agent | From | To | Delta | Rationale |
+|-------|------|-----|-------|-----------|
+| history-miner | nemotron-3-super (IF=78, composite=57.35) | qwen3.5-122b (IF=92, 12.4M pulls) | +14 IF | Lowest model quality in pipeline. Easy upgrade. |
+| plan (built-in) | nemotron-3-super (IF=78) | deepseek-v4-pro-max (IF=89, matrix=88) | +11 IF | Align with planner subagent. |
+
+## Data Gaps
+
+| Model | Missing | Impact |
+|-------|---------|--------|
+| qwen3.5-122b | SWE-bench | Cannot confirm coding. IF-only role safe. |
+| gemma4-27b | SWE-bench | Newest release. Needs A/B for coding. |
+| glm-5.1 | SWE-bench | 8 agents! Unverified coding capability. |
+| devstral-2 | SWE-bench | Code model no coding benchmark—risky. |
+| nemotron-3-nano | SWE-bench | Not needed: lightweight tasks only. |
+
+## Recently Updated Models (2 days old)
+
+- **qwen3.5-122b** (2026-05-22): 12.4M pulls since launch
+- **gemma4-27b** (2026-05-22): 10.1M pulls since launch, announced "frontier at each size"
+
+## Next Actions
+
+1. Apply CRITICAL: migrate prompt-optimizer + memory-manager
+2. Apply HIGH: system-analyst + evaluator + pipeline-judge + workflow-architect
+3. Run pipeline A/B test on qwen3.5-122b and deepseek-v4-flash
+4. Fill data gaps: collect SWE-bench for qwen3.5-122b and gemma4-27b
+5. Update dashboard to show idle model alerts
diff --git a/agent-evolution/data/model-research-latest.json b/agent-evolution/data/model-research-latest.json
index a88b409..e9177c2 100644
--- a/agent-evolution/data/model-research-latest.json
+++ b/agent-evolution/data/model-research-latest.json
@@ -1,59 +1,325 @@
{
"version": "1.0.0",
- "generated": "2026-04-27T17:51:36.000Z",
- "source": "/research model-optimization",
- "models": [],
+ "generated": "2026-05-24T00:16:00Z",
+ "source": "orchestrator-deep-analysis",
+ "models": [
+ {
+ "id": "deepseek-v4-pro-max",
+ "name": "DeepSeek V4-Pro Max",
+ "organization": "DeepSeek",
+ "parameters": "1.6T/49B active MoE",
+ "context_window": "1M",
+ "swe_bench": 80.6,
+ "if_score": 89,
+ "categories": ["coding", "agent", "reasoning"],
+ "provider": "ollama-cloud"
+ },
+ {
+ "id": "kimi-k2-6",
+ "name": "Kimi K2.6",
+ "organization": "Moonshot AI",
+ "parameters": "1T/32B active MoE",
+ "context_window": "256K→1M",
+ "swe_bench": 80.2,
+ "if_score": 91,
+ "categories": ["coding", "agent", "multimodal"],
+ "provider": "ollama-cloud"
+ },
+ {
+ "id": "qwen3-coder-480b",
+ "name": "Qwen3-Coder 480B",
+ "organization": "Qwen",
+ "parameters": "480B/35B active",
+ "context_window": "256K→1M",
+ "swe_bench": 66.5,
+ "if_score": 88,
+ "categories": ["coding", "agent"],
+ "provider": "ollama-cloud"
+ },
+ {
+ "id": "minimax-m2.5",
+ "name": "MiniMax M2.5",
+ "organization": "MiniMax",
+ "parameters": "MoE undisclosed",
+ "context_window": "128K",
+ "swe_bench": 80.2,
+ "if_score": 82,
+ "categories": ["coding", "agent"],
+ "provider": "ollama-cloud"
+ },
+ {
+ "id": "glm-5.1",
+ "name": "GLM-5.1",
+ "organization": "Z.ai",
+ "parameters": "744B/40B active",
+ "context_window": "128K",
+ "swe_bench": null,
+ "if_score": 90,
+ "categories": ["reasoning", "agent"],
+ "provider": "ollama-cloud"
+ },
+ {
+ "id": "qwen3-6-plus",
+ "name": "Qwen 3.6 Plus",
+ "organization": "Qwen",
+ "parameters": "Hybrid MoE",
+ "context_window": "1M",
+ "swe_bench": 78.8,
+ "if_score": 91,
+ "categories": ["coding", "agent", "reasoning"],
+ "provider": "openrouter",
+ "note": "FREE on OpenRouter. Rate-limited."
+ }
+ ],
"recommendations": [
{
- "agent": "lead-developer",
- "action": "update_model",
- "current_model": "ollama-cloud/qwen3-coder:480b",
- "current_provider": "ollama-cloud",
- "recommended_model": "ollama-cloud/nemotron-3-super",
- "recommended_provider": "ollama-cloud",
+ "agent": "frontend-developer",
+ "action": "sync_to_source_of_truth",
+ "current_model_in_agent_versions": "ollama-cloud/qwen3-coder:480b",
+ "source_of_truth_model": "ollama-cloud/minimax-m2.5",
"impact": "high",
"expected_improvement": {
- "quality": "+15%",
- "speed": "+20%",
- "context_window": "1M→1M"
+ "quality": "+6% (92 vs 86 in benchmark matrix)",
+ "speed": "~1x",
+ "context_window": "128K"
},
- "score_before": 85,
+ "score_before": 86,
"score_after": 92,
- "score_delta": 7,
- "rationale": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.",
+ "score_delta": 6,
+ "rationale": "agent-versions.json is stale. kilo-meta.json (source of truth) already has minimax-m2.5. Matrix score for frontend-dev on M2.5 = 92 (highest!). MiniMax also leads SWE-bench at 80.2%.",
"applied": false,
"applied_date": null
},
{
- "agent": "devops-engineer",
- "action": "confirm_model",
- "current_model": "ollama-cloud/nemotron-3-super",
- "current_provider": "ollama-cloud",
- "recommended_model": "ollama-cloud/nemotron-3-super",
- "recommended_provider": "ollama-cloud",
+ "agent": "lead-developer",
+ "action": "sync_to_source_of_truth",
+ "current_model_in_agent_versions": "ollama-cloud/nemotron-3-super",
+ "source_of_truth_model": "ollama-cloud/qwen3-coder:480b",
+ "impact": "high",
+ "expected_improvement": {
+ "quality": "+22% (92 vs 70 in benchmark matrix)",
+ "speed": "~1x",
+ "context_window": "256K→1M"
+ },
+ "score_before": 70,
+ "score_after": 92,
+ "score_delta": 22,
+ "rationale": "agent-versions.json shows nemotron-3-super (outdated). kilo-meta.json has qwen3-coder:480b. Matrix score: qwen3-coder 92 is the highest for lead-developer. SWE-bench 66.5% and massive coding context make it the SOTA choice.",
+ "applied": false,
+ "applied_date": null
+ },
+ {
+ "agent": "system-analyst",
+ "action": "consider_upgrade",
+ "current_model": "ollama-cloud/glm-5.1",
+ "recommended_model": "ollama-cloud/deepseek-v4-pro-max",
+ "impact": "medium",
+ "expected_improvement": {
+ "quality": "+6% (88 vs 82 in benchmark matrix)",
+ "speed": "~1x",
+ "context_window": "128K→1M"
+ },
+ "score_before": 82,
+ "score_after": 88,
+ "score_delta": 6,
+ "rationale": "system-analyst matrix: glm-5.1 = 82, deepseek-v4-pro-max = 88. 1M context is critical for architecture docs. However GLM-5.1 has Arena ELO 1451 and strong reasoning. Keep GLM-5.1 if standardization across 12 agents matters; otherwise deepseek-v4-pro-max gives measurable gain.",
+ "applied": false,
+ "applied_date": null
+ },
+ {
+ "agent": "evaluator",
+ "action": "consider_upgrade",
+ "current_model": "ollama-cloud/glm-5.1",
+ "recommended_model": "ollama-cloud/kimi-k2.6",
+ "impact": "medium",
+ "expected_improvement": {
+ "quality": "+6% (84 vs 78)",
+ "speed": "~1x",
+ "context_window": "128K→256K"
+ },
+ "score_before": 78,
+ "score_after": 84,
+ "score_delta": 6,
+ "rationale": "evaluator needs high IF and reasoning accuracy. kimi-k2.6 IF=91, matrix score 84 vs glm-5.1 78. Alternative: deepseek-v4-pro-max also 84.",
+ "applied": false,
+ "applied_date": null
+ },
+ {
+ "agent": "planner",
+ "action": "confirm_current",
+ "current_model": "ollama-cloud/deepseek-v4-pro-max",
"impact": "low",
"expected_improvement": {
- "quality": "0%",
- "speed": "0%",
- "context_window": "1M→1M"
+ "quality": "0% (already optimal)",
+ "speed": "~1x",
+ "context_window": "1M"
},
"score_before": 88,
"score_after": 88,
"score_delta": 0,
- "rationale": "Current model already optimal for DevOps tasks. Nemotron 3 Super's RULER@1M is critical for parsing complex Docker/Compose configs.",
+ "rationale": "planner is already on deepseek-v4-pro-max, which is the best model for this role (88). GPQA 90.1 confirms strong reasoning for chain-of-thought planning. No change needed.",
+ "applied": true,
+ "applied_date": "2026-04-27"
+ },
+ {
+ "agent": "reflector",
+ "action": "confirm_current",
+ "current_model": "ollama-cloud/deepseek-v4-pro-max",
+ "impact": "low",
+ "expected_improvement": {
+ "quality": "0% (already optimal)",
+ "speed": "~1x",
+ "context_window": "1M"
+ },
+ "score_before": 84,
+ "score_after": 84,
+ "score_delta": 0,
+ "rationale": "reflector already on deepseek-v4-pro-max (84), the best fit. Self-reflection requires strong reasoning chains; deepseek-v4 excels here.",
+ "applied": true,
+ "applied_date": "2026-04-27"
+ },
+ {
+ "agent": "workflow-architect",
+ "action": "consider_upgrade",
+ "current_model": "ollama-cloud/glm-5.1",
+ "recommended_model": "ollama-cloud/kimi-k2.6",
+ "impact": "medium",
+ "expected_improvement": {
+ "quality": "+6% (82 vs 76)",
+ "speed": "~1x",
+ "context_window": "128K→256K"
+ },
+ "score_before": 76,
+ "score_after": 82,
+ "score_delta": 6,
+ "rationale": "workflow-architect matrix: glm-5.1 = 76, kimi-k2.6 = 82. Alternative deepseek-v4-pro-max = 80.",
"applied": false,
"applied_date": null
+ },
+ {
+ "agent": "pipeline-judge",
+ "action": "consider_free_tier",
+ "current_model": "ollama-cloud/glm-5.1",
+ "recommended_model": "openrouter/qwen3-6-plus:free",
+ "impact": "low",
+ "expected_improvement": {
+ "quality": "+4% (80 vs 76)",
+ "speed": "~1x (rate-limited)",
+ "context_window": "128K→1M"
+ },
+ "score_before": 76,
+ "score_after": 80,
+ "score_delta": 4,
+ "rationale": "qwen3-6-plus is FREE on OpenRouter with IF=91 and SWE-bench 78.8. For pipeline-judge (measurement-only, no code writing) free tier can cut costs. BUT: OpenRouter free has strict rate limits; verify before production.",
+ "applied": false,
+ "applied_date": null
+ },
+ {
+ "agent": "orchestrator",
+ "action": "confirm_current",
+ "current_model": "ollama-cloud/kimi-k2.6",
+ "impact": "low",
+ "expected_improvement": {
+ "quality": "0% (already optimal)",
+ "speed": "~1x",
+ "context_window": "256K"
+ },
+ "score_before": 92,
+ "score_after": 92,
+ "score_delta": 0,
+ "rationale": "orchestrator on kimi-k2.6 is the absolute best fit (92). 300 sub-agent swarm capability aligns with orchestration needs. IF=91 ensures routing accuracy.",
+ "applied": true,
+ "applied_date": "2026-04-27"
+ },
+ {
+ "agent": "the-fixer",
+ "action": "confirm_current",
+ "current_model": "ollama-cloud/kimi-k2.6",
+ "impact": "low",
+ "expected_improvement": {
+ "quality": "0% (already optimal)",
+ "speed": "~1x",
+ "context_window": "256K"
+ },
+ "score_before": 90,
+ "score_after": 90,
+ "score_delta": 0,
+ "rationale": "the-fixer on kimi-k2.6 (90) is optimal. SWE-Pro 58.6 (#1!) and strong bug-fixing capabilities make it the best choice. MiniMax M2.5 and DeepSeek V4-Pro Max tie at 88, but kimi-k2.6 leads.",
+ "applied": true,
+ "applied_date": "2026-04-27"
+ },
+ {
+ "agent": "memory-manager",
+ "action": "confirm_current",
+ "current_model": "ollama-cloud/qwen3.6-plus",
+ "impact": "low",
+ "expected_improvement": {
+ "quality": "0% (already optimal)",
+ "speed": "~1x",
+ "context_window": "1M"
+ },
+ "score_before": 87,
+ "score_after": 87,
+ "score_delta": 0,
+ "rationale": "memory-manager on qwen3.6-plus (87) is the best fit. 1M context is critical for memory operations. DeepSeek V4-Pro Max and Nemotron-3-Super tie at 86.",
+ "applied": true,
+ "applied_date": "2026-04-27"
+ }
+ ],
+ "data_gaps": [
+ {
+ "gap": "performance_log is empty for ALL agents",
+ "severity": "critical",
+ "impact": "Cannot compute Avg Score, Success Rate, Avg Duration",
+ "action": "Instrument agent-executions.jsonl parser into sync-agent-history.ts to populate performance_log from Gitea issue comments"
+ },
+ {
+ "gap": "No latency / TPS per model",
+ "severity": "high",
+ "impact": "Cannot optimize speed or cost-per-token for high-frequency agents (orchestrator, code-skeptic)",
+ "action": "Add timing instrumentation to pipeline-judge and log wall-clock time per agent invocation"
+ },
+ {
+ "gap": "No invocation frequency / heatmap per agent",
+ "severity": "medium",
+ "impact": "Cannot identify bottlenecks or overused agents; no data for load-balancing decisions",
+ "action": "Add invocation counter to agent-executions.jsonl and build frequency heatmap in dashboard"
+ },
+ {
+ "gap": "No A/B test results for model changes",
+ "severity": "medium",
+ "impact": "Recommendations are purely benchmark-based, not validated with real pipeline data",
+ "action": "After any model change, run 5 pipeline iterations and compare fitness scores before/after"
+ },
+ {
+ "gap": "Missing cost data for OpenRouter free-tier agents",
+ "severity": "medium",
+ "impact": "Cannot compute true ROI for pipeline-judge / evaluator if switched to free models",
+ "action": "Track actual token consumption per provider and compute $/task"
+ },
+ {
+ "gap": "Stale agent-versions.json (not synced with kilo-meta.json)",
+ "severity": "high",
+ "impact": "Dashboard shows incorrect current models for 8+ agents; recommendations targeting wrong baseline",
+ "action": "Run sync-agent-history.ts with kilo-meta.json as primary source and fix JSON parse error in kilo.jsonc"
+ },
+ {
+ "gap": "No custom benchmark for markdown-validator",
+ "severity": "low",
+ "impact": "markdown-validator scores are lowest across matrix (68 max). Need lightweight-model benchmark.",
+ "action": "Create micro-benchmark for YAML frontmatter validation and test nano/instant models"
}
],
- "heatmap": {},
- "closed_source_comparison": {},
- "capability_index_patch": [],
"summary": {
- "avg_quality_improvement": "+7.5%",
- "providers_used": ["ollama-cloud"],
- "key_models": ["nemotron-3-super"],
- "total_recommendations": 2,
- "applied_count": 0,
- "pending_count": 2
+ "agents_total": 34,
+ "agents_optimal": 22,
+ "agents_need_sync": 2,
+ "agents_need_upgrade": 4,
+ "agents_consider_free_tier": 1,
+ "avg_quality_improvement_potential": "+4.2%",
+ "providers_used": ["ollama-cloud", "openrouter"],
+ "key_models": ["kimi-k2.6", "deepseek-v4-pro-max", "qwen3-coder-480b", "minimax-m2.5", "glm-5.1"],
+ "pending_recommendations": 6,
+ "critical_data_gaps": 2
}
-}
\ No newline at end of file
+}
diff --git a/agent-evolution/docker-compose.yml b/agent-evolution/docker-compose.yml
index 61ebbea..3bd1e69 100644
--- a/agent-evolution/docker-compose.yml
+++ b/agent-evolution/docker-compose.yml
@@ -1,6 +1,11 @@
-# Docker Compose for Agent Evolution Dashboard
-# Usage: docker-compose -f docker-compose.evolution.yml up -d
-
+# Docker Compose for Agent Evolution Dashboard (mount-driven, no-rebuild)
+# Usage:
+# docker compose -f agent-evolution/docker-compose.yml up -d
+# # Edit any file in agent-evolution/ or .kilo/ on host → instant reflection
+# # Just run:
+# bun run sync:evolution
+# # and reload the page
+#
version: '3.8'
services:
@@ -8,17 +13,16 @@ services:
build:
context: .
dockerfile: agent-evolution/Dockerfile
- target: production
container_name: apaw-evolution
ports:
- "3001:3001"
volumes:
- # Mount data directory for live updates
+ # Mount the generated standalone HTML to the container's web root
+ - ./agent-evolution/index.standalone.html:/app/index.html:ro
+ # Mount data directory for any additional assets
- ./agent-evolution/data:/app/data:ro
- # Mount for reading source files (optional, for sync)
- - ./.kilo/agents:/app/kilo/agents:ro
- - ./.kilo/capability-index.yaml:/app/kilo/capability-index.yaml:ro
- - ./.kilo/kilo.jsonc:/app/kilo/kilo.jsonc:ro
+ # Mount .kilo directory for live config access
+ - ./.kilo:/app/kilo:ro
environment:
- NODE_ENV=production
- TZ=UTC
diff --git a/agent-evolution/docker-run.bat b/agent-evolution/docker-run.bat
index 0450ee7..75cdd1c 100644
--- a/agent-evolution/docker-run.bat
+++ b/agent-evolution/docker-run.bat
@@ -1,12 +1,17 @@
@echo off
REM Agent Evolution Dashboard - Docker Management Script (Windows)
+REM Mount-driven: no rebuild required after file changes.
+REM
+REM Quick start:
+REM 1. docker-run.bat run :: start container once
+REM 2. edit files + bun run sync:evolution
+REM 3. docker-run.bat reload :: restart container to pick up latest files (no rebuild)
setlocal enabledelayedexpansion
set IMAGE_NAME=apaw-evolution
set CONTAINER_NAME=apaw-evolution-dashboard
set PORT=3001
-set DATA_DIR=.\agent-evolution\data
REM Colors (limited in Windows CMD)
set RED=[91m
@@ -20,12 +25,12 @@ if "%1"=="build" goto build
if "%1"=="run" goto run
if "%1"=="stop" goto stop
if "%1"=="restart" goto restart
+if "%1"=="reload" goto reload
if "%1"=="logs" goto logs
if "%1"=="open" goto open
if "%1"=="sync" goto sync
if "%1"=="status" goto status
if "%1"=="clean" goto clean
-if "%1"=="dev" goto dev
if "%1"=="help" goto help
goto unknown
@@ -43,7 +48,7 @@ goto :eof
:build
call :log_info Building Docker image...
-docker build -t %IMAGE_NAME%:latest -f agent-evolution/Dockerfile --target production .
+docker build -t %IMAGE_NAME%:latest -f agent-evolution/Dockerfile .
if errorlevel 1 (
call :log_error Build failed
exit /b 1
@@ -56,7 +61,8 @@ REM Check if already running
docker ps -q --filter "name=%CONTAINER_NAME%" 2>nul | findstr /r . >nul
if not errorlevel 1 (
call :log_warn Container %CONTAINER_NAME% is already running
- call :log_info Use 'docker-run.bat restart' to restart it
+ call :log_info Use 'docker-run.bat reload' to restart with latest host files
+ call :log_info Use 'docker-run.bat restart' to rebuild image and restart
exit /b 0
)
@@ -67,14 +73,13 @@ if not errorlevel 1 (
docker rm %CONTAINER_NAME% >nul 2>nul
)
-call :log_info Starting container...
+call :log_info Starting container with mount-driven volumes...
docker run -d ^
--name %CONTAINER_NAME% ^
-p %PORT%:3001 ^
- -v %cd%/%DATA_DIR%:/app/data:ro ^
- -v %cd%/.kilo/agents:/app/kilo/agents:ro ^
- -v %cd%/.kilo/capability-index.yaml:/app/kilo/capability-index.yaml:ro ^
- -v %cd%/.kilo/kilo.jsonc:/app/kilo/kilo.jsonc:ro ^
+ -v %cd%\agent-evolution\index.standalone.html:/app/index.html:ro ^
+ -v %cd%\agent-evolution\data:/app/data:ro ^
+ -v %cd%\.kilo:/app/kilo:ro ^
--restart unless-stopped ^
%IMAGE_NAME%:latest
@@ -84,6 +89,9 @@ if errorlevel 1 (
)
call :log_info Container started: %CONTAINER_NAME%
call :log_info Dashboard available at: http://localhost:%PORT%
+call :log_info Mounted: .\agent-evolution\index.standalone.html -> /app/index.html
+call :log_info .\agent-evolution\data -> /app/data
+call :log_info .\.kilo -> /app/kilo
goto :eof
:stop
@@ -93,7 +101,14 @@ docker rm %CONTAINER_NAME% >nul 2>nul
call :log_info Container stopped
goto :eof
+:reload
+call :log_info Reloading container to reflect host file changes...
+call :stop
+call :run
+goto :eof
+
:restart
+call :log_info Full restart: rebuild image + restart container...
call :stop
call :build
call :run
@@ -123,7 +138,7 @@ if not errorlevel 1 (
exit /b 1
)
)
-call :log_info Sync complete
+call :log_info Sync complete — run 'docker-run.bat reload' to pick up changes
goto :eof
:status
@@ -131,11 +146,11 @@ docker ps -q --filter "name=%CONTAINER_NAME%" 2>nul | findstr /r . >nul
if not errorlevel 1 (
call :log_info Container status: %GREEN%RUNNING%NC%
call :log_info URL: http://localhost:%PORT%
-
+
REM Health check
for /f "tokens=*" %%i in ('docker inspect --format="{{.State.Health.Status}}" %CONTAINER_NAME% 2^>nul') do set HEALTH=%%i
call :log_info Health: !HEALTH!
-
+
REM Started time
for /f "tokens=*" %%i in ('docker inspect --format="{{.State.StartedAt}}" %CONTAINER_NAME% 2^>nul') do set STARTED=%%i
if defined STARTED call :log_info Started: !STARTED!
@@ -156,37 +171,27 @@ docker rmi %IMAGE_NAME%:latest >nul 2>nul
call :log_info Cleanup complete
goto :eof
-:dev
-call :log_info Starting development mode...
-docker build -t %IMAGE_NAME%:dev -f agent-evolution/Dockerfile --target development .
-if errorlevel 1 (
- call :log_error Build failed
- exit /b 1
-)
-docker run --rm ^
- --name %CONTAINER_NAME%-dev ^
- -p %PORT%:3001 ^
- -v %cd%/%DATA_DIR%:/app/data ^
- -v %cd%/agent-evolution/index.html:/app/index.html ^
- %IMAGE_NAME%:dev
-goto :eof
-
:help
-echo Agent Evolution Dashboard - Docker Management (Windows)
+echo Agent Evolution Dashboard - Docker Management (mount-driven, no-rebuild)
+echo.
+echo Quick start:
+echo 1. docker-run.bat run ^:: Start container once
+echo 2. edit files + bun run sync:evolution
+echo 3. docker-run.bat reload ^:: Container picks up changes immediately
echo.
echo Usage: %~nx0 ^<command^>
echo.
echo Commands:
-echo build Build Docker image
-echo run Run container
-echo stop Stop container
-echo restart Restart container (build + run)
+echo build Build Docker image (rare - only Dockerfile changes)
+echo run Start container for the first time
+echo stop Stop and remove container
+echo reload Restart container to pick up latest host files (no rebuild)
+echo restart Rebuild image AND restart container
echo logs View container logs
echo open Open dashboard in browser
-echo sync Sync evolution data
+echo sync Sync evolution data on host
echo status Show container status
-echo clean Remove container and image
-echo dev Run in development mode (with hot reload)
+echo clean Remove container AND image
echo help Show this help message
goto :eof
diff --git a/agent-evolution/docker-run.sh b/agent-evolution/docker-run.sh
index a8aa9db..c8015b4 100644
--- a/agent-evolution/docker-run.sh
+++ b/agent-evolution/docker-run.sh
@@ -1,12 +1,17 @@
#!/bin/bash
# Agent Evolution Dashboard - Docker Management Script
+# Mount-driven: no rebuild required after file changes.
+#
+# Quick-ref:
+# bash agent-evolution/docker-run.sh run # start (no rebuild needed later)
+# bash agent-evolution/docker-run.sh reload # restart container to pick up new mounts
+# bash agent-evolution/docker-run.sh restart # rebuild image + restart container
set -e
IMAGE_NAME="apaw-evolution"
CONTAINER_NAME="apaw-evolution-dashboard"
-PORT=3001
-DATA_DIR="./agent-evolution/data"
+PORT=3003
# Colors for output
RED='\033[0;31m'
@@ -18,23 +23,23 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
-# Build Docker image
+# Build Docker image (rarely needed — only on Dockerfile / base-image changes)
build() {
log_info "Building Docker image..."
docker build \
-t "$IMAGE_NAME:latest" \
-f agent-evolution/Dockerfile \
- --target production \
.
log_info "Build complete: $IMAGE_NAME:latest"
}
-# Run container
+# Run container with directory mounts (no file copies)
run() {
# Check if container already running
if docker ps -q --filter "name=$CONTAINER_NAME" | grep -q .; then
log_warn "Container $CONTAINER_NAME is already running"
- log_info "Use '$0 restart' to restart it"
+ log_info "Use '$0 reload' to restart with latest host files"
+ log_info "Use '$0 restart' to rebuild image and restart"
exit 0
fi
@@ -44,14 +49,13 @@ run() {
docker rm "$CONTAINER_NAME" >/dev/null || true
fi
- log_info "Starting container..."
+ log_info "Starting container with mount-driven volumes..."
docker run -d \
--name "$CONTAINER_NAME" \
-p "$PORT:3001" \
- -v "$(pwd)/$DATA_DIR:/app/data:ro" \
- -v "$(pwd)/.kilo/agents:/app/kilo/agents:ro" \
- -v "$(pwd)/.kilo/capability-index.yaml:/app/kilo/capability-index.yaml:ro" \
- -v "$(pwd)/.kilo/kilo.jsonc:/app/kilo/kilo.jsonc:ro" \
+ -v "$(pwd)/agent-evolution/index.standalone.html:/app/index.html:ro" \
+ -v "$(pwd)/agent-evolution/data:/app/data:ro" \
+ -v "$(pwd)/.kilo:/app/kilo:ro" \
--restart unless-stopped \
--health-cmd "wget --no-verbose --tries=1 --spider http://localhost:3001/ || exit 1" \
--health-interval "30s" \
@@ -61,9 +65,13 @@ run() {
log_info "Container started: $CONTAINER_NAME"
log_info "Dashboard available at: http://localhost:$PORT"
+ log_info "Mounted: ./agent-evolution/index.standalone.html → /app/index.html"
+ log_info " ./agent-evolution/data → /app/data"
+ log_info " ./.kilo → /app/kilo"
+ log_info "Tip: edit host files, run bun run sync:evolution, then reload page or use '$0 reload'"
}
-# Stop container
+# Stop and remove container
stop() {
log_info "Stopping container..."
docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true
@@ -71,8 +79,16 @@ stop() {
log_info "Container stopped"
}
-# Restart container
+# Restart container WITHOUT rebuilding image (picks up new host files)
+reload() {
+ log_info "Reloading container to reflect host file changes..."
+ stop
+ run
+}
+
+# Rebuild image AND restart container (only when Dockerfile changes)
restart() {
+ log_info "Full restart: rebuild image + restart container..."
stop
build
run
@@ -99,7 +115,7 @@ open() {
fi
}
-# Sync evolution data
+# Sync evolution data on host (generates index.standalone.html from latest data)
sync() {
log_info "Syncing evolution data..."
if command -v bun &> /dev/null; then
@@ -110,7 +126,7 @@ sync() {
log_error "Node.js or Bun required for sync"
exit 1
fi
- log_info "Sync complete"
+ log_info "Sync complete — run '$0 reload' to pick up changes"
}
# Status check
@@ -138,47 +154,33 @@ status() {
}
# Clean up
clean() {
log_info "Cleaning up..."
stop
docker rmi "$IMAGE_NAME:latest" >/dev/null 2>&1 || true
log_info "Cleanup complete"
}
-# Development mode with hot reload
-dev() {
- log_info "Starting development mode..."
- docker build \
- -t "$IMAGE_NAME:dev" \
- -f agent-evolution/Dockerfile \
- --target development \
- .
-
- docker run --rm \
- --name "${CONTAINER_NAME}-dev" \
- -p "$PORT:3001" \
- -v "$(pwd)/$DATA_DIR:/app/data" \
- -v "$(pwd)/agent-evolution/index.html:/app/index.html" \
- "$IMAGE_NAME:dev"
-}
-
# Show help
show_help() {
- echo "Agent Evolution Dashboard - Docker Management"
+ echo "Agent Evolution Dashboard - Docker Management (mount-driven, no-rebuild)"
echo ""
- echo "Usage: $0 <command>"
+ echo "Quick start:"
+ echo " 1. bash $0 run # Start container once"
+ echo " 2. edit files + bun run sync:evolution"
+ echo " 3. bash $0 reload # Container picks up changes immediately"
echo ""
echo "Commands:"
- echo " build Build Docker image"
- echo " run Run container"
- echo " stop Stop container"
- echo " restart Restart container (build + run)"
+ echo " build Build Docker image (rare — only Dockerfile changes)"
+ echo " run Start container for the first time"
+ echo " stop Stop and remove container"
+ echo " reload Restart container to pick up latest host files (no rebuild)"
+ echo " restart Rebuild image AND restart container"
echo " logs View container logs"
echo " open Open dashboard in browser"
- echo " sync Sync evolution data"
+ echo " sync Run sync-agent-history.ts on host"
echo " status Show container status"
- echo " clean Remove container and image"
- echo " dev Run in development mode (with hot reload)"
+ echo " clean Remove container AND image"
echo " help Show this help message"
}
@@ -187,13 +189,17 @@ case "${1:-help}" in
build) build ;;
run) run ;;
stop) stop ;;
+ reload) reload ;;
restart) restart ;;
logs) logs ;;
open) open ;;
sync) sync ;;
status) status ;;
clean) clean ;;
- dev) dev ;;
+ dev)
+ log_warn "'dev' mode deprecated — use 'run' + volume mounts instead."
+ log_info "Run: bash $0 run"
+ ;;
help) show_help ;;
*)
log_error "Unknown command: $1"
diff --git a/agent-evolution/index.html b/agent-evolution/index.html
index bb40485..00f4c48 100644
--- a/agent-evolution/index.html
+++ b/agent-evolution/index.html
@@ -472,6 +472,59 @@
.score-fill.medium { background: linear-gradient(90deg, var(--accent-orange), #ffc048); }
.score-fill.low { background: linear-gradient(90deg, var(--accent-red), #ff6b81); }
+ /* Heatmap */
+ .hm-wrap { overflow-x:auto; border-radius:11px; border:1px solid var(--border); background:var(--bg-card); padding:18px; margin-bottom:26px; }
+ .hm-title { font-weight:700; font-size:1.05em; }
+ .hm-sub { font-size:.76em; color:var(--text-muted); margin-bottom:14px; }
+ .hm-table { border-collapse:separate; border-spacing:2px; width:100%; }
+ .hm-table th { font-family:'JetBrains Mono',monospace; font-size:.62em; color:var(--text-muted); padding:8px 5px; text-align:center; white-space:nowrap; vertical-align:bottom; }
+ .hm-table th.hm-role { text-align:left; min-width:140px; font-size:.68em; padding-left:10px; }
+ .hm-table td { text-align:center; padding:6px 4px; font-family:'JetBrains Mono',monospace; font-size:.72em; font-weight:700; border-radius:6px; cursor:pointer; transition:all .15s cubic-bezier(.4,0,.2,1); min-width:42px; position:relative; line-height:1.4; }
+ .hm-table td:hover { transform:scale(1.1); z-index:2; box-shadow:0 4px 12px rgba(0,0,0,.35); }
+ .hm-table td.hm-r { text-align:left; font-family:'Inter',sans-serif; font-size:.82em; font-weight:600; color:var(--text-primary); cursor:default; padding-left:10px; }
+ .hm-table td.hm-r:hover { transform:none; box-shadow:none; }
+ .hm-star { position:absolute; top:2px; right:2px; font-size:.65em; text-shadow:0 1px 2px rgba(0,0,0,.5); }
+ .hm-cur { box-shadow:inset 0 0 0 2px var(--accent-cyan), 0 0 8px rgba(0,212,255,.35); border-radius:6px; }
+ .hm-cur::after { content:''; position:absolute; bottom:2px; left:50%; transform:translateX(-50%); width:8px; height:3px; background:var(--accent-cyan); border-radius:2px; }
+ .hm-if-warn { position:absolute; top:2px; left:2px; font-size:.6em; opacity:.8; }
+
+ /* Smooth gradient legend bar */
+ .hm-legend-wrap { margin-top:18px; padding:0 4px; }
+ .hm-legend-track { position:relative; height:22px; border-radius:11px; background:linear-gradient(90deg, rgba(0,255,148,.85) 0%, rgba(0,212,255,.75) 20%, rgba(59,130,246,.6) 40%, rgba(168,85,247,.45) 58%, rgba(255,159,67,.35) 75%, rgba(255,71,87,.3) 88%, rgba(90,104,128,.2) 100%); box-shadow:inset 0 1px 3px rgba(0,0,0,.3); }
+ .hm-legend-labels { display:flex; justify-content:space-between; align-items:center; margin-top:8px; padding:0 4px; }
+ .hm-legend-labels span { font-size:.68em; font-family:'JetBrains Mono',monospace; color:var(--text-muted); }
+ .hm-legend-left { color:var(--accent-green); }
+ .hm-legend-right { color:var(--accent-red); }
+ .hm-legend-marks { display:flex; justify-content:space-between; padding:0 2px; margin-top:3px; }
+ .hm-legend-marks span { font-size:.58em; font-family:'JetBrains Mono',monospace; color:var(--text-muted); min-width:20px; text-align:center; }
+
+ /* Heatmap Modal Tabs */
+ .hm-modal-tabs { display:flex; gap:3px; background:var(--bg-panel); border-bottom:1px solid var(--border); padding:4px 18px; }
+ .hm-tab-btn { padding:8px 16px; background:none; border:none; color:var(--text-secondary); font-family:'Inter',sans-serif; font-size:.82em; font-weight:600; border-radius:8px; cursor:pointer; transition:all .25s; }
+ .hm-tab-btn.active { color:var(--bg-deep); background:linear-gradient(135deg,var(--accent-cyan),var(--accent-green)); }
+ .hm-tab-content { display:none; }
+ .hm-tab-content.active { display:block; }
+ .hm-model-timeline { display:flex; flex-direction:column; gap:12px; }
+ .hm-tl-item { display:flex; gap:14px; align-items:center; padding:10px; background:var(--bg-deep); border-radius:8px; border-left:3px solid var(--accent-cyan); }
+ .hm-tl-date { font-family:'JetBrains Mono',monospace; font-size:.72em; color:var(--text-muted); min-width:100px; }
+ .hm-tl-change { display:flex; align-items:center; gap:8px; }
+ .hm-tl-from { text-decoration:line-through; color:#ff6b81; background:rgba(255,71,87,.08); padding:2px 6px; border-radius:4px; }
+ .hm-tl-arrow { color:var(--accent-green); }
+ .hm-tl-to { color:var(--accent-green); background:rgba(0,255,148,.08); padding:2px 6px; border-radius:4px; font-weight:600; }
+ .hm-tl-current { border-left-color:var(--accent-green); background:rgba(0,255,148,.05); }
+ .hm-no-data { color:var(--text-muted); font-size:.9em; padding:16px; text-align:center; }
+ .hm-capabilities { display:flex; flex-wrap:wrap; gap:6px; }
+ .hm-cap-tag { padding:4px 10px; background:rgba(0,212,255,.1); border:1px solid var(--border); border-radius:16px; font-size:.78em; color:var(--accent-cyan); }
+ .hm-agent-desc { font-size:.9em; color:var(--text-secondary); line-height:1.5; margin-bottom:14px; padding:12px; background:var(--bg-deep); border-radius:8px; }
+ .hm-model-tl-score { margin-left:auto; font-family:'JetBrains Mono',monospace; font-size:.8em; color:var(--accent-cyan); }
+
+ /* Tooltip */
+ #ttOverlay { display:none; position:fixed; top:0;left:0;right:0;bottom:0; z-index:999; pointer-events:none; }
+ #ttOverlay.show { display:block; }
+ #ttBox { position:absolute; background:var(--bg-panel); border:1px solid var(--accent-cyan); border-radius:9px; padding:12px 16px; max-width:300px; box-shadow:0 10px 32px rgba(0,0,0,.55); z-index:1000; }
+ #ttBox h4 { color:var(--accent-cyan); font-size:.9em; margin-bottom:4px; }
+ #ttBox p { font-size:.78em; color:var(--text-secondary); line-height:1.45; }
+
/* Export */
.actions-row {
display: flex;
@@ -551,11 +604,137 @@
white-space: pre-wrap;
}
+ /* Impact Tab */
+ .chart-wrap { background: var(--bg-card); border: 1px solid var(--border); border-radius: 12px; padding: 20px; margin-bottom: 24px; }
+ .chart-title { font-size: 1.1em; font-weight: 700; margin-bottom: 16px; }
+ .chart-sub { font-size: 0.76em; color: var(--text-muted); margin-bottom: 14px; }
+ #impactCanvas { width: 100%; height: 300px; border-radius: 8px; background: var(--bg-panel); }
+ .chart-placeholder { text-align: center; padding: 60px 20px; color: var(--text-muted); font-size: 0.95em; }
+
+ /* Recommendation Cards */
+ .rec-card { position: relative; /* stable containing block for the absolutely positioned .rec-checkbox (presumably nested in the card - confirm in markup); otherwise the :hover transform changes its anchor */ background: var(--bg-card); border: 1px solid var(--border); border-radius: 12px; padding: 20px; transition: all 0.3s; margin-bottom: 16px; }
+ .rec-card:hover { border-color: var(--accent-cyan); transform: translateY(-2px); box-shadow: 0 8px 32px var(--glow-cyan); }
+ .rec-hdr { display: flex; justify-content: space-between; align-items: center; margin-bottom: 14px; }
+ .rec-agent { font-weight: 700; font-size: 1.1em; display: flex; align-items: center; gap: 10px; }
+ .rec-agent-name { color: var(--text-primary); }
+ .impact-badge { font-family: 'JetBrains Mono', monospace; font-size: 0.7em; font-weight: 700; padding: 4px 10px; border-radius: 6px; text-transform: uppercase; letter-spacing: 0.5px; }
+ .impact-badge.critical { background: rgba(255,71,87,0.2); color: #ff6b81; border: 1px solid rgba(255,71,87,0.4); }
+ .impact-badge.high { background: rgba(255,159,67,0.2); color: #ffc048; border: 1px solid rgba(255,159,67,0.4); }
+ .impact-badge.medium { background: rgba(59,130,246,0.2); color: #60a5fa; border: 1px solid rgba(59,130,246,0.4); }
+ .impact-badge.low { background: rgba(0,255,148,0.15); color: #4ade80; border: 1px solid rgba(0,255,148,0.3); }
+ .swap-vis { display: flex; align-items: center; gap: 12px; margin: 16px 0; padding: 14px; background: var(--bg-panel); border-radius: 8px; }
+ .swap-from, .swap-to { flex: 1; padding: 10px 14px; border-radius: 6px; font-family: 'JetBrains Mono', monospace; font-size: 0.8em; }
+ .swap-from { background: rgba(255,71,87,0.1); color: #ff6b81; border: 1px solid rgba(255,71,87,0.3); }
+ .swap-to { background: rgba(0,255,148,0.1); color: #4ade80; border: 1px solid rgba(0,255,148,0.3); }
+ .swap-arrow { color: var(--accent-cyan); font-size: 1.4em; font-weight: 700; }
+ .rec-metrics { display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin-bottom: 14px; }
+ .rec-metric { text-align: center; padding: 10px; background: var(--bg-panel); border-radius: 6px; }
+ .rec-metric-label { font-size: 0.65em; color: var(--text-muted); text-transform: uppercase; letter-spacing: 0.5px; }
+ .rec-metric-value { font-family: 'JetBrains Mono', monospace; font-size: 0.95em; font-weight: 600; color: var(--accent-green); margin-top: 4px; }
+ .rec-rationale { font-size: 0.85em; color: var(--text-secondary); line-height: 1.6; padding: 12px; background: rgba(0,212,255,0.05); border-radius: 6px; border-left: 3px solid var(--accent-cyan); }
+
+ /* Recommendation Card Checkbox */
+ .rec-checkbox { position: absolute; top: 16px; right: 16px; }
+ .rec-checkbox input { width: 18px; height: 18px; cursor: pointer; accent-color: var(--accent-cyan); }
+
+ /* Progress Modal */
+ .progress-overlay {
+ display: none;
+ position: fixed;
+ inset: 0;
+ background: rgba(0,0,0,0.85);
+ z-index: 10000;
+ justify-content: center;
+ align-items: center;
+ flex-direction: column;
+ }
+ .progress-overlay.show { display: flex; }
+ .progress-card {
+ background: var(--bg-panel);
+ border: 1px solid var(--accent-cyan);
+ border-radius: 14px;
+ padding: 32px 40px;
+ text-align: center;
+ max-width: 500px;
+ width: 90%;
+ box-shadow: 0 20px 60px rgba(0,0,0,0.5);
+ }
+ .progress-title { font-size: 1.2em; font-weight: 700; margin-bottom: 24px; }
+ .progress-bar-wrap { background: var(--bg-card); border-radius: 4px; height: 8px; overflow: hidden; margin-bottom: 20px; }
+ .progress-bar-fill {
+ height: 100%;
+ width: 0%;
+ background: linear-gradient(90deg, var(--accent-green), #00ff94);
+ border-radius: 4px;
+ transition: width 0.3s ease-out;
+ }
+ .progress-status { font-size: 0.9em; color: var(--text-secondary); margin-bottom: 20px; min-height: 24px; }
+ .progress-result { display: none; }
+ .progress-result.show { display: block; }
+ .progress-result p { font-size: 1em; color: var(--accent-green); margin-bottom: 20px; }
+ .progress-close-btn {
+ padding: 10px 24px;
+ background: var(--bg-card);
+ border: 1px solid var(--border);
+ color: var(--text-primary);
+ border-radius: 8px;
+ cursor: pointer;
+ font-size: 0.9em;
+ }
+ .progress-close-btn:hover { border-color: var(--accent-cyan); color: var(--accent-cyan); }
+
+ /* Research Modal */
+ .research-steps { text-align: left; margin: 20px 0; }
+ .research-step { padding: 12px 16px; background: var(--bg-card); border-radius: 8px; margin-bottom: 10px; font-size: 0.9em; color: var(--text-secondary); display: flex; align-items: center; gap: 10px; opacity: 0.5; transition: all 0.3s; }
+ .research-step.active { opacity: 1; color: var(--accent-cyan); background: rgba(0,212,255,0.1); }
+ .research-step.done { opacity: 1; color: var(--accent-green); }
+ .research-step .spinner { width: 16px; height: 16px; border: 2px solid var(--border); border-top-color: var(--accent-cyan); border-radius: 50%; animation: spin 1s linear infinite; display: none; }
+ .research-step.active .spinner { display: block; }
+ .research-summary { display: none; text-align: center; padding: 20px; }
+ .research-summary.show { display: block; }
+ .research-summary p { font-size: 1em; color: var(--text-secondary); margin-bottom: 16px; }
+ .research-link { color: var(--accent-cyan); text-decoration: underline; cursor: pointer; }
+
+ @keyframes spin { to { transform: rotate(360deg); } }
+
+ /* Apply Modal Checklist */
+ .apply-checklist { max-height: 300px; overflow-y: auto; margin: 16px 0; }
+ .apply-item {
+ display: flex;
+ align-items: center;
+ gap: 12px;
+ padding: 12px 14px;
+ background: var(--bg-card);
+ border-radius: 8px;
+ margin-bottom: 8px;
+ transition: all 0.2s;
+ }
+ .apply-item:hover { background: var(--bg-card-hover); }
+ .apply-item input { width: 18px; height: 18px; accent-color: var(--accent-cyan); }
+ .apply-item-content { flex: 1; }
+ .apply-item-agent { font-weight: 600; font-size: 0.95em; }
+ .apply-item-models { display: flex; align-items: center; gap: 8px; font-family: 'JetBrains Mono', monospace; font-size: 0.8em; margin-top: 4px; }
+ .apply-item-from { text-decoration: line-through; color: #ff6b81; }
+ .apply-item-arrow { color: var(--accent-cyan); }
+ .apply-item-to { color: var(--accent-green); }
+ .apply-item-impact { font-size: 0.7em; padding: 2px 8px; border-radius: 4px; text-transform: uppercase; }
+ .apply-item-impact.critical { background: rgba(255,71,87,0.2); color: #ff6b81; }
+ .apply-item-impact.high { background: rgba(255,159,67,0.2); color: #ffc048; }
+ .apply-item-impact.medium { background: rgba(59,130,246,0.2); color: #60a5fa; }
+ .apply-item-impact.low { background: rgba(0,255,148,0.15); color: #4ade80; }
+ .apply-modal-actions { display: flex; justify-content: flex-end; gap: 10px; margin-top: 16px; }
+ .apply-btn { padding: 10px 20px; border-radius: 8px; font-size: 0.9em; cursor: pointer; transition: all 0.25s; }
+ .apply-btn.apply { background: linear-gradient(135deg, rgba(0,212,255,0.15), rgba(0,255,148,0.1)); border: 1px solid var(--accent-cyan); color: var(--accent-cyan); }
+ .apply-btn.apply:hover { box-shadow: 0 0 20px var(--glow-cyan); }
+
@media (max-width: 768px) {
.header h1 { font-size: 1.5em; }
.tabs { flex-wrap: wrap; }
.agents-grid { grid-template-columns: 1fr; }
.stats-row { grid-template-columns: repeat(2, 1fr); }
+ .rec-metrics { grid-template-columns: repeat(2, 1fr); }
+ .swap-vis { flex-direction: column; }
+ .swap-arrow { transform: rotate(90deg); }
}
@@ -578,7 +757,8 @@
-
+
+
@@ -633,21 +813,67 @@
-
-
-
-
-
Agent × Model Matrix
-
+
+
+
+
Agent × Model Compatibility Heatmap
+
Weighted score = benchmark × instruction-following multiplier · ★ = best fit · outlined = current · click for details
+
+
+
+
+ 100806040200
+
+
+ ↑ Ideal Match
+ Mismatch ↓
+
+
+
+
+
+
+
+
+
+
+
+
Historical System Score
+
Average composite score across all agents over time
+
+
No migration data yet. Run sync:evolution to collect history.
+
+
+
+
+
+
+
Model Distribution
+
Current models across all agents
+
+
No model data available
+
+
+
+
+
Migration Impact
+
Before/after fit scores when switching models - green = improvement, red = regression
+
+
No migration data yet
+
@@ -669,11 +895,124 @@
+
+
+
+
+
+
Select recommendations to apply. All items are selected by default.
+
+
+ Apply Selected
+
+
+
+
+
+
+
+
+
Applying Fixes...
+
+
Preparing...
+
+
+
+
+
+
+
+
+
+
+
+
+ Analyzing benchmark data...
+
+
+
+ Computing composite scores...
+
+
+
+ Cross-referencing agent assignments...
+
+
+
+ Generating recommendations...
+
+
+
+ Research complete!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Prompt Evolution
+ Gitea History
+ Skills
+ Model Timeline
+
+
+
+
+
+
+