diff --git a/.kilo/agents/browser-automation.md b/.kilo/agents/browser-automation.md index 8e2ba56..013f7c0 100755 --- a/.kilo/agents/browser-automation.md +++ b/.kilo/agents/browser-automation.md @@ -1,7 +1,7 @@ --- description: Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction (GNS-2 Tier 0) mode: subagent -model: ollama-cloud/qwen3-coder:480b +model: ollama-cloud/deepseek-v4-flash color: "#1E88E5" permission: read: allow diff --git a/.kilo/agents/capability-analyst.md b/.kilo/agents/capability-analyst.md index 851ec22..a1f279f 100755 --- a/.kilo/agents/capability-analyst.md +++ b/.kilo/agents/capability-analyst.md @@ -1,7 +1,7 @@ --- description: Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. Tier 2 meta-agent with self-cascade enabled. mode: subagent -model: ollama-cloud/glm-5.1 +model: ollama-cloud/deepseek-v4-pro-max color: "#6366F1" permission: read: allow diff --git a/.kilo/agents/history-miner.md b/.kilo/agents/history-miner.md index d1dd2fb..7b776bd 100755 --- a/.kilo/agents/history-miner.md +++ b/.kilo/agents/history-miner.md @@ -1,7 +1,7 @@ --- description: Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work (GNS-2 Tier 0) mode: subagent -model: ollama-cloud/nemotron-3-super +model: ollama-cloud/qwen3.5-122b color: "#059669" permission: read: allow diff --git a/.kilo/agents/markdown-validator.md b/.kilo/agents/markdown-validator.md index 6463400..152d840 100755 --- a/.kilo/agents/markdown-validator.md +++ b/.kilo/agents/markdown-validator.md @@ -1,7 +1,7 @@ --- description: Validates and corrects Markdown descriptions for Gitea issues (GNS-2 Tier 0) mode: subagent -model: ollama-cloud/deepseek-v4-pro-max +model: ollama-cloud/nemotron-3-nano color: "#F97316" permission: read: allow diff --git a/.kilo/agents/release-manager.md b/.kilo/agents/release-manager.md index 
e02809b..180f9e7 100755 --- a/.kilo/agents/release-manager.md +++ b/.kilo/agents/release-manager.md @@ -1,7 +1,7 @@ --- description: Manages git operations, semantic versioning, branching, and deployments. Ensures clean history (GNS-2 Tier 1) mode: subagent -model: ollama-cloud/glm-5.1 +model: ollama-cloud/kimi-k2.6 color: "#581C87" permission: read: allow diff --git a/.kilo/capability-index.yaml b/.kilo/capability-index.yaml index 86de347..f39913e 100644 --- a/.kilo/capability-index.yaml +++ b/.kilo/capability-index.yaml @@ -412,7 +412,7 @@ agents: - screenshots forbidden: - unit_testing - model: ollama-cloud/qwen3-coder:480b + model: ollama-cloud/deepseek-v4-flash mode: subagent delegates_to: - orchestrator @@ -501,7 +501,7 @@ agents: - new_agent_specs forbidden: - implementation - model: ollama-cloud/glm-5.1 + model: ollama-cloud/deepseek-v4-pro-max mode: subagent delegates_to: - agent-architect @@ -585,7 +585,7 @@ agents: forbidden: - code_changes - feature_development - model: ollama-cloud/glm-5.1 + model: ollama-cloud/kimi-k2.6 mode: subagent delegates_to: - evaluator @@ -734,7 +734,7 @@ agents: - corrections forbidden: - content_creation - model: ollama-cloud/deepseek-v4-pro-max + model: ollama-cloud/nemotron-3-nano mode: subagent delegates_to: - orchestrator diff --git a/agent-evolution/Dockerfile b/agent-evolution/Dockerfile index e60fca5..c7dfe16 100644 --- a/agent-evolution/Dockerfile +++ b/agent-evolution/Dockerfile @@ -1,30 +1,24 @@ # Agent Evolution Dashboard Dockerfile -# Standalone version - works from file:// or HTTP +# Mount-required version: all content is mounted via volumes. +# No file copies into the image — rebuild is never required for data changes. +# +# Build once: +# docker build -t apaw-evolution -f agent-evolution/Dockerfile . 
+# +# Workflow: +# bun run sync:evolution # host-side — regenerates index.standalone.html +# bash agent-evolution/docker-run.sh reload # container restarts with new mounts -# Build stage - run sync to generate standalone HTML -FROM oven/bun:1 AS builder - -WORKDIR /build - -# Copy config files for sync -COPY .kilo/agents/*.md ./.kilo/agents/ -COPY .kilo/capability-index.yaml ./.kilo/ -COPY .kilo/kilo.jsonc ./.kilo/ -COPY agent-evolution/ ./agent-evolution/ - -# Run sync to generate standalone HTML with embedded data -RUN bun agent-evolution/scripts/sync-agent-history.ts || true - -# Production stage - Python HTTP server -FROM python:3.12-alpine AS production +FROM python:3.12-alpine WORKDIR /app -# Copy standalone HTML (embedded data) -COPY --from=builder /build/agent-evolution/index.standalone.html ./index.html +# Placeholder content until host mounts the real index.standalone.html +RUN echo 'APAW Evolution Dashboard

Mount required

Run bun run sync:evolution on the host, then reload the container.

' > index.html -# Expose port EXPOSE 3001 -# Simple HTTP server (no CORS issues) +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD wget --no-verbose --tries=1 --spider http://127.0.0.1:3001/ || exit 1 + CMD ["python3", "-m", "http.server", "3001"] \ No newline at end of file diff --git a/agent-evolution/data/agent-versions.json b/agent-evolution/data/agent-versions.json index a3e7889..8a397de 100644 --- a/agent-evolution/data/agent-versions.json +++ b/agent-evolution/data/agent-versions.json @@ -1,22 +1,17 @@ { "version": "1.0.0", - "lastUpdated": "2026-04-27T20:28:58.592Z", + "lastUpdated": "2026-05-25T13:37:20.281Z", "agents": { "lead-developer": { "current": { - "description": "Primary code writer for backend and core logic. Writes implementation to pass tests", + "description": "Primary code writer for backend and core logic. Writes implementation to pass tests (GNS-2 Tier 1)", "mode": "subagent", - "model": "ollama-cloud/nemotron-3-super", + "model": "ollama-cloud/qwen3-coder:480b", "provider": "Ollama", "variant": "thinking", "color": "\"#DC2626\"", "category": "General", - "capabilities": [ - "code_writing", - "refactoring", - "bug_fixing", - "implementation" - ] + "capabilities": [] }, "history": [ { @@ -25,47 +20,39 @@ "type": "model_change", "from": null, "to": "ollama-cloud/qwen3-coder:480b", - "reason": "Initial configuration from capability-index.yaml", + "reason": "Initial configuration", "source": "git" }, { - "date": "2026-04-27T16:56:09.013Z", + "date": "2026-04-27T16:56:09Z", "commit": "model-research-sync", "type": "model_change", "from": "ollama-cloud/qwen3-coder:480b", "to": "ollama-cloud/nemotron-3-super", - "reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. 
SWE-bench 68% vs Qwen's 66.5%.", + "reason": "Nemotron 3 Super has better reasoning", "source": "research" }, { - "date": "2026-04-27T20:28:58.592Z", - "commit": "model-research-sync", + "date": "2026-05-24T01:00:00Z", + "commit": "ollama-cloud-consolidation", "type": "model_change", - "from": "ollama-cloud/qwen3-coder:480b", - "to": "ollama-cloud/nemotron-3-super", - "reason": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.", - "source": "research" + "from": "ollama-cloud/nemotron-3-super", + "to": "ollama-cloud/qwen3-coder:480b", + "reason": "Reverted to qwen3-coder: SWE-bench 66.5% is coding-benchmark standard. Matrix score 92 vs nemotron 70.", + "source": "orchestrator-analysis" } ], "performance_log": [] }, "frontend-developer": { "current": { - "description": "Handles UI implementation with multimodal capabilities. Accepts visual references like screenshots and mockups", + "description": "Handles UI implementation with multimodal capabilities. Accepts visual references like screenshots and mockups (GNS-2 Tier 1)", "mode": "all", - "model": "ollama-cloud/qwen3-coder:480b", + "model": "ollama-cloud/minimax-m2.5", "provider": "Ollama", "color": "\"#0EA5E9\"", "category": "General", - "capabilities": [ - "ui_implementation", - "component_creation", - "styling", - "responsive_design", - "nextjs_development", - "vue_nuxt_development", - "react_development" - ] + "capabilities": [] }, "history": [ { @@ -76,48 +63,41 @@ "to": "ollama-cloud/qwen3-coder:480b", "reason": "Flutter development support added", "source": "git" + }, + { + "date": "2026-04-27T17:00:00Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/qwen3-coder:480b", + "to": "ollama-cloud/minimax-m2.5", + "reason": "Matrix score 92 for frontend on M2.5. 
SWE-bench 80.2%.", + "source": "research" } ], "performance_log": [] }, "backend-developer": { "current": { - "description": "Backend specialist for Node.js, Express, APIs, and database integration", + "description": "Backend specialist for Node.js, Express, APIs, and database integration (GNS-2 Tier 1)", "mode": "subagent", "model": "ollama-cloud/qwen3-coder:480b", "provider": "Ollama", "color": "\"#10B981\"", "category": "General", - "capabilities": [ - "api_development", - "database_design", - "server_logic", - "authentication", - "postgresql_integration", - "sqlite_integration" - ] + "capabilities": [] }, "history": [], "performance_log": [] }, "go-developer": { "current": { - "description": "Go backend specialist for Gin, Echo, APIs, and database integration", + "description": "Go backend specialist for Gin, Echo, APIs, and database integration (GNS-2 Tier 1)", "mode": "subagent", - "model": "ollama-cloud/qwen3-coder:480b", + "model": "ollama-cloud/deepseek-v4-pro-max", "provider": "Ollama", "color": "\"#00ADD8\"", "category": "General", - "capabilities": [ - "go_api_development", - "go_database_design", - "go_concurrent_programming", - "go_authentication", - "go_microservices", - "postgresql_integration", - "sqlite_integration", - "clickhouse_integration" - ] + "capabilities": [] }, "history": [ { @@ -126,64 +106,57 @@ "type": "model_change", "from": "ollama-cloud/deepseek-v3.2", "to": "ollama-cloud/qwen3-coder:480b", - "reason": "Qwen3-Coder optimized for Go development", + "reason": "Qwen3-Coder optimized for Go", "source": "git" + }, + { + "date": "2026-04-27T17:00:00Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/qwen3-coder:480b", + "to": "ollama-cloud/deepseek-v4-pro-max", + "reason": "Matrix score 88 for go-dev on V4-Pro. DeepSeek traditionally strong in Go/Rust.", + "source": "research" } ], "performance_log": [] }, "sdet-engineer": { "current": { - "description": "Writes tests following TDD methodology. 
Tests MUST fail initially (Red phase)", + "description": "Writes tests following TDD methodology. Tests MUST fail initially (Red phase) (GNS-2 Tier 1)", "mode": "all", "model": "ollama-cloud/qwen3-coder:480b", "provider": "Ollama", "variant": "thinking", "color": "\"#8B5CF6\"", "category": "General", - "capabilities": [ - "unit_tests", - "integration_tests", - "e2e_tests", - "test_planning", - "visual_regression" - ] + "capabilities": [] }, "history": [], "performance_log": [] }, "code-skeptic": { "current": { - "description": "Adversarial code reviewer. Finds problems and issues. Does NOT suggest implementations", + "description": "Adversarial code reviewer. Finds problems and issues. Does NOT suggest implementations (GNS-2 Tier 0)", "mode": "subagent", "model": "ollama-cloud/minimax-m2.5", "provider": "Ollama", "color": "\"#E11D48\"", "category": "General", - "capabilities": [ - "code_review", - "security_review", - "style_check", - "issue_identification" - ] + "capabilities": [] }, "history": [], "performance_log": [] }, "security-auditor": { "current": { - "description": "Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets", - "mode": "all", - "model": "ollama-cloud/nemotron-3-super", + "description": "Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets (GNS-2 Tier 0)", + "mode": "subagent", + "model": "ollama-cloud/deepseek-v4-pro-max", "provider": "Ollama", - "color": "\"#7F1D1D\"", + "color": "\"#DC2626\"", "category": "General", - "capabilities": [ - "vulnerability_scan", - "owasp_check", - "secret_detection", - "auth_review" - ] + "capabilities": [] }, "history": [ { @@ -192,26 +165,30 @@ "type": "model_change", "from": "ollama-cloud/deepseek-v3.2", "to": "ollama-cloud/nemotron-3-super", - "reason": "Nemotron 3 Super optimized for security analysis with RULER@1M", + "reason": "Nemotron 3 Super optimized for security analysis", "source": "git" + }, + { + "date": 
"2026-05-24T01:00:00Z", + "commit": "ollama-cloud-consolidation", + "type": "model_change", + "from": "ollama-cloud/nemotron-3-super", + "to": "ollama-cloud/deepseek-v4-pro-max", + "reason": "V4-Pro Max matrix=80 vs nemotron=76. SWE-V 80.6, 1M context.", + "source": "orchestrator-analysis" } ], "performance_log": [] }, "performance-engineer": { "current": { - "description": "Reviews code for performance issues. Focuses on efficiency, N+1 queries, memory leaks, and algorithmic complexity", + "description": "Reviews code for performance issues. Focuses on efficiency, N+1 queries, memory leaks, and algorithmic complexity (GNS-2 Tier 0)", "mode": "all", - "model": "ollama-cloud/nemotron-3-super", + "model": "ollama-cloud/deepseek-v4-pro-max", "provider": "Ollama", "color": "\"#0D9488\"", "category": "General", - "capabilities": [ - "performance_analysis", - "n_plus_one_detection", - "memory_leak_check", - "algorithm_analysis" - ] + "capabilities": [] }, "history": [ { @@ -222,68 +199,54 @@ "to": "ollama-cloud/nemotron-3-super", "reason": "Better reasoning for performance analysis", "source": "git" + }, + { + "date": "2026-05-24T01:00:00Z", + "commit": "ollama-cloud-consolidation", + "type": "model_change", + "from": "ollama-cloud/nemotron-3-super", + "to": "ollama-cloud/deepseek-v4-pro-max", + "reason": "Matrix=84 for perf-engineer on V4-Pro. 
GPQA 90.1 for reasoning.", + "source": "orchestrator-analysis" } ], "performance_log": [] }, "browser-automation": { "current": { - "description": "Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction", + "description": "Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction (GNS-2 Tier 0)", "mode": "subagent", "model": "ollama-cloud/qwen3-coder:480b", "provider": "Ollama", "color": "\"#1E88E5\"", "category": "General", - "capabilities": [ - "e2e_browser_tests", - "form_filling", - "navigation_testing", - "screenshot_capture" - ] + "capabilities": [] }, "history": [], "performance_log": [] }, "visual-tester": { "current": { - "description": "Visual regression testing agent that captures screenshots, extracts UI elements with bounding boxes, compares via pixelmatch, and detects console/network errors", + "description": "Visual regression testing agent that compares screenshots and detects UI differences using pixelmatch and image diff (GNS-2 Tier 0)", "mode": "subagent", "model": "ollama-cloud/qwen3-coder:480b", "provider": "Ollama", "color": "\"#E91E63\"", "category": "General", - "capabilities": [ - "visual_regression", - "pixel_comparison", - "screenshot_diff", - "ui_validation", - "bbox_element_extraction", - "console_error_detection", - "network_error_detection", - "responsive_layout_check", - "button_overflow_detection", - "gitea_integration", - "docker_networking" - ] + "capabilities": [] }, "history": [], "performance_log": [] }, "system-analyst": { "current": { - "description": "Designs technical specifications, data schemas, and API contracts before implementation", + "description": "Designs technical specifications, data schemas, and API contracts before implementation (GNS-2 Tier 1)", "mode": "subagent", - "model": "ollama-cloud/nemotron-3-super", + "model": "ollama-cloud/deepseek-v4-pro-max", "provider": "Ollama", - "variant": "thinking", 
"color": "\"#0891B2\"", "category": "General", - "capabilities": [ - "architecture_design", - "api_specification", - "database_modeling", - "technical_documentation" - ] + "capabilities": [] }, "history": [ { @@ -292,11 +255,11 @@ "type": "model_change", "from": "ollama-cloud/gpt-oss:120b", "to": "ollama-cloud/glm-5", - "reason": "GLM-5 better for system engineering and architecture", + "reason": "GLM-5 better for system engineering", "source": "git" }, { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "ollama-cloud/glm-5", @@ -305,32 +268,44 @@ "source": "git" }, { - "date": "2026-04-27T16:59:52.825Z", + "date": "2026-04-27T16:59:52Z", "commit": "model-research-sync", "type": "model_change", "from": "ollama-cloud/glm-5.1", "to": "ollama-cloud/nemotron-3-super", "reason": "Test recommendation for model research sync script", "source": "research" + }, + { + "date": "2026-05-24T01:00:00Z", + "commit": "ollama-cloud-consolidation", + "type": "model_change", + "from": "ollama-cloud/nemotron-3-super", + "to": "ollama-cloud/glm-5.1", + "reason": "Reverted: GLM-5.1 Arena ELO 1451, instruction following ~90. 
Standardization with 12 other agents.", + "source": "orchestrator-analysis" + }, + { + "date": "2026-05-25T13:37:20.281Z", + "commit": "sync", + "type": "model_change", + "from": "ollama-cloud/glm-5.1", + "to": "ollama-cloud/deepseek-v4-pro-max", + "reason": "Model update from sync", + "source": "git" } ], "performance_log": [] }, "requirement-refiner": { "current": { - "description": "Converts vague ideas and bug reports into strict User Stories with acceptance criteria checklists", + "description": "Converts vague ideas and bug reports into strict User Stories with acceptance criteria checklists (GNS-2 Tier 1)", "mode": "all", - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/kimi-k2-thinking", "provider": "Ollama", "variant": "thinking", "color": "\"#4F46E5\"", - "category": "General", - "capabilities": [ - "requirement_analysis", - "user_story_creation", - "acceptance_criteria", - "clarification" - ] + "category": "General" }, "history": [ { @@ -339,39 +314,51 @@ "type": "model_change", "from": "ollama-cloud/nemotron-3-super", "to": "ollama-cloud/glm-5", - "reason": "+33% quality. GLM-5 excels at requirement analysis and system engineering", + "reason": "+33% quality. GLM-5 excels at requirement analysis", "source": "research" }, { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "ollama-cloud/glm-5", "to": "ollama-cloud/glm-5.1", "reason": "Model update from sync", "source": "git" + }, + { + "date": "2026-05-24T01:00:00Z", + "commit": "ollama-cloud-consolidation", + "type": "model_change", + "from": "ollama-cloud/glm-5.1", + "to": "ollama-cloud/kimi-k2.6", + "reason": "kimi-k2.6 IF=91 highest, multimodal for mockup understanding. 
Matrix ~88-90 for req-refiner.", + "source": "orchestrator-analysis" + }, + { + "date": "2026-05-23T23:35:02.184Z", + "commit": "sync", + "type": "model_change", + "from": "ollama-cloud/kimi-k2.6", + "to": "ollama-cloud/kimi-k2-thinking", + "reason": "Model update from sync", + "source": "git" } ], "performance_log": [] }, "history-miner": { "current": { - "description": "Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work", + "description": "Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work (GNS-2 Tier 0)", "mode": "subagent", "model": "ollama-cloud/nemotron-3-super", "provider": "Ollama", "color": "\"#059669\"", - "category": "General", - "capabilities": [ - "git_search", - "duplicate_detection", - "past_solution_finder", - "pattern_identification" - ] + "category": "General" }, "history": [ { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "ollama-cloud/glm-5", @@ -384,18 +371,13 @@ }, "capability-analyst": { "current": { - "description": "Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components.", + "description": "Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. 
Tier 2 meta-agent with self-cascade enabled.", "mode": "subagent", "model": "ollama-cloud/glm-5.1", "provider": "Ollama", "color": "\"#6366F1\"", "category": "General", - "capabilities": [ - "gap_analysis", - "capability_mapping", - "recommendation_generation", - "coverage_analysis" - ] + "capabilities": [] }, "history": [ { @@ -404,11 +386,11 @@ "type": "model_change", "from": "ollama-cloud/nemotron-3-super", "to": "openrouter/qwen/qwen3.6-plus:free", - "reason": "+23% quality, IF:90 score, 1M context, FREE via OpenRouter", + "reason": "+23% quality, IF:90, FREE via OpenRouter", "source": "research" }, { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "openrouter/qwen/qwen3.6-plus:free", @@ -421,51 +403,50 @@ }, "orchestrator": { "current": { - "description": "Main dispatcher. Routes tasks between agents based on Issue status and manages the workflow state machine. IF:90 for optimal routing accuracy.", + "description": "Main dispatcher. Routes tasks between agents based on Issue status and manages the workflow state machine. IF:90 for optimal routing accuracy. (GNS-2 Tier 1)", "mode": "all", - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/kimi-k2.6", "provider": "Ollama", "variant": "thinking", "color": "\"#7C3AED\"", "category": "General", - "capabilities": [ - "task_routing", - "state_management", - "agent_coordination", - "workflow_execution" - ] + "capabilities": [] }, "history": [ { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "ollama-cloud/glm-5", "to": "ollama-cloud/glm-5.1", "reason": "Model update from sync", "source": "git" + }, + { + "date": "2026-04-27T20:28:58Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/glm-5.1", + "to": "ollama-cloud/kimi-k2.6", + "reason": "kimi-k2.6 best fit for orchestration (92). 
300 sub-agent swarm.", + "source": "research" } ], "performance_log": [] }, "release-manager": { "current": { - "description": "Manages git operations, semantic versioning, branching, and deployments. Ensures clean history", + "description": "Manages git operations, semantic versioning, branching, and deployments. Ensures clean history (GNS-2 Tier 1)", "mode": "subagent", "model": "ollama-cloud/glm-5.1", "provider": "Ollama", "color": "\"#581C87\"", "category": "General", - "capabilities": [ - "git_operations", - "version_management", - "changelog_creation", - "deployment" - ] + "capabilities": [] }, "history": [ { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "ollama-cloud/devstral-2:123b", @@ -478,19 +459,14 @@ }, "evaluator": { "current": { - "description": "Scores agent effectiveness after task completion for continuous improvement", + "description": "Scores agent effectiveness after task completion for continuous improvement. 
Tier 2 meta-agent with self-cascade enabled.", "mode": "subagent", - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/qwen3.5-122b", "provider": "Ollama", "variant": "thinking", "color": "\"#047857\"", "category": "General", - "capabilities": [ - "performance_scoring", - "process_analysis", - "pattern_identification", - "improvement_recommendations" - ] + "capabilities": [] }, "history": [ { @@ -512,31 +488,35 @@ "source": "research" }, { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "openrouter/qwen/qwen3.6-plus:free", "to": "ollama-cloud/glm-5.1", "reason": "Model update from sync", "source": "git" + }, + { + "date": "2026-05-25T13:37:20.281Z", + "commit": "sync", + "type": "model_change", + "from": "ollama-cloud/glm-5.1", + "to": "ollama-cloud/qwen3.5-122b", + "reason": "Model update from sync", + "source": "git" } ], "performance_log": [] }, "prompt-optimizer": { "current": { - "description": "Improves agent system prompts based on performance failures. Meta-learner for prompt optimization", + "description": "Improves agent system prompts based on performance failures. 
Meta-learner for prompt optimization (GNS-2 Tier 1)", "mode": "subagent", - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/qwen3.5-122b", "provider": "Ollama", - "variant": "instant", "color": "\"#BE185D\"", "category": "General", - "capabilities": [ - "prompt_analysis", - "prompt_improvement", - "failure_pattern_detection" - ] + "capabilities": [] }, "history": [ { @@ -549,48 +529,66 @@ "source": "git" }, { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "ollama-cloud/nemotron-3-super", "to": "ollama-cloud/glm-5.1", "reason": "Model update from sync", "source": "git" + }, + { + "date": "2026-05-24T01:00:00Z", + "commit": "ollama-cloud-consolidation", + "type": "model_change", + "from": "ollama-cloud/glm-5.1", + "to": "ollama-cloud/qwen3.5", + "reason": "MIGRATION: qwen3.6-plus was OpenRouter (not Ollama Cloud). qwen3.5 has IF=92, updated 2 days ago, 12.4M pulls.", + "source": "orchestrator-analysis" + }, + { + "date": "2026-05-23T23:35:02.184Z", + "commit": "sync", + "type": "model_change", + "from": "ollama-cloud/qwen3.5", + "to": "ollama-cloud/qwen3.6-plus", + "reason": "Model update from sync", + "source": "git" + }, + { + "date": "2026-05-25T13:37:20.281Z", + "commit": "sync", + "type": "model_change", + "from": "ollama-cloud/qwen3.6-plus", + "to": "ollama-cloud/qwen3.5-122b", + "reason": "Model update from sync", + "source": "git" } ], "performance_log": [] }, "the-fixer": { "current": { - "description": "Iteratively fixes bugs based on specific error reports and test failures", + "description": "Iteratively fixes bugs based on specific error reports and test failures (GNS-2 Tier 1)", "mode": "all", - "model": "ollama-cloud/minimax-m2.5", + "model": "ollama-cloud/kimi-k2.6", "provider": "Ollama", "color": "\"#F59E0B\"", "category": "General", - "capabilities": [ - "bug_fixing", - "issue_resolution", - "code_correction" - ] + "capabilities": [] }, "history": [], 
"performance_log": [] }, "product-owner": { "current": { - "description": "Manages issue checklists, status labels, tracks progress and coordinates with human users", + "description": "Manages issue checklists, status labels, tracks progress and coordinates with human users (GNS-2 Tier 1)", "mode": "subagent", "model": "ollama-cloud/glm-5.1", "provider": "Ollama", "color": "\"#EA580C\"", "category": "General", - "capabilities": [ - "issue_management", - "prioritization", - "backlog_management", - "workflow_completion" - ] + "capabilities": [] }, "history": [ { @@ -603,7 +601,7 @@ "source": "git" }, { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "ollama-cloud/glm-5", @@ -616,45 +614,46 @@ }, "workflow-architect": { "current": { - "description": "Creates and maintains workflow definitions with complete architecture, Gitea integration, and quality gates", + "description": "Creates and maintains workflow definitions with complete architecture, Gitea integration, and quality gates (GNS-2 Tier 1)", "mode": "subagent", - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/qwen3.5-122b", "provider": "Ollama", "variant": "thinking", "color": "\"#EC4899\"", "category": "General", - "capabilities": [ - "workflow_design", - "process_definition", - "automation_setup" - ] + "capabilities": [] }, "history": [ { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "ollama-cloud/glm-5", "to": "ollama-cloud/glm-5.1", "reason": "Model update from sync", "source": "git" + }, + { + "date": "2026-05-25T13:37:20.281Z", + "commit": "sync", + "type": "model_change", + "from": "ollama-cloud/glm-5.1", + "to": "ollama-cloud/qwen3.5-122b", + "reason": "Model update from sync", + "source": "git" } ], "performance_log": [] }, "markdown-validator": { "current": { - "description": "Validates and corrects Markdown descriptions for Gitea issues", + 
"description": "Validates and corrects Markdown descriptions for Gitea issues (GNS-2 Tier 0)", "mode": "subagent", - "model": "ollama-cloud/nemotron-3-nano:30b", + "model": "ollama-cloud/deepseek-v4-pro-max", "provider": "Ollama", "color": "\"#F97316\"", "category": "General", - "capabilities": [ - "markdown_validation", - "formatting_check", - "link_validation" - ] + "capabilities": [] }, "history": [ { @@ -665,6 +664,24 @@ "to": "ollama-cloud/nemotron-3-nano:30b", "reason": "Nano efficient for lightweight validation tasks", "source": "git" + }, + { + "date": "2026-05-24T01:00:00Z", + "commit": "ollama-cloud-consolidation", + "type": "model_change", + "from": "ollama-cloud/nemotron-3-nano:30b", + "to": "ollama-cloud/nemotron-3-nano", + "reason": "Unified naming. Nano IF=68, tiny and cheap, perfect for validation.", + "source": "orchestrator-analysis" + }, + { + "date": "2026-05-23T23:35:02.185Z", + "commit": "sync", + "type": "model_change", + "from": "ollama-cloud/nemotron-3-nano", + "to": "ollama-cloud/deepseek-v4-pro-max", + "reason": "Model update from sync", + "source": "git" } ], "performance_log": [] @@ -673,17 +690,12 @@ "current": { "name": "Agent Architect", "mode": "subagent", - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/kimi-k2.6", "provider": "Ollama", - "variant": "thinking", - "description": "Creates, modifies, and reviews new agents, workflows, and skills based on capability gap analysis", + "description": "Creates, modifies, and reviews new agents, workflows, and skills based on capability gap analysis. 
Tier 2 meta-agent with self-cascade enabled.", "color": "\"#8B5CF6\"", "category": "General", - "capabilities": [ - "agent_design", - "prompt_engineering", - "capability_definition" - ] + "capabilities": [] }, "history": [ { @@ -692,36 +704,39 @@ "type": "model_change", "from": "ollama-cloud/nemotron-3-super", "to": "openrouter/qwen/qwen3.6-plus:free", - "reason": "+22% quality, IF:90 for YAML frontmatter generation, 1M context for all agents analysis", + "reason": "+22% quality, IF:90 for YAML frontmatter generation", "source": "research" }, { - "date": "2026-04-23T06:24:32.546Z", + "date": "2026-04-23T06:24:32Z", "commit": "sync", "type": "model_change", "from": "openrouter/qwen/qwen3.6-plus:free", "to": "ollama-cloud/glm-5.1", "reason": "Model update from sync", "source": "git" + }, + { + "date": "2026-05-24T01:00:00Z", + "commit": "ollama-cloud-consolidation", + "type": "model_change", + "from": "ollama-cloud/glm-5.1", + "to": "ollama-cloud/kimi-k2.6", + "reason": "kimi-k2.6 best fit for agent-architect (86). 
Multimodal for reviewing UI components.", + "source": "orchestrator-analysis" } ], "performance_log": [] }, "planner": { "current": { - "description": "Advanced task planner using Chain of Thought, Tree of Thoughts, and Plan-Execute-Reflect", + "description": "Advanced task planner using Chain of Thought, Tree of Thoughts, and Plan-Execute-Reflect (GNS-2 Tier 0)", "mode": "subagent", - "model": "ollama-cloud/nemotron-3-super", + "model": "ollama-cloud/deepseek-v4-pro-max", "provider": "Ollama", "color": "\"#F59E0B\"", "category": "General", - "capabilities": [ - "task_decomposition", - "chain_of_thought", - "tree_of_thoughts", - "plan_execute_reflect", - "dependency_analysis" - ] + "capabilities": [] }, "history": [ { @@ -732,25 +747,28 @@ "to": "ollama-cloud/nemotron-3-super", "reason": "Nemotron 3 Super excels at planning", "source": "git" + }, + { + "date": "2026-04-27T17:00:00Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/nemotron-3-super", + "to": "ollama-cloud/deepseek-v4-pro-max", + "reason": "Matrix score 88 for planner on V4-Pro. 
GPQA 90.1.", + "source": "research" } ], "performance_log": [] }, "reflector": { "current": { - "description": "Self-reflection agent using Reflexion pattern - learns from mistakes", + "description": "Self-reflection agent using Reflexion pattern - learns from mistakes (GNS-2 Tier 0)", "mode": "subagent", - "model": "ollama-cloud/nemotron-3-super", + "model": "ollama-cloud/deepseek-v4-pro-max", "provider": "Ollama", "color": "\"#10B981\"", "category": "General", - "capabilities": [ - "self_reflection", - "mistake_analysis", - "lesson_extraction", - "trajectory_analysis", - "heuristic_evaluation" - ] + "capabilities": [] }, "history": [ { @@ -761,25 +779,28 @@ "to": "ollama-cloud/nemotron-3-super", "reason": "Better for reflection tasks", "source": "git" + }, + { + "date": "2026-04-27T17:00:00Z", + "commit": "model-research-sync", + "type": "model_change", + "from": "ollama-cloud/nemotron-3-super", + "to": "ollama-cloud/deepseek-v4-pro-max", + "reason": "Matrix score 84. Strong reasoning chains.", + "source": "research" } ], "performance_log": [] }, "memory-manager": { "current": { - "description": "Manages agent memory systems - short-term (context), long-term (vector store), and episodic (experiences)", + "description": "Manages agent memory systems - short-term (context), long-term (vector store), and episodic (experiences) (GNS-2 Tier 0)", "mode": "subagent", - "model": "ollama-cloud/nemotron-3-super", + "model": "ollama-cloud/deepseek-v4-pro-max", "provider": "Ollama", "color": "\"#8B5CF6\"", "category": "General", - "capabilities": [ - "memory_retrieval", - "memory_storage", - "memory_consolidation", - "relevance_scoring", - "episodic_management" - ] + "capabilities": [] }, "history": [ { @@ -790,44 +811,59 @@ "to": "ollama-cloud/nemotron-3-super", "reason": "RULER@1M critical for memory ctx", "source": "git" + }, + { + "date": "2026-05-24T01:00:00Z", + "commit": "ollama-cloud-consolidation", + "type": "model_change", + "from": "ollama-cloud/nemotron-3-super", 
+ "to": "ollama-cloud/deepseek-v4-pro-max", + "reason": "MIGRATION: qwen3.6-plus was OpenRouter. deepseek-v4-pro-max has 1M context (same as nemotron), matrix 86, SWE-V 80.6.", + "source": "orchestrator-analysis" + }, + { + "date": "2026-05-23T23:35:02.184Z", + "commit": "sync", + "type": "model_change", + "from": "ollama-cloud/deepseek-v4-pro-max", + "to": "ollama-cloud/qwen3.6-plus", + "reason": "Model update from sync", + "source": "git" + }, + { + "date": "2026-05-25T13:37:20.281Z", + "commit": "sync", + "type": "model_change", + "from": "ollama-cloud/qwen3.6-plus", + "to": "ollama-cloud/deepseek-v4-pro-max", + "reason": "Model update from sync", + "source": "git" } ], "performance_log": [] }, "devops-engineer": { "current": { - "description": "DevOps specialist for Docker, Kubernetes, CI/CD pipeline automation, and infrastructure management", + "description": "DevOps specialist for Docker, Kubernetes, CI/CD pipeline automation, and infrastructure management (GNS-2 Tier 1)", "mode": "subagent", - "model": "ollama-cloud/nemotron-3-super", + "model": "ollama-cloud/kimi-k2.6", "provider": "Ollama", "color": "\"#FF6B35\"", "category": "General", - "capabilities": [ - "docker_configuration", - "kubernetes_setup", - "ci_cd_pipeline", - "infrastructure_automation", - "container_optimization" - ] + "capabilities": [] }, "history": [], "performance_log": [] }, "flutter-developer": { "current": { - "description": "Flutter mobile specialist for cross-platform apps, state management, and UI components", + "description": "Flutter mobile specialist for cross-platform apps, state management, and UI components (GNS-2 Tier 1)", "mode": "subagent", "model": "ollama-cloud/qwen3-coder:480b", "provider": "Ollama", "color": "\"#02569B\"", "category": "General", - "capabilities": [ - "dart_programming", - "flutter_ui", - "mobile_app_development", - "widget_creation", - "state_management" - ] + "capabilities": [] }, "history": [ { @@ -844,100 +880,153 @@ }, "architect-indexer": { 
"current": { - "description": "Indexes and maps project codebase architecture into .architect/ directory. Creates and maintains structured documentation of entities, APIs, DB schema, file graphs, and conventions.", + "description": "Indexes and maps project codebase architecture into .architect/ directory. Creates and maintains structured documentation of entities, APIs, DB schema, file graphs, and conventions. (GNS-2 Tier 0)", "mode": "subagent", "model": "ollama-cloud/glm-5.1", "provider": "Ollama", "variant": "thinking", "color": "\"#10B981\"", "category": "General", - "capabilities": [ - "codebase_indexing", - "project_mapping", - "architecture_documentation", - "dependency_analysis", - "entity_extraction", - "api_surface_discovery", - "convention_detection", - "staleness_detection" - ] + "capabilities": [] }, "history": [], "performance_log": [] }, "php-developer": { "current": { - "description": "PHP backend specialist for Laravel, Symfony, WordPress, and full-stack web applications", + "description": "PHP backend specialist for Laravel, Symfony, WordPress, and full-stack web applications (GNS-2 Tier 1)", "mode": "subagent", "model": "ollama-cloud/qwen3-coder:480b", "provider": "Ollama", "variant": "thinking", "color": "\"#8B5CF6\"", "category": "General", - "capabilities": [ - "php_web_development", - "laravel_development", - "symfony_development", - "wordpress_development", - "php_api_development", - "php_database_design", - "php_authentication", - "php_modular_architecture", - "php_testing", - "php_security" - ] + "capabilities": [] }, "history": [], "performance_log": [] }, "pipeline-judge": { "current": { - "description": "Automated pipeline judge. Evaluates workflow execution by running tests, measuring token cost and wall-clock time. Produces objective fitness scores. Never writes code - only measures and scores.", + "description": "Automated pipeline judge. Evaluates workflow execution by running tests, measuring token cost and wall-clock time. 
Produces objective fitness scores. Never writes code - only measures and scores. (GNS-2 Tier 0)", "mode": "subagent", - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/kimi-k2.6", "provider": "Ollama", "color": "\"#DC2626\"", "category": "General", - "capabilities": [ - "test_execution", - "fitness_scoring", - "metric_collection", - "bottleneck_detection" - ] + "capabilities": [] }, "history": [ { - "date": "2026-04-06T00:23:50 +0100Z", + "date": "2026-04-06T00:23:50+0100Z", "commit": "fa68141d", "type": "agent_created", "from": null, "to": "", "reason": "feat: add pipeline-judge agent and evolution workflow system", "source": "git" + }, + { + "date": "2026-05-25T13:37:20.281Z", + "commit": "sync", + "type": "model_change", + "from": "ollama-cloud/glm-5.1", + "to": "ollama-cloud/kimi-k2.6", + "reason": "Model update from sync", + "source": "git" } ], "performance_log": [] }, "python-developer": { "current": { - "description": "Python backend specialist for Django, FastAPI, data science, and API development", + "description": "Python backend specialist for Django, FastAPI, data science, and API development (GNS-2 Tier 1)", "mode": "subagent", "model": "ollama-cloud/qwen3-coder:480b", "provider": "Ollama", "variant": "thinking", "color": "\"#3776AB\"", "category": "General", - "capabilities": [ - "python_web_development", - "django_development", - "fastapi_development", - "python_api_development", - "python_database_design", - "python_authentication", - "python_async_patterns", - "python_testing", - "python_security" - ] + "capabilities": [] + }, + "history": [], + "performance_log": [] + }, + "incident-responder": { + "current": { + "description": "Server incident response and system hardening specialist. Handles live forensics, malware removal, persistence hunting, SSH-based server cleanup, and post-incident hardening. 
Works with any OS and panel.", + "mode": "subagent", + "model": "ollama-cloud/kimi-k2.6", + "provider": "Ollama", + "color": "\"#B91C1C\"", + "category": "General", + "capabilities": [] + }, + "history": [], + "performance_log": [] + }, + "workflow-cross-checker": { + "current": { + "description": "Workflow cross-checker and process inspector. Analyzes inter-agent interaction logic, prevents conflicting tasks between agents, validates conformance to project architecture, tracks current state, and asks uncomfortable but important questions before expensive work begins.", + "mode": "subagent", + "model": "ollama-cloud/kimi-k2.6", + "provider": "Ollama", + "variant": "thinking", + "color": "\"#9333EA\"", + "category": "General", + "capabilities": [] + }, + "history": [], + "performance_log": [] + }, + "code": { + "current": { + "model": "ollama-cloud/qwen3-coder:480b", + "provider": "Ollama", + "category": "Built-in", + "mode": "primary", + "color": "#3B82F6", + "description": "Primary code writer. Full tool access for development tasks.", + "capabilities": [] + }, + "history": [], + "performance_log": [] + }, + "ask": { + "current": { + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama", + "category": "Built-in", + "mode": "primary", + "color": "#3B82F6", + "description": "Read-only Q&A agent for codebase questions.", + "capabilities": [] + }, + "history": [], + "performance_log": [] + }, + "plan": { + "current": { + "model": "ollama-cloud/nemotron-3-super", + "provider": "Ollama", + "category": "Built-in", + "mode": "primary", + "color": "#3B82F6", + "description": "Task planner. Creates detailed implementation plans.", + "capabilities": [] + }, + "history": [], + "performance_log": [] + }, + "debug": { + "current": { + "model": "ollama-cloud/glm-5.1", + "provider": "Ollama", + "category": "Built-in", + "mode": "primary", + "color": "#3B82F6", + "description": "Bug diagnostics and troubleshooting. 
GLM-5.1 ★88, reasoning for deep debug.", + "capabilities": [] }, "history": [], "performance_log": [] @@ -955,10 +1044,10 @@ } }, "evolution_metrics": { - "total_agents": 32, + "total_agents": 38, "agents_with_history": 22, "pending_recommendations": 0, - "last_sync": "2026-04-23T06:24:32.546Z", + "last_sync": "2026-05-25T13:37:20.282Z", "sync_sources": [ "git", "capability-index.yaml", diff --git a/agent-evolution/data/model-benchmarks.json b/agent-evolution/data/model-benchmarks.json index c17d33f..96253bf 100644 --- a/agent-evolution/data/model-benchmarks.json +++ b/agent-evolution/data/model-benchmarks.json @@ -1,1718 +1,851 @@ -{ - "version": "1.0.0", - "generated": "2026-04-30T07:00:00Z", - "source": "capability-index.yaml v3 optimal", - "total_agents": 30, - "total_models_tracked": 11, - "providers": [ - "ollama", - "ollama-cloud", - "openrouter", - "groq" - ], - "models": [ - { - "id": "qwen3-coder-480b", - "name": "Qwen3-Coder 480B", - "organization": "Qwen", - "parameters": "480B/35B active", - "context_window": "256K\u21921M", - "swe_bench": 66.5, - "if_score": 88, - "categories": [ - "coding", - "agent" - ], - "description": "SOTA open-source \u043a\u043e\u0434\u0438\u043d\u0433. \u0421\u0440\u0430\u0432\u043d\u0438\u043c \u0441 Claude Sonnet 4.", - "tags": [ - "coding", - "agent", - "tools" - ], - "openrouter": false, - "provider": "ollama" - }, - { - "id": "minimax-m2.5", - "name": "MiniMax M2.5", - "organization": "MiniMax", - "parameters": "MoE undisclosed", - "context_window": "128K", - "swe_bench": 80.2, - "if_score": 82, - "categories": [ - "coding", - "agent" - ], - "description": "\u041b\u0438\u0434\u0435\u0440 SWE-bench 80.2%. 
\u041f\u043e\u043b\u043d\u044b\u0439 lifecycle \u0440\u0430\u0437\u0440\u0430\u0431\u043e\u0442\u043a\u0438.", - "tags": [ - "coding", - "agent" - ], - "openrouter": false, - "provider": "ollama" - }, - { - "id": "minimax-m2.7", - "name": "MiniMax M2.7", - "organization": "MiniMax", - "parameters": "~10B active", - "context_window": "128K", - "swe_bench": 78, - "if_score": 80, - "categories": [ - "coding", - "agent", - "efficient" - ], - "description": "\u0421\u0430\u043c\u043e\u043e\u0431\u0443\u0447\u0430\u0435\u043c\u0430\u044f. 56.2% SWE-Pro. 100 TPS. $0.30/M.", - "tags": [ - "coding", - "agent", - "self-evolving" - ], - "openrouter": false, - "provider": "ollama" - }, - { - "id": "deepseek-v4-pro-max", - "name": "DeepSeek V4-Pro", - "organization": "DeepSeek", - "parameters": "1.6T/49B active MoE", - "context_window": "1M", - "swe_bench": 80.6, - "if_score": 89, - "categories": [ - "coding", - "agent", - "reasoning" - ], - "description": "SWE-V 80.6, LiveCodeBench 93.5(#1!), Terminal-Bench 67.9, Codeforces 3206, 1M ctx, 27% FLOPs vs V3.2. MIT.", - "tags": [ - "coding", - "agent", - "thinking", - "tools" - ], - "openrouter": false, - "provider": "ollama-cloud" - }, - { - "id": "deepseek-v4-flash", - "name": "DeepSeek V4-Pro", - "organization": "DeepSeek", - "parameters": "284B/13B active MoE", - "context_window": "1M", - "swe_bench": 79, - "if_score": 86, - "categories": [ - "coding", - "efficient", - "agent" - ], - "description": "SWE-V ~79%, Flash Max = Pro \u0443\u0440\u043e\u0432\u0435\u043d\u044c reasoning. 13B active = \u0443\u043b\u044c\u0442\u0440\u0430\u0431\u044b\u0441\u0442\u0440\u044b\u0439. 1M ctx. FP4+FP8. 
MIT.", - "tags": [ - "coding", - "efficient", - "agent", - "thinking" - ], - "openrouter": false, - "provider": "ollama-cloud" - }, - { - "id": "kimi-k2-6", - "name": "Kimi K2.6", - "organization": "Moonshot AI", - "parameters": "1T/32B active MoE", - "context_window": "256K", - "swe_bench": 80.2, - "if_score": 91, - "categories": [ - "coding", - "agent", - "multimodal" - ], - "description": "SWE-Pro 58.6(#1!), SWE-V 80.2, Terminal-Bench 66.7, HLE 54.0(#1!), BrowseComp 83.2. 13h autonomous. 300 sub-agent swarm. Modified MIT.", - "tags": [ - "coding", - "agent", - "swarm", - "vision", - "thinking", - "tools" - ], - "openrouter": false, - "provider": "ollama-cloud" - }, - { - "id": "nemotron-3-super", - "name": "Nemotron 3 Super", - "organization": "NVIDIA", - "parameters": "120B/12B active", - "context_window": "1M", - "swe_bench": 60.5, - "if_score": 78, - "categories": [ - "agent", - "reasoning", - "efficient" - ], - "description": "SWE-bench 60.5%. RULER@1M 91.75%! \u041d\u043e IF \u043d\u0438\u0436\u0435 \u2014 Mamba-layers \u0438\u043d\u043e\u0433\u0434\u0430 \u00ab\u0442\u0435\u0440\u044f\u044e\u0442\u00bb \u0438\u043d\u0441\u0442\u0440\u0443\u043a\u0446\u0438\u0438 \u0432 \u0434\u043b\u0438\u043d\u043d\u044b\u0445 \u043f\u0440\u043e\u043c\u043f\u0442\u0430\u0445.", - "tags": [ - "agent", - "1M-ctx", - "thinking" - ], - "openrouter": false, - "provider": "ollama" - }, - { - "id": "glm-5.1", - "name": "GLM-5", - "organization": "Z.ai", - "parameters": "744B/40B active", - "context_window": "128K", - "swe_bench": null, - "if_score": 90, - "categories": [ - "reasoning", - "agent" - ], - "description": "\u041c\u043e\u0449\u043d\u044b\u0439 reasoning. Arena ELO 1451. 
\u041e\u0442\u043b\u0438\u0447\u043d\u044b\u0439 instruction following (IFEval ~90+).", - "tags": [ - "reasoning", - "agent" - ], - "openrouter": false, - "provider": "ollama" - }, - { - "id": "deepseek-v4", - "name": "DeepSeek V4-Pro", - "organization": "DeepSeek", - "parameters": "Large MoE", - "context_window": "128K", - "swe_bench": null, - "if_score": 75, - "categories": [ - "reasoning" - ], - "description": "\u0425\u043e\u0440\u043e\u0448\u0438\u0439 reasoning, \u043d\u043e IF \u043d\u0435\u0441\u0442\u0430\u0431\u0438\u043b\u0435\u043d \u2014 \u0438\u043d\u043e\u0433\u0434\u0430 \u0438\u0433\u043d\u043e\u0440\u0438\u0440\u0443\u0435\u0442 \u0444\u043e\u0440\u043c\u0430\u0442 \u0432\u044b\u0432\u043e\u0434\u0430.", - "tags": [ - "reasoning" - ], - "openrouter": false, - "provider": "ollama" - }, - { - "id": "qwen3-5-122b", - "name": "Qwen 3.5 122B", - "organization": "Qwen", - "parameters": "122B/10B active", - "context_window": "128K", - "swe_bench": null, - "if_score": 92, - "categories": [ - "reasoning", - "efficient" - ], - "description": "IFEval 92.6%! \u041b\u0443\u0447\u0448\u0438\u0439 IF \u0441\u0440\u0435\u0434\u0438 open-source. Multimodal. Thinking.", - "tags": [ - "vision", - "thinking", - "tools" - ], - "openrouter": false, - "provider": "ollama" - }, - { - "id": "qwen3-coder-next", - "name": "Qwen3-Coder-Next", - "organization": "Qwen", - "parameters": "80B/3B active", - "context_window": "128K", - "swe_bench": 70, - "if_score": 84, - "categories": [ - "coding", - "efficient" - ], - "description": "70% SWE-bench \u0441 3B active! 
\u0425\u043e\u0440\u043e\u0448\u0438\u0439 IF \u0434\u043b\u044f \u043a\u043e\u0434\u0438\u043d\u0433\u0430.", - "tags": [ - "coding", - "efficient", - "tools" - ], - "openrouter": false, - "provider": "ollama" - }, - { - "id": "cogito-2-1-671b", - "name": "Cogito 2.1 671B", - "organization": "Cognitive", - "parameters": "671B MoE", - "context_window": "128K", - "swe_bench": null, - "if_score": 76, - "categories": [ - "reasoning" - ], - "description": "MIT \u043b\u0438\u0446\u0435\u043d\u0437\u0438\u044f. 671B total. IF \u043d\u0435\u043f\u043b\u043e\u0445\u043e\u0439, \u043d\u043e \u0443\u0441\u0442\u0443\u043f\u0430\u0435\u0442 GLM/Qwen.", - "tags": [ - "reasoning" - ], - "openrouter": false, - "provider": "ollama" - }, - { - "id": "qwen3-6-plus", - "name": "Qwen 3.6 Plus", - "organization": "Qwen", - "parameters": "Hybrid MoE", - "context_window": "1M", - "swe_bench": 78.8, - "if_score": 91, - "categories": [ - "coding", - "agent", - "reasoning" - ], - "description": "FREE \u043d\u0430 OpenRouter! 1M \u043a\u043e\u043d\u0442\u0435\u043a\u0441\u0442. Always-on CoT. \u041f\u0440\u0435\u0432\u043e\u0441\u0445\u043e\u0434\u043d\u044b\u0439 IF \u2014 \u043d\u0430\u0441\u043b\u0435\u0434\u043d\u0438\u043a Qwen 3.5 (92.6%).", - "tags": [ - "coding", - "agent", - "1M-ctx", - "free" - ], - "openrouter": true, - "provider": "openrouter" - }, - { - "id": "step-3-5-flash", - "name": "Step 3.5 Flash", - "organization": "StepFun", - "parameters": "MoE", - "context_window": "128K", - "swe_bench": null, - "if_score": 79, - "categories": [ - "efficient" - ], - "description": "\u0411\u0435\u0441\u043f\u043b\u0430\u0442\u043d\u0430 \u043d\u0430 OpenRouter. 
IF \u0441\u0440\u0435\u0434\u043d\u0438\u0439.", - "tags": [ - "efficient", - "free" - ], - "openrouter": true, - "provider": "openrouter" - }, - { - "id": "deepseek-r1", - "name": "DeepSeek R1", - "organization": "DeepSeek", - "parameters": "671B MoE", - "context_window": "128K", - "swe_bench": null, - "if_score": 73, - "categories": [ - "reasoning" - ], - "description": "\u041c\u043e\u0449\u043d\u044b\u0435 reasoning-\u0446\u0435\u043f\u043e\u0447\u043a\u0438. \u041d\u043e IF \u0441\u043b\u0430\u0431\u044b\u0439 \u2014 \u0447\u0430\u0441\u0442\u043e \u0433\u0435\u043d\u0435\u0440\u0438\u0440\u0443\u0435\u0442 \u043b\u0438\u0448\u043d\u0438\u0439 reasoning \u0432\u043c\u0435\u0441\u0442\u043e \u043e\u0442\u0432\u0435\u0442\u0430.", - "tags": [ - "reasoning", - "thinking", - "free" - ], - "openrouter": true, - "provider": "openrouter" - } - ], - "groq_models": [ - { - "id": "openai/gpt-oss-20b", - "rpm": 30, - "rpd": "1K", - "tpm": "8K", - "tpd": "200K", - "speed": "1200+", - "use_case": "\u0423\u043b\u044c\u0442\u0440\u0430-\u0431\u044b\u0441\u0442\u0440\u044b\u0439 fallback \u0434\u043b\u044f \u043b\u0451\u0433\u043a\u0438\u0445 \u0440\u043e\u043b\u0435\u0439 (markdown-validator)." - }, - { - "id": "llama-3.1-8b-instant", - "rpm": 30, - "rpd": "14.4K", - "tpm": "6K", - "tpd": "500K", - "speed": "~800", - "use_case": "14.4K RPD! \u0421\u0430\u043c\u044b\u0439 \u0432\u044b\u0441\u043e\u043a\u0438\u0439 \u043b\u0438\u043c\u0438\u0442. \u0414\u043b\u044f health-check / ping \u0440\u043e\u043b\u0435\u0439." - }, - { - "id": "groq/compound", - "rpm": 30, - "rpd": "250", - "tpm": "70K", - "tpd": "\u2014", - "speed": "varies", - "use_case": "\u041c\u0443\u043b\u044c\u0442\u0438\u043c\u043e\u0434\u0435\u043b\u044c\u043d\u0430\u044f \u0430\u0433\u0440\u0435\u0433\u0430\u0446\u0438\u044f. \u0414\u043b\u044f research-\u0437\u0430\u0434\u0430\u0447." 
- }, - { - "id": "groq/compound-mini", - "rpm": 30, - "rpd": "250", - "tpm": "70K", - "tpd": "\u2014", - "speed": "varies", - "use_case": "\u041b\u0451\u0433\u043a\u0430\u044f \u0432\u0435\u0440\u0441\u0438\u044f compound." - }, - { - "id": "llama-prompt-guard-2", - "rpm": 30, - "rpd": "14.4K", - "tpm": "15K", - "tpd": "500K", - "speed": "~1K", - "use_case": "Security: \u0432\u0445\u043e\u0434\u043d\u043e\u0439 \u0444\u0438\u043b\u044c\u0442\u0440 \u0434\u043b\u044f security-auditor (14.4K RPD!)." - } - ], - "agent_model_scores": [ - { - "agent": "lead-developer", - "current_model_index": 0, - "current_model_id": "qwen3-coder-480b", - "reasoning_effort": "H", - "scores": { - "qwen3-coder-480b": 92, - "minimax-m2.5": 86, - "minimax-m2.7": 82, - "nemotron-3-super": 70, - "glm-5.1": 68, - "deepseek-v4-pro-max": 88, - "qwen3-5-122b": 66, - "qwen3-coder-next": 80, - "qwen3-6-plus": 88, - "kimi-k2-6": 90 - } - }, - { - "agent": "frontend-developer", - "current_model_index": 1, - "current_model_id": "minimax-m2.5", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 86, - "minimax-m2.5": 92, - "minimax-m2.7": 88, - "nemotron-3-super": 62, - "glm-5.1": 56, - "deepseek-v4-pro-max": 82, - "qwen3-5-122b": 60, - "qwen3-coder-next": 76, - "qwen3-6-plus": 88, - "kimi-k2-6": 86 - } - }, - { - "agent": "php-developer", - "current_model_index": 0, - "current_model_id": "qwen3-coder-480b", - "reasoning_effort": "H", - "scores": { - "qwen3-coder-480b": 87, - "minimax-m2.5": 76, - "minimax-m2.7": 72, - "nemotron-3-super": 64, - "glm-5.1": 56, - "deepseek-v4-pro-max": 74, - "qwen3-5-122b": 60, - "qwen3-coder-next": 76, - "qwen3-6-plus": 84, - "kimi-k2-6": 86 - } - }, - { - "agent": "python-developer", - "current_model_index": 0, - "current_model_id": "qwen3-coder-480b", - "reasoning_effort": "H", - "scores": { - "qwen3-coder-480b": 90, - "minimax-m2.5": 82, - "minimax-m2.7": 78, - "nemotron-3-super": 66, - "glm-5.1": 60, - "deepseek-v4-pro-max": 78, - "qwen3-5-122b": 64, - 
"qwen3-coder-next": 78, - "qwen3-6-plus": 88, - "kimi-k2-6": 88 - } - }, - { - "agent": "backend-developer", - "current_model_index": 0, - "current_model_id": "qwen3-coder-480b", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 91, - "minimax-m2.5": 84, - "minimax-m2.7": 80, - "nemotron-3-super": 68, - "glm-5.1": 63, - "deepseek-v4-pro-max": 86, - "qwen3-5-122b": 62, - "qwen3-coder-next": 78, - "qwen3-6-plus": 87, - "kimi-k2-6": 90 - } - }, - { - "agent": "go-developer", - "current_model_index": 3, - "current_model_id": "deepseek-v4-pro-max", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 85, - "minimax-m2.5": 78, - "minimax-m2.7": 74, - "nemotron-3-super": 66, - "glm-5.1": 58, - "deepseek-v4-pro-max": 88, - "qwen3-5-122b": 58, - "qwen3-coder-next": 74, - "qwen3-6-plus": 82, - "kimi-k2-6": 86 - } - }, - { - "agent": "flutter-developer", - "current_model_index": 0, - "current_model_id": "qwen3-coder-480b", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 86, - "minimax-m2.5": 70, - "minimax-m2.7": 66, - "nemotron-3-super": 60, - "glm-5.1": 53, - "deepseek-v4-pro-max": 78, - "qwen3-5-122b": 58, - "qwen3-coder-next": 74, - "qwen3-6-plus": 82, - "kimi-k2-6": 84 - } - }, - { - "agent": "devops-engineer", - "current_model_index": -1, - "current_model_id": "kimi-k2.6", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 66, - "minimax-m2.5": 53, - "minimax-m2.7": 48, - "nemotron-3-super": 78, - "glm-5.1": 75, - "deepseek-v4-pro-max": 86, - "qwen3-5-122b": 70, - "qwen3-coder-next": 54, - "qwen3-6-plus": 76, - "kimi-k2-6": 88 - } - }, - { - "agent": "sdet-engineer", - "current_model_index": 0, - "current_model_id": "qwen3-coder-480b", - "reasoning_effort": "H", - "scores": { - "qwen3-coder-480b": 88, - "minimax-m2.5": 84, - "minimax-m2.7": 80, - "nemotron-3-super": 70, - "glm-5.1": 63, - "deepseek-v4-pro-max": 84, - "qwen3-5-122b": 64, - "qwen3-coder-next": 78, - "qwen3-6-plus": 84, - "kimi-k2-6": 87 - } - }, - { - 
"agent": "code-skeptic", - "current_model_index": 1, - "current_model_id": "minimax-m2.5", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 82, - "minimax-m2.5": 85, - "minimax-m2.7": 80, - "nemotron-3-super": 73, - "glm-5.1": 72, - "deepseek-v4-pro-max": 82, - "qwen3-5-122b": 70, - "qwen3-coder-next": 72, - "qwen3-6-plus": 80, - "kimi-k2-6": 82 - } - }, - { - "agent": "security-auditor", - "current_model_index": 3, - "current_model_id": "deepseek-v4-pro-max", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 76, - "minimax-m2.5": 74, - "minimax-m2.7": 68, - "nemotron-3-super": 76, - "glm-5.1": 68, - "deepseek-v4-pro-max": 80, - "qwen3-5-122b": 72, - "qwen3-coder-next": 64, - "qwen3-6-plus": 75, - "kimi-k2-6": 80 - } - }, - { - "agent": "performance-engineer", - "current_model_index": 3, - "current_model_id": "deepseek-v4-pro-max", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 78, - "minimax-m2.5": 75, - "minimax-m2.7": 70, - "nemotron-3-super": 78, - "glm-5.1": 74, - "deepseek-v4-pro-max": 84, - "qwen3-5-122b": 70, - "qwen3-coder-next": 67, - "qwen3-6-plus": 76, - "kimi-k2-6": 82 - } - }, - { - "agent": "the-fixer", - "current_model_index": -1, - "current_model_id": "kimi-k2.6", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 89, - "minimax-m2.5": 88, - "minimax-m2.7": 84, - "nemotron-3-super": 71, - "glm-5.1": 64, - "deepseek-v4-pro-max": 88, - "qwen3-5-122b": 64, - "qwen3-coder-next": 82, - "qwen3-6-plus": 86, - "kimi-k2-6": 90 - } - }, - { - "agent": "browser-automation", - "current_model_index": 0, - "current_model_id": "qwen3-coder-480b", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 87, - "minimax-m2.5": 72, - "minimax-m2.7": 68, - "nemotron-3-super": 61, - "glm-5.1": 53, - "deepseek-v4-pro-max": 82, - "qwen3-5-122b": 56, - "qwen3-coder-next": 72, - "qwen3-6-plus": 82, - "kimi-k2-6": 86 - } - }, - { - "agent": "visual-tester", - "current_model_index": 0, - "current_model_id": 
"qwen3-coder-480b", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 82, - "minimax-m2.5": 68, - "minimax-m2.7": 64, - "nemotron-3-super": 55, - "glm-5.1": 48, - "deepseek-v4-pro-max": 76, - "qwen3-5-122b": 54, - "qwen3-coder-next": 66, - "qwen3-6-plus": 76, - "kimi-k2-6": 78 - } - }, - { - "agent": "system-analyst", - "current_model_index": 7, - "current_model_id": "glm-5.1", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 70, - "minimax-m2.5": 66, - "minimax-m2.7": 63, - "nemotron-3-super": 74, - "glm-5.1": 82, - "deepseek-v4-pro-max": 88, - "qwen3-5-122b": 76, - "qwen3-coder-next": 58, - "qwen3-6-plus": 80, - "kimi-k2-6": 86 - } - }, - { - "agent": "capability-analyst", - "current_model_index": 7, - "current_model_id": "glm-5.1", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 72, - "minimax-m2.5": 68, - "minimax-m2.7": 66, - "nemotron-3-super": 76, - "glm-5.1": 78, - "deepseek-v4-pro-max": 82, - "qwen3-5-122b": 75, - "qwen3-coder-next": 60, - "qwen3-6-plus": 79, - "kimi-k2-6": 82 - } - }, - { - "agent": "orchestrator", - "current_model_index": -1, - "current_model_id": "kimi-k2.6", - "reasoning_effort": "H", - "scores": { - "qwen3-coder-480b": 74, - "minimax-m2.5": 70, - "minimax-m2.7": 68, - "nemotron-3-super": 80, - "glm-5.1": 82, - "deepseek-v4-pro-max": 86, - "qwen3-5-122b": 78, - "qwen3-coder-next": 62, - "qwen3-6-plus": 84, - "kimi-k2-6": 92 - } - }, - { - "agent": "release-manager", - "current_model_index": 7, - "current_model_id": "glm-5.1", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 72, - "minimax-m2.5": 66, - "minimax-m2.7": 64, - "nemotron-3-super": 74, - "glm-5.1": 76, - "deepseek-v4-pro-max": 78, - "qwen3-5-122b": 72, - "qwen3-coder-next": 60, - "qwen3-6-plus": 76, - "kimi-k2-6": 78 - } - }, - { - "agent": "evaluator", - "current_model_index": 7, - "current_model_id": "glm-5.1", - "reasoning_effort": "H", - "scores": { - "qwen3-coder-480b": 70, - "minimax-m2.5": 73, - "minimax-m2.7": 
70, - "nemotron-3-super": 78, - "glm-5.1": 78, - "deepseek-v4-pro-max": 84, - "qwen3-5-122b": 76, - "qwen3-coder-next": 58, - "qwen3-6-plus": 81, - "kimi-k2-6": 84 - } - }, - { - "agent": "prompt-optimizer", - "current_model_index": -1, - "current_model_id": "qwen3.6-plus", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 76, - "minimax-m2.5": 74, - "minimax-m2.7": 72, - "nemotron-3-super": 76, - "glm-5.1": 75, - "deepseek-v4-pro-max": 80, - "qwen3-5-122b": 74, - "qwen3-coder-next": 64, - "qwen3-6-plus": 83, - "kimi-k2-6": 82 - } - }, - { - "agent": "product-owner", - "current_model_index": 7, - "current_model_id": "glm-5.1", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 60, - "minimax-m2.5": 56, - "minimax-m2.7": 54, - "nemotron-3-super": 74, - "glm-5.1": 78, - "deepseek-v4-pro-max": 76, - "qwen3-5-122b": 74, - "qwen3-coder-next": 48, - "qwen3-6-plus": 78, - "kimi-k2-6": 76 - } - }, - { - "agent": "pipeline-judge", - "current_model_index": 7, - "current_model_id": "glm-5.1", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 64, - "minimax-m2.5": 68, - "minimax-m2.7": 65, - "nemotron-3-super": 78, - "glm-5.1": 76, - "deepseek-v4-pro-max": 82, - "qwen3-5-122b": 74, - "qwen3-coder-next": 56, - "qwen3-6-plus": 80, - "kimi-k2-6": 84 - } - }, - { - "agent": "workflow-architect", - "current_model_index": 7, - "current_model_id": "glm-5.1", - "reasoning_effort": "H", - "scores": { - "qwen3-coder-480b": 68, - "minimax-m2.5": 62, - "minimax-m2.7": 60, - "nemotron-3-super": 76, - "glm-5.1": 76, - "deepseek-v4-pro-max": 80, - "qwen3-5-122b": 72, - "qwen3-coder-next": 56, - "qwen3-6-plus": 80, - "kimi-k2-6": 82 - } - }, - { - "agent": "markdown-validator", - "current_model_index": 3, - "current_model_id": "deepseek-v4-pro-max", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 43, - "minimax-m2.5": 38, - "minimax-m2.7": 36, - "nemotron-3-super": 52, - "glm-5.1": 55, - "deepseek-v4-pro-max": 68, - "qwen3-5-122b": 56, - 
"qwen3-coder-next": 40, - "qwen3-6-plus": 50, - "kimi-k2-6": 56 - } - }, - { - "agent": "agent-architect", - "current_model_index": -1, - "current_model_id": "kimi-k2.6", - "reasoning_effort": "H", - "scores": { - "qwen3-coder-480b": 78, - "minimax-m2.5": 72, - "minimax-m2.7": 70, - "nemotron-3-super": 78, - "glm-5.1": 76, - "deepseek-v4-pro-max": 82, - "qwen3-5-122b": 76, - "qwen3-coder-next": 66, - "qwen3-6-plus": 82, - "kimi-k2-6": 86 - } - }, - { - "agent": "planner", - "current_model_index": 3, - "current_model_id": "deepseek-v4-pro-max", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 72, - "minimax-m2.5": 68, - "minimax-m2.7": 66, - "nemotron-3-super": 80, - "glm-5.1": 78, - "deepseek-v4-pro-max": 88, - "qwen3-5-122b": 78, - "qwen3-coder-next": 60, - "qwen3-6-plus": 85, - "kimi-k2-6": 86 - } - }, - { - "agent": "reflector", - "current_model_index": 3, - "current_model_id": "deepseek-v4-pro-max", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 68, - "minimax-m2.5": 66, - "minimax-m2.7": 64, - "nemotron-3-super": 78, - "glm-5.1": 76, - "deepseek-v4-pro-max": 84, - "qwen3-5-122b": 76, - "qwen3-coder-next": 56, - "qwen3-6-plus": 82, - "kimi-k2-6": 80 - } - }, - { - "agent": "memory-manager", - "current_model_index": -1, - "current_model_id": "qwen3.6-plus", - "reasoning_effort": "M", - "scores": { - "qwen3-coder-480b": 63, - "minimax-m2.5": 58, - "minimax-m2.7": 56, - "nemotron-3-super": 86, - "glm-5.1": 72, - "deepseek-v4-pro-max": 86, - "qwen3-5-122b": 70, - "qwen3-coder-next": 50, - "qwen3-6-plus": 87, - "kimi-k2-6": 84 - } - }, - { - "agent": "architect-indexer", - "current_model_index": 7, - "current_model_id": "glm-5.1", - "reasoning_effort": "H", - "scores": { - "qwen3-coder-480b": 70, - "minimax-m2.5": 64, - "minimax-m2.7": 62, - "nemotron-3-super": 74, - "glm-5.1": 80, - "deepseek-v4-pro-max": 78, - "qwen3-5-122b": 76, - "qwen3-coder-next": 58, - "qwen3-6-plus": 80, - "kimi-k2-6": 84 - } - } - ], - "if_scores": { - 
"qwen3-coder-480b": 88, - "minimax-m2.5": 82, - "minimax-m2.7": 78, - "nemotron-3-super": 85, - "glm-5.1": 80, - "deepseek-v4-pro-max": 88, - "qwen3-5-122b": 86, - "qwen3-coder-next": 84, - "qwen3-6-plus": 90, - "kimi-k2-6": 91, - "deepseek-v4-flash": 86 - }, - "agent_current_config": [ - { - "agent": "lead-developer", - "model": "ollama-cloud/qwen3-coder:480b", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "qwen", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "frontend-developer", - "model": "ollama-cloud/minimax-m2.5", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "qwen", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "php-developer", - "model": "ollama-cloud/qwen3-coder:480b", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "qwen", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "python-developer", - "model": "ollama-cloud/qwen3-coder:480b", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "qwen", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "backend-developer", - "model": "ollama-cloud/qwen3-coder:480b", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "qwen", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "go-developer", - "model": "ollama-cloud/deepseek-v4-pro-max", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "qwen", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "flutter-developer", - "model": "ollama-cloud/qwen3-coder:480b", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "qwen", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "devops-engineer", - "model": "ollama-cloud/kimi-k2.6", - "provider": "Ollama Cloud", - "category": "Process", - 
"badge_type": "nemotron", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "sdet-engineer", - "model": "ollama-cloud/qwen3-coder:480b", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "qwen", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "code-skeptic", - "model": "ollama-cloud/minimax-m2.5", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "minimax", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "security-auditor", - "model": "ollama-cloud/deepseek-v4-pro-max", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "nemotron", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "performance-engineer", - "model": "ollama-cloud/deepseek-v4-pro-max", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "nemotron", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "the-fixer", - "model": "ollama-cloud/kimi-k2.6", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "minimax", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "browser-automation", - "model": "ollama-cloud/qwen3-coder:480b", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "qwen", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "visual-tester", - "model": "ollama-cloud/qwen3-coder:480b", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "qwen", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "system-analyst", - "model": "ollama-cloud/glm-5.1", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "glm", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "capability-analyst", - "model": "ollama-cloud/glm-5.1", - "provider": "Ollama Cloud", - 
"category": "Process", - "badge_type": "glm", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "orchestrator", - "model": "ollama-cloud/kimi-k2.6", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "kimi", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "release-manager", - "model": "ollama-cloud/glm-5.1", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "glm", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "evaluator", - "model": "ollama-cloud/glm-5.1", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "glm", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "prompt-optimizer", - "model": "ollama-cloud/qwen3.6-plus", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "glm", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "product-owner", - "model": "ollama-cloud/glm-5.1", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "glm", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "pipeline-judge", - "model": "ollama-cloud/glm-5.1", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "glm", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "workflow-architect", - "model": "ollama-cloud/glm-5.1", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "glm", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "markdown-validator", - "model": "ollama-cloud/deepseek-v4-pro-max", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "nemotron", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "agent-architect", - "model": "ollama-cloud/kimi-k2.6", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": 
"glm", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "planner", - "model": "ollama-cloud/deepseek-v4-pro-max", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "nemotron", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "reflector", - "model": "ollama-cloud/deepseek-v4-pro-max", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "nemotron", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "memory-manager", - "model": "ollama-cloud/qwen3.6-plus", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "nemotron", - "fit_score": 0, - "status": "good", - "previous_model": null - }, - { - "agent": "architect-indexer", - "model": "ollama-cloud/glm-5.1", - "provider": "Ollama Cloud", - "category": "Process", - "badge_type": "glm", - "fit_score": 0, - "status": "good", - "previous_model": null - } - ], - "recommendations": [ - { - "agent": "[built-in] debug", - "from_model": "glm-5.1.1 (88)", - "from_provider": "Ollama", - "to_model": "V4-Pro Max (\u260590) / K2.6 (\u260590) RE:High", - "to_provider": "Ollama Cloud", - "impact": "high", - "quality_change": "+2%", - "speed_change": "~1x", - "context_change": "200K\u21921M", - "provider_change": "Ollama Cloud", - "rationale": "\u2605 \u043c\u0430\u0442\u0440\u0438\u0446\u044b: V4-Pro=90 \u0438 K2.6=90 (TIE!), GLM-5.1=88. V4-Pro: LiveCodeBench 93.5(#1!), Terminal 67.9, 1M ctx \u0434\u043b\u044f \u043f\u043e\u043b\u043d\u043e\u0433\u043e \u043f\u0440\u043e\u0435\u043a\u0442\u0430. K2.6: 13h auto sessions. \u041e\u0431\u0430 \u043b\u0443\u0447\u0448\u0435 GLM-5.1. RE:High \u0434\u043b\u044f debug." 
- }, - { - "agent": "planner", - "from_model": "nemotron-3-super (80)", - "from_provider": "Ollama", - "to_model": "V4-Pro Max (\u260588) RE:High", - "to_provider": "Ollama Cloud", - "impact": "high", - "quality_change": "+10%", - "speed_change": "~1x", - "context_change": "1M", - "provider_change": "Ollama Cloud", - "rationale": "\u2605 \u043c\u0430\u0442\u0440\u0438\u0446\u044b: V4-Pro=88(\u043b\u0443\u0447\u0448\u0438\u0439!), K2.6=86, GLM-5.1=85, Nem=80. V4-Pro: GPQA 90.1 (reasoning), 1M ctx \u0441\u043e\u0445\u0440\u0430\u043d\u044f\u0435\u0442\u0441\u044f (vs \u043f\u043e\u0442\u0435\u0440\u044f \u043f\u0440\u0438 K2.6). RE:High \u0434\u043b\u044f chain-of-thought planning." - }, - { - "agent": "go-developer", - "from_model": "qwen3-coder:480b (85)", - "from_provider": "Ollama", - "to_model": "V4-Pro Max (\u260588) RE:Medium", - "to_provider": "Ollama Cloud", - "impact": "medium", - "quality_change": "+4%", - "speed_change": "~1x", - "context_change": "256K\u21921M", - "provider_change": "Ollama Cloud", - "rationale": "\u2605 \u043c\u0430\u0442\u0440\u0438\u0446\u044b: V4-Pro=88(\u043b\u0443\u0447\u0448\u0438\u0439 \u0434\u043b\u044f Go!), K2.6=86, Qwen3Coder=85. DeepSeek \u043c\u043e\u0434\u0435\u043b\u0438 \u0442\u0440\u0430\u0434\u0438\u0446\u0438\u043e\u043d\u043d\u043e \u0441\u0438\u043b\u044c\u043d\u044b \u0432 Go/Rust. 1M ctx \u0434\u043b\u044f \u043a\u0440\u0443\u043f\u043d\u044b\u0445 Go-\u043f\u0440\u043e\u0435\u043a\u0442\u043e\u0432." - }, - { - "agent": "history-miner", - "from_model": "nemotron-3-super (\u260585)", - "from_provider": "Ollama", - "to_model": "V4-Pro Max (86) + Nem fallback", - "to_provider": "Hybrid", - "impact": "medium", - "quality_change": "+1%", - "speed_change": "~1x", - "context_change": "1M", - "provider_change": "Ollama Cloud + Ollama", - "rationale": "V4-Pro=86 \u0447\u0443\u0442\u044c \u043b\u0443\u0447\u0448\u0435 Nemotron=85. 1M ctx \u0443 \u043e\u0431\u043e\u0438\u0445. 
MRCR 83.5 \u0443 V4-Pro \u2014 \u043b\u0443\u0447\u0448\u0435\u0435 long-context retrieval. Nemotron \u043a\u0430\u043a fallback (RULER 91.75%)." - }, - { - "agent": "frontend-dev \u2192 M2.5", - "from_model": "qwen3-coder (90)", - "from_provider": "Ollama", - "to_model": "MiniMax M2.5 (\u260592) \u2705", - "to_provider": "Ollama", - "impact": "low", - "quality_change": "+2%", - "speed_change": "=", - "context_change": "204K", - "provider_change": "Ollama", - "rationale": "Spec-writing, UI architect. APPLIED." - }, - { - "agent": "devops \u2192 K2.6", - "from_model": "deepseek-v3.2", - "from_provider": "", - "to_model": "kimi-k2.6 \u2705", - "to_provider": "Ollama Cloud", - "impact": "low", - "quality_change": "+35%", - "speed_change": "=", - "context_change": "256K", - "provider_change": "", - "rationale": "APPLIED." - }, - { - "agent": "orchestrator", - "from_model": "glm-5.1.1 (\u260590)", - "from_provider": "Ollama", - "to_model": "K2.6 (\u260592) RE:Medium", - "to_provider": "Ollama Cloud", - "impact": "medium", - "quality_change": "+2%", - "speed_change": "~1x", - "context_change": "200K\u2192256K", - "provider_change": "Ollama Cloud", - "rationale": "K2.6=92\u2605 \u0432\u0441\u0451 \u0435\u0449\u0451 \u043b\u0443\u0447\u0448\u0438\u0439 \u0434\u043b\u044f orchestration. V4-Pro=86 \u0441\u043b\u0430\u0431\u0435\u0435. 300 sub-agent swarm." - }, - { - "agent": "the-fixer", - "from_model": "minimax-m2.5 (\u260588)", - "from_provider": "Ollama", - "to_model": "V4-Pro (\u260588) / K2.6 (\u260590)", - "to_provider": "Ollama Cloud", - "impact": "medium", - "quality_change": "+2%", - "speed_change": "~1x", - "context_change": "128K\u21921M/256K", - "provider_change": "Ollama Cloud", - "rationale": "K2.6=90(\u043b\u0443\u0447\u0448\u0438\u0439), V4-Pro=88=M2.5. M2.5 SWE-bench 80.2% \u0441\u0442\u0430\u0431\u0438\u043b\u044c\u043d\u0435\u0435. \u041d\u0435 \u0441\u0440\u043e\u0447\u043d\u043e." 
- }, - { - "agent": "Qwen3-Coder (7 coding)", - "from_model": "qwen3-coder", - "from_provider": "Ollama", - "to_model": "\u2705", - "to_provider": "", - "impact": "low", - "quality_change": "=0%", - "speed_change": "=", - "context_change": "256K", - "provider_change": "Ollama", - "rationale": "lead=92\u2605, backend=91\u2605, python=90\u2605." - }, - { - "agent": "GLM-5.1 (12 agents)", - "from_model": "glm-5.1.1", - "from_provider": "Ollama", - "to_model": "\u2705", - "to_provider": "", - "impact": "low", - "quality_change": "=0%", - "speed_change": "=", - "context_change": "200K", - "provider_change": "", - "rationale": "orchestrator=90, system-analyst=90. SWE-Pro #1." - }, - { - "agent": "Kimi K2.6 (3 agents)", - "from_model": "kimi-k2.6", - "from_provider": "Ollama Cloud", - "to_model": "\u2705", - "to_provider": "", - "impact": "low", - "quality_change": "=0%", - "speed_change": "=", - "context_change": "256K", - "provider_change": "", - "rationale": "devops=88\u2605, browser=86, agent-arch=86." - } - ], - "impact_data": [ - { - "category": "debug GLM5.1\u2192V4-Pro/K2.6", - "before": 88, - "after": 90, - "delta": 2, - "notes": "LiveCodeBench 93.5, Terminal 67.9" - }, - { - "category": "planner Nem\u2192V4-Pro Max", - "before": 80, - "after": 88, - "delta": 8, - "notes": "\u260588! GPQA 90.1, 1M ctx" - }, - { - "category": "go-dev Coder\u2192V4-Pro Max", - "before": 85, - "after": 88, - "delta": 3, - "notes": "\u260588! 
Go/Rust specialist, 1M ctx" - }, - { - "category": "history-miner \u2192V4-Pro", - "before": 85, - "after": 86, - "delta": 1, - "notes": "MRCR 83.5, long-context" - }, - { - "category": "orchestrator \u2192K2.6 (next)", - "before": 90, - "after": 92, - "delta": 2, - "notes": "300 sub-agent swarm" - }, - { - "category": "frontend \u2192 M2.5 \u2705", - "before": 90, - "after": 92, - "delta": 2, - "notes": "Spec-writing, UI architect" - }, - { - "category": "devops \u2192 K2.6 \u2705", - "before": 65, - "after": 88, - "delta": 23, - "notes": "IF:65\u219291! Terminal 66.7" - }, - { - "category": "Qwen3-Coder (7) \u2705", - "before": 90, - "after": 90, - "delta": 0, - "notes": "SOTA coding" - }, - { - "category": "GLM-5.1 (12) \u2705", - "before": 87, - "after": 87, - "delta": 0, - "notes": "SWE-Pro #1" - }, - { - "category": "Nemotron Super (6) \u2705", - "before": 82, - "after": 82, - "delta": 0, - "notes": "1M ctx, RULER 91.75%" - } - ], - "benchmark_comparison": { - "benchmarks": [ - { - "name": "SWE-V", - "full_name": "SWE-Bench Verified", - "description": "GitHub issue resolution (500 tasks)", - "roles": "lead-dev, backend, fixer" - }, - { - "name": "SWE-P", - "full_name": "SWE-Bench Pro", - "description": "Multi-lang, decontaminated (1865 tasks)", - "roles": "all coding agents" - }, - { - "name": "T-Bench", - "full_name": "Terminal-Bench 2.0", - "description": "CLI/shell multi-step tasks", - "roles": "devops, planner, orchestrator" - }, - { - "name": "LCB", - "full_name": "LiveCodeBench", - "description": "Code gen from specs (held-out)", - "roles": "sdet, go-dev, python-dev" - }, - { - "name": "GPQA", - "full_name": "GPQA Diamond", - "description": "PhD-level reasoning", - "roles": "system-analyst, planner" - }, - { - "name": "BComp", - "full_name": "BrowseComp", - "description": "Web research & synthesis", - "roles": "browser-auto, capability-analyst" - }, - { - "name": "HLE", - "full_name": "Humanity Last Exam", - "description": "Frontier knowledge (with 
tools)", - "roles": "agent-architect, evaluator" - }, - { - "name": "Ctx", - "full_name": "Context Window", - "description": "Max tokens in one pass", - "roles": "history-miner, memory-mgr" - }, - { - "name": "$/M", - "full_name": "Cost per 1M input", - "description": "API pricing", - "roles": "all agents (ROI)" - } - ], - "closed_source_models": [ - { - "name": "Claude Opus 4.7", - "organization": "Anthropic", - "scores": [ - 87.6, - 64.3, - 69.4, - null, - 94.2, - 79.3, - 53, - "1M", - "$5" - ], - "color": "#c084fc", - "note": "#1 \u0430\u043f\u0440\u0435\u043b\u044c 2026" - }, - { - "name": "GPT-5.5", - "organization": "OpenAI", - "scores": [ - null, - 58.6, - 82.7, - null, - null, - 83.4, - 57.2, - "1M", - "$5" - ], - "color": "#ff6b81", - "note": "\u041d\u043e\u0432\u0435\u0439\u0448\u0438\u0439, Terminal #1" - }, - { - "name": "GPT-5.4", - "organization": "OpenAI", - "scores": [ - 78.2, - 59.1, - 75.1, - null, - 94.4, - 82.7, - 58.7, - "200K", - "$2.50" - ], - "color": "#ff6b81", - "note": "Reasoning, math" - }, - { - "name": "Gemini 3.1 Pro", - "organization": "Google", - "scores": [ - 80.6, - 46.1, - 68.5, - null, - 94.3, - 85.9, - 51.4, - "2M", - "$2" - ], - "color": "#facc15", - "note": "ARC-AGI 77.1%, \u0434\u0435\u0448\u0451\u0432\u044b\u0439" - }, - { - "name": "Claude Sonnet 4.6", - "organization": "Anthropic", - "scores": [ - 79.6, - null, - null, - null, - null, - null, - null, - "200K", - "$3" - ], - "color": "#c084fc", - "note": "5\u00d7 \u0434\u0435\u0448\u0435\u0432\u043b\u0435 Opus" - }, - { - "name": "GPT-5.3-Codex", - "organization": "OpenAI", - "scores": [ - 85, - 57, - 77.3, - null, - null, - null, - null, - "200K", - "$6" - ], - "color": "#ff6b81", - "note": "Coding specialist" - } - ], - "apaw_models": [ - { - "name": "Kimi K2.6", - "organization": "APAW", - "scores": [ - 80.2, - 58.6, - 66.7, - 87.2, - null, - 83.2, - 54, - "256K", - "$0.95" - ], - "color": "#00ff94", - "note": "devops, browser, architect (3)" - }, - { - "name": 
"GLM-5.1", - "organization": "APAW", - "scores": [ - null, - 58.4, - 63.5, - null, - 86.2, - 68.7, - null, - "200K", - "~$0.50" - ], - "color": "#00ff94", - "note": "12 agents! orchestrator, eval..." - }, - { - "name": "V4-Pro Max", - "organization": "APAW", - "scores": [ - 80.6, - 55.4, - 67.9, - 93.5, - 90.1, - 83.4, - 48.2, - "1M", - "$0.42" - ], - "color": "#00d4ff", - "note": "planner, go-dev (\u0440\u0435\u043a.)" - }, - { - "name": "Qwen3-Coder 480B", - "organization": "APAW", - "scores": [ - 66.5, - null, - null, - null, - null, - null, - null, - "256K", - "~$0.50" - ], - "color": "#00ff94", - "note": "7 coding agents" - }, - { - "name": "MiniMax M2.5", - "organization": "APAW", - "scores": [ - 80.2, - 51.3, - null, - null, - null, - 76.3, - null, - "204K", - "$0.15" - ], - "color": "#00ff94", - "note": "frontend, skeptic, fixer (3)" - }, - { - "name": "Nemotron Super", - "organization": "APAW", - "scores": [ - 60.5, - null, - null, - null, - null, - null, - null, - "1M", - "~$0.40" - ], - "color": "#00ff94", - "note": "6 agents (memory, history)" - } - ] - } -} \ No newline at end of file +{ + "version": "1.0.0", + "generated": "2026-05-24T01:00:00Z", + "source": "ollama-cloud-models-v2026-05-24", + "total_agents": 34, + "total_models_tracked": 13, + "providers": ["ollama-cloud"], + "models": [ + { + "id": "deepseek-v4-pro-max", + "name": "DeepSeek V4-Pro Max", + "organization": "DeepSeek", + "parameters": "1.6T/49B active MoE", + "context_window": "1M", + "swe_bench": 80.6, + "if_score": 89, + "categories": ["coding", "agent", "reasoning"], + "provider": "ollama-cloud", + "updated": "2026-05-03", + "pulls": "71.6K" + }, + { + "id": "deepseek-v4-flash", + "name": "DeepSeek V4-Flash", + "organization": "DeepSeek", + "parameters": "284B/13B active MoE", + "context_window": "1M", + "swe_bench": 79, + "if_score": 86, + "categories": ["coding", "efficient", "agent"], + "provider": "ollama-cloud", + "updated": "2026-05-03", + "pulls": "84.4K" + }, + { + "id": 
"kimi-k2.6", + "name": "Kimi K2.6", + "organization": "Moonshot AI", + "parameters": "1T/32B active MoE", + "context_window": "256K→1M", + "swe_bench": 80.2, + "if_score": 91, + "categories": ["coding", "agent", "multimodal", "vision"], + "provider": "ollama-cloud", + "updated": "2026-04-24", + "pulls": "259.7K" + }, + { + "id": "kimi-k2.5", + "name": "Kimi K2.5", + "organization": "Moonshot AI", + "parameters": "1T/32B active MoE", + "context_window": "256K", + "swe_bench": 78, + "if_score": 90, + "categories": ["coding", "agent", "multimodal", "vision"], + "provider": "ollama-cloud", + "updated": "2026-02-24", + "pulls": "293.2K" + }, + { + "id": "qwen3-coder-480b", + "name": "Qwen3-Coder 480B", + "organization": "Qwen", + "parameters": "480B/35B active", + "context_window": "256K→1M", + "swe_bench": 66.5, + "if_score": 88, + "categories": ["coding", "agent"], + "provider": "ollama-cloud", + "updated": "2026-02-24", + "pulls": "N/A (legacy track)" + }, + { + "id": "qwen3.5-122b", + "name": "Qwen 3.5 122B", + "organization": "Qwen", + "parameters": "122B/10B active", + "context_window": "128K", + "swe_bench": null, + "if_score": 92, + "categories": ["reasoning", "efficient", "vision", "tools"], + "provider": "ollama-cloud", + "updated": "2026-05-22", + "pulls": "12.4M" + }, + { + "id": "gemma4-27b", + "name": "Gemma 4 (27B)", + "organization": "Google", + "parameters": "27B", + "context_window": "128K", + "swe_bench": null, + "if_score": 85, + "categories": ["coding", "agent", "reasoning", "vision", "audio"], + "provider": "ollama-cloud", + "updated": "2026-05-22", + "pulls": "10.1M", + "note": "Updated 2 days ago. Frontier-level performance at each size." 
+ }, + { + "id": "minimax-m2.5", + "name": "MiniMax M2.5", + "organization": "MiniMax", + "parameters": "MoE undisclosed", + "context_window": "128K", + "swe_bench": 80.2, + "if_score": 82, + "categories": ["coding", "agent"], + "provider": "ollama-cloud", + "updated": "2026-02-24", + "pulls": "2.2M" + }, + { + "id": "minimax-m2.7", + "name": "MiniMax M2.7", + "organization": "MiniMax", + "parameters": "~10B active", + "context_window": "128K", + "swe_bench": 78, + "if_score": 80, + "categories": ["coding", "agent", "efficient"], + "provider": "ollama-cloud", + "updated": "2026-03-24", + "pulls": "2.2M" + }, + { + "id": "glm-5.1", + "name": "GLM-5.1", + "organization": "Z.ai", + "parameters": "744B/40B active", + "context_window": "128K", + "swe_bench": null, + "if_score": 90, + "categories": ["reasoning", "agent"], + "provider": "ollama-cloud", + "updated": "2026-04-24", + "pulls": "2.2M", + "note": "Next-gen flagship. SWE-Bench Pro SOTA." + }, + { + "id": "glm-5", + "name": "GLM-5", + "organization": "Z.ai", + "parameters": "744B/40B active", + "context_window": "128K", + "swe_bench": null, + "if_score": 90, + "categories": ["reasoning", "agent"], + "provider": "ollama-cloud", + "updated": "2026-02-24", + "pulls": "2.3M" + }, + { + "id": "nemotron-3-super", + "name": "Nemotron 3 Super", + "organization": "NVIDIA", + "parameters": "120B/12B active", + "context_window": "1M", + "swe_bench": 60.5, + "if_score": 78, + "categories": ["agent", "reasoning", "efficient"], + "provider": "ollama-cloud", + "updated": "2026-03-24", + "pulls": "2.4M" + }, + { + "id": "nemotron-3-nano", + "name": "Nemotron 3 Nano", + "organization": "NVIDIA", + "parameters": "30B/4B", + "context_window": "128K", + "swe_bench": null, + "if_score": 68, + "categories": ["agent", "efficient"], + "provider": "ollama-cloud", + "updated": "2026-03-24", + "pulls": "453K" + }, + { + "id": "devstral-2", + "name": "Devstral 2", + "organization": "Mistral / Devstral", + "parameters": "123B", + 
"context_window": "128K", + "swe_bench": null, + "if_score": 80, + "categories": ["coding", "agent"], + "provider": "ollama-cloud", + "updated": "2026-02-24", + "pulls": "223.2K" + }, + { + "id": "devstral-small-2", + "name": "Devstral Small 2", + "organization": "Mistral / Devstral", + "parameters": "24B", + "context_window": "128K", + "swe_bench": null, + "if_score": 75, + "categories": ["coding", "agent"], + "provider": "ollama-cloud", + "updated": "2026-02-24", + "pulls": "838.8K" + } + ], + "if_scores": { + "deepseek-v4-pro-max": 89, + "deepseek-v4-flash": 86, + "kimi-k2.6": 91, + "kimi-k2.5": 90, + "qwen3-coder-480b": 88, + "qwen3.5-122b": 92, + "gemma4-27b": 85, + "minimax-m2.5": 82, + "minimax-m2.7": 80, + "glm-5.1": 90, + "glm-5": 90, + "nemotron-3-super": 78, + "nemotron-3-nano": 68, + "devstral-2": 80, + "devstral-small-2": 75 + }, + "agent_model_scores": [ + { + "agent": "lead-developer", + "current_model_index": 0, + "scores": { + "qwen3-coder-480b": 92, + "deepseek-v4-pro-max": 88, + "deepseek-v4-flash": 85, + "kimi-k2.6": 90, + "kimi-k2.5": 88, + "qwen3.5-122b": 86, + "gemma4-27b": 83, + "minimax-m2.5": 86, + "minimax-m2.7": 82, + "glm-5.1": 68, + "nemotron-3-super": 70, + "devstral-2": 84, + "devstral-small-2": 78 + } + }, + { + "agent": "frontend-developer", + "scores": { + "qwen3-coder-480b": 86, + "deepseek-v4-pro-max": 82, + "deepseek-v4-flash": 80, + "kimi-k2.6": 86, + "kimi-k2.5": 84, + "qwen3.5-122b": 84, + "gemma4-27b": 85, + "minimax-m2.5": 92, + "minimax-m2.7": 88, + "glm-5.1": 56, + "nemotron-3-super": 62, + "devstral-2": 80, + "devstral-small-2": 74 + } + }, + { + "agent": "backend-developer", + "scores": { + "qwen3-coder-480b": 91, + "deepseek-v4-pro-max": 86, + "kimi-k2.6": 90, + "qwen3.5-122b": 85, + "gemma4-27b": 84, + "minimax-m2.5": 84, + "minimax-m2.7": 80, + "glm-5.1": 63, + "nemotron-3-super": 68, + "devstral-2": 82, + "devstral-small-2": 76 + } + }, + { + "agent": "go-developer", + "scores": { + "qwen3-coder-480b": 85, + 
"deepseek-v4-pro-max": 88, + "deepseek-v4-flash": 84, + "kimi-k2.6": 86, + "qwen3.5-122b": 80, + "gemma4-27b": 80, + "minimax-m2.5": 78, + "minimax-m2.7": 74, + "glm-5.1": 58, + "nemotron-3-super": 66, + "devstral-2": 82, + "devstral-small-2": 74 + } + }, + { + "agent": "python-developer", + "scores": { + "qwen3-coder-480b": 90, + "deepseek-v4-pro-max": 78, + "kimi-k2.6": 88, + "qwen3.5-122b": 86, + "gemma4-27b": 82, + "minimax-m2.5": 82, + "minimax-m2.7": 78, + "glm-5.1": 60, + "nemotron-3-super": 66, + "devstral-2": 86, + "devstral-small-2": 80 + } + }, + { + "agent": "php-developer", + "scores": { + "qwen3-coder-480b": 87, + "deepseek-v4-pro-max": 74, + "kimi-k2.6": 86, + "qwen3.5-122b": 84, + "gemma4-27b": 82, + "minimax-m2.5": 76, + "minimax-m2.7": 72, + "glm-5.1": 56, + "nemotron-3-super": 64, + "devstral-2": 80, + "devstral-small-2": 74 + } + }, + { + "agent": "devops-engineer", + "scores": { + "qwen3-coder-480b": 66, + "deepseek-v4-pro-max": 80, + "kimi-k2.6": 88, + "qwen3.5-122b": 75, + "gemma4-27b": 78, + "minimax-m2.5": 53, + "minimax-m2.7": 48, + "glm-5.1": 75, + "nemotron-3-super": 78, + "devstral-2": 72, + "devstral-small-2": 68 + } + }, + { + "agent": "sdet-engineer", + "scores": { + "qwen3-coder-480b": 88, + "deepseek-v4-pro-max": 84, + "kimi-k2.6": 87, + "qwen3.5-122b": 86, + "gemma4-27b": 82, + "minimax-m2.5": 84, + "minimax-m2.7": 80, + "glm-5.1": 63, + "nemotron-3-super": 70, + "devstral-2": 86, + "devstral-small-2": 80 + } + }, + { + "agent": "code-skeptic", + "scores": { + "qwen3-coder-480b": 82, + "deepseek-v4-pro-max": 82, + "kimi-k2.6": 82, + "qwen3.5-122b": 80, + "gemma4-27b": 80, + "minimax-m2.5": 85, + "minimax-m2.7": 80, + "glm-5.1": 72, + "nemotron-3-super": 73, + "devstral-2": 82, + "devstral-small-2": 76 + } + }, + { + "agent": "security-auditor", + "scores": { + "qwen3-coder-480b": 76, + "deepseek-v4-pro-max": 80, + "kimi-k2.6": 80, + "qwen3.5-122b": 78, + "gemma4-27b": 78, + "minimax-m2.5": 74, + "minimax-m2.7": 68, + "glm-5.1": 
68, + "nemotron-3-super": 76, + "devstral-2": 78, + "devstral-small-2": 72 + } + }, + { + "agent": "performance-engineer", + "scores": { + "qwen3-coder-480b": 78, + "deepseek-v4-pro-max": 84, + "kimi-k2.6": 82, + "qwen3.5-122b": 76, + "gemma4-27b": 76, + "minimax-m2.5": 75, + "minimax-m2.7": 70, + "glm-5.1": 74, + "nemotron-3-super": 78, + "devstral-2": 80, + "devstral-small-2": 74 + } + }, + { + "agent": "the-fixer", + "scores": { + "qwen3-coder-480b": 89, + "deepseek-v4-pro-max": 88, + "kimi-k2.6": 90, + "qwen3.5-122b": 86, + "gemma4-27b": 82, + "minimax-m2.5": 88, + "minimax-m2.7": 84, + "glm-5.1": 64, + "nemotron-3-super": 71, + "devstral-2": 86, + "devstral-small-2": 82 + } + }, + { + "agent": "browser-automation", + "scores": { + "qwen3-coder-480b": 87, + "deepseek-v4-pro-max": 82, + "kimi-k2.6": 86, + "qwen3.5-122b": 82, + "gemma4-27b": 84, + "minimax-m2.5": 72, + "minimax-m2.7": 68, + "glm-5.1": 53, + "nemotron-3-super": 61, + "devstral-2": 80, + "devstral-small-2": 74 + } + }, + { + "agent": "visual-tester", + "scores": { + "qwen3-coder-480b": 82, + "deepseek-v4-pro-max": 76, + "kimi-k2.6": 78, + "qwen3.5-122b": 76, + "gemma4-27b": 78, + "minimax-m2.5": 68, + "minimax-m2.7": 64, + "glm-5.1": 48, + "nemotron-3-super": 55, + "devstral-2": 74, + "devstral-small-2": 68 + } + }, + { + "agent": "system-analyst", + "scores": { + "qwen3-coder-480b": 70, + "deepseek-v4-pro-max": 88, + "kimi-k2.6": 86, + "qwen3.5-122b": 82, + "gemma4-27b": 82, + "minimax-m2.5": 66, + "minimax-m2.7": 63, + "glm-5.1": 82, + "nemotron-3-super": 74, + "devstral-2": 80, + "devstral-small-2": 74 + } + }, + { + "agent": "capability-analyst", + "scores": { + "qwen3-coder-480b": 72, + "deepseek-v4-pro-max": 82, + "kimi-k2.6": 82, + "qwen3.5-122b": 80, + "gemma4-27b": 80, + "minimax-m2.5": 68, + "minimax-m2.7": 66, + "glm-5.1": 78, + "nemotron-3-super": 76, + "devstral-2": 78, + "devstral-small-2": 72 + } + }, + { + "agent": "orchestrator", + "scores": { + "qwen3-coder-480b": 74, + 
"deepseek-v4-pro-max": 86, + "kimi-k2.6": 92, + "qwen3.5-122b": 84, + "gemma4-27b": 82, + "minimax-m2.5": 70, + "minimax-m2.7": 68, + "glm-5.1": 82, + "nemotron-3-super": 80, + "devstral-2": 80, + "devstral-small-2": 74 + } + }, + { + "agent": "release-manager", + "scores": { + "qwen3-coder-480b": 72, + "deepseek-v4-pro-max": 78, + "kimi-k2.6": 78, + "qwen3.5-122b": 76, + "gemma4-27b": 76, + "minimax-m2.5": 66, + "minimax-m2.7": 64, + "glm-5.1": 76, + "nemotron-3-super": 74, + "devstral-2": 76, + "devstral-small-2": 70 + } + }, + { + "agent": "evaluator", + "scores": { + "qwen3-coder-480b": 70, + "deepseek-v4-pro-max": 84, + "kimi-k2.6": 84, + "qwen3.5-122b": 82, + "gemma4-27b": 80, + "minimax-m2.5": 73, + "minimax-m2.7": 70, + "glm-5.1": 78, + "nemotron-3-super": 78, + "devstral-2": 80, + "devstral-small-2": 74 + } + }, + { + "agent": "prompt-optimizer", + "scores": { + "qwen3-coder-480b": 76, + "deepseek-v4-pro-max": 80, + "kimi-k2.6": 82, + "qwen3.5-122b": 82, + "gemma4-27b": 80, + "minimax-m2.5": 74, + "minimax-m2.7": 72, + "glm-5.1": 75, + "nemotron-3-super": 76, + "devstral-2": 80, + "devstral-small-2": 74 + } + }, + { + "agent": "product-owner", + "scores": { + "qwen3-coder-480b": 60, + "deepseek-v4-pro-max": 76, + "kimi-k2.6": 76, + "qwen3.5-122b": 76, + "gemma4-27b": 76, + "minimax-m2.5": 56, + "minimax-m2.7": 54, + "glm-5.1": 78, + "nemotron-3-super": 74, + "devstral-2": 76, + "devstral-small-2": 70 + } + }, + { + "agent": "pipeline-judge", + "scores": { + "qwen3-coder-480b": 64, + "deepseek-v4-pro-max": 82, + "kimi-k2.6": 84, + "qwen3.5-122b": 82, + "gemma4-27b": 80, + "minimax-m2.5": 68, + "minimax-m2.7": 65, + "glm-5.1": 76, + "nemotron-3-super": 78, + "devstral-2": 78, + "devstral-small-2": 72 + } + }, + { + "agent": "workflow-architect", + "scores": { + "qwen3-coder-480b": 68, + "deepseek-v4-pro-max": 80, + "kimi-k2.6": 82, + "qwen3.5-122b": 80, + "gemma4-27b": 80, + "minimax-m2.5": 62, + "minimax-m2.7": 60, + "glm-5.1": 76, + "nemotron-3-super": 76, 
+ "devstral-2": 78, + "devstral-small-2": 72 + } + }, + { + "agent": "markdown-validator", + "scores": { + "qwen3-coder-480b": 43, + "deepseek-v4-pro-max": 68, + "kimi-k2.6": 56, + "qwen3.5-122b": 56, + "gemma4-27b": 60, + "minimax-m2.5": 38, + "minimax-m2.7": 36, + "glm-5.1": 55, + "nemotron-3-super": 52, + "nemotron-3-nano": 70, + "devstral-2": 65, + "devstral-small-2": 62 + } + }, + { + "agent": "agent-architect", + "scores": { + "qwen3-coder-480b": 78, + "deepseek-v4-pro-max": 82, + "kimi-k2.6": 86, + "qwen3.5-122b": 80, + "gemma4-27b": 82, + "minimax-m2.5": 72, + "minimax-m2.7": 70, + "glm-5.1": 76, + "nemotron-3-super": 78, + "devstral-2": 80, + "devstral-small-2": 74 + } + }, + { + "agent": "planner", + "scores": { + "qwen3-coder-480b": 72, + "deepseek-v4-pro-max": 88, + "kimi-k2.6": 86, + "qwen3.5-122b": 86, + "gemma4-27b": 84, + "minimax-m2.5": 68, + "minimax-m2.7": 66, + "glm-5.1": 78, + "nemotron-3-super": 80, + "devstral-2": 84, + "devstral-small-2": 78 + } + }, + { + "agent": "reflector", + "scores": { + "qwen3-coder-480b": 68, + "deepseek-v4-pro-max": 84, + "kimi-k2.6": 80, + "qwen3.5-122b": 80, + "gemma4-27b": 80, + "minimax-m2.5": 66, + "minimax-m2.7": 64, + "glm-5.1": 76, + "nemotron-3-super": 78, + "devstral-2": 82, + "devstral-small-2": 76 + } + }, + { + "agent": "memory-manager", + "scores": { + "qwen3-coder-480b": 63, + "deepseek-v4-pro-max": 86, + "kimi-k2.6": 84, + "qwen3.5-122b": 85, + "gemma4-27b": 82, + "minimax-m2.5": 58, + "minimax-m2.7": 56, + "glm-5.1": 72, + "nemotron-3-super": 86, + "devstral-2": 78, + "devstral-small-2": 72 + } + }, + { + "agent": "architect-indexer", + "scores": { + "qwen3-coder-480b": 70, + "deepseek-v4-pro-max": 78, + "kimi-k2.6": 84, + "qwen3.5-122b": 80, + "gemma4-27b": 80, + "minimax-m2.5": 64, + "minimax-m2.7": 62, + "glm-5.1": 80, + "nemotron-3-super": 74, + "devstral-2": 78, + "devstral-small-2": 72 + } + }, + { + "agent": "flutter-developer", + "scores": { + "qwen3-coder-480b": 86, + "deepseek-v4-pro-max": 
78, + "kimi-k2.6": 84, + "qwen3.5-122b": 84, + "gemma4-27b": 84, + "minimax-m2.5": 70, + "minimax-m2.7": 66, + "glm-5.1": 53, + "nemotron-3-super": 60, + "devstral-2": 78, + "devstral-small-2": 74 + } + } + ], + "agent_current_config": [ + { "agent": "lead-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 92, "status": "optimal" }, + { "agent": "frontend-developer", "model": "ollama-cloud/minimax-m2.5", "fit_score": 92, "status": "optimal" }, + { "agent": "backend-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 91, "status": "optimal" }, + { "agent": "go-developer", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 88, "status": "optimal" }, + { "agent": "python-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 90, "status": "optimal" }, + { "agent": "php-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 87, "status": "optimal" }, + { "agent": "flutter-developer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 86, "status": "optimal" }, + { "agent": "devops-engineer", "model": "ollama-cloud/kimi-k2.6", "fit_score": 88, "status": "optimal" }, + { "agent": "sdet-engineer", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 88, "status": "optimal" }, + { "agent": "code-skeptic", "model": "ollama-cloud/minimax-m2.5", "fit_score": 85, "status": "optimal" }, + { "agent": "security-auditor", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 80, "status": "good" }, + { "agent": "performance-engineer", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 84, "status": "optimal" }, + { "agent": "the-fixer", "model": "ollama-cloud/kimi-k2.6", "fit_score": 90, "status": "optimal" }, + { "agent": "browser-automation", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 87, "status": "optimal" }, + { "agent": "visual-tester", "model": "ollama-cloud/qwen3-coder:480b", "fit_score": 82, "status": "good" }, + { "agent": "system-analyst", "model": "ollama-cloud/glm-5.1", 
"fit_score": 82, "status": "good" }, + { "agent": "capability-analyst", "model": "ollama-cloud/glm-5.1", "fit_score": 78, "status": "good" }, + { "agent": "orchestrator", "model": "ollama-cloud/kimi-k2.6", "fit_score": 92, "status": "optimal" }, + { "agent": "release-manager", "model": "ollama-cloud/glm-5.1", "fit_score": 76, "status": "good" }, + { "agent": "evaluator", "model": "ollama-cloud/glm-5.1", "fit_score": 78, "status": "good" }, + { "agent": "prompt-optimizer", "model": "ollama-cloud/qwen3.5", "fit_score": 82, "status": "recommended" }, + { "agent": "product-owner", "model": "ollama-cloud/glm-5.1", "fit_score": 78, "status": "good" }, + { "agent": "pipeline-judge", "model": "ollama-cloud/glm-5.1", "fit_score": 76, "status": "good" }, + { "agent": "workflow-architect", "model": "ollama-cloud/glm-5.1", "fit_score": 76, "status": "good" }, + { "agent": "markdown-validator", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 68, "status": "poor" }, + { "agent": "agent-architect", "model": "ollama-cloud/kimi-k2.6", "fit_score": 86, "status": "optimal" }, + { "agent": "planner", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 88, "status": "optimal" }, + { "agent": "reflector", "model": "ollama-cloud/deepseek-v4-pro-max", "fit_score": 84, "status": "optimal" }, + { "agent": "memory-manager", "model": "ollama-cloud/qwen3.5", "fit_score": 85, "status": "recommended" }, + { "agent": "architect-indexer", "model": "ollama-cloud/glm-5.1", "fit_score": 80, "status": "good" } + ], + "recommendations": [ + { + "agent": "prompt-optimizer", + "from_model": "ollama-cloud/qwen3.6-plus (openrouter)", + "to_model": "ollama-cloud/qwen3.5", + "reason": "Migrated to Ollama Cloud. IF 92, vision+tools+thinking. Same quality, no rate limits.", + "impact": "high", + "applied": false + }, + { + "agent": "memory-manager", + "from_model": "ollama-cloud/qwen3.6-plus (openrouter)", + "to_model": "ollama-cloud/qwen3.5", + "reason": "Migrated to Ollama Cloud. 
Caveat: qwen3.5 has a 128K context window, not 1M. Longer-context alternatives are kimi-k2.6 (256K) and deepseek-v4 (1M). Matrix scores: qwen3.5=85 vs kimi-k2.6=84 vs deepseek=86.", + "impact": "high", + "applied": false + }, + { + "agent": "markdown-validator", + "from_model": "ollama-cloud/deepseek-v4-pro-max", + "to_model": "ollama-cloud/nemotron-3-nano", + "reason": "Markdown validator scores are lowest (68 max). Nemotron-3-Nano IF=68 but is tiny (4B/30B), extremely cheap. For lightweight validation tasks, nano is sufficient.", + "impact": "medium", + "applied": false + }, + { + "agent": "markdown-validator", + "from_model": "ollama-cloud/deepseek-v4-pro-max", + "to_model": "ollama-cloud/gemma4-27b", + "reason": "Gemma 4 is newest (2 days), frontier at each size. It scores 60 for validator versus 70 for nano, so nano is better for this role. Gemma 4 is, however, newer and more general.", + "impact": "low", + "applied": false + }, + { + "agent": "system-analyst", + "from_model": "ollama-cloud/glm-5.1", + "to_model": "ollama-cloud/deepseek-v4-pro-max", + "reason": "Matrix: deepseek-v4-pro-max=88 vs glm-5.1=82. +6% quality, 1M context for architecture docs. GLM-5.1 still strong for standardization.", + "impact": "medium", + "applied": false + }, + { + "agent": "evaluator", + "from_model": "ollama-cloud/glm-5.1", + "to_model": "ollama-cloud/kimi-k2.6", + "reason": "Matrix: kimi-k2.6=84 vs glm-5.1=78. +6%. IF=91 for scoring accuracy. High reasoning needed.", + "impact": "medium", + "applied": false + }, + { + "agent": "evaluator", + "from_model": "ollama-cloud/glm-5.1", + "to_model": "ollama-cloud/deepseek-v4-pro-max", + "reason": "Alternative to kimi-k2.6. deepseek-v4-pro-max=84 (same as kimi), but 1M context. 
Could be better for large evaluation tasks.", + "impact": "medium", + "applied": false + }, + { + "agent": "security-auditor", + "from_model": "ollama-cloud/deepseek-v4-pro-max", + "to_model": "ollama-cloud/kimi-k2.6", + "reason": "Matrix: both 80. But kimi-k2.6 has multimodal (vision) which could help with screenshot-based security analysis. Tie.", + "impact": "low", + "applied": false + }, + { + "agent": "gemma4-trial", + "from_model": "none", + "to_model": "ollama-cloud/gemma4-27b", + "reason": "Gemma 4 is brand new (2 days), 10.1M pulls, frontier at each size, vision+audio+thinking. Could be game-changer for frontend-dev, browser-automation, visual-tester.", + "impact": "high", + "applied": false, + "note": "Requires A/B test on frontend task." + }, + { + "agent": "qwen3.5-trial", + "from_model": "none", + "to_model": "ollama-cloud/qwen3.5-122b", + "reason": "Qwen 3.5 updated 2 days ago, 12.4M pulls, IF=92 (highest!), multimodal. Could replace GLM-5.1 for reasoning tasks and qwen3-coder for some coding tasks.", + "impact": "high", + "applied": false, + "note": "Requires A/B test on planner/evaluator tasks." + } + ], + "new_models_to_consider": [ + { + "id": "gemma4-27b", + "priority": "critical", + "rationale": "Updated 2 days ago. 10.1M pulls. Frontier-level at each size. Vision + audio + thinking + tools + cloud. Potentially replaces qwen3-coder for some tasks." + }, + { + "id": "qwen3.5-122b", + "priority": "critical", + "rationale": "Updated 2 days ago. 12.4M pulls. IF=92 highest among tracked. Multimodal. Could replace glm-5.1 for reasoning and compete with qwen3-coder for coding." + }, + { + "id": "deepseek-v4-flash", + "priority": "medium", + "rationale": "Same family as pro-max but much faster (13B active vs 49B). Good for low-latency agents: code-skeptic, browser-automation." + }, + { + "id": "devstral-2", + "priority": "medium", + "rationale": "123B model for tool use and codebase exploration. Could be strong for lead-developer on large projects." 
+ } + ] +} diff --git a/agent-evolution/data/model-research-2026-05-24.md b/agent-evolution/data/model-research-2026-05-24.md new file mode 100644 index 0000000..2d65320 --- /dev/null +++ b/agent-evolution/data/model-research-2026-05-24.md @@ -0,0 +1,111 @@ +# Agent Model Research Report — 2026-05-24 + +## Executive Summary + +13 model changes recommended across 38 agents. 2 CRITICAL (prompt-optimizer, memory-manager on non-Ollama-Cloud models that must migrate). 4 HIGH priority. 5 MEDIUM. 2 LOW. + +9 models benchmarked but assigned to zero agents—wasted potential. + +## Composite Score Formula +`composite = (IF_score * 0.5) + (SWE_bench * 0.3) + (context_kb / 1000 * 0.2)` + +| Model | IF | SWE | Ctx(K) | Composite | Pulls | Assigned | +|-------|-----|------|--------|-----------|-------|----------| +| kimi-k2.6 | 91 | 80.2 | 1000 | **69.76** | 259.7K | 7 agents | +| deepseek-v4-pro-max | 89 | 80.6 | 1000 | **68.88** | 71.6K | 4 agents | +| kimi-k2.5 | 90 | 78.0 | 256 | **68.45** | 293.2K | **0** | +| deepseek-v4-flash | 86 | 79.0 | 1000 | **66.90** | 84.4K | **0** | +| minimax-m2.5 | 82 | 80.2 | 128 | **65.09** | 2.2M | 2 agents | +| qwen3-coder-480b | 88 | 66.5 | 1000 | **64.15** | N/A | 7 agents | +| minimax-m2.7 | 80 | 78.0 | 128 | **63.43** | 2.2M | **0** | +| nemotron-3-super | 78 | 60.5 | 1000 | **57.35** | 2.4M | 2 agents | +| glm-5.1 | 90 | null | 128 | 45.03* | 2.2M | 8 agents | +| glm-5 | 90 | null | 128 | 45.03* | 2.3M | **0** | +| qwen3.5-122b | 92 | null | 128 | 46.03* | **12.4M** | **0** | +| gemma4-27b | 85 | null | 128 | 42.53* | **10.1M** | **0** | +| devstral-2 | 80 | null | 128 | 40.03* | 223.2K | **0** | +| devstral-small-2 | 75 | null | 128 | 37.53* | 838.8K | **0** | +| nemotron-3-nano | 68 | null | 128 | 34.03* | 453K | **0** | + +\* SWE missing → composite artificially low. Est: +20-25 with SWE~75. 
+ +## Concentration Risks + +| Model | Agents | Risk | +|-------|--------|------| +| glm-5.1 | 8 | All agents on model with NO SWE score | +| kimi-k2.6 | 7 | Highest-quality model over-concentrated | +| qwen3-coder-480b | 7 | SWE=66.5 below deepseek-v4-flash (79) | +| deepseek-v4-pro-max | 4 | Expensive (49B active) | + +## Idle Models (0 agents assigned — wasted potential) + +| Model | Composite | Pulls | Why Idle | +|-------|-----------|-------|----------| +| qwen3.5-122b | ~68.5* | **12.4M** | Newest, highest IF=92, needs integration | +| gemma4-27b | ~62* | **10.1M** | Multimodal, needs A/B for coding | +| deepseek-v4-flash | 66.90 | 84.4K | Best efficiency, 13B active | +| minimax-m2.7 | 63.43 | 2.2M | Self-evolving, could suit meta-agents | +| glm-5 | ~67* | 2.3M | Superseded by glm-5.1 | +| devstral-2 | 40.03* | 223.2K | Code exploration, alternative for coding | +| devstral-small-2 | 37.53* | 838.8K | Lightweight, IF too low | +| kimi-k2.5 | 68.45 | 293.2K | Superseded by k2.6 | +| nemotron-3-nano | 34.03* | 453K | Ultra-lightweight for simple tasks | + +## Recommendations + +### CRITICAL + +| Agent | From | To | Delta | Rationale | +|-------|------|-----|-------|-----------| +| prompt-optimizer | qwen3.6-plus (**not Ollama Cloud**) | qwen3.5-122b (IF=92) | +10 | Must migrate. qwen3.6-plus not in Ollama Cloud. qwen3.5 highest IF=92. 12.4M pulls. | +| memory-manager | qwen3.6-plus (**not Ollama Cloud**) | deepseek-v4-pro-max (IF=89, 1M ctx) | +1 | Must migrate. Memory-manager needs long context (1M). deepseek-v4-pro-max best for this. | + +### HIGH + +| Agent | From | To | Delta | Rationale | +|-------|------|-----|-------|-----------| +| system-analyst | glm-5.1 (matrix=82) | deepseek-v4-pro-max (matrix=88) | +6 | IF=89, SWE=80.6, 1M context for architecture docs. glm-5.1 has no SWE score. | +| evaluator | glm-5.1 (matrix=78) | qwen3.5-122b (IF=92, est=82) | +4 | IF-critical role. qwen3.5-122b has highest IF=92. 12.4M pulls. 
| +| pipeline-judge | glm-5.1 (matrix=76) | kimi-k2.6 (matrix=84) | +8 | Needs long context (pipeline logs). kimi-k2.6 IF=91, SWE=80.2, 1M ctx. | +| workflow-architect | glm-5.1 (matrix=76) | qwen3.5-122b (est=80) | +4 | High IF for YAML/structured output. qwen3.5 IF=92. | + +### MEDIUM + +| Agent | From | To | Delta | Rationale | +|-------|------|-----|-------|-----------| +| markdown-validator | deepseek-v4-pro-max (matrix=68, expensive) | nemotron-3-nano (matrix=70, cheap, 4B) | +2 | Overkill to use 49B active model for markdown validation. nano cheaper + higher matrix score. | +| release-manager | glm-5.1 (matrix=76) | kimi-k2.6 (matrix=78) | +2 | 1M context for large git diffs. IF=91 vs 90. | +| capability-analyst | glm-5.1 (matrix=78) | deepseek-v4-pro-max (matrix=82) | +4 | 1M context for capability-index analysis. | +| visual-tester | qwen3-coder-480b (matrix=82, no vision) | kimi-k2.6 (matrix=82, vision) | +0 (capabilities+) | Same matrix but kimi-k2.6 can SEE images. Multimodal advantage. | +| browser-automation | qwen3-coder-480b (matrix=87, 35B active) | deepseek-v4-flash (IF=86, 13B active, 1M ctx) | ~-5 matrix (trade-off) | 3× faster inference. 1M context for complex DOM. | + +### LOW + +| Agent | From | To | Delta | Rationale | +|-------|------|-----|-------|-----------| +| history-miner | nemotron-3-super (IF=78, composite=57.35) | qwen3.5-122b (IF=92, 12.4M pulls) | +14 IF | Lowest model quality in pipeline. Easy upgrade. | +| plan (built-in) | nemotron-3-super (IF=78) | deepseek-v4-pro-max (IF=89, matrix=88) | +11 IF | Align with planner subagent.| + +## Data Gaps + +| Model | Missing | Impact | +|-------|---------|--------| +| qwen3.5-122b | SWE-bench | Cannot confirm coding. IF-only role safe. | +| gemma4-27b | SWE-bench | Newest release. Needs A/B for coding. | +| glm-5.1 | SWE-bench | 8 agents! Unverified coding capability. | +| devstral-2 | SWE-bench | Code model no coding benchmark—risky. 
| +| nemotron-3-nano | SWE-bench | Not needed: lightweight tasks only. | + +## Recently Updated Models (2 days old) + +- **qwen3.5-122b** (2026-05-22): 12.4M pulls since launch +- **gemma4-27b** (2026-05-22): 10.1M pulls since launch, announced "frontier at each size" + +## Next Actions + +1. Apply CRITICAL: migrate prompt-optimizer + memory-manager +2. Apply HIGH: system-analyst + evaluator + pipeline-judge + workflow-architect +3. Run pipeline A/B test on qwen3.5-122b and deepseek-v4-flash +4. Fill data gaps: collect SWE-bench for qwen3.5-122b and gemma4-27b +5. Update dashboard to show idle model alerts diff --git a/agent-evolution/data/model-research-latest.json b/agent-evolution/data/model-research-latest.json index a88b409..e9177c2 100644 --- a/agent-evolution/data/model-research-latest.json +++ b/agent-evolution/data/model-research-latest.json @@ -1,59 +1,325 @@ { "version": "1.0.0", - "generated": "2026-04-27T17:51:36.000Z", - "source": "/research model-optimization", - "models": [], + "generated": "2026-05-24T00:16:00Z", + "source": "orchestrator-deep-analysis", + "models": [ + { + "id": "deepseek-v4-pro-max", + "name": "DeepSeek V4-Pro Max", + "organization": "DeepSeek", + "parameters": "1.6T/49B active MoE", + "context_window": "1M", + "swe_bench": 80.6, + "if_score": 89, + "categories": ["coding", "agent", "reasoning"], + "provider": "ollama-cloud" + }, + { + "id": "kimi-k2-6", + "name": "Kimi K2.6", + "organization": "Moonshot AI", + "parameters": "1T/32B active MoE", + "context_window": "256K→1M", + "swe_bench": 80.2, + "if_score": 91, + "categories": ["coding", "agent", "multimodal"], + "provider": "ollama-cloud" + }, + { + "id": "qwen3-coder-480b", + "name": "Qwen3-Coder 480B", + "organization": "Qwen", + "parameters": "480B/35B active", + "context_window": "256K→1M", + "swe_bench": 66.5, + "if_score": 88, + "categories": ["coding", "agent"], + "provider": "ollama-cloud" + }, + { + "id": "minimax-m2.5", + "name": "MiniMax M2.5", + "organization": 
"MiniMax", + "parameters": "MoE undisclosed", + "context_window": "128K", + "swe_bench": 80.2, + "if_score": 82, + "categories": ["coding", "agent"], + "provider": "ollama-cloud" + }, + { + "id": "glm-5.1", + "name": "GLM-5", + "organization": "Z.ai", + "parameters": "744B/40B active", + "context_window": "128K", + "swe_bench": null, + "if_score": 90, + "categories": ["reasoning", "agent"], + "provider": "ollama-cloud" + }, + { + "id": "qwen3-6-plus", + "name": "Qwen 3.6 Plus", + "organization": "Qwen", + "parameters": "Hybrid MoE", + "context_window": "1M", + "swe_bench": 78.8, + "if_score": 91, + "categories": ["coding", "agent", "reasoning"], + "provider": "openrouter", + "note": "FREE on OpenRouter. Rate-limited." + } + ], "recommendations": [ { - "agent": "lead-developer", - "action": "update_model", - "current_model": "ollama-cloud/qwen3-coder:480b", - "current_provider": "ollama-cloud", - "recommended_model": "ollama-cloud/nemotron-3-super", - "recommended_provider": "ollama-cloud", + "agent": "frontend-developer", + "action": "sync_to_source_of_truth", + "current_model_in_agent_versions": "ollama-cloud/qwen3-coder:480b", + "source_of_truth_model": "ollama-cloud/minimax-m2.5", "impact": "high", "expected_improvement": { - "quality": "+15%", - "speed": "+20%", - "context_window": "1M→1M" + "quality": "+6% (92 vs 86 in benchmark matrix)", + "speed": "~1x", + "context_window": "128K" }, - "score_before": 85, + "score_before": 86, "score_after": 92, - "score_delta": 7, - "rationale": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.", + "score_delta": 6, + "rationale": "agent-versions.json is stale. kilo-meta.json (source of truth) already has minimax-m2.5. Matrix score for frontend-dev on M2.5 = 92 (highest!). 
MiniMax also leads SWE-bench at 80.2%.", "applied": false, "applied_date": null }, { - "agent": "devops-engineer", - "action": "confirm_model", - "current_model": "ollama-cloud/nemotron-3-super", - "current_provider": "ollama-cloud", - "recommended_model": "ollama-cloud/nemotron-3-super", - "recommended_provider": "ollama-cloud", + "agent": "lead-developer", + "action": "sync_to_source_of_truth", + "current_model_in_agent_versions": "ollama-cloud/nemotron-3-super", + "source_of_truth_model": "ollama-cloud/qwen3-coder:480b", + "impact": "high", + "expected_improvement": { + "quality": "+22% (92 vs 70 in benchmark matrix)", + "speed": "~1x", + "context_window": "256K→1M" + }, + "score_before": 70, + "score_after": 92, + "score_delta": 22, + "rationale": "agent-versions.json shows nemotron-3-super (outdated). kilo-meta.json has qwen3-coder:480b. Matrix score: qwen3-coder 92 is the highest for lead-developer. SWE-bench 66.5% and massive coding context make it the SOTA choice.", + "applied": false, + "applied_date": null + }, + { + "agent": "system-analyst", + "action": "consider_upgrade", + "current_model": "ollama-cloud/glm-5.1", + "recommended_model": "ollama-cloud/deepseek-v4-pro-max", + "impact": "medium", + "expected_improvement": { + "quality": "+6% (88 vs 82 in benchmark matrix)", + "speed": "~1x", + "context_window": "128K→1M" + }, + "score_before": 82, + "score_after": 88, + "score_delta": 6, + "rationale": "system-analyst matrix: glm-5.1 = 82, deepseek-v4-pro-max = 88. 1M context is critical for architecture docs. However GLM-5.1 has Arena ELO 1451 and strong reasoning. 
Keep GLM-5.1 if standardization across 12 agents matters; otherwise deepseek-v4-pro-max gives measurable gain.", + "applied": false, + "applied_date": null + }, + { + "agent": "evaluator", + "action": "consider_upgrade", + "current_model": "ollama-cloud/glm-5.1", + "recommended_model": "ollama-cloud/kimi-k2.6", + "impact": "medium", + "expected_improvement": { + "quality": "+6% (84 vs 78)", + "speed": "~1x", + "context_window": "128K→256K" + }, + "score_before": 78, + "score_after": 84, + "score_delta": 6, + "rationale": "evaluator needs high IF and reasoning accuracy. kimi-k2-6 IF=91, matrix score 84 vs glm-5.1 78. Alternative: deepseek-v4-pro-max also 84.", + "applied": false, + "applied_date": null + }, + { + "agent": "planner", + "action": "confirm_current", + "current_model": "ollama-cloud/deepseek-v4-pro-max", "impact": "low", "expected_improvement": { - "quality": "0%", - "speed": "0%", - "context_window": "1M→1M" + "quality": "0% (already optimal)", + "speed": "~1x", + "context_window": "1M" }, "score_before": 88, "score_after": 88, "score_delta": 0, - "rationale": "Current model already optimal for DevOps tasks. Nemotron 3 Super's RULER@1M is critical for parsing complex Docker/Compose configs.", + "rationale": "planner is already on deepseek-v4-pro-max, which is the best model for this role (88). GPQA 90.1 confirms strong reasoning for chain-of-thought planning. No change needed.", + "applied": true, + "applied_date": "2026-04-27" + }, + { + "agent": "reflector", + "action": "confirm_current", + "current_model": "ollama-cloud/deepseek-v4-pro-max", + "impact": "low", + "expected_improvement": { + "quality": "0% (already optimal)", + "speed": "~1x", + "context_window": "1M" + }, + "score_before": 84, + "score_after": 84, + "score_delta": 0, + "rationale": "reflector already on deepseek-v4-pro-max (84), the best fit. 
Self-reflection requires strong reasoning chains; deepseek-v4 excels here.", + "applied": true, + "applied_date": "2026-04-27" + }, + { + "agent": "workflow-architect", + "action": "consider_upgrade", + "current_model": "ollama-cloud/glm-5.1", + "recommended_model": "ollama-cloud/kimi-k2.6", + "impact": "medium", + "expected_improvement": { + "quality": "+6% (82 vs 76)", + "speed": "~1x", + "context_window": "128K→256K" + }, + "score_before": 76, + "score_after": 82, + "score_delta": 6, + "rationale": "workflow-architect matrix: glm-5.1 = 76, kimi-k2-6 = 82. Alternative deepseek-v4-pro-max = 80.", "applied": false, "applied_date": null + }, + { + "agent": "pipeline-judge", + "action": "consider_free_tier", + "current_model": "ollama-cloud/glm-5.1", + "recommended_model": "openrouter/qwen3-6-plus:free", + "impact": "low", + "expected_improvement": { + "quality": "+4% (80 vs 76)", + "speed": "~1x (rate-limited)", + "context_window": "128K→1M" + }, + "score_before": 76, + "score_after": 80, + "score_delta": 4, + "rationale": "qwen3-6-plus is FREE on OpenRouter with IF=91 and SWE-bench 78.8. For pipeline-judge (measurement-only, no code writing) free tier can cut costs. BUT: OpenRouter free has strict rate limits; verify before production.", + "applied": false, + "applied_date": null + }, + { + "agent": "orchestrator", + "action": "confirm_current", + "current_model": "ollama-cloud/kimi-k2.6", + "impact": "low", + "expected_improvement": { + "quality": "0% (already optimal)", + "speed": "~1x", + "context_window": "256K" + }, + "score_before": 92, + "score_after": 92, + "score_delta": 0, + "rationale": "orchestrator on kimi-k2.6 is the absolute best fit (92). 300 sub-agent swarm capability aligns with orchestration needs. 
IF=91 ensures routing accuracy.", + "applied": true, + "applied_date": "2026-04-27" + }, + { + "agent": "the-fixer", + "action": "confirm_current", + "current_model": "ollama-cloud/kimi-k2.6", + "impact": "low", + "expected_improvement": { + "quality": "0% (already optimal)", + "speed": "~1x", + "context_window": "256K" + }, + "score_before": 90, + "score_after": 90, + "score_delta": 0, + "rationale": "the-fixer on kimi-k2.6 (90) is optimal. SWE-Pro 58.6 (#1!) and strong bug-fixing capabilities make it the best choice. MiniMax M2.5 and DeepSeek V4-Pro Max tie at 88, but kimi-k2-6 leads.", + "applied": true, + "applied_date": "2026-04-27" + }, + { + "agent": "memory-manager", + "action": "confirm_current", + "current_model": "ollama-cloud/qwen3.6-plus", + "impact": "low", + "expected_improvement": { + "quality": "0% (already optimal)", + "speed": "~1x", + "context_window": "1M" + }, + "score_before": 87, + "score_after": 87, + "score_delta": 0, + "rationale": "memory-manager on qwen3.6-plus (87) is the best fit. 1M context is critical for memory operations. 
DeepSeek V4-Pro Max and Nemotron-3-Super tie at 86.", + "applied": true, + "applied_date": "2026-04-27" + } + ], + "data_gaps": [ + { + "gap": "performance_log is empty for ALL agents", + "severity": "critical", + "impact": "Cannot compute Avg Score, Success Rate, Avg Duration", + "action": "Instrument agent-executions.jsonl parser into sync-agent-history.ts to populate performance_log from Gitea issue comments" + }, + { + "gap": "No latency / TPS per model", + "severity": "high", + "impact": "Cannot optimize speed or cost-per-token for high-frequency agents (orchestrator, code-skeptic)", + "action": "Add timing instrumentation to pipeline-judge and log wall-clock time per agent invocation" + }, + { + "gap": "No invocation frequency / heatmap per agent", + "severity": "medium", + "impact": "Cannot identify bottlenecks or overused agents; no data for load-balancing decisions", + "action": "Add invocation counter to agent-executions.jsonl and build frequency heatmap in dashboard" + }, + { + "gap": "No A/B test results for model changes", + "severity": "medium", + "impact": "Recommendations are purely benchmark-based, not validated with real pipeline data", + "action": "After any model change, run 5 pipeline iterations and compare fitness scores before/after" + }, + { + "gap": "Missing cost data for OpenRouter free-tier agents", + "severity": "medium", + "impact": "Cannot compute true ROI for pipeline-judge / evaluator if switched to free models", + "action": "Track actual token consumption per provider and compute $/task" + }, + { + "gap": "Stale agent-versions.json (not synced with kilo-meta.json)", + "severity": "high", + "impact": "Dashboard shows incorrect current models for 8+ agents; recommendations targeting wrong baseline", + "action": "Run sync-agent-history.ts with kilo-meta.json as primary source and fix JSON parse error in kilo.jsonc" + }, + { + "gap": "No custom benchmark for markdown-validator", + "severity": "low", + "impact": "markdown-validator 
scores are lowest across matrix (68 max). Need lightweight-model benchmark.", + "action": "Create micro-benchmark for YAML frontmatter validation and test nano/instant models" } ], - "heatmap": {}, - "closed_source_comparison": {}, - "capability_index_patch": [], "summary": { - "avg_quality_improvement": "+7.5%", - "providers_used": ["ollama-cloud"], - "key_models": ["nemotron-3-super"], - "total_recommendations": 2, - "applied_count": 0, - "pending_count": 2 + "agents_total": 34, + "agents_optimal": 22, + "agents_need_sync": 2, + "agents_need_upgrade": 4, + "agents_consider_free_tier": 1, + "avg_quality_improvement_potential": "+4.2%", + "providers_used": ["ollama-cloud", "openrouter"], + "key_models": ["kimi-k2.6", "deepseek-v4-pro-max", "qwen3-coder-480b", "minimax-m2.5", "glm-5.1"], + "pending_recommendations": 11, + "critical_data_gaps": 2 } -} \ No newline at end of file +} diff --git a/agent-evolution/docker-compose.yml b/agent-evolution/docker-compose.yml index 61ebbea..3bd1e69 100644 --- a/agent-evolution/docker-compose.yml +++ b/agent-evolution/docker-compose.yml @@ -1,6 +1,11 @@ -# Docker Compose for Agent Evolution Dashboard -# Usage: docker-compose -f docker-compose.evolution.yml up -d - +# Docker Compose for Agent Evolution Dashboard (mount-driven, no-rebuild) +# Usage (run from the repository root; paths below are relative to it): +# docker compose --project-directory . -f agent-evolution/docker-compose.yml up -d +# # Edit any file in agent-evolution/ or .kilo/ on host → instant reflection +# # Just run: +# bun run sync:evolution +# # and reload the page +# version: '3.8' services: @@ -8,17 +13,16 @@ services: build: context: . 
dockerfile: agent-evolution/Dockerfile - target: production container_name: apaw-evolution ports: - "3001:3001" volumes: - # Mount data directory for live updates + # Mount the generated standalone HTML to the container's web root + - ./agent-evolution/index.standalone.html:/app/index.html:ro + # Mount data directory for any additional assets - ./agent-evolution/data:/app/data:ro - # Mount for reading source files (optional, for sync) - - ./.kilo/agents:/app/kilo/agents:ro - - ./.kilo/capability-index.yaml:/app/kilo/capability-index.yaml:ro - - ./.kilo/kilo.jsonc:/app/kilo/kilo.jsonc:ro + # Mount .kilo directory for live config access + - ./.kilo:/app/kilo:ro environment: - NODE_ENV=production - TZ=UTC diff --git a/agent-evolution/docker-run.bat b/agent-evolution/docker-run.bat index 0450ee7..75cdd1c 100644 --- a/agent-evolution/docker-run.bat +++ b/agent-evolution/docker-run.bat @@ -1,12 +1,17 @@ @echo off REM Agent Evolution Dashboard - Docker Management Script (Windows) +REM Mount-driven: no rebuild required after file changes. +REM +REM Quick start: +REM 1. docker-run.bat run :: start container once +REM 2. edit files + bun run sync:evolution +REM 3. docker-run.bat reload :: restart container to pick up latest files (no rebuild) setlocal enabledelayedexpansion set IMAGE_NAME=apaw-evolution set CONTAINER_NAME=apaw-evolution-dashboard set PORT=3001 -set DATA_DIR=.\agent-evolution\data REM Colors (limited in Windows CMD) set RED=[91m @@ -20,12 +25,12 @@ if "%1"=="build" goto build if "%1"=="run" goto run if "%1"=="stop" goto stop if "%1"=="restart" goto restart +if "%1"=="reload" goto reload if "%1"=="logs" goto logs if "%1"=="open" goto open if "%1"=="sync" goto sync if "%1"=="status" goto status if "%1"=="clean" goto clean -if "%1"=="dev" goto dev if "%1"=="help" goto help goto unknown @@ -43,7 +48,7 @@ goto :eof :build call :log_info Building Docker image... -docker build -t %IMAGE_NAME%:latest -f agent-evolution/Dockerfile --target production . 
+docker build -t %IMAGE_NAME%:latest -f agent-evolution/Dockerfile . if errorlevel 1 ( call :log_error Build failed exit /b 1 @@ -56,7 +61,8 @@ REM Check if already running docker ps -q --filter "name=%CONTAINER_NAME%" 2>nul | findstr /r . >nul if not errorlevel 1 ( call :log_warn Container %CONTAINER_NAME% is already running - call :log_info Use 'docker-run.bat restart' to restart it + call :log_info Use 'docker-run.bat reload' to restart with latest host files + call :log_info Use 'docker-run.bat restart' to rebuild image and restart exit /b 0 ) @@ -67,14 +73,13 @@ if not errorlevel 1 ( docker rm %CONTAINER_NAME% >nul 2>nul ) -call :log_info Starting container... +call :log_info Starting container with mount-driven volumes... docker run -d ^ --name %CONTAINER_NAME% ^ -p %PORT%:3001 ^ - -v %cd%/%DATA_DIR%:/app/data:ro ^ - -v %cd%/.kilo/agents:/app/kilo/agents:ro ^ - -v %cd%/.kilo/capability-index.yaml:/app/kilo/capability-index.yaml:ro ^ - -v %cd%/.kilo/kilo.jsonc:/app/kilo/kilo.jsonc:ro ^ + -v %cd%\agent-evolution\index.standalone.html:/app/index.html:ro ^ + -v %cd%\agent-evolution\data:/app/data:ro ^ + -v %cd%\.kilo:/app/kilo:ro ^ --restart unless-stopped ^ %IMAGE_NAME%:latest @@ -84,6 +89,9 @@ if errorlevel 1 ( ) call :log_info Container started: %CONTAINER_NAME% call :log_info Dashboard available at: http://localhost:%PORT% +call :log_info Mounted: .\agent-evolution\index.standalone.html -> /app/index.html +call :log_info .\agent-evolution\data -> /app/data +call :log_info .\.kilo -> /app/kilo goto :eof :stop @@ -93,7 +101,14 @@ docker rm %CONTAINER_NAME% >nul 2>nul call :log_info Container stopped goto :eof +:reload +call :log_info Reloading container to reflect host file changes... +call :stop +call :run +goto :eof + :restart +call :log_info Full restart: rebuild image + restart container... 
call :stop call :build call :run @@ -123,7 +138,7 @@ if not errorlevel 1 ( exit /b 1 ) ) -call :log_info Sync complete +call :log_info Sync complete — run 'docker-run.bat reload' to pick up changes goto :eof :status @@ -131,11 +146,11 @@ docker ps -q --filter "name=%CONTAINER_NAME%" 2>nul | findstr /r . >nul if not errorlevel 1 ( call :log_info Container status: %GREEN%RUNNING%NC% call :log_info URL: http://localhost:%PORT% - + REM Health check for /f "tokens=*" %%i in ('docker inspect --format="{{.State.Health.Status}}" %CONTAINER_NAME% 2^>nul') do set HEALTH=%%i call :log_info Health: !HEALTH! - + REM Started time for /f "tokens=*" %%i in ('docker inspect --format="{{.State.StartedAt}}" %CONTAINER_NAME% 2^>nul') do set STARTED=%%i if defined STARTED call :log_info Started: !STARTED! @@ -156,37 +171,27 @@ docker rmi %IMAGE_NAME%:latest >nul 2>nul call :log_info Cleanup complete goto :eof -:dev -call :log_info Starting development mode... -docker build -t %IMAGE_NAME%:dev -f agent-evolution/Dockerfile --target development . -if errorlevel 1 ( - call :log_error Build failed - exit /b 1 -) -docker run --rm ^ - --name %CONTAINER_NAME%-dev ^ - -p %PORT%:3001 ^ - -v %cd%/%DATA_DIR%:/app/data ^ - -v %cd%/agent-evolution/index.html:/app/index.html ^ - %IMAGE_NAME%:dev -goto :eof - :help -echo Agent Evolution Dashboard - Docker Management (Windows) +echo Agent Evolution Dashboard - Docker Management (mount-driven, no-rebuild) +echo. +echo Quick start: +echo 1. docker-run.bat run ^:: Start container once +echo 2. edit files + bun run sync:evolution +echo 3. docker-run.bat reload ^:: Container picks up changes immediately echo. echo Usage: %~nx0 ^ echo. 
echo Commands: -echo build Build Docker image -echo run Run container -echo stop Stop container -echo restart Restart container (build + run) +echo build Build Docker image (rare — only Dockerfile changes) +echo run Start container for the first time +echo stop Stop and remove container +echo reload Restart container to pick up latest host files (no rebuild) +echo restart Rebuild image AND restart container echo logs View container logs echo open Open dashboard in browser -echo sync Sync evolution data +echo sync Sync evolution data on host echo status Show container status -echo clean Remove container and image -echo dev Run in development mode (with hot reload) +echo clean Remove container AND image echo help Show this help message goto :eof diff --git a/agent-evolution/docker-run.sh b/agent-evolution/docker-run.sh index a8aa9db..c8015b4 100644 --- a/agent-evolution/docker-run.sh +++ b/agent-evolution/docker-run.sh @@ -1,12 +1,17 @@ #!/bin/bash # Agent Evolution Dashboard - Docker Management Script +# Mount-driven: no rebuild required after file changes. +# +# Quick-ref: +# bash agent-evolution/docker-run.sh run # start (no rebuild needed later) +# bash agent-evolution/docker-run.sh reload # restart container to pick up new mounts +# bash agent-evolution/docker-run.sh restart # rebuild image + restart container set -e IMAGE_NAME="apaw-evolution" CONTAINER_NAME="apaw-evolution-dashboard" -PORT=3001 -DATA_DIR="./agent-evolution/data" +PORT=3003 # Colors for output RED='\033[0;31m' @@ -18,23 +23,23 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $1"; } log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; } log_error() { echo -e "${RED}[ERROR]${NC} $1"; } -# Build Docker image +# Build Docker image (rarely needed — only on Dockerfile / base-image changes) build() { log_info "Building Docker image..." docker build \ -t "$IMAGE_NAME:latest" \ -f agent-evolution/Dockerfile \ - --target production \ . 
log_info "Build complete: $IMAGE_NAME:latest" } -# Run container +# Run container with directory mounts (no file copies) run() { # Check if container already running if docker ps -q --filter "name=$CONTAINER_NAME" | grep -q .; then log_warn "Container $CONTAINER_NAME is already running" - log_info "Use '$0 restart' to restart it" + log_info "Use '$0 reload' to restart with latest host files" + log_info "Use '$0 restart' to rebuild image and restart" exit 0 fi @@ -44,14 +49,13 @@ run() { docker rm "$CONTAINER_NAME" >/dev/null || true fi - log_info "Starting container..." + log_info "Starting container with mount-driven volumes..." docker run -d \ --name "$CONTAINER_NAME" \ -p "$PORT:3001" \ - -v "$(pwd)/$DATA_DIR:/app/data:ro" \ - -v "$(pwd)/.kilo/agents:/app/kilo/agents:ro" \ - -v "$(pwd)/.kilo/capability-index.yaml:/app/kilo/capability-index.yaml:ro" \ - -v "$(pwd)/.kilo/kilo.jsonc:/app/kilo/kilo.jsonc:ro" \ + -v "$(pwd)/agent-evolution/index.standalone.html:/app/index.html:ro" \ + -v "$(pwd)/agent-evolution/data:/app/data:ro" \ + -v "$(pwd)/.kilo:/app/kilo:ro" \ --restart unless-stopped \ --health-cmd "wget --no-verbose --tries=1 --spider http://localhost:3001/ || exit 1" \ --health-interval "30s" \ @@ -61,9 +65,13 @@ run() { log_info "Container started: $CONTAINER_NAME" log_info "Dashboard available at: http://localhost:$PORT" + log_info "Mounted: ./agent-evolution/index.standalone.html → /app/index.html" + log_info " ./agent-evolution/data → /app/data" + log_info " ./.kilo → /app/kilo" + log_info "Tip: edit host files, run bun run sync:evolution, then reload page or use '$0 reload'" } -# Stop container +# Stop and remove container stop() { log_info "Stopping container..." docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true @@ -71,8 +79,16 @@ stop() { log_info "Container stopped" } -# Restart container +# Restart container WITHOUT rebuilding image (picks up new host files) +reload() { + log_info "Reloading container to reflect host file changes..." 
+ stop + run +} + +# Rebuild image AND restart container (only when Dockerfile changes) restart() { + log_info "Full restart: rebuild image + restart container..." stop build run @@ -99,7 +115,7 @@ open() { fi } -# Sync evolution data +# Sync evolution data on host (generates index.standalone.html from latest data) sync() { log_info "Syncing evolution data..." if command -v bun &> /dev/null; then @@ -110,7 +126,7 @@ sync() { log_error "Node.js or Bun required for sync" exit 1 fi - log_info "Sync complete" + log_info "Sync complete — run '$0 reload' to pick up changes" } # Status check @@ -138,47 +154,33 @@ status() { } # Clean up clean() { log_info "Cleaning up..." stop docker rmi "$IMAGE_NAME:latest" >/dev/null 2>&1 || true log_info "Cleanup complete" } -# Development mode with hot reload -dev() { - log_info "Starting development mode..." - docker build \ - -t "$IMAGE_NAME:dev" \ - -f agent-evolution/Dockerfile \ - --target development \ - . - - docker run --rm \ - --name "${CONTAINER_NAME}-dev" \ - -p "$PORT:3001" \ - -v "$(pwd)/$DATA_DIR:/app/data" \ - -v "$(pwd)/agent-evolution/index.html:/app/index.html" \ - "$IMAGE_NAME:dev" -} - # Show help show_help() { - echo "Agent Evolution Dashboard - Docker Management" + echo "Agent Evolution Dashboard - Docker Management (mount-driven, no-rebuild)" echo "" - echo "Usage: $0 " + echo "Quick start:" + echo " 1. bash $0 run # Start container once" + echo " 2. edit files + bun run sync:evolution" + echo " 3. 
bash $0 reload # Container picks up changes immediately" echo "" echo "Commands:" - echo " build Build Docker image" - echo " run Run container" - echo " stop Stop container" - echo " restart Restart container (build + run)" + echo " build Build Docker image (rare — only Dockerfile changes)" + echo " run Start container for the first time" + echo " stop Stop and remove container" + echo " reload Restart container to pick up latest host files (no rebuild)" + echo " restart Rebuild image AND restart container" echo " logs View container logs" echo " open Open dashboard in browser" - echo " sync Sync evolution data" + echo " sync Run sync-agent-history.ts on host" echo " status Show container status" - echo " clean Remove container and image" - echo " dev Run in development mode (with hot reload)" + echo " clean Remove container AND image" echo " help Show this help message" } @@ -187,13 +189,17 @@ case "${1:-help}" in build) build ;; run) run ;; stop) stop ;; + reload) reload ;; restart) restart ;; logs) logs ;; open) open ;; sync) sync ;; status) status ;; clean) clean ;; - dev) dev ;; + dev) + log_warn "'dev' mode deprecated — use 'run' + volume mounts instead." 
+ log_info "Run: bash $0 run" + ;; help) show_help ;; *) log_error "Unknown command: $1" diff --git a/agent-evolution/index.html b/agent-evolution/index.html index bb40485..00f4c48 100644 --- a/agent-evolution/index.html +++ b/agent-evolution/index.html @@ -472,6 +472,59 @@ .score-fill.medium { background: linear-gradient(90deg, var(--accent-orange), #ffc048); } .score-fill.low { background: linear-gradient(90deg, var(--accent-red), #ff6b81); } + /* Heatmap */ + .hm-wrap { overflow-x:auto; border-radius:11px; border:1px solid var(--border); background:var(--bg-card); padding:18px; margin-bottom:26px; } + .hm-title { font-weight:700; font-size:1.05em; } + .hm-sub { font-size:.76em; color:var(--text-muted); margin-bottom:14px; } + .hm-table { border-collapse:separate; border-spacing:2px; width:100%; } + .hm-table th { font-family:'JetBrains Mono',monospace; font-size:.62em; color:var(--text-muted); padding:8px 5px; text-align:center; white-space:nowrap; vertical-align:bottom; } + .hm-table th.hm-role { text-align:left; min-width:140px; font-size:.68em; padding-left:10px; } + .hm-table td { text-align:center; padding:6px 4px; font-family:'JetBrains Mono',monospace; font-size:.72em; font-weight:700; border-radius:6px; cursor:pointer; transition:all .15s cubic-bezier(.4,0,.2,1); min-width:42px; position:relative; line-height:1.4; } + .hm-table td:hover { transform:scale(1.1); z-index:2; box-shadow:0 4px 12px rgba(0,0,0,.35); } + .hm-table td.hm-r { text-align:left; font-family:'Inter',sans-serif; font-size:.82em; font-weight:600; color:var(--text-primary); cursor:default; padding-left:10px; } + .hm-table td.hm-r:hover { transform:none; box-shadow:none; } + .hm-star { position:absolute; top:2px; right:2px; font-size:.65em; text-shadow:0 1px 2px rgba(0,0,0,.5); } + .hm-cur { box-shadow:inset 0 0 0 2px var(--accent-cyan), 0 0 8px rgba(0,212,255,.35); border-radius:6px; } + .hm-cur::after { content:''; position:absolute; bottom:2px; left:50%; transform:translateX(-50%); 
width:8px; height:3px; background:var(--accent-cyan); border-radius:2px; } + .hm-if-warn { position:absolute; top:2px; left:2px; font-size:.6em; opacity:.8; } + + /* Smooth gradient legend bar */ + .hm-legend-wrap { margin-top:18px; padding:0 4px; } + .hm-legend-track { position:relative; height:22px; border-radius:11px; background:linear-gradient(90deg, rgba(0,255,148,.85) 0%, rgba(0,212,255,.75) 20%, rgba(59,130,246,.6) 40%, rgba(168,85,247,.45) 58%, rgba(255,159,67,.35) 75%, rgba(255,71,87,.3) 88%, rgba(90,104,128,.2) 100%); box-shadow:inset 0 1px 3px rgba(0,0,0,.3); } + .hm-legend-labels { display:flex; justify-content:space-between; align-items:center; margin-top:8px; padding:0 4px; } + .hm-legend-labels span { font-size:.68em; font-family:'JetBrains Mono',monospace; color:var(--text-muted); } + .hm-legend-left { color:var(--accent-green); } + .hm-legend-right { color:var(--accent-red); } + .hm-legend-marks { display:flex; justify-content:space-between; padding:0 2px; margin-top:3px; } + .hm-legend-marks span { font-size:.58em; font-family:'JetBrains Mono',monospace; color:var(--text-muted); min-width:20px; text-align:center; } + + /* Heatmap Modal Tabs */ + .hm-modal-tabs { display:flex; gap:3px; background:var(--bg-panel); border-bottom:1px solid var(--border); padding:4px 18px; } + .hm-tab-btn { padding:8px 16px; background:none; border:none; color:var(--text-secondary); font-family:'Inter'; font-size:.82em; font-weight:600; border-radius:8px; cursor:pointer; transition:all .25s; } + .hm-tab-btn.active { color:var(--bg-deep); background:linear-gradient(135deg,var(--accent-cyan),var(--accent-green)); } + .hm-tab-content { display:none; } + .hm-tab-content.active { display:block; } + .hm-model-timeline { display:flex; flex-direction:column; gap:12px; } + .hm-tl-item { display:flex; gap:14px; align-items:center; padding:10px; background:var(--bg-deep); border-radius:8px; border-left:3px solid var(--accent-cyan); } + .hm-tl-date { font-family:'JetBrains 
Mono',monospace; font-size:.72em; color:var(--text-muted); min-width:100px; } + .hm-tl-change { display:flex; align-items:center; gap:8px; } + .hm-tl-from { text-decoration:line-through; color:#ff6b81; background:rgba(255,71,87,.08); padding:2px 6px; border-radius:4px; } + .hm-tl-arrow { color:var(--accent-green); } + .hm-tl-to { color:var(--accent-green); background:rgba(0,255,148,.08); padding:2px 6px; border-radius:4px; font-weight:600; } + .hm-tl-current { border-left-color:var(--accent-green); background:rgba(0,255,148,.05); } + .hm-no-data { color:var(--text-muted); font-size:.9em; padding:16px; text-align:center; } + .hm-capabilities { display:flex; flex-wrap:wrap; gap:6px; } + .hm-cap-tag { padding:4px 10px; background:rgba(0,212,255,.1); border:1px solid var(--border); border-radius:16px; font-size:.78em; color:var(--accent-cyan); } + .hm-agent-desc { font-size:.9em; color:var(--text-secondary); line-height:1.5; margin-bottom:14px; padding:12px; background:var(--bg-deep); border-radius:8px; } + .hm-model-tl-score { margin-left:auto; font-family:'JetBrains Mono',monospace; font-size:.8em; color:var(--accent-cyan); } + + /* Tooltip */ + #ttOverlay { display:none; position:fixed; top:0;left:0;right:0;bottom:0; z-index:999; pointer-events:none; } + #ttOverlay.show { display:block; } + #ttBox { position:absolute; background:var(--bg-panel); border:1px solid var(--accent-cyan); border-radius:9px; padding:12px 16px; max-width:300px; box-shadow:0 10px 32px rgba(0,0,0,.55); z-index:1000; } + #ttBox h4 { color:var(--accent-cyan); font-size:.9em; margin-bottom:4px; } + #ttBox p { font-size:.78em; color:var(--text-secondary); line-height:1.45; } + /* Export */ .actions-row { display: flex; @@ -551,11 +604,137 @@ white-space: pre-wrap; } + /* Impact Tab */ + .chart-wrap { background: var(--bg-card); border: 1px solid var(--border); border-radius: 12px; padding: 20px; margin-bottom: 24px; } + .chart-title { font-size: 1.1em; font-weight: 700; margin-bottom: 16px; } + 
.chart-sub { font-size: 0.76em; color: var(--text-muted); margin-bottom: 14px; } + #impactCanvas { width: 100%; height: 300px; border-radius: 8px; background: var(--bg-panel); } + .chart-placeholder { text-align: center; padding: 60px 20px; color: var(--text-muted); font-size: 0.95em; } + + /* Recommendation Cards */ + .rec-card { position: relative; /* containing block for the absolutely positioned .rec-checkbox, so it stays anchored to the card with or without the hover transform */ background: var(--bg-card); border: 1px solid var(--border); border-radius: 12px; padding: 20px; transition: all 0.3s; margin-bottom: 16px; } + .rec-card:hover { border-color: var(--accent-cyan); transform: translateY(-2px); box-shadow: 0 8px 32px var(--glow-cyan); } + .rec-hdr { display: flex; justify-content: space-between; align-items: center; margin-bottom: 14px; } + .rec-agent { font-weight: 700; font-size: 1.1em; display: flex; align-items: center; gap: 10px; } + .rec-agent-name { color: var(--text-primary); } + .impact-badge { font-family: 'JetBrains Mono', monospace; font-size: 0.7em; font-weight: 700; padding: 4px 10px; border-radius: 6px; text-transform: uppercase; letter-spacing: 0.5px; } + .impact-badge.critical { background: rgba(255,71,87,0.2); color: #ff6b81; border: 1px solid rgba(255,71,87,0.4); } + .impact-badge.high { background: rgba(255,159,67,0.2); color: #ffc048; border: 1px solid rgba(255,159,67,0.4); } + .impact-badge.medium { background: rgba(59,130,246,0.2); color: #60a5fa; border: 1px solid rgba(59,130,246,0.4); } + .impact-badge.low { background: rgba(0,255,148,0.15); color: #4ade80; border: 1px solid rgba(0,255,148,0.3); } + .swap-vis { display: flex; align-items: center; gap: 12px; margin: 16px 0; padding: 14px; background: var(--bg-panel); border-radius: 8px; } + .swap-from, .swap-to { flex: 1; padding: 10px 14px; border-radius: 6px; font-family: 'JetBrains Mono', monospace; font-size: 0.8em; } + .swap-from { background: rgba(255,71,87,0.1); color: #ff6b81; border: 1px solid rgba(255,71,87,0.3); } + .swap-to { background: rgba(0,255,148,0.1); color: #4ade80; border: 1px solid rgba(0,255,148,0.3); } + 
.swap-arrow { color: var(--accent-cyan); font-size: 1.4em; font-weight: 700; } + .rec-metrics { display: grid; grid-template-columns: repeat(4, 1fr); gap: 12px; margin-bottom: 14px; } + .rec-metric { text-align: center; padding: 10px; background: var(--bg-panel); border-radius: 6px; } + .rec-metric-label { font-size: 0.65em; color: var(--text-muted); text-transform: uppercase; letter-spacing: 0.5px; } + .rec-metric-value { font-family: 'JetBrains Mono', monospace; font-size: 0.95em; font-weight: 600; color: var(--accent-green); margin-top: 4px; } + .rec-rationale { font-size: 0.85em; color: var(--text-secondary); line-height: 1.6; padding: 12px; background: rgba(0,212,255,0.05); border-radius: 6px; border-left: 3px solid var(--accent-cyan); } + + /* Recommendation Card Checkbox */ + .rec-checkbox { position: absolute; top: 16px; right: 16px; } + .rec-checkbox input { width: 18px; height: 18px; cursor: pointer; accent-color: var(--accent-cyan); } + + /* Progress Modal */ + .progress-overlay { + display: none; + position: fixed; + inset: 0; + background: rgba(0,0,0,0.85); + z-index: 10000; + justify-content: center; + align-items: center; + flex-direction: column; + } + .progress-overlay.show { display: flex; } + .progress-card { + background: var(--bg-panel); + border: 1px solid var(--accent-cyan); + border-radius: 14px; + padding: 32px 40px; + text-align: center; + max-width: 500px; + width: 90%; + box-shadow: 0 20px 60px rgba(0,0,0,0.5); + } + .progress-title { font-size: 1.2em; font-weight: 700; margin-bottom: 24px; } + .progress-bar-wrap { background: var(--bg-card); border-radius: 4px; height: 8px; overflow: hidden; margin-bottom: 20px; } + .progress-bar-fill { + height: 100%; + width: 0%; + background: linear-gradient(90deg, var(--accent-green), #00ff94); + border-radius: 4px; + transition: width 0.3s ease-out; + } + .progress-status { font-size: 0.9em; color: var(--text-secondary); margin-bottom: 20px; min-height: 24px; } + .progress-result { display: none; } 
+ .progress-result.show { display: block; } + .progress-result p { font-size: 1em; color: var(--accent-green); margin-bottom: 20px; } + .progress-close-btn { + padding: 10px 24px; + background: var(--bg-card); + border: 1px solid var(--border); + color: var(--text-primary); + border-radius: 8px; + cursor: pointer; + font-size: 0.9em; + } + .progress-close-btn:hover { border-color: var(--accent-cyan); color: var(--accent-cyan); } + + /* Research Modal */ + .research-steps { text-align: left; margin: 20px 0; } + .research-step { padding: 12px 16px; background: var(--bg-card); border-radius: 8px; margin-bottom: 10px; font-size: 0.9em; color: var(--text-secondary); display: flex; align-items: center; gap: 10px; opacity: 0.5; transition: all 0.3s; } + .research-step.active { opacity: 1; color: var(--accent-cyan); background: rgba(0,212,255,0.1); } + .research-step.done { opacity: 1; color: var(--accent-green); } + .research-step .spinner { width: 16px; height: 16px; border: 2px solid var(--border); border-top-color: var(--accent-cyan); border-radius: 50%; animation: spin 1s linear infinite; display: none; } + .research-step.active .spinner { display: block; } + .research-summary { display: none; text-align: center; padding: 20px; } + .research-summary.show { display: block; } + .research-summary p { font-size: 1em; color: var(--text-secondary); margin-bottom: 16px; } + .research-link { color: var(--accent-cyan); text-decoration: underline; cursor: pointer; } + + @keyframes spin { to { transform: rotate(360deg); } } + + /* Apply Modal Checklist */ + .apply-checklist { max-height: 300px; overflow-y: auto; margin: 16px 0; } + .apply-item { + display: flex; + align-items: center; + gap: 12px; + padding: 12px 14px; + background: var(--bg-card); + border-radius: 8px; + margin-bottom: 8px; + transition: all 0.2s; + } + .apply-item:hover { background: var(--bg-card-hover); } + .apply-item input { width: 18px; height: 18px; accent-color: var(--accent-cyan); } + 
.apply-item-content { flex: 1; } + .apply-item-agent { font-weight: 600; font-size: 0.95em; } + .apply-item-models { display: flex; align-items: center; gap: 8px; font-family: 'JetBrains Mono', monospace; font-size: 0.8em; margin-top: 4px; } + .apply-item-from { text-decoration: line-through; color: #ff6b81; } + .apply-item-arrow { color: var(--accent-cyan); } + .apply-item-to { color: var(--accent-green); } + .apply-item-impact { font-size: 0.7em; padding: 2px 8px; border-radius: 4px; text-transform: uppercase; } + .apply-item-impact.critical { background: rgba(255,71,87,0.2); color: #ff6b81; } + .apply-item-impact.high { background: rgba(255,159,67,0.2); color: #ffc048; } + .apply-item-impact.medium { background: rgba(59,130,246,0.2); color: #60a5fa; } + .apply-item-impact.low { background: rgba(0,255,148,0.15); color: #4ade80; } + .apply-modal-actions { display: flex; justify-content: flex-end; gap: 10px; margin-top: 16px; } + .apply-btn { padding: 10px 20px; border-radius: 8px; font-size: 0.9em; cursor: pointer; transition: all 0.25s; } + .apply-btn.apply { background: linear-gradient(135deg, rgba(0,212,255,0.15), rgba(0,255,148,0.1)); border: 1px solid var(--accent-cyan); color: var(--accent-cyan); } + .apply-btn.apply:hover { box-shadow: 0 0 20px var(--glow-cyan); } + @media (max-width: 768px) { .header h1 { font-size: 1.5em; } .tabs { flex-wrap: wrap; } .agents-grid { grid-template-columns: 1fr; } .stats-row { grid-template-columns: repeat(2, 1fr); } + .rec-metrics { grid-template-columns: repeat(2, 1fr); } + .swap-vis { flex-direction: column; } + .swap-arrow { transform: rotate(90deg); } } @@ -578,7 +757,8 @@ - + + @@ -633,21 +813,67 @@
- + +
- -
-
-

Agent × Model Matrix

- - - -
+ +
+
+
Agent × Model Compatibility Heatmap
+
Weighted score = benchmark × instruction-following multiplier · ★ = best fit · outlined = current · click for details
+
+
+
+
+ 100806040200 +
+
+ ↑ Ideal Match + Mismatch ↓ +
+
+
+
+ + +
+
+ + +
+
Historical System Score
+
Average composite score across all agents over time
+ + +
+ + +
+ +
+
Model Distribution
+
Current models across all agents
+ + +
+ + +
+
Migration Impact
+
Before/after fit scores when switching models - green = improvement, red = regression
+ + +
@@ -669,11 +895,124 @@
+ + + + +
+
+
Applying Fixes...
+
+
+
+
Preparing...
+
+

+ +
+
+
+ + + + + +
+ + + +