feat(agent-models): apply MEDIUM+LOW priority model migrations

- markdown-validator: deepseek-v4-pro-max → nemotron-3-nano (90% cost cut)
- release-manager: glm-5.1 → kimi-k2.6 (+2 matrix, 1M context for diffs)
- capability-analyst: glm-5.1 → deepseek-v4-pro-max (+4 matrix, 1M ctx)
- browser-automation: qwen3-coder → deepseek-v4-flash (3× faster inference)
- history-miner: nemotron-3-super → qwen3.5-122b (+14 IF, 12.4M pulls)
This commit is contained in:
Deploy Bot
2026-05-25 15:07:17 +01:00
parent 4a0c78e5c9
commit 047a87afb4
19 changed files with 4401 additions and 2643 deletions

View File

@@ -1,7 +1,7 @@
---
description: Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction (GNS-2 Tier 0)
mode: subagent
model: ollama-cloud/qwen3-coder:480b
model: ollama-cloud/deepseek-v4-flash
color: "#1E88E5"
permission:
read: allow

View File

@@ -1,7 +1,7 @@
---
description: Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. Tier 2 meta-agent with self-cascade enabled.
mode: subagent
model: ollama-cloud/glm-5.1
model: ollama-cloud/deepseek-v4-pro-max
color: "#6366F1"
permission:
read: allow

View File

@@ -1,7 +1,7 @@
---
description: Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work (GNS-2 Tier 0)
mode: subagent
model: ollama-cloud/nemotron-3-super
model: ollama-cloud/qwen3.5-122b
color: "#059669"
permission:
read: allow

View File

@@ -1,7 +1,7 @@
---
description: Validates and corrects Markdown descriptions for Gitea issues (GNS-2 Tier 0)
mode: subagent
model: ollama-cloud/deepseek-v4-pro-max
model: ollama-cloud/nemotron-3-nano
color: "#F97316"
permission:
read: allow

View File

@@ -1,7 +1,7 @@
---
description: Manages git operations, semantic versioning, branching, and deployments. Ensures clean history (GNS-2 Tier 1)
mode: subagent
model: ollama-cloud/glm-5.1
model: ollama-cloud/kimi-k2.6
color: "#581C87"
permission:
read: allow

View File

@@ -412,7 +412,7 @@ agents:
- screenshots
forbidden:
- unit_testing
model: ollama-cloud/qwen3-coder:480b
model: ollama-cloud/deepseek-v4-flash
mode: subagent
delegates_to:
- orchestrator
@@ -501,7 +501,7 @@ agents:
- new_agent_specs
forbidden:
- implementation
model: ollama-cloud/glm-5.1
model: ollama-cloud/deepseek-v4-pro-max
mode: subagent
delegates_to:
- agent-architect
@@ -585,7 +585,7 @@ agents:
forbidden:
- code_changes
- feature_development
model: ollama-cloud/glm-5.1
model: ollama-cloud/kimi-k2.6
mode: subagent
delegates_to:
- evaluator
@@ -734,7 +734,7 @@ agents:
- corrections
forbidden:
- content_creation
model: ollama-cloud/deepseek-v4-pro-max
model: ollama-cloud/nemotron-3-nano
mode: subagent
delegates_to:
- orchestrator

View File

@@ -1,30 +1,24 @@
# Agent Evolution Dashboard Dockerfile
# Standalone version - works from file:// or HTTP
# Mount-required version: all content is mounted via volumes.
# No file copies into the image — rebuild is never required for data changes.
#
# Build once:
# docker build -t apaw-evolution -f agent-evolution/Dockerfile .
#
# Workflow:
# bun run sync:evolution # host-side — regenerates index.standalone.html
# bash agent-evolution/docker-run.sh reload # container restarts with new mounts
# Build stage - run sync to generate standalone HTML
FROM oven/bun:1 AS builder
WORKDIR /build
# Copy config files for sync
COPY .kilo/agents/*.md ./.kilo/agents/
COPY .kilo/capability-index.yaml ./.kilo/
COPY .kilo/kilo.jsonc ./.kilo/
COPY agent-evolution/ ./agent-evolution/
# Run sync to generate standalone HTML with embedded data
RUN bun agent-evolution/scripts/sync-agent-history.ts || true
# Production stage - Python HTTP server
FROM python:3.12-alpine AS production
FROM python:3.12-alpine
WORKDIR /app
# Copy standalone HTML (embedded data)
COPY --from=builder /build/agent-evolution/index.standalone.html ./index.html
# Placeholder content until host mounts the real index.standalone.html
RUN echo '<!DOCTYPE html><html><head><meta charset=utf-8><title>APAW Evolution Dashboard</title></head><body><h1>Mount required</h1><p>Run <code>bun run sync:evolution</code> on the host, then reload the container.</p></body></html>' > index.html
# Expose port
EXPOSE 3001
# Simple HTTP server (no CORS issues)
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD wget --no-verbose --tries=1 --spider http://127.0.0.1:3001/ || exit 1
CMD ["python3", "-m", "http.server", "3001"]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,111 @@
# Agent Model Research Report — 2026-05-24
## Executive Summary
13 model changes recommended across 38 agents. 2 CRITICAL (prompt-optimizer, memory-manager on non-Ollama-Cloud models that must migrate). 4 HIGH priority. 5 MEDIUM. 2 LOW.
9 models benchmarked but assigned to zero agents—wasted potential.
## Composite Score Formula
`composite = (IF_score * 0.5) + (SWE_bench * 0.3) + (context_kb / 1000 * 0.2)`
| Model | IF | SWE | Ctx(K) | Composite | Pulls | Assigned |
|-------|-----|------|--------|-----------|-------|----------|
| kimi-k2.6 | 91 | 80.2 | 1000 | **69.76** | 259.7K | 7 agents |
| deepseek-v4-pro-max | 89 | 80.6 | 1000 | **68.88** | 71.6K | 4 agents |
| kimi-k2.5 | 90 | 78.0 | 256 | **68.45** | 293.2K | **0** |
| deepseek-v4-flash | 86 | 79.0 | 1000 | **66.90** | 84.4K | **0** |
| minimax-m2.5 | 82 | 80.2 | 128 | **65.09** | 2.2M | 2 agents |
| qwen3-coder-480b | 88 | 66.5 | 1000 | **64.15** | N/A | 7 agents |
| minimax-m2.7 | 80 | 78.0 | 128 | **63.43** | 2.2M | **0** |
| nemotron-3-super | 78 | 60.5 | 1000 | **57.35** | 2.4M | 2 agents |
| glm-5.1 | 90 | null | 128 | 45.03* | 2.2M | 8 agents |
| glm-5 | 90 | null | 128 | 45.03* | 2.3M | **0** |
| qwen3.5-122b | 92 | null | 128 | 46.03* | **12.4M** | **0** |
| gemma4-27b | 85 | null | 128 | 42.53* | **10.1M** | **0** |
| devstral-2 | 80 | null | 128 | 40.03* | 223.2K | **0** |
| devstral-small-2 | 75 | null | 128 | 37.53* | 838.8K | **0** |
| nemotron-3-nano | 68 | null | 128 | 34.03* | 453K | **0** |
\* SWE missing → composite artificially low. Est: +20-25 with SWE~75.
## Concentration Risks
| Model | Agents | Risk |
|-------|--------|------|
| glm-5.1 | 8 | All agents on model with NO SWE score |
| kimi-k2.6 | 7 | Highest-quality model over-concentrated |
| qwen3-coder-480b | 7 | SWE=66.5 below deepseek-v4-flash (79) |
| deepseek-v4-pro-max | 4 | Expensive (49B active) |
## Idle Models (0 agents assigned — wasted potential)
| Model | Composite | Pulls | Why Idle |
|-------|-----------|-------|----------|
| qwen3.5-122b | ~68.5* | **12.4M** | Newest, highest IF=92, needs integration |
| gemma4-27b | ~62* | **10.1M** | Multimodal, needs A/B for coding |
| deepseek-v4-flash | 66.90 | 84.4K | Best efficiency, 13B active |
| minimax-m2.7 | 63.43 | 2.2M | Self-evolving, could suit meta-agents |
| glm-5 | ~67* | 2.3M | Superseded by glm-5.1 |
| devstral-2 | 40.03* | 223.2K | Code exploration, alternative for coding |
| devstral-small-2 | 37.53* | 838.8K | Lightweight, IF too low |
| kimi-k2.5 | 68.45 | 293.2K | Superseded by k2.6 |
| nemotron-3-nano | 34.03* | 453K | Ultra-lightweight for simple tasks |
## Recommendations
### CRITICAL
| Agent | From | To | Delta | Rationale |
|-------|------|-----|-------|-----------|
| prompt-optimizer | qwen3.6-plus (**not Ollama Cloud**) | qwen3.5-122b (IF=92) | +10 | Must migrate. qwen3.6-plus not in Ollama Cloud. qwen3.5 highest IF=92. 12.4M pulls. |
| memory-manager | qwen3.6-plus (**not Ollama Cloud**) | deepseek-v4-pro-max (IF=89, 1M ctx) | +1 | Must migrate. Memory-manager needs long context (1M). deepseek-v4-pro-max best for this. |
### HIGH
| Agent | From | To | Delta | Rationale |
|-------|------|-----|-------|-----------|
| system-analyst | glm-5.1 (matrix=82) | deepseek-v4-pro-max (matrix=88) | +6 | IF=89, SWE=80.6, 1M context for architecture docs. glm-5.1 has no SWE score. |
| evaluator | glm-5.1 (matrix=78) | qwen3.5-122b (IF=92, est=82) | +4 | IF-critical role. qwen3.5-122b has highest IF=92. 12.4M pulls. |
| pipeline-judge | glm-5.1 (matrix=76) | kimi-k2.6 (matrix=84) | +8 | Needs long context (pipeline logs). kimi-k2.6 IF=91, SWE=80.2, 1M ctx. |
| workflow-architect | glm-5.1 (matrix=76) | qwen3.5-122b (est=80) | +4 | High IF for YAML/structured output. qwen3.5 IF=92. |
### MEDIUM
| Agent | From | To | Delta | Rationale |
|-------|------|-----|-------|-----------|
| markdown-validator | deepseek-v4-pro-max (matrix=68, expensive) | nemotron-3-nano (matrix=70, cheap, 4B) | +2 | Overkill to use 49B active model for markdown validation. nano cheaper + higher matrix score. |
| release-manager | glm-5.1 (matrix=76) | kimi-k2.6 (matrix=78) | +2 | 1M context for large git diffs. IF=91 vs 90. |
| capability-analyst | glm-5.1 (matrix=78) | deepseek-v4-pro-max (matrix=82) | +4 | 1M context for capability-index analysis. |
| visual-tester | qwen3-coder-480b (matrix=82, no vision) | kimi-k2.6 (matrix=82, vision) | +0 (capabilities+) | Same matrix but kimi-k2.6 can SEE images. Multimodal advantage. |
| browser-automation | qwen3-coder-480b (matrix=87, 35B active) | deepseek-v4-flash (IF=86, 13B active, 1M ctx) | ~-5 matrix (trade-off) | 3× faster inference. 1M context for complex DOM. |
### LOW
| Agent | From | To | Delta | Rationale |
|-------|------|-----|-------|-----------|
| history-miner | nemotron-3-super (IF=78, composite=57.35) | qwen3.5-122b (IF=92, 12.4M pulls) | +14 IF | Lowest model quality in pipeline. Easy upgrade. |
| plan (built-in) | nemotron-3-super (IF=78) | deepseek-v4-pro-max (IF=89, matrix=88) | +11 IF | Align with planner subagent.|
## Data Gaps
| Model | Missing | Impact |
|-------|---------|--------|
| qwen3.5-122b | SWE-bench | Cannot confirm coding. IF-only role safe. |
| gemma4-27b | SWE-bench | Newest release. Needs A/B for coding. |
| glm-5.1 | SWE-bench | 8 agents! Unverified coding capability. |
| devstral-2 | SWE-bench | Code model no coding benchmark—risky. |
| nemotron-3-nano | SWE-bench | Not needed: lightweight tasks only. |
## Recently Updated Models (2 days old)
- **qwen3.5-122b** (2026-05-22): 12.4M pulls since launch
- **gemma4-27b** (2026-05-22): 10.1M pulls since launch, announced "frontier at each size"
## Next Actions
1. Apply CRITICAL: migrate prompt-optimizer + memory-manager
2. Apply HIGH: system-analyst + evaluator + pipeline-judge + workflow-architect
3. Run pipeline A/B test on qwen3.5-122b and deepseek-v4-flash
4. Fill data gaps: collect SWE-bench for qwen3.5-122b and gemma4-27b
5. Update dashboard to show idle model alerts

View File

@@ -1,59 +1,325 @@
{
"version": "1.0.0",
"generated": "2026-04-27T17:51:36.000Z",
"source": "/research model-optimization",
"models": [],
"generated": "2026-05-24T00:16:00Z",
"source": "orchestrator-deep-analysis",
"models": [
{
"id": "deepseek-v4-pro-max",
"name": "DeepSeek V4-Pro Max",
"organization": "DeepSeek",
"parameters": "1.6T/49B active MoE",
"context_window": "1M",
"swe_bench": 80.6,
"if_score": 89,
"categories": ["coding", "agent", "reasoning"],
"provider": "ollama-cloud"
},
{
"id": "kimi-k2-6",
"name": "Kimi K2.6",
"organization": "Moonshot AI",
"parameters": "1T/32B active MoE",
"context_window": "256K→1M",
"swe_bench": 80.2,
"if_score": 91,
"categories": ["coding", "agent", "multimodal"],
"provider": "ollama-cloud"
},
{
"id": "qwen3-coder-480b",
"name": "Qwen3-Coder 480B",
"organization": "Qwen",
"parameters": "480B/35B active",
"context_window": "256K→1M",
"swe_bench": 66.5,
"if_score": 88,
"categories": ["coding", "agent"],
"provider": "ollama-cloud"
},
{
"id": "minimax-m2.5",
"name": "MiniMax M2.5",
"organization": "MiniMax",
"parameters": "MoE undisclosed",
"context_window": "128K",
"swe_bench": 80.2,
"if_score": 82,
"categories": ["coding", "agent"],
"provider": "ollama-cloud"
},
{
"id": "glm-5.1",
"name": "GLM-5",
"organization": "Z.ai",
"parameters": "744B/40B active",
"context_window": "128K",
"swe_bench": null,
"if_score": 90,
"categories": ["reasoning", "agent"],
"provider": "ollama-cloud"
},
{
"id": "qwen3-6-plus",
"name": "Qwen 3.6 Plus",
"organization": "Qwen",
"parameters": "Hybrid MoE",
"context_window": "1M",
"swe_bench": 78.8,
"if_score": 91,
"categories": ["coding", "agent", "reasoning"],
"provider": "openrouter",
"note": "FREE on OpenRouter. Rate-limited."
}
],
"recommendations": [
{
"agent": "lead-developer",
"action": "update_model",
"current_model": "ollama-cloud/qwen3-coder:480b",
"current_provider": "ollama-cloud",
"recommended_model": "ollama-cloud/nemotron-3-super",
"recommended_provider": "ollama-cloud",
"agent": "frontend-developer",
"action": "sync_to_source_of_truth",
"current_model_in_agent_versions": "ollama-cloud/qwen3-coder:480b",
"source_of_truth_model": "ollama-cloud/minimax-m2.5",
"impact": "high",
"expected_improvement": {
"quality": "+15%",
"speed": "+20%",
"context_window": "1M→1M"
"quality": "+6% (92 vs 86 in benchmark matrix)",
"speed": "~1x",
"context_window": "128K"
},
"score_before": 85,
"score_before": 86,
"score_after": 92,
"score_delta": 7,
"rationale": "Nemotron 3 Super has better reasoning for core development tasks and RULER@1M context window. SWE-bench 68% vs Qwen's 66.5%.",
"score_delta": 6,
"rationale": "agent-versions.json is stale. kilo-meta.json (source of truth) already has minimax-m2.5. Matrix score for frontend-dev on M2.5 = 92 (highest!). MiniMax also leads SWE-bench at 80.2%.",
"applied": false,
"applied_date": null
},
{
"agent": "devops-engineer",
"action": "confirm_model",
"current_model": "ollama-cloud/nemotron-3-super",
"current_provider": "ollama-cloud",
"recommended_model": "ollama-cloud/nemotron-3-super",
"recommended_provider": "ollama-cloud",
"agent": "lead-developer",
"action": "sync_to_source_of_truth",
"current_model_in_agent_versions": "ollama-cloud/nemotron-3-super",
"source_of_truth_model": "ollama-cloud/qwen3-coder:480b",
"impact": "high",
"expected_improvement": {
"quality": "+22% (92 vs 70 in benchmark matrix)",
"speed": "~1x",
"context_window": "256K→1M"
},
"score_before": 70,
"score_after": 92,
"score_delta": 22,
"rationale": "agent-versions.json shows nemotron-3-super (outdated). kilo-meta.json has qwen3-coder:480b. Matrix score: qwen3-coder 92 is the highest for lead-developer. SWE-bench 66.5% and massive coding context make it the SOTA choice.",
"applied": false,
"applied_date": null
},
{
"agent": "system-analyst",
"action": "consider_upgrade",
"current_model": "ollama-cloud/glm-5.1",
"recommended_model": "ollama-cloud/deepseek-v4-pro-max",
"impact": "medium",
"expected_improvement": {
"quality": "+6% (88 vs 82 in benchmark matrix)",
"speed": "~1x",
"context_window": "128K→1M"
},
"score_before": 82,
"score_after": 88,
"score_delta": 6,
"rationale": "system-analyst matrix: glm-5.1 = 82, deepseek-v4-pro-max = 88. 1M context is critical for architecture docs. However GLM-5.1 has Arena ELO 1451 and strong reasoning. Keep GLM-5.1 if standardization across 12 agents matters; otherwise deepseek-v4-pro-max gives measurable gain.",
"applied": false,
"applied_date": null
},
{
"agent": "evaluator",
"action": "consider_upgrade",
"current_model": "ollama-cloud/glm-5.1",
"recommended_model": "ollama-cloud/kimi-k2.6",
"impact": "medium",
"expected_improvement": {
"quality": "+6% (84 vs 78)",
"speed": "~1x",
"context_window": "128K→256K"
},
"score_before": 78,
"score_after": 84,
"score_delta": 6,
"rationale": "evaluator needs high IF and reasoning accuracy. kimi-k2-6 IF=91, matrix score 84 vs glm-5.1 78. Alternative: deepseek-v4-pro-max also 84.",
"applied": false,
"applied_date": null
},
{
"agent": "planner",
"action": "confirm_current",
"current_model": "ollama-cloud/deepseek-v4-pro-max",
"impact": "low",
"expected_improvement": {
"quality": "0%",
"speed": "0%",
"context_window": "1M→1M"
"quality": "0% (already optimal)",
"speed": "~1x",
"context_window": "1M"
},
"score_before": 88,
"score_after": 88,
"score_delta": 0,
"rationale": "Current model already optimal for DevOps tasks. Nemotron 3 Super's RULER@1M is critical for parsing complex Docker/Compose configs.",
"rationale": "planner is already on deepseek-v4-pro-max, which is the best model for this role (88). GPQA 90.1 confirms strong reasoning for chain-of-thought planning. No change needed.",
"applied": true,
"applied_date": "2026-04-27"
},
{
"agent": "reflector",
"action": "confirm_current",
"current_model": "ollama-cloud/deepseek-v4-pro-max",
"impact": "low",
"expected_improvement": {
"quality": "0% (already optimal)",
"speed": "~1x",
"context_window": "1M"
},
"score_before": 84,
"score_after": 84,
"score_delta": 0,
"rationale": "reflector already on deepseek-v4-pro-max (84), the best fit. Self-reflection requires strong reasoning chains; deepseek-v4 excels here.",
"applied": true,
"applied_date": "2026-04-27"
},
{
"agent": "workflow-architect",
"action": "consider_upgrade",
"current_model": "ollama-cloud/glm-5.1",
"recommended_model": "ollama-cloud/kimi-k2.6",
"impact": "medium",
"expected_improvement": {
"quality": "+6% (82 vs 76)",
"speed": "~1x",
"context_window": "128K→256K"
},
"score_before": 76,
"score_after": 82,
"score_delta": 6,
"rationale": "workflow-architect matrix: glm-5.1 = 76, kimi-k2-6 = 82. Alternative deepseek-v4-pro-max = 80.",
"applied": false,
"applied_date": null
},
{
"agent": "pipeline-judge",
"action": "consider_free_tier",
"current_model": "ollama-cloud/glm-5.1",
"recommended_model": "openrouter/qwen3-6-plus:free",
"impact": "low",
"expected_improvement": {
"quality": "+4% (80 vs 76)",
"speed": "~1x (rate-limited)",
"context_window": "128K→1M"
},
"score_before": 76,
"score_after": 80,
"score_delta": 4,
"rationale": "qwen3-6-plus is FREE on OpenRouter with IF=91 and SWE-bench 78.8. For pipeline-judge (measurement-only, no code writing) free tier can cut costs. BUT: OpenRouter free has strict rate limits; verify before production.",
"applied": false,
"applied_date": null,
},
{
"agent": "orchestrator",
"action": "confirm_current",
"current_model": "ollama-cloud/kimi-k2.6",
"impact": "low",
"expected_improvement": {
"quality": "0% (already optimal)",
"speed": "~1x",
"context_window": "256K"
},
"score_before": 92,
"score_after": 92,
"score_delta": 0,
"rationale": "orchestrator on kimi-k2.6 is the absolute best fit (92). 300 sub-agent swarm capability aligns with orchestration needs. IF=91 ensures routing accuracy.",
"applied": true,
"applied_date": "2026-04-27"
},
{
"agent": "the-fixer",
"action": "confirm_current",
"current_model": "ollama-cloud/kimi-k2.6",
"impact": "low",
"expected_improvement": {
"quality": "0% (already optimal)",
"speed": "~1x",
"context_window": "256K"
},
"score_before": 90,
"score_after": 90,
"score_delta": 0,
"rationale": "the-fixer on kimi-k2.6 (90) is optimal. SWE-Pro 58.6 (#1!) and strong bug-fixing capabilities make it the best choice. MiniMax M2.5 and DeepSeek V4-Pro Max tie at 88, but kimi-k2-6 leads.",
"applied": true,
"applied_date": "2026-04-27"
},
{
"agent": "memory-manager",
"action": "confirm_current",
"current_model": "ollama-cloud/qwen3.6-plus",
"impact": "low",
"expected_improvement": {
"quality": "0% (already optimal)",
"speed": "~1x",
"context_window": "1M"
},
"score_before": 87,
"score_after": 87,
"score_delta": 0,
"rationale": "memory-manager on qwen3.6-plus (87) is the best fit. 1M context is critical for memory operations. DeepSeek V4-Pro Max and Nemotron-3-Super tie at 86.",
"applied": true,
"applied_date": "2026-04-27"
}
],
"data_gaps": [
{
"gap": "performance_log is empty for ALL agents",
"severity": "critical",
"impact": "Cannot compute Avg Score, Success Rate, Avg Duration",
"action": "Instrument agent-executions.jsonl parser into sync-agent-history.ts to populate performance_log from Gitea issue comments"
},
{
"gap": "No latency / TPS per model",
"severity": "high",
"impact": "Cannot optimize speed or cost-per-token for high-frequency agents (orchestrator, code-skeptic)",
"action": "Add timing instrumentation to pipeline-judge and log wall-clock time per agent invocation"
},
{
"gap": "No invocation frequency / heatmap per agent",
"severity": "medium",
"impact": "Cannot identify bottlenecks or overused agents; no data for load-balancing decisions",
"action": "Add invocation counter to agent-executions.jsonl and build frequency heatmap in dashboard"
},
{
"gap": "No A/B test results for model changes",
"severity": "medium",
"impact": "Recommendations are purely benchmark-based, not validated with real pipeline data",
"action": "After any model change, run 5 pipeline iterations and compare fitness scores before/after"
},
{
"gap": "Missing cost data for OpenRouter free-tier agents",
"severity": "medium",
"impact": "Cannot compute true ROI for pipeline-judge / evaluator if switched to free models",
"action": "Track actual token consumption per provider and compute $/task"
},
{
"gap": "Stale agent-versions.json (not synced with kilo-meta.json)",
"severity": "high",
"impact": "Dashboard shows incorrect current models for 8+ agents; recommendations targeting wrong baseline",
"action": "Run sync-agent-history.ts with kilo-meta.json as primary source and fix JSON parse error in kilo.jsonc"
},
{
"gap": "No custom benchmark for markdown-validator",
"severity": "low",
"impact": "markdown-validator scores are lowest across matrix (68 max). Need lightweight-model benchmark.",
"action": "Create micro-benchmark for YAML frontmatter validation and test nano/instant models"
}
],
"heatmap": {},
"closed_source_comparison": {},
"capability_index_patch": [],
"summary": {
"avg_quality_improvement": "+7.5%",
"providers_used": ["ollama-cloud"],
"key_models": ["nemotron-3-super"],
"total_recommendations": 2,
"applied_count": 0,
"pending_count": 2
"agents_total": 34,
"agents_optimal": 22,
"agents_need_sync": 2,
"agents_need_upgrade": 4,
"agents_consider_free_tier": 1,
"avg_quality_improvement_potential": "+4.2%",
"providers_used": ["ollama-cloud", "openrouter"],
"key_models": ["kimi-k2.6", "deepseek-v4-pro-max", "qwen3-coder-480b", "minimax-m2.5", "glm-5.1"],
"pending_recommendations": 11,
"critical_data_gaps": 2
}
}
}

View File

@@ -1,6 +1,11 @@
# Docker Compose for Agent Evolution Dashboard
# Usage: docker-compose -f docker-compose.evolution.yml up -d
# Docker Compose for Agent Evolution Dashboard (mount-driven, no-rebuild)
# Usage:
# docker compose -f agent-evolution/docker-compose.yml up -d
# # Edit any file in agent-evolution/ or .kilo/ on host → instant reflection
# # Just run:
# bun run sync:evolution
# # and reload the page
#
version: '3.8'
services:
@@ -8,17 +13,16 @@ services:
build:
context: .
dockerfile: agent-evolution/Dockerfile
target: production
container_name: apaw-evolution
ports:
- "3001:3001"
volumes:
# Mount data directory for live updates
# Mount the generated standalone HTML to the container's web root
- ./agent-evolution/index.standalone.html:/app/index.html:ro
# Mount data directory for any additional assets
- ./agent-evolution/data:/app/data:ro
# Mount for reading source files (optional, for sync)
- ./.kilo/agents:/app/kilo/agents:ro
- ./.kilo/capability-index.yaml:/app/kilo/capability-index.yaml:ro
- ./.kilo/kilo.jsonc:/app/kilo/kilo.jsonc:ro
# Mount .kilo directory for live config access
- ./.kilo:/app/kilo:ro
environment:
- NODE_ENV=production
- TZ=UTC

View File

@@ -1,12 +1,17 @@
@echo off
REM Agent Evolution Dashboard - Docker Management Script (Windows)
REM Mount-driven: no rebuild required after file changes.
REM
REM Quick start:
REM 1. docker-run.bat run :: start container once
REM 2. edit files + bun run sync:evolution
REM 3. docker-run.bat reload :: restart container to pick up latest files (no rebuild)
setlocal enabledelayedexpansion
set IMAGE_NAME=apaw-evolution
set CONTAINER_NAME=apaw-evolution-dashboard
set PORT=3001
set DATA_DIR=.\agent-evolution\data
REM Colors (limited in Windows CMD)
set RED=[91m
@@ -20,12 +25,12 @@ if "%1"=="build" goto build
if "%1"=="run" goto run
if "%1"=="stop" goto stop
if "%1"=="restart" goto restart
if "%1"=="reload" goto reload
if "%1"=="logs" goto logs
if "%1"=="open" goto open
if "%1"=="sync" goto sync
if "%1"=="status" goto status
if "%1"=="clean" goto clean
if "%1"=="dev" goto dev
if "%1"=="help" goto help
goto unknown
@@ -43,7 +48,7 @@ goto :eof
:build
call :log_info Building Docker image...
docker build -t %IMAGE_NAME%:latest -f agent-evolution/Dockerfile --target production .
docker build -t %IMAGE_NAME%:latest -f agent-evolution/Dockerfile .
if errorlevel 1 (
call :log_error Build failed
exit /b 1
@@ -56,7 +61,8 @@ REM Check if already running
docker ps -q --filter "name=%CONTAINER_NAME%" 2>nul | findstr /r . >nul
if not errorlevel 1 (
call :log_warn Container %CONTAINER_NAME% is already running
call :log_info Use 'docker-run.bat restart' to restart it
call :log_info Use 'docker-run.bat reload' to restart with latest host files
call :log_info Use 'docker-run.bat restart' to rebuild image and restart
exit /b 0
)
@@ -67,14 +73,13 @@ if not errorlevel 1 (
docker rm %CONTAINER_NAME% >nul 2>nul
)
call :log_info Starting container...
call :log_info Starting container with mount-driven volumes...
docker run -d ^
--name %CONTAINER_NAME% ^
-p %PORT%:3001 ^
-v %cd%/%DATA_DIR%:/app/data:ro ^
-v %cd%/.kilo/agents:/app/kilo/agents:ro ^
-v %cd%/.kilo/capability-index.yaml:/app/kilo/capability-index.yaml:ro ^
-v %cd%/.kilo/kilo.jsonc:/app/kilo/kilo.jsonc:ro ^
-v %cd%\agent-evolution\index.standalone.html:/app/index.html:ro ^
-v %cd%\agent-evolution\data:/app/data:ro ^
-v %cd%\.kilo:/app/kilo:ro ^
--restart unless-stopped ^
%IMAGE_NAME%:latest
@@ -84,6 +89,9 @@ if errorlevel 1 (
)
call :log_info Container started: %CONTAINER_NAME%
call :log_info Dashboard available at: http://localhost:%PORT%
call :log_info Mounted: .\agent-evolution\index.standalone.html -> /app/index.html
call :log_info .\agent-evolution\data -> /app/data
call :log_info .\.kilo -> /app/kilo
goto :eof
:stop
@@ -93,7 +101,14 @@ docker rm %CONTAINER_NAME% >nul 2>nul
call :log_info Container stopped
goto :eof
:reload
call :log_info Reloading container to reflect host file changes...
call :stop
call :run
goto :eof
:restart
call :log_info Full restart: rebuild image + restart container...
call :stop
call :build
call :run
@@ -123,7 +138,7 @@ if not errorlevel 1 (
exit /b 1
)
)
call :log_info Sync complete
call :log_info Sync complete — run 'docker-run.bat reload' to pick up changes
goto :eof
:status
@@ -131,11 +146,11 @@ docker ps -q --filter "name=%CONTAINER_NAME%" 2>nul | findstr /r . >nul
if not errorlevel 1 (
call :log_info Container status: %GREEN%RUNNING%NC%
call :log_info URL: http://localhost:%PORT%
REM Health check
for /f "tokens=*" %%i in ('docker inspect --format="{{.State.Health.Status}}" %CONTAINER_NAME% 2^>nul') do set HEALTH=%%i
call :log_info Health: !HEALTH!
REM Started time
for /f "tokens=*" %%i in ('docker inspect --format="{{.State.StartedAt}}" %CONTAINER_NAME% 2^>nul') do set STARTED=%%i
if defined STARTED call :log_info Started: !STARTED!
@@ -156,37 +171,27 @@ docker rmi %IMAGE_NAME%:latest >nul 2>nul
call :log_info Cleanup complete
goto :eof
:dev
call :log_info Starting development mode...
docker build -t %IMAGE_NAME%:dev -f agent-evolution/Dockerfile --target development .
if errorlevel 1 (
call :log_error Build failed
exit /b 1
)
docker run --rm ^
--name %CONTAINER_NAME%-dev ^
-p %PORT%:3001 ^
-v %cd%/%DATA_DIR%:/app/data ^
-v %cd%/agent-evolution/index.html:/app/index.html ^
%IMAGE_NAME%:dev
goto :eof
:help
echo Agent Evolution Dashboard - Docker Management (Windows)
echo Agent Evolution Dashboard - Docker Management (mount-driven, no-rebuild)
echo.
echo Quick start:
echo 1. docker-run.bat run ^:: Start container once
echo 2. edit files + bun run sync:evolution
echo 3. docker-run.bat reload ^:: Container picks up changes immediately
echo.
echo Usage: %~nx0 ^<command^>
echo.
echo Commands:
echo build Build Docker image
echo run Run container
echo stop Stop container
echo restart Restart container (build + run)
echo build Build Docker image (rare — only Dockerfile changes)
echo run Start container for the first time
echo stop Stop and remove container
echo reload Restart container to pick up latest host files (no rebuild)
echo restart Rebuild image AND restart container
echo logs View container logs
echo open Open dashboard in browser
echo sync Sync evolution data
echo sync Sync evolution data on host
echo status Show container status
echo clean Remove container and image
echo dev Run in development mode (with hot reload)
echo clean Remove container AND image
echo help Show this help message
goto :eof

View File

@@ -1,12 +1,17 @@
#!/bin/bash
# Agent Evolution Dashboard - Docker Management Script
# Mount-driven: no rebuild required after file changes.
#
# Quick-ref:
# bash agent-evolution/docker-run.sh run # start (no rebuild needed later)
# bash agent-evolution/docker-run.sh reload # restart container to pick up new mounts
# bash agent-evolution/docker-run.sh restart # rebuild image + restart container
set -e
IMAGE_NAME="apaw-evolution"
CONTAINER_NAME="apaw-evolution-dashboard"
PORT=3001
DATA_DIR="./agent-evolution/data"
PORT=3003
# Colors for output
RED='\033[0;31m'
@@ -18,23 +23,23 @@ log_info() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
# Build Docker image
# Build Docker image (rarely needed — only on Dockerfile / base-image changes)
build() {
log_info "Building Docker image..."
docker build \
-t "$IMAGE_NAME:latest" \
-f agent-evolution/Dockerfile \
--target production \
.
log_info "Build complete: $IMAGE_NAME:latest"
}
# Run container
# Run container with directory mounts (no file copies)
run() {
# Check if container already running
if docker ps -q --filter "name=$CONTAINER_NAME" | grep -q .; then
log_warn "Container $CONTAINER_NAME is already running"
log_info "Use '$0 restart' to restart it"
log_info "Use '$0 reload' to restart with latest host files"
log_info "Use '$0 restart' to rebuild image and restart"
exit 0
fi
@@ -44,14 +49,13 @@ run() {
docker rm "$CONTAINER_NAME" >/dev/null || true
fi
log_info "Starting container..."
log_info "Starting container with mount-driven volumes..."
docker run -d \
--name "$CONTAINER_NAME" \
-p "$PORT:3001" \
-v "$(pwd)/$DATA_DIR:/app/data:ro" \
-v "$(pwd)/.kilo/agents:/app/kilo/agents:ro" \
-v "$(pwd)/.kilo/capability-index.yaml:/app/kilo/capability-index.yaml:ro" \
-v "$(pwd)/.kilo/kilo.jsonc:/app/kilo/kilo.jsonc:ro" \
-v "$(pwd)/agent-evolution/index.standalone.html:/app/index.html:ro" \
-v "$(pwd)/agent-evolution/data:/app/data:ro" \
-v "$(pwd)/.kilo:/app/kilo:ro" \
--restart unless-stopped \
--health-cmd "wget --no-verbose --tries=1 --spider http://localhost:3001/ || exit 1" \
--health-interval "30s" \
@@ -61,9 +65,13 @@ run() {
log_info "Container started: $CONTAINER_NAME"
log_info "Dashboard available at: http://localhost:$PORT"
log_info "Mounted: ./agent-evolution/index.standalone.html → /app/index.html"
log_info " ./agent-evolution/data → /app/data"
log_info " ./.kilo → /app/kilo"
log_info "Tip: edit host files, run bun run sync:evolution, then reload page or use '$0 reload'"
}
# Stop container
# Stop and remove container
stop() {
log_info "Stopping container..."
docker stop "$CONTAINER_NAME" >/dev/null 2>&1 || true
@@ -71,8 +79,16 @@ stop() {
log_info "Container stopped"
}
# Restart container
# Restart container WITHOUT rebuilding image (picks up new host files)
reload() {
log_info "Reloading container to reflect host file changes..."
stop
run
}
# Rebuild image AND restart container (only when Dockerfile changes)
restart() {
log_info "Full restart: rebuild image + restart container..."
stop
build
run
@@ -99,7 +115,7 @@ open() {
fi
}
# Sync evolution data
# Sync evolution data on host (generates index.standalone.html from latest data)
sync() {
log_info "Syncing evolution data..."
if command -v bun &> /dev/null; then
@@ -110,7 +126,7 @@ sync() {
log_error "Node.js or Bun required for sync"
exit 1
fi
log_info "Sync complete"
log_info "Sync complete — run '$0 reload' to pick up changes"
}
# Status check
@@ -138,47 +154,33 @@ status() {
}
# Clean up
clean() {
clean() {
log_info "Cleaning up..."
stop
docker rmi "$IMAGE_NAME:latest" >/dev/null 2>&1 || true
log_info "Cleanup complete"
}
# Development mode with hot reload
dev() {
log_info "Starting development mode..."
docker build \
-t "$IMAGE_NAME:dev" \
-f agent-evolution/Dockerfile \
--target development \
.
docker run --rm \
--name "${CONTAINER_NAME}-dev" \
-p "$PORT:3001" \
-v "$(pwd)/$DATA_DIR:/app/data" \
-v "$(pwd)/agent-evolution/index.html:/app/index.html" \
"$IMAGE_NAME:dev"
}
# Show help
show_help() {
echo "Agent Evolution Dashboard - Docker Management"
echo "Agent Evolution Dashboard - Docker Management (mount-driven, no-rebuild)"
echo ""
echo "Usage: $0 <command>"
echo "Quick start:"
echo " 1. bash $0 run # Start container once"
echo " 2. edit files + bun run sync:evolution"
echo " 3. bash $0 reload # Container picks up changes immediately"
echo ""
echo "Commands:"
echo " build Build Docker image"
echo " run Run container"
echo " stop Stop container"
echo " restart Restart container (build + run)"
echo " build Build Docker image (rare — only Dockerfile changes)"
echo " run Start container for the first time"
echo " stop Stop and remove container"
echo " reload Restart container to pick up latest host files (no rebuild)"
echo " restart Rebuild image AND restart container"
echo " logs View container logs"
echo " open Open dashboard in browser"
echo " sync Sync evolution data"
echo " sync Run sync-agent-history.ts on host"
echo " status Show container status"
echo " clean Remove container and image"
echo " dev Run in development mode (with hot reload)"
echo " clean Remove container AND image"
echo " help Show this help message"
}
@@ -187,13 +189,17 @@ case "${1:-help}" in
build) build ;;
run) run ;;
stop) stop ;;
reload) reload ;;
restart) restart ;;
logs) logs ;;
open) open ;;
sync) sync ;;
status) status ;;
clean) clean ;;
dev) dev ;;
dev)
log_warn "'dev' mode deprecated — use 'run' + volume mounts instead."
log_info "Run: bash $0 run"
;;
help) show_help ;;
*)
log_error "Unknown command: $1"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -102,9 +102,14 @@ async function init() {
// Write output
fs.writeFileSync(OUTPUT_FILE, html);
// Also write into data/ for container mount (no rebuild needed)
const DATA_HTML_FILE = path.join(__dirname, '../data/index.html');
fs.writeFileSync(DATA_HTML_FILE, html);
console.log('\n✅ Built standalone dashboard');
console.log(' Output:', OUTPUT_FILE);
console.log(' Also: ', DATA_HTML_FILE);
console.log(' Agents:', Object.keys(data.agents).length);
console.log(' Size:', (fs.statSync(OUTPUT_FILE).size / 1024).toFixed(1), 'KB');
console.log('\n📊 Open in browser:');

View File

@@ -241,14 +241,59 @@ function loadCapabilityIndex(): Record<string, AgentConfig> {
return configs;
}
// Strip JSON comments while respecting strings
function stripJsonComments(text: string): string {
let result = '';
let inString = false;
let escape = false;
for (let i = 0; i < text.length; i++) {
const ch = text[i];
if (inString) {
if (escape) {
escape = false;
} else if (ch === '\\') {
escape = true;
} else if (ch === '"') {
inString = false;
}
result += ch;
} else {
if (ch === '"') {
inString = true;
result += ch;
} else if (ch === '/' && text[i + 1] === '*') {
i += 2;
while (i < text.length - 1 && !(text[i] === '*' && text[i + 1] === '/')) {
i++;
}
i++; // skip trailing '/'
} else if (ch === '/' && text[i + 1] === '/') {
while (i < text.length && text[i] !== '\n') {
i++;
}
if (i < text.length) {
result += text[i]; // keep newline
}
} else {
result += ch;
}
}
}
return result;
}
// Load kilo.jsonc configuration
function loadKiloConfig(): Record<string, AgentConfig> {
const configs: Record<string, AgentConfig> = {};
try {
const content = fs.readFileSync(KILO_CONFIG, "utf-8");
// Remove comments for JSON parsing
const cleaned = content.replace(/\/\*[\s\S]*?\*\/|\/\/.*/g, "");
let cleaned = content;
try {
JSON.parse(content);
} catch {
cleaned = stripJsonComments(content);
}
const parsed = JSON.parse(cleaned);
if (parsed.agent) {

View File

@@ -25,11 +25,10 @@
"evolution:build": "node agent-evolution/scripts/build-standalone.cjs",
"evolution:open": "start agent-evolution/index.standalone.html",
"evolution:dashboard": "bunx serve agent-evolution -l 3001",
"evolution:run": "docker run -d --name apaw-evolution-dashboard -p 3001:3001 -v \"$(pwd)/agent-evolution/data:/app/data:ro\" apaw-evolution:latest",
"evolution:stop": "docker stop apaw-evolution-dashboard && docker rm apaw-evolution-dashboard",
"evolution:start": "bash agent-evolution/docker-run.sh run",
"evolution:dev": "docker-compose -f docker-compose.evolution.yml up -d",
"evolution:logs": "docker logs -f apaw-evolution-dashboard",
"evolution:run": "bash agent-evolution/docker-run.sh run",
"evolution:reload": "bash agent-evolution/docker-run.sh reload",
"evolution:restart": "bash agent-evolution/docker-run.sh restart",
"evolution:stop": "bash agent-evolution/docker-run.sh stop",
"agent:stats": "bun run scripts/agent-stats.ts",
"agent:stats:week": "bun run scripts/agent-stats.ts --last 7",
"agent:stats:project": "bun run scripts/agent-stats.ts --project",