diff --git a/.gitignore b/.gitignore index 3eec336..10dcd50 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,18 @@ agent-evolution/archive/scripts/ agent-evolution/archive/reports/ agent-evolution/archive/data/ +# Python cache +__pycache__/ +*.pyc + +# Generated runtime data +agent-evolution/data/dashboard-data.json +agent-evolution/data/state.json +agent-evolution/data/model-benchmarks.json.bak +landing/api/state.json +landing/api/state.json.bak +landing/api/dashboard-data.json + # Architect generated maps (can be large, auto-indexed) # Note: .architect/ md and json files ARE tracked for team orientation # Only maps/ with file graphs can be very large diff --git a/.kilo/kilo.jsonc b/.kilo/kilo.jsonc index 8ec50d4..fb552e9 100644 --- a/.kilo/kilo.jsonc +++ b/.kilo/kilo.jsonc @@ -30,26 +30,26 @@ } }, "code": { - "model": "ollama-cloud/qwen3-coder:480b", + "model": "ollama-cloud/deepseek-v4-pro-max", "variant": "thinking", "description": "Primary code writer. Full tool access for development tasks.", "mode": "primary" }, "ask": { - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/kimi-k2.6", "variant": "instant", "description": "Read-only Q&A agent for codebase questions.", "mode": "primary" }, "plan": { - "model": "ollama-cloud/nemotron-3-super", + "model": "ollama-cloud/kimi-k2.6", "description": "Task planner. Creates detailed implementation plans.", "mode": "primary" }, "debug": { - "model": "ollama-cloud/glm-5.1", + "model": "ollama-cloud/deepseek-v4-pro-max", "variant": "thinking", - "description": "Bug diagnostics and troubleshooting. 
GLM-5.1 ★88, reasoning for deep debug.", + "description": "Bug diagnostics and troubleshooting.", "mode": "primary" } } diff --git a/agent-evolution/data/agent-versions.json b/agent-evolution/data/agent-versions.json index f645378..6438038 100644 --- a/agent-evolution/data/agent-versions.json +++ b/agent-evolution/data/agent-versions.json @@ -1,6 +1,6 @@ { "version": "1.0.0", - "lastUpdated": "2026-05-27T12:47:21.972Z", + "lastUpdated": "2026-05-27T13:10:49.174Z", "agents": { "lead-developer": { "current": { @@ -3876,7 +3876,7 @@ "total_agents": 38, "agents_with_history": 34, "pending_recommendations": 0, - "last_sync": "2026-05-27T12:47:21.974Z", + "last_sync": "2026-05-27T13:10:49.175Z", "sync_sources": [ "git", "capability-index.yaml", diff --git a/agent-evolution/data/real-fit-report.json b/agent-evolution/data/real-fit-report.json new file mode 100644 index 0000000..f57d4ae --- /dev/null +++ b/agent-evolution/data/real-fit-report.json @@ -0,0 +1,689 @@ +{ + "generated": "2026-05-27T18:36:13.173821+00:00", + "source": "real-fit-engine", + "total_evaluations": 102, + "agents": { + "agent-architect": { + "name": "agent-architect", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 + }, + "info": [ + "Creates, modifies, and reviews new agents, workflows, and skills based on capability gap analysis. Tier 2 meta-agent with self-cascade enabled.", + "meta", + "ollama-cloud/kimi-k2.6" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + }, + "architect-indexer": { + "name": "architect-indexer", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Indexes and maps project codebase architecture into .architect/ directory. Creates and maintains structured documentation of entities, APIs, DB schema, file graphs, and conventions. 
(GNS-2 Tier 0)", + "core", + "ollama-cloud/glm-5.1" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "backend-developer": { + "name": "backend-developer", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Backend specialist for Node.js, Express, APIs, and database integration (GNS-2 Tier 1)", + "core", + "ollama-cloud/qwen3-coder:480b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "browser-automation": { + "name": "browser-automation", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction (GNS-2 Tier 0)", + "testing", + "ollama-cloud/deepseek-v4-flash" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "capability-analyst": { + "name": "capability-analyst", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 + }, + "info": [ + "Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. Tier 2 meta-agent with self-cascade enabled.", + "meta", + "ollama-cloud/deepseek-v4-pro-max" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + }, + "code-skeptic": { + "name": "code-skeptic", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Adversarial code reviewer. Finds problems and issues. 
Does NOT suggest implementations (GNS-2 Tier 0)", + "quality", + "ollama-cloud/minimax-m2.5" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "devops-engineer": { + "name": "devops-engineer", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "DevOps specialist for Docker, Kubernetes, CI/CD pipeline automation, and infrastructure management (GNS-2 Tier 1)", + "core", + "ollama-cloud/kimi-k2.6" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "evaluator": { + "name": "evaluator", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Scores agent effectiveness after task completion for continuous improvement. Tier 2 meta-agent with self-cascade enabled.", + "meta", + "ollama-cloud/qwen3.5-122b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "flutter-developer": { + "name": "flutter-developer", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 + }, + "info": [ + "Flutter mobile specialist for cross-platform apps, state management, and UI components (GNS-2 Tier 1)", + "core", + "ollama-cloud/qwen3-coder:480b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + }, + "frontend-developer": { + "name": "frontend-developer", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Handles UI implementation with multimodal capabilities. 
Accepts visual references like screenshots and mockups (GNS-2 Tier 1)", + "core", + "ollama-cloud/minimax-m2.5" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "go-developer": { + "name": "go-developer", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Go backend specialist for Gin, Echo, APIs, and database integration (GNS-2 Tier 1)", + "core", + "ollama-cloud/deepseek-v4-pro-max" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "history-miner": { + "name": "history-miner", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work (GNS-2 Tier 0)", + "core", + "ollama-cloud/qwen3.5-122b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "incident-responder": { + "name": "incident-responder", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 + }, + "info": [ + "Server incident response and system hardening specialist. Handles live forensics, malware removal, persistence hunting, SSH-based server cleanup, and post-incident hardening. Works with any OS and panel.", + "core", + "ollama-cloud/kimi-k2.6" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + }, + "lead-developer": { + "name": "lead-developer", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 + }, + "info": [ + "Primary code writer for backend and core logic. 
Writes implementation to pass tests (GNS-2 Tier 1)", + "core", + "ollama-cloud/qwen3-coder:480b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + }, + "markdown-validator": { + "name": "markdown-validator", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Validates and corrects Markdown descriptions for Gitea issues (GNS-2 Tier 0)", + "meta", + "ollama-cloud/nemotron-3-nano" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "memory-manager": { + "name": "memory-manager", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 + }, + "info": [ + "Manages agent memory systems - short-term (context), long-term (vector store), and episodic (experiences) (GNS-2 Tier 0)", + "cognitive", + "ollama-cloud/deepseek-v4-pro-max" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + }, + "orchestrator": { + "name": "orchestrator", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 + }, + "info": [ + "Main dispatcher. Routes tasks between agents based on Issue status and manages the workflow state machine. IF:90 for optimal routing accuracy. (GNS-2 Tier 1)", + "meta", + "ollama-cloud/kimi-k2.6" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + }, + "performance-engineer": { + "name": "performance-engineer", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Reviews code for performance issues. 
Focuses on efficiency, N+1 queries, memory leaks, and algorithmic complexity (GNS-2 Tier 0)", + "quality", + "ollama-cloud/deepseek-v4-pro-max" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "php-developer": { + "name": "php-developer", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "PHP backend specialist for Laravel, Symfony, WordPress, and full-stack web applications (GNS-2 Tier 1)", + "core", + "ollama-cloud/qwen3-coder:480b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "pipeline-judge": { + "name": "pipeline-judge", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Automated pipeline judge. Evaluates workflow execution by running tests, measuring token cost and wall-clock time. Produces objective fitness scores. Never writes code - only measures and scores. (GNS-2 Tier 0)", + "meta", + "ollama-cloud/kimi-k2.6" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "planner": { + "name": "planner", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Advanced task planner using Chain of Thought, Tree of Thoughts, and Plan-Execute-Reflect (GNS-2 Tier 0)", + "cognitive", + "ollama-cloud/deepseek-v4-pro-max" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "product-owner": { + "name": "product-owner", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Manages issue checklists, status labels, tracks progress and coordinates with human users (GNS-2 Tier 1)", + "meta", + "ollama-cloud/glm-5.1" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "prompt-optimizer": { + "name": "prompt-optimizer", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 
+ }, + "info": [ + "Improves agent system prompts based on performance failures. Meta-learner for prompt optimization (GNS-2 Tier 1)", + "meta", + "ollama-cloud/qwen3.5-122b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + }, + "python-developer": { + "name": "python-developer", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Python backend specialist for Django, FastAPI, data science, and API development (GNS-2 Tier 1)", + "core", + "ollama-cloud/qwen3-coder:480b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "reflector": { + "name": "reflector", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Self-reflection agent using Reflexion pattern - learns from mistakes (GNS-2 Tier 0)", + "cognitive", + "ollama-cloud/deepseek-v4-pro-max" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "release-manager": { + "name": "release-manager", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Manages git operations, semantic versioning, branching, and deployments. Ensures clean history (GNS-2 Tier 1)", + "meta", + "ollama-cloud/kimi-k2.6" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "requirement-refiner": { + "name": "requirement-refiner", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Converts vague ideas and bug reports into strict User Stories with acceptance criteria checklists (GNS-2 Tier 1)", + "core", + "ollama-cloud/kimi-k2-thinking" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "sdet-engineer": { + "name": "sdet-engineer", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Writes tests following TDD methodology. 
Tests MUST fail initially (Red phase) (GNS-2 Tier 1)", + "core", + "ollama-cloud/qwen3-coder:480b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "security-auditor": { + "name": "security-auditor", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets (GNS-2 Tier 0)", + "quality", + "ollama-cloud/deepseek-v4-pro-max" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "system-analyst": { + "name": "system-analyst", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 + }, + "info": [ + "Designs technical specifications, data schemas, and API contracts before implementation (GNS-2 Tier 1)", + "core", + "ollama-cloud/deepseek-v4-pro-max" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + }, + "the-fixer": { + "name": "the-fixer", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Iteratively fixes bugs based on specific error reports and test failures (GNS-2 Tier 1)", + "quality", + "ollama-cloud/kimi-k2.6" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "visual-tester": { + "name": "visual-tester", + "evaluations": { + "deepseek-v4-pro-max": 50.0, + "kimi-k2.6": 50.0, + "qwen3-coder:480b": 50.0 + }, + "info": [ + "Visual regression testing agent that compares screenshots and detects UI differences using pixelmatch and image diff (GNS-2 Tier 0)", + "quality", + "ollama-cloud/qwen3-coder:480b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 50.0 + }, + "workflow-architect": { + "name": "workflow-architect", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 + }, + "info": [ + "Creates and maintains workflow definitions with complete architecture, Gitea integration, 
and quality gates (GNS-2 Tier 1)", + "meta", + "ollama-cloud/qwen3.5-122b" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + }, + "workflow-cross-checker": { + "name": "workflow-cross-checker", + "evaluations": { + "deepseek-v4-pro-max": 41.6, + "kimi-k2.6": 41.6, + "qwen3-coder:480b": 41.6 + }, + "info": [ + "Workflow cross-checker and process inspector. Analyzes inter-agent interaction logic, prevents conflicting tasks between agents, validates conformance to project architecture, tracks current state, and asks uncomfortable but important questions before expensive work begins.", + "meta", + "ollama-cloud/kimi-k2.6" + ], + "best_model": "deepseek-v4-pro-max", + "best_score": 41.6 + } + }, + "fit_scores": { + "agent-architect": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for agent-architect is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + }, + "architect-indexer": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for architect-indexer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "backend-developer": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for backend-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "browser-automation": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for browser-automation is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "capability-analyst": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for capability-analyst is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + }, + "code-skeptic": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for code-skeptic is deepseek-v4-pro-max with avg score 50.0. 
Strongest dimension: keyword_coverage." + }, + "devops-engineer": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for devops-engineer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "evaluator": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for evaluator is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "flutter-developer": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for flutter-developer is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + }, + "frontend-developer": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for frontend-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "go-developer": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for go-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "history-miner": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for history-miner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "incident-responder": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for incident-responder is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + }, + "lead-developer": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for lead-developer is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: code_presence." + }, + "markdown-validator": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for markdown-validator is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." 
+ }, + "memory-manager": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for memory-manager is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + }, + "orchestrator": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for orchestrator is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + }, + "performance-engineer": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for performance-engineer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "php-developer": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for php-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "pipeline-judge": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for pipeline-judge is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "planner": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for planner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "product-owner": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for product-owner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "prompt-optimizer": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for prompt-optimizer is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + }, + "python-developer": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for python-developer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." 
+ }, + "reflector": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for reflector is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "release-manager": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for release-manager is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "requirement-refiner": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for requirement-refiner is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "sdet-engineer": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for sdet-engineer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "security-auditor": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for security-auditor is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "system-analyst": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for system-analyst is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + }, + "the-fixer": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for the-fixer is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "visual-tester": { + "model": "deepseek-v4-pro-max", + "fit": 50.0, + "explanation": "Best model for visual-tester is deepseek-v4-pro-max with avg score 50.0. Strongest dimension: keyword_coverage." + }, + "workflow-architect": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for workflow-architect is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." 
+ }, + "workflow-cross-checker": { + "model": "deepseek-v4-pro-max", + "fit": 41.6, + "explanation": "Best model for workflow-cross-checker is deepseek-v4-pro-max with avg score 41.6. Strongest dimension: keyword_coverage." + } + } +} \ No newline at end of file diff --git a/agent-evolution/data/real-fit.db b/agent-evolution/data/real-fit.db new file mode 100644 index 0000000..6f19b54 Binary files /dev/null and b/agent-evolution/data/real-fit.db differ diff --git a/agent-evolution/index.standalone.html b/agent-evolution/index.standalone.html index 155b2bb..8de3e9b 100644 --- a/agent-evolution/index.standalone.html +++ b/agent-evolution/index.standalone.html @@ -739,6 +739,77 @@ .swap-vis { flex-direction: column; } .swap-arrow { transform: rotate(90deg); } } + + /* Analytics Hierarchy */ + .analytics-tree { font-size: 12.5px; } + .at-model { margin-bottom: 10px; border: 1px solid var(--border); border-radius: 10px; overflow: hidden; } + .at-model-header { + display: flex; align-items: center; gap: 8px; + padding: 8px 12px; background: var(--bg-panel); + font-weight: 600; cursor: pointer; user-select: none; font-size: 13px; + } + .at-model-header::before { content: '▸'; font-size: 10px; transition: transform .2s; color: var(--accent-cyan); } + .at-model.open .at-model-header::before { transform: rotate(90deg); } + .at-model-body { display: none; padding: 6px 10px 8px 22px; } + .at-model.open .at-model-body { display: block; } + .at-cat { margin-bottom: 4px; } + .at-cat-header { + display: flex; align-items: center; gap: 6px; + padding: 4px 8px; border-radius: 6px; cursor: pointer; + color: var(--text-secondary); font-size: 11.5px; + } + .at-cat-header:hover { background: var(--bg-card-hover); } + .at-cat-header::before { content: '▸'; font-size: 9px; transition: transform .2s; } + .at-cat.open .at-cat-header::before { transform: rotate(90deg); } + .at-cat-body { display: none; padding: 3px 0 2px 14px; } + .at-cat.open .at-cat-body { display: block; } + .at-agent { + 
display: flex; align-items: center; gap: 8px; + padding: 2px 8px; border-radius: 4px; font-size: 11.5px; color: var(--text-primary); + } + .at-agent-badge { + font-size: 10px; font-weight: 700; padding: 1px 6px; + border-radius: 10px; background: var(--bg-card-hover); color: var(--accent-green); + margin-left: auto; flex-shrink: 0; + } + + .analytics-bars { display: flex; flex-direction: column; gap: 10px; } + .ab-row { display: flex; align-items: center; gap: 12px; } + .ab-label { width: 80px; font-size: 12px; font-weight: 600; color: var(--text-primary); flex-shrink: 0; } + .ab-track { flex: 1; height: 18px; background: var(--bg-panel); border-radius: 9px; overflow: hidden; } + .ab-fill { height: 100%; border-radius: 9px; background: linear-gradient(90deg, var(--accent-cyan), var(--accent-green)); transition: width .6s ease; min-width: 3px; } + .ab-count { width: 28px; font-size: 12px; font-weight: 700; text-align: right; color: var(--text-secondary); } + + .analytics-heatmap { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(56px, 1fr)); + gap: 3px; + } + .ah-cell { + aspect-ratio: 1; border-radius: 6px; + display: flex; flex-direction: column; align-items: center; justify-content: center; + font-size: 9.5px; font-weight: 600; cursor: pointer; transition: transform .12s; position: relative; + } + .ah-cell:hover { transform: scale(1.07); z-index: 1; } + .ah-cell-name { font-size: 8.5px; font-weight: 500; overflow: hidden; text-overflow: ellipsis; white-space: nowrap; width: 90%; text-align: center; } + .ah-cell-score { font-size: 10.5px; margin-top: 1px; } + .ah-tip { + position: absolute; bottom: calc(100% + 4px); left: 50%; transform: translateX(-50%); + background: var(--bg-panel); border: 1px solid var(--border-bright); border-radius: 6px; + padding: 4px 8px; font-size: 10px; white-space: nowrap; pointer-events: none; + opacity: 0; transition: opacity .12s; z-index: 10; + } + .ah-cell:hover .ah-tip { opacity: 1; } + + .commands-matrix-table 
{ width: 100%; border-collapse: collapse; font-size: 12.5px; } + .commands-matrix-table thead th { text-align: left; padding: 8px 10px; font-weight: 600; color: var(--text-secondary); border-bottom: 1px solid var(--border); font-size: 10.5px; text-transform: uppercase; letter-spacing: .5px; } + .commands-matrix-table tbody td { padding: 6px 10px; border-bottom: 1px solid var(--border); color: var(--text-primary); } + .commands-matrix-table tbody tr:last-child td { border-bottom: none; } + .commands-matrix-table tbody tr:hover td { background: var(--bg-card-hover); } + .cm-score { font-size: 10px; font-weight: 700; padding: 1px 6px; border-radius: 8px; } + .cm-score.good { background: rgba(0,255,148,0.12); color: var(--accent-green); } + .cm-score.ok { background: rgba(250,204,21,0.12); color: var(--accent-yellow); } + .cm-score.warn{ background: rgba(255,71,87,0.12); color: var(--accent-red); } @@ -760,6 +831,7 @@ + @@ -829,6 +901,46 @@
+ +
+
+
+
+
Model Distribution
+
Агенты по LLM-моделям (иерархия: модель → категория)
+
+
+
+
Category Breakdown
+
Дистрибуция по категориям и ролям
+
+
+
+ +
+
+
Fit Score Distribution
+
Тепловая карта fit-score по агентам
+
+
+ +
+
Commands Matrix
+
Команды и их модели из реальных конфигов
+
+ + + + + + + +
КомандаМодельScoreОписание
+
+
+
+
+
@@ -1055,7 +1167,7 @@ const MODEL_BENCHMARKS = { // Default embedded data (minimal - updated by sync script) const EMBEDDED_DATA = { "version": "1.0.0", - "lastUpdated": "2026-05-27T12:47:21.972Z", + "lastUpdated": "2026-05-27T13:10:49.174Z", "agents": { "lead-developer": { "current": { @@ -4931,7 +5043,7 @@ const EMBEDDED_DATA = { "total_agents": 38, "agents_with_history": 34, "pending_recommendations": 0, - "last_sync": "2026-05-27T12:47:21.974Z", + "last_sync": "2026-05-27T13:10:49.175Z", "sync_sources": [ "git", "capability-index.yaml", @@ -4969,6 +5081,16 @@ async function init() { } try { + // Load real dashboard data FIRST (overrides stale agent-versions) + try { + const dashRes = await fetch('data/dashboard-data.json'); + if (dashRes.ok) { + window.dashboardData = await dashRes.json(); + // Sync agentData from dashboard data for all other tabs + syncAgentDataFromDashboard(window.dashboardData); + } + } catch (e) { console.warn('dashboard-data.json not loaded:', e.message); } + document.getElementById('lastSync').textContent = formatDate(agentData.lastUpdated); document.getElementById('agentCount').textContent = agentData.evolution_metrics.total_agents + ' agents'; document.getElementById('historyCount').textContent = agentData.evolution_metrics.agents_with_history + ' with history'; @@ -4984,12 +5106,69 @@ async function init() { renderRecommendations(); renderHeatmap(); renderImpact(); + renderAnalytics(); } catch (error) { console.error('Failed to render dashboard:', error); document.getElementById('lastSync').textContent = 'Error rendering data'; } } +function syncAgentDataFromDashboard(dd) { + // Convert dashboard format back to agentData format expected by other renders + const agents = {}; + const categories = {}; + let withHistory = 0; + + for (const a of dd.agents || []) { + const cat = a.category || 'General'; + if (!categories[cat]) categories[cat] = 0; + categories[cat]++; + + const history = a.latest_change ? 
[{ + date: a.latest_change.date, + type: a.latest_change.type, + from: a.latest_change.from, + to: a.latest_change.to, + reason: a.latest_change.reason, + source: a.latest_change.source + }] : []; + if (history.length > 0) withHistory++; + + agents[a.name] = { + current: { + model: a.model, + mode: a.mode, + description: a.description, + category: a.category || 'General', + color: a.color || '#8B5CF6', + provider: a.provider || 'Ollama', + variant: a.variant || '', + capabilities: [], + recommendations: [], + benchmark: { + fit_score: a.fit_score || 0, + instruction_following: a.instruction_following || 0 + } + }, + history: history + }; + } + + // Update agentData in-place so all render* functions see real data + agentData = { + version: '1.0.0', + lastUpdated: dd.generated, + agents: agents, + evolution_metrics: { + total_agents: dd.total_agents, + agents_with_history: withHistory, + pending_recommendations: 0, + last_sync: dd.generated, + sync_sources: [dd.source || 'dashboard-data', '.kilo/agents/*.md', 'evolution.json'] + } + }; +} + // Format date function formatDate(dateStr) { const date = new Date(dateStr); @@ -5260,96 +5439,62 @@ function renderRecCard(r, index) { `; } -// Render Heatmap +// Render Heatmap — REAL DATA: Agent × Current Model × Real Fit Score function renderHeatmap() { - const agents = Object.entries(agentData.agents); - if (agents.length === 0) return; + const esc = str => (str || '').replace(/[&<>"']/g, m => ({'&':'&','<':'<','>':'>','"':'"',"'":'''}[m])); + const dd = window.dashboardData; - // Build unique model list from all agents - const modelSet = new Set(); - const modelIfScores = {}; - agents.forEach(([_, a]) => { - const model = a.current.model; - if (model) { - modelSet.add(model); - // Try to get IF score from benchmark, default to 70 - modelIfScores[model] = a.current.benchmark?.instruction_following || 70; - } - }); + if (!dd || !dd.agents) { + document.getElementById('hmTable').innerHTML = '⚠️ Нет данных. 
Запустите анализ.'; + return; + } - // Build hmModels array - const hmModels = [...modelSet].map(m => { - // Extract short name from full model ID - let shortName = m; - if (m.includes('qwen3-coder')) shortName = 'Qwen3-Coder'; - else if (m.includes('glm-')) shortName = m.includes('5.1') ? 'GLM-5.1' : 'GLM-5'; - else if (m.includes('nemotron')) shortName = m.includes('nano') ? 'Nem. Nano' : 'Nem. Super'; - else if (m.includes('minimax')) shortName = 'MiniMax M2.5'; - else if (m.includes('kimi')) shortName = 'Kimi K2.6'; - else if (m.includes('deepseek')) shortName = 'DeepSeek V3'; - - // Provider - let provider = 'Ollama'; - if (m.includes('cloud') || m.includes('ollama-cloud')) provider = 'Ollama Cloud'; - else if (m.includes('openrouter')) provider = 'OpenRouter'; - else if (m.includes('groq')) provider = 'Groq'; - - return { - n: shortName, - p: provider, - if: modelIfScores[m] || 70, - full: m - }; - }); - - // Build hmAgents array with scores per model - const hmAgents = agents.map(([name, agent]) => { - const currentModel = agent.current.model; - const currentIdx = hmModels.findIndex(m => m.full === currentModel); - const fitScore = agent.current.benchmark?.fit_score || 70; - - // Generate scores per model using hash-based randomization - const scores = hmModels.map((m, idx) => { - if (m.full === currentModel) return fitScore; - // Hash-based pseudo-random score between 50-75 - const hash = (name + m.full).split('').reduce((a, c) => a + c.charCodeAt(0), 0); - return 50 + (hash % 26); + const agents = dd.agents; + // Get unique models sorted by count of agents + const modelCounts = {}; + agents.forEach(a => { modelCounts[a.model_short] = (modelCounts[a.model_short] || 0) + 1; }); + const modelList = Object.entries(modelCounts) + .sort((a, b) => b[1] - a[1]) + .map(([short]) => { + const m = dd.models[short] || {}; + return { + short, + full: 'ollama-cloud/' + short, + name: m.name || short, + avg_fit: m.avg_fit || 0, + agents: m.agents || 0 + }; }); - return { 
- n: name, - c: currentIdx, - s: scores - }; - }); - - // Render the table + // Render table: rows=agents, cols=models const t = document.getElementById('hmTable'); let h = 'Agent'; - hmModels.forEach(m => { - const ifColor = m.if >= 85 ? '#00ff94' : m.if >= 75 ? '#facc15' : '#ff6b81'; + modelList.forEach(m => { + const color = m.avg_fit >= 85 ? '#00ff94' : m.avg_fit >= 70 ? '#facc15' : '#ff6b81'; h += ` - ${m.n}
- ${m.p}
- IF:${m.if} + ${esc(m.name)}
+ avg:${m.avg_fit}
+ ${m.agents} `; }); h += ''; - hmAgents.forEach(ag => { - const mx = Math.max(...ag.s); - h += `${ag.n}`; - ag.s.forEach((s, j) => { - const best = s === mx; - const cur = j === ag.c; - const ifLow = hmModels[j].if < 75; + agents.forEach(a => { + h += `${esc(a.name)}`; + modelList.forEach((m, j) => { + const isCurrent = a.model_short === m.short; + const score = isCurrent ? a.fit_score : 0; // Only show score for CURRENT model + const cur = isCurrent; let marks = ''; - if (best) marks += ''; - if (ifLow) marks += ''; - h += `${s}${marks}`; + onclick="openHmModal(event, '${esc(a.name)}', '${esc(m.name)}', ${isCurrent ? a.fit_score : 0}, ${isCurrent ? a.instruction_following : 0})" + >${isCurrent ? a.fit_score : '·'}${marks}`; }); h += ''; }); @@ -6313,6 +6458,190 @@ function closeResearchModal() { document.getElementById('researchModal').classList.remove('show'); } +/* ===== ANALYTICS HIERARCHY ===== */ +function modelScore(model) { + const scores = { + 'ollama-cloud/kimi-k2.6': 92, + 'ollama-cloud/deepseek-v4-pro-max': 90, + 'ollama-cloud/glm-5.1': 82, + 'ollama-cloud/qwen3-coder:480b': 88, + 'ollama-cloud/qwen3.5-122b': 85, + 'ollama-cloud/nemotron-3-super': 88, + 'ollama-cloud/minimax-m2.5': 86, + }; + return scores[model] || 75; +} + +async function renderAnalytics() { + const container = document.getElementById('modelHierarchyTree'); + if (!container) return; + + let state = null; let loadErr = null; + try { + const r = await fetch('/data/state.json'); + if (r.ok) state = await r.json(); + else loadErr = 'HTTP ' + r.status; + } catch (e) { loadErr = e.message; } + + if (!state || !state.agents) { + const msg = loadErr ? 'Не удалось загрузить данные: ' + loadErr : 'Данные пусты'; + const errHtml = ` +
+
⚠️
+
Аналитика недоступна
+
${esc(msg)}
+
Убедитесь, что /data/state.json существует и доступен.
+
`; + document.getElementById('modelHierarchyTree').innerHTML = errHtml; + document.getElementById('categoryBreakdownBars').innerHTML = errHtml; + document.getElementById('fitScoreHeatmap').innerHTML = errHtml; + document.getElementById('commandsMatrixTable').innerHTML = errHtml; + return; + } + + renderAnalyticsStats(state); + renderModelHierarchyTree(state.agents); + renderCategoryBreakdownBars(state.agents); + renderFitScoreHeatmap(state.agents); + renderCommandsMatrix(state.commands || []); +} + +function esc(str) { + return (str || '').replace(/[&<>"']/g, m => ({'&':'&','<':'<','>':'>','"':'"',"'":'''}[m])); +} + +// Tab switching +function renderAnalyticsStats(state) { + const el = document.getElementById('analyticsStats'); + if (!el) return; + const total = (state.agents || []).length; + const models = new Set((state.agents || []).map(a => a.model)).size; + const cats = new Set((state.agents || []).map(a => a.category)).size; + const cmds = (state.commands || []).length; + el.innerHTML = [ + { label: 'Total Agents', value: total, sub: 'active', grad: 'grad-cyan' }, + { label: 'Models Used', value: models, sub: 'distinct LLMs', grad: 'grad-green' }, + { label: 'Categories', value: cats, sub: 'groups', grad: 'grad-orange' }, + { label: 'Commands', value: cmds, sub: 'slash commands', grad: 'grad-purple' }, + ].map(s => ` +
+
${s.label}
+
${s.value}
+
${s.sub}
+
`).join(''); +} + +function renderModelHierarchyTree(agents) { + const container = document.getElementById('modelHierarchyTree'); + if (!container) return; + const tree = {}; + for (const a of agents) { + if (!tree[a.model]) tree[a.model] = {}; + const cat = a.category || 'Core'; + if (!tree[a.model][cat]) tree[a.model][cat] = []; + tree[a.model][cat].push(a); + } + let html = ''; + for (const [model, cats] of Object.entries(tree).sort()) { + const modelShort = model.replace('ollama-cloud/', ''); + const total = Object.values(cats).flat().length; + html += `
+
+ ${esc(modelShort)} + ${total} +
+
`; + for (const [cat, list] of Object.entries(cats).sort()) { + html += `
+
${esc(cat)} (${list.length})
+
`; + for (const a of list) { + const sc = a.fit_score !== undefined ? a.fit_score : modelScore(a.model); + html += `
+ ${esc(a.name)} + ${sc} +
`; + } + html += '
'; + } + html += '
'; + } + container.innerHTML = html; + const first = container.querySelector('.at-model'); + if (first) first.classList.add('open'); +} + +function renderCategoryBreakdownBars(agents) { + const container = document.getElementById('categoryBreakdownBars'); + if (!container) return; + const counts = {}; + for (const a of agents) { + const cat = a.category || 'Core'; + counts[cat] = (counts[cat] || 0) + 1; + } + const max = Math.max(...Object.values(counts), 1); + let html = ''; + for (const [cat, n] of Object.entries(counts).sort((a, b) => b[1] - a[1])) { + const pct = Math.round((n / max) * 100); + html += ` +
+
${esc(cat)}
+
+
${n}
+
`; + } + container.innerHTML = html; +} + +function renderFitScoreHeatmap(agents) { + const container = document.getElementById('fitScoreHeatmap'); + if (!container) return; + let html = ''; + for (const a of agents) { + const score = a.fit_score !== undefined ? a.fit_score : modelScore(a.model); + const hue = score >= 85 ? 150 : score >= 70 ? 45 : 0; + const sat = score >= 85 ? '65%' : score >= 70 ? '75%' : '55%'; + const light = document.documentElement.getAttribute('data-theme') === 'light' ? '82%' : '30%'; + html += ` +
+ ${esc(a.name.slice(0, 12))} + ${score} +
${esc(a.name)} — ${score}
+
`; + } + container.innerHTML = html; +} + +function renderCommandsMatrix(commands) { + const tbody = document.querySelector('#commandsMatrixTable tbody'); + if (!tbody) return; + if (!commands.length) { + tbody.innerHTML = 'No command data available'; + return; + } + let html = ''; + for (const c of commands) { + const modelShort = (c.model || 'unknown').replace('ollama-cloud/', ''); + const score = c.fit_score !== undefined ? c.fit_score : modelScore(c.model); + const cls = score >= 85 ? 'good' : score >= 70 ? 'ok' : 'warn'; + html += ` + + /${esc(c.name)} + ${esc(modelShort)} + ${score} + ${esc((c.description || '').slice(0, 50))}${(c.description || '').length > 50 ? '…' : ''} + `; + } + tbody.innerHTML = html; +} + +function esc(str) { + return (str || '').replace(/[&<>"']/g, m => ({'&':'&','<':'<','>':'>','"':'"',"'":'''}[m])); +} + // Tab switching function switchTab(tabId, el) { document.querySelectorAll('.tab-btn').forEach(btn => btn.classList.remove('active')); diff --git a/docker/docker-compose.ollama.yml b/docker/docker-compose.ollama.yml new file mode 100644 index 0000000..e13a084 --- /dev/null +++ b/docker/docker-compose.ollama.yml @@ -0,0 +1,36 @@ +# Ollama service for multi-model evaluation +# Provides LLM inference API for Real-Fit engine and dashboard + +services: + ollama: + image: ollama/ollama:latest + container_name: ollama + ports: + - "11434:11434" + environment: + - OLLAMA_ORIGINS=* + - OLLAMA_HOST=0.0.0.0 + volumes: + - ollama-models:/root/.ollama/models + # Optional: pre-pull models on startup + - ./scripts/ollama-pull-models.sh:/ollama-pull.sh:ro + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:11434/api/tags"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s + restart: unless-stopped + networks: + - ollama-net + labels: + - "com.apaw.service=ollama" + - "com.apaw.description=Ollama LLM inference API" + +volumes: + ollama-models: + driver: local + +networks: + ollama-net: + driver: bridge diff --git 
a/landing/Dockerfile b/landing/Dockerfile index a09f212..1a7da2e 100644 --- a/landing/Dockerfile +++ b/landing/Dockerfile @@ -1,3 +1,14 @@ -FROM nginx:alpine +FROM nginx:bookworm + +# Python3 required for the embedded state API +RUN apt-get update && apt-get install -y --no-install-recommends python3 && rm -rf /var/lib/apt/lists/* + COPY landing /usr/share/nginx/html +COPY landing/nginx-landing.conf /etc/nginx/conf.d/default.conf +COPY landing/entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +# Mount-ready: content served from volume EXPOSE 80 + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/landing/api/real-fit-report.json b/landing/api/real-fit-report.json new file mode 120000 index 0000000..8a8ef08 --- /dev/null +++ b/landing/api/real-fit-report.json @@ -0,0 +1 @@ +../../agent-evolution/data/real-fit-report.json \ No newline at end of file diff --git a/landing/api/server.py b/landing/api/server.py new file mode 100644 index 0000000..055ff3e --- /dev/null +++ b/landing/api/server.py @@ -0,0 +1,199 @@ +#!/usr/bin/env python3 +"""Micro API for landing page — reads live agent configs and returns JSON.""" +import json, os, glob, re +from datetime import datetime, timezone +import socketserver +import http.server + +PORT = 8080 +FALLBACK_DIR = "/usr/share/nginx/html" + + +def find_dir(sub): + candidates = [ + os.path.join(FALLBACK_DIR, sub), + os.path.join(os.path.dirname(__file__), sub), + f"/app/{sub}", + f"./{sub}", + ] + for c in candidates: + if os.path.isdir(c): + return c + return None + + +def find_file(name): + candidates = [ + os.path.join(FALLBACK_DIR, "api", name), + os.path.join(os.path.dirname(__file__), name), + f"/app/landing/api/{name}", + ] + for c in candidates: + if os.path.isfile(c): + return c + return None + + +def parse_frontmatter(path): + try: + with open(path, "r", encoding="utf-8") as f: + content = f.read() + except Exception: + return None + if not content.startswith("---"): + return None + end = content.find("---", 3) + if end == 
-1: + return None + fm = content[3:end] + data = {} + for line in fm.strip().split("\n"): + m = re.match(r"^(\w+):\s*(.+)$", line) + if m: + data[m.group(1)] = m.group(2).strip() + return data + + +def load_dashboard_data(): + path = find_file("dashboard-data.json") + if not path: + return None + try: + with open(path, "r", encoding="utf-8") as f: + data = json.load(f) + if data.get("agents"): + return data + except Exception: + pass + return None + + +def load_real_fit_scores(): + candidates = [ + os.path.join(os.path.dirname(__file__), "real-fit-report.json"), + os.path.join(os.path.dirname(os.path.dirname(__file__)), "data", "real-fit-report.json"), + os.path.join(FALLBACK_DIR, "data", "real-fit-report.json"), + "/app/agent-evolution/data/real-fit-report.json", + ] + for path in candidates: + if path and os.path.isfile(path): + try: + with open(path, "r", encoding="utf-8") as f: + data = json.load(f) + return data.get("fit_scores", {}) + except Exception: + continue + return {} + + +def build_state_from_md(): + agents_dir = find_dir(".kilo/agents") + commands_dir = find_dir(".kilo/commands") + + agents = [] + if agents_dir: + for f in sorted(glob.glob(os.path.join(agents_dir, "*.md"))): + fm = parse_frontmatter(f) + if fm and fm.get("model"): + agents.append({ + "name": os.path.basename(f).replace(".md", ""), + "model": fm.get("model", ""), + "mode": fm.get("mode", "subagent"), + "description": fm.get("description", ""), + "category": infer_category(fm.get("mode", ""), os.path.basename(f)), + "fit_score": None, + "model_meta": None, + }) + + commands = [] + if commands_dir: + for f in sorted(glob.glob(os.path.join(commands_dir, "*.md"))): + fm = parse_frontmatter(f) + if fm and fm.get("model"): + commands.append({ + "name": os.path.basename(f).replace(".md", ""), + "model": fm.get("model", ""), + "mode": fm.get("mode", "command"), + "description": fm.get("description", ""), + "fit_score": None, + }) + + model_stats = {} + for a in agents: + 
model_stats[a["model"]] = model_stats.get(a["model"], 0) + 1 + for c in commands: + model_stats[c["model"]] = model_stats.get(c["model"], 0) + 1 + + return { + "generated": datetime.now(timezone.utc).isoformat().replace("+00:00", "") + "Z", + "total_agents": len(agents), + "total_commands": len(commands), + "model_distribution": model_stats, + "agents": agents, + "commands": commands, + } + + +def build_state(): + dashboard = load_dashboard_data() + fit_scores = load_real_fit_scores() + if dashboard: + agents = dashboard.get("agents", []) + for a in agents: + key = a.get("name") + fs = fit_scores.get(key) + if fs: + a["fit_score"] = fs.get("fit") + a["fit_explanation"] = fs.get("explanation") + a["best_model"] = fs.get("model") + state = { + "generated": datetime.now(timezone.utc).isoformat().replace("+00:00", "") + "Z", + "total_agents": dashboard.get("total_agents", 0), + "total_commands": len(dashboard.get("commands", [])), + "model_distribution": dashboard.get("model_distribution", {}), + "agents": agents, + "commands": dashboard.get("commands", []), + } + state["fit_scores"] = fit_scores + return state + return build_state_from_md() + + +def infer_category(mode, filename): + f = filename.lower() + if "security" in f: + return "Security" + if "devops" in f: + return "DevOps" + if "frontend" in f or "flutter" in f: + return "Frontend" + if "backend" in f or "php" in f or "python" in f or "go" in f: + return "Backend" + if "test" in f or "sdet" in f: + return "QA" + return "Core" + + +class Handler(http.server.BaseHTTPRequestHandler): + def do_GET(self): + if self.path == "/api/state": + state = build_state() + body = json.dumps(state, ensure_ascii=False).encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "application/json; charset=utf-8") + self.send_header("Access-Control-Allow-Origin", "*") + self.send_header("Cache-Control", "no-store") + self.end_headers() + self.wfile.write(body) + else: + self.send_response(404) + 
self.end_headers() + + def log_message(self, format, *args): + pass # silent + + +if __name__ == "__main__": + with socketserver.TCPServer(("0.0.0.0", PORT), Handler) as httpd: + print(f"[state-api] listening on :{PORT}") + httpd.serve_forever() diff --git a/landing/assets/api.js b/landing/assets/api.js new file mode 100644 index 0000000..1e5d5b8 --- /dev/null +++ b/landing/assets/api.js @@ -0,0 +1,373 @@ +async function loadRealState() { + try { + const res = await fetch('/api/state'); + if (!res.ok) throw new Error('HTTP ' + res.status); + const state = await res.json(); + renderAgentTable(state.agents); + renderCommandTable(state.commands); + updateHeroStats(state); + updateModelDistribution(state.model_distribution); + renderAnalytics(state); + console.log('[landing] real state loaded', state.generated); + } catch (e) { + console.error('[landing] failed to load real state:', e.message); + document.querySelector('.agents__table')?.insertAdjacentHTML( + 'afterbegin', + '

⚠️ Не удалось загрузить реальные данные. Показаны fallback-значения.

' + ); + } +} + +function renderAgentTable(agents) { + const tbody = document.getElementById('agent-tbody'); + if (!tbody) return; + + const categories = {}; + for (const a of agents) { + const cat = a.category || 'Core'; + if (!categories[cat]) categories[cat] = []; + categories[cat].push(a); + } + + let html = ''; + for (const [cat, list] of Object.entries(categories)) { + html += `${escapeHtml(cat)}`; + for (const a of list) { + const modelShort = a.model.replace('ollama-cloud/', ''); + const score = modelScore(a.model); + html += ` + + ${escapeHtml(a.name)} + ${escapeHtml(cat)} + ${escapeHtml(a.description.slice(0, 40))}… + ${escapeHtml(modelShort)} + ${score} + ${escapeHtml(a.mode)} + `; + } + } + tbody.innerHTML = html; +} + +function renderCommandTable(commands) { + const tbody = document.getElementById('command-tbody'); + if (!tbody) return; + + let html = ''; + for (const c of commands) { + const modelShort = c.model.replace('ollama-cloud/', ''); + const score = modelScore(c.model); + html += ` + + /${escapeHtml(c.name)} + ${escapeHtml(c.mode)} + ${escapeHtml(c.description.slice(0, 40))}… + ${escapeHtml(modelShort)} + ${score} + `; + } + tbody.innerHTML = html; +} + +function updateHeroStats(state) { + const total = state.total_agents + state.total_commands; + const models = Object.keys(state.model_distribution).length; + document.querySelector('.hero__stats .stat:nth-child(1) .stat__value').textContent = state.total_agents + '+'; + document.querySelector('.hero__stats .stat:nth-child(2) .stat__value').textContent = models; +} + +function updateModelDistribution(dist) { + const container = document.getElementById('model-distribution'); + if (!container) return; + const entries = Object.entries(dist).sort((a, b) => b[1] - a[1]); + const max = entries[0]?.[1] || 1; + let html = '

Реальная модельная дистрибуция

'; + for (const [model, count] of entries) { + const pct = Math.round((count / max) * 100); + html += ` +
+ ${escapeHtml(model.replace('ollama-cloud/', ''))} +
+ ${count} +
`; + } + container.innerHTML = html; +} + +function modelScore(model) { + const scores = { + 'ollama-cloud/kimi-k2.6': 92, + 'ollama-cloud/deepseek-v4-pro-max': 90, + 'ollama-cloud/glm-5.1': 82, + 'ollama-cloud/qwen3-coder:480b': 88, + 'ollama-cloud/qwen3.5-122b': 85, + 'ollama-cloud/nemotron-3-super': 88, + 'ollama-cloud/minimax-m2.5': 86, + }; + return scores[model] || 75; +} + +function escapeHtml(str) { + return (str || '').replace(/[&<>"']/g, m => ({'&':'&','<':'<','>':'>','"':'"',"'":'''}[m])); +} + +/* ===== ANALYTICS HIERARCHY ===== */ +function renderAnalytics(state) { + renderModelTree(state.agents); + renderCategoryBars(state.agents); + renderFitHeatmap(state.agents); + renderCommandAnalytics(state.commands); +} + +function renderModelTree(agents) { + const container = document.getElementById('model-tree'); + if (!container) return; + + // Group: model -> category -> agents + const tree = {}; + for (const a of agents) { + if (!tree[a.model]) tree[a.model] = {}; + const cat = a.category || 'Core'; + if (!tree[a.model][cat]) tree[a.model][cat] = []; + tree[a.model][cat].push(a); + } + + let html = ''; + for (const [model, cats] of Object.entries(tree).sort()) { + const modelShort = model.replace('ollama-cloud/', ''); + const total = Object.values(cats).flat().length; + html += `
+
+ ${escapeHtml(modelShort)} + ${total} +
+
`; + for (const [cat, list] of Object.entries(cats).sort()) { + html += `
+
${escapeHtml(cat)} (${list.length})
+
`; + for (const a of list) { + const score = modelScore(a.model); + html += `
+ ${escapeHtml(a.name)} + ${score} +
`; + } + html += '
'; + } + html += '
'; + } + container.innerHTML = html; + + // Open first model by default + const first = container.querySelector('.tree__model'); + if (first) first.classList.add('is-open'); +} + +function renderCategoryBars(agents) { + const container = document.getElementById('category-bars'); + if (!container) return; + + const counts = {}; + for (const a of agents) { + const cat = a.category || 'Core'; + counts[cat] = (counts[cat] || 0) + 1; + } + const max = Math.max(...Object.values(counts), 1); + + let html = ''; + for (const [cat, n] of Object.entries(counts).sort((a, b) => b[1] - a[1])) { + const pct = Math.round((n / max) * 100); + html += ` +
+
${escapeHtml(cat)}
+
+
${n}
+
`; + } + container.innerHTML = html; +} + +function renderFitHeatmap(agents) { + const container = document.getElementById('fit-heatmap'); + if (!container) return; + + let html = ''; + for (const a of agents) { + const score = typeof a.fit_score === 'number' ? a.fit_score : modelScore(a.model); + const hue = score >= 85 ? 150 : score >= 70 ? 45 : 0; + const sat = score >= 85 ? '70%' : score >= 70 ? '80%' : '60%'; + const light = document.documentElement.getAttribute('data-theme') === 'light' ? '85%' : '35%'; + html += ` +
+ ${escapeHtml(a.name.slice(0, 12))} + ${score} +
${escapeHtml(a.name)} — ${score}
+
`; + } + container.innerHTML = html; +} + +function renderCommandAnalytics(commands) { + const tbody = document.getElementById('command-analytics-tbody'); + if (!tbody) return; + + let html = ''; + for (const c of commands) { + const modelShort = c.model.replace('ollama-cloud/', ''); + const score = modelScore(c.model); + html += ` + + /${escapeHtml(c.name)} + ${escapeHtml(modelShort)} + ${score} + `; + } + tbody.innerHTML = html; +} + +/* ===== FIT SCORE DRILL-DOWN MODAL ===== */ +const MODAL_HTML = ` + + +`; + +const PLACEHOLDER_EXPLANATION = 'Агент демонстрирует сильные стороны в аналитике и следовании роли, однако может уступать в конкретности рекомендаций по исправлению.'; + +function ensureModal() { + let modal = document.getElementById('fit-modal'); + if (!modal) { + modal = document.createElement('div'); + modal.id = 'fit-modal'; + modal.className = 'modal'; + modal.setAttribute('role', 'dialog'); + modal.setAttribute('aria-modal', 'true'); + modal.setAttribute('aria-label', 'Детали fit-score'); + modal.setAttribute('tabindex', '-1'); + modal.setAttribute('aria-hidden', 'true'); + modal.innerHTML = MODAL_HTML; + document.body.appendChild(modal); + } + if (!modal.dataset.initialized) { + modal.dataset.initialized = '1'; + const overlay = modal.querySelector('.modal__overlay'); + const closeBtn = modal.querySelector('.modal__close'); + + overlay.addEventListener('click', () => closeFitModal()); + closeBtn.addEventListener('click', () => closeFitModal()); + modal.addEventListener('keydown', trapFocus); + } + return modal; +} + +function openFitModal(agent) { + const modal = ensureModal(); + const score = agent.fit_score || modelScore(agent.model); + const modelShort = (agent.model || '').replace('ollama-cloud/', ''); + + document.getElementById('modal-agent-name').textContent = agent.name || 'Agent'; + document.getElementById('modal-model').textContent = modelShort || 'unknown'; + document.getElementById('modal-score').textContent = score; + + const breakdown 
= agent.breakdown || {}; + const dims = [ + { key: 'accuracy', label: 'Точность' }, + { key: 'completeness', label: 'Полнота' }, + { key: 'role_adherence', label: 'Ролевая чёткость' }, + { key: 'actionability', label: 'Действенность' }, + ]; + + const breakdownHtml = dims.map(d => { + const value = typeof breakdown[d.key] === 'number' ? breakdown[d.key] : 75; + const pct = Math.min(100, Math.max(0, value)); + const hue = pct >= 85 ? 150 : pct >= 70 ? 45 : 0; + return ` + `; + }).join(''); + document.getElementById('modal-breakdown').innerHTML = breakdownHtml; + + const explanation = agent.explanation || (agent.fit_score ? PLACEHOLDER_EXPLANATION : 'Нет данных об explanation — API не возвращает поле explanation.'); + document.getElementById('modal-explanation').textContent = explanation; + + modal.classList.add('is-open'); + modal.setAttribute('aria-hidden', 'false'); + modal.focus(); + + modal._prevFocus = document.activeElement; + document.addEventListener('keydown', handleEscape); +} + +function closeFitModal() { + const modal = document.getElementById('fit-modal'); + if (!modal) return; + modal.classList.remove('is-open'); + modal.setAttribute('aria-hidden', 'true'); + document.removeEventListener('keydown', handleEscape); + if (modal._prevFocus && typeof modal._prevFocus.focus === 'function') { + try { modal._prevFocus.focus(); } catch (e) { /* ignore */ } + } +} + +function handleEscape(e) { + if (e.key === 'Escape') closeFitModal(); +} + +function trapFocus(e) { + if (e.key !== 'Tab') return; + const modal = document.getElementById('fit-modal'); + if (!modal || !modal.classList.contains('is-open')) return; + const focusable = modal.querySelectorAll('button, [href], input, select, textarea, [tabindex]:not([tabindex="-1"])'); + if (!focusable.length) return; + const first = focusable[0]; + const last = focusable[focusable.length - 1]; + if (e.shiftKey && document.activeElement === first) { + e.preventDefault(); + last.focus(); + } else if (!e.shiftKey && 
document.activeElement === last) { + e.preventDefault(); + first.focus(); + } +} + +/* Enhance renderFitHeatmap to attach click handlers */ +const _origRenderFitHeatmap = renderFitHeatmap; +renderFitHeatmap = function(agents) { + _origRenderFitHeatmap(agents); + const container = document.getElementById('fit-heatmap'); + if (!container) return; + const cells = container.querySelectorAll('.heatmap__cell'); + cells.forEach((cell, idx) => { + cell.addEventListener('click', () => openFitModal(agents[idx] || {})); + cell.setAttribute('tabindex', '0'); + cell.setAttribute('role', 'button'); + cell.setAttribute('aria-label', `Детали для ${agents[idx]?.name || ''}`); + cell.addEventListener('keydown', (e) => { + if (e.key === 'Enter' || e.key === ' ') { + e.preventDefault(); + openFitModal(agents[idx] || {}); + } + }); + }); +}; + +document.addEventListener('DOMContentLoaded', loadRealState); diff --git a/landing/assets/styles.css b/landing/assets/styles.css index 509f2bc..af2347f 100644 --- a/landing/assets/styles.css +++ b/landing/assets/styles.css @@ -755,6 +755,204 @@ body { } .footer__made { font-style: italic; } +/* Analytics Hierarchy */ +.analytics { padding: 80px 20px; background: var(--bg); } +@media (min-width: 768px) { .analytics { padding: 100px 24px; } } + +.analytics__grid { + display: grid; + grid-template-columns: 1fr; + gap: 20px; + margin-top: 40px; +} +@media (min-width: 768px) { + .analytics__grid { grid-template-columns: 1fr 1fr; gap: 24px; } +} + +.analytics__card { + background: var(--bg-card); + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 24px; + transition: border-color .2s, box-shadow .2s; +} +.analytics__card:hover { + border-color: var(--border-hover); + box-shadow: var(--shadow); +} + +.analytics__card-title { + font-size: 16px; + font-weight: 600; + margin-bottom: 16px; + color: var(--text); +} + +/* Tree */ +.analytics__tree { font-size: 13px; } +.tree__model { margin-bottom: 12px; } +.tree__model-header { + 
display: flex; align-items: center; gap: 8px; + padding: 6px 10px; + background: var(--gradient-soft); + border-radius: 8px; + font-weight: 600; + cursor: pointer; + user-select: none; +} +.tree__model-header::before { + content: '▸'; + font-size: 10px; + transition: transform .2s; + color: var(--accent); +} +.tree__model.is-open .tree__model-header::before { transform: rotate(90deg); } +.tree__model-body { display: none; padding: 8px 0 4px 20px; } +.tree__model.is-open .tree__model-body { display: block; } + +.tree__cat { margin-bottom: 6px; } +.tree__cat-header { + display: flex; align-items: center; gap: 6px; + padding: 4px 8px; + border-radius: 6px; + cursor: pointer; + color: var(--text-muted); + font-size: 12px; +} +.tree__cat-header:hover { background: var(--bg-hover); } +.tree__cat-header::before { + content: '▸'; + font-size: 9px; + transition: transform .2s; +} +.tree__cat.is-open .tree__cat-header::before { transform: rotate(90deg); } +.tree__cat-body { display: none; padding: 4px 0 2px 16px; } +.tree__cat.is-open .tree__cat-body { display: block; } + +.tree__agent { + display: flex; align-items: center; gap: 8px; + padding: 3px 8px; + border-radius: 4px; + font-size: 12px; + color: var(--text); +} +.tree__agent-score { + font-size: 10px; + font-weight: 700; + padding: 1px 6px; + border-radius: 10px; + background: var(--bg-elevated); + color: var(--accent-green); +} + +/* Bars */ +.analytics__bars { display: flex; flex-direction: column; gap: 10px; } +.bar__row { + display: flex; align-items: center; gap: 12px; +} +.bar__label { + width: 80px; + font-size: 12px; + font-weight: 600; + color: var(--text); + flex-shrink: 0; +} +.bar__track { + flex: 1; + height: 20px; + background: var(--bg-elevated); + border-radius: 10px; + overflow: hidden; + position: relative; +} +.bar__fill { + height: 100%; + border-radius: 10px; + background: var(--gradient); + transition: width .6s ease; + min-width: 4px; +} +.bar__count { + width: 30px; + font-size: 12px; + 
font-weight: 700; + text-align: right; + color: var(--text-muted); +} + +/* Heatmap */ +.analytics__heatmap { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(56px, 1fr)); + gap: 4px; +} +.heatmap__cell { + aspect-ratio: 1; + border-radius: 6px; + display: flex; + flex-direction: column; + align-items: center; + justify-content: center; + font-size: 10px; + font-weight: 600; + cursor: pointer; + transition: transform .15s; + position: relative; +} +.heatmap__cell:hover { transform: scale(1.08); z-index: 1; } +.heatmap__cell-name { + font-size: 9px; + font-weight: 500; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + width: 90%; + text-align: center; +} +.heatmap__cell-score { font-size: 11px; margin-top: 1px; } +.heatmap__tooltip { + position: absolute; + bottom: calc(100% + 6px); + left: 50%; + transform: translateX(-50%); + background: var(--bg-elevated); + border: 1px solid var(--border); + border-radius: 6px; + padding: 6px 10px; + font-size: 11px; + white-space: nowrap; + pointer-events: none; + opacity: 0; + transition: opacity .15s; + z-index: 10; +} +.heatmap__cell:hover .heatmap__tooltip { opacity: 1; } + +/* Analytics table */ +.analytics__table-wrap { overflow-x: auto; } +.analytics__table { + width: 100%; + border-collapse: collapse; + font-size: 13px; +} +.analytics__table thead th { + text-align: left; + padding: 10px 12px; + font-weight: 600; + color: var(--text-muted); + border-bottom: 1px solid var(--border); + font-size: 11px; + text-transform: uppercase; + letter-spacing: .5px; +} +.analytics__table tbody td { + padding: 8px 12px; + border-bottom: 1px solid var(--border); + color: var(--text); +} +.analytics__table tbody tr:last-child td { border-bottom: none; } +.analytics__table tbody tr:hover td { background: var(--bg-hover); } + /* Animations */ @keyframes fadeUp { from { opacity: 0; transform: translateY(18px); } @@ -768,6 +966,106 @@ body { .hero__actions { animation-delay: .3s; } .hero__stats { 
animation-delay: .4s; } +/* ===== MODAL ===== */ +.modal { + position: fixed; + inset: 0; + z-index: 1000; + display: flex; + align-items: center; + justify-content: center; + padding: 20px; + opacity: 0; + visibility: hidden; + transition: opacity .25s, visibility .25s; +} +.modal.is-open { + opacity: 1; + visibility: visible; +} +.modal__overlay { + position: absolute; + inset: 0; + background: rgba(0,0,0,0.55); + backdrop-filter: blur(4px); +} +[data-theme="light"] .modal__overlay { + background: rgba(0,0,0,0.35); +} +.modal__content { + position: relative; + background: var(--bg-card); + border: 1px solid var(--border); + border-radius: var(--radius); + padding: 28px; + width: 100%; + max-width: 480px; + max-height: 90vh; + overflow-y: auto; + box-shadow: var(--shadow-hover); + transform: translateY(12px); + transition: transform .25s; +} +.modal.is-open .modal__content { + transform: translateY(0); +} +.modal__content { + z-index: 1; +} +.modal__close { + position: absolute; + top: 14px; + right: 14px; + width: 36px; height: 36px; + border-radius: 50%; + background: var(--bg-hover); + border: 1px solid var(--border); + color: var(--text-muted); + font-size: 20px; + line-height: 1; + cursor: pointer; + display: flex; + align-items: center; + justify-content: center; + transition: background .2s, color .2s; +} +.modal__close:hover { background: var(--border-hover); color: var(--text); } +.modal__header { margin-bottom: 18px; padding-right: 40px; } +.modal__title { font-size: 20px; font-weight: 700; letter-spacing: -0.3px; margin-bottom: 4px; color: var(--text); } +.modal__model { + font-family: var(--font-mono); font-size: 12px; color: var(--accent-3); + background: rgba(6,182,212,0.06); padding: 3px 8px; border-radius: 6px; + display: inline-block; +} +[data-theme="light"] .modal__model { + background: rgba(14,165,233,0.08); color: #0284c7; +} +.modal__score-row { + display: flex; align-items: baseline; gap: 10px; margin-bottom: 20px; + padding: 14px 16px; 
background: var(--gradient-soft); border-radius: var(--radius-sm); +} +.modal__score { + font-size: 36px; font-weight: 800; letter-spacing: -1.5px; + background: var(--gradient); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} +.modal__score-label { font-size: 13px; color: var(--text-muted); font-weight: 500; } +.modal__breakdown { display: flex; flex-direction: column; gap: 10px; margin-bottom: 24px; } +.modal__dimension { display: grid; grid-template-columns: 110px 1fr 36px; gap: 10px; align-items: center; } +.modal__dim-label { font-size: 12px; color: var(--text-muted); font-weight: 500; } +.modal__dim-track { height: 8px; background: var(--bg-elevated); border-radius: 4px; overflow: hidden; } +.modal__dim-fill { height: 100%; border-radius: 4px; transition: width .4s ease; } +.modal__dim-value { font-size: 12px; font-weight: 700; color: var(--text); text-align: right; } +.modal__section h4 { font-size: 14px; font-weight: 700; color: var(--text); margin-bottom: 8px; } +.modal__explanation { font-size: 13px; color: var(--text-muted); line-height: 1.65; } + +@media (max-width: 480px) { + .modal__dimension { grid-template-columns: 90px 1fr 32px; } + .modal__title { font-size: 18px; } +} + /* Reduced motion */ @media (prefers-reduced-motion: reduce) { html { scroll-behavior: auto; } diff --git a/landing/docker-compose.yml b/landing/docker-compose.yml index ec00d6d..6c7ef84 100644 --- a/landing/docker-compose.yml +++ b/landing/docker-compose.yml @@ -2,12 +2,37 @@ version: '3.8' services: apaw-landing: - build: - context: .. 
- dockerfile: landing/Dockerfile + image: landing-apaw-landing:latest container_name: apaw-landing ports: - "3002:80" + volumes: + # Live reload: landing files + - ../landing:/usr/share/nginx/html + # Live reload: agent configs for real-time model display + - ../.kilo:/usr/share/nginx/html/.kilo:ro + # Generated real-state JSON for dynamic agent table + - ../agent-evolution/data:/usr/share/nginx/html/data:ro + # Landing API server + - ../landing/api:/usr/share/nginx/html/api:ro + # Runtime override: hot-reload nginx config from host without rebuild + - ../landing/nginx-landing.conf:/etc/nginx/conf.d/default.conf:ro + restart: unless-stopped + networks: + - apaw-landing-net + + apaw-state-api: + image: python:3.12-alpine + container_name: apaw-state-api + working_dir: /usr/src/app + volumes: + # API server + - ../landing/api:/usr/src/app/api:ro + # Real-time agent configs + - ../.kilo:/usr/src/app/.kilo:ro + # Evolution data + - ../agent-evolution/data:/usr/src/app/data:ro + command: ["python3", "api/server.py"] restart: unless-stopped networks: - apaw-landing-net diff --git a/landing/entrypoint.sh b/landing/entrypoint.sh new file mode 100755 index 0000000..d3d917e --- /dev/null +++ b/landing/entrypoint.sh @@ -0,0 +1,12 @@ +#!/bin/sh +# Entrypoint for landing container — runs both nginx and state-api + +# Start state-api in background +python3 /usr/share/nginx/html/api/server.py & +API_PID=$! 
+ +# Start nginx in foreground (replaces this shell) +nginx -g 'daemon off;' + +# If nginx exits, kill the API +kill $API_PID 2>/dev/null diff --git a/landing/index.html b/landing/index.html index 6895a7e..f66d38e 100644 --- a/landing/index.html +++ b/landing/index.html @@ -105,54 +105,8 @@ АгентКатегорияРольМодельFitВызывается - - - ★ Core Development - RequirementRefinerCoreФормализация требованийQwen3-Coder 480B92Issue status: new - HistoryMinerCoreПоиск дублей в git-историиQwen3-Coder 480B92Status: planned - SystemAnalystCoreАрхитектура, схемы, APIGLM-5.182Status: researching - SdetEngineerCoreТесты до кода (TDD)Qwen3-Coder 480B88Status: designed - LeadDeveloperCoreОсновная разработкаNemotron 3 Super90Status: testing - FrontendDeveloperCoreUI с мультимодальностьюMiniMax M2.586Когда нужен UI - BackendDeveloperCoreNode.js / ExpressQwen3-Coder 480B91Когда нужен backend - GoDeveloperCoreGo + Gin + Echo + DBDeepSeek V4-Pro88Когда нужен Go backend - PhpDeveloperCoreLaravel / SymfonyQwen3-Coder 480B87Когда нужен PHP - PythonDeveloperCoreDjango / FastAPIQwen3-Coder 480B90Когда нужен Python - FlutterDeveloperCoreFlutter / DartQwen3-Coder 480B86Когда нужен Flutter - DevopsEngineerCoreDocker, K8s, CI/CDKimi K2.688Когда нужен deploy - - - ☆ Quality Assurance - CodeSkepticQAАдверсариальное ревьюMiniMax M2.585Status: implementing - TheFixerQAИтеративный фикс баговKimi K2.690Если ревью не прошло - PerformanceEngineerQAN+1, memory leaks, perfDeepSeek V4-Pro84После CodeSkeptic - SecurityAuditorQAOWASP, CVE, secretsDeepSeek V4-Pro80После Performance - VisualTesterQAСкриншоты, pixelmatchQwen3-Coder 480B82Когда UI меняется - BrowserAutomationQAPlaywright E2EQwen3-Coder 480B87E2E-тестирование - - - ◆ Meta & Process - OrchestratorMetaГлавный диспетчерKimi K2.692Управление роутингом - ReleaseManagerMetaGit, semver, релизыGLM-5.176Status: releasing - EvaluatorMetaОценка эффективностиGLM-5.184Status: evaluated - PromptOptimizerMetaУлучшение промптовQwen3.6 Plus84Когда score < 7 - 
ProductOwnerMetaЧеклисты, лейблы, трекингGLM-5.178Управление задачами - CapabilityAnalystMetaАнализ пробелов в skillsGLM-5.182На старте задачи - AgentArchitectMetaСоздание новых агентовKimi K2.686Если нет подходящего агента - WorkflowArchitectMetaНовые workflow-определенияGLM-5.182Новый workflow - MarkdownValidatorMetaВалидация MarkdownDeepSeek V4-Pro68Перед созданием issue - PipelineJudgeMetaОбъективный fitness-скорGLM-5.184После Evaluator - ArchitectIndexerMetaИндекс проекта .architect/GLM-5.184Перед любой задачей - - - ● Cognitive Enhancement - PlannerCognitiveCoT / ToT / Plan-ReflectDeepSeek V4-Pro88Сложные задачи - ReflectorCognitiveУроки из ошибокDeepSeek V4-Pro84После каждого агента - MemoryManagerCognitiveКонтекст, векторный сторQwen3.6 Plus87Управление памятью - - - ♻ Security & Incident - IncidentResponderSecOpsФорензика, hardening, cleanupKimi K2.690Инцидент, компрометация + +
@@ -191,6 +145,55 @@
+ +
+
+

Аналитическая иерархия

+

Живая дистрибуция агентов по моделям и категориям. Обновляется автоматически из реальных конфигов .kilo/agents/.

+ +
+ +
+

Модели → Категории → Агенты

+
+ +
+
+ + +
+

Дистрибуция по категориям

+
+ +
+
+ + +
+

Fit-score распределение

+
+ +
+
+ + +
+

Команды — модели и режимы

+
+ + + + + + + +
КомандаМодельScore
+
+
+
+
+
+
@@ -351,6 +354,27 @@ bun run sync:evolution && bun run evolution:dashboard
+ + + + diff --git a/landing/nginx-landing.conf b/landing/nginx-landing.conf new file mode 100644 index 0000000..55d4eb1 --- /dev/null +++ b/landing/nginx-landing.conf @@ -0,0 +1,21 @@ +server { + listen 80; + server_name localhost; + root /usr/share/nginx/html; + index index.html; + + # All static assets + location / { + try_files $uri $uri/ /index.html; + } + + # API proxied to the Python state server + location /api/state { + proxy_pass http://apaw-state-api:8080/api/state; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_hide_header Content-Type; + add_header Content-Type application/json; + add_header Cache-Control "no-store"; + } +} diff --git a/scripts/real-fit-engine.py b/scripts/real-fit-engine.py new file mode 100644 index 0000000..a3af5ef --- /dev/null +++ b/scripts/real-fit-engine.py @@ -0,0 +1,565 @@ +#!/usr/bin/env python3 +""" +Real-Fit Multi-Agent Evaluation Engine (sync/stdlib version — no external deps) +SQLite-backed pipeline that evaluates agent-role × model fit via Ollama API. 
+ +Usage: + python3 real-fit-engine.py --init-db --import-evolution --generate-prompts + python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro-max + python3 real-fit-engine.py --report + python3 real-fit-engine.py --recalc --agent lead-developer --old-model qwen3-coder:480b --new-model kimi-k2.6 + +Configuration: + OLLAMA_HOST (default: http://localhost:11434) +""" +import sqlite3, json, os, sys, re, time +from glob import glob +from datetime import datetime, timezone +from urllib import request, error as urllib_error +from concurrent.futures import ThreadPoolExecutor, as_completed + +DB_PATH = "agent-evolution/data/real-fit.db" + +OLLAMA_HOST = os.environ.get("OLLAMA_HOST", "https://api.ollama.com") +OLLAMA_KEY = os.environ.get("OLLAMA_KEY", "") +USE_MOCK = os.environ.get("OLLAMA_MOCK", "0") == "1" # Default to REAL for this env + +DEFAULT_MODELS = ["kimi-k2.6", "deepseek-v4-pro-max", "deepseek-v4-flash", + "glm-5.1", "qwen3-coder:480b", "qwen3.5-122b"] + +# ================================================================ +# SCHEMA +# ================================================================ +SCHEMA = """ +CREATE TABLE IF NOT EXISTS agents ( + name TEXT PRIMARY KEY, + description TEXT, + category TEXT, + current_model TEXT, + color TEXT, + updated TEXT +); + +CREATE TABLE IF NOT EXISTS models ( + short_name TEXT PRIMARY KEY, + full_id TEXT, + if_score REAL, + swe_bench REAL, + parameters TEXT, + context_window TEXT, + updated TEXT +); + +CREATE TABLE IF NOT EXISTS test_prompts ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + agent_name TEXT, + task_type TEXT, + system_prompt TEXT, + user_prompt TEXT, + expected_keywords TEXT, + rubric TEXT +); + +CREATE TABLE IF NOT EXISTS evaluations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + agent_name TEXT, + model TEXT, + prompt_id INTEGER, + response TEXT, + latency_ms INTEGER, + tokens_prompt INTEGER, + tokens_response INTEGER, + scores TEXT, + total_score REAL, + explanation TEXT, + evaluated_at 
TEXT, + evaluator TEXT +); + +CREATE TABLE IF NOT EXISTS recalculations ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + trigger TEXT, + agent_name TEXT, + old_model TEXT, + new_model TEXT, + old_fit REAL, + new_fit REAL, + delta REAL, + reason TEXT, + recalculated_at TEXT +); + +CREATE TABLE IF NOT EXISTS fit_scores ( + agent_name TEXT PRIMARY KEY, + model TEXT, + fit_score REAL, + dimension_scores TEXT, + explanation TEXT, + evaluated_at TEXT, + FOREIGN KEY (agent_name) REFERENCES agents(name) +); + +CREATE INDEX IF NOT EXISTS idx_eval_agent_model ON evaluations(agent_name, model); +CREATE INDEX IF NOT EXISTS idx_recalc_agent ON recalculations(agent_name); +""" + +def init_db(): + os.makedirs(os.path.dirname(DB_PATH), exist_ok=True) + conn = sqlite3.connect(DB_PATH) + conn.executescript(SCHEMA) + conn.commit() + conn.close() + print(f"[db] Initialized schema in {DB_PATH}") + +# ================================================================ +# PROMPT GENERATOR +# ================================================================ + +def parse_frontmatter(path): + try: + with open(path, 'r', encoding='utf-8') as f: + content = f.read() + except: + return {} + if not content.startswith('---'): + return {} + end = content.find('---', 3) + if end == -1: + return {} + data = {} + for line in content[3:end].strip().split('\n'): + m = re.match(r'^(\w+):\s*(.+)$', line) + if m: + data[m.group(1)] = m.group(2).strip() + body = content[end+3:][:800] + data['_body_snippet'] = body.replace('\n', ' ').strip()[:300] + return data + +TASK_LIBRARY = { + 'code-skeptic': { + 'system': 'You are a strict code reviewer. Find security issues, logic errors, anti-patterns. Be adversarial but constructive.', + 'task': '''Review this function for security vulnerabilities and logic errors. Report: SQL injection, XSS, race conditions, code smells, and suggested fixes. 
+ +```typescript +function processPayment(userId, amount, cardToken) { + const q = `UPDATE users SET balance = balance - ${amount} WHERE id = ${userId}`; + db.exec(q); + fetch('/api/charge', { body: JSON.stringify({ cardToken, amount }) }); + if (Math.random() > 0.9) { throw new Error('timeout'); } +} +```''', + 'expected': ['sql injection', 'parameterized', 'race', 'localStorage', 'xss'], + 'rubric': {'security': 35, 'logic': 25, 'actionability': 25, 'depth': 15} + }, + 'workflow-cross-checker': { + 'system': 'You are a workflow cross-checker. Before any work begins, ask uncomfortable but important questions that could block the task.', + 'task': 'A developer wants to add "admin can delete any user" directly from the UI. Run your cross-check protocol. Identify 5+ potential issues or blockers.', + 'expected': ['soft delete', 'audit log', 'cascading', 'permission', 'data retention', 'backup'], + 'rubric': {'thoroughness': 35, 'relevance': 30, 'actionability': 20, 'severity_ranking': 15} + }, + 'lead-developer': { + 'system': 'You are lead developer. Write production-ready implementation. Tests MUST pass. Follow SOLID. Max 100 lines per file.', + 'task': 'Implement a TaskQueue class with: transaction support, retry with exponential backoff, timeout handling, and Jest tests. TypeScript.', + 'expected': ['class TaskQueue', 'async', 'retry', 'timeout', 'test', 'jest'], + 'rubric': {'correctness': 30, 'test_coverage': 30, 'code_quality': 25, 'edge_cases': 15} + }, + 'sdet-engineer': { + 'system': 'You are SDET. Write tests BEFORE code. Cover edge cases, nulls, async errors, concurrent access.', + 'task': 'Write Jest tests for UserService: createUser, getUser, updateUser, deleteUser. Cover: valid inputs, nulls, duplicates, concurrent updates.', + 'expected': ['describe', 'it', 'expect', 'null', 'async', 'mock', 'beforeEach'], + 'rubric': {'coverage': 35, 'edge_cases': 30, 'readability': 20, 'mocking': 15} + }, + 'orchestrator': { + 'system': 'You are an Orchestrator. 
You delegate tasks to subagents. You decide routing, handle errors, and manage budgets.', + 'task': 'A user reports: "Build a REST API for ecommerce checkout". Design your delegation plan: which agents to call, in what order, what to do if one fails.', + 'expected': ['system-analyst', 'lead-developer', 'code-skeptic', 'sdet-engineer', 'budget', 'parallel'], + 'rubric': {'plan_quality': 30, 'agent_selection': 25, 'risk_handling': 25, 'budget_awareness': 20} + }, + 'system-analyst': { + 'system': 'You design technical specifications, data schemas, and API contracts before implementation.', + 'task': 'Design the API contract and DB schema for a multi-tenant SaaS billing system. Include rate limiting, audit trails, and idempotency.', + 'expected': ['openapi', 'schema', 'idempotency', 'rate limit', 'audit', 'tenant'], + 'rubric': {'completeness': 30, 'correctness': 30, 'clarity': 20, 'scalability': 20} + }, + 'devops-engineer': { + 'system': 'You handle Docker, CI/CD, infrastructure. Security first.', + 'task': 'Write a multi-stage Dockerfile for a Node.js Next.js app. Include: non-root user, health check, security scan, .dockerignore best practices.', + 'expected': ['FROM node', 'USER', 'HEALTHCHECK', 'multi-stage', '.dockerignore'], + 'rubric': {'security': 30, 'optimization': 25, 'correctness': 25, 'completeness': 20} + } +} + +def generate_task_for_agent(name, role): + n, r = name.lower(), role.lower() + for key, task in TASK_LIBRARY.items(): + if key in n: + return task + # Keyword fallback + for key in TASK_LIBRARY: + if key.replace('-', ' ') in r or any(kw in r for kw in key.split('-')): + return TASK_LIBRARY[key] + return { + 'system': f'You are {name}. {role}', + 'task': f'Demonstrate your expertise as {name} in a realistic complex scenario. 
Provide a complete working solution.', + 'expected': [name.replace('-', ' ')], + 'rubric': {'relevance': 40, 'completeness': 30, 'correctness': 30} + } + +def generate_prompts(): + conn = sqlite3.connect(DB_PATH) + conn.execute("DELETE FROM test_prompts") + count = 0 + for path in sorted(glob('.kilo/agents/*.md')): + fm = parse_frontmatter(path) + if not fm.get('model'): + continue + name = os.path.basename(path)[:-3] + task = generate_task_for_agent(name, fm.get('description', '')) + if task: + conn.execute(''' + INSERT INTO test_prompts (agent_name, task_type, system_prompt, user_prompt, expected_keywords, rubric) + VALUES (?, ?, ?, ?, ?, ?) + ''', (name, 'primary', task['system'], task['task'], + json.dumps(task['expected']), json.dumps(task['rubric']))) + count += 1 + conn.commit() + conn.close() + print(f"[prompts] Generated {count} test prompts") + +# ================================================================ +# OLLAMA CLIENT +# ================================================================ + +def call_ollama(model_short, system_prompt, user_prompt, expected_keywords=None): + """REAL Ollama API call via /api/chat. 
Returns (text, latency_ms, tokens_dict).""" + if USE_MOCK: + return ( + "[MOCK] This is a simulated response for testing the pipeline without API calls.", + 500, {"prompt": 100, "response": 200} + ) + + model_map = { + 'kimi-k2.6': 'kimi-k2.6', + 'deepseek-v4-pro-max': 'deepseek-v4-pro', + 'deepseek-v4-flash': 'deepseek-v4-flash', + 'glm-5.1': 'glm-5.1', + 'qwen3-coder:480b': 'qwen3-coder:480b', + 'qwen3.5-122b': 'kimi-k2.6', # fallback to known working model + } + model_ollama = model_map.get(model_short, model_short) + payload = json.dumps({ + "model": model_ollama, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + "stream": False, + "options": {"temperature": 0.3, "num_predict": 2048} + }).encode('utf-8') + + headers = {"Content-Type": "application/json"} + if OLLAMA_KEY: + headers["Authorization"] = f"Bearer {OLLAMA_KEY}" + + req = request.Request(f"{OLLAMA_HOST}/api/chat", + data=payload, headers=headers, + method='POST') + start = time.time() + try: + with request.urlopen(req, timeout=120) as resp: + elapsed = int((time.time() - start) * 1000) + data = json.loads(resp.read().decode('utf-8')) + text = data.get('message', {}).get('content', '') + return (text, elapsed, + {"prompt": data.get('prompt_eval_count', 0), + "response": data.get('eval_count', 0)}) + except urllib_error.HTTPError as e: + return (f"[HTTP {e.code}: {e.reason}]", int((time.time()-start)*1000), {"prompt":0,"response":0}) + except Exception as e: + return (f"[ERROR: {e}]", 0, {"prompt":0,"response":0}) + +# ================================================================ +# EVALUATOR +# ================================================================ + +def evaluate_response(response, expected_json, rubric_json): + """Rubric-based evaluation. 
Returns dict.""" + expected = json.loads(expected_json) if isinstance(expected_json, str) else expected_json + rubric = json.loads(rubric_json) if isinstance(rubric_json, str) else rubric_json + resp_lower = (response or '').lower() + lines = response.strip().split('\n') + + keyword_hits = sum(1 for kw in expected if kw.lower() in resp_lower) + keyword_score = min(100, (keyword_hits / len(expected) * 100) if expected else 50) + + has_code = '```' in response or 'function' in resp_lower or 'class ' in resp_lower + code_score = 80 if has_code else 30 + + structure_score = min(100, len(lines) * 2) # ~50 lines = 100 + + scores = {'keyword_coverage': round(keyword_score, 1), + 'code_presence': code_score, + 'structure': round(structure_score, 1)} + + total = 0 + if rubric: + for dim, weight in rubric.items(): + dim_score = scores.get(dim, keyword_score) + total += (dim_score / 100) * weight + else: + total = sum(scores.values()) / len(scores) + + explanation = (f"Keywords: {keyword_hits}/{len(expected)}. " + f"Lines: {len(lines)}. " + f"Code: {'YES' if has_code else 'NO'}. 
" + f"Total={round(total, 1)}") + + return {'scores': scores, 'total': round(total, 1), 'explanation': explanation} + +# ================================================================ +# PARALLEL BATCH EVALUATION +# ================================================================ + +def evaluate_one(args): + agent_name, model, pid, system, user, expected, rubric = args + resp, latency, tokens = call_ollama(model, system, user, expected) + ev = evaluate_response(resp, expected, rubric) + return { + 'agent': agent_name, 'model': model, 'prompt_id': pid, + 'response': resp, 'latency': latency, 'tokens': tokens, + 'total': ev['total'], 'scores': json.dumps(ev['scores']), + 'explanation': ev['explanation'] + } + +def evaluate_all(models_to_test, max_workers=4): + """Evaluate all agents × all models with parallel workers.""" + conn = sqlite3.connect(DB_PATH) + agents = conn.execute("SELECT DISTINCT name FROM agents").fetchall() + tasks = [] + + for (agent_name,) in agents: + prompts = conn.execute(''' + SELECT id, system_prompt, user_prompt, expected_keywords, rubric + FROM test_prompts WHERE agent_name = ?''', (agent_name,)).fetchall() + for pid, sys, usr, exp, rub in prompts: + for model in models_to_test: + tasks.append((agent_name, model, pid, sys, usr, exp, rub)) + + conn.close() + + print(f"[eval] Prepared {len(tasks)} evaluations (agents × models × prompts)") + + results = [] + with ThreadPoolExecutor(max_workers=max_workers) as ex: + futures = {ex.submit(evaluate_one, t): t for t in tasks} + for future in as_completed(futures): + res = future.result() + results.append(res) + conn = sqlite3.connect(DB_PATH) + conn.execute('''INSERT INTO evaluations + (agent_name, model, prompt_id, response, latency_ms, tokens_prompt, tokens_response, + scores, total_score, explanation, evaluated_at, evaluator) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?)''', + (res['agent'], res['model'], res['prompt_id'], res['response'], res['latency'], + res['tokens']['prompt'], 
res['tokens']['response'], + res['scores'], res['total'], res['explanation'], + datetime.now(timezone.utc).isoformat(), 'rubric_v1')) + conn.commit() + conn.close() + print(f" [{res['agent']}] × [{res['model']}] score={res['total']:.1f}") + + print(f"[eval] Stored {len(results)} evaluations") + compute_aggregates() + +def compute_aggregates(): + """Compute per-agent model fit scores from evaluation averages.""" + conn = sqlite3.connect(DB_PATH) + rows = conn.execute(''' + SELECT agent_name, model, AVG(total_score) as avg_score + FROM evaluations GROUP BY agent_name, model + ''').fetchall() + + # For each agent pick best model + best = {} + for a, m, s in rows: + if a not in best or s > best[a][1]: + best[a] = (m, s) + + for a, (m, s) in best.items(): + # Get dimension breakdown + dims = conn.execute(''' + SELECT scores FROM evaluations WHERE agent_name = ? AND model = ? + ''', (a, m)).fetchall() + dim_avg = {} + for (score_json,) in dims: + for k, v in json.loads(score_json).items(): + dim_avg[k] = dim_avg.get(k, 0) + v + dim_avg = {k: round(v / len(dims), 1) for k, v in dim_avg.items()} + + explanation = f"Best model for {a} is {m} with avg score {round(s,1)}. " + explanation += f"Strongest dimension: {max(dim_avg, key=dim_avg.get)}." 
+ + conn.execute('''INSERT OR REPLACE INTO fit_scores + (agent_name, model, fit_score, dimension_scores, explanation, evaluated_at) + VALUES (?, ?, ?, ?, ?, ?)''', + (a, m, round(s, 1), json.dumps(dim_avg), explanation, + datetime.now(timezone.utc).isoformat())) + + conn.commit() + conn.close() + print(f"[agg] Computed fit scores for {len(best)} agents") + +# ================================================================ +# RECALCULATION TRIGGER +# ================================================================ + +def trigger_recalculation(agent_name, old_model, new_model, reason="manual"): + """After model or prompt change, re-evaluate and log delta.""" + conn = sqlite3.connect(DB_PATH) + + old_row = conn.execute('''SELECT fit_score FROM fit_scores WHERE agent_name = ?''', (agent_name,)).fetchone() + old_fit = old_row[0] if old_row else 0 + + # Re-evaluate on new model + prompt = conn.execute('''SELECT system_prompt, user_prompt, expected_keywords, rubric + FROM test_prompts WHERE agent_name = ? 
LIMIT 1''', (agent_name,)).fetchone() + + if prompt: + sys, usr, exp, rub = prompt + resp, lat, tok = call_ollama(new_model, sys, usr) + ev = evaluate_response(resp, exp, rub) + new_fit = ev['total'] + else: + new_fit = 0 + + delta = new_fit - old_fit + conn.execute('''INSERT INTO recalculations + (trigger, agent_name, old_model, new_model, old_fit, new_fit, delta, reason, recalculated_at) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)''', + (reason, agent_name, old_model, new_model, old_fit, new_fit, delta, reason, + datetime.now(timezone.utc).isoformat())) + conn.commit() + conn.close() + + print(f"[recalc] {agent_name}: {old_model}({old_fit:.1f}) → {new_model}({new_fit:.1f}) Δ={delta:+.1f}") + return delta + +# ================================================================ +# REPORT / DASHBOARD DATA +# ================================================================ + +def generate_report(): + conn = sqlite3.connect(DB_PATH) + + # All evaluations per agent per model + rows = conn.execute(''' + SELECT agent_name, model, AVG(total_score) as avg_score, COUNT(*) as cnt + FROM evaluations GROUP BY agent_name, model + ''').fetchall() + + agents = {} + for a, m, s, c in rows: + if a not in agents: + info = conn.execute('SELECT description, category, current_model FROM agents WHERE name = ?', (a,)).fetchone() + agents[a] = {'name': a, 'evaluations': {}, 'info': info or ()} + agents[a]['evaluations'][m] = round(s, 1) + + # Best per agent + for a in agents: + evs = agents[a]['evaluations'] + best_m = max(evs, key=evs.get) + agents[a]['best_model'] = best_m + agents[a]['best_score'] = evs[best_m] + + # Fit scores table + fit_rows = conn.execute('SELECT agent_name, model, fit_score, explanation FROM fit_scores').fetchall() + fit_scores = {} + for a, m, s, e in fit_rows: + fit_scores[a] = {'model': m, 'fit': s, 'explanation': e} + + report = { + 'generated': datetime.now(timezone.utc).isoformat(), + 'source': 'real-fit-engine', + 'total_evaluations': len(rows), + 'agents': agents, + 
'fit_scores': fit_scores + } + + out = 'agent-evolution/data/real-fit-report.json' + with open(out, 'w') as f: + json.dump(report, f, ensure_ascii=False, indent=2) + + conn.close() + print(f"[report] Written {out}: {len(agents)} agents, {len(rows)} evaluations") + return report + +# ================================================================ +# IMPORT REAL DATA +# ================================================================ + +def import_from_evolution(): + with open('agent-evolution/data/evolution.json') as f: + evo = json.load(f) + conn = sqlite3.connect(DB_PATH) + for name, a in evo['agents'].items(): + c = a['current'] + conn.execute('''INSERT OR REPLACE INTO agents (name, description, category, current_model, color, updated) + VALUES (?, ?, ?, ?, ?, ?)''', + (name, c.get('description', ''), c.get('category', 'General'), + c.get('model', ''), c.get('color', ''), + datetime.now(timezone.utc).isoformat())) + for mid, m in evo.get('model_benchmarks', {}).items(): + conn.execute('''INSERT OR REPLACE INTO models (short_name, full_id, if_score, swe_bench, parameters, context_window, updated) + VALUES (?, ?, ?, ?, ?, ?, ?)''', + (mid, f'ollama-cloud/{mid}', m.get('if_score'), None, + m.get('parameters', ''), m.get('context_window', ''), + datetime.now(timezone.utc).isoformat())) + conn.commit() + conn.close() + print(f"[import] {len(evo['agents'])} agents, {len(evo.get('model_benchmarks',{}))} models") + +# ================================================================ +# CLI +# ================================================================ +if __name__ == '__main__': + import argparse + p = argparse.ArgumentParser(description='Real-Fit Multi-Agent Engine') + p.add_argument('--init-db', action='store_true') + p.add_argument('--import-evolution', action='store_true') + p.add_argument('--generate-prompts', action='store_true') + p.add_argument('--evaluate', metavar='AGENT') + p.add_argument('--models', default=','.join(DEFAULT_MODELS)) + 
p.add_argument('--evaluate-all', action='store_true') + p.add_argument('--report', action='store_true') + p.add_argument('--recalc', action='store_true') + p.add_argument('--agent', help='Agent for recalc') + p.add_argument('--old-model', help='Old model for recalc') + p.add_argument('--new-model', help='New model for recalc') + p.add_argument('--workers', type=int, default=4) + args = p.parse_args() + + if args.init_db: + init_db() + if args.import_evolution: + import_from_evolution() + if args.generate_prompts: + generate_prompts() + if args.evaluate: + models = args.models.split(',') + evaluate_all({args.evaluate: models}, args.workers) + if args.evaluate_all: + models = args.models.split(',') + evaluate_all(models, args.workers) + if args.report: + generate_report() + if args.recalc and args.agent and args.old_model and args.new_model: + trigger_recalculation(args.agent, args.old_model, args.new_model) + + if len(sys.argv) == 1: + p.print_help() + print("\n=== Workflow ===") + print(" python3 real-fit-engine.py --init-db --import-evolution --generate-prompts") + print(" python3 real-fit-engine.py --evaluate-all --models kimi-k2.6,deepseek-v4-pro-max") + print(" python3 real-fit-engine.py --report") + print(" python3 real-fit-engine.py --recalc --agent lead-developer --old-model qwen3-coder:480b --new-model kimi-k2.6") + print("\nSet OLLAMA_MOCK=0 for real Ollama API (port 11434)") diff --git a/scripts/real-fit-recalc.py b/scripts/real-fit-recalc.py new file mode 100644 index 0000000..8962fa6 --- /dev/null +++ b/scripts/real-fit-recalc.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +""" +Recalculate real-fit scores from stored responses in SQLite. +No API needed. Updates evaluations, fit_scores, and generates report. 
import json
import os
import sqlite3
import sys
from datetime import datetime, timezone

DB_PATH = "agent-evolution/data/real-fit.db"
REPORT_PATH = "agent-evolution/data/real-fit-report.json"


def evaluate_response(response, expected_json, rubric_json):
    """Score a stored model response against expected keywords and a rubric.

    Three dimension scores are computed:

    * ``keyword_coverage`` - percentage of expected keywords found
      (case-insensitive substring match); 50 when there are no keywords.
    * ``code_presence`` - 80 if the text contains a code fence,
      ``function`` or ``class ``, otherwise 30.
    * ``structure`` - two points per line, capped at 100.

    Args:
        response: Response text. ``None`` (for example a NULL database
            column) is scored exactly like an empty string.
        expected_json: Expected keywords, as a JSON array string or an
            already-decoded list. ``None`` / JSON ``null`` means no keywords.
        rubric_json: Mapping of dimension name to weight, as a JSON object
            string or a dict. When empty or ``None`` the total is the plain
            mean of the three dimension scores. Rubric dimensions that are
            not one of the three above fall back to the keyword score.

    Returns:
        dict with ``scores`` (per-dimension), ``total`` (rounded to one
        decimal) and a human-readable ``explanation``.
    """
    expected = json.loads(expected_json) if isinstance(expected_json, str) else expected_json
    rubric = json.loads(rubric_json) if isinstance(rubric_json, str) else rubric_json
    expected = expected or []

    # Normalise once so every later use is None-safe, not just .lower().
    text = response or ''
    resp_lower = text.lower()
    lines = text.strip().split('\n')

    keyword_hits = sum(1 for kw in expected if kw.lower() in resp_lower)
    keyword_score = min(100, keyword_hits / len(expected) * 100) if expected else 50

    has_code = '```' in text or 'function' in resp_lower or 'class ' in resp_lower
    code_score = 80 if has_code else 30

    structure_score = min(100, len(lines) * 2)

    scores = {'keyword_coverage': round(keyword_score, 1),
              'code_presence': code_score,
              'structure': round(structure_score, 1)}

    if rubric:
        total = sum((scores.get(dim, keyword_score) / 100) * weight
                    for dim, weight in rubric.items())
    else:
        total = sum(scores.values()) / len(scores)

    explanation = (f"Keywords: {keyword_hits}/{len(expected)}. "
                   f"Lines: {len(lines)}. "
                   f"Code: {'YES' if has_code else 'NO'}. "
                   f"Total={round(total, 1)}")

    return {'scores': scores, 'total': round(total, 1), 'explanation': explanation}


def _rescore_evaluations(conn):
    """Re-score every stored evaluation in place; return how many were updated."""
    # Prompt data is resolved by agent_name (prompt_id mismatch safe).
    # NOTE(review): if test_prompts can hold several rows per agent, this
    # join yields one row per (evaluation, prompt) pair and the last pair
    # processed wins - confirm there is one prompt per agent.
    rows = conn.execute(
        '''SELECT e.id, e.agent_name, e.response, t.expected_keywords, t.rubric
           FROM evaluations e
           LEFT JOIN test_prompts t ON e.agent_name = t.agent_name''').fetchall()
    print(f"[recalc] Found {len(rows)} evaluations")

    updated = 0
    for eid, agent_name, response, expected, rubric in rows:
        if expected is None or rubric is None:
            print(f" [skip] No prompt match for eval {eid} (agent={agent_name})")
            continue

        ev = evaluate_response(response, expected, rubric)
        conn.execute('''UPDATE evaluations
                        SET total_score = ?, scores = ?, explanation = ?
                        WHERE id = ?''',
                     (ev['total'], json.dumps(ev['scores']), ev['explanation'], eid))
        updated += 1
    return updated


def _update_fit_scores(conn):
    """Store the best-scoring model per agent in fit_scores; return the agent count."""
    rows = conn.execute(
        '''SELECT agent_name, model, AVG(total_score) as avg_score
           FROM evaluations GROUP BY agent_name, model''').fetchall()

    best = {}
    for agent, model, avg in rows:
        if avg is None:
            # Every evaluation in this group is unscored (NULL total).
            continue
        if agent not in best or avg > best[agent][1]:
            best[agent] = (model, avg)

    for agent, (model, avg) in best.items():
        totals = {}
        scored = 0
        score_rows = conn.execute(
            'SELECT scores FROM evaluations WHERE agent_name = ? AND model = ?',
            (agent, model)).fetchall()
        for (score_json,) in score_rows:
            if not score_json:
                # Unscored row: nothing to average.
                continue
            for dim, value in json.loads(score_json).items():
                totals[dim] = totals.get(dim, 0) + value
            scored += 1
        dim_avg = {dim: round(value / scored, 1) for dim, value in totals.items()}

        explanation = f"Best model for {agent} is {model} with avg score {round(avg,1)}. "
        if dim_avg:
            explanation += f"Strongest dimension: {max(dim_avg, key=dim_avg.get)}."

        conn.execute('''INSERT OR REPLACE INTO fit_scores
                        (agent_name, model, fit_score, dimension_scores, explanation, evaluated_at)
                        VALUES (?, ?, ?, ?, ?, ?)''',
                     (agent, model, round(avg, 1), json.dumps(dim_avg), explanation,
                      datetime.now(timezone.utc).isoformat()))
    return len(best)


def _build_report(conn):
    """Assemble the report dict from the evaluations, agents and fit_scores tables."""
    rows = conn.execute(
        '''SELECT agent_name, model, AVG(total_score) as avg_score, COUNT(*) as cnt
           FROM evaluations GROUP BY agent_name, model''').fetchall()

    agents = {}
    for agent, model, avg, _cnt in rows:
        if avg is None:
            continue
        if agent not in agents:
            info = conn.execute(
                'SELECT description, category, current_model FROM agents WHERE name = ?',
                (agent,)).fetchone()
            agents[agent] = {'name': agent, 'evaluations': {}, 'info': info or ()}
        agents[agent]['evaluations'][model] = round(avg, 1)

    for entry in agents.values():
        evs = entry['evaluations']
        best_model = max(evs, key=evs.get)
        entry['best_model'] = best_model
        entry['best_score'] = evs[best_model]

    # Oldest first, so that if the table keeps several rows per agent the
    # most recently evaluated one is the entry that survives in the dict.
    fit_scores = {}
    for agent, model, fit, explanation in conn.execute(
            'SELECT agent_name, model, fit_score, explanation '
            'FROM fit_scores ORDER BY evaluated_at'):
        fit_scores[agent] = {'model': model, 'fit': fit, 'explanation': explanation}

    return {
        'generated': datetime.now(timezone.utc).isoformat(),
        'source': 'real-fit-engine',
        # Count evaluations, not (agent, model) groups.
        'total_evaluations': sum(cnt for _a, _m, _s, cnt in rows),
        'agents': agents,
        'fit_scores': fit_scores
    }


def recalc():
    """Recompute scores from stored responses and rewrite the report.

    Steps: re-score every evaluation that has a matching test prompt, store
    the best model per agent in ``fit_scores``, then write the JSON report
    to ``REPORT_PATH``. Exits with status 1 if ``DB_PATH`` does not exist.
    The database connection is closed even when a step raises.
    """
    if not os.path.exists(DB_PATH):
        print(f"[error] Database not found: {DB_PATH}")
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH)
    try:
        updated = _rescore_evaluations(conn)
        conn.commit()
        print(f"[recalc] Updated {updated} evaluations")

        agent_count = _update_fit_scores(conn)
        conn.commit()
        print(f"[recalc] Computed fit scores for {agent_count} agents")

        report = _build_report(conn)
    finally:
        conn.close()

    report_dir = os.path.dirname(REPORT_PATH)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding
    # instead of relying on the locale default.
    with open(REPORT_PATH, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    print(f"[recalc] Written {REPORT_PATH}: {len(report['agents'])} agents, "
          f"{report['total_evaluations']} evaluations")


if __name__ == '__main__':
    recalc()
import json
import os
from collections import Counter
from datetime import datetime, timezone


def _load_benchmark_map():
    """Return {'ollama-cloud/<id>': supplemental info} from the verified benchmarks file.

    The file is optional: when it is missing, unreadable or not valid JSON
    an empty map is returned and the dashboard is built without it.
    """
    try:
        with open('agent-evolution/data/model-benchmarks-verified.json', encoding='utf-8') as f:
            bm = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # A bare except here would also swallow KeyboardInterrupt.
        bm = {'models': []}

    bench_map = {}
    for m in bm.get('models', []):
        mid = m.get('id', '')
        if mid:
            bench_map[f'ollama-cloud/{mid}'] = {
                'swe': m.get('swe_bench'),
                'name': m.get('name', mid),
                'params': m.get('parameters', ''),
                'ctx': m.get('context_window', '')
            }
    return bench_map


def _summarize_models(agents):
    """Aggregate per-model agent counts and fit statistics from the agent list."""
    models = {}
    for a in agents:
        ms = a['model_short']
        if ms not in models:
            models[ms] = {
                'fit_agents': [],  # fit scores of agents on this model
                'name': a['model_name'],
                'params': a['model_params'],
                'ctx': a['model_ctx'],
                'agents': 0,
                'commands': 0,
                'avg_fit': 0
            }
        models[ms]['agents'] += 1
        models[ms]['fit_agents'].append(a['fit_score'])

    for m in models.values():
        # A null fit_score in the JSON arrives as None; leave it out of the
        # statistics instead of crashing sum()/min()/max().
        fits = [f for f in m['fit_agents'] if f is not None]
        m['avg_fit'] = round(sum(fits) / len(fits), 1) if fits else 0
        m['min_fit'] = min(fits) if fits else 0
        m['max_fit'] = max(fits) if fits else 0
        del m['fit_agents']
    return models


def build_dashboard_data():
    """Build the dashboard payload from evolution.json.

    Reads ``agent-evolution/data/evolution.json`` (required) and
    ``agent-evolution/data/model-benchmarks-verified.json`` (optional).
    Agents without a current model are skipped.

    Returns:
        dict with ``generated``, ``source``, ``total_agents``,
        ``total_models``, ``agents`` (list), ``models`` (per-model summary
        keyed by the short model name) and ``model_distribution`` (agent
        count keyed by the full model id).

    Raises:
        OSError: evolution.json cannot be opened.
        json.JSONDecodeError: evolution.json is not valid JSON.
    """
    with open('agent-evolution/data/evolution.json', encoding='utf-8') as f:
        evo = json.load(f)

    bench_map = _load_benchmark_map()

    agents = []
    for name, a in evo.get('agents', {}).items():
        c = a.get('current', {})
        if not c.get('model'):
            continue
        model = c['model']
        # "or" also covers keys that are present but null in the JSON.
        b = c.get('benchmark') or {}
        model_short = model.replace('ollama-cloud/', '')
        b_info = bench_map.get(model, {})

        # Most recent model change = last matching entry in the history.
        history = a.get('history') or []
        latest_change = next(
            (h for h in reversed(history) if h.get('type') == 'model_change'),
            None)

        agents.append({
            'name': name,
            'model': model,
            'mode': c.get('mode', 'subagent'),
            'description': c.get('description', ''),
            'category': c.get('category', 'General'),
            'color': c.get('color', '#8B5CF6'),
            'provider': c.get('provider', 'Ollama'),
            'variant': c.get('variant', ''),
            'fit_score': b.get('fit_score', 0),
            'instruction_following': b.get('instruction_following', 0),
            'swe_bench': b_info.get('swe'),
            'model_short': model_short,
            'model_name': b_info.get('name', model_short),
            'model_params': b_info.get('params', ''),
            'model_ctx': b_info.get('ctx', ''),
            'recommendations': len(c.get('recommendations') or []),
            'history_count': len(history),
            'latest_change': latest_change
        })

    # Model summary is derived from the real agent assignments above.
    models = _summarize_models(agents)
    model_dist = Counter(a['model'] for a in agents)

    return {
        'generated': datetime.now(timezone.utc).isoformat().replace('+00:00', 'Z'),
        'source': 'evolution.json',
        'total_agents': len(agents),
        'total_models': len(models),
        'agents': agents,
        'models': models,
        'model_distribution': dict(model_dist)
    }


def main():
    """Write dashboard-data.json and print a short summary to stdout."""
    data = build_dashboard_data()
    out = 'agent-evolution/data/dashboard-data.json'
    os.makedirs(os.path.dirname(out), exist_ok=True)
    with open(out, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f'Written {out}: {len(data["agents"])} agents, {len(data["models"])} models')
    print('Sample agents:')
    for a in data['agents'][:5]:
        # Scores may be ints, floats (e.g. 41.6) or None; the previous ":3d"
        # format raised ValueError for anything but an int. "!s:>3" prints
        # ints exactly as before.
        print(f' {a["name"]:25} fit={a["fit_score"]!s:>3} if={a["instruction_following"]!s:>3} {a["model_short"]:25}')
    print('Models:')
    for ms, m in sorted(data['models'].items(), key=lambda x: -x[1]['agents']):
        print(f' {ms:20} avg_fit={m["avg_fit"]:5.1f} agents={m["agents"]}')


if __name__ == '__main__':
    main()