{ "generated": "2026-05-28T10:48:02.581965+00:00", "source": "real-fit-engine", "total_evaluations": 147, "agents": { "agent-architect": { "name": "agent-architect", "evaluations": { "deepseek-v4-pro": 48.3, "glm-5.1": 48.3, "kimi-k2.6": 53.5, "qwen3-coder:480b": 48.3 }, "info": [ "Creates, modifies, and reviews new agents, workflows, and skills based on capability gap analysis. Tier 2 meta-agent with self-cascade enabled.", "meta", "ollama-cloud/kimi-k2.6" ], "best_model": "kimi-k2.6", "best_score": 53.5 }, "architect-indexer": { "name": "architect-indexer", "evaluations": { "deepseek-v4-pro": 43.2, "glm-5.1": 48.6, "kimi-k2.6": 46.5, "qwen3-coder:480b": 54.0 }, "info": [ "Indexes and maps project codebase architecture into .architect/ directory. Creates and maintains structured documentation of entities, APIs, DB schema, file graphs, and conventions. (GNS-2 Tier 0)", "core", "ollama-cloud/glm-5.1" ], "best_model": "qwen3-coder:480b", "best_score": 54.0 }, "backend-developer": { "name": "backend-developer", "evaluations": { "deepseek-v4-pro": 53.5, "glm-5.1": 48.3, "kimi-k2.6": 48.3, "qwen3-coder:480b": 43.2 }, "info": [ "Backend specialist for Node.js, Express, APIs, and database integration (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], "best_model": "deepseek-v4-pro", "best_score": 53.5 }, "browser-automation": { "name": "browser-automation", "evaluations": { "deepseek-v4-pro": 42.8, "glm-5.1": 53.3, "kimi-k2.6": 63.8, "qwen3-coder:480b": 48.9 }, "info": [ "Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction (GNS-2 Tier 0)", "testing", "ollama-cloud/deepseek-v4-flash" ], "best_model": "kimi-k2.6", "best_score": 63.8 }, "capability-analyst": { "name": "capability-analyst", "evaluations": { "deepseek-v4-pro": 58.7, "glm-5.1": 53.5, "kimi-k2.6": 58.7, "qwen3-coder:480b": 52.3 }, "info": [ "Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. Tier 2 meta-agent with self-cascade enabled.", "meta", "ollama-cloud/deepseek-v4-pro-max" ], "best_model": "deepseek-v4-pro", "best_score": 58.7 }, "code-skeptic": { "name": "code-skeptic", "evaluations": { "deepseek-v4-pro": 22.8, "glm-5.1": 89.1, "kimi-k2.6": 91.2, "minimax-m2.5": 45.0, "qwen3-coder:480b": 90.6 }, "info": [ "Adversarial code reviewer. Finds problems and issues. Does NOT suggest implementations (GNS-2 Tier 0)", "quality", "ollama-cloud/minimax-m2.5" ], "best_model": "kimi-k2.6", "best_score": 91.2 }, "devops-engineer": { "name": "devops-engineer", "evaluations": { "deepseek-v4-pro": 29.7, "glm-5.1": 96.2, "kimi-k2.6": 87.2, "qwen3-coder:480b": 87.2 }, "info": [ "DevOps specialist for Docker, Kubernetes, CI/CD pipeline automation, and infrastructure management (GNS-2 Tier 1)", "core", "ollama-cloud/kimi-k2.6" ], "best_model": "glm-5.1", "best_score": 96.2 }, "evaluator": { "name": "evaluator", "evaluations": { "deepseek-v4-pro": 50.6, "glm-5.1": 58.7, "kimi-k2.6": 53.5, "qwen3-coder:480b": 43.8 }, "info": [ "Scores agent effectiveness after task completion for continuous improvement. Tier 2 meta-agent with self-cascade enabled.", "meta", "ollama-cloud/qwen3.5-122b" ], "best_model": "glm-5.1", "best_score": 58.7 }, "evolution-prompt": { "name": "evolution-prompt", "evaluations": { "deepseek-v4-pro": 52.6, "glm-5.1": 44.7, "kimi-k2.6": 53.5, "qwen3-coder:480b": 21.3 }, "info": [ "Generates role-specific stress-test prompts by analyzing agent definitions", "meta", "ollama-cloud/deepseek-v4-pro-max" ], "best_model": "kimi-k2.6", "best_score": 53.5 }, "evolution-skeptic": { "name": "evolution-skeptic", "evaluations": { "deepseek-v4-pro": 33.1, "glm-5.1": 31.6, "kimi-k2.6": 37.3, "qwen3-coder:480b": 42.9 }, "info": [ "Evaluates model responses against role-specific rubrics with detailed scoring and commentary", "meta", "ollama-cloud/deepseek-v4-pro-max" ], "best_model": "qwen3-coder:480b", "best_score": 42.9 }, "flutter-developer": { "name": "flutter-developer", "evaluations": { "deepseek-v4-pro": 34.5, "glm-5.1": 54.9, "kimi-k2.6": 49.3, "qwen3-coder:480b": 54.9 }, "info": [ "Flutter mobile specialist for cross-platform apps, state management, and UI components (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], "best_model": "glm-5.1", "best_score": 54.9 }, "frontend-developer": { "name": "frontend-developer", "evaluations": { "deepseek-v4-pro": 31.6, "glm-5.1": 53.2, "kimi-k2.6": 38.8, "qwen3-coder:480b": 56.0 }, "info": [ "Handles UI implementation with multimodal capabilities. Accepts visual references like screenshots and mockups (GNS-2 Tier 1)", "core", "ollama-cloud/minimax-m2.5" ], "best_model": "qwen3-coder:480b", "best_score": 56.0 }, "go-developer": { "name": "go-developer", "evaluations": { "deepseek-v4-pro": 41.4, "glm-5.1": 53.5, "kimi-k2.6": 48.3, "qwen3-coder:480b": 58.7 }, "info": [ "Go backend specialist for Gin, Echo, APIs, and database integration (GNS-2 Tier 1)", "core", "ollama-cloud/deepseek-v4-pro-max" ], "best_model": "qwen3-coder:480b", "best_score": 58.7 }, "history-miner": { "name": "history-miner", "evaluations": { "deepseek-v4-pro": 30.1, "glm-5.1": 44.3, "kimi-k2.6": 46.9, "qwen3-coder:480b": 44.8 }, "info": [ "Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work (GNS-2 Tier 0)", "core", "ollama-cloud/qwen3.5-122b" ], "best_model": "kimi-k2.6", "best_score": 46.9 }, "incident-responder": { "name": "incident-responder", "evaluations": { "deepseek-v4-pro": 48.6, "glm-5.1": 65.6, "kimi-k2.6": 59.1, "qwen3-coder:480b": 56.4 }, "info": [ "Server incident response and system hardening specialist. Handles live forensics, malware removal, persistence hunting, SSH-based server cleanup, and post-incident hardening. Works with any OS and panel.", "core", "ollama-cloud/kimi-k2.6" ], "best_model": "glm-5.1", "best_score": 65.6 }, "lead-developer": { "name": "lead-developer", "evaluations": { "deepseek-v4-pro": 28.7, "glm-5.1": 68.8, "kimi-k2.6": 72.5, "qwen3-coder:480b": 72.5 }, "info": [ "Primary code writer for backend and core logic. Writes implementation to pass tests (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], "best_model": "kimi-k2.6", "best_score": 72.5 }, "markdown-validator": { "name": "markdown-validator", "evaluations": { "deepseek-v4-pro": 39.0, "glm-5.1": 37.2, "kimi-k2.6": 24.0, "qwen3-coder:480b": 47.4 }, "info": [ "Validates and corrects Markdown descriptions for Gitea issues (GNS-2 Tier 0)", "meta", "ollama-cloud/nemotron-3-nano" ], "best_model": "qwen3-coder:480b", "best_score": 47.4 }, "memory-manager": { "name": "memory-manager", "evaluations": { "deepseek-v4-pro": 35.8, "glm-5.1": 48.3, "kimi-k2.6": 41.5, "qwen3-coder:480b": 46.8 }, "info": [ "Manages agent memory systems - short-term (context), long-term (vector store), and episodic (experiences) (GNS-2 Tier 0)", "cognitive", "ollama-cloud/deepseek-v4-pro-max" ], "best_model": "glm-5.1", "best_score": 48.3 }, "orchestrator": { "name": "orchestrator", "evaluations": { "deepseek-v4-flash": 27.0, "deepseek-v4-pro": 19.6, "glm-5.1": 36.2, "kimi-k2.6": 40.0, "minimax-m2.5": 36.3, "qwen3-coder:480b": 39.1 }, "info": [ "Main dispatcher. Routes tasks between agents based on Issue status and manages the workflow state machine. IF:90 for optimal routing accuracy. (GNS-2 Tier 1)", "meta", "ollama-cloud/kimi-k2.6" ], "best_model": "kimi-k2.6", "best_score": 40.0 }, "performance-engineer": { "name": "performance-engineer", "evaluations": { "deepseek-v4-pro": 27.9, "glm-5.1": 63.8, "kimi-k2.6": 34.3, "qwen3-coder:480b": 36.3 }, "info": [ "Reviews code for performance issues. Focuses on efficiency, N+1 queries, memory leaks, and algorithmic complexity (GNS-2 Tier 0)", "quality", "ollama-cloud/deepseek-v4-pro-max" ], "best_model": "glm-5.1", "best_score": 63.8 }, "php-developer": { "name": "php-developer", "evaluations": { "deepseek-v4-pro": 53.5, "glm-5.1": 48.3, "kimi-k2.6": 48.3, "qwen3-coder:480b": 48.3 }, "info": [ "PHP backend specialist for Laravel, Symfony, WordPress, and full-stack web applications (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], "best_model": "deepseek-v4-pro", "best_score": 53.5 }, "pipeline-judge": { "name": "pipeline-judge", "evaluations": { "deepseek-v4-pro": 34.6, "glm-5.1": 45.6, "kimi-k2.6": 46.5, "qwen3-coder:480b": 52.9 }, "info": [ "Automated pipeline judge. Evaluates workflow execution by running tests, measuring token cost and wall-clock time. Produces objective fitness scores. Never writes code - only measures and scores. (GNS-2 Tier 0)", "meta", "ollama-cloud/kimi-k2.6" ], "best_model": "qwen3-coder:480b", "best_score": 52.9 }, "planner": { "name": "planner", "evaluations": { "deepseek-v4-pro": 41.7, "glm-5.1": 31.8, "kimi-k2.6": 34.6, "qwen3-coder:480b": 33.7 }, "info": [ "Advanced task planner using Chain of Thought, Tree of Thoughts, and Plan-Execute-Reflect (GNS-2 Tier 0)", "cognitive", "ollama-cloud/deepseek-v4-pro-max" ], "best_model": "deepseek-v4-pro", "best_score": 41.7 }, "product-owner": { "name": "product-owner", "evaluations": { "deepseek-v4-pro": 27.0, "glm-5.1": 33.4, "kimi-k2.6": 34.6, "qwen3-coder:480b": 27.0 }, "info": [ "Manages issue checklists, status labels, tracks progress and coordinates with human users (GNS-2 Tier 1)", "meta", "ollama-cloud/glm-5.1" ], "best_model": "kimi-k2.6", "best_score": 34.6 }, "prompt-optimizer": { "name": "prompt-optimizer", "evaluations": { "deepseek-v4-pro": 27.0, "glm-5.1": 48.3, "kimi-k2.6": 33.0, "qwen3-coder:480b": 31.8 }, "info": [ "Improves agent system prompts based on performance failures. Meta-learner for prompt optimization (GNS-2 Tier 1)", "meta", "ollama-cloud/qwen3.5-122b" ], "best_model": "glm-5.1", "best_score": 48.3 }, "python-developer": { "name": "python-developer", "evaluations": { "deepseek-v4-pro": 48.3, "glm-5.1": 48.3, "kimi-k2.6": 48.3, "qwen3-coder:480b": 48.3 }, "info": [ "Python backend specialist for Django, FastAPI, data science, and API development (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], "best_model": "deepseek-v4-pro", "best_score": 48.3 }, "reflector": { "name": "reflector", "evaluations": { "deepseek-v4-pro": 43.2, "glm-5.1": 53.5, "kimi-k2.6": 58.7, "qwen3-coder:480b": 20.9 }, "info": [ "Self-reflection agent using Reflexion pattern - learns from mistakes (GNS-2 Tier 0)", "cognitive", "ollama-cloud/deepseek-v4-pro-max" ], "best_model": "kimi-k2.6", "best_score": 58.7 }, "release-manager": { "name": "release-manager", "evaluations": { "deepseek-v4-pro": 23.7, "glm-5.1": 38.0, "kimi-k2.6": 50.2, "qwen3-coder:480b": 41.7 }, "info": [ "Manages git operations, semantic versioning, branching, and deployments. Ensures clean history (GNS-2 Tier 1)", "meta", "ollama-cloud/kimi-k2.6" ], "best_model": "kimi-k2.6", "best_score": 50.2 }, "requirement-refiner": { "name": "requirement-refiner", "evaluations": { "deepseek-v4-pro": 30.3, "glm-5.1": 31.0, "kimi-k2.6": 31.2, "qwen3-coder:480b": 45.3 }, "info": [ "Converts vague ideas and bug reports into strict User Stories with acceptance criteria checklists (GNS-2 Tier 1)", "core", "ollama-cloud/kimi-k2-thinking" ], "best_model": "qwen3-coder:480b", "best_score": 45.3 }, "sdet-engineer": { "name": "sdet-engineer", "evaluations": { "deepseek-v4-pro": 58.7, "glm-5.1": 86.0, "kimi-k2.6": 97.0, "qwen3-coder:480b": 97.0 }, "info": [ "Writes tests following TDD methodology. Tests MUST fail initially (Red phase) (GNS-2 Tier 1)", "core", "ollama-cloud/qwen3-coder:480b" ], "best_model": "kimi-k2.6", "best_score": 97.0 }, "security-auditor": { "name": "security-auditor", "evaluations": { "deepseek-v4-pro": 46.4, "glm-5.1": 58.7, "kimi-k2.6": 63.8, "qwen3-coder:480b": 41.5 }, "info": [ "Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets (GNS-2 Tier 0)", "quality", "ollama-cloud/deepseek-v4-pro-max" ], "best_model": "kimi-k2.6", "best_score": 63.8 }, "system-analyst": { "name": "system-analyst", "evaluations": { "deepseek-v4-pro": 56.4, "glm-5.1": 87.0, "kimi-k2.6": 92.0, "qwen3-coder:480b": 77.0 }, "info": [ "Designs technical specifications, data schemas, and API contracts before implementation (GNS-2 Tier 1)", "core", "ollama-cloud/deepseek-v4-pro-max" ], "best_model": "kimi-k2.6", "best_score": 92.0 }, "the-fixer": { "name": "the-fixer", "evaluations": { "deepseek-v4-pro": 43.6, "glm-5.1": 46.6, "kimi-k2.6": 36.4, "qwen3-coder:480b": 42.9 }, "info": [ "Iteratively fixes bugs based on specific error reports and test failures (GNS-2 Tier 1)", "quality", "ollama-cloud/kimi-k2.6" ], "best_model": "glm-5.1", "best_score": 46.6 }, "visual-tester": { "name": "visual-tester", "evaluations": { "deepseek-v4-pro": 47.3, "glm-5.1": 58.7, "kimi-k2.6": 53.5, "qwen3-coder:480b": 53.5 }, "info": [ "Visual regression testing agent that compares screenshots and detects UI differences using pixelmatch and image diff (GNS-2 Tier 0)", "quality", "ollama-cloud/qwen3-coder:480b" ], "best_model": "glm-5.1", "best_score": 58.7 }, "workflow-architect": { "name": "workflow-architect", "evaluations": { "deepseek-v4-pro": 36.3, "glm-5.1": 48.3, "kimi-k2.6": 48.3, "qwen3-coder:480b": 36.3 }, "info": [ "Creates and maintains workflow definitions with complete architecture, Gitea integration, and quality gates (GNS-2 Tier 1)", "meta", "ollama-cloud/qwen3.5-122b" ], "best_model": "glm-5.1", "best_score": 48.3 }, "workflow-cross-checker": { "name": "workflow-cross-checker", "evaluations": { "deepseek-v4-pro": 54.2, "glm-5.1": 63.3, "kimi-k2.6": 52.1, "qwen3-coder:480b": 65.6 }, "info": [ "Workflow cross-checker and process inspector. Analyzes inter-agent interaction logic, prevents conflicting tasks between agents, validates conformance to project architecture, tracks current state, and asks uncomfortable but important questions before expensive work begins.", "meta", "ollama-cloud/kimi-k2.6" ], "best_model": "qwen3-coder:480b", "best_score": 65.6 } }, "fit_scores": { "agent-architect": { "model": "kimi-k2.6", "fit": 53.5, "explanation": "Best model for agent-architect is kimi-k2.6 with avg score 53.5. Strongest dimension: code_presence." }, "architect-indexer": { "model": "qwen3-coder:480b", "fit": 54.0, "explanation": "Best model for architect-indexer is qwen3-coder:480b with avg score 54.0. Strongest dimension: code_presence." }, "backend-developer": { "model": "deepseek-v4-pro", "fit": 53.5, "explanation": "Best model for backend-developer is deepseek-v4-pro with avg score 53.5. Strongest dimension: code_presence." }, "browser-automation": { "model": "kimi-k2.6", "fit": 63.8, "explanation": "Best model for browser-automation is kimi-k2.6 with avg score 63.8. Strongest dimension: code_presence." }, "capability-analyst": { "model": "deepseek-v4-pro", "fit": 58.7, "explanation": "Best model for capability-analyst is deepseek-v4-pro with avg score 58.7. Strongest dimension: code_presence." }, "code-skeptic": { "model": "kimi-k2.6", "fit": 91.2, "explanation": "Best model for code-skeptic is kimi-k2.6 with avg score 91.2. Strongest dimension: code_presence." }, "devops-engineer": { "model": "glm-5.1", "fit": 96.2, "explanation": "Best model for devops-engineer is glm-5.1 with avg score 96.2. Strongest dimension: keyword_coverage." }, "evaluator": { "model": "glm-5.1", "fit": 58.7, "explanation": "Best model for evaluator is glm-5.1 with avg score 58.7. Strongest dimension: code_presence." }, "evolution-prompt": { "model": "kimi-k2.6", "fit": 53.5, "explanation": "Best model for evolution-prompt is kimi-k2.6 with avg score 53.5. Strongest dimension: code_presence." }, "evolution-skeptic": { "model": "qwen3-coder:480b", "fit": 42.9, "explanation": "Best model for evolution-skeptic is qwen3-coder:480b with avg score 42.9. Strongest dimension: structure." }, "flutter-developer": { "model": "glm-5.1", "fit": 54.9, "explanation": "Best model for flutter-developer is glm-5.1 with avg score 54.9. Strongest dimension: code_presence." }, "frontend-developer": { "model": "qwen3-coder:480b", "fit": 56.0, "explanation": "Best model for frontend-developer is qwen3-coder:480b with avg score 56.0. Strongest dimension: code_presence." }, "go-developer": { "model": "qwen3-coder:480b", "fit": 58.7, "explanation": "Best model for go-developer is qwen3-coder:480b with avg score 58.7. Strongest dimension: code_presence." }, "history-miner": { "model": "kimi-k2.6", "fit": 46.9, "explanation": "Best model for history-miner is kimi-k2.6 with avg score 46.9. Strongest dimension: code_presence." }, "incident-responder": { "model": "glm-5.1", "fit": 65.6, "explanation": "Best model for incident-responder is glm-5.1 with avg score 65.6. Strongest dimension: code_presence." }, "lead-developer": { "model": "kimi-k2.6", "fit": 72.5, "explanation": "Best model for lead-developer is kimi-k2.6 with avg score 72.5. Strongest dimension: keyword_coverage." }, "markdown-validator": { "model": "qwen3-coder:480b", "fit": 47.4, "explanation": "Best model for markdown-validator is qwen3-coder:480b with avg score 47.4. Strongest dimension: code_presence." }, "memory-manager": { "model": "glm-5.1", "fit": 48.3, "explanation": "Best model for memory-manager is glm-5.1 with avg score 48.3. Strongest dimension: code_presence." }, "orchestrator": { "model": "kimi-k2.6", "fit": 40.0, "explanation": "Best model for orchestrator is kimi-k2.6 with avg score 40.0. Strongest dimension: code_presence." }, "performance-engineer": { "model": "glm-5.1", "fit": 63.8, "explanation": "Best model for performance-engineer is glm-5.1 with avg score 63.8. Strongest dimension: code_presence." }, "php-developer": { "model": "deepseek-v4-pro", "fit": 53.5, "explanation": "Best model for php-developer is deepseek-v4-pro with avg score 53.5. Strongest dimension: code_presence." }, "pipeline-judge": { "model": "qwen3-coder:480b", "fit": 52.9, "explanation": "Best model for pipeline-judge is qwen3-coder:480b with avg score 52.9. Strongest dimension: code_presence." }, "planner": { "model": "deepseek-v4-pro", "fit": 41.7, "explanation": "Best model for planner is deepseek-v4-pro with avg score 41.7. Strongest dimension: code_presence." }, "product-owner": { "model": "kimi-k2.6", "fit": 34.6, "explanation": "Best model for product-owner is kimi-k2.6 with avg score 34.6. Strongest dimension: actionability." }, "prompt-optimizer": { "model": "glm-5.1", "fit": 48.3, "explanation": "Best model for prompt-optimizer is glm-5.1 with avg score 48.3. Strongest dimension: code_presence." }, "python-developer": { "model": "deepseek-v4-pro", "fit": 48.3, "explanation": "Best model for python-developer is deepseek-v4-pro with avg score 48.3. Strongest dimension: code_presence." }, "reflector": { "model": "kimi-k2.6", "fit": 58.7, "explanation": "Best model for reflector is kimi-k2.6 with avg score 58.7. Strongest dimension: code_presence." }, "release-manager": { "model": "kimi-k2.6", "fit": 50.2, "explanation": "Best model for release-manager is kimi-k2.6 with avg score 50.2. Strongest dimension: code_presence." }, "requirement-refiner": { "model": "qwen3-coder:480b", "fit": 45.3, "explanation": "Best model for requirement-refiner is qwen3-coder:480b with avg score 45.3. Strongest dimension: code_presence." }, "sdet-engineer": { "model": "kimi-k2.6", "fit": 97.0, "explanation": "Best model for sdet-engineer is kimi-k2.6 with avg score 97.0. Strongest dimension: keyword_coverage." }, "security-auditor": { "model": "kimi-k2.6", "fit": 63.8, "explanation": "Best model for security-auditor is kimi-k2.6 with avg score 63.8. Strongest dimension: code_presence." }, "system-analyst": { "model": "kimi-k2.6", "fit": 92.0, "explanation": "Best model for system-analyst is kimi-k2.6 with avg score 92.0. Strongest dimension: keyword_coverage." }, "the-fixer": { "model": "glm-5.1", "fit": 46.6, "explanation": "Best model for the-fixer is glm-5.1 with avg score 46.6. Strongest dimension: code_presence." }, "visual-tester": { "model": "glm-5.1", "fit": 58.7, "explanation": "Best model for visual-tester is glm-5.1 with avg score 58.7. Strongest dimension: code_presence." }, "workflow-architect": { "model": "glm-5.1", "fit": 48.3, "explanation": "Best model for workflow-architect is glm-5.1 with avg score 48.3. Strongest dimension: code_presence." }, "workflow-cross-checker": { "model": "qwen3-coder:480b", "fit": 65.6, "explanation": "Best model for workflow-cross-checker is qwen3-coder:480b with avg score 65.6. Strongest dimension: code_presence." } } }