- real-fit.html: API-driven research dashboard with agent/model heatmap, detail modal with score breakdown and evaluator commentary - api.py: FastAPI backend serving /api/real-fit-report (dynamic from SQLite), /api/research, /api/evolve-agent/start - rebuild-report.py: generates real-fit-report.json from SQLite DB for static fallback - docker-compose.yml: add evolution-api service (Python 3.12, uvicorn) for research endpoints - index.standalone.html: sync with dashboard data updates - archive/index.html: standalone dashboard snapshot (263KB) - .gitignore: exclude *.db, research-jobs.json from tracking
768 lines
25 KiB
JSON
768 lines
25 KiB
JSON
{
|
|
"generated": "2026-05-28T10:48:02.581965+00:00",
|
|
"source": "real-fit-engine",
|
|
"total_evaluations": 147,
|
|
"agents": {
|
|
"agent-architect": {
|
|
"name": "agent-architect",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 48.3,
|
|
"glm-5.1": 48.3,
|
|
"kimi-k2.6": 53.5,
|
|
"qwen3-coder:480b": 48.3
|
|
},
|
|
"info": [
|
|
"Creates, modifies, and reviews new agents, workflows, and skills based on capability gap analysis. Tier 2 meta-agent with self-cascade enabled.",
|
|
"meta",
|
|
"ollama-cloud/kimi-k2.6"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 53.5
|
|
},
|
|
"architect-indexer": {
|
|
"name": "architect-indexer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 43.2,
|
|
"glm-5.1": 48.6,
|
|
"kimi-k2.6": 46.5,
|
|
"qwen3-coder:480b": 54.0
|
|
},
|
|
"info": [
|
|
"Indexes and maps project codebase architecture into .architect/ directory. Creates and maintains structured documentation of entities, APIs, DB schema, file graphs, and conventions. (GNS-2 Tier 0)",
|
|
"core",
|
|
"ollama-cloud/glm-5.1"
|
|
],
|
|
"best_model": "qwen3-coder:480b",
|
|
"best_score": 54.0
|
|
},
|
|
"backend-developer": {
|
|
"name": "backend-developer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 53.5,
|
|
"glm-5.1": 48.3,
|
|
"kimi-k2.6": 48.3,
|
|
"qwen3-coder:480b": 43.2
|
|
},
|
|
"info": [
|
|
"Backend specialist for Node.js, Express, APIs, and database integration (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/qwen3-coder:480b"
|
|
],
|
|
"best_model": "deepseek-v4-pro",
|
|
"best_score": 53.5
|
|
},
|
|
"browser-automation": {
|
|
"name": "browser-automation",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 42.8,
|
|
"glm-5.1": 53.3,
|
|
"kimi-k2.6": 63.8,
|
|
"qwen3-coder:480b": 48.9
|
|
},
|
|
"info": [
|
|
"Browser automation agent using Playwright MCP for E2E testing, form filling, navigation, and web interaction (GNS-2 Tier 0)",
|
|
"testing",
|
|
"ollama-cloud/deepseek-v4-flash"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 63.8
|
|
},
|
|
"capability-analyst": {
|
|
"name": "capability-analyst",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 58.7,
|
|
"glm-5.1": 53.5,
|
|
"kimi-k2.6": 58.7,
|
|
"qwen3-coder:480b": 52.3
|
|
},
|
|
"info": [
|
|
"Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. Tier 2 meta-agent with self-cascade enabled.",
|
|
"meta",
|
|
"ollama-cloud/deepseek-v4-pro-max"
|
|
],
|
|
"best_model": "deepseek-v4-pro",
|
|
"best_score": 58.7
|
|
},
|
|
"code-skeptic": {
|
|
"name": "code-skeptic",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 22.8,
|
|
"glm-5.1": 89.1,
|
|
"kimi-k2.6": 91.2,
|
|
"minimax-m2.5": 45.0,
|
|
"qwen3-coder:480b": 90.6
|
|
},
|
|
"info": [
|
|
"Adversarial code reviewer. Finds problems and issues. Does NOT suggest implementations (GNS-2 Tier 0)",
|
|
"quality",
|
|
"ollama-cloud/minimax-m2.5"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 91.2
|
|
},
|
|
"devops-engineer": {
|
|
"name": "devops-engineer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 29.7,
|
|
"glm-5.1": 96.2,
|
|
"kimi-k2.6": 87.2,
|
|
"qwen3-coder:480b": 87.2
|
|
},
|
|
"info": [
|
|
"DevOps specialist for Docker, Kubernetes, CI/CD pipeline automation, and infrastructure management (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/kimi-k2.6"
|
|
],
|
|
"best_model": "glm-5.1",
|
|
"best_score": 96.2
|
|
},
|
|
"evaluator": {
|
|
"name": "evaluator",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 50.6,
|
|
"glm-5.1": 58.7,
|
|
"kimi-k2.6": 53.5,
|
|
"qwen3-coder:480b": 43.8
|
|
},
|
|
"info": [
|
|
"Scores agent effectiveness after task completion for continuous improvement. Tier 2 meta-agent with self-cascade enabled.",
|
|
"meta",
|
|
"ollama-cloud/qwen3.5-122b"
|
|
],
|
|
"best_model": "glm-5.1",
|
|
"best_score": 58.7
|
|
},
|
|
"evolution-prompt": {
|
|
"name": "evolution-prompt",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 52.6,
|
|
"glm-5.1": 44.7,
|
|
"kimi-k2.6": 53.5,
|
|
"qwen3-coder:480b": 21.3
|
|
},
|
|
"info": [
|
|
"Generates role-specific stress-test prompts by analyzing agent definitions",
|
|
"meta",
|
|
"ollama-cloud/deepseek-v4-pro-max"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 53.5
|
|
},
|
|
"evolution-skeptic": {
|
|
"name": "evolution-skeptic",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 33.1,
|
|
"glm-5.1": 31.6,
|
|
"kimi-k2.6": 37.3,
|
|
"qwen3-coder:480b": 42.9
|
|
},
|
|
"info": [
|
|
"Evaluates model responses against role-specific rubrics with detailed scoring and commentary",
|
|
"meta",
|
|
"ollama-cloud/deepseek-v4-pro-max"
|
|
],
|
|
"best_model": "qwen3-coder:480b",
|
|
"best_score": 42.9
|
|
},
|
|
"flutter-developer": {
|
|
"name": "flutter-developer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 34.5,
|
|
"glm-5.1": 54.9,
|
|
"kimi-k2.6": 49.3,
|
|
"qwen3-coder:480b": 54.9
|
|
},
|
|
"info": [
|
|
"Flutter mobile specialist for cross-platform apps, state management, and UI components (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/qwen3-coder:480b"
|
|
],
|
|
"best_model": "glm-5.1",
|
|
"best_score": 54.9
|
|
},
|
|
"frontend-developer": {
|
|
"name": "frontend-developer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 31.6,
|
|
"glm-5.1": 53.2,
|
|
"kimi-k2.6": 38.8,
|
|
"qwen3-coder:480b": 56.0
|
|
},
|
|
"info": [
|
|
"Handles UI implementation with multimodal capabilities. Accepts visual references like screenshots and mockups (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/minimax-m2.5"
|
|
],
|
|
"best_model": "qwen3-coder:480b",
|
|
"best_score": 56.0
|
|
},
|
|
"go-developer": {
|
|
"name": "go-developer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 41.4,
|
|
"glm-5.1": 53.5,
|
|
"kimi-k2.6": 48.3,
|
|
"qwen3-coder:480b": 58.7
|
|
},
|
|
"info": [
|
|
"Go backend specialist for Gin, Echo, APIs, and database integration (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/deepseek-v4-pro-max"
|
|
],
|
|
"best_model": "qwen3-coder:480b",
|
|
"best_score": 58.7
|
|
},
|
|
"history-miner": {
|
|
"name": "history-miner",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 30.1,
|
|
"glm-5.1": 44.3,
|
|
"kimi-k2.6": 46.9,
|
|
"qwen3-coder:480b": 44.8
|
|
},
|
|
"info": [
|
|
"Analyzes git history to find duplicates and past solutions, preventing regression and duplicate work (GNS-2 Tier 0)",
|
|
"core",
|
|
"ollama-cloud/qwen3.5-122b"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 46.9
|
|
},
|
|
"incident-responder": {
|
|
"name": "incident-responder",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 48.6,
|
|
"glm-5.1": 65.6,
|
|
"kimi-k2.6": 59.1,
|
|
"qwen3-coder:480b": 56.4
|
|
},
|
|
"info": [
|
|
"Server incident response and system hardening specialist. Handles live forensics, malware removal, persistence hunting, SSH-based server cleanup, and post-incident hardening. Works with any OS and panel.",
|
|
"core",
|
|
"ollama-cloud/kimi-k2.6"
|
|
],
|
|
"best_model": "glm-5.1",
|
|
"best_score": 65.6
|
|
},
|
|
"lead-developer": {
|
|
"name": "lead-developer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 28.7,
|
|
"glm-5.1": 68.8,
|
|
"kimi-k2.6": 72.5,
|
|
"qwen3-coder:480b": 72.5
|
|
},
|
|
"info": [
|
|
"Primary code writer for backend and core logic. Writes implementation to pass tests (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/qwen3-coder:480b"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 72.5
|
|
},
|
|
"markdown-validator": {
|
|
"name": "markdown-validator",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 39.0,
|
|
"glm-5.1": 37.2,
|
|
"kimi-k2.6": 24.0,
|
|
"qwen3-coder:480b": 47.4
|
|
},
|
|
"info": [
|
|
"Validates and corrects Markdown descriptions for Gitea issues (GNS-2 Tier 0)",
|
|
"meta",
|
|
"ollama-cloud/nemotron-3-nano"
|
|
],
|
|
"best_model": "qwen3-coder:480b",
|
|
"best_score": 47.4
|
|
},
|
|
"memory-manager": {
|
|
"name": "memory-manager",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 35.8,
|
|
"glm-5.1": 48.3,
|
|
"kimi-k2.6": 41.5,
|
|
"qwen3-coder:480b": 46.8
|
|
},
|
|
"info": [
|
|
"Manages agent memory systems - short-term (context), long-term (vector store), and episodic (experiences) (GNS-2 Tier 0)",
|
|
"cognitive",
|
|
"ollama-cloud/deepseek-v4-pro-max"
|
|
],
|
|
"best_model": "glm-5.1",
|
|
"best_score": 48.3
|
|
},
|
|
"orchestrator": {
|
|
"name": "orchestrator",
|
|
"evaluations": {
|
|
"deepseek-v4-flash": 27.0,
|
|
"deepseek-v4-pro": 19.6,
|
|
"glm-5.1": 36.2,
|
|
"kimi-k2.6": 40.0,
|
|
"minimax-m2.5": 36.3,
|
|
"qwen3-coder:480b": 39.1
|
|
},
|
|
"info": [
|
|
"Main dispatcher. Routes tasks between agents based on Issue status and manages the workflow state machine. IF:90 for optimal routing accuracy. (GNS-2 Tier 1)",
|
|
"meta",
|
|
"ollama-cloud/kimi-k2.6"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 40.0
|
|
},
|
|
"performance-engineer": {
|
|
"name": "performance-engineer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 27.9,
|
|
"glm-5.1": 63.8,
|
|
"kimi-k2.6": 34.3,
|
|
"qwen3-coder:480b": 36.3
|
|
},
|
|
"info": [
|
|
"Reviews code for performance issues. Focuses on efficiency, N+1 queries, memory leaks, and algorithmic complexity (GNS-2 Tier 0)",
|
|
"quality",
|
|
"ollama-cloud/deepseek-v4-pro-max"
|
|
],
|
|
"best_model": "glm-5.1",
|
|
"best_score": 63.8
|
|
},
|
|
"php-developer": {
|
|
"name": "php-developer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 53.5,
|
|
"glm-5.1": 48.3,
|
|
"kimi-k2.6": 48.3,
|
|
"qwen3-coder:480b": 48.3
|
|
},
|
|
"info": [
|
|
"PHP backend specialist for Laravel, Symfony, WordPress, and full-stack web applications (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/qwen3-coder:480b"
|
|
],
|
|
"best_model": "deepseek-v4-pro",
|
|
"best_score": 53.5
|
|
},
|
|
"pipeline-judge": {
|
|
"name": "pipeline-judge",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 34.6,
|
|
"glm-5.1": 45.6,
|
|
"kimi-k2.6": 46.5,
|
|
"qwen3-coder:480b": 52.9
|
|
},
|
|
"info": [
|
|
"Automated pipeline judge. Evaluates workflow execution by running tests, measuring token cost and wall-clock time. Produces objective fitness scores. Never writes code - only measures and scores. (GNS-2 Tier 0)",
|
|
"meta",
|
|
"ollama-cloud/kimi-k2.6"
|
|
],
|
|
"best_model": "qwen3-coder:480b",
|
|
"best_score": 52.9
|
|
},
|
|
"planner": {
|
|
"name": "planner",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 41.7,
|
|
"glm-5.1": 31.8,
|
|
"kimi-k2.6": 34.6,
|
|
"qwen3-coder:480b": 33.7
|
|
},
|
|
"info": [
|
|
"Advanced task planner using Chain of Thought, Tree of Thoughts, and Plan-Execute-Reflect (GNS-2 Tier 0)",
|
|
"cognitive",
|
|
"ollama-cloud/deepseek-v4-pro-max"
|
|
],
|
|
"best_model": "deepseek-v4-pro",
|
|
"best_score": 41.7
|
|
},
|
|
"product-owner": {
|
|
"name": "product-owner",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 27.0,
|
|
"glm-5.1": 33.4,
|
|
"kimi-k2.6": 34.6,
|
|
"qwen3-coder:480b": 27.0
|
|
},
|
|
"info": [
|
|
"Manages issue checklists, status labels, tracks progress and coordinates with human users (GNS-2 Tier 1)",
|
|
"meta",
|
|
"ollama-cloud/glm-5.1"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 34.6
|
|
},
|
|
"prompt-optimizer": {
|
|
"name": "prompt-optimizer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 27.0,
|
|
"glm-5.1": 48.3,
|
|
"kimi-k2.6": 33.0,
|
|
"qwen3-coder:480b": 31.8
|
|
},
|
|
"info": [
|
|
"Improves agent system prompts based on performance failures. Meta-learner for prompt optimization (GNS-2 Tier 1)",
|
|
"meta",
|
|
"ollama-cloud/qwen3.5-122b"
|
|
],
|
|
"best_model": "glm-5.1",
|
|
"best_score": 48.3
|
|
},
|
|
"python-developer": {
|
|
"name": "python-developer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 48.3,
|
|
"glm-5.1": 48.3,
|
|
"kimi-k2.6": 48.3,
|
|
"qwen3-coder:480b": 48.3
|
|
},
|
|
"info": [
|
|
"Python backend specialist for Django, FastAPI, data science, and API development (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/qwen3-coder:480b"
|
|
],
|
|
"best_model": "deepseek-v4-pro",
|
|
"best_score": 48.3
|
|
},
|
|
"reflector": {
|
|
"name": "reflector",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 43.2,
|
|
"glm-5.1": 53.5,
|
|
"kimi-k2.6": 58.7,
|
|
"qwen3-coder:480b": 20.9
|
|
},
|
|
"info": [
|
|
"Self-reflection agent using Reflexion pattern - learns from mistakes (GNS-2 Tier 0)",
|
|
"cognitive",
|
|
"ollama-cloud/deepseek-v4-pro-max"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 58.7
|
|
},
|
|
"release-manager": {
|
|
"name": "release-manager",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 23.7,
|
|
"glm-5.1": 38.0,
|
|
"kimi-k2.6": 50.2,
|
|
"qwen3-coder:480b": 41.7
|
|
},
|
|
"info": [
|
|
"Manages git operations, semantic versioning, branching, and deployments. Ensures clean history (GNS-2 Tier 1)",
|
|
"meta",
|
|
"ollama-cloud/kimi-k2.6"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 50.2
|
|
},
|
|
"requirement-refiner": {
|
|
"name": "requirement-refiner",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 30.3,
|
|
"glm-5.1": 31.0,
|
|
"kimi-k2.6": 31.2,
|
|
"qwen3-coder:480b": 45.3
|
|
},
|
|
"info": [
|
|
"Converts vague ideas and bug reports into strict User Stories with acceptance criteria checklists (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/kimi-k2-thinking"
|
|
],
|
|
"best_model": "qwen3-coder:480b",
|
|
"best_score": 45.3
|
|
},
|
|
"sdet-engineer": {
|
|
"name": "sdet-engineer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 58.7,
|
|
"glm-5.1": 86.0,
|
|
"kimi-k2.6": 97.0,
|
|
"qwen3-coder:480b": 97.0
|
|
},
|
|
"info": [
|
|
"Writes tests following TDD methodology. Tests MUST fail initially (Red phase) (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/qwen3-coder:480b"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 97.0
|
|
},
|
|
"security-auditor": {
|
|
"name": "security-auditor",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 46.4,
|
|
"glm-5.1": 58.7,
|
|
"kimi-k2.6": 63.8,
|
|
"qwen3-coder:480b": 41.5
|
|
},
|
|
"info": [
|
|
"Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets (GNS-2 Tier 0)",
|
|
"quality",
|
|
"ollama-cloud/deepseek-v4-pro-max"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 63.8
|
|
},
|
|
"system-analyst": {
|
|
"name": "system-analyst",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 56.4,
|
|
"glm-5.1": 87.0,
|
|
"kimi-k2.6": 92.0,
|
|
"qwen3-coder:480b": 77.0
|
|
},
|
|
"info": [
|
|
"Designs technical specifications, data schemas, and API contracts before implementation (GNS-2 Tier 1)",
|
|
"core",
|
|
"ollama-cloud/deepseek-v4-pro-max"
|
|
],
|
|
"best_model": "kimi-k2.6",
|
|
"best_score": 92.0
|
|
},
|
|
"the-fixer": {
|
|
"name": "the-fixer",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 43.6,
|
|
"glm-5.1": 46.6,
|
|
"kimi-k2.6": 36.4,
|
|
"qwen3-coder:480b": 42.9
|
|
},
|
|
"info": [
|
|
"Iteratively fixes bugs based on specific error reports and test failures (GNS-2 Tier 1)",
|
|
"quality",
|
|
"ollama-cloud/kimi-k2.6"
|
|
],
|
|
"best_model": "glm-5.1",
|
|
"best_score": 46.6
|
|
},
|
|
"visual-tester": {
|
|
"name": "visual-tester",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 47.3,
|
|
"glm-5.1": 58.7,
|
|
"kimi-k2.6": 53.5,
|
|
"qwen3-coder:480b": 53.5
|
|
},
|
|
"info": [
|
|
"Visual regression testing agent that compares screenshots and detects UI differences using pixelmatch and image diff (GNS-2 Tier 0)",
|
|
"quality",
|
|
"ollama-cloud/qwen3-coder:480b"
|
|
],
|
|
"best_model": "glm-5.1",
|
|
"best_score": 58.7
|
|
},
|
|
"workflow-architect": {
|
|
"name": "workflow-architect",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 36.3,
|
|
"glm-5.1": 48.3,
|
|
"kimi-k2.6": 48.3,
|
|
"qwen3-coder:480b": 36.3
|
|
},
|
|
"info": [
|
|
"Creates and maintains workflow definitions with complete architecture, Gitea integration, and quality gates (GNS-2 Tier 1)",
|
|
"meta",
|
|
"ollama-cloud/qwen3.5-122b"
|
|
],
|
|
"best_model": "glm-5.1",
|
|
"best_score": 48.3
|
|
},
|
|
"workflow-cross-checker": {
|
|
"name": "workflow-cross-checker",
|
|
"evaluations": {
|
|
"deepseek-v4-pro": 54.2,
|
|
"glm-5.1": 63.3,
|
|
"kimi-k2.6": 52.1,
|
|
"qwen3-coder:480b": 65.6
|
|
},
|
|
"info": [
|
|
"Workflow cross-checker and process inspector. Analyzes inter-agent interaction logic, prevents conflicting tasks between agents, validates conformance to project architecture, tracks current state, and asks uncomfortable but important questions before expensive work begins.",
|
|
"meta",
|
|
"ollama-cloud/kimi-k2.6"
|
|
],
|
|
"best_model": "qwen3-coder:480b",
|
|
"best_score": 65.6
|
|
}
|
|
},
|
|
"fit_scores": {
|
|
"agent-architect": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 53.5,
|
|
"explanation": "Best model for agent-architect is kimi-k2.6 with avg score 53.5. Strongest dimension: code_presence."
|
|
},
|
|
"architect-indexer": {
|
|
"model": "qwen3-coder:480b",
|
|
"fit": 54.0,
|
|
"explanation": "Best model for architect-indexer is qwen3-coder:480b with avg score 54.0. Strongest dimension: code_presence."
|
|
},
|
|
"backend-developer": {
|
|
"model": "deepseek-v4-pro",
|
|
"fit": 53.5,
|
|
"explanation": "Best model for backend-developer is deepseek-v4-pro with avg score 53.5. Strongest dimension: code_presence."
|
|
},
|
|
"browser-automation": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 63.8,
|
|
"explanation": "Best model for browser-automation is kimi-k2.6 with avg score 63.8. Strongest dimension: code_presence."
|
|
},
|
|
"capability-analyst": {
|
|
"model": "deepseek-v4-pro",
|
|
"fit": 58.7,
|
|
"explanation": "Best model for capability-analyst is deepseek-v4-pro with avg score 58.7. Strongest dimension: code_presence."
|
|
},
|
|
"code-skeptic": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 91.2,
|
|
"explanation": "Best model for code-skeptic is kimi-k2.6 with avg score 91.2. Strongest dimension: code_presence."
|
|
},
|
|
"devops-engineer": {
|
|
"model": "glm-5.1",
|
|
"fit": 96.2,
|
|
"explanation": "Best model for devops-engineer is glm-5.1 with avg score 96.2. Strongest dimension: keyword_coverage."
|
|
},
|
|
"evaluator": {
|
|
"model": "glm-5.1",
|
|
"fit": 58.7,
|
|
"explanation": "Best model for evaluator is glm-5.1 with avg score 58.7. Strongest dimension: code_presence."
|
|
},
|
|
"evolution-prompt": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 53.5,
|
|
"explanation": "Best model for evolution-prompt is kimi-k2.6 with avg score 53.5. Strongest dimension: code_presence."
|
|
},
|
|
"evolution-skeptic": {
|
|
"model": "qwen3-coder:480b",
|
|
"fit": 42.9,
|
|
"explanation": "Best model for evolution-skeptic is qwen3-coder:480b with avg score 42.9. Strongest dimension: structure."
|
|
},
|
|
"flutter-developer": {
|
|
"model": "glm-5.1",
|
|
"fit": 54.9,
|
|
"explanation": "Best model for flutter-developer is glm-5.1 with avg score 54.9. Strongest dimension: code_presence."
|
|
},
|
|
"frontend-developer": {
|
|
"model": "qwen3-coder:480b",
|
|
"fit": 56.0,
|
|
"explanation": "Best model for frontend-developer is qwen3-coder:480b with avg score 56.0. Strongest dimension: code_presence."
|
|
},
|
|
"go-developer": {
|
|
"model": "qwen3-coder:480b",
|
|
"fit": 58.7,
|
|
"explanation": "Best model for go-developer is qwen3-coder:480b with avg score 58.7. Strongest dimension: code_presence."
|
|
},
|
|
"history-miner": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 46.9,
|
|
"explanation": "Best model for history-miner is kimi-k2.6 with avg score 46.9. Strongest dimension: code_presence."
|
|
},
|
|
"incident-responder": {
|
|
"model": "glm-5.1",
|
|
"fit": 65.6,
|
|
"explanation": "Best model for incident-responder is glm-5.1 with avg score 65.6. Strongest dimension: code_presence."
|
|
},
|
|
"lead-developer": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 72.5,
|
|
"explanation": "Best model for lead-developer is kimi-k2.6 with avg score 72.5. Strongest dimension: keyword_coverage."
|
|
},
|
|
"markdown-validator": {
|
|
"model": "qwen3-coder:480b",
|
|
"fit": 47.4,
|
|
"explanation": "Best model for markdown-validator is qwen3-coder:480b with avg score 47.4. Strongest dimension: code_presence."
|
|
},
|
|
"memory-manager": {
|
|
"model": "glm-5.1",
|
|
"fit": 48.3,
|
|
"explanation": "Best model for memory-manager is glm-5.1 with avg score 48.3. Strongest dimension: code_presence."
|
|
},
|
|
"orchestrator": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 40.0,
|
|
"explanation": "Best model for orchestrator is kimi-k2.6 with avg score 40.0. Strongest dimension: code_presence."
|
|
},
|
|
"performance-engineer": {
|
|
"model": "glm-5.1",
|
|
"fit": 63.8,
|
|
"explanation": "Best model for performance-engineer is glm-5.1 with avg score 63.8. Strongest dimension: code_presence."
|
|
},
|
|
"php-developer": {
|
|
"model": "deepseek-v4-pro",
|
|
"fit": 53.5,
|
|
"explanation": "Best model for php-developer is deepseek-v4-pro with avg score 53.5. Strongest dimension: code_presence."
|
|
},
|
|
"pipeline-judge": {
|
|
"model": "qwen3-coder:480b",
|
|
"fit": 52.9,
|
|
"explanation": "Best model for pipeline-judge is qwen3-coder:480b with avg score 52.9. Strongest dimension: code_presence."
|
|
},
|
|
"planner": {
|
|
"model": "deepseek-v4-pro",
|
|
"fit": 41.7,
|
|
"explanation": "Best model for planner is deepseek-v4-pro with avg score 41.7. Strongest dimension: code_presence."
|
|
},
|
|
"product-owner": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 34.6,
|
|
"explanation": "Best model for product-owner is kimi-k2.6 with avg score 34.6. Strongest dimension: actionability."
|
|
},
|
|
"prompt-optimizer": {
|
|
"model": "glm-5.1",
|
|
"fit": 48.3,
|
|
"explanation": "Best model for prompt-optimizer is glm-5.1 with avg score 48.3. Strongest dimension: code_presence."
|
|
},
|
|
"python-developer": {
|
|
"model": "deepseek-v4-pro",
|
|
"fit": 48.3,
|
|
"explanation": "Best model for python-developer is deepseek-v4-pro with avg score 48.3. Strongest dimension: code_presence."
|
|
},
|
|
"reflector": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 58.7,
|
|
"explanation": "Best model for reflector is kimi-k2.6 with avg score 58.7. Strongest dimension: code_presence."
|
|
},
|
|
"release-manager": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 50.2,
|
|
"explanation": "Best model for release-manager is kimi-k2.6 with avg score 50.2. Strongest dimension: code_presence."
|
|
},
|
|
"requirement-refiner": {
|
|
"model": "qwen3-coder:480b",
|
|
"fit": 45.3,
|
|
"explanation": "Best model for requirement-refiner is qwen3-coder:480b with avg score 45.3. Strongest dimension: code_presence."
|
|
},
|
|
"sdet-engineer": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 97.0,
|
|
"explanation": "Best model for sdet-engineer is kimi-k2.6 with avg score 97.0. Strongest dimension: keyword_coverage."
|
|
},
|
|
"security-auditor": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 63.8,
|
|
"explanation": "Best model for security-auditor is kimi-k2.6 with avg score 63.8. Strongest dimension: code_presence."
|
|
},
|
|
"system-analyst": {
|
|
"model": "kimi-k2.6",
|
|
"fit": 92.0,
|
|
"explanation": "Best model for system-analyst is kimi-k2.6 with avg score 92.0. Strongest dimension: keyword_coverage."
|
|
},
|
|
"the-fixer": {
|
|
"model": "glm-5.1",
|
|
"fit": 46.6,
|
|
"explanation": "Best model for the-fixer is glm-5.1 with avg score 46.6. Strongest dimension: code_presence."
|
|
},
|
|
"visual-tester": {
|
|
"model": "glm-5.1",
|
|
"fit": 58.7,
|
|
"explanation": "Best model for visual-tester is glm-5.1 with avg score 58.7. Strongest dimension: code_presence."
|
|
},
|
|
"workflow-architect": {
|
|
"model": "glm-5.1",
|
|
"fit": 48.3,
|
|
"explanation": "Best model for workflow-architect is glm-5.1 with avg score 48.3. Strongest dimension: code_presence."
|
|
},
|
|
"workflow-cross-checker": {
|
|
"model": "qwen3-coder:480b",
|
|
"fit": 65.6,
|
|
"explanation": "Best model for workflow-cross-checker is qwen3-coder:480b with avg score 65.6. Strongest dimension: code_presence."
|
|
}
|
|
}
|
|
} |