fix(dashboard): remove last deepseek-v4-pro-max duplicate from report

- rebuild-report.py: sync current_model from kilo-meta.json (UPDATE not only INSERT)
- real-fit-report.json: regenerated from DB after agents table model rename
- real-fit.db: 10 agents updated: current_model pro-max → pro
- real-fit.html: remove stale model alias fallback
This commit is contained in:
Deploy Bot
2026-05-28 13:10:18 +01:00
parent 869a9f266a
commit a7a90129ce
4 changed files with 148 additions and 151 deletions

View File

@@ -1,7 +1,7 @@
{
"generated": "2026-05-28T10:48:02.581965+00:00",
"source": "real-fit-engine",
"total_evaluations": 147,
"generated": "2026-05-28T12:07:59Z",
"source": "real-fit-engine-db-filtered",
"total_evaluations": 144,
"agents": {
"agent-architect": {
"name": "agent-architect",
@@ -78,7 +78,7 @@
"info": [
"Analyzes task requirements against available agents, workflows, and skills. Identifies gaps and recommends new components. Tier 2 meta-agent with self-cascade enabled.",
"meta",
"ollama-cloud/deepseek-v4-pro-max"
"ollama-cloud/deepseek-v4-pro"
],
"best_model": "deepseek-v4-pro",
"best_score": 58.7
@@ -89,7 +89,6 @@
"deepseek-v4-pro": 22.8,
"glm-5.1": 89.1,
"kimi-k2.6": 91.2,
"minimax-m2.5": 45.0,
"qwen3-coder:480b": 90.6
},
"info": [
@@ -132,38 +131,6 @@
"best_model": "glm-5.1",
"best_score": 58.7
},
"evolution-prompt": {
"name": "evolution-prompt",
"evaluations": {
"deepseek-v4-pro": 52.6,
"glm-5.1": 44.7,
"kimi-k2.6": 53.5,
"qwen3-coder:480b": 21.3
},
"info": [
"Generates role-specific stress-test prompts by analyzing agent definitions",
"meta",
"ollama-cloud/deepseek-v4-pro-max"
],
"best_model": "kimi-k2.6",
"best_score": 53.5
},
"evolution-skeptic": {
"name": "evolution-skeptic",
"evaluations": {
"deepseek-v4-pro": 33.1,
"glm-5.1": 31.6,
"kimi-k2.6": 37.3,
"qwen3-coder:480b": 42.9
},
"info": [
"Evaluates model responses against role-specific rubrics with detailed scoring and commentary",
"meta",
"ollama-cloud/deepseek-v4-pro-max"
],
"best_model": "qwen3-coder:480b",
"best_score": 42.9
},
"flutter-developer": {
"name": "flutter-developer",
"evaluations": {
@@ -207,7 +174,7 @@
"info": [
"Go backend specialist for Gin, Echo, APIs, and database integration (GNS-2 Tier 1)",
"core",
"ollama-cloud/deepseek-v4-pro-max"
"ollama-cloud/deepseek-v4-pro"
],
"best_model": "qwen3-coder:480b",
"best_score": 58.7
@@ -287,7 +254,7 @@
"info": [
"Manages agent memory systems - short-term (context), long-term (vector store), and episodic (experiences) (GNS-2 Tier 0)",
"cognitive",
"ollama-cloud/deepseek-v4-pro-max"
"ollama-cloud/deepseek-v4-pro"
],
"best_model": "glm-5.1",
"best_score": 48.3
@@ -295,11 +262,9 @@
"orchestrator": {
"name": "orchestrator",
"evaluations": {
"deepseek-v4-flash": 27.0,
"deepseek-v4-pro": 19.6,
"glm-5.1": 36.2,
"kimi-k2.6": 40.0,
"minimax-m2.5": 36.3,
"qwen3-coder:480b": 39.1
},
"info": [
@@ -321,7 +286,7 @@
"info": [
"Reviews code for performance issues. Focuses on efficiency, N+1 queries, memory leaks, and algorithmic complexity (GNS-2 Tier 0)",
"quality",
"ollama-cloud/deepseek-v4-pro-max"
"ollama-cloud/deepseek-v4-pro"
],
"best_model": "glm-5.1",
"best_score": 63.8
@@ -369,7 +334,7 @@
"info": [
"Advanced task planner using Chain of Thought, Tree of Thoughts, and Plan-Execute-Reflect (GNS-2 Tier 0)",
"cognitive",
"ollama-cloud/deepseek-v4-pro-max"
"ollama-cloud/deepseek-v4-pro"
],
"best_model": "deepseek-v4-pro",
"best_score": 41.7
@@ -433,7 +398,7 @@
"info": [
"Self-reflection agent using Reflexion pattern - learns from mistakes (GNS-2 Tier 0)",
"cognitive",
"ollama-cloud/deepseek-v4-pro-max"
"ollama-cloud/deepseek-v4-pro"
],
"best_model": "kimi-k2.6",
"best_score": 58.7
@@ -497,7 +462,7 @@
"info": [
"Scans for security vulnerabilities, OWASP Top 10, dependency CVEs, and hardcoded secrets (GNS-2 Tier 0)",
"quality",
"ollama-cloud/deepseek-v4-pro-max"
"ollama-cloud/deepseek-v4-pro"
],
"best_model": "kimi-k2.6",
"best_score": 63.8
@@ -513,7 +478,7 @@
"info": [
"Designs technical specifications, data schemas, and API contracts before implementation (GNS-2 Tier 1)",
"core",
"ollama-cloud/deepseek-v4-pro-max"
"ollama-cloud/deepseek-v4-pro"
],
"best_model": "kimi-k2.6",
"best_score": 92.0
@@ -581,188 +546,220 @@
],
"best_model": "qwen3-coder:480b",
"best_score": 65.6
},
"evolution-skeptic": {
"name": "evolution-skeptic",
"evaluations": {
"deepseek-v4-pro": 33.1,
"glm-5.1": 31.6,
"kimi-k2.6": 37.3,
"qwen3-coder:480b": 42.9
},
"info": [
"Evaluates model responses against role-specific rubrics with detailed scoring and commentary",
"meta",
"ollama-cloud/deepseek-v4-pro"
],
"best_model": "qwen3-coder:480b",
"best_score": 42.9
},
"evolution-prompt": {
"name": "evolution-prompt",
"evaluations": {
"deepseek-v4-pro": 52.6,
"glm-5.1": 44.7,
"kimi-k2.6": 53.5,
"qwen3-coder:480b": 21.3
},
"info": [
"Generates role-specific stress-test prompts by analyzing agent definitions",
"meta",
"ollama-cloud/deepseek-v4-pro"
],
"best_model": "kimi-k2.6",
"best_score": 53.5
}
},
"fit_scores": {
"agent-architect": {
"model": "kimi-k2.6",
"fit": 53.5,
"explanation": "Best model for agent-architect is kimi-k2.6 with avg score 53.5. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 48.3,
"explanation": "Best model for agent-architect is qwen3-coder:480b with avg score 48.3. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"architect-indexer": {
"model": "qwen3-coder:480b",
"fit": 54.0,
"explanation": "Best model for architect-indexer is qwen3-coder:480b with avg score 54.0. Strongest dimension: code_presence."
"explanation": "Best model for architect-indexer is qwen3-coder:480b with avg score 54.0. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"backend-developer": {
"model": "deepseek-v4-pro",
"fit": 53.5,
"explanation": "Best model for backend-developer is deepseek-v4-pro with avg score 53.5. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 43.2,
"explanation": "Best model for backend-developer is qwen3-coder:480b with avg score 43.2. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"browser-automation": {
"model": "kimi-k2.6",
"fit": 63.8,
"explanation": "Best model for browser-automation is kimi-k2.6 with avg score 63.8. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 48.9,
"explanation": "Best model for browser-automation is qwen3-coder:480b with avg score 48.9. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"capability-analyst": {
"model": "deepseek-v4-pro",
"fit": 58.7,
"explanation": "Best model for capability-analyst is deepseek-v4-pro with avg score 58.7. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 52.3,
"explanation": "Best model for capability-analyst is qwen3-coder:480b with avg score 52.3. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"code-skeptic": {
"model": "kimi-k2.6",
"fit": 91.2,
"explanation": "Best model for code-skeptic is kimi-k2.6 with avg score 91.2. Strongest dimension: code_presence."
"model": "kimi-k2.6",
"fit": 91.2,
"explanation": "Best model for code-skeptic is kimi-k2.6 with avg score 91.2. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"devops-engineer": {
"model": "glm-5.1",
"fit": 96.2,
"explanation": "Best model for devops-engineer is glm-5.1 with avg score 96.2. Strongest dimension: keyword_coverage."
"model": "qwen3-coder:480b",
"fit": 87.2,
"explanation": "Best model for devops-engineer is qwen3-coder:480b with avg score 87.2. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"evaluator": {
"model": "glm-5.1",
"fit": 58.7,
"explanation": "Best model for evaluator is glm-5.1 with avg score 58.7. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 43.8,
"explanation": "Best model for evaluator is qwen3-coder:480b with avg score 43.8. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"evolution-prompt": {
"model": "kimi-k2.6",
"fit": 53.5,
"explanation": "Best model for evolution-prompt is kimi-k2.6 with avg score 53.5. Strongest dimension: code_presence."
"model": "kimi-k2.6",
"fit": 53.5,
"explanation": "Best model for evolution-prompt is kimi-k2.6 with avg score 53.5. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"evolution-skeptic": {
"model": "qwen3-coder:480b",
"fit": 42.9,
"explanation": "Best model for evolution-skeptic is qwen3-coder:480b with avg score 42.9. Strongest dimension: structure."
"explanation": "Best model for evolution-skeptic is qwen3-coder:480b with avg score 42.9. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"flutter-developer": {
"model": "glm-5.1",
"model": "qwen3-coder:480b",
"fit": 54.9,
"explanation": "Best model for flutter-developer is glm-5.1 with avg score 54.9. Strongest dimension: code_presence."
"explanation": "Best model for flutter-developer is qwen3-coder:480b with avg score 54.9. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"frontend-developer": {
"model": "qwen3-coder:480b",
"fit": 56.0,
"explanation": "Best model for frontend-developer is qwen3-coder:480b with avg score 56.0. Strongest dimension: code_presence."
"explanation": "Best model for frontend-developer is qwen3-coder:480b with avg score 56.0. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"go-developer": {
"model": "qwen3-coder:480b",
"fit": 58.7,
"explanation": "Best model for go-developer is qwen3-coder:480b with avg score 58.7. Strongest dimension: code_presence."
"explanation": "Best model for go-developer is qwen3-coder:480b with avg score 58.7. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"history-miner": {
"model": "kimi-k2.6",
"fit": 46.9,
"explanation": "Best model for history-miner is kimi-k2.6 with avg score 46.9. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 44.8,
"explanation": "Best model for history-miner is qwen3-coder:480b with avg score 44.8. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"incident-responder": {
"model": "glm-5.1",
"fit": 65.6,
"explanation": "Best model for incident-responder is glm-5.1 with avg score 65.6. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 56.4,
"explanation": "Best model for incident-responder is qwen3-coder:480b with avg score 56.4. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"lead-developer": {
"model": "kimi-k2.6",
"model": "qwen3-coder:480b",
"fit": 72.5,
"explanation": "Best model for lead-developer is kimi-k2.6 with avg score 72.5. Strongest dimension: keyword_coverage."
"explanation": "Best model for lead-developer is qwen3-coder:480b with avg score 72.5. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"markdown-validator": {
"model": "qwen3-coder:480b",
"fit": 47.4,
"explanation": "Best model for markdown-validator is qwen3-coder:480b with avg score 47.4. Strongest dimension: code_presence."
"explanation": "Best model for markdown-validator is qwen3-coder:480b with avg score 47.4. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"memory-manager": {
"model": "glm-5.1",
"fit": 48.3,
"explanation": "Best model for memory-manager is glm-5.1 with avg score 48.3. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 46.8,
"explanation": "Best model for memory-manager is qwen3-coder:480b with avg score 46.8. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"orchestrator": {
"model": "kimi-k2.6",
"fit": 40.0,
"explanation": "Best model for orchestrator is kimi-k2.6 with avg score 40.0. Strongest dimension: code_presence."
"model": "kimi-k2.6",
"fit": 40.0,
"explanation": "Best model for orchestrator is kimi-k2.6 with avg score 40.0. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"performance-engineer": {
"model": "glm-5.1",
"fit": 63.8,
"explanation": "Best model for performance-engineer is glm-5.1 with avg score 63.8. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 36.3,
"explanation": "Best model for performance-engineer is qwen3-coder:480b with avg score 36.3. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"php-developer": {
"model": "deepseek-v4-pro",
"fit": 53.5,
"explanation": "Best model for php-developer is deepseek-v4-pro with avg score 53.5. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 48.3,
"explanation": "Best model for php-developer is qwen3-coder:480b with avg score 48.3. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"pipeline-judge": {
"model": "qwen3-coder:480b",
"fit": 52.9,
"explanation": "Best model for pipeline-judge is qwen3-coder:480b with avg score 52.9. Strongest dimension: code_presence."
"explanation": "Best model for pipeline-judge is qwen3-coder:480b with avg score 52.9. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"planner": {
"model": "deepseek-v4-pro",
"fit": 41.7,
"explanation": "Best model for planner is deepseek-v4-pro with avg score 41.7. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 33.7,
"explanation": "Best model for planner is qwen3-coder:480b with avg score 33.7. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"product-owner": {
"model": "kimi-k2.6",
"fit": 34.6,
"explanation": "Best model for product-owner is kimi-k2.6 with avg score 34.6. Strongest dimension: actionability."
"model": "qwen3-coder:480b",
"fit": 27.0,
"explanation": "Best model for product-owner is qwen3-coder:480b with avg score 27.0. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"prompt-optimizer": {
"model": "glm-5.1",
"fit": 48.3,
"explanation": "Best model for prompt-optimizer is glm-5.1 with avg score 48.3. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 31.8,
"explanation": "Best model for prompt-optimizer is qwen3-coder:480b with avg score 31.8. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"python-developer": {
"model": "deepseek-v4-pro",
"model": "qwen3-coder:480b",
"fit": 48.3,
"explanation": "Best model for python-developer is deepseek-v4-pro with avg score 48.3. Strongest dimension: code_presence."
"explanation": "Best model for python-developer is qwen3-coder:480b with avg score 48.3. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"reflector": {
"model": "kimi-k2.6",
"fit": 58.7,
"explanation": "Best model for reflector is kimi-k2.6 with avg score 58.7. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 20.9,
"explanation": "Best model for reflector is qwen3-coder:480b with avg score 20.9. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"release-manager": {
"model": "kimi-k2.6",
"fit": 50.2,
"explanation": "Best model for release-manager is kimi-k2.6 with avg score 50.2. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 41.7,
"explanation": "Best model for release-manager is qwen3-coder:480b with avg score 41.7. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"requirement-refiner": {
"model": "qwen3-coder:480b",
"fit": 45.3,
"explanation": "Best model for requirement-refiner is qwen3-coder:480b with avg score 45.3. Strongest dimension: code_presence."
"explanation": "Best model for requirement-refiner is qwen3-coder:480b with avg score 45.3. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"sdet-engineer": {
"model": "kimi-k2.6",
"model": "qwen3-coder:480b",
"fit": 97.0,
"explanation": "Best model for sdet-engineer is kimi-k2.6 with avg score 97.0. Strongest dimension: keyword_coverage."
"explanation": "Best model for sdet-engineer is qwen3-coder:480b with avg score 97.0. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"security-auditor": {
"model": "kimi-k2.6",
"fit": 63.8,
"explanation": "Best model for security-auditor is kimi-k2.6 with avg score 63.8. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 41.5,
"explanation": "Best model for security-auditor is qwen3-coder:480b with avg score 41.5. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"system-analyst": {
"model": "kimi-k2.6",
"fit": 92.0,
"explanation": "Best model for system-analyst is kimi-k2.6 with avg score 92.0. Strongest dimension: keyword_coverage."
"model": "qwen3-coder:480b",
"fit": 77.0,
"explanation": "Best model for system-analyst is qwen3-coder:480b with avg score 77.0. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"the-fixer": {
"model": "glm-5.1",
"fit": 46.6,
"explanation": "Best model for the-fixer is glm-5.1 with avg score 46.6. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 42.9,
"explanation": "Best model for the-fixer is qwen3-coder:480b with avg score 42.9. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"visual-tester": {
"model": "glm-5.1",
"fit": 58.7,
"explanation": "Best model for visual-tester is glm-5.1 with avg score 58.7. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 53.5,
"explanation": "Best model for visual-tester is qwen3-coder:480b with avg score 53.5. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"workflow-architect": {
"model": "glm-5.1",
"fit": 48.3,
"explanation": "Best model for workflow-architect is glm-5.1 with avg score 48.3. Strongest dimension: code_presence."
"model": "qwen3-coder:480b",
"fit": 36.3,
"explanation": "Best model for workflow-architect is qwen3-coder:480b with avg score 36.3. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
},
"workflow-cross-checker": {
"model": "qwen3-coder:480b",
"fit": 65.6,
"explanation": "Best model for workflow-cross-checker is qwen3-coder:480b with avg score 65.6. Strongest dimension: code_presence."
"explanation": "Best model for workflow-cross-checker is qwen3-coder:480b with avg score 65.6. Evaluator preference: evolution-skeptic > rubric_v2 > rubric_v1 (ignored HTTP errors)."
}
}
}

Binary file not shown.

View File

@@ -170,11 +170,7 @@ function currentModel(agentName){
return (info[2]||'').split('/').pop();
}
function modelShort(full){
const base=full.replace('ollama-cloud/','');
if(base==='deepseek-v4-pro-max') return 'deepseek-v4-pro';
return base;
}
function modelShort(full){return full.replace('ollama-cloud/','');}
function openAgentModal(agent){
$('agentModalTitle').textContent='Research models for '+agent;

View File

@@ -30,18 +30,22 @@ def _sync_agents_from_meta(db_path: Path) -> None:
for name, info in meta.get("agents", {}).items():
if name in existing:
continue
cursor.execute(
"INSERT OR IGNORE INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
(
name,
info.get("description", ""),
info.get("category", "meta"),
info.get("model", ""),
info.get("color", "#6B7280"),
datetime.now(timezone.utc).isoformat(),
),
)
cursor.execute(
"UPDATE agents SET current_model = ? WHERE name = ?",
(info.get("model", ""), name),
)
else:
cursor.execute(
"INSERT INTO agents (name, description, category, current_model, color, updated) VALUES (?, ?, ?, ?, ?, ?)",
(
name,
info.get("description", ""),
info.get("category", "meta"),
info.get("model", ""),
info.get("color", "#6B7280"),
datetime.now(timezone.utc).isoformat(),
),
)
conn.commit()
conn.close()