fix(dashboard): correct computeAgentScore formula and inline benchmark data
- SWE=null no longer zeroes score; IF is weighted at 0.85 for reasoning-only models
- Inline MODEL_BENCHMARKS const (the sync script doesn't populate benchmarks)
- Hash fallback tightened from [50, 85) to [55, 80)
- History-miner now shows a +10 improvement (82 vs 72) instead of a false regression
This commit is contained in:
@@ -1015,6 +1015,25 @@ const INLINE_RECOMMENDATIONS = [
|
||||
{ agent: "memory-manager", current_model: "ollama-cloud/qwen3.6-plus", impact: "low", score_before: 87, score_after: 87, score_delta: 0, rationale: "memory-manager on qwen3.6-plus (87) is the best fit. 1M context critical." }
|
||||
];
|
||||
|
||||
// Inline benchmark data (fallback when embedded data doesn't have model_benchmarks)
//
// Keys are model identifiers that are matched as substrings of a full model
// name (e.g. "ollama-cloud/kimi-k2.6" contains "kimi-k2.6"). Keep a more
// specific key ("glm-5.1") ahead of any key it contains ("glm-5").
//
// Fields per entry:
//   if_score       - numeric benchmark score (presumably instruction-following,
//                    0-100 - TODO confirm against the data source)
//   swe_bench      - numeric SWE-bench score, or null when none is available
//   context_window - context size (presumably in thousands of tokens - TODO confirm)
const MODEL_BENCHMARKS = {
  "qwen3.5-122b": { "if_score": 92, "swe_bench": null, "context_window": 128 },
  "qwen3-coder-480b": { "if_score": 88, "swe_bench": 66.5, "context_window": 1000 },
  "deepseek-v4-pro-max": { "if_score": 89, "swe_bench": 80.6, "context_window": 1000 },
  "deepseek-v4-flash": { "if_score": 86, "swe_bench": 79, "context_window": 1000 },
  "kimi-k2.6": { "if_score": 91, "swe_bench": 80.2, "context_window": 1000 },
  "kimi-k2.5": { "if_score": 90, "swe_bench": 78, "context_window": 256 },
  "minimax-m2.5": { "if_score": 82, "swe_bench": 80.2, "context_window": 128 },
  "minimax-m2.7": { "if_score": 80, "swe_bench": 78, "context_window": 128 },
  "glm-5.1": { "if_score": 90, "swe_bench": null, "context_window": 128 },
  "glm-5": { "if_score": 90, "swe_bench": null, "context_window": 128 },
  "nemotron-3-super": { "if_score": 78, "swe_bench": 60.5, "context_window": 1000 },
  "nemotron-3-nano": { "if_score": 68, "swe_bench": null, "context_window": 128 },
  "gemma4-27b": { "if_score": 85, "swe_bench": null, "context_window": 128 },
  "devstral-2": { "if_score": 80, "swe_bench": null, "context_window": 128 },
  "devstral-small-2": { "if_score": 75, "swe_bench": null, "context_window": 128 }
};
|
||||
|
||||
// Default embedded data (minimal - updated by sync script)
|
||||
const EMBEDDED_DATA = {
|
||||
"$schema": "./data/agent-versions.schema.json",
|
||||
@@ -1714,17 +1733,26 @@ function renderModelsTab(agent) {
|
||||
|
||||
// Compute score for any model name using benchmark lookup + fallback
/**
 * Scores a model by name.
 *
 * Benchmark data comes from `agentData.model_benchmarks` when that object has
 * at least one entry, otherwise from the inline `MODEL_BENCHMARKS` table.
 * The benchmark key is the longest key contained in the model name, so
 * "glm-5.1" wins over "glm-5" regardless of key order.
 *
 * Scoring when a benchmark entry is found:
 *   - with a positive `swe_bench`:  if_score * 0.5 + swe_bench * 0.3
 *   - without one:                  if_score * 0.85
 *   - plus a context bonus: +15 (>= 1000), +8 (>= 256), +4 otherwise
 *   - the total is capped at 100 and rounded.
 * A missing/zero `if_score` defaults to 70 and a missing/zero
 * `context_window` defaults to 128.
 *
 * When no benchmark entry matches, a deterministic value in [55, 79] is
 * derived from the character codes of the name.
 *
 * @param {string} modelName - Full model name, e.g. "ollama-cloud/kimi-k2.6".
 *   `null`/`undefined` are treated as the empty string.
 * @returns {number} Integer score.
 */
function computeAgentScore(modelName) {
  // Normalise so a missing model name falls through to the hash fallback
  // instead of throwing on `.includes`.
  const name = modelName == null ? '' : String(modelName);

  const embedded = agentData.model_benchmarks || {};
  const bm = Object.keys(embedded).length > 0 ? embedded : MODEL_BENCHMARKS;

  // Pick the most specific (longest) key contained in the name, so the result
  // does not depend on the order in which keys happen to be listed.
  const key = Object.keys(bm)
    .filter((k) => k !== '' && name.includes(k))
    .reduce((best, k) => (k.length > best.length ? k : best), '');
  const m = key === '' ? undefined : bm[key];

  if (m) {
    const ifScore = m.if_score || 70;
    let score;
    if (m.swe_bench > 0) {
      score = ifScore * 0.5 + m.swe_bench * 0.3;
    } else {
      // No SWE: weight IF heavily (reasoning-only models)
      score = ifScore * 0.85;
    }
    const ctx = m.context_window || 128;
    score += ctx >= 1000 ? 15 : ctx >= 256 ? 8 : 4;
    return Math.round(Math.min(100, score));
  }

  // Fallback: deterministic but reasonable
  const hash = name.split('').reduce((a, c) => a + c.charCodeAt(0), 0);
  return 55 + (hash % 25);
}
|
||||
|
||||
// Chart 1: Agent Score Bar Chart
|
||||
|
||||
Reference in New Issue
Block a user