Files
APAW/agent-evolution/data/model-benchmarks.schema.json
NW 3badb259cc feat: bidirectional research dashboard + agent config fixes
- Integrate apaw_agent_model_research_v3.html as standalone dashboard
- Add model-benchmarks.json with 32 agents, 11 scored models, 11 recommendations
- Add build-research-dashboard.ts: inject live data into template → standalone HTML
- Add rebuild-template.cjs: regenerate template from v3.html source
- Add sync-benchmarks-from-yaml.cjs: sync YAML → JSON round-trip
- Add sync-model-research.ts: apply recommendation matrix to config files
- Add model-benchmarks.schema.json and model-research.schema.json for validation
- Add bidirectional-data-flow.md architecture documentation
- Add log-execution.cjs pipeline hook
- Update capability-index.yaml: add fallback_models, failover_strategy
- Update kilo-meta.json, kilo.jsonc, KILO_SPEC.md with synced models
- Update evolution.md / research.md / self-evolution.md / evolutionary-sync.md docs
- Fix security-auditor.md: quote YAML color (#DC2626)
- Fix orchestrator.md: remove duplicate devops-engineer key
- Build research-dashboard.html (106KB standalone) + dated archive
2026-04-29 21:04:22 +01:00

553 lines
13 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://app.kilo.ai/model-benchmarks.schema.json",
"title": "APAW Model Benchmarks Data",
"description": "Schema for static model benchmarks extracted from HTML sources",
"type": "object",
"required": [
"version",
"generated",
"source",
"metadata",
"models",
"groq_models",
"agent_model_scores",
"if_scores",
"agent_current_config",
"recommendations",
"impact_data",
"benchmark_comparison"
],
"properties": {
"version": {
"type": "string",
"const": "1.0.0"
},
"generated": {
"type": "string",
"format": "date-time"
},
"source": {
"type": "string",
"description": "Source of benchmark data (e.g. HTML scraping, API, manual entry)"
},
"metadata": {
"type": "object",
"properties": {
"scrape_date": {
"type": "string",
"format": "date-time"
},
"source_urls": {
"type": "array",
"items": {
"type": "string"
}
},
"notes": {
"type": "string"
},
"data_quality": {
"type": "string",
"enum": [
"high",
"medium",
"low",
"estimated"
]
}
}
},
"models": {
"type": "array",
"description": "All benchmarked models from various providers",
"items": {
"type": "object",
"required": [
"id",
"name",
"provider",
"category"
],
"properties": {
"id": {
"type": "string",
"description": "Model identifier"
},
"name": {
"type": "string"
},
"organization": {
"type": "string"
},
"provider": {
"type": "string",
"enum": [
"ollama",
"ollama-cloud",
"openrouter",
"groq",
"anthropic",
"openai",
"meta",
"cohere",
"google",
"microsoft",
"unknown"
]
},
"category": {
"type": "string",
"enum": [
"big",
"medium",
"small",
"coder",
"reasoning",
"creative"
]
},
"parameters": {
"type": "string"
},
"benchmarks": {
"type": "object",
"properties": {
"swe_bench": {
"type": [
"number",
"null"
]
},
"swe_bench_pro": {
"type": [
"number",
"null"
]
},
"terminal_bench": {
"type": [
"number",
"null"
]
},
"live_codebench": {
"type": [
"number",
"null"
]
},
"gpqa": {
"type": [
"number",
"null"
]
},
"hle": {
"type": [
"number",
"null"
]
},
"browse_comp": {
"type": [
"number",
"null"
]
},
"m_mlu": {
"type": [
"number",
"null"
]
},
"m_mlu_pro": {
"type": [
"number",
"null"
]
}
}
},
"description": {
"type": "string"
},
"availability": {
"type": "object",
"properties": {
"rpm": {
"type": [
"integer",
"null"
]
},
"rpd": {
"type": [
"integer",
"string",
"null"
]
},
"tpm": {
"type": [
"integer",
"string",
"null"
]
},
"tpd": {
"type": [
"integer",
"string",
"null"
]
}
}
},
"free": {
"type": "boolean"
},
"cost_per_1m_input": {
"type": [
"number",
"string",
"null"
]
},
"tier": {
"type": "string",
"enum": [
"free",
"trial",
"paid",
"enterprise"
]
}
}
}
},
"groq_models": {
"type": "array",
"description": "Groq-specific models with performance data",
"items": {
"type": "object",
"required": [
"id",
"name",
"speed_tps",
"provider"
],
"properties": {
"id": {
"type": "string"
},
"name": {
"type": "string"
},
"speed_tps": {
"type": [
"number",
"string"
]
},
"provider": {
"type": "string",
"const": "groq"
},
"benchmarks": {
"type": "object"
},
"availability": {
"type": "object"
}
}
}
},
"agent_model_scores": {
"type": "array",
"description": "Agent-model compatibility scores, one entry per agent, model, and scoring category",
"items": {
"type": "object",
"required": [
"agent",
"model_id",
"score",
"category"
],
"properties": {
"agent": {
"type": "string"
},
"model_id": {
"type": "string"
},
"score": {
"type": "number",
"minimum": 0,
"maximum": 100
},
"category": {
"type": "string",
"enum": [
"performance",
"instruction_following",
"creativity",
"code_generation"
]
},
"reason": {
"type": "string"
},
"timestamp": {
"type": "string",
"format": "date-time"
},
"current_model_id": {
"type": "string",
"description": "Current model, identified by its model ID string (used instead of an index)"
}
}
}
},
"if_scores": {
"type": "object",
"description": "Instruction-following (IF) scores on a 0-100 scale, stored as a key-to-score map",
"additionalProperties": {
"type": "number",
"minimum": 0,
"maximum": 100
}
},
"agent_current_config": {
"type": "array",
"description": "Current agent model configurations",
"items": {
"type": "object",
"required": [
"agent",
"model",
"provider",
"status"
],
"properties": {
"agent": {
"type": "string"
},
"model": {
"type": "string"
},
"provider": {
"type": "string"
},
"status": {
"type": "string",
"enum": [
"active",
"testing",
"deprecated",
"pending"
]
},
"reasoning_effort": {
"type": "string",
"enum": [
"L",
"M",
"H"
]
},
"fit_score": {
"type": "number"
},
"date_applied": {
"type": "string",
"format": "date-time"
}
}
}
},
"recommendations": {
"type": "array",
"description": "Model change recommendations based on benchmarks",
"items": {
"type": "object",
"required": [
"agent",
"action",
"current_model",
"recommended_model",
"impact"
],
"properties": {
"agent": {
"type": "string"
},
"action": {
"type": "string",
"enum": [
"update_model",
"confirm_model",
"add_fallback",
"redesign_agent"
]
},
"current_model": {
"type": "string"
},
"current_provider": {
"type": "string"
},
"recommended_model": {
"type": "string"
},
"recommended_provider": {
"type": "string"
},
"impact": {
"type": "string",
"enum": [
"critical",
"high",
"medium",
"low"
]
},
"rationale": {
"type": "string"
},
"expected_improvement": {
"type": "object"
},
"applied": {
"type": "boolean"
}
}
}
},
"impact_data": {
"type": "array",
"description": "Impact analysis of model changes",
"items": {
"type": "object",
"required": [
"agent",
"model_change",
"impact_score"
],
"properties": {
"agent": {
"type": "string"
},
"model_change": {
"type": "string"
},
"impact_score": {
"type": "number",
"minimum": 0,
"maximum": 100,
"description": "Impact score 0-100"
}
}
}
},
"benchmark_comparison": {
"type": "object",
"description": "APAW vs closed-source benchmark comparison",
"properties": {
"benchmarks": {
"type": "array",
"description": "Benchmark names used for comparison",
"items": {
"type": "string"
}
},
"closed_source_models": {
"type": "array",
"description": "Closed-source models included in comparison",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"provider": {
"type": "string"
},
"benchmarks": {
"type": "object"
}
}
}
},
"apaw_models": {
"type": "array",
"description": "APAW pipeline models included in comparison",
"items": {
"type": "object",
"properties": {
"name": {
"type": "string"
},
"provider": {
"type": "string"
},
"benchmarks": {
"type": "object"
}
}
}
},
"apaw_best": {
"type": "object",
"description": "Best APAW model per benchmark",
"additionalProperties": {
"type": "object",
"properties": {
"model": {
"type": "string"
},
"score": {
"type": "number"
},
"gap_to_closed": {
"type": [
"number",
"string"
]
}
}
}
},
"closed_best": {
"type": "object",
"description": "Best closed-source model per benchmark",
"additionalProperties": {
"type": "object",
"properties": {
"model": {
"type": "string"
},
"score": {
"type": "number"
}
}
}
},
"summary": {
"type": "object",
"properties": {
"apaw_avg_score": {
"type": "number"
},
"closed_avg_score": {
"type": "number"
},
"coverage_gap": {
"type": "string"
}
}
}
}
}
}
}