{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://app.kilo.ai/model-benchmarks.schema.json",
  "title": "APAW Model Benchmarks Data",
  "description": "Schema for static model benchmarks extracted from HTML sources",
  "type": "object",
  "required": [
    "version",
    "generated",
    "source",
    "metadata",
    "models",
    "groq_models",
    "agent_model_scores",
    "if_scores",
    "agent_current_config",
    "recommendations",
    "impact_data",
    "benchmark_comparison"
  ],
  "properties": {
    "version": { "type": "string", "const": "1.0.0" },
    "generated": { "type": "string", "format": "date-time" },
    "source": {
      "type": "string",
      "description": "Source of benchmark data (e.g. HTML scraping, API, manual entry)"
    },
    "metadata": {
      "type": "object",
      "properties": {
        "scrape_date": { "type": "string", "format": "date-time" },
        "source_urls": {
          "type": "array",
          "items": { "type": "string" }
        },
        "notes": { "type": "string" },
        "data_quality": {
          "type": "string",
          "enum": ["high", "medium", "low", "estimated"]
        }
      }
    },
    "models": {
      "type": "array",
      "description": "All benchmarked models from various providers",
      "items": {
        "type": "object",
        "required": ["id", "name", "provider", "category"],
        "properties": {
          "id": { "type": "string", "description": "Model identifier" },
          "name": { "type": "string" },
          "organization": { "type": "string" },
          "provider": {
            "type": "string",
            "enum": [
              "ollama",
              "ollama-cloud",
              "openrouter",
              "groq",
              "anthropic",
              "openai",
              "meta",
              "cohere",
              "google",
              "microsoft",
              "unknown"
            ]
          },
          "category": {
            "type": "string",
            "enum": ["big", "medium", "small", "coder", "reasoning", "creative"]
          },
          "parameters": { "type": "string" },
          "benchmarks": {
            "type": "object",
            "properties": {
              "swe_bench": { "type": ["number", "null"] },
              "swe_bench_pro": { "type": ["number", "null"] },
              "terminal_bench": { "type": ["number", "null"] },
              "live_codebench": { "type": ["number", "null"] },
              "gpqa": { "type": ["number", "null"] },
              "hle": { "type": ["number", "null"] },
              "browse_comp": { "type": ["number", "null"] },
              "m_mlu": { "type": ["number", "null"] },
              "m_mlu_pro": { "type": ["number", "null"] }
            }
          },
          "description": { "type": "string" },
          "availability": {
            "type": "object",
            "properties": {
              "rpm": { "type": ["integer", "null"] },
              "rpd": { "type": ["integer", "string", "null"] },
              "tpm": { "type": ["integer", "string", "null"] },
              "tpd": { "type": ["integer", "string", "null"] }
            }
          },
          "free": { "type": "boolean" },
          "cost_per_1m_input": { "type": ["number", "string", "null"] },
          "tier": {
            "type": "string",
            "enum": ["free", "trial", "paid", "enterprise"]
          }
        }
      }
    },
    "groq_models": {
      "type": "array",
      "description": "Groq-specific models with performance data",
      "items": {
        "type": "object",
        "required": ["id", "name", "speed_tps", "provider"],
        "properties": {
          "id": { "type": "string" },
          "name": { "type": "string" },
          "speed_tps": { "type": ["number", "string"] },
          "provider": { "type": "string", "const": "groq" },
          "benchmarks": { "type": "object" },
          "availability": { "type": "object" }
        }
      }
    },
    "agent_model_scores": {
      "type": "array",
      "description": "Agent × Model compatibility scoring matrices",
      "items": {
        "type": "object",
        "required": ["agent", "model_id", "score", "category"],
        "properties": {
          "agent": { "type": "string" },
          "model_id": { "type": "string" },
          "score": { "type": "number", "minimum": 0, "maximum": 100 },
          "category": {
            "type": "string",
            "enum": [
              "performance",
              "instruction_following",
              "creativity",
              "code_generation"
            ]
          },
          "reason": { "type": "string" },
          "timestamp": { "type": "string", "format": "date-time" },
          "current_model_id": {
            "type": "string",
            "description": "Current model ID string (replaces index)"
          }
        }
      }
    },
    "if_scores": {
      "type": "object",
      "description": "Instruction Following scores mapping",
      "additionalProperties": { "type": "number", "minimum": 0, "maximum": 100 }
    },
    "agent_current_config": {
      "type": "array",
      "description": "Current agent model configurations",
      "items": {
        "type": "object",
        "required": ["agent", "model", "provider", "status"],
        "properties": {
          "agent": { "type": "string" },
          "model": { "type": "string" },
          "provider": { "type": "string" },
          "status": {
            "type": "string",
            "enum": ["active", "testing", "deprecated", "pending"]
          },
          "reasoning_effort": {
            "type": "string",
            "enum": ["L", "M", "H"]
          },
          "fit_score": { "type": "number" },
          "date_applied": { "type": "string", "format": "date-time" }
        }
      }
    },
    "recommendations": {
      "type": "array",
      "description": "Model change recommendations based on benchmarks",
      "items": {
        "type": "object",
        "required": [
          "agent",
          "action",
          "current_model",
          "recommended_model",
          "impact"
        ],
        "properties": {
          "agent": { "type": "string" },
          "action": {
            "type": "string",
            "enum": [
              "update_model",
              "confirm_model",
              "add_fallback",
              "redesign_agent"
            ]
          },
          "current_model": { "type": "string" },
          "current_provider": { "type": "string" },
          "recommended_model": { "type": "string" },
          "recommended_provider": { "type": "string" },
          "impact": {
            "type": "string",
            "enum": ["critical", "high", "medium", "low"]
          },
          "rationale": { "type": "string" },
          "expected_improvement": { "type": "object" },
          "applied": { "type": "boolean" }
        }
      }
    },
    "impact_data": {
      "type": "array",
      "description": "Impact analysis of model changes",
      "items": {
        "type": "object",
        "required": ["agent", "model_change", "impact_score"],
        "properties": {
          "agent": { "type": "string" },
          "model_change": { "type": "string" },
          "impact_score": {
            "type": "number",
            "minimum": 0,
            "maximum": 100,
            "description": "Impact score 0-100"
          }
        }
      }
    },
    "benchmark_comparison": {
      "type": "object",
      "description": "APAW vs closed-source benchmark comparison",
      "properties": {
        "benchmarks": {
          "type": "array",
          "description": "Benchmark names used for comparison",
          "items": { "type": "string" }
        },
        "closed_source_models": {
          "type": "array",
          "description": "Closed-source models included in comparison",
          "items": {
            "type": "object",
            "properties": {
              "name": { "type": "string" },
              "provider": { "type": "string" },
              "benchmarks": { "type": "object" }
            }
          }
        },
        "apaw_models": {
          "type": "array",
          "description": "APAW pipeline models included in comparison",
          "items": {
            "type": "object",
            "properties": {
              "name": { "type": "string" },
              "provider": { "type": "string" },
              "benchmarks": { "type": "object" }
            }
          }
        },
        "apaw_best": {
          "type": "object",
          "description": "Best APAW model per benchmark",
          "additionalProperties": {
            "type": "object",
            "properties": {
              "model": { "type": "string" },
              "score": { "type": "number" },
              "gap_to_closed": { "type": ["number", "string"] }
            }
          }
        },
        "closed_best": {
          "type": "object",
          "description": "Best closed-source model per benchmark",
          "additionalProperties": {
            "type": "object",
            "properties": {
              "model": { "type": "string" },
              "score": { "type": "number" }
            }
          }
        },
        "summary": {
          "type": "object",
          "properties": {
            "apaw_avg_score": { "type": "number" },
            "closed_avg_score": { "type": "number" },
            "coverage_gap": { "type": "string" }
          }
        }
      }
    }
  }
}