APAW/agent-evolution/data/model-benchmarks.schema.json

{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "$id": "https://app.kilo.ai/model-benchmarks.schema.json",
  "title": "APAW Model Benchmarks Data",
  "description": "Schema for static model benchmarks extracted from HTML sources",
  "type": "object",
  "required": [
    "version",
    "generated",
    "source",
    "metadata",
    "models",
    "groq_models",
    "agent_model_scores",
    "if_scores",
    "agent_current_config",
    "recommendations",
    "impact_data",
    "benchmark_comparison"
  ],
  "properties": {
    "version": {
      "type": "string",
      "const": "1.0.0"
    },
    "generated": {
      "type": "string",
      "format": "date-time"
    },
    "source": {
      "type": "string",
      "description": "Source of benchmark data (e.g. HTML scraping, API, manual entry)"
    },
    "metadata": {
      "type": "object",
      "properties": {
        "scrape_date": {
          "type": "string",
          "format": "date-time"
        },
        "source_urls": {
          "type": "array",
          "items": {
            "type": "string"
          }
        },
        "notes": {
          "type": "string"
        },
        "data_quality": {
          "type": "string",
          "enum": [
            "high",
            "medium",
            "low",
            "estimated"
          ]
        }
      }
    },
    "models": {
      "type": "array",
      "description": "All benchmarked models from various providers",
      "items": {
        "type": "object",
        "required": [
          "id",
          "name",
          "provider",
          "category"
        ],
        "properties": {
          "id": {
            "type": "string",
            "description": "Model identifier"
          },
          "name": {
            "type": "string"
          },
          "organization": {
            "type": "string"
          },
          "provider": {
            "type": "string",
            "enum": [
              "ollama",
              "ollama-cloud",
              "openrouter",
              "groq",
              "anthropic",
              "openai",
              "meta",
              "cohere",
              "google",
              "microsoft",
              "unknown"
            ]
          },
          "category": {
            "type": "string",
            "enum": [
              "big",
              "medium",
              "small",
              "coder",
              "reasoning",
              "creative"
            ]
          },
          "parameters": {
            "type": "string"
          },
          "benchmarks": {
            "type": "object",
            "properties": {
              "swe_bench": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "swe_bench_pro": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "terminal_bench": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "live_codebench": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "gpqa": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "hle": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "browse_comp": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "m_mlu": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "m_mlu_pro": {
                "type": [
                  "number",
                  "null"
                ]
              }
            }
          },
          "description": {
            "type": "string"
          },
          "availability": {
            "type": "object",
            "properties": {
              "rpm": {
                "type": [
                  "integer",
                  "null"
                ]
              },
              "rpd": {
                "type": [
                  "integer",
                  "string",
                  "null"
                ]
              },
              "tpm": {
                "type": [
                  "integer",
                  "string",
                  "null"
                ]
              },
              "tpd": {
                "type": [
                  "integer",
                  "string",
                  "null"
                ]
              }
            }
          },
          "free": {
            "type": "boolean"
          },
          "cost_per_1m_input": {
            "type": [
              "number",
              "string",
              "null"
            ]
          },
          "tier": {
            "type": "string",
            "enum": [
              "free",
              "trial",
              "paid",
              "enterprise"
            ]
          }
        }
      }
    },
    "groq_models": {
      "type": "array",
      "description": "Groq-specific models with performance data",
      "items": {
        "type": "object",
        "required": [
          "id",
          "name",
          "speed_tps",
          "provider"
        ],
        "properties": {
          "id": {
            "type": "string"
          },
          "name": {
            "type": "string"
          },
          "speed_tps": {
            "type": [
              "number",
              "string"
            ]
          },
          "provider": {
            "type": "string",
            "const": "groq"
          },
          "benchmarks": {
            "type": "object"
          },
          "availability": {
            "type": "object"
          }
        }
      }
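    },
    "agent_model_scores": {
      "type": "array",
      "description": "Agent × Model compatibility scores, stored as a flat list of records that each give one agent, one model ID, a 0-100 score and a scoring category",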
      "items": {
        "type": "object",
        "required": [
          "agent",
          "model_id",
          "score",
          "category"
        ],
        "properties": {
          "agent": {
            "type": "string"
          },
          "model_id": {
            "type": "string"
          },
          "score": {
            "type": "number",
            "minimum": 0,
            "maximum": 100
          },
          "category": {
            "type": "string",
            "enum": [
              "performance",
              "instruction_following",
              "creativity",
              "code_generation"
            ]
          },
          "reason": {
            "type": "string"
          },
          "timestamp": {
            "type": "string",
            "format": "date-time"
          },
          "current_model_id": {
            "type": "string",
            "description": "Current model, identified by its model ID string rather than by an index"
          }
        }
      }
    },
    "if_scores": {
      "type": "object",
      "description": "Instruction Following (IF) scores: an object mapping each key to a numeric score from 0 to 100",
      "additionalProperties": {
        "type": "number",
        "minimum": 0,
        "maximum": 100
      }
    },
    "agent_current_config": {
      "type": "array",
      "description": "Current agent model configurations",
      "items": {
        "type": "object",
        "required": [
          "agent",
          "model",
          "provider",
          "status"
        ],
        "properties": {
          "agent": {
            "type": "string"
          },
          "model": {
            "type": "string"
          },
          "provider": {
            "type": "string"
          },
          "status": {
            "type": "string",
            "enum": [
              "active",
              "testing",
              "deprecated",
              "pending"
            ]
          },
          "reasoning_effort": {
            "type": "string",
            "enum": [
              "L",
              "M",
              "H"
            ]
          },
          "fit_score": {
            "type": "number"
          },
          "date_applied": {
            "type": "string",
            "format": "date-time"
          }
        }
      }
    },
    "recommendations": {
      "type": "array",
      "description": "Model change recommendations based on benchmarks",
      "items": {
        "type": "object",
        "required": [
          "agent",
          "action",
          "current_model",
          "recommended_model",
          "impact"
        ],
        "properties": {
          "agent": {
            "type": "string"
          },
          "action": {
            "type": "string",
            "enum": [
              "update_model",
              "confirm_model",
              "add_fallback",
              "redesign_agent"
            ]
          },
          "current_model": {
            "type": "string"
          },
          "current_provider": {
            "type": "string"
          },
          "recommended_model": {
            "type": "string"
          },
          "recommended_provider": {
            "type": "string"
          },
          "impact": {
            "type": "string",
            "enum": [
              "critical",
              "high",
              "medium",
              "low"
            ]
          },
          "rationale": {
            "type": "string"
          },
          "expected_improvement": {
            "type": "object"
          },
          "applied": {
            "type": "boolean"
          }
        }
      }
    },
    "impact_data": {
      "type": "array",
      "description": "Impact analysis of model changes",
      "items": {
        "type": "object",
        "required": [
          "agent",
          "model_change",
          "impact_score"
        ],
        "properties": {
          "agent": {
            "type": "string"
          },
          "model_change": {
            "type": "string"
          },
          "impact_score": {
            "type": "number",
            "minimum": 0,
            "maximum": 100,
            "description": "Impact score 0-100"
          }
        }
      }
    },
    "benchmark_comparison": {
      "type": "object",
      "description": "APAW vs closed-source benchmark comparison",
      "properties": {
        "benchmarks": {
          "type": "array",
          "description": "Benchmark names used for comparison",
          "items": {
            "type": "string"
          }
        },
        "closed_source_models": {
          "type": "array",
          "description": "Closed-source models included in comparison",
          "items": {
            "type": "object",
            "properties": {
              "name": {
                "type": "string"
              },
              "provider": {
                "type": "string"
              },
              "benchmarks": {
                "type": "object"
              }
            }
          }
        },
        "apaw_models": {
          "type": "array",
          "description": "APAW pipeline models included in comparison",
          "items": {
            "type": "object",
            "properties": {
              "name": {
                "type": "string"
              },
              "provider": {
                "type": "string"
              },
              "benchmarks": {
                "type": "object"
              }
            }
          }
        },
        "apaw_best": {
          "type": "object",
          "description": "Best APAW model per benchmark",
          "additionalProperties": {
            "type": "object",
            "properties": {
              "model": {
                "type": "string"
              },
              "score": {
                "type": "number"
              },
              "gap_to_closed": {
                "type": [
                  "number",
                  "string"
                ]
              }
            }
          }
        },
        "closed_best": {
          "type": "object",
          "description": "Best closed-source model per benchmark",
          "additionalProperties": {
            "type": "object",
            "properties": {
              "model": {
                "type": "string"
              },
              "score": {
                "type": "number"
              }
            }
          }
        },
        "summary": {
          "type": "object",
          "properties": {
            "apaw_avg_score": {
              "type": "number"
            },
            "closed_avg_score": {
              "type": "number"
            },
            "coverage_gap": {
              "type": "string"
            }
          }
        }
      }
    }
  }
}