- Reassign 29/30 agents based on capability-analyst web research
  - deepseek-v4-pro: 14 agents (coding SOTA: SWE-bench 80.6%, LiveCodeBench 93.5%)
  - minimax-m3☁️: 8 agents (agentic: BrowseComp 83.5%, 12h autonomous)
  - glm-5.1: 4 agents (CyberGym 68.7% SOTA, sustained rounds)
  - minimax-m2.5☁️: 2 agents (frontend productivity, 2.2M pulls)
  - kimi-k2.6: 1 agent (ONLY true multimodal)
- Add OpenCompass evaluation container (docker, scripts) for future objective runs
- Evidence saved to agent-evolution/data/research-report.json (598 lines, 6 models)

Data gaps honestly documented: minimax-m3/m2.5, qwen3-coder, kimi-k2.6 benchmark tables are image-only on Ollama.
80 lines
1.7 KiB
Bash
Executable File
80 lines
1.7 KiB
Bash
Executable File
#!/usr/bin/env bash
set -euo pipefail

# OpenCompass evaluation wrapper for Ollama models.
#
# Usage: /eval.sh --model MODEL_ID --datasets DATASET_LIST --output OUTPUT_FILE
#
# Environment:
#   OLLAMA_API_URL  Base URL of the Ollama API (default: http://ollama:11434).

# Command-line settings. They start empty so that, under `set -u`, later code
# can test them even when the matching option was never given.
MODEL=""     # value of --model
DATASETS=""  # space-separated dataset names given after --datasets
OUTPUT=""    # value of --output

#######################################
# Print the command-line help text and terminate the script.
# Arguments:
#   None.
# Outputs:
#   Writes the usage line and one example invocation to stdout; $0 is
#   expanded so the text shows the name the script was started with.
# Returns:
#   Never returns; always exits with status 1.
#######################################
usage() {
  cat <<EOF
Usage: $0 --model MODEL_ID --datasets DATASET_LIST --output OUTPUT_FILE

Example:
  $0 --model ollama-cloud/deepseek-v4-pro --datasets mmlu hellaswag gsm8k --output /data/results.json
EOF
  exit 1
}

#######################################
# Parse the command-line options into the MODEL, DATASETS and OUTPUT globals.
# Globals:
#   MODEL, DATASETS, OUTPUT (written)
# Arguments:
#   The script's command-line arguments.
# Outputs:
#   Error messages on stderr for unknown options or missing option values.
# Returns:
#   0 once every argument has been consumed; invalid input is handed to
#   usage, which terminates the script.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --model)
        # Check explicitly: a bare `shift 2` with one argument left fails and
        # `set -e` would end the script without any message.
        if [[ $# -lt 2 ]]; then
          echo "Error: --model requires a value." >&2
          usage
        fi
        MODEL="$2"
        shift 2
        ;;
      --datasets)
        shift
        # Dataset names run up to the next option (anything starting with
        # "-"), so flags such as --output may follow the list. The names are
        # stored space-separated.
        DATASETS=""
        while [[ $# -gt 0 && "$1" != -* ]]; do
          DATASETS+="${DATASETS:+ }$1"
          shift
        done
        ;;
      --output)
        if [[ $# -lt 2 ]]; then
          echo "Error: --output requires a value." >&2
          usage
        fi
        OUTPUT="$2"
        shift 2
        ;;
      --help | -h)
        usage
        ;;
      *)
        echo "Unknown option: $1" >&2
        usage
        ;;
    esac
  done
}

parse_args "$@"

# The model to evaluate and the result file are mandatory; stop with the help
# text when either one is missing. DATASETS is not checked here.
if [[ -z "$MODEL" || -z "$OUTPUT" ]]; then
  echo "Error: --model and --output are required." >&2
  usage
fi

# Base URL of the Ollama API; a non-empty OLLAMA_API_URL from the environment
# takes precedence over the default.
OLLAMA_API_URL="${OLLAMA_API_URL:-http://ollama:11434}"

# Verify Ollama connectivity
echo "Checking Ollama API at ${OLLAMA_API_URL} ..."

# Report a missing wget on its own; otherwise the probe below would fail and
# wrongly blame the Ollama server.
if ! command -v wget >/dev/null 2>&1; then
  echo "Error: wget is required for the Ollama connectivity check." >&2
  exit 1
fi

# --spider only checks that the URL answers; nothing is downloaded.
if ! wget -q --spider "${OLLAMA_API_URL}/api/tags"; then
  echo "Error: Ollama not reachable at ${OLLAMA_API_URL}" >&2
  exit 1
fi

# Echo the effective configuration so it appears in the run log.
printf '%s\n' "Model: ${MODEL}"
printf '%s\n' "Datasets: ${DATASETS}"
printf '%s\n' "Output: ${OUTPUT}"

# Setup datasets if needed: /setup.sh is optional and only run when it exists
# and is executable. Under `set -e` a failing setup aborts the evaluation.
if [[ -x /setup.sh ]]; then
  /setup.sh
fi

#######################################
# Run OpenCompass with Ollama backend via OpenAI-compatible API.
# Globals:
#   MODEL, DATASETS, OUTPUT, OLLAMA_API_URL (read)
# Arguments:
#   None.
# Outputs:
#   The stdout of opencompass, shown on stdout and also written to OUTPUT
#   through tee.
# Returns:
#   The status of the opencompass | tee pipeline.
#######################################
run_opencompass() {
  local -a cmd=(opencompass --models ollama_api)
  local -a dataset_names=()

  # DATASETS is a space-separated list. Splitting it with read gives one
  # argument per dataset without the pathname expansion an unquoted
  # ${DATASETS} would be subject to.
  read -r -a dataset_names <<<"${DATASETS}"
  if ((${#dataset_names[@]} > 0)); then
    cmd+=(--datasets "${dataset_names[@]}")
  fi
  # With no datasets the flag is left out entirely, so a bare --datasets
  # cannot swallow the option that follows it.

  cmd+=(
    --work-dir /data
    --max-num-workers 1
    --cfg-options
    # Quoted as one word: unquoted parentheses are shell metacharacters and
    # make bash reject the command with a syntax error.
    # NOTE(review): confirm that OpenCompass accepts this dict(...) form for
    # --cfg-options and whether the values need embedded quotes.
    "model=dict(path=${MODEL},openai_api_base=${OLLAMA_API_URL}/v1)"
  )

  "${cmd[@]}" | tee "${OUTPUT}"
}

run_opencompass

echo "Evaluation complete. Results written to ${OUTPUT}"