DeepSeek-Math/evaluation/evaluation_results.json
ZhihongShao 21cc5c6701 init
2024-02-06 10:27:40 +08:00

204 lines
5.1 KiB
JSON

{
"DeepSeekMath-Base": {
"OCWCourses": {
"cot": {
"accuracy": 0.15441176470588236,
"n_samples": 272
},
"tool": {
"n_samples": 0
}
},
"cmath-cot-test": {
"cot": {
"accuracy": 0.7167577413479053,
"n_samples": 1098
},
"tool": {
"n_samples": 0
}
},
"miniF2F-Isabelle-test": {
"cot": {
"accuracy": 1.0,
"n_samples": 244
},
"tool": {
"n_samples": 0
}
},
"gsm8k-cot-test": {
"cot": {
"accuracy": 0.6421531463229719,
"n_samples": 1319
},
"tool": {
"n_samples": 0
}
},
"MMLU-STEM-test": {
"cot": {
"accuracy": 0.5646123260437376,
"n_samples": 3018
},
"tool": {
"n_samples": 0
}
},
"agieval-gaokao-mathqa-cot-test": {
"cot": {
"accuracy": 0.35327635327635326,
"n_samples": 351
},
"tool": {
"n_samples": 0
}
},
"agieval-gaokao-mathcloze-cot-test": {
"cot": {
"accuracy": 0.2033898305084746,
"n_samples": 118
},
"tool": {
"n_samples": 0
}
},
"gsm8k-pal-test": {
"cot": {
"n_samples": 0
},
"tool": {
"accuracy": 0.66868840030326,
"n_samples": 1319
}
},
"math_sat": {
"cot": {
"accuracy": 0.84375,
"n_samples": 32
},
"tool": {
"n_samples": 0
}
},
"miniF2F-Isabelle-valid": {
"cot": {
"accuracy": 1.0,
"n_samples": 244
},
"tool": {
"n_samples": 0
}
},
"math-pal-test": {
"cot": {
"n_samples": 0
},
"tool": {
"accuracy": 0.3142,
"n_samples": 5000
}
},
"math-cot-test": {
"cot": {
"accuracy": 0.3618,
"n_samples": 5000
},
"tool": {
"n_samples": 0
}
}
},
"DeepSeekMath-RL": {
"mgsm-zh": {
"cot": {
"accuracy": 0.796,
"n_samples": 250
},
"tool": {
"accuracy": 0.784,
"program_accuracy": 0.776,
"n_samples": 250
}
},
"cmath": {
"cot": {
"accuracy": 0.8879781420765027,
"n_samples": 1098
},
"tool": {
"accuracy": 0.8761384335154827,
"program_accuracy": 0.8570127504553734,
"n_samples": 1098
}
},
"math-test": {
"cot": {
"accuracy": 0.517,
"n_samples": 5000
},
"tool": {
"accuracy": 0.5878,
"program_accuracy": 0.509,
"n_samples": 5000
}
},
"gsm8k-test": {
"cot": {
"accuracy": 0.8824867323730099,
"n_samples": 1319
},
"tool": {
"accuracy": 0.866565579984837,
"program_accuracy": 0.868081880212282,
"n_samples": 1319
}
}
},
"DeepSeekMath-Instruct": {
"gsm8k-test": {
"cot": {
"accuracy": 0.8286580742987112,
"n_samples": 1319
},
"tool": {
"accuracy": 0.8369977255496588,
"program_accuracy": 0.8332069749810462,
"n_samples": 1319
}
},
"math-test": {
"cot": {
"accuracy": 0.4682,
"n_samples": 5000
},
"tool": {
"accuracy": 0.575,
"program_accuracy": 0.4664,
"n_samples": 5000
}
},
"cmath": {
"cot": {
"accuracy": 0.8460837887067395,
"n_samples": 1098
},
"tool": {
"accuracy": 0.843351548269581,
"program_accuracy": 0.8214936247723132,
"n_samples": 1098
}
},
"mgsm-zh": {
"cot": {
"accuracy": 0.732,
"n_samples": 250
},
"tool": {
"accuracy": 0.72,
"program_accuracy": 0.716,
"n_samples": 250
}
}
}
}