mirror of
https://github.com/deepseek-ai/DeepSeek-Math
synced 2024-11-24 04:54:00 +00:00
204 lines
5.1 KiB
JSON
204 lines
5.1 KiB
JSON
|
{
|
||
|
"DeepSeekMath-Base": {
|
||
|
"OCWCourses": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.15441176470588236,
|
||
|
"n_samples": 272
|
||
|
},
|
||
|
"tool": {
|
||
|
"n_samples": 0
|
||
|
}
|
||
|
},
|
||
|
"cmath-cot-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.7167577413479053,
|
||
|
"n_samples": 1098
|
||
|
},
|
||
|
"tool": {
|
||
|
"n_samples": 0
|
||
|
}
|
||
|
},
|
||
|
"miniF2F-Isabelle-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 1.0,
|
||
|
"n_samples": 244
|
||
|
},
|
||
|
"tool": {
|
||
|
"n_samples": 0
|
||
|
}
|
||
|
},
|
||
|
"gsm8k-cot-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.6421531463229719,
|
||
|
"n_samples": 1319
|
||
|
},
|
||
|
"tool": {
|
||
|
"n_samples": 0
|
||
|
}
|
||
|
},
|
||
|
"MMLU-STEM-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.5646123260437376,
|
||
|
"n_samples": 3018
|
||
|
},
|
||
|
"tool": {
|
||
|
"n_samples": 0
|
||
|
}
|
||
|
},
|
||
|
"agieval-gaokao-mathqa-cot-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.35327635327635326,
|
||
|
"n_samples": 351
|
||
|
},
|
||
|
"tool": {
|
||
|
"n_samples": 0
|
||
|
}
|
||
|
},
|
||
|
"agieval-gaokao-mathcloze-cot-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.2033898305084746,
|
||
|
"n_samples": 118
|
||
|
},
|
||
|
"tool": {
|
||
|
"n_samples": 0
|
||
|
}
|
||
|
},
|
||
|
"gsm8k-pal-test": {
|
||
|
"cot": {
|
||
|
"n_samples": 0
|
||
|
},
|
||
|
"tool": {
|
||
|
"accuracy": 0.66868840030326,
|
||
|
"n_samples": 1319
|
||
|
}
|
||
|
},
|
||
|
"math_sat": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.84375,
|
||
|
"n_samples": 32
|
||
|
},
|
||
|
"tool": {
|
||
|
"n_samples": 0
|
||
|
}
|
||
|
},
|
||
|
"miniF2F-Isabelle-valid": {
|
||
|
"cot": {
|
||
|
"accuracy": 1.0,
|
||
|
"n_samples": 244
|
||
|
},
|
||
|
"tool": {
|
||
|
"n_samples": 0
|
||
|
}
|
||
|
},
|
||
|
"math-pal-test": {
|
||
|
"cot": {
|
||
|
"n_samples": 0
|
||
|
},
|
||
|
"tool": {
|
||
|
"accuracy": 0.3142,
|
||
|
"n_samples": 5000
|
||
|
}
|
||
|
},
|
||
|
"math-cot-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.3618,
|
||
|
"n_samples": 5000
|
||
|
},
|
||
|
"tool": {
|
||
|
"n_samples": 0
|
||
|
}
|
||
|
}
|
||
|
},
|
||
|
"DeepSeekMath-RL": {
|
||
|
"mgsm-zh": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.796,
|
||
|
"n_samples": 250
|
||
|
},
|
||
|
"tool": {
|
||
|
"accuracy": 0.784,
|
||
|
"program_accuracy": 0.776,
|
||
|
"n_samples": 250
|
||
|
}
|
||
|
},
|
||
|
"cmath": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.8879781420765027,
|
||
|
"n_samples": 1098
|
||
|
},
|
||
|
"tool": {
|
||
|
"accuracy": 0.8761384335154827,
|
||
|
"program_accuracy": 0.8570127504553734,
|
||
|
"n_samples": 1098
|
||
|
}
|
||
|
},
|
||
|
"math-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.517,
|
||
|
"n_samples": 5000
|
||
|
},
|
||
|
"tool": {
|
||
|
"accuracy": 0.5878,
|
||
|
"program_accuracy": 0.509,
|
||
|
"n_samples": 5000
|
||
|
}
|
||
|
},
|
||
|
"gsm8k-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.8824867323730099,
|
||
|
"n_samples": 1319
|
||
|
},
|
||
|
"tool": {
|
||
|
"accuracy": 0.866565579984837,
|
||
|
"program_accuracy": 0.868081880212282,
|
||
|
"n_samples": 1319
|
||
|
}
|
||
|
}
|
||
|
},
|
||
|
"DeepSeekMath-Instruct": {
|
||
|
"gsm8k-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.8286580742987112,
|
||
|
"n_samples": 1319
|
||
|
},
|
||
|
"tool": {
|
||
|
"accuracy": 0.8369977255496588,
|
||
|
"program_accuracy": 0.8332069749810462,
|
||
|
"n_samples": 1319
|
||
|
}
|
||
|
},
|
||
|
"math-test": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.4682,
|
||
|
"n_samples": 5000
|
||
|
},
|
||
|
"tool": {
|
||
|
"accuracy": 0.575,
|
||
|
"program_accuracy": 0.4664,
|
||
|
"n_samples": 5000
|
||
|
}
|
||
|
},
|
||
|
"cmath": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.8460837887067395,
|
||
|
"n_samples": 1098
|
||
|
},
|
||
|
"tool": {
|
||
|
"accuracy": 0.843351548269581,
|
||
|
"program_accuracy": 0.8214936247723132,
|
||
|
"n_samples": 1098
|
||
|
}
|
||
|
},
|
||
|
"mgsm-zh": {
|
||
|
"cot": {
|
||
|
"accuracy": 0.732,
|
||
|
"n_samples": 250
|
||
|
},
|
||
|
"tool": {
|
||
|
"accuracy": 0.72,
|
||
|
"program_accuracy": 0.716,
|
||
|
"n_samples": 250
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|