mirror of
https://github.com/deepseek-ai/DeepSeek-Math
synced 2024-11-24 13:05:27 +00:00
121 lines
5.1 KiB
Python
121 lines
5.1 KiB
Python
|
import os
|
||
|
import json
|
||
|
import argparse
|
||
|
|
||
|
from glob import glob
|
||
|
from copy import deepcopy
|
||
|
|
||
|
def seek_metrics(path):
    """Recursively yield paths under ``path`` that contain "metrics.json".

    A directory is expanded through ``glob`` with a ``*`` pattern and each
    match is searched in turn. Anything that is not a directory is yielded
    when the substring "metrics.json" occurs anywhere in its path string, so
    ``path`` itself may also be a single file.
    """
    if not os.path.isdir(path):
        # Leaf entry: plain substring test on the whole path string.
        if "metrics.json" in path:
            yield path
        return
    for child in glob(os.path.join(path, "*")):
        yield from seek_metrics(child)
|
||
|
|
||
|
def seek_predictions(path):
    """Recursively yield paths under ``path`` that contain "predictions.json".

    A directory is expanded through ``glob`` with a ``*`` pattern and each
    match is searched in turn. Anything that is not a directory is yielded
    when the substring "predictions.json" occurs anywhere in its path string,
    so ``path`` itself may also be a single file.
    """
    if not os.path.isdir(path):
        # Leaf entry: plain substring test on the whole path string.
        if "predictions.json" in path:
            yield path
        return
    for child in glob(os.path.join(path, "*")):
        yield from seek_predictions(child)
|
||
|
|
||
|
def aggregate_metrics(paths):
    """Combine per-shard metric files into one sample-weighted average.

    Each file is a JSON object holding an ``n_samples`` entry. Every other
    entry is treated as a numeric metric: it is multiplied by that file's
    ``n_samples``, summed across files, and divided by the total sample count.

    Args:
        paths: Iterable of paths to metric JSON files.

    Returns:
        A dict mapping each metric name to its weighted mean, plus
        ``n_samples`` with the total sample count. With no input files the
        result is ``{'n_samples': 0}``. When files exist but the total sample
        count is zero, every metric is reported as ``0.0`` instead of raising
        ZeroDivisionError.

    Raises:
        KeyError: If a file has no ``n_samples`` entry.
    """
    weighted_sums = {}
    total = 0
    for path in paths:
        # Context manager so the handle is closed even if parsing fails.
        with open(path, "r") as f:
            metric = json.load(f)
        n_samples = metric['n_samples']
        total += n_samples
        for key, val in metric.items():
            if key != 'n_samples':
                weighted_sums[key] = weighted_sums.get(key, 0) + val * n_samples

    # An average over zero samples is undefined; report 0.0 rather than crash.
    result = {
        key: (val / total if total else 0.0)
        for key, val in weighted_sums.items()
    }
    result['n_samples'] = total
    return result
|
||
|
|
||
|
def aggregate_predictions(paths):
    """Concatenate the JSON lists stored in ``paths`` into a single list.

    This is best-effort: a file that cannot be opened, is not valid JSON, or
    does not hold an iterable is skipped after its path is printed to stdout.
    Interrupts such as KeyboardInterrupt are no longer swallowed.

    Args:
        paths: Iterable of paths to prediction JSON files.

    Returns:
        A list with the items of every readable file, in input order.
    """
    data = []
    for path in paths:
        try:
            # Context manager so the handle is closed even if parsing fails.
            with open(path, "r") as f:
                shard = json.load(f)
            data.extend(shard)
        except (OSError, ValueError, TypeError):
            # OSError: unreadable file; ValueError: malformed JSON or bad
            # encoding; TypeError: top-level JSON value is not iterable.
            print(path, flush=True)
            continue
    return data
|
||
|
|
||
|
def main():
    """Aggregate inference logs for every model/dataset under ``--dirname``.

    For each ``<dirname>/<model>/<dataset>`` directory:

    1. If ``infer_logs`` does not exist yet, it is created and the
       directory's current (non-hidden) entries are moved into it.
    2. Metric and prediction files found under ``infer_logs`` are routed to
       the ``cot`` task when the substring 'cot' occurs in their path, and to
       ``tool`` otherwise, then aggregated per task.
    3. The aggregates are written to ``results/<task>/metrics.json`` and
       ``results/<task>/predictions.json``.
    4. With ``--eval-atp``, datasets whose name contains both 'minif2f' and
       'isabelle' are scored by ``unsafe_score_minif2f_isabelle.py``, unless
       a ``metrics.json.eval`` file with a non-zero ``n_samples`` already
       exists, in which case that file's content is reported instead.

    The nested model -> dataset -> task -> metrics mapping is finally written
    to ``evaluation_results.json`` in the current working directory.
    """
    # Imported locally so this function stays self-contained with respect to
    # the module-level import block.
    import shlex
    import shutil

    parser = argparse.ArgumentParser()
    parser.add_argument("--dirname", type=str, default="outputs")
    parser.add_argument("--eval-atp", action='store_true')
    parser.add_argument("--isa-path", type=str, default="")
    parser.add_argument("--theory-file", type=str, default="")
    # Forwarded to the Isabelle scorer below. It used to be read from ``args``
    # without being declared, which raised AttributeError under --eval-atp.
    parser.add_argument("--working-dir", type=str, default="")
    args = parser.parse_args()

    model2dataset2task2metric = {}
    for model in os.listdir(args.dirname):
        subdir = os.path.join(args.dirname, model)
        if not os.path.isdir(subdir):
            # Stray files next to the model directories are not results.
            continue
        model2dataset2task2metric[model] = {}
        for dataset in os.listdir(subdir):
            dataset_dir = os.path.join(subdir, dataset)
            if not os.path.isdir(dataset_dir):
                continue
            log_dir = os.path.join(dataset_dir, "infer_logs")
            agg_dirname = os.path.join(dataset_dir, "results")
            if not os.path.exists(log_dir):
                # Snapshot the entries before creating infer_logs so that it
                # is never moved into itself. Hidden entries are skipped,
                # matching what the shell glob `*` used to select.
                entries = [
                    name for name in os.listdir(dataset_dir)
                    if not name.startswith(".")
                ]
                os.makedirs(log_dir, exist_ok=True)
                for name in entries:
                    shutil.move(os.path.join(dataset_dir, name), log_dir)

            task2metric_paths = {'cot': [], 'tool': []}
            task2pred_paths = {'cot': [], 'tool': []}
            # Routing is a substring test on the full path (directory names
            # included): anything without 'cot' is treated as 'tool'.
            for path in seek_metrics(log_dir):
                task2metric_paths['cot' if 'cot' in path else 'tool'].append(path)
            for path in seek_predictions(log_dir):
                task2pred_paths['cot' if 'cot' in path else 'tool'].append(path)

            task2metric = {task: aggregate_metrics(paths) for task, paths in task2metric_paths.items()}
            task2pred = {task: aggregate_predictions(paths) for task, paths in task2pred_paths.items()}
            # The summary shares this dict object with task2metric, so
            # updates made through either name are visible through both.
            model2dataset2task2metric[model][dataset] = task2metric

            for task in task2metric:
                task_dirname = os.path.join(agg_dirname, task)
                os.makedirs(task_dirname, exist_ok=True)
                metric_path = os.path.join(task_dirname, "metrics.json")
                pred_path = os.path.join(task_dirname, "predictions.json")

                if 'math6' in dataset.lower() and task == 'cot':
                    data_to_score = []
                    for pred in task2pred[task]:
                        item = deepcopy(pred['metadata'])
                        item['model_answer_turns_1'] = pred['turns'][0]['model_output']
                        item['model_answer_turns_2'] = pred['turns'][1]['model_output']
                        data_to_score.append(item)
                    # NOTE(review): math6_score is not defined or imported in
                    # the visible part of this file; this branch raises
                    # NameError unless it is provided elsewhere. TODO confirm.
                    _metrics = math6_score(data_to_score)
                    task2metric[task].update(_metrics)

                with open(metric_path, "w") as f:
                    json.dump(task2metric[task], f, indent=4)
                with open(pred_path, "w") as f:
                    json.dump(task2pred[task], f, indent=4)

                if 'minif2f' in dataset.lower() and 'isabelle' in dataset.lower() and task2pred[task] and args.eval_atp:
                    eval_path = metric_path + ".eval"
                    if os.path.exists(eval_path):
                        # Parse the cached evaluation once and reuse it.
                        with open(eval_path, "r") as f:
                            cached = json.load(f)
                        if cached.get('n_samples', 0):
                            model2dataset2task2metric[model][dataset][task] = cached
                            continue
                    print(f"Running minif2f-isabelle evaluation on {dataset} ...", flush=True)
                    print(f"Predictions >>> {pred_path}", flush=True)
                    # Arguments are shell-quoted so paths containing spaces or
                    # shell metacharacters reach the scorer intact.
                    cmd = (
                        "PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python unsafe_score_minif2f_isabelle.py "
                        f"--isa-path {shlex.quote(args.isa_path)} "
                        f"--theory-file {shlex.quote(args.theory_file)} "
                        f"--working-dir {shlex.quote(args.working_dir)} "
                        f"--port 9000 "
                        f"--output {shlex.quote(pred_path)} "
                    )
                    os.system(cmd)

    # ensure_ascii=False may emit non-ASCII characters, so pin the encoding.
    with open("evaluation_results.json", "w", encoding="utf-8") as f:
        json.dump(model2dataset2task2metric, f, indent=4, ensure_ascii=False)
|
||
|
|
||
|
# Run the aggregation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|