DeepSeek-Math/evaluation/summarize_results.py

121 lines
5.1 KiB
Python
Raw Normal View History

2024-02-06 02:27:40 +00:00
import os
import json
import argparse
from glob import glob
from copy import deepcopy
def seek_metrics(path):
    """Yield every file path at or below *path* that mentions ``metrics.json``.

    Directories are expanded recursively through ``glob`` (so dot-prefixed
    entries are not visited). A non-directory path is yielded when the
    substring ``"metrics.json"`` occurs anywhere in the path string.
    """
    # Guard clause: anything that is not a directory is a leaf candidate.
    if not os.path.isdir(path):
        if "metrics.json" in path:
            yield path
        return

    pattern = os.path.join(path, "*")
    for child in glob(pattern):
        for match in seek_metrics(child):
            yield match
def seek_predictions(path):
    """Yield every file path at or below *path* that mentions ``predictions.json``.

    The tree is walked depth-first in ``glob`` order. A non-directory path
    is yielded when the substring ``"predictions.json"`` occurs anywhere in
    the path string.
    """
    pending = [path]
    while pending:
        current = pending.pop()
        if os.path.isdir(current):
            # Push children reversed so they pop in glob order, which keeps
            # the same pre-order traversal as a recursive walk.
            children = glob(os.path.join(current, "*"))
            pending.extend(reversed(children))
        elif "predictions.json" in current:
            yield current
def aggregate_metrics(paths):
    """Combine per-shard metric files into one sample-weighted average.

    Each file in *paths* must hold a JSON object with an ``n_samples`` entry;
    every other entry is treated as a numeric metric and weighted by that
    file's ``n_samples``.

    Args:
        paths: Iterable of paths to metric JSON files.

    Returns:
        A dict mapping each metric name to its weighted mean over all files,
        plus ``n_samples`` holding the total sample count. When the total is
        zero the metric values are reported as ``0.0`` instead of dividing
        by zero; with no files at all the result is ``{"n_samples": 0}``.

    Raises:
        OSError: A file cannot be opened.
        ValueError: A file does not contain valid JSON.
        KeyError: A file has no ``n_samples`` entry.
    """
    weighted_sums = {}
    total = 0
    for path in paths:
        # Context manager so the handle is closed even if parsing fails.
        with open(path, "r") as handle:
            metric = json.load(handle)
        n_samples = metric['n_samples']
        total += n_samples
        for key, val in metric.items():
            if key != 'n_samples':
                weighted_sums[key] = weighted_sums.get(key, 0) + val * n_samples

    # All-empty shards would otherwise raise ZeroDivisionError here.
    result = {
        key: (val / total if total else 0.0)
        for key, val in weighted_sums.items()
    }
    result['n_samples'] = total
    return result
def aggregate_predictions(paths):
    """Concatenate the prediction lists stored in several JSON files.

    Loading is best-effort: a file that cannot be opened or parsed is
    reported by printing its path, and the remaining files are still
    processed.

    Args:
        paths: Iterable of paths to JSON files, each expected to hold a list.

    Returns:
        A single list with the items of every readable file, in input order.
    """
    data = []
    for path in paths:
        try:
            # Context manager closes the handle even when parsing fails.
            with open(path, "r") as handle:
                data.extend(json.load(handle))
        except Exception:
            # Deliberately broad to keep the skip-and-report behaviour, but
            # no longer a bare ``except`` so KeyboardInterrupt and SystemExit
            # still propagate.
            print(path, flush=True)
    return data
def _split_by_task(paths):
    """Bucket *paths* into 'cot' (path contains 'cot') and 'tool' (all others)."""
    buckets = {'cot': [], 'tool': []}
    for path in paths:
        buckets['cot' if 'cot' in path else 'tool'].append(path)
    return buckets


def _dump_json(obj, path, **dump_kwargs):
    """Write *obj* to *path* as 4-space-indented JSON and close the file."""
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(obj, handle, indent=4, **dump_kwargs)


def main():
    """Aggregate per-shard evaluation output and write summary files.

    The expected layout is ``<dirname>/<model>/<dataset>/``. For every
    dataset directory this function:

    1. Creates ``infer_logs`` if it is missing and moves the dataset's
       existing non-hidden entries into it.
    2. Collects metric and prediction files under ``infer_logs`` and splits
       them into the ``cot`` and ``tool`` tasks by substring match on the
       path.
    3. Writes the aggregated ``metrics.json`` and ``predictions.json`` to
       ``results/<task>/``.
    4. With ``--eval-atp``, for datasets whose name contains both
       ``minif2f`` and ``isabelle`` and that have predictions, launches
       ``unsafe_score_minif2f_isabelle.py`` unless a ``metrics.json.eval``
       file with a non-zero ``n_samples`` already exists, in which case that
       file's content is used as the task's metrics.

    Finally the nested ``model -> dataset -> task -> metrics`` mapping is
    written to ``evaluation_results.json`` in the current directory.
    """
    # Imported here because they are not part of the module's import block.
    import shutil
    import subprocess
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("--dirname", type=str, default="outputs")
    parser.add_argument("--eval-atp", action='store_true')
    parser.add_argument("--isa-path", type=str, default="")
    parser.add_argument("--theory-file", type=str, default="")
    # args.working_dir is read when launching the scorer but was never
    # declared, which raised AttributeError on that branch.
    parser.add_argument("--working-dir", type=str, default="")
    args = parser.parse_args()

    model2dataset2task2metric = {}
    for model in os.listdir(args.dirname):
        subdir = os.path.join(args.dirname, model)
        if not os.path.isdir(subdir):
            # Stray files next to the model directories are not results.
            continue
        model2dataset2task2metric[model] = {}

        for dataset in os.listdir(subdir):
            dataset_dir = os.path.join(subdir, dataset)
            if not os.path.isdir(dataset_dir):
                continue
            log_dir = os.path.join(dataset_dir, "infer_logs")
            agg_dirname = os.path.join(dataset_dir, "results")

            if not os.path.exists(log_dir):
                os.makedirs(log_dir, exist_ok=True)
                # Same selection as the shell pattern ``*``: hidden entries
                # stay put. infer_logs itself is skipped, and no shell is
                # involved, so names with spaces or metacharacters are safe.
                for name in os.listdir(dataset_dir):
                    if name.startswith(".") or name == "infer_logs":
                        continue
                    shutil.move(os.path.join(dataset_dir, name), log_dir)

            task2metric_paths = _split_by_task(seek_metrics(log_dir))
            task2pred_paths = _split_by_task(seek_predictions(log_dir))
            task2metric = {
                task: aggregate_metrics(paths)
                for task, paths in task2metric_paths.items()
            }
            task2pred = {
                task: aggregate_predictions(paths)
                for task, paths in task2pred_paths.items()
            }
            # The summary shares this dict object, so later changes to
            # task2metric[task] are visible through the summary as well.
            model2dataset2task2metric[model][dataset] = task2metric

            for task in task2metric:
                task_dirname = os.path.join(agg_dirname, task)
                os.makedirs(task_dirname, exist_ok=True)
                metric_path = os.path.join(task_dirname, "metrics.json")
                pred_path = os.path.join(task_dirname, "predictions.json")

                if 'math6' in dataset.lower() and task == 'cot':
                    data_to_score = []
                    for pred in task2pred[task]:
                        item = deepcopy(pred['metadata'])
                        item['model_answer_turns_1'] = pred['turns'][0]['model_output']
                        item['model_answer_turns_2'] = pred['turns'][1]['model_output']
                        data_to_score.append(item)
                    # NOTE(review): math6_score is neither defined nor
                    # imported in this file; this call raises NameError
                    # unless the name is provided elsewhere. Confirm where
                    # it is supposed to come from.
                    _metrics = math6_score(data_to_score)
                    task2metric[task].update(_metrics)

                _dump_json(task2metric[task], metric_path)
                _dump_json(task2pred[task], pred_path)

                wants_atp = (
                    'minif2f' in dataset.lower()
                    and 'isabelle' in dataset.lower()
                    and task2pred[task]
                    and args.eval_atp
                )
                if not wants_atp:
                    continue

                eval_path = metric_path + ".eval"
                if os.path.exists(eval_path):
                    with open(eval_path, "r") as handle:
                        cached = json.load(handle)
                    if cached.get('n_samples', 0):
                        # A finished evaluation exists; reuse it.
                        model2dataset2task2metric[model][dataset][task] = cached
                        continue

                print(f"Running minif2f-isabelle evaluation on {dataset} ...", flush=True)
                print(f"Predictions >>> {pred_path}", flush=True)
                # Argument list instead of a shell string: values are passed
                # verbatim and cannot be split or interpreted by a shell.
                scorer_env = dict(
                    os.environ, PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION="python"
                )
                subprocess.run(
                    [
                        sys.executable, "unsafe_score_minif2f_isabelle.py",
                        "--isa-path", args.isa_path,
                        "--theory-file", args.theory_file,
                        "--working-dir", args.working_dir,
                        "--port", "9000",
                        "--output", pred_path,
                    ],
                    env=scorer_env,
                    check=False,  # exit status was ignored before as well
                )

    _dump_json(model2dataset2task2metric, "evaluation_results.json", ensure_ascii=False)
# Run the summary only when executed as a script, not when imported.
if __name__ == "__main__":
    main()