DeepSeek-Math/evaluation/summarize_results.py

import os
import json
import argparse

from glob import glob
from copy import deepcopy

def seek_metrics(path):
    if os.path.isdir(path):
        for subpath in glob(os.path.join(path, "*")):
            yield from seek_metrics(subpath)
    else:
        if "metrics.json" in path:
            yield path

def seek_predictions(path):
    if os.path.isdir(path):
        for subpath in glob(os.path.join(path, "*")):
            yield from seek_predictions(subpath)
    else:
        if "predictions.json" in path:
            yield path

def aggregate_metrics(paths):
    result = {}
    total = 0
    for path in paths:
        metric = json.load(open(path, "r"))
        n_samples = metric['n_samples']
        total += n_samples
        for key, val in metric.items():
            if key != 'n_samples':
                result[key] = result.get(key, 0) + val * n_samples
    for key, val in result.items():
        result[key] = val / total
    result['n_samples'] = total
    return result

def aggregate_predictions(paths):
    data = []
    for path in paths:
        try:
            data.extend(json.load(open(path, "r")))
        except:
            print(path, flush=True)
            continue
    return data

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dirname", type=str, default="outputs")
    parser.add_argument("--eval-atp", action='store_true')
    parser.add_argument("--isa-path", type=str, default="")
    parser.add_argument("--theory-file", type=str, default="")
    args = parser.parse_args()

    model2dataset2task2metric = {}
    for model in os.listdir(args.dirname):
        model2dataset2task2metric[model] = {}
        subdir = os.path.join(args.dirname, model)
        for dataset in os.listdir(subdir):
            log_dir = os.path.join(subdir, dataset, "infer_logs")
            agg_dirname = os.path.join(subdir, dataset, "results")
            if not os.path.exists(log_dir):
                os.makedirs(log_dir, exist_ok=True)
                os.system(f"mv {subdir}/{dataset}/* {log_dir}")
            metric_paths = list(seek_metrics(log_dir))
            pred_paths = list(seek_predictions(log_dir))
            task2metric_paths = {'cot': [], 'tool': []}
            task2pred_paths = {'cot': [], 'tool': []}
            for path in metric_paths:
                if 'cot' in path:
                    task2metric_paths['cot'].append(path)
                else:
                    task2metric_paths['tool'].append(path)
            for path in pred_paths:
                if 'cot' in path:
                    task2pred_paths['cot'].append(path)
                else:
                    task2pred_paths['tool'].append(path)
            task2metric = {task: aggregate_metrics(paths) for task, paths in task2metric_paths.items()}
            task2pred = {task: aggregate_predictions(paths) for task, paths in task2pred_paths.items()}
            model2dataset2task2metric[model][dataset] = task2metric

            for task in task2metric:
                task_dirname = os.path.join(agg_dirname, task)
                os.makedirs(task_dirname, exist_ok=True)
                metric_path = os.path.join(task_dirname, "metrics.json")
                pred_path = os.path.join(task_dirname, "predictions.json")
                if 'math6' in dataset.lower() and task == 'cot':
                    data_to_score = []
                    for pred in task2pred[task]:
                        item = deepcopy(pred['metadata'])
                        item['model_answer_turns_1'] = pred['turns'][0]['model_output']
                        item['model_answer_turns_2'] = pred['turns'][1]['model_output']
                        data_to_score.append(item)
                    _metrics = math6_score(data_to_score)
                    task2metric[task].update(_metrics)
                    model2dataset2task2metric[model][dataset][task].update(_metrics)
                json.dump(task2metric[task], open(metric_path, "w"), indent=4)
                json.dump(task2pred[task], open(pred_path, "w"), indent=4)
                if 'minif2f' in dataset.lower() and 'isabelle' in dataset.lower() and task2pred[task] and args.eval_atp:
                    eval_path = metric_path + ".eval"
                    if os.path.exists(eval_path) and json.load(open(eval_path, "r")).get('n_samples', 0):
                        model2dataset2task2metric[model][dataset][task] = json.load(open(eval_path, "r"))
                        continue
                    print(f"Running minif2f-isabelle evaluation on {dataset} ...", flush=True)
                    print(f"Predictions >>> {pred_path}", flush=True)
                    cmd = f"PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python python unsafe_score_minif2f_isabelle.py " \
                        f"--isa-path {args.isa_path} " \
                        f"--theory-file {args.theory_file} " \
                        f"--working-dir {args.working_dir} " \
                        f"--port 9000 " \
                        f"--output {pred_path} "
                    os.system(cmd)

    json.dump(model2dataset2task2metric, open("evaluation_results.json", "w"), indent=4, ensure_ascii=False)

if __name__ == '__main__':
    main()