# DeepSeek-Math/evaluation/eval/eval_utils.py

import multiprocessing
from math import isclose
import numpy as np
from typing import Union, Any, Dict

from sympy import simplify, N
from sympy.parsing.sympy_parser import parse_expr
from sympy.parsing.latex import parse_latex
import re
import regex

from data_processing.answer_extraction import extract_answer, extract_program_output, strip_string

def extract_program(result: str, last_only=True):
    """
    Collect the lines found between a "```python" fence and the next "```" fence.

    With ``last_only=True`` every new opening fence discards what was gathered
    so far, so only the final program is returned. Otherwise all programs are
    concatenated, each one preceded by a "# ========" separator line.
    An opening fence that is never closed still contributes its lines.
    """
    pieces = []
    inside = False
    for row in result.split("\n"):
        if row.startswith("```python"):
            if last_only:
                pieces.clear()  # keep only the most recent program
            else:
                pieces.append("\n# ========\n")
            inside = True
            continue
        if row.startswith("```"):
            inside = False
            continue
        if inside:
            pieces.append(row + "\n")
    return "".join(pieces)


def parse_ground_truth(example: Dict[str, Any], data_name):
    """
    Return ``(gt_cot, gt_ans)`` for one dataset record.

    If the record already has a 'gt_cot' key, it is returned as-is together
    with ``strip_string(example['gt'])``. Otherwise the fields to read are
    chosen by ``data_name``; the rationale is converted with ``str(...).strip()``
    and the answer is passed through ``strip_string``.

    Raises:
        NotImplementedError: if ``data_name`` is not one of the handled datasets.
    """
    if 'gt_cot' in example:
        return example['gt_cot'], strip_string(example['gt'])

    numeric_kinds = ['integer_number', 'decimal_number']

    if data_name in ("math", "ocw"):
        gt_cot = example['solution']
        gt_ans = extract_answer(gt_cot)
    elif data_name == "gsm8k":
        # rationale and final answer are separated by "####"
        gt_cot, gt_ans = example['answer'].split("####")
    elif data_name == "gsm-hard":
        gt_cot = example['code']
        gt_ans = example['target']
    elif data_name == "svamp":
        gt_cot = example['Equation']
        gt_ans = example['Answer']
    elif data_name == "asdiv":
        gt_cot = example['formula']
        # drop any parenthesised span from the answer text
        gt_ans = re.sub(r"\(.*?\)", "", example['answer'])
    elif data_name in ("mawps", "bbh"):
        gt_cot = None
        gt_ans = example['target']
    elif data_name == "tabmwp":
        gt_cot = example['solution']
        gt_ans = example['answer']
        if example['ans_type'] in numeric_kinds:
            if '/' in gt_ans:
                fields = gt_ans.split('/')
                gt_ans = int(fields[0]) / int(fields[1])
            elif ',' in gt_ans:
                gt_ans = float(gt_ans.replace(',', ''))
            elif '%' in gt_ans:
                gt_ans = float(gt_ans.split('%')[0]) / 100
            else:
                gt_ans = float(gt_ans)
    else:
        raise NotImplementedError(data_name)

    # post process
    return str(gt_cot).strip(), strip_string(gt_ans)


def parse_question(example, data_name):
    """
    Build the question text for one record and return it stripped.

    "asdiv", "svamp" and "tabmwp" records are assembled from several fields.
    Every other dataset uses the first key present among 'question',
    'problem', 'Question' and 'input'. An empty question triggers the assert.
    """
    if data_name == "asdiv":
        question = f"{example['body'].strip()} {example['question'].strip()}"
    elif data_name == "svamp":
        body = example["Body"].strip()
        # make sure the body ends with a full stop before appending the question
        body = body if body.endswith(".") else body + "."
        question = f'{body} {example["Question"].strip()}'
    elif data_name == "tabmwp":
        title = example['table_title']
        title_str = f'regarding "{title}" ' if title else ""
        pieces = [
            f'Read the following table {title_str}and answer a question:\n',
            f'{example["table"]}\n{example["question"]}',
        ]
        if example['choices']:
            pieces.append(f' Please select from the following options: {example["choices"]}')
        question = "".join(pieces)
    else:
        candidate_keys = ('question', 'problem', 'Question', 'input')
        question = next((example[key] for key in candidate_keys if key in example), "")
    assert question != ""
    return question.strip()


def run_execute(executor, result, prompt_type, execute=False):
    """
    Turn a raw model output into ``(prediction, report)``.

    Returns ``(None, None)`` when ``result`` is falsy or equals 'error'.
    Otherwise the prediction comes from one of three sources:
      * ``extract_program_output(result)`` when "program_only" occurs in ``prompt_type``;
      * ``executor.apply(code)`` on the extracted program when ``prompt_type`` is
        "pot" or "pal" and ``execute`` is truthy (this also supplies ``report``);
      * ``extract_answer(result)`` in every other case.
    The prediction is passed through ``strip_string`` before being returned.
    """
    if not result or result == 'error':
        return None, None

    report = None
    if "program_only" in prompt_type:
        raw_prediction = extract_program_output(result)
    elif prompt_type in ["pot", "pal"] and execute:
        raw_prediction, report = executor.apply(extract_program(result))
    else:
        raw_prediction = extract_answer(result)

    return strip_string(raw_prediction), report


def parse_digits(num):
    """
    Convert a number-like value to ``float``; return ``None`` if it is not numeric.

    Accepted formats (after ``str(num)`` and removal of every comma):
      * anything ``float()`` accepts, e.g. ``234.23`` or ``"1,234.5"``;
      * a percentage such as ``"23%"`` or the LaTeX-escaped ``"23\\%"``,
        which is returned divided by 100.
    """
    text = str(num).replace(',', '')
    try:
        return float(text)
    except ValueError:
        # not a plain number; fall through to the percentage form
        pass

    if text.endswith('%'):
        text = text[:-1]
        if text.endswith('\\'):
            # LaTeX writes the percent sign as "\%"
            text = text[:-1]
        try:
            return float(text) / 100
        except ValueError:
            pass
    return None

def is_digit(num):
    """Return True exactly when ``parse_digits(num)`` yields a value (is not None)."""
    # paired with parse_digits
    parsed = parse_digits(num)
    return parsed is not None


def normalize_prediction(prediction):
    """
    Return a normalised string form of ``prediction``.

    1. Numeric step: when ``is_digit(prediction)`` holds, the value is converted
       with ``float`` (commas removed) and rounded to 6 decimals. The step then
       returns ``str(prediction)`` whether or not the value was numeric.
    2. Symbolic step: reached only when the numeric step raises, e.g. for
       "23%", which ``is_digit`` accepts but ``float`` rejects. Outer ``[]`` / ``()``
       pairs are peeled off, comma-separated parts are normalised recursively,
       the brackets are restored, the text is parsed with ``parse_latex`` /
       ``parse_expr`` where possible, and ``{``, ``}``, ``(``, ``)`` are removed
       from the resulting string.

    NOTE(review): because the ``return`` in step 1 is unconditional, non-numeric
    input never reaches step 2. That looks like a mis-indented ``return``, but
    it is kept as-is here since moving it would change the output for every
    non-numeric prediction. Confirm the intended behaviour against the callers.
    """
    try:  # 1. numerical equal
        if is_digit(prediction):
            prediction = np.round(float(str(prediction).replace(",", "")), 6)
        return str(prediction)
    except Exception:
        # best effort: fall through to the symbolic normalisation below
        pass

    # 2. symbolic equal
    prediction = str(prediction).strip()

    ## deal with [], ()
    brackets = []
    while (prediction.startswith("[") and prediction.endswith("]")) or \
            (prediction.startswith("(") and prediction.endswith(")")):
        # remember each peeled bracket so it can be restored afterwards
        brackets.append(prediction[0])
        prediction = prediction[1:-1]
    if brackets and ',' in prediction:
        pred_parts = [normalize_prediction(part) for part in prediction.split(",")]
        prediction = ",".join(pred_parts)

    # restore the brackets innermost-first
    for opener in reversed(brackets):
        if opener == '[':
            prediction = '[' + prediction + ']'
        else:
            prediction = '(' + prediction + ')'

    def _parse(s):
        # try LaTeX first, then plain sympy syntax; keep the text if both fail
        for f in [parse_latex, parse_expr]:
            try:
                return f(s)
            except Exception:
                pass
        return s

    # A successful parse returns a SymPy object, whose .replace() is not
    # str.replace(); convert to text before stripping characters.
    prediction = str(_parse(prediction))

    for s in ['{', "}", "(", ")"]:
        prediction = prediction.replace(s, "")

    return prediction


def math_equal(prediction: Union[bool, float, str],
                reference: Union[float, str],
                include_percentage: bool = True,
                is_close: bool = True,
                timeout: bool = False,
                ) -> bool:
    """
    Exact match of math if and only if:
    1. numerical equal: both can convert to float and are equal
    2. symbolic equal: both can convert to sympy expression and are equal

    The checks run in this order:
      * identical string forms;
      * numeric comparison via ``parse_digits``; with ``include_percentage`` the
        reference is also tried divided and multiplied by 100, and with
        ``is_close`` the comparison uses ``isclose(..., abs_tol=1e-3)``;
      * element-wise comparison of ``(...)`` / ``[...]`` lists split on ",";
      * cell-wise comparison of pmatrix / bmatrix bodies;
      * equations: both sides with one "=" are compared as ``lhs - (rhs)`` (up to
        sign); a one-sided "x = value" with a left side of at most two
        characters is compared by its right-hand side;
      * a final symbolic comparison of the whole strings.

    Args:
        prediction: the answer to check.
        reference: the ground-truth answer.
        include_percentage: also accept ``reference / 100`` and ``reference * 100``.
        is_close: use a tolerance for numeric comparison instead of ``==``.
        timeout: run every symbolic comparison through ``call_with_timeout`` in a
            separate process. The flag is passed on to nested comparisons.

    Returns:
        True if any check succeeds, otherwise False.
    """
    if str(prediction) == str(reference):
        return True

    def _recurse(pred_part, ref_part):
        # Compare sub-expressions with the same options, including `timeout`.
        return math_equal(pred_part, ref_part, include_percentage, is_close, timeout)

    def _symbolic(lhs, rhs):
        # Every sympy comparison goes through here so `timeout` is always applied.
        if timeout:
            return call_with_timeout(symbolic_equal_process, lhs, rhs)
        return symbolic_equal(lhs, rhs)

    try:  # 1. numerical equal
        if is_digit(prediction) and is_digit(reference):
            prediction = parse_digits(prediction)
            reference = parse_digits(reference)
            # number questions
            if include_percentage:
                gt_result = [reference / 100, reference, reference * 100]
            else:
                gt_result = [reference]
            for item in gt_result:
                try:
                    if is_close:
                        if isclose(item, prediction, abs_tol=1e-3):
                            return True
                    elif item == prediction:
                        return True
                except Exception:
                    continue
            return False
    except Exception:
        # best effort: fall through to the symbolic checks
        pass

    # empty / None predictions cannot match; 0 and False are real values
    if not prediction and prediction not in [0, False]:
        return False

    # 2. symbolic equal
    reference = str(reference).strip()
    prediction = str(prediction).strip()

    # tuples / lists / intervals: compare element by element
    bracketed = r'(\(|\[).+(\)|\])'
    if regex.match(bracketed, prediction) is not None and regex.match(bracketed, reference) is not None:
        pred_parts = prediction[1:-1].split(",")
        ref_parts = reference[1:-1].split(",")
        if len(pred_parts) == len(ref_parts) and \
                all(_recurse(p, r) for p, r in zip(pred_parts, ref_parts)):
            return True

    # matrices: compare row by row, cell by cell
    matrix_open = ("\\begin{pmatrix}", "\\begin{bmatrix}")
    matrix_close = ("\\end{pmatrix}", "\\end{bmatrix}")
    if prediction.startswith(matrix_open) and prediction.endswith(matrix_close) and \
            reference.startswith(matrix_open) and reference.endswith(matrix_close):
        # both environment names have the same length, so one slice fits either
        pred_lines = [line.strip() for line in prediction[len("\\begin{pmatrix}"): -len("\\end{pmatrix}")].split("\\\\") if line.strip()]
        ref_lines = [line.strip() for line in reference[len("\\begin{pmatrix}"): -len("\\end{pmatrix}")].split("\\\\") if line.strip()]
        if len(pred_lines) == len(ref_lines):
            matched = True
            for pred_line, ref_line in zip(pred_lines, ref_lines):
                pred_parts = pred_line.split("&")
                ref_parts = ref_line.split("&")
                if len(pred_parts) != len(ref_parts) or \
                        not all(_recurse(p, r) for p, r in zip(pred_parts, ref_parts)):
                    matched = False
                    break
            if matched:
                return True

    if prediction.count('=') == 1 and reference.count('=') == 1:
        # two equations: compare "lhs - (rhs)" of each, allowing a sign flip
        pred = prediction.split('=')
        pred = f"{pred[0].strip()} - ({pred[1].strip()})"
        ref = reference.split('=')
        ref = f"{ref[0].strip()} - ({ref[1].strip()})"
        if _symbolic(pred, ref) or _symbolic(f"-({pred})", ref):
            return True
    elif prediction.count('=') == 1 and len(prediction.split('=')[0].strip()) <= 2 and '=' not in reference:
        # "x = 5" versus "5": compare the right-hand side only
        if _recurse(prediction.split('=')[1], reference):
            return True
    elif reference.count('=') == 1 and len(reference.split('=')[0].strip()) <= 2 and '=' not in prediction:
        if _recurse(prediction, reference.split('=')[1]):
            return True

    # symbolic equal with sympy
    if _symbolic(prediction, reference):
        return True

    return False


def math_equal_process(param):
    """
    Single-argument wrapper around ``math_equal``.

    Only the last two items of ``param`` are used: ``param[-2]`` is the
    prediction and ``param[-1]`` the reference. ``math_equal`` is called with
    its default options.
    """
    prediction = param[-2]
    reference = param[-1]
    return math_equal(prediction, reference)


def symbolic_equal(a, b):
    def _parse(s):
        for f in [parse_latex, parse_expr]:
            try:
                return f(s)
            except:
                pass
        return s
    a = _parse(a)
    b = _parse(b)

    try:
        if simplify(a-b) == 0:
            return True
    except:
        pass

    try:
        if isclose(N(a), N(b), abs_tol=1e-3):
            return True
    except:
        pass
    return False


def symbolic_equal_process(a, b, output_queue):
    """Run ``symbolic_equal(a, b)`` and put the result on ``output_queue``."""
    output_queue.put(symbolic_equal(a, b))


def call_with_timeout(func, *args, timeout=1, **kwargs):
    """
    Run ``func`` in a separate process and return the value it reports.

    ``func`` is called as ``func(*args, output_queue, **kwargs)`` and is
    expected to ``put`` exactly one result on ``output_queue``.

    Args:
        func: callable executed in the child process.
        *args: positional arguments placed before the queue.
        timeout: seconds to wait for the child to finish.
        **kwargs: keyword arguments forwarded to ``func``.

    Returns:
        The value put on the queue, or ``False`` when the child is still
        running after ``timeout`` seconds (it is then terminated) or when it
        exits without reporting a result (for example because it raised).
    """
    import queue  # local import: only needed for the Empty exception

    output_queue = multiprocessing.Queue()
    process_args = args + (output_queue,)
    process = multiprocessing.Process(target=func, args=process_args, kwargs=kwargs)
    process.start()
    process.join(timeout)

    if process.is_alive():
        process.terminate()
        process.join()
        return False

    try:
        # Bounded wait: a child that died before put() would otherwise block
        # this call forever.
        return output_queue.get(timeout=timeout)
    except queue.Empty:
        return False
init 2024-02-06 02:27:40 +00:00			`import multiprocessing`
			`from math import isclose`
			`import numpy as np`
			`from typing import Union, Any, Dict`

			`from sympy import simplify, N`
			`from sympy.parsing.sympy_parser import parse_expr`
			`from sympy.parsing.latex import parse_latex`
			`import re`
			`import regex`

			`from data_processing.answer_extraction import extract_answer, extract_program_output, strip_string`

			`def extract_program(result: str, last_only=True):`
			`"""`
			extract the program after "```python", and before "```"
			`"""`
			`program = ""`
			`start = False`
			`for line in result.split("\n"):`
			if line.startswith("```python"):
			`if last_only:`
			`program = "" # only extract the last program`
			`else:`
			`program += "\n# ========\n"`
			`start = True`
			elif line.startswith("```"):
			`start = False`
			`elif start:`
			`program += line + "\n"`
			`return program`


			`def parse_ground_truth(example: Dict[str, Any], data_name):`
			`if 'gt_cot' in example:`
			`return example['gt_cot'], strip_string(example['gt'])`

			`# parse ground truth`
			`if data_name in ["math", 'ocw']:`
			`gt_cot = example['solution']`
			`gt_ans = extract_answer(gt_cot)`
			`elif data_name == "gsm8k":`
			`gt_cot, gt_ans = example['answer'].split("####")`
			`elif data_name == "gsm-hard":`
			`gt_cot, gt_ans = example['code'], example['target']`
			`elif data_name == "svamp":`
			`gt_cot, gt_ans = example['Equation'], example['Answer']`
			`elif data_name == "asdiv":`
			`gt_cot = example['formula']`
			`gt_ans = re.sub(r"\(.*?\)", "", example['answer'])`
			`elif data_name == "mawps":`
			`gt_cot, gt_ans = None, example['target']`
			`elif data_name == "tabmwp":`
			`gt_cot = example['solution']`
			`gt_ans = example['answer']`
			`if example['ans_type'] in ['integer_number', 'decimal_number']:`
			`if '/' in gt_ans:`
			`gt_ans = int(gt_ans.split('/')[0]) / int(gt_ans.split('/')[1])`
			`elif ',' in gt_ans:`
			`gt_ans = float(gt_ans.replace(',', ''))`
			`elif '%' in gt_ans:`
			`gt_ans = float(gt_ans.split('%')[0]) / 100`
			`else:`
			`gt_ans = float(gt_ans)`
			`elif data_name == "bbh":`
			`gt_cot, gt_ans = None, example['target']`
			`else:`
			`raise NotImplementedError(data_name)`
			`# post process`
			`gt_cot = str(gt_cot).strip()`
			`gt_ans = strip_string(gt_ans)`
			`return gt_cot, gt_ans`


			`def parse_question(example, data_name):`
			`question = ""`
			`if data_name == "asdiv":`
			`question = f"{example['body'].strip()} {example['question'].strip()}"`
			`elif data_name == "svamp":`
			`body = example["Body"].strip()`
			`if not body.endswith("."):`
			`body = body + "."`
			`question = f'{body} {example["Question"].strip()}'`
			`elif data_name == "tabmwp":`
			`title_str = f'regarding "{example["table_title"]}" ' if example['table_title'] else ""`
			`question = f'Read the following table {title_str}and answer a question:\n'`
			`question += f'{example["table"]}\n{example["question"]}'`
			`if example['choices']:`
			`question += f' Please select from the following options: {example["choices"]}'`
			`else:`
			`for key in ['question', 'problem', 'Question', 'input']:`
			`if key in example:`
			`question = example[key]`
			`break`
			`assert question != ""`
			`return question.strip()`


			`def run_execute(executor, result, prompt_type, execute=False):`
			`if not result or result == 'error':`
			`return None, None`
			`report = None`

			`if "program_only" in prompt_type:`
			`prediction = extract_program_output(result)`
			`elif prompt_type in ["pot", "pal"] and execute:`
			`code = extract_program(result)`
			`prediction, report = executor.apply(code)`
			`else:`
			`prediction = extract_answer(result)`

			`prediction = strip_string(prediction)`
			`return prediction, report`


			`def parse_digits(num):`
			`# format: 234.23 \|\| 23%`
			`num = regex.sub(',', '', str(num))`
			`try:`
			`return float(num)`
			`except:`
			`if num.endswith('%'):`
			`num = num[:-1]`
			`if num.endswith('\\'):`
			`num = num[:-1]`
			`try:`
			`return float(num) / 100`
			`except:`
			`pass`
			`return None`

			`def is_digit(num):`
			`# paired with parse_digits`
			`return parse_digits(num) is not None`


			`def normalize_prediction(prediction):`
			`try: # 1. numerical equal`
			`if is_digit(prediction):`
			`prediction = np.round(float(str(prediction).replace(",", "")), 6)`
			`return str(prediction)`
			`except:`
			`pass`

			`# 2. symbolic equal`
			`prediction = str(prediction).strip()`

			`## deal with [], (), {}`
			`brackets = []`
			`while prediction.startswith("[") and prediction.endswith("]") or (prediction.startswith("(") and prediction.endswith(")")):`
			`bracket = prediction[0]`
			`prediction = prediction[1:-1]`
			`if brackets and ',' in prediction:`
			`pred_parts = [normalize_prediction(part) for part in prediction.split(",")]`
			`prediction = ",".join(pred_parts)`

			`if brackets:`
			`for b in reversed(brackets):`
			`if b == '[':`
			`prediction = '[' + prediction + ']'`
			`else:`
			`assert b == '('`
			`prediction = '(' + prediction + ')'`

			`def _parse(s):`
			`for f in [parse_latex, parse_expr]:`
			`try:`
			`return f(s)`
			`except:`
			`pass`
			`return s`

			`prediction = _parse(prediction)`

			`for s in ['{', "}", "(", ")"]:`
			`prediction = prediction.replace(s, "")`

			`return prediction`


			`def math_equal(prediction: Union[bool, float, str],`
			`reference: Union[float, str],`
			`include_percentage: bool = True,`
			`is_close: bool = True,`
			`timeout: bool = False,`
			`) -> bool:`
			`"""`
			`Exact match of math if and only if:`
			`1. numerical equal: both can convert to float and are equal`
			`2. symbolic equal: both can convert to sympy expression and are equal`
			`"""`
			`if str(prediction) == str(reference):`
			`return True`

			`try: # 1. numerical equal`
			`if is_digit(prediction) and is_digit(reference):`
			`prediction = parse_digits(prediction)`
			`reference = parse_digits(reference)`
			`# number questions`
			`if include_percentage:`
			`gt_result = [reference / 100, reference, reference * 100]`
			`else:`
			`gt_result = [reference]`
			`for item in gt_result:`
			`try:`
			`if is_close:`
			`if isclose(item, prediction, abs_tol=1e-3):`
			`return True`
			`else:`
			`if item == prediction:`
			`return True`
			`except Exception:`
			`continue`
			`return False`
			`except:`
			`pass`

			`if not prediction and prediction not in [0, False]:`
			`return False`

			`# 2. symbolic equal`
			`reference = str(reference).strip()`
			`prediction = str(prediction).strip()`

			`if regex.match(r'(\(\|\[).+(\)\|\])', prediction) is not None and regex.match(r'(\(\|\[).+(\)\|\])', reference) is not None:`
			`pred_parts = prediction[1:-1].split(",")`
			`ref_parts = reference[1:-1].split(",")`
			`if len(pred_parts) == len(ref_parts):`
			`if all([math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))]):`
			`return True`

			`if (prediction.startswith("\\begin{pmatrix}") or prediction.startswith("\\begin{bmatrix}")) and (prediction.endswith("\\end{pmatrix}") or prediction.endswith("\\end{bmatrix}")) and \`
			`(reference.startswith("\\begin{pmatrix}") or reference.startswith("\\begin{bmatrix}")) and (reference.endswith("\\end{pmatrix}") or reference.endswith("\\end{bmatrix}")):`
			`pred_lines = [line.strip() for line in prediction[len("\\begin{pmatrix}"): -len("\\end{pmatrix}")].split("\\\\") if line.strip()]`
			`ref_lines = [line.strip() for line in reference[len("\\begin{pmatrix}"): -len("\\end{pmatrix}")].split("\\\\") if line.strip()]`
			`matched = True`
			`if len(pred_lines) == len(ref_lines):`
			`for pred_line, ref_line in zip(pred_lines, ref_lines):`
			`pred_parts = pred_line.split("&")`
			`ref_parts = ref_line.split("&")`
			`if len(pred_parts) == len(ref_parts):`
			`if not all([math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))]):`
			`matched = False`
			`break`
			`else:`
			`matched = False`
			`if not matched:`
			`break`
			`else:`
			`matched = False`
			`if matched:`
			`return True`

			`if prediction.count('=') == 1 and reference.count('=') == 1:`
			`pred = prediction.split('=')`
			`pred = f"{pred[0].strip()} - ({pred[1].strip()})"`
			`ref = reference.split('=')`
			`ref = f"{ref[0].strip()} - ({ref[1].strip()})"`
			`if symbolic_equal(pred, ref) or symbolic_equal(f"-({pred})", ref):`
			`return True`
			`elif prediction.count('=') == 1 and len(prediction.split('=')[0].strip()) <= 2 and '=' not in reference:`
			`if math_equal(prediction.split('=')[1], reference, include_percentage, is_close):`
			`return True`
			`elif reference.count('=') == 1 and len(reference.split('=')[0].strip()) <= 2 and '=' not in prediction:`
			`if math_equal(prediction, reference.split('=')[1], include_percentage, is_close):`
			`return True`

			`# symbolic equal with sympy`
			`if timeout:`
			`if call_with_timeout(symbolic_equal_process, prediction, reference):`
			`return True`
			`else:`
			`if symbolic_equal(prediction, reference):`
			`return True`

			`return False`


			`def math_equal_process(param):`
			`return math_equal(param[-2], param[-1])`


			`def symbolic_equal(a, b):`
			`def _parse(s):`
			`for f in [parse_latex, parse_expr]:`
			`try:`
			`return f(s)`
			`except:`
			`pass`
			`return s`
			`a = _parse(a)`
			`b = _parse(b)`

			`try:`
			`if simplify(a-b) == 0:`
			`return True`
			`except:`
			`pass`

			`try:`
			`if isclose(N(a), N(b), abs_tol=1e-3):`
			`return True`
			`except:`
			`pass`
			`return False`


			`def symbolic_equal_process(a, b, output_queue):`
			`result = symbolic_equal(a, b)`
			`output_queue.put(result)`


			`def call_with_timeout(func, args, timeout=1, *kwargs):`
			`output_queue = multiprocessing.Queue()`
			`process_args = args + (output_queue,)`
			`process = multiprocessing.Process(target=func, args=process_args, kwargs=kwargs)`
			`process.start()`
			`process.join(timeout)`

			`if process.is_alive():`
			`process.terminate()`
			`process.join()`
			`return False`

			`return output_queue.get()`