DeepSeek-Math/evaluation/eval/eval_utils.py

326 lines
11 KiB
Python
Raw Normal View History

2024-02-06 02:27:40 +00:00
import multiprocessing
from math import isclose
import numpy as np
from typing import Union, Any, Dict
from sympy import simplify, N
from sympy.parsing.sympy_parser import parse_expr
from sympy.parsing.latex import parse_latex
import re
import regex
from data_processing.answer_extraction import extract_answer, extract_program_output, strip_string
def extract_program(result: str, last_only=True):
"""
extract the program after "```python", and before "```"
"""
program = ""
start = False
for line in result.split("\n"):
if line.startswith("```python"):
if last_only:
program = "" # only extract the last program
else:
program += "\n# ========\n"
start = True
elif line.startswith("```"):
start = False
elif start:
program += line + "\n"
return program
def parse_ground_truth(example: Dict[str, Any], data_name):
if 'gt_cot' in example:
return example['gt_cot'], strip_string(example['gt'])
# parse ground truth
if data_name in ["math", 'ocw']:
gt_cot = example['solution']
gt_ans = extract_answer(gt_cot)
elif data_name == "gsm8k":
gt_cot, gt_ans = example['answer'].split("####")
elif data_name == "gsm-hard":
gt_cot, gt_ans = example['code'], example['target']
elif data_name == "svamp":
gt_cot, gt_ans = example['Equation'], example['Answer']
elif data_name == "asdiv":
gt_cot = example['formula']
gt_ans = re.sub(r"\(.*?\)", "", example['answer'])
elif data_name == "mawps":
gt_cot, gt_ans = None, example['target']
elif data_name == "tabmwp":
gt_cot = example['solution']
gt_ans = example['answer']
if example['ans_type'] in ['integer_number', 'decimal_number']:
if '/' in gt_ans:
gt_ans = int(gt_ans.split('/')[0]) / int(gt_ans.split('/')[1])
elif ',' in gt_ans:
gt_ans = float(gt_ans.replace(',', ''))
elif '%' in gt_ans:
gt_ans = float(gt_ans.split('%')[0]) / 100
else:
gt_ans = float(gt_ans)
elif data_name == "bbh":
gt_cot, gt_ans = None, example['target']
else:
raise NotImplementedError(data_name)
# post process
gt_cot = str(gt_cot).strip()
gt_ans = strip_string(gt_ans)
return gt_cot, gt_ans
def parse_question(example, data_name):
question = ""
if data_name == "asdiv":
question = f"{example['body'].strip()} {example['question'].strip()}"
elif data_name == "svamp":
body = example["Body"].strip()
if not body.endswith("."):
body = body + "."
question = f'{body} {example["Question"].strip()}'
elif data_name == "tabmwp":
title_str = f'regarding "{example["table_title"]}" ' if example['table_title'] else ""
question = f'Read the following table {title_str}and answer a question:\n'
question += f'{example["table"]}\n{example["question"]}'
if example['choices']:
question += f' Please select from the following options: {example["choices"]}'
else:
for key in ['question', 'problem', 'Question', 'input']:
if key in example:
question = example[key]
break
assert question != ""
return question.strip()
def run_execute(executor, result, prompt_type, execute=False):
if not result or result == 'error':
return None, None
report = None
if "program_only" in prompt_type:
prediction = extract_program_output(result)
elif prompt_type in ["pot", "pal"] and execute:
code = extract_program(result)
prediction, report = executor.apply(code)
else:
prediction = extract_answer(result)
prediction = strip_string(prediction)
return prediction, report
def parse_digits(num):
# format: 234.23 || 23%
num = regex.sub(',', '', str(num))
try:
return float(num)
except:
if num.endswith('%'):
num = num[:-1]
if num.endswith('\\'):
num = num[:-1]
try:
return float(num) / 100
except:
pass
return None
def is_digit(num):
# paired with parse_digits
return parse_digits(num) is not None
def normalize_prediction(prediction):
try: # 1. numerical equal
if is_digit(prediction):
prediction = np.round(float(str(prediction).replace(",", "")), 6)
return str(prediction)
except:
pass
# 2. symbolic equal
prediction = str(prediction).strip()
## deal with [], (), {}
brackets = []
while prediction.startswith("[") and prediction.endswith("]") or (prediction.startswith("(") and prediction.endswith(")")):
bracket = prediction[0]
prediction = prediction[1:-1]
if brackets and ',' in prediction:
pred_parts = [normalize_prediction(part) for part in prediction.split(",")]
prediction = ",".join(pred_parts)
if brackets:
for b in reversed(brackets):
if b == '[':
prediction = '[' + prediction + ']'
else:
assert b == '('
prediction = '(' + prediction + ')'
def _parse(s):
for f in [parse_latex, parse_expr]:
try:
return f(s)
except:
pass
return s
prediction = _parse(prediction)
for s in ['{', "}", "(", ")"]:
prediction = prediction.replace(s, "")
return prediction
def math_equal(prediction: Union[bool, float, str],
reference: Union[float, str],
include_percentage: bool = True,
is_close: bool = True,
timeout: bool = False,
) -> bool:
"""
Exact match of math if and only if:
1. numerical equal: both can convert to float and are equal
2. symbolic equal: both can convert to sympy expression and are equal
"""
if str(prediction) == str(reference):
return True
try: # 1. numerical equal
if is_digit(prediction) and is_digit(reference):
prediction = parse_digits(prediction)
reference = parse_digits(reference)
# number questions
if include_percentage:
gt_result = [reference / 100, reference, reference * 100]
else:
gt_result = [reference]
for item in gt_result:
try:
if is_close:
if isclose(item, prediction, abs_tol=1e-3):
return True
else:
if item == prediction:
return True
except Exception:
continue
return False
except:
pass
if not prediction and prediction not in [0, False]:
return False
# 2. symbolic equal
reference = str(reference).strip()
prediction = str(prediction).strip()
if regex.match(r'(\(|\[).+(\)|\])', prediction) is not None and regex.match(r'(\(|\[).+(\)|\])', reference) is not None:
pred_parts = prediction[1:-1].split(",")
ref_parts = reference[1:-1].split(",")
if len(pred_parts) == len(ref_parts):
if all([math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))]):
return True
if (prediction.startswith("\\begin{pmatrix}") or prediction.startswith("\\begin{bmatrix}")) and (prediction.endswith("\\end{pmatrix}") or prediction.endswith("\\end{bmatrix}")) and \
(reference.startswith("\\begin{pmatrix}") or reference.startswith("\\begin{bmatrix}")) and (reference.endswith("\\end{pmatrix}") or reference.endswith("\\end{bmatrix}")):
pred_lines = [line.strip() for line in prediction[len("\\begin{pmatrix}"): -len("\\end{pmatrix}")].split("\\\\") if line.strip()]
ref_lines = [line.strip() for line in reference[len("\\begin{pmatrix}"): -len("\\end{pmatrix}")].split("\\\\") if line.strip()]
matched = True
if len(pred_lines) == len(ref_lines):
for pred_line, ref_line in zip(pred_lines, ref_lines):
pred_parts = pred_line.split("&")
ref_parts = ref_line.split("&")
if len(pred_parts) == len(ref_parts):
if not all([math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))]):
matched = False
break
else:
matched = False
if not matched:
break
else:
matched = False
if matched:
return True
if prediction.count('=') == 1 and reference.count('=') == 1:
pred = prediction.split('=')
pred = f"{pred[0].strip()} - ({pred[1].strip()})"
ref = reference.split('=')
ref = f"{ref[0].strip()} - ({ref[1].strip()})"
if symbolic_equal(pred, ref) or symbolic_equal(f"-({pred})", ref):
return True
elif prediction.count('=') == 1 and len(prediction.split('=')[0].strip()) <= 2 and '=' not in reference:
if math_equal(prediction.split('=')[1], reference, include_percentage, is_close):
return True
elif reference.count('=') == 1 and len(reference.split('=')[0].strip()) <= 2 and '=' not in prediction:
if math_equal(prediction, reference.split('=')[1], include_percentage, is_close):
return True
# symbolic equal with sympy
if timeout:
if call_with_timeout(symbolic_equal_process, prediction, reference):
return True
else:
if symbolic_equal(prediction, reference):
return True
return False
def math_equal_process(param):
return math_equal(param[-2], param[-1])
def symbolic_equal(a, b):
def _parse(s):
for f in [parse_latex, parse_expr]:
try:
return f(s)
except:
pass
return s
a = _parse(a)
b = _parse(b)
try:
if simplify(a-b) == 0:
return True
except:
pass
try:
if isclose(N(a), N(b), abs_tol=1e-3):
return True
except:
pass
return False
def symbolic_equal_process(a, b, output_queue):
result = symbolic_equal(a, b)
output_queue.put(result)
def call_with_timeout(func, *args, timeout=1, **kwargs):
output_queue = multiprocessing.Queue()
process_args = args + (output_queue,)
process = multiprocessing.Process(target=func, args=process_args, kwargs=kwargs)
process.start()
process.join(timeout)
if process.is_alive():
process.terminate()
process.join()
return False
return output_queue.get()