init

2025-06-26 18:16:20 +00:00 · 2024-02-06 10:27:40 +08:00
commit 21cc5c6701
59 changed files with 17325 additions and 0 deletions
--- a/evaluation/infer/run_cot_eval.py
+++ b/evaluation/infer/run_cot_eval.py
@@ -0,0 +1,229 @@
+import argparse
+import os
+
+from tqdm import tqdm
+import json
+from copy import deepcopy
+from vllm import LLM, SamplingParams
+from pebble import ProcessPool
+from concurrent.futures import TimeoutError
+import random
+from eval.utils import generate_completions, load_hf_lm_and_tokenizer
+
+from transformers import AutoTokenizer
+
+from data_processing.answer_extraction import *
+from eval.eval_script import *
+from few_shot_prompts import *
+
+def evaluate(eval_fn, tasks, _timeout=15):
+    with ProcessPool() as pool:
+        timeout_cnt = 0
+        iterator = pool.map(eval_fn, tasks, timeout=_timeout).result()
+        labels = []
+        while True:
+            try:
+                labels.append(int(next(iterator)))
+            except StopIteration:
+                break
+            except TimeoutError as error:
+                labels.append(0)
+                timeout_cnt += 1
+            except Exception as error:
+                print(error.traceback, flush=True)
+                exit()
+    return labels, timeout_cnt
+
+def infer(args, test_data):
+    global tokenizer
+    if tokenizer is None:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path, trust_remote_code=True)
+
+    if args.prompt_format == 'few_shot':
+        assert args.few_shot_prompt is not None
+        prompting = eval(args.few_shot_prompt)()
+
+    prompts = []
+    for example in test_data:
+        prompt = ""
+        if args.prompt_format == 'few_shot':
+            prompt = prompting.format_prompt(example['messages'][-2]['content'], example['messages'][-1]['content'])
+        else:
+            for mess in example['messages']:
+                if args.prompt_format == 'sft':
+                    if mess['role'] == 'user':
+                        prompt += f"{tokenizer.eos_token}User: {mess['content'].strip()}\n\nAssistant:"
+                    elif mess['role'] == 'assistant':
+                        prompt += mess['content'].rstrip()
+                else:
+                    raise NotImplementedError()
+            prompt = prompt.lstrip()
+            if args.prompt_format == 'sft' and prompt.startswith(tokenizer.eos_token):
+                prompt = prompt[len(tokenizer.eos_token):].lstrip()
+        example['prompt'] = prompt
+        prompts.append(prompt.lstrip())
+
+    global model
+    print("Loading model and tokenizer...")
+    if args.use_vllm:
+        if model is None:
+            model = LLM(model=args.model_name_or_path, tokenizer=args.tokenizer_name_or_path, trust_remote_code=True, tensor_parallel_size=len(os.environ['CUDA_VISIBLE_DEVICES'].split(",")))
+        eos_token = tokenizer.eos_token if tokenizer is not None and tokenizer.eos_token is not None else '</s>'
+        stop_words = [eos_token]
+        if args.prompt_format == 'few_shot':
+            stop_words.extend(prompting.stop_words())
+        outputs = model.generate(prompts, SamplingParams(temperature=args.temperature, top_p=1.0, max_tokens=1024, n=1, stop=stop_words))
+        outputs = sorted(outputs, key=lambda x: int(x.request_id)) # sort outputs by request_id
+        outputs = [output.outputs[0].text for output in outputs]
+    else:
+        model, tokenizer = load_hf_lm_and_tokenizer(
+            model_name_or_path=args.model_name_or_path, 
+            tokenizer_name_or_path=args.tokenizer_name_or_path, 
+            load_in_8bit=args.load_in_8bit, 
+            load_in_half=args.load_in_half,
+            gptq_model=args.gptq
+        )
+
+        stop_id_sequences = []
+        if tokenizer.eos_token_id is not None:
+            stop_id_sequences = [[tokenizer.eos_token_id]]
+        if args.prompt_format == 'few_shot':
+            stop_id_sequences.extend([tokenizer.encode(word) for word in prompting.stop_words()])
+        outputs, finish_completion = generate_completions(
+            model=model,
+            tokenizer=tokenizer,
+            prompts=prompts,
+            max_new_tokens=512,
+            batch_size=args.eval_batch_size,
+            stop_id_sequences=stop_id_sequences if stop_id_sequences else None,
+            end_of_generation_id_sequence=[tokenizer.eos_token_id] if tokenizer.eos_token_id is not None else None
+        )
+
+    if args.complete_partial_output:
+        model_outputs = [example['messages'][-1]['content'] + output for example, output in zip(test_data, outputs)]
+    else:
+        model_outputs = outputs
+
+    predictions = [eval(args.answer_extraction_fn)(item['messages'][-2]['content'], output, task='cot') for item, output in tqdm(zip(test_data, model_outputs), desc="extract answer", total=len(model_outputs))]
+    assert len(model_outputs) > 0, f"{len(model_outputs)}"
+
+    results = []
+    for example, output, pred in zip(test_data, model_outputs, predictions):
+        item = deepcopy(example)
+        item.update({
+            'model_output': output,
+            'prediction': pred,
+        })
+        results.append(item)
+    return results
+
+def main(args):
+    random.seed(42)
+
+    print("Loading data...")
+    test_data = []
+    with open(os.path.join(args.data_dir, f"train.jsonl" if args.infer_train_set else f"test.jsonl")) as fin:
+        for line in fin:
+            example = json.loads(line)
+            messages = example['messages']
+            assert messages[-1]['role'] == 'assistant'
+            if not args.complete_partial_output:
+                example['reference'] = example.get('reference', '') or [mess['content'] for mess in messages if mess['role'] == 'assistant']
+                for mess in messages:
+                    if mess['role'] == 'assistant':
+                        mess['content'] = ''
+            example['messages'] = messages
+            test_data.append(example)
+
+    if args.max_num_examples and len(test_data) > args.max_num_examples:
+        test_data = random.sample(test_data, args.max_num_examples)
+
+    if args.n_subsets > 1:
+        assert args.subset_id >= 0 and args.subset_id < args.n_subsets
+        test_data = [item for i, item in enumerate(test_data) if i % args.n_subsets == args.subset_id]
+
+    if not test_data:
+        return
+
+    if not os.path.exists(args.save_dir):
+        os.makedirs(args.save_dir, exist_ok=True)
+
+    results = infer(args, test_data)
+
+    labels, eval_timeout_cnt = evaluate(eval(args.eval_fn), results)
+    for item, label in zip(results, labels):
+        item['accuracy'] = label
+
+    print("Calculating accuracy...")
+    acc = 0
+    for item in results:
+        acc += item['accuracy']
+    print("output acc = {:.5f}".format(acc / len(results) * 100), flush=True)
+
+    print(f"Timeout count >>> output eval = {eval_timeout_cnt}", flush=True)
+
+    pred_fname = "predictions.json"
+    if args.n_subsets > 1:
+        pred_fname = f"predictions.{args.subset_id}.json"
+    with open(os.path.join(args.save_dir, pred_fname), "w") as fout:
+        json.dump(results, fout, ensure_ascii=True)
+
+    metric_fname = "metrics.json"
+    if args.n_subsets > 1:
+        metric_fname = f"metrics.{args.subset_id}.json"
+    with open(os.path.join(args.save_dir, metric_fname), "w") as fout:
+        json.dump({
+            "n_samples": len(results),
+            "accuracy": sum(item['accuracy'] for item in results) / len(results),
+        }, fout, indent=4)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir", type=str, default="data/mgsm")
+    parser.add_argument("--max_num_examples", type=int, default=None, help="maximum number of examples to evaluate.")
+    parser.add_argument("--save_dir", type=str, default="results/mgsm")
+    parser.add_argument("--model_name_or_path", type=str, default=None, help="if specified, we will load the model to generate the predictions.")
+    parser.add_argument("--tokenizer_name_or_path", type=str, default=None, help="if specified, we will load the tokenizer from here.")
+    parser.add_argument("--eval_batch_size", type=int, default=1, help="batch size for evaluation.")
+    parser.add_argument("--load_in_8bit", action="store_true", help="load model in 8bit mode, which will reduce memory and speed up inference.")
+    parser.add_argument("--gptq", action="store_true", help="If given, we're evaluating a 4-bit quantized GPTQ model.")
+    parser.add_argument("--use_vllm", action="store_true")
+    parser.add_argument("--load_in_half", action='store_true')
+    parser.add_argument("--infer_train_set", action="store_true")
+    parser.add_argument("--n_subsets", type=int, default=1)
+    parser.add_argument("--subset_id", type=int, default=0)
+    parser.add_argument("--temperature", type=float, default=0.0)
+    parser.add_argument("--repeat_id_start", type=int, default=0)
+    parser.add_argument("--n_repeat_sampling", type=int, default=1)
+    parser.add_argument("--complete_partial_output", action='store_true')
+    parser.add_argument("--prompt_format", type=str, choices=['sft', 'few_shot'], default='sft')
+    parser.add_argument("--few_shot_prompt", type=str, default=None)
+    parser.add_argument("--answer_extraction_fn", type=str, required=True)
+    parser.add_argument("--eval_fn", type=str, required=True)
+    parser.add_argument("--gpus", type=str, default=None)
+    args, unparsed_args = parser.parse_known_args()
+    if args.gpus is not None:
+        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
+
+    print(unparsed_args, flush=True)
+
+    if 'math6' in args.data_dir:
+        args.multi_turn = True
+
+    # model_name_or_path cannot be both None or both not None.
+    model = None
+    tokenizer = None
+    pool = None
+    if args.n_repeat_sampling > 1 or args.repeat_id_start != 0:
+        assert args.temperature > 0
+        save_dir = args.save_dir
+        for i in range(args.repeat_id_start, args.repeat_id_start + args.n_repeat_sampling):
+            print(f"working on the {i} trials ...", flush=True)
+            args.save_dir = os.path.join(save_dir, str(i))
+            os.makedirs(args.save_dir, exist_ok=True)
+            main(args)
+    else:
+        main(args)
+
+    if pool is not None:
+        pool.close()
--- a/evaluation/infer/run_pal_eval.py
+++ b/evaluation/infer/run_pal_eval.py
@@ -0,0 +1,238 @@
+import argparse
+import os
+import json
+from copy import deepcopy
+from functools import partial
+from vllm import LLM, SamplingParams
+from pebble import ProcessPool
+from concurrent.futures import TimeoutError
+import random
+from eval.utils import generate_completions, load_hf_lm_and_tokenizer
+from eval.python_executor import PythonExecutor
+
+from transformers import AutoTokenizer
+
+from data_processing.answer_extraction import *
+from eval.eval_script import *
+from few_shot_prompts import *
+
+def evaluate(eval_fn, tasks, _timeout=15):
+    with ProcessPool() as pool:
+        timeout_cnt = 0
+        iterator = pool.map(eval_fn, tasks, timeout=_timeout).result()
+        labels = []
+        while True:
+            try:
+                labels.append(int(next(iterator)))
+            except StopIteration:
+                break
+            except TimeoutError as error:
+                labels.append(0)
+                timeout_cnt += 1
+            except Exception as error:
+                print(error.traceback, flush=True)
+                exit()
+    return labels, timeout_cnt
+
+def main(args):
+    random.seed(42)
+
+    print("Loading data...")
+    test_data = []
+    with open(os.path.join(args.data_dir, f"train.jsonl" if args.infer_train_set else f"test.jsonl")) as fin:
+        for line in fin:
+            example = json.loads(line)
+            messages = example['messages']
+            assert len(messages) in [2, 3]
+            assert messages[-1]['role'] == 'assistant'
+            if not args.complete_partial_output:
+                example['reference'] = example.get('reference', '') or messages[-1]['content']
+                messages[-1]['content'] = ''
+            example['messages'] = messages
+            test_data.append(example)
+
+    if args.max_num_examples and len(test_data) > args.max_num_examples:
+        test_data = random.sample(test_data, args.max_num_examples)
+
+    if args.n_subsets > 1:
+        assert args.subset_id >= 0 and args.subset_id < args.n_subsets
+        test_data = [item for i, item in enumerate(test_data) if i % args.n_subsets == args.subset_id]
+
+    if not test_data:
+        return
+
+    if not os.path.exists(args.save_dir):
+        os.makedirs(args.save_dir, exist_ok=True)
+
+    if args.prompt_format == 'few_shot':
+        assert args.few_shot_prompt is not None
+        prompting = eval(args.few_shot_prompt)()
+
+    prompts = []
+    for example in test_data:
+        prompt = ""
+        if args.prompt_format == 'few_shot':
+            prompt = prompting.format_prompt(example['messages'][-2]['content'], example['messages'][-1]['content'])
+        else:
+            for mess in example['messages']:
+                if args.prompt_format == 'sft':
+                    if mess['role'] == 'user':
+                        prompt += f"User: {mess['content'].strip()}\n\nAssistant:"
+                    elif mess['role'] == 'assistant':
+                        prompt += mess['content'].strip()
+                else:
+                    raise NotImplementedError()
+            prompt = prompt.lstrip()
+        example['prompt'] = prompt
+        prompts.append(prompt.lstrip())
+
+    global model, tokenizer
+    if tokenizer is None:
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path, trust_remote_code=True)
+    print("Loading model and tokenizer...")
+    if args.use_vllm:
+        if model is None:
+            model = LLM(model=args.model_name_or_path, tokenizer=args.tokenizer_name_or_path, trust_remote_code=True, tensor_parallel_size=len(os.environ['CUDA_VISIBLE_DEVICES'].split(",")))
+        eos_token = tokenizer.eos_token if tokenizer is not None and tokenizer.eos_token is not None else '</s>'
+        stop_words = [eos_token]
+        if args.prompt_format == 'few_shot':
+            stop_words.extend(prompting.stop_words())
+        outputs = model.generate(prompts, SamplingParams(temperature=args.temperature, top_p=1.0, max_tokens=1024, n=1, stop=stop_words))
+        outputs = sorted(outputs, key=lambda x: int(x.request_id)) # sort outputs by request_id
+        outputs = [output.outputs[0].text for output in outputs]
+    else:
+        model, tokenizer = load_hf_lm_and_tokenizer(
+            model_name_or_path=args.model_name_or_path, 
+            tokenizer_name_or_path=args.tokenizer_name_or_path, 
+            load_in_8bit=args.load_in_8bit, 
+            load_in_half=args.load_in_half,
+            gptq_model=args.gptq
+        )
+
+        stop_id_sequences = []
+        if tokenizer.eos_token_id is not None:
+            stop_id_sequences = [[tokenizer.eos_token_id]]
+        if args.prompt_format == 'few_shot':
+            stop_id_sequences.extend([tokenizer.encode(word) for word in prompting.stop_words()])
+        outputs, finish_completion = generate_completions(
+            model=model,
+            tokenizer=tokenizer,
+            prompts=prompts,
+            max_new_tokens=512,
+            batch_size=args.eval_batch_size,
+            stop_id_sequences=stop_id_sequences if stop_id_sequences else None,
+            end_of_generation_id_sequence=[tokenizer.eos_token_id] if tokenizer.eos_token_id is not None else None
+        )
+
+    if args.complete_partial_output:
+        model_outputs = [example['messages'][-1]['content'] + output for example, output in zip(test_data, outputs)]
+    else:
+        model_outputs = outputs
+
+    if 'PALGSMPrompt' in args.few_shot_prompt:
+        executor = PythonExecutor(get_answer_expr='solution()')
+        codes = model_outputs
+    elif 'PALMathPrompt' in args.few_shot_prompt:
+        executor = PythonExecutor(get_answer_symbol='answer')
+        codes = []
+        for text in model_outputs:
+            if text.count("```") == 4:
+                segments = text.split("```")
+                assert len(segments) == 5
+                code = f"{segments[3]}\n\n{segments[1]}"
+            else:
+                code = "answer = '[invalid]'"
+            codes.append(code)
+    else:
+        raise NotImplementedError()
+
+    predictions = []
+    runtime_errors = []
+    for pred, err in executor.batch_apply(codes):
+        predictions.append(str(pred))
+        runtime_errors.append(str(err['exec_info']).strip())
+
+    assert len(model_outputs) > 0, f"{len(model_outputs)}"
+
+    results = []
+    for example, output, pred in zip(test_data, model_outputs, predictions):
+        item = deepcopy(example)
+        item.update({
+            'model_output': output,
+            'program_output': pred,
+        })
+        results.append(item)
+
+    labels, eval_timeout_cnt = evaluate(partial(eval(args.eval_fn), pred_key='program_output'), results)
+    for item, label in zip(results, labels):
+        item['accuracy'] = label
+
+    print("Calculating accuracy...")
+    acc = 0
+    for item in results:
+        acc += item['accuracy']
+    print("output acc = {:.5f}".format(acc / len(results) * 100), flush=True)
+
+    print(f"Timeout count >>> output eval = {eval_timeout_cnt}", flush=True)
+
+    pred_fname = "predictions.json"
+    if args.n_subsets > 1:
+        pred_fname = f"predictions.{args.subset_id}.json"
+    with open(os.path.join(args.save_dir, pred_fname), "w") as fout:
+        json.dump(results, fout, ensure_ascii=True)
+
+    metric_fname = "metrics.json"
+    if args.n_subsets > 1:
+        metric_fname = f"metrics.{args.subset_id}.json"
+    with open(os.path.join(args.save_dir, metric_fname), "w") as fout:
+        json.dump({
+            "n_samples": len(results),
+            "accuracy": sum(item['accuracy'] for item in results) / len(results),
+        }, fout, indent=4)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir", type=str, default="data/mgsm")
+    parser.add_argument("--max_num_examples", type=int, default=None, help="maximum number of examples to evaluate.")
+    parser.add_argument("--save_dir", type=str, default="results/mgsm")
+    parser.add_argument("--model_name_or_path", type=str, default=None, help="if specified, we will load the model to generate the predictions.")
+    parser.add_argument("--tokenizer_name_or_path", type=str, default=None, help="if specified, we will load the tokenizer from here.")
+    parser.add_argument("--eval_batch_size", type=int, default=1, help="batch size for evaluation.")
+    parser.add_argument("--load_in_8bit", action="store_true", help="load model in 8bit mode, which will reduce memory and speed up inference.")
+    parser.add_argument("--gptq", action="store_true", help="If given, we're evaluating a 4-bit quantized GPTQ model.")
+    parser.add_argument("--use_vllm", action="store_true")
+    parser.add_argument("--load_in_half", action='store_true')
+    parser.add_argument("--infer_train_set", action="store_true")
+    parser.add_argument("--n_subsets", type=int, default=1)
+    parser.add_argument("--subset_id", type=int, default=0)
+    parser.add_argument("--temperature", type=float, default=0.0)
+    parser.add_argument("--repeat_id_start", type=int, default=0)
+    parser.add_argument("--n_repeat_sampling", type=int, default=1)
+    parser.add_argument("--complete_partial_output", action='store_true')
+    parser.add_argument("--prompt_format", type=str, choices=['sft', 'few_shot'], default='sft')
+    parser.add_argument("--few_shot_prompt", type=str, default=None)
+    parser.add_argument("--answer_extraction_fn", type=str, default=None)
+    parser.add_argument("--eval_fn", type=str, required=True)
+    parser.add_argument("--gpus", type=str, default=None)
+    args, unparsed_args = parser.parse_known_args()
+    if args.gpus is not None:
+        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
+
+    print(unparsed_args, flush=True)
+
+    model = None
+    tokenizer = None
+    pool = None
+    if args.n_repeat_sampling > 1 or args.repeat_id_start != 0:
+        assert args.temperature > 0
+        save_dir = args.save_dir
+        for i in range(args.repeat_id_start, args.repeat_id_start + args.n_repeat_sampling):
+            print(f"working on the {i} trials ...", flush=True)
+            args.save_dir = os.path.join(save_dir, str(i))
+            os.makedirs(args.save_dir, exist_ok=True)
+            main(args)
+    else:
+        main(args)
+
+    if pool is not None:
+        pool.close()
--- a/evaluation/infer/run_tool_integrated_eval.py
+++ b/evaluation/infer/run_tool_integrated_eval.py
@@ -0,0 +1,324 @@
+import argparse
+import os
+from tqdm import tqdm
+import regex
+import json
+import random
+from copy import deepcopy
+from functools import partial
+from vllm import LLM, SamplingParams
+from pebble import ProcessPool
+from concurrent.futures import TimeoutError
+from eval.utils import generate_completions, load_hf_lm_and_tokenizer
+from eval.python_executor import PythonExecutor
+from transformers import AutoTokenizer
+
+from data_processing.answer_extraction import *
+from eval.eval_script import *
+from few_shot_prompts import *
+
+def extract_code(text):
+    if not text.strip().endswith("```"):
+        return ""
+    if text.startswith("```python"):
+        text = "hey\n" + text
+    blocks = [block.split("```", 1)[0].strip() for block in text.split("```python") if '```' in block]
+    blocks = [block for block in blocks if block]
+    if not blocks:
+        return ""
+    code = []
+    for block in blocks[:-1]:
+        for line in block.split("\n"):
+            if line.startswith("    ") or line.startswith("import") or line.startswith("def "):
+                code.append(line)
+            elif 'print(' not in line:
+                code.append(line)
+    code = "\n".join(code) + "\n" + blocks[-1]
+    return code.strip()
+
+def finish_answer_prediction(text):
+    patt = regex.search(r"\\boxed{(?P<ans>.+)}", text)
+    return patt is not None and patt.group('ans').strip()
+
+def evaluate(eval_fn, tasks, _timeout=15):
+    with ProcessPool() as pool:
+        timeout_cnt = 0
+        iterator = pool.map(eval_fn, tasks, timeout=_timeout).result()
+        labels = []
+        while True:
+            try:
+                labels.append(int(next(iterator)))
+            except StopIteration:
+                break
+            except TimeoutError as error:
+                labels.append(0)
+                timeout_cnt += 1
+            except Exception as error:
+                print(error.traceback, flush=True)
+                exit()
+    return labels, timeout_cnt
+
+def infer(args, test_data):
+    if args.prompt_format == 'few_shot':
+        assert args.few_shot_prompt is not None
+        prompting = eval(args.few_shot_prompt)()
+
+    prompts = []
+    for example in test_data:
+        prompt = ""
+        if args.prompt_format == 'few_shot':
+            prompt = prompting.format_prompt(example['messages'][-2]['content'], example['messages'][-1]['content'])
+        else:
+            for mess in example['messages']:
+                if args.prompt_format == 'sft':
+                    if mess['role'] == 'user':
+                        prompt += f"User: {mess['content'].strip()}\n\nAssistant:"
+                    elif mess['role'] == 'assistant':
+                        prompt += mess['content'].strip()
+                else:
+                    raise NotImplementedError()
+            prompt = prompt.lstrip()
+        example['prompt'] = prompt
+        prompts.append(prompt.lstrip())
+    model_outputs = [item['messages'][-1]['content'].strip() for item in test_data]
+    unfinished_ids = list(range(len(prompts)))
+
+    executor = PythonExecutor(get_answer_from_stdout=True)
+
+    n_iters = 2
+    global model, tokenizer
+    while n_iters and unfinished_ids:
+        model_inputs = [prompts[i] for i in unfinished_ids]
+        finish_completion = None
+        print("Loading model and tokenizer...")
+        if args.use_vllm:
+            if tokenizer is None:
+                tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name_or_path, trust_remote_code=True)
+                print(f"{'-' * 20} prompt_to_ids {'-' * 20}\n{tokenizer.encode(model_inputs[0])}\n{'-' * 50}", flush=True)
+                print(f"eos_token: {tokenizer.eos_token}", flush=True)
+            if model is None:
+                model = LLM(model=args.model_name_or_path, tokenizer=args.tokenizer_name_or_path, trust_remote_code=True, tensor_parallel_size=len(os.environ['CUDA_VISIBLE_DEVICES'].split(",")))
+            stop_words = [tokenizer.eos_token if tokenizer is not None and tokenizer.eos_token is not None else '</s>']
+            if not args.no_execution:
+                stop_words.append("```output")
+            if args.prompt_format == 'few_shot':
+                stop_words.extend(prompting.stop_words())
+            outputs = model.generate(model_inputs, SamplingParams(temperature=args.temperature, top_p=1.0, max_tokens=1024, n=1, stop=stop_words))
+            outputs = sorted(outputs, key=lambda x: int(x.request_id)) # sort outputs by request_id
+            finish_completion = [output.outputs[0].token_ids[-1] == tokenizer.eos_token_id for output in outputs]
+            outputs = [output.outputs[0].text for output in outputs]
+        else:
+            if model is None or tokenizer is None:
+                model, tokenizer = load_hf_lm_and_tokenizer(
+                    model_name_or_path=args.model_name_or_path, 
+                    tokenizer_name_or_path=args.tokenizer_name_or_path, 
+                    load_in_8bit=args.load_in_8bit, 
+                    load_in_half=args.load_in_half,
+                    gptq_model=args.gptq
+                )
+
+            stop_id_sequences = [tokenizer.encode("```output", add_special_tokens=False)]
+            if tokenizer.eos_token_id is not None:
+                stop_id_sequences.append([tokenizer.eos_token_id])
+            outputs, finish_completion = generate_completions(
+                model=model,
+                tokenizer=tokenizer,
+                prompts=model_inputs,
+                max_new_tokens=512,
+                batch_size=args.eval_batch_size,
+                stop_id_sequences=stop_id_sequences,
+                end_of_generation_id_sequence=[tokenizer.eos_token_id] if tokenizer.eos_token_id is not None else None
+            )
+
+        if len(unfinished_ids) != len(outputs):
+            print(f"input-output mismatch >>> {len(unfinished_ids)} != {len(outputs)}", flush=True)
+            print(f"----- DEBUG -----\ninputs:\n{model_inputs[:10]}\noutputs:\n{str(outputs[:10])}\n----- DEBUG -----\n", flush=True)
+            raise RuntimeError()
+
+        if finish_completion is None:
+            finish_completion = [finish_answer_prediction(output) for output in outputs]
+
+        print("extract code ...", flush=True)
+        codes = []
+        code_indices = []
+        for i, output, is_finished in zip(unfinished_ids, outputs, finish_completion):
+            output = output.rstrip()
+            if not args.no_execution and not is_finished:
+                code = extract_code(model_outputs[i] + output)
+                if code:
+                    codes.append(code)
+                    code_indices.append(i)
+            prompts[i] += output
+            model_outputs[i] += output
+
+        print(f"execute {len(codes)} code snippets ...", flush=True)
+        batch_results = executor.batch_apply(codes)
+
+        for i, (exec_result, metadata) in zip(code_indices, batch_results):
+            exec_result = str(exec_result).strip()
+            if len(exec_result) > 100:
+                exec_result = exec_result[:50] + "..." + exec_result[-50:]
+            runtime_msg = str(metadata['concise_exec_info']).strip() if args.use_concise_exec_info else str(metadata['exec_info']).strip()
+            if not exec_result:
+                runtime_msg = str(runtime_msg).strip()
+                if args.use_concise_exec_info:
+                    if len(runtime_msg) > 100:
+                        runtime_msg = runtime_msg[:50] + "..." + runtime_msg[-50:]
+                    exec_result = runtime_msg
+                else:
+                    if tokenizer is None:
+                        tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path or args.tokenizer_name_or_path, trust_remote_code=True)
+                    tokens = tokenizer.tokenize(runtime_msg)
+                    if len(tokens) > 100:
+                        runtime_msg = f"{tokenizer.convert_tokens_to_string(tokens[:50]).strip()} ... {tokenizer.convert_tokens_to_string(tokens[-50:]).strip()}"
+                    exec_result = f"Runtime errors: {runtime_msg}"
+
+            prompts[i] += f"\n```output\n{exec_result.strip()}\n```\n"
+            model_outputs[i] += f"\n```output\n{exec_result.strip()}\n```\n"
+
+        unfinished_ids = [i for i, is_finished in zip(unfinished_ids, finish_completion) if not is_finished]
+
+        n_iters -= 1
+
+    predictions = [eval(args.answer_extraction_fn)(item['messages'][-2]['content'], output, task='interleave') for item, output in tqdm(zip(test_data, model_outputs), desc="extract answer", total=len(model_outputs))]
+    program_outputs = [extract_program_output(output) for output in tqdm(model_outputs, desc='extract program output', total=len(model_outputs))]
+    assert len(model_outputs) > 0, f"{len(model_outputs)}"
+
+    results = []
+    for example, output, pred, program_output in zip(test_data, model_outputs, predictions, program_outputs):
+        item = deepcopy(example)
+        item.update({
+            'model_output': output,
+            'prediction': pred,
+            'program_output': program_output,
+        })
+        results.append(item)
+
+    return results
+
+
+def main(args):
+    random.seed(42)
+
+    print("Loading data...")
+    test_data = []
+    with open(os.path.join(args.data_dir, f"train.jsonl" if args.infer_train_set else f"test.jsonl")) as fin:
+        for line in fin:
+            example = json.loads(line)
+            messages = example['messages']
+            assert len(messages) in [2, 3]
+            assert messages[-1]['role'] == 'assistant'
+            if not args.complete_partial_output:
+                example['reference'] = example.get('reference', '') or messages[-1]['content']
+                messages[-1]['content'] = ''
+            example['messages'] = messages
+            test_data.append(example)
+
+    if args.max_num_examples and len(test_data) > args.max_num_examples:
+        test_data = random.sample(test_data, args.max_num_examples)
+
+    if args.n_subsets > 1:
+        assert args.subset_id >= 0 and args.subset_id < args.n_subsets
+        test_data = [item for i, item in enumerate(test_data) if i % args.n_subsets == args.subset_id]
+
+    if not test_data:
+        return
+
+    if not os.path.exists(args.save_dir):
+        os.makedirs(args.save_dir, exist_ok=True)
+
+    results = [[item] for item in infer(args, test_data)]
+
+    all_items = []
+    for items in results:
+        for item in items:
+            all_items.append(item)
+
+    labels, eval_timeout_cnt = evaluate(eval(args.eval_fn), all_items)
+    for item, label in zip(all_items, labels):
+        item['accuracy'] = label
+    program_labels, program_eval_timeout_cnt = evaluate(partial(eval(args.eval_fn), pred_key='program_output'), all_items)
+    for item, label in zip(all_items, program_labels):
+        item['program_accuracy'] = label
+
+    _results = []
+    for items in tqdm(results, desc='eval', total=len(results)):
+        item = items[-1]
+        item['history'] = items[:-1]
+        _results.append(item)
+    results = _results
+
+    print("Calculating accuracy...")
+    acc = 0
+    program_acc = 0
+    for item in results:
+        acc += item['accuracy']
+        program_acc += item['program_accuracy']
+    print("output acc = {:.5f}; program acc = {:.5f}".format(acc / len(results) * 100, program_acc / len(results) * 100), flush=True)
+
+    print(f"Timeout count >>> output eval = {eval_timeout_cnt}; program eval = {program_eval_timeout_cnt}", flush=True)
+
+    pred_fname = "predictions.json"
+    if args.n_subsets > 1:
+        pred_fname = f"predictions.{args.subset_id}.json"
+    with open(os.path.join(args.save_dir, pred_fname), "w") as fout:
+        json.dump(results, fout, ensure_ascii=True)
+
+    metric_fname = "metrics.json"
+    if args.n_subsets > 1:
+        metric_fname = f"metrics.{args.subset_id}.json"
+    with open(os.path.join(args.save_dir, metric_fname), "w") as fout:
+        json.dump({
+            "n_samples": len(results),
+            "accuracy": sum(item['accuracy'] for item in results) / len(results),
+            "program_accuracy": sum(item['program_accuracy'] for item in results) / len(results)
+        }, fout, indent=4)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir", type=str, default="data/mgsm")
+    parser.add_argument("--max_num_examples", type=int, default=None, help="maximum number of examples to evaluate.")
+    parser.add_argument("--save_dir", type=str, default="results/mgsm")
+    parser.add_argument("--model_name_or_path", type=str, default=None, help="if specified, we will load the model to generate the predictions.")
+    parser.add_argument("--tokenizer_name_or_path", type=str, default=None, help="if specified, we will load the tokenizer from here.")
+    parser.add_argument("--eval_batch_size", type=int, default=1, help="batch size for evaluation.")
+    parser.add_argument("--load_in_8bit", action="store_true", help="load model in 8bit mode, which will reduce memory and speed up inference.")
+    parser.add_argument("--gptq", action="store_true", help="If given, we're evaluating a 4-bit quantized GPTQ model.")
+    parser.add_argument("--use_vllm", action="store_true")
+    parser.add_argument("--load_in_half", action='store_true')
+    parser.add_argument("--infer_train_set", action="store_true")
+    parser.add_argument("--n_subsets", type=int, default=1)
+    parser.add_argument("--subset_id", type=int, default=0)
+    parser.add_argument("--temperature", type=float, default=0.0)
+    parser.add_argument("--repeat_id_start", type=int, default=0)
+    parser.add_argument("--n_repeat_sampling", type=int, default=1)
+    parser.add_argument("--complete_partial_output", action='store_true')
+    parser.add_argument("--use_concise_exec_info", action='store_true')
+    parser.add_argument("--prompt_format", type=str, choices=['sft', 'few_shot'], default='sft')
+    parser.add_argument("--few_shot_prompt", type=str, default=None)
+    parser.add_argument("--answer_extraction_fn", type=str, required=True)
+    parser.add_argument("--no-execution", action='store_true')
+    parser.add_argument("--eval_fn", type=str, required=True)
+    parser.add_argument("--gpus", type=str, default=None)
+    args, unparsed_args = parser.parse_known_args()
+    if args.gpus is not None:
+        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpus
+
+    print(unparsed_args, flush=True)
+
+    model = None
+    tokenizer = None
+    pool = None
+    if args.n_repeat_sampling > 1 or args.repeat_id_start != 0:
+        assert args.temperature > 0
+        save_dir = args.save_dir
+        for i in range(args.repeat_id_start, args.repeat_id_start + args.n_repeat_sampling):
+            print(f"working on the {i} trials ...", flush=True)
+            args.save_dir = os.path.join(save_dir, str(i))
+            os.makedirs(args.save_dir, exist_ok=True)
+            main(args)
+    else:
+        main(args)
+
+    if pool is not None:
+        pool.close()