Mirror of https://github.com/deepseek-ai/DeepSeek-Coder
Synced 2024-12-05 02:24:46 +00:00
add instruction model eval script
This commit is contained in:
parent
8a99c2154a
commit
118e71a1af
126 Evaluation/HumanEval/eval_instruct.py (new file)
@@ -0,0 +1,126 @@
import argparse
import json
import os
import torch
from pathlib import Path
from tqdm import tqdm

from utils.utils import extract_generation_code
from transformers import AutoTokenizer, AutoModelForCausalLM
from human_eval.evaluation import evaluate_functional_correctness

data_abs_dir = Path(__file__).parent / "data"


def build_deepseekcoder_instruction(language: str, question: str):
    return '''
Please help me complete the given function. Use only the given packages and DO NOT refer to any new package. Please return the completed function in a code block.
Here is the given code to complete:
```{}
{}
```
'''.strip().format(language.lower(), question)


def generate_one(example, lang, tokenizer, model):
    prompt = build_deepseekcoder_instruction(lang, example['prompt'])
    inputs = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': prompt}],
        return_tensors="pt"
    ).to(model.device)

    stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
    assert isinstance(stop_id, int), "Invalid tokenizer, <|EOT|> id not found"

    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        do_sample=False,  # greedy decoding; top_p has no effect when sampling is disabled
        top_p=0.95,
        eos_token_id=stop_id
    )
    # Decode only the newly generated tokens, skipping the prompt.
    output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    example['output'] = output

    return extract_generation_code(example, lang_code=lang)


def generate_main(args):
    model_name_or_path = args.model
    lang = args.language
    saved_path = args.output_path
    temp_dir = args.temp_dir
    os.makedirs(temp_dir, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    print("Loaded tokenizer {} from {}.".format(tokenizer.__class__, model_name_or_path))
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="cuda"
    )
    model.eval()

    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
    examples = [json.loads(x) for x in open(problem_file) if x.strip()]
    print("Read {} examples for evaluation.".format(len(examples)))

    generated_examples = []
    for ex in tqdm(examples, desc='Generating'):
        gen_example = generate_one(ex, lang, tokenizer, model)
        generated_examples.append(gen_example)

    print("Generation finished.")
    with open(saved_path, 'w', encoding='utf-8') as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + '\n')
    print("Saved {} generated examples to {}.".format(len(generated_examples), saved_path))

    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang
    )
    print(lang, result, model_name_or_path)


def evaluation_only(args):
    lang = args.language
    temp_dir = args.temp_dir
    assert os.path.exists(args.output_path), "Output file not found: {}".format(args.output_path)
    os.makedirs(temp_dir, exist_ok=True)

    output_name = os.path.basename(args.output_path)
    output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]

    processed_examples = [extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")]
    processed_path = os.path.join(temp_dir, output_name)
    with open(processed_path, 'w', encoding='utf-8') as fw:
        for ex in processed_examples:
            fw.write(json.dumps(ex) + '\n')
    print("Saved {} processed examples to {}.".format(len(processed_examples), processed_path))

    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
    result = evaluate_functional_correctness(
        input_file=processed_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang
    )
    print(lang, result)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, help="model name or path")
    parser.add_argument('--output_path', type=str, help="output path of your generation")
    parser.add_argument('--language', type=str, help="language")
    parser.add_argument('--temp_dir', type=str, help="temp dir for evaluation", default="tmp")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    generate_main(args)
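For reference, the script is driven entirely by the four argparse flags registered above. A minimal sketch of invoking it programmatically; the checkpoint name and paths are illustrative assumptions, not part of the commit:

# Hypothetical driver for the script above; equivalent to something like
#   python eval_instruct.py --model <model> --language python --output_path <out> --temp_dir tmp
from types import SimpleNamespace
from eval_instruct import generate_main

args = SimpleNamespace(
    model="deepseek-ai/deepseek-coder-6.7b-instruct",   # assumed checkpoint name
    output_path="humaneval-python-results.jsonl",       # assumed output path
    language="python",
    temp_dir="tmp",
)
generate_main(args)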
Evaluation/HumanEval/utils/utils.py
@@ -1,3 +1,109 @@
import re

language_settings = {
    'python': {
        'full_name': 'Python',
        'indent': 4,
    },
    'cpp': {
        'full_name': 'cpp',
        'indent': 0,
        'main': "int main()",
    },
    'java': {
        'full_name': 'Java',
        'indent': 4,
        'main': "public static void main",
    },
    'cs': {
        'full_name': "csharp",
        'indent': 0,
        'main': "public static void Main",
    },
    'php': {
        'full_name': "PHP",
        'indent': 0,
    },
    'ts': {
        'full_name': "TypeScript",
        'indent': 0,
    },
    'js': {
        'full_name': "JavaScript",
        'indent': 0
    },
    'sh': {
        'full_name': "Bash",
        'indent': 0
    }
}


def get_function_name(question: str, lang: str):
    func_lines = [x for x in question.strip().split('\n') if x.strip()]

    if lang.lower() == 'python':
        # The target function is the last top-level `def` in the prompt.
        func_idx = [i for i in range(len(func_lines)) if func_lines[i].startswith("def ")][-1]
        func_name = func_lines[func_idx].split('(')[0].strip()
        func_prefix = "\n".join(func_lines[:func_idx])
        return func_name, func_prefix

    # For brace-style languages, the signature is the last non-empty line.
    func_name = func_lines[-1].split('{')[0].strip()
    func_prefix = "\n".join(func_lines[:-1])
    return func_name, func_prefix


def extract_generation_code(example: dict, lang_code: str, verbose: bool = False):
    task_id = example['task_id']
    output = example.get('output', example.get("gpt_completion"))
    question = example["prompt"].strip()
    setting = language_settings[lang_code]
    lang = setting['full_name']
    indent = setting['indent']

    try:
        code_block: str = re.findall(f'```{lang.lower()}\n(.*?)```', output, re.DOTALL | re.IGNORECASE)[0]
        if verbose:
            print(">>> Task: {}\n{}".format(task_id, code_block))

        # Remove any main function so only the solution itself is evaluated.
        if setting.get('main', None) and setting['main'] in code_block:
            main_start = code_block.index(setting['main'])
            code_block = code_block[:main_start]

        func_name, func_prefix = get_function_name(question, lang)

        try:
            start = code_block.lower().index(func_name.lower())
            # Count the spaces indenting the function definition.
            indent = 0
            while start - indent - 1 >= 0 and code_block[start - indent - 1] == ' ':
                indent += 1

            try:
                end = code_block.rindex('\n' + ' ' * indent + '}')
            except ValueError:
                end = len(code_block)
        except ValueError:
            start = 0
            try:
                end = code_block.rindex('\n' + ' ' * indent + '}')
            except ValueError:
                end = len(code_block)

        body = code_block[start:end]

        if lang_code.lower() in ['php', 'ts', 'js']:
            body += '\n' + ' ' * indent + '}'

        generation = func_prefix + '\n' + body + '\n'
        example['generation'] = generation

    except Exception as ex:
        print("Failed to extract code block with error `{}`:\n>>> Task: {}\n>>> Output:\n{}".format(
            ex, task_id, output
        ))
        # Fall back to prompt + raw output so evaluation can still run.
        example['generation'] = example['prompt'] + '\n' + output

    return example


def cleanup_code(
    code: str,
    language_type: str = None,
Loading…
Reference in New Issue
Block a user