diff --git a/Evaluation/HumanEval/eval_instruct.py b/Evaluation/HumanEval/eval_instruct.py
new file mode 100644
index 0000000..f09a37d
--- /dev/null
+++ b/Evaluation/HumanEval/eval_instruct.py
@@ -0,0 +1,123 @@
+import argparse
+import json
+import os
+import torch
+from pathlib import Path
+from tqdm import tqdm
+
+data_abs_dir = Path(__file__).parent / "data"
+
+from utils.utils import extract_generation_code, language_settings
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from human_eval.evaluation import evaluate_functional_correctness
+
+def build_deepseekcoder_instruction(language: str, question: str):
+    return '''
+Please help me to complete the function. Use the given packages only and DO NOT refer to any new packages. Please return the completed function in a codeblock.
+Here is the given code to complete:
+```{}
+{}
+```
+'''.strip().format(language.lower(), question)
+
+
+def generate_one(example, lang, tokenizer, model):
+    # Use the display name (e.g. "Python"), matching the tag extract_generation_code looks for.
+    prompt = build_deepseekcoder_instruction(language_settings[lang]['full_name'], example['prompt'])
+    inputs = tokenizer.apply_chat_template(
+        [{'role': 'user', 'content': prompt}],
+        return_tensors="pt"
+    ).to(model.device)
+
+    stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
+    assert isinstance(stop_id, int), "Invalid tokenizer: <|EOT|> token id not found"
+
+    # Greedy decoding; generation stops at the <|EOT|> token.
+    outputs = model.generate(
+        inputs,
+        max_new_tokens=512,
+        do_sample=False,
+        eos_token_id=stop_id
+    )
+    output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
+    example['output'] = output
+
+    return extract_generation_code(example, lang_code=lang)
+
+def generate_main(args):
+    model_name_or_path = args.model
+    lang = args.language
+    saved_path = args.output_path
+    temp_dir = args.temp_dir
+    os.makedirs(temp_dir, exist_ok=True)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+    print("Loaded tokenizer {} from {}.".format(tokenizer.__class__, model_name_or_path))
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name_or_path,
+        torch_dtype=torch.bfloat16,
+        device_map="cuda"
+    )
+
+    model.eval()
+    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
+    examples = [json.loads(x) for x in open(problem_file) if x.strip()]
+    print("Read {} examples for evaluation.".format(len(examples)))
+
+    generated_examples = []
+    for ex in tqdm(examples, desc='Generating'):
+        gen_example = generate_one(ex, lang, tokenizer, model)
+        generated_examples.append(gen_example)
+
+    print("Generation finished.")
+    with open(saved_path, 'w', encoding='utf-8') as fw:
+        for ex in generated_examples:
+            fw.write(json.dumps(ex) + '\n')
+        print("Saved {} generated examples into {}.".format(len(generated_examples), saved_path))
+
+    result = evaluate_functional_correctness(
+        input_file=saved_path,
+        tmp_dir=temp_dir,
+        n_workers=8,
+        timeout=3.0,
+        problem_file=problem_file,
+        language=lang
+    )
+    print(lang, result, model_name_or_path)
+
+def evaluation_only(args):
+    lang = args.language
+    temp_dir = args.temp_dir
+    assert os.path.exists(args.output_path), "Output file not found: {}".format(args.output_path)
+    os.makedirs(temp_dir, exist_ok=True)
+
+    output_name = os.path.basename(args.output_path)
+    output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]
+
+    processed_examples = [extract_generation_code(ex, lang) for ex in tqdm(output_examples, desc='Processing')]
+    processed_path = os.path.join(temp_dir, output_name)
+    with open(processed_path, 'w', encoding='utf-8') as fw:
+        for ex in processed_examples:
+            fw.write(json.dumps(ex) + '\n')
+        print("Saved {} processed examples into {}.".format(len(processed_examples), processed_path))
+
+    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
+    result = evaluate_functional_correctness(
+        input_file=processed_path,
+        tmp_dir=temp_dir,
+        n_workers=8,
+        timeout=3.0,
+        problem_file=problem_file,
+        language=lang
+    )
+    print(lang, result)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, help="model name or path")
+    parser.add_argument('--output_path', type=str, help="output path of your generation")
+    parser.add_argument('--language', type=str, help="language")
+    parser.add_argument('--temp_dir', type=str, help="temp dir for evaluation", default="tmp")
+    args = parser.parse_args()
+
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    generate_main(args)
\ No newline at end of file
diff --git a/Evaluation/HumanEval/utils/utils.py b/Evaluation/HumanEval/utils/utils.py
index 21b9c1b..41fcb3d 100644
--- a/Evaluation/HumanEval/utils/utils.py
+++ b/Evaluation/HumanEval/utils/utils.py
@@ -1,3 +1,110 @@
+import re
+
+language_settings = {
+    'python': {
+        'full_name': 'Python',
+        'indent': 4,
+    },
+    'cpp': {
+        'full_name': 'cpp',
+        'indent': 0,
+        'main': "int main()",
+    },
+    'java': {
+        'full_name': 'Java',
+        'indent': 4,
+        'main': "public static void main",
+    },
+    'cs': {
+        'full_name': "csharp",
+        'indent': 0,
+        'main': "public static void Main",
+    },
+    'php': {
+        'full_name': "PHP",
+        'indent': 0,
+    },
+    'ts': {
+        'full_name': "TypeScript",
+        'indent': 0,
+    },
+    'js': {
+        'full_name': "JavaScript",
+        'indent': 0
+    },
+    'sh': {
+        'full_name': "Bash",
+        'indent': 0
+    }
+}
+
+def get_function_name(question: str, lang: str):
+    func_lines = [x for x in question.strip().split('\n') if x.strip()]
+
+    if lang.lower() == 'python':
+        func_idx = [i for i in range(len(func_lines)) if func_lines[i].startswith("def ")][-1]
+        func_name = func_lines[func_idx].split('(')[0].strip()
+        func_prefix = "\n".join(func_lines[:func_idx])
+        return func_name, func_prefix
+
+    func_name = func_lines[-1].split('{')[0].strip()
+    func_prefix = "\n".join(func_lines[:-1])
+    return func_name, func_prefix
+
+def extract_generation_code(example: dict, lang_code: str, verbose: bool=False):
+    task_id = example['task_id']
+    output = example.get('output', example.get("gpt_completion"))
+    question = example["prompt"].strip()
+    setting = language_settings[lang_code]
+    lang = setting['full_name']
+    indent = setting['indent']
+
+    try:
+        code_block: str = re.findall(f'```{lang.lower()}\n(.*?)```', output, re.DOTALL | re.IGNORECASE)[0]
+        if verbose:
+            print(">>> Task: {}\n{}".format(task_id, code_block))
+
+        # Remove the main function, if present
+        if setting.get('main', None) and setting['main'] in code_block:
+            main_start = code_block.index(setting['main'])
+            code_block = code_block[:main_start]
+
+        func_name, func_prefix = get_function_name(question, lang)
+
+        try:
+            # Locate the target function and infer its indentation
+            start = code_block.lower().index(func_name.lower())
+            indent = 0
+            while start - indent > 0 and code_block[start - indent - 1] == ' ':
+                indent += 1
+
+            try:
+                end = code_block.rindex('\n' + ' ' * indent + '}')
+            except ValueError:
+                end = len(code_block)
+        except ValueError:
+            start = 0
+            try:
+                end = code_block.rindex('\n' + ' ' * indent + '}')
+            except ValueError:
+                end = len(code_block)
+
+        body = code_block[start:end]
+
+        if lang_code.lower() in ['php', 'ts', 'js']:
+            body += '\n' + ' ' * indent + '}'
+
+        generation = func_prefix + '\n' + body + '\n'
+        example['generation'] = generation
+
+    except Exception as ex:
+        print("Failed to extract code block with error `{}`:\n>>> Task: {}\n>>> Output:\n{}".format(
+            ex, task_id, output
+        ))
+        example['generation'] = example['prompt'] + '\n' + output
+
+    return example
+
 def cleanup_code(
     code: str,
     language_type: str = None,
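
Usage note: the script's entry point only calls generate_main, so generation and evaluation run in one pass. A minimal invocation sketch, assuming an illustrative checkpoint name and output path (any causal LM with a chat template and an `<|EOT|>` token should fit, and `data/humaneval-python.jsonl` must exist beside the script):

    python Evaluation/HumanEval/eval_instruct.py \
        --model deepseek-ai/deepseek-coder-6.7b-instruct \
        --output_path output/humaneval-python.jsonl \
        --language python \
        --temp_dir tmp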
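
For the extraction helper, a small sketch of the record shape it expects and what it returns; the example dict is fabricated for illustration and is not taken from the HumanEval data files:

    from utils.utils import extract_generation_code

    # Hypothetical record: one HumanEval-style problem plus a model response.
    example = {
        'task_id': 'Python/0',
        'prompt': 'def add(a: int, b: int) -> int:\n',
        'output': 'Here you go:\n```python\ndef add(a: int, b: int) -> int:\n    return a + b\n```',
    }
    result = extract_generation_code(example, lang_code='python')
    # 'generation' holds the function prefix (empty here) plus the extracted body.
    print(result['generation'])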