Mirror of https://github.com/deepseek-ai/DeepSeek-Coder
Synced 2024-12-05 02:24:46 +00:00
add instruction model eval script
This commit is contained in:
parent
8a99c2154a
commit
118e71a1af
126 Evaluation/HumanEval/eval_instruct.py (new file)
@@ -0,0 +1,126 @@
import argparse
import json
import os
import torch
from pathlib import Path
from tqdm import tqdm

from utils.utils import extract_generation_code
from transformers import AutoTokenizer, AutoModelForCausalLM
from human_eval.evaluation import evaluate_functional_correctness

data_abs_dir = Path(__file__).parent / "data"


def build_deepseekcoder_instruction(language: str, question: str):
    return '''
Please help me complete the given function. Use only the given packages and DO NOT refer to any new package. Please return the completed function in a code block.
Here is the given code to complete:
```{}
{}
```
'''.strip().format(language.lower(), question)


def generate_one(example, lang, tokenizer, model):
    prompt = build_deepseekcoder_instruction(lang, example['prompt'])
    inputs = tokenizer.apply_chat_template(
        [{'role': 'user', 'content': prompt}],
        return_tensors="pt"
    ).to(model.device)

    stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
    assert isinstance(stop_id, int), "Invalid tokenizer, <|EOT|> id not found"

    outputs = model.generate(
        inputs,
        max_new_tokens=512,
        do_sample=False,  # greedy decoding; top_p has no effect when sampling is disabled
        top_p=0.95,
        eos_token_id=stop_id
    )
    # Decode only the newly generated tokens, skipping the prompt.
    output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
    example['output'] = output

    return extract_generation_code(example, lang_code=lang)


def generate_main(args):
    model_name_or_path = args.model
    lang = args.language
    saved_path = args.output_path
    temp_dir = args.temp_dir
    os.makedirs(temp_dir, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    print("Loaded tokenizer {} from {}.".format(tokenizer.__class__, model_name_or_path))
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="cuda"
    )
    model.eval()

    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
    examples = [json.loads(x) for x in open(problem_file) if x.strip()]
    print("Read {} examples for evaluation.".format(len(examples)))

    generated_examples = []
    for ex in tqdm(examples, desc='Generating'):
        gen_example = generate_one(ex, lang, tokenizer, model)
        generated_examples.append(gen_example)

    print("Generation finished.")
    with open(saved_path, 'w', encoding='utf-8') as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + '\n')
    print("Saved {} generated examples to {}.".format(len(generated_examples), saved_path))

    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang
    )
    print(lang, result, model_name_or_path)


def evaluation_only(args):
    lang = args.language
    temp_dir = args.temp_dir
    assert os.path.exists(args.output_path), "Output file not found: {}".format(args.output_path)
    os.makedirs(temp_dir, exist_ok=True)

    output_name = os.path.basename(args.output_path)
    output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]

    processed_examples = [extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")]
    processed_path = os.path.join(temp_dir, output_name)
    with open(processed_path, 'w', encoding='utf-8') as fw:
        for ex in processed_examples:
            fw.write(json.dumps(ex) + '\n')
    print("Saved {} processed examples to {}.".format(len(processed_examples), processed_path))

    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
    result = evaluate_functional_correctness(
        input_file=processed_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang
    )
    print(lang, result)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, help="model name or path")
    parser.add_argument('--output_path', type=str, help="output path of your generation")
    parser.add_argument('--language', type=str, help="language")
    parser.add_argument('--temp_dir', type=str, help="temp dir for evaluation", default="tmp")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    generate_main(args)
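For reference, the script is driven entirely by the four argparse flags registered above. A minimal sketch of invoking it programmatically; the checkpoint name and paths are illustrative assumptions, not part of the commit:

# Hypothetical driver for the script above; equivalent to something like
#   python eval_instruct.py --model <model> --language python --output_path <out> --temp_dir tmp
from types import SimpleNamespace
from eval_instruct import generate_main

args = SimpleNamespace(
    model="deepseek-ai/deepseek-coder-6.7b-instruct",   # assumed checkpoint name
    output_path="humaneval-python-results.jsonl",       # assumed output path
    language="python",
    temp_dir="tmp",
)
generate_main(args)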
Evaluation/HumanEval/utils/utils.py
@@ -1,3 +1,109 @@
import re

language_settings = {
    'python': {
        'full_name': 'Python',
        'indent': 4,
    },
    'cpp': {
        'full_name': 'cpp',
        'indent': 0,
        'main': "int main()",
    },
    'java': {
        'full_name': 'Java',
        'indent': 4,
        'main': "public static void main",
    },
    'cs': {
        'full_name': "csharp",
        'indent': 0,
        'main': "public static void Main",
    },
    'php': {
        'full_name': "PHP",
        'indent': 0,
    },
    'ts': {
        'full_name': "TypeScript",
        'indent': 0,
    },
    'js': {
        'full_name': "JavaScript",
        'indent': 0
    },
    'sh': {
        'full_name': "Bash",
        'indent': 0
    }
}


def get_function_name(question: str, lang: str):
    func_lines = [x for x in question.strip().split('\n') if x.strip()]

    if lang.lower() == 'python':
        # The target function is the last top-level `def` in the prompt.
        func_idx = [i for i in range(len(func_lines)) if func_lines[i].startswith("def ")][-1]
        func_name = func_lines[func_idx].split('(')[0].strip()
        func_prefix = "\n".join(func_lines[:func_idx])
        return func_name, func_prefix

    # For brace-style languages, the signature is the last non-empty line.
    func_name = func_lines[-1].split('{')[0].strip()
    func_prefix = "\n".join(func_lines[:-1])
    return func_name, func_prefix


def extract_generation_code(example: dict, lang_code: str, verbose: bool = False):
    task_id = example['task_id']
    output = example.get('output', example.get("gpt_completion"))
    question = example["prompt"].strip()
    setting = language_settings[lang_code]
    lang = setting['full_name']
    indent = setting['indent']

    try:
        code_block: str = re.findall(f'```{lang.lower()}\n(.*?)```', output, re.DOTALL | re.IGNORECASE)[0]
        if verbose:
            print(">>> Task: {}\n{}".format(task_id, code_block))

        # Remove any main function so only the solution itself is evaluated.
        if setting.get('main', None) and setting['main'] in code_block:
            main_start = code_block.index(setting['main'])
            code_block = code_block[:main_start]

        func_name, func_prefix = get_function_name(question, lang)

        try:
            start = code_block.lower().index(func_name.lower())
            # Count the spaces indenting the function definition.
            indent = 0
            while start - indent - 1 >= 0 and code_block[start - indent - 1] == ' ':
                indent += 1

            try:
                end = code_block.rindex('\n' + ' ' * indent + '}')
            except ValueError:
                end = len(code_block)
        except ValueError:
            start = 0
            try:
                end = code_block.rindex('\n' + ' ' * indent + '}')
            except ValueError:
                end = len(code_block)

        body = code_block[start:end]

        if lang_code.lower() in ['php', 'ts', 'js']:
            body += '\n' + ' ' * indent + '}'

        generation = func_prefix + '\n' + body + '\n'
        example['generation'] = generation

    except Exception as ex:
        print("Failed to extract code block with error `{}`:\n>>> Task: {}\n>>> Output:\n{}".format(
            ex, task_id, output
        ))
        # Fall back to prompt + raw output so evaluation can still run.
        example['generation'] = example['prompt'] + '\n' + output

    return example


def cleanup_code(
    code: str,
    language_type: str = None,
Loading…
Reference in New Issue
Block a user