mirror of
https://github.com/deepseek-ai/DeepSeek-Coder
synced 2025-06-26 18:25:53 +00:00
init project
This commit is contained in:
0
Evaluation/MBPP/human_eval/__init__.py
Normal file
0
Evaluation/MBPP/human_eval/__init__.py
Normal file
BIN
Evaluation/MBPP/human_eval/__pycache__/__init__.cpython-37.pyc
Normal file
BIN
Evaluation/MBPP/human_eval/__pycache__/__init__.cpython-37.pyc
Normal file
Binary file not shown.
BIN
Evaluation/MBPP/human_eval/__pycache__/__init__.cpython-38.pyc
Normal file
BIN
Evaluation/MBPP/human_eval/__pycache__/__init__.cpython-38.pyc
Normal file
Binary file not shown.
BIN
Evaluation/MBPP/human_eval/__pycache__/data.cpython-37.pyc
Normal file
BIN
Evaluation/MBPP/human_eval/__pycache__/data.cpython-37.pyc
Normal file
Binary file not shown.
BIN
Evaluation/MBPP/human_eval/__pycache__/data.cpython-38.pyc
Normal file
BIN
Evaluation/MBPP/human_eval/__pycache__/data.cpython-38.pyc
Normal file
Binary file not shown.
Binary file not shown.
BIN
Evaluation/MBPP/human_eval/__pycache__/evaluation.cpython-37.pyc
Normal file
BIN
Evaluation/MBPP/human_eval/__pycache__/evaluation.cpython-37.pyc
Normal file
Binary file not shown.
BIN
Evaluation/MBPP/human_eval/__pycache__/evaluation.cpython-38.pyc
Normal file
BIN
Evaluation/MBPP/human_eval/__pycache__/evaluation.cpython-38.pyc
Normal file
Binary file not shown.
BIN
Evaluation/MBPP/human_eval/__pycache__/execution.cpython-38.pyc
Normal file
BIN
Evaluation/MBPP/human_eval/__pycache__/execution.cpython-38.pyc
Normal file
Binary file not shown.
49
Evaluation/MBPP/human_eval/data.py
Normal file
49
Evaluation/MBPP/human_eval/data.py
Normal file
@@ -0,0 +1,49 @@
|
||||
from typing import Iterable, Dict
|
||||
import gzip
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
ROOT = os.path.dirname(os.path.abspath(__file__))
|
||||
HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
|
||||
|
||||
|
||||
def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
|
||||
return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
|
||||
|
||||
|
||||
def stream_jsonl(filename: str) -> Iterable[Dict]:
|
||||
"""
|
||||
Parses each jsonl line and yields it as a dictionary
|
||||
"""
|
||||
if filename.endswith(".gz"):
|
||||
with open(filename, "rb") as gzfp:
|
||||
with gzip.open(gzfp, 'rt') as fp:
|
||||
for line in fp:
|
||||
if any(not x.isspace() for x in line):
|
||||
yield json.loads(line)
|
||||
else:
|
||||
with open(filename, "r", encoding="utf-8") as fp:
|
||||
for line in fp:
|
||||
if any(not x.isspace() for x in line):
|
||||
yield json.loads(line)
|
||||
|
||||
|
||||
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
|
||||
"""
|
||||
Writes an iterable of dictionaries to jsonl
|
||||
"""
|
||||
if append:
|
||||
mode = 'ab'
|
||||
else:
|
||||
mode = 'wb'
|
||||
filename = os.path.expanduser(filename)
|
||||
if filename.endswith(".gz"):
|
||||
with open(filename, mode) as fp:
|
||||
with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
|
||||
for x in data:
|
||||
gzfp.write((json.dumps(x) + "\n").encode('utf-8'))
|
||||
else:
|
||||
with open(filename, mode) as fp:
|
||||
for x in data:
|
||||
fp.write((json.dumps(x) + "\n").encode('utf-8'))
|
||||
@@ -0,0 +1,29 @@
|
||||
import fire
|
||||
import sys
|
||||
|
||||
from .data import HUMAN_EVAL
|
||||
from .evaluation import evaluate_functional_correctness
|
||||
|
||||
|
||||
def entry_point(
|
||||
sample_file: str,
|
||||
k: str = "1,10,100",
|
||||
n_workers: int = 4,
|
||||
timeout: float = 3.0,
|
||||
problem_file: str = "",
|
||||
is_mbpp: bool = False,
|
||||
):
|
||||
"""
|
||||
Evaluates the functional correctness of generated samples, and writes
|
||||
results to f"{sample_file}_results.jsonl.gz"
|
||||
"""
|
||||
k = list(map(int, k.split(",")))
|
||||
results = evaluate_functional_correctness(sample_file, k, n_workers, timeout, problem_file, is_mbpp)
|
||||
print(results)
|
||||
|
||||
|
||||
def main():
|
||||
fire.Fire(entry_point)
|
||||
|
||||
|
||||
sys.exit(main())
|
||||
298
Evaluation/MBPP/human_eval/evaluation.py
Normal file
298
Evaluation/MBPP/human_eval/evaluation.py
Normal file
@@ -0,0 +1,298 @@
|
||||
import os
|
||||
import sys
|
||||
import fire
|
||||
import json
|
||||
import gzip
|
||||
import regex
|
||||
import numpy as np
|
||||
import itertools
|
||||
|
||||
from typing import *
|
||||
from tqdm.auto import tqdm
|
||||
from collections import defaultdict
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from .data import stream_jsonl
|
||||
from .execution import check_correctness
|
||||
IMPORT_HELPER = {
|
||||
"python": [
|
||||
"import math",
|
||||
"import re",
|
||||
"import sys",
|
||||
"import copy",
|
||||
"import datetime",
|
||||
"import itertools",
|
||||
"import collections",
|
||||
"import heapq",
|
||||
"import functools",
|
||||
"import hashlib",
|
||||
"import numpy",
|
||||
"import numpy as np",
|
||||
"import string",
|
||||
"from typing import *",
|
||||
"from collections import *",
|
||||
],
|
||||
"go" : [
|
||||
"math",
|
||||
"strings",
|
||||
"fmt",
|
||||
"strconv",
|
||||
"time",
|
||||
"bytes",
|
||||
"regexp",
|
||||
"sort",
|
||||
"math/rand",
|
||||
"crypto/md5",
|
||||
],
|
||||
"cpp" : [
|
||||
"#include<stdlib.h>",
|
||||
"#include<algorithm>",
|
||||
"#include<math.h>",
|
||||
"#include<stdio.h>",
|
||||
"#include<vector>",
|
||||
"#include<string>",
|
||||
"#include<climits>",
|
||||
"#include<cstring>",
|
||||
"#include<iostream>",
|
||||
"#include<cassert>"
|
||||
],
|
||||
"cs": ["using System.Numerics;", "using System.Diagnostics;", "using System.Collections.Generic;", "using System.Linq;", "using System.Text;", "using System.Security.Cryptography;", "using System.Collections.Generic;"]
|
||||
}
|
||||
|
||||
|
||||
LANGUAGE_NAME = {
|
||||
"cpp" : "CPP",
|
||||
"go" : "Go",
|
||||
"java" : "Java",
|
||||
"js" : "JavaScript",
|
||||
"python": "Python",
|
||||
}
|
||||
|
||||
|
||||
def read_dataset(
|
||||
data_file: str = None,
|
||||
dataset_type: str = "humaneval",
|
||||
num_shot=None,
|
||||
) -> Dict:
|
||||
"""
|
||||
Reads a dataset and returns a dictionary of tasks.
|
||||
"""
|
||||
if num_shot is not None:
|
||||
print(f"{num_shot}-shot setting...")
|
||||
if "humaneval" in dataset_type.lower():
|
||||
if data_file is None:
|
||||
current_path = os.path.dirname(os.path.abspath(__file__))
|
||||
data_file = os.path.join(current_path, "..", "humaneval-x", "python", "data", "humaneval_python.jsonl.gz")
|
||||
dataset = {task["task_id"]: task for task in stream_jsonl(data_file)}
|
||||
else:
|
||||
raise f"Dataset: {dataset_type} not supported."
|
||||
|
||||
return dataset
|
||||
|
||||
def estimate_pass_at_k(
|
||||
num_samples: Union[int, List[int], np.ndarray],
|
||||
num_correct: Union[List[int], np.ndarray],
|
||||
k: int
|
||||
) -> np.ndarray:
|
||||
"""
|
||||
Estimates pass@k of each problem and returns them in an array.
|
||||
"""
|
||||
|
||||
def estimator(n: int, c: int, k: int) -> float:
|
||||
"""
|
||||
Calculates 1 - comb(n - c, k) / comb(n, k).
|
||||
"""
|
||||
if n - c < k:
|
||||
return 1.0
|
||||
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
|
||||
|
||||
if isinstance(num_samples, int):
|
||||
num_samples_it = itertools.repeat(num_samples, len(num_correct))
|
||||
else:
|
||||
assert len(num_samples) == len(num_correct)
|
||||
num_samples_it = iter(num_samples)
|
||||
|
||||
return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
|
||||
|
||||
def process_humaneval_test(sample, problems, example_test=False, is_mbpp=False, language="python"):
|
||||
"""
|
||||
Processes a sample for evaluation.
|
||||
"""
|
||||
task_id = sample["task_id"]
|
||||
if is_mbpp:
|
||||
return sample["generation"] + "\n" + "\n".join(problems[task_id]["test"])
|
||||
|
||||
#prompt = sample["prompt"]
|
||||
if example_test and "example_test" in problems[task_id] and problems[task_id]["example_test"] != "":
|
||||
test = problems[task_id]["example_test"]
|
||||
else:
|
||||
test = problems[task_id]["test"]
|
||||
test = "\n".join(test)
|
||||
|
||||
code = sample["generation"]
|
||||
|
||||
# Pre-process for different languages
|
||||
if language == "python":
|
||||
test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
|
||||
test_string = test_setup + code + "\n" + test + "\n"
|
||||
elif language == "cpp":
|
||||
test_set_up = ""
|
||||
for s in IMPORT_HELPER["cpp"]:
|
||||
if s not in prompt:
|
||||
test_set_up += s + "\n"
|
||||
test_string = test_set_up + "\n" + code + "\n" + test
|
||||
elif language == "java":
|
||||
test_string = code + "\n" + test
|
||||
elif language == "cs":
|
||||
test_set_up = ""
|
||||
for s in IMPORT_HELPER["cs"]:
|
||||
test_set_up += s + "\n"
|
||||
test_string = test_set_up + "\n" + code + "\n" + test
|
||||
elif language in ["js", "javascript", "ts", "sh", "go"]:
|
||||
test_string = code + "\n" + test
|
||||
elif language == "go232":
|
||||
import_string = problems[task_id]["import"]
|
||||
prompt = prompt.replace(import_string, "")
|
||||
if example_test and "example_test" in problems[task_id]:
|
||||
test = problems[task_id]["example_test"]
|
||||
else:
|
||||
test = problems[task_id]["test"]
|
||||
test_setup = problems[task_id]["test_setup"]
|
||||
other_pkgs = []
|
||||
for pkg in IMPORT_HELPER["go"]:
|
||||
if pkg not in test_setup:
|
||||
p = pkg.split("/")[-1]
|
||||
if p + "." in code:
|
||||
other_pkgs.append(f"\"{pkg}\"")
|
||||
if other_pkgs:
|
||||
import_other_pkgs = "import (\n" + " ".join([p + "\n" for p in other_pkgs]) + ")"
|
||||
test_string = test_setup + "\n" + import_other_pkgs + "\n" + prompt + code + "\n" + test
|
||||
else:
|
||||
test_string = test_setup + "\n" + prompt + code + "\n" + test
|
||||
elif language == "rust":
|
||||
main = "\nfn main(){ \n } \n"
|
||||
declaration = problems[task_id]["declaration"]
|
||||
test_string = main + declaration + prompt + code + test
|
||||
elif language == "php":
|
||||
if code[:5] != "<?php":
|
||||
code = "<?php\n" + code
|
||||
test_string = code + "\n" + test + "?>"
|
||||
return test_string
|
||||
|
||||
|
||||
def stream_jsonl_all(filename: str) -> Iterable[Dict]:
|
||||
"""
|
||||
Streams a JSONL file.
|
||||
"""
|
||||
results = []
|
||||
if filename.endswith(".gz"):
|
||||
fp = gzip.open(open(filename, "rb"), "rt")
|
||||
else:
|
||||
fp = open(filename, "r")
|
||||
for line in fp:
|
||||
if any(not x.isspace() for x in line):
|
||||
results.append(json.loads(line))
|
||||
fp.close()
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def evaluate_functional_correctness(
|
||||
input_file: str = None,
|
||||
tmp_dir: str = "./",
|
||||
n_workers: int = 32,
|
||||
timeout: float = 10.0,
|
||||
problem_file: str = "../data/humaneval_python.jsonl.gz",
|
||||
out_dir: str = None,
|
||||
k: List[int] = [1, 10, 100],
|
||||
test_groundtruth: bool = False,
|
||||
example_test: bool = False,
|
||||
is_mbpp: bool = False,
|
||||
language: str = "python",
|
||||
):
|
||||
"""
|
||||
Evaluates the functional correctness of a model.
|
||||
"""
|
||||
if example_test:
|
||||
print("Example test...")
|
||||
|
||||
problems = read_dataset(problem_file,
|
||||
dataset_type="humaneval")
|
||||
sample_jsonl = stream_jsonl_all(input_file)
|
||||
|
||||
|
||||
with ThreadPoolExecutor(max_workers=n_workers) as executor:
|
||||
|
||||
futures = []
|
||||
completion_id = Counter()
|
||||
n_samples = 0
|
||||
results = defaultdict(list)
|
||||
|
||||
if test_groundtruth:
|
||||
print("Testing ground truth...")
|
||||
for sample in tqdm(problems.values()):
|
||||
task_id = sample["task_id"]
|
||||
lang = task_id.split("/")[0].lower()
|
||||
if lang == "javascript":
|
||||
lang = "js"
|
||||
tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
|
||||
sample["generation"] = sample["canonical_solution"]
|
||||
sample["test_code"] = process_humaneval_test(sample, problems, example_test, language)
|
||||
if sample["test_code"] is None:
|
||||
continue
|
||||
args = (task_id, sample, lang, timeout, tmp_dir_, completion_id[task_id])
|
||||
future = executor.submit(check_correctness, *args)
|
||||
futures.append(future)
|
||||
completion_id[task_id] += 1
|
||||
n_samples += 1
|
||||
else:
|
||||
print("Reading samples...")
|
||||
for sample in tqdm(sample_jsonl):
|
||||
task_id = sample["task_id"]
|
||||
if not is_mbpp:
|
||||
lang = language
|
||||
if not is_mbpp and lang == "javascript":
|
||||
lang = "js"
|
||||
if is_mbpp:
|
||||
lang = "python"
|
||||
tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
|
||||
sample["task_id"] = task_id
|
||||
sample["test_code"] = process_humaneval_test(sample, problems, example_test, is_mbpp, language)
|
||||
if sample["test_code"] is None:
|
||||
continue
|
||||
if "completion_id" in sample:
|
||||
completion_id_ = sample["completion_id"]
|
||||
else:
|
||||
completion_id_ = completion_id[task_id]
|
||||
args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
|
||||
future = executor.submit(check_correctness, *args)
|
||||
futures.append(future)
|
||||
completion_id[task_id] += 1
|
||||
n_samples += 1
|
||||
|
||||
if len(completion_id) == len(problems):
|
||||
evaluate_pass_at_k = True
|
||||
else:
|
||||
evaluate_pass_at_k = False
|
||||
|
||||
print("Running test suites...")
|
||||
for future in tqdm(as_completed(futures), total=len(futures)):
|
||||
result = future.result()
|
||||
results[result["task_id"]].append((result["completion_id"], result))
|
||||
|
||||
# Calculate pass@k.
|
||||
total, correct = [], []
|
||||
for result in results.values():
|
||||
passed = [r[1]["passed"] for r in result]
|
||||
total.append(len(passed))
|
||||
correct.append(sum(passed))
|
||||
total = np.array(total)
|
||||
correct = np.array(correct)
|
||||
if evaluate_pass_at_k:
|
||||
ks = k
|
||||
pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
|
||||
for k in ks if (total >= k).all()}
|
||||
print(pass_at_k)
|
||||
else:
|
||||
print("Total:", np.sum(total))
|
||||
print("Correct:", np.sum(correct))
|
||||
return pass_at_k
|
||||
731
Evaluation/MBPP/human_eval/execution.py
Normal file
731
Evaluation/MBPP/human_eval/execution.py
Normal file
@@ -0,0 +1,731 @@
|
||||
import contextlib
|
||||
import faulthandler
|
||||
import io
|
||||
import multiprocessing
|
||||
import os
|
||||
import platform
|
||||
import signal
|
||||
import random
|
||||
import subprocess
|
||||
import tempfile
|
||||
import gzip
|
||||
import json
|
||||
from typing import *
|
||||
import traceback
|
||||
|
||||
java_exec = ""
|
||||
node_exec = ""
|
||||
tsc_exec = ""
|
||||
go_exec = ""
|
||||
php_exec = ""
|
||||
cs_exec = ""
|
||||
|
||||
def check_correctness(
|
||||
task_id: str,
|
||||
sample: dict,
|
||||
language_type: str,
|
||||
timeout: float = 3.0,
|
||||
tmp_dir: str = None,
|
||||
completion_id: Optional[int] = None,
|
||||
) -> Dict:
|
||||
"""
|
||||
Evaluates the functional correctness of a completion by running the test
|
||||
suite provided in the problem.
|
||||
"""
|
||||
|
||||
def unsafe_execute(tmp_dir):
|
||||
random_id = random.randint(1, 100000)
|
||||
if "python" in language_type.lower():
|
||||
with create_tempdir():
|
||||
|
||||
# These system calls are needed when cleaning up tempdir.
|
||||
import os
|
||||
import shutil
|
||||
rmtree = shutil.rmtree
|
||||
rmdir = os.rmdir
|
||||
chdir = os.chdir
|
||||
|
||||
# Disable functionalities that can make destructive changes to the test.
|
||||
reliability_guard()
|
||||
|
||||
try:
|
||||
exec_globals = {}
|
||||
with swallow_io():
|
||||
with time_limit(timeout):
|
||||
# WARNING
|
||||
# This program exists to execute untrusted model-generated code. Although
|
||||
# it is highly unlikely that model-generated code will do something overtly
|
||||
# malicious in response to this test suite, model-generated code may act
|
||||
# destructively due to a lack of model capability or alignment.
|
||||
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
||||
# does not perform destructive actions on their host or network.
|
||||
# Once you have read this disclaimer and taken appropriate precautions,
|
||||
# uncomment the following line and proceed at your own risk:
|
||||
exec(sample["test_code"], exec_globals)
|
||||
result.append("passed")
|
||||
except TimeoutException:
|
||||
result.append("timed out")
|
||||
except AssertionError as e:
|
||||
result.append(f"failed: AssertionError")
|
||||
except BaseException as e:
|
||||
result.append(f"failed: {e}")
|
||||
#print(sample["test_code"])
|
||||
#print(result)
|
||||
# Needed for cleaning up.
|
||||
shutil.rmtree = rmtree
|
||||
os.rmdir = rmdir
|
||||
os.chdir = chdir
|
||||
|
||||
elif "go" in language_type.lower():
|
||||
assert tmp_dir is not None, "Go should be evaluated in a dir where necessary module files installed."
|
||||
|
||||
import os
|
||||
import shutil
|
||||
|
||||
if "tmp" not in tmp_dir:
|
||||
tmp_dir = os.path.join(tmp_dir, "tmp")
|
||||
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.makedirs(tmp_dir)
|
||||
origin_path = os.getcwd()
|
||||
os.chdir(tmp_dir)
|
||||
open(f"main_test.go", 'w').write(sample["test_code"])
|
||||
try:
|
||||
exec_result = None
|
||||
with time_limit(timeout):
|
||||
# WARNING
|
||||
# This program exists to execute untrusted model-generated code. Although
|
||||
# it is highly unlikely that model-generated code will do something overtly
|
||||
# malicious in response to this test suite, model-generated code may act
|
||||
# destructively due to a lack of model capability or alignment.
|
||||
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
||||
# does not perform destructive actions on their host or network.
|
||||
# Once you have read this disclaimer and taken appropriate precautions,
|
||||
# uncomment the following line and proceed at your own risk:
|
||||
exec_result = subprocess.run([f"{go_exec}go", "test", f"-timeout={timeout}s", "main_test.go"], timeout=timeout, capture_output=True)
|
||||
|
||||
if exec_result.returncode == 0:
|
||||
result.append("passed")
|
||||
else:
|
||||
if exec_result.stderr:
|
||||
try:
|
||||
err = exec_result.stderr.decode()
|
||||
except:
|
||||
err = exec_result.stderr
|
||||
else:
|
||||
try:
|
||||
err = exec_result.stdout.decode()
|
||||
except:
|
||||
err = exec_result.stdout
|
||||
result.append(f"failed: {err}")
|
||||
|
||||
except TimeoutException:
|
||||
result.append("timed out")
|
||||
os.chdir(origin_path)
|
||||
shutil.rmtree(tmp_dir)
|
||||
elif "js" in language_type.lower():
|
||||
import os
|
||||
import shutil
|
||||
|
||||
if "tmp" not in tmp_dir:
|
||||
tmp_dir = os.path.join(tmp_dir, "tmp")
|
||||
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.makedirs(tmp_dir)
|
||||
origin_path = os.getcwd()
|
||||
os.chdir(tmp_dir)
|
||||
open(f"test.js", 'w').write(sample["test_code"])
|
||||
try:
|
||||
exec_result = None
|
||||
with time_limit(timeout):
|
||||
# WARNING
|
||||
# This program exists to execute untrusted model-generated code. Although
|
||||
# it is highly unlikely that model-generated code will do something overtly
|
||||
# malicious in response to this test suite, model-generated code may act
|
||||
# destructively due to a lack of model capability or alignment.
|
||||
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
||||
# does not perform destructive actions on their host or network.
|
||||
# Once you have read this disclaimer and taken appropriate precautions,
|
||||
# uncomment the following line and proceed at your own risk:
|
||||
exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
|
||||
|
||||
if exec_result.stderr.decode():
|
||||
err = exec_result.stderr.decode()
|
||||
result.append(f"failed: {err}")
|
||||
elif exec_result.stdout.decode():
|
||||
err = exec_result.stdout.decode()
|
||||
result.append(f"failed: {err}")
|
||||
else:
|
||||
result.append("passed")
|
||||
|
||||
except TimeoutException:
|
||||
result.append("timed out")
|
||||
os.chdir(origin_path)
|
||||
shutil.rmtree(tmp_dir)
|
||||
elif "cpp" in language_type.lower():
|
||||
import os
|
||||
import shutil
|
||||
origin_path = os.getcwd()
|
||||
if "tmp" not in tmp_dir:
|
||||
tmp_dir = os.path.join(tmp_dir, "tmp")
|
||||
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.makedirs(tmp_dir)
|
||||
|
||||
os.chdir(tmp_dir)
|
||||
open(f"test.cpp", 'w').write(sample["test_code"])
|
||||
if "162" in task_id:
|
||||
compilation_result = subprocess.run(["/usr/bin/g++", "-std=c++17", "test.cpp", "-lcrypto", "-lssl"],
|
||||
timeout=timeout,
|
||||
capture_output=True)
|
||||
else:
|
||||
compilation_result = subprocess.run(["/usr/bin/g++", "-std=c++17", "test.cpp"], timeout=timeout,
|
||||
capture_output=True)
|
||||
if compilation_result.returncode != 0:
|
||||
if compilation_result.stderr:
|
||||
err = compilation_result.stderr.decode()
|
||||
else:
|
||||
err = compilation_result.stdout.decode()
|
||||
result.append(f"failed: compilation error: {err}")
|
||||
else:
|
||||
try:
|
||||
exec_result = None
|
||||
with time_limit(timeout):
|
||||
# WARNING
|
||||
# This program exists to execute untrusted model-generated code. Although
|
||||
# it is highly unlikely that model-generated code will do something overtly
|
||||
# malicious in response to this test suite, model-generated code may act
|
||||
# destructively due to a lack of model capability or alignment.
|
||||
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
||||
# does not perform destructive actions on their host or network.
|
||||
# Once you have read this disclaimer and taken appropriate precautions,
|
||||
# uncomment the following line and proceed at your own risk:
|
||||
exec_result = subprocess.run(["./a.out"], timeout=timeout, capture_output=True)
|
||||
|
||||
if exec_result.returncode == 0:
|
||||
result.append("passed")
|
||||
else:
|
||||
if exec_result.stderr:
|
||||
try:
|
||||
err = exec_result.stderr.decode()
|
||||
except:
|
||||
err = exec_result.stderr
|
||||
else:
|
||||
try:
|
||||
err = exec_result.stdout.decode()
|
||||
except:
|
||||
err = exec_result.stdout
|
||||
result.append(f"failed: {err}")
|
||||
except TimeoutException:
|
||||
result.append("timed out")
|
||||
#print(result[-1])
|
||||
#print(sample["test_code"])
|
||||
os.chdir(origin_path)
|
||||
shutil.rmtree(tmp_dir)
|
||||
elif "php" in language_type.lower():
|
||||
import os
|
||||
import shutil
|
||||
origin_path = os.getcwd()
|
||||
if "tmp" not in tmp_dir:
|
||||
tmp_dir = os.path.join(tmp_dir, "tmp")
|
||||
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.makedirs(tmp_dir)
|
||||
|
||||
os.chdir(tmp_dir)
|
||||
open(f"test.php", 'w').write(sample["test_code"])
|
||||
try:
|
||||
exec_result = None
|
||||
with time_limit(timeout):
|
||||
cmd = f"{php_exec}php -f test.php"
|
||||
exec_result = subprocess.run(cmd, timeout=timeout, capture_output=True, shell=True)
|
||||
|
||||
if exec_result.returncode == 0:
|
||||
result.append("passed")
|
||||
else:
|
||||
if exec_result.stderr:
|
||||
try:
|
||||
err = exec_result.stderr.decode()
|
||||
except:
|
||||
err = exec_result.stderr
|
||||
else:
|
||||
try:
|
||||
err = exec_result.stdout.decode()
|
||||
except:
|
||||
err = exec_result.stdout
|
||||
result.append(f"failed: {err}")
|
||||
except TimeoutException:
|
||||
result.append("timed out")
|
||||
print(result[-1])
|
||||
print(sample["test_code"])
|
||||
os.chdir(origin_path)
|
||||
shutil.rmtree(tmp_dir)
|
||||
elif "sh" in language_type.lower():
|
||||
import os
|
||||
import shutil
|
||||
origin_path = os.getcwd()
|
||||
if "tmp" not in tmp_dir:
|
||||
tmp_dir = os.path.join(tmp_dir, "tmp")
|
||||
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.makedirs(tmp_dir)
|
||||
|
||||
os.chdir(tmp_dir)
|
||||
open(f"test.sh", 'w').write(sample["test_code"])
|
||||
try:
|
||||
exec_result = None
|
||||
with time_limit(timeout):
|
||||
cmd = "/bin/bash test.sh"
|
||||
exec_result = subprocess.run(cmd, timeout=10, capture_output=True, shell=True)
|
||||
|
||||
if exec_result.returncode == 0:
|
||||
result.append("passed")
|
||||
else:
|
||||
if exec_result.stderr:
|
||||
try:
|
||||
err = exec_result.stderr.decode()
|
||||
except:
|
||||
err = exec_result.stderr
|
||||
else:
|
||||
try:
|
||||
err = exec_result.stdout.decode()
|
||||
except:
|
||||
err = exec_result.stdout
|
||||
result.append(f"failed: {err}")
|
||||
except TimeoutException:
|
||||
result.append("timed out")
|
||||
#print(result[-1])
|
||||
#print(sample["test_code"])
|
||||
os.chdir(origin_path)
|
||||
shutil.rmtree(tmp_dir)
|
||||
elif "ts" in language_type.lower():
|
||||
import os
|
||||
import shutil
|
||||
origin_path = os.getcwd()
|
||||
if "tmp" not in tmp_dir:
|
||||
tmp_dir = os.path.join(tmp_dir, "tmp")
|
||||
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.makedirs(tmp_dir)
|
||||
|
||||
os.chdir(tmp_dir)
|
||||
env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
|
||||
open(f"test.ts", 'w').write(sample["test_code"])
|
||||
cmd = f"{tsc_exec}tsc test.ts --target ES2015 --lib ES2015,DOM"
|
||||
compilation_result = subprocess.run(cmd, timeout=timeout, capture_output=True, env=env, shell=True)
|
||||
if compilation_result.returncode != 0:
|
||||
if compilation_result.stderr:
|
||||
err = compilation_result.stderr.decode()
|
||||
else:
|
||||
err = compilation_result.stdout.decode()
|
||||
result.append(f"failed: compilation error: {err}")
|
||||
else:
|
||||
try:
|
||||
exec_result = None
|
||||
with time_limit(timeout):
|
||||
exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
|
||||
|
||||
if exec_result.returncode == 0:
|
||||
result.append("passed")
|
||||
else:
|
||||
if exec_result.stderr:
|
||||
try:
|
||||
err = exec_result.stderr.decode()
|
||||
except:
|
||||
err = exec_result.stderr
|
||||
else:
|
||||
try:
|
||||
err = exec_result.stdout.decode()
|
||||
except:
|
||||
err = exec_result.stdout
|
||||
result.append(f"failed: {err}")
|
||||
except TimeoutException:
|
||||
result.append("timed out")
|
||||
if result[-1] != "passed":
|
||||
env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
|
||||
cmd = f"{tsc_exec}tsc test.ts"
|
||||
compilation_result = subprocess.run(cmd, timeout=timeout, capture_output=True, env=env, shell=True)
|
||||
if compilation_result.returncode != 0:
|
||||
if compilation_result.stderr:
|
||||
err = compilation_result.stderr.decode()
|
||||
else:
|
||||
err = compilation_result.stdout.decode()
|
||||
result[-1] = f"failed: compilation error: {err}"
|
||||
else:
|
||||
try:
|
||||
exec_result = None
|
||||
with time_limit(timeout):
|
||||
exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
|
||||
|
||||
if exec_result.returncode == 0:
|
||||
result[-1] = "passed"
|
||||
else:
|
||||
if exec_result.stderr:
|
||||
try:
|
||||
err = exec_result.stderr.decode()
|
||||
except:
|
||||
err = exec_result.stderr
|
||||
else:
|
||||
try:
|
||||
err = exec_result.stdout.decode()
|
||||
except:
|
||||
err = exec_result.stdout
|
||||
result[-1] = f"failed: {err}"
|
||||
except TimeoutException:
|
||||
result[-1] = "timed out"
|
||||
|
||||
os.chdir(origin_path)
|
||||
shutil.rmtree(tmp_dir)
|
||||
elif "cs" in language_type.lower():
|
||||
import os
|
||||
import shutil
|
||||
origin_path = os.getcwd()
|
||||
if "tmp" not in tmp_dir:
|
||||
tmp_dir = os.path.join(tmp_dir, "tmp")
|
||||
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.makedirs(tmp_dir)
|
||||
os.chdir(tmp_dir)
|
||||
open(f"Program.cs", 'w').write(sample["test_code"])
|
||||
cmd = f"{cs_exec}mcs -d:DEBUG Program.cs"
|
||||
compilation_result = subprocess.run(cmd, shell=True, capture_output=True)
|
||||
if compilation_result.returncode != 0:
|
||||
if compilation_result.stderr:
|
||||
err = compilation_result.stderr.decode()
|
||||
else:
|
||||
err = compilation_result.stdout.decode()
|
||||
result.append(f"failed: compilation error: {err}")
|
||||
else:
|
||||
try:
|
||||
exec_result = None
|
||||
cmd = f"{cs_exec}mono Program.exe"
|
||||
env = dict(MONO_TRACE_LISTENER="Console.Error")
|
||||
with time_limit(timeout):
|
||||
exec_result = subprocess.run(cmd, timeout=timeout, shell=True, capture_output=True, env=env)
|
||||
|
||||
if "Fail" not in exec_result.stderr.decode():
|
||||
result.append("passed")
|
||||
else:
|
||||
if exec_result.stderr:
|
||||
try:
|
||||
err = exec_result.stderr.decode()
|
||||
except:
|
||||
err = exec_result.stderr
|
||||
else:
|
||||
try:
|
||||
err = exec_result.stdout.decode()
|
||||
except:
|
||||
err = exec_result.stdout
|
||||
result.append(f"failed: {err}")
|
||||
except TimeoutException:
|
||||
result.append("timed out")
|
||||
except Exception as e:
|
||||
result.append(f"failed: {e}")
|
||||
os.chdir(origin_path)
|
||||
shutil.rmtree(tmp_dir)
|
||||
elif "rust" in language_type.lower():
|
||||
import os
|
||||
|
||||
WD: str = os.path.dirname(os.path.abspath(__file__))
|
||||
RUST_DIR: str = os.path.join(WD, "rust")
|
||||
RUST_SRC: str = os.path.join(RUST_DIR, "src")
|
||||
RUST_BIN: str = os.path.join(RUST_SRC, "bin")
|
||||
RUST_TMP_DIR: str = os.path.join(RUST_DIR, "tmp")
|
||||
RUST_LOGS: str = os.path.join(RUST_TMP_DIR, "logs")
|
||||
RUST_EXT: str = ".rs"
|
||||
|
||||
# Create mandatory tmp directories
|
||||
os.makedirs(RUST_TMP_DIR, exist_ok=True)
|
||||
os.makedirs(RUST_LOGS, exist_ok=True)
|
||||
os.makedirs(RUST_SRC, exist_ok=True)
|
||||
os.makedirs(RUST_BIN, exist_ok=True)
|
||||
|
||||
with tempfile.NamedTemporaryFile(dir = RUST_BIN, delete=False) as f:
|
||||
#temporal file name
|
||||
file_prefix = sample["task_id"].lower().replace("/", "_")
|
||||
file_name:str = file_prefix +RUST_EXT
|
||||
|
||||
os.rename(f.name, os.path.join(RUST_BIN, file_name))
|
||||
|
||||
# Sample to pure Rust function
|
||||
rust_code: str = sample["test_code"]
|
||||
|
||||
# dump the rust source code in the target temporal file
|
||||
f.write(rust_code.encode('utf-8'))
|
||||
|
||||
# Proceed towards Rust binaries compilation. Therefore move to Rust module root dir.
|
||||
os.chdir(RUST_DIR)
|
||||
|
||||
# Two possible outcomes
|
||||
# Pass OR Fail compilation
|
||||
log_filename: str = file_prefix + ".jsonl"
|
||||
log_path: str = os.path.join(RUST_LOGS, log_filename)
|
||||
cargo_check: str = "cargo check --bin " + file_prefix + " --message-format json >> " + log_path
|
||||
# Compilation build status
|
||||
returned_val_compilation: int
|
||||
|
||||
# Overwrite file content
|
||||
if os.path.exists(log_path):
|
||||
if(file_size := os.path.getsize(log_path)) >= 0:
|
||||
os.remove(log_path)
|
||||
returned_val_compilation = os.system(cargo_check)
|
||||
|
||||
else:
|
||||
returned_val_compilation = os.system(cargo_check)
|
||||
|
||||
# 0 means success
|
||||
if returned_val_compilation == 0:
|
||||
|
||||
#Execution pipeline
|
||||
cargo_test: str = "cargo test --bin " +file_prefix+ " --message-format json >> " + log_path
|
||||
returned_val_execution = os.system(cargo_test)
|
||||
|
||||
if returned_val_execution == 0:
|
||||
result.append("passed")
|
||||
else:
|
||||
result.append(f"failed: execution error")
|
||||
|
||||
else:
|
||||
result.append(f"failed: compilation error")
|
||||
|
||||
|
||||
elif "java" in language_type.lower():
|
||||
assert tmp_dir is not None, "Java should be evaluated in a temporary dir."
|
||||
|
||||
import os
|
||||
import shutil
|
||||
|
||||
if "tmp" not in tmp_dir:
|
||||
tmp_dir = os.path.join(tmp_dir, "tmp")
|
||||
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
||||
if not os.path.exists(tmp_dir):
|
||||
os.makedirs(tmp_dir)
|
||||
open(os.path.join(tmp_dir, "Problem.java"), 'w').write(sample["test_code"])
|
||||
origin_path = os.getcwd()
|
||||
os.system(f"cp ./javatuples-1.2.jar {tmp_dir}/")
|
||||
os.chdir(tmp_dir)
|
||||
res = "failed: unknown error"
|
||||
compile_returncode = -1
|
||||
for _ in range(5):
|
||||
try:
|
||||
cmd = f"{java_exec}javac -cp javatuples-1.2.jar Problem.java"
|
||||
compilation_result = subprocess.run(cmd, timeout=60, capture_output=True, shell=True)
|
||||
compile_returncode = compilation_result.returncode
|
||||
break
|
||||
except subprocess.TimeoutExpired as e:
|
||||
continue
|
||||
if compile_returncode != 0:
|
||||
res = "failed: compilation error"
|
||||
else:
|
||||
exec_result = None
|
||||
try:
|
||||
# WARNING
|
||||
# This program exists to execute untrusted model-generated code. Although
|
||||
# it is highly unlikely that model-generated code will do something overtly
|
||||
# malicious in response to this test suite, model-generated code may act
|
||||
# destructively due to a lack of model capability or alignment.
|
||||
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
||||
# does not perform destructive actions on their host or network.
|
||||
# Once you have read this disclaimer and taken appropriate precautions,
|
||||
# uncomment the following line and proceed at your own risk:
|
||||
cmd = f"{java_exec}java -ea -cp .:javatuples-1.2.jar Problem"
|
||||
exec_result = subprocess.run(cmd, timeout=timeout, capture_output=True, shell=True)
|
||||
if exec_result.returncode == 0:
|
||||
res = "passed"
|
||||
elif exec_result.returncode == 1:
|
||||
if "AssertionError" in exec_result.stderr.decode('unicode-escape'):
|
||||
res = "failed: wrong answer"
|
||||
else:
|
||||
res = f"failed: {exec_result.stderr.decode()}"
|
||||
except subprocess.TimeoutExpired as e:
|
||||
res = "time out"
|
||||
except BaseException as e:
|
||||
res = f"failed: {e}"
|
||||
|
||||
result.append(res)
|
||||
os.chdir(origin_path)
|
||||
shutil.rmtree(tmp_dir)
|
||||
|
||||
manager = multiprocessing.Manager()
|
||||
result = manager.list()
|
||||
|
||||
p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,))
|
||||
p.start()
|
||||
p.join(timeout=timeout + 1)
|
||||
if p.is_alive():
|
||||
p.kill()
|
||||
|
||||
if not result:
|
||||
result.append("timed out")
|
||||
|
||||
return {
|
||||
"task_id" : task_id,
|
||||
"completion_id": completion_id,
|
||||
"result" : result[0],
|
||||
"passed" : result[0] == "passed",
|
||||
"finish" : -1 if "finish" not in sample else sample["finish"],
|
||||
"code" : sample["test_code"]
|
||||
}
|
||||
|
||||
# Copyright (c) OpenAI (https://openai.com)
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
# of this software and associated documentation files (the "Software"), to deal
|
||||
# in the Software without restriction, including without limitation the rights
|
||||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
# copies of the Software, and to permit persons to whom the Software is
|
||||
# furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
# THE SOFTWARE.
|
||||
# ============================================================================
|
||||
@contextlib.contextmanager
|
||||
def time_limit(seconds: float):
|
||||
def signal_handler(signum, frame):
|
||||
raise TimeoutException("Timed out!")
|
||||
|
||||
signal.setitimer(signal.ITIMER_REAL, seconds)
|
||||
signal.signal(signal.SIGALRM, signal_handler)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
signal.setitimer(signal.ITIMER_REAL, 0)
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def swallow_io():
|
||||
stream = WriteOnlyStringIO()
|
||||
with contextlib.redirect_stdout(stream):
|
||||
with contextlib.redirect_stderr(stream):
|
||||
with redirect_stdin(stream):
|
||||
yield
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def create_tempdir():
|
||||
with tempfile.TemporaryDirectory() as dirname:
|
||||
with chdir(dirname):
|
||||
yield dirname
|
||||
|
||||
|
||||
class TimeoutException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class WriteOnlyStringIO(io.StringIO):
|
||||
""" StringIO that throws an exception when it's read from """
|
||||
|
||||
def read(self, *args, **kwargs):
|
||||
raise IOError
|
||||
|
||||
def readline(self, *args, **kwargs):
|
||||
raise IOError
|
||||
|
||||
def readlines(self, *args, **kwargs):
|
||||
raise IOError
|
||||
|
||||
def readable(self, *args, **kwargs):
|
||||
""" Returns True if the IO object can be read. """
|
||||
return False
|
||||
|
||||
|
||||
class redirect_stdin(contextlib._RedirectStream): # type: ignore
|
||||
_stream = 'stdin'
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def chdir(root):
|
||||
if root == ".":
|
||||
yield
|
||||
return
|
||||
cwd = os.getcwd()
|
||||
os.chdir(root)
|
||||
try:
|
||||
yield
|
||||
except BaseException as exc:
|
||||
raise exc
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
|
||||
|
||||
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
|
||||
"""
|
||||
This disables various destructive functions and prevents the generated code
|
||||
from interfering with the test (e.g. fork bomb, killing other processes,
|
||||
removing filesystem files, etc.)
|
||||
|
||||
WARNING
|
||||
This function is NOT a security sandbox. Untrusted code, including, model-
|
||||
generated code, should not be blindly executed outside of one. See the
|
||||
Codex paper for more information about OpenAI's code sandbox, and proceed
|
||||
with caution.
|
||||
"""
|
||||
|
||||
if maximum_memory_bytes is not None:
|
||||
import resource
|
||||
resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
|
||||
resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
|
||||
if not platform.uname().system == 'Darwin':
|
||||
resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
|
||||
|
||||
faulthandler.disable()
|
||||
|
||||
import builtins
|
||||
builtins.exit = None
|
||||
builtins.quit = None
|
||||
|
||||
import os
|
||||
os.environ['OMP_NUM_THREADS'] = '1'
|
||||
|
||||
os.kill = None
|
||||
os.system = None
|
||||
os.putenv = None
|
||||
os.remove = None
|
||||
os.removedirs = None
|
||||
os.rmdir = None
|
||||
os.fchdir = None
|
||||
os.setuid = None
|
||||
os.fork = None
|
||||
os.forkpty = None
|
||||
os.killpg = None
|
||||
os.rename = None
|
||||
os.renames = None
|
||||
os.truncate = None
|
||||
os.replace = None
|
||||
os.unlink = None
|
||||
os.fchmod = None
|
||||
os.fchown = None
|
||||
os.chmod = None
|
||||
os.chown = None
|
||||
os.chroot = None
|
||||
os.fchdir = None
|
||||
os.lchflags = None
|
||||
os.lchmod = None
|
||||
os.lchown = None
|
||||
os.getcwd = None
|
||||
os.chdir = None
|
||||
|
||||
import shutil
|
||||
shutil.rmtree = None
|
||||
shutil.move = None
|
||||
shutil.chown = None
|
||||
|
||||
import subprocess
|
||||
subprocess.Popen = None # type: ignore
|
||||
|
||||
__builtins__['help'] = None
|
||||
|
||||
import sys
|
||||
sys.modules['ipdb'] = None
|
||||
sys.modules['joblib'] = None
|
||||
sys.modules['resource'] = None
|
||||
sys.modules['psutil'] = None
|
||||
sys.modules['tkinter'] = None
|
||||
Reference in New Issue
Block a user