DeepSeek-Coder/Evaluation/MBPP/human_eval/execution.py

731 lines
30 KiB
Python
Raw Normal View History

2023-11-02 14:07:09 +00:00
import contextlib
import faulthandler
import io
import multiprocessing
import os
import platform
import signal
import random
import subprocess
import tempfile
import gzip
import json
from typing import *
import traceback
java_exec = ""
node_exec = ""
tsc_exec = ""
go_exec = ""
php_exec = ""
cs_exec = ""
def check_correctness(
task_id: str,
sample: dict,
language_type: str,
timeout: float = 3.0,
tmp_dir: str = None,
completion_id: Optional[int] = None,
) -> Dict:
"""
Evaluates the functional correctness of a completion by running the test
suite provided in the problem.
"""
def unsafe_execute(tmp_dir):
random_id = random.randint(1, 100000)
if "python" in language_type.lower():
with create_tempdir():
# These system calls are needed when cleaning up tempdir.
import os
import shutil
rmtree = shutil.rmtree
rmdir = os.rmdir
chdir = os.chdir
# Disable functionalities that can make destructive changes to the test.
reliability_guard()
try:
exec_globals = {}
with swallow_io():
with time_limit(timeout):
# WARNING
# This program exists to execute untrusted model-generated code. Although
# it is highly unlikely that model-generated code will do something overtly
# malicious in response to this test suite, model-generated code may act
# destructively due to a lack of model capability or alignment.
# Users are strongly encouraged to sandbox this evaluation suite so that it
# does not perform destructive actions on their host or network.
# Once you have read this disclaimer and taken appropriate precautions,
# uncomment the following line and proceed at your own risk:
exec(sample["test_code"], exec_globals)
result.append("passed")
except TimeoutException:
result.append("timed out")
except AssertionError as e:
result.append(f"failed: AssertionError")
except BaseException as e:
result.append(f"failed: {e}")
#print(sample["test_code"])
#print(result)
# Needed for cleaning up.
shutil.rmtree = rmtree
os.rmdir = rmdir
os.chdir = chdir
elif "go" in language_type.lower():
assert tmp_dir is not None, "Go should be evaluated in a dir where necessary module files installed."
import os
import shutil
if "tmp" not in tmp_dir:
tmp_dir = os.path.join(tmp_dir, "tmp")
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
origin_path = os.getcwd()
os.chdir(tmp_dir)
open(f"main_test.go", 'w').write(sample["test_code"])
try:
exec_result = None
with time_limit(timeout):
# WARNING
# This program exists to execute untrusted model-generated code. Although
# it is highly unlikely that model-generated code will do something overtly
# malicious in response to this test suite, model-generated code may act
# destructively due to a lack of model capability or alignment.
# Users are strongly encouraged to sandbox this evaluation suite so that it
# does not perform destructive actions on their host or network.
# Once you have read this disclaimer and taken appropriate precautions,
# uncomment the following line and proceed at your own risk:
exec_result = subprocess.run([f"{go_exec}go", "test", f"-timeout={timeout}s", "main_test.go"], timeout=timeout, capture_output=True)
if exec_result.returncode == 0:
result.append("passed")
else:
if exec_result.stderr:
try:
err = exec_result.stderr.decode()
except:
err = exec_result.stderr
else:
try:
err = exec_result.stdout.decode()
except:
err = exec_result.stdout
result.append(f"failed: {err}")
except TimeoutException:
result.append("timed out")
os.chdir(origin_path)
shutil.rmtree(tmp_dir)
elif "js" in language_type.lower():
import os
import shutil
if "tmp" not in tmp_dir:
tmp_dir = os.path.join(tmp_dir, "tmp")
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
origin_path = os.getcwd()
os.chdir(tmp_dir)
open(f"test.js", 'w').write(sample["test_code"])
try:
exec_result = None
with time_limit(timeout):
# WARNING
# This program exists to execute untrusted model-generated code. Although
# it is highly unlikely that model-generated code will do something overtly
# malicious in response to this test suite, model-generated code may act
# destructively due to a lack of model capability or alignment.
# Users are strongly encouraged to sandbox this evaluation suite so that it
# does not perform destructive actions on their host or network.
# Once you have read this disclaimer and taken appropriate precautions,
# uncomment the following line and proceed at your own risk:
exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
if exec_result.stderr.decode():
err = exec_result.stderr.decode()
result.append(f"failed: {err}")
elif exec_result.stdout.decode():
err = exec_result.stdout.decode()
result.append(f"failed: {err}")
else:
result.append("passed")
except TimeoutException:
result.append("timed out")
os.chdir(origin_path)
shutil.rmtree(tmp_dir)
elif "cpp" in language_type.lower():
import os
import shutil
origin_path = os.getcwd()
if "tmp" not in tmp_dir:
tmp_dir = os.path.join(tmp_dir, "tmp")
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
os.chdir(tmp_dir)
open(f"test.cpp", 'w').write(sample["test_code"])
if "162" in task_id:
compilation_result = subprocess.run(["/usr/bin/g++", "-std=c++17", "test.cpp", "-lcrypto", "-lssl"],
timeout=timeout,
capture_output=True)
else:
compilation_result = subprocess.run(["/usr/bin/g++", "-std=c++17", "test.cpp"], timeout=timeout,
capture_output=True)
if compilation_result.returncode != 0:
if compilation_result.stderr:
err = compilation_result.stderr.decode()
else:
err = compilation_result.stdout.decode()
result.append(f"failed: compilation error: {err}")
else:
try:
exec_result = None
with time_limit(timeout):
# WARNING
# This program exists to execute untrusted model-generated code. Although
# it is highly unlikely that model-generated code will do something overtly
# malicious in response to this test suite, model-generated code may act
# destructively due to a lack of model capability or alignment.
# Users are strongly encouraged to sandbox this evaluation suite so that it
# does not perform destructive actions on their host or network.
# Once you have read this disclaimer and taken appropriate precautions,
# uncomment the following line and proceed at your own risk:
exec_result = subprocess.run(["./a.out"], timeout=timeout, capture_output=True)
if exec_result.returncode == 0:
result.append("passed")
else:
if exec_result.stderr:
try:
err = exec_result.stderr.decode()
except:
err = exec_result.stderr
else:
try:
err = exec_result.stdout.decode()
except:
err = exec_result.stdout
result.append(f"failed: {err}")
except TimeoutException:
result.append("timed out")
#print(result[-1])
#print(sample["test_code"])
os.chdir(origin_path)
shutil.rmtree(tmp_dir)
elif "php" in language_type.lower():
import os
import shutil
origin_path = os.getcwd()
if "tmp" not in tmp_dir:
tmp_dir = os.path.join(tmp_dir, "tmp")
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
os.chdir(tmp_dir)
open(f"test.php", 'w').write(sample["test_code"])
try:
exec_result = None
with time_limit(timeout):
cmd = f"{php_exec}php -f test.php"
exec_result = subprocess.run(cmd, timeout=timeout, capture_output=True, shell=True)
if exec_result.returncode == 0:
result.append("passed")
else:
if exec_result.stderr:
try:
err = exec_result.stderr.decode()
except:
err = exec_result.stderr
else:
try:
err = exec_result.stdout.decode()
except:
err = exec_result.stdout
result.append(f"failed: {err}")
except TimeoutException:
result.append("timed out")
print(result[-1])
print(sample["test_code"])
os.chdir(origin_path)
shutil.rmtree(tmp_dir)
elif "sh" in language_type.lower():
import os
import shutil
origin_path = os.getcwd()
if "tmp" not in tmp_dir:
tmp_dir = os.path.join(tmp_dir, "tmp")
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
os.chdir(tmp_dir)
open(f"test.sh", 'w').write(sample["test_code"])
try:
exec_result = None
with time_limit(timeout):
cmd = "/bin/bash test.sh"
exec_result = subprocess.run(cmd, timeout=10, capture_output=True, shell=True)
if exec_result.returncode == 0:
result.append("passed")
else:
if exec_result.stderr:
try:
err = exec_result.stderr.decode()
except:
err = exec_result.stderr
else:
try:
err = exec_result.stdout.decode()
except:
err = exec_result.stdout
result.append(f"failed: {err}")
except TimeoutException:
result.append("timed out")
#print(result[-1])
#print(sample["test_code"])
os.chdir(origin_path)
shutil.rmtree(tmp_dir)
elif "ts" in language_type.lower():
import os
import shutil
origin_path = os.getcwd()
if "tmp" not in tmp_dir:
tmp_dir = os.path.join(tmp_dir, "tmp")
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
os.chdir(tmp_dir)
env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
open(f"test.ts", 'w').write(sample["test_code"])
cmd = f"{tsc_exec}tsc test.ts --target ES2015 --lib ES2015,DOM"
compilation_result = subprocess.run(cmd, timeout=timeout, capture_output=True, env=env, shell=True)
if compilation_result.returncode != 0:
if compilation_result.stderr:
err = compilation_result.stderr.decode()
else:
err = compilation_result.stdout.decode()
result.append(f"failed: compilation error: {err}")
else:
try:
exec_result = None
with time_limit(timeout):
exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
if exec_result.returncode == 0:
result.append("passed")
else:
if exec_result.stderr:
try:
err = exec_result.stderr.decode()
except:
err = exec_result.stderr
else:
try:
err = exec_result.stdout.decode()
except:
err = exec_result.stdout
result.append(f"failed: {err}")
except TimeoutException:
result.append("timed out")
if result[-1] != "passed":
env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
cmd = f"{tsc_exec}tsc test.ts"
compilation_result = subprocess.run(cmd, timeout=timeout, capture_output=True, env=env, shell=True)
if compilation_result.returncode != 0:
if compilation_result.stderr:
err = compilation_result.stderr.decode()
else:
err = compilation_result.stdout.decode()
result[-1] = f"failed: compilation error: {err}"
else:
try:
exec_result = None
with time_limit(timeout):
exec_result = subprocess.run([f"{node_exec}node", "test.js"], timeout=timeout, capture_output=True)
if exec_result.returncode == 0:
result[-1] = "passed"
else:
if exec_result.stderr:
try:
err = exec_result.stderr.decode()
except:
err = exec_result.stderr
else:
try:
err = exec_result.stdout.decode()
except:
err = exec_result.stdout
result[-1] = f"failed: {err}"
except TimeoutException:
result[-1] = "timed out"
os.chdir(origin_path)
shutil.rmtree(tmp_dir)
elif "cs" in language_type.lower():
import os
import shutil
origin_path = os.getcwd()
if "tmp" not in tmp_dir:
tmp_dir = os.path.join(tmp_dir, "tmp")
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
os.chdir(tmp_dir)
open(f"Program.cs", 'w').write(sample["test_code"])
cmd = f"{cs_exec}mcs -d:DEBUG Program.cs"
compilation_result = subprocess.run(cmd, shell=True, capture_output=True)
if compilation_result.returncode != 0:
if compilation_result.stderr:
err = compilation_result.stderr.decode()
else:
err = compilation_result.stdout.decode()
result.append(f"failed: compilation error: {err}")
else:
try:
exec_result = None
cmd = f"{cs_exec}mono Program.exe"
env = dict(MONO_TRACE_LISTENER="Console.Error")
with time_limit(timeout):
exec_result = subprocess.run(cmd, timeout=timeout, shell=True, capture_output=True, env=env)
if "Fail" not in exec_result.stderr.decode():
result.append("passed")
else:
if exec_result.stderr:
try:
err = exec_result.stderr.decode()
except:
err = exec_result.stderr
else:
try:
err = exec_result.stdout.decode()
except:
err = exec_result.stdout
result.append(f"failed: {err}")
except TimeoutException:
result.append("timed out")
except Exception as e:
result.append(f"failed: {e}")
os.chdir(origin_path)
shutil.rmtree(tmp_dir)
elif "rust" in language_type.lower():
import os
WD: str = os.path.dirname(os.path.abspath(__file__))
RUST_DIR: str = os.path.join(WD, "rust")
RUST_SRC: str = os.path.join(RUST_DIR, "src")
RUST_BIN: str = os.path.join(RUST_SRC, "bin")
RUST_TMP_DIR: str = os.path.join(RUST_DIR, "tmp")
RUST_LOGS: str = os.path.join(RUST_TMP_DIR, "logs")
RUST_EXT: str = ".rs"
# Create mandatory tmp directories
os.makedirs(RUST_TMP_DIR, exist_ok=True)
os.makedirs(RUST_LOGS, exist_ok=True)
os.makedirs(RUST_SRC, exist_ok=True)
os.makedirs(RUST_BIN, exist_ok=True)
with tempfile.NamedTemporaryFile(dir = RUST_BIN, delete=False) as f:
#temporal file name
file_prefix = sample["task_id"].lower().replace("/", "_")
file_name:str = file_prefix +RUST_EXT
os.rename(f.name, os.path.join(RUST_BIN, file_name))
# Sample to pure Rust function
rust_code: str = sample["test_code"]
# dump the rust source code in the target temporal file
f.write(rust_code.encode('utf-8'))
# Proceed towards Rust binaries compilation. Therefore move to Rust module root dir.
os.chdir(RUST_DIR)
# Two possible outcomes
# Pass OR Fail compilation
log_filename: str = file_prefix + ".jsonl"
log_path: str = os.path.join(RUST_LOGS, log_filename)
cargo_check: str = "cargo check --bin " + file_prefix + " --message-format json >> " + log_path
# Compilation build status
returned_val_compilation: int
# Overwrite file content
if os.path.exists(log_path):
if(file_size := os.path.getsize(log_path)) >= 0:
os.remove(log_path)
returned_val_compilation = os.system(cargo_check)
else:
returned_val_compilation = os.system(cargo_check)
# 0 means success
if returned_val_compilation == 0:
#Execution pipeline
cargo_test: str = "cargo test --bin " +file_prefix+ " --message-format json >> " + log_path
returned_val_execution = os.system(cargo_test)
if returned_val_execution == 0:
result.append("passed")
else:
result.append(f"failed: execution error")
else:
result.append(f"failed: compilation error")
elif "java" in language_type.lower():
assert tmp_dir is not None, "Java should be evaluated in a temporary dir."
import os
import shutil
if "tmp" not in tmp_dir:
tmp_dir = os.path.join(tmp_dir, "tmp")
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
if not os.path.exists(tmp_dir):
os.makedirs(tmp_dir)
open(os.path.join(tmp_dir, "Problem.java"), 'w').write(sample["test_code"])
origin_path = os.getcwd()
os.system(f"cp ./javatuples-1.2.jar {tmp_dir}/")
os.chdir(tmp_dir)
res = "failed: unknown error"
compile_returncode = -1
for _ in range(5):
try:
cmd = f"{java_exec}javac -cp javatuples-1.2.jar Problem.java"
compilation_result = subprocess.run(cmd, timeout=60, capture_output=True, shell=True)
compile_returncode = compilation_result.returncode
break
except subprocess.TimeoutExpired as e:
continue
if compile_returncode != 0:
res = "failed: compilation error"
else:
exec_result = None
try:
# WARNING
# This program exists to execute untrusted model-generated code. Although
# it is highly unlikely that model-generated code will do something overtly
# malicious in response to this test suite, model-generated code may act
# destructively due to a lack of model capability or alignment.
# Users are strongly encouraged to sandbox this evaluation suite so that it
# does not perform destructive actions on their host or network.
# Once you have read this disclaimer and taken appropriate precautions,
# uncomment the following line and proceed at your own risk:
cmd = f"{java_exec}java -ea -cp .:javatuples-1.2.jar Problem"
exec_result = subprocess.run(cmd, timeout=timeout, capture_output=True, shell=True)
if exec_result.returncode == 0:
res = "passed"
elif exec_result.returncode == 1:
if "AssertionError" in exec_result.stderr.decode('unicode-escape'):
res = "failed: wrong answer"
else:
res = f"failed: {exec_result.stderr.decode()}"
except subprocess.TimeoutExpired as e:
res = "time out"
except BaseException as e:
res = f"failed: {e}"
result.append(res)
os.chdir(origin_path)
shutil.rmtree(tmp_dir)
manager = multiprocessing.Manager()
result = manager.list()
p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,))
p.start()
p.join(timeout=timeout + 1)
if p.is_alive():
p.kill()
if not result:
result.append("timed out")
return {
"task_id" : task_id,
"completion_id": completion_id,
"result" : result[0],
"passed" : result[0] == "passed",
"finish" : -1 if "finish" not in sample else sample["finish"],
"code" : sample["test_code"]
}
# Copyright (c) OpenAI (https://openai.com)
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# ============================================================================
@contextlib.contextmanager
def time_limit(seconds: float):
def signal_handler(signum, frame):
raise TimeoutException("Timed out!")
signal.setitimer(signal.ITIMER_REAL, seconds)
signal.signal(signal.SIGALRM, signal_handler)
try:
yield
finally:
signal.setitimer(signal.ITIMER_REAL, 0)
@contextlib.contextmanager
def swallow_io():
stream = WriteOnlyStringIO()
with contextlib.redirect_stdout(stream):
with contextlib.redirect_stderr(stream):
with redirect_stdin(stream):
yield
@contextlib.contextmanager
def create_tempdir():
with tempfile.TemporaryDirectory() as dirname:
with chdir(dirname):
yield dirname
class TimeoutException(Exception):
pass
class WriteOnlyStringIO(io.StringIO):
""" StringIO that throws an exception when it's read from """
def read(self, *args, **kwargs):
raise IOError
def readline(self, *args, **kwargs):
raise IOError
def readlines(self, *args, **kwargs):
raise IOError
def readable(self, *args, **kwargs):
""" Returns True if the IO object can be read. """
return False
class redirect_stdin(contextlib._RedirectStream): # type: ignore
_stream = 'stdin'
@contextlib.contextmanager
def chdir(root):
if root == ".":
yield
return
cwd = os.getcwd()
os.chdir(root)
try:
yield
except BaseException as exc:
raise exc
finally:
os.chdir(cwd)
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
"""
This disables various destructive functions and prevents the generated code
from interfering with the test (e.g. fork bomb, killing other processes,
removing filesystem files, etc.)
WARNING
This function is NOT a security sandbox. Untrusted code, including, model-
generated code, should not be blindly executed outside of one. See the
Codex paper for more information about OpenAI's code sandbox, and proceed
with caution.
"""
if maximum_memory_bytes is not None:
import resource
resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
if not platform.uname().system == 'Darwin':
resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
faulthandler.disable()
import builtins
builtins.exit = None
builtins.quit = None
import os
os.environ['OMP_NUM_THREADS'] = '1'
os.kill = None
os.system = None
os.putenv = None
os.remove = None
os.removedirs = None
os.rmdir = None
os.fchdir = None
os.setuid = None
os.fork = None
os.forkpty = None
os.killpg = None
os.rename = None
os.renames = None
os.truncate = None
os.replace = None
os.unlink = None
os.fchmod = None
os.fchown = None
os.chmod = None
os.chown = None
os.chroot = None
os.fchdir = None
os.lchflags = None
os.lchmod = None
os.lchown = None
os.getcwd = None
os.chdir = None
import shutil
shutil.rmtree = None
shutil.move = None
shutil.chown = None
import subprocess
subprocess.Popen = None # type: ignore
__builtins__['help'] = None
import sys
sys.modules['ipdb'] = None
sys.modules['joblib'] = None
sys.modules['resource'] = None
sys.modules['psutil'] = None
sys.modules['tkinter'] = None