Fix issues with swe-bench

ricardo-larosa 2024-06-22 15:45:12 -07:00
parent 5e0ebae057
commit 91806d296b
4 changed files with 34 additions and 21 deletions

View File

@@ -7,7 +7,7 @@ import time
 print("Downloading dataset...")
 dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split = "test", cache_dir='datasets_cache')
-preds_path = "llama2-70b-4096.jsonl"
+preds_path = "llama3-70b-8192.jsonl"
 try:
     with open(preds_path, "r") as file:
         preds_jsonl = file.read()
@@ -34,8 +34,8 @@ def save_jsonl(jsonl_object, file_path="preds.jsonl"):
 args = argparse.Namespace(
     # backend='mixtral-8x7b-32768',
-    backend='llama2-70b-4096',
-    temperature=0.5,
+    backend='llama3-70b-8192',
+    temperature=0.7,
     task='swe',
     naive_run=False,
     prompt_sample='cot',
@@ -50,12 +50,16 @@ print("Solving...")
 task = SWETask(dataset)
-for index in range(6,100):
-    ys, infos = solve(args, task, index, to_print=False)
+for index in range(3,4):
+    instance_id = dataset[index]["instance_id"]
+    size = len(dataset[index]["problem_statement"])
+    print(f" ### Task {index} -- {instance_id} -> size ({size} )###")
+    ys, infos, _ = solve(args, task, index, to_print=False)
     preds_jsonl = update_jsonl(dataset[index]["instance_id"], SWETask.parse_diff_block(ys[0]), args.backend, preds_jsonl)
     save_jsonl(preds_jsonl, preds_path)
-    # print("-----------Predicted----------------------")
-    # print(SWETask.parse_diff_block(ys[0]))
-    # # print("-----------Expected----------------------")
-    # # print(dataset[index]["patch"])
-    time.sleep(60)
+    print("-----------Predicted----------------------")
+    print(SWETask.parse_diff_block(ys[0]))
+    print("-----------Expected----------------------")
+    print(dataset[index]["patch"])
+    # time.sleep(1)
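
The loop above persists one prediction per instance through update_jsonl and save_jsonl, whose bodies are not part of this diff. A minimal sketch of what they plausibly do, assuming the standard SWE-bench predictions schema (instance_id, model_name_or_path, model_patch); the field names and the replace-or-append behavior are assumptions, not code from this commit:

import json

def update_jsonl(instance_id, model_patch, model_name, preds_jsonl):
    # Sketch: keep one prediction per instance, replacing any earlier attempt
    # for the same instance_id (field names follow the SWE-bench predictions
    # schema expected by the evaluation harness).
    entries = [json.loads(line) for line in preds_jsonl.splitlines() if line.strip()]
    entries = [e for e in entries if e["instance_id"] != instance_id]
    entries.append({
        "instance_id": instance_id,
        "model_name_or_path": model_name,
        "model_patch": model_patch,
    })
    return "\n".join(json.dumps(e) for e in entries) + "\n"

def save_jsonl(jsonl_object, file_path="preds.jsonl"):
    # Sketch: rewrite the whole file after every instance, so a crash or
    # rate-limit failure loses at most the in-flight task.
    with open(file_path, "w") as file:
        file.write(jsonl_object)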

View File

@@ -1,4 +1,4 @@
-import itertools, os
+import itertools, os, time
 import numpy as np
 from functools import partial
@@ -64,6 +64,7 @@ def solve(args, task, idx, to_print=True):
     ys = ['']  # current output candidates
     infos = []
     for step in range(task.steps):
+        # print(f"Step {step} - gen")
         # generation
         if args.method_generate == 'sample':
             new_ys = [get_samples(task, x, y, args.n_generate_sample, prompt_sample=args.prompt_sample, stop=task.stops[step]) for y in ys]
@@ -71,12 +72,13 @@ def solve(args, task, idx, to_print=True):
             new_ys = [get_proposals(task, x, y) for y in ys]
         new_ys = list(itertools.chain(*new_ys))
         ids = list(range(len(new_ys)))
+        time.sleep(5)
         # evaluation
+        # print(f"Step {step} - eval")
         if args.method_evaluate == 'vote':
             values = get_votes(task, x, new_ys, args.n_evaluate_sample)
         elif args.method_evaluate == 'value':
             values = get_values(task, x, new_ys, args.n_evaluate_sample)
         # selection
         if args.method_select == 'sample':
             ps = np.array(values) / sum(values)
@@ -92,12 +94,14 @@ def solve(args, task, idx, to_print=True):
         infos.append({'step': step, 'x': x, 'ys': ys, 'new_ys': new_ys, 'values': values, 'select_new_ys': select_new_ys})
         ys = select_new_ys
+        if step == 0:
+            time.sleep(5)
-    print(usage())
+    total_usage = usage()
     if to_print:
         print(ys)
-    return ys, {'steps': infos}
+    return ys, {'steps': infos}, total_usage
 def naive_solve(args, task, idx, to_print=True):
     global platform
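
The sprinkled time.sleep(5) calls pace request bursts against Groq's free-tier per-minute rate limits, and solve() now returns token accounting alongside the candidate outputs, so callers unpack three values. A minimal usage sketch, assuming usage() yields the dict shape produced by gpt_usage() in the next file:

# Sketch: consuming the new three-value return of solve(), with args,
# task, and index as configured in the runner script above.
ys, infos, total_usage = solve(args, task, index, to_print=False)
print(f"prompt tokens:     {total_usage['prompt_tokens']}")
print(f"completion tokens: {total_usage['completion_tokens']}")
print(f"estimated cost:    ${total_usage['cost']:.4f}")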

View File

@@ -1,4 +1,4 @@
-import os
+import os, time
 import openai
 import backoff
@@ -37,6 +37,7 @@ def chatgpt(messages, model="gpt-4", temperature=0.7, max_tokens=1000, n=1, stop
         # log completion tokens
         completion_tokens += res["usage"]["completion_tokens"]
         prompt_tokens += res["usage"]["prompt_tokens"]
+        time.sleep(2)
     return outputs
 def gpt_usage(backend="gpt-4"):
@@ -45,14 +46,16 @@ def gpt_usage(backend="gpt-4"):
         cost = completion_tokens / 1000 * 0.06 + prompt_tokens / 1000 * 0.03
     elif backend == "gpt-3.5-turbo":
         cost = completion_tokens / 1000 * 0.002 + prompt_tokens / 1000 * 0.0015
+    elif backend == "gpt-4-turbo":
+        cost = completion_tokens / 1000 * 0.03 + prompt_tokens / 1000 * 0.01
     return {"completion_tokens": completion_tokens, "prompt_tokens": prompt_tokens, "cost": cost}
-def groq(prompt, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=1500, n=1, stop=None) -> list:
+def groq(prompt, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=1000, n=1, stop=None) -> list:
     global completion_tokens, prompt_tokens
     messages = [{"role": "user", "content": prompt}]
     return groqgpt(messages, model=model, temperature=temperature, max_tokens=max_tokens, n=n, stop=stop)
-def groqgpt(messages, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=2000,n=1, stop=None) -> list:
+def groqgpt(messages, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=1000,n=1, stop=None) -> list:
     global completion_tokens, prompt_tokens
     outputs = []
     while n > 0:
@@ -62,6 +65,7 @@ def groqgpt(messages, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=20
         # log completion tokens
         completion_tokens += res["usage"]["completion_tokens"]
         prompt_tokens += res["usage"]["prompt_tokens"]
+        time.sleep(2)
     return outputs
 def groq_usage():
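
The pricing chain in gpt_usage() now covers gpt-4-turbo at $0.01 per 1K prompt tokens and $0.03 per 1K completion tokens. An equivalent table-driven rewrite, a sketch built only from the rates already present in this file (not code from the commit):

# Per-1K-token USD rates as (prompt, completion), taken from the branches above.
PRICES = {
    "gpt-4":         (0.03, 0.06),
    "gpt-4-turbo":   (0.01, 0.03),
    "gpt-3.5-turbo": (0.0015, 0.002),
}

def gpt_usage(backend="gpt-4"):
    global completion_tokens, prompt_tokens
    prompt_rate, completion_rate = PRICES[backend]
    cost = prompt_tokens / 1000 * prompt_rate + completion_tokens / 1000 * completion_rate
    return {"completion_tokens": completion_tokens, "prompt_tokens": prompt_tokens, "cost": cost}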

View File

@@ -20,13 +20,14 @@ class SWETask(Task):
         super().__init__()
         self.data = dataset
         self.steps = 2
-        self.stops = ['\nPatch:\n', None]
+        self.stops = ['\nPatch:\n', '<|eot_id|>']
     def __len__(self) -> int:
         return len(self.data)
     def get_input(self, idx: int) -> str:
         return instance_info.format(repo=self.data[idx]['repo'], base_commit=self.data[idx]['base_commit'], problem_statement=self.data[idx]['problem_statement'])
+        # return instance_info.format(repo=self.data[idx]['repo'], base_commit=self.data[idx]['base_commit'], problem_statement=self.data[idx]['text'])
     def test_output(self, idx: int, output: str):
         output = output.split('Patch:\n')[-1]
@@ -35,7 +36,7 @@ class SWETask(Task):
         if api_base == 'https://api.groq.com/openai/v1':
             score_output = groq(prompt, n=5, model='mixtral-8x7b-32768')
         else:
-            score_outputs = gpt(prompt, n=5, model='gpt-4')
+            score_outputs = gpt(prompt, n=5, model='gpt-4-turbo')
         scores = []
         for score_output in score_outputs:
             print("score_output: ",score_output)