Mirror of https://github.com/princeton-nlp/tree-of-thought-llm (synced 2025-04-21 22:54:13 +00:00)
Fix issues with swe-bench
commit 91806d296b (parent 5e0ebae057)
@@ -7,7 +7,7 @@ import time
 print("Downloading dataset...")
 dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split = "test", cache_dir='datasets_cache')
-preds_path = "llama2-70b-4096.jsonl"
+preds_path = "llama3-70b-8192.jsonl"
 try:
     with open(preds_path, "r") as file:
         preds_jsonl = file.read()
@@ -34,8 +34,8 @@ def save_jsonl(jsonl_object, file_path="preds.jsonl"):
 args = argparse.Namespace(
     # backend='mixtral-8x7b-32768',
-    backend='llama2-70b-4096',
-    temperature=0.5,
+    backend='llama3-70b-8192',
+    temperature=0.7,
     task='swe',
     naive_run=False,
     prompt_sample='cot',
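The Namespace above is truncated by the hunk; the remaining fields can be inferred from how solve() consumes args later in this same commit (method_generate, method_evaluate, method_select, n_generate_sample, n_evaluate_sample). A minimal sketch of the full configuration, with illustrative values only:

import argparse

# Sketch only: field names are taken from the solve() code shown further down
# in this diff; the concrete values here are guesses, not the ones used in the run.
args = argparse.Namespace(
    backend='llama3-70b-8192',     # Groq-hosted Llama 3 70B
    temperature=0.7,
    task='swe',
    naive_run=False,
    prompt_sample='cot',           # chain-of-thought sampling prompt
    method_generate='sample',      # 'sample' -> get_samples, 'propose' -> get_proposals
    method_evaluate='vote',        # 'vote' -> get_votes, 'value' -> get_values
    method_select='greedy',        # 'greedy' or 'sample'
    n_generate_sample=3,
    n_evaluate_sample=3,
    n_select_sample=1,             # beam width kept per step (assumed)
)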
@@ -50,12 +50,16 @@ print("Solving...")
 task = SWETask(dataset)


-for index in range(6,100):
-    ys, infos = solve(args, task, index, to_print=False)
+for index in range(3,4):
+    instance_id = dataset[index]["instance_id"]
+    size = len(dataset[index]["problem_statement"])
+    print(f" ### Task {index} -- {instance_id} -> size ({size} )###")
+    ys, infos, _ = solve(args, task, index, to_print=False)
     preds_jsonl = update_jsonl(dataset[index]["instance_id"], SWETask.parse_diff_block(ys[0]), args.backend, preds_jsonl)
     save_jsonl(preds_jsonl, preds_path)
-    # print("-----------Predicted----------------------")
-    # print(SWETask.parse_diff_block(ys[0]))
-    # # print("-----------Expected----------------------")
-    # # print(dataset[index]["patch"])
-    time.sleep(60)
+    print("-----------Predicted----------------------")
+    print(SWETask.parse_diff_block(ys[0]))
+    print("-----------Expected----------------------")
+    print(dataset[index]["patch"])
+    # time.sleep(1)
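The loop relies on two helpers that this hunk does not show, update_jsonl and save_jsonl. A minimal sketch of what they might look like, assuming predictions are stored one JSON object per line with the standard SWE-bench fields (instance_id, model_patch, model_name_or_path); the real helpers in this repo may differ:

import json

def update_jsonl(instance_id, model_patch, model_name, preds_jsonl):
    # Replace any existing prediction for this instance, then append the new one.
    records = [json.loads(line) for line in preds_jsonl.splitlines() if line.strip()]
    records = [r for r in records if r.get("instance_id") != instance_id]
    records.append({
        "instance_id": instance_id,
        "model_patch": model_patch,
        "model_name_or_path": model_name,
    })
    return "\n".join(json.dumps(r) for r in records) + "\n"

def save_jsonl(jsonl_object, file_path="preds.jsonl"):
    with open(file_path, "w") as f:
        f.write(jsonl_object)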
@@ -1,4 +1,4 @@
-import itertools, os
+import itertools, os, time
 import numpy as np
 from functools import partial

@@ -64,6 +64,7 @@ def solve(args, task, idx, to_print=True):
     ys = ['']  # current output candidates
     infos = []
     for step in range(task.steps):
+        # print(f"Step {step} - gen")
         # generation
         if args.method_generate == 'sample':
             new_ys = [get_samples(task, x, y, args.n_generate_sample, prompt_sample=args.prompt_sample, stop=task.stops[step]) for y in ys]
@@ -71,12 +72,13 @@ def solve(args, task, idx, to_print=True):
             new_ys = [get_proposals(task, x, y) for y in ys]
         new_ys = list(itertools.chain(*new_ys))
         ids = list(range(len(new_ys)))
+        time.sleep(5)
         # evaluation
+        # print(f"Step {step} - eval")
         if args.method_evaluate == 'vote':
             values = get_votes(task, x, new_ys, args.n_evaluate_sample)
         elif args.method_evaluate == 'value':
             values = get_values(task, x, new_ys, args.n_evaluate_sample)

         # selection
         if args.method_select == 'sample':
             ps = np.array(values) / sum(values)
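Only the 'sample' branch of the selection step is visible in this hunk; the 'greedy' branch presumably keeps the highest-valued candidates, as in the upstream tree-of-thought code. A minimal sketch of both strategies, with n_select_sample as the assumed beam width:

import numpy as np

def select_ids(values, n_select_sample, method_select="greedy"):
    ids = list(range(len(values)))
    if method_select == "sample":
        ps = np.array(values) / sum(values)                       # value-proportional sampling
        return list(np.random.choice(ids, size=n_select_sample, p=ps))
    # greedy: keep the top-scoring candidates
    return sorted(ids, key=lambda i: values[i], reverse=True)[:n_select_sample]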
@@ -92,12 +94,14 @@ def solve(args, task, idx, to_print=True):

         infos.append({'step': step, 'x': x, 'ys': ys, 'new_ys': new_ys, 'values': values, 'select_new_ys': select_new_ys})
         ys = select_new_ys
+        if step == 0:
+            time.sleep(5)

-    print(usage())
+    total_usage = usage()

     if to_print:
         print(ys)
-    return ys, {'steps': infos}
+    return ys, {'steps': infos}, total_usage

 def naive_solve(args, task, idx, to_print=True):
     global platform
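With this change solve() returns a third value, the token-usage/cost summary from usage(), which is why the driver loop earlier in this commit unpacks ys, infos, _. A short usage sketch (illustrative only):

ys, infos, total_usage = solve(args, task, index, to_print=False)
print(total_usage)   # e.g. {'completion_tokens': ..., 'prompt_tokens': ..., 'cost': ...}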
@@ -1,4 +1,4 @@
-import os
+import os, time
 import openai
 import backoff

@@ -37,6 +37,7 @@ def chatgpt(messages, model="gpt-4", temperature=0.7, max_tokens=1000, n=1, stop
         # log completion tokens
         completion_tokens += res["usage"]["completion_tokens"]
         prompt_tokens += res["usage"]["prompt_tokens"]
+        time.sleep(2)
     return outputs

 def gpt_usage(backend="gpt-4"):
@@ -45,14 +46,16 @@ def gpt_usage(backend="gpt-4"):
         cost = completion_tokens / 1000 * 0.06 + prompt_tokens / 1000 * 0.03
     elif backend == "gpt-3.5-turbo":
         cost = completion_tokens / 1000 * 0.002 + prompt_tokens / 1000 * 0.0015
+    elif backend == "gpt-4-turbo":
+        cost = completion_tokens / 1000 * 0.03 + prompt_tokens / 1000 * 0.01
     return {"completion_tokens": completion_tokens, "prompt_tokens": prompt_tokens, "cost": cost}

-def groq(prompt, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=1500, n=1, stop=None) -> list:
+def groq(prompt, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=1000, n=1, stop=None) -> list:
     global completion_tokens, prompt_tokens
     messages = [{"role": "user", "content": prompt}]
     return groqgpt(messages, model=model, temperature=temperature, max_tokens=max_tokens, n=n, stop=stop)

-def groqgpt(messages, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=2000,n=1, stop=None) -> list:
+def groqgpt(messages, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=1000,n=1, stop=None) -> list:
     global completion_tokens, prompt_tokens
     outputs = []
     while n > 0:
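As a sanity check on the gpt-4-turbo branch added above, the cost is computed per 1,000 tokens; a worked example with illustrative token counts:

prompt_tokens, completion_tokens = 1000, 500          # illustrative numbers only
cost = completion_tokens / 1000 * 0.03 + prompt_tokens / 1000 * 0.01
print(cost)  # 0.025 (USD)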
@@ -62,6 +65,7 @@ def groqgpt(messages, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=20
         # log completion tokens
         completion_tokens += res["usage"]["completion_tokens"]
         prompt_tokens += res["usage"]["prompt_tokens"]
+        time.sleep(2)
     return outputs

 def groq_usage():
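The body of groqgpt around these lines is not shown. A minimal sketch of the rate-limited completion loop the hunk implies, assuming the Groq OpenAI-compatible endpoint is called through the pre-1.0 openai client with a backoff decorator (both imports appear in this file); the exact request code in the repo may differ:

import time
import backoff
import openai

completion_tokens = prompt_tokens = 0

@backoff.on_exception(backoff.expo, openai.error.OpenAIError)
def completions_with_backoff(**kwargs):
    return openai.ChatCompletion.create(**kwargs)

def groqgpt_sketch(messages, model="mixtral-8x7b-32768", temperature=0.5,
                   max_tokens=1000, n=1, stop=None):
    global completion_tokens, prompt_tokens
    outputs = []
    while n > 0:
        res = completions_with_backoff(model=model, messages=messages,
                                       temperature=temperature, max_tokens=max_tokens,
                                       n=1, stop=stop)
        outputs.extend(choice["message"]["content"] for choice in res["choices"])
        # log token usage for cost accounting
        completion_tokens += res["usage"]["completion_tokens"]
        prompt_tokens += res["usage"]["prompt_tokens"]
        time.sleep(2)  # crude pause between requests to stay under Groq rate limits
        n -= 1
    return outputs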
@@ -20,13 +20,14 @@ class SWETask(Task):
         super().__init__()
         self.data = dataset
         self.steps = 2
-        self.stops = ['\nPatch:\n', None]
+        self.stops = ['\nPatch:\n', '<|eot_id|>']

     def __len__(self) -> int:
         return len(self.data)

     def get_input(self, idx: int) -> str:
         return instance_info.format(repo=self.data[idx]['repo'], base_commit=self.data[idx]['base_commit'], problem_statement=self.data[idx]['problem_statement'])
+        # return instance_info.format(repo=self.data[idx]['repo'], base_commit=self.data[idx]['base_commit'], problem_statement=self.data[idx]['text'])

     def test_output(self, idx: int, output: str):
         output = output.split('Patch:\n')[-1]
@@ -35,7 +36,7 @@ class SWETask(Task):
         if api_base == 'https://api.groq.com/openai/v1':
             score_output = groq(prompt, n=5, model='mixtral-8x7b-32768')
         else:
-            score_outputs = gpt(prompt, n=5, model='gpt-4')
+            score_outputs = gpt(prompt, n=5, model='gpt-4-turbo')
         scores = []
         for score_output in score_outputs:
             print("score_output: ",score_output)
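The hunk ends before showing how each sampled score_output is turned into a number. A minimal sketch of one plausible aggregation, assuming every judgement contains a numeric score somewhere in its text (the actual parsing in this repo is not shown here):

import re

def aggregate_scores(score_outputs):
    # Take the last number mentioned in each sampled judgement and average them.
    scores = []
    for score_output in score_outputs:
        numbers = re.findall(r'\d+(?:\.\d+)?', score_output)
        if numbers:
            scores.append(float(numbers[-1]))
    return sum(scores) / len(scores) if scores else 0.0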