From 91806d296bd137308653e37c7e73f5e45cc2930b Mon Sep 17 00:00:00 2001
From: ricardo-larosa
Date: Sat, 22 Jun 2024 15:45:12 -0700
Subject: [PATCH] Fix issues with swe-bench

---
 runs/swe-bench.py      | 24 ++++++++++++++----------
 src/tot/methods/bfs.py | 14 +++++++++-----
 src/tot/models.py      | 12 ++++++++----
 src/tot/tasks/swe.py   |  5 +++--
 4 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/runs/swe-bench.py b/runs/swe-bench.py
index 36542cf..007842f 100644
--- a/runs/swe-bench.py
+++ b/runs/swe-bench.py
@@ -7,7 +7,7 @@ import time
 
 print("Downloading dataset...")
 dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split = "test", cache_dir='datasets_cache')
-preds_path = "llama2-70b-4096.jsonl"
+preds_path = "llama3-70b-8192.jsonl"
 try:
     with open(preds_path, "r") as file:
         preds_jsonl = file.read()
@@ -34,8 +34,8 @@ def save_jsonl(jsonl_object, file_path="preds.jsonl"):
 
 args = argparse.Namespace(
     # backend='mixtral-8x7b-32768',
-    backend='llama2-70b-4096',
-    temperature=0.5,
+    backend='llama3-70b-8192',
+    temperature=0.7,
     task='swe',
     naive_run=False,
     prompt_sample='cot',
@@ -50,12 +50,16 @@
 print("Solving...")
 
 task = SWETask(dataset)
-for index in range(6,100):
-    ys, infos = solve(args, task, index, to_print=False)
+for index in range(3,4):
+    instance_id = dataset[index]["instance_id"]
+    size = len(dataset[index]["problem_statement"])
+    print(f"### Task {index} -- {instance_id} -> size ({size}) ###")
+    ys, infos, _ = solve(args, task, index, to_print=False)
     preds_jsonl = update_jsonl(dataset[index]["instance_id"], SWETask.parse_diff_block(ys[0]), args.backend, preds_jsonl)
     save_jsonl(preds_jsonl, preds_path)
-    # print("-----------Predicted----------------------")
-    # print(SWETask.parse_diff_block(ys[0]))
-    # # print("-----------Expected----------------------")
-    # # print(dataset[index]["patch"])
-    time.sleep(60)
+    print("-----------Predicted----------------------")
+    print(SWETask.parse_diff_block(ys[0]))
+    print("-----------Expected----------------------")
+    print(dataset[index]["patch"])
+    # time.sleep(1)
+
diff --git a/src/tot/methods/bfs.py b/src/tot/methods/bfs.py
index ba41a15..05c3c13 100644
--- a/src/tot/methods/bfs.py
+++ b/src/tot/methods/bfs.py
@@ -1,4 +1,4 @@
-import itertools, os
+import itertools, os, time
 import numpy as np
 from functools import partial
@@ -64,6 +64,7 @@ def solve(args, task, idx, to_print=True):
     ys = ['']  # current output candidates
     infos = []
     for step in range(task.steps):
+        # print(f"Step {step} - gen")
         # generation
         if args.method_generate == 'sample':
             new_ys = [get_samples(task, x, y, args.n_generate_sample, prompt_sample=args.prompt_sample, stop=task.stops[step]) for y in ys]
@@ -71,12 +72,13 @@
             new_ys = [get_proposals(task, x, y) for y in ys]
         new_ys = list(itertools.chain(*new_ys))
         ids = list(range(len(new_ys)))
+        time.sleep(5)
         # evaluation
+        # print(f"Step {step} - eval")
         if args.method_evaluate == 'vote':
             values = get_votes(task, x, new_ys, args.n_evaluate_sample)
         elif args.method_evaluate == 'value':
             values = get_values(task, x, new_ys, args.n_evaluate_sample)
-
         # selection
         if args.method_select == 'sample':
             ps = np.array(values) / sum(values)
@@ -92,12 +94,14 @@ def solve(args, task, idx, to_print=True):
 
         infos.append({'step': step, 'x': x, 'ys': ys, 'new_ys': new_ys, 'values': values, 'select_new_ys': select_new_ys})
         ys = select_new_ys
-
-    print(usage())
+        if step == 0:
+            time.sleep(5)
+
+    total_usage = usage()
     if to_print: 
         print(ys)
-    return ys, {'steps': infos}
+    return ys, {'steps': infos}, total_usage
 
 def naive_solve(args, task, idx, to_print=True):
     global platform
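[Editor's note, not part of the patch: with the bfs.py change above, solve() now returns a third value, the accumulated token usage reported by usage(). A minimal call-site sketch under that assumption; the dict shape is assumed to mirror gpt_usage()/groq_usage() in src/tot/models.py:

    # Hypothetical call site after this patch (not from the diff):
    ys, infos, total_usage = solve(args, task, index, to_print=False)
    # total_usage is assumed to look like:
    # {"completion_tokens": 1234, "prompt_tokens": 5678, "cost": 0.21}

Callers that do not need the usage figures can discard them, as runs/swe-bench.py does above with `ys, infos, _ = solve(...)`.]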
diff --git a/src/tot/models.py b/src/tot/models.py
index f0c539f..850f641 100644
--- a/src/tot/models.py
+++ b/src/tot/models.py
@@ -1,6 +1,6 @@
-import os
+import os, time
 import openai
-import backoff
+import backoff
 
 completion_tokens = prompt_tokens = 0
@@ -37,6 +37,7 @@ def chatgpt(messages, model="gpt-4", temperature=0.7, max_tokens=1000, n=1, stop
         # log completion tokens
         completion_tokens += res["usage"]["completion_tokens"]
         prompt_tokens += res["usage"]["prompt_tokens"]
+        time.sleep(2)
     return outputs
 
 def gpt_usage(backend="gpt-4"):
@@ -45,14 +46,16 @@ def gpt_usage(backend="gpt-4"):
         cost = completion_tokens / 1000 * 0.06 + prompt_tokens / 1000 * 0.03
     elif backend == "gpt-3.5-turbo":
         cost = completion_tokens / 1000 * 0.002 + prompt_tokens / 1000 * 0.0015
+    elif backend == "gpt-4-turbo":
+        cost = completion_tokens / 1000 * 0.03 + prompt_tokens / 1000 * 0.01
     return {"completion_tokens": completion_tokens, "prompt_tokens": prompt_tokens, "cost": cost}
 
-def groq(prompt, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=1500, n=1, stop=None) -> list:
+def groq(prompt, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=1000, n=1, stop=None) -> list:
     global completion_tokens, prompt_tokens
     messages = [{"role": "user", "content": prompt}]
     return groqgpt(messages, model=model, temperature=temperature, max_tokens=max_tokens, n=n, stop=stop)
 
-def groqgpt(messages, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=2000,n=1, stop=None) -> list:
+def groqgpt(messages, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=1000, n=1, stop=None) -> list:
     global completion_tokens, prompt_tokens
     outputs = []
     while n > 0:
@@ -62,6 +65,7 @@ def groqgpt(messages, model="mixtral-8x7b-32768", temperature=0.5, max_tokens=20
         # log completion tokens
         completion_tokens += res["usage"]["completion_tokens"]
         prompt_tokens += res["usage"]["prompt_tokens"]
+        time.sleep(2)
     return outputs
 
 def groq_usage():
diff --git a/src/tot/tasks/swe.py b/src/tot/tasks/swe.py
index ce20cbc..1669266 100644
--- a/src/tot/tasks/swe.py
+++ b/src/tot/tasks/swe.py
@@ -20,13 +20,14 @@ class SWETask(Task):
         super().__init__()
         self.data = dataset
         self.steps = 2
-        self.stops = ['\nPatch:\n', None]
+        self.stops = ['\nPatch:\n', '<|eot_id|>']
 
     def __len__(self) -> int:
         return len(self.data)
 
     def get_input(self, idx: int) -> str:
         return instance_info.format(repo=self.data[idx]['repo'], base_commit=self.data[idx]['base_commit'], problem_statement=self.data[idx]['problem_statement'])
+        # return instance_info.format(repo=self.data[idx]['repo'], base_commit=self.data[idx]['base_commit'], problem_statement=self.data[idx]['text'])
 
     def test_output(self, idx: int, output: str):
         output = output.split('Patch:\n')[-1]
@@ -35,7 +36,7 @@ class SWETask(Task):
         if api_base == 'https://api.groq.com/openai/v1':
             score_outputs = groq(prompt, n=5, model='mixtral-8x7b-32768')
         else:
-            score_outputs = gpt(prompt, n=5, model='gpt-4')
+            score_outputs = gpt(prompt, n=5, model='gpt-4-turbo')
         scores = []
         for score_output in score_outputs:
             print("score_output: ",score_output)
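[Editor's note, not part of the patch: the time.sleep(2) added after each completion call, like the time.sleep(5) calls in bfs.py, is a fixed-delay guard against provider rate limits. A sketch of a gentler alternative using the backoff package that models.py already imports, assuming the pre-1.0 openai SDK implied by the res["usage"][...] dict access; the wrapper name is illustrative, not taken from this repo:

    import backoff
    import openai  # pre-1.0 SDK assumed (dict-style responses)

    # Retry with exponential backoff on any OpenAI API error, which
    # covers rate-limit responses; give up after 60 seconds.
    @backoff.on_exception(backoff.expo, openai.error.OpenAIError, max_time=60)
    def completions_with_backoff(**kwargs):
        return openai.ChatCompletion.create(**kwargs)

A fixed sleep caps throughput even when the quota is idle, while exponential backoff only pays a delay when the API actually pushes back.]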