code pt2 for run_bench

emilyworks 2024-12-07 04:49:49 +00:00
parent c69681e3a0
commit e968d35da4
5 changed files with 282 additions and 86 deletions

View File

@@ -7,8 +7,14 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.quantization
from src.tot.data.benchmark.bench import *
from src.tot.prompts.bench import value_prompt, propose_prompt
from datasets import load_dataset
from torch.utils.data import DataLoader
import random
import multiprocessing
def load_llama(quant=None):
    '''Load in one of the llama models'''
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
@@ -62,19 +68,103 @@ def final_eval(gt, final_prop):
    print("THIS IS THE FINAL PROP")
    print(final_prop)
    print("THIS IS THE GT")
    print(gt)
    if gt in final_prop:
        return 1.0
    else:
        return 0.0
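Since final_eval scores by plain substring containment, a quick illustration (not part of this commit) of how it behaves:

# Illustrative only: final_eval returns 1.0 whenever the ground-truth string
# appears anywhere inside the proposed answer, and 0.0 otherwise.
assert final_eval("Choice 2", "After elimination, the answer is Choice 2") == 1.0
assert final_eval("Choice 2", "The answer is Choice 4") == 0.0
# The check is loose: a proposal that merely mentions the ground truth also scores 1.0.
assert final_eval("12", "12 is too small, so the answer must be 15") == 1.0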
def get_test_data(tokenizer, batch_size):
    '''
    Process and return the composite benchmark test data in a dataloader
    '''
    # print(tokenizer)

    # gpqa
    gpqa_raw = load_dataset("Idavidrein/gpqa", "gpqa_diamond")
    gpqa_choices = [[a, b, c, d] for a, b, c, d in
                    zip(gpqa_raw['train']['Correct Answer'], gpqa_raw['train']['Incorrect Answer 1'],
                        gpqa_raw['train']['Incorrect Answer 2'], gpqa_raw['train']['Incorrect Answer 3'])]
    for choices in gpqa_choices:
        random.shuffle(choices)
    gpqa_questions_proc = format_for_mm(gpqa_raw['train']['Question'], gpqa_choices)

    # math (for math)
    math_raw = load_dataset("lighteval/MATH", "all")

    # mmlu (for gen knowledge + reasoning)
    mmlu_raw = load_dataset("cais/mmlu", "all")
    mmlu_questions_proc_test = format_for_mm(mmlu_raw['test']['question'], mmlu_raw['test']['choices'])

    # master list - test
    sublist_input_test = gpqa_questions_proc[158:] + math_raw['test']['problem'] + mmlu_questions_proc_test
    sublist_answer_test = gpqa_raw['train']['Correct Answer'][158:] + math_raw['test']['solution'] + mmlu_raw['test']['answer']

    agg_test_set = benchmark_dataset(sublist_input_test, sublist_answer_test, tokenizer)
    return DataLoader(agg_test_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_qat)
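A minimal usage sketch for get_test_data (illustrative, not part of this commit; it assumes the same Llama tokenizer that load_llama sets up):

# Illustrative sketch: build the composite test DataLoader and inspect one batch.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tok.pad_token = tok.eos_token
loader = get_test_data(tok, batch_size=4)
batch = next(iter(loader))
# collate_fn_qat pads questions and answers to 150 tokens each.
print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['label'].shape)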
def solve(input_ids, label, mask, model, tokenizer, device, args):
    '''
    the main tot run
    '''
    problem = tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0]
    # print(problem)
    selected = ""
    for i in range(args.depth):  # args.depth number of attempts to reach the solution

        # propose next step/solutions per node/prompt
        out = model.generate(
            input_ids,
            attention_mask=mask,
            temperature=args.temperature,
            max_new_tokens=args.max_new_tokens,
            num_return_sequences=args.breadth,
        )

        # evaluate/rate the proposals
        current_state = tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0]
        proposals = []
        for o in out:
            string_answer = tokenizer.decode(o[-args.max_new_tokens:], skip_special_tokens=True)
            assert isinstance(string_answer, str)
            proposals.append(string_answer)
        valuations = value_proposals(problem=problem, current_state=current_state, proposals=proposals, tokenizer=tokenizer, model=model, device=device)

        # if the model believes it has reached the final solution before args.depth is up, break
        if 100.0 in valuations:
            break

        # select the best proposal
        val_props = list(zip(proposals, valuations))
        val_props.sort(key=lambda ele: ele[1], reverse=True)
        selected = val_props[:args.greedy_n][0][0]
        # print("THIS IS SELECTED")
        # print(selected)

        # format the chosen proposal for the next iteration
        next_prompt = propose_prompt.format(problem=problem, current_state=selected)
        inputs = tokenizer(next_prompt, return_tensors='pt')
        input_ids = inputs['input_ids'].to(device)
        mask = inputs['attention_mask'].to(device)

    # compare the proposed final answer vs the ground truth
    gt = tokenizer.batch_decode(label, skip_special_tokens=True)
    judgement = final_eval(gt[0], selected)
    return judgement
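value_proposals is defined elsewhere in the repo and is not part of this diff; based on how solve() calls it, a hypothetical stub with the expected interface might look roughly like this (the prompt handling and score mapping below are assumptions for illustration only):

# Hypothetical sketch of the interface solve() relies on (not the repo's actual code):
# it accepts these keyword arguments and returns one numeric score per proposal,
# with 100.0 signalling that a proposal is judged to be a final solution.
def value_proposals_sketch(problem, current_state, proposals, tokenizer, model, device):
    valuations = []
    for proposal in proposals:
        prompt = value_prompt.format(problem=problem, current_state=current_state, proposal=proposal)
        inputs = tokenizer(prompt, return_tensors='pt').to(device)
        out = model.generate(**inputs, max_new_tokens=8)
        answer = tokenizer.decode(out[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).lower()
        if 'yes' in answer:
            valuations.append(100.0)
        elif 'likely' in answer:
            valuations.append(50.0)
        else:
            valuations.append(0.0)
    return valuations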
def run(args):
    '''
    main run function
    '''
    ### SETUP MODEL ###
    # because of the way the original repo is structured, we need to load llama models here in run.py to avoid repeated loading in models.py
    if args.backend == 'llama':
        if args.quantize:
@@ -85,78 +175,34 @@ def run(args):
        model = None
        tokenizer = None
    ### SETUP DATA ###
    test_data = get_test_data(tokenizer, args.concurrent)

    ### OTHER SETUP ###
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
#set up
test_data = torch.load('src/tot/data/agg_dl_test.pt')
    total = 0
    right = 0
    for sample in test_data:

        # extract out the sample parts for the initial input
        input_ids = sample['input_ids'].to(device)
        label = sample['label'].to(device)
        mask = sample['attention_mask'].to(device)

        # cannot get multiple gpus, so run this on a single gpu, one sample at a time, for simplicity
        for i in range(len(input_ids)):
            judgement = solve(input_ids[i].view(1,-1), label[i].view(1,-1), mask[i].view(1,-1), model, tokenizer, device, args)
input_ids = sample['input_ids'].to(device)
label = sample['label'].to(device)
mask = sample['attention_mask'].to(device)
problem = tokenizer.decode(sample, skip_special_tokens=True)
#start solving via tot
start_timer = perf.counter()
selected = ""
for i in range(args.depth): #args.depth number of attempts to reach the solution
#propose next step/solutions per node/prompt
out = model.generate(
input_ids,
attention_mask=mask,
temperature=args.temperature,
max_new_tokens=args.max_tokens,
num_return_sequences=args.breadth)
#evaluate/rate the proposals
current_state = tokenizer.decode(input_ids, skip_special_tokens=True)
proposals = []
for o in out:
string_answer = tokenizer.decode(o)
proposals.extend([string_answer])
valuations = value_proposals(problem=problem, current_state=current_state, proposals=proposals, tokenizer=tokenizer, model=model, device=device)
#if the model believes it has reached the final solution before args.depth is up, break
if 100.0 in valuations:
break
#select the best proposal
val_props = list(zip(proposals, valuations))
val_props.sort(key = lambda ele: ele[1], descending=True)
selected = val_props[:args.greedy_n]
#format the chosen proposal for the next iteration
next_prompt = propose_prompt.format(problem=problem, current_state=selected)
inputs = tokenizer(next_prompt, return_tensors='pt')
input_ids = inputs['input_ids'].to(device)
mask = inputs['attention_mask'].to(device)
#compare the proposed final answer vs the ground truth
gt = tokenizer.decode(label, skip_special_token=True)
judgement = final_eval(gt, selected)
#keep track of the running totals
            total += 1.0
            right += judgement
print("Accuracy so far: ", total/right)
            # keep track of the running totals
            print("Accuracy so far: ", right/total)

    print("FINAL ACCURACY: ", right/total)
def parse_args():
    '''
@@ -172,6 +218,7 @@ def parse_args():
    args.add_argument('--depth', type=int, default=3)
    args.add_argument('--breadth', type=int, default=3)
    args.add_argument('--greedy_n', type=int, default=1)
    args.add_argument('--concurrent', type=int, default=4)
    args = args.parse_args()
    return args

View File

@@ -0,0 +1,79 @@
import torch
import torch.quantization
from datasets import load_dataset
from torch.utils.data import DataLoader
import random
import multiprocessing
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
class benchmark_dataset(torch.utils.data.Dataset):
    '''formats the data for dataloader'''

    def __init__(self, input, labels, tokenizer, filter_n=150):
        '''constructor. input samples and output labels'''
        self.input = input
        self.labels = labels
        self.tokenizer = tokenizer
        self.filter_len(filter_n)

    def filter_len(self, n):
        new_input = []
        new_label = []
        for q, a in zip(self.input, self.labels):
            tk_len_q = len(self.tokenizer(str(q), return_tensors='pt')['input_ids'][0])
            tk_len_a = len(self.tokenizer(str(a), return_tensors='pt')['input_ids'][0])
            if tk_len_q <= n and tk_len_a <= n:
                new_input.append(q)
                new_label.append(a)
        print(f"""
Len of Original Input: {len(self.input)}
Len of Original Labels: {len(self.labels)}
Len of New_Input: {len(new_input)}
Len of New_Label: {len(new_label)}
Sample Input, Label: {new_input[0], new_label[0]}
        """)
        self.input = new_input
        self.labels = new_label

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        return {"question": self.input[idx], "answer": self.labels[idx]}
def format_for_mm(question, choices):
    '''
    Formats questions and choices into one multiple-choice-question string
    '''
    return [f"""Choose the choice that best answer the following question:
Question:
{q.strip()}
Choices:
{c}
"""
            for q, c in zip(question, choices)]
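As a quick illustration (not part of this commit), format_for_mm returns one prompt string per question, with the choices rendered as a Python list:

# Illustrative example of format_for_mm output.
qs = ["What is 2 + 2?"]
cs = [["3", "4", "5", "6"]]
print(format_for_mm(qs, cs)[0])
# Choose the choice that best answer the following question:
# Question:
# What is 2 + 2?
# Choices:
# ['3', '4', '5', '6']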
def collate_fn_qat(batch):
    # Now collate into mini-batches
    inputs = tokenizer([i['question'] for i in batch], return_tensors='pt', padding='max_length', truncation=True, max_length=150)
    # labels = tokenizer([str(i['answer']) for i in batch], return_tensors='pt', padding='max_length', truncation=True, max_length=65)
    labels = tokenizer([str(i['answer']) for i in batch], return_tensors='pt', padding='max_length', truncation=True, max_length=150)
    # labels = [ele[-100:] for ele in labels['input_ids']]
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'label': labels['input_ids']}
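Putting this file together, a small end-to-end sketch (illustrative; the toy question and answer are made up, and the module-level tokenizer above is reused):

# Illustrative sketch: a tiny benchmark_dataset batched through collate_fn_qat.
questions = format_for_mm(["What is 2 + 2?"], [["3", "4", "5", "6"]])
answers = ["4"]
ds = benchmark_dataset(questions, answers, tokenizer, filter_n=150)
dl = DataLoader(ds, batch_size=1, shuffle=False, collate_fn=collate_fn_qat)
batch = next(iter(dl))
# Both questions and answers are padded/truncated to 150 tokens.
print(batch['input_ids'].shape, batch['label'].shape)  # torch.Size([1, 150]) torch.Size([1, 150])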

View File

@@ -62,7 +62,7 @@ def get_samples(task, x, y, n_generate_sample, prompt_sample, stop):
def solve(args, task, idx, model, tokenizer, to_print=True):
    global inference_model
    inference_model = partial(inference_model, model=model, tokenizer=tokenizer, temperature=args.temperature)
    print(inference_model)
    x = task.get_input(idx)  # input
    ys = ['']  # current output candidates
    infos = []
@@ -145,3 +145,72 @@ def naive_solve(args, task, idx, to_print=True):
    x = task.get_input(idx)  # input
    ys = get_samples(task, x, '', args.n_generate_sample, args.prompt_sample, stop=None)
    return ys, {}
def solve_bench(args, model, tokenizer, to_print=True, depth=5, breadth=2):
    global inference_model
    inference_model = partial(inference_model, model=model, tokenizer=tokenizer, temperature=args.temperature)

    ys = ['']  # current output candidates
    infos = []
    for step in range(depth):
        # print("Started Steps!!")

        # generation
        print("Started Generation...")
        new_ys_with_times = []
        for y in ys:
            generated_ys, generate_times = get_samples(task, x, y, args.n_generate_sample, prompt_sample=args.prompt_sample, stop=task.stops[step])
            new_ys_with_times.extend(zip(generated_ys, generate_times))
        new_ys, generate_times = zip(*new_ys_with_times)
        new_ys = list(new_ys)
        generate_times = list(generate_times)
        # new_ys = list(itertools.chain(new_ys))
        new_ys = ys + new_ys  # extend the current queue with the frontier
        ids = list(range(len(new_ys)))

        # evaluation
        # shouldn't worry about re-evaluating the same ys as the values should be saved in the task cache
        # but could potentially add logic to remove expanded nodes from the queue
        print("Finished Generation...Started Eval!")
        start_time = time.perf_counter()
        if args.method_evaluate == 'vote':
            values = get_votes(task, x, new_ys, args.n_evaluate_sample)
        elif args.method_evaluate == 'value':
            values, eval_times = get_values(task, x, new_ys, args.n_evaluate_sample)
        eval_time = time.perf_counter() - start_time
        print(f"Node Eval Time: {eval_time} seconds")

        # selection
        print("Finished Eval...Started Selection...")
        start_time = time.perf_counter()
        if args.method_select == 'sample':
            ps = np.array(values) / sum(values)
            select_ids = np.random.choice(ids, size=args.n_select_sample, p=ps).tolist()
        elif args.method_select == 'greedy':
            select_ids = sorted(ids, key=lambda x: values[x], reverse=True)[:args.n_select_sample]
        select_new_ys = [new_ys[select_id] for select_id in select_ids]
        selection_time = time.perf_counter() - start_time
        print(f"Selection Time: {selection_time} seconds")

        # log
        print("Finished Selection...Logging...")
        if to_print:
            sorted_new_ys, sorted_values = zip(*sorted(zip(new_ys, values), key=lambda x: x[1], reverse=True))
            print(f'-- new_ys --: {sorted_new_ys}\n-- sol values --: {sorted_values}\n-- choices --: {select_new_ys}\n-- generate times --: {generate_times}\n-- eval times --: {eval_times}\n')
        infos.append({'step': step, 'x': x, 'ys': ys, 'new_ys': new_ys, 'values': values, 'select_new_ys': select_new_ys, 'generate_times': generate_times, 'eval_times': eval_times})
        ys = select_new_ys

    if to_print:
        print(ys)
    return ys, {'steps': infos}
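To make the selection step above concrete (illustrative numbers only): with three candidates valued [1.0, 3.0, 6.0] and n_select_sample = 2, 'greedy' keeps the two highest-valued candidates, while 'sample' draws ids in proportion to the normalized values [0.1, 0.3, 0.6]:

# Illustrative numbers for the two selection modes (not part of this commit).
import numpy as np

values = [1.0, 3.0, 6.0]
ids = list(range(len(values)))
n_select_sample = 2
greedy_ids = sorted(ids, key=lambda i: values[i], reverse=True)[:n_select_sample]   # [2, 1]
ps = np.array(values) / sum(values)                                                 # [0.1, 0.3, 0.6]
sampled_ids = np.random.choice(ids, size=n_select_sample, p=ps).tolist()            # e.g. [2, 0]
print(greedy_ids, sampled_ids)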

View File

@@ -1,4 +1,4 @@
propose_prompt = '''[INST]Given a problem goal and a current state, propose one possible next step to solve the problem from the current state. If the current state solves the problem, say "current state is the solution".
###
Here are two examples to help:
@@ -19,7 +19,7 @@ Choose the choice that best answer the following question:
Choices:
['Adams only.', 'Brooks only.', 'Case only.', 'Adams and Brooks']
Current State:
Choice 2
Possible next step:
current state is the solution.
@@ -28,10 +28,10 @@ Problem Goal:
{problem}
Current State:
{current_state}
Possible next step:[/INST]
'''
value_prompt = '''[INST]Given a problem goal, a current state, and a proposed next step, evaluate if the next step from the current state can help solve or answer the problem (yes/likely/no)
###
Here are three examples to help:
Example #1:
@@ -52,7 +52,7 @@ Choose the choice that best answer the following question:
Choices:
['Adams only.', 'Brooks only.', 'Case only.', 'Adams and Brooks']
Current State:
Choice 4
Proposal:
current state is the solution.
Evaluation:
@@ -65,7 +65,7 @@ Choose the choice that best answer the following question:
Choices:
['Adams only.', 'Brooks only.', 'Case only.', 'Adams and Brooks']
Current State:
Choice 2
Proposal:
current state is the solution.
Evaluation:
@@ -78,5 +78,5 @@ Current State:
{current_state}
Proposal:
{proposal}
Evaluation:[/INST]
'''
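For reference, both templates are plain format strings; an illustrative fill-in (values made up) shows the [INST]...[/INST] wrapping the caller ends up with:

# Illustrative use of the two templates (problem/state/proposal values are made up).
from src.tot.prompts.bench import propose_prompt, value_prompt

p = propose_prompt.format(problem="What is 2 + 2?", current_state="2 + 2 = 4")
v = value_prompt.format(problem="What is 2 + 2?", current_state="2 + 2 = 4",
                        proposal="current state is the solution.")
# Both strings open with [INST] and close with [/INST] right after the final cue
# ("Possible next step:" / "Evaluation:"), so generation starts at the model's answer.
assert p.startswith("[INST]") and p.rstrip().endswith("[/INST]")
assert v.startswith("[INST]") and v.rstrip().endswith("[/INST]")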

View File

@@ -11,7 +11,7 @@ def get_current_numbers(y: str) -> str:
    return last_line.split('left: ')[-1].split(')')[0]

class Bench(Task):
    """
    Input (x) : a string of
    Output (y) : a trajectory of 3 steps to reach 24
@@ -24,7 +24,7 @@ class Game24Task(Task):
    6 * 4 = 24 (left: 24)
    (1 + 2 + 3) * 4 = 24
    """
    def __init__(self, depth=5):
        """
        file: a csv file (fixed)
        """
@@ -34,13 +34,14 @@ class Game24Task(Task):
        self.steps = depth
        self.stops = None

    # def __len__(self) -> int:
    #     return len(self.data)

    # def get_input(self, idx: int) -> str:
    #     return self.data[idx]

    def test_output(self, idx: int, output: str):
        gt =
        expression = output.strip().split('\n')[-1].lower().replace('answer: ', '').split('=')[0]
        numbers = re.findall(r'\d+', expression)
        problem_numbers = re.findall(r'\d+', self.data[idx])
@@ -55,13 +56,13 @@ class Game24Task(Task):
            print("entered exception!!")
            return {'r': 0}

    # @staticmethod
    # def standard_prompt_wrap(x: str, y:str='') -> str:
    #     return standard_prompt.format(input=x) + y

    # @staticmethod
    # def cot_prompt_wrap(x: str, y:str='') -> str:
    #     return cot_prompt.format(input=x) + y

    @staticmethod
    def propose_prompt_wrap(x: str, y: str='') -> str: