code pt2 for run_bench

emilyworks 2024-12-07 04:49:49 +00:00
parent c69681e3a0
commit e968d35da4
5 changed files with 282 additions and 86 deletions

View File

@@ -7,8 +7,14 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import torch.quantization
from src.tot.data.benchmark.bench import *
from src.tot.prompts.bench import value_prompt, propose_prompt
from datasets import load_dataset
from torch.utils.data import DataLoader
import random
import multiprocessing
def load_llama(quant=None):
    '''Load in one of the llama models'''
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
@@ -62,19 +68,103 @@ def final_eval(gt, final_prop):
    print("THIS IS THE FINAL PROP")
    print(final_prop)
    print("THIS IS THE GT")
    print(gt)
    if gt in final_prop:
        return 1.0
    else:
        return 0.0
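Since final_eval scores by plain substring containment, a quick illustration (not part of this commit) of how it behaves:

# Illustrative only: final_eval returns 1.0 whenever the ground-truth string
# appears anywhere inside the proposed answer, and 0.0 otherwise.
assert final_eval("Choice 2", "After elimination, the answer is Choice 2") == 1.0
assert final_eval("Choice 2", "The answer is Choice 4") == 0.0
# The check is loose: a proposal that merely mentions the ground truth also scores 1.0.
assert final_eval("12", "12 is too small, so the answer must be 15") == 1.0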
def get_test_data(tokenizer, batch_size):
    '''
    Process and return the composite benchmark test data in a dataloader
    '''
    # print(tokenizer)

    # gpqa
    gpqa_raw = load_dataset("Idavidrein/gpqa", "gpqa_diamond")
    gpqa_choices = [[a, b, c, d] for a, b, c, d in
                    zip(gpqa_raw['train']['Correct Answer'], gpqa_raw['train']['Incorrect Answer 1'],
                        gpqa_raw['train']['Incorrect Answer 2'], gpqa_raw['train']['Incorrect Answer 3'])]
    for choices in gpqa_choices:
        random.shuffle(choices)
    gpqa_questions_proc = format_for_mm(gpqa_raw['train']['Question'], gpqa_choices)

    # math (for math)
    math_raw = load_dataset("lighteval/MATH", "all")

    # mmlu (for gen knowledge + reasoning)
    mmlu_raw = load_dataset("cais/mmlu", "all")
    mmlu_questions_proc_test = format_for_mm(mmlu_raw['test']['question'], mmlu_raw['test']['choices'])

    # master list - test
    sublist_input_test = gpqa_questions_proc[158:] + math_raw['test']['problem'] + mmlu_questions_proc_test
    sublist_answer_test = gpqa_raw['train']['Correct Answer'][158:] + math_raw['test']['solution'] + mmlu_raw['test']['answer']

    agg_test_set = benchmark_dataset(sublist_input_test, sublist_answer_test, tokenizer)
    return DataLoader(agg_test_set, batch_size=batch_size, shuffle=True, collate_fn=collate_fn_qat)
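A minimal usage sketch for get_test_data (illustrative, not part of this commit; it assumes the same Llama tokenizer that load_llama sets up):

# Illustrative sketch: build the composite test DataLoader and inspect one batch.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tok.pad_token = tok.eos_token
loader = get_test_data(tok, batch_size=4)
batch = next(iter(loader))
# collate_fn_qat pads questions and answers to 150 tokens each.
print(batch['input_ids'].shape, batch['attention_mask'].shape, batch['label'].shape)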
def solve(input_ids, label, mask, model, tokenizer, device, args):
    '''
    the main tot run
    '''
    problem = tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0]
    # print(problem)
    selected = ""
    for i in range(args.depth):  # args.depth number of attempts to reach the solution

        # propose next step/solutions per node/prompt
        out = model.generate(
            input_ids,
            attention_mask=mask,
            temperature=args.temperature,
            max_new_tokens=args.max_new_tokens,
            num_return_sequences=args.breadth,
        )

        # evaluate/rate the proposals
        current_state = tokenizer.batch_decode(input_ids, skip_special_tokens=True)[0]
        proposals = []
        for o in out:
            string_answer = tokenizer.decode(o[-args.max_new_tokens:], skip_special_tokens=True)
            assert isinstance(string_answer, str)
            proposals.append(string_answer)
        valuations = value_proposals(problem=problem, current_state=current_state, proposals=proposals, tokenizer=tokenizer, model=model, device=device)

        # if the model believes it has reached the final solution before args.depth is up, break
        if 100.0 in valuations:
            break

        # select the best proposal
        val_props = list(zip(proposals, valuations))
        val_props.sort(key=lambda ele: ele[1], reverse=True)
        selected = val_props[:args.greedy_n][0][0]
        # print("THIS IS SELECTED")
        # print(selected)

        # format the chosen proposal for the next iteration
        next_prompt = propose_prompt.format(problem=problem, current_state=selected)
        inputs = tokenizer(next_prompt, return_tensors='pt')
        input_ids = inputs['input_ids'].to(device)
        mask = inputs['attention_mask'].to(device)

    # compare the proposed final answer vs the ground truth
    gt = tokenizer.batch_decode(label, skip_special_tokens=True)
    judgement = final_eval(gt[0], selected)
    return judgement
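value_proposals is defined elsewhere in the repo and is not part of this diff; based on how solve() calls it, a hypothetical stub with the expected interface might look roughly like this (the prompt handling and score mapping below are assumptions for illustration only):

# Hypothetical sketch of the interface solve() relies on (not the repo's actual code):
# it accepts these keyword arguments and returns one numeric score per proposal,
# with 100.0 signalling that a proposal is judged to be a final solution.
def value_proposals_sketch(problem, current_state, proposals, tokenizer, model, device):
    valuations = []
    for proposal in proposals:
        prompt = value_prompt.format(problem=problem, current_state=current_state, proposal=proposal)
        inputs = tokenizer(prompt, return_tensors='pt').to(device)
        out = model.generate(**inputs, max_new_tokens=8)
        answer = tokenizer.decode(out[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).lower()
        if 'yes' in answer:
            valuations.append(100.0)
        elif 'likely' in answer:
            valuations.append(50.0)
        else:
            valuations.append(0.0)
    return valuations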
def run(args):
    '''
    main run function
    '''
    ### SETUP MODEL ###
    # because of the way the original repo is structured, we need to load llama models here in run.py to avoid repeated loading in models.py
    if args.backend == 'llama':
        if args.quantize:
@@ -85,78 +175,34 @@ def run(args):
        model = None
        tokenizer = None
    ### SETUP DATA ###
    test_data = get_test_data(tokenizer, args.concurrent)

    ### OTHER SETUP ###
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
#set up
test_data = torch.load('src/tot/data/agg_dl_test.pt')
    total = 0
    right = 0
    for sample in test_data:

        # extract out the sample parts for the initial input
        input_ids = sample['input_ids'].to(device)
        label = sample['label'].to(device)
        mask = sample['attention_mask'].to(device)

        # cannot get multiple gpus, so run this on a single gpu, one sample at a time, for simplicity
        for i in range(len(input_ids)):
            judgement = solve(input_ids[i].view(1,-1), label[i].view(1,-1), mask[i].view(1,-1), model, tokenizer, device, args)
input_ids = sample['input_ids'].to(device)
label = sample['label'].to(device)
mask = sample['attention_mask'].to(device)
problem = tokenizer.decode(sample, skip_special_tokens=True)
#start solving via tot
start_timer = perf.counter()
selected = ""
for i in range(args.depth): #args.depth number of attempts to reach the solution
#propose next step/solutions per node/prompt
out = model.generate(
input_ids,
attention_mask=mask,
temperature=args.temperature,
max_new_tokens=args.max_tokens,
num_return_sequences=args.breadth)
#evaluate/rate the proposals
current_state = tokenizer.decode(input_ids, skip_special_tokens=True)
proposals = []
for o in out:
string_answer = tokenizer.decode(o)
proposals.extend([string_answer])
valuations = value_proposals(problem=problem, current_state=current_state, proposals=proposals, tokenizer=tokenizer, model=model, device=device)
#if the model believes it has reached the final solution before args.depth is up, break
if 100.0 in valuations:
break
#select the best proposal
val_props = list(zip(proposals, valuations))
val_props.sort(key = lambda ele: ele[1], descending=True)
selected = val_props[:args.greedy_n]
#format the chosen proposal for the next iteration
next_prompt = propose_prompt.format(problem=problem, current_state=selected)
inputs = tokenizer(next_prompt, return_tensors='pt')
input_ids = inputs['input_ids'].to(device)
mask = inputs['attention_mask'].to(device)
#compare the proposed final answer vs the ground truth
gt = tokenizer.decode(label, skip_special_token=True)
judgement = final_eval(gt, selected)
#keep track of the running totals
            total += 1.0
            right += judgement
print("Accuracy so far: ", total/right)
            # keep track of the running totals
            print("Accuracy so far: ", right/total)

    print("FINAL ACCURACY: ", right/total)
def parse_args():
    '''
@@ -172,6 +218,7 @@ def parse_args():
    args.add_argument('--depth', type=int, default=3)
    args.add_argument('--breadth', type=int, default=3)
    args.add_argument('--greedy_n', type=int, default=1)
    args.add_argument('--concurrent', type=int, default=4)
    args = args.parse_args()
    return args

View File

@@ -0,0 +1,79 @@
import torch
import torch.quantization
from datasets import load_dataset
from torch.utils.data import DataLoader
import random
import multiprocessing
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
class benchmark_dataset(torch.utils.data.Dataset):
    '''formats the data for dataloader'''

    def __init__(self, input, labels, tokenizer, filter_n=150):
        '''constructor. input samples and output labels'''
        self.input = input
        self.labels = labels
        self.tokenizer = tokenizer
        self.filter_len(filter_n)

    def filter_len(self, n):
        new_input = []
        new_label = []
        for q, a in zip(self.input, self.labels):
            tk_len_q = len(self.tokenizer(str(q), return_tensors='pt')['input_ids'][0])
            tk_len_a = len(self.tokenizer(str(a), return_tensors='pt')['input_ids'][0])
            if tk_len_q <= n and tk_len_a <= n:
                new_input.append(q)
                new_label.append(a)
        print(f"""
Len of Original Input: {len(self.input)}
Len of Original Labels: {len(self.labels)}
Len of New_Input: {len(new_input)}
Len of New_Label: {len(new_label)}
Sample Input, Label: {new_input[0], new_label[0]}
        """)
        self.input = new_input
        self.labels = new_label

    def __len__(self):
        return len(self.input)

    def __getitem__(self, idx):
        return {"question": self.input[idx], "answer": self.labels[idx]}
def format_for_mm(question, choices):
    '''
    Formats questions and choices into one multiple-choice-question string
    '''
    return [f"""Choose the choice that best answer the following question:
Question:
{q.strip()}
Choices:
{c}
"""
            for q, c in zip(question, choices)]
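As a quick illustration (not part of this commit), format_for_mm returns one prompt string per question, with the choices rendered as a Python list:

# Illustrative example of format_for_mm output.
qs = ["What is 2 + 2?"]
cs = [["3", "4", "5", "6"]]
print(format_for_mm(qs, cs)[0])
# Choose the choice that best answer the following question:
# Question:
# What is 2 + 2?
# Choices:
# ['3', '4', '5', '6']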
def collate_fn_qat(batch):
    # Now collate into mini-batches
    inputs = tokenizer([i['question'] for i in batch], return_tensors='pt', padding='max_length', truncation=True, max_length=150)
    # labels = tokenizer([str(i['answer']) for i in batch], return_tensors='pt', padding='max_length', truncation=True, max_length=65)
    labels = tokenizer([str(i['answer']) for i in batch], return_tensors='pt', padding='max_length', truncation=True, max_length=150)
    # labels = [ele[-100:] for ele in labels['input_ids']]
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'label': labels['input_ids']}
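Putting this file together, a small end-to-end sketch (illustrative; the toy question and answer are made up, and the module-level tokenizer above is reused):

# Illustrative sketch: a tiny benchmark_dataset batched through collate_fn_qat.
questions = format_for_mm(["What is 2 + 2?"], [["3", "4", "5", "6"]])
answers = ["4"]
ds = benchmark_dataset(questions, answers, tokenizer, filter_n=150)
dl = DataLoader(ds, batch_size=1, shuffle=False, collate_fn=collate_fn_qat)
batch = next(iter(dl))
# Both questions and answers are padded/truncated to 150 tokens.
print(batch['input_ids'].shape, batch['label'].shape)  # torch.Size([1, 150]) torch.Size([1, 150])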

View File

@@ -62,7 +62,7 @@ def get_samples(task, x, y, n_generate_sample, prompt_sample, stop):
def solve(args, task, idx, model, tokenizer, to_print=True):
    global inference_model
    inference_model = partial(inference_model, model=model, tokenizer=tokenizer, temperature=args.temperature)
    print(inference_model)
    x = task.get_input(idx)  # input
    ys = ['']  # current output candidates
    infos = []
@@ -145,3 +145,72 @@ def naive_solve(args, task, idx, to_print=True):
    x = task.get_input(idx)  # input
    ys = get_samples(task, x, '', args.n_generate_sample, args.prompt_sample, stop=None)
    return ys, {}
def solve_bench(args, model, tokenizer, to_print=True, depth=5, breadth=2):
    global inference_model
    inference_model = partial(inference_model, model=model, tokenizer=tokenizer, temperature=args.temperature)

    ys = ['']  # current output candidates
    infos = []
    for step in range(depth):
        # print("Started Steps!!")

        # generation
        print("Started Generation...")
        new_ys_with_times = []
        for y in ys:
            generated_ys, generate_times = get_samples(task, x, y, args.n_generate_sample, prompt_sample=args.prompt_sample, stop=task.stops[step])
            new_ys_with_times.extend(zip(generated_ys, generate_times))
        new_ys, generate_times = zip(*new_ys_with_times)
        new_ys = list(new_ys)
        generate_times = list(generate_times)
        # new_ys = list(itertools.chain(new_ys))
        new_ys = ys + new_ys  # extend the current queue with the frontier
        ids = list(range(len(new_ys)))

        # evaluation
        # shouldn't worry about re-evaluating the same ys as the values should be saved in the task cache
        # but could potentially add logic to remove expanded nodes from the queue
        print("Finished Generation...Started Eval!")
        start_time = time.perf_counter()
        if args.method_evaluate == 'vote':
            values = get_votes(task, x, new_ys, args.n_evaluate_sample)
        elif args.method_evaluate == 'value':
            values, eval_times = get_values(task, x, new_ys, args.n_evaluate_sample)
        eval_time = time.perf_counter() - start_time
        print(f"Node Eval Time: {eval_time} seconds")

        # selection
        print("Finished Eval...Started Selection...")
        start_time = time.perf_counter()
        if args.method_select == 'sample':
            ps = np.array(values) / sum(values)
            select_ids = np.random.choice(ids, size=args.n_select_sample, p=ps).tolist()
        elif args.method_select == 'greedy':
            select_ids = sorted(ids, key=lambda x: values[x], reverse=True)[:args.n_select_sample]
        select_new_ys = [new_ys[select_id] for select_id in select_ids]
        selection_time = time.perf_counter() - start_time
        print(f"Selection Time: {selection_time} seconds")

        # log
        print("Finished Selection...Logging...")
        if to_print:
            sorted_new_ys, sorted_values = zip(*sorted(zip(new_ys, values), key=lambda x: x[1], reverse=True))
            print(f'-- new_ys --: {sorted_new_ys}\n-- sol values --: {sorted_values}\n-- choices --: {select_new_ys}\n-- generate times --: {generate_times}\n-- eval times --: {eval_times}\n')
        infos.append({'step': step, 'x': x, 'ys': ys, 'new_ys': new_ys, 'values': values, 'select_new_ys': select_new_ys, 'generate_times': generate_times, 'eval_times': eval_times})
        ys = select_new_ys

    if to_print:
        print(ys)
    return ys, {'steps': infos}
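To make the selection step above concrete (illustrative numbers only): with three candidates valued [1.0, 3.0, 6.0] and n_select_sample = 2, 'greedy' keeps the two highest-valued candidates, while 'sample' draws ids in proportion to the normalized values [0.1, 0.3, 0.6]:

# Illustrative numbers for the two selection modes (not part of this commit).
import numpy as np

values = [1.0, 3.0, 6.0]
ids = list(range(len(values)))
n_select_sample = 2
greedy_ids = sorted(ids, key=lambda i: values[i], reverse=True)[:n_select_sample]   # [2, 1]
ps = np.array(values) / sum(values)                                                 # [0.1, 0.3, 0.6]
sampled_ids = np.random.choice(ids, size=n_select_sample, p=ps).tolist()            # e.g. [2, 0]
print(greedy_ids, sampled_ids)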

View File

@@ -1,4 +1,4 @@
propose_prompt = '''[INST]Given a problem goal and a current state, propose one possible next step to solve the problem from the current state. If the current state solves the problem, say "current state is the solution".
###
Here are two examples to help:
@@ -19,7 +19,7 @@ Choose the choice that best answer the following question:
Choices:
['Adams only.', 'Brooks only.', 'Case only.', 'Adams and Brooks']
Current State:
Choice 2
Possible next step:
current state is the solution.
@@ -28,10 +28,10 @@ Problem Goal:
{problem}
Current State:
{current_state}
Possible next step:[/INST]
'''
value_prompt = '''[INST]Given a problem goal, a current state, and a proposed next step, evaluate if the next step from the current state can help solve or answer the problem (yes/likely/no)
###
Here are three examples to help:
Example #1:
@@ -52,7 +52,7 @@ Choose the choice that best answer the following question:
Choices:
['Adams only.', 'Brooks only.', 'Case only.', 'Adams and Brooks']
Current State:
Choice 4
Proposal:
current state is the solution.
Evaluation:
@@ -65,7 +65,7 @@ Choose the choice that best answer the following question:
Choices:
['Adams only.', 'Brooks only.', 'Case only.', 'Adams and Brooks']
Current State:
Choice 2
Proposal:
current state is the solution.
Evaluation:
@@ -78,5 +78,5 @@ Current State:
{current_state}
Proposal:
{proposal}
Evaluation:[/INST]
'''
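For reference, both templates are plain format strings; an illustrative fill-in (values made up) shows the [INST]...[/INST] wrapping the caller ends up with:

# Illustrative use of the two templates (problem/state/proposal values are made up).
from src.tot.prompts.bench import propose_prompt, value_prompt

p = propose_prompt.format(problem="What is 2 + 2?", current_state="2 + 2 = 4")
v = value_prompt.format(problem="What is 2 + 2?", current_state="2 + 2 = 4",
                        proposal="current state is the solution.")
# Both strings open with [INST] and close with [/INST] right after the final cue
# ("Possible next step:" / "Evaluation:"), so generation starts at the model's answer.
assert p.startswith("[INST]") and p.rstrip().endswith("[/INST]")
assert v.startswith("[INST]") and v.rstrip().endswith("[/INST]")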

View File

@@ -11,7 +11,7 @@ def get_current_numbers(y: str) -> str:
    return last_line.split('left: ')[-1].split(')')[0]

class Bench(Task):
    """
    Input (x) : a string of
    Output (y) : a trajectory of 3 steps to reach 24
@@ -24,7 +24,7 @@ class Game24Task(Task):
    6 * 4 = 24 (left: 24)
    (1 + 2 + 3) * 4 = 24
    """
    def __init__(self, depth=5):
        """
        file: a csv file (fixed)
        """
@@ -34,13 +34,14 @@ class Game24Task(Task):
        self.steps = depth
        self.stops = None

    # def __len__(self) -> int:
    #     return len(self.data)

    # def get_input(self, idx: int) -> str:
    #     return self.data[idx]

    def test_output(self, idx: int, output: str):
        gt =
        expression = output.strip().split('\n')[-1].lower().replace('answer: ', '').split('=')[0]
        numbers = re.findall(r'\d+', expression)
        problem_numbers = re.findall(r'\d+', self.data[idx])
@@ -55,13 +56,13 @@ class Game24Task(Task):
            print("entered exception!!")
            return {'r': 0}

    # @staticmethod
    # def standard_prompt_wrap(x: str, y:str='') -> str:
    #     return standard_prompt.format(input=x) + y

    # @staticmethod
    # def cot_prompt_wrap(x: str, y:str='') -> str:
    #     return cot_prompt.format(input=x) + y

    @staticmethod
    def propose_prompt_wrap(x: str, y: str='') -> str: