mirror of
https://github.com/deepseek-ai/ESFT
synced 2024-11-22 03:27:38 +00:00
18d23501ab
update readme update readme update readme Update benchmarks.py Update download_adapters.sh Update esft.py
76 lines
3.3 KiB
Python
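"""Profile expert routing for the ESFT pipeline.

Runs up to --n_sample_tokens tokens from each training dataset through the
base MoE model, dumps the per-layer expert choices to text logs, then
summarizes them into per-expert gate and token scores.

Illustrative invocation (script name, model path, and dataset names are
examples, not fixed by this file):

    python get_expert_scores.py \
        --base_model_path deepseek-ai/DeepSeek-V2-Lite \
        --eval_datasets intent,summary \
        --output_dir results/expert_scores \
        --n_sample_tokens 131072
"""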
import json
import os
import argparse
from random import shuffle

import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer

from benchmarks import *
from utils import get_formatted_input_and_target

# Router constants for deepseek-v2-lite: the gate selects TOP_K of the
# N_EXPERTS routed experts in each MoE layer for every token.
TOP_K = 6
N_EXPERTS = 64
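
# --eval_datasets is a comma-separated list; each name must exist as
# datasets/train/<name>.jsonl. --n_sample_tokens caps how many tokens are
# pushed through the model per dataset.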
parser = argparse.ArgumentParser()
parser.add_argument("--base_model_path", type=str, required=True)
parser.add_argument("--eval_datasets", type=str, required=True)
parser.add_argument("--output_dir", type=str, required=True)
parser.add_argument("--n_sample_tokens", type=int, required=True)
args = parser.parse_args()

eval_datasets = args.eval_datasets.split(",")
output_dir = args.output_dir
base_model_path = args.base_model_path
n_sample_tokens = args.n_sample_tokens
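
# Load the base model in bf16 and let accelerate place it across the
# available devices; trust_remote_code pulls in DeepSeek's custom MoE
# modeling code.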
model = AutoModelForCausalLM.from_pretrained(
    base_model_path, trust_remote_code=True,
    torch_dtype=torch.bfloat16, device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
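# Flag consumed by the repo's patched modeling code (not standard
# transformers): when set, the selected expert ids and gate weights for
# every token are appended to text files under model.config.expert_log_dir
# (one file per MoE layer).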
model.config.log_expert_weights = True
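
# For each dataset: stream shuffled training examples through the model
# until roughly n_sample_tokens tokens have been processed; expert choices
# are logged as a side effect of each forward pass.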
for dataset_name in eval_datasets:
    with open(f"datasets/train/{dataset_name}.jsonl") as f:
        dataset = [json.loads(line) for line in f]
    shuffle(dataset)
    model.config.expert_log_dir = os.path.join(output_dir, dataset_name)
    os.makedirs(os.path.join(output_dir, dataset_name), exist_ok=True)
    done_tokens = 0
    for instance in dataset:
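        # Chat-format the example; -100 marks positions the loss ignores.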
        input_ids, target_ids = get_formatted_input_and_target(instance['messages'], tokenizer, -100)
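        # A single forward pass is enough: routing is logged as a side
        # effect, and the returned loss is discarded.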
        model(input_ids=torch.tensor(input_ids).unsqueeze(0), labels=torch.tensor(target_ids).unsqueeze(0))
        done_tokens += len(input_ids)
        if done_tokens >= n_sample_tokens:
            break

    # Summarize every per-layer log file under the dataset's output
    # directory into a single summary.json in the same directory.
    files = os.listdir(os.path.join(output_dir, dataset_name))
    summary_file = os.path.join(output_dir, dataset_name, "summary.json")
    token_scores = {}
    gate_scores = {}

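    # token_scores credits each selected expert with 1/TOP_K of a token;
    # gate_scores accumulates the raw gate weights.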
    for file in files:
        if not file.endswith(".txt"):
            continue
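        # The layer index is assumed to be the third underscore-separated
        # field of the log file name.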
        layer_idx = file.split("_")[2].split(".")[0]
        token_scores[layer_idx] = {expert: 0 for expert in range(N_EXPERTS)}
        gate_scores[layer_idx] = {expert: 0 for expert in range(N_EXPERTS)}

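        # Each log line holds the selected expert ids and their gate weights
        # as two tab-separated lists, joined by a double tab.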
        with open(os.path.join(output_dir, dataset_name, file)) as f:
            data = f.readlines()
            for line in data:
                expert_ids, expert_weights = line.split("\t\t")
                expert_ids = [int(i) for i in expert_ids.split("\t")]
                expert_weights = [float(i) for i in expert_weights.split("\t")]
                for expert_id, expert_weight in zip(expert_ids, expert_weights):
                    gate_scores[layer_idx][expert_id] += expert_weight
                    token_scores[layer_idx][expert_id] += 1. / TOP_K
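        # Normalize both score tables by the layer's total token mass.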
        total = sum(token_scores[layer_idx].values())
        gate_scores[layer_idx] = {expert: round(gate_scores[layer_idx][expert] / total, 4) for expert in gate_scores[layer_idx]}
        token_scores[layer_idx] = {expert: round(token_scores[layer_idx][expert] / total, 4) for expert in token_scores[layer_idx]}

    with open(summary_file, "w") as f:
        f.write(json.dumps({"token_scores": token_scores, "gate_scores": gate_scores}))