mirror of
https://github.com/deepseek-ai/ESFT
synced 2025-06-26 18:15:50 +00:00
update eval and readme
This commit is contained in:
@@ -1,12 +1,24 @@
|
||||
# first, download adapter models and put them to the corresponding directories
|
||||
# first: download adapter models and put them to the corresponding directories
|
||||
|
||||
|
||||
python scripts/eval.py \
|
||||
python eval_multigpu.py \
|
||||
--eval_datasets=translation \
|
||||
--base_model_path=deepseek-ai/ESFT-vanilla-lite \
|
||||
--adapter_dir=all_models/adapters/token \
|
||||
--output_dir=results/completions/token \
|
||||
--max_new_tokens=512 \
|
||||
--openai_api_key=REPLACE_WITH_YOUR_KEY \
|
||||
--eval_batch_size=2
|
||||
--eval_batch_size=2 \
|
||||
--world_size=4 \
|
||||
--gpus_per_rank=2
|
||||
|
||||
# this script is used for single-gpu training and has been deprecated. If you have no multiple gpus, you can set above world_size=1 and gpus_per_rank=1
|
||||
|
||||
# python scripts/eval.py \
|
||||
# --eval_datasets=translation \
|
||||
# --base_model_path=deepseek-ai/ESFT-vanilla-lite \
|
||||
# --adapter_dir=all_models/adapters/token \
|
||||
# --output_dir=results/completions/token \
|
||||
# --max_new_tokens=512 \
|
||||
# --openai_api_key=REPLACE_WITH_YOUR_KEY \
|
||||
# --eval_batch_size=2
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
python scripts/get_expert_scores.py \
|
||||
--eval_datasets=intent,summary,law,translation \
|
||||
python scripts/expert/get_expert_scores.py \
|
||||
--eval_dataset=translation \
|
||||
--base_model_path=deepseek-ai/ESFT-vanilla-lite \
|
||||
--output_dir=results/expert_scores \
|
||||
--n_sample_tokens=8192 # this sample size is a hyperparameter
|
||||
--output_dir=results/expert_scores/translation \
|
||||
--n_sample_tokens=131072 \
|
||||
--world_size=4 \
|
||||
--gpus_per_rank=2
|
||||
|
||||
python scripts/generate_expert_config.py \
|
||||
python scripts/expert/generate_expert_config.py \
|
||||
--eval_datasets=intent,summary,law,translation \
|
||||
--expert_scores_dir=results/expert_scores \
|
||||
--output_dir=results/expert_configs \
|
||||
97
scripts/expert/generate_expert_config.py
Normal file
97
scripts/expert/generate_expert_config.py
Normal file
@@ -0,0 +1,97 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
from multiprocessing import Pool
|
||||
import numpy as np
|
||||
|
||||
|
||||
def parse_line(line):
|
||||
expert_ids, expert_weights = line.split("\t\t")
|
||||
expert_ids = [int(i) for i in expert_ids.split("\t")]
|
||||
expert_weights = [float(i) for i in expert_weights.split("\t")]
|
||||
return expert_ids, expert_weights
|
||||
|
||||
|
||||
def get_summary(files):
|
||||
TOP_K=6
|
||||
N_EXPERTS=64
|
||||
N_LAYERS=26 # 27 layers totally, the first layer is not MoE
|
||||
|
||||
gate_scores = np.zeros((N_LAYERS, N_EXPERTS))
|
||||
token_scores = np.zeros((N_LAYERS, N_EXPERTS))
|
||||
|
||||
print("loading files")
|
||||
for rank, file in files:
|
||||
layer_id = int(file.split(".")[0].split("_")[2]) - 1
|
||||
|
||||
with open(os.path.join(args.expert_scores_dir, rank, file)) as f:
|
||||
data = f.readlines()
|
||||
for line in data:
|
||||
expert_ids, expert_weights = parse_line(line)
|
||||
np.add.at(gate_scores[layer_id], expert_ids, expert_weights)
|
||||
np.add.at(token_scores[layer_id], expert_ids, np.ones_like(expert_weights) / TOP_K)
|
||||
|
||||
gate_scores = gate_scores / np.sum(gate_scores, axis=0)
|
||||
token_scores = token_scores / np.sum(token_scores, axis=0)
|
||||
|
||||
summary = {"token_scores": token_scores, "gate_scores": gate_scores}
|
||||
summary = {k: {str(i+1): {str(j): round(v, 4) for j, v in enumerate(l)} for i, l in enumerate(v)} for k, v in summary.items()}
|
||||
|
||||
return summary
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--eval_dataset", type=str, required=True)
|
||||
parser.add_argument("--expert_scores_dir", type=str, required=True)
|
||||
parser.add_argument("--output_path", type=str, required=True)
|
||||
parser.add_argument("--score_function", type=str, required=True)
|
||||
parser.add_argument("--top_p", type=float, required=True)
|
||||
parser.add_argument("--train_shared_experts", action="store_true")
|
||||
parser.add_argument("--train_non_expert_modules", action="store_true")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
expert_cfg = { # initialize expert config
|
||||
"experts": {},
|
||||
"shared_experts": args.train_shared_experts,
|
||||
"non_expert_modules": args.train_non_expert_modules
|
||||
}
|
||||
|
||||
# let's walk inside args.expert_scores_dir and get abs file names
|
||||
file_names = []
|
||||
for rank in [i for i in os.listdir(args.expert_scores_dir) if 'rank' in i]:
|
||||
for file in os.listdir(os.path.join(args.expert_scores_dir, rank)):
|
||||
file_names.append([rank, file])
|
||||
|
||||
|
||||
summary_file = os.path.join(args.expert_scores_dir, "summary.json")
|
||||
summary = get_summary(file_names)
|
||||
|
||||
with open(summary_file, "w") as f:
|
||||
f.write(json.dumps(summary))
|
||||
|
||||
|
||||
scores = summary[f"{args.score_function}_scores"]
|
||||
for layer, l_score in scores.items():
|
||||
l_score = [(int(k), v) for k,v in l_score.items()]
|
||||
l_score = sorted(l_score, key=lambda x: x[1], reverse=True)
|
||||
selected_experts = []
|
||||
current_score = 0
|
||||
for expert, score in l_score:
|
||||
if current_score >= args.top_p:
|
||||
break
|
||||
selected_experts.append(expert)
|
||||
current_score += score
|
||||
expert_cfg["experts"][layer] = selected_experts
|
||||
|
||||
top_p = args.top_p
|
||||
train_shared_experts = args.train_shared_experts
|
||||
train_non_expert_modules = args.train_non_expert_modules
|
||||
|
||||
|
||||
|
||||
os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
|
||||
with open(args.output_path, "w") as f:
|
||||
json.dump(expert_cfg, f)
|
||||
78
scripts/expert/get_expert_scores.py
Normal file
78
scripts/expert/get_expert_scores.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import json
|
||||
import os
|
||||
import torch
|
||||
import argparse
|
||||
import random
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from utils import get_formatted_input_and_target
|
||||
import torch.multiprocessing as mp
|
||||
from itertools import accumulate
|
||||
from accelerate import dispatch_model
|
||||
|
||||
|
||||
def infer_auto_device_map(model, pp_splits, visible_devices):
|
||||
assert len(pp_splits) == len(visible_devices)
|
||||
device_map = {
|
||||
"model.embed_tokens": 0,
|
||||
"model.norm": len(pp_splits) - 1,
|
||||
"lm_head": len(pp_splits) - 1
|
||||
}
|
||||
assert len(model.model.layers) == sum(pp_splits)
|
||||
pp_splits = [0, *list(accumulate(pp_splits))]
|
||||
for idx, (start, end) in enumerate(zip(pp_splits[:-1], pp_splits[1:])):
|
||||
for i in range(start, end):
|
||||
device_map.update({f"model.layers.{i}": idx})
|
||||
for k, v in device_map.items():
|
||||
device_map[k] = visible_devices[v]
|
||||
return device_map
|
||||
|
||||
|
||||
def eval_expert(rank, args, model, dataset):
|
||||
try:
|
||||
print(f"Rank {rank} starting expert evaluation...", flush=True)
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.base_model_path)
|
||||
visible_devices = list(range(rank * args.gpus_per_rank, (rank + 1) * args.gpus_per_rank))
|
||||
device_map = infer_auto_device_map(model, [14, 13], visible_devices)
|
||||
model = dispatch_model(model, device_map)
|
||||
model.config.expert_log_dir = os.path.join(args.output_dir, f"rank_{rank}")
|
||||
n_sample_tokens = args.n_sample_tokens // args.world_size
|
||||
os.makedirs(os.path.join(args.output_dir, f"rank_{rank}"), exist_ok=True)
|
||||
done_tokens = 0
|
||||
cur_dataset = dataset[rank::args.world_size]
|
||||
for instance in cur_dataset:
|
||||
input_ids, target_ids = get_formatted_input_and_target(instance['messages'], tokenizer, -100)
|
||||
model(input_ids=torch.tensor(input_ids).unsqueeze(0), labels=torch.tensor(target_ids).unsqueeze(0))
|
||||
done_tokens += len(input_ids)
|
||||
if done_tokens >= n_sample_tokens:
|
||||
break
|
||||
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error in process {rank}: {e}", flush=True)
|
||||
raise
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Evaluate a model with adapters on a specified dataset.")
|
||||
parser.add_argument("--eval_dataset", type=str, required=True, help="Name of the evaluation dataset")
|
||||
parser.add_argument("--base_model_path", type=str, required=True, help="Path to the base model")
|
||||
parser.add_argument("--output_dir", type=str, required=True, help="Path to save the evaluation results")
|
||||
parser.add_argument("--world_size", type=int, default=4, help="Number of processes to use for evaluation")
|
||||
parser.add_argument("--gpus_per_rank", type=int, default=2, help="Number of GPUs per process")
|
||||
parser.add_argument("--n_sample_tokens", type=int, required=True, help="Token to sample for expert evaluation")
|
||||
args = parser.parse_args()
|
||||
random.seed(5934875)
|
||||
|
||||
|
||||
print("Loading base model...")
|
||||
model = AutoModelForCausalLM.from_pretrained(args.base_model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) # not using tokenizer here to aviod deadlock
|
||||
model.config.log_expert_weights = True
|
||||
|
||||
|
||||
print(f"Running expert evaluation on {args.eval_dataset}...")
|
||||
dataset = [json.loads(i) for i in open(f"datasets/train/{args.eval_dataset}.jsonl").readlines()]
|
||||
random.shuffle(dataset)
|
||||
|
||||
|
||||
print("Start Evaluating...")
|
||||
mp.spawn(eval_expert, args=(args, model, dataset), nprocs=args.world_size, join=True)
|
||||
@@ -1,50 +0,0 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--eval_datasets", type=str, required=True)
|
||||
parser.add_argument("--expert_scores_dir", type=str, required=True)
|
||||
parser.add_argument("--output_dir", type=str, required=True)
|
||||
parser.add_argument("--score_function", type=str, required=True)
|
||||
parser.add_argument("--top_p", type=float, required=True)
|
||||
parser.add_argument("--train_shared_experts", action="store_true")
|
||||
parser.add_argument("--train_non_expert_modules", action="store_true")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
eval_datasets = args.eval_datasets.split(",")
|
||||
expert_scores_dir = args.expert_scores_dir
|
||||
output_dir = args.output_dir
|
||||
score_function = args.score_function
|
||||
top_p = args.top_p
|
||||
train_shared_experts = args.train_shared_experts
|
||||
train_non_expert_modules = args.train_non_expert_modules
|
||||
|
||||
for dataset_name in eval_datasets:
|
||||
summary_file = f"{expert_scores_dir}/{dataset_name}/summary.json"
|
||||
expert_cfg = {"experts": {}, "shared_experts": train_shared_experts, "non_expert_modules": train_non_expert_modules}
|
||||
|
||||
with open(summary_file) as f:
|
||||
data = json.load(f)
|
||||
assert score_function in ["gate", "token"], f"Unknown score function: {score_function}"
|
||||
scores = data[f"{score_function}_scores"]
|
||||
|
||||
for layer, l_score in scores.items():
|
||||
l_score = [(int(k), v) for k,v in l_score.items()]
|
||||
l_score = sorted(l_score, key=lambda x: x[1], reverse=True)
|
||||
# get the top experts that make the threshold exceed top_p
|
||||
selected_experts = []
|
||||
current_score = 0
|
||||
for expert, score in l_score:
|
||||
if current_score >= top_p:
|
||||
break
|
||||
selected_experts.append(expert)
|
||||
current_score += score
|
||||
expert_cfg["experts"][layer] = selected_experts
|
||||
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
with open(f"{output_dir}/{dataset_name}.json", "w") as f:
|
||||
json.dump(expert_cfg, f)
|
||||
|
||||
|
||||
@@ -1,75 +0,0 @@
|
||||
import json
|
||||
from benchmarks import *
|
||||
import os
|
||||
import torch
|
||||
from torch import nn
|
||||
import argparse
|
||||
from random import shuffle
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from utils import get_formatted_input_and_target
|
||||
|
||||
# constants for deepseek-v2-lite
|
||||
TOP_K=6
|
||||
N_EXPERTS=64
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--base_model_path", type=str, required=True)
|
||||
parser.add_argument("--eval_datasets", type=str, required=True)
|
||||
parser.add_argument("--output_dir", type=str, required=True)
|
||||
parser.add_argument("--n_sample_tokens", type=int, required=True)
|
||||
args = parser.parse_args()
|
||||
|
||||
eval_datasets = args.eval_datasets.split(",")
|
||||
output_dir = args.output_dir
|
||||
base_model_path = args.base_model_path
|
||||
n_sample_tokens = args.n_sample_tokens
|
||||
|
||||
model, tokenizer = AutoModelForCausalLM.from_pretrained(base_model_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"), AutoTokenizer.from_pretrained(base_model_path)
|
||||
model.config.log_expert_weights = True
|
||||
|
||||
for dataset_name in eval_datasets:
|
||||
dataset = [json.loads(i) for i in open(f"datasets/train/{dataset_name}.jsonl").readlines()]
|
||||
shuffle(dataset)
|
||||
model.config.expert_log_dir = os.path.join(args.output_dir, dataset_name)
|
||||
# make dir -p this
|
||||
os.makedirs(os.path.join(args.output_dir, dataset_name), exist_ok=True)
|
||||
done_tokens = 0
|
||||
for instance in dataset:
|
||||
input_ids, target_ids = get_formatted_input_and_target(instance['messages'], tokenizer, -100)
|
||||
model(input_ids=torch.tensor(input_ids).unsqueeze(0), labels=torch.tensor(target_ids).unsqueeze(0))
|
||||
done_tokens += len(input_ids)
|
||||
if done_tokens >= n_sample_tokens:
|
||||
break
|
||||
|
||||
# open all files under os.path.join(args.output_dir, dataset_name). For each file, generate a summary of it
|
||||
# and write it to a file in the same directory
|
||||
files = os.listdir(os.path.join(args.output_dir, dataset_name))
|
||||
summary_file = os.path.join(args.output_dir, dataset_name, "summary.json")
|
||||
token_scores = {}
|
||||
gate_scores = {}
|
||||
|
||||
for file in files:
|
||||
if not file.endswith(".txt"):
|
||||
continue
|
||||
layer_idx = file.split("_")[2].split(".")[0]
|
||||
token_scores[layer_idx] = {expert:0 for expert in range(N_EXPERTS)}
|
||||
gate_scores[layer_idx] = {expert:0 for expert in range(N_EXPERTS)}
|
||||
|
||||
with open(os.path.join(args.output_dir, dataset_name, file)) as f:
|
||||
data = f.readlines()
|
||||
for line in data:
|
||||
expert_ids, expert_weights = line.split("\t\t")
|
||||
expert_ids = [int(i) for i in expert_ids.split("\t")]
|
||||
expert_weights = [float(i) for i in expert_weights.split("\t")]
|
||||
for expert_id, expert_weight in zip(expert_ids, expert_weights):
|
||||
gate_scores[layer_idx][expert_id] += expert_weight
|
||||
token_scores[layer_idx][expert_id] += 1. / TOP_K
|
||||
total = sum(token_scores[layer_idx].values())
|
||||
gate_scores[layer_idx] = {expert: round(gate_scores[layer_idx][expert] / total, 4) for expert in gate_scores[layer_idx]}
|
||||
token_scores[layer_idx] = {expert: round(token_scores[layer_idx][expert] / total, 4) for expert in token_scores[layer_idx]}
|
||||
|
||||
|
||||
with open(summary_file, "w") as f:
|
||||
f.write(json.dumps({"token_scores": token_scores, "gate_scores": gate_scores}))
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
export TOKENIZERS_PARALLELISM=false
|
||||
|
||||
exp_name="test/eval_translation"
|
||||
base_model_path="/hf3fs-jd/prod/deepseek/shared/wangzihan/models/huggingface/vanilla_model"
|
||||
base_model_path="deepseek-ai/esft-vanilla-lite"
|
||||
# turn above to for loop
|
||||
python train.py \
|
||||
--base_model_path=${base_model_path} \
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
export TOKENIZERS_PARALLELISM=false
|
||||
|
||||
exp_name="test/eval_translation"
|
||||
base_model_path="/hf3fs-jd/prod/deepseek/shared/wangzihan/models/huggingface/vanilla_model"
|
||||
base_model_path="deepseek-ai/esft-vanilla-lite"
|
||||
torchrun --nproc-per-node=8 train_ep.py \
|
||||
--base_model_path=${base_model_path} \
|
||||
--expert_config=results/expert_configs/translation.json \
|
||||
|
||||
Reference in New Issue
Block a user