Mirror of https://github.com/deepseek-ai/ESFT, synced 2025-06-26 18:15:50 +00:00
streamline code; add intermediate saving support for ep
@@ -2,17 +2,18 @@
 python eval_multigpu.py \
-    --eval_datasets=translation \
+    --eval_dataset=intent \
     --base_model_path=deepseek-ai/ESFT-vanilla-lite \
-    --adapter_dir=all_models/adapters/token \
-    --output_dir=results/completions/token \
+    --adapter_dir=all_models/adapters/token/intent \
+    --output_path=results/completions/token/intent.jsonl \
     --max_new_tokens=512 \
-    --openai_api_key=REPLACE_WITH_YOUR_KEY \
     --eval_batch_size=2 \
     --world_size=4 \
+    --openai_api_key=REPLACE_WITH_YOUR_KEY \
     --gpus_per_rank=2
 
-# this script is used for single-gpu training and has been deprecated. If you have no multiple gpus, you can set above world_size=1 and gpus_per_rank=1
+# below script is used for single-gpu training and has been deprecated. If you have only one gpu, you can set above world_size=1 and gpus_per_rank=1
 
 # python scripts/eval.py \
 # --eval_datasets=translation \
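Note: with the flags above, world_size=4 ranks at gpus_per_rank=2 implies 8 visible GPUs in total. A minimal sketch of the rank-to-GPU mapping, assuming the same convention as the visible_devices line in get_expert_scores.py further down; the helper name is made up for illustration:

    # Illustrative only: map each spawned rank to a contiguous block of GPU ids.
    def rank_to_gpus(rank: int, gpus_per_rank: int) -> list:
        return list(range(rank * gpus_per_rank, (rank + 1) * gpus_per_rank))

    # world_size=4, gpus_per_rank=2 -> [[0, 1], [2, 3], [4, 5], [6, 7]]
    print([rank_to_gpus(r, gpus_per_rank=2) for r in range(4)])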
@@ -1,15 +1,17 @@
 export PYTHONPATH=$PYTHONPATH:$(pwd)
 
 python scripts/expert/get_expert_scores.py \
-    --eval_dataset=translation \
+    --eval_dataset=intent \
     --base_model_path=deepseek-ai/ESFT-vanilla-lite \
-    --output_dir=results/expert_scores/translation \
+    --output_dir=results/expert_scores/intent \
     --n_sample_tokens=131072 \
     --world_size=4 \
     --gpus_per_rank=2
 
 python scripts/expert/generate_expert_config.py \
-    --eval_datasets=intent,summary,law,translation \
-    --expert_scores_dir=results/expert_scores \
-    --output_dir=results/expert_configs \
+    --eval_dataset=intent \
+    --expert_scores_dir=results/expert_scores/intent \
+    --output_path=results/expert_configs/intent.json \
     --score_function=token \
     --top_p=0.2 # the scoring function and top_p are hyperparameters
     # --train_shared_experts
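Note: generate_expert_config.py presumably turns the accumulated per-expert scores into the set of experts to fine-tune, keeping for each layer the highest-scoring experts until their cumulative share reaches top_p (0.2 here). A minimal sketch of that cutoff under that assumption; the function and the example scores are illustrative, not the repo's implementation:

    # Illustrative only: pick the smallest set of experts whose summed score reaches top_p.
    def select_experts(scores: dict, top_p: float) -> list:
        chosen, cumulative = [], 0.0
        for expert_id, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
            if cumulative >= top_p:
                break
            chosen.append(expert_id)
            cumulative += score
        return chosen

    # e.g. layer scores summing to 1.0: experts "3" and "7" already cover 0.2
    print(select_experts({"3": 0.15, "7": 0.09, "12": 0.05, "0": 0.02}, top_p=0.2))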
@@ -30,9 +30,10 @@ def get_summary(files):
             expert_ids, expert_weights = parse_line(line)
             np.add.at(gate_scores[layer_id], expert_ids, expert_weights)
             np.add.at(token_scores[layer_id], expert_ids, np.ones_like(expert_weights) / TOP_K)
 
-    gate_scores = gate_scores / np.sum(gate_scores, axis=0)
-    token_scores = token_scores / np.sum(token_scores, axis=0)
+    total = sum(token_scores[0])
+    gate_scores = gate_scores / total
+    token_scores = token_scores / total
 
     summary = {"token_scores": token_scores, "gate_scores": gate_scores}
     summary = {k: {str(i+1): {str(j): round(v, 4) for j, v in enumerate(l)} for i, l in enumerate(v)} for k, v in summary.items()}
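Note: reading the get_summary change with gate_scores and token_scores as (num_layers, num_experts) accumulators, the old code normalized each expert column across layers, whereas the new code divides both matrices by the layer-0 token mass. Since every token contributes TOP_K entries of 1/TOP_K per layer, that total equals the number of scored tokens, so each entry becomes a per-token fraction comparable across layers and between the two score functions. A small numpy sketch of the difference, with made-up values:

    # Illustrative only: token_scores / gate_scores as (num_layers, num_experts) arrays.
    import numpy as np

    n_tokens = 4                              # each token adds TOP_K * (1/TOP_K) = 1 per layer
    token_scores = np.array([[3.0, 1.0],      # every row sums to n_tokens
                             [2.5, 1.5]])
    gate_scores = np.array([[1.6, 0.4],
                            [1.1, 0.9]])

    # Old behaviour: each expert column normalized across layers.
    old_token = token_scores / np.sum(token_scores, axis=0)

    # New behaviour: both matrices divided by the token count, giving per-token fractions.
    total = sum(token_scores[0])              # == n_tokens
    new_token = token_scores / total
    new_gate = gate_scores / total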
@@ -65,7 +66,6 @@ if __name__ == "__main__":
         for file in os.listdir(os.path.join(args.expert_scores_dir, rank)):
             file_names.append([rank, file])
 
-
     summary_file = os.path.join(args.expert_scores_dir, "summary.json")
     summary = get_summary(file_names)
 
@@ -8,7 +8,7 @@ from utils import get_formatted_input_and_target
 import torch.multiprocessing as mp
 from itertools import accumulate
 from accelerate import dispatch_model
+from tqdm import tqdm
 
 def infer_auto_device_map(model, pp_splits, visible_devices):
     assert len(pp_splits) == len(visible_devices)
@@ -27,8 +27,10 @@ def infer_auto_device_map(model, pp_splits, visible_devices):
     return device_map
 
 
-def eval_expert(rank, args, model, dataset):
+def eval_expert(rank, args, dataset):
     try:
+        model = AutoModelForCausalLM.from_pretrained(args.base_model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) # not using tokenizer here to aviod deadlock
+        model.config.log_expert_weights = True
         print(f"Rank {rank} starting expert evaluation...", flush=True)
         tokenizer = AutoTokenizer.from_pretrained(args.base_model_path)
         visible_devices = list(range(rank * args.gpus_per_rank, (rank + 1) * args.gpus_per_rank))
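Note: the signature change here pairs with the mp.spawn change in the __main__ hunk below: rather than building the model once in the parent and passing it through mp.spawn's pickled args, each spawned rank now constructs its own copy and dispatches it across its own GPU block. A minimal sketch of that pattern, with a dummy load_model standing in for the from_pretrained call:

    # Illustrative only: each worker builds its own model instead of receiving one from the parent.
    import torch.multiprocessing as mp

    def load_model():
        # Stand-in for AutoModelForCausalLM.from_pretrained(...).
        return object()

    def worker(rank, args, dataset):
        model = load_model()                     # per-process construction, nothing pickled
        shard = dataset[rank::args["world_size"]]
        print(f"rank {rank}: {len(shard)} instances", flush=True)

    if __name__ == "__main__":
        args = {"world_size": 4}
        data = list(range(10))
        mp.spawn(worker, args=(args, data), nprocs=args["world_size"], join=True)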
@@ -39,12 +41,15 @@ def eval_expert(rank, args, model, dataset):
         os.makedirs(os.path.join(args.output_dir, f"rank_{rank}"), exist_ok=True)
         done_tokens = 0
         cur_dataset = dataset[rank::args.world_size]
+        pbar = tqdm(total=n_sample_tokens, desc=f"Rank {rank} processing tokens", position=rank)
         for instance in cur_dataset:
             input_ids, target_ids = get_formatted_input_and_target(instance['messages'], tokenizer, -100)
             model(input_ids=torch.tensor(input_ids).unsqueeze(0), labels=torch.tensor(target_ids).unsqueeze(0))
             done_tokens += len(input_ids)
+            pbar.update(len(input_ids))
             if done_tokens >= n_sample_tokens:
                 break
+        pbar.close()
 
 
     except Exception as e:
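Note: each rank walks its interleaved shard dataset[rank::world_size] and stops once it has pushed n_sample_tokens tokens through the model; whether n_sample_tokens is a per-rank or pre-divided budget is not visible in this hunk. The added tqdm bar uses position=rank so the ranks get separate, non-overlapping progress lines. A minimal sketch of such a token-budget loop, with made-up lengths:

    # Illustrative only: a token-budget loop with one tqdm bar per rank.
    from tqdm import tqdm

    def consume(rank, lengths, budget):
        done = 0
        pbar = tqdm(total=budget, desc=f"Rank {rank} processing tokens", position=rank)
        for n in lengths:                  # stand-in for len(input_ids) of each instance
            done += n
            pbar.update(n)
            if done >= budget:
                break
        pbar.close()
        return done

    consume(0, [300, 500, 800, 200], budget=1000)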
@@ -64,15 +69,10 @@ if __name__ == "__main__":
     random.seed(5934875)
 
 
-    print("Loading base model...")
-    model = AutoModelForCausalLM.from_pretrained(args.base_model_path, trust_remote_code=True, torch_dtype=torch.bfloat16) # not using tokenizer here to aviod deadlock
-    model.config.log_expert_weights = True
-
-
     print(f"Running expert evaluation on {args.eval_dataset}...")
     dataset = [json.loads(i) for i in open(f"datasets/train/{args.eval_dataset}.jsonl").readlines()]
     random.shuffle(dataset)
 
 
     print("Start Evaluating...")
-    mp.spawn(eval_expert, args=(args, model, dataset), nprocs=args.world_size, join=True)
+    mp.spawn(eval_expert, args=(args, dataset), nprocs=args.world_size, join=True)
@@ -2,7 +2,7 @@
 export TOKENIZERS_PARALLELISM=false
 
 exp_name="test/eval_translation"
-base_model_path="deepseek-ai/esft-vanilla-lite"
+base_model_path="deepseek-ai/ESFT-vanilla-lite"
 # turn above to for loop
 python train.py \
     --base_model_path=${base_model_path} \
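Note: the "# turn above to for loop" comment suggests sweeping the same command over all four datasets. A minimal sketch of one way to do that, assuming train.py accepts the same flags as the train_ep.py invocation in the next hunk; the output paths are made up:

    # Illustrative only: loop the single-dataset training command over all datasets.
    import subprocess

    base_model_path = "deepseek-ai/ESFT-vanilla-lite"
    for dataset in ["intent", "summary", "law", "translation"]:
        subprocess.run(
            ["python", "train.py",
             f"--base_model_path={base_model_path}",
             f"--expert_config=results/expert_configs/{dataset}.json",
             f"--train_dataset={dataset}",
             "--train_config=configs/base.yaml",
             f"--output_dir=results/checkpoints/test/{dataset}"],
            check=True,
        )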
@@ -1,11 +1,27 @@
 export TOKENIZERS_PARALLELISM=false
 
-exp_name="test/eval_translation"
-base_model_path="deepseek-ai/esft-vanilla-lite"
+exp_name="test/eval_intent"
+base_model_path="deepseek-ai/ESFT-vanilla-lite"
 
 torchrun --nproc-per-node=8 train_ep.py \
     --base_model_path=${base_model_path} \
-    --expert_config=results/expert_configs/translation.json \
-    --train_dataset=translation \
+    --expert_config=results/expert_configs/intent.json \
+    --train_dataset=intent \
     --train_config=configs/base.yaml \
-    --output_dir=results/checkpoints/${exp_name}
+    --output_dir=results/checkpoints/${exp_name}
+
+
+cp results/expert_configs/intent.json results/checkpoints/${exp_name}/checkpoint-1/expert_cfg.json
+
+
+python eval_multigpu.py \
+    --eval_dataset=intent \
+    --base_model_path=deepseek-ai/ESFT-vanilla-lite \
+    --adapter_dir=results/checkpoints/${exp_name}/checkpoint-1 \
+    --output_path=results/completions/token/intent.jsonl \
+    --max_new_tokens=512 \
+    --eval_batch_size=2 \
+    --world_size=4 \
+    --openai_api_key=REPLACE_WITH_YOUR_KEY \
+    --gpus_per_rank=2
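Note: the new tail of this script chains training, config copying, and evaluation: train_ep.py writes checkpoint-1 (presumably via the intermediate-saving support named in the commit title), the expert config used for training is copied next to it as expert_cfg.json, and eval_multigpu.py is then pointed at that checkpoint through adapter_dir. That the eval script looks for expert_cfg.json inside adapter_dir is an inference from the cp line, not something the diff states. A minimal sketch that copies the config next to whatever checkpoint was saved last; paths are illustrative:

    # Illustrative only: put the expert config next to the newest intermediate checkpoint.
    import glob, os, shutil

    exp_dir = "results/checkpoints/test/eval_intent"         # matches ${exp_name} above
    checkpoints = sorted(glob.glob(os.path.join(exp_dir, "checkpoint-*")),
                         key=lambda p: int(p.rsplit("-", 1)[-1]))
    latest = checkpoints[-1]                                 # e.g. .../checkpoint-1
    shutil.copy("results/expert_configs/intent.json", os.path.join(latest, "expert_cfg.json"))
    print(f"expert config copied into {latest}")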