# Training configuration.
# Mirror of https://github.com/deepseek-ai/ESFT
# (synced 2024-11-26 05:38:07 +00:00).
#
# Flat mapping of run settings. All keys are top-level; section comments
# below only group related keys for readability.
---
# Random seed for the run.
seed: 5934875

# Model settings
seq_length: 4096  # Maximum sequence length

# Data settings
per_device_batch_size: 1
n_device: 8  # Number of devices

# Training settings
optim: adamw_torch_fused
steps: 500  # Number of training steps
# Kept in plain decimal form: exponent notation without a dot (e.g. 1e-5)
# is read as a string by some YAML 1.1 loaders.
learning_rate: 0.00001  # Learning rate
weight_decay: 0.1  # Weight decay for optimizer
warmup_steps: 0  # Number of warmup steps for learning rate scheduler
logging_steps: 10  # Log every X steps
adam_beta1: 0.9
adam_beta2: 0.95
random_concat_ratio: 0.2  # Ratio of random concatenation

# Evaluation settings
eval_steps: 100  # Evaluate every X steps
save_steps: 100  # Save model every X steps

# Tokenizer settings
# (no tokenizer keys are set in this file)

# Additional settings (if needed)
gradient_checkpointing: true
# Number of update steps to accumulate before performing a backward/update pass
gradient_accumulation_steps: 16
max_grad_norm: 1.0  # Max gradient norm for gradient clipping
# NOTE(review): presumably the expert-parallel group size -- confirm against
# the training script that consumes this file.
ep_size: 2