# File: ESFT/configs/base.yaml
# Repository viewer metadata: YAML, 31 lines, 905 B
# Repository viewer timestamp: 2024-08-09 08:23:48 +00:00
# Base training configuration.
# Keys are grouped by purpose; each comment describes the key directly below.

seed: 5934875

# ---- Model settings ----
# Maximum sequence length.
seq_length: 4096

# ---- Data settings ----
per_device_batch_size: 1
# Number of devices.
n_device: 8

# ---- Training settings ----
optim: adamw_torch_fused
# Number of training steps.
steps: 500
# Learning rate.
learning_rate: 0.00001
# Weight decay for the optimizer.
weight_decay: 0.1
# Number of warmup steps for the learning rate scheduler.
warmup_steps: 0
# Log every N steps.
logging_steps: 10
adam_beta1: 0.9
adam_beta2: 0.95
# Ratio of random concatenation.
random_concat_ratio: 0.2

# ---- Evaluation settings ----
# Evaluate every N steps.
eval_steps: 100
# Save the model every N steps.
save_steps: 100

# ---- Tokenizer settings ----
# (no tokenizer keys are defined in this file)

# ---- Additional settings (if needed) ----
gradient_checkpointing: true
# Number of update steps to accumulate before performing a
# backward/update pass.
gradient_accumulation_steps: 16
# Maximum gradient norm for gradient clipping.
max_grad_norm: 1.0
# NOTE(review): presumably the expert-parallel group size; confirm
# against the training script that consumes this file.
ep_size: 2