# Model section: model identifier, weight/tokenizer paths, and the flags
# that are set for model loading and initialization.
model:
  model_id: mamba3
  model_name_or_path: /home/hf_weights/Mamba3/
  trust_remote_code: true
  train_from_scratch: true
  # Same directory as model_name_or_path.
  tokenizer_name_or_path: /home/hf_weights/Mamba3/
  init_model_with_meta_device: true
# Data section: input dataset file plus template and preprocessing settings.
data:
  dataset:
    file_name: '/home/dataset/train-00000-of-00042-d964455e17e96d5a.parquet'
  # NOTE(review): the keys below are placed directly under `data`, with only
  # `file_name` nested under `dataset` -- confirm against the consumer schema.
  template: qwen3
  cutoff_len: 2048
  max_samples: 100000
  overwrite_cache: true
  preprocessing_num_workers: 1
  data_manager_type: mg
# Parallelism section: sizes and module-name patterns for the fsdp, tp, ep,
# recompute and cp settings. Every size other than fsdp_size is 1 in this
# configuration.
parallel:
  fsdp_size: 8
  fsdp_modules:
    - model.layers.{*}
    - model.embed_tokens
    - lm_head
  tp_size: 1
  ep_modules:
    - model.layers.{*}.mlp.experts
  ep_size: 1
  ep_fsdp_size: 1
  ep_fsdp_modules:
    - model.layers.{*}.mlp.experts
  ignored_modules:
    - model.layers.{*}.mlp.router
  ep_dispatcher: eager
  # recompute is off; recompute_modules is still listed.
  recompute: false
  recompute_modules:
    - model.layers.{*}
  cp_size: 1
  cp_type: ulysses
# Training section: batch/dataloader settings, optimizer hyperparameters,
# learning-rate schedule, and run length / checkpoint / logging cadence.
training:
  stage: pt
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 1
  dataloader_num_workers: 1
  # NOTE(review): integer 1 here while other flags in this file are
  # booleans -- confirm the type the consumer expects before changing it.
  disable_shuffling: 1
  seed: 42
  dataloader_drop_last: true
  output_dir: ./output
  optimizer: adamw
  # Scientific-notation values carry an explicit decimal point (1.0e-05, not
  # 1e-05): YAML 1.1 loaders such as PyYAML resolve the dot-less form as a
  # string instead of a float.
  lr: 1.0e-05
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_epsilon: 1.0e-08
  max_grad_norm: 1.0
  lr_scheduler_type: cosine
  warmup_ratio: 0.0
  min_lr: 1.0e-06
  num_train_epochs: 3.0
  max_steps: -1
  save_steps: 500
  logging_steps: 1
# Optimization section: feature toggles.
# NOTE(review): kept as a top-level section -- confirm it is not meant to be
# nested under another section.
optimization:
  use_triton_rmsnormgated: true