# Model section: which model to load and how it is instantiated.
# NOTE(review): the keys below sit at the same indentation as `model:`, so
# `model` itself parses as null — confirm the nesting the loader expects.
model:
model_id: qwen3_next
# Presumably a local HF-format checkpoint directory on the CI host — verify.
model_name_or_path: /data/ci/models/qwen3_next/hf/qwen3_next_L4_mtp
trust_remote_code: false
train_from_scratch: false
init_model_with_meta_device: true
# Data section: input dataset file, prompt template and preprocessing limits.
# NOTE(review): `data:` and `dataset:` carry no value and the keys after them
# are at the same indentation, so both parse as null with flat sibling keys —
# confirm the nesting the loader expects.
data:
dataset:
# Single parquet shard; single-quoted because the path is a literal string.
file_name: '/data/ci/datasets/origin/train-00000-of-00001-a09b74b3ef9c3b56.parquet'
template: qwen3
cutoff_len: 4096
max_samples: 100000
overwrite_cache: true
preprocessing_num_workers: 1
data_manager_type: mg
reset_attention_mask: true
append_eod: true
# Parallel section: FSDP / TP / EP / CP sizes, the module patterns each one
# applies to, and recompute settings.
# NOTE(review): the keys below sit at the same indentation as `parallel:`, so
# `parallel` itself parses as null — confirm the nesting the loader expects.
parallel:
fsdp_size: 8
# Module patterns are quoted so the `{*}` placeholder is always read as plain
# text; presumably it is a per-layer wildcard — verify against the loader.
fsdp_modules:
  - 'model.layers.{*}'
  - 'model.embed_tokens'
  - 'lm_head'
tp_size: 1
ep_size: 1
ep_modules:
  - 'model.layers.{*}.mlp.experts'
ep_fsdp_size: 1
ep_fsdp_modules:
  - 'model.layers.{*}.mlp.experts'
ep_dispatcher: eager
recompute: true
reduce_dtype: bf16
recompute_modules:
  - 'model.layers.{*}'
cp_size: 2
cp_type: ulysses
# Training section: run stage, batching, optimizer and LR schedule.
# NOTE(review): the keys below sit at the same indentation as `training:`, so
# `training` itself parses as null — confirm the nesting the loader expects.
training:
stage: pt
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
dataloader_num_workers: 4
seed: 42
dataloader_drop_last: true
output_dir: ./output
optimizer: adamw
# Exponent floats are written with an explicit mantissa dot (1.0e-05, not
# 1e-05): YAML 1.1 resolvers such as PyYAML load the dot-less form as a
# string instead of a float.
lr: 1.0e-05
weight_decay: 0.01
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1.0e-08
max_grad_norm: 1.0
lr_scheduler_type: cosine
warmup_ratio: 0.0
min_lr: 1.0e-06
# NOTE(review): both an epoch count and a step cap are set; presumably
# max_steps ends the run first — confirm with the trainer.
num_train_epochs: 1000
max_steps: 15
logging_steps: 1
# Optimization section: chunked-loss size and fused / flash kernel switches.
# NOTE(review): the keys below sit at the same indentation as
# `optimization:`, so `optimization` itself parses as null — confirm the
# nesting the loader expects.
optimization:
chunk_loss_size: 1024
# Of the two GDN switches, only the flash variant is enabled.
use_triton_gdn: false
use_flash_gdn: true
use_fused_rmsnorm: true
moe_grouped_gemm: true
use_fused_rotary_pos_emb: true
use_flash_attn: true