# Model identity and initialization flags.
model:
  model_id: qwen3_next
  # Absolute path under /data/ci; must exist on the machine running this config.
  model_name_or_path: /data/ci/models/qwen3_next/hf/qwen3_next_L4_mtp
  trust_remote_code: false
  # NOTE(review): with train_from_scratch disabled, weights are presumably
  # loaded from model_name_or_path -- confirm against the loader.
  train_from_scratch: false
  init_model_with_meta_device: true

# Dataset source and preprocessing options.
data:
  dataset:
    # Single parquet shard at an absolute path under /data/ci.
    file_name: '/data/ci/datasets/origin/train-00000-of-00001-a09b74b3ef9c3b56.parquet'
  template: qwen3
  # NOTE(review): cutoff_len is presumably the per-sample token limit -- confirm.
  cutoff_len: 4096
  max_samples: 100000
  overwrite_cache: true
  preprocessing_num_workers: 1
  data_manager_type: mg
  reset_attention_mask: true
  append_eod: true

# Parallelism layout: FSDP, tensor, expert and context parallel settings,
# plus recomputation options. Module entries use the `{*}` placeholder
# exactly as written; they are plain strings to YAML.
parallel:
  # FSDP sharding.
  fsdp_size: 8
  fsdp_modules:
    - model.layers.{*}
    - model.embed_tokens
    - lm_head
  reduce_dtype: bf16

  # Tensor parallel.
  tp_size: 1

  # Expert parallel. Both ep_size and ep_fsdp_size are 1 in this config.
  ep_size: 1
  ep_modules:
    - model.layers.{*}.mlp.experts
  ep_fsdp_size: 1
  ep_fsdp_modules:
    - model.layers.{*}.mlp.experts
  ep_dispatcher: eager

  # Recomputation, applied to the modules listed below.
  recompute: true
  recompute_modules:
    - model.layers.{*}

  # Context parallel.
  cp_size: 2
  cp_type: ulysses



# Training loop, optimizer and LR schedule settings.
training:
  stage: pt
  per_device_train_batch_size: 1
  gradient_accumulation_steps: 2
  dataloader_num_workers: 4
  seed: 42
  dataloader_drop_last: true
  output_dir: ./output

  # Optimizer.
  # Floats in exponent form are written with an explicit ".0" mantissa:
  # YAML 1.1 resolvers (e.g. PyYAML) load a bare `1e-05` as a string,
  # whereas `1.0e-05` is a float under both YAML 1.1 and 1.2.
  optimizer: adamw
  lr: 1.0e-05
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_epsilon: 1.0e-08
  max_grad_norm: 1.0

  # LR schedule.
  lr_scheduler_type: cosine
  warmup_ratio: 0.0
  min_lr: 1.0e-06

  # Run length and logging.
  # NOTE(review): max_steps (15) presumably ends the run long before
  # num_train_epochs (1000) is reached -- confirm which limit the trainer
  # gives precedence to.
  num_train_epochs: 1000
  max_steps: 15
  logging_steps: 1

# Kernel and fused-op toggles.
optimization:
  chunk_loss_size: 1024
  # The Triton GDN switch is off while the flash GDN switch is on.
  # NOTE(review): presumably these select alternative implementations of the
  # same op -- confirm whether enabling both is valid.
  use_triton_gdn: false
  use_flash_gdn: true
  use_fused_rmsnorm: true
  moe_grouped_gemm: true
  use_fused_rotary_pos_emb: true
  use_flash_attn: true