# MindSpeed-LLM/tests/st/shell_scripts/pretrain_qwen3_8b_4K_fsdp2.yaml (code preview, AtomGit)
# MindSpeed-LLM: a distributed training suite for large language models on the Ascend ecosystem.
#
# Commit (ascend-robot): feat(pytorch): ring cp support MLA/GQA

# Model source and initialisation options.
model:
  # Filesystem location passed as the model name/path.
  model_name_or_path: "/data/ci/models/Qwen3-8B/hf/Qwen3-8B"
  trust_remote_code: False
  # NOTE(review): with this off, weights are presumably loaded from
  # model_name_or_path rather than randomly initialised — confirm.
  train_from_scratch: False
  # NOTE(review): presumably constructs the model on the meta device before
  # real weights are materialised — confirm against the trainer.
  init_model_with_meta_device: True

# Dataset location and preprocessing options.
data:
  dataset:
    # Parquet file referenced as the dataset.
    file_name: "/data/ci/datasets/origin/train-00000-of-00001-a09b74b3ef9c3b56.parquet"
  # Template identifier; the set of valid names is defined by the consumer.
  template: "qwen3"
  # 4096 is consistent with the "4K" in this config's file name.
  cutoff_len: 4096
  max_samples: 100000
  overwrite_cache: True
  preprocessing_num_workers: 1
  # NOTE(review): meaning of "mg" is not visible here — confirm against the
  # data-manager implementation.
  data_manager_type: "mg"

# Parallelism layout. Module patterns are quoted so the "{*}" wildcard is
# always read as a plain string.
parallel:
  # NOTE(review): how fsdp_size and cp_size combine into the required device
  # count is not visible here — confirm against the launcher script.
  fsdp_size: 8
  fsdp_modules:
    - "model.layers.{*}"
    - "model.embed_tokens"
    - "lm_head"
  tp_size: 1
  # NOTE(review): ep_size and ep_fsdp_size are both 1, so the expert-parallel
  # module patterns below are presumably inert for this run — confirm.
  ep_size: 1
  ep_modules:
    - "model.layers.{*}.mlp.experts"
  ep_fsdp_size: 1
  ep_fsdp_modules:
    - "model.layers.{*}.mlp.experts"
  ep_dispatcher: "eager"
  # Recompute is enabled for every module matching recompute_modules.
  recompute: True
  recompute_modules:
    - "model.layers.{*}"
  cp_size: 2
  cp_type: "ring"

# Training-loop, optimizer and scheduler settings.
training:
  # NOTE(review): "pt" presumably selects the pretraining stage — confirm.
  stage: pt
  per_device_train_batch_size: 2
  gradient_accumulation_steps: 1
  dataloader_num_workers: 4
  seed: 42
  dataloader_drop_last: True
  output_dir: ./output
  optimizer: adamw
  # Scientific-notation values are written with an explicit mantissa dot
  # (1.0e-05, not 1e-05): YAML 1.1 loaders such as PyYAML resolve the dotless
  # form as a string instead of a float.
  lr: 1.0e-05
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.95
  adam_epsilon: 1.0e-08
  max_grad_norm: 1.0
  lr_scheduler_type: cosine
  warmup_ratio: 0.0
  min_lr: 1.0e-06
  # NOTE(review): both an epoch count and a step count are set; presumably
  # max_steps bounds the run — confirm which one the trainer honours.
  num_train_epochs: 3.0
  max_steps: 15
  # NOTE(review): zero presumably disables periodic checkpoint saving — confirm.
  save_steps: 0
  save_epochs: 0
  logging_steps: 1