# Shared settings that accompany the test cases listed under `products`.
# NOTE(review): the keys below sit at the same indentation as `spec:`, so a
# YAML parser reads `spec` as null and these as top-level keys rather than as
# children of `spec` - confirm this is what the consumer expects.
spec:
# Dataset prefix plus vocabulary and merges files (absolute paths).
data_path: /home/dataset/model/gpt-3.5/alpaca_text_document
vocab_file: /home/dataset/model/gpt-3.5/vocab.json
merge_file: /home/dataset/model/gpt-3.5/merges.txt
# Checkpoint location, relative to the working directory.
checkpoint_path: ./ckpt
# NOTE(review): presumably the node count and the number of training steps
# per case - confirm against the harness.
nnodes: 1
max_steps: 20
# NOTE(review): presumably micro / global batch size; the values match the
# `--micro-batch-size 2 --global-batch-size 16` flags that some entries pass
# explicitly in extra_args - confirm.
mbs: 2
gbs: 16
# Test-case matrix. Every key of an entry maps to a list of candidate values.
# NOTE(review): presumably the harness expands each entry over those lists
# (e.g. use_mcore: [True, False] giving two runs) - confirm.
products:
# Recomputation cases: each entry passes a --recompute-* or
# --adaptive-recompute-* flag.
- id: ['recomputation-01']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --recompute-activations"'
- id: ['recomputation-02']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --recompute-granularity full --recompute-method uniform --recompute-num-layers 1"'
- id: ['recomputation-03']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --recompute-granularity full --recompute-method block --recompute-num-layers 1"'
# The only entry that sets adaptive_recomputing.
- id: ['adaptive-recompute']
  use_mcore: [False]
  adaptive_recomputing: [1]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --adaptive-recompute-device-swap"'
# Same flags three times; only pp_size / vp_size differ.
- id: ['activation-function-recompute-01']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --recompute-activation-function --recompute-activation-function-num-layers 2"'
- id: ['activation-function-recompute-02']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [4]
  extra_args:
    - '"--sequence-parallel --recompute-activation-function --recompute-activation-function-num-layers 2"'
- id: ['activation-function-recompute-03']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [4]
  vp_size: [1]
  extra_args:
    - '"--sequence-parallel --recompute-activation-function --recompute-activation-function-num-layers 2"'
- id: ['recompute-independent-pipelining-01']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [4]
  vp_size: [1]
  extra_args:
    - '"--sequence-parallel --recompute-in-bubble"'
- id: ['recompute-independent-pipelining-02']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [4]
  vp_size: [1]
  extra_args:
    - '"--sequence-parallel --recompute-in-advance --recompute-granularity full --recompute-method block --recompute-num-layers 2"'
# Distributed-optimizer cases: every entry passes --use-distributed-optimizer.
# The entries that add --overlap-param-gather list use_mcore: [True] only.
- id: ['distributed-optimizer-01']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --use-distributed-optimizer"'
- id: ['distributed-optimizer-02']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --use-distributed-optimizer --overlap-grad-reduce"'
- id: ['distributed-optimizer-03']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"'
# async-ddp cases: pp_size 4 with vp_size 1, at tp_size 1 and 2.
- id: ['async-ddp-01']
  use_mcore: [True, False]
  tp_size: [1]
  pp_size: [4]
  vp_size: [1]
  extra_args:
    - '"--use-distributed-optimizer --overlap-grad-reduce"'
- id: ['async-ddp-02']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [4]
  vp_size: [1]
  extra_args:
    - '"--sequence-parallel --use-distributed-optimizer --overlap-grad-reduce"'
- id: ['async-ddp-param-gather-01']
  use_mcore: [True]
  tp_size: [1]
  pp_size: [4]
  vp_size: [1]
  extra_args:
    - '"--use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"'
- id: ['async-ddp-param-gather-02']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [4]
  vp_size: [1]
  extra_args:
    - '"--sequence-parallel --use-distributed-optimizer --overlap-grad-reduce --overlap-param-gather"'
# coc cases: tp_size 8 with three coc_parallel_num candidates each.
- id: ['coc-01']
  use_mcore: [True, False]
  tp_size: [8]
  pp_size: [1]
  coc_parallel_num: [2, 4, 8]
  extra_args:
    - '"--sequence-parallel --use-ascend-coc"'
- id: ['coc-02']
  use_mcore: [True, False]
  tp_size: [8]
  pp_size: [1]
  coc_parallel_num: [2, 4, 8]
  extra_args:
    - '"--sequence-parallel --use-ascend-coc --coc-fused-kernel"'
- id: ['mc2']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --use-ascend-mc2"'
- id: ['fused-permute']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --disable-bias-linear --moe-token-dispatcher-type alltoall --use-fused-moe-token-permute-and-unpermute"'
# Overrides seq_len for this case.
- id: ['ring-attn-update']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  seq_len: [32768]
  extra_args:
    - '"--sequence-parallel --use-flash-attn --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --use-fused-ring-attention-update"'
# reuse-fp32-param cases: with and without --use-distributed-optimizer,
# at pp_size 2 and 4.
- id: ['reuse-fp32-param-01']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [4]
  extra_args:
    - '"--sequence-parallel --reuse-fp32-param"'
- id: ['reuse-fp32-param-02']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --reuse-fp32-param --use-distributed-optimizer"'
- id: ['reuse-fp32-param-03']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [4]
  extra_args:
    - '"--sequence-parallel --reuse-fp32-param --use-distributed-optimizer"'
# Fused-operator and flash-attention cases, all at tp_size 2 / pp_size 2.
- id: ['fused-rmsnorm']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --normalization RMSNorm --use-fused-rmsnorm"'
- id: ['fused-swiglu']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --swiglu --use-fused-swiglu"'
- id: ['fused-rope-01']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --position-embedding-type rope --rotary-interleaved --no-rope-fusion"'
- id: ['fused-rope-02']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --position-embedding-type rope --use-fused-rotary-pos-emb"'
- id: ['fa']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --use-flash-attn"'
- id: ['fav2']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --use-flash-attn --use-fusion-attn-v2"'
# Pipeline-related cases (nanopipe, vpp send/recv, noop layers).
# nanopipe entries list use_mcore: [False] only.
- id: ['nanopipe-pipeline-parallel-01']
  use_mcore: [False]
  tp_size: [1]
  pp_size: [4]
  vp_size: [1]
  mbs: [2]
  extra_args:
    - '"--use-nanopipe"'
- id: ['nanopipe-pipeline-parallel-02']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [4]
  vp_size: [1]
  extra_args:
    - '"--sequence-parallel --use-nanopipe"'
# These two pass batch sizes and the virtual-pipeline layer count through
# extra_args rather than through dedicated keys.
- id: ['optimize-vpp-send-recv-comm-01']
  use_mcore: [True, False]
  tp_size: [1]
  pp_size: [4]
  extra_args:
    - '"--micro-batch-size 2 --global-batch-size 16 --optimize-vpp-send-recv-comm --num-layers-per-virtual-pipeline-stage 1"'
- id: ['optimize-vpp-send-recv-comm-02']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [4]
  extra_args:
    - '"--micro-batch-size 2 --global-batch-size 16 --sequence-parallel --optimize-vpp-send-recv-comm --num-layers-per-virtual-pipeline-stage 1"'
- id: ['noop-layers']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --noop-layers 0,7"'
# Context-parallel cases: all list use_mcore: [True] and seq_len 32768.
# The algorithm is selected with --context-parallel-algo.
- id: ['cp-ulysses-parallel']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  seq_len: [32768]
  extra_args:
    - '"--sequence-parallel --context-parallel-size 2 --context-parallel-algo ulysses_cp_algo"'
- id: ['cp-ring-parallel-01']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  seq_len: [32768]
  extra_args:
    - '"--sequence-parallel --use-flash-attn --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --attention-mask-type causal"'
- id: ['cp-ring-parallel-02']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  seq_len: [32768]
  extra_args:
    - '"--sequence-parallel --use-flash-attn --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --attention-mask-type general"'
- id: ['cp-ring-parallel-03']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  seq_len: [32768]
  extra_args:
    - '"--sequence-parallel --use-flash-attn --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --use-cp-send-recv-overlap"'
# Hybrid cases use context-parallel-size 4 with pp_size 1.
- id: ['cp-hybrid-parallel-01']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [1]
  seq_len: [32768]
  extra_args:
    - '"--sequence-parallel --use-flash-attn --context-parallel-size 4 --context-parallel-algo hybrid_cp_algo --ulysses-degree-in-cp 2 --attention-mask-type general"'
- id: ['cp-hybrid-parallel-02']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [1]
  seq_len: [32768]
  extra_args:
    - '"--sequence-parallel --use-flash-attn --context-parallel-size 4 --context-parallel-algo hybrid_cp_algo --ulysses-degree-in-cp 2 --attention-mask-type causal"'
# ALiBi cases: all at tp_size 1 / pp_size 2 with
# --position-embedding-type alibi --square-alibi-mask.
- id: ['alibi-01']
  use_mcore: [True, False]
  tp_size: [1]
  pp_size: [2]
  extra_args:
    - '"--position-embedding-type alibi --square-alibi-mask"'
- id: ['alibi-02']
  use_mcore: [True, False]
  tp_size: [1]
  pp_size: [2]
  extra_args:
    - '"--position-embedding-type alibi --square-alibi-mask --fill-neg-inf"'
# alibi-03..06 add --use-fusion-attn-v2 and vary --alibi-fusion-attn-type.
- id: ['alibi-03']
  use_mcore: [True, False]
  tp_size: [1]
  pp_size: [2]
  extra_args:
    - '"--position-embedding-type alibi --square-alibi-mask --context-parallel-algo megatron_cp_algo --use-fusion-attn-v2 --alibi-fusion-attn-type 0"'
- id: ['alibi-04']
  use_mcore: [True, False]
  tp_size: [1]
  pp_size: [2]
  extra_args:
    - '"--position-embedding-type alibi --square-alibi-mask --context-parallel-algo megatron_cp_algo --use-fusion-attn-v2 --alibi-fusion-attn-type 2"'
- id: ['alibi-05']
  use_mcore: [True, False]
  tp_size: [1]
  pp_size: [2]
  extra_args:
    - '"--position-embedding-type alibi --square-alibi-mask --context-parallel-algo megatron_cp_algo --use-fusion-attn-v2 --alibi-fusion-attn-type 3"'
- id: ['alibi-06']
  use_mcore: [True, False]
  tp_size: [1]
  pp_size: [2]
  extra_args:
    - '"--position-embedding-type alibi --square-alibi-mask --context-parallel-algo megatron_cp_algo --use-fusion-attn-v2 --alibi-fusion-attn-type 3 --alibi-diagonal-opposite"'
# MoE cases: 8 experts with expert-model-parallel-size 2 throughout.
# moe-01..04 list use_mcore: [True] at tp_size 2 / pp_size 1.
- id: ['moe-01']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'
- id: ['moe-02']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-token-dispatcher-type alltoall --disable-bias-linear"'
- id: ['moe-03']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --use-distributed-optimizer --moe-router-load-balancing-type sinkhorn --moe-router-topk 1"'
- id: ['moe-04']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --disable-bias-linear --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type aux_loss --moe-router-topk 2 --moe-aux-loss-coeff 1e-2"'
# deepspeed_moe entries list use_mcore: [False]; megatron_moe entries list
# use_mcore: [True]. The -03/-04 pair adds --use-pipe-experts.
- id: ['deepspeed-moe-01']
  use_mcore: [False]
  tp_size: [1]
  pp_size: [4]
  extra_args:
    - '"--num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe"'
- id: ['megatron-moe-02']
  use_mcore: [True]
  tp_size: [1]
  pp_size: [4]
  extra_args:
    - '"--num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type megatron_moe"'
- id: ['deepspeed-moe-03']
  use_mcore: [False]
  tp_size: [1]
  pp_size: [4]
  extra_args:
    - '"--use-pipe-experts --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe"'
- id: ['megatron-moe-04']
  use_mcore: [True]
  tp_size: [1]
  pp_size: [4]
  extra_args:
    - '"--use-pipe-experts --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type megatron_moe"'
# deepspeed_moe pipe-experts cases: use_mcore: [False], tp_size 2 / pp_size 2,
# varying --pipe-experts-multi-stream and --pipe-experts-multi-data.
- id: ['deepspeed-moe-pipe-experts-01']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --use-pipe-experts --pipe-experts-multi-stream --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe"'
- id: ['deepspeed-moe-pipe-experts-02']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --use-pipe-experts --pipe-experts-multi-data 2 --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe"'
- id: ['deepspeed-moe-pipe-experts-03']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --use-pipe-experts --pipe-experts-multi-stream --pipe-experts-multi-data 2 --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe"'
# ampipe cases: use_mcore: [False], tp_size 2 / pp_size 1, --ampipe-degree 2.
- id: ['deepspeed-moe-ampipe-01']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe --use-flash-attn --disable-bias-linear --enable-token-rearrange-opt --ampipe-degree 2"'
- id: ['deepspeed-moe-ampipe-02']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe --use-flash-attn --disable-bias-linear --enable-token-rearrange-opt --ampipe-degree 2 --ampipe-tp-sp-comm-overlap"'
- id: ['deepspeed-moe-ampipe-03']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --num-experts 8 --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --use-cp-send-recv-overlap --expert-model-parallel-size 2 --moe-router-topk 2 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe --use-flash-attn --disable-bias-linear --enable-token-rearrange-opt --ampipe-degree 2 --ampipe-tp-sp-comm-overlap"'
# The only ampipe entry with --expert-model-parallel-size 4.
- id: ['deepspeed-moe-ampipe-04']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --use-pipe-experts --pipe-experts-multi-data 2 --num-experts 8 --expert-model-parallel-size 4 --moe-router-topk 2 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe --use-flash-attn --disable-bias-linear --enable-token-rearrange-opt --ampipe-degree 2 --ampipe-tp-sp-comm-overlap"'
- id: ['deepspeed-moe-ampipe-05']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --use-pipe-experts --pipe-experts-multi-stream --pipe-experts-multi-data 2 --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe --use-flash-attn --disable-bias-linear --enable-token-rearrange-opt --ampipe-degree 2 --ampipe-tp-sp-comm-overlap"'
# Further MoE variants. deepspeed_moe entries list use_mcore: [False].
- id: ['deepspeed-moe-token-rearrange-01']
  use_mcore: [False]
  tp_size: [1]
  pp_size: [4]
  extra_args:
    - '"--num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe --enable-token-rearrange-opt"'
- id: ['deepspeed-moe-token-rearrange-02']
  use_mcore: [False]
  tp_size: [1]
  pp_size: [4]
  extra_args:
    - '"--use-pipe-experts --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type deepspeed_moe --enable-token-rearrange-opt"'
- id: ['deepspeed-moe-dropless']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --disable-bias-linear --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 2 --moe-no-drop --moe-dynamic-padding --moe-use-sinkhorn --moe-model-type deepspeed_moe"'
# The remaining entries in this group list use_mcore: [True].
- id: ['megatron-moe-adaptive']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [1]
  extra_args:
    - '"--sequence-parallel --disable-bias-linear --num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type aux_loss --moe-router-topk 2 --moe-aux-loss-coeff 1e-2 --moe-adaptive-recompute-activation"'
- id: ['megatron-moe-gemm']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type megatron_moe --moe-grouped-gemm --disable-bias-linear"'
# Uses 4 experts and passes no --expert-model-parallel-size.
- id: ['moe-drop-pad']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --num-experts 4 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-model-type megatron_moe --disable-bias-linear --moe-token-dispatcher-type alltoall --moe-pad-expert-input-to-capacity --moe-expert-capacity-factor 1.0 --moe-token-drop-policy probs"'
- id: ['megatron-moe-allgather']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type megatron_moe --moe-token-dispatcher-type allgather --moe-permutation-async-comm"'
# Note the dispatcher type here is alltoall_seq, not alltoall.
- id: ['megatron-moe-alltoall']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --num-experts 8 --expert-model-parallel-size 2 --moe-router-topk 2 --moe-aux-loss-coeff 0.01 --moe-train-capacity-factor 1.1 --noisy-gate-policy RSample --moe-model-type megatron_moe --disable-bias-linear --moe-token-dispatcher-type alltoall_seq --moe-permutation-async-comm"'
# eod-reset cases: both pass --reset-attention-mask --reset-position-ids.
- id: ['eod-reset-01']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --context-parallel-size 2 --context-parallel-algo megatron_cp_algo --attention-mask-type general --reset-attention-mask --reset-position-ids"'
- id: ['eod-reset-02']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --use-flash-attn --attention-mask-type general --reset-attention-mask --reset-position-ids"'
# Miscellaneous single-flag cases.
- id: ['generate-mask']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --no-create-attention-mask-in-dataloader"'
# The only entry that sets training_dtype.
- id: ['others-01']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  training_dtype: [bf16, fp16]
  extra_args:
    - '"--sequence-parallel --test-mode --use-cpu-initialization --no-overlap-p2p-communication"'
- id: ['no-mmap-bin']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --no-mmap-bin-files"'
- id: ['decoupled-lr']
  use_mcore: [True]
  tp_size: [1]
  pp_size: [2]
  vp_size: [1]
  extra_args:
    - '"--micro-batch-size 2 --global-batch-size 16 --decoupled-lr 0.0002"'
- id: ['qk-layernorm']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --qk-layernorm --test-mode"'
- id: ['gqa']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --group-query-attention --num-query-groups 4"'
- id: ['swiglu']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --swiglu"'
- id: ['untie-embed']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --untie-embeddings-and-output-weights"'
- id: ['exit-duration']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --eval-interval 10 --exit-duration-in-mins 5"'
- id: ['exit-signal-handler']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--sequence-parallel --exit-signal-handler"'
# The only entry with more than one extra_args candidate (three of them).
- id: ['weight-decay']
  use_mcore: [True, False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--lr-decay-style constant --weight-decay-incr-style linear --start-weight-decay 1e-3 --end-weight-decay 1e-2"'
    - '"--lr-decay-style cosine --weight-decay-incr-style constant"'
    - '"--lr-decay-style inverse-square-root --weight-decay-incr-style cosine --start-weight-decay 1e-3 --end-weight-decay 1e-2"'
# Checkpoint cases, written in pairs: the id ending in 0 has no
# checkpoint_resume_test key, the matching id ending in 1 sets
# checkpoint_resume_test: [1].
# NOTE(review): presumably the second entry of a pair resumes from the
# checkpoint written by the first - confirm with the harness.
- id: ['load_ckpt-010']
  use_mcore: [True]
  tp_size: [1]
  pp_size: [2]
  extra_args:
    - '"--auto-detect-ckpt-format --use-distributed-optimizer"'
- id: ['load_ckpt-011']
  checkpoint_resume_test: [1]
  use_mcore: [True]
  tp_size: [1]
  pp_size: [2]
  extra_args:
    - '"--auto-detect-ckpt-format --use-checkpoint-args --use-distributed-optimizer"'
# This pair carries no extra_args key at all.
- id: ['load_ckpt-020']
  use_mcore: [False]
  tp_size: [1]
  pp_size: [2]
- id: ['load_ckpt-021']
  checkpoint_resume_test: [1]
  use_mcore: [False]
  tp_size: [1]
  pp_size: [2]
- id: ['load_ckpt-030']
  use_mcore: [True]
  tp_size: [1]
  pp_size: [2]
  extra_args:
    - '"--num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 2"'
- id: ['load_ckpt-031']
  checkpoint_resume_test: [1]
  use_mcore: [True]
  tp_size: [1]
  pp_size: [2]
  extra_args:
    - '"--num-experts 8 --expert-model-parallel-size 2 --moe-router-load-balancing-type sinkhorn --moe-router-topk 2"'
# The 04x and 05x pairs share the same flags and differ only in use_mcore.
- id: ['load_ckpt-040']
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--distribute-saved-activations --recompute-granularity full --recompute-method block --recompute-num-layers 1"'
- id: ['load_ckpt-041']
  checkpoint_resume_test: [1]
  use_mcore: [True]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--distribute-saved-activations --recompute-granularity full --recompute-method block --recompute-num-layers 1"'
- id: ['load_ckpt-050']
  use_mcore: [False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--distribute-saved-activations --recompute-granularity full --recompute-method block --recompute-num-layers 1"'
- id: ['load_ckpt-051']
  checkpoint_resume_test: [1]
  use_mcore: [False]
  tp_size: [2]
  pp_size: [2]
  extra_args:
    - '"--distribute-saved-activations --recompute-granularity full --recompute-method block --recompute-num-layers 1"'