# Model hyperparameters for the `qwen3_32b` preset, grouped under one
# model-name key so the whole set can be looked up by name.
#
# The settings must stay indented under `qwen3_32b:`. Written at column 0 they
# would leave `qwen3_32b` as an empty (null) value and become unrelated
# top-level keys.
# NOTE(review): assumes the loader selects the preset by its name key --
# confirm against the consumer of this file.
qwen3_32b:
  # Presumably selects the Megatron-Core model implementation -- TODO confirm.
  use_mcore_models: true

  # Transformer dimensions.
  num_layers: 64
  hidden_size: 5120
  ffn_hidden_size: 25600  # 5 x hidden_size
  num_attention_heads: 64

  # Rotary position embedding settings (see position_embedding_type below).
  rotary_base: 1000000
  max_position_embeddings: 40960

  # Vocabulary. With a divisor of 1 no rounding-up of the vocabulary size
  # should be needed, so padded_vocab_size is presumably the tokenizer's exact
  # vocabulary size -- TODO confirm.
  make_vocab_size_divisible_by: 1
  padded_vocab_size: 151936
  untie_embeddings_and_output_weights: true

  disable_bias_linear: true

  # Grouped-query attention: 64 attention heads over 8 query groups, i.e.
  # 8 heads per group.
  group_query_attention: true
  num_query_groups: 8

  position_embedding_type: rope
  normalization: RMSNorm
  swiglu: true
  attention_softmax_in_fp32: true
  attention_bias: false
  qk_layernorm: true

  # Set explicitly: 64 heads x 128 = 8192, which differs from hidden_size
  # (5120), so this value cannot be derived as hidden_size / num_attention_heads.
  kv_channels: 128

  no_gradient_accumulation_fusion: true