# Model hyper-parameters for the `qwen3_235b_a22b` entry.
# All settings are nested under the model-name key so that the key maps to
# the settings mapping instead of resolving to null with the settings as
# unrelated top-level siblings.
# NOTE(review): assumes the consumer looks settings up by model name - confirm.
qwen3_235b_a22b:
  use_mcore_models: true
  # Two-item list; presumably (module path, attribute name) - verify against
  # the code that loads this file.
  spec:
    - mindspeed_llm.tasks.models.spec.qwen3_spec
    - layer_spec

  # Layer count and expert settings.
  num_layers: 94
  num_experts: 128
  moe_router_topk: 8
  moe_ffn_hidden_size: 1536

  # Hidden sizes and attention settings.
  hidden_size: 4096
  ffn_hidden_size: 12288
  num_attention_heads: 64
  group_query_attention: true
  num_query_groups: 4
  untie_embeddings_and_output_weights: true
  disable_bias_linear: true
  qk_layernorm: true
  kv_channels: 128
  norm_topk_prob: true

  # Position embedding settings.
  position_embedding_type: rope
  use_rotary_position_embeddings: true
  rotary_base: 1000000
  max_position_embeddings: 40960

  # Vocabulary settings.
  padded_vocab_size: 151936
  make_vocab_size_divisible_by: 1

  # Normalization and activation settings.
  normalization: RMSNorm
  # Written with a decimal point so YAML 1.1 loaders (e.g. PyYAML) resolve it
  # as a float; the bare form `1e-6` is loaded as a string by those parsers.
  norm_epsilon: 1.0e-6
  swiglu: true

  # Expert dispatch and load-balancing settings.
  moe_grouped_gemm: true
  moe_permutation_async_comm: true
  moe_token_dispatcher_type: alltoall_seq
  use_fused_moe_token_permute_and_unpermute: true
  moe_router_load_balancing_type: aux_loss
  moe_aux_loss_coeff: 0.001