# Source revision 17c602e5, created on 2025-07-11 (commit history).
# Model hyperparameters for the `qwen3_8b` entry.
# NOTE(review): the key names look like Megatron-style model arguments;
# confirm against the code that loads this mapping before relying on the
# hedged notes below.
qwen3_8b:
  # Implementation switch (presumably selects the "mcore" model path; verify).
  use_mcore_models: true

  # Network dimensions. ffn_hidden_size is exactly 3 x hidden_size.
  num_layers: 36
  hidden_size: 4096
  ffn_hidden_size: 12288
  num_attention_heads: 32

  # Rotary base and maximum position count.
  rotary_base: 1000000
  max_position_embeddings: 40960

  # Vocabulary sizing and embedding/output weight layout.
  # A divisor of 1 presumably means no extra padding is applied; verify.
  make_vocab_size_divisible_by: 1
  padded_vocab_size: 151936
  untie_embeddings_and_output_weights: true

  # Bias handling for linear layers.
  disable_bias_linear: true

  # Grouped attention: 32 attention heads and 8 query groups
  # (presumably 4 heads share each group; TODO confirm).
  group_query_attention: true
  num_query_groups: 8

  # Remaining layer options.
  position_embedding_type: rope
  normalization: RMSNorm
  swiglu: true
  attention_softmax_in_fp32: true
  attention_bias: false
  qk_layernorm: true