{
  "train_version": {
    "configuration_0": {
      "weights": {
        "embedding.word_embeddings.weight": [
          1000,
          128
        ],
        "embedding.position_embeddings.weight": [
          128,
          128
        ],
        "decoder.layers.0.input_layernorm.weight": [
          128
        ],
        "decoder.layers.0.self_attention.linear_proj.weight": [
          128,
          128
        ],
        "decoder.layers.0.self_attention.linear_qkv.weight": [
          384,
          128
        ],
        "decoder.layers.0.pre_mlp_layernorm.weight": [
          128
        ],
        "decoder.layers.0.mlp.linear_fc1.weight": [
          512,
          128
        ],
        "decoder.layers.0.mlp.linear_fc2.weight": [
          128,
          256
        ],
        "decoder.layers.1.input_layernorm.weight": [
          128
        ],
        "decoder.layers.1.self_attention.linear_proj.weight": [
          128,
          128
        ],
        "decoder.layers.1.self_attention.linear_qkv.weight": [
          384,
          128
        ],
        "decoder.layers.1.pre_mlp_layernorm.weight": [
          128
        ],
        "decoder.layers.1.mlp.linear_fc1.weight": [
          512,
          128
        ],
        "decoder.layers.1.mlp.linear_fc2.weight": [
          128,
          256
        ],
        "decoder.final_layernorm.weight": [
          128
        ],
        "mtp.layers.0.enorm.weight": [
          128
        ],
        "mtp.layers.0.hnorm.weight": [
          128
        ],
        "mtp.layers.0.eh_proj.weight": [
          128,
          256
        ],
        "mtp.layers.0.transformer_layer.input_layernorm.weight": [
          128
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_proj.weight": [
          128,
          128
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_qkv.weight": [
          384,
          128
        ],
        "mtp.layers.0.transformer_layer.pre_mlp_layernorm.weight": [
          128
        ],
        "mtp.layers.0.transformer_layer.mlp.linear_fc1.weight": [
          512,
          128
        ],
        "mtp.layers.0.transformer_layer.mlp.linear_fc2.weight": [
          128,
          256
        ],
        "mtp.layers.0.final_layernorm.weight": [
          128
        ],
        "output_layer.weight": [
          1000,
          128
        ]
      },
      "weight_count": 26
    },
    "configuration_1": {
      "weights": {
        "embedding.word_embeddings.weight": [
          1000,
          128
        ],
        "embedding.position_embeddings.weight": [
          128,
          128
        ],
        "decoder.layers.0.input_layernorm.weight": [
          128
        ],
        "decoder.layers.0.self_attention.linear_proj.weight": [
          128,
          512
        ],
        "decoder.layers.0.self_attention.linear_q_down_proj.weight": [
          512,
          128
        ],
        "decoder.layers.0.self_attention.linear_q_up_proj.weight": [
          768,
          512
        ],
        "decoder.layers.0.self_attention.linear_kv_down_proj.weight": [
          576,
          128
        ],
        "decoder.layers.0.self_attention.linear_kv_up_proj.weight": [
          1024,
          512
        ],
        "decoder.layers.0.pre_mlp_layernorm.weight": [
          128
        ],
        "decoder.layers.0.mlp.linear_fc1.weight": [
          512,
          128
        ],
        "decoder.layers.0.mlp.linear_fc2.weight": [
          128,
          256
        ],
        "decoder.layers.1.input_layernorm.weight": [
          128
        ],
        "decoder.layers.1.self_attention.linear_proj.weight": [
          128,
          512
        ],
        "decoder.layers.1.self_attention.linear_q_down_proj.weight": [
          512,
          128
        ],
        "decoder.layers.1.self_attention.linear_q_up_proj.weight": [
          768,
          512
        ],
        "decoder.layers.1.self_attention.linear_kv_down_proj.weight": [
          576,
          128
        ],
        "decoder.layers.1.self_attention.linear_kv_up_proj.weight": [
          1024,
          512
        ],
        "decoder.layers.1.pre_mlp_layernorm.weight": [
          128
        ],
        "decoder.layers.1.mlp.router.weight": [
          2,
          128
        ],
        "decoder.layers.1.mlp.router.expert_bias": [
          2
        ],
        "decoder.layers.1.mlp.router.expert_load": [
          2
        ],
        "decoder.layers.1.mlp.router.fi_accu": [
          2
        ],
        "decoder.layers.1.mlp.experts.weight1": [
          256,
          512
        ],
        "decoder.layers.1.mlp.experts.weight2": [
          512,
          128
        ],
        "decoder.layers.1.mlp.shared_experts.linear_fc1.weight": [
          1024,
          128
        ],
        "decoder.layers.1.mlp.shared_experts.linear_fc2.weight": [
          128,
          512
        ],
        "decoder.final_layernorm.weight": [
          128
        ],
        "mtp.layers.0.enorm.weight": [
          128
        ],
        "mtp.layers.0.hnorm.weight": [
          128
        ],
        "mtp.layers.0.eh_proj.weight": [
          128,
          256
        ],
        "mtp.layers.0.transformer_layer.input_layernorm.weight": [
          128
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_proj.weight": [
          128,
          512
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_q_down_proj.weight": [
          512,
          128
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_q_up_proj.weight": [
          768,
          512
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_kv_down_proj.weight": [
          576,
          128
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_kv_up_proj.weight": [
          1024,
          512
        ],
        "mtp.layers.0.transformer_layer.pre_mlp_layernorm.weight": [
          128
        ],
        "mtp.layers.0.transformer_layer.mlp.router.weight": [
          2,
          128
        ],
        "mtp.layers.0.transformer_layer.mlp.router.expert_bias": [
          2
        ],
        "mtp.layers.0.transformer_layer.mlp.router.expert_load": [
          2
        ],
        "mtp.layers.0.transformer_layer.mlp.router.fi_accu": [
          2
        ],
        "mtp.layers.0.transformer_layer.mlp.experts.weight1": [
          256,
          512
        ],
        "mtp.layers.0.transformer_layer.mlp.experts.weight2": [
          512,
          128
        ],
        "mtp.layers.0.transformer_layer.mlp.shared_experts.linear_fc1.weight": [
          1024,
          128
        ],
        "mtp.layers.0.transformer_layer.mlp.shared_experts.linear_fc2.weight": [
          128,
          512
        ],
        "mtp.layers.0.final_layernorm.weight": [
          128
        ],
        "output_layer.weight": [
          1000,
          128
        ]
      },
      "weight_count": 47
    },
    "configuration_2": {
      "weights": {
        "embedding.word_embeddings.weight": [
          1000,
          128
        ],
        "embedding.position_embeddings.weight": [
          128,
          128
        ],
        "decoder.layers.0.input_layernorm.weight": [
          128
        ],
        "decoder.layers.0.self_attention.linear_proj.weight": [
          128,
          512
        ],
        "decoder.layers.0.self_attention.linear_qb.weight": [
          768,
          512
        ],
        "decoder.layers.0.self_attention.linear_qkv.weight": [
          1088,
          128
        ],
        "decoder.layers.0.self_attention.linear_kvb.weight": [
          1024,
          512
        ],
        "decoder.layers.0.pre_mlp_layernorm.weight": [
          128
        ],
        "decoder.layers.0.mlp.linear_fc1.weight": [
          512,
          128
        ],
        "decoder.layers.0.mlp.linear_fc2.weight": [
          128,
          256
        ],
        "decoder.layers.1.input_layernorm.weight": [
          128
        ],
        "decoder.layers.1.self_attention.linear_proj.weight": [
          128,
          512
        ],
        "decoder.layers.1.self_attention.linear_qb.weight": [
          768,
          512
        ],
        "decoder.layers.1.self_attention.linear_qkv.weight": [
          1088,
          128
        ],
        "decoder.layers.1.self_attention.linear_kvb.weight": [
          1024,
          512
        ],
        "decoder.layers.1.pre_mlp_layernorm.weight": [
          128
        ],
        "decoder.layers.1.mlp.linear_fc1.weight": [
          512,
          128
        ],
        "decoder.layers.1.mlp.linear_fc2.weight": [
          128,
          256
        ],
        "decoder.final_layernorm.weight": [
          128
        ],
        "mtp.layers.0.enorm.weight": [
          128
        ],
        "mtp.layers.0.hnorm.weight": [
          128
        ],
        "mtp.layers.0.eh_proj.weight": [
          128,
          256
        ],
        "mtp.layers.0.transformer_layer.input_layernorm.weight": [
          128
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_proj.weight": [
          128,
          512
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_qb.weight": [
          768,
          512
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_qkv.weight": [
          1088,
          128
        ],
        "mtp.layers.0.transformer_layer.self_attention.linear_kvb.weight": [
          1024,
          512
        ],
        "mtp.layers.0.transformer_layer.pre_mlp_layernorm.weight": [
          128
        ],
        "mtp.layers.0.transformer_layer.mlp.linear_fc1.weight": [
          512,
          128
        ],
        "mtp.layers.0.transformer_layer.mlp.linear_fc2.weight": [
          128,
          256
        ],
        "mtp.layers.0.final_layernorm.weight": [
          128
        ],
        "output_layer.weight": [
          1000,
          128
        ]
      },
      "weight_count": 32
    }
  },
  "infer_version": {
    "configuration_0": {
      "weights": {
        "embedding.word_embeddings.weight": [
          1000,
          128
        ],
        "decoder.layers.0.input_layernorm.weight": [
          128
        ],
        "decoder.layers.0.self_attention.linear_proj.weight": [
          128,
          128
        ],
        "decoder.layers.0.self_attention.linear_proj.w_scale": [
          128
        ],
        "decoder.layers.0.self_attention.linear_qkv.weight": [
          384,
          128
        ],
        "decoder.layers.0.self_attention.linear_qkv.w_scale": [
          384
        ],
        "decoder.layers.0.pre_mlp_layernorm.weight": [
          128
        ],
        "decoder.layers.0.mlp.linear_fc1.weight": [
          512,
          128
        ],
        "decoder.layers.0.mlp.linear_fc1.w_scale": [
          512
        ],
        "decoder.layers.0.mlp.linear_fc2.weight": [
          128,
          256
        ],
        "decoder.layers.0.mlp.linear_fc2.w_scale": [
          128
        ],
        "decoder.layers.1.input_layernorm.weight": [
          128
        ],
        "decoder.layers.1.self_attention.linear_proj.weight": [
          128,
          128
        ],
        "decoder.layers.1.self_attention.linear_proj.w_scale": [
          128
        ],
        "decoder.layers.1.self_attention.linear_qkv.weight": [
          384,
          128
        ],
        "decoder.layers.1.self_attention.linear_qkv.w_scale": [
          384
        ],
        "decoder.layers.1.pre_mlp_layernorm.weight": [
          128
        ],
        "decoder.layers.1.mlp.linear_fc1.weight": [
          512,
          128
        ],
        "decoder.layers.1.mlp.linear_fc1.w_scale": [
          512
        ],
        "decoder.layers.1.mlp.linear_fc2.weight": [
          128,
          256
        ],
        "decoder.layers.1.mlp.linear_fc2.w_scale": [
          128
        ],
        "decoder.final_layernorm.weight": [
          128
        ],
        "output_layer.weight": [
          1000,
          128
        ]
      },
      "weight_count": 23
    },
    "configuration_1": {
      "weights": {
        "embedding.word_embeddings.weight": [
          1000,
          128
        ],
        "decoder.layers.0.self_attention.qnope_scale": [
          4
        ],
        "decoder.layers.0.self_attention.ctkv_scale": [
          1
        ],
        "decoder.layers.0.self_attention.linear_proj.weight": [
          128,
          512
        ],
        "decoder.layers.0.self_attention.linear_proj.w_scale": [
          128
        ],
        "decoder.layers.0.self_attention.input_layernorm.weight": [
          128
        ],
        "decoder.layers.0.self_attention.linear_qkv_down_proj.weight": [
          1088,
          128
        ],
        "decoder.layers.0.self_attention.linear_qkv_down_proj.w_scale": [
          1088
        ],
        "decoder.layers.0.self_attention.linear_q_up_proj.weight": [
          768,
          512
        ],
        "decoder.layers.0.self_attention.linear_q_up_proj.w_scale": [
          768
        ],
        "decoder.layers.0.self_attention.linear_kv_up_proj.weight": [
          1024,
          512
        ],
        "decoder.layers.0.self_attention.linear_kv_up_proj.w_scale": [
          1024
        ],
        "decoder.layers.0.self_attention.q_layernorm.weight": [
          512
        ],
        "decoder.layers.0.self_attention.kv_layernorm.weight": [
          512
        ],
        "decoder.layers.0.pre_mlp_layernorm.weight": [
          128
        ],
        "decoder.layers.0.mlp.linear_fc1.weight": [
          512,
          128
        ],
        "decoder.layers.0.mlp.linear_fc1.w_scale": [
          512
        ],
        "decoder.layers.0.mlp.linear_fc2.weight": [
          128,
          256
        ],
        "decoder.layers.0.mlp.linear_fc2.w_scale": [
          128
        ],
        "decoder.layers.1.self_attention.linear_proj.weight": [
          128,
          512
        ],
        "decoder.layers.1.self_attention.linear_proj.w_scale": [
          128
        ],
        "decoder.layers.1.self_attention.input_layernorm.weight": [
          128
        ],
        "decoder.layers.1.self_attention.linear_qkv_down_proj.weight": [
          1088,
          128
        ],
        "decoder.layers.1.self_attention.linear_qkv_down_proj.w_scale": [
          1088
        ],
        "decoder.layers.1.self_attention.linear_q_up_proj.weight": [
          768,
          512
        ],
        "decoder.layers.1.self_attention.linear_q_up_proj.w_scale": [
          768
        ],
        "decoder.layers.1.self_attention.linear_kv_up_proj.weight": [
          1024,
          512
        ],
        "decoder.layers.1.self_attention.linear_kv_up_proj.w_scale": [
          1024
        ],
        "decoder.layers.1.self_attention.q_layernorm.weight": [
          512
        ],
        "decoder.layers.1.self_attention.kv_layernorm.weight": [
          512
        ],
        "decoder.layers.1.pre_mlp_layernorm.weight": [
          128
        ],
        "decoder.layers.1.mlp.router.weight": [
          2,
          128
        ],
        "decoder.layers.1.mlp.router.expert_bias": [
          2
        ],
        "decoder.layers.1.mlp.experts.weight1": [
          2,
          128,
          256
        ],
        "decoder.layers.1.mlp.experts.weight2": [
          2,
          256,
          64
        ],
        "decoder.layers.1.mlp.experts.linear_fc1.gmm_bias": [
          2,
          512
        ],
        "decoder.layers.1.mlp.experts.linear_fc1.w_scale": [
          2,
          1,
          512
        ],
        "decoder.layers.1.mlp.experts.linear_fc2.gmm_bias": [
          2,
          128
        ],
        "decoder.layers.1.mlp.experts.linear_fc2.w_scale": [
          2,
          2,
          128
        ],
        "decoder.layers.1.mlp.shared_experts.linear_fc1.weight": [
          1024,
          128
        ],
        "decoder.layers.1.mlp.shared_experts.linear_fc1.w_scale": [
          1024
        ],
        "decoder.layers.1.mlp.shared_experts.linear_fc2.weight": [
          128,
          512
        ],
        "decoder.layers.1.mlp.shared_experts.linear_fc2.w_scale": [
          128
        ],
        "decoder.final_layernorm.weight": [
          128
        ],
        "output_layer.weight": [
          1000,
          128
        ]
      },
      "weight_count": 45
    }
  },
  "metadata": {
    "generated_at": "2025-01-28 17:22:25"
  }
}