{
"train_version": {
"configuration_0": {
"weights": {
"embedding.word_embeddings.weight": [
1000,
128
],
"embedding.position_embeddings.weight": [
128,
128
],
"decoder.layers.0.input_layernorm.weight": [
128
],
"decoder.layers.0.self_attention.linear_proj.weight": [
128,
128
],
"decoder.layers.0.self_attention.linear_qkv.weight": [
384,
128
],
"decoder.layers.0.pre_mlp_layernorm.weight": [
128
],
"decoder.layers.0.mlp.linear_fc1.weight": [
512,
128
],
"decoder.layers.0.mlp.linear_fc2.weight": [
128,
256
],
"decoder.layers.1.input_layernorm.weight": [
128
],
"decoder.layers.1.self_attention.linear_proj.weight": [
128,
128
],
"decoder.layers.1.self_attention.linear_qkv.weight": [
384,
128
],
"decoder.layers.1.pre_mlp_layernorm.weight": [
128
],
"decoder.layers.1.mlp.linear_fc1.weight": [
512,
128
],
"decoder.layers.1.mlp.linear_fc2.weight": [
128,
256
],
"decoder.final_layernorm.weight": [
128
],
"mtp.layers.0.enorm.weight": [
128
],
"mtp.layers.0.hnorm.weight": [
128
],
"mtp.layers.0.eh_proj.weight": [
128,
256
],
"mtp.layers.0.transformer_layer.input_layernorm.weight": [
128
],
"mtp.layers.0.transformer_layer.self_attention.linear_proj.weight": [
128,
128
],
"mtp.layers.0.transformer_layer.self_attention.linear_qkv.weight": [
384,
128
],
"mtp.layers.0.transformer_layer.pre_mlp_layernorm.weight": [
128
],
"mtp.layers.0.transformer_layer.mlp.linear_fc1.weight": [
512,
128
],
"mtp.layers.0.transformer_layer.mlp.linear_fc2.weight": [
128,
256
],
"mtp.layers.0.final_layernorm.weight": [
128
],
"output_layer.weight": [
1000,
128
]
},
"weight_count": 26
},
"configuration_1": {
"weights": {
"embedding.word_embeddings.weight": [
1000,
128
],
"embedding.position_embeddings.weight": [
128,
128
],
"decoder.layers.0.input_layernorm.weight": [
128
],
"decoder.layers.0.self_attention.linear_proj.weight": [
128,
512
],
"decoder.layers.0.self_attention.linear_q_down_proj.weight": [
512,
128
],
"decoder.layers.0.self_attention.linear_q_up_proj.weight": [
768,
512
],
"decoder.layers.0.self_attention.linear_kv_down_proj.weight": [
576,
128
],
"decoder.layers.0.self_attention.linear_kv_up_proj.weight": [
1024,
512
],
"decoder.layers.0.pre_mlp_layernorm.weight": [
128
],
"decoder.layers.0.mlp.linear_fc1.weight": [
512,
128
],
"decoder.layers.0.mlp.linear_fc2.weight": [
128,
256
],
"decoder.layers.1.input_layernorm.weight": [
128
],
"decoder.layers.1.self_attention.linear_proj.weight": [
128,
512
],
"decoder.layers.1.self_attention.linear_q_down_proj.weight": [
512,
128
],
"decoder.layers.1.self_attention.linear_q_up_proj.weight": [
768,
512
],
"decoder.layers.1.self_attention.linear_kv_down_proj.weight": [
576,
128
],
"decoder.layers.1.self_attention.linear_kv_up_proj.weight": [
1024,
512
],
"decoder.layers.1.pre_mlp_layernorm.weight": [
128
],
"decoder.layers.1.mlp.router.weight": [
2,
128
],
"decoder.layers.1.mlp.router.expert_bias": [
2
],
"decoder.layers.1.mlp.router.expert_load": [
2
],
"decoder.layers.1.mlp.router.fi_accu": [
2
],
"decoder.layers.1.mlp.experts.weight1": [
256,
512
],
"decoder.layers.1.mlp.experts.weight2": [
512,
128
],
"decoder.layers.1.mlp.shared_experts.linear_fc1.weight": [
1024,
128
],
"decoder.layers.1.mlp.shared_experts.linear_fc2.weight": [
128,
512
],
"decoder.final_layernorm.weight": [
128
],
"mtp.layers.0.enorm.weight": [
128
],
"mtp.layers.0.hnorm.weight": [
128
],
"mtp.layers.0.eh_proj.weight": [
128,
256
],
"mtp.layers.0.transformer_layer.input_layernorm.weight": [
128
],
"mtp.layers.0.transformer_layer.self_attention.linear_proj.weight": [
128,
512
],
"mtp.layers.0.transformer_layer.self_attention.linear_q_down_proj.weight": [
512,
128
],
"mtp.layers.0.transformer_layer.self_attention.linear_q_up_proj.weight": [
768,
512
],
"mtp.layers.0.transformer_layer.self_attention.linear_kv_down_proj.weight": [
576,
128
],
"mtp.layers.0.transformer_layer.self_attention.linear_kv_up_proj.weight": [
1024,
512
],
"mtp.layers.0.transformer_layer.pre_mlp_layernorm.weight": [
128
],
"mtp.layers.0.transformer_layer.mlp.router.weight": [
2,
128
],
"mtp.layers.0.transformer_layer.mlp.router.expert_bias": [
2
],
"mtp.layers.0.transformer_layer.mlp.router.expert_load": [
2
],
"mtp.layers.0.transformer_layer.mlp.router.fi_accu": [
2
],
"mtp.layers.0.transformer_layer.mlp.experts.weight1": [
256,
512
],
"mtp.layers.0.transformer_layer.mlp.experts.weight2": [
512,
128
],
"mtp.layers.0.transformer_layer.mlp.shared_experts.linear_fc1.weight": [
1024,
128
],
"mtp.layers.0.transformer_layer.mlp.shared_experts.linear_fc2.weight": [
128,
512
],
"mtp.layers.0.final_layernorm.weight": [
128
],
"output_layer.weight": [
1000,
128
]
},
"weight_count": 47
},
"configuration_2": {
"weights": {
"embedding.word_embeddings.weight": [
1000,
128
],
"embedding.position_embeddings.weight": [
128,
128
],
"decoder.layers.0.input_layernorm.weight": [
128
],
"decoder.layers.0.self_attention.linear_proj.weight": [
128,
512
],
"decoder.layers.0.self_attention.linear_qb.weight": [
768,
512
],
"decoder.layers.0.self_attention.linear_qkv.weight": [
1088,
128
],
"decoder.layers.0.self_attention.linear_kvb.weight": [
1024,
512
],
"decoder.layers.0.pre_mlp_layernorm.weight": [
128
],
"decoder.layers.0.mlp.linear_fc1.weight": [
512,
128
],
"decoder.layers.0.mlp.linear_fc2.weight": [
128,
256
],
"decoder.layers.1.input_layernorm.weight": [
128
],
"decoder.layers.1.self_attention.linear_proj.weight": [
128,
512
],
"decoder.layers.1.self_attention.linear_qb.weight": [
768,
512
],
"decoder.layers.1.self_attention.linear_qkv.weight": [
1088,
128
],
"decoder.layers.1.self_attention.linear_kvb.weight": [
1024,
512
],
"decoder.layers.1.pre_mlp_layernorm.weight": [
128
],
"decoder.layers.1.mlp.linear_fc1.weight": [
512,
128
],
"decoder.layers.1.mlp.linear_fc2.weight": [
128,
256
],
"decoder.final_layernorm.weight": [
128
],
"mtp.layers.0.enorm.weight": [
128
],
"mtp.layers.0.hnorm.weight": [
128
],
"mtp.layers.0.eh_proj.weight": [
128,
256
],
"mtp.layers.0.transformer_layer.input_layernorm.weight": [
128
],
"mtp.layers.0.transformer_layer.self_attention.linear_proj.weight": [
128,
512
],
"mtp.layers.0.transformer_layer.self_attention.linear_qb.weight": [
768,
512
],
"mtp.layers.0.transformer_layer.self_attention.linear_qkv.weight": [
1088,
128
],
"mtp.layers.0.transformer_layer.self_attention.linear_kvb.weight": [
1024,
512
],
"mtp.layers.0.transformer_layer.pre_mlp_layernorm.weight": [
128
],
"mtp.layers.0.transformer_layer.mlp.linear_fc1.weight": [
512,
128
],
"mtp.layers.0.transformer_layer.mlp.linear_fc2.weight": [
128,
256
],
"mtp.layers.0.final_layernorm.weight": [
128
],
"output_layer.weight": [
1000,
128
]
},
"weight_count": 32
}
},
"infer_version": {
"configuration_0": {
"weights": {
"embedding.word_embeddings.weight": [
1000,
128
],
"decoder.layers.0.input_layernorm.weight": [
128
],
"decoder.layers.0.self_attention.linear_proj.weight": [
128,
128
],
"decoder.layers.0.self_attention.linear_proj.w_scale": [
128
],
"decoder.layers.0.self_attention.linear_qkv.weight": [
384,
128
],
"decoder.layers.0.self_attention.linear_qkv.w_scale": [
384
],
"decoder.layers.0.pre_mlp_layernorm.weight": [
128
],
"decoder.layers.0.mlp.linear_fc1.weight": [
512,
128
],
"decoder.layers.0.mlp.linear_fc1.w_scale": [
512
],
"decoder.layers.0.mlp.linear_fc2.weight": [
128,
256
],
"decoder.layers.0.mlp.linear_fc2.w_scale": [
128
],
"decoder.layers.1.input_layernorm.weight": [
128
],
"decoder.layers.1.self_attention.linear_proj.weight": [
128,
128
],
"decoder.layers.1.self_attention.linear_proj.w_scale": [
128
],
"decoder.layers.1.self_attention.linear_qkv.weight": [
384,
128
],
"decoder.layers.1.self_attention.linear_qkv.w_scale": [
384
],
"decoder.layers.1.pre_mlp_layernorm.weight": [
128
],
"decoder.layers.1.mlp.linear_fc1.weight": [
512,
128
],
"decoder.layers.1.mlp.linear_fc1.w_scale": [
512
],
"decoder.layers.1.mlp.linear_fc2.weight": [
128,
256
],
"decoder.layers.1.mlp.linear_fc2.w_scale": [
128
],
"decoder.final_layernorm.weight": [
128
],
"output_layer.weight": [
1000,
128
]
},
"weight_count": 23
},
"configuration_1": {
"weights": {
"embedding.word_embeddings.weight": [
1000,
128
],
"decoder.layers.0.self_attention.qnope_scale": [
4
],
"decoder.layers.0.self_attention.ctkv_scale": [
1
],
"decoder.layers.0.self_attention.linear_proj.weight": [
128,
512
],
"decoder.layers.0.self_attention.linear_proj.w_scale": [
128
],
"decoder.layers.0.self_attention.input_layernorm.weight": [
128
],
"decoder.layers.0.self_attention.linear_qkv_down_proj.weight": [
1088,
128
],
"decoder.layers.0.self_attention.linear_qkv_down_proj.w_scale": [
1088
],
"decoder.layers.0.self_attention.linear_q_up_proj.weight": [
768,
512
],
"decoder.layers.0.self_attention.linear_q_up_proj.w_scale": [
768
],
"decoder.layers.0.self_attention.linear_kv_up_proj.weight": [
1024,
512
],
"decoder.layers.0.self_attention.linear_kv_up_proj.w_scale": [
1024
],
"decoder.layers.0.self_attention.q_layernorm.weight": [
512
],
"decoder.layers.0.self_attention.kv_layernorm.weight": [
512
],
"decoder.layers.0.pre_mlp_layernorm.weight": [
128
],
"decoder.layers.0.mlp.linear_fc1.weight": [
512,
128
],
"decoder.layers.0.mlp.linear_fc1.w_scale": [
512
],
"decoder.layers.0.mlp.linear_fc2.weight": [
128,
256
],
"decoder.layers.0.mlp.linear_fc2.w_scale": [
128
],
"decoder.layers.1.self_attention.linear_proj.weight": [
128,
512
],
"decoder.layers.1.self_attention.linear_proj.w_scale": [
128
],
"decoder.layers.1.self_attention.input_layernorm.weight": [
128
],
"decoder.layers.1.self_attention.linear_qkv_down_proj.weight": [
1088,
128
],
"decoder.layers.1.self_attention.linear_qkv_down_proj.w_scale": [
1088
],
"decoder.layers.1.self_attention.linear_q_up_proj.weight": [
768,
512
],
"decoder.layers.1.self_attention.linear_q_up_proj.w_scale": [
768
],
"decoder.layers.1.self_attention.linear_kv_up_proj.weight": [
1024,
512
],
"decoder.layers.1.self_attention.linear_kv_up_proj.w_scale": [
1024
],
"decoder.layers.1.self_attention.q_layernorm.weight": [
512
],
"decoder.layers.1.self_attention.kv_layernorm.weight": [
512
],
"decoder.layers.1.pre_mlp_layernorm.weight": [
128
],
"decoder.layers.1.mlp.router.weight": [
2,
128
],
"decoder.layers.1.mlp.router.expert_bias": [
2
],
"decoder.layers.1.mlp.experts.weight1": [
2,
128,
256
],
"decoder.layers.1.mlp.experts.weight2": [
2,
256,
64
],
"decoder.layers.1.mlp.experts.linear_fc1.gmm_bias": [
2,
512
],
"decoder.layers.1.mlp.experts.linear_fc1.w_scale": [
2,
1,
512
],
"decoder.layers.1.mlp.experts.linear_fc2.gmm_bias": [
2,
128
],
"decoder.layers.1.mlp.experts.linear_fc2.w_scale": [
2,
2,
128
],
"decoder.layers.1.mlp.shared_experts.linear_fc1.weight": [
1024,
128
],
"decoder.layers.1.mlp.shared_experts.linear_fc1.w_scale": [
1024
],
"decoder.layers.1.mlp.shared_experts.linear_fc2.weight": [
128,
512
],
"decoder.layers.1.mlp.shared_experts.linear_fc2.w_scale": [
128
],
"decoder.final_layernorm.weight": [
128
],
"output_layer.weight": [
1000,
128
]
},
"weight_count": 45
}
},
"metadata": {
"generated_at": "2025-01-28 17:22:25"
}
}