/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
/*!
* \file deepseek_spec.h
* \brief Static key/value configuration tables for DeepSeek models.
*/
#pragma once
#include <map>
#include <string>
#include <variant>
/**
 * \brief DeepSeek configuration table keyed by option name.
 *
 * Every value is stored in a std::variant whose active alternative is one of
 * bool, int, float or std::string; readers are expected to pick the matching
 * alternative (e.g. std::get<int>(deepseekConfig1.at("hiddenSize"))).
 *
 * Text values are wrapped in std::string(...) on purpose. With a bare string
 * literal, standard libraries that predate P0608R3 select the `bool`
 * alternative (pointer -> bool is a built-in conversion and outranks the
 * user-defined conversion to std::string), so the text would be silently
 * replaced by `true`. The explicit std::string makes the selection an exact
 * match on every toolchain.
 *
 * The table is `static`, so every translation unit that includes this header
 * gets its own private, mutable copy.
 *
 * NOTE(review): "attention_bias" is the only snake_case key; it is left as is
 * because callers may look it up by this exact spelling.
 * NOTE(review): numKeyValueHeads (128) is larger than numAttentionHeads (2);
 * presumably a deliberately reduced test shape - confirm.
 */
static std::map<std::string, std::variant<bool, int, float, std::string>> deepseekConfig1 = {
    {"architectures", std::string("DeepseekForCausalLM")},
    {"attention_bias", false},
    {"attentionDropout", 0},
    // NOTE(review): the value looks like a variable name rather than a class name - confirm.
    {"AutoConfig", std::string("g_deepseekConfig")},
    {"AutoModel", std::string("DeepseekModel")},
    {"AutoModelForCausalLM", std::string("DeepseekForCausalLM")},
    {"auxLossAlpha", 0.001f},
    {"bosTokenId", 100000},
    {"eosTokenId", 100001},
    {"epSize", 1},
    {"firstKDenseReplace", 3},
    {"hiddenAct", std::string("silu")},
    {"initializerRange", 0.02f},
    {"intermediateSize", 18432},
    {"lmHead", false},
    {"maxPositionEmbeddings", 4096},
    {"modelType", std::string("deepseek_v3")},
    {"moeIntermediateSize", 2048},
    {"moeLayerFreq", 1},
    {"nGroup", 8},
    {"nRoutedExperts", 256},
    {"nSharedExperts", 1},
    {"normTopkProb", true},
    {"numExpertsPerTok", 8},
    {"numHiddenLayers", 61},
    {"numKeyValueHeads", 128},
    {"pretrainingTp", 1},
    {"numAttentionHeads", 2},
    {"hiddenSize", 256},
    {"qLoraRank", 512},
    {"kvLoraRank", 512},
    {"qkNopeHeadDim", 128},
    {"qkRopeHeadDim", 64},
    {"rmHead", false},
    {"rmsNormEps", 1e-06f},
    {"ropeScaling", 1},
    {"ropeTheta", 10000},
    {"routedScalingFactor", 2.5f},
    {"scoringFunc", std::string("sigmoid")},
    {"seqAux", true},
    {"tieWordEmbeddings", false},
    {"topkGroup", 4},
    {"topkMethod", std::string("noaux_tc")},
    {"torchDtype", std::string("bfloat16")},
    {"transformersVersion", std::string("4.33.1")},
    {"useCache", true},
    {"vHeadDim", 128},
    {"vocabSize", 129280},
    {"fp8Format", std::string("e4m3")},
    {"initFp8Params", true},
};
/**
 * \brief DeepSeek configuration table with reduced feed-forward sizes.
 *
 * Same key set as deepseekConfig1; the entries that differ are
 * "intermediateSize" and "moeIntermediateSize", which are both 512 here.
 *
 * Every value is stored in a std::variant whose active alternative is one of
 * bool, int, float or std::string; readers are expected to pick the matching
 * alternative (e.g. std::get<int>(deepseekConfigMoE.at("hiddenSize"))).
 *
 * Text values are wrapped in std::string(...) on purpose. With a bare string
 * literal, standard libraries that predate P0608R3 select the `bool`
 * alternative (pointer -> bool is a built-in conversion and outranks the
 * user-defined conversion to std::string), so the text would be silently
 * replaced by `true`. The explicit std::string makes the selection an exact
 * match on every toolchain.
 *
 * The table is `static`, so every translation unit that includes this header
 * gets its own private, mutable copy.
 *
 * NOTE(review): "attention_bias" is the only snake_case key; it is left as is
 * because callers may look it up by this exact spelling.
 * NOTE(review): numKeyValueHeads (128) is larger than numAttentionHeads (2);
 * presumably a deliberately reduced test shape - confirm.
 */
static std::map<std::string, std::variant<bool, int, float, std::string>> deepseekConfigMoE = {
    {"architectures", std::string("DeepseekForCausalLM")},
    {"attention_bias", false},
    {"attentionDropout", 0},
    // NOTE(review): the value looks like a variable name rather than a class name - confirm.
    {"AutoConfig", std::string("g_deepseekConfig")},
    {"AutoModel", std::string("DeepseekModel")},
    {"AutoModelForCausalLM", std::string("DeepseekForCausalLM")},
    {"auxLossAlpha", 0.001f},
    {"bosTokenId", 100000},
    {"eosTokenId", 100001},
    {"epSize", 1},
    {"firstKDenseReplace", 3},
    {"hiddenAct", std::string("silu")},
    {"initializerRange", 0.02f},
    {"intermediateSize", 512},
    {"lmHead", false},
    {"maxPositionEmbeddings", 4096},
    {"modelType", std::string("deepseek_v3")},
    {"moeIntermediateSize", 512},
    {"moeLayerFreq", 1},
    {"nGroup", 8},
    {"nRoutedExperts", 256},
    {"nSharedExperts", 1},
    {"normTopkProb", true},
    {"numExpertsPerTok", 8},
    {"numHiddenLayers", 61},
    {"numKeyValueHeads", 128},
    {"pretrainingTp", 1},
    {"numAttentionHeads", 2},
    {"hiddenSize", 256},
    {"qLoraRank", 512},
    {"kvLoraRank", 512},
    {"qkNopeHeadDim", 128},
    {"qkRopeHeadDim", 64},
    {"rmHead", false},
    {"rmsNormEps", 1e-06f},
    {"ropeScaling", 1},
    {"ropeTheta", 10000},
    {"routedScalingFactor", 2.5f},
    {"scoringFunc", std::string("sigmoid")},
    {"seqAux", true},
    {"tieWordEmbeddings", false},
    {"topkGroup", 4},
    {"topkMethod", std::string("noaux_tc")},
    {"torchDtype", std::string("bfloat16")},
    {"transformersVersion", std::string("4.33.1")},
    {"useCache", true},
    {"vHeadDim", 128},
    {"vocabSize", 129280},
    {"fp8Format", std::string("e4m3")},
    {"initFp8Params", true},
};