/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
/*!
* \file matmul_server.h
* \brief Declares MatmulService, which applies KFC matmul messages to a wrapped MatmulImpl instance.
*/
#if !defined(__ASCENDC_INCLUDE_INTERNAL_HEADERS__)
#pragma message( \
"impl/adv_api/detail/matmul/kfc/matmul_server.h is an internal header file and must not be used directly. Functions or variables defined in this file may be removed in the future. Please use \"#include \"adv_api/matmul/matmul_client.h\"\" and use public functions or variables defined in interface headers files.")
#define __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#define __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_DETAIL_MATMUL_KFC_MATMUL_SERVER_H__
#endif
#ifndef IMPL_MATMUL_KFC_MATMUL_SERVER_H
#define IMPL_MATMUL_KFC_MATMUL_SERVER_H
#include "kernel_operator_common_intf_impl.h"
#include "kernel_operator_cache_intf.h"
#include "matmul_server_utils.h"
#include "../utils/matmul_config_utils.h"
#include "../utils/matmul_utils.h"
namespace AscendC {
// MatmulService owns a MatmulImpl instance (`mul`) and provides handlers that
// apply the fields of KFC messages (KfcMsg / MatmulConfigParams) to it.
// All template parameters are forwarded unchanged to MatmulImpl.
template <
class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE, const auto& MM_CFG = CFG_NORM,
class MM_CB = MatmulCallBackFunc<nullptr, nullptr, nullptr>, MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)>
class MatmulService {
// Element types taken from the operand descriptors.
using SrcAT = typename A_TYPE::T;
using SrcBT = typename B_TYPE::T;
// SrcT names the same type as SrcAT (both are A_TYPE::T).
using SrcT = typename A_TYPE::T;
using DstT = typename C_TYPE::T;
using BiasT = typename BIAS_TYPE::T;
// Type of the MM_CFG object with its reference and cv-qualifiers removed.
using TILING_TYPE = typename std::remove_cv<typename std::remove_reference<decltype(MM_CFG)>::type>::type;
#if defined(USE_SSBUF)
// Aliases only needed by the USE_SSBUF user-defined-data handlers.
using IMPL = MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY>;
using UserDefDataType = typename MATMUL_POLICY<MM_CFG, IMPL, A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE>::UserDefDataType;
#endif
public:
// Empty constructor: no member is assigned here; InitKfc() sets the state it needs.
__aicore__ inline MatmulService() {}
// Initialises the service for one matmul instance.
//   tpipe     - not referenced by this function body.
//   tiling    - tiling data; when MM_CFG is not a MatmulApiStaticTiling, a null
//               pointer means mul.Init() is skipped here.
//   kfc       - KFC server handle; asserted non-null and stored unless
//               enableMixDualMaster is set.
//   instID    - instance id (asserted >= 0); also copied to devEvtID when
//               enableMixDualMaster is off.
//   workspace - asserted non-null and stored only when USE_WORKSPACE is defined
//               and enableMixDualMaster is off.
template <class T>
__aicore__ inline void InitKfc(TPipe* tpipe, T* tiling, KFC_COMM_SERVER_PTR kfc, int32_t instID, GM_ADDR workspace)
{
    ASSERT(instID >= 0 && "instID should be not less than 0 when init kfc matmul server");
    this->instID = instID;
    if constexpr (!ToMatmulConfig(MM_CFG).enableMixDualMaster) {
        ASSERT(kfc != nullptr && "kfc cannot be nullptr when init kfc matmul server");
        this->kfcCommSrv = kfc;
        mul.SetSubBlockIdx(kfcCommSrv->subBlockID);
#if defined(USE_WORKSPACE)
        ASSERT(workspace != nullptr && "workspace cannot be nullptr when init kfc matmul server");
        this->workspace = workspace;
        if constexpr (!ToMatmulConfig(MM_CFG).enableInit) {
            // Start with no pending org-shape record for either sub-block.
            msgAux.msg0.setOrgShape = false;
            msgAux.msg1.setOrgShape = false;
        }
#endif
        this->devEvtID = instID;
        // Exactly one of A/B is ibShare: sub-block 0 initialises the cache.
        if constexpr ((A_TYPE::ibShare || B_TYPE::ibShare) && !(A_TYPE::ibShare && B_TYPE::ibShare)) {
            if (kfcCommSrv->subBlockID == 0) {
                gCache.Init();
            }
        }
    } else {
        mul.SetSubBlockIdx(0);
    }
    if constexpr (IsSameTypeV<TILING_TYPE, MatmulApiStaticTiling>) {
#if !defined(ASCENDC_CPU_DEBUG) && defined(__CCE_IS_AICORE__)
        if constexpr (IsSameTypeV<T, const __gm__ TCubeTiling>) {
            // The copy is written into the member tmpTiling_ rather than a
            // branch-local object: tiling_ and mul are handed its address, and a
            // local would go out of scope as soon as this branch ends.
            // NOTE(review): assumes MatmulTiling/MatmulImpl may keep the pointer
            // beyond this call - confirm against their definitions.
            CopyTiling<A_TYPE, B_TYPE, MM_CFG>(tiling, tmpTiling_);
            tiling_.SetTiling(&tmpTiling_);
            mul.Init(tiling_.GetTiling(), nullptr);
        } else {
#endif
            tiling_.SetTiling((TCubeTiling*)tiling);
            mul.Init(tiling_.GetTiling(), nullptr);
#if !defined(ASCENDC_CPU_DEBUG) && defined(__CCE_IS_AICORE__)
        }
#endif
#if defined(USE_SSBUF)
        InitL1Addr();
#endif
    } else if (tiling) {
        tiling_.SetTiling((TCubeTiling*)tiling);
        mul.Init(tiling_.GetTiling(), nullptr);
#if defined(USE_SSBUF)
        InitL1Addr();
#endif
    }
}
// Handler invoked by Process() for MMFUN_INIT; only declared here.
__aicore__ inline void Init(MSG_POS KfcMsg* msg);
// Forwards the sub-block index to the wrapped MatmulImpl.
__aicore__ inline void SetSubBlockIdx(uint8_t idx) { mul.SetSubBlockIdx(idx); }
// Handler invoked by Process() for MMFUN_SET_ORG_SHAPE (USE_WORKSPACE builds); only declared here.
__aicore__ inline void SetOrgShape(MSG_POS KfcMsg* msg);
// Applies the singleM/singleN/singleK values carried in msg->body to the
// wrapped MatmulImpl. Messages whose setTail flag is clear are ignored.
__aicore__ inline void SetSingleShape(__gm__ KfcMsg* msg)
{
    if (!msg->body.setTail) {
        return;
    }
    mul.SetSingleShape(msg->body.singleM, msg->body.singleN, msg->body.singleK);
}
// Passes the singleM/singleN/singleK values of `body` to mul.SetTail().
// Nothing happens when body->setTail is clear.
__aicore__ inline void SetTail(MsgTmpPos MatmulConfigParams* body)
{
    if (!body->setTail) {
        return;
    }
    mul.SetTail(body->singleM, body->singleN, body->singleK);
}
// Forwards the HF32 enable flag and transfer mode from msg->body to the
// wrapped MatmulImpl, converted to bool and int32_t respectively.
__aicore__ inline void SetHF32(MSG_POS KfcMsg* msg)
{
    const bool hf32Enabled = static_cast<bool>(msg->body.enHF32);
    const int32_t transMode = static_cast<int32_t>(msg->body.hf32TransMode);
    mul.SetHF32(hf32Enabled, transMode);
}
// Tensor A setters; only declared here. IterateSetMessage() calls the first
// overload, and its batch overload calls the (body, size, offset) form.
__aicore__ inline void SetTensorA(MsgTmpPos MatmulConfigParams* body);
__aicore__ inline void SetTensorA(MsgTmpPos MatmulConfigParams* body, const uint64_t size, const uint64_t offset);
// Applies the quantisation parameters found in `body` to the wrapped
// MatmulImpl. Does nothing when body->setQuant is clear.
// body->quantMode selects the source:
//   1 - scalar taken from body->quantScalar
//   2 - GlobalTensor<uint64_t> over body->quantAddr with body->quantSize elements
//   3 - LocalTensor<uint64_t> at TPosition::TSCM built from body->quantAddr
// Any other mode is ignored.
__aicore__ inline void SetQuantVector(MsgTmpPos MatmulConfigParams* body)
{
    if (!body->setQuant) {
        return;
    }
    const int mode = body->quantMode;
    switch (mode) {
        case 1: {
            uint64_t scalar = body->quantScalar;
            mul.SetQuantScalar(scalar);
            break;
        }
        case 2: {
            const uint64_t elemCount = static_cast<uint64_t>(body->quantSize);
            GlobalTensor<uint64_t> gmQuant;
            gmQuant.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(body->quantAddr), elemCount);
            mul.SetQuantVector(gmQuant);
            break;
        }
        case 3: {
            const uint64_t elemCount = static_cast<uint64_t>(body->quantSize);
            LocalTensor<uint64_t> l1Quant = GetLocalTensor<uint64_t, TPosition::TSCM>(body->quantAddr, elemCount);
            mul.SetQuantVector(l1Quant);
            break;
        }
        default:
            break;
    }
}
// Forwards body->batchA / body->batchB to mul.SetBatchNum().
// Returns immediately when A_TYPE::layout is LayoutMode::NONE, and does
// nothing when body->setBatch is clear.
__aicore__ inline void SetBatchNum(MsgTmpPos MatmulConfigParams* body)
{
    if constexpr (A_TYPE::layout == LayoutMode::NONE) {
        return;
    }
    if (body->setBatch) {
        mul.SetBatchNum(body->batchA, body->batchB);
    }
}
#if defined(USE_SSBUF)
// Handles a user-defined-info message (USE_SSBUF variant).
// When msg->userCustomData == 1 the tiling pointer in msg->userDefInfo is
// forwarded to mul.SetUserDefInfo(). Otherwise the leading
// sizeof(UserDefDataType) / 4 32-bit words of msg->body are copied into a
// UserDefDataType value, which is handed to mul.SetSelfDefineData().
__aicore__ inline void SetUserDefInfo(MSG_POS KfcMsg* msg)
{
    if (msg->userCustomData == 1) {
        mul.SetUserDefInfo(msg->userDefInfo.tilingPtr);
        return;
    }
    // Value-initialise so bytes not covered by the word copy below (when the
    // size is not a multiple of 4) are never indeterminate.
    UserDefDataType userData{};
    // Unsigned word count avoids comparing a signed index against size_t.
    constexpr uint32_t wordCount = static_cast<uint32_t>(sizeof(UserDefDataType) / sizeof(uint32_t));
    uint32_t* dst = reinterpret_cast<uint32_t*>(&userData);
    MSG_POS uint32_t* src = reinterpret_cast<MSG_POS uint32_t*>(&(msg->body));
    for (uint32_t i = 0; i < wordCount; ++i) {
        *(dst + i) = *(src + i);
    }
    mul.SetSelfDefineData(userData);
}
// Rebuilds the user-defined data from the message and passes it to
// mul.SetSelfDefineData() (USE_SSBUF variant).
// Nothing happens when body->userInfoType is 0.
// For a 4-byte UserDefDataType the value is msg->userCustomData. For an
// 8-byte type the first word is msg->userCustomData and the second word is
// body->userCustomData.
__aicore__ inline void SetSelfDefineData(MSG_POS KfcMsg* msg, MsgTmpPos MatmulConfigParams* body)
{
    if (body->userInfoType == 0) {
        return;
    }
    // Value-initialise: for any size other than 4 or 8 no branch below writes
    // the object, and it must not reach SetSelfDefineData() uninitialised.
    UserDefDataType userData{};
    uint32_t* ptr = reinterpret_cast<uint32_t*>(&userData);
    if constexpr (sizeof(UserDefDataType) == 4) {
        *ptr = msg->userCustomData;
    } else if constexpr (sizeof(UserDefDataType) == 8) {
        *ptr = msg->userCustomData;
        *(ptr + 1) = body->userCustomData;
    }
    mul.SetSelfDefineData(userData);
}
#else
// Non-USE_SSBUF variant: passes msg->body.dataPtr to mul.SetSelfDefineData().
// A cache clean-and-invalidate is issued on the message before dataPtr is read.
__aicore__ inline void SetSelfDefineData(__gm__ KfcMsg* msg)
{
GlobalTensor<int64_t> msgGlobal;
// NOTE(review): the addition is int64_t pointer arithmetic, so the buffer starts
// sizeof(int64_t) elements (64 bytes) past `msg` - presumably a later cache line
// of the message; confirm against the KfcMsg layout.
msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t));
DataCacheCleanAndInvalid<int64_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(msgGlobal);
mul.SetSelfDefineData(msg->body.dataPtr);
// When enableReuse is off, the cache line at dataPtr is also cleaned and invalidated.
if constexpr (!ToMatmulConfig(MM_CFG).enableReuse) {
GlobalTensor<uint32_t> dataGlobal;
dataGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t*>(msg->body.dataPtr));
DataCacheCleanAndInvalid<uint32_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(dataGlobal);
}
}
// Non-USE_SSBUF variant: hands the tiling pointer stored in msg->userDefInfo
// to the wrapped MatmulImpl.
__aicore__ inline void SetUserDefInfo(__gm__ KfcMsg* msg)
{
    mul.SetUserDefInfo(msg->userDefInfo.tilingPtr);
}
#endif
// Tensor B and bias setters; only declared here. The overloads taking a
// size/offset are the ones used by the batch form of IterateSetMessage().
__aicore__ inline void SetTensorB(MsgTmpPos MatmulConfigParams* body);
__aicore__ inline void SetTensorB(MsgTmpPos MatmulConfigParams* body, const uint64_t size, const uint64_t offset);
__aicore__ inline void SetBias(MsgTmpPos MatmulConfigParams* body);
__aicore__ inline void SetBias(MsgTmpPos MatmulConfigParams* body, const uint64_t offset);
// Handler invoked by Process() for MMFUN_GET_TENSOR_C; only declared here.
__aicore__ inline bool GetTensorC(MSG_POS KfcMsg* msg);
// Returns the instance id recorded by InitKfc().
__aicore__ inline uint16_t GetInstID() { return instID; }
// Applies the per-iteration settings carried by a message to the wrapped
// MatmulImpl.
// In USE_WORKSPACE builds with enableInit off, the org shape recorded in
// msgAux for the current sub-block (0 or 1) is applied first, if one is set.
// The remaining setters run only when body->isFirstIter is set, and each one
// is compiled in only if the matching MM_CFG switch enables it.
__aicore__ inline void IterateSetMessage(MSG_POS KfcMsg* msg, MsgTmpPos MatmulConfigParams* body)
{
#if defined(USE_WORKSPACE)
    if constexpr (!ToMatmulConfig(MM_CFG).enableInit) {
        if (mul.GetSubBlockIdx() == 0 && msgAux.msg0.setOrgShape) {
            auto& shape0 = msgAux.msg0;
            mul.SetOrgShape(shape0.orgM, shape0.orgN, shape0.orgKa, shape0.orgKb, shape0.orgKc);
        } else if (mul.GetSubBlockIdx() == 1 && msgAux.msg1.setOrgShape) {
            auto& shape1 = msgAux.msg1;
            mul.SetOrgShape(shape1.orgM, shape1.orgN, shape1.orgKa, shape1.orgKb, shape1.orgKc);
        }
    }
#endif
    if (!body->isFirstIter) {
        return;
    }

    SetTensorA(body);
    SetTensorB(body);
    if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) {
        SetBias(body);
    }
    if constexpr (ToMatmulConfig(MM_CFG).enableSetTail) {
        SetTail(body);
    }
    if constexpr (ToMatmulConfig(MM_CFG).enableQuantVector) {
        SetQuantVector(body);
    }

    // The batch count is only relevant for the batch iterate modes on a
    // layout other than NONE.
    constexpr bool batchModeEnabled =
        ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) ||
        ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0);
    if constexpr (batchModeEnabled && (A_TYPE::layout != LayoutMode::NONE)) {
        SetBatchNum(body);
    }

#if defined(USE_SSBUF)
    if constexpr (ToMatmulConfig(MM_CFG).enableSetOrgShape) {
        if (body->setOrgShape) {
            mul.SetOrgShape(body->orgM, body->orgN, body->orgKa, body->orgKb, body->orgKc);
        }
    }
    if constexpr (HasScalePosition<A_TYPE>::value) {
        SetTensorScaleA(*body);
    }
    if constexpr (HasScalePosition<B_TYPE>::value) {
        SetTensorScaleB(*body);
    }
    if constexpr (ToMatmulConfig(MM_CFG).enableSetDefineData) {
        SetSelfDefineData(msg, body);
    }
#else
    if constexpr (ToMatmulConfig(MM_CFG).enableSetDefineData) {
        SetSelfDefineData(msg);
    }
#endif
}
// Batch form of IterateSetMessage(): on the first iteration it sets tensor A,
// tensor B and bias using the supplied sizes/offsets, then tail, quant
// parameters and - for layouts other than NONE - the batch numbers.
// Nothing is done when body->isFirstIter is clear.
__aicore__ inline void IterateSetMessage(
    MsgTmpPos MatmulConfigParams* body, const uint64_t batchASize, const uint64_t batchBSize,
    const uint64_t offsetA = 0, const uint64_t offsetB = 0, const uint64_t offsetBias = 0)
{
    if (!body->isFirstIter) {
        return;
    }
    SetTensorA(body, batchASize, offsetA);
    SetTensorB(body, batchBSize, offsetB);
    SetBias(body, offsetBias);
    SetTail(body);
    SetQuantVector(body);
    if constexpr (A_TYPE::layout != LayoutMode::NONE) {
        SetBatchNum(body);
    }
}
// Iterate-family handlers and helpers; only declared here.
// Process() calls IterateBatch() for MMFUN_ITERATE_BATCH_ALL, IterateNBatch()
// for MMFUN_ITERATE_N_BATCH_ALL and Iterate() for function ids that carry all
// MMFUN_MASK bits.
__aicore__ inline bool IterateBatch(MSG_POS KfcMsg* msg);
__aicore__ inline void StartIterateNBatch(MsgTmpPos MatmulConfigParams* body, uint32_t& cntIterator);
__aicore__ inline bool IterateNBatch(MSG_POS KfcMsg* msg);
__aicore__ inline void GetOffsetSize(
MsgTmpPos MatmulConfigParams* body, KFC_Enum funID, uint32_t sync, uint64_t& offsetSize,
uint32_t& enSequentialWrite, bool hasSetWorkspace = false);
__aicore__ inline bool StartIterate(
MsgTmpPos MatmulConfigParams* body, KFC_Enum funID, uint32_t sync, uint32_t& cntIterator);
__aicore__ inline bool Iterate(MSG_POS KfcMsg* msg, KFC_Enum funID);
#if defined(__ASCENDC_ENABLE_SUPER_KERNEL__)
// Only available when __ASCENDC_ENABLE_SUPER_KERNEL__ is defined.
__aicore__ inline void SuperKernelEventCount(uint16_t eventID);
#endif
// Issues a cache clean-and-invalidate on the message for the source/destination
// type pairs listed below. For every other pair the function is empty.
//   - SrcT int4b_t or int8_t   with DstT half
//   - SrcT half or bfloat16_t  with DstT int8_t
//   - SrcT int8_t              with DstT uint8_t or int8_t
__aicore__ inline void QuantCacheRefresh(__gm__ KfcMsg* msg)
{
    constexpr bool srcInt4 = IsSameType<SrcT, int4b_t>::value;
    constexpr bool srcInt8 = IsSameType<SrcT, int8_t>::value;
    constexpr bool srcHalf = IsSameType<SrcT, half>::value;
    constexpr bool srcBf16 = IsSameType<SrcT, bfloat16_t>::value;
    constexpr bool dstHalf = IsSameType<DstT, half>::value;
    constexpr bool dstInt8 = IsSameType<DstT, int8_t>::value;
    constexpr bool dstUint8 = IsSameType<DstT, uint8_t>::value;

    constexpr bool intToHalf = (srcInt4 || srcInt8) && dstHalf;
    constexpr bool floatToInt8 = (srcHalf || srcBf16) && dstInt8;
    constexpr bool int8ToByte = srcInt8 && (dstUint8 || dstInt8);

    if constexpr (intToHalf || floatToInt8 || int8ToByte) {
        GlobalTensor<int64_t> msgView;
        // int64_t pointer arithmetic: the view starts sizeof(int64_t) elements past msg.
        msgView.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t));
        DataCacheCleanAndInvalid<int64_t, CacheLine::SINGLE_CACHE_LINE, DcciDst::CACHELINE_OUT>(msgView);
    }
}
// Iterate handler used when MM_CFG enables intraBlockPartSum.
// Always returns true. For layouts other than NONE it returns immediately.
// Otherwise the message settings are applied, and only when the current
// sub-block index is not 0 does it run mul.IterateAll() into the C buffer at
// msg->body.cAddr, notify the two events derived from devEvtID when
// sync/waitIterateAll is set, and call mul.End() unless the message is a
// fake iterate message.
__aicore__ inline bool IterateIntraBlockPartSum(MSG_POS KfcMsg* msg, KFC_Enum funID)
{
    if constexpr (A_TYPE::layout != LayoutMode::NONE) {
        return true;
    }
    QuantCacheRefresh(msg);
    IterateSetMessage(msg, &(msg->body));
    if (mul.GetSubBlockIdx() == 0) {
        return true;
    }

    // Element count of the output buffer: compile-time singleCoreMN when it
    // is non-zero, otherwise singleCoreM * singleCoreN from the tiling.
    uint64_t outElemCount;
    if constexpr (ToMatmulConfig(MM_CFG).singleCoreMN != 0) {
        outElemCount = ToMatmulConfig(MM_CFG).singleCoreMN;
    } else {
        outElemCount = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN();
    }

    GlobalTensor<DstT> outGlobal;
    outGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), outElemCount);
    mul.IterateAll(
        outGlobal, msg->body.enAtomic, msg->body.enSequentialWrite, msg->body.waitIterateAll,
        msg->body.iterateFakeMsg);

    if (msg->body.sync || msg->body.waitIterateAll) {
        ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL);
        uint16_t firstEvent = static_cast<uint16_t>(this->devEvtID * 2 + 0);
        uint16_t secondEvent = static_cast<uint16_t>(this->devEvtID * 2 + 1);
        NotifyEvent<PIPE_FIX>(firstEvent);
        NotifyEvent<PIPE_FIX>(secondEvent);
    }
    if (!msg->body.iterateFakeMsg) {
        mul.End();
    }
    TRACE_STOP(TraceId::MatMul_CALC);
    return true;
}
// Compile-time property of MM_CFG: true when enableInit is off or
// enableMixDualMaster is on.
__aicore__ inline bool IsSharedObj()
{
    constexpr bool sharedObj =
        !ToMatmulConfig(MM_CFG).enableInit || ToMatmulConfig(MM_CFG).enableMixDualMaster;
    return sharedObj;
}
// Compile-time property of MM_CFG: true exactly when enableMixDualMaster is on.
__aicore__ inline bool IsEnableMixHdAbility()
{
    constexpr bool mixDualMaster = ToMatmulConfig(MM_CFG).enableMixDualMaster;
    return mixDualMaster;
}
// Decides whether the message for `subBlockID` should be skipped.
// Returns true to skip. `lastMsgId` is updated to track which sub-block was
// seen last, and `freeMsg` is cleared when the same sub-block repeats.
// USE_SSBUF builds: enableHardPoll == 1 never skips. When both A and B are
// ibShare, a repeat of the same sub-block or a message from sub-block 1 is
// skipped.
// Other builds: when both A and B are ibShare nothing is skipped.
// Remaining configurations: only MMFUN_ITERATE_ALL can be skipped, and only
// when one of A/B is ibShare or intraBlockPartSum is enabled.
template <uint8_t enableHardPoll = 0>
__aicore__ inline bool SkipMsg(KFC_Enum funID, bool& freeMsg, int& lastMsgId, const int subBlockID)
{
    constexpr bool bothIbShare = A_TYPE::ibShare && B_TYPE::ibShare;
#if defined(USE_SSBUF)
    if constexpr (enableHardPoll == 1) {
        return false;
    }
    if constexpr (bothIbShare) {
        if (lastMsgId == subBlockID) {
            freeMsg = false;
            return true;
        }
        if (subBlockID == 1) {
            lastMsgId = 1;
            return true;
        }
        lastMsgId = 0;
        return false;
    }
#else
    if constexpr (bothIbShare) {
        return false;
    }
#endif
    if constexpr (A_TYPE::ibShare || B_TYPE::ibShare || ToMatmulConfig(MM_CFG).intraBlockPartSum) {
        if (funID != KFC_Enum::MMFUN_ITERATE_ALL) {
            return false;
        }
        if (lastMsgId == subBlockID) {
            freeMsg = false;
            return true;
        }
        lastMsgId = subBlockID;
        return false;
    } else {
        return false;
    }
}
// Returns true when the message queue should be locked for this message.
// USE_SSBUF builds: true for an MMFUN_ITERATE message whose body.cAddr is 0
// (unless both A and B are ibShare), and true whenever enableHardPoll == 1.
// Other builds: true only when both A and B are ibShare.
// freeMsg, lastMsgId and subBlockID are not used by this function.
// NOTE(review): `msg` defaults to nullptr but is dereferenced without a check
// on the MMFUN_ITERATE path in USE_SSBUF builds - callers presumably always
// pass it there; confirm.
template <uint8_t enableHardPoll = 0>
__aicore__ inline bool LockMsgQueue(
    KFC_Enum funID, bool& freeMsg, int& lastMsgId, const int subBlockID, MSG_POS KfcMsg* msg = nullptr)
{
    constexpr bool bothIbShare = A_TYPE::ibShare && B_TYPE::ibShare;
#if defined(USE_SSBUF)
    if constexpr (!bothIbShare) {
        if (funID == KFC_Enum::MMFUN_ITERATE && msg->body.cAddr == 0) {
            return true;
        }
    }
    if constexpr (enableHardPoll == 1) {
        return true;
    }
#else
    if constexpr (bothIbShare) {
        return true;
    }
#endif
    return false;
}
// Dispatches one KFC message to the handler selected by funID.
// Each handler is compiled in only when the matching MM_CFG switch enables it.
// Returns the handler's result for the iterate and GetTensorC handlers, and
// true in every other case.
__aicore__ inline bool Process(MSG_POS KfcMsg* msg, KFC_Enum funID)
{
// Function ids with all MMFUN_MASK bits set go to the iterate handlers.
if constexpr (
((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) ||
((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0)) {
if ((static_cast<uint16_t>(funID) & static_cast<uint16_t>(KFC_Enum::MMFUN_MASK)) ==
static_cast<uint16_t>(KFC_Enum::MMFUN_MASK)) {
if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) {
// Without USE_WORKSPACE this branch is empty and control falls through to the checks below.
#if defined(USE_WORKSPACE)
return IterateIntraBlockPartSum(msg, funID);
#endif
} else {
return Iterate(msg, funID);
}
}
}
if constexpr (
((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) &&
(A_TYPE::layout != LayoutMode::NONE)) {
if (funID == KFC_Enum::MMFUN_ITERATE_BATCH_ALL) {
return IterateBatch(msg);
}
}
if constexpr (ToMatmulConfig(MM_CFG).enableEnd) {
// NOTE(review): no return here - after mul.End() the remaining checks still run
// and the function ends at the final ASSERT / return true.
if (funID == KFC_Enum::MMFUN_END) {
mul.End();
}
}
if constexpr (ToMatmulConfig(MM_CFG).enableGetTensorC) {
if (funID == KFC_Enum::MMFUN_GET_TENSOR_C) {
return GetTensorC(msg);
}
}
#if defined(USE_WORKSPACE)
if constexpr (ToMatmulConfig(MM_CFG).enableSetOrgShape) {
if (funID == KFC_Enum::MMFUN_SET_ORG_SHAPE) {
SetOrgShape(msg);
return true;
}
}
#endif
if constexpr (ToMatmulConfig(MM_CFG).enableInit) {
if (funID == KFC_Enum::MMFUN_INIT) {
Init(msg);
return true;
}
}
if constexpr (
((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0) &&
(A_TYPE::layout != LayoutMode::NONE)) {
if (funID == KFC_Enum::MMFUN_ITERATE_N_BATCH_ALL) {
return IterateNBatch(msg);
}
}
if constexpr (ToMatmulConfig(MM_CFG).enableSetDefineData) {
if (funID == KFC_Enum::MMFUN_SET_USER_DEF_INFO) {
SetUserDefInfo(msg);
return true;
}
}
if (funID == KFC_Enum::MMFUN_SET_HF32) {
SetHF32(msg);
return true;
}
// NOTE(review): the ASSERT argument is a string literal, i.e. a non-null pointer,
// so the condition is presumably always true and this assertion cannot fire.
// `ASSERT(false && "...")` may have been intended, but the MMFUN_END path above
// also reaches this line, so confirm before changing it.
ASSERT("illegal function ID.");
return true;
}
// Builds a LocalTensor<Dtype> whose buffer descriptor has logical position
// Tpos and buffer address `addr`.
// `size` (in elements) is only used in ASCENDC_CPU_DEBUG builds, where the
// descriptor's dataLen and absAddr are filled in as well. For TSCM under
// USE_SSBUF, `addr` is masked to its low 32 bits and dataLen is
// TOTAL_L1_SIZE * sizeof(Dtype).
template <class Dtype, TPosition Tpos>
__aicore__ LocalTensor<Dtype> GetLocalTensor(uint64_t addr, const uint64_t size)
{
    LocalTensor<Dtype> tensor;
    TBuffAddr bufDesc;
    bufDesc.logicPos = static_cast<uint8_t>(Tpos);
    bufDesc.bufferAddr = addr;
#if ASCENDC_CPU_DEBUG
#if defined(USE_SSBUF)
    if constexpr (Tpos == TPosition::TSCM) {
        addr &= 0xffffffff;
        bufDesc.dataLen = TOTAL_L1_SIZE * sizeof(Dtype);
    } else {
        bufDesc.dataLen = size * sizeof(Dtype);
    }
#else
    bufDesc.dataLen = size * sizeof(Dtype);
#endif
    if constexpr (PhyPosIsUB(Tpos)) {
        bufDesc.absAddr = reinterpret_cast<uint8_t*>(addr);
    } else if constexpr (Tpos == TPosition::TSCM) {
        bufDesc.absAddr = GetTPipePtr()->GetBaseAddr(static_cast<uint8_t>(Tpos)) + addr;
    }
#endif
    tensor.SetAddr(bufDesc);
    return tensor;
}
#if defined(USE_SSBUF)
// USE_SSBUF-only helpers; only declared here.
// InitL1Addr() is called from InitKfc() after mul.Init(). SetTensorScaleA/B()
// are called from IterateSetMessage() when the operand type has a scale position.
__aicore__ inline void GetMsgFromSSbuf(MSG_POS KfcMsg* msg, MatmulConfigParams& body);
__aicore__ inline void InitL1Addr();
__aicore__ inline void CopyL1Addr2SSBUF(MSG_POS MsgMatmulL1Addr* matmulL1AddrMsg_, MatrixL1Addr* matrixL1Addr_);
__aicore__ inline void WaitAB(MatmulConfigParams& body);
__aicore__ inline void IterNotify();
__aicore__ inline void SetTensorScaleA(MatmulConfigParams& body);
__aicore__ inline void SetTensorScaleB(MatmulConfigParams& body);
#endif
public:
// Wrapped matmul implementation that the handlers of this class forward to.
MatmulImpl<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, MATMUL_POLICY> mul;
private:
// Assigned in InitKfc() only in USE_WORKSPACE builds with enableMixDualMaster off.
GM_ADDR workspace;
// Assigned in InitKfc() only when enableMixDualMaster is off.
KFC_COMM_SERVER_PTR kfcCommSrv;
// Tiling handle; InitKfc() sets it and passes tiling_.GetTiling() to mul.Init().
MatmulTiling<MM_CFG> tiling_;
// TCubeTiling storage owned by the service.
// NOTE(review): its users are not fully visible in this header; confirm against the out-of-class definitions.
TCubeTiling tmpTiling_;
// Initialised from InitKfc() when exactly one of A/B is ibShare and the sub-block id is 0.
typename IBShareCache<IsIBShare<A_TYPE, B_TYPE>()>::ShareCache gCache;
// Per-sub-block (msg0 / msg1) org-shape records read by IterateSetMessage() in
// USE_WORKSPACE builds when enableInit is off.
typename ShareMatmulAux<!ToMatmulConfig(MM_CFG).enableInit>::MSG msgAux;
public:
// Instance id recorded by InitKfc(); returned by GetInstID().
uint16_t instID;
private:
// Set to instID in InitKfc() when enableMixDualMaster is off; the event ids
// devEvtID * 2 and devEvtID * 2 + 1 are derived from it.
uint16_t devEvtID;
#if defined(USE_SSBUF)
// Not referenced by the code visible in this header section.
uint8_t enPartialSum_;
uint8_t isSyncIterate_;
#endif
};
}
#endif
#if defined(__UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_DETAIL_MATMUL_KFC_MATMUL_SERVER_H__)
#undef __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#undef __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_DETAIL_MATMUL_KFC_MATMUL_SERVER_H__
#endif