* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file kernel_operator_common_intf_impl.h
* \brief
*/
#ifndef ASCENDC_MODULE_OPERATOR_COMMON_INTERFACE_IMPL_H
#define ASCENDC_MODULE_OPERATOR_COMMON_INTERFACE_IMPL_H
#include "kernel_tensor.h"
#include "kernel_struct_mm.h"
* ingroup:SetAtomicAdd
* brief:Set the next data from UB to the outside of AI Core whether the move write Tensor operation performs
* atomic accumulation.
*/
#if __NPU_ARCH__ == 1001
#include "dav_c100/kernel_operator_set_atomic_impl.h"
#include "dav_c100/kernel_operator_common_impl.h"
#include "dav_c100/kernel_operator_vec_duplicate_impl.h"
#include "dav_c100/kernel_operator_sync_impl.h"
#elif __NPU_ARCH__ == 2002
#include "dav_m200/kernel_operator_set_atomic_impl.h"
#include "dav_m200/kernel_operator_common_impl.h"
#include "dav_m200/kernel_operator_vec_duplicate_impl.h"
#include "dav_m200/kernel_operator_sync_impl.h"
#elif __NPU_ARCH__ == 2201
#include "dav_c220/kernel_operator_set_atomic_impl.h"
#include "dav_c220/kernel_operator_common_impl.h"
#include "dav_c220/kernel_operator_sync_impl.h"
#include "dav_c220/kernel_operator_vec_duplicate_impl.h"
#include "dav_c220/kfc/kfc_comm_client.h"
#include "dav_c220/kfc/kfc_comm_server.h"
#include "dav_c220/core_mng/roc/kernel_operator_cube_group_handle_impl.h"
#include "dav_c220/core_mng/roc/kernel_operator_group_barrier_impl.h"
#elif __NPU_ARCH__ == 3002
#include "dav_m300/kernel_operator_set_atomic_impl.h"
#elif __NPU_ARCH__ == 3003
#include "dav_l300/kernel_operator_sync_impl.h"
#include "dav_l300/kernel_operator_set_atomic_impl.h"
#include "dav_c310/kernel_operator_common_impl.h"
#elif __NPU_ARCH__ == 3102
#include "dav_m310/kernel_operator_set_atomic_impl.h"
#elif __NPU_ARCH__ == 3101
#include "dav_c310/kernel_operator_set_atomic_impl.h"
#include "dav_c310/kernel_operator_common_impl.h"
#include "dav_c310/kernel_operator_sync_impl.h"
#if KFC_C310_SSBUF == 1
#include "dav_c310/kfc/kfc_comm_client.h"
#include "dav_c310/kfc/kfc_comm_server.h"
#else
#include "dav_c310/kfc/kfc_comm_client_gm.h"
#include "dav_c310/kfc/kfc_comm_server_gm.h"
#endif
#if __NPU_ARCH__ == 3101
#include "dav_c310/core_mng/roc/kernel_operator_cube_group_handle_impl.h"
#include "dav_c310/core_mng/roc/kernel_operator_group_barrier_impl.h"
#endif
#elif (__NPU_ARCH__ == 5102)
#include "dav_m510/kernel_operator_set_atomic_impl.h"
#include "dav_m510/kernel_operator_common_impl.h"
#include "dav_m510/kernel_operator_sync_impl.h"
#elif (__NPU_ARCH__ == 3113)
#include "dav_l311/kernel_operator_sync_impl.h"
#include "dav_l311/kernel_operator_set_atomic_impl.h"
#include "dav_c310/kernel_operator_common_impl.h"
#endif
#include "kernel_pop_stack_buffer.h"
namespace AscendC {
* @ingroup:IBSet, IBWait
* @brief:Set the flag bit of a core
* @param [in] gmWorkspace GlobalTensor to store core state
* @param [in] ubWorkspce LocalTensor for current core
* @param [in] blockIdx the idx number waiting for the core
* @param [in] eventID Set and wait events
*/
__aicore__ inline int64_t GetBlockNum();
template <bool isAIVOnly>
__aicore__ inline void IBSet(const GlobalTensor<int32_t> &gmWorkspace,
const LocalTensor<int32_t> &ubWorkspace, int32_t blockIdx, int32_t eventID)
{
int32_t blockNum = GetBlockNum();
#if (__NPU_ARCH__ == 2201) || (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
#if (__NPU_ARCH__ != 5102)
if ASCEND_IS_AIC {
return;
}
#endif
if (!isAIVOnly) {
blockNum = GetBlockNum() * 2;
}
#endif
#if __NPU_ARCH__ == 2201 || __NPU_ARCH__ == 2002
__ib_set_stub(blockIdx, eventID, isAIVOnly);
#endif
auto localSyncGM = gmWorkspace[blockNum * 8 * eventID + blockIdx * 8];
pipe_barrier(PIPE_ALL);
while (true) {
DataCopy(ubWorkspace, localSyncGM, ONE_BLK_SIZE / sizeof(int32_t));
event_t eventIdMte2ToS = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_S));
SetFlag<HardEvent::MTE2_S>(eventIdMte2ToS);
WaitFlag<HardEvent::MTE2_S>(eventIdMte2ToS);
if (ubWorkspace.GetValue(0) == 0) {
ubWorkspace.SetValue(0, 1);
event_t eventIdSToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::S_MTE3));
SetFlag<HardEvent::S_MTE3>(eventIdSToMte3);
WaitFlag<HardEvent::S_MTE3>(eventIdSToMte3);
DataCopy(localSyncGM, ubWorkspace, ONE_BLK_SIZE / sizeof(int32_t));
break;
}
}
pipe_barrier(PIPE_ALL);
#if __NPU_ARCH__ == 2201 || __NPU_ARCH__ == 2002
__ib_set_stub(blockIdx, eventID, isAIVOnly);
#endif
}
template <bool isAIVOnly>
__aicore__ inline void IBWait(const GlobalTensor<int32_t> &gmWorkspace,
const LocalTensor<int32_t> &ubWorkspace, int32_t blockIdx, int32_t eventID)
{
int32_t blockNum = GetBlockNum();
#if (__NPU_ARCH__ == 2201) || (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
#if (__NPU_ARCH__ != 5102)
if ASCEND_IS_AIC {
return;
}
#endif
if (!isAIVOnly) {
blockNum = GetBlockNum() * 2;
}
#endif
#if __NPU_ARCH__ == 2201 || __NPU_ARCH__ == 2002
__ib_wait_stub(blockIdx, eventID, isAIVOnly);
#endif
auto localSyncGM = gmWorkspace[blockNum * 8 * eventID + blockIdx * 8];
pipe_barrier(PIPE_ALL);
while (true) {
DataCopy(ubWorkspace, localSyncGM, ONE_BLK_SIZE / sizeof(int32_t));
event_t eventIdMte2ToS = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_S));
SetFlag<HardEvent::MTE2_S>(eventIdMte2ToS);
WaitFlag<HardEvent::MTE2_S>(eventIdMte2ToS);
if (ubWorkspace.GetValue(0) == 1) {
ubWorkspace.SetValue(0, 0);
event_t eventIdSToMte3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::S_MTE3));
SetFlag<HardEvent::S_MTE3>(eventIdSToMte3);
WaitFlag<HardEvent::S_MTE3>(eventIdSToMte3);
DataCopy(localSyncGM, ubWorkspace, ONE_BLK_SIZE / sizeof(int32_t));
break;
}
}
pipe_barrier(PIPE_ALL);
#if __NPU_ARCH__ == 2201 || __NPU_ARCH__ == 2002
__ib_wait_stub(blockIdx, eventID, isAIVOnly);
#endif
}
* @ingroup:SetNextTaskStart, WaitPreTaskEnd
* @brief:In SuperKernel fusion mode, set wait flag between two operators
*/
template<pipe_t AIV_PIPE, pipe_t AIC_PIPE>
__aicore__ inline void SetNextTaskStart()
{
#ifdef __ASCENDC_ENABLE_SET_NEXT_TASK_START
SetNextTaskStartImpl<AIV_PIPE, AIC_PIPE>();
#endif
}
__aicore__ inline void WaitPreTaskEnd()
{
#ifdef __ASCENDC_ENABLE_WAIT_PRE_TASK_END
WaitPreTaskEndImpl();
#endif
}
* @ingroup:SyncALL
* @brief:Set flag bits of all cores
* @param [in] gmWorkspace GlobalTensor to store core state
* @param [in] ubWorkspce LocalTensor for current core
*/
template <bool isAIVOnly>
__aicore__ inline void SyncAll(const GlobalTensor<int32_t> &gmWorkspace,
const LocalTensor<int32_t> &ubWorkspace, const int usedCores)
{
#if ASCENDC_CPU_DEBUG
SoftSyncAllImpl<false>((__gm__ int32_t*)gmWorkspace.GetPhyAddr(),
(__ubuf__ int32_t*)ubWorkspace.GetPhyAddr(), usedCores);
#else
SoftSyncAllImpl<isAIVOnly>((__gm__ int32_t*)gmWorkspace.GetPhyAddr(),
(__ubuf__ int32_t*)ubWorkspace.GetPhyAddr(), usedCores);
#endif
}
__aicore__ inline void InitSocState()
{
AscendCUtils::InitSocStateImpl();
}
__aicore__ inline int64_t GetBlockIdx()
{
return GetBlockIdxImpl();
}
__aicore__ inline int64_t GetBlockNum()
{
#ifdef __SUPER_KERNEL_STATIC_BLOCK_NUM__
return __SUPER_KERNEL_STATIC_BLOCK_NUM__;
#elif defined(__SUPER_KERNEL_DYNAMIC_BLOCK_NUM__)
return g_super_kernel_dynamic_block_num;
#else
return get_block_num();
#endif
}
__aicore__ inline int64_t GetSubBlockIdx()
{
return GetSubBlockIdxImpl();
}
__aicore__ inline int64_t GetTaskRatio()
{
return GetTaskRationImpl();
}
__aicore__ inline int64_t GetTaskRation()
{
return GetTaskRatio();
}
template <typename T>
__aicore__ inline __in_pipe__(V)
__out_pipe__(MTE3) void InitOutput(GlobalTensor<T> gmWorkspaceAddr, uint32_t size, T value)
{
#if (__NPU_ARCH__ == 2201) || (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
#if (__NPU_ARCH__ != 5102)
if ASCEND_IS_AIC {
return;
}
#endif
LocalTensor<T> popBuffer;
bool ret = PopStackBuffer<T, TPosition::LCM>(popBuffer);
uint32_t maxBurstSize = (MAX_REPEAT_TIMES * ONE_BLK_SIZE) / sizeof(T);
uint32_t popSize = popBuffer.GetSize() >= maxBurstSize ? maxBurstSize : popBuffer.GetSize();
uint32_t round = size / popSize;
uint32_t tail = size % popSize;
uint32_t roundSize = round != 0 ? popSize : 0;
DuplicateImpl<T>((__ubuf__ T*)popBuffer.GetPhyAddr(), value, popSize);
event_t eventIDVToMTE3 = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3));
SetFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
WaitFlag<HardEvent::V_MTE3>(eventIDVToMTE3);
struct DataCopyExtParams repeatParams;
uint32_t comOffset = 0;
repeatParams = { 1, static_cast<uint32_t>(roundSize * sizeof(T)), 0, 0, 0 };
for (int index = 0; index < round; ++index) {
DataCopyPadUB2GMImpl((__gm__ T*)gmWorkspaceAddr.GetPhyAddr() + comOffset,
(__ubuf__ T*)popBuffer.GetPhyAddr(),
repeatParams);
comOffset += roundSize;
}
repeatParams = {1, static_cast<uint32_t>(tail * sizeof(T)), 0, 0, 0};
if (tail != 0) {
comOffset = round * roundSize;
DataCopyPadUB2GMImpl((__gm__ T*)gmWorkspaceAddr.GetPhyAddr() + comOffset,
(__ubuf__ T*)popBuffer.GetPhyAddr(),
repeatParams);
}
#endif
}
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template<bool isAIVOnly, const SyncAllConfig& config>
__aicore__ inline void SyncAll()
{
SyncAllImpl<isAIVOnly, config.triggerPipe, config.waitPipe>();
}
#else
template<bool isAIVOnly>
__aicore__ inline void SyncAll()
{
SyncAllImpl<isAIVOnly>();
}
#endif
template <AtomicDtype type, AtomicOp op>
__aicore__ inline void SetStoreAtomicConfig()
{
SetStoreAtomicConfigImpl<static_cast<atomic_type_t>(type), static_cast<atomic_op_t>(op)>();
}
__aicore__ inline int64_t GetStoreAtomicConfig()
{
return GetStoreAtomicConfigImpl();
}
__aicore__ inline void GetStoreAtomicConfig(uint16_t &atomicType, uint16_t &atomicOp)
{
GetStoreAtomicConfigImpl(atomicType, atomicOp);
}
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <pipe_t pipe, uint8_t subBlockSyncMode = 2>
__aicore__ inline void NotifyEvent(uint16_t flagId)
{
NotifyEventImpl<subBlockSyncMode, pipe>(flagId);
}
template <pipe_t pipe=PIPE_S, uint8_t mode = 0>
__aicore__ inline void WaitEvent(uint16_t flagId)
{
WaitEventImpl<mode, pipe>(flagId);
}
#else
template <pipe_t pipe>
__aicore__ inline void NotifyEvent(uint16_t flagId)
{
constexpr uint8_t subBlockSyncMode = 0x02;
NotifyEventImpl<subBlockSyncMode, pipe>(flagId);
}
template <pipe_t pipe=PIPE_S>
__aicore__ inline void WaitEvent(uint16_t flagId)
{
constexpr uint8_t mode = 0;
WaitEventImpl<mode, pipe>(flagId);
}
#endif
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <int count>
__aicore__ inline void Nop()
{
if (count <= 0) {
return;
}
PipeBarrier<PIPE_ALL>();
#pragma unroll
for (int i = 0; i < count; i++) {
asm volatile("nop");
}
PipeBarrier<PIPE_ALL>();
}
#endif
template<uint8_t modeId, pipe_t pipe>
__aicore__ inline void CrossCoreSetFlag(uint16_t flagId)
{
NotifyEventImpl<modeId, pipe>(flagId);
}
template <uint8_t modeId, pipe_t pipe>
__aicore__ inline void CrossCoreWaitFlag(uint16_t flagId)
{
WaitEventImpl<modeId, pipe>(flagId);
}
template <typename T>
__aicore__ inline void DataCachePreload(const GlobalTensor<uint64_t> &src, const T cacheOffset)
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
DataCachePreloadImpl((__gm__ uint64_t*)src.GetPhyAddr(), cacheOffset);
#else
DataCachePreloadImpl(src, cacheOffset);
#endif
}
__aicore__ inline void ICachePreLoad(const int64_t preFetchLen)
{
PreLoad(preFetchLen);
}
__aicore__ inline int64_t GetICachePreloadStatus()
{
return GetICachePreloadStatusImpl();
}
__aicore__ inline void CheckLocalMemoryIA(const CheckLocalMemoryIAParam& checkParams)
{
CheckLocalMemoryIAImpl(checkParams);
}
#if (__NPU_ARCH__ == 2201) || (__NPU_ARCH__ == 3002) || (__NPU_ARCH__ == 3102) || (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <HardEvent event, MemoryT memT, bool isVirtual> __aicore__ inline void HSetFlag(int32_t eventID)
{
if (g_coreType == AIV) {
return;
}
HSetFlagImpl<event, memT, isVirtual>(eventID);
}
template <HardEvent event, MemoryT memT, bool isVirtual> __aicore__ inline void HWaitFlag(int32_t eventID)
{
if (g_coreType == AIV) {
return;
}
HWaitFlagImpl<event, memT, isVirtual>(eventID);
}
#endif
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102) || (__NPU_ARCH__ == 3003) || (__NPU_ARCH__ == 3113)
template <SpecialPurposeReg spr>
__aicore__ inline int64_t GetSpr(){
return GetSprImpl<spr>();
}
template <SpecialPurposeReg spr>
__aicore__ inline void ClearSpr(){
ClearSprImpl<spr>();
}
template <int8_t startBit, int8_t endBit>
__aicore__ static inline void SetCtrlSpr(int64_t value){
SetCtrlSprImpl<startBit, endBit>(value);
}
template <int8_t startBit, int8_t endBit>
__aicore__ static inline int64_t GetCtrlSpr(){
return GetCtrlSprImpl<startBit, endBit>();
}
template <int8_t startBit, int8_t endBit>
__aicore__ static inline void ResetCtrlSpr(){
ResetCtrlSprImpl<startBit, endBit>();
}
#endif
}
#endif