/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
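/*!
* \file angle_v2_complex.h
* \brief AngleV2 kernel for complex input: computes the angle atan2(imag, real) of every complex element.
*/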
#ifndef _ANGLE_V2_COMPLEX_H_
#define _ANGLE_V2_COMPLEX_H_
#include "angle_v2_base.h"
namespace AngleV2N {
template <typename yType>
class AngleV2Complex : public AngleV2Base<yType>
{
public:
// Default constructor. The body is empty; pipe, buffers and tiling-derived members are set up in Init().
__aicore__ inline AngleV2Complex()
{}
/**
 * Bind the global input/output tensors for this block and reserve every on-chip buffer.
 *
 * The input is addressed as yType with COEFFICENT values per output element, so the input offset
 * and length are the output ones multiplied by COEFFICENT.
 *
 * For the block whose index equals formerNum + tailNum - 1 (presumably the last active core) the
 * window is adjusted by the gap between the base-class alignment (totalLengthAligned) and the
 * alignment to alignNumCp64 elements: either the tail copy is shortened, or the window start is
 * moved back by the gap.
 *
 * @param x          global address of the complex input
 * @param y          global address of the real-valued output
 * @param tilingData tiling description, forwarded to BaseMemberDataInit
 * @param inputPipe  pipe used for all buffer allocations
 */
__aicore__ inline void Init(GM_ADDR x, GM_ADDR y, const AngleV2TilingData* __restrict tilingData, TPipe* inputPipe)
{
    pipe = inputPipe;
    this->BaseMemberDataInit(tilingData);
    repeatTimes = (this->tileLength + this->mask - 1) / this->mask;

    // Gap between the two alignment schemes, measured in output elements.
    const int64_t alignedTotalCp64 = (this->totalLength + alignNumCp64 - 1) / alignNumCp64 * alignNumCp64;
    const int64_t alignmentGap = this->totalLengthAligned - alignedTotalCp64;

    const bool isFinalBlock = (GetBlockIdx() == this->formerNum + this->tailNum - 1);
    const bool startsInsideGap = (this->offset < alignmentGap);

    int64_t startOffset = this->offset;
    complexTailLength = static_cast<uint32_t>(this->lastTileLength * COEFFICENT);
    if (isFinalBlock && startsInsideGap) {
        // Window cannot be moved back far enough: read fewer input values for the tail instead.
        complexTailLength =
            static_cast<uint32_t>(this->lastTileLength * COEFFICENT - alignmentGap * COEFFICENT);
    } else if (isFinalBlock) {
        // Shift the whole window back by the gap.
        startOffset = this->offset - alignmentGap;
    }

    xGm.SetGlobalBuffer(
        reinterpret_cast<__gm__ yType*>(x) + startOffset * COEFFICENT, this->blockLength * COEFFICENT);
    yGm.SetGlobalBuffer(reinterpret_cast<__gm__ yType*>(y) + startOffset, this->blockLength);

    // Byte sizes of one tile of yType values and of one tile of mask bytes.
    const auto tileBytes = this->tileLength * sizeof(yType);
    const auto maskBytes = this->tileLength * sizeof(uint8_t);

    // Allocation order is kept exactly as before.
    pipe->InitBuffer(inQueue, BUFFER_NUM, tileBytes * COEFFICENT);
    pipe->InitBuffer(outQueue, BUFFER_NUM, tileBytes);
    pipe->InitBuffer(maskBuf1, maskBytes);
    pipe->InitBuffer(maskBuf2, maskBytes);
    pipe->InitBuffer(realBuf, tileBytes);
    pipe->InitBuffer(imagBuf, tileBytes);
    pipe->InitBuffer(zeroBuf, tileBytes);
    pipe->InitBuffer(oneBuf, tileBytes);
    pipe->InitBuffer(tempBuf1, tileBytes);
    pipe->InitBuffer(tempBuf2, tileBytes);
    pipe->InitBuffer(tempBuf3, tileBytes);
    pipe->InitBuffer(tempBuf4, tileBytes);
    pipe->InitBuffer(tempBuf5, tileBytes);
}
/**
 * Process this block: tileNum full tiles followed by an optional tail tile of lastTileLength
 * elements. Each tile is copied in, computed and copied out in sequence.
 *
 * The only difference between core generations is the barrier kind placed after the copy-in and
 * after the compute step, so the barriers are selected once and the loop is written a single time.
 */
__aicore__ inline void Process()
{
#if (__CCE_AICORE__ >= 200)
    constexpr auto copyInBarrier = PIPE_MTE2;
    constexpr auto computeBarrier = PIPE_V;
#else
    constexpr auto copyInBarrier = PIPE_ALL;
    constexpr auto computeBarrier = PIPE_ALL;
#endif

    BufferGet();

    // Number of input (yType) values in one full tile.
    const int64_t fullComplexLength = this->tileLength * COEFFICENT;
    for (int32_t tileIdx = 0; tileIdx < this->tileNum; ++tileIdx) {
        CopyInComplex(tileIdx * fullComplexLength, fullComplexLength);
        PipeBarrier<copyInBarrier>();
        Compute(this->mask, repeatTimes, this->tileLength);
        PipeBarrier<computeBarrier>();
        CopyOut(tileIdx * this->tileLength, this->tileLength);
    }

    if (!(this->lastTileLength > 0)) {
        return;
    }

    // Tail tile: anchored to the end of the block; the copy length was fixed in Init().
    const int64_t tailComplexLength = this->lastTileLength * COEFFICENT;
    repeatTimes = (this->lastTileLength + this->mask - 1) / this->mask;
    CopyInComplex(this->blockLength * COEFFICENT - tailComplexLength, complexTailLength);
    PipeBarrier<copyInBarrier>();
    Compute(this->mask, repeatTimes, this->lastTileLength);
    PipeBarrier<computeBarrier>();
    CopyOut(this->blockLength - this->lastTileLength, this->lastTileLength);
}
private:
/**
 * Fetch a tensor handle for every TBuf allocated in Init() and fill the two constant tensors
 * (all zeros / all ones) over tileLength elements.
 */
__aicore__ inline void BufferGet()
{
    // Comparison masks.
    mask1 = maskBuf1.Get<uint8_t>();
    mask2 = maskBuf2.Get<uint8_t>();

    // De-interleaved input.
    realLocal = realBuf.Get<yType>();
    imagLocal = imagBuf.Get<yType>();

    // Scratch tensors.
    tempTensor1 = tempBuf1.Get<yType>();
    tempTensor2 = tempBuf2.Get<yType>();
    tempTensor3 = tempBuf3.Get<yType>();
    tempTensor4 = tempBuf4.Get<yType>();
    tempTensor5 = tempBuf5.Get<yType>();

    // Constant tensors, filled once and reused by every tile.
    zeroTensor = zeroBuf.Get<yType>();
    oneTensor = oneBuf.Get<yType>();
    Duplicate(zeroTensor, static_cast<yType>(0.0), this->tileLength);
    Duplicate(oneTensor, static_cast<yType>(1.0), this->tileLength);
}
/**
 * De-interleave `input` into separate real and imaginary tensors.
 *
 * On cores with __CCE_AICORE__ >= 200 two GatherMask calls (modes GATHER_MASK_MODE_ONE and
 * GATHER_MASK_MODE_TWO) are issued over calCount * 2 input values; otherwise the values are
 * copied one by one, taking element COEFFICENT * i as real and COEFFICENT * i + 1 as imaginary.
 *
 * @param realLocal destination for the real parts
 * @param imagLocal destination for the imaginary parts
 * @param input     interleaved source tensor
 * @param calCount  number of complex elements to split
 */
__aicore__ inline void SplitComplex(
    LocalTensor<yType>& realLocal, LocalTensor<yType>& imagLocal, LocalTensor<yType>& input, uint32_t calCount)
{
#if (__CCE_AICORE__ >= 200)
    GatherMaskParams params;
    uint64_t rsvdCnt = 0;
    uint32_t complexMask = calCount * 2;
    params.repeatTimes = 1;
    GatherMask(realLocal, input, GATHER_MASK_MODE_ONE, true, complexMask, params, rsvdCnt);
    GatherMask(imagLocal, input, GATHER_MASK_MODE_TWO, true, complexMask, params, rsvdCnt);
    PipeBarrier<PIPE_V>();
#else
    // Index type matches calCount so the loop bound is not a signed/unsigned comparison.
    for (uint32_t i = 0; i < calCount; ++i) {
        realLocal.SetValue(i, input.GetValue(COEFFICENT * i));
        imagLocal.SetValue(i, input.GetValue(COEFFICENT * i + 1));
    }
#endif
}
/**
 * Copy one tile of interleaved input from global memory into a tensor taken from inQueue and
 * enqueue it for Compute().
 *
 * @param coreOffset start position inside xGm, in yType values
 * @param coreLength number of yType values to copy
 */
__aicore__ inline void CopyInComplex(int64_t coreOffset, uint32_t coreLength)
{
    LocalTensor<yType> stagedInput = inQueue.AllocTensor<yType>();
    DataCopy(stagedInput, xGm[coreOffset], coreLength);
    inQueue.EnQue(stagedInput);
}
/**
 * Compute the angle of one tile of complex values and leave the finished tile in outQueue.
 *
 * Steps: de-interleave the input into realLocal/imagLocal, take atan(imag / real), add a
 * quadrant correction, then run CornerProcess for the zero / INF / NaN special cases.
 *
 * NOTE(review): the comments below assume DoSelect(dst, sel, a, b, ...) writes `a` where `sel`
 * is set and `b` elsewhere, and that the tensor comparison operators fill the mask tensor
 * element-wise. Both are defined outside this file - confirm against angle_v2_base.h.
 *
 * @param mask        forwarded unchanged to every DoSelect call
 * @param repeatTimes forwarded unchanged to every DoSelect call
 * @param calCount    number of complex elements (= output elements) in this tile
 */
__aicore__ inline void Compute(uint64_t mask, uint8_t repeatTimes, uint32_t calCount)
{
LocalTensor<yType> input = inQueue.DeQue<yType>();
LocalTensor<yType> result = outQueue.AllocTensor<yType>();
SplitComplex(realLocal, imagLocal, input, calCount);
// tempTensor5 = imag / real; result = atan(tempTensor5).
Div(tempTensor5, imagLocal, realLocal, calCount);
PipeBarrier<PIPE_V>();
#if (__CCE_AICORE__ >= 200)
Atan<yType, false>(result, tempTensor5, calCount);
#else
// Older cores use the hand-written approximation; tempTensor1 serves as its scratch tensor.
AtanCompute(result, tempTensor5, tempTensor1, calCount);
#endif
PipeBarrier<PIPE_V>();
// tempTensor1 = (imag >= 0) ? 1 : -1
mask1 = imagLocal >= zeroTensor;
Duplicate(tempTensor1, static_cast<yType>(1.0), calCount);
Duplicate(tempTensor2, static_cast<yType>(-1.0), calCount);
PipeBarrier<PIPE_V>();
this->DoSelect(tempTensor1, mask1, tempTensor1, tempTensor2, mask, repeatTimes);
PipeBarrier<PIPE_V>();
// tempTensor2 = (real < 0) ? tempTensor1 : 0
mask1 = realLocal < zeroTensor;
this->DoSelect(tempTensor2, mask1, tempTensor1, zeroTensor, mask, repeatTimes);
// tempTensor1 *= const_pi_by_two (presumably pi/2): the value used when real == 0.
Duplicate(tempTensor3, static_cast<yType>(constData.const_pi_by_two), calCount);
PipeBarrier<PIPE_V>();
Mul(tempTensor1, tempTensor3, tempTensor1, calCount);
PipeBarrier<PIPE_V>();
// tempTensor2 *= const_pi (presumably pi): the offset added when real < 0.
Duplicate(tempTensor3, static_cast<yType>(constData.const_pi), calCount);
PipeBarrier<PIPE_V>();
Mul(tempTensor2, tempTensor3, tempTensor2, calCount);
// Where real == 0 replace the atan result by tempTensor1.
mask1 = realLocal == zeroTensor;
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, tempTensor1, result, mask, repeatTimes);
PipeBarrier<PIPE_V>();
// Add the real < 0 offset.
Add(result, tempTensor2, result, calCount);
PipeBarrier<PIPE_V>();
// CornerProcess dequeues this tensor again, patches special cases and re-enqueues it.
outQueue.EnQue<yType>(result);
CornerProcess(mask, repeatTimes, calCount);
inQueue.FreeTensor(input);
}
/**
 * Patch the result for inputs whose imaginary part compares equal to zero, then run the
 * INF / NaN corrections. The tensor is taken from outQueue and enqueued again at the end.
 *
 * Indicator tensors hold 1 where a condition holds and 0 elsewhere; products of indicators
 * combine conditions and are turned back into masks by comparing with oneTensor.
 *
 * NOTE(review): assumes DoSelect(dst, sel, a, b, ...) writes `a` where `sel` is set and `b`
 * elsewhere (defined in the base class - confirm). The "imag == 0 and imag < 0" indicator
 * presumably targets imag == -0.0; whether `<` reports true for -0.0 depends on the comparison
 * implementation, which is not visible here - confirm.
 *
 * On return tempTensor1 holds the -0.0 fill; CornerProcessRealINF reads it.
 *
 * @param mask        forwarded to every DoSelect call
 * @param repeatTimes forwarded to every DoSelect call
 * @param calCount    number of elements processed
 */
__aicore__ inline void CornerProcess(uint64_t mask, uint8_t repeatTimes, uint32_t calCount)
{
LocalTensor<yType> result = outQueue.DeQue<yType>();
// tempTensor1 = indicator(imag == 0)
mask1 = imagLocal == zeroTensor;
PipeBarrier<PIPE_V>();
this->DoSelect(tempTensor1, mask1, oneTensor, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
// tempTensor2 = indicator(imag < 0)
mask1 = imagLocal < zeroTensor;
PipeBarrier<PIPE_V>();
this->DoSelect(tempTensor2, mask1, oneTensor, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
// tempTensor2 = indicator(imag == 0 and imag < 0)
Mul(tempTensor2, tempTensor1, tempTensor2, calCount);
PipeBarrier<PIPE_V>();
// tempTensor3 = indicator(imag == 0 and not imag < 0)
Sub(tempTensor3, tempTensor1, tempTensor2, calCount);
// tempTensor1 = indicator(real < 0), tempTensor4 = 1 - tempTensor1
mask1 = realLocal < zeroTensor;
PipeBarrier<PIPE_V>();
this->DoSelect(tempTensor1, mask1, oneTensor, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
Sub(tempTensor4, oneTensor, tempTensor1, calCount);
// Case tempTensor3 and real < 0: result = const_pi.
Mul(tempTensor5, tempTensor3, tempTensor1, calCount);
PipeBarrier<PIPE_V>();
mask1 = tempTensor5 == oneTensor;
PipeBarrier<PIPE_V>();
Duplicate(tempTensor5, static_cast<yType>(constData.const_pi), calCount);
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, tempTensor5, result, mask, repeatTimes);
// Case tempTensor3 and not real < 0: result = 0.
Mul(tempTensor3, tempTensor4, tempTensor3, calCount);
PipeBarrier<PIPE_V>();
mask1 = tempTensor3 == oneTensor;
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, zeroTensor, result, mask, repeatTimes);
// Case tempTensor2 and real < 0: result = const_neg_pi.
Mul(tempTensor1, tempTensor2, tempTensor1, calCount);
PipeBarrier<PIPE_V>();
mask1 = tempTensor1 == oneTensor;
PipeBarrier<PIPE_V>();
Duplicate(tempTensor1, static_cast<yType>(constData.const_neg_pi), calCount);
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, tempTensor1, result, mask, repeatTimes);
// Case tempTensor2 and not real < 0: result = -0.0.
Mul(tempTensor4, tempTensor2, tempTensor4, calCount);
PipeBarrier<PIPE_V>();
mask1 = tempTensor4 == oneTensor;
Duplicate(tempTensor1, static_cast<yType>(float(-0.0)), calCount);
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, tempTensor1, result, mask, repeatTimes);
// tempTensor1 still holds the -0.0 fill here; CornerProcessRealINF depends on that.
CornerProcessINFNAN(result, oneTensor, mask, repeatTimes, calCount);
outQueue.EnQue<yType>(result);
}
/**
 * Apply the infinity and NaN corrections to `result`.
 *
 * The call order must not change: CornerProcessRealNINF reads tempTensor1..tempTensor4 as left
 * behind by CornerProcessRealINF, and CornerProcessNAN runs last so its NaN writes are not
 * overwritten by the earlier selects.
 *
 * @param result      tensor patched in place
 * @param oneTensor   all-ones tensor, forwarded to the two INF helpers
 * @param mask        forwarded to the helpers
 * @param repeatTimes forwarded to the helpers
 * @param calCount    number of elements processed
 */
__aicore__ inline void CornerProcessINFNAN(
LocalTensor<yType>& result, LocalTensor<yType>& oneTensor, uint64_t mask, uint8_t repeatTimes,
uint32_t calCount)
{
CornerProcessRealINF(result, oneTensor, mask, repeatTimes, calCount);
CornerProcessRealNINF(result, oneTensor, mask, repeatTimes, calCount);
CornerProcessNAN(result, mask, repeatTimes, calCount);
}
/**
 * Patch `result` where the real part equals +INFINITY.
 *
 * Precondition: tempTensor1 holds the fill value written last by CornerProcess (-0.0); it is
 * used as the replacement for the first case below.
 *
 * Cases handled (real == +INF in all of them):
 *   - imag < 0 and imag > -INF : result = tempTensor1 (see precondition)
 *   - imag == +INF             : result = const_pi_by_four
 *   - imag == -INF             : result = const_neg_pi_by_four
 *
 * On return: tempTensor1 = indicator(imag == +INF), tempTensor2 = indicator(imag == -INF),
 * tempTensor3 = -INF fill, tempTensor4 = +INF fill. CornerProcessRealNINF relies on all four.
 *
 * NOTE(review): assumes DoSelect(dst, sel, a, b, ...) writes `a` where `sel` is set and `b`
 * elsewhere (defined in the base class - confirm).
 *
 * @param result      tensor patched in place
 * @param oneTensor   all-ones tensor used to build indicators
 * @param mask        forwarded to every DoSelect call
 * @param repeatTimes forwarded to every DoSelect call
 * @param calCount    number of elements processed
 */
__aicore__ inline void CornerProcessRealINF(
LocalTensor<yType>& result, LocalTensor<yType>& oneTensor, uint64_t mask, uint8_t repeatTimes,
uint32_t calCount)
{
// tempTensor2 = indicator(imag < 0)
mask1 = imagLocal < zeroTensor;
PipeBarrier<PIPE_V>();
this->DoSelect(tempTensor2, mask1, oneTensor, zeroTensor, mask, repeatTimes);
// ... and imag > -INF
Duplicate(tempTensor3, static_cast<yType>(-INFINITY), calCount);
PipeBarrier<PIPE_V>();
mask1 = imagLocal > tempTensor3;
PipeBarrier<PIPE_V>();
this->DoSelect(tempTensor2, mask1, tempTensor2, zeroTensor, mask, repeatTimes);
// ... and real == +INF (mask2 is kept and reused below)
Duplicate(tempTensor4, static_cast<yType>(INFINITY), calCount);
PipeBarrier<PIPE_V>();
mask2 = realLocal == tempTensor4;
PipeBarrier<PIPE_V>();
this->DoSelect(tempTensor2, mask2, tempTensor2, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
mask1 = tempTensor2 == oneTensor;
PipeBarrier<PIPE_V>();
// tempTensor1 is the fill left by the caller.
this->DoSelect(result, mask1, tempTensor1, result, mask, repeatTimes);
PipeBarrier<PIPE_V>();
// tempTensor1 = indicator(imag == +INF)
mask1 = imagLocal == tempTensor4;
PipeBarrier<PIPE_V>();
this->DoSelect(tempTensor1, mask1, oneTensor, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
// tempTensor2 = indicator(imag == -INF)
mask1 = imagLocal == tempTensor3;
PipeBarrier<PIPE_V>();
this->DoSelect(tempTensor2, mask1, oneTensor, zeroTensor, mask, repeatTimes);
// real == +INF and imag == +INF -> const_pi_by_four
this->DoSelect(tempTensor5, mask2, tempTensor1, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
mask1 = tempTensor5 == oneTensor;
PipeBarrier<PIPE_V>();
Duplicate(tempTensor5, static_cast<yType>(constData.const_pi_by_four), calCount);
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, tempTensor5, result, mask, repeatTimes);
PipeBarrier<PIPE_V>();
// real == +INF and imag == -INF -> const_neg_pi_by_four
this->DoSelect(tempTensor5, mask2, tempTensor2, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
mask1 = tempTensor5 == oneTensor;
PipeBarrier<PIPE_V>();
Duplicate(tempTensor5, static_cast<yType>(constData.const_neg_pi_by_four), calCount);
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, tempTensor5, result, mask, repeatTimes);
}
/**
 * Patch `result` where the real part equals -INFINITY, and where the real part is finite but
 * the imaginary part is infinite.
 *
 * Preconditions (state left by CornerProcessRealINF): tempTensor1 = indicator(imag == +INF),
 * tempTensor2 = indicator(imag == -INF), tempTensor3 = -INF fill, tempTensor4 = +INF fill.
 *
 * Cases handled:
 *   - real == -INF and imag == +INF  : result = const_pi_by_three_quarters
 *   - real == -INF and imag == -INF  : result = const_neg_pi_by_three_quarters
 *   - |real| < +INF and imag == +INF : result = const_pi_by_two
 *   - |real| < +INF and imag == -INF : result = const_neg_pi_by_two
 *
 * NOTE(review): assumes DoSelect(dst, sel, a, b, ...) writes `a` where `sel` is set and `b`
 * elsewhere (defined in the base class - confirm). The oneTensor parameter shadows the member
 * of the same name.
 *
 * @param result      tensor patched in place
 * @param oneTensor   all-ones tensor used to turn indicators back into masks
 * @param mask        forwarded to every DoSelect call
 * @param repeatTimes forwarded to every DoSelect call
 * @param calCount    number of elements processed
 */
__aicore__ inline void CornerProcessRealNINF(
LocalTensor<yType>& result, LocalTensor<yType>& oneTensor, uint64_t mask, uint8_t repeatTimes,
uint32_t calCount)
{
// mask2 = (real == -INF); tempTensor3 is consumed here and reused as scratch afterwards.
mask2 = realLocal == tempTensor3;
PipeBarrier<PIPE_V>();
// real == -INF and imag == +INF
this->DoSelect(tempTensor3, mask2, tempTensor1, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
mask1 = tempTensor3 == oneTensor;
PipeBarrier<PIPE_V>();
Duplicate(tempTensor3, static_cast<yType>(constData.const_pi_by_three_quarters), calCount);
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, tempTensor3, result, mask, repeatTimes);
PipeBarrier<PIPE_V>();
// real == -INF and imag == -INF
this->DoSelect(tempTensor3, mask2, tempTensor2, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
mask1 = tempTensor3 == oneTensor;
PipeBarrier<PIPE_V>();
Duplicate(tempTensor3, static_cast<yType>(constData.const_neg_pi_by_three_quarters), calCount);
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, tempTensor3, result, mask, repeatTimes);
PipeBarrier<PIPE_V>();
// mask2 = (|real| < +INF)
Abs(tempTensor3, realLocal, calCount);
PipeBarrier<PIPE_V>();
mask2 = tempTensor3 < tempTensor4;
PipeBarrier<PIPE_V>();
// |real| < +INF and imag == +INF
this->DoSelect(tempTensor1, mask2, tempTensor1, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
mask1 = tempTensor1 == oneTensor;
PipeBarrier<PIPE_V>();
Duplicate(tempTensor1, static_cast<yType>(constData.const_pi_by_two), calCount);
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, tempTensor1, result, mask, repeatTimes);
// |real| < +INF and imag == -INF
this->DoSelect(tempTensor2, mask2, tempTensor2, zeroTensor, mask, repeatTimes);
PipeBarrier<PIPE_V>();
mask1 = tempTensor2 == oneTensor;
Duplicate(tempTensor1, static_cast<yType>(constData.const_neg_pi_by_two), calCount);
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, tempTensor1, result, mask, repeatTimes);
}
/**
 * Force `result` to NaN wherever the real or the imaginary input is NaN.
 *
 * Uses the self-comparison x == x, which is false only for NaN under IEEE-754 comparison rules
 * (assumes the tensor comparison follows them - confirm).
 *
 * NOTE(review): assumes DoSelect(dst, sel, a, b, ...) writes `a` where `sel` is set and `b`
 * elsewhere (defined in the base class - confirm).
 *
 * @param result      tensor patched in place
 * @param mask        forwarded to every DoSelect call
 * @param repeatTimes forwarded to every DoSelect call
 * @param calCount    number of elements processed
 */
__aicore__ inline void CornerProcessNAN(
LocalTensor<yType>& result, uint64_t mask, uint8_t repeatTimes, uint32_t calCount)
{
Duplicate(tempTensor1, static_cast<yType>(NAN), calCount);
// Keep result where real == real, otherwise take the NaN fill.
mask1 = realLocal == realLocal;
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, result, tempTensor1, mask, repeatTimes);
PipeBarrier<PIPE_V>();
// Same test for the imaginary part.
mask1 = imagLocal == imagLocal;
PipeBarrier<PIPE_V>();
this->DoSelect(result, mask1, result, tempTensor1, mask, repeatTimes);
}
/**
 * Take the finished tile from outQueue, write it to global memory and release the tensor.
 *
 * @param coreOffset start position inside yGm, in elements
 * @param coreLength number of elements to write
 */
__aicore__ inline void CopyOut(int64_t coreOffset, uint32_t coreLength)
{
    LocalTensor<yType> finishedTile = outQueue.DeQue<yType>();
    DataCopy(yGm[coreOffset], finishedTile, coreLength);
    outQueue.FreeTensor(finishedTile);
}
/**
 * Write an approximate sign of `src` into `dst` without branching:
 *   dst = (src * ATAN_FP32_MAX) / (|src * ATAN_FP32_MAX| + ATAN_FP32_MIN)
 * The large scale factor makes the small additive bias negligible for non-zero inputs, while the
 * bias keeps the denominator non-zero when src is zero.
 *
 * @param dst         receives the sign
 * @param src         input values
 * @param denominator scratch tensor, overwritten
 * @param calCount    number of elements processed
 */
__aicore__ inline void Sign(
    LocalTensor<yType>& dst, LocalTensor<yType>& src, LocalTensor<yType>& denominator, uint32_t calCount)
{
    const yType scaleUp = static_cast<yType>(ATAN_FP32_MAX);
    const yType tinyBias = static_cast<yType>(ATAN_FP32_MIN);

    // dst = src * scaleUp
    Muls(dst, src, scaleUp, calCount);
    PipeBarrier<PIPE_V>();

    // denominator = |dst| + tinyBias
    Abs(denominator, dst, calCount);
    PipeBarrier<PIPE_V>();
    Adds(denominator, denominator, tinyBias, calCount);
    PipeBarrier<PIPE_V>();

    // dst = dst / denominator
    Div(dst, dst, denominator, calCount);
    PipeBarrier<PIPE_V>();
}
/**
 * Evaluate the odd polynomial
 *   dst = x * (f[0] + x^2 * (f[1] + x^2 * ( ... + x^2 * f[expandLevel])))
 * with f = factorList, using Horner's scheme on x^2.
 *
 * @param dstTensor    receives the polynomial value
 * @param srcTensor    input x
 * @param squareTensor scratch tensor, receives x^2
 * @param expandLevel  index of the highest coefficient used; must lie in [1, 6] so that every
 *                     factorList access stays inside the 7-entry table
 * @param calCount     number of elements processed
 */
__aicore__ inline void TaylorExpand(
    LocalTensor<yType>& dstTensor, LocalTensor<yType>& srcTensor, LocalTensor<yType>& squareTensor,
    int32_t expandLevel, uint32_t calCount)
{
    // x^2 is needed twice: as the running multiplier and as the seed of the Horner chain.
    Mul(squareTensor, srcTensor, srcTensor, calCount);
    Mul(dstTensor, srcTensor, srcTensor, calCount);
    PipeBarrier<PIPE_V>();
    Muls(dstTensor, dstTensor, factorList[expandLevel], calCount);
    PipeBarrier<PIPE_V>();
    // Signed index, matching expandLevel: an unsigned index would wrap to 0xFFFFFFFF for
    // expandLevel <= 0 and walk far outside factorList instead of skipping the loop.
    for (int32_t level = expandLevel - 1; level > 0; --level) {
        Adds(dstTensor, dstTensor, factorList[level], calCount);
        PipeBarrier<PIPE_V>();
        Mul(dstTensor, dstTensor, squareTensor, calCount);
        PipeBarrier<PIPE_V>();
    }
    Adds(dstTensor, dstTensor, factorList[0], calCount);
    PipeBarrier<PIPE_V>();
    // Final multiplication by x turns the even polynomial in x^2 into the odd one.
    Mul(dstTensor, dstTensor, srcTensor, calCount);
}
/**
 * Compute dst = |(x - t) / (1 + x * t)| with x = srcTensor and t = transFactor.
 * This is the argument of the identity atan(x) = atan(t) + atan((x - t) / (1 + x * t)).
 *
 * @param dstTensor   receives the transformed value
 * @param srcTensor   input x
 * @param tmpTensor   scratch tensor, receives x - t
 * @param transFactor the shift t
 * @param calCount    number of elements processed
 */
__aicore__ inline void AtanTransform(
    LocalTensor<yType>& dstTensor, LocalTensor<yType>& srcTensor, LocalTensor<yType>& tmpTensor,
    const float transFactor, uint32_t calCount)
{
    const yType unit = static_cast<yType>(1.0);
    const float shiftDown = 0 - transFactor;

    // Denominator: dst = x * t, then + 1. Numerator: tmp = x - t.
    Muls(dstTensor, srcTensor, transFactor, calCount);
    PipeBarrier<PIPE_V>();
    Adds(dstTensor, dstTensor, unit, calCount);
    Adds(tmpTensor, srcTensor, shiftDown, calCount);
    PipeBarrier<PIPE_V>();

    // dst = |tmp / dst|
    Div(dstTensor, tmpTensor, dstTensor, calCount);
    PipeBarrier<PIPE_V>();
    Abs(dstTensor, dstTensor, calCount);
    PipeBarrier<PIPE_V>();
}
/**
 * Approximate atan(|src|) as the element-wise minimum of four estimates, each a TaylorExpand
 * polynomial evaluated on a differently shifted argument:
 *   1. P4(a)
 *   2. P4(|(a - tan(pi/8)) / (1 + a * tan(pi/8))|) + pi/8
 *   3. P4(|(a - 1) / (a + 1)|) + pi/4
 *   4. P6(|(u - tan(pi/8)) / (1 + u * tan(pi/8))|) + pi/8 + pi/4, with u = |(a - 1) / (a + 1)|
 * where a = min(|src|, 10000) and P4 / P6 are TaylorExpand with TAYLOR_COUNT_FOUR / _SIX.
 *
 * Scratch usage: the members tempTensor2, tempTensor3 and tempTensor4 are overwritten in
 * addition to tmpTensor, so the caller must not pass any of them as dstTensor, srcTensor or
 * tmpTensor.
 *
 * @param dstTensor receives the non-negative approximation
 * @param srcTensor input values
 * @param tmpTensor caller-provided scratch tensor
 * @param calCount  number of elements processed
 */
__aicore__ inline void AtanImpl(
LocalTensor<yType>& dstTensor, LocalTensor<yType>& srcTensor, LocalTensor<yType>& tmpTensor, uint32_t calCount)
{
const float piByFour = 0.78539816339744830961566084581988;
const float piByEight = 0.39269908169872415480783042290994;
const float tanPiByEight = 0.4142135623730950;
// Local handles onto member scratch buffers.
LocalTensor<yType> absTensor = tempTensor3;
LocalTensor<yType> tmpTensor2 = tempTensor2;
LocalTensor<yType> squareTensor = tempTensor4;
// a = min(|src|, 10000)
Abs(absTensor, srcTensor, calCount);
PipeBarrier<PIPE_V>();
Mins(absTensor, absTensor, static_cast<yType>(10000), calCount);
PipeBarrier<PIPE_V>();
// Estimate 1: polynomial directly on a.
TaylorExpand(dstTensor, absTensor, squareTensor, TAYLOR_COUNT_FOUR, calCount);
// Estimate 2: shift by tan(pi/8), add pi/8 back.
AtanTransform(tmpTensor, absTensor, tmpTensor2, tanPiByEight, calCount);
TaylorExpand(tmpTensor2, tmpTensor, squareTensor, TAYLOR_COUNT_FOUR, calCount);
PipeBarrier<PIPE_V>();
Adds(tmpTensor2, tmpTensor2, piByEight, calCount);
PipeBarrier<PIPE_V>();
Min(dstTensor, dstTensor, tmpTensor2, calCount);
// Estimate 3: u = |(a - 1) / (a + 1)|, add pi/4 back.
Adds(tmpTensor2, absTensor, static_cast<yType>(1.0), calCount);
Adds(tmpTensor, absTensor, -static_cast<yType>(1.0), calCount);
PipeBarrier<PIPE_V>();
Div(tmpTensor, tmpTensor, tmpTensor2, calCount);
PipeBarrier<PIPE_V>();
Abs(tmpTensor, tmpTensor, calCount);
PipeBarrier<PIPE_V>();
TaylorExpand(tmpTensor2, tmpTensor, squareTensor, TAYLOR_COUNT_FOUR, calCount);
PipeBarrier<PIPE_V>();
Adds(tmpTensor2, tmpTensor2, piByFour, calCount);
PipeBarrier<PIPE_V>();
Min(dstTensor, dstTensor, tmpTensor2, calCount);
// Estimate 4: shift u by tan(pi/8) as well, add pi/8 and pi/4 back.
AtanTransform(tmpTensor2, tmpTensor, squareTensor, tanPiByEight, calCount);
TaylorExpand(tmpTensor, tmpTensor2, squareTensor, TAYLOR_COUNT_SIX, calCount);
PipeBarrier<PIPE_V>();
Adds(tmpTensor, tmpTensor, piByEight, calCount);
PipeBarrier<PIPE_V>();
Adds(tmpTensor, tmpTensor, piByFour, calCount);
PipeBarrier<PIPE_V>();
Min(dstTensor, dstTensor, tmpTensor, calCount);
PipeBarrier<PIPE_V>();
}
/**
 * Software atan: dst = AtanImpl(src) * Sign(src).
 * AtanImpl works on |src|, so the sign of the input is applied afterwards.
 *
 * @param dstTensor receives atan(src)
 * @param srcTensor input values
 * @param tmpTensor scratch tensor; holds the sign of src on return
 * @param calCount  number of elements processed
 */
__aicore__ inline void AtanCompute(
    LocalTensor<yType>& dstTensor, LocalTensor<yType>& srcTensor, LocalTensor<yType>& tmpTensor, uint32_t calCount)
{
    // Magnitude first; tmpTensor is free again once AtanImpl returns.
    AtanImpl(dstTensor, srcTensor, tmpTensor, calCount);

    // Reuse tmpTensor for the sign, with tempTensor2 as Sign's scratch denominator.
    LocalTensor<yType>& signTensor = tmpTensor;
    LocalTensor<yType>& signScratch = tempTensor2;
    Sign(signTensor, srcTensor, signScratch, calCount);

    Mul(dstTensor, dstTensor, signTensor, calCount);
    PipeBarrier<PIPE_V>();
}
private:
// Pipe passed to Init(); used there for every InitBuffer call.
TPipe* pipe;
// Named constants (const_pi, const_pi_by_two, ...) read by the compute routines; the type is
// declared outside this file.
ConstData constData;
// Repeat count handed to DoSelect; set in Init() and recomputed for the tail tile in Process().
uint8_t repeatTimes;
// Number of input values copied in for the tail tile; overwritten in Init().
uint32_t complexTailLength = 64;
// Polynomial coefficients used by TaylorExpand: 1, -1/3, 1/5, -1/7, 1/9, -1/11, 1/13.
const float factorList[7] = {1,
-0.3333333333333333,
0.2,
-0.14285714285714285,
0.1111111111111111,
-0.09090909090909091,
0.07692307692307693};
// Scale factor (2^62) and additive bias (2^-62) used by Sign().
const float ATAN_FP32_MAX = 4611686018427387904;
const float ATAN_FP32_MIN = 2.168404344971009e-19;
// Highest coefficient index passed to TaylorExpand by AtanImpl.
const uint8_t TAYLOR_COUNT_FOUR = 4;
const uint8_t TAYLOR_COUNT_SIX = 6;
// GatherMask pattern selectors used by SplitComplex for the real / imaginary gather.
const uint8_t GATHER_MASK_MODE_ONE = 1;
const uint8_t GATHER_MASK_MODE_TWO = 2;
// Element granularity used in Init() to round totalLength up.
const int64_t alignNumCp64 = 4;
// Global input (interleaved) and output tensors, bound in Init().
GlobalTensor<yType> xGm;
GlobalTensor<yType> yGm;
// Input / output staging queues.
TQue<QuePosition::VECIN, BUFFER_NUM> inQueue;
TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue;
// Backing buffers for the tensors below; sized in Init(), handles fetched in BufferGet().
TBuf<TPosition::VECCALC> maskBuf1;
TBuf<TPosition::VECCALC> tempBuf1;
TBuf<TPosition::VECCALC> tempBuf2;
TBuf<TPosition::VECCALC> tempBuf3;
TBuf<TPosition::VECCALC> maskBuf2;
TBuf<TPosition::VECCALC> tempBuf4;
TBuf<TPosition::VECCALC> tempBuf5;
TBuf<TPosition::VECCALC> oneBuf;
TBuf<TPosition::VECCALC> zeroBuf;
TBuf<TPosition::VECCALC> realBuf;
TBuf<TPosition::VECCALC> imagBuf;
// De-interleaved real and imaginary parts of the current tile (written by SplitComplex).
LocalTensor<yType> realLocal;
LocalTensor<yType> imagLocal;
// Constant tensors filled with 0 and 1 in BufferGet().
LocalTensor<yType> zeroTensor;
LocalTensor<yType> oneTensor;
// Scratch tensors shared by all compute helpers; several helpers pass state through them.
LocalTensor<yType> tempTensor1;
LocalTensor<yType> tempTensor2;
LocalTensor<yType> tempTensor3;
LocalTensor<yType> tempTensor4;
LocalTensor<yType> tempTensor5;
// Comparison results consumed by DoSelect.
LocalTensor<uint8_t> mask1;
LocalTensor<uint8_t> mask2;
};
}
#endif