/**
* Copyright (c) 2026 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
/*!
* \file acos.h
* \brief Element-wise arccosine (acos) kernel built on AscendC vector instructions.
*/
#ifndef ACOS_H
#define ACOS_H
#include "kernel_operator.h"
#include "kernel_tiling/kernel_tiling.h"
#include "acos_tiling_data.h"
#include "acos_tiling_key.h"
namespace NsAcos {
using namespace AscendC;
// Number of buffers handed to each input/output queue in Acos::Init().
constexpr int32_t BUFFER_NUM = 2;
// Bits per byte; used to size the compare masks at one bit per element.
constexpr int32_t BYTE_TO_BIT = 8;
// Byte granularity that tile lengths are rounded up to (see Acos::ELEMTENT_ALIGN).
constexpr int32_t BYTE_ALIGN = 256;
// Coefficients C[k] of the truncated arcsin power series
//   arcsin(x) ~= sum_k C[k] * x^(2k + 1),  k = 0 .. 7
// as consumed by Acos::ComputeArcSin().
constexpr float taylorCoefficients[] = {1.0,          1.0 / 6,      3.0 / 40,      5.0 / 112,
                                        35.0 / 1152,  63.0 / 2816,  231.0 / 13312, 143.0 / 10240};
// pi / 2. The previous literal had two mistyped digits (...8996... instead of ...8966...);
// the value after rounding to float is unchanged.
constexpr float halfPi = 1.5707963267948966192313216916398;
// sqrt(2) / 2: threshold on |x| used by Acos::ComputeArcCos() to choose between
// the direct series and the complementary form. (Identifier spelling kept for compatibility.)
constexpr float Boudry = 0.70710678118654752440084436210485;
// Element-wise arccosine kernel.
// T is the element type of the global input/output tensors. The arithmetic itself
// is carried out on float working buffers (see Compute()).
// Usage as visible in this file: call Init() once, then Process().
template <typename T>
class Acos {
public:
__aicore__ inline Acos(){};
// Binds the global tensors for the current core and allocates all queues/buffers.
__aicore__ inline void Init(GM_ADDR x, GM_ADDR y, const AcosTilingData* tilingData);
// Runs CopyIn -> Compute -> CopyOut for every tile assigned to the current core.
__aicore__ inline void Process();
// Number of T elements that occupy BYTE_ALIGN bytes; tile lengths are rounded up to a multiple of this.
inline static constexpr int32_t ELEMTENT_ALIGN = BYTE_ALIGN / sizeof(T);
private:
// Copies tileLength elements of tile `progress` from inputGMX into inputQueueX.
__aicore__ inline void CopyIn(uint64_t progress, uint64_t tileLength);
// Copies tileLength elements of tile `progress` from outputQueueY to outputGMY.
__aicore__ inline void CopyOut(uint64_t progress, uint64_t tileLength);
// Converts one tile to float, evaluates acos, converts back to T and enqueues the result.
__aicore__ inline void Compute(uint64_t progress, uint64_t tileLength);
// yLocal = acos(xLocal); xLocal is used as scratch and overwritten.
__aicore__ inline void ComputeArcCos(LocalTensor<float> yLocal, LocalTensor<float> xLocal, uint64_t tileLength);
// yLocal = truncated arcsin series of xLocal; xLocal is overwritten with its square.
__aicore__ inline void ComputeArcSin(LocalTensor<float> yLocal, LocalTensor<float> xLocal, uint64_t tileLength);
private:
// Used by Init() to allocate every queue and buffer declared below.
TPipe pipe;
// Input tiles: filled by CopyIn(), consumed by Compute().
TQue<QuePosition::VECIN, BUFFER_NUM> inputQueueX;
// Output tiles: filled by Compute(), drained by CopyOut().
TQue<QuePosition::VECOUT, BUFFER_NUM> outputQueueY;
// This core's slice of the global input / output.
GlobalTensor<T> inputGMX;
GlobalTensor<T> outputGMY;
// Float copy of the current input tile (Compute()).
TBuf<TPosition::VECCALC> inputTempBuf;
// Float acos result before it is converted back to T (Compute()).
TBuf<TPosition::VECCALC> outputTempBuf1;
// Scratch for the complementary branch and the sign flip (ComputeArcCos()).
TBuf<TPosition::VECCALC> outputTempBuf2;
// Running odd power of x (ComputeArcSin()).
TBuf<TPosition::VECCALC> xPowTempBuf;
// Current series term, coefficient * power (ComputeArcSin()).
TBuf<TPosition::VECCALC> calcTempBuf;
// Compare mask for |x| < Boudry (ComputeArcCos()).
TBuf<TPosition::VECCALC> xBoudryMarkMask;
// Compare mask for x < 0 (ComputeArcCos()).
TBuf<TPosition::VECCALC> xSignMask;
// Number of tiles this core processes.
uint64_t loopCount_ = 0;
// Number of elements in this core's slice of global memory.
uint64_t blockLength_ = 0;
// Per-tile element count rounded up to ELEMTENT_ALIGN; also the tile stride in global memory.
uint64_t tileBufferLen_ = 0;
// Number of valid elements in the last tile.
uint64_t tailTileLen_ = 0;
};
/**
 * Binds this core's slice of the global input/output and allocates all on-chip buffers.
 *
 * Cores whose index is below tilingData->formerCoreNum use the "formerCore*" tiling
 * fields; every other core uses the "tailCore*" fields and is placed after all former
 * cores in global memory.
 *
 * @param x          Global-memory address of the input tensor.
 * @param y          Global-memory address of the output tensor.
 * @param tilingData Tiling parameters; only read during this call.
 *
 * NOTE(review): CopyIn()/CopyOut() address tile i at i * tileBufferLen_, and
 * tileBufferLen_ is the per-tile count rounded UP to ELEMTENT_ALIGN. This presumably
 * relies on the host tiling already producing aligned per-tile counts - confirm.
 */
template <typename T>
__aicore__ inline void Acos<T>::Init(GM_ADDR x, GM_ADDR y, const AcosTilingData* tilingData)
{
    // Widen everything that takes part in the offset arithmetic to 64 bits first, so the
    // products below cannot wrap even if the tiling fields are narrower than 64 bits.
    const uint64_t blockIdx = static_cast<uint64_t>(GetBlockIdx());
    const uint64_t formerCoreNum = static_cast<uint64_t>(tilingData->formerCoreNum);

    uint64_t offset = 0;
    uint64_t tileDataNum = 0;
    if (blockIdx >= formerCoreNum) {
        blockLength_ = tilingData->tailCoreDataNum;
        loopCount_ = tilingData->tailCoreLoopCount;
        tileDataNum = tilingData->tailCoreFormerDataNum;
        tailTileLen_ = tilingData->tailCoreTailDataNum;
        // Skip all former cores, then the tail cores that precede this one.
        offset = formerCoreNum * static_cast<uint64_t>(tilingData->formerCoreDataNum) +
                 (blockIdx - formerCoreNum) * blockLength_;
    } else {
        blockLength_ = tilingData->formerCoreDataNum;
        loopCount_ = tilingData->formerCoreLoopCount;
        tileDataNum = tilingData->formerCoreFormerDataNum;
        tailTileLen_ = tilingData->formerCoreTailDataNum;
        offset = blockLength_ * blockIdx;
    }
    // Round the per-tile element count up to a whole number of ELEMTENT_ALIGN groups.
    tileBufferLen_ = (tileDataNum + ELEMTENT_ALIGN - 1) / ELEMTENT_ALIGN * ELEMTENT_ALIGN;

    inputGMX.SetGlobalBuffer((__gm__ T*)x + offset, blockLength_);
    outputGMY.SetGlobalBuffer((__gm__ T*)y + offset, blockLength_);

    const uint64_t ioTileBytes = tileBufferLen_ * sizeof(T);
    const uint64_t floatTileBytes = tileBufferLen_ * sizeof(float);
    // Compare masks hold one bit per element.
    const uint64_t maskBytes = (tileBufferLen_ + BYTE_TO_BIT - 1) / BYTE_TO_BIT;

    pipe.InitBuffer(inputQueueX, BUFFER_NUM, ioTileBytes);
    pipe.InitBuffer(outputQueueY, BUFFER_NUM, ioTileBytes);
    pipe.InitBuffer(inputTempBuf, floatTileBytes);
    pipe.InitBuffer(outputTempBuf1, floatTileBytes);
    pipe.InitBuffer(outputTempBuf2, floatTileBytes);
    pipe.InitBuffer(xPowTempBuf, floatTileBytes);
    pipe.InitBuffer(calcTempBuf, floatTileBytes);
    pipe.InitBuffer(xSignMask, maskBytes);
    pipe.InitBuffer(xBoudryMarkMask, maskBytes);
}
/**
 * Loads one tile from global memory into a freshly allocated input-queue tensor.
 *
 * @param progress   Tile index; the tile starts at element progress * tileBufferLen_ of inputGMX.
 * @param tileLength Number of T elements to copy.
 *
 * NOTE(review): blockLen receives a byte count computed in 64 bits; confirm it always fits
 * the width of DataCopyParams::blockLen for the tile sizes the tiling can produce.
 */
template <typename T>
__aicore__ inline void Acos<T>::CopyIn(uint64_t progress, uint64_t tileLength)
{
    // One contiguous burst of tileLength elements, no striding on either side.
    DataCopyParams burst;
    burst.blockCount = 1;
    burst.blockLen = tileLength * sizeof(T);
    burst.srcStride = 0;
    burst.dstStride = 0;

    const uint64_t gmOffset = progress * tileBufferLen_;
    LocalTensor<T> tile = inputQueueX.AllocTensor<T>();
    // Padding is disabled (first field false, the rest zero).
    DataCopyPad(tile, inputGMX[gmOffset], burst, {false, 0, 0, 0});
    inputQueueX.EnQue(tile);
}
/**
 * Takes the next finished tile from the output queue, writes it to global memory and
 * returns the tensor to the queue.
 *
 * @param progress   Tile index; the tile is written at element progress * tileBufferLen_ of outputGMY.
 * @param tileLength Number of T elements to write.
 */
template <typename T>
__aicore__ inline void Acos<T>::CopyOut(uint64_t progress, uint64_t tileLength)
{
    // One contiguous burst of tileLength elements, no striding on either side.
    DataCopyParams burst;
    burst.blockCount = 1;
    burst.blockLen = tileLength * sizeof(T);
    burst.srcStride = 0;
    burst.dstStride = 0;

    const uint64_t gmOffset = progress * tileBufferLen_;
    LocalTensor<T> tile = outputQueueY.DeQue<T>();
    DataCopyPad(outputGMY[gmOffset], tile, burst);
    outputQueueY.FreeTensor(tile);
}
/**
 * Evaluates the truncated arcsin power series element-wise:
 *
 *   yLocal = sum_{k = 0 .. N-1} C[k] * x^(2k + 1)
 *
 * where C is the constant table taylorCoefficients and N is its length.
 *
 * @param yLocal     Receives the series value.
 * @param xLocal     Input x. Overwritten with x * x on return; ComputeArcCos() depends on this.
 * @param tileLength Number of float elements to process.
 */
template <typename T>
__aicore__ inline void Acos<T>::ComputeArcSin(LocalTensor<float> yLocal, LocalTensor<float> xLocal, uint64_t tileLength)
{
    constexpr uint32_t termCount = sizeof(taylorCoefficients) / sizeof(taylorCoefficients[0]);
    LocalTensor<float> oddPower = xPowTempBuf.Get<float>();
    LocalTensor<float> term = calcTempBuf.Get<float>();

    // k = 0: both the running odd power and the running sum start at x.
    DataCopy(oddPower, xLocal, tileLength);
    DataCopy(yLocal, xLocal, tileLength);
    // From here on xLocal holds x^2, the factor that steps x^(2k-1) to x^(2k+1).
    Mul(xLocal, xLocal, xLocal, tileLength);

    for (uint32_t k = 1; k < termCount; ++k) {
        Mul(oddPower, oddPower, xLocal, tileLength);
        Muls(term, oddPower, taylorCoefficients[k], tileLength);
        Add(yLocal, yLocal, term, tileLength);
    }
}
/**
 * Computes yLocal = acos(xLocal) element-wise as pi/2 - asin(x).
 *
 * asin is evaluated on |x| in two ways and the result is chosen per element:
 *   - |x| <  Boudry: the series of |x| directly;
 *   - otherwise:     pi/2 - series(sqrt(1 - x^2)).
 * The sign of x is then restored before the final pi/2 - asin(x).
 *
 * @param yLocal     Receives acos(x).
 * @param xLocal     Input x; used as scratch and overwritten.
 * @param tileLength Number of float elements to process.
 */
template <typename T>
__aicore__ inline void Acos<T>::ComputeArcCos(LocalTensor<float> yLocal, LocalTensor<float> xLocal, uint64_t tileLength)
{
    LocalTensor<float> scratch = outputTempBuf2.Get<float>();
    LocalTensor<uint8_t> negativeMask = xSignMask.Get<uint8_t>();
    LocalTensor<uint8_t> belowBoundaryMask = xBoudryMarkMask.Get<uint8_t>();

    // Record which inputs are negative, then continue with |x|.
    CompareScalar(negativeMask, xLocal, 0.0f, CMPMODE::LT, tileLength);
    Abs(xLocal, xLocal, tileLength);
    CompareScalar(belowBoundaryMask, xLocal, Boudry, CMPMODE::LT, tileLength);

    // Direct branch: yLocal = series(|x|). ComputeArcSin leaves x^2 in xLocal.
    ComputeArcSin(yLocal, xLocal, tileLength);

    // Complementary branch: xLocal = sqrt(1 - x^2), scratch = pi/2 - series(xLocal).
    Muls(xLocal, xLocal, -1.0f, tileLength);
    Adds(xLocal, xLocal, 1.0f, tileLength);
    Sqrt(xLocal, xLocal, tileLength);
    ComputeArcSin(scratch, xLocal, tileLength);
    Muls(scratch, scratch, -1.0f, tileLength);
    Adds(scratch, scratch, halfPi, tileLength);

    // asin(|x|): direct value where |x| < Boudry, complementary value elsewhere.
    Select(yLocal, belowBoundaryMask, yLocal, scratch, SELMODE::VSEL_TENSOR_TENSOR_MODE, tileLength);

    // asin(x): negate where the original x was negative.
    Muls(scratch, yLocal, -1.0f, tileLength);
    Select(yLocal, negativeMask, scratch, yLocal, SELMODE::VSEL_TENSOR_TENSOR_MODE, tileLength);

    // acos(x) = pi/2 - asin(x).
    Muls(yLocal, yLocal, -1.0f, tileLength);
    Adds(yLocal, yLocal, halfPi, tileLength);
}
/**
 * Processes one tile: takes it from the input queue, evaluates acos on float working
 * buffers, converts the result back to T and enqueues it for CopyOut().
 *
 * @param progress   Tile index (not used by the computation itself).
 * @param tileLength Number of elements to process.
 */
template <typename T>
__aicore__ inline void Acos<T>::Compute(uint64_t progress, uint64_t tileLength)
{
    constexpr bool isFp32 = std::is_same_v<T, float>;

    LocalTensor<T> src = inputQueueX.DeQue<T>();
    LocalTensor<T> dst = outputQueueY.AllocTensor<T>();
    LocalTensor<float> workIn = inputTempBuf.Get<float>();
    LocalTensor<float> workOut = outputTempBuf1.Get<float>();

    // Bring the tile into the float working buffer: plain copy for float, cast otherwise.
    if constexpr (isFp32) {
        DataCopy(workIn, src, tileLength);
    } else {
        Cast(workIn, src, RoundMode::CAST_NONE, tileLength);
    }

    ComputeArcCos(workOut, workIn, tileLength);

    // Write the result back in the tensor's element type.
    if constexpr (isFp32) {
        DataCopy(dst, workOut, tileLength);
    } else {
        Cast(dst, workOut, RoundMode::CAST_ROUND, tileLength);
    }

    outputQueueY.EnQue<T>(dst);
    inputQueueX.FreeTensor(src);
}
/**
 * Runs the CopyIn -> Compute -> CopyOut pipeline over every tile assigned to this core.
 *
 * The first loopCount_ - 1 tiles are full (tileBufferLen_ elements). The last tile
 * transfers tailTileLen_ elements, while its computation runs over that length rounded
 * up to ELEMTENT_ALIGN.
 *
 * Does nothing when loopCount_ is 0. Previously `loopCount_ - 1` wrapped around in that
 * case, and the loop index was a signed int compared against an unsigned 64-bit bound.
 */
template <typename T>
__aicore__ inline void Acos<T>::Process()
{
    if (loopCount_ == 0) {
        return;
    }
    const uint64_t lastTile = loopCount_ - 1;

    for (uint64_t tile = 0; tile < lastTile; ++tile) {
        CopyIn(tile, tileBufferLen_);
        Compute(tile, tileBufferLen_);
        CopyOut(tile, tileBufferLen_);
    }

    // Tail tile: move only the valid elements, but compute on an aligned element count.
    const uint64_t tailComputeLen = (tailTileLen_ + ELEMTENT_ALIGN - 1) / ELEMTENT_ALIGN * ELEMTENT_ALIGN;
    CopyIn(lastTile, tailTileLen_);
    Compute(lastTile, tailComputeLen);
    CopyOut(lastTile, tailTileLen_);
}
}
#endif