/**
 * Copyright (c) 2026 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
/*!
 * \file asinh.h
 * \brief Ascend C kernel class that computes the element-wise inverse hyperbolic sine (asinh).
 */
#ifndef ASINH_H
#define ASINH_H
#include "kernel_operator.h"
#include "kernel_tiling/kernel_tiling.h"
#include "asinh_tiling_data.h"
#include "asinh_tiling_key.h"
namespace NsAsinh {
using namespace AscendC;
// Depth of the input/output queues; also the buffer count passed to InitBuffer for those queues.
constexpr int32_t BUFFER_NUM = 2;
// Number of bits in a byte; the comparison-mask buffers are sized as ceil(tile length / BYTE_TO_BIT) bytes.
constexpr int32_t BYTE_TO_BIT = 8;
/**
 * Element-wise asinh kernel for one core.
 *
 * Init() binds the core's slice of the input/output global tensors and reserves the local buffers;
 * Process() then walks the slice tile by tile (copy in -> compute -> copy out).
 *
 * For |x| < Boudry the result comes from a truncated Taylor series, otherwise from
 * ln(|x| + sqrt(x^2 + 1)); the sign of the input is re-applied at the end.
 *
 * @tparam T Element type of the input and output tensors.
 */
template <typename T>
class Asinh {
public:
    __aicore__ inline Asinh() {}

    /** Selects this core's tiling parameters, binds global memory and allocates local buffers. */
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, const AsinhTilingData* tilingData);

    /** Runs the copy-in / compute / copy-out sequence over every tile of this core. */
    __aicore__ inline void Process();

    // Coefficients of x, x^3, x^5, ..., x^15 in the Taylor expansion of asinh(x).
    inline static constexpr T taylorCoefficients[] = {
        1.0, -1.0 / 6, 3.0 / 40, -5.0 / 112, 35.0 / 1152, -63.0 / 2816, 231.0 / 13312, -143.0 / 10240};

    // Threshold on |x| (about sqrt(2)/2): below it the Taylor result is selected, otherwise the log form.
    inline static constexpr T Boudry = 0.70710678118654752440084436210485;

    // Number of T elements that fit in 256 bytes; Process() rounds the tail compute length up to a multiple of it.
    inline static constexpr int32_t ELEMTENT_ALIGN = 256 / sizeof(T);

private:
    // Per-tile pipeline stages.
    __aicore__ inline void CopyIn(uint64_t progress, uint64_t tileLength);
    __aicore__ inline void CopyOut(uint64_t progress, uint64_t tileLength);
    __aicore__ inline void Compute(uint64_t progress, uint64_t tileLength);

    // Taylor-series evaluation; leaves x^2 in xLocal.
    __aicore__ inline void ComputeArcSinh(LocalTensor<T> yLocal, LocalTensor<T> xLocal, uint64_t tileLength);

    TPipe pipe;

    // Queues carrying tiles between the copy and compute stages.
    TQue<QuePosition::VECIN, BUFFER_NUM> inputQueueX;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outputQueueY;

    // Views of this core's slice of the input and output tensors in global memory.
    GlobalTensor<T> inputGMX;
    GlobalTensor<T> outputGMY;

    // Scratch buffers used during Compute / ComputeArcSinh.
    TBuf<TPosition::VECCALC> outputTempBuf;    // holds |x|, then the log-form result, then the negated result
    TBuf<TPosition::VECCALC> xPowTempBuf;      // running odd power of x for the Taylor series
    TBuf<TPosition::VECCALC> calcTempBuf;      // current Taylor term
    TBuf<TPosition::VECCALC> xBoudryMarkMask;  // bit mask of |x| < Boudry
    TBuf<TPosition::VECCALC> xSignMask;        // bit mask of x < 0

    // Tiling state chosen in Init().
    uint64_t loopCount_ = 0;      // number of tiles processed by this core
    uint64_t blockLength_ = 0;    // number of elements handled by this core
    uint64_t tileBufferLen_ = 0;  // elements per full tile (also the local buffer capacity)
    uint64_t tailTileLen_ = 0;    // elements in the last tile
};
/**
 * Picks the tiling parameters for the current core, binds its slice of global memory and
 * reserves every local buffer used by the kernel.
 *
 * Cores whose index is below tilingData->formerCoreNum take the "former core" fields; all other
 * cores take the "tail core" fields and start after the data owned by the former cores.
 *
 * @param x          Global-memory address of the input tensor.
 * @param y          Global-memory address of the output tensor.
 * @param tilingData Tiling parameters for this launch.
 */
template <typename T>
__aicore__ inline void Asinh<T>::Init(GM_ADDR x, GM_ADDR y, const AsinhTilingData* tilingData)
{
    auto coreIdx = GetBlockIdx();
    uint64_t gmOffset;

    if (coreIdx < tilingData->formerCoreNum) {
        // Former core: slices are laid out back to back from the start of the tensor.
        blockLength_ = tilingData->formerCoreDataNum;
        loopCount_ = tilingData->formerCoreLoopCount;
        tileBufferLen_ = tilingData->formerCoreFormerDataNum;
        tailTileLen_ = tilingData->formerCoreTailDataNum;
        gmOffset = blockLength_ * coreIdx;
    } else {
        // Tail core: skip everything owned by the former cores, then index among the tail cores.
        blockLength_ = tilingData->tailCoreDataNum;
        loopCount_ = tilingData->tailCoreLoopCount;
        tileBufferLen_ = tilingData->tailCoreFormerDataNum;
        tailTileLen_ = tilingData->tailCoreTailDataNum;
        // NOTE(review): the first product is evaluated in the tiling fields' own type - confirm it cannot overflow.
        gmOffset = tilingData->formerCoreNum * tilingData->formerCoreDataNum +
                   (coreIdx - tilingData->formerCoreNum) * blockLength_;
    }

    inputGMX.SetGlobalBuffer((__gm__ T*)x + gmOffset, blockLength_);
    outputGMY.SetGlobalBuffer((__gm__ T*)y + gmOffset, blockLength_);

    // One full tile of T per data buffer; one bit per element (rounded up to whole bytes) per mask buffer.
    const auto tileBytes = tileBufferLen_ * sizeof(T);
    const auto maskBytes = (tileBufferLen_ + BYTE_TO_BIT - 1) / BYTE_TO_BIT;

    pipe.InitBuffer(inputQueueX, BUFFER_NUM, tileBytes);
    pipe.InitBuffer(outputQueueY, BUFFER_NUM, tileBytes);
    pipe.InitBuffer(outputTempBuf, tileBytes);
    pipe.InitBuffer(xPowTempBuf, tileBytes);
    pipe.InitBuffer(calcTempBuf, tileBytes);
    pipe.InitBuffer(xSignMask, maskBytes);
    pipe.InitBuffer(xBoudryMarkMask, maskBytes);
}
/**
 * Loads one tile of input from global memory into a freshly allocated local tensor and
 * enqueues it on inputQueueX.
 *
 * @param progress   Index of the tile; the tile starts at element progress * tileBufferLen_.
 * @param tileLength Number of elements to copy.
 */
template <typename T>
__aicore__ inline void Asinh<T>::CopyIn(uint64_t progress, uint64_t tileLength)
{
    AscendC::LocalTensor<T> srcTile = inputQueueX.AllocTensor<T>();

    // A single contiguous block of tileLength elements, expressed in bytes.
    AscendC::DataCopyParams gmToLocal;
    gmToLocal.blockLen = tileLength * sizeof(T);
    gmToLocal.blockCount = 1;
    gmToLocal.dstStride = 0;
    gmToLocal.srcStride = 0;

    const uint64_t gmStart = progress * tileBufferLen_;
    AscendC::DataCopyPad(srcTile, inputGMX[gmStart], gmToLocal, {false, 0, 0, 0});

    inputQueueX.EnQue(srcTile);
}
/**
 * Dequeues one computed tile from outputQueueY, stores it to global memory and releases
 * the local tensor.
 *
 * @param progress   Index of the tile; the tile is written at element progress * tileBufferLen_.
 * @param tileLength Number of elements to copy.
 */
template <typename T>
__aicore__ inline void Asinh<T>::CopyOut(uint64_t progress, uint64_t tileLength)
{
    AscendC::LocalTensor<T> resultTile = outputQueueY.DeQue<T>();

    // A single contiguous block of tileLength elements, expressed in bytes.
    AscendC::DataCopyParams localToGm;
    localToGm.blockLen = tileLength * sizeof(T);
    localToGm.blockCount = 1;
    localToGm.dstStride = 0;
    localToGm.srcStride = 0;

    const uint64_t gmStart = progress * tileBufferLen_;
    AscendC::DataCopyPad(outputGMY[gmStart], resultTile, localToGm);

    outputQueueY.FreeTensor(resultTile);
}
/**
 * Evaluates the truncated Taylor series of asinh for every element:
 *
 *     y = C0*x + C1*x^3 + C2*x^5 + ... + C7*x^15
 *
 * where C is taylorCoefficients (C0 is 1.0, so the first term is x itself).
 *
 * Side effect: on return xLocal holds x^2, not x. Compute() depends on this when it forms
 * sqrt(x^2 + 1), so the squared value must not be altered inside the loop.
 *
 * @param yLocal     Receives the series sum.
 * @param xLocal     Input values on entry; overwritten with their squares.
 * @param tileLength Number of elements to process.
 */
template <typename T>
__aicore__ inline void Asinh<T>::ComputeArcSinh(LocalTensor<T> yLocal, LocalTensor<T> xLocal, uint64_t tileLength)
{
    constexpr uint32_t termCount = sizeof(taylorCoefficients) / sizeof(taylorCoefficients[0]);

    LocalTensor<T> xPowTempTensor = xPowTempBuf.Get<T>();
    LocalTensor<T> calcTempTensor = calcTempBuf.Get<T>();

    // Seed both the running power and the sum with x (the C0 term).
    DataCopy(xPowTempTensor, xLocal, tileLength);
    DataCopy(yLocal, xLocal, tileLength);

    // xLocal becomes x^2: the step between consecutive odd powers.
    Mul(xLocal, xLocal, xLocal, tileLength);

    for (uint32_t term = 1; term < termCount; ++term) {
        // Advance x^(2*term-1) -> x^(2*term+1), scale by its coefficient and accumulate.
        Mul(xPowTempTensor, xPowTempTensor, xLocal, tileLength);
        Muls(calcTempTensor, xPowTempTensor, static_cast<T>(taylorCoefficients[term]), tileLength);
        Add(yLocal, yLocal, calcTempTensor, tileLength);
    }
}
/**
 * Computes asinh for one tile taken from inputQueueX and enqueues the result on outputQueueY.
 *
 * Works on |x| and restores the sign at the end. Elements with |x| < Boudry take the Taylor-series
 * value; the others take ln(|x| + sqrt(x^2 + 1)).
 *
 * @param progress   Tile index (not used by the computation itself).
 * @param tileLength Number of elements to process.
 */
template <typename T>
__aicore__ inline void Asinh<T>::Compute(uint64_t progress, uint64_t tileLength)
{
    LocalTensor<T> srcTile = inputQueueX.DeQue<T>();
    LocalTensor<T> dstTile = outputQueueY.AllocTensor<T>();
    LocalTensor<T> scratch = outputTempBuf.Get<T>();
    LocalTensor<uint8_t> negativeMask = xSignMask.Get<uint8_t>();
    LocalTensor<uint8_t> smallMask = xBoudryMarkMask.Get<uint8_t>();

    // Record which inputs are negative, then continue with |x|.
    CompareScalar(negativeMask, srcTile, static_cast<T>(0.0), AscendC::CMPMODE::LT, tileLength);
    Abs(srcTile, srcTile, tileLength);

    // Record which magnitudes fall in the Taylor-series range.
    CompareScalar(smallMask, srcTile, static_cast<T>(Boudry), AscendC::CMPMODE::LT, tileLength);

    // Keep a copy of |x|: the series evaluation replaces srcTile with x^2.
    DataCopy(scratch, srcTile, tileLength);
    ComputeArcSinh(dstTile, srcTile, tileLength);

    // Log form: ln(|x| + sqrt(x^2 + 1)), built in place on srcTile (currently x^2).
    // NOTE(review): x^2 is formed in type T, so large |x| may overflow before the sqrt - confirm the input range.
    Adds(srcTile, srcTile, static_cast<T>(1.0), tileLength);
    Sqrt(srcTile, srcTile, tileLength);
    Add(srcTile, srcTile, scratch, tileLength);
    Ln(scratch, srcTile, tileLength);

    // Series value where |x| < Boudry, log value elsewhere.
    Select(dstTile, smallMask, dstTile, scratch, SELMODE::VSEL_TENSOR_TENSOR_MODE, tileLength);

    // asinh is odd: negate the result wherever the original input was negative.
    Muls(scratch, dstTile, static_cast<T>(-1.0), tileLength);
    Select(dstTile, negativeMask, scratch, dstTile, SELMODE::VSEL_TENSOR_TENSOR_MODE, tileLength);

    outputQueueY.EnQue<T>(dstTile);
    inputQueueX.FreeTensor(srcTile);
}
/**
 * Processes every tile assigned to this core: all full tiles of tileBufferLen_ elements first,
 * then the final tile of tailTileLen_ elements.
 *
 * Does nothing when loopCount_ is zero, i.e. the core has no tiles to process.
 */
template <typename T>
__aicore__ inline void Asinh<T>::Process()
{
    // loopCount_ is unsigned: without this guard loopCount_ - 1 would wrap around to UINT64_MAX.
    if (loopCount_ == 0) {
        return;
    }

    const uint64_t lastTile = loopCount_ - 1;
    for (uint64_t tile = 0; tile < lastTile; ++tile) {
        CopyIn(tile, tileBufferLen_);
        Compute(tile, tileBufferLen_);
        CopyOut(tile, tileBufferLen_);
    }

    // Only tailTileLen_ elements are transferred, but the compute length is rounded up to a whole
    // number of 256-byte groups (ELEMTENT_ALIGN elements each).
    // NOTE(review): presumably required by the vector instructions, and assumes the rounded length
    // still fits in the tileBufferLen_-sized local buffers - confirm against the tiling code.
    const uint64_t alignedTailLen = (tailTileLen_ + ELEMTENT_ALIGN - 1) / ELEMTENT_ALIGN * ELEMTENT_ALIGN;
    CopyIn(lastTile, tailTileLen_);
    Compute(lastTile, alignedTailLen);
    CopyOut(lastTile, tailTileLen_);
}
}
#endif