/**
 * Copyright (c) 2026 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
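/*!
 * \file acosh.h
 * \brief Kernel class for the Acosh operator (arch35 / Ascend950)
 *
 * Aligned with DESIGN.md v2.1 §3.5:
 * - 13-step numerically stable acosh formula
 * - FP32 passes straight through / FP16/BF16: CAST_NONE on entry → 13 steps in FP32 → CAST_RINT on exit
 * - Double buffering (BUFFER_NUM=2)
 * - Buffer reuse: dataTBuf (step1→6/10) / dataRBuf (step5→6/11/8/9/12) / logTmpBuf (step13a/b/c)
 * - The natural-log Log API is called with three arguments (no sharedTmpBuffer is passed); the framework
 *   automatically allocates its scratch space from the UB left over after the InitBuffer calls
 *
 * Scope of iteration one (FP32 mainline skeleton):
 * - The FP32 path is fully implemented; the FP16/BF16 path lands in iteration two (the else branch here is
 *   reserved as a compile-time placeholder)
 */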
#ifndef NSACOSH_ACOSH_H
#define NSACOSH_ACOSH_H
#include "kernel_operator.h"
#include "kernel_tiling/kernel_tiling.h"
#include "acosh_tiling_data.h"
#include "acosh_tiling_key.h"
namespace NsAcosh {
using namespace AscendC;
// Scalar operands used by the element-wise steps of Acosh::ComputeFp32Pipeline.
constexpr float CONST_NEG_ONE = -1.0f;
constexpr float CONST_ONE = 1.0f;
// Lower clamp applied (via Maxs) to the divisor of the final Div step. The literal rounds to the
// smallest positive subnormal float.
// NOTE(review): if the target vector unit flushes subnormals to zero, this clamp cannot keep the
// divisor non-zero -- confirm against the hardware's float behaviour.
constexpr float CONST_S_MIN = 1.0e-45f;
// Upper clamp applied (via Mins) to the same divisor: the largest finite float (FLT_MAX),
// presumably so that an infinite divisor becomes finite before the division.
// The previous literal 3.4028235e34f carried the FLT_MAX mantissa with exponent 34, which looks
// like a typo for 38.
constexpr float CONST_S_MAX = 3.4028235e38f;
// ln(2); added to ln(x) to form ln(2x).
constexpr float CONST_LN2_ADD = 0.693147180559945286227f;
// Buffer count passed to the input/output TQue declarations and to their InitBuffer calls.
static constexpr int32_t BUFFER_NUM = 2;
/**
 * \brief Element-wise acosh kernel; T is the element type of the input and output GM tensors.
 *
 * Usage: call Init() once with the GM addresses and tiling data, then Process(), which runs
 * CopyIn -> Compute -> CopyOut for every tile of at most ubFactor_ elements.
 */
template <typename T>
class Acosh {
public:
    __aicore__ inline Acosh() {}

    /** Binds this core's slice of the GM tensors and initialises all queues and work buffers. */
    __aicore__ inline void Init(GM_ADDR self, GM_ADDR out, const AcoshTilingData* tilingData);

    /** Processes this core's slice tile by tile; returns immediately when the slice is empty. */
    __aicore__ inline void Process();

private:
    /** Copies tile number `progress` (currentNum elements) from GM into the input queue. */
    __aicore__ inline void CopyIn(int64_t progress, int64_t currentNum);

    /** Takes one tile from the input queue, computes acosh, and enqueues the result. */
    __aicore__ inline void Compute(int64_t currentNum);

    /** Copies tile number `progress` (currentNum elements) from the output queue to GM. */
    __aicore__ inline void CopyOut(int64_t progress, int64_t currentNum);

    /** float-only acosh pipeline over `count` elements; reads xFp32, writes yFp32. */
    __aicore__ inline void ComputeFp32Pipeline(LocalTensor<float>& xFp32,
                                               LocalTensor<float>& yFp32,
                                               int64_t count);

    TPipe pipe;
    TQue<QuePosition::VECIN, BUFFER_NUM> inputQue;    // tiles of T read from GM
    TQue<QuePosition::VECOUT, BUFFER_NUM> outputQue;  // tiles of T to be written to GM
    TBuf<QuePosition::VECCALC> fp32WorkBuf;           // float staging; initialised only when T != float
    TBuf<QuePosition::VECCALC> dataTBuf;              // float scratch for the pipeline
    TBuf<QuePosition::VECCALC> dataRBuf;              // float scratch for the pipeline
    TBuf<QuePosition::VECCALC> logTmpBuf;             // holds ln(x) + ln(2) for the final Min
    GlobalTensor<T> inputGM;
    GlobalTensor<T> outputGM;
    int64_t blockLength_ = 0;  // number of elements handled by this core
    int64_t ubFactor_ = 0;     // maximum number of elements per tile
};
/**
 * \brief Binds this core's slice of the GM tensors and sets up the UB queues and buffers.
 * \param self        GM address of the input tensor (elements of type T).
 * \param out         GM address of the output tensor (elements of type T).
 * \param tilingData  Tiling data; totalNum, blockFactor and ubFactor are read.
 */
template <typename T>
__aicore__ inline void Acosh<T>::Init(GM_ADDR self, GM_ADDR out, const AcoshTilingData* tilingData)
{
    // Element offset of this core's slice: each core starts blockFactor elements after the previous one.
    const int64_t coreIdx = AscendC::GetBlockIdx();
    const int64_t coreOffset = tilingData->blockFactor * coreIdx;

    // The slice is blockFactor elements long, except that the remaining element count caps it
    // (the result may be <= 0; Process() checks for that).
    const int64_t remaining = tilingData->totalNum - coreOffset;
    if (remaining > tilingData->blockFactor) {
        blockLength_ = tilingData->blockFactor;
    } else {
        blockLength_ = remaining;
    }
    ubFactor_ = tilingData->ubFactor;

    __gm__ T* selfBase = (__gm__ T*)self + coreOffset;
    __gm__ T* outBase = (__gm__ T*)out + coreOffset;
    inputGM.SetGlobalBuffer(selfBase, blockLength_);
    outputGM.SetGlobalBuffer(outBase, blockLength_);

    // Queues hold tiles of T; the scratch buffers always hold float.
    const auto ioBytes = ubFactor_ * sizeof(T);
    const auto fp32Bytes = ubFactor_ * sizeof(float);
    pipe.InitBuffer(inputQue, BUFFER_NUM, ioBytes);
    pipe.InitBuffer(outputQue, BUFFER_NUM, ioBytes);
    if constexpr (!std::is_same_v<T, float>) {
        // Only non-float element types need a float staging buffer.
        pipe.InitBuffer(fp32WorkBuf, fp32Bytes);
    }
    pipe.InitBuffer(dataTBuf, fp32Bytes);
    pipe.InitBuffer(dataRBuf, fp32Bytes);
    pipe.InitBuffer(logTmpBuf, fp32Bytes);
}
/**
 * \brief Processes this core's slice in tiles of at most ubFactor_ elements.
 *
 * Each tile goes through CopyIn -> Compute -> CopyOut. The last tile carries whatever is left
 * of blockLength_ and may be shorter than ubFactor_.
 */
template <typename T>
__aicore__ inline void Acosh<T>::Process()
{
    // Nothing to do when this core has no elements. A non-positive tile size is rejected as
    // well: it would otherwise be used as a divisor in the loopCount computation below.
    if (blockLength_ <= 0 || ubFactor_ <= 0) {
        return;
    }
    // Ceiling division: number of tiles needed to cover blockLength_ elements.
    const int64_t loopCount = (blockLength_ + ubFactor_ - 1) / ubFactor_;
    for (int64_t i = 0; i < loopCount; i++) {
        const bool isLastTile = (i == (loopCount - 1));
        const int64_t currentNum = isLastTile ? (blockLength_ - ubFactor_ * i) : ubFactor_;
        CopyIn(i, currentNum);
        Compute(currentNum);
        CopyOut(i, currentNum);
    }
}
/**
 * \brief Copies one tile from GM into a freshly allocated input-queue tensor and enqueues it.
 * \param progress    Tile index; the GM read starts at element progress * ubFactor_.
 * \param currentNum  Number of elements in this tile.
 */
template <typename T>
__aicore__ inline void Acosh<T>::CopyIn(int64_t progress, int64_t currentNum)
{
    // One block whose length is given in bytes, with zero source/destination stride.
    AscendC::DataCopyExtParams burst;
    burst.blockCount = 1;
    burst.blockLen = static_cast<uint32_t>(currentNum * sizeof(T));
    burst.srcStride = 0;
    burst.dstStride = 0;

    LocalTensor<T> tile = inputQue.template AllocTensor<T>();
    AscendC::DataCopyPad(tile, inputGM[progress * ubFactor_], burst, {false, 0, 0, 0});
    inputQue.EnQue(tile);
}
/**
 * \brief Dequeues one result tile, copies it to GM, and releases the tensor.
 * \param progress    Tile index; the GM write starts at element progress * ubFactor_.
 * \param currentNum  Number of elements in this tile.
 */
template <typename T>
__aicore__ inline void Acosh<T>::CopyOut(int64_t progress, int64_t currentNum)
{
    // One block whose length is given in bytes, with zero source/destination stride.
    AscendC::DataCopyExtParams burst;
    burst.blockCount = 1;
    burst.blockLen = static_cast<uint32_t>(currentNum * sizeof(T));
    burst.srcStride = 0;
    burst.dstStride = 0;

    LocalTensor<T> tile = outputQue.template DeQue<T>();
    AscendC::DataCopyPad(outputGM[progress * ubFactor_], tile, burst);
    outputQue.FreeTensor(tile);
}
/**
 * \brief Computes acosh for one tile taken from the input queue and enqueues the result.
 * \param currentNum  Number of elements in this tile.
 *
 * For T == float the pipeline reads the input tile and writes the output tile directly. For
 * any other T the tile is cast to float in fp32WorkBuf, the pipeline runs in place on that
 * buffer, and the result is cast back to T.
 */
template <typename T>
__aicore__ inline void Acosh<T>::Compute(int64_t currentNum)
{
    LocalTensor<T> src = inputQue.template DeQue<T>();
    LocalTensor<T> dst = outputQue.template AllocTensor<T>();

    if constexpr (!std::is_same_v<T, float>) {
        LocalTensor<float> staged = fp32WorkBuf.Get<float>();
        AscendC::Cast(staged, src, AscendC::RoundMode::CAST_NONE, currentNum);
        // Same tensor is passed as input and output: the pipeline runs in place.
        ComputeFp32Pipeline(staged, staged, currentNum);
        AscendC::Cast(dst, staged, AscendC::RoundMode::CAST_RINT, currentNum);
    } else {
        ComputeFp32Pipeline(src, dst, currentNum);
    }

    outputQue.template EnQue<T>(dst);
    inputQue.FreeTensor(src);
}
// Element-wise acosh over `count` float elements.
//
// With t = x - 1, r = t + sqrt(t*t + 2t) and u = 1 + r, the identity
// acosh(x) = ln(x + sqrt(x*x - 1)) = ln(1 + r) holds. The code evaluates
// ln(u) * r / (u - 1), which has the form of the usual compensated log1p(r), and then
// takes the element-wise minimum with ln(x) + ln(2) = ln(2x).
//
// xFp32: input elements. It is read only by the first Log and the first Adds below.
// yFp32: output elements. It is also used as scratch from the first Add onwards.
// Compute() passes the same tensor for both on its non-float path, so every read of xFp32
// must stay ahead of the first write to yFp32.
// count: number of elements to process (narrowed to uint32_t for the vector calls).
template <typename T>
__aicore__ inline void Acosh<T>::ComputeFp32Pipeline(
LocalTensor<float>& xFp32,
LocalTensor<float>& yFp32,
int64_t count)
{
LocalTensor<float> dataT = dataTBuf.Get<float>();
LocalTensor<float> dataR = dataRBuf.Get<float>();
LocalTensor<float> logTmp = logTmpBuf.Get<float>();
uint32_t n = static_cast<uint32_t>(count);
// logTmp = ln(x) + ln(2) = ln(2x); kept untouched until the final Min.
AscendC::Log(logTmp, xFp32, n);
AscendC::Adds(logTmp, logTmp, CONST_LN2_ADD, n);
// dataT = t = x - 1 (last read of xFp32).
AscendC::Adds(dataT, xFp32, CONST_NEG_ONE, n);
// yFp32 = 2t (first write to yFp32).
AscendC::Add(yFp32, dataT, dataT, n);
// dataR = t*t, then t*t + 2t, then sqrt(t*t + 2t).
AscendC::Mul(dataR, dataT, dataT, n);
AscendC::Add(dataR, dataR, yFp32, n);
AscendC::Sqrt(dataR, dataR, n);
// dataR = r = t + sqrt(t*t + 2t).
AscendC::Add(dataR, dataT, dataR, n);
// yFp32 = u = 1 + r.
AscendC::Adds(yFp32, dataR, CONST_ONE, n);
// dataT = ln(u), then ln(u) * r.
AscendC::Log(dataT, yFp32, n);
AscendC::Mul(dataT, dataT, dataR, n);
// dataR = u - 1, clamped into [CONST_S_MIN, CONST_S_MAX] before it is used as a divisor.
// NOTE(review): presumably the clamp avoids 0/0 when u == 1 and inf/inf when u is infinite -- confirm.
AscendC::Adds(dataR, yFp32, CONST_NEG_ONE, n);
AscendC::Maxs(dataR, dataR, CONST_S_MIN, n);
AscendC::Mins(dataR, dataR, CONST_S_MAX, n);
// dataT = ln(u) * r / (u - 1).
AscendC::Div(dataT, dataT, dataR, n);
// Result = min(ln(u) * r / (u - 1), ln(2x)).
// NOTE(review): ln(2x) looks like the large-x fallback for when t*t overflows -- confirm with the design doc.
AscendC::Min(yFp32, dataT, logTmp, n);
}
}
#endif