* Copyright (c) 2026 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file cosh.h
* \brief Cosh 算子 kernel 类定义(标准 Ascend C kernel,arch35 / Ascend950PR,DESIGN §3.4)
*
* 数学契约: y = cosh(x) = (e^x + e^(-x)) / 2 (spec math_semantics.formula: y = np.cosh(x))
*
* NPU 实现采用 exp 分解数值流程(REQUIREMENTS §4.2,与数学契约等价,全 fp32 计算):
* Step1: ax = abs(x) // cosh 为偶函数,取绝对值降低大值溢出风险
* Step2: x1 = ax * 0.5 // x/2
* Step3: x2 = ax * (-1.5) // -1.5x
* Step4: e1 = exp(x1) // exp(x/2)
* Step5: e2 = exp(x2) // exp(-1.5x)
* Step6: x3 = e1 + e2 // exp(x/2) + exp(-1.5x)
* Step7: x4 = x3 * 0.5 // 0.5 * (...)
* Step8: y = x4 * e1 // 0.5*(exp(x/2)+exp(-1.5x))*exp(x/2) = cosh(x)
*
* 模板参数 T:输入/输出 dtype。
* - half / bfloat16_t:WithCast 路径,入口 Cast(CAST_NONE) 升 fp32、出口 Cast(CAST_RINT) 还原。
* - float:WithoutCast 路径,fp32 直通,省去两次 Cast。
* 分支由 needCast = is_same_v<T,half> || is_same_v<T,bfloat16_t> 编译期裁剪。
*
* 三级流水(CopyIn -> Compute -> CopyOut)+ double buffer(BUFFER_NUM=2)。
* 8 步全 fp32,使用 3 份独立 fp32 VECCALC 缓冲(ax / e1 / work),无原位别名。
* DataCopyPad 处理非 32B 对齐尾块。
*/
#ifndef OPS_COSH_OP_KERNEL_ARCH35_COSH_H_
#define OPS_COSH_OP_KERNEL_ARCH35_COSH_H_
#include "kernel_operator.h"
#include "kernel_tiling/kernel_tiling.h"
#include "cosh_tiling_data.h"
#include "cosh_tiling_key.h"
namespace NsCosh {
using namespace AscendC;
template <typename T>
class KernelCosh {
static constexpr bool needCast = std::is_same_v<T, half> || std::is_same_v<T, bfloat16_t>;
static constexpr int32_t BUFFER_NUM = 2;
public:
__aicore__ inline KernelCosh() {}
__aicore__ inline void Init(GM_ADDR x, GM_ADDR y, const CoshTilingData* tilingData);
__aicore__ inline void Process();
private:
__aicore__ inline void CopyIn(int64_t progress, int64_t currentNum);
__aicore__ inline void Compute(int64_t currentNum);
__aicore__ inline void CopyOut(int64_t progress, int64_t currentNum);
private:
TPipe pipe_;
TQue<QuePosition::VECIN, BUFFER_NUM> inQueue_;
TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue_;
TBuf<QuePosition::VECCALC> axBuf_;
TBuf<QuePosition::VECCALC> e1Buf_;
TBuf<QuePosition::VECCALC> workBuf_;
GlobalTensor<T> xGm_;
GlobalTensor<T> yGm_;
int64_t blockLength_ = 0;
int64_t ubFactor_ = 0;
};
template <typename T>
__aicore__ inline void KernelCosh<T>::Init(GM_ADDR x, GM_ADDR y, const CoshTilingData* tilingData)
{
int64_t remainderLength = tilingData->totalNum - tilingData->blockFactor * AscendC::GetBlockIdx();
blockLength_ = (remainderLength > tilingData->blockFactor) ? tilingData->blockFactor : remainderLength;
ubFactor_ = tilingData->ubFactor;
xGm_.SetGlobalBuffer((__gm__ T*)x + tilingData->blockFactor * AscendC::GetBlockIdx(), blockLength_);
yGm_.SetGlobalBuffer((__gm__ T*)y + tilingData->blockFactor * AscendC::GetBlockIdx(), blockLength_);
pipe_.InitBuffer(inQueue_, BUFFER_NUM, ubFactor_ * sizeof(T));
pipe_.InitBuffer(outQueue_, BUFFER_NUM, ubFactor_ * sizeof(T));
pipe_.InitBuffer(axBuf_, ubFactor_ * sizeof(float));
pipe_.InitBuffer(e1Buf_, ubFactor_ * sizeof(float));
pipe_.InitBuffer(workBuf_, ubFactor_ * sizeof(float));
}
template <typename T>
__aicore__ inline void KernelCosh<T>::CopyIn(int64_t progress, int64_t currentNum)
{
AscendC::LocalTensor<T> xLocal = inQueue_.template AllocTensor<T>();
AscendC::DataCopyExtParams copyParams;
copyParams.blockCount = 1;
copyParams.blockLen = static_cast<uint32_t>(currentNum * sizeof(T));
copyParams.srcStride = 0;
copyParams.dstStride = 0;
copyParams.rsv = 0;
AscendC::DataCopyPadExtParams<T> padParams{false, 0, 0, 0};
AscendC::DataCopyPad(xLocal, xGm_[progress * ubFactor_], copyParams, padParams);
inQueue_.EnQue(xLocal);
}
template <typename T>
__aicore__ inline void KernelCosh<T>::Compute(int64_t currentNum)
{
AscendC::LocalTensor<T> xLocal = inQueue_.template DeQue<T>();
AscendC::LocalTensor<T> yLocal = outQueue_.template AllocTensor<T>();
AscendC::LocalTensor<float> ax = axBuf_.Get<float>();
AscendC::LocalTensor<float> e1 = e1Buf_.Get<float>();
AscendC::LocalTensor<float> work = workBuf_.Get<float>();
const int32_t cnt = static_cast<int32_t>(currentNum);
if constexpr (needCast) {
AscendC::Cast(ax, xLocal, AscendC::RoundMode::CAST_NONE, cnt);
AscendC::Abs(ax, ax, cnt);
} else {
AscendC::Abs(ax, xLocal, cnt);
}
AscendC::Muls(e1, ax, static_cast<float>(0.5), cnt);
AscendC::Muls(work, ax, static_cast<float>(-1.5), cnt);
AscendC::Exp(e1, e1, cnt);
AscendC::Exp(work, work, cnt);
AscendC::Add(work, e1, work, cnt);
AscendC::Muls(work, work, static_cast<float>(0.5), cnt);
AscendC::Mul(work, work, e1, cnt);
if constexpr (needCast) {
AscendC::Cast(yLocal, work, AscendC::RoundMode::CAST_RINT, cnt);
} else {
AscendC::Adds(yLocal, work, static_cast<float>(0.0f), cnt);
}
outQueue_.template EnQue<T>(yLocal);
inQueue_.FreeTensor(xLocal);
}
template <typename T>
__aicore__ inline void KernelCosh<T>::CopyOut(int64_t progress, int64_t currentNum)
{
AscendC::LocalTensor<T> yLocal = outQueue_.template DeQue<T>();
AscendC::DataCopyExtParams copyParams;
copyParams.blockCount = 1;
copyParams.blockLen = static_cast<uint32_t>(currentNum * sizeof(T));
copyParams.srcStride = 0;
copyParams.dstStride = 0;
copyParams.rsv = 0;
AscendC::DataCopyPad(yGm_[progress * ubFactor_], yLocal, copyParams);
outQueue_.FreeTensor(yLocal);
}
template <typename T>
__aicore__ inline void KernelCosh<T>::Process()
{
int64_t loopCount = (blockLength_ + ubFactor_ - 1) / ubFactor_;
for (int64_t i = 0; i < loopCount; i++) {
int64_t currentNum = (i == (loopCount - 1)) ? (blockLength_ - ubFactor_ * i) : ubFactor_;
CopyIn(i, currentNum);
Compute(currentNum);
CopyOut(i, currentNum);
}
}
}
#endif