/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
/*!
* \file exp_common_impl.h
* \brief Shared building blocks of the Taylor-expansion based Exp implementation.
*/
#if !defined(__ASCENDC_INCLUDE_INTERNAL_HEADERS__)
#pragma message( \
"impl/adv_api/detail/math/exp/exp_common_impl.h is an internal header file and must not be used directly. Functions or variables defined in this file may be removed in the future. Please use \"#include \"adv_api/math/exp.h\"\" and use public functions or variables defined in interface headers files.")
#define __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#define __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_MATH_EXP_EXP_COMMON_IMPL_H__
#endif
#ifndef IMPL_MATH_EXP_EXP_COMMON_IMPL_H
#define IMPL_MATH_EXP_EXP_COMMON_IMPL_H
#include "kernel_basic_intf.h"
#include "kernel_tensor.h"
#include "kernel_pop_stack_buffer.h"
#include "../../common/check.h"
#ifdef ASCENDC_CPU_DEBUG
#include "../../api_check/kernel_check/math/exp/exp_check.h"
#endif
#include "../../api_check/kernel_api_check.h"
namespace AscendC {
namespace ExpAPI {
// Repeat stride assigned to the half-typed operand of the half <-> float Cast calls in this file.
constexpr uint8_t HALF_REPEAT_STRIDE = 4;
// Number of equally sized regions the temporary buffer is divided into by UpdateExpParams:
// float input with isReuseSource == true.
constexpr uint32_t EXP_TWO = 2;
// float input with isReuseSource == false.
constexpr uint32_t EXP_THREE = 3;
// any non-float input (half).
constexpr uint32_t EXP_FOUR = 4;
/*!
 * \brief Divide the temporary buffer into equally sized regions and fill in the tiling fields of params.
 *
 * The buffer is split into 2 regions (float, source reused), 3 regions (float, source not reused) or
 * 4 regions (any other T). The size of one region is rounded down to a multiple of
 * ONE_BLK_SIZE / sizeof(T) elements.
 *
 * \tparam T             element type of the user tensors
 * \tparam isReuseSource whether the source tensor is used as the first working tensor (float only)
 * \tparam expandLevel   value stored into params.expandLevel
 * \param [in]  src         source tensor; not read here, kept for interface compatibility
 * \param [in]  calCount    number of elements to process
 * \param [in]  stackBuffer temporary buffer viewed as float
 * \param [out] params      receives region offsets, region size, loop count and tail information
 */
template <typename T, bool isReuseSource = false, uint8_t expandLevel = 10>
__aicore__ inline void UpdateExpParams(
    const LocalTensor<T>& src, const uint32_t calCount, const LocalTensor<float>& stackBuffer, ExpParams<float>& params)
{
    (void)src;
    constexpr bool isFloat = IsSameType<T, float>::value;
    constexpr uint32_t numberOfTmpBuf = isFloat ? (isReuseSource ? EXP_TWO : EXP_THREE) : EXP_FOUR;
    const uint32_t alignNum = ONE_BLK_SIZE / sizeof(T);

    const uint32_t stackBufferSize = stackBuffer.GetSize();
    // Size of one region, rounded down to a whole number of aligned blocks.
    const uint32_t oneTmpSize = stackBufferSize / numberOfTmpBuf / alignNum * alignNum;
    CheckTmpBufferSize(oneTmpSize, 0, stackBufferSize);

    // For float with source reuse the first region is the source itself, so the second region starts at 0.
    const uint32_t secondOffset = (isFloat && isReuseSource) ? 0 : oneTmpSize;
    // For float the result lives in dst, so the third and fourth positions coincide.
    const uint32_t fourthOffset = isFloat ? 0 : oneTmpSize;

    params.inputSize = calCount;
    params.oneTmpSize = oneTmpSize;
    params.firstTmpStartPos = 0;
    params.secondTmpStartPos = secondOffset;
    params.thirdTmpStartPos = params.secondTmpStartPos + oneTmpSize;
    params.fourthTmpStartPos = params.thirdTmpStartPos + fourthOffset;
    params.curDataLength = oneTmpSize;
    params.expandLevel = expandLevel;

    if (oneTmpSize == 0) {
        // The buffer cannot hold even one aligned block per region. CheckTmpBufferSize above is expected
        // to report this (NOTE(review): confirm it is active in release builds); never divide by zero here.
        params.loopNum = 0;
        params.tailSize = 0;
        params.tailPos = calCount;
        return;
    }
    params.loopNum = calCount / oneTmpSize;
    params.tailSize = calCount % oneTmpSize;
    params.tailPos = calCount - params.tailSize;
}
/*!
 * \brief half overload: compute the tiling parameters and bind all four working tensors to
 *        regions of the temporary buffer.
 *
 * \param [in]  src         source tensor (forwarded to UpdateExpParams)
 * \param [in]  dst         destination tensor (not used by this overload)
 * \param [in]  calCount    number of elements to process
 * \param [in]  stackBuffer temporary buffer viewed as float
 * \param [out] params      receives tiling fields and the four working tensors
 */
template <bool isReuseSource = false, uint8_t expandLevel = 10>
__aicore__ inline void GetExpTensorInfo(
    const LocalTensor<half>& src, const LocalTensor<half>& dst, const uint32_t calCount,
    const LocalTensor<float>& stackBuffer, ExpParams<float>& params)
{
    UpdateExpParams<half, isReuseSource, expandLevel>(src, calCount, stackBuffer, params);

    // Each working tensor is a view into the temporary buffer at its own start position.
    const auto floorXPos = params.firstTmpStartPos;
    const auto floorXPowPos = params.secondTmpStartPos;
    const auto resPos = params.thirdTmpStartPos;
    const auto intPartPos = params.fourthTmpStartPos;

    params.tempTensorIntPart = stackBuffer[intPartPos];
    params.tempTensorRes = stackBuffer[resPos];
    params.tempTensorFloorXPow = stackBuffer[floorXPowPos];
    params.tempTensorFloorX = stackBuffer[floorXPos];
}
/*!
 * \brief float overload: compute the tiling parameters and bind the working tensors.
 *
 * The result tensor is dst itself. The first working tensor is src when isReuseSource is set,
 * otherwise a region of the temporary buffer.
 *
 * \param [in]  src         source tensor
 * \param [in]  dst         destination tensor, used directly as the result tensor
 * \param [in]  calCount    number of elements to process
 * \param [in]  stackBuffer temporary buffer viewed as float
 * \param [out] params      receives tiling fields and the working tensors
 */
template <bool isReuseSource = false, uint8_t expandLevel = 10>
__aicore__ inline void GetExpTensorInfo(
    const LocalTensor<float>& src, const LocalTensor<float>& dst, const uint32_t calCount,
    const LocalTensor<float>& stackBuffer, ExpParams<float>& params)
{
    UpdateExpParams<float, isReuseSource, expandLevel>(src, calCount, stackBuffer, params);

    params.tempTensorRes = dst;
    params.tempTensorIntPart = stackBuffer[params.fourthTmpStartPos];
    params.tempTensorFloorXPow = stackBuffer[params.secondTmpStartPos];
    if constexpr (!isReuseSource) {
        params.tempTensorFloorX = stackBuffer[params.firstTmpStartPos];
    } else {
        params.tempTensorFloorX = src;
    }
}
/*!
 * \brief Load maskLength input elements into params.tempTensorFloorX as float.
 *
 * half input is converted with Cast; any other input is copied with Adds(..., 0.0).
 *
 * \param [in] src        source tensor, already offset to the current chunk
 * \param [in] params     provides tempTensorFloorX
 * \param [in] maskLength element count written to the counter-mode vector mask
 */
template <typename T>
__aicore__ inline void GetExpInputInTmp(const LocalTensor<T>& src, const ExpParams<float>& params, uint32_t maskLength)
{
    SetVectorMask<float, MaskMode::COUNTER>(0, maskLength);
    UnaryRepeatParams repeatCfg;
    if constexpr (!IsSameType<T, half>::value) {
        Adds<float, false>(params.tempTensorFloorX, src, 0.0, MASK_PLACEHOLDER, 1, repeatCfg);
    } else {
        // The half source uses its own repeat stride.
        repeatCfg.srcRepStride = HALF_REPEAT_STRIDE;
        Cast<float, half, false>(params.tempTensorFloorX, src, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, repeatCfg);
    }
    PipeBarrier<PIPE_V>();
}
/*!
 * \brief Split the value in tempTensorFloorX into floor part and remainder, then exponentiate the floor part.
 *
 * On return tempTensorFloorX holds x - floor(x) and tempTensorIntPart holds Exp(floor(x)).
 *
 * \param [in] params     provides tempTensorFloorX and tempTensorIntPart
 * \param [in] maskLength element count written to the counter-mode vector mask
 */
__aicore__ inline void GetExpFloorInput(const ExpParams<float>& params, uint32_t maskLength)
{
UnaryRepeatParams unaryParams;
BinaryRepeatParams binaryParams;
SetVectorMask<float, MaskMode::COUNTER>(0, maskLength);
// NOTE(review): when __NPU_ARCH__ is neither 2201 nor 2002 no floor instruction is emitted and
// tempTensorIntPart is consumed below as-is — confirm this header is only built for those two targets.
#if defined(__NPU_ARCH__) && __NPU_ARCH__ == 2201
// float -> float cast with floor rounding.
Cast<float, float, false>(
params.tempTensorIntPart, params.tempTensorFloorX, RoundMode::CAST_FLOOR, MASK_PLACEHOLDER, 1, unaryParams);
#elif defined(__NPU_ARCH__) && __NPU_ARCH__ == 2002
// Floor through an int32 view of the same buffer, then convert back to float in place.
Cast<int32_t, float, false>(
params.tempTensorIntPart.ReinterpretCast<int32_t>(), params.tempTensorFloorX, RoundMode::CAST_FLOOR,
MASK_PLACEHOLDER, 1, {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE});
PipeBarrier<PIPE_V>();
Cast<float, int32_t, false>(
params.tempTensorIntPart, params.tempTensorIntPart.ReinterpretCast<int32_t>(), RoundMode::CAST_NONE,
MASK_PLACEHOLDER, 1, {1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE});
#endif
PipeBarrier<PIPE_V>();
// tempTensorFloorX = x - floor(x)
Sub<float, false>(
params.tempTensorFloorX, params.tempTensorFloorX, params.tempTensorIntPart, MASK_PLACEHOLDER, 1, binaryParams);
PipeBarrier<PIPE_V>();
// tempTensorIntPart = Exp(floor(x)), computed in place.
Exp<float, false>(params.tempTensorIntPart, params.tempTensorIntPart, MASK_PLACEHOLDER, 1, unaryParams);
PipeBarrier<PIPE_V>();
}
/*!
 * \brief Evaluate the truncated Taylor series on tempTensorFloorX and scale it by tempTensorIntPart.
 *
 * With x = tempTensorFloorX, the function computes
 *     tempTensorRes[offset] = (1 + x + x^2/2! + ... + x^n/n!) * tempTensorIntPart,  n = params.expandLevel.
 * tempTensorFloorXPow is used as scratch for the running term x^k/k!.
 *
 * \param [in] params     working tensors and the expansion level
 * \param [in] maskLength element count written to the counter-mode vector mask
 * \param [in] offset     element offset into tempTensorRes at which the result is written
 */
__aicore__ inline void ExpHighPrecisionExec(const ExpParams<float>& params, uint32_t maskLength, uint32_t offset)
{
    const UnaryRepeatParams unaryParams;
    const BinaryRepeatParams binaryParams;
    SetVectorMask<float, MaskMode::COUNTER>(0, maskLength);

    // Running term starts as x^1/1!.
    Adds<float, false>(params.tempTensorFloorXPow, params.tempTensorFloorX, 0.0, MASK_PLACEHOLDER, 1, unaryParams);
    PipeBarrier<PIPE_V>();
    // Accumulator starts as x + 1. A single Adds replaces the former copy (x + 0.0) followed by "+ 1";
    // the value is identical and one vector instruction plus one barrier are saved.
    Adds<float, false>(params.tempTensorRes[offset], params.tempTensorFloorX, 1.0f, MASK_PLACEHOLDER, 1, unaryParams);
    PipeBarrier<PIPE_V>();

    for (int32_t i = 2; i < params.expandLevel + 1; i++) {
        // term = term * x / i  ->  x^i / i!
        Mul<float, false>(
            params.tempTensorFloorXPow, params.tempTensorFloorX, params.tempTensorFloorXPow, MASK_PLACEHOLDER, 1,
            binaryParams);
        PipeBarrier<PIPE_V>();
        Muls<float, false>(
            params.tempTensorFloorXPow, params.tempTensorFloorXPow, static_cast<float>(1.0) / static_cast<float>(i),
            MASK_PLACEHOLDER, 1, unaryParams);
        PipeBarrier<PIPE_V>();
        // res += term
        Add<float, false>(
            params.tempTensorRes[offset], params.tempTensorRes[offset], params.tempTensorFloorXPow, MASK_PLACEHOLDER, 1,
            binaryParams);
        PipeBarrier<PIPE_V>();
    }

    // Scale the series by the value prepared in tempTensorIntPart.
    Mul<float, false>(
        params.tempTensorRes[offset], params.tempTensorRes[offset], params.tempTensorIntPart, MASK_PLACEHOLDER, 1,
        binaryParams);
    PipeBarrier<PIPE_V>();
}
/*!
 * \brief Convert the float result in params.tempTensorRes to half and store it in dst.
 *
 * \param [out] dst        half destination, already offset to the current chunk
 * \param [in]  params     provides tempTensorRes
 * \param [in]  maskLength element count written to the counter-mode vector mask
 */
__aicore__ inline void GetExpCastedResult(
    const LocalTensor<half>& dst, const ExpParams<float>& params, uint32_t maskLength)
{
    SetVectorMask<float, MaskMode::COUNTER>(0, maskLength);

    // The half destination uses its own repeat stride.
    UnaryRepeatParams castCfg;
    castCfg.dstRepStride = HALF_REPEAT_STRIDE;
    Cast<half, float, false>(dst, params.tempTensorRes, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, castCfg);
    PipeBarrier<PIPE_V>();
}
/*!
 * \brief Process one chunk: load the input, split it, evaluate the series and store the result.
 *
 * For half the series is written at offset 0 of tempTensorRes and then cast into dst[offset];
 * for other types it is written at tempTensorRes[offset] with no further step.
 *
 * \param [in] src        source tensor
 * \param [in] dst        destination tensor
 * \param [in] params     tiling parameters and working tensors
 * \param [in] offset     element offset of the chunk within src/dst
 * \param [in] maskLength number of elements in the chunk
 */
template <typename T>
__aicore__ inline void ExpHighPrecisionND(
    const LocalTensor<T>& src, const LocalTensor<T>& dst, const ExpParams<float>& params, uint32_t offset,
    uint32_t maskLength)
{
    constexpr bool isHalf = IsSameType<T, half>::value;
    const uint32_t resOffset = isHalf ? 0 : offset;

    GetExpInputInTmp(src[offset], params, maskLength);
    GetExpFloorInput(params, maskLength);
    ExpHighPrecisionExec(params, maskLength, resOffset);
    if constexpr (isHalf) {
        GetExpCastedResult(dst[offset], params, maskLength);
    }
}
/*!
 * \brief Run the computation over the whole input: params.loopNum full chunks followed by an optional tail.
 *
 * Switches to counter mask mode via SetMaskCount() before processing and does not restore it.
 *
 * \param [in] src    source tensor
 * \param [in] dst    destination tensor
 * \param [in] params tiling parameters and working tensors
 */
template <typename T>
__aicore__ inline void ExpND(const LocalTensor<T>& src, const LocalTensor<T>& dst, const ExpParams<float>& params)
{
    SetMaskCount();

    uint32_t chunkStart = 0;
    for (uint32_t done = 0; done < params.loopNum; ++done, chunkStart += params.oneTmpSize) {
        ExpHighPrecisionND(src, dst, params, chunkStart, params.curDataLength);
    }

    // Remaining elements that do not fill a whole chunk.
    if (params.tailSize > 0) {
        ExpHighPrecisionND(src, dst, params, chunkStart, params.tailSize);
    }
}
/*!
 * \brief Exp entry point using a caller-provided temporary buffer.
 *
 * When taylorExpandLevel is 0 the call is forwarded to Exp<T>(dstLocal, srcLocal, calCount) and the
 * temporary buffer is not touched. Otherwise the Taylor-expansion path is executed; it runs in
 * counter mask mode and restores normal mask mode and the default mask before returning.
 *
 * \tparam T                 element type of the tensors
 * \tparam taylorExpandLevel number of Taylor terms; 0 selects the plain Exp call
 * \tparam isReuseSource     whether the source tensor may be used as working space
 * \param [out] dstLocal        destination tensor
 * \param [in]  srcLocal        source tensor
 * \param [in]  sharedTmpBuffer temporary buffer, reinterpreted as float for the working tensors
 * \param [in]  calCount        number of elements to process
 */
template <typename T, uint8_t taylorExpandLevel, bool isReuseSource>
__aicore__ inline void ExpImpl(
    const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal, const LocalTensor<uint8_t>& sharedTmpBuffer,
    const uint32_t calCount)
{
    CHECK_FUNC_HIGHLEVEL_API(
        Exp, (T, taylorExpandLevel, isReuseSource), (dstLocal, srcLocal, sharedTmpBuffer, calCount));
    // taylorExpandLevel is a template constant: decide at compile time so that the Taylor path is not
    // instantiated at all for level 0.
    if constexpr (taylorExpandLevel == 0) {
        Exp<T>(dstLocal, srcLocal, calCount);
    } else {
        const uint32_t bufferSize = sharedTmpBuffer.GetSize();
        CheckTmpBufferSize(bufferSize, 0, bufferSize);
        LocalTensor<float> stackBuffer = sharedTmpBuffer.ReinterpretCast<float>();
        ExpParams<float> expParams;
        ExpAPI::GetExpTensorInfo<isReuseSource, taylorExpandLevel>(
            srcLocal, dstLocal, calCount, stackBuffer, expParams);
        ExpAPI::ExpND<T>(srcLocal, dstLocal, expParams);
        // ExpND switched to counter mask mode; restore the defaults.
        SetMaskNorm();
        ResetMask();
    }
}
/*!
 * \brief Exp entry point without a caller-provided buffer: obtains the temporary buffer through
 *        PopStackBuffer and forwards to the overload that takes sharedTmpBuffer.
 *
 * \param [out] dstLocal destination tensor
 * \param [in]  srcLocal source tensor
 * \param [in]  calCount number of elements to process
 */
template <typename T, uint8_t taylorExpandLevel, bool isReuseSource>
__aicore__ inline void ExpImpl(const LocalTensor<T>& dstLocal, const LocalTensor<T>& srcLocal, const uint32_t calCount)
{
    LocalTensor<uint8_t> tmpBuffer;
    const bool popOk = PopStackBuffer<uint8_t, TPosition::LCM>(tmpBuffer);
    ASCENDC_ASSERT((popOk), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); });

    ExpImpl<T, taylorExpandLevel, isReuseSource>(dstLocal, srcLocal, tmpBuffer, calCount);
}
}
}
#endif
#if defined(__UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_MATH_EXP_EXP_COMMON_IMPL_H__)
#undef __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#undef __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_MATH_EXP_EXP_COMMON_IMPL_H__
#endif