* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file pow_f32_nddma_with_loops.h
* \brief
*/
#ifndef ASCENDC_POW_F32_NDDMA_WITH_LOOPS_H_
#define ASCENDC_POW_F32_NDDMA_WITH_LOOPS_H_
#include "kernel_operator.h"
#include "atvoss/util/broadcast_utils.h"
namespace Pow
{
using namespace Ops::Base;
using AscendC::GlobalTensor;
using AscendC::LocalTensor;
using AscendC::TBuf;
using AscendC::TPipe;
using AscendC::TQue;
class PowF32NddmaWithLoops
{
public:
__aicore__ inline PowF32NddmaWithLoops(){};
__aicore__ inline void Init(GM_ADDR inputX, GM_ADDR inputY, GM_ADDR outputZ, GM_ADDR workspace,
const PowTensorTensorTilingData* tilingDataPtr, TPipe* pipePtr)
{
pipePtr_ = pipePtr;
tilingDataPtr_ = tilingDataPtr;
inputGmInputX_.SetGlobalBuffer((__gm__ float*)inputX);
inputGmInputY_.SetGlobalBuffer((__gm__ float*)inputY);
outputGmOutputZ_.SetGlobalBuffer((__gm__ float*)outputZ);
constexpr int64_t DOUBLE_BUFFER = 2;
int64_t BUFFER_SIZE_0 = tilingDataPtr_->elemNum * sizeof(float);
pipePtr_->InitBuffer(queIn0_, DOUBLE_BUFFER, BUFFER_SIZE_0);
pipePtr_->InitBuffer(queIn1_, DOUBLE_BUFFER, BUFFER_SIZE_0);
pipePtr_->InitBuffer(queOut0_, DOUBLE_BUFFER, BUFFER_SIZE_0);
}
__aicore__ inline void Process()
{
int64_t ubLoopNum = AscendC::GetBlockIdx() == AscendC::GetBlockNum() - 1 ? tilingDataPtr_->blockTail
: tilingDataPtr_->blockFormer;
int64_t axesIndices[BROADCAST_MAX_DIMS] = {0};
BroadcastGetAxesIndices(axesIndices, tilingDataPtr_->blockFormer * AscendC::GetBlockIdx(),
tilingDataPtr_->outputDims, tilingDataPtr_->ubSplitAxis,
tilingDataPtr_->dimProductBeforeUbInner);
for (int64_t ubLoopIdx = 0; ubLoopIdx < ubLoopNum; ubLoopIdx += 1) {
if (ubLoopIdx != 0) {
BroadcastUpdateAxesIndices(axesIndices, tilingDataPtr_->outputDims, tilingDataPtr_->ubSplitAxis,
tilingDataPtr_->ubOuter);
}
int64_t ubSplitSize = axesIndices[tilingDataPtr_->ubSplitAxis] == tilingDataPtr_->ubOuter - 1
? tilingDataPtr_->ubTail
: tilingDataPtr_->ubFormer;
CopyIn0(ubSplitSize, axesIndices, ubLoopIdx);
CopyIn1(ubSplitSize, axesIndices, ubLoopIdx);
Compute2(ubSplitSize, axesIndices, ubLoopIdx);
CopyOut3(ubSplitSize, axesIndices, ubLoopIdx);
}
}
private:
__aicore__ inline void CopyIn0(int64_t ubSplitSize, const int64_t (&axesIndices)[BROADCAST_MAX_DIMS],
int64_t ubLoopIdx)
{
bufferIn0_ = queIn0_.AllocTensor<float>();
if ((tilingDataPtr_->input0Strides[tilingDataPtr_->ubSplitAxis] != 0) ||
(ubLoopIdx <= 1 ||
(AscendC::GetBlockIdx() * tilingDataPtr_->blockFormer + ubLoopIdx) % tilingDataPtr_->ubOuter <= 1)) {
BroadcastNddmaWithLoop(inputGmInputX_, bufferIn0_, tilingDataPtr_->outputDims,
tilingDataPtr_->outputStrides, tilingDataPtr_->input0Strides, axesIndices,
tilingDataPtr_->ubSplitAxis, tilingDataPtr_->shapeLen, ubSplitSize,
tilingDataPtr_->ubFormer);
}
queIn0_.EnQue<float>(bufferIn0_);
}
__aicore__ inline void CopyIn1(int64_t ubSplitSize, const int64_t (&axesIndices)[BROADCAST_MAX_DIMS],
int64_t ubLoopIdx)
{
bufferIn1_ = queIn1_.AllocTensor<float>();
if ((tilingDataPtr_->input1Strides[tilingDataPtr_->ubSplitAxis] != 0) ||
(ubLoopIdx <= 1 ||
(AscendC::GetBlockIdx() * tilingDataPtr_->blockFormer + ubLoopIdx) % tilingDataPtr_->ubOuter <= 1)) {
BroadcastNddmaWithLoop(inputGmInputY_, bufferIn1_, tilingDataPtr_->outputDims,
tilingDataPtr_->outputStrides, tilingDataPtr_->input1Strides, axesIndices,
tilingDataPtr_->ubSplitAxis, tilingDataPtr_->shapeLen, ubSplitSize,
tilingDataPtr_->ubFormer);
}
queIn1_.EnQue<float>(bufferIn1_);
}
__aicore__ inline void Compute2(int64_t ubSplitSize, const int64_t (&axesIndices)[BROADCAST_MAX_DIMS],
int64_t ubLoopIdx)
{
bufferIn0_ = queIn0_.DeQue<float>();
bufferIn1_ = queIn1_.DeQue<float>();
bufferOut0_ = queOut0_.AllocTensor<float>();
Power<float, false, pConfig_>(bufferOut0_, bufferIn0_, bufferIn1_,
ubSplitSize * tilingDataPtr_->outputStrides[tilingDataPtr_->ubSplitAxis]);
queIn0_.FreeTensor(bufferIn0_);
queIn1_.FreeTensor(bufferIn1_);
queOut0_.EnQue<float>(bufferOut0_);
}
__aicore__ inline void CopyOut3(int64_t ubSplitSize, const int64_t (&axesIndices)[BROADCAST_MAX_DIMS],
int64_t ubLoopIdx)
{
bufferOut0_ = queOut0_.DeQue<float>();
AscendC::DataCopyExtParams dataCopyExtParams;
dataCopyExtParams.blockCount = 1;
dataCopyExtParams.blockLen =
ubSplitSize * tilingDataPtr_->outputStrides[tilingDataPtr_->ubSplitAxis] * sizeof(float);
int64_t gmOffset = BroadcastGetGmOffset(axesIndices, tilingDataPtr_->outputStrides, tilingDataPtr_->ubSplitAxis,
tilingDataPtr_->ubFormer);
AscendC::DataCopyPad(outputGmOutputZ_[gmOffset], bufferOut0_[0], dataCopyExtParams);
queOut0_.FreeTensor(bufferOut0_);
}
private:
TPipe* pipePtr_;
const PowTensorTensorTilingData* tilingDataPtr_;
GlobalTensor<float> inputGmInputX_;
GlobalTensor<float> inputGmInputY_;
GlobalTensor<float> outputGmOutputZ_;
TQue<AscendC::QuePosition::VECIN, 1> queIn0_;
TQue<AscendC::QuePosition::VECIN, 1> queIn1_;
TQue<AscendC::QuePosition::VECOUT, 1> queOut0_;
LocalTensor<float> bufferIn0_;
LocalTensor<float> bufferIn1_;
LocalTensor<float> bufferOut0_;
constexpr static PowerConfig pConfig_ = {PowerAlgo::DOUBLE_FLOAT_TECH};
};
}
#endif