/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
/*!
 * \file im2col_nhwc_normal.h
 * \brief Im2col kernel for NHWC-layout input, tiled along a compile-time selected axis.
 */
#ifndef _IM2COL_NORMAL_NHWC_H_
#define _IM2COL_NORMAL_NHWC_H_
#include "kernel_operator.h"
#include "op_kernel/math_util.h"
#include "im2col_tilingdata.h"
namespace Im2col {
using namespace AscendC;
using namespace Ops::Base;
template <typename T, bool isPadding, uint8_t ubAxis>
class KernelIm2ColNormNhwc {
private:
constexpr static uint32_t BUFFER_NUM = 2;
constexpr static uint32_t UB_BLOCK = Ops::Base::GetUbBlockSize();
constexpr static uint32_t BLK_ELEMS = UB_BLOCK / sizeof(T);
constexpr static uint8_t MAX_DIMS_NUM = 4;
constexpr static uint8_t C_AXIS = 3;
constexpr static uint8_t W_AXIS = 2;
constexpr static uint8_t H_AXIS = 1;
constexpr static uint8_t N_AXIS = 0;
GlobalTensor<T> input_;
GlobalTensor<T> output_;
int64_t outC{0};
int64_t outW{0};
int64_t outH{0};
int64_t inC{0};
int64_t inH{0};
int64_t inW{0};
int64_t inStride_[MAX_DIMS_NUM] = {0, 0, 0, 0};
int64_t outStride_[MAX_DIMS_NUM] = {0, 0, 0, 0};
int64_t inShape_[MAX_DIMS_NUM] = {0, 0, 0, 0};
int64_t outShape_[MAX_DIMS_NUM] = {0, 0, 0, 0};
int64_t inIndex_[MAX_DIMS_NUM] = {0, 0, 0, 0};
int64_t outIndex_[MAX_DIMS_NUM] = {0, 0, 0, 0};
int64_t ubFactor_[MAX_DIMS_NUM] = {0, 0, 0, 0};
int64_t convKernelNumInWidth_{0};
int64_t convKernelNumInHeight_{0};
int64_t hStride_{0};
int64_t wStride_{0};
int64_t hDilation_{0};
int64_t wDilation_{0};
int64_t hPaddingTop_{0};
int64_t wPaddingTop_{0};
int64_t wPaddingBottom_{0};
int64_t hKernelSize_{0};
int64_t wKernelSize_{0};
int64_t wKernelEffSize_{0};
int64_t hKernelEffSize_{0};
int64_t blockIdx_;
int64_t ubRealFactor_{0};
uint32_t alignedCLength_{0};
int32_t outputBufferSize_{0};
int32_t outputTileSize_{0};
bool isWPadding_{false};
TPipe* pipe_ = nullptr;
const Im2ColNHWCTilingData* tilingData_ = nullptr;
TBuf<TPosition::VECCALC> inQueue_;
T constValue_{0};
public:
// Default constructor; all state is set up later by Init().
__aicore__ inline KernelIm2ColNormNhwc() {}
/**
 * Binds the global buffers, reserves the UB buffer and caches every tiling value the kernel uses.
 *
 * @param x          global-memory address of the NHWC input.
 * @param y          global-memory address of the output.
 * @param tilingData tiling parameters; the pointer is stored and read again in Process().
 * @param pipe       pipe used to allocate the UB buffer.
 */
__aicore__ inline void Init(GM_ADDR x, GM_ADDR y, const Im2ColNHWCTilingData* tilingData, TPipe* pipe)
{
    tilingData_ = tilingData;
    pipe_ = pipe;
    blockIdx_ = GetBlockIdx();
    input_.SetGlobalBuffer((__gm__ T*)x);
    output_.SetGlobalBuffer((__gm__ T*)y);

    // One UB buffer, logically split into BUFFER_NUM tiles.
    outputBufferSize_ = tilingData_->outputBufferSize;
    outputTileSize_ = tilingData_->outputBufferSize / BUFFER_NUM;
    pipe_->InitBuffer(inQueue_, outputBufferSize_);

    // Sliding-window geometry.
    const auto& convIn = tilingData_->input;
    convKernelNumInWidth_ = tilingData_->convKernelNumInWidth;
    convKernelNumInHeight_ = tilingData_->convKernelNumInHeight;
    hStride_ = convIn.hStride;
    wStride_ = convIn.wStride;
    hDilation_ = convIn.hDilation;
    wDilation_ = convIn.wDilation;
    hPaddingTop_ = convIn.hPaddingBefore;
    wPaddingTop_ = convIn.wPaddingBefore;
    wPaddingBottom_ = convIn.wPaddingAfter;
    hKernelSize_ = convIn.hKernelSize;
    wKernelSize_ = convIn.wKernelSize;
    wKernelEffSize_ = (wKernelSize_ - 1) * wDilation_ + 1;
    hKernelEffSize_ = (hKernelSize_ - 1) * hDilation_ + 1;

    // Extents: the output keeps N and C, outW counts kernel taps, outH counts window positions.
    inH = convIn.H;
    inW = convIn.W;
    outC = convIn.C;
    inC = outC;
    outW = hKernelSize_ * wKernelSize_;
    outH = convKernelNumInWidth_ * convKernelNumInHeight_;

    outShape_[N_AXIS] = convIn.N;
    inShape_[N_AXIS] = outShape_[N_AXIS];
    outShape_[C_AXIS] = convIn.C;
    inShape_[C_AXIS] = outShape_[C_AXIS];
    inShape_[W_AXIS] = convIn.W;
    inShape_[H_AXIS] = convIn.H;
    outShape_[W_AXIS] = outW;
    outShape_[H_AXIS] = outH;

    ubFactor_[N_AXIS] = tilingData_->ubFactorN;
    ubFactor_[H_AXIS] = tilingData_->ubFactorH;
    ubFactor_[W_AXIS] = tilingData_->ubFactorW;
    ubFactor_[C_AXIS] = tilingData_->ubFactorC;

    // Row-major element strides, accumulated from the innermost axis outwards.
    int64_t inElems = 1;
    int64_t outElems = 1;
    for (int32_t axis = MAX_DIMS_NUM - 1; axis >= 0; --axis) {
        inStride_[axis] = inElems;
        outStride_[axis] = outElems;
        inElems *= inShape_[axis];
        outElems *= outShape_[axis];
    }

    alignedCLength_ = CeilAlign(static_cast<uint32_t>(outShape_[C_AXIS]), BLK_ELEMS);
    isWPadding_ = !(wPaddingTop_ == 0 && wPaddingBottom_ == 0);
}
/**
 * Runs this core's share of work items.
 *
 * The core handles the work items [blockIdx_ * linesPerCore, min((blockIdx_ + 1) * linesPerCore,
 * totalLines)). For each item the output start index is decoded, mapped to the matching input
 * start index, and the tile is moved GM -> UB -> GM through one of the BUFFER_NUM UB tiles.
 */
__aicore__ inline void Process()
{
    const int64_t firstLine = blockIdx_ * tilingData_->linesPerCore;
    if (firstLine >= tilingData_->totalLines) {
        return;
    }
    int64_t lastLine = (blockIdx_ + 1L) * tilingData_->linesPerCore;
    if (!(lastLine < tilingData_->totalLines)) {
        lastLine = tilingData_->totalLines;
    }

    LocalTensor<T> ubBuf = inQueue_.Get<T>();
    if constexpr (isPadding) {
        // Pre-fill the whole buffer so positions that are never copied keep the pad value.
        Duplicate(ubBuf, constValue_, outputBufferSize_ / sizeof(T));
        SetEvent<HardEvent::V_MTE2>(HardEvent::V_MTE2);
    }

    for (int64_t line = firstLine; line < lastLine; ++line) {
        // Decode the flat work-item index into outIndex_, innermost (tiled) axis first.
        int64_t remaining = line;
        for (int8_t axis = ubAxis; axis >= 0; --axis) {
            CalculateOutIndex(remaining, axis);
        }

        // outIndex_[H_AXIS] enumerates window positions, outIndex_[W_AXIS] enumerates kernel taps.
        const int64_t kernelRow = outIndex_[W_AXIS] * hKernelSize_ / outW;
        const int64_t kernelCols = outW / hKernelSize_;
        const int64_t windowRow = outIndex_[H_AXIS] / convKernelNumInWidth_;
        const int64_t windowCol = outIndex_[H_AXIS] % convKernelNumInWidth_;

        inIndex_[C_AXIS] = outIndex_[C_AXIS];
        inIndex_[N_AXIS] = outIndex_[N_AXIS];
        inIndex_[H_AXIS] = windowRow * hStride_ - hPaddingTop_ + kernelRow * hDilation_;
        inIndex_[W_AXIS] = windowCol * wStride_ - wPaddingTop_ +
                           (outIndex_[W_AXIS] - kernelRow * kernelCols) * wDilation_;

        const int64_t localLine = line - firstLine;
        LocalTensor<T> tile = ubBuf[(localLine & (BUFFER_NUM - 1)) * outputTileSize_ / sizeof(T)];
        CopyIn(tile, localLine);
        SetEvent<HardEvent::MTE2_MTE3>(HardEvent::MTE2_MTE3);
        CopyOut(tile, localLine);
        SetEvent<HardEvent::MTE3_V>(HardEvent::MTE3_V);
    }
}
/**
 * Peels the coordinate of one output axis off a flat work-item index.
 *
 * For axes other than ubAxis the coordinate is simply curIdx modulo the axis extent. For ubAxis
 * the axis is cut into tiles of ubFactor_ elements; this sets the tile's start coordinate in
 * outIndex_ and its real length in ubRealFactor_. When the tile is shorter than one "row" of
 * the axis (wKernelSize_ for W, convKernelNumInWidth_ for H) tiles never straddle a row.
 *
 * @param curIdx    in: remaining flat index; out: index with this axis divided out.
 * @param curUbAxis axis to decode.
 */
__aicore__ inline void CalculateOutIndex(int64_t& curIdx, int8_t curUbAxis)
{
    int64_t factorNum = outShape_[curUbAxis];

    if (curUbAxis != ubAxis) {
        outIndex_[curUbAxis] = curIdx % factorNum;
        curIdx = (factorNum != 0) ? curIdx / factorNum : curIdx;
        return;
    }

    const int64_t tile = ubFactor_[curUbAxis];

    // Pick the row geometry when the tile is smaller than one row of the tiled axis.
    bool splitsRows = false;
    int64_t rowLen = 0;
    int64_t rowCount = 0;
    if (ubAxis == W_AXIS && tile < wKernelSize_) {
        splitsRows = true;
        rowLen = wKernelSize_;
        rowCount = hKernelSize_;
    } else if (ubAxis == H_AXIS && tile < convKernelNumInWidth_) {
        splitsRows = true;
        rowLen = convKernelNumInWidth_;
        rowCount = convKernelNumInHeight_;
    }

    if (splitsRows) {
        int64_t tilesPerRow = CeilDiv(rowLen, tile);
        factorNum = tilesPerRow * rowCount;
        if (tilesPerRow != 0) {
            const int64_t tileInRow = curIdx % tilesPerRow;
            // The last tile of a row only covers the remainder.
            if ((tileInRow + 1) * tile <= rowLen) {
                ubRealFactor_ = tile;
            } else {
                ubRealFactor_ = rowLen % tile;
            }
            outIndex_[curUbAxis] = (tileInRow * tile + rowLen * (curIdx / tilesPerRow)) % outShape_[curUbAxis];
        }
    } else {
        factorNum = CeilDiv(outShape_[curUbAxis], ubFactor_[curUbAxis]);
        if (factorNum != 0) {
            outIndex_[curUbAxis] = curIdx % factorNum * tile;
        }
        ubRealFactor_ = Std::min(ubFactor_[curUbAxis], outShape_[curUbAxis] - outIndex_[curUbAxis]);
    }

    curIdx = (factorNum != 0) ? curIdx / factorNum : curIdx;
}
/**
 * Loads the current work item from global memory into a UB tile.
 *
 * From the second work item on, the other UB tile is re-filled with the pad value first (padding
 * builds only) and a V -> MTE2 event is issued before the copy. The copy itself is delegated to
 * the routine that matches ubAxis.
 *
 * @param src UB tile to fill.
 * @param idx index of the work item relative to this core's first one.
 */
__aicore__ inline void CopyIn(const LocalTensor<T>& src, int32_t idx)
{
    if (idx > 0) {
        if constexpr (isPadding) {
            const int32_t followingIdx = idx + 1;
            LocalTensor<T> wholeBuf = inQueue_.Get<T>();
            LocalTensor<T> otherTile =
                wholeBuf[(followingIdx & (BUFFER_NUM - 1)) * outputTileSize_ / sizeof(T)];
            Duplicate(otherTile, constValue_, outputTileSize_ / sizeof(T));
        }
        SetEvent<HardEvent::V_MTE2>(HardEvent::V_MTE2);
    }

    if constexpr (ubAxis == C_AXIS) {
        DoCopyInAxisC(src);
    } else if constexpr (ubAxis == W_AXIS) {
        DoCopyInAxisW(src);
    } else if constexpr (ubAxis == H_AXIS) {
        DoCopyInAxisH(src, ubRealFactor_);
    } else if constexpr (ubAxis == N_AXIS) {
        DoCopyInAxisN(src);
    }
}
/**
 * Copy-in for ubAxis == C_AXIS: loads ubRealFactor_ channel elements of a single input pixel.
 * Nothing is copied when the pixel at inIndex_ lies outside the input (pure padding).
 *
 * @param src UB tile to fill.
 */
__aicore__ inline void DoCopyInAxisC(const LocalTensor<T>& src)
{
    const bool rowInside = inIndex_[H_AXIS] >= 0 && inIndex_[H_AXIS] < inH;
    const bool colInside = inIndex_[W_AXIS] >= 0 && inIndex_[W_AXIS] < inW;
    if (!(rowInside && colInside)) {
        return;
    }

    // Flat element offset of inIndex_ in the input.
    uint64_t gmOffset = 0;
    for (uint8_t axis = 0; axis < MAX_DIMS_NUM; ++axis) {
        gmOffset += inIndex_[axis] * inStride_[axis];
    }

    DataCopyPadExtParams<T> padParams{true, 0, 0, 0};
    DataCopyExtParams burst;
    burst.blockCount = 1;
    burst.blockLen = ubRealFactor_ * sizeof(T);
    burst.srcStride = 0;
    burst.dstStride = 0;
    DataCopyPad(src, input_[gmOffset], burst, padParams);
}
// Copy-in for ubAxis == W_AXIS: loads ubRealFactor_ kernel taps (all C channels each) of one
// sliding window, starting at the tap whose input position is inIndex_. Taps that fall outside
// the input are skipped; their UB slots are left untouched. Overwrites inIndex_[H_AXIS] and
// inIndex_[W_AXIS] with the first valid position.
__aicore__ inline void DoCopyInAxisW(const LocalTensor<T>& src)
{
// Input row / column of the last tap covered by this tile (may lie outside the input).
int64_t inHLast = inIndex_[H_AXIS] + (CeilDiv(ubRealFactor_, wKernelSize_) - 1) * hDilation_;
int64_t inWLast = inIndex_[W_AXIS] + (Std::min(ubRealFactor_, wKernelSize_) - 1) * wDilation_;
// First position >= 0 that lies on the dilation grid anchored at inIndex_.
int64_t startValidHIndex = inIndex_[H_AXIS] + CeilDiv(Std::max(
0L, inIndex_[H_AXIS]) - inIndex_[H_AXIS], hDilation_) * hDilation_;
int64_t startValidWIndex = inIndex_[W_AXIS] + CeilDiv(Std::max(
0L, inIndex_[W_AXIS]) - inIndex_[W_AXIS], wDilation_) * wDilation_;
// Nothing to load when no tap of the tile lands inside the input.
if (inIndex_[H_AXIS] >= inH || inHLast < 0 || inIndex_[W_AXIS] >= inW || inWLast < 0 ||
startValidWIndex < 0 || startValidWIndex > inWLast ||
startValidHIndex < 0 || startValidHIndex > inHLast) {
return;
}
// Last position on the dilation grid that is still inside the input.
int64_t hBound = inHLast >= inH ? inH - 1 : inHLast;
int64_t endValidHIndex = inIndex_[H_AXIS] + (hBound - inIndex_[H_AXIS]) / hDilation_ * hDilation_;
int64_t wBound = inWLast >= inW ? inW - 1 : inWLast;
int64_t endValidWIndex = inIndex_[W_AXIS] + (wBound - inIndex_[W_AXIS]) / wDilation_ * wDilation_;
// UB element offset of the first valid tap: skipped kernel rows * wKernelSize_ + skipped columns,
// each tap occupying alignedCLength_ elements.
uint32_t ubInOffset = ((startValidHIndex - inIndex_[H_AXIS]) / hDilation_ * wKernelSize_ +
(startValidWIndex - inIndex_[W_AXIS]) / wDilation_) * alignedCLength_;
inIndex_[H_AXIS] = startValidHIndex;
inIndex_[W_AXIS] = startValidWIndex;
uint64_t inAddr = 0;
for (uint8_t i = 0; i < MAX_DIMS_NUM; i++) {
inAddr += inIndex_[i] * inStride_[i];
}
// One burst per valid tap in a kernel row: inC elements each, wDilation_ pixels apart in GM.
DataCopyExtParams copyInParams;
DataCopyPadExtParams<T> padParams{true, 0, 0, 0};
copyInParams.blockCount = (endValidWIndex - startValidWIndex) / wDilation_ + 1;
copyInParams.blockLen = inC * sizeof(T);
copyInParams.srcStride = (wDilation_ - 1) * inC * sizeof(T);
copyInParams.dstStride = 0;
// loop1 repeats the row copy for every valid kernel row (hDilation_ input rows apart).
LoopModeParams loopParams;
loopParams.loop2Size = 1;
loopParams.loop1Size = (endValidHIndex - startValidHIndex) / hDilation_ + 1;
loopParams.loop1SrcStride = inW * inC * hDilation_ * sizeof(T);
loopParams.loop1DstStride = wKernelSize_ * alignedCLength_ * sizeof(T);
SetLoopModePara(loopParams, DataCopyMVType::OUT_TO_UB);
DataCopyPad(src[ubInOffset], input_[inAddr], copyInParams, padParams);
ResetLoopModePara(DataCopyMVType::OUT_TO_UB);
}
/**
 * Copy-in for ubAxis == H_AXIS: loads ubFactorH consecutive sliding windows starting at
 * inIndex_, picking the padded or unpadded implementation at compile time.
 *
 * @param src       UB tile to fill.
 * @param ubFactorH number of window positions to load.
 */
__aicore__ inline void DoCopyInAxisH(const LocalTensor<T>& src, int64_t ubFactorH)
{
    if constexpr (!isPadding) {
        DoCopyInAxisHWithoutPad(src, ubFactorH);
    } else {
        DoCopyInAxisHWithPad(src, ubFactorH);
    }
}
/**
 * Loads ubFactorH sliding windows when no padding is involved, so every window is fully inside
 * the input and can be copied with fixed burst / loop parameters.
 *
 * The windows form hSlideNum rows of slideNum windows. The longer of the two dimensions is
 * handed to the hardware loop (loop2) and the shorter one is iterated in software, which keeps
 * the number of issued copies small.
 *
 * @param src       UB tile to fill.
 * @param ubFactorH number of window positions to load.
 */
__aicore__ inline void DoCopyInAxisHWithoutPad(const LocalTensor<T>& src, int64_t ubFactorH)
{
    uint32_t hSlideNum = CeilDiv(ubFactorH, convKernelNumInWidth_);
    uint64_t inAddr = 0;
    for (uint8_t i = 0; i < MAX_DIMS_NUM; ++i) {
        inAddr += inIndex_[i] * inStride_[i];
    }
    // Windows available in the current row, capped at the requested count.
    uint32_t slideNum = Std::min(ubFactorH,
        (inW + wPaddingBottom_ - 1 -(inIndex_[W_AXIS] + wKernelEffSize_ - 1)) / wStride_ + 1);

    // One burst per kernel tap in a kernel row; loop1 walks the kernel rows.
    DataCopyExtParams copyInParams;
    DataCopyPadExtParams<T> padParams{true, 0, 0, 0};
    copyInParams.blockLen = inC * sizeof(T);
    copyInParams.srcStride = (wDilation_ - 1) * inC * sizeof(T);
    copyInParams.dstStride = 0;
    copyInParams.blockCount = wKernelSize_;
    LoopModeParams loopParams;
    loopParams.loop1Size = hKernelSize_;
    loopParams.loop1SrcStride = inW * inC * hDilation_ * sizeof(T);
    loopParams.loop1DstStride = wKernelSize_ * alignedCLength_ * sizeof(T);

    if (slideNum >= hSlideNum) {
        // loop2 walks the windows of one row; software loop walks the rows.
        loopParams.loop2Size = slideNum;
        loopParams.loop2SrcStride = inC * wStride_ * sizeof(T);
        loopParams.loop2DstStride = outW * alignedCLength_ * sizeof(T);
        for (uint32_t i = 0; i < hSlideNum; ++i) {
            // GM element offsets are kept in 64 bits: inW * inC * hStride_ * i can exceed the
            // uint32_t range for large inputs and must not be truncated before being added.
            const uint64_t gmInOffset = static_cast<uint64_t>(inW * inC * hStride_ * i);
            uint32_t ubInOffset = i * slideNum * outW * alignedCLength_;
            SetLoopModePara(loopParams, DataCopyMVType::OUT_TO_UB);
            DataCopyPad(src[ubInOffset], input_[inAddr + gmInOffset], copyInParams, padParams);
            ResetLoopModePara(DataCopyMVType::OUT_TO_UB);
        }
        return;
    }

    // loop2 walks the window rows; software loop walks the windows of a row.
    loopParams.loop2Size = hSlideNum;
    loopParams.loop2SrcStride = inW * inC * hStride_ * sizeof(T);
    loopParams.loop2DstStride = outW * slideNum * alignedCLength_ * sizeof(T);
    for (uint32_t i = 0; i < slideNum; ++i) {
        const uint64_t gmInOffset = static_cast<uint64_t>(inC * wStride_ * i);
        uint32_t ubInOffset = outW * alignedCLength_ * i;
        SetLoopModePara(loopParams, DataCopyMVType::OUT_TO_UB);
        DataCopyPad(src[ubInOffset], input_[inAddr + gmInOffset], copyInParams, padParams);
        ResetLoopModePara(DataCopyMVType::OUT_TO_UB);
    }
}
/**
 * Loads ubFactorH sliding windows when padding is configured. Chooses between the W-preferring
 * and the H-preferring copy strategy from the shape of the window block.
 *
 * @param src       UB tile to fill.
 * @param ubFactorH number of window positions to load.
 */
__aicore__ inline void DoCopyInAxisHWithPad(const LocalTensor<T>& src, int64_t ubFactorH)
{
    const uint32_t windowRows = CeilDiv(ubFactorH, convKernelNumInWidth_);
    const uint32_t windowsPerRow = Std::min(ubFactorH,
        (inW + wPaddingBottom_ - 1 -(inIndex_[W_AXIS] + wKernelEffSize_ - 1)) / wStride_ + 1);

    const bool preferW = (windowRows == 1) || (windowsPerRow >= windowRows);
    if (preferW) {
        DoCopyInAxisConvWPrefer(windowRows, windowsPerRow, src);
    } else {
        DoCopyInAxisConvHPrefer(windowRows, windowsPerRow, src);
    }
}
// Padded copy-in that batches along W: for each of the hSlideNum window rows, windows that need
// W clipping are copied one at a time, while runs of windows fully inside the input along W are
// copied together. inIndex_[H_AXIS] is advanced by hStride_ per row and is NOT restored on
// return; inIndex_[W_AXIS] is restored after every row.
__aicore__ inline void DoCopyInAxisConvWPrefer(uint32_t hSlideNum, uint32_t wSlideNum, const LocalTensor<T>& src)
{
for (uint32_t i = 0; i < hSlideNum; ++i) {
// Clip the kernel rows of this window row to the input, staying on the dilation grid.
int64_t inHLast = inIndex_[H_AXIS] + (hKernelSize_ - 1) * hDilation_;
int64_t startValidHIndex = inIndex_[H_AXIS] + CeilDiv(Std::max(
0L, inIndex_[H_AXIS]) - inIndex_[H_AXIS], hDilation_) * hDilation_;
int64_t endValidHIndex = inIndex_[H_AXIS] + (Std::min(inHLast, inH - 1) - inIndex_[H_AXIS]) / hDilation_ * hDilation_;
// Whole window row lies in the padding: skip it, UB keeps whatever it already holds.
if (inIndex_[H_AXIS] >= inH || inHLast < 0 ||
startValidHIndex < 0 || startValidHIndex > inHLast || endValidHIndex < 0) {
inIndex_[H_AXIS] += hStride_;
continue;
}
int64_t oriWAxis = inIndex_[W_AXIS];
uint32_t j = 0;
while (j < wSlideNum) {
int64_t inWLast = inIndex_[W_AXIS] + (wKernelSize_ - 1) * wDilation_;
// UB element offset of window (i, j); each window occupies outW * alignedCLength_ elements.
uint64_t ubStartAddr = (i * wSlideNum + j) * outW * alignedCLength_;
// Window overlaps the left or right border: copy it alone with W clipping.
if (inIndex_[W_AXIS] < 0 || inWLast >= inW) {
DoCopyInKernelWSlideWithPad(startValidHIndex, endValidHIndex, inWLast, src, ubStartAddr);
inIndex_[W_AXIS] += wStride_;
++j;
continue;
}
// Number of consecutive windows, starting here, that fit inside the input along W.
uint32_t validSlideNum = (inW - (inIndex_[W_AXIS] + wKernelEffSize_)) / wStride_ + 1;
uint32_t untreatedWSlideNum = Std::min(validSlideNum, wSlideNum - j);
DoCopyInKernelWSlideWithoutPad(startValidHIndex, endValidHIndex, untreatedWSlideNum, src, ubStartAddr);
inIndex_[W_AXIS] += wStride_ * untreatedWSlideNum;
j += untreatedWSlideNum;
}
inIndex_[H_AXIS] += hStride_;
inIndex_[W_AXIS] = oriWAxis;
}
}
// Padded copy-in that batches along H: runs of window rows that need no H clipping are copied
// together (one column of windows per copy); window rows that touch the top/bottom border are
// handled one row at a time. inIndex_[H_AXIS] is advanced and NOT restored on return;
// inIndex_[W_AXIS] is restored after each step.
__aicore__ inline void DoCopyInAxisConvHPrefer(
const uint32_t hSlideNum, const uint32_t wSlideNum, const LocalTensor<T>& src)
{
uint32_t h = 0;
while (h < hSlideNum) {
// Clip the kernel rows of this window row to the input, staying on the dilation grid.
int64_t inHLast = inIndex_[H_AXIS] + (hKernelSize_ - 1) * hDilation_;
int64_t startValidHIndex = inIndex_[H_AXIS] + CeilDiv(Std::max(
0L, inIndex_[H_AXIS]) - inIndex_[H_AXIS], hDilation_) * hDilation_;
int64_t endValidHIndex = inIndex_[H_AXIS] + (Std::min(inHLast, inH - 1) - inIndex_[H_AXIS]) / hDilation_ * hDilation_;
// Whole window row lies in the padding: nothing to load.
if (inIndex_[H_AXIS] >= inH || inHLast < 0 ||
startValidHIndex < 0 || startValidHIndex > inHLast || endValidHIndex < 0) {
inIndex_[H_AXIS] += hStride_;
++h;
continue;
}
int64_t oriWAxis = inIndex_[W_AXIS];
uint32_t ubStartAddr = h * wSlideNum * outW * alignedCLength_;
// Window row is fully inside the input along H: copy as many such rows as possible at once.
if (inIndex_[H_AXIS] >= 0 && inHLast < inH) {
uint32_t theHSlideNum = (inH - (inIndex_[H_AXIS] + hKernelEffSize_)) / hStride_ + 1;
uint32_t untreatedHSlideNum = Std::min(theHSlideNum, hSlideNum - h);
DoCopyInKernelHSlideWithoutPad(endValidHIndex - startValidHIndex, untreatedHSlideNum, wSlideNum, src, ubStartAddr);
inIndex_[H_AXIS] += hStride_ * untreatedHSlideNum;
// The callee advances inIndex_[W_AXIS]; put it back.
inIndex_[W_AXIS] = oriWAxis;
h += untreatedHSlideNum;
continue;
}
// Window row is clipped along H: walk its windows.
uint32_t w = 0;
while (w < wSlideNum) {
ubStartAddr = outW * wSlideNum * h * alignedCLength_ + w * outW * alignedCLength_;
int64_t inWLast = inIndex_[W_AXIS] + (wKernelSize_ - 1) * wDilation_;
if (isWPadding_) {
DoCopyInKernelWSlideWithPad(startValidHIndex, endValidHIndex, inWLast, src, ubStartAddr);
inIndex_[W_AXIS] += wStride_;
++w;
} else {
// NOTE(review): copies convKernelNumInWidth_ windows regardless of wSlideNum; presumably
// wSlideNum is a multiple of convKernelNumInWidth_ on this path -- confirm against tiling.
DoCopyInKernelWSlideWithoutPad(startValidHIndex, endValidHIndex, convKernelNumInWidth_ , src, ubStartAddr);
inIndex_[W_AXIS] += convKernelNumInWidth_ * wStride_;
w += convKernelNumInWidth_;
}
}
inIndex_[H_AXIS] += hStride_;
inIndex_[W_AXIS] = oriWAxis;
++h;
}
}
// Copies untreatedHSlideNum window rows that need no H clipping. For each of the wSlideNum window
// columns one copy is issued that covers all those rows (loop2), clipping the kernel taps along W
// only. validHLength is (last valid kernel row - first valid kernel row) in input rows.
// ubStartAddr is the UB element offset of the first window. Advances inIndex_[W_AXIS] by
// wStride_ per column; the caller restores it.
__aicore__ inline void DoCopyInKernelHSlideWithoutPad(const int64_t validHLength,
const uint32_t untreatedHSlideNum, const uint32_t wSlideNum, const LocalTensor<T>& src, uint32_t ubStartAddr)
{
DataCopyExtParams copyInParams;
copyInParams.blockLen = inC * sizeof(T);
copyInParams.srcStride = (wDilation_ - 1) * inC * sizeof(T);
copyInParams.dstStride = 0;
// loop1: kernel rows of one window; loop2: consecutive window rows (hStride_ input rows apart).
LoopModeParams loopParams;
loopParams.loop1DstStride = wKernelSize_ * alignedCLength_ * sizeof(T);
loopParams.loop1SrcStride = inW * inC * hDilation_ * sizeof(T);
loopParams.loop2Size = untreatedHSlideNum;
loopParams.loop2SrcStride = inW * inC * hStride_ * sizeof(T);
loopParams.loop2DstStride = outW * wSlideNum * alignedCLength_ * sizeof(T);
DataCopyPadExtParams<T> padParams{true, 0, 0, 0};
for (uint32_t w = 0; w < wSlideNum; ++w) {
// Clip the kernel columns of this window to the input, staying on the dilation grid.
int64_t startValidWIndex = inIndex_[W_AXIS] + CeilDiv(Std::max(
0L, inIndex_[W_AXIS]) - inIndex_[W_AXIS], wDilation_) * wDilation_;
int64_t inWLast = inIndex_[W_AXIS] + (wKernelSize_ - 1) * wDilation_;
int64_t endValidWIndex = inIndex_[W_AXIS] + (Std::min(inWLast, inW - 1) - inIndex_[W_AXIS]) / wDilation_ * wDilation_;
// Window column lies entirely in the W padding: skip.
if (inIndex_[W_AXIS] >= inW || inWLast < 0 || startValidWIndex < 0 ||
startValidWIndex > inWLast || endValidWIndex < 0) {
inIndex_[W_AXIS] += wStride_;
continue;
}
// GM offset uses the clipped W start and the unclipped N/H/C coordinates.
uint64_t inAddr = startValidWIndex * inStride_[W_AXIS];
for (uint8_t i = 0; i < MAX_DIMS_NUM; ++i) {
if (i == W_AXIS) {
continue;
}
inAddr += inIndex_[i] * inStride_[i];
}
// UB offset: window column w, plus the kernel taps skipped on the left.
uint32_t ubInOffset = outW * alignedCLength_ * w +
(startValidWIndex - inIndex_[W_AXIS]) / wDilation_ * alignedCLength_;
copyInParams.blockCount = (endValidWIndex - startValidWIndex) / wDilation_ + 1;
loopParams.loop1Size = validHLength / hDilation_ + 1;
SetLoopModePara(loopParams, DataCopyMVType::OUT_TO_UB);
DataCopyPad(src[ubStartAddr + ubInOffset], input_[inAddr], copyInParams, padParams);
ResetLoopModePara(DataCopyMVType::OUT_TO_UB);
inIndex_[W_AXIS] += wStride_;
}
}
// Copies a single sliding window, clipped along both H (range given by the caller) and W
// (computed here). inWLast is the input column of the window's last kernel tap; ubStartAddr is
// the UB element offset of the window. Does nothing if the window has no valid column.
__aicore__ inline void DoCopyInKernelWSlideWithPad(const int64_t startValidHIndex,
const int64_t endValidHIndex, const int64_t inWLast, const LocalTensor<T>& src, uint32_t ubStartAddr)
{
// First / last kernel column inside the input, on the dilation grid anchored at inIndex_[W_AXIS].
int64_t startValidWIndex = inIndex_[W_AXIS] + CeilDiv(Std::max(
0L, inIndex_[W_AXIS]) - inIndex_[W_AXIS], wDilation_) * wDilation_;
int64_t endValidWIndex = inIndex_[W_AXIS] + (Std::min(inWLast, inW - 1) - inIndex_[W_AXIS]) / wDilation_ * wDilation_;
if (inIndex_[W_AXIS] >= inW || inWLast < 0 || startValidWIndex < 0 ||
startValidWIndex > inWLast || endValidWIndex < 0) {
return;
}
uint64_t inAddr = startValidHIndex * inStride_[H_AXIS] + startValidWIndex * inStride_[W_AXIS];
// Stepping by C_AXIS (3) visits only dim 0 (N_AXIS) and dim 3 (C_AXIS); H and W were added above.
for (uint8_t dim = 0; dim < MAX_DIMS_NUM; dim += C_AXIS) {
inAddr += inIndex_[dim] * inStride_[dim];
}
// One burst per valid kernel column; loop1 walks the valid kernel rows.
DataCopyExtParams copyInParams;
copyInParams.blockCount = (endValidWIndex - startValidWIndex) / wDilation_ + 1;
copyInParams.blockLen = inC * sizeof(T);
copyInParams.srcStride = (wDilation_ - 1) * inC * sizeof(T);
copyInParams.dstStride = 0;
LoopModeParams loopParams;
loopParams.loop1DstStride = wKernelSize_ * alignedCLength_ * sizeof(T);
loopParams.loop1SrcStride = inW * inC * hDilation_ * sizeof(T);
loopParams.loop1Size = (endValidHIndex - startValidHIndex) / hDilation_ + 1;
loopParams.loop2Size = 1;
// Skip the UB slots of the kernel taps that were clipped at the top and on the left.
uint32_t ubInOffset = ((startValidHIndex - inIndex_[H_AXIS]) / hDilation_ * wKernelSize_ +
(startValidWIndex - inIndex_[W_AXIS]) / wDilation_) * alignedCLength_;
DataCopyPadExtParams<T> padParams{true, 0, 0, 0};
SetLoopModePara(loopParams, DataCopyMVType::OUT_TO_UB);
DataCopyPad(src[ubStartAddr + ubInOffset], input_[inAddr], copyInParams, padParams);
ResetLoopModePara(DataCopyMVType::OUT_TO_UB);
}
/**
 * Copies a run of consecutive windows of one window row that need no clipping along W.
 * Kernel rows are restricted to [startValidHIndex, endValidHIndex] by the caller.
 *
 * @param startValidHIndex   first kernel row (input row index) inside the input.
 * @param endValidHIndex     last kernel row (input row index) inside the input.
 * @param untreatedWSlideNum number of consecutive windows to copy.
 * @param src                UB tile to fill.
 * @param ubStartAddr        UB element offset of the first window.
 */
__aicore__ inline void DoCopyInKernelWSlideWithoutPad(const int64_t startValidHIndex,
    const int64_t endValidHIndex, const uint32_t untreatedWSlideNum, const LocalTensor<T>& src, uint32_t ubStartAddr)
{
    // GM offset: clipped row start, current N / W / C coordinates.
    uint64_t gmOffset = startValidHIndex * inStride_[H_AXIS];
    for (uint8_t axis = 0; axis < MAX_DIMS_NUM; ++axis) {
        if (axis != H_AXIS) {
            gmOffset += inIndex_[axis] * inStride_[axis];
        }
    }

    // Kernel rows clipped at the top leave their UB slots untouched.
    const uint32_t skippedRowsOffset =
        ((startValidHIndex - inIndex_[H_AXIS]) / hDilation_ * wKernelSize_) * alignedCLength_;

    // One burst per kernel column of a kernel row.
    DataCopyExtParams burst;
    burst.blockCount = wKernelSize_;
    burst.blockLen = inC * sizeof(T);
    burst.srcStride = (wDilation_ - 1) * inC * sizeof(T);
    burst.dstStride = 0;

    // loop1: valid kernel rows; loop2: consecutive windows along W.
    LoopModeParams loops;
    loops.loop1Size = (endValidHIndex - startValidHIndex) / hDilation_ + 1;
    loops.loop1SrcStride = inW * inC * hDilation_ * sizeof(T);
    loops.loop1DstStride = wKernelSize_ * alignedCLength_ * sizeof(T);
    loops.loop2Size = untreatedWSlideNum;
    loops.loop2SrcStride = inC * wStride_ * sizeof(T);
    loops.loop2DstStride = outW * alignedCLength_ * sizeof(T);

    DataCopyPadExtParams<T> padParams{true, 0, 0, 0};
    SetLoopModePara(loops, DataCopyMVType::OUT_TO_UB);
    DataCopyPad(src[ubStartAddr + skippedRowsOffset], input_[gmOffset], burst, padParams);
    ResetLoopModePara(DataCopyMVType::OUT_TO_UB);
}
/**
 * Copy-in for ubAxis == N_AXIS: loads all sliding windows of ubRealFactor_ batches.
 *
 * When the batch count exceeds both window counts the batch dimension is put into the hardware
 * loop (DoCopyInAxisNWith[out]Pad); otherwise every batch is loaded separately through the
 * H-axis routine.
 *
 * @param src UB tile to fill.
 */
__aicore__ inline void DoCopyInAxisN(const LocalTensor<T>& src)
{
    const bool perBatch = ubRealFactor_ <= convKernelNumInWidth_ || ubRealFactor_ <= convKernelNumInHeight_;
    if (perBatch) {
        const int64_t firstRow = inIndex_[H_AXIS];
        for (uint32_t batch = 0; batch < ubRealFactor_; ++batch) {
            DoCopyInAxisH(src[outW * outH * alignedCLength_ * batch], ubFactor_[H_AXIS]);
            ++inIndex_[N_AXIS];
            // The H routine may have advanced the row index; start the next batch from the top.
            inIndex_[H_AXIS] = firstRow;
        }
        return;
    }

    if constexpr (isPadding) {
        DoCopyInAxisNWithPad(src);
    } else {
        DoCopyInAxisNWithoutPad(src);
    }
}
// Padded copy-in with the batch dimension in the hardware loop: for every window position
// (h, w) one copy loads the clipped window for all ubRealFactor_ batches (loop2, source stride
// inStride_[N_AXIS]). Positions whose window lies entirely in the padding are skipped.
// inIndex_[H_AXIS] is advanced and NOT restored on return.
__aicore__ inline void DoCopyInAxisNWithPad(const LocalTensor<T>& src)
{
DataCopyExtParams copyInParams;
DataCopyPadExtParams<T> padParams{true, 0, 0, 0};
copyInParams.blockLen = inC * sizeof(T);
copyInParams.srcStride = (wDilation_ - 1) * inC * sizeof(T);
copyInParams.dstStride = 0;
// loop1: valid kernel rows of a window; loop2: batches.
LoopModeParams loopParams;
loopParams.loop2Size = ubRealFactor_;
loopParams.loop2SrcStride = inStride_[N_AXIS] * sizeof(T);
loopParams.loop2DstStride = outW * outH * alignedCLength_ * sizeof(T);
loopParams.loop1SrcStride = inW * inC * hDilation_ * sizeof(T);
loopParams.loop1DstStride = wKernelSize_ * alignedCLength_ * sizeof(T);
for (uint32_t h = 0; h < convKernelNumInHeight_; ++h) {
// Clip the kernel rows of this window row to the input, staying on the dilation grid.
int64_t inHLast = inIndex_[H_AXIS] + (hKernelSize_ - 1) * hDilation_;
int64_t startValidHIndex = inIndex_[H_AXIS] + CeilDiv(Std::max(
0L, inIndex_[H_AXIS]) - inIndex_[H_AXIS], hDilation_) * hDilation_;
int64_t endValidHIndex = inIndex_[H_AXIS] + (Std::min(inHLast, inH - 1) - inIndex_[H_AXIS]) / hDilation_ * hDilation_;
if (inIndex_[H_AXIS] >= inH || inHLast < 0 ||
startValidHIndex < 0 || startValidHIndex > inHLast || endValidHIndex < 0) {
inIndex_[H_AXIS] += hStride_;
continue;
}
loopParams.loop1Size = (endValidHIndex - startValidHIndex) / hDilation_ + 1;
int64_t oriWAxis = inIndex_[W_AXIS];
for (uint32_t w = 0; w < convKernelNumInWidth_; ++w) {
// Clip the kernel columns of this window to the input.
int64_t inWLast = inIndex_[W_AXIS] + (wKernelSize_ - 1) * wDilation_;
int64_t startValidWIndex = inIndex_[W_AXIS] + CeilDiv(Std::max(
0L, inIndex_[W_AXIS]) - inIndex_[W_AXIS], wDilation_) * wDilation_;
int64_t endValidWIndex = inIndex_[W_AXIS] + (Std::min(inWLast, inW - 1) - inIndex_[W_AXIS]) / wDilation_ * wDilation_;
if (inIndex_[W_AXIS] >= inW || inWLast < 0 ||
startValidWIndex < 0 || startValidWIndex > inWLast || endValidWIndex < 0) {
inIndex_[W_AXIS] += wStride_;
continue;
}
copyInParams.blockCount = (endValidWIndex - startValidWIndex) / wDilation_ + 1;
uint64_t inAddr = startValidHIndex * inStride_[H_AXIS] + startValidWIndex * inStride_[W_AXIS];
// Stepping by C_AXIS (3) visits only dim 0 (N_AXIS) and dim 3 (C_AXIS); H and W were added above.
for (uint8_t dim = 0; dim < MAX_DIMS_NUM; dim += C_AXIS) {
inAddr += inIndex_[dim] * inStride_[dim];
}
// UB offset: window (h, w) plus the kernel taps clipped at the top and on the left.
uint32_t ubInOffset = outW * alignedCLength_ * (w + convKernelNumInWidth_ * h) + alignedCLength_ *
((startValidHIndex - inIndex_[H_AXIS]) / hDilation_ * wKernelSize_ +
(startValidWIndex - inIndex_[W_AXIS]) / wDilation_);
SetLoopModePara(loopParams, DataCopyMVType::OUT_TO_UB);
DataCopyPad(src[ubInOffset], input_[inAddr], copyInParams, padParams);
ResetLoopModePara(DataCopyMVType::OUT_TO_UB);
inIndex_[W_AXIS] += wStride_;
}
inIndex_[H_AXIS] += hStride_;
inIndex_[W_AXIS] = oriWAxis;
}
}
/**
 * Unpadded copy-in with the batch dimension in the hardware loop: for every window position
 * (h, w) one copy loads the complete window for all ubRealFactor_ batches.
 *
 * @param src UB tile to fill.
 */
__aicore__ inline void DoCopyInAxisNWithoutPad(const LocalTensor<T>& src)
{
    uint64_t inAddr = 0;
    for (uint8_t i = 0; i < MAX_DIMS_NUM; ++i) {
        inAddr += inIndex_[i] * inStride_[i];
    }

    // One burst per kernel column of a kernel row.
    DataCopyExtParams copyInParams;
    DataCopyPadExtParams<T> padParams{true, 0, 0, 0};
    copyInParams.blockLen = inC * sizeof(T);
    copyInParams.srcStride = (wDilation_ - 1) * inC * sizeof(T);
    copyInParams.dstStride = 0;
    copyInParams.blockCount = wKernelSize_;

    // loop1: kernel rows of a window; loop2: batches.
    LoopModeParams loopParams;
    loopParams.loop2Size = ubRealFactor_;
    loopParams.loop2SrcStride = inStride_[N_AXIS] * sizeof(T);
    loopParams.loop2DstStride = outW * outH * alignedCLength_ * sizeof(T);
    loopParams.loop1Size = hKernelSize_;
    loopParams.loop1SrcStride = inW * inC * hDilation_ * sizeof(T);
    loopParams.loop1DstStride = wKernelSize_ * alignedCLength_ * sizeof(T);

    for (uint32_t h = 0; h < convKernelNumInHeight_; ++h) {
        for (uint32_t w = 0; w < convKernelNumInWidth_; ++w) {
            // The GM offset can reach the size of a whole H*W*C image, so it is kept in 64 bits;
            // a uint32_t would silently truncate it for large inputs.
            const uint64_t gmInOffset =
                static_cast<uint64_t>(inC * wStride_ * w + inW * inC * hStride_ * h);
            uint32_t ubInOffset = outW * alignedCLength_ * w + outW * alignedCLength_ * convKernelNumInWidth_ * h;
            SetLoopModePara(loopParams, DataCopyMVType::OUT_TO_UB);
            DataCopyPad(src[ubInOffset], input_[inAddr + gmInOffset], copyInParams, padParams);
            ResetLoopModePara(DataCopyMVType::OUT_TO_UB);
        }
    }
}
/**
 * Stores the current UB tile to the output at outIndex_.
 *
 * For ubAxis == C_AXIS a single block of ubRealFactor_ elements is written. Otherwise each block
 * is one full C vector and the block count is the number of (tap / window / batch) rows the
 * tile covers.
 *
 * @param src UB tile holding the data to store.
 * @param idx index of the work item relative to this core's first one (currently unused).
 */
__aicore__ inline void CopyOut(const LocalTensor<T>& src, const int32_t idx)
{
    // Flat element offset of outIndex_ in the output.
    uint64_t gmOffset = 0;
    for (uint8_t axis = 0; axis < MAX_DIMS_NUM; ++axis) {
        gmOffset += outIndex_[axis] * outStride_[axis];
    }

    DataCopyExtParams burst;
    burst.srcStride = 0;
    burst.dstStride = 0;
    if constexpr (ubAxis == C_AXIS) {
        burst.blockCount = 1;
        burst.blockLen = ubRealFactor_ * sizeof(T);
    } else {
        burst.blockLen = outShape_[C_AXIS] * sizeof(T);
        if constexpr (ubAxis == W_AXIS) {
            burst.blockCount = ubRealFactor_;
        } else if constexpr (ubAxis == H_AXIS) {
            burst.blockCount = ubRealFactor_ * outW;
        } else if constexpr (ubAxis == N_AXIS) {
            burst.blockCount = ubRealFactor_ * outH * outW;
        }
    }
    DataCopyPad(output_[gmOffset], src[0], burst);
}
/**
 * Issues a SetFlag immediately followed by the matching WaitFlag for one hardware event.
 *
 * @tparam EVENT event used for SetFlag / WaitFlag.
 * @param evt    event whose id is fetched from the pipe; callers pass the same value as EVENT.
 */
template <HardEvent EVENT>
__aicore__ inline void SetEvent(HardEvent evt)
{
    const auto flagId = static_cast<event_t>(GetTPipePtr()->FetchEventID(evt));
    SetFlag<EVENT>(flagId);
    WaitFlag<EVENT>(flagId);
}
};
}
#endif