/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
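/*!
 * \file dropmask.h
 * \brief Dropout-mask helpers: on-chip Philox-based mask generation and copy-in/conversion of
 *        externally supplied masks.
 */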
#ifndef DROPMASK_H
#define DROPMASK_H
#include "adv_api/math/philox.h"
#include "util_regbase.h"
using namespace AscendC;
using namespace AscendC::MicroAPI;
using AscendC::DropOutShapeInfo;
namespace regbaseutil {
// ---- Size / alignment constants used by the drop-mask helpers below ----

// Number of uint8 values held by one uint32 word; converts an S2 length in bytes to uint32 words.
static constexpr int32_t uint32Uint8Ratio = 4;
// NOTE(review): not referenced in this part of the file; presumably uint32 elements per 32-byte block - confirm.
static constexpr int32_t uint32BlockSize = 8;
// NOTE(review): not referenced in this part of the file; presumably the number of random uint8 values
// associated with one offset step - confirm.
static constexpr int32_t eachOffsetRandomUint8Num = 16;
// The per-row uint32 count is rounded up to a multiple of this value.
static constexpr int32_t dropMaskUint32AlignSize = 4;
// S2 lengths are rounded up to a multiple of this value, and mask offsets are divided by it,
// when the Philox counter is derived.
static constexpr int32_t philoxRandomNumAlignSize = 16;
// NOTE(review): not referenced in this part of the file - confirm usage.
static constexpr int32_t halfBaseSize = 16;
// GenIndexVec takes the aligned-store path when the per-row index count is a multiple of this value.
static constexpr uint32_t eachRowAlignNum = 8;

// ---- Philox constants (same values as the reference Philox-4x32 multipliers / key increments) ----

// Multiplier applied to counter word 0 in every round.
static constexpr uint32_t PHILOX_CONST_MUL_0 = 0xD2511F53;
// Multiplier applied to counter word 2 in every round.
static constexpr uint32_t PHILOX_CONST_MUL_1 = 0xCD9E8D57;
// Value added to key word 0 after every round.
static constexpr uint32_t PHILOX_CONST_KEY_ADD_0 = 0x9E3779B9;
// Value added to key word 1 after every round.
static constexpr uint32_t PHILOX_CONST_KEY_ADD_1 = 0xBB67AE85;

// ---- Post-update offsets passed to the LoadAlign / StoreAlign calls in the mask conversion kernels ----
static constexpr uint32_t OFFSET_64 = 64;
static constexpr uint32_t OFFSET_32 = 32;
/**
 * @brief Parameters that control dropout-mask handling.
 *
 * Plain aggregate; the field order is part of the interface (brace initialisation).
 */
struct DropMaskInfo {
    int64_t seed;           // random seed; split into two uint32 words and used as the Philox key
    int64_t offset;         // base offset added to the computed drop-mask offset
    uint8_t keepProbUint8;  // keep probability as a uint8 threshold (random byte <= threshold sets the bit)
    uint8_t dropMaskOuter;  // NOTE(review): not read in this part of the file; presumably flags an
                            // externally supplied mask - confirm
    bool boolMode;          // true: external mask is copied in byte-per-element form (BoolCopyIn);
                            // false: it is copied in bit-packed form (Bit2Int8CopyIn)
};
/**
 * @brief Compute the element offset of the current tile inside the generated (Philox) drop mask.
 *
 * The S2 length is first rounded up to a multiple of philoxRandomNumAlignSize; the batch, N2, G,
 * S1 and S2 contributions are then summed and the user-supplied base offset is added.
 *
 * @param [in] runInfo       per-iteration indices and actual sizes
 * @param [in] constInfo     constant tiling information (n2G, gSize, s1BaseSize, s2BaseSize)
 * @param [in] dropMaskInfo  supplies the base offset
 * @return tile offset plus dropMaskInfo.offset, as an unsigned 64-bit value
 */
template <bool hasDrop, bool hasRope = false>
__aicore__ inline uint64_t ComputeDropOffset(RunInfo<false> &runInfo, ConstInfo<false, hasRope> &constInfo,
                                             DropMaskInfo &dropMaskInfo)
{
    // Row pitch: actual S2 length rounded up to the Philox alignment.
    const int64_t alignedRowLen =
        CeilDiv(runInfo.actualS2Size, philoxRandomNumAlignSize) * philoxRandomNumAlignSize;

    const int64_t batchPart = runInfo.b1SSOffsetAlign * constInfo.n2G;
    const int64_t n2Part = runInfo.n2oIdx * constInfo.gSize * runInfo.actualS1Size * alignedRowLen;
    const int64_t gPart = runInfo.goIdx * runInfo.actualS1Size * alignedRowLen;
    const int64_t s1Part = (runInfo.s1oIdx * constInfo.s1BaseSize + runInfo.vecCoreOffset) * alignedRowLen;
    const int64_t s2Part = runInfo.s2StartIdx + runInfo.s2LoopCount * constInfo.s2BaseSize;

    int64_t tileOffset = batchPart;
    tileOffset += n2Part;
    tileOffset += gPart;
    tileOffset += s1Part;
    tileOffset += s2Part;

    const uint64_t baseOffset = static_cast<uint64_t>(dropMaskInfo.offset);
    return static_cast<uint64_t>(tileOffset) + baseOffset;
}
/**
 * @brief VF kernel: write the per-row index vector using aligned stores.
 *
 * An index register is initialised with Arange starting at 0. For every row the register is stored
 * to the destination (which is post-updated by eachRowIndexNum elements) and then increased by
 * eachRowOffset.
 *
 * @param [in] indexVecDstLocalInt  destination address passed as an integer; reinterpreted as an
 *                                  int32 pointer for the post-updating stores
 * @param [in] rowNums              number of rows to emit (narrowed to uint16_t for the loop)
 * @param [in] eachRowIndexNum      number of indices written per row
 * @param [in] eachRowOffset        value added to the indices between consecutive rows
 */
__simd_vf__ inline void GenIndexAlign(const uint64_t indexVecDstLocalInt, const uint32_t rowNums,
                                      const uint32_t eachRowIndexNum, const uint32_t eachRowOffset)
{
    RegTensor<int32_t> rowIndex;
    // UpdateMask takes its count by reference, so it is given a scratch copy.
    uint32_t activeCount = eachRowIndexNum;
    MaskReg activeLanes = UpdateMask<int32_t>(activeCount);
    Arange(rowIndex, 0);

    const uint16_t rowCount = static_cast<uint16_t>(rowNums);
    for (uint16_t row = 0; row < rowCount; ++row) {
        StoreAlign<int32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE, MicroAPI::StoreDist::DIST_NORM_B32>(
            (__ubuf__ int32_t *&)indexVecDstLocalInt, rowIndex, eachRowIndexNum, activeLanes);
        Adds(rowIndex, rowIndex, eachRowOffset, activeLanes);
    }
}
/**
 * @brief VF kernel: write the per-row index vector using unaligned stores.
 *
 * Same sequence as GenIndexAlign, but each row is written with StoreUnAlign through an
 * UnalignRegForStore, and StoreUnAlignPost is issued once after the loop
 * (NOTE(review): presumably to flush data still held in the unalign register - confirm).
 *
 * @param [in] indexVecDstLocalInt  destination address passed as an integer; reinterpreted as an
 *                                  int32 pointer for the post-updating stores
 * @param [in] rowNums              number of rows to emit (narrowed to uint16_t for the loop)
 * @param [in] eachRowIndexNum      number of indices written per row
 * @param [in] eachRowOffset        value added to the indices between consecutive rows
 */
__simd_vf__ inline void GenIndexUnAling(const uint64_t indexVecDstLocalInt, const uint32_t rowNums,
                                        const uint32_t eachRowIndexNum, const uint32_t eachRowOffset)
{
    RegTensor<int32_t> rowIndex;
    UnalignRegForStore storeState;
    // UpdateMask takes its count by reference, so it is given a scratch copy.
    uint32_t activeCount = eachRowIndexNum;
    MaskReg activeLanes = UpdateMask<int32_t>(activeCount);
    Arange(rowIndex, 0);

    const uint16_t rowCount = static_cast<uint16_t>(rowNums);
    for (uint16_t row = 0; row < rowCount; ++row) {
        StoreUnAlign<int32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE>(
            (__ubuf__ int32_t *&)indexVecDstLocalInt, rowIndex, storeState, eachRowIndexNum);
        Adds(rowIndex, rowIndex, eachRowOffset, activeLanes);
    }
    StoreUnAlignPost<int32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE>(
        (__ubuf__ int32_t *&)indexVecDstLocalInt, storeState, 0);
}
* @ingroup GenIndexVec
* @brief generate basic block index vector, eachRowIndexNum need 8 aligned
* @param [out] dropmaskIndexVec, output LocalTensor
* @param [in] rowNums, basic block's rows
* @param [in] eachRowIndexNum, each row need index nums, eachRowIndexNum = CeilDiv(CeilDiv(s2RealSize, 4), 4) * 4 / 4, only support less equal 64
* @param [in] eachRowOffset, each row index need add offset, eachRowOffset = CeilDiv(inputParams.s2Size, 16)
*/
__aicore__ inline void GenIndexVec(LocalTensor<int32_t> &dropmaskIndexVec, uint32_t rowNums, uint32_t eachRowIndexNum,
uint32_t eachRowOffset)
{
uint64_t indexVecDstLocalInt = reinterpret_cast<uint64_t>(dropmaskIndexVec.GetPhyAddr());
if (eachRowIndexNum % eachRowAlignNum == 0) {
GenIndexAlign(indexVecDstLocalInt, rowNums, eachRowIndexNum, eachRowOffset);
} else {
GenIndexUnAling(indexVecDstLocalInt, rowNums, eachRowIndexNum, eachRowOffset);
}
}
/**
 * @brief VF kernel: generate random words with a vectorised Philox-style round function and turn
 *        them into mask registers by comparing every byte against a threshold.
 *
 * Per iteration: one vector of int32 indices is loaded, added (with carry) to the broadcast
 * 128-bit counter, run through 7 multiply/xor rounds with the broadcast key, the four result
 * registers are interleaved, reinterpreted as bytes, compared (<=) with probValueUint8Scalar, and
 * the four resulting mask registers are stored to `mask`.
 *
 * @param [out] mask                  destination pointer; advanced by the four post-updating stores per iteration
 * @param [in]  indexVecLocalInt      address of the int32 index vector, passed as an integer; advanced by the post-updating load
 * @param [in]  key0, key1            key words, broadcast to all lanes
 * @param [in]  counter0..counter3    base counter words, broadcast to all lanes
 * @param [in]  count                 not referenced in the body of this function
 * @param [in]  probValueUint8Scalar  byte threshold used by the <= comparison
 * @param [in]  mainLoop              number of iterations
 */
__simd_vf__ inline void GenMaskVF(__ubuf__ uint32_t *mask, const uint64_t indexVecLocalInt,
const uint32_t key0, const uint32_t key1, const uint32_t counter0,
const uint32_t counter1, const uint32_t counter2, const uint32_t counter3,
const uint16_t count, const uint8_t probValueUint8Scalar, const uint16_t mainLoop)
{
// pg: all-lanes mask for the uint32 arithmetic; preg: all-lanes mask for the uint8 comparisons.
MaskReg pg = CreateMask<uint32_t, MaskPattern::ALL>();
MaskReg preg = CreateMask<uint8_t, MaskPattern::ALL>();
// pd carries the carry flag between the counter-word additions.
MaskReg pd;
MaskReg pm_0, pm_1, pm_2, pm_3;
RegTensor<uint32_t> ctr_3, ctr_2, ctr_1, ctr_0, key_1, key_0;
// Broadcast the scalar key and base counter to every lane.
Duplicate(key_0, key0);
Duplicate(key_1, key1);
Duplicate(ctr_0, counter0);
Duplicate(ctr_1, counter1);
Duplicate(ctr_2, counter2);
Duplicate(ctr_3, counter3);
RegTensor<int32_t> inc_idx;
RegTensor<uint32_t> v_zero;
RegTensor<uint32_t> v_const_mul_0, v_const_mul_1;
Duplicate(v_zero, 0x0);
Duplicate(v_const_mul_0, (uint32_t)PHILOX_CONST_MUL_0);
Duplicate(v_const_mul_1, (uint32_t)PHILOX_CONST_MUL_1);
for (uint16_t i = 0; i < mainLoop; i++) {
// Load the next vector of per-lane indices; the source address is post-updated by ELE_CNT_B32.
LoadAlign<int32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE>(
inc_idx, ((__ubuf__ int32_t *&)indexVecLocalInt), ELE_CNT_B32);
RegTensor<uint32_t> tmp_ctr_0 = ctr_0;
RegTensor<uint32_t> tmp_ctr_1 = ctr_1;
RegTensor<uint32_t> tmp_ctr_2 = ctr_2;
RegTensor<uint32_t> tmp_ctr_3 = ctr_3;
// Per-lane counter = base counter + index: add the index to word 0, then add zero to words 1..3
// with pd passed in and out so the carry ripples upwards.
AddCarryOut(pd, tmp_ctr_0, ctr_0, (RegTensor<uint32_t>&)inc_idx, pg);
AddCarryOuts(pd, tmp_ctr_1, ctr_1, v_zero, pd, pg);
AddCarryOuts(pd, tmp_ctr_2, ctr_2, v_zero, pd, pg);
AddCarryOuts(pd, tmp_ctr_3, ctr_3, v_zero, pd, pg);
// The key is restarted from the broadcast value for every iteration and bumped once per round.
RegTensor<uint32_t> tmp_key_0 = key_0;
RegTensor<uint32_t> tmp_key_1 = key_1;
// 7 rounds: multiply counter words 0 and 2 by the two constants, xor the high halves with
// counter words 3 / 1 and the key, keep the low halves as the new words 3 / 1.
for (uint16_t j = 0; j < 7; j++) {
RegTensor<uint32_t> tmp_l0, tmp_h0, tmp_l1, tmp_h1;
Mull(tmp_l0, tmp_h0, tmp_ctr_0, v_const_mul_0, pg);
Mull(tmp_l1, tmp_h1, tmp_ctr_2, v_const_mul_1, pg);
Xor(tmp_h1, tmp_h1, tmp_ctr_1, pg);
Xor(tmp_ctr_0, tmp_h1, tmp_key_0, pg);
Xor(tmp_h0, tmp_h0, tmp_ctr_3, pg);
Xor(tmp_ctr_2, tmp_h0, tmp_key_1, pg);
tmp_ctr_1 = tmp_l1;
tmp_ctr_3 = tmp_l0;
Adds(tmp_key_0, tmp_key_0, (uint32_t)PHILOX_CONST_KEY_ADD_0, pg);
Adds(tmp_key_1, tmp_key_1, (uint32_t)PHILOX_CONST_KEY_ADD_1, pg);
}
// Re-arrange the four output registers with two interleave stages.
// NOTE(review): presumably this makes the four output words of each lane adjacent in memory
// order - confirm against the Interleave semantics.
Interleave(tmp_ctr_0, tmp_ctr_2, tmp_ctr_0, tmp_ctr_2);
Interleave(tmp_ctr_1, tmp_ctr_3, tmp_ctr_1, tmp_ctr_3);
Interleave(tmp_ctr_0, tmp_ctr_1, tmp_ctr_0, tmp_ctr_1);
Interleave(tmp_ctr_2, tmp_ctr_3, tmp_ctr_2, tmp_ctr_3);
// View the random words as bytes so that each byte yields one mask bit.
RegTensor<uint8_t> tmp_ctr_0_u8 = (RegTensor<uint8_t>&) tmp_ctr_0;
RegTensor<uint8_t> tmp_ctr_1_u8 = (RegTensor<uint8_t>&) tmp_ctr_1;
RegTensor<uint8_t> tmp_ctr_2_u8 = (RegTensor<uint8_t>&) tmp_ctr_2;
RegTensor<uint8_t> tmp_ctr_3_u8 = (RegTensor<uint8_t>&) tmp_ctr_3;
// Mask bit is set where random byte <= threshold.
CompareScalar<uint8_t, CMPMODE::LE>(pm_0, tmp_ctr_0_u8, probValueUint8Scalar, preg);
CompareScalar<uint8_t, CMPMODE::LE>(pm_1, tmp_ctr_1_u8, probValueUint8Scalar, preg);
CompareScalar<uint8_t, CMPMODE::LE>(pm_2, tmp_ctr_2_u8, probValueUint8Scalar, preg);
CompareScalar<uint8_t, CMPMODE::LE>(pm_3, tmp_ctr_3_u8, probValueUint8Scalar, preg);
// Store the four mask registers back to back; each store post-updates `mask` by 32.
StoreAlign<uint32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE>((__ubuf__ uint32_t *&)mask, pm_0, 32);
StoreAlign<uint32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE>((__ubuf__ uint32_t *&)mask, pm_1, 32);
StoreAlign<uint32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE>((__ubuf__ uint32_t *&)mask, pm_2, 32);
StoreAlign<uint32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE>((__ubuf__ uint32_t *&)mask, pm_3, 32);
}
}
* @ingroup GenMaskByIndexVec
* @brief generate dropout mask by indexVec, use philox
* @param [out] dstLocal, output LocalTensor
* @param [in] indexUb, input index LocalTensor generated by GenIndexVec interface
* @param [in] philoxkey, philox input parameters key 64bits --> two uint32
* @param [in] philoxCounter, philox input parameters counter 128bits --> four uint32
* @param [in] count, The number of uint32 random numbers expected to be generated, 4 aligned
* @param [in] probValueUint8Scalar, keepprob*MAX_UINT8
*/
__aicore__ inline void GenMaskByIndexVec(const LocalTensor<uint8_t> &dstLocal, const LocalTensor<int32_t> &indexUb,
const PhiloxKey &philoxkey, const PhiloxCounter &philoxCounter, uint16_t count,
uint8_t probValueUint8Scalar)
{
__ubuf__ uint32_t *mask = (__ubuf__ uint32_t *)dstLocal.GetPhyAddr();
uint64_t indexVecLocalInt = reinterpret_cast<uint64_t>(indexUb.GetPhyAddr());
uint16_t mainLoop = CeilDiv(count, 256);
GenMaskVF(mask, indexVecLocalInt, philoxkey[0], philoxkey[1], philoxCounter[0], philoxCounter[1], philoxCounter[2],
philoxCounter[3], count, probValueUint8Scalar, mainLoop);
}
/**
 * @brief Generate the drop mask of the current tile on chip.
 *
 * Steps: (1) derive the per-row sizes, (2) rebuild the index vector when required, (3) split the
 * seed and the scaled offset into 32-bit words and run GenMaskByIndexVec.
 *
 * The index vector is rebuilt
 *  - only for taskId == 0 when the layout is not TND and both actual sizes are multiples of the
 *    base sizes;
 *  - otherwise on the first S2 iteration and on the iteration equal to s2LoopLimit.
 *
 * @param [in,out] dropMaskBuf   buffer that receives the mask
 * @param [in,out] maskIndexBuf  buffer that holds the index vector
 * @param [in]     maskOffset    element offset of the tile; divided (rounding up) by
 *                               philoxRandomNumAlignSize to form the counter
 * @param [in]     runInfo       per-iteration indices and actual sizes
 * @param [in]     constInfo     constant tiling information
 * @param [in]     dropMaskInfo  seed and keep-probability threshold
 */
template <bool hasRope = false>
__aicore__ inline void GenDropMask(TBuf<> &dropMaskBuf, TBuf<> &maskIndexBuf, uint64_t maskOffset,
                                   RunInfo<false> &runInfo, ConstInfo<false, hasRope> &constInfo,
                                   DropMaskInfo &dropMaskInfo)
{
    // (1) Per-row sizes: uint32 words per row (rounded up to dropMaskUint32AlignSize), index
    //     entries per row, and the index step between consecutive rows.
    uint32_t wordsPerRow = CeilDiv(CeilDiv(constInfo.s2BaseSize, uint32Uint8Ratio),
                                   dropMaskUint32AlignSize) * dropMaskUint32AlignSize;
    uint32_t indicesPerRow = CeilDiv(wordsPerRow, 4);
    int32_t rowIndexStep = CeilDiv(runInfo.actualS2Size, philoxRandomNumAlignSize);

    // (2) Rebuild the index vector only when needed.
    LocalTensor<int32_t> indexVec = maskIndexBuf.template Get<int32_t>();
    const bool wholeTilesOnly = constInfo.layoutType != (uint8_t)LayOutTypeEnum::LAYOUT_TND &&
                                runInfo.actualS1Size % constInfo.s1BaseSize == 0 &&
                                runInfo.actualS2Size % constInfo.s2BaseSize == 0;
    const bool rebuildIndex = wholeTilesOnly
                                  ? (runInfo.taskId == 0)
                                  : (runInfo.s2LoopCount == 0 || runInfo.s2LoopCount == runInfo.s2LoopLimit);
    if (rebuildIndex) {
        GenIndexVec(indexVec, runInfo.halfS1RealSize, indicesPerRow, rowIndexStep);
    }

    // (3) Key = seed, counter = scaled offset; both split into low/high 32-bit words.
    uint64_t counterBase = CeilDiv(maskOffset, philoxRandomNumAlignSize);
    uint64_t seedBits = static_cast<uint64_t>(dropMaskInfo.seed);
    uint32_t seedLow = static_cast<uint32_t>(seedBits & 0xffffffff);
    uint32_t seedHigh = static_cast<uint32_t>(seedBits >> 32);
    uint32_t offsetLow = static_cast<uint32_t>(counterBase & 0xffffffff);
    uint32_t offsetHigh = static_cast<uint32_t>(counterBase >> 32);

    LocalTensor<uint8_t> maskUb = dropMaskBuf.template Get<uint8_t>();
    GenMaskByIndexVec(maskUb, indexVec, {seedLow, seedHigh}, {offsetLow, offsetHigh, 0, 0},
                      runInfo.halfS1RealSize * wordsPerRow, dropMaskInfo.keepProbUint8);
}
/**
 * @brief Compile-time gated entry point for on-chip drop-mask generation.
 *
 * When hasDrop is true, computes the tile offset with ComputeDropOffset and forwards to the
 * six-argument GenDropMask overload. When hasDrop is false the function compiles to nothing.
 *
 * @param [in,out] dropMaskBuf   buffer that receives the mask
 * @param [in,out] maskIndexBuf  buffer that holds the index vector
 * @param [in]     runInfo       per-iteration indices and actual sizes
 * @param [in]     constInfo     constant tiling information
 * @param [in]     dropMaskInfo  seed, base offset and keep-probability threshold
 */
template <bool hasDrop, bool hasRope = false>
__aicore__ inline void GenDropMask(TBuf<> &dropMaskBuf, TBuf<> &maskIndexBuf, RunInfo<false> &runInfo,
                                   ConstInfo<false, hasRope> &constInfo, DropMaskInfo &dropMaskInfo)
{
    if constexpr (hasDrop) {
        // ComputeDropOffset returns uint64_t and the six-argument overload takes uint64_t, so the
        // offset is kept unsigned end to end instead of bouncing through a signed temporary.
        const uint64_t dropMaskOffset = ComputeDropOffset<hasDrop>(runInfo, constInfo, dropMaskInfo);
        GenDropMask(dropMaskBuf, maskIndexBuf, dropMaskOffset, runInfo, constInfo, dropMaskInfo);
    }
}
/**
 * @brief Copy a byte-per-element drop-mask tile from global memory into a local tensor.
 *
 * Does nothing when either tile dimension is zero. When the full S2 length is a multiple of
 * blockBytes, a plain DataCopy is issued with the block length and source stride expressed in
 * blockBytes units; otherwise DataCopyPad is used with the block length and source stride
 * given as s2Size and (totalS2Size - s2Size) directly. In both cases the destination stride is
 * CeilDiv(s2BaseSize, blockBytes) minus the blockBytes-unit row length.
 *
 * @param [out] dstTensor    destination local tensor
 * @param [in]  srcTensor    source global tensor
 * @param [in]  srcOffset    element offset of the tile inside srcTensor
 * @param [in]  s1Size       number of rows to copy
 * @param [in]  s2Size       number of elements per row to copy
 * @param [in]  totalS2Size  full row length in the source
 * @param [in]  s2BaseSize   row length of the destination tile
 */
__aicore__ inline void BoolCopyIn(LocalTensor<uint8_t> &dstTensor, GlobalTensor<uint8_t> &srcTensor,
                                  int64_t srcOffset, uint32_t s1Size, uint32_t s2Size, int64_t totalS2Size,
                                  int64_t s2BaseSize)
{
    if (s1Size == 0 || s2Size == 0) {
        return;
    }
    DataCopyParams dataCopyParams;
    dataCopyParams.blockCount = s1Size;
    dataCopyParams.blockLen = CeilDiv(s2Size, blockBytes);
    dataCopyParams.dstStride = CeilDiv(s2BaseSize, blockBytes) - dataCopyParams.blockLen;
    if (totalS2Size % blockBytes == 0) {
        // Source rows are block aligned: strides can be expressed in whole blocks.
        dataCopyParams.srcStride = (totalS2Size - s2Size) / blockBytes;
        DataCopy(dstTensor, srcTensor[srcOffset], dataCopyParams);
    } else {
        // Source rows are not block aligned: switch the source-side parameters to element counts
        // and use the padding-capable copy (default-constructed pad parameters).
        dataCopyParams.blockLen = s2Size;
        dataCopyParams.srcStride = totalS2Size - s2Size;
        DataCopyPadParams dataCopyPadParams;
        DataCopyPad(dstTensor, srcTensor[srcOffset], dataCopyParams, dataCopyPadParams);
    }
}
/**
 * @brief Copy a bit-packed drop-mask tile from global memory into a local tensor.
 *
 * All element counts are divided by byteBitRatio to obtain byte counts. Does nothing when either
 * tile dimension is zero. When the source row length in bytes is a multiple of blockBytes a plain
 * DataCopy is issued; otherwise DataCopyPad is used with isPad set and zero right padding /
 * padding value.
 *
 * @param [out] dstTensor    destination local tensor
 * @param [in]  srcTensor    source global tensor
 * @param [in]  srcOffset    offset of the tile in mask elements; divided by byteBitRatio for addressing
 * @param [in]  s1Size       number of rows to copy
 * @param [in]  s2Size       number of mask elements per row to copy
 * @param [in]  totalS2Size  full row length in mask elements in the source
 * @param [in]  s2BaseSize   row length in mask elements of the destination tile
 */
__aicore__ inline void Bit2Int8CopyIn(LocalTensor<uint8_t> &dstTensor, GlobalTensor<uint8_t> &srcTensor,
                                      int64_t srcOffset, uint32_t s1Size, uint32_t s2Size, int64_t totalS2Size,
                                      int64_t s2BaseSize)
{
    if (s1Size == 0 || s2Size == 0) {
        return;
    }

    DataCopyParams copyParams;
    copyParams.blockCount = s1Size;
    copyParams.blockLen = CeilDiv(s2Size / byteBitRatio, blockBytes);
    copyParams.dstStride = CeilDiv(s2BaseSize / byteBitRatio, blockBytes) - copyParams.blockLen;

    const bool srcRowBlockAligned = (totalS2Size / byteBitRatio % blockBytes == 0);
    if (!srcRowBlockAligned) {
        // Unaligned source rows: byte-granular length / stride plus explicit pad parameters.
        copyParams.blockLen = CeilDiv(s2Size, byteBitRatio);
        copyParams.srcStride = (totalS2Size - s2Size) / byteBitRatio;
        DataCopyPadParams padParams;
        padParams.isPad = true;
        padParams.rightPadding = 0;
        padParams.paddingValue = 0;
        DataCopyPad(dstTensor, srcTensor[srcOffset / byteBitRatio], copyParams, padParams);
        return;
    }

    // Aligned source rows: the gap between rows is expressed in whole blocks.
    copyParams.srcStride =
        (totalS2Size / byteBitRatio - copyParams.blockLen * blockBytes) / blockBytes;
    DataCopy(dstTensor, srcTensor[srcOffset / byteBitRatio], copyParams);
}
/**
 * @brief Compute the element offset of the current tile inside an externally supplied drop mask.
 *
 * Unlike ComputeDropOffset, the row pitch is the actual (unaligned) S2 length and the batch part
 * uses b1SSOffset. Returns 0 when hasDrop is false.
 *
 * @param [in] runInfo       per-iteration indices and actual sizes
 * @param [in] constInfo     constant tiling information (n2G, gSize, s1BaseSize, s2BaseSize)
 * @param [in] dropMaskInfo  not read by this function
 * @return element offset of the tile, or 0 when dropout is disabled at compile time
 */
template <bool hasDrop, bool hasRope = false>
__aicore__ inline int64_t ComputeOuterDropOffset(RunInfo<false> &runInfo, ConstInfo<false, hasRope> &constInfo,
                                                 DropMaskInfo &dropMaskInfo)
{
    if constexpr (!hasDrop) {
        return 0;
    } else {
        const int64_t batchPart = runInfo.b1SSOffset * constInfo.n2G;
        const int64_t n2Part = runInfo.n2oIdx * constInfo.gSize * runInfo.actualS1Size * runInfo.actualS2Size;
        const int64_t gPart = runInfo.goIdx * runInfo.actualS1Size * runInfo.actualS2Size;
        const int64_t s1Part =
            (runInfo.s1oIdx * constInfo.s1BaseSize + runInfo.vecCoreOffset) * runInfo.actualS2Size;
        const int64_t s2Part = runInfo.s2StartIdx + runInfo.s2LoopCount * constInfo.s2BaseSize;

        int64_t tileOffset = batchPart;
        tileOffset += n2Part;
        tileOffset += gPart;
        tileOffset += s1Part;
        tileOffset += s2Part;
        return tileOffset;
    }
}
/**
 * @brief VF kernel: convert a byte-per-element mask into mask-register form.
 *
 * Per iteration one register is loaded from srcUb (post-updated by OFFSET_64 uint32 elements),
 * viewed as bytes, compared for equality with 1, and the resulting mask register is stored to
 * dstUb (post-updated by OFFSET_32).
 *
 * @param [in] srcUb      source address passed as an integer
 * @param [in] dstUb      destination address passed as an integer
 * @param [in] loopCount  number of registers to convert
 */
__simd_vf__ inline void DropMaskBool2BitVF(const uint64_t srcUb, const uint64_t dstUb, const uint16_t loopCount)
{
    MaskReg allBytes = CreateMask<uint8_t, MaskPattern::ALL>();
    MaskReg keepBits;
    RegTensor<uint32_t> loadedWords;
    for (uint16_t iter = 0; iter < loopCount; ++iter) {
        LoadAlign<uint32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE>(
            loadedWords, (__ubuf__ uint32_t *&)srcUb, OFFSET_64);
        // Reinterpret the loaded words as individual mask bytes.
        RegTensor<uint8_t> maskBytes = (RegTensor<uint8_t> &)loadedWords;
        CompareScalar<uint8_t, CMPMODE::EQ>(keepBits, maskBytes, 1, allBytes);
        StoreAlign<uint32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE>(
            (__ubuf__ uint32_t *&)dstUb, keepBits, OFFSET_32);
    }
}
/**
 * @brief Convert a byte-per-element (0/1) drop mask tile into its packed form via DropMaskBool2BitVF.
 *
 * The VF kernel converts one register worth of source bytes per iteration, so the number of
 * iterations is the total number of source bytes (rows * s2BaseSize) divided by regBytes, rounded
 * up. The previous formula, (rows + 1) / (regBytes / s2BaseSize), was a ceiling division only when
 * exactly two rows fit into one register; it under-counted for smaller rows, ran one iteration too
 * many when a row filled a register, and divided by zero when s2BaseSize exceeded regBytes.
 * For two rows per register the result is unchanged.
 *
 * NOTE(review): assumes regBytes equals the number of bytes DropMaskBool2BitVF loads per iteration
 * (OFFSET_64 uint32 elements) and that rows are stored contiguously with a pitch of s2BaseSize
 * bytes - confirm.
 *
 * @param [out] dstTensor       destination local tensor (packed mask)
 * @param [in]  srcTensor       source local tensor (one byte per mask element)
 * @param [in]  halfS1RealSize  number of rows to convert; nothing is done when <= 0
 * @param [in]  s2BaseSize      row length in bytes of the source tile; nothing is done when <= 0
 */
__aicore__ inline void DropMaskBool2Bit(LocalTensor<uint8_t> &dstTensor, LocalTensor<uint8_t> &srcTensor,
                                        int32_t halfS1RealSize, int64_t s2BaseSize)
{
    if (halfS1RealSize <= 0 || s2BaseSize <= 0) {
        return;
    }
    uint64_t srcUb = reinterpret_cast<uint64_t>(srcTensor.GetPhyAddr());
    uint64_t dstUb = reinterpret_cast<uint64_t>(dstTensor.GetPhyAddr());
    // Ceiling division of the total byte count by the register width.
    const uint64_t bytesPerLoop = static_cast<uint64_t>(regBytes);
    const uint64_t totalBytes = static_cast<uint64_t>(halfS1RealSize) * static_cast<uint64_t>(s2BaseSize);
    const uint16_t loopCount = static_cast<uint16_t>((totalBytes + bytesPerLoop - 1) / bytesPerLoop);
    DropMaskBool2BitVF(srcUb, dstUb, loopCount);
}
/**
 * @brief VF kernel: merge pairs of loaded mask registers into one stored mask register.
 *
 * Per iteration two mask registers are loaded from srcUb (DIST_US distribution, each load
 * post-updating the address by OFFSET_32), de-interleaved at uint8 granularity, and the first
 * de-interleave output is stored to dstUb (post-updated by OFFSET_32). The second output is unused.
 * NOTE(review): presumably this removes the per-row padding introduced by the copy-in - confirm.
 *
 * @param [in] srcUb      source address passed as an integer
 * @param [in] dstUb      destination address passed as an integer
 * @param [in] loopCount  number of register pairs to process
 */
__simd_vf__ inline void DropMaskPadDelVF(const uint64_t srcUb, const uint64_t dstUb, const uint16_t loopCount)
{
    MaskReg firstIn;
    MaskReg secondIn;
    MaskReg packedOut;
    MaskReg discardedOut;
    for (uint16_t iter = 0; iter < loopCount; ++iter) {
        LoadAlign<uint32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE, MicroAPI::MaskDist::DIST_US>(
            firstIn, (__ubuf__ uint32_t *&)srcUb, OFFSET_32);
        LoadAlign<uint32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE, MicroAPI::MaskDist::DIST_US>(
            secondIn, (__ubuf__ uint32_t *&)srcUb, OFFSET_32);
        MaskDeInterleave<uint8_t>(packedOut, discardedOut, firstIn, secondIn);
        StoreAlign<uint32_t, MicroAPI::PostLiteral::POST_MODE_UPDATE>(
            (__ubuf__ uint32_t *&)dstUb, packedOut, OFFSET_32);
    }
}
/**
 * @brief Run DropMaskPadDelVF over a bit-packed drop-mask tile.
 *
 * The kernel is invoked with ceil(halfS1RealSize / 2) iterations.
 *
 * @param [out] dstTensor       destination local tensor
 * @param [in]  srcTensor       source local tensor filled by the bit-packed copy-in
 * @param [in]  halfS1RealSize  number of rows in the tile (narrowed to uint16_t)
 * @param [in]  s2BaseSize      not used by this function
 */
__aicore__ inline void DropMaskPadDel(LocalTensor<uint8_t> &dstTensor, LocalTensor<uint8_t> &srcTensor,
                                      int32_t halfS1RealSize, int64_t s2BaseSize)
{
    const uint64_t dstUb = reinterpret_cast<uint64_t>(dstTensor.GetPhyAddr());
    const uint64_t srcUb = reinterpret_cast<uint64_t>(srcTensor.GetPhyAddr());
    const uint16_t rowCount = static_cast<uint16_t>(halfS1RealSize);
    // Rows are processed two at a time; an odd row count gets one extra iteration.
    const uint16_t pairCount = (rowCount + 1) / 2;
    DropMaskPadDelVF(srcUb, dstUb, pairCount);
}
/**
 * @brief Copy an externally supplied drop mask tile into UB and convert it into dropMaskBuf.
 *
 * Active only when hasDrop is true. The tile is copied into a tensor allocated from
 * dropMaskInQue (byte-per-element copy when dropMaskInfo.boolMode is set, bit-packed copy
 * otherwise), passed through EnQue/DeQue, converted into dropMaskBuf with the matching
 * conversion routine, and the queue tensor is freed.
 *
 * @param [in,out] dropMaskBuf    buffer that receives the converted mask
 * @param [in,out] dropMaskInQue  input queue that provides the staging tensor
 * @param [in]     srcTensor      global tensor holding the external mask
 * @param [in]     runInfo        per-iteration indices and actual sizes
 * @param [in]     constInfo      constant tiling information
 * @param [in]     dropMaskInfo   selects the mask format through boolMode
 */
template <bool hasDrop, bool hasRope = false>
__aicore__ inline void CopyInDropOuter(TBuf<> &dropMaskBuf, TQue<QuePosition::VECIN, 1> &dropMaskInQue,
                                       GlobalTensor<uint8_t>& srcTensor, RunInfo<false> &runInfo,
                                       ConstInfo<false, hasRope> &constInfo, DropMaskInfo &dropMaskInfo)
{
    if constexpr (hasDrop) {
        int64_t tileOffset = ComputeOuterDropOffset<hasDrop>(runInfo, constInfo, dropMaskInfo);
        LocalTensor<uint8_t> convertedMask = dropMaskBuf.template Get<uint8_t>();
        LocalTensor<uint8_t> stagedMask = dropMaskInQue.template AllocTensor<uint8_t>();

        // Stage 1: global memory -> staging tensor, in the format selected by boolMode.
        if (unlikely(dropMaskInfo.boolMode)) {
            BoolCopyIn(stagedMask, srcTensor, tileOffset, runInfo.halfS1RealSize, runInfo.s2RealSize,
                       runInfo.actualS2Size, constInfo.s2BaseSize);
        } else {
            Bit2Int8CopyIn(stagedMask, srcTensor, tileOffset, runInfo.halfS1RealSize, runInfo.s2RealSize,
                           runInfo.actualS2Size, constInfo.s2BaseSize);
        }

        // Stage 2: hand the tensor through the queue before it is read by the vector kernels.
        dropMaskInQue.template EnQue(stagedMask);
        dropMaskInQue.template DeQue<uint8_t>();

        // Stage 3: convert the staged data into dropMaskBuf.
        if (unlikely(dropMaskInfo.boolMode)) {
            DropMaskBool2Bit(convertedMask, stagedMask, runInfo.halfS1RealSize, constInfo.s2BaseSize);
        } else {
            DropMaskPadDel(convertedMask, stagedMask, runInfo.halfS1RealSize, constInfo.s2BaseSize);
        }

        dropMaskInQue.template FreeTensor(stagedMask);
    }
}
}
#endif