/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file cdist.h
 * \brief
 */
#ifndef __CDIST_H__
#define __CDIST_H__

#include "kernel_operator.h"
#include "op_kernel/platform_util.h"
#include "../cdist_tiling_data.h"

#ifndef INFINITY
#define INFINITY (__builtin_inff())
#endif

namespace NsCdist {

using namespace AscendC;


template <typename T>
class Cdist {
public:
    __aicore__ inline Cdist(){};
    __aicore__ inline void Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR y, const CdistTilingData* tilingData, TPipe* pipe);
    __aicore__ inline void Process();

private:
    __aicore__ inline void ParseTilingData(const CdistTilingData* tilingData);
    __aicore__ inline void CopyInX1(uint32_t Offset);
    __aicore__ inline void CopyInX2(uint32_t Offset);
    __aicore__ inline void CopyOut(uint32_t Offset);
    __aicore__ inline void CastY();
    __aicore__ inline void CastXToB32();
    __aicore__ inline void Compute();
    __aicore__ inline void ComputeSplitM();
    __aicore__ inline void ProcessSplitM(uint32_t bOffset, uint32_t pOffset, uint32_t rOffsetBlock, uint32_t blockFactorR);
    __aicore__ inline void ProcessNoSplitM(uint32_t bOffset, uint32_t pOffset, uint32_t rOffsetBlock, uint32_t blockFactorR);
    __aicore__ inline void CalSplitMResult(int32_t processNum);
    __aicore__ inline void ComputeOneSize(__local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr);
    __aicore__ inline void ComputePNorm2(uint32_t b, uint32_t p, uint32_t r, uint16_t loopNumM, uint32_t tailNumM,
        __local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr);
    __aicore__ inline void ComputePNorm1(uint32_t b, uint32_t p, uint32_t r, uint16_t loopNumM, uint32_t tailNumM,
        __local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr);
    __aicore__ inline void ComputePNorm0(uint32_t b, uint32_t p, uint32_t r, uint16_t loopNumM, uint32_t tailNumM,
        __local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr);
    __aicore__ inline void ComputePNormInf(uint32_t b, uint32_t p, uint32_t r, uint16_t loopNumM, uint32_t tailNumM,
        __local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr);
    __aicore__ inline void ComputePNormOther(uint32_t b, uint32_t p, uint32_t r, uint16_t loopNumM, uint32_t tailNumM,
        __local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr);

private:
    constexpr static int32_t BUFFER_NUM = 2;
    constexpr static int32_t BLOCK_SIZE = 32;
    constexpr static uint32_t BASE_ONE = 1;
    constexpr static uint32_t LOOP_ZERO = 0;
    constexpr static ExpConfig expConfig = {ExpAlgo::PRECISION_1ULP_FTZ_FALSE};
    constexpr static LnConfig lnConfig = {LnAlgo::PRECISION_1ULP_FTZ_FALSE};
    constexpr static SqrtConfig sqrtConfig = {SqrtAlgo::PRECISION_0ULP_FTZ_FALSE};
    int64_t blockIdx_;
    TPipe* pipe_ = nullptr;
    TQue<QuePosition::VECIN, 1> x1Queue_;
    TQue<QuePosition::VECIN, 1> x2Queue_;
    TQue<QuePosition::VECOUT, 1> yQueue_;
    TQue<QuePosition::VECCALC, 1> tmpQueue_;
    TQue<QuePosition::VECCALC, 1> x1CastQueue_;
    TQue<QuePosition::VECCALC, 1> x2CastQueue_;
    TQue<QuePosition::VECCALC, 1> yCastQueue_;
    GlobalTensor<T> x1GM_;
    GlobalTensor<T> x2GM_;
    GlobalTensor<T> yGM_;
    LocalTensor<float> yFp32_;
    LocalTensor<float> tmpLocal_;
    const CdistTilingData* tiling_;
    int32_t vlLen_ = Ops::Base::GetVRegSize() / sizeof(float);
    uint32_t realCoreNum_ = 0;
    uint32_t B_ = 0;
    uint32_t P_ = 0;
    uint32_t R_ = 0;
    uint32_t RAlign_ = 0;
    uint32_t M_ = 0;
    uint32_t MAlign_ = 0;
    uint32_t blockMainNumB_ = 0;
    uint32_t blockTailNumB_ = 0;
    uint32_t blockMainFactorB_ = 0;
    uint32_t blockTailFactorB_ = 0;
    uint32_t blockMainNumP_ = 0;
    uint32_t blockTailNumP_ = 0;
    uint32_t blockMainFactorP_ = 0;
    uint32_t blockTailFactorP_ = 0;
    uint32_t blockMainNumR_ = 0;
    uint32_t blockTailNumR_ = 0;
    uint32_t blockMainFactorR_ = 0;
    uint32_t blockTailFactorR_ = 0;
    uint32_t ubLoopNumB_ = 0;
    uint32_t ubFactorB_ = 0;
    uint32_t ubTailFactorB_ = 0;
    uint32_t ubLoopNumP_ = 0;
    uint32_t ubFactorP_ = 0;
    uint32_t ubTailFactorP_ = 0;
    uint32_t ubLoopNumR_ = 0;
    uint32_t ubFactorR_ = 0;
    uint32_t ubTailFactorR_ = 0;
    uint32_t ubLoopNumM_ = 0;
    uint32_t ubFactorM_ = 0;
    uint32_t ubTailFactorM_ = 0;
    float p_ = 0;
    uint32_t bSize_ = 0;
    uint32_t pSize_ = 0;
    uint32_t rSize_ = 0;
    uint32_t mSize_ = 0;
    uint32_t ubFactorMAlign_ = 0;
    uint32_t ubFactorRAlign_ = 0;
    // Datacopy params
    DataCopyExtParams copyInParamsX1_{1, 0, 0, 0, 0};
    DataCopyExtParams copyInParamsX2_{1, 0, 0, 0, 0};
    LoopModeParams loopParamX1_{1, 0, 0, 0, 0, 0};
    LoopModeParams loopParamX2_{1, 0, 0, 0, 0, 0};
    DataCopyExtParams copyOutParams_{1, 0, 0, 0, 0};
    LoopModeParams loopParamOut_{1, 0, 0, 0, 0, 0};
    DataCopyPadExtParams<T> padParams_{false, 0, 0, 0};
};

template <typename T>
__aicore__ inline void Cdist<T>::Init(GM_ADDR x1, GM_ADDR x2, GM_ADDR y, const CdistTilingData* tilingData, TPipe* pipe)
{
    blockIdx_ = GetBlockIdx();
    pipe_ = pipe;
    tiling_ = tilingData;
    ParseTilingData(tiling_);
    x1GM_.SetGlobalBuffer((__gm__ T*)x1);
    x2GM_.SetGlobalBuffer((__gm__ T*)x2);
    yGM_.SetGlobalBuffer((__gm__ T*)y);
    if (ubLoopNumM_ == 1) {
        ubFactorMAlign_ = ((ubFactorM_ * sizeof(T) + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE) / sizeof(T);
    }
    else {
        ubFactorMAlign_ = ubFactorM_;
    }
    ubFactorRAlign_ = ((ubFactorR_ * sizeof(T) + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE) / sizeof(T);
    pipe_->InitBuffer(x1Queue_, BUFFER_NUM, ubFactorB_ * ubFactorP_ * ubFactorMAlign_ * sizeof(T));
    pipe_->InitBuffer(x2Queue_, BUFFER_NUM, ubFactorB_ * ubFactorR_ * ubFactorMAlign_ * sizeof(T));
    pipe_->InitBuffer(yQueue_, BUFFER_NUM, ubFactorB_ * ubFactorP_ * ubFactorRAlign_ * sizeof(T));
    pipe_->InitBuffer(tmpQueue_, 1, BLOCK_SIZE);
    if (sizeof(T) != sizeof(float)) {
        pipe_->InitBuffer(x1CastQueue_, 1, ubFactorB_ * ubFactorP_ * ubFactorMAlign_ * sizeof(float));
        pipe_->InitBuffer(x2CastQueue_, 1, ubFactorB_ * ubFactorR_ * ubFactorMAlign_ * sizeof(float));
        pipe_->InitBuffer(yCastQueue_, 1, ubFactorB_ * ubFactorP_ * ubFactorRAlign_ * sizeof(float));
        yFp32_ = yCastQueue_.AllocTensor<float>();
        Duplicate<float>(yFp32_, (float)0, ubFactorB_ * ubFactorP_ * ubFactorRAlign_);
        yCastQueue_.EnQue(yFp32_);
    } else {
        pipe_->InitBuffer(x1CastQueue_, 1, 0);
        pipe_->InitBuffer(x2CastQueue_, 1, 0);
        pipe_->InitBuffer(yCastQueue_, 1, 0);
        yFp32_ = yQueue_.AllocTensor<float>();
        Duplicate<float>(yFp32_, (float)0, ubFactorB_ * ubFactorP_ * ubFactorRAlign_);
        yQueue_.EnQue(yFp32_);
    }
    tmpLocal_ = tmpQueue_.AllocTensor<float>();
    Duplicate<float>(tmpLocal_, (float)0, 1);
    tmpQueue_.EnQue(tmpLocal_);
}

template <typename T>
__aicore__ inline void Cdist<T>::ParseTilingData(const CdistTilingData* tdPtr)
{
    B_ = tdPtr->B;
    P_ = tdPtr->P;
    R_ = tdPtr->R;
    M_ = tdPtr->M;
    blockMainNumB_ = tdPtr->blockMainNumB;
    blockTailNumB_ = tdPtr->blockTailNumB;
    blockMainFactorB_ = tdPtr->blockMainFactorB;
    blockTailFactorB_ = tdPtr->blockTailFactorB;
    blockMainNumP_ = tdPtr->blockMainNumP;
    blockTailNumP_ = tdPtr->blockTailNumP;
    blockMainFactorP_ = tdPtr->blockMainFactorP;
    blockTailFactorP_ = tdPtr->blockTailFactorP;
    blockMainNumR_ = tdPtr->blockMainNumR;
    blockTailNumR_ = tdPtr->blockTailNumR;
    blockMainFactorR_ = tdPtr->blockMainFactorR;
    blockTailFactorR_ = tdPtr->blockTailFactorR;
    ubLoopNumB_ = tdPtr->ubLoopNumB;
    ubFactorB_ = tdPtr->ubFactorB;
    ubTailFactorB_ = tdPtr->ubTailFactorB;
    ubLoopNumP_ = tdPtr->ubLoopNumP;
    ubFactorP_ = tdPtr->ubFactorP;
    ubTailFactorP_ = tdPtr->ubTailFactorP;
    ubLoopNumR_ = tdPtr->ubLoopNumR;
    ubFactorR_ = tdPtr->ubFactorR;
    ubTailFactorR_ = tdPtr->ubTailFactorR;
    ubLoopNumM_ = tdPtr->ubLoopNumM;
    ubFactorM_ = tdPtr->ubFactorM;
    ubTailFactorM_ = tdPtr->ubTailFactorM;
    p_ = tdPtr->p;
}

template <typename T>
__aicore__ inline void Cdist<T>::CopyInX1(uint32_t Offset)
{
    LocalTensor<T> x1Local = x1Queue_.AllocTensor<T>();
    copyInParamsX1_.blockCount = static_cast<uint16_t>(pSize_);
    copyInParamsX1_.blockLen = (ubLoopNumM_ == 1) ? static_cast<uint32_t>(M_ * sizeof(T)) : static_cast<uint32_t>(mSize_ * sizeof(T)); // unit Byte
    copyInParamsX1_.srcStride = (ubLoopNumM_ == 1) ? 0 : static_cast<uint32_t>((M_ - mSize_) * sizeof(T)); // unit Byte
    copyInParamsX1_.dstStride = 0; // unit block(32byte)
    loopParamX1_.loop1Size = static_cast<uint32_t>(bSize_);
    loopParamX1_.loop2Size = 1;
    loopParamX1_.loop1SrcStride = static_cast<uint64_t>((M_ * P_) * sizeof(T));
    loopParamX1_.loop2SrcStride = 0;
    loopParamX1_.loop1DstStride = static_cast<uint64_t>((pSize_ * MAlign_) * sizeof(T));
    loopParamX1_.loop2DstStride = 0;
    SetLoopModePara(loopParamX1_, DataCopyMVType::OUT_TO_UB);
    DataCopyPad(x1Local,x1GM_[Offset],copyInParamsX1_,padParams_);
    ResetLoopModePara(DataCopyMVType::OUT_TO_UB);
    x1Queue_.EnQue(x1Local);
}

template <typename T>
__aicore__ inline void Cdist<T>::CopyInX2(uint32_t Offset)
{
    LocalTensor<T> x2Local = x2Queue_.AllocTensor<T>();
    copyInParamsX2_.blockCount = static_cast<uint16_t>(rSize_);
    copyInParamsX2_.blockLen = (ubLoopNumM_ == 1) ? static_cast<uint32_t>(M_ * sizeof(T)) : static_cast<uint32_t>(mSize_ * sizeof(T)); // unit Byte
    copyInParamsX2_.srcStride = (ubLoopNumM_ == 1) ? 0 : static_cast<uint32_t>((M_ - mSize_) * sizeof(T)); // unit Byte
    copyInParamsX2_.dstStride = 0; // unit block(32byte)
    loopParamX2_.loop1Size = static_cast<uint32_t>(bSize_);
    loopParamX2_.loop2Size = 1;
    loopParamX2_.loop1SrcStride = static_cast<uint64_t>((M_ * R_) * sizeof(T));
    loopParamX2_.loop2SrcStride = 0;
    loopParamX2_.loop1DstStride = static_cast<uint64_t>((rSize_ * MAlign_) * sizeof(T));
    loopParamX2_.loop2DstStride = 0;
    SetLoopModePara(loopParamX2_, DataCopyMVType::OUT_TO_UB);
    DataCopyPad(x2Local,x2GM_[Offset],copyInParamsX2_,padParams_);
    ResetLoopModePara(DataCopyMVType::OUT_TO_UB);
    x2Queue_.EnQue(x2Local);
}

template <typename T>
__aicore__ inline void Cdist<T>::CopyOut(uint32_t Offset)
{
    LocalTensor<T> yLocal = yQueue_.DeQue<T>();
    copyOutParams_.blockCount = static_cast<uint16_t>(pSize_);
    copyOutParams_.blockLen = static_cast<uint32_t>(rSize_ * sizeof(T)); // unit Byte
    copyOutParams_.srcStride = 0;
    copyOutParams_.dstStride = static_cast<uint32_t>((R_ - rSize_) * sizeof(T));
    loopParamOut_.loop1Size = static_cast<uint32_t>(bSize_);
    loopParamOut_.loop2Size = 1;
    loopParamOut_.loop1SrcStride = static_cast<uint64_t>((pSize_ * RAlign_) * sizeof(T));
    loopParamOut_.loop2SrcStride = 0;
    loopParamOut_.loop1DstStride = static_cast<uint64_t>((P_ * R_) * sizeof(T));
    loopParamOut_.loop2DstStride = 0;
    SetLoopModePara(loopParamOut_, DataCopyMVType::UB_TO_OUT);
    DataCopyPad(yGM_[Offset], yLocal, copyOutParams_);
    ResetLoopModePara(DataCopyMVType::UB_TO_OUT);
    yQueue_.FreeTensor(yLocal);
}

template <typename T>
__aicore__ inline void Cdist<T>::CastY()
{
    LocalTensor<T> yLocal;
    if constexpr (sizeof(T) != sizeof(float)) {
        yLocal = yQueue_.AllocTensor<T>();
        yFp32_ = yCastQueue_.DeQue<float>();
        Cast(yLocal, yFp32_, RoundMode::CAST_RINT, (uint32_t)(ubFactorB_ * ubFactorP_ * ubFactorRAlign_));
    } else {
        yLocal = yQueue_.DeQue<T>();
    }
    yQueue_.EnQue(yLocal);
}

template <typename T>
__aicore__ inline void Cdist<T>::CastXToB32()
{
    LocalTensor<T> x1Local = x1Queue_.DeQue<T>();
    LocalTensor<T> x2Local = x2Queue_.DeQue<T>();
    LocalTensor<float> x1Cast;
    LocalTensor<float> x2Cast;
    if constexpr (sizeof(T) != sizeof(float)) {
        LocalTensor<float> x1Cast = x1CastQueue_.AllocTensor<float>();
        LocalTensor<float> x2Cast = x2CastQueue_.AllocTensor<float>();
        Cast(x1Cast, x1Local, RoundMode::CAST_NONE, (uint32_t)(ubFactorB_ * ubFactorP_ * ubFactorMAlign_));
        Cast(x2Cast, x2Local, RoundMode::CAST_NONE, (uint32_t)(ubFactorB_ * ubFactorR_ * ubFactorMAlign_));
        x1CastQueue_.EnQue(x1Cast);
        x2CastQueue_.EnQue(x2Cast);
    } else {
        x1Queue_.EnQue(x1Local);
        x2Queue_.EnQue(x2Local);
    }
    x1Queue_.FreeTensor(x1Local);
    x2Queue_.FreeTensor(x2Local);
}

template <typename T>
__aicore__ inline void Cdist<T>::Compute()
{
    LocalTensor<float> x1Local;
    LocalTensor<float> x2Local;
    if constexpr (sizeof(T) != sizeof(float)) {
        x1Local = x1CastQueue_.DeQue<float>();
        x2Local = x2CastQueue_.DeQue<float>();
        yFp32_ = yCastQueue_.DeQue<float>();
    } else {
        x1Local = x1Queue_.DeQue<T>();
        x2Local = x2Queue_.DeQue<T>();
        yFp32_ = yQueue_.DeQue<float>();
    }
    auto *srcPtrX1 = (__local_mem__ float *)x1Local.GetPhyAddr();
    auto *srcPtrX2 = (__local_mem__ float *)x2Local.GetPhyAddr();
    auto *dstPtr = (__local_mem__ float *)yFp32_.GetPhyAddr();
    ComputeOneSize(srcPtrX1, srcPtrX2, dstPtr);
    if constexpr (sizeof(T) != sizeof(float)) {
        yCastQueue_.EnQue(yFp32_);
        x1CastQueue_.FreeTensor(x1Local);
        x2CastQueue_.FreeTensor(x2Local);
    } else {
        yQueue_.EnQue(yFp32_);
        x1Queue_.FreeTensor(x1Local);
        x2Queue_.FreeTensor(x2Local);
    }
}

template <typename T>
__aicore__ inline void Cdist<T>::ComputeSplitM()
{
    int32_t processNum = bSize_ * pSize_ * rSize_;
    LocalTensor<float> x1LocalSplitM;
    LocalTensor<float> x2LocalSplitM;
    if constexpr (sizeof(T) != sizeof(float)) {
        x1LocalSplitM = x1CastQueue_.DeQue<float>();
        x2LocalSplitM = x2CastQueue_.DeQue<float>();
        yFp32_ = yCastQueue_.DeQue<float>();
    } else {
        x1LocalSplitM = x1Queue_.DeQue<T>();
        x2LocalSplitM = x2Queue_.DeQue<T>();
        yFp32_ = yQueue_.DeQue<float>();
    }
    tmpLocal_ = tmpQueue_.DeQue<float>();
    auto *dstPtr = (__local_mem__ float *)tmpLocal_.GetPhyAddr();
    auto *srcPtrX1 = (__local_mem__ float *)x1LocalSplitM.GetPhyAddr();
    auto *srcPtrX2 = (__local_mem__ float *)x2LocalSplitM.GetPhyAddr();
    ComputeOneSize(srcPtrX1, srcPtrX2, dstPtr);
    if (p_ == static_cast<float>(INFINITY)) {
        Max(yFp32_, tmpLocal_, yFp32_, processNum);
    } else {
        Add(yFp32_, tmpLocal_, yFp32_, processNum);
    }
    if constexpr (sizeof(T) != sizeof(float)) {
        yCastQueue_.EnQue(yFp32_);
        x1CastQueue_.FreeTensor(x1LocalSplitM);
        x2CastQueue_.FreeTensor(x2LocalSplitM);
    } else {
        yQueue_.EnQue(yFp32_);
        x1Queue_.FreeTensor(x1LocalSplitM);
        x2Queue_.FreeTensor(x2LocalSplitM);
    }
    tmpQueue_.EnQue(tmpLocal_);
}

template <typename T>
__aicore__ inline void Cdist<T>::ComputeOneSize(__local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr)
{
    uint16_t M = (ubLoopNumM_ == 1) ? M_ : mSize_;
    uint16_t loopNumM = M / vlLen_;
    uint32_t tailNumM = M - vlLen_ * loopNumM;
    for(uint32_t b = 0; b < bSize_; b++){
        for (uint32_t p = 0; p < pSize_; p++) {
            for (uint32_t r = 0; r < rSize_; r++) {
                if (p_ == 2.0f) {
                    ComputePNorm2(b, p, r, loopNumM, tailNumM, srcPtrX1, srcPtrX2, dstPtr);
                } else if (p_ == 1.0f) {
                    ComputePNorm1(b, p, r, loopNumM, tailNumM, srcPtrX1, srcPtrX2, dstPtr);
                } else if (p_ == 0.0f) {
                    ComputePNorm0(b, p, r, loopNumM, tailNumM, srcPtrX1, srcPtrX2, dstPtr);
                } else if (p_ == static_cast<float>(INFINITY)) {
                    ComputePNormInf(b, p, r, loopNumM, tailNumM, srcPtrX1, srcPtrX2, dstPtr);
                } else {
                    ComputePNormOther(b, p, r, loopNumM, tailNumM, srcPtrX1, srcPtrX2, dstPtr);
                }
            }
        }
    }
}

template <typename T>
__aicore__ inline void Cdist<T>::ComputePNorm2(uint32_t b, uint32_t p, uint32_t r, uint16_t loopNumM, uint32_t tailNumM,
    __local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr)
{
    uint32_t maksTailNumNorm2 = tailNumM;
    uint32_t maskOneNumNorm2 = BASE_ONE;
    __local_mem__ float * yOffsetNorm2 = dstPtr + b * pSize_ * RAlign_ + p * RAlign_ + r;
    __VEC_SCOPE__
    {
        MicroAPI::RegTensor<float> x1RegNorm2;
        MicroAPI::RegTensor<float> x2RegNorm2;
        MicroAPI::RegTensor<float> subRegNorm2;
        MicroAPI::RegTensor<float> mulRegNorm2;
        MicroAPI::RegTensor<float> sumRegNorm2;
        MicroAPI::RegTensor<float> resultRegNorm2;
        MicroAPI::RegTensor<float> dstRegNorm2;
        MicroAPI::MaskReg maskAllNorm2 = MicroAPI::CreateMask<float, MicroAPI::MaskPattern::ALL>();
        MicroAPI::MaskReg maskTailNorm2;
        MicroAPI::MaskReg maskOneNorm2;
        MicroAPI::UnalignRegForStore uRegNorm2;
        maskTailNorm2 = MicroAPI::UpdateMask<float>(maksTailNumNorm2);
        maskOneNorm2 = MicroAPI::UpdateMask<float>(maskOneNumNorm2);
        static constexpr MicroAPI::SqrtSpecificMode modesqrt = {MicroAPI::MaskMergeMode::ZEROING, true, SqrtAlgo::PRECISION_0ULP_FTZ_FALSE};
        MicroAPI::Duplicate(dstRegNorm2, (float)0);
        for (uint16_t m = 0; m < loopNumM; m++) {
            __local_mem__ float * x1OffsetNorm2 = srcPtrX1 + b * pSize_ * MAlign_ + p * MAlign_ + vlLen_ * m;
            __local_mem__ float * x2OffsetNorm2 = srcPtrX2 + b * rSize_ * MAlign_ + r * MAlign_ + vlLen_ * m;
            MicroAPI::LoadAlign(x1RegNorm2, x1OffsetNorm2);
            MicroAPI::LoadAlign(x2RegNorm2, x2OffsetNorm2);
            MicroAPI::Sub(subRegNorm2, x1RegNorm2, x2RegNorm2, maskAllNorm2);
            MicroAPI::Mul(mulRegNorm2, subRegNorm2, subRegNorm2, maskAllNorm2);
            MicroAPI::ReduceSum(sumRegNorm2, mulRegNorm2, maskAllNorm2);
            MicroAPI::Add(dstRegNorm2, dstRegNorm2, sumRegNorm2, maskOneNorm2);
        }
        __local_mem__ float * x1OffsetNorm2 = srcPtrX1 + b * pSize_ * MAlign_ + p * MAlign_ + vlLen_ * loopNumM;
        __local_mem__ float * x2OffsetNorm2 = srcPtrX2 + b * rSize_ * MAlign_ + r * MAlign_ + vlLen_ * loopNumM;
        MicroAPI::LoadAlign(x1RegNorm2, x1OffsetNorm2);
        MicroAPI::LoadAlign(x2RegNorm2, x2OffsetNorm2);
        MicroAPI::Sub(subRegNorm2, x1RegNorm2, x2RegNorm2, maskTailNorm2);
        MicroAPI::Mul(mulRegNorm2, subRegNorm2, subRegNorm2, maskTailNorm2);
        MicroAPI::ReduceSum(sumRegNorm2, mulRegNorm2, maskTailNorm2);
        MicroAPI::Add(dstRegNorm2, dstRegNorm2, sumRegNorm2, maskOneNorm2);
        if (ubLoopNumM_ == 1) {
            MicroAPI::Sqrt<float, &modesqrt>(resultRegNorm2, dstRegNorm2, maskOneNorm2);
            MicroAPI::StoreUnAlign(yOffsetNorm2, resultRegNorm2, uRegNorm2, BASE_ONE);
        } else {
            MicroAPI::StoreUnAlign(yOffsetNorm2, dstRegNorm2, uRegNorm2, BASE_ONE);
        }
        MicroAPI::StoreUnAlignPost(yOffsetNorm2, uRegNorm2, 0);
    }
}

template <typename T>
__aicore__ inline void Cdist<T>::ComputePNorm1(uint32_t b, uint32_t p, uint32_t r, uint16_t loopNumM, uint32_t tailNumM,
    __local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr)
{
    uint32_t maksTailNumNorm1= tailNumM;
    uint32_t maskOneNumNorm1 = BASE_ONE;
    __local_mem__ float * yOffsetNorm1 = dstPtr + b * pSize_ * RAlign_ + p * RAlign_ + r;
    __VEC_SCOPE__
    {
        MicroAPI::RegTensor<float> x1RegNorm1;
        MicroAPI::RegTensor<float> x2RegNorm1;
        MicroAPI::RegTensor<float> subRegNorm1;
        MicroAPI::RegTensor<float> absRegNorm1;
        MicroAPI::RegTensor<float> sumRegNorm1;
        MicroAPI::RegTensor<float> dstRegNorm1;
        MicroAPI::MaskReg maskAllNorm1 = MicroAPI::CreateMask<float, MicroAPI::MaskPattern::ALL>();
        MicroAPI::MaskReg maskTailNorm1;
        MicroAPI::MaskReg maskOneNorm1;
        MicroAPI::UnalignRegForStore uRegNorm1;
        maskTailNorm1 = MicroAPI::UpdateMask<float>(maksTailNumNorm1);
        maskOneNorm1 = MicroAPI::UpdateMask<float>(maskOneNumNorm1);
        MicroAPI::Duplicate(dstRegNorm1, (float)0 );
        for (uint16_t m = 0; m < loopNumM; m++) {
            __local_mem__ float * x1OffsetNorm1 = srcPtrX1 + b * pSize_ * MAlign_ + p * MAlign_ + vlLen_ * m;
            __local_mem__ float * x2OffsetNorm1 = srcPtrX2 + b * rSize_ * MAlign_ + r * MAlign_ + vlLen_ * m;
            MicroAPI::LoadAlign(x1RegNorm1, x1OffsetNorm1);
            MicroAPI::LoadAlign(x2RegNorm1, x2OffsetNorm1);
            MicroAPI::Sub(subRegNorm1, x1RegNorm1, x2RegNorm1, maskAllNorm1);
            MicroAPI::Abs(absRegNorm1, subRegNorm1, maskAllNorm1);
            MicroAPI::ReduceSum(sumRegNorm1, absRegNorm1, maskAllNorm1);
            MicroAPI::Add(dstRegNorm1, dstRegNorm1, sumRegNorm1, maskOneNorm1);
        }
        __local_mem__ float * x1OffsetNorm1 = srcPtrX1 + b * pSize_ * MAlign_ + p * MAlign_ + vlLen_ * loopNumM;
        __local_mem__ float * x2OffsetNorm1 = srcPtrX2 + b * rSize_ * MAlign_ + r * MAlign_ + vlLen_ * loopNumM;
        MicroAPI::LoadAlign(x1RegNorm1, x1OffsetNorm1);
        MicroAPI::LoadAlign(x2RegNorm1, x2OffsetNorm1);
        MicroAPI::Sub(subRegNorm1, x1RegNorm1, x2RegNorm1, maskTailNorm1);
        MicroAPI::Abs(absRegNorm1, subRegNorm1, maskTailNorm1);
        MicroAPI::ReduceSum(sumRegNorm1, absRegNorm1, maskTailNorm1);
        MicroAPI::Add(dstRegNorm1, dstRegNorm1, sumRegNorm1, maskOneNorm1);
        MicroAPI::StoreUnAlign(yOffsetNorm1, dstRegNorm1, uRegNorm1, BASE_ONE);
        MicroAPI::StoreUnAlignPost(yOffsetNorm1, uRegNorm1, 0);
    }
}

template <typename T>
__aicore__ inline void Cdist<T>::ComputePNorm0(uint32_t b, uint32_t p, uint32_t r, uint16_t loopNumM, uint32_t tailNumM,
    __local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr)
{
    uint32_t maksTailNumNorm0 = tailNumM;
    uint32_t maskOneNumNorm0 = BASE_ONE;
    __local_mem__ float * yOffsetNorm0 = dstPtr + b * pSize_ * RAlign_ + p * RAlign_ + r;
    __VEC_SCOPE__
    {
        MicroAPI::RegTensor<float> x1RegNorm0;
        MicroAPI::RegTensor<float> x2RegNorm0;
        MicroAPI::RegTensor<float> subRegNorm0;
        MicroAPI::RegTensor<float> absRegNorm0;
        MicroAPI::RegTensor<float> castRegNorm0;
        MicroAPI::RegTensor<float> minRegNorm0;
        MicroAPI::RegTensor<float> sumRegNorm0;
        MicroAPI::RegTensor<float> dstRegNorm0;
        MicroAPI::MaskReg maskAllNorm0 = MicroAPI::CreateMask<float, MicroAPI::MaskPattern::ALL>();
        MicroAPI::MaskReg maskTailNorm0;
        MicroAPI::MaskReg maskOneNorm0;
        MicroAPI::UnalignRegForStore uRegNorm0;
        maskTailNorm0 = MicroAPI::UpdateMask<float>(maksTailNumNorm0);
        maskOneNorm0 = MicroAPI::UpdateMask<float>(maskOneNumNorm0);
        MicroAPI::Duplicate(dstRegNorm0, (float)0 );
        for (uint16_t m = 0; m < loopNumM; m++) {
            __local_mem__ float * x1OffsetNorm0 = srcPtrX1 + b * pSize_ * MAlign_ + p * MAlign_ + vlLen_ * m;
            __local_mem__ float * x2OffsetNorm0 = srcPtrX2 + b * rSize_ * MAlign_ + r * MAlign_ + vlLen_ * m;
            MicroAPI::LoadAlign(x1RegNorm0, x1OffsetNorm0);
            MicroAPI::LoadAlign(x2RegNorm0, x2OffsetNorm0);
            MicroAPI::Sub(subRegNorm0, x1RegNorm0, x2RegNorm0, maskAllNorm0);
            MicroAPI::Abs(absRegNorm0, subRegNorm0, maskAllNorm0);
            MicroAPI::Truncate<float, RoundMode::CAST_CEIL, MicroAPI::MaskMergeMode::ZEROING>(castRegNorm0, absRegNorm0, maskAllNorm0);
            MicroAPI::Mins(minRegNorm0, castRegNorm0, (float)1, maskAllNorm0);
            MicroAPI::ReduceSum(sumRegNorm0, minRegNorm0, maskAllNorm0);
            MicroAPI::Add(dstRegNorm0, dstRegNorm0, sumRegNorm0, maskOneNorm0);
        }
        __local_mem__ float * x1OffsetNorm0 = srcPtrX1 + b * pSize_ * MAlign_ + p * MAlign_ + vlLen_ * loopNumM;
        __local_mem__ float * x2OffsetNorm0 = srcPtrX2 + b * rSize_ * MAlign_ + r * MAlign_ + vlLen_ * loopNumM;
        MicroAPI::LoadAlign(x1RegNorm0, x1OffsetNorm0);
        MicroAPI::LoadAlign(x2RegNorm0, x2OffsetNorm0);
        MicroAPI::Sub(subRegNorm0, x1RegNorm0, x2RegNorm0, maskTailNorm0);
        MicroAPI::Abs(absRegNorm0, subRegNorm0, maskTailNorm0);
        MicroAPI::Truncate<float, RoundMode::CAST_CEIL, MicroAPI::MaskMergeMode::ZEROING>(castRegNorm0, absRegNorm0, maskTailNorm0);
        MicroAPI::Mins(minRegNorm0, castRegNorm0, (float)1, maskTailNorm0);
        MicroAPI::ReduceSum(sumRegNorm0, minRegNorm0, maskTailNorm0);
        MicroAPI::Add(dstRegNorm0, dstRegNorm0, sumRegNorm0, maskOneNorm0);
        MicroAPI::StoreUnAlign(yOffsetNorm0, dstRegNorm0, uRegNorm0, BASE_ONE);
        MicroAPI::StoreUnAlignPost(yOffsetNorm0, uRegNorm0, 0);
    }
}

template <typename T>
__aicore__ inline void Cdist<T>::ComputePNormInf(uint32_t b, uint32_t p, uint32_t r, uint16_t loopNumM, uint32_t tailNumM,
    __local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr)
{
    uint32_t maksTailNumNormInf = tailNumM;
    uint32_t maskOneNumNormInf = BASE_ONE;
    __local_mem__ float * yOffsetNormInf = dstPtr + b * pSize_ * RAlign_ + p * RAlign_ + r;
    __VEC_SCOPE__
    {
        MicroAPI::RegTensor<float> x1RegNormInf;
        MicroAPI::RegTensor<float> x2RegNormInf;
        MicroAPI::RegTensor<float> subRegNormInf;
        MicroAPI::RegTensor<float> absRegNormInf;
        MicroAPI::RegTensor<float> maxRegNormInf;
        MicroAPI::RegTensor<float> dstRegNormInf;
        MicroAPI::MaskReg maskAllNormInf = MicroAPI::CreateMask<float, MicroAPI::MaskPattern::ALL>();
        MicroAPI::MaskReg maskTailNormInf;
        MicroAPI::MaskReg maskOneNormInf;
        MicroAPI::UnalignRegForStore uRegNormInf;
        maskTailNormInf = MicroAPI::UpdateMask<float>(maksTailNumNormInf);
        maskOneNormInf = MicroAPI::UpdateMask<float>(maskOneNumNormInf);
        MicroAPI::Duplicate(dstRegNormInf, (float)0 );
        for (uint16_t m = 0; m < loopNumM; m++) {
            __local_mem__ float * x1OffsetNormInf = srcPtrX1 + b * pSize_ * MAlign_ + p * MAlign_ + vlLen_ * m;
            __local_mem__ float * x2OffsetNormInf = srcPtrX2 + b * rSize_ * MAlign_ + r * MAlign_ + vlLen_ * m;
            MicroAPI::LoadAlign(x1RegNormInf, x1OffsetNormInf);
            MicroAPI::LoadAlign(x2RegNormInf, x2OffsetNormInf);
            MicroAPI::Sub(subRegNormInf, x1RegNormInf, x2RegNormInf, maskAllNormInf);
            MicroAPI::Abs(absRegNormInf, subRegNormInf, maskAllNormInf);
            MicroAPI::ReduceMax(maxRegNormInf, absRegNormInf, maskAllNormInf);
            MicroAPI::Max(dstRegNormInf, maxRegNormInf, dstRegNormInf, maskOneNormInf);
        }
        __local_mem__ float * x1OffsetNormInf = srcPtrX1 + b * pSize_ * MAlign_ + p * MAlign_ + vlLen_ * loopNumM;
        __local_mem__ float * x2OffsetNormInf = srcPtrX2 + b * rSize_ * MAlign_ + r * MAlign_ + vlLen_ * loopNumM;
        MicroAPI::LoadAlign(x1RegNormInf, x1OffsetNormInf);
        MicroAPI::LoadAlign(x2RegNormInf, x2OffsetNormInf);
        MicroAPI::Sub(subRegNormInf, x1RegNormInf, x2RegNormInf, maskTailNormInf);
        MicroAPI::Abs(absRegNormInf, subRegNormInf, maskAllNormInf);
        MicroAPI::ReduceMax(maxRegNormInf, absRegNormInf, maskAllNormInf);
        MicroAPI::Max(dstRegNormInf, maxRegNormInf, dstRegNormInf, maskOneNormInf);
        MicroAPI::StoreUnAlign(yOffsetNormInf, dstRegNormInf, uRegNormInf, BASE_ONE);
        MicroAPI::StoreUnAlignPost(yOffsetNormInf, uRegNormInf, 0);
    }
}

template <typename T>
__aicore__ inline void Cdist<T>::ComputePNormOther(uint32_t b, uint32_t p, uint32_t r, uint16_t loopNumM, uint32_t tailNumM,
    __local_mem__ float *srcPtrX1, __local_mem__ float *srcPtrX2, __local_mem__ float *dstPtr)
{
    uint32_t maksTailNum = tailNumM;
    uint32_t maskOneNum = BASE_ONE;
    __local_mem__ float * yOffset = dstPtr + b * pSize_ * RAlign_ + p * RAlign_ + r;
    __VEC_SCOPE__
    {
        MicroAPI::RegTensor<float> x1Reg;
        MicroAPI::RegTensor<float> x2Reg;
        MicroAPI::RegTensor<float> subReg;
        MicroAPI::RegTensor<float> absReg;
        MicroAPI::RegTensor<float> logReg;
        MicroAPI::RegTensor<float> mulReg;
        MicroAPI::RegTensor<float> expReg;
        MicroAPI::RegTensor<float> sumReg;
        MicroAPI::RegTensor<float> resultReg;
        MicroAPI::RegTensor<float> dstReg;
        MicroAPI::MaskReg maskAll = MicroAPI::CreateMask<float, MicroAPI::MaskPattern::ALL>();
        MicroAPI::MaskReg maskTail;
        MicroAPI::MaskReg maskOne;
        MicroAPI::UnalignRegForStore uReg;
        maskTail = MicroAPI::UpdateMask<float>(maksTailNum);
        maskOne = MicroAPI::UpdateMask<float>(maskOneNum);
        MicroAPI::Duplicate(dstReg, (float)0);
        static constexpr MicroAPI::ExpSpecificMode modeexp = {MicroAPI::MaskMergeMode::ZEROING,ExpAlgo::PRECISION_1ULP_FTZ_FALSE};
        static constexpr MicroAPI::LogSpecificMode modelog = {MicroAPI::MaskMergeMode::ZEROING,LogAlgo::PRECISION_1ULP_FTZ_FALSE};
        for (uint16_t m = 0; m < loopNumM; m++) {
            __local_mem__ float * x1Offset = srcPtrX1 + b * pSize_ * MAlign_ + p * MAlign_ + vlLen_ * m;
            __local_mem__ float * x2Offset = srcPtrX2 + b * rSize_ * MAlign_ + r * MAlign_ + vlLen_ * m;
            MicroAPI::LoadAlign(x1Reg, x1Offset);
            MicroAPI::LoadAlign(x2Reg, x2Offset);
            MicroAPI::Sub(subReg, x1Reg, x2Reg, maskAll);
            MicroAPI::Abs(absReg, subReg, maskAll);
            MicroAPI::Log<float, &modelog>(logReg, absReg, maskAll);
            MicroAPI::Muls(mulReg, logReg, (float)p_, maskAll);
            MicroAPI::Exp<float, &modeexp>(expReg, mulReg, maskAll);
            MicroAPI::ReduceSum(sumReg, expReg, maskAll);
            MicroAPI::Add(dstReg, dstReg, sumReg, maskOne);
        }
        __local_mem__ float * x1Offset = srcPtrX1 + b * pSize_ * MAlign_ + p * MAlign_ + vlLen_ * loopNumM;
        __local_mem__ float * x2Offset = srcPtrX2 + b * rSize_ * MAlign_ + r * MAlign_ + vlLen_ * loopNumM;
        MicroAPI::LoadAlign(x1Reg, x1Offset);
        MicroAPI::LoadAlign(x2Reg, x2Offset);
        MicroAPI::Sub(subReg, x1Reg, x2Reg, maskTail);
        MicroAPI::Abs(absReg, subReg, maskTail);
        MicroAPI::Log<float, &modelog>(logReg, absReg, maskTail);
        MicroAPI::Muls(mulReg, logReg, (float)p_, maskTail);
        MicroAPI::Exp<float, &modeexp>(expReg, mulReg, maskTail);
        MicroAPI::ReduceSum(sumReg, expReg, maskTail);
        MicroAPI::Add(dstReg, dstReg, sumReg, maskOne);
        if (ubLoopNumM_ == 1) {
            MicroAPI::Log<float, &modelog>(logReg, dstReg, maskOne);
            MicroAPI::Muls(mulReg, logReg, (float)(1 / p_), maskOne);
            MicroAPI::Exp<float, &modeexp>(expReg, mulReg, maskOne);
            MicroAPI::StoreUnAlign(yOffset, expReg, uReg, BASE_ONE);
        } else {
            MicroAPI::StoreUnAlign(yOffset, dstReg, uReg, BASE_ONE);
        }
        MicroAPI::StoreUnAlignPost(yOffset, uReg, 0);
    }
}

template <typename T>
__aicore__ inline void Cdist<T>::CalSplitMResult(int32_t processNum)
{
    if (p_ == 2.0f) {
        if constexpr (sizeof(T) != sizeof(float)) {
            yFp32_ = yCastQueue_.DeQue<float>();
            Sqrt<float,sqrtConfig>(yFp32_, yFp32_, processNum);
            yCastQueue_.EnQue(yFp32_);
        } else {
            yFp32_ = yQueue_.DeQue<float>();
            Sqrt<float,sqrtConfig>(yFp32_, yFp32_, processNum);
            yQueue_.EnQue(yFp32_);
        }
    }
    if (p_ != 1.0f && p_ != 2.0f && p_ != static_cast<float>(INFINITY) && p_ != 0.0f) {
        if constexpr (sizeof(T) != sizeof(float)) {
            yFp32_ = yCastQueue_.DeQue<float>();
            Ln<float, lnConfig>(yFp32_, yFp32_, processNum);
            Muls(yFp32_, yFp32_, (float)(1/p_), processNum);
            Exp<float, expConfig>(yFp32_, yFp32_, processNum);
            yCastQueue_.EnQue(yFp32_);
        } else {
            yFp32_ = yQueue_.DeQue<float>();
            Ln<float, lnConfig>(yFp32_, yFp32_, processNum);
            Muls(yFp32_, yFp32_, (float)(1/p_), processNum);
            Exp<float, expConfig>(yFp32_, yFp32_, processNum);
            yQueue_.EnQue(yFp32_);
        }
    }
}

template <typename T>
__aicore__ inline void Cdist<T>::ProcessNoSplitM(uint32_t bOffset, uint32_t pOffset, uint32_t rOffsetBlock, uint32_t blockFactorR)
{
    uint32_t offsetX1 = 0;
    uint32_t offsetX2 = 0;
    uint32_t offsetY = 0;
    uint32_t rOffset = 0;
    offsetX1 = bOffset * P_ * M_ + pOffset * M_;
    MAlign_ = ((M_ * sizeof(T) + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE) / sizeof(T);
    CopyInX1(offsetX1);
    for (uint32_t rIdx = 0; rIdx < ubLoopNumR_; rIdx++) {
        rOffset = rOffsetBlock + rIdx * ubFactorR_;
        rSize_ = (rIdx == ubLoopNumR_ - 1) ? (blockFactorR - ubFactorR_ * rIdx) : ubFactorR_;
        if (rSize_ == 0) {
            continue;
        }
        RAlign_ = ((rSize_ * sizeof(T) + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE) / sizeof(T);
        offsetX2 = bOffset * R_ * M_ + rOffset * M_;
        offsetY = bOffset * P_ * R_ + pOffset * R_ + rOffset;
        CopyInX2(offsetX2);
        CastXToB32();
        Compute();
        CastY();
        CopyOut(offsetY);
    }
}

template <typename T>
__aicore__ inline void Cdist<T>::ProcessSplitM(uint32_t bOffset, uint32_t pOffset, uint32_t rOffsetBlock, uint32_t blockFactorR)
{
    uint32_t offsetX1 = 0;
    uint32_t offsetX2 = 0;
    uint32_t offsetY = 0;
    int32_t processNum = 0;
    uint32_t mOffset = 0;
    uint32_t rOffset = 0;
    for (uint32_t rIdx = 0; rIdx < ubLoopNumR_; rIdx++) {
        rOffset = rOffsetBlock + rIdx * ubFactorR_;
        rSize_ = (rIdx == ubLoopNumR_ - 1) ? (blockFactorR - ubFactorR_ * rIdx) : ubFactorR_;
        if (rSize_ == 0) {
            continue;
        }
        processNum = bSize_ * pSize_ * rSize_;
        offsetY = bOffset * P_ * R_ + pOffset * R_ + rOffset;
        if constexpr (sizeof(T) != sizeof(float)) {
            yFp32_ = yCastQueue_.DeQue<float>();
            Duplicate<float>(yFp32_, (float)0, processNum);
            yCastQueue_.EnQue(yFp32_);
        } else {
            yFp32_ = yQueue_.DeQue<float>();
            Duplicate<float>(yFp32_, (float)0, processNum);
            yQueue_.EnQue(yFp32_);
        }
        for (uint32_t mIdx = 0; mIdx < ubLoopNumM_; mIdx++) {
            mOffset = mIdx * ubFactorM_;
            mSize_ = (mIdx == ubLoopNumM_ - 1) ? ubTailFactorM_ : ubFactorM_;
            MAlign_ = ((mSize_ * sizeof(T) + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE) / sizeof(T);
            offsetX1 = bOffset * P_ * M_ + pOffset * M_ + mOffset;
            offsetX2 = bOffset * R_ * M_ + rOffset * M_ + mOffset;
            CopyInX1(offsetX1);
            CopyInX2(offsetX2);
            CastXToB32();
            ComputeSplitM();
        }
        CalSplitMResult(processNum);
        CastY();
        PipeBarrier<PIPE_ALL>();
        CopyOut(offsetY);
    }
}

template <typename T>
__aicore__ inline void Cdist<T>::Process()
{
    uint32_t bOffset = 0;
    uint32_t pOffset = 0;
    uint32_t bOffsetBlock = 0;
    uint32_t pOffsetBlock = 0;
    uint32_t rOffsetBlock = 0;
    uint32_t blockFactorB = 0;
    uint32_t blockFactorP = 0;
    uint32_t blockFactorR = 0;
    uint32_t blockNumP = blockMainNumP_ + blockTailNumP_;
    uint32_t blockNumR = blockMainNumR_ + blockTailNumR_;
    uint32_t bBlockIdx = blockIdx_ / (blockNumP * blockNumR);
    uint32_t prBlockIdx = blockIdx_ % (blockNumP * blockNumR);
    uint32_t pBlockIdx = prBlockIdx / blockNumR;
    uint32_t rBlockIdx = prBlockIdx % blockNumR;
    blockFactorB = (bBlockIdx < blockMainNumB_) ? blockMainFactorB_ : blockTailFactorB_;
    bOffsetBlock = (bBlockIdx < blockMainNumB_)
                        ? blockMainFactorB_ * bBlockIdx
                        : blockMainFactorB_ * blockMainNumB_ + (bBlockIdx - blockMainNumB_) * blockTailFactorB_;
    blockFactorR = (rBlockIdx < blockMainNumR_) ? blockMainFactorR_ : blockTailFactorR_;
    rOffsetBlock = (rBlockIdx < blockMainNumR_)
                        ? blockMainFactorR_ * rBlockIdx
                        : blockMainFactorR_ * blockMainNumR_ + (rBlockIdx - blockMainNumR_) * blockTailFactorR_;
    blockFactorP = (pBlockIdx < blockMainNumP_) ? blockMainFactorP_ : blockTailFactorP_;
    pOffsetBlock = (pBlockIdx < blockMainNumP_)
                        ? blockMainFactorP_ * pBlockIdx
                        : blockMainFactorP_ * blockMainNumP_ + (pBlockIdx - blockMainNumP_) * blockTailFactorP_;
    for(uint32_t bIdx = 0; bIdx < ubLoopNumB_; bIdx++){
        bOffset = bOffsetBlock + bIdx * ubFactorB_;
        bSize_ = (bIdx == ubLoopNumB_ - 1) ? (blockFactorB - ubFactorB_ * bIdx) : ubFactorB_;
        if (bSize_ == 0) {
            continue;
        }
        for (uint32_t pIdx = 0; pIdx < ubLoopNumP_; pIdx++) {
            pOffset = pOffsetBlock + pIdx * ubFactorP_;
            pSize_ = (pIdx == ubLoopNumP_ - 1) ? (blockFactorP - ubFactorP_ * pIdx) : ubFactorP_;
            if (pSize_ == 0) {
                continue;
            }
            if (ubLoopNumM_ == 1) {
                ProcessNoSplitM(bOffset, pOffset, rOffsetBlock, blockFactorR);
            } else {
                ProcessSplitM(bOffset, pOffset, rOffsetBlock, blockFactorR);
            }
        }
    }
    yQueue_.FreeTensor(yFp32_);
    tmpQueue_.FreeTensor(tmpLocal_);
}
} // namespace NsCdist

#endif // CDIST_H