ops-math/math/hans_encode/op_kernel/hans_encode_base.h-代码预览-ops-math:基于 CANN 的数学类算子库项目 - AtomGit

cann-robothans_encode/hans_decode算子bug修复
b658dde4创建于 2025年12月25日历史提交
/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file hans_encode_base.h
 * \brief
 */
#ifndef HANS_ENCODE_BASE_H
#define HANS_ENCODE_BASE_H

namespace HansEncodeNS {
using namespace HansCommonNs;
using namespace AscendC;

template <typename dataType>
class HansEncode {
 public:
  __aicore__ inline HansEncode() {
  }
  __aicore__ inline void Init(TPipe* pipe_, const HansEncodeInitConfig& config) {
    const HansEncodeTilingData* tilingData = config.tilingData;
    if (GetBlockIdx() >= tilingData->processCoreDim) {
      return;
    }
    this->pipe = pipe_;
    this->eventManager = HansCommonNs::EventManager();
    this->reshuff = tilingData->reshuff;
    this->dtypeSize = sizeof(dataType);
    this->processCoreDim = tilingData->processCoreDim;
    this->processLoopPerCore = tilingData->processLoopPerCore;
    this->processLoopCurrentCore =
        GetBlockIdx() < tilingData->processCoreDim - 1 ? this->processLoopPerCore : tilingData->processLoopLastCore;
    int64_t processMantissaPerCore = this->processLoopPerCore * BLOCK_SIZE * (this->dtypeSize - 1);
    int64_t prcoessNumelCurrentCore = this->processLoopCurrentCore * BLOCK_SIZE;
    int64_t processMantissaCurrentCore = prcoessNumelCurrentCore * (this->dtypeSize - 1);
    this->outputDeviceSizeCurrentCore =
        GetBlockIdx() < processCoreDim - 1 ? tilingData->fixedLengthPerCore : tilingData->fixedLengthLastCore;
    this->pdfGm.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(config.pdfGm), PDF_LENGTH);
    this->inputGm.SetGlobalBuffer(
      reinterpret_cast<__gm__ dataType*>(config.inputGm) + processLoopPerCore * BLOCK_SIZE * GetBlockIdx(), 
      prcoessNumelCurrentCore);
    if (reshuff) {
      this->fixedGm.SetGlobalBuffer(
        reinterpret_cast<__gm__ uint8_t*>(config.workspace) + tilingData->fixedLengthPerCore * GetBlockIdx() +
        META_INFO_NUM * sizeof(int32_t), this->outputDeviceSizeCurrentCore);
      this->workspaceGm.SetGlobalBuffer(
        reinterpret_cast<__gm__ uint8_t*>(config.fixedGm),
        tilingData->fixedLengthPerCore * (processCoreDim - 1) + tilingData->fixedLengthLastCore);
    } else {
      this->fixedGm.SetGlobalBuffer(
        reinterpret_cast<__gm__ uint8_t*>(config.fixedGm) + tilingData->fixedLengthPerCore * GetBlockIdx() +
        META_INFO_NUM * sizeof(int32_t), this->outputDeviceSizeCurrentCore);
      this->workspaceGm.SetGlobalBuffer( reinterpret_cast<__gm__ uint8_t*>(config.fixedGm),
        tilingData->fixedLengthPerCore * (processCoreDim - 1) + tilingData->fixedLengthLastCore);
    }
    this->outputDeviceHeaderGm.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t*>(config.fixedGm), 128);
    this->outputMantissaGm.SetGlobalBuffer(
        reinterpret_cast<__gm__ uint8_t*>(config.outputMantissaGm) + processMantissaPerCore * GetBlockIdx(),
        processMantissaCurrentCore);
    this->varGm.SetGlobalBuffer(reinterpret_cast<__gm__ uint8_t*>(config.varGm), tilingData->varLength);
    this->tileDataLength = EACH_LOOOP_PROCESS_NUM;
    this->tileNum = prcoessNumelCurrentCore / this->tileDataLength;
    this->tileRemain = prcoessNumelCurrentCore % this->tileDataLength;
    this->pipe->InitBuffer(this->inputQue, 1, this->dtypeSize * this->tileDataLength);
    this->pipe->InitBuffer(this->pdfQue, 1, sizeof(int32_t) * PDF_LENGTH);
    this->pipe->InitBuffer(this->outputMantissaQue, 1, this->tileDataLength * (this->dtypeSize - 1));
    this->pipe->InitBuffer(this->outputDeviceQue, 1,
                           (this->tileDataLength + EACH_LOOOP_REPEAT_TIMES) * sizeof(uint16_t));
    this->pipe->InitBuffer(this->calcBuf, CALC_BUF_SIZE);
  }

  __aicore__ inline void Process() {
    if (GetBlockIdx() >= this->processCoreDim) {
      return;
    }
    ProcessPdf();
    Prepare4Encode();
    ProcessEncode();
    PostProcess();
  }

 private:
  __aicore__ inline void ProcessPdf() {
    this->eventManager.InitEvent();
    this->inputLocal = this->inputQue.template AllocTensor<dataType>();
    this->pdfLocal = this->pdfQue.template AllocTensor<int32_t>();
    this->outputDeviceLocal = this->outputDeviceQue.template AllocTensor<uint16_t>();
    this->outputMantissaLocal = this->outputMantissaQue.template AllocTensor<uint8_t>();

    this->pdfSortIndexLocal = calcBuf.GetWithOffset<int32_t>(PDF_LENGTH, this->bufOffset);
    this->bufOffset += PDF_LENGTH * sizeof(int32_t);
    PdfSort();
  }

  __aicore__ inline void Prepare4Encode() {
    this->inputHalfLocal = calcBuf.GetWithOffset<half>(this->dtypeSize * EACH_LOOOP_PROCESS_NUM, this->bufOffset);
    this->bufOffset += this->dtypeSize * EACH_LOOOP_PROCESS_NUM * sizeof(half);
    this->ConstOneLocal = calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
    this->bufOffset += EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t);
    this->ConstZeroLocal = calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
    this->bufOffset += EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t);
    this->fp32MantissaMaskLocal = calcBuf.GetWithOffset<uint16_t>(MANTISSA_MASK_NUM, this->bufOffset);
    this->bufOffset += MANTISSA_MASK_NUM * sizeof(uint16_t);
    this->inputExpHalfLocal = calcBuf.GetWithOffset<half>(EACH_LOOOP_PROCESS_NUM, this->bufOffset);
    this->bufOffset += EACH_LOOOP_PROCESS_NUM * sizeof(half);
    this->inputExpUint32Local = calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_PROCESS_NUM, this->bufOffset);
    this->bufOffset += EACH_LOOOP_PROCESS_NUM * sizeof(int32_t);
    this->statePdfIndexSortLocal = calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_PROCESS_NUM, this->bufOffset);
    this->bufOffset += EACH_LOOOP_PROCESS_NUM * sizeof(int32_t);
    this->stateBufferLocal = calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_PROCESS_NUM, this->bufOffset);
    this->bufOffset += EACH_LOOOP_PROCESS_NUM * sizeof(int32_t);
    this->statePdfMaxUseBitLocal =
        calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES + EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
    this->bufOffset += (EACH_LOOOP_REPEAT_TIMES + EACH_LOOOP_REPEAT_TIMES) * sizeof(int32_t);
    this->bitLevelMaskZeroLocal = calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
    this->bufOffset += EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t);
    this->stateBufferBitNumberLocal = calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
    this->bufOffset += EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t);
    this->encodeHeaderLocal = calcBuf.GetWithOffset<int32_t>(META_INFO_NUM, this->bufOffset);
    this->bufOffset += META_INFO_NUM * sizeof(int32_t);
    this->bitShlMulScaleLocal = calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
    this->bufOffset += EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t);
    this->reduceLowBlockMaskLocal = calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
    this->bufOffset += EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t);
    for (int32_t i = 0; i < sizeof(uint16_t) * BYTE_BIT_NUM; ++i) {
      this->bitShlMulScaleLocal.SetValue(i, (int32_t)(1 << i));
    }

    Duplicate<int32_t>(this->encodeHeaderLocal, 0, META_INFO_NUM);
    if (GetBlockIdx() == 0) {
      PipeBarrier<PIPE_ALL>();
      DataCopyPad(this->outputDeviceHeaderGm, this->encodeHeaderLocal,
                  {1, static_cast<uint16_t>(META_INFO_NUM * sizeof(int32_t)), 0, 0});
      PipeBarrier<PIPE_ALL>();
    }
    Duplicate<uint16_t>(this->fp32MantissaMaskLocal, (uint16_t)30583ULL, MANTISSA_MASK_NUM);
    Duplicate<int32_t>(this->bitLevelMaskZeroLocal, 0, EACH_LOOOP_REPEAT_TIMES);
    Duplicate<int32_t>(this->ConstOneLocal, 1, EACH_LOOOP_REPEAT_TIMES);
    Duplicate<int32_t>(this->ConstZeroLocal, 0, EACH_LOOOP_REPEAT_TIMES);
    Duplicate<int32_t>(this->stateBufferBitNumberLocal, 0, EACH_LOOOP_REPEAT_TIMES);
    Duplicate<int32_t>(this->reduceLowBlockMaskLocal, LOW_BLOCK_MASK, EACH_LOOOP_REPEAT_TIMES);
    Duplicate<int32_t>(this->stateBufferLocal, 0, EACH_LOOOP_PROCESS_NUM);
    PipeBarrier<PIPE_ALL>();
  }

  __aicore__ inline void ProcessEncode() {
    if (this->outputDeviceSizeCurrentCore - ENCODE_TAIL_INFO_BYTES_PER_CORE >= 0) {
      CopyIn(0, this->tileDataLength * this->dtypeSize);
      PipeBarrier<PIPE_ALL>();
      SetFlag<HardEvent::MTE3_V>(this->eventManager.eventMTE3VPing);
      int32_t tileRemainNum = this->tileRemain > 0 ? 1 : 0;
      for (int32_t i = 0; i < this->tileNum + tileRemainNum; i++) {
        int32_t processDataLength = i < this->tileNum ? this->tileDataLength : this->tileRemain;
        Uint8Convert2Fp16(processDataLength);
        if (i < this->tileNum) {
          int32_t copySize =
              i < this->tileNum - 1 ? this->tileDataLength * this->dtypeSize : this->tileRemain * this->dtypeSize;
          CopyIn((i + 1) * this->tileDataLength, copySize);
          SetFlag<HardEvent::MTE2_V>(this->eventManager.eventMTE2VPing);
        }
        SeparateAndCopyOut(i * this->tileDataLength * (this->dtypeSize - 1), processDataLength);
        SetFlag<HardEvent::MTE3_V>(this->eventManager.eventMTE3VPong);
        PdfSortMapping(processDataLength);
        if (SpaceVerify(processDataLength)) {
          if (i < this->tileNum) {
            WaitFlag<HardEvent::MTE2_V>(this->eventManager.eventMTE2VPing);
          }
          WaitFlag<HardEvent::MTE3_V>(this->eventManager.eventMTE3VPong);
          break;
        }
        PdfSortAdd2Buffer(processDataLength);
        WaitFlag<HardEvent::MTE3_V>(this->eventManager.eventMTE3VPing);
        OverflowProcess(processDataLength);
        CopyOut(processDataLength);
        SetFlag<HardEvent::MTE3_V>(this->eventManager.eventMTE3VPing);
        if (i < this->tileNum) {
          WaitFlag<HardEvent::MTE2_V>(this->eventManager.eventMTE2VPing);
        }
        WaitFlag<HardEvent::MTE3_V>(this->eventManager.eventMTE3VPong);
      }
      WaitFlag<HardEvent::MTE3_V>(this->eventManager.eventMTE3VPing);
      DeviceTailProcess();
    }
  }

  __aicore__ inline void PostProcess() {
    Sync();
    this->inputQue.template FreeTensor<dataType>(this->inputLocal);
    this->pdfQue.template FreeTensor<int32_t>(this->pdfLocal);
    this->outputDeviceQue.template FreeTensor<uint16_t>(this->outputDeviceLocal);
    this->outputMantissaQue.template FreeTensor<uint8_t>(this->outputMantissaLocal);
    this->pipe->Reset();
    this->eventManager.InitEvent();

    if (this->reshuff) {
      ReshuffDevice();
    } else {
      HostTailProcess();
    }
    this->eventManager.ReleaseEvent();
  }

  __aicore__ inline void PdfSort() {
    int32_t pdfSortAndIndexNum = 512;
    int32_t blockSize = 32;
    int32_t mrg4SortMode = 15;
    int32_t mrg2SortMode = 3;
    int32_t idx0 = 0;
    int32_t idx1 = 64;
    int32_t idx2 = 128;
    int32_t idx3 = 192;
    int32_t idx4 = 256;
    int32_t idx5 = 320;
    int32_t idx6 = 384;
    int32_t idx7 = 448;
    LocalTensor<int32_t> pdfRangeIndexLocal = calcBuf.GetWithOffset<int32_t>(PDF_LENGTH, this->bufOffset);
    this->bufOffset += PDF_LENGTH * sizeof(int32_t);
    LocalTensor<float> pdfSort = calcBuf.GetWithOffset<float>(pdfSortAndIndexNum, this->bufOffset);
    this->bufOffset += pdfSortAndIndexNum * sizeof(float);
    LocalTensor<float> pdfMrgsort = calcBuf.GetWithOffset<float>(pdfSortAndIndexNum, this->bufOffset);
    this->bufOffset += pdfSortAndIndexNum * sizeof(float);
    DataCopyPad(this->pdfLocal, this->pdfGm, {1, static_cast<uint16_t>(PDF_LENGTH * sizeof(int32_t)), 0, 0},
                {true, 0, 0, 0});
    PipeBarrier<PIPE_ALL>();
    CreateVecIndex(pdfRangeIndexLocal, (int32_t)0, PDF_LENGTH);
    PipeBarrier<PIPE_V>();
    Sort32<float>(pdfSort, this->pdfLocal.template ReinterpretCast<float>(),
                  pdfRangeIndexLocal.ReinterpretCast<uint32_t>(), PDF_LENGTH / blockSize);
    PipeBarrier<PIPE_V>();
    MrgSortSrcList<float> srcList1(pdfSort[idx0], pdfSort[idx1], pdfSort[idx2], pdfSort[idx3]);
    uint16_t elementLengths[4] = {32, 32, 32, 32};
    MrgSort4Info srcInfo(elementLengths, false, mrg4SortMode, 1);
    MrgSort<float>(pdfMrgsort, srcList1, srcInfo);
    MrgSortSrcList<float> srcList2(pdfSort[idx4], pdfSort[idx5], pdfSort[idx6], pdfSort[idx7]);
    MrgSort<float>(pdfMrgsort[idx4], srcList2, srcInfo);
    PipeBarrier<PIPE_V>();
    MrgSortSrcList<float> srcList3(pdfMrgsort[idx0], pdfMrgsort[idx4], pdfMrgsort[0], pdfMrgsort[idx4]);
    uint16_t eleLengths[4] = {128, 128, 0, 0};
    MrgSort4Info src256Info(eleLengths, false, mrg2SortMode, 1);
    MrgSort<float>(pdfSort, srcList3, src256Info);
    PipeBarrier<PIPE_V>();
    int32_t uint64BitNum = 64;
    int32_t uint16BitNum = 16;
    int32_t pdfStep = 2;
    for (int32_t i = 0; i < PDF_LENGTH; i++) {
      int32_t index = pdfSort.ReinterpretCast<int32_t>().GetValue(pdfStep * i + 1);
      int32_t metaValue = i == 0 ? (1 << (sizeof(uint16_t) * BYTE_BIT_NUM)) + i
                                 : (int32_t)((uint64BitNum - (int32_t)(ScalarCountLeadingZero((uint64_t)i))) << uint16BitNum) + i;
      this->pdfSortIndexLocal.SetValue(index, metaValue);
    }
    this->bufOffset -= (PDF_LENGTH * sizeof(int32_t) + EACH_LOOOP_REPEAT_TIMES * BYTE_BIT_NUM * sizeof(float) +
                        EACH_LOOOP_REPEAT_TIMES * BYTE_BIT_NUM * sizeof(float));
  }

  __aicore__ inline void CopyIn(int32_t copyOffset, int32_t copySize) {
    if (copySize == 0) {
      return;
    }
    SetFlag<HardEvent::V_MTE2>(this->eventManager.eventVMTE2Ping);
    WaitFlag<HardEvent::V_MTE2>(this->eventManager.eventVMTE2Ping);
    DataCopyParams copyParams{1, static_cast<uint16_t>(copySize), 0, 0};
    DataCopyPadParams padParams{true, 0, 0, 0};
    DataCopyPad(this->inputLocal, this->inputGm[copyOffset], copyParams, padParams);
  }

  __aicore__ inline void Uint8Convert2Fp16(int32_t processDataLength) {
    Cast(this->inputHalfLocal, this->inputLocal.template ReinterpretCast<uint8_t>(), RoundMode::CAST_NONE,
         processDataLength * this->dtypeSize);
    PipeBarrier<PIPE_V>();
  }

  __aicore__ inline void SeparateAndCopyOut(int32_t copyOffset, int32_t processDataLength) {
    uint8_t expPattern = this->dtypeSize == sizeof(float) ? 6 : 2;
    GatherMaskParams expParams;
    expParams.src0BlockStride = 1;
    expParams.repeatTimes =
        static_cast<uint8_t>(processDataLength * this->dtypeSize / (EACH_REPEAT_BYTES / sizeof(uint16_t)));
    expParams.src0RepeatStride = REPEAT_STRIDE;
    expParams.src1RepeatStride = 0;
    GatherMask(this->inputExpHalfLocal, this->inputHalfLocal, expPattern, false, 0, expParams, this->rsvdCnt);
    PipeBarrier<PIPE_V>();
    if (this->dtypeSize == sizeof(float)) {
      GatherMask(
          this->inputHalfLocal.template ReinterpretCast<uint16_t>(),
          this->inputHalfLocal.template ReinterpretCast<uint16_t>(),
          this->fp32MantissaMaskLocal.template ReinterpretCast<uint16_t>(), true,
          expParams.repeatTimes * EACH_REPEAT_BYTES / sizeof(uint16_t), {1, 1, 8, 0}, this->rsvdCnt);
    } else {
      GatherMask(this->inputHalfLocal, this->inputHalfLocal, 1, false, 0, expParams, this->rsvdCnt);
    }
    PipeBarrier<PIPE_V>();
    Cast(this->outputMantissaLocal, this->inputHalfLocal, RoundMode::CAST_TRUNC,
         processDataLength * (this->dtypeSize - 1));
    PipeBarrier<PIPE_V>();
    Cast(this->inputExpUint32Local, this->inputExpHalfLocal, RoundMode::CAST_TRUNC, processDataLength);
    PipeBarrier<PIPE_V>();
    SetFlag<HardEvent::V_MTE3>(this->eventManager.eventVMTE3Ping);
    WaitFlag<HardEvent::V_MTE3>(this->eventManager.eventVMTE3Ping);
    DataCopyParams copyParams{1, static_cast<uint16_t>((this->dtypeSize - 1) * processDataLength), 0, 0};
    DataCopyPad(this->outputMantissaGm[copyOffset], this->outputMantissaLocal, copyParams);
  }

  __aicore__ inline void PdfSortMapping(int32_t processDataLength) {
    Duplicate<int32_t>(this->statePdfMaxUseBitLocal, 0, EACH_LOOOP_REPEAT_TIMES + EACH_LOOOP_REPEAT_TIMES);
    Muls(this->inputExpUint32Local, this->inputExpUint32Local, (int32_t)sizeof(int32_t), processDataLength);
    PipeBarrier<PIPE_V>();
    Gather(this->inputExpUint32Local, this->pdfSortIndexLocal,
           this->inputExpUint32Local.template ReinterpretCast<uint32_t>(), (uint32_t)0, processDataLength);
    PipeBarrier<PIPE_V>();
    Mul(this->statePdfIndexSortLocal.template ReinterpretCast<int16_t>(),
        this->inputExpUint32Local.template ReinterpretCast<int16_t>(),
        this->ConstOneLocal.template ReinterpretCast<int16_t>(), EACH_LOOOP_REPEAT_TIMES * sizeof(int16_t),
        (uint8_t)(processDataLength / BLOCK_SIZE), {1, 1, 0, 8, 8, 0});
    PipeBarrier<PIPE_V>();
    ShiftRight(this->inputExpUint32Local.template ReinterpretCast<uint32_t>(),
               this->inputExpUint32Local.template ReinterpretCast<uint32_t>(),
               (uint32_t)(sizeof(uint16_t) * BYTE_BIT_NUM), processDataLength);
    PipeBarrier<PIPE_V>();
    WholeReduceMax<float>(this->statePdfMaxUseBitLocal.template ReinterpretCast<float>(),
                          this->inputExpUint32Local.template ReinterpretCast<float>(), EACH_LOOOP_REPEAT_TIMES,
                          (uint8_t)(processDataLength / BLOCK_SIZE), 1, 1, REPEAT_STRIDE);
    PipeBarrier<PIPE_V>();
    GatherMask(this->bitLevelMaskZeroLocal, this->statePdfMaxUseBitLocal, 1, false, 0, {1, 2, 8, 0}, this->rsvdCnt);
    PipeBarrier<PIPE_V>();
  }

  __aicore__ inline bool SpaceVerify(int32_t processDataLength) {
    if ((this->currentCoreOutputAcculmulateSize + ENCODE_TAIL_INFO_BYTES_PER_CORE) >
        this->outputDeviceSizeCurrentCore - ENCODE_TAIL_INFO_BYTES_PER_CORE) {
      LocalTensor<int32_t> overflowCheckLocal =
          calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
      this->bufOffset += EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t);
      LocalTensor<int32_t> stateBufferBitCmpLocal =
          calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
      this->bufOffset += EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t);
      PipeBarrier<PIPE_V>();
      Add(overflowCheckLocal, this->stateBufferBitNumberLocal, this->bitLevelMaskZeroLocal, EACH_LOOOP_REPEAT_TIMES);
      Duplicate<int32_t>(stateBufferBitCmpLocal, 0, EACH_LOOOP_REPEAT_TIMES);
      PipeBarrier<PIPE_V>();
      CompareScalar(stateBufferBitCmpLocal.ReinterpretCast<uint8_t>(), overflowCheckLocal.ReinterpretCast<float>(),
                    this->const16Float, AscendC::CMPMODE::GT, EACH_LOOOP_REPEAT_TIMES);
      PipeBarrier<PIPE_V>();
      GatherMask(
          this->inputHalfLocal.template ReinterpretCast<uint32_t>(),
          overflowCheckLocal.template ReinterpretCast<uint32_t>(),
          stateBufferBitCmpLocal.template ReinterpretCast<uint32_t>(), true, EACH_REPEAT_BYTES / sizeof(uint32_t),
          {1, 1, 8, 0}, this->rsvdCnt);
      uint64_t compressBufferOffset = AscendC::AscendCUtils::GetRsvdCnt();
      this->bufOffset -= (EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t) + EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t));
      if ((currentCoreOutputAcculmulateSize + EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t) +
           compressBufferOffset * EACH_LOOOP_REPEAT_TIMES * sizeof(uint16_t)) >
          this->outputDeviceSizeCurrentCore - ENCODE_TAIL_INFO_BYTES_PER_CORE) {
        return true;
      }
    }
    return false;
  }

  __aicore__ inline void PdfSortAdd2Buffer(int32_t processDataLength) {
    PipeBarrier<PIPE_V>();
    LocalTensor<int32_t> bitLevelMaskZeroShlScaleLocal =
        calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
    this->bufOffset += EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t);
    LocalTensor<int32_t> bitLevelMaskFirstShlScaleLocal =
        calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES * BYTE_BIT_NUM, this->bufOffset);
    this->bufOffset += EACH_LOOOP_REPEAT_TIMES * BYTE_BIT_NUM * sizeof(int32_t);
    Muls(bitLevelMaskZeroShlScaleLocal, this->bitLevelMaskZeroLocal, (int32_t)sizeof(int32_t), EACH_LOOOP_REPEAT_TIMES);
    PipeBarrier<PIPE_V>();
    Gather(bitLevelMaskZeroShlScaleLocal, this->bitShlMulScaleLocal,
           bitLevelMaskZeroShlScaleLocal.template ReinterpretCast<uint32_t>(), (uint32_t)0, EACH_LOOOP_REPEAT_TIMES);
    PipeBarrier<PIPE_V>();
    Brcb(bitLevelMaskFirstShlScaleLocal, bitLevelMaskZeroShlScaleLocal, BYTE_BIT_NUM, {1, 8});
    PipeBarrier<PIPE_V>();
    Mul(this->stateBufferLocal, this->stateBufferLocal, bitLevelMaskFirstShlScaleLocal, EACH_LOOOP_REPEAT_TIMES,
        (uint8_t)(processDataLength / BLOCK_SIZE), {1, 1, 0, 8, 8, 1});
    PipeBarrier<PIPE_V>();
    Add(this->stateBufferLocal, this->stateBufferLocal, this->statePdfIndexSortLocal, processDataLength);
    PipeBarrier<PIPE_V>();
    Add(this->stateBufferBitNumberLocal, this->stateBufferBitNumberLocal, this->bitLevelMaskZeroLocal,
        EACH_LOOOP_REPEAT_TIMES);
    PipeBarrier<PIPE_V>();
    this->bufOffset -= (BLOCK_SIZE * sizeof(int32_t) + EACH_LOOOP_REPEAT_TIMES * BYTE_BIT_NUM * sizeof(int32_t));
  }

  __aicore__ inline void OverflowProcess(int32_t processDataLength) {
    LocalTensor<int32_t> stateBufferBitCmpLocal = calcBuf.GetWithOffset<int32_t>(BLOCK_SIZE, this->bufOffset);
    this->bufOffset += BLOCK_SIZE * sizeof(int32_t);
    LocalTensor<int32_t> overflowFlagLocal = calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES, this->bufOffset);
    this->bufOffset += EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t);
    LocalTensor<int32_t> overflowFlagBrcbLocal =
        calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_REPEAT_TIMES * BYTE_BIT_NUM, this->bufOffset);
    this->bufOffset += EACH_LOOOP_REPEAT_TIMES * BYTE_BIT_NUM * sizeof(int32_t);
    LocalTensor<int32_t> overflowFlagBrcbReduceLocal =
        calcBuf.GetWithOffset<int32_t>(EACH_LOOOP_PROCESS_NUM / (sizeof(uint16_t) * BYTE_BIT_NUM), this->bufOffset);
    this->bufOffset += EACH_LOOOP_PROCESS_NUM / (sizeof(uint16_t) * BYTE_BIT_NUM) * sizeof(int32_t);
    PipeBarrier<PIPE_V>();
    Duplicate<int32_t>(stateBufferBitCmpLocal, 0, BLOCK_SIZE);
    CompareScalar(stateBufferBitCmpLocal.ReinterpretCast<uint8_t>(),
                  this->stateBufferBitNumberLocal.template ReinterpretCast<float>(), this->const16Float,
                  AscendC::CMPMODE::GT, EACH_LOOOP_REPEAT_TIMES);
    PipeBarrier<PIPE_V>();
    Select(overflowFlagLocal.ReinterpretCast<float>(), stateBufferBitCmpLocal.ReinterpretCast<uint8_t>(),
           this->ConstOneLocal.template ReinterpretCast<float>(),
           this->ConstZeroLocal.template ReinterpretCast<float>(), AscendC::SELMODE::VSEL_CMPMASK_SPR, BLOCK_SIZE);
    PipeBarrier<PIPE_V>();
    Brcb(overflowFlagBrcbLocal, overflowFlagLocal, BYTE_BIT_NUM, {1, 8});
    PipeBarrier<PIPE_V>();
    GatherMask(
        overflowFlagBrcbReduceLocal.template ReinterpretCast<uint32_t>(),
        overflowFlagBrcbLocal.template ReinterpretCast<uint32_t>(),
        this->reduceLowBlockMaskLocal.template ReinterpretCast<uint32_t>(), true, CONST_8 * EACH_REPEAT_BYTES / sizeof(uint32_t),
        {1, 1, 8, 0}, this->rsvdCnt);
    PipeBarrier<PIPE_V>();
    Muls(overflowFlagBrcbReduceLocal, overflowFlagBrcbReduceLocal, (int32_t)INT32_LOW_16_BIT_MASK,
         EACH_LOOOP_PROCESS_NUM / (sizeof(uint16_t) * BYTE_BIT_NUM));
    PipeBarrier<PIPE_V>();
    GatherMask(
	    this->outputDeviceLocal.template ReinterpretCast<uint16_t>(),
        this->stateBufferLocal.template ReinterpretCast<uint16_t>(),
        overflowFlagBrcbReduceLocal.template ReinterpretCast<uint16_t>(), true,
        processDataLength / BLOCK_SIZE * EACH_REPEAT_BYTES / sizeof(uint16_t), {1, 1, 8, 0}, this->rsvdCnt);
    this->compressBufferOffset = AscendC::AscendCUtils::GetRsvdCnt();
    GatherOutput(overflowFlagBrcbLocal, overflowFlagLocal, processDataLength);
    this->bufOffset -= (BLOCK_SIZE * sizeof(int32_t) + EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t) +
                        EACH_LOOOP_REPEAT_TIMES * BYTE_BIT_NUM * sizeof(int32_t) +
                        EACH_LOOOP_PROCESS_NUM / (sizeof(uint16_t) * BYTE_BIT_NUM) * sizeof(int32_t));
  }

  __aicore__ inline void GatherOutput(LocalTensor<int32_t>& overflowFlagBrcbLocal, 
                                      LocalTensor<int32_t>& overflowFlagLocal, int32_t processDataLength) {
    Adds(overflowFlagBrcbLocal, overflowFlagBrcbLocal, -1, EACH_LOOOP_REPEAT_TIMES * BYTE_BIT_NUM);
    PipeBarrier<PIPE_V>();
    Mul(this->statePdfIndexSortLocal, this->stateBufferLocal, overflowFlagBrcbLocal, BLOCK_SIZE,
        (uint8_t)(processDataLength / BLOCK_SIZE), {1, 1, 0, 8, 8, 1});
    PipeBarrier<PIPE_V>();
    ShiftRight(this->stateBufferLocal.template ReinterpretCast<uint32_t>(),
               this->stateBufferLocal.template ReinterpretCast<uint32_t>(), (uint32_t)(sizeof(uint16_t) * BYTE_BIT_NUM),
               processDataLength);
    PipeBarrier<PIPE_V>();
    Sub(this->stateBufferLocal, this->stateBufferLocal, this->statePdfIndexSortLocal, processDataLength);
    PipeBarrier<PIPE_V>();
    Muls(overflowFlagLocal, overflowFlagLocal, (int32_t)(sizeof(uint16_t) * BYTE_BIT_NUM), BLOCK_SIZE);
    PipeBarrier<PIPE_V>();
    Sub(this->stateBufferBitNumberLocal, this->stateBufferBitNumberLocal, overflowFlagLocal, BLOCK_SIZE);
    PipeBarrier<PIPE_V>();
    GatherMask(this->outputDeviceLocal.template ReinterpretCast<uint16_t>()[this->compressBufferOffset],
               this->bitLevelMaskZeroLocal.template ReinterpretCast<uint16_t>(), 1, false, 0, {1, 1, 8, 0},
               this->rsvdCnt);
    PipeBarrier<PIPE_V>();
  }

  __aicore__ inline void CopyOut(int32_t processDataLength) {
    SetFlag<HardEvent::V_MTE3>(this->eventManager.eventVMTE3Ping);
    WaitFlag<HardEvent::V_MTE3>(this->eventManager.eventVMTE3Ping);
    DataCopyParams copyParams{1, static_cast<uint16_t>((BLOCK_SIZE + this->compressBufferOffset) * 2), 0, 0};
    DataCopyPad(this->fixedGm[this->currentCoreOutputAcculmulateSize],
                this->outputDeviceLocal.template ReinterpretCast<uint8_t>(), copyParams);
    this->currentCoreOutputAcculmulateSize += (BLOCK_SIZE + this->compressBufferOffset) * sizeof(uint16_t);
    this->currentCorePrcoessAcculmulateSize += processDataLength;
    this->currentCoreMantissaAcculmulateSize += processDataLength * (this->dtypeSize - 1);
  }

  __aicore__ inline void DeviceTailProcess() {
    GatherMask(this->outputDeviceLocal.template ReinterpretCast<uint16_t>(),
               this->stateBufferLocal.template ReinterpretCast<uint16_t>(), 1, false, 0, {1, BLOCK_SIZE, 8, 0},
               this->rsvdCnt);
    SetFlag<HardEvent::V_MTE3>(this->eventManager.eventVMTE3Ping);
    WaitFlag<HardEvent::V_MTE3>(this->eventManager.eventVMTE3Ping);
    PipeBarrier<PIPE_MTE3>();
    DataCopyPad(this->fixedGm[this->currentCoreOutputAcculmulateSize],
                this->outputDeviceLocal.template ReinterpretCast<uint8_t>(), {1, static_cast<uint16_t>(8192), 0, 0});
    this->currentCoreOutputAcculmulateSize += EACH_LOOOP_PROCESS_NUM * sizeof(uint16_t);
    SetFlag<HardEvent::MTE3_V>(this->eventManager.eventMTE3VPing);
    WaitFlag<HardEvent::MTE3_V>(this->eventManager.eventMTE3VPing);
    Adds(this->outputDeviceLocal.template ReinterpretCast<int32_t>(),
         this->stateBufferBitNumberLocal.template ReinterpretCast<int32_t>(), 0, BLOCK_SIZE);
    SetFlag<HardEvent::V_MTE3>(this->eventManager.eventVMTE3Ping);
    WaitFlag<HardEvent::V_MTE3>(this->eventManager.eventVMTE3Ping);
    PipeBarrier<PIPE_MTE3>();
    DataCopyPad(this->fixedGm[this->currentCoreOutputAcculmulateSize],
                this->outputDeviceLocal.template ReinterpretCast<uint8_t>(),
                {1, static_cast<uint16_t>(EACH_LOOOP_REPEAT_TIMES * sizeof(int32_t)), 0, 0});
    this->currentCoreOutputAcculmulateSize += BLOCK_SIZE * sizeof(int32_t);
    PipeBarrier<PIPE_MTE3>();
  }

  __aicore__ inline void Sync() {
    this->remainInputSize = this->processLoopCurrentCore * BLOCK_SIZE - this->currentCorePrcoessAcculmulateSize;
    SetFlag<HardEvent::V_S>(this->eventManager.eventVSPing);
    WaitFlag<HardEvent::V_S>(this->eventManager.eventVSPing);
    SetFlag<HardEvent::MTE3_S>(this->eventManager.eventMTE3SPing);
    WaitFlag<HardEvent::MTE3_S>(this->eventManager.eventMTE3SPing);

    int32_t magicNumber = 12138;
    int32_t sumUseCoreIndex = 1;
    int32_t sumLoopsIndex = 2;
    int32_t sumFixedLoopsIndex = 3;
    int32_t sumVarLoopsIndex = 4;

    this->encodeHeaderLocal.SetValue(0, magicNumber);
    this->encodeHeaderLocal.SetValue(sumUseCoreIndex, 1);
    this->encodeHeaderLocal.SetValue(sumLoopsIndex, (int32_t)this->processLoopCurrentCore);
    this->encodeHeaderLocal.SetValue(sumFixedLoopsIndex, this->currentCorePrcoessAcculmulateSize / BLOCK_SIZE);
    this->encodeHeaderLocal.SetValue(sumVarLoopsIndex, this->remainInputSize / BLOCK_SIZE);
    this->encodeHeaderLocal.SetValue(DEVICE_START_IDX + GetBlockIdx(), this->currentCoreOutputAcculmulateSize);
    this->encodeHeaderLocal.SetValue(HOST_START_IDX + GetBlockIdx(), this->remainInputSize);
    SyncAll();
    SetAtomicAdd<int32_t>();
    DataCopyParams copyParams{1, static_cast<uint16_t>(512), 0, 0};
    DataCopyPad(this->outputDeviceHeaderGm, this->encodeHeaderLocal, copyParams);
    SetAtomicNone();
    SyncAll();
  }

  __aicore__ inline void ReshuffDevice() {
    SyncAll();
    int32_t processByteEachLoop = 16384;
    static constexpr int32_t doubleBufferNum = 2;
    this->pipe->InitBuffer(this->inputQue, doubleBufferNum, processByteEachLoop);
    LocalTensor<uint8_t> inputLocalList[doubleBufferNum] = {this->inputQue.template AllocTensor<uint8_t>(),
                                                            this->inputQue.template AllocTensor<uint8_t>()};

    DataCopyPad(this->encodeHeaderLocal, this->outputDeviceHeaderGm, {1, static_cast<uint16_t>(512), 0, 0},
                {true, 0, 0, 0});
    PipeBarrier<PIPE_ALL>();
    DataCopyPad(this->workspaceGm, this->encodeHeaderLocal.template ReinterpretCast<uint8_t>(),
                {1, static_cast<uint16_t>(512), 0, 0});
    PipeBarrier<PIPE_ALL>();
    int64_t outputOffsetNum = 512;
    for (int32_t k = 0; k < GetBlockIdx(); ++k) {
      outputOffsetNum = outputOffsetNum + this->encodeHeaderLocal.GetValue(DEVICE_START_IDX + k);
    }

    int64_t dataSizeRemain = this->encodeHeaderLocal.GetValue(8 + GetBlockIdx());
    int64_t inputOffsetNum = 0;

    SetFlag<HardEvent::MTE3_MTE2>(this->eventManager.eventMTE3MTE2Ping);
    SetFlag<HardEvent::MTE3_MTE2>(this->eventManager.eventMTE3MTE2Pong);
    for (int32_t i = 0; dataSizeRemain > 0; i++) {
      uint32_t size = dataSizeRemain > processByteEachLoop ? processByteEachLoop : dataSizeRemain;
      int32_t index = (i & 1) ? 0 : 1;
      TEventID MTE2MTE3 = (i & 1) ? this->eventManager.eventMTE2MTE3Ping : this->eventManager.eventMTE2MTE3Pong;
      TEventID MTE3MTE2 = (i & 1) ? this->eventManager.eventMTE3MTE2Ping : this->eventManager.eventMTE3MTE2Pong;
      WaitFlag<HardEvent::MTE3_MTE2>(MTE3MTE2);
      DataCopyPad(inputLocalList[index], this->fixedGm[inputOffsetNum], {1, static_cast<uint16_t>(size), 0, 0},
                  {true, 0, 0, 0});
      SetFlag<HardEvent::MTE2_MTE3>(MTE2MTE3);
      WaitFlag<HardEvent::MTE2_MTE3>(MTE2MTE3);
      DataCopyPad(this->workspaceGm[outputOffsetNum], inputLocalList[index], {1, static_cast<uint16_t>(size), 0, 0});
      dataSizeRemain -= processByteEachLoop;
      inputOffsetNum += processByteEachLoop;
      outputOffsetNum += processByteEachLoop;
      SetFlag<HardEvent::MTE3_MTE2>(MTE3MTE2);
    }
    WaitFlag<HardEvent::MTE3_MTE2>(this->eventManager.eventMTE3MTE2Ping);
    WaitFlag<HardEvent::MTE3_MTE2>(this->eventManager.eventMTE3MTE2Pong);

    this->inputQue.template FreeTensor<uint8_t>(inputLocalList[0]);
    this->inputQue.template FreeTensor<uint8_t>(inputLocalList[1]);
  }

  __aicore__ inline void InitBufferInHostTail(int32_t processByteEachLoop){
    this->pipe->InitBuffer(this->inputQue, doubleBufferNum, processByteEachLoop);
    this->pipe->InitBuffer(this->outputMantissaQue, doubleBufferNum,
                           processByteEachLoop / this->dtypeSize * (this->dtypeSize - 1));
    this->pipe->InitBuffer(this->outputHostQue, doubleBufferNum, processByteEachLoop / this->dtypeSize);
    this->pipe->InitBuffer(this->calcBuf, CALC_BUF_SIZE);
    for(int32_t i = 0; i < doubleBufferNum; i++){
      inputLocalList[i] = this->inputQue.template AllocTensor<dataType>();
      outputHostLocalList[i] = this->outputHostQue.template AllocTensor<uint8_t>();
      outputMantissaLocalList[i] = this->outputMantissaQue.template AllocTensor<uint8_t>();
    }
  }

  __aicore__ inline void AllocTensorInHostTail(int32_t processByteEachLoop){
      this->bufOffset = 0;
      for(int i = 0; i <  doubleBufferNum; i++){
        inputReaminFp16LocalList[i] = calcBuf.GetWithOffset<half>(processByteEachLoop, 
                                      bufOffset + processByteEachLoop * doubleBufferNum * i);
      }
      this->bufOffset += processByteEachLoop * doubleBufferNum * sizeof(half);

      for(int i = 0; i <  doubleBufferNum; i++){
        inputReaminExpFp16LocalList[i] = calcBuf.GetWithOffset<half>((processByteEachLoop / this->dtypeSize), 
                          bufOffset +  (processByteEachLoop / this->dtypeSize) * doubleBufferNum * i);
      }
      this->bufOffset += (processByteEachLoop / this->dtypeSize) * doubleBufferNum * sizeof(half);
      for(int i = 0; i <  doubleBufferNum; i++){
        inputReaminMantissaFp16LocalList[i] = 
              calcBuf.GetWithOffset<half>((processByteEachLoop / this->dtypeSize) * (this->dtypeSize - 1), 
              bufOffset + (processByteEachLoop / this->dtypeSize) * (this->dtypeSize - 1) * doubleBufferNum * i);
      }
      this->bufOffset += (processByteEachLoop / this->dtypeSize) * 
                          (this->dtypeSize - 1) * doubleBufferNum * sizeof(half);
      this->encodeHeaderLocal = calcBuf.GetWithOffset<int32_t>(META_INFO_NUM, this->bufOffset);
      this->bufOffset += META_INFO_NUM * sizeof(int32_t);
      this->fp32MantissaMaskLocal = calcBuf.GetWithOffset<uint16_t>(MANTISSA_MASK_NUM, this->bufOffset);
      this->bufOffset += MANTISSA_MASK_NUM * sizeof(uint16_t);
  }

  __aicore__ inline void ProcessInHostTailDb(int32_t loopId,uint32_t size , int64_t& inputOffsetNum,
                        int64_t& ouputHostExpOffsetNum, int64_t& outputMantissaOffsetNum){
      int32_t index = (loopId & 1) ? 0 : 1;
      TEventID MTE3MTE2 = (loopId & 1) ? this->eventManager.eventMTE3MTE2Ping : this->eventManager.eventMTE3MTE2Pong;
      TEventID MTE2V = (loopId & 1) ? this->eventManager.eventMTE2VPing : this->eventManager.eventMTE2VPong;
      TEventID VMTE3 = (loopId & 1) ? this->eventManager.eventVMTE3Ping : this->eventManager.eventVMTE3Pong;
      WaitFlag<HardEvent::MTE3_MTE2>(MTE3MTE2);
      DataCopyParams copyParams{1, static_cast<uint16_t>(size), 0, 0};
      DataCopyPadParams padParams{true, 0, 0, 0};
      DataCopyPad(inputLocalList[index], this->inputGm[inputOffsetNum], copyParams, padParams);
      SetFlag<HardEvent::MTE2_V>(MTE2V);
      WaitFlag<HardEvent::MTE2_V>(MTE2V);
      Cast(inputReaminFp16LocalList[index], inputLocalList[index].template ReinterpretCast<uint8_t>(),
           RoundMode::CAST_NONE, size);
      PipeBarrier<PIPE_V>();
      uint8_t expPattern = this->dtypeSize == sizeof(int32_t) ? 6 : 2;
      GatherMaskParams expParams;
      expParams.src0BlockStride = 1;
      expParams.repeatTimes = static_cast<uint8_t>(size / (EACH_REPEAT_BYTES / sizeof(uint16_t)));
      expParams.src0RepeatStride = REPEAT_STRIDE;
      expParams.src1RepeatStride = 0;
      GatherMask(inputReaminExpFp16LocalList[index], inputReaminFp16LocalList[index], expPattern, false, 0, expParams,
                 this->rsvdCnt);
      PipeBarrier<PIPE_V>();
      if (this->dtypeSize == sizeof(int32_t)) {
        GatherMask(
            inputReaminMantissaFp16LocalList[index].template ReinterpretCast<uint16_t>(),
            inputReaminFp16LocalList[index].template ReinterpretCast<uint16_t>(),
            this->fp32MantissaMaskLocal.template ReinterpretCast<uint16_t>(), true, expParams.repeatTimes * EACH_REPEAT_BYTES / sizeof(uint16_t),
            {1, 1, 8, 0}, this->rsvdCnt);
      } else {
        GatherMask(inputReaminMantissaFp16LocalList[index], inputReaminFp16LocalList[index], 1, false, 0, expParams,
                   this->rsvdCnt);
      }
      PipeBarrier<PIPE_V>();
      Cast(outputHostLocalList[index], inputReaminExpFp16LocalList[index], RoundMode::CAST_TRUNC,
           size / this->dtypeSize);
      PipeBarrier<PIPE_V>();
      Cast(outputMantissaLocalList[index], inputReaminMantissaFp16LocalList[index], RoundMode::CAST_TRUNC,
           size / this->dtypeSize * (this->dtypeSize - 1));
      PipeBarrier<PIPE_V>();
      SetFlag<HardEvent::V_MTE3>(VMTE3);
      WaitFlag<HardEvent::V_MTE3>(VMTE3);
      DataCopyPad(this->varGm[ouputHostExpOffsetNum], outputHostLocalList[index],
                  {1, static_cast<uint16_t>(size / this->dtypeSize), 0, 0});
      DataCopyPad(this->outputMantissaGm[outputMantissaOffsetNum], outputMantissaLocalList[index],
                  {1, static_cast<uint16_t>(size - size / this->dtypeSize), 0, 0});
      SetFlag<HardEvent::MTE3_MTE2>(MTE3MTE2);
      inputOffsetNum += size / this->dtypeSize;
      ouputHostExpOffsetNum += size / this->dtypeSize;
      outputMantissaOffsetNum += size / this->dtypeSize * (this->dtypeSize - 1);
  }

  __aicore__ inline void HostTailProcess() {
    SyncAll();
    if (this->remainInputSize == 0) {
      return;
    }

    int32_t processByteEachLoop = 12288;
    InitBufferInHostTail(processByteEachLoop);
    AllocTensorInHostTail(processByteEachLoop);
    Duplicate<uint16_t>(this->fp32MantissaMaskLocal, (uint16_t)30583ULL, MANTISSA_MASK_NUM);
    PipeBarrier<PIPE_ALL>();

    DataCopyPad(this->encodeHeaderLocal, this->outputDeviceHeaderGm, {1, static_cast<uint16_t>(512), 0, 0},
                {true, 0, 0, 0});
    int64_t hostGmOffset = 0;
    for (int32_t k = 0; k < GetBlockIdx(); ++k) {
      hostGmOffset = hostGmOffset + this->encodeHeaderLocal.GetValue(HOST_START_IDX + k);
    }

    int64_t inputOffsetNum = currentCorePrcoessAcculmulateSize;
    int64_t outputMantissaOffsetNum = currentCoreMantissaAcculmulateSize;
    int64_t ouputHostExpOffsetNum = hostGmOffset;
    int64_t dataSizeRemain = this->remainInputSize * this->dtypeSize;
    SetFlag<HardEvent::MTE3_MTE2>(this->eventManager.eventMTE3MTE2Ping);
    SetFlag<HardEvent::MTE3_MTE2>(this->eventManager.eventMTE3MTE2Pong);
    for (int32_t i = 0; dataSizeRemain > 0; i++) {
      uint32_t size = dataSizeRemain > processByteEachLoop ? processByteEachLoop : dataSizeRemain;
      ProcessInHostTailDb(i, size, inputOffsetNum, ouputHostExpOffsetNum, outputMantissaOffsetNum);
      dataSizeRemain -= size;
    }
    WaitFlag<HardEvent::MTE3_MTE2>(this->eventManager.eventMTE3MTE2Ping);
    WaitFlag<HardEvent::MTE3_MTE2>(this->eventManager.eventMTE3MTE2Pong);
    PipeBarrier<PIPE_ALL>();
    this->inputQue.template FreeTensor<dataType>(inputLocalList[0]);
    this->inputQue.template FreeTensor<dataType>(inputLocalList[1]);
    this->outputHostQue.template FreeTensor<uint8_t>(outputHostLocalList[0]);
    this->outputHostQue.template FreeTensor<uint8_t>(outputHostLocalList[1]);
    this->outputMantissaQue.template FreeTensor<uint8_t>(outputMantissaLocalList[0]);
    this->outputMantissaQue.template FreeTensor<uint8_t>(outputMantissaLocalList[1]);
  }

 private:
  // tiling param
  bool reshuff;
  int32_t dtypeSize;
  int32_t processCoreDim;
  int64_t processLoopPerCore;
  int64_t processLoopCurrentCore;
  int32_t tileDataLength;
  int32_t tileNum;
  int32_t tileRemain;
  // calculate param
  int32_t bufOffset = 0;
  int64_t outputDeviceSizeCurrentCore;
  int64_t outputHostSizeCurrentCore;
  int64_t currentCoreOutputAcculmulateSize = 0;
  int64_t currentCorePrcoessAcculmulateSize = 0;
  int64_t currentCoreMantissaAcculmulateSize = 0;
  int64_t remainInputSize = 0;
  float const16Float = 2.2420775429197073e-44;
  uint64_t rsvdCnt;
  uint64_t compressBufferOffset;
  int32_t debugOffset = 0;

  GlobalTensor<dataType> inputGm;
  GlobalTensor<int32_t> pdfGm;
  GlobalTensor<uint8_t> fixedGm;
  GlobalTensor<uint8_t> workspaceGm;
  GlobalTensor<uint8_t> outputMantissaGm;
  GlobalTensor<uint8_t> varGm;
  GlobalTensor<int32_t> outputDeviceHeaderGm;
  // input
  LocalTensor<dataType> inputLocal;
  LocalTensor<int32_t> pdfLocal;
  // output
  LocalTensor<uint16_t> outputDeviceLocal;
  LocalTensor<uint8_t> outputHostLocal;
  LocalTensor<uint8_t> outputMantissaLocal;
  // calculate
  LocalTensor<half> inputHalfLocal;
  LocalTensor<int32_t> pdfSortIndexLocal;
  LocalTensor<int32_t> ConstOneLocal;
  LocalTensor<int32_t> ConstZeroLocal;
  LocalTensor<uint16_t> fp32MantissaMaskLocal;
  LocalTensor<half> inputExpHalfLocal;
  LocalTensor<int32_t> inputExpUint32Local;
  LocalTensor<int32_t> statePdfIndexSortLocal;
  LocalTensor<int32_t> statePdfMaxUseBitLocal;
  LocalTensor<int32_t> bitLevelMaskZeroLocal;
  LocalTensor<int32_t> stateBufferLocal;
  LocalTensor<int32_t> stateBufferBitNumberLocal;
  LocalTensor<int32_t> encodeHeaderLocal;
  LocalTensor<int32_t> bitShlMulScaleLocal;
  LocalTensor<int32_t> reduceLowBlockMaskLocal;

  TPipe* pipe = nullptr;
  TQue<TPosition::VECIN, 1> inputQue;
  TQue<TPosition::VECIN, 1> pdfQue;
  TBuf<TPosition::VECCALC> calcBuf;
  TQue<TPosition::VECOUT, 1> outputMantissaQue;
  TQue<TPosition::VECOUT, 1> outputDeviceQue;
  TQue<TPosition::VECOUT, 1> outputHostQue;
  HansCommonNs::EventManager eventManager;

  static constexpr int32_t doubleBufferNum = 2;
  LocalTensor<dataType> inputLocalList[doubleBufferNum];
  LocalTensor<uint8_t> outputHostLocalList[doubleBufferNum];
  LocalTensor<uint8_t> outputMantissaLocalList[doubleBufferNum];
  LocalTensor<half> inputReaminFp16LocalList[doubleBufferNum];
  LocalTensor<half> inputReaminExpFp16LocalList[doubleBufferNum];
  LocalTensor<half> inputReaminMantissaFp16LocalList[doubleBufferNum];
};
}  // namespace HansEncodeNS
#endif  // HANS_ENCODE_BASE_H