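// File: sip/ops/fft/common/include/kernel/r2c_c2r_common.cceh
// Project: sip - a signal-processing acceleration library for Huawei Ascend AI processors (code preview from AtomGit).
// Revision: commit 3c3c7161, created 2025-11-05.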
#include "kernel_operator.h"

#include "../../../../include/common/mem.h"


/**
 * @brief Writes the first and last complex bins of every batch of the R2C output.
 *
 * For batch b the first complex value (R0, I0) is read from
 * gm_c2c_output[b * 2 * fft_n] / [b * 2 * fft_n + 1]. Two complex values are
 * then stored into the output row of that batch, which is (2 * fft_n + 2)
 * floats long:
 *   - floats [0, 1]                   = (R0 + I0, 0)
 *   - floats [2 * fft_n, 2 * fft_n+1] = (R0 - I0, 0)
 * (Presumably these are the DC and Nyquist bins of a real FFT that was computed
 * through a half-length complex FFT - confirm against the caller.)
 *
 * Work split:
 *   - fft_n <= 128: vector core 0 handles every batch; other cores return.
 *     Note that this path returns without the trailing PipeBarrier.
 *   - otherwise: core 0 writes both bins of batch 0, and iteration i (assigned
 *     round-robin over aiv_num cores) writes the last bin of batch i together
 *     with the first bin of batch i + 1, which are adjacent in the output. The
 *     iteration for the second-to-last batch also writes the last bin of the
 *     final batch.
 *
 * All element offsets are computed in 64 bits so that large batch_size * fft_n
 * products do not wrap, and an empty batch is a no-op.
 *
 * @param gm_c2c_output  Global-memory buffer read as 2 * fft_n floats per batch.
 * @param gm_output      Global-memory buffer written as 2 * fft_n + 2 floats per batch.
 * @param batch_size     Number of batches; 0 returns immediately.
 * @param fft_n          Half of the per-batch float count of gm_c2c_output.
 */
__aicore__ __inline__ void R2C_tail_calc(
    __gm__ float * __restrict__ gm_c2c_output,
    __gm__ float * __restrict__ gm_output,
    uint64_t batch_size,
    uint64_t fft_n)
{
    // Guard against unsigned wrap-around of "batch_size - 1" further down, which
    // would otherwise touch memory outside both buffers.
    if (batch_size == 0) {
        return;
    }

    AscendC::GlobalTensor<float> c2c_output_gm_tensor;
    AscendC::GlobalTensor<float> output_gm_tensor;
    c2c_output_gm_tensor.SetGlobalBuffer(gm_c2c_output);
    output_gm_tensor.SetGlobalBuffer(gm_output);

    const uint64_t N = fft_n * 2;          // floats per batch in the input
    const uint64_t out_stride = N + 2;     // floats per batch in the output
    const uint32_t aiv_id = AscendC::GetBlockIdx();
    const uint32_t aiv_num = AscendC::GetBlockNum() * 2;

    if (fft_n <= 128) {
        // Small transforms: a single core walks all batches.
        if (aiv_id != 0) {
            return;
        }
        for (uint64_t i = 0; i < batch_size; ++i) {
            const uint64_t offset = i * N;
            const uint64_t offset_out = i * out_stride;
            // Scalar load/store on global memory (original note: relies on 910B LD/ST behaviour).
            const float temp_R0 = c2c_output_gm_tensor.GetValue(offset);
            const float temp_I0 = c2c_output_gm_tensor.GetValue(offset + 1);
            output_gm_tensor.SetValue(offset_out, temp_R0 + temp_I0);
            output_gm_tensor.SetValue(offset_out + 1, 0.0f);
            output_gm_tensor.SetValue(offset_out + N, temp_R0 - temp_I0);
            output_gm_tensor.SetValue(offset_out + N + 1, 0.0f);
        }
        return;
    }

    if (aiv_id == 0) {
        // Batch 0 has no predecessor iteration to write its first bin, so core 0
        // handles it here (this also covers the batch_size == 1 case completely).
        const float temp_R0 = c2c_output_gm_tensor.GetValue(0);
        const float temp_I0 = c2c_output_gm_tensor.GetValue(1);
        output_gm_tensor.SetValue(0, temp_R0 + temp_I0);
        output_gm_tensor.SetValue(1, 0.0f);
        output_gm_tensor.SetValue(N, temp_R0 - temp_I0);
        output_gm_tensor.SetValue(N + 1, 0.0f);
    }

    // "i + 1 < batch_size" is the wrap-free form of "i < batch_size - 1".
    for (uint64_t i = 0; i + 1 < batch_size; ++i) {
        if (i % aiv_num != aiv_id) {
            continue;
        }
        const uint64_t offset = i * N;
        const uint64_t offset_out = i * out_stride;

        // Scalar load/store on global memory (original note: relies on 910B LD/ST behaviour).
        const float temp_R0 = c2c_output_gm_tensor.GetValue(offset);
        const float temp_I0 = c2c_output_gm_tensor.GetValue(offset + 1);
        const float temp_R1 = c2c_output_gm_tensor.GetValue(offset + N);
        const float temp_I1 = c2c_output_gm_tensor.GetValue(offset + N + 1);

        // Last bin of batch i ...
        output_gm_tensor.SetValue(offset_out + N, temp_R0 - temp_I0);
        output_gm_tensor.SetValue(offset_out + N + 1, 0.0f);
        // ... and first bin of batch i + 1: the two complex values are contiguous in the output.
        output_gm_tensor.SetValue(offset_out + N + 2, temp_R1 + temp_I1);
        output_gm_tensor.SetValue(offset_out + N + 3, 0.0f);

        // The final batch has no successor iteration, so its last bin is written here.
        if (i + 2 == batch_size) {
            output_gm_tensor.SetValue(offset_out + N + 2 + N, temp_R1 - temp_I1);
            output_gm_tensor.SetValue(offset_out + N + 2 + N + 1, 0.0f);
        }
    }
    AscendC::PipeBarrier<PIPE_ALL>();
}


/**
 * @brief Vector-core stage that produces the interior part of each R2C output row.
 *
 * For every batch in [batch_id_begin, batch_id_end) the function reads N - 2
 * floats from gm_input starting at float offset 2 of the batch's N-float row,
 * and writes N - 2 floats to gm_output starting at float offset 2 of the
 * batch's (N + 2)-float row. The data is processed in chunks of at most BIASC
 * floats; (chunk, batch) work items are distributed round-robin over aiv_num
 * vector cores via loop_id % aiv_num.
 *
 * Per work item the pipeline is:
 *   1. copy the chunk from global memory into the unified buffer (UB);
 *   2. split it into two de-interleaved streams with vreducev2 (the original
 *      comments call them real and imaginary parts);
 *   3. build a reversed copy with vgather + reverse_index (for multi-chunk
 *      inputs a second chunk is loaded from the mirrored position first) and
 *      negate one of the reversed streams (original comment: "conjugate");
 *   4. complex-multiply the forward streams by coefficients A and the reversed
 *      streams by coefficients B, then add both products;
 *   5. vgather through vgather_index (presumably re-interleaving the two
 *      result streams - confirm against the index table) and copy the result
 *      back to global memory.
 * Two buffer sets ("ping"/"pong") and event ids EVENT_ID0/EVENT_ID1 alternate
 * between consecutive work items.
 *
 * BIASC, BIAS, BLOCK_SIZE_8 and UB_VEC_SIZE are defined outside this chunk;
 * the inline size remarks (512 / 2048 / 4096) come from the original comments
 * and have not been verified here.
 *
 * @param gm_input         Input rows, N floats per batch.
 * @param gm_input_index   Index table used by vgather to reverse a chunk.
 * @param gm_a             Coefficient table A; one part is read from index
 *                         offset/2 + 1 and the other N / 2 floats further on.
 * @param gm_b             Coefficient table B, laid out like gm_a.
 * @param gm_output_index  Index table used by the final vgather.
 * @param gm_output        Output rows, N + 2 floats per batch.
 * @param N                Floats per input row.
 * @param batch_size       Used for loop_id in the non-MIX_CORE build only.
 * @param batch_id_begin   First batch handled by this call (inclusive).
 * @param batch_id_end     Last batch handled by this call (exclusive).
 */
__aicore__ __inline__ void R2C(
    __gm__ float * __restrict__ gm_input,
    __gm__ uint32_t * __restrict__ gm_input_index,
    __gm__ float * __restrict__ gm_a,
    __gm__ float * __restrict__ gm_b,
    __gm__ uint32_t * __restrict__ gm_output_index,
    __gm__ float * __restrict__ gm_output,
    uint64_t N,
    uint64_t batch_size,
    uint64_t batch_id_begin,
    uint64_t batch_id_end)
    {
        // Reset the vector unit state this function relies on.
        AscendC::SetAtomicNone();
        AscendC::SetMaskNorm();
        AscendC::SetVectorMask<float>((uint64_t)-1, (uint64_t)-1);

        AscendC::GlobalTensor<float> input_gm_tensor;
        AscendC::GlobalTensor<float> output_gm_tensor;
        AscendC::GlobalTensor<uint32_t> input_index_gm_tensor;
        AscendC::GlobalTensor<uint32_t> output_index_gm_tensor;
        AscendC::GlobalTensor<float> a_gm_tensor;
        AscendC::GlobalTensor<float> b_gm_tensor;

        input_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_input));
        output_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_output));
        input_index_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t *>(gm_input_index));
        output_index_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t *>(gm_output_index));
        a_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_a));
        b_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_b));

        // Mask with only the low 64 bits set; passed to every Mul/Add/Sub/Muls below.
        uint64_t vecMask[2] = { UINT64_MAX, 0 };

        uint32_t aiv_id = AscendC::GetBlockIdx();
        uint32_t aiv_num = AscendC::GetBlockNum() * 2;

        // Original note: fetch N values at a time, executed batch_size times.
        // num is the chunk size in floats: N - 2 capped at BIASC.
        uint64_t num = N - 2 > BIASC ? BIASC : N - 2;
        // Number of reverse-index entries to load, rounded up to a multiple of 64.
        int index_size = N >= BIASC ? BIAS : ((N / 2 + 63) / 64) * 64;
        // Burst lengths for DataCopy, i.e. element counts divided by 8 (rounded up).
        uint64_t lenBurstC = (num + 7) / 8;
        uint64_t lenBurstIndex = (index_size + 7) / 8;
        uint64_t lenBurstOutIndex = (BIASC + 7) / 8;
        // Repeat count for the vector multiply/add instructions (num / 2 elements, 64 per repeat).
        uint8_t repeat = (num / 2 + 63) / 64; // Original warning: this can lead to out-of-bounds UB operations.
        // Repeat count for each vector reduce (vreducev2).
        uint8_t repeat_reduce = (num + 63) / 64;
        // Repeat count for the output vgather.
        uint8_t repeat_out = (num + 63) / 64;
        // Original note: real and imaginary parts are fetched together each time.
        // NOTE(review): loopN_times is not used anywhere in this function.
        uint64_t loopN_times = batch_size;
        // Number of BIASC-sized chunks needed to cover the N - 2 values of one row.
        uint64_t repeat_times = (N - 2 + BIASC - 1) / BIASC;

        // Selects the active buffer set; toggled at the end of every processed work item.
        uint64_t ping = 1;

        // Original note: multi-core enabled.
        // UB layout: buffers are addressed in 8192-byte slots. Slots 0-7 form the
        // "ping" set, slots 8-15 the "pong" set, slots 16-19 hold the coefficients
        // and slots 20/21 the two index tables.
        AsdopsBuffer<ArchType::ASCEND_V220> buf;
        AscendC::LocalTensor<float> UB_INPUT1_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(0 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_R1_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(2 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_I1_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(3 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_R1_REVERSE_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(4 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_I1_REVERSE_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(5 * 8192);
        AscendC::LocalTensor<float> tvub1_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(6 * 8192);
        AscendC::LocalTensor<float> tvub2_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(7 * 8192);

        AscendC::LocalTensor<float> UB_INPUT2_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(8 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_R2_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(10 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_I2_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(11 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_R2_REVERSE_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(12 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_I2_REVERSE_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(13 * 8192);
        AscendC::LocalTensor<float> tvub3_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(14 * 8192);
        AscendC::LocalTensor<float> tvub4_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(15 * 8192);

        AscendC::LocalTensor<float> MatrixA_Real_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(16 * 8192);
        AscendC::LocalTensor<float> MatrixA_Imag_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(17 * 8192);
        AscendC::LocalTensor<float> MatrixB_Real_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(18 * 8192);
        AscendC::LocalTensor<float> MatrixB_Imag_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(19 * 8192);

        AscendC::LocalTensor<uint32_t> reverse_index_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, uint32_t>(20 * 8192);
        AscendC::LocalTensor<uint32_t> vgather_index_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, uint32_t>(21 * 8192);


#ifndef MIX_CORE
        // Non-MIX_CORE build: both index tables are loaded once, up front.
        AscendC::DataCopy(reverse_index_ub_tensor, input_index_gm_tensor, AscendC::DataCopyParams(1, lenBurstIndex, 0, 0));  // Original note: loads a length of 512.
        AscendC::PipeBarrier<PIPE_MTE2>();
        AscendC::DataCopy(vgather_index_ub_tensor, output_index_gm_tensor, AscendC::DataCopyParams(1, lenBurstOutIndex, 0, 0));  // Original note: loads a length of 4096, which spills into the following slot.
        AscendC::PipeBarrier<PIPE_MTE2>();
#else
        // MIX_CORE build: only the output index table is loaded here; the reverse
        // index table is (re)loaded per chunk inside the loop below.
        AscendC::DataCopy(vgather_index_ub_tensor, output_index_gm_tensor, AscendC::DataCopyParams(1, lenBurstOutIndex, 0, 0)); // Original note: loads 4096 values.
        AscendC::PipeBarrier<PIPE_MTE2>();
#endif

        // Pre-set both event ids for both event kinds so that the first WaitFlag on
        // each buffer set inside the loop does not block. They are drained again by
        // the matching WaitFlag calls after the loop.
        AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID0);
        AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID1);

        AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID0);
        AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID1);

        for(uint64_t loop = 0; loop < repeat_times; loop++) {       // Loop along the sequence direction (one chunk per iteration).

            int64_t offset_loopc = loop * BIASC;      // Number of floats already covered by previous chunks.
            int actual_n = N - 2;

#ifndef MIX_CORE
            // Size of this chunk: the whole row if it fits in one chunk, BIASC for a
            // full chunk, otherwise the remainder.
            actual_n = actual_n < BIASC ? actual_n : ((loop + 1) * BIASC <= actual_n ? BIASC : (actual_n - loop * BIASC));
            // Aligned chunks use the block-granular DataCopy; others use DataCopyPad.
            bool aligned = (actual_n % BLOCK_SIZE_8 == 0);

            // Coefficients are loaded lazily, once per chunk, by the first work item
            // this core actually processes.
            bool init_A = 0;
            bool init_B = 0;
#else
            int reverse_offset = 0;
            if (repeat_times != 1) {                        // Original note: loop when the N - 2 length exceeds 4096.
                actual_n = (loop + 1) * BIASC <= (N - 2) ? BIASC : ((N - 2) - loop * BIASC);
            }
            
            if (repeat_times != 1 && actual_n != BIASC) {   // A partial chunk needs its own reverse index table, stored after the first one.
                reverse_offset = index_size;
            }

            AscendC::DataCopy(reverse_index_ub_tensor, input_index_gm_tensor[reverse_offset], AscendC::DataCopyParams(1, lenBurstIndex, 0, 0));   // Original note: loads 2048 values.
            AscendC::PipeBarrier<PIPE_MTE2>();

            // NOTE(review): init_A / init_B are declared but not read in the MIX_CORE build.
            bool init_A = 0;
            bool init_B = 0;

            // Load the A and B coefficients for this chunk (actual_n / 2 floats each).
            AscendC::DataCopyPad(MatrixA_Real_ub_tensor, a_gm_tensor[offset_loopc / 2 + 1], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));      // Original note: load the corresponding N - 2 values.
            AscendC::PipeBarrier<PIPE_MTE2>();
            AscendC::DataCopyPad(MatrixA_Imag_ub_tensor, a_gm_tensor[offset_loopc / 2 + N / 2 + 1], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));       // Original note: load the corresponding N - 2 values.
            AscendC::PipeBarrier<PIPE_MTE2>();

            AscendC::DataCopyPad(MatrixB_Real_ub_tensor, b_gm_tensor[offset_loopc / 2 + 1], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
            AscendC::PipeBarrier<PIPE_MTE2>();
            AscendC::DataCopyPad(MatrixB_Imag_ub_tensor, b_gm_tensor[offset_loopc / 2 + N / 2 + 1], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
            AscendC::PipeBarrier<PIPE_MTE2>();
#endif

            for(uint64_t loopN = batch_id_begin; loopN < batch_id_end; loopN+=1) {  // Loop along the batch direction.

                // Linear work-item id; the two builds enumerate (chunk, batch) pairs in
                // a different order.
#ifndef MIX_CORE
                int32_t B_id = loopN - batch_id_begin;
                int32_t N_id = loop;
                int32_t loop_id = N_id * batch_size + B_id;
#else
                int32_t B_id = loopN - batch_id_begin;
                int32_t N_id = loop;
                int32_t loop_id = B_id * repeat_times + N_id;
#endif

                // Round-robin distribution: each core only handles its own work items.
                if (loop_id % aiv_num != aiv_id) {
                    continue;
                }

#ifndef MIX_CORE
                // Lazy one-time load of the A coefficients for this chunk.
                if(aligned && init_A == 0) {
                    AscendC::DataCopy(MatrixA_Real_ub_tensor, a_gm_tensor[offset_loopc / 2 + 1], AscendC::DataCopyParams(1, (lenBurstC + 1) / 2, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    AscendC::DataCopy(MatrixA_Imag_ub_tensor, a_gm_tensor[offset_loopc / 2 + N / 2 + 1], AscendC::DataCopyParams(1, (lenBurstC + 1) / 2, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    init_A = 1;
                } else if (init_A == 0) {
                    AscendC::DataCopyPad(MatrixA_Real_ub_tensor, a_gm_tensor[offset_loopc / 2 + 1], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));    // Original note: load the corresponding N - 2 values.
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    AscendC::DataCopyPad(MatrixA_Imag_ub_tensor, a_gm_tensor[offset_loopc / 2 + N / 2 + 1], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));    // Original note: load the corresponding N - 2 values.
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    init_A = 1;
                }

                // Lazy one-time load of the B coefficients for this chunk.
                if(aligned && init_B == 0) {
                    AscendC::DataCopy(MatrixB_Real_ub_tensor, b_gm_tensor[offset_loopc / 2 + 1], AscendC::DataCopyParams(1, (lenBurstC + 1) / 2, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    AscendC::DataCopy(MatrixB_Imag_ub_tensor, b_gm_tensor[offset_loopc / 2 + N / 2 + 1], AscendC::DataCopyParams(1, (lenBurstC + 1) / 2, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    init_B = 1;
                } else if (init_B == 0) {
                    AscendC::DataCopyPad(MatrixB_Real_ub_tensor, b_gm_tensor[offset_loopc / 2 + 1], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    AscendC::DataCopyPad(MatrixB_Imag_ub_tensor, b_gm_tensor[offset_loopc / 2 + N / 2 + 1], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    init_B = 1;
                }
#endif

                // Pick the buffer set and event id for this work item.
                // NOTE(review): the tensor names (R/I) and the original role comments
                // below do not agree with each other everywhere; the roles that the
                // code actually establishes are: buf2 <- vreducev2 pattern 1,
                // buf1 <- vreducev2 pattern 2.
                auto event_id = ping ? EVENT_ID0 : EVENT_ID1;
                auto buf0_ub_tensor = ping ? UB_INPUT1_ub_tensor : UB_INPUT2_ub_tensor;   // Original note: input, reversal for sizes > 512, A matrix, B matrix, copy-out.
                auto buf1_ub_tensor = ping ? UB_INPUT_R1_ub_tensor: UB_INPUT_R2_ub_tensor;    // Original note: imaginary part of the input.
                auto buf2_ub_tensor = ping ? UB_INPUT_I1_ub_tensor: UB_INPUT_I2_ub_tensor;    // Original note: real part of the input.
                auto buf4_ub_tensor = ping ? UB_INPUT_R1_REVERSE_ub_tensor: UB_INPUT_R2_REVERSE_ub_tensor;    // Original note: real part after reversal (the gather comments below label it FI).
                auto buf5_ub_tensor = ping ? UB_INPUT_I1_REVERSE_ub_tensor: UB_INPUT_I2_REVERSE_ub_tensor;    // Original note: imaginary part after reversal (the gather comments below label it FR).
                auto temp1_ub_tensor = ping ? tvub1_ub_tensor : tvub3_ub_tensor;
                auto temp2_ub_tensor = ping ? tvub2_ub_tensor : tvub4_ub_tensor;
                // Slot numbers of buf1, buf2, temp1 and temp2; multiplied by UB_VEC_SIZE
                // they serve as the source base offset for vgather.
                uint64_t index1 = ping ? 2: 10;
                uint64_t index2 = ping ? 3: 11;
                uint64_t index3 = ping ? 6: 14;
                uint64_t index4 = ping ? 7: 15;

                // Skip the first two floats of both the input row and the output row.
                int64_t offset = 2 + loopN * N;
                int64_t offset_out = 2 + loopN * (N + 2);

                // Wait for the flag set after the previous vector use of this buffer set
                // before loading new input into it.
                AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(event_id);
                //  ********** Load the input ***************

#ifndef MIX_CORE
                if(aligned) {
                    // NOTE(review): this burst length is lenBurstC / 2, whereas the mirrored
                    // load and the store further down use lenBurstC - confirm this is intended.
                    AscendC::DataCopy(buf0_ub_tensor, input_gm_tensor[offset + offset_loopc], AscendC::DataCopyParams(1, lenBurstC / 2, 0, 0));
                } else {
                    AscendC::DataCopyPad(buf0_ub_tensor, input_gm_tensor[offset + offset_loopc], AscendC::DataCopyExtParams(1, actual_n * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
                }
#else
                AscendC::DataCopyPad(buf0_ub_tensor, input_gm_tensor[offset + offset_loopc], AscendC::DataCopyExtParams(1, actual_n * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
#endif

                // Wait for the load to finish, and for the MTE3_V flag of this buffer set
                // (pre-set before the loop, set again after each store).
                AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(event_id);
                AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(event_id);
                AscendC::WaitFlag<AscendC::HardEvent::MTE3_V>(event_id);
                //  ********** Split the input into real and imaginary parts ***************
                // Extract the real parts of the input.
                // todo GatherMask
                vreducev2(
                    reinterpret_cast<__ubuf__ uint32_t *>(buf2_ub_tensor.GetPhyAddr(0)),
                    reinterpret_cast<__ubuf__ uint32_t *>(buf0_ub_tensor.GetPhyAddr(0)),
                    nullptr,
                    repeat_reduce, // repeat
                    1,        // src0BlockStride
                    1,        // patternMode, 101010…10
                    8,        // src0RepeatStride
                    8         // src1RepeatStride
                );
                // Extract the imaginary parts of the input.
                // todo GatherMask
                vreducev2(
                    reinterpret_cast<__ubuf__ uint32_t *>(buf1_ub_tensor.GetPhyAddr(0)),
                    reinterpret_cast<__ubuf__ uint32_t *>(buf0_ub_tensor.GetPhyAddr(0)),
                    nullptr,
                    repeat_reduce, // repeat
                    1,        // src0BlockStride
                    2,        // patternMode, 010101…01
                    8,        // src0RepeatStride
                    8         // src1RepeatStride
                );
                AscendC::PipeBarrier<PIPE_V>();
                //  ********** Reversal ***************
                // The reversal has two cases.
                if(repeat_times == 1) {
                    // A single UB block is enough: gather directly from buf1 / buf2.
                    // Reverse FI (source base: slot index1, i.e. buf1).
                    // todo GatherMask
                    vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf4_ub_tensor.GetPhyAddr(0)), (__ubuf__ unsigned int *)reverse_index_ub_tensor.GetPhyAddr(0), index1 * UB_VEC_SIZE, 8, repeat);
                    // Reverse FR (source base: slot index2, i.e. buf2).
                    // todo GatherMask
                    vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf5_ub_tensor.GetPhyAddr(0)), (__ubuf__ unsigned int *)reverse_index_ub_tensor.GetPhyAddr(0), index2 * UB_VEC_SIZE, 8, repeat);
                    AscendC::PipeBarrier<PIPE_V>();
                    // Conjugate: negate the reversed FI stream.
                    AscendC::Muls(buf4_ub_tensor, buf4_ub_tensor, -1.0f, vecMask, repeat, {1, 1, 8, 8});
                } else {
                    // Original note: data sizes above 512 need several UB blocks.
                    AscendC::PipeBarrier<PIPE_ALL>();

#ifndef MIX_CORE
                    // Load the chunk at the mirrored position of the row into buf0.
                    int64_t offset_loopc_rev = (repeat_times - 1 - loop) * BIASC;
                    AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(event_id);
                    AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(event_id);
                    AscendC::DataCopy(buf0_ub_tensor, input_gm_tensor[offset + offset_loopc_rev - 2], AscendC::DataCopyParams(1, lenBurstC, 0, 0));
#else
                    // int64_t offset_loopc_rev = (repeat_times - 1 - loop) * BIASC;
                    // End of the current chunk, clamped to N - 2; the mirrored chunk starts
                    // that many floats before the end of the row.
                    int64_t offset_block = ((loop + 1) * BIASC >= (N - 2)) ? (N - 2) : (loop * BIASC + BIASC);
                    int64_t offset_loopc_rev = (N - 2) - offset_block;
                    AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(event_id);
                    AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(event_id);
                    AscendC::DataCopyPad(buf0_ub_tensor, input_gm_tensor[offset + offset_loopc_rev], AscendC::DataCopyExtParams(1, actual_n * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
#endif

                    AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(event_id);
                    AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(event_id);
                    // Larger than UB_VEC_SIZE: a block is read from the mirrored end and then
                    // read back in reverse order so that it can take part in the computation.
                    // Extract the real parts of the mirrored block.
                    vreducev2(
                        reinterpret_cast<__ubuf__ uint32_t *>(temp1_ub_tensor.GetPhyAddr(0)),
                        reinterpret_cast<__ubuf__ uint32_t *>(buf0_ub_tensor.GetPhyAddr(0)),
                        nullptr,
                        repeat_reduce, // repeat
                        1,        // src0BlockStride
                        1,        // patternMode, 101010…10
                        8,        // src0RepeatStride
                        8         // src1RepeatStride
                    );
                    // Extract the imaginary parts of the mirrored block.
                    vreducev2(
                        reinterpret_cast<__ubuf__ uint32_t *>(temp2_ub_tensor.GetPhyAddr(0)),
                        reinterpret_cast<__ubuf__ uint32_t *>(buf0_ub_tensor.GetPhyAddr(0)),
                        nullptr,
                        repeat_reduce, // repeat
                        1,        // src0BlockStride
                        2,        // patternMode, 010101…01
                        8,        // src0RepeatStride
                        8         // src1RepeatStride
                    );
                    AscendC::PipeBarrier<PIPE_V>();
                    // Reverse FI (source base: slot index4, i.e. temp2).
                    // todo GatherMask
                    vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf4_ub_tensor.GetPhyAddr(0)), (__ubuf__ unsigned int *)reverse_index_ub_tensor.GetPhyAddr(0), index4 * UB_VEC_SIZE, 8, repeat);
                    // Reverse FR (source base: slot index3, i.e. temp1).
                    // todo GatherMask
                    vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf5_ub_tensor.GetPhyAddr(0)), (__ubuf__ unsigned int *)reverse_index_ub_tensor.GetPhyAddr(0), index3 * UB_VEC_SIZE, 8, repeat);
                    AscendC::PipeBarrier<PIPE_V>();
                    // Conjugate: negate the reversed FI stream.
                    AscendC::Muls(buf4_ub_tensor, buf4_ub_tensor, -1.0f, vecMask, repeat, {1, 1, 8, 8});
                    AscendC::PipeBarrier<PIPE_ALL>();
                }
                AscendC::PipeBarrier<PIPE_V>();
                // Computation with A.
                // Coefficient A is already split into real / imaginary parts.
                //  ********** Computation involving the A matrix ***************
                // UB_BUF4: FI_REV;   UB_BUF5: FR_REV;   UB_BUF2: FR;    UB_BUF1: FI
                // UB_BUF6: AI;   UB_BUF7: AR
                // Complex product of the forward streams with A: four partial products first.
                AscendC::Mul(temp1_ub_tensor, buf1_ub_tensor, MatrixA_Real_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});
                AscendC::Mul(temp2_ub_tensor, buf2_ub_tensor, MatrixA_Real_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});
                AscendC::Mul(buf1_ub_tensor, buf1_ub_tensor, MatrixA_Imag_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});
                AscendC::Mul(buf2_ub_tensor, buf2_ub_tensor, MatrixA_Imag_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});
                AscendC::PipeBarrier<PIPE_V>();

                // From here on buf1 holds the real part and buf2 the imaginary part of the A product.
                AscendC::Sub(buf1_ub_tensor, temp2_ub_tensor, buf1_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FR * AR - FI * AI
                AscendC::Add(buf2_ub_tensor, temp1_ub_tensor, buf2_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FR * AI + FI * AR
                AscendC::PipeBarrier<PIPE_V>();
 
                // Computation with B.
                // Coefficient B is already split into real / imaginary parts.
                //  ********** Computation involving the B matrix ***************
                // UB_BUF4: FI_REV;   UB_BUF5: FR_REV;   UB_BUF2: FR;    UB_BUF1: FI
                // UB_BUF6: BI;   UB_BUF7: BR
                // Complex product of the reversed (conjugated) streams with B.

                AscendC::Mul(temp1_ub_tensor, buf5_ub_tensor, MatrixB_Real_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FrevR * BR
                AscendC::Mul(temp2_ub_tensor, buf4_ub_tensor, MatrixB_Real_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FrevI * BR
                AscendC::Mul(buf5_ub_tensor, buf5_ub_tensor, MatrixB_Imag_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});  // FrevR * BI
                AscendC::Mul(buf4_ub_tensor, buf4_ub_tensor, MatrixB_Imag_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});  // FrevI * BI
                AscendC::PipeBarrier<PIPE_V>();

                // From here on buf4 holds the real part and buf5 the imaginary part of the B product.
                AscendC::Sub(buf4_ub_tensor, temp1_ub_tensor, buf4_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FrevR * BR - FrevI * BI
                AscendC::Add(buf5_ub_tensor, temp2_ub_tensor, buf5_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FrevR * BI + FrevI * BR
                AscendC::PipeBarrier<PIPE_V>();
 
                //  ********** Combine real and imaginary parts ***************
                // Add the A product and the B product.
                AscendC::Add(buf1_ub_tensor, buf1_ub_tensor, buf4_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // real part: A product + B product
                AscendC::Add(buf2_ub_tensor, buf2_ub_tensor, buf5_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // imaginary part: A product + B product
                AscendC::PipeBarrier<PIPE_V>();
                // Original note: UB_BUF8 real, UB_BUF9 imaginary, results are combined;
                // the result ends up in UB_BUF0.
                // The gather reads from slot index1 onwards (buf1, followed by buf2) and
                // writes into buf4, which is the buffer copied out below.
                // todo GatherMask
                vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf4_ub_tensor.GetPhyAddr(0)), (__ubuf__ unsigned int *)vgather_index_ub_tensor.GetPhyAddr(0), index1 * UB_VEC_SIZE, 8, repeat_out);
                // Vector work is done: make the store wait for it, and set V_MTE2 so the
                // next load into this buffer set can pass its WaitFlag.
                AscendC::SetFlag<AscendC::HardEvent::V_MTE3>(event_id);
                AscendC::WaitFlag<AscendC::HardEvent::V_MTE3>(event_id);
                AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(event_id);
                //  ********** Copy out ***************

#ifndef MIX_CORE
                if(aligned) {
                    AscendC::DataCopy(output_gm_tensor[offset_out + offset_loopc], buf4_ub_tensor, AscendC::DataCopyParams(1, lenBurstC, 0, 0));
                } else {
                    AscendC::DataCopyPad(output_gm_tensor[offset_out + offset_loopc], buf4_ub_tensor, AscendC::DataCopyExtParams(1, actual_n * sizeof(float), 0, 0, 0));
                }
#else
                AscendC::DataCopyPad(output_gm_tensor[offset_out + offset_loopc], buf4_ub_tensor, AscendC::DataCopyExtParams(1, actual_n * sizeof(float), 0, 0, 0));
#endif

                // Set MTE3_V for this buffer set (waited on before its next vector use),
                // then switch to the other buffer set.
                AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(event_id);
                ping = 1 - ping;
            }
        }
        // Drain the outstanding flags so that every SetFlag has a matching WaitFlag.
        AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID0);
        AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID1);

        AscendC::WaitFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID0);
        AscendC::WaitFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID1);
}


__aicore__ __inline__ void C2R(
    __gm__ float * __restrict__ gm_input,
    __gm__ uint32_t * __restrict__ gm_input_index,
    __gm__ float * __restrict__ gm_a,
    __gm__ float * __restrict__ gm_b,
    __gm__ uint32_t * __restrict__ gm_output_index,
    __gm__ float * __restrict__ gm_output,
    uint64_t N,
    uint64_t batch_size,
    uint64_t batch_id_begin,
    uint64_t batch_id_end)
    {
        AscendC::SetAtomicNone();
        AscendC::SetMaskNorm();
        AscendC::SetVectorMask<float>((uint64_t)-1, (uint64_t)-1);

        AscendC::GlobalTensor<float> input_gm_tensor;
        AscendC::GlobalTensor<float> output_gm_tensor;
        AscendC::GlobalTensor<uint32_t> input_index_gm_tensor;
        AscendC::GlobalTensor<uint32_t> output_index_gm_tensor;
        AscendC::GlobalTensor<float> a_gm_tensor;
        AscendC::GlobalTensor<float> b_gm_tensor;

        input_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_input));
        output_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_output));
        input_index_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t *>(gm_input_index));
        output_index_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t *>(gm_output_index));
        a_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_a));
        b_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_b));

        uint64_t vecMask[2] = { UINT64_MAX, 0 };

        uint32_t aiv_id = AscendC::GetBlockIdx();
        uint32_t aiv_num = AscendC::GetBlockNum() * 2;

        // 每次取N个数, 执行batch_size次
        uint64_t num = N + 2 > BIASC ? BIASC : N + 2;
        uint64_t out_num = N > BIASC ? BIASC : N; // 为规避Vgather问题引入的暂时变量 // 
        int index_size = (N + 2)>= BIASC ? BIAS : (((N + 2) / 2 + 63) / 64) * 64;
        uint64_t lenBurstC = (num + 7) / 8;
        uint64_t lenBurstOut = ((N > BIASC ? BIASC : N) + 7) / 8;
        uint64_t lenBurstIndex = (index_size + 7) / 8;
        uint64_t lenBurstOutIndex = (BIASC + 7) / 8;
        // 每次vector乘加需要的次数
        uint8_t repeat = (num / 2 + 63) / 64; //  此处会导致越界UB操作  // 512
        // 每次vector reduce需要的次数
        uint8_t repeat_reduce = (num + 63) / 64;
        // 每次vgather output需要的次数
        uint8_t repeat_out = (out_num + 63) / 64;
        // 每次把实数和虚部一起取出来
        uint64_t loopN_times = batch_size;
        // 每次取得数的个数
        uint64_t repeat_times = (N + BIASC - 1) / BIASC;

        uint64_t ping = 1;
        // 开多核

        AsdopsBuffer<ArchType::ASCEND_V220> buf;
        AscendC::LocalTensor<float> UB_INPUT1_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(0 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_R1_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(2 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_I1_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(3 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_R1_REVERSE_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(4 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_I1_REVERSE_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(5 * 8192);
        AscendC::LocalTensor<float> tvub1_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(6 * 8192);
        AscendC::LocalTensor<float> tvub2_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(7 * 8192);

        AscendC::LocalTensor<float> UB_INPUT2_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(8 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_R2_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(10 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_I2_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(11 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_R2_REVERSE_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(12 * 8192);
        AscendC::LocalTensor<float> UB_INPUT_I2_REVERSE_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(13 * 8192);
        AscendC::LocalTensor<float> tvub3_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(14 * 8192);
        AscendC::LocalTensor<float> tvub4_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(15 * 8192);

        AscendC::LocalTensor<float> MatrixA_Real_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(16 * 8192);
        AscendC::LocalTensor<float> MatrixA_Imag_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(17 * 8192);
        AscendC::LocalTensor<float> MatrixB_Real_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(18 * 8192);
        AscendC::LocalTensor<float> MatrixB_Imag_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(19 * 8192);

        AscendC::LocalTensor<uint32_t> reverse_index_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, uint32_t>(20 * 8192);
        AscendC::LocalTensor<uint32_t> vgather_index_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, uint32_t>(21 * 8192);


#ifndef MIX_CORE
        AscendC::DataCopy(reverse_index_ub_tensor, input_index_gm_tensor, AscendC::DataCopyParams(1, lenBurstIndex, 0, 0));  // 搬入512长度
        AscendC::PipeBarrier<PIPE_MTE2>();
#endif

        AscendC::DataCopy(vgather_index_ub_tensor, output_index_gm_tensor, AscendC::DataCopyParams(1, lenBurstOutIndex, 0, 0));  // 搬入4096长度，污染后一个
        AscendC::PipeBarrier<PIPE_MTE2>();

        AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID0);
        AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID1);

        AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID0);
        AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID1);

        for(uint64_t loop = 0; loop < repeat_times; loop++) {       // 序列方向的循环
            
            int64_t offset_loopc = loop * BIASC;      // 已经处理过的长度
            int actual_n = N + 2;

#ifndef MIX_CORE
            actual_n = actual_n < BIASC ? actual_n : ((loop + 1) * BIASC <= actual_n ? BIASC : (actual_n - loop * BIASC));
            bool aligned = (actual_n % BLOCK_SIZE_8 == 0);

            bool init_A = 0;
            bool init_B = 0;
#else
            int reverse_offset = 0;

            if (repeat_times != 1) {                        // 若N-2长度大于4096则循环
                actual_n = (loop + 1) * BIASC <= (N + 2) ? BIASC : ((N + 2) - loop * BIASC);
            }
            
            if (repeat_times != 1 && actual_n != BIASC) {   // 需要读到对应的reverse数组
                reverse_offset = index_size;
            }

            if (N == 4096) {
                actual_n = N;           // 4096长度需要特殊处理 
            }
            AscendC::DataCopy(reverse_index_ub_tensor, input_index_gm_tensor[reverse_offset], AscendC::DataCopyParams(1, lenBurstIndex, 0, 0));   // 搬入2048个数
            AscendC::PipeBarrier<PIPE_MTE2>();
            
            AscendC::DataCopyPad(MatrixA_Real_ub_tensor, a_gm_tensor[offset_loopc / 2], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));      // 搬对应的N-2个数进来
            AscendC::PipeBarrier<PIPE_MTE2>();
            AscendC::DataCopyPad(MatrixA_Imag_ub_tensor, a_gm_tensor[offset_loopc / 2 + N / 2], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));       // 搬对应的N-2个数进来
            AscendC::PipeBarrier<PIPE_MTE2>();

            AscendC::DataCopyPad(MatrixB_Real_ub_tensor, b_gm_tensor[offset_loopc / 2], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
            AscendC::PipeBarrier<PIPE_MTE2>();      // 搬对应的N-2个数进来
            AscendC::DataCopyPad(MatrixB_Imag_ub_tensor, b_gm_tensor[offset_loopc / 2 + N / 2], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));      // 搬对应的N-2个数进来
            AscendC::PipeBarrier<PIPE_MTE2>();

#endif

            for(uint64_t loopN = batch_id_begin; loopN < batch_id_end; loopN+=1) {  // Batch方向的循环

#ifndef MIX_CORE
                int32_t B_id = loopN - batch_id_begin;
                int32_t N_id = loop;
                int32_t loop_id = N_id * batch_size + B_id;
                if (loop_id % aiv_num != aiv_id) {
                    continue;
                }
                
                if(aligned && init_A == 0) {
                    // LOAD A Matrix
                    AscendC::DataCopy(MatrixA_Real_ub_tensor, a_gm_tensor[offset_loopc / 2], AscendC::DataCopyParams(1, (lenBurstC + 1) / 2, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    AscendC::DataCopy(MatrixA_Imag_ub_tensor, a_gm_tensor[offset_loopc / 2 + N / 2], AscendC::DataCopyParams(1, (lenBurstC + 1) / 2, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    init_A = 1;
                } else if (init_A == 0) {
                    AscendC::DataCopyPad(MatrixA_Real_ub_tensor, a_gm_tensor[offset_loopc / 2], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));    // 搬对应的N-2个数进来
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    AscendC::DataCopyPad(MatrixA_Imag_ub_tensor, a_gm_tensor[offset_loopc / 2 + N / 2], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));    // 搬对应的N-2个数进来
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    init_A = 1;
                }
                

                if(aligned && init_B == 0) {
                    // LOAD B Matrix
                    AscendC::DataCopy(MatrixB_Real_ub_tensor, b_gm_tensor[offset_loopc / 2], AscendC::DataCopyParams(1, (lenBurstC + 1) / 2, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    AscendC::DataCopy(MatrixB_Imag_ub_tensor, b_gm_tensor[offset_loopc / 2 + N / 2], AscendC::DataCopyParams(1, (lenBurstC + 1) / 2, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    init_B = 1;
                } else if (init_B == 0) {
                    AscendC::DataCopyPad(MatrixB_Real_ub_tensor, b_gm_tensor[offset_loopc / 2], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    AscendC::DataCopyPad(MatrixB_Imag_ub_tensor, b_gm_tensor[offset_loopc / 2 + N / 2], AscendC::DataCopyExtParams(1, (actual_n / 2) * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
                    AscendC::PipeBarrier<PIPE_MTE2>();
                    init_B = 1;
                }
#else
                int32_t B_id = loopN - batch_id_begin;
                int32_t N_id = loop;
                int32_t loop_id = N_id * batch_size + B_id;
                if (loop_id % aiv_num != aiv_id) {
                    continue;
                }
#endif

                auto event_id = ping ? EVENT_ID0 : EVENT_ID1;
                auto buf0_ub_tensor = ping ? UB_INPUT1_ub_tensor : UB_INPUT2_ub_tensor;   // 输入、大于512倒序、A矩阵、B矩阵、搬出
                auto buf1_ub_tensor = ping ? UB_INPUT_R1_ub_tensor: UB_INPUT_R2_ub_tensor;    // 输入的虚部
                auto buf2_ub_tensor = ping ? UB_INPUT_I1_ub_tensor: UB_INPUT_I2_ub_tensor;    // 输入的实部
                auto buf4_ub_tensor = ping ? UB_INPUT_R1_REVERSE_ub_tensor: UB_INPUT_R2_REVERSE_ub_tensor;    // 倒序后的实部
                auto buf5_ub_tensor = ping ? UB_INPUT_I1_REVERSE_ub_tensor: UB_INPUT_I2_REVERSE_ub_tensor;    // 倒序后的虚部
                auto temp1_ub_tensor = ping ? tvub1_ub_tensor : tvub3_ub_tensor;
                auto temp2_ub_tensor = ping ? tvub2_ub_tensor : tvub4_ub_tensor;
                uint64_t index1 = ping ? 2: 10;
                uint64_t index2 = ping ? 3: 11;
                uint64_t index3 = ping ? 6: 14;
                uint64_t index4 = ping ? 7: 15;

                int64_t offset = loopN * (N + 2);
                int64_t offset_out = loopN * N;

                AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(event_id);
                //  **********搬入***************
                int actual_n = N + 2;
                actual_n = actual_n < BIASC ? actual_n : ((loop + 1) * BIASC <= actual_n ? BIASC : (actual_n - loop * BIASC));

#ifndef MIX_CORE
                bool aligned = (actual_n % BLOCK_SIZE_8 == 0);
#endif

                AscendC::DataCopyPad(buf0_ub_tensor, input_gm_tensor[offset + offset_loopc], AscendC::DataCopyExtParams(1, actual_n * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
                AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(event_id);
                AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(event_id);
                AscendC::WaitFlag<AscendC::HardEvent::MTE3_V>(event_id);
                //  **********C2R预先置0*****************
                // 使用带mask的vsub将首位虚部以及末位虚部置为0
                if (loop == 0) {
                    AscendC::SetVectorMask<float>(-1, 0x0000000000000002);
                    AscendC::Sub<float, false>(buf0_ub_tensor, buf0_ub_tensor, buf0_ub_tensor, AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 8, 8, 8});
                    AscendC::SetVectorMask<float>(-1, -1);
                }
                AscendC::PipeBarrier<PIPE_V>();

#ifndef MIX_CORE
                if (loop == repeat_times - 1) {
#else
                if (loop == repeat_times - 1 && (N % BIASC != 0)) {
#endif

                    int setzero_offset = actual_n / 64;
                    int remain_len = actual_n - (setzero_offset) * 64;
                    int mask_offset = remain_len - 1;
                    if (actual_n % 64 == 0) {
                        AscendC::SetVectorMask<float>(0x0, 0x8000000000000000);
                        AscendC::Sub<float, false>(buf0_ub_tensor[(setzero_offset - 1) * 64], buf0_ub_tensor[(setzero_offset - 1) * 64], buf0_ub_tensor[(setzero_offset - 1) * 64], AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 8, 8, 8});
                    }
                    else {
                        AscendC::SetVectorMask<float>(0x0, (uint64_t) 1 << mask_offset);
                        AscendC::Sub<float, false>(buf0_ub_tensor[setzero_offset * 64], buf0_ub_tensor[setzero_offset * 64], buf0_ub_tensor[setzero_offset * 64], AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 8, 8, 8});
                    }
                    AscendC::SetVectorMask<float>(-1, -1);
                }
                AscendC::PipeBarrier<PIPE_V>();
                //  **********输入虚实分离***************
                //输入实数分离
                vreducev2(
                    reinterpret_cast<__ubuf__ uint32_t *>(buf2_ub_tensor.GetPhyAddr(0)),
                    reinterpret_cast<__ubuf__ uint32_t *>(buf0_ub_tensor.GetPhyAddr(0)),
                    nullptr,
                    repeat_reduce, // repeat
                    1,        // src0BlockStride
                    1,        // patternMode, 101010…10
                    8,        // src0RepeatStride
                    8         // src1RepeatStride
                );
                // 输入虚数分离
                vreducev2(
                    reinterpret_cast<__ubuf__ uint32_t *>(buf1_ub_tensor.GetPhyAddr(0)),
                    reinterpret_cast<__ubuf__ uint32_t *>(buf0_ub_tensor.GetPhyAddr(0)),
                    nullptr,
                    repeat_reduce, // repeat
                    1,        // src0BlockStride
                    2,        // patternMode, 010101…01
                    8,        // src0RepeatStride
                    8         // src1RepeatStride
                );
                AscendC::PipeBarrier<PIPE_V>();
                //  **********倒序***************
                // 倒序分两种情况
                if(N < BIASC && repeat_times == 1) {
                    // 一个UB块就能完成
                    // FI倒序
                    // todo GatherMask
                    vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf4_ub_tensor.GetPhyAddr(0)), reinterpret_cast<__ubuf__ uint32_t *>(reverse_index_ub_tensor.GetPhyAddr(0)), index1 * UB_VEC_SIZE, 8, repeat);
                    // FR倒序
                    // todo GatherMask
                    vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf5_ub_tensor.GetPhyAddr(0)), reinterpret_cast<__ubuf__ uint32_t *>(reverse_index_ub_tensor.GetPhyAddr(0)), index2 * UB_VEC_SIZE, 8, repeat);
                    AscendC::PipeBarrier<PIPE_V>();
                    // FI共轭
                    AscendC::Muls(buf4_ub_tensor, buf4_ub_tensor, -1.0f, vecMask, repeat, {1, 1, 8, 8});
                } else {
                    // 数据量大于512要用多个UB块操作
                    AscendC::PipeBarrier<PIPE_ALL>();

#ifndef MIX_CORE
                    int64_t offset_loopc_rev = (repeat_times - 1 - loop) * BIASC;
                    AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(event_id);
                    AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(event_id);
                    AscendC::DataCopy(buf0_ub_tensor, input_gm_tensor[offset + offset_loopc_rev + 2], AscendC::DataCopyParams(1, lenBurstC, 0, 0));
                    AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(event_id);
                    AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(event_id);
                    // 大于 UB_VEC_SIZE, 需要倒着读一块， 然后把这块倒着读回来参与计算
 
#else
                    int64_t offset_block = ((loop + 1) * BIASC >= (N + 2)) ? (N + 2) : (loop * BIASC + BIASC);
                    int64_t offset_loopc_rev = (N + 2) - offset_block;
                    AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(event_id);
                    AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(event_id);
                    AscendC::DataCopyPad(buf0_ub_tensor, input_gm_tensor[offset + offset_loopc_rev], AscendC::DataCopyExtParams(1, actual_n * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));

                    AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(event_id);
                    AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(event_id);
                    // 大于 UB_VEC_SIZE, 需要倒着读一块， 然后把这块倒着读回来参与计算
                    if (loop == 0) {
                        AscendC::SetVectorMask<float>(0x0, 0x8000000000000000);
                        AscendC::Sub<float, false>(buf0_ub_tensor[4032], buf0_ub_tensor[4032], buf0_ub_tensor[4032], AscendC::MASK_PLACEHOLDER, 1, {1, 1, 1, 8, 8, 8});
                        AscendC::SetVectorMask<float>(-1, -1);
                    }
                    AscendC::PipeBarrier<PIPE_V>();

#endif

                    // 实数分离
                    vreducev2(
                        reinterpret_cast<__ubuf__ uint32_t *>(temp1_ub_tensor.GetPhyAddr(0)),
                        reinterpret_cast<__ubuf__ uint32_t *>(buf0_ub_tensor.GetPhyAddr(0)),
                        nullptr,
                        repeat_reduce, // repeat
                        1,        // src0BlockStride
                        1,        // patternMode, 101010…10
                        8,        // src0RepeatStride
                        8         // src1RepeatStride
                    );
                    // 虚数分离
                    vreducev2(
                        reinterpret_cast<__ubuf__ uint32_t *>(temp2_ub_tensor.GetPhyAddr(0)),
                        reinterpret_cast<__ubuf__ uint32_t *>(buf0_ub_tensor.GetPhyAddr(0)),
                        nullptr,
                        repeat_reduce, // repeat
                        1,        // src0BlockStride
                        2,        // patternMode, 010101…01
                        8,        // src0RepeatStride
                        8         // src1RepeatStride
                    );
                    AscendC::PipeBarrier<PIPE_V>();
                    // FI 倒序
                    // todo GatherMask
                    vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf4_ub_tensor.GetPhyAddr(0)), reinterpret_cast<__ubuf__ uint32_t *>(reverse_index_ub_tensor.GetPhyAddr(0)), index4 * UB_VEC_SIZE, 8, repeat);
                    // FR 倒序
                    // todo GatherMask
                    vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf5_ub_tensor.GetPhyAddr(0)), reinterpret_cast<__ubuf__ uint32_t *>(reverse_index_ub_tensor.GetPhyAddr(0)), index3 * UB_VEC_SIZE, 8, repeat);
                    AscendC::PipeBarrier<PIPE_V>();
                    // FI共轭
                    AscendC::Muls(buf4_ub_tensor, buf4_ub_tensor, -1.0f, vecMask, repeat, {1, 1, 8, 8});
                    AscendC::PipeBarrier<PIPE_ALL>();
                }
                AscendC::PipeBarrier<PIPE_V>();
                //A的计算
                // 系数A实/虚分离
                //  **********A矩阵相关的计算***************
                // UB_BUF4：FI_REV;   UB_BUF5: FR_REV;   UB_BUF2:FR;    UB_BUF1:FI
                // UB_BUF6: AI;   UB_BUF7: AR
                // FR的计算
                AscendC::Mul(temp1_ub_tensor, buf1_ub_tensor, MatrixA_Real_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});
                AscendC::Mul(temp2_ub_tensor, buf2_ub_tensor, MatrixA_Real_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});
                AscendC::Mul(buf1_ub_tensor, buf1_ub_tensor, MatrixA_Imag_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});
                AscendC::Mul(buf2_ub_tensor, buf2_ub_tensor, MatrixA_Imag_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});
                AscendC::PipeBarrier<PIPE_V>();

                AscendC::Sub(buf1_ub_tensor, temp2_ub_tensor, buf1_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FR * AR - FI * AI
                AscendC::Add(buf2_ub_tensor, temp1_ub_tensor, buf2_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FR * AI + FI * AR
                AscendC::PipeBarrier<PIPE_V>();
 
                // B的计算
                // 系数B实/虚分离
                //  **********B矩阵相关的计算***************
                // UB_BUF4：FI_REV;   UB_BUF5: FR_REV;   UB_BUF2:FR;    UB_BUF1:FI
                // UB_BUF6: BI;   UB_BUF7: BR
                // FrevR的计算

                AscendC::Mul(temp1_ub_tensor, buf5_ub_tensor, MatrixB_Real_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FervR * BR
                AscendC::Mul(temp2_ub_tensor, buf4_ub_tensor, MatrixB_Real_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FervI * BR
                AscendC::Mul(buf5_ub_tensor, buf5_ub_tensor, MatrixB_Imag_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});  // FrevR * BI
                AscendC::Mul(buf4_ub_tensor, buf4_ub_tensor, MatrixB_Imag_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8});  // FrevI * BI
                AscendC::PipeBarrier<PIPE_V>();
                AscendC::Sub(buf4_ub_tensor, temp1_ub_tensor, buf4_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FervR * BR - FrevI * BI
                AscendC::Add(buf5_ub_tensor, temp2_ub_tensor, buf5_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FrevR * BI + FervI * BR
                AscendC::PipeBarrier<PIPE_V>();
 
                //  **********虚实结合***************
                // A和B结果组合
                AscendC::Add(buf1_ub_tensor, buf1_ub_tensor, buf4_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FR * AR - FI * AI
                AscendC::Add(buf2_ub_tensor, buf2_ub_tensor, buf5_ub_tensor, vecMask, repeat, {1, 1, 1, 8, 8, 8}); // FR * AI + FI * AR
                AscendC::PipeBarrier<PIPE_V>();
                // UB_BUF8实数， UB_BUF9虚数，结果组合
                // 结果在UB_BUF0里面
                // todo GatherMask
                vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf4_ub_tensor.GetPhyAddr(0)), reinterpret_cast<__ubuf__ uint32_t *>(vgather_index_ub_tensor.GetPhyAddr(0)), index1 * UB_VEC_SIZE, 8, repeat_out);
                AscendC::SetFlag<AscendC::HardEvent::V_MTE3>(event_id);
                AscendC::WaitFlag<AscendC::HardEvent::V_MTE3>(event_id);
                AscendC::SetFlag<AscendC::HardEvent::V_MTE2>(event_id);
                //  **********搬出***************
                actual_n = N;
                actual_n = actual_n < BIASC ? actual_n : ((loop + 1) * BIASC <= actual_n ? BIASC : (actual_n - loop * BIASC));

#ifndef MIX_CORE
                aligned = (actual_n % BLOCK_SIZE_8 == 0);
                if(aligned) {
                    AscendC::DataCopy(output_gm_tensor[offset_out + offset_loopc], buf4_ub_tensor, AscendC::DataCopyParams(1, lenBurstOut, 0, 0));
                } else {
                    AscendC::DataCopyPad(output_gm_tensor[offset_out + offset_loopc], buf4_ub_tensor, AscendC::DataCopyExtParams(1, actual_n * sizeof(float), 0, 0, 0));
                }
#else
                AscendC::DataCopyPad(output_gm_tensor[offset_out + offset_loopc], buf4_ub_tensor, AscendC::DataCopyExtParams(1, actual_n * sizeof(float), 0, 0, 0));
#endif

                AscendC::SetFlag<AscendC::HardEvent::MTE3_V>(event_id);
                ping = 1 - ping;
            }
        }
        AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID0);
        AscendC::WaitFlag<AscendC::HardEvent::V_MTE2>(EVENT_ID1);

        AscendC::WaitFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID0);
        AscendC::WaitFlag<AscendC::HardEvent::MTE3_V>(EVENT_ID1);
}


/**
 * @brief Expands a half-spectrum input into a full-length complex sequence
 *        (pre-processing step of the odd-length C2R path).
 *
 * For every batch row the kernel
 *   1. copies `(N / 2 + 1) * 2` floats from `gm_input` to the head of the
 *      output row unchanged, and
 *   2. gathers `(N / 2) * 2` of those floats through the index tables in
 *      `gm_input_index`, negates every odd-indexed float of the gathered data
 *      and stores the result at the tail of the output row.
 * Each output row is `2 * N` floats long.  Per the original comments the data
 * is interleaved complex (re, im) and step 2 is "reverse order + conjugate";
 * the reversal itself is encoded in the host-provided index tables, which are
 * not visible here.
 *
 * The sequence is processed in chunks of `BIASC` floats.  Work items
 * (chunk, batch) are distributed round-robin over `GetBlockNum() * 2` vector
 * cores, and two UB buffer sets are used alternately (ping/pong), guarded by
 * the MTE3_MTE2 events EVENT_ID0 / EVENT_ID1.
 *
 * @param gm_input        Input rows, `(N / 2 + 1) * 2` floats each.
 * @param gm_input_index  Gather index tables.  Table at offset 0 is used for
 *                        the first chunk, offset `BIASC` for full follow-up
 *                        chunks, offset `2 * BIASC` for the last chunk
 *                        (under MIX_CORE only when the last chunk is partial).
 * @param gm_a            Unused by this kernel; kept for a uniform signature.
 * @param gm_b            Unused by this kernel; kept for a uniform signature.
 * @param gm_output_index Unused by this kernel; kept for a uniform signature.
 * @param gm_output       Output rows, `2 * N` floats each.
 * @param N               Transform length (expected to be odd — the original
 *                        length comments only hold for odd N; TODO confirm
 *                        against the caller).
 * @param batch_size      Total batch count; used only for the core-assignment
 *                        id `chunk * batch_size + (batch - batch_id_begin)`.
 * @param batch_id_begin  First batch row (inclusive) handled by this call.
 * @param batch_id_end    Last batch row (exclusive) handled by this call.
 */
__aicore__ __inline__ void C2R_odd(
    __gm__ float * __restrict__ gm_input,
    __gm__ uint32_t * __restrict__ gm_input_index,
    __gm__ float * __restrict__ gm_a,
    __gm__ float * __restrict__ gm_b,
    __gm__ uint32_t * __restrict__ gm_output_index,
    __gm__ float * __restrict__ gm_output,
    uint64_t N,
    uint64_t batch_size,
    uint64_t batch_id_begin,
    uint64_t batch_id_end)
{
    // These parameters are part of the shared kernel signature but are not
    // needed for the odd-length expansion.
    (void)gm_a;
    (void)gm_b;
    (void)gm_output_index;

    AscendC::SetAtomicNone();
    AscendC::SetMaskNorm();
    AscendC::SetVectorMask<float>(static_cast<uint64_t>(-1), static_cast<uint64_t>(-1));

    AscendC::GlobalTensor<float> input_gm_tensor;
    AscendC::GlobalTensor<float> output_gm_tensor;
    AscendC::GlobalTensor<uint32_t> input_index_gm_tensor;
    input_gm_tensor.SetGlobalBuffer(gm_input);
    output_gm_tensor.SetGlobalBuffer(gm_output);
    input_index_gm_tensor.SetGlobalBuffer(gm_input_index);

    uint32_t aiv_id = AscendC::GetBlockIdx();
    uint32_t aiv_num = AscendC::GetBlockNum() * 2;

    // Floats per input row: N / 2 + 1 complex values (N + 1 floats for odd N).
    int32_t real_len = (N / 2 + 1) * 2;
    // Floats that get gathered and conjugated: everything except the first
    // complex value (N - 1 floats for odd N).
    int32_t copy_len = (N / 2) * 2;

    // Number of BIASC-sized chunks along the sequence direction.
    uint64_t repeat_times = (copy_len + BIASC - 1) / BIASC;

    // Selects which of the two UB buffer sets the next work item uses.
    uint64_t ping = 1;

    AsdopsBuffer<ArchType::ASCEND_V220> buf;
    AscendC::LocalTensor<float> UB_INPUT1_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(0 * 8192);
    AscendC::LocalTensor<float> UB_INPUT_R1_REVERSE_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(4 * 8192);

    AscendC::LocalTensor<float> UB_INPUT2_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(8 * 8192);
    AscendC::LocalTensor<float> UB_INPUT_R2_REVERSE_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, float>(12 * 8192);

    AscendC::LocalTensor<uint32_t> reverse_index1_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, uint32_t>(16 * 8192);
    AscendC::LocalTensor<uint32_t> reverse_index2_ub_tensor = buf.GetBuffer<BufferType::ASCEND_UB, uint32_t>(20 * 8192);

    // Both buffer sets start out free: pre-arm the events that the first
    // WaitFlag<MTE3_MTE2> of each set will consume.
    AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
    AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID1);

    // Initial fill of both index buffers (block count 4096 / 8 = 512).
    // NOTE(review): every work item below reloads its index buffer before the
    // gather, so this prefetch looks redundant — confirm before removing.
    AscendC::DataCopy(reverse_index1_ub_tensor, input_index_gm_tensor, AscendC::DataCopyParams(1, 4096 / 8, 0, 0));
    AscendC::PipeBarrier<PIPE_MTE2>();
    AscendC::DataCopy(reverse_index2_ub_tensor, input_index_gm_tensor, AscendC::DataCopyParams(1, 4096 / 8, 0, 0));
    AscendC::PipeBarrier<PIPE_MTE2>();

#ifdef MIX_CORE
    // Non-zero when the last chunk is shorter than BIASC.
    int32_t remain_size = copy_len % BIASC;
#endif

    for (uint64_t loop = 0; loop < repeat_times; loop++) {       // sequence direction
        const bool first_chunk = (loop == 0);

        // Floats already covered by previous chunks.
        int64_t offset_loopc = loop * BIASC;

        // ord_len: floats copied through unchanged; rev_len: floats gathered
        // and conjugated.  The first chunk additionally carries the first
        // complex value (2 floats), which is copied but never gathered.
        int32_t ord_len = BIASC;
        int32_t rev_len = BIASC;
        if (repeat_times == 1) {
            ord_len = copy_len + 2;
            rev_len = copy_len;
        } else if (first_chunk) {
            ord_len = BIASC + 2;
            rev_len = BIASC;
        } else if (loop == repeat_times - 1) {
            ord_len = copy_len - loop * BIASC;
            rev_len = copy_len - loop * BIASC;
        }

        // Gather / conjugate length rounded up to whole 64-element repeats.
        int32_t cal_size = (rev_len + 63) / 64 * 64;

        // Pick the index table for this chunk (see @param gm_input_index).
        int64_t index_offset = 0;
        if (!first_chunk) {
#ifndef MIX_CORE
            const bool use_tail_table = (loop == repeat_times - 1);
#else
            const bool use_tail_table = (loop == repeat_times - 1) && (remain_size != 0);
#endif
            index_offset = use_tail_table ? 2 * BIASC : BIASC;
        }

        for (uint64_t loopN = batch_id_begin; loopN < batch_id_end; loopN += 1) {  // batch direction

            // Round-robin assignment of (chunk, batch) work items to cores.
            int32_t B_id = loopN - batch_id_begin;
            int32_t N_id = loop;
            int32_t loop_id = N_id * batch_size + B_id;
            if (loop_id % aiv_num != aiv_id) {
                continue;
            }

            auto event_id = ping ? EVENT_ID0 : EVENT_ID1;
            auto buf0_ub_tensor = ping ? UB_INPUT1_ub_tensor : UB_INPUT2_ub_tensor;                       // ordered input
            auto buf4_ub_tensor = ping ? UB_INPUT_R1_REVERSE_ub_tensor : UB_INPUT_R2_REVERSE_ub_tensor;   // gathered + conjugated data
            auto reverse_index_ub_tensor = ping ? reverse_index1_ub_tensor : reverse_index2_ub_tensor;    // gather indices
            // Scaled by UB_VEC_SIZE and passed to vgather; 0 / 8 mirrors the
            // 0 * 8192 / 8 * 8192 placement of the two input buffers
            // (presumably the gather source base — TODO confirm).
            uint64_t index1 = ping ? 0 : 8;

            // Wait until the previous store out of this buffer set has finished.
            AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(event_id);

            int64_t offset = loopN * real_len;
            int64_t offset_out = loopN * 2 * N;

            //  ********** load input **********
            // Chunks after the first skip the first complex value (2 floats),
            // which was already consumed together with chunk 0.
            const int64_t src_offset = offset + offset_loopc + (first_chunk ? 0 : 2);
            AscendC::DataCopyPad(buf0_ub_tensor, input_gm_tensor[src_offset], AscendC::DataCopyExtParams(1, ord_len * sizeof(float), 0, 0, 0), AscendC::DataCopyPadExtParams<float>(false, 0, 0, 0));
            AscendC::PipeBarrier<PIPE_MTE2>();

            //  ********** load gather index table **********
            AscendC::DataCopy(reverse_index_ub_tensor, input_index_gm_tensor[index_offset], AscendC::DataCopyParams(1, cal_size / 8, 0, 0));
            AscendC::PipeBarrier<PIPE_MTE2>();

            AscendC::SetFlag<AscendC::HardEvent::MTE2_V>(event_id);
            AscendC::WaitFlag<AscendC::HardEvent::MTE2_V>(event_id);

            //  ********** gather (reverse) + conjugate **********
            // todo GatherMask
            vgather(reinterpret_cast<__ubuf__ uint32_t *>(buf4_ub_tensor.GetPhyAddr(0)), reinterpret_cast<__ubuf__ unsigned int *>(reverse_index_ub_tensor.GetPhyAddr(0)), index1 * UB_VEC_SIZE, 8, cal_size / 64);
            AscendC::PipeBarrier<PIPE_V>();
            // 0xAAAA... enables only the odd lanes, so only every second float
            // (presumably the imaginary parts) is multiplied by -1.
            AscendC::SetVectorMask<float>(-1, 0xaaaaaaaaaaaaaaaa);
            AscendC::Muls<float, false>(buf4_ub_tensor, buf4_ub_tensor, -1.0f, AscendC::MASK_PLACEHOLDER, cal_size / 64, {1, 1, 8, 8});
            AscendC::SetVectorMask<float>(-1, -1);
            AscendC::PipeBarrier<PIPE_V>();

            AscendC::SetFlag<AscendC::HardEvent::V_MTE3>(event_id);
            AscendC::WaitFlag<AscendC::HardEvent::V_MTE3>(event_id);

            //  ********** store **********
            // Ordered data goes to the head of the output row, the gathered
            // data to the mirrored position counted from the row end.  The
            // offsets are kept 64-bit so large N cannot truncate them.
            const int64_t start_idx = first_chunk ? 0 : 2 + offset_loopc;
            const int64_t revert_idx = static_cast<int64_t>(N * 2) - offset_loopc - rev_len;
            AscendC::DataCopyPad(output_gm_tensor[offset_out + start_idx], buf0_ub_tensor, AscendC::DataCopyExtParams(1, ord_len * sizeof(float), 0, 0, 0));
            AscendC::PipeBarrier<PIPE_MTE3>();
            AscendC::DataCopyPad(output_gm_tensor[offset_out + revert_idx], buf4_ub_tensor, AscendC::DataCopyExtParams(1, rev_len * sizeof(float), 0, 0, 0));
            AscendC::PipeBarrier<PIPE_MTE3>();

            // Hand the buffer set back to the load stage and switch sets.
            AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(event_id);
            ping = 1 - ping;
        }
    }

    // Drain the two outstanding MTE3_MTE2 events so none is left pending.
    AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID0);
    AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(EVENT_ID1);
}