#include "kernel_operator.h"
#include "fft_all_common.cceh"
#include "fft_r2c_c2r_common.cceh"
/**
 * Cube-side driver for the mixed-radix R2C FFT that walks the batch dimension in passes.
 *
 * Reads the tiling block, wraps the global-memory pointers in GlobalTensor views,
 * derives how many batches go into one pass from max_floats, and calls
 * stock_fft_mix_aic once per pass with the input/output views advanced to that pass.
 *
 * @param ffts_addr            handed to Init4Cube.
 * @param gm_input             input buffer; advanced per pass.
 * @param gm_dft_matrix_array  forwarded unchanged to stock_fft_mix_aic.
 * @param gm_tw_matrix_array   forwarded unchanged to stock_fft_mix_aic.
 * @param gm_radix_list        forwarded unchanged to stock_fft_mix_aic.
 * @param gm_output            output buffer; advanced per pass.
 * @param gm_workspace         scratch buffer, sliced at tiling_data.workspaceOffsets[0], [1], [2] and [4].
 * @param gm_tiling_para       raw tiling data decoded by InitTilingData.
 * @param odd                  1 selects a per-batch input length of fftN, any other value 2 * fftN.
 * @param max_floats           element budget used to size one pass (defaults to 2^21).
 */
__aicore__ __inline__ void common_fft_r2c_mix_cube_by_batch(
    __gm__ uint8_t * __restrict__ ffts_addr,
    __gm__ float * __restrict__ gm_input,
    __gm__ float * __restrict__ gm_dft_matrix_array,
    __gm__ float * __restrict__ gm_tw_matrix_array,
    __gm__ int32_t * __restrict__ gm_radix_list,
    __gm__ float * __restrict__ gm_output,
    __gm__ float * __restrict__ gm_workspace,
    __gm__ uint8_t * __restrict__ gm_tiling_para,
    int32_t odd,
    int64_t max_floats = (1 << 21)
)
{
    Init4Cube(ffts_addr);

    AsdSip::FftAllMixTilingData tiling_data;
    InitTilingData(gm_tiling_para, &tiling_data);

    // Views over the caller-provided buffers.
    AscendC::GlobalTensor<float> src_gm;
    src_gm.SetGlobalBuffer(gm_input);
    AscendC::GlobalTensor<float> dft_gm;
    dft_gm.SetGlobalBuffer(gm_dft_matrix_array);
    AscendC::GlobalTensor<float> twiddle_gm;
    twiddle_gm.SetGlobalBuffer(gm_tw_matrix_array);
    AscendC::GlobalTensor<int32_t> radix_gm;
    radix_gm.SetGlobalBuffer(gm_radix_list);
    AscendC::GlobalTensor<float> dst_gm;
    dst_gm.SetGlobalBuffer(gm_output);

    // Workspace regions are addressed by byte offsets taken from the tiling data.
    __gm__ uint8_t *ws_bytes = reinterpret_cast<__gm__ uint8_t *>(gm_workspace);
    AscendC::GlobalTensor<float> ws_in_gm;
    ws_in_gm.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(ws_bytes + tiling_data.workspaceOffsets[0]));
    AscendC::GlobalTensor<float> ws_out_gm;
    ws_out_gm.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(ws_bytes + tiling_data.workspaceOffsets[1]));
    AscendC::GlobalTensor<float> ws_sync_gm;
    ws_sync_gm.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(ws_bytes + tiling_data.workspaceOffsets[2]));
    AscendC::GlobalTensor<float> aux_gm;
    aux_gm.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(ws_bytes + tiling_data.workspaceOffsets[4]));

    // Per-batch input length in floats.
    const int64_t real_len = (odd == 1) ? tiling_data.fftN : tiling_data.fftN * 2;

    // Size each pass so that roughly max_floats elements (2^21 by default) are handled at a time.
    const int64_t pass_budget = max_floats;
    int64_t batches_per_pass = (pass_budget + tiling_data.fftN - 1) / tiling_data.fftN;
    int64_t pass_count = (tiling_data.batchSize + batches_per_pass - 1) / batches_per_pass;
    int64_t tail_batches = tiling_data.batchSize % batches_per_pass;

    if (tiling_data.fftN > pass_budget * 5) {
        // Transforms far above the budget: no splitting, everything goes in one pass.
        batches_per_pass = tiling_data.batchSize;
        pass_count = 1;
        tail_batches = tiling_data.batchSize;
    } else if (pass_count == 1 && tail_batches > 0) {
        // A single, partially filled pass: shrink the pass to the real batch count.
        batches_per_pass = tail_batches;
    }

    // Element strides between consecutive passes.
    const int64_t src_stride = batches_per_pass * real_len;
    // Written exactly as before; the trailing sizeof factors cancel each other.
    const int64_t dst_stride = batches_per_pass * (real_len / 2 + 1) * 2 * sizeof(float) / sizeof(float);

    for (int64_t pass = 0; pass < pass_count; ++pass) {
        const bool is_tail_pass = (pass == pass_count - 1) && (tail_batches > 0);
        const int64_t batches_now = is_tail_pass ? tail_batches : batches_per_pass;

        AscendC::GlobalTensor<float> pass_src_gm = src_gm[pass * src_stride];
        AscendC::GlobalTensor<float> pass_dst_gm = dst_gm[pass * dst_stride];

        stock_fft_mix_aic(
            pass_src_gm.GetPhyAddr(0),
            dft_gm.GetPhyAddr(0),
            twiddle_gm.GetPhyAddr(0),
            ws_in_gm.GetPhyAddr(0),
            ws_out_gm.GetPhyAddr(0),
            ws_sync_gm.GetPhyAddr(0),
            radix_gm.GetPhyAddr(0),
            pass_dst_gm.GetPhyAddr(0),
            aux_gm.GetPhyAddr(0),
            batches_now,
            tiling_data.fftN,
            tiling_data.radixListLen,
            tiling_data.isInverse,
            max_floats
        );
    }
}
/**
 * Vector-side driver for the mixed-radix R2C FFT, handling the whole batch in one go.
 *
 * Runs stock_fft_mix_aiv into the workspace region at workspaceOffsets[3], then
 * post-processes that region into gm_output: R2C_odd when odd == 1, otherwise R2C
 * followed by R2C_tail_calc. Cross-core flag exchanges and full pipe barriers
 * separate the stages.
 *
 * @tparam aiv_split_way       forwarded as the second template argument of stock_fft_mix_aiv.
 * @param ffts_addr            handed to Init4Vector.
 * @param gm_input             input buffer.
 * @param gm_input_index       index table forwarded to the R2C stage.
 * @param gm_a                 coefficient buffer forwarded to the R2C stage.
 * @param gm_b                 coefficient buffer forwarded to the R2C stage.
 * @param gm_output_index      index table forwarded to the R2C stage.
 * @param gm_dft_matrix_array  forwarded unchanged to stock_fft_mix_aiv.
 * @param gm_tw_matrix_array   forwarded unchanged to stock_fft_mix_aiv.
 * @param gm_radix_list        forwarded unchanged to stock_fft_mix_aiv.
 * @param gm_output            final output buffer.
 * @param gm_workspace         scratch buffer, sliced at tiling_data.workspaceOffsets[0..4].
 * @param gm_tiling_para       raw tiling data decoded by InitTilingData.
 * @param odd                  1 selects the R2C_odd path, any other value the R2C + tail path.
 */
template<int32_t aiv_split_way>
__aicore__ __inline__ void common_fft_r2c_mix_vector(
    __gm__ uint8_t * __restrict__ ffts_addr,
    __gm__ float * __restrict__ gm_input,
    __gm__ uint32_t * __restrict__ gm_input_index,
    __gm__ float * __restrict__ gm_a,
    __gm__ float * __restrict__ gm_b,
    __gm__ uint32_t * __restrict__ gm_output_index,
    __gm__ float * __restrict__ gm_dft_matrix_array,
    __gm__ float * __restrict__ gm_tw_matrix_array,
    __gm__ int32_t * __restrict__ gm_radix_list,
    __gm__ float * __restrict__ gm_output,
    __gm__ float * __restrict__ gm_workspace,
    __gm__ uint8_t * __restrict__ gm_tiling_para,
    int32_t odd
)
{
    Init4Vector(ffts_addr);

    AsdSip::FftAllMixTilingData tiling_data;
    InitTilingData(gm_tiling_para, &tiling_data);

    // Views over the caller-provided buffers.
    AscendC::GlobalTensor<float> src_gm;
    src_gm.SetGlobalBuffer(gm_input);
    AscendC::GlobalTensor<float> dft_gm;
    dft_gm.SetGlobalBuffer(gm_dft_matrix_array);
    AscendC::GlobalTensor<float> twiddle_gm;
    twiddle_gm.SetGlobalBuffer(gm_tw_matrix_array);
    AscendC::GlobalTensor<int32_t> radix_gm;
    radix_gm.SetGlobalBuffer(gm_radix_list);
    AscendC::GlobalTensor<float> dst_gm;
    dst_gm.SetGlobalBuffer(gm_output);
    AscendC::GlobalTensor<uint32_t> in_index_gm;
    in_index_gm.SetGlobalBuffer(gm_input_index);
    AscendC::GlobalTensor<uint32_t> out_index_gm;
    out_index_gm.SetGlobalBuffer(gm_output_index);
    AscendC::GlobalTensor<float> coef_a_gm;
    coef_a_gm.SetGlobalBuffer(gm_a);
    AscendC::GlobalTensor<float> coef_b_gm;
    coef_b_gm.SetGlobalBuffer(gm_b);

    // Workspace regions are addressed by byte offsets taken from the tiling data.
    __gm__ uint8_t *ws_bytes = reinterpret_cast<__gm__ uint8_t *>(gm_workspace);
    AscendC::GlobalTensor<float> ws_in_gm;
    ws_in_gm.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(ws_bytes + tiling_data.workspaceOffsets[0]));
    AscendC::GlobalTensor<float> ws_out_gm;
    ws_out_gm.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(ws_bytes + tiling_data.workspaceOffsets[1]));
    AscendC::GlobalTensor<float> ws_sync_gm;
    ws_sync_gm.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(ws_bytes + tiling_data.workspaceOffsets[2]));
    AscendC::GlobalTensor<float> c2c_result_gm;
    c2c_result_gm.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(ws_bytes + tiling_data.workspaceOffsets[3]));
    AscendC::GlobalTensor<float> aux_gm;
    aux_gm.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(ws_bytes + tiling_data.workspaceOffsets[4]));

    // Stage 1: the FFT itself, written to the intermediate region.
    stock_fft_mix_aiv<false, aiv_split_way>(
        src_gm.GetPhyAddr(0),
        dft_gm.GetPhyAddr(0),
        twiddle_gm.GetPhyAddr(0),
        ws_in_gm.GetPhyAddr(0),
        ws_out_gm.GetPhyAddr(0),
        ws_sync_gm.GetPhyAddr(0),
        radix_gm.GetPhyAddr(0),
        c2c_result_gm.GetPhyAddr(0),
        aux_gm.GetPhyAddr(0),
        tiling_data.batchSize,
        tiling_data.fftN,
        tiling_data.radixListLen,
        tiling_data.isInverse,
        0,
        odd
    );

    // Flag exchange (mode 0, flag id 1, MTE3 pipe) before the post-processing reads
    // the intermediate region; presumably a rendezvous of all vector cores.
    AscendC::CrossCoreSetFlag<0, PIPE_MTE3>(1);
    AscendC::CrossCoreWaitFlag(1);

    // Both post-processing paths start behind a full pipe barrier.
    AscendC::PipeBarrier<PIPE_ALL>();

    if (odd == 1) {
        // Stage 2 (odd path): a single R2C_odd call finishes the transform.
        R2C_odd(
            c2c_result_gm.GetPhyAddr(0),
            in_index_gm.GetPhyAddr(0),
            coef_a_gm.GetPhyAddr(0),
            coef_b_gm.GetPhyAddr(0),
            out_index_gm.GetPhyAddr(0),
            dst_gm.GetPhyAddr(0),
            tiling_data.fftN,
            tiling_data.batchSize,
            0,
            tiling_data.batchSize
        );
        return;
    }

    // Stage 2 (non-odd path): R2C over a length of 2 * fftN ...
    R2C(
        c2c_result_gm.GetPhyAddr(0),
        in_index_gm.GetPhyAddr(0),
        coef_a_gm.GetPhyAddr(0),
        coef_b_gm.GetPhyAddr(0),
        out_index_gm.GetPhyAddr(0),
        dst_gm.GetPhyAddr(0),
        tiling_data.fftN * 2,
        tiling_data.batchSize,
        0,
        tiling_data.batchSize
    );

    // ... then another flag exchange and barrier before the tail computation.
    AscendC::CrossCoreSetFlag<0, PIPE_MTE3>(1);
    AscendC::CrossCoreWaitFlag(1);
    AscendC::PipeBarrier<PIPE_ALL>();

    R2C_tail_calc(
        c2c_result_gm.GetPhyAddr(0),
        dst_gm.GetPhyAddr(0),
        tiling_data.batchSize,
        tiling_data.fftN
    );
}
/**
 * Vector-side driver for the mixed-radix R2C FFT that walks the batch dimension in passes.
 *
 * For every pass it runs stock_fft_mix_aiv into the workspace region at
 * workspaceOffsets[3], then post-processes that region into the matching slice of
 * gm_output: R2C_odd when odd == 1, otherwise R2C followed by R2C_tail_calc.
 * Cross-core flag exchanges and full pipe barriers separate the stages and the passes.
 *
 * The pass partitioning uses the same formula as common_fft_r2c_mix_cube_by_batch, so
 * both sides must be called with the same max_floats to agree on the number of passes
 * and on the per-pass strides.
 *
 * @tparam aiv_split_way       forwarded as the second template argument of stock_fft_mix_aiv.
 * @param ffts_addr            handed to Init4Vector.
 * @param gm_input             input buffer; advanced per pass.
 * @param gm_input_index       index table forwarded to the R2C stage.
 * @param gm_a                 coefficient buffer forwarded to the R2C stage.
 * @param gm_b                 coefficient buffer forwarded to the R2C stage.
 * @param gm_output_index      index table forwarded to the R2C stage.
 * @param gm_dft_matrix_array  forwarded unchanged to stock_fft_mix_aiv.
 * @param gm_tw_matrix_array   forwarded unchanged to stock_fft_mix_aiv.
 * @param gm_radix_list        forwarded unchanged to stock_fft_mix_aiv.
 * @param gm_output            final output buffer; advanced per pass.
 * @param gm_workspace         scratch buffer, sliced at tiling_data.workspaceOffsets[0..4].
 * @param gm_tiling_para       raw tiling data decoded by InitTilingData.
 * @param odd                  1 selects a per-batch input length of fftN and the R2C_odd path,
 *                             any other value 2 * fftN and the R2C + tail path.
 * @param max_floats           element budget used to size one pass. Defaults to 2^21, the value
 *                             that used to be hard-coded here.
 */
template<int32_t aiv_split_way>
__aicore__ __inline__ void common_fft_r2c_mix_vector_by_batch(
    __gm__ uint8_t * __restrict__ ffts_addr,
    __gm__ float * __restrict__ gm_input,
    __gm__ uint32_t * __restrict__ gm_input_index,
    __gm__ float * __restrict__ gm_a,
    __gm__ float * __restrict__ gm_b,
    __gm__ uint32_t * __restrict__ gm_output_index,
    __gm__ float * __restrict__ gm_dft_matrix_array,
    __gm__ float * __restrict__ gm_tw_matrix_array,
    __gm__ int32_t * __restrict__ gm_radix_list,
    __gm__ float * __restrict__ gm_output,
    __gm__ float * __restrict__ gm_workspace,
    __gm__ uint8_t * __restrict__ gm_tiling_para,
    int32_t odd,
    int64_t max_floats = (1 << 21)
)
{
    Init4Vector(ffts_addr);
    AsdSip::FftAllMixTilingData tiling_data;
    InitTilingData(gm_tiling_para, &tiling_data);

    // Views over the caller-provided buffers.
    AscendC::GlobalTensor<float> input_gm_tensor;
    AscendC::GlobalTensor<float> dft_matrix_array_gm_tensor;
    AscendC::GlobalTensor<float> tw_matrix_array_gm_tensor;
    AscendC::GlobalTensor<int32_t> radix_list_gm_tensor;
    AscendC::GlobalTensor<float> output_gm_tensor;
    AscendC::GlobalTensor<uint32_t> input_index_gm_tensor;
    AscendC::GlobalTensor<uint32_t> output_index_gm_tensor;
    AscendC::GlobalTensor<float> a_gm_tensor;
    AscendC::GlobalTensor<float> b_gm_tensor;
    input_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_input));
    dft_matrix_array_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_dft_matrix_array));
    tw_matrix_array_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_tw_matrix_array));
    radix_list_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(gm_radix_list));
    output_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_output));
    input_index_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t *>(gm_input_index));
    output_index_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t *>(gm_output_index));
    a_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_a));
    b_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_b));

    // Workspace regions are addressed by byte offsets taken from the tiling data.
    AscendC::GlobalTensor<float> workspace_input_gm_tensor;
    AscendC::GlobalTensor<float> workspace_output_gm_tensor;
    AscendC::GlobalTensor<float> workspace_sync_gm_tensor;
    AscendC::GlobalTensor<float> c2c_output_gm_tensor;
    AscendC::GlobalTensor<float> auxil_gm_tensor;
    workspace_input_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(reinterpret_cast<__gm__ uint8_t *>(gm_workspace) + tiling_data.workspaceOffsets[0]));
    workspace_output_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(reinterpret_cast<__gm__ uint8_t *>(gm_workspace) + tiling_data.workspaceOffsets[1]));
    workspace_sync_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(reinterpret_cast<__gm__ uint8_t *>(gm_workspace) + tiling_data.workspaceOffsets[2]));
    c2c_output_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(reinterpret_cast<__gm__ uint8_t *>(gm_workspace) + tiling_data.workspaceOffsets[3]));
    auxil_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(reinterpret_cast<__gm__ uint8_t *>(gm_workspace) + tiling_data.workspaceOffsets[4]));

    // Per-batch input length in floats.
    int64_t n_fft_actual = odd == 1 ? tiling_data.fftN : tiling_data.fftN * 2;

    // Size each pass so that roughly max_floats elements (2^21 by default) are handled at a time.
    // NOTE(review): max_floats is not forwarded to stock_fft_mix_aiv because its signature is
    // not visible here; confirm its internal tiling does not depend on this budget.
    int64_t L2_CACHE_MAX_ELES = max_floats;
    int64_t n_batch_in_loop = (L2_CACHE_MAX_ELES + tiling_data.fftN - 1) / tiling_data.fftN;
    int64_t batch_loop = (tiling_data.batchSize + n_batch_in_loop - 1) / n_batch_in_loop;
    int64_t batch_remain = tiling_data.batchSize % n_batch_in_loop;
    if (batch_loop == 1 && batch_remain > 0) {
        // A single, partially filled pass: shrink the pass to the real batch count.
        n_batch_in_loop = batch_remain;
    }
    if (tiling_data.fftN > L2_CACHE_MAX_ELES * 5) {
        // Transforms far above the budget: no splitting, everything goes in one pass.
        n_batch_in_loop = tiling_data.batchSize;
        batch_loop = 1;
        batch_remain = tiling_data.batchSize;
    }

    // Element strides between consecutive passes; each batch yields
    // (n_fft_actual / 2 + 1) * 2 output floats.
    int64_t input_eles_one_loop = n_batch_in_loop * n_fft_actual;
    int64_t output_eles_one_loop = n_batch_in_loop * (n_fft_actual / 2 + 1) * 2;

    for (int64_t batch_index = 0; batch_index < batch_loop; batch_index++) {
        // The last pass may carry fewer batches than the others.
        int64_t batch_actual = n_batch_in_loop;
        if (batch_index == batch_loop - 1 && batch_remain > 0) {
            batch_actual = batch_remain;
        }
        AscendC::GlobalTensor<float> now_input_gm_tensor = input_gm_tensor[batch_index * input_eles_one_loop];
        AscendC::GlobalTensor<float> now_output_gm_tensor = output_gm_tensor[batch_index * output_eles_one_loop];

        // Stage 1: the FFT itself, written to the intermediate region.
        stock_fft_mix_aiv<false, aiv_split_way>(
            now_input_gm_tensor.GetPhyAddr(0),
            dft_matrix_array_gm_tensor.GetPhyAddr(0),
            tw_matrix_array_gm_tensor.GetPhyAddr(0),
            workspace_input_gm_tensor.GetPhyAddr(0),
            workspace_output_gm_tensor.GetPhyAddr(0),
            workspace_sync_gm_tensor.GetPhyAddr(0),
            radix_list_gm_tensor.GetPhyAddr(0),
            c2c_output_gm_tensor.GetPhyAddr(0),
            auxil_gm_tensor.GetPhyAddr(0),
            batch_actual,
            tiling_data.fftN,
            tiling_data.radixListLen,
            tiling_data.isInverse,
            0,
            odd
        );

        // Flag exchange (mode 0, flag id 1, MTE3 pipe) before the post-processing reads
        // the intermediate region; presumably a rendezvous of all vector cores.
        AscendC::CrossCoreSetFlag<0, PIPE_MTE3>(1);
        AscendC::CrossCoreWaitFlag(1);

        if (odd == 1) {
            AscendC::PipeBarrier<PIPE_ALL>();
            // Stage 2 (odd path): a single R2C_odd call finishes this pass.
            R2C_odd(
                c2c_output_gm_tensor.GetPhyAddr(0),
                input_index_gm_tensor.GetPhyAddr(0),
                a_gm_tensor.GetPhyAddr(0),
                b_gm_tensor.GetPhyAddr(0),
                output_index_gm_tensor.GetPhyAddr(0),
                now_output_gm_tensor.GetPhyAddr(0),
                tiling_data.fftN,
                batch_actual,
                0,
                batch_actual
            );
        } else {
            AscendC::PipeBarrier<PIPE_ALL>();
            // Stage 2 (non-odd path): R2C over a length of 2 * fftN ...
            R2C(
                c2c_output_gm_tensor.GetPhyAddr(0),
                input_index_gm_tensor.GetPhyAddr(0),
                a_gm_tensor.GetPhyAddr(0),
                b_gm_tensor.GetPhyAddr(0),
                output_index_gm_tensor.GetPhyAddr(0),
                now_output_gm_tensor.GetPhyAddr(0),
                tiling_data.fftN * 2,
                batch_actual,
                0,
                batch_actual
            );
            // ... then another flag exchange and barrier before the tail computation.
            AscendC::CrossCoreSetFlag<0, PIPE_MTE3>(1);
            AscendC::CrossCoreWaitFlag(1);
            AscendC::PipeBarrier<PIPE_ALL>();
            R2C_tail_calc(
                c2c_output_gm_tensor.GetPhyAddr(0),
                now_output_gm_tensor.GetPhyAddr(0),
                batch_actual,
                tiling_data.fftN
            );
        }

        // Closing flag exchange of the pass; presumably keeps any core from reusing the
        // shared workspace for the next pass while others are still working on this one.
        AscendC::CrossCoreSetFlag<0, PIPE_MTE3>(1);
        AscendC::CrossCoreWaitFlag(1);
    }
}
/**
 * Vector-side driver for the even-length mixed-radix R2C FFT, walking the batch dimension in passes.
 *
 * For every pass it runs stock_fft_r2c_mix_even_aiv into the workspace region at
 * workspaceOffsets[3], then post-processes that region into the matching slice of
 * gm_output with R2C_even followed by R2C_even_tail_calc. Cross-core flag exchanges
 * and full pipe barriers separate the stages and the passes.
 *
 * @tparam aiv_split_way       forwarded as the second template argument of stock_fft_r2c_mix_even_aiv.
 * @param ffts_addr            handed to Init4Vector.
 * @param gm_input             input buffer; advanced per pass.
 * @param gm_input_index       index table forwarded to R2C_even.
 * @param gm_a                 coefficient buffer forwarded to R2C_even.
 * @param gm_b                 coefficient buffer forwarded to R2C_even.
 * @param gm_output_index      index table forwarded to R2C_even.
 * @param gm_dft_matrix_array  forwarded unchanged to stock_fft_r2c_mix_even_aiv.
 * @param gm_tw_matrix_array   forwarded unchanged to stock_fft_r2c_mix_even_aiv.
 * @param gm_radix_list        forwarded unchanged to stock_fft_r2c_mix_even_aiv.
 * @param gm_output            final output buffer; advanced per pass.
 * @param gm_workspace         scratch buffer, sliced at tiling_data.workspaceOffsets[0..4].
 * @param gm_tiling_para       raw tiling data decoded by InitTilingData.
 * @param max_floats           element budget used to size one pass (defaults to 2^21); also
 *                             forwarded to stock_fft_r2c_mix_even_aiv.
 */
template<int32_t aiv_split_way>
__aicore__ __inline__ void common_fft_r2c_mix_even_vector_by_batch(
    __gm__ uint8_t * __restrict__ ffts_addr,
    __gm__ float * __restrict__ gm_input,
    __gm__ uint32_t * __restrict__ gm_input_index,
    __gm__ float * __restrict__ gm_a,
    __gm__ float * __restrict__ gm_b,
    __gm__ uint32_t * __restrict__ gm_output_index,
    __gm__ float * __restrict__ gm_dft_matrix_array,
    __gm__ float * __restrict__ gm_tw_matrix_array,
    __gm__ int32_t * __restrict__ gm_radix_list,
    __gm__ float * __restrict__ gm_output,
    __gm__ float * __restrict__ gm_workspace,
    __gm__ uint8_t * __restrict__ gm_tiling_para,
    int64_t max_floats = (1 << 21)
)
{
    Init4Vector(ffts_addr);
    AsdSip::FftAllMixTilingData tiling_data;
    InitTilingData(gm_tiling_para, &tiling_data);

    // Views over the caller-provided buffers.
    AscendC::GlobalTensor<float> input_gm_tensor;
    AscendC::GlobalTensor<float> dft_matrix_array_gm_tensor;
    AscendC::GlobalTensor<float> tw_matrix_array_gm_tensor;
    AscendC::GlobalTensor<int32_t> radix_list_gm_tensor;
    AscendC::GlobalTensor<float> output_gm_tensor;
    AscendC::GlobalTensor<uint32_t> input_index_gm_tensor;
    AscendC::GlobalTensor<uint32_t> output_index_gm_tensor;
    AscendC::GlobalTensor<float> a_gm_tensor;
    AscendC::GlobalTensor<float> b_gm_tensor;
    input_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_input));
    dft_matrix_array_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_dft_matrix_array));
    tw_matrix_array_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_tw_matrix_array));
    radix_list_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ int32_t *>(gm_radix_list));
    output_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_output));
    input_index_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t *>(gm_input_index));
    output_index_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t *>(gm_output_index));
    a_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_a));
    b_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(gm_b));

    // Workspace regions are addressed by byte offsets taken from the tiling data.
    AscendC::GlobalTensor<float> workspace_input_gm_tensor;
    AscendC::GlobalTensor<float> workspace_output_gm_tensor;
    AscendC::GlobalTensor<float> workspace_sync_gm_tensor;
    AscendC::GlobalTensor<float> c2c_output_gm_tensor;
    AscendC::GlobalTensor<float> auxil_gm_tensor;
    workspace_input_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(reinterpret_cast<__gm__ uint8_t *>(gm_workspace) + tiling_data.workspaceOffsets[0]));
    workspace_output_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(reinterpret_cast<__gm__ uint8_t *>(gm_workspace) + tiling_data.workspaceOffsets[1]));
    workspace_sync_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(reinterpret_cast<__gm__ uint8_t *>(gm_workspace) + tiling_data.workspaceOffsets[2]));
    c2c_output_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(reinterpret_cast<__gm__ uint8_t *>(gm_workspace) + tiling_data.workspaceOffsets[3]));
    auxil_gm_tensor.SetGlobalBuffer(reinterpret_cast<__gm__ float *>(reinterpret_cast<__gm__ uint8_t *>(gm_workspace) + tiling_data.workspaceOffsets[4]));

    // Per-batch input length in floats: always 2 * fftN on this path.
    int64_t n_fft_actual = tiling_data.fftN * 2;

    // Size each pass so that roughly max_floats elements (2^21 by default) are handled at a time.
    int64_t L2_CACHE_MAX_ELES = max_floats;
    int64_t n_batch_in_loop = (L2_CACHE_MAX_ELES + tiling_data.fftN - 1) / tiling_data.fftN;
    int64_t batch_loop = (tiling_data.batchSize + n_batch_in_loop - 1) / n_batch_in_loop;
    int64_t batch_remain = tiling_data.batchSize % n_batch_in_loop;
    if (batch_loop == 1 && batch_remain > 0) {
        // A single, partially filled pass: shrink the pass to the real batch count.
        n_batch_in_loop = batch_remain;
    }
    if (tiling_data.fftN > L2_CACHE_MAX_ELES * 5) {
        // Transforms far above the budget: no splitting, everything goes in one pass.
        n_batch_in_loop = tiling_data.batchSize;
        batch_loop = 1;
        batch_remain = tiling_data.batchSize;
    }

    // Element strides between consecutive passes; each batch yields
    // (n_fft_actual / 2 + 1) * 2 output floats.
    int64_t input_eles_one_loop = n_batch_in_loop * n_fft_actual;
    int64_t output_eles_one_loop = n_batch_in_loop * (n_fft_actual / 2 + 1) * 2;

    for (int64_t batch_index = 0; batch_index < batch_loop; batch_index++) {
        // The last pass may carry fewer batches than the others.
        int64_t batch_actual = n_batch_in_loop;
        if (batch_index == batch_loop - 1 && batch_remain > 0) {
            batch_actual = batch_remain;
        }
        AscendC::GlobalTensor<float> now_input_gm_tensor = input_gm_tensor[batch_index * input_eles_one_loop];
        AscendC::GlobalTensor<float> now_output_gm_tensor = output_gm_tensor[batch_index * output_eles_one_loop];

        // Stage 1: the FFT itself, written to the intermediate region.
        stock_fft_r2c_mix_even_aiv<false, aiv_split_way>(
            now_input_gm_tensor.GetPhyAddr(0),
            dft_matrix_array_gm_tensor.GetPhyAddr(0),
            tw_matrix_array_gm_tensor.GetPhyAddr(0),
            workspace_input_gm_tensor.GetPhyAddr(0),
            workspace_output_gm_tensor.GetPhyAddr(0),
            workspace_sync_gm_tensor.GetPhyAddr(0),
            radix_list_gm_tensor.GetPhyAddr(0),
            c2c_output_gm_tensor.GetPhyAddr(0),
            auxil_gm_tensor.GetPhyAddr(0),
            batch_actual,
            tiling_data.fftN,
            tiling_data.radixListLen,
            tiling_data.isInverse,
            max_floats
        );

        // Flag exchange (mode 0, flag id 1, MTE3 pipe) and full barrier before the
        // post-processing reads the intermediate region; presumably a rendezvous of
        // all vector cores.
        AscendC::CrossCoreSetFlag<0, PIPE_MTE3>(1);
        AscendC::CrossCoreWaitFlag(1);
        AscendC::PipeBarrier<PIPE_ALL>();

        // Stage 2: R2C_even over a length of 2 * fftN. gm_b goes through its tensor
        // view like every other buffer instead of being passed as a raw pointer.
        R2C_even(
            c2c_output_gm_tensor.GetPhyAddr(0),
            input_index_gm_tensor.GetPhyAddr(0),
            a_gm_tensor.GetPhyAddr(0),
            b_gm_tensor.GetPhyAddr(0),
            output_index_gm_tensor.GetPhyAddr(0),
            now_output_gm_tensor.GetPhyAddr(0),
            tiling_data.fftN * 2,
            batch_actual,
            0,
            batch_actual
        );

        // Stage 3: tail computation, again behind a flag exchange and a full barrier.
        AscendC::CrossCoreSetFlag<0, PIPE_MTE3>(1);
        AscendC::CrossCoreWaitFlag(1);
        AscendC::PipeBarrier<PIPE_ALL>();
        R2C_even_tail_calc(
            c2c_output_gm_tensor.GetPhyAddr(0),
            now_output_gm_tensor.GetPhyAddr(0),
            batch_actual,
            tiling_data.fftN
        );

        // Closing flag exchange of the pass; presumably keeps any core from reusing the
        // shared workspace for the next pass while others are still working on this one.
        AscendC::CrossCoreSetFlag<0, PIPE_MTE3>(1);
        AscendC::CrossCoreWaitFlag(1);
    }
}