FFT_3D
产品支持情况
| 产品 | 是否支持 |
|---|---|
| Atlas 200I/500 A2 推理产品 | × |
| Atlas 推理系列产品 | × |
| Atlas 训练系列产品 | × |
| Atlas A3 训练系列产品/Atlas A3 推理系列产品 | √ |
| Atlas A2 训练系列产品/Atlas A2 推理系列产品 | √ |
| Ascend 950PR/Ascend 950DT | × |
功能说明
-
接口功能:
asdFftMakePlan3D:初始化三维FFT配置。
asdFftExecC2C:执行复数到复数的FFT变换。
asdFftExecC2R:执行复数到实数的FFT变换。
asdFftExecR2C:执行实数到复数的FFT变换。
asdFftExecC2CSeparated:执行复数到复数的FFT变换,支持实部、虚部分开输入和输出。 -
计算公式:
设有一个三维离散信号:
它的三维离散傅里叶变换定义为:

其中:

函数原型
AspbStatus asdFftMakePlan3D(
asdFftHandle handle,
int64_t fftSizeX,
int64_t fftSizeY,
int64_t fftSizeZ,
asdFftType fftType,
asdFftDirection direction,
int32_t batchSize)
AspbStatus asdFftExecC2C(
asdFftHandle handle,
const aclTensor * input,
const aclTensor * output)
AspbStatus asdFftExecC2R(
asdFftHandle handle,
const aclTensor * input,
const aclTensor * output)
AspbStatus asdFftExecR2C(
asdFftHandle handle,
const aclTensor * input,
const aclTensor * output)
AspbStatus asdFftExecC2CSeparated(
asdFftHandle handle,
const aclTensor * inputReal,
const aclTensor * inputImag,
const aclTensor * outputReal,
const aclTensor * outputImag)
asdFftMakePlan3D
-
参数说明:
参数名 输入/输出 描述 handle(asdFftHandle) 输入 算子的句柄,需要手动申请创建asdFftHandle对象。 fftSizeX(int64_t) 输入 对应公式中的'Nx',FFT信号长度(第一维)。 fftSizeY(int64_t) 输入 对应公式中的'Ny',FFT信号长度(第二维)。 fftSizeZ(int64_t) 输入 对应公式中的'Nz',FFT信号长度(第三维)。 fftType(asdFftType) 输入 FFT变换类型 - ASCEND_FFT_C2C:复数到复数的快速傅里叶变换。
- ASCEND_FFT_C2R:复数到实数的快速傅里叶变换。
- ASCEND_FFT_R2C:实数到复数的快速傅里叶变换。
- ASCEND_FFT_C2C_SEP:复数到复数的分离式快速傅里叶变换。
direction(asdFftDirection) 输入 选择FFT执行正向变换或反向变换 - ASCEND_FFT_FORWARD:正向快速傅里叶变换。
- ASCEND_FFT_INVERSE:逆向快速傅里叶变换。
batchSize(int32_t) 输入 FFT变换批处理操作中的数据批次数量。 -
返回值:
返回状态码,具体参见SiP返回码。
asdFftExecC2C
-
参数说明:
参数名 输入/输出 描述 handle(asdFftHandle) 输入 算子的句柄,需要手动申请创建asdFftHandle对象。 inData( aclTensor *) 输入 - 对应公式中的'x'。
- 数据类型支持COMPLEX64。
- 数据格式支持ND。
- 输入的shape为(batchSize,fftSizeX,fftSizeY,fftSizeZ)。
outData(aclTensor *) 输出 - 对应公式中的'y'。
- 数据类型支持COMPLEX64。
- 数据格式支持ND。
- 输出的shape为(batchSize,fftSizeX,fftSizeY,fftSizeZ)。
-
返回值:
返回状态码,具体参见SiP返回码。
asdFftExecC2R
-
参数说明:
参数名 输入/输出 描述 handle(asdFftHandle) 输入 算子的句柄,需要手动申请创建asdFftHandle对象。 inData( aclTensor *) 输入 - 对应公式中的'x'。
- 数据类型支持COMPLEX64。
- 数据格式支持ND。
- 输入的shape为(batchSize,fftSizeX,fftSizeY,fftSizeZ/2+1)。
outData(aclTensor *) 输出 - 对应公式中的'y'。
- 数据类型支持FLOAT32。
- 数据格式支持ND。
- 输出的shape为(batchSize,fftSizeX,fftSizeY,fftSizeZ)。
-
返回值:
返回状态码,具体参见SiP返回码。
asdFftExecR2C
-
参数说明:
参数名 输入/输出 描述 handle(asdFftHandle) 输入 算子的句柄,需要手动申请创建asdFftHandle对象。 inData( aclTensor *) 输入 - 对应公式中的'x'。
- 数据类型支持FLOAT32。
- 数据格式支持ND。
- 输入的shape为(batchSize,fftSizeX,fftSizeY,fftSizeZ)。
outData(aclTensor *) 输出 - 对应公式中的'y'。
- 数据类型支持COMPLEX64。
- 数据格式支持ND。
- 输出的shape为(batchSize,fftSizeX,fftSizeY,fftSizeZ/2+1)。
-
返回值:
返回状态码,具体参见SiP返回码。
asdFftExecC2CSeparated
-
参数说明:
参数名 输入/输出 描述 handle(asdFftHandle) 输入 算子的句柄,需要手动申请创建asdFftHandle对象。 inputReal( aclTensor *) 输入 - 公式中的'x'的实部。
- 数据类型支持FLOAT32。
- 数据格式支持ND。
- 输入的shape为(batchSize,fftSize)。
inputImag(aclTensor *) 输入 - 公式中的'x'的虚部。
- 数据类型支持FLOAT32。
- 数据格式支持ND。
- 输入的shape为(batchSize,fftSize)。
outputReal(aclTensor *) 输出 - 公式中的'y'的实部。
- 数据类型支持FLOAT32。
- 数据格式支持ND。
- 输出的shape为(batchSize,fftSize)。
outputImag(aclTensor *) 输出 - 公式中的'y'的虚部。
- 数据类型支持FLOAT32。
- 数据格式支持ND。
- 输出的shape为(batchSize,fftSize)。
-
返回值:
返回状态码,具体参见SiP返回码。
约束说明
- asdFftMakePlan3D
- fftSizeX、fftSizeY、fftSizeZ需保证不超过2272^{27}且分解质因数后不包含超过199的质因子。
- batchSize在存储允许范围内应无额外约束。
- 输入的元素个数理论支持[1,2302^{30}]。
- 输入的元素不支持inf、-inf和nan,如果输入中包含这些值,那么结果为未定义。
- asdFftExecC2CSeparated 信号长度范围[2, 256]。
调用示例
示例代码如下,该样例旨在提供快速上手、开发和调试算子的最小化实现,其核心目标是使用最精简的代码展示算子的核心功能,而非提供生产级的安全保障。不推荐用户直接将示例代码作为业务代码,若用户将示例代码应用在自身的真实业务场景中且发生了安全问题,则需用户自行承担。
- C2C_3D
#include <iostream>
#include <vector>
#include "asdsip.h"
#include "acl/acl.h"
#include "aclnn/acl_meta.h"
using namespace AsdSip;
#define CHECK_RET(cond, return_expr) \
do { \
if (!(cond)) { \
return_expr; \
} \
} while (0)
#define LOG_PRINT(message, ...) \
do { \
printf(message, ##__VA_ARGS__); \
} while (0)
#define ASD_STATUS_CHECK(err) \
do { \
AsdSip::AspbStatus err_ = (err); \
if (err_ != AsdSip::ErrorType::ACL_SUCCESS) { \
std::cout << "Execute failed." << std::endl; \
exit(-1); \
} \
} while (0)
int64_t GetShapeSize(const std::vector<int64_t> &shape)
{
int64_t shapeSize = 1;
for (auto i : shape) {
shapeSize *= i;
}
return shapeSize;
}
int Init(int32_t deviceId, aclrtStream *stream)
{
// 固定写法,AscendCL初始化
auto ret = aclInit(nullptr);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return ret);
ret = aclrtSetDevice(deviceId);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return ret);
ret = aclrtCreateStream(stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return ret);
return 0;
}
template <typename T>
int CreateAclTensor(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
aclDataType dataType, aclTensor **tensor)
{
auto size = GetShapeSize(shape) * sizeof(T);
// 调用aclrtMalloc申请device侧内存
auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);
// 调用aclrtMemcpy将host侧数据复制到device侧内存上
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret);
// 计算连续tensor的strides
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}
// 调用aclCreateTensor接口创建aclTensor
*tensor = aclCreateTensor(shape.data(),
shape.size(),
dataType,
strides.data(),
0,
aclFormat::ACL_FORMAT_ND,
shape.data(),
shape.size(),
*deviceAddr);
return 0;
}
int main() {
int32_t deviceId = 0;
aclrtStream stream;
auto ret = Init(deviceId, &stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return ret);
// 创造tensor的Host侧数据
int batch = 1, Nfft1 = 256, Nfft2 = 64, Nfft3 = 64;
const int64_t tensorInSize = batch * Nfft1 * Nfft2 * Nfft3;
std::vector<int64_t> selfShape = {batch, Nfft1, Nfft2, Nfft3};
std::vector<int64_t> outShape = {batch, Nfft1, Nfft2, Nfft3};
std::vector<std::complex<float>> inputHostData(tensorInSize, std::complex<float>(0, 0));
for (int i = 0; i < tensorInSize; i++) {
inputHostData[i] = std::complex<float>(i, i + 1);
}
std::vector<std::complex<float>> outHostData(tensorInSize, std::complex<float>(0, 0));
void *inputDeviceAddr = nullptr;
void *outDeviceAddr = nullptr;
aclTensor *input = nullptr;
aclTensor *out = nullptr;
ret = CreateAclTensor(inputHostData, selfShape, &inputDeviceAddr, aclDataType::ACL_COMPLEX64, &input);
CHECK_RET(ret == ::ACL_SUCCESS, return ret);
ret = CreateAclTensor(outHostData, outShape, &outDeviceAddr, aclDataType::ACL_COMPLEX64, &out);
CHECK_RET(ret == ::ACL_SUCCESS, return ret);
asdFftHandle handle;
asdFftCreate(handle);
asdFftMakePlan3D(handle, Nfft1, Nfft2, Nfft3, asdFftType::ASCEND_FFT_C2C, asdFftDirection::ASCEND_FFT_FORWARD, batch);
size_t work_size;
asdFftGetWorkspaceSize(handle, work_size);
void *workspaceAddr = nullptr;
if (work_size > 0) {
ret = aclrtMalloc(&workspaceAddr, static_cast<int64_t>(work_size), ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret);
}
asdFftSetWorkspace(handle, (uint8_t *)workspaceAddr);
asdFftSetStream(handle, stream);
ASD_STATUS_CHECK(asdFftExecC2C(handle, input, out));
ret = aclrtSynchronizeStream(stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret);
asdFftDestroy(handle);
auto size = GetShapeSize(outShape);
std::vector<std::complex<float>> outData(size, 0);
ret = aclrtMemcpy(outData.data(),
outData.size() * sizeof(outData[0]),
outDeviceAddr,
size * sizeof(outData[0]),
ACL_MEMCPY_DEVICE_TO_HOST);
// 打印输出tensor值中前16个
for (int64_t i = 0; i < std::min(static_cast<int64_t>(16), tensorInSize); i++) {
std::cout << static_cast<std::complex<float>>(outData[i]) << "\t";
}
std::cout << "\nend result" << std::endl;
std::cout << "Execute successfully." << std::endl;
aclDestroyTensor(input);
aclDestroyTensor(out);
aclrtFree(inputDeviceAddr);
aclrtFree(outDeviceAddr);
if (work_size > 0) {
aclrtFree(workspaceAddr);
}
aclrtDestroyStream(stream);
aclrtResetDevice(deviceId);
aclFinalize();
return 0;
}
- C2R_3D
#include <iostream>
#include <vector>
#include "asdsip.h"
#include "acl/acl.h"
#include "aclnn/acl_meta.h"
using namespace AsdSip;
#define CHECK_RET(cond, return_expr) \
do { \
if (!(cond)) { \
return_expr; \
} \
} while (0)
#define LOG_PRINT(message, ...) \
do { \
printf(message, ##__VA_ARGS__); \
} while (0)
#define ASD_STATUS_CHECK(err) \
do { \
AsdSip::AspbStatus err_ = (err); \
if (err_ != AsdSip::ErrorType::ACL_SUCCESS) { \
std::cout << "Execute failed." << std::endl; \
exit(-1); \
} \
} while (0)
int64_t GetShapeSize(const std::vector<int64_t> &shape)
{
int64_t shapeSize = 1;
for (auto i : shape) {
shapeSize *= i;
}
return shapeSize;
}
int Init(int32_t deviceId, aclrtStream *stream)
{
// 固定写法,AscendCL初始化
auto ret = aclInit(nullptr);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return ret);
ret = aclrtSetDevice(deviceId);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return ret);
ret = aclrtCreateStream(stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return ret);
return 0;
}
template <typename T>
int CreateAclTensor(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
aclDataType dataType, aclTensor **tensor)
{
auto size = GetShapeSize(shape) * sizeof(T);
// 调用aclrtMalloc申请device侧内存
auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);
// 调用aclrtMemcpy将host侧数据复制到device侧内存上
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret);
// 计算连续tensor的strides
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}
// 调用aclCreateTensor接口创建aclTensor
*tensor = aclCreateTensor(shape.data(),
shape.size(),
dataType,
strides.data(),
0,
aclFormat::ACL_FORMAT_ND,
shape.data(),
shape.size(),
*deviceAddr);
return 0;
}
int main()
{
int32_t deviceId = 0;
aclrtStream stream;
auto ret = Init(deviceId, &stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return ret);
// 创造tensor的Host侧数据
int batch = 2, Nfft1 = 2, Nfft2 = 128, Nfft3 = 128;
const int64_t inSignal = Nfft3 / 2 + 1;
const int64_t outSignal = Nfft3;
const int64_t tensorInSize = batch * Nfft1 * Nfft2 * inSignal;
const int64_t tensorOutSize = batch * Nfft1 * Nfft2 * outSignal;
std::vector<int64_t> selfShape = {batch, Nfft1, Nfft2, inSignal};
std::vector<int64_t> outShape = {batch, Nfft1, Nfft2, outSignal};
std::vector<std::complex<float>> inputHostData(tensorInSize, std::complex<float>(0, 0));
for (int i = 0; i < tensorInSize; i++) {
inputHostData[i] = std::complex<float>(i, i + 1);
}
std::vector<float> outHostData(tensorOutSize, 0);
void *inputDeviceAddr = nullptr;
void *outDeviceAddr = nullptr;
aclTensor *input = nullptr;
aclTensor *out = nullptr;
ret = CreateAclTensor(inputHostData, selfShape, &inputDeviceAddr, aclDataType::ACL_COMPLEX64, &input);
CHECK_RET(ret == ::ACL_SUCCESS, return ret);
ret = CreateAclTensor(outHostData, outShape, &outDeviceAddr, aclDataType::ACL_FLOAT, &out);
CHECK_RET(ret == ::ACL_SUCCESS, return ret);
asdFftHandle handle;
asdFftCreate(handle);
asdFftMakePlan3D(handle, Nfft1, Nfft2, Nfft3, asdFftType::ASCEND_FFT_C2R, asdFftDirection::ASCEND_FFT_FORWARD, batch);
size_t work_size;
asdFftGetWorkspaceSize(handle, work_size);
void *workspaceAddr = nullptr;
if (work_size > 0) {
ret = aclrtMalloc(&workspaceAddr, static_cast<int64_t>(work_size), ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret);
}
asdFftSetWorkspace(handle, (uint8_t *)workspaceAddr);
asdFftSetStream(handle, stream);
ASD_STATUS_CHECK(asdFftExecC2R(handle, input, out));
ret = aclrtSynchronizeStream(stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret);
asdFftDestroy(handle);
auto size = GetShapeSize(outShape);
std::vector<float> outData(size, 0);
ret = aclrtMemcpy(outData.data(),
outData.size() * sizeof(outData[0]),
outDeviceAddr,
size * sizeof(outData[0]),
ACL_MEMCPY_DEVICE_TO_HOST);
// 打印输出tensor值中前16个
for (int64_t i = 0; i < std::min(static_cast<int64_t>(16), tensorOutSize); i++) {
std::cout << static_cast<float>(outData[i]) << "\t";
}
std::cout << "\nend result" << std::endl;
std::cout << "Execute successfully." << std::endl;
aclDestroyTensor(input);
aclDestroyTensor(out);
aclrtFree(inputDeviceAddr);
aclrtFree(outDeviceAddr);
if (work_size > 0) {
aclrtFree(workspaceAddr);
}
aclrtDestroyStream(stream);
aclrtResetDevice(deviceId);
aclFinalize();
return 0;
}
- R2C_3D
#include <iostream>
#include <vector>
#include "asdsip.h"
#include "acl/acl.h"
#include "aclnn/acl_meta.h"
using namespace AsdSip;
#define CHECK_RET(cond, return_expr) \
do { \
if (!(cond)) { \
return_expr; \
} \
} while (0)
#define LOG_PRINT(message, ...) \
do { \
printf(message, ##__VA_ARGS__); \
} while (0)
#define ASD_STATUS_CHECK(err) \
do { \
AsdSip::AspbStatus err_ = (err); \
if (err_ != AsdSip::ErrorType::ACL_SUCCESS) { \
std::cout << "Execute failed." << std::endl; \
exit(-1); \
} \
} while (0)
int64_t GetShapeSize(const std::vector<int64_t> &shape)
{
int64_t shapeSize = 1;
for (auto i : shape) {
shapeSize *= i;
}
return shapeSize;
}
int Init(int32_t deviceId, aclrtStream *stream)
{
// 固定写法,AscendCL初始化
auto ret = aclInit(nullptr);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return ret);
ret = aclrtSetDevice(deviceId);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return ret);
ret = aclrtCreateStream(stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return ret);
return 0;
}
template <typename T>
int CreateAclTensor(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
aclDataType dataType, aclTensor **tensor)
{
auto size = GetShapeSize(shape) * sizeof(T);
// 调用aclrtMalloc申请device侧内存
auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);
// 调用aclrtMemcpy将host侧数据复制到device侧内存上
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret);
// 计算连续tensor的strides
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}
// 调用aclCreateTensor接口创建aclTensor
*tensor = aclCreateTensor(shape.data(),
shape.size(),
dataType,
strides.data(),
0,
aclFormat::ACL_FORMAT_ND,
shape.data(),
shape.size(),
*deviceAddr);
return 0;
}
int main()
{
int32_t deviceId = 0;
aclrtStream stream;
auto ret = Init(deviceId, &stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return ret);
// 创造tensor的Host侧数据
int batch = 1, Nfft1 = 1, Nfft2 = 64, Nfft3 = 32;
const int64_t tensorInSize = batch * Nfft1 * Nfft2 * Nfft3;
const int64_t tensorOutSize = batch * Nfft1 * Nfft2 * (Nfft3 / 2 + 1);
std::vector<int64_t> selfShape = {batch, Nfft1, Nfft2, Nfft3};
std::vector<int64_t> outShape = {batch, Nfft1, Nfft2, Nfft3 / 2 + 1};
std::vector<float> inputHostData(tensorInSize, 0);
for (int i = 0; i < tensorInSize; i++) {
inputHostData[i] = i;
}
std::vector<std::complex<float>> outHostData(tensorInSize, std::complex<float>(0, 0));
void *inputDeviceAddr = nullptr;
void *outDeviceAddr = nullptr;
aclTensor *input = nullptr;
aclTensor *out = nullptr;
ret = CreateAclTensor(inputHostData, selfShape, &inputDeviceAddr, aclDataType::ACL_FLOAT, &input);
CHECK_RET(ret == ::ACL_SUCCESS, return ret);
ret = CreateAclTensor(outHostData, outShape, &outDeviceAddr, aclDataType::ACL_COMPLEX64, &out);
CHECK_RET(ret == ::ACL_SUCCESS, return ret);
asdFftHandle handle;
asdFftCreate(handle);
asdFftMakePlan3D(handle, Nfft1, Nfft2, Nfft3, asdFftType::ASCEND_FFT_R2C, asdFftDirection::ASCEND_FFT_FORWARD, batch);
size_t work_size;
asdFftGetWorkspaceSize(handle, work_size);
void *workspaceAddr = nullptr;
if (work_size > 0) {
ret = aclrtMalloc(&workspaceAddr, static_cast<int64_t>(work_size), ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret);
}
asdFftSetWorkspace(handle, (uint8_t *)workspaceAddr);
asdFftSetStream(handle, stream);
ASD_STATUS_CHECK(asdFftExecR2C(handle, input, out));
ret = aclrtSynchronizeStream(stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret);
asdFftDestroy(handle);
auto size = GetShapeSize(outShape);
std::vector<std::complex<float>> outData(size, 0);
ret = aclrtMemcpy(outData.data(),
outData.size() * sizeof(outData[0]),
outDeviceAddr,
size * sizeof(outData[0]),
ACL_MEMCPY_DEVICE_TO_HOST);
// 打印输出tensor值中前16个
for (int64_t i = 0; i < std::min(static_cast<int64_t>(16), tensorOutSize); i++) {
std::cout << static_cast<std::complex<float>>(outData[i]) << "\t";
}
std::cout << "\nend result" << std::endl;
std::cout << "Execute successfully." << std::endl;
aclDestroyTensor(input);
aclDestroyTensor(out);
aclrtFree(inputDeviceAddr);
aclrtFree(outDeviceAddr);
if (work_size > 0) {
aclrtFree(workspaceAddr);
}
aclrtDestroyStream(stream);
aclrtResetDevice(deviceId);
aclFinalize();
return 0;
}
- C2C_3D_SEP
#include <iostream>
#include <fstream>
#include <random>
#include <vector>
#include "asdsip.h"
#include "acl/acl.h"
#include "aclnn/acl_meta.h"
using namespace AsdSip;
#define CHECK_RET(cond, return_expr) \
do { \
if (!(cond)) { \
return_expr; \
} \
} while (0)
#define LOG_PRINT(message, ...) \
do { \
printf(message, ##__VA_ARGS__); \
} while (0)
#define ASD_STATUS_CHECK(err) \
do { \
AsdSip::AspbStatus err_ = (err); \
if (err_ != AsdSip::ErrorType::ACL_SUCCESS) { \
std::cout << "Execute failed." << std::endl; \
exit(-1); \
} \
} while (0)
int64_t GetShapeSize(const std::vector<int64_t> &shape)
{
int64_t shapeSize = 1;
for (auto i : shape) {
shapeSize *= i;
}
return shapeSize;
}
int Init(int32_t deviceId, aclrtStream *stream)
{
// 固定写法,AscendCL初始化
auto ret = aclInit(nullptr);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclInit failed. ERROR: %d\n", ret); return ret);
ret = aclrtSetDevice(deviceId);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtSetDevice failed. ERROR: %d\n", ret); return ret);
ret = aclrtCreateStream(stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtCreateStream failed. ERROR: %d\n", ret); return ret);
return 0;
}
template <typename T>
int CreateAclTensor(const std::vector<T> &hostData, const std::vector<int64_t> &shape, void **deviceAddr,
aclDataType dataType, aclTensor **tensor)
{
auto size = GetShapeSize(shape) * sizeof(T);
// 调用aclrtMalloc申请device侧内存
auto ret = aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtMalloc failed. ERROR: %d\n", ret); return ret);
// 调用aclrtMemcpy将host侧数据复制到device侧内存上
ret = aclrtMemcpy(*deviceAddr, size, hostData.data(), size, ACL_MEMCPY_HOST_TO_DEVICE);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtMemcpy failed. ERROR: %d\n", ret); return ret);
// 计算连续tensor的strides
std::vector<int64_t> strides(shape.size(), 1);
for (int64_t i = shape.size() - 2; i >= 0; i--) {
strides[i] = shape[i + 1] * strides[i + 1];
}
// 调用aclCreateTensor接口创建aclTensor
*tensor = aclCreateTensor(shape.data(),
shape.size(),
dataType,
strides.data(),
0,
aclFormat::ACL_FORMAT_ND,
shape.data(),
shape.size(),
*deviceAddr);
return 0;
}
int main()
{
int32_t deviceId = 0;
aclrtStream stream;
auto ret = Init(deviceId, &stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("Init acl failed. ERROR: %d\n", ret); return ret);
// 创造tensor的Host侧数据
// int batch = 2, Nfft1 = 256, Nfft2 = 256, Nfft3 = 256; // core dd
int batch = 2, Nfft1 = 4, Nfft2 = 4, Nfft3 = 4; // core dd
// int batch = 32, Nfft = 256; // c2c dft
// int batch = 32, Nfft = 8192; // c2c fftb
// int batch = 32, Nfft = 15000; // c2c mixed
// int batch = 32, Nfft = 32768; // c2c fftn
// int batch = 32, Nfft = 199 * 199; // core any
const int64_t tensorInSize = batch * Nfft1 * Nfft2 * Nfft3;
std::vector<int64_t> selfShape = {batch, Nfft1, Nfft2, Nfft3};
std::vector<int64_t> outShape = {batch, Nfft1, Nfft2, Nfft3};
std::vector<float> inputRealHostData(tensorInSize, 0);
std::vector<float> inputImagHostData(tensorInSize, 0);
std::vector<float> outputRealHostData(tensorInSize, 0);
std::vector<float> outputImagHostData(tensorInSize, 0);
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> dis(0.0f, 1.0f);
for (int i = 0; i < tensorInSize; i++) {
inputRealHostData[i] = dis(gen);
inputImagHostData[i] = dis(gen);
}
void *inputRealDeviceAddr = nullptr;
void *inputImagDeviceAddr = nullptr;
void *outputRealDeviceAddr = nullptr;
void *outputImagDeviceAddr = nullptr;
aclTensor *inputReal = nullptr;
aclTensor *inputImag = nullptr;
aclTensor *outputReal = nullptr;
aclTensor *outputImag = nullptr;
ret = CreateAclTensor(inputRealHostData, selfShape, &inputRealDeviceAddr, aclDataType::ACL_FLOAT, &inputReal);
CHECK_RET(ret == ::ACL_SUCCESS, return ret);
ret = CreateAclTensor(inputImagHostData, selfShape, &inputImagDeviceAddr, aclDataType::ACL_FLOAT, &inputImag);
CHECK_RET(ret == ::ACL_SUCCESS, return ret);
ret = CreateAclTensor(outputRealHostData, outShape, &outputRealDeviceAddr, aclDataType::ACL_FLOAT, &outputReal);
CHECK_RET(ret == ::ACL_SUCCESS, return ret);
ret = CreateAclTensor(outputImagHostData, outShape, &outputImagDeviceAddr, aclDataType::ACL_FLOAT, &outputImag);
CHECK_RET(ret == ::ACL_SUCCESS, return ret);
asdFftHandle handle;
asdFftCreate(handle);
asdFftMakePlan3D(handle, Nfft1, Nfft2, Nfft3, asdFftType::ASCEND_FFT_C2C_SEP, asdFftDirection::ASCEND_FFT_FORWARD, batch);
size_t work_size;
asdFftGetWorkspaceSize(handle, work_size);
void *workspaceAddr = nullptr;
if (work_size > 0) {
ret = aclrtMalloc(&workspaceAddr, static_cast<int64_t>(work_size), ACL_MEM_MALLOC_HUGE_FIRST);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("allocate workspace failed. ERROR: %d\n", ret); return ret);
}
asdFftSetWorkspace(handle, (uint8_t *)workspaceAddr);
asdFftSetStream(handle, stream);
ASD_STATUS_CHECK(asdFftExecC2CSeparated(handle, inputReal, inputImag, outputReal, outputImag));
ret = aclrtSynchronizeStream(stream);
CHECK_RET(ret == ::ACL_SUCCESS, LOG_PRINT("aclrtSynchronizeStream failed. ERROR: %d\n", ret); return ret);
asdFftDestroy(handle);
auto size = GetShapeSize(outShape);
std::vector<float> outRealData(size, 0);
std::vector<float> outImagData(size, 0);
std::vector<float> workspaceData(size * 2, -1);
ret = aclrtMemcpy(outRealData.data(),
outRealData.size() * sizeof(outRealData[0]),
outputRealDeviceAddr,
size * sizeof(outRealData[0]),
ACL_MEMCPY_DEVICE_TO_HOST);
ret = aclrtMemcpy(outImagData.data(),
outImagData.size() * sizeof(outImagData[0]),
outputImagDeviceAddr,
size * sizeof(outImagData[0]),
ACL_MEMCPY_DEVICE_TO_HOST);
ret = aclrtMemcpy(workspaceData.data(),
workspaceData.size() * sizeof(workspaceData[0]),
workspaceAddr,
workspaceData.size() * sizeof(workspaceData[0]),
ACL_MEMCPY_DEVICE_TO_HOST);
// 打印输出tensor值中前16个
std::cout << "real part:" << std::endl;
for (int64_t i = 0; i < size; i++) {
std::cout << static_cast<float>(outRealData[i]) << "\t";
}
std::cout << "\nimag part:" << std::endl;
for (int64_t i = 0; i < size; i++) {
std::cout << static_cast<float>(outImagData[i]) << "\t";
}
std::cout << "\nworkspace real part:" << std::endl;
for (int64_t i = 0; i < size; i++) {
std::cout << static_cast<float>(workspaceData[i]) << "\t";
}
std::cout << "\nworkspace imag part:" << std::endl;
for (int64_t i = 0; i < size; i++) {
std::cout << static_cast<float>(workspaceData[i + size]) << "\t";
}
std::cout << "\nend result" << std::endl;
std::cout << "Execute successfully." << std::endl;
aclDestroyTensor(inputReal);
aclDestroyTensor(inputImag);
aclDestroyTensor(outputReal);
aclDestroyTensor(outputImag);
aclrtFree(inputRealDeviceAddr);
aclrtFree(inputImagDeviceAddr);
aclrtFree(outputRealDeviceAddr);
aclrtFree(outputImagDeviceAddr);
if (work_size > 0) {
aclrtFree(workspaceAddr);
}
aclrtDestroyStream(stream);
aclrtResetDevice(deviceId);
aclFinalize();
return 0;
}