* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include <vector>
#include <iostream>
#include "utils.h"
#include "acl/acl.h"
#include "model_utils.h"
#include "aclnnop/aclnn_add.h"
#include "aclnnop/aclnn_mul.h"
using namespace std;
// Launcher for the EasyOP kernel; its definition lives outside this file.
// main() calls it with blockDim as coreDim, stream1 as stream and numDevice (a device buffer
// holding one uint32_t) as x.
// NOTE(review): presumably the kernel updates *x on every launch so the switch-stream
// comparisons in main() see a different value per model execution — confirm against the kernel source.
extern void EasyOP(uint32_t coreDim, void *stream, uint32_t* x);
int main()
{
int deviceId = 0;
int blockDim = 1;
int num = 0;
uint32_t *numDevice = nullptr;
void *selfDevice1 = nullptr;
void *otherDevice1 = nullptr;
void *outDevice1 = nullptr;
void *selfDevice2 = nullptr;
void *otherDevice2 = nullptr;
void *outDevice2 = nullptr;
void *selfDevice3 = nullptr;
void *otherDevice3 = nullptr;
void *outDevice3 = nullptr;
aclTensor *self1 = nullptr;
aclTensor *other1 = nullptr;
aclScalar *alpha = nullptr;
aclTensor *out1 = nullptr;
aclTensor *self2 = nullptr;
aclTensor *other2 = nullptr;
aclTensor *out2 = nullptr;
aclTensor *self3 = nullptr;
aclTensor *other3 = nullptr;
aclTensor *out3 = nullptr;
vector<float> selfHostData1 = {1, 1, 1, 1, 1, 1, 1, 1};
vector<float> otherHostData1 = {2, 2, 2, 2, 2, 2, 2, 2};
vector<float> outHostData1 = {0, 0, 0, 0, 0, 0, 0, 0};
vector<float> selfHostData2 = {2, 2, 2, 2, 2, 2, 2, 2};
vector<float> otherHostData2 = {2, 2, 2, 2, 2, 2, 2, 2};
vector<float> outHostData2 = {0, 0, 0, 0, 0, 0, 0, 0};
vector<float> selfHostData3 = {3, 3, 3, 3, 3, 3, 3, 3};
vector<float> otherHostData3 = {2, 2, 2, 2, 2, 2, 2, 2};
vector<float> outHostData3 = {0, 0, 0, 0, 0, 0, 0, 0};
vector<int64_t> shape = {4, 2};
float alphaValue = 1.2f;
uint64_t addWorkspaceSize1 = 0;
uint64_t addWorkspaceSize2 = 0;
uint64_t addWorkspaceSize3 = 0;
aclOpExecutor *addExecutor1;
aclOpExecutor *addExecutor2;
aclOpExecutor *addExecutor3;
aclrtContext context;
int64_t size = ModelUtils::GetShapeSize(shape) * sizeof(float);
CHECK_ERROR(aclInit(NULL));
CHECK_ERROR(aclrtSetDevice(deviceId));
CHECK_ERROR(aclrtCreateContext(&context, deviceId));
ModelUtils::CreateAclTensor(shape, &selfDevice1, aclDataType::ACL_FLOAT, &self1);
ModelUtils::CreateAclTensor(shape, &otherDevice1, aclDataType::ACL_FLOAT, &other1);
ModelUtils::CreateAclTensor(shape, &selfDevice2, aclDataType::ACL_FLOAT, &self2);
ModelUtils::CreateAclTensor(shape, &otherDevice2, aclDataType::ACL_FLOAT, &other2);
ModelUtils::CreateAclTensor(shape, &selfDevice3, aclDataType::ACL_FLOAT, &self3);
ModelUtils::CreateAclTensor(shape, &otherDevice3, aclDataType::ACL_FLOAT, &other3);
alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT);
ModelUtils::CreateAclTensor(shape, &outDevice1, aclDataType::ACL_FLOAT, &out1);
ModelUtils::CreateAclTensor(shape, &outDevice2, aclDataType::ACL_FLOAT, &out2);
ModelUtils::CreateAclTensor(shape, &outDevice3, aclDataType::ACL_FLOAT, &out3);
aclnnAddGetWorkspaceSize(self1, other1, alpha, out1, &addWorkspaceSize1, &addExecutor1);
void *addWorkspaceAddr1 = nullptr;
if (addWorkspaceSize1 > 0) {
CHECK_ERROR(aclrtMalloc(&addWorkspaceAddr1, addWorkspaceSize1, ACL_MEM_MALLOC_HUGE_FIRST));
}
aclnnAddGetWorkspaceSize(self2, other2, alpha, out2, &addWorkspaceSize2, &addExecutor2);
void *addWorkspaceAddr2 = nullptr;
if (addWorkspaceSize2 > 0) {
CHECK_ERROR(aclrtMalloc(&addWorkspaceAddr2, addWorkspaceSize2, ACL_MEM_MALLOC_HUGE_FIRST));
}
aclnnAddGetWorkspaceSize(self3, other3, alpha, out3, &addWorkspaceSize3, &addExecutor3);
void *addWorkspaceAddr3 = nullptr;
if (addWorkspaceSize3 > 0) {
CHECK_ERROR(aclrtMalloc(&addWorkspaceAddr3, addWorkspaceSize3, ACL_MEM_MALLOC_HUGE_FIRST));
}
int32_t rightValue1 = 1;
int32_t rightValue2 = 2;
void *rightDevice1 = nullptr;
void *rightDevice2 = nullptr;
aclrtCondition condition = ACL_RT_EQUAL;
aclrtCompareDataType dataType = ACL_RT_SWITCH_INT32;
CHECK_ERROR(aclrtMalloc(&rightDevice1, sizeof(int32_t), ACL_MEM_MALLOC_HUGE_FIRST));
CHECK_ERROR(aclrtMemcpy(rightDevice1, sizeof(int32_t), &rightValue1, sizeof(int32_t), ACL_MEMCPY_HOST_TO_DEVICE));
CHECK_ERROR(aclrtMalloc(&rightDevice2, sizeof(int32_t), ACL_MEM_MALLOC_HUGE_FIRST));
CHECK_ERROR(aclrtMemcpy(rightDevice2, sizeof(int32_t), &rightValue2, sizeof(int32_t), ACL_MEMCPY_HOST_TO_DEVICE));
CHECK_ERROR(aclrtMalloc((void **)&numDevice, sizeof(uint32_t), ACL_MEM_MALLOC_HUGE_FIRST));
aclmdlRI modelRI;
aclrtStream stream1;
aclrtStream stream2;
aclrtStream stream3;
aclrtStream stream4;
CHECK_ERROR(aclrtCreateStreamWithConfig(&stream1, 0x00U, ACL_STREAM_PERSISTENT));
CHECK_ERROR(aclrtCreateStreamWithConfig(&stream2, 0x00U, ACL_STREAM_PERSISTENT));
CHECK_ERROR(aclrtCreateStreamWithConfig(&stream3, 0x00U, ACL_STREAM_PERSISTENT));
CHECK_ERROR(aclrtCreateStreamWithConfig(&stream4, 0x00U, ACL_STREAM_PERSISTENT));
CHECK_ERROR(aclmdlRIBuildBegin(&modelRI, 0x00U));
CHECK_ERROR(aclmdlRIBindStream(modelRI, stream1, ACL_MODEL_STREAM_FLAG_HEAD));
CHECK_ERROR(aclmdlRIBindStream(modelRI, stream2, ACL_MODEL_STREAM_FLAG_DEFAULT));
CHECK_ERROR(aclmdlRIBindStream(modelRI, stream3, ACL_MODEL_STREAM_FLAG_DEFAULT));
CHECK_ERROR(aclmdlRIBindStream(modelRI, stream4, ACL_MODEL_STREAM_FLAG_DEFAULT));
CHECK_ERROR(aclrtMemcpyAsync(selfDevice1, size, selfHostData1.data(), size, ACL_MEMCPY_HOST_TO_DEVICE, stream1));
CHECK_ERROR(aclrtMemcpyAsync(selfDevice2, size, selfHostData2.data(), size, ACL_MEMCPY_HOST_TO_DEVICE, stream2));
CHECK_ERROR(aclrtMemcpyAsync(selfDevice3, size, selfHostData3.data(), size, ACL_MEMCPY_HOST_TO_DEVICE, stream3));
CHECK_ERROR(aclrtMemcpyAsync(otherDevice1, size, otherHostData1.data(), size, ACL_MEMCPY_HOST_TO_DEVICE, stream1));
CHECK_ERROR(aclrtMemcpyAsync(otherDevice2, size, otherHostData2.data(), size, ACL_MEMCPY_HOST_TO_DEVICE, stream2));
CHECK_ERROR(aclrtMemcpyAsync(otherDevice3, size, otherHostData3.data(), size, ACL_MEMCPY_HOST_TO_DEVICE, stream3));
aclnnAdd(addWorkspaceAddr1, addWorkspaceSize1, addExecutor1, stream1);
EasyOP(blockDim, stream1, numDevice);
aclnnAdd(addWorkspaceAddr2, addWorkspaceSize2, addExecutor2, stream2);
aclnnAdd(addWorkspaceAddr3, addWorkspaceSize3, addExecutor3, stream3);
CHECK_ERROR(aclrtSwitchStream(numDevice, condition, rightDevice1, dataType, stream3, nullptr, stream1));
CHECK_ERROR(aclrtSwitchStream(numDevice, condition, rightDevice2, dataType, stream4, nullptr, stream1));
CHECK_ERROR(aclrtActiveStream(stream2, stream4));
CHECK_ERROR(aclmdlRIEndTask(modelRI, stream2));
CHECK_ERROR(aclmdlRIEndTask(modelRI, stream4));
CHECK_ERROR(aclmdlRIEndTask(modelRI, stream3));
CHECK_ERROR(aclmdlRIBuildEnd(modelRI, NULL));
aclrtStream executeStream;
aclrtCreateStream(&executeStream);
CHECK_ERROR(aclmdlRIExecuteAsync(modelRI, executeStream));
CHECK_ERROR(aclrtSynchronizeStream(executeStream));
CHECK_ERROR(aclrtMemcpy(outHostData1.data(), size, outDevice1, size, ACL_MEMCPY_DEVICE_TO_HOST));
CHECK_ERROR(aclrtMemcpy(outHostData2.data(), size, outDevice2, size, ACL_MEMCPY_DEVICE_TO_HOST));
CHECK_ERROR(aclrtMemcpy(outHostData3.data(), size, outDevice3, size, ACL_MEMCPY_DEVICE_TO_HOST));
INFO_LOG("After executing, print data1.");
ModelUtils::PrintArray(outHostData1);
INFO_LOG("After executing, print data2.");
ModelUtils::PrintArray(outHostData2);
INFO_LOG("After executing, print data3.");
ModelUtils::PrintArray(outHostData3);
outHostData1.assign(outHostData1.size(), 0);
outHostData2.assign(outHostData2.size(), 0);
outHostData3.assign(outHostData3.size(), 0);
CHECK_ERROR(aclrtMemcpyAsync(outDevice1, size, outHostData1.data(), size, ACL_MEMCPY_HOST_TO_DEVICE, stream1));
CHECK_ERROR(aclrtMemcpyAsync(outDevice2, size, outHostData2.data(), size, ACL_MEMCPY_HOST_TO_DEVICE, stream2));
CHECK_ERROR(aclrtMemcpyAsync(outDevice3, size, outHostData3.data(), size, ACL_MEMCPY_HOST_TO_DEVICE, stream3));
CHECK_ERROR(aclmdlRIExecuteAsync(modelRI, executeStream));
CHECK_ERROR(aclrtSynchronizeStream(executeStream));
CHECK_ERROR(aclrtMemcpy(outHostData1.data(), size, outDevice1, size, ACL_MEMCPY_DEVICE_TO_HOST));
CHECK_ERROR(aclrtMemcpy(outHostData2.data(), size, outDevice2, size, ACL_MEMCPY_DEVICE_TO_HOST));
CHECK_ERROR(aclrtMemcpy(outHostData3.data(), size, outDevice3, size, ACL_MEMCPY_DEVICE_TO_HOST));
INFO_LOG("After activing stream2 and executing, print data1.");
ModelUtils::PrintArray(outHostData1);
INFO_LOG("After activing stream2 and executing, print data2.");
ModelUtils::PrintArray(outHostData2);
INFO_LOG("After activing stream2 and executing, print data3.");
ModelUtils::PrintArray(outHostData3);
CHECK_ERROR(aclmdlRIUnbindStream(modelRI, stream1));
CHECK_ERROR(aclmdlRIUnbindStream(modelRI, stream2));
CHECK_ERROR(aclmdlRIUnbindStream(modelRI, stream3));
CHECK_ERROR(aclmdlRIUnbindStream(modelRI, stream4));
CHECK_ERROR(aclmdlRIDestroy(modelRI));
CHECK_ERROR(aclrtDestroyStream(stream1));
CHECK_ERROR(aclrtDestroyStream(stream2));
CHECK_ERROR(aclrtDestroyStream(stream3));
CHECK_ERROR(aclrtDestroyStream(stream4));
CHECK_ERROR(aclDestroyTensor(self1));
CHECK_ERROR(aclDestroyTensor(other1));
CHECK_ERROR(aclDestroyTensor(out1));
CHECK_ERROR(aclDestroyTensor(self2));
CHECK_ERROR(aclDestroyTensor(other2));
CHECK_ERROR(aclDestroyTensor(out2));
CHECK_ERROR(aclDestroyTensor(self3));
CHECK_ERROR(aclDestroyTensor(other3));
CHECK_ERROR(aclDestroyTensor(out3));
CHECK_ERROR(aclDestroyScalar(alpha));
CHECK_ERROR(aclrtFree(numDevice));
CHECK_ERROR(aclrtFree(selfDevice1));
CHECK_ERROR(aclrtFree(otherDevice1));
CHECK_ERROR(aclrtFree(outDevice1));
CHECK_ERROR(aclrtFree(selfDevice2));
CHECK_ERROR(aclrtFree(otherDevice2));
CHECK_ERROR(aclrtFree(outDevice2));
CHECK_ERROR(aclrtFree(selfDevice3));
CHECK_ERROR(aclrtFree(otherDevice3));
CHECK_ERROR(aclrtFree(outDevice3));
if (addWorkspaceAddr1 != nullptr) {
CHECK_ERROR(aclrtFree(addWorkspaceAddr1));
}
if (addWorkspaceAddr1 != nullptr) {
CHECK_ERROR(aclrtFree(addWorkspaceAddr1));
}
if (addWorkspaceAddr1 != nullptr) {
CHECK_ERROR(aclrtFree(addWorkspaceAddr1));
}
CHECK_ERROR(aclrtDestroyContext(context));
CHECK_ERROR(aclrtResetDeviceForce(deviceId));
CHECK_ERROR(aclFinalize());
}