* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "prof_api_reg.h"
#include <atomic>
#include <map>
#include <mutex>
#include <unordered_set>
#include "mmpa/mmpa_api.h"
#include "common/log_inner.h"
namespace {
// True while ACL API profiling is switched on.
// It is written by ProfInnerStart/ProfInnerStop (reached through ProcessProfData, which
// holds g_profMutex) and read by AclProfilingReporter without taking that mutex, so it is
// an atomic flag: the unlocked reads must not race with the locked writes.
static std::atomic<bool> g_profRun{false};
// Serializes the handling of profiling switch commands in ProcessProfData.
static std::mutex g_profMutex;
// Device ids added by start commands and removed by stop commands; only touched while
// g_profMutex is held by ProcessProfData.
static std::unordered_set<uint32_t> g_deviceList;
// Bit tested in MsprofCommandHandle::profSwitch to decide whether a command concerns ACL APIs.
constexpr uint64_t ACL_PROF_ACL_API = 0x0001U;
// Values of MsprofCommandHandle::type that ProcessProfData reacts to.
constexpr uint32_t START_PROFILING = 1U;
constexpr uint32_t STOP_PROFILING = 2U;
// Reports whether MM_ENV_GE_PROFILING_TO_STD_OUT yields a non-null value through
// MM_SYS_GET_ENV. Only presence is checked; the content of the value is never inspected.
static bool IsDumpToStdEnabled()
{
    const char *profilingToStdOut = nullptr;
    MM_SYS_GET_ENV(MM_ENV_GE_PROFILING_TO_STD_OUT, profilingToStdOut);
    if (profilingToStdOut == nullptr) {
        return false;
    }
    return true;
}
// Maps every acl::AclProfType value reported by this module to the API name that
// RegisterProfType hands to MsprofRegTypeInfo. Each name is the enumerator's identifier
// with a lower-case first letter. When adding an AclProfType enumerator that should be
// reported, add its entry here as well.
static const std::map<acl::AclProfType, std::string> PROF_TYPE_TO_NAMES = {
// aclop* entries
{acl::AclProfType::AclopLoad, "aclopLoad"},
{acl::AclProfType::AclopExecute, "aclopExecute"},
{acl::AclProfType::AclopCreateHandle, "aclopCreateHandle"},
{acl::AclProfType::AclopDestroyHandle, "aclopDestroyHandle"},
{acl::AclProfType::AclopExecWithHandle, "aclopExecWithHandle"},
{acl::AclProfType::AclopExecuteV2, "aclopExecuteV2"},
{acl::AclProfType::AclopCreateKernel, "aclopCreateKernel"},
{acl::AclProfType::AclopUpdateParams, "aclopUpdateParams"},
{acl::AclProfType::AclopInferShape, "aclopInferShape"},
{acl::AclProfType::AclopCast, "aclopCast"},
{acl::AclProfType::AclopCreateHandleForCast, "aclopCreateHandleForCast"},
{acl::AclProfType::AclopCreateAttr, "aclopCreateAttr"},
{acl::AclProfType::AclopDestroyAttr, "aclopDestroyAttr"},
{acl::AclProfType::AclopCompile, "aclopCompile"},
{acl::AclProfType::AclopCompileAndExecute, "aclopCompileAndExecute"},
{acl::AclProfType::AclopCompileAndExecuteV2, "aclopCompileAndExecuteV2"},
{acl::AclProfType::AclGenGraphAndDumpForOp, "aclGenGraphAndDumpForOp"},
// op compile entries
{acl::AclProfType::OpCompile, "opCompile"},
{acl::AclProfType::OpCompileAndDump, "opCompileAndDump"},
// aclmdl* entries
{acl::AclProfType::AclmdlExecute, "aclmdlExecute"},
{acl::AclProfType::AclmdlLoadFromMemWithQ, "aclmdlLoadFromMemWithQ"},
{acl::AclProfType::AclmdlLoadFromMemWithMem, "aclmdlLoadFromMemWithMem"},
{acl::AclProfType::AclmdlGetDesc, "aclmdlGetDesc"},
{acl::AclProfType::AclmdlLoadFromFile, "aclmdlLoadFromFile"},
{acl::AclProfType::AclmdlLoadFromFileWithMem, "aclmdlLoadFromFileWithMem"},
{acl::AclProfType::AclmdlLoadFromMem, "aclmdlLoadFromMem"},
{acl::AclProfType::AclmdlBundleLoadFromFile, "aclmdlBundleLoadFromFile"},
{acl::AclProfType::AclmdlBundleLoadFromMem, "aclmdlBundleLoadFromMem"},
{acl::AclProfType::AclmdlBundleLoadModelWithMem, "aclmdlBundleLoadModelWithMem"},
{acl::AclProfType::AclmdlBundleLoadModelWithConfig, "aclmdlBundleLoadModelWithConfig"},
{acl::AclProfType::AclmdlBundleUnload, "aclmdlBundleUnload"},
{acl::AclProfType::AclmdlBundleUnloadModel, "aclmdlBundleUnloadModel"},
{acl::AclProfType::AclmdlSetInputAIPP, "aclmdlSetInputAIPP"},
{acl::AclProfType::AclmdlSetAIPPByInputIndex, "aclmdlSetAIPPByInputIndex"},
{acl::AclProfType::AclmdlExecuteAsync, "aclmdlExecuteAsync"},
{acl::AclProfType::AclmdlQuerySize, "aclmdlQuerySize"},
{acl::AclProfType::AclmdlQuerySizeFromMem, "aclmdlQuerySizeFromMem"},
{acl::AclProfType::AclmdlSetDynamicBatchSize, "aclmdlSetDynamicBatchSize"},
{acl::AclProfType::AclmdlSetDynamicHWSize, "aclmdlSetDynamicHWSize"},
{acl::AclProfType::AclmdlSetInputDynamicDims, "aclmdlSetInputDynamicDims"},
{acl::AclProfType::AclmdlLoadWithConfig, "aclmdlLoadWithConfig"},
{acl::AclProfType::AclmdlLoadFromFileWithQ, "aclmdlLoadFromFileWithQ"},
{acl::AclProfType::AclmdlUnload, "aclmdlUnload"},
// tensor descriptor entries
{acl::AclProfType::AclCreateTensorDesc, "aclCreateTensorDesc"},
{acl::AclProfType::AclDestroyTensorDesc, "aclDestroyTensorDesc"},
{acl::AclProfType::AclTransTensorDescFormat, "aclTransTensorDescFormat"},
// aclblas* entries
{acl::AclProfType::AclblasGemmEx, "aclblasGemmEx"},
{acl::AclProfType::AclblasCreateHandleForGemmEx, "aclblasCreateHandleForGemmEx"},
{acl::AclProfType::AclblasCreateHandleForHgemm, "aclblasCreateHandleForHgemm"},
{acl::AclProfType::AclblasHgemm, "aclblasHgemm"},
{acl::AclProfType::AclblasS8gemm, "aclblasS8gemm"},
{acl::AclProfType::AclblasCreateHandleForS8gemm, "aclblasCreateHandleForS8gemm"},
{acl::AclProfType::AclblasGemvEx, "aclblasGemvEx"},
{acl::AclProfType::AclblasCreateHandleForGemvEx, "aclblasCreateHandleForGemvEx"},
{acl::AclProfType::AclblasHgemv, "aclblasHgemv"},
{acl::AclProfType::AclblasCreateHandleForHgemv, "aclblasCreateHandleForHgemv"},
{acl::AclProfType::AclblasCreateHandleForS8gemv, "aclblasCreateHandleForS8gemv"},
{acl::AclProfType::AclblasS8gemv, "aclblasS8gemv"},
};
// Registers each (type id, name) pair of PROF_TYPE_TO_NAMES through MsprofRegTypeInfo
// at MSPROF_REPORT_ACL_LEVEL.
// Returns ACL_SUCCESS when every entry was accepted. On the first rejected entry it logs
// the id and result, stops, and returns ACL_ERROR_PROFILING_FAILURE; entries registered
// before the failure are not rolled back.
static aclError RegisterProfType()
{
    for (auto entry = PROF_TYPE_TO_NAMES.cbegin(); entry != PROF_TYPE_TO_NAMES.cend(); ++entry) {
        const uint32_t apiTypeId = static_cast<uint32_t>(entry->first);
        const auto regRet = MsprofRegTypeInfo(MSPROF_REPORT_ACL_LEVEL, apiTypeId, entry->second.c_str());
        if (regRet == MSPROF_ERROR_NONE) {
            continue;
        }
        ACL_LOG_CALL_ERROR("Registered api type [%u] failed = %d", apiTypeId, regRet);
        return ACL_ERROR_PROFILING_FAILURE;
    }
    return ACL_SUCCESS;
}
// Adds the first deviceNums ids of deviceIdList to g_deviceList.
// Ids already present are skipped silently; each newly inserted id is logged.
// Returns ACL_SUCCESS after the loop; a null deviceIdList is handled by
// ACL_REQUIRES_NOT_NULL before any element is read.
static aclError AddDeviceList(const uint32_t *const deviceIdList, const uint32_t deviceNums)
{
    ACL_REQUIRES_NOT_NULL(deviceIdList);
    for (uint32_t idx = 0U; idx < deviceNums; ++idx) {
        const uint32_t deviceId = deviceIdList[idx];
        // insert() reports through .second whether the id was actually new.
        const bool newlyAdded = g_deviceList.insert(deviceId).second;
        if (newlyAdded) {
            ACL_LOG_INFO("device id %u is successfully added in acl profiling", deviceId);
        }
    }
    return ACL_SUCCESS;
}
// Removes the first deviceNums ids of deviceIdList from g_deviceList.
// Ids that are not present are ignored; each id actually removed is logged.
// Returns ACL_SUCCESS after the loop; a null deviceIdList is handled by
// ACL_REQUIRES_NOT_NULL before any element is read.
static aclError RemoveDeviceList(const uint32_t *const deviceIdList, const uint32_t deviceNums)
{
    ACL_REQUIRES_NOT_NULL(deviceIdList);
    for (uint32_t idx = 0U; idx < deviceNums; ++idx) {
        const uint32_t deviceId = deviceIdList[idx];
        // erase(key) returns the number of elements removed (0 or 1 for a set).
        const bool wasPresent = (g_deviceList.erase(deviceId) != 0U);
        if (wasPresent) {
            ACL_LOG_INFO("device id %u is successfully deleted from acl profiling", deviceId);
        }
    }
    return ACL_SUCCESS;
}
// Handles a start command: on the first start since profiling was last off it registers
// the API type names and raises g_profRun, then records the command's devices.
// Always returns ACL_SUCCESS; the results of RegisterProfType and AddDeviceList are
// deliberately discarded (RegisterProfType logs its own failure).
// profilerConfig is dereferenced without a null check.
static aclError ProfInnerStart(const MsprofCommandHandle *const profilerConfig)
{
    ACL_LOG_INFO("start to execute ProfInnerStart");
    if (!g_profRun) {
        // A registration failure does not stop profiling from being switched on.
        (void)RegisterProfType();
        g_profRun = true;
    }
    const uint32_t *const devices = profilerConfig->devIdList;
    const uint32_t deviceCount = profilerConfig->devNums;
    (void)AddDeviceList(devices, deviceCount);
    ACL_LOG_INFO("successfully execute ProfInnerStart");
    return ACL_SUCCESS;
}
// Handles a stop command: drops the command's devices from g_deviceList and, once no
// device is left, lowers g_profRun. Always returns ACL_SUCCESS.
// profilerConfig is dereferenced without a null check.
static aclError ProfInnerStop(const MsprofCommandHandle *const profilerConfig)
{
    ACL_LOG_INFO("start to execute ProfInnerStop");
    const uint32_t *const devices = profilerConfig->devIdList;
    const uint32_t deviceCount = profilerConfig->devNums;
    (void)RemoveDeviceList(devices, deviceCount);
    // Profiling stays on for as long as at least one device remains registered.
    if (g_profRun && g_deviceList.empty()) {
        g_profRun = false;
    }
    ACL_LOG_INFO("successfully execute ProfInnerStop");
    return ACL_SUCCESS;
}
// Interprets `data` as a MsprofCommandHandle and dispatches it while holding g_profMutex.
//   data - command buffer; a null pointer is handled by ACL_REQUIRES_NOT_NULL.
//   len  - size of the buffer in bytes; must be at least sizeof(MsprofCommandHandle),
//          otherwise ACL_ERROR_INVALID_PARAM is returned.
// Commands whose profSwitch lacks the ACL_PROF_ACL_API bit, or whose type is neither
// START_PROFILING nor STOP_PROFILING, are ignored and yield ACL_SUCCESS.
static aclError ProcessProfData(void *const data, const uint32_t len)
{
    ACL_LOG_INFO("start to execute ProcessProfData");
    const std::lock_guard<std::mutex> guard(g_profMutex);
    ACL_REQUIRES_NOT_NULL(data);
    constexpr size_t minCommandLen = sizeof(MsprofCommandHandle);
    if (len < minCommandLen) {
        ACL_LOG_INNER_ERROR("[Check][Len]len[%u] is invalid, it should not be smaller than %zu", len, minCommandLen);
        return ACL_ERROR_INVALID_PARAM;
    }

    const MsprofCommandHandle *const command = static_cast<const MsprofCommandHandle *>(data);
    const uint64_t switchBits = command->profSwitch;
    const uint32_t commandType = command->type;
    const bool aclApiSelected = ((switchBits & ACL_PROF_ACL_API) != 0U);
    if (!aclApiSelected) {
        return ACL_SUCCESS;
    }

    switch (commandType) {
        case START_PROFILING:
            return ProfInnerStart(command);
        case STOP_PROFILING:
            return ProfInnerStop(command);
        default:
            return ACL_SUCCESS;
    }
}
// Callback registered with MsprofRegisterCallback by AclRegProfCallback.
//   dataType - kind of control message; only PROF_CTRL_SWITCH is processed, anything
//              else is logged and answered with ACL_SUCCESS.
//   data     - message payload; a null pointer is handled by ACL_REQUIRES_NOT_NULL.
//   dataLen  - payload size in bytes, forwarded to ProcessProfData.
// Returns the result of ProcessProfData for switch messages, logging it when it is not
// ACL_SUCCESS.
static aclError AclProfCtrlHandle(uint32_t dataType, void *data, uint32_t dataLen)
{
    ACL_REQUIRES_NOT_NULL(data);
    if (dataType != PROF_CTRL_SWITCH) {
        ACL_LOG_INFO("get unsupported dataType %u while processing profiling data", dataType);
        return ACL_SUCCESS;
    }

    const aclError switchRet = ProcessProfData(data, dataLen);
    if (switchRet != ACL_SUCCESS) {
        ACL_LOG_INNER_ERROR("[Process][ProfSwitch]failed to call ProcessProfData, result is %u", switchRet);
    }
    return switchRet;
}
// Helper whose constructor registers AclProfCtrlHandle as the ASCENDCL profiling callback.
// A failed registration is only logged. The single instance below is a namespace-scope
// object, so the registration runs when this translation unit's static objects are
// initialized.
class AclRegProfCallback {
public:
    AclRegProfCallback()
    {
        const auto registerRet = MsprofRegisterCallback(ASCENDCL, &AclProfCtrlHandle);
        if (registerRet == 0) {
            return;
        }
        ACL_LOG_ERROR("cannot register Callback, prof result = %d", registerRet);
    }
    // Nothing is undone on destruction.
    ~AclRegProfCallback() = default;
};
static AclRegProfCallback g_profCbReg;
}
namespace acl {
// Remembers which API is being measured and, when profiling is running and output is
// not redirected to stdout, records the start timestamp from MsprofSysCycleTime.
// NOTE(review): on the early-return path startTime_ is not assigned here; the destructor
// treats 0 as "not started", so this presumably relies on a zero default in the class
// declaration - confirm in prof_api_reg.h.
AclProfilingReporter::AclProfilingReporter(const AclProfType apiId) : aclApi_(apiId)
{
    if ((!g_profRun) || IsDumpToStdEnabled()) {
        return;
    }
    startTime_ = MsprofSysCycleTime();
}
// Reports one API record [startTime_, now] through MsprofReportApi.
// Nothing is reported when profiling is off, when output is redirected to stdout, or
// when no start timestamp was taken (startTime_ == 0).
AclProfilingReporter::~AclProfilingReporter() noexcept
{
    if ((!g_profRun) || IsDumpToStdEnabled() || (startTime_ == 0UL)) {
        return;
    }
    // Take the end timestamp first so the record does not include the work done below.
    const uint64_t finishTime = MsprofSysCycleTime();
    MsprofApi record{};
    record.beginTime = startTime_;
    record.endTime = finishTime;
    // mmGetTid is called once per thread; later reports on the same thread reuse the value.
    thread_local static auto cachedTid = mmGetTid();
    record.threadId = static_cast<uint32_t>(cachedTid);
    record.level = MSPROF_REPORT_ACL_LEVEL;
    record.type = static_cast<uint32_t>(aclApi_);
    (void)MsprofReportApi(true, &record);
}
}