/**
 * Copyright (c) 2025-2026 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
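/*!
 * \file dump_device_perf.cpp
 * \brief Host-side dump of device performance data (AICore task statistics and AICPU/AICore perf traces) to JSON
 *        files, followed by invocation of the swimlane / perfetto conversion scripts.
 */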
#include "machine/runtime/runner/dump_device_perf.h"
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
#include "tilefwk/pypto_fwk_log.h"
#include "adapter/api/runtime_api.h"
#include "machine/runtime/launcher/device_launcher.h"
#include "interface/machine/device/tilefwk/aicpu_common.h"
#include "interface/utils/file_utils.h"
#include "interface/configs/config_manager.h"
#include "machine/device/dynamic/device_utils.h"
#include "machine/device/distributed/common.h"
#include "machine/utils/checkinject.h"
#include "nlohmann/json.hpp"
using json = nlohmann::json;
namespace npu::tile_fwk::dynamic {
namespace {
// Indentation width handed to json::dump() when the perf documents are serialized.
constexpr int DUMP_LEVEL_FOUR = 4;
// Number of perf slots located right after the AIC/AIV slots that are dumped with the "AI-CPU" core type.
constexpr uint32_t AICPU_NUM_OF_RUN_AICPU_TASKS = 1;
// Round count reached by the previous machine-trace dump; later dumps start from this round.
uint32_t g_last_round_num = 0;

/**
 * \brief Forwards to RuntimeMemcpy; while the launcher is in capture mode an AclModeGuard built with
 *        AclMdlRICaptureMode::RELAXED is kept alive for the duration of the call.
 * \param dst      destination address
 * \param destMax  capacity of the destination in bytes
 * \param src      source address
 * \param cnt      number of bytes to copy
 * \param kind     copy direction
 * \return the status reported by RuntimeMemcpy
 */
inline RtError NormalizedRtMemcpy(
    void *dst, uint64_t destMax, const void *src, uint64_t cnt, RtMemcpyKind kind)
{
    // NOTE(review): AclModeGuard is presumably a scoped guard that restores the previous capture mode
    // when destroyed - confirm against its definition.
    std::optional<AclModeGuard> relaxedCapture;
    const bool capturing = DeviceLauncher::IsCaptureMode();
    if (capturing) {
        relaxedCapture.emplace(AclMdlRICaptureMode::RELAXED);
    }
    // The guard (if any) is still alive here and is only released after the copy has returned.
    const RtError status = RuntimeMemcpy(dst, destMax, src, cnt, kind);
    return status;
}
} // namespace
/**
 * \brief Entry point of the machine-trace dump: does nothing unless the environment variable
 *        DUMP_DEVICE_PERF equals "true" and at least one perf buffer is supplied.
 * \param args      device arguments; archInfo selects the frequency forwarded to the dump
 * \param perfData  per-slot device perf buffer addresses
 * \param isLast    forwarded unchanged to DumpAicpuPerfInfo
 */
void DumpDevTaskPerfData(DeviceArgs& args, const std::vector<void*>& perfData, bool isLast)
{
    const bool dumpRequested = (GetEnvVar("DUMP_DEVICE_PERF") == "true");
    if (!dumpRequested || perfData.empty()) {
        return;
    }
    uint64_t freq = FREQ_DAV_3510;
    if (args.archInfo == ArchInfo::DAV_2201) {
        freq = FREQ_DAV_2201;
    }
    DumpAicpuPerfInfo(args, perfData, freq, isLast);
}
/**
 * \brief Collects the non-zero set/wait sync-event timestamps of one task into a JSON array sorted by time.
 *
 * The timestamps are read as 64-bit values located at perfDataPtr + taskStat.setEventAddr and
 * perfDataPtr + taskStat.waitEventAddr; setEventNum / waitEventNum give the number of slots to scan.
 * A slot holding 0 is skipped.
 *
 * \param taskStat     task record providing the event offsets and counts
 * \param perfDataPtr  base address the event offsets are relative to
 * \return array of {"idx", "type", "time"} objects ordered by ascending "time"; empty when perfDataPtr is null
 */
json BuildSyncEventsJson(const TaskStat& taskStat, const uint8_t* perfDataPtr)
{
    json syncEventsArr = json::array();
    if (perfDataPtr == nullptr) {
        return syncEventsArr;
    }
    // NOTE(review): the offsets and counts are not validated against the size of the buffer behind
    // perfDataPtr, which is not known here - the caller must pass a consistent record.
    const uint64_t* setEventBase = reinterpret_cast<const uint64_t*>(perfDataPtr + taskStat.setEventAddr);
    const uint64_t* waitEventBase = reinterpret_cast<const uint64_t*>(perfDataPtr + taskStat.waitEventAddr);

    for (int k = 0; k < taskStat.setEventNum; ++k) {
        const uint64_t timestamp = setEventBase[k];
        if (timestamp == 0) {
            continue;
        }
        json setEvent;
        setEvent["idx"] = k;
        setEvent["type"] = "CV_SYNC_SET";
        setEvent["time"] = timestamp;
        syncEventsArr.push_back(setEvent);
    }
    for (int k = 0; k < taskStat.waitEventNum; ++k) {
        const uint64_t timestamp = waitEventBase[k];
        if (timestamp == 0) {
            continue;
        }
        json waitEvent;
        waitEvent["idx"] = k;
        waitEvent["type"] = "CV_SYNC_WAIT";
        waitEvent["time"] = timestamp;
        syncEventsArr.push_back(waitEvent);
    }

    // Compare the timestamps at their full 64-bit width. Reading them back through an `int` would
    // narrow the values and could order events incorrectly.
    std::sort(syncEventsArr.begin(), syncEventsArr.end(), [](const json& a, const json& b) {
        return a.at("time").get<uint64_t>() < b.at("time").get<uint64_t>();
    });
    return syncEventsArr;
}
void ConstructTaskInfo(
const uint32_t& index, json& rootTaskStats, const std::vector<void*>& perfData, const std::string& coreType)
{
void* devPtr = perfData[index];
size_t dataSize = PERF_DATA_TOTAL_SIZE;
std::vector<uint8_t> hostBuffer(dataSize);
auto ret = NormalizedRtMemcpy(
hostBuffer.data(), dataSize, devPtr, dataSize, RtMemcpyKind::DEVICE_TO_HOST);
if (ret != 0) {
MACHINE_LOGW("task perf D2H copy failed ret: %d, index: %u", ret, index);
}
Metrics* aicpuMetric = reinterpret_cast<Metrics*>(hostBuffer.data());
if (aicpuMetric->taskCount > MAX_DFX_TASK_NUM_PER_CORE) {
aicpuMetric->taskCount = MAX_DFX_TASK_NUM_PER_CORE;
}
TaskStat* taskStats = aicpuMetric->tasks;
size_t numTasks = aicpuMetric->taskCount;
json coreObj;
coreObj["blockIdx"] = index;
coreObj["coreType"] = coreType;
if (aicpuMetric->coreType != -1) {
coreObj["coreType"] = aicpuMetric->coreType == static_cast<int16_t>(CoreType::AIC) ? "AIC" : "AIV";
}
json tasksArr = json::array();
for (size_t j = 0; j < numTasks; ++j) {
if (taskStats[j].execEnd != 0) {
json taskObj;
taskObj["seqNo"] = taskStats[j].seqNo;
taskObj["leafIndex"] = taskStats[j].subGraphId;
taskObj["taskId"] = taskStats[j].taskId;
taskObj["execStart"] = taskStats[j].execStart;
taskObj["execEnd"] = taskStats[j].execEnd;
json syncEventsArr = BuildSyncEventsJson(taskStats[j], hostBuffer.data());
if (!syncEventsArr.empty()) {
taskObj["syncEvents"] = syncEventsArr;
}
tasksArr.push_back(taskObj);
}
}
coreObj["tasks"] = tasksArr;
if (!tasksArr.empty()) {
rootTaskStats.push_back(coreObj);
}
aicpuMetric->taskCount = 0;
ret = NormalizedRtMemcpy(
perfData[index], sizeof(Metrics), aicpuMetric, sizeof(Metrics), RtMemcpyKind::HOST_TO_DEVICE);
if (ret != 0) {
MACHINE_LOGW("task perf H2D copy failed ret: %d, index: %u", ret, index);
}
}
/**
 * \brief Dumps the task execution statistics of every block plus the AI-CPU slot(s) to
 *        tilefwk_L1_prof_data.json and, when program.json and dyn_topo.txt are present in the log
 *        folder, runs draw_swim_lane.py on them.
 * \param args      device arguments (block count, core counts, architecture)
 * \param perfData  per-slot device perf buffer addresses
 */
void DumpAicoreTaskExectInfo(const DeviceArgs& args, const std::vector<void*>& perfData)
{
    json rootTaskStatus = json::array();

    // Slots [0, blockNum): blocks below nrValidAic default to "AIC", the remaining ones to "AIV".
    auto blockNum = args.GetBlockNum();
    MACHINE_LOGI("GetBlockNum : %lu", blockNum);
    for (uint32_t blockIdx = 0; blockIdx < blockNum; blockIdx++) {
        std::string defaultType;
        if (blockIdx < args.nrValidAic) {
            defaultType = "AIC";
        } else {
            defaultType = "AIV";
        }
        ConstructTaskInfo(blockIdx, rootTaskStatus, perfData, defaultType);
    }

    // The AI-CPU slot(s) start right after nrAic + nrAiv.
    const uint32_t firstAicpuSlot = args.nrAic + args.nrAiv;
    for (uint32_t slot = firstAicpuSlot; slot < firstAicpuSlot + AICPU_NUM_OF_RUN_AICPU_TASKS; slot++) {
        ConstructTaskInfo(slot, rootTaskStatus, perfData, "AI-CPU");
    }

    const std::string jsonFilePath = npu::tile_fwk::config::LogTopFolder() + "/tilefwk_L1_prof_data.json";
    const bool dumped = DumpFile(rootTaskStatus.dump(DUMP_LEVEL_FOUR), jsonFilePath);
    if (!dumped) {
        MACHINE_LOGW("Contrust custom op json failed");
        return;
    }
    MACHINE_LOGD("tilefwk_L1_prof_data have saved in: %s", jsonFilePath.c_str());

    const std::string topoTxtPath = npu::tile_fwk::config::LogTopFolder() + "/dyn_topo.txt";
    const std::string programJsonPath = npu::tile_fwk::config::LogTopFolder() + "/program.json";
    const std::string mixEventPath = npu::tile_fwk::config::GetAbsoluteTopFolder() + "/mix_event_info.json";
    const std::string drawScriptPath = GetCurrentSharedLibPath() + "/scripts/draw_swim_lane.py";
    npu::tile_fwk::config::SetRunDataOption(
        KEY_SWIM_GRAPH_PATH, npu::tile_fwk::config::GetAbsoluteTopFolder() + "/merged_swimlane.json");
    const uint64_t freq = (args.archInfo == ArchInfo::DAV_2201) ? FREQ_DAV_2201 : FREQ_DAV_3510;

    // Both inputs of the merge script must exist, otherwise the merge is skipped.
    if (!FileExist(programJsonPath) || !FileExist(topoTxtPath)) {
        MACHINE_LOGW("program.json or dyn_topo.txt missing. Stop merging the swimlane.");
        return;
    }
    MACHINE_LOGI("The files program.json and dyn_topo.txt exist. Start merging the swimlane.");

    std::string command = "python3 ";
    command += drawScriptPath;
    command += " \"";
    command += jsonFilePath;
    command += "\" \"";
    command += topoTxtPath;
    command += "\" \"";
    command += programJsonPath;
    command += "\" --label_type=1 --time_convert_denominator=";
    command += std::to_string(freq);
    command += " --mix_event_info=\"";
    command += mixEventPath;
    command += "\"";

    // The command line is vetted by Checkinject before it is handed to the shell.
    const int checkRet = Checkinject(command.c_str(), command.size());
    if (checkRet != 0) {
        MACHINE_LOGE(DevCommonErr::SYSTEM_CALL_FAILED, "Draw swimlane cmd illegal char.");
        return;
    }
    const int execRet = system(command.c_str());
    if (execRet != 0) {
        MACHINE_LOGW("Failed to execute draw_swim_lane.py. Stop merging the swimlane.");
    }
}
inline void DevTaskPerfFormat(
uint32_t tid, uint32_t type, json& devTaskJson, const MetricPerf* aicpuPer, const uint32_t& turnIdx)
{
json per_dev_task;
uint32_t idx = DEVTASK_PERF_ARRY_INDEX(type);
const DevTaskPerf& perfSlot = aicpuPer->devTaskPerfs[tid][idx];
for (uint32_t i = 0; i < perfSlot.cnt; i++) {
std::string name = PerfTraceName[type];
name = name + "_" + std::to_string(turnIdx);
if (type != PERF_TRACE_DEV_TASK_SEND_FIRST_LEAF_TASK) {
name = name + "(" + std::to_string(i) + ")";
}
per_dev_task["name"] = name;
per_dev_task["end"] = perfSlot.timeStamp[i];
devTaskJson.push_back(per_dev_task);
}
}
/**
 * \brief Marks the share of cores owned by partition `idx` when `total` cores are split into `part`
 *        near-equal contiguous shares (the first `total % part` shares get one extra core).
 *
 * Every core of the share is tagged with `idx + 1` at position `core + offset` of coreArray.
 * Positions that fall outside coreArray are skipped, so the vector is never resized or overrun.
 *
 * \param total      number of cores to distribute
 * \param idx        zero-based partition index
 * \param part       number of partitions; nothing is done when it is 0
 * \param offset     shift applied to the core index before writing into coreArray
 * \param coreArray  per-core owner table, updated in place
 */
inline void SparateCore(int total, int idx, int part, const int& offset, std::vector<int>& coreArray)
{
    if (part == 0) {
        return;
    }
    const int perCpu = total / part;
    const int remain = total % part;
    const int start = idx * perCpu + ((idx < remain) ? idx : remain);
    const int end = start + perCpu + ((idx < remain) ? 1 : 0);
    const int64_t limit = static_cast<int64_t>(coreArray.size());
    for (int i = start; i < end; i++) {
        // Widen before adding so that the position itself cannot overflow.
        const int64_t slot = static_cast<int64_t>(i) + offset;
        if (slot < 0 || slot >= limit) {
            continue;
        }
        coreArray[static_cast<size_t>(slot)] = idx + 1;
    }
}
/**
 * \brief Appends the AICore perf-trace points of rounds [g_last_round_num, turnNum) to tasksArr.
 *
 * Entries are emitted trace type by trace type, and within a type round by round. Scanning of one
 * (round, type) list stops at the first zero cycle value. Each entry is named
 * "<AicorePerfTraceName[type]>_<round>", followed by "(<devTaskId>)" when the recorded dev task id is
 * not INVALID_DEV_TASK_ID. After a (round, type) list has been scanned its counter in aicoreMetric is
 * set to 0.
 *
 * \param tasksArr      JSON array receiving the {"name", "end"} entries
 * \param aicoreMetric  host copy of the core's Metrics record; its perfTraceCnt entries are cleared
 * \param turnNum       exclusive upper bound of the rounds to report
 */
inline void ConstructAicorePerfInfo(json& tasksArr, Metrics* aicoreMetric, const uint32_t& turnNum)
{
    for (uint32_t type = 0; type < PERF_TRACE_CORE_MAX; type++) {
        for (uint32_t round = g_last_round_num; round < turnNum; round++) {
            for (uint32_t n = 0; n < aicoreMetric->perfTraceCnt[round][type]; n++) {
                const uint64_t cycle = aicoreMetric->perfTrace[round][type][n];
                if (cycle == 0) {
                    break;
                }
                std::string label = AicorePerfTraceName[type];
                label += "_";
                label += std::to_string(round);
                const auto devTaskId = aicoreMetric->perfTraceDevTaskId[round][type][n];
                if (devTaskId != INVALID_DEV_TASK_ID) {
                    label += "(";
                    label += std::to_string(devTaskId);
                    label += ")";
                }
                json tracePoint;
                tracePoint["name"] = label;
                tracePoint["end"] = cycle;
                tasksArr.push_back(tracePoint);
            }
            aicoreMetric->perfTraceCnt[round][type] = 0;
        }
    }
}
/**
 * \brief Appends one entry per AICore block with the perf-trace points of rounds
 *        [g_last_round_num, turnNum).
 *
 * For each block the perf buffer is copied from the device and reported as
 * {"blockIdx": i + 1, "coreType": "SCHED<scheCpuIdx>-<AIC|AIV>", "freq", "tasks"}.
 * A block without a device buffer, or whose device-to-host copy fails, is skipped with a warning
 * instead of being reported from an unfilled buffer.
 *
 * \param args            device arguments providing the block count
 * \param aicpuPrefArray  JSON array receiving the entries
 * \param perfData        per-block device perf buffer addresses
 * \param freq            frequency value stored in every entry
 * \param turnNum         exclusive upper bound of the rounds to report
 */
inline void DumpAicoreDevTask(
    DeviceArgs& args, json& aicpuPrefArray, const std::vector<void*>& perfData, const uint32_t& freq,
    const uint32_t& turnNum)
{
    const size_t dataSize = PERF_DATA_TOTAL_SIZE;
    // One staging buffer for all blocks: every successful copy overwrites it completely.
    std::vector<uint8_t> hostBuffer(dataSize);
    const auto blockNum = args.GetBlockNum();
    for (uint32_t i = 0; i < blockNum; i++) {
        if (i >= perfData.size()) {
            MACHINE_LOGW("aicore perf buffer list too short, block: %u", i);
            break;
        }
        void* devPtr = perfData[i];
        if (devPtr == nullptr) {
            MACHINE_LOGW("aicore perf buffer is null, block: %u", i);
            continue;
        }
        auto ret = NormalizedRtMemcpy(
            hostBuffer.data(), dataSize, devPtr, dataSize, RtMemcpyKind::DEVICE_TO_HOST);
        if (ret != 0) {
            MACHINE_LOGW("aicore perf D2H copy failed ret: %d, block: %u", ret, i);
            continue;
        }
        Metrics* aicoreMetric = reinterpret_cast<Metrics*>(hostBuffer.data());
        const std::string coreType =
            aicoreMetric->coreType == static_cast<int16_t>(CoreType::AIC) ? "AIC" : "AIV";
        json aicoreTask;
        aicoreTask["blockIdx"] = i + 1;
        aicoreTask["coreType"] = "SCHED" + std::to_string(aicoreMetric->scheCpuIdx) + "-" + coreType;
        aicoreTask["freq"] = freq;
        json tasksArr = json::array();
        ConstructAicorePerfInfo(tasksArr, aicoreMetric, turnNum);
        aicoreTask["tasks"] = std::move(tasksArr);
        aicpuPrefArray.push_back(std::move(aicoreTask));
    }
}
/**
 * \brief Copies the AICPU perf record of one round from the device into a freshly allocated host object.
 *
 * The record is read from args.aicpuPerfAddr + turnIdx * sizeof(MetricPerf). The returned object is
 * never null; it stays value-initialized when no perf area is configured, and a warning is logged
 * when the copy fails.
 *
 * \param args     device arguments providing the base address of the AICPU perf area
 * \param turnIdx  round index selecting the record
 * \return host copy of the record
 */
inline std::unique_ptr<MetricPerf> GetAicpuPrefAddr(const DeviceArgs& args, const uint32_t& turnIdx)
{
    auto aicpuMetric = std::make_unique<MetricPerf>();
    // Test the base address itself: once the per-round offset has been added, a missing (zero) base
    // would no longer look like a null pointer.
    if (args.aicpuPerfAddr == 0) {
        MACHINE_LOGW("Aicpu per ptr is null");
        return aicpuMetric;
    }
    auto aicpuPer = (ValueToPtr(args.aicpuPerfAddr + turnIdx * sizeof(MetricPerf)));
    if (aicpuPer == nullptr) {
        MACHINE_LOGW("Aicpu per ptr is null");
        return aicpuMetric;
    }
    auto ret = NormalizedRtMemcpy(
        PtrToPtr<MetricPerf, void>(aicpuMetric.get()), sizeof(MetricPerf), aicpuPer, sizeof(MetricPerf),
        RtMemcpyKind::DEVICE_TO_HOST);
    if (ret != 0) {
        MACHINE_LOGW("aicpu meter copy failed ret: %d", ret);
    }
    return aicpuMetric;
}
inline void DumpAicpuDevTask(
const DeviceArgs& args, json& aicpuPrefArray, const uint32_t& freq, const uint32_t& turnNum)
{
for (uint32_t i = 0; i < args.nrAicpu - 1; i++) {
json aicpu;
std::string coreType = "AICPU";
if (i == 0) {
coreType = "AICPU-CTRL";
} else if (i <= args.scheCpuNum) {
coreType = "AICPU-SCHED";
}
aicpu["blockIdx"] = i;
aicpu["coreType"] = coreType;
aicpu["freq"] = freq;
json aicpuDevTasks = json::array();
for (uint32_t turnIdx = g_last_round_num; turnIdx < turnNum; turnIdx++) {
auto aicpuMetric = GetAicpuPrefAddr(args, turnIdx);
for (uint32_t type = 0; type < PERF_TRACE_MAX; type++) {
if (IsDevTaskType(type)) {
DevTaskPerfFormat(i, type, aicpuDevTasks, aicpuMetric.get(), turnIdx);
continue;
}
if (aicpuMetric->perfAicpuTrace[i][type] == 0) {
continue;
}
json schCtrAicpu;
std::string name = PerfTraceName[type];
schCtrAicpu["name"] = name + "_" + std::to_string(turnIdx);
schCtrAicpu["end"] = aicpuMetric->perfAicpuTrace[i][type];
aicpuDevTasks.push_back(schCtrAicpu);
}
}
aicpu["tasks"] = aicpuDevTasks;
aicpuPrefArray.push_back(aicpu);
}
}
/**
 * \brief Dumps the AICPU and AICore perf traces of the rounds not dumped yet and converts them with
 *        machine_perf_trace.py.
 *
 * The round count is read from the perf buffer of slot 0 and capped at MAX_ROUND_NUM. Nothing is done
 * when it equals the round count of the previous dump, or - unless isLast is set - when it is not a
 * non-zero multiple of 50. Output files are suffixed with the first round they cover (the value of
 * g_last_round_num on entry). That same file name is registered under KEY_AICPU_PERF_GRAPH_PATH.
 *
 * \param args      device arguments
 * \param perfData  per-slot device perf buffer addresses; slot 0 provides the round count
 * \param freq      frequency value stored in the dump
 * \param isLast    true to dump regardless of the 50-round interval
 */
void DumpAicpuPerfInfo(DeviceArgs& args, const std::vector<void*>& perfData, uint32_t freq, bool isLast)
{
    // Intermediate dumps are produced every this many rounds.
    constexpr int dumpIntervalRounds = 50;

    if (perfData.empty() || perfData[0] == nullptr) {
        MACHINE_LOGW("aicpu perf header buffer unavailable");
        return;
    }
    void* devPtr = perfData[0];
    const size_t dataSize = PERF_DATA_TOTAL_SIZE;
    std::vector<uint8_t> hostBuffer(dataSize);
    auto memcpyRet = NormalizedRtMemcpy(
        hostBuffer.data(), dataSize, devPtr, dataSize, RtMemcpyKind::DEVICE_TO_HOST);
    if (memcpyRet != 0) {
        MACHINE_LOGW("aicpu perf header D2H copy failed ret: %d", memcpyRet);
        // Without the header the round count is unknown, so there is nothing meaningful to dump.
        return;
    }
    Metrics* aicoreMetric = reinterpret_cast<Metrics*>(hostBuffer.data());
    auto sumRoundNum = (aicoreMetric->turnNum > MAX_ROUND_NUM) ? MAX_ROUND_NUM : aicoreMetric->turnNum;
    MACHINE_LOGD("CoreId 0 devAddr: %p, sumRoundNum: %ld", devPtr, sumRoundNum);
    if (sumRoundNum == g_last_round_num) {
        return;
    }
    if ((sumRoundNum < dumpIntervalRounds || sumRoundNum % dumpIntervalRounds != 0) && !isLast) {
        return;
    }

    json aicpuPrefArray = json::array();
    DumpAicpuDevTask(args, aicpuPrefArray, freq, sumRoundNum);
    DumpAicoreDevTask(args, aicpuPrefArray, perfData, freq, sumRoundNum);

    // All files of this dump carry the first round they cover; fix the suffix before the counter moves.
    const std::string roundTag = std::to_string(g_last_round_num);
    const std::string traceFileName = "/machine_runtime_operator_trace_" + roundTag + ".json";

    std::string aicpuPerfilePath = config::LogTopFolder() + "/machine_trace_perf_data_" + roundTag + ".json";
    if (!DumpFile(aicpuPrefArray.dump(DUMP_LEVEL_FOUR), aicpuPerfilePath)) {
        MACHINE_LOGW("Contrust custom op json failed");
        return;
    }

    std::string scriptPath = GetCurrentSharedLibPath() + "/scripts/machine_perf_trace.py";
    std::string cmd = "python3 " + scriptPath + " gen_perfetto " + aicpuPerfilePath + " " +
                      npu::tile_fwk::config::LogTopFolder() + traceFileName + " " +
                      npu::tile_fwk::config::LogTopFolder() + "/merged_swimlane.json";
    // The command line is vetted by Checkinject before it is handed to the shell.
    int ret = Checkinject(cmd.c_str(), cmd.size());
    if (ret != 0) {
        MACHINE_LOGE(DevCommonErr::SYSTEM_CALL_FAILED, "Draw swimlane cmd illegal char.");
        return;
    }
    if (system(cmd.c_str()) != 0) {
        MACHINE_LOGW("Failed to execute machine_perf_trace.py, cannot get aicpu perfetto.json.");
    }

    g_last_round_num = sumRoundNum;
    // Register the file the script was asked to write, not one named after the updated round counter.
    npu::tile_fwk::config::SetRunDataOption(
        KEY_AICPU_PERF_GRAPH_PATH, npu::tile_fwk::config::GetAbsoluteTopFolder() + traceFileName);
}
}