* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file device_perf.cpp
* \brief
*/
#include "machine/runtime/runner/device_perf.h"
#include "tilefwk/pypto_fwk_log.h"
#include "adapter/api/runtime_api.h"
#include "interface/configs/config_manager.h"
#include "machine/runtime/runner/dump_device_perf.h"
#include "machine/runtime/runner/runtime_utils.h"
#include "machine/runtime/memory_utils/memory_pool.h"
#include "machine/device/dynamic/device_common.h"
namespace npu::tile_fwk {
namespace {
constexpr uint32_t AICPU_NUM_OF_RUN_AICPU_TASKS = 1;
}
DevicePerf::DevicePerf() : args_() {}
DevicePerf::~DevicePerf()
{
MACHINE_LOGD("Start to cleanup perfData");
StopMachinePerfTraceDumpThread();
ReleasePerfData();
}
void DevicePerf::InitAndStartDumpThread(const DeviceArgs &args)
{
args_ = args;
InitPerfData();
StartMachinePerfTraceDumpThread();
}
uint32_t DevicePerf::GetPerfDataSize() const
{
return args_.nrAic + args_.nrAiv + AICPU_NUM_OF_RUN_AICPU_TASKS;
}
void DevicePerf::InitPerfData()
{
for (uint32_t i = 0; i < GetPerfDataSize(); i++) {
perfData_.push_back(DevMallocWithAlignSize(PERF_DATA_TOTAL_SIZE, TWO_MB_HUGE_PAGE_FLAGS));
}
}
void DevicePerf::ResetPerData() const
{
for (uint32_t i = 0; i < GetPerfDataSize(); i++) {
int rc = RuntimeMemset(perfData_[i], PERF_DATA_TOTAL_SIZE, 0, PERF_DATA_TOTAL_SIZE);
if (rc != 0) {
MACHINE_LOGW("CoreId %u, rtMemSet failed, rc: %d", i, rc);
}
}
}
void DevicePerf::ReleasePerfData()
{
for (size_t i = 0; i < perfData_.size(); i++) {
if (perfData_[i] != nullptr) {
RuntimeFree(perfData_[i]);
perfData_[i] = nullptr;
}
}
perfData_.clear();
}
void DevicePerf::ResetMetrics(const uint32_t coreId)
{
if (perfData_.empty() || perfData_.size() <= static_cast<size_t>(coreId)) {
return;
}
if (args_.aicpuPerfAddr != 0) {
if (!isPerfDataInited_) {
RuntimeMemset(perfData_[coreId], sizeof(Metrics), 0, sizeof(Metrics));
isPerfDataInited_ = true;
}
} else {
RuntimeMemset(perfData_[coreId], sizeof(Metrics), 0, sizeof(Metrics));
}
}
void DevicePerf::SyncProfData(const bool forceSync)
{
if (config::GetDebugOption<int64_t>(CFG_RUNTIME_DBEUG_MODE) == CFG_DEBUG_ALL || forceSync) {
args_.nrValidAic = GetCfgBlockdim();
args_.scheCpuNum = dynamic::CalcSchAicpuNumByBlockDim(args_.nrValidAic, args_.nrAicpu, args_.archInfo);
dynamic::DumpAicoreTaskExectInfo(args_, perfData_);
}
}
void DevicePerf::SetDebugEnable()
{
for (uint32_t i = 0; i < GetPerfDataSize(); i++) {
ResetMetrics(i);
RuntimeMemcpy(
(reinterpret_cast<uint8_t*>(args_.sharedBuffer + sizeof(uint64_t) * SHAK_BUF_DFX_DATA_INDEX)) +
i * SHARED_BUFFER_SIZE,
sizeof(uint64_t), reinterpret_cast<uint8_t*>(&perfData_[i]), sizeof(uint64_t),
RtMemcpyKind::HOST_TO_DEVICE);
}
MACHINE_LOGD("Set debug enable aicore 0 devPtr: %p", perfData_[0]);
}
bool DevicePerf::RunPrepare() const
{
bool ret = true;
if (config::GetDebugOption<int64_t>(CFG_RUNTIME_DBEUG_MODE) == CFG_DEBUG_ALL || ENABLE_PERF_TRACE || PMU_COLLECT) {
for (uint32_t i = 0; i < GetPerfDataSize(); i++) {
ret = RuntimeMemcpy(
(reinterpret_cast<uint8_t*>(args_.sharedBuffer + sizeof(uint64_t) * SHAK_BUF_DFX_DATA_INDEX)) +
i * SHARED_BUFFER_SIZE,
sizeof(uint64_t), reinterpret_cast<const uint8_t*>(&perfData_[i]), sizeof(uint64_t),
RtMemcpyKind::HOST_TO_DEVICE) == RT_SUCCESS;
}
}
return ret;
}
void DevicePerf::StartMachinePerfTraceDumpThread()
{
if (args_.aicpuPerfAddr == 0) {
return;
}
if (dumpThread_.joinable()) {
return;
}
dumpThreadStopFlag_.store(false);
dumpThread_ = std::thread(&DevicePerf::MachinePerfTraceDumpThread, this);
MACHINE_LOGI("Dump thread started");
}
void DevicePerf::StopMachinePerfTraceDumpThread()
{
if (!dumpThread_.joinable()) {
return;
}
dumpThreadStopFlag_.store(true);
if (dumpThread_.joinable()) {
dumpThread_.join();
}
MACHINE_LOGD("Dump thread stopped");
if (args_.aicpuPerfAddr != 0) {
void* ptr = npu::tile_fwk::dynamic::ValueToPtr(args_.aicpuPerfAddr);
if (ptr != nullptr) {
RuntimeFree(ptr);
args_.aicpuPerfAddr = 0;
}
}
}
void DevicePerf::MachinePerfTraceDumpThread()
{
MACHINE_LOGD("Dump thread start to machine perf trace data");
int32_t deviceId = static_cast<int32_t>(args_.deviceId);
if (RuntimeSetDevice(deviceId) != 0) {
MACHINE_LOGW("Dump perf thread set Device[%d] not success", deviceId);
}
while (!dumpThreadStopFlag_.load()) {
usleep(10000);
dynamic::DumpDevTaskPerfData(args_, perfData_, false);
}
MACHINE_LOGD("Dump thread final dump");
dynamic::DumpDevTaskPerfData(args_, perfData_, true);
}
}