/**
 * Copyright (c) 2026 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
/*!
 * \file perf_event_sampler.h
 * \brief Linux perf_event_open based PMU sampler for AICPU performance analysis.
 *        Uses PERF_TYPE_HARDWARE standard events for better compatibility.
 */
#pragma once
#include <cstdint>
#include <sys/types.h>
#include "machine/utils/device_switch.h"
#include "machine/utils/device_log.h"
// Capacity of the fixed-size event table held by PerfEventGroup (events_[MAX_PERF_EVENT_NUM]).
#define MAX_PERF_EVENT_NUM 8
namespace npu::tile_fwk {
/**
 * \brief Derived figures for a single cache level.
 *
 * A default-constructed value has valid == false and missRate == 0.0.
 * NOTE(review): presumably `valid` tells whether `missRate` carries a usable value — confirm against
 * BuildPerfCacheMetrics.
 */
struct PerfCacheMetrics {
    bool valid = false;     // starts out false
    double missRate = 0.0;  // starts out 0.0; unit (ratio vs. percent) is decided by the producer
};
/**
 * \brief Bundle of metrics derived from raw counter values.
 *
 * Every numeric field starts at 0.0 and both cache entries start in their default state.
 * NOTE(review): `ipc` / `cpi` presumably mean instructions-per-cycle and cycles-per-instruction —
 * confirm against BuildPerfDerivedMetrics.
 */
struct PerfDerivedMetrics {
    double ipc = 0.0;
    double cpi = 0.0;
    double branchMissRate = 0.0;
    PerfCacheMetrics l1dCache;  // figures for the L1D_* counters
    PerfCacheMetrics llCache;   // figures for the LL_* counters
};
/**
 * \brief Index constants for the sampled events, numbered consecutively from 0.
 *
 * PERF_EVENT_COUNT is the number of real indices that precede it (8).
 * NOTE(review): presumably these index the `counts` arrays passed to BuildPerfDerivedMetrics — confirm.
 */
enum PerfEventIdx {
    IDX_CPU_CYCLES = 0,
    IDX_INSTRUCTIONS = 1,
    IDX_BRANCH_INST = 2,
    IDX_BRANCH_MISS = 3,
    IDX_L1D_CACHE_REFS = 4,
    IDX_L1D_CACHE_MISSES = 5,
    IDX_LL_CACHE_REFS = 6,
    IDX_LL_CACHE_MISSES = 7,
    PERF_EVENT_COUNT = 8  // sentinel: one past the last index
};
/**
 * \brief Ratio helper taking two raw 64-bit counter values and yielding a double.
 * NOTE(review): defined outside this header; presumably returns dividend / divisor — confirm how a
 * zero divisor is handled.
 */
double DividePerfCounter(uint64_t dividend, uint64_t divisor);
/**
 * \brief Percentage helper taking two raw 64-bit counter values and yielding a double.
 * NOTE(review): defined outside this header; presumably returns 100 * dividend / divisor — confirm the
 * scale and how a zero divisor is handled.
 */
double PercentPerfCounter(uint64_t dividend, uint64_t divisor);
/**
 * \brief Builds a PerfCacheMetrics value from a reference count and a miss count.
 * NOTE(review): defined outside this header; the rule that sets `valid` is not visible here — confirm.
 */
PerfCacheMetrics BuildPerfCacheMetrics(uint64_t refs, uint64_t misses);
/**
 * \brief Builds a PerfDerivedMetrics value from an array of raw counter values.
 * NOTE(review): defined outside this header; `counts` is presumably indexed by PerfEventIdx and must
 * hold at least PERF_EVENT_COUNT entries — confirm against the implementation.
 */
PerfDerivedMetrics BuildPerfDerivedMetrics(const uint64_t* counts);
#if __PYPTO_AICPU_PMU_EVENT_ENABLE
/**
 * \brief One slot of a PerfEventGroup's event table.
 *
 * A default-constructed record has type 0, config 0, fd -1, a null name and valid_ == false.
 * NOTE(review): `type_` / `config_` presumably carry the perf_event_attr type and config values —
 * confirm against PerfEventGroup::AddEvent.
 */
struct PerfEventRecord {
    int type_ = 0;
    uint64_t config_ = 0;
    int fd_ = -1;                 // -1 until a descriptor is assigned
    const char* name_ = nullptr;  // not owned by the record
    bool valid_ = false;
};
/**
 * \brief Set of perf events associated with one thread id.
 *
 * Holds up to MAX_PERF_EVENT_NUM PerfEventRecord slots plus a group file descriptor. The type is
 * non-copyable. All member functions are defined outside this header.
 * NOTE(review): per the file description this wraps perf_event_open; descriptor open/close behaviour
 * lives in the implementation — confirm there.
 */
class PerfEventGroup {
public:
// \param tid thread id for this group (presumably stored in tid_ — confirm).
explicit PerfEventGroup(pid_t tid);
~PerfEventGroup();
// Copying is disabled; no move operations are declared either.
PerfEventGroup(const PerfEventGroup&) = delete;
PerfEventGroup& operator=(const PerfEventGroup&) = delete;
// NOTE(review): presumably accessors for nrEvent_ and validEventCount_ — confirm.
int GetNrEvent() const;
int GetValidEventCount() const;
// NOTE(review): meaning of the int return value (index vs. status code) is not visible here — confirm.
int AddEvent(int type, uint64_t config, const char* name);
bool Enable();
void Disable();
// NOTE(review): required length of `counts` is not visible here; presumably one entry per added
// event — confirm before passing a shorter buffer.
int Read(uint64_t* counts);
private:
pid_t tid_;
int nrEvent_{0};
int validEventCount_{0};
// -1 until a group descriptor is assigned.
int groupFd_{-1};
PerfEventRecord events_[MAX_PERF_EVENT_NUM];
};
/**
 * \brief PMU sampler used when __PYPTO_AICPU_PMU_EVENT_ENABLE is set.
 *
 * Public surface is Begin() / End() / Dump(); the AICPU_PMU_* macros in this header call them in that
 * order. All member functions are defined outside this header.
 */
class AicpuPerfEventSampler {
public:
AicpuPerfEventSampler();
void Begin();
void End();
void Dump();
private:
void TryAddEvent(int type, uint64_t config, const char* name);
void TryAddCacheEvent(uint64_t cacheId, uint64_t opId, uint64_t resultId, const char* name);
void DumpSummary(const char* title);
void DumpReport(const uint64_t* counts);
void DumpElapsedCycles();
void DumpSectionHeader(const char* title);
void DumpRawCounters(const uint64_t* counts);
// NOTE(review): the four helpers below have the same signatures as the namespace-level
// BuildPerfDerivedMetrics / BuildPerfCacheMetrics / DividePerfCounter / PercentPerfCounter —
// presumably thin wrappers or leftovers; confirm whether both sets are needed.
PerfDerivedMetrics BuildDerivedMetrics(const uint64_t* counts);
PerfCacheMetrics BuildCacheMetrics(uint64_t refs, uint64_t misses);
void DumpDerivedMetrics(const PerfDerivedMetrics& metrics);
void DumpCacheDerivedMetric(const char* name, const PerfCacheMetrics& metrics);
static double Divide(uint64_t dividend, uint64_t divisor);
static double Percent(uint64_t dividend, uint64_t divisor);
uint64_t cycles{0};
// PerfEventGroup has no default constructor, so the constructor of this class must initialise
// this member explicitly with a thread id.
PerfEventGroup events;
bool pmuAvailable{true};
bool pmuEnabled{false};
};
/**
 * \brief Returns the calling thread's AicpuPerfEventSampler, constructing it on first use.
 *
 * The function is `inline` with external linkage on purpose. With `static inline` every translation
 * unit that includes this header gets its own copy of the function and therefore its own
 * thread_local sampler (each with its own PerfEventGroup member), so one thread could end up with
 * several samplers. With external linkage all translation units share a single sampler per thread.
 *
 * \return Reference to the per-thread sampler; valid for the lifetime of the calling thread.
 */
inline AicpuPerfEventSampler& GetAicpuPerfEventSampler()
{
    static thread_local AicpuPerfEventSampler sampler;
    return sampler;
}
/**
 * \brief Scope guard that ties a sampling section to the lifetime of a local object.
 *
 * Constructor and destructor are defined outside this header.
 * NOTE(review): presumably the constructor calls Begin() and the destructor calls End()/Dump() on the
 * referenced sampler — confirm against the implementation.
 *
 * Copying is disabled, matching PerfEventGroup: a copy would run the destructor's work a second time
 * on the same sampler.
 */
class AicpuPerfScopedSampler {
public:
    // \param sectionName label for the section; the pointer is kept, not copied, so it must outlive
    //        the guard (string literals satisfy this).
    explicit AicpuPerfScopedSampler(const char* sectionName);
    ~AicpuPerfScopedSampler();
    AicpuPerfScopedSampler(const AicpuPerfScopedSampler&) = delete;
    AicpuPerfScopedSampler& operator=(const AicpuPerfScopedSampler&) = delete;

private:
    const char* sectionName_{"unnamed"};
    AicpuPerfEventSampler& sampler_;  // not owned
};
/**
 * Token-pasting helpers for AICPU_PMU_SCOPE.
 * The extra level of indirection makes the preprocessor expand __LINE__ to its numeric value before
 * `##` is applied. Pasting `prefix##__LINE__` directly produces the literal identifier
 * `prefix__LINE__`, which is identical at every use site.
 */
#define AICPU_PMU_CONCAT_IMPL(lhs, rhs) lhs##rhs
#define AICPU_PMU_CONCAT(lhs, rhs) AICPU_PMU_CONCAT_IMPL(lhs, rhs)
/**
 * Declares an AicpuPerfScopedSampler local variable covering the rest of the enclosing scope.
 * The variable name embeds the current line number, so several scopes can be opened in the same
 * block (on different lines) without a redefinition error.
 * section_name_literal: section name forwarded to the AicpuPerfScopedSampler constructor.
 */
#define AICPU_PMU_SCOPE(section_name_literal) \
    ::npu::tile_fwk::AicpuPerfScopedSampler AICPU_PMU_CONCAT(aicpuPerfScopedSampler_, __LINE__)(section_name_literal)
// Binds a reference named `sampler_name` to the calling thread's sampler and calls Begin() on it.
// Expands to a declaration followed by a call (not wrapped in do/while), so it must be used where a
// declaration is allowed; the name stays in scope for the matching AICPU_PMU_END(sampler_name, ...).
#define AICPU_PMU_BEGIN(sampler_name) \
auto& sampler_name = ::npu::tile_fwk::GetAicpuPerfEventSampler(); \
(sampler_name).Begin()
// Closes a section opened with AICPU_PMU_BEGIN: calls End(), logs "[AICPU_PMU] <section name>" and
// then calls Dump() on the sampler.
// NOTE(review): the section banner goes through DEV_ERROR with MachineError::UNKNOWN although it is
// not an error — presumably to make sure it is always emitted; confirm this is intended.
#define AICPU_PMU_END(sampler_name, section_name_literal) \
do { \
(sampler_name).End(); \
DEV_ERROR(MachineError::UNKNOWN, "[AICPU_PMU] %s", section_name_literal); \
(sampler_name).Dump(); \
} while (0)
// Calls Begin() through a caller-supplied sampler pointer. The pointer is dereferenced without a
// null check.
#define AICPU_PMU_BEGIN_EXTERNAL(sampler_ptr) \
do { (sampler_ptr)->Begin(); } while (0)
// Pointer flavour of AICPU_PMU_END: calls End(), logs "[AICPU_PMU] <section name>" and then calls
// Dump() through a caller-supplied sampler pointer. The pointer is dereferenced without a null check.
// NOTE(review): the banner is logged via DEV_ERROR with MachineError::UNKNOWN — confirm the log level
// is intended.
#define AICPU_PMU_END_EXTERNAL(sampler_ptr, section_name_literal) \
do { \
(sampler_ptr)->End(); \
DEV_ERROR(MachineError::UNKNOWN, "[AICPU_PMU] %s", section_name_literal); \
(sampler_ptr)->Dump(); \
} while (0)
#else
/**
 * \brief Stand-in sampler used when PMU sampling is compiled out.
 *
 * Offers the same Begin() / End() / Dump() calls as the real sampler; each one does nothing.
 */
class AicpuPerfEventSampler {
public:
    // No-op.
    void Begin()
    {
    }

    // No-op.
    void End()
    {
    }

    // No-op.
    void Dump()
    {
    }
};
/**
 * \brief Returns the calling thread's (no-op) AicpuPerfEventSampler.
 *
 * The function is `inline` with external linkage rather than `static inline`: with internal linkage
 * every translation unit that includes this header gets its own copy of the function and its own
 * thread_local object. External linkage keeps a single object per thread for the whole program.
 *
 * \return Reference to the per-thread sampler; valid for the lifetime of the calling thread.
 */
inline AicpuPerfEventSampler& GetAicpuPerfEventSampler()
{
    static thread_local AicpuPerfEventSampler sampler;
    return sampler;
}
/**
 * \brief Stand-in scope guard used when PMU sampling is compiled out.
 *
 * Construction does nothing. Copying is disabled so that code which copies a scope guard fails to
 * compile whether or not PMU sampling is enabled, instead of only surfacing in one configuration.
 */
class AicpuPerfScopedSampler {
public:
    // \param sectionName accepted for signature compatibility and ignored.
    explicit AicpuPerfScopedSampler(const char* sectionName)
    {
        (void)sectionName;
    }
    AicpuPerfScopedSampler(const AicpuPerfScopedSampler&) = delete;
    AicpuPerfScopedSampler& operator=(const AicpuPerfScopedSampler&) = delete;
};
// PMU sampling compiled out: every AICPU_PMU_* macro expands to nothing, so call sites keep compiling
// and their arguments are discarded without being evaluated.
#define AICPU_PMU_SCOPE(section_name_literal)
#define AICPU_PMU_BEGIN(sampler_name)
#define AICPU_PMU_END(sampler_name, section_name_literal)
#define AICPU_PMU_BEGIN_EXTERNAL(sampler_ptr)
#define AICPU_PMU_END_EXTERNAL(sampler_ptr, section_name_literal)
#endif
}