// Snapshot of commit 77049bde, created 2025-12-02 (residue from the repository web view's commit-history header).
/******************************************************************************
 * Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
 * libkperf licensed under the Mulan PSL v2.
 * You can use this software according to the terms and conditions of the Mulan PSL v2.
 * You may obtain a copy of Mulan PSL v2 at:
 *     http://license.coscl.org.cn/MulanPSL2
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR
 * PURPOSE.
 * See the Mulan PSL v2 for more details.
 * Author: Mr.Gan
 * Create: 2024-04-03
 * Description: implementations for reading performance counters and initializing counting logic
 * of PerfCounterDefault in the KUNPENG_PMU namespace.
 ******************************************************************************/
#include <climits>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <memory>
#include <poll.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/perf_event.h>
#include <linux/version.h>
#include "pmu.h"
#include "linked_list.h"
#include "pfm_event.h"
#include "pmu_event.h"
#include "pcerr.h"
#include "log.h"
#include "perf_counter_default.h"
#include "read_reg.h"
#include "common.h"

using namespace std;
using namespace pcerr;


/**
 * In-memory view of the buffer that ReadGroupEvents() fills by calling read() on a
 * group leader's file descriptor.
 *
 * The field order is intended to match what the kernel writes for the read_format
 * configured in MapPerfAttr() for grouped events
 * (PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID | PERF_FORMAT_GROUP).
 * Do not reorder or resize the members.
 */
struct GroupReadFormat {
    __u64 nr;           // number of entries that follow in values[]
    __u64 timeEnabled;  // shared by every event of the group
    __u64 timeRunning;  // shared by every event of the group
    // One entry per event of the group. Declared as a flexible array member, so the
    // storage has to be provided by whoever allocates the buffer.
    struct {
        __u64 value;    // raw counter value
        __u64 id;       // event id; not consumed anywhere in this file
    } values[];
};

/**
 * Read this counter and append the result(s) to eventData.data.
 *
 * Dispatches on the grouping role of the event:
 *  - stand-alone events are read individually,
 *  - a group leader reads the counters of the whole group in one go,
 *  - any other role (group member) reads nothing, because the leader
 *    already reports the member's value.
 *
 * @param eventData  container whose `data` vector receives the new samples
 * @return SUCCESS, UNKNOWN_ERROR when the event has no valid fd (the accumulated
 *         counts are dropped in that case), or the error of the underlying read.
 */
int KUNPENG_PMU::PerfCounterDefault::Read(EventData &eventData)
{
    const bool fdInvalid = this->fd < 0;
    if (__glibc_unlikely(fdInvalid)) {
        this->accumCount.clear();
        return UNKNOWN_ERROR;
    }

    switch (groupStatus) {
        case GroupStatus::NO_GROUP:
            return ReadSingleEvent(eventData.data);
        case GroupStatus::GROUP_LEADER:
            return ReadGroupEvents(eventData.data);
        default:
            // Group members do not read counters themselves;
            // the group leader reads them all.
            return SUCCESS;
    }
}

namespace KUNPENG_PMU {
/**
 * Read a counter from user space through the perf mmap control page instead of
 * issuing a read() on the event fd.
 *
 * @param countMmap       mapping created for the event; it is dereferenced without a
 *                        null check, only countMmap->base is validated
 * @param perfCountValue  receives value, timeEnabled and timeRunning
 * @return SUCCESS on success,
 *         LIBPERF_ERR_COUNT_MMAP_IS_NULL when the page pointer is null,
 *         LIBPERF_ERR_ENABLE_USER_ACCESS_FAILED when the page does not advertise cap_user_rdpmc,
 *         LIBPERF_ERR_ALLOCATE_REGISTER_FAILED when the page reports index 0
 *         (or cap_user_rdpmc is no longer set) while sampling.
 *
 * NOTE(review): when LINUX_VERSION_CODE at build time is older than 5.9 the whole body
 * is compiled out and the function returns SUCCESS without writing perfCountValue.
 */
static int PerfMmapReadSelf(const std::shared_ptr<PerfMmap> &countMmap, struct ReadFormat &perfCountValue)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 9, 0)
    uint32_t seq;
    uint32_t idx;
    uint32_t timeMult = 0;
    uint32_t timeShift = 0;
    uint64_t cnt = 0;
    uint64_t cyc = 0;
    uint64_t timeOffset = 0;
    uint64_t timeCycles = 0;
    uint64_t timeMask = ~0ULL;
    auto pc = countMmap->base;
    if (!pc) {
        return LIBPERF_ERR_COUNT_MMAP_IS_NULL;
    }
    if (!pc->cap_user_rdpmc) {
        return LIBPERF_ERR_ENABLE_USER_ACCESS_FAILED;
    }

    // Retry loop: take a snapshot of pc->lock, read every field, then compare pc->lock
    // again at the bottom. If it changed in between, the fields may be inconsistent
    // and everything is read again.
    do {
        seq = ReadOnce(&pc->lock);
        Barrier();

        perfCountValue.timeEnabled = ReadOnce(&pc->time_enabled);
        perfCountValue.timeRunning = ReadOnce(&pc->time_running);

        // The time conversion parameters are only needed when enabled and running time
        // differ; they are consumed after the loop to adjust both times.
        if (pc->cap_user_rdpmc && perfCountValue.timeEnabled != perfCountValue.timeRunning) {
            cyc = ReadTimestamp();
            timeMult = ReadOnce(&pc->time_mult);
            timeShift = ReadOnce(&pc->time_shift);
            timeOffset = ReadOnce(&pc->time_offset);

            if (pc->cap_user_time_short) {
                timeCycles = ReadOnce(&pc->time_cycles);
                timeMask = ReadOnce(&pc->time_mask);
            }
        }

        idx = ReadOnce(&pc->index);
        cnt = ReadOnce(&pc->offset);
        if (pc->cap_user_rdpmc && idx) {
            // read the reg mapped by the countMmap->base->idx
            // (the page's index is 1-based here: idx - 1 is passed to the register reader)
            int64_t eventCount = ReadPerfCounter(idx - 1);
            uint16_t width = ReadOnce(&pc->pmc_width);
            // Shift up and back down on a signed value so that only the low `width` bits
            // of the raw register value are kept (sign-extended from bit width-1 on
            // compilers that shift signed values arithmetically).
            eventCount <<= 64 - width;
            eventCount >>= 64 - width;
            cnt += eventCount;
        } else {
            // No usable register index: give up instead of retrying.
            return LIBPERF_ERR_ALLOCATE_REGISTER_FAILED;
        }
        Barrier();
    } while (ReadOnce(&pc->lock) != seq);

    if (perfCountValue.timeEnabled != perfCountValue.timeRunning) {
        // Convert the timestamp sampled inside the loop into a time delta using the
        // mult/shift/offset parameters from the page, and add it to the reported times.
        // With the default timeCycles = 0 and timeMask = ~0 the first line leaves cyc unchanged.
        uint64_t delta;
        cyc = timeCycles + ((cyc - timeCycles) & timeMask);
        delta = timeOffset + MulU64U32Shr(cyc, timeMult, timeShift);
        perfCountValue.timeEnabled += delta;
        if (idx) {
            perfCountValue.timeRunning += delta;
        }
    }
    perfCountValue.value = cnt;
#endif
    return SUCCESS;
}
}  // namespace KUNPENG_PMU

/**
 * Read one stand-alone (non-grouped) counter and append the resulting sample to `data`.
 *
 * With user access enabled the value is taken from the mmap control page via
 * PerfMmapReadSelf(); otherwise it is obtained with read() on the event fd.
 *
 * @param data  output vector; one PmuData element is appended on success
 * @return SUCCESS, the error reported by PerfMmapReadSelf()/CountValueToData(),
 *         or UNKNOWN_ERROR when read() fails (the errno text is recorded via New()).
 */
int KUNPENG_PMU::PerfCounterDefault::ReadSingleEvent(std::vector<PmuData> &data)
{
    // Create the running total before anything indexes it. Enable() clears accumCount,
    // so the "counting disabled" branch below could otherwise index an empty vector
    // when no successful read has happened since the last Enable().
    if (accumCount.empty()) {
        accumCount.assign(1, 0);
    }

    // Value-initialised so that no indeterminate bytes are consumed when the source
    // fills less than the whole structure.
    ReadFormat perfCountValue{};
    if (this->evt->enableUserAccess) {
        if (!this->isCollect) {
            // To keep consistency with read(fd) while the counting is disabled,
            // we should return the value we last read as we can't access the register now.
            CountValueToData(this->accumCount[0], this->enabled, this->running, this->accumCount[0], data);
            return SUCCESS;
        }
        int err = PerfMmapReadSelf(this->countMmap, perfCountValue);
        if (err != SUCCESS) {
            return err;
        }
    } else {
        int len = read(this->fd, &perfCountValue, sizeof(perfCountValue));
        if (len < 0) {
            New(UNKNOWN_ERROR, strerror(errno));
            return UNKNOWN_ERROR;
        }
    }

    int err = CountValueToData(
        perfCountValue.value, perfCountValue.timeEnabled, perfCountValue.timeRunning, accumCount[0], data);
    if (err != SUCCESS) {
        return err;
    }

    // Remember the times of this read; the next read reports deltas against them.
    this->enabled = perfCountValue.timeEnabled;
    this->running = perfCountValue.timeRunning;
    return SUCCESS;
}

int KUNPENG_PMU::PerfCounterDefault::ReadGroupEvents(std::vector<PmuData> &data)
{
    // Fixme:
    // In current class, we do not know how many events in group.
    // Then we read for max struct size: nr+timeEnabled+timeRunning+ MAX_GROUP_EVENTS*(value+id)
    static const unsigned MAX_GROUP_EVENTS = 14;
    unsigned readSize = sizeof(__u64)*3 + sizeof(__u64)*2*MAX_GROUP_EVENTS;
    GroupReadFormat *perfCountValue = static_cast<GroupReadFormat*>(malloc(readSize));
    if (perfCountValue == NULL) {
        return COMMON_ERR_NOMEM;
    }
    int len = read(this->fd, perfCountValue, readSize);
    if (len < 0) {
        free(perfCountValue);
        New(UNKNOWN_ERROR, strerror(errno));
        return UNKNOWN_ERROR;
    }

    if (accumCount.empty()) {
        accumCount.assign(perfCountValue->nr, 0);
    }

    for (int i = 0;i < accumCount.size(); ++i) {
        auto err = CountValueToData(perfCountValue->values[i].value,
                                    perfCountValue->timeEnabled,
                                    perfCountValue->timeRunning,
                                    accumCount[i],
                                    data
                                    );
        if (err != SUCCESS) {
            free(perfCountValue);
            return err;
        }
    }

    this->enabled = perfCountValue->timeEnabled;
    this->running = perfCountValue->timeRunning;
    free(perfCountValue);
    return SUCCESS;
}

int KUNPENG_PMU::PerfCounterDefault::CountValueToData(const __u64 value, const __u64 timeEnabled,
                                                const __u64 timeRunning, __u64 &accumCount, vector<PmuData> &data)
{
    if (value < accumCount || timeEnabled < enabled || timeRunning < running) {
        return LIBPERF_ERR_COUNT_OVERFLOW;
    }

    // Calculate the diff of count from last read.
    // In case of multiplexing, we follow the linux documentation for calculating the estimated
    // counting value (https://perf.wiki.kernel.org/index.php/Tutorial)
    double percent = 0.0;
    uint64_t increCount;
    if (this->evt->enableUserAccess) {
        percent = 1;
        increCount = static_cast<uint64_t>(value - accumCount);
    } else if ((value == accumCount) || (timeRunning == running)) {
        percent = -1;
        increCount = 0;
    } else {
        percent = static_cast<double>(timeEnabled - enabled) / static_cast<double>(timeRunning - running);
        increCount = static_cast<uint64_t>((value - accumCount)* percent);
    }
    accumCount = value;

    data.emplace_back(PmuData{0});
    auto& current = data.back();
    current.count = increCount;
    current.countPercent = 1.0 / percent;
    current.cpu = this->cpu;
    current.tid = this->pid;
    auto findProc = procMap.find(current.tid);
    if (findProc != procMap.end()) {
        current.pid = findProc->second->pid;
    }
    if(this->evt->cgroupName.size() != 0) {
        current.cgroupName = this->evt->cgroupName.c_str();
    }
    return SUCCESS;
}

/**
 * Initialize counting: open the perf event and, for user-access events,
 * additionally map the event's control page.
 *
 * @param groupEnable    whether the event takes part in an event group
 * @param groupFd        fd of the group leader, or -1
 * @param resetOutputFd  not used by this implementation
 * @return SUCCESS or the error of the failing step
 */
int KUNPENG_PMU::PerfCounterDefault::Init(const bool groupEnable, const int groupFd, const int resetOutputFd)
{
    if (!this->evt->enableUserAccess) {
        return this->MapPerfAttr(groupEnable, groupFd);
    }

    // user access: open the event first, map it only if that worked
    const int openStatus = this->MapPerfAttrUserAccess();
    return (openStatus == SUCCESS) ? this->Mmap() : openStatus;
}

/**
 * Build the perf_event_attr for a regular (read()-based) counting event, open it and
 * record the resulting fd and grouping role on this object.
 *
 * @param groupEnable  true when the event belongs to an event group
 * @param groupFd      fd of the group leader, or -1 when this event is the leader
 *                     (or not grouped at all)
 * @return SUCCESS, or the library error mapped from the errno of the failed open
 */
int KUNPENG_PMU::PerfCounterDefault::MapPerfAttr(const bool groupEnable, const int groupFd)
{
    /**
     * For now, we only implemented the logic for CORE type events. Support for UNCORE PMU events will be
     * added soon
     */
    struct perf_event_attr attr;
    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(struct perf_event_attr);
    attr.type = this->evt->type;
    attr.config = this->evt->config;
    attr.config1 = this->evt->config1;
    attr.config2 = this->evt->config2;

    /**
     * We want to set the disabled and inherit bit to collect child processes
     */
    attr.disabled = 1;
    attr.inherit = 1;

    attr.exclude_kernel = this->evt->excludeKernel;
    attr.exclude_user = this->evt->excludeUser;

    if (this->evt->enableOnExec) {
        attr.enable_on_exec = 1;
    }

    /**
     * if no permission try setting exclude_kernel=1.
     */
    if (this->needTryExcludeKernel) {
        attr.exclude_kernel = 1;
    }

    // support cgroup feature: in cgroup mode the cgroup fd is passed in place of the pid
    unsigned flags = 0;
    int pid = this->pid;
    if (this->GetCgroupFd() != -1) {
        flags = PERF_FLAG_PID_CGROUP | PERF_FLAG_FD_CLOEXEC;
        pid = this->GetCgroupFd();
    }

    /**
     * The id is requested together with both times; grouped events additionally
     * request the group layout so the leader can read every member at once.
     */
    attr.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID;
    if (groupEnable) {
        /*
        * when creating an event group, typically the group leader is initialized with disabled bit set to 1,
        * and any child events are initialized with disabled bit set to 0. Despite disabled bit being set to 0,
        * the child events will not start counting until the group leader is enabled.
        */
        if (groupFd != -1) {
            attr.disabled = 0;
            groupStatus = GroupStatus::GROUP_MEMBER;
        } else {
            groupStatus = GroupStatus::GROUP_LEADER;
        }
        attr.read_format |= PERF_FORMAT_GROUP;
    } else {
        groupStatus = GroupStatus::NO_GROUP;
    }

    this->fd = PerfEventOpen(&attr, pid, this->cpu, groupFd, flags);
    // Capture errno right away: the debug print below may call into library functions
    // that overwrite it before it gets mapped to a library error code.
    const int openErrno = errno;
    this->groupFd = groupFd;
    DBG_PRINT("type: %d cpu: %d config: %llx config1: %llx config2: %llx myfd: %d groupfd: %d\n",
        attr.type, cpu, attr.config, attr.config1, attr.config2, this->fd, groupFd);
    if (__glibc_unlikely(this->fd < 0)) {
        return MapErrno(openErrno);
    }
    return SUCCESS;
}

/**
 * Open the perf event used for user-space (mmap page) counter access.
 *
 * Only type, config and config1 are taken from the event description; the event is
 * created disabled, without a group leader and without extra flags.
 *
 * @return SUCCESS (groupFd is then set to -1), or the library error mapped from the
 *         errno of the failed open
 */
int KUNPENG_PMU::PerfCounterDefault::MapPerfAttrUserAccess()
{
    struct perf_event_attr attr;
    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(struct perf_event_attr);
    attr.type = this->evt->type;
    attr.config = this->evt->config;
    attr.config1 = this->evt->config1;
    attr.disabled = 1;
    this->fd = PerfEventOpen(&attr, this->pid, this->cpu, -1, 0);
    // Capture errno right away: the debug print below may call into library functions
    // that overwrite it before it gets mapped to a library error code.
    const int openErrno = errno;
    DBG_PRINT("type: %d cpu: %d config: %llx config1: %llx myfd: %d \n",
        attr.type,
        this->cpu,
        attr.config,
        attr.config1,
        this->fd);
    if (__glibc_unlikely(this->fd < 0)) {
        return MapErrno(openErrno);
    }
    this->groupFd = -1;
    return SUCCESS;
}

/**
 * Map the control page of the already opened event so the counter can be read
 * from user space.
 *
 * On failure the event fd is closed and invalidated, and countMmap->base is left null.
 *
 * @return SUCCESS or LIBPERF_ERR_FAIL_MMAP
 */
int KUNPENG_PMU::PerfCounterDefault::Mmap()
{
    this->countMmap = std::make_shared<PerfMmap>();
    this->countMmap->prev = 0;
    this->countMmap->mask = -1;
    void *currentMap =
        mmap(NULL, COUNT_PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, this->fd, 0);
    if (__glibc_unlikely(currentMap == MAP_FAILED)) {
        this->countMmap->base = nullptr;
        close(this->fd);
        // Forget the descriptor number: it may be reused by an unrelated file, and a
        // later Close() must not close it a second time.
        this->fd = -1;
        return LIBPERF_ERR_FAIL_MMAP;
    }
    this->countMmap->base = static_cast<struct perf_event_mmap_page *>(currentMap);
    this->countMmap->fd = this->fd;
    return SUCCESS;
}

/**
 * Start counting and reset the bookkeeping used to compute per-read increments.
 *
 * @return SUCCESS, the error of PerfEvt::Enable(), or LIBPERF_ERR_COUNTER_INDEX_IS_ZERO
 *         when a user-access event reports index 0 in its mmap page after being enabled.
 */
int KUNPENG_PMU::PerfCounterDefault::Enable()
{
    // Only the group leader issues the ioctl to enable, disable or reset; otherwise
    // the events of one group would be collected for different durations.
    const bool isGroupMember = (groupFd != -1);
    if (isGroupMember) {
        return SUCCESS;
    }

    const int status = PerfEvt::Enable();
    if (status != SUCCESS) {
        return status;
    }

    if (this->evt->enableUserAccess) {
        if (this->countMmap->base->index == 0) {
            return LIBPERF_ERR_COUNTER_INDEX_IS_ZERO;
        }
    }

    // Fresh collection period: drop totals and times of the previous one.
    this->running = 0;
    this->enabled = 0;
    this->accumCount.clear();
    this->isCollect = true;
    return SUCCESS;
}

/**
 * Stop counting. Events that have a group leader do nothing and report SUCCESS.
 *
 * @return SUCCESS or the error of PerfEvt::Disable(); the collecting flag is only
 *         cleared when disabling succeeded.
 */
int KUNPENG_PMU::PerfCounterDefault::Disable()
{
    if (groupFd == -1) {
        const int status = PerfEvt::Disable();
        if (status == SUCCESS) {
            this->isCollect = false;
        }
        return status;
    }
    return SUCCESS;
}

/**
 * Reset the counter; forwards to the base class implementation unchanged.
 *
 * @return the result of PerfEvt::Reset()
 */
int KUNPENG_PMU::PerfCounterDefault::Reset()
{
    const int status = PerfEvt::Reset();
    return status;
}

/**
 * Release the resources of this event: unmap the control page (if one was mapped)
 * and close the event fd.
 *
 * The released handles are invalidated afterwards, so calling Close() again neither
 * unmaps nor closes anything a second time.
 *
 * @return always SUCCESS
 */
int KUNPENG_PMU::PerfCounterDefault::Close()
{
    if (this->countMmap && this->countMmap->base && this->countMmap->base != MAP_FAILED) {
        munmap(this->countMmap->base, COUNT_PAGE_SIZE);
        // The address range may be handed out to another mapping from now on.
        this->countMmap->base = nullptr;
    }
    if (this->fd > 0) {
        close(this->fd);
        // The descriptor number may be reused by an unrelated file from now on.
        this->fd = -1;
    }
    return SUCCESS;
}