/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file device_perf.h
 * \brief Performance event recording helpers: PerfettoMgr, PerfEvtMgr and scoped timing utilities.
 */

#pragma once

#include <atomic>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <mutex>
#include <string>
#include <utility>
#include <vector>
#include "tilefwk/aicpu_common.h"

namespace npu::tile_fwk::dynamic {
/**
 * Singleton that stores timed event records per tid and dumps them to a text file.
 *
 * Records live in fixed-size Trunk arrays allocated with `new`; this class never releases them.
 * The bodies of PerfBegin/PerfEnd/PerfEvent are compiled only when CONFIG_PERFETTO is defined and non-zero.
 * mutex_ guards trunks_ only. The per-tid slots (evtStack, trunk_, dropped_) are accessed without locking.
 * NOTE(review): presumably each tid is driven by a single thread -- confirm against callers.
 */
struct PerfettoMgr {
    static constexpr int MAX_THEAD_NUM = 200;  // number of per-tid slots
    static constexpr int TRUNK_SIZE = 32768;   // records held by one Trunk
    static constexpr int MAX_EVT_DEPTH = 64;   // maximum nesting of open PerfBegin calls per tid

    /** One recorded event. Fields are zero-initialised so a never-closed event dumps end == 0. */
    struct Record {
        int type{0};
        int tid{0};
        uint64_t start{0};
        uint64_t end{0};
        uint32_t index{0};   // value taken from the shared evtIndex counter
        uint32_t pIndex{0};  // index of the enclosing open event, or uint32_t(-1) when there is none
        std::string name = "-";
    };

    /** Fixed-capacity array used as a stack / bump allocator. The methods do no bounds checking. */
    template <typename T, int N>
    struct Array {
        size_t used{0};
        T data[N];

        inline void Push(const T& t) { data[used++] = t; }
        inline void Pop() { used--; }
        inline bool Full() const { return used == static_cast<size_t>(N); }
        inline bool Empty() const { return used == 0; }
        inline T& Top() { return data[used - 1]; }
        inline T* Alloc() { return &data[used++]; }
    };

    using Trunk = Array<Record, TRUNK_SIZE>;
    using EvtStack = Array<Record*, MAX_EVT_DEPTH>;

    /**
     * Returns the next free Record of the current trunk for `tid`, allocating a new trunk when the
     * slot has none or the current one is full.
     * @return pointer to the record, or nullptr when `tid` is outside [0, MAX_THEAD_NUM).
     */
    Record* allocRecord(int tid)
    {
        if (!ValidTid(tid)) {
            return nullptr;
        }
        if (trunk_[tid] == nullptr || trunk_[tid]->Full()) {
            Trunk* fresh = new Trunk;
            {
                // RAII lock: the mutex is released even if push_back throws.
                std::lock_guard<std::mutex> guard(mutex_);
                trunks_.push_back(fresh);
            }
            trunk_[tid] = fresh;
        }
        return trunk_[tid]->Alloc();
    }

    /**
     * Opens an event of `type` on `tid`: stores the start cycle and pushes the record on the tid's stack.
     * Calls with an out-of-range tid are ignored. When the stack is already MAX_EVT_DEPTH deep the event
     * is dropped and counted, so that the matching PerfEnd is ignored as well.
     */
    void PerfBegin([[maybe_unused]] int type, [[maybe_unused]] int tid)
    {
#if defined(CONFIG_PERFETTO) && CONFIG_PERFETTO
        if (!ValidTid(tid)) {
            return;
        }
        auto& stack = evtStack[tid];
        if (stack.Full()) {
            ++dropped_[tid];
            return;
        }
        Record* r = allocRecord(tid);
        r->type = type;
        r->tid = tid;
        r->start = GetCycles();
        r->index = evtIndex++;
        r->pIndex = stack.Empty() ? NO_PARENT : stack.Top()->index;
        stack.Push(r);
#endif
    }

    /**
     * Closes the innermost open event on `tid` by storing its end cycle and popping it.
     * Ignored for an out-of-range tid, for an event that PerfBegin dropped, or when no event is open.
     */
    void PerfEnd([[maybe_unused]] int type, [[maybe_unused]] int tid)
    {
#if defined(CONFIG_PERFETTO) && CONFIG_PERFETTO
        if (!ValidTid(tid)) {
            return;
        }
        if (dropped_[tid] > 0) {
            --dropped_[tid];
            return;
        }
        auto& stack = evtStack[tid];
        if (stack.Empty()) {
            return;
        }
        stack.Top()->end = GetCycles();
        stack.Pop();
#endif
    }

    /**
     * Records a complete event with caller-supplied start/end values and a name.
     * The record is not pushed on the event stack; its parent is the currently open event, if any.
     * Ignored for an out-of-range tid.
     */
    void PerfEvent(
        [[maybe_unused]] int type, [[maybe_unused]] int tid, [[maybe_unused]] uint64_t start,
        [[maybe_unused]] uint64_t end, [[maybe_unused]] std::string name)
    {
#if defined(CONFIG_PERFETTO) && CONFIG_PERFETTO
        Record* r = allocRecord(tid);
        if (r == nullptr) {
            return;
        }
        auto& stack = evtStack[tid];
        r->type = type;
        r->tid = tid;
        r->name = std::move(name);
        r->start = start;
        r->end = end;
        r->index = evtIndex++;
        r->pIndex = stack.Empty() ? NO_PARENT : stack.Top()->index;
#endif
    }

    /** Returns the process-wide instance (function-local static). */
    static PerfettoMgr& Instance()
    {
        static PerfettoMgr recorder;
        return recorder;
    }

    /**
     * Writes every used record of every trunk to `file`, one record per line:
     * "<PerfEventName[type]> <name> <index> <start> <end> <tid>;<pIndex> ".
     * Nothing is written when the file cannot be opened.
     */
    void Dump(const std::string& file)
    {
        std::ofstream os(file);
        if (!os.is_open()) {
            return;
        }

        // Copy the trunk list under the lock so a concurrent allocRecord cannot reallocate it mid-iteration.
        std::vector<Trunk*> snapshot;
        {
            std::lock_guard<std::mutex> guard(mutex_);
            snapshot = trunks_;
        }

        for (const Trunk* trunk : snapshot) {
            for (size_t i = 0; i < trunk->used; i++) {
                const Record& r = trunk->data[i];
                os << PerfEventName[r.type] << " ";
                os << r.name << " ";
                os << r.index << " ";
                os << r.start << " ";
                os << r.end << " ";
                os << r.tid << ";";
                os << r.pIndex << " ";
                os << '\n';  // no per-record flush; the stream is flushed when it is closed
            }
        }
    }

private:
    PerfettoMgr() = default;

    /** True when `tid` is a valid index into the per-tid arrays. */
    static bool ValidTid(int tid) { return tid >= 0 && tid < MAX_THEAD_NUM; }

private:
    // pIndex value of an event recorded while no other event was open on the tid.
    static constexpr uint32_t NO_PARENT = static_cast<uint32_t>(-1);

    EvtStack evtStack[MAX_THEAD_NUM];         // open events per tid
    Trunk* trunk_[MAX_THEAD_NUM] = {nullptr}; // trunk currently being filled per tid
    uint32_t dropped_[MAX_THEAD_NUM] = {0};   // PerfBegin calls dropped because the stack was full

    std::mutex mutex_;             // guards trunks_
    std::vector<Trunk*> trunks_;   // every trunk ever allocated, in allocation order
    std::atomic<int32_t> evtIndex{0};
};
struct PerfEvtMgr {
    struct Counter {
        int64_t start;
        int64_t total;
        int64_t count;
    };

    bool GetIsOpenProf() { return isOpenProf_; }

    void SetIsOpenProf(bool isOpenProf, uint64_t aicpuPerf = 0)
    {
        if (ctrlTurn_ >= MAX_ROUND_NUM) {
            aicpuPerf_ = 0;
            isOpenProf_ = false;
            DEV_WARN("Aicpu perf info more than maxTurnNum=%u, some info would be lost", MAX_ROUND_NUM);
            return;
        }
        isOpenProf_ = isOpenProf;
        aicpuPerf_ = aicpuPerf;
    }

    void AddCtrlTurn() { ctrlTurn_++; }

    void AddScheduleTurn() { schTurn_++; }

    void PerfBegin(int type) { counters[type].start = static_cast<int64_t>(GetCycles()); }

    void PerfEnd(int type)
    {
        auto& c = counters[type];
        c.count++;
        c.total += static_cast<int64_t>(GetCycles() - c.start);
    }

    static PerfEvtMgr& Instance()
    {
        static PerfEvtMgr recorder;
        return recorder;
    }

    static void RepeatPuts(char c, size_t count)
    {
        char buf[80];
        for (size_t i = 0; i < count; i++) {
            buf[i] = c;
        }
        buf[count] = '\0';
        DEV_INFO("%s.", buf);
    }

    void Dump()
    {
        uint64_t freq = GetFreq();
        static constexpr size_t SHEET_WIDTH = 40 + 3 + 10 + 3 + 10 + 3 + 10;

        RepeatPuts('=', SHEET_WIDTH);
        DEV_INFO("%40s | %10s | %10s | %10s.", "EventType", "Count", "Total(us)", "Avg(us)");
        RepeatPuts('-', SHEET_WIDTH);

        for (int i = 0; i < PERF_EVT_MAX; i++) {
            auto evt = counters[i];
            if (evt.count != 0) {
                uint64_t total = evt.total * NSEC_PER_SEC / freq / NSEC_PER_USEC;
                float avg = static_cast<float>(total / evt.count);
                DEV_INFO("%-40s | %10ld | %10lu | %10.1f.", PerfEventName[i], evt.count, total, avg);
            }
        }

        RepeatPuts('=', SHEET_WIDTH);
    }

    void PerfTrace(uint32_t type, uint32_t tid, uint64_t cycle)
    {
        if (unlikely(tid >= MAX_USED_AICPU_NUM || aicpuPerf_ == 0)) {
            return;
        }

        MetricPerf* aicpuMetrics = (MetricPerf*)(aicpuPerf_ + (tid == 0 ? ctrlTurn_ : schTurn_) * sizeof(MetricPerf));

        if (IsDevTaskType(type)) {
            uint32_t idx = DEVTASK_PERF_ARRY_INDEX(type);
            DevTaskPerf& perfSlot = aicpuMetrics->devTaskPerfs[tid][idx];
            uint64_t& devCnt = perfSlot.cnt;
            if (unlikely(devCnt >= PERF_TRACE_COUNT_DEVTASK_MAX_NUM)) {
                DEV_EVENT(
                    "Dev task num larger than: %u, the excess part will not be recorded",
                    PERF_TRACE_COUNT_DEVTASK_MAX_NUM);
                return;
            }

            perfSlot.timeStamp[devCnt] = cycle == 0 ? static_cast<uint64_t>(GetCycles()) : cycle;
           devCnt++;
        } else {
            aicpuMetrics->perfAicpuTrace[tid][type] = cycle == 0 ? static_cast<uint64_t>(GetCycles()) : cycle;
        }
    }

private:
    PerfEvtMgr()
    {
#if ENABLE_PERF_EVT
        memset_s(counters, sizeof(counters), 0, sizeof(counters));
#endif
    };

private:
    Counter counters[PERF_EVT_MAX];
    bool isOpenProf_{false};
    uint64_t aicpuPerf_{0};
    uint32_t ctrlTurn_{0};
    uint32_t schTurn_{0};
};

/** Starts timing event `type` on the PerfEvtMgr singleton; a no-op unless ENABLE_PERF_EVT is non-zero. */
inline void PerfBegin([[maybe_unused]] int type)
{
#if ENABLE_PERF_EVT
    auto& mgr = PerfEvtMgr::Instance();
    mgr.PerfBegin(type);
#endif
}

/** Stops timing event `type` on the PerfEvtMgr singleton; a no-op unless ENABLE_PERF_EVT is non-zero. */
inline void PerfEnd([[maybe_unused]] int type)
{
#if ENABLE_PERF_EVT
    auto& mgr = PerfEvtMgr::Instance();
    mgr.PerfEnd(type);
#endif
}

/**
 * Starts timing on the PerfEvtMgr counter slot `type + tid`.
 * A no-op unless ENABLE_PERF_EVT is non-zero.
 */
inline void PerfMtBegin([[maybe_unused]] int type, [[maybe_unused]] int tid)
{
#if ENABLE_PERF_EVT
    const int slot = type + tid;
    PerfEvtMgr::Instance().PerfBegin(slot);
#endif
}

/**
 * Stops timing on the PerfEvtMgr counter slot `type + tid`.
 * A no-op unless ENABLE_PERF_EVT is non-zero.
 */
inline void PerfMtEnd([[maybe_unused]] int type, [[maybe_unused]] int tid)
{
#if ENABLE_PERF_EVT
    const int slot = type + tid;
    PerfEvtMgr::Instance().PerfEnd(slot);
#endif
}

/**
 * Forwards a complete event (caller-supplied start/end and name) to PerfettoMgr::PerfEvent,
 * but only when PerfEvtEnable[type] is set.
 * NOTE(review): `type` indexes PerfEvtEnable without a range check -- confirm callers pass valid types.
 */
inline void PerfMtEvent(int type, int tid, uint64_t start, uint64_t end, std::string name = "-")
{
    if (!PerfEvtEnable[type]) {
        return;
    }
    // `name` is already a private copy; move it on instead of copying it a second time.
    PerfettoMgr::Instance().PerfEvent(type, tid, start, end, std::move(name));
}

/** Passes (type, tid, cycle) straight to PerfEvtMgr::PerfTrace on the singleton; `cycle` defaults to 0. */
inline void PerfMtTrace(uint32_t type, uint32_t tid, uint64_t cycle = 0)
{
    auto& mgr = PerfEvtMgr::Instance();
    mgr.PerfTrace(type, tid, cycle);
}

/**
 * Scope guard that calls PerfBegin(type) on construction and PerfEnd(type_) on destruction.
 * Copying is disabled: a copy would call PerfEnd a second time for a single PerfBegin.
 */
struct AutoScopedPerf {
    explicit AutoScopedPerf(int type) : type_(type) { PerfBegin(type); }
    ~AutoScopedPerf() { PerfEnd(type_); }

    AutoScopedPerf(const AutoScopedPerf&) = delete;
    AutoScopedPerf& operator=(const AutoScopedPerf&) = delete;

    int type_;  // event type handed to PerfEnd when the guard goes out of scope
};
} // namespace npu::tile_fwk::dynamic