/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */
#ifndef QUEUE_SCHEDULE_STATISTIC_MANAGER_H
#define QUEUE_SCHEDULE_STATISTIC_MANAGER_H
#include <atomic>
#include <thread>
#include <mutex>
#include <condition_variable>
#include "common/type_def.h"
#include "qs_proc_mem_statistic.h"
namespace bqs {
/**
 * Plain bundle of 64-bit counters and completion-gap tick aggregates.
 *
 * Every field is zero in a default-constructed instance. The code that updates
 * the fields is not part of this header; the group headings below simply follow
 * the field names.
 */
struct EntityStatisticInfo {
    EntityStatisticInfo() = default;

    // hcclImprobe counters
    uint64_t hcclImprobeTotalTimes{0UL};
    uint64_t hcclImprobeSuccTimes{0UL};
    uint64_t hcclImprobeFailTimes{0UL};

    // mbuf allocation / hcclImrecv counters
    uint64_t allocMbufTimes{0UL};
    uint64_t hcclImrecvSuccTimes{0UL};
    uint64_t hcclImrecvFailTimes{0UL};

    // uncompReqQueue push / pop counters
    uint64_t uncompReqQueuePushTimes{0UL};
    uint64_t uncompReqQueuePopTimes{0UL};

    // hcclIsend / hcclTestSome counters
    uint64_t hcclIsendSuccTimes{0UL};
    uint64_t hcclIsendFullTimes{0UL};
    uint64_t hcclIsendFailTimes{0UL};
    uint64_t hcclTestSomeSuccTimes{0UL};

    // mbuf release and queue enqueue / dequeue counters
    uint64_t freeMbufTimes{0UL};
    uint64_t hcclEnqueueSuccTimes{0UL};
    uint64_t hcclEnqueueFailTimes{0UL};
    uint64_t dequeueSuccTimes{0UL};
    uint64_t dequeueFailTimes{0UL};
    uint64_t dequeueEmptyTimes{0UL};

    // completion-gap aggregates, "Head" variant
    // NOTE(review): the minimum starts at 0 like every other field - confirm the
    // updating code treats 0 as "no sample yet".
    uint64_t maxCompletionGapTickForHead{0UL};
    uint64_t minCompletionGapTickForHead{0UL};
    uint64_t totalCompletionGapTickForHead{0UL};
    uint64_t totalCompletionCountForHead{0UL};

    // completion-gap aggregates, "Body" variant
    uint64_t maxCompletionGapTickForBody{0UL};
    uint64_t minCompletionGapTickForBody{0UL};
    uint64_t totalCompletionGapTickForBody{0UL};
    uint64_t totalCompletionCountForBody{0UL};

    uint64_t enqueueSuccTimes{0UL};
};
/**
 * Set of atomic 64-bit counters.
 *
 * Construction zeroes every counter by delegating to Reset(). The code that
 * increments the counters is not part of this header; the group headings below
 * simply follow the field names.
 */
struct StatisticInfo {
    // event scheduling / wake-up
    std::atomic<uint64_t> eventScheduleTimes;
    std::atomic<uint64_t> enqueueFalseAwakenTimes;
    std::atomic<uint64_t> daemonEventScheduleTimes;
    std::atomic<uint64_t> awakenTimes;
    std::atomic<uint64_t> dataDequeueTimes;
    std::atomic<uint64_t> scheduleEmptyTimes;
    std::atomic<uint64_t> dataScheduleFailedTimes;

    // relation / asyn-mem / f2nf queue traffic
    std::atomic<uint64_t> relationEnqueueTimes;
    std::atomic<uint64_t> relationDequeueTimes;
    std::atomic<uint64_t> asynMemEnqueueTimes;
    std::atomic<uint64_t> asynMemDequeueTimes;
    std::atomic<uint64_t> f2nfEnqueueTimes;
    std::atomic<uint64_t> f2nfDequeueTimes;

    // bind-relation requests and responses
    std::atomic<uint64_t> bindTimes;
    std::atomic<uint64_t> unbindTimes;
    std::atomic<uint64_t> getBindTimes;
    std::atomic<uint64_t> getAllBindTimes;
    std::atomic<uint64_t> responseTimes;

    // data-queue enqueue results
    std::atomic<uint64_t> dataEnqueueSuccTimes;
    std::atomic<uint64_t> dataEnqueueFailTimes;
    std::atomic<uint64_t> dataEnqueueFullTimes;

    // hccl mpi receive-request events
    std::atomic<uint64_t> hcclMpiRecvRequestEventTimes;
    std::atomic<uint64_t> hcclMpiRecvReqFalseAwakenTimes;
    std::atomic<uint64_t> hcclMpiRecvReqEmptySchedTimes;
    std::atomic<uint64_t> hcclMpiRecvReqCallbackTimes;

    // hccl mpi send-completion events
    std::atomic<uint64_t> hcclMpiSendCompEventTimes;
    std::atomic<uint64_t> hcclMpiSendCompFalseAwakenTimes;
    std::atomic<uint64_t> hcclMpiSendCompEmptySchedTimes;
    std::atomic<uint64_t> hcclMpiSendCompCallbackTimes;

    // hccl mpi receive-completion events
    std::atomic<uint64_t> hcclMpiRecvCompEventTimes;
    std::atomic<uint64_t> hcclMpiRecvCompFalseAwakenTimes;
    std::atomic<uint64_t> hcclMpiRecvCompEmptySchedTimes;
    std::atomic<uint64_t> hcclMpiRecvCompCallbackTimes;

    // f2nf events
    std::atomic<uint64_t> f2nfEventTimes;
    std::atomic<uint64_t> f2nfFalseAwakenTimes;

    // hccl mpi receive / send results
    std::atomic<uint64_t> hcclMpiRecvSuccTimes;
    std::atomic<uint64_t> hcclMpiRecvFailTimes;
    std::atomic<uint64_t> hcclMpiSendSuccTimes;
    std::atomic<uint64_t> hcclMpiSendFailTimes;
    std::atomic<uint64_t> hcclMpiSendFullTimes;
    std::atomic<uint64_t> hcclMpiF2nfEventTimes;

    // mbuf alloc / free accounting
    std::atomic<uint64_t> mbufAllocSize;
    std::atomic<uint64_t> mbufAllocTimes;
    std::atomic<uint64_t> mbufFreeSize;
    std::atomic<uint64_t> mbufFreeTimes;

    std::atomic<uint64_t> supplyRecvReqEventTimes;

    /** Builds the object with every counter at zero. */
    StatisticInfo() { Reset(); }

    /**
     * Writes zero into every counter, one atomic store per field, in
     * declaration order. The stores are individually atomic; the reset as a
     * whole is not a single atomic step.
     */
    void Reset()
    {
        eventScheduleTimes = 0UL;
        enqueueFalseAwakenTimes = 0UL;
        daemonEventScheduleTimes = 0UL;
        awakenTimes = 0UL;
        dataDequeueTimes = 0UL;
        scheduleEmptyTimes = 0UL;
        dataScheduleFailedTimes = 0UL;

        relationEnqueueTimes = 0UL;
        relationDequeueTimes = 0UL;
        asynMemEnqueueTimes = 0UL;
        asynMemDequeueTimes = 0UL;
        f2nfEnqueueTimes = 0UL;
        f2nfDequeueTimes = 0UL;

        bindTimes = 0UL;
        unbindTimes = 0UL;
        getBindTimes = 0UL;
        getAllBindTimes = 0UL;
        responseTimes = 0UL;

        dataEnqueueSuccTimes = 0UL;
        dataEnqueueFailTimes = 0UL;
        dataEnqueueFullTimes = 0UL;

        hcclMpiRecvRequestEventTimes = 0UL;
        hcclMpiRecvReqFalseAwakenTimes = 0UL;
        hcclMpiRecvReqEmptySchedTimes = 0UL;
        hcclMpiRecvReqCallbackTimes = 0UL;

        hcclMpiSendCompEventTimes = 0UL;
        hcclMpiSendCompFalseAwakenTimes = 0UL;
        hcclMpiSendCompEmptySchedTimes = 0UL;
        hcclMpiSendCompCallbackTimes = 0UL;

        hcclMpiRecvCompEventTimes = 0UL;
        hcclMpiRecvCompFalseAwakenTimes = 0UL;
        hcclMpiRecvCompEmptySchedTimes = 0UL;
        hcclMpiRecvCompCallbackTimes = 0UL;

        f2nfEventTimes = 0UL;
        f2nfFalseAwakenTimes = 0UL;

        hcclMpiRecvSuccTimes = 0UL;
        hcclMpiRecvFailTimes = 0UL;
        hcclMpiSendSuccTimes = 0UL;
        hcclMpiSendFailTimes = 0UL;
        hcclMpiSendFullTimes = 0UL;
        hcclMpiF2nfEventTimes = 0UL;

        mbufAllocSize = 0UL;
        mbufAllocTimes = 0UL;
        mbufFreeSize = 0UL;
        mbufFreeTimes = 0UL;

        supplyRecvReqEventTimes = 0UL;
    }
};
/**
 * Holder for process-cost and schedule-delay figures.
 *
 * Only the constructor is defined here, and it delegates to Reset(). Every
 * other member function is defined outside this header, so the exact update
 * and locking rules must be read from the implementation.
 */
class ScheduleStatistic {
public:
    /** Puts the object into whatever state Reset() establishes. */
    ScheduleStatistic() { Reset(); }

    ~ScheduleStatistic() = default;

    /** Re-initialises the recorded figures (defined out of line). */
    void Reset();

    /** Feeds one process-cost sample into the statistic. */
    void UpdateProcessCost(const float64_t cost);

    /** Feeds one schedule-delay sample into the statistic. */
    void UpdateScheduleDelay(const float64_t delay);

    // Accessors. They are non-const - presumably because they take mutex_,
    // which is not declared mutable; confirm in the implementation.
    float64_t GetMaxProcessCost();
    float64_t GetSecondProcessCost();
    float64_t GetMaxScheduleDelay();
    float64_t GetAvgProcessCost();

private:
    // None of the fields below has an in-class initializer; they rely on the
    // Reset() call made by the constructor.
    // The "Us" suffix suggests microseconds - TODO confirm the unit.
    float64_t maxProcessCostUs_;
    float64_t secondMaxProcessUs_;
    float64_t maxScheduleDelayUs_;
    std::mutex mutex_;  // presumably guards the figures in this class
    float64_t totalProcessCostUs_;
    uint64_t totalProcessCount_;
};
class StatisticManager {
public:
static StatisticManager &GetInstance();
~StatisticManager();
StatisticManager(const StatisticManager &) = delete;
StatisticManager &operator=(const StatisticManager &) = delete;
StatisticManager(StatisticManager &&) = delete;
StatisticManager &operator=(StatisticManager &&) = delete;
public:
* event schedule statistic.
* @param scheduleNum schedule num, default is 1.
*/
uint64_t EventScheduleStat(const uint32_t scheduleNum = 1U);
* enqueue event false awaken times
*/
void EnqueueEventFalseAwakenStat();
* get event schedule statistic.
*/
uint64_t GetEventScheduleStat() const;
* daemon event schedule statistic.
*/
void DaemonEventScheduleStat();
* awaken count add.
*/
void AwakenAdd();
* return awaken times.
*/
uint64_t GetAwakenTimes() const;
* data sched empty statistic.
*/
void AddScheduleEmpty();
* data schedule failed statistic.
* @param failedNum: data schedule failed, default is 1
*/
void DataScheduleFailedStat(const uint64_t failedNum = 1UL);
* relation queue enqueue statistic.
*/
void RelationEnqueueStat();
* relation queue dequeue statistic.
*/
void RelationDequeueStat();
* get relation queue enqueue statistic.
*/
uint64_t GetRelationEnqueCnt() const;
* get relation queue dequeue statistic.
*/
uint64_t GetRelationDequeCnt() const;
* asyn mem queue enqueue statistic.
*/
void AsynMemEnqueueStat();
* asyn mem queue dequeue statistic.
*/
void AsynMemDequeueStat();
* get asyn mem queue enqueue statistic.
*/
uint64_t GetAsynMemEnqueCnt() const;
* get asyn mem queue dequeue statistic.
*/
uint64_t GetAsynMemDequeCnt() const;
* full to not full queue enqueue statistic.
*/
void F2nfEnqueueStat();
* full to not full queue dequeue statistic.
*/
void F2nfDequeueStat();
* hccl mpi recv request event times statistic.
*/
uint64_t HcclMpiRecvRequestEventStat();
* hccl mpi recv request event false awaken times statistic.
*/
void HcclMpiRecvReqFalseAwakenStat();
* hccl mpi recv request event empty sched times statistic.
*/
void HcclMpiRecvReqEmptySchedStat();
* hccl mpi recv request event callback times statistic.
*/
void HcclMpiRecvReqCallbackStat();
* hccl mpi send completion event times statistic.
*/
uint64_t HcclMpiSendCompEventStat();
* hccl mpi send completion event false awaken times statistic.
*/
void HcclMpiSendCompFalseAwakenStat();
* hccl mpi send completion event empty sched times statistic.
*/
void HcclMpiSendCompEmptySchedStat();
* hccl mpi send completion event callback times statistic.
*/
void HcclMpiSendCompCallbackStat();
* hccl mpi recv completion event times statistic.
*/
uint64_t HcclMpiRecvCompEventStat();
* hccl mpi recv completion event false awaken times statistic.
*/
void HcclMpiRecvCompFalseAwakenStat();
* hccl mpi recv completion event empty sched times statistic.
*/
void HcclMpiRecvCompEmptySchedStat();
* hccl mpi recv completion event callback times statistic.
*/
void HcclMpiRecvCompCallbackStat();
* hccl mpi call hcclImrecv success times statistic.
*/
void HcclMpiRecvSuccStat();
* hccl mpi call hcclImrecv failed times statistic.
*/
void HcclMpiRecvFailStat();
* hccl mpi call hcclIsend success times statistic.
*/
void HcclMpiSendSuccStat();
* hccl mpi call hcclIsend failed times statistic.
*/
void HcclMpiSendFailStat();
* hccl mpi call hcclIsend full times statistic.
*/
void HcclMpiSendFullStat();
* hccl mpi congestion relief event times statistic
*/
void HcclMpiF2nfEventStat();
* mbuf alloc statistic
*/
void MbufAllocStat(const uint64_t size);
* mbuf free statistic
*/
void MbufFreeStat(const uint64_t size);
* recv request event supply statistic
*/
void RecvReqEventSupplyStat();
* f2nf event times statistic.
*/
void F2nfEventStat();
* f2nf event false awaken times statistic.
*/
void F2nfEventFalseAwakenStat();
* total dequeue times statistic.
*/
void DataDequeueStat();
void DataQueueEnqueueSuccStat();
void DataQueueEnqueueFailStat();
void DataQueueEnqueueFullStat();
* bind relation request statistic
*/
void BindStat();
* unbind relation request statistic
*/
void UnbindStat();
* get bind relation request statistic
*/
void GetBindStat();
* get all bind relation request statistic
*/
void GetAllBindStat();
* relation response statistic
*/
void ResponseStat();
* subscribe queue statistic
*/
void SubscribeNum(const uint32_t subscribeNum);
* pause subscribe queue statistic, pause then +1
*/
void PauseSubscribe();
* pause subscribe queue statistic, resume then -1
*/
void ResumeSubscribe();
* bind relation number statistic
*/
void BindNum(const uint32_t bindNum);
* abnormal bind relation number statistic
*/
void AbnormalBindNum(const uint32_t bindNum);
* start dump static thread.
*/
void StartStatisticManager(const uint32_t abnormalInterval, const uint32_t hostPid, const bool numaFlag = false,
const uint32_t deviceIdExtra = 0U, const uint32_t enqueGroupIdExtra = 0U);
* stop dump static thread.
*/
void StopStatisticManager();
* dump static thread function.
*/
void ThreadFunc();
* set exist entity flag
*/
void SetExistEntityFlag(const bool flag);
* Dump statistic info
*/
void DumpStatistic();
* Reset statistic info
*/
void ResetStatistic();
* Add unlink tag count
* @return unlink tag count
*/
const uint32_t AddUnlinkCount();
* Reduce unlink tag count
* @return unlink tag count
*/
const uint32_t ReduceUnlinkCount();
void RefreshEnqueHeartBeat();
void AddTagCount();
void ReduceTagCount();
void UpdateScheuleStatistic(const float64_t delay, const float64_t cost);
void DumpOutProcMemStatInfo();
void RecordProcMemInfo();
private:
StatisticManager() = default;
* dump channel statistic info
*/
void DumpChannelStatistic();
private:
std::mutex timerMutex_;
std::thread timerThread_;
volatile bool runFlag_ = false;
StatisticInfo totalStat_;
StatisticInfo periodStat_;
std::atomic<uint32_t> bindNum_ = {0U};
std::atomic<uint32_t> subscribeNum_ = {0U};
std::atomic<uint32_t> pauseSubscribeNum_ = {0U};
std::atomic<bool> existEntityFlag_ = {false};
std::atomic<uint32_t> unLinkTagNum_ = {0U};
std::atomic<uint32_t> abnormalBindNum_ = {0U};
std::atomic<uint32_t> enqueThreadHearBeat_ = {0U};
uint32_t abnormalInterval_;
std::atomic<uint32_t> totalTagNum_ = {0U};
ScheduleStatistic scheduleStatistic_{};
bool numaFlag_{false};
uint32_t deviceIdExtra_{0U};
uint32_t enqueGroupIdExtra_{0U};
QsProcMemStatistic procMemStat_;
uint32_t hostPid_{0U};
};
}
#endif