/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/*!
 * \file device_machine.h
 * \brief
 */

#pragma once

#include <signal.h>
#include <sys/ucontext.h>
#include "device_sche_context.h"
#include "device_common.h"
#include "aicore_manager.h"
#include "aicore_constants.h"
#include "machine/utils/machine_ws_intf.h"
#include "machine/utils/device_log.h"
#include "tilefwk/aicore_print.h"
#include "machine/device/dynamic/aicore_prof.h"
#include "device_trace.h"

constexpr uint32_t LAUNCH_AICPU_NUM = 5;
constexpr int MAX_RETRIES = 100;
//  same-cluster CPU ID bounds in Dav2201
//  (dev0,dev1) and (dev2,dev3) each form a dual-die pair
constexpr int CLUSTER_ID_LOW_BOUND_DIE0 = 4;
constexpr int CLUSTER_ID_HIGH_BOUND_DIE0 = 7;
constexpr int CLUSTER_ID_LOW_BOUND_DIE1 = 12;
constexpr int CLUSTER_ID_HIGH_BOUND_DIE1 = 15;

namespace npu::tile_fwk::dynamic {
struct AicoreLogManager {
    AicoreLogManager()
    {
        data_ = aligned_alloc(PAGE_SIZE, MAX_AICORE_NUM * PRINT_BUFFER_SIZE);
        uint8_t* buf = (uint8_t*)data_;
        for (uint32_t i = 0; i < MAX_AICORE_NUM; i++) {
            logger[i].Init(buf, PRINT_BUFFER_SIZE);
            buf += PRINT_BUFFER_SIZE;
        }
    }
    ~AicoreLogManager() { free(data_); }

    void* data_;
    AicoreLogger logger[MAX_AICORE_NUM];
};

typedef void (*sig_act_f)(int signum, siginfo_t* info, void* act);

class DeviceSchedMachine {
public:
    DeviceSchedMachine()
    {
        for (uint32_t i = 0; i < MAX_SCHEDULE_AICPU_NUM; ++i) {
            aicoreManager_[i] = std::make_unique<AiCoreManager>(schThreadStatus);
        }
    }

    void SetStachSchduleContext(int schedIdx, SchduleContext* context)
    {
        aicoreManager_[schedIdx]->SetSchduleContext(context);
    }

    bool CheckAndResetReg() { return aicoreManager_[0]->CheckAndResetReg(); }

    void init(uint32_t schNum)
    {
        schAicpuNum_ = schNum;
        schThreadStatus.Init();
    }

    int RunThread(int threadIdx, DevStartArgs* devStartArgs, DeviceArgs* args, int schedIdx)
    {
        int ret = 0;
        if (args->nrAic == 0 || args->nrValidAic == 0 || args->nrAicpu < args->scheCpuNum) {
            DEV_ERROR(
                DevCommonErr::PARAM_INVALID,
                "#sche.thread.init: Device machine run invalid args: aicNum=%u, blockdim=%u, launchAicpuNum=%u, "
                "launchScheAicpuNum=%u",
                args->nrAic, args->nrValidAic, args->nrAicpu, args->scheCpuNum);
            return DEVICE_MACHINE_ERROR;
        }

        if (static_cast<uint32_t>(schedIdx) >= args->scheCpuNum) {
            DEV_INFO("thread start ignore ");
            return DEVICE_MACHINE_OK;
        }
#if ENABLE_AICORE_PRINT
        aicoreManager_[schedIdx]->InitLogger(logManager.logger);
#endif
        ret = aicoreManager_[schedIdx]->RunManager(threadIdx, devStartArgs, args, schedIdx);
        DEV_INFO("threadIdx=%d end, ret=%d", threadIdx, ret);
        return ret;
    }

    void ResetRegAll()
    {
        sleep(1);
        DEV_INFO("ResetRegAll");
        for (uint32_t i = 0; i < schAicpuNum_; ++i) {
            aicoreManager_[i]->ResetRegAll();
        }
        sleep(1);
        aicoreManager_[0]->CheckAndResetReg();
        DEV_INFO("Exception reset reg finish.");
    }

private:
    SchThreadStatus schThreadStatus;
    uint32_t schAicpuNum_{MAX_SCHEDULE_AICPU_NUM};
    std::unique_ptr<AiCoreManager> aicoreManager_[MAX_SCHEDULE_AICPU_NUM];
#if ENABLE_AICORE_PRINT
    AicoreLogManager logManager;
#endif
};

constexpr int CPUS_PER_CLUSTER = 4;
static constexpr uint64_t SIGNAL_DELAY_SECONDS = 2;
constexpr int SCHE_THREAD_START_IDX = 1;

struct DynMachineManager {
    struct KernelCtrlEntry {
        int (*kernelCtrlServerInit)(void* targ);
        int (*kernelCtrlServer)(void* targ);
    };

    void SetCurThreadIdxForDav3510(int dieMaxCpuNum, int startIdx, int& curThreadIdx, std::atomic<int>& dieThreadIdx)
    {
        int expected = 0;
        while (expected < dieMaxCpuNum) { // ensure thread security
            int desired = expected + 1;
            if (dieThreadIdx.compare_exchange_weak(
                    expected, desired, std::memory_order_acq_rel, std::memory_order_acquire)) {
                int curDieThreadIdx = expected + startIdx;
                curThreadIdx = curDieThreadIdx;
                break;
            }
        }
    }

    // Common helper function for waiting CPU mask readiness with timeout check
    int WaitForCpuMaskReady(DeviceArgs* devArgs, int cpu, int curThreadIdx)
    {
        TIMEOUT_CHECK_INIT(devArgs->archInfo, TIMEOUT_20MIN);
        while (__builtin_popcount(cpumask_.load(std::memory_order_acquire)) != static_cast<int>(devArgs->nrAicpu)) {
            __PYPTO_TIMEOUT_CHECK(ThreadErr::THREAD_CPU_ALLOC_FAILED,
                return DEVICE_MACHINE_ERROR,
                "#sche.thread.init: Thread alloc, threadIdx=%d, physicalCpu=%d.",
                curThreadIdx, cpu);
        }
        return npu::tile_fwk::dynamic::DEVICE_MACHINE_OK;
    }

    int WaitForThreadIdxReady(DeviceArgs* devArgs, int &expected, int &curThreadIdx, std::atomic<int>& dieMaxThreadIdx)
    {
        TIMEOUT_CHECK_INIT(devArgs->archInfo, TIMEOUT_20MIN);
        while (curThreadIdx > expected &&
            !dieMaxThreadIdx.compare_exchange_strong(expected, curThreadIdx,
            std::memory_order_release, std::memory_order_relaxed)) {
            __PYPTO_TIMEOUT_CHECK(ThreadErr::THREAD_CPU_ALLOC_FAILED, return DEVICE_MACHINE_ERROR,
            "#thread idx update timeout: expected=%d, desired=%d.", expected, curThreadIdx);
        }
        return npu::tile_fwk::dynamic::DEVICE_MACHINE_OK;
    }

    int AllocThreadIdxForDav3510(DeviceArgs* devArgs, int cpu, int& curThreadIdx, std::atomic<int>& threadIdx)
    {
        int die0MaxCpuid = static_cast<int>(devArgs->maxAicpuNum >> 1);
        int die0MaxCpuNum = static_cast<int>(devArgs->scheCpuNum >> 1);
        int die1MaxCpuNum = static_cast<int>(devArgs->scheCpuNum) - die0MaxCpuNum;

        if (cpu <= die0MaxCpuid) {
            SetCurThreadIdxForDav3510(die0MaxCpuNum, SCHE_THREAD_START_IDX, curThreadIdx, die0ThreadIdx_);
            int expected = die0MaxThreadIdx_.load(std::memory_order_relaxed);
            WaitForThreadIdxReady(devArgs, expected, curThreadIdx, die0MaxThreadIdx_);
        } else {
            SetCurThreadIdxForDav3510(
                die1MaxCpuNum, die0MaxCpuNum + SCHE_THREAD_START_IDX, curThreadIdx, die1ThreadIdx_);
            int expected = die1MaxThreadIdx_.load(std::memory_order_relaxed);
            WaitForThreadIdxReady(devArgs, expected, curThreadIdx, die1MaxThreadIdx_);
        }

        cpumask_.fetch_or(1 << cpu, std::memory_order_release);
        
        int ret = WaitForCpuMaskReady(devArgs, cpu, curThreadIdx);
        if (ret != npu::tile_fwk::dynamic::DEVICE_MACHINE_OK) {
            return ret;
        }

        int expected = threadIdx.load(std::memory_order_relaxed);
        int desired;
        int maxRetries = MAX_RETRIES;
        while (maxRetries-- > 0) {  // 下次循环会重新读取 die0/die1,基于新 expected 计算 desired
            int die0MaxThreadIdx = die0MaxThreadIdx_.load(std::memory_order_acquire);
            int die1MaxThreadIdx = die1MaxThreadIdx_.load(std::memory_order_acquire);

            if (die0MaxThreadIdx != die0MaxCpuNum ||
                die1MaxThreadIdx != static_cast<int>(devArgs->scheCpuNum)) {
                desired = expected + 1;
            } else {
                desired = curThreadIdx;
            }

            if (desired == expected ||  // CAS 失败后,expected 已更新为最新值
                threadIdx.compare_exchange_strong(expected, desired,
                    std::memory_order_release, std::memory_order_relaxed)) {
                curThreadIdx = desired;
                break;
            }
        }

        DEV_INFO("Thread alloc success: physicalCpu=%d, threadIdx=%d.", cpu, curThreadIdx);
        return npu::tile_fwk::dynamic::DEVICE_MACHINE_OK;
    }

    int AllocThreadIdxForDav2201(int cpu, int& curThreadIdx, std::atomic<int>& threadIdx)
    {
        if (IsDeviceMode()) {
            if ((cpu >= CLUSTER_ID_LOW_BOUND_DIE0 && cpu <= CLUSTER_ID_HIGH_BOUND_DIE0) ||
                (cpu >= CLUSTER_ID_LOW_BOUND_DIE1 && cpu <= CLUSTER_ID_HIGH_BOUND_DIE1)) {
                curThreadIdx = ++threadIdx;
            } else {
                curThreadIdx = -1;
            }
        } else {
            (void) cpu;
            curThreadIdx = ++threadIdx;
        }
        return npu::tile_fwk::dynamic::DEVICE_MACHINE_OK;
    }

    int AllocThreadIdx(DeviceArgs* devArgs, int& curThreadIdx, std::atomic<int>& threadIdx)
    {
        int ret = npu::tile_fwk::dynamic::DEVICE_MACHINE_OK;
        if (devArgs->scheCpuNum == devArgs->nrAicpu) {
            curThreadIdx = ++threadIdx;
            return ret;
        }

#ifdef __DEVICE__
        int cpu = sched_getcpu();
#else
        int cpu = ++simCpuId_;
#endif
        if (devArgs->archInfo == ArchInfo::DAV_3510) {
            ret = AllocThreadIdxForDav3510(devArgs, cpu, curThreadIdx, threadIdx);
        } else if (devArgs->archInfo == ArchInfo::DAV_2201) {
            ret = AllocThreadIdxForDav2201(cpu, curThreadIdx, threadIdx);
        } else {
            curThreadIdx = ++threadIdx;
        }
        return ret;
    }

    void SignalReg(const sig_act_f sigAct)
    {
        DEV_INFO("Exception SignalReg.");
        struct sigaction myAct;
        (void)memset_s(&myAct, sizeof(myAct), 0, sizeof(myAct));
        sigemptyset(&myAct.sa_mask);
        myAct.sa_flags = SA_SIGINFO;
        myAct.sa_sigaction = sigAct;
        sigaction(SIGFPE, &myAct, &oriFPEAct_);
        sigaction(SIGBUS, &myAct, &oriBUSAct_);
        sigaction(SIGSEGV, &myAct, &oriSEGVAct_);
        sigaction(SIGPIPE, &myAct, &oriPIPEAct_);
        sigaction(SIGILL, &myAct, &oriILLAct_);
        sigaction(SIGABRT, &myAct, &oriBordAct_);
        return;
    }

    int RunCtrl(DeviceKernelArgs* kargs, const KernelCtrlEntry& entry, int threadIdx)
    {
        DEV_TRACE_DEBUG(schema::CtrlEvent(threadIdx, schema::ThreadStart()));

        DEV_INFO("ThreadCtrlEnter idx=%d", threadIdx);

        int ret = entry.kernelCtrlServer(static_cast<void*>(kargs));

        DEV_INFO("ThreadCtrlLeave idx=%d ret=%d", threadIdx, ret);
        return ret;
    }

    int RunSche(DeviceKernelArgs* kargs, const KernelCtrlEntry& entry, int threadIdx)
    {
        UNUSED(entry);

        DeviceArgs* devArgs = PtrToPtr<int64_t, DeviceArgs>(kargs->cfgdata);
        DEV_INFO("DeviceMode=%s, isDeviceMode=%d, stage=%s, threadIdx=%d", IsDeviceMode() ? "device" : "sim", IsDeviceMode(),
            "RunSche.before", threadIdx);
        DEV_INFO("ThreadScheEnter idx=%d", threadIdx);

        DEV_INFO(
            "TaskType=%d, threadIdx=%d, aicNum=%u, aivNum=%u, aicpuNum=%u, validAicNum=%u.",
            static_cast<int>(devArgs->taskType), threadIdx, devArgs->nrAic, devArgs->nrAiv, devArgs->nrAicpu,
            devArgs->nrValidAic);
        DEV_INFO(
            "devQueueAddr=%#lx, sharedBuffer=%#lx, coreRegAddr=%#lx, corePmuAdr=%#lx.", devArgs->devQueueAddr,
            devArgs->sharedBuffer, devArgs->coreRegAddr, devArgs->corePmuAddr);
        DEV_TRACE_DEBUG(schema::ScheEvent(threadIdx, schema::ThreadStart()));

        devArgs->toSubMachineConfig = kargs->toSubMachineConfig;
        SchduleContext localContext;
        int schedIdx = threadIdx - SCHE_THREAD_START_IDX;
        schMachine_.SetStachSchduleContext(schedIdx, &localContext);
        DevAscendProgram* devProg = reinterpret_cast<DevAscendProgram*>(kargs->cfgdata);
        DevStartArgs* devStartArgs =
            reinterpret_cast<DevStartArgs*>(devProg->GetRuntimeDataList()->GetRuntimeDataCurrent());
        int ret = schMachine_.RunThread(threadIdx, devStartArgs, devArgs, schedIdx);

        DEV_INFO("ThreadScheLeave idx=%d ret=%d", threadIdx, ret);
        return ret;
    }

    void RunSchInit(DeviceArgs *args)
    {
        if (initSch_.load()) {
            return;
        }
        schMachine_.init(args->scheCpuNum);
        initSch_.store(true);
    }

    void RunSchDeInit()
    {
        cpumask_ = 0;
        schExitNum_ = 0;
        die0ThreadIdx_ = 0;
        die1ThreadIdx_ = 0;
        initSch_.store(false);
#ifndef __DEVICE__
        simCpuId_ = 0;
#endif
    }

    void RunSchPost(DevAscendProgram *devProg)
    {
        ReleaseRuntimeDataRingBuffer(devProg);
        DEV_INFO("All schedule exited, destroy the machine.");
    }

    int RunCtrlInitNoLock(DeviceKernelArgs* kargs, const KernelCtrlEntry& entry)
    {
        int ret = entry.kernelCtrlServerInit(kargs);
        return ret;
    }

    __sighandler_t GetSigHandle(int signum)
    {
        __sighandler_t handle = nullptr;
        if (signum == static_cast<int>(SIGFPE)) {
            handle = oriFPEAct_.sa_handler;
        } else if (signum == static_cast<int>(SIGBUS)) {
            handle = oriBUSAct_.sa_handler;
        } else if (signum == static_cast<int>(SIGSEGV)) {
            handle = oriSEGVAct_.sa_handler;
        } else if (signum == static_cast<int>(SIGPIPE)) {
            handle = oriPIPEAct_.sa_handler;
        } else if (signum == static_cast<int>(SIGILL)) {
            handle = oriILLAct_.sa_handler;
        } else if (signum == static_cast<int>(SIGABRT)) {
            handle = oriBordAct_.sa_handler;
        }
        return handle;
    }

    void SigAct(int signum, siginfo_t* info, void* act)
    {
        (void)info;
        (void)act;
        DEV_ERROR(ThreadErr::SIGNAL_HANDLER_ABNORMAL, "#sche.except.signal: Exception Signum[%d] Act.", signum);
        PrintBacktrace(ThreadErr::SIGNAL_HANDLER_ABNORMAL, "signal " + std::to_string(signum));
        if (reset_.load()) {
            DEV_WARN("#sche.except.reset: Exception Already reset.");
            sleep(SIGNAL_DELAY_SECONDS);
            return;
        }
        reset_.store(true);
        if (!initSch_.load() && !initCtrl_.load()) {
            DEV_ERROR(ThreadErr::SIGNAL_HANDLER_ABNORMAL, "#sche.except.signal: Exception call ori sigact.");
            __sighandler_t handle = GetSigHandle(signum);
            if (handle == SIG_DFL) {
                DEV_ERROR(ThreadErr::SIGNAL_HANDLER_ABNORMAL, "#sche.except.signal: Ori sigact SIG_DFL.");
                signal(signum, SIG_DFL);
                raise(signum);
            } else if (handle == SIG_IGN) {
                DEV_ERROR(ThreadErr::SIGNAL_HANDLER_ABNORMAL, "#sche.except.signal: Ori sigact SIG_IGN.");
            } else if (handle != nullptr) {
                DEV_ERROR(ThreadErr::SIGNAL_HANDLER_ABNORMAL, "#sche.except.signal: Call Ori sigact.");
                handle(signum);
            }
            return;
        }
        schMachine_.ResetRegAll();
        sigaction(SIGFPE, &oriFPEAct_, nullptr);
        sigaction(SIGBUS, &oriBUSAct_, nullptr);
        sigaction(SIGSEGV, &oriSEGVAct_, nullptr);
        sigaction(SIGPIPE, &oriPIPEAct_, nullptr);
        sigaction(SIGILL, &oriILLAct_, nullptr);
        sigaction(SIGABRT, &oriBordAct_, nullptr);
        (void)raise(signum);
        return;
    }

    void ReleaseRuntimeDataRingBuffer(DevAscendProgram* devProg)
    {
        RuntimeDataRingBufferHead* runtimeDataList = devProg->GetRuntimeDataList();
        runtimeDataList->Deallocate(runtimeDataList->GetRuntimeDataCurrent());
        DEV_INFO("Runtimedata: %lu, %lu", runtimeDataList->GetIndexFinished(), runtimeDataList->GetIndexPending());
    }

    int EntrySplittedStreamCtrl(DeviceKernelArgs* kargs, const KernelCtrlEntry& entry)
    {
        // ctrl start only one thread
        DEV_INFO("Ctrl enter round=%d", (int)kargs->parameter.globalRound);
        initCtrl_.store(true);
        int ret = RunCtrlInitNoLock(kargs, entry);
        if (ret != 0) {
            initCtrl_.store(false);
            DeviceTrace::GetInstance().ReportTraceMsg();
            return ret;
        }
        kargs->taskWastTime = GetCycles();
        ret = RunCtrl(kargs, entry, 0);
        PerfMtTrace(PERF_TRACE_BEGIN, 0, kargs->taskWastTime);
        PerfMtTrace(PERF_TRACE_EXIT, 0);
        DEV_INFO("Ctrl leave ret=%d", ret);
        if (ret != DEVICE_MACHINE_OK) {
            DeviceTrace::GetInstance().ReportTraceMsg();
        }
        initCtrl_.store(false);
        PerfEvtMgr::Instance().AddCtrlTurn();
        return ret;
    }

    void ReCalcDevArgsAicoreNum(DeviceKernelArgs* kargs, DevAscendProgram* devProg)
    {
        if (kargs->parameter.ctrlBlockNum != 0 &&
            static_cast<uint32_t>(kargs->parameter.ctrlBlockNum) != devProg->devArgs.nrValidAic) {
            devProg->devArgs.nrValidAic = kargs->parameter.ctrlBlockNum;
            DEV_INFO("control aicore before launch, nrValidAic changed to %lu", kargs->parameter.ctrlBlockNum);
        }
    }

    int EntrySplittedStreamSche(DeviceKernelArgs* kargs, const KernelCtrlEntry& entry)
    {
        DevAscendProgram* devProg = PtrToPtr<int64_t, DevAscendProgram>(kargs->cfgdata);
        int scheWaitRet = splittedInfo_.ScheWait(devProg);
        if (scheWaitRet != DEVICE_MACHINE_OK) {
            DEV_ERROR(SchedErr::RINGBUFFER_WAIT_TIMEOUT, "#sche.wait: ScheWait failed, ret=%d.", scheWaitRet);
            DeviceTrace::GetInstance().ReportTraceMsg();
            return scheWaitRet;
        }
        auto beginTime = GetCycles();   // After wait, the devStartArgs should be ready.
        DevStartArgs* runtimeDataCurrent =
            reinterpret_cast<DevStartArgs*>(devProg->GetRuntimeDataList()->GetRuntimeDataCurrent());
        ReCalcDevArgsAicoreNum(kargs, devProg);
        auto devArgs = devProg->devArgs;
        int threadIdx = -1;
        RunSchInit(&devArgs);
        if (AllocThreadIdx(&devArgs, threadIdx, runtimeDataCurrent->devScheState.threadIdx) != npu::tile_fwk::dynamic::DEVICE_MACHINE_OK) {
                DEV_ERROR(ThreadErr::THREAD_CPU_ALLOC_FAILED, "#sche.thread.init: Current cpu[%d] alloc thread failed.", sched_getcpu());
            DEV_ATRACE("Schedule Current cpu[%d] alloc thread failed", sched_getcpu());
            DeviceTrace::GetInstance().ReportTraceMsg();
            return npu::tile_fwk::dynamic::DEVICE_MACHINE_ERROR;
        }
        PerfMtTrace(PERF_TRACE_ALLOC_THREAD_ID, threadIdx);
        PerfMtTrace(PERF_TRACE_BEGIN, threadIdx, beginTime);
        int ret = DEVICE_MACHINE_OK;
        if (threadIdx != -1 && threadIdx <= static_cast<int>(devArgs.scheCpuNum)) {
            DEV_INFO("SchedThreadEnter idx=%d round=%d", threadIdx, (int)kargs->parameter.globalRound);
            ret = RunSche(kargs, entry, threadIdx);
            DEV_INFO("SchedThreadLeave idx=%d ret=%d", threadIdx, ret);
            if (ret != DEVICE_MACHINE_OK) {
                DeviceTrace::GetInstance().ReportTraceMsg();
            }

            if (splittedInfo_.ScheSync(runtimeDataCurrent, devArgs.scheCpuNum)) {
                ret = DEVICE_MACHINE_OK;
            }
            PerfMtTrace(PERF_TRACE_EXIT, threadIdx);
        }
        if (++schExitNum_ == devArgs.nrAicpu) {
            RunSchPost(devProg);
            RunSchDeInit();
            PerfEvtMgr::Instance().AddScheduleTurn();
            DEV_INFO("All sche cpu exited.");
        }
        return ret;
    }

    int Entry(DeviceKernelArgs* kargs, const KernelCtrlEntry& entry)
    {
        switch (kargs->parameter.runMode) {
            case RUN_SPLITTED_STREAM_CTRL:
                return EntrySplittedStreamCtrl(kargs, entry);
                break;
            case RUN_SPLITTED_STREAM_SCHE:
                return EntrySplittedStreamSche(kargs, entry);
                break;
            default:
                DEV_ERROR(
                    DevCommonErr::PARAM_INVALID, "#dev.entry.invalid_mode: Invalid run mode: %d\n",
                    (int)kargs->parameter.runMode);
                break;
        }
        return DEVICE_MACHINE_INVALID_RUN_MODE;
    }

    int LastFinishThreadIdx_{0};
    std::atomic<uint64_t> cpumask_{0};
    std::atomic<uint32_t> schExitNum_{0};
#ifndef __DEVICE__
    std::atomic<int> simCpuId_{0};
#endif
    std::atomic<int> ctrlcpuIdx_{0};
    std::atomic<int> die0ThreadIdx_{0};
    std::atomic<int> die1ThreadIdx_{0};
    std::atomic<int> die0MaxThreadIdx_{-1};
    std::atomic<int> die1MaxThreadIdx_{-1};
    DeviceSchedMachine schMachine_;
    struct sigaction oriFPEAct_;
    struct sigaction oriBUSAct_;
    struct sigaction oriSEGVAct_;
    struct sigaction oriPIPEAct_;
    struct sigaction oriILLAct_;
    struct sigaction oriBordAct_;
    std::atomic<bool> reset_{false};
    std::atomic<bool> initCtrl_{false};
	std::atomic<bool> initSch_{false};
    std::atomic<bool> schRunFailed_{false};

    struct SplittedInfo {
        std::atomic<uint64_t> currentRound{0};

        int ScheWait(DevAscendProgram* devProg)
        {
            TIMEOUT_CHECK_INIT(devProg->devArgs.archInfo, TIMEOUT_1MIN);
            
            while (unlikely(!devProg->runtimeDataRingBufferInited)) {
                RuntimeYield(0);
                
                __PYPTO_TIMEOUT_CHECK(SchedErr::RINGBUFFER_WAIT_TIMEOUT,
                    return DEVICE_MACHINE_ERROR,
                    "#sche.wait: RingBuffer init.");
            }
            RuntimeDataRingBufferHead* ringBufferHead = devProg->GetRuntimeDataList();
            start = GetCycles();
            
            while (unlikely(ringBufferHead->Empty())) {
                RuntimeYield(0);
                
                __PYPTO_TIMEOUT_CHECK(SchedErr::RINGBUFFER_WAIT_TIMEOUT,
                    return DEVICE_MACHINE_ERROR,
                    "#sche.wait: RingBuffer data.");
            }
            return DEVICE_MACHINE_OK;
        }

        bool ScheSync(DevStartArgs* devStartArgs, int schNum)
        {
            return ++devStartArgs->devScheState.finished == schNum;
        }
    } splittedInfo_;
};

} // namespace npu::tile_fwk::dynamic