#include "kernel_operator.h"
#include "sk_common.h"
#include "sk_flag_diag.h"
#include <cstdio>

typedef void (*sk_sub_func)(const __gm__ void *param, const sk::SkSystemArgs *sysArgs);

// 事件记录常量
constexpr uint32_t SK_KERNEL_EVENT_RECORD_SIZE = sizeof(SkKernelEventRecord);
constexpr uint32_t UINT32_MAX_VAL = 0xFFFFFFFF;

namespace sk {

// 写入事件记录到 GM
__aicore__ inline void WriteEventRecord(__gm__ void* gmBase, uint64_t modelIdIndex, uint32_t skId, uint32_t nodeId,
    uint32_t blockNum, uint64_t startTime, uint64_t endTime, uint32_t coreSizeBuffer) 
{
    if (gmBase == nullptr) {
        return;
    }
    
    __gm__ SkKernelEventCoreBuf* coreBuf = (__gm__ SkKernelEventCoreBuf*)gmBase;
    
    // core地址增加偏移
    uint32_t oldOffset = coreBuf->offset;
    if (oldOffset == 0) {
        oldOffset = sizeof(SkKernelEventCoreBuf); // 初始化偏移为结构体大小
    }
    
    // 检查是否超出 1MB 限制
    if (oldOffset + SK_KERNEL_EVENT_RECORD_SIZE > coreSizeBuffer) {
        // 空间已满,不再写入
        return;
    }
    
    // 写入记录
    __gm__ SkKernelEventRecord* record = (__gm__ SkKernelEventRecord*)((__gm__ char*)gmBase + oldOffset);
    record->modelIdIndex = modelIdIndex;
    record->skId = skId;
    record->nodeId = nodeId;
    record->blockIdx = static_cast<uint8_t>(AscendC::GetBlockIdx());
    record->blockNum = static_cast<uint8_t>(blockNum);
    record->startTime = startTime;
    record->endTime = endTime;
    
    // 更新偏移
    coreBuf->offset = oldOffset + SK_KERNEL_EVENT_RECORD_SIZE;
}

template<bool aic_flag>
__aicore__ inline void NotifyFunc(GM_ADDR param, uint64_t value) {
    if constexpr(aic_flag) {
        if (get_block_idx() ==0) {
            __gm__ uint64_t *notifyLock = reinterpret_cast<__gm__ uint64_t *>(param);
            *notifyLock = value;
            dcci(notifyLock, 0, 2);
        }
    } else {
        if (AscendC::GetBlockIdx() == 0) {
            __gm__ uint64_t *notifyLock = reinterpret_cast<__gm__ uint64_t *>(param);
            *notifyLock = value;
            dcci(notifyLock, 0, 2);
        }
    }
}

template<bool aic_flag>
__aicore__ inline void WaitFunc(GM_ADDR param, uint64_t value, uint32_t flag) {
    if constexpr(aic_flag) {
        if (get_block_idx() ==0) {
            __gm__ volatile uint64_t *waitLock = reinterpret_cast<__gm__ uint64_t *>(param);
            if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::GEQ)) {
                dcci(waitLock, 0, 2);
                while (*waitLock < value) {
                    dcci(waitLock, 0, 2);
                }
            } else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::EQ)) {
                dcci(waitLock, 0, 2);
                while (*waitLock != value) {
                    dcci(waitLock, 0, 2);
                }
            } else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::AND)) {
                dcci(waitLock, 0, 2);
                while ((*waitLock & value) == 0) {
                    dcci(waitLock, 0, 2);
                }
            } else {
                dcci(waitLock, 0, 2);
                while ((~(*waitLock | value)) == 0) {
                    dcci(waitLock, 0, 2);
                }
            }
        }
    } else {
        if (AscendC::GetBlockIdx() == 0) {
            __gm__ volatile uint64_t *waitLock = reinterpret_cast<__gm__ uint64_t *>(param);
            if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::GEQ)) {
                dcci(waitLock, 0, 2);
                while (*waitLock < value) {
                    dcci(waitLock, 0, 2);
                }
            } else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::EQ)) {
                dcci(waitLock, 0, 2);
                while (*waitLock != value) {
                    dcci(waitLock, 0, 2);
                }
            } else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::AND)) {
                dcci(waitLock, 0, 2);
                while ((*waitLock & value) == 0) {
                    dcci(waitLock, 0, 2);
                }
            } else {
                dcci(waitLock, 0, 2);
                while ((~(*waitLock | value)) == 0) {
                    dcci(waitLock, 0, 2);
                }
            }
        }
    }
}

template<bool aic_flag>
__aicore__ inline void ResetFunc(GM_ADDR param, uint64_t value) {
    if constexpr(aic_flag) {
        if (get_block_idx() ==0) {
            __gm__ uint64_t *resetLock = reinterpret_cast<__gm__ uint64_t *>(param);
            *resetLock = value;
            dcci(resetLock, 0, 2);
        }
    } else {
        if (AscendC::GetBlockIdx() == 0) {
            __gm__ uint64_t *resetLock = reinterpret_cast<__gm__ uint64_t *>(param);
            *resetLock = value;
            dcci(resetLock, 0, 2);
        }
    }
}

template <uint8_t aic, uint8_t aiv>
__aicore__ inline void FullCoreSyncImpl()
{
    if constexpr (aic == 1 && aiv == 0) {
        AscendC::CrossCoreSetFlag<0x0, PIPE_FIX>(AscendC::SYNC_AIC_FLAG);
        AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_FLAG);
    } else if constexpr (aic == 0 && aiv == 1) {
        AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(AscendC::SYNC_AIV_ONLY_ALL);
        AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_ONLY_ALL);
    } else {
        if ASCEND_IS_AIC {
            AscendC::CrossCoreSetFlag<0x0, PIPE_FIX>(AscendC::SYNC_AIC_FLAG);
            AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_FLAG);
            AscendC::CrossCoreSetFlag<0x02, PIPE_FIX>(AscendC::SYNC_AIC_AIV_FLAG);
            AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_FLAG);
        }
        if ASCEND_IS_AIV {
            AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(AscendC::SYNC_AIV_ONLY_ALL);
            AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_ONLY_ALL);
            AscendC::CrossCoreSetFlag<0x02, PIPE_MTE3>(AscendC::SYNC_AIV_FLAG);
            AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_AIV_FLAG);
        }
    }
}

template <uint8_t aic, uint8_t aiv>
__aicore__ inline void AutoCoreSyncImpl(SkCoreSyncType syncType) {
    switch (syncType) {
        case SkCoreSyncType::CROSS_SYNC_AIC_TO_AIC:
            if ASCEND_IS_AIC {
                AscendC::CrossCoreSetFlag<0x0, PIPE_FIX>(AscendC::SYNC_AIC_FLAG);
                AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_FLAG);
            }
            return;
        case SkCoreSyncType::CROSS_SYNC_AIV_TO_AIV:
            if ASCEND_IS_AIV {
                AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(AscendC::SYNC_AIV_ONLY_ALL);
                AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_ONLY_ALL);
            }
            return;
        case SkCoreSyncType::INTER_SYNC_SET_AIC_TO_AIV:
            if ASCEND_IS_AIC {
                AscendC::CrossCoreSetFlag<0x02, PIPE_FIX>(AscendC::SYNC_AIC_AIV_FLAG);
            }
            return;
        case SkCoreSyncType::INTER_SYNC_SET_AIV_TO_AIC:
            if ASCEND_IS_AIV {
                AscendC::CrossCoreSetFlag<0x02, PIPE_MTE3>(AscendC::SYNC_AIV_FLAG);
            }
            return;
        case SkCoreSyncType::INTER_SYNC_WAIT_AIC_TO_AIV:
            if ASCEND_IS_AIV {
                AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_AIV_FLAG);
            }
            return;
        case SkCoreSyncType::INTER_SYNC_WAIT_AIV_TO_AIC:
            if ASCEND_IS_AIC {
                AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_FLAG);
            }
            return;
        case SkCoreSyncType::ALL_SYNC:
            FullCoreSyncImpl<aic, aiv>();
            return;
        default:
            AscendC::Trap();
            return;
    }
}

template <uint8_t aic, uint8_t aiv>
__aicore__ inline void AutoCoreSyncImpl(SkCoreSyncType syncType, uint8_t numBlocks, uint64_t syncConfig)
{
    if (syncConfig == 0) {
        AutoCoreSyncImpl<aic, aiv>(syncType);
        return;
    }

    if (AscendC::GetBlockIdx() < numBlocks) {
        return;
    }

    if ASCEND_IS_AIC {
        if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIC_TO_AIC_SET)) != 0) {
            AscendC::CrossCoreSetFlag<0x0, PIPE_FIX>(AscendC::SYNC_AIC_FLAG);
        }
        if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIC_TO_AIC_WAIT)) != 0) {
            AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_FLAG);
        }
        if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIC_TO_AIV_SET)) != 0) {
            AscendC::CrossCoreSetFlag<0x02, PIPE_FIX>(AscendC::SYNC_AIC_AIV_FLAG);
        }
        if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIV_TO_AIC_WAIT)) != 0) {
            AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_FLAG);
        }
    }

    if ASCEND_IS_AIV {
        if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIV_TO_AIV_SET)) != 0) {
            AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(AscendC::SYNC_AIV_ONLY_ALL);
        }
        if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIV_TO_AIV_WAIT)) != 0) {
            AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_ONLY_ALL);
        }
        if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIV_TO_AIC_SET)) != 0) {
            AscendC::CrossCoreSetFlag<0x02, PIPE_MTE3>(AscendC::SYNC_AIV_FLAG);
        }
        if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIC_TO_AIV_WAIT)) != 0) {
            AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_AIV_FLAG);
        }
    }
}

} // namespace sk

template <uint8_t aic, uint8_t aiv, bool enable_debug = false, bool enable_profiling = false,
    bool enable_op_trace = false, bool enable_early_start = false>
__aicore__ inline void spk_entry_impl(GM_ADDR skDevArgs) {
    const __gm__ SkHeaderInfo *sk_header = (const __gm__ SkHeaderInfo *)(get_para_base());
    
    // op_trace counter info declaration
    __gm__ SkCounterInfo *counterInfo = nullptr;
    if constexpr(enable_op_trace){
        counterInfo = (__gm__ SkCounterInfo *)((const __gm__ uint8_t *)sk_header + sk_header->counterOffset + sizeof(SkCounterInfo) * (get_coreid() & 0x00FF) );
        counterInfo->opState = static_cast<uint8_t>(SkOpTraceType::SK_ENTRY_LAUNCHED);
        dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
    }
    uint64_t cond = static_cast<uint64_t>(SkOpTraceType::SK_ENTRY_LAUNCHED);
    uint64_t modelIdIndexAndSkScopeId = sk_header->modelIdIndexAndSkScopeId;
    cond = modelIdIndexAndSkScopeId | cond;
    set_cond(cond);
    __gm__ void* eventGmAddr = nullptr;
    uint64_t eventModelIdIndex = 0;
    uint32_t eventSkId = 0;
    uint64_t startTime = 0;
    uint64_t endTime = 0;
    uint64_t skStartTime = 0;
    uint64_t skEndTime = 0;
    uint8_t skBlockNum = AscendC::GetBlockNum();
    uint32_t coreSizeBuffer = 0;
    uint8_t eventEnabled = 0;
    if constexpr(enable_profiling){
        // 读取事件配置信息
        __gm__ SkEventConfig* eventConfig = nullptr;
        
        if (sk_header->eventConfigOffset != 0) {
            eventConfig = (__gm__ SkEventConfig*)((const __gm__ uint8_t *)sk_header + sk_header->eventConfigOffset);
            if (eventConfig->enabled != 0) {
                eventGmAddr = (__gm__ void*)eventConfig->eventGmAddr;
                eventModelIdIndex = eventConfig->modelIdIndex;
                eventSkId = eventConfig->skId;
                eventEnabled = eventConfig->enabled;
                coreSizeBuffer = eventConfig->coreSize;
                
                // 计算当前 core 的 GM 地址
                uint32_t coreId = get_coreid() & 0x00FF;
                eventGmAddr = (__gm__ void*)((__gm__ char*)eventGmAddr + coreId * coreSizeBuffer);
            }
        }
        skStartTime = get_sys_cnt();
    }
    
    uint32_t que_offset = 0;
    if ASCEND_IS_AIC {
        que_offset = sk_header->aicQueOffset + (((uint8_t)get_coreid()) % 4) * sk_header->aicQueSize;
        if constexpr(enable_profiling){
            if (aic != 0) {
                skBlockNum = AscendC::GetBlockNum() * aic;
            }
        }
    }
    if ASCEND_IS_AIV {
        que_offset = sk_header->aivQueOffset + (((uint8_t)get_coreid()) % 4) * sk_header->aivQueSize;
        if constexpr(enable_profiling){
            if (aiv != 0) {
                skBlockNum = AscendC::GetBlockNum() * aiv;
            }
        }
    }
    const __gm__ TaskQue *taskQue = (const __gm__ TaskQue *)((const __gm__ uint8_t *)sk_header + que_offset);
    for (auto i = 0; i < taskQue->taskCnt; i++) {
        const __gm__ TaskInfo *task = taskQue->taskInfos + i;
        if (task->type == SkTaskType::TYPE_PRELOAD) {
            auto blockId = AscendC::GetBlockIdx();
            if (blockId < task->numBlocks) {
                preload((const void *)(task->entry[((uint8_t)get_coreid()) % task->entryCnt]), task->args);
                dc_preload((__gm__ uint64_t *)(task->reserved), 0);
                dc_preload((__gm__ uint64_t *)(task->reserved) + 8, 0);
                dc_preload((__gm__ uint64_t *)(task + 1), 0);
            }
        } else if (task->type == SkTaskType::TYPE_FUNC) {
            auto blockId = AscendC::GetBlockIdx();
            if (blockId < task->numBlocks) {
                if constexpr(enable_profiling){
                    if (eventEnabled) {
                            startTime = get_sys_cnt();
                        }
                }
                if constexpr(enable_op_trace){
                    counterInfo->index = task->index;
                    counterInfo->opState = static_cast<uint8_t>(SkOpTraceType::OP_LAUNCHED);
                    dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
                }
                uint64_t cond = static_cast<uint64_t>(SkOpTraceType::OP_LAUNCHED) + (static_cast<uint64_t>(task->index) << 8);
                cond = modelIdIndexAndSkScopeId | cond;
                set_cond(cond);
                sk::SkSystemArgs sysArgs = {
                    static_cast<uint16_t>(blockId),
                    static_cast<uint16_t>(task->numBlocks),
                    static_cast<uint16_t>(task->reserved), // reserved field used for early start sync config
                };

                if ((task->debugOptions & 0x4) != 0) {
                    dcci(nullptr, 1, 2);
                }

                ((sk_sub_func)(task->entry[((uint8_t)get_coreid()) % task->entryCnt]))(
                    (const __gm__ void *)task->args,
                    &sysArgs
                );

                // 执行post-kernel dcci: 直接检查enable_dcci_after_func (bit 5 = 0x20)
                // 该bit由host端预先计算,综合了disableDcci和afterKernelEnd的逻辑
                // kernel侧无需组合判断,简化判断流程
                if ((task->debugOptions & 0x20) != 0) {
                    dcci(nullptr, 1, 2);
                }

                if constexpr(enable_op_trace){
                    counterInfo->opState = static_cast<uint8_t>(SkOpTraceType::OP_FINISHED);
                    dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
                }
                cond = static_cast<uint64_t>(SkOpTraceType::OP_FINISHED) + (static_cast<uint64_t>(task->index) << 8);
                cond = modelIdIndexAndSkScopeId | cond;
                set_cond(cond);
                if constexpr(enable_profiling){
                    if (eventEnabled) {
                        endTime = get_sys_cnt();
                        sk::WriteEventRecord(eventGmAddr, eventModelIdIndex, eventSkId,
                                            task->index, task->numBlocks, startTime, endTime, coreSizeBuffer);
                    }
                }
            }
            if constexpr(enable_op_trace){
                constexpr bool is_aic_or_aiv_only = (aic == 1 && aiv == 0) || (aic == 0 && aiv == 1);
                if constexpr (!is_aic_or_aiv_only) {
                    bool is_mix_kernel = (task->relatedType == SkKernelType::MIX_AIC_1_1) ||
                        (task->relatedType == SkKernelType::MIX_AIC_1_2);
                    if (((task->debugOptions & 0x10) != 0) && is_mix_kernel) {
                        test_cross_core_sync_flags();
                    }
                }
            }
        } else if (task->type == SkTaskType::TYPE_SYNC) {
            if constexpr(enable_debug) {
                if ((task->debugOptions & 0x2) != 0) {
                    sk::FullCoreSyncImpl<aic, aiv>();
                } else {
                    if constexpr(enable_early_start) {
                        sk::AutoCoreSyncImpl<aic, aiv>((SkCoreSyncType)task->args,
                            task->numBlocks, task->reserved);
                    } else {
                        sk::AutoCoreSyncImpl<aic, aiv>((SkCoreSyncType)task->args);
                    }
                }
            } else{
                if constexpr(enable_early_start) {
                    sk::AutoCoreSyncImpl<aic, aiv>((SkCoreSyncType)task->args, task->numBlocks, task->reserved);
                } else {
                    sk::AutoCoreSyncImpl<aic, aiv>((SkCoreSyncType)task->args);
                }
            }
        } else if (task->type == SkTaskType::TYPE_EVENT_NOTIFY) {
            GM_ADDR eventAddr = reinterpret_cast<GM_ADDR>(task->args);
            const uint64_t eventValue = task->entry[0];
            if ASCEND_IS_AIC {
                sk::NotifyFunc<true>(eventAddr, eventValue);
            }
            if ASCEND_IS_AIV {
                sk::NotifyFunc<false>(eventAddr, eventValue);
            }
        } else if (task->type == SkTaskType::TYPE_EVENT_WAIT) {
            GM_ADDR eventAddr = reinterpret_cast<GM_ADDR>(task->args);
            const uint64_t eventValue = task->entry[0];
            const uint32_t waitFlag = static_cast<uint32_t>(task->reserved);
            if ASCEND_IS_AIC {
                sk::WaitFunc<true>(eventAddr, eventValue, waitFlag);
            }
            if ASCEND_IS_AIV {
                sk::WaitFunc<false>(eventAddr, eventValue, waitFlag);
            }
        } else if (task->type == SkTaskType::TYPE_EVENT_RESET) {
            GM_ADDR eventAddr = reinterpret_cast<GM_ADDR>(task->args);
            const uint64_t eventValue = task->entry[0];
            if ASCEND_IS_AIC {
                sk::ResetFunc<true>(eventAddr, eventValue);
            }
            if ASCEND_IS_AIV {
                sk::ResetFunc<false>(eventAddr, eventValue);
            }
        }
    }
    if constexpr(enable_profiling){
        if (eventEnabled) {
            skEndTime = get_sys_cnt();
            sk::WriteEventRecord(eventGmAddr, eventModelIdIndex, eventSkId,
                                UINT32_MAX_VAL, skBlockNum, skStartTime, skEndTime, coreSizeBuffer);
        }
    }
    pipe_barrier(PIPE_ALL);
    if constexpr(enable_op_trace){
        counterInfo->opState = static_cast<uint8_t>(SkOpTraceType::SK_ENTRY_FINISHED);
        dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
    }
    cond = static_cast<uint64_t>(SkOpTraceType::SK_ENTRY_FINISHED);
    cond = modelIdIndexAndSkScopeId | cond;
    set_cond(cond);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, false, false>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, false, false>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, false, false>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, false, false>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, true, false>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, true, false>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, true, false>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, true, false>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_dump_profiling(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_dump_profiling(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_dump_profiling(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_dump_profiling(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_dump_profiling(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_dump_profiling(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_dump_profiling(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_dump_profiling(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, false, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, false, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, false, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, false, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, true, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, true, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, true, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, true, false, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_dump_profiling_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, false, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_dump_profiling_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, false, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_dump_profiling_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, false, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_dump_profiling_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, false, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_dump_profiling_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, true, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_dump_profiling_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, true, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_dump_profiling_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, true, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_dump_profiling_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, true, true, false, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, false, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, false, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, false, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, false, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, true, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, true, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, true, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, true, false, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, false, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, false, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, false, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, false, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<0, 1, true, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 0, true, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 1, true, true, true, true>(skDevArgs);
}

extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
    spk_entry_impl<1, 2, true, true, true, true>(skDevArgs);
}