#include "kernel_operator.h"
#include "sk_common.h"
#include "sk_flag_diag.h"
#include <cstdio>
typedef void (*sk_sub_func)(const __gm__ void *param, const sk::SkSystemArgs *sysArgs);
// 事件记录常量
constexpr uint32_t SK_KERNEL_EVENT_RECORD_SIZE = sizeof(SkKernelEventRecord);
constexpr uint32_t UINT32_MAX_VAL = 0xFFFFFFFF;
namespace AscendC {
// 写入事件记录到 GM
__aicore__ inline void WriteEventRecord(__gm__ void* gmBase, uint64_t modelRI, uint32_t skId, uint32_t nodeId,
uint32_t blockNum, uint64_t startTime, uint64_t endTime, uint32_t coreSizeBuffer)
{
if (gmBase == nullptr) {
return;
}
__gm__ SkKernelEventCoreBuf* coreBuf = (__gm__ SkKernelEventCoreBuf*)gmBase;
// core地址增加偏移
uint32_t oldOffset = coreBuf->offset;
if (oldOffset == 0) {
oldOffset = sizeof(SkKernelEventCoreBuf); // 初始化偏移为结构体大小
}
// 检查是否超出 1MB 限制
if (oldOffset + SK_KERNEL_EVENT_RECORD_SIZE > coreSizeBuffer) {
// 空间已满,不再写入
return;
}
// 写入记录
__gm__ SkKernelEventRecord* record = (__gm__ SkKernelEventRecord*)((__gm__ char*)gmBase + oldOffset);
record->modelRI = modelRI;
record->skId = skId;
record->nodeId = nodeId;
record->blockIdx = static_cast<uint8_t>(AscendC::GetBlockIdx());
record->blockNum = static_cast<uint8_t>(blockNum);
record->startTime = startTime;
record->endTime = endTime;
// 更新偏移
coreBuf->offset = oldOffset + SK_KERNEL_EVENT_RECORD_SIZE;
}
template<bool aic_flag>
__aicore__ inline void NotifyFunc(GM_ADDR param, uint64_t value) {
if constexpr(aic_flag) {
if (get_block_idx() ==0) {
__gm__ uint64_t *notifyLock = reinterpret_cast<__gm__ uint64_t *>(param);
*notifyLock = value;
dcci(notifyLock, 0, 2);
}
} else {
if (AscendC::GetBlockIdx() == 0) {
__gm__ uint64_t *notifyLock = reinterpret_cast<__gm__ uint64_t *>(param);
*notifyLock = value;
dcci(notifyLock, 0, 2);
}
}
}
template<bool aic_flag>
__aicore__ inline void WaitFunc(GM_ADDR param, uint64_t value, uint32_t flag) {
if constexpr(aic_flag) {
if (get_block_idx() ==0) {
__gm__ volatile uint64_t *waitLock = reinterpret_cast<__gm__ uint64_t *>(param);
if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::GEQ)) {
dcci(waitLock, 0, 2);
while (*waitLock < value) {
dcci(waitLock, 0, 2);
}
} else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::EQ)) {
dcci(waitLock, 0, 2);
while (*waitLock != value) {
dcci(waitLock, 0, 2);
}
} else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::AND)) {
dcci(waitLock, 0, 2);
while ((*waitLock & value) == 0) {
dcci(waitLock, 0, 2);
}
} else {
dcci(waitLock, 0, 2);
while ((~(*waitLock | value)) == 0) {
dcci(waitLock, 0, 2);
}
}
}
} else {
if (AscendC::GetBlockIdx() == 0) {
__gm__ volatile uint64_t *waitLock = reinterpret_cast<__gm__ uint64_t *>(param);
if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::GEQ)) {
dcci(waitLock, 0, 2);
while (*waitLock < value) {
dcci(waitLock, 0, 2);
}
} else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::EQ)) {
dcci(waitLock, 0, 2);
while (*waitLock != value) {
dcci(waitLock, 0, 2);
}
} else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::AND)) {
dcci(waitLock, 0, 2);
while ((*waitLock & value) == 0) {
dcci(waitLock, 0, 2);
}
} else {
dcci(waitLock, 0, 2);
while ((~(*waitLock | value)) == 0) {
dcci(waitLock, 0, 2);
}
}
}
}
}
template<bool aic_flag>
__aicore__ inline void ResetFunc(GM_ADDR param, uint64_t value) {
if constexpr(aic_flag) {
if (get_block_idx() ==0) {
__gm__ uint64_t *resetLock = reinterpret_cast<__gm__ uint64_t *>(param);
*resetLock = value;
dcci(resetLock, 0, 2);
}
} else {
if (AscendC::GetBlockIdx() == 0) {
__gm__ uint64_t *resetLock = reinterpret_cast<__gm__ uint64_t *>(param);
*resetLock = value;
dcci(resetLock, 0, 2);
}
}
}
template <uint8_t aic, uint8_t aiv>
__aicore__ inline void AutoCoreSyncImpl(SkCoreSyncType sync_type) {
switch (sync_type) {
case SkCoreSyncType::CROSS_SYNC_AIC_TO_AIC:
if ASCEND_IS_AIC {
ffts_cross_core_sync(PIPE_FIX, AscendC::GetffstMsg(0x0, AscendC::SYNC_AIC_FLAG));
wait_flag_dev(AscendC::SYNC_AIC_FLAG);
}
return;
case SkCoreSyncType::CROSS_SYNC_AIV_TO_AIV:
if ASCEND_IS_AIV {
ffts_cross_core_sync(PIPE_MTE3, AscendC::GetffstMsg(0x0, AscendC::SYNC_AIV_ONLY_ALL));
wait_flag_dev(AscendC::SYNC_AIV_ONLY_ALL);
}
return;
case SkCoreSyncType::INTER_SYNC_SET_AIC_TO_AIV:
if ASCEND_IS_AIC {
ffts_cross_core_sync(PIPE_MTE3, AscendC::GetffstMsg(0x02, AscendC::SYNC_AIC_AIV_FLAG));
}
return;
case SkCoreSyncType::INTER_SYNC_SET_AIV_TO_AIC:
if ASCEND_IS_AIV {
ffts_cross_core_sync(PIPE_MTE3, AscendC::GetffstMsg(0x02, AscendC::SYNC_AIV_FLAG));
}
return;
case SkCoreSyncType::INTER_SYNC_WAIT_AIC_TO_AIV:
if ASCEND_IS_AIV {
wait_flag_dev(AscendC::SYNC_AIC_AIV_FLAG);
}
return;
case SkCoreSyncType::INTER_SYNC_WAIT_AIV_TO_AIC:
if ASCEND_IS_AIC {
wait_flag_dev(AscendC::SYNC_AIV_FLAG);
}
return;
default:
if constexpr (aic == 1 && aiv == 0) {
ffts_cross_core_sync(PIPE_FIX, AscendC::GetffstMsg(0x0, AscendC::SYNC_AIC_FLAG));
wait_flag_dev(AscendC::SYNC_AIC_FLAG);
} else if constexpr (aic == 0 && aiv==1) {
ffts_cross_core_sync(PIPE_MTE3, AscendC::GetffstMsg(0x0, AscendC::SYNC_AIV_ONLY_ALL));
wait_flag_dev(AscendC::SYNC_AIV_ONLY_ALL);
} else {
AscendC::SyncAll<false>();
}
return;
return;
}
}
}
namespace sk {
__aicore__ inline bool IsValidKernelType(SkKernelType curType) {
auto type = static_cast<uint16_t>(curType);
return type >= static_cast<uint16_t>(SkKernelType::AIC_ONLY) &&
type <= static_cast<uint16_t>(SkKernelType::MIX_AIC_1_2);
}
__aicore__ inline uint16_t GetSyncCombinationValue(SkKernelType preType, SkKernelType curType) {
if (!IsValidKernelType(preType) || !IsValidKernelType(curType)) {
return INVALID_SYNC_COMBINATION;
}
return SYNC_COMBINATION_TABLE[static_cast<uint16_t>(preType) - 1]
[static_cast<uint16_t>(curType) - 1];
}
} // namespace sk
template <uint8_t aic, uint8_t aiv, bool enable_debug = false, bool enable_profiling = false, bool enable_op_trace = false>
__aicore__ inline void spk_entry_impl(GM_ADDR skDevArgs) {
const __gm__ SkHeaderInfo *sk_header = (const __gm__ SkHeaderInfo *)(get_para_base());
// op_trace counter info declaration
__gm__ SkCounterInfo *counterInfo = nullptr;
if constexpr(enable_op_trace){
counterInfo = (__gm__ SkCounterInfo *)((const __gm__ uint8_t *)sk_header + sk_header->counterOffset + sizeof(SkCounterInfo) * (get_coreid() & 0x00FF) );
counterInfo->launch = static_cast<uint8_t>(SkOpTraceType::SK_ENTRY_LAUNCHED);
dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
}
uint64_t cond = static_cast<uint64_t>(SkOpTraceType::SK_ENTRY_LAUNCHED);
uint64_t modelRIIdAndSkScopeId = sk_header->modelRIIdAndSkScopeId;
cond = modelRIIdAndSkScopeId | cond;
set_cond(cond);
__gm__ void* eventGmAddr = nullptr;
uint64_t eventModelRI = 0;
uint32_t eventSkId = 0;
uint64_t startTime = 0;
uint64_t endTime = 0;
uint64_t skStartTime = 0;
uint64_t skEndTime = 0;
uint8_t skBlockNum = AscendC::GetBlockNum();
uint32_t coreSizeBuffer = 0;
if constexpr(enable_profiling){
// 读取事件配置信息
__gm__ SkEventConfig* eventConfig = nullptr;
bool eventEnabled = false;
if (sk_header->eventConfigOffset != 0) {
eventConfig = (__gm__ SkEventConfig*)((const __gm__ uint8_t *)sk_header + sk_header->eventConfigOffset);
if (eventConfig->enabled != 0) {
eventGmAddr = (__gm__ void*)eventConfig->eventGmAddr;
eventModelRI = eventConfig->modelRI;
eventSkId = eventConfig->skId;
eventEnabled = eventConfig->enabled;
coreSizeBuffer = eventConfig->coreSize;
// 计算当前 core 的 GM 地址
uint32_t coreId = get_coreid() & 0x00FF;
eventGmAddr = (__gm__ void*)((__gm__ char*)eventGmAddr + coreId * coreSizeBuffer);
}
}
skStartTime = get_sys_cnt();
}
uint32_t que_offset = 0;
if ASCEND_IS_AIC {
que_offset = sk_header->aicQueOffset + (((uint8_t)get_coreid()) % 4) * sk_header->aicQueSize;
if constexpr(enable_profiling){
if (aic != 0) {
skBlockNum = AscendC::GetBlockNum() * aic;
}
}
}
if ASCEND_IS_AIV {
que_offset = sk_header->aivQueOffset + (((uint8_t)get_coreid()) % 4) * sk_header->aivQueSize;
if constexpr(enable_profiling){
if (aiv != 0) {
skBlockNum = AscendC::GetBlockNum() * aiv;
}
}
}
const __gm__ TaskQue *taskQue = (const __gm__ TaskQue *)((const __gm__ uint8_t *)sk_header + que_offset);
SkKernelType preKernelType = SkKernelType::DEFAULT;
for (auto i = 0; i < taskQue->taskCnt; i++) {
const __gm__ TaskInfo *task = taskQue->taskInfos + i;
if (task->type == SkTaskType::TYPE_PRELOAD) {
auto blockId = AscendC::GetBlockIdx();
if (blockId < task->numBlocks) {
preload((const void *)(task->entry[((uint8_t)get_coreid()) % task->entryCnt]), task->args);
dc_preload((__gm__ uint64_t *)(task->reserved), 0);
dc_preload((__gm__ uint64_t *)(task->reserved) + 8, 0);
dc_preload((__gm__ uint64_t *)(task + 1), 0);
}
} else if (task->type == SkTaskType::TYPE_FUNC) {
auto blockId = AscendC::GetBlockIdx();
if (blockId < task->numBlocks) {
if constexpr(enable_profiling){
startTime = get_sys_cnt();
}
if constexpr(enable_op_trace){
counterInfo->index = task->index;
counterInfo->launch = static_cast<uint8_t>(SkOpTraceType::OP_LAUNCHED);
dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
}
uint64_t cond = static_cast<uint64_t>(SkOpTraceType::OP_LAUNCHED) + (static_cast<uint64_t>(task->index) << 8);
cond = modelRIIdAndSkScopeId | cond;
set_cond(cond);
sk::SkSystemArgs sysArgs = {
static_cast<uint16_t>(AscendC::GetBlockIdx()),
task->numBlocks
};
// first task does not require synchronization settings
if (preKernelType != SkKernelType::DEFAULT) {
sysArgs.SkSetTaskSyncCfg(sk::GetSyncCombinationValue(preKernelType, task->originType));
}
if ((task->debugOptions & 0x4) != 0) {
dcci(nullptr, 1, 2);
}
((sk_sub_func)(task->entry[((uint8_t)get_coreid()) % task->entryCnt]))(
(const __gm__ void *)task->args,
&sysArgs
);
// 执行post-kernel dcci的条件:
// 1. 设置了DCCI_AFTER_KERNEL_END (bit 0x8)
// 2. 或者没有设置DCCI_DISABLE_ON_KERNEL (bit 0x1)
// 这确保了即使同时设置了disable和after_kernel_end,也会执行dcci
if ((task->debugOptions & 0x8) != 0 || (task->debugOptions & 0x1) == 0) {
dcci(nullptr, 1, 2);
}
if constexpr(enable_op_trace){
counterInfo->launch = static_cast<uint8_t>(SkOpTraceType::OP_FINISHED);
dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
}
cond = static_cast<uint64_t>(SkOpTraceType::OP_FINISHED) + (static_cast<uint64_t>(task->index) << 8);
cond = modelRIIdAndSkScopeId | cond;
set_cond(cond);
if constexpr(enable_profiling){
endTime = get_sys_cnt();
AscendC::WriteEventRecord(eventGmAddr, eventModelRI, eventSkId,
task->index, task->numBlocks, startTime, endTime, coreSizeBuffer);
}
}
if constexpr(enable_op_trace){
constexpr bool is_aic_or_aiv_only = (aic == 1 && aiv == 0) || (aic == 0 && aiv == 1);
if constexpr (!is_aic_or_aiv_only) {
bool is_mix_kernel = (task->originType == SkKernelType::MIX_AIC_1_1) || (task->originType == SkKernelType::MIX_AIC_1_2);
if (((task->debugOptions & 0x10) != 0) && is_mix_kernel) {
test_cross_core_sync_flags();
}
}
}
} else if (task->type == SkTaskType::TYPE_SYNC) {
if constexpr(enable_debug) {
if ((task->debugOptions & 0x2) != 0) {
if constexpr (aic == 1 && aiv == 0) {
ffts_cross_core_sync(PIPE_FIX, AscendC::GetffstMsg(0x0, AscendC::SYNC_AIC_FLAG));
wait_flag_dev(AscendC::SYNC_AIC_FLAG);
} else if constexpr (aic == 0 && aiv==1) {
ffts_cross_core_sync(PIPE_MTE3, AscendC::GetffstMsg(0x0, AscendC::SYNC_AIV_ONLY_ALL));
wait_flag_dev(AscendC::SYNC_AIV_ONLY_ALL);
} else {
AscendC::SyncAll<false>();
}
} else {
AscendC::AutoCoreSyncImpl<aic, aiv>((SkCoreSyncType)task->args);
}
} else{
AscendC::AutoCoreSyncImpl<aic, aiv>((SkCoreSyncType)task->args);
}
} else if (task->type == SkTaskType::TYPE_EVENT_NOTIFY) {
GM_ADDR eventAddr = reinterpret_cast<GM_ADDR>(task->args);
const uint64_t eventValue = task->entry[0];
if ASCEND_IS_AIC {
AscendC::NotifyFunc<true>(eventAddr, eventValue);
}
if ASCEND_IS_AIV {
AscendC::NotifyFunc<false>(eventAddr, eventValue);
}
} else if (task->type == SkTaskType::TYPE_EVENT_WAIT) {
GM_ADDR eventAddr = reinterpret_cast<GM_ADDR>(task->args);
const uint64_t eventValue = task->entry[0];
const uint32_t waitFlag = static_cast<uint32_t>(task->reserved);
if ASCEND_IS_AIC {
AscendC::WaitFunc<true>(eventAddr, eventValue, waitFlag);
}
if ASCEND_IS_AIV {
AscendC::WaitFunc<false>(eventAddr, eventValue, waitFlag);
}
} else if (task->type == SkTaskType::TYPE_EVENT_RESET) {
GM_ADDR eventAddr = reinterpret_cast<GM_ADDR>(task->args);
const uint64_t eventValue = task->entry[0];
if ASCEND_IS_AIC {
AscendC::ResetFunc<true>(eventAddr, eventValue);
}
if ASCEND_IS_AIV {
AscendC::ResetFunc<false>(eventAddr, eventValue);
}
}
preKernelType = task->originType;
}
if constexpr(enable_profiling){
skEndTime = get_sys_cnt();
AscendC::WriteEventRecord(eventGmAddr, eventModelRI, eventSkId,
UINT32_MAX_VAL, skBlockNum, skStartTime, skEndTime, coreSizeBuffer);
}
pipe_barrier(PIPE_ALL);
if constexpr(enable_op_trace){
counterInfo->launch = static_cast<uint8_t>(SkOpTraceType::SK_ENTRY_FINISHED);
dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
}
cond = static_cast<uint64_t>(SkOpTraceType::SK_ENTRY_FINISHED);
cond = modelRIIdAndSkScopeId | cond;
set_cond(cond);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, true, true>(skDevArgs);
}