#include "kernel_operator.h"
#include "sk_common.h"
#include "sk_flag_diag.h"
#include <cstdio>
typedef void (*sk_sub_func)(const __gm__ void *param, const sk::SkSystemArgs *sysArgs);
// 事件记录常量
constexpr uint32_t SK_KERNEL_EVENT_RECORD_SIZE = sizeof(SkKernelEventRecord);
constexpr uint32_t UINT32_MAX_VAL = 0xFFFFFFFF;
namespace sk {
// 写入事件记录到 GM
__aicore__ inline void WriteEventRecord(__gm__ void* gmBase, uint64_t modelIdIndex, uint32_t skId, uint32_t nodeId,
uint32_t blockNum, uint64_t startTime, uint64_t endTime, uint32_t coreSizeBuffer)
{
if (gmBase == nullptr) {
return;
}
__gm__ SkKernelEventCoreBuf* coreBuf = (__gm__ SkKernelEventCoreBuf*)gmBase;
// core地址增加偏移
uint32_t oldOffset = coreBuf->offset;
if (oldOffset == 0) {
oldOffset = sizeof(SkKernelEventCoreBuf); // 初始化偏移为结构体大小
}
// 检查是否超出 1MB 限制
if (oldOffset + SK_KERNEL_EVENT_RECORD_SIZE > coreSizeBuffer) {
// 空间已满,不再写入
return;
}
// 写入记录
__gm__ SkKernelEventRecord* record = (__gm__ SkKernelEventRecord*)((__gm__ char*)gmBase + oldOffset);
record->modelIdIndex = modelIdIndex;
record->skId = skId;
record->nodeId = nodeId;
record->blockIdx = static_cast<uint8_t>(AscendC::GetBlockIdx());
record->blockNum = static_cast<uint8_t>(blockNum);
record->startTime = startTime;
record->endTime = endTime;
// 更新偏移
coreBuf->offset = oldOffset + SK_KERNEL_EVENT_RECORD_SIZE;
}
template<bool aic_flag>
__aicore__ inline void NotifyFunc(GM_ADDR param, uint64_t value) {
if constexpr(aic_flag) {
if (get_block_idx() ==0) {
__gm__ uint64_t *notifyLock = reinterpret_cast<__gm__ uint64_t *>(param);
*notifyLock = value;
dcci(notifyLock, 0, 2);
}
} else {
if (AscendC::GetBlockIdx() == 0) {
__gm__ uint64_t *notifyLock = reinterpret_cast<__gm__ uint64_t *>(param);
*notifyLock = value;
dcci(notifyLock, 0, 2);
}
}
}
template<bool aic_flag>
__aicore__ inline void WaitFunc(GM_ADDR param, uint64_t value, uint32_t flag) {
if constexpr(aic_flag) {
if (get_block_idx() ==0) {
__gm__ volatile uint64_t *waitLock = reinterpret_cast<__gm__ uint64_t *>(param);
if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::GEQ)) {
dcci(waitLock, 0, 2);
while (*waitLock < value) {
dcci(waitLock, 0, 2);
}
} else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::EQ)) {
dcci(waitLock, 0, 2);
while (*waitLock != value) {
dcci(waitLock, 0, 2);
}
} else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::AND)) {
dcci(waitLock, 0, 2);
while ((*waitLock & value) == 0) {
dcci(waitLock, 0, 2);
}
} else {
dcci(waitLock, 0, 2);
while ((~(*waitLock | value)) == 0) {
dcci(waitLock, 0, 2);
}
}
}
} else {
if (AscendC::GetBlockIdx() == 0) {
__gm__ volatile uint64_t *waitLock = reinterpret_cast<__gm__ uint64_t *>(param);
if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::GEQ)) {
dcci(waitLock, 0, 2);
while (*waitLock < value) {
dcci(waitLock, 0, 2);
}
} else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::EQ)) {
dcci(waitLock, 0, 2);
while (*waitLock != value) {
dcci(waitLock, 0, 2);
}
} else if (flag == static_cast<uint32_t>(SkMemoryWaitFlag::AND)) {
dcci(waitLock, 0, 2);
while ((*waitLock & value) == 0) {
dcci(waitLock, 0, 2);
}
} else {
dcci(waitLock, 0, 2);
while ((~(*waitLock | value)) == 0) {
dcci(waitLock, 0, 2);
}
}
}
}
}
template<bool aic_flag>
__aicore__ inline void ResetFunc(GM_ADDR param, uint64_t value) {
if constexpr(aic_flag) {
if (get_block_idx() ==0) {
__gm__ uint64_t *resetLock = reinterpret_cast<__gm__ uint64_t *>(param);
*resetLock = value;
dcci(resetLock, 0, 2);
}
} else {
if (AscendC::GetBlockIdx() == 0) {
__gm__ uint64_t *resetLock = reinterpret_cast<__gm__ uint64_t *>(param);
*resetLock = value;
dcci(resetLock, 0, 2);
}
}
}
template <uint8_t aic, uint8_t aiv>
__aicore__ inline void FullCoreSyncImpl()
{
if constexpr (aic == 1 && aiv == 0) {
AscendC::CrossCoreSetFlag<0x0, PIPE_FIX>(AscendC::SYNC_AIC_FLAG);
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_FLAG);
} else if constexpr (aic == 0 && aiv == 1) {
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(AscendC::SYNC_AIV_ONLY_ALL);
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_ONLY_ALL);
} else {
if ASCEND_IS_AIC {
AscendC::CrossCoreSetFlag<0x0, PIPE_FIX>(AscendC::SYNC_AIC_FLAG);
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_FLAG);
AscendC::CrossCoreSetFlag<0x02, PIPE_FIX>(AscendC::SYNC_AIC_AIV_FLAG);
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_FLAG);
}
if ASCEND_IS_AIV {
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(AscendC::SYNC_AIV_ONLY_ALL);
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_ONLY_ALL);
AscendC::CrossCoreSetFlag<0x02, PIPE_MTE3>(AscendC::SYNC_AIV_FLAG);
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_AIV_FLAG);
}
}
}
template <uint8_t aic, uint8_t aiv>
__aicore__ inline void AutoCoreSyncImpl(SkCoreSyncType syncType) {
switch (syncType) {
case SkCoreSyncType::CROSS_SYNC_AIC_TO_AIC:
if ASCEND_IS_AIC {
AscendC::CrossCoreSetFlag<0x0, PIPE_FIX>(AscendC::SYNC_AIC_FLAG);
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_FLAG);
}
return;
case SkCoreSyncType::CROSS_SYNC_AIV_TO_AIV:
if ASCEND_IS_AIV {
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(AscendC::SYNC_AIV_ONLY_ALL);
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_ONLY_ALL);
}
return;
case SkCoreSyncType::INTER_SYNC_SET_AIC_TO_AIV:
if ASCEND_IS_AIC {
AscendC::CrossCoreSetFlag<0x02, PIPE_FIX>(AscendC::SYNC_AIC_AIV_FLAG);
}
return;
case SkCoreSyncType::INTER_SYNC_SET_AIV_TO_AIC:
if ASCEND_IS_AIV {
AscendC::CrossCoreSetFlag<0x02, PIPE_MTE3>(AscendC::SYNC_AIV_FLAG);
}
return;
case SkCoreSyncType::INTER_SYNC_WAIT_AIC_TO_AIV:
if ASCEND_IS_AIV {
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_AIV_FLAG);
}
return;
case SkCoreSyncType::INTER_SYNC_WAIT_AIV_TO_AIC:
if ASCEND_IS_AIC {
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_FLAG);
}
return;
case SkCoreSyncType::ALL_SYNC:
FullCoreSyncImpl<aic, aiv>();
return;
default:
AscendC::Trap();
return;
}
}
template <uint8_t aic, uint8_t aiv>
__aicore__ inline void AutoCoreSyncImpl(SkCoreSyncType syncType, uint8_t numBlocks, uint64_t syncConfig)
{
if (syncConfig == 0) {
AutoCoreSyncImpl<aic, aiv>(syncType);
return;
}
if (AscendC::GetBlockIdx() < numBlocks) {
return;
}
if ASCEND_IS_AIC {
if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIC_TO_AIC_SET)) != 0) {
AscendC::CrossCoreSetFlag<0x0, PIPE_FIX>(AscendC::SYNC_AIC_FLAG);
}
if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIC_TO_AIC_WAIT)) != 0) {
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_FLAG);
}
if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIC_TO_AIV_SET)) != 0) {
AscendC::CrossCoreSetFlag<0x02, PIPE_FIX>(AscendC::SYNC_AIC_AIV_FLAG);
}
if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIV_TO_AIC_WAIT)) != 0) {
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_FLAG);
}
}
if ASCEND_IS_AIV {
if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIV_TO_AIV_SET)) != 0) {
AscendC::CrossCoreSetFlag<0x0, PIPE_MTE3>(AscendC::SYNC_AIV_ONLY_ALL);
}
if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIV_TO_AIV_WAIT)) != 0) {
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIV_ONLY_ALL);
}
if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIV_TO_AIC_SET)) != 0) {
AscendC::CrossCoreSetFlag<0x02, PIPE_MTE3>(AscendC::SYNC_AIV_FLAG);
}
if ((syncConfig & static_cast<uint64_t>(SkEarlyStartMask::AIC_TO_AIV_WAIT)) != 0) {
AscendC::CrossCoreWaitFlag(AscendC::SYNC_AIC_AIV_FLAG);
}
}
}
} // namespace sk
template <uint8_t aic, uint8_t aiv, bool enable_debug = false, bool enable_profiling = false,
bool enable_op_trace = false, bool enable_early_start = false>
__aicore__ inline void spk_entry_impl(GM_ADDR skDevArgs) {
const __gm__ SkHeaderInfo *sk_header = (const __gm__ SkHeaderInfo *)(get_para_base());
// op_trace counter info declaration
__gm__ SkCounterInfo *counterInfo = nullptr;
if constexpr(enable_op_trace){
counterInfo = (__gm__ SkCounterInfo *)((const __gm__ uint8_t *)sk_header + sk_header->counterOffset + sizeof(SkCounterInfo) * (get_coreid() & 0x00FF) );
counterInfo->opState = static_cast<uint8_t>(SkOpTraceType::SK_ENTRY_LAUNCHED);
dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
}
uint64_t cond = static_cast<uint64_t>(SkOpTraceType::SK_ENTRY_LAUNCHED);
uint64_t modelIdIndexAndSkScopeId = sk_header->modelIdIndexAndSkScopeId;
cond = modelIdIndexAndSkScopeId | cond;
set_cond(cond);
__gm__ void* eventGmAddr = nullptr;
uint64_t eventModelIdIndex = 0;
uint32_t eventSkId = 0;
uint64_t startTime = 0;
uint64_t endTime = 0;
uint64_t skStartTime = 0;
uint64_t skEndTime = 0;
uint8_t skBlockNum = AscendC::GetBlockNum();
uint32_t coreSizeBuffer = 0;
uint8_t eventEnabled = 0;
if constexpr(enable_profiling){
// 读取事件配置信息
__gm__ SkEventConfig* eventConfig = nullptr;
if (sk_header->eventConfigOffset != 0) {
eventConfig = (__gm__ SkEventConfig*)((const __gm__ uint8_t *)sk_header + sk_header->eventConfigOffset);
if (eventConfig->enabled != 0) {
eventGmAddr = (__gm__ void*)eventConfig->eventGmAddr;
eventModelIdIndex = eventConfig->modelIdIndex;
eventSkId = eventConfig->skId;
eventEnabled = eventConfig->enabled;
coreSizeBuffer = eventConfig->coreSize;
// 计算当前 core 的 GM 地址
uint32_t coreId = get_coreid() & 0x00FF;
eventGmAddr = (__gm__ void*)((__gm__ char*)eventGmAddr + coreId * coreSizeBuffer);
}
}
skStartTime = get_sys_cnt();
}
uint32_t que_offset = 0;
if ASCEND_IS_AIC {
que_offset = sk_header->aicQueOffset + (((uint8_t)get_coreid()) % 4) * sk_header->aicQueSize;
if constexpr(enable_profiling){
if (aic != 0) {
skBlockNum = AscendC::GetBlockNum() * aic;
}
}
}
if ASCEND_IS_AIV {
que_offset = sk_header->aivQueOffset + (((uint8_t)get_coreid()) % 4) * sk_header->aivQueSize;
if constexpr(enable_profiling){
if (aiv != 0) {
skBlockNum = AscendC::GetBlockNum() * aiv;
}
}
}
const __gm__ TaskQue *taskQue = (const __gm__ TaskQue *)((const __gm__ uint8_t *)sk_header + que_offset);
for (auto i = 0; i < taskQue->taskCnt; i++) {
const __gm__ TaskInfo *task = taskQue->taskInfos + i;
if (task->type == SkTaskType::TYPE_PRELOAD) {
auto blockId = AscendC::GetBlockIdx();
if (blockId < task->numBlocks) {
preload((const void *)(task->entry[((uint8_t)get_coreid()) % task->entryCnt]), task->args);
dc_preload((__gm__ uint64_t *)(task->reserved), 0);
dc_preload((__gm__ uint64_t *)(task->reserved) + 8, 0);
dc_preload((__gm__ uint64_t *)(task + 1), 0);
}
} else if (task->type == SkTaskType::TYPE_FUNC) {
auto blockId = AscendC::GetBlockIdx();
if (blockId < task->numBlocks) {
if constexpr(enable_profiling){
if (eventEnabled) {
startTime = get_sys_cnt();
}
}
if constexpr(enable_op_trace){
counterInfo->index = task->index;
counterInfo->opState = static_cast<uint8_t>(SkOpTraceType::OP_LAUNCHED);
dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
}
uint64_t cond = static_cast<uint64_t>(SkOpTraceType::OP_LAUNCHED) + (static_cast<uint64_t>(task->index) << 8);
cond = modelIdIndexAndSkScopeId | cond;
set_cond(cond);
sk::SkSystemArgs sysArgs = {
static_cast<uint16_t>(blockId),
static_cast<uint16_t>(task->numBlocks),
static_cast<uint16_t>(task->reserved), // reserved field used for early start sync config
};
if ((task->debugOptions & 0x4) != 0) {
dcci(nullptr, 1, 2);
}
((sk_sub_func)(task->entry[((uint8_t)get_coreid()) % task->entryCnt]))(
(const __gm__ void *)task->args,
&sysArgs
);
// 执行post-kernel dcci: 直接检查enable_dcci_after_func (bit 5 = 0x20)
// 该bit由host端预先计算,综合了disableDcci和afterKernelEnd的逻辑
// kernel侧无需组合判断,简化判断流程
if ((task->debugOptions & 0x20) != 0) {
dcci(nullptr, 1, 2);
}
if constexpr(enable_op_trace){
counterInfo->opState = static_cast<uint8_t>(SkOpTraceType::OP_FINISHED);
dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
}
cond = static_cast<uint64_t>(SkOpTraceType::OP_FINISHED) + (static_cast<uint64_t>(task->index) << 8);
cond = modelIdIndexAndSkScopeId | cond;
set_cond(cond);
if constexpr(enable_profiling){
if (eventEnabled) {
endTime = get_sys_cnt();
sk::WriteEventRecord(eventGmAddr, eventModelIdIndex, eventSkId,
task->index, task->numBlocks, startTime, endTime, coreSizeBuffer);
}
}
}
if constexpr(enable_op_trace){
constexpr bool is_aic_or_aiv_only = (aic == 1 && aiv == 0) || (aic == 0 && aiv == 1);
if constexpr (!is_aic_or_aiv_only) {
bool is_mix_kernel = (task->relatedType == SkKernelType::MIX_AIC_1_1) ||
(task->relatedType == SkKernelType::MIX_AIC_1_2);
if (((task->debugOptions & 0x10) != 0) && is_mix_kernel) {
test_cross_core_sync_flags();
}
}
}
} else if (task->type == SkTaskType::TYPE_SYNC) {
if constexpr(enable_debug) {
if ((task->debugOptions & 0x2) != 0) {
sk::FullCoreSyncImpl<aic, aiv>();
} else {
if constexpr(enable_early_start) {
sk::AutoCoreSyncImpl<aic, aiv>((SkCoreSyncType)task->args,
task->numBlocks, task->reserved);
} else {
sk::AutoCoreSyncImpl<aic, aiv>((SkCoreSyncType)task->args);
}
}
} else{
if constexpr(enable_early_start) {
sk::AutoCoreSyncImpl<aic, aiv>((SkCoreSyncType)task->args, task->numBlocks, task->reserved);
} else {
sk::AutoCoreSyncImpl<aic, aiv>((SkCoreSyncType)task->args);
}
}
} else if (task->type == SkTaskType::TYPE_EVENT_NOTIFY) {
GM_ADDR eventAddr = reinterpret_cast<GM_ADDR>(task->args);
const uint64_t eventValue = task->entry[0];
if ASCEND_IS_AIC {
sk::NotifyFunc<true>(eventAddr, eventValue);
}
if ASCEND_IS_AIV {
sk::NotifyFunc<false>(eventAddr, eventValue);
}
} else if (task->type == SkTaskType::TYPE_EVENT_WAIT) {
GM_ADDR eventAddr = reinterpret_cast<GM_ADDR>(task->args);
const uint64_t eventValue = task->entry[0];
const uint32_t waitFlag = static_cast<uint32_t>(task->reserved);
if ASCEND_IS_AIC {
sk::WaitFunc<true>(eventAddr, eventValue, waitFlag);
}
if ASCEND_IS_AIV {
sk::WaitFunc<false>(eventAddr, eventValue, waitFlag);
}
} else if (task->type == SkTaskType::TYPE_EVENT_RESET) {
GM_ADDR eventAddr = reinterpret_cast<GM_ADDR>(task->args);
const uint64_t eventValue = task->entry[0];
if ASCEND_IS_AIC {
sk::ResetFunc<true>(eventAddr, eventValue);
}
if ASCEND_IS_AIV {
sk::ResetFunc<false>(eventAddr, eventValue);
}
}
}
if constexpr(enable_profiling){
if (eventEnabled) {
skEndTime = get_sys_cnt();
sk::WriteEventRecord(eventGmAddr, eventModelIdIndex, eventSkId,
UINT32_MAX_VAL, skBlockNum, skStartTime, skEndTime, coreSizeBuffer);
}
}
pipe_barrier(PIPE_ALL);
if constexpr(enable_op_trace){
counterInfo->opState = static_cast<uint8_t>(SkOpTraceType::SK_ENTRY_FINISHED);
dcci(static_cast<__gm__ void *>(counterInfo), 0, 2);
}
cond = static_cast<uint64_t>(SkOpTraceType::SK_ENTRY_FINISHED);
cond = modelIdIndexAndSkScopeId | cond;
set_cond(cond);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, false>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_dump_profiling(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_dump_profiling_op_trace(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, false, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_dump_profiling_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_dump_profiling_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_dump_profiling_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_dump_profiling_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_dump_profiling_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_dump_profiling_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_dump_profiling_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_dump_profiling_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, true, false, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, false, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, false, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, false, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, false, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, false, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(0, 1) void sk_entry_aiv_debug_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<0, 1, true, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 0) void sk_entry_aic_debug_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 0, true, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 1) void sk_entry_mix11_debug_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 1, true, true, true, true>(skDevArgs);
}
extern "C" __global__ __attribute__((aligned(512))) __mix__(1, 2) void sk_entry_mix12_debug_dump_profiling_op_trace_early_start(GM_ADDR skDevArgs)
{
spk_entry_impl<1, 2, true, true, true, true>(skDevArgs);
}