* -------------------------------------------------------------------------
* This file is part of the MindStudio project.
* Copyright (c) 2025 Huawei Technologies Co.,Ltd.
*
* MindStudio is licensed under Mulan PSL v2.
* You can use this software according to the terms and conditions of the Mulan PSL v2.
* You may obtain a copy of Mulan PSL v2 at:
*
* http://license.coscl.org.cn/MulanPSL2
*
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
* EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
* MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
* See the Mulan PSL v2 for more details.
* -------------------------------------------------------------------------
*/
#ifndef PROFILER_SERVER_SOURCETEST_H
#define PROFILER_SERVER_SOURCETEST_H
#include <string>
const std::string FOREACH_TILLING_DEF_H_PATH = "/home/test/workspace/foreach/common/foreach/foreach_tiling_def.h";
const std::string FOREACH_TILLING_DEF_H = R"(
#ifndef __FOREACH_TILING_DEF_H__
#define __FOREACH_TILING_DEF_H__
#include <cstdint>
#include <cstring>
#include "kernel_tiling/kernel_tiling.h"
#ifdef __CCE_KT_TEST__
#define __aicore__
#else
#define __aicore__ [aicore]
#endif
inline __aicore__ int32_t AlignDiv32(int32_t n)
{
return ((n + 31) & ~31) / 32;
}
DTYPE_SCALAR float
constexpr uint16_t MAX_TENSOR_CONT = 256;
constexpr uint16_t MAX_CORE_CONT = 64;
#pragma pack(1)
struct ForeachCommonTilingData {
uint64_t inputsTensorUbSize = 0;
int64_t tensorDataCountList[256] = {};
uint16_t tensorStartList[64] = {};
uint16_t tensorEndList[64] = {};
int64_t tensorStartOffsetList[64] = {};
int64_t tensorEndOffsetList[64] = {};
};
#pragma pack()
COPY_ARR(arrA, arrB, count) \
for (uint16_t i = 0; i < count; i++) { \
arrA[i] = arrB[i]; \
}
CONVERT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer) \
__ubuf__ tilingStruct* tilingDataPointer = \
reinterpret_cast<__ubuf__ tilingStruct*>((__ubuf__ uint8_t*)(tilingPointer));
#ifdef __CCE_KT_TEST__
INIT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer) \
CONVERT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer);
#else
INIT_TILING_DATA(tilingStruct, tilingDataPointer, tilingPointer) \
__ubuf__ uint8_t* tilingUbPointer = (__ubuf__ uint8_t*)get_imm(0); \
copy_gm_to_ubuf(((__ubuf__ uint8_t*)(tilingUbPointer)), ((__gm__ uint8_t*)(tilingPointer)), \
0, 1, AlignDiv32(sizeof(tilingStruct)), 0, 0); \
CONVERT_TILING_DATA(tilingStruct, tilingDataPointer, tilingUbPointer); \
pipe_barrier(PIPE_ALL);
#endif
GET_TILING_DATA(tilingData, tilingPointer) \
ForeachCommonTilingData tilingData; \
INIT_TILING_DATA(ForeachCommonTilingData, tilingDataPointer, tilingPointer); \
(tilingData).inputsTensorUbSize = tilingDataPointer->inputsTensorUbSize; \
COPY_ARR((tilingData).tensorDataCountList, tilingDataPointer->tensorDataCountList, MAX_TENSOR_CONT) \
COPY_ARR((tilingData).tensorStartList, tilingDataPointer->tensorStartList, MAX_CORE_CONT) \
COPY_ARR((tilingData).tensorEndList, tilingDataPointer->tensorEndList, MAX_CORE_CONT) \
COPY_ARR((tilingData).tensorStartOffsetList, tilingDataPointer->tensorStartOffsetList, MAX_CORE_CONT) \
COPY_ARR((tilingData).tensorEndOffsetList, tilingDataPointer->tensorEndOffsetList, MAX_CORE_CONT)
#endif
)";
const std::string KERNEL_FOREACH_BASE_H_PATH =
"/home/test/workspace/foreach/common/foreach/op_kernel/kernel_foreach_base.h";
const std::string KERNEL_FOREACH_BASE_H = R"(
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2023-2023. All rights reserved.
*/
#ifndef KERNEL_FOREACH_BASE_H
#define KERNEL_FOREACH_BASE_H
#include <type_traits>
#include "kernel_operator.h"
namespace Common {
namespace OpKernel {
using namespace AscendC;
constexpr uint8_t COPY_SPACE_MULTIPLE = 9;
template <typename T>
class KernelForeachBase {
protected:
__aicore__ inline KernelForeachBase() {};
__aicore__ inline void Init(ForeachCommonTilingData* tilingData);
__aicore__ inline void ParseTilingData(ForeachCommonTilingData* tilingData);
__aicore__ inline __gm__ T* GetTensorAddr(uint16_t index, GM_ADDR tensorPtr);
template <typename T1, typename T2>
__aicore__ inline T1 CeilA2B(T1 a, T2 b) {
if (b == 0) {
return a;
}
return (a + b - 1) / b;
};
protected:
TPipe pipe;
int64_t blockIdx = 0;
// tiling params
uint64_t inputsTensorUbSize = 0;
int64_t* tensorDataCountList = nullptr;
uint16_t tensorStart = 0;
uint16_t tensorEnd = 0;
int64_t tensorStartOffset = 0;
int64_t tensorEndOffset = 0;
uint64_t totalTensorUbSize = 0;
uint32_t maxDataCount = 0;
uint32_t maxCastDataCount = 0;
};
template <typename T>
__aicore__ inline void KernelForeachBase<T>::Init(
ForeachCommonTilingData* tilingData) {
blockIdx = GetBlockIdx();
ParseTilingData(tilingData);
#if __CCE_AICORE__ == 220
if (std::is_same_v<T, bfloat16_t>) {
totalTensorUbSize = inputsTensorUbSize * COPY_SPACE_MULTIPLE;
maxDataCount = totalTensorUbSize / sizeof(T);
maxCastDataCount = inputsTensorUbSize / sizeof(float);
} else {
maxDataCount = inputsTensorUbSize / sizeof(T);
}
#else
maxDataCount = inputsTensorUbSize / sizeof(T);
#endif
}
template <typename T>
__aicore__ inline void KernelForeachBase<T>::ParseTilingData(
ForeachCommonTilingData* tilingData) {
inputsTensorUbSize = tilingData->inputsTensorUbSize;
tensorDataCountList = tilingData->tensorDataCountList;
tensorStart = tilingData->tensorStartList[blockIdx];
tensorEnd = tilingData->tensorEndList[blockIdx];
tensorStartOffset = tilingData->tensorStartOffsetList[blockIdx];
tensorEndOffset = tilingData->tensorEndOffsetList[blockIdx];
}
template <typename T>
__aicore__ inline __gm__ T* KernelForeachBase<T>::GetTensorAddr(uint16_t index, GM_ADDR tensorPtr) {
__gm__ uint64_t* dataAddr = reinterpret_cast<__gm__ uint64_t*>(tensorPtr);
uint64_t tensorPtrOffset = *dataAddr; // The offset of the data address from the first address.
// Moving 3 bits to the right means dividing by sizeof(uint64 t).
__gm__ uint64_t* retPtr = dataAddr + (tensorPtrOffset >> 3);
return reinterpret_cast<__gm__ T*>(*(retPtr + index));
}
} // namespace OpKernel
} // namespace Common
#endif // KERNEL_FOREACH_BASE_H
)";
const std::string FOREACH_SUB_SCALAR_LIST_CPP_PATH = "/home/test/workspace/foreach/foreach_sub_scalar_list.cpp";
const std::string FOREACH_SUB_SCALAR_LIST_CPP = R"(
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved.
* This file contains code of cpu debug and npu code. We read data from bin file
* and write result to file.
*/
#include "kernel_operator.h"
// op kernel building at build_out directory, it's not fully aligned with source code structure
// current op_kernel folder is absent in build_out directory, so the relative path to common has just one layer
#include "common/foreach/foreach_tiling_def.h"
#include "common/foreach/op_kernel/foreach_one_scalar_list_binary_level_zero_api.h"
using namespace AscendC;
using namespace Common::OpKernel;
extern __global__ __aicore__ void foreach_sub_scalar_list(GM_ADDR x, GM_ADDR scalar, GM_ADDR y,
GM_ADDR workspace, GM_ADDR tiling) {
GET_TILING_DATA(tilingData, tiling);
//foreach(vector) not need workspace
GM_ADDR userWS = nullptr;
ForeachOneScalarListBinaryLevelZeroApi<float, float, Sub, 2, 1> op;
op.Init(x, scalar, y, userWS, &tilingData);
op.Process();
if (TILING_KEY_IS(1)) {
ForeachOneScalarListBinaryLevelZeroApi<half, half, Sub, 2, 1> op;
op.Init(x, scalar, y, userWS, &tilingData);
op.Process();
} else if (TILING_KEY_IS(2)) {
ForeachOneScalarListBinaryLevelZeroApi<float, float, Sub, 2, 1> op;
op.Init(x, scalar, y, userWS, &tilingData);
op.Process();
} else if (TILING_KEY_IS(3)) {
ForeachOneScalarListBinaryLevelZeroApi<int, int, Sub, 2, 1> op;
op.Init(x, scalar, y, userWS, &tilingData);
op.Process();
}
#if __CCE_AICORE__ == 220
else if (TILING_KEY_IS(4)) {
ForeachOneScalarListBinaryLevelZeroApi<bfloat16_t, float, Sub, 2, 1> op;
op.Init(x, scalar, y, userWS, &tilingData);
op.Process();
}
#endif
}
)";
const std::string API_INSTR_JSON = R"(
{
"Cores" : ["core0.veccore0", "core0.veccore1", "core1.veccore0"],
"Instructions" : [{
"Address" : "0x1124400c",
"AscendC Inner Code" : "/home/test/workspace/foreach/foreach_sub_scalar_list.cpp:18",
"Cycles" : [1.0, 1.0, 1.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "ADD XD:X16=0x100000,XD:X16:0x100000,XM:X15=0,XM:X15:0,XN:X16=0x100000,XN:X16:0x100000,dtype:S64"
}, {
"Address" : "0x11244010",
"AscendC Inner Code" : "/home/test/workspace/foreach/foreach_sub_scalar_list.cpp:18",
"Cycles" : [1.0, 1.0, 1.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "MOV_XD_SPR SPR:COREID,XD:X15=0x18,XD:X15:0x18"
}, {
"Address" : "0x11244014",
"AscendC Inner Code" : "/home/test/workspace/foreach/foreach_sub_scalar_list.cpp:18",
"Cycles" : [1.0, 1.0, 1.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "MOV_XD_IMM IMM:0x7fff,XD:X17=0x7fff,XD:X17:0x7fff"
}, {
"Address" : "0x11244018",
"AscendC Inner Code" : "/home/test/workspace/foreach/foreach_sub_scalar_list.cpp:18",
"Cycles" : [1.0, 1.0, 1.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "AND XD:X15=0x18,XD:X15:0x18,XM:X17=0x7fff,XM:X17:0x7fff,XN:X15=0x18,XN:X15:0x18,dtype:B64"
}, {
"Address" : "0x1124401c",
"AscendC Inner Code" : "/home/test/workspace/foreach/foreach_sub_scalar_list.cpp:18",
"Cycles" : [1.0, 1.0, 1.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "MOV_XD_IMM IMM:0x4000,XD:X17=0x4000,XD:X17:0x4000"
}, {
"Address" : "0x11244020",
"AscendC Inner Code" : "/home/test/workspace/foreach/foreach_sub_scalar_list.cpp:18",
"Cycles" : [1.0, 1.0, 1.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "MOV_XD_IMM IMM:0x1,XD:X0=0x1,XD:X0:0x1"
}, {
"Address" : "0x11244024",
"AscendC Inner Code" : "/home/test/workspace/foreach/foreach_sub_scalar_list.cpp:18",
"Cycles" : [3.0, 3.0, 3.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "MADD XD:X16=0x160000,XD:X16:0x160000,XM:X17=0x4000,XM:X17:0x4000,XN:X15=0x18,XN:X15:0x18,dtype:S64"
}, {
"Address" : "0x11244808",
"AscendC Inner Code" : "/home/test/workspace/foreach/common/foreach/foreach_tiling_def.h:32",
"Cycles" : [37.0, 37.0, 37.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "STI_XN_IMM #IMM_TYPE:ZERO,#POST:0,IMM:0,XN:X8=0x163f78,XN:X8:0x163f78,dtype:B64"
}, {
"Address" : "0x1124482c",
"AscendC Inner Code" : "/home/test/workspace/foreach/common/foreach/foreach_tiling_def.h:32",
"Cycles" : [1.0, 1.0, 1.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "ADD XD:X7=0x163f90,XD:X7:0x163f90,XM:X7=0xd10,XM:X7:0xd10,XN:X30=0x163280,XN:X30:0x163280,dtype:S64"
}, {
"Address" : "0x11244830",
"AscendC Inner Code" : "/home/test/workspace/foreach/common/foreach/foreach_tiling_def.h:32",
"Cycles" : [1.0, 1.0, 1.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "ADD XD:X8=0x163f98,XD:X8:0x163f98,XM:X8=0xd18,XM:X8:0xd18,XN:X30=0x163280,XN:X30:0x163280,dtype:S64"
}, {
"Address" : "0x11244834",
"AscendC Inner Code" : "/home/test/workspace/foreach/common/foreach/foreach_tiling_def.h:32",
"Cycles" : [299.0, 300.0, 296.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "STI_XN_IMM #IMM_TYPE:ZERO,#POST:0,IMM:0,XN:X7=0x163f90,XN:X7:0x163f90,dtype:B64"
}, {
"Address" : "0x11244838",
"AscendC Inner Code" : "/home/test/workspace/foreach/common/foreach/foreach_tiling_def.h:32",
"Cycles" : [299.0, 300.0, 296.0],
"Instructions Executed" : [1, 1, 1],
"Pipe" : "SCALAR",
"Source" : "STI_XN_IMM #IMM_TYPE:ZERO,#POST:0,IMM:0,XN:X8=0x163f98,XN:X8:0x163f98,dtype:B64"
}
]
}
)";
const std::string API_FILE_JSON = R"(
{
"Cores" : ["core0.veccore0", "core0.veccore1", "core1.veccore0"],
"Files" : [{
"Lines" : [{
"Address Range" : [["0x11244240", "0x11244240"], ["0x11244248", "0x1124453c"]],
"Cycles" : [15674.0, 15715.0, 15707.0],
"Instructions Executed" : [192, 192, 192],
"Line" : 31
}, {
"Address Range" : [["0x11244540", "0x11244540"], ["0x11244548", "0x11244844"]],
"Cycles" : [16293.0, 16335.0, 16309.0],
"Instructions Executed" : [193, 193, 193],
"Line" : 32
}
],
"Source" : "/home/test/workspace/foreach/common/foreach/foreach_tiling_def.h"
}, {
"Lines" : [{
"Address Range" : [["0x11244df0", "0x11244df0"], ["0x11244e38", "0x11244e38"]],
"Cycles" : [729.0, 727.0, 731.0],
"Instructions Executed" : [2, 2, 2],
"Line" : 85
}, {
"Address Range" : [["0x11244e0c", "0x11244e10"], ["0x11244e40", "0x11244e44"]],
"Cycles" : [4.0, 4.0, 4.0],
"Instructions Executed" : [4, 4, 4],
"Line" : 87
}, {
"Address Range" : [["0x11244e18", "0x11244e18"], ["0x11244e48", "0x11244e48"]],
"Cycles" : [727.0, 727.0, 729.0],
"Instructions Executed" : [2, 2, 2],
"Line" : 88
}
],
"Source" : "/home/test/workspace/foreach/common/foreach/op_kernel/kernel_foreach_base.h"
}, {
"Lines" : [{
"Address Range" : [["0x11244000", "0x11244060"]],
"Cycles" : [59.0, 59.0, 59.0],
"Instructions Executed" : [25, 25, 25],
"Line" : 18
}, {
"Address Range" : [["0x11244064", "0x11244848"], ["0x1124485c", "0x11244898"]],
"Cycles" : [264407.0, 264702.0, 264186.0],
"Instructions Executed" : [7489, 7489, 7489],
"Line" : 20
}, {
"Address Range" : [["0x11244954", "0x11244a90"], ["0x11244ca4", "0x11244ca8"]],
"Cycles" : [15703.5, 15723.5, 15738.5],
"Instructions Executed" : [82, 82, 82],
"Line" : 25
}, {
"Address Range" : [["0x11244a94", "0x11244c08"], ["0x11244c10", "0x11244c94"]],
"Cycles" : [7767.0, 7496.0, 7583.0],
"Instructions Executed" : [128, 128, 128],
"Line" : 26
}
],
"Source" : "/home/test/workspace/foreach/foreach_sub_scalar_list.cpp"
}
]
}
)";
#endif