* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file mstx_local_tensor_info.h
* \brief
*/
#if !defined(__ASCENDC_INCLUDE_INTERNAL_HEADERS__)
#pragma message("impl/basic_api/mstx_local_tensor_info.h is an internal header file and must not be used directly. Functions or variables defined in this file may be removed in the future. Please use \"#include \"basic_api/kernel_tensor.h\"\" and use public functions or variables defined in interface headers files.")
#define __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#define __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_MSTX_LOCAL_TENSOR_INFO_H__
#endif
#ifndef MSTX_TENSOR_INFO_H
#define MSTX_TENSOR_INFO_H
#include "kernel_tensor.h"
#include "kernel_struct_unary.h"
#include "kernel_struct_binary.h"
#include "kernel_struct_brcb.h"
#include "kernel_struct_gather.h"
#ifdef __MSTX_DFX_REPORT__
namespace AscendC {
namespace MstxTensor {
enum MstxTensorAddressSpace {
MSTX_TENSOR_AT_PRIVATE = 0,
MSTX_TENSOR_AT_GM,
MSTX_TENSOR_AT_L1,
MSTX_TENSOR_AT_L0A,
MSTX_TENSOR_AT_L0B,
MSTX_TENSOR_AT_L0C,
MSTX_TENSOR_AT_UB,
};
struct MstxTensorDesc {
MstxTensorAddressSpace space;
uint64_t addr;
uint64_t size;
uint8_t dataBits;
};
template <typename T>
__aicore__ inline MstxTensorDesc From(const LocalTensor<T>& dst) {
MstxTensorAddressSpace space;
Hardware dstHWPos = GetPhyType((TPosition)dst.GetPosition());
if (dstHWPos == Hardware::GM) {
space = MSTX_TENSOR_AT_GM;
} else if (dstHWPos == Hardware::UB) {
space = MSTX_TENSOR_AT_UB;
} else if (dstHWPos == Hardware::L1) {
space = MSTX_TENSOR_AT_L1;
} else if (dstHWPos == Hardware::L0A) {
space = MSTX_TENSOR_AT_L0A;
} else if (dstHWPos == Hardware::L0B) {
space = MSTX_TENSOR_AT_L0B;
} else if (dstHWPos == Hardware::L0C) {
space = MSTX_TENSOR_AT_L0C;
} else {
space = MSTX_TENSOR_AT_PRIVATE;
}
uint8_t sizebit;
if constexpr (IsSameType<PrimT<T>, int4b_t>::value) {
sizebit = static_cast<uint8_t>(4 * sizeof(PrimT<T>));
} else {
sizebit = static_cast<uint8_t>(8 * sizeof(PrimT<T>));
}
return MstxTensorDesc{
space,
reinterpret_cast<uint64_t>(dst.GetPhyAddr()),
dst.GetSize(),
sizebit
};
}
template <typename T>
__aicore__ inline MstxTensorDesc FromGm(const GlobalTensor<T>& dst) {
uint8_t sizebit;
if constexpr (IsSameType<PrimT<T>, int4b_t>::value) {
sizebit = static_cast<uint8_t>(4 * sizeof(PrimT<T>));
} else {
sizebit = static_cast<uint8_t>(8 * sizeof(PrimT<T>));
}
return MstxTensorDesc{
MSTX_TENSOR_AT_GM,
reinterpret_cast<uint64_t>(dst.GetPhyAddr()),
0,
sizebit
};
}
enum class MstxReportType : uint32_t {
MSTX_VEC_UNARY = 3000,
MSTX_VEC_BINARY = 3001,
MSTX_VEC_GATHER = 3002,
MSTX_VEC_BINARY_SCALAR,
MSTX_VEC_BILINEAR_INTERPOLATION,
MSTX_VEC_TENARY,
MSTX_VEC_CAST,
MSTX_VEC_CASTDEQ,
MSTX_VEC_SET_DEQ_SCALE,
MSTX_VEC_CMP,
MSTX_VEC_CMPS,
MSTX_VEC_SEL,
MSTX_VEC_GATHER_MASK,
MSTX_VEC_TRANSDATA,
MSTX_VEC_TRANSPOSE,
MSTX_VEC_WHOLE_REDUCE,
MSTX_VEC_BLK_REDUCE,
MSTX_VEC_REDUCE,
MSTX_VEC_PAIR_REDUCE,
MSTX_VEC_REPEAT_REDUCE,
MSTX_VEC_DUP,
MSTX_VEC_BROADCAST,
MSTX_VEC_VCI,
MSTX_VEC_COPY,
MSTX_DATA_COPY = 4001,
MSTX_DATA_COPY_PAD = 4002,
};
enum MstxMaskMode: uint32_t {
MSTX_MASK_NORM = 0,
MSTX_MASK_COUNT,
MSTX_MASK_FROM_REG = 0xff,
};
struct MstxVectorMask {
uint64_t mask0;
uint64_t mask1;
};
struct MstxVecWrapper {
MstxMaskMode maskMode;
MstxVectorMask mask;
uint32_t reserveBufSize;
bool useMask;
};
__aicore__ inline MstxVecWrapper WrapperFrom(MstxMaskMode maskMode, uint64_t mask0, uint64_t mask1, bool isSetMask, uint32_t reserveBufSize) {
return MstxVecWrapper{
maskMode,
{mask0, mask1},
reserveBufSize,
isSetMask
};
}
template <typename T>
__aicore__ inline MstxVecWrapper WrapperFrom(MstxMaskMode maskMode, uint64_t mask, bool isSetMask, uint32_t reserveBufSize) {
int32_t typeLen = 0;
constexpr int32_t halfTypeLen = 64;
constexpr int32_t lenCoeff = 2;
uint64_t trueMask0, trueMask1;
if constexpr (IsSameType<T, int4b_t>::value) {
typeLen = DEFAULT_BLOCK_SIZE * INT4_TWO;
} else {
typeLen = DEFAULT_BLOCK_SIZE / sizeof(T);
}
if (mask == halfTypeLen) {
trueMask0 = FULL_MASK;
trueMask1 = 0;
} else if (mask == typeLen || mask >= halfTypeLen * lenCoeff) {
trueMask0 = FULL_MASK;
trueMask1 = FULL_MASK;
} else {
trueMask0 = (mask > halfTypeLen) ? FULL_MASK : (((static_cast<uint64_t>(1)) << static_cast<uint32_t>(mask)) - 1);
trueMask1 = (mask > halfTypeLen) ? (((static_cast<uint64_t>(1)) << static_cast<uint32_t>(mask - halfTypeLen)) - 1) : 0;
}
return MstxVecWrapper{
maskMode,
{trueMask0, trueMask1},
reserveBufSize,
isSetMask
};
}
struct MstxVecUnaryDesc {
MstxTensorDesc dst;
MstxTensorDesc src;
MstxVecWrapper wrapper;
uint32_t blockNum;
uint32_t dstBlockStride;
uint32_t srcBlockStride;
uint32_t repeatTimes;
uint32_t dstRepeatStride;
uint32_t srcRepeatStride;
char name[64];
};
struct MstxVecReduceDesc {
MstxTensorDesc dst;
MstxTensorDesc src;
MstxVecWrapper wrapper;
uint32_t srcBlockStride;
uint32_t repeatTimes;
uint32_t dstRepeatStride;
uint32_t srcRepeatStride;
char name[64];
};
struct MstxVecBrcbDesc {
MstxTensorDesc dst;
MstxTensorDesc src;
MstxVecWrapper wrapper;
uint32_t dstBlockStride;
uint32_t repeatTimes;
uint32_t dstRepeatStride;
char name[64];
};
struct MstxVecComplexReduceDesc {
MstxTensorDesc dst;
MstxTensorDesc src;
MstxTensorDesc tmp;
MstxVecWrapper wrapper;
uint32_t repeatTimes;
uint32_t srcRepeatStride;
char name[64];
};
struct MstxVecDupDesc {
MstxTensorDesc dst;
MstxVecWrapper wrapper;
uint32_t dstBlockStride;
uint32_t repeatTimes;
uint32_t dstRepeatStride;
char name[64];
};
struct MstxVecCopy {
MstxTensorDesc dst;
MstxTensorDesc src;
MstxVecWrapper wrapper;
uint32_t repeatTimes;
uint32_t dstStride;
uint32_t srcStride;
uint32_t dstRepeatSize;
uint32_t srcRepeatSize;
char name[64];
};
struct MstxVecCastDeqDesc {
MstxTensorDesc dst;
MstxTensorDesc src;
MstxVecWrapper wrapper;
uint32_t blockNum;
uint32_t dstBlockStride;
uint32_t srcBlockStride;
uint32_t repeatTimes;
uint32_t dstRepeatStride;
uint32_t srcRepeatStride;
bool halfBlock;
char name[64];
};
struct MstxDataCopyDesc {
MstxTensorDesc dst;
MstxTensorDesc src;
uint32_t lenBurst;
uint32_t nBurst;
uint32_t srcGap;
uint32_t dstGap;
char name[64];
};
struct MstxDataCopyPadDesc {
MstxTensorDesc dst;
MstxTensorDesc src;
uint32_t lenBurst;
uint32_t nBurst;
uint32_t srcGap;
uint32_t dstGap;
uint32_t leftPad;
uint32_t rightPad;
char name[64];
};
struct MstxVecBinaryDesc {
MstxTensorDesc dst;
MstxTensorDesc src0;
MstxTensorDesc src1;
MstxVecWrapper wrapper;
uint32_t blockNum;
uint32_t dstBlockStride;
uint32_t src0BlockStride;
uint32_t src1BlockStride;
uint32_t repeatTimes;
uint32_t dstRepeatStride;
uint32_t src0RepeatStride;
uint32_t src1RepeatStride;
char name[64];
};
struct MstxVecBilinearInterpolation {
MstxTensorDesc dst;
MstxTensorDesc src0;
MstxTensorDesc src1;
MstxTensorDesc src0Offset;
MstxTensorDesc shared;
MstxVecWrapper wrapper;
uint32_t hRepeat;
bool repeatMode;
uint32_t dstBlockStride;
uint32_t vROffset;
uint32_t vRepeat;
char name[64];
};
struct MstxVecTranspose {
MstxTensorDesc dst;
MstxTensorDesc src;
MstxTensorDesc shared;
bool temp;
char name[64];
};
struct MstxVecSelDesc {
MstxTensorDesc dst;
MstxTensorDesc src0;
MstxTensorDesc src1;
MstxTensorDesc mask;
MstxVecWrapper wrapper;
bool scalarMode;
uint32_t blockNum;
uint32_t dstBlockStride;
uint32_t src0BlockStride;
uint32_t src1BlockStride;
uint32_t repeatTimes;
uint32_t dstRepeatStride;
uint32_t src0RepeatStride;
uint32_t src1RepeatStride;
char name[64];
};
enum class MstxGatherMaskMode {
V1,
V2
};
struct MstxVecGatherMaskDesc {
MstxTensorDesc dst;
MstxTensorDesc src;
MstxVecWrapper wrapper;
MstxGatherMaskMode mode;
uint32_t repeatTimes;
uint32_t src0BlockStride;
uint32_t src0RepeatStride;
uint32_t src1RepeatStride;
char name[64];
};
__aicore__ inline void CopyName(char b[64], __gm__ const char* a)
{
uint32_t i = 0;
for (; i < 63; ++i){
b[i] = a[i];
if (a[i] == '\0') {
break;
}
}
b[i] = '\0';
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, uint64_t mask0, uint64_t mask1,
const uint8_t repeatTime, const UnaryRepeatParams& repeatParams, __gm__ const char* name)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = repeatParams.blockNumber;
mstxVecUnaryDesc.repeatTimes = repeatTime;
mstxVecUnaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecUnaryDesc.srcBlockStride = repeatParams.srcBlkStride;
mstxVecUnaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecUnaryDesc.srcRepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_UNARY), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, uint64_t mask,
const uint8_t repeatTime, const UnaryRepeatParams& repeatParams, __gm__ const char* name)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = repeatParams.blockNumber;
mstxVecUnaryDesc.repeatTimes = repeatTime;
mstxVecUnaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecUnaryDesc.srcBlockStride = repeatParams.srcBlkStride;
mstxVecUnaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecUnaryDesc.srcRepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_UNARY), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, __gm__ const char* name,
const int32_t count)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = 8;
mstxVecUnaryDesc.repeatTimes = 1;
mstxVecUnaryDesc.dstBlockStride = 1;
mstxVecUnaryDesc.srcBlockStride = 1;
mstxVecUnaryDesc.dstRepeatStride = 8;
mstxVecUnaryDesc.srcRepeatStride = 8;
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_UNARY), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryTenaryInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, uint64_t mask0, uint64_t mask1,
const uint8_t repeatTime, const UnaryRepeatParams& repeatParams, __gm__ const char* name)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = repeatParams.blockNumber;
mstxVecUnaryDesc.repeatTimes = repeatTime;
mstxVecUnaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecUnaryDesc.srcBlockStride = repeatParams.srcBlkStride;
mstxVecUnaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecUnaryDesc.srcRepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_TENARY), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryTenaryInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, uint64_t mask,
const uint8_t repeatTime, const UnaryRepeatParams& repeatParams, __gm__ const char* name)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = repeatParams.blockNumber;
mstxVecUnaryDesc.repeatTimes = repeatTime;
mstxVecUnaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecUnaryDesc.srcBlockStride = repeatParams.srcBlkStride;
mstxVecUnaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecUnaryDesc.srcRepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_TENARY), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryTenaryInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, __gm__ const char* name,
const int32_t count)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = 8;
mstxVecUnaryDesc.repeatTimes = 1;
mstxVecUnaryDesc.dstBlockStride = 1;
mstxVecUnaryDesc.srcBlockStride = 1;
mstxVecUnaryDesc.dstRepeatStride = 8;
if constexpr (sizeof(T) > sizeof(U)) {
mstxVecUnaryDesc.srcRepeatStride = DEFAULT_REPEAT_STRIDE / 2 ;
} else {
mstxVecUnaryDesc.srcRepeatStride = DEFAULT_REPEAT_STRIDE;
}
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_TENARY), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryCastInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, uint64_t mask0, uint64_t mask1,
const uint8_t repeatTime, const UnaryRepeatParams& repeatParams, __gm__ const char* name)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = repeatParams.blockNumber;
mstxVecUnaryDesc.repeatTimes = repeatTime;
mstxVecUnaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecUnaryDesc.srcBlockStride = repeatParams.srcBlkStride;
mstxVecUnaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecUnaryDesc.srcRepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CAST), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryCastInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, uint64_t mask,
const uint8_t repeatTime, const UnaryRepeatParams& repeatParams, __gm__ const char* name)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = repeatParams.blockNumber;
mstxVecUnaryDesc.repeatTimes = repeatTime;
mstxVecUnaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecUnaryDesc.srcBlockStride = repeatParams.srcBlkStride;
mstxVecUnaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecUnaryDesc.srcRepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CAST), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryCastInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, __gm__ const char* name,
const int32_t count)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = 8;
mstxVecUnaryDesc.repeatTimes = 1;
mstxVecUnaryDesc.dstBlockStride = 1;
mstxVecUnaryDesc.srcBlockStride = 1;
if constexpr (sizeof(T) > sizeof(U)) {
if constexpr (IsSameType<U, int4b_t>::value) {
mstxVecUnaryDesc.dstRepeatStride = DEFAULT_REPEAT_STRIDE;
mstxVecUnaryDesc.srcRepeatStride = ONE_FOURTH_DEFAULT_REPEAT_STRIDE;
} else {
mstxVecUnaryDesc.dstRepeatStride = DEFAULT_REPEAT_STRIDE;
mstxVecUnaryDesc.srcRepeatStride = DEFAULT_REPEAT_STRIDE / 2;
}
} else if constexpr (sizeof(T) < sizeof(U)) {
if constexpr (IsSameType<T, int4b_t>::value) {
mstxVecUnaryDesc.dstRepeatStride = ONE_FOURTH_DEFAULT_REPEAT_STRIDE;
mstxVecUnaryDesc.srcRepeatStride = DEFAULT_REPEAT_STRIDE;
} else {
mstxVecUnaryDesc.dstRepeatStride = DEFAULT_REPEAT_STRIDE / 2;
mstxVecUnaryDesc.srcRepeatStride = DEFAULT_REPEAT_STRIDE;
}
} else {
mstxVecUnaryDesc.dstRepeatStride = DEFAULT_REPEAT_STRIDE;
mstxVecUnaryDesc.srcRepeatStride = DEFAULT_REPEAT_STRIDE;
}
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CAST), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryCastDeqInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, uint64_t mask0, uint64_t mask1,
const uint8_t repeatTime, const UnaryRepeatParams& repeatParams, __gm__ const char* name, bool halfBlock)
{
MstxVecCastDeqDesc mstxVecCastDeqDesc;
mstxVecCastDeqDesc.dst = From(dst);
mstxVecCastDeqDesc.src = From(src);
mstxVecCastDeqDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecCastDeqDesc.blockNum = repeatParams.blockNumber;
mstxVecCastDeqDesc.repeatTimes = repeatTime;
mstxVecCastDeqDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecCastDeqDesc.srcBlockStride = repeatParams.srcBlkStride;
mstxVecCastDeqDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecCastDeqDesc.srcRepeatStride = repeatParams.srcRepStride;
mstxVecCastDeqDesc.halfBlock = halfBlock;
CopyName(mstxVecCastDeqDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CASTDEQ), sizeof(mstxVecCastDeqDesc), &mstxVecCastDeqDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryCastDeqInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, uint64_t mask,
const uint8_t repeatTime, const UnaryRepeatParams& repeatParams, __gm__ const char* name, bool halfBlock)
{
MstxVecCastDeqDesc mstxVecCastDeqDesc;
mstxVecCastDeqDesc.dst = From(dst);
mstxVecCastDeqDesc.src = From(src);
mstxVecCastDeqDesc.wrapper = WrapperFrom<U>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecCastDeqDesc.blockNum = repeatParams.blockNumber;
mstxVecCastDeqDesc.repeatTimes = repeatTime;
mstxVecCastDeqDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecCastDeqDesc.srcBlockStride = repeatParams.srcBlkStride;
mstxVecCastDeqDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecCastDeqDesc.srcRepeatStride = repeatParams.srcRepStride;
mstxVecCastDeqDesc.halfBlock = halfBlock;
CopyName(mstxVecCastDeqDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CASTDEQ), sizeof(mstxVecCastDeqDesc), &mstxVecCastDeqDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryCastDeqInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, __gm__ const char* name,
const int32_t count, bool halfBlock)
{
MstxVecCastDeqDesc mstxVecCastDeqDesc;
mstxVecCastDeqDesc.dst = From(dst);
mstxVecCastDeqDesc.src = From(src);
mstxVecCastDeqDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), isSetMask, static_cast<uint32_t>(0));
mstxVecCastDeqDesc.blockNum = 8;
mstxVecCastDeqDesc.repeatTimes = 1;
mstxVecCastDeqDesc.dstBlockStride = 1;
mstxVecCastDeqDesc.srcBlockStride = 1;
mstxVecCastDeqDesc.dstRepeatStride = 8;
mstxVecCastDeqDesc.srcRepeatStride = 8;
if constexpr (sizeof(T) > sizeof(U)) {
if constexpr (IsSameType<U, int4b_t>::value) {
mstxVecCastDeqDesc.dstRepeatStride = DEFAULT_REPEAT_STRIDE;
mstxVecCastDeqDesc.srcRepeatStride = ONE_FOURTH_DEFAULT_REPEAT_STRIDE;
} else {
mstxVecCastDeqDesc.dstRepeatStride = DEFAULT_REPEAT_STRIDE;
mstxVecCastDeqDesc.srcRepeatStride = DEFAULT_REPEAT_STRIDE / 2;
}
} else if constexpr (sizeof(T) < sizeof(U)) {
if constexpr (IsSameType<T, int4b_t>::value) {
mstxVecCastDeqDesc.dstRepeatStride = ONE_FOURTH_DEFAULT_REPEAT_STRIDE;
mstxVecCastDeqDesc.srcRepeatStride = DEFAULT_REPEAT_STRIDE;
} else {
mstxVecCastDeqDesc.dstRepeatStride = DEFAULT_REPEAT_STRIDE / 2;
mstxVecCastDeqDesc.srcRepeatStride = DEFAULT_REPEAT_STRIDE;
}
} else {
mstxVecCastDeqDesc.dstRepeatStride = DEFAULT_REPEAT_STRIDE;
mstxVecCastDeqDesc.srcRepeatStride = DEFAULT_REPEAT_STRIDE;
}
mstxVecCastDeqDesc.halfBlock = halfBlock;
CopyName(mstxVecCastDeqDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CASTDEQ), sizeof(mstxVecCastDeqDesc), &mstxVecCastDeqDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryCmpsInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, uint64_t mask0, uint64_t mask1,
const uint8_t repeatTime, const UnaryRepeatParams& repeatParams, __gm__ const char* name)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = repeatParams.blockNumber;
mstxVecUnaryDesc.repeatTimes = repeatTime;
mstxVecUnaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecUnaryDesc.srcBlockStride = repeatParams.srcBlkStride;
mstxVecUnaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecUnaryDesc.srcRepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CMPS), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryCmpsInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, uint64_t mask,
const uint8_t repeatTime, const UnaryRepeatParams& repeatParams, __gm__ const char* name)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = repeatParams.blockNumber;
mstxVecUnaryDesc.repeatTimes = repeatTime;
mstxVecUnaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecUnaryDesc.srcBlockStride = repeatParams.srcBlkStride;
mstxVecUnaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecUnaryDesc.srcRepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CMPS), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U, bool isSetMask>
__aicore__ inline void GetMstxVecUnaryCmpsInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src, __gm__ const char* name,
const int32_t count)
{
MstxVecUnaryDesc mstxVecUnaryDesc;
mstxVecUnaryDesc.dst = From(dst);
mstxVecUnaryDesc.src = From(src);
mstxVecUnaryDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), isSetMask, static_cast<uint32_t>(0));
mstxVecUnaryDesc.blockNum = 8;
mstxVecUnaryDesc.repeatTimes = 1;
mstxVecUnaryDesc.dstBlockStride = 1;
mstxVecUnaryDesc.srcBlockStride = 1;
mstxVecUnaryDesc.dstRepeatStride = 8;
mstxVecUnaryDesc.srcRepeatStride = 8;
CopyName(mstxVecUnaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CMPS), sizeof(mstxVecUnaryDesc), &mstxVecUnaryDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReduceInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src, uint64_t mask0, uint64_t mask1,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride, bool isSetMask, __gm__ const char* name)
{
MstxVecReduceDesc mstxVecReduceDesc;
mstxVecReduceDesc.dst = From(dst);
mstxVecReduceDesc.src = From(src);
mstxVecReduceDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecReduceDesc.repeatTimes = repeatTime;
mstxVecReduceDesc.srcBlockStride = srcBlkStride;
mstxVecReduceDesc.dstRepeatStride = dstRepStride;
mstxVecReduceDesc.srcRepeatStride = srcRepStride;
CopyName(mstxVecReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_WHOLE_REDUCE), sizeof(mstxVecReduceDesc), &mstxVecReduceDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReduceInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src, uint64_t mask,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride, bool isSetMask, __gm__ const char* name)
{
MstxVecReduceDesc mstxVecReduceDesc;
mstxVecReduceDesc.dst = From(dst);
mstxVecReduceDesc.src = From(src);
mstxVecReduceDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecReduceDesc.repeatTimes = repeatTime;
mstxVecReduceDesc.srcBlockStride = srcBlkStride;
mstxVecReduceDesc.dstRepeatStride = dstRepStride;
mstxVecReduceDesc.srcRepeatStride = srcRepStride;
CopyName(mstxVecReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_WHOLE_REDUCE), sizeof(mstxVecReduceDesc), &mstxVecReduceDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecCopyInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src, uint64_t mask0, uint64_t mask1,
const int32_t repeatTime, const CopyRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecCopy mstxVecCopy;
mstxVecCopy.dst = From(dst);
mstxVecCopy.src = From(src);
mstxVecCopy.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecCopy.repeatTimes = repeatTime;
mstxVecCopy.dstStride = repeatParams.dstStride;
mstxVecCopy.srcStride = repeatParams.srcStride;
mstxVecCopy.dstRepeatSize = repeatParams.dstRepeatSize;
mstxVecCopy.srcRepeatSize = repeatParams.srcRepeatSize;
CopyName(mstxVecCopy.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_COPY), sizeof(mstxVecCopy), &mstxVecCopy);
}
template <typename T>
__aicore__ inline void GetMstxVecCopyInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src, uint64_t mask,
const int32_t repeatTime, const CopyRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecCopy mstxVecCopy;
mstxVecCopy.dst = From(dst);
mstxVecCopy.src = From(src);
mstxVecCopy.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecCopy.repeatTimes = repeatTime;
mstxVecCopy.dstStride = repeatParams.dstStride;
mstxVecCopy.srcStride = repeatParams.srcStride;
mstxVecCopy.dstRepeatSize = repeatParams.dstRepeatSize;
mstxVecCopy.srcRepeatSize = repeatParams.srcRepeatSize;
CopyName(mstxVecCopy.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_COPY), sizeof(mstxVecCopy), &mstxVecCopy);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReduceBlkInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src, uint64_t mask0, uint64_t mask1,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride, bool isSetMask, __gm__ const char* name)
{
MstxVecReduceDesc mstxVecReduceDesc;
mstxVecReduceDesc.dst = From(dst);
mstxVecReduceDesc.src = From(src);
mstxVecReduceDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecReduceDesc.repeatTimes = repeatTime;
mstxVecReduceDesc.srcBlockStride = srcBlkStride;
mstxVecReduceDesc.dstRepeatStride = dstRepStride;
mstxVecReduceDesc.srcRepeatStride = srcRepStride;
CopyName(mstxVecReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BLK_REDUCE), sizeof(mstxVecReduceDesc), &mstxVecReduceDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReduceBlkInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src, uint64_t mask,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride, bool isSetMask, __gm__ const char* name)
{
MstxVecReduceDesc mstxVecReduceDesc;
mstxVecReduceDesc.dst = From(dst);
mstxVecReduceDesc.src = From(src);
mstxVecReduceDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecReduceDesc.repeatTimes = repeatTime;
mstxVecReduceDesc.srcBlockStride = srcBlkStride;
mstxVecReduceDesc.dstRepeatStride = dstRepStride;
mstxVecReduceDesc.srcRepeatStride = srcRepStride;
CopyName(mstxVecReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BLK_REDUCE), sizeof(mstxVecReduceDesc), &mstxVecReduceDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReducePairInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src, uint64_t mask0, uint64_t mask1,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride, bool isSetMask, __gm__ const char* name)
{
MstxVecReduceDesc mstxVecReduceDesc;
mstxVecReduceDesc.dst = From(dst);
mstxVecReduceDesc.src = From(src);
mstxVecReduceDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecReduceDesc.repeatTimes = repeatTime;
mstxVecReduceDesc.srcBlockStride = srcBlkStride;
mstxVecReduceDesc.dstRepeatStride = dstRepStride;
mstxVecReduceDesc.srcRepeatStride = srcRepStride;
CopyName(mstxVecReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_PAIR_REDUCE), sizeof(mstxVecReduceDesc), &mstxVecReduceDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReducePairInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src, uint64_t mask,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride, bool isSetMask, __gm__ const char* name)
{
MstxVecReduceDesc mstxVecReduceDesc;
mstxVecReduceDesc.dst = From(dst);
mstxVecReduceDesc.src = From(src);
mstxVecReduceDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecReduceDesc.repeatTimes = repeatTime;
mstxVecReduceDesc.srcBlockStride = srcBlkStride;
mstxVecReduceDesc.dstRepeatStride = dstRepStride;
mstxVecReduceDesc.srcRepeatStride = srcRepStride;
CopyName(mstxVecReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_PAIR_REDUCE), sizeof(mstxVecReduceDesc), &mstxVecReduceDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReduceRepeatInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src, uint64_t mask0, uint64_t mask1,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride, bool isSetMask, __gm__ const char* name)
{
MstxVecReduceDesc mstxVecReduceDesc;
mstxVecReduceDesc.dst = From(dst);
mstxVecReduceDesc.src = From(src);
mstxVecReduceDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecReduceDesc.repeatTimes = repeatTime;
mstxVecReduceDesc.srcBlockStride = srcBlkStride;
mstxVecReduceDesc.dstRepeatStride = dstRepStride;
mstxVecReduceDesc.srcRepeatStride = srcRepStride;
CopyName(mstxVecReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_REPEAT_REDUCE), sizeof(mstxVecReduceDesc), &mstxVecReduceDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReduceRepeatInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src, uint64_t mask,
const int32_t repeatTime, const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride, bool isSetMask, __gm__ const char* name)
{
MstxVecReduceDesc mstxVecReduceDesc;
mstxVecReduceDesc.dst = From(dst);
mstxVecReduceDesc.src = From(src);
mstxVecReduceDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecReduceDesc.repeatTimes = repeatTime;
mstxVecReduceDesc.srcBlockStride = srcBlkStride;
mstxVecReduceDesc.dstRepeatStride = dstRepStride;
mstxVecReduceDesc.srcRepeatStride = srcRepStride;
CopyName(mstxVecReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_REPEAT_REDUCE), sizeof(mstxVecReduceDesc), &mstxVecReduceDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecBrcbInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src, const uint8_t repeatTime,
const BrcbRepeatParams& repeatParams, __gm__ const char* name)
{
MstxVecBrcbDesc mstxVecBrcbDesc;
mstxVecBrcbDesc.dst = From(dst);
mstxVecBrcbDesc.src = From(src);
mstxVecBrcbDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, 0, 0, false, static_cast<uint32_t>(0));
mstxVecBrcbDesc.repeatTimes = repeatTime;
mstxVecBrcbDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBrcbDesc.dstRepeatStride = repeatParams.dstRepStride;
CopyName(mstxVecBrcbDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BROADCAST), sizeof(mstxVecBrcbDesc), &mstxVecBrcbDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecTransposeInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src, __gm__ const char* name)
{
MstxVecTranspose mstxVecTranspose;
mstxVecTranspose.dst = From(dst);
mstxVecTranspose.src = From(src);
mstxVecTranspose.temp = false;
CopyName(mstxVecTranspose.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_TRANSPOSE), sizeof(mstxVecTranspose), &mstxVecTranspose);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecTransposeTempInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src,
const LocalTensor<U> &sharedTmpBuffer, __gm__ const char* name)
{
MstxVecTranspose mstxVecTranspose;
mstxVecTranspose.dst = From(dst);
mstxVecTranspose.src = From(src);
mstxVecTranspose.shared = From(sharedTmpBuffer);
mstxVecTranspose.temp = true;
CopyName(mstxVecTranspose.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_TRANSPOSE), sizeof(mstxVecTranspose), &mstxVecTranspose);
}
template <typename T>
__aicore__ inline void GetMstxVecDupInfo(const LocalTensor<T>& dst, uint64_t mask0, uint64_t mask1,
const uint8_t repeatTime, const uint16_t dstBlockStride, const uint8_t dstRepeatStride, bool isSetMask, __gm__ const char* name)
{
MstxVecDupDesc mstxVecDupDesc;
mstxVecDupDesc.dst = From(dst);
mstxVecDupDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecDupDesc.repeatTimes = repeatTime;
mstxVecDupDesc.dstBlockStride = dstBlockStride;
mstxVecDupDesc.dstRepeatStride = dstRepeatStride;
CopyName(mstxVecDupDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_DUP), sizeof(mstxVecDupDesc), &mstxVecDupDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecDupInfo(const LocalTensor<T>& dst, uint64_t mask,
const uint8_t repeatTime, const uint16_t dstBlockStride, const uint8_t dstRepeatStride, bool isSetMask, __gm__ const char* name)
{
MstxVecDupDesc mstxVecDupDesc;
mstxVecDupDesc.dst = From(dst);
mstxVecDupDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecDupDesc.repeatTimes = repeatTime;
mstxVecDupDesc.dstBlockStride = dstBlockStride;
mstxVecDupDesc.dstRepeatStride = dstRepeatStride;
CopyName(mstxVecDupDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_DUP), sizeof(mstxVecDupDesc), &mstxVecDupDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecDupInfo(const LocalTensor<T>& dst, const int32_t& count, __gm__ const char* name)
{
MstxVecDupDesc mstxVecDupDesc;
mstxVecDupDesc.dst = From(dst);
mstxVecDupDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), true, static_cast<uint32_t>(0));
mstxVecDupDesc.repeatTimes = 1;
mstxVecDupDesc.dstBlockStride = 1;
mstxVecDupDesc.dstRepeatStride = 8;
CopyName(mstxVecDupDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_DUP), sizeof(mstxVecDupDesc), &mstxVecDupDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecIndexInfo(const LocalTensor<T>& dst, uint64_t mask0, uint64_t mask1,
const uint8_t repeatTime, const uint16_t dstBlockStride, const uint8_t dstRepeatStride, __gm__ const char* name)
{
MstxVecDupDesc mstxVecDupDesc;
mstxVecDupDesc.dst = From(dst);
mstxVecDupDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, true, static_cast<uint32_t>(8192));
mstxVecDupDesc.repeatTimes = repeatTime;
mstxVecDupDesc.dstBlockStride = dstBlockStride;
mstxVecDupDesc.dstRepeatStride = dstRepeatStride;
CopyName(mstxVecDupDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_VCI), sizeof(mstxVecDupDesc), &mstxVecDupDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecIndexInfo(const LocalTensor<T>& dst, uint64_t mask,
const uint8_t repeatTime, const uint16_t dstBlockStride, const uint8_t dstRepeatStride, __gm__ const char* name)
{
MstxVecDupDesc mstxVecDupDesc;
mstxVecDupDesc.dst = From(dst);
mstxVecDupDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, true, static_cast<uint32_t>(8192));
mstxVecDupDesc.repeatTimes = repeatTime;
mstxVecDupDesc.dstBlockStride = dstBlockStride;
mstxVecDupDesc.dstRepeatStride = dstRepeatStride;
CopyName(mstxVecDupDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_VCI), sizeof(mstxVecDupDesc), &mstxVecDupDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecIndexInfo(const LocalTensor<T>& dst, uint32_t count, __gm__ const char* name)
{
MstxVecDupDesc mstxVecDupDesc;
mstxVecDupDesc.dst = From(dst);
mstxVecDupDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), true, static_cast<uint32_t>(8192));
mstxVecDupDesc.repeatTimes = 1;
mstxVecDupDesc.dstBlockStride = 1;
mstxVecDupDesc.dstRepeatStride = 8;
CopyName(mstxVecDupDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_VCI), sizeof(mstxVecDupDesc), &mstxVecDupDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReduceComplexInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const int32_t mask0, const int32_t mask1,
const int32_t repeatTime, const int32_t srcRepStride, __gm__ const char* name)
{
MstxVecComplexReduceDesc mstxVecComplexReduceDesc;
mstxVecComplexReduceDesc.dst = From(dst);
mstxVecComplexReduceDesc.src = From(src);
mstxVecComplexReduceDesc.tmp = From(sharedTmpBuffer);
mstxVecComplexReduceDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, true, static_cast<uint32_t>(8192));
mstxVecComplexReduceDesc.repeatTimes = repeatTime;
mstxVecComplexReduceDesc.srcRepeatStride = srcRepStride;
CopyName(mstxVecComplexReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_REDUCE), sizeof(mstxVecComplexReduceDesc), &mstxVecComplexReduceDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReduceComplexInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const int32_t mask,
const int32_t repeatTime, const int32_t srcRepStride, __gm__ const char* name)
{
MstxVecComplexReduceDesc mstxVecComplexReduceDesc;
mstxVecComplexReduceDesc.dst = From(dst);
mstxVecComplexReduceDesc.src = From(src);
mstxVecComplexReduceDesc.tmp = From(sharedTmpBuffer);
mstxVecComplexReduceDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, true, static_cast<uint32_t>(8192));
mstxVecComplexReduceDesc.repeatTimes = repeatTime;
mstxVecComplexReduceDesc.srcRepeatStride = srcRepStride;
CopyName(mstxVecComplexReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_REDUCE), sizeof(mstxVecComplexReduceDesc), &mstxVecComplexReduceDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecReduceComplexInfo(const LocalTensor<U>& dst, const LocalTensor<T>& src,
const LocalTensor<T>& sharedTmpBuffer, const int32_t count, __gm__ const char* name)
{
MstxVecComplexReduceDesc mstxVecComplexReduceDesc;
mstxVecComplexReduceDesc.dst = From(dst);
mstxVecComplexReduceDesc.src = From(src);
mstxVecComplexReduceDesc.tmp = From(sharedTmpBuffer);
mstxVecComplexReduceDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), true, static_cast<uint32_t>(8192));
mstxVecComplexReduceDesc.repeatTimes = 1;
mstxVecComplexReduceDesc.srcRepeatStride = 8;
CopyName(mstxVecComplexReduceDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_REDUCE), sizeof(mstxVecComplexReduceDesc), &mstxVecComplexReduceDesc);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBinaryInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src0,
const LocalTensor<V>& src1, uint64_t mask0, uint64_t mask1, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = repeatParams.blockNumber;
mstxVecBinaryDesc.repeatTimes = repeatTime;
mstxVecBinaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBinaryDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecBinaryDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecBinaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecBinaryDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecBinaryDesc.src1RepeatStride = repeatParams.src1RepStride;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BINARY), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBinaryInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src0,
const LocalTensor<V>& src1, uint64_t mask, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = repeatParams.blockNumber;
mstxVecBinaryDesc.repeatTimes = repeatTime;
mstxVecBinaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBinaryDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecBinaryDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecBinaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecBinaryDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecBinaryDesc.src1RepeatStride = repeatParams.src1RepStride;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BINARY), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBinaryInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src0,
const LocalTensor<V>& src1, __gm__ const char* name, const int32_t count)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), true, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = 8;
mstxVecBinaryDesc.repeatTimes = 1;
mstxVecBinaryDesc.dstBlockStride = 1;
mstxVecBinaryDesc.src0BlockStride = 1;
mstxVecBinaryDesc.src1BlockStride = 1;
mstxVecBinaryDesc.dstRepeatStride = 8;
mstxVecBinaryDesc.src0RepeatStride = 8;
mstxVecBinaryDesc.src1RepeatStride = 8;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BINARY), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBilinearInterpolationInfo(const LocalTensor<T> &dst, const LocalTensor<T> &src0,
const LocalTensor<U> &src0Offset, const LocalTensor<T> &src1, uint64_t mask0, uint64_t mask1, uint8_t hRepeat,
bool repeatMode, uint16_t dstBlkStride, uint16_t vROffset, uint8_t vRepeat, const LocalTensor<V> &sharedTmpBuffer, __gm__ const char* name)
{
MstxVecBilinearInterpolation mstxVecBilinearInterpolation;
mstxVecBilinearInterpolation.dst = From(dst);
mstxVecBilinearInterpolation.src0 = From(src0);
mstxVecBilinearInterpolation.src1 = From(src1);
mstxVecBilinearInterpolation.src0Offset = From(src0Offset);
mstxVecBilinearInterpolation.shared = From(sharedTmpBuffer);
mstxVecBilinearInterpolation.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, true, static_cast<uint32_t>(8192));
mstxVecBilinearInterpolation.hRepeat = hRepeat;
mstxVecBilinearInterpolation.repeatMode = repeatMode;
mstxVecBilinearInterpolation.dstBlockStride = dstBlkStride;
mstxVecBilinearInterpolation.vROffset = vROffset;
mstxVecBilinearInterpolation.vRepeat = vRepeat;
CopyName(mstxVecBilinearInterpolation.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BILINEAR_INTERPOLATION), sizeof(mstxVecBilinearInterpolation), &mstxVecBilinearInterpolation);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBilinearInterpolationInfo(const LocalTensor<T> &dst, const LocalTensor<T> &src0,
const LocalTensor<U> &src0Offset, const LocalTensor<T> &src1, uint64_t mask, uint8_t hRepeat,
bool repeatMode, uint16_t dstBlkStride, uint16_t vROffset, uint8_t vRepeat, const LocalTensor<V> &sharedTmpBuffer, __gm__ const char* name)
{
MstxVecBilinearInterpolation mstxVecBilinearInterpolation;
mstxVecBilinearInterpolation.dst = From(dst);
mstxVecBilinearInterpolation.src0 = From(src0);
mstxVecBilinearInterpolation.src1 = From(src1);
mstxVecBilinearInterpolation.src0Offset = From(src0Offset);
mstxVecBilinearInterpolation.shared = From(sharedTmpBuffer);
mstxVecBilinearInterpolation.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, true, static_cast<uint32_t>(8192));
mstxVecBilinearInterpolation.hRepeat = hRepeat;
mstxVecBilinearInterpolation.repeatMode = repeatMode;
mstxVecBilinearInterpolation.dstBlockStride = dstBlkStride;
mstxVecBilinearInterpolation.vROffset = vROffset;
mstxVecBilinearInterpolation.vRepeat = vRepeat;
CopyName(mstxVecBilinearInterpolation.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BILINEAR_INTERPOLATION), sizeof(mstxVecBilinearInterpolation), &mstxVecBilinearInterpolation);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecBinarySelInfo(const LocalTensor<T>& dst, const LocalTensor<U>& selMask,
const LocalTensor<T>& src0, const LocalTensor<T>& src1, uint64_t mask0, uint64_t mask1, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, bool isSetMask, SELMODE selMode, __gm__ const char* name)
{
MstxVecSelDesc mstxVecSelDesc;
mstxVecSelDesc.dst = From(dst);
mstxVecSelDesc.src0 = From(src0);
mstxVecSelDesc.src1 = From(src1);
mstxVecSelDesc.mask = From(selMask);
mstxVecSelDesc.scalarMode = (selMode == AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE);
if (mstxVecSelDesc.scalarMode) {
mstxVecSelDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(8192));
} else {
mstxVecSelDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
}
mstxVecSelDesc.blockNum = repeatParams.blockNumber;
mstxVecSelDesc.repeatTimes = repeatTime;
mstxVecSelDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecSelDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecSelDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecSelDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecSelDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecSelDesc.src1RepeatStride = repeatParams.src1RepStride;
CopyName(mstxVecSelDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_SEL), sizeof(mstxVecSelDesc), &mstxVecSelDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecBinarySelInfo(const LocalTensor<T>& dst, const LocalTensor<U>& selMask,
const LocalTensor<T>& src0, const LocalTensor<T>& src1, uint64_t mask, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, bool isSetMask, SELMODE selMode, __gm__ const char* name)
{
MstxVecSelDesc mstxVecSelDesc;
mstxVecSelDesc.dst = From(dst);
mstxVecSelDesc.src0 = From(src0);
mstxVecSelDesc.src1 = From(src1);
mstxVecSelDesc.mask = From(selMask);
mstxVecSelDesc.scalarMode = (selMode == AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE);
if (mstxVecSelDesc.scalarMode) {
mstxVecSelDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(8192));
} else {
mstxVecSelDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
}
mstxVecSelDesc.blockNum = repeatParams.blockNumber;
mstxVecSelDesc.repeatTimes = repeatTime;
mstxVecSelDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecSelDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecSelDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecSelDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecSelDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecSelDesc.src1RepeatStride = repeatParams.src1RepStride;
CopyName(mstxVecSelDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_SEL), sizeof(mstxVecSelDesc), &mstxVecSelDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecBinarySelInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src0,
const LocalTensor<T>& src1, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, SELMODE selMode, __gm__ const char* name)
{
MstxVecSelDesc mstxVecSelDesc;
mstxVecSelDesc.dst = From(dst);
mstxVecSelDesc.src0 = From(src0);
mstxVecSelDesc.src1 = From(src1);
mstxVecSelDesc.scalarMode = (selMode == AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE);
if (mstxVecSelDesc.scalarMode) {
mstxVecSelDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, 0, 0, false, static_cast<uint32_t>(8192));
} else {
mstxVecSelDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, 0, 0, false, static_cast<uint32_t>(0));
}
mstxVecSelDesc.blockNum = repeatParams.blockNumber;
mstxVecSelDesc.repeatTimes = repeatTime;
mstxVecSelDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecSelDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecSelDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecSelDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecSelDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecSelDesc.src1RepeatStride = repeatParams.src1RepStride;
CopyName(mstxVecSelDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_SEL), sizeof(mstxVecSelDesc), &mstxVecSelDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecBinarySelInfo(const LocalTensor<T>& dst, const LocalTensor<U>& selMask,
const LocalTensor<T>& src0, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, SELMODE selMode, __gm__ const char* name)
{
MstxVecSelDesc mstxVecSelDesc;
mstxVecSelDesc.dst = From(dst);
mstxVecSelDesc.src0 = From(src0);
mstxVecSelDesc.mask = From(selMask);
mstxVecSelDesc.scalarMode = (selMode == AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE);
if (mstxVecSelDesc.scalarMode) {
mstxVecSelDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, 0, 0, false, static_cast<uint32_t>(8192));
} else {
mstxVecSelDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, 0, 0, false, static_cast<uint32_t>(0));
}
mstxVecSelDesc.blockNum = repeatParams.blockNumber;
mstxVecSelDesc.repeatTimes = repeatTime;
mstxVecSelDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecSelDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecSelDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecSelDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecSelDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecSelDesc.src1RepeatStride = repeatParams.src1RepStride;
CopyName(mstxVecSelDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_SEL), sizeof(mstxVecSelDesc), &mstxVecSelDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecBinarySelInfo(const LocalTensor<T>& dst, const LocalTensor<U>& selMask,
const LocalTensor<T>& src0, const LocalTensor<T>& src1, uint32_t count, bool isSetMask, SELMODE selMode, __gm__ const char* name)
{
MstxVecSelDesc mstxVecSelDesc;
mstxVecSelDesc.dst = From(dst);
mstxVecSelDesc.src0 = From(src0);
mstxVecSelDesc.src1 = From(src1);
mstxVecSelDesc.mask = From(selMask);
mstxVecSelDesc.scalarMode = (selMode == AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE);
if (mstxVecSelDesc.scalarMode) {
mstxVecSelDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), isSetMask, static_cast<uint32_t>(8192));
} else {
mstxVecSelDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), isSetMask, static_cast<uint32_t>(0));
}
mstxVecSelDesc.blockNum = 8;
mstxVecSelDesc.repeatTimes = 1;
mstxVecSelDesc.dstBlockStride = 1;
mstxVecSelDesc.src0BlockStride = 1;
mstxVecSelDesc.src1BlockStride = 1;
mstxVecSelDesc.dstRepeatStride = 8;
mstxVecSelDesc.src0RepeatStride = 8;
mstxVecSelDesc.src1RepeatStride = 8;
CopyName(mstxVecSelDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_SEL), sizeof(mstxVecSelDesc), &mstxVecSelDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecBinarySelInfo(const LocalTensor<T>& dst, const LocalTensor<U>& selMask,
const LocalTensor<T>& src0, uint64_t mask0, uint64_t mask1, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, bool isSetMask, SELMODE selMode, __gm__ const char* name)
{
MstxVecSelDesc mstxVecSelDesc;
mstxVecSelDesc.dst = From(dst);
mstxVecSelDesc.src0 = From(src0);
mstxVecSelDesc.mask = From(selMask);
mstxVecSelDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(8192));
mstxVecSelDesc.blockNum = repeatParams.blockNumber;
mstxVecSelDesc.repeatTimes = repeatTime;
mstxVecSelDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecSelDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecSelDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecSelDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecSelDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecSelDesc.src1RepeatStride = repeatParams.src1RepStride;
mstxVecSelDesc.scalarMode = (selMode == AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE);
CopyName(mstxVecSelDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_SEL), sizeof(mstxVecSelDesc), &mstxVecSelDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecBinarySelInfo(const LocalTensor<T>& dst, const LocalTensor<U>& selMask,
const LocalTensor<T>& src0, uint64_t mask, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, bool isSetMask, SELMODE selMode, __gm__ const char* name)
{
MstxVecSelDesc mstxVecSelDesc;
mstxVecSelDesc.dst = From(dst);
mstxVecSelDesc.src0 = From(src0);
mstxVecSelDesc.mask = From(selMask);
mstxVecSelDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(8192));
mstxVecSelDesc.blockNum = repeatParams.blockNumber;
mstxVecSelDesc.repeatTimes = repeatTime;
mstxVecSelDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecSelDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecSelDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecSelDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecSelDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecSelDesc.src1RepeatStride = repeatParams.src1RepStride;
mstxVecSelDesc.scalarMode = (selMode == AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE);
CopyName(mstxVecSelDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_SEL), sizeof(mstxVecSelDesc), &mstxVecSelDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxVecBinarySelInfo(const LocalTensor<T>& dst, const LocalTensor<U>& selMask,
const LocalTensor<T>& src0, uint32_t count, bool isSetMask, SELMODE selMode, __gm__ const char* name)
{
MstxVecSelDesc mstxVecSelDesc;
mstxVecSelDesc.dst = From(dst);
mstxVecSelDesc.src0 = From(src0);
mstxVecSelDesc.mask = From(selMask);
mstxVecSelDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), isSetMask, static_cast<uint32_t>(8192));
mstxVecSelDesc.blockNum = 8;
mstxVecSelDesc.repeatTimes = 1;
mstxVecSelDesc.dstBlockStride = 1;
mstxVecSelDesc.src0BlockStride = 1;
mstxVecSelDesc.src1BlockStride = 1;
mstxVecSelDesc.dstRepeatStride = 8;
mstxVecSelDesc.src0RepeatStride = 8;
mstxVecSelDesc.src1RepeatStride = 8;
mstxVecSelDesc.scalarMode = (selMode == AscendC::SELMODE::VSEL_TENSOR_SCALAR_MODE);
CopyName(mstxVecSelDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_SEL), sizeof(mstxVecSelDesc), &mstxVecSelDesc);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBinaryCmpInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src0,
const LocalTensor<V>& src1, uint64_t mask0, uint64_t mask1, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = repeatParams.blockNumber;
mstxVecBinaryDesc.repeatTimes = repeatTime;
mstxVecBinaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBinaryDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecBinaryDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecBinaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecBinaryDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecBinaryDesc.src1RepeatStride = repeatParams.src1RepStride;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CMP), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBinaryCmpInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src0,
const LocalTensor<V>& src1, uint64_t mask, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = repeatParams.blockNumber;
mstxVecBinaryDesc.repeatTimes = repeatTime;
mstxVecBinaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBinaryDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecBinaryDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecBinaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecBinaryDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecBinaryDesc.src1RepeatStride = repeatParams.src1RepStride;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CMP), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBinaryCmpInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src0,
const LocalTensor<V>& src1, __gm__ const char* name, const int32_t count)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), true, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = 8;
mstxVecBinaryDesc.repeatTimes = 1;
mstxVecBinaryDesc.dstBlockStride = 1;
mstxVecBinaryDesc.src0BlockStride = 1;
mstxVecBinaryDesc.src1BlockStride = 1;
mstxVecBinaryDesc.dstRepeatStride = 8;
mstxVecBinaryDesc.src0RepeatStride = 8;
mstxVecBinaryDesc.src1RepeatStride = 8;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CMP), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBinaryAddReqReluInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src0,
const LocalTensor<V>& src1, uint64_t mask0, uint64_t mask1, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(8192));
mstxVecBinaryDesc.blockNum = repeatParams.blockNumber;
mstxVecBinaryDesc.repeatTimes = repeatTime;
mstxVecBinaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBinaryDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecBinaryDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecBinaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecBinaryDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecBinaryDesc.src1RepeatStride = repeatParams.src1RepStride;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BINARY), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBinaryAddReqReluInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src0,
const LocalTensor<V>& src1, uint64_t mask, const uint8_t repeatTime,
const BinaryRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(8192));
mstxVecBinaryDesc.blockNum = repeatParams.blockNumber;
mstxVecBinaryDesc.repeatTimes = repeatTime;
mstxVecBinaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBinaryDesc.src0BlockStride = repeatParams.src0BlkStride;
mstxVecBinaryDesc.src1BlockStride = repeatParams.src1BlkStride;
mstxVecBinaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecBinaryDesc.src0RepeatStride = repeatParams.src0RepStride;
mstxVecBinaryDesc.src1RepeatStride = repeatParams.src1RepStride;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BINARY), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T, typename U, typename V>
__aicore__ inline void GetMstxVecBinaryAddReqReluInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src0,
const LocalTensor<V>& src1, __gm__ const char* name, const int32_t count)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), true, static_cast<uint32_t>(8192));
mstxVecBinaryDesc.blockNum = 8;
mstxVecBinaryDesc.repeatTimes = 1;
mstxVecBinaryDesc.dstBlockStride = 1;
mstxVecBinaryDesc.src0BlockStride = 1;
mstxVecBinaryDesc.src1BlockStride = 1;
mstxVecBinaryDesc.dstRepeatStride = 8;
mstxVecBinaryDesc.src0RepeatStride = 8;
mstxVecBinaryDesc.src1RepeatStride = 8;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BINARY), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecBinaryScalarInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src0,
const LocalTensor<T>& src1, uint64_t mask0, uint64_t mask1, const uint8_t repeatTime,
const UnaryRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = repeatParams.blockNumber;
mstxVecBinaryDesc.repeatTimes = repeatTime;
mstxVecBinaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBinaryDesc.src0BlockStride = repeatParams.srcBlkStride;
mstxVecBinaryDesc.src1BlockStride = repeatParams.srcBlkStride;
mstxVecBinaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecBinaryDesc.src0RepeatStride = repeatParams.srcRepStride;
mstxVecBinaryDesc.src1RepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BINARY_SCALAR), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecBinaryScalarInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src0,
const LocalTensor<T>& src1, uint64_t mask, const uint8_t repeatTime,
const UnaryRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = repeatParams.blockNumber;
mstxVecBinaryDesc.repeatTimes = repeatTime;
mstxVecBinaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBinaryDesc.src0BlockStride = repeatParams.srcBlkStride;
mstxVecBinaryDesc.src1BlockStride = repeatParams.srcBlkStride;
mstxVecBinaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecBinaryDesc.src0RepeatStride = repeatParams.srcRepStride;
mstxVecBinaryDesc.src1RepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BINARY_SCALAR), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecBinaryScalarInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src0,
const LocalTensor<T>& src1, bool isSetMask, __gm__ const char* name, const int32_t count)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), isSetMask, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = 8;
mstxVecBinaryDesc.repeatTimes = 1;
mstxVecBinaryDesc.dstBlockStride = 1;
mstxVecBinaryDesc.src0BlockStride = 1;
mstxVecBinaryDesc.src1BlockStride = 1;
mstxVecBinaryDesc.dstRepeatStride = 8;
mstxVecBinaryDesc.src0RepeatStride = 8;
mstxVecBinaryDesc.src1RepeatStride = 8;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_BINARY_SCALAR), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecBinaryScalarCmpsInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src0,
const LocalTensor<T>& src1, uint64_t mask0, uint64_t mask1, const uint8_t repeatTime,
const UnaryRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, isSetMask, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = repeatParams.blockNumber;
mstxVecBinaryDesc.repeatTimes = repeatTime;
mstxVecBinaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBinaryDesc.src0BlockStride = repeatParams.srcBlkStride;
mstxVecBinaryDesc.src1BlockStride = repeatParams.srcBlkStride;
mstxVecBinaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecBinaryDesc.src0RepeatStride = repeatParams.srcRepStride;
mstxVecBinaryDesc.src1RepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CMPS), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecBinaryScalarCmpsInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src0,
const LocalTensor<T>& src1, uint64_t mask, const uint8_t repeatTime,
const UnaryRepeatParams& repeatParams, bool isSetMask, __gm__ const char* name)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, isSetMask, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = repeatParams.blockNumber;
mstxVecBinaryDesc.repeatTimes = repeatTime;
mstxVecBinaryDesc.dstBlockStride = repeatParams.dstBlkStride;
mstxVecBinaryDesc.src0BlockStride = repeatParams.srcBlkStride;
mstxVecBinaryDesc.src1BlockStride = repeatParams.srcBlkStride;
mstxVecBinaryDesc.dstRepeatStride = repeatParams.dstRepStride;
mstxVecBinaryDesc.src0RepeatStride = repeatParams.srcRepStride;
mstxVecBinaryDesc.src1RepeatStride = repeatParams.srcRepStride;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CMPS), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecBinaryScalarCmpsInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src0,
const LocalTensor<T>& src1, bool isSetMask, __gm__ const char* name, const int32_t count)
{
MstxVecBinaryDesc mstxVecBinaryDesc;
mstxVecBinaryDesc.dst = From(dst);
mstxVecBinaryDesc.src0 = From(src0);
mstxVecBinaryDesc.src1 = From(src1);
mstxVecBinaryDesc.wrapper = WrapperFrom(MSTX_MASK_COUNT, count, static_cast<uint64_t>(0), isSetMask, static_cast<uint32_t>(0));
mstxVecBinaryDesc.blockNum = 8;
mstxVecBinaryDesc.repeatTimes = 1;
mstxVecBinaryDesc.dstBlockStride = 1;
mstxVecBinaryDesc.src0BlockStride = 1;
mstxVecBinaryDesc.src1BlockStride = 1;
mstxVecBinaryDesc.dstRepeatStride = 8;
mstxVecBinaryDesc.src0RepeatStride = 8;
mstxVecBinaryDesc.src1RepeatStride = 8;
CopyName(mstxVecBinaryDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_CMPS), sizeof(mstxVecBinaryDesc), &mstxVecBinaryDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecGatherMaskInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src,
uint32_t mask0, uint32_t mask1, const GatherMaskParams& gatherMaskParams, GatherMaskMode mode, __gm__ const char* name)
{
MstxVecGatherMaskDesc mstxVecGatherMaskDesc;
mstxVecGatherMaskDesc.dst = From(dst);
mstxVecGatherMaskDesc.src = From(src);
mstxVecGatherMaskDesc.wrapper = WrapperFrom(MSTX_MASK_FROM_REG, mask0, mask1, true, static_cast<uint32_t>(0));
mstxVecGatherMaskDesc.mode = (mode == GatherMaskMode::VERSION_V2) ? AscendC::MstxTensor::MstxGatherMaskMode::V2 : AscendC::MstxTensor::MstxGatherMaskMode::V1;
mstxVecGatherMaskDesc.repeatTimes = gatherMaskParams.repeatTimes;
mstxVecGatherMaskDesc.src0BlockStride = gatherMaskParams.src0BlockStride;
mstxVecGatherMaskDesc.src0RepeatStride = gatherMaskParams.src0RepeatStride;
mstxVecGatherMaskDesc.src1RepeatStride = gatherMaskParams.src1RepeatStride;
CopyName(mstxVecGatherMaskDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_GATHER_MASK), sizeof(mstxVecGatherMaskDesc), &mstxVecGatherMaskDesc);
}
template <typename T>
__aicore__ inline void GetMstxVecGatherMaskInfo(const LocalTensor<T>& dst, const LocalTensor<T>& src,
uint32_t mask, const GatherMaskParams& gatherMaskParams, GatherMaskMode mode, __gm__ const char* name)
{
MstxVecGatherMaskDesc mstxVecGatherMaskDesc;
mstxVecGatherMaskDesc.dst = From(dst);
mstxVecGatherMaskDesc.src = From(src);
mstxVecGatherMaskDesc.wrapper = WrapperFrom<T>(MSTX_MASK_FROM_REG, mask, true, static_cast<uint32_t>(0));
mstxVecGatherMaskDesc.mode = (mode == GatherMaskMode::VERSION_V2) ? AscendC::MstxTensor::MstxGatherMaskMode::V2 : AscendC::MstxTensor::MstxGatherMaskMode::V1;
mstxVecGatherMaskDesc.repeatTimes = gatherMaskParams.repeatTimes;
mstxVecGatherMaskDesc.src0BlockStride = gatherMaskParams.src0BlockStride;
mstxVecGatherMaskDesc.src0RepeatStride = gatherMaskParams.src0RepeatStride;
mstxVecGatherMaskDesc.src1RepeatStride = gatherMaskParams.src1RepeatStride;
CopyName(mstxVecGatherMaskDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_VEC_GATHER_MASK), sizeof(mstxVecGatherMaskDesc), &mstxVecGatherMaskDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxDataCopyInfo(const LocalTensor<T>& dst, const GlobalTensor<U>& src,
const DataCopyParams& repeatParams, __gm__ const char* name)
{
MstxDataCopyDesc mstxDataCopyDesc;
mstxDataCopyDesc.dst = From(dst);
mstxDataCopyDesc.src = FromGm(src);
mstxDataCopyDesc.nBurst = repeatParams.blockCount;
mstxDataCopyDesc.lenBurst = repeatParams.blockLen;
mstxDataCopyDesc.srcGap = repeatParams.srcStride;
mstxDataCopyDesc.dstGap = repeatParams.dstStride;
CopyName(mstxDataCopyDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_DATA_COPY), sizeof(mstxDataCopyDesc), &mstxDataCopyDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxDataCopyInfo(const GlobalTensor<T>& dst, const LocalTensor<U>& src,
const DataCopyParams& repeatParams, __gm__ const char* name)
{
MstxDataCopyDesc mstxDataCopyDesc;
mstxDataCopyDesc.dst = FromGm(dst);
mstxDataCopyDesc.src = From(src);
mstxDataCopyDesc.nBurst = repeatParams.blockCount;
mstxDataCopyDesc.lenBurst = repeatParams.blockLen;
mstxDataCopyDesc.srcGap = repeatParams.srcStride;
mstxDataCopyDesc.dstGap = repeatParams.dstStride;
CopyName(mstxDataCopyDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_DATA_COPY), sizeof(mstxDataCopyDesc), &mstxDataCopyDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxDataCopyInfo(const LocalTensor<T>& dst, const LocalTensor<U>& src,
const DataCopyParams& repeatParams, __gm__ const char* name)
{
MstxDataCopyDesc mstxDataCopyDesc;
mstxDataCopyDesc.dst = From(dst);
mstxDataCopyDesc.src = From(src);
mstxDataCopyDesc.nBurst = repeatParams.blockCount;
mstxDataCopyDesc.lenBurst = repeatParams.blockLen;
mstxDataCopyDesc.srcGap = repeatParams.srcStride;
mstxDataCopyDesc.dstGap = repeatParams.dstStride;
CopyName(mstxDataCopyDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_DATA_COPY), sizeof(mstxDataCopyDesc), &mstxDataCopyDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxDataCopyPadInfo(const LocalTensor<T>& dst, const GlobalTensor<U>& src,
const DataCopyParams& dataCopyParams, const DataCopyPadParams &padParams, __gm__ const char* name)
{
MstxDataCopyPadDesc mstxDataCopyPadDesc;
mstxDataCopyPadDesc.dst = From(dst);
mstxDataCopyPadDesc.src = FromGm(src);
mstxDataCopyPadDesc.nBurst = dataCopyParams.blockCount;
mstxDataCopyPadDesc.lenBurst = dataCopyParams.blockLen;
mstxDataCopyPadDesc.srcGap = dataCopyParams.srcStride;
mstxDataCopyPadDesc.dstGap = dataCopyParams.dstStride;
mstxDataCopyPadDesc.leftPad = padParams.leftPadding;
mstxDataCopyPadDesc.rightPad = padParams.rightPadding;
CopyName(mstxDataCopyPadDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_DATA_COPY_PAD), sizeof(mstxDataCopyPadDesc), &mstxDataCopyPadDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxDataCopyPadInfo(const GlobalTensor<T>& dst, const LocalTensor<U>& src,
const DataCopyParams& dataCopyParams, __gm__ const char* name)
{
MstxDataCopyPadDesc mstxDataCopyPadDesc;
mstxDataCopyPadDesc.dst = FromGm(dst);
mstxDataCopyPadDesc.src = From(src);
mstxDataCopyPadDesc.nBurst = dataCopyParams.blockCount;
mstxDataCopyPadDesc.lenBurst = dataCopyParams.blockLen;
mstxDataCopyPadDesc.srcGap = dataCopyParams.srcStride;
mstxDataCopyPadDesc.dstGap = dataCopyParams.dstStride;
mstxDataCopyPadDesc.leftPad = 0;
mstxDataCopyPadDesc.rightPad = 0;
CopyName(mstxDataCopyPadDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_DATA_COPY_PAD), sizeof(mstxDataCopyPadDesc), &mstxDataCopyPadDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxDataCopyPadInfo(const LocalTensor<T>& dst, const GlobalTensor<U>& src,
const DataCopyExtParams &dataCopyParams, const DataCopyPadExtParams<T> &padParams, __gm__ const char* name)
{
MstxDataCopyPadDesc mstxDataCopyPadDesc;
mstxDataCopyPadDesc.dst = From(dst);
mstxDataCopyPadDesc.src = FromGm(src);
mstxDataCopyPadDesc.nBurst = dataCopyParams.blockCount;
mstxDataCopyPadDesc.lenBurst = dataCopyParams.blockLen;
mstxDataCopyPadDesc.srcGap = dataCopyParams.srcStride;
mstxDataCopyPadDesc.dstGap = dataCopyParams.dstStride;
mstxDataCopyPadDesc.leftPad = padParams.leftPadding;
mstxDataCopyPadDesc.rightPad = padParams.rightPadding;
CopyName(mstxDataCopyPadDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_DATA_COPY_PAD), sizeof(mstxDataCopyPadDesc), &mstxDataCopyPadDesc);
}
template <typename T, typename U>
__aicore__ inline void GetMstxDataCopyPadInfo(const GlobalTensor<T>& dst, const LocalTensor<U>& src,
const DataCopyExtParams& dataCopyParams, __gm__ const char* name)
{
MstxDataCopyPadDesc mstxDataCopyPadDesc;
mstxDataCopyPadDesc.dst = FromGm(dst);
mstxDataCopyPadDesc.src = From(src);
mstxDataCopyPadDesc.nBurst = dataCopyParams.blockCount;
mstxDataCopyPadDesc.lenBurst = dataCopyParams.blockLen;
mstxDataCopyPadDesc.srcGap = dataCopyParams.srcStride;
mstxDataCopyPadDesc.dstGap = dataCopyParams.dstStride;
mstxDataCopyPadDesc.leftPad = 0;
mstxDataCopyPadDesc.rightPad = 0;
CopyName(mstxDataCopyPadDesc.name, name);
__mstx_dfx_report_stub(static_cast<uint32_t>(MstxReportType::MSTX_DATA_COPY_PAD), sizeof(mstxDataCopyPadDesc), &mstxDataCopyPadDesc);
}
}
}
#endif
#endif
#if defined(__UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_MSTX_LOCAL_TENSOR_INFO_H__)
#undef __ASCENDC_INCLUDE_INTERNAL_HEADERS__
#undef __UNDEF_ASCENDC_INCLUDE_INTERNAL_HEADERS_MSTX_LOCAL_TENSOR_INFO_H__
#endif