* Copyright (c) Huawei Technologies Co., Ltd. 2024-2024. All rights reserved.
*/
#include "kernel_operator.h"
using namespace AscendC;
namespace AscendC {
// Returns the smaller of `a` and `b`, converted to the type of `a`.
// The operands are compared with the built-in `>`; when `a` is not greater
// than `b` (including ties) `a` itself is returned.
template<typename Ta, typename Tb>
__aicore__ inline Ta min(const Ta a, const Tb b)
{
    if (!(a > b)) {
        return a;
    }
    // `b` is implicitly converted to the return type Ta.
    return b;
}
// Returns the larger of `a` and `b`, converted to the type of `a`.
// The operands are compared with the built-in `<`; when `a` is not less
// than `b` (including ties) `a` itself is returned.
template<typename Ta, typename Tb>
__aicore__ inline Ta max(const Ta a, const Tb b)
{
    if (!(a < b)) {
        return a;
    }
    // `b` is implicitly converted to the return type Ta.
    return b;
}
// Device-side implementation of a "unique" operator for element type T.
//
// Process() drives the stages in this order: per-tile load and sort, merge of
// the tiles owned by one block, merge across blocks, removal of repeated
// neighbours per tile, and the final copy to the output tensor. Values are
// handled as float internally: CopyIn casts non-float input to float and
// multiplies every value by -1, CopyOut multiplies by -1 again and casts back.
template<typename T>
class KernelUnique {
public:
    // Stores a reference to the pipe that Init() uses to set up calcBuf.
    __aicore__ inline KernelUnique(TPipe& pipe) : pipe(pipe) {}

    // Element offset of the first element owned by block `blockIdx`.
    __aicore__ inline size_t GetGlobalOffset(const uint32_t blockIdx);

    // Records the tiling parameters and binds every global/local buffer.
    // `tileLength` is part of the signature but is not read by Init.
    __aicore__ inline void Init(GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace,
        const uint32_t totalLength, const uint32_t shortBlockTileNum, const uint16_t tileLength,
        const uint16_t tailLength, const uint8_t aivNum, const uint8_t blockNum, const uint8_t shortBlockNum);

    // Runs the complete pipeline for the calling block.
    __aicore__ inline void Process();

private:
    __aicore__ inline void CopyIn(const int32_t progress);
    __aicore__ inline void Elem32Sort(const int32_t progress);
    __aicore__ inline void TileSort(const int32_t progress);

    // Copies `elemLength` elements from `src` to `dst`, staging them through `tmpLocal`.
    template<typename T1>
    __aicore__ inline static void DataCopyGM2GM(const GlobalTensor<T1>& dst, const GlobalTensor<T1>& src,
        const LocalTensor<T1>& tmpLocal, const int elemLength, const int bufByteLength);

    // Up to four source queues handed to MrgSortGM.
    using GMSSrcList = GlobalTensor<float> (&)[4];

    // Arguments of MrgSortGM: per-queue lengths, the number of queues that
    // are actually in use, and five local buffers (MrgSortGM uses buffLocal[0..3]
    // as merge inputs and buffLocal[4] as merge output / staging buffer).
    struct GMSParams {
        int (&GMSLengths)[4];
        uint8_t& queNum;
        LocalTensor<float> (&&buffLocal)[5];
    };

    __aicore__ inline static void MrgSortGM(GlobalTensor<float>&& dstGlobal, GMSSrcList& srcList, GMSParams& params);
    __aicore__ inline void BlockSortV2();
    __aicore__ inline void GlobalSortV2();

    // The mask parameter is a uint32_t tensor; it is named bitMask32 here to
    // match the out-of-class definition (it was declared as `bitMask16`).
    __aicore__ inline static void ConsecutiveUnique(const LocalTensor<float>& dstVal,
        const LocalTensor<float>& srcLocal, const LocalTensor<float>& shiftedLocal,
        const LocalTensor<uint32_t>& bitMask32, const uint16_t elemLength, uint64_t& tileUniqueCnt);

    __aicore__ inline void TileUnique(const int32_t progress);
    __aicore__ inline void CopyOut();

private:
    // Number of elements handled per tile.
    static constexpr int32_t TILE_LENGTH = 8192;
    // Sentinel written into the unused tail of the last tile.
    // NOTE(review): 3e+99 is outside the range of float; this relies on the
    // toolchain converting it to +infinity - confirm.
    static constexpr float FLOAT_INF = 3e+99;
    // Bytes per sort record: one float plus one uint32_t.
    static constexpr int16_t SORT_DATATYPE_SIZE = sizeof(float) + sizeof(uint32_t);
    // Size of one sort record expressed in floats.
    static constexpr int16_t SORT_DATATYPE_SIZE_FACTOR = SORT_DATATYPE_SIZE / sizeof(float);
    // Size of one tile of sort records, in bytes and in floats.
    static constexpr int32_t TILE_LEN_BYTE = TILE_LENGTH * SORT_DATATYPE_SIZE;
    static constexpr int32_t TILE_LEN_ELEM = TILE_LENGTH * SORT_DATATYPE_SIZE_FACTOR;
    // Valid-queue bit mask passed to MrgSort, indexed by the number of queues.
    static constexpr uint16_t VALID_QUE[5] = {
        0, 0, 0b11, 0b111, 0b1111};

    TPipe& pipe;
    // Three scratch buffers; Init() sizes each one to TILE_LEN_BYTE.
    TBuf<TPosition::VECIN> calcBuf[3];

    // Views of the input slice owned by this block.
    GlobalTensor<T> srcGlobal;
    GlobalTensor<uint32_t> srcGlobalAsUint;
    // Views of the whole output tensor and of the one-element count output.
    GlobalTensor<T> dstGlobal1;
    GlobalTensor<int32_t> dstGlobal1As32;
    GlobalTensor<int32_t> uniqueCntGlobal;
    // Views of this block's slice inside the two workspace sort buffers.
    GlobalTensor<float> sortedBlock1;
    GlobalTensor<int32_t> sortedBlock1AsInt;
    GlobalTensor<float> sortedBlock2;
    GlobalTensor<int32_t> sortedBlock2AsInt;
    // Views of the two complete workspace sort buffers.
    GlobalTensor<float> sortedGlobal1;
    GlobalTensor<float> sortedGlobal2;
    // Workspace area used by IBSet/IBWait, and the per-block unique counts.
    GlobalTensor<int32_t> IBSyncGlobal;
    GlobalTensor<uint32_t> blockUniqueCntGlobal;

    // The scalars below are assigned in Init(); they are value-initialised so
    // that a read before Init() (or before the first tile) is deterministic.
    uint16_t syncWorkspaceSize {0};
    uint8_t eventID {0};
    uint64_t blockUniqueCnt {0};
    // Last value kept by the most recent TileUnique() call that kept any.
    float lastTileUniqueVal {0.0f};
    uint32_t totalLength {0};
    uint32_t tileNum {0};
    uint32_t shortBlockTileNum {0};
    uint16_t tailLength {0};
    uint8_t blockNum {0};
    uint8_t shortBlockNum {0};
    size_t globalOffset {0};
    size_t blockLength {0};
    bool hasInfFlag {false};
};
// Returns the element offset of the first element owned by block `blockIdx`.
//
// Every block with an index below shortBlockNum accounts for
// shortBlockTileNum tiles; every block from shortBlockNum onwards accounts
// for one tile more. The tile count is scaled by TILE_LENGTH.
template<typename T>
__aicore__ inline size_t KernelUnique<T>::GetGlobalOffset(const uint32_t blockIdx)
{
    // How many blocks of each kind precede `blockIdx`.
    const uint32_t shortBlocksBefore = min(this->shortBlockNum, blockIdx);
    const uint32_t longBlocksBefore = (blockIdx > this->shortBlockNum) ? blockIdx - this->shortBlockNum : 0;

    const uint32_t tilesBefore =
        this->shortBlockTileNum * shortBlocksBefore + (this->shortBlockTileNum + 1) * longBlocksBefore;
    // The product is formed in 32-bit unsigned arithmetic before widening.
    return tilesBefore * TILE_LENGTH;
}
// Records the tiling parameters and binds all global-memory views and local
// scratch buffers for the calling block.
//
// Workspace layout as established below, in 4-byte units, with
// A = totalLength rounded up to a multiple of TILE_LENGTH:
//   [0, 2A)        first sort buffer   (sortedGlobal1 / sortedBlock1)
//   [2A, 4A)       second sort buffer  (sortedGlobal2 / sortedBlock2)
//   [4A, 4A + S)   IBSyncGlobal, S = syncWorkspaceSize
//   [4A + S, ...)  blockUniqueCntGlobal, blockNum rounded up to a multiple of 8
//
// `tileLength` is accepted for interface compatibility but is not read.
template<typename T>
__aicore__ inline void KernelUnique<T>::Init(GM_ADDR input, GM_ADDR output, GM_ADDR uniqueCnt, GM_ADDR workspace,
    const uint32_t totalLength, const uint32_t shortBlockTileNum, const uint16_t tileLength,
    const uint16_t tailLength, const uint8_t aivNum, const uint8_t blockNum, const uint8_t shortBlockNum)
{
    // Keep the tiling parameters for the later stages.
    this->totalLength = totalLength;
    this->shortBlockTileNum = shortBlockTileNum;
    this->tailLength = tailLength;
    this->blockNum = blockNum;
    this->shortBlockNum = shortBlockNum;

    // Blocks with an index below shortBlockNum own one tile less than the others.
    this->tileNum = (GetBlockIdx() < this->shortBlockNum) ? shortBlockTileNum : shortBlockTileNum + 1;
    this->blockLength = this->tileNum * TILE_LENGTH;
    this->globalOffset = GetGlobalOffset(GetBlockIdx());

    // Sizes and offsets shared by several views below.
    const uint32_t alignedTotalLength = (totalLength + TILE_LENGTH - 1) / TILE_LENGTH * TILE_LENGTH;
    const uint32_t sortRegionLen = alignedTotalLength * SORT_DATATYPE_SIZE_FACTOR;
    const size_t blockSortOffset = globalOffset * SORT_DATATYPE_SIZE_FACTOR;
    const size_t blockSortLen = this->blockLength * SORT_DATATYPE_SIZE_FACTOR;

    // Input slice of this block, typed as T and as raw 32-bit words.
    srcGlobal.SetGlobalBuffer((__gm__ T*)input + globalOffset, this->blockLength);
    srcGlobalAsUint.SetGlobalBuffer((__gm__ uint32_t*)input + globalOffset * sizeof(T) / sizeof(uint32_t),
        this->blockLength * sizeof(T) / sizeof(uint32_t));

    // Whole output tensor and the single-element count output.
    dstGlobal1.SetGlobalBuffer((__gm__ T*)output, alignedTotalLength);
    dstGlobal1As32.SetGlobalBuffer((__gm__ int32_t*)output, alignedTotalLength * sizeof(T) / sizeof(int32_t));
    uniqueCntGlobal.SetGlobalBuffer((__gm__ int32_t*)uniqueCnt, 1);

    // This block's slice inside each of the two sort buffers.
    sortedBlock1.SetGlobalBuffer((__gm__ float*)workspace + blockSortOffset, blockSortLen);
    sortedBlock1AsInt.SetGlobalBuffer((__gm__ int32_t*)workspace + blockSortOffset, blockSortLen);
    sortedBlock2.SetGlobalBuffer((__gm__ float*)workspace + sortRegionLen + blockSortOffset, blockSortLen);
    sortedBlock2AsInt.SetGlobalBuffer((__gm__ int32_t*)workspace + sortRegionLen + blockSortOffset, blockSortLen);

    // The two complete sort buffers.
    sortedGlobal1.SetGlobalBuffer((__gm__ float*)workspace, sortRegionLen);
    sortedGlobal2.SetGlobalBuffer((__gm__ float*)workspace + sortRegionLen, sortRegionLen);

    // Synchronisation words followed by the per-block unique counters.
    this->syncWorkspaceSize = (blockNum * 32 * 8 + aivNum * 32 + 32) / sizeof(int32_t);
    IBSyncGlobal.SetGlobalBuffer((__gm__ int32_t*)workspace + sortRegionLen * 2, syncWorkspaceSize);
    blockUniqueCntGlobal.SetGlobalBuffer(
        (__gm__ uint32_t*)workspace + sortRegionLen * 2 + syncWorkspaceSize, (blockNum + 7) / 8 * 8);

    // With more than one block, block 0 zeroes the synchronisation area.
    if (GetBlockNum() > 1) {
        if (GetBlockIdx() == 0) {
            InitGlobalMemory(IBSyncGlobal, syncWorkspaceSize, 0);
        }
        PipeBarrier<PIPE_ALL>();
    }

    // Three scratch buffers, each large enough for one tile of sort records.
    for (auto& buf : calcBuf) {
        pipe.InitBuffer(buf, TILE_LEN_BYTE);
    }
}
// Runs the complete pipeline for the calling block:
//   1. load and sort each tile on its own,
//   2. merge the sorted tiles of this block,
//   3. merge the sorted blocks (multi-block launches only),
//   4. drop repeated neighbours tile by tile,
//   5. copy the surviving values to the output.
template<typename T>
__aicore__ inline void KernelUnique<T>::Process()
{
    // Stage 1: per-tile load and sort.
    for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) {
        CopyIn(tileIdx);
        Elem32Sort(tileIdx);
        TileSort(tileIdx);
    }

    // Stage 2: merge this block's tiles. This has to happen for a single-block
    // launch as well: TileUnique only compares the last value kept from one
    // tile with the first value of the next, so the tiles of a block must form
    // one sorted run. (The merge used to be skipped when GetBlockNum() == 1.)
    if (this->tileNum > 1) {
        BlockSortV2();
    }

    // Stage 3: merge across blocks, fenced by SyncAll on both sides.
    if (GetBlockNum() > 1) {
        SyncAll();
        GlobalSortV2();
        SyncAll();
    }

    // For floating-point T the last block checks whether the last real element
    // of the sorted sequence equals -FLOAT_INF; TileUnique appends that value
    // explicitly when the flag is set.
    if ((IsSameType<T, bfloat16_t>::value || IsSameType<T, half>::value || IsSameType<T, float>::value) &&
        GetBlockIdx() == blockNum - 1) {
        if (sortedGlobal1.GetValue((totalLength - 1) * 2) == -FLOAT_INF) {
            hasInfFlag = true;
        }
    }

    // Stage 4: per-tile removal of repeated neighbours.
    for (int32_t tileIdx = 0; tileIdx < this->tileNum; tileIdx++) {
        TileUnique(tileIdx);
    }

    // Chain the blocks: each block waits for its predecessor before signalling
    // itself, so CopyOut runs after all lower-numbered blocks passed this point.
    if (this->blockNum > 1) {
        LocalTensor<int32_t> IBSyncLocal = calcBuf[0].Get<int32_t>();
        if (GetBlockIdx() != 0) {
            IBWait(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx() - 1, eventID);
        }
        IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID);
    }

    // Stage 5: write the result.
    CopyOut();
}
// Loads tile `progress` of this block into calcBuf[2] as float and multiplies
// every value by -1.
//
// A full tile is copied as is. The last tile of the last block, when
// tailLength != 0, is first filled with FLOAT_INF and then only tailLength
// real elements are copied in, so the unused slots keep the sentinel.
// Non-float input is staged in calcBuf[0] and cast to float.
template<typename T>
__aicore__ inline void KernelUnique<T>::CopyIn(const int32_t progress)
{
    LocalTensor<T> srcLocal = calcBuf[0].Get<T>();
    LocalTensor<float> sortedLocal2 = calcBuf[2].Get<float>();
    int32_t castLen;
    if ((progress != tileNum - 1) || (GetBlockIdx() != blockNum - 1) || tailLength == 0) {
        // Full tile.
        if constexpr (!IsSameType<T, float>::value) {
            DataCopy(srcLocal, srcGlobal[progress * TILE_LENGTH], TILE_LENGTH);
        } else {
            DataCopy(sortedLocal2, srcGlobal[progress * TILE_LENGTH], TILE_LENGTH);
        }
        castLen = TILE_LENGTH;
    } else {
        // Partial tail tile: pre-fill with the sentinel, then copy the real data.
        LocalTensor<uint32_t> srcAsUint = srcLocal.template ReinterpretCast<uint32_t>();
        Duplicate(sortedLocal2, FLOAT_INF, TILE_LENGTH);
        if constexpr (IsSameType<T, float>::value) {
            // The copy targets the buffer Duplicate just wrote, so wait for it.
            PipeBarrier<PIPE_ALL>();
            DataCopyPad(sortedLocal2, srcGlobal[progress * TILE_LENGTH],
                {1, static_cast<uint16_t>(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0});
        } else if constexpr (sizeof(T) >= sizeof(float)) {
            PipeBarrier<PIPE_V>();
            DataCopyPad(srcAsUint, srcGlobalAsUint[progress * TILE_LENGTH * sizeof(T) / sizeof(uint32_t)],
                {1, static_cast<uint16_t>(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0});
        } else {
            PipeBarrier<PIPE_V>();
            DataCopyPad(srcLocal, srcGlobal[progress * TILE_LENGTH],
                {1, static_cast<uint16_t>(sizeof(T) * tailLength), 0, 0}, {false, 0, 0, 0});
        }
        castLen = tailLength;
    }
    PipeBarrier<PIPE_ALL>();
    if constexpr (!IsSameType<T, float>::value) {
        // Only the first castLen elements are cast; the rest keeps the sentinel.
        if constexpr (sizeof(T) >= sizeof(float)) {
            Cast(sortedLocal2, srcLocal, RoundMode::CAST_ROUND, castLen);
        } else {
            Cast(sortedLocal2, srcLocal, RoundMode::CAST_NONE, castLen);
        }
        PipeBarrier<PIPE_V>();
    } else {
        // In the float tail case the data was copied straight into the
        // sentinel-filled buffer with padding disabled.
        // NOTE(review): per the DataCopyPad documentation the bytes that round
        // the burst up to a 32-byte block are not guaranteed to be preserved
        // when isPad is false - confirm. Rewriting the sentinel into those
        // (at most seven) slots is a no-op if they were left untouched.
        if (castLen != TILE_LENGTH) {
            constexpr int32_t FLOATS_PER_32B = 32 / sizeof(float);
            const int32_t paddedEnd = (castLen + FLOATS_PER_32B - 1) / FLOATS_PER_32B * FLOATS_PER_32B;
            if (paddedEnd != castLen) {
                for (int32_t i = castLen; i < paddedEnd; i++) {
                    sortedLocal2.SetValue(i, FLOAT_INF);
                }
                PipeBarrier<PIPE_ALL>();
            }
        }
    }
    // Negate the whole tile, sentinel slots included.
    Muls(sortedLocal2, sortedLocal2, (float)-1, TILE_LENGTH);
}
// First sorting step for tile `progress`: runs Sort32 over the float values in
// calcBuf[2] and writes the resulting records to calcBuf[1].
//
// The uint32 tensor handed to Sort32 lives in the second half of calcBuf[0]
// and is filled with one constant, the global element offset of the tile.
// Sort32 is issued in calls of at most 255 repeats of 32 elements each.
template<typename T>
__aicore__ inline void KernelUnique<T>::Elem32Sort(const int32_t progress)
{
    constexpr uint8_t sort32BatchSize = 32;
    constexpr uint8_t sort32RepeatLimit = 255;

    LocalTensor<T> stagingLocal = calcBuf[0].Get<T>();
    LocalTensor<float> recordsOut = calcBuf[1].Get<float>();
    LocalTensor<float> valuesIn = calcBuf[2].Get<float>();

    // Fill the index tensor with the tile's base offset.
    LocalTensor<int32_t> indexFill = stagingLocal.template ReinterpretCast<int32_t>()[TILE_LENGTH];
    int32_t tileBase = progress * TILE_LENGTH + this->globalOffset;
    Duplicate(indexFill, tileBase, TILE_LENGTH);
    PipeBarrier<PIPE_V>();
    LocalTensor<uint32_t> indexIn = indexFill.template ReinterpretCast<uint32_t>();

    // Each call starts where a full-size previous call would have ended.
    int callIdx = 0;
    for (int remaining = TILE_LENGTH; remaining != 0; callIdx++) {
        const int repeats = min(remaining / sort32BatchSize, sort32RepeatLimit);
        Sort32<float>(recordsOut[sort32BatchSize * sort32RepeatLimit * SORT_DATATYPE_SIZE_FACTOR * callIdx],
            valuesIn[sort32BatchSize * sort32RepeatLimit * callIdx],
            indexIn[sort32BatchSize * sort32RepeatLimit * callIdx], repeats);
        remaining -= repeats * sort32BatchSize;
    }
    PipeBarrier<PIPE_ALL>();
}
// Merges the sorted runs of one tile into a single sorted run and stores the
// tile at position `progress` of sortedBlock1.
//
// Starting from runs of 32 elements, every pass merges four neighbouring runs
// with MrgSort, alternating between calcBuf[1] and calcBuf[2] as source and
// destination, until the run length reaches TILE_LENGTH.
template<typename T>
__aicore__ inline void KernelUnique<T>::TileSort(const int32_t progress)
{
    LocalTensor<float> bufferA = calcBuf[1].Get<float>();
    LocalTensor<float> bufferB = calcBuf[2].Get<float>();
    LocalTensor<float> pingPong[2] = {bufferA, bufferB};

    // Index into pingPong of the buffer that currently holds the runs.
    bool srcSlot = false;
    uint16_t runCount = TILE_LENGTH / 32;
    for (uint16_t runLen = 32; runLen < TILE_LENGTH; runLen *= 4) {
        const uint16_t runLens[4] = {runLen, runLen, runLen, runLen};
        const uint16_t mergeRepeats = runCount / 4;
        if (mergeRepeats > 0) {
            // One record occupies two floats, so runs start 2 * runLen floats apart.
            const int runSpan = runLen * 2;
            LocalTensor<float>& runs = pingPong[srcSlot];
            MrgSort4Info mergeInfo = {runLens, false, 0b1111, mergeRepeats};
            MrgSort<float>(pingPong[!srcSlot],
                {runs[0], runs[runSpan], runs[runSpan * 2], runs[runSpan * 3]},
                mergeInfo);
            PipeBarrier<PIPE_ALL>();
            srcSlot = !srcSlot;
        }
        runCount = mergeRepeats;
    }

    // pingPong[srcSlot] holds the result of the last merge pass.
    DataCopy(sortedBlock1[progress * TILE_LEN_ELEM], pingPong[srcSlot], TILE_LEN_ELEM);
    PipeBarrier<PIPE_ALL>();
}
// Copies `elemLength` elements from `src` to `dst`, both global tensors,
// staging each chunk through `tmpLocal`.
//
// A chunk holds at most min(bufByteLength, 65535) bytes' worth of whole
// elements, so its byte length fits the 16-bit length passed to DataCopyPad.
// Nothing is copied when `elemLength` is zero or negative.
template<typename T>
template<typename T1>
__aicore__ inline void KernelUnique<T>::DataCopyGM2GM(const GlobalTensor<T1>& dst, const GlobalTensor<T1>& src,
    const LocalTensor<T1>& tmpLocal, const int elemLength, const int bufByteLength)
{
    const int chunkElems = min(bufByteLength, 65535) / sizeof(T1);
    for (int done = 0; done < elemLength;) {
        const int copyLen = min(elemLength - done, chunkElems);
        const uint16_t copyBytes = static_cast<uint16_t>(sizeof(T1) * copyLen);
        // Global -> local staging buffer.
        DataCopyPad(tmpLocal, src[done], {1, copyBytes, 0, 0}, {false, 0, 0, 0});
        PipeBarrier<PIPE_ALL>();
        // Local staging buffer -> global.
        DataCopyPad(dst[done], tmpLocal, {1, copyBytes, 0, 0});
        PipeBarrier<PIPE_ALL>();
        done += copyLen;
    }
}
// Merges up to four sorted queues that live in global memory into `dstGlobal`.
//
// params.queNum        number of queues in use (the first queNum entries of
//                      srcList and params.GMSLengths)
// params.GMSLengths    length of each queue, in records
// params.buffLocal     [0..3] per-queue input chunks, [4] merge output
//
// Each round loads one chunk per live queue, merges the chunks with MrgSort
// until one of them is exhausted, appends the merged records to `dstGlobal`
// and advances the queue heads by the amounts reported by GetMrgSortResult.
// A queue whose remaining length reaches zero is removed by shifting the
// later queues down; note that this rewrites the caller's `srcList` entries.
// When a single queue is left, its remainder is copied over unchanged.
template<typename T>
__aicore__ inline void KernelUnique<T>::MrgSortGM(
    GlobalTensor<float>&& dstGlobal, GMSSrcList& srcList, GMSParams& params)
{
    constexpr int MAX_QUE_NUM = 4;
    // Chunk length per queue, indexed by the number of live queues.
    // NOTE(review): 2 * 4095, 3 * 2730 and 4 * 2048 all stay within 8192
    // records; presumably that is the capacity of the output buffer - confirm.
    constexpr int BUFFER_LEN[5] {0, 0, 4095, 2730, 2048};

    uint8_t queNum = params.queNum;
    // Callers fill only the first queNum length slots, so the remaining slots
    // are left at zero instead of being read.
    int restLen[MAX_QUE_NUM] {};
    for (int i = 0; i < queNum && i < MAX_QUE_NUM; i++) {
        restLen[i] = params.GMSLengths[i];
    }
    int currentHead[MAX_QUE_NUM] {};
    int totalMrgLen {};
    uint16_t sortedLen[MAX_QUE_NUM] {};
    uint16_t mrgLen[MAX_QUE_NUM] {};

    while (queNum > 1) {
        int currentBufferLen = BUFFER_LEN[queNum];
        for (int i = 0; i < queNum; i++) {
            mrgLen[i] = min(restLen[i], currentBufferLen);
        }
        // Load the next chunk of every live queue.
        for (int i = 0; i < queNum; i++) {
            DataCopyPad(params.buffLocal[i], srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR],
                {1, static_cast<uint16_t>(sizeof(float) * mrgLen[i] * SORT_DATATYPE_SIZE_FACTOR), 0, 0},
                {false, 0, 0, 0});
        }
        PipeBarrier<PIPE_ALL>();

        // Merge; the second argument asks MrgSort to stop when a queue runs out.
        MrgSort4Info localParams {mrgLen, true, VALID_QUE[queNum], 1};
        MrgSort<float>(params.buffLocal[4],
            {params.buffLocal[0], params.buffLocal[1], params.buffLocal[2], params.buffLocal[3]}, localParams);
        PipeBarrier<PIPE_ALL>();

        // Number of records consumed from each queue in this round.
        GetMrgSortResult(sortedLen[0], sortedLen[1], sortedLen[2], sortedLen[3]);
        const uint16_t localMrgLen = sortedLen[0] + sortedLen[1] + sortedLen[2] + sortedLen[3];
        DataCopyPad(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR], params.buffLocal[4],
            {1, static_cast<uint16_t>(sizeof(float) * localMrgLen * SORT_DATATYPE_SIZE_FACTOR), 0, 0});
        PipeBarrier<PIPE_ALL>();
        totalMrgLen += localMrgLen;

        for (int i = 0; i < queNum; i++) {
            restLen[i] -= sortedLen[i];
            currentHead[i] += sortedLen[i];
        }
        // Remove the first fully consumed queue, if any.
        for (int i = 0; i < queNum; i++) {
            if (restLen[i] == 0) {
                for (int j = i; j < 3; j++) {
                    restLen[j] = restLen[j + 1];
                    currentHead[j] = currentHead[j + 1];
                    srcList[j] = srcList[j + 1];
                }
                restLen[3] = 0;
                queNum--;
                break;
            }
        }
    }

    // Append whatever is left of the first non-empty queue.
    for (int i = 0; i < params.queNum; i++) {
        if (restLen[i] > 0) {
            DataCopyGM2GM(dstGlobal[totalMrgLen * SORT_DATATYPE_SIZE_FACTOR],
                srcList[i][currentHead[i] * SORT_DATATYPE_SIZE_FACTOR], params.buffLocal[4],
                restLen[i] * SORT_DATATYPE_SIZE_FACTOR, TILE_LEN_BYTE);
            break;
        }
    }
}
// Merges the individually sorted tiles of this block into one sorted run.
//
// Each pass of the outer loop merges groups of up to PREFIX_QUE_NUM runs of
// `bindTile` tiles each through MrgSortGM, reading from one of
// sortedBlock1/sortedBlock2 and writing to the other. If the last pass wrote
// to sortedBlock2, the result is copied back so it always ends in sortedBlock1.
template<typename T>
__aicore__ inline void KernelUnique<T>::BlockSortV2()
{
    LocalTensor<float> sortedLocal1 = calcBuf[0].Get<float>();
    LocalTensor<float> sortedLocal2 = calcBuf[1].Get<float>();
    LocalTensor<float> mrgLocal = calcBuf[2].Get<float>();
    GlobalTensor<float> sortedBlock[2] = {sortedBlock1, sortedBlock2};
    constexpr uint8_t PREFIX_QUE_NUM = 4;
    // Index into sortedBlock of the buffer that currently holds the runs.
    bool switchFlag = false;
    GlobalTensor<float> srcGlobal[4];
    // Zero-initialised so that slots beyond queNum never hold indeterminate values.
    int lengths[4] {};
    for (int bindTile = 1; bindTile < tileNum; bindTile *= PREFIX_QUE_NUM) {
        for (int tileIdx = 0; tileIdx < tileNum; tileIdx += bindTile * PREFIX_QUE_NUM) {
            // Tiles covered by this group and the number of runs they form.
            int mrgTileNum = min(tileNum - tileIdx, bindTile * PREFIX_QUE_NUM);
            uint8_t queNum = (mrgTileNum + bindTile - 1) / bindTile;
            // Tiles in the last run of the group. This must be a full-width
            // integer: bindTile reaches 256 and more for large blocks, and an
            // 8-bit counter would wrap `bindTile` to 0 and drop the run.
            int lastQueTileNum = mrgTileNum % bindTile;
            if (lastQueTileNum == 0) {
                lastQueTileNum = bindTile;
            }
            for (int i = 0; i < queNum; i++) {
                srcGlobal[i] = sortedBlock[switchFlag][TILE_LEN_ELEM * (tileIdx + bindTile * i)];
            }
            for (int i = 0; i < queNum - 1; i++) {
                lengths[i] = TILE_LENGTH * bindTile;
            }
            lengths[queNum - 1] = TILE_LENGTH * lastQueTileNum;
            GMSSrcList srcList {srcGlobal};
            GMSParams params {lengths, queNum,
                {sortedLocal1, sortedLocal1[TILE_LENGTH], sortedLocal2, sortedLocal2[TILE_LENGTH], mrgLocal}};
            MrgSortGM(sortedBlock[!switchFlag][TILE_LEN_ELEM * tileIdx], srcList, params);
        }
        switchFlag = !switchFlag;
    }
    // An odd number of passes leaves the result in sortedBlock2; move it back.
    if (switchFlag) {
        DataCopyGM2GM(sortedBlock1, sortedBlock2, sortedLocal1, blockLength * SORT_DATATYPE_SIZE_FACTOR, TILE_LEN_BYTE);
    }
}
// Merges the per-block sorted runs into one sorted sequence spanning all blocks.
//
// Each pass of the outer loop handles groups of up to PREFIX_QUE_NUM runs of
// `bindBlock` blocks each, reading from one of sortedGlobal1/sortedGlobal2 and
// writing to the other. Inside a group, the block whose index equals `blockIdx`
// performs the merge through MrgSortGM; the blocks at
// blockIdx + bindBlock * {1, 2, 3} only call IBSet, and the merging block calls
// IBWait on each of them before it starts. All other blocks do nothing in that
// pass. `eventID` is incremented once per pass.
// NOTE(review): IBSet/IBWait are presumably the inter-block signal/wait
// primitives; their exact semantics are not visible in this file - confirm.
template<typename T>
__aicore__ inline void KernelUnique<T>::GlobalSortV2()
{
LocalTensor<float> sortedLocal1 = calcBuf[0].Get<float>();
LocalTensor<float> sortedLocal2 = calcBuf[1].Get<float>();
LocalTensor<float> mrgLocal = calcBuf[2].Get<float>();
// Local scratch handed to IBSet/IBWait; it aliases calcBuf[1].
LocalTensor<int32_t> IBSyncLocal = sortedLocal2.ReinterpretCast<int32_t>();
GlobalTensor<float> sortedGlobal[2] = {sortedGlobal1, sortedGlobal2};
constexpr uint8_t PREFIX_QUE_NUM = 4;
// Index into sortedGlobal of the buffer that currently holds the runs.
bool switchFlag = false;
GlobalTensor<float> srcGlobal[4];
// Only the first queNum entries are assigned before MrgSortGM is called.
int lengths[4];
for (int bindBlock = 1; bindBlock < blockNum; bindBlock *= PREFIX_QUE_NUM, eventID++) {
for (int blockIdx = 0; blockIdx < blockNum; blockIdx += bindBlock * PREFIX_QUE_NUM) {
if ((GetBlockIdx() == blockIdx + bindBlock) || (GetBlockIdx() == blockIdx + bindBlock * 2) ||
(GetBlockIdx() == blockIdx + bindBlock * 3)) {
// This block heads the 2nd, 3rd or 4th run of the group: signal only.
PipeBarrier<PIPE_ALL>();
IBSet(IBSyncGlobal, IBSyncLocal, (int32_t)GetBlockIdx(), eventID);
PipeBarrier<PIPE_ALL>();
} else if (GetBlockIdx() == blockIdx) {
// This block heads the group: wait for the other run heads, then merge.
int mrgBlockNum = min(blockNum - blockIdx, bindBlock * PREFIX_QUE_NUM);
uint8_t queNum = (mrgBlockNum + bindBlock - 1) / bindBlock;
for (int i = 1; i < queNum; i++) {
PipeBarrier<PIPE_ALL>();
IBWait(IBSyncGlobal, IBSyncLocal, (int32_t)blockIdx + (bindBlock * i), eventID);
PipeBarrier<PIPE_ALL>();
}
// Number of blocks in the last run of the group (a full run when the remainder is zero).
uint8_t lastQueBlockNum = mrgBlockNum % bindBlock;
if (lastQueBlockNum == 0) {
lastQueBlockNum = bindBlock;
}
// Start of every run inside the source buffer.
for (int i = 0; i < queNum; i++) {
srcGlobal[i] =
sortedGlobal[switchFlag][GetGlobalOffset(blockIdx + bindBlock * i) * SORT_DATATYPE_SIZE_FACTOR];
}
// Run lengths are differences of block start offsets.
for (int i = 0; i < queNum - 1; i++) {
lengths[i] =
GetGlobalOffset(blockIdx + (bindBlock * (i + 1))) - GetGlobalOffset(blockIdx + (bindBlock * i));
}
lengths[queNum - 1] = GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)) + lastQueBlockNum) -
GetGlobalOffset(blockIdx + (bindBlock * (queNum - 1)));
GMSSrcList srcList {srcGlobal};
GMSParams params {lengths, queNum,
{sortedLocal1, sortedLocal1[TILE_LENGTH], sortedLocal2, sortedLocal2[TILE_LENGTH], mrgLocal}};
MrgSortGM(
sortedGlobal[!switchFlag][GetGlobalOffset(blockIdx) * SORT_DATATYPE_SIZE_FACTOR], srcList, params);
}
}
// Every block toggles the flag, whether or not it took part in the pass.
switchFlag = !switchFlag;
}
// After an odd number of passes the result sits in the second buffer: swap the
// member handles so that sortedGlobal1/sortedBlock1 refer to it from here on.
if (switchFlag) {
GlobalTensor<float> tmpGlobal = sortedGlobal1;
sortedGlobal1 = sortedGlobal2;
sortedGlobal2 = tmpGlobal;
GlobalTensor<float> tmpGlobal1 = sortedBlock1;
sortedBlock1 = sortedBlock2;
sortedBlock2 = tmpGlobal1;
GlobalTensor<int32_t> tmpGlobal2 = sortedBlock1AsInt;
sortedBlock1AsInt = sortedBlock2AsInt;
sortedBlock2AsInt = tmpGlobal2;
}
}
// Removes repeated neighbours from one tile of sort records.
//
// dstVal         receives the extracted values and, finally, the kept values
// srcLocal       sort records of the tile
// shiftedLocal   scratch: the values shifted down by one position
// bitMask32      scratch for the bit masks (also accessed as uint16_t)
// elemLength     number of elements in the tile
// tileUniqueCnt  out: count reported by the final GatherMask
//
// An element is kept when it differs from its successor; the successor of the
// last element is set to -FLOAT_INF.
// NOTE(review): the first GatherMask uses built-in pattern 1, presumably to
// pick the value out of every (value, index) record - confirm against the
// GatherMask documentation.
template<typename T>
__aicore__ inline void KernelUnique<T>::ConsecutiveUnique(const LocalTensor<float>& dstVal,
    const LocalTensor<float>& srcLocal, const LocalTensor<float>& shiftedLocal, const LocalTensor<uint32_t>& bitMask32,
    const uint16_t elemLength, uint64_t& tileUniqueCnt)
{
    LocalTensor<uint16_t> maskWords = bitMask32.ReinterpretCast<uint16_t>();
    uint64_t ignoredCnt = 0;

    // Step 1: extract the values from the records.
    const uint16_t recordRepeats = static_cast<uint16_t>((elemLength * 2 + 63) / 64);
    GatherMask(dstVal, srcLocal, 1, false, 0, {1, recordRepeats, 8, 0}, ignoredCnt);
    PipeBarrier<PIPE_V>();

    // Step 2: build a mask with every bit set except bit 0 and use it to
    // gather the values without the first one into shiftedLocal.
    Duplicate(maskWords, (uint16_t)0xFFFF, elemLength / 16);
    PipeBarrier<PIPE_V>();
    maskWords.SetValue(0, 0xFFFE);
    GatherMask(shiftedLocal, dstVal, bitMask32, true, elemLength, {1, 1, 8, 8}, ignoredCnt);
    PipeBarrier<PIPE_V>();

    // Step 3: give the last element a successor and compare neighbours.
    shiftedLocal.SetValue(elemLength - 1, -FLOAT_INF);
    const int compareCnt = (elemLength + 63) / 64 * 64;
    Compare(maskWords, dstVal, shiftedLocal, CMPMODE::NE, compareCnt);
    PipeBarrier<PIPE_V>();

    // Step 4: keep the elements whose compare bit is set.
    GatherMask(dstVal, dstVal, bitMask32, true, elemLength, {1, 1, 8, 8}, tileUniqueCnt);
    PipeBarrier<PIPE_V>();
}
// Removes repeated values from tile `progress` of this block's sorted data and
// appends the kept values to the front of sortedBlock1.
//
// blockUniqueCnt accumulates the number of values kept so far in this block;
// when the first kept value of a tile equals the last kept value of an earlier
// tile, the count is reduced by one so the value is stored only once. After
// the last tile the count is written to blockUniqueCntGlobal[GetBlockIdx()].
template<typename T>
__aicore__ inline void KernelUnique<T>::TileUnique(const int32_t progress)
{
    // calcBuf[0]: first half is the bit mask, second half the shifted values
    // (reused later as the one-element count staging tensor).
    LocalTensor<uint32_t> bitMask32 = calcBuf[0].Get<uint32_t>();
    LocalTensor<float> shiftedLocal = bitMask32[TILE_LENGTH].ReinterpretCast<float>();
    LocalTensor<float> sortedLocal1 = calcBuf[1].Get<float>();
    LocalTensor<float> sortedLocal2 = calcBuf[2].Get<float>();
    LocalTensor<uint32_t> uniqueCntLocal = shiftedLocal.ReinterpretCast<uint32_t>();
    uint64_t tileUniqueCnt = 0;

    DataCopy(sortedLocal1, sortedBlock1[progress * TILE_LEN_ELEM], TILE_LEN_ELEM);
    PipeBarrier<PIPE_ALL>();
    ConsecutiveUnique(sortedLocal2, sortedLocal1, shiftedLocal, bitMask32, TILE_LENGTH, tileUniqueCnt);
    PipeBarrier<PIPE_ALL>();

    // When hasInfFlag is set, -FLOAT_INF is appended once after the last tile.
    if ((progress == tileNum - 1) && hasInfFlag) {
        sortedLocal2.SetValue(tileUniqueCnt, -FLOAT_INF);
        tileUniqueCnt++;
    }
    PipeBarrier<PIPE_ALL>();

    if (tileUniqueCnt != 0) {
        // lastTileUniqueVal is only meaningful once an earlier tile has kept
        // at least one value. Testing `progress != 0` alone would compare
        // against a never-assigned value when all earlier tiles kept nothing.
        const bool hasPreviousValue = (blockUniqueCnt != 0);
        blockUniqueCnt += tileUniqueCnt;
        if (hasPreviousValue && lastTileUniqueVal == sortedLocal2.GetValue(0)) {
            blockUniqueCnt--;
        }
        // On a boundary match the write starts one slot earlier and overwrites
        // the previous last value with the equal first value of this tile.
        DataCopyPad(sortedBlock1[blockUniqueCnt - tileUniqueCnt], sortedLocal2,
            {1, static_cast<uint16_t>(sizeof(float) * tileUniqueCnt), 0, 0});
        PipeBarrier<PIPE_ALL>();
        lastTileUniqueVal = sortedLocal2.GetValue(tileUniqueCnt - 1);
    }

    // Publish this block's count after its last tile.
    if (progress == tileNum - 1) {
        uniqueCntLocal.SetValue(0, blockUniqueCnt);
        DataCopyPad(blockUniqueCntGlobal[GetBlockIdx()], uniqueCntLocal,
            {1, static_cast<uint16_t>(sizeof(uint32_t) * 1), 0, 0});
        PipeBarrier<PIPE_ALL>();
    }
}
// Writes this block's kept values to the output tensor and, on the last
// block, the total number of kept values to uniqueCntGlobal.
//
// The output position of this block is the sum of the counts of all
// lower-numbered blocks, reduced by one for every block boundary where the
// last kept value of one block equals the first value of the next. Values are
// multiplied by -1 again and, for non-float T, cast back to T before they are
// stored.
template<typename T>
__aicore__ inline void KernelUnique<T>::CopyOut()
{
    LocalTensor<T> copyLocal0 = calcBuf[0].Get<T>();
    LocalTensor<float> copyLocal1 = calcBuf[1].Get<float>();

    // Output offset contributed by the lower-numbered blocks.
    uint64_t lastAccUniqueCnt = 0;
    for (int i = 0; i < GetBlockIdx(); i++) {
        const uint64_t lastUniqueCnt = blockUniqueCntGlobal.GetValue(i);
        lastAccUniqueCnt += lastUniqueCnt;
        // A block that kept nothing has no last value to compare with; without
        // this guard `lastUniqueCnt - 1` would wrap to a huge index.
        if (lastUniqueCnt == 0) {
            continue;
        }
        if (sortedGlobal1[GetGlobalOffset(i + 1) * SORT_DATATYPE_SIZE_FACTOR].GetValue(0) ==
            sortedGlobal1[GetGlobalOffset(i) * SORT_DATATYPE_SIZE_FACTOR].GetValue(lastUniqueCnt - 1)) {
            lastAccUniqueCnt--;
        }
    }

    uint64_t thisUniqueCnt = blockUniqueCntGlobal.GetValue(GetBlockIdx());
    uint64_t restLen = thisUniqueCnt;
    // Chunk size is bounded by the larger of the two element sizes involved.
    constexpr uint64_t bottleneckTypeSize = sizeof(T) > sizeof(float) ? sizeof(T) : sizeof(float);
    LocalTensor<int32_t> copyVal32 = copyLocal0.template ReinterpretCast<int32_t>();
    LocalTensor<int32_t> uniqueVal32 = copyLocal1.ReinterpretCast<int32_t>();
    while (restLen > 0) {
        // A chunk must fit one scratch buffer and the 16-bit byte length of DataCopyPad.
        uint64_t copyLen = min(restLen, TILE_LEN_BYTE / bottleneckTypeSize);
        copyLen = min(copyLen, 65535 / bottleneckTypeSize);
        if constexpr (!IsSameType<T, float>::value) {
            DataCopyPad(copyLocal1, sortedBlock1[thisUniqueCnt - restLen],
                {1, static_cast<uint16_t>(sizeof(float) * copyLen), 0, 0}, {false, 0, 0, 0});
            PipeBarrier<PIPE_ALL>();
            Muls(copyLocal1, copyLocal1, (float)-1, copyLen);
            PipeBarrier<PIPE_V>();
            Cast(copyLocal0, copyLocal1, RoundMode::CAST_RINT, copyLen);
            PipeBarrier<PIPE_ALL>();
        } else {
            DataCopyPad(copyLocal0, sortedBlock1[thisUniqueCnt - restLen],
                {1, static_cast<uint16_t>(sizeof(float) * copyLen), 0, 0}, {false, 0, 0, 0});
            PipeBarrier<PIPE_ALL>();
            Muls(copyLocal0, copyLocal0, (float)-1, copyLen);
            PipeBarrier<PIPE_V>();
        }
        // Element types wider than 4 bytes are stored through the 32-bit view.
        if constexpr (sizeof(T) > 4) {
            DataCopyPad(dstGlobal1As32[(lastAccUniqueCnt + thisUniqueCnt - restLen) * sizeof(T) / sizeof(uint32_t)],
                copyVal32, {1, static_cast<uint16_t>(sizeof(T) * copyLen), 0, 0});
        } else {
            DataCopyPad(dstGlobal1[lastAccUniqueCnt + thisUniqueCnt - restLen], copyLocal0,
                {1, static_cast<uint16_t>(sizeof(T) * copyLen), 0, 0});
        }
        PipeBarrier<PIPE_ALL>();
        restLen -= copyLen;
    }

    // The last block knows the grand total and publishes it.
    if (GetBlockIdx() == blockNum - 1) {
        uniqueVal32.SetValue(0, lastAccUniqueCnt + thisUniqueCnt);
        DataCopyPad(uniqueCntGlobal, uniqueVal32, {1, static_cast<uint16_t>(sizeof(uint32_t) * 1), 0, 0});
        PipeBarrier<PIPE_ALL>();
    }
}
}