/*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

#include "kernel_operator.h"
#include "mixkernels/pad/tiling/tiling_data.h"
#include "mixkernels/utils/common/kernel/kernel_utils.h"
using namespace AscendC;

constexpr int32_t BUFFER_NUM = 1;
constexpr int32_t ELE_PER_BLK = 16;
constexpr int32_t MAX_BATCH_NUM = 64;

class KernelPad {
public:
    __aicore__ inline KernelPad() {}
    __aicore__ inline void Init(GM_ADDR tmpOut,
                                GM_ADDR paddingOffset, GM_ADDR seqLen,
                                GM_ADDR inputIds, GM_ADDR out,
                                uint32_t padLength, uint32_t batch, uint32_t hiddenDim)
    {
        padLength_ = padLength;
        batch_ = batch;
        hiddenDim_ = hiddenDim;
        hiddenDimAlign_ = (hiddenDim + ELE_PER_BLK - 1) / ELE_PER_BLK * ELE_PER_BLK;
        padLengthAlign_ = ((padLength_ + ELE_PER_BLK - 1) / ELE_PER_BLK) * ELE_PER_BLK;
        batchAlign_ = ((batch_ + ELE_PER_BLK - 1) / ELE_PER_BLK) * ELE_PER_BLK;
        uint32_t tokenNum  = 0;
        for (uint32_t i = 0; i < batch; i++) {
            tokenNum += *((__gm__ int32_t *)(seqLen) + i);
        }
        tmpOutGm.SetGlobalBuffer((__gm__ half *)tmpOut, tokenNum * hiddenDim_);
        seqLenGm.SetGlobalBuffer((__gm__ int32_t *)seqLen, tokenNum);
        outGm.SetGlobalBuffer((__gm__ half *)out, batch_ * hiddenDim_);
        pipe_.InitBuffer(tmpOutQueue_, BUFFER_NUM, hiddenDimAlign_ * sizeof(half));
        pipe_.InitBuffer(seqLenQueue_, BUFFER_NUM, MAX_BATCH_NUM * sizeof(int32_t));
        pipe_.InitBuffer(outQueue_, BUFFER_NUM, hiddenDimAlign_ * sizeof(half));
    }
    __aicore__ inline void Process()
    {
        for (int64_t i = 0; i < batch_; i++) {
            CopyOnce();
            AscendC::PipeBarrier<PIPE_ALL>();
            CopyIn(i);
            AscendC::SetFlag<HardEvent::MTE2_V>(EVENT_ID0);
            AscendC::WaitFlag<HardEvent::MTE2_V>(EVENT_ID0);
            ComputeRemovePadding();
            AscendC::SetFlag<HardEvent::V_MTE2>(EVENT_ID0);
            AscendC::WaitFlag<HardEvent::V_MTE2>(EVENT_ID0);
            CopyOut(i);
        }
    }

private:
    __aicore__ inline void CopyOnce()
    {
        LocalTensor<int32_t> seqLenLocal = seqLenQueue_.AllocTensor<int32_t>();
        DataCopy(seqLenLocal, seqLenGm, batchAlign_);
        seqLenQueue_.EnQue(seqLenLocal);
    }
    __aicore__ inline void CopyIn(uint64_t progress)
    {
        LocalTensor<int32_t> seqLenLocal = seqLenQueue_.DeQue<int32_t>();
        LocalTensor<half> tmpOutLocal = tmpOutQueue_.AllocTensor<half>();
        if (seqLenLocal.GetValue(progress) == 0) {
            offsetUnusedBathch++;
        } else {
            tempVal_ = tempVal_ + seqLenLocal.GetValue(progress);
            AscendC::PipeBarrier<PIPE_ALL>();
            DataCopy(tmpOutLocal, tmpOutGm[(tempVal_ - 1) * hiddenDim_], hiddenDimAlign_);
        }
        tmpOutQueue_.EnQue(tmpOutLocal);
        seqLenQueue_.FreeTensor(seqLenLocal);
    }

    __aicore__ inline void ComputeRemovePadding()
    {
        LocalTensor<half> tmpOutLocal = tmpOutQueue_.DeQue<half>();
        LocalTensor<half> outLocal = outQueue_.AllocTensor<half>();
        DataCopy(outLocal, tmpOutLocal, hiddenDimAlign_);
        tmpOutQueue_.FreeTensor(tmpOutLocal);
        outQueue_.EnQue(outLocal);
    }
    __aicore__ inline void CopyOut(uint64_t progress)
    {
        LocalTensor<half> outLocal = outQueue_.DeQue<half>();
        uint64_t offset = (progress - offsetUnusedBathch) <= 0 ?
            0 : (progress - offsetUnusedBathch) * hiddenDim_;
        DataCopy(outGm[offset], outLocal, hiddenDimAlign_);
        outQueue_.FreeTensor(outLocal);
    }
private:
    TPipe pipe_;
    TQue<QuePosition::VECIN, BUFFER_NUM> tmpOutQueue_, paddingOffsetQueue_, seqLenQueue_, inputIdsQueue_;
    TQue<QuePosition::VECOUT, BUFFER_NUM> outQueue_;
    GlobalTensor<half> tmpOutGm, outGm;
    GlobalTensor<int32_t> paddingOffsetGm, seqLenGm, inputIdsGm;
    uint32_t padLength_{1};
    uint32_t batch_{1};
    uint32_t padLengthAlign_{16};
    uint32_t batchAlign_{8};
    uint32_t hiddenDim_{16};
    uint32_t hiddenDimAlign_{16};
    uint32_t tempVal_ = 0;
    uint64_t offsetUnusedBathch = 0;
};

inline __aicore__ void InitTilingData(const __gm__ uint8_t *p_tilingdata, AtbOps::PadTilingData *tilingdata)
{
#if defined(__CCE_KT_TEST__) || (defined(__CCE_AICORE__) && __CCE_AICORE__ == 220)
    tilingdata->padLength = (*(const __gm__ uint32_t *)(p_tilingdata + 0));
    tilingdata->batch = (*(const __gm__ uint32_t *)(p_tilingdata + 4));
    tilingdata->hiddenDim = (*(const __gm__ uint32_t *)(p_tilingdata + 8));
#else
    AscendC::TPipe pipe;
    __ubuf__ uint8_t *tilingdata_in_ub = nullptr;
    CopyGmTilingToUb(tilingdata_in_ub, p_tilingdata, sizeof(AtbOps::PadTilingData), &pipe);
    AscendC::PipeBarrier<PIPE_ALL>();
    tilingdata->padLength = (*(__ubuf__ uint32_t *)(tilingdata_in_ub + 0));
    tilingdata->batch = (*(__ubuf__ uint32_t *)(tilingdata_in_ub + 4));
    tilingdata->hiddenDim = (*(__ubuf__ uint32_t *)(tilingdata_in_ub + 8));
    AscendC::PipeBarrier<PIPE_ALL>();
#endif
}

extern "C" __global__ __aicore__ void pad(GM_ADDR tmpOut,
                                          GM_ADDR paddingOffset, GM_ADDR seqLen,
                                          GM_ADDR inputIds, GM_ADDR out,
                                          GM_ADDR workspace, GM_ADDR tiling)
{
    AtbOps::PadTilingData tilingData;
    InitTilingData(tiling, &(tilingData));
    KernelPad op;
    op.Init(tmpOut, paddingOffset, seqLen, inputIds,
            out, tilingData.padLength, tilingData.batch, tilingData.hiddenDim);
    op.Process();
}