ascend-transformer-boost/src/kernels/mixkernels/unpad/op_kernel/unpad.cpp-代码预览-ascend-transformer-boost:基于华为Ascend AI处理器的Transformer模型加速库项目 - AtomGit

cann-robotcleancode #if 或 #elif 预处理指令中的常量表达式被求值前应确保其使用的标识符已定义第三部分
/*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

#include "kernel_operator.h"
#include "mixkernels/unpad/tiling/tiling_data.h"
#include "mixkernels/utils/common/kernel/kernel_utils.h"
using namespace AscendC;

constexpr int32_t BUFFER_NUM = 1;
constexpr int32_t ELE_PER_BLK = 8;
constexpr int32_t ELE_PER_BLK_INT64 = 4;
constexpr int32_t MAX_BATCH_NUM = 64;

class KernelUnpad {
public:
    __aicore__ inline KernelUnpad() {}
    __aicore__ inline void Init(GM_ADDR input_ids,
                                GM_ADDR cum_offsets_now, GM_ADDR token_num,
                                GM_ADDR seq_len, GM_ADDR x_remove_padding,
                                GM_ADDR cum_offsets_out, GM_ADDR padding_offset,
                                uint32_t padLength_, uint32_t batch)
    {
        this->padLength_ = padLength_;
        this->batch_ = batch;
        inputIdsGm.SetGlobalBuffer((__gm__ int64_t *)input_ids, padLength_ * batch_);
        cumOffsetsNowGm.SetGlobalBuffer((__gm__ int32_t *)cum_offsets_now, batch_);
        tokenNumGm.SetGlobalBuffer((__gm__ int64_t *)token_num, 1);
        seqLenGm.SetGlobalBuffer((__gm__ int32_t *)seq_len, batch_);
        xRemovePaddingGm.SetGlobalBuffer((__gm__ int64_t *)x_remove_padding, padLength_ * batch_);
        cumOffsetOutGm.SetGlobalBuffer((__gm__ int32_t *)cum_offsets_out, batch_);
        paddingOffsetGm.SetGlobalBuffer((__gm__ int32_t *)padding_offset, padLength_ * batch_);
        padLengthAlign_ = ((padLength_ + ELE_PER_BLK - 1) / ELE_PER_BLK) * ELE_PER_BLK;
        padLengthAlignInt64_ = ((padLength_ + ELE_PER_BLK_INT64 - 1) / ELE_PER_BLK_INT64) * ELE_PER_BLK_INT64;
        batchAlign_ = ((batch_ + ELE_PER_BLK - 1) / ELE_PER_BLK) * ELE_PER_BLK;
        pipe_.InitBuffer(inputIdsQueue_, BUFFER_NUM, padLengthAlign_ * sizeof(int64_t));
        pipe_.InitBuffer(cumOffsetsQueue_, BUFFER_NUM, MAX_BATCH_NUM * sizeof(int32_t));
        pipe_.InitBuffer(seqLenQueue_, BUFFER_NUM, MAX_BATCH_NUM * sizeof(int32_t));
        pipe_.InitBuffer(xRemovePaddingQueue_, padLengthAlign_ * sizeof(int64_t));
        pipe_.InitBuffer(cumOffsetOutQueue_, BUFFER_NUM, MAX_BATCH_NUM * sizeof(int32_t));
        pipe_.InitBuffer(broadCastBuf_, padLengthAlign_ *  sizeof(int32_t));
        pipe_.InitBuffer(cumOffsetsBuf_, MAX_BATCH_NUM *  sizeof(int32_t));
        pipe_.InitBuffer(seqLenBuf_, MAX_BATCH_NUM *  sizeof(int32_t));
    }

    __aicore__ inline void Process()
    {
        AscendC::PipeBarrier<PIPE_MTE2>();
        AscendC::PipeBarrier<PIPE_ALL>();
        for (uint64_t i = 0; i < batch_; i++) {
            CopyIn(i);
            CopyOnce();
            AscendC::PipeBarrier<PIPE_MTE2>();
            ComputeOnce(i);
            ComputeRemovePadding();
            AscendC::PipeBarrier<PIPE_V>();
            CopyOut(i);
            CopyOutOnce(i);
        }
        ClearStep();
    }

private:
    __aicore__ inline void CopyOnce()
    {
        LocalTensor<int32_t> seqLenLocal = seqLenQueue_.AllocTensor<int32_t>();
        AscendC::LocalTensor<int32_t> cumOffsetsBuffer = cumOffsetsBuf_.Get<int32_t>();
        AscendC::LocalTensor<int32_t> seqLenBufBuffer = seqLenBuf_.Get<int32_t>();
        DataCopy(seqLenLocal, seqLenGm, batchAlign_);
        DataCopy(cumOffsetsBuffer, cumOffsetsNowGm, batchAlign_);
        DataCopy(seqLenBufBuffer, seqLenGm, batchAlign_);
        seqLenQueue_.EnQue(seqLenLocal);
    }

    __aicore__ inline void ComputeOnce(uint32_t progress)
    {
        AscendC::LocalTensor<int32_t> cumOffsetsBuffer = cumOffsetsBuf_.Get<int32_t>();
        LocalTensor<int32_t> seqLenLocal = seqLenQueue_.DeQue<int32_t>();
        AscendC::LocalTensor<int32_t> broadCast = broadCastBuf_.Get<int32_t>();
        AscendC::LocalTensor<int32_t> cumOffsetOutLocal = cumOffsetOutQueue_.AllocTensor<int32_t>();
        seqLenZero_ = seqLenLocal.GetValue(0);
        if (progress == 0) {
            Duplicate(broadCast, (int32_t)0, padLengthAlign_);
            AscendC::PipeBarrier<PIPE_ALL>();
        } else {
            Duplicate(broadCast, (int32_t)cumOffsetsBuffer.GetValue(progress - 1), padLengthAlign_);
            AscendC::PipeBarrier<PIPE_V>();
        }
        AscendC::PipeBarrier<PIPE_ALL>();
        cumOffsetOutLocal.SetValue(0, (int32_t)0);
        for (uint32_t x = 1; x < batch_; x++) {
                cumOffsetOutLocal.SetValue(x, cumOffsetsBuffer.GetValue(x - 1));
        }
        cumOffsetOutQueue_.EnQue<int32_t>(cumOffsetOutLocal);
        seqLenQueue_.FreeTensor(seqLenLocal);
    }

    __aicore__ inline void CopyOutOnce(uint64_t progress)
    {
        AscendC::LocalTensor<int32_t> cumOffsetsBuffer = cumOffsetsBuf_.Get<int32_t>();
        LocalTensor<int32_t> cumOffsetOutLocal = cumOffsetOutQueue_.DeQue<int32_t>();
        AscendC::LocalTensor<int32_t> broadCast = broadCastBuf_.Get<int32_t>();
        if (progress == 0) {
            DataCopy(paddingOffsetGm, broadCast, padLengthAlign_);
        } else {
            DataCopy(paddingOffsetGm[seqLenZero_ + (progress - 1) * padLength_ -
                     cumOffsetsBuffer.GetValue(progress - 1) +
                     cumOffsetsBuffer.GetValue(0)], broadCast, padLengthAlign_);
        }
        DataCopy(cumOffsetOutGm, cumOffsetOutLocal, batchAlign_);
        cumOffsetOutQueue_.FreeTensor(cumOffsetOutLocal);
    }

    __aicore__ inline void CopyIn(uint64_t progress)
    {
        LocalTensor<int64_t> inputIdsLocal = inputIdsQueue_.AllocTensor<int64_t>();
        DataCopy(inputIdsLocal, inputIdsGm[progress * padLength_], padLengthAlignInt64_);
        inputIdsQueue_.EnQue(inputIdsLocal);
    }

    __aicore__ inline void ComputeRemovePadding()
    {
        LocalTensor<int64_t> inputIdsLocal = inputIdsQueue_.DeQue<int64_t>();
        AscendC::LocalTensor<int32_t> broadCast = broadCastBuf_.Get<int32_t>();
        AscendC::LocalTensor<int64_t> xRemovePaddingLocal = xRemovePaddingQueue_.Get<int64_t>();
        DataCopy(xRemovePaddingLocal, inputIdsLocal, padLengthAlignInt64_);
        AscendC::PipeBarrier<PIPE_ALL>();
        inputIdsQueue_.FreeTensor(inputIdsLocal);
    }
    __aicore__ inline void CopyOut(uint64_t progress)
    {
        AscendC::LocalTensor<int32_t> cumOffsetsBuffer = cumOffsetsBuf_.Get<int32_t>();
        AscendC::LocalTensor<int32_t> seqLenBufBuffer = seqLenBuf_.Get<int32_t>();
        LocalTensor<int64_t> xRemovePaddingLocal = xRemovePaddingQueue_.Get<int64_t>();
        AscendC::LocalTensor<int32_t> broadCast = broadCastBuf_.Get<int32_t>();

        if (progress == 0) {
            DataCopy(xRemovePaddingGm, xRemovePaddingLocal, padLengthAlignInt64_);
        } else {
            DataCopy(xRemovePaddingGm[progress * padLength_ - cumOffsetsBuffer.GetValue(progress - 1)],
                     xRemovePaddingLocal, padLengthAlignInt64_);
        }
        AscendC::PipeBarrier<PIPE_ALL>();

        for (int32_t i = 0; i < padLengthAlignInt64_; i++) {
            xRemovePaddingLocal.SetValue(i, (int64_t)0);
        }
        DataCopy(xRemovePaddingGm[batch_ * padLength_ - cumOffsetsBuffer.GetValue(batch_ - 1)],
            xRemovePaddingLocal, padLengthAlignInt64_);
    }

    __aicore__ inline void ClearStep()
    {
        AscendC::LocalTensor<int32_t> cumOffsetsBuffer = cumOffsetsBuf_.Get<int32_t>();
        LocalTensor<int64_t> xRemovePaddingLocal = xRemovePaddingQueue_.Get<int64_t>();
        AscendC::LocalTensor<int32_t> broadCast = broadCastBuf_.Get<int32_t>();
        AscendC::PipeBarrier<PIPE_ALL>();

        for (int32_t i = 0; i < padLengthAlign_; i++) {
            xRemovePaddingLocal.SetValue(i, (int64_t)0);
        }
        Duplicate(broadCast, (int32_t)0, padLengthAlign_);
        AscendC::PipeBarrier<PIPE_ALL>();

        DataCopy(paddingOffsetGm[batch_ * padLength_ - cumOffsetsBuffer.GetValue(batch_ - 1)],
            broadCast, padLengthAlign_);

        DataCopy(xRemovePaddingGm[batch_ * padLength_ - cumOffsetsBuffer.GetValue(batch_ - 1)],
            xRemovePaddingLocal, padLengthAlignInt64_);
    }
private:
    TPipe pipe_;
    TQue<QuePosition::VECIN, BUFFER_NUM>  inputIdsQueue_, cumOffsetsQueue_, tokenNumQueue_, seqLenQueue_;
    TQue<QuePosition::VECOUT, BUFFER_NUM> cumOffsetOutQueue_, paddingOffsetQueue_;
    AscendC::TBuf<AscendC::TPosition::VECCALC> broadCastBuf_;
    AscendC::TBuf<AscendC::TPosition::VECCALC> cumOffsetsBuf_;
    AscendC::TBuf<AscendC::TPosition::VECCALC> seqLenBuf_;
    AscendC::TBuf<AscendC::TPosition::VECCALC> xRemovePaddingQueue_;

    GlobalTensor<int32_t> cumOffsetsNowGm, seqLenGm, cumOffsetOutGm, paddingOffsetGm;
    GlobalTensor<int64_t> inputIdsGm, tokenNumGm, xRemovePaddingGm;
    uint32_t padLength_{1};
    uint64_t batch_{1};
    uint32_t padLengthAlign_{16};
    uint32_t padLengthAlignInt64_{4};
    uint32_t batchAlign_{8};
    int64_t seqLenZero_{0};
};

inline __aicore__ void InitTilingData(const __gm__ uint8_t *p_tilingdata, AtbOps::UnpadTilingData *tilingdata)
{
#if defined(__CCE_KT_TEST__) || (defined(__CCE_AICORE__) && __CCE_AICORE__ == 220)
    tilingdata->padLength = (*(const __gm__ uint32_t *)(p_tilingdata + 0));
    tilingdata->batch = (*(const __gm__ uint32_t *)(p_tilingdata + 4));

#else
    AscendC::TPipe pipe;
    __ubuf__ uint8_t *tilingdata_in_ub = nullptr;
    CopyGmTilingToUb(tilingdata_in_ub, p_tilingdata, sizeof(AtbOps::UnpadTilingData), &pipe);
    AscendC::PipeBarrier<PIPE_ALL>();
    tilingdata->padLength = (*(__ubuf__ uint32_t *)(tilingdata_in_ub + 0));
    tilingdata->batch = (*(__ubuf__ uint32_t *)(tilingdata_in_ub + 4));
    AscendC::PipeBarrier<PIPE_ALL>();
#endif
}

extern "C" __global__ __aicore__ void unpad(GM_ADDR input_ids,
    GM_ADDR cum_offsets_now, GM_ADDR token_num, GM_ADDR seq_len, GM_ADDR x_remove_padding,
    GM_ADDR cum_offsets_out, GM_ADDR padding_offset, GM_ADDR workspace, GM_ADDR tiling)
{
    AtbOps::UnpadTilingData tilingData;
    InitTilingData(tiling, &(tilingData));
    KernelUnpad op;
    op.Init(input_ids, cum_offsets_now, token_num, seq_len,
            x_remove_padding, cum_offsets_out, padding_offset, tilingData.padLength, tilingData.batch);
    op.Process();
}