/*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include <limits>
#include <mki/base/operation_base.h>
#include <mki/utils/log/log.h>
#include <mki/utils/const/op_const.h>
#include <mki_loader/op_register.h>
#include "atbops/params/params.h"

namespace AtbOps {
// Largest value accepted for a single qSeqLen entry (enforced in CheckUnpadFastSoftMaxGrad).
constexpr int32_t MAX_SEQLEN = 4096;
// Make names from the Mki namespace usable without qualification inside AtbOps.
using namespace Mki;
// Operation wrapper for FastSoftMaxGrad.
//
// It reports the input/output tensor counts, validates the launch parameters and the two input
// tensors (yInput at index DIM_0, yGrad at index DIM_1), copies the yInput descriptor to the
// output, and selects the kernel named "FastSoftMaxGradKernel".
class FastSoftMaxGradOperation : public OperationBase {
public:
    explicit FastSoftMaxGradOperation(const std::string &opName) noexcept : OperationBase(opName) {}

    // Returns the input tensor count (DIM_2), or 0 if specificParam does not hold a FastSoftMaxGrad param.
    int64_t GetInputNum(const Any &specificParam) const override
    {
        MKI_CHECK(specificParam.Type() == typeid(OpParam::FastSoftMaxGrad), "OpParam is invalid", return 0);
        return DIM_2;
    }

    // Returns the output tensor count (DIM_1), or 0 if specificParam does not hold a FastSoftMaxGrad param.
    int64_t GetOutputNum(const Any &specificParam) const override
    {
        MKI_CHECK(specificParam.Type() == typeid(OpParam::FastSoftMaxGrad), "OpParam is invalid", return 0);
        return DIM_1;
    }

protected:
    // Validates the launch parameters and, on success, gives the single output the same tensor
    // descriptor as input 0 (yInput). Returns ERROR_INFERSHAPE_ERROR when the param type is wrong
    // or any check in CheckUnpadFastSoftMaxGrad fails.
    Status InferShapeImpl(const LaunchParam &launchParam, SVector<Tensor> &outTensors) const override
    {
        MKI_CHECK(launchParam.GetParam().Type() == typeid(OpParam::FastSoftMaxGrad),
            "OpParam is invalid", return Status::FailStatus(ERROR_INFERSHAPE_ERROR, "OpParam is invalid"));

        auto param = AnyCast<OpParam::FastSoftMaxGrad>(launchParam.GetParam());
        MKI_LOG(INFO) << "infer shape param: " << param.headNum;
        MKI_CHECK(CheckUnpadFastSoftMaxGrad(launchParam), "Failed to check run info",
            return Status::FailStatus(ERROR_INFERSHAPE_ERROR, "Failed to check run info"));
        auto &yInputTensor = launchParam.GetInTensor(DIM_0);
        outTensors[DIM_0].desc = yInputTensor.desc;
        return Status::OkStatus();
    }

    // Returns the kernel registered as "FastSoftMaxGradKernel", or nullptr when the launch
    // parameters fail the consistency check or carry the wrong param type.
    Kernel *GetBestKernel(const LaunchParam &launchParam) const override
    {
        MKI_CHECK(IsConsistent(launchParam), "Failed to check consistent", return nullptr);
        MKI_CHECK(launchParam.GetParam().Type() == typeid(OpParam::FastSoftMaxGrad),
            "OpParam is invalid", return nullptr);
        return GetKernelByName("FastSoftMaxGradKernel");
    }

private:
    // Checks the FastSoftMaxGrad param and both input tensors.
    //
    // Requirements enforced here:
    //   - headNum > 0;
    //   - qSeqLen holds between 1 and 32 entries, each in (0, MAX_SEQLEN];
    //   - sum(qSeqLen[i]^2) * headNum stays below the uint32_t maximum;
    //   - yInput and yGrad are 1-D float16 ND tensors with exactly that many elements.
    //
    // The caller must already have verified that the param holds an OpParam::FastSoftMaxGrad.
    // Returns true when every check passes; otherwise logs the reason and returns false.
    bool CheckUnpadFastSoftMaxGrad(const LaunchParam &launchParam) const
    {
        auto param = AnyCast<OpParam::FastSoftMaxGrad>(launchParam.GetParam());
        MKI_CHECK(param.headNum > 0,
            "headNum is invalid for FastSoftMaxGradOperation.", return false);
        constexpr size_t maxSeqLenSize = 32;
        MKI_CHECK(param.qSeqLen.size() > 0,
            "qSeqLen list should not be empty!", return false);
        // The limit is inclusive: a list of exactly maxSeqLenSize entries is accepted.
        MKI_CHECK(param.qSeqLen.size() <= maxSeqLenSize,
            "qSeqLen list size should be no more than 32!", return false);

        // Validate each sequence length and accumulate the sum of squares in a single pass.
        // The widening cast keeps the product in 64-bit arithmetic.
        int64_t nSquareToken = 0;
        for (const auto sampleSeqLen : param.qSeqLen) {
            if (sampleSeqLen <= 0 || sampleSeqLen > MAX_SEQLEN) {
                MKI_LOG(ERROR) << "Invalid qSeqLen: " << sampleSeqLen;
                return false;
            }
            nSquareToken += static_cast<int64_t>(sampleSeqLen) * sampleSeqLen;
        }

        // Compare against max / headNum rather than multiplying first, so the check itself cannot
        // overflow. headNum is known to be positive at this point, so the division is safe.
        MKI_CHECK(nSquareToken < std::numeric_limits<uint32_t>::max() / static_cast<uint32_t>(param.headNum),
            "nSquareToken * headNum is overflow for FastSoftMaxGradOperation.", return false);
        const int64_t inDim = nSquareToken * param.headNum;

        auto &yInputTensor = launchParam.GetInTensor(DIM_0);
        MKI_CHECK(yInputTensor.desc.dtype == TENSOR_DTYPE_FLOAT16,
            "Input dim 0 (yInput) dtype invalid, should be float16", return false);
        MKI_CHECK(yInputTensor.desc.format == TENSOR_FORMAT_ND,
            "Input dim 0 (yInput) format invalid, should be ND", return false);
        // The rank is checked before the element count so dims[DIM_0] is only read on a 1-D tensor.
        MKI_CHECK(yInputTensor.desc.dims.size() == 1,
            "Input dim 0 (yInput) dims invalid", return false);
        MKI_CHECK(yInputTensor.desc.dims[DIM_0] == inDim,
            "Input dim 0 (yInput) dims invalid", return false);

        auto &yGradTensor = launchParam.GetInTensor(DIM_1);
        MKI_CHECK(yGradTensor.desc.dtype == TENSOR_DTYPE_FLOAT16,
            "Input dim 1 (yGrad) dtype invalid, should be float16", return false);
        MKI_CHECK(yGradTensor.desc.format == TENSOR_FORMAT_ND,
            "Input dim 1 (yGrad) format invalid, should be ND", return false);
        MKI_CHECK(yGradTensor.desc.dims.size() == 1,
            "Input dim 1 (yGrad) dims invalid", return false);
        MKI_CHECK(yGradTensor.desc.dims[DIM_0] == inDim,
            "Input dim 1 (yGrad) dims invalid", return false);
        return true;
    }
};

// Pass the operation class to the REG_OPERATION macro.
// NOTE(review): presumably this registers FastSoftMaxGradOperation with the mki_loader operation
// registry (op_register.h) so it can be looked up by name — confirm against the macro definition.
REG_OPERATION(FastSoftMaxGradOperation);
} // namespace AtbOps