/**
 * Copyright (c) 2026 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

/* Generated By CANNBot */

/*!
 * \file inv_tiling_arch35.cpp
 * \brief Host-side tiling implementation of the Inv operator (arch35 / Ascend950)
 *
 * Tiling strategy:
 *   1. Multi-core split: blockFactor = CeilAlign(CeilDiv(total, coreNum), 32B/typeSize)
 *   2. UB split: bytesPerElem is selected per dtype (includes double buffering, BUFFER_NUM=2)
 *               - FP32:      2 * BUFFER_NUM * 4 + 4 = 20   (double-buffered input+output, plus ones)
 *               - FP16/BF16: 2 * BUFFER_NUM * 2 + 4 + 4 = 16 (double-buffered input+output, plus xFloat and ones)
 *               - INT32:     2 * BUFFER_NUM * 4     = 16   (double-buffered input+output, no fp32 working bytes)
 *               ubFactor    = FloorAlign(ubSize / bytesPerElem, 32B/typeSize)
 *   3. Buffer layout: inputQueue(BUFFER_NUM) + outputQueue(BUFFER_NUM) + [tmpBuf1(fp32, FP16/BF16 only)] + [tmpBuf2(fp32 ones, floating-point only)]
 *                     The int32 path uses only input+output (pure integer Compare/Select, no fp32 round trip; review item MED-2)
 *
 * TilingKey encoding (dispatched on the dtype dimension D_T_SELF):
 *   key 0 -> C_DT_FLOAT   (float32: direct Div)
 *   key 1 -> C_DT_FLOAT16 (float16: Cast→Div→Cast)
 *   key 2 -> C_DT_BF16    (bfloat16: Cast→Div→Cast)
 *   key 3 -> C_DT_INT32   (int32: integer Compare+Select three-value mapping)
 */

#include "register/op_def_registry.h"
#include "op_common/log/log.h"
#include "op_common/op_host/util/math_util.h"
#include "op_common/op_host/util/platform_util.h"
#include "../../op_kernel/arch35/inv_tiling_data.h"
#include "../../op_kernel/arch35/inv_tiling_key.h"

namespace optiling {

// Integer helpers pulled in from Ops::Base. CeilDiv and FloorAlign are used by
// CalcTilingParams; FloorDiv is not referenced in the visible part of this file.
using Ops::Base::CeilDiv;
using Ops::Base::FloorDiv;
using Ops::Base::FloorAlign;

// Value written into workspace slot 0 by GetWorkspaceSize.
constexpr uint32_t WS_SYS_SIZE = 0U;

// Shape initialised from the list {1}; EnsureNotScalar returns a copy of it
// in place of any shape that reports zero dimensions.
static const gert::Shape g_vec_1_shape = {1};

/*!
 * \brief Substitute the file-level one-element shape for a shape that has no dimensions.
 *
 * \param in_shape  Shape to inspect.
 * \return A copy of g_vec_1_shape when in_shape reports zero dimensions,
 *         otherwise a copy of in_shape.
 */
static inline const gert::Shape EnsureNotScalar(const gert::Shape& in_shape)
{
    const bool hasNoDims = (in_shape.GetDimNum() == 0);
    return hasNoDims ? g_vec_1_shape : in_shape;
}

/*!
 * \brief Read the core count and UB size from the platform info attached to the tiling context.
 *
 * \param context  Tiling context that supplies the platform info pointer.
 * \param ubSize   [out] Size filled in by GetCoreMemSize for CoreMemType::UB.
 * \param coreNum  [out] Value returned by GetCoreNumAiv().
 * \return ge::GRAPH_SUCCESS when both values are non-zero; ge::GRAPH_FAILED when the
 *         platform info pointer is null or either value is 0.
 *
 * \note coreNum is assigned before it is validated and ubSize is only queried afterwards,
 *       so on a failure return the out-parameters may be partially updated.
 */
static ge::graphStatus GetPlatformInfo(gert::TilingContext* context, uint64_t& ubSize, int64_t& coreNum)
{
    fe::PlatFormInfos* platformInfoPtr = context->GetPlatformInfo();
    OP_CHECK_NULL_WITH_CONTEXT(context, platformInfoPtr);
    auto ascendcPlatform = platform_ascendc::PlatformAscendC(platformInfoPtr);
    // A zero core count would later be used as a divisor when splitting work across cores.
    coreNum = ascendcPlatform.GetCoreNumAiv();
    OP_CHECK_IF(coreNum == 0, OP_LOGE(context, "Inv: coreNum is 0"), return ge::GRAPH_FAILED);
    // ubSize is filled through the reference argument; its return value (if any) is not inspected.
    ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize);
    OP_CHECK_IF(ubSize == 0, OP_LOGE(context, "Inv: ubSize is 0"), return ge::GRAPH_FAILED);
    return ge::GRAPH_SUCCESS;
}

/*!
 * \brief Read the element count and data type of input 0 and validate both.
 *
 * \param context        Tiling context; the shape and tensor descriptor of input 0 are read from it.
 * \param totalElements  [out] Element count of input 0's storage shape. A shape with zero
 *                       dimensions is replaced by a one-element shape first, so it counts as 1.
 * \param dataType       [out] Data type reported by input 0's descriptor.
 * \return ge::GRAPH_SUCCESS on success; ge::GRAPH_FAILED when the shape or descriptor pointer
 *         is null, the element count is negative, or the data type is not one of
 *         DT_FLOAT / DT_FLOAT16 / DT_BF16 / DT_INT32.
 */
static ge::graphStatus GetShapeInfo(gert::TilingContext* context, int64_t& totalElements,
                                    ge::DataType& dataType)
{
    auto inputSelf = context->GetInputShape(0);
    OP_CHECK_NULL_WITH_CONTEXT(context, inputSelf);
    auto inputShape = EnsureNotScalar(inputSelf->GetStorageShape());
    totalElements = inputShape.GetShapeSize();
    // The count is signed. A negative value is not caught by the caller's "== 0" empty-input
    // test and would flow straight into the core-split arithmetic, so reject it here.
    // NOTE(review): presumably a negative size signals an unresolved dimension or an
    // overflowed product - confirm against the gert::Shape API.
    OP_CHECK_IF(totalElements < 0,
        OP_LOGE(context, "Inv: invalid element count %ld", totalElements),
        return ge::GRAPH_FAILED);

    auto inputDesc = context->GetInputDesc(0);
    OP_CHECK_NULL_WITH_CONTEXT(context, inputDesc);
    dataType = inputDesc->GetDataType();
    // Four fixed candidates: plain comparisons avoid building a heap-allocated std::set on
    // every tiling call and remove the dependency on <set>, which this file does not include.
    const bool isSupportedDtype = (dataType == ge::DT_FLOAT) || (dataType == ge::DT_FLOAT16) ||
                                  (dataType == ge::DT_BF16) || (dataType == ge::DT_INT32);
    OP_CHECK_IF(!isSupportedDtype,
        OP_LOGE(context, "Inv: unsupported dtype %d", static_cast<int>(dataType)),
        return ge::GRAPH_FAILED);

    return ge::GRAPH_SUCCESS;
}

/*!
 * \brief Request one workspace slot from the context and store WS_SYS_SIZE in it.
 *
 * \param context  Tiling context that owns the workspace-size array.
 * \return ge::GRAPH_SUCCESS after the slot is written; ge::GRAPH_FAILED when the
 *         context returns a null workspace pointer.
 */
static ge::graphStatus GetWorkspaceSize(gert::TilingContext* context)
{
    constexpr size_t workspaceCount = 1;
    size_t* currentWorkspace = context->GetWorkspaceSizes(workspaceCount);
    OP_CHECK_NULL_WITH_CONTEXT(context, currentWorkspace);
    // Only the first (and only requested) slot is populated.
    *currentWorkspace = WS_SYS_SIZE;
    return ge::GRAPH_SUCCESS;
}

/*!
 * \brief Size in bytes of one element of the given data type, as used by the tiling math.
 *
 * \param dataType  Element data type.
 * \return 2 for DT_FLOAT16 and DT_BF16; 4 for every other value (no validation is done here).
 */
static int64_t GetTypeSize(ge::DataType dataType)
{
    switch (dataType) {
        case ge::DT_FLOAT16:
        case ge::DT_BF16:
            return 2;
        default:
            return 4;
    }
}

/*!
 * \brief UB bytes budgeted per element for the given data type.
 *
 * Every type pays 2 * BUFFER_NUM * typeSize for the input and output queues. On top of that:
 *   - DT_INT32 adds nothing;
 *   - DT_FLOAT adds one float (4 bytes);
 *   - any other type adds two floats (8 bytes).
 *
 * \param dataType  Element data type.
 * \param typeSize  Size in bytes of one element of dataType.
 * \return Bytes of UB to reserve per element.
 */
static int64_t CalcBytesPerElem(ge::DataType dataType, int64_t typeSize)
{
    constexpr int64_t BUFFER_NUM = 2;
    constexpr int64_t FP32_BYTES = static_cast<int64_t>(sizeof(float));

    // Input queue + output queue, each holding BUFFER_NUM buffers.
    const int64_t queueBytes = 2 * BUFFER_NUM * typeSize;

    switch (dataType) {
        case ge::DT_INT32:
            return queueBytes;
        case ge::DT_FLOAT:
            return queueBytes + FP32_BYTES;
        default:
            return queueBytes + 2 * FP32_BYTES;
    }
}

/*!
 * \brief Compute the per-core and per-UB-pass element counts and publish them.
 *
 * \param context        Tiling context; receives the block dim and is used for logging.
 * \param tiling         Tiling data whose totalElements / blockFactor / ubFactor fields are written.
 * \param totalElements  Number of elements to process (the caller handles the 0 case itself).
 * \param coreNum        Number of cores available for the split.
 * \param ubSize         UB capacity in bytes.
 * \param dataType       Element data type.
 * \return ge::GRAPH_SUCCESS after tiling and block dim are set; ge::GRAPH_FAILED when the
 *         element size or per-element byte budget is 0, or the UB cannot hold one aligned group.
 */
static ge::graphStatus CalcTilingParams(gert::TilingContext* context, InvTilingData* tiling,
                                         int64_t totalElements, int64_t coreNum,
                                         uint64_t ubSize, ge::DataType dataType)
{
    constexpr int64_t ALIGN_BYTES = 32;

    const int64_t typeSize = GetTypeSize(dataType);
    OP_CHECK_IF(typeSize == 0, OP_LOGE(context, "Inv: typeSize is 0"), return ge::GRAPH_FAILED);
    // Number of elements that make up one 32-byte unit.
    const int64_t alignElems = ALIGN_BYTES / typeSize;

    // Even share per core, rounded up to a whole number of 32-byte units.
    const int64_t perCoreElems = CeilDiv(totalElements, coreNum);
    const int64_t perCoreUnits = (perCoreElems + alignElems - 1) / alignElems;
    const int64_t alignedPerCore = perCoreUnits * alignElems;
    // After rounding up, fewer cores than coreNum may be enough to cover all elements.
    const int64_t usedCoreNum = CeilDiv(totalElements, alignedPerCore);

    const int64_t bytesPerElem = CalcBytesPerElem(dataType, typeSize);
    OP_CHECK_IF(bytesPerElem == 0, OP_LOGE(context, "Inv: bytesPerElem is 0"), return ge::GRAPH_FAILED);
    // Elements that fit in UB at once, rounded down to the same 32-byte granularity.
    const int64_t ubCapacityElems = static_cast<int64_t>(ubSize) / bytesPerElem;
    const int64_t ubFactor = FloorAlign(ubCapacityElems, alignElems);
    OP_CHECK_IF(ubFactor <= 0, OP_LOGE(context, "Inv: ubFactor=%ld, UB too small", ubFactor),
                return ge::GRAPH_FAILED);

    tiling->totalElements = totalElements;
    tiling->blockFactor = alignedPerCore;
    tiling->ubFactor = ubFactor;
    context->SetBlockDim(usedCoreNum);
    return ge::GRAPH_SUCCESS;
}

/*!
 * \brief Tiling entry point for Inv: gathers platform and input info, fills InvTilingData,
 *        sets the block dim and passes the dtype to the template-selection macro.
 *
 * \param context  Tiling context provided by the framework.
 * \return ge::GRAPH_SUCCESS when every step succeeds; ge::GRAPH_FAILED as soon as any helper
 *         fails, the tiling data pointer is null, or the tiling data cannot be cleared.
 */
static ge::graphStatus InvTilingFunc(gert::TilingContext* context)
{
    OP_LOGD(context->GetNodeName(), "Enter InvTilingFunc");
    // Step 1: hardware limits (UB bytes and core count).
    uint64_t ubSize = 0;
    int64_t coreNum = 0;
    OP_CHECK_IF(GetPlatformInfo(context, ubSize, coreNum) != ge::GRAPH_SUCCESS,
        OP_LOGE(context, "Inv: GetPlatformInfo error"), return ge::GRAPH_FAILED);
    // Step 2: element count and dtype of input 0 (dtype is validated inside GetShapeInfo).
    int64_t totalElements = 0;
    ge::DataType dataType = ge::DT_FLOAT;
    OP_CHECK_IF(GetShapeInfo(context, totalElements, dataType) != ge::GRAPH_SUCCESS,
        OP_LOGE(context, "Inv: GetShapeInfo error"), return ge::GRAPH_FAILED);
    // Step 3: workspace size.
    OP_CHECK_IF(GetWorkspaceSize(context) != ge::GRAPH_SUCCESS,
        OP_LOGE(context, "Inv: GetWorkspaceSize error"), return ge::GRAPH_FAILED);
    // Step 4: start from an all-zero tiling struct so untouched fields are well defined.
    InvTilingData* tiling = context->GetTilingData<InvTilingData>();
    OP_CHECK_NULL_WITH_CONTEXT(context, tiling);
    OP_CHECK_IF(memset_s(tiling, sizeof(InvTilingData), 0, sizeof(InvTilingData)) != EOK,
        OP_LOGE(context, "Inv: set tiling data error"), return ge::GRAPH_FAILED);
    if (totalElements == 0) {
        // Empty input: skip the split math, leave the tiling data zeroed and use a block dim of 1.
        context->SetBlockDim(1);
    } else {
        OP_CHECK_IF(CalcTilingParams(context, tiling, totalElements, coreNum, ubSize, dataType)
                    != ge::GRAPH_SUCCESS,
                    OP_LOGE(context, "Inv: CalcTilingParams error"), return ge::GRAPH_FAILED);
    }
    // The numeric value of the dtype enum is the single selection argument.
    // NOTE(review): presumably this maps onto the D_T_SELF entries declared in
    // inv_tiling_key.h - confirm against that header.
    ASCENDC_TPL_SEL_PARAM(context, static_cast<uint32_t>(dataType));
    return ge::GRAPH_SUCCESS;
}

/*!
 * \brief Tiling-parse callback for Inv; does no work.
 *
 * \param context  Unused.
 * \return Always ge::GRAPH_SUCCESS.
 */
static ge::graphStatus TilingParseForInv([[maybe_unused]] gert::TilingParseContext* context)
{
    return ge::GRAPH_SUCCESS;
}

// Compile-info type handed to TilingParse below; it has no members.
struct InvCompileInfo {};

// Registers InvTilingFunc as the tiling function and TilingParseForInv as the
// tiling-parse function for the operator named Inv.
IMPL_OP_OPTILING(Inv).Tiling(InvTilingFunc).TilingParse<InvCompileInfo>(TilingParseForInv);

} // namespace optiling