/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of 
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, 
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

#include "ascendc_ir.h"
#include "graph/symbolizer/symbolic_utils.h"
#include "defalut_reg_func.h"

namespace af {
namespace ascir {
namespace {
// Small integer factors reused by the size formulas in this file.
constexpr int32_t TWO = 2;
constexpr int32_t FOUR = 4;

// Temporary-buffer size parameters (the unit is not stated in this file; presumably bytes — TODO confirm).
// TMP_SIZE_UNIT scales the aligned column size in the small-tail formula when that size has no constant value.
// BASIC_TMP_SIZE is the lower bound applied to any non-zero result and the per-step unit of the small-tail formula.
// MAX_TMP_SIZE / MAX_TMP_SIZE_FOR_SMALL_TAIL are the upper bounds for the default / small-tail kernel.
constexpr int32_t TMP_SIZE_UNIT = 1024;
constexpr int32_t BASIC_TMP_SIZE = 16384;
constexpr int32_t MAX_TMP_SIZE = 65536;
constexpr int32_t MAX_TMP_SIZE_FOR_SMALL_TAIL = 96 * 1024;

// Element sizes (values compared against the result of GetSizeByDataType) that select a default-kernel formula.
constexpr int32_t TYPESIZEEQ8 = 8;
constexpr int32_t TYPESIZEEQ4 = 4;
constexpr int32_t TYPESIZEEQ2 = 2;
constexpr int32_t TYPESIZEEQ1 = 1;

// Alignment arguments passed to sym::Align. ALIGNSIZE32 divided by the element size is also the
// alignment handed to IsAllStaticAligned.
constexpr int32_t ALIGNSIZE8 = 8;
constexpr int32_t ALIGNSIZE16 = 16;
constexpr int32_t ALIGNSIZE32 = 32;

// Constant terms added to the aligned axis size in the default-kernel formula, one per element size (8/4/2/1).
// NOTE(review): the derivation of these values is not visible in this file.
constexpr int32_t ALIGNPAD_8 = 29;
constexpr int32_t ALIGNPAD_4 = 29;
constexpr int32_t ALIGNPAD_2 = 45;
constexpr int32_t ALIGNPAD_1 = 93;

// Final multipliers of the default-kernel formula: shared by element sizes 8 and 4, then 2, then 1.
constexpr int32_t TMPSIZEOF8_4 = 128;
constexpr int32_t TMPSIZEOF2 = 64;
constexpr int32_t TMPSIZEOF1 = 48;

/**
 * Computes the temporary-buffer size expression for the small-tail concat kernel.
 *
 * The "column size" is the product of output 0's repeats from concat_dim through the last axis.
 *  - If the column size has no constant value, the result is
 *    sym::Align(column, ALIGNSIZE16) * TMP_SIZE_UNIT.
 *  - Otherwise the result is BASIC_TMP_SIZE * max(ceil(column / ALIGNSIZE16), 2), doubled once when the
 *    doubled value does not exceed MAX_TMP_SIZE_FOR_SMALL_TAIL.
 *
 * @param node_outputs outputs of the concat node; only output 0 is read.
 * @param concat_dim   index of the concat axis; must be a valid index into output 0's repeats.
 * @return symbolic size of the temporary buffer.
 */
Expression CalcForSmallTailKernel(AscNodeOutputs &node_outputs, uint32_t concat_dim) {
  const auto &dst_repeats = node_outputs[0U].attr.repeats;
  auto dst_col_size_expr = dst_repeats[concat_dim];
  for (uint32_t i = concat_dim + 1; i < dst_repeats.size(); ++i) {
    dst_col_size_expr = dst_col_size_expr * dst_repeats[i];
  }

  int64_t dst_col_size = -1;
  if (!dst_col_size_expr.GetConstValue(dst_col_size)) {
    return sym::Align(dst_col_size_expr, ALIGNSIZE16) * Symbol(TMP_SIZE_UNIT);
  }

  // Spell out the template argument: int64_t is `long` on some platforms and `long long` on others, so mixing
  // it with a `2L` literal makes std::max's argument deduction fail wherever the two types differ.
  constexpr int64_t kMinScale = 2;
  const int64_t scale = std::max<int64_t>((dst_col_size + ALIGNSIZE16 - 1) / ALIGNSIZE16, kMinScale);
  int64_t buf_size = BASIC_TMP_SIZE * scale;
  if (buf_size * TWO <= MAX_TMP_SIZE_FOR_SMALL_TAIL) {
    buf_size *= TWO;
  }
  return Symbol(buf_size);
}

/**
 * Checks whether every input's flattened tail size is statically provable to be a multiple of align_size.
 *
 * For each input, the repeats from concat_dim through the last axis are multiplied together and the
 * product modulo align_size is compared against zero with SymbolicUtils::StaticCheckEq.
 *
 * @param node_inputs inputs of the concat node.
 * @param concat_dim  index of the concat axis; must be a valid index into every input's repeats.
 * @param align_size  required multiple, in elements. Non-positive values are rejected.
 * @return true only when the check is TriBool::kTrue for every input; false when any input cannot be
 *         proven aligned or when align_size is not positive.
 */
bool IsAllStaticAligned(AscNodeInputs &node_inputs, uint32_t concat_dim, int32_t align_size) {
  // Callers derive align_size from an integer division by the element size, which yields 0 (or a negative
  // value) for unexpected element sizes. Never build a modulo-by-zero expression; report "not aligned".
  if (align_size <= 0) {
    GELOGD("Invalid align size %d, inputs are treated as not aligned.", align_size);
    return false;
  }

  for (uint32_t i = 0; i < node_inputs.Size(); ++i) {
    const auto &repeats = node_inputs[i].attr.repeats;
    auto tail_size = repeats[concat_dim];
    for (uint32_t j = concat_dim + 1; j < repeats.size(); ++j) {
      tail_size = sym::Mul(tail_size, repeats[j]);
    }

    // Build the remainder once; it is used for both the check and the log message.
    const auto remainder = sym::Mod(tail_size, Symbol(align_size));
    if (SymbolicUtils::StaticCheckEq(remainder, sym::kSymbolZero) != TriBool::kTrue) {
      GELOGD("The product of dims after concat_dim is %s, not aligned.",
             SymbolicUtils::ToString(remainder).c_str());
      return false;
    }
  }
  return true;
}

/**
 * Computes the temporary-buffer size expression for the default concat kernel.
 *
 * Returns zero when every input is statically aligned (IsAllStaticAligned with ALIGNSIZE32 / element size)
 * or when the element size is not one of 1, 2, 4 or 8. Otherwise the result is
 *     (sym::Align(k * max_axis_size, a) + pad) * unit
 * with k / a / pad / unit chosen by element size, and max_axis_size being the maximum over inputs 1..N-1 of
 *   - flag == true : the product of the input's repeats from concat_dim through the last axis;
 *   - flag == false: the input's last-axis repeat.
 *
 * @param node_inputs inputs of the concat node; must contain at least one input.
 * @param concat_dim  index of the concat axis.
 * @param flag        selects the flattened tail product (true) or the last axis only (false).
 * @return symbolic size of the temporary buffer; zero means no buffer is needed.
 */
Expression CalcForDefaultKernel(AscNodeInputs &node_inputs, uint32_t concat_dim, bool flag) {
  // Reject non-positive element sizes (same rule as CalcConcatTmpSizeV2): a negative size would otherwise
  // flow into the alignment division below.
  const auto type_size = GetSizeByDataType(node_inputs[0].attr.dtype);
  GE_ASSERT_TRUE(type_size > 0, "Invalid node inputs dtype, sizeof(T) = %d.", static_cast<int32_t>(type_size));

  if (IsAllStaticAligned(node_inputs, concat_dim, ALIGNSIZE32 / type_size)) {
    return Symbol(0);
  }

  // NOTE(review): input 0 is not part of the maximum (the loop starts at 1). Presumably the first input
  // needs no staging because it is written at the start of the output — confirm against the kernel.
  Expression max_axis_size = Symbol(0);
  for (uint32_t i = 1; i < node_inputs.Size(); ++i) {
    const auto &repeats = node_inputs[i].attr.repeats;
    if (flag) {
      Expression axis = repeats[concat_dim];
      for (uint32_t j = concat_dim + 1; j < repeats.size(); ++j) {
        axis = sym::Mul(axis, repeats[j]);
      }
      max_axis_size = sym::Max(max_axis_size, axis);
    } else {
      max_axis_size = sym::Max(max_axis_size, repeats[repeats.size() - 1]);
    }
  }

  if (type_size == TYPESIZEEQ8) {
    return (sym::Align(Symbol(FOUR) * max_axis_size, ALIGNSIZE8) + Symbol(ALIGNPAD_8)) * Symbol(TMPSIZEOF8_4);
  }
  if (type_size == TYPESIZEEQ4) {
    return (sym::Align(Symbol(TWO) * max_axis_size, ALIGNSIZE8) + Symbol(ALIGNPAD_4)) * Symbol(TMPSIZEOF8_4);
  }
  if (type_size == TYPESIZEEQ2) {
    return (sym::Align(Symbol(TWO) * max_axis_size, ALIGNSIZE16) + Symbol(ALIGNPAD_2)) * Symbol(TMPSIZEOF2);
  }
  if (type_size == TYPESIZEEQ1) {
    return (sym::Align(Symbol(TWO) * max_axis_size, ALIGNSIZE32) + Symbol(ALIGNPAD_1)) * Symbol(TMPSIZEOF1);
  }
  // Other element sizes have no formula; report that no buffer is required.
  return Symbol(0);
}
}  // namespace

std::vector<std::unique_ptr<TmpBufDesc>> CalcConcatTmpSize(const AscNode &node) {
  std::vector<std::unique_ptr<TmpBufDesc>> tmp_buf_desc;
  AscNodeInputs node_inputs = node.inputs;
  AscNodeOutputs node_outputs = node.outputs;
  if (node_inputs.Size() <= 0) {
    return tmp_buf_desc;
  }

  bool flag = false;  // 是否尾轴合并
  uint32_t concat_dim = 0;
  for (uint32_t i = 0; i < node_outputs[0].attr.repeats.size(); ++i) {
    if (SymbolicUtils::StaticCheckEq(node_outputs[0].attr.repeats[i], node_inputs[0].attr.repeats[i]) != TriBool::kTrue) {
      concat_dim = i;
      if (i != node_outputs[0].attr.repeats.size() - 1) {
        flag = true;
      }
    }
  }
  bool concat_small_tail = false;
  (void) af::AttrUtils::GetBool(node.GetOpDesc(), "_concat_small_tail", concat_small_tail);
  const auto tmp_buf_size = concat_small_tail ? CalcForSmallTailKernel(node_outputs, concat_dim) :
                            CalcForDefaultKernel(node_inputs, concat_dim, flag);
  if (SymbolicUtils::StaticCheckEq(tmp_buf_size, sym::kSymbolZero) == TriBool::kTrue) {
    GELOGI("%s does not require tmp buf", node.GetNamePtr());
    return {};
  }

  auto min_tmp_buf_size = sym::Max(Symbol(BASIC_TMP_SIZE), tmp_buf_size);
  auto max_tmp_buf_size = concat_small_tail ? MAX_TMP_SIZE_FOR_SMALL_TAIL : MAX_TMP_SIZE;
  min_tmp_buf_size = sym::Min(Symbol(max_tmp_buf_size), min_tmp_buf_size);
  TmpBufDesc desc = {min_tmp_buf_size, -1};
  tmp_buf_desc.emplace_back(std::make_unique<TmpBufDesc>(desc));
  GELOGD("%s is_small_tail = %d, calc_buf_size = %s, min_tmp_buf_size = %s", node.GetNamePtr(),
         static_cast<int32_t>(concat_small_tail), SymbolicUtils::ToString(tmp_buf_size).c_str(),
         SymbolicUtils::ToString(min_tmp_buf_size).c_str());
  return tmp_buf_desc;
}

/**
 * Builds the temporary-buffer descriptors for a concat node (V2 rule).
 *
 * The concat axis is the last axis on which output 0's repeat differs from input 0's repeat (0 when none
 * differs). When every input is statically aligned to ALIGNSIZE32 / element size on that axis
 * (see IsAllStaticAligned), no buffer is requested; otherwise the result of GetTmpBuffer for a fixed size
 * of 1024 is returned.
 *
 * @param node the concat node; must have at least one input with a dtype of positive element size.
 * @return an empty vector when all inputs are aligned, otherwise whatever GetTmpBuffer returns.
 */
std::vector<std::unique_ptr<TmpBufDesc>> CalcConcatTmpSizeV2(const AscNode &node) {
  AscNodeInputs node_inputs = node.inputs;
  AscNodeOutputs node_outputs = node.outputs;
  GE_ASSERT_TRUE(node_inputs.Size() > 0);

  // Walk from the last axis towards the first; the first mismatch found is the concat axis.
  // NOTE(review): this compares with Expression's operator!=, whereas CalcConcatTmpSize uses
  // SymbolicUtils::StaticCheckEq — confirm the difference is intended.
  uint32_t concat_dim = 0;
  const auto num_dims = node_outputs[0].attr.repeats.size();
  for (auto remaining = num_dims; remaining > 0; --remaining) {
    const auto i = remaining - 1;
    if (node_outputs[0].attr.repeats[i] != node_inputs[0].attr.repeats[i]) {
      concat_dim = static_cast<uint32_t>(i);
      break;
    }
  }

  const auto type_size = GetSizeByDataType(node_inputs[0].attr.dtype);
  GE_ASSERT_TRUE(type_size > 0,
                 "%s Invalid node inputs dtype: %d",
                 node.GetNamePtr(), static_cast<int32_t>(node_inputs[0].attr.dtype));
  if (IsAllStaticAligned(node_inputs, concat_dim, ALIGNSIZE32 / type_size)) {
    GELOGD("%s is all aligned", node.GetNamePtr());
    return {};
  }
  constexpr int64_t kTmpBufSizeForConcatByScatter = 1024L;
  return GetTmpBuffer(Symbol(kTmpBufSizeForConcatByScatter));
}
}  // namespace ascir
}  // namespace af