/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
 * CANN Open Software License Agreement Version 2.0 (the "License").
 * Please refer to the License for details. You may not use this file except in compliance with the License.
 * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 * See LICENSE in the root of the software repository for the full text of the License.
 */

#include "common/dump/dump_op.h"
#include "acl/acl_rt.h"
#include <array>
#include "common/dump/dump_manager.h"
#include "common/plugin/datatype_util.h"
#include "framework/common/debug/ge_log.h"
#include "common/sgt_slice_type.h"
#include "framework/common/util.h"
#include "framework/common/framework_types_internal.h"
#include "framework/common/debug/log.h"
#include "graph/anchor.h"
#include "graph/ge_tensor.h"
#include "graph/op_desc.h"
#include "graph/utils/tensor_utils.h"
#include "proto/ge_ir.pb.h"
#include "proto/op_mapping.pb.h"
#include "runtime/rt.h"
// 待rt.h移除后再替换为rt_external_device.h
#include "rts/rts_device.h"
#include "aicpu_task_struct.h"
#include "graph/debug/ge_attr_define.h"
#include "graph/utils/attr_utils.h"
#include "common/checker.h"
#include "common/aclrt_malloc_helper.h"
#include "base/err_msg.h"

namespace {
constexpr uint32_t kAiCpuLoadFlag = 1U;
const std::string kDumpModeOutput = "output";
const std::string kDumpModeInput = "input";
const std::string kDumpModeAll = "all";
const std::string kDumpKernelsDumpOp = "DumpDataInfo";
constexpr uint32_t k16BitsMask = 0x0000FFFFU;  // 16 bits, 1111,1111,1111,1111
constexpr int32_t k16BitWidth = 16;
const std::string kDumpDataDefaultValue = "stats";
constexpr uint32_t kInputBitsMask = 0x01U;
constexpr uint32_t kOutputBitsMask = 0x02U;
}  // namespace

namespace ge {
DumpOp::~DumpOp() {
  if (proto_dev_mem_ != nullptr) {
    (void)aclrtFree(proto_dev_mem_);
    proto_dev_mem_ = nullptr;
  }

  if (proto_size_dev_mem_ != nullptr) {
    (void)aclrtFree(proto_size_dev_mem_);
    proto_size_dev_mem_ = nullptr;
  }

  if (dev_mem_unload_ !=nullptr) {
    (void)aclrtFree(dev_mem_unload_);
    dev_mem_unload_ = nullptr;
  }

  if (launch_kernel_args_dev_mem_ != nullptr) {
    GE_CHK_RT(aclrtFree(launch_kernel_args_dev_mem_));
    launch_kernel_args_dev_mem_ = nullptr;
  }
}

void DumpOp::SetLoopAddr(const uintptr_t global_step, const uintptr_t loop_per_iter, const uintptr_t loop_cond) {
  global_step_ = global_step;
  loop_per_iter_ = loop_per_iter;
  loop_cond_ = loop_cond;
}

void DumpOp::SetDynamicModelInfo(const std::string &dynamic_model_name, const std::string &dynamic_om_name,
                                 const uint32_t dynamic_model_id) {
  dynamic_model_name_ = dynamic_model_name;
  dynamic_om_name_ = dynamic_om_name;
  dynamic_model_id_ = dynamic_model_id;
  GELOGD("Model name [%s], om_name [%s], model id [%u].", dynamic_model_name.c_str(), dynamic_om_name.c_str(),
         dynamic_model_id);
}

static void SetLoopAddrToOpMapping(const uintptr_t step_id, const uintptr_t loop_per_iter,
                                   const uintptr_t loop_cond,
                                   toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) {
  GELOGI("step_id: %lu, loop_per_iter:%lu, loop_cond: %lu.", static_cast<uint64_t>(step_id),
         static_cast<uint64_t>(loop_per_iter), static_cast<uint64_t>(loop_cond));

  if (step_id != 0U) {
    op_mapping_info.set_step_id_addr(static_cast<uint64_t>(step_id));
  }

  if (loop_per_iter != 0U) {
    op_mapping_info.set_iterations_per_loop_addr(static_cast<uint64_t>(loop_per_iter));
  }

  if (loop_cond != 0U) {
    op_mapping_info.set_loop_cond_addr(static_cast<uint64_t>(loop_cond));
  }
}

bool DumpOp::IsInBlacklist(const std::string &op_name, const std::string &op_type, size_t index, bool is_input) const {
  std::set<std::string> check_names;
  if (!dynamic_model_name_.empty()) check_names.insert(dynamic_model_name_);
  if (!root_graph_name_.empty()) check_names.insert(root_graph_name_);
  if (!dynamic_om_name_.empty()) check_names.insert(dynamic_om_name_);

  for (const auto &mn : check_names) {
    if (is_input) {
      if (dump_properties_.IsInputInOpNameBlacklist(mn, op_name, static_cast<uint32_t>(index)) ||
        dump_properties_.IsInputInOpTypeBlacklist(mn, op_type, static_cast<uint32_t>(index))) {
        GELOGI("[Dumper] Node %s input[%zu] is in blacklist for model %s, skip.",
                op_name.c_str(), index, mn.c_str());
        return true;
      }
    } else {
      if (dump_properties_.IsOutputInOpNameBlacklist(mn, op_name, static_cast<uint32_t>(index)) ||
        dump_properties_.IsOutputInOpTypeBlacklist(mn, op_type, static_cast<uint32_t>(index))) {
        GELOGI("[Dumper] Node %s output[%zu] is in blacklist for model %s, skip.",
                op_name.c_str(), index, mn.c_str());
        return true;
      }
    }
  }

  if (is_input) {
    if (dump_properties_.IsInputInOpNameBlacklist(DUMP_LAYER_OP_MODEL, op_name, static_cast<uint32_t>(index)) ||
      dump_properties_.IsInputInOpTypeBlacklist(DUMP_LAYER_OP_MODEL, op_type, static_cast<uint32_t>(index))) {
      GELOGI("[Dumper] Node %s input[%zu] is in global blacklist (DUMP_LAYER_OP_MODEL), skip.",
              op_name.c_str(), index);
      return true;
    }
  } else {
    if (dump_properties_.IsOutputInOpNameBlacklist(DUMP_LAYER_OP_MODEL, op_name, static_cast<uint32_t>(index)) ||
      dump_properties_.IsOutputInOpTypeBlacklist(DUMP_LAYER_OP_MODEL, op_type, static_cast<uint32_t>(index))) {
      GELOGI("[Dumper] Node %s output[%zu] is in global blacklist (DUMP_LAYER_OP_MODEL), skip.",
              op_name.c_str(), index);
      return true;
    }
  }

  return false;
}

void DumpOp::DumpWorkspace(toolkit::aicpu::dump::Task &task) {
  for (size_t i = 0UL; i < space_addrs_.size(); ++i) {
    const uint64_t addr = static_cast<uint64_t>(space_addrs_[i].first);
    const uint64_t size = static_cast<uint64_t>(space_addrs_[i].second);
    GELOGI("workspace_info: %p %zu", addr, size);
    toolkit::aicpu::dump::Workspace space;
    space.set_type(toolkit::aicpu::dump::Workspace::LOG);
    space.set_data_addr(addr);
    space.set_size(size);
    task.mutable_space()->Add(std::move(space));
  }
}

toolkit::aicpu::dump::AddressType DumpOp::GetAddrType(const toolkit::aicpu::dump::Task &task,
                                                      const GeTensorDesc &desc) const {
  if (task.context_size() != 0) {
    return toolkit::aicpu::dump::AddressType::RAW_ADDR;
  }

  bool no_tiling = false;
  if (AttrUtils::GetBool(desc, ATTR_NAME_TENSOR_NO_TILING_MEM_TYPE, no_tiling) && no_tiling) {
    return toolkit::aicpu::dump::AddressType::NOTILING_ADDR;
  }

  return toolkit::aicpu::dump::AddressType::TRADITIONAL_ADDR;
}

Status DumpOp::DumpOutput(toolkit::aicpu::dump::Task &task, const OpDescPtr &op_desc,
                          const std::vector<uintptr_t> &addrs, bool ffts_flag) const {
  const auto &output_descs = op_desc->GetAllOutputsDescPtr();
  const std::string dump_model_name = dynamic_model_name_;
  const std::string dump_om_name = dynamic_om_name_;
  GELOGI("Start to dump output in Launch dump op, model name %s, size %u, ffts flag %d.", dump_model_name.c_str(), output_descs.size(),
         static_cast<int32_t>(ffts_flag));
  for (size_t i = 0UL; i < output_descs.size(); ++i) {
    const std::string op_name = op_desc->GetName();
    const std::string op_type = op_desc->GetType();
        // 检查是否应该跳过该输出
    if (IsInBlacklist(op_name, op_type, i, false)) {
      continue;
    }
    if ((i >= addrs.size()) || (!ffts_flag && addrs[i] == reinterpret_cast<uintptr_t>(nullptr))) {
      GELOGW("[Dumper] Node name %s, i is %zu, output addrs size is %zu", op_desc->GetName().c_str(), i,
             addrs.size());
      continue;
    }
    GELOGD("Get op[%s:%s] output_desc[shape:%s, original shape:%s]", op_desc->GetNamePtr(), op_desc->GetTypePtr(),
           output_descs.at(i)->GetShape().ToString().c_str(), output_descs.at(i)->GetOriginShape().ToString().c_str());
    toolkit::aicpu::dump::Output output;
    output.set_data_type(static_cast<int32_t>(DataTypeUtil::GetIrDataType(output_descs.at(i)->GetDataType())));
    output.set_format(static_cast<int32_t>(output_descs.at(i)->GetFormat()));
    for (const int64_t dim : output_descs.at(i)->GetShape().GetDims()) {
      output.mutable_shape()->add_dim(static_cast<uint64_t>(dim));
    }
    for (const int64_t dim : output_descs.at(i)->GetOriginShape().GetDims()) {
      output.mutable_origin_shape()->add_dim(static_cast<uint64_t>(dim));
    }
    int64_t output_size = 0;
    if (TensorUtils::GetTensorSizeInBytes(*output_descs.at(i), output_size) != SUCCESS) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Get][TensorSize]Failed, output %zu, node %s(%s),",
             i, op_desc->GetName().c_str(), op_desc->GetType().c_str());
      REPORT_INNER_ERR_MSG("E19999", "Get output %zu tensor size of node %s(%s) failed",
                        i, op_desc->GetName().c_str(), op_desc->GetType().c_str());
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }

    GELOGI("[Dumper] Node [%s] output[%zu] size %ld addr is %p.", op_desc->GetName().c_str(), i, output_size, addrs[i]);
    output.set_size(static_cast<uint64_t>(output_size));
    output.set_address(static_cast<uint64_t>(addrs[i]));
    output.set_offset(std::numeric_limits<uint64_t>::max());
    output.set_addr_type(GetAddrType(task, *output_descs.at(i)));
    task.mutable_output()->Add(std::move(output));
  }
  return SUCCESS;
}

Status DumpOp::DumpInput(toolkit::aicpu::dump::Task &task, const OpDescPtr &op_desc,
                         const std::vector<uintptr_t> &addrs, bool ffts_flag) const {
  GeTensorDescPtr input_descs;
  const std::string dump_model_name = dynamic_model_name_;
  const std::string dump_om_name = dynamic_om_name_;
  GELOGI("Start dump input in Launch dump op %s, model_name %s, input_descs size %zu, addr size %zu.", dump_model_name.c_str(), op_desc->GetName().c_str(),
    op_desc->GetAllInputsSize(), addrs.size());
  for (size_t i = 0UL; i < op_desc->GetAllInputsSize(); i++) {
    const std::string op_name = op_desc->GetName();
    const std::string op_type = op_desc->GetType();
    GELOGI("[Dumper] Node name %s, node type %s input_descs idx %zu", op_name.c_str(), op_type.c_str(), i);
    if (IsInBlacklist(op_name, op_type, i, true)) {
      continue;
    }
    input_descs = op_desc->MutableInputDesc(static_cast<uint32_t>(i));
    if ((input_descs == nullptr) || (input_descs->GetShape().IsUnknownShape())) {
      continue;
    }
    if ((i > addrs.size()) || (!ffts_flag && addrs[i] == reinterpret_cast<uintptr_t>(nullptr))) {
      GELOGW("[Dumper] Node name %s, addr_id is %zu, input addrs size is %zu", op_desc->GetName().c_str(), i,
             addrs.size());
      continue;
    }
    toolkit::aicpu::dump::Input input;
    input.set_data_type(static_cast<int32_t>(DataTypeUtil::GetIrDataType(input_descs->GetDataType())));
    input.set_format(static_cast<int32_t>(input_descs->GetFormat()));

    GELOGD("Get op[%s:%s] input_desc[shape:%s, original shape:%s]", op_desc->GetNamePtr(), op_desc->GetTypePtr(),
           input_descs->GetShape().ToString().c_str(), input_descs->GetOriginShape().ToString().c_str());
    for (const int64_t dim : input_descs->GetShape().GetDims()) {
      input.mutable_shape()->add_dim(static_cast<uint64_t>(dim));
    }
    for (const int64_t dim : input_descs->GetOriginShape().GetDims()) {
      input.mutable_origin_shape()->add_dim(static_cast<uint64_t>(dim));
    }
    int64_t input_size = 0;
    if (TensorUtils::GetTensorSizeInBytes(*input_descs, input_size) != SUCCESS) {
      GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Get][TensorSize]Failed, input %zu, node %s(%s)",
             i, op_desc->GetName().c_str(), op_desc->GetType().c_str());
      REPORT_INNER_ERR_MSG("E19999", "Get input %zu tensor size of node %s(%s) failed",
                        i, op_desc->GetName().c_str(), op_desc->GetType().c_str());
      return ACL_ERROR_GE_INTERNAL_ERROR;
    }

    GELOGI("[Dumper] Node [%s] input[%zu] size %ld addr is %p.", op_desc->GetName().c_str(), i, input_size, addrs[i]);
    input.set_size(static_cast<uint64_t>(input_size));
    input.set_address(static_cast<uint64_t>(addrs[i]));
    input.set_offset(std::numeric_limits<uint64_t>::max());
    input.set_addr_type(GetAddrType(task, *input_descs));
    task.mutable_input()->Add(std::move(input));
  }
  return SUCCESS;
}

void DumpOp::SetDumpInfo(const DumpProperties &dump_properties, const OpDescPtr &op_desc,
                         const std::vector<uintptr_t> &input_addrs, const std::vector<uintptr_t> &output_addrs,
                         aclrtStream const stream) {
  dump_properties_ = dump_properties;
  op_desc_ = op_desc;
  input_addrs_ = input_addrs;
  output_addrs_ = output_addrs;
  stream_ = stream;
}

Status DumpOp::ProtoMallocAndMemcpy(const size_t proto_size, const std::string &proto_msg) {
  GE_FREE_RT_LOG(proto_dev_mem_);
  aclError rt_ret = ge::AclrtMalloc(&proto_dev_mem_, proto_size, RT_MEMORY_HBM, GE_MODULE_NAME_U16);
  if (rt_ret != ACL_SUCCESS) {
    GELOGE(RT_ERROR_TO_GE_STATUS(rt_ret), "[Call][aclrtMalloc]Failed, ret: %d", rt_ret);
    REPORT_INNER_ERR_MSG("E19999", "Call aclrtMalloc failed, ret: %d", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }

  rt_ret = aclrtMemcpy(proto_dev_mem_, proto_size, proto_msg.c_str(), proto_size, ACL_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != ACL_SUCCESS) {
    GELOGE(RT_ERROR_TO_GE_STATUS(rt_ret), "[Call][aclrtMemcpy]Failed, ret: %d", rt_ret);
    REPORT_INNER_ERR_MSG("E19999", "Call aclrtMemcpy failed, ret: %d", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }

  GE_FREE_RT_LOG(proto_size_dev_mem_);
  rt_ret = ge::AclrtMalloc(&proto_size_dev_mem_, sizeof(size_t), RT_MEMORY_HBM, GE_MODULE_NAME_U16);
  if (rt_ret != ACL_SUCCESS) {
    GELOGE(RT_ERROR_TO_GE_STATUS(rt_ret), "[Call][aclrtMalloc]Failed, ret: %d", rt_ret);
    REPORT_INNER_ERR_MSG("E19999", "Call aclrtMalloc failed, ret: %d", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  rt_ret = aclrtMemcpy(proto_size_dev_mem_, sizeof(size_t), &proto_size, sizeof(size_t), ACL_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != ACL_SUCCESS) {
    GELOGE(RT_ERROR_TO_GE_STATUS(rt_ret), "[Call][aclrtMemcpy]Failed, ret %d", rt_ret);
    REPORT_INNER_ERR_MSG("E19999", "Call aclrtMemcpy failed, ret %d", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  return SUCCESS;
}

Status DumpOp::ExecutorDumpOp(bool need_device_args) {
  std::string proto_msg;
  const size_t proto_size = op_mapping_info_.ByteSizeLong();
  const bool ret = op_mapping_info_.SerializeToString(&proto_msg);
  if ((!ret) || (proto_size == 0U)) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Serialize][Protobuf]Failed, proto_size is %zu",
           proto_size);
    REPORT_INNER_ERR_MSG("E19999", "[Serialize][Protobuf]Failed, proto_size is %zu", proto_size);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }

  const Status status = ProtoMallocAndMemcpy(proto_size, proto_msg);
  if (status != SUCCESS) {
    return status;
  }

  constexpr uint32_t io_addr_num = 2U;
  constexpr uint32_t args_size =
      static_cast<uint32_t>(sizeof(aicpu::AicpuParamHead)) +
      (io_addr_num * static_cast<uint32_t>(sizeof(uint64_t)));
  std::array<uint8_t, args_size> args = {};
  size_t args_pos = 0UL;
  aicpu::AicpuParamHead &param_head = *(static_cast<aicpu::AicpuParamHead *>(static_cast<void *>(&args[args_pos])));
  args_pos += sizeof(aicpu::AicpuParamHead);
  param_head.length = args_size;
  param_head.ioAddrNum = io_addr_num;
  *(static_cast<uint64_t *>(static_cast<void *>(&args[args_pos]))) = PtrToValue(proto_dev_mem_);
  args_pos += sizeof(uint64_t);
  *(reinterpret_cast<uint64_t *>(static_cast<void *>(&args[args_pos]))) = PtrToValue(proto_size_dev_mem_);
  rtArgsEx_t args_for_launch = {};
  if (need_device_args) {
    GE_ASSERT_TRUE(launch_kernel_args_dev_mem_ == nullptr);
    GE_CHK_RT_RET(ge::AclrtMalloc(&launch_kernel_args_dev_mem_, args_size, RT_MEMORY_HBM, GE_MODULE_NAME_U16));
    GE_CHK_RT_RET(aclrtMemcpy(launch_kernel_args_dev_mem_, args_size, &args[0U], args_size, ACL_MEMCPY_HOST_TO_DEVICE));
    args_for_launch.args = launch_kernel_args_dev_mem_;
    args_for_launch.isNoNeedH2DCopy = 1U;
  } else {
    args_for_launch.args = &args[0U];
    args_for_launch.isNoNeedH2DCopy = 0U;
  }
  args_for_launch.argsSize = args_size;
  const rtError_t rt_ret = rtCpuKernelLaunchWithFlag(nullptr, kDumpKernelsDumpOp.c_str(), 1U,
                                                     &args_for_launch, nullptr, stream_, RT_KERNEL_DEFAULT);
  if (rt_ret != RT_ERROR_NONE) {
    GELOGE(RT_ERROR_TO_GE_STATUS(rt_ret), "[Call][rtCpuKernelLaunch]Failed, ret %d", rt_ret);
    REPORT_INNER_ERR_MSG("E19999", "Call rtCpuKernelLaunch failed, ret %d", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  GELOGI("Kernel launch dump op %s success", op_desc_->GetName().c_str());
  return SUCCESS;
}

Status DumpOp::SetDumpModelName() {
  if (dynamic_model_name_.empty() && dynamic_om_name_.empty()) {
    GELOGI("Single op dump, no need set model name");
    return SUCCESS;
  }
  op_mapping_info_.set_model_id(dynamic_model_id_);
  std::set<std::string> model_list = dump_properties_.GetAllDumpModel();
  const bool not_find_by_omname = model_list.find(dynamic_om_name_) == model_list.end();
  const bool not_find_by_modelname = model_list.find(dynamic_model_name_) == model_list.cend();
  const bool find_by_rootgraphname = (!root_graph_name_.empty()) &&
                                    (model_list.find(root_graph_name_) != model_list.end());
  const std::string dump_model_name = not_find_by_omname ? dynamic_model_name_ : dynamic_om_name_;
  if ((!dump_model_name.empty()) && (dump_properties_.IsOpDebugOpen())) {
    GELOGI("Dump model name is %s", dump_model_name.c_str());
    op_mapping_info_.set_model_name(dump_model_name);
    return SUCCESS;
  }
  if ((model_list.find(DUMP_ALL_MODEL) == model_list.end()) &&
      (model_list.find(DUMP_LAYER_OP_MODEL) == model_list.end())) {
    if (not_find_by_omname && not_find_by_modelname && !find_by_rootgraphname) {
      std::string model_list_str;
      for (auto &model : model_list) {
        model_list_str += "[" + model + "].";
      }
      GELOGW("Model %s (root_graph: %s) will not be set to dump, dump list: %s",
            dump_model_name.c_str(), root_graph_name_.c_str(), model_list_str.c_str());
      return FAILED;
    }
  }
  if ((!dump_model_name.empty()) && dump_properties_.IsDumpOpen()) {
    GELOGI("Dump model name is %s", dump_model_name.c_str());
    op_mapping_info_.set_model_name(dump_model_name);
  }
  return SUCCESS;
}

Status DumpOp::UpdateAddrs(const std::vector<uintptr_t> &input_addrs,
                           const std::vector<uintptr_t> &output_addrs) {
  for (auto &task : *op_mapping_info_.mutable_task()) {
    if (dump_properties_.GetDumpMode() == kDumpModeInput) {
      task.clear_input();
      const auto ret = DumpInput(task, op_desc_, input_addrs);
      if (ret != SUCCESS) {
        GELOGE(ret, "[Dump][Input]Update dump input Failed, node %s(%s), ret 0x%X",
              op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
        REPORT_INNER_ERR_MSG("E19999", "Update dump input failed, node %s(%s), ret 0x%X",
                          op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
        return ret;
      }
    }
    if (dump_properties_.GetDumpMode() == kDumpModeOutput) {
      task.clear_output();
      const auto ret = DumpOutput(task, op_desc_, output_addrs);
      if (ret != SUCCESS) {
        GELOGE(ret, "[Dump][Input]Update dump output Failed, node %s(%s), ret 0x%X",
              op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
        return ret;
      }
    }
  }
  std::string proto_msg;
  const size_t proto_size = op_mapping_info_.ByteSizeLong();
  if (proto_size == 0U) {
    GELOGW("[Dump][Update] proto_size is zero");
    return SUCCESS;
  }
  const bool ret = op_mapping_info_.SerializeToString(&proto_msg);
  if (!ret) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Serialize][Protobuf]Failed, proto_size is %zu",
           proto_size);
    REPORT_INNER_ERR_MSG("E19999", "[Serialize][Protobuf]Failed, proto_size is %zu", proto_size);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  auto rt_ret = aclrtMemcpy(proto_dev_mem_, proto_size, proto_msg.c_str(), proto_size, ACL_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != ACL_SUCCESS) {
    GELOGE(RT_ERROR_TO_GE_STATUS(rt_ret), "[Call][aclrtMemcpy]Failed, ret: %d", rt_ret);
    REPORT_INNER_ERR_MSG("E19999", "Call aclrtMemcpy failed, ret: %d", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  rt_ret = aclrtMemcpy(proto_size_dev_mem_, sizeof(size_t), &proto_size, sizeof(size_t), ACL_MEMCPY_HOST_TO_DEVICE);
  if (rt_ret != ACL_SUCCESS) {
    GELOGE(RT_ERROR_TO_GE_STATUS(rt_ret), "[Call][aclrtMemcpy]Failed, ret %d", rt_ret);
    REPORT_INNER_ERR_MSG("E19999", "Call aclrtMemcpy failed, ret %d", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  return SUCCESS;
}

void DumpOp::DumpTask(toolkit::aicpu::dump::Task &task, const uint32_t task_id) {
  GELOGW("Task id is %u, stream id is %u", task_id, stream_id_);
  task.set_task_id(task_id);
  task.set_stream_id(stream_id_);
  task.mutable_op()->set_op_name(op_desc_->GetName());
  task.mutable_op()->set_op_type(op_desc_->GetType());
}

void DumpOp::SaveFftsSubOpInfo(const OpDescPtr &op_desc, const std::vector<Context> &context) {
  ffts_sub_op_list_.push_back({op_desc, context});
}

Status DumpOp::BuildFftsSubOpTask(toolkit::aicpu::dump::OpMappingInfo &op_mapping_info) {
  const auto mode = dump_properties_.GetDumpMode();
  uint32_t dump_mode_bits;
  if (mode == kDumpModeAll || dump_properties_.IsOpDebugOpen()) {
    dump_mode_bits = kInputBitsMask | kOutputBitsMask;
  } else if (mode == kDumpModeInput) {
    dump_mode_bits = kInputBitsMask;
  } else if (mode == kDumpModeOutput) {
    dump_mode_bits = kOutputBitsMask;
  } else {
    return SUCCESS;
  }

  for (const auto &op_iter : ffts_sub_op_list_) {
    const auto &op_desc = op_iter.op;
    GELOGD("Op %s in model begin to add ffts task in op_mapping_info", op_desc->GetName().c_str());
    toolkit::aicpu::dump::Task task;
    task.set_end_graph(false);
    task.set_task_id(static_cast<uint32_t>(UINT16_MAX));
    task.set_stream_id(static_cast<uint32_t>(UINT16_MAX));
    task.mutable_op()->set_op_name(op_desc->GetName());
    task.mutable_op()->set_op_type(op_desc->GetType());
    task.set_task_type(toolkit::aicpu::dump::Task::FFTSPLUS);
    for (const auto &context : op_iter.context) {
      toolkit::aicpu::dump::Context ffts_context;
      ffts_context.set_context_id(context.context_id);
      ffts_context.set_thread_id(context.thread_id);
      std::stringstream dbg_ss;
      if ((dump_mode_bits & kInputBitsMask) != 0U) {
        for (const auto &input : context.input) {
          toolkit::aicpu::dump::RealAddressAndSize real_address_and_size;
          real_address_and_size.set_address(input.address);
          real_address_and_size.set_size(input.size);
          ffts_context.mutable_input()->Add(std::move(real_address_and_size));
          dbg_ss << "[input addr: 0x" << &(std::hex) << input.address << ", size: 0x" << input.size << "]";
        }
      }
      if ((dump_mode_bits & kOutputBitsMask) != 0U) {
        for (const auto &output : context.output) {
          toolkit::aicpu::dump::RealAddressAndSize real_address_and_size;
          real_address_and_size.set_address(output.address);
          real_address_and_size.set_size(output.size);
          ffts_context.mutable_output()->Add(std::move(real_address_and_size));
          dbg_ss << "[output addr: 0x" << &(std::hex) << output.address << ", size: 0x" << output.size << "]";
        }
      }
      task.mutable_context()->Add(std::move(ffts_context));
      GELOGD("Op %s add context with context id %u thread id %u, input num %u, output num %u, address info %s",
             op_desc->GetName().c_str(), context.context_id, context.thread_id, context.input.size(),
             context.output.size(), dbg_ss.str().c_str());
    }
    const std::string* ffts_str = AttrUtils::GetStr(*op_desc, ffts::kAttrSgtJsonInfo);
    if (ffts_str != nullptr && !ffts_str->empty()) {
      toolkit::aicpu::dump::OpAttr op_attr;
      op_attr.set_name(ffts::kAttrSgtJsonInfo);
      op_attr.set_value(*ffts_str);
      task.mutable_attr()->Add(std::move(op_attr));
      GELOGI("Add sgt json attr %s in op %s.", ffts_str->c_str(), op_desc->GetName().c_str());
    }

    op_desc_ = op_desc;
    const std::vector<uintptr_t> input_addrs(op_desc->GetAllInputsSize());
    const std::vector<uintptr_t> output_addrs(op_desc->GetAllOutputsDescPtr().size());
    if ((dump_mode_bits & kInputBitsMask) != 0U) {
      GE_CHK_STATUS_RET(DumpInput(task, op_desc, input_addrs, true), "Dump Input failed, node %s(%s)",
                        op_desc_->GetName().c_str(), op_desc_->GetType().c_str());
    }
    if ((dump_mode_bits & kOutputBitsMask) != 0U) {
      GE_CHK_STATUS_RET(DumpOutput(task, op_desc, output_addrs, true), "Dump Output failed, node %s(%s)",
                        op_desc_->GetName().c_str(), op_desc_->GetType().c_str());
    }
    op_mapping_info.mutable_task()->Add(std::move(task));
  }
  return SUCCESS;
}

Status DumpOp::GenerateFftsDump(const DumpProperties &dump_properties, void *&load_dump_info, uint32_t &load_dump_len,
                                void *&unload_dump_info, uint32_t &unload_dump_len, const bool is_single_op_dump) {
  int32_t device_id = 0;
  GE_CHK_RT_RET(aclrtGetDevice(&device_id));
  GE_RETURN_WITH_LOG_IF_TRUE(device_id < 0, "Check device_id %d failed", device_id);
  dump_properties_ = dump_properties;

  const auto dump_path = dump_properties_.GetDumpPath() + std::to_string(device_id) + "/";
  if (!is_single_op_dump) {
    op_mapping_info_.set_dump_step(dump_properties_.GetDumpStep());
  }
  const auto dump_data = (dump_properties_.GetDumpData() == kDumpDataDefaultValue)
                             ? toolkit::aicpu::dump::DumpData::STATS_DUMP_DATA
                             : toolkit::aicpu::dump::DumpData::TENSOR_DUMP_DATA;
  op_mapping_info_.set_dump_data(dump_data);
  op_mapping_info_.set_dump_path(dump_path);
  op_mapping_info_.set_flag(kAiCpuLoadFlag);

  if ((!is_single_op_dump) && (SetDumpModelName() != SUCCESS)) {
    return SUCCESS;
  }
  SetLoopAddrToOpMapping(global_step_, loop_per_iter_, loop_cond_, op_mapping_info_);
  GELOGI("Dump step is %s, dump path is %s in Generate ffts plus dump op", dump_properties_.GetDumpStep().c_str(),
         dump_path.c_str());

  GE_CHK_RT_RET(BuildFftsSubOpTask(op_mapping_info_));

  std::string proto_msg;
  const size_t proto_size = op_mapping_info_.ByteSizeLong();
  GE_CHK_BOOL_RET_STATUS(op_mapping_info_.SerializeToString(&proto_msg), FAILED,
                         "op_mapping_info serialize to string failed.");

  if (proto_dev_mem_ != nullptr) {
    GELOGW("proto_dev_mem_ has been used.");
    GE_FREE_RT_LOG(proto_dev_mem_);
  }

  GE_CHK_RT_RET(ge::AclrtMalloc(&proto_dev_mem_, proto_size, RT_MEMORY_HBM, GE_MODULE_NAME_U16));
  GE_CHK_RT_RET(aclrtMemcpy(proto_dev_mem_, proto_size, proto_msg.c_str(), proto_size, ACL_MEMCPY_HOST_TO_DEVICE));

  load_dump_info = proto_dev_mem_;
  load_dump_len = static_cast<uint32_t>(proto_size);
  GE_CHK_BOOL_RET_STATUS((load_dump_len == proto_size), FAILED, "load_dump_len != proto_size");

  GE_ASSERT_SUCCESS(BuildUnLoadFftsDumpInfo(unload_dump_info, unload_dump_len));
  ffts_sub_op_list_.clear();
  op_mapping_info_.clear_task(); // clear current task to avoid repetitive dump in next iteration
  return SUCCESS;
}

Status DumpOp::BuildUnLoadFftsDumpInfo(void *&unload_dump_info, uint32_t &unload_dump_len) {
  GELOGI("UnloadDumpInfo start.");
  op_mapping_info_.set_flag(0);
  op_mapping_info_.clear_model_id();

  for (const auto &op_iter : ffts_sub_op_list_) {
    toolkit::aicpu::dump::Task task;
    task.set_task_id(static_cast<uint32_t>(UINT16_MAX));
    task.set_stream_id(static_cast<uint32_t>(UINT16_MAX));
    for (const auto &context : op_iter.context) {
      toolkit::aicpu::dump::Context ffts_context;
      ffts_context.set_context_id(context.context_id);
      ffts_context.set_thread_id(context.thread_id);
      task.mutable_context()->Add(std::move(ffts_context));
    }
    op_mapping_info_.mutable_task()->Add(std::move(task));
  }

  std::string proto_str;
  const size_t proto_size = op_mapping_info_.ByteSizeLong();
  GE_CHK_BOOL_RET_STATUS(op_mapping_info_.SerializeToString(&proto_str), FAILED,
                         "op_mapping_info serialize to string failed.");

  if (dev_mem_unload_ != nullptr) {
    GELOGW("dev_mem_unload_ has been used.");
    GE_FREE_RT_LOG(dev_mem_unload_);
  }

  GE_CHK_RT_RET(ge::AclrtMalloc(&dev_mem_unload_, proto_size, RT_MEMORY_HBM, GE_MODULE_NAME_U16));
  GE_PRINT_DYNAMIC_MEMORY(aclrtMalloc, "unload dump information.", proto_size);
  GE_CHK_RT_RET(aclrtMemcpy(dev_mem_unload_, proto_size, proto_str.c_str(), proto_size, ACL_MEMCPY_HOST_TO_DEVICE));

  unload_dump_info = dev_mem_unload_;
  unload_dump_len = static_cast<uint32_t>(proto_size);
  GE_CHK_BOOL_RET_STATUS((unload_dump_len == proto_size), FAILED, "unload_dump_len != proto_size");
  return SUCCESS;
}

Status DumpOp::LaunchDumpOp(const bool is_single_op_dump, bool need_device_args) {
  GELOGI("Start to launch dump op %s, is single op dump %d, device args flag %d.",
         op_desc_->GetName().c_str(), static_cast<int32_t>(is_single_op_dump), static_cast<int32_t>(need_device_args));

  int32_t device_id = 0;
  const aclError rt_ret = aclrtGetDevice(&device_id);
  if (rt_ret != ACL_SUCCESS) {
    GELOGE(RT_ERROR_TO_GE_STATUS(rt_ret), "[Call][aclrtGetDevice]Failed, ret %d", rt_ret);
    REPORT_INNER_ERR_MSG("E19999", "[Call][aclrtGetDevice]Failed, ret %d", rt_ret);
    return RT_ERROR_TO_GE_STATUS(rt_ret);
  }
  if (device_id < 0) {
    GELOGE(ACL_ERROR_GE_INTERNAL_ERROR, "[Check][DeviceId]Failed, device_id %d", device_id);
    REPORT_INNER_ERR_MSG("E19999", "Check device_id %d failed", device_id);
    return ACL_ERROR_GE_INTERNAL_ERROR;
  }
  const auto dump_path = dump_properties_.GetDumpPath() + std::to_string(device_id) + "/";
  op_mapping_info_.clear_task();
  op_mapping_info_.set_dump_path(dump_path);
  op_mapping_info_.set_flag(kAiCpuLoadFlag);
  if (!is_single_op_dump) {
    op_mapping_info_.set_dump_step(dump_properties_.GetDumpStep());
  }
  const auto dump_data = (dump_properties_.GetDumpData() == kDumpDataDefaultValue)
                             ? toolkit::aicpu::dump::DumpData::STATS_DUMP_DATA
                             : toolkit::aicpu::dump::DumpData::TENSOR_DUMP_DATA;
  op_mapping_info_.set_dump_data(dump_data);

  if ((!is_single_op_dump) && (SetDumpModelName() != SUCCESS)) {
    return SUCCESS;
  }
  SetLoopAddrToOpMapping(global_step_, loop_per_iter_, loop_cond_, op_mapping_info_);
  GELOGI("Dump step is %s, dump path is %s in Launch dump op", dump_properties_.GetDumpStep().c_str(),
         dump_path.c_str());
  if ((task_id_ == 0U) || (stream_id_ == 0U)) {
    GE_CHK_RT(aclrtGetThreadLastTaskId(&task_id_));
    int32_t temp_stream_id;
    GE_CHK_RT(aclrtStreamGetId(stream_, &temp_stream_id));
    stream_id_ = static_cast<uint32_t>(temp_stream_id);
  }
  int32_t bit_width;
  GE_CHK_RT(rtsDeviceGetCapability(device_id, RT_FEATURE_SYSTEM_TASKID_BIT_WIDTH, &bit_width));
  if (bit_width == k16BitWidth) {
    task_id_ = task_id_ & k16BitsMask;
  }
  toolkit::aicpu::dump::Task task;
  DumpTask(task, task_id_);
  return ExecuteDump(task, need_device_args);
}

Status DumpOp::ExecuteDump(toolkit::aicpu::dump::Task &task, bool need_device_args) {
  const Status status = LaunchDump(task);
  if (status != SUCCESS) {
    return status;
  }
  const auto ret = ExecutorDumpOp(need_device_args);
  if (ret != SUCCESS) {
    GELOGE(ret, "[Dump][Op]Failed, ret 0x%X", ret);
    return ret;
  }
  GELOGI("Dump %s success", op_desc_->GetName().c_str());
  return SUCCESS;
}

Status DumpOp::LaunchDump(toolkit::aicpu::dump::Task &task) {
  if (dump_properties_.GetDumpMode() == kDumpModeOutput) {
    const auto ret = DumpOutput(task, op_desc_, output_addrs_);
    if (ret != SUCCESS) {
      GELOGE(ret, "[Dump][Output]Failed, node %s(%s), ret 0x%X",
             op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      return ret;
    }
  } else if (dump_properties_.GetDumpMode() == kDumpModeInput) {
    const auto ret = DumpInput(task, op_desc_, input_addrs_);
    if (ret != SUCCESS) {
      GELOGE(ret, "[Dump][Input]Failed, node %s(%s), ret 0x%X",
             op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      REPORT_INNER_ERR_MSG("E19999", "Dump Input failed, node %s(%s), ret 0x%X",
                        op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      return ret;
    }
  } else if ((dump_properties_.GetDumpMode() == kDumpModeAll) || dump_properties_.IsOpDebugOpen()) {
    DumpWorkspace(task);
    auto ret = DumpOutput(task, op_desc_, output_addrs_);
    if (ret != SUCCESS) {
      GELOGE(ret, "[Dump][Output]Failed when in dumping all, node %s(%s), ret 0x%X",
             op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      return ret;
    }
    ret = DumpInput(task, op_desc_, input_addrs_);
    if (ret != SUCCESS) {
      GELOGE(ret, "[Dump][Input]Failed when in dumping all, node %s(%s), ret 0x%X",
             op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      REPORT_INNER_ERR_MSG("E19999", "Dump Input failed when in dumping all, node %s(%s), ret 0x%X",
                        op_desc_->GetName().c_str(), op_desc_->GetType().c_str(), ret);
      return ret;
    }
  } else {
    // for future use of dump mode
  }
  op_mapping_info_.mutable_task()->Add(std::move(task));
  return SUCCESS;
}
}  // namespace ge