* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "codegen_kernel.h"
#include <sstream>
#include <string>
#include <functional>
#include <stack>
#include "attr_utils.h"
#include "ascir_ops.h"
#include "common_utils.h"
#include "common/ge_common/debug/log.h"
#include "graph/ascendc_ir/utils/asc_tensor_utils.h"
#include "common/checker.h"
#include "api_call/utils/api_call_factory.h"
#include "ascir_utils.h"
#include "backend/backend_spec.h"
#include "graph/symbolizer/symbolic_utils.h"
#include "ascendc_api_registry.h"
#include "optimize/platform/platform_factory.h"
#include "common/platform_context.h"
#include "codegen_graph_check.h"
using namespace std;
using namespace af::ops;
using namespace codegen;
using namespace af::ascir_op;
using namespace ascgen_utils;
namespace {
constexpr uint32_t kFuncIdBegin = 20000000U;
constexpr const char kInputTensorDescName[] = "input_tensor_desc";
constexpr const char kOutputTensorDescName[] = "output_tensor_desc";
}
std::ostream &operator<<(std::ostream &os, const Code &obj) {
return os << obj.Str();
}
Type::Type(const string &type_name) : name(type_name) {}
std::string Type::Str() const {
return name;
}
Variable::Variable(const Type &var_type, const string &var_name) : type(var_type), name(var_name) {}
std::string Variable::Str() const {
return name;
}
std::string Variable::AsArg() const {
stringstream ss;
ss << this->type << " " << this->name;
return ss.str();
}
std::string Variable::Define(std::string &&init, bool define_const) const {
std::stringstream ss;
if (define_const) {
ss << "const ";
}
if (init.empty()) {
ss << type << " " << name << ";";
} else {
ss << type << " " << name << " = " << std::move(init) << ";";
}
return ss.str();
}
std::string Variable::Assign(std::string &value) const {
std::stringstream ss;
ss << name << " = " << value << ";";
return ss.str();
}
Axis::Axis(const ascir::Axis &axis)
: ascir::Axis(axis),
Variable(kIntT, axis.name),
loop_size(Variable(kInt64T, axis.name + "_loop_size")),
elem_size(axis.name + "_elem_size"),
actual_size(axis.name + "_actual_size"),
axis_size(axis.name + "_axis_size"),
tail_size(axis.name + "_tail_size"),
size_expr(af::Symbol(axis.size.Str().get())) {}
Status Tensor::DtypeName(ge::DataType dtype, std::string &dtype_name) {
static const std::string kTypeNames[] = {
[ge::DT_FLOAT] = "float", [ge::DT_FLOAT16] = "half", [ge::DT_INT8] = "int8_t",
[ge::DT_INT32] = "int32_t", [ge::DT_UINT8] = "uint8_t", "",
[ge::DT_INT16] = "int16_t", [ge::DT_UINT16] = "uint16_t", [ge::DT_UINT32] = "uint32_t",
[ge::DT_INT64] = "int64_t", [ge::DT_UINT64] = "uint64_t", [ge::DT_DOUBLE] = "",
[ge::DT_BOOL] = "uint8_t", [ge::DT_STRING] = "", [ge::DT_DUAL_SUB_INT8] = "",
[ge::DT_DUAL_SUB_UINT8] = "", [ge::DT_COMPLEX64] = "", [ge::DT_COMPLEX128] = "",
[ge::DT_QINT8] = "", [ge::DT_QINT16] = "", [ge::DT_QINT32] = "",
[ge::DT_QUINT8] = "", [ge::DT_QUINT16] = "", [ge::DT_RESOURCE] = "",
[ge::DT_STRING_REF] = "", [ge::DT_DUAL] = "", [ge::DT_VARIANT] = "",
[ge::DT_BF16] = "bfloat16_t", [ge::DT_UNDEFINED] = "", [ge::DT_INT4] = "int4x2_t",
[ge::DT_UINT1] = "", [ge::DT_INT2] = "", [ge::DT_UINT2] = "",
[ge::DT_COMPLEX32] = "",
};
GE_CHK_BOOL_RET_STATUS((dtype < (sizeof(kTypeNames) / sizeof(kTypeNames[0])) && kTypeNames[dtype] != ""), af::FAILED,
"Codegen unsupported data type:%d", static_cast<int32_t>(dtype));
dtype_name = kTypeNames[dtype];
return af::SUCCESS;
}
const Type Tensor::GlobalTensorTypes(std::string &dtype_name) {
return Type("GlobalTensor<" + dtype_name + ">");
}
const Type Tensor::LocalTensorTypes(std::string &dtype_name) {
return Type("LocalTensor<" + dtype_name + ">");
}
Tensor::Tensor(const ascir::TensorAttr &tensor, std::string &dtype_name, const std::string &tensor_name)
: Variable((af::ascir::AscTensorUtils::IsConstTensor(tensor)) ? Type(dtype_name)
: (tensor.attr.mem.alloc_type == af::AllocType::kAllocTypeGlobal) ? GlobalTensorTypes(dtype_name)
: LocalTensorTypes(dtype_name),
(af::ascir::AscTensorUtils::IsConstTensor(tensor)) ? ("scalar_" + to_string(tensor.attr.mem.tensor_id))
: (tensor.attr.mem.alloc_type == af::AllocType::kAllocTypeGlobal)
? ("global_" + to_string(tensor.attr.mem.tensor_id))
: ("local_" + to_string(tensor.attr.mem.tensor_id))),
id(tensor.attr.mem.tensor_id),
reuse_id(tensor.attr.mem.reuse_id),
dtype(tensor.attr.dtype),
alloc_type(tensor.attr.mem.alloc_type),
position(tensor.attr.mem.position),
axis(tensor.attr.axis),
axis_size(tensor.attr.repeats),
axis_strides(tensor.attr.strides),
vectorized_axis(tensor.attr.vectorized_axis),
vectorized_strides(tensor.attr.vectorized_strides),
que_id(tensor.attr.que.id),
buf_id(tensor.attr.buf.id),
size(this->name + "_size"),
actual_size(this->name + "_actual_size"),
que_depth(this->name + "_que_depth"),
que_buf_num(this->name + "_que_buf_num"),
que_share_offset("q" + std::to_string(tensor.attr.que.id) + "_reuse" + std::to_string(tensor.attr.mem.reuse_id) +
"_offset"),
const_value(""),
const_value_expr(af::Symbol(0)),
que_depth_value(tensor.attr.que.depth),
que_buf_num_value(tensor.attr.que.buf_num),
merge_scope(tensor.attr.opt.merge_scope),
is_constant(af::ascir::AscTensorUtils::IsConstTensor(tensor)),
ub_scalar_name(this->name + "_ub_scalar") {
(void)tensor_name;
}
Tensor::Tensor(const ascir::TensorAttr &tensor, std::string &dtype_name, const ascir::SizeExpr &value,
const std::string &tensor_name)
: Tensor(tensor, dtype_name, tensor_name) {
this->const_value_expr = value;
this->is_constant = true;
}
Tensor::Tensor(const std::string &value, const ascir::TensorAttr &tensor, std::string &dtype_name,
const std::string &tensor_name)
: Tensor(tensor, dtype_name, tensor_name) {
this->const_value = value;
this->is_constant = true;
}
Status Tensor::Init() {
for (auto vec_axis : this->vectorized_axis) {
auto pos = std::find(this->axis.begin(), this->axis.end(), vec_axis);
GE_ASSERT_TRUE((pos != this->axis.end()), "Codegen vectorized axis[%ld] not found", vec_axis);
this->vectorized_axis_pos.push_back(pos - this->axis.begin());
}
is_ub_scalar = (this->axis_size.size() > 0U);
for (auto &repeate : this->axis_size) {
if (repeate != One) {
is_ub_scalar = false;
break;
}
}
GELOGD("t_name:%s, axis_id:%s, size:%s, strides:%s, v_axis_id:%s, v_axis_pos:%s, v_strides:%s, is_ub_scalar:%d",
name.c_str(), VectorToStr(this->axis).c_str(), VectorToStr(this->axis_size).c_str(),
VectorToStr(this->axis_strides).c_str(), VectorToStr(this->vectorized_axis).c_str(),
VectorToStr(this->vectorized_axis_pos).c_str(), VectorToStr(this->vectorized_strides).c_str(),
static_cast<int32_t>(is_ub_scalar));
return af::SUCCESS;
}
Status Tensor::InitUbScalar(std::string &result) const {
std::stringstream ss;
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(this->dtype, dtype_name), "data type:%d failed",
static_cast<int32_t>(this->dtype));
ss << ub_scalar_name << " = ";
ss << name << ".GetValue(0);" << std::endl;
result = ss.str();
return af::SUCCESS;
}
Status Tensor::GenDuplicateValueOfUbScalar(std::string &result) const {
std::stringstream ss;
std::string dtype_name;
Tensor::DtypeName(this->dtype, dtype_name);
std::string event_id = this->name + "_event_id";
ss << "AscendC::PipeBarrier<PIPE_ALL>();" << std::endl;
ss << "Duplicate(" << this->name << "[0], "
<< ub_scalar_name << ", " << "32/sizeof(" << dtype_name <<"));" << std::endl;
ss << "AscendC::PipeBarrier<PIPE_V>();" << std::endl;
result = ss.str();
return af::SUCCESS;
}
Status Tensor::DefineUbScalar(std::string &result) const {
std::stringstream ss;
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(this->dtype, dtype_name), "data type:%d failed",
static_cast<int32_t>(this->dtype));
ss << dtype_name << " " << ub_scalar_name << ";" << std::endl;
result = ss.str();
return af::SUCCESS;
}
Status Tensor::SetGlobalBuffer(Variable global, const std::string &offset, std::string &result) const {
std::stringstream ss;
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(this->dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(this->dtype));
if (!offset.empty() && offset != "0") {
ss << name << ".SetGlobalBuffer("
<< "(__gm__ " << dtype_name << "*)((__gm__ uint8_t*)(" << global << ") + (" << offset << ")));";
} else {
ss << name << ".SetGlobalBuffer("
<< "(__gm__ " << dtype_name << "*)" << global << ");";
}
result = ss.str();
return af::SUCCESS;
}
Status codegen::PositionValue(ascir::Position position, std::string &result) {
static std::unordered_map<size_t, std::string> position_values = {
{static_cast<size_t>(af::Position::kPositionGM), "TPosition::GM"},
{static_cast<size_t>(af::Position::kPositionVecIn), "TPosition::VECIN"},
{static_cast<size_t>(af::Position::kPositionVecCalc), "TPosition::VECCALC"},
{static_cast<size_t>(af::Position::kPositionVecOut), "TPosition::VECOUT"}
};
auto it = position_values.find(static_cast<size_t>(position));
if (it == position_values.end()) {
GELOGE(af::FAILED, "Codegen position value[%d] invalid", static_cast<int32_t>(position));
return af::FAILED;
}
result = it->second;
return af::SUCCESS;
}
MergeScope::MergeScope(ascir::MergeScopeId merge_scope_id, ascir::Position pos)
: id(merge_scope_id),
position(pos),
size("m" + to_string(merge_scope_id) + "_size"),
depth("m" + to_string(merge_scope_id) + "_que_depth"),
buf_num("m" + to_string(merge_scope_id) + "_que_buf_num") {}
TQue::TQue(ascir::QueId que_id, ascir::Position pos, std::string &position_name)
: Variable(Type("TQue<" + position_name + ", " + "1>"), "q" + to_string(que_id)),
id(que_id),
position(pos),
size(this->name + "_size"),
depth(this->name + "_depth"),
buf_num(this->name + "_buf_num"),
buf(Type("LocalTensor<uint8_t>"), name + "_buf") {}
TQue::TQue(ascir::QueId que_id, ascir::Position src_position, const std::string &src_position_name,
const std::string &dst_position_name)
: Variable(Type("TQueBind<" + src_position_name + ", " + dst_position_name + ", 1>"), "q" + to_string(que_id)),
id(que_id),
position(src_position),
size(this->name + "_size"),
depth(this->name + "_depth"),
buf_num(this->name + "_buf_num"),
buf(Type("LocalTensor<uint8_t>"), name + "_buf") {}
std::string TQue::AllocBuf(const bool with_define) const {
stringstream ss;
if (with_define && !is_cv_ub_fusion) {
ss << this->buf.AsArg();
} else {
ss << this->buf.Str();
}
ss << " = " << this->name << ".AllocTensor<uint8_t>();" << std::endl;
return ss.str();
}
std::string TQue::FreeBuf() const {
stringstream ss;
ss << this->name << ".FreeTensor(" << this->buf << ");" << std::endl;
return ss.str();
}
std::string TQue::EnqueBuf() const {
stringstream ss;
ss << this->name << ".EnQue(" << this->buf << ");" << std::endl;
return ss.str();
}
std::string TQue::DequeBuf(const bool is_unit_first) const {
stringstream ss;
if (is_unit_first) {
ss << this->buf.AsArg() << " = " << this->name << ".DeQue<uint8_t>();" << std::endl;
} else {
ss << this->buf.name << " = " << this->name << ".DeQue<uint8_t>();" << std::endl;
}
return ss.str();
}
TBuf::TBuf(ascir::BufId buf_id, const ascir::Position pos, std::string &position_name)
: Variable(Type("TBuf<" + position_name + ">"), "b" + to_string(buf_id)),
id(buf_id),
position(pos),
size(this->name + "_size"),
buf(Type("LocalTensor<uint8_t>"), name + "_buf") {}
std::string TBuf::AllocBuf(const bool with_define) const {
stringstream ss;
if (with_define) {
ss << this->buf.AsArg();
} else {
ss << this->buf.Str();
}
ss << " = " << this->name << ".Get<uint8_t>();";
return ss.str();
}
std::string TBuf::AllocBuf(std::string buf_name, std::string dtype_name, const bool with_define) const {
stringstream ss;
if (with_define) {
ss << "LocalTensor<" << dtype_name << "> " << buf_name << " = " << this->name << ".Get<" << dtype_name << ">();";
} else {
ss << buf_name << " = " << this->name << ".Get<" << dtype_name << ">();";
}
return ss.str();
}
Tiler::Tiler(const std::string &tiling_data_type, const std::string &tiling_data_name)
: tiling_data(Type{tiling_data_type}, tiling_data_name), gm_tiling(kGmAddrT, "gm_tiling"), block_dim("block_dim") {}
std::string Tiler::Offset(const std::vector<ascir::AxisId> ¤t_axis, const std::vector<ascir::AxisId> &axis,
const std::vector<ascir::SizeExpr> &strides) const {
std::stringstream ss;
bool is_first = true;
for (auto iter = axis.begin(); iter != axis.end(); ++iter) {
bool is_from = false;
for (auto ca : current_axis) {
if (this->IsFrom(ca, *iter)) {
is_from = true;
break;
}
}
if (!is_from) {
continue;
}
if (is_first) {
is_first = false;
} else {
ss << " + ";
}
auto stride = strides[iter - axis.begin()];
if (stride == 0) {
ss << "0";
} else if (stride == 1) {
ss << "(int64_t)" << this->GetAxis(*iter);
} else {
ss << "(int64_t)" << this->GetAxis(*iter) << " * " << "(int64_t)" << this->Size(stride);
}
}
if (is_first) {
ss << "0";
}
return ss.str();
}
std::string Tiler::TensorVectorizedOffset(const std::vector<ascir::AxisId> ¤t_axis, const Tensor &tensor) const {
std::vector<ascir::AxisId> current_vectorized_axis;
for (auto a : current_axis) {
if (find(tensor.vectorized_axis.begin(), tensor.vectorized_axis.end(), a) != tensor.vectorized_axis.end()) {
current_vectorized_axis.emplace_back(a);
}
}
return this->Offset(current_vectorized_axis, tensor.vectorized_axis, tensor.vectorized_strides);
}
std::string Tiler::Str() const {
return tiling_data.Str();
}
void codegen::Tiler::AddSizeVar(ascir::SizeVar size) {
std::string var_define;
if (!(size.expr.IsConstExpr())) {
var_define = std::string(size.expr.Str().get());
std::string tiling_var = this->tiling_data.Str() + "->" + var_define;
af::Expression tiling_sizevar = af::Symbol(tiling_var.c_str());
this->sizes.emplace_back(std::make_pair(size.expr, tiling_sizevar));
}
}
uint32_t codegen::Tiler::GetTilingCaseId() const {
return this->tiling_case_id;
}
void codegen::Tiler::SetTilingCaseId(uint32_t tilingCaseId) {
this->tiling_case_id = tilingCaseId;
}
Status codegen::Tiler::AddAxis(const ascir::Axis &axis) {
auto [new_axis, insert_success] = this->axis_map.emplace(axis.id, codegen::Axis(axis));
(void)new_axis;
if (!insert_success) {
GELOGE(af::FAILED, "Codegen insert axis[%ld] fail", axis.id);
return af::FAILED;
}
return af::SUCCESS;
}
static bool GetSplitBAttr(const Tiler *tiler, const Axis &axis) {
if (axis.type == ascir::Axis::Type::kAxisTypeTileInner || axis.type == ascir::Axis::Type::kAxisTypeTileOuter) {
return true;
}
if (axis.type == ascir::Axis::Type::kAxisTypeInvalid) {
return false;
}
for (const auto from : axis.from) {
auto from_axis = tiler->GetAxis(from);
if (GetSplitBAttr(tiler, from_axis)) {
return true;
}
}
return false;
}
void codegen::Tiler::AddAxisSplitBAttr() {
for (auto &[id, cur_axis] : axis_map) {
(void)id;
cur_axis.is_split_b = GetSplitBAttr(this, cur_axis);
}
}
bool Axis::IsOuter() const {
return (type == ascir::Axis::Type::kAxisTypeBlockOuter || type == ascir::Axis::Type::kAxisTypeTileOuter);
}
bool Axis::IsInner() const {
return (type == ascir::Axis::Type::kAxisTypeBlockInner || type == ascir::Axis::Type::kAxisTypeTileInner);
}
static bool IsMergeFromInner(const Tiler &tiler, const codegen::Axis &axis) {
if (axis.from.size() == 0) {
return axis.IsInner();
}
bool contain_inner = false;
std::function<void(int32_t)> func = [&tiler, &contain_inner, &func](int32_t current_axis_id) {
const auto ¤t_axis = tiler.GetAxis(current_axis_id);
for (const auto &from : current_axis.from) {
const auto &from_axis = tiler.GetAxis(from);
if (from_axis.IsInner()) {
contain_inner = true;
break;
} else if (from_axis.type == ascir::Axis::Type::kAxisTypeMerged) {
func(from);
}
}
};
func(axis.id);
return contain_inner;
}
bool Tiler::IsFrom(ascir::AxisId src, ascir::AxisId dst) const {
if (src == dst) {
return true;
}
const auto &axis = this->GetAxis(src);
for (const auto &from : axis.from) {
if (from == dst || IsFrom(from, dst)) {
return true;
}
}
return false;
}
bool Tiler::HasSameOriginAxis(ascir::AxisId src, ascir::AxisId dst) const {
std::set<ascir::AxisId> src_origins;
std::set<ascir::AxisId> dst_origins;
std::function<void(int32_t, std::set<ascir::AxisId> &)> func = [this, &func](int32_t current_axis_id,
std::set<ascir::AxisId> &origin_ids) {
const auto &axis = this->GetAxis(current_axis_id);
for (const auto &from : axis.from) {
if (this->GetAxis(from).type == Axis::Type::kAxisTypeOriginal) {
origin_ids.insert(from);
} else {
func(from, origin_ids);
}
}
};
func(src, src_origins);
func(dst, dst_origins);
for (auto id : src_origins) {
if (dst_origins.count(id) != 0) {
return true;
}
}
return false;
}
std::string codegen::Tiler::Size(const ascir::SizeExpr &size, bool using_int_tiling_data) const {
std::string const_expr_str = std::string(size.Str().get());
if (size.IsConstExpr()) {
return (const_expr_str.find("Rational") != std::string::npos) ?
af::SymbolicUtils::AsNumerDenomToString(size) : const_expr_str;
}
std::string str_ret = std::string((size.Replace(this->sizes)).Str().get());
return (using_int_tiling_data || str_ret.find("Rational") != std::string::npos) ?
af::SymbolicUtils::AsNumerDenomToString(size.Replace(this->sizes)) : str_ret;
}
std::string codegen::Tiler::ActualSize(const ascir::SizeExpr &size, bool using_int_tiling_data) const {
auto replace_actual = size.Replace(this->actual_sizes);
std::string str_ret = std::string((replace_actual.Replace(this->sizes)).Str().get());
return (using_int_tiling_data || str_ret.find("Rational") != std::string::npos) ?
af::SymbolicUtils::AsNumerDenomToString(replace_actual.Replace(this->sizes)) : str_ret;
}
std::string Tiler::TensorActualSize(const Tensor &tensor) const {
if (tensor.vectorized_axis.size() == 0) {
return "1";
}
stringstream ss;
int64_t count = 0;
for (size_t i = 0; i < tensor.vectorized_axis.size(); i++) {
auto &stride = tensor.vectorized_strides[i];
if (stride == 0) {
continue;
}
if (count >= 1) {
ss << " + ";
}
auto axis = GetAxis(tensor.vectorized_axis[i]);
auto axis_pos = tensor.vectorized_axis_pos[i];
auto axis_size = tensor.axis_size[axis_pos];
bool size_equal = af::SymbolicUtils::StaticCheckEq(axis_size, axis.size_expr) == af::TriBool::kTrue;
if (axis.type == Axis::Type::kAxisTypeTileInner || size_equal) {
ss << "(" + axis.actual_size.Str() + " - 1)";
} else {
ss << "(" + this->Size(axis_size) + " - 1)";
}
if (stride != 1) {
ss << " * " << this->Size(stride);
}
count++;
}
ss << ((ss.str().size() == 0u) ? "1" : " + 1");
return ss.str();
}
std::string Tiler::TensorVectorizedSize(const Tensor &tensor) const {
if (tensor.vectorized_axis.size() == 0) {
return "1";
}
stringstream ss;
std::string blk_align;
(void)KernelUtils::BlkAlign(tensor.dtype, blk_align);
ss << blk_align << "(";
int64_t count = 0;
for (size_t i = 0; i < tensor.vectorized_axis.size(); i++) {
auto axis_pos = tensor.vectorized_axis_pos[i];
auto axis_size = tensor.axis_size[axis_pos];
auto &stride = tensor.vectorized_strides[i];
if (stride == 0) {
continue;
}
if (count >= 1) {
ss << " + ";
}
ss << "(" + this->Size(axis_size) + " - 1)";
if (stride != 1) {
ss << " * " << this->Size(stride);
}
count++;
}
ss << ((ss.str().size() == 0u) ? "1" : " + 1)");
return ss.str();
}
const Axis &Tiler::GetAxis(const ascir::AxisId id) const {
auto iter = this->axis_map.find(id);
if (iter == this->axis_map.end()) {
GELOGE(af::FAILED, "Codegen axis[%ld] not found", id);
throw std::runtime_error("Axis not found " + to_string(id));
}
return iter->second;
}
std::string codegen::Tiler::AxisSize(const ascir::AxisId id) const {
return this->Size(this->GetAxis(id).size);
}
std::string codegen::Tiler::AxisSize(const Axis &axis) const {
return this->Size(axis.size);
}
std::string codegen::Tiler::GenAxisSizeNew(const ascir::AxisId id) const {
stringstream ss;
const auto &axis = this->GetAxis(id);
bool is_reduce_block = axis.type == ascir::Axis::Type::kAxisTypeBlockOuter && axis.from.size() > 1;
if (axis.type == ascir::Axis::Type::kAxisTypeOriginal) {
ss << "const " << axis.axis_size.AsArg() << " = " << this->AxisSize(axis) << ";" << endl;
ss << "const " << axis.loop_size.AsArg() << " = " << axis.axis_size.Str() << ";" << endl;
ss << "const " << axis.actual_size.AsArg() << " = " << axis.axis_size.Str() << ";" << endl;
} else if (axis.IsOuter() && !is_reduce_block) {
const auto &from = this->GetAxis(axis.from[0]);
const auto &inner = this->GetAxis(axis.split_pair_other_id);
ss << "const " << axis.axis_size.AsArg() << " = " << from.loop_size.Str() << " / " << inner.axis_size.Str() << ";"
<< endl;
ss << "const " << axis.loop_size.AsArg() << " = " << axis.axis_size.Str() << " + (" << inner.tail_size << " > 0);"
<< endl;
} else if (axis.IsInner()) {
const auto &from = this->GetAxis(axis.from[0]);
ss << "const " << axis.axis_size.AsArg() << " = " << this->AxisSize(axis) << ";" << endl;
ss << "const " << axis.tail_size.AsArg() << " = " << from.loop_size << " % " << axis.axis_size << ";" << endl;
} else if (axis.type == ascir::Axis::Type::kAxisTypeMerged || is_reduce_block) {
ss << "const " << axis.axis_size.AsArg() << " = ";
for (const auto &f : axis.from) {
ss << this->GetAxis(f).loop_size.Str() << " * ";
}
ss << "1;" << endl;
ss << "const " << axis.loop_size.AsArg() << " = " << axis.axis_size.Str() << ";" << endl;
ss << "const " << axis.actual_size.AsArg() << " = " << axis.axis_size.Str() << ";" << endl;
}
return ss.str();
}
std::string codegen::Tiler::GenInnerLoopSizeAndActualSize(const ascir::AxisId id, const ascir::AxisId loop_axis) const {
stringstream ss;
auto axis = this->GetAxis(id);
if (!axis.IsInner()) {
return "";
}
if (IsFrom(loop_axis, axis.split_pair_other_id)) {
auto outter = this->GetAxis(axis.split_pair_other_id);
ss << axis.actual_size.AsArg() << " = " << outter.Str() << " < " << outter.axis_size << " ? "
<< axis.axis_size.Str() << " : " << axis.tail_size.Str() << ";" << endl;
ss << axis.loop_size.AsArg() << " = " << axis.actual_size.Str() << ";" << endl;
af::Expression actual_size = af::Symbol(axis.actual_size.name.c_str());
this->actual_sizes.emplace_back(std::make_pair(axis.size_expr, actual_size));
}
return ss.str();
}
std::string codegen::Tiler::CalcFromAxis(const ascir::AxisId id, bool is_define) const {
stringstream ss;
auto axis = this->GetAxis(id);
if (axis.IsInner()) {
auto from = this->GetAxis(axis.from[0]);
ss << (is_define ? from.AsArg() : from.Str()) << " = " << "block_dim_offset" << " + "
<< axis.Str() << ";" << std::endl;
ss << this->CalcFromAxis(from.id, is_define);
} else if (axis.type == Axis::Type::kAxisTypeMerged) {
for (size_t i = 0; i < axis.from.size(); i++) {
auto &from = this->GetAxis(axis.from[i]);
ss << (is_define ? from.AsArg() : from.Str()) << " = " << axis.Str();
ss << " / ";
ss << "(";
for (size_t j = i + 1; j < axis.from.size(); j++) {
ss << this->GetAxis(axis.from[j]).loop_size << " * ";
}
ss << "1)";
ss << " % " << this->GetAxis(axis.from[i]).loop_size << ";" << std::endl;
}
for (auto from : axis.from) {
ss << this->CalcFromAxis(from, is_define);
}
}
return ss.str();
}
void codegen::Tiler::BlockOutterAxisDefine(const ascir::AxisId id, std::stringstream &ss) {
auto axis = this->GetAxis(id);
if (axis.IsInner()) {
return;
}
for (size_t i = 0; i < axis.from.size(); i++) {
auto &from = this->GetAxis(axis.from[i]);
ss << from.AsArg() << " = " << axis.Str();
ss << " / (";
for (size_t j = i + 1; j < axis.from.size(); j++) {
ss << this->GetAxis(axis.from[j]).loop_size << " * ";
}
ss << "1)";
ss << " % " << this->GetAxis(axis.from[i]).loop_size << ";" << std::endl;
if (from.type == Axis::Type::kAxisTypeMerged) {
BlockOutterAxisDefine(axis.from[i], ss);
}
}
}
std::string codegen::Tiler::BlockOutterAxisDefine() {
stringstream code;
code << this->block_dim.Define("GetBlockIdx()") << std::endl;
if (enable_group_parallel_) {
code << "const uint32_t block_offset = " << tiling_data.name << "->ub_size; // resue as block_offset"
<< std::endl;
code << this->block_dim.name << " = " << this->block_dim.name << " >= block_offset ? "
<< this->block_dim.name << " - block_offset : "
<< this->block_dim.name << " + GetBlockNum() - block_offset;" << std::endl;
} else {
code << "if (" << this->block_dim.name << " >= " << tiling_data.name << "->block_dim) { " << std::endl
<< " return;" << std::endl << "}" << std::endl;
}
for (auto &[id, axis] : this->axis_map) {
(void)id;
if (axis.type != ascir::Axis::Type::kAxisTypeBlockOuter) {
continue;
}
stringstream axis_value;
axis_value << this->block_dim.name << " % " << axis.loop_size;
code << axis.Define(axis_value.str(), true);
code << " " << std::endl;
if (axis.from.size() > 1) {
BlockOutterAxisDefine(id, code);
}
}
return code.str();
}
void Tiler::EnableGroupParallel(bool enable_group_parallel) {
enable_group_parallel_ = enable_group_parallel;
}
std::string KernelUtils::Max() {
return "KernelUtils::Max";
}
std::string KernelUtils::Sum() {
return "KernelUtils::Sum";
}
Status KernelUtils::BlkNum(ge::DataType dtype, std::string &result) {
std::stringstream ss;
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(dtype));
ss << "KernelUtils::BlkNum<" << dtype_name << ">";
result = ss.str();
return af::SUCCESS;
}
Status KernelUtils::BlkAlign(ge::DataType dtype, std::string &result) {
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(dtype));
result = "KernelUtils::BlkAlign<" + dtype_name + ">";
return af::SUCCESS;
}
std::string KernelUtils::SizeAlign() {
return "KernelUtils::SizeAlign";
}
std::string KernelUtils::FindNearestPower2() {
return "KernelUtils::FindNearestPower2";
}
TPipe::TPipe(const std::string &tpipe_name, const Tiler &tpipe_tiler)
: Variable(Type{"TPipe"}, tpipe_name), tiler(tpipe_tiler), tmp_buf(Type{"LocalTensor<uint8_t>"}, "tmp_buf") {}
Status TPipe::AddTensor(const Tensor &tensor) {
auto [ret, is_insert1] = this->tensors.emplace(tensor.id, tensor);
GE_CHK_BOOL_RET_STATUS(is_insert1, af::FAILED, "Codegen tensor[%ld,%s] is already added", tensor.id,
tensor.name.c_str());
auto &t = ret->second;
if (t.merge_scope != af::kIdNone &&
(t.alloc_type == af::AllocType::kAllocTypeQueue || t.alloc_type == af::AllocType::kAllocTypeBuffer)) {
auto merge_scope = this->merge_scopes.find(t.merge_scope);
if (merge_scope == this->merge_scopes.end()) {
auto [new_scope, is_insert2] = this->merge_scopes.emplace(t.merge_scope, MergeScope{t.merge_scope, t.position});
GE_CHK_BOOL_RET_STATUS(is_insert2, af::FAILED, "Codegen emplace merge_scope [%ld] failed", t.merge_scope);
new_scope->second.tensors.push_back(t.id);
} else {
GE_CHK_BOOL_RET_STATUS(merge_scope->second.position == t.position, af::FAILED,
"Merge scope for tensor[%s] position mismatch between %d and %d", t.name.c_str(),
static_cast<int32_t>(t.position), static_cast<int32_t>(merge_scope->second.position));
merge_scope->second.tensors.push_back(t.id);
}
}
if (t.alloc_type == af::AllocType::kAllocTypeQueue) {
GE_CHK_BOOL_RET_STATUS(t.que_id != af::kIdNone, af::FAILED, "Codegen tensor[%ld,%s] queue is none", t.id,
t.name.c_str());
TQue *que = nullptr;
auto iter = this->ques.find(t.que_id);
GE_ASSERT_TRUE(iter != this->ques.end(), "Cannot find que with id [%ld], it may not be initialized correctly",
t.que_id);
que = &iter->second;
if (t.merge_scope != af::kIdNone) {
que->merge_scopes.insert(t.merge_scope);
} else {
que->not_merge_tensors.insert(t.id);
}
que->share_group[t.reuse_id].push_back(t.id);
} else if (t.alloc_type == af::AllocType::kAllocTypeBuffer) {
GE_CHK_BOOL_RET_STATUS(t.buf_id != af::kIdNone, af::FAILED, "Codegen tensor[%ld,%s] buffer is none", t.id,
t.name.c_str());
TBuf *buf = nullptr;
auto iter = this->bufs.find(t.buf_id);
if (iter == this->bufs.end()) {
std::string position;
GE_CHK_STATUS_RET(PositionValue(t.position, position), "Codegen get position value failed");
auto [new_buf, is_insert5] = this->bufs.emplace(t.buf_id, TBuf{t.buf_id, t.position, position});
GE_CHK_BOOL_RET_STATUS(is_insert5, af::FAILED, "Codegen emplace tbuf [%ld] failed", t.buf_id);
buf = &new_buf->second;
} else {
buf = &iter->second;
}
GE_CHK_BOOL_RET_STATUS(buf->position == t.position, af::FAILED,
"Codegen buf position mismatch for tensor[%s] between %d and %d", t.name.c_str(),
static_cast<int32_t>(t.position), static_cast<int32_t>(buf->position));
if (t.merge_scope != af::kIdNone) {
buf->merge_scopes.insert(t.merge_scope);
} else {
buf->not_merge_tensors.insert(t.id);
}
}
return af::SUCCESS;
}
Status TPipe::AddTensor(const ascir::TensorAttr &tensor_attr, const std::string &tensor_name) {
auto tensor_val_name = GenValidName(tensor_name);
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(tensor_attr.attr.dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(tensor_attr.attr.dtype));
Tensor tensor(tensor_attr, dtype_name, tensor_val_name);
GE_CHK_STATUS_RET(tensor.Init(), "Codegen tensor init failed");
GE_CHK_STATUS_RET(this->AddTensor(tensor), "Codegen add tensor failed");
return af::SUCCESS;
}
Status TPipe::AddTensor(const std::string &const_value, const ascir::TensorAttr &tensor_attr,
const std::string &tensor_name) {
auto tensor_val_name = GenValidName(tensor_name);
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(tensor_attr.attr.dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(tensor_attr.attr.dtype));
std::string pre_process_value;
GE_CHK_STATUS_RET(ascgen_utils::ScalarValuePreProcess(const_value, dtype_name, pre_process_value),
"Scalar value pre process failed, ori_value:%s, dtype:%s", const_value.c_str(), dtype_name.c_str());
GELOGD("ori_value:%s, dtype:%s, pre_process_value:%s", const_value.c_str(), dtype_name.c_str(),
pre_process_value.c_str());
Tensor tensor(pre_process_value, tensor_attr, dtype_name, tensor_val_name);
GE_CHK_STATUS_RET(tensor.Init(), "Codegen tensor init failed");
GE_CHK_STATUS_RET(this->AddTensor(tensor), "Codegen add tensor failed");
return af::SUCCESS;
}
Status TPipe::AddTensor(const ascir::TensorAttr &tensor_attr, const ascir::SizeExpr &const_value,
const std::string &tensor_name) {
auto tensor_val_name = GenValidName(tensor_name);
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(tensor_attr.attr.dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(tensor_attr.attr.dtype));
Tensor tensor(tensor_attr, dtype_name, const_value, tensor_val_name);
GE_CHK_STATUS_RET(tensor.Init(), "Codegen tensor init failed");
GE_CHK_STATUS_RET(this->AddTensor(tensor), "Codegen add tensor failed");
return af::SUCCESS;
}
std::string TPipe::AllocTmpBuf(const TBuf &buf, const bool with_define) const {
stringstream ss;
if (with_define) {
ss << this->tmp_buf.AsArg();
} else {
ss << this->tmp_buf.Str();
}
ss << "_" << to_string(buf.id) << " = " << buf.name << ".Get<uint8_t>();" << std::endl;
return ss.str();
}
static bool IsNextNodeSupportScalar(const ascir::NodeView &node) {
std::set<std::string> support_ub_scalar_nodes = {Load::Type, Store::Type, Div::Type, TrueDiv::Type, Mul::Type,
Add::Type, Sub::Type, Minimum::Type, Maximum::Type, LogicalOr::Type, LogicalAnd::Type, Broadcast::Type,
ClipByValue::Type, Eq::Type, Ne::Type, Gt::Type, Lt::Type, Ge::Type, Le::Type, Pow::Type, Where::Type};
return support_ub_scalar_nodes.count(node->GetType()) > 0U;
}
Status Kernel::OutputTensorIsUbScalar(const ascir::NodeView &node, bool &is_ub_scalar) const {
is_ub_scalar = true;
auto desc = node->GetOpDesc();
for (auto output : node->outputs()) {
auto output_index = af::ascir::AscTensorUtils::Index(*output);
auto tensor_name = node->GetName() + "_" + desc->GetOutputNameByIndex(output_index);
auto tensor_val_name = GenValidName(tensor_name);
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(output->attr.dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(output->attr.dtype));
Tensor tensor(*output, dtype_name, tensor_val_name);
GE_CHK_STATUS_RET(tensor.Init(), "Codegen tensor init failed");
if (!tensor.is_ub_scalar) {
is_ub_scalar = false;
break;
}
}
GELOGI("node:%s, output tensor is_ub_scalar:%d", node->GetNamePtr(), static_cast<int32_t>(is_ub_scalar));
return af::SUCCESS;
}
static bool IsOutputOnlyLink2VFNode(const ascir::TensorView &tensor) {
for (const auto &peer_input : tensor.anchor.GetPeerInDataAnchors()) {
auto output_node = std::dynamic_pointer_cast<af::AscNode>(peer_input->GetOwnerNode());
if (output_node == nullptr || output_node->GetType() != VectorFunc::Type) {
return false;
}
}
return true;
}
Status Kernel::ParseUbScalarOptimizationInfo(const ascir::NodeView& node, Tensor& t, ascir::TensorId id,
bool is_all_link_vf) {
if (t.is_ub_scalar && !af::ops::IsOps<af::ascir_op::Scalar>(node) && !is_all_link_vf) {
bool a_tenor_of_next_node_is_not_ub_scalar = false;
bool is_next_node_support_ub_scalar = false;
for (auto &out : node->outputs()) {
if (out == nullptr) {
continue;
}
for (auto &peer_input : out->anchor.GetPeerInDataAnchors()) {
auto next_node = std::dynamic_pointer_cast<af::AscNode>(peer_input->GetOwnerNode());
t.need_duplicate_value_of_ub_scalar = IsSupportBlkTensorInput(next_node) ?
true : t.need_duplicate_value_of_ub_scalar;
bool is_ub_scalar;
GE_CHK_STATUS_RET(OutputTensorIsUbScalar(next_node, is_ub_scalar));
if (!is_ub_scalar) {
a_tenor_of_next_node_is_not_ub_scalar = true;
is_next_node_support_ub_scalar = IsNextNodeSupportScalar(next_node);
break;
}
}
if (a_tenor_of_next_node_is_not_ub_scalar) {
this->ub_scalar_tensors.emplace_back(id);
break;
}
}
t.need_gen_get_value_of_ub_scalar = a_tenor_of_next_node_is_not_ub_scalar && is_next_node_support_ub_scalar;
GELOGD("node:%s, tensor_id:%d, is_ub_scalar:%d, need_gen_get_value_of_ub_scalar:%d", node->GetNamePtr(),
static_cast<int32_t>(id), static_cast<int32_t>(t.is_ub_scalar),
static_cast<int32_t>(t.need_gen_get_value_of_ub_scalar));
}
return af::SUCCESS;
}
Status Kernel::JudgeIsLoadLinkStoreAndVec(const ascir::NodeView& node, Tensor& t, ascir::TensorId id) const {
if ((node->attr.api.compute_type == ascir::ComputeType::kComputeLoad) && (!IsOps<Gather>(node))) {
bool link_to_store = false;
bool link_to_vec = false;
for (auto &out : node->outputs()) {
GE_CHK_BOOL_RET_STATUS_NOLOG(out != nullptr, af::SUCCESS);
for (auto &peer_input : out->anchor.GetPeerInDataAnchors()) {
auto next_node = std::dynamic_pointer_cast<af::AscNode>(peer_input->GetOwnerNode());
link_to_store = IsOps<Store>(next_node) ? true : link_to_store;
link_to_vec = IsOps<Store>(next_node) ? link_to_vec : true;
}
}
t.is_load_link_store_and_vec = link_to_store && link_to_vec;
GELOGD("node:%s, tensor_id:%d, is_load_link_store_and_vec:%d", node->GetNamePtr(), static_cast<int32_t>(id),
static_cast<int32_t>(t.is_load_link_store_and_vec));
}
return af::SUCCESS;
}
Status Kernel::ParseOptimizeInfo(const ascir::NodeView &node, const ascir::TensorView &tensor) {
std::set<std::string> force_non_ub_scalar = {Max::Type, Sum::Type, Min::Type, Mean::Type,
Prod::Type, Any::Type, All::Type};
ascir::TensorId id = tensor.attr.mem.tensor_id;
auto tensor_ptr = this->tpipe.GetTensor(id);
GE_CHK_BOOL_RET_STATUS(tensor_ptr != nullptr, af::FAILED, "Check[Param] tensor_ptr is nullptr");
auto &t = *tensor_ptr;
t.is_ub_scalar = (force_non_ub_scalar.count(node->GetType()) > 0U) ? false : t.is_ub_scalar;
GELOGD("node:%s, tensor_id:%d, is_ub_scalar:%d", node->GetNamePtr(), static_cast<int32_t>(id),
static_cast<int32_t>(t.is_ub_scalar));
bool is_all_link_vf = IsOutputOnlyLink2VFNode(tensor);
GE_CHK_STATUS_RET(ParseUbScalarOptimizationInfo(node, t, id, is_all_link_vf));
GE_CHK_STATUS_RET(JudgeIsLoadLinkStoreAndVec(node, t, id));
ParseScalarNeedGenBlkTensors(node, id);
return af::SUCCESS;
}
Status Kernel::ParseScalarNeedGenBlkTensors(const ascir::NodeView &node, ascir::TensorId id) {
if (!IsOps<Scalar>(node)) {
return af::SUCCESS;
}
for (auto &out : node->outputs()) {
GE_CHK_BOOL_EXEC(out != nullptr, continue);
for (auto &peer_input : out->anchor.GetPeerInDataAnchors()) {
auto next_node = std::dynamic_pointer_cast<af::AscNode>(peer_input->GetOwnerNode());
if (IsSupportBlkTensorInput(next_node)) {
this->tpipe.need_gen_blk_tensors.emplace_back(id);
break;
}
}
}
return af::SUCCESS;
}
const TQue* TPipe::GetQue(const ascir::QueId id) const {
auto iter = this->ques.find(id);
GE_CHK_BOOL_EXEC(iter != this->ques.end(), return nullptr, "Codegen que[%d] not found", id);
return &iter->second;
}
const TBuf &TPipe::GetBuf(const ascir::BufId id) const {
auto iter = this->bufs.find(id);
if (iter == this->bufs.end()) {
GELOGE(af::FAILED, "Codegen buf[%d] not found", id);
throw std::runtime_error("Buf not found " + to_string(id));
}
return iter->second;
}
const Tensor *TPipe::GetTensor(ascir::TensorId id) const {
auto iter = tensors.find(id);
GE_CHK_BOOL_EXEC(iter != tensors.end(), return nullptr, "Codegen tensor[%ld] not found", id);
return &iter->second;
}
Tensor *TPipe::GetTensor(ascir::TensorId id) {
auto iter = tensors.find(id);
GE_CHK_BOOL_EXEC(iter != tensors.end(), return nullptr, "Codegen tensor[%ld] not found", id);
return &iter->second;
}
Status TPipe::TensorAlloc(const Tensor &tensor, std::string &result) const {
if (tensor.is_constant) {
result = "";
return af::SUCCESS;
}
std::stringstream ss;
if (this->cv_fusion_type != ascir::CubeTemplateType::kUBFuse) {
ss << tensor.Define() << std::endl;
}
const Variable *buf;
if (tensor.alloc_type == af::AllocType::kAllocTypeBuffer) {
buf = &GetBuf(tensor.buf_id).buf;
} else if (tensor.alloc_type == af::AllocType::kAllocTypeQueue) {
auto t_que = GetQue(tensor.que_id);
GE_CHK_BOOL_RET_STATUS(t_que != nullptr, af::FAILED, "Codegen que[%ld] not found", tensor.que_id);
buf = &t_que->buf;
} else if (tensor.alloc_type == af::AllocType::kAllocTypeGlobal) {
buf = &tensor;
} else {
GELOGE(af::FAILED, "Codegen tensor[%ld, %s] alloc type[%d] not supported", tensor.id, tensor.name.c_str(),
static_cast<int32_t>(tensor.alloc_type));
return af::FAILED;
}
GE_CHK_BOOL_RET_STATUS(tensor.merge_scope == af::kIdNone, af::FAILED,
"Codegen tensor[%ld, %s] merge scope[%ld] not supported", tensor.id, tensor.name.c_str(),
tensor.merge_scope);
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(tensor.dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(tensor.dtype));
if (tensor.alloc_type == af::AllocType::kAllocTypeQueue) {
ss << tensor << " = " << *buf << "[" << tensor.que_share_offset << "]"
<< ".template ReinterpretCast<" << dtype_name << ">();" << std::endl;
} else {
ss << tensor << " = " << *buf << ".template ReinterpretCast<" << dtype_name << ">();" << std::endl;
}
result = ss.str();
return af::SUCCESS;
}
Status TPipe::InitTQueBuffers(const TQue &que, std::string &result) const {
stringstream ss;
std::string blk_align;
GE_CHK_STATUS_RET(KernelUtils::BlkAlign(ge::DT_UINT8, blk_align), "Codegen blk align failed");
if (this->cv_fusion_type == ascir::CubeTemplateType::kUBFuse || !using_att_calc_qbt_size_) {
if (using_global_tpipe_) {
ss << "GetTPipePtr()->InitBuffer(" << que << ", " << que.buf_num << ", " << blk_align << "(" << que.size << "));";
} else {
ss << this->name << "."
<< "InitBuffer(" << que << ", " << que.buf_num << ", " << blk_align << "(" << que.size << "));";
}
} else {
ss << "// ";
ss << this->name << "."
<< "InitBuffer(" << que << ", " << que.buf_num << ", " << blk_align << "(" << que.size << "));" << std::endl;
ss << this->name << "."
<< "InitBuffer(" << que << ", " << que.buf_num << ", t->q" << std::to_string(que.id) << "_size);";
}
result = ss.str();
return af::SUCCESS;
}
Status TPipe::InitTBufBuffer(const TBuf &buf, std::string &result) const {
stringstream ss;
std::string blk_align;
GE_CHK_STATUS_RET(KernelUtils::BlkAlign(ge::DT_UINT8, blk_align), "Codegen blk align failed");
if (this->cv_fusion_type == ascir::CubeTemplateType::kUBFuse || !using_att_calc_qbt_size_) {
if (using_global_tpipe_) {
ss << "GetTPipePtr()->InitBuffer(" << buf << ", " << blk_align << "(" << buf.size << "));";
} else {
ss << this->name << "." << "InitBuffer(" << buf << ", " << blk_align << "(" << buf.size << "));";
}
} else {
ss << "// ";
ss << this->name << "."
<< "InitBuffer(" << buf << ", " << blk_align << "(" << buf.size << "));" << std::endl;
ss << this->name << "."
<< "InitBuffer(" << buf << ", t->b" << std::to_string(buf.id) << "_size);";
}
result = ss.str();
return af::SUCCESS;
}
std::string TPipe::TensorSizeCalc() const {
stringstream ss;
for (const auto &pair : this->tensors) {
const auto &t = pair.second;
if (t.alloc_type == af::AllocType::kAllocTypeQueue) {
ss << t.size.DefineConst(this->tiler.TensorVectorizedSize(t)) << std::endl;
ss << t.que_buf_num.DefineConst(to_string(t.que_buf_num_value)) << std::endl;
} else if (t.alloc_type == af::AllocType::kAllocTypeBuffer) {
ss << t.size.DefineConst(this->tiler.TensorVectorizedSize(t)) << std::endl;
}
}
return ss.str();
}
std::string TPipe::TensorActualSizeCalc(const ascir::TensorId id) const {
stringstream ss;
auto t_ptr = GetTensor(id);
GE_CHK_BOOL_EXEC(t_ptr != nullptr, return "", "t_ptr nullptr");
auto &t = *t_ptr;
if (this->cv_fusion_type != ascir::CubeTemplateType::kUBFuse) {
ss << t.actual_size.DefineConst(this->tiler.TensorActualSize(t));
} else {
ss << t.actual_size.Str() << " = " << this->tiler.TensorActualSize(t) << ";";
}
ss << std::endl;
return ss.str();
}
Status TPipe::MergeScopeSizeCalc(std::string &result) const {
stringstream ss;
for (const auto &pair : this->merge_scopes) {
const auto &merge_scope = pair.second;
stringstream tensor_size_sum;
stringstream tensor_bufnum_max;
tensor_size_sum << KernelUtils::Sum() << "(";
tensor_bufnum_max << KernelUtils::Max() << "(";
bool first = true;
for (auto tid : merge_scope.tensors) {
auto tensor = this->tensors.find(tid);
if (tensor == this->tensors.end()) {
GELOGE(af::FAILED, "Codegen tensor[%ld] not found", tid);
return af::FAILED;
}
if (tensor->second.alloc_type != af::AllocType::kAllocTypeQueue &&
tensor->second.alloc_type != af::AllocType::kAllocTypeBuffer) {
GELOGE(af::FAILED, "Codegen tensor[%ld] is not alloc from que/buf", tid);
return af::FAILED;
}
if (first) {
first = false;
} else {
tensor_size_sum << ", ";
if (tensor->second.alloc_type == af::AllocType::kAllocTypeQueue) {
tensor_bufnum_max << ", ";
}
}
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(tensor->second.dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(tensor->second.dtype));
tensor_size_sum << tensor->second.size << " * " << "sizeof(" << dtype_name << ")";
if (tensor->second.alloc_type == af::AllocType::kAllocTypeQueue) {
tensor_bufnum_max << tensor->second.que_buf_num;
}
}
tensor_size_sum << ")";
tensor_bufnum_max << ")";
ss << merge_scope.size.DefineConst(tensor_size_sum.str()) << std::endl;
ss << merge_scope.buf_num.DefineConst(tensor_bufnum_max.str()) << std::endl;
}
result = ss.str();
return af::SUCCESS;
}
Status TPipe::LocalTQueAlloc(std::string &result) const {
stringstream ss;
for (auto &[id, que] : this->ques) {
if (id == this->cube_output_que_id) {
continue;
}
stringstream tensor_size_max;
stringstream tensor_bufnum_max;
tensor_size_max << KernelUtils::Max() << "(";
tensor_bufnum_max << KernelUtils::Max() << "(";
bool is_first = true;
for (auto mid : que.merge_scopes) {
auto merge_scope = this->merge_scopes.find(mid);
if (merge_scope == this->merge_scopes.end()) {
GELOGE(af::FAILED, "Codegen merge scope not found:%ld", mid);
return af::FAILED;
}
if (is_first) {
is_first = false;
} else {
tensor_size_max << ", ";
tensor_bufnum_max << ", ";
}
tensor_size_max << merge_scope->second.size;
tensor_bufnum_max << merge_scope->second.buf_num;
}
uint32_t tensor_buf_num_max_val = 0;
for (auto tid : que.not_merge_tensors) {
auto tensor = this->tensors.find(tid);
if (tensor == this->tensors.end()) {
GELOGE(af::FAILED, "Codegen tensor not found:%ld", tid);
return af::FAILED;
}
if (is_first) {
is_first = false;
} else {
tensor_size_max << ", ";
}
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(tensor->second.dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(tensor->second.dtype));
tensor_size_max << tensor->second.size << " * sizeof(" << dtype_name << ")";
tensor_buf_num_max_val = std::max(tensor_buf_num_max_val, tensor->second.que_buf_num_value);
}
tensor_bufnum_max << (que.merge_scopes.empty() ? "" : ", ") << tensor_buf_num_max_val;
for (auto share_tensors : que.share_group) {
if (share_tensors.second.size() <= 1) {
continue;
}
tensor_size_max << ", ";
bool is_first_share = true;
for (auto tid : share_tensors.second) {
auto tensor = this->tensors.find(tid);
GE_ASSERT_TRUE(tensor != this->tensors.end(), "Codegen share tensor not found:%ld", tid);
if (is_first_share) {
is_first_share = false;
} else {
tensor_size_max << " + ";
}
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(tensor->second.dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(tensor->second.dtype));
tensor_size_max << tensor->second.size << " * sizeof(" << dtype_name << ")";
}
}
tensor_size_max << ")";
tensor_bufnum_max << ")";
if (this->cv_fusion_type == ascir::CubeTemplateType::kUBFuse) {
ss << que.size.DefineConst(tensor_size_max.str()) << std::endl;
ss << que.buf_num.DefineConst(tensor_bufnum_max.str()) << std::endl;
} else if (!using_att_calc_qbt_size_) {
ss << que.size.DefineConst(tensor_size_max.str()) << std::endl;
ss << que.buf_num.DefineConst(tensor_bufnum_max.str()) << std::endl;
ss << que.Define() << std::endl;
} else {
ss << "// " << que.size.DefineConst(tensor_size_max.str()) << std::endl;
ss << que.buf_num.DefineConst(tensor_bufnum_max.str()) << std::endl;
ss << que.Define() << std::endl;
}
std::string init;
GE_CHK_STATUS_RET(this->InitTQueBuffers(que, init), "Codegen init tque buffers failed");
ss << init << std::endl;
}
result = ss.str();
return af::SUCCESS;
}
Status TPipe::BlkTensorAllocAndInit(std::string &result) const {
stringstream ss;
for (auto &id : this->need_gen_blk_tensors) {
auto tensor_ptr = this->GetTensor(id);
GE_CHK_BOOL_RET_STATUS(tensor_ptr != nullptr, af::FAILED, "BlkTensorAllocAndInit need_gen_blk_tensors failed");
std::string scalar_t_buf_name = tensor_ptr->name + "_tbuf";
std::string scalar_local_blk_tensor_name = "local_blk_tensor_of_" + tensor_ptr->name;
ss << "TBuf<TPosition::VECCALC> " << scalar_t_buf_name << ";" << std::endl;
if (using_global_tpipe_) {
ss << "GetTPipePtr()->InitBuffer(" << scalar_t_buf_name << ", 32);" << std::endl;
} else {
ss << "tpipe.InitBuffer(" << scalar_t_buf_name << ", 32);" << std::endl;
}
ss << "LocalTensor<" << tensor_ptr->type << "> " << scalar_local_blk_tensor_name << " = " << scalar_t_buf_name
<< ".Get<" << tensor_ptr->type << ">();" << std::endl;
ss << "Duplicate(" << scalar_local_blk_tensor_name << "[0], static_cast<" << tensor_ptr->type
<< ">(" << tensor_ptr->const_value << "), static_cast<uint64_t>(32/"
<< "sizeof(" << tensor_ptr->type << ")));" << std::endl;
ss << "AscendC::PipeBarrier<PIPE_V>();" << std::endl;
}
result = ss.str();
return af::SUCCESS;
}
std::string TPipe::GenDuplicateBufAlloc(const std::set<std::pair<std::string, std::string>>& pre_api_extract_dup) const {
std::stringstream ss;
int32_t i = 1;
for (auto [const_val, const_dtype] : pre_api_extract_dup) {
const std::string index_str = std::to_string(i);
ss << "TBuf<TPosition::VECCALC> builtin_tmp_buffer_" << index_str << ";" << std::endl;
if (using_global_tpipe_) {
ss << "GetTPipePtr()->InitBuffer(builtin_tmp_buffer_" << index_str << ", ONE_BLK_SIZE);" << std::endl;
} else {
ss << "tpipe.InitBuffer(builtin_tmp_buffer_" << index_str << ", ONE_BLK_SIZE);" << std::endl;
}
ss << "LocalTensor<uint8_t> builtin_tmp_buf_" << index_str << " = builtin_tmp_buffer_" << index_str <<
".Get<uint8_t>();" << std::endl;
std::string local_tensor_name = "local_blk_tensor_of_" + const_dtype + "_" + const_val;
ss << "LocalTensor<" << const_dtype << "> " << local_tensor_name <<
" = builtin_tmp_buf_" << index_str << ".template ReinterpretCast<" << const_dtype << ">();" << std::endl;
if (const_dtype == "half" || const_dtype == "float" || const_dtype == "double") {
const_val += ".0";
}
ss << "Duplicate(" << local_tensor_name << "[0], (" << const_dtype << ")" << const_val <<
", ONE_BLK_SIZE / sizeof(" << const_dtype << "));"<< std::endl;
i++;
}
return ss.str();
}
Status TPipe::LocalTBufAlloc(const TBuf &buf, std::string &result, const bool with_define) const {
stringstream ss;
std::string reuse_dtype_name = "";
std::vector<const Tensor *> reuse_buf_tensors;
bool is_buf_reuse = true;
stringstream tensor_size_max;
GE_CHK_STATUS_RET(ParseTBufReuse(buf, reuse_dtype_name, is_buf_reuse, reuse_buf_tensors,
tensor_size_max), "Codegen parse tbuf reuse failed");
if (this->cv_fusion_type == ascir::CubeTemplateType::kUBFuse) {
ss << buf.size.DefineConst(tensor_size_max.str()) << std::endl;
} else if (!using_att_calc_qbt_size_) {
ss << buf.size.DefineConst(tensor_size_max.str()) << std::endl;
ss << buf.Define() << std::endl;
} else {
ss << "// " << buf.size.DefineConst(tensor_size_max.str()) << std::endl;
ss << buf.Define() << std::endl;
}
std::string init;
GE_CHK_STATUS_RET(this->InitTBufBuffer(buf, init), "Codegen init tbuf buffer failed");
ss << init << std::endl;
if (!is_buf_reuse) {
ss << buf.AllocBuf(with_define) << std::endl;
} else {
ss << buf.AllocBuf(reuse_buf_tensors[0]->name, reuse_dtype_name, with_define) << std::endl;
reuse_buf_tensors[0]->no_need_realloc = true;
for (size_t i = 1UL; i < reuse_buf_tensors.size(); i++) {
reuse_buf_tensors[i]->no_need_realloc = true;
if (with_define) {
ss << "LocalTensor<" << reuse_dtype_name << "> ";
}
ss << reuse_buf_tensors[i]->name << " = " << reuse_buf_tensors[0]->name << ";" << std::endl;
}
}
result = ss.str();
return af::SUCCESS;
}
Status TPipe::LocalTBufAllocLoopTwice(std::string &result, const bool with_define) const {
stringstream ss;
std::string tmp;
for (const auto buf_id : this->contiguous_buf_ids) {
const auto it = this->bufs.find(buf_id);
GE_ASSERT_TRUE(it != this->bufs.cend(), "buf not found, buf_id = %ld", buf_id);
const auto &buf = it->second;
GE_CHK_STATUS_RET(this->LocalTBufAlloc(buf, tmp, with_define), "Codegen TBuf alloc failed(no tmp buf).");
ss << tmp;
}
std::set<ascir::BufId> allocated{this->contiguous_buf_ids.cbegin(), this->contiguous_buf_ids.cend()};
for (auto &pair : this->bufs) {
if (allocated.find(pair.first) != allocated.end()) {
continue;
}
auto &buf = pair.second;
if (!buf.tmp_buf_reuse) {
GE_CHK_STATUS_RET(this->LocalTBufAlloc(buf, tmp, with_define), "Codegen TBuf alloc failed(no tmp buf).");
ss << tmp;
}
}
for (auto &pair : this->bufs) {
auto &buf = pair.second;
if (buf.tmp_buf_reuse) {
GE_CHK_STATUS_RET(this->LocalTBufAlloc(buf, tmp, with_define), "Codegen TBuf alloc failed(tmp buf).");
ss << tmp;
ss << this->AllocTmpBuf(buf, with_define);
}
}
ss << std::endl;
result = ss.str();
return af::SUCCESS;
}
Status TPipe::LocalTensorQueBufAlloc(std::string &result) const {
stringstream ss;
std::string tmp;
ss << this->TensorSizeCalc();
GE_CHK_STATUS_RET(this->MergeScopeSizeCalc(tmp), "Codegen merge scope size failed");
ss << tmp;
ss << std::endl;
GE_CHK_STATUS_RET(this->LocalTQueAlloc(tmp), "Codegen alloc local tque failed");
ss << tmp;
GE_CHK_STATUS_RET(this->LocalTBufAllocLoopTwice(tmp), "Codegen alloc local tbuf failed");
ss << tmp << std::endl;
result = ss.str();
return af::SUCCESS;
}
std::string TPipe::SyncMte3ToMte2(const Tensor in_tensor) const {
stringstream ss;
std::string event_name = in_tensor.Str() + "_e";
std::string sync_name = in_tensor.Str() + "_s";
ss << "auto " << event_name << " = tpipe.AllocEventID<HardEvent::MTE3_MTE2>();" << std::endl
<< "TQueSync<PIPE_MTE3, PIPE_MTE2> " << sync_name << ";" << std::endl
<< sync_name << ".SetFlag(" << event_name << ");" << std::endl
<< sync_name << ".WaitFlag(" << event_name << ");" << std::endl
<< "tpipe.ReleaseEventID<HardEvent::MTE3_MTE2>(" << event_name << ");" << std::endl;
return ss.str();
}
std::string TPipe::SyncMte2ToMte3(const Tensor in_tensor) const {
stringstream ss;
std::string event_name = in_tensor.Str() + "_e";
std::string sync_name = in_tensor.Str() + "_s";
ss << "auto " << event_name << " = tpipe.AllocEventID<HardEvent::MTE2_MTE3>();" << std::endl
<< "TQueSync<PIPE_MTE2, PIPE_MTE3> " << sync_name << ";" << std::endl
<< sync_name << ".SetFlag(" << event_name << ");" << std::endl
<< sync_name << ".WaitFlag(" << event_name << ");" << std::endl
<< "tpipe.ReleaseEventID<HardEvent::MTE2_MTE3>(" << event_name << ");" << std::endl;
return ss.str();
}
Status TPipe::CollectQues(const ascir::ImplGraph &graph) {
std::unordered_map<ascir::QueId, af::Position> que_id_to_src_position;
std::set<ascir::QueId> need_bind_que_id;
for (auto node : graph.GetAllNodes()) {
if (node->attr.api.type == ge::ApiType::kAPITypeBuffer) {
continue;
}
for (const auto &out_tensor : node->outputs()) {
if (out_tensor->attr.mem.alloc_type != af::AllocType::kAllocTypeQueue) {
continue;
}
const int64_t tensor_que_id = out_tensor->attr.que.id;
if (out_tensor->attr.mem.position == af::Position::kPositionVecIn) {
que_id_to_src_position.emplace(tensor_que_id, out_tensor->attr.mem.position);
std::set<std::string> peer_node_types;
for (const auto &peer_in_anchor : out_tensor->anchor.GetPeerInDataAnchorsPtr()) {
if (peer_in_anchor != nullptr && peer_in_anchor->GetOwnerNodeBarePtr() != nullptr) {
peer_node_types.emplace(peer_in_anchor->GetOwnerNodeBarePtr()->GetType());
}
}
if ((peer_node_types.size() == 1U) && *peer_node_types.begin() == Store::Type) {
need_bind_que_id.emplace(tensor_que_id);
}
} else if (out_tensor->attr.mem.position == af::Position::kPositionVecOut) {
que_id_to_src_position.emplace(tensor_que_id, out_tensor->attr.mem.position);
}
}
}
for (const auto &iter : que_id_to_src_position) {
if (this->ques.count(iter.first) > 0UL) {
continue;
}
std::string position;
GE_CHK_STATUS_RET(PositionValue(iter.second, position), "Codegen get position value failed");
bool need_que_bind = need_bind_que_id.count(iter.first) > 0UL;
if (need_que_bind) {
std::string dst_position;
GE_CHK_STATUS_RET(PositionValue(af::Position::kPositionVecOut, dst_position),
"Codegen get position value failed");
auto new_que = this->ques.emplace(iter.first, TQue{iter.first, iter.second, position, dst_position});
GE_CHK_BOOL_RET_STATUS(new_que.second, af::FAILED, "Codegen emplace que [%ld] failed", iter.first);
} else {
auto new_que = this->ques.emplace(iter.first, TQue{iter.first, iter.second, position});
GE_CHK_BOOL_RET_STATUS(new_que.second, af::FAILED, "Codegen emplace que [%ld] failed", iter.first);
}
}
for (auto &[id, que] : this->ques) {
if (id != this->cube_output_que_id) {
que.is_cv_ub_fusion = (this->cv_fusion_type == ascir::CubeTemplateType::kUBFuse);
}
}
return af::SUCCESS;
}
void TPipe::SetUsingAttCalcQBTSizeConfig(bool using_att_calc_qbt_size) {
using_att_calc_qbt_size_ = using_att_calc_qbt_size;
}
void TPipe::SetUsingGlobalTpipe(bool using_global_tpipe) {
using_global_tpipe_ = using_global_tpipe;
}
Kernel::Kernel(const std::string &kernel_name)
: workspace_arg("workspace"),
name(kernel_name),
tiler(kernel_name + "TilingData", "t"),
tpipe("tpipe", this->tiler),
root_loop(af::kIdNone) {}
Kernel::~Kernel() {
root_loop.Destruct();
}
std::string Kernel::TilingKeyFuncDeclare(const std::string &impl_graph_name, const std::string &tiling_data) const {
const char *flags[] = {"inline", "__aicore__"};
const char *return_type = "void";
std::stringstream ss;
for (auto flag : flags) {
ss << flag << " ";
}
ss << return_type << " ";
ss << CamelToLowerSneak(impl_graph_name) << "(";
if (use_list_tensor_) {
ss << "ListTensorDesc &input_tensor_desc, ListTensorDesc &output_tensor_desc, ";
} else {
for (auto &input : this->inputs) {
ss << input.AsArg() << ", ";
}
for (auto &output : this->outputs) {
ss << output.AsArg() << ", ";
}
}
ss << this->workspace_arg.AsArg() << ", ";
for (auto &workspace : this->workspaces) {
ss << workspace.AsArg() << ", ";
}
ss << "const "<< tiling_data << " *t";
ss << ")";
return ss.str();
}
Status Kernel::GlobalTensorInit(std::string &result) const {
std::stringstream ss;
for (std::size_t i = 0; i < this->inputs.size(); i++) {
const auto &tensor = this->tpipe.tensors.find(this->input_tensors[i]);
if (tensor == this->tpipe.tensors.end()) {
GELOGE(af::FAILED, "Codegen input tensor id[%ld] not found", this->input_tensors[i]);
return af::FAILED;
}
if (tensor->second.is_constant) {
continue;
}
ss << tensor->second.Define() << std::endl;
std::string local_result;
if (use_list_tensor_) {
auto input_index = input_name_to_index_.at(this->inputs[i].Str());
auto input = GM_ADDR("input_tensor_desc.GetDataPtr<__gm__ uint8_t>(" + std::to_string(input_index) + ")");
GE_CHK_STATUS_RET(tensor->second.SetGlobalBuffer(input, "", local_result),
"Codegen set global buffer failed");
} else {
GE_CHK_STATUS_RET(tensor->second.SetGlobalBuffer(this->inputs[i], "", local_result),
"Codegen set global buffer failed");
}
ss << local_result << std::endl;
}
for (std::size_t i = 0; i < this->outputs.size(); i++) {
const auto &tensor = this->tpipe.tensors.find(this->output_tensors[i]);
if (tensor == this->tpipe.tensors.end()) {
GELOGE(af::FAILED, "Codegen output tensor id[%ld] not found", this->output_tensors[i]);
return af::FAILED;
}
ss << tensor->second.Define() << std::endl;
std::string local_result;
if (use_list_tensor_) {
auto output_index = output_name_to_index_.at(this->outputs[i].Str());
auto output = GM_ADDR("output_tensor_desc.GetDataPtr<__gm__ uint8_t>(" + std::to_string(output_index) + ")");
GE_CHK_STATUS_RET(tensor->second.SetGlobalBuffer(output, "", local_result),
"Codegen set global buffer failed");
} else {
GE_CHK_STATUS_RET(tensor->second.SetGlobalBuffer(this->outputs[i], "", local_result),
"Codegen set global buffer failed");
}
ss << local_result << std::endl;
}
for (std::size_t i = 0; i < this->constant_tensors.size(); i++) {
auto tensor = this->tpipe.tensors.find(this->constant_tensors[i]);
if (tensor == this->tpipe.tensors.end()) {
GELOGE(af::FAILED, "Codegen concat tensor id[%ld] not found", this->constant_tensors[i]);
return af::FAILED;
}
GELOGI("const_value_expr: %s", tensor->second.const_value_expr.Str().get());
string const_value = tensor->second.const_value_expr == 0 ? tensor->second.const_value
: tiler.Size(tensor->second.const_value_expr, true);
ss << tensor->second.DefineConst(const_value.c_str()) << std::endl;
GELOGI("Define ss value: %s", ss.str().c_str());
}
for (std::size_t i = 0; i < this->ub_scalar_tensors.size(); i++) {
auto tensor = this->tpipe.tensors.find(this->ub_scalar_tensors[i]);
GE_ASSERT_TRUE((tensor != this->tpipe.tensors.end()), "Codegen ub_scalar tensor id[%ld] not found",
this->ub_scalar_tensors[i]);
std::string def_ub_scalar;
GE_CHK_STATUS_RET(tensor->second.DefineUbScalar(def_ub_scalar));
ss << def_ub_scalar;
GELOGI("Define ub_scalar var: %s", def_ub_scalar.c_str());
}
std::stringstream offset_ss;
offset_ss << "0";
auto it_ws_tensors = this->workspace_tensors.begin();
for (size_t i = 0UL; i < this->workspaces.size(); i++) {
GELOGI("Define workspace tensor id: %ld", it_ws_tensors->first);
auto tensor = this->tpipe.tensors.find(it_ws_tensors->first);
if (tensor == this->tpipe.tensors.end()) {
GELOGE(af::FAILED, "Codegen workspace tensor id[%ld] not found", it_ws_tensors->first);
return af::FAILED;
}
ss << tensor->second.Define() << std::endl;
std::string local_result;
GE_CHK_STATUS_RET(tensor->second.SetGlobalBuffer(this->workspace_arg, offset_ss.str(), local_result),
"Codegen set global buffer failed");
ss << local_result << std::endl;
offset_ss << " + " << "(" << this->workspaces[i] << ")";
it_ws_tensors++;
}
result = ss.str();
return af::SUCCESS;
}
Status Kernel::LocalTensorQueBufAlloc(std::string &result, const ascir::ImplGraph &graph) const {
(void)graph;
stringstream ss;
std::string tmp;
ss << this->tpipe.Define() << std::endl;
if (!this->pre_api_extract_dup.empty()) {
ss << this->tpipe.GenDuplicateBufAlloc(this->pre_api_extract_dup) << std::endl;
}
GE_CHK_STATUS_RET(this->tpipe.BlkTensorAllocAndInit(tmp), "Codegen BlkTensorAllocAndInit failed");
ss << tmp << std::endl;
GE_CHK_STATUS_RET(this->tpipe.LocalTensorQueBufAlloc(tmp), "Codegen alloc local tensor que buf failed");
ss << tmp << std::endl;
result = ss.str();
return af::SUCCESS;
}
Status Kernel::ParseWorkspaceTensor(const ascir::TensorAttr *tensor,
const ascir::FusedScheduledResult &fused_schedule_result,
std::set<int64_t> &output_indices,
const std::unordered_map<ascir::TensorId, size_t> &output_tensorid_to_index,
const std::map<size_t, std::string> output_index_to_name) {
if (this->tpipe.tensors.find(tensor->attr.mem.tensor_id) == this->tpipe.tensors.cend()) {
GE_CHK_STATUS_RET(this->tpipe.AddTensor(*tensor, "workspace"), "Codegen add tensor failed");
}
if (output_tensorid_to_index.find(tensor->attr.mem.tensor_id) != output_tensorid_to_index.cend()) {
int64_t index;
const auto &out_node = fused_schedule_result.output_nodes[output_tensorid_to_index.at(tensor->attr.mem.tensor_id)];
GE_CHK_GRAPH_STATUS_RET(out_node->attr.ir_attr->GetAttrValue("index", index),
"Failed to get Workspace reuse Output index, node = %s", out_node->GetNamePtr());
GE_ASSERT_TRUE(index >= 0, "invalid Workspace reuse Output index, node = %s, index = %ld", out_node->GetNamePtr(),
index);
GE_ASSERT_TRUE(output_index_to_name.find(index) != output_index_to_name.cend(),
"Get workspace reuse output name failed.");
const auto &output_name = output_index_to_name.at(index);
GE_ASSERT_TRUE(!output_name.empty(), "Failed to get workspace reuse arg name, output_node = %s, index = %ld",
out_node->GetNamePtr(), index);
if (output_indices.emplace(index).second) {
this->outputs.emplace_back(GM_ADDR(GenValidName(output_name)));
this->output_tensors.emplace_back(out_node->inputs[0].attr.mem.tensor_id);
}
}
return af::SUCCESS;
}
Status Kernel::ParseGraph(const ascir::ImplGraph &graph, const ascir::FusedScheduledResult &fused_schedule_result,
Kernel &kernel) {
GE_CHK_STATUS_RET(CheckGraphValidity(graph), "Graph: %s is invalid", graph.GetName().c_str());
std::map<size_t, std::string> input_index_to_name;
std::map<size_t, std::string> output_index_to_name;
std::unordered_map<ascir::TensorId, size_t> output_tensorid_to_index;
for (size_t i = 0U; i < fused_schedule_result.input_nodes.size(); ++i) {
const auto &input = fused_schedule_result.input_nodes[i];
GE_ASSERT_TRUE(IsOps<Data>(input) || IsOps<ScalarData>(input), "Codegen unsupported input[%s] type[%s]",
input->GetName().c_str(), input->GetType().c_str());
const auto &normalized_name = GenValidName(input->GetName());
kernel.input_name_to_index_[normalized_name] = i;
input_index_to_name[i] = normalized_name;
GELOGD("input_index = %zu, input_name = %s", i, normalized_name.c_str());
}
for (size_t i = 0U; i < fused_schedule_result.output_nodes.size(); ++i) {
const auto &output = fused_schedule_result.output_nodes[i];
const auto &normalized_name = GenValidName(output->GetName());
kernel.output_name_to_index_[normalized_name] = i;
output_index_to_name[i] = output->GetName();
output_tensorid_to_index[output->inputs[0].attr.mem.tensor_id] = i;
GELOGD("output_index = %zu, output_name = %s", i, normalized_name.c_str());
}
std::set<int64_t> input_indices;
std::set<int64_t> output_indices;
std::map<int64_t, std::pair<std::string, ascir::TensorId>> kernel_outputs;
bool has_gather = false;
for (const auto &node : graph.GetAllNodes()) {
if (IsOps<Data>(node) || IsOps<ScalarData>(node)) {
int64_t index;
GE_CHK_GRAPH_STATUS_RET(node->attr.ir_attr->GetAttrValue("index", index), "Failed to get Data index, node = %s",
node->GetNamePtr());
GE_ASSERT_TRUE(index >= 0, "invalid Data index, node = %s, index = %ld", node->GetNamePtr(), index);
const auto &input_name = input_index_to_name[index];
GE_ASSERT_TRUE(!input_name.empty(), "Failed to get arg name, input_node = %s, index = %ld", node->GetNamePtr(),
index);
if (input_indices.emplace(index).second) {
if (IsOps<Data>(node)) {
kernel.inputs.emplace_back(GM_ADDR(GenValidName(input_name)));
} else if (IsOps<ScalarData>(node)) {
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(node->outputs[0].attr.dtype, dtype_name), "data type:%d failed",
static_cast<int32_t>(node->outputs[0].attr.dtype));
kernel.inputs.emplace_back(Variable(Type(dtype_name), input_name));
}
kernel.input_tensors.emplace_back(node->outputs[0].attr.mem.tensor_id);
}
continue;
}
if (IsOps<Output>(node)) {
int64_t index;
GE_CHK_GRAPH_STATUS_RET(node->attr.ir_attr->GetAttrValue("index", index), "Failed to get Data index, node = %s",
node->GetNamePtr());
GE_ASSERT_TRUE(index >= 0, "invalid Data index, node = %s, index = %ld", node->GetNamePtr(), index);
const auto &output_name = output_index_to_name[index];
GE_ASSERT_TRUE(!output_name.empty(), "Failed to get arg name, output_node = %s, index = %ld", node->GetNamePtr(),
index);
if (output_indices.emplace(index).second) {
kernel_outputs[index] = std::make_pair(output_name, node->inputs[0].attr.mem.tensor_id);
}
continue;
}
has_gather = (has_gather || IsOps<Gather>(node));
}
for (const auto &pair : kernel_outputs) {
kernel.outputs.emplace_back(GM_ADDR(GenValidName(pair.second.first)));
kernel.output_tensors.emplace_back(pair.second.second);
}
std::vector<ascir::TensorId> workspace_tensor_id = GetWorkspaceTensorIdListInOneScheduleResult(fused_schedule_result);
for (auto tId : workspace_tensor_id) {
std::string workspaceStr = "workspace";
workspaceStr = workspaceStr + std::to_string(tId);
kernel.workspaces.emplace_back(Uint32(workspaceStr.c_str()));
kernel.workspace_tensors[tId] = "0";
}
for (auto node : graph.GetAllNodes()) {
if (IsOps<Scalar>(node) || IsOps<IndexExpr>(node)) {
kernel.constant_tensors.emplace_back(node->outputs[0].attr.mem.tensor_id);
}
}
for (auto axis : graph.GetAllAxis()) {
GE_CHK_STATUS_RET(kernel.tiler.AddAxis(*axis), "Codegen add axis failed");
}
kernel.tiler.AddAxisSplitBAttr();
for (auto size : graph.GetAllSizeVar()) {
kernel.tiler.AddSizeVar(*size);
}
GE_ASSERT_SUCCESS(kernel.tpipe.CollectQues(graph));
for (auto node : graph.GetAllNodes()) {
if (IsOps<Output>(node) || IsOps<Data>(node) || IsOps<ScalarData>(node)) {
continue;
}
auto desc = node->GetOpDesc();
for (auto output : node->outputs()) {
auto output_index = af::ascir::AscTensorUtils::Index(*output);
auto tensor_name = node->GetName() + "_" + desc->GetOutputNameByIndex(output_index);
if (IsOps<Scalar>(node)) {
std::string const_value;
auto ir_attr = node->attr.ir_attr.get();
if (ir_attr->GetAttrValue("value", const_value) != af::GRAPH_SUCCESS) {
GELOGE(af::FAILED, "GetAttrValue const value failed");
return af::FAILED;
}
GELOGI("Scalar node const value %s", const_value.c_str());
GE_CHK_STATUS_RET(kernel.tpipe.AddTensor(const_value, *output, tensor_name), "Codegen add tensor failed");
GE_CHK_STATUS_RET(kernel.ParseOptimizeInfo(node, *output));
} else if (IsOps<ScalarData>(node)) {
GELOGI("ScalarData node const value %s", node->GetName().c_str());
GE_CHK_STATUS_RET(kernel.tpipe.AddTensor(GenValidName(node->GetName()), *output, tensor_name),
"Codegen add tensor failed");
GE_CHK_STATUS_RET(kernel.ParseOptimizeInfo(node, *output));
} else if (IsOps<IndexExpr>(node)) {
int64_t size_id = 0;
auto ir_attr = node->attr.ir_attr.get();
if (ir_attr->GetAttrValue("expr", size_id) != af::GRAPH_SUCCESS) {
GELOGE(af::FAILED, "GetAttrValue index expr failed, size_id = %lld", size_id);
return af::FAILED;
}
GELOGI("size_id = %lld", size_id);
auto all_sizevar = graph.GetAllSizeVar();
GE_CHK_STATUS_RET(kernel.tpipe.AddTensor(*output, all_sizevar.at(size_id)->expr, tensor_name),
"Codegen add tensor failed");
} else if (IsOps<Workspace>(node)) {
GE_CHK_STATUS_RET(kernel.ParseWorkspaceTensor(output, fused_schedule_result, output_indices,
output_tensorid_to_index, output_index_to_name),
"Codegen parse workspace tensor failed");
kernel.has_workspace_node = true;
} else if (IsOps<Store>(node)) {
if (kernel.tpipe.tensors.find(output->attr.mem.tensor_id) == kernel.tpipe.tensors.cend()) {
GE_CHK_STATUS_RET(kernel.tpipe.AddTensor(*output, tensor_name), "Codegen add tensor failed");
}
} else {
GE_CHK_STATUS_RET(kernel.tpipe.AddTensor(*output, tensor_name), "Codegen add tensor failed");
GE_CHK_STATUS_RET(kernel.ParseOptimizeInfo(node, *output));
}
}
}
for (auto node : fused_schedule_result.input_nodes) {
auto desc = node->GetOpDesc();
for (auto output : node->outputs()) {
auto tensor_name = node->GetName() + "_" + desc->GetOutputNameByIndex(af::ascir::AscTensorUtils::Index(*output));
kernel.tpipe.AddTensor(*output, tensor_name);
}
}
for (auto node : fused_schedule_result.workspace_nodes) {
GE_ASSERT_TRUE(IsOps<Workspace>(node), "fused_schedule_result node[%s] is not workspace", node->GetName().c_str());
auto &output = node->outputs[0U];
if (kernel.tpipe.tensors.find(output.attr.mem.tensor_id) == kernel.tpipe.tensors.cend()) {
GELOGD("Workspace node[%s] is input in a schedule group, tensor id[%ld]", node->GetName().c_str(),
output.attr.mem.tensor_id);
GE_CHK_STATUS_RET(kernel.tpipe.AddTensor(output, "workspace"), "Codegen add tensor failed");
}
}
for (auto node : graph.GetAllNodes()) {
for (auto tmp_buffer : node->attr.tmp_buffers) {
if (tmp_buffer.id == -1) {
continue;
}
auto it = kernel.tpipe.bufs.find(tmp_buffer.id);
GELOGD("reuse tmp buffer id is %ld.", tmp_buffer.id);
if (it == kernel.tpipe.bufs.end()) {
std::string position = "TPosition::VECCALC";
ascir::Position tensor_position = af::Position::kPositionVecCalc;
auto [new_buf, is_insert5] = kernel.tpipe.bufs.emplace(tmp_buffer.id, TBuf{tmp_buffer.id, tensor_position, position});
GE_CHK_BOOL_RET_STATUS(is_insert5, af::FAILED, "Codegen emplace tbuf [%ld] failed", tmp_buffer.id);
new_buf->second.tmp_buf_size_list.emplace_back(tmp_buffer.buf_desc.size);
new_buf->second.tmp_buf_reuse = true;
} else {
it->second.tmp_buf_size_list.emplace_back(tmp_buffer.buf_desc.size);
it->second.tmp_buf_reuse = true;
}
}
}
uint32_t total_blk_num = 0U;
GetApiExtractDupSet(graph, kernel.pre_api_extract_dup, total_blk_num);
kernel.SetEnableParallelCompile((!has_gather));
if (IsCVFusionUBGraph(graph, kernel.tpipe.cv_fusion_type)) {
GE_CHK_STATUS_RET(kernel.tpipe.GetCVFusionCubeOutputUBTensorIdAndQueId(graph),
"get cube output tensor id failed");
}
return kernel.root_loop.ConstructFromNodes(graph.GetAllNodes(), kernel.tiler, kernel.tpipe);
}
Status Kernel::GenerateSubGraphFuncDef(const Loop *loop, std::stringstream &ss) const {
GE_ASSERT_NOTNULL(loop);
std::stack<const Loop *> loop_stack;
loop_stack.push(loop);
while (!loop_stack.empty()) {
const Loop *current_loop = loop_stack.top();
loop_stack.pop();
for (auto &body : current_loop->bodys) {
if (body.type == LoopType::LOOP) {
GE_ASSERT_NOTNULL(body.loop);
loop_stack.push(body.loop);
} else if (body.type == LoopType::CALL) {
GE_ASSERT_NOTNULL(body.call);
GE_ASSERT_SUCCESS(body.call->GenerateFuncDefinition(tpipe, tiler, ss), "gen func definition failed, api_name:%s",
body.call->api_name_.c_str());
}
}
}
return af::SUCCESS;
}
Status Kernel::Generate(const std::string &impl_graph_name, const std::string &tiling_data, std::string &result,
const ascir::ImplGraph &graph) {
if (ascgen_utils::IsCubeType(graph)) {
return af::SUCCESS;
}
stringstream ss;
GE_ASSERT_SUCCESS(GenerateSubGraphFuncDef(&(this->root_loop), ss));
ss << this->TilingKeyFuncDeclare(impl_graph_name, tiling_data) << " {" << std::endl;
std::string tmp;
for (auto &[id, axis] : tiler.axis_map) {
if (axis.IsInner() || IsMergeFromInner(tiler, axis)) {
continue;
}
if (axis.IsOuter() && !(axis.type == ascir::Axis::Type::kAxisTypeBlockOuter && axis.from.size() > 1)) {
ss << tiler.GenAxisSizeNew(axis.split_pair_other_id);
}
ss << tiler.GenAxisSizeNew(id);
}
ss << this->tiler.BlockOutterAxisDefine();
ss << std::endl;
GE_CHK_STATUS_RET(this->GlobalTensorInit(tmp), "Codegen global tensor init failed");
ss << tmp;
ss << std::endl;
GE_CHK_STATUS_RET(this->LocalTensorQueBufAlloc(tmp, graph), "Codegen alloc local tensor que buf failed");
ss << tmp;
GE_CHK_STATUS_RET(this->root_loop.Generate(this->tiler, this->tpipe, tmp), "Codegen root loop Generate failed");
ss << tmp;
ss << "}" << std::endl;
result = ss.str();
return af::SUCCESS;
}
std::string Kernel::GetIncludeApiHeaderFiles(const ascir::FusedScheduledResult &fused_schedule_result) {
std::set<std::string> api_header_list = {
"basic_api/kernel_tpipe.h",
"basic_api/kernel_tensor.h",
"basic_api/kernel_type.h",
"basic_api/kernel_operator_block_sync_intf.h",
"basic_api/kernel_operator_data_copy_intf.h",
"basic_api/kernel_common.h",
"basic_api/kernel_operator_common_intf.h",
"basic_api/kernel_operator_sys_var_intf.h",
"basic_api/kernel_struct_binary.h",
};
std::stringstream ss;
for (const auto &header : api_header_list) {
ss << "#include \"" << header << "\"" << std::endl;
}
for (size_t graph_id = 0; graph_id < fused_schedule_result.node_idx_to_scheduled_results.size(); graph_id++) {
auto scheduled_results = fused_schedule_result.node_idx_to_scheduled_results[graph_id];
for (size_t i = 0; i < scheduled_results.size(); i++) {
auto schedule_groups = scheduled_results[i].schedule_groups;
for (size_t j = 0; j < schedule_groups.size(); j++) {
auto schedule_graphs = schedule_groups[j].impl_graphs;
for (size_t k = 0; k < schedule_graphs.size(); k++) {
for (const auto &node : schedule_graphs[k].GetAllNodes()) {
auto impl = ascgen_utils::GetAscIrCodegenImpl(node->GetType());
GE_ASSERT_NOTNULL(impl, "GetAscIrCodegenImpl of node %s[%s] is null", node->GetTypePtr(),
node->GetNamePtr());
for (const auto &header_str : impl->IncludeApiHeaderFiles()) {
if (api_header_list.count(header_str) == 0) {
api_header_list.insert(header_str);
ss << "#include \"" << header_str << "\"" << std::endl;
}
}
}
}
}
}
}
return ss.str();
}
std::string Kernel::IncludeAndDefines(const ascir::FusedScheduledResult &fused_schedule_result,
const std::string &kernel_task_type, bool use_tensor_desc, bool is_inductor) {
std::stringstream ss;
ss << Kernel::GetIncludeApiHeaderFiles(fused_schedule_result);
if (use_tensor_desc) {
ss << "#include \"kernel_operator_list_tensor_intf.h\"" << std::endl;
}
ss << "#include \"autofuse_tiling_data.h\"" << std::endl;
ss << std::endl;
ss << "using namespace AscendC;" << std::endl;
ss << std::endl;
if (!is_inductor) {
ss << "KERNEL_TASK_TYPE_DEFAULT(" << kernel_task_type << ");" << std::endl;
}
ss << std::endl;
const static string kAscendcUtilsExtend = {
#include "utils_str.h"
};
const static string kAscendcBrcInline = {
#include "brc_inline_api_str.h"
};
ss << kAscendcUtilsExtend << kAscendcBrcInline << std::endl;
return ss.str();
}
std::string Kernel::KernelFuncDeclare(const std::string &graph_name,
const ascir::FusedScheduledResult &fused_schedule_result, bool use_list_tensor,
bool is_inductor, bool is_conv2d) {
std::stringstream ss;
if (ascgen_utils::IsCubeFusedScheduled(fused_schedule_result)) {
if (is_conv2d) {
ss << "template<int8_t FmapTiling, int8_t WeightTiling, int8_t L1PingPong, int8_t L0PingPong, int8_t "
"OutputOrder, int8_t IterOrder, int8_t GroupType, int8_t EnableSmallChannel, int8_t WeightUbTrans, int8_t "
"FmapCopyMode, int8_t InnerBatch, int8_t DisContinuous > "
<< std::endl;
} else {
ss << "template <int8_t API_LEVEL, int8_t A_TRANS, int8_t B_TRANS, int8_t BATCH_MODEL, int8_t MODEL, int8_t "
"FULL_LOAD, int8_t L0C2OUT_MODEL>"
<< std::endl;
}
const char *flags[] = {"__global__", "__aicore__"};
for (auto flag : flags) {
ss << flag << " ";
}
} else {
const char *flags[] = {"extern \"C\"", "__global__", "__aicore__"};
for (auto flag : flags) {
ss << flag << " ";
}
}
ss << "void " << CamelToLowerSneak(graph_name) << "(";
if (use_list_tensor) {
ss << GM_ADDR("inputs").AsArg() << ", " << GM_ADDR("outputs").AsArg() << ", ";
} else {
for (auto &input : fused_schedule_result.input_nodes) {
if (IsOps<ScalarData>(input)) {
std::string dtype_name;
GE_ASSERT_SUCCESS(Tensor::DtypeName(input->outputs[0].attr.dtype, dtype_name), "data type:%d failed",
static_cast<int32_t>(input->outputs[0].attr.dtype));
ss << dtype_name << " " << GenValidName(input->GetName()) << ", ";
} else {
ss << GM_ADDR(GenValidName(input->GetName())).AsArg() << ", ";
}
}
for (auto &output : fused_schedule_result.output_nodes) {
ss << GM_ADDR(GenValidName(output->GetName())).AsArg() << ", ";
}
}
ss << GM_ADDR("workspace").AsArg() << ", ";
if (is_inductor) {
ss << "AutofuseTilingData t";
} else {
ss << GM_ADDR("gm_tiling_data").AsArg();
}
ss << ")";
return ss.str();
}
std::string Kernel::GenTilingFuncCall(const std::string &impl_graph_name, const std::string &tiling_data,
uint32_t index, bool enable_group_parallel, bool need_sync_all) const {
std::stringstream ss;
ss << (index == 0 ? " if (" : " else if (");
if (enable_group_parallel) {
ss << "MatchTilingKeyAndBlockDim(" << CamelToLowerSneak(tiling_data) << ", " << index << ")";
} else {
ss << CamelToLowerSneak(tiling_data) << ".tiling_key == " << std::to_string(index);
}
ss << ") {" << std::endl;
ss << " " << CamelToLowerSneak(impl_graph_name) << "(";
if (use_list_tensor_) {
ss << kInputTensorDescName << ", " << kOutputTensorDescName << ", ";
} else {
for (auto &input : this->inputs) {
ss << input.Str() << ", ";
}
for (auto &output : this->outputs) {
ss << output.Str() << ", ";
}
}
ss << this->workspace_arg.Str() << ", ";
for (auto &workspace : this->workspaces) {
ss << "t." << workspace.Str() << ", ";
}
ss << "&" << tiling_data;
ss << ");" << std::endl;
if (need_sync_all) {
ss << " SyncAll();" << std::endl;
}
ss << " }";
return ss.str();
}
std::string Kernel::GenTilingFuncCall(const std::string &impl_graph_name, const std::string &tiling_data) const {
std::stringstream string_stream;
string_stream << CamelToLowerSneak(impl_graph_name) << "(";
if (use_list_tensor_) {
string_stream << kInputTensorDescName << ", " << kOutputTensorDescName << ", ";
} else {
for (auto &input : this->inputs) {
string_stream << input.Str() << ", ";
}
for (auto &output : this->outputs) {
string_stream << output.Str() << ", ";
}
}
string_stream << this->workspace_arg.Str() << ", ";
for (auto &workspace : this->workspaces) {
string_stream << "t."<< workspace.Str() << ", ";
}
string_stream << "&" << tiling_data;
string_stream << ");";
return string_stream.str();
}
Status Kernel::GenSingleGroupKernelWithRegTilingKey(const ascir::FusedScheduledResult &fused_schedule_result,
const CodegenConfig& config, std::stringstream &ss,
std::stringstream &ss1, bool use_list_tensor) {
std::string tiling_data_type = "AutofuseTilingData";
std::unordered_set<const std::string *> kernel_file_ptr;
auto schedule_graphs = fused_schedule_result.node_idx_to_scheduled_results[0][0].schedule_groups[0].impl_graphs;
std::vector<TilingFuncCall> func_calls;
uint32_t tiling_key = 0U;
for (size_t i = 0; i < schedule_graphs.size(); i++) {
Kernel kernel(schedule_graphs[i].GetName());
kernel.tiler.SetTilingCaseId(i);
kernel.SetUsingAttCalcQBTSizeConfig(config.using_att_calc_qbt_size);
kernel.SetUseListTensor(use_list_tensor);
GE_CHK_STATUS_RET(Kernel::ParseGraph(schedule_graphs[i], fused_schedule_result, kernel),
"Codegen parse graph failed");
auto is_dynamic = !IsStaticSchedResult(fused_schedule_result);
GE_CHK_STATUS_RET(kernel.GenerateKernelByNode(schedule_graphs[i], ss, kernel_file_ptr, is_dynamic));
std::string tmp;
GE_CHK_STATUS_RET(kernel.Generate(schedule_graphs[i].GetName(), tiling_data_type, tmp, schedule_graphs[i]),
"Codegen generate kernel for %s failed", schedule_graphs[i].GetName().c_str());
ss << tmp;
std::string func_call;
if (ascgen_utils::IsCubeType(schedule_graphs[i])) {
func_call = kernel.GenCubeTilingFuncCall(schedule_graphs[i]);
} else {
func_call = kernel.GenTilingFuncCall(schedule_graphs[i].GetName(), "t");
}
func_calls.emplace_back(func_call, kernel.has_workspace_node, false);
}
std::vector<TilingFuncCall> current;
std::vector<std::vector<TilingFuncCall>> per_group_func_calls;
per_group_func_calls.emplace_back(std::move(func_calls));
AppendFuncCall(ss1, per_group_func_calls, current, 0, tiling_key, ascgen_utils::IsCubeFusedScheduled(fused_schedule_result));
return af::SUCCESS;
}
Status Kernel::GenMulGroupKernelWithRegTilingKey(const ascir::FusedScheduledResult &fused_schedule_result,
const CodegenConfig& config, std::stringstream &ss,
std::stringstream &ss1, bool use_list_tensor) {
std::string tiling_data_type = "AutofuseTilingData";
std::unordered_set<const std::string *> kernel_file_ptr;
uint32_t tiling_key = 0U;
for (size_t graph_id = 0; graph_id < fused_schedule_result.node_idx_to_scheduled_results.size(); graph_id++) {
auto scheduled_results = fused_schedule_result.node_idx_to_scheduled_results[graph_id];
for (size_t i = 0; i < scheduled_results.size(); i++) {
auto schedule_groups = scheduled_results[i].schedule_groups;
auto enable_group_parallel = scheduled_results[i].enable_group_parallel;
std::vector<std::vector<TilingFuncCall>> per_group_func_calls;
ascir::CubeTemplateType cv_fusion_type = scheduled_results[i].cube_type;
if (cv_fusion_type == ascir::CubeTemplateType::kCommon) {
continue;
}
for (size_t j = 0; j < schedule_groups.size(); j++) {
auto schedule_graphs = schedule_groups[j].impl_graphs;
std::vector<TilingFuncCall> func_calls;
bool vector_no_db_flag = true;
for (size_t k = 0; k < schedule_graphs.size(); k++) {
std::string tiling_data = "AscGraph" + std::to_string(graph_id) + "ScheduleResult" + std::to_string(i) +
"G" + std::to_string(j) + "TilingData";
Kernel kernel(schedule_graphs[k].GetName());
kernel.SetUsingAttCalcQBTSizeConfig(config.using_att_calc_qbt_size);
kernel.SetUseListTensor(use_list_tensor);
kernel.tiler.SetTilingCaseId(k);
kernel.tiler.EnableGroupParallel(enable_group_parallel);
kernel.tpipe.cv_fusion_type = cv_fusion_type;
GE_CHK_STATUS_RET(Kernel::ParseGraph(schedule_graphs[k], fused_schedule_result, kernel),
"Codegen parse graph failed");
auto is_dynamic = !IsStaticSchedResult(fused_schedule_result);
GE_CHK_STATUS_RET(kernel.GenerateKernelByNode(schedule_graphs[k], ss, kernel_file_ptr, is_dynamic));
if (IsCVFusionUBGraph(schedule_graphs[k], cv_fusion_type)) {
GE_CHK_STATUS_RET(
kernel.GenerateVecFuncOfCVFusion(ss, vector_no_db_flag, IsConv2DFusedScheduled(fused_schedule_result)),
"Gen CV fusion Func failed");
} else {
std::string tmp;
GE_CHK_STATUS_RET(kernel.Generate(schedule_graphs[k].GetName(), tiling_data, tmp, schedule_graphs[k]),
"Codegen generate kernel for %s failed", schedule_graphs[k].GetName().c_str());
ss << tmp;
}
std::string filed_name = "t.graph" + std::to_string(graph_id) + "_result" + std::to_string(i) + "_g" +
std::to_string(j) + "_tiling_data";
bool need_sync_all = kernel.has_workspace_node && j != schedule_groups.size() - 1;
std::string func_call;
if (ascgen_utils::IsCubeType(schedule_graphs[k])) {
func_calls.emplace_back(kernel.GenCubeTilingFuncCall(schedule_graphs[k]), kernel.has_workspace_node, true);
} else if (cv_fusion_type == ascir::CubeTemplateType::kUBFuse) {
GE_CHK_STATUS_RET(kernel.InitCVFusionAddr(ss1, vector_no_db_flag), "Init CV Fusion Addr failed");
vector_no_db_flag = false;
continue;
} else {
func_calls.emplace_back(kernel.GenTilingFuncCall(schedule_graphs[k].GetName(), filed_name),
kernel.has_workspace_node, need_sync_all);
}
}
if (!func_calls.empty()) {
per_group_func_calls.emplace_back(std::move(func_calls));
}
}
std::vector<TilingFuncCall> current;
AppendFuncCall(ss1, per_group_func_calls, current, 0, tiling_key, ascgen_utils::IsCubeFusedScheduled(fused_schedule_result));
}
}
return af::SUCCESS;
}
Status Kernel::GenKernelFuncWithRegTilingKey(const ascir::FusedScheduledResult &fused_schedule_result,
const CodegenConfig& config, std::stringstream &ss, std::stringstream &ss1,
bool use_list_tensor) {
if (ascgen_utils::IsSingleGroup(fused_schedule_result)) {
GE_ASSERT_SUCCESS(GenSingleGroupKernelWithRegTilingKey(fused_schedule_result, config, ss, ss1, use_list_tensor));
} else {
GE_ASSERT_SUCCESS(GenMulGroupKernelWithRegTilingKey(fused_schedule_result, config, ss, ss1, use_list_tensor));
}
return af::SUCCESS;
}
Status Kernel::GenSingleGroupKernelWithParseTilingData(const ascir::FusedScheduledResult &fused_schedule_result,
const std::vector<af::AscGraph> &schedule_graphs,
const CodegenConfig& config, std::stringstream &ss,
std::stringstream &ss1, bool use_list_tensor,
std::unordered_set<const std::string *> &kernel_file_ptr) {
for (size_t i = 0; i < schedule_graphs.size(); i++) {
Kernel kernel(schedule_graphs[i].GetName());
kernel.SetUsingAttCalcQBTSizeConfig(config.using_att_calc_qbt_size);
kernel.SetUseListTensor(use_list_tensor);
kernel.tiler.SetTilingCaseId(i);
GE_CHK_STATUS_RET(Kernel::ParseGraph(schedule_graphs[i], fused_schedule_result, kernel),
"Codegen parse graph failed");
auto is_dynamic = !IsStaticSchedResult(fused_schedule_result);
GE_CHK_STATUS_RET(kernel.GenerateKernelByNode(schedule_graphs[i], ss, kernel_file_ptr, is_dynamic));
std::string tmp;
GE_CHK_STATUS_RET(kernel.Generate(schedule_graphs[i].GetName(), "AutofuseTilingData", tmp, schedule_graphs[i]),
"Codegen generate kernel for %s failed", schedule_graphs[i].GetName().c_str());
ss << tmp;
if (ascgen_utils::IsCubeType(schedule_graphs[i])) {
ss1 << kernel.GenCubeTilingFuncCall(schedule_graphs[i]);
} else {
ss1 << kernel.GenTilingFuncCall(schedule_graphs[i].GetName(), "t", i);
}
}
return af::SUCCESS;
}
Status Kernel::GenCubeCommonFuncForScheduleGroup(const ascir::FusedScheduledResult &fused_schedule_result,
const size_t graph_id, const size_t common_index,
const size_t group_index, const CodegenConfig &config,
std::stringstream &ss, std::stringstream &res_ss, const bool use_list_tensor,
std::unordered_set<const std::string *> &kernel_file_ptr) {
const auto &scheduled_results = fused_schedule_result.node_idx_to_scheduled_results[graph_id];
const auto &schedule_groups = scheduled_results[common_index].schedule_groups;
auto enable_group_parallel = scheduled_results[common_index].enable_group_parallel;
ascir::CubeTemplateType cv_fusion_type = scheduled_results[common_index].cube_type;
const auto &schedule_graphs = schedule_groups[group_index].impl_graphs;
GE_ASSERT_TRUE(!schedule_graphs.empty(), "schedule_graphs is empty");
for (size_t k = 0U; k < schedule_graphs.size(); k++) {
Kernel kernel(schedule_graphs[k].GetName());
kernel.SetUsingAttCalcQBTSizeConfig(config.using_att_calc_qbt_size);
kernel.SetUseListTensor(use_list_tensor);
kernel.tiler.SetTilingCaseId(k);
kernel.tiler.EnableGroupParallel(enable_group_parallel);
kernel.tpipe.cv_fusion_type = cv_fusion_type;
GE_CHK_STATUS_RET(Kernel::ParseGraph(schedule_graphs[k], fused_schedule_result, kernel),
"Codegen parse graph failed");
auto is_dynamic = !IsStaticSchedResult(fused_schedule_result);
GE_CHK_STATUS_RET(kernel.GenerateKernelByNode(schedule_graphs[k], ss, kernel_file_ptr, is_dynamic),
"Gen api headers failed");
if (ascgen_utils::IsCubeType(schedule_graphs[k])) {
res_ss << kernel.GenCubeCommonTilingSingleFuncCall(schedule_graphs[k]);
return af::SUCCESS;
} else {
std::string tmp;
GE_CHK_STATUS_RET(kernel.Generate(schedule_graphs[k].GetName(), "AutofuseTilingData", tmp, schedule_graphs[k]),
"Codegen generate cv kernel for %s failed", schedule_graphs[k].GetName().c_str());
ss << tmp;
res_ss << kernel.GenTilingFuncCall(schedule_graphs[k].GetName(), "t", k, enable_group_parallel, false)
<< std::endl;
}
}
return af::SUCCESS;
}
Status Kernel::GenCubeCommonFuncForAIV(const ascir::FusedScheduledResult &fused_schedule_result, const size_t graph_id,
const size_t common_index, const size_t group_index, const CodegenConfig &config,
std::stringstream &ss, std::stringstream &vec_ss, const bool use_list_tensor,
std::unordered_set<const std::string *> &kernel_file_ptr) {
vec_ss << "if ASCEND_IS_AIV {" << std::endl;
vec_ss << " SyncAll<false>();" << std::endl;
vec_ss << " #ifdef CV_AIV_NUM" << std::endl;
vec_ss << " if (GetBlockIdx() >= CV_AIV_NUM) {" << std::endl;
vec_ss << " return;" << std::endl;
vec_ss << " }" << std::endl;
vec_ss << " #endif" << std::endl;
if (!IsEmptyTensorSence(fused_schedule_result)) {
vec_ss << " GET_TILING_DATA(t, gm_tiling_data);" << std::endl;
GE_ASSERT_SUCCESS(GenCubeCommonFuncForScheduleGroup(fused_schedule_result, graph_id, common_index, group_index,
config, ss, vec_ss, use_list_tensor, kernel_file_ptr));
}
vec_ss << "}" << std::endl;
return af::SUCCESS;
}
Status Kernel::GenCubeCommonFuncForAICMix(const ascir::FusedScheduledResult &fused_schedule_result,
const size_t graph_id, const size_t common_index, const size_t group_index,
const CodegenConfig &config, std::stringstream &ss,
std::stringstream &cube_ss, const bool use_list_tensor,
std::unordered_set<const std::string *> &kernel_file_ptr) {
cube_ss << " #ifdef CV_AIC_NUM" << std::endl;
cube_ss << " if ASCEND_IS_AIC {" << std::endl;
cube_ss << " if (GetBlockIdx() >= CV_AIC_NUM) {" << std::endl;
cube_ss << " SyncAll<false>();" << std::endl;
cube_ss << " return;" << std::endl;
cube_ss << " }" << std::endl;
cube_ss << " }" << std::endl;
cube_ss << " #endif" << std::endl;
cube_ss << " uint32_t vec_wss = 0U;" << std::endl;
cube_ss << " #ifdef CV_VEC_WSS" << std::endl;
cube_ss << " vec_wss = CV_VEC_WSS;" << std::endl;
cube_ss << " #endif" << std::endl;
GE_ASSERT_SUCCESS(GenCubeCommonFuncForScheduleGroup(fused_schedule_result, graph_id, common_index, group_index,
config, ss, cube_ss, use_list_tensor, kernel_file_ptr));
cube_ss << " if ASCEND_IS_AIC {" << std::endl;
cube_ss << " SyncAll<false>();" << std::endl;
cube_ss << " }" << std::endl;
return af::SUCCESS;
}
Status Kernel::GenCubeCommonFuncForAIC(const ascir::FusedScheduledResult &fused_schedule_result, const size_t graph_id,
const size_t common_index, const size_t group_index, const CodegenConfig &config,
std::stringstream &ss, std::stringstream &cube_ss, const bool use_list_tensor,
std::unordered_set<const std::string *> &kernel_file_ptr) {
cube_ss << "if ASCEND_IS_AIC {" << std::endl;
cube_ss << " #ifdef CV_AIC_NUM" << std::endl;
cube_ss << " if (GetBlockIdx() >= CV_AIC_NUM) {" << std::endl;
cube_ss << " SyncAll<false>();" << std::endl;
cube_ss << " return;" << std::endl;
cube_ss << " }" << std::endl;
cube_ss << " #endif" << std::endl;
cube_ss << " uint32_t vec_wss = 0U;" << std::endl;
cube_ss << " #ifdef CV_VEC_WSS" << std::endl;
cube_ss << " vec_wss = CV_VEC_WSS;" << std::endl;
cube_ss << " #endif" << std::endl;
GE_ASSERT_SUCCESS(GenCubeCommonFuncForScheduleGroup(fused_schedule_result, graph_id, common_index, group_index,
config, ss, cube_ss, use_list_tensor, kernel_file_ptr));
cube_ss << " SyncAll<false>();" << std::endl;
cube_ss << "}" << std::endl;
return af::SUCCESS;
}
Status Kernel::GenCubeCommonFuncOfCVFusion(const ascir::FusedScheduledResult &fused_schedule_result,
const size_t graph_id, const size_t common_index,
const CodegenConfig &config, std::stringstream &ss, std::stringstream &ss1,
const bool use_list_tensor,
std::unordered_set<const std::string *> &kernel_file_ptr) {
const auto &scheduled_results = fused_schedule_result.node_idx_to_scheduled_results[graph_id];
const auto &schedule_groups = scheduled_results[common_index].schedule_groups;
std::vector<std::vector<std::string>> per_group_func_calls;
for (size_t j = 0U; j < schedule_groups.size(); j++) {
std::vector<std::string> func_calls;
auto schedule_graphs = schedule_groups[j].impl_graphs;
GE_ASSERT_TRUE(!schedule_graphs.empty(), "schedule_graphs is empty");
bool is_cube_group = ascgen_utils::IsCubeType(schedule_graphs[0]);
if (is_cube_group) {
std::stringstream cube_ss;
cube_ss << "#ifdef CV_SAFETY_FUSION_MIX_MODE" << std::endl;
GE_ASSERT_SUCCESS(GenCubeCommonFuncForAICMix(fused_schedule_result, graph_id, common_index, j, config, ss,
cube_ss, use_list_tensor, kernel_file_ptr));
cube_ss << "#else" << std::endl;
GE_ASSERT_SUCCESS(GenCubeCommonFuncForAIC(fused_schedule_result, graph_id, common_index, j, config, ss, cube_ss,
use_list_tensor, kernel_file_ptr));
cube_ss << "#endif" << std::endl;
func_calls.emplace_back(cube_ss.str());
} else {
std::stringstream vec_ss;
GE_ASSERT_SUCCESS(GenCubeCommonFuncForAIV(fused_schedule_result, graph_id, common_index, j, config, ss, vec_ss,
use_list_tensor, kernel_file_ptr));
func_calls.emplace_back(vec_ss.str());
}
if (!func_calls.empty()) {
if (is_cube_group) {
per_group_func_calls.insert(per_group_func_calls.cbegin(), std::move(func_calls));
} else {
per_group_func_calls.emplace_back(std::move(func_calls));
}
}
}
AppendFuncCall(ss1, per_group_func_calls.cbegin(), per_group_func_calls.cend(), false);
return af::SUCCESS;
}
Status Kernel::GenMulGroupKernelWithParseTilingData(const ascir::FusedScheduledResult &fused_schedule_result,
const size_t graph_id, const CodegenConfig &config,
std::stringstream &ss, std::stringstream &ss1, bool use_list_tensor,
std::unordered_set<const std::string *> &kernel_file_ptr) {
auto scheduled_results = fused_schedule_result.node_idx_to_scheduled_results[graph_id];
uint32_t function_id = kFuncIdBegin;
for (size_t i = 0; i < scheduled_results.size(); i++) {
auto schedule_groups = scheduled_results[i].schedule_groups;
auto enable_group_parallel = scheduled_results[i].enable_group_parallel;
ascir::CubeTemplateType cv_fusion_type = scheduled_results[i].cube_type;
if (cv_fusion_type == ascir::CubeTemplateType::kCommon) {
GE_ASSERT_SUCCESS(GenCubeCommonFuncOfCVFusion(fused_schedule_result, graph_id, i, config, ss, ss1,
use_list_tensor, kernel_file_ptr));
continue;
} else if (cv_fusion_type == ascir::CubeTemplateType::kDefault) {
ss1 << (i == 0 ? " if" : " else if ") << "(t." << "graph" << std::to_string(graph_id)
<< "_tiling_key == " << std::to_string(i) << ") {" << std::endl;
}
std::vector<std::vector<std::string>> per_group_func_calls;
bool enable_parallel_compile = true;
for (size_t j = 0; j < schedule_groups.size(); j++) {
std::vector<std::string> func_calls;
auto schedule_graphs = schedule_groups[j].impl_graphs;
bool vector_no_db_flag = true;
for (size_t k = 0; k < schedule_graphs.size(); k++) {
std::string tiling_data = "AscGraph" + std::to_string(graph_id) + "ScheduleResult" + std::to_string(i) +
"G" + std::to_string(j) + "TilingData";
Kernel kernel(schedule_graphs[k].GetName());
kernel.SetUsingAttCalcQBTSizeConfig(config.using_att_calc_qbt_size);
kernel.SetUseListTensor(use_list_tensor);
kernel.tiler.SetTilingCaseId(k);
kernel.tiler.EnableGroupParallel(enable_group_parallel);
kernel.tpipe.cv_fusion_type = cv_fusion_type;
GE_CHK_STATUS_RET(Kernel::ParseGraph(schedule_graphs[k], fused_schedule_result, kernel),
"Codegen parse graph failed");
auto is_dynamic = !IsStaticSchedResult(fused_schedule_result);
GE_CHK_STATUS_RET(kernel.GenerateKernelByNode(schedule_graphs[k], ss, kernel_file_ptr, is_dynamic),
"Gen api headers failed");
std::string filed_name = "t.graph" + std::to_string(graph_id) + "_result" + std::to_string(i) + "_g" +
std::to_string(j) + "_tiling_data";
bool need_sync_all = kernel.has_workspace_node && j != schedule_groups.size() - 1;
if (ascgen_utils::IsCubeType(schedule_graphs[k])) {
func_calls.emplace_back(kernel.GenCubeTilingFuncCall(schedule_graphs[k]));
} else if (cv_fusion_type == ascir::CubeTemplateType::kUBFuse) {
GE_CHK_STATUS_RET(
kernel.GenerateVecFuncOfCVFusion(ss, vector_no_db_flag, IsConv2DFusedScheduled(fused_schedule_result)),
"Gen CV fusion Func failed");
GE_CHK_STATUS_RET(kernel.InitCVFusionAddr(ss1, vector_no_db_flag), "Init CV Fusion Addr failed");
vector_no_db_flag = false;
continue;
} else {
std::string tmp;
GE_CHK_STATUS_RET(kernel.Generate(schedule_graphs[k].GetName(), tiling_data, tmp, schedule_graphs[k]),
"Codegen generate kernel for %s failed", schedule_graphs[k].GetName().c_str());
ss << tmp;
func_calls.emplace_back(kernel.GenTilingFuncCall(schedule_graphs[k].GetName(), filed_name, k,
enable_group_parallel, need_sync_all));
enable_parallel_compile = (enable_parallel_compile && kernel.GetEnableParallelCompile());
}
}
if (!func_calls.empty()) {
per_group_func_calls.emplace_back(std::move(func_calls));
}
}
auto max_group_per_compile_unit = GetMaxGroupPerCompileUnit(enable_parallel_compile);
if (per_group_func_calls.size() <= static_cast<size_t>(max_group_per_compile_unit)) {
AppendFuncCall(ss1, per_group_func_calls.cbegin(), per_group_func_calls.cend());
} else {
const auto kernel_args = PackingFuncArgs("AutofuseTilingData", fused_schedule_result, use_list_tensor);
const auto packing_func_names =
Kernel::GenPackingFunctions(ss, kernel_args, per_group_func_calls, max_group_per_compile_unit, function_id);
GenPackingFunctionCalls(ss1, kernel_args, packing_func_names);
}
if (cv_fusion_type == ascir::CubeTemplateType::kDefault) {
ss1 << " }";
}
}
FakeTilingIds(ss, function_id);
return af::SUCCESS;
}
Status Kernel::GenKernelFuncWithParseTilingData(const ascir::FusedScheduledResult &fused_schedule_result,
const CodegenConfig& config, std::stringstream &ss,
std::stringstream &ss1, bool use_list_tensor) {
std::unordered_set<const std::string *> kernel_file_ptr;
for (size_t graph_id = 0; graph_id < fused_schedule_result.node_idx_to_scheduled_results.size(); graph_id++) {
auto scheduled_results = fused_schedule_result.node_idx_to_scheduled_results[graph_id];
if ((fused_schedule_result.node_idx_to_scheduled_results.size() == 1) && (scheduled_results.size() == 1) &&
(scheduled_results[0].schedule_groups.size() == 1)) {
auto schedule_graphs = scheduled_results[0].schedule_groups[0].impl_graphs;
GE_ASSERT_SUCCESS(GenSingleGroupKernelWithParseTilingData(fused_schedule_result, schedule_graphs, config, ss,
ss1, use_list_tensor, kernel_file_ptr));
} else {
GE_ASSERT_SUCCESS(GenMulGroupKernelWithParseTilingData(fused_schedule_result, graph_id, config, ss,
ss1, use_list_tensor, kernel_file_ptr));
}
}
return af::SUCCESS;
}
int64_t Kernel::GetMaxGroupPerCompileUnit(bool enable_parallel_compile) {
uint32_t max_group_per_compile_unit = std::numeric_limits<uint32_t>::max();
if (enable_parallel_compile) {
auto backend_spec = optimize::BackendSpec::GetInstance();
if (backend_spec != nullptr) {
max_group_per_compile_unit = backend_spec->max_group_num_per_compile_unit;
}
}
return max_group_per_compile_unit;
}
ge::Status Kernel::GenCubeCommonTiling(std::stringstream &ss, const bool is_batch, bool is_conv2d) const {
if (is_conv2d) {
ss << "AscendC::TPipe pipe;" << std::endl;
ss << " conv2d_v2<";
ss << "FmapTiling, WeightTiling, L1PingPong, L0PingPong, OutputOrder, IterOrder, GroupType, EnableSmallChannel, "
"WeightUbTrans, FmapCopyMode, InnerBatch, DisContinuous>(";
} else {
if (is_batch) {
ss << " batch_mat_mul_v3<";
} else {
ss << " mat_mul_v3<";
}
ss << "API_LEVEL, A_TRANS, B_TRANS, BATCH_MODEL, MODEL, FULL_LOAD, L0C2OUT_MODEL>(";
}
return af::SUCCESS;
}
std::string Kernel::GenCubeTilingSingleFuncCall(const bool is_batch, const bool is_cv_fuse, bool is_bias,
bool is_offset_w, bool is_conv2d) const {
std::stringstream ss;
GE_CHK_STATUS(GenCubeCommonTiling(ss, is_batch, is_conv2d), "GenCubeCommonTilingHead failed");
if (use_list_tensor_) {
ss << kInputTensorDescName << ", " << kOutputTensorDescName << ", ";
} else {
if (this->inputs.size() < (2U + (is_bias ? 1U : 0U) + (is_offset_w ? 1U : 0U))) {
ss << this->inputs[0].Str() << ", ";
}
for (auto &input : this->inputs) {
ss << input.Str() << ", ";
}
if (!is_bias) {
ss << "nullptr, ";
}
if (!is_offset_w) {
ss << "nullptr, ";
}
for (auto &output : this->outputs) {
ss << output.Str() << ", ";
}
if (this->outputs.empty()) {
ss << (is_cv_fuse ? "nullptr, " : "output_0, ");
}
}
ss << this->workspace_arg.Str() << ", ";
ss << "gm_tiling_data";
ss << (is_cv_fuse ? ", &CV_FUSION_ADDR" : "");
ss << ");" << std::endl;
return ss.str();
}
std::string Kernel::GenCubeCommonTilingSingleFuncCall(const ascir::ImplGraph &impl_graph) const {
bool is_batch = false;
bool has_bias = false;
bool has_offset_w = false;
bool is_conv2d = IsConv2DGraphType(impl_graph);
if (is_conv2d) {
has_bias = ascgen_utils::IsConv2DTypeWithBias(impl_graph);
has_offset_w = ascgen_utils::IsConv2DTypeWithOffsetW(impl_graph);
} else {
is_batch = ascgen_utils::IsMatMulTypeWithBatch(impl_graph);
has_bias = ascgen_utils::IsMatMulTypeWithBias(impl_graph);
has_offset_w = ascgen_utils::IsMatMulTypeWithOffsetW(impl_graph);
}
std::stringstream ss;
GE_CHK_STATUS(GenCubeCommonTiling(ss, is_batch, is_conv2d), "GenCubeCommonTilingHead failed");
if (use_list_tensor_) {
ss << kInputTensorDescName << ", " << kOutputTensorDescName << ", ";
} else {
auto min_inputs_num = 1U + (has_bias ? 1U : 0U) + (has_offset_w ? 1U : 0U);
GE_ASSERT_TRUE(this->inputs.size() >= min_inputs_num, "cube inputs num [%u] < min_inputs_num [%u]",
this->inputs.size(), min_inputs_num);
(this->inputs.size() == min_inputs_num) ? (ss << this->inputs[0].Str() << ", ") : (ss << "");
for (auto &input : this->inputs) {
ss << input.Str() << ", ";
}
if (!has_bias) {
ss << "nullptr, ";
}
if (!has_offset_w) {
ss << "nullptr, ";
}
for (auto &output : this->outputs) {
ss << output.Str() << ", ";
}
if (this->outputs.empty()) {
ss << this->workspace_arg.Str() << ", ";
}
}
ss << this->workspace_arg.Str();
ss << " + vec_wss, gm_tiling_data);" << std::endl;
return ss.str();
}
std::string Kernel::GenCubeTilingFuncCall(const ascir::ImplGraph &impl_graph) const {
bool is_batch = false;
bool is_bias = false;
bool is_offset_w = false;
bool is_conv2d = IsConv2DGraphType(impl_graph);
if (is_conv2d) {
is_bias = ascgen_utils::IsConv2DTypeWithBias(impl_graph);
is_offset_w = ascgen_utils::IsConv2DTypeWithOffsetW(impl_graph);
} else {
is_batch = ascgen_utils::IsMatMulTypeWithBatch(impl_graph);
is_bias = ascgen_utils::IsMatMulTypeWithBias(impl_graph);
is_offset_w = ascgen_utils::IsMatMulTypeWithOffsetW(impl_graph);
}
std::stringstream ss;
ss << "#ifdef CV_UB_FUSION" << std::endl;
ss << GenCubeTilingSingleFuncCall(is_batch, true, is_bias, is_offset_w, is_conv2d);
ss << "#else" << std::endl;
ss << GenCubeTilingSingleFuncCall(is_batch, false, is_bias, is_offset_w, is_conv2d);
ss << "#endif" << std::endl;
return ss.str();
}
Status Kernel::GenKernelFuncByTilingKey(const ascir::FusedScheduledResult &fused_schedule_result, std::stringstream &ss,
bool use_list_tensor, const CodegenConfig& config,
const std::string &kernel_task_type) {
std::stringstream ss1;
std::string graph_name = GenValidName(fused_schedule_result.fused_graph_name.GetString());
if (config.is_inductor) {
ss1 << Kernel::KernelFuncDeclare(graph_name, fused_schedule_result, use_list_tensor, config.is_inductor,
IsConv2DFusedScheduled(fused_schedule_result))
<< " {" << std::endl;
ss1 << " KERNEL_TASK_TYPE_DEFAULT(" << kernel_task_type << ");" << std::endl;
} else {
ss1 << Kernel::KernelFuncDeclare(graph_name, fused_schedule_result, use_list_tensor, config.is_inductor,
IsConv2DFusedScheduled(fused_schedule_result))
<< " {" << std::endl;
if (!ascgen_utils::IsCubeFusedScheduled(fused_schedule_result)) {
ss1 << " REGISTER_TILING_DEFAULT(" << "AutofuseTilingData);" << std::endl;
if (IsEmptyTensorSence(fused_schedule_result)) {
ss1 << std::endl << "}" << std::endl;
ss << ss1.str();
return af::SUCCESS;
} else {
ss1 << " GET_TILING_DATA(t, gm_tiling_data);" << std::endl;
}
} else if (ascgen_utils::IsCubeCommonFusedScheduled(fused_schedule_result)){
ss << "#include \"autofuse_cube_tiling_data.h\"" << std::endl;
}
}
if (use_list_tensor) {
ss1 << " ListTensorDesc " << kInputTensorDescName << "((__gm__ void *)inputs);" << std::endl;
ss1 << " ListTensorDesc " << kOutputTensorDescName << "((__gm__ void *)outputs);" << std::endl;
}
if (ascgen_utils::CanUseTilingKey(fused_schedule_result) && !config.is_inductor &&
!ascgen_utils::IsCubeCommonFusedScheduled(fused_schedule_result)) {
GE_ASSERT_SUCCESS(GenKernelFuncWithRegTilingKey(fused_schedule_result, config, ss, ss1, use_list_tensor));
} else {
GE_ASSERT_SUCCESS(GenKernelFuncWithParseTilingData(fused_schedule_result, config, ss, ss1, use_list_tensor));
}
ss1 << std::endl << "}" << std::endl;
ss << ss1.str();
return af::SUCCESS;
}
void Kernel::SetUseListTensor(bool use_list_tensor) {
use_list_tensor_ = use_list_tensor;
}
void Kernel::SetUsingAttCalcQBTSizeConfig(bool using_att_calc_qbt_size) {
this->tpipe.SetUsingAttCalcQBTSizeConfig(using_att_calc_qbt_size);
}
void Kernel::SetEnableParallelCompile(bool enable_parallel_compile) {
enable_parallel_compile_ = enable_parallel_compile;
}
bool Kernel::GetEnableParallelCompile() const {
return enable_parallel_compile_;
}
void Kernel::AppendFuncCall(std::stringstream &ss, std::vector<std::vector<std::string>>::const_iterator begin,
std::vector<std::vector<std::string>>::const_iterator end, bool need_sync_all) {
for (auto it = begin; it != end; ++it) {
if (it != begin && need_sync_all) {
ss << " AscendC::PipeBarrier<PIPE_ALL>();" << std::endl;
}
for (const auto &call_statement : *it) {
ss << call_statement;
}
ss << std::endl;
}
}
void Kernel::AppendFuncCall(std::stringstream &ss,
std::vector<std::vector<TilingFuncCall>> &per_group_func_calls,
std::vector<TilingFuncCall> ¤t, size_t depth, uint32_t &tiling_key, bool is_cube) {
if (depth == per_group_func_calls.size()) {
if (!is_cube) {
ss << (tiling_key == 0U ? " if " : " else if ") << "(TILING_KEY_IS(" << std::to_string(tiling_key) << ")) {" << std::endl;
}
for (const auto &tiling_func_call : current) {
ss << " " << tiling_func_call.func_call_ << std::endl;
if (tiling_func_call.need_sync_all_) {
ss << " SyncAll();" << std::endl;
}
}
if (!is_cube) {
ss << " }";
}
tiling_key++;
return;
}
for (const auto &func_call : per_group_func_calls[depth]) {
current.push_back(func_call);
AppendFuncCall(ss, per_group_func_calls, current, depth + 1, tiling_key, is_cube);
current.pop_back();
}
}
std::vector<Variable> Kernel::PackingFuncArgs(const std::string &tiling_data_type,
const ascir::FusedScheduledResult &fused_schedule_result,
bool use_list_tensor) {
std::vector<Variable> args;
if (use_list_tensor) {
args.emplace_back(Type("ListTensorDesc&"), kInputTensorDescName);
args.emplace_back(Type("ListTensorDesc&"), kOutputTensorDescName);
} else {
for (auto &input : fused_schedule_result.input_nodes) {
args.emplace_back(GM_ADDR(GenValidName(input->GetName())));
}
for (auto &output : fused_schedule_result.output_nodes) {
args.emplace_back(GM_ADDR(GenValidName(output->GetName())));
}
}
args.emplace_back(GM_ADDR("workspace"));
args.emplace_back(Type(tiling_data_type + "&"), "t");
return args;
}
std::vector<std::string> Kernel::GenPackingFunctions(std::stringstream &ss_define,
const std::vector<Variable> &kernel_args,
const std::vector<std::vector<std::string>> &per_group_func_calls,
int64_t max_group_per_compile_unit,
uint32_t &function_id) {
std::vector<std::string> func_names;
auto remaining_groups = static_cast<int64_t>(per_group_func_calls.size());
auto begin = per_group_func_calls.cbegin();
while (remaining_groups > 0) {
const auto num = std::min(remaining_groups, max_group_per_compile_unit);
const auto end = begin + num;
const auto &func_name = "packed_functions_8" + std::to_string(function_id);
ss_define << PackingFuncDeclare(func_name, kernel_args) << ";" << std::endl;
ss_define << "#if TILING_KEY_VAR == " << function_id << std::endl;
ss_define << PackingFuncDeclare(func_name, kernel_args) << "{" << std::endl;
AppendFuncCall(ss_define, begin, end);
ss_define << "}" << std::endl;
ss_define << "#endif" << std::endl;
remaining_groups -= num;
begin += num;
function_id += 1;
func_names.emplace_back(func_name);
}
return func_names;
}
std::string Kernel::PackingFuncDeclare(const std::string &func_name, const std::vector<Variable> &kernel_args) {
std::stringstream ss;
ss << "extern \"C\" __aicore__ void ";
ss << func_name << "(";
std::vector<std::string> args;
args.reserve(kernel_args.size());
for (const auto &arg : kernel_args) {
args.emplace_back(arg.AsArg());
}
ss << af::StringUtils::Join(args.cbegin(), args.cend(), ", ");
ss << ")";
return ss.str();
}
void Kernel::GenPackingFunctionCalls(stringstream &ss, const vector<Variable> &kernel_args,
const vector<std::string> &func_names) {
std::vector<std::string> args;
args.reserve(kernel_args.size());
for (const auto &arg : kernel_args) {
args.emplace_back(arg.Str());
}
bool need_sync = false;
const auto &func_args = af::StringUtils::Join(args.cbegin(), args.cend(), ", ");
for (const auto &func_name : func_names) {
if (need_sync) {
ss << " AscendC::PipeBarrier<PIPE_ALL>();" << std::endl;
}
std::string function_id = func_name.substr(func_name.rfind("_") + 1);
ss << " " << func_name << "(" << func_args << ");" << std::endl;
need_sync = true;
}
}
void Kernel::FakeTilingIds(stringstream &ss, uint32_t function_id_end) {
if (function_id_end != kFuncIdBegin) {
ss << "inline void fake_tiling_ids() {" << std::endl;
ss << " int32_t g_tilingKey = -1;" << std::endl;
ss << " if (TILING_KEY_IS(0)) {}" << std::endl;
for (uint32_t function_id = kFuncIdBegin; function_id < function_id_end; ++function_id) {
ss << " if (TILING_KEY_IS(" << function_id << ")) {}" << std::endl;
}
ss << "}" << std::endl;
}
}
Status Kernel::GenerateMacro(stringstream &ss) {
std::stack<const Loop *> loop_stack;
loop_stack.push(&(this->root_loop));
while (!loop_stack.empty()) {
const Loop *current_loop = loop_stack.top();
loop_stack.pop();
for (auto &body : current_loop->bodys) {
if (body.type == LoopType::LOOP) {
GE_ASSERT_NOTNULL(body.loop);
loop_stack.push(body.loop);
} else if (body.call->unit == af::ComputeUnit::kUnitCube) {
string macro_result;
body.call->GenerateMacro(macro_result);
ss << macro_result << std::endl;
break;
}
}
}
return af::SUCCESS;
}
Status Kernel::GenerateKernelByNode(const ascir::ImplGraph &graph, stringstream &ss,
std::unordered_set<const std::string *> &kernel_file_ptr,
bool is_dynamic) {
GE_CHK_STATUS_RET(GenerateMacro(ss), "Generate Macro failed");
std::string npu_arch;
GE_ASSERT_SUCCESS(ge::PlatformContext::GetInstance().GetCurrentPlatformString(npu_arch));
const bool need_marco = (npu_arch == "3510");
if (need_marco) {
ss << "#if defined(__DAV_C310__) || (defined(__NPU_ARCH__) && (__NPU_ARCH__ == 5102 || __NPU_ARCH__ == 3510))"
<< std::endl;
}
for (const auto &node : graph.GetAllNodes()) {
auto impl = ascgen_utils::GetAscIrCodegenImpl(node->GetType());
GE_ASSERT_NOTNULL(impl, "GetAscIrCodegenImpl of node %s[%s] is null", node->GetTypePtr(), node->GetNamePtr());
for (const auto &header_str : impl->LoadApiHeaderFiles(is_dynamic)) {
const auto &file = AscendCApiRegistry::GetInstance().GetFileContent(header_str);
if (!file.empty()) {
if (kernel_file_ptr.find(&(file)) == kernel_file_ptr.end()) {
kernel_file_ptr.insert(&(file));
ss << file;
}
}
}
}
if (need_marco) {
ss << "#endif" << std::endl;
}
return af::SUCCESS;
}
Status Kernel::GlobalTensorDefine(std::string &result) const {
std::stringstream ss;
for (std::size_t i = 0; i < this->inputs.size(); i++) {
const auto &tensor = this->tpipe.tensors.find(this->input_tensors[i]);
GE_ASSERT_TRUE((tensor != this->tpipe.tensors.end()), "Codegen input tensor id[%ld] not found",
this->input_tensors[i]);
if (tensor->second.is_constant) {
continue;
}
ss << " " << tensor->second.Define() << std::endl;
}
for (std::size_t i = 0; i < this->outputs.size(); i++) {
const auto &tensor = this->tpipe.tensors.find(this->output_tensors[i]);
GE_ASSERT_TRUE((tensor != this->tpipe.tensors.end()), "Codegen output tensor id[%ld] not found",
this->output_tensors[i]);
ss << " " << tensor->second.Define() << std::endl;
}
for (std::size_t i = 0; i < this->constant_tensors.size(); i++) {
auto tensor = this->tpipe.tensors.find(this->constant_tensors[i]);
GE_ASSERT_TRUE((tensor != this->tpipe.tensors.end()), "Codegen concat tensor id[%ld] not found",
this->constant_tensors[i]);
GELOGI("const_value_expr: %s", tensor->second.const_value_expr.Str().get());
string const_value = tensor->second.const_value_expr == 0 ? tensor->second.const_value
: tiler.Size(tensor->second.const_value_expr, true);
ss << " " << tensor->second.DefineConst(const_value.c_str()) << std::endl;
GELOGI("Define ss value: %s", ss.str().c_str());
}
for (std::size_t i = 0; i < this->ub_scalar_tensors.size(); i++) {
auto tensor = this->tpipe.tensors.find(this->ub_scalar_tensors[i]);
GE_ASSERT_TRUE((tensor != this->tpipe.tensors.end()), "Codegen ub_scalar tensor id[%ld] not found",
this->ub_scalar_tensors[i]);
std::string def_ub_scalar;
GE_CHK_STATUS_RET(tensor->second.DefineUbScalar(def_ub_scalar));
ss << " " << def_ub_scalar;
GELOGI("Define ub_scalar var: %s", def_ub_scalar.c_str());
}
ss << std::endl;
result = ss.str();
return af::SUCCESS;
}
Status Kernel::GlobalTensorAssign(std::string &result) const {
std::stringstream ss;
for (std::size_t i = 0; i < this->inputs.size(); i++) {
const auto &tensor = this->tpipe.tensors.find(this->input_tensors[i]);
GE_ASSERT_TRUE((tensor != this->tpipe.tensors.end()), "Codegen input tensor id[%ld] not found",
this->input_tensors[i]);
std::string local_result;
GE_CHK_STATUS_RET(tensor->second.SetGlobalBuffer(this->inputs[i], "", local_result),
"Codegen set global buffer failed");
ss << local_result << std::endl;
}
for (std::size_t i = 0; i < this->outputs.size(); i++) {
const auto &tensor = this->tpipe.tensors.find(this->output_tensors[i]);
GE_ASSERT_TRUE((tensor != this->tpipe.tensors.end()), "Codegen output tensor id[%ld] not found",
this->output_tensors[i]);
std::string local_result;
GE_CHK_STATUS_RET(tensor->second.SetGlobalBuffer(this->outputs[i], "", local_result),
"Codegen set global buffer failed");
ss << local_result << std::endl;
}
ss << std::endl;
result = ss.str();
return af::SUCCESS;
}
Status TPipe::GetCVFusionCubeOutputUBTensorIdAndQueId(const ascir::ImplGraph &graph) {
for (auto node : graph.GetAllNodes()) {
if (IsOps<Workspace>(node)) {
for (auto &peer_input : node->outputs[0].anchor.GetPeerInDataAnchors()) {
auto next_node = std::dynamic_pointer_cast<af::AscNode>(peer_input->GetOwnerNode());
GE_ASSERT_NOTNULL(next_node, "Codegen CV Fusion get next node after workspace node failed");
if (IsOps<Load>(next_node)) {
this->cube_output_tensor_id = next_node->outputs[0].attr.mem.tensor_id;
this->cube_output_que_id = next_node->outputs[0].attr.que.id;
return af::SUCCESS;
}
}
GELOGE(af::FAILED, "Codegen CV Fusion Load node next to Workspace not found");
return af::FAILED;
}
}
GELOGE(af::FAILED, "Codegen CV Fusion get Workspace node failed");
return af::FAILED;
}
static void AddCommaIfNeeded(bool &is_first, std::stringstream &tensor_size_max) {
if (is_first) {
is_first = false;
} else {
tensor_size_max << ", ";
}
}
Status TPipe::ParseTBufReuse(TBuf buf, std::string& reuse_dtype_name, bool& is_buf_reuse,
std::vector<const Tensor *>& reuse_buf_tensors, std::stringstream &tensor_size_max) const {
tensor_size_max << KernelUtils::Max() << "(";
bool is_first = true;
for (auto mid : buf.merge_scopes) {
auto merge_scope = this->merge_scopes.find(mid);
if (merge_scope == this->merge_scopes.end()) {
GELOGE(af::FAILED, "Codegen merge scope not found:%ld", mid);
return af::FAILED;
}
AddCommaIfNeeded(is_first, tensor_size_max);
tensor_size_max << merge_scope->second.size;
}
for (auto tid : buf.not_merge_tensors) {
auto tensor = this->tensors.find(tid);
if (tensor == this->tensors.end()) {
GELOGE(af::FAILED, "Codegen tensor not found:%ld", tid);
return af::FAILED;
}
AddCommaIfNeeded(is_first, tensor_size_max);
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(tensor->second.dtype, dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(tensor->second.dtype));
if (is_buf_reuse) {
if (reuse_dtype_name == "") {
reuse_dtype_name = dtype_name;
} else {
if (reuse_dtype_name != dtype_name) {
is_buf_reuse = false;
}
}
}
tensor_size_max << tensor->second.size << " * sizeof(" << dtype_name << ")";
reuse_buf_tensors.push_back(&tensor->second);
}
for (auto tmp_buf_size : buf.tmp_buf_size_list) {
AddCommaIfNeeded(is_first, tensor_size_max);
if (this->cv_fusion_type == ascir::CubeTemplateType::kUBFuse) {
tensor_size_max << this->tiler.ActualSize(tmp_buf_size);
} else {
tensor_size_max << this->tiler.Size(tmp_buf_size);
}
}
if (reuse_buf_tensors.size() == 0) {
is_buf_reuse = false;
}
tensor_size_max << ")";
return af::SUCCESS;
}
Status TPipe::LocalTensorDefine(std::string &result) const {
stringstream ss;
for (auto &pair : this->tensors) {
auto &t = pair.second;
if (t.alloc_type != af::AllocType::kAllocTypeGlobal) {
ss << " " << t.AsArg() << ";" << std::endl;
}
}
ss << std::endl;
result = ss.str();
return af::SUCCESS;
}
std::string TPipe::TensorSizeDefine() const {
stringstream ss;
for (auto &pair : this->tensors) {
auto &t = pair.second;
if ((t.alloc_type == af::AllocType::kAllocTypeQueue) || (t.alloc_type == af::AllocType::kAllocTypeBuffer)) {
ss << " " << t.size.Define() << std::endl;
ss << " " << t.actual_size.Define() << std::endl;
}
}
return ss.str();
}
Status TPipe::TensorSizeAssign(std::string dtype_name, std::string &result) const {
stringstream ss;
for (auto &pair : this->tensors) {
auto &t = pair.second;
if ((t.alloc_type == af::AllocType::kAllocTypeQueue) || (t.alloc_type == af::AllocType::kAllocTypeBuffer)) {
if (t.is_ub_scalar) {
std::string tensor_dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(t.dtype, tensor_dtype_name), "Codegen get data type:%d failed",
static_cast<int32_t>(t.dtype));
ss << t.size.Str() << " = KernelUtils::BlkAlign<" << tensor_dtype_name << ">(1);" << std::endl;
} else {
ss << t.size.Str() << " = stage_size / sizeof(" << dtype_name << ");" << std::endl;
}
}
}
ss << std::endl;
result = ss.str();
return af::SUCCESS;
}
std::string TPipe::GenDuplicateBufDefine(const std::set<std::pair<std::string, std::string>>& pre_api_extract_dup) const {
std::stringstream ss;
int32_t i = 1;
for (auto [const_val, const_dtype] : pre_api_extract_dup) {
const std::string index_str = std::to_string(i);
ss << " TBuf<TPosition::VECCALC> builtin_tmp_buffer_" << index_str << ";" << std::endl;
std::string local_tensor_name = "local_blk_tensor_of_" + const_dtype + "_" + const_val;
ss << " LocalTensor<" << const_dtype << "> " << local_tensor_name << ";" << std::endl;
i++;
}
return ss.str();
}
std::string TPipe::GenDuplicateBufAssign(const std::set<std::pair<std::string, std::string>>& pre_api_extract_dup) const {
std::stringstream ss;
int32_t i = 1;
for (auto [const_val, const_dtype] : pre_api_extract_dup) {
const std::string index_str = std::to_string(i);
if (using_global_tpipe_) {
ss << "GetTPipePtr()->InitBuffer(builtin_tmp_buffer_" << index_str << ", ONE_BLK_SIZE);" << std::endl;
} else {
ss << "tpipe.InitBuffer(builtin_tmp_buffer_" << index_str << ", ONE_BLK_SIZE);" << std::endl;
}
ss << "LocalTensor<uint8_t> builtin_tmp_buf_" << index_str << " = builtin_tmp_buffer_" << index_str
<< ".Get<uint8_t>();" << std::endl;
std::string local_tensor_name = "local_blk_tensor_of_" + const_dtype + "_" + const_val;
ss << local_tensor_name << " = builtin_tmp_buf_" << index_str << ".template ReinterpretCast<" << const_dtype
<< ">();" << std::endl;
if (const_dtype == "half" || const_dtype == "float" || const_dtype == "double") {
const_val += ".0";
}
ss << "Duplicate(" << local_tensor_name << "[0], (" << const_dtype << ")" << const_val <<
", ONE_BLK_SIZE / sizeof(" << const_dtype << "));"<< std::endl;
i++;
}
return ss.str();
}
Status TPipe::BlkTensorDefine(std::string &result) const {
stringstream ss;
for (auto &id : this->need_gen_blk_tensors) {
auto tensor_ptr = this->GetTensor(id);
GE_CHK_BOOL_RET_STATUS(tensor_ptr != nullptr, af::FAILED, "BlkTensorAllocAndInit need_gen_blk_tensors failed");
std::string scalar_t_buf_name = tensor_ptr->name + "_tbuf";
std::string scalar_local_blk_tensor_name = "local_blk_tensor_of_" + tensor_ptr->name;
ss << " TBuf<TPosition::VECCALC> " << scalar_t_buf_name << ";" << std::endl;
ss << " LocalTensor<" << tensor_ptr->type << "> " << scalar_local_blk_tensor_name << ";" << std::endl;
}
ss << std::endl;
result = ss.str();
return af::SUCCESS;
}
Status TPipe::BlkTensorAssign(std::string &result) const {
stringstream ss;
for (auto &id : this->need_gen_blk_tensors) {
auto tensor_ptr = this->GetTensor(id);
GE_CHK_BOOL_RET_STATUS(tensor_ptr != nullptr, af::FAILED, "BlkTensorAllocAndInit need_gen_blk_tensors failed");
std::string scalar_t_buf_name = tensor_ptr->name + "_tbuf";
std::string scalar_local_blk_tensor_name = "local_blk_tensor_of_" + tensor_ptr->name;
if (using_global_tpipe_) {
ss << "GetTPipePtr()->InitBuffer(" << scalar_t_buf_name << ", 32);" << std::endl;
} else {
ss << "tpipe.InitBuffer(" << scalar_t_buf_name << ", 32);" << std::endl;
}
ss << scalar_local_blk_tensor_name << " = " << scalar_t_buf_name << ".Get<" << tensor_ptr->type << ">();"
<< std::endl;
ss << "Duplicate(" << scalar_local_blk_tensor_name << "[0], static_cast<" << tensor_ptr->type
<< ">(" << tensor_ptr->const_value << "), static_cast<uint64_t>(32/"
<< "sizeof(" << tensor_ptr->type << ")));" << std::endl;
ss << "AscendC::PipeBarrier<PIPE_V>();" << std::endl;
}
ss << std::endl;
result = ss.str();
return af::SUCCESS;
}
Status Kernel::GenerateVecFuncOfCVFusion(std::stringstream &result, bool vector_no_db_flag, bool is_conv2d) {
std::string tiling_data_type = "AutofuseTilingData";
if (vector_no_db_flag) {
if (is_conv2d) {
result << R"(
// conv2d
#include "autofuse_cube_tiling_data.h"
)" << std::endl;
} else {
result << R"(
#include "cmct/block/block_scheduler_policy.h"
#include "cmct/block/block_scheduler_utils.h"
#include "cmct/utils/status_utils.h"
#include "autofuse_cube_tiling_data.h"
)" << std::endl;
}
result << "#ifdef CV_UB_NO_DB" << std::endl;
} else {
result << "#ifdef CV_UB_DB" << std::endl;
}
result << R"(
class AutoFusionVector {
public:
__aicore__ inline AutoFusionVector() {};
)" << std::endl;
result << " struct Arguments {" << std::endl;
for (auto &input : this->inputs) {
result << " " << input.AsArg() << "{nullptr};" << std::endl;
}
for (auto &output : this->outputs) {
result << " " << output.AsArg() << "{nullptr};" << std::endl;
}
result << " };" << std::endl << std::endl;
result << " struct Params {" << std::endl;
for (auto &input : this->inputs) {
result << " " << input.AsArg() << "{nullptr};" << std::endl;
}
for (auto &output : this->outputs) {
result << " " << output.AsArg() << "{nullptr};" << std::endl;
}
result << " };" << std::endl;
for (auto &input : this->inputs) {
result << " " << input.AsArg() << "{nullptr};" << std::endl;
}
for (auto &output : this->outputs) {
result << " " << output.AsArg() << "{nullptr};" << std::endl;
}
result << std::endl;
auto ub_tensor = this->tpipe.GetTensor(this->tpipe.cube_output_tensor_id);
GE_CHK_BOOL_RET_STATUS(ub_tensor != nullptr, af::FAILED, "Codegen CV Fusion MatmulOutput UB tensor id[%ld] "
"not found", this->tpipe.cube_output_tensor_id);
std::string dtype_name;
GE_CHK_STATUS_RET(Tensor::DtypeName(ub_tensor->dtype, dtype_name), "data type:%d failed",
static_cast<int32_t>(ub_tensor->dtype));
std::string tmp;
GE_CHK_STATUS_RET(this->tpipe.LocalTensorDefine(tmp), "Local tbuf define failed");
result << tmp;
if (!is_conv2d) {
result << " TPipe tpipe;" << std::endl << std::endl;
} else {
this->tpipe.SetUsingGlobalTpipe(true);
}
for (auto &[id, que] : this->tpipe.ques) {
if (id == this->tpipe.cube_output_que_id) {
continue;
}
result << " " << que.Define() << std::endl;
result << " " << que.buf.AsArg() << ";" << std::endl;
}
result << std::endl;
for (auto &pair : this->tpipe.bufs) {
auto &buf = pair.second;
result << " " << buf.Define() << std::endl;
result << " " << buf.buf.AsArg() << ";" << std::endl;
if (buf.tmp_buf_reuse) {
result << " " << this->tpipe.tmp_buf.AsArg() << "_" << to_string(buf.id) << ";" << std::endl;
}
}
result << " TBuf<TPosition::VECCALC> buf_cube;" << std::endl;
if (!this->pre_api_extract_dup.empty()) {
result << this->tpipe.GenDuplicateBufDefine(this->pre_api_extract_dup) << std::endl;
}
GE_CHK_STATUS_RET(this->tpipe.BlkTensorDefine(tmp), "Block tensor define failed");
result << tmp;
GE_CHK_STATUS_RET(this->GlobalTensorDefine(tmp), "Global tensor define in cv-ub-fuse case failed");
result << tmp;
result << this->tpipe.TensorSizeDefine() << std::endl;
result << " LocalTensor<" << dtype_name << "> cLocal_;" << std::endl;
if (is_conv2d) {
result
<< "__aicore__ inline void Init(Params const& params, AscendC::LocalTensor<" << dtype_name
<< ">& cLocal, int64_t &stage_size1, int64_t &stage_size2) {";
result << std::endl;
result << "GET_TILING_DATA_WITH_STRUCT(Conv2DTilingData, tmpTilingData, tmpTilingGM);" << std::endl;
result << "const int32_t ub_align_value = 32 / sizeof(" << dtype_name << ");" << std::endl;
result << "const int32_t basen_align = (tmpTilingData.conv2dApiTiling.hoL0 + ub_align_value - 1) / "
"ub_align_value * ub_align_value;"
<< std::endl;
result << "AutofuseTilingData autofuse_tiling_size;" << std::endl;
std::string npu_arch;
int32_t vec_num = 2;
GE_ASSERT_SUCCESS(ge::PlatformContext::GetInstance().GetCurrentPlatformString(npu_arch));
if (npu_arch == "5102") {
vec_num = 1;
}
result << "stage_size1 = KernelUtils::Max(tmpTilingData.conv2dApiTiling.nL0 / " << vec_num << ", 16) * basen_align;" << std::endl;
result << "const int32_t compute_size = autofuse_tiling_size.STAGE_SIZE_NAME > 144 ? "
"autofuse_tiling_size.STAGE_SIZE_NAME : 144;"
<< std::endl;
result << "int32_t stage_size = compute_size * sizeof(" << dtype_name << ");" << std::endl;
result << "stage_size2 = compute_size;" << std::endl;
} else {
result << "__aicore__ inline void Init(Params const& params, AscendC::LocalTensor<" << dtype_name
<< ">& cLocal, int64_t l1M, int64_t l1NAlign, int64_t ubOffset, int64_t &stage_size_type) {";
result << std::endl;
result << "GET_TILING_DATA_WITH_STRUCT(MatMulV3BasicTilingData, tmpTilingData, tmpTilingGM);" << std::endl;
result << "const int32_t ub_align_value = 32 / sizeof(" << dtype_name << ");" << std::endl;
result
<< "const int32_t basen_align = (tmpTilingData.baseN + ub_align_value - 1) / ub_align_value * ub_align_value;"
<< std::endl;
result << "const int32_t basen_basem_align = (tmpTilingData.baseM * basen_align * sizeof(" << dtype_name
<< ")) / 2 + basen_align * sizeof(" << dtype_name << ");" << std::endl;
result << "AutofuseTilingData autofuse_tiling_size;" << std::endl;
result << "int32_t stage_size = autofuse_tiling_size.STAGE_SIZE_NAME > 144 ? " << std::endl;
result << "autofuse_tiling_size.STAGE_SIZE_NAME * sizeof(" << dtype_name << ") : basen_basem_align;" << std::endl;
result << "stage_size_type = static_cast<int64_t>(autofuse_tiling_size.STAGE_SIZE_NAME > 144 ? " << std::endl;
result << "autofuse_tiling_size.STAGE_SIZE_NAME : basen_basem_align / sizeof(" << dtype_name << "));" << std::endl;
}
GE_CHK_STATUS_RET(this->root_loop.ActualSizeDefine(this->tiler, this->tpipe, dtype_name, tmp),
"actual size define failed");
result << tmp;
if (is_conv2d) {
result << ub_tensor->Str() << "_actual_size = stage_size2;" << std::endl << std::endl;
} else {
result << ub_tensor->Str() << "_actual_size = stage_size_type;" << std::endl << std::endl;
}
GE_CHK_STATUS_RET(this->tpipe.TensorSizeAssign(dtype_name, tmp), "Tensor size assign failed");
result << tmp;
for (auto &input : this->inputs) {
result << input << " = params." << input << ";" << std::endl;
}
for (auto &output : this->outputs) {
result << output << " = params." << output << ";" << std::endl;
}
GE_CHK_STATUS_RET(this->GlobalTensorAssign(tmp), "Global tensor assign in cv-ub-fuse case failed");
result << tmp;
if (is_conv2d) {
result << "GetTPipePtr()->InitBuffer(buf_cube, stage_size1 * sizeof(" << dtype_name << "));" << std::endl;
} else {
result << "tpipe.InitBuffer(buf_cube, basen_basem_align);" << std::endl;
}
result << ub_tensor->name << " = buf_cube.Get<" << dtype_name << ">();" << std::endl;
result << "cLocal = " << ub_tensor->name << ";" << std::endl << std::endl;
result << "cLocal_ = " << ub_tensor->name << ";" << std::endl << std::endl;
GE_CHK_STATUS_RET(this->tpipe.LocalTBufAllocLoopTwice(tmp, false), "Local tbuf define failed");
result << tmp;
if (!this->pre_api_extract_dup.empty()) {
result << this->tpipe.GenDuplicateBufAssign(this->pre_api_extract_dup) << std::endl;
}
GE_CHK_STATUS_RET(this->tpipe.BlkTensorAssign(tmp), "Block tensor assign failed");
result << tmp;
GE_CHK_STATUS_RET(this->tpipe.LocalTQueAlloc(tmp), "Codegen alloc local tque failed");
result << tmp;
result << "}" << std::endl << std::endl;
stringstream ss;
GE_ASSERT_SUCCESS(this->GenerateSubGraphFuncDef(&(this->root_loop), ss));
result << ss.str() << std::endl;
if (is_conv2d) {
result << "inline __aicore__ void auto_fusion_vector_stage1(int64_t offset, int64_t offsetH, int64_t offsetW, "
"int64_t offsetCout, int64_t curAivN, int64_t curAlignN, int64_t shapeN, "
"int64_t curAivM, int64_t shapeM, int64_t stageSize, int64_t stageOffset) {";
result << "}" << std::endl;
result << "inline __aicore__ void auto_fusion_vector_stage2(int64_t offset, int64_t offsetH, int64_t offsetW, "
"int64_t offsetCout, int64_t curAivN, int64_t curAlignN, int64_t shapeN, "
"int64_t curAivM, int64_t shapeM, int64_t stageSize, int64_t stageOffset) {";
} else {
result << "inline __aicore__ void auto_fusion_vector_stage1(int64_t offset, int64_t curAivM, int64_t curAivN, "
"int64_t shapeN, int64_t shapeM, int64_t curAlignN, int64_t stageSize) {";
}
result << std::endl;
result << "int64_t batch_num = offset / shapeN / shapeM;" << std::endl;
if (is_conv2d) {
result << "int64_t load_block_len = curAivN;" << std::endl;
result << "int64_t load_src_stride = shapeN - curAivN;" << std::endl;
result << "int64_t load_dst_stride = curAlignN - curAivN;" << std::endl;
} else {
result << "int64_t load_block_len = curAlignN;" << std::endl;
result << "int64_t load_src_stride = shapeN - curAlignN;" << std::endl;
result << "int64_t load_dst_stride = 0;" << std::endl;
result << "if (shapeN < curAlignN) {" << std::endl;
result << "load_block_len = curAivN;" << std::endl;
result << "load_src_stride = 0;" << std::endl;
result << "load_dst_stride = curAlignN - shapeN;" << std::endl;
result << "}" << std::endl;
}
GE_CHK_STATUS_RET(this->root_loop.Generate(this->tiler, this->tpipe, tmp, ComputeStage::kCVFuseStage1),
"Codegen root loop Generate failed");
result << tmp;
result << "}" << std::endl;
if (is_conv2d) {
result << "inline __aicore__ void auto_fusion_vector_stage3(int64_t offset, int64_t offsetH, int64_t offsetW, "
"int64_t offsetCout, int64_t curAivN, int64_t curAlignN, int64_t shapeN, "
"int64_t curAivM, int64_t shapeM, int64_t stageSize, int64_t stageOffset) {";
} else {
result << "inline __aicore__ void auto_fusion_vector_stage2(int64_t offset, int64_t curAivM, int64_t curAivN, "
"int64_t shapeN, int64_t shapeM, int64_t curAlignN, int64_t stageSize) {";
}
result << std::endl;
result << "int64_t batch_num = offset / shapeN / shapeM;" << std::endl;
result << "int64_t load_block_len = curAlignN;" << std::endl;
result << "int64_t load_src_stride = shapeN - curAlignN;" << std::endl;
result << "int64_t load_dst_stride = 0;" << std::endl;
result << "if (shapeN < curAlignN) {" << std::endl;
result << "load_block_len = curAivN;" << std::endl;
result << "load_src_stride = 0;" << std::endl;
result << "load_dst_stride = curAlignN - shapeN;" << std::endl;
result << "}" << std::endl;
GE_CHK_STATUS_RET(this->root_loop.Generate(this->tiler, this->tpipe, tmp, ComputeStage::kCVFuseStage2),
"Codegen root loop Generate failed");
result << tmp;
result << "}" << std::endl;
if (is_conv2d) {
result << "inline __aicore__ void operator()(int64_t offset, int64_t offsetH, int64_t offsetW, int64_t "
"offsetCout, int64_t curAivN, int64_t curAlignN, int64_t shapeN, "
"int64_t curAivM, int64_t shapeM, int64_t stageSize, int64_t stageOffset, uint8_t stage = 0) {"
<< std::endl
<< ub_tensor->name << " = cLocal_[stageOffset].template ReinterpretCast<" << dtype_name << ">();"
<< std::endl
<< "if (stage == 1) {" << std::endl
<< " auto_fusion_vector_stage1(offset, offsetH, offsetW, offsetCout, curAivN, curAlignN, shapeN, curAivM, "
"shapeM, stageSize, stageOffset);"
<< std::endl
<< "} else if (stage == 2) {" << std::endl
<< " auto_fusion_vector_stage2(offset, offsetH, offsetW, offsetCout, curAivN, curAlignN, shapeN, curAivM, "
"shapeM, stageSize, stageOffset);"
<< std::endl
<< "} else if (stage == 3) {" << std::endl
<< " auto_fusion_vector_stage3(offset, offsetH, offsetW, offsetCout, curAivN, curAlignN, shapeN, curAivM, "
"shapeM, stageSize, stageOffset);"
<< std::endl
<< "} else {" << std::endl
<< " auto_fusion_vector_stage1(offset, offsetH, offsetW, offsetCout, curAivN, curAlignN, shapeN, curAivM, "
"shapeM, stageSize, stageOffset);"
<< std::endl
<< " auto_fusion_vector_stage2(offset, offsetH, offsetW, offsetCout, curAivN, curAlignN, shapeN, curAivM, "
"shapeM, stageSize, stageOffset);"
<< std::endl
<< " auto_fusion_vector_stage3(offset, offsetH, offsetW, offsetCout, curAivN, curAlignN, shapeN, curAivM, "
"shapeM, stageSize, stageOffset);"
<< std::endl
<< "}" << std::endl
<< "}" << std::endl;
} else {
result << "inline __aicore__ void operator()(int64_t offset, int64_t curAivM, int64_t curAivN, int64_t shapeN, "
"int64_t shapeM, int64_t curAlignN, int64_t stageSize, int64_t stageOffset, uint8_t stage = 0) {"
<< std::endl
<< ub_tensor->name << " = cLocal_[stageOffset].template ReinterpretCast<" << dtype_name << ">();"
<< std::endl
<< "if (stage == 1) {" << std::endl
<< " auto_fusion_vector_stage1(offset, curAivM, curAivN, shapeN, shapeM, curAlignN, stageSize);"
<< std::endl
<< "} else if (stage == 2) {" << std::endl
<< " auto_fusion_vector_stage2(offset, curAivM, curAivN, shapeN, shapeM, curAlignN, stageSize);"
<< std::endl
<< "} else {" << std::endl
<< " auto_fusion_vector_stage1(offset, curAivM, curAivN, shapeN, shapeM, curAlignN, stageSize);"
<< std::endl
<< " auto_fusion_vector_stage2(offset, curAivM, curAivN, shapeN, shapeM, curAlignN, stageSize);"
<< std::endl
<< "}" << std::endl
<< "}" << std::endl;
}
result << "};" << std::endl;
result << "#endif" << std::endl;
return af::SUCCESS;
}
Status Kernel::InitCVFusionAddr(std::stringstream &result, bool vector_no_db_flag) {
if (vector_no_db_flag) {
result << " AutoFusionVector::Params CV_FUSION_ADDR;\n";
for (auto input : this->inputs) {
result << " CV_FUSION_ADDR." << input.Str() << " = " << input.Str() << ";" << std::endl;
}
size_t output_idx = 0;
for (auto output : this->outputs) {
result << " CV_FUSION_ADDR." << output.Str() << " = " << output.Str() << ";" << std::endl;
result << " GM_ADDR output_" << output_idx++ << " = " << output.Str() << ";" << std::endl;
}
}
return af::SUCCESS;
}
static std::string GetScheduledResultInputOutput(const ascir::FusedScheduledResult &fused_schedule_result,
bool is_kernel_func_call) {
std::stringstream ss;
for (size_t i = 0U; i < fused_schedule_result.input_nodes.size(); i++) {
auto &input = fused_schedule_result.input_nodes[i];
if (IsOps<Data>(input)) {
ss << (is_kernel_func_call ? "(uint8_t*)" : "void* ") << "input" << i << ", ";
} else if (IsOps<ScalarData>(input)) {
std::string dtype_name;
GE_ASSERT_SUCCESS(Tensor::DtypeName(input->outputs[0].attr.dtype, dtype_name), "data type:%d failed",
static_cast<int32_t>(input->outputs[0].attr.dtype));
ss << (is_kernel_func_call ? "" : (dtype_name + " ")) << "input" << i << ", ";
}
}
int32_t index = 0;
for (const auto &node : fused_schedule_result.output_nodes) {
if (IsOps<Output>(node)) {
ss << (is_kernel_func_call ? "(uint8_t*)" : "void* ") << "output" << index++ << ", ";
}
}
return ss.str();
}
std::string Kernel::GenKernelFuncCallForInductor(const ascir::FusedScheduledResult &fused_schedule_result) {
std::string tiling_data_name = "AutofuseTilingData";
std::string graph_name = CamelToLowerSneak(GenValidName(fused_schedule_result.fused_graph_name.GetString()));
std::string extern_c = "extern \"C\"";
std::stringstream ss;
ss << "void init_" << graph_name << "(void) {}" << std::endl;
ss << extern_c << " int64_t AutofuseLaunch(uint32_t blockDim, void* stream, ";
ss << GetScheduledResultInputOutput(fused_schedule_result, false);
ss << "void* workspace, " << tiling_data_name << "* tiling_data)" << std::endl;
ss << "{" << std::endl;
ss << " " << graph_name << "<<<blockDim, nullptr, stream>>>(";
ss << GetScheduledResultInputOutput(fused_schedule_result, true);
ss << "(uint8_t*)workspace, *tiling_data);" << std::endl;
ss << " return 0;" << std::endl;
ss << "}" << std::endl;
return ss.str();
}
static ApiCallRegister<ApiCall> register_api_call("ApiCall");