* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "gtest/gtest.h"
#include "node_utils_ex.h"
#include "graph_utils.h"
#include "ascendc_ir.h"
#include "ascir_ops.h"
#include "ascir_ops_utils.h"
#include "codegen_kernel.h"
#include "graph/ascendc_ir/utils/asc_tensor_utils.h"
#include "common_utils.h"
#include "utils/api_call_factory.h"
#include "utils/api_call_utils.h"
#include "elewise/compare_api_call.h"
#include "elewise/unary_bitwidth_change_api_call.h"
#include "elewise/unary_api_tmp_call.h"
#include "elewise/unary_api_call.h"
#include "elewise/ub2ub_api_call.h"
#include "autofuse_config/auto_fuse_config.h"
#include "codegen_graph_check.h"
#include "platform_context.h"
#include "runtime_stub.h"
using namespace ge;
using namespace af::ops;
using namespace af::ascir_op;
using namespace codegen;
namespace {
std::string ToString(const af::Expression &e) {
return std::string(e.Serialize().get());
}
}
TEST(CodegenKernel, Type_StrWillReturnTypeName) {
codegen::Type t{"int"};
EXPECT_EQ(t.Str(), "int");
}
TEST(CodegenKernel, Variable_StrWillReturnVarName) {
codegen::Type t{"int"};
codegen::Variable v{t, "x"};
EXPECT_EQ(v.Str(), "x");
}
TEST(CodegenKernel, Variable_DefineWillReturnDefineWithInit) {
codegen::Type t{"int"};
codegen::Variable v{t, "x"};
EXPECT_EQ(v.Define("100"), "int x = 100;");
}
TEST(CodegenKernel, Variable_AsArg) {
codegen::Type t{"int"};
codegen::Variable v{t, "x"};
EXPECT_EQ(v.AsArg(), "int x");
}
TEST(CodegenKernel, Axis_StrWillReturnAxisName) {
af::SizeVar s0(af::Symbol("s0"));
af::Axis axis{.name = "z0", .size = s0.expr};
codegen::Axis codegen_axis(axis);
EXPECT_EQ(codegen_axis.Str(), "z0");
}
TEST(CodegenKernel, Tiler_StrWillReturnTilingDataName) {
codegen::Tiler tiler("TilingData", "tiling_data");
EXPECT_EQ(codegen::Tiler(tiler).Str(), "tiling_data");
}
TEST(CodegenKernel, Tiler_SizeWillReturnTilingDataField) {
af::SizeVar s0(af::Symbol("s0"));
codegen::Tiler tiler("TilingData", "tiling_data");
tiler.AddSizeVar(af::SizeVar(s0));
EXPECT_EQ(tiler.Size(s0.expr), "tiling_data->s0");
}
TEST(CodegenKernel, Tiler_SizeWhenHasTwoMoreNums_WillAddRoundBrackets) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
codegen::Tiler tiler("TilingData", "tiling_data");
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
auto mul = s0.expr * s1.expr;
EXPECT_EQ(tiler.Size(mul), "(tiling_data->s0 * tiling_data->s1)");
}
TEST(CodegenKernel, Tiler_Size) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
codegen::Tiler tiler("TilingData", "tiling_data");
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
EXPECT_EQ(tiler.Size(Zero), "0");
EXPECT_EQ(tiler.Size(One), "1");
EXPECT_EQ(tiler.Size(s0.expr), "tiling_data->s0");
EXPECT_EQ(tiler.Size(s0.expr * s1.expr), "(tiling_data->s0 * tiling_data->s1)");
EXPECT_EQ(tiler.Size(One / s0.expr), "Pow(tiling_data->s0, -1)");
EXPECT_EQ(tiler.Size(One / (s0.expr * s1.expr)), "(1 / (tiling_data->s0 * tiling_data->s1))");
EXPECT_EQ(tiler.Size(s0.expr / s1.expr), "(tiling_data->s0 / (tiling_data->s1))");
}
TEST(CodegenKernel, Tiler_TensorVectorizedSize) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.tensor_id = 0;
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.repeats = {z0.size, z1.size, z2.size};
tensor.attr.strides = {z1.size*z2.size, z2.size, One};
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), ge::SUCCESS);
EXPECT_EQ(tiler.TensorVectorizedSize(t),
std::string{"KernelUtils::BlkAlign<half>((t->s1 - 1) * t->s2 + (t->s2 - 1) + 1)"});
}
TEST(CodegenKernel, Tiler_ShapeOneTensorVectorizedSize) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
af::ascir_op::Load load("load");
graph.AddNode(load);
load.x = x.y;
load.attr.sched.axis = {z0.id, z1.id, z2.id};
*load.y.axis = {z0.id, z1.id, z2.id};
*load.y.repeats = {One, One, One};
*load.y.strides = {One, One, One};
auto node = graph.FindNode("load");
af::AscTensor tensor = node->outputs[0];
tensor.attr.dtype = ge::DT_FLOAT;
tensor.attr.mem.tensor_id = 0;
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.vectorized_strides = {af::sym::Align(One, 32 / sizeof(float)), One};
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), ge::SUCCESS);
EXPECT_EQ(tiler.TensorVectorizedSize(t), std::string{"KernelUtils::BlkAlign<float>((1 - 1) * 8 + (1 - 1) + 1)"});
}
TEST(CodegenKernel, Tiler_TensorVectorizedSize_WhenNotVectorized) {
codegen::Tiler tiler;
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
EXPECT_EQ(tiler.TensorVectorizedSize(codegen::Tensor(tensor, dtype_name)), std::string{
"1"});
}
TEST(CodegenKernel, Tiler_BlockOutterAxisDefine) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeBlockOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeBlockOuter, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeBlockOuter, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
auto result_code = tiler.BlockOutterAxisDefine();
EXPECT_EQ(result_code, std::string{
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n"
"const int z0 = block_dim % z0_loop_size; \n"
"const int z1 = block_dim % z1_loop_size; \n"
"const int z2 = block_dim % z2_loop_size; \n"
});
}
TEST(CodegenKernel, Tiler_GetAxisVar) {
af::SizeVar s0(af::Symbol("s0"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeBlockOuter, .size = s0.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddAxis(z0);
EXPECT_EQ(tiler.GetAxis(z0.id).Str(), "z0");
}
TEST(CodegenKernel, Tiler_TensorOffset_WhenGlobalTensor_WillOffsetAll) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
std::vector<af::AxisId> tensor_axis = {z0.id, z1.id, z2.id};
std::vector<af::Expression> stride = {s1.expr * s2.expr, s2.expr, One};
EXPECT_EQ(tiler.Offset({}, tensor_axis, stride), std::string{"0"});
EXPECT_EQ(tiler.Offset({z0.id}, tensor_axis, stride), std::string{"(int64_t)z0 * (int64_t)(t->s1 * t->s2)"});
EXPECT_EQ(tiler.Offset({z0.id, z1.id}, tensor_axis, stride), std::string{"(int64_t)z0 * (int64_t)(t->s1 * t->s2) + (int64_t)z1 * (int64_t)t->s2"});
EXPECT_EQ(tiler.Offset({z0.id, z1.id, z2.id}, tensor_axis, stride), std::string{"(int64_t)z0 * (int64_t)(t->s1 * t->s2) + (int64_t)z1 * (int64_t)t->s2 + (int64_t)z2"});
}
TEST(CodegenKernel, Tiler_TensorOffset_WhenLocalTensor_VectorizedOnCurrentAxis) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.strides = {s1.expr * s2.expr, s2.expr, One};
tensor.attr.vectorized_axis = {z1.id, z2.id};
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(tiler.TensorVectorizedOffset({z0.id}, codegen::Tensor(tensor, dtype_name)), "0");
EXPECT_EQ(tiler.TensorVectorizedOffset({z0.id, z1.id}, codegen::Tensor(tensor, dtype_name)), "(int64_t)z1 * (int64_t)t->s2");
EXPECT_EQ(tiler.TensorVectorizedOffset({z0.id, z1.id, z2.id}, codegen::Tensor(tensor, dtype_name)), "(int64_t)z1 * (int64_t)t->s2 + (int64_t)z2");
}
TEST(CodegenKernel, ubScalarFalseTestWhenVecRepeateIsOneButNotAllOne) {
af::AscGraph graph("test");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
Load load("load");
graph.AddNode(load);
auto node = graph.FindNode("load");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.repeats = {s0, One, One};
tensor.attr.strides = {One, One, One};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.mem.tensor_id = 1;
vector<af::Expression> vectorized_strides{One, One};
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
EXPECT_EQ(t.is_ub_scalar, false);
}
TEST(CodegenKernel, Tensor_ubScalarVrInitTest) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
Load load("load");
graph.AddNode(load);
auto node = graph.FindNode("load");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.repeats = {One, One, One};
tensor.attr.strides = {One, One, One};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.mem.tensor_id = 1;
vector<af::Expression> vectorized_strides{One, One};
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
EXPECT_EQ(t.is_ub_scalar, true);
std::string result1;
EXPECT_EQ(t.DefineUbScalar(result1), 0);
EXPECT_EQ(result1, "float global_1_ub_scalar;\n");
std::string result2;
EXPECT_EQ(t.InitUbScalar(result2), 0);
EXPECT_EQ(result2, "global_1_ub_scalar = global_1.GetValue(0);\n");
}
TEST(CodegenKernel, OutputTensorIsUbScalar_test) {
af::AscGraph graph("test");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
Load load("load");
graph.AddNode(load);
auto node = graph.FindNode("load");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.repeats = {s0, s1, s2};
tensor.attr.strides = {s1*s2, s2, One};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.mem.tensor_id = 1;
vector<af::Expression> vectorized_strides{s2, One};
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
EXPECT_EQ(t.is_ub_scalar, false);
codegen::Kernel kernel("test");
EXPECT_EQ(kernel.tpipe.AddTensor(t), 0);
bool is_ub_scalar = true;
EXPECT_EQ(kernel.OutputTensorIsUbScalar(node, is_ub_scalar), 0);
EXPECT_EQ(is_ub_scalar, false);
}
TEST(CodegenKernel, OutputTensorIsScalarDuplicate_test) {
af::AscGraph graph("test");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
Load load("load");
graph.AddNode(load);
auto node = graph.FindNode("load");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.repeats = {s0, s1, s2};
tensor.attr.strides = {s1*s2, s2, One};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.mem.tensor_id = 1;
vector<af::Expression> vectorized_strides{s2, One};
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
codegen::Kernel kernel("test");
EXPECT_EQ(kernel.tpipe.AddTensor(t), 0);
kernel.tpipe.need_gen_blk_tensors.push_back(t.id);
bool is_ub_scalar = true;
std::string result;
Status ans = kernel.tpipe.BlkTensorAllocAndInit(result);
EXPECT_EQ(result, std::string{
"TBuf<TPosition::VECCALC> global_1_tbuf;\n"
"tpipe.InitBuffer(global_1_tbuf, 32);\n"
"LocalTensor<GlobalTensor<float>> local_blk_tensor_of_global_1 = global_1_tbuf.Get<GlobalTensor<float>>();\n"
"Duplicate(local_blk_tensor_of_global_1[0], static_cast<GlobalTensor<float>>(), static_cast<uint64_t>(32/sizeof(GlobalTensor<float>)));\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
});
}
TEST(CodegenKernel, ReduceTensorForceNonUbScalar) {
af::AscGraph graph("test");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
Sum sum("sum");
graph.AddNode(sum);
auto node = graph.FindNode("sum");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.repeats = {One, One, One};
tensor.attr.strides = {One, One, One};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.mem.tensor_id = 1;
vector<af::Expression> vectorized_strides{One, One};
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
EXPECT_EQ(t.is_ub_scalar, true);
codegen::Kernel kernel("test");
EXPECT_EQ(kernel.tpipe.AddTensor(t), 0);
EXPECT_EQ(kernel.ParseOptimizeInfo(node, tensor), 0);
auto t_s_ptr = kernel.tpipe.GetTensor(1);
auto t_s = *t_s_ptr;
EXPECT_EQ(t_s.is_ub_scalar, false);
}
TEST(CodegenKernel, ApiCallPreProcessTest) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
Load load("load");
graph.AddNode(load);
auto node = graph.FindNode("load");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.repeats = {One, One, One};
tensor.attr.strides = {One, One, One};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.mem.tensor_id = 1;
vector<af::Expression> vectorized_strides{One, One};
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
codegen::ApiCall call("Load");
EXPECT_EQ(call.Init(node), 0);
std::string result1, result2;
std::vector<std::reference_wrapper<const codegen::Tensor>> output_tensors;
t.need_gen_get_value_of_ub_scalar = true;
output_tensors.emplace_back(t);
std::vector<::ascir::AxisId> current_axis = {z0.id};
codegen::TPipe tpipe("tpipe", tiler);
EXPECT_EQ(call.PreProcess(tpipe, current_axis, output_tensors, result1), 0);
EXPECT_EQ(result1, "if (z0 < 1) {\n");
EXPECT_EQ(call.PostProcess(tpipe, current_axis, output_tensors, result2), 0);
EXPECT_EQ(result2, std::string{
"event_t eventID = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_S));\n"
"SetFlag<HardEvent::MTE2_S>(eventID);\n"
"WaitFlag<HardEvent::MTE2_S>(eventID);\n"
"global_1_ub_scalar = global_1.GetValue(0);\n"
"}\n"});
}
TEST(CodegenKernel, ApiCallPreProcessUbScalarTest) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
Load load("load");
graph.AddNode(load);
auto node = graph.FindNode("load");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.repeats = {One, One, One};
tensor.attr.strides = {One, One, One};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.mem.tensor_id = 1;
vector<af::Expression> vectorized_strides{One, One};
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
codegen::ApiCall call("Load");
EXPECT_EQ(call.Init(node), 0);
std::string result1, result2;
std::vector<std::reference_wrapper<const codegen::Tensor>> output_tensors;
t.need_gen_get_value_of_ub_scalar = true;
t.need_duplicate_value_of_ub_scalar = true;
output_tensors.emplace_back(t);
std::vector<::ascir::AxisId> current_axis = {z0.id};
codegen::TPipe tpipe("tpipe", tiler);
EXPECT_EQ(call.PostProcess(tpipe, current_axis, output_tensors, result2), 0);
EXPECT_EQ(result2, std::string{
"event_t eventID = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_S));\n"
"SetFlag<HardEvent::MTE2_S>(eventID);\n"
"WaitFlag<HardEvent::MTE2_S>(eventID);\n"
"global_1_ub_scalar = global_1.GetValue(0);\n"
"AscendC::PipeBarrier<PIPE_ALL>();\n"
"Duplicate(global_1[0], global_1_ub_scalar, 32/sizeof(float));\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"}\n"});
}
TEST(CodegenKernel, ApiCallPreProcessLoadUbScalarTest) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
Load load("load");
graph.AddNode(load);
auto node = graph.FindNode("load");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.repeats = {One, One, One};
tensor.attr.strides = {One, One, One};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.mem.tensor_id = 1;
vector<af::Expression> vectorized_strides{One, One};
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
codegen::ApiCall call("Load");
EXPECT_EQ(call.Init(node), 0);
std::string result1, result2;
std::vector<std::reference_wrapper<const codegen::Tensor>> output_tensors;
t.need_gen_get_value_of_ub_scalar = true;
t.need_duplicate_value_of_ub_scalar = true;
output_tensors.emplace_back(t);
std::vector<::ascir::AxisId> current_axis = {z0.id};
codegen::TPipe tpipe("tpipe", tiler);
EXPECT_EQ(call.PostProcess(tpipe, current_axis, output_tensors, result2), 0);
EXPECT_EQ(result2, std::string{
"event_t eventID = static_cast<event_t>(GetTPipePtr()->FetchEventID(HardEvent::MTE2_S));\n"
"SetFlag<HardEvent::MTE2_S>(eventID);\n"
"WaitFlag<HardEvent::MTE2_S>(eventID);\n"
"global_1_ub_scalar = global_1.GetValue(0);\n"
"AscendC::PipeBarrier<PIPE_ALL>();\n"
"Duplicate(global_1[0], global_1_ub_scalar, 32/sizeof(float));\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"}\n"});
}
TEST(CodegenKernel, kernelUbScalarVarDef) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
Load load("load");
graph.AddNode(load);
auto node = graph.FindNode("load");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.repeats = {One, One, One};
tensor.attr.strides = {One, One, One};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.mem.tensor_id = 1;
vector<af::Expression> vectorized_strides{One, One};
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
t.need_gen_get_value_of_ub_scalar = true;
codegen::Kernel kernel("test");
EXPECT_EQ(kernel.tpipe.AddTensor(t), 0);
kernel.ub_scalar_tensors.emplace_back(1);
std::string result;
EXPECT_EQ(kernel.GlobalTensorInit(result), 0);
EXPECT_EQ(result, "float global_1_ub_scalar;\n");
}
TEST(CodegenKernel, Tiler_TensorOffset_WhenLocalTensor_VectorizedNestCurrentAxis) {
GTEST_SKIP();
}
TEST(CodegenKernel, Tiler_TensorAlloc_WhenTensorFromQue_AndMerge) {
GTEST_SKIP();
}
TEST(CodegenKernel, Tensor_SetGlobalBuffer) {
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.tensor_id = 0;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t(tensor, dtype_name, "tensor");
codegen::GM_ADDR gm("gm");
std::string result;
t.SetGlobalBuffer(gm, "", result);
EXPECT_EQ(result, "global_0.SetGlobalBuffer((__gm__ half*)gm);");
}
TEST(CodegenKernel, TQue_AllocBuf) {
std::string position;
codegen::PositionValue(af::Position::kPositionVecIn, position);
codegen::TQue que(0, af::Position::kPositionVecIn, position);
EXPECT_EQ(que.AllocBuf(), "LocalTensor<uint8_t> q0_buf = q0.AllocTensor<uint8_t>();\n");
}
TEST(CodegenKernel, TQue_EnqueBuf) {
std::string position;
codegen::PositionValue(af::Position::kPositionVecIn, position);
codegen::TQue que(0, af::Position::kPositionVecIn, position);
EXPECT_EQ(que.EnqueBuf(), "q0.EnQue(q0_buf);\n");
}
TEST(CodegenKernel, TQue_DequeBuf) {
std::string position;
codegen::PositionValue(af::Position::kPositionVecIn, position);
codegen::TQue que(0, af::Position::kPositionVecIn, position);
EXPECT_EQ(que.DequeBuf(true), "LocalTensor<uint8_t> q0_buf = q0.DeQue<uint8_t>();\n");
EXPECT_EQ(que.DequeBuf(false), "q0_buf = q0.DeQue<uint8_t>();\n");
}
TEST(CodegenKernel, TQue_FreeBuf) {
std::string position;
codegen::PositionValue(af::Position::kPositionVecIn, position);
codegen::TQue que(0, af::Position::kPositionVecIn, position);
EXPECT_EQ(que.FreeBuf(), "q0.FreeTensor(q0_buf);\n");
}
TEST(CodegenKernel, TPipe_TensorAlloc_WhenConstantTensor_WillNotAlloc) {
af::AscGraph graph("test");
af::ascir_op::Scalar x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.position = af::Position::kPositionInvalid;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeInvalid;
tensor.attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor, "test_t");
std::string result;
auto tensor_ptr = tpipe.GetTensor(tensor.attr.mem.tensor_id);
tpipe.TensorAlloc(*tensor_ptr, result);
EXPECT_EQ(result,
std::string{});
}
TEST(CodegenKernel, TPipe_TensorAlloc_WhenTensorFromQue_AndNotMerge) {
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.position = af::Position::kPositionVecIn;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
tensor.attr.que.id = 1;
tensor.attr.mem.reuse_id = 1;
tensor.attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor, "test_t");
std::string result;
auto tensor_ptr = tpipe.GetTensor(tensor.attr.mem.tensor_id);
tpipe.TensorAlloc(*tensor_ptr, result);
EXPECT_EQ(result, std::string{
"LocalTensor<half> local_0;\n"
"local_0 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
});
}
TEST(CodegenKernel, TPipe_TensorAlloc_WhenTensorFromBuf_AndNotMerge) {
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.position = af::Position::kPositionVecIn;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.buf.id = 1;
tensor.attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor, "test_t");
std::string result;
auto tensor_ptr = tpipe.GetTensor(tensor.attr.mem.tensor_id);
tpipe.TensorAlloc(*tensor_ptr, result);
EXPECT_EQ(result, std::string{
"LocalTensor<half> local_0;\n"
"local_0 = b1_buf.template ReinterpretCast<half>();\n"
});
}
TEST(CodegenKernel, AddTensor_InvalidName) {
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
{
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.position = af::Position::kPositionVecIn;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
tensor.attr.que.id = 1;
tensor.attr.mem.reuse_id = 1;
tensor.attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor, "invalid/test");
std::string result;
auto tensor_ptr = tpipe.GetTensor(tensor.attr.mem.tensor_id);
tpipe.TensorAlloc(*tensor_ptr, result);
EXPECT_EQ(result, std::string{
"LocalTensor<half> local_0;\n"
"local_0 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
});
}
{
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.position = af::Position::kPositionVecIn;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
tensor.attr.que.id = 1;
tensor.attr.mem.reuse_id = 1;
tensor.attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor, "0test");
std::string result;
auto tensor_ptr = tpipe.GetTensor(tensor.attr.mem.tensor_id);
tpipe.TensorAlloc(*tensor_ptr, result);
EXPECT_EQ(result, std::string{
"LocalTensor<half> local_0;\n"
"local_0 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
});
}
}
TEST(CodegenKernel, TPipe_InitTQueBuffers) {
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.mem.position = af::Position::kPositionVecIn;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
tensor.attr.que.id = 1;
tensor.attr.mem.reuse_id = 1;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor);
auto que = tpipe.ques.find(tensor.attr.que.id);
ASSERT_NE(que, tpipe.ques.end());
std::string result;
tpipe.InitTQueBuffers(que->second, result);
EXPECT_EQ(result, std::string {
"// tpipe.InitBuffer(q1, q1_buf_num, KernelUtils::BlkAlign<uint8_t>(q1_size));\n"
"tpipe.InitBuffer(q1, q1_buf_num, t->q1_size);"});
}
TEST(CodegenKernel, TPipe_InitTBufBuffer) {
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.mem.position = af::Position::kPositionVecIn;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.buf.id = 1;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor);
auto buf = tpipe.bufs.find(tensor.attr.buf.id);
ASSERT_NE(buf, tpipe.bufs.end());
std::string result;
tpipe.InitTBufBuffer(buf->second, result);
EXPECT_EQ(result, std::string{
"// tpipe.InitBuffer(b1, KernelUtils::BlkAlign<uint8_t>(b1_size));\n"
"tpipe.InitBuffer(b1, t->b1_size);"});
}
TEST(CodegenKernel, TPipe_TensorSizeCalc_AllocFromBuf) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.repeats = {z0.size, z1.size, z2.size};
tensor.attr.strides = {z1.size*z2.size, z2.size, One};
tensor.attr.mem.tensor_id = 1;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.mem.position = af::Position::kPositionVecIn;
tensor.attr.opt.merge_scope = af::kIdNone;
tensor.attr.buf.id = 2;
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor);
EXPECT_EQ(tpipe.TensorSizeCalc(), std::string{
"const uint32_t local_1_size = KernelUtils::BlkAlign<float>((t->s1 - 1) * t->s2 + (t->s2 - 1) + 1);\n"
});
}
TEST(CodegenKernel, TPipe_TensorSizeCalc_AllocFromQue) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.repeats = {z0.size, z1.size, z2.size};
tensor.attr.strides = {z1.size*z2.size, z2.size, One};
tensor.attr.mem.tensor_id = 1;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
tensor.attr.mem.position = af::Position::kPositionVecIn;
tensor.attr.opt.merge_scope = af::kIdNone;
tensor.attr.que.id = 2;
tensor.attr.que.buf_num = 4;
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor);
EXPECT_EQ(tpipe.TensorSizeCalc(), std::string{
"const uint32_t local_1_size = KernelUtils::BlkAlign<float>((t->s1 - 1) * t->s2 + (t->s2 - 1) + 1);\n"
"const uint32_t local_1_que_buf_num = 4;\n"
});
}
TEST(CodegenKernel, TPipe_MergeScopeSizeCalc) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.repeats = {z0.size, z1.size, z2.size};
tensor.attr.strides = {z1.size*z2.size, z2.size, One};
tensor.attr.mem.position = af::Position::kPositionVecIn;
tensor.attr.opt.merge_scope = 1;
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
tensor.attr.que.id = 2;
tensor.attr.que.depth = 3;
tensor.attr.que.buf_num = 4;
tpipe.AddTensor(tensor);
tensor.attr.dtype = ge::DT_FLOAT;
tensor.attr.mem.tensor_id = 1;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.buf.id = 2;
tpipe.AddTensor(tensor);
std::string result;
tpipe.MergeScopeSizeCalc(result);
EXPECT_EQ(result, std::string{
"const uint32_t m1_size = KernelUtils::Sum(local_0_size * sizeof(half), local_1_size * sizeof(float));\n"
"const uint32_t m1_que_buf_num = KernelUtils::Max(local_0_que_buf_num);\n"
});
}
TEST(CodegenKernel, TPipe_LocalTBufAlloc) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.repeats = {z0.size, z1.size, z2.size};
tensor.attr.strides = {z1.size*z2.size, z2.size, One};
tensor.attr.mem.position = af::Position::kPositionVecIn;
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.buf.id = 1;
tensor.attr.opt.merge_scope = 1;
tpipe.AddTensor(tensor);
tensor.attr.dtype = ge::DT_FLOAT;
tensor.attr.mem.tensor_id = 1;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.buf.id = 1;
tensor.attr.opt.merge_scope = af::kIdNone;
tpipe.AddTensor(tensor);
std::string result;
tpipe.SetUsingAttCalcQBTSizeConfig(false);
tpipe.LocalTBufAllocLoopTwice(result);
EXPECT_EQ(result, std::string{
"const uint32_t b1_size = KernelUtils::Max(m1_size, local_1_size * sizeof(float));\n"
"TBuf<TPosition::VECIN> b1;\n"
"tpipe.InitBuffer(b1, KernelUtils::BlkAlign<uint8_t>(b1_size));\n"
"LocalTensor<float> local_1 = b1.Get<float>();\n\n"
});
}
TEST(CodegenKernel, TPipe_LocalTBufAlloc_2) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.repeats = {z0.size, z1.size, z2.size};
tensor.attr.strides = {z1.size*z2.size, z2.size, One};
tensor.attr.mem.position = af::Position::kPositionVecIn;
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.buf.id = 1;
tensor.attr.opt.merge_scope = 1;
tpipe.AddTensor(tensor);
tensor.attr.dtype = ge::DT_FLOAT;
tensor.attr.mem.tensor_id = 1;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.buf.id = 1;
tensor.attr.opt.merge_scope = 2;
tpipe.AddTensor(tensor);
std::string result;
tpipe.SetUsingAttCalcQBTSizeConfig(false);
EXPECT_EQ(tpipe.LocalTBufAllocLoopTwice(result), ge::SUCCESS);
}
TEST(CodegenKernel, TPipe_LocalTBufAlloc_MergeScopes_ERROR) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.repeats = {z0.size, z1.size, z2.size};
tensor.attr.strides = {z1.size*z2.size, z2.size, One};
tensor.attr.mem.position = af::Position::kPositionVecIn;
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.buf.id = 1;
tensor.attr.opt.merge_scope = 1;
tpipe.AddTensor(tensor);
for (auto &[id, buf] : tpipe.bufs) {
if (id == 1) {
buf.merge_scopes.insert(100);
}
}
std::string result;
tpipe.SetUsingAttCalcQBTSizeConfig(false);
EXPECT_EQ(tpipe.LocalTBufAllocLoopTwice(result), ge::FAILED);
}
TEST(CodegenKernel, TPipe_LocalTBufAlloc_NotMergeTensors_ERROR) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.repeats = {z0.size, z1.size, z2.size};
tensor.attr.strides = {z1.size*z2.size, z2.size, One};
tensor.attr.mem.position = af::Position::kPositionVecIn;
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.buf.id = 1;
tensor.attr.opt.merge_scope = af::kIdNone;
tpipe.AddTensor(tensor);
for (auto &[id, buf] : tpipe.bufs) {
if (id == 1) {
buf.not_merge_tensors.insert(100);
}
}
std::string result;
tpipe.SetUsingAttCalcQBTSizeConfig(false);
EXPECT_EQ(tpipe.LocalTBufAllocLoopTwice(result), ge::FAILED);
}
TEST(CodegenKernel, TPipe_LocalTQueAlloc) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(s2);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.repeats = {z0.size, z1.size, z2.size};
tensor.attr.strides = {z1.size*z2.size, z2.size, One};
tensor.attr.mem.position = af::Position::kPositionVecIn;
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
codegen::TPipe tpipe("tpipe", tiler);
tensor.attr.dtype = ge::DT_FLOAT16;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
tensor.attr.que.id = 1;
tensor.attr.mem.reuse_id = 1;
tensor.attr.opt.merge_scope = 1;
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor);
tensor.attr.dtype = ge::DT_FLOAT;
tensor.attr.mem.tensor_id = 1;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
tensor.attr.que.id = 1;
tensor.attr.mem.reuse_id = 2;
tensor.attr.opt.merge_scope = af::kIdNone;
tensor.attr.que.buf_num = 2;
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor);
std::string result;
tpipe.LocalTQueAlloc(result);
EXPECT_EQ(result, std::string{
"// const uint32_t q1_size = KernelUtils::Max(m1_size, local_1_size * sizeof(float));\n"
"const uint32_t q1_buf_num = KernelUtils::Max(m1_que_buf_num, 2);\n"
"TQue<TPosition::VECIN, 1> q1;\n"
"// tpipe.InitBuffer(q1, q1_buf_num, KernelUtils::BlkAlign<uint8_t>(q1_size));\n"
"tpipe.InitBuffer(q1, q1_buf_num, t->q1_size);\n"
});
}
TEST(CodegenKernel, ApiCall_Generate) {
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
codegen::ApiCall call("call");
call.Init(node);
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
std::string result;
EXPECT_EQ(call.Generate(tpipe, {}, {}, {}, result), SUCCESS);
}
TEST(CodegenKernel, Stage_AddCall_WillCollectInputOutputQues) {
GTEST_SKIP();
}
TEST(CodegenKernel, Stage_AddCall_WillAddCall) {
GTEST_SKIP();
}
class MockApiCall : public virtual codegen::ApiCall {
public:
MockApiCall(const std::string &api_name) : codegen::ApiCall(api_name) {}
MockApiCall(const af::AscNodePtr &node, const std::string &api_name) :
codegen::ApiCall(api_name) {Init(node);}
Status Generate(const codegen::TPipe &tpipe, const std::vector<af::AxisId> ¤t_axis, std::string &result) const override{
result = this->type + "();\n";
return 0;
}
virtual Status GenerateFuncDefinition(const TPipe &tpipe, const Tiler &tiler, std::stringstream &ss) const {
ss << "func_test_Definition:" << api_name_ << std::endl;
return ge::SUCCESS;
};
};
class CodegenKernel_CallSync : public ::testing::Test {
protected:
af::AscGraph graph;
af::AscNodePtr x;
int tensor_id = 0;
CodegenKernel_CallSync()
: graph("test_graph"), x(nullptr) {
Data x_op("x", graph);
x = graph.FindNode("x");
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.tensor_id = this->tensor_id++;
}
af::AscNodePtr AddNode(const char* name, const std::vector<af::AscNodePtr>& inputs, bool has_output=true, const af::AscNodePtr reuse_or_share = nullptr) {
auto op_desc = std::make_shared<af::OpDesc>(name, name);
for (int i = 0; i < inputs.size(); i++) {
op_desc->AddInputDesc(af::GeTensorDesc());
}
if (has_output) {
op_desc->AddOutputDesc(af::GeTensorDesc());
}
auto op = af::OpDescUtils::CreateOperatorFromOpDesc(op_desc);
graph.AddNode(op);
auto node = af::NodeUtilsEx::GetNodeFromOperator(op);
int32_t in_index = 0;
for (auto input : inputs) {
af::GraphUtils::AddEdge(input->GetOutDataAnchor(0), node->GetInDataAnchor(in_index));
in_index++;
}
auto n = graph.FindNode(name);
if (has_output) {
n->outputs[0].attr.dtype = ge::DT_FLOAT16;
n->outputs[0].attr.mem.tensor_id = this->tensor_id++;
n->outputs[0].attr.opt.merge_scope = af::kIdNone;
n->outputs[0].attr.mem.reuse_id = n->outputs[0].attr.mem.tensor_id;
if (reuse_or_share != nullptr) {
auto reuse_output = reuse_or_share->outputs()[0];
n->outputs[0].attr.mem.alloc_type = reuse_output->attr.mem.alloc_type;
n->outputs[0].attr.buf.id = reuse_output->attr.buf.id;
n->outputs[0].attr.que.id = reuse_output->attr.que.id;
}
}
return n;
}
af::AscNodePtr AddNode(const char* name, bool has_output=true, const af::AscNodePtr reuse=nullptr) {
auto op_desc = std::make_shared<af::OpDesc>(name, name);
if (has_output) {
op_desc->AddOutputDesc(af::GeTensorDesc());
}
auto op = af::OpDescUtils::CreateOperatorFromOpDesc(op_desc);
graph.AddNode(op);
auto n = graph.FindNode(name);
if (has_output) {
n->outputs[0].attr.dtype = ge::DT_FLOAT16;
n->outputs[0].attr.mem.tensor_id = this->tensor_id++;
n->outputs[0].attr.opt.merge_scope = af::kIdNone;
n->outputs[0].attr.mem.reuse_id = n->outputs[0].attr.mem.tensor_id;
if (reuse != nullptr) {
auto reuse_output = reuse->outputs()[0];
n->outputs[0].attr.mem.alloc_type = reuse_output->attr.mem.alloc_type;
n->outputs[0].attr.buf.id = reuse_output->attr.buf.id;
n->outputs[0].attr.que.id = reuse_output->attr.que.id;
}
}
return n;
}
af::AscNodePtr Load(const char* name, const af::AscNodePtr& input, const af::AscNodePtr reuse=nullptr) {
auto n = AddNode(name, {input}, true, reuse);
n->attr.api.unit = af::ComputeUnit::kUnitMTE2;
n->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
if (reuse == nullptr) {
n->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
n->outputs[0].attr.que.id = n->outputs[0].attr.mem.tensor_id;
n->outputs[0].attr.mem.reuse_id = n->outputs[0].attr.mem.tensor_id;
}
return n;
}
af::AscNodePtr LoadForShare(const char* name, const af::AscNodePtr& input, const af::AscNodePtr share_pre) {
auto n = AddNode(name, {input}, true, share_pre);
n->attr.api.unit = af::ComputeUnit::kUnitMTE2;
n->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
auto share_output = share_pre->outputs()[0];
n->outputs[0].attr.mem.reuse_id = share_output->attr.mem.reuse_id;
return n;
}
af::AscNodePtr Vec(const char* name, bool has_output=true, const af::AscNodePtr reuse=nullptr) {
auto n = AddNode(name, has_output, reuse);
n->attr.api.unit = af::ComputeUnit::kUnitVector;
if (has_output) {
n->outputs[0].attr.mem.position = af::Position::kPositionVecCalc;
if (reuse == nullptr) {
n->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
n->outputs[0].attr.buf.id = n->outputs[0].attr.mem.tensor_id;
}
}
return n;
}
af::AscNodePtr Vec(const char* name, const std::vector<af::AscNodePtr>& inputs, bool has_output=true, const af::AscNodePtr reuse = nullptr) {
auto n = AddNode(name, inputs, has_output, reuse);
n->attr.api.unit = af::ComputeUnit::kUnitVector;
if (has_output) {
n->outputs[0].attr.mem.position = af::Position::kPositionVecCalc;
if (reuse == nullptr) {
n->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
n->outputs[0].attr.buf.id = n->outputs[0].attr.mem.tensor_id;
}
}
return n;
}
af::AscNodePtr VecOut(const char* name, const af::AscNodePtr input=nullptr, const af::AscNodePtr reuse=nullptr) {
auto n = (input == nullptr) ? AddNode(name, true, reuse) : AddNode(name, {input}, true, reuse);
n->attr.api.unit = af::ComputeUnit::kUnitVector;
n->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
if (reuse == nullptr) {
n->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
n->outputs[0].attr.que.id = n->outputs[0].attr.mem.tensor_id;
n->outputs[0].attr.mem.reuse_id = n->outputs[0].attr.mem.tensor_id;
}
return n;
}
af::AscNodePtr Store(const char* name, const af::AscNodePtr input, bool has_output=false) {
auto n = AddNode(name, {input}, has_output);
n->attr.api.unit = af::ComputeUnit::kUnitMTE2;
if (has_output) {
n->outputs[0].attr.mem.position = af::Position::kPositionGM;
}
return n;
}
void InitApiCallInputs(std::map<int, MockApiCall> &calls, std::map<af::AscNode *, int64_t> &node_to_order,
const AscNodePtr &node, ApiCall &api_call) {
int32_t index = 0;
for (auto i : node->inputs()) {
auto i_index = af::ascir::AscTensorUtils::Index(*i);
auto in_node = dynamic_cast<af::AscNode *>(af::ascir::AscTensorUtils::GetOwner(*i));
if (in_node->GetType() == "Data") {
continue;
}
auto in_call = calls.find(static_cast<int32_t>(node_to_order[in_node]));
if (in_call != calls.end()) {
api_call.inputs.emplace_back(&in_call->second.outputs[i_index]);
in_call->second.outputs[i_index].reads.emplace_back(&api_call);
if (node->GetName() == "Concat") {
in_call->second.outputs[i_index].share_order = index;
}
}
++index;
}
}
std::string Generate() {
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
codegen::Loop loop(af::kIdNone);
map<int, MockApiCall> calls;
map<int64_t, codegen::ApiTensor*> buf_last_use;
map<int64_t, codegen::ApiTensor*> que_last_use;
map<int64_t, map<int64_t, codegen::ApiTensor*>> que_last_share;
std::map<af::AscNode*, int64_t> node_to_order_;
int64_t top_id = 0UL;
for (auto node : graph.GetAllNodes()) {
node_to_order_[node.get()] = top_id++;
tpipe.CollectQues(graph);
for (auto o : node->outputs()) {
tpipe.AddTensor(*o);
}
if (node->GetType() == "Data") {
continue;
}
auto [it, _] = calls.insert({node_to_order_[node.get()], MockApiCall(node, "")});
InitApiCallInputs(calls, node_to_order_, node, it->second);
for (auto o: node->outputs()) {
auto o_index = af::ascir::AscTensorUtils::Index(*o);
if (o->attr.mem.alloc_type == af::AllocType::kAllocTypeBuffer) {
auto reused_tensor = buf_last_use.find(o->attr.buf.id);
if (reused_tensor != buf_last_use.end()) {
reused_tensor->second->reuse_next = &it->second.outputs[o_index];
it->second.outputs[o_index].reuse_from = reused_tensor->second;
}
buf_last_use.insert({o->attr.buf.id, &it->second.outputs[o_index]});
} else if (o->attr.mem.alloc_type == af::AllocType::kAllocTypeQueue) {
map<int64_t, codegen::ApiTensor*>& last_share = que_last_share[o->attr.que.id];
auto share_tensor = last_share.find(o->attr.mem.reuse_id);
if (share_tensor != last_share.end()) {
auto t_ptr = tpipe.GetTensor(o->attr.mem.tensor_id);
auto t_share_prev_ptr = tpipe.GetTensor(share_tensor->second->id);
auto &t = *t_ptr;
auto &t_share_prev = *t_share_prev_ptr;
t.share_pre_size = t_share_prev.size.name;
share_tensor->second->share_next = &it->second.outputs[o_index];
it->second.outputs[o_index].share_prev = share_tensor->second;
}
last_share[o->attr.mem.reuse_id] = &it->second.outputs[o_index];
auto reused_tensor = que_last_use.find(o->attr.que.id);
if (reused_tensor != que_last_use.end()) {
reused_tensor->second->reuse_next = &it->second.outputs[o_index];
it->second.outputs[o_index].reuse_from = reused_tensor->second;
}
que_last_use.insert({o->attr.que.id, &it->second.outputs[o_index]});
}
}
for (auto& o: it->second.outputs) {
o.write = &it->second;
}
loop.AddCall(&it->second);
}
std::string result;
loop.Generate(tiler, tpipe, result);
return result;
}
};
TEST_F(CodegenKernel_CallSync, Load_Store_ShouldSyncMte2ToMte3) {
auto load = Load("Load", x);
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
auto store = Store("Store", load);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"Load();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"auto local_1_e = tpipe.AllocEventID<HardEvent::MTE2_MTE3>();\n"
"TQueSync<PIPE_MTE2, PIPE_MTE3> local_1_s;\n"
"local_1_s.SetFlag(local_1_e);\n"
"local_1_s.WaitFlag(local_1_e);\n"
"tpipe.ReleaseEventID<HardEvent::MTE2_MTE3>(local_1_e);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"Store();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Vec_Store_ShouldAlloc) {
auto vec1 = Vec("vec1");
auto vec2 = Vec("vec2", {vec1});
auto store = Store("store", vec2);
vec2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
vec2->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
vec2->outputs[0].attr.que.id = vec2->outputs[0].attr.mem.tensor_id;
vec2->outputs[0].attr.mem.reuse_id = vec2->outputs[0].attr.mem.tensor_id;
EXPECT_EQ(Generate(), std::string{
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = b1_buf.template ReinterpretCast<half>();\n"
"vec1();\n\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"uint32_t q2_reuse2_offset = 0;\n"
"LocalTensor<uint8_t> q2_buf = q2.AllocTensor<uint8_t>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q2_buf[q2_reuse2_offset].template ReinterpretCast<half>();\n"
"vec2();\n"
"q2.EnQue(q2_buf);\n\n"
"q2_buf = q2.DeQue<uint8_t>();\n"
"store();\n"
"q2.FreeTensor(q2_buf);\n\n"
});
}
TEST_F(CodegenKernel_CallSync, VecOut_Unuse_ShouldFree) {
auto vec1 = Vec("vec1");
auto vec2 = VecOut("vec2", {vec1});
auto vec3 = VecOut("vec3", {vec1}, vec2);
EXPECT_EQ(Generate(), std::string{
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = b1_buf.template ReinterpretCast<half>();\n"
"vec1();\n\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"uint32_t q2_reuse2_offset = 0;\n"
"LocalTensor<uint8_t> q2_buf = q2.AllocTensor<uint8_t>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q2_buf[q2_reuse2_offset].template ReinterpretCast<half>();\n"
"vec2();\n"
"q2.EnQue(q2_buf);\n"
"q2.FreeTensor(q2_buf);\n\n"
"uint32_t q2_reuse3_offset = 0;\n"
"q2_buf = q2.AllocTensor<uint8_t>();\n"
"const uint32_t local_3_actual_size = 1;\n"
"LocalTensor<half> local_3;\n"
"local_3 = q2_buf[q2_reuse3_offset].template ReinterpretCast<half>();\n"
"vec3();\n"
"q2.EnQue(q2_buf);\n"
"q2.FreeTensor(q2_buf);\n\n"
});
}
TEST_F(CodegenKernel_CallSync, Load_Vec__ShouldAllocFromQue_Enq_Deq_Free) {
auto load = Load("load1", x);
auto vec = Vec("vec1", {load});
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = b2_buf.template ReinterpretCast<half>();\n"
"vec1();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Vec_Vec__Should_PipeBarrier) {
auto vec1 = Vec("vec1");
auto vec2 = Vec("vec2", {vec1});
EXPECT_EQ(Generate(), std::string{
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = b1_buf.template ReinterpretCast<half>();\n"
"vec1();\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = b2_buf.template ReinterpretCast<half>();\n"
"vec2();\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Vec_Store__Should_AllocFromQue_Enq_Deq_Free) {
auto vec1 = VecOut("vec1");
auto store = Store("store", vec1);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"vec1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"store();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Load_MulitVec__ShouldDequeFirstVec_FreeAfterLastVec) {
auto load = Load("load1", x);
auto vec1 = Vec("vec1", {load}, false);
auto vec2 = Vec("vec2", {load}, false);
auto vec3 = Vec("vec3", {load}, false);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"vec1();\n"
"\n"
"vec2();\n"
"\n"
"vec3();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Load_Vec1_ReuseByVec1_WillPipeBeforeVec1_AnfFreeAfterVec2) {
auto load = Load("load1", x);
auto vec1 = Vec("vec1", {load}, false);
auto vec2 = Vec("vec2", true, load);
auto vec3 = Vec("vec3", {vec2}, false);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"vec1();\n"
"\n"
"uint32_t q1_reuse2_offset = 0;\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q1_buf[q1_reuse2_offset].template ReinterpretCast<half>();\n"
"vec2();\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"vec3();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Load1Vec1__Load2ReuseLoad1_ShouldFreeAlloc) {
auto load1 = Load("load1", x);
auto vec1 = Vec("vec1", {load1}, false);
auto load2 = Load("load2", x, load1);
auto vec2 = Vec("vec2", {load2}, false);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"vec1();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
"uint32_t q1_reuse2_offset = 0;\n"
"q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q1_buf[q1_reuse2_offset].template ReinterpretCast<half>();\n"
"load2();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"vec2();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Vec_MulitVec__Should_PipeBarrierFirstVec) {
auto vec1 = Vec("vec1");
auto vec2 = Vec("vec2", {vec1});
auto vec3 = Vec("vec3", {vec1});
EXPECT_EQ(Generate(), std::string{
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = b1_buf.template ReinterpretCast<half>();\n"
"vec1();\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = b2_buf.template ReinterpretCast<half>();\n"
"vec2();\n"
"\n"
"const uint32_t local_3_actual_size = 1;\n"
"LocalTensor<half> local_3;\n"
"local_3 = b3_buf.template ReinterpretCast<half>();\n"
"vec3();\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Vec_Store_Vec__Should_AllocFromQue_Enq_Deq_PipeBarrier_Free) {
auto vec1 = VecOut("vec1");
auto store = Store("store", vec1);
auto vec2 = Vec("vec2", {vec1});
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"vec1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"store();\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = b2_buf.template ReinterpretCast<half>();\n"
"vec2();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Vec_Vec_Store__Should_AllocFromQue_Enq_PipeBarrier_Deq_Free) {
auto vec1 = VecOut("vec1");
auto vec2 = Vec("vec2", {vec1});
auto store = Store("store", vec1);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"vec1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = b2_buf.template ReinterpretCast<half>();\n"
"vec2();\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"store();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Vec1alloc_Vec2_Vec3ReuseVec1) {
auto vec1 = Vec("vec1");
auto vec2 = Vec("vec2", {vec1}, false);
auto vec3 = Vec("vec3", true, vec1);
EXPECT_EQ(Generate(), std::string{
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = b1_buf.template ReinterpretCast<half>();\n"
"vec1();\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"vec2();\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = b1_buf.template ReinterpretCast<half>();\nvec3();\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, Vec1_Vec2_Vec3_Vec4ReuseVec1) {
auto vec1 = Vec("vec1");
auto vec2 = Vec("vec2", {vec1}, false);
auto vec3 = Vec("vec3", true, vec1);
auto vec4 = Vec("vec4", true, vec1);
EXPECT_EQ(Generate(), std::string{
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = b1_buf.template ReinterpretCast<half>();\n"
"vec1();\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"vec2();\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = b1_buf.template ReinterpretCast<half>();\n"
"vec3();\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"const uint32_t local_3_actual_size = 1;\n"
"LocalTensor<half> local_3;\n"
"local_3 = b1_buf.template ReinterpretCast<half>();\n"
"vec4();\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, LoadEnq_DeqVec1_Vec23_PipeVec4ReuseLoad) {
auto load = Load("load1", x);
auto vec1 = Vec("vec1", {load}, false);
auto vec2 = Vec("vec2", {load}, false);
auto vec3 = Vec("vec3", {load}, false);
auto vec4 = Vec("vec4", true, load);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"vec1();\n"
"\n"
"vec2();\n"
"\n"
"vec3();\n"
"\n"
"uint32_t q1_reuse2_offset = 0;\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q1_buf[q1_reuse2_offset].template ReinterpretCast<half>();\n"
"vec4();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, AllocVec1Enq_DeqStore1_Stroe23Free_Vec2ReuseVec1) {
auto vec1 = VecOut("vec1");
auto store1 = Store("store1", vec1);
auto store2 = Store("store2", vec1);
auto store3 = Store("store3", vec1);
auto vec2 = Vec("vec2", true, vec1);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"vec1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"store1();\n"
"\n"
"store2();\n"
"\n"
"store3();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
"uint32_t q1_reuse2_offset = 0;\n"
"q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q1_buf[q1_reuse2_offset].template ReinterpretCast<half>();\n"
"vec2();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, AllocVec1Enq_DeqStore1_Stroe23Free_Vec2OutReuseVec1Out) {
auto vec1 = VecOut("vec1");
auto store1 = Store("store1", vec1);
auto store2 = Store("store2", vec1);
auto store3 = Store("store3", vec1);
auto vec2 = VecOut("vec2", nullptr, vec1);
auto store4 = Store("store4", vec2);
auto res = Generate();
EXPECT_EQ(res, std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"vec1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"store1();\n"
"\n"
"store2();\n"
"\n"
"store3();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
"uint32_t q1_reuse2_offset = 0;\n"
"q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q1_buf[q1_reuse2_offset].template ReinterpretCast<half>();\n"
"vec2();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"store4();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
TEST_F(CodegenKernel_CallSync, AllocVec1Enq_PipVec23_DeqStoreFree_Vec4ReuseVec1) {
auto vec1 = VecOut("vec1");
auto vec2 = Vec("vec2", {vec1}, false);
auto vec3 = Vec("vec3", {vec1}, false);
auto store = Store("store", vec1);
auto vec4 = Vec("vec4", true, vec1);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"vec1();\n"
"q1.EnQue(q1_buf);\n"
"\n"
"AscendC::PipeBarrier<PIPE_V>();\n"
"vec2();\n"
"\n"
"vec3();\n"
"\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"store();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
"uint32_t q1_reuse2_offset = 0;\n"
"q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q1_buf[q1_reuse2_offset].template ReinterpretCast<half>();\n"
"vec4();\n"
"q1.FreeTensor(q1_buf);\n"
"\n"
});
}
* load1 load2
* \ /
* vec
* load1和load2共用
*/
TEST_F(CodegenKernel_CallSync, AllocLoad1_ShareLoad2Enq_DeqVec) {
auto load1 = Load("load1", x);
auto load2 = LoadForShare("load2", x, load1);
auto vec = Vec("vec", {load1, load2}, false);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load1();\n\n"
"q1_reuse1_offset = q1_reuse1_offset + local_1_size * 2;\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load2();\n"
"q1.EnQue(q1_buf);\n\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"vec();\n"
"q1.FreeTensor(q1_buf);\n\n"
});
}
* load1 load2
* \ /
* vec
* ...
* load3
* load1和load2共用
* load3与load1/load2复用
*/
TEST_F(CodegenKernel_CallSync, AllocLoad1_ShareLoad2Enq_DeqVec_Load3ReuseLoad2) {
auto load1 = Load("load1", x);
auto load2 = LoadForShare("load2", x, load1);
auto vec = Vec("vec", {load1, load2}, false);
auto load3 = Load("load3", x, load2);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load1();\n\n"
"q1_reuse1_offset = q1_reuse1_offset + local_1_size * 2;\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load2();\n"
"q1.EnQue(q1_buf);\n\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"vec();\n"
"q1.FreeTensor(q1_buf);\n\n"
"uint32_t q1_reuse3_offset = 0;\n"
"q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_3_actual_size = 1;\n"
"LocalTensor<half> local_3;\n"
"local_3 = q1_buf[q1_reuse3_offset].template ReinterpretCast<half>();\n"
"load3();\n"
"q1.EnQue(q1_buf);\n"
"q1.FreeTensor(q1_buf);\n\n"
});
}
* load1
* |
* vec1
* ...
* load2 load3
* \ /
* vec2
* load1与load2和load3复用
* load2和load3共用
*/
TEST_F(CodegenKernel_CallSync, AllocLoad1Enq_DeqVec1_AllocLoad2_ShareLoad3Enq_DeqVec2) {
auto load1 = Load("load1", x);
auto vec1 = Vec("vec1", {load1}, false);
auto load2 = Load("load2", x, load1);
auto load3 = LoadForShare("load3", x, load2);
auto vec2 = Vec("vec2", {load2, load3}, false);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load1();\n"
"q1.EnQue(q1_buf);\n\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"vec1();\n"
"q1.FreeTensor(q1_buf);\n\n"
"uint32_t q1_reuse2_offset = 0;\n"
"q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q1_buf[q1_reuse2_offset].template ReinterpretCast<half>();\n"
"load2();\n\n"
"q1_reuse2_offset = q1_reuse2_offset + local_2_size * 2;\n"
"const uint32_t local_3_actual_size = 1;\n"
"LocalTensor<half> local_3;\n"
"local_3 = q1_buf[q1_reuse2_offset].template ReinterpretCast<half>();\n"
"load3();\n"
"q1.EnQue(q1_buf);\n\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"vec2();\n"
"q1.FreeTensor(q1_buf);\n\n"
});
}
* load1 load3 load2
* \ / |
* vec1 vec2
* load1和load3共用, load2使用其他que,拓扑序为load1 - load2 - load3 - vec1 - vec2
*/
TEST_F(CodegenKernel_CallSync, AllocLoad1_AllocLoad2Enq_ShareLoad3Enq_DeqVec1_DeqVec2) {
auto load1 = Load("load1", x);
auto load2 = Load("load2", x);
auto load3 = LoadForShare("load3", x, load1);
auto vec1 = Vec("vec1", {load1, load3}, false);
auto vec2 = Vec("vec2", {load2}, false);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load1();\n\n"
"uint32_t q2_reuse2_offset = 0;\n"
"LocalTensor<uint8_t> q2_buf = q2.AllocTensor<uint8_t>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q2_buf[q2_reuse2_offset].template ReinterpretCast<half>();\n"
"load2();\n"
"q2.EnQue(q2_buf);\n\n"
"q1_reuse1_offset = q1_reuse1_offset + local_1_size * 2;\n"
"const uint32_t local_3_actual_size = 1;\n"
"LocalTensor<half> local_3;\n"
"local_3 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load3();\n"
"q1.EnQue(q1_buf);\n\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"vec1();\n"
"q1.FreeTensor(q1_buf);\n\n"
"q2_buf = q2.DeQue<uint8_t>();\n"
"vec2();\n"
"q2.FreeTensor(q2_buf);\n\n"
});
}
* load1 load2 load3
* \ | /
* vec
* load1和load3共用, load2使用其他que
*/
TEST_F(CodegenKernel_CallSync, AllocLoad1_AllocLoad2Enq_ShareLoad3Enq_DeqDeqVec) {
auto load1 = Load("load1", x);
auto load2 = Load("load2", x);
auto load3 = LoadForShare("load3", x, load1);
auto vec = Vec("vec", {load1, load2, load3}, false);
EXPECT_EQ(Generate(), std::string{
"uint32_t q1_reuse1_offset = 0;\n"
"LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();\n"
"const uint32_t local_1_actual_size = 1;\n"
"LocalTensor<half> local_1;\n"
"local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load1();\n\n"
"uint32_t q2_reuse2_offset = 0;\n"
"LocalTensor<uint8_t> q2_buf = q2.AllocTensor<uint8_t>();\n"
"const uint32_t local_2_actual_size = 1;\n"
"LocalTensor<half> local_2;\n"
"local_2 = q2_buf[q2_reuse2_offset].template ReinterpretCast<half>();\n"
"load2();\n"
"q2.EnQue(q2_buf);\n\n"
"q1_reuse1_offset = q1_reuse1_offset + local_1_size * 2;\n"
"const uint32_t local_3_actual_size = 1;\n"
"LocalTensor<half> local_3;\n"
"local_3 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();\n"
"load3();\n"
"q1.EnQue(q1_buf);\n\n"
"q1_buf = q1.DeQue<uint8_t>();\n"
"q2_buf = q2.DeQue<uint8_t>();\n"
"vec();\n"
"q2.FreeTensor(q2_buf);\n"
"q1.FreeTensor(q1_buf);\n\n"
});
}
TEST(CodegenKernel, StageGenerate_WillNotDuplicatAllocTensorInSameStage) {
GTEST_SKIP();
}
TEST(CodegenKernel, CompareApiCallNotThrowingFor) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
Data x_op("x", graph);
Data x_op2("x2", graph);
Load load_op("load");
Load load_op2("load2");
af::ascir_op::Gt gt_op("gt");
graph.AddNode(load_op);
graph.AddNode(load_op2);
graph.AddNode(gt_op);
load_op.x = x_op.y;
load_op.attr.sched.axis = {z0.id, z1.id};
*load_op.y.axis = {z0.id, z1.id};
*load_op.y.repeats = {s0, s1};
*load_op.y.strides = {s1, One};
load_op2.x = x_op2.y;
load_op2.attr.sched.axis = {z0.id, z1.id};
*load_op2.y.axis = {z0.id, z1.id};
*load_op2.y.repeats = {s0, s1};
*load_op2.y.strides = {s1, One};
gt_op.x1 = load_op.y;
gt_op.x2 = load_op2.y;
*gt_op.y.axis = {z0.id, z1.id};
*gt_op.y.repeats = {s0, s1};
*gt_op.y.strides = {s1, One};
auto load = graph.FindNode("load");
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load->attr.sched.loop_axis = z0.id;
auto load2 = graph.FindNode("load2");
load2->attr.api.compute_type = af::ComputeType::kComputeLoad;
load2->attr.api.type = af::ApiType::kAPITypeCompute;
load2->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load2->attr.sched.loop_axis = z0.id;
auto size = ge::GetSizeByDataType(ge::DT_FLOAT16);
load->outputs[0].attr.vectorized_axis = {z1.id};
load->outputs[0].attr.vectorized_strides = {One};
load->outputs[0].attr.dtype = ge::DT_FLOAT;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.que.id = 1;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
load2->outputs[0].attr.vectorized_axis = {z1.id};
load2->outputs[0].attr.vectorized_strides = {One};
load2->outputs[0].attr.dtype = ge::DT_FLOAT;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.tensor_id = 1;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load2->outputs[0].attr.que.id = 1;
load2->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto gt = graph.FindNode("gt");
gt->attr.api.compute_type = af::ComputeType::kComputeElewise;
gt->attr.api.type = af::ApiType::kAPITypeCompute;
gt->attr.api.unit = af::ComputeUnit::kUnitVector;
gt->attr.sched.loop_axis = z0.id;
gt->attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
gt->outputs[0].attr.vectorized_axis = {z1.id};
gt->outputs[0].attr.vectorized_strides = {One};
gt->outputs[0].attr.dtype = ge::DT_INT16;
gt->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
gt->outputs[0].attr.mem.tensor_id = 3;
gt->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
gt->outputs[0].attr.que.id = 2;
gt->outputs[0].attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(load->outputs[0]);
tpipe.AddTensor(load2->outputs[0]);
tpipe.AddTensor(gt->outputs[0]);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
std::vector<af::AxisId> current_axis;
current_axis.push_back(z0.id);
codegen::ApiTensor x1;
codegen::ApiTensor x2;
x1.id = load->outputs[0].attr.mem.tensor_id;
x2.id = load2->outputs[0].attr.mem.tensor_id;
codegen::CompareApiCall call("GT");
EXPECT_EQ(call.Init(gt), 0);
call.inputs.push_back(&x1);
call.inputs.push_back(&x2);
std::string result;
call.Generate(tpipe, current_axis, result);
std::cout << result << std::endl;
EXPECT_EQ(result, std::string{
"CompareExtend(local_3[0], local_0[0], local_1[0], CMPMODE::GT, local_0_actual_size, tmp_buf_0);\n"
});
}
TEST(CodegenKernel, CompareApiCallThrowingFor) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
auto z2 = graph.CreateAxis("z2", s2);
Data x_op("x", graph);
Data x_op2("x2", graph);
Load load_op("load");
Load load_op2("load2");
af::ascir_op::Gt gt_op("gt");
graph.AddNode(load_op);
graph.AddNode(load_op2);
graph.AddNode(gt_op);
load_op.x = x_op.y;
load_op.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op.y.axis = {z0.id, z1.id, z2.id};
*load_op.y.repeats = {s0, s1, s2};
*load_op.y.strides = {s1 * s2, s2, One};
load_op2.x = x_op2.y;
load_op2.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op2.y.axis = {z0.id, z1.id, z2.id};
*load_op2.y.repeats = {s0, s1, s2};
*load_op2.y.strides = {s1 * s2, s2, One};
gt_op.x1 = load_op.y;
gt_op.x2 = load_op2.y;
*gt_op.y.axis = {z0.id, z1.id, z2.id};
*gt_op.y.repeats = {s0, s1, s2};
*gt_op.y.strides = {s1 * s2, s2, One};
auto load = graph.FindNode("load");
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load->attr.sched.loop_axis = z0.id;
auto load2 = graph.FindNode("load2");
load2->attr.api.compute_type = af::ComputeType::kComputeLoad;
load2->attr.api.type = af::ApiType::kAPITypeCompute;
load2->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load2->attr.sched.loop_axis = z0.id;
auto size = ge::GetSizeByDataType(ge::DT_FLOAT16);
auto stride = af::sym::Align(z2.size, 32 / size);
load->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load->outputs[0].attr.vectorized_strides = {stride, One};
load->outputs[0].attr.dtype = ge::DT_INT32;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.que.id = 1;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
load2->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load2->outputs[0].attr.vectorized_strides = {stride, One};
load2->outputs[0].attr.dtype = ge::DT_INT32;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.tensor_id = 1;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load2->outputs[0].attr.que.id = 1;
load2->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto gt = graph.FindNode("gt");
gt->attr.api.compute_type = af::ComputeType::kComputeElewise;
gt->attr.api.type = af::ApiType::kAPITypeCompute;
gt->attr.api.unit = af::ComputeUnit::kUnitVector;
gt->attr.sched.loop_axis = z0.id;
gt->attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
gt->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
gt->outputs[0].attr.vectorized_strides = {stride, One};
gt->outputs[0].attr.dtype = ge::DT_INT32;
gt->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
gt->outputs[0].attr.mem.tensor_id = 3;
gt->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
gt->outputs[0].attr.que.id = 2;
gt->outputs[0].attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(load->outputs[0]);
tpipe.AddTensor(load2->outputs[0]);
tpipe.AddTensor(gt->outputs[0]);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
std::vector<af::AxisId> current_axis;
current_axis.push_back(z0.id);
codegen::ApiTensor x1;
codegen::ApiTensor x2;
x1.id = load->outputs[0].attr.mem.tensor_id;
x2.id = load2->outputs[0].attr.mem.tensor_id;
codegen::CompareApiCall call("GT");
EXPECT_EQ(call.Init(gt), 0);
call.inputs.push_back(&x1);
call.inputs.push_back(&x2);
std::string result;
call.Generate(tpipe, current_axis, result);
EXPECT_EQ(result, std::string{
"CompareExtend<int32_t, CMPMODE::GT>(local_3[0], local_0[0], local_1[0], t->s1, t->s2, ((16 * Ceiling((Rational(1 , 16) * t->s2))))/(1), ((16 * Ceiling((Rational(1 , 16) * t->s2))))/(1), tmp_buf_0);\n"});
}
TEST(CodegenKernel, CompareApiCallTwoAxis) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
auto z2 = graph.CreateAxis("z2", s2);
Data x_op("x", graph);
Data x_op2("x2", graph);
Load load_op("load");
Load load_op2("load2");
af::ascir_op::Gt gt_op("gt");
graph.AddNode(load_op);
graph.AddNode(load_op2);
graph.AddNode(gt_op);
load_op.x = x_op.y;
load_op.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op.y.axis = {z0.id, z1.id, z2.id};
*load_op.y.repeats = {s0, s1, s2};
*load_op.y.strides = {s1 * s2, s2, One};
load_op2.x = x_op2.y;
load_op2.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op2.y.axis = {z0.id, z1.id, z2.id};
*load_op2.y.repeats = {s0, s1, s2};
*load_op2.y.strides = {s1 * s2, s2, One};
gt_op.x1 = load_op.y;
gt_op.x2 = load_op2.y;
*gt_op.y.axis = {z0.id, z1.id, z2.id};
*gt_op.y.repeats = {s0, s1, s2};
*gt_op.y.strides = {s1 * s2, s2, One};
auto load = graph.FindNode("load");
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load->attr.sched.loop_axis = z0.id;
auto load2 = graph.FindNode("load2");
load2->attr.api.compute_type = af::ComputeType::kComputeLoad;
load2->attr.api.type = af::ApiType::kAPITypeCompute;
load2->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load2->attr.sched.loop_axis = z0.id;
auto size = ge::GetSizeByDataType(ge::DT_FLOAT16);
auto stride = af::sym::Align(z2.size, 32 / size);
load->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load->outputs[0].attr.vectorized_strides = {stride, One};
load->outputs[0].attr.dtype = ge::DT_FLOAT;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.que.id = 1;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
load2->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load2->outputs[0].attr.vectorized_strides = {stride, One};
load2->outputs[0].attr.dtype = ge::DT_FLOAT;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.tensor_id = 1;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load2->outputs[0].attr.que.id = 1;
load2->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto gt = graph.FindNode("gt");
gt->attr.api.compute_type = af::ComputeType::kComputeElewise;
gt->attr.api.type = af::ApiType::kAPITypeCompute;
gt->attr.api.unit = af::ComputeUnit::kUnitVector;
gt->attr.sched.loop_axis = z0.id;
gt->attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
gt->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
gt->outputs[0].attr.vectorized_strides = {stride, One};
gt->outputs[0].attr.dtype = ge::DT_INT16;
gt->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
gt->outputs[0].attr.mem.tensor_id = 3;
gt->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
gt->outputs[0].attr.que.id = 2;
gt->outputs[0].attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(load->outputs[0]);
tpipe.AddTensor(load2->outputs[0]);
tpipe.AddTensor(gt->outputs[0]);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
std::vector<af::AxisId> current_axis;
current_axis.push_back(z0.id);
codegen::ApiTensor x1;
codegen::ApiTensor x2;
x1.id = load->outputs[0].attr.mem.tensor_id;
x2.id = load2->outputs[0].attr.mem.tensor_id;
codegen::CompareApiCall call("GT");
EXPECT_EQ(call.Init(gt), 0);
call.inputs.push_back(&x1);
call.inputs.push_back(&x2);
std::string result;
call.Generate(tpipe, current_axis, result);
EXPECT_EQ(result, std::string{
"CompareExtend<float, CMPMODE::GT>(local_3[0], local_0[0], local_1[0], t->s1, t->s2, ((16 * Ceiling((Rational(1 , 16) * t->s2))))/(1), ((16 * Ceiling((Rational(1 , 16) * t->s2))))/(1), tmp_buf_0);\n"
});
}
TEST(CodegenKernel, CompareApiCallThrowingForWithX2IsUbScalar) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
auto z2 = graph.CreateAxis("z2", s2);
Data x_op("x", graph);
Data x_op2("x2", graph);
Load load_op("load");
Load load_op2("load2");
af::ascir_op::Gt gt_op("gt");
graph.AddNode(load_op);
graph.AddNode(load_op2);
graph.AddNode(gt_op);
load_op.x = x_op.y;
load_op.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op.y.axis = {z0.id, z1.id, z2.id};
*load_op.y.repeats = {s0, s1, s2};
*load_op.y.strides = {s1 * s2, s2, One};
load_op2.x = x_op2.y;
load_op2.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op2.y.axis = {z0.id, z1.id, z2.id};
*load_op2.y.repeats = {s0, s1, s2};
*load_op2.y.strides = {s1 * s2, s2, One};
gt_op.x1 = load_op.y;
gt_op.x2 = load_op2.y;
*gt_op.y.axis = {z0.id, z1.id, z2.id};
*gt_op.y.repeats = {s0, s1, s2};
*gt_op.y.strides = {s1 * s2, s2, One};
auto load = graph.FindNode("load");
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load->attr.sched.loop_axis = z0.id;
auto load2 = graph.FindNode("load2");
load2->attr.api.compute_type = af::ComputeType::kComputeLoad;
load2->attr.api.type = af::ApiType::kAPITypeCompute;
load2->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load2->attr.sched.loop_axis = z0.id;
auto size = ge::GetSizeByDataType(ge::DT_FLOAT);
auto stride = af::sym::Align(z2.size, 32 / size);
load->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load->outputs[0].attr.vectorized_strides = {stride, One};
load->outputs[0].attr.dtype = ge::DT_FLOAT;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.que.id = 1;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
load2->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load2->outputs[0].attr.vectorized_strides = {stride, One};
load2->outputs[0].attr.dtype = ge::DT_FLOAT;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.tensor_id = 1;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load2->outputs[0].attr.que.id = 1;
load2->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto size_int16 = ge::GetSizeByDataType(ge::DT_INT16);
auto stride_int16 = af::sym::Align(z2.size, 32 / size_int16);
auto gt = graph.FindNode("gt");
gt->attr.api.compute_type = af::ComputeType::kComputeElewise;
gt->attr.api.type = af::ApiType::kAPITypeCompute;
gt->attr.api.unit = af::ComputeUnit::kUnitVector;
gt->attr.sched.loop_axis = z0.id;
gt->attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
gt->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
gt->outputs[0].attr.vectorized_strides = {stride_int16, One};
gt->outputs[0].attr.dtype = ge::DT_INT16;
gt->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
gt->outputs[0].attr.mem.tensor_id = 3;
gt->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
gt->outputs[0].attr.que.id = 2;
gt->outputs[0].attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(load->outputs[0]);
std::string dtype_name;
codegen::Tensor::DtypeName(load2->outputs[0].attr.dtype, dtype_name);
codegen::Tensor t(load2->outputs[0], dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
t.need_gen_get_value_of_ub_scalar = true;
t.is_ub_scalar = true;
EXPECT_EQ(tpipe.AddTensor(t), 0);
tpipe.AddTensor(gt->outputs[0]);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
std::vector<af::AxisId> current_axis;
current_axis.push_back(z0.id);
codegen::ApiTensor x1;
codegen::ApiTensor x2;
x1.id = load->outputs[0].attr.mem.tensor_id;
x2.id = load2->outputs[0].attr.mem.tensor_id;
codegen::CompareApiCall call("GT");
EXPECT_EQ(call.Init(gt), 0);
call.inputs.push_back(&x1);
call.inputs.push_back(&x2);
std::string result;
call.Generate(tpipe, current_axis, result);
EXPECT_EQ(result, std::string{
"CompareExtend<float, CMPMODE::GT>(local_3[0], local_0[0], local_1[0], t->s1, t->s2, ((8 * Ceiling((Rational(1 , 8) * t->s2))))/(1), ((16 * Ceiling((Rational(1 , 16) * t->s2))))/(1), tmp_buf_0);\n"
});
}
TEST(CodegenKernel, CompareApiCallThrowingForWithX2IsUbScalarForUint32) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
auto z2 = graph.CreateAxis("z2", s2);
Data x_op("x", graph);
Data x_op2("x2", graph);
Load load_op("load");
Load load_op2("load2");
af::ascir_op::Gt gt_op("gt");
graph.AddNode(load_op);
graph.AddNode(load_op2);
graph.AddNode(gt_op);
load_op.x = x_op.y;
load_op.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op.y.axis = {z0.id, z1.id, z2.id};
*load_op.y.repeats = {s0, s1, s2};
*load_op.y.strides = {s1 * s2, s2, One};
load_op2.x = x_op2.y;
load_op2.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op2.y.axis = {z0.id, z1.id, z2.id};
*load_op2.y.repeats = {s0, s1, s2};
*load_op2.y.strides = {s1 * s2, s2, One};
gt_op.x1 = load_op.y;
gt_op.x2 = load_op2.y;
*gt_op.y.axis = {z0.id, z1.id, z2.id};
*gt_op.y.repeats = {s0, s1, s2};
*gt_op.y.strides = {s1 * s2, s2, One};
auto load = graph.FindNode("load");
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load->attr.sched.loop_axis = z0.id;
auto load2 = graph.FindNode("load2");
load2->attr.api.compute_type = af::ComputeType::kComputeLoad;
load2->attr.api.type = af::ApiType::kAPITypeCompute;
load2->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load2->attr.sched.loop_axis = z0.id;
auto size = ge::GetSizeByDataType(ge::DT_UINT32);
auto stride = af::sym::Align(z2.size, 32 / size);
load->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load->outputs[0].attr.vectorized_strides = {stride, One};
load->outputs[0].attr.dtype = ge::DT_UINT32;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.que.id = 1;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
load2->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load2->outputs[0].attr.vectorized_strides = {stride, One};
load2->outputs[0].attr.dtype = ge::DT_UINT32;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.tensor_id = 1;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load2->outputs[0].attr.que.id = 1;
load2->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto size_int16 = ge::GetSizeByDataType(ge::DT_INT16);
auto stride_int16 = af::sym::Align(z2.size, 32 / size_int16);
auto gt = graph.FindNode("gt");
gt->attr.api.compute_type = af::ComputeType::kComputeElewise;
gt->attr.api.type = af::ApiType::kAPITypeCompute;
gt->attr.api.unit = af::ComputeUnit::kUnitVector;
gt->attr.sched.loop_axis = z0.id;
gt->attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
gt->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
gt->outputs[0].attr.vectorized_strides = {stride_int16, One};
gt->outputs[0].attr.dtype = ge::DT_INT16;
gt->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
gt->outputs[0].attr.mem.tensor_id = 3;
gt->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
gt->outputs[0].attr.que.id = 2;
gt->outputs[0].attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(load->outputs[0]);
std::string dtype_name;
codegen::Tensor::DtypeName(load2->outputs[0].attr.dtype, dtype_name);
codegen::Tensor t(load2->outputs[0], dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
t.need_gen_get_value_of_ub_scalar = true;
t.is_ub_scalar = true;
EXPECT_EQ(tpipe.AddTensor(t), 0);
tpipe.AddTensor(gt->outputs[0]);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
std::vector<af::AxisId> current_axis;
current_axis.push_back(z0.id);
codegen::ApiTensor x1;
codegen::ApiTensor x2;
x1.id = load->outputs[0].attr.mem.tensor_id;
x2.id = load2->outputs[0].attr.mem.tensor_id;
codegen::CompareApiCall call("GT");
EXPECT_EQ(call.Init(gt), 0);
call.inputs.push_back(&x1);
call.inputs.push_back(&x2);
std::string result;
call.Generate(tpipe, current_axis, result);
EXPECT_EQ(result, std::string{
"CompareExtend<uint32_t, CMPMODE::GT>(local_3[0], local_0[0], local_1[0], t->s1, t->s2, ((8 * Ceiling((Rational(1 , 8) * t->s2))))/(1), ((16 * Ceiling((Rational(1 , 16) * t->s2))))/(1), tmp_buf_0);\n"
});
}
TEST(CodegenKernel, CompareApiCallNotThrowingForWithX2IsUbScalar) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
auto z2 = graph.CreateAxis("z2", s2);
Data x_op("x", graph);
Data x_op2("x2", graph);
Load load_op("load");
Load load_op2("load2");
af::ascir_op::Gt gt_op("gt");
graph.AddNode(load_op);
graph.AddNode(load_op2);
graph.AddNode(gt_op);
load_op.x = x_op.y;
load_op.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op.y.axis = {z0.id, z1.id, z2.id};
*load_op.y.repeats = {s0, s1, s2};
*load_op.y.strides = {s1 * s2, s2, One};
load_op2.x = x_op2.y;
load_op2.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op2.y.axis = {z0.id, z1.id, z2.id};
*load_op2.y.repeats = {s0, s1, s2};
*load_op2.y.strides = {s1 * s2, s2, One};
gt_op.x1 = load_op.y;
gt_op.x2 = load_op2.y;
*gt_op.y.axis = {z0.id, z1.id, z2.id};
*gt_op.y.repeats = {s0, s1, s2};
*gt_op.y.strides = {s1 * s2, s2, One};
auto load = graph.FindNode("load");
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load->attr.sched.loop_axis = z0.id;
auto load2 = graph.FindNode("load2");
load2->attr.api.compute_type = af::ComputeType::kComputeLoad;
load2->attr.api.type = af::ApiType::kAPITypeCompute;
load2->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load2->attr.sched.loop_axis = z0.id;
load->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load->outputs[0].attr.vectorized_strides = {s2, One};
load->outputs[0].attr.dtype = ge::DT_FLOAT;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.que.id = 1;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
load2->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load2->outputs[0].attr.vectorized_strides = {s2, One};
load2->outputs[0].attr.dtype = ge::DT_FLOAT;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.tensor_id = 1;
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load2->outputs[0].attr.que.id = 1;
load2->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto gt = graph.FindNode("gt");
gt->attr.api.compute_type = af::ComputeType::kComputeElewise;
gt->attr.api.type = af::ApiType::kAPITypeCompute;
gt->attr.api.unit = af::ComputeUnit::kUnitVector;
gt->attr.sched.loop_axis = z0.id;
gt->attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
gt->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
gt->outputs[0].attr.vectorized_strides = {s2, One};
gt->outputs[0].attr.dtype = ge::DT_FLOAT;
gt->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
gt->outputs[0].attr.mem.tensor_id = 3;
gt->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
gt->outputs[0].attr.que.id = 2;
gt->outputs[0].attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(load->outputs[0]);
std::string dtype_name;
codegen::Tensor::DtypeName(load2->outputs[0].attr.dtype, dtype_name);
codegen::Tensor t(load2->outputs[0], dtype_name, "t");
EXPECT_EQ(t.Init(), 0);
t.need_gen_get_value_of_ub_scalar = true;
t.is_ub_scalar = true;
EXPECT_EQ(tpipe.AddTensor(t), 0);
tpipe.AddTensor(gt->outputs[0]);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
std::vector<af::AxisId> current_axis;
current_axis.push_back(z0.id);
codegen::ApiTensor x1;
codegen::ApiTensor x2;
x1.id = load->outputs[0].attr.mem.tensor_id;
x2.id = load2->outputs[0].attr.mem.tensor_id;
codegen::CompareApiCall call("GT");
EXPECT_EQ(call.Init(gt), 0);
call.inputs.push_back(&x1);
call.inputs.push_back(&x2);
std::string result;
call.Generate(tpipe, current_axis, result);
EXPECT_EQ(result, std::string{
"CompareScalarExtend(local_3[0], local_0[0], (float)local_1_ub_scalar, CMPMODE::GT, local_0_actual_size, tmp_buf_0);\n"
});
codegen::Kernel kernel("test");
kernel.tpipe.CollectQues(graph);
EXPECT_EQ(kernel.tpipe.AddTensor(t), 0);
EXPECT_EQ(kernel.ParseOptimizeInfo(load2, load->outputs[0]), ge::FAILED);
}
TEST(CodegenKernel, IsnanExtendNotThrowingFor) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
Data x_op("x", graph);
Load load_op("load");
af::ascir_op::Isnan isnan("isnan");
graph.AddNode(load_op);
graph.AddNode(isnan);
load_op.x = x_op.y;
load_op.attr.sched.axis = {z0.id, z1.id};
*load_op.y.axis = {z0.id, z1.id};
*load_op.y.repeats = {s0, s1};
*load_op.y.strides = {s1, One};
isnan.x = load_op.y;
*isnan.y.axis = {z0.id, z1.id};
*isnan.y.repeats = {s0, s1};
*isnan.y.strides = {s1, One};
auto load = graph.FindNode("load");
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load->attr.sched.loop_axis = z0.id;
auto size = ge::GetSizeByDataType(ge::DT_FLOAT16);
load->outputs[0].attr.vectorized_axis = {z1.id};
load->outputs[0].attr.vectorized_strides = {One};
load->outputs[0].attr.dtype = ge::DT_FLOAT;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.que.id = 1;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto isnan_node = graph.FindNode("isnan");
isnan_node->attr.api.compute_type = af::ComputeType::kComputeElewise;
isnan_node->attr.api.type = af::ApiType::kAPITypeCompute;
isnan_node->attr.api.unit = af::ComputeUnit::kUnitVector;
isnan_node->attr.sched.loop_axis = z0.id;
isnan_node->attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
isnan_node->outputs[0].attr.vectorized_axis = {z1.id};
isnan_node->outputs[0].attr.vectorized_strides = {One};
isnan_node->outputs[0].attr.dtype = ge::DT_INT16;
isnan_node->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
isnan_node->outputs[0].attr.mem.tensor_id = 3;
isnan_node->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
isnan_node->outputs[0].attr.que.id = 2;
isnan_node->outputs[0].attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(load->outputs[0]);
tpipe.AddTensor(isnan_node->outputs[0]);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
std::vector<af::AxisId> current_axis;
current_axis.push_back(z0.id);
codegen::ApiTensor x1;
x1.id = load->outputs[0].attr.mem.tensor_id;
codegen::UnaryBitWidthChangeApiCall call("IsnanExtend");
EXPECT_EQ(call.Init(isnan_node), 0);
call.inputs.push_back(&x1);
std::string result;
call.Generate(tpipe, current_axis, result);
std::cout << result << std::endl;
EXPECT_EQ(result, std::string{
"IsnanExtend(local_3[0], local_0[0], tmp_buf_0, local_0_actual_size);\n"
});
}
TEST(CodegenKernel, IsnanExtendThrowingFor) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
auto z2 = graph.CreateAxis("z2", s2);
Data x_op("x", graph);
Load load_op("load");
af::ascir_op::Isnan isnan("isnan");
graph.AddNode(load_op);
graph.AddNode(isnan);
load_op.x = x_op.y;
load_op.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op.y.axis = {z0.id, z1.id, z2.id};
*load_op.y.repeats = {s0, s1, s2};
*load_op.y.strides = {s1*s2, s2, One};
isnan.x = load_op.y;
*isnan.y.axis = {z0.id, z1.id, z2.id};
*isnan.y.repeats = {s0, s1, s2};
*isnan.y.strides = {s1*s2, s2, One};
auto load = graph.FindNode("load");
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load->attr.sched.loop_axis = z0.id;
auto size = ge::GetSizeByDataType(ge::DT_FLOAT16);
load->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load->outputs[0].attr.vectorized_strides = {af::sym::Align(z2.size, 16), One};
load->outputs[0].attr.dtype = ge::DT_FLOAT;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.que.id = 1;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto isnan_node = graph.FindNode("isnan");
isnan_node->attr.api.compute_type = af::ComputeType::kComputeElewise;
isnan_node->attr.api.type = af::ApiType::kAPITypeCompute;
isnan_node->attr.api.unit = af::ComputeUnit::kUnitVector;
isnan_node->attr.sched.loop_axis = z0.id;
isnan_node->attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
isnan_node->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
isnan_node->outputs[0].attr.vectorized_strides = {af::sym::Align(z2.size, 16), One};
isnan_node->outputs[0].attr.dtype = ge::DT_INT16;
isnan_node->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
isnan_node->outputs[0].attr.mem.tensor_id = 3;
isnan_node->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
isnan_node->outputs[0].attr.que.id = 2;
isnan_node->outputs[0].attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(load->outputs[0]);
tpipe.AddTensor(isnan_node->outputs[0]);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
std::vector<af::AxisId> current_axis;
current_axis.push_back(z0.id);
codegen::ApiTensor x1;
x1.id = load->outputs[0].attr.mem.tensor_id;
codegen::UnaryBitWidthChangeApiCall call("IsnanExtend");
EXPECT_EQ(call.Init(isnan_node), 0);
call.inputs.push_back(&x1);
std::string result;
call.Generate(tpipe, current_axis, result);
std::cout << result << std::endl;
EXPECT_EQ(result, std::string{
"for(int outer_for_0 = 0; outer_for_0 < t->s1; outer_for_0++) {\nIsnanExtend(local_3[outer_for_0 * ((16 * Ceiling((Rational(1 , 16) * t->s2))))/(1)], local_0[outer_for_0 * ((16 * Ceiling((Rational(1 , 16) * t->s2))))/(1)], tmp_buf_0, t->s2);\n\n}\n"
});
}
TEST(CodegenKernel, UnaryApiTmpCall) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
auto z2 = graph.CreateAxis("z2", s2);
Data x_op("x", graph);
Load load_op("load");
af::ascir_op::Sign sign("sign");
graph.AddNode(load_op);
graph.AddNode(sign);
load_op.x = x_op.y;
load_op.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_op.y.axis = {z0.id, z1.id, z2.id};
*load_op.y.repeats = {s0, s1, s2};
*load_op.y.strides = {s1*s2, s2, One};
sign.x = load_op.y;
*sign.y.axis = {z0.id, z1.id, z2.id};
*sign.y.repeats = {s0, s1, s2};
*sign.y.strides = {s1*s2, s2, One};
auto load = graph.FindNode("load");
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load->attr.sched.loop_axis = z0.id;
auto size = ge::GetSizeByDataType(ge::DT_FLOAT16);
load->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
load->outputs[0].attr.vectorized_strides = {af::sym::Align(z2.size, 16), One};
load->outputs[0].attr.dtype = ge::DT_FLOAT;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.que.id = 1;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto sign_node = graph.FindNode("sign");
sign_node->attr.api.compute_type = af::ComputeType::kComputeElewise;
sign_node->attr.api.type = af::ApiType::kAPITypeCompute;
sign_node->attr.api.unit = af::ComputeUnit::kUnitVector;
sign_node->attr.sched.loop_axis = z0.id;
sign_node->attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
sign_node->outputs[0].attr.vectorized_axis = {z1.id, z2.id};
sign_node->outputs[0].attr.vectorized_strides = {af::sym::Align(z2.size, 16), One};
sign_node->outputs[0].attr.dtype = ge::DT_INT16;
sign_node->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
sign_node->outputs[0].attr.mem.tensor_id = 3;
sign_node->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
sign_node->outputs[0].attr.que.id = 2;
sign_node->outputs[0].attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(load->outputs[0]);
tpipe.AddTensor(sign_node->outputs[0]);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
std::vector<af::AxisId> current_axis;
current_axis.push_back(z0.id);
codegen::ApiTensor x1;
x1.id = load->outputs[0].attr.mem.tensor_id;
codegen::UnaryApiTmpCall call("SignExtend");
EXPECT_EQ(call.Init(sign_node), 0);
call.inputs.push_back(&x1);
std::string result;
call.Generate(tpipe, current_axis, result);
std::cout << result << std::endl;
EXPECT_EQ(result, std::string{
"SignExtend(local_3[0], local_0[0], tmp_buf_0, local_0_actual_size);\n"
});
}
TEST(CodegenKernel, Ub2ubApiCall) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
Data x_op("x", graph);
Load load_op("load");
af::ascir_op::Ub2ub ub2ub_op("ub2ub");
graph.AddNode(load_op);
graph.AddNode(ub2ub_op);
load_op.x = x_op.y;
load_op.attr.sched.axis = {z0.id, z1.id};
*load_op.y.axis = {z0.id, z1.id};
*load_op.y.repeats = {s0, s1};
*load_op.y.strides = {s1, One};
ub2ub_op.x = load_op.y;
auto load = graph.FindNode("load");
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
load->attr.sched.loop_axis = z0.id;
load->outputs[0].attr.vectorized_axis = {z0.id, z1.id};
load->outputs[0].attr.dtype = ge::DT_FLOAT;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.que.id = 1;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto ub2ub = graph.FindNode("ub2ub");
ub2ub->attr.api.unit = af::ComputeUnit::kUnitVector;
ub2ub->outputs[0].attr.dtype = ge::DT_FLOAT;
ub2ub->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
ub2ub->outputs[0].attr.mem.tensor_id = 1;
ub2ub->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
ub2ub->outputs[0].attr.que.id = 2;
ub2ub->outputs[0].attr.opt.merge_scope = af::kIdNone;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(load->outputs[0]);
tpipe.AddTensor(ub2ub->outputs[0]);
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
codegen::ApiTensor x1;
x1.id = load->outputs[0].attr.mem.tensor_id;
codegen::Ub2ubApiCall call("DataCopy");
EXPECT_EQ(call.Init(ub2ub), 0);
call.inputs.push_back(&x1);
std::string result;
call.Generate(tpipe, vector<af::AxisId>{}, result);
EXPECT_EQ(result, std::string{
"DataCopy(local_1[0], local_0[0], KernelUtils::BlkAlign<float>(local_0_actual_size));\n"
});
}
TEST(CodegenKernel, TwoWorkspaceCodegen) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
Data x_op("x", graph);
Store store_op1("store1");
Store store_op2("store2");
Workspace workspace_op1("workspace1");
Workspace workspace_op2("workspace2");
Load load_op1("load1");
Load load_op2("load2");
Output y_op1("y1");
Output y_op2("y2");
graph.AddNode(store_op1);
graph.AddNode(store_op2);
graph.AddNode(workspace_op1);
graph.AddNode(workspace_op2);
graph.AddNode(load_op1);
graph.AddNode(load_op2);
graph.AddNode(y_op1);
graph.AddNode(y_op2);
x_op.y.dtype = ge::DT_FLOAT16;
x_op.ir_attr.SetIndex(0);
store_op1.x = x_op.y;
store_op1.y.dtype = ge::DT_FLOAT16;
*store_op1.y.axis = {z0.id, z1.id};
workspace_op1.x = store_op1.y;
workspace_op1.y.dtype = ge::DT_FLOAT16;
*workspace_op1.y.axis = {z0.id, z1.id};
store_op2.x = x_op.y;
store_op2.y.dtype = ge::DT_FLOAT16;
*store_op2.y.axis = {z0.id, z1.id};
workspace_op2.x = store_op2.y;
workspace_op2.y.dtype = ge::DT_FLOAT16;
*workspace_op2.y.axis = {z0.id, z1.id};
load_op1.x = workspace_op1.y;
load_op1.y.dtype = ge::DT_FLOAT16;
*load_op1.y.axis = {z0.id, z1.id};
load_op2.x = workspace_op2.y;
load_op2.y.dtype = ge::DT_FLOAT16;
*load_op2.y.axis = {z0.id, z1.id};
y_op1.x = load_op1.y;
y_op1.ir_attr.SetIndex(0);
y_op2.x = load_op2.y;
y_op2.ir_attr.SetIndex(1);
auto x = graph.FindNode("x");
auto load1 = graph.FindNode("load1");
auto load2 = graph.FindNode("load2");
auto workspace1 = graph.FindNode("workspace1");
auto workspace2 = graph.FindNode("workspace2");
auto store1 = graph.FindNode("store1");
auto store2 = graph.FindNode("store2");
auto y1 = graph.FindNode("y1");
auto y2 = graph.FindNode("y2");
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.tensor_id = 0;
store1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store1->outputs[0].attr.mem.tensor_id = 1;
store1->outputs[0].attr.repeats = {s0, s1};
store1->outputs[0].attr.strides = {s1, One};
store2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store2->outputs[0].attr.mem.tensor_id = 2;
store2->outputs[0].attr.repeats = {s0, s1};
store2->outputs[0].attr.strides = {s1, One};
workspace1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
workspace1->outputs[0].attr.mem.tensor_id = 1;
workspace1->outputs[0].attr.repeats = {s0, s1};
workspace2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
workspace2->outputs[0].attr.mem.tensor_id = 2;
workspace2->outputs[0].attr.repeats = {s0, s1};
load1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
load1->outputs[0].attr.mem.tensor_id = 3;
load1->outputs[0].attr.repeats = {s0, s1};
load1->outputs[0].attr.strides = {s1, One};
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
load2->outputs[0].attr.mem.tensor_id = 4;
load2->outputs[0].attr.repeats = {s0, s1};
load2->outputs[0].attr.strides = {s1, One};
y1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
y1->outputs[0].attr.mem.tensor_id = 5;
y2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
y2->outputs[0].attr.mem.tensor_id = 6;
workspace1->attr.api.compute_type = af::ComputeType::kComputeInvalid;
workspace2->attr.api.compute_type = af::ComputeType::kComputeInvalid;
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.input_nodes.push_back(x);
fused_schedule_result.output_nodes.push_back(y1);
fused_schedule_result.output_nodes.push_back(y2);
fused_schedule_result.workspace_nodes.push_back(workspace1);
fused_schedule_result.workspace_nodes.push_back(workspace2);
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
std::string result;
kernel.GlobalTensorInit(result);
EXPECT_EQ(result, std::string{
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_3;\n"
"global_3.SetGlobalBuffer((__gm__ half*)y1);\n"
"GlobalTensor<half> global_4;\n"
"global_4.SetGlobalBuffer((__gm__ half*)y2);\n"
"GlobalTensor<half> global_1;\n"
"global_1.SetGlobalBuffer((__gm__ half*)workspace);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)((__gm__ uint8_t*)(workspace) + (0 + (workspace1))));\n"
});
}
TEST(CodegenKernel, TwoWorkspaceReuseAsInputCodegen) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
Workspace workspace_op1("workspace1");
Workspace workspace_op2("workspace2");
Load load_op1("load1");
Load load_op2("load2");
Output y_op1("y1");
Output y_op2("y2");
graph.AddNode(workspace_op1);
graph.AddNode(workspace_op2);
graph.AddNode(load_op1);
graph.AddNode(load_op2);
graph.AddNode(y_op1);
graph.AddNode(y_op2);
workspace_op1.y.dtype = ge::DT_FLOAT16;
*workspace_op1.y.axis = {z0.id, z1.id};
workspace_op2.y.dtype = ge::DT_FLOAT16;
*workspace_op2.y.axis = {z0.id, z1.id};
load_op1.x = workspace_op1.y;
load_op1.y.dtype = ge::DT_FLOAT16;
*load_op1.y.axis = {z0.id, z1.id};
load_op2.x = workspace_op2.y;
load_op2.y.dtype = ge::DT_FLOAT16;
*load_op2.y.axis = {z0.id, z1.id};
y_op1.x = load_op1.y;
y_op1.ir_attr.SetIndex(0);
y_op2.x = load_op2.y;
y_op2.ir_attr.SetIndex(1);
auto load1 = graph.FindNode("load1");
auto load2 = graph.FindNode("load2");
auto workspace1 = graph.FindNode("workspace1");
auto workspace2 = graph.FindNode("workspace2");
auto y1 = graph.FindNode("y1");
auto y2 = graph.FindNode("y2");
workspace1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
workspace1->outputs[0].attr.mem.tensor_id = 1;
workspace1->outputs[0].attr.repeats = {s0, s1};
workspace2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
workspace2->outputs[0].attr.mem.tensor_id = 1;
workspace2->outputs[0].attr.repeats = {s0, s1};
load1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
load1->outputs[0].attr.mem.tensor_id = 3;
load1->outputs[0].attr.repeats = {s0, s1};
load1->outputs[0].attr.strides = {s1, One};
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
load2->outputs[0].attr.mem.tensor_id = 4;
load2->outputs[0].attr.repeats = {s0, s1};
load2->outputs[0].attr.strides = {s1, One};
y1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
y1->outputs[0].attr.mem.tensor_id = 5;
y2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
y2->outputs[0].attr.mem.tensor_id = 6;
workspace1->attr.api.compute_type = af::ComputeType::kComputeInvalid;
workspace2->attr.api.compute_type = af::ComputeType::kComputeInvalid;
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.output_nodes.push_back(y1);
fused_schedule_result.output_nodes.push_back(y2);
fused_schedule_result.workspace_nodes.push_back(workspace1);
fused_schedule_result.workspace_nodes.push_back(workspace2);
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
std::string result;
kernel.GlobalTensorInit(result);
EXPECT_EQ(result, std::string{
"GlobalTensor<half> global_3;\n"
"global_3.SetGlobalBuffer((__gm__ half*)y1);\n"
"GlobalTensor<half> global_4;\n"
"global_4.SetGlobalBuffer((__gm__ half*)y2);\n"
"GlobalTensor<half> global_1;\n"
"global_1.SetGlobalBuffer((__gm__ half*)workspace);\n"
});
}
TEST(CodegenKernel, WorkspaceReuseOutputAsInputCodegen) {
af::AscGraph graph0("test_graph0");
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
Workspace workspace_op("workspace");
Load load_op0("load0");
Load load_op("load");
Output y_op1("y1");
Output y_op2("y2");
graph0.AddNode(load_op0);
graph0.AddNode(y_op1);
load_op0.y.dtype = ge::DT_FLOAT16;
y_op1.x = load_op0.y;
y_op1.y.dtype = ge::DT_FLOAT16;
y_op1.ir_attr.SetIndex(0);
graph.AddNode(workspace_op);
graph.AddNode(load_op);
graph.AddNode(y_op2);
workspace_op.y.dtype = ge::DT_FLOAT16;
*workspace_op.y.axis = {z0.id, z1.id};
load_op.x = workspace_op.y;
load_op.y.dtype = ge::DT_FLOAT16;
*load_op.y.axis = {z0.id, z1.id};
y_op2.x = load_op.y;
y_op2.ir_attr.SetIndex(1);
auto load0 = graph0.FindNode("load0");
auto y1 = graph0.FindNode("y1");
load0->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
load0->outputs[0].attr.mem.tensor_id = 1;
y1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
y1->outputs[0].attr.mem.tensor_id = 1;
auto load = graph.FindNode("load");
auto workspace = graph.FindNode("workspace");
auto y2 = graph.FindNode("y2");
workspace->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
workspace->outputs[0].attr.mem.tensor_id = 1;
workspace->outputs[0].attr.repeats = {s0, s1};
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
load->outputs[0].attr.mem.tensor_id = 2;
load->outputs[0].attr.repeats = {s0, s1};
load->outputs[0].attr.strides = {s1, One};
y2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
y2->outputs[0].attr.mem.tensor_id = 3;
workspace->attr.api.compute_type = af::ComputeType::kComputeInvalid;
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.output_nodes.push_back(y1);
fused_schedule_result.output_nodes.push_back(y2);
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
std::string result;
kernel.GlobalTensorInit(result);
EXPECT_EQ(result, std::string{
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y2);\n"
"GlobalTensor<half> global_1;\n"
"global_1.SetGlobalBuffer((__gm__ half*)y1);\n"
});
}
TEST(CodegenKernel, Looper_GenerateLoop_WhenNestedLoop) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::Axis z0{.id = 0, .name = "z0", .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .size = s1.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
codegen::Loop loop1(z0.id);
codegen::Loop loop2(z1.id);
loop1.AddLoop(&loop2);
codegen::TPipe tpipe("t", tiler);
std::string result;
EXPECT_EQ(loop1.Generate(tiler, tpipe, result), ge::SUCCESS);
EXPECT_EQ(result, std::string{
"for (int z0 = 0; z0 < z0_loop_size; z0++) {\n"
"for (int z1 = 0; z1 < z1_loop_size; z1++) {\n"
"}\n"
"}\n"
});
}
TEST(CodegenKernel, Looper_GenerateLoop_WhenTwoLoop) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::Axis z0{.id = 0, .name = "z0", .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .size = s1.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
codegen::Loop root_loop(af::kIdNone), loop1(z0.id), loop2(z1.id);
root_loop.AddLoop(&loop1);
root_loop.AddLoop(&loop2);
codegen::TPipe tpipe("t", tiler);
std::string result;
EXPECT_EQ(root_loop.Generate(tiler, tpipe, result), ge::SUCCESS);
EXPECT_EQ(result, std::string{
"for (int z0 = 0; z0 < z0_loop_size; z0++) {\n"
"}\n"
"for (int z1 = 0; z1 < z1_loop_size; z1++) {\n"
"}\n"
});
}
TEST(CodegenKernel, Kernel_GlobalTensorInit) {
af::AscGraph graph("test_graph");
af::ascir_op::Data x_op("x", graph);
af::ascir_op::Output y_op("y");
af::ascir_op::Load load_op("load");
af::ascir_op::Store store_op("store");
graph.AddNode(load_op);
graph.AddNode(store_op);
graph.AddNode(y_op);
x_op.y.dtype = ge::DT_FLOAT16;
x_op.ir_attr.SetIndex(0);
load_op.x = x_op.y;
load_op.y.dtype = ge::DT_FLOAT16;
store_op.x = load_op.y;
store_op.y.dtype = ge::DT_FLOAT16;
y_op.x = store_op.y;
y_op.ir_attr.SetIndex(0);
auto x = graph.FindNode("x");
auto load = graph.FindNode("load");
auto store = graph.FindNode("store");
auto y = graph.FindNode("y");
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
load->outputs[0].attr.mem.tensor_id = 1;
load->outputs[0].attr.buf.id = 1;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.tensor_id = 2;
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.input_nodes.push_back(x);
fused_schedule_result.output_nodes.push_back(y);
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
std::string result;
kernel.GlobalTensorInit(result);
EXPECT_EQ(result, std::string{
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n"
});
}
TEST(CodegenKernel, Kernel_ConstantTensorInit) {
af::AscGraph graph("test_graph");
af::ascir_op::Scalar constant("constant");
graph.AddNode(constant);
constant.ir_attr.SetValue("100.1");
constant.y.dtype = ge::DT_FLOAT16;
auto constant_node = graph.FindNode("constant");
constant_node->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeInvalid;
constant_node->outputs[0].attr.mem.tensor_id = 0;
constant_node->outputs[0].attr.mem.position = af::Position::kPositionInvalid;
::ascir::FusedScheduledResult fused_schedule_result;
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
std::string result;
kernel.GlobalTensorInit(result);
EXPECT_EQ(result, std::string{
"const half scalar_0 = 100.1;\n"
});
}
TEST(CodegenKernel, Kernel_IndexExprTensorInit) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
af::ascir_op::IndexExpr index("index");
graph.AddNode(index);
index.ir_attr.SetExpr(0);
index.y.dtype = ge::DT_FLOAT16;
auto index_node = graph.FindNode("index");
index_node->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeInvalid;
index_node->outputs[0].attr.mem.tensor_id = 0;
index_node->outputs[0].attr.mem.position = af::Position::kPositionInvalid;
::ascir::FusedScheduledResult fused_schedule_result;
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
std::string result;
kernel.GlobalTensorInit(result);
EXPECT_EQ(result, std::string{
"const half scalar_0 = (t->s0)/(1);\n"
});
}
TEST(CodegenKernel, Kernel_KernelFunctionDeclare) {
af::AscGraph graph("test_kernel");
af::ascir_op::Data x1("x1"), x2("x2"), x3("x3");
x1.ir_attr.SetIndex(0);
x2.ir_attr.SetIndex(1);
x3.ir_attr.SetIndex(2);
af::ascir_op::Output y1("y1"), y2("y2"), y3("y3");
y1.ir_attr.SetIndex(0);
y2.ir_attr.SetIndex(1);
y3.ir_attr.SetIndex(2);
graph.AddNode(x1);
graph.AddNode(x2);
graph.AddNode(x3);
graph.AddNode(y1);
graph.AddNode(y2);
graph.AddNode(y3);
y1.x = x1.y;
y2.x = x2.y;
y3.x = x3.y;
auto x1_node = graph.FindNode("x1");
auto x2_node = graph.FindNode("x2");
auto x3_node = graph.FindNode("x3");
auto y1_node = graph.FindNode("y1");
auto y2_node = graph.FindNode("y2");
auto y3_node = graph.FindNode("y3");
x1_node->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x1_node->outputs[0].attr.mem.tensor_id = 0;
x2_node->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x2_node->outputs[0].attr.mem.tensor_id = 1;
x3_node->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x3_node->outputs[0].attr.mem.tensor_id = 2;
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.input_nodes.push_back(x1_node);
fused_schedule_result.input_nodes.push_back(x2_node);
fused_schedule_result.input_nodes.push_back(x3_node);
fused_schedule_result.output_nodes.push_back(y1_node);
fused_schedule_result.output_nodes.push_back(y2_node);
fused_schedule_result.output_nodes.push_back(y3_node);
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
EXPECT_EQ(kernel.TilingKeyFuncDeclare("test_kernel_general_0_nil_0_nil", "test_kernelTilingData"), std::string{
"inline __aicore__ void test_kernel_general_0_nil_0_nil(GM_ADDR x1, GM_ADDR x2, GM_ADDR x3, GM_ADDR y1, GM_ADDR y2, GM_ADDR y3, GM_ADDR workspace, const test_kernelTilingData *t)"
});
}
TEST(CodegenKernel, Kernel_LocalTensorQueBufAlloc) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto z0 = graph.CreateAxis("z0", s0);
af::ascir_op::Data x_op("x", graph);
x_op.ir_attr.SetIndex(0);
af::ascir_op::Load load_op("load");
af::ascir_op::Store store_op("store");
af::ascir_op::Output y_op("y");
y_op.ir_attr.SetIndex(0);
graph.AddNode(load_op);
graph.AddNode(store_op);
graph.AddNode(y_op);
x_op.y.dtype = ge::DT_FLOAT16;
load_op.x = x_op.y;
load_op.y.dtype = ge::DT_FLOAT16;
store_op.x = load_op.y;
store_op.y.dtype = ge::DT_FLOAT16;
y_op.x = store_op.y;
y_op.y.dtype = ge::DT_FLOAT16;
store_op.attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
auto x = graph.FindNode("x");
auto load = graph.FindNode("load");
auto store = graph.FindNode("store");
auto y = graph.FindNode("y");
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.axis = {z0.id};
load->outputs[0].attr.vectorized_axis = {z0.id};
load->outputs[0].attr.vectorized_strides = {One};
load->outputs[0].attr.repeats = {z0.size};
load->outputs[0].attr.strides = {One};
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.mem.tensor_id = 1;
load->outputs[0].attr.que.id = 0;
load->outputs[0].attr.que.depth = 2;
load->outputs[0].attr.que.buf_num = 2;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.tensor_id = 2;
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.input_nodes.push_back(x);
fused_schedule_result.output_nodes.push_back(y);
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
std::string result;
kernel.LocalTensorQueBufAlloc(result, graph);
EXPECT_EQ(result, std::string{
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = KernelUtils::BlkAlign<half>((t->s0 - 1) + 1);\n"
"const uint32_t local_1_que_buf_num = 2;\n\n"
"// const uint32_t q0_size = KernelUtils::Max(local_1_size * sizeof(half));\n"
"const uint32_t q0_buf_num = KernelUtils::Max(2);\n"
"TQueBind<TPosition::VECIN, TPosition::VECOUT, 1> q0;\n"
"// tpipe.InitBuffer(q0, q0_buf_num, KernelUtils::BlkAlign<uint8_t>(q0_size));\n"
"tpipe.InitBuffer(q0, q0_buf_num, t->q0_size);\n"
"// const uint32_t b0_size = KernelUtils::Max(8192);\n"
"TBuf<TPosition::VECCALC> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
});
}
TEST(CodegenKernel, Kernel_LocalTensorShareReuseQueAlloc) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto z0 = graph.CreateAxis("z0", s0);
af::ascir_op::Data x1_op("x1", graph);
x1_op.ir_attr.SetIndex(0);
af::ascir_op::Data x2_op("x2", graph);
x2_op.ir_attr.SetIndex(1);
af::ascir_op::Data x3_op("x3", graph);
x3_op.ir_attr.SetIndex(2);
af::ascir_op::Load load1_op("load1");
af::ascir_op::Load load2_op("load2");
af::ascir_op::Load load3_op("load3");
af::ascir_op::Add add_op("add");
af::ascir_op::Store store_op("store");
af::ascir_op::Output y_op("y");
y_op.ir_attr.SetIndex(0);
graph.AddNode(load1_op);
graph.AddNode(load2_op);
graph.AddNode(load3_op);
graph.AddNode(add_op);
graph.AddNode(store_op);
graph.AddNode(y_op);
x1_op.y.dtype = ge::DT_FLOAT16;
x2_op.y.dtype = ge::DT_FLOAT16;
x3_op.y.dtype = ge::DT_FLOAT16;
load1_op.x = x1_op.y;
load1_op.y.dtype = ge::DT_FLOAT16;
load2_op.x = x2_op.y;
load2_op.y.dtype = ge::DT_FLOAT16;
load3_op.x = x3_op.y;
load3_op.y.dtype = ge::DT_FLOAT16;
add_op.x1 = load1_op.y;
add_op.x2 = load2_op.y;
add_op.y.dtype = ge::DT_FLOAT16;
add_op.attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
store_op.x = add_op.y;
store_op.y.dtype = ge::DT_FLOAT16;
y_op.x = store_op.y;
y_op.y.dtype = ge::DT_FLOAT16;
auto x1 = graph.FindNode("x1");
auto x2 = graph.FindNode("x2");
auto x3 = graph.FindNode("x3");
auto load1 = graph.FindNode("load1");
auto load2 = graph.FindNode("load2");
auto load3 = graph.FindNode("load3");
auto add = graph.FindNode("add");
auto store = graph.FindNode("store");
auto y = graph.FindNode("y");
x1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x1->outputs[0].attr.mem.tensor_id = 0;
x2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x2->outputs[0].attr.mem.tensor_id = 1;
x3->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x3->outputs[0].attr.mem.tensor_id = 2;
load1->outputs[0].attr.axis = {z0.id};
load1->outputs[0].attr.vectorized_axis = {z0.id};
load1->outputs[0].attr.vectorized_strides = {One};
load1->outputs[0].attr.repeats = {z0.size};
load1->outputs[0].attr.strides = {One};
load1->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load1->outputs[0].attr.mem.tensor_id = 3;
load1->outputs[0].attr.que.id = 0;
load1->outputs[0].attr.mem.reuse_id = 0;
load1->outputs[0].attr.que.depth = 2;
load1->outputs[0].attr.que.buf_num = 2;
load1->outputs[0].attr.opt.merge_scope = af::kIdNone;
load2->outputs[0].attr.axis = {z0.id};
load2->outputs[0].attr.vectorized_axis = {z0.id};
load2->outputs[0].attr.vectorized_strides = {One};
load2->outputs[0].attr.repeats = {z0.size};
load2->outputs[0].attr.strides = {One};
load2->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load2->outputs[0].attr.mem.tensor_id = 4;
load2->outputs[0].attr.que.id = 0;
load2->outputs[0].attr.mem.reuse_id = 0;
load2->outputs[0].attr.que.depth = 2;
load2->outputs[0].attr.que.buf_num = 2;
load2->outputs[0].attr.opt.merge_scope = af::kIdNone;
load3->outputs[0].attr.axis = {z0.id};
load3->outputs[0].attr.vectorized_axis = {z0.id};
load3->outputs[0].attr.vectorized_strides = {One};
load3->outputs[0].attr.repeats = {z0.size};
load3->outputs[0].attr.strides = {One};
load3->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load3->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load3->outputs[0].attr.mem.tensor_id = 5;
load3->outputs[0].attr.que.id = 0;
load3->outputs[0].attr.mem.reuse_id = 1;
load3->outputs[0].attr.que.depth = 2;
load3->outputs[0].attr.que.buf_num = 2;
load3->outputs[0].attr.opt.merge_scope = af::kIdNone;
add->outputs[0].attr.axis = {z0.id};
add->outputs[0].attr.vectorized_axis = {z0.id};
add->outputs[0].attr.vectorized_strides = {One};
add->outputs[0].attr.repeats = {z0.size};
add->outputs[0].attr.strides = {One};
add->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
add->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
add->outputs[0].attr.mem.tensor_id = 6;
add->outputs[0].attr.que.id = 0;
add->outputs[0].attr.mem.reuse_id = 0;
add->outputs[0].attr.que.depth = 2;
add->outputs[0].attr.que.buf_num = 2;
add->outputs[0].attr.opt.merge_scope = af::kIdNone;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.tensor_id = 6;
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.input_nodes.push_back(x1);
fused_schedule_result.input_nodes.push_back(x2);
fused_schedule_result.input_nodes.push_back(x3);
fused_schedule_result.output_nodes.push_back(y);
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
std::string result;
kernel.LocalTensorQueBufAlloc(result, graph);
EXPECT_EQ(result, std::string{
"TPipe tpipe;\n\n"
"const uint32_t local_3_size = KernelUtils::BlkAlign<half>((t->s0 - 1) + 1);\n"
"const uint32_t local_3_que_buf_num = 2;\n"
"const uint32_t local_4_size = KernelUtils::BlkAlign<half>((t->s0 - 1) + 1);\n"
"const uint32_t local_4_que_buf_num = 2;\n"
"const uint32_t local_5_size = KernelUtils::BlkAlign<half>((t->s0 - 1) + 1);\n"
"const uint32_t local_5_que_buf_num = 2;\n"
"const uint32_t local_6_size = KernelUtils::BlkAlign<half>((t->s0 - 1) + 1);\n"
"const uint32_t local_6_que_buf_num = 2;\n\n"
"// const uint32_t q0_size = KernelUtils::Max(local_3_size * sizeof(half), local_4_size * sizeof(half), local_5_size * sizeof(half), local_6_size * sizeof(half), local_3_size * sizeof(half) + local_4_size * sizeof(half) + local_6_size * sizeof(half));\n"
"const uint32_t q0_buf_num = KernelUtils::Max(2);\n"
"TQue<TPosition::VECIN, 1> q0;\n"
"// tpipe.InitBuffer(q0, q0_buf_num, KernelUtils::BlkAlign<uint8_t>(q0_size));\n"
"tpipe.InitBuffer(q0, q0_buf_num, t->q0_size);\n"
"// const uint32_t b0_size = KernelUtils::Max(8192);\n"
"TBuf<TPosition::VECCALC> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
});
}
TEST(CodegenKernel, Kernel_GenerateKernel_Multi_ScheduleGroup) {
af::AscGraph graph("test_kernel");
af::ascir_op::Data x_op("x", graph);
x_op.ir_attr.SetIndex(0);
af::ascir_op::Output y_op("y");
y_op.ir_attr.SetIndex(0);
af::ascir_op::Load load_op("load");
af::ascir_op::Store store_op("store");
graph.AddNode(load_op);
graph.AddNode(store_op);
graph.AddNode(y_op);
x_op.y.dtype = ge::DT_FLOAT16;
load_op.x = x_op.y;
load_op.y.dtype = ge::DT_FLOAT16;
store_op.x = load_op.y;
store_op.y.dtype = ge::DT_FLOAT16;
y_op.x = store_op.y;
store_op.attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
auto x = graph.FindNode("x");
auto load = graph.FindNode("load");
auto store = graph.FindNode("store");
auto output = graph.FindNode("y");
x->attr.api.unit = af::ComputeUnit::kUnitNone;
load->attr.api.unit = af::ComputeUnit::kUnitNone;
store->attr.api.unit = af::ComputeUnit::kUnitNone;
output->attr.api.unit = af::ComputeUnit::kUnitNone;
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.tensor_id = 0;
x->outputs[0].attr.opt.merge_scope = 0;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
load->outputs[0].attr.mem.tensor_id = 1;
load->outputs[0].attr.buf.id = 0;
load->outputs[0].attr.opt.merge_scope = 0;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.tensor_id = 2;
store->outputs[0].attr.opt.merge_scope = 0;
::ascir::ScheduleGroup schedule_result0_group0 = {{af::AscGraph("test_kernel_general_0_nil_0_nil"),
af::AscGraph("test_kernel_general_1_nil_1_nil"), af::AscGraph("test_kernel_general_2_nil_2_nil")}};
::ascir::ScheduleGroup schedule_result0_group1 = {{af::AscGraph("test_kernel_general_3_nil_3_nil"),
af::AscGraph("test_kernel_general_4_nil_4_nil")}};
::ascir::ScheduleGroup schedule_result1_group0 = {{af::AscGraph("test_kernel_general_5_nil_5_nil"),
af::AscGraph("test_kernel_general_6_nil_6_nil")}};
::ascir::ScheduledResult schedule_result0;
schedule_result0.schedule_groups.push_back(schedule_result0_group0);
schedule_result0.schedule_groups.push_back(schedule_result0_group1);
::ascir::ScheduledResult schedule_result1;
schedule_result1.schedule_groups.push_back(schedule_result1_group0);
std::vector<::ascir::ScheduledResult> schedule_results;
schedule_results.push_back(schedule_result0);
schedule_results.push_back(schedule_result1);
for (uint32_t i = 0; i < schedule_results.size(); i++) {
for (auto schedule_group : schedule_results[i].schedule_groups) {
for (auto impl_graph : schedule_group.impl_graphs) {
impl_graph.CopyFrom(graph);
}
}
}
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.fused_graph_name = af::AscendString(graph.GetName().c_str());
fused_schedule_result.node_idx_to_scheduled_results.push_back(schedule_results);
fused_schedule_result.input_nodes.push_back(x);
fused_schedule_result.output_nodes.push_back(output);
std::stringstream ss;
std::string string;
codegen::Kernel::GenKernelFuncByTilingKey(fused_schedule_result, ss);
string = ss.str();
EXPECT_EQ(string, std::string{
"\n"
"/**\n"
" * Copyright (c) 2025 Huawei Technologies Co., Ltd.\n"
" * This program is free software, you can redistribute it and/or modify it under the terms and conditions of \n"
" * CANN Open Software License Agreement Version 2.0 (the \"License\").\n"
" * Please refer to the License for details. You may not use this file except in compliance with the License.\n"
" * THIS SOFTWARE IS PROVIDED ON AN \"AS IS\" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, \n"
" * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.\n"
" * See LICENSE in the root of the software repository for the full text of the License.\n"
" */\n"
"#ifndef __ASCENDC_API_DATACOPY_H__\n"
"#define __ASCENDC_API_DATACOPY_H__\n"
"\n"
"template <typename T>\n"
"inline __aicore__ void DataCopyPadExtend(const AscendC::LocalTensor<T> &dst, const AscendC::GlobalTensor<T> &src,\n"
" uint32_t block_count, uint32_t block_len, uint32_t src_stride,\n"
" uint32_t dst_stride) {\n"
" uint32_t align_num = ONE_BLK_SIZE / sizeof(T);\n"
" AscendC::DataCopyExtParams param;\n"
" param.blockCount = block_count;\n"
" param.blockLen = block_len * sizeof(T);\n"
" param.srcStride = src_stride * sizeof(T);\n"
" param.dstStride = dst_stride / align_num;\n"
"\n"
" AscendC::DataCopyPadExtParams<T> pad_params = {true, 0, 0, 0};\n"
" AscendC::DataCopyPad(dst, src, param, pad_params);\n"
"}\n"
"\n"
"template <typename T>\n"
"inline __aicore__ void DataCopyPadExtend(const AscendC::GlobalTensor<T> &dst, const AscendC::LocalTensor<T> &src,\n"
" uint32_t block_count, uint32_t block_len, uint32_t src_stride,\n"
" uint32_t dst_stride) {\n"
" uint32_t align_num = ONE_BLK_SIZE / sizeof(T);\n"
" AscendC::DataCopyExtParams param;\n"
" param.blockCount = block_count;\n"
" param.blockLen = block_len * sizeof(T);\n"
" param.srcStride = src_stride / align_num;\n"
" param.dstStride = dst_stride * sizeof(T);\n"
"\n"
" AscendC::DataCopyPad(dst, src, param);\n"
"}\n"
"\n"
"template <typename T>\n"
"inline __aicore__ void DataCopyExtend(const AscendC::LocalTensor<T> &dst, const AscendC::LocalTensor<T> &src,\n"
" const uint32_t size) {\n"
" AscendC::DataCopy(dst, src, size);\n"
"}\n"
"\n"
"#endif // __ASCENDC_API_DATACOPY_H__\n"
"inline __aicore__ void test_kernel_general_0_nil_0_nil(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, const AscGraph0ScheduleResult0G0TilingData *t) {\n"
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n\n"
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n\n"
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = 1;\n"
"const uint32_t m0_size = KernelUtils::Sum(local_1_size * sizeof(half));\n"
"const uint32_t m0_que_buf_num = KernelUtils::Max();\n\n"
"// const uint32_t b0_size = KernelUtils::Max(m0_size, 8192);\n"
"TBuf<TPosition::GM> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
"}\n"
"inline __aicore__ void test_kernel_general_1_nil_1_nil(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, const AscGraph0ScheduleResult0G0TilingData *t) {\n"
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n\n"
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n\n"
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = 1;\n"
"const uint32_t m0_size = KernelUtils::Sum(local_1_size * sizeof(half));\n"
"const uint32_t m0_que_buf_num = KernelUtils::Max();\n\n"
"// const uint32_t b0_size = KernelUtils::Max(m0_size, 8192);\n"
"TBuf<TPosition::GM> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
"}\n"
"inline __aicore__ void test_kernel_general_2_nil_2_nil(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, const AscGraph0ScheduleResult0G0TilingData *t) {\n"
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n\n"
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n\n"
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = 1;\n"
"const uint32_t m0_size = KernelUtils::Sum(local_1_size * sizeof(half));\n"
"const uint32_t m0_que_buf_num = KernelUtils::Max();\n\n"
"// const uint32_t b0_size = KernelUtils::Max(m0_size, 8192);\n"
"TBuf<TPosition::GM> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
"}\n"
"inline __aicore__ void test_kernel_general_3_nil_3_nil(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, const AscGraph0ScheduleResult0G1TilingData *t) {\n"
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n\n"
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n\n"
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = 1;\n"
"const uint32_t m0_size = KernelUtils::Sum(local_1_size * sizeof(half));\n"
"const uint32_t m0_que_buf_num = KernelUtils::Max();\n\n"
"// const uint32_t b0_size = KernelUtils::Max(m0_size, 8192);\n"
"TBuf<TPosition::GM> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
"}\n"
"inline __aicore__ void test_kernel_general_4_nil_4_nil(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, const AscGraph0ScheduleResult0G1TilingData *t) {\n"
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n\n"
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n\n"
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = 1;\n"
"const uint32_t m0_size = KernelUtils::Sum(local_1_size * sizeof(half));\n"
"const uint32_t m0_que_buf_num = KernelUtils::Max();\n\n"
"// const uint32_t b0_size = KernelUtils::Max(m0_size, 8192);\n"
"TBuf<TPosition::GM> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
"}\n"
"inline __aicore__ void test_kernel_general_5_nil_5_nil(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, const AscGraph0ScheduleResult1G0TilingData *t) {\n"
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n\n"
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n\n"
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = 1;\n"
"const uint32_t m0_size = KernelUtils::Sum(local_1_size * sizeof(half));\n"
"const uint32_t m0_que_buf_num = KernelUtils::Max();\n\n"
"// const uint32_t b0_size = KernelUtils::Max(m0_size, 8192);\n"
"TBuf<TPosition::GM> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
"}\n"
"inline __aicore__ void test_kernel_general_6_nil_6_nil(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, const AscGraph0ScheduleResult1G0TilingData *t) {\n"
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n\n"
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n\n"
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = 1;\n"
"const uint32_t m0_size = KernelUtils::Sum(local_1_size * sizeof(half));\n"
"const uint32_t m0_que_buf_num = KernelUtils::Max();\n\n"
"// const uint32_t b0_size = KernelUtils::Max(m0_size, 8192);\n"
"TBuf<TPosition::GM> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
"}\n"
"extern \"C\" __global__ __aicore__ void test_kernel(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR gm_tiling_data) {\n"
" REGISTER_TILING_DEFAULT(AutofuseTilingData);\n"
" GET_TILING_DATA(t, gm_tiling_data);\n"
" if (TILING_KEY_IS(0)) {\n"
" test_kernel_general_0_nil_0_nil(x, y, workspace, &t.graph0_result0_g0_tiling_data);\n"
" test_kernel_general_3_nil_3_nil(x, y, workspace, &t.graph0_result0_g1_tiling_data);\n"
" } else if (TILING_KEY_IS(1)) {\n"
" test_kernel_general_0_nil_0_nil(x, y, workspace, &t.graph0_result0_g0_tiling_data);\n"
" test_kernel_general_4_nil_4_nil(x, y, workspace, &t.graph0_result0_g1_tiling_data);\n"
" } else if (TILING_KEY_IS(2)) {\n"
" test_kernel_general_1_nil_1_nil(x, y, workspace, &t.graph0_result0_g0_tiling_data);\n"
" test_kernel_general_3_nil_3_nil(x, y, workspace, &t.graph0_result0_g1_tiling_data);\n"
" } else if (TILING_KEY_IS(3)) {\n"
" test_kernel_general_1_nil_1_nil(x, y, workspace, &t.graph0_result0_g0_tiling_data);\n"
" test_kernel_general_4_nil_4_nil(x, y, workspace, &t.graph0_result0_g1_tiling_data);\n"
" } else if (TILING_KEY_IS(4)) {\n"
" test_kernel_general_2_nil_2_nil(x, y, workspace, &t.graph0_result0_g0_tiling_data);\n"
" test_kernel_general_3_nil_3_nil(x, y, workspace, &t.graph0_result0_g1_tiling_data);\n"
" } else if (TILING_KEY_IS(5)) {\n"
" test_kernel_general_2_nil_2_nil(x, y, workspace, &t.graph0_result0_g0_tiling_data);\n"
" test_kernel_general_4_nil_4_nil(x, y, workspace, &t.graph0_result0_g1_tiling_data);\n"
" } else if (TILING_KEY_IS(6)) {\n"
" test_kernel_general_5_nil_5_nil(x, y, workspace, &t.graph0_result1_g0_tiling_data);\n"
" } else if (TILING_KEY_IS(7)) {\n"
" test_kernel_general_6_nil_6_nil(x, y, workspace, &t.graph0_result1_g0_tiling_data);\n"
" }\n"
"}\n"});
fused_schedule_result.node_idx_to_scheduled_results[0][0].enable_group_parallel = true;
std::stringstream ss1;
std::string kernel_txt;
codegen::Kernel::GenKernelFuncByTilingKey(fused_schedule_result, ss1);
kernel_txt = ss1.str();
std::string expect_found = "const uint32_t block_offset = t->ub_size; // resue as block_offset\n"
"block_dim = block_dim >= block_offset ? "
"block_dim - block_offset : block_dim + GetBlockNum() - block_offset;\n";
EXPECT_TRUE(kernel_txt.find(expect_found) != std::string::npos);
}
TEST(CodegenKernel, Kernel_GenerateKernel_Single_ScheduleGroup) {
af::AscGraph graph("test_kernel");
af::ascir_op::Data x_op("x", graph);
x_op.ir_attr.SetIndex(0);
af::ascir_op::Output y_op("y");
y_op.ir_attr.SetIndex(0);
af::ascir_op::Load load_op("load");
af::ascir_op::Store store_op("store");
graph.AddNode(load_op);
graph.AddNode(store_op);
graph.AddNode(y_op);
x_op.y.dtype = ge::DT_FLOAT16;
load_op.x = x_op.y;
load_op.y.dtype = ge::DT_FLOAT16;
store_op.x = load_op.y;
store_op.y.dtype = ge::DT_FLOAT16;
y_op.x = store_op.y;
store_op.attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
auto x = graph.FindNode("x");
auto load = graph.FindNode("load");
auto store = graph.FindNode("store");
auto output = graph.FindNode("y");
x->attr.api.unit = af::ComputeUnit::kUnitNone;
load->attr.api.unit = af::ComputeUnit::kUnitNone;
store->attr.api.unit = af::ComputeUnit::kUnitNone;
output->attr.api.unit = af::ComputeUnit::kUnitNone;
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.tensor_id = 0;
x->outputs[0].attr.opt.merge_scope = 0;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
load->outputs[0].attr.mem.tensor_id = 1;
load->outputs[0].attr.buf.id = 0;
load->outputs[0].attr.opt.merge_scope = 0;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.tensor_id = 2;
store->outputs[0].attr.opt.merge_scope = 0;
::ascir::ScheduleGroup schedule_result0_group0 = {{af::AscGraph("test_kernel_general_0_nil_0_nil"),
af::AscGraph("test_kernel_general_1_nil_1_nil"), af::AscGraph("test_kernel_general_2_nil_2_nil")}};
::ascir::ScheduledResult schedule_result0;
schedule_result0.schedule_groups.push_back(schedule_result0_group0);
std::vector<::ascir::ScheduledResult> schedule_results;
schedule_results.push_back(schedule_result0);
for (uint32_t i = 0; i < schedule_results.size(); i++) {
for (auto schedule_group : schedule_results[i].schedule_groups) {
for (auto impl_graph : schedule_group.impl_graphs) {
impl_graph.CopyFrom(graph);
}
}
}
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.fused_graph_name = af::AscendString(graph.GetName().c_str());
fused_schedule_result.node_idx_to_scheduled_results.push_back(schedule_results);
fused_schedule_result.input_nodes.push_back(x);
fused_schedule_result.output_nodes.push_back(output);
std::stringstream ss;
std::string string;
codegen::Kernel::GenKernelFuncByTilingKey(fused_schedule_result, ss);
string = ss.str();
EXPECT_EQ(string, std::string{
"\n"
"/**\n"
" * Copyright (c) 2025 Huawei Technologies Co., Ltd.\n"
" * This program is free software, you can redistribute it and/or modify it under the terms and conditions of \n"
" * CANN Open Software License Agreement Version 2.0 (the \"License\").\n"
" * Please refer to the License for details. You may not use this file except in compliance with the License.\n"
" * THIS SOFTWARE IS PROVIDED ON AN \"AS IS\" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, \n"
" * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.\n"
" * See LICENSE in the root of the software repository for the full text of the License.\n"
" */\n"
"#ifndef __ASCENDC_API_DATACOPY_H__\n"
"#define __ASCENDC_API_DATACOPY_H__\n"
"\n"
"template <typename T>\n"
"inline __aicore__ void DataCopyPadExtend(const AscendC::LocalTensor<T> &dst, const AscendC::GlobalTensor<T> &src,\n"
" uint32_t block_count, uint32_t block_len, uint32_t src_stride,\n"
" uint32_t dst_stride) {\n"
" uint32_t align_num = ONE_BLK_SIZE / sizeof(T);\n"
" AscendC::DataCopyExtParams param;\n"
" param.blockCount = block_count;\n"
" param.blockLen = block_len * sizeof(T);\n"
" param.srcStride = src_stride * sizeof(T);\n"
" param.dstStride = dst_stride / align_num;\n"
"\n"
" AscendC::DataCopyPadExtParams<T> pad_params = {true, 0, 0, 0};\n"
" AscendC::DataCopyPad(dst, src, param, pad_params);\n"
"}\n"
"\n"
"template <typename T>\n"
"inline __aicore__ void DataCopyPadExtend(const AscendC::GlobalTensor<T> &dst, const AscendC::LocalTensor<T> &src,\n"
" uint32_t block_count, uint32_t block_len, uint32_t src_stride,\n"
" uint32_t dst_stride) {\n"
" uint32_t align_num = ONE_BLK_SIZE / sizeof(T);\n"
" AscendC::DataCopyExtParams param;\n"
" param.blockCount = block_count;\n"
" param.blockLen = block_len * sizeof(T);\n"
" param.srcStride = src_stride / align_num;\n"
" param.dstStride = dst_stride * sizeof(T);\n"
"\n"
" AscendC::DataCopyPad(dst, src, param);\n"
"}\n"
"\n"
"template <typename T>\n"
"inline __aicore__ void DataCopyExtend(const AscendC::LocalTensor<T> &dst, const AscendC::LocalTensor<T> &src,\n"
" const uint32_t size) {\n"
" AscendC::DataCopy(dst, src, size);\n"
"}\n"
"\n"
"#endif // __ASCENDC_API_DATACOPY_H__\n"
"inline __aicore__ void test_kernel_general_0_nil_0_nil(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, const AutofuseTilingData *t) {\n"
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n\n"
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n\n"
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = 1;\n"
"const uint32_t m0_size = KernelUtils::Sum(local_1_size * sizeof(half));\n"
"const uint32_t m0_que_buf_num = KernelUtils::Max();\n\n"
"// const uint32_t b0_size = KernelUtils::Max(m0_size, 8192);\n"
"TBuf<TPosition::GM> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
"}\n"
"inline __aicore__ void test_kernel_general_1_nil_1_nil(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, const AutofuseTilingData *t) {\n"
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n\n"
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n\n"
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = 1;\n"
"const uint32_t m0_size = KernelUtils::Sum(local_1_size * sizeof(half));\n"
"const uint32_t m0_que_buf_num = KernelUtils::Max();\n\n"
"// const uint32_t b0_size = KernelUtils::Max(m0_size, 8192);\n"
"TBuf<TPosition::GM> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
"}\n"
"inline __aicore__ void test_kernel_general_2_nil_2_nil(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, const AutofuseTilingData *t) {\n"
"int block_dim = GetBlockIdx();\n"
"if (block_dim >= t->block_dim) { \n"
" return;\n"
"}\n\n"
"GlobalTensor<half> global_0;\n"
"global_0.SetGlobalBuffer((__gm__ half*)x);\n"
"GlobalTensor<half> global_2;\n"
"global_2.SetGlobalBuffer((__gm__ half*)y);\n\n"
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = 1;\n"
"const uint32_t m0_size = KernelUtils::Sum(local_1_size * sizeof(half));\n"
"const uint32_t m0_que_buf_num = KernelUtils::Max();\n\n"
"// const uint32_t b0_size = KernelUtils::Max(m0_size, 8192);\n"
"TBuf<TPosition::GM> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
"}\n"
"extern \"C\" __global__ __aicore__ void test_kernel(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR gm_tiling_data) {\n"
" REGISTER_TILING_DEFAULT(AutofuseTilingData);\n"
" GET_TILING_DATA(t, gm_tiling_data);\n"
" if (TILING_KEY_IS(0)) {\n"
" test_kernel_general_0_nil_0_nil(x, y, workspace, &t);\n"
" } else if (TILING_KEY_IS(1)) {\n"
" test_kernel_general_1_nil_1_nil(x, y, workspace, &t);\n"
" } else if (TILING_KEY_IS(2)) {\n"
" test_kernel_general_2_nil_2_nil(x, y, workspace, &t);\n"
" }\n"
"}\n"});
}
TEST(CodegenKernel, DynamicInputsAndOutputs) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto z0 = graph.CreateAxis("z0", s0);
af::ascir_op::Data x_op("x", graph);
x_op.ir_attr.SetIndex(0);
af::ascir_op::Load load_op("load");
af::ascir_op::Store store_op("store");
af::ascir_op::Output y_op("y");
y_op.ir_attr.SetIndex(0);
graph.AddNode(load_op);
graph.AddNode(store_op);
graph.AddNode(y_op);
x_op.y.dtype = ge::DT_FLOAT16;
load_op.x = x_op.y;
load_op.y.dtype = ge::DT_FLOAT16;
store_op.x = load_op.y;
store_op.y.dtype = ge::DT_FLOAT16;
y_op.x = store_op.y;
y_op.y.dtype = ge::DT_FLOAT16;
store_op.attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
auto x = graph.FindNode("x");
auto load = graph.FindNode("load");
auto store = graph.FindNode("store");
auto y = graph.FindNode("y");
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.axis = {z0.id};
load->outputs[0].attr.vectorized_axis = {z0.id};
load->outputs[0].attr.vectorized_strides = {One};
load->outputs[0].attr.repeats = {z0.size};
load->outputs[0].attr.strides = {One};
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.mem.tensor_id = 1;
load->outputs[0].attr.que.id = 0;
load->outputs[0].attr.que.depth = 2;
load->outputs[0].attr.que.buf_num = 2;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.tensor_id = 2;
::ascir::ScheduledResult schedule_result0;
schedule_result0.schedule_groups.resize(1);
schedule_result0.schedule_groups[0].impl_graphs.emplace_back(graph);
std::vector<::ascir::ScheduledResult> schedule_results;
schedule_results.push_back(schedule_result0);
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.node_idx_to_scheduled_results.push_back(schedule_results);
fused_schedule_result.input_nodes.push_back(x);
fused_schedule_result.output_nodes.push_back(y);
codegen::Kernel kernel(graph.GetName());
kernel.SetUseListTensor(true);
std::string include_and_defines = codegen::Kernel::IncludeAndDefines(fused_schedule_result, "KERNEL_TYPE_AIV_ONLY" , true);
EXPECT_TRUE(include_and_defines.find("kernel_operator_list_tensor_intf.h") != std::string::npos);
auto declare = codegen::Kernel::KernelFuncDeclare("fused_graph", fused_schedule_result, true);
std::string expected = "extern \"C\" __global__ __aicore__ void "
"fused_graph(GM_ADDR inputs, GM_ADDR outputs, GM_ADDR workspace, GM_ADDR gm_tiling_data)";
EXPECT_EQ(declare, expected);
kernel.SetUseListTensor(true);
std::string global_tensor_init_result;
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
auto func_call = kernel.GenTilingFuncCall("fused_graph", "fused_graphTilingData", 0);
std::string expected_func_call =
" if (fused_graph_tiling_data.tiling_key == 0) {\n"
" fused_graph(input_tensor_desc, output_tensor_desc, workspace, &fused_graphTilingData);\n"
" }";
EXPECT_EQ(func_call, expected_func_call);
}
TEST(CodegenKernel, PackingFunctionCalls) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto z0 = graph.CreateAxis("z0", s0);
af::ascir_op::Data x_op("x", graph);
x_op.ir_attr.SetIndex(0);
af::ascir_op::Load load_op("load");
af::ascir_op::Store store_op("store");
af::ascir_op::Output y_op("y");
y_op.ir_attr.SetIndex(0);
graph.AddNode(load_op);
graph.AddNode(store_op);
graph.AddNode(y_op);
x_op.y.dtype = ge::DT_FLOAT16;
load_op.x = x_op.y;
load_op.y.dtype = ge::DT_FLOAT16;
store_op.x = load_op.y;
store_op.y.dtype = ge::DT_FLOAT16;
y_op.x = store_op.y;
y_op.y.dtype = ge::DT_FLOAT16;
auto x = graph.FindNode("x");
auto load = graph.FindNode("load");
auto store = graph.FindNode("store");
auto y = graph.FindNode("y");
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.tensor_id = 0;
x->attr.api.unit = af::ComputeUnit::kUnitNone;
y->attr.api.unit = af::ComputeUnit::kUnitNone;
load->outputs[0].attr.axis = {z0.id};
load->outputs[0].attr.vectorized_axis = {z0.id};
load->outputs[0].attr.vectorized_strides = {One};
load->outputs[0].attr.repeats = {z0.size};
load->outputs[0].attr.strides = {One};
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.mem.tensor_id = 1;
load->outputs[0].attr.que.id = 0;
load->outputs[0].attr.mem.reuse_id = 0;
load->outputs[0].attr.que.depth = 2;
load->outputs[0].attr.que.buf_num = 2;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.tensor_id = 2;
::ascir::ScheduledResult schedule_result0;
schedule_result0.schedule_groups.resize(6);
schedule_result0.enable_group_parallel = true;
for (auto &schedule_group : schedule_result0.schedule_groups) {
schedule_group.impl_graphs.emplace_back(graph);
}
std::vector<::ascir::ScheduledResult> schedule_results;
schedule_results.push_back(schedule_result0);
schedule_results.push_back(schedule_result0);
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.fused_graph_name = af::AscendString(graph.GetName().c_str());
fused_schedule_result.input_nodes.push_back(x);
fused_schedule_result.output_nodes.push_back(y);
fused_schedule_result.node_idx_to_scheduled_results.push_back(schedule_results);
std::stringstream ss;
std::string result;
auto ret = codegen::Kernel::GenKernelFuncByTilingKey(fused_schedule_result, ss, true);
result = ss.str();
EXPECT_EQ(ret, SUCCESS);
std::cout << result;
EXPECT_TRUE(result.find("packed_functions_820000000(input_tensor_desc, output_tensor_desc, workspace, t)")
!= std::string::npos);
EXPECT_TRUE(result.find("packed_functions_820000001(input_tensor_desc, output_tensor_desc, workspace, t)")
!= std::string::npos);
EXPECT_TRUE(result.find("packed_functions_820000002(input_tensor_desc, output_tensor_desc, workspace, t)")
!= std::string::npos);
EXPECT_TRUE(result.find("packed_functions_820000003(input_tensor_desc, output_tensor_desc, workspace, t)")
!= std::string::npos);
EXPECT_TRUE(result.find("packed_functions_820000004(input_tensor_desc, output_tensor_desc, workspace, t)")
== std::string::npos);
EXPECT_TRUE(result.find("if TILING_KEY_VAR == 20000000") != std::string::npos);
EXPECT_TRUE(result.find("if TILING_KEY_VAR == 20000001") != std::string::npos);
EXPECT_TRUE(result.find("if TILING_KEY_VAR == 20000002") != std::string::npos);
EXPECT_TRUE(result.find("if TILING_KEY_VAR == 20000003") != std::string::npos);
EXPECT_TRUE(result.find("if TILING_KEY_VAR == 20000004") == std::string::npos);
ss.clear();
ret = codegen::Kernel::GenKernelFuncByTilingKey(fused_schedule_result, ss, false);
EXPECT_EQ(ret, SUCCESS);
result = ss.str();
EXPECT_TRUE(result.find("packed_functions_820000000(x, y, workspace, t);") != std::string::npos);
EXPECT_TRUE(result.find("packed_functions_820000001(x, y, workspace, t);") != std::string::npos);
EXPECT_TRUE(result.find("packed_functions_820000002(x, y, workspace, t);") != std::string::npos);
EXPECT_TRUE(result.find("packed_functions_820000003(x, y, workspace, t);") != std::string::npos);
EXPECT_TRUE(result.find("packed_functions_820000004(x, y, workspace, t);") == std::string::npos);
EXPECT_TRUE(result.find("if TILING_KEY_VAR == 20000000") != std::string::npos);
EXPECT_TRUE(result.find("if TILING_KEY_VAR == 20000001") != std::string::npos);
EXPECT_TRUE(result.find("if TILING_KEY_VAR == 20000002") != std::string::npos);
EXPECT_TRUE(result.find("if TILING_KEY_VAR == 20000003") != std::string::npos);
EXPECT_TRUE(result.find("if TILING_KEY_VAR == 20000004") == std::string::npos);
}
TEST(CodegenKernel, TPipe_AllocTmpBuf) {
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.mem.position = af::Position::kPositionVecIn;
tensor.attr.mem.tensor_id = 0;
tensor.attr.mem.alloc_type = af::AllocType::kAllocTypeBuffer;
tensor.attr.buf.id = 1;
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
tpipe.CollectQues(graph);
tpipe.AddTensor(tensor);
auto buf = tpipe.bufs.find(tensor.attr.buf.id);
ASSERT_NE(buf, tpipe.bufs.end());
EXPECT_EQ(tpipe.AllocTmpBuf(buf->second), "LocalTensor<uint8_t> tmp_buf_1 = b1.Get<uint8_t>();\n");
}
TEST(CodegenKernel, GenDuplicateBufAlloc) {
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
af::ascir_op::Data y("y", graph);
auto node_x = graph.FindNode("x");
auto node_y = graph.FindNode("y");
node_x->attr.tmp_buffers = {{af::Symbol(1024), -1}, {af::Symbol(1024), 0}};
node_y->attr.tmp_buffers = {{af::Symbol(1024), -1}, {af::Symbol(1024), 0}};
std::set<std::pair<std::string, std::string>> pre_api_extract_dup = {{"1", "float"}};
codegen::Tiler tiler;
codegen::TPipe tpipe("tpipe", tiler);
std::string result;
result = tpipe.GenDuplicateBufAlloc(pre_api_extract_dup);
EXPECT_EQ(result, "TBuf<TPosition::VECCALC> builtin_tmp_buffer_1;\n"
"tpipe.InitBuffer(builtin_tmp_buffer_1, ONE_BLK_SIZE);\n"
"LocalTensor<uint8_t> builtin_tmp_buf_1 = builtin_tmp_buffer_1.Get<uint8_t>();\n"
"LocalTensor<float> local_blk_tensor_of_float_1 = builtin_tmp_buf_1.template ReinterpretCast<float>();\n"
"Duplicate(local_blk_tensor_of_float_1[0], (float)1.0, ONE_BLK_SIZE / sizeof(float));\n");
}
TEST(CodegenKernel, DichotomyReduceApiTest_RAPatternReduceMean) {
af::AscGraph graph("test");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
auto z2 = graph.CreateAxis("z2", s2);
Data x_("x");
graph.AddNode(x_);
x_.y.dtype = ge::DT_FLOAT;
Load load_("load");
graph.AddNode(load_);
load_.x = x_.y;
load_.attr.sched.axis = {z0.id, z1.id, z2.id};
*load_.y.axis = {z0.id, z1.id, z2.id};
*load_.y.repeats = {s0, s1, s2};
*load_.y.strides = {s1 * s2, s2, One};
af::ascir_op::Mean rmean_("rmean");
graph.AddNode(rmean_);
rmean_.attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}, {{af::Symbol(8192), 0}, af::MemAttr(), 1}};
rmean_.x = load_.y;
rmean_.attr.sched.axis = {z0.id, z1.id, z2.id};
*rmean_.y.axis = {z0.id, z1.id, z2.id};
*rmean_.y.repeats = {One, One, s2};
*rmean_.y.strides = {Zero, Zero, One};
Store store_("store");
graph.AddNode(store_);
store_.x = rmean_.y;
store_.attr.sched.axis = {z0.id, z1.id, z2.id};
*store_.y.axis = {z0.id, z1.id, z2.id};
*store_.y.repeats = {One, One, s2};
*store_.y.strides = {Zero, Zero, One};
Output y_("y");
graph.AddNode(y_);
y_.x = store_.y;
y_.y.dtype = ge::DT_FLOAT;
auto x = graph.FindNode("x");
x->attr.api.compute_type = af::ComputeType::kComputeInvalid;
x->attr.api.type = af::ApiType::kAPITypeBuffer;
x->attr.api.unit = af::ComputeUnit::kUnitNone;
auto load = graph.FindNode("load");
load->outputs[0].attr.dtype = ge::DT_FLOAT;
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
auto rmean = graph.FindNode("rmean");
rmean->attr.api.compute_type = af::ComputeType::kComputeReduce;
rmean->outputs[0].attr.dtype = ge::DT_FLOAT;
rmean->attr.api.type = af::ApiType::kAPITypeCompute;
rmean->attr.api.unit = af::ComputeUnit::kUnitVector;
auto store = graph.FindNode("store");
store->attr.api.compute_type = af::ComputeType::kComputeStore;
store->outputs[0].attr.dtype = ge::DT_FLOAT;
store->attr.api.type = af::ApiType::kAPITypeCompute;
store->attr.api.unit = af::ComputeUnit::kUnitMTE2;
auto y = graph.FindNode("y");
y->attr.api.compute_type = af::ComputeType::kComputeInvalid;
y->attr.api.type = af::ApiType::kAPITypeBuffer;
y->attr.api.unit = af::ComputeUnit::kUnitNone;
auto z0_id = load->attr.sched.axis[0];
auto z1_id = load->attr.sched.axis[1];
auto z2_id = load->attr.sched.axis[2];
auto [z2T, z2t] = graph.TileSplit(z2_id);
auto [z2TB, z2Tb] = graph.BlockSplit(z2T->id);
auto [z1T, z1t] = graph.TileSplit(z1_id);
auto z0z1T = graph.MergeAxis({z0_id, z1T->id});
for (auto node : graph.GetAllNodes()) {
if (IsOps<Data>(node) || IsOps<Output>(node)) {
continue;
}
graph.ApplySplit(node, z2T->id, z2t->id);
graph.ApplySplit(node, z2TB->id, z2Tb->id);
graph.ApplySplit(node, z1T->id, z1t->id);
graph.ApplyMerge(node, z0z1T->id);
graph.ApplyReorder(node, {z2TB->id, z2Tb->id, z0z1T->id, z1t->id, z2t->id});
}
vector<af::Expression> vectorized_strides{One, One};
load->attr.sched.loop_axis = z0z1T->id;
load->outputs[0].attr.vectorized_axis = {z1t->id, z2t->id};
auto size = ge::GetSizeByDataType(ge::DT_FLOAT);
vectorized_strides[0] = af::sym::Align(graph.FindAxis(z2t->id)->size, 32 / size);
load->outputs[0].attr.vectorized_strides = vectorized_strides;
rmean->attr.sched.loop_axis = z0z1T->id;
rmean->outputs[0].attr.vectorized_axis = {z1t->id, z2t->id};
rmean->outputs[0].attr.vectorized_strides = {Zero, One};
store->attr.sched.loop_axis = z2Tb->id;
store->outputs[0].attr.vectorized_axis = {z1t->id, z2t->id};
store->outputs[0].attr.vectorized_strides = {Zero, One};
x->outputs[0].attr.mem.tensor_id = 0;
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.hardware = af::MemHardware::kMemHardwareGM;
x->outputs[0].attr.mem.position = af::Position::kPositionGM;
x->outputs[0].attr.buf.id = af::kIdNone;
x->outputs[0].attr.que.id = af::kIdNone;
x->outputs[0].attr.opt.ref_tensor = af::kIdNone;
x->outputs[0].attr.opt.merge_scope = af::kIdNone;
load->outputs[0].attr.mem.tensor_id = 1;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.mem.hardware = af::MemHardware::kMemHardwareUB;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.buf.id = af::kIdNone;
load->outputs[0].attr.que.id = 0;
load->outputs[0].attr.mem.reuse_id = 0;
load->outputs[0].attr.que.depth = 2;
load->outputs[0].attr.que.buf_num = 2;
load->outputs[0].attr.opt.ref_tensor = af::kIdNone;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
rmean->outputs[0].attr.mem.tensor_id = 2;
rmean->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
rmean->outputs[0].attr.mem.hardware = af::MemHardware::kMemHardwareUB;
rmean->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
rmean->outputs[0].attr.buf.id = af::kIdNone;
rmean->outputs[0].attr.que.id = 1;
rmean->outputs[0].attr.mem.reuse_id = 1;
rmean->outputs[0].attr.que.depth = 2;
rmean->outputs[0].attr.que.buf_num = 2;
rmean->outputs[0].attr.opt.ref_tensor = af::kIdNone;
rmean->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto impl = ascgen_utils::GetAscIrCodegenImpl(rmean->GetType());
std::vector<std::unique_ptr<af::TmpBufDesc>> buffers = impl->CalcTmpBufSize(*rmean);
for (auto &buf_desc : buffers) {
if (buf_desc != nullptr) {
af::TmpBuffer temp_buffer;
temp_buffer.buf_desc = std::move(*buf_desc);
rmean->attr.tmp_buffers.emplace_back(temp_buffer);
}
}
store->outputs[0].attr.mem.tensor_id = 3;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.hardware = af::MemHardware::kMemHardwareGM;
store->outputs[0].attr.mem.position = af::Position::kPositionGM;
store->outputs[0].attr.buf.id = af::kIdNone;
store->outputs[0].attr.que.id = af::kIdNone;
store->outputs[0].attr.opt.ref_tensor = af::kIdNone;
store->outputs[0].attr.opt.merge_scope = af::kIdNone;
::ascir::ScheduleGroup schedule_result0_group0 = {{af::AscGraph("test_kernel_general_0_nil_0_nil")}};
::ascir::ScheduledResult schedule_result0;
schedule_result0.schedule_groups.push_back(schedule_result0_group0);
std::vector<::ascir::ScheduledResult> schedule_results;
schedule_results.push_back(schedule_result0);
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.fused_graph_name = af::AscendString(graph.GetName().c_str());
fused_schedule_result.node_idx_to_scheduled_results.push_back(schedule_results);
for (auto impl_graph : schedule_result0_group0.impl_graphs) {
impl_graph.CopyFrom(graph);
for (auto node : impl_graph.GetAllNodes()) {
if (node->GetType() == "Data") {
int64_t index = -1;
if (node->attr.ir_attr->GetAttrValue("index", index) == ge::GRAPH_FAILED) {
auto &attr = reinterpret_cast<af::AscDataIrAttrDef &>(*node->attr.ir_attr);
attr.SetIndex(static_cast<int64_t>(fused_schedule_result.input_nodes.size()));
}
fused_schedule_result.input_nodes.emplace_back(node);
} else if (node->GetType() == "Output") {
int64_t index = -1;
if (node->attr.ir_attr->GetAttrValue("index", index) == ge::GRAPH_FAILED) {
auto &attr = reinterpret_cast<af::AscDataIrAttrDef &>(*node->attr.ir_attr);
attr.SetIndex(static_cast<int64_t>(fused_schedule_result.output_nodes.size()));
}
fused_schedule_result.output_nodes.emplace_back(node);
}
}
}
std::stringstream ss;
std::string string;
Status status = codegen::Kernel::GenKernelFuncByTilingKey(fused_schedule_result, ss);
string = ss.str();
EXPECT_EQ(status, ge::SUCCESS);
}
TEST(CodegenKernel, DichotomyReduceApiTest_MultiMerge_RAPatternReduceMean) {
af::AscGraph graph("test");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
auto s3 = graph.CreateSizeVar("s3");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
auto z2 = graph.CreateAxis("z2", s2);
auto z3 = graph.CreateAxis("z3", s3);
Data x_("x");
graph.AddNode(x_);
x_.y.dtype = ge::DT_FLOAT;
Load load_("load");
graph.AddNode(load_);
load_.x = x_.y;
load_.attr.sched.axis = {z0.id, z1.id, z2.id, z3.id};
*load_.y.axis = {z0.id, z1.id, z2.id, z3.id};
*load_.y.repeats = {s0, s1, s2, s3};
*load_.y.strides = {s1 * s2 * s3, s2 * s3, s3, One};
af::ascir_op::Mean rmean_("rmean");
graph.AddNode(rmean_);
rmean_.attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}, {{af::Symbol(8192), 0}, af::MemAttr(), 1}};
rmean_.x = load_.y;
rmean_.attr.sched.axis = {z0.id, z1.id, z2.id, z3.id};
*rmean_.y.axis = {z0.id, z1.id, z2.id, z3.id};
*rmean_.y.repeats = {One, One, One, s3};
*rmean_.y.strides = {Zero, Zero, Zero, One};
Store store_("store");
graph.AddNode(store_);
store_.x = rmean_.y;
store_.attr.sched.axis = {z0.id, z1.id, z2.id, z3.id};
*store_.y.axis = {z0.id, z1.id, z2.id, z3.id};
*store_.y.repeats = {One, One, One, s3};
*store_.y.strides = {Zero, Zero, Zero, One};
Output y_("y");
graph.AddNode(y_);
y_.x = store_.y;
y_.y.dtype = ge::DT_FLOAT;
auto x = graph.FindNode("x");
x->attr.api.compute_type = af::ComputeType::kComputeInvalid;
x->attr.api.type = af::ApiType::kAPITypeBuffer;
x->attr.api.unit = af::ComputeUnit::kUnitNone;
auto load = graph.FindNode("load");
load->outputs[0].attr.dtype = ge::DT_FLOAT;
load->attr.api.compute_type = af::ComputeType::kComputeLoad;
load->attr.api.type = af::ApiType::kAPITypeCompute;
load->attr.api.unit = af::ComputeUnit::kUnitMTE2;
auto rmean = graph.FindNode("rmean");
rmean->attr.api.compute_type = af::ComputeType::kComputeReduce;
rmean->outputs[0].attr.dtype = ge::DT_FLOAT;
rmean->attr.api.type = af::ApiType::kAPITypeCompute;
rmean->attr.api.unit = af::ComputeUnit::kUnitVector;
auto store = graph.FindNode("store");
store->attr.api.compute_type = af::ComputeType::kComputeStore;
store->outputs[0].attr.dtype = ge::DT_FLOAT;
store->attr.api.type = af::ApiType::kAPITypeCompute;
store->attr.api.unit = af::ComputeUnit::kUnitMTE2;
auto y = graph.FindNode("y");
y->attr.api.compute_type = af::ComputeType::kComputeInvalid;
y->attr.api.type = af::ApiType::kAPITypeBuffer;
y->attr.api.unit = af::ComputeUnit::kUnitNone;
auto z0_id = load->attr.sched.axis[0];
auto z1_id = load->attr.sched.axis[1];
auto z2_id = load->attr.sched.axis[2];
auto z3_id = load->attr.sched.axis[3];
auto [z2T, z2t] = graph.TileSplit(z2_id);
auto [z3T, z3t] = graph.TileSplit(z3_id);
auto [z3TB, z3Tb] = graph.BlockSplit(z3T->id);
auto z0z1 = graph.MergeAxis({z0_id, z1_id});
auto z0z1z2T = graph.MergeAxis({z0z1->id, z2T->id});
for (auto node : graph.GetAllNodes()) {
if (IsOps<Data>(node) || IsOps<Output>(node)) {
continue;
}
graph.ApplySplit(node, z2T->id, z2t->id);
graph.ApplySplit(node, z3T->id, z3t->id);
graph.ApplySplit(node, z3TB->id, z3Tb->id);
graph.ApplyMerge(node, z0z1->id);
graph.ApplyMerge(node, z0z1z2T->id);
graph.ApplyReorder(node, {z3TB->id, z3Tb->id, z0z1z2T->id, z2t->id, z3t->id});
}
vector<af::Expression> vectorized_strides{One, One};
load->attr.sched.loop_axis = z0z1z2T->id;
load->outputs[0].attr.vectorized_axis = {z2t->id, z3t->id};
auto size = ge::GetSizeByDataType(ge::DT_FLOAT);
vectorized_strides[0] = af::sym::Align(graph.FindAxis(z2t->id)->size, 32 / size);
load->outputs[0].attr.vectorized_strides = vectorized_strides;
rmean->attr.sched.loop_axis = z0z1z2T->id;
rmean->outputs[0].attr.vectorized_axis = {z2t->id, z3t->id};
rmean->outputs[0].attr.vectorized_strides = {Zero, One};
store->attr.sched.loop_axis = z3Tb->id;
store->outputs[0].attr.vectorized_axis = {z2t->id, z3t->id};
store->outputs[0].attr.vectorized_strides = {Zero, One};
x->outputs[0].attr.mem.tensor_id = 0;
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.hardware = af::MemHardware::kMemHardwareGM;
x->outputs[0].attr.mem.position = af::Position::kPositionGM;
x->outputs[0].attr.buf.id = af::kIdNone;
x->outputs[0].attr.que.id = af::kIdNone;
x->outputs[0].attr.opt.ref_tensor = af::kIdNone;
x->outputs[0].attr.opt.merge_scope = af::kIdNone;
load->outputs[0].attr.mem.tensor_id = 1;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.mem.hardware = af::MemHardware::kMemHardwareUB;
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.buf.id = af::kIdNone;
load->outputs[0].attr.que.id = 0;
load->outputs[0].attr.mem.reuse_id = 0;
load->outputs[0].attr.que.depth = 2;
load->outputs[0].attr.que.buf_num = 2;
load->outputs[0].attr.opt.ref_tensor = af::kIdNone;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
rmean->outputs[0].attr.mem.tensor_id = 2;
rmean->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
rmean->outputs[0].attr.mem.hardware = af::MemHardware::kMemHardwareUB;
rmean->outputs[0].attr.mem.position = af::Position::kPositionVecOut;
rmean->outputs[0].attr.buf.id = af::kIdNone;
rmean->outputs[0].attr.que.id = 1;
rmean->outputs[0].attr.mem.reuse_id = 1;
rmean->outputs[0].attr.que.depth = 2;
rmean->outputs[0].attr.que.buf_num = 2;
rmean->outputs[0].attr.opt.ref_tensor = af::kIdNone;
rmean->outputs[0].attr.opt.merge_scope = af::kIdNone;
auto impl = ascgen_utils::GetAscIrCodegenImpl(rmean->GetType());
std::vector<std::unique_ptr<af::TmpBufDesc>> buffers = impl->CalcTmpBufSize(*rmean);
for (auto &buf_desc : buffers) {
if (buf_desc != nullptr) {
af::TmpBuffer temp_buffer;
temp_buffer.buf_desc = std::move(*buf_desc);
rmean->attr.tmp_buffers.emplace_back(temp_buffer);
}
}
store->outputs[0].attr.mem.tensor_id = 3;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.hardware = af::MemHardware::kMemHardwareGM;
store->outputs[0].attr.mem.position = af::Position::kPositionGM;
store->outputs[0].attr.buf.id = af::kIdNone;
store->outputs[0].attr.que.id = af::kIdNone;
store->outputs[0].attr.opt.ref_tensor = af::kIdNone;
store->outputs[0].attr.opt.merge_scope = af::kIdNone;
::ascir::ScheduleGroup schedule_result0_group0 = {{af::AscGraph("test_kernel_general_0_nil_0_nil")}};
::ascir::ScheduledResult schedule_result0;
schedule_result0.schedule_groups.push_back(schedule_result0_group0);
std::vector<::ascir::ScheduledResult> schedule_results;
schedule_results.push_back(schedule_result0);
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.fused_graph_name = af::AscendString(graph.GetName().c_str());
fused_schedule_result.node_idx_to_scheduled_results.push_back(schedule_results);
for (auto impl_graph : schedule_result0_group0.impl_graphs) {
impl_graph.CopyFrom(graph);
for (auto node : impl_graph.GetAllNodes()) {
if (node->GetType() == "Data") {
int64_t index = -1;
if (node->attr.ir_attr->GetAttrValue("index", index) == ge::GRAPH_FAILED) {
auto &attr = reinterpret_cast<af::AscDataIrAttrDef &>(*node->attr.ir_attr);
attr.SetIndex(static_cast<int64_t>(fused_schedule_result.input_nodes.size()));
}
fused_schedule_result.input_nodes.emplace_back(node);
} else if (node->GetType() == "Output") {
int64_t index = -1;
if (node->attr.ir_attr->GetAttrValue("index", index) == ge::GRAPH_FAILED) {
auto &attr = reinterpret_cast<af::AscDataIrAttrDef &>(*node->attr.ir_attr);
attr.SetIndex(static_cast<int64_t>(fused_schedule_result.output_nodes.size()));
}
fused_schedule_result.output_nodes.emplace_back(node);
}
}
}
std::stringstream ss;
std::string string;
Status status = codegen::Kernel::GenKernelFuncByTilingKey(fused_schedule_result, ss);
string = ss.str();
EXPECT_EQ(status, ge::SUCCESS);
}
TEST(CodegenKernel, GenerateTQueBind) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto z0 = graph.CreateAxis("z0", s0);
af::ascir_op::Data x_op("x", graph);
x_op.ir_attr.SetIndex(0);
af::ascir_op::Load load_op("load");
af::ascir_op::Store store_op("store");
af::ascir_op::Output y_op("y");
af::ascir_op::Output y_op_1("y_1");
y_op.ir_attr.SetIndex(0);
y_op_1.ir_attr.SetIndex(1);
graph.AddNode(load_op);
graph.AddNode(store_op);
graph.AddNode(y_op);
graph.AddNode(y_op_1);
x_op.y.dtype = ge::DT_FLOAT16;
load_op.x = x_op.y;
load_op.y.dtype = ge::DT_FLOAT16;
store_op.x = load_op.y;
store_op.y.dtype = ge::DT_FLOAT16;
store_op.attr.tmp_buffers = {{{af::Symbol(8192), -1}, af::MemAttr(), 0}};
y_op.x = store_op.y;
y_op_1.x = store_op.y;
y_op.y.dtype = ge::DT_FLOAT16;
y_op_1.y.dtype = ge::DT_FLOAT16;
auto x = graph.FindNode("x");
auto load = graph.FindNode("load");
auto store = graph.FindNode("store");
auto y = graph.FindNode("y");
auto y_1 = graph.FindNode("y_1");
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.tensor_id = 0;
load->outputs[0].attr.axis = {z0.id};
load->outputs[0].attr.vectorized_axis = {z0.id};
load->outputs[0].attr.vectorized_strides = {One};
load->outputs[0].attr.repeats = {z0.size};
load->outputs[0].attr.strides = {One};
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.mem.tensor_id = 1;
load->outputs[0].attr.que.id = 0;
load->outputs[0].attr.que.depth = 2;
load->outputs[0].attr.que.buf_num = 2;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.tensor_id = 2;
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.input_nodes.push_back(x);
fused_schedule_result.output_nodes.push_back(y);
fused_schedule_result.output_nodes.push_back(y_1);
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
std::string result;
kernel.LocalTensorQueBufAlloc(result, graph);
EXPECT_EQ(result, std::string{
"TPipe tpipe;\n\n"
"const uint32_t local_1_size = KernelUtils::BlkAlign<half>((t->s0 - 1) + 1);\n"
"const uint32_t local_1_que_buf_num = 2;\n\n"
"// const uint32_t q0_size = KernelUtils::Max(local_1_size * sizeof(half));\n"
"const uint32_t q0_buf_num = KernelUtils::Max(2);\n"
"TQueBind<TPosition::VECIN, TPosition::VECOUT, 1> q0;\n"
"// tpipe.InitBuffer(q0, q0_buf_num, KernelUtils::BlkAlign<uint8_t>(q0_size));\n"
"tpipe.InitBuffer(q0, q0_buf_num, t->q0_size);\n"
"// const uint32_t b0_size = KernelUtils::Max(8192);\n"
"TBuf<TPosition::VECCALC> b0;\n"
"// tpipe.InitBuffer(b0, KernelUtils::BlkAlign<uint8_t>(b0_size));\n"
"tpipe.InitBuffer(b0, t->b0_size);\n"
"LocalTensor<uint8_t> b0_buf = b0.Get<uint8_t>();\n"
"LocalTensor<uint8_t> tmp_buf_0 = b0.Get<uint8_t>();\n\n\n\n"
});
}
TEST(CodegenKernel, CalculateWorkspaceSize_Test) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto s2 = graph.CreateSizeVar("s2");
auto s3 = graph.CreateSizeVar("s3");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
Store store_op1("store1");
Store store_op2("store2");
Workspace workspace_op1("workspace1");
Workspace workspace_op2("workspace2");
Load load_op1("load1");
Load load_op2("load2");
graph.AddNode(store_op1);
graph.AddNode(store_op2);
graph.AddNode(workspace_op1);
graph.AddNode(workspace_op2);
graph.AddNode(load_op1);
graph.AddNode(load_op2);
store_op1.y.dtype = ge::DT_FLOAT16;
*store_op1.y.axis = {z0.id, z1.id};
workspace_op1.x = store_op1.y;
workspace_op1.y.dtype = ge::DT_FLOAT16;
*workspace_op1.y.axis = {z0.id, z1.id};
store_op2.y.dtype = ge::DT_FLOAT16;
*store_op2.y.axis = {z0.id, z1.id};
workspace_op2.x = store_op2.y;
workspace_op2.y.dtype = ge::DT_FLOAT16;
*workspace_op2.y.axis = {z0.id, z1.id};
load_op1.x = workspace_op1.y;
load_op1.y.dtype = ge::DT_FLOAT16;
*load_op1.y.axis = {z0.id, z1.id};
load_op2.x = workspace_op2.y;
load_op2.y.dtype = ge::DT_FLOAT16;
*load_op2.y.axis = {z0.id, z1.id};
auto load1 = graph.FindNode("load1");
auto load2 = graph.FindNode("load2");
auto workspace1 = graph.FindNode("workspace1");
auto workspace2 = graph.FindNode("workspace2");
auto store1 = graph.FindNode("store1");
auto store2 = graph.FindNode("store2");
store1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store1->outputs[0].attr.mem.tensor_id = 1;
store1->outputs[0].attr.repeats = {s0, s1, s2, s3};
store1->outputs[0].attr.strides = {s1 * s2 * s3, s2 * s3, s3, One};
store2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store2->outputs[0].attr.mem.tensor_id = 2;
store2->outputs[0].attr.repeats = {s0, s1, s2};
store2->outputs[0].attr.strides = {s1 * s2, s2, One};
workspace1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
workspace1->outputs[0].attr.mem.tensor_id = 1;
workspace2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
workspace2->outputs[0].attr.mem.tensor_id = 2;
load1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
load1->outputs[0].attr.mem.tensor_id = 3;
load1->outputs[0].attr.repeats = {s2, s3};
load1->outputs[0].attr.strides = {s3, One};
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
load2->outputs[0].attr.mem.tensor_id = 4;
load2->outputs[0].attr.repeats = {s0, s1};
load2->outputs[0].attr.strides = {s1, One};
workspace1->attr.api.compute_type = af::ComputeType::kComputeInvalid;
workspace2->attr.api.compute_type = af::ComputeType::kComputeInvalid;
std::vector<af::AscNodePtr> workspace_nodes = {workspace1, workspace2};
auto ws_size = ascgen_utils::CalculateWorkspaceSize(workspace_nodes);
EXPECT_EQ(ToString(ws_size), "(Max(Max(0, (2 * Max(Max(1, s1), (s0 * s1)))), (2 * Max(Max(Max(1, s2), (s1 * s2)), (s0 * s1 * s2)))) + Max(Max(0, (2 * Max(Max(Max(Max(1, s3), (s2 * s3)), (s0 * s1 * s2 * s3)), (s1 * s2 * s3)))), (2 * Max(Max(1, s3), (s2 * s3)))))");
}
TEST(CodegenKernel, CalculateWorkspaceSize_Test2) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar(4);
auto s1 = graph.CreateSizeVar(8);
auto s2 = graph.CreateSizeVar(16);
auto s3 = graph.CreateSizeVar(32);
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
Store store_op1("store1");
Workspace workspace_op1("workspace1");
Load load_op1("load1");
Load load_op2("load2");
graph.AddNode(store_op1);
graph.AddNode(workspace_op1);
graph.AddNode(load_op1);
graph.AddNode(load_op2);
store_op1.y.dtype = ge::DT_FLOAT16;
*store_op1.y.axis = {z0.id, z1.id};
workspace_op1.x = store_op1.y;
workspace_op1.y.dtype = ge::DT_FLOAT16;
*workspace_op1.y.axis = {z0.id, z1.id};
load_op1.x = workspace_op1.y;
load_op1.y.dtype = ge::DT_FLOAT16;
*load_op1.y.axis = {z0.id, z1.id};
load_op2.x = workspace_op1.y;
load_op2.y.dtype = ge::DT_FLOAT16;
*load_op2.y.axis = {z0.id, z1.id};
auto load1 = graph.FindNode("load1");
auto load2 = graph.FindNode("load2");
auto workspace1 = graph.FindNode("workspace1");
auto store1 = graph.FindNode("store1");
store1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store1->outputs[0].attr.mem.tensor_id = 1;
workspace1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
workspace1->outputs[0].attr.mem.tensor_id = 1;
load1->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
load1->outputs[0].attr.mem.tensor_id = 3;
load2->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
load2->outputs[0].attr.mem.tensor_id = 4;
workspace1->attr.api.compute_type = af::ComputeType::kComputeInvalid;
{
store1->outputs[0].attr.repeats = {s0, s1, s2, s3};
store1->outputs[0].attr.strides= {s1 * s2 * s3, s2 * s3, s3, One};
load1->outputs[0].attr.repeats = {s1, s3};
load1->outputs[0].attr.strides = {s3, One};
load2->outputs[0].attr.repeats = {s2, s3};
load2->outputs[0].attr.strides = {s3, One};
std::vector<af::AscNodePtr> workspace_nodes = {workspace1};
auto ws_size = ascgen_utils::CalculateWorkspaceSize(workspace_nodes);
EXPECT_EQ(ToString(ws_size), "32768");
}
{
store1->outputs[0].attr.repeats = {s1, s3};
store1->outputs[0].attr.strides = {s3, One};
load1->outputs[0].attr.repeats = {s1, s3};
load1->outputs[0].attr.strides = {s3, One};
load2->outputs[0].attr.repeats = {s0, s1, s2, s3};
load2->outputs[0].attr.strides = {s1 * s2 * s3, s2 * s3, s3, One};
std::vector<af::AscNodePtr> workspace_nodes = {workspace1};
auto ws_size = ascgen_utils::CalculateWorkspaceSize(workspace_nodes);
EXPECT_EQ(ToString(ws_size), "32768");
}
{
store1->outputs[0].attr.repeats = {s1, s3};
store1->outputs[0].attr.strides = {s3, One};
load1->outputs[0].attr.repeats = {s1, s2, s3};
load1->outputs[0].attr.strides = {s2 * s3, s3, One};
load2->outputs[0].attr.repeats = {s1, s3};
load2->outputs[0].attr.strides = {s3, One};
std::vector<af::AscNodePtr> workspace_nodes = {workspace1};
auto ws_size = ascgen_utils::CalculateWorkspaceSize(workspace_nodes);
EXPECT_EQ(ToString(ws_size), "8192");
}
}
TEST(CodegenKernel, CalculateWorkspaceSize_WarnTest) {
af::AscGraph graph("test_graph");
Workspace workspace_op1("workspace1");
Load load_op1("load1");
graph.AddNode(workspace_op1);
graph.AddNode(load_op1);
auto load1 = graph.FindNode("load1");
{
std::vector<af::AscNodePtr> workspace_nodes = {load1};
auto ws_size = ascgen_utils::CalculateWorkspaceSize(workspace_nodes);
EXPECT_EQ(ToString(ws_size), "0");
}
{
std::vector<af::AscNodePtr> workspace_nodes = {nullptr, nullptr};
auto ws_size = ascgen_utils::CalculateWorkspaceSize(workspace_nodes);
EXPECT_EQ(ToString(ws_size), "0");
}
}
TEST(CodegenKernel, EmptyTensorKernel) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto z0 = graph.CreateAxis("z0", Zero);
af::ascir_op::Data x_op("x", graph);
x_op.ir_attr.SetIndex(0);
af::ascir_op::Load load_op("load");
af::ascir_op::Store store_op("store");
af::ascir_op::Output y_op("y");
y_op.ir_attr.SetIndex(0);
graph.AddNode(load_op);
graph.AddNode(store_op);
graph.AddNode(y_op);
load_op.x = x_op.y;
load_op.y.dtype = ge::DT_FLOAT16;
store_op.x = load_op.y;
y_op.x = store_op.y;
auto x = graph.FindNode("x");
auto load = graph.FindNode("load");
auto store = graph.FindNode("store");
auto y = graph.FindNode("y");
x->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
x->outputs[0].attr.mem.tensor_id = 0;
x->attr.api.unit = af::ComputeUnit::kUnitNone;
y->attr.api.unit = af::ComputeUnit::kUnitNone;
load->outputs[0].attr.axis = {z0.id};
load->outputs[0].attr.vectorized_axis = {z0.id};
load->outputs[0].attr.vectorized_strides = {One};
load->outputs[0].attr.repeats = {z0.size};
load->outputs[0].attr.strides = {One};
load->outputs[0].attr.mem.position = af::Position::kPositionVecIn;
load->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeQueue;
load->outputs[0].attr.mem.tensor_id = 1;
load->outputs[0].attr.que.id = 0;
load->outputs[0].attr.mem.reuse_id = 0;
load->outputs[0].attr.que.depth = 2;
load->outputs[0].attr.que.buf_num = 2;
load->outputs[0].attr.opt.merge_scope = af::kIdNone;
store->outputs[0].attr.mem.alloc_type = af::AllocType::kAllocTypeGlobal;
store->outputs[0].attr.mem.tensor_id = 2;
store->outputs[0].attr.axis = {z0.id};
store->outputs[0].attr.vectorized_axis = {z0.id};
store->outputs[0].attr.vectorized_strides = {One};
store->outputs[0].attr.repeats = {z0.size};
store->outputs[0].attr.strides = {One};
::ascir::ScheduledResult schedule_result0;
schedule_result0.schedule_groups.resize(1);
for (auto &schedule_group : schedule_result0.schedule_groups) {
schedule_group.impl_graphs.emplace_back(graph);
}
std::vector<::ascir::ScheduledResult> schedule_results;
schedule_results.push_back(schedule_result0);
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.fused_graph_name = af::AscendString(graph.GetName().c_str());
fused_schedule_result.input_nodes.push_back(x);
fused_schedule_result.output_nodes.push_back(y);
fused_schedule_result.node_idx_to_scheduled_results.push_back(schedule_results);
std::stringstream ss;
std::string result;
auto ret = codegen::Kernel::GenKernelFuncByTilingKey({fused_schedule_result}, ss, true);
result = ss.str();
EXPECT_EQ(ret, ge::SUCCESS);
EXPECT_TRUE(result.find("test_graph") == result.rfind("test_graph"));
schedule_result0.schedule_groups.resize(2);
for (auto &schedule_group : schedule_result0.schedule_groups) {
schedule_group.impl_graphs.emplace_back(graph);
}
fused_schedule_result.node_idx_to_scheduled_results[0].push_back(schedule_result0);
std::stringstream ss1;
std::string result1;
ret = codegen::Kernel::GenKernelFuncByTilingKey(fused_schedule_result, ss1, true);
result1 = ss1.str();
EXPECT_EQ(ret, ge::SUCCESS);
EXPECT_TRUE(result.find("test_graph") == result.rfind("test_graph"));
}
TEST(CodegenKernel, Kernel_GenerateSubGraphFuncDefTest) {
codegen::Kernel kernel("test_kernel");
auto call1 = new MockApiCall("call1");
auto call2 = new MockApiCall("call2");
auto call3 = new MockApiCall("call3");
auto call4 = new MockApiCall("call4");
auto loop1 = new codegen::Loop(1);
loop1->AddCall(call3);
loop1->AddCall(call4);
kernel.root_loop.AddCall(call1);
kernel.root_loop.AddCall(call2);
kernel.root_loop.AddLoop(loop1);
std::stringstream ss;
auto ret = kernel.GenerateSubGraphFuncDef(&(kernel.root_loop), ss);
EXPECT_EQ(ret, ge::SUCCESS);
EXPECT_EQ(ss.str(), std::string{"func_test_Definition:call1\n"
"func_test_Definition:call2\n"
"func_test_Definition:call3\n"
"func_test_Definition:call4\n"});
}
TEST(CodegenKernel, BroadcastInlineWithExecCondition) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::Axis z0{.id = 0, .name = "z0", .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .size = s1.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
for (auto &[id, cur_axis] : tiler.axis_map) {
(void)id;
cur_axis.is_split_b = true;
}
codegen::Loop loop1(z0.id);
codegen::Loop loop2(z1.id);
loop1.AddLoop(&loop2);
auto call1 = new MockApiCall("call1");
auto call2 = new MockApiCall("call2");
call1->enable_cache = true;
call1->exec_condition = af::ExecuteCondition::kCacheBlockSplitFusedBroadcastAxis;
call1->unit = af::ComputeUnit::kUnitVector;
call2->enable_cache = true;
call2->exec_condition = af::ExecuteCondition::kCacheBlockSplitOriginBroadcastAxis;
call2->unit = af::ComputeUnit::kUnitVector;
loop2.AddCall(call1);
loop2.AddCall(call2);
codegen::TPipe tpipe("t", tiler);
std::string result;
EXPECT_EQ(loop1.Generate(tiler, tpipe, result), ge::SUCCESS);
EXPECT_EQ(result, std::string{
"for (int z0 = 0; z0 < z0_loop_size; z0++) {\n"
"for (int z1 = 0; z1 < z1_loop_size; z1++) {\n"
"bool enable_cache_fused_brc_axis = (z1 < 1) || ((block_dim * 0 + z1) % z1_loop_size < 1);\n"
"bool enable_cache_origin_brc_axis = (z1 < 1);\n"
"if (enable_cache_fused_brc_axis) {\n"
"();\n"
"}\n\n"
"if (enable_cache_origin_brc_axis) {\n"
"();\n"
"}\n\n"
"}\n"
"}\n"
});
}
TEST(CodegenKernel, CalculateVectorizedAixsMergeStatus) {
af::SizeVar s0(af::Symbol("s0"));
af::SizeVar s1(af::Symbol("s1"));
af::SizeVar s2(af::Symbol("s2"));
af::Axis z0{.id = 0, .name = "z0", .type = af::Axis::Type::kAxisTypeTileOuter, .size = s0.expr};
af::Axis z1{.id = 1, .name = "z1", .type = af::Axis::Type::kAxisTypeTileInner, .size = s1.expr};
af::Axis z2{.id = 2, .name = "z2", .type = af::Axis::Type::kAxisTypeOriginal, .size = s2.expr};
codegen::Tiler tiler;
tiler.AddSizeVar(af::SizeVar(s0));
tiler.AddSizeVar(af::SizeVar(s1));
tiler.AddSizeVar(af::SizeVar(s2));
tiler.AddAxis(z0);
tiler.AddAxis(z1);
tiler.AddAxis(z2);
codegen::TPipe tpipe("tpipe", tiler);
af::AscGraph graph("test");
af::ascir_op::Data x("x", graph);
auto node = graph.FindNode("x");
af::AscTensor tensor = node->outputs[0];
tensor.attr.axis = {z0.id, z1.id, z2.id};
tensor.attr.vectorized_axis = {z1.id, z2.id};
tensor.attr.repeats = {z0.size, z1.size, z2.size};
tensor.attr.strides = {z1.size*z2.size, z2.size, One};
vector<af::Expression> vectorized_strides{One, One};
vectorized_strides[0] = z2.size;
tensor.attr.vectorized_strides = vectorized_strides;
std::string dtype_name;
codegen::Tensor::DtypeName(tensor.attr.dtype, dtype_name);
codegen::Tensor t = codegen::Tensor(tensor, dtype_name);
t.vectorized_axis_pos = {1, 2};
std::vector<codegen::Tensor> inputs = {};
std::vector<codegen::Tensor> outputs = {t};
VectorizedAxisLoopMergeStatus merge_info;
GenerateVectorizedAxisMergeStatus(inputs, outputs, merge_info, tpipe);
EXPECT_EQ(merge_info.merge_repeats_str.size(), 1);
EXPECT_EQ(merge_info.merge_repeats_str[0], "t->s1 * t->s2");
}
class CodegenKernelV2Test : public ::testing::Test {
protected:
void SetUp() override {
dlog_setlevel(ASCGEN_MODULE_NAME, DLOG_ERROR, 0);
ge::PlatformContext::GetInstance().Reset();
auto stub_v2 = std::make_shared<RuntimeStubV2Common>();
RuntimeStub::SetInstance(stub_v2);
}
void TearDown() override {
dlog_setlevel(ASCGEN_MODULE_NAME, DLOG_ERROR, 0);
RuntimeStub::Reset();
ge::PlatformContext::GetInstance().Reset();
}
};
TEST_F(CodegenKernelV2Test, RequireContinuousTQueBuf_AllFromQue) {
af::AscGraph graph("test_graph");
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
Data x_op("x", graph);
x_op.ir_attr.SetIndex(0);
Load load_op1("load1");
Load load_op2("load2");
Concat concat_op("concat");
load_op1.x = x_op.y;
load_op1.y.dtype = ge::DT_FLOAT;
*load_op1.y.axis = {z0.id, z1.id};
load_op2.x = x_op.y;
load_op2.y.dtype = ge::DT_FLOAT;
*load_op2.y.axis = {z0.id, z1.id};
concat_op.x = {load_op1.y, load_op2.y};
*concat_op.y.axis = {z0.id, z1.id};
auto load1 = graph.FindNode("load1");
auto load2 = graph.FindNode("load2");
auto concat = graph.FindNode("concat");
load1->outputs[0].attr.mem.tensor_id = 3;
load1->outputs[0].attr.repeats = {s0, s1};
load1->outputs[0].attr.strides = {s1, One};
load1->outputs[0].attr.mem.position = Position::kPositionVecIn;
load2->outputs[0].attr = load1->outputs[0].attr;
load2->outputs[0].attr.mem.tensor_id = 4;
concat->outputs[0].attr.mem.alloc_type = AllocType::kAllocTypeGlobal;
concat->outputs[0].attr.mem.tensor_id = 5;
concat->outputs[0].attr.repeats = {s0, s1 + s1};
concat->outputs[0].attr.strides = {s1 + s1, One};
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.input_nodes = {graph.FindNode("x")};
codegen::Kernel kernel("test_input_all_from_que");
EXPECT_EQ(codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel), ge::SUCCESS);
for (const auto &body : kernel.root_loop.bodys) {
if (body.call->inputs.size() > 1) {
for (int32_t i = 0; i < static_cast<int32_t>(body.call->inputs.size()); ++i) {
EXPECT_EQ(body.call->inputs[i]->share_order, i);
}
}
}
ge::RuntimeStub::Reset();
ge::PlatformContext::GetInstance().Reset();
}
TEST_F(CodegenKernelV2Test, RequireContinuousTQueBuf_AllFromBuf) {
ge::PlatformContext::GetInstance().Reset();
auto stub_v2 = std::make_shared<ge::RuntimeStubV2Common>();
ge::RuntimeStub::SetInstance(stub_v2);
af::AscGraph graph("test_graph");
::ascir::FusedScheduledResult fused_schedule_result;
auto s0 = graph.CreateSizeVar("s0");
auto s1 = graph.CreateSizeVar("s1");
auto z0 = graph.CreateAxis("z0", s0);
auto z1 = graph.CreateAxis("z1", s1);
Data x_op("x", graph);
x_op.ir_attr.SetIndex(0);
Load load_op1("load1");
Load load_op2("load2");
Concat concat_op("concat");
load_op1.x = x_op.y;
load_op1.y.dtype = ge::DT_FLOAT;
*load_op1.y.axis = {z0.id, z1.id};
load_op2.x = x_op.y;
load_op2.y.dtype = ge::DT_FLOAT;
*load_op2.y.axis = {z0.id, z1.id};
concat_op.x = {load_op2.y, load_op1.y};
*concat_op.y.axis = {z0.id, z1.id};
auto data = graph.FindNode("x");
auto load1 = graph.FindNode("load1");
auto load2 = graph.FindNode("load2");
auto concat = graph.FindNode("concat");
load1->outputs[0].attr.mem.alloc_type = AllocType::kAllocTypeBuffer;
load1->outputs[0].attr.mem.tensor_id = 3;
load1->outputs[0].attr.repeats = {s0, s1};
load1->outputs[0].attr.strides = {s1, One};
load1->outputs[0].attr.mem.position = Position::kPositionVecCalc;
load1->outputs[0].attr.buf.id = 1;
load2->outputs[0].attr = load1->outputs[0].attr;
load2->outputs[0].attr.mem.tensor_id = 4;
load2->outputs[0].attr.buf.id = 2;
concat->outputs[0].attr.mem.tensor_id = 5;
concat->outputs[0].attr.repeats = {s0, s1 + s1};
concat->outputs[0].attr.strides = {s1 + s1, One};
fused_schedule_result.input_nodes = {data};
codegen::Kernel kernel("test_input_all_from_que");
EXPECT_EQ(codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel), ge::SUCCESS);
EXPECT_EQ(kernel.tpipe.contiguous_buf_ids, (std::vector<::ascir::BufId>{2, 1}));
}
TEST_F(CodegenKernel_CallSync, TestDefineShareOffsets) {
auto load1 = Load("load1", x);
auto load2 = Load("load2", x);
auto load3 = Load("load3", x);
load2->outputs[0].attr.que.id = load1->outputs[0].attr.que.id;
load2->outputs[0].attr.mem.reuse_id = load1->outputs[0].attr.mem.reuse_id;
load3->outputs[0].attr.que.id = load1->outputs[0].attr.que.id;
load3->outputs[0].attr.mem.reuse_id = load1->outputs[0].attr.mem.reuse_id;
auto concat = Vec("Concat", {load3, load2, load1});
auto store = Store("store", concat);
EXPECT_EQ(Generate(), R"(uint32_t q1_reuse1_offset = 0;
LocalTensor<uint8_t> q1_buf = q1.AllocTensor<uint8_t>();
const uint32_t q1_reuse1_offset_part_1 = q1_reuse1_offset + local_3_size * 2;
const uint32_t q1_reuse1_offset_part_2 = q1_reuse1_offset_part_1 + local_2_size * 2;
q1_reuse1_offset = q1_reuse1_offset_part_2;
const uint32_t local_1_actual_size = 1;
LocalTensor<half> local_1;
local_1 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();
load1();
q1_reuse1_offset = q1_reuse1_offset_part_1;
const uint32_t local_2_actual_size = 1;
LocalTensor<half> local_2;
local_2 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();
load2();
q1_reuse1_offset = 0;
const uint32_t local_3_actual_size = 1;
LocalTensor<half> local_3;
local_3 = q1_buf[q1_reuse1_offset].template ReinterpretCast<half>();
load3();
q1.EnQue(q1_buf);
q1_buf = q1.DeQue<uint8_t>();
const uint32_t local_4_actual_size = 1;
LocalTensor<half> local_4;
local_4 = b4_buf.template ReinterpretCast<half>();
Concat();
q1.FreeTensor(q1_buf);
store();
)");
}
TEST(CodegenKernel, ScalarDataOutputTensor_test) {
af::AscGraph graph("test");
codegen::Tiler tiler;
ScalarData scalar_data("scalar_data", graph);
scalar_data.ir_attr.SetIndex(0);
scalar_data.y.dtype = ge::DT_FLOAT;
auto scalar_node = graph.FindNode("scalar_data");
ge::AscTensor scalar_tensor = scalar_node->outputs[0];
scalar_tensor.attr.mem.tensor_id = 0;
Data data("data", graph);
data.ir_attr.SetIndex(1);
data.y.dtype = ge::DT_FLOAT;
auto data_node = graph.FindNode("data");
ge::AscTensor data_tensor = data_node->outputs[0];
data_tensor.attr.mem.tensor_id = 1;
std::string scalar_dtype_name;
codegen::Tensor::DtypeName(scalar_tensor.attr.dtype, scalar_dtype_name);
codegen::Tensor t1(scalar_tensor, scalar_dtype_name, "t1");
EXPECT_EQ(t1.Init(), 0);
std::string data_dtype_name;
codegen::Tensor::DtypeName(data_tensor.attr.dtype, data_dtype_name);
codegen::Tensor t2(data_tensor, data_dtype_name, "t2");
EXPECT_EQ(t2.Init(), 0);
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.input_nodes.push_back(data_node);
fused_schedule_result.input_nodes.push_back(scalar_node);
codegen::Kernel kernel(graph.GetName());
codegen::Kernel::ParseGraph(graph, fused_schedule_result, kernel);
std::string result;
EXPECT_EQ(kernel.GlobalTensorInit(result), 0);
EXPECT_EQ(result, "GlobalTensor<float> global_1;\nglobal_1.SetGlobalBuffer((__gm__ float*)scalar_data);\n");
}
class CodegenKernelConv2DTest : public ::testing::Test {
protected:
void SetUp() override {}
void TearDown() override {}
};
TEST_F(CodegenKernelConv2DTest, KernelFuncDeclare_ShouldGenerateConv2DTemplate_WhenIsConv2dTrue) {
af::AscGraph graph("conv2d_graph");
codegen::Kernel kernel("conv2d_kernel");
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.fused_graph_name = af::AscendString("conv2d_fusion");
::ascir::ScheduledResult scheduled_result;
scheduled_result.cube_type = ::ascir::CubeTemplateType::kUBFuse;
std::vector<::ascir::ScheduledResult> scheduled_results;
scheduled_results.push_back(scheduled_result);
fused_schedule_result.node_idx_to_scheduled_results.push_back(scheduled_results);
std::string declare = kernel.KernelFuncDeclare("conv2d_graph", fused_schedule_result, false, false, true);
EXPECT_NE(declare.find("FmapTiling"), std::string::npos);
EXPECT_NE(declare.find("WeightTiling"), std::string::npos);
EXPECT_NE(declare.find("L1PingPong"), std::string::npos);
EXPECT_NE(declare.find("L0PingPong"), std::string::npos);
EXPECT_NE(declare.find("OutputOrder"), std::string::npos);
EXPECT_NE(declare.find("IterOrder"), std::string::npos);
EXPECT_NE(declare.find("GroupType"), std::string::npos);
EXPECT_NE(declare.find("EnableSmallChannel"), std::string::npos);
EXPECT_NE(declare.find("WeightUbTrans"), std::string::npos);
EXPECT_NE(declare.find("FmapCopyMode"), std::string::npos);
EXPECT_NE(declare.find("InnerBatch"), std::string::npos);
EXPECT_NE(declare.find("DisContinuous"), std::string::npos);
EXPECT_EQ(declare.find("API_LEVEL"), std::string::npos);
EXPECT_EQ(declare.find("A_TRANS"), std::string::npos);
EXPECT_EQ(declare.find("B_TRANS"), std::string::npos);
}
TEST_F(CodegenKernelConv2DTest, KernelFuncDeclare_ShouldGenerateMatMulTemplate_WhenIsConv2dFalse) {
af::AscGraph graph("matmul_graph");
codegen::Kernel kernel("matmul_kernel");
::ascir::FusedScheduledResult fused_schedule_result;
fused_schedule_result.fused_graph_name = af::AscendString("matmul_fusion");
::ascir::ScheduledResult scheduled_result;
scheduled_result.cube_type = ::ascir::CubeTemplateType::kUBFuse;
std::vector<::ascir::ScheduledResult> scheduled_results;
scheduled_results.push_back(scheduled_result);
fused_schedule_result.node_idx_to_scheduled_results.push_back(scheduled_results);
std::string declare = kernel.KernelFuncDeclare("matmul_graph", fused_schedule_result, false, false, false);
EXPECT_NE(declare.find("API_LEVEL"), std::string::npos);
EXPECT_NE(declare.find("A_TRANS"), std::string::npos);
EXPECT_NE(declare.find("B_TRANS"), std::string::npos);
EXPECT_NE(declare.find("BATCH_MODEL"), std::string::npos);
EXPECT_NE(declare.find("MODEL"), std::string::npos);
EXPECT_NE(declare.find("FULL_LOAD"), std::string::npos);
EXPECT_NE(declare.find("L0C2OUT_MODEL"), std::string::npos);
EXPECT_EQ(declare.find("FmapTiling"), std::string::npos);
EXPECT_EQ(declare.find("WeightTiling"), std::string::npos);
}
TEST_F(CodegenKernelConv2DTest, GenCubeCommonTiling_ShouldGenerateConv2DAPI_WhenIsConv2dTrue) {
codegen::Kernel kernel("conv2d_kernel");
std::stringstream ss;
EXPECT_EQ(kernel.GenCubeCommonTiling(ss, false, true), ge::SUCCESS);
std::string result = ss.str();
EXPECT_NE(result.find("AscendC::TPipe pipe"), std::string::npos);
EXPECT_NE(result.find("conv2d_v2"), std::string::npos);
EXPECT_NE(result.find("FmapTiling"), std::string::npos);
EXPECT_NE(result.find("WeightTiling"), std::string::npos);
EXPECT_NE(result.find("L1PingPong"), std::string::npos);
EXPECT_NE(result.find("L0PingPong"), std::string::npos);
EXPECT_NE(result.find("OutputOrder"), std::string::npos);
EXPECT_NE(result.find("IterOrder"), std::string::npos);
EXPECT_NE(result.find("GroupType"), std::string::npos);
EXPECT_EQ(result.find("mat_mul_v3"), std::string::npos);
EXPECT_EQ(result.find("batch_mat_mul_v3"), std::string::npos);
}
TEST_F(CodegenKernelConv2DTest, GenCubeCommonTiling_ShouldGenerateMatMulAPI_WhenIsConv2dFalseAndNotBatch) {
codegen::Kernel kernel("matmul_kernel");
std::stringstream ss;
EXPECT_EQ(kernel.GenCubeCommonTiling(ss, false, false), ge::SUCCESS);
std::string result = ss.str();
EXPECT_NE(result.find("mat_mul_v3"), std::string::npos);
EXPECT_NE(result.find("API_LEVEL"), std::string::npos);
EXPECT_NE(result.find("A_TRANS"), std::string::npos);
EXPECT_NE(result.find("B_TRANS"), std::string::npos);
EXPECT_NE(result.find("BATCH_MODEL"), std::string::npos);
EXPECT_NE(result.find("MODEL"), std::string::npos);
EXPECT_NE(result.find("FULL_LOAD"), std::string::npos);
EXPECT_NE(result.find("L0C2OUT_MODEL"), std::string::npos);
EXPECT_EQ(result.find("conv2d_v2"), std::string::npos);
EXPECT_EQ(result.find("batch_mat_mul_v3"), std::string::npos);
}
TEST_F(CodegenKernelConv2DTest, GenCubeCommonTiling_ShouldGenerateBatchMatMulAPI_WhenIsConv2dFalseAndIsBatch) {
codegen::Kernel kernel("batch_matmul_kernel");
std::stringstream ss;
EXPECT_EQ(kernel.GenCubeCommonTiling(ss, true, false), ge::SUCCESS);
std::string result = ss.str();
EXPECT_NE(result.find("batch_mat_mul_v3"), std::string::npos);
EXPECT_NE(result.find("API_LEVEL"), std::string::npos);
EXPECT_NE(result.find("A_TRANS"), std::string::npos);
EXPECT_NE(result.find("B_TRANS"), std::string::npos);
EXPECT_NE(result.find("BATCH_MODEL"), std::string::npos);
EXPECT_NE(result.find("MODEL"), std::string::npos);
EXPECT_NE(result.find("FULL_LOAD"), std::string::npos);
EXPECT_NE(result.find("L0C2OUT_MODEL"), std::string::npos);
EXPECT_EQ(result.find("conv2d_v2"), std::string::npos);
}
TEST_F(CodegenKernelConv2DTest, SetUsingGlobalTpipe_ShouldSetFlagCorrectly) {
codegen::Kernel kernel("test_kernel");
kernel.tpipe.SetUsingGlobalTpipe(true);
kernel.tpipe.SetUsingGlobalTpipe(false);
EXPECT_TRUE(true);
}