* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
* \file test_dynamic_unalign.cpp
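* \brief System tests for dynamic-loop functions that operate on views whose valid extent is read from a tensor at run time (tail blocks / unaligned sequence lengths).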
*/
#include <gtest/gtest.h>
#include "tilefwk/data_type.h"
#include "interface/function/function.h"
#include "tilefwk/tilefwk_op.h"
#include "test_suite_stest_ops.h"
#include "interface/interpreter/raw_tensor_data.h"
#include "test_dev_func_runner.h"
using namespace npu::tile_fwk;
using namespace npu::tile_fwk::dynamic;
class DynamicUnalignTest : public npu::tile_fwk::stest::TestSuite_STest_Ops_Aihac {
public:
void SetUp() override
{
npu::tile_fwk::stest::TestSuite_STest_Ops_Aihac::SetUp();
TileShape::Current().SetVecTile(32, 32);
TileShape::Current().SetCubeTile({32, 32}, {32, 32}, {32, 32});
RuntimeSetDevice(GetDeviceIdByEnvVar());
}
};
/**
 * Defines the "main" function with inputs {t0, blockTable} and output {out}.
 *
 * The dynamic loop "L0" runs GetInputShape(t0, 0) / s times. Iteration i
 *   1. reads a scalar `size` from blockTable[i, 0],
 *   2. takes an s x s view of t0 at row offset s * i, passing {size, s} as the
 *      view's third argument (presumably the valid shape -- confirm against
 *      View's declaration),
 *   3. adds the view to itself and assembles the result into `out` at the same
 *      row offset.
 *
 * \param t0          Input tensor; its first dimension is split into blocks of s rows.
 * \param blockTable  Tensor indexed as {i, 0} to obtain the per-block `size`.
 * \param out         Output tensor receiving one assembled block per iteration.
 * \param s           Block height, view width, and row stride between blocks.
 */
void TestLoopTailBlock(const Tensor& t0, const Tensor& blockTable, Tensor& out, int s)
{
    FUNCTION("main", {t0, blockTable}, {out})
    {
        LOOP("L0", FunctionType::DYNAMIC_LOOP, i, LoopRange(GetInputShape(t0, 0) / s))
        {
            SymbolicScalar size = GetTensorData(blockTable, {i, 0});
            // The row offset is derived from `s`, not from a separate hard-coded 64:
            // the trip count and the view height are both expressed in `s`, so the
            // stride between blocks has to be `s` as well or blocks overlap / leave gaps.
            Tensor t0s = View(t0, {s, s}, {size, s}, {s * i, 0});
            Tensor t1 = Add(t0s, t0s);
            Assemble(t1, {s * i, 0}, out);
        }
    }
}
/**
 * Builds the TestLoopTailBlock function over an (8 * 64) x 64 FP32 tensor filled
 * with 1.0 and a block table holding the values 0..7, and registers a zero-filled
 * output. Running on device and comparing against an all-2.0 golden is compiled
 * only when BUILD_WITH_CANN is defined.
 */
TEST_F(DynamicUnalignTest, TestTailBlock)
{
    const int blockRows = 64;
    const int blockCount = 8;

    Tensor t0(DT_FP32, {blockCount * blockRows, blockRows}, "t0");
    Tensor blockTable{DT_INT32, {blockCount, 1}, "blockTable"};
    Tensor out(DT_FP32, {blockCount * blockRows, blockRows}, "out");
    TestLoopTailBlock(t0, blockTable, out, blockRows);

    // Block-table entry k holds the value k.
    std::vector<int> tableValues;
    tableValues.reserve(blockCount);
    for (int entry = 0; entry < blockCount; ++entry) {
        tableValues.push_back(entry);
    }

    auto& programData = ProgramData::GetInstance();
    programData.AppendInputs({
        RawTensorData::CreateConstantTensor<float>(t0, 1.0),
        RawTensorData::CreateTensor<int>(blockTable, tableValues),
    });
    programData.AppendOutputs({
        RawTensorData::CreateConstantTensor<float>(out, 0.0f),
    });
#ifdef BUILD_WITH_CANN
    DevFuncRunner::Run(Program::GetInstance().GetLastFunction());

    // Every output element is expected to be 1.0 + 1.0.
    std::vector<float> golden(blockCount * blockRows * blockRows, 2.0f);
    auto outs = programData.GetOutputData(0);
    EXPECT_TRUE(resultCmp(golden, (float*)outs->data(), 0.001f));
#endif
}
/**
 * Matmul of a concatenated query [qNope | qRope] with the transpose of a
 * concatenated key [kNope | kRope], where the key views receive a row extent
 * (curSeq) read from `actSeqs` at run time.
 *
 * Inputs, the sequence length and the expected output are loaded from the golden
 * directory; the device result is compared with tolerance 0.005.
 */
TEST_F(DynamicUnalignTest, test_mm_unalign)
{
    using Bf16 = npu::tile_fwk::bfloat16;

    SetInterpreterConfig();

    const int b = 1;
    const int nq = 32;
    const int nk = 1;
    const int s1 = 1;
    const int s2 = 256;
    const int dR = 64;
    const int dN = 512;
    const int qRows = b * nq * s1;
    const int kRows = b * nk * s2;

    std::vector<int64_t> qRopeShape = {qRows, dR};
    std::vector<int64_t> qNopeShape = {qRows, dN};
    std::vector<int64_t> kRopeShape = {kRows, dR};
    std::vector<int64_t> kNopeShape = {kRows, dN};
    std::vector<int64_t> outShape = {qRows, nk * s2};

    Tensor qRope(DT_BF16, qRopeShape, "qRope");
    Tensor qNope(DT_BF16, qNopeShape, "qNope");
    Tensor kRope(DT_BF16, kRopeShape, "kRope");
    Tensor kNope(DT_BF16, kNopeShape, "kNope");
    Tensor actSeqs{DT_INT32, {b}, "actSeqs"};
    Tensor out(DT_FP32, outShape, "out");
    auto dtype = qNope.GetStorage()->Datatype();

    // Host-side buffers, zero-initialised and then filled from the golden files.
    std::vector<Bf16> qRopeHost(qRows * dR, 0);
    std::vector<Bf16> qNopeHost(qRows * dN, 0);
    std::vector<Bf16> kRopeHost(kRows * dR, 0);
    std::vector<Bf16> kNopeHost(kRows * dN, 0);
    std::vector<int> seqHost(b);
    std::vector<float> golden(qRows * nk * s2, 0);

    const auto goldenFile = [&](const char* relative) { return GetGoldenDir() + relative; };
    readInput<Bf16>(goldenFile("/q_rope.bin"), qRopeHost);
    readInput<Bf16>(goldenFile("/q_nope.bin"), qNopeHost);
    readInput<Bf16>(goldenFile("/k_rope.bin"), kRopeHost);
    readInput<Bf16>(goldenFile("/k_nope.bin"), kNopeHost);
    readInput<int>(goldenFile("/actual_seq_len.bin"), seqHost);
    readInput(goldenFile("/out.bin"), golden);

    auto& programData = ProgramData::GetInstance();
    programData.AppendInputs({
        RawTensorData::CreateTensor<Bf16>(qRope, qRopeHost),
        RawTensorData::CreateTensor<Bf16>(qNope, qNopeHost),
        RawTensorData::CreateTensor<Bf16>(kRope, kRopeHost),
        RawTensorData::CreateTensor<Bf16>(kNope, kNopeHost),
        RawTensorData::CreateTensor<int32_t>(actSeqs, seqHost),
    });
    programData.AppendOutputs({
        RawTensorData::CreateConstantTensor<float>(out, 0.0f),
    });
    programData.AppendGoldens({
        RawTensorData::CreateTensor<float>(out, golden),
    });

    FUNCTION("main", {qRope, qNope, kRope, kNope, actSeqs}, {out})
    {
        LOOP("L0", FunctionType::DYNAMIC_LOOP, batchId, LoopRange(GetInputShape(qRope, 0) / (nq * s1)))
        {
            SymbolicScalar curSeq = GetTensorData(actSeqs, {batchId});

            // Query views use the full tile; key views pass nk * curSeq as the row
            // extent in View's third argument (presumably the valid shape -- confirm).
            Tensor qr = View(qRope, {nq * s1, dR}, {nq * s1, dR}, {batchId * nq * s1, 0});
            Tensor qn = View(qNope, {nq * s1, dN}, {nq * s1, dN}, {batchId * nq * s1, 0});
            Tensor kr = View(kRope, {nk * s2, dR}, {nk * curSeq, dR}, {batchId * nk * s2, 0});
            Tensor kn = View(kNope, {nk * s2, dN}, {nk * curSeq, dN}, {batchId * nk * s2, 0});

            // Place nope at column 0 and rope at column dN of a (dN + dR)-wide tensor.
            Tensor qi(dtype, {nq * s1, dN + dR}, "qi");
            Assemble(qn, {0, 0}, qi);
            Assemble(qr, {0, dN}, qi);
            Tensor kj(dtype, {nk * s2, dN + dR}, "kj");
            Assemble(kn, {0, 0}, kj);
            Assemble(kr, {0, dN}, kj);

            auto tmp = Matrix::Matmul(DataType::DT_FP32, qi, kj, false, true);
            Assemble(tmp, {batchId * nq * s1, 0}, out);
        }
    }

    DevFuncRunner::Run(Program::GetInstance().GetLastFunction());
    auto outs = programData.GetOutputData(0);
    EXPECT_TRUE(resultCmp(golden, (float*)outs->data(), 0.005f));
}
/**
 * Matmul of `qk` (32 x 128) with `v` (128 x 64), where the contracted dimension of
 * both views receives an extent (curSeq) read from `actSeqs` at run time.
 *
 * Overrides the cube tile to {32, 32}, {128, 128}, {64, 64}. Inputs and the
 * expected output come from the golden directory; tolerance is 0.005.
 */
TEST_F(DynamicUnalignTest, test_mm2_unalign)
{
    using Bf16 = npu::tile_fwk::bfloat16;

    SetInterpreterConfig();
    TileShape::Current().SetCubeTile({32, 32}, {128, 128}, {64, 64});

    const int b = 1;
    const int nq = 32;
    const int nk = 1;
    const int s1 = 1;
    const int s2 = 128;
    const int d = 64;
    const int qRows = b * nq * s1;
    const int kvRows = b * nk * s2;

    std::vector<int64_t> qShape = {qRows, nk * s2};
    std::vector<int64_t> kShape = {kvRows, d};
    std::vector<int64_t> outShape = {qRows, d};

    Tensor qk(DT_BF16, qShape, "qk");
    Tensor v(DT_BF16, kShape, "v");
    Tensor actSeqs{DT_INT32, {b}, "actSeqs"};
    Tensor out(DT_FP32, outShape, "out");

    // Host-side buffers, zero-initialised and then filled from the golden files.
    std::vector<Bf16> qkHost(qRows * nk * s2, 0);
    std::vector<Bf16> vHost(kvRows * d, 0);
    std::vector<int> seqHost(b);
    std::vector<float> golden(qRows * d, 0);

    const auto goldenFile = [&](const char* relative) { return GetGoldenDir() + relative; };
    readInput<Bf16>(goldenFile("/qk.bin"), qkHost);
    readInput<Bf16>(goldenFile("/v.bin"), vHost);
    readInput<int>(goldenFile("/actual_seq_len.bin"), seqHost);
    readInput(goldenFile("/out.bin"), golden);

    auto& programData = ProgramData::GetInstance();
    programData.AppendInputs({
        RawTensorData::CreateTensor<Bf16>(qk, qkHost),
        RawTensorData::CreateTensor<Bf16>(v, vHost),
        RawTensorData::CreateTensor<int32_t>(actSeqs, seqHost),
    });
    programData.AppendOutputs({
        RawTensorData::CreateConstantTensor<float>(out, 0.0f),
    });
    programData.AppendGoldens({
        RawTensorData::CreateTensor<float>(out, golden),
    });

    FUNCTION("main", {qk, v, actSeqs}, {out})
    {
        LOOP("L0", FunctionType::DYNAMIC_LOOP, batchId, LoopRange(GetInputShape(qk, 0) / (nq * s1)))
        {
            SymbolicScalar curSeq = GetTensorData(actSeqs, {batchId});

            // nk * curSeq is passed as the column extent of qk0 and the row extent of
            // v0 in View's third argument (presumably the valid shape -- confirm).
            Tensor qk0 = View(qk, {nq * s1, nk * s2}, {nq * s1, nk * curSeq}, {batchId * nq * s1, 0});
            Tensor v0 = View(v, {nk * s2, d}, {nk * curSeq, d}, {batchId * nk * s2, 0});

            auto tmp = Matrix::Matmul(DataType::DT_FP32, qk0, v0, false, false);
            Assemble(tmp, {batchId * nq * s1, 0}, out);
        }
    }

    DevFuncRunner::Run(Program::GetInstance().GetLastFunction());
    auto outs = programData.GetOutputData(0);
    EXPECT_TRUE(resultCmp(golden, (float*)outs->data(), 0.005f));
}
/**
 * Applies Amax(q0, -1, true) to a 32 x 256 FP32 view whose column extent (curSeq)
 * is read from `actSeqs` at run time, and assembles the result into a 32 x 1 output.
 *
 * Overrides the vector tile to 128 x 128. Input, sequence length and expected
 * output come from the golden directory; tolerance is 0.001.
 */
TEST_F(DynamicUnalignTest, test_rowmaxsingle_unalign)
{
    SetInterpreterConfig();
    TileShape::Current().SetVecTile(128, 128);

    const int b = 1;
    const int nTile = 32;
    const int blockSize = 256;
    const int rowCount = b * nTile;

    std::vector<int64_t> qShape = {rowCount, blockSize};
    std::vector<int64_t> outshape = {rowCount, 1};

    Tensor q(DT_FP32, qShape, "q");
    Tensor actSeqs(DT_INT32, {b, 1}, "actual_seq");
    Tensor out(DT_FP32, outshape, "out");

    // Host-side buffers, zero-initialised and then filled from the golden files.
    std::vector<float> qHost(rowCount * blockSize, 0);
    std::vector<int> seqHost(b);
    std::vector<float> golden(rowCount * 1, 0);

    const auto goldenFile = [&](const char* relative) { return GetGoldenDir() + relative; };
    readInput<float>(goldenFile("/q.bin"), qHost);
    readInput<int>(goldenFile("/actual_seq_len.bin"), seqHost);
    readInput(goldenFile("/out.bin"), golden);

    auto& programData = ProgramData::GetInstance();
    programData.AppendInputs({
        RawTensorData::CreateTensor<float>(q, qHost),
        RawTensorData::CreateTensor<int32_t>(actSeqs, seqHost),
    });
    programData.AppendOutputs({
        RawTensorData::CreateConstantTensor<float>(out, 0.0f),
    });
    programData.AppendGoldens({
        RawTensorData::CreateTensor<float>(out, golden),
    });

    FUNCTION("main", {q, actSeqs}, {out})
    {
        LOOP("L0", FunctionType::DYNAMIC_LOOP, batchId, LoopRange(GetInputShape(q, 0) / (nTile)))
        {
            SymbolicScalar curSeq = GetTensorData(actSeqs, {batchId, 0});

            // curSeq is passed as the column extent in View's third argument
            // (presumably the valid shape -- confirm).
            Tensor q0 = View(q, {nTile, blockSize}, {nTile, curSeq}, {batchId * nTile, 0});

            auto tmp = Amax(q0, -1, true);
            Assemble(tmp, {batchId * nTile, 0}, out);
        }
    }

    DevFuncRunner::Run(Program::GetInstance().GetLastFunction());
    auto outs = programData.GetOutputData(0);
    EXPECT_TRUE(resultCmp(golden, (float*)outs->data(), 0.001f));
}
/**
 * Applies Sum(q0, -1, true) to a 32 x 256 FP32 view whose column extent (curSeq)
 * is read from `actSeqs` at run time, and assembles the result into a 32 x 1 output.
 *
 * Overrides the vector tile to 128 x 128. Input, sequence length and expected
 * output come from the golden directory; tolerance is 0.001.
 */
TEST_F(DynamicUnalignTest, test_rowsumsingle_unalign)
{
    SetInterpreterConfig();
    TileShape::Current().SetVecTile(128, 128);

    const int b = 1;
    const int nTile = 32;
    const int blockSize = 256;
    const int rowCount = b * nTile;

    std::vector<int64_t> qShape = {rowCount, blockSize};
    std::vector<int64_t> outshape = {rowCount, 1};

    Tensor q(DT_FP32, qShape, "q");
    Tensor actSeqs(DT_INT32, {b, 1}, "actual_seq");
    Tensor out(DT_FP32, outshape, "out");

    // Host-side buffers, zero-initialised and then filled from the golden files.
    std::vector<float> qHost(rowCount * blockSize, 0);
    std::vector<int> seqHost(b);
    std::vector<float> golden(rowCount * 1, 0);

    const auto goldenFile = [&](const char* relative) { return GetGoldenDir() + relative; };
    readInput<float>(goldenFile("/q.bin"), qHost);
    readInput<int>(goldenFile("/actual_seq_len.bin"), seqHost);
    readInput(goldenFile("/out.bin"), golden);

    auto& programData = ProgramData::GetInstance();
    programData.AppendInputs({
        RawTensorData::CreateTensor<float>(q, qHost),
        RawTensorData::CreateTensor<int32_t>(actSeqs, seqHost),
    });
    programData.AppendOutputs({
        RawTensorData::CreateConstantTensor<float>(out, 0.0f),
    });
    programData.AppendGoldens({
        RawTensorData::CreateTensor<float>(out, golden),
    });

    FUNCTION("main", {q, actSeqs}, {out})
    {
        LOOP("L0", FunctionType::DYNAMIC_LOOP, batchId, LoopRange(GetInputShape(q, 0) / (nTile)))
        {
            SymbolicScalar curSeq = GetTensorData(actSeqs, {batchId, 0});

            // curSeq is passed as the column extent in View's third argument
            // (presumably the valid shape -- confirm).
            Tensor q0 = View(q, {nTile, blockSize}, {nTile, curSeq}, {batchId * nTile, 0});

            auto tmp = Sum(q0, -1, true);
            Assemble(tmp, {batchId * nTile, 0}, out);
        }
    }

    DevFuncRunner::Run(Program::GetInstance().GetLastFunction());
    auto outs = programData.GetOutputData(0);
    EXPECT_TRUE(resultCmp(golden, (float*)outs->data(), 0.001f));
}
/**
 * Applies Exp to a 128 x 64 view per batch (4 batches) whose row extent (curSeq)
 * is read from `actSeqs`; every batch uses a sequence length of 100.
 *
 * `out` is registered as the third *input* (index 2), pre-filled with 0.001, and
 * the function is declared with a single tensor list {q, actSeqs, out}. The
 * golden therefore keeps 0.001 everywhere except the first 100 rows of each
 * batch, which hold exp(1.0). The result is read back through GetInputData(2)
 * and compared with tolerance 0.001.
 */
TEST_F(DynamicUnalignTest, test_unary_unalign)
{
    SetInterpreterConfig();
    TileShape::Current().SetVecTile(64, 64);

    const int b = 4;
    const int sq = 128;
    const int d = 64;
    const int outIdx = 2;  // position of `out` within the appended inputs

    std::vector<int64_t> qShape = {b * sq, d};
    Tensor q(DT_FP32, qShape, "q");
    Tensor actSeqs(DT_INT32, {b, 1, 1}, "actual_seq");
    Tensor out(DT_FP32, qShape, "out");

    std::vector<int> actSeqsData(b, 100);

    // Start from the initial fill of `out`, then overwrite the leading
    // actSeqsData[batch] rows of each batch with exp(1.0).
    std::vector<float> golden(b * sq * d, 0.001f);
    const auto expOfOne = exp(1.0);
    for (int batch = 0; batch < b; ++batch) {
        const auto validBegin = golden.begin() + batch * sq * d;
        const auto validEnd = validBegin + actSeqsData[batch] * d;
        std::fill(validBegin, validEnd, expOfOne);
    }

    auto& programData = ProgramData::GetInstance();
    programData.AppendInputs({
        RawTensorData::CreateConstantTensor<float>(q, 1.0),
        RawTensorData::CreateTensor<int32_t>(actSeqs, actSeqsData),
        RawTensorData::CreateConstantTensor<float>(out, 0.001f),
    });
    programData.AppendGoldens({
        RawTensorData::CreateTensor<float>(out, golden),
    });

    FUNCTION("main", {q, actSeqs, out})
    {
        LOOP("L0", FunctionType::DYNAMIC_LOOP, batchId, LoopRange(GetInputShape(q, 0) / (sq)))
        {
            SymbolicScalar curSeq = GetTensorData(actSeqs, {batchId, 0, 0});

            // curSeq is passed as the row extent in View's third argument
            // (presumably the valid shape -- confirm).
            Tensor q0 = View(q, {sq, d}, {curSeq, d}, {batchId * sq, 0});

            auto tmp = Exp(q0);
            Assemble(tmp, {batchId * sq, 0}, out);
        }
    }

    DevFuncRunner::Run(Program::GetInstance().GetLastFunction());
    auto outs = programData.GetInputData(outIdx);
    EXPECT_TRUE(resultCmp(golden, (float*)outs->data(), 0.001f));
}