#include "op_plugin/OpApiInterface.h"
#include "op_plugin/utils/custom_functions/atb/AtbCommon.h"
#include <acl/acl.h>
using namespace std;
namespace atb {
using SelfAttentionParam = atb::infer::SelfAttentionParam;
void _npu_flash_attention(const at::Tensor &query, const at::Tensor &key, const at::Tensor &value, const at::Tensor &mask, const at::Tensor &seq_len, const double scale_value, const int64_t num_heads, const int64_t num_kv_heads, at::Tensor &out)
{
const c10::OptionalDeviceGuard device_guard(device_of(query));
OpParamCache<SelfAttentionParam>& selfAttentionParamCache = OpParamCache<SelfAttentionParam>::getInstance();
SelfAttentionParam selfattentionparam;
selfattentionparam.calcType = SelfAttentionParam::PA_ENCODER;
selfattentionparam.kernelType = SelfAttentionParam::KERNELTYPE_DEFAULT;
selfattentionparam.clampType = SelfAttentionParam::CLAMP_TYPE_UNDEFINED;
selfattentionparam.maskType = SelfAttentionParam::MASK_TYPE_NORM;
selfattentionparam.kvcacheCfg = SelfAttentionParam::K_CACHE_V_CACHE;
selfattentionparam.scaleType = SelfAttentionParam::SCALE_TYPE_TOR;
selfattentionparam.quantType = SelfAttentionParam::TYPE_QUANT_UNDEFINED;
selfattentionparam.cacheType = SelfAttentionParam::CACHE_TYPE_NORM;
selfattentionparam.outDataType = ACL_DT_UNDEFINED;
selfattentionparam.headNum = num_heads;
selfattentionparam.kvHeadNum = num_kv_heads;
selfattentionparam.qScale = 1;
selfattentionparam.qkScale = scale_value;
selfattentionparam.batchRunStatusEnable = false;
selfattentionparam.isTriuMask = 0;
selfattentionparam.clampMin = 0;
selfattentionparam.clampMax = 0;
selfattentionparam.inputLayout = atb::infer::TYPE_BSND;
selfattentionparam.mlaVHeadSize = 0;
selfattentionparam.windowSize = 0;
ParamSetter parametter;
parametter.Input(query, true)
.Input(key, true)
.Input(value, true)
.Input(mask)
.Input(seq_len, true)
.Output(out);
auto opSelfattention = selfAttentionParamCache.getOperation(selfattentionparam, "SelfAttentionOperation");
RunAtbCmd(opSelfattention, parametter, "SelfAttentionOperation");
return;
}
namespace {
TORCH_LIBRARY_FRAGMENT(atb, m)
{
m.def("_npu_flash_attention(Tensor query, Tensor key, Tensor value, Tensor mask, Tensor seq_len, float scale_value, int num_heads, int num_kv_heads, Tensor(a!) out) -> ()");
}
}
namespace {
TORCH_LIBRARY_IMPL(atb, PrivateUse1, m)
{
m.impl("_npu_flash_attention", TORCH_FN(atb::_npu_flash_attention));
}
}
}