import torch
from torch import Tensor
def quant_optimized_matmul_tla(
mat1: Tensor,
mat2: Tensor,
scale: Tensor,
per_token_scale: Tensor,
outDType: str | torch.dtype = torch.bfloat16,
transA: bool = False,
transB: bool = True,
formatA: bool = False,
formatB: bool = False,
) -> Tensor:
"""Run CATLASS quantized optimized matmul (TLA) on NPU tensors.
Source: example 42_quant_optimized_matmul_tla.
Computes ``D = per_token_scale * (mat1 @ mat2) * scale`` where mat1 and
mat2 are int8 quantized matrices, scale is a per-column dequantization
factor and per_token_scale is a per-row dequantization factor.
Args:
mat1: Left input matrix (int8). Shape is ``(M, K)`` unless
``transA`` is true, in which case shape is ``(K, M)``.
mat2: Right input matrix (int8). Shape is ``(K, N)`` unless
``transB`` is true, in which case shape is ``(N, K)``.
scale: Per-column scale tensor (float32). Shape ``(1, N)``.
per_token_scale: Per-row scale tensor (float32). Shape ``(1, M)``.
outDType: Output dtype. Accepted strings are ``bfloat16``,
``float16`` and ``float32``.
transA: Whether to read ``mat1`` as transposed.
transB: Whether to read ``mat2`` as transposed.
formatA: Whether ``mat1`` is stored in the CATLASS NZ block format.
formatB: Whether ``mat2`` is stored in the CATLASS NZ block format.
Returns:
Output tensor with shape ``(M, N)`` on the active NPU device.
"""
if isinstance(outDType, str):
dtype_lower = outDType.lower()
outDType = getattr(torch, dtype_lower, None)
if outDType is None:
raise ValueError(f"{outDType} is not a data type of torch")
return torch.ops.catlass.quant_optimized_matmul_tla(
mat1, mat2, scale, per_token_scale, outDType, transA, transB, formatA, formatB
)