import os
import shutil
import tempfile
import pytest
import torch
from .base import FakeLlamaModelAdapter, invoke_test, is_npu_available
@pytest.mark.parametrize(
    "test_device, test_dtype",
    [
        pytest.param("cpu", torch.float32),
        pytest.param(
            "npu",
            torch.float16,
            marks=pytest.mark.skipif(not is_npu_available(), reason="NPU not available"),
        ),
        pytest.param(
            "npu",
            torch.bfloat16,
            marks=pytest.mark.skipif(not is_npu_available(), reason="NPU not available"),
        ),
    ],
)
@pytest.mark.smoke
def test_kv_quant_only_process(test_device: str, test_dtype: torch.dtype):
    """Smoke-test the ``kv_quant.yaml`` configuration end to end.

    Calls ``invoke_test`` with the ``kv_quant.yaml`` config and a scratch
    output directory, checks that the returned adapter is a
    ``FakeLlamaModelAdapter``, and then runs one forward call on a short
    prompt to make sure the resulting model is still callable.

    Args:
        test_device: Device name forwarded to ``invoke_test`` ("cpu" or "npu").
        test_dtype: Dtype associated with the parametrized case.
    """
    # NOTE(review): ``test_dtype`` is parametrized but never referenced below,
    # so the two NPU cases currently execute identical code. Confirm whether
    # ``invoke_test`` should receive the dtype.

    # The context manager removes the scratch directory on exit, including
    # when an assertion or the forward call raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        model_adapter = invoke_test("kv_quant.yaml", tmp_dir, device=test_device)
        assert isinstance(model_adapter, FakeLlamaModelAdapter), "model_adapter should be FakeLlamaModelAdapter"

        tokenizer = model_adapter.loaded_tokenizer
        input_text = "Hello world"
        # The tokenizer output is unpacked as keyword arguments for the model.
        encoded_inputs = tokenizer(input_text, return_tensors="pt", truncation=True)

        # Only checks that the call completes; the output is not inspected.
        model_adapter.loaded_model(**encoded_inputs)