import os
import pytest
from transformers import AutoModelForCausalLM, AutoTokenizer
from tests.e2e.conftest import VllmRunner
# Module-level configuration: these environment variables are set as soon as
# this test module is imported.
# NOTE(review): VLLM_USE_MODELSCOPE presumably switches model downloads to
# ModelScope, and VLLM_WORKER_MULTIPROC_METHOD presumably selects the worker
# process start method -- confirm against the vLLM environment-variable docs.
os.environ.update(
    {
        "VLLM_USE_MODELSCOPE": "True",
        "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
    }
)
# Model identifiers used to parametrize the tests in this module.
MODELS = [
    "Qwen/Qwen3-0.6B",
]
def get_prompt_embeds(chat, tokenizer, embedding_layer):
    """Turn a chat conversation into prompt embeddings.

    Args:
        chat: Chat messages passed straight to
            ``tokenizer.apply_chat_template``.
        tokenizer: Object exposing ``apply_chat_template``. It is called with
            ``add_generation_prompt=True``, ``return_tensors="pt"`` and
            ``return_dict=False``.
        embedding_layer: Callable applied to the token ids produced by the
            tokenizer.

    Returns:
        The value of ``embedding_layer(token_ids).squeeze(0)``; presumably the
        embeddings with the leading batch dimension removed (assumes a single
        conversation -- TODO confirm).
    """
    template_options = {
        "add_generation_prompt": True,
        "return_tensors": "pt",
        "return_dict": False,
    }
    input_ids = tokenizer.apply_chat_template(chat, **template_options)
    embedded = embedding_layer(input_ids)
    return embedded.squeeze(0)
@pytest.mark.parametrize("model_name", MODELS)
def test_mixed_prompt_embeds_and_text(model_name):
    """Check that one engine serves a prompt-embeds request and a text request.

    The prompt embeddings are computed with the Hugging Face model's input
    embedding layer, then a single ``VllmRunner`` instance is asked to
    generate once from those embeddings and once from a plain text prompt.
    Each call must return exactly one result with non-empty generated text.

    Args:
        model_name: Model identifier supplied by the ``MODELS`` parametrization.
    """
    # Build the embeddings on the Hugging Face side.
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name)
    hf_model = AutoModelForCausalLM.from_pretrained(model_name)
    input_embedding = hf_model.get_input_embeddings()
    conversation = [{"role": "user", "content": "What is AI?"}]
    embeds = get_prompt_embeds(conversation, hf_tokenizer, input_embedding)

    plain_prompt = "What is machine learning?"
    embeds_request = {"prompt_embeds": embeds}

    runner_options = {
        "enable_prompt_embeds": True,
        "cudagraph_capture_sizes": [1, 2, 4, 8],
    }
    with VllmRunner(model_name, **runner_options) as runner:
        llm = runner.model
        from_embeds = llm.generate(embeds_request)
        from_text = llm.generate(plain_prompt)

    # One request in, one result out for each input kind.
    assert len(from_embeds) == 1
    assert len(from_text) == 1

    # Both generations must have produced some text.
    embeds_text = from_embeds[0].outputs[0].text
    assert len(embeds_text) > 0
    plain_text = from_text[0].outputs[0].text
    assert len(plain_text) > 0

    print("\n[Prompt Embeds Output]:", embeds_text)
    print("[Text Prompt Output]:", plain_text)