"""
Two VllmRunner instances are nested so that the first instance's worker
process is still holding NPU memory when the second instance's worker process
starts. Both instances must:
1. Initialize without raising any exception (no OOM during
determine_available_memory / KV-cache allocation).
2. Successfully complete a short generation request.
The model is Qwen/Qwen3-0.6B (~0.5 GiB weights) and gpu_memory_utilization
is set to 0.4 per instance so that two instances comfortably fit on a single
64 GiB Ascend 910B card while leaving enough headroom to avoid the
pre-fix negative-KV-cache condition.
"""
import os
from tests.e2e.conftest import VllmRunner
# Request the "spawn" start method for worker processes via the environment.
# NOTE(review): presumably this has to be set before any engine is created,
# which is why it happens at import time — confirm against the runner.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

# Model loaded by both instances.
MODEL = "Qwen/Qwen3-0.6B"
# Single short prompt sent to each instance.
_PROMPTS = ["Hello, my name is"]
# Number of tokens requested per generation; kept small so the test is quick.
_MAX_TOKENS = 5
# Value passed as gpu_memory_utilization to each instance; 0.4 apiece so that
# two instances fit on one card together.
_GPU_MEM_UTIL = 0.4
# Value passed as max_model_len to each instance.
_MAX_MODEL_LEN = 512
def test_two_instances_on_single_card() -> None:
    """Check that two vllm-ascend instances can coexist on a single card.

    Regression test for PR #7427 (multi-instance OOM on single card).

    A first runner is opened and kept alive; a second runner with identical
    settings is then opened inside the first runner's context, so the second
    one initializes while the first still exists. Each runner performs one
    short greedy generation, and both results must be non-empty.

    Failure signature (pre-fix):
        RuntimeError / ValueError during the second instance's init, or
        "Available KV cache memory: -X.XX GiB" in the logs followed by
        zero KV blocks being allocated.
    """
    # Both instances are created with exactly the same engine settings.
    engine_kwargs = {
        "max_model_len": _MAX_MODEL_LEN,
        "gpu_memory_utilization": _GPU_MEM_UTIL,
        "enforce_eager": True,
    }

    with VllmRunner(MODEL, **engine_kwargs) as first_runner:
        # The second instance starts while the first one is still open.
        with VllmRunner(MODEL, **engine_kwargs) as second_runner:
            second_outputs = second_runner.generate_greedy(
                _PROMPTS, max_tokens=_MAX_TOKENS
            )
        first_outputs = first_runner.generate_greedy(
            _PROMPTS, max_tokens=_MAX_TOKENS
        )

    # Each runner must have returned at least one result.
    assert first_outputs, "First instance produced no outputs"
    assert second_outputs, "Second instance produced no outputs"

    # Each result is unpacked as a pair; only the second element (text) is checked.
    _, first_text = first_outputs[0]
    _, second_text = second_outputs[0]

    assert first_text, (
        "First instance output text is empty — model may have failed to run"
    )
    assert second_text, (
        "Second instance output text is empty — "
        "KV cache may have been allocated with zero blocks (pre-fix OOM regression)"
    )