import os
import torch
from vllm import LLM, SamplingParams
from vllm.utils.mem_constants import GiB_bytes
# Configure vLLM through process environment variables.
# NOTE(review): per vLLM's environment-variable documentation,
# VLLM_USE_MODELSCOPE is expected to switch model downloads to ModelScope and
# VLLM_WORKER_MULTIPROC_METHOD to select the worker process start method --
# confirm against the installed vLLM version.
os.environ.update(
    {
        "VLLM_USE_MODELSCOPE": "True",
        "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
    }
)
def main():
    """Run a sleep/wake-up round trip on an LLM and check memory and output.

    Steps, in order:

    1. Read free/total NPU memory and record how much is already in use.
    2. Build an ``LLM`` with ``enable_sleep_mode=True`` and generate once.
    3. Call ``sleep(level=1)`` and read NPU memory again; assert that the
       memory used beyond the initial baseline is below 1 GiB.
    4. Call ``wake_up()``, generate again with the same prompt and sampling
       parameters, and assert the generated text matches the first run.

    Raises:
        AssertionError: if post-sleep memory use is 1 GiB or more above the
            baseline, or if the two generations differ.
    """
    prompt = "How are you?"

    # Snapshot device memory before the engine is created so that whatever
    # was already in use on the device is not counted against the engine.
    free, total = torch.npu.mem_get_info()
    print(f"Free memory before sleep: {free / 1024**3:.2f} GiB")
    baseline_used = total - free

    engine = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)
    # NOTE(review): temperature=0 is presumably what makes the two
    # generations comparable for exact equality below -- confirm.
    params = SamplingParams(temperature=0, max_tokens=10)
    first_run = engine.generate(prompt, params)

    # See the vLLM sleep-mode documentation for what level 1 releases.
    engine.sleep(level=1)

    free_npu_bytes_after_sleep, total = torch.npu.mem_get_info()
    print(f"Free memory after sleep: {free_npu_bytes_after_sleep / 1024**3:.2f} GiB")

    # Memory attributable to this process while asleep: current usage minus
    # the pre-existing baseline.
    extra_used = (total - free_npu_bytes_after_sleep) - baseline_used
    budget = 1 * GiB_bytes
    assert extra_used < budget

    engine.wake_up()
    second_run = engine.generate(prompt, params)

    # The engine must produce the same text after waking as it did before.
    text_before = first_run[0].outputs[0].text
    text_after = second_run[0].outputs[0].text
    assert text_before == text_after
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()