@@ -24,10 +24,11 @@ from vllm.v1.worker.gpu.sample.gumbel import gumbel_sample
from vllm_ascend.utils import vllm_version_is
-if vllm_version_is("v0.16.0"):
- from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs
-else:
- from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
+#if vllm_version_is("v0.16.0"):
+# from vllm.v1.worker.gpu.spec_decode.eagle import prepare_eagle_decode, prepare_eagle_inputs
+#else:
+# from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
+from vllm.v1.worker.gpu.spec_decode.eagle.speculator import prepare_eagle_decode, prepare_eagle_inputs
from vllm_ascend.worker.v2.attn_utils import build_attn_metadata
@@ -175,7 +176,8 @@ def propose(
return self.draft_tokens[:num_reqs]
-if vllm_version_is("v0.16.0"):
- vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose
-else:
- vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
+#if vllm_version_is("v0.16.0"):
+# vllm.v1.worker.gpu.spec_decode.eagle.EagleSpeculator.propose = propose
+#else:
+# vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
+vllm.v1.worker.gpu.spec_decode.eagle.speculator.EagleSpeculator.propose = propose
@@ -1894,7 +1894,7 @@ class NPUModelRunner(GPUModelRunner):
num_tokens=num_tokens,
has_lora=has_lora,
uniform_decode=uniform_decode,
- disable_full=disable_full,
+ #disable_full=disable_full,
num_active_loras=num_active_loras,
)
else: