HabanaAI · xuechendi · Apr 30, 2025 · Apr 24, 2025 · Apr 24, 2025 · Apr 24, 2025
@@ -89,13 +89,13 @@ stages:
     steps:
       - name: gsm8k_small_g2_tp1_mlp_spec_decode
         flavor: g2
-        command: PT_HPU_LAZY_MODE=1 TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_mlp_correctness.py::test_mlp_e2e_greedy_correctness 
+        command: PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_mlp_correctness.py::test_mlp_e2e_greedy_correctness 
       - name: gsm8k_small_g2_tp1_medusa_spec_decode
         flavor: g2
-        command: PT_HPU_LAZY_MODE=1 TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_medusa_correctness.py::test_medusa_e2e_greedy_correctness
-      # - name: gsm8k_small_g2_tp1_eagle_spec_decode
-      #   flavor: g2
-      #   command: PT_HPU_LAZY_MODE=1 VLLM_COS_SIN_RECOMPUTE=true TORCH_COMPILE_DISABLE=true VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_eagle_correctness.py::test_eagle_e2e_greedy_correctness
+        command: PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_medusa_correctness.py::test_medusa_e2e_greedy_correctness
+      - name: gsm8k_small_g2_tp1_eagle_spec_decode
+        flavor: g2
+        command: PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=True pytest -v tests/spec_decode/e2e/test_eagle_correctness.py::test_eagle_e2e_greedy_correctness
   # - name: tests_lora
     # steps:
       # - name: test_llama_lora

@@ -390,11 +390,14 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
             # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here,
             # so we're using a workaround. Remove this when fixed in
             # HPU PT bridge.
-            padded_weight = torch.cat([
-                loaded_weight,
-                torch.zeros(param.shape[0] - loaded_weight.shape[0],
-                            *loaded_weight.shape[1:])
-            ])
+            if param.shape[0] > loaded_weight.shape[0]:
+                padded_weight = torch.cat([
+                    loaded_weight,
+                    torch.zeros(param.shape[0] - loaded_weight.shape[0],
+                                *loaded_weight.shape[1:])
+                ])
+            else:
+                padded_weight = loaded_weight
             param.data.copy_(padded_weight)
         else:
             param[:loaded_weight.shape[0]].data.copy_(loaded_weight)

@@ -303,30 +303,6 @@ def _contract_batch_all_spec(
         # Map distinct sequences used to score each token
         # of shape [batch_size * k + 1] back to [batch_size, k + 1].
         contracted_bs, k = proposals.proposal_token_ids.shape
-        if current_platform.is_hpu():
-            (
-                target_sampler_output.sampled_token_ids,
-                target_sampler_output.sampled_token_probs,
-                target_sampler_output.logprobs,
-                target_sampler_output.hidden_states,
-                _,
-                _,
-                _,
-                _,
-            ) = self._split_scoring_output_hpu(target_sampler_output,
-                                               num_scoring_tokens)
-        else:
-            (
-                target_sampler_output.sampled_token_ids,
-                target_sampler_output.sampled_token_probs,
-                target_sampler_output.logprobs,
-                target_sampler_output.hidden_states,
-                _,
-                _,
-                _,
-                _,
-            ) = self._split_scoring_output(target_sampler_output,
-                                           num_scoring_tokens)
 
         # Reshape tensors to original batch size
         target_token_ids = target_sampler_output.sampled_token_ids.reshape(

@@ -6,6 +6,7 @@
 
 from vllm.forward_context import set_forward_context
 from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.platforms import current_platform
 
 try:
     try:
@@ -19,9 +20,10 @@
             from vllm.attention.backends.hpu_attn import (
                 HPUPagedAttentionMetadata as FlashAttentionMetadata)
 except (ModuleNotFoundError, ImportError, AssertionError) as err:
-    raise RuntimeError(
-        "Draft model speculative decoding currently only supports"
-        "CUDA and ROCm and HPU attention backend.") from err
+    # Only CUDA-like platforms treat the failed backend import as fatal; on
+    # any other platform the error is ignored and module import continues.
+    if current_platform.is_cuda_alike():
+        raise RuntimeError(
+            "Draft model speculative decoding currently only supports "
+            "CUDA and ROCm and HPU attention backend.") from err
 
 from vllm.logger import init_logger
 from vllm.multimodal import MultiModalKwargs
@@ -40,6 +42,14 @@
 allow_gpu_advance_step = True
 
 
+class GeneralTP1DraftModelRunner(ModelRunnerWrapperBase):
+    """Minimal draft-model runner wrapper.
+
+    Adds no behaviour of its own beyond ``ModelRunnerWrapperBase``; it only
+    initialises ``indices_of_seq_with_bonus_tokens``.
+    """
+
+    def __init__(self, model_runner: ModelRunnerBase) -> None:
+        """Wrap ``model_runner`` and reset the bonus-token index state."""
+        super().__init__(model_runner)
+        # No bonus-token sequence indices are known at construction time.
+        # NOTE(review): presumably filled in later by the caller - confirm.
+        self.indices_of_seq_with_bonus_tokens = None
+
+
 class TP1DraftModelRunner(ModelRunnerWrapperBase):
     """Specialized model runner for speculative decoding draft model.
     Since the draft model always execute k forward passes consecutively to

@@ -29,8 +29,8 @@
 
 if current_platform.is_cuda_alike():
     from vllm.spec_decode.draft_model_runner import TP1DraftModelRunner
-if current_platform.is_hpu():
-    from vllm.spec_decode.hpu_draft_model_runner import HPUTP1DraftModelRunner
+else:
+    from vllm.spec_decode.draft_model_runner import GeneralTP1DraftModelRunner
 
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeScorer, SpeculativeScores)
@@ -190,7 +190,7 @@ def create_worker(
                             "model_runner_cls"] = TP1DraftModelRunner
                     elif current_platform.is_hpu():
                         draft_worker_kwargs[
-                            "model_runner_cls"] = HPUTP1DraftModelRunner
+                            "model_runner_cls"] = GeneralTP1DraftModelRunner
                 else:
                     if draft_model_config.hf_config.model_type == "eagle":
                         raise NotImplementedError(

@@ -731,6 +731,8 @@ def __init__(
         # For delayed sampling
         self.cached_step_inputs: List[
             ModelInputForHPUWithSamplingMetadata] = []
+        self.spec_decode_enabled = \
+            self.vllm_config.speculative_config is not None
 
     def _set_gc_threshold(self) -> None:
         """
@@ -2581,6 +2583,8 @@ def execute_model(
         assert not (use_delayed_sampling and
             self.parallel_config.pipeline_parallel_size != 1), \
             'Delayed sampling is not compatible with Pipeline Parallelism!'
+        assert not (use_delayed_sampling and self.spec_decode_enabled), \
+            'Delayed sampling is not compatible with speculative decoding!'
         assert model_input.input_tokens is not None
         if use_delayed_sampling and not model_input.is_prompt and \
                 self.is_driver_worker:
@@ -2672,6 +2676,21 @@ def execute_model(
                 **(model_input.multi_modal_kwargs or {}),
             }
             if previous_hidden_states is not None:
+                # HPU will pad up to block_size,
+                # pad previous_hidden_states as well
+                previous_hidden_states = previous_hidden_states.unsqueeze(
+                    1).expand(-1, input_tokens.shape[-1], -1)
+                batch_size_padding = batch_size - previous_hidden_states.shape[
+                    0]
+                if batch_size_padding > 0:
+                    dummy_previous_hidden_states = torch.zeros(
+                        batch_size_padding,
+                        *previous_hidden_states.shape[1:],
+                        dtype=previous_hidden_states.dtype,
+                        device=previous_hidden_states.device)
+                    previous_hidden_states = torch.cat(
+                        [previous_hidden_states, dummy_previous_hidden_states],
+                        dim=0)
                 execute_model_kwargs.update(
                     {"previous_hidden_states": previous_hidden_states})
             if htorch.utils.internal.is_lazy():
@@ -2872,12 +2891,21 @@ def try_revert_dummy_output_tokens():
                     is_prompt=is_prompt)
                 self.profiler.record_counter(self.event_start, counters)
             if num_steps == 1:
+                if self.spec_decode_enabled and isinstance(
+                        output, SamplerOutput):
+                    output.sampled_token_ids = output.sampled_token_ids[:
+                                                                        real_batch_size]
+                    output.sampled_token_probs = output.sampled_token_probs[:
+                                                                            real_batch_size]
+                    output.logprobs = output.logprobs[:real_batch_size]
                 if self.return_hidden_states:
                     # we only need to pass hidden states of most recent token
                     assert model_input.sampling_metadata is not None
+                    hidden_states = hidden_states[:real_batch_size]
                     if model_input.is_prompt:
                         output.prefill_hidden_states = hidden_states
                     output.hidden_states = hidden_states
+
                 if use_delayed_sampling:
                     if self.is_driver_worker:
                         return [fake_output]

@@ -77,8 +77,8 @@ def __init__(
         speculative_config = self.speculative_config
         model_config = self.model_config
         speculative_args = {} if speculative_config is None \
-            or (speculative_config.draft_model_config.model ==
-                model_config.model) \
+            or (speculative_config.draft_model_config.hf_config.model_type \
+                == model_config.hf_config.model_type) \
             or (speculative_config.draft_model_config.hf_config.model_type
                 not in ["medusa", "mlp_speculator", "eagle"]) \
                     else {"return_hidden_states": True}

@@ -262,6 +262,17 @@ def __init__(
     def __getattr__(self, attr):
         return getattr(self.model_runner, attr)
 
+    def __setattr__(self, name, value):
+        """Forward attribute writes to the wrapped model runner.
+
+        Only ``model_runner`` itself is stored on the wrapper; every other
+        name is set on ``self.model_runner``.
+        """
+        if name != "model_runner":
+            setattr(self.model_runner, name, value)
+            return
+        # Bypass this override so the wrapper keeps its own reference.
+        object.__setattr__(self, name, value)
+
 
 class InputProcessingError(Exception):
     """This exception is raised when an error occurs preparing the inputs for