diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index aa65ea844e2e..400a65cfa659 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -90,6 +90,7 @@
 
 DUMMY_TOKEN_ID = -1
 UNSET_IMG_ARGS = 9999999
+shutdown_inc_called = False
 
 
 class PhaseType(Enum):
@@ -3280,10 +3281,17 @@ def finish_measurements(self):
         finalize_calibration(self.model.model)
 
     def shutdown_inc(self):
-        can_finalize_inc = self._is_quant_with_inc() and \
-            (self.model.model is not None) and \
-            self.inc_initialized_successfully and \
-            not getattr(self, "_is_inc_finalized", False)
+        global shutdown_inc_called
+        if shutdown_inc_called:
+            return
+        shutdown_inc_called = True
+        can_finalize_inc = False
+        from contextlib import suppress
+        with suppress(AttributeError):
+            can_finalize_inc = (self._is_quant_with_inc()
+                                and (self.model.model is not None)
+                                and self.inc_initialized_successfully and
+                                not getattr(self, "_is_inc_finalized", False))
         if can_finalize_inc:
             from neural_compressor.torch.quantization import (
                 finalize_calibration)
@@ -4169,6 +4177,10 @@ def _make_decode_output(
         return SamplerOutput(sampler_outputs)
 
     def shutdown_inc(self):
+        global shutdown_inc_called
+        if shutdown_inc_called:
+            return
+        shutdown_inc_called = True
         can_finalize_inc = False
         from contextlib import suppress
         with suppress(AttributeError):
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 1b47e729502b..a8cfc775b0d4 100755
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -506,7 +506,7 @@ def list_prompt_adapters(self) -> Set[int]:
             "Prompt Adapter is not implemented for HPU backend.")
 
     def shutdown(self):
-        self.model_runner.shutdown_inc()
+        getattr(self.model_runner, 'shutdown_inc', lambda: None)()
 
     @property
     def max_model_len(self) -> int: