Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions vllm/worker/hpu_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@

DUMMY_TOKEN_ID = -1
UNSET_IMG_ARGS = 9999999
shutdown_inc_called = False


class PhaseType(Enum):
Expand Down Expand Up @@ -3280,10 +3281,17 @@ def finish_measurements(self):
finalize_calibration(self.model.model)

def shutdown_inc(self):
can_finalize_inc = self._is_quant_with_inc() and \
(self.model.model is not None) and \
self.inc_initialized_successfully and \
not getattr(self, "_is_inc_finalized", False)
global shutdown_inc_called
if shutdown_inc_called:
return
shutdown_inc_called = True
can_finalize_inc = False
from contextlib import suppress
with suppress(AttributeError):
can_finalize_inc = (self._is_quant_with_inc()
and (self.model.model is not None)
and self.inc_initialized_successfully and
not getattr(self, "_is_inc_finalized", False))
if can_finalize_inc:
from neural_compressor.torch.quantization import (
finalize_calibration)
Expand Down Expand Up @@ -4169,6 +4177,10 @@ def _make_decode_output(
return SamplerOutput(sampler_outputs)

def shutdown_inc(self):
global shutdown_inc_called
if shutdown_inc_called:
return
shutdown_inc_called = True
can_finalize_inc = False
from contextlib import suppress
with suppress(AttributeError):
Expand Down
2 changes: 1 addition & 1 deletion vllm/worker/hpu_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,7 @@ def list_prompt_adapters(self) -> Set[int]:
"Prompt Adapter is not implemented for HPU backend.")

def shutdown(self):
self.model_runner.shutdown_inc()
getattr(self.model_runner, 'shutdown_inc', lambda: None)()

@property
def max_model_len(self) -> int:
Expand Down