Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion sgl-model-gateway/e2e_test/infra/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ class Runtime(str, Enum):

# Model loading configuration
INITIAL_GRACE_PERIOD = 30 # Wait before first health check (model loading time)
LAUNCH_STAGGER_DELAY = 5 # Delay between launching multiple workers
LAUNCH_STAGGER_DELAY = (
10 # Delay between launching multiple workers (avoid I/O contention)
)

# Retry configuration
MAX_RETRY_ATTEMPTS = (
Expand Down
33 changes: 27 additions & 6 deletions sgl-model-gateway/e2e_test/infra/model_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import logging
import os
import signal
import subprocess
import threading
import time
Expand Down Expand Up @@ -242,19 +243,39 @@ def _grpc_health_check(self, timeout: float = 5.0) -> bool:
return False

def terminate(self, timeout: float = 10.0) -> None:
"""Terminate the model server process."""
"""Terminate the model server process and all child processes.

Since workers are started with start_new_session=True, they run in their
own process group. We must kill the entire process group to ensure child
processes (e.g., TP workers) are also terminated and GPU memory is freed.
"""
if self.process.poll() is not None:
return # Already terminated

logger.info("Terminating %s (PID %d)", self.key, self.process.pid)
pid = self.process.pid
logger.info("Terminating %s (PID %d)", self.key, pid)

# Try graceful shutdown of the entire process group first
try:
pgid = os.getpgid(pid)
os.killpg(pgid, signal.SIGTERM)
except (ProcessLookupError, OSError) as e:
logger.debug("Could not send SIGTERM to process group: %s", e)
# Fall back to terminating just the main process
self.process.terminate()

# Try graceful shutdown first
self.process.terminate()
try:
self.process.wait(timeout=timeout)
except subprocess.TimeoutExpired:
logger.warning("%s did not terminate, killing", self.key)
self.process.kill()
logger.warning("%s did not terminate, killing process group", self.key)
# Force kill the entire process group
try:
pgid = os.getpgid(pid)
os.killpg(pgid, signal.SIGKILL)
except (ProcessLookupError, OSError) as e:
logger.debug("Could not send SIGKILL to process group: %s", e)
self.process.kill()

try:
self.process.wait(timeout=5) # Brief timeout after kill
except subprocess.TimeoutExpired:
Expand Down
Loading