
Commit cf1da38

alisonshao authored and tonyluj committed
Revert PR sgl-project#14044: Restore separate memory pool for piecewise CUDA graph (sgl-project#14278)
1 parent 7ac73b8 commit cf1da38

2 files changed, +14 -6 lines

python/sglang/srt/distributed/device_communicators/pynccl_allocator.py

Lines changed: 1 addition & 2 deletions
@@ -71,8 +71,7 @@ def is_symmetric_memory_enabled():
 
 def set_graph_pool_id(graph_pool_id):
     global _graph_pool_id
-    if _graph_pool_id is not None:
-        _graph_pool_id = graph_pool_id
+    _graph_pool_id = graph_pool_id
 
 
 def disable_symmetric_memory_context():
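
After this change, set_graph_pool_id unconditionally overwrites the stored _graph_pool_id, whereas the reverted version only overwrote it when a pool id had already been set. A minimal sketch of how a caller might register a capture pool with this allocator is shown below; the capture code around it is an illustrative assumption, not SGLang's actual capture path.

import torch

from sglang.srt.distributed.device_communicators.pynccl_allocator import (
    set_graph_pool_id,
)

# Illustrative capture setup (assumed, not taken from the repository).
pool = torch.cuda.graph_pool_handle()  # opaque handle to a CUDA graph memory pool
set_graph_pool_id(pool)                # after this commit, always overwrites the stored id

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, pool=pool):  # allocations during capture draw from this pool
    pass  # run the model once under capture here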

python/sglang/srt/model_executor/piecewise_cuda_graph_runner.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,6 @@
4545
from sglang.srt.layers.logits_processor import LogitsProcessorOutput
4646
from sglang.srt.layers.pooler import EmbeddingPoolerOutput
4747
from sglang.srt.layers.torchao_utils import save_gemlite_cache
48-
from sglang.srt.model_executor.cuda_graph_runner import (
49-
get_global_graph_memory_pool,
50-
set_global_graph_memory_pool,
51-
)
5248
from sglang.srt.model_executor.forward_batch_info import (
5349
CaptureHiddenMode,
5450
ForwardBatch,
@@ -147,6 +143,19 @@ def patch_model(model: torch.nn.Module, compiler: str):
147143
_to_torch(model, reverse=True, num_tokens=16)
148144

149145

146+
# Reuse this memory pool across all cuda graph runners.
147+
global_graph_memory_pool = None
148+
149+
150+
def get_global_graph_memory_pool():
151+
return global_graph_memory_pool
152+
153+
154+
def set_global_graph_memory_pool(val):
155+
global global_graph_memory_pool
156+
global_graph_memory_pool = val
157+
158+
150159
def set_torch_compile_config():
151160
import torch._dynamo.config
152161
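
This hunk drops the import of the shared pool helpers from cuda_graph_runner and restores a module-local global_graph_memory_pool, so piecewise CUDA graph captures reuse a pool among themselves but no longer share one with the main CUDA graph runner. The sketch below shows how such helpers are typically wired into a torch.cuda.graph capture; the capture_piece wrapper and its argument are hypothetical and only illustrate the intended reuse pattern.

import torch

from sglang.srt.model_executor.piecewise_cuda_graph_runner import (
    get_global_graph_memory_pool,
    set_global_graph_memory_pool,
)


def capture_piece(run_once):
    # Hypothetical helper: capture one piecewise segment into the shared pool.
    graph = torch.cuda.CUDAGraph()
    # On the first capture the pool is None and PyTorch allocates a fresh pool;
    # later captures pass the remembered handle so they share the same memory.
    with torch.cuda.graph(graph, pool=get_global_graph_memory_pool()):
        run_once()
    # Remember this graph's pool for subsequent piecewise captures.
    set_global_graph_memory_pool(graph.pool())
    return graph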
