Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/e2e_ppo_trainer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ jobs:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
Expand Down Expand Up @@ -215,7 +215,7 @@ jobs:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
Expand Down Expand Up @@ -244,7 +244,7 @@ jobs:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
Expand Down Expand Up @@ -273,7 +273,7 @@ jobs:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4
options: --gpus all --shm-size=50g # Visual dataloader requires large memory
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/sgl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ jobs:
HF_HUB_ENABLE_HF_TRANSFER: 1
SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True"
container:
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.rocm
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# Support - Training: fsdp; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post1-rocm630
FROM lmsysorg/sglang:v0.4.6.post4-rocm630

# Set working directory
# WORKDIR $PWD/app
Expand Down
4 changes: 2 additions & 2 deletions docker/Dockerfile.sglang
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip

# Install sglang-0.4.6.post1 and torch-memory-saver
RUN pip install "sglang[all]==0.4.6.post1" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Install sglang-0.4.6.post4 and torch-memory-saver
RUN pip install "sglang[all]==0.4.6.post4" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir

# Install torch-2.6.0
RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.vllm.sglang.megatron
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"

# Install Sglang
RUN pip install --no-deps "sglang[all]>=0.4.5.post3"
RUN pip install --no-deps "sglang[all]>=0.4.6.post3"

# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
Expand Down
2 changes: 1 addition & 1 deletion docs/amd_tutorial/amd_build_dockerfile_page.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ docker/Dockerfile.rocm
# Support - Training: fsdp; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post1-rocm630
FROM lmsysorg/sglang:v0.4.6.post4-rocm630

# Set working directory
# WORKDIR $PWD/app
Expand Down
2 changes: 1 addition & 1 deletion docs/start/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ For vLLM with Megatron or FSDP, please use the stable version of image ``whatcan

For latest vLLM with FSDP, please refer to ``hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0``.

For SGLang with FSDP, please use ``ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1`` which is provided by SGLang RL Group.
For SGLang with FSDP, please use ``ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4`` which is provided by SGLang RL Group.

See files under ``docker/`` for NGC-based image or if you want to build your own.

Expand Down
2 changes: 1 addition & 1 deletion docs/workers/sglang_worker.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Please always follow the following command to install SGLang with verl.

.. code-block:: bash
pip install --upgrade pip
# Currently 0.4.6.post1, subject to updates at any time, please refer to the latest version specified in `setup.py`
# Currently 0.4.6.post4, subject to updates at any time, please refer to the latest version specified in `setup.py`
pip install -e ".[sglang]"

Using SGLang as the Inference Backend for PPO Training on a Single Machine
Expand Down
2 changes: 1 addition & 1 deletion requirements_sglang.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ torchdata
torchvision
transformers
wandb
sglang[all]==0.4.4.post4
sglang[all]==0.4.6.post4
torch-memory-saver>=0.0.5
2 changes: 1 addition & 1 deletion scripts/install_vllm_sglang_mcore.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ export MAX_JOBS=32
echo "1. install inference frameworks and pytorch they need"
pip install --no-cache-dir "vllm==0.8.4" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata
if [ $USE_SGLANG -eq 1 ]; then
pip install --no-deps "sglang[all]>=0.4.5.post3"
pip install --no-deps "sglang[all]>=0.4.6.post3"
fi


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
VLLM_REQUIRES = ["tensordict<=0.6.2", "vllm<=0.8.5"]
SGLANG_REQUIRES = [
"tensordict<=0.6.2",
"sglang[srt,openai]==0.4.6.post1",
"sglang[srt,openai]==0.4.6.post4",
"torch-memory-saver>=0.0.5",
]

Expand Down
30 changes: 16 additions & 14 deletions verl/workers/rollout/sglang_rollout/async_sglang_rollout.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,20 @@
from verl import DataProto
from verl.third_party.sglang import parallel_state as sglang_ps
from verl.tools.base_tool import BaseTool
from verl.tools.schemas import OpenAIFunctionCallSchema, OpenAIFunctionParsedSchema, OpenAIFunctionToolCall
from verl.tools.schemas import (OpenAIFunctionCallSchema,
OpenAIFunctionParsedSchema,
OpenAIFunctionToolCall)
from verl.utils.debug import GPUMemoryLogger
from verl.utils.model import compute_position_id_with_mask
from verl.utils.net_utils import is_ipv6
from verl.utils.torch_functional import get_response_mask, pad_sequence_to_length
from verl.utils.torch_functional import (get_response_mask,
pad_sequence_to_length)
from verl.workers.rollout.base import BaseRollout
from verl.workers.rollout.schemas import (
AsyncRolloutRequest,
AsyncRolloutRequestStateEnum,
FinishReasonTypeEnum,
Message,
)
from verl.workers.rollout.sglang_rollout.sglang_rollout import _post_process_outputs, _pre_process_inputs
from verl.workers.rollout.schemas import (AsyncRolloutRequest,
AsyncRolloutRequestStateEnum,
FinishReasonTypeEnum, Message)
from verl.workers.rollout.sglang_rollout.sglang_rollout import (
_post_process_outputs, _pre_process_inputs)

if TYPE_CHECKING:
from torch import nn
Expand Down Expand Up @@ -191,6 +192,7 @@ def initialize_tools(tools_config) -> list:
dist.all_gather_object(visible_devices, os.environ["CUDA_VISIBLE_DEVICES"], device_mesh_cpu.get_group("tp"))
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(visible_devices)

self._rank = device_mesh_cpu["tp"].get_rank()
# initialize the inference engine
monkey_patch_torch_reductions()
nnodes = -(-tp_size // len(visible_devices))
Expand All @@ -199,7 +201,7 @@ def initialize_tools(tools_config) -> list:
port = get_open_port() if port is None else port
[ip, port] = broadcast_pyobj(
[ip, port],
rank=self._tp_rank,
rank=self._rank,
dist_group=device_mesh_cpu.get_group("tp"),
src=device_mesh_cpu["tp"].mesh[0].item(),
force_cpu_device=False,
Expand Down Expand Up @@ -371,7 +373,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:
# Most naive implementation, can extract tensor and send via gloo if too slow
[output] = broadcast_pyobj(
data=[output],
rank=self._tp_rank,
rank=self._rank,
dist_group=self._device_mesh_cpu["tp"].get_group(),
src=self._device_mesh_cpu["tp"].mesh[0].item(),
force_cpu_device=False,
Expand Down Expand Up @@ -423,7 +425,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:

# free cache engine
if self.config.free_cache_engine and self._engine is not None:
self._engine.tokenizer_manager.flush_cache()
self._engine.flush_cache()

return DataProto(batch=batch)

Expand Down Expand Up @@ -591,7 +593,7 @@ def generate_sequences_with_tools(self, prompts: DataProto, **kwargs) -> DataPro

[sorted_output_req_list] = broadcast_pyobj(
data=[sorted_output_req_list],
rank=self._tp_rank,
rank=self._rank,
dist_group=self._device_mesh_cpu["tp"].get_group(),
src=self._device_mesh_cpu["tp"].mesh[0].item(),
force_cpu_device=False,
Expand Down Expand Up @@ -681,7 +683,7 @@ def generate_sequences_with_tools(self, prompts: DataProto, **kwargs) -> DataPro

# free cache engine
if self.config.free_cache_engine and self._engine is not None and self._tp_rank == 0:
self._engine.tokenizer_manager.flush_cache()
self._engine.flush_cache()

return DataProto(batch=batch, non_tensor_batch={"messages": np.array(messages), "reward_scores": np.array(reward_scores)})

Expand Down