From 4cb3535c70e4251001e4fab7ba9a4fea2e1b234c Mon Sep 17 00:00:00 2001 From: Hollow Man Date: Mon, 12 May 2025 01:22:45 +0300 Subject: [PATCH 1/2] [Bug] Bump SGLang version to 0.4.6.post4; Fix AsyncSGLangRollout Similar to https://github.com/sgl-project/sglang/pull/5997 In the PP PR https://github.com/sgl-project/sglang/pull/5724, the broadcast_pyobj function changed its condition from checking rank==0 (i.e. whether rank is local rank 0 of the passed ProcessGroup) to checking rank==src (i.e. whether rank is the global rank src), which breaks VerlEngine's broadcast logic when dp>1 and tp>1. Signed-off-by: Hollow Man --- docker/Dockerfile.sglang | 4 +-- docker/Dockerfile.vllm.sglang.megatron | 2 +- requirements_sglang.txt | 2 +- scripts/install_vllm_sglang_mcore.sh | 2 +- setup.py | 2 +- .../sglang_rollout/async_sglang_rollout.py | 30 ++++++++++--------- 6 files changed, 22 insertions(+), 20 deletions(-) diff --git a/docker/Dockerfile.sglang b/docker/Dockerfile.sglang index 7e95c799049..1a8c16d2bff 100644 --- a/docker/Dockerfile.sglang +++ b/docker/Dockerfile.sglang @@ -36,8 +36,8 @@ RUN pip config set global.index-url "${PIP_INDEX}" && \ pip config set global.extra-index-url "${PIP_INDEX}" && \ python -m pip install --upgrade pip -# Install sglang-0.4.6.post1 and torch-memory-saver -RUN pip install "sglang[all]==0.4.6.post1" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir +# Install sglang-0.4.6.post4 and torch-memory-saver +RUN pip install "sglang[all]==0.4.6.post4" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir # Install torch-2.6.0 RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \ diff --git a/docker/Dockerfile.vllm.sglang.megatron b/docker/Dockerfile.vllm.sglang.megatron index d7b205d5b9e..12010950286 100644 --- a/docker/Dockerfile.vllm.sglang.megatron +++ 
b/docker/Dockerfile.vllm.sglang.megatron @@ -81,7 +81,7 @@ RUN pip uninstall -y pynvml nvidia-ml-py && \ pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1" # Install Sglang -RUN pip install --no-deps "sglang[all]>=0.4.5.post3" +RUN pip install --no-deps "sglang[all]>=0.4.6.post3" # Install cudnn RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \ diff --git a/requirements_sglang.txt b/requirements_sglang.txt index b8112110e69..470851a4bf7 100644 --- a/requirements_sglang.txt +++ b/requirements_sglang.txt @@ -17,5 +17,5 @@ torchdata torchvision transformers wandb -sglang[all]==0.4.4.post4 +sglang[all]==0.4.6.post4 torch-memory-saver>=0.0.5 \ No newline at end of file diff --git a/scripts/install_vllm_sglang_mcore.sh b/scripts/install_vllm_sglang_mcore.sh index 5afe7923321..9bcc51c0140 100644 --- a/scripts/install_vllm_sglang_mcore.sh +++ b/scripts/install_vllm_sglang_mcore.sh @@ -8,7 +8,7 @@ export MAX_JOBS=32 echo "1. 
install inference frameworks and pytorch they need" pip install --no-cache-dir "vllm==0.8.4" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata if [ $USE_SGLANG -eq 1 ]; then - pip install --no-deps "sglang[all]>=0.4.5.post3" + pip install --no-deps "sglang[all]>=0.4.6.post3" fi diff --git a/setup.py b/setup.py index d912c4515c0..25540673e27 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ VLLM_REQUIRES = ["tensordict<=0.6.2", "vllm<=0.8.5"] SGLANG_REQUIRES = [ "tensordict<=0.6.2", - "sglang[srt,openai]==0.4.6.post1", + "sglang[srt,openai]==0.4.6.post4", "torch-memory-saver>=0.0.5", ] diff --git a/verl/workers/rollout/sglang_rollout/async_sglang_rollout.py b/verl/workers/rollout/sglang_rollout/async_sglang_rollout.py index 797ba48bda1..90d248360bc 100644 --- a/verl/workers/rollout/sglang_rollout/async_sglang_rollout.py +++ b/verl/workers/rollout/sglang_rollout/async_sglang_rollout.py @@ -41,19 +41,20 @@ from verl import DataProto from verl.third_party.sglang import parallel_state as sglang_ps from verl.tools.base_tool import BaseTool -from verl.tools.schemas import OpenAIFunctionCallSchema, OpenAIFunctionParsedSchema, OpenAIFunctionToolCall +from verl.tools.schemas import (OpenAIFunctionCallSchema, + OpenAIFunctionParsedSchema, + OpenAIFunctionToolCall) from verl.utils.debug import GPUMemoryLogger from verl.utils.model import compute_position_id_with_mask from verl.utils.net_utils import is_ipv6 -from verl.utils.torch_functional import get_response_mask, pad_sequence_to_length +from verl.utils.torch_functional import (get_response_mask, + pad_sequence_to_length) from verl.workers.rollout.base import BaseRollout -from verl.workers.rollout.schemas import ( - AsyncRolloutRequest, - AsyncRolloutRequestStateEnum, - FinishReasonTypeEnum, - Message, -) -from verl.workers.rollout.sglang_rollout.sglang_rollout import _post_process_outputs, _pre_process_inputs +from verl.workers.rollout.schemas import (AsyncRolloutRequest, + 
AsyncRolloutRequestStateEnum, + FinishReasonTypeEnum, Message) +from verl.workers.rollout.sglang_rollout.sglang_rollout import ( + _post_process_outputs, _pre_process_inputs) if TYPE_CHECKING: from torch import nn @@ -191,6 +192,7 @@ def initialize_tools(tools_config) -> list: dist.all_gather_object(visible_devices, os.environ["CUDA_VISIBLE_DEVICES"], device_mesh_cpu.get_group("tp")) os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(visible_devices) + self._rank = device_mesh_cpu["tp"].get_rank() # initialize the inference engine monkey_patch_torch_reductions() nnodes = -(-tp_size // len(visible_devices)) @@ -199,7 +201,7 @@ def initialize_tools(tools_config) -> list: port = get_open_port() if port is None else port [ip, port] = broadcast_pyobj( [ip, port], - rank=self._tp_rank, + rank=self._rank, dist_group=device_mesh_cpu.get_group("tp"), src=device_mesh_cpu["tp"].mesh[0].item(), force_cpu_device=False, @@ -371,7 +373,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto: # Most naive implementation, can extract tensor and send via gloo if too slow [output] = broadcast_pyobj( data=[output], - rank=self._tp_rank, + rank=self._rank, dist_group=self._device_mesh_cpu["tp"].get_group(), src=self._device_mesh_cpu["tp"].mesh[0].item(), force_cpu_device=False, @@ -423,7 +425,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto: # free cache engine if self.config.free_cache_engine and self._engine is not None: - self._engine.tokenizer_manager.flush_cache() + self._engine.flush_cache() return DataProto(batch=batch) @@ -591,7 +593,7 @@ def generate_sequences_with_tools(self, prompts: DataProto, **kwargs) -> DataPro [sorted_output_req_list] = broadcast_pyobj( data=[sorted_output_req_list], - rank=self._tp_rank, + rank=self._rank, dist_group=self._device_mesh_cpu["tp"].get_group(), src=self._device_mesh_cpu["tp"].mesh[0].item(), force_cpu_device=False, @@ -681,7 +683,7 @@ def generate_sequences_with_tools(self, prompts: DataProto, 
**kwargs) -> DataPro # free cache engine if self.config.free_cache_engine and self._engine is not None and self._tp_rank == 0: - self._engine.tokenizer_manager.flush_cache() + self._engine.flush_cache() return DataProto(batch=batch, non_tensor_batch={"messages": np.array(messages), "reward_scores": np.array(reward_scores)}) From 22276296a10732fe281f4563bf89395c96091c72 Mon Sep 17 00:00:00 2001 From: Hollow Man Date: Thu, 15 May 2025 20:00:00 +0300 Subject: [PATCH 2/2] Bump CI sglang version Signed-off-by: Hollow Man --- .github/workflows/e2e_ppo_trainer.yml | 8 ++++---- .github/workflows/sgl.yml | 2 +- docker/Dockerfile.rocm | 2 +- docs/amd_tutorial/amd_build_dockerfile_page.rst | 2 +- docs/start/install.rst | 2 +- docs/workers/sglang_worker.rst | 2 +- 6 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/e2e_ppo_trainer.yml b/.github/workflows/e2e_ppo_trainer.yml index 7c3fc615897..6cd4b65b696 100644 --- a/.github/workflows/e2e_ppo_trainer.yml +++ b/.github/workflows/e2e_ppo_trainer.yml @@ -186,7 +186,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1 + image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -215,7 +215,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1 + image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -244,7 +244,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1 + image: 
ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 @@ -273,7 +273,7 @@ jobs: HF_ENDPOINT: "https://hf-mirror.com" HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable container: - image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1 + image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4 options: --gpus all --shm-size=50g # Visual dataloader requires large memory steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/.github/workflows/sgl.yml b/.github/workflows/sgl.yml index 6be11e8dda4..ce4147642f4 100644 --- a/.github/workflows/sgl.yml +++ b/.github/workflows/sgl.yml @@ -56,7 +56,7 @@ jobs: HF_HUB_ENABLE_HF_TRANSFER: 1 SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True" container: - image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1 + image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4 options: --gpus all --shm-size=10g steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 89648e5a4d5..d8992c4a4e6 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -6,7 +6,7 @@ # Support - Traing: fsdp; Inference: vllm # FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 # Support - Traing: fsdp; Inference: vllm, sglang -FROM lmsysorg/sglang:v0.4.6.post1-rocm630 +FROM lmsysorg/sglang:v0.4.6.post4-rocm630 # Set working directory # WORKDIR $PWD/app diff --git a/docs/amd_tutorial/amd_build_dockerfile_page.rst b/docs/amd_tutorial/amd_build_dockerfile_page.rst index c6041d7121b..8f45f89affa 100644 --- a/docs/amd_tutorial/amd_build_dockerfile_page.rst +++ b/docs/amd_tutorial/amd_build_dockerfile_page.rst @@ -22,7 +22,7 @@ docker/Dockerfile.rocm # Support - Traing: fsdp; Inference: vllm # FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 # Support - Traing: fsdp; 
Inference: vllm, sglang - FROM lmsysorg/sglang:v0.4.6.post1-rocm630 + FROM lmsysorg/sglang:v0.4.6.post4-rocm630 # Set working directory # WORKDIR $PWD/app diff --git a/docs/start/install.rst b/docs/start/install.rst index e66c0cd6178..110282b9563 100644 --- a/docs/start/install.rst +++ b/docs/start/install.rst @@ -42,7 +42,7 @@ For vLLM with Megatron or FSDP, please use the stable version of image ``whatcan For latest vLLM with FSDP, please refer to ``hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0``. -For SGLang with FSDP, please use ``ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1`` which is provided by SGLang RL Group. +For SGLang with FSDP, please use ``ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4`` which is provided by SGLang RL Group. See files under ``docker/`` for NGC-based image or if you want to build your own. diff --git a/docs/workers/sglang_worker.rst b/docs/workers/sglang_worker.rst index 8cbdab9a842..0d2342a160f 100644 --- a/docs/workers/sglang_worker.rst +++ b/docs/workers/sglang_worker.rst @@ -20,7 +20,7 @@ Please always follow the following command to install SGLang with verl. .. code-block:: bash pip install --upgrade pip - # Currently 0.4.6.post1, subject to updates at any time, please refer to the latest version specified in `setup.py` + # Currently 0.4.6.post4, subject to updates at any time, please refer to the latest version specified in `setup.py` pip install -e ".[sglang]" Using SGLang as the Inference Backend for PPO Training on a Single Machine