Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/e2e_ppo_trainer.yml
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ jobs:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
Expand Down Expand Up @@ -215,7 +215,7 @@ jobs:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
Expand Down Expand Up @@ -244,7 +244,7 @@ jobs:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
Expand Down Expand Up @@ -273,7 +273,7 @@ jobs:
HF_ENDPOINT: "https://hf-mirror.com"
HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
container:
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4
options: --gpus all --shm-size=50g # Visual dataloader requires large memory
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/sgl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ jobs:
HF_HUB_ENABLE_HF_TRANSFER: 1
SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK: "True"
container:
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1
image: ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4
options: --gpus all --shm-size=10g
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.rocm
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# Support - Training: fsdp; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post1-rocm630
FROM lmsysorg/sglang:v0.4.6.post4-rocm630

# Set working directory
# WORKDIR $PWD/app
Expand Down
4 changes: 2 additions & 2 deletions docker/Dockerfile.sglang
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip

# Install sglang-0.4.6.post1 and torch-memory-saver
RUN pip install "sglang[all]==0.4.6.post1" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Install sglang-0.4.6.post4 and torch-memory-saver
RUN pip install "sglang[all]==0.4.6.post4" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir

# Install torch-2.6.0
RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.vllm.sglang.megatron
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"

# Install Sglang
RUN pip install --no-deps "sglang[all]>=0.4.5.post3"
RUN pip install --no-deps "sglang[all]>=0.4.6.post3"

# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
Expand Down
2 changes: 1 addition & 1 deletion docs/amd_tutorial/amd_build_dockerfile_page.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ docker/Dockerfile.rocm
# Support - Training: fsdp; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post1-rocm630
FROM lmsysorg/sglang:v0.4.6.post4-rocm630

# Set working directory
# WORKDIR $PWD/app
Expand Down
2 changes: 1 addition & 1 deletion docs/start/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ For vLLM with Megatron or FSDP, please use the stable version of image ``whatcan

For latest vLLM with FSDP, please refer to ``hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0``.

For SGLang with FSDP, please use ``ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post1`` which is provided by SGLang RL Group.
For SGLang with FSDP, please use ``ocss884/verl-sglang:ngc-th2.6.0-cu126-sglang0.4.6.post4`` which is provided by SGLang RL Group.

See files under ``docker/`` for NGC-based image or if you want to build your own.

Expand Down
2 changes: 1 addition & 1 deletion docs/workers/sglang_worker.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Please always follow the following command to install SGLang with verl.

.. code-block:: bash
pip install --upgrade pip
# Currently 0.4.6.post1, subject to updates at any time, please refer to the latest version specified in `setup.py`
# Currently 0.4.6.post4, subject to updates at any time, please refer to the latest version specified in `setup.py`
pip install -e ".[sglang]"

Using SGLang as the Inference Backend for PPO Training on a Single Machine
Expand Down
2 changes: 1 addition & 1 deletion requirements_sglang.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,5 @@ torchdata
torchvision
transformers
wandb
sglang[all]==0.4.4.post4
sglang[all]==0.4.6.post4
torch-memory-saver>=0.0.5
2 changes: 1 addition & 1 deletion scripts/install_vllm_sglang_mcore.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ export MAX_JOBS=32
echo "1. install inference frameworks and pytorch they need"
pip install --no-cache-dir "vllm==0.8.4" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata
if [ $USE_SGLANG -eq 1 ]; then
pip install --no-deps "sglang[all]>=0.4.5.post3"
pip install --no-deps "sglang[all]>=0.4.6.post3"
fi


Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
VLLM_REQUIRES = ["tensordict<=0.6.2", "vllm<=0.8.5"]
SGLANG_REQUIRES = [
"tensordict<=0.6.2",
"sglang[srt,openai]==0.4.6.post1",
"sglang[srt,openai]==0.4.6.post4",
"torch-memory-saver>=0.0.5",
]

Expand Down
30 changes: 16 additions & 14 deletions verl/workers/rollout/sglang_rollout/async_sglang_rollout.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,19 +41,20 @@
from verl import DataProto
from verl.third_party.sglang import parallel_state as sglang_ps
from verl.tools.base_tool import BaseTool
from verl.tools.schemas import OpenAIFunctionCallSchema, OpenAIFunctionParsedSchema, OpenAIFunctionToolCall
from verl.tools.schemas import (OpenAIFunctionCallSchema,
OpenAIFunctionParsedSchema,
OpenAIFunctionToolCall)
from verl.utils.debug import GPUMemoryLogger
from verl.utils.model import compute_position_id_with_mask
from verl.utils.net_utils import is_ipv6
from verl.utils.torch_functional import get_response_mask, pad_sequence_to_length
from verl.utils.torch_functional import (get_response_mask,
pad_sequence_to_length)
from verl.workers.rollout.base import BaseRollout
from verl.workers.rollout.schemas import (
AsyncRolloutRequest,
AsyncRolloutRequestStateEnum,
FinishReasonTypeEnum,
Message,
)
from verl.workers.rollout.sglang_rollout.sglang_rollout import _post_process_outputs, _pre_process_inputs
from verl.workers.rollout.schemas import (AsyncRolloutRequest,
AsyncRolloutRequestStateEnum,
FinishReasonTypeEnum, Message)
from verl.workers.rollout.sglang_rollout.sglang_rollout import (
_post_process_outputs, _pre_process_inputs)

if TYPE_CHECKING:
from torch import nn
Expand Down Expand Up @@ -191,6 +192,7 @@ def initialize_tools(tools_config) -> list:
dist.all_gather_object(visible_devices, os.environ["CUDA_VISIBLE_DEVICES"], device_mesh_cpu.get_group("tp"))
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(visible_devices)

self._rank = device_mesh_cpu["tp"].get_rank()
# initialize the inference engine
monkey_patch_torch_reductions()
nnodes = -(-tp_size // len(visible_devices))
Expand All @@ -199,7 +201,7 @@ def initialize_tools(tools_config) -> list:
port = get_open_port() if port is None else port
[ip, port] = broadcast_pyobj(
[ip, port],
rank=self._tp_rank,
rank=self._rank,
dist_group=device_mesh_cpu.get_group("tp"),
src=device_mesh_cpu["tp"].mesh[0].item(),
force_cpu_device=False,
Expand Down Expand Up @@ -371,7 +373,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:
# Most naive implementation, can extract tensor and send via gloo if too slow
[output] = broadcast_pyobj(
data=[output],
rank=self._tp_rank,
rank=self._rank,
dist_group=self._device_mesh_cpu["tp"].get_group(),
src=self._device_mesh_cpu["tp"].mesh[0].item(),
force_cpu_device=False,
Expand Down Expand Up @@ -423,7 +425,7 @@ def generate_sequences(self, prompts: DataProto, **kwargs) -> DataProto:

# free cache engine
if self.config.free_cache_engine and self._engine is not None:
self._engine.tokenizer_manager.flush_cache()
self._engine.flush_cache()

return DataProto(batch=batch)

Expand Down Expand Up @@ -591,7 +593,7 @@ def generate_sequences_with_tools(self, prompts: DataProto, **kwargs) -> DataPro

[sorted_output_req_list] = broadcast_pyobj(
data=[sorted_output_req_list],
rank=self._tp_rank,
rank=self._rank,
dist_group=self._device_mesh_cpu["tp"].get_group(),
src=self._device_mesh_cpu["tp"].mesh[0].item(),
force_cpu_device=False,
Expand Down Expand Up @@ -681,7 +683,7 @@ def generate_sequences_with_tools(self, prompts: DataProto, **kwargs) -> DataPro

# free cache engine
if self.config.free_cache_engine and self._engine is not None and self._tp_rank == 0:
self._engine.tokenizer_manager.flush_cache()
self._engine.flush_cache()

return DataProto(batch=batch, non_tensor_batch={"messages": np.array(messages), "reward_scores": np.array(reward_scores)})

Expand Down