Merged
142 commits
3edfd88
Fix BadRequestError wrong arguments and remove openai dependency (#4882)
fzyzcjy Mar 29, 2025
e34ccea
Improve stack trace of retry errors (#4845)
fzyzcjy Mar 29, 2025
409d470
Tiny fix doc error (#4795)
fzyzcjy Mar 29, 2025
167d22c
[Docs] Update DeepGEMM at README.md (#4886)
FlamingoPg Mar 29, 2025
7d8a876
Update CODEOWNERS (#4889)
zhyncs Mar 29, 2025
4f9d2ba
Delete test_deep_gemm.py (#4891)
FlamingoPg Mar 29, 2025
5634a3a
Add deepseek style fused moe group gate selection kernel (#4530)
qingquansong Mar 29, 2025
e6d3ba6
quick fix: add default for new kernel (#4898)
FlamingoPg Mar 29, 2025
8cca62b
remove setup for sgl-kernel (#4899)
zhyncs Mar 29, 2025
4515581
[Misc] Clean m.def and add Development Tips (#4890)
FlamingoPg Mar 30, 2025
116c00c
fix allreduce test (#4909)
yizhang2077 Mar 30, 2025
ac3d99a
Support page size > 1 + eagle (#4908)
merrymercy Mar 30, 2025
410d637
Fix retract for page size > 1 (#4914)
merrymercy Mar 30, 2025
6024dff
[Feature] use pytest for sgl-kernel (#4896)
adarshxs Mar 30, 2025
53245c2
fix bmm fp8 (#4926)
zhyncs Mar 30, 2025
a67c15e
Fix the timeout for unit-test-2-gpu in pr-test.yml (#4927)
merrymercy Mar 30, 2025
e0bba86
Fix 2-gpu CI test and suppress some warnings (#4930)
merrymercy Mar 30, 2025
2c09126
[feat] add fa3 in sgl-kernel (#4902)
FlamingoPg Mar 30, 2025
a1630d6
Fix sglang frontend's incorrect dependency on torch (#4931)
seplos Mar 30, 2025
1b4768a
[Fix] avoid stream sync and torch compile in prefill for fa3 backend …
Fridge003 Mar 30, 2025
5ff9059
cleanup sgl-kernel (#4933)
zhyncs Mar 30, 2025
a639a05
[Fix] Improve Lora tests and reduce CI runtime (#4925)
Fridge003 Mar 31, 2025
8339e22
Fix DeepSeek bug causing 2.2% MMLU drop when TP!=DP (#4883)
fzyzcjy Mar 31, 2025
acc9ae6
[Fix] Add torch compile for torch.clamp back (#4936)
Fridge003 Mar 31, 2025
0259f4e
Fix oom error for large page size (#4913)
xiezhq-hermann Mar 31, 2025
5d61e95
[feat] interface for platforms abstraction (#4928)
Alcanderian Mar 31, 2025
c34051c
[Fix] revert clean m.def for cudagraph (#4944)
FlamingoPg Mar 31, 2025
ba2b8e4
refactor: multimodal data (#4754)
mickqian Mar 31, 2025
151b8da
bump sgl-kernel v0.0.6 (#4950)
zhyncs Mar 31, 2025
e999662
[Build] Fix cuda12.8 build error in nvfp4_scaled_mm_kernels.cu (#4953)
guoyuhong Mar 31, 2025
d5c6416
use fa3 in sgl-kernel (#4954)
zhyncs Mar 31, 2025
bb0be58
Revert PR 4764 & 4813 related to R1 RoPE (#4959)
guoyuhong Apr 1, 2025
d782244
[Feature] Support DeepEP Low Latency (#4767)
liz-badada Apr 1, 2025
f4aa041
update bench_serving (#4958)
zhyncs Apr 1, 2025
353aba4
Prevent memory leak of retract_decode when page_size > 1 (#4977)
xiezhq-hermann Apr 1, 2025
bdcf3b5
[VLM RLHF] Take Image input for verl vlm rollout (#4915)
JustinTong0323 Apr 2, 2025
28a21ed
Large page size aligned hierarchical caching (#4581)
xiezhq-hermann Apr 2, 2025
f024098
bug fix for hicache host eviction (#4989)
xiezhq-hermann Apr 2, 2025
9c4abc1
sgl scaled_fp8_quant support output padding (#4861)
BBuf Apr 2, 2025
b8790ec
Add Eagle Speculative Decoding to FA3 Backend (#4951)
qingquansong Apr 2, 2025
6ee3bd1
Update tokenizer_manager.py (#5008)
yangky11 Apr 2, 2025
f08e4a5
[sgl-kernel] per token group quant support COLUMN MAJOR (#4817)
BBuf Apr 3, 2025
b77beb6
update cutlass tag (#5011)
xiezhq-hermann Apr 3, 2025
cb5d8fa
Feature/revise docs ci (#5009)
renxinx Apr 3, 2025
2f0bc46
fix: fix illegal cuda memory access at fused_moe_kernel (#4727)
saltyfish66 Apr 3, 2025
0d09d42
[Build] Support build sgl-kernel with ccache (#5020)
guoyuhong Apr 3, 2025
2f5ad61
fix deepgemm as well (#5030)
xiezhq-hermann Apr 3, 2025
2039ae4
try to fix ci oserror (#5024)
BBuf Apr 3, 2025
30cddef
Replace enable_flashinfer_mla argument with attention_backend (#5005)
Fridge003 Apr 3, 2025
1a42720
Small refactor DeepEPMode to clean up code a bit (#4992)
fzyzcjy Apr 3, 2025
69b91de
[Fix] fix fa3 build at cu118 (#5036)
FlamingoPg Apr 3, 2025
ecc640f
Revert "Replace enable_flashinfer_mla argument with attention_backend…
merrymercy Apr 3, 2025
8430f3c
bump sgl-kernel v0.0.7 (#5046)
zhyncs Apr 3, 2025
91355c8
update eagle-3 docs (#4796)
simveit Apr 3, 2025
326b58d
Add LlavaLlamaForCausaLM in MultiModal Processors (#5039)
ravi03071991 Apr 3, 2025
a67ea4a
Update the retry count (#5051)
zhyncs Apr 4, 2025
a4f6dd4
upgrade sgl-kernel v0.0.7 (#5049)
zhyncs Apr 4, 2025
6aaea85
[2/3] fix dsv3 awq issue (#4625)
AniZpZ Apr 4, 2025
210e831
Feature/revise docs ci (#5056)
renxinx Apr 4, 2025
d0753fb
Add H20 fused MoE kernel tuning configs for DeepSeek V3/R1 (#5057)
M0gician Apr 4, 2025
fd4b549
[fix] remove `cuda_device_count_stateless` (#5060)
Alcanderian Apr 4, 2025
e0dc54f
Small refactor DeepEPDispatcher into subclasses (#4994)
fzyzcjy Apr 4, 2025
0b91be8
Support async DeepEP by splitting into two stages (#4995)
fzyzcjy Apr 4, 2025
546ebc8
Cleanup unused resources after DeepEP operation (#4996)
fzyzcjy Apr 4, 2025
0cfe991
Add DeepSeek V3/R1 shared experts fusion (#4918)
BBuf Apr 4, 2025
2a03b83
[deepep] fix: shared experts are not initialized when shared experts …
ch-wan Apr 4, 2025
5cf628d
fix dummy-load deepseekv2 (#4535)
inkcherry Apr 4, 2025
4f79ccd
support sgl-kernel on blackwell (#5074)
zhyncs Apr 4, 2025
43478d0
FA3 Spec Decoding to support top k = 1 and add cuda graph support (#5…
hebiao064 Apr 5, 2025
c1997f7
[Revision] Replace enable_flashinfer_mla argument with attention_back…
Fridge003 Apr 5, 2025
e13eb1a
upgrade transformers 4.51.0 (#5088)
zhyncs Apr 5, 2025
2709d38
sgl-kernel transfer custom allreduce from trt kernel to vllm kernel (…
yizhang2077 Apr 5, 2025
8de5848
bump sgl-kernel 0.0.8 (#5089)
zhyncs Apr 5, 2025
080e30f
python transfer custom allreduce from trt kernel to vllm kernel (#5080)
yizhang2077 Apr 5, 2025
6b4373b
bump v0.4.4.post4 (#5091)
zhyncs Apr 5, 2025
c84460d
Fix: Reduce the number of document ci attempts to avoid long ci runni…
minleminzui Apr 6, 2025
005bcf6
Add Llama4 support (#5092)
CatherineSue Apr 7, 2025
f1f9dd5
Fix refactor error - fp8.py (#5106)
HaiShaw Apr 7, 2025
e9f428e
bump v0.4.5 (#5117)
zhyncs Apr 7, 2025
caeb3b8
[ci] fix llama4 ci error (#5126)
BBuf Apr 7, 2025
804e840
Refactor and Optimize FA3 Code (#5090)
hebiao064 Apr 7, 2025
a306a1e
Add Llama4 user guide (#5133)
ispobock Apr 8, 2025
cae9410
[Misc] Use pytest.mark.skipif in sgl-kernel test (#5137)
FlamingoPg Apr 8, 2025
9609eba
feat: disable grammar restrictions within reasoning sections (#4984)
minleminzui Apr 8, 2025
523ebd1
[modelopt] automatically inspect if model is ModelOpt quantized and s…
yundai424 Apr 8, 2025
63b3e26
[AMD] Fix missing per_token_group_quant_fp8 for ROCm (#5140)
hubertlu-tw Apr 8, 2025
fabe6d4
fix multimodal hash feature (#5083)
huangtingwei9988 Apr 8, 2025
e7328db
Fix run time error in ROCm platform (#5147)
kkHuang-amd Apr 8, 2025
5cfce40
[FA3 Feature] Support multi modal Llama-3.2-11B-Vision-Instruct (#5103)
zcnrex Apr 8, 2025
026ac6e
Add unit test on page_size > 1 and mla and integration test for Flas…
yubofredwang Apr 8, 2025
e0056a9
Use public model for FA3 speculative decode testing (#5152)
yubofredwang Apr 8, 2025
e18ab11
Add dummy grok test to amd CI. (#5115)
saienduri Apr 8, 2025
376e926
fix empty_cache error in pt_weights_iterator (#5151)
dangkai4u Apr 8, 2025
91a6868
Fix torch compile errors (#5158)
kkHuang-amd Apr 8, 2025
6806322
Fix loading KV quantization scale; Enable modelopt kv cache (#4686)
yundai424 Apr 8, 2025
c062a43
[PD] Fix unclosed prefill connection warning of mini_lb (#5155)
ShangmingCai Apr 8, 2025
08a3e58
Add optimized native kernels in sgl-kernel (#5150)
mingfeima Apr 8, 2025
21ff770
[PD] Simplify mini LB (#4911)
ByronHsu Apr 8, 2025
0a66d28
Small improvement of native api docs (#5139)
simveit Apr 8, 2025
7e23867
[feat&refactor] Enhance multimodal input support with refactor io_str…
JustinTong0323 Apr 8, 2025
e66bb14
Support 2x8xH100 for Llama 4 (#5159)
fzyzcjy Apr 8, 2025
b099b4d
FP4 weight loading and inference (2/2) (#3972)
trevor-m Apr 9, 2025
e373feb
Fix multimodal hashing error (#5174)
fzyzcjy Apr 9, 2025
f9efb42
Tiny disable model that does not work (#5175)
fzyzcjy Apr 9, 2025
b9ff9fe
[Bugfix] Fix index out of bounds in local attention with large sequen…
CatherineSue Apr 9, 2025
aa007f5
[Fix] DeepEP Compatibility with Low Latency (#5068)
liz-badada Apr 9, 2025
bdda960
docs: remove the use of Downward API for LWS_WORKER_INDEX (#5110)
yankay Apr 9, 2025
fe0d022
feat: add DeepGEMM build warning (#5176)
zhyncs Apr 9, 2025
c5df351
fix: use DeepEPDispatcher on CUDA (#5180)
zhyncs Apr 9, 2025
720ef3a
[DeepEP] fix: import buffer error (#5179)
ch-wan Apr 9, 2025
84626fc
Let `bench_one_batch` support `enable_dp_attention` (#4058)
fzyzcjy Apr 9, 2025
3535023
[Misc] clean up vllm in sgl-kernel test (#5189)
FlamingoPg Apr 9, 2025
5e1e3ca
Fix ci test "test_eval_fp8_accuracy" failed (#5185)
kkHuang-amd Apr 9, 2025
72e66fd
Optimize topk operation in llama4 (#5128)
fzyzcjy Apr 9, 2025
bbb5f05
Support Llama4 fp8 inference (#5194)
HandH1998 Apr 9, 2025
facf837
[ci] fix ci test fused_moe op (#5102)
BBuf Apr 9, 2025
97e80a4
model: support mllama4 (#5144)
mickqian Apr 9, 2025
aae6996
update grok test (#5171)
saienduri Apr 9, 2025
77e9549
sgl-kernel use cutlass latest version for fp8 blockwise gemm (#5207)
yizhang2077 Apr 9, 2025
3617f1e
Add H20 dtype fp8_w8a8 fused MoE kernel tuning configs for DeepSeek V…
Muuuchen Apr 9, 2025
f241390
fix: log warning when disable cuda graph (#5209)
zhyncs Apr 9, 2025
414a840
[metrics] Add in queue metrics (#4444)
hebiao064 Apr 10, 2025
6fbe8d1
Fix DeepSeek error when using DeepEP mode (#5190)
fzyzcjy Apr 10, 2025
720818a
reduce moe_align_block_size_kernel small batch mode overhead (#5086)
BBuf Apr 10, 2025
796073f
[PD] Support KV transfer with mooncake (#4880)
stmatengss Apr 10, 2025
a85f762
[PD] Add get_contiguous_buf_infos interface for MLATokenToKVPool (#5204)
stmatengss Apr 10, 2025
41e1bee
Update deps for mllama4 (#5215)
ispobock Apr 10, 2025
c097f41
Fix deepseek-v3 with torch.compile in PyTorch 2.6. (#5213)
zou3519 Apr 10, 2025
973c449
ROCm sgl-kernel: compatible to later torch (#5167)
HaiShaw Apr 10, 2025
1dc3b18
[Misc] Clean sgl-kernel test (#5216)
FlamingoPg Apr 10, 2025
9684112
Update Makefile / build script to avoid installing incompatible torch…
elfiegg Apr 10, 2025
9e8a69f
Fix torch.compile cacheing (#5259)
zou3519 Apr 11, 2025
02f9a5e
ROCm/AITER CK_MoE: update 2-stage kernels & support both Activations …
HaiShaw Apr 11, 2025
74eb12c
Optimize attention in llama4 (#5127)
fzyzcjy Apr 11, 2025
d5df05a
Optimize GPU memory usage in FlashAttentionBackend's strided indexing…
CatherineSue Apr 11, 2025
1d65a62
Support `--enable-llama4-multimodal` (#5254)
ch-wan Apr 11, 2025
c9180cc
[fix] fix mrope positions not picked up (#5265)
mickqian Apr 11, 2025
610da05
doc: nested loop code for offline engine (#5244)
minleminzui Apr 11, 2025
278d4d2
fix: examples for token_in_token_out_vlm (#5193)
JustinTong0323 Apr 11, 2025
cfcc692
Fix a 404 link in send_request.ipynb (#5280)
windsonsea Apr 11, 2025
9371d0c
fix: enable fp4 compilation on cu128 (#5286)
zhyncs Apr 11, 2025
3e3276e
update
thyecust Apr 11, 2025
2 changes: 1 addition & 1 deletion .github/CODEOWNERS
@@ -16,4 +16,4 @@
/test/lang @merrymercy @Ying1123 @ByronHsu
/test/srt @merrymercy @Ying1123 @zhyncs
/sgl-router @ByronHsu @Ying1123
-/sgl-kernel @zhyncs @ispobock @HandH1998 @BBuf @yizhang2077 @merrymercy
+/sgl-kernel @zhyncs @ispobock @HandH1998 @BBuf @yizhang2077 @merrymercy @yinfan98
57 changes: 52 additions & 5 deletions .github/workflows/pr-test-amd.yml
@@ -7,12 +7,14 @@ on:
- "python/sglang/**"
- "test/**"
- "sgl-kernel/**"
+- ".github/workflows/pr-test-amd.yml"
pull_request:
branches: [ main ]
paths:
- "python/sglang/**"
- "test/**"
- "sgl-kernel/**"
+- ".github/workflows/pr-test-amd.yml"
workflow_dispatch:

concurrency:
@@ -36,12 +38,12 @@ jobs:
else
DEVICE_FLAG="--device /dev/dri"
fi
-docker pull lmsysorg/sglang:v0.4.3.post4-rocm630
+docker pull lmsysorg/sglang:v0.4.5-rocm630
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
--cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
-w /sglang-checkout --name ci_sglang \
-lmsysorg/sglang:v0.4.3.post4-rocm630
+lmsysorg/sglang:v0.4.5-rocm630

- name: Install dependencies
run: |
@@ -53,6 +55,10 @@
docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .

+docker exec -w / ci_sglang mkdir -p /dummy-grok
+mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
+docker cp ./dummy-grok ci_sglang:/

- name: Evaluate Accuracy
timeout-minutes: 20
run: |
@@ -76,20 +82,19 @@
else
DEVICE_FLAG="--device /dev/dri"
fi
-docker pull lmsysorg/sglang:v0.4.3.post4-rocm630
+docker pull lmsysorg/sglang:v0.4.5-rocm630
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
--cap-add=SYS_PTRACE -e HF_TOKEN=${{ secrets.AMD_HF_TOKEN }} --security-opt seccomp=unconfined \
-w /sglang-checkout --name ci_sglang \
-lmsysorg/sglang:v0.4.3.post4-rocm630
+lmsysorg/sglang:v0.4.5-rocm630

- name: Install dependencies
run: |
docker exec ci_sglang pip install --upgrade pip
docker exec ci_sglang pip uninstall sgl-kernel -y || true
docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
docker exec ci_sglang pip install -e "python[dev_hip]"
-docker exec ci_sglang pip install py-spy || true

docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
docker exec -w /human-eval ci_sglang pip install -e .
@@ -99,6 +104,48 @@ jobs:
run: |
docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 test_mla.py

+bench-test-2-gpu-amd:
+if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
+github.event.pull_request.draft == false
+runs-on: linux-mi300-gpu-2
+steps:
+- name: Checkout code
+uses: actions/checkout@v4
+
+- name: Setup docker
+run: |
+# Ensure GPU isolation if pod is part of kubernetes setup with DEVICE_FLAG.
+if [ -f "/etc/podinfo/gha-render-devices" ]; then
+DEVICE_FLAG=$(cat /etc/podinfo/gha-render-devices)
+else
+DEVICE_FLAG="--device /dev/dri"
+fi
+docker pull lmsysorg/sglang:v0.4.5-rocm630
+docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
+-v ${{ github.workspace }}:/sglang-checkout --ipc=host --group-add video \
+--cap-add=SYS_PTRACE -e HF_TOKEN=${HF_TOKEN} --security-opt seccomp=unconfined \
+-w /sglang-checkout --name ci_sglang \
+lmsysorg/sglang:v0.4.5-rocm630
+
+- name: Install dependencies
+run: |
+docker exec ci_sglang pip install --upgrade pip
+docker exec ci_sglang pip uninstall sgl-kernel -y || true
+docker exec -w /sglang-checkout/sgl-kernel ci_sglang bash -c "rm -f pyproject.toml && mv pyproject_rocm.toml pyproject.toml && python3 setup_rocm.py install"
+docker exec ci_sglang pip install -e "python[dev_hip]"
+
+docker exec -w / ci_sglang git clone https://github.com/merrymercy/human-eval.git
+docker exec -w /human-eval ci_sglang pip install -e .
+
+docker exec -w / ci_sglang mkdir -p /dummy-grok
+mkdir -p dummy-grok && wget https://sharkpublic.blob.core.windows.net/sharkpublic/sglang/dummy_grok.json -O dummy-grok/config.json
+docker cp ./dummy-grok ci_sglang:/
+
+- name: Evaluate Benchmark
+timeout-minutes: 20
+run: |
+docker exec -w /sglang-checkout/test/srt -e SGLANG_IS_IN_CI=1 ci_sglang python3 models/test_dummy_grok_models.py
+
finish:
if: always()
needs: [
5 changes: 3 additions & 2 deletions .github/workflows/pr-test-sgl-kernel.yml
@@ -80,7 +80,8 @@ jobs:

- name: Install
run: |
-pip3 install torch==2.5.1 && pip3 install pytest && pip3 install vllm==0.6.4.post1
+bash scripts/ci_install_dependency.sh
+pip3 install torch==2.5.1 && pip3 install pytest && pip3 install vllm==0.7.2
pip3 uninstall sgl-kernel -y || true
pip3 install sgl-kernel/dist/*whl --force-reinstall --no-deps
pip3 list | grep sgl-kernel
@@ -89,7 +90,7 @@
timeout-minutes: 30
run: |
cd sgl-kernel
-find tests -name "test_*.py" | xargs -n 1 python3
+pytest tests/

- name: Uninstall dependencies
run: |
50 changes: 4 additions & 46 deletions .github/workflows/pr-test.yml
@@ -68,7 +68,7 @@ jobs:
bash scripts/ci_install_dependency.sh

- name: Run test
-timeout-minutes: 30
+timeout-minutes: 40
run: |
cd test/srt
python3 run_suite.py --suite per-commit --auto-partition-id ${{ matrix.part }} --auto-partition-size 7
@@ -87,53 +87,11 @@
run: |
bash scripts/ci_install_dependency.sh

-- name: Test data parallelism (DP=2)
-timeout-minutes: 10
-run: |
-cd test/srt
-python3 test_data_parallelism.py
-
-- name: Test data parallelism attention (DP=2)
-timeout-minutes: 10
-run: |
-cd test/srt
-python3 test_dp_attention.py
-
-- name: Test update weights from distributed
-timeout-minutes: 10
-run: |
-cd test/srt
-python3 test_update_weights_from_distributed.py
-
-- name: Test VerlEngine
-timeout-minutes: 10
-run: |
-cd test/srt
-python3 test_verl_engine.py
-
-- name: Test Patch Torch
-timeout-minutes: 10
-run: |
-cd test/srt
-python3 test_patch_torch.py
-
-- name: Test expert parallelism (EP=2)
-timeout-minutes: 10
-run: |
-cd test/srt
-python3 test_moe_ep.py
-
-- name: Test torch compile (TP=2)
-timeout-minutes: 10
+- name: Run test
+timeout-minutes: 25
run: |
cd test/srt
-python3 test_mla_tp.py

-- name: Test lora tensor parallelism (TP=2)
-timeout-minutes: 10
-run: |
-cd test/srt/models/lora
-python3 test_lora_tp.py
+python3 run_suite.py --suite per-commit-2-gpu

performance-test-1-gpu-part-1:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
1 change: 1 addition & 0 deletions .github/workflows/release-docs.yml
@@ -49,6 +49,7 @@ jobs:
make compile

make html
+python3 wrap_run_llm.py
cd _build/html

git clone https://$GITHUB_TOKEN@github.com/sgl-project/sgl-project.github.io.git ../sgl-project.github.io --depth 1
12 changes: 2 additions & 10 deletions .gitmodules
@@ -1,12 +1,4 @@
-[submodule "sgl-kernel/3rdparty/cutlass"]
-path = sgl-kernel/3rdparty/cutlass
-url = https://github.com/NVIDIA/cutlass.git
-[submodule "sgl-kernel/3rdparty/cccl"]
-path = sgl-kernel/3rdparty/cccl
-url = https://github.com/NVIDIA/cccl.git
[submodule "sgl-kernel/3rdparty/flashinfer"]
path = sgl-kernel/3rdparty/flashinfer
-url = https://github.com/flashinfer-ai/flashinfer.git
-[submodule "sgl-kernel/3rdparty/deepgemm"]
-path = sgl-kernel/3rdparty/deepgemm
-url = https://github.com/deepseek-ai/DeepGEMM
+url = https://github.com/sgl-project/flashinfer.git
+branch = sgl-kernel
5 changes: 3 additions & 2 deletions benchmark/deepseek_v3/README.md
@@ -178,10 +178,11 @@ python3 -m sglang.bench_one_batch_server --model None --base-url http://10.0.0.1

### Example: Serving with 8 A100/A800 with AWQ Quantization

-AWQ does not support BF16, so add the `--dtype half` flag if AWQ is used for quantization. One example is as follows:
+Add `--quantization moe_wna16` flag to enable moe wna16 kernel for better performance.
+One example is as follows:

```bash
-python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --dtype half
+python3 -m sglang.launch_server --model cognitivecomputations/DeepSeek-R1-AWQ --tp 8 --trust-remote-code --quantization moe_wna16
```


13 changes: 12 additions & 1 deletion benchmark/kernels/fused_moe_triton/tuning_fused_moe_triton.py
@@ -399,7 +399,12 @@ def main(args: argparse.Namespace):
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] in ["DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM"]:
-E = config.n_routed_experts
+n_share_fusion_experts = args.n_share_experts_fusion
+E = (
+config.n_routed_experts + n_share_fusion_experts
+if config.architectures[0] in ["DeepseekV3ForCausalLM"]
+else config.n_routed_experts
+)
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
@@ -559,6 +564,12 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, required=False)
parser.add_argument("--tune", action="store_true")
+parser.add_argument(
+"--n-share-experts-fusion",
+type=int,
+default=0,
+help="The number of shared_experts need to be replica to fuse with normal experts in deepseek v3/r1",
+)
args = parser.parse_args()

main(args)
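The tuning-script hunk above changes how the expert count `E` is derived: for DeepSeek V3, the replicated shared experts are added on top of the routed experts, while the V2 path is unchanged. A minimal standalone sketch of that logic (the config objects and values below are hypothetical stand-ins for the HF model config, not real checkpoints):

```python
from types import SimpleNamespace

def expert_count(config, n_share_experts_fusion: int) -> int:
    # Mirrors the new branch: only the DeepSeek V3 architecture fuses
    # replicated shared experts into the routed-expert count.
    if config.architectures[0] in ["DeepseekV3ForCausalLM"]:
        return config.n_routed_experts + n_share_experts_fusion
    return config.n_routed_experts

# Hypothetical configs; real values come from the model's config.json.
v3 = SimpleNamespace(architectures=["DeepseekV3ForCausalLM"], n_routed_experts=256)
v2 = SimpleNamespace(architectures=["DeepseekV2ForCausalLM"], n_routed_experts=160)

print(expert_count(v3, 8))  # 264: 256 routed + 8 replicated shared experts
print(expert_count(v2, 8))  # 160: the V2 path ignores --n-share-experts-fusion
```

This matches the intent of the new `--n-share-experts-fusion` flag: the tuned kernel configs must cover the extra fused experts, so `E` grows by the replica count on V3 only.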
44 changes: 33 additions & 11 deletions benchmark/mmmu/bench_hf.py
@@ -1,5 +1,6 @@
import argparse

+import PIL
import torch
from data_utils import save_json
from eval_utils import (
@@ -72,17 +73,38 @@ def eval_mmmu(args):
if suffix:
contents += [{"type": "text", "text": suffix}]
messages = [{"role": "user", "content": contents}]
-model_inputs = processor.apply_chat_template(
-messages,
-tokenize=True,
-return_dict=True,
-add_generation_prompt=True,
-return_tensors="pt",
-).to(model.device)
-input_len = model_inputs["input_ids"].shape[-1]
-generation = model.generate(**model_inputs, generation_config=generation_config)
-generation = generation[0][input_len:]
-response = processor.decode(generation, skip_special_tokens=True)
+try:
+model_inputs = processor.tokenizer.apply_chat_template(
+messages,
+tokenize=True,
+return_dict=True,
+add_generation_prompt=True,
+return_tensors="pt",
+).to(model.device)
+input_len = model_inputs["input_ids"].shape[-1]
+generation = model.generate(
+**model_inputs, generation_config=generation_config
+)
+generation = generation[0][input_len:]
+response = processor.decode(generation, skip_special_tokens=True)
+except:
+contents = []
+if prefix:
+contents += [prefix]
+image = PIL.Image.open(sample["image_path"])
+contents += [image]
+if suffix:
+contents += [suffix]
+messages = [{"role": "user", "content": contents}]
+response = model.chat(
+msgs=messages,
+tokenizer=processor.tokenizer,
+sampling=False,
+max_new_tokens=sampling_params["max_new_tokens"],
+use_tts_template=False,
+generate_audio=False,
+temperature=0.0,
+)
print(f"response: {response}")
process_result(response, sample, answer_dict, out_samples)

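The `bench_hf.py` hunk wraps the generic HF chat-template path in a `try`/`except` and falls back to a model-specific `model.chat` API when the processor cannot apply a chat template. A stripped-down sketch of that fallback shape, with plain callables standing in for the two real generation paths:

```python
def generate_with_fallback(primary, fallback):
    # Try the generic chat-template path first; on any failure,
    # fall back to the model-specific chat API, as the benchmark does.
    try:
        return primary()
    except Exception:
        return fallback()

def chat_template_path():
    # Stand-in for processor.tokenizer.apply_chat_template + model.generate
    # on a model that lacks chat-template support.
    raise AttributeError("processor has no chat template")

response = generate_with_fallback(chat_template_path, lambda: "fallback response")
print(response)  # fallback response
```

Note the PR itself uses a bare `except:`, which also swallows `KeyboardInterrupt`; `except Exception` in the sketch is the slightly narrower idiom.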
2 changes: 1 addition & 1 deletion benchmark/mmmu/bench_sglang.py
@@ -86,8 +86,8 @@ def eval_mmmu(args):

if __name__ == "__main__":
parser = argparse.ArgumentParser()
-args = add_common_sglang_args_and_parse(parser)
EvalArgs.add_cli_args(parser)
+args = add_common_sglang_args_and_parse(parser)
args = parser.parse_args()

eval_mmmu(args)
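The one-line reorder in `bench_sglang.py` matters because `add_common_sglang_args_and_parse`, as its name suggests, parses the command line as part of its work, so any flags registered after it is called never make it into the parsed namespace. A small argparse sketch of the fixed ordering (the helper and flag names here are illustrative, not the real SGLang API):

```python
import argparse

def add_common_args_and_parse(parser):
    # Illustrative stand-in: registers shared flags, then parses
    # immediately, like the benchmark helper's name implies.
    parser.add_argument("--port", type=int, default=30000)
    return parser.parse_args([])  # empty argv for the sketch

parser = argparse.ArgumentParser()
# Fixed order: eval-specific flags are registered BEFORE the helper parses.
parser.add_argument("--num-questions", type=int, default=10)
args = add_common_args_and_parse(parser)

print(args.num_questions, args.port)  # 10 30000
```

With the old order, the eval flags would be added to a parser whose arguments had already been consumed, so they would either be rejected as unrecognized or silently absent.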
2 changes: 2 additions & 0 deletions benchmark/mmmu/eval_utils.py
@@ -442,6 +442,8 @@ def calculate_ins_level_acc(results: Dict):


def process_result(response, sample, answer_dict, out_samples):
+if response is None:
+return
if sample["question_type"] == "multiple-choice":
pred_ans = parse_multi_choice_response(
response, sample["all_choices"], sample["index2ans"]
2 changes: 2 additions & 0 deletions docker/Dockerfile.dev
@@ -21,6 +21,7 @@ RUN apt-get update && apt-get install -y \
pkg-config \
libssl-dev \
bear \
+ccache \
&& apt install -y rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean
@@ -44,6 +45,7 @@ RUN python3 -m pip install --no-cache-dir \
black \
isort \
icdiff \
+uv \
pre-commit

# Install diff-so-fancy
2 changes: 1 addition & 1 deletion docker/Dockerfile.rocm
@@ -1,5 +1,5 @@
# Usage (to build SGLang ROCm docker image):
-# docker build --build-arg SGL_BRANCH=v0.4.4.post3 -t v0.4.4.post3-rocm630 -f Dockerfile.rocm .
+# docker build --build-arg SGL_BRANCH=v0.4.5 -t v0.4.5-rocm630 -f Dockerfile.rocm .

# default base image
ARG BASE_IMAGE="rocm/sgl-dev:vllm20250114"
2 changes: 1 addition & 1 deletion docs/Makefile
@@ -23,7 +23,7 @@ compile:
parallel -0 -j3 --halt soon,fail=1 ' \
NB_NAME=$$(basename {}); \
START_TIME=$$(date +%s); \
-retry --delay=0 --times=3 -- \
+retry --delay=0 --times=2 -- \
jupyter nbconvert --to notebook --execute --inplace "{}" \
--ExecutePreprocessor.timeout=600 \
--ExecutePreprocessor.kernel_name=python3; \