sgl-project · merrymercy · Mar 10, 2025
@@ -266,7 +266,7 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
 
-          USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          # USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
 
       - name: Benchmark single latency + torch.compile (TP=2)
         timeout-minutes: 10

diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py
@@ -6,10 +6,12 @@
 import torch
 import torch.library
 
-from sglang.srt.utils import is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 
 logger = logging.getLogger(__name__)
-use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
+use_vllm_custom_allreduce = get_bool_env_var(
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+)
 
 if not is_hpu():
     # ROCm does not use vllm custom allreduce

@@ -22,7 +22,7 @@
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention

diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py
@@ -48,6 +48,7 @@
     safetensors_weights_iterator,
 )
 from sglang.srt.utils import (
+    get_bool_env_var,
     get_device_capability,
     is_pin_memory_available,
     set_weight_attrs,
@@ -197,7 +198,7 @@ def _maybe_download_from_modelscope(
 
         Returns the path to the downloaded model, or None if the model is not
         downloaded from ModelScope."""
-        if os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True":
+        if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
             # download model from ModelScope hub,
             # lazy import so that modelscope is not required for normal use.
             # pylint: disable=C.

@@ -100,7 +100,6 @@ void cublas_grouped_gemm(
   check_device_dtype(out_dtype, inputs);
   check_device_dtype(out_dtype, weights);
   check_device_dtype(out_dtype, outputs);
-  cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF);
 
   // Weights should be transposed to (n, k) of column major
   std::vector<cublasOperation_t> transa_array(group_count, CUBLAS_OP_T);
@@ -132,7 +131,6 @@ void cublas_grouped_gemm(
   std::vector<void*> b_array = get_tensor_ptrs(inputs);
   std::vector<void*> c_array = get_tensor_ptrs(outputs);
 
-  auto handle = reinterpret_cast<cublasHandle_t>(cublas_handle);
   auto stream = reinterpret_cast<cudaStream_t>(cuda_stream);
 
   // Should allocate tensors for storage of pointers
@@ -141,6 +139,9 @@ void cublas_grouped_gemm(
   torch::Tensor d_c = create_ptr_pointer(c_array, stream);
 
 #if defined CUDA_VERSION && CUDA_VERSION >= 12050
+  auto handle = reinterpret_cast<cublasHandle_t>(cublas_handle);
+  cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF);
+
   auto status = cublasGemmGroupedBatchedEx(
       handle,
       transa_array.data(),

@@ -32,11 +32,8 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
   m.def("all_reduce(int fa, Tensor inp, Tensor! out) -> ()");
   m.impl("all_reduce", torch::kCUDA, &all_reduce);
 
-  m.def("get_graph_buffer_ipc_meta(int fa) -> (int[], int[])");
-  m.impl("get_graph_buffer_ipc_meta", torch::kCUDA, &get_graph_buffer_ipc_meta);
-
-  m.def("register_graph_buffers(int fa, int[][] handles, int[][] offsets) -> ()");
-  m.impl("register_graph_buffers", torch::kCUDA, &register_graph_buffers);
+  m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
+  m.def("register_graph_buffers", &register_graph_buffers);
 
   /*
    * From csrc/attention

@@ -1,9 +1,7 @@
-import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import pytest
 import torch
-import torch.nn as nn
 from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
 
 

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
@@ -20,30 +20,33 @@ class TestFile:
         TestFile("models/test_generation_models.py", 103),
         TestFile("models/test_qwen_models.py", 82),
         TestFile("models/test_reward_models.py", 83),
-        TestFile("test_gptqmodel_dynamic.py", 72),
         TestFile("models/test_gme_qwen_models.py", 45),
         TestFile("test_abort.py", 51),
+        TestFile("test_block_int8.py", 22),
         TestFile("test_chunked_prefill.py", 336),
-        TestFile("test_custom_allreduce.py", 1),
-        TestFile("test_double_sparsity.py", 50),
         TestFile("test_eagle_infer.py", 447),
+        TestFile("test_ebnf_constrained.py"),
+        TestFile("test_fp8_kernel.py", 2),
         TestFile("test_embedding_openai_server.py", 36),
-        TestFile("test_eval_accuracy_mini.py", 63),
         TestFile("test_gguf.py", 78),
+        TestFile("test_gptqmodel_dynamic.py", 72),
+        TestFile("test_hidden_states.py", 55),
+        TestFile("test_int8_kernel.py", 1),
         TestFile("test_input_embeddings.py", 38),
+        TestFile("test_json_constrained.py", 98),
+        TestFile("test_large_max_new_tokens.py", 41),
+        TestFile("test_metrics.py", 32),
         TestFile("test_mla.py", 92),
         TestFile("test_mla_deepseek_v3.py", 221),
         TestFile("test_mla_flashinfer.py", 395),
         TestFile("test_mla_fp8.py", 93),
-        TestFile("test_json_constrained.py", 98),
-        TestFile("test_large_max_new_tokens.py", 41),
-        TestFile("test_metrics.py", 32),
         TestFile("test_no_chunked_prefill.py", 126),
         TestFile("test_no_overlap_scheduler.py", 262),
         TestFile("test_openai_server.py", 124),
         TestFile("test_penalty.py", 41),
         TestFile("test_pytorch_sampling_backend.py", 66),
         TestFile("test_radix_attention.py", 167),
+        TestFile("test_reasoning_content.py", 89),
         TestFile("test_regex_constrained.py", 64),
         TestFile("test_release_memory_occupation.py", 44),
         TestFile("test_request_length_validation.py", 31),
@@ -58,18 +61,13 @@ class TestFile:
         TestFile("test_torchao.py", 70),
         TestFile("test_triton_attention_kernels.py", 4),
         TestFile("test_triton_attention_backend.py", 134),
-        TestFile("test_hidden_states.py", 55),
         TestFile("test_update_weights_from_disk.py", 114),
         TestFile("test_update_weights_from_tensor.py", 48),
         TestFile("test_vertex_endpoint.py", 31),
         TestFile("test_vision_chunked_prefill.py", 223),
         TestFile("test_vision_llm.py", 18.4),
         TestFile("test_vision_openai_server.py", 344),
         TestFile("test_w8a8_quantization.py", 46),
-        TestFile("test_fp8_kernel.py", 2),
-        TestFile("test_block_int8.py", 22),
-        TestFile("test_int8_kernel.py", 1),
-        TestFile("test_reasoning_content.py", 89),
     ],
     "nightly": [
         TestFile("test_nightly_gsm8k_eval.py"),

diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py
@@ -3,6 +3,7 @@
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    get_bool_env_var,
     is_in_ci,
     run_bench_one_batch,
     write_github_step_summary,
@@ -27,9 +28,13 @@ def test_moe_tp2_bs1(self):
             DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"]
         )
 
+        use_vllm_custom_allreduce = get_bool_env_var(
+            "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+        )
+
         if is_in_ci():
             write_github_step_summary(
-                f"### test_moe_tp2_bs1\n"
+                f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})\n"
                 f"output_throughput : {output_throughput:.2f} token/s\n"
             )
             self.assertGreater(output_throughput, 124)

diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py
diff --git a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py
diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py
diff --git a/test/srt/test_gptqmodel_dynamic.py b/test/srt/test_gptqmodel_dynamic.py
@@ -129,6 +129,7 @@ def run_decode(self, max_new_tokens):
                 "text": "The capital of France is",
                 "sampling_params": {
                     "max_new_tokens": max_new_tokens,
+                    "temperature": 0.001,
                 },
             },
         )