From 0e10fc08d46de2d86a9c2d1cd8337315aa3bcb2d Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sun, 9 Mar 2025 22:42:33 -0700
Subject: [PATCH 1/8] Fix tests

---
 python/sglang/srt/_custom_ops.py              |  4 +-
 sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu   |  5 +-
 test/srt/run_suite.py                         | 23 +++---
 test/srt/test_bench_one_batch.py              |  7 +-
 ...est_eval_accuracy_large_chunked_prefill.py | 68 -----------------
 ...al_accuracy_large_mixed_chunked_prefill.py | 74 -------------------
 test/srt/test_eval_accuracy_mini.py           | 42 -----------
 7 files changed, 22 insertions(+), 201 deletions(-)
 delete mode 100644 test/srt/test_eval_accuracy_large_chunked_prefill.py
 delete mode 100644 test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py
 delete mode 100644 test/srt/test_eval_accuracy_mini.py

diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py
index d06765c3a8ce..6cc387706c1e 100644
--- a/python/sglang/srt/_custom_ops.py
+++ b/python/sglang/srt/_custom_ops.py
@@ -6,10 +6,10 @@
 import torch
 import torch.library
 
-from sglang.srt.utils import is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 
 logger = logging.getLogger(__name__)
-use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
+use_vllm_custom_allreduce = get_bool_env_var("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
 
 if not is_hpu():
     # ROCm does not use vllm custom allreduce
diff --git a/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu b/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu
index d0a80c7bff57..2add0826f54c 100644
--- a/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu
+++ b/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu
@@ -100,7 +100,6 @@ void cublas_grouped_gemm(
   check_device_dtype(out_dtype, inputs);
   check_device_dtype(out_dtype, weights);
   check_device_dtype(out_dtype, outputs);
-  cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF);
 
   // Weights should be transposed to (n, k) of column major
   std::vector<cublasOperation_t> transa_array(group_count, CUBLAS_OP_T);
@@ -132,7 +131,6 @@ void cublas_grouped_gemm(
   std::vector<void*> b_array = get_tensor_ptrs(inputs);
   std::vector<void*> c_array = get_tensor_ptrs(outputs);
 
-  auto handle = reinterpret_cast<cublasHandle_t>(cublas_handle);
   auto stream = reinterpret_cast<cudaStream_t>(cuda_stream);
 
   // Should allocate tensors for storage of pointers
@@ -141,6 +139,9 @@ void cublas_grouped_gemm(
   torch::Tensor d_c = create_ptr_pointer(c_array, stream);
 
 #if defined CUDA_VERSION && CUDA_VERSION >= 12050
+  auto handle = reinterpret_cast<cublasHandle_t>(cublas_handle);
+  cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF);
+
   auto status = cublasGemmGroupedBatchedEx(
       handle,
       transa_array.data(),
diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index b9e36e2321c1..02e442497e4e 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -20,30 +20,34 @@ class TestFile:
         TestFile("models/test_generation_models.py", 103),
         TestFile("models/test_qwen_models.py", 82),
         TestFile("models/test_reward_models.py", 83),
-        TestFile("test_gptqmodel_dynamic.py", 72),
         TestFile("models/test_gme_qwen_models.py", 45),
         TestFile("test_abort.py", 51),
+        TestFile("test_block_int8.py", 22),
         TestFile("test_chunked_prefill.py", 336),
-        TestFile("test_custom_allreduce.py", 1),
-        TestFile("test_double_sparsity.py", 50),
         TestFile("test_eagle_infer.py", 447),
+        TestFile("test_ebnf_constrained.py"),
+        TestFile("test_fp8_kernel.py", 2),
+        TestFile("test_fp8_kvcache.py"),
         TestFile("test_embedding_openai_server.py", 36),
-        TestFile("test_eval_accuracy_mini.py", 63),
         TestFile("test_gguf.py", 78),
+        TestFile("test_gptqmodel_dynamic.py", 72),
+        TestFile("test_hidden_states.py", 55),
+        TestFile("test_int8_kernel.py", 1),
         TestFile("test_input_embeddings.py", 38),
+        TestFile("test_json_constrained.py", 98),
+        TestFile("test_large_max_new_tokens.py", 41),
+        TestFile("test_metrics.py", 32),
         TestFile("test_mla.py", 92),
         TestFile("test_mla_deepseek_v3.py", 221),
         TestFile("test_mla_flashinfer.py", 395),
         TestFile("test_mla_fp8.py", 93),
-        TestFile("test_json_constrained.py", 98),
-        TestFile("test_large_max_new_tokens.py", 41),
-        TestFile("test_metrics.py", 32),
         TestFile("test_no_chunked_prefill.py", 126),
         TestFile("test_no_overlap_scheduler.py", 262),
         TestFile("test_openai_server.py", 124),
         TestFile("test_penalty.py", 41),
         TestFile("test_pytorch_sampling_backend.py", 66),
         TestFile("test_radix_attention.py", 167),
+        TestFile("test_reasoning_content.py", 89),
         TestFile("test_regex_constrained.py", 64),
         TestFile("test_release_memory_occupation.py", 44),
         TestFile("test_request_length_validation.py", 31),
@@ -58,7 +62,6 @@ class TestFile:
         TestFile("test_torchao.py", 70),
         TestFile("test_triton_attention_kernels.py", 4),
         TestFile("test_triton_attention_backend.py", 134),
-        TestFile("test_hidden_states.py", 55),
         TestFile("test_update_weights_from_disk.py", 114),
         TestFile("test_update_weights_from_tensor.py", 48),
         TestFile("test_vertex_endpoint.py", 31),
@@ -66,10 +69,6 @@ class TestFile:
         TestFile("test_vision_llm.py", 18.4),
         TestFile("test_vision_openai_server.py", 344),
         TestFile("test_w8a8_quantization.py", 46),
-        TestFile("test_fp8_kernel.py", 2),
-        TestFile("test_block_int8.py", 22),
-        TestFile("test_int8_kernel.py", 1),
-        TestFile("test_reasoning_content.py", 89),
     ],
     "nightly": [
         TestFile("test_nightly_gsm8k_eval.py"),
diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py
index f4140b89fce5..9f3e2575c5df 100644
--- a/test/srt/test_bench_one_batch.py
+++ b/test/srt/test_bench_one_batch.py
@@ -3,6 +3,7 @@
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    get_bool_env_var,
     is_in_ci,
     run_bench_one_batch,
     write_github_step_summary,
@@ -27,9 +28,13 @@ def test_moe_tp2_bs1(self):
             DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"]
         )
 
+        use_vllm_custom_allreduce = get_bool_env_var(
+            "USE_VLLM_CUSTOM_ALLREDUCE", default=True
+        )
+
         if is_in_ci():
             write_github_step_summary(
-                f"### test_moe_tp2_bs1\n"
+                f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})\n"
                 f"output_throughput : {output_throughput:.2f} token/s\n"
             )
             self.assertGreater(output_throughput, 124)
diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py
deleted file mode 100644
index c8ce5cff2b3b..000000000000
--- a/test/srt/test_eval_accuracy_large_chunked_prefill.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import unittest
-from types import SimpleNamespace
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
-
-
-class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_mmlu(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mmlu",
-            num_examples=3000,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.705, f"{metrics}"
-
-    def test_human_eval(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="humaneval",
-            num_examples=None,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.64, f"{metrics}"
-
-    def test_mgsm_en(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mgsm_en",
-            num_examples=None,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.84, f"{metrics}"
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py
deleted file mode 100644
index 3bc115874f0b..000000000000
--- a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import unittest
-from types import SimpleNamespace
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
-
-
-class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=[
-                "--log-level-http",
-                "warning",
-                "--chunked-prefill-size",
-                "256",
-                "--enable-mixed-chunk",
-            ],
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_mmlu(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mmlu",
-            num_examples=3000,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.705, f"{metrics}"
-
-    def test_human_eval(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="humaneval",
-            num_examples=None,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.64, f"{metrics}"
-
-    def test_mgsm_en(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mgsm_en",
-            num_examples=None,
-            num_threads=1024,
-        )
-
-        metrics = run_eval(args)
-        assert metrics["score"] >= 0.84, f"{metrics}"
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py
deleted file mode 100644
index a008c3869e8d..000000000000
--- a/test/srt/test_eval_accuracy_mini.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import unittest
-from types import SimpleNamespace
-
-from sglang.srt.utils import kill_process_tree
-from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import (
-    DEFAULT_MODEL_NAME_FOR_TEST,
-    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-    DEFAULT_URL_FOR_TEST,
-    popen_launch_server,
-)
-
-
-class TestEvalAccuracyMini(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        cls.process = popen_launch_server(
-            cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_mmlu(self):
-        args = SimpleNamespace(
-            base_url=self.base_url,
-            model=self.model,
-            eval_name="mmlu",
-            num_examples=64,
-            num_threads=32,
-            temperature=0.1,
-        )
-
-        metrics = run_eval(args)
-        self.assertGreaterEqual(metrics["score"], 0.65)
-
-
-if __name__ == "__main__":
-    unittest.main()

From 9539e312d56431631c35bf37bc88a6a1d888d48c Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sun, 9 Mar 2025 22:48:34 -0700
Subject: [PATCH 2/8] Fix

---
 python/sglang/srt/_custom_ops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py
index 6cc387706c1e..0584dc80fd69 100644
--- a/python/sglang/srt/_custom_ops.py
+++ b/python/sglang/srt/_custom_ops.py
@@ -9,7 +9,9 @@
 from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 
 logger = logging.getLogger(__name__)
-use_vllm_custom_allreduce = get_bool_env_var("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
+use_vllm_custom_allreduce = get_bool_env_var(
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+)
 
 if not is_hpu():
     # ROCm does not use vllm custom allreduce

From 30102330f0a037d43a9e33567c8e465be7152b0f Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sun, 9 Mar 2025 22:54:10 -0700
Subject: [PATCH 3/8] Fix

---
 test/srt/test_bench_one_batch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py
index 9f3e2575c5df..e015da6a1020 100644
--- a/test/srt/test_bench_one_batch.py
+++ b/test/srt/test_bench_one_batch.py
@@ -29,7 +29,7 @@ def test_moe_tp2_bs1(self):
         )
 
         use_vllm_custom_allreduce = get_bool_env_var(
-            "USE_VLLM_CUSTOM_ALLREDUCE", default=True
+            "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
         )
 
         if is_in_ci():

From 495570164bdb1634d843b313ad41409a2d316853 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sun, 9 Mar 2025 23:27:10 -0700
Subject: [PATCH 4/8] Fix tests

---
 .github/workflows/pr-test.yml      | 2 +-
 test/srt/test_gptqmodel_dynamic.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml
index 3b33b319d1a7..265e1374bc73 100644
--- a/.github/workflows/pr-test.yml
+++ b/.github/workflows/pr-test.yml
@@ -266,7 +266,7 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
 
-          USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          # USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
 
       - name: Benchmark single latency + torch.compile (TP=2)
         timeout-minutes: 10
diff --git a/test/srt/test_gptqmodel_dynamic.py b/test/srt/test_gptqmodel_dynamic.py
index 92e17a8e4d74..c9145fe6fb7b 100644
--- a/test/srt/test_gptqmodel_dynamic.py
+++ b/test/srt/test_gptqmodel_dynamic.py
@@ -129,6 +129,7 @@ def run_decode(self, max_new_tokens):
                 "text": "The capital of France is",
                 "sampling_params": {
                     "max_new_tokens": max_new_tokens,
+                    "temperature": 0.001,
                 },
             },
         )

From 23fe1f3629865ef71b83efa2a05215a4f0d1de3a Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sun, 9 Mar 2025 23:28:49 -0700
Subject: [PATCH 5/8] Fix

---
 test/srt/run_suite.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py
index 02e442497e4e..590c461362a3 100644
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -27,7 +27,6 @@ class TestFile:
         TestFile("test_eagle_infer.py", 447),
         TestFile("test_ebnf_constrained.py"),
         TestFile("test_fp8_kernel.py", 2),
-        TestFile("test_fp8_kvcache.py"),
         TestFile("test_embedding_openai_server.py", 36),
         TestFile("test_gguf.py", 78),
         TestFile("test_gptqmodel_dynamic.py", 72),

From 8a2505aee44449ccbda75f97c0a67cb48c71c795 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Sun, 9 Mar 2025 23:36:42 -0700
Subject: [PATCH 6/8] Fix torch extension

---
 sgl-kernel/csrc/torch_extension.cc | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/sgl-kernel/csrc/torch_extension.cc b/sgl-kernel/csrc/torch_extension.cc
index 1304915bfd17..d8bd89917cac 100644
--- a/sgl-kernel/csrc/torch_extension.cc
+++ b/sgl-kernel/csrc/torch_extension.cc
@@ -32,11 +32,8 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
   m.def("all_reduce(int fa, Tensor inp, Tensor! out) -> ()");
   m.impl("all_reduce", torch::kCUDA, &all_reduce);
 
-  m.def("get_graph_buffer_ipc_meta(int fa) -> (int[], int[])");
-  m.impl("get_graph_buffer_ipc_meta", torch::kCUDA, &get_graph_buffer_ipc_meta);
-
-  m.def("register_graph_buffers(int fa, int[][] handles, int[][] offsets) -> ()");
-  m.impl("register_graph_buffers", torch::kCUDA, &register_graph_buffers);
+  m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
+  m.def("register_graph_buffers", &register_graph_buffers);
 
   /*
    * From csrc/attention

From 8b5b94a7b6d7aa583408cbc1a02a2247eec422c6 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Mon, 10 Mar 2025 00:00:49 -0700
Subject: [PATCH 7/8] Remove useless imports

---
 sgl-kernel/tests/test_rotary_embedding.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sgl-kernel/tests/test_rotary_embedding.py b/sgl-kernel/tests/test_rotary_embedding.py
index b7a141404e6f..fa937a604935 100644
--- a/sgl-kernel/tests/test_rotary_embedding.py
+++ b/sgl-kernel/tests/test_rotary_embedding.py
@@ -1,9 +1,7 @@
-import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import pytest
 import torch
-import torch.nn as nn
 from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
 
 

From 18cb444fa6aed00c11e39bd6cfd713ee36f1cec9 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Mon, 10 Mar 2025 00:02:31 -0700
Subject: [PATCH 8/8] use more get_bool_env_var

---
 python/sglang/srt/layers/attention/flashinfer_backend.py | 2 +-
 python/sglang/srt/model_loader/loader.py                 | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py
index af47f2ce2496..791cbeec04c6 100644
--- a/python/sglang/srt/layers/attention/flashinfer_backend.py
+++ b/python/sglang/srt/layers/attention/flashinfer_backend.py
@@ -22,7 +22,7 @@
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py
index eff4aa5f31b4..c241fd9d6102 100644
--- a/python/sglang/srt/model_loader/loader.py
+++ b/python/sglang/srt/model_loader/loader.py
@@ -48,6 +48,7 @@
     safetensors_weights_iterator,
 )
 from sglang.srt.utils import (
+    get_bool_env_var,
     get_device_capability,
     is_pin_memory_available,
     set_weight_attrs,
@@ -197,7 +198,7 @@ def _maybe_download_from_modelscope(
 
         Returns the path to the downloaded model, or None if the model is not
         downloaded from ModelScope."""
-        if os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True":
+        if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
             # download model from ModelScope hub,
             # lazy import so that modelscope is not required for normal use.
             # pylint: disable=C.