From 0e10fc08d46de2d86a9c2d1cd8337315aa3bcb2d Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 9 Mar 2025 22:42:33 -0700 Subject: [PATCH 1/8] Fix tests --- python/sglang/srt/_custom_ops.py | 4 +- sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu | 5 +- test/srt/run_suite.py | 23 +++--- test/srt/test_bench_one_batch.py | 7 +- ...est_eval_accuracy_large_chunked_prefill.py | 68 ----------------- ...al_accuracy_large_mixed_chunked_prefill.py | 74 ------------------- test/srt/test_eval_accuracy_mini.py | 42 ----------- 7 files changed, 22 insertions(+), 201 deletions(-) delete mode 100644 test/srt/test_eval_accuracy_large_chunked_prefill.py delete mode 100644 test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py delete mode 100644 test/srt/test_eval_accuracy_mini.py diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py index d06765c3a8ce..6cc387706c1e 100644 --- a/python/sglang/srt/_custom_ops.py +++ b/python/sglang/srt/_custom_ops.py @@ -6,10 +6,10 @@ import torch import torch.library -from sglang.srt.utils import is_hip, is_hpu +from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu logger = logging.getLogger(__name__) -use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True) +use_vllm_custom_allreduce = get_bool_env_var("USE_VLLM_CUSTOM_ALLREDUCE", default=True) if not is_hpu(): # ROCm does not use vllm custom allreduce diff --git a/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu b/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu index d0a80c7bff57..2add0826f54c 100644 --- a/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu +++ b/sgl-kernel/csrc/gemm/cublas_grouped_gemm.cu @@ -100,7 +100,6 @@ void cublas_grouped_gemm( check_device_dtype(out_dtype, inputs); check_device_dtype(out_dtype, weights); check_device_dtype(out_dtype, outputs); - cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? 
CUDA_R_16F : CUDA_R_16BF); // Weights should be transposed to (n, k) of column major std::vector transa_array(group_count, CUBLAS_OP_T); @@ -132,7 +131,6 @@ void cublas_grouped_gemm( std::vector b_array = get_tensor_ptrs(inputs); std::vector c_array = get_tensor_ptrs(outputs); - auto handle = reinterpret_cast(cublas_handle); auto stream = reinterpret_cast(cuda_stream); // Should allocate tensors for storage of pointers @@ -141,6 +139,9 @@ void cublas_grouped_gemm( torch::Tensor d_c = create_ptr_pointer(c_array, stream); #if defined CUDA_VERSION && CUDA_VERSION >= 12050 + auto handle = reinterpret_cast(cublas_handle); + cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF); + auto status = cublasGemmGroupedBatchedEx( handle, transa_array.data(), diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index b9e36e2321c1..02e442497e4e 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -20,30 +20,34 @@ class TestFile: TestFile("models/test_generation_models.py", 103), TestFile("models/test_qwen_models.py", 82), TestFile("models/test_reward_models.py", 83), - TestFile("test_gptqmodel_dynamic.py", 72), TestFile("models/test_gme_qwen_models.py", 45), TestFile("test_abort.py", 51), + TestFile("test_block_int8.py", 22), TestFile("test_chunked_prefill.py", 336), - TestFile("test_custom_allreduce.py", 1), - TestFile("test_double_sparsity.py", 50), TestFile("test_eagle_infer.py", 447), + TestFile("test_ebnf_constrained.py"), + TestFile("test_fp8_kernel.py", 2), + TestFile("test_fp8_kvcache.py"), TestFile("test_embedding_openai_server.py", 36), - TestFile("test_eval_accuracy_mini.py", 63), TestFile("test_gguf.py", 78), + TestFile("test_gptqmodel_dynamic.py", 72), + TestFile("test_hidden_states.py", 55), + TestFile("test_int8_kernel.py", 1), TestFile("test_input_embeddings.py", 38), + TestFile("test_json_constrained.py", 98), + TestFile("test_large_max_new_tokens.py", 41), + TestFile("test_metrics.py", 32), 
TestFile("test_mla.py", 92), TestFile("test_mla_deepseek_v3.py", 221), TestFile("test_mla_flashinfer.py", 395), TestFile("test_mla_fp8.py", 93), - TestFile("test_json_constrained.py", 98), - TestFile("test_large_max_new_tokens.py", 41), - TestFile("test_metrics.py", 32), TestFile("test_no_chunked_prefill.py", 126), TestFile("test_no_overlap_scheduler.py", 262), TestFile("test_openai_server.py", 124), TestFile("test_penalty.py", 41), TestFile("test_pytorch_sampling_backend.py", 66), TestFile("test_radix_attention.py", 167), + TestFile("test_reasoning_content.py", 89), TestFile("test_regex_constrained.py", 64), TestFile("test_release_memory_occupation.py", 44), TestFile("test_request_length_validation.py", 31), @@ -58,7 +62,6 @@ class TestFile: TestFile("test_torchao.py", 70), TestFile("test_triton_attention_kernels.py", 4), TestFile("test_triton_attention_backend.py", 134), - TestFile("test_hidden_states.py", 55), TestFile("test_update_weights_from_disk.py", 114), TestFile("test_update_weights_from_tensor.py", 48), TestFile("test_vertex_endpoint.py", 31), @@ -66,10 +69,6 @@ class TestFile: TestFile("test_vision_llm.py", 18.4), TestFile("test_vision_openai_server.py", 344), TestFile("test_w8a8_quantization.py", 46), - TestFile("test_fp8_kernel.py", 2), - TestFile("test_block_int8.py", 22), - TestFile("test_int8_kernel.py", 1), - TestFile("test_reasoning_content.py", 89), ], "nightly": [ TestFile("test_nightly_gsm8k_eval.py"), diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py index f4140b89fce5..9f3e2575c5df 100644 --- a/test/srt/test_bench_one_batch.py +++ b/test/srt/test_bench_one_batch.py @@ -3,6 +3,7 @@ from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_MOE_MODEL_NAME_FOR_TEST, + get_bool_env_var, is_in_ci, run_bench_one_batch, write_github_step_summary, @@ -27,9 +28,13 @@ def test_moe_tp2_bs1(self): DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"] ) + use_vllm_custom_allreduce = 
get_bool_env_var( + "USE_VLLM_CUSTOM_ALLREDUCE", default=True + ) + if is_in_ci(): write_github_step_summary( - f"### test_moe_tp2_bs1\n" + f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})\n" f"output_throughput : {output_throughput:.2f} token/s\n" ) self.assertGreater(output_throughput, 124) diff --git a/test/srt/test_eval_accuracy_large_chunked_prefill.py b/test/srt/test_eval_accuracy_large_chunked_prefill.py deleted file mode 100644 index c8ce5cff2b3b..000000000000 --- a/test/srt/test_eval_accuracy_large_chunked_prefill.py +++ /dev/null @@ -1,68 +0,0 @@ -import unittest -from types import SimpleNamespace - -from sglang.srt.utils import kill_process_tree -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - popen_launch_server, -) - - -class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mmlu(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mmlu", - num_examples=3000, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.705, f"{metrics}" - - def test_human_eval(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="humaneval", - num_examples=None, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.64, f"{metrics}" - - def test_mgsm_en(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=None, - num_threads=1024, - ) - - metrics = 
run_eval(args) - assert metrics["score"] >= 0.84, f"{metrics}" - - -if __name__ == "__main__": - unittest.main() diff --git a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py b/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py deleted file mode 100644 index 3bc115874f0b..000000000000 --- a/test/srt/test_eval_accuracy_large_mixed_chunked_prefill.py +++ /dev/null @@ -1,74 +0,0 @@ -import unittest -from types import SimpleNamespace - -from sglang.srt.utils import kill_process_tree -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - popen_launch_server, -) - - -class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, - cls.base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=[ - "--log-level-http", - "warning", - "--chunked-prefill-size", - "256", - "--enable-mixed-chunk", - ], - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mmlu(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mmlu", - num_examples=3000, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.705, f"{metrics}" - - def test_human_eval(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="humaneval", - num_examples=None, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.64, f"{metrics}" - - def test_mgsm_en(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mgsm_en", - num_examples=None, - num_threads=1024, - ) - - metrics = run_eval(args) - assert metrics["score"] >= 0.84, f"{metrics}" - - -if __name__ == "__main__": - unittest.main() diff --git 
a/test/srt/test_eval_accuracy_mini.py b/test/srt/test_eval_accuracy_mini.py deleted file mode 100644 index a008c3869e8d..000000000000 --- a/test/srt/test_eval_accuracy_mini.py +++ /dev/null @@ -1,42 +0,0 @@ -import unittest -from types import SimpleNamespace - -from sglang.srt.utils import kill_process_tree -from sglang.test.run_eval import run_eval -from sglang.test.test_utils import ( - DEFAULT_MODEL_NAME_FOR_TEST, - DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - DEFAULT_URL_FOR_TEST, - popen_launch_server, -) - - -class TestEvalAccuracyMini(unittest.TestCase): - @classmethod - def setUpClass(cls): - cls.model = DEFAULT_MODEL_NAME_FOR_TEST - cls.base_url = DEFAULT_URL_FOR_TEST - cls.process = popen_launch_server( - cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH - ) - - @classmethod - def tearDownClass(cls): - kill_process_tree(cls.process.pid) - - def test_mmlu(self): - args = SimpleNamespace( - base_url=self.base_url, - model=self.model, - eval_name="mmlu", - num_examples=64, - num_threads=32, - temperature=0.1, - ) - - metrics = run_eval(args) - self.assertGreaterEqual(metrics["score"], 0.65) - - -if __name__ == "__main__": - unittest.main() From 9539e312d56431631c35bf37bc88a6a1d888d48c Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 9 Mar 2025 22:48:34 -0700 Subject: [PATCH 2/8] Fix --- python/sglang/srt/_custom_ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/sglang/srt/_custom_ops.py b/python/sglang/srt/_custom_ops.py index 6cc387706c1e..0584dc80fd69 100644 --- a/python/sglang/srt/_custom_ops.py +++ b/python/sglang/srt/_custom_ops.py @@ -9,7 +9,9 @@ from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu logger = logging.getLogger(__name__) -use_vllm_custom_allreduce = get_bool_env_var("USE_VLLM_CUSTOM_ALLREDUCE", default=True) +use_vllm_custom_allreduce = get_bool_env_var( + "USE_VLLM_CUSTOM_ALLREDUCE", default="true" +) if not is_hpu(): # ROCm does not use vllm custom allreduce From 
30102330f0a037d43a9e33567c8e465be7152b0f Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 9 Mar 2025 22:54:10 -0700 Subject: [PATCH 3/8] Fix --- test/srt/test_bench_one_batch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/test_bench_one_batch.py b/test/srt/test_bench_one_batch.py index 9f3e2575c5df..e015da6a1020 100644 --- a/test/srt/test_bench_one_batch.py +++ b/test/srt/test_bench_one_batch.py @@ -29,7 +29,7 @@ def test_moe_tp2_bs1(self): ) use_vllm_custom_allreduce = get_bool_env_var( - "USE_VLLM_CUSTOM_ALLREDUCE", default=True + "USE_VLLM_CUSTOM_ALLREDUCE", default="true" ) if is_in_ci(): From 495570164bdb1634d843b313ad41409a2d316853 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 9 Mar 2025 23:27:10 -0700 Subject: [PATCH 4/8] Fix tests --- .github/workflows/pr-test.yml | 2 +- test/srt/test_gptqmodel_dynamic.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yml b/.github/workflows/pr-test.yml index 3b33b319d1a7..265e1374bc73 100644 --- a/.github/workflows/pr-test.yml +++ b/.github/workflows/pr-test.yml @@ -266,7 +266,7 @@ jobs: cd test/srt python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 - USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 + # USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1 - name: Benchmark single latency + torch.compile (TP=2) timeout-minutes: 10 diff --git a/test/srt/test_gptqmodel_dynamic.py b/test/srt/test_gptqmodel_dynamic.py index 92e17a8e4d74..c9145fe6fb7b 100644 --- a/test/srt/test_gptqmodel_dynamic.py +++ b/test/srt/test_gptqmodel_dynamic.py @@ -129,6 +129,7 @@ def run_decode(self, max_new_tokens): "text": "The capital of France is", "sampling_params": { "max_new_tokens": max_new_tokens, + "temperature": 0.001, }, }, ) From 23fe1f3629865ef71b83efa2a05215a4f0d1de3a Mon Sep 17 00:00:00 2001 From: Lianmin Zheng 
Date: Sun, 9 Mar 2025 23:28:49 -0700 Subject: [PATCH 5/8] Fix --- test/srt/run_suite.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/srt/run_suite.py b/test/srt/run_suite.py index 02e442497e4e..590c461362a3 100644 --- a/test/srt/run_suite.py +++ b/test/srt/run_suite.py @@ -27,7 +27,6 @@ class TestFile: TestFile("test_eagle_infer.py", 447), TestFile("test_ebnf_constrained.py"), TestFile("test_fp8_kernel.py", 2), - TestFile("test_fp8_kvcache.py"), TestFile("test_embedding_openai_server.py", 36), TestFile("test_gguf.py", 78), TestFile("test_gptqmodel_dynamic.py", 72), From 8a2505aee44449ccbda75f97c0a67cb48c71c795 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Sun, 9 Mar 2025 23:36:42 -0700 Subject: [PATCH 6/8] Fix torch extension --- sgl-kernel/csrc/torch_extension.cc | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sgl-kernel/csrc/torch_extension.cc b/sgl-kernel/csrc/torch_extension.cc index 1304915bfd17..d8bd89917cac 100644 --- a/sgl-kernel/csrc/torch_extension.cc +++ b/sgl-kernel/csrc/torch_extension.cc @@ -32,11 +32,8 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) { m.def("all_reduce(int fa, Tensor inp, Tensor! 
out) -> ()"); m.impl("all_reduce", torch::kCUDA, &all_reduce); - m.def("get_graph_buffer_ipc_meta(int fa) -> (int[], int[])"); - m.impl("get_graph_buffer_ipc_meta", torch::kCUDA, &get_graph_buffer_ipc_meta); - - m.def("register_graph_buffers(int fa, int[][] handles, int[][] offsets) -> ()"); - m.impl("register_graph_buffers", torch::kCUDA, &register_graph_buffers); + m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); + m.def("register_graph_buffers", &register_graph_buffers); /* * From csrc/attention From 8b5b94a7b6d7aa583408cbc1a02a2247eec422c6 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 10 Mar 2025 00:00:49 -0700 Subject: [PATCH 7/8] Remove useless imports --- sgl-kernel/tests/test_rotary_embedding.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sgl-kernel/tests/test_rotary_embedding.py b/sgl-kernel/tests/test_rotary_embedding.py index b7a141404e6f..fa937a604935 100644 --- a/sgl-kernel/tests/test_rotary_embedding.py +++ b/sgl-kernel/tests/test_rotary_embedding.py @@ -1,9 +1,7 @@ -import math from typing import Any, Dict, List, Optional, Tuple, Union import pytest import torch -import torch.nn as nn from sgl_kernel import apply_rope_with_cos_sin_cache_inplace From 18cb444fa6aed00c11e39bd6cfd713ee36f1cec9 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Mon, 10 Mar 2025 00:02:31 -0700 Subject: [PATCH 8/8] use more get_bool_env_var --- python/sglang/srt/layers/attention/flashinfer_backend.py | 2 +- python/sglang/srt/model_loader/loader.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/layers/attention/flashinfer_backend.py b/python/sglang/srt/layers/attention/flashinfer_backend.py index af47f2ce2496..791cbeec04c6 100644 --- a/python/sglang/srt/layers/attention/flashinfer_backend.py +++ b/python/sglang/srt/layers/attention/flashinfer_backend.py @@ -22,7 +22,7 @@ from sglang.srt.layers.dp_attention import get_attention_tp_size from sglang.srt.model_executor.forward_batch_info import ForwardBatch,
ForwardMode from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput -from sglang.srt.utils import is_flashinfer_available +from sglang.srt.utils import get_bool_env_var, is_flashinfer_available if TYPE_CHECKING: from sglang.srt.layers.radix_attention import RadixAttention diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py index eff4aa5f31b4..c241fd9d6102 100644 --- a/python/sglang/srt/model_loader/loader.py +++ b/python/sglang/srt/model_loader/loader.py @@ -48,6 +48,7 @@ safetensors_weights_iterator, ) from sglang.srt.utils import ( + get_bool_env_var, get_device_capability, is_pin_memory_available, set_weight_attrs, @@ -197,7 +198,7 @@ def _maybe_download_from_modelscope( Returns the path to the downloaded model, or None if the model is not downloaded from ModelScope.""" - if os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True": + if get_bool_env_var("SGLANG_USE_MODELSCOPE"): # download model from ModelScope hub, # lazy import so that modelscope is not required for normal use. # pylint: disable=C.