diff --git a/docs/advanced_features/router.md b/docs/advanced_features/router.md index af446f9d795d..388b86cda2cb 100644 --- a/docs/advanced_features/router.md +++ b/docs/advanced_features/router.md @@ -45,7 +45,7 @@ SGLang Model Gateway is a high-performance model-routing gateway for large-scale ## Architecture ### Control Plane -- **Worker Manager** discovers capabilities (`/server_info`, `/model_info`), tracks load, and registers/removes workers in the shared registry. +- **Worker Manager** discovers capabilities (`/get_server_info`, `/get_model_info`), tracks load, and registers/removes workers in the shared registry. - **Job Queue** serializes add/remove requests and exposes status (`/workers/{url}`) so clients can track onboarding progress. - **Load Monitor** feeds cache-aware and power-of-two policies with live worker load statistics. - **Health Checker** continuously probes workers and updates readiness, circuit breaker state, and router metrics. diff --git a/docs/advanced_features/server_arguments.md b/docs/advanced_features/server_arguments.md index 22f92109e1ff..891cbacfaaf4 100644 --- a/docs/advanced_features/server_arguments.md +++ b/docs/advanced_features/server_arguments.md @@ -219,7 +219,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s | Argument | Description | Defaults | Options | | --- | --- | --- | --- | | `--json-model-override-args` | A dictionary in JSON string format used to override default model configurations. | `{}` | Type: str | -| `--preferred-sampling-params` | json-formatted sampling settings that will be returned in /model_info | `None` | Type: str | +| `--preferred-sampling-params` | json-formatted sampling settings that will be returned in /get_model_info | `None` | Type: str | ## LoRA | Argument | Description | Defaults | Options | diff --git a/docs/basic_usage/native_api.ipynb b/docs/basic_usage/native_api.ipynb index 672f3643e196..4f95ab2ccbc4 100644 --- a/docs/basic_usage/native_api.ipynb +++ b/docs/basic_usage/native_api.ipynb @@ -9,8 +9,8 @@ "Apart from the OpenAI compatible APIs, the SGLang Runtime also provides its native server APIs. We introduce the following APIs:\n", "\n", "- `/generate` (text generation model)\n", - "- `/model_info`\n", - "- `/server_info`\n", + "- `/get_model_info`\n", + "- `/get_server_info`\n", "- `/health`\n", "- `/health_generate`\n", "- `/flush_cache`\n", @@ -99,7 +99,7 @@ "metadata": {}, "outputs": [], "source": [ - "url = f\"http://localhost:{port}/model_info\"\n", + "url = f\"http://localhost:{port}/get_model_info\"\n", "\n", "response = requests.get(url)\n", "response_json = response.json()\n", @@ -127,7 +127,7 @@ "source": [ "## Get Server Info\n", "Gets the server information including CLI arguments, token limits, and memory pool sizes.\n", - "- Note: `server_info` merges the following deprecated endpoints:\n", + "- Note: `get_server_info` merges the following deprecated endpoints:\n", " - `get_server_args`\n", " - `get_memory_pool_size` \n", " - `get_max_total_num_tokens`" @@ -139,7 +139,7 @@ "metadata": {}, "outputs": [], "source": [ - "url = f\"http://localhost:{port}/server_info\"\n", + "url = f\"http://localhost:{port}/get_server_info\"\n", "\n", "response = requests.get(url)\n", "print_highlight(response.text)" diff --git a/docs/developer_guide/bench_serving.md b/docs/developer_guide/bench_serving.md index caafaa74ce65..b2f8568e260f 100644 --- a/docs/developer_guide/bench_serving.md +++ b/docs/developer_guide/bench_serving.md @@ -352,4 +352,4 @@ python3 -m sglang.bench_serving \ ### Notes - The script raises the file descriptor soft limit (`RLIMIT_NOFILE`) to help with many concurrent connections. -- For sglang, `/server_info` is queried post-run to report speculative decoding accept length when available. +- For sglang, `/get_server_info` is queried post-run to report speculative decoding accept length when available. diff --git a/python/sglang/bench_one_batch_server.py b/python/sglang/bench_one_batch_server.py index f4c6e75b3330..159a3a935959 100644 --- a/python/sglang/bench_one_batch_server.py +++ b/python/sglang/bench_one_batch_server.py @@ -339,7 +339,7 @@ def run_one_case( output_throughput = batch_size * output_len / (latency - last_ttft) overall_throughput = batch_size * (input_len + output_len) / latency - server_info = requests.get(url + "/server_info").json() + server_info = requests.get(url + "/get_server_info").json() internal_state = server_info.get("internal_states", [{}]) last_gen_throughput = internal_state[0].get("last_gen_throughput", None) or -1 acc_length = internal_state[0].get("avg_spec_accept_length", None) or -1 @@ -451,7 +451,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs): proc, base_url = launch_server_process(server_args) # Get tokenizer - server_info = requests.get(base_url + "/server_info").json() + server_info = requests.get(base_url + "/get_server_info").json() if "tokenizer_path" in server_info: tokenizer_path = server_info["tokenizer_path"] elif "prefill" in server_info: diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 00ea3b0a43a3..002674894f45 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -2062,7 +2062,7 @@ async def limited_request_func(request_func_input, pbar): if "sglang" in backend: server_info = requests.get( - base_url + "/server_info", headers=get_auth_headers() + base_url + "/get_server_info", headers=get_auth_headers() ) if server_info.status_code == 200: server_info_json = server_info.json() @@ -2180,7 +2180,7 @@ async def limited_request_func(request_func_input, pbar): print("{:<40} {:<10.2f}".format("Max ITL (ms):", metrics.max_itl_ms)) print("=" * 50) - resp = requests.get(base_url + "/server_info", headers=get_auth_headers()) + resp = requests.get(base_url + "/get_server_info", headers=get_auth_headers()) server_info = resp.json() if resp.status_code == 200 else None if ( diff --git a/python/sglang/lang/backend/runtime_endpoint.py b/python/sglang/lang/backend/runtime_endpoint.py index 746646aaf495..1573ca68da77 100644 --- a/python/sglang/lang/backend/runtime_endpoint.py +++ b/python/sglang/lang/backend/runtime_endpoint.py @@ -38,7 +38,7 @@ def __init__( self.verify = verify res = http_request( - self.base_url + "/model_info", + self.base_url + "/get_model_info", api_key=self.api_key, verify=self.verify, ) @@ -66,7 +66,7 @@ def flush_cache(self): def get_server_info(self): res = http_request( - self.base_url + "/server_info", + self.base_url + "/get_server_info", api_key=self.api_key, verify=self.verify, ) @@ -514,7 +514,7 @@ def encode( async def get_server_info(self): async with aiohttp.ClientSession() as session: - async with session.get(f"{self.url}/server_info") as response: + async with session.get(f"{self.url}/get_server_info") as response: if response.status == 200: return await response.json() else: diff --git a/python/sglang/profiler.py b/python/sglang/profiler.py index e23b83ac314a..0ffa07f26602 100644 --- a/python/sglang/profiler.py +++ b/python/sglang/profiler.py @@ -41,7 +41,7 @@ def run_profile( # Dump server args. file_path = Path(output_dir) / "server_args.json" if not file_path.exists(): - response = requests.get(url + "/server_info") + response = requests.get(url + "/get_server_info") response.raise_for_status() server_args_data = response.json() with open(file_path, "w") as file: diff --git a/python/sglang/srt/entrypoints/http_server.py b/python/sglang/srt/entrypoints/http_server.py index cc324375e917..cf0a3784fe8c 100644 --- a/python/sglang/srt/entrypoints/http_server.py +++ b/python/sglang/srt/entrypoints/http_server.py @@ -1480,7 +1480,7 @@ def _execute_server_warmup( for _ in range(120): time.sleep(1) try: - res = requests.get(url + "/model_info", timeout=5, headers=headers) + res = requests.get(url + "/get_model_info", timeout=5, headers=headers) assert res.status_code == 200, f"{res=}, {res.text=}" success = True break diff --git a/python/sglang/srt/server_args.py b/python/sglang/srt/server_args.py index 1f6531b245ce..ceebe206bccc 100644 --- a/python/sglang/srt/server_args.py +++ b/python/sglang/srt/server_args.py @@ -2801,7 +2801,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--preferred-sampling-params", type=str, - help="json-formatted sampling settings that will be returned in /model_info", + help="json-formatted sampling settings that will be returned in /get_model_info", ) # LoRA diff --git a/scripts/playground/bench_speculative.py b/scripts/playground/bench_speculative.py index 457f1c02254f..a67ed0c3d70b 100644 --- a/scripts/playground/bench_speculative.py +++ b/scripts/playground/bench_speculative.py @@ -120,7 +120,7 @@ def send_one_batch(base_url, num_prompts, batch_size, processor, is_multimodal): acc_length = results["accept_length"] or 1.0 avg_output_token = results["total_output_tokens"] / results["completed"] - server_info = requests.get(base_url + "/server_info").json() + server_info = requests.get(base_url + "/get_server_info").json() # We use 20% percentile instead of median on purpose step_time = np.percentile( server_info["internal_states"][0]["step_time_dict"][str(batch_size)], 20 diff --git a/test/manual/test_deepseek_v32_cp_single_node.py b/test/manual/test_deepseek_v32_cp_single_node.py index 701a3da990b0..c2a6b9d4c824 100644 --- a/test/manual/test_deepseek_v32_cp_single_node.py +++ b/test/manual/test_deepseek_v32_cp_single_node.py @@ -79,7 +79,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/manual/test_eagle_infer_beta_dp_attention.py b/test/manual/test_eagle_infer_beta_dp_attention.py index 114d0a865331..382196a18fd5 100644 --- a/test/manual/test_eagle_infer_beta_dp_attention.py +++ b/test/manual/test_eagle_infer_beta_dp_attention.py @@ -77,7 +77,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/manual/test_weight_version.py b/test/manual/test_weight_version.py index c39eb6d9e849..5011ee701726 100644 --- a/test/manual/test_weight_version.py +++ b/test/manual/test_weight_version.py @@ -3,7 +3,7 @@ This test suite verifies the weight_version feature implementation including: 1. Default weight_version setting -2. /weight_version endpoint +2. /get_weight_version endpoint 3. /update_weight_version endpoint 4. /generate request meta_info contains weight_version 5. OpenAI API response metadata contains weight_version @@ -48,13 +48,13 @@ def tearDownClass(cls): def test_weight_version_comprehensive(self): """Comprehensive test for all weight_version functionality.""" - response = requests.get(f"{self.base_url}/model_info") + response = requests.get(f"{self.base_url}/get_model_info") self.assertEqual(response.status_code, 200) data = response.json() self.assertIn("weight_version", data) self.assertEqual(data["weight_version"], "test_version_1.0") - response = requests.get(f"{self.base_url}/weight_version") + response = requests.get(f"{self.base_url}/get_weight_version") self.assertEqual(response.status_code, 200) data = response.json() self.assertIn("weight_version", data) @@ -114,7 +114,7 @@ def test_weight_version_comprehensive(self): self.assertTrue(data["success"]) self.assertEqual(data["new_version"], "updated_version_2.0") - response = requests.get(f"{self.base_url}/weight_version") + response = requests.get(f"{self.base_url}/get_weight_version") self.assertEqual(response.status_code, 200) data = response.json() self.assertEqual(data["weight_version"], "updated_version_2.0") @@ -148,13 +148,13 @@ def test_weight_version_comprehensive(self): self.assertTrue(data["success"]) self.assertEqual(data["new_version"], "final_version_3.0") - # Check /weight_version - response = requests.get(f"{self.base_url}/weight_version") + # Check /get_weight_version + response = requests.get(f"{self.base_url}/get_weight_version") self.assertEqual(response.status_code, 200) self.assertEqual(response.json()["weight_version"], "final_version_3.0") - # Check /model_info - response = requests.get(f"{self.base_url}/model_info") + # Check /get_model_info + response = requests.get(f"{self.base_url}/get_model_info") self.assertEqual(response.status_code, 200) self.assertEqual(response.json()["weight_version"], "final_version_3.0") @@ -193,7 +193,7 @@ def test_update_weight_version_with_weight_updates(self): print("Testing weight_version update with real weight operations...") # Get current model info for reference - model_info_response = requests.get(f"{self.base_url}/model_info") + model_info_response = requests.get(f"{self.base_url}/get_model_info") self.assertEqual(model_info_response.status_code, 200) current_model_path = model_info_response.json()["model_path"] @@ -214,7 +214,7 @@ def test_update_weight_version_with_weight_updates(self): ) # Verify version was updated - version_response = requests.get(f"{self.base_url}/weight_version") + version_response = requests.get(f"{self.base_url}/get_weight_version") self.assertEqual(version_response.status_code, 200) self.assertEqual( version_response.json()["weight_version"], "disk_update_v2.0.0" diff --git a/test/srt/ep/test_deepep_large.py b/test/srt/ep/test_deepep_large.py index 6d9b43c39100..26f4925f744a 100644 --- a/test/srt/ep/test_deepep_large.py +++ b/test/srt/ep/test_deepep_large.py @@ -143,7 +143,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.92) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/ep/test_deepep_small.py b/test/srt/ep/test_deepep_small.py index 2c27f783c5c5..2dd42b18913a 100644 --- a/test/srt/ep/test_deepep_small.py +++ b/test/srt/ep/test_deepep_small.py @@ -307,7 +307,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -381,7 +381,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/hicache/test_hicache_variants.py b/test/srt/hicache/test_hicache_variants.py index c67ffe9f1c52..98706878cb76 100644 --- a/test/srt/hicache/test_hicache_variants.py +++ b/test/srt/hicache/test_hicache_variants.py @@ -152,7 +152,7 @@ def test_mmlu(self): self.assertGreaterEqual(metrics["score"], self.expected_mmlu_score) # EAGLE-specific check - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") print(f"{server_info=}") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" diff --git a/test/srt/quant/test_w4a8_deepseek_v3.py b/test/srt/quant/test_w4a8_deepseek_v3.py index 1925aace81f0..064986a577a0 100644 --- a/test/srt/quant/test_w4a8_deepseek_v3.py +++ b/test/srt/quant/test_w4a8_deepseek_v3.py @@ -103,7 +103,7 @@ def test_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/rl/test_update_weights_from_disk.py b/test/srt/rl/test_update_weights_from_disk.py index 00bd72b33d3b..0127b98dc0f3 100644 --- a/test/srt/rl/test_update_weights_from_disk.py +++ b/test/srt/rl/test_update_weights_from_disk.py @@ -113,7 +113,7 @@ def run_decode_random(self, max_new_tokens=32): return response.json() def get_model_info(self): - response = requests.get(self.base_url + "/model_info") + response = requests.get(self.base_url + "/get_model_info") model_path = response.json()["model_path"] print(json.dumps(response.json())) return model_path @@ -254,7 +254,7 @@ def run_decode(self, max_new_tokens=32): return response.json() def get_model_info(self): - response = requests.get(self.base_url + "/model_info") + response = requests.get(self.base_url + "/get_model_info") model_path = response.json()["model_path"] print(json.dumps(response.json())) return model_path @@ -386,7 +386,7 @@ def run_decode(): return response.json()["text"] def get_model_info(): - response = requests.get(base_url + "/model_info") + response = requests.get(base_url + "/get_model_info") model_path = response.json()["model_path"] print(json.dumps(response.json())) return model_path diff --git a/test/srt/rl/test_update_weights_from_tensor.py b/test/srt/rl/test_update_weights_from_tensor.py index 5d545e7a89b3..04dd01d96c4a 100644 --- a/test/srt/rl/test_update_weights_from_tensor.py +++ b/test/srt/rl/test_update_weights_from_tensor.py @@ -209,7 +209,7 @@ def run_decode(self, max_new_tokens=32): return response.json() def get_model_info(self): - response = requests.get(self.base_url + "/model_info") + response = requests.get(self.base_url + "/get_model_info") model_path = response.json()["model_path"] print(json.dumps(response.json())) return model_path diff --git a/test/srt/test_data_parallelism.py b/test/srt/test_data_parallelism.py index 06dd101600b8..e54e3a5cf540 100644 --- a/test/srt/test_data_parallelism.py +++ b/test/srt/test_data_parallelism.py @@ -65,12 +65,12 @@ def test_update_weight(self): def test_get_memory_pool_size(self): # use `get_server_info` instead since `get_memory_pool_size` is merged into `get_server_info` - response = requests.get(self.base_url + "/server_info") + response = requests.get(self.base_url + "/get_server_info") assert response.status_code == 200 time.sleep(1) - response = requests.get(self.base_url + "/server_info") + response = requests.get(self.base_url + "/get_server_info") assert response.status_code == 200 diff --git a/test/srt/test_deepseek_v32_mtp.py b/test/srt/test_deepseek_v32_mtp.py index 2714544aa8a1..41e3aa78d224 100644 --- a/test/srt/test_deepseek_v32_mtp.py +++ b/test/srt/test_deepseek_v32_mtp.py @@ -69,7 +69,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/test_deepseek_v3_fp4_4gpu.py b/test/srt/test_deepseek_v3_fp4_4gpu.py index 0f326b8c473c..1f236f0953c8 100644 --- a/test/srt/test_deepseek_v3_fp4_4gpu.py +++ b/test/srt/test_deepseek_v3_fp4_4gpu.py @@ -143,7 +143,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/test_deepseek_v3_mtp.py b/test/srt/test_deepseek_v3_mtp.py index 8073868e6610..b8c43a293557 100644 --- a/test/srt/test_deepseek_v3_mtp.py +++ b/test/srt/test_deepseek_v3_mtp.py @@ -67,7 +67,7 @@ def test_a_gsm8k( metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/test_dp_attention.py b/test/srt/test_dp_attention.py index 6669e16e38ae..426616456759 100644 --- a/test/srt/test_dp_attention.py +++ b/test/srt/test_dp_attention.py @@ -112,7 +112,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/test_eagle_dp_attention.py b/test/srt/test_eagle_dp_attention.py index fb93e6443699..14c1ad394326 100644 --- a/test/srt/test_eagle_dp_attention.py +++ b/test/srt/test_eagle_dp_attention.py @@ -78,7 +78,7 @@ def test_a_gsm8k(self): metrics = run_eval_few_shot_gsm8k(args) print(f"{metrics=}") - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") server_data = server_info.json() # Try to get avg_spec_accept_length diff --git a/test/srt/test_eagle_infer_b.py b/test/srt/test_eagle_infer_b.py index 0bff1c6d04e3..5b9df163005a 100644 --- a/test/srt/test_eagle_infer_b.py +++ b/test/srt/test_eagle_infer_b.py @@ -144,7 +144,7 @@ def test_gsm8k(self): print(f"{metrics=}") self.assertGreater(metrics["accuracy"], 0.20) - server_info = requests.get(self.base_url + "/server_info").json() + server_info = requests.get(self.base_url + "/get_server_info").json() avg_spec_accept_length = server_info["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/test_fa3.py b/test/srt/test_fa3.py index 47b2e8a5ef96..739b143fa733 100644 --- a/test/srt/test_fa3.py +++ b/test/srt/test_fa3.py @@ -112,7 +112,7 @@ def test_gsm8k(self): self.assertGreater(metrics[metric_key], self.accuracy_threshold) if self.speculative_decode: - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/test_flashmla.py b/test/srt/test_flashmla.py index 5edf5b3d7504..e9c69e5d5b11 100644 --- a/test/srt/test_flashmla.py +++ b/test/srt/test_flashmla.py @@ -119,7 +119,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") print(f"{server_info=}") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" diff --git a/test/srt/test_hybrid_attn_backend.py b/test/srt/test_hybrid_attn_backend.py index 329bc1e81a1b..a16203ae73e8 100644 --- a/test/srt/test_hybrid_attn_backend.py +++ b/test/srt/test_hybrid_attn_backend.py @@ -88,7 +88,7 @@ def test_gsm8k(self): self.assertGreater(metrics[metric_key], self.accuracy_threshold) if self.speculative_decode: - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/test_mla_deepseek_v3.py b/test/srt/test_mla_deepseek_v3.py index 8d999a316d19..4e9e99ce53e8 100644 --- a/test/srt/test_mla_deepseek_v3.py +++ b/test/srt/test_mla_deepseek_v3.py @@ -179,7 +179,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/test_mla_flashinfer.py b/test/srt/test_mla_flashinfer.py index 791d44f2912c..f72aef5a5300 100644 --- a/test/srt/test_mla_flashinfer.py +++ b/test/srt/test_mla_flashinfer.py @@ -111,7 +111,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") print(f"{server_info=}") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" diff --git a/test/srt/test_mla_int8_deepseek_v3.py b/test/srt/test_mla_int8_deepseek_v3.py index db73f824b0a4..ceea88351113 100644 --- a/test/srt/test_mla_int8_deepseek_v3.py +++ b/test/srt/test_mla_int8_deepseek_v3.py @@ -113,7 +113,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] @@ -217,7 +217,7 @@ def test_gsm8k(self): self.assertGreater(metrics["accuracy"], 0.60) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/test_ngram_speculative_decoding.py b/test/srt/test_ngram_speculative_decoding.py index 8527a9a643c9..3106fa970686 100644 --- a/test/srt/test_ngram_speculative_decoding.py +++ b/test/srt/test_ngram_speculative_decoding.py @@ -80,7 +80,7 @@ def test_gsm8k(self): metric_key = "accuracy" self.assertGreater(metrics[metric_key], self.accuracy_threshold) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ] diff --git a/test/srt/test_srt_endpoint.py b/test/srt/test_srt_endpoint.py index 12c247ac7048..59a8c3c46c7c 100644 --- a/test/srt/test_srt_endpoint.py +++ b/test/srt/test_srt_endpoint.py @@ -496,7 +496,7 @@ def send_and_check_cached_tokens(input_ids): self.assertEqual(send_and_check_cached_tokens(range(0, 11000)), 10000) def test_get_server_info(self): - response = requests.get(self.base_url + "/server_info") + response = requests.get(self.base_url + "/get_server_info") response_json = response.json() max_total_num_tokens = response_json["max_total_num_tokens"] @@ -626,7 +626,7 @@ def test_get_server_info_concurrent(self): tp = ThreadPoolExecutor(max_workers=30) def s(): - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") server_info.json() futures = [] diff --git a/test/srt/test_standalone_speculative_decoding.py b/test/srt/test_standalone_speculative_decoding.py index 62a8aae9be83..70d18db4ae58 100644 --- a/test/srt/test_standalone_speculative_decoding.py +++ b/test/srt/test_standalone_speculative_decoding.py @@ -88,7 +88,7 @@ def test_gsm8k(self): metric_key = "accuracy" self.assertGreater(metrics[metric_key], self.accuracy_threshold) - server_info = requests.get(self.base_url + "/server_info") + server_info = requests.get(self.base_url + "/get_server_info") avg_spec_accept_length = server_info.json()["internal_states"][0][ "avg_spec_accept_length" ]