Skip to content

Commit 2fcd56e

Browse files
authored
[router] add get server info and get model info in grpc server (#11303)
1 parent 0958a39 commit 2fcd56e

File tree

7 files changed

+393
-3
lines changed

7 files changed

+393
-3
lines changed

python/sglang/srt/entrypoints/grpc_server.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
import argparse
77
import asyncio
8+
import dataclasses
89
import logging
910
import multiprocessing as mp
1011
import os
@@ -15,8 +16,11 @@
1516

1617
import grpc
1718
from google.protobuf.json_format import MessageToDict
19+
from google.protobuf.struct_pb2 import Struct
20+
from google.protobuf.timestamp_pb2 import Timestamp
1821
from grpc_reflection.v1alpha import reflection
1922

23+
import sglang
2024
from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
2125
from sglang.srt.entrypoints.grpc_request_manager import GrpcRequestManager
2226
from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
@@ -173,11 +177,13 @@ def __init__(
173177
request_manager: GrpcRequestManager,
174178
server_args: ServerArgs,
175179
model_info: Dict,
180+
scheduler_info: Dict,
176181
):
177182
"""Initialize the standalone gRPC service."""
178183
self.request_manager = request_manager
179184
self.server_args = server_args
180185
self.model_info = model_info
186+
self.scheduler_info = scheduler_info
181187
self.start_time = time.time()
182188

183189
# Start the request manager's event loop using auto_create_handle_loop
@@ -396,6 +402,89 @@ async def Abort(
396402
message=str(e),
397403
)
398404

405+
async def GetModelInfo(
    self,
    request: sglang_scheduler_pb2.GetModelInfoRequest,
    context: grpc.aio.ServicerContext,
) -> sglang_scheduler_pb2.GetModelInfoResponse:
    """Describe the served model.

    The response combines values from ``self.server_args`` (paths, served
    name, weight version, preferred sampling params) with the entries of
    ``self.model_info`` (context length, vocabulary size, token ids, ...).
    Optional server arguments that are unset are reported as empty strings.

    Args:
        request: Empty request message; not read.
        context: gRPC call context; not used.

    Returns:
        A populated ``GetModelInfoResponse``.
    """
    logger.info("Model info request received")

    args = self.server_args
    info = self.model_info

    # Prefer the flag stored in scheduler_info; fall back to the server
    # arguments when the scheduler did not provide one.
    generation_flag = self.scheduler_info.get("is_generation")
    if generation_flag is None:
        generation_flag = not args.is_embedding

    response_fields = {
        "model_path": args.model_path,
        "tokenizer_path": args.tokenizer_path or "",
        "is_generation": generation_flag,
        "preferred_sampling_params": args.preferred_sampling_params or "",
        "weight_version": args.weight_version or "",
        "served_model_name": args.served_model_name,
        "max_context_length": info["max_context_length"],
        "vocab_size": info["vocab_size"],
        "supports_vision": info["supports_vision"],
        "model_type": info["model_type"],
        "eos_token_ids": info["eos_token_ids"],
        "pad_token_id": info["pad_token_id"],
        "bos_token_id": info["bos_token_id"],
        "max_req_input_len": info["max_req_input_len"],
    }
    return sglang_scheduler_pb2.GetModelInfoResponse(**response_fields)
435+
436+
async def GetServerInfo(
    self,
    request: sglang_scheduler_pb2.GetServerInfoRequest,
    context: grpc.aio.ServicerContext,
) -> sglang_scheduler_pb2.GetServerInfoResponse:
    """Get server information.

    Reports the server arguments and scheduler info as protobuf ``Struct``
    values, the runtime state returned by
    ``self.request_manager.get_server_info()``, the uptime measured from
    ``self.start_time``, and the installed sglang version.

    Args:
        request: Empty request message; not read.
        context: gRPC call context; not used.

    Returns:
        A populated ``GetServerInfoResponse``.
    """
    logger.info("Server info request received")

    def make_serializable(obj):
        """Recursively coerce ``obj`` into JSON-like values for ``Struct``."""
        if obj is None or isinstance(obj, (str, int, float, bool)):
            return obj
        if isinstance(obj, dict):
            # Struct field names are strings, so coerce the keys as well.
            return {str(k): make_serializable(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple, set, frozenset)):
            return [make_serializable(item) for item in obj]
        # Anything else (paths, enums, custom objects, ...) is reported by
        # its string form rather than failing the whole request.
        return str(obj)

    server_args_struct = Struct()
    server_args_struct.update(
        make_serializable(dataclasses.asdict(self.server_args))
    )

    # scheduler_info goes through the same sanitizer as server_args, so one
    # non-JSON value in it cannot make Struct.update() fail the RPC.
    scheduler_info_struct = Struct()
    scheduler_info_struct.update(make_serializable(self.scheduler_info or {}))

    # Runtime state from the request manager.
    manager_state = self.request_manager.get_server_info()

    # Uptime is measured from servicer construction.
    uptime = time.time() - self.start_time

    # Timestamp carries whole seconds only; the fractional part is dropped.
    start_timestamp = Timestamp()
    start_timestamp.FromSeconds(int(self.start_time))

    return sglang_scheduler_pb2.GetServerInfoResponse(
        server_args=server_args_struct,
        scheduler_info=scheduler_info_struct,
        active_requests=manager_state["active_requests"],
        is_paused=manager_state["paused"],
        last_receive_timestamp=manager_state["last_receive_time"],
        uptime_seconds=uptime,
        sglang_version=sglang.__version__,
        server_type="grpc",
        start_time=start_timestamp,
    )
487+
399488
# Helper methods for request/response conversion
400489

401490
def _convert_generate_request(
@@ -756,6 +845,7 @@ async def serve_grpc(
756845
request_manager=request_manager,
757846
server_args=server_args,
758847
model_info=model_info,
848+
scheduler_info=scheduler_info,
759849
)
760850
sglang_scheduler_pb2_grpc.add_SglangSchedulerServicer_to_server(servicer, server)
761851

python/sglang/srt/grpc/sglang_scheduler.proto

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,12 @@ service SglangScheduler {
2020
// Abort a running request
2121
rpc Abort(AbortRequest) returns (AbortResponse);
2222

23+
// Get model information
24+
rpc GetModelInfo(GetModelInfoRequest) returns (GetModelInfoResponse);
25+
26+
// Get server information
27+
rpc GetServerInfo(GetServerInfoRequest) returns (GetServerInfoResponse);
28+
2329
}
2430

2531
// =====================
@@ -401,3 +407,56 @@ message SetInternalStateResponse {
401407
bool success = 1;
402408
string message = 2;
403409
}
410+
411+
// =====================
412+
// Model and Server Info
413+
// =====================
414+
415+
// Get model information
416+
// Request message for the GetModelInfo RPC; it declares no fields.
message GetModelInfoRequest {}
417+
418+
// Response of the GetModelInfo RPC: a description of the served model.
message GetModelInfoResponse {
  // Values taken from the server arguments. Optional arguments that are
  // unset are sent as empty strings.
  string model_path = 1;
  string tokenizer_path = 2;
  // Scheduler-reported flag; when the scheduler did not report one, the
  // server sends the negation of its is_embedding argument.
  bool is_generation = 3;
  string preferred_sampling_params = 4;  // JSON string or empty
  string weight_version = 5;
  string served_model_name = 6;

  // Values taken from the server's model info.
  int32 max_context_length = 7;
  int32 vocab_size = 8;
  bool supports_vision = 9;
  string model_type = 10;
  repeated int32 eos_token_ids = 11;
  int32 pad_token_id = 12;
  int32 bos_token_id = 13;
  int32 max_req_input_len = 14;
}
434+
435+
// Get server information
436+
// Request message for the GetServerInfo RPC; it declares no fields.
message GetServerInfoRequest {}
437+
438+
// Response of the GetServerInfo RPC: configuration plus runtime state.
message GetServerInfoResponse {
  // Server configuration (as structured data).
  google.protobuf.Struct server_args = 1;

  // Scheduler metrics (from scheduler initialization).
  google.protobuf.Struct scheduler_info = 2;

  // Runtime state as reported by the request manager; uptime is computed by
  // the gRPC servicer from its own start time.
  int32 active_requests = 3;
  bool is_paused = 4;
  double last_receive_timestamp = 5;
  double uptime_seconds = 6;

  // Version info.
  string sglang_version = 7;

  // Server metadata.
  string server_type = 8;  // "grpc"
  google.protobuf.Timestamp start_time = 9;  // whole seconds only

  // Note: internal_states is not provided in gRPC mode.
  // Scheduler-side metrics (memory usage, throughput) require
  // bidirectional communicator infrastructure not available in gRPC.
  // Use HTTP /get_server_info if scheduler internal state is needed.
}

python/sglang/srt/grpc/sglang_scheduler_pb2.py

Lines changed: 11 additions & 3 deletions
Large diffs are not rendered by default.

python/sglang/srt/grpc/sglang_scheduler_pb2.pyi

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,3 +428,65 @@ class SetInternalStateResponse(_message.Message):
428428
success: bool
429429
message: str
430430
def __init__(self, success: bool = ..., message: _Optional[str] = ...) -> None: ...
431+
432+
class GetModelInfoRequest(_message.Message):
    # Typing stub for the empty GetModelInfo request: no fields, so no slots
    # and no constructor arguments.
    __slots__ = ()

    def __init__(self) -> None: ...
435+
436+
class GetModelInfoResponse(_message.Message):
    # Typing stub for the GetModelInfoResponse message.
    __slots__ = (
        "model_path",
        "tokenizer_path",
        "is_generation",
        "preferred_sampling_params",
        "weight_version",
        "served_model_name",
        "max_context_length",
        "vocab_size",
        "supports_vision",
        "model_type",
        "eos_token_ids",
        "pad_token_id",
        "bos_token_id",
        "max_req_input_len",
    )

    # Field-number constants, one per message field.
    MODEL_PATH_FIELD_NUMBER: _ClassVar[int]
    TOKENIZER_PATH_FIELD_NUMBER: _ClassVar[int]
    IS_GENERATION_FIELD_NUMBER: _ClassVar[int]
    PREFERRED_SAMPLING_PARAMS_FIELD_NUMBER: _ClassVar[int]
    WEIGHT_VERSION_FIELD_NUMBER: _ClassVar[int]
    SERVED_MODEL_NAME_FIELD_NUMBER: _ClassVar[int]
    MAX_CONTEXT_LENGTH_FIELD_NUMBER: _ClassVar[int]
    VOCAB_SIZE_FIELD_NUMBER: _ClassVar[int]
    SUPPORTS_VISION_FIELD_NUMBER: _ClassVar[int]
    MODEL_TYPE_FIELD_NUMBER: _ClassVar[int]
    EOS_TOKEN_IDS_FIELD_NUMBER: _ClassVar[int]
    PAD_TOKEN_ID_FIELD_NUMBER: _ClassVar[int]
    BOS_TOKEN_ID_FIELD_NUMBER: _ClassVar[int]
    MAX_REQ_INPUT_LEN_FIELD_NUMBER: _ClassVar[int]

    # Field value types.
    model_path: str
    tokenizer_path: str
    is_generation: bool
    preferred_sampling_params: str
    weight_version: str
    served_model_name: str
    max_context_length: int
    vocab_size: int
    supports_vision: bool
    model_type: str
    eos_token_ids: _containers.RepeatedScalarFieldContainer[int]
    pad_token_id: int
    bos_token_id: int
    max_req_input_len: int

    def __init__(
        self,
        model_path: _Optional[str] = ...,
        tokenizer_path: _Optional[str] = ...,
        is_generation: bool = ...,
        preferred_sampling_params: _Optional[str] = ...,
        weight_version: _Optional[str] = ...,
        served_model_name: _Optional[str] = ...,
        max_context_length: _Optional[int] = ...,
        vocab_size: _Optional[int] = ...,
        supports_vision: bool = ...,
        model_type: _Optional[str] = ...,
        eos_token_ids: _Optional[_Iterable[int]] = ...,
        pad_token_id: _Optional[int] = ...,
        bos_token_id: _Optional[int] = ...,
        max_req_input_len: _Optional[int] = ...,
    ) -> None: ...
467+
468+
class GetServerInfoRequest(_message.Message):
    # Typing stub for the empty GetServerInfo request: no fields, so no slots
    # and no constructor arguments.
    __slots__ = ()

    def __init__(self) -> None: ...
471+
472+
class GetServerInfoResponse(_message.Message):
    # Typing stub for the GetServerInfoResponse message.
    __slots__ = (
        "server_args",
        "scheduler_info",
        "active_requests",
        "is_paused",
        "last_receive_timestamp",
        "uptime_seconds",
        "sglang_version",
        "server_type",
        "start_time",
    )

    # Field-number constants, one per message field.
    SERVER_ARGS_FIELD_NUMBER: _ClassVar[int]
    SCHEDULER_INFO_FIELD_NUMBER: _ClassVar[int]
    ACTIVE_REQUESTS_FIELD_NUMBER: _ClassVar[int]
    IS_PAUSED_FIELD_NUMBER: _ClassVar[int]
    LAST_RECEIVE_TIMESTAMP_FIELD_NUMBER: _ClassVar[int]
    UPTIME_SECONDS_FIELD_NUMBER: _ClassVar[int]
    SGLANG_VERSION_FIELD_NUMBER: _ClassVar[int]
    SERVER_TYPE_FIELD_NUMBER: _ClassVar[int]
    START_TIME_FIELD_NUMBER: _ClassVar[int]

    # Field value types.
    server_args: _struct_pb2.Struct
    scheduler_info: _struct_pb2.Struct
    active_requests: int
    is_paused: bool
    last_receive_timestamp: float
    uptime_seconds: float
    sglang_version: str
    server_type: str
    start_time: _timestamp_pb2.Timestamp

    def __init__(
        self,
        server_args: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ...,
        scheduler_info: _Optional[_Union[_struct_pb2.Struct, _Mapping]] = ...,
        active_requests: _Optional[int] = ...,
        is_paused: bool = ...,
        last_receive_timestamp: _Optional[float] = ...,
        uptime_seconds: _Optional[float] = ...,
        sglang_version: _Optional[str] = ...,
        server_type: _Optional[str] = ...,
        start_time: _Optional[_Union[datetime.datetime, _timestamp_pb2.Timestamp, _Mapping]] = ...,
    ) -> None: ...

python/sglang/srt/grpc/sglang_scheduler_pb2_grpc.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,16 @@ def __init__(self, channel):
5959
request_serializer=sglang__scheduler__pb2.AbortRequest.SerializeToString,
6060
response_deserializer=sglang__scheduler__pb2.AbortResponse.FromString,
6161
_registered_method=True)
62+
self.GetModelInfo = channel.unary_unary(
63+
'/sglang.grpc.scheduler.SglangScheduler/GetModelInfo',
64+
request_serializer=sglang__scheduler__pb2.GetModelInfoRequest.SerializeToString,
65+
response_deserializer=sglang__scheduler__pb2.GetModelInfoResponse.FromString,
66+
_registered_method=True)
67+
self.GetServerInfo = channel.unary_unary(
68+
'/sglang.grpc.scheduler.SglangScheduler/GetServerInfo',
69+
request_serializer=sglang__scheduler__pb2.GetServerInfoRequest.SerializeToString,
70+
response_deserializer=sglang__scheduler__pb2.GetServerInfoResponse.FromString,
71+
_registered_method=True)
6272

6373

6474
class SglangSchedulerServicer(object):
@@ -94,6 +104,20 @@ def Abort(self, request, context):
94104
context.set_details('Method not implemented!')
95105
raise NotImplementedError('Method not implemented!')
96106

107+
def GetModelInfo(self, request, context):
    """Get model information.

    Default handler: marks the call as UNIMPLEMENTED on ``context`` and
    raises ``NotImplementedError``.
    """
    not_implemented = 'Method not implemented!'
    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
    context.set_details(not_implemented)
    raise NotImplementedError(not_implemented)
113+
114+
def GetServerInfo(self, request, context):
    """Get server information.

    Default handler: marks the call as UNIMPLEMENTED on ``context`` and
    raises ``NotImplementedError``.
    """
    not_implemented = 'Method not implemented!'
    context.set_code(grpc.StatusCode.UNIMPLEMENTED)
    context.set_details(not_implemented)
    raise NotImplementedError(not_implemented)
120+
97121

98122
def add_SglangSchedulerServicer_to_server(servicer, server):
99123
rpc_method_handlers = {
@@ -117,6 +141,16 @@ def add_SglangSchedulerServicer_to_server(servicer, server):
117141
request_deserializer=sglang__scheduler__pb2.AbortRequest.FromString,
118142
response_serializer=sglang__scheduler__pb2.AbortResponse.SerializeToString,
119143
),
144+
'GetModelInfo': grpc.unary_unary_rpc_method_handler(
145+
servicer.GetModelInfo,
146+
request_deserializer=sglang__scheduler__pb2.GetModelInfoRequest.FromString,
147+
response_serializer=sglang__scheduler__pb2.GetModelInfoResponse.SerializeToString,
148+
),
149+
'GetServerInfo': grpc.unary_unary_rpc_method_handler(
150+
servicer.GetServerInfo,
151+
request_deserializer=sglang__scheduler__pb2.GetServerInfoRequest.FromString,
152+
response_serializer=sglang__scheduler__pb2.GetServerInfoResponse.SerializeToString,
153+
),
120154
}
121155
generic_handler = grpc.method_handlers_generic_handler(
122156
'sglang.grpc.scheduler.SglangScheduler', rpc_method_handlers)
@@ -237,3 +271,57 @@ def Abort(request,
237271
timeout,
238272
metadata,
239273
_registered_method=True)
274+
275+
@staticmethod
def GetModelInfo(request,
        target,
        options=(),
        channel_credentials=None,
        call_credentials=None,
        insecure=False,
        compression=None,
        wait_for_ready=None,
        timeout=None,
        metadata=None):
    """Call the GetModelInfo RPC on ``target`` via ``grpc.experimental.unary_unary``.

    All arguments are forwarded unchanged to that function.
    """
    method_path = '/sglang.grpc.scheduler.SglangScheduler/GetModelInfo'
    serialize_request = sglang__scheduler__pb2.GetModelInfoRequest.SerializeToString
    parse_response = sglang__scheduler__pb2.GetModelInfoResponse.FromString
    # Positional order matters: `insecure` is passed before `call_credentials`
    # even though the signature above lists them the other way round.
    return grpc.experimental.unary_unary(
        request,
        target,
        method_path,
        serialize_request,
        parse_response,
        options,
        channel_credentials,
        insecure,
        call_credentials,
        compression,
        wait_for_ready,
        timeout,
        metadata,
        _registered_method=True,
    )
301+
302+
@staticmethod
def GetServerInfo(request,
        target,
        options=(),
        channel_credentials=None,
        call_credentials=None,
        insecure=False,
        compression=None,
        wait_for_ready=None,
        timeout=None,
        metadata=None):
    """Call the GetServerInfo RPC on ``target`` via ``grpc.experimental.unary_unary``.

    All arguments are forwarded unchanged to that function.
    """
    method_path = '/sglang.grpc.scheduler.SglangScheduler/GetServerInfo'
    serialize_request = sglang__scheduler__pb2.GetServerInfoRequest.SerializeToString
    parse_response = sglang__scheduler__pb2.GetServerInfoResponse.FromString
    # Positional order matters: `insecure` is passed before `call_credentials`
    # even though the signature above lists them the other way round.
    return grpc.experimental.unary_unary(
        request,
        target,
        method_path,
        serialize_request,
        parse_response,
        options,
        channel_credentials,
        insecure,
        call_credentials,
        compression,
        wait_for_ready,
        timeout,
        metadata,
        _registered_method=True,
    )

0 commit comments

Comments
 (0)