Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/advanced_features/server_arguments.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ Please consult the documentation below and [server_args.py](https://github.com/s
| `--device` | The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified. | None |
| `--tp-size` | The tensor parallelism size. | 1 |
| `--pp-size` | The pipeline parallelism size. | 1 |
| `--max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | None |
| `--pp-max-micro-batch-size` | The maximum micro batch size in pipeline parallelism. | None |
| `--stream-interval` | The interval (or buffer size) for streaming in terms of the token length. A smaller value makes streaming smoother, while a larger value makes the throughput higher. | 1 |
| `--stream-output` | Whether to output as a sequence of disjoint segments. | False |
| `--random-seed` | The random seed. | None |
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/entrypoints/http_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ async def get_load():


# example usage:
# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
@app.api_route("/set_internal_state", methods=["POST", "PUT"])
async def set_internal_state(obj: SetInternalStateReq, request: Request):
res = await _global_state.tokenizer_manager.set_internal_state(obj)
Expand Down
2 changes: 1 addition & 1 deletion python/sglang/srt/managers/schedule_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@
"ep_num_redundant_experts",
"enable_nan_detection",
"flashinfer_mla_disable_ragged",
"max_micro_batch_size",
"pp_max_micro_batch_size",
"disable_shared_experts_fusion",
"sampling_backend",
"speculative_accept_threshold_single",
Expand Down
10 changes: 5 additions & 5 deletions python/sglang/srt/managers/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,8 +464,8 @@ def __init__(
_,
_,
) = self.tp_worker.get_worker_info()
if global_server_args_dict["max_micro_batch_size"] is None:
global_server_args_dict["max_micro_batch_size"] = max(
if global_server_args_dict["pp_max_micro_batch_size"] is None:
global_server_args_dict["pp_max_micro_batch_size"] = max(
self.max_running_requests // server_args.pp_size, 1
)

Expand Down Expand Up @@ -1802,7 +1802,7 @@ def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
return ret

def get_num_allocatable_reqs(self, running_bs):
res = global_server_args_dict["max_micro_batch_size"] - running_bs
res = global_server_args_dict["pp_max_micro_batch_size"] - running_bs
if self.pp_size > 1:
res = min(res, self.req_to_token_pool.available_size())
return res
Expand Down Expand Up @@ -2510,7 +2510,7 @@ def set_internal_state(self, recv_req: SetInternalStateReq):
server_args_dict = recv_req.server_args
args_allow_update = set(
[
"max_micro_batch_size",
"pp_max_micro_batch_size",
"speculative_accept_threshold_single",
"speculative_accept_threshold_acc",
]
Expand All @@ -2521,7 +2521,7 @@ def set_internal_state(self, recv_req: SetInternalStateReq):
logging.warning(f"Updating {k} is not supported.")
if_success = False
break
elif k == "max_micro_batch_size" and (
elif k == "pp_max_micro_batch_size" and (
v > self.max_running_requests // self.pp_size or v < 1
):
logging.warning(
Expand Down
6 changes: 3 additions & 3 deletions python/sglang/srt/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ class ServerArgs:
device: Optional[str] = None
tp_size: int = 1
pp_size: int = 1
max_micro_batch_size: Optional[int] = None
pp_max_micro_batch_size: Optional[int] = None
stream_interval: int = 1
stream_output: bool = False
random_seed: Optional[int] = None
Expand Down Expand Up @@ -1599,9 +1599,9 @@ def add_cli_args(parser: argparse.ArgumentParser):
help="The pipeline parallelism size.",
)
parser.add_argument(
"--max-micro-batch-size",
"--pp-max-micro-batch-size",
type=int,
default=ServerArgs.max_micro_batch_size,
default=ServerArgs.pp_max_micro_batch_size,
help="The maximum micro batch size in pipeline parallelism.",
)
parser.add_argument(
Expand Down
Loading