sgl-project · zhyncs · Jun 23, 2025 · Jun 19, 2025
diff --git a/sgl-router/Cargo.toml b/sgl-router/Cargo.toml
@@ -35,6 +35,7 @@ metrics = "0.24.2"
 metrics-exporter-prometheus = "0.17.0"
 # Added for request tracing
 uuid = { version = "1.10", features = ["v4", "serde"] }
+thiserror = "2.0.12"
 [profile.release]
 lto = "thin"
 codegen-units = 1
diff --git a/sgl-router/README.md b/sgl-router/README.md
@@ -95,38 +95,217 @@ python -m sglang_router.launch_router \
 
 ### Kubernetes Service Discovery
 
-SGL Router supports automatic service discovery for worker nodes in Kubernetes environments. When enabled, the router will automatically:
+SGL Router supports automatic service discovery for worker nodes in Kubernetes environments. This feature works with both regular (single-server) routing and PD (Prefill-Decode) routing modes. When enabled, the router will automatically:
 
 - Discover and add worker pods with matching labels
 - Remove unhealthy or deleted worker pods
 - Dynamically adjust the worker pool based on pod health and availability
+- For PD mode: distinguish between prefill and decode servers based on labels
 
-#### Command Line Usage
+#### Regular Mode Service Discovery
+
+For traditional single-server routing:
 
 ```bash
 python -m sglang_router.launch_router \
     --service-discovery \
     --selector app=sglang-worker role=inference \
-    --service-discovery-port 8000 \
     --service-discovery-namespace default
 ```
 
+#### PD Mode Service Discovery
+
+For PD (Prefill-Decode) disaggregated routing, service discovery can automatically discover and classify pods as either prefill or decode servers based on their labels:
+
+```bash
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --policy cache_aware \
+    --service-discovery \
+    --prefill-selector app=sglang component=prefill \
+    --decode-selector app=sglang component=decode \
+    --service-discovery-namespace sglang-system
+```
+
+You can also specify initial prefill and decode servers and let service discovery add more:
+
+```bash
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --policy cache_aware \
+    --prefill http://prefill-1:8000 8001 \
+    --decode http://decode-1:8000 \
+    --service-discovery \
+    --prefill-selector app=sglang component=prefill \
+    --decode-selector app=sglang component=decode \
+    --service-discovery-namespace sglang-system
+```
+
+#### Kubernetes Pod Configuration for PD Mode
+
+When using PD service discovery, your Kubernetes pods need specific labels to be classified as prefill or decode servers:
+
+**Prefill Server Pod:**
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: sglang-prefill-1
+  labels:
+    app: sglang
+    component: prefill
+  annotations:
+    sglang.ai/bootstrap-port: "9001"  # Optional: Bootstrap port for Mooncake prefill coordination
+spec:
+  containers:
+  - name: sglang
+    image: lmsys/sglang:latest
+    ports:
+    - containerPort: 8000  # Main API port
+    - containerPort: 9001  # Optional: Bootstrap coordination port
+    # ... rest of configuration
+```
+
+**Decode Server Pod:**
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: sglang-decode-1
+  labels:
+    app: sglang
+    component: decode
+spec:
+  containers:
+  - name: sglang
+    image: lmsys/sglang:latest
+    ports:
+    - containerPort: 8000  # Main API port
+    # ... rest of configuration
+```
+
+**Key Requirements:**
+- Prefill pods must have labels matching your `--prefill-selector`
+- Decode pods must have labels matching your `--decode-selector`
+- Prefill pods can optionally declare a bootstrap port via the `sglang.ai/bootstrap-port` annotation (the bootstrap port defaults to None if the annotation is not specified)
+
 #### Service Discovery Arguments
 
+**General Arguments:**
 - `--service-discovery`: Enable Kubernetes service discovery feature
-- `--selector`: One or more label key-value pairs for pod selection (format: key1=value1 key2=value2)
-- `--service-discovery-port`: Port to use when generating worker URLs (default: 80)
+- `--service-discovery-port`: Port to use when generating worker URLs (default: 80)
 - `--service-discovery-namespace`: Optional. Kubernetes namespace to watch for pods. If not provided, watches all namespaces (requires cluster-wide permissions)
+- `--selector`: One or more label key-value pairs for pod selection in regular mode (format: key1=value1 key2=value2)
+
+**PD Mode Arguments:**
+- `--pd-disaggregation`: Enable PD (Prefill-Decode) disaggregated mode
+- `--prefill`: Specify initial prefill server URL and bootstrap port (format: URL BOOTSTRAP_PORT, can be used multiple times)
+- `--decode`: Specify initial decode server URL (can be used multiple times)
+- `--prefill-selector`: Label selector for prefill server pods in PD mode (format: key1=value1 key2=value2)
+- `--decode-selector`: Label selector for decode server pods in PD mode (format: key1=value1 key2=value2)
+- `--policy`: Routing policy (`cache_aware`, `random`, or `power_of_two`; note that `power_of_two` only works in PD mode)
+
+**Notes:**
+- The bootstrap port is always read from the `sglang.ai/bootstrap-port` pod annotation (the convention used for Mooncake deployments); this annotation key is not configurable via the CLI
+- Advanced cache tuning parameters use sensible defaults and are not exposed via CLI
 
 #### RBAC Requirements
 
 When using service discovery, you must configure proper Kubernetes RBAC permissions:
 
-- **If using namespace-scoped discovery** (with `--service-discovery-namespace`):
-  Set up a ServiceAccount, Role, and RoleBinding
+**Namespace-scoped (recommended):**
+```yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: sglang-router
+  namespace: sglang-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  namespace: sglang-system
+  name: sglang-router
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: sglang-router
+  namespace: sglang-system
+subjects:
+- kind: ServiceAccount
+  name: sglang-router
+  namespace: sglang-system
+roleRef:
+  kind: Role
+  name: sglang-router
+  apiGroup: rbac.authorization.k8s.io
+```
+
+**Cluster-wide (if watching all namespaces):**
+```yaml
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: sglang-router
+  namespace: sglang-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: sglang-router
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: sglang-router
+subjects:
+- kind: ServiceAccount
+  name: sglang-router
+  namespace: sglang-system
+roleRef:
+  kind: ClusterRole
+  name: sglang-router
+  apiGroup: rbac.authorization.k8s.io
+```
+
+#### Complete Example: PD Mode with Service Discovery
+
+Here's a complete example of running SGLang Router with PD mode and service discovery:
+
+```bash
+# Start the router with PD mode and automatic prefill/decode discovery
+python -m sglang_router.launch_router \
+    --pd-disaggregation \
+    --policy cache_aware \
+    --service-discovery \
+    --prefill-selector app=sglang component=prefill environment=production \
+    --decode-selector app=sglang component=decode environment=production \
+    --service-discovery-namespace production \
+    --host 0.0.0.0 \
+    --port 8080 \
+    --prometheus-host 0.0.0.0 \
+    --prometheus-port 9090
+```
+
+This setup will:
+1. Enable PD (Prefill-Decode) disaggregated routing mode with automatic pod classification
+2. Watch for pods in the `production` namespace
+3. Automatically add prefill servers with labels `app=sglang`, `component=prefill`, `environment=production`
+4. Automatically add decode servers with labels `app=sglang`, `component=decode`, `environment=production`
+5. Extract bootstrap ports from the `sglang.ai/bootstrap-port` annotation on prefill pods
+6. Use cache-aware load balancing for optimal performance
+7. Expose the router API on port 8080 and metrics on port 9090
 
-- **If watching all namespaces** (without specifying namespace):
-  Set up a ServiceAccount, ClusterRole, and ClusterRoleBinding with permissions to list/watch pods at the cluster level
+**Note:** In PD mode with service discovery, pods MUST match either the prefill or decode selector to be added. Pods that don't match either selector are ignored.
 
 ### Troubleshooting
 

diff --git a/sgl-router/py_src/sglang_router/launch_router.py b/sgl-router/py_src/sglang_router/launch_router.py
@@ -32,7 +32,7 @@ class RouterArgs:
     port: int = 30000
 
     # PD-specific configuration
-    pd_disaggregated: bool = False  # Enable PD disaggregated mode
+    pd_disaggregation: bool = False  # Enable PD disaggregated mode
     prefill_urls: List[tuple] = dataclasses.field(
         default_factory=list
     )  # List of (url, bootstrap_port)
@@ -55,6 +55,10 @@ class RouterArgs:
     selector: Dict[str, str] = dataclasses.field(default_factory=dict)
     service_discovery_port: int = 80
     service_discovery_namespace: Optional[str] = None
+    # PD service discovery configuration
+    prefill_selector: Dict[str, str] = dataclasses.field(default_factory=dict)
+    decode_selector: Dict[str, str] = dataclasses.field(default_factory=dict)
+    bootstrap_port_annotation: str = "sglang.ai/bootstrap-port"
     # Prometheus configuration
     prometheus_port: Optional[int] = None
     prometheus_host: Optional[str] = None
@@ -108,7 +112,7 @@ def add_cli_args(
 
         # PD-specific arguments
         parser.add_argument(
-            f"--{prefix}pd-disaggregated",
+            f"--{prefix}pd-disaggregation",
             action="store_true",
             help="Enable PD (Prefill-Decode) disaggregated mode",
         )
@@ -207,6 +211,18 @@ def add_cli_args(
             type=str,
             help="Kubernetes namespace to watch for pods. If not provided, watches all namespaces (requires cluster-wide permissions)",
         )
+        parser.add_argument(
+            f"--{prefix}prefill-selector",
+            type=str,
+            nargs="+",
+            help="Label selector for prefill server pods in PD mode (format: key1=value1 key2=value2)",
+        )
+        parser.add_argument(
+            f"--{prefix}decode-selector",
+            type=str,
+            nargs="+",
+            help="Label selector for decode server pods in PD mode (format: key1=value1 key2=value2)",
+        )
         # Prometheus configuration
         parser.add_argument(
             f"--{prefix}prometheus-port",
@@ -243,7 +259,7 @@ def from_cli_args(
             worker_urls=worker_urls,
             host=args.host,
             port=args.port,
-            pd_disaggregated=getattr(args, f"{prefix}pd_disaggregated", False),
+            pd_disaggregation=getattr(args, f"{prefix}pd_disaggregation", False),
             prefill_urls=prefill_urls,
             decode_urls=decode_urls,
             policy=getattr(args, f"{prefix}policy"),
@@ -267,6 +283,13 @@ def from_cli_args(
             service_discovery_namespace=getattr(
                 args, f"{prefix}service_discovery_namespace", None
             ),
+            prefill_selector=cls._parse_selector(
+                getattr(args, f"{prefix}prefill_selector", None)
+            ),
+            decode_selector=cls._parse_selector(
+                getattr(args, f"{prefix}decode_selector", None)
+            ),
+            bootstrap_port_annotation="sglang.ai/bootstrap-port",  # Mooncake-specific annotation
             prometheus_port=getattr(args, f"{prefix}prometheus_port", None),
             prometheus_host=getattr(args, f"{prefix}prometheus_host", None),
         )
@@ -355,17 +378,20 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
             router_args = args
 
         # Validate configuration based on mode
-        if router_args.pd_disaggregated:
-            # Validate PD configuration
-            if not router_args.prefill_urls:
-                raise ValueError("PD disaggregated mode requires --prefill")
-            if not router_args.decode_urls:
-                raise ValueError("PD disaggregated mode requires --decode")
+        if router_args.pd_disaggregation:
+            # Validate PD configuration - skip URL requirements if using service discovery
+            if not router_args.service_discovery:
+                if not router_args.prefill_urls:
+                    raise ValueError("PD disaggregation mode requires --prefill")
+                if not router_args.decode_urls:
+                    raise ValueError("PD disaggregation mode requires --decode")
 
         # Create router with unified constructor
         router = Router(
             worker_urls=(
-                router_args.worker_urls if not router_args.pd_disaggregated else []
+                []
+                if router_args.service_discovery or router_args.pd_disaggregation
+                else router_args.worker_urls
             ),
             host=router_args.host,
             port=router_args.port,
@@ -384,14 +410,16 @@ def launch_router(args: argparse.Namespace) -> Optional[Router]:
             selector=router_args.selector,
             service_discovery_port=router_args.service_discovery_port,
             service_discovery_namespace=router_args.service_discovery_namespace,
+            prefill_selector=router_args.prefill_selector,
+            decode_selector=router_args.decode_selector,
             prometheus_port=router_args.prometheus_port,
             prometheus_host=router_args.prometheus_host,
-            pd_disaggregated=router_args.pd_disaggregated,
+            pd_disaggregation=router_args.pd_disaggregation,
             prefill_urls=(
-                router_args.prefill_urls if router_args.pd_disaggregated else None
+                router_args.prefill_urls if router_args.pd_disaggregation else None
             ),
             decode_urls=(
-                router_args.decode_urls if router_args.pd_disaggregated else None
+                router_args.decode_urls if router_args.pd_disaggregation else None
             ),
         )
 
@@ -425,7 +453,7 @@ def parse_router_args(args: List[str]) -> RouterArgs:
   python -m sglang_router.launch_router --worker-urls http://worker1:8000 http://worker2:8000
 
   # PD disaggregated mode
-  python -m sglang_router.launch_router --pd-disaggregated \\
+  python -m sglang_router.launch_router --pd-disaggregation \\
     --prefill http://prefill1:8000 9000 --prefill http://prefill2:8000 none \\
     --decode http://decode1:8001 --decode http://decode2:8001 \\
     --policy cache_aware