sgl-project · slin1237 · Dec 8, 2025 · Dec 8, 2025 · Dec 8, 2025
@@ -312,25 +312,6 @@ impl LoadBalancingPolicy for BucketPolicy {
         workers.iter().position(|w| w.url() == prefill_url)
     }
 
-    /// Select prefill via `select_worker` and decode at random among healthy workers.
-    /// Returns indices into the respective slices, or `None` if either side has no pick.
-    fn select_worker_pair(
-        &self,
-        prefill_workers: &[Arc<dyn Worker>],
-        decode_workers: &[Arc<dyn Worker>],
-        request_text: Option<&str>,
-    ) -> Option<(usize, usize)> {
-        let prefill_idx = self.select_worker(prefill_workers, request_text)?;
-        let healthy_decode = get_healthy_worker_indices(decode_workers);
-        if healthy_decode.is_empty() {
-            return None; // also keeps the sampled range below non-empty
-        }
-
-        // The draw is a position within `healthy_decode`; map it back to a worker index.
-        let pick = rand::rng().random_range(0..healthy_decode.len());
-        Some((prefill_idx, healthy_decode[pick]))
-    }
-
     /// Returns this policy's name, the static string `"bucket"`.
     fn name(&self) -> &'static str {
         "bucket"
     }

@@ -347,37 +347,6 @@ impl LoadBalancingPolicy for CacheAwarePolicy {
         }
     }
 
-    /// Select a (prefill, decode) worker pair when a single policy serves both pools.
-    ///
-    /// DEPRECATED: This method is no longer used when separate policies are configured.
-    /// The PD router now uses separate policies for prefill and decode selection.
-    /// This implementation remains for backward compatibility when a single policy is used.
-    ///
-    /// Returns indices into `prefill_workers` and `decode_workers` respectively, or `None`
-    /// when prefill selection yields nothing or no healthy decode index is available.
-    /// Among equally loaded decode candidates, the first one in the healthy list wins.
-    fn select_worker_pair(
-        &self,
-        prefill_workers: &[Arc<dyn Worker>],
-        decode_workers: &[Arc<dyn Worker>],
-        request_text: Option<&str>,
-    ) -> Option<(usize, usize)> {
-        // Prefill: cache-aware routing for better cache utilization.
-        let prefill_idx = self.select_worker(prefill_workers, request_text)?;
-
-        // Decode: least-load routing for better load distribution. An empty healthy
-        // list makes `min_by_key` return `None`, which `?` hands back to the caller.
-        let candidates = get_healthy_worker_indices(decode_workers);
-        let decode_idx = candidates
-            .iter()
-            .copied()
-            .min_by_key(|&idx| decode_workers[idx].load())?;
-
-        Some((prefill_idx, decode_idx))
-    }
-
     fn on_request_complete(&self, worker_url: &str, success: bool) {
         // Could track success rates per worker for more intelligent routing
         if !success {

@@ -39,22 +39,6 @@ pub trait LoadBalancingPolicy: Send + Sync + Debug {
         request_text: Option<&str>,
     ) -> Option<usize>;
 
-    /// Choose one prefill worker and one decode worker for PD routing.
-    ///
-    /// Yields `(prefill_idx, decode_idx)`, each an index into its own slice. The default
-    /// runs `select_worker` on the prefill pool first, then on the decode pool, passing the
-    /// same `request_text`; if either call yields `None`, so does this method.
-    fn select_worker_pair(
-        &self,
-        prefill_workers: &[Arc<dyn Worker>],
-        decode_workers: &[Arc<dyn Worker>],
-        request_text: Option<&str>,
-    ) -> Option<(usize, usize)> {
-        // The closure keeps decode selection lazy: it runs only once prefill has succeeded.
-        self.select_worker(prefill_workers, request_text).and_then(|prefill_idx| {
-            self.select_worker(decode_workers, request_text)
-                .map(|decode_idx| (prefill_idx, decode_idx))
-        })
-    }
-
     /// Update policy state after request completion
     ///
     /// This is called when a request completes (successfully or not) to allow