diff --git a/sgl-model-gateway/src/policies/bucket.rs b/sgl-model-gateway/src/policies/bucket.rs index ea4a3a440fda..e16daac70943 100644 --- a/sgl-model-gateway/src/policies/bucket.rs +++ b/sgl-model-gateway/src/policies/bucket.rs @@ -312,25 +312,6 @@ impl LoadBalancingPolicy for BucketPolicy { workers.iter().position(|w| w.url() == prefill_url) } - fn select_worker_pair( - &self, - prefill_workers: &[Arc], - decode_workers: &[Arc], - request_text: Option<&str>, - ) -> Option<(usize, usize)> { - let prefill_idx = self.select_worker(prefill_workers, request_text)?; - - let healthy_decode = get_healthy_worker_indices(decode_workers); - if healthy_decode.is_empty() { - return None; - } - - let mut rng = rand::rng(); - let decode_idx = rng.random_range(0..healthy_decode.len()); - - Some((prefill_idx, decode_idx)) - } - fn name(&self) -> &'static str { "bucket" } diff --git a/sgl-model-gateway/src/policies/cache_aware.rs b/sgl-model-gateway/src/policies/cache_aware.rs index b5c8e1d14919..781f413e7ba8 100644 --- a/sgl-model-gateway/src/policies/cache_aware.rs +++ b/sgl-model-gateway/src/policies/cache_aware.rs @@ -347,37 +347,6 @@ impl LoadBalancingPolicy for CacheAwarePolicy { } } - fn select_worker_pair( - &self, - prefill_workers: &[Arc], - decode_workers: &[Arc], - request_text: Option<&str>, - ) -> Option<(usize, usize)> { - // DEPRECATED: This method is no longer used when separate policies are configured. - // The PD router now uses separate policies for prefill and decode selection. - // This implementation remains for backward compatibility when a single policy is used. - - // In PD mode with single policy: - // - Prefill: Use cache-aware routing for better cache utilization - // - Decode: Use least-load routing for better load distribution - - // Select prefill worker using cache-aware logic - let prefill_idx = self.select_worker(prefill_workers, request_text)?; - - // Select decode worker using least-load logic - let healthy_decode = get_healthy_worker_indices(decode_workers); - if healthy_decode.is_empty() { - return None; - } - - let decode_idx = healthy_decode - .iter() - .min_by_key(|&&idx| decode_workers[idx].load()) - .copied()?; - - Some((prefill_idx, decode_idx)) - } - fn on_request_complete(&self, worker_url: &str, success: bool) { // Could track success rates per worker for more intelligent routing if !success { diff --git a/sgl-model-gateway/src/policies/mod.rs b/sgl-model-gateway/src/policies/mod.rs index 7eca8609775b..8edf7f6a4e3e 100644 --- a/sgl-model-gateway/src/policies/mod.rs +++ b/sgl-model-gateway/src/policies/mod.rs @@ -39,22 +39,6 @@ pub trait LoadBalancingPolicy: Send + Sync + Debug { request_text: Option<&str>, ) -> Option; - /// Select a pair of workers (prefill and decode) for PD routing - /// - /// Returns indices of (prefill_worker, decode_worker) from their respective arrays. - /// Default implementation uses select_worker for each array independently. - fn select_worker_pair( - &self, - prefill_workers: &[Arc], - decode_workers: &[Arc], - request_text: Option<&str>, - ) -> Option<(usize, usize)> { - // Default implementation: independently select from each pool - let prefill_idx = self.select_worker(prefill_workers, request_text)?; - let decode_idx = self.select_worker(decode_workers, request_text)?; - Some((prefill_idx, decode_idx)) - } - /// Update policy state after request completion /// /// This is called when a request completes (successfully or not) to allow