Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 0 additions & 19 deletions sgl-model-gateway/src/policies/bucket.rs
Original file line number Diff line number Diff line change
Expand Up @@ -312,25 +312,6 @@ impl LoadBalancingPolicy for BucketPolicy {
workers.iter().position(|w| w.url() == prefill_url)
}

/// Select a (prefill, decode) worker pair for PD routing.
///
/// The prefill worker is chosen via `select_worker`; the decode worker is
/// picked uniformly at random among the healthy decode workers.
///
/// Returns `None` when no prefill worker can be selected or when there are
/// no healthy decode workers. Both returned values are indices into their
/// respective input slices (`prefill_workers`, `decode_workers`).
fn select_worker_pair(
    &self,
    prefill_workers: &[Arc<dyn Worker>],
    decode_workers: &[Arc<dyn Worker>],
    request_text: Option<&str>,
) -> Option<(usize, usize)> {
    let prefill_idx = self.select_worker(prefill_workers, request_text)?;

    let healthy_decode = get_healthy_worker_indices(decode_workers);
    if healthy_decode.is_empty() {
        return None;
    }

    // Draw a random *position* within the healthy list, then translate it
    // back to the worker's real index in `decode_workers`. Returning the raw
    // position would address the wrong worker whenever an unhealthy worker
    // precedes it in the slice.
    let mut rng = rand::rng();
    let pick = rng.random_range(0..healthy_decode.len());
    let decode_idx = healthy_decode[pick];

    Some((prefill_idx, decode_idx))
}

/// Returns the static identifier string for this policy.
fn name(&self) -> &'static str {
    const POLICY_NAME: &str = "bucket";
    POLICY_NAME
}
Expand Down
31 changes: 0 additions & 31 deletions sgl-model-gateway/src/policies/cache_aware.rs
Original file line number Diff line number Diff line change
Expand Up @@ -347,37 +347,6 @@ impl LoadBalancingPolicy for CacheAwarePolicy {
}
}

/// Select a (prefill, decode) worker pair when a single policy serves both pools.
///
/// DEPRECATED: not used when separate prefill/decode policies are configured;
/// the PD router then selects each side with its own policy. Kept for
/// backward compatibility with single-policy setups.
///
/// Strategy in single-policy PD mode:
/// - Prefill: cache-aware routing (via `select_worker`) for better cache utilization.
/// - Decode: least-load routing for better load distribution.
///
/// Returns `None` if no prefill worker can be selected or if there is no
/// healthy decode worker.
fn select_worker_pair(
    &self,
    prefill_workers: &[Arc<dyn Worker>],
    decode_workers: &[Arc<dyn Worker>],
    request_text: Option<&str>,
) -> Option<(usize, usize)> {
    // Prefill side: delegate to the cache-aware selection.
    let prefill_choice = self.select_worker(prefill_workers, request_text)?;

    // Decode side: among healthy workers, take the one reporting the lowest
    // load. An empty candidate list makes `min_by_key` yield `None`, which
    // `?` propagates to the caller.
    let candidates = get_healthy_worker_indices(decode_workers);
    let least_loaded = candidates
        .iter()
        .copied()
        .min_by_key(|&slot| decode_workers[slot].load())?;

    Some((prefill_choice, least_loaded))
}

fn on_request_complete(&self, worker_url: &str, success: bool) {
// Could track success rates per worker for more intelligent routing
if !success {
Expand Down
16 changes: 0 additions & 16 deletions sgl-model-gateway/src/policies/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,22 +39,6 @@ pub trait LoadBalancingPolicy: Send + Sync + Debug {
request_text: Option<&str>,
) -> Option<usize>;

/// Pick one prefill worker and one decode worker for PD routing.
///
/// The result is `(prefill_index, decode_index)`, each an index into its own
/// input slice. This default implementation calls `select_worker` on the
/// prefill pool first and, only if that succeeds, on the decode pool.
/// Returns `None` if either selection yields `None`.
fn select_worker_pair(
    &self,
    prefill_workers: &[Arc<dyn Worker>],
    decode_workers: &[Arc<dyn Worker>],
    request_text: Option<&str>,
) -> Option<(usize, usize)> {
    // Chain the two independent selections; the decode selection is skipped
    // entirely when no prefill worker is available.
    self.select_worker(prefill_workers, request_text)
        .and_then(|prefill_choice| {
            self.select_worker(decode_workers, request_text)
                .map(|decode_choice| (prefill_choice, decode_choice))
        })
}

/// Update policy state after request completion
///
/// This is called when a request completes (successfully or not) to allow
Expand Down
Loading