Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sgl-model-gateway/src/core/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,7 @@ impl Worker for BasicWorker {

/// Record this worker's health state and mirror it to the metrics layer.
///
/// The flag is written with `Ordering::Release` first; the same value is
/// then reported through `Metrics::set_worker_health`, keyed by this
/// worker's URL.
fn set_healthy(&self, healthy: bool) {
    let health_flag = &self.healthy;
    health_flag.store(healthy, Ordering::Release);

    // Report after the store so the exported value matches the flag just written.
    let worker_url = self.url();
    Metrics::set_worker_health(worker_url, healthy);
}

async fn check_health_async(&self) -> WorkerResult<()> {
Expand Down
7 changes: 5 additions & 2 deletions sgl-model-gateway/src/core/worker_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use super::{
WorkerType,
},
};
use crate::routers::grpc::client::GrpcClient;
use crate::{observability::metrics::Metrics, routers::grpc::client::GrpcClient};

/// Builder for creating BasicWorker instances with fluent API
pub struct BasicWorkerBuilder {
Expand Down Expand Up @@ -187,11 +187,14 @@ impl BasicWorkerBuilder {
None => OnceCell::new(),
});

let healthy = true;
Metrics::set_worker_health(&self.url, healthy);

BasicWorker {
metadata,
load_counter: Arc::new(AtomicUsize::new(0)),
processed_counter: Arc::new(AtomicUsize::new(0)),
healthy: Arc::new(AtomicBool::new(true)),
healthy: Arc::new(AtomicBool::new(healthy)),
consecutive_failures: Arc::new(AtomicUsize::new(0)),
consecutive_successes: Arc::new(AtomicUsize::new(0)),
circuit_breaker: CircuitBreaker::with_config_and_label(
Expand Down
13 changes: 13 additions & 0 deletions sgl-model-gateway/src/observability/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ pub fn init_metrics() {
"smg_worker_requests_active",
"Currently running requests per worker"
);
describe_gauge!(
"smg_worker_health",
"Worker health status (1=healthy, 0=unhealthy)"
);
describe_counter!(
"smg_worker_health_checks_total",
"Health check results by worker_type and result"
Expand Down Expand Up @@ -807,6 +811,15 @@ impl Metrics {
.set(count as f64);
}

/// Publish a worker's health status to the `smg_worker_health` gauge.
///
/// The gauge carries a `worker` label holding `worker_url` and is set to
/// `1.0` when `healthy` is `true`, `0.0` otherwise.
pub fn set_worker_health(worker_url: &str, healthy: bool) {
    let status: f64 = if healthy { 1.0 } else { 0.0 };
    let health_gauge = gauge!(
        "smg_worker_health",
        "worker" => worker_url.to_string()
    );
    health_gauge.set(status);
}

// ========================================================================
// Layer 3: Worker resilience metrics (circuit breaker)
// ========================================================================
Expand Down
Loading