From 0e6fc77c0c257dadcdceea4e7552f9df1b59ac65 Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 11 Feb 2026 15:25:03 +0100 Subject: [PATCH 01/80] feat(worker): Add trigger to worker trait --- Cargo.lock | 3 ++ libdd-common/Cargo.toml | 1 + libdd-common/src/worker.rs | 43 ++++++++++++++- libdd-data-pipeline/Cargo.toml | 1 + libdd-data-pipeline/src/agent_info/fetcher.rs | 2 + libdd-data-pipeline/src/pausable_worker.rs | 35 ++++++++++--- libdd-data-pipeline/src/stats_exporter.rs | 2 + libdd-telemetry/Cargo.toml | 1 + libdd-telemetry/src/worker/mod.rs | 52 ++++++------------- 9 files changed, 93 insertions(+), 47 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5c5d5672a6..1f444467a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2971,6 +2971,7 @@ name = "libdd-common" version = "1.1.0" dependencies = [ "anyhow", + "async-trait", "bytes", "cc", "const_format", @@ -3084,6 +3085,7 @@ version = "1.0.0" dependencies = [ "anyhow", "arc-swap", + "async-trait", "bytes", "clap", "criterion", @@ -3291,6 +3293,7 @@ name = "libdd-telemetry" version = "2.0.0" dependencies = [ "anyhow", + "async-trait", "base64 0.22.1", "futures", "hashbrown 0.15.1", diff --git a/libdd-common/Cargo.toml b/libdd-common/Cargo.toml index 5dd2ea8dd7..78030e17f4 100644 --- a/libdd-common/Cargo.toml +++ b/libdd-common/Cargo.toml @@ -17,6 +17,7 @@ bench = false [dependencies] anyhow = "1.0" +async-trait = "0.1" futures = "0.3" futures-core = { version = "0.3.0", default-features = false } futures-util = { version = "0.3.0", default-features = false } diff --git a/libdd-common/src/worker.rs b/libdd-common/src/worker.rs index c79c9317f2..ca7852848d 100644 --- a/libdd-common/src/worker.rs +++ b/libdd-common/src/worker.rs @@ -1,12 +1,51 @@ // Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 +use async_trait::async_trait; + /// Trait representing a generic worker. /// /// The worker runs an async looping function running periodic tasks. 
/// /// This trait can be used to provide wrapper around a worker. +/// +/// This trait is dyn-compatible thanks to the `async_trait` macro, +/// which allows it to be used as `Box`. +#[async_trait] pub trait Worker { - /// Main worker loop - fn run(&mut self) -> impl std::future::Future + Send; + /// Main worker function + async fn run(&mut self); + + /// Function to call between each `run` to wait for the next run + async fn trigger(&mut self); + + /// Reset the worker in the child after a fork + fn reset(&mut self) { + return; + } + + /// Hook called when the app is shutting down. Used to flush all data. + fn shutdown(&mut self) { + return; + } +} + +// Blanket implementation for boxed trait objects +#[async_trait] +impl Worker for Box { + async fn run(&mut self) { + (**self).run().await + } + + async fn trigger(&mut self) { + (**self).trigger().await + } + + fn reset(&mut self) { + (**self).reset() + } + + fn shutdown(&mut self) { + (**self).shutdown() + } } diff --git a/libdd-data-pipeline/Cargo.toml b/libdd-data-pipeline/Cargo.toml index fc5faa1cb5..107378a289 100644 --- a/libdd-data-pipeline/Cargo.toml +++ b/libdd-data-pipeline/Cargo.toml @@ -14,6 +14,7 @@ autobenches = false [dependencies] anyhow = { version = "1.0" } arc-swap = "1.7.1" +async-trait = "0.1" http = "1.1" http-body-util = "0.1" tracing = { version = "0.1", default-features = false } diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index 9bd7200288..221cca36e4 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -5,6 +5,7 @@ use super::{schema::AgentInfo, AGENT_INFO_CACHE}; use anyhow::{anyhow, Result}; +use async_trait::async_trait; use http::header::HeaderName; use http_body_util::BodyExt; use libdd_common::{http_common, worker::Worker, Endpoint}; @@ -176,6 +177,7 @@ impl AgentInfoFetcher { } } +#[async_trait] impl Worker for AgentInfoFetcher { /// Start fetching the info 
endpoint with the given interval. /// diff --git a/libdd-data-pipeline/src/pausable_worker.rs b/libdd-data-pipeline/src/pausable_worker.rs index 223d0af246..34d25e1ad9 100644 --- a/libdd-data-pipeline/src/pausable_worker.rs +++ b/libdd-data-pipeline/src/pausable_worker.rs @@ -80,10 +80,17 @@ impl PausableWorker { let stop_token = CancellationToken::new(); let cloned_token = stop_token.clone(); let handle = rt.spawn(async move { - select! { - _ = worker.run() => {worker} - _ = cloned_token.cancelled() => {worker} + loop { + select! { + _ = worker.trigger() => { + worker.run().await; + } + _ = cloned_token.cancelled() => { + break; + } + } } + worker }); *self = PausableWorker::Running { handle, stop_token }; @@ -115,6 +122,15 @@ impl PausableWorker { } } + /// Reset the worker state (used in child process after fork). + /// + /// This delegates to the worker's reset method if the worker is in a paused state. + pub fn reset(&mut self) { + if let PausableWorker::Paused { worker } = self { + worker.reset(); + } + } + /// Wait for the run method of the worker to exit. pub async fn join(self) -> Result<(), JoinError> { if let PausableWorker::Running { handle, .. 
} = self { @@ -126,6 +142,7 @@ impl PausableWorker { #[cfg(test)] mod tests { + use async_trait::async_trait; use tokio::{runtime::Builder, time::sleep}; use super::*; @@ -140,13 +157,15 @@ mod tests { sender: Sender, } + #[async_trait] impl Worker for TestWorker { async fn run(&mut self) { - loop { - let _ = self.sender.send(self.state); - self.state += 1; - sleep(Duration::from_millis(100)).await; - } + let _ = self.sender.send(self.state); + self.state += 1; + } + + async fn trigger(&mut self) { + sleep(Duration::from_millis(100)).await; } } diff --git a/libdd-data-pipeline/src/stats_exporter.rs b/libdd-data-pipeline/src/stats_exporter.rs index 6b64c09ecc..83394537b0 100644 --- a/libdd-data-pipeline/src/stats_exporter.rs +++ b/libdd-data-pipeline/src/stats_exporter.rs @@ -11,6 +11,7 @@ use std::{ time, }; +use async_trait::async_trait; use crate::trace_exporter::TracerMetadata; use libdd_common::{worker::Worker, Endpoint, HttpClient}; use libdd_trace_protobuf::pb; @@ -132,6 +133,7 @@ impl StatsExporter { } } +#[async_trait] impl Worker for StatsExporter { /// Run loop of the stats exporter /// diff --git a/libdd-telemetry/Cargo.toml b/libdd-telemetry/Cargo.toml index a98c629ba0..9645076859 100644 --- a/libdd-telemetry/Cargo.toml +++ b/libdd-telemetry/Cargo.toml @@ -18,6 +18,7 @@ https = ["libdd-common/https"] [dependencies] anyhow = { version = "1.0" } +async-trait = "0.1" base64 = "0.22" futures = { version = "0.3", default-features = false } http-body-util = "0.1" diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index 3bfa1bcccb..ed21345e0d 100644 --- a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -11,6 +11,7 @@ use crate::{ metrics::{ContextKey, MetricBuckets, MetricContexts}, }; +use async_trait::async_trait; use libdd_common::{http_common, tag::Tag, worker::Worker}; use std::iter::Sum; @@ -140,6 +141,7 @@ pub struct TelemetryWorker { metrics_flush_interval: Duration, deadlines: 
scheduler::Scheduler, data: TelemetryWorkerData, + next_action: Option, } impl Debug for TelemetryWorker { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { @@ -157,58 +159,33 @@ impl Debug for TelemetryWorker { } } +#[async_trait] impl Worker for TelemetryWorker { - // Runs a state machine that waits for actions, either from the worker's - // mailbox, or scheduled actions from the worker's deadline object. - async fn run(&mut self) { - debug!( - worker.flavor = ?self.flavor, - worker.runtime_id = %self.runtime_id, - "Starting telemetry worker" - ); - - loop { - if self.cancellation_token.is_cancelled() { - debug!( - worker.runtime_id = %self.runtime_id, - "Telemetry worker cancelled, shutting down" - ); - return; - } + async fn trigger(&mut self) { + // Wait for the next action and store it + let action = self.recv_next_action().await; + self.next_action = Some(action); + } - let action = self.recv_next_action().await; + // Processes a single action from the state machine + async fn run(&mut self) { + // Take the action that was stored by trigger() + if let Some(action) = self.next_action.take() { debug!( worker.runtime_id = %self.runtime_id, action = ?action, "Received telemetry action" ); - let action_result = match self.flavor { + let _action_result = match self.flavor { TelemetryWorkerFlavor::Full => self.dispatch_action(action).await, TelemetryWorkerFlavor::MetricsLogs => { self.dispatch_metrics_logs_action(action).await } }; - - match action_result { - ControlFlow::Continue(()) => {} - ControlFlow::Break(()) => { - debug!( - worker.runtime_id = %self.runtime_id, - worker.restartable = self.config.restartable, - "Telemetry worker received break signal" - ); - if !self.config.restartable { - break; - } - } - }; } - debug!( - worker.runtime_id = %self.runtime_id, - "Telemetry worker stopped" - ); + // TODO: Handle action result and add support to stop worker from `run` } } @@ -1145,6 +1122,7 @@ impl TelemetryWorkerBuilder { ), ]), 
cancellation_token: token.clone(), + next_action: None, }; ( From 305b85359fc797712bfd50daf65b292be073912e Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 11 Feb 2026 15:25:54 +0100 Subject: [PATCH 02/80] feat(data_pipeline): add SharedRuntime --- libdd-data-pipeline/src/lib.rs | 1 + libdd-data-pipeline/src/shared_runtime.rs | 337 ++++++++++++++++++++++ 2 files changed, 338 insertions(+) create mode 100644 libdd-data-pipeline/src/shared_runtime.rs diff --git a/libdd-data-pipeline/src/lib.rs b/libdd-data-pipeline/src/lib.rs index 57572cd97a..c059939a55 100644 --- a/libdd-data-pipeline/src/lib.rs +++ b/libdd-data-pipeline/src/lib.rs @@ -13,6 +13,7 @@ pub mod agent_info; mod health_metrics; mod pausable_worker; +pub mod shared_runtime; #[allow(missing_docs)] pub mod stats_exporter; pub(crate) mod telemetry; diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs new file mode 100644 index 0000000000..b068888008 --- /dev/null +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -0,0 +1,337 @@ +// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +//! SharedRuntime for managing PausableWorkers across fork boundaries. +//! +//! This module provides a SharedRuntime that manages a tokio runtime and allows +//! spawning PausableWorkers on it. It also provides hooks for safely handling +//! fork operations by pausing workers before fork and restarting them appropriately +//! in parent and child processes. + +use crate::pausable_worker::{PausableWorker, PausableWorkerError}; +use libdd_common::{worker::Worker, MutexExt}; +use std::fmt; +use std::sync::{Arc, Mutex, PoisonError}; +use tokio::{ + runtime::{Builder, Runtime}, + task::JoinSet, +}; + +/// Type alias for a boxed worker trait object that can be used with PausableWorker. +type BoxedWorker = Box; + +/// Errors that can occur when using SharedRuntime. 
+#[derive(Debug)] +pub enum SharedRuntimeError { + /// The runtime is not available or in an invalid state. + RuntimeUnavailable, + /// Failed to acquire a lock on internal state. + LockFailed(String), + /// A worker operation failed. + WorkerError(PausableWorkerError), + /// Failed to create or manage the tokio runtime. + RuntimeCreation(std::io::Error), + /// A generic error occurred. + Other(String), +} + +impl fmt::Display for SharedRuntimeError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + SharedRuntimeError::RuntimeUnavailable => { + write!(f, "Runtime is not available or in an invalid state") + } + SharedRuntimeError::LockFailed(msg) => write!(f, "Failed to acquire lock: {}", msg), + SharedRuntimeError::WorkerError(err) => write!(f, "Worker error: {}", err), + SharedRuntimeError::RuntimeCreation(err) => { + write!(f, "Failed to create runtime: {}", err) + } + SharedRuntimeError::Other(msg) => write!(f, "{}", msg), + } + } +} + +impl std::error::Error for SharedRuntimeError {} + +impl From for SharedRuntimeError { + fn from(err: PausableWorkerError) -> Self { + SharedRuntimeError::WorkerError(err) + } +} + +impl From for SharedRuntimeError { + fn from(err: std::io::Error) -> Self { + SharedRuntimeError::RuntimeCreation(err) + } +} + +/// A shared runtime that manages PausableWorkers and provides fork safety hooks. +/// +/// The SharedRuntime owns a tokio runtime and tracks PausableWorkers spawned on it. +/// It provides methods to safely pause workers before forking and restart them +/// after fork in both parent and child processes. +#[derive(Debug)] +pub struct SharedRuntime { + runtime: Arc>>>, + workers: Arc>>>, +} + +impl SharedRuntime { + /// Create a new SharedRuntime with a default multi-threaded tokio runtime. + /// + /// # Errors + /// Returns an error if the tokio runtime cannot be created. 
+ pub fn new() -> Result { + let runtime = tokio::runtime::Builder::new_multi_thread() + .worker_threads(1) + .enable_all() + .build()?; + + Ok(Self { + runtime: Arc::new(Mutex::new(Some(runtime))), + workers: Arc::new(Mutex::new(Vec::new())), + }) + } + + /// Spawn a PausableWorker on this runtime. + /// + /// The worker will be tracked by this SharedRuntime and will be paused/resumed + /// during fork operations. + /// + /// # Errors + /// Returns an error if the runtime is not available or the worker cannot be started. + pub fn spawn_worker( + &self, + worker: T, + ) -> Result<(), SharedRuntimeError> { + let boxed_worker: BoxedWorker = Box::new(worker); + let mut pausable_worker = PausableWorker::new(boxed_worker); + + let runtime_lock = self.runtime.lock_or_panic(); + + // If the runtime is not available, it's added to the worker list and will be started when + // the runtime is recreated. + if let Some(runtime) = runtime_lock.as_ref() { + pausable_worker.start(runtime)?; + } + + let mut workers_lock = self.workers.lock_or_panic(); + workers_lock.push(pausable_worker); + + Ok(()) + } + + /// Hook to be called before forking. + /// + /// This method pauses all workers and prepares the runtime for forking. + /// It ensures that no background tasks are running when the fork occurs, + /// preventing potential deadlocks in the child process. + /// + /// # Errors + /// Returns an error if workers cannot be paused or the runtime is in an invalid state. 
+ pub async fn before_fork(&self) -> Result<(), SharedRuntimeError> { + if let Some(runtime) = self.runtime.lock_or_panic().take() { + let pause_result: Result, PausableWorkerError> = runtime + .block_on(async { + let mut set = JoinSet::new(); + let mut workers_lock = self.workers.lock_or_panic(); + + // Pause all workers + for pausable_worker in workers_lock.iter_mut() { + set.spawn(pausable_worker.pause()); + } + set.join_all().await + }) + .into_iter() + .collect(); + pause_result.map_err(|err| SharedRuntimeError::WorkerError(err))?; + } + Ok(()) + } + + fn restart_runtime(&self) -> Result<(), SharedRuntimeError> { + let mut runtime_lock = self.runtime.lock_or_panic(); + if runtime_lock.is_none() { + *runtime_lock = Some(Arc::new( + Builder::new_multi_thread() + .worker_threads(1) + .enable_all() + .build()?, + )); + } + Ok(()) + } + + /// Hook to be called in the parent process after forking. + /// + /// This method restarts workers and resumes normal operation in the parent process. + /// The runtime may need to be recreated if it was shut down in before_fork. + /// + /// # Errors + /// Returns an error if workers cannot be restarted or the runtime cannot be recreated. + pub fn after_fork_parent(&self) -> Result<(), SharedRuntimeError> { + self.restart_runtime()?; + + let runtime_lock = self.runtime.lock_or_panic(); + let runtime = runtime_lock + .as_ref() + .ok_or(SharedRuntimeError::RuntimeUnavailable)?; + + let mut workers_lock = self.workers.lock_or_panic(); + + // Restart all workers + for pausable_worker in workers_lock.iter_mut() { + pausable_worker.start(runtime)?; + } + + Ok(()) + } + + /// Hook to be called in the child process after forking. + /// + /// This method reinitializes the runtime and workers in the child process. + /// A new runtime must be created since tokio runtimes cannot be safely forked. + /// Workers can optionally be restarted to resume operations in the child. 
+ /// + /// # Errors + /// Returns an error if the runtime cannot be reinitialized or workers cannot be started. + pub fn after_fork_child(&self) -> Result<(), SharedRuntimeError> { + self.restart_runtime()?; + + let runtime_lock = self.runtime.lock_or_panic(); + let runtime = runtime_lock + .as_ref() + .ok_or(SharedRuntimeError::RuntimeUnavailable)?; + + let mut workers_lock = self.workers.lock_or_panic(); + + // Restart all workers in child process + for pausable_worker in workers_lock.iter_mut() { + pausable_worker.reset(); + pausable_worker.start(runtime)?; + } + + Ok(()) + } + + /// Get a reference to the underlying runtime. + /// + /// This allows external code to spawn additional tasks on the runtime if needed. + /// + /// # Errors + /// Returns None if the runtime is not available (e.g., during fork operations). + pub fn runtime(&self) -> Arc { + match self.runtime.lock_or_panic().as_ref() { + None => Builder.new_current_thread().enable_all().build().unwrap(), + Some(runtime) => runtime.clone(), + } + } + + /// Shutdown the runtime and all workers. + /// + /// This should be called during application shutdown to cleanly stop all + /// background workers and the runtime. + /// + /// Note: The runtime itself is not dropped by this method to avoid issues with + /// dropping a runtime from within an async context. The runtime will be dropped + /// when the SharedRuntime is dropped from a synchronous context. + /// + /// # Errors + /// Returns an error if workers cannot be stopped. + pub async fn shutdown(&self) -> Result<(), SharedRuntimeError> { + let mut workers_lock = self.workers.lock_or_panic(); + + // Pause all workers + for pausable_worker in workers_lock.iter_mut() { + pausable_worker.pause().await?; + } + + // Note: We don't drop the runtime here because dropping a runtime from + // within an async context causes a panic. The runtime will be properly + // cleaned up when SharedRuntime is dropped from a synchronous context. 
+ + Ok(()) + } +} + +impl Default for SharedRuntime { + fn default() -> Self { + Self::new().expect("Failed to create default SharedRuntime") + } +} + +#[cfg(test)] +mod tests { + use super::*; + use async_trait::async_trait; + use std::sync::mpsc::{channel, Sender}; + use std::time::Duration; + use tokio::time::sleep; + + struct TestWorker { + state: u32, + sender: Sender, + } + + #[async_trait] + impl Worker for TestWorker { + async fn run(&mut self) { + let _ = self.sender.send(self.state); + self.state += 1; + } + + async fn trigger(&self) { + sleep(Duration::from_millis(100)).await; + } + } + + #[test] + fn test_shared_runtime_creation() { + let shared_runtime = SharedRuntime::new(); + assert!(shared_runtime.is_ok()); + } + + #[test] + fn test_spawn_worker() { + let shared_runtime = SharedRuntime::new().unwrap(); + let (sender, _receiver) = channel::(); + let worker = TestWorker { state: 0, sender }; + + // TODO: Complete this test once spawn_worker properly stores workers + let result = shared_runtime.spawn_worker(worker); + assert!(result.is_ok()); + } + + #[test] + fn test_before_and_after_fork_parent() { + // Run in a separate thread to ensure we're not in any async context + let handle = std::thread::spawn(|| { + let rt = tokio::runtime::Runtime::new().unwrap(); + let shared_runtime = SharedRuntime::new().unwrap(); + + // Test before_fork + rt.block_on(async { + assert!(shared_runtime.before_fork().await.is_ok()); + }); + + // Test after_fork_parent (synchronous) + assert!(shared_runtime.after_fork_parent().is_ok()); + + // Clean shutdown + rt.block_on(async { + assert!(shared_runtime.shutdown().await.is_ok()); + }); + }); + + handle.join().expect("Thread panicked"); + } + + #[test] + fn test_after_fork_child() { + // Test after_fork_child in a non-async context + let shared_runtime = SharedRuntime::new().unwrap(); + + // This should succeed as we're not in an async context + assert!(shared_runtime.after_fork_child().is_ok()); + } +} From 
66e9e06244f9024bc80e6f2e5a6ab3f94d8df205 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 13 Feb 2026 20:23:20 +0100 Subject: [PATCH 03/80] feat(worker): add initial trigger --- libdd-common/src/worker.rs | 12 +++++++++++- libdd-data-pipeline/src/pausable_worker.rs | 11 +++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/libdd-common/src/worker.rs b/libdd-common/src/worker.rs index ca7852848d..3d37d0b93a 100644 --- a/libdd-common/src/worker.rs +++ b/libdd-common/src/worker.rs @@ -16,9 +16,15 @@ pub trait Worker { /// Main worker function async fn run(&mut self); - /// Function to call between each `run` to wait for the next run + /// Function called between each `run` to wait for the next run async fn trigger(&mut self); + /// Alternative trigger called on start to provide custom behavior + /// Can be used to trigger first run right away. Defaults to `trigger` behavior. + async fn initial_trigger(&mut self) { + self.trigger().await + } + /// Reset the worker in the child after a fork fn reset(&mut self) { return; @@ -41,6 +47,10 @@ impl Worker for Box { (**self).trigger().await } + async fn initial_trigger(&mut self) { + (**self).initial_trigger().await + } + fn reset(&mut self) { (**self).reset() } diff --git a/libdd-data-pipeline/src/pausable_worker.rs b/libdd-data-pipeline/src/pausable_worker.rs index 34d25e1ad9..ae5faaa0a6 100644 --- a/libdd-data-pipeline/src/pausable_worker.rs +++ b/libdd-data-pipeline/src/pausable_worker.rs @@ -80,6 +80,17 @@ impl PausableWorker { let stop_token = CancellationToken::new(); let cloned_token = stop_token.clone(); let handle = rt.spawn(async move { + // First iteration: use initial_trigger + select! { + _ = worker.initial_trigger() => { + worker.run().await; + } + _ = cloned_token.cancelled() => { + return worker; + } + } + + // Subsequent iterations: use regular trigger loop { select! 
{ _ = worker.trigger() => { From b6722629b5825fb1b93a23015c018ad8b35a43c2 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 13 Feb 2026 20:29:46 +0100 Subject: [PATCH 04/80] feat(agent_info): use initial trigger --- libdd-data-pipeline/src/agent_info/fetcher.rs | 58 +++++++++---------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index 221cca36e4..6c1a2341f5 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -179,46 +179,40 @@ impl AgentInfoFetcher { #[async_trait] impl Worker for AgentInfoFetcher { - /// Start fetching the info endpoint with the given interval. - /// - /// # Warning - /// This method does not return and should be called within a dedicated task. - async fn run(&mut self) { - // Skip the first fetch if some info is present to avoid calling the /info endpoint - // at fork for heavy-forking environment. + async fn initial_trigger(&mut self) { + // Skip initial wait if cache is not populated if AGENT_INFO_CACHE.load().is_none() { - self.fetch_and_update().await; + return; } + self.trigger().await + } - // Main loop waiting for a trigger event or the end of the refresh interval to trigger the - // fetch. - loop { - match &mut self.trigger_rx { - Some(trigger_rx) => { - tokio::select! { - // Wait for manual trigger (new state from headers) - trigger = trigger_rx.recv() => { - if trigger.is_some() { - self.fetch_and_update().await; - } else { - // The channel has been closed - self.trigger_rx = None; - } + async fn trigger(&mut self) { + // Wait for either a manual trigger or the refresh interval + match &mut self.trigger_rx { + Some(trigger_rx) => { + tokio::select! 
{ + // Wait for manual trigger (new state from headers) + trigger = trigger_rx.recv() => { + if trigger.is_none() { + // The channel has been closed + self.trigger_rx = None; } - // Regular periodic fetch timer - _ = sleep(self.refresh_interval) => { - self.fetch_and_update().await; - } - }; - } - None => { - // If the trigger channel is closed we only use timed fetch. - sleep(self.refresh_interval).await; - self.fetch_and_update().await; + } + // Regular periodic fetch timer + _ = sleep(self.refresh_interval) => {} } } + None => { + // If the trigger channel is closed we only use timed fetch. + sleep(self.refresh_interval).await; + } } } + + async fn run(&mut self) { + self.fetch_and_update().await; + } } impl AgentInfoFetcher { From 23022a6bec8800f8dd3a3c24fe77bd6423200419 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 13 Feb 2026 20:30:18 +0100 Subject: [PATCH 05/80] feat(stats): implement stats worker --- libdd-data-pipeline/src/stats_exporter.rs | 33 +++++++++++------------ 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/libdd-data-pipeline/src/stats_exporter.rs b/libdd-data-pipeline/src/stats_exporter.rs index 83394537b0..b1cd68cfdd 100644 --- a/libdd-data-pipeline/src/stats_exporter.rs +++ b/libdd-data-pipeline/src/stats_exporter.rs @@ -11,13 +11,12 @@ use std::{ time, }; -use async_trait::async_trait; use crate::trace_exporter::TracerMetadata; +use async_trait::async_trait; use libdd_common::{worker::Worker, Endpoint, HttpClient}; use libdd_trace_protobuf::pb; use libdd_trace_stats::span_concentrator::SpanConcentrator; use libdd_trace_utils::send_with_retry::{send_with_retry, RetryStrategy}; -use tokio::select; use tokio_util::sync::CancellationToken; use tracing::error; @@ -135,22 +134,22 @@ impl StatsExporter { #[async_trait] impl Worker for StatsExporter { - /// Run loop of the stats exporter - /// - /// Once started, the stats exporter will flush and send stats on every `self.flush_interval`. 
- /// If the `self.cancellation_token` is cancelled, the exporter will force flush all stats and - /// return. + async fn trigger(&mut self) { + tokio::time::sleep(self.flush_interval).await; + } + + /// Flush and send stats on every trigger. async fn run(&mut self) { - loop { - select! { - _ = self.cancellation_token.cancelled() => { - let _ = self.send(true).await; - break; - }, - _ = tokio::time::sleep(self.flush_interval) => { - let _ = self.send(false).await; - }, - }; + let _ = self.send(false).await; + } + + fn shutdown(&mut self) { + // Force flush all stats on shutdown + let rt = tokio::runtime::Handle::try_current(); + if let Ok(handle) = rt { + handle.block_on(async { + let _ = self.send(true).await; + }); } } } From 6c9c7f33cb091acb7a1a05d5b1a2ef4508e19aa0 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 13 Feb 2026 20:32:52 +0100 Subject: [PATCH 06/80] fix(shared_runtime): fix compile error --- libdd-data-pipeline/src/shared_runtime.rs | 46 +++++++++++------------ 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index b068888008..4455a04d18 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -11,11 +11,8 @@ use crate::pausable_worker::{PausableWorker, PausableWorkerError}; use libdd_common::{worker::Worker, MutexExt}; use std::fmt; -use std::sync::{Arc, Mutex, PoisonError}; -use tokio::{ - runtime::{Builder, Runtime}, - task::JoinSet, -}; +use std::sync::{Arc, Mutex}; +use tokio::runtime::{Builder, Runtime}; /// Type alias for a boxed worker trait object that can be used with PausableWorker. type BoxedWorker = Box; @@ -70,12 +67,20 @@ impl From for SharedRuntimeError { /// The SharedRuntime owns a tokio runtime and tracks PausableWorkers spawned on it. /// It provides methods to safely pause workers before forking and restart them /// after fork in both parent and child processes. 
-#[derive(Debug)] pub struct SharedRuntime { runtime: Arc>>>, workers: Arc>>>, } +impl std::fmt::Debug for SharedRuntime { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SharedRuntime") + .field("runtime", &self.runtime) + .field("workers", &"") + .finish() + } +} + impl SharedRuntime { /// Create a new SharedRuntime with a default multi-threaded tokio runtime. /// @@ -88,7 +93,7 @@ impl SharedRuntime { .build()?; Ok(Self { - runtime: Arc::new(Mutex::new(Some(runtime))), + runtime: Arc::new(Mutex::new(Some(Arc::new(runtime)))), workers: Arc::new(Mutex::new(Vec::new())), }) } @@ -131,20 +136,15 @@ impl SharedRuntime { /// Returns an error if workers cannot be paused or the runtime is in an invalid state. pub async fn before_fork(&self) -> Result<(), SharedRuntimeError> { if let Some(runtime) = self.runtime.lock_or_panic().take() { - let pause_result: Result, PausableWorkerError> = runtime - .block_on(async { - let mut set = JoinSet::new(); - let mut workers_lock = self.workers.lock_or_panic(); - - // Pause all workers - for pausable_worker in workers_lock.iter_mut() { - set.spawn(pausable_worker.pause()); - } - set.join_all().await - }) - .into_iter() - .collect(); - pause_result.map_err(|err| SharedRuntimeError::WorkerError(err))?; + runtime.block_on(async { + let mut workers_lock = self.workers.lock_or_panic(); + + // Pause all workers sequentially + for pausable_worker in workers_lock.iter_mut() { + pausable_worker.pause().await?; + } + Ok::<(), PausableWorkerError>(()) + })?; } Ok(()) } @@ -222,7 +222,7 @@ impl SharedRuntime { /// Returns None if the runtime is not available (e.g., during fork operations). 
pub fn runtime(&self) -> Arc { match self.runtime.lock_or_panic().as_ref() { - None => Builder.new_current_thread().enable_all().build().unwrap(), + None => Arc::new(Builder::new_current_thread().enable_all().build().unwrap()), Some(runtime) => runtime.clone(), } } @@ -280,7 +280,7 @@ mod tests { self.state += 1; } - async fn trigger(&self) { + async fn trigger(&mut self) { sleep(Duration::from_millis(100)).await; } } From acc69ae522f2c3dc598b333301cb5f2e3d9398d9 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 13 Feb 2026 20:46:01 +0100 Subject: [PATCH 07/80] data-pipeline: pause all workers before joining in before_fork --- libdd-data-pipeline/src/pausable_worker.rs | 49 ++++++++++++++----- libdd-data-pipeline/src/shared_runtime.rs | 8 ++- libdd-data-pipeline/src/trace_exporter/mod.rs | 4 +- 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/libdd-data-pipeline/src/pausable_worker.rs b/libdd-data-pipeline/src/pausable_worker.rs index ae5faaa0a6..7d266cbfb3 100644 --- a/libdd-data-pipeline/src/pausable_worker.rs +++ b/libdd-data-pipeline/src/pausable_worker.rs @@ -111,28 +111,51 @@ impl PausableWorker { } } - /// Pause the worker saving it's state to be restarted. + /// Request the worker to pause without waiting for task termination. /// /// # Errors - /// Fails if the worker handle has been aborted preventing the worker from being retrieved. - pub async fn pause(&mut self) -> Result<(), PausableWorkerError> { + /// Fails if the worker is in an invalid state. + pub fn request_pause(&mut self) -> Result<(), PausableWorkerError> { match self { - PausableWorker::Running { handle, stop_token } => { + PausableWorker::Running { stop_token, .. } => { stop_token.cancel(); - if let Ok(worker) = handle.await { - *self = PausableWorker::Paused { worker }; - Ok(()) - } else { - // The task has been aborted and the worker can't be retrieved. - *self = PausableWorker::InvalidState; - Err(PausableWorkerError::TaskAborted) - } + Ok(()) } PausableWorker::Paused { .. 
} => Ok(()), PausableWorker::InvalidState => Err(PausableWorkerError::InvalidState), } } + /// Wait for a requested pause to complete and store the worker state. + /// + /// # Errors + /// Fails if the worker handle has been aborted preventing the worker from being retrieved. + pub async fn join(&mut self) -> Result<(), PausableWorkerError> { + if let PausableWorker::Running { handle, .. } = std::mem::replace(self, Self::InvalidState) { + if let Ok(worker) = handle.await { + *self = PausableWorker::Paused { worker }; + Ok(()) + } else { + // The task has been aborted and the worker can't be retrieved. + *self = PausableWorker::InvalidState; + Err(PausableWorkerError::TaskAborted) + } + } else if let PausableWorker::Paused { .. } = self { + Ok(()) + } else { + Err(PausableWorkerError::InvalidState) + } + } + + /// Pause the worker saving it's state to be restarted. + /// + /// # Errors + /// Fails if the worker handle has been aborted preventing the worker from being retrieved. + pub async fn pause(&mut self) -> Result<(), PausableWorkerError> { + self.request_pause()?; + self.join().await + } + /// Reset the worker state (used in child process after fork). /// /// This delegates to the worker's reset method if the worker is in a paused state. @@ -143,7 +166,7 @@ impl PausableWorker { } /// Wait for the run method of the worker to exit. - pub async fn join(self) -> Result<(), JoinError> { + pub async fn join_task(self) -> Result<(), JoinError> { if let PausableWorker::Running { handle, .. } = self { handle.await?; } diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index 4455a04d18..0d5ed74b90 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -139,9 +139,13 @@ impl SharedRuntime { runtime.block_on(async { let mut workers_lock = self.workers.lock_or_panic(); - // Pause all workers sequentially + // First signal all workers to pause, then wait for each one to stop. 
for pausable_worker in workers_lock.iter_mut() { - pausable_worker.pause().await?; + pausable_worker.request_pause()?; + } + + for pausable_worker in workers_lock.iter_mut() { + pausable_worker.join().await?; } Ok::<(), PausableWorkerError>(()) })?; diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 02320a684d..416a1f8e43 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -383,7 +383,7 @@ impl TraceExporter { let stats_worker = self.workers.lock_or_panic().stats.take(); if let Some(stats_worker) = stats_worker { - let _ = stats_worker.join().await; + let _ = stats_worker.join_task().await; } } if let Some(telemetry) = self.telemetry.take() { @@ -391,7 +391,7 @@ impl TraceExporter { let telemetry_worker = self.workers.lock_or_panic().telemetry.take(); if let Some(telemetry_worker) = telemetry_worker { - let _ = telemetry_worker.join().await; + let _ = telemetry_worker.join_task().await; } } } From 17a9bb8a2e9fa896c87d0be51609b1a16f890689 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 16 Feb 2026 15:44:52 +0100 Subject: [PATCH 08/80] feat(exporter): use shared runtime in trace exporter --- libdd-data-pipeline/src/pausable_worker.rs | 9 +- libdd-data-pipeline/src/shared_runtime.rs | 6 +- .../src/trace_exporter/builder.rs | 39 ++-- libdd-data-pipeline/src/trace_exporter/mod.rs | 217 +++--------------- .../src/trace_exporter/stats.rs | 78 ++----- 5 files changed, 62 insertions(+), 287 deletions(-) diff --git a/libdd-data-pipeline/src/pausable_worker.rs b/libdd-data-pipeline/src/pausable_worker.rs index 7d266cbfb3..dc07d9faea 100644 --- a/libdd-data-pipeline/src/pausable_worker.rs +++ b/libdd-data-pipeline/src/pausable_worker.rs @@ -8,7 +8,7 @@ use std::fmt::Display; use tokio::{ runtime::Runtime, select, - task::{JoinError, JoinHandle}, + task::JoinHandle, }; use tokio_util::sync::CancellationToken; @@ -165,13 +165,6 @@ impl PausableWorker { } 
} - /// Wait for the run method of the worker to exit. - pub async fn join_task(self) -> Result<(), JoinError> { - if let PausableWorker::Running { handle, .. } = self { - handle.await?; - } - Ok(()) - } } #[cfg(test)] diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index 0d5ed74b90..b999a1d6ab 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -134,7 +134,7 @@ impl SharedRuntime { /// /// # Errors /// Returns an error if workers cannot be paused or the runtime is in an invalid state. - pub async fn before_fork(&self) -> Result<(), SharedRuntimeError> { + pub fn before_fork(&self) -> Result<(), SharedRuntimeError> { if let Some(runtime) = self.runtime.lock_or_panic().take() { runtime.block_on(async { let mut workers_lock = self.workers.lock_or_panic(); @@ -314,9 +314,7 @@ mod tests { let shared_runtime = SharedRuntime::new().unwrap(); // Test before_fork - rt.block_on(async { - assert!(shared_runtime.before_fork().await.is_ok()); - }); + assert!(shared_runtime.before_fork().is_ok()); // Test after_fork_parent (synchronous) assert!(shared_runtime.after_fork_parent().is_ok()); diff --git a/libdd-data-pipeline/src/trace_exporter/builder.rs b/libdd-data-pipeline/src/trace_exporter/builder.rs index f9833fa668..6567430788 100644 --- a/libdd-data-pipeline/src/trace_exporter/builder.rs +++ b/libdd-data-pipeline/src/trace_exporter/builder.rs @@ -2,20 +2,18 @@ // SPDX-License-Identifier: Apache-2.0 use crate::agent_info::AgentInfoFetcher; -use crate::pausable_worker::PausableWorker; +use crate::shared_runtime::SharedRuntime; use crate::telemetry::TelemetryClientBuilder; use crate::trace_exporter::agent_response::AgentResponsePayloadVersion; use crate::trace_exporter::error::BuilderErrorKind; use crate::trace_exporter::{ add_path, StatsComputationStatus, TelemetryConfig, TraceExporter, TraceExporterError, - TraceExporterInputFormat, TraceExporterOutputFormat, 
TraceExporterWorkers, TracerMetadata, - INFO_ENDPOINT, + TraceExporterInputFormat, TraceExporterOutputFormat, TracerMetadata, INFO_ENDPOINT, }; use arc_swap::ArcSwap; use libdd_common::http_common::new_default_client; use libdd_common::{parse_uri, tag, Endpoint}; use libdd_dogstatsd_client::new; -use std::sync::{Arc, Mutex}; use std::time::Duration; const DEFAULT_AGENT_URL: &str = "http://127.0.0.1:8126"; @@ -227,12 +225,9 @@ impl TraceExporterBuilder { )); } - let runtime = Arc::new( - tokio::runtime::Builder::new_multi_thread() - .worker_threads(1) - .enable_all() - .build()?, - ); + let shared_runtime = SharedRuntime::new().map_err(|e| { + TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration(e.to_string())) + })?; let dogstatsd = self.dogstatsd_url.and_then(|u| { new(Endpoint::from_slice(&u)).ok() // If we couldn't set the endpoint return @@ -251,8 +246,7 @@ impl TraceExporterBuilder { let info_endpoint = Endpoint::from_url(add_path(&agent_url, INFO_ENDPOINT)); let (info_fetcher, info_response_observer) = AgentInfoFetcher::new(info_endpoint.clone(), Duration::from_secs(5 * 60)); - let mut info_fetcher_worker = PausableWorker::new(info_fetcher); - info_fetcher_worker.start(&runtime).map_err(|e| { + shared_runtime.spawn_worker(info_fetcher).map_err(|e| { TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration(e.to_string())) })?; @@ -276,21 +270,20 @@ impl TraceExporterBuilder { if let Some(id) = telemetry_config.runtime_id { builder = builder.set_runtime_id(&id); } - builder.build(runtime.handle().clone()) + builder.build(shared_runtime.runtime().handle().clone()) }); - let (telemetry_client, telemetry_worker) = match telemetry { + let telemetry_client = match telemetry { Some((client, worker)) => { - let mut telemetry_worker = PausableWorker::new(worker); - telemetry_worker.start(&runtime).map_err(|e| { + shared_runtime.spawn_worker(worker).map_err(|e| { TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration( e.to_string(), 
)) })?; - runtime.block_on(client.start()); - (Some(client), Some(telemetry_worker)) + shared_runtime.runtime().block_on(client.start()); + Some(client) } - None => (None, None), + None => None, }; Ok(TraceExporter { @@ -320,7 +313,7 @@ impl TraceExporterBuilder { input_format: self.input_format, output_format: self.output_format, client_computed_top_level: self.client_computed_top_level, - runtime: Arc::new(Mutex::new(Some(runtime))), + shared_runtime, dogstatsd, common_stats_tags: vec![libdatadog_version], client_side_stats: ArcSwap::new(stats.into()), @@ -328,12 +321,6 @@ impl TraceExporterBuilder { info_response_observer, telemetry: telemetry_client, health_metrics_enabled: self.health_metrics_enabled, - workers: Arc::new(Mutex::new(TraceExporterWorkers { - info: info_fetcher_worker, - stats: None, - telemetry: telemetry_worker, - })), - agent_payload_response_version: self .agent_rates_payload_version_enabled .then(AgentResponsePayloadVersion::new), diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 416a1f8e43..053c268569 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -15,8 +15,7 @@ use self::metrics::MetricsEmitter; use self::stats::StatsComputationStatus; use self::trace_serializer::TraceSerializer; use crate::agent_info::{AgentInfoFetcher, ResponseObserver}; -use crate::pausable_worker::PausableWorker; -use crate::stats_exporter::StatsExporter; +use crate::shared_runtime::SharedRuntime; use crate::telemetry::{SendPayloadTelemetry, TelemetryClient}; use crate::trace_exporter::agent_response::{ AgentResponsePayloadVersion, DATADOG_RATES_PAYLOAD_VERSION_HEADER, @@ -35,7 +34,6 @@ use libdd_common::tag::Tag; use libdd_common::{http_common, Endpoint}; use libdd_common::{HttpClient, MutexExt}; use libdd_dogstatsd_client::Client; -use libdd_telemetry::worker::TelemetryWorker; use libdd_trace_utils::msgpack_decoder; use 
libdd_trace_utils::send_with_retry::{ send_with_retry, RetryStrategy, SendWithRetryError, SendWithRetryResult, @@ -43,7 +41,7 @@ use libdd_trace_utils::send_with_retry::{ use libdd_trace_utils::span::{v04::Span, TraceData}; use libdd_trace_utils::trace_utils::TracerHeaderTags; use std::io; -use std::sync::{Arc, Mutex}; +use std::sync::Arc; use std::time::Duration; use std::{borrow::Borrow, collections::HashMap, str::FromStr}; use tokio::runtime::Runtime; @@ -153,13 +151,6 @@ impl<'a> From<&'a TracerMetadata> for HashMap<&'static str, String> { } } -#[derive(Debug)] -pub(crate) struct TraceExporterWorkers { - pub info: PausableWorker, - pub stats: Option>, - pub telemetry: Option>, -} - /// The TraceExporter ingest traces from the tracers serialized as messagepack and forward them to /// the agent while applying some transformation. /// @@ -191,7 +182,7 @@ pub struct TraceExporter { input_format: TraceExporterInputFormat, output_format: TraceExporterOutputFormat, // TODO - do something with the response callback - https://datadoghq.atlassian.net/browse/APMSP-1019 - runtime: Arc>>>, + shared_runtime: SharedRuntime, /// None if dogstatsd is disabled dogstatsd: Option, common_stats_tags: Vec, @@ -201,7 +192,6 @@ pub struct TraceExporter { info_response_observer: ResponseObserver, telemetry: Option, health_metrics_enabled: bool, - workers: Arc>, agent_payload_response_version: Option, http_client: HttpClient, } @@ -212,113 +202,21 @@ impl TraceExporter { TraceExporterBuilder::default() } - /// Return the existing runtime or create a new one and start all workers + /// Return a runtime from the shared runtime manager. 
fn runtime(&self) -> Result, TraceExporterError> { - let mut runtime_guard = self.runtime.lock_or_panic(); - match runtime_guard.as_ref() { - Some(runtime) => { - // Runtime already running - Ok(runtime.clone()) - } - None => { - // Create a new current thread runtime with all features enabled - let runtime = Arc::new( - tokio::runtime::Builder::new_multi_thread() - .worker_threads(1) - .enable_all() - .build()?, - ); - *runtime_guard = Some(runtime.clone()); - self.start_all_workers(&runtime)?; - Ok(runtime) - } - } + Ok(self.shared_runtime.runtime()) } /// Manually start all workers pub fn run_worker(&self) -> Result<(), TraceExporterError> { - self.runtime()?; - Ok(()) - } - - /// Start all workers with the given runtime - fn start_all_workers(&self, runtime: &Arc) -> Result<(), TraceExporterError> { - let mut workers = self.workers.lock_or_panic(); - - self.start_info_worker(&mut workers, runtime)?; - self.start_stats_worker(&mut workers, runtime)?; - self.start_telemetry_worker(&mut workers, runtime)?; - - Ok(()) - } - - /// Start the info worker - fn start_info_worker( - &self, - workers: &mut TraceExporterWorkers, - runtime: &Arc, - ) -> Result<(), TraceExporterError> { - workers.info.start(runtime).map_err(|e| { + self.shared_runtime.after_fork_parent().map_err(|e| { TraceExporterError::Internal(InternalErrorKind::InvalidWorkerState(e.to_string())) }) } - /// Start the stats worker if present - fn start_stats_worker( - &self, - workers: &mut TraceExporterWorkers, - runtime: &Arc, - ) -> Result<(), TraceExporterError> { - if let Some(stats_worker) = &mut workers.stats { - stats_worker.start(runtime).map_err(|e| { - TraceExporterError::Internal(InternalErrorKind::InvalidWorkerState(e.to_string())) - })?; - } - Ok(()) - } - - /// Start the telemetry worker if present - fn start_telemetry_worker( - &self, - workers: &mut TraceExporterWorkers, - runtime: &Arc, - ) -> Result<(), TraceExporterError> { - if let Some(telemetry_worker) = &mut workers.telemetry { - 
telemetry_worker.start(runtime).map_err(|e| { - TraceExporterError::Internal(InternalErrorKind::InvalidWorkerState(e.to_string())) - })?; - if let Some(client) = &self.telemetry { - runtime.block_on(client.start()); - } - } - Ok(()) - } - pub fn stop_worker(&self) { - let runtime = self.runtime.lock_or_panic().take(); - if let Some(ref rt) = runtime { - // Stop workers to save their state - let mut workers = self.workers.lock_or_panic(); - rt.block_on(async { - let _ = workers.info.pause().await; - if let Some(stats_worker) = &mut workers.stats { - let _ = stats_worker.pause().await; - }; - if let Some(telemetry_worker) = &mut workers.telemetry { - let _ = telemetry_worker.pause().await; - }; - }); - } - // When the info fetcher is paused, the trigger channel keeps a reference to the runtime's - // IoStack as a waker. This prevents the IoStack from being dropped when shutting - // down runtime. By manually sending a message to the trigger channel we trigger the - // waker releasing the reference to the IoStack. Finally we drain the channel to - // avoid triggering a fetch when the info fetcher is restarted. - if let PausableWorker::Paused { worker } = &mut self.workers.lock_or_panic().info { - self.info_response_observer.manual_trigger(); - worker.drain(); - } - drop(runtime); + let _ = self.shared_runtime.before_fork(); + self.info_response_observer.manual_trigger(); } /// Send msgpack serialized traces to the agent @@ -373,27 +271,16 @@ impl TraceExporter { /// This function should not take ownership of the trace exporter as it will cause the runtime /// stored in the trace exporter to be dropped in a non-blocking context causing a panic. async fn shutdown_async(&mut self) { - let stats_status = self.client_side_stats.load(); if let StatsComputationStatus::Enabled { cancellation_token, .. 
- } = stats_status.as_ref() + } = self.client_side_stats.load().as_ref() { cancellation_token.cancel(); - - let stats_worker = self.workers.lock_or_panic().stats.take(); - - if let Some(stats_worker) = stats_worker { - let _ = stats_worker.join_task().await; - } } if let Some(telemetry) = self.telemetry.take() { telemetry.shutdown().await; - let telemetry_worker = self.workers.lock_or_panic().telemetry.take(); - - if let Some(telemetry_worker) = telemetry_worker { - let _ = telemetry_worker.join_task().await; - } } + let _ = self.shared_runtime.shutdown().await; } /// Check if agent info state has changed @@ -415,30 +302,22 @@ impl TraceExporter { let ctx = stats::StatsContext { metadata: &self.metadata, endpoint_url: &self.endpoint.url, - runtime: &self.runtime, + shared_runtime: &self.shared_runtime, }; stats::handle_stats_disabled_by_agent( &ctx, &agent_info, &self.client_side_stats, - &self.workers, self.http_client.clone(), ); } StatsComputationStatus::Enabled { stats_concentrator, .. 
} => { - let ctx = stats::StatsContext { - metadata: &self.metadata, - endpoint_url: &self.endpoint.url, - runtime: &self.runtime, - }; stats::handle_stats_enabled( - &ctx, &agent_info, stats_concentrator, &self.client_side_stats, - &self.workers, ); } } @@ -848,7 +727,7 @@ impl TraceExporter { #[cfg(test)] /// Test only function to check if the stats computation is active and the worker is running pub fn is_stats_worker_active(&self) -> bool { - stats::is_stats_worker_active(&self.client_side_stats, &self.workers) + stats::is_stats_worker_active(&self.client_side_stats) } } @@ -1521,15 +1400,9 @@ mod tests { traces_endpoint.assert_calls(1); while metrics_endpoint.calls() == 0 { - exporter - .runtime - .lock() - .unwrap() - .as_ref() - .unwrap() - .block_on(async { - sleep(Duration::from_millis(100)).await; - }) + exporter.shared_runtime.runtime().block_on(async { + sleep(Duration::from_millis(100)).await; + }) } metrics_endpoint.assert_calls(1); } @@ -1579,15 +1452,9 @@ mod tests { traces_endpoint.assert_calls(1); while metrics_endpoint.calls() == 0 { - exporter - .runtime - .lock() - .unwrap() - .as_ref() - .unwrap() - .block_on(async { - sleep(Duration::from_millis(100)).await; - }) + exporter.shared_runtime.runtime().block_on(async { + sleep(Duration::from_millis(100)).await; + }) } metrics_endpoint.assert_calls(1); } @@ -1648,15 +1515,9 @@ mod tests { traces_endpoint.assert_calls(1); while metrics_endpoint.calls() == 0 { - exporter - .runtime - .lock() - .unwrap() - .as_ref() - .unwrap() - .block_on(async { - sleep(Duration::from_millis(100)).await; - }) + exporter.shared_runtime.runtime().block_on(async { + sleep(Duration::from_millis(100)).await; + }) } metrics_endpoint.assert_calls(1); } @@ -1831,15 +1692,9 @@ mod tests { // Wait for the info fetcher to get the config while mock_info.calls() == 0 { - exporter - .runtime - .lock() - .unwrap() - .as_ref() - .unwrap() - .block_on(async { - sleep(Duration::from_millis(100)).await; - }) + 
exporter.shared_runtime.runtime().block_on(async { + sleep(Duration::from_millis(100)).await; + }) } let _ = exporter.send(data.as_ref()).unwrap(); @@ -1938,15 +1793,9 @@ mod single_threaded_tests { // Wait for the info fetcher to get the config while agent_info::get_agent_info().is_none() { - exporter - .runtime - .lock() - .unwrap() - .as_ref() - .unwrap() - .block_on(async { - sleep(Duration::from_millis(100)).await; - }) + exporter.shared_runtime.runtime().block_on(async { + sleep(Duration::from_millis(100)).await; + }) } let result = exporter.send(data.as_ref()); @@ -2043,15 +1892,9 @@ mod single_threaded_tests { // Wait for agent_info to be present so that sending a trace will trigger the stats worker // to start while agent_info::get_agent_info().is_none() { - exporter - .runtime - .lock() - .unwrap() - .as_ref() - .unwrap() - .block_on(async { - sleep(Duration::from_millis(100)).await; - }) + exporter.shared_runtime.runtime().block_on(async { + sleep(Duration::from_millis(100)).await; + }) } exporter.send(data.as_ref()).unwrap(); diff --git a/libdd-data-pipeline/src/trace_exporter/stats.rs b/libdd-data-pipeline/src/trace_exporter/stats.rs index 943ebc5dd1..85ef3d5284 100644 --- a/libdd-data-pipeline/src/trace_exporter/stats.rs +++ b/libdd-data-pipeline/src/trace_exporter/stats.rs @@ -8,13 +8,13 @@ //! and processing traces for stats collection. 
use crate::agent_info::schema::AgentInfo; +use crate::shared_runtime::SharedRuntime; use crate::stats_exporter; use arc_swap::ArcSwap; use libdd_common::{Endpoint, HttpClient, MutexExt}; use libdd_trace_stats::span_concentrator::SpanConcentrator; use std::sync::{Arc, Mutex}; use std::time::Duration; -use tokio::runtime::Runtime; use tokio_util::sync::CancellationToken; use tracing::{debug, error}; @@ -28,7 +28,7 @@ pub(crate) const STATS_ENDPOINT: &str = "/v0.6/stats"; pub(crate) struct StatsContext<'a> { pub metadata: &'a super::TracerMetadata, pub endpoint_url: &'a http::Uri, - pub runtime: &'a Arc>>>, + pub shared_runtime: &'a SharedRuntime, } #[derive(Debug)] @@ -61,7 +61,6 @@ fn get_span_kinds_for_stats(agent_info: &Arc) -> Vec { pub(crate) fn start_stats_computation( ctx: &StatsContext, client_side_stats: &ArcSwap, - workers: &Arc>, span_kinds: Vec, peer_tags: Vec, client: HttpClient, @@ -79,7 +78,6 @@ pub(crate) fn start_stats_computation( bucket_size, &stats_concentrator, &cancellation_token, - workers, client_side_stats, client, )?; @@ -93,7 +91,6 @@ fn create_and_start_stats_worker( bucket_size: Duration, stats_concentrator: &Arc>, cancellation_token: &CancellationToken, - workers: &Arc>, client_side_stats: &ArcSwap, client: HttpClient, ) -> anyhow::Result<()> { @@ -105,22 +102,11 @@ fn create_and_start_stats_worker( cancellation_token.clone(), client, ); - let mut stats_worker = crate::pausable_worker::PausableWorker::new(stats_exporter); + ctx.shared_runtime + .spawn_worker(stats_exporter) + .map_err(|e| anyhow::anyhow!(e.to_string()))?; - // Get runtime guard - let runtime_guard = ctx.runtime.lock_or_panic(); - if let Some(rt) = runtime_guard.as_ref() { - stats_worker.start(rt).map_err(|e| { - super::error::TraceExporterError::Internal( - super::error::InternalErrorKind::InvalidWorkerState(e.to_string()), - ) - })?; - } else { - return Err(anyhow::anyhow!("Runtime not available")); - } - - // Update the stats computation state with the new worker and 
components - workers.lock_or_panic().stats = Some(stats_worker); + // Update the stats computation state with the new worker components. client_side_stats.store(Arc::new(StatsComputationStatus::Enabled { stats_concentrator: stats_concentrator.clone(), cancellation_token: cancellation_token.clone(), @@ -132,29 +118,17 @@ fn create_and_start_stats_worker( /// Stops the stats exporter and disable stats computation /// /// Used when client-side stats is disabled by the agent -pub(crate) fn stop_stats_computation( - ctx: &StatsContext, - client_side_stats: &ArcSwap, - workers: &Arc>, -) { +pub(crate) fn stop_stats_computation(client_side_stats: &ArcSwap) { if let StatsComputationStatus::Enabled { stats_concentrator, cancellation_token, } = &**client_side_stats.load() { - // If there's no runtime there's no exporter to stop - let runtime_guard = ctx.runtime.lock_or_panic(); - if let Some(rt) = runtime_guard.as_ref() { - rt.block_on(async { - cancellation_token.cancel(); - }); - workers.lock_or_panic().stats = None; - let bucket_size = stats_concentrator.lock_or_panic().get_bucket_size(); - - client_side_stats.store(Arc::new(StatsComputationStatus::DisabledByAgent { - bucket_size, - })); - } + cancellation_token.cancel(); + let bucket_size = stats_concentrator.lock_or_panic().get_bucket_size(); + client_side_stats.store(Arc::new(StatsComputationStatus::DisabledByAgent { + bucket_size, + })); } } @@ -163,7 +137,6 @@ pub(crate) fn handle_stats_disabled_by_agent( ctx: &StatsContext, agent_info: &Arc, client_side_stats: &ArcSwap, - workers: &Arc>, client: HttpClient, ) { if agent_info.info.client_drop_p0s.is_some_and(|v| v) { @@ -171,7 +144,6 @@ pub(crate) fn handle_stats_disabled_by_agent( let status = start_stats_computation( ctx, client_side_stats, - workers, get_span_kinds_for_stats(agent_info), agent_info.info.peer_tags.clone().unwrap_or_default(), client, @@ -187,18 +159,16 @@ pub(crate) fn handle_stats_disabled_by_agent( /// Handle stats computation when it's already 
enabled pub(crate) fn handle_stats_enabled( - ctx: &StatsContext, agent_info: &Arc, stats_concentrator: &Mutex, client_side_stats: &ArcSwap, - workers: &Arc>, ) { if agent_info.info.client_drop_p0s.is_some_and(|v| v) { let mut concentrator = stats_concentrator.lock_or_panic(); concentrator.set_span_kinds(get_span_kinds_for_stats(agent_info)); concentrator.set_peer_tags(agent_info.info.peer_tags.clone().unwrap_or_default()); } else { - stop_stats_computation(ctx, client_side_stats, workers); + stop_stats_computation(client_side_stats); debug!("Client-side stats computation has been disabled by the agent") } } @@ -258,25 +228,9 @@ pub(crate) fn process_traces_for_stats( #[cfg(test)] /// Test only function to check if the stats computation is active and the worker is running -pub(crate) fn is_stats_worker_active( - client_side_stats: &ArcSwap, - workers: &Arc>, -) -> bool { - if !matches!( +pub(crate) fn is_stats_worker_active(client_side_stats: &ArcSwap) -> bool { + matches!( **client_side_stats.load(), StatsComputationStatus::Enabled { .. } - ) { - return false; - } - - if let Ok(workers) = workers.try_lock() { - if let Some(stats_worker) = &workers.stats { - return matches!( - stats_worker, - crate::pausable_worker::PausableWorker::Running { .. 
} - ); - } - } - - false + ) } From 238a43025e1100f5c8c65e285a62f974cd1386c0 Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 18 Feb 2026 15:06:28 +0100 Subject: [PATCH 09/80] feat(shared_runtime): add worker handle --- libdd-common/src/worker.rs | 8 +- libdd-data-pipeline/src/pausable_worker.rs | 60 +++++--- libdd-data-pipeline/src/shared_runtime.rs | 141 +++++++++++++----- libdd-data-pipeline/src/stats_exporter.rs | 92 +++++------- .../src/trace_exporter/stats.rs | 1 - 5 files changed, 188 insertions(+), 114 deletions(-) diff --git a/libdd-common/src/worker.rs b/libdd-common/src/worker.rs index 3d37d0b93a..bae37c66a4 100644 --- a/libdd-common/src/worker.rs +++ b/libdd-common/src/worker.rs @@ -12,7 +12,7 @@ use async_trait::async_trait; /// This trait is dyn-compatible thanks to the `async_trait` macro, /// which allows it to be used as `Box`. #[async_trait] -pub trait Worker { +pub trait Worker: std::fmt::Debug { /// Main worker function async fn run(&mut self); @@ -31,7 +31,7 @@ pub trait Worker { } /// Hook called when the app is shutting down. Used to flush all data. - fn shutdown(&mut self) { + async fn shutdown(&mut self) { return; } } @@ -55,7 +55,7 @@ impl Worker for Box { (**self).reset() } - fn shutdown(&mut self) { - (**self).shutdown() + async fn shutdown(&mut self) { + (**self).shutdown().await } } diff --git a/libdd-data-pipeline/src/pausable_worker.rs b/libdd-data-pipeline/src/pausable_worker.rs index dc07d9faea..b37294e65a 100644 --- a/libdd-data-pipeline/src/pausable_worker.rs +++ b/libdd-data-pipeline/src/pausable_worker.rs @@ -5,11 +5,7 @@ use libdd_common::worker::Worker; use std::fmt::Display; -use tokio::{ - runtime::Runtime, - select, - task::JoinHandle, -}; +use tokio::{runtime::Runtime, select, task::JoinHandle}; use tokio_util::sync::CancellationToken; /// A pausable worker which can be paused and restarted on forks. 
@@ -35,6 +31,7 @@ pub enum PausableWorker { Paused { worker: T, }, + Stopped, InvalidState, } @@ -42,6 +39,7 @@ pub enum PausableWorker { pub enum PausableWorkerError { InvalidState, TaskAborted, + WorkerStopped, } impl Display for PausableWorkerError { @@ -53,6 +51,9 @@ impl Display for PausableWorkerError { PausableWorkerError::TaskAborted => { write!(f, "Worker task has been aborted and state has been lost.") } + PausableWorkerError::WorkerStopped => { + write!(f, "Worker has been definitely stopped") + } } } } @@ -74,6 +75,8 @@ impl PausableWorker { pub fn start(&mut self, rt: &Runtime) -> Result<(), PausableWorkerError> { if let Self::Running { .. } = self { Ok(()) + } else if let Self::Stopped = self { + Err(PausableWorkerError::WorkerStopped) } else if let Self::Paused { mut worker } = std::mem::replace(self, Self::InvalidState) { // Worker is temporarily in an invalid state, but since this block is failsafe it will // be replaced by a valid state. @@ -113,15 +116,18 @@ impl PausableWorker { /// Request the worker to pause without waiting for task termination. /// + /// This is useful when pausing multiple workers in parallel. + /// /// # Errors /// Fails if the worker is in an invalid state. - pub fn request_pause(&mut self) -> Result<(), PausableWorkerError> { + pub fn request_pause(&self) -> Result<(), PausableWorkerError> { match self { PausableWorker::Running { stop_token, .. } => { stop_token.cancel(); Ok(()) } PausableWorker::Paused { .. } => Ok(()), + PausableWorker::Stopped => Ok(()), PausableWorker::InvalidState => Err(PausableWorkerError::InvalidState), } } @@ -131,19 +137,26 @@ impl PausableWorker { /// # Errors /// Fails if the worker handle has been aborted preventing the worker from being retrieved. pub async fn join(&mut self) -> Result<(), PausableWorkerError> { - if let PausableWorker::Running { handle, .. 
} = std::mem::replace(self, Self::InvalidState) { - if let Ok(worker) = handle.await { - *self = PausableWorker::Paused { worker }; - Ok(()) - } else { - // The task has been aborted and the worker can't be retrieved. - *self = PausableWorker::InvalidState; - Err(PausableWorkerError::TaskAborted) + match self { + PausableWorker::Running { .. } => { + let PausableWorker::Running { handle, .. } = + std::mem::replace(self, PausableWorker::InvalidState) + else { + // Unreachable + return Ok(()); + }; + + if let Ok(worker) = handle.await { + *self = PausableWorker::Paused { worker }; + Ok(()) + } else { + // The task has been aborted and the worker can't be retrieved. + *self = PausableWorker::InvalidState; + Err(PausableWorkerError::TaskAborted) + } } - } else if let PausableWorker::Paused { .. } = self { - Ok(()) - } else { - Err(PausableWorkerError::InvalidState) + PausableWorker::Paused { .. } | PausableWorker::Stopped => Ok(()), + PausableWorker::InvalidState => Err(PausableWorkerError::InvalidState), } } @@ -156,15 +169,20 @@ impl PausableWorker { self.join().await } - /// Reset the worker state (used in child process after fork). - /// - /// This delegates to the worker's reset method if the worker is in a paused state. + /// Reset the worker state (e.g. in a fork child). pub fn reset(&mut self) { if let PausableWorker::Paused { worker } = self { worker.reset(); } } + /// Shutdown the worker. 
+    pub async fn shutdown(&mut self) {
+        if let PausableWorker::Paused { worker } = self {
+            worker.shutdown().await;
+        }
+        *self = PausableWorker::Stopped;
+    }
 }
 
 #[cfg(test)]
diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs
index b999a1d6ab..ab9bdfd3f1 100644
--- a/libdd-data-pipeline/src/shared_runtime.rs
+++ b/libdd-data-pipeline/src/shared_runtime.rs
@@ -11,12 +11,51 @@ use crate::pausable_worker::{PausableWorker, PausableWorkerError};
 use libdd_common::{worker::Worker, MutexExt};
 use std::fmt;
+use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};
 use tokio::runtime::{Builder, Runtime};
+use tokio::task::JoinSet;
 
 /// Type alias for a boxed worker trait object that can be used with PausableWorker.
 type BoxedWorker = Box;
 
+#[derive(Debug)]
+struct WorkerEntry {
+    id: u64,
+    worker: PausableWorker,
+}
+
+/// Handle to a worker registered on a [`SharedRuntime`].
+///
+/// This handle can be used to stop the worker.
+#[derive(Clone, Debug)]
+pub struct WorkerHandle {
+    worker_id: u64,
+    workers: Arc>>,
+}
+
+impl WorkerHandle {
+    /// Stop the worker and call its shutdown logic.
+    ///
+    /// # Errors
+    /// Returns an error if the worker does not exist anymore or is already stopped.
+    pub async fn stop(self) -> Result<(), SharedRuntimeError> {
+        let mut workers_lock = self.workers.lock_or_panic();
+        let Some(position) = workers_lock
+            .iter()
+            .position(|entry| entry.id == self.worker_id)
+        else {
+            return Err(SharedRuntimeError::Other(
+                "Worker not found or already stopped".to_string(),
+            ));
+        };
+        workers_lock[position].worker.pause().await?;
+        workers_lock[position].worker.shutdown().await;
+        workers_lock[position].worker = PausableWorker::Stopped;
+        Ok(())
+    }
+}
+
 /// Errors that can occur when using SharedRuntime. 
#[derive(Debug)] pub enum SharedRuntimeError { @@ -67,18 +106,11 @@ impl From for SharedRuntimeError { /// The SharedRuntime owns a tokio runtime and tracks PausableWorkers spawned on it. /// It provides methods to safely pause workers before forking and restart them /// after fork in both parent and child processes. +#[derive(Debug)] pub struct SharedRuntime { runtime: Arc>>>, - workers: Arc>>>, -} - -impl std::fmt::Debug for SharedRuntime { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("SharedRuntime") - .field("runtime", &self.runtime) - .field("workers", &"") - .finish() - } + workers: Arc>>, + next_worker_id: AtomicU64, } impl SharedRuntime { @@ -95,6 +127,7 @@ impl SharedRuntime { Ok(Self { runtime: Arc::new(Mutex::new(Some(Arc::new(runtime)))), workers: Arc::new(Mutex::new(Vec::new())), + next_worker_id: AtomicU64::new(1), }) } @@ -108,9 +141,10 @@ impl SharedRuntime { pub fn spawn_worker( &self, worker: T, - ) -> Result<(), SharedRuntimeError> { + ) -> Result { let boxed_worker: BoxedWorker = Box::new(worker); let mut pausable_worker = PausableWorker::new(boxed_worker); + let worker_id = self.next_worker_id.fetch_add(1, Ordering::Relaxed); let runtime_lock = self.runtime.lock_or_panic(); @@ -121,9 +155,15 @@ impl SharedRuntime { } let mut workers_lock = self.workers.lock_or_panic(); - workers_lock.push(pausable_worker); + workers_lock.push(WorkerEntry { + id: worker_id, + worker: pausable_worker, + }); - Ok(()) + Ok(WorkerHandle { + worker_id, + workers: self.workers.clone(), + }) } /// Hook to be called before forking. @@ -140,12 +180,12 @@ impl SharedRuntime { let mut workers_lock = self.workers.lock_or_panic(); // First signal all workers to pause, then wait for each one to stop. 
- for pausable_worker in workers_lock.iter_mut() { - pausable_worker.request_pause()?; + for worker_entry in workers_lock.iter_mut() { + worker_entry.worker.request_pause()?; } - for pausable_worker in workers_lock.iter_mut() { - pausable_worker.join().await?; + for worker_entry in workers_lock.iter_mut() { + worker_entry.worker.join().await?; } Ok::<(), PausableWorkerError>(()) })?; @@ -184,8 +224,13 @@ impl SharedRuntime { let mut workers_lock = self.workers.lock_or_panic(); // Restart all workers - for pausable_worker in workers_lock.iter_mut() { - pausable_worker.start(runtime)?; + for worker_entry in workers_lock.iter_mut() { + if let Err(err) = worker_entry.worker.start(runtime) { + // Ignore worker not started because they are stopped + if !matches!(err, PausableWorkerError::WorkerStopped) { + return Err(err.into()); + } + } } Ok(()) @@ -210,9 +255,13 @@ impl SharedRuntime { let mut workers_lock = self.workers.lock_or_panic(); // Restart all workers in child process - for pausable_worker in workers_lock.iter_mut() { - pausable_worker.reset(); - pausable_worker.start(runtime)?; + for worker_entry in workers_lock.iter_mut() { + worker_entry.worker.reset(); + if let Err(err) = worker_entry.worker.start(runtime) { + if !matches!(err, PausableWorkerError::WorkerStopped) { + return Err(err.into()); + } + } } Ok(()) @@ -236,24 +285,24 @@ impl SharedRuntime { /// This should be called during application shutdown to cleanly stop all /// background workers and the runtime. /// - /// Note: The runtime itself is not dropped by this method to avoid issues with - /// dropping a runtime from within an async context. The runtime will be dropped - /// when the SharedRuntime is dropped from a synchronous context. - /// /// # Errors /// Returns an error if workers cannot be stopped. 
pub async fn shutdown(&self) -> Result<(), SharedRuntimeError> { - let mut workers_lock = self.workers.lock_or_panic(); - - // Pause all workers - for pausable_worker in workers_lock.iter_mut() { - pausable_worker.pause().await?; + let workers = { + let mut workers_lock = self.workers.lock_or_panic(); + std::mem::take(&mut *workers_lock) + }; + + let mut join_set = JoinSet::new(); + for mut worker_entry in workers { + join_set.spawn(async move { + worker_entry.worker.pause().await?; + worker_entry.worker.shutdown().await; + Ok::<(), PausableWorkerError>(()) + }); } - // Note: We don't drop the runtime here because dropping a runtime from - // within an async context causes a panic. The runtime will be properly - // cleaned up when SharedRuntime is dropped from a synchronous context. - + join_set.join_all().await; Ok(()) } } @@ -272,6 +321,7 @@ mod tests { use std::time::Duration; use tokio::time::sleep; + #[derive(Debug)] struct TestWorker { state: u32, sender: Sender, @@ -301,9 +351,28 @@ mod tests { let (sender, _receiver) = channel::(); let worker = TestWorker { state: 0, sender }; - // TODO: Complete this test once spawn_worker properly stores workers let result = shared_runtime.spawn_worker(worker); assert!(result.is_ok()); + assert_eq!(shared_runtime.workers.lock_or_panic().len(), 1); + } + + #[test] + fn test_worker_handle_stop_marks_worker_stopped() { + let rt = tokio::runtime::Runtime::new().unwrap(); + let shared_runtime = SharedRuntime::new().unwrap(); + let (sender, _receiver) = channel::(); + let worker = TestWorker { state: 0, sender }; + + let handle = shared_runtime.spawn_worker(worker).unwrap(); + assert_eq!(shared_runtime.workers.lock_or_panic().len(), 1); + + rt.block_on(async { + assert!(handle.stop().await.is_ok()); + }); + + let workers_lock = shared_runtime.workers.lock_or_panic(); + assert_eq!(workers_lock.len(), 1); + assert!(matches!(workers_lock[0].worker, PausableWorker::Stopped)); } #[test] diff --git 
a/libdd-data-pipeline/src/stats_exporter.rs b/libdd-data-pipeline/src/stats_exporter.rs index b1cd68cfdd..0205752f7d 100644 --- a/libdd-data-pipeline/src/stats_exporter.rs +++ b/libdd-data-pipeline/src/stats_exporter.rs @@ -17,7 +17,6 @@ use libdd_common::{worker::Worker, Endpoint, HttpClient}; use libdd_trace_protobuf::pb; use libdd_trace_stats::span_concentrator::SpanConcentrator; use libdd_trace_utils::send_with_retry::{send_with_retry, RetryStrategy}; -use tokio_util::sync::CancellationToken; use tracing::error; const STATS_ENDPOINT_PATH: &str = "/v0.6/stats"; @@ -30,7 +29,6 @@ pub struct StatsExporter { endpoint: Endpoint, meta: TracerMetadata, sequence_id: AtomicU64, - cancellation_token: CancellationToken, client: HttpClient, } @@ -48,7 +46,6 @@ impl StatsExporter { concentrator: Arc>, meta: TracerMetadata, endpoint: Endpoint, - cancellation_token: CancellationToken, client: HttpClient, ) -> Self { Self { @@ -57,7 +54,6 @@ impl StatsExporter { endpoint, meta, sequence_id: AtomicU64::new(0), - cancellation_token, client, } } @@ -143,14 +139,9 @@ impl Worker for StatsExporter { let _ = self.send(false).await; } - fn shutdown(&mut self) { + async fn shutdown(&mut self) { // Force flush all stats on shutdown - let rt = tokio::runtime::Handle::try_current(); - if let Ok(handle) = rt { - handle.block_on(async { - let _ = self.send(true).await; - }); - } + let _ = self.send(true).await; } } @@ -268,7 +259,6 @@ mod tests { Arc::new(Mutex::new(get_test_concentrator())), get_test_metadata(), Endpoint::from_url(stats_url_from_agent_url(&server.url("/")).unwrap()), - CancellationToken::new(), new_default_client(), ); @@ -296,7 +286,6 @@ mod tests { Arc::new(Mutex::new(get_test_concentrator())), get_test_metadata(), Endpoint::from_url(stats_url_from_agent_url(&server.url("/")).unwrap()), - CancellationToken::new(), new_default_client(), ); @@ -329,7 +318,6 @@ mod tests { Arc::new(Mutex::new(get_test_concentrator())), get_test_metadata(), 
Endpoint::from_url(stats_url_from_agent_url(&server.url("/")).unwrap()), - CancellationToken::new(), new_default_client(), ); @@ -347,42 +335,42 @@ mod tests { ); } - #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn test_cancellation_token() { - let server = MockServer::start_async().await; - - let mut mock = server - .mock_async(|when, then| { - when.method(POST) - .header("Content-type", "application/msgpack") - .path("/v0.6/stats") - .body_includes("libdatadog-test"); - then.status(200).body(""); - }) - .await; - - let buckets_duration = Duration::from_secs(10); - let cancellation_token = CancellationToken::new(); - - let mut stats_exporter = StatsExporter::new( - buckets_duration, - Arc::new(Mutex::new(get_test_concentrator())), - get_test_metadata(), - Endpoint::from_url(stats_url_from_agent_url(&server.url("/")).unwrap()), - cancellation_token.clone(), - new_default_client(), - ); - - tokio::spawn(async move { - stats_exporter.run().await; - }); - // Cancel token to trigger force flush - cancellation_token.cancel(); - - assert!( - poll_for_mock_hit(&mut mock, 10, 100, 1, false).await, - "Expected max retry attempts" - ); - } + // #[cfg_attr(miri, ignore)] + // #[tokio::test] + // async fn test_cancellation_token() { + // let server = MockServer::start_async().await; + // + // let mut mock = server + // .mock_async(|when, then| { + // when.method(POST) + // .header("Content-type", "application/msgpack") + // .path("/v0.6/stats") + // .body_includes("libdatadog-test"); + // then.status(200).body(""); + // }) + // .await; + // + // let buckets_duration = Duration::from_secs(10); + // let cancellation_token = CancellationToken::new(); + // + // let mut stats_exporter = StatsExporter::new( + // buckets_duration, + // Arc::new(Mutex::new(get_test_concentrator())), + // get_test_metadata(), + // Endpoint::from_url(stats_url_from_agent_url(&server.url("/")).unwrap()), + // cancellation_token.clone(), + // new_default_client(), + // ); + // + // 
tokio::spawn(async move { + // stats_exporter.run().await; + // }); + // // Cancel token to trigger force flush + // cancellation_token.cancel(); + // + // assert!( + // poll_for_mock_hit(&mut mock, 10, 100, 1, false).await, + // "Expected max retry attempts" + // ); + // } } diff --git a/libdd-data-pipeline/src/trace_exporter/stats.rs b/libdd-data-pipeline/src/trace_exporter/stats.rs index 85ef3d5284..dfd01e5449 100644 --- a/libdd-data-pipeline/src/trace_exporter/stats.rs +++ b/libdd-data-pipeline/src/trace_exporter/stats.rs @@ -99,7 +99,6 @@ fn create_and_start_stats_worker( stats_concentrator.clone(), ctx.metadata.clone(), Endpoint::from_url(add_path(ctx.endpoint_url, STATS_ENDPOINT)), - cancellation_token.clone(), client, ); ctx.shared_runtime From 592c24c442ef5b67c5d834fd31b25bb4ec9816bf Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 18 Feb 2026 15:35:05 +0100 Subject: [PATCH 10/80] refactor(worker): remove stopped status --- libdd-data-pipeline/src/pausable_worker.rs | 75 +++++++++++----------- libdd-data-pipeline/src/shared_runtime.rs | 31 +++------ 2 files changed, 47 insertions(+), 59 deletions(-) diff --git a/libdd-data-pipeline/src/pausable_worker.rs b/libdd-data-pipeline/src/pausable_worker.rs index b37294e65a..d49149f2d7 100644 --- a/libdd-data-pipeline/src/pausable_worker.rs +++ b/libdd-data-pipeline/src/pausable_worker.rs @@ -31,7 +31,6 @@ pub enum PausableWorker { Paused { worker: T, }, - Stopped, InvalidState, } @@ -39,7 +38,6 @@ pub enum PausableWorker { pub enum PausableWorkerError { InvalidState, TaskAborted, - WorkerStopped, } impl Display for PausableWorkerError { @@ -51,9 +49,6 @@ impl Display for PausableWorkerError { PausableWorkerError::TaskAborted => { write!(f, "Worker task has been aborted and state has been lost.") } - PausableWorkerError::WorkerStopped => { - write!(f, "Worker has been definitely stopped") - } } } } @@ -73,44 +68,49 @@ impl PausableWorker { /// # Errors /// Fails if the worker is in an invalid state. 
pub fn start(&mut self, rt: &Runtime) -> Result<(), PausableWorkerError> { - if let Self::Running { .. } = self { - Ok(()) - } else if let Self::Stopped = self { - Err(PausableWorkerError::WorkerStopped) - } else if let Self::Paused { mut worker } = std::mem::replace(self, Self::InvalidState) { - // Worker is temporarily in an invalid state, but since this block is failsafe it will - // be replaced by a valid state. - let stop_token = CancellationToken::new(); - let cloned_token = stop_token.clone(); - let handle = rt.spawn(async move { - // First iteration: use initial_trigger - select! { - _ = worker.initial_trigger() => { - worker.run().await; - } - _ = cloned_token.cancelled() => { - return worker; - } - } + match self { + PausableWorker::Running { .. } => Ok(()), + PausableWorker::Paused { .. } => { + let PausableWorker::Paused { mut worker } = + std::mem::replace(self, PausableWorker::InvalidState) + else { + // Unreachable + return Ok(()); + }; - // Subsequent iterations: use regular trigger - loop { + // Worker is temporarily in an invalid state, but since this block is failsafe it + // will be replaced by a valid state. + let stop_token = CancellationToken::new(); + let cloned_token = stop_token.clone(); + let handle = rt.spawn(async move { + // First iteration using initial_trigger select! { - _ = worker.trigger() => { + _ = worker.initial_trigger() => { worker.run().await; } _ = cloned_token.cancelled() => { - break; + return worker; } } - } - worker - }); - *self = PausableWorker::Running { handle, stop_token }; - Ok(()) - } else { - Err(PausableWorkerError::InvalidState) + // Regular iterations + loop { + select! 
{ + _ = worker.trigger() => { + worker.run().await; + } + _ = cloned_token.cancelled() => { + break; + } + } + } + worker + }); + + *self = PausableWorker::Running { handle, stop_token }; + Ok(()) + } + PausableWorker::InvalidState => Err(PausableWorkerError::InvalidState), } } @@ -127,7 +127,6 @@ impl PausableWorker { Ok(()) } PausableWorker::Paused { .. } => Ok(()), - PausableWorker::Stopped => Ok(()), PausableWorker::InvalidState => Err(PausableWorkerError::InvalidState), } } @@ -155,7 +154,7 @@ impl PausableWorker { Err(PausableWorkerError::TaskAborted) } } - PausableWorker::Paused { .. } | PausableWorker::Stopped => Ok(()), + PausableWorker::Paused { .. } => Ok(()), PausableWorker::InvalidState => Err(PausableWorkerError::InvalidState), } } @@ -181,7 +180,6 @@ impl PausableWorker { if let PausableWorker::Paused { worker } = self { worker.shutdown().await; } - *self = PausableWorker::Stopped; } } @@ -197,6 +195,7 @@ mod tests { }; /// Test worker incrementing the state and sending it with the sender. + #[derive(Debug)] struct TestWorker { state: u32, sender: Sender, diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index ab9bdfd3f1..e644b2aecd 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -35,10 +35,10 @@ pub struct WorkerHandle { } impl WorkerHandle { - /// Stop the worker and call it's shutdown logic. + /// Stop the worker, call it's shutdown logic and remove it from the worker list. /// /// # Errors - /// Returns an error if the worker does not exist anymore or is already stopped. + /// Returns an error if the worker does not exist anymore. 
pub async fn stop(self) -> Result<(), SharedRuntimeError> { let mut workers_lock = self.workers.lock_or_panic(); let Some(position) = workers_lock @@ -46,12 +46,12 @@ impl WorkerHandle { .position(|entry| entry.id == self.worker_id) else { return Err(SharedRuntimeError::Other( - "Worker not found or already stopped".to_string(), + "Worker has already been stopped".to_string(), )); }; - workers_lock[position].worker.pause().await?; - workers_lock[position].worker.shutdown().await; - workers_lock[position].worker = PausableWorker::Stopped; + let WorkerEntry { mut worker, .. } = workers_lock.swap_remove(position); + worker.pause().await?; + worker.shutdown().await; Ok(()) } } @@ -225,12 +225,7 @@ impl SharedRuntime { // Restart all workers for worker_entry in workers_lock.iter_mut() { - if let Err(err) = worker_entry.worker.start(runtime) { - // Ignore worker not started because they are stopped - if !matches!(err, PausableWorkerError::WorkerStopped) { - return Err(err.into()); - } - } + worker_entry.worker.start(runtime)?; } Ok(()) @@ -257,11 +252,7 @@ impl SharedRuntime { // Restart all workers in child process for worker_entry in workers_lock.iter_mut() { worker_entry.worker.reset(); - if let Err(err) = worker_entry.worker.start(runtime) { - if !matches!(err, PausableWorkerError::WorkerStopped) { - return Err(err.into()); - } - } + worker_entry.worker.start(runtime)?; } Ok(()) @@ -357,7 +348,7 @@ mod tests { } #[test] - fn test_worker_handle_stop_marks_worker_stopped() { + fn test_worker_handle_stop_removes_worker() { let rt = tokio::runtime::Runtime::new().unwrap(); let shared_runtime = SharedRuntime::new().unwrap(); let (sender, _receiver) = channel::(); @@ -370,9 +361,7 @@ mod tests { assert!(handle.stop().await.is_ok()); }); - let workers_lock = shared_runtime.workers.lock_or_panic(); - assert_eq!(workers_lock.len(), 1); - assert!(matches!(workers_lock[0].worker, PausableWorker::Stopped)); + assert_eq!(shared_runtime.workers.lock_or_panic().len(), 0); } #[test] 
From ff6448c39bc0f5a4a933885c73ee2f9d6f2c690b Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 20 Feb 2026 16:10:06 +0100 Subject: [PATCH 11/80] chore(telemetry): move telemetry shutdown to worker --- libdd-data-pipeline/src/shared_runtime.rs | 4 +-- libdd-data-pipeline/src/trace_exporter/mod.rs | 17 +++++------- .../src/trace_exporter/stats.rs | 27 ++++++++++--------- libdd-telemetry/src/worker/mod.rs | 10 +++++++ 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index e644b2aecd..180e30c3d2 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -35,10 +35,10 @@ pub struct WorkerHandle { } impl WorkerHandle { - /// Stop the worker, call it's shutdown logic and remove it from the worker list. + /// Stop the worker and execute the shutdown logic. /// /// # Errors - /// Returns an error if the worker does not exist anymore. + /// Returns an error if the worker has already been stopped. pub async fn stop(self) -> Result<(), SharedRuntimeError> { let mut workers_lock = self.workers.lock_or_panic(); let Some(position) = workers_lock diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 053c268569..471a3f3708 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -271,16 +271,7 @@ impl TraceExporter { /// This function should not take ownership of the trace exporter as it will cause the runtime /// stored in the trace exporter to be dropped in a non-blocking context causing a panic. async fn shutdown_async(&mut self) { - if let StatsComputationStatus::Enabled { - cancellation_token, .. 
- } = self.client_side_stats.load().as_ref() - { - cancellation_token.cancel(); - } - if let Some(telemetry) = self.telemetry.take() { - telemetry.shutdown().await; - } - let _ = self.shared_runtime.shutdown().await; + self.shared_runtime.shutdown().await; } /// Check if agent info state has changed @@ -314,7 +305,13 @@ impl TraceExporter { StatsComputationStatus::Enabled { stats_concentrator, .. } => { + let ctx = stats::StatsContext { + metadata: &self.metadata, + endpoint_url: &self.endpoint.url, + shared_runtime: &self.shared_runtime, + }; stats::handle_stats_enabled( + &ctx, &agent_info, stats_concentrator, &self.client_side_stats, diff --git a/libdd-data-pipeline/src/trace_exporter/stats.rs b/libdd-data-pipeline/src/trace_exporter/stats.rs index dfd01e5449..51374aaca0 100644 --- a/libdd-data-pipeline/src/trace_exporter/stats.rs +++ b/libdd-data-pipeline/src/trace_exporter/stats.rs @@ -8,14 +8,13 @@ //! and processing traces for stats collection. use crate::agent_info::schema::AgentInfo; -use crate::shared_runtime::SharedRuntime; +use crate::shared_runtime::{SharedRuntime, WorkerHandle}; use crate::stats_exporter; use arc_swap::ArcSwap; use libdd_common::{Endpoint, HttpClient, MutexExt}; use libdd_trace_stats::span_concentrator::SpanConcentrator; use std::sync::{Arc, Mutex}; use std::time::Duration; -use tokio_util::sync::CancellationToken; use tracing::{debug, error}; use super::add_path; @@ -42,7 +41,7 @@ pub(crate) enum StatsComputationStatus { /// Client-side stats is enabled Enabled { stats_concentrator: Arc>, - cancellation_token: CancellationToken, + worker_handle: WorkerHandle, }, } @@ -72,12 +71,10 @@ pub(crate) fn start_stats_computation( span_kinds, peer_tags, ))); - let cancellation_token = CancellationToken::new(); create_and_start_stats_worker( ctx, bucket_size, &stats_concentrator, - &cancellation_token, client_side_stats, client, )?; @@ -90,7 +87,6 @@ fn create_and_start_stats_worker( ctx: &StatsContext, bucket_size: Duration, 
stats_concentrator: &Arc>, - cancellation_token: &CancellationToken, client_side_stats: &ArcSwap, client: HttpClient, ) -> anyhow::Result<()> { @@ -101,14 +97,15 @@ fn create_and_start_stats_worker( Endpoint::from_url(add_path(ctx.endpoint_url, STATS_ENDPOINT)), client, ); - ctx.shared_runtime + let worker_handle = ctx + .shared_runtime .spawn_worker(stats_exporter) .map_err(|e| anyhow::anyhow!(e.to_string()))?; // Update the stats computation state with the new worker components. client_side_stats.store(Arc::new(StatsComputationStatus::Enabled { stats_concentrator: stats_concentrator.clone(), - cancellation_token: cancellation_token.clone(), + worker_handle, })); Ok(()) @@ -117,17 +114,22 @@ fn create_and_start_stats_worker( /// Stops the stats exporter and disable stats computation /// /// Used when client-side stats is disabled by the agent -pub(crate) fn stop_stats_computation(client_side_stats: &ArcSwap) { +pub(crate) fn stop_stats_computation( + ctx: &StatsContext, + client_side_stats: &ArcSwap, +) { if let StatsComputationStatus::Enabled { stats_concentrator, - cancellation_token, + worker_handle, } = &**client_side_stats.load() { - cancellation_token.cancel(); let bucket_size = stats_concentrator.lock_or_panic().get_bucket_size(); client_side_stats.store(Arc::new(StatsComputationStatus::DisabledByAgent { bucket_size, })); + ctx.shared_runtime + .runtime() + .block_on(async { worker_handle.clone().stop().await }); } } @@ -158,6 +160,7 @@ pub(crate) fn handle_stats_disabled_by_agent( /// Handle stats computation when it's already enabled pub(crate) fn handle_stats_enabled( + ctx: &StatsContext, agent_info: &Arc, stats_concentrator: &Mutex, client_side_stats: &ArcSwap, @@ -167,7 +170,7 @@ pub(crate) fn handle_stats_enabled( concentrator.set_span_kinds(get_span_kinds_for_stats(agent_info)); concentrator.set_peer_tags(agent_info.info.peer_tags.clone().unwrap_or_default()); } else { - stop_stats_computation(client_side_stats); + stop_stats_computation(ctx, 
client_side_stats); debug!("Client-side stats computation has been disabled by the agent") } } diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index ed21345e0d..27ca18caa3 100644 --- a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -187,6 +187,16 @@ impl Worker for TelemetryWorker { // TODO: Handle action result and add support to stop worker from `run` } + + async fn shutdown(&mut self) { + let stop_action = TelemetryActions::Lifecycle(LifecycleAction::Stop); + let _action_result = match self.flavor { + TelemetryWorkerFlavor::Full => self.dispatch_action(stop_action).await, + TelemetryWorkerFlavor::MetricsLogs => { + self.dispatch_metrics_logs_action(stop_action).await + } + }; + } } #[derive(Debug, Default, Serialize, Deserialize)] From 0dbce861bd74745c98bef6e8e4efd4866b7a4282 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 23 Feb 2026 14:24:13 +0100 Subject: [PATCH 12/80] chore(shared-runtime): update error types --- libdd-common/src/worker.rs | 17 ++-- libdd-data-pipeline/src/pausable_worker.rs | 9 -- libdd-data-pipeline/src/shared_runtime.rs | 90 +++++++++++-------- libdd-data-pipeline/src/telemetry/mod.rs | 8 -- .../src/trace_exporter/builder.rs | 19 +++- libdd-data-pipeline/src/trace_exporter/mod.rs | 22 ++--- .../src/trace_exporter/stats.rs | 9 +- 7 files changed, 92 insertions(+), 82 deletions(-) diff --git a/libdd-common/src/worker.rs b/libdd-common/src/worker.rs index bae37c66a4..cbd2f85a71 100644 --- a/libdd-common/src/worker.rs +++ b/libdd-common/src/worker.rs @@ -5,12 +5,11 @@ use async_trait::async_trait; /// Trait representing a generic worker. /// -/// The worker runs an async looping function running periodic tasks. +/// # Lifecycle +/// The worker's `Self::run` method should be executed everytime the `Self::trigger` method returns. +/// On startup `Self::initial_trigger` should be called before `Self::run`. /// -/// This trait can be used to provide wrapper around a worker. 
/// -/// This trait is dyn-compatible thanks to the `async_trait` macro, -/// which allows it to be used as `Box`. #[async_trait] pub trait Worker: std::fmt::Debug { /// Main worker function @@ -26,14 +25,10 @@ pub trait Worker: std::fmt::Debug { } /// Reset the worker in the child after a fork - fn reset(&mut self) { - return; - } + fn reset(&mut self) {} - /// Hook called when the app is shutting down. Used to flush all data. - async fn shutdown(&mut self) { - return; - } + /// Hook called when the app is shutting down. Can be used to flush remaining data. + async fn shutdown(&mut self) {} } // Blanket implementation for boxed trait objects diff --git a/libdd-data-pipeline/src/pausable_worker.rs b/libdd-data-pipeline/src/pausable_worker.rs index d49149f2d7..50d478869d 100644 --- a/libdd-data-pipeline/src/pausable_worker.rs +++ b/libdd-data-pipeline/src/pausable_worker.rs @@ -64,9 +64,6 @@ impl PausableWorker { /// Start the worker on the given runtime. /// /// The worker's main loop will be run on the runtime. - /// - /// # Errors - /// Fails if the worker is in an invalid state. pub fn start(&mut self, rt: &Runtime) -> Result<(), PausableWorkerError> { match self { PausableWorker::Running { .. } => Ok(()), @@ -117,9 +114,6 @@ impl PausableWorker { /// Request the worker to pause without waiting for task termination. /// /// This is useful when pausing multiple workers in parallel. - /// - /// # Errors - /// Fails if the worker is in an invalid state. pub fn request_pause(&self) -> Result<(), PausableWorkerError> { match self { PausableWorker::Running { stop_token, .. } => { @@ -160,9 +154,6 @@ impl PausableWorker { } /// Pause the worker saving it's state to be restarted. - /// - /// # Errors - /// Fails if the worker handle has been aborted preventing the worker from being retrieved. 
pub async fn pause(&mut self) -> Result<(), PausableWorkerError> { self.request_pause()?; self.join().await diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index 180e30c3d2..51b16333b5 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -10,13 +10,12 @@ use crate::pausable_worker::{PausableWorker, PausableWorkerError}; use libdd_common::{worker::Worker, MutexExt}; -use std::fmt; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; +use std::{fmt, io}; use tokio::runtime::{Builder, Runtime}; use tokio::task::JoinSet; -/// Type alias for a boxed worker trait object that can be used with PausableWorker. type BoxedWorker = Box; #[derive(Debug)] @@ -34,22 +33,46 @@ pub struct WorkerHandle { workers: Arc>>, } +#[derive(Debug)] +pub enum WorkerHandleError { + AlreadyStopped, + WorkerError(PausableWorkerError), +} + +impl fmt::Display for WorkerHandleError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::AlreadyStopped => { + write!(f, "Worker has already been stopped") + } + Self::WorkerError(err) => write!(f, "Worker error: {}", err), + } + } +} + +impl From for WorkerHandleError { + fn from(err: PausableWorkerError) -> Self { + Self::WorkerError(err) + } +} + impl WorkerHandle { /// Stop the worker and execute the shutdown logic. /// /// # Errors /// Returns an error if the worker has already been stopped. 
- pub async fn stop(self) -> Result<(), SharedRuntimeError> { - let mut workers_lock = self.workers.lock_or_panic(); - let Some(position) = workers_lock - .iter() - .position(|entry| entry.id == self.worker_id) - else { - return Err(SharedRuntimeError::Other( - "Worker has already been stopped".to_string(), - )); + pub async fn stop(self) -> Result<(), WorkerHandleError> { + let mut worker = { + let mut workers_lock = self.workers.lock_or_panic(); + let Some(position) = workers_lock + .iter() + .position(|entry| entry.id == self.worker_id) + else { + return Err(WorkerHandleError::AlreadyStopped); + }; + let WorkerEntry { worker, .. } = workers_lock.swap_remove(position); + worker }; - let WorkerEntry { mut worker, .. } = workers_lock.swap_remove(position); worker.pause().await?; worker.shutdown().await; Ok(()) @@ -65,24 +88,21 @@ pub enum SharedRuntimeError { LockFailed(String), /// A worker operation failed. WorkerError(PausableWorkerError), - /// Failed to create or manage the tokio runtime. - RuntimeCreation(std::io::Error), - /// A generic error occurred. - Other(String), + /// Failed to create the tokio runtime. 
+ RuntimeCreation(io::Error), } impl fmt::Display for SharedRuntimeError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - SharedRuntimeError::RuntimeUnavailable => { + Self::RuntimeUnavailable => { write!(f, "Runtime is not available or in an invalid state") } - SharedRuntimeError::LockFailed(msg) => write!(f, "Failed to acquire lock: {}", msg), - SharedRuntimeError::WorkerError(err) => write!(f, "Worker error: {}", err), - SharedRuntimeError::RuntimeCreation(err) => { + Self::LockFailed(msg) => write!(f, "Failed to acquire lock: {}", msg), + Self::WorkerError(err) => write!(f, "Worker error: {}", err), + Self::RuntimeCreation(err) => { write!(f, "Failed to create runtime: {}", err) } - SharedRuntimeError::Other(msg) => write!(f, "{}", msg), } } } @@ -95,8 +115,8 @@ impl From for SharedRuntimeError { } } -impl From for SharedRuntimeError { - fn from(err: std::io::Error) -> Self { +impl From for SharedRuntimeError { + fn from(err: io::Error) -> Self { SharedRuntimeError::RuntimeCreation(err) } } @@ -176,10 +196,8 @@ impl SharedRuntime { /// Returns an error if workers cannot be paused or the runtime is in an invalid state. pub fn before_fork(&self) -> Result<(), SharedRuntimeError> { if let Some(runtime) = self.runtime.lock_or_panic().take() { - runtime.block_on(async { - let mut workers_lock = self.workers.lock_or_panic(); - - // First signal all workers to pause, then wait for each one to stop. + let mut workers_lock = self.workers.lock_or_panic(); + runtime.block_on(async move { for worker_entry in workers_lock.iter_mut() { worker_entry.worker.request_pause()?; } @@ -258,16 +276,18 @@ impl SharedRuntime { Ok(()) } - /// Get a reference to the underlying runtime. + /// Get a reference to the underlying runtime or create a single-threaded one. /// /// This allows external code to spawn additional tasks on the runtime if needed. /// /// # Errors - /// Returns None if the runtime is not available (e.g., during fork operations). 
- pub fn runtime(&self) -> Arc { + /// Returns an error if it fails to create a runtime. + pub fn runtime(&self) -> Result, io::Error> { match self.runtime.lock_or_panic().as_ref() { - None => Arc::new(Builder::new_current_thread().enable_all().build().unwrap()), - Some(runtime) => runtime.clone(), + None => Ok(Arc::new( + Builder::new_current_thread().enable_all().build()?, + )), + Some(runtime) => Ok(runtime.clone()), } } @@ -298,12 +318,6 @@ impl SharedRuntime { } } -impl Default for SharedRuntime { - fn default() -> Self { - Self::new().expect("Failed to create default SharedRuntime") - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index 9715aa50ae..3276ae7652 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -297,14 +297,6 @@ impl TelemetryClient { .send_msg(TelemetryActions::Lifecycle(LifecycleAction::Start)) .await; } - - /// Shutdowns the telemetry client. - pub async fn shutdown(self) { - _ = self - .worker - .send_msg(TelemetryActions::Lifecycle(LifecycleAction::Stop)) - .await; - } } #[cfg(test)] diff --git a/libdd-data-pipeline/src/trace_exporter/builder.rs b/libdd-data-pipeline/src/trace_exporter/builder.rs index 6567430788..69ee14ced1 100644 --- a/libdd-data-pipeline/src/trace_exporter/builder.rs +++ b/libdd-data-pipeline/src/trace_exporter/builder.rs @@ -270,19 +270,32 @@ impl TraceExporterBuilder { if let Some(id) = telemetry_config.runtime_id { builder = builder.set_runtime_id(&id); } - builder.build(shared_runtime.runtime().handle().clone()) + let runtime = shared_runtime.runtime().map_err(|e| { + TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration(e.to_string())) + })?; + // This handle is never used since we run it as a SharedRuntime worker. So it is fine + // if the tokio runtime is dropped by SharedRuntime. 
+ Ok(builder.build(runtime.handle().clone())) }); let telemetry_client = match telemetry { - Some((client, worker)) => { + Some(Ok((client, worker))) => { shared_runtime.spawn_worker(worker).map_err(|e| { TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration( e.to_string(), )) })?; - shared_runtime.runtime().block_on(client.start()); + shared_runtime + .runtime() + .map_err(|e| { + TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration( + e.to_string(), + )) + })? + .block_on(client.start()); Some(client) } + Some(Err(e)) => return Err(e), None => None, }; diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 471a3f3708..c498efbe6e 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -14,7 +14,7 @@ use self::agent_response::AgentResponse; use self::metrics::MetricsEmitter; use self::stats::StatsComputationStatus; use self::trace_serializer::TraceSerializer; -use crate::agent_info::{AgentInfoFetcher, ResponseObserver}; +use crate::agent_info::ResponseObserver; use crate::shared_runtime::SharedRuntime; use crate::telemetry::{SendPayloadTelemetry, TelemetryClient}; use crate::trace_exporter::agent_response::{ @@ -31,8 +31,8 @@ use http::uri::PathAndQuery; use http::Uri; use http_body_util::BodyExt; use libdd_common::tag::Tag; +use libdd_common::HttpClient; use libdd_common::{http_common, Endpoint}; -use libdd_common::{HttpClient, MutexExt}; use libdd_dogstatsd_client::Client; use libdd_trace_utils::msgpack_decoder; use libdd_trace_utils::send_with_retry::{ @@ -204,7 +204,9 @@ impl TraceExporter { /// Return a runtime from the shared runtime manager. 
fn runtime(&self) -> Result, TraceExporterError> { - Ok(self.shared_runtime.runtime()) + self.shared_runtime + .runtime() + .map_err(|e| TraceExporterError::Io(e)) } /// Manually start all workers @@ -271,7 +273,7 @@ impl TraceExporter { /// This function should not take ownership of the trace exporter as it will cause the runtime /// stored in the trace exporter to be dropped in a non-blocking context causing a panic. async fn shutdown_async(&mut self) { - self.shared_runtime.shutdown().await; + let _ = self.shared_runtime.shutdown().await; } /// Check if agent info state has changed @@ -1397,7 +1399,7 @@ mod tests { traces_endpoint.assert_calls(1); while metrics_endpoint.calls() == 0 { - exporter.shared_runtime.runtime().block_on(async { + exporter.shared_runtime.runtime().unwrap().block_on(async { sleep(Duration::from_millis(100)).await; }) } @@ -1449,7 +1451,7 @@ mod tests { traces_endpoint.assert_calls(1); while metrics_endpoint.calls() == 0 { - exporter.shared_runtime.runtime().block_on(async { + exporter.shared_runtime.runtime().unwrap().block_on(async { sleep(Duration::from_millis(100)).await; }) } @@ -1512,7 +1514,7 @@ mod tests { traces_endpoint.assert_calls(1); while metrics_endpoint.calls() == 0 { - exporter.shared_runtime.runtime().block_on(async { + exporter.shared_runtime.runtime().unwrap().block_on(async { sleep(Duration::from_millis(100)).await; }) } @@ -1689,7 +1691,7 @@ mod tests { // Wait for the info fetcher to get the config while mock_info.calls() == 0 { - exporter.shared_runtime.runtime().block_on(async { + exporter.shared_runtime.runtime().unwrap().block_on(async { sleep(Duration::from_millis(100)).await; }) } @@ -1790,7 +1792,7 @@ mod single_threaded_tests { // Wait for the info fetcher to get the config while agent_info::get_agent_info().is_none() { - exporter.shared_runtime.runtime().block_on(async { + exporter.shared_runtime.runtime().unwrap().block_on(async { sleep(Duration::from_millis(100)).await; }) } @@ -1889,7 +1891,7 @@ mod 
single_threaded_tests { // Wait for agent_info to be present so that sending a trace will trigger the stats worker // to start while agent_info::get_agent_info().is_none() { - exporter.shared_runtime.runtime().block_on(async { + exporter.shared_runtime.runtime().unwrap().block_on(async { sleep(Duration::from_millis(100)).await; }) } diff --git a/libdd-data-pipeline/src/trace_exporter/stats.rs b/libdd-data-pipeline/src/trace_exporter/stats.rs index 51374aaca0..b33a014c1d 100644 --- a/libdd-data-pipeline/src/trace_exporter/stats.rs +++ b/libdd-data-pipeline/src/trace_exporter/stats.rs @@ -127,9 +127,12 @@ pub(crate) fn stop_stats_computation( client_side_stats.store(Arc::new(StatsComputationStatus::DisabledByAgent { bucket_size, })); - ctx.shared_runtime - .runtime() - .block_on(async { worker_handle.clone().stop().await }); + match ctx.shared_runtime.runtime() { + Ok(runtime) => { + let _ = runtime.block_on(async { worker_handle.clone().stop().await }); + } + Err(e) => error!("Failed to stop stats worker: {e}"), + } } } From 58bff21c38b07640363e7fa0df36373099f46f1e Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 26 Feb 2026 11:40:56 +0100 Subject: [PATCH 13/80] chore(shared-runtime): return detailed errors in before_fork --- libdd-data-pipeline/src/shared_runtime.rs | 23 ++++++++++++++----- libdd-data-pipeline/src/trace_exporter/mod.rs | 7 ++++-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index 51b16333b5..f8b023a286 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -194,19 +194,30 @@ impl SharedRuntime { /// /// # Errors /// Returns an error if workers cannot be paused or the runtime is in an invalid state. 
- pub fn before_fork(&self) -> Result<(), SharedRuntimeError> { + pub fn before_fork(&self) -> Result<(), Vec> { if let Some(runtime) = self.runtime.lock_or_panic().take() { let mut workers_lock = self.workers.lock_or_panic(); - runtime.block_on(async move { + let results = runtime.block_on(async move { + let mut results = Vec::new(); for worker_entry in workers_lock.iter_mut() { - worker_entry.worker.request_pause()?; + let _ = worker_entry.worker.request_pause(); } for worker_entry in workers_lock.iter_mut() { - worker_entry.worker.join().await?; + results.push(worker_entry.worker.join().await); } - Ok::<(), PausableWorkerError>(()) - })?; + results + }); + + // Collect all errors + let errors: Vec = results + .into_iter() + .filter_map(|r| Some(r.err()?.into())) + .collect(); + + if !errors.is_empty() { + return Err(errors); + } } Ok(()) } diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index c498efbe6e..e6b50fd0e3 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -206,7 +206,7 @@ impl TraceExporter { fn runtime(&self) -> Result, TraceExporterError> { self.shared_runtime .runtime() - .map_err(|e| TraceExporterError::Io(e)) + .map_err(TraceExporterError::Io) } /// Manually start all workers @@ -217,7 +217,10 @@ impl TraceExporter { } pub fn stop_worker(&self) { - let _ = self.shared_runtime.before_fork(); + let errors = self.shared_runtime.before_fork(); + if let Err(errors) = errors { + error!("Some workers failed to stop: {errors:?}"); + } self.info_response_observer.manual_trigger(); } From 16ab63ba3f68a3e0b40a45d2ee68a6b48331f1ad Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 27 Feb 2026 17:41:14 +0100 Subject: [PATCH 14/80] chore(runtime): doc --- libdd-common/src/worker.rs | 4 +--- libdd-data-pipeline/src/trace_exporter/mod.rs | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/libdd-common/src/worker.rs 
b/libdd-common/src/worker.rs index cbd2f85a71..d8f1df4805 100644 --- a/libdd-common/src/worker.rs +++ b/libdd-common/src/worker.rs @@ -8,8 +8,6 @@ use async_trait::async_trait; /// # Lifecycle /// The worker's `Self::run` method should be executed everytime the `Self::trigger` method returns. /// On startup `Self::initial_trigger` should be called before `Self::run`. -/// -/// #[async_trait] pub trait Worker: std::fmt::Debug { /// Main worker function @@ -19,7 +17,7 @@ pub trait Worker: std::fmt::Debug { async fn trigger(&mut self); /// Alternative trigger called on start to provide custom behavior - /// Can be used to trigger first run right away. Defaults to `trigger` behavior. + /// Defaults to `trigger` behavior. async fn initial_trigger(&mut self) { self.trigger().await } diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index e6b50fd0e3..dfd7b932c8 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -216,6 +216,7 @@ impl TraceExporter { }) } + /// Stop all background workers and drop the tokio runtime pub fn stop_worker(&self) { let errors = self.shared_runtime.before_fork(); if let Err(errors) = errors { From 3625c0a225d91b78b7bbb14b4363385380030b33 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 2 Mar 2026 13:33:58 +0100 Subject: [PATCH 15/80] test(telemetry): use shared runtime in tests --- libdd-data-pipeline/src/telemetry/mod.rs | 930 ++++++++++++++--------- 1 file changed, 574 insertions(+), 356 deletions(-) diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index 3276ae7652..37d797eedc 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -304,27 +304,12 @@ mod tests { use http::{Response, StatusCode}; use httpmock::Method::POST; use httpmock::MockServer; - use libdd_common::{http_common, worker::Worker}; + use libdd_common::http_common; 
use regex::Regex; use tokio::time::sleep; use super::*; - - async fn get_test_client(url: &str) -> TelemetryClient { - let (client, mut worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(url) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(Handle::current()); - tokio::spawn(async move { worker.run().await }); - client - } + use crate::shared_runtime::SharedRuntime; #[test] fn builder_test() { @@ -370,293 +355,503 @@ mod tests { } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn api_bytes_test() { + #[test] + fn api_bytes_test() { let payload = Regex::new(r#""metric":"trace_api.bytes","tags":\["src_library:libdatadog"\],"sketch_b64":".+","common":true,"interval":\d+,"type":"distribution""#).unwrap(); - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - bytes_sent: 1, - ..Default::default() - }; - - let client = get_test_client(&server.url("/")).await; + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + bytes_sent: 1, + ..Default::default() + }; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + 
.set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + let _ = client.send(&data); + handle.stop().await.expect("Failed to stop worker"); + while telemetry_srv.calls_async().await == 0 { + sleep(Duration::from_millis(10)).await; + } + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - let _ = client.send(&data); - client.shutdown().await; - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn requests_test() { + #[test] + fn requests_test() { let payload = Regex::new(r#""metric":"trace_api.requests","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog"\],"common":true,"type":"count""#).unwrap(); - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - requests_count: 1, - ..Default::default() - }; - - let client = get_test_client(&server.url("/")).await; + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + requests_count: 1, + ..Default::default() + }; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + 
.set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + let _ = client.send(&data); + handle.stop().await.expect("Failed to stop worker"); + while telemetry_srv.calls_async().await == 0 { + sleep(Duration::from_millis(10)).await; + } + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - let _ = client.send(&data); - client.shutdown().await; - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn responses_per_code_test() { + #[test] + fn responses_per_code_test() { let payload = Regex::new(r#""metric":"trace_api.responses","points":\[\[\d+,1\.0\]\],"tags":\["status_code:200","src_library:libdatadog"\],"common":true,"type":"count"#).unwrap(); - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - responses_count_per_code: HashMap::from([(200, 1)]), - ..Default::default() - }; - - let client = get_test_client(&server.url("/")).await; + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + 
.await; + + let data = SendPayloadTelemetry { + responses_count_per_code: HashMap::from([(200, 1)]), + ..Default::default() + }; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + let _ = client.send(&data); + handle.stop().await.expect("Failed to stop worker"); + while telemetry_srv.calls_async().await == 0 { + sleep(Duration::from_millis(10)).await; + } + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - let _ = client.send(&data); - client.shutdown().await; - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn errors_timeout_test() { + #[test] + fn errors_timeout_test() { let payload = Regex::new(r#""metric":"trace_api.errors","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","type:timeout"\],"common":true,"type":"count"#).unwrap(); - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - errors_timeout: 1, - ..Default::default() - }; - - let client = get_test_client(&server.url("/")).await; + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let 
server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + errors_timeout: 1, + ..Default::default() + }; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + let _ = client.send(&data); + handle.stop().await.expect("Failed to stop worker"); + while telemetry_srv.calls_async().await == 0 { + sleep(Duration::from_millis(10)).await; + } + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - let _ = client.send(&data); - client.shutdown().await; - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn errors_network_test() { + #[test] + fn errors_network_test() { let payload = Regex::new(r#""metric":"trace_api.errors","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","type:network"\],"common":true,"type":"count"#).unwrap(); - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - errors_network: 1, - ..Default::default() - }; - - let client = get_test_client(&server.url("/")).await; + let shared_runtime = 
SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + errors_network: 1, + ..Default::default() + }; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + let _ = client.send(&data); + handle.stop().await.expect("Failed to stop worker"); + while telemetry_srv.calls_async().await == 0 { + sleep(Duration::from_millis(10)).await; + } + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - let _ = client.send(&data); - client.shutdown().await; - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn errors_status_code_test() { + #[test] + fn errors_status_code_test() { let payload = Regex::new(r#""metric":"trace_api.errors","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","type:status_code"\],"common":true,"type":"count"#).unwrap(); - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data 
= SendPayloadTelemetry { - errors_status_code: 1, - ..Default::default() - }; - - let client = get_test_client(&server.url("/")).await; + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + errors_status_code: 1, + ..Default::default() + }; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + let _ = client.send(&data); + handle.stop().await.expect("Failed to stop worker"); + while telemetry_srv.calls_async().await == 0 { + sleep(Duration::from_millis(10)).await; + } + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - let _ = client.send(&data); - client.shutdown().await; - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn chunks_sent_test() { + #[test] + fn chunks_sent_test() { let payload = Regex::new(r#""metric":"trace_chunks_sent","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog"\],"common":true,"type":"count"#).unwrap(); - let server = MockServer::start_async().await; - - let telemetry_srv = 
server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_sent: 1, - ..Default::default() - }; - - let client = get_test_client(&server.url("/")).await; + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + chunks_sent: 1, + ..Default::default() + }; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + let _ = client.send(&data); + handle.stop().await.expect("Failed to stop worker"); + while telemetry_srv.calls_async().await == 0 { + sleep(Duration::from_millis(10)).await; + } + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - let _ = client.send(&data); - client.shutdown().await; - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn chunks_dropped_send_failure_test() { + #[test] + fn chunks_dropped_send_failure_test() { let payload = 
Regex::new(r#""metric":"trace_chunks_dropped","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","reason:send_failure"\],"common":true,"type":"count"#).unwrap(); - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_dropped_send_failure: 1, - ..Default::default() - }; - - let client = get_test_client(&server.url("/")).await; + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + chunks_dropped_send_failure: 1, + ..Default::default() + }; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + let _ = client.send(&data); + handle.stop().await.expect("Failed to stop worker"); + while telemetry_srv.calls_async().await == 0 { + sleep(Duration::from_millis(10)).await; + } + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - let _ = client.send(&data); - client.shutdown().await; - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + 
shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn chunks_dropped_p0_test() { + #[test] + fn chunks_dropped_p0_test() { let payload = Regex::new(r#""metric":"trace_chunks_dropped","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","reason:p0_drop"\],"common":true,"type":"count"#).unwrap(); - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_dropped_p0: 1, - ..Default::default() - }; - - let client = get_test_client(&server.url("/")).await; + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + chunks_dropped_p0: 1, + ..Default::default() + }; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + let _ = client.send(&data); + handle.stop().await.expect("Failed to stop worker"); + while telemetry_srv.calls_async().await == 0 { + sleep(Duration::from_millis(10)).await; + } + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - let _ = client.send(&data); - client.shutdown().await; - 
while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn chunks_dropped_serialization_error_test() { + #[test] + fn chunks_dropped_serialization_error_test() { let payload = Regex::new(r#""metric":"trace_chunks_dropped","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","reason:serialization_error"\],"common":true,"type":"count"#).unwrap(); - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_dropped_serialization_error: 1, - ..Default::default() - }; - - let client = get_test_client(&server.url("/")).await; + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + chunks_dropped_serialization_error: 1, + ..Default::default() + }; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + let _ = client.send(&data); + handle.stop().await.expect("Failed to 
stop worker"); + while telemetry_srv.calls_async().await == 0 { + sleep(Duration::from_millis(10)).await; + } + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - let _ = client.send(&data); - client.shutdown().await; - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[test] @@ -718,25 +913,28 @@ mod tests { } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn telemetry_from_network_error_test() { - // Create an hyper error by calling an undefined service - let err = http_common::new_default_client() - .get(http::Uri::from_static("localhost:12345")) - .await - .unwrap_err(); - - let result = Err(SendWithRetryError::Network(http_common::into_error(err), 5)); - let telemetry = SendPayloadTelemetry::from_retry_result(&result, 1, 2, 0); - assert_eq!( - telemetry, - SendPayloadTelemetry { - chunks_dropped_send_failure: 2, - requests_count: 5, - errors_network: 1, - ..Default::default() - } - ) + #[test] + fn telemetry_from_network_error_test() { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + // Create an hyper error by calling an undefined service + let err = http_common::new_default_client() + .get(http::Uri::from_static("localhost:12345")) + .await + .unwrap_err(); + + let result = Err(SendWithRetryError::Network(http_common::into_error(err), 5)); + let telemetry = SendPayloadTelemetry::from_retry_result(&result, 1, 2, 0); + assert_eq!( + telemetry, + SendPayloadTelemetry { + chunks_dropped_send_failure: 2, + requests_count: 5, + errors_network: 1, + ..Default::default() + } + ) + }); } #[test] @@ -755,8 +953,8 @@ mod tests { } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn telemetry_from_build_error_test() { + #[test] + fn telemetry_from_build_error_test() { let result = Err(SendWithRetryError::Build(5)); let telemetry = 
SendPayloadTelemetry::from_retry_result(&result, 1, 2, 0); assert_eq!( @@ -799,88 +997,108 @@ mod tests { } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn runtime_id_test() { - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_includes(r#""runtime_id":"foo""#); - then.status(200).body(""); - }) - .await; - - let (client, mut worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_runtime_id("foo") - .build(Handle::current()); - tokio::spawn(async move { worker.run().await }); + #[test] + fn runtime_id_test() { + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_includes(r#""runtime_id":"foo""#); + then.status(200).body(""); + }) + .await; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_runtime_id("foo") + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + client + .send(&SendPayloadTelemetry { + requests_count: 1, + ..Default::default() + }) + .unwrap(); + handle.stop().await.expect("Failed to stop worker"); + while telemetry_srv.calls_async().await == 0 { + 
sleep(Duration::from_millis(10)).await; + } + // One payload generate-metrics + telemetry_srv.assert_calls_async(1).await; + }); - client.start().await; - client - .send(&SendPayloadTelemetry { - requests_count: 1, - ..Default::default() - }) - .unwrap(); - client.shutdown().await; - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } - // One payload generate-metrics - telemetry_srv.assert_calls_async(1).await; + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn application_metadata_test() { - let server = MockServer::start_async().await; - - let telemetry_srv = server - .mock_async(|when, then| { - when.method(POST) - .body_includes(r#""application":{"service_name":"test_service","service_version":"test_version","env":"test_env","language_name":"test_language","language_version":"test_language_version","tracer_version":"test_tracer_version"}"#); - then.status(200).body(""); - }) - .await; - - let (client, mut worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_runtime_id("foo") - .build(Handle::current()); - tokio::spawn(async move { worker.run().await }); - - client.start().await; - client - .send(&SendPayloadTelemetry { - requests_count: 1, - ..Default::default() - }) - .unwrap(); - client.shutdown().await; - // Wait for the server to receive at least one call, but don't hang forever. 
- let start = std::time::Instant::now(); - while telemetry_srv.calls_async().await == 0 { - if start.elapsed() > Duration::from_secs(180) { - panic!("telemetry server did not receive calls within timeout"); + #[test] + fn application_metadata_test() { + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + rt.block_on(async { + let server = MockServer::start_async().await; + let telemetry_srv = server + .mock_async(|when, then| { + when.method(POST) + .body_includes(r#""application":{"service_name":"test_service","service_version":"test_version","env":"test_env","language_name":"test_language","language_version":"test_language_version","tracer_version":"test_tracer_version"}"#); + then.status(200).body(""); + }) + .await; + + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(&server.url("/")) + .set_heartbeat(100) + .set_runtime_id("foo") + .build(rt.handle().clone()); + let handle = shared_runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + + client.start().await; + client + .send(&SendPayloadTelemetry { + requests_count: 1, + ..Default::default() + }) + .unwrap(); + handle.stop().await.expect("Failed to stop worker"); + // Wait for the server to receive at least one call, but don't hang forever. 
+ let start = std::time::Instant::now(); + while telemetry_srv.calls_async().await == 0 { + if start.elapsed() > Duration::from_secs(180) { + panic!("telemetry server did not receive calls within timeout"); + } + sleep(Duration::from_millis(10)).await; } - sleep(Duration::from_millis(10)).await; - } - // One payload generate-metrics - telemetry_srv.assert_calls_async(1).await; + // One payload generate-metrics + telemetry_srv.assert_calls_async(1).await; + }); + + rt.block_on(async { + shared_runtime.shutdown().await.expect("Failed to shutdown"); + }); } } From f86890a68ddca0ac09d777a63be6881485c6f1ed Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 2 Mar 2026 13:56:54 +0100 Subject: [PATCH 16/80] test(telemetry): use client builder --- libdd-data-pipeline/src/telemetry/mod.rs | 225 ++++------------------- 1 file changed, 31 insertions(+), 194 deletions(-) diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index 37d797eedc..7c785f9d35 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -309,7 +309,25 @@ mod tests { use tokio::time::sleep; use super::*; - use crate::shared_runtime::SharedRuntime; + use crate::shared_runtime::{SharedRuntime, WorkerHandle}; + + fn get_test_client(url: &str, runtime: &SharedRuntime) -> (TelemetryClient, WorkerHandle) { + let (client, worker) = TelemetryClientBuilder::default() + .set_service_name("test_service") + .set_service_version("test_version") + .set_env("test_env") + .set_language("test_language") + .set_language_version("test_language_version") + .set_tracer_version("test_tracer_version") + .set_url(url) + .set_heartbeat(100) + .set_debug_enabled(true) + .build(Handle::current()); + let handle = runtime + .spawn_worker(worker) + .expect("Failed to spawn worker"); + (client, handle) + } #[test] fn builder_test() { @@ -341,19 +359,6 @@ mod tests { ); } - #[cfg_attr(miri, ignore)] - #[tokio::test(flavor = "multi_thread")] - async fn 
spawn_test() { - let _ = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .build(Handle::current()); - } - #[cfg_attr(miri, ignore)] #[test] fn api_bytes_test() { @@ -375,21 +380,7 @@ mod tests { ..Default::default() }; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); handle.stop().await.expect("Failed to stop worker"); @@ -425,21 +416,7 @@ mod tests { ..Default::default() }; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); handle.stop().await.expect("Failed to stop worker"); @@ -475,21 +452,7 @@ mod tests { ..Default::default() }; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - 
.set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); handle.stop().await.expect("Failed to stop worker"); @@ -525,21 +488,7 @@ mod tests { ..Default::default() }; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); handle.stop().await.expect("Failed to stop worker"); @@ -575,21 +524,7 @@ mod tests { ..Default::default() }; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); handle.stop().await.expect("Failed to 
stop worker"); @@ -625,21 +560,7 @@ mod tests { ..Default::default() }; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); handle.stop().await.expect("Failed to stop worker"); @@ -675,21 +596,7 @@ mod tests { ..Default::default() }; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); handle.stop().await.expect("Failed to stop worker"); @@ -725,21 +632,7 @@ mod tests { ..Default::default() }; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn 
worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); handle.stop().await.expect("Failed to stop worker"); @@ -775,21 +668,7 @@ mod tests { ..Default::default() }; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); handle.stop().await.expect("Failed to stop worker"); @@ -825,21 +704,7 @@ mod tests { ..Default::default() }; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_debug_enabled(true) - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); handle.stop().await.expect("Failed to stop worker"); @@ -1011,21 +876,7 @@ mod tests { }) .await; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - 
.set_heartbeat(100) - .set_runtime_id("foo") - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; client .send(&SendPayloadTelemetry { @@ -1062,21 +913,7 @@ mod tests { }) .await; - let (client, worker) = TelemetryClientBuilder::default() - .set_service_name("test_service") - .set_service_version("test_version") - .set_env("test_env") - .set_language("test_language") - .set_language_version("test_language_version") - .set_tracer_version("test_tracer_version") - .set_url(&server.url("/")) - .set_heartbeat(100) - .set_runtime_id("foo") - .build(rt.handle().clone()); - let handle = shared_runtime - .spawn_worker(worker) - .expect("Failed to spawn worker"); - + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; client .send(&SendPayloadTelemetry { From 3d285ae69d703ffe2da7fc956274c81eda7ff49b Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 4 Mar 2026 18:33:16 +0100 Subject: [PATCH 17/80] chore(runtime): fix nit --- libdd-data-pipeline/src/agent_info/fetcher.rs | 5 ++ libdd-data-pipeline/src/pausable_worker.rs | 22 +++-- libdd-data-pipeline/src/shared_runtime.rs | 20 ++++- libdd-data-pipeline/src/stats_exporter.rs | 80 +++++++++-------- libdd-data-pipeline/src/telemetry/mod.rs | 89 ++++--------------- libdd-data-pipeline/src/trace_exporter/mod.rs | 10 ++- 6 files changed, 103 insertions(+), 123 deletions(-) diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index 6c1a2341f5..fc29799dc4 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -210,6 +210,11 @@ impl Worker for AgentInfoFetcher { } } + fn reset(&mut self) { + // Drain all messages from the channel to remove messages sent to release the reference on + self.drain(); + } + async fn 
run(&mut self) { self.fetch_and_update().await; } diff --git a/libdd-data-pipeline/src/pausable_worker.rs b/libdd-data-pipeline/src/pausable_worker.rs index 50d478869d..1e0d378f0d 100644 --- a/libdd-data-pipeline/src/pausable_worker.rs +++ b/libdd-data-pipeline/src/pausable_worker.rs @@ -125,20 +125,30 @@ impl PausableWorker { } } - /// Wait for a requested pause to complete and store the worker state. + /// Pause the worker and wait for it to complete, storing its state for restart. + /// + /// This method will cancel the worker's cancellation token if it hasn't been cancelled yet, + /// then wait for the worker to finish and store its state. Calling [`Self::request_pause`] + /// before this method is optional - it's only needed when shutting down multiple workers + /// simultaneously to allow them to pause concurrently before waiting for all of them. /// /// # Errors /// Fails if the worker handle has been aborted preventing the worker from being retrieved. pub async fn join(&mut self) -> Result<(), PausableWorkerError> { match self { PausableWorker::Running { .. } => { - let PausableWorker::Running { handle, .. } = + let PausableWorker::Running { handle, stop_token } = std::mem::replace(self, PausableWorker::InvalidState) else { // Unreachable return Ok(()); }; + // Cancel the token if it hasn't been cancelled yet to avoid deadlock + if !stop_token.is_cancelled() { + stop_token.cancel(); + } + if let Ok(worker) = handle.await { *self = PausableWorker::Paused { worker }; Ok(()) @@ -153,12 +163,6 @@ impl PausableWorker { } } - /// Pause the worker saving it's state to be restarted. - pub async fn pause(&mut self) -> Result<(), PausableWorkerError> { - self.request_pause()?; - self.join().await - } - /// Reset the worker state (e.g. in a fork child). 
pub fn reset(&mut self) { if let PausableWorker::Paused { worker } = self { @@ -214,7 +218,7 @@ mod tests { pausable_worker.start(&runtime).unwrap(); assert_eq!(receiver.recv().unwrap(), 0); - runtime.block_on(async { pausable_worker.pause().await.unwrap() }); + runtime.block_on(async { pausable_worker.join().await.unwrap() }); // Empty the message queue and get the last message let mut next_message = 1; for message in receiver.try_iter() { diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index f8b023a286..f7bdffa522 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -73,7 +73,7 @@ impl WorkerHandle { let WorkerEntry { worker, .. } = workers_lock.swap_remove(position); worker }; - worker.pause().await?; + worker.join().await?; worker.shutdown().await; Ok(()) } @@ -318,13 +318,27 @@ impl SharedRuntime { let mut join_set = JoinSet::new(); for mut worker_entry in workers { join_set.spawn(async move { - worker_entry.worker.pause().await?; + worker_entry.worker.join().await?; worker_entry.worker.shutdown().await; Ok::<(), PausableWorkerError>(()) }); } - join_set.join_all().await; + let mut results = Vec::new(); + while let Some(result) = join_set.join_next().await { + // Unwrap the JoinHandle result (panic if task panicked) + results.push(result.expect("Worker task panicked")); + } + + // Collect all errors + let errors: Vec = results + .into_iter() + .filter_map(|r| Some(r.err()?.into())) + .collect(); + + if !errors.is_empty() { + return Err(errors); + } Ok(()) } } diff --git a/libdd-data-pipeline/src/stats_exporter.rs b/libdd-data-pipeline/src/stats_exporter.rs index 0205752f7d..6f1a4b51f7 100644 --- a/libdd-data-pipeline/src/stats_exporter.rs +++ b/libdd-data-pipeline/src/stats_exporter.rs @@ -335,42 +335,46 @@ mod tests { ); } - // #[cfg_attr(miri, ignore)] - // #[tokio::test] - // async fn test_cancellation_token() { - // let server = 
MockServer::start_async().await; - // - // let mut mock = server - // .mock_async(|when, then| { - // when.method(POST) - // .header("Content-type", "application/msgpack") - // .path("/v0.6/stats") - // .body_includes("libdatadog-test"); - // then.status(200).body(""); - // }) - // .await; - // - // let buckets_duration = Duration::from_secs(10); - // let cancellation_token = CancellationToken::new(); - // - // let mut stats_exporter = StatsExporter::new( - // buckets_duration, - // Arc::new(Mutex::new(get_test_concentrator())), - // get_test_metadata(), - // Endpoint::from_url(stats_url_from_agent_url(&server.url("/")).unwrap()), - // cancellation_token.clone(), - // new_default_client(), - // ); - // - // tokio::spawn(async move { - // stats_exporter.run().await; - // }); - // // Cancel token to trigger force flush - // cancellation_token.cancel(); - // - // assert!( - // poll_for_mock_hit(&mut mock, 10, 100, 1, false).await, - // "Expected max retry attempts" - // ); - // } + #[cfg_attr(miri, ignore)] + #[test] + fn test_worker_shutdown() { + use crate::shared_runtime::SharedRuntime; + + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let rt = shared_runtime.runtime().expect("Failed to get runtime"); + + let server = MockServer::start(); + + let mut mock = server.mock(|when, then| { + when.method(POST) + .header("Content-type", "application/msgpack") + .path("/v0.6/stats") + .body_includes("libdatadog-test"); + then.status(200).body(""); + }); + + let buckets_duration = Duration::from_secs(10); + + let stats_exporter = StatsExporter::new( + buckets_duration, + Arc::new(Mutex::new(get_test_concentrator())), + get_test_metadata(), + Endpoint::from_url(stats_url_from_agent_url(&server.url("/")).unwrap()), + new_default_client(), + ); + + let handle = shared_runtime + .spawn_worker(stats_exporter) + .expect("Failed to spawn worker"); + + // Stop the worker to trigger force flush + rt.block_on(async { + 
handle.stop().await.expect("Failed to stop worker"); + }); + + assert!( + rt.block_on(poll_for_mock_hit(&mut mock, 10, 100, 1, false)), + "Expected max retry attempts" + ); + } } diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index 7c785f9d35..374ba45963 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -389,10 +389,6 @@ mod tests { } telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } #[cfg_attr(miri, ignore)] @@ -425,10 +421,6 @@ mod tests { } telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } #[cfg_attr(miri, ignore)] @@ -461,10 +453,6 @@ mod tests { } telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } #[cfg_attr(miri, ignore)] @@ -497,10 +485,6 @@ mod tests { } telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } #[cfg_attr(miri, ignore)] @@ -533,10 +517,6 @@ mod tests { } telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } #[cfg_attr(miri, ignore)] @@ -569,10 +549,6 @@ mod tests { } telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } #[cfg_attr(miri, ignore)] @@ -605,10 +581,6 @@ mod tests { } telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } #[cfg_attr(miri, ignore)] @@ -641,10 +613,6 @@ mod tests { } telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to 
shutdown"); - }); } #[cfg_attr(miri, ignore)] @@ -677,10 +645,6 @@ mod tests { } telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } #[cfg_attr(miri, ignore)] @@ -713,10 +677,6 @@ mod tests { } telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } #[test] @@ -778,28 +738,25 @@ mod tests { } #[cfg_attr(miri, ignore)] - #[test] - fn telemetry_from_network_error_test() { - let rt = tokio::runtime::Runtime::new().unwrap(); - rt.block_on(async { - // Create an hyper error by calling an undefined service - let err = http_common::new_default_client() - .get(http::Uri::from_static("localhost:12345")) - .await - .unwrap_err(); - - let result = Err(SendWithRetryError::Network(http_common::into_error(err), 5)); - let telemetry = SendPayloadTelemetry::from_retry_result(&result, 1, 2, 0); - assert_eq!( - telemetry, - SendPayloadTelemetry { - chunks_dropped_send_failure: 2, - requests_count: 5, - errors_network: 1, - ..Default::default() - } - ) - }); + #[tokio::test] + async fn telemetry_from_network_error_test() { + // Create an hyper error by calling an undefined service + let err = http_common::new_default_client() + .get(http::Uri::from_static("localhost:12345")) + .await + .unwrap_err(); + + let result = Err(SendWithRetryError::Network(http_common::into_error(err), 5)); + let telemetry = SendPayloadTelemetry::from_retry_result(&result, 1, 2, 0); + assert_eq!( + telemetry, + SendPayloadTelemetry { + chunks_dropped_send_failure: 2, + requests_count: 5, + errors_network: 1, + ..Default::default() + } + ) } #[test] @@ -891,10 +848,6 @@ mod tests { // One payload generate-metrics telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } #[cfg_attr(miri, ignore)] @@ -933,9 +886,5 @@ mod tests { // One payload 
generate-metrics telemetry_srv.assert_calls_async(1).await; }); - - rt.block_on(async { - shared_runtime.shutdown().await.expect("Failed to shutdown"); - }); } } diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index dfd7b932c8..6abb410164 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -31,8 +31,7 @@ use http::uri::PathAndQuery; use http::Uri; use http_body_util::BodyExt; use libdd_common::tag::Tag; -use libdd_common::HttpClient; -use libdd_common::{http_common, Endpoint}; +use libdd_common::{http_common, Endpoint, HttpClient}; use libdd_dogstatsd_client::Client; use libdd_trace_utils::msgpack_decoder; use libdd_trace_utils::send_with_retry::{ @@ -181,7 +180,6 @@ pub struct TraceExporter { metadata: TracerMetadata, input_format: TraceExporterInputFormat, output_format: TraceExporterOutputFormat, - // TODO - do something with the response callback - https://datadoghq.atlassian.net/browse/APMSP-1019 shared_runtime: SharedRuntime, /// None if dogstatsd is disabled dogstatsd: Option, @@ -222,6 +220,12 @@ impl TraceExporter { if let Err(errors) = errors { error!("Some workers failed to stop: {errors:?}"); } + + // When the info fetcher is paused, the trigger channel keeps a reference to the runtime's + // IoStack through the waker. This prevents the IoStack from being dropped when shutting + // down the runtime. By manually sending a message to the trigger channel we trigger the + // waker releasing the reference to the IoStack. Finally we drain the channel to + // avoid triggering a fetch when the info fetcher is restarted. 
self.info_response_observer.manual_trigger(); } From 1f8c0cf1ebfc3e7f4b05c2a92a035f8700116c27 Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 5 Mar 2026 15:09:59 +0100 Subject: [PATCH 18/80] refactor(runtime): move shutdown to runtime --- libdd-data-pipeline/src/shared_runtime.rs | 68 ++++++++++++------- libdd-data-pipeline/src/trace_exporter/mod.rs | 39 ++++------- 2 files changed, 58 insertions(+), 49 deletions(-) diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index f7bdffa522..7738142746 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -90,6 +90,8 @@ pub enum SharedRuntimeError { WorkerError(PausableWorkerError), /// Failed to create the tokio runtime. RuntimeCreation(io::Error), + /// Shutdown timed out. + ShutdownTimedOut(std::time::Duration), } impl fmt::Display for SharedRuntimeError { @@ -103,6 +105,9 @@ impl fmt::Display for SharedRuntimeError { Self::RuntimeCreation(err) => { write!(f, "Failed to create runtime: {}", err) } + Self::ShutdownTimedOut(duration) => { + write!(f, "Shutdown timed out after {:?}", duration) + } } } } @@ -302,14 +307,43 @@ impl SharedRuntime { } } - /// Shutdown the runtime and all workers. + /// Shutdown the runtime and all workers synchronously with optional timeout. + /// + /// This creates a temporary runtime to execute the async shutdown and should be called + /// from non-async contexts during application shutdown. + /// + /// Worker errors are logged but do not cause the function to fail. + /// + /// # Errors + /// Returns an error only if shutdown times out or runtime creation fails. 
+ pub fn shutdown(self, timeout: Option) -> Result<(), SharedRuntimeError> { + let runtime = self.runtime()?; + + if let Some(timeout) = timeout { + match runtime + .block_on(async { tokio::time::timeout(timeout, self.shutdown_async()).await }) + { + Ok(()) => Ok(()), + Err(_) => Err(SharedRuntimeError::ShutdownTimedOut(timeout)), + } + } else { + runtime.block_on(self.shutdown_async()); + Ok(()) + } + } + + /// Shutdown all workers asynchronously. /// /// This should be called during application shutdown to cleanly stop all /// background workers and the runtime. /// - /// # Errors - /// Returns an error if workers cannot be stopped. - pub async fn shutdown(&self) -> Result<(), SharedRuntimeError> { + /// Worker errors are logged but do not cause the function to fail. + /// + /// This function should not take ownership of the SharedRuntime as it will cause the runtime + /// to be dropped in a non-blocking context causing a panic. + pub async fn shutdown_async(&self) { + use tracing::error; + let workers = { let mut workers_lock = self.workers.lock_or_panic(); std::mem::take(&mut *workers_lock) @@ -318,28 +352,16 @@ impl SharedRuntime { let mut join_set = JoinSet::new(); for mut worker_entry in workers { join_set.spawn(async move { - worker_entry.worker.join().await?; + let result = worker_entry.worker.join().await; + if let Err(e) = result { + error!("Worker failed to shutdown: {:?}", e); + return; + } worker_entry.worker.shutdown().await; - Ok::<(), PausableWorkerError>(()) }); } - let mut results = Vec::new(); - while let Some(result) = join_set.join_next().await { - // Unwrap the JoinHandle result (panic if task panicked) - results.push(result.expect("Worker task panicked")); - } - - // Collect all errors - let errors: Vec = results - .into_iter() - .filter_map(|r| Some(r.err()?.into())) - .collect(); - - if !errors.is_empty() { - return Err(errors); - } - Ok(()) + join_set.join_all().await; } } @@ -418,7 +440,7 @@ mod tests { // Clean shutdown 
rt.block_on(async { - assert!(shared_runtime.shutdown().await.is_ok()); + shared_runtime.shutdown_async().await; }); }); diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 6abb410164..d0ee3c971f 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -256,32 +256,19 @@ impl TraceExporter { } /// Safely shutdown the TraceExporter and all related tasks - pub fn shutdown(mut self, timeout: Option) -> Result<(), TraceExporterError> { - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?; - - if let Some(timeout) = timeout { - match runtime - .block_on(async { tokio::time::timeout(timeout, self.shutdown_async()).await }) - { - Ok(()) => Ok(()), - Err(_e) => Err(TraceExporterError::Shutdown( - error::ShutdownError::TimedOut(timeout), - )), - } - } else { - runtime.block_on(self.shutdown_async()); - Ok(()) - } - } - - /// Future used inside `Self::shutdown`. - /// - /// This function should not take ownership of the trace exporter as it will cause the runtime - /// stored in the trace exporter to be dropped in a non-blocking context causing a panic. 
- async fn shutdown_async(&mut self) { - let _ = self.shared_runtime.shutdown().await; + pub fn shutdown(self, timeout: Option) -> Result<(), TraceExporterError> { + self.shared_runtime + .shutdown(timeout) + .map_err(|e| match e { + crate::shared_runtime::SharedRuntimeError::ShutdownTimedOut(duration) => { + TraceExporterError::Shutdown(error::ShutdownError::TimedOut(duration)) + } + crate::shared_runtime::SharedRuntimeError::RuntimeCreation(io_err) => { + TraceExporterError::Io(io_err) + } + // Other error cases should not occur from shutdown() + _ => unreachable!("Unexpected SharedRuntimeError from shutdown: {:?}", e), + }) } /// Check if agent info state has changed From 62a63d686b5866e8123a875b6768b20d69c824b9 Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 5 Mar 2026 15:12:47 +0100 Subject: [PATCH 19/80] test(telemetry): Add sleep after send With the refactor the stop action is processed right away without processing all actions sent before it. This can cause some message to be lost if they were submitted right before shutdown. 
--- libdd-data-pipeline/src/telemetry/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index 374ba45963..1331d37be2 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -607,6 +607,9 @@ mod tests { let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(10)).await; + handle.stop().await.expect("Failed to stop worker"); while telemetry_srv.calls_async().await == 0 { sleep(Duration::from_millis(10)).await; From c1270f1694b20827083ddbebca6d0100786cb35a Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 6 Mar 2026 17:32:06 +0100 Subject: [PATCH 20/80] test(telemetry): fix deadlocks in telemetry tests --- libdd-data-pipeline/src/telemetry/mod.rs | 100 +++++++++++++---------- 1 file changed, 59 insertions(+), 41 deletions(-) diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index 1331d37be2..e22f2e82da 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -319,6 +319,7 @@ mod tests { .set_language("test_language") .set_language_version("test_language_version") .set_tracer_version("test_tracer_version") + .set_runtime_id("foo") .set_url(url) .set_heartbeat(100) .set_debug_enabled(true) @@ -329,6 +330,18 @@ mod tests { (client, handle) } + macro_rules! 
wait_for_telemetry_call { + ($telemetry_srv:expr) => {{ + let start = std::time::Instant::now(); + while $telemetry_srv.calls_async().await == 0 { + if start.elapsed() > Duration::from_secs(10) { + panic!("telemetry server did not receive calls within timeout"); + } + sleep(Duration::from_millis(10)).await; + } + }}; + } + #[test] fn builder_test() { let builder = TelemetryClientBuilder::default() @@ -383,10 +396,11 @@ mod tests { let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; + handle.stop().await.expect("Failed to stop worker"); - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); telemetry_srv.assert_calls_async(1).await; }); } @@ -415,10 +429,11 @@ mod tests { let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; + handle.stop().await.expect("Failed to stop worker"); - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); telemetry_srv.assert_calls_async(1).await; }); } @@ -447,10 +462,11 @@ mod tests { let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; + handle.stop().await.expect("Failed to stop worker"); - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); telemetry_srv.assert_calls_async(1).await; }); } @@ -479,10 +495,11 @@ mod tests { let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); + // 
Wait for send to be processed + sleep(Duration::from_millis(1)).await; + handle.stop().await.expect("Failed to stop worker"); - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); telemetry_srv.assert_calls_async(1).await; }); } @@ -511,10 +528,11 @@ mod tests { let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; + handle.stop().await.expect("Failed to stop worker"); - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); telemetry_srv.assert_calls_async(1).await; }); } @@ -543,10 +561,11 @@ mod tests { let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; + handle.stop().await.expect("Failed to stop worker"); - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); telemetry_srv.assert_calls_async(1).await; }); } @@ -575,10 +594,11 @@ mod tests { let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; + handle.stop().await.expect("Failed to stop worker"); - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); telemetry_srv.assert_calls_async(1).await; }); } @@ -608,12 +628,10 @@ mod tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - sleep(Duration::from_millis(10)).await; + sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to stop worker"); - 
while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); telemetry_srv.assert_calls_async(1).await; }); } @@ -642,10 +660,11 @@ mod tests { let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; + handle.stop().await.expect("Failed to stop worker"); - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); telemetry_srv.assert_calls_async(1).await; }); } @@ -674,10 +693,11 @@ mod tests { let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; + handle.stop().await.expect("Failed to stop worker"); - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); telemetry_srv.assert_calls_async(1).await; }); } @@ -844,10 +864,11 @@ mod tests { ..Default::default() }) .unwrap(); + // Wait for send to be processed + sleep(Duration::from_millis(10)).await; + handle.stop().await.expect("Failed to stop worker"); - while telemetry_srv.calls_async().await == 0 { - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); // One payload generate-metrics telemetry_srv.assert_calls_async(1).await; }); @@ -877,15 +898,12 @@ mod tests { ..Default::default() }) .unwrap(); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; + handle.stop().await.expect("Failed to stop worker"); // Wait for the server to receive at least one call, but don't hang forever. 
- let start = std::time::Instant::now(); - while telemetry_srv.calls_async().await == 0 { - if start.elapsed() > Duration::from_secs(180) { - panic!("telemetry server did not receive calls within timeout"); - } - sleep(Duration::from_millis(10)).await; - } + wait_for_telemetry_call!(telemetry_srv); // One payload generate-metrics telemetry_srv.assert_calls_async(1).await; }); From 608556e2b2e7ae740372bba581b0ba3e33562fb8 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 9 Mar 2026 11:47:24 +0100 Subject: [PATCH 21/80] refactor(runtime): skip shutdown when runtime is None --- libdd-data-pipeline/src/shared_runtime.rs | 33 ++++++++++++----------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index 7738142746..073b5516ca 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -309,26 +309,27 @@ impl SharedRuntime { /// Shutdown the runtime and all workers synchronously with optional timeout. /// - /// This creates a temporary runtime to execute the async shutdown and should be called - /// from non-async contexts during application shutdown. - /// /// Worker errors are logged but do not cause the function to fail. /// /// # Errors - /// Returns an error only if shutdown times out or runtime creation fails. - pub fn shutdown(self, timeout: Option) -> Result<(), SharedRuntimeError> { - let runtime = self.runtime()?; - - if let Some(timeout) = timeout { - match runtime - .block_on(async { tokio::time::timeout(timeout, self.shutdown_async()).await }) - { - Ok(()) => Ok(()), - Err(_) => Err(SharedRuntimeError::ShutdownTimedOut(timeout)), + /// Returns an error only if shutdown times out. 
+ pub fn shutdown(&self, timeout: Option) -> Result<(), SharedRuntimeError> { + match self.runtime.lock_or_panic().take() { + Some(runtime) => { + let result = if let Some(timeout) = timeout { + match runtime.block_on(async { + tokio::time::timeout(timeout, self.shutdown_async()).await + }) { + Ok(()) => Ok(()), + Err(_) => Err(SharedRuntimeError::ShutdownTimedOut(timeout)), + } + } else { + runtime.block_on(self.shutdown_async()); + Ok(()) + }; + result } - } else { - runtime.block_on(self.shutdown_async()); - Ok(()) + None => Ok(()), // The runtime is not running so there's nothing to shutdown } } From 34ec7d92eca6c61be2d2fc906adfdc08519a027b Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 9 Mar 2026 14:12:24 +0100 Subject: [PATCH 22/80] feat(runtime): add runtime to builder --- .../examples/send-traces-with-stats.rs | 15 ++- libdd-data-pipeline/src/agent_info/fetcher.rs | 124 +++++++++--------- libdd-data-pipeline/src/stats_exporter.rs | 10 +- .../src/trace_exporter/builder.rs | 34 ++++- libdd-data-pipeline/src/trace_exporter/mod.rs | 31 ++--- 5 files changed, 116 insertions(+), 98 deletions(-) diff --git a/libdd-data-pipeline/examples/send-traces-with-stats.rs b/libdd-data-pipeline/examples/send-traces-with-stats.rs index e5a2180754..7542826a7e 100644 --- a/libdd-data-pipeline/examples/send-traces-with-stats.rs +++ b/libdd-data-pipeline/examples/send-traces-with-stats.rs @@ -2,8 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 use clap::Parser; -use libdd_data_pipeline::trace_exporter::{ - TelemetryConfig, TraceExporter, TraceExporterInputFormat, TraceExporterOutputFormat, +use libdd_data_pipeline::{ + shared_runtime::SharedRuntime, + trace_exporter::{ + TelemetryConfig, TraceExporter, TraceExporterInputFormat, TraceExporterOutputFormat, + }, }; use libdd_log::logger::{ logger_configure_std, logger_set_log_level, LogEventLevel, StdConfig, StdTarget, @@ -11,6 +14,7 @@ use libdd_log::logger::{ use libdd_trace_protobuf::pb; use std::{ collections::HashMap, + 
sync::Arc, time::{Duration, UNIX_EPOCH}, }; @@ -53,6 +57,8 @@ fn main() { .expect("Failed to configure logger"); logger_set_log_level(LogEventLevel::Debug).expect("Failed to set log level"); + let shared_runtime = Arc::new(SharedRuntime::new().expect("Failed to create runtime")); + let args = Args::parse(); let telemetry_cfg = TelemetryConfig::default(); let mut builder = TraceExporter::builder(); @@ -67,6 +73,7 @@ fn main() { .set_language_version(env!("CARGO_PKG_RUST_VERSION")) .set_input_format(TraceExporterInputFormat::V04) .set_output_format(TraceExporterOutputFormat::V04) + .set_shared_runtime(shared_runtime.clone()) .enable_telemetry(telemetry_cfg) .enable_stats(Duration::from_secs(10)); let exporter = builder.build().expect("Failed to build TraceExporter"); @@ -86,7 +93,7 @@ fn main() { let data = rmp_serde::to_vec_named(&traces).expect("Failed to serialize traces"); exporter.send(data.as_ref()).expect("Failed to send traces"); - exporter + shared_runtime .shutdown(None) - .expect("Failed to shutdown exporter"); + .expect("Failed to shutdown runtime"); } diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index fc29799dc4..944c83fb52 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -292,6 +292,7 @@ impl ResponseObserver { mod single_threaded_tests { use super::*; use crate::agent_info; + use crate::shared_runtime::SharedRuntime; use httpmock::prelude::*; const TEST_INFO: &str = r#"{ @@ -433,29 +434,31 @@ mod single_threaded_tests { } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn test_agent_info_fetcher_run() { + #[test] + fn test_agent_info_fetcher_run() { AGENT_INFO_CACHE.store(None); let server = MockServer::start(); - let mock_v1 = server - .mock_async(|when, then| { - when.path("/info"); - then.status(200) - .header("content-type", "application/json") - .body(r#"{"version":"1"}"#); - }) - .await; + let mut mock_v1 = 
server.mock(|when, then| { + when.path("/info"); + then.status(200) + .header("content-type", "application/json") + .body(r#"{"version":"1"}"#); + }); let endpoint = Endpoint::from_url(server.url("/info").parse().unwrap()); - let (mut fetcher, _response_observer) = + let (fetcher, _response_observer) = AgentInfoFetcher::new(endpoint.clone(), Duration::from_millis(100)); assert!(agent_info::get_agent_info().is_none()); - tokio::spawn(async move { - fetcher.run().await; - }); + let shared_runtime = SharedRuntime::new().unwrap(); + shared_runtime.spawn_worker(fetcher).unwrap(); // Wait until the info is fetched + let start = std::time::Instant::now(); while agent_info::get_agent_info().is_none() { - tokio::time::sleep(Duration::from_millis(100)).await; + assert!( + start.elapsed() <= Duration::from_secs(10), + "Timeout waiting for first /info fetch" + ); + std::thread::sleep(Duration::from_millis(100)); } let version_1 = agent_info::get_agent_info() @@ -466,22 +469,25 @@ mod single_threaded_tests { .clone() .unwrap(); assert_eq!(version_1, "1"); - mock_v1.assert_async().await; + mock_v1.assert(); // Update the info endpoint - mock_v1.delete_async().await; - let mock_v2 = server - .mock_async(|when, then| { - when.path("/info"); - then.status(200) - .header("content-type", "application/json") - .body(r#"{"version":"2"}"#); - }) - .await; + mock_v1.delete(); + let mock_v2 = server.mock(|when, then| { + when.path("/info"); + then.status(200) + .header("content-type", "application/json") + .body(r#"{"version":"2"}"#); + }); // Wait for second fetch - while mock_v2.calls_async().await == 0 { - tokio::time::sleep(Duration::from_millis(100)).await; + let start = std::time::Instant::now(); + while mock_v2.calls() == 0 { + assert!( + start.elapsed() <= Duration::from_secs(10), + "Timeout waiting for second /info fetch" + ); + std::thread::sleep(Duration::from_millis(100)); } // This check is not 100% deterministic, but between the time the mock returns the response @@ -500,22 
+506,20 @@ mod single_threaded_tests { assert_eq!(version_2, "2"); break; } - tokio::time::sleep(Duration::from_millis(100)).await; + std::thread::sleep(Duration::from_millis(100)); } } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn test_agent_info_trigger_different_state() { + #[test] + fn test_agent_info_trigger_different_state() { let server = MockServer::start(); - let mock = server - .mock_async(|when, then| { - when.path("/info"); - then.status(200) - .header("content-type", "application/json") - .body(r#"{"version":"triggered"}"#); - }) - .await; + let mock = server.mock(|when, then| { + when.path("/info"); + then.status(200) + .header("content-type", "application/json") + .body(r#"{"version":"triggered"}"#); + }); // Populate the cache with initial state AGENT_INFO_CACHE.store(Some(Arc::new(AgentInfo { @@ -524,13 +528,12 @@ mod single_threaded_tests { }))); let endpoint = Endpoint::from_url(server.url("/info").parse().unwrap()); - let (mut fetcher, response_observer) = + let (fetcher, response_observer) = // Interval is too long to fetch during the test AgentInfoFetcher::new(endpoint, Duration::from_secs(3600)); - tokio::spawn(async move { - fetcher.run().await; - }); + let shared_runtime = SharedRuntime::new().unwrap(); + shared_runtime.spawn_worker(fetcher).unwrap(); // Create a mock HTTP response with the new agent state let response = http_common::empty_response( @@ -548,13 +551,13 @@ mod single_threaded_tests { const SLEEP_DURATION_MS: u64 = 10; let mut attempts = 0; - while mock.calls_async().await == 0 && attempts < MAX_ATTEMPTS { + while mock.calls() == 0 && attempts < MAX_ATTEMPTS { attempts += 1; - tokio::time::sleep(Duration::from_millis(SLEEP_DURATION_MS)).await; + std::thread::sleep(Duration::from_millis(SLEEP_DURATION_MS)); } // Should trigger a fetch since the state is different - mock.assert_calls_async(1).await; + mock.assert_calls(1); // Wait for the cache to be updated with proper timeout let mut attempts = 0; @@ -568,7 +571,7 @@ 
mod single_threaded_tests { } } attempts += 1; - tokio::time::sleep(Duration::from_millis(SLEEP_DURATION_MS)).await; + std::thread::sleep(Duration::from_millis(SLEEP_DURATION_MS)); } // Verify the cache was updated @@ -588,17 +591,15 @@ mod single_threaded_tests { } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn test_agent_info_trigger_same_state() { + #[test] + fn test_agent_info_trigger_same_state() { let server = MockServer::start(); - let mock = server - .mock_async(|when, then| { - when.path("/info"); - then.status(200) - .header("content-type", "application/json") - .body(r#"{"version":"same"}"#); - }) - .await; + let mock = server.mock(|when, then| { + when.path("/info"); + then.status(200) + .header("content-type", "application/json") + .body(r#"{"version":"same"}"#); + }); let same_json = r#"{"version":"same"}"#; let same_hash = calculate_hash(same_json); @@ -610,12 +611,11 @@ mod single_threaded_tests { }))); let endpoint = Endpoint::from_url(server.url("/info").parse().unwrap()); - let (mut fetcher, response_observer) = + let (fetcher, response_observer) = AgentInfoFetcher::new(endpoint, Duration::from_secs(3600)); // Very long interval - tokio::spawn(async move { - fetcher.run().await; - }); + let shared_runtime = SharedRuntime::new().unwrap(); + shared_runtime.spawn_worker(fetcher).unwrap(); // Create a mock HTTP response with the same agent state let response = http_common::empty_response( @@ -629,9 +629,9 @@ mod single_threaded_tests { response_observer.check_response(&response); // Wait to ensure no fetch occurs - tokio::time::sleep(Duration::from_millis(500)).await; + std::thread::sleep(Duration::from_millis(500)); // Should not trigger a fetch since the state is the same - mock.assert_calls_async(0).await; + mock.assert_calls(0); } } diff --git a/libdd-data-pipeline/src/stats_exporter.rs b/libdd-data-pipeline/src/stats_exporter.rs index 6f1a4b51f7..4dc67e2f01 100644 --- a/libdd-data-pipeline/src/stats_exporter.rs +++ 
b/libdd-data-pipeline/src/stats_exporter.rs @@ -181,6 +181,7 @@ pub fn stats_url_from_agent_url(agent_url: &str) -> anyhow::Result { #[cfg(test)] mod tests { use super::*; + use crate::shared_runtime::SharedRuntime; use httpmock::prelude::*; use httpmock::MockServer; use libdd_common::http_common::new_default_client; @@ -338,8 +339,6 @@ mod tests { #[cfg_attr(miri, ignore)] #[test] fn test_worker_shutdown() { - use crate::shared_runtime::SharedRuntime; - let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); let rt = shared_runtime.runtime().expect("Failed to get runtime"); @@ -363,14 +362,11 @@ mod tests { new_default_client(), ); - let handle = shared_runtime + let _handle = shared_runtime .spawn_worker(stats_exporter) .expect("Failed to spawn worker"); - // Stop the worker to trigger force flush - rt.block_on(async { - handle.stop().await.expect("Failed to stop worker"); - }); + shared_runtime.shutdown(None).unwrap(); assert!( rt.block_on(poll_for_mock_hit(&mut mock, 10, 100, 1, false)), diff --git a/libdd-data-pipeline/src/trace_exporter/builder.rs b/libdd-data-pipeline/src/trace_exporter/builder.rs index 69ee14ced1..7ac135624e 100644 --- a/libdd-data-pipeline/src/trace_exporter/builder.rs +++ b/libdd-data-pipeline/src/trace_exporter/builder.rs @@ -14,6 +14,7 @@ use arc_swap::ArcSwap; use libdd_common::http_common::new_default_client; use libdd_common::{parse_uri, tag, Endpoint}; use libdd_dogstatsd_client::new; +use std::sync::Arc; use std::time::Duration; const DEFAULT_AGENT_URL: &str = "http://127.0.0.1:8126"; @@ -44,6 +45,7 @@ pub struct TraceExporterBuilder { compute_stats_by_span_kind: bool, peer_tags: Vec, telemetry: Option, + shared_runtime: Option>, health_metrics_enabled: bool, test_session_token: Option, agent_rates_payload_version_enabled: bool, @@ -197,6 +199,12 @@ impl TraceExporterBuilder { self } + /// Set a shared runtime used by the exporter for background workers. 
+ pub fn set_shared_runtime(&mut self, shared_runtime: Arc) -> &mut Self { + self.shared_runtime = Some(shared_runtime); + self + } + /// Enables health metrics emission. pub fn enable_health_metrics(&mut self) -> &mut Self { self.health_metrics_enabled = true; @@ -225,9 +233,13 @@ impl TraceExporterBuilder { )); } - let shared_runtime = SharedRuntime::new().map_err(|e| { - TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration(e.to_string())) - })?; + let shared_runtime = + self.shared_runtime + .unwrap_or(Arc::new(SharedRuntime::new().map_err(|e| { + TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration( + e.to_string(), + )) + })?)); let dogstatsd = self.dogstatsd_url.and_then(|u| { new(Endpoint::from_slice(&u)).ok() // If we couldn't set the endpoint return @@ -420,6 +432,22 @@ mod tests { assert_eq!(exporter.metadata.language_interpreter, ""); assert!(!exporter.metadata.client_computed_stats); assert!(exporter.telemetry.is_none()); + assert!( + exporter.shared_runtime.runtime().is_ok(), + "default shared runtime should be initialized" + ); + } + + #[cfg_attr(miri, ignore)] + #[test] + fn test_set_shared_runtime() { + let mut builder = TraceExporterBuilder::default(); + let shared_runtime = Arc::new(SharedRuntime::new().unwrap()); + builder.set_shared_runtime(shared_runtime.clone()); + + let exporter = builder.build().unwrap(); + + assert!(Arc::ptr_eq(&exporter.shared_runtime, &shared_runtime)); } #[test] diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index d0ee3c971f..10208c4638 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -41,7 +41,6 @@ use libdd_trace_utils::span::{v04::Span, TraceData}; use libdd_trace_utils::trace_utils::TracerHeaderTags; use std::io; use std::sync::Arc; -use std::time::Duration; use std::{borrow::Borrow, collections::HashMap, str::FromStr}; use tokio::runtime::Runtime; use 
tracing::{debug, error, warn}; @@ -180,7 +179,7 @@ pub struct TraceExporter { metadata: TracerMetadata, input_format: TraceExporterInputFormat, output_format: TraceExporterOutputFormat, - shared_runtime: SharedRuntime, + shared_runtime: Arc, /// None if dogstatsd is disabled dogstatsd: Option, common_stats_tags: Vec, @@ -255,22 +254,6 @@ impl TraceExporter { Ok(res) } - /// Safely shutdown the TraceExporter and all related tasks - pub fn shutdown(self, timeout: Option) -> Result<(), TraceExporterError> { - self.shared_runtime - .shutdown(timeout) - .map_err(|e| match e { - crate::shared_runtime::SharedRuntimeError::ShutdownTimedOut(duration) => { - TraceExporterError::Shutdown(error::ShutdownError::TimedOut(duration)) - } - crate::shared_runtime::SharedRuntimeError::RuntimeCreation(io_err) => { - TraceExporterError::Io(io_err) - } - // Other error cases should not occur from shutdown() - _ => unreachable!("Unexpected SharedRuntimeError from shutdown: {:?}", e), - }) - } - /// Check if agent info state has changed fn has_agent_info_state_changed(&self, agent_info: &Arc) -> bool { Some(agent_info.state_hash.as_str()) @@ -1693,8 +1676,6 @@ mod tests { let _ = exporter.send(data.as_ref()).unwrap(); - exporter.shutdown(None).unwrap(); - mock_traces.assert(); } @@ -1764,6 +1745,8 @@ mod single_threaded_tests { .body(r#"{"version":"1","client_drop_p0s":true,"endpoints":["/v0.4/traces","/v0.6/stats"]}"#); }); + let runtime = SharedRuntime::new().unwrap(); + let mut builder = TraceExporterBuilder::default(); builder .set_url(&server.url("/")) @@ -1775,6 +1758,7 @@ mod single_threaded_tests { .set_language_interpreter("v8") .set_input_format(TraceExporterInputFormat::V04) .set_output_format(TraceExporterOutputFormat::V04) + .set_shared_runtime(runtime.clone()) .enable_stats(Duration::from_secs(10)); let exporter = builder.build().unwrap(); @@ -1806,7 +1790,7 @@ mod single_threaded_tests { std::thread::sleep(Duration::from_millis(10)); } - exporter.shutdown(None).unwrap(); + 
runtime.shutdown(None).unwrap(); // Wait for the mock server to process the stats for _ in 0..1000 { @@ -1858,6 +1842,8 @@ mod single_threaded_tests { .body(r#"{"version":"1","client_drop_p0s":true,"endpoints":["/v0.4/traces","/v0.6/stats"]}"#); }); + let runtime = SharedRuntime::new().unwrap(); + let mut builder = TraceExporterBuilder::default(); builder .set_url(&server.url("/")) @@ -1869,6 +1855,7 @@ mod single_threaded_tests { .set_language_interpreter("v8") .set_input_format(TraceExporterInputFormat::V04) .set_output_format(TraceExporterOutputFormat::V04) + .set_shared_runtime(runtime.clone()) .enable_stats(Duration::from_secs(10)); let exporter = builder.build().unwrap(); @@ -1903,7 +1890,7 @@ mod single_threaded_tests { std::thread::sleep(Duration::from_millis(10)); } - exporter + runtime .shutdown(Some(Duration::from_millis(5))) .unwrap_err(); // The shutdown should timeout From a02eec673ec585e10d142722622a47a8e7e67e1d Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 9 Mar 2026 14:32:57 +0100 Subject: [PATCH 23/80] chore(telemetry): remove macro --- libdd-data-pipeline/src/telemetry/mod.rs | 110 ++++++++++-------- libdd-data-pipeline/src/trace_exporter/mod.rs | 4 +- 2 files changed, 63 insertions(+), 51 deletions(-) diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index e22f2e82da..0ec7d1817f 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -305,6 +305,7 @@ mod tests { use httpmock::Method::POST; use httpmock::MockServer; use libdd_common::http_common; + use libdd_trace_utils::test_utils::poll_for_mock_hit; use regex::Regex; use tokio::time::sleep; @@ -329,19 +330,6 @@ mod tests { .expect("Failed to spawn worker"); (client, handle) } - - macro_rules! 
wait_for_telemetry_call { - ($telemetry_srv:expr) => {{ - let start = std::time::Instant::now(); - while $telemetry_srv.calls_async().await == 0 { - if start.elapsed() > Duration::from_secs(10) { - panic!("telemetry server did not receive calls within timeout"); - } - sleep(Duration::from_millis(10)).await; - } - }}; - } - #[test] fn builder_test() { let builder = TelemetryClientBuilder::default() @@ -381,7 +369,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_matches(payload); then.status(200).body(""); @@ -400,8 +388,10 @@ mod tests { sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to stop worker"); - wait_for_telemetry_call!(telemetry_srv); - telemetry_srv.assert_calls_async(1).await; + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); }); } @@ -414,7 +404,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_matches(payload); then.status(200).body(""); @@ -433,8 +423,10 @@ mod tests { sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to stop worker"); - wait_for_telemetry_call!(telemetry_srv); - telemetry_srv.assert_calls_async(1).await; + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); }); } @@ -447,7 +439,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_matches(payload); then.status(200).body(""); @@ -466,8 +458,10 @@ mod tests { sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to 
stop worker"); - wait_for_telemetry_call!(telemetry_srv); - telemetry_srv.assert_calls_async(1).await; + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); }); } @@ -480,7 +474,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_matches(payload); then.status(200).body(""); @@ -499,8 +493,10 @@ mod tests { sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to stop worker"); - wait_for_telemetry_call!(telemetry_srv); - telemetry_srv.assert_calls_async(1).await; + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); }); } @@ -513,7 +509,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_matches(payload); then.status(200).body(""); @@ -532,8 +528,10 @@ mod tests { sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to stop worker"); - wait_for_telemetry_call!(telemetry_srv); - telemetry_srv.assert_calls_async(1).await; + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); }); } @@ -546,7 +544,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_matches(payload); then.status(200).body(""); @@ -565,8 +563,10 @@ mod tests { sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to stop worker"); - wait_for_telemetry_call!(telemetry_srv); - telemetry_srv.assert_calls_async(1).await; + assert!( + poll_for_mock_hit(&mut telemetry_srv, 
1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); }); } @@ -579,7 +579,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_matches(payload); then.status(200).body(""); @@ -598,8 +598,10 @@ mod tests { sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to stop worker"); - wait_for_telemetry_call!(telemetry_srv); - telemetry_srv.assert_calls_async(1).await; + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); }); } @@ -612,7 +614,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_matches(payload); then.status(200).body(""); @@ -631,8 +633,10 @@ mod tests { sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to stop worker"); - wait_for_telemetry_call!(telemetry_srv); - telemetry_srv.assert_calls_async(1).await; + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); }); } @@ -645,7 +649,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_matches(payload); then.status(200).body(""); @@ -664,8 +668,10 @@ mod tests { sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to stop worker"); - wait_for_telemetry_call!(telemetry_srv); - telemetry_srv.assert_calls_async(1).await; + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); }); } @@ -678,7 +684,7 @@ mod tests { rt.block_on(async { let 
server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_matches(payload); then.status(200).body(""); @@ -697,8 +703,10 @@ mod tests { sleep(Duration::from_millis(1)).await; handle.stop().await.expect("Failed to stop worker"); - wait_for_telemetry_call!(telemetry_srv); - telemetry_srv.assert_calls_async(1).await; + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); }); } @@ -849,7 +857,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST).body_includes(r#""runtime_id":"foo""#); then.status(200).body(""); @@ -868,9 +876,11 @@ mod tests { sleep(Duration::from_millis(10)).await; handle.stop().await.expect("Failed to stop worker"); - wait_for_telemetry_call!(telemetry_srv); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); // One payload generate-metrics - telemetry_srv.assert_calls_async(1).await; }); } @@ -882,7 +892,7 @@ mod tests { rt.block_on(async { let server = MockServer::start_async().await; - let telemetry_srv = server + let mut telemetry_srv = server .mock_async(|when, then| { when.method(POST) .body_includes(r#""application":{"service_name":"test_service","service_version":"test_version","env":"test_env","language_name":"test_language","language_version":"test_language_version","tracer_version":"test_tracer_version"}"#); @@ -903,9 +913,11 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); // Wait for the server to receive at least one call, but don't hang forever. 
- wait_for_telemetry_call!(telemetry_srv); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); // One payload generate-metrics - telemetry_srv.assert_calls_async(1).await; }); } } diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 10208c4638..b248d93bc9 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -1745,7 +1745,7 @@ mod single_threaded_tests { .body(r#"{"version":"1","client_drop_p0s":true,"endpoints":["/v0.4/traces","/v0.6/stats"]}"#); }); - let runtime = SharedRuntime::new().unwrap(); + let runtime = Arc::new(SharedRuntime::new().unwrap()); let mut builder = TraceExporterBuilder::default(); builder @@ -1842,7 +1842,7 @@ mod single_threaded_tests { .body(r#"{"version":"1","client_drop_p0s":true,"endpoints":["/v0.4/traces","/v0.6/stats"]}"#); }); - let runtime = SharedRuntime::new().unwrap(); + let runtime = Arc::new(SharedRuntime::new().unwrap()); let mut builder = TraceExporterBuilder::default(); builder From 3119d5fac73a8cccc1d4cecc3dd23361bf4fb4bb Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 9 Mar 2026 14:57:53 +0100 Subject: [PATCH 24/80] docs(runtime): add warnings --- libdd-common/src/worker.rs | 3 +++ libdd-data-pipeline/src/shared_runtime.rs | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/libdd-common/src/worker.rs b/libdd-common/src/worker.rs index d8f1df4805..5812a068f1 100644 --- a/libdd-common/src/worker.rs +++ b/libdd-common/src/worker.rs @@ -11,6 +11,9 @@ use async_trait::async_trait; #[async_trait] pub trait Worker: std::fmt::Debug { /// Main worker function + /// + /// Code in this function should always use timeout on long-running await calls to avoid + /// blocking forks if an await call takes too long to complete. 
async fn run(&mut self); /// Function called between each `run` to wait for the next run diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index 073b5516ca..792fb4a2be 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -296,6 +296,11 @@ impl SharedRuntime { /// /// This allows external code to spawn additional tasks on the runtime if needed. /// + /// # Warning + /// Since this method can return a single-threaded runtime it should only be used to + /// execute async code with `block_on`. If you need to spawn async code on it without blocking, + /// you should use a `Worker` instead. + /// /// # Errors /// Returns an error if it fails to create a runtime. pub fn runtime(&self) -> Result, io::Error> { From 74425ba3cc596c58d4deb5a3f20da6dcba7b9531 Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 10 Mar 2026 11:25:19 +0100 Subject: [PATCH 25/80] feat(shared_runtime): add shared runtime ffi --- libdd-data-pipeline-ffi/src/lib.rs | 1 + libdd-data-pipeline-ffi/src/shared_runtime.rs | 340 ++++++++++++++++++ libdd-data-pipeline-ffi/src/trace_exporter.rs | 30 +- libdd-data-pipeline/src/shared_runtime.rs | 28 +- libdd-data-pipeline/src/trace_exporter/mod.rs | 24 -- 5 files changed, 379 insertions(+), 44 deletions(-) create mode 100644 libdd-data-pipeline-ffi/src/shared_runtime.rs diff --git a/libdd-data-pipeline-ffi/src/lib.rs b/libdd-data-pipeline-ffi/src/lib.rs index d85002ab40..effe7d965c 100644 --- a/libdd-data-pipeline-ffi/src/lib.rs +++ b/libdd-data-pipeline-ffi/src/lib.rs @@ -8,4 +8,5 @@ mod error; mod response; +mod shared_runtime; mod trace_exporter; diff --git a/libdd-data-pipeline-ffi/src/shared_runtime.rs b/libdd-data-pipeline-ffi/src/shared_runtime.rs new file mode 100644 index 0000000000..4731184ec0 --- /dev/null +++ b/libdd-data-pipeline-ffi/src/shared_runtime.rs @@ -0,0 +1,340 @@ +// Copyright 2025-Present Datadog, Inc. 
https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +use libdd_data_pipeline::shared_runtime::{SharedRuntime, SharedRuntimeError}; +use std::ffi::{c_char, CString}; +use std::ptr::NonNull; +use std::sync::Arc; + +/// Error codes for SharedRuntime FFI operations. +#[repr(C)] +#[derive(Copy, Clone, Debug, PartialEq)] +pub enum SharedRuntimeErrorCode { + /// Invalid argument provided (e.g. null handle). + InvalidArgument, + /// The runtime is not available or in an invalid state. + RuntimeUnavailable, + /// Failed to acquire a lock on internal state. + LockFailed, + /// A worker operation failed. + WorkerError, + /// Failed to create the tokio runtime. + RuntimeCreation, + /// Shutdown timed out. + ShutdownTimedOut, +} + +/// Error returned by SharedRuntime FFI functions. +#[repr(C)] +pub struct SharedRuntimeFFIError { + pub code: SharedRuntimeErrorCode, + pub msg: *mut c_char, +} + +impl SharedRuntimeFFIError { + fn new(code: SharedRuntimeErrorCode, msg: &str) -> Self { + Self { + code, + msg: CString::new(msg).unwrap_or_default().into_raw(), + } + } +} + +impl From for SharedRuntimeFFIError { + fn from(err: SharedRuntimeError) -> Self { + let code = match &err { + SharedRuntimeError::RuntimeUnavailable => SharedRuntimeErrorCode::RuntimeUnavailable, + SharedRuntimeError::LockFailed(_) => SharedRuntimeErrorCode::LockFailed, + SharedRuntimeError::WorkerError(_) => SharedRuntimeErrorCode::WorkerError, + SharedRuntimeError::RuntimeCreation(_) => SharedRuntimeErrorCode::RuntimeCreation, + SharedRuntimeError::ShutdownTimedOut(_) => SharedRuntimeErrorCode::ShutdownTimedOut, + }; + SharedRuntimeFFIError::new(code, &err.to_string()) + } +} + +impl Drop for SharedRuntimeFFIError { + fn drop(&mut self) { + if !self.msg.is_null() { + // SAFETY: `msg` is always produced by `CString::into_raw` in `new`. + unsafe { + drop(CString::from_raw(self.msg)); + self.msg = std::ptr::null_mut(); + } + } + } +} + +/// Frees a `SharedRuntimeFFIError`. 
After this call the pointer is invalid. +#[no_mangle] +pub unsafe extern "C" fn ddog_shared_runtime_error_free(error: Option>) { + drop(error); +} + +/// Create a new `SharedRuntime` wrapped in an `Arc`. +/// +/// On success writes the new handle into `*out_handle` and returns `None`. +/// On failure leaves `*out_handle` unchanged and returns an error. +/// +/// The caller owns the returned handle and must eventually pass it to +/// [`ddog_shared_runtime_free`] (or another consumer that takes ownership). +#[no_mangle] +pub unsafe extern "C" fn ddog_shared_runtime_new( + out_handle: NonNull>>, +) -> Option> { + match SharedRuntime::new() { + Ok(runtime) => { + out_handle.as_ptr().write(Box::new(Arc::new(runtime))); + None + } + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + } +} + +/// Clone the `Arc`, incrementing the reference count. +/// +/// The new handle is independent from the original: either can be freed +/// without affecting the other. The underlying runtime is only destroyed +/// when every handle has been freed. +/// +/// On success writes the cloned handle into `*out_handle` and returns `None`. +#[no_mangle] +pub unsafe extern "C" fn ddog_shared_runtime_clone( + handle: Option<&Arc>, + out_handle: NonNull>>, +) -> Option> { + match handle { + Some(arc) => { + out_handle.as_ptr().write(Box::new(arc.clone())); + None + } + None => Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))), + } +} + +/// Free a `SharedRuntime` handle, decrementing the `Arc` reference count. +/// +/// The underlying runtime is only shut down once the last handle is freed. +/// Use [`ddog_shared_runtime_shutdown`] to explicitly stop the runtime and +/// all its workers before the last handle is freed. +#[no_mangle] +pub unsafe extern "C" fn ddog_shared_runtime_free(handle: Box>) { + drop(handle); +} + +/// Must be called in the parent process before `fork()`. 
+/// +/// Pauses all workers so that no background threads are running during the +/// fork, preventing deadlocks in the child process. +/// +/// Returns an error if `handle` is null. +#[no_mangle] +pub unsafe extern "C" fn ddog_shared_runtime_before_fork( + handle: Option<&Arc>, +) -> Option> { + let runtime = match handle { + Some(r) => r, + None => { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))) + } + }; + + runtime.before_fork(); + None +} + +/// Must be called in the parent process after `fork()`. +/// +/// Restarts all workers that were paused by [`ddog_shared_runtime_before_fork`]. +/// +/// Returns `None` on success, or an error if workers could not be restarted. +#[no_mangle] +pub unsafe extern "C" fn ddog_shared_runtime_after_fork_parent( + handle: Option<&Arc>, +) -> Option> { + let runtime = match handle { + Some(r) => r, + None => { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))) + } + }; + + match runtime.after_fork_parent() { + Ok(()) => None, + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + } +} + +/// Must be called in the child process after `fork()`. +/// +/// Creates a fresh tokio runtime and restarts all workers. The original +/// runtime cannot be safely reused after a fork. +/// +/// Returns `None` on success, or an error if the runtime could not be +/// reinitialized. +#[no_mangle] +pub unsafe extern "C" fn ddog_shared_runtime_after_fork_child( + handle: Option<&Arc>, +) -> Option> { + let runtime = match handle { + Some(r) => r, + None => { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))) + } + }; + + match runtime.after_fork_child() { + Ok(()) => None, + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + } +} + +/// Shut down the `SharedRuntime`, stopping all workers. 
+/// +/// `timeout_ms` is the maximum time to wait for workers to stop, in +/// milliseconds. Pass `0` for no timeout. +/// +/// Returns `None` on success, or `SharedRuntimeErrorCode::ShutdownTimedOut` +/// if the timeout was reached. +#[no_mangle] +pub unsafe extern "C" fn ddog_shared_runtime_shutdown( + handle: Option<&Arc>, + timeout_ms: u64, +) -> Option> { + let runtime = match handle { + Some(r) => r, + None => { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))) + } + }; + + let timeout = if timeout_ms > 0 { + Some(std::time::Duration::from_millis(timeout_ms)) + } else { + None + }; + + match runtime.shutdown(timeout) { + Ok(()) => None, + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::mem::MaybeUninit; + + #[test] + fn test_new_and_free() { + unsafe { + let mut handle: MaybeUninit>> = MaybeUninit::uninit(); + let err = ddog_shared_runtime_new(NonNull::new_unchecked(&mut handle).cast()); + assert!(err.is_none()); + ddog_shared_runtime_free(handle.assume_init()); + } + } + + #[test] + fn test_clone() { + unsafe { + let mut handle: MaybeUninit>> = MaybeUninit::uninit(); + ddog_shared_runtime_new(NonNull::new_unchecked(&mut handle).cast()); + let handle = handle.assume_init(); + + let mut cloned: MaybeUninit>> = MaybeUninit::uninit(); + let err = ddog_shared_runtime_clone( + Some(handle.as_ref()), + NonNull::new_unchecked(&mut cloned).cast(), + ); + assert!(err.is_none()); + + // Both handles should point to the same underlying runtime (strong count == 2). 
+ assert_eq!(Arc::strong_count(handle.as_ref()), 2); + + ddog_shared_runtime_free(cloned.assume_init()); + assert_eq!(Arc::strong_count(handle.as_ref()), 1); + + ddog_shared_runtime_free(handle); + } + } + + #[test] + fn test_clone_null_handle() { + unsafe { + let mut cloned: MaybeUninit>> = MaybeUninit::uninit(); + let err = ddog_shared_runtime_clone(None, NonNull::new_unchecked(&mut cloned).cast()); + assert!(err.is_some()); + assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); + } + } + + #[test] + fn test_before_after_fork_null() { + unsafe { + let err = ddog_shared_runtime_before_fork(None); + assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); + + let err = ddog_shared_runtime_after_fork_parent(None); + assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); + + let err = ddog_shared_runtime_after_fork_child(None); + assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); + } + } + + #[test] + fn test_fork_lifecycle() { + unsafe { + let mut handle: MaybeUninit>> = MaybeUninit::uninit(); + ddog_shared_runtime_new(NonNull::new_unchecked(&mut handle).cast()); + let handle = handle.assume_init(); + + let err = ddog_shared_runtime_before_fork(Some(handle.as_ref())); + assert!(err.is_none(), "{:?}", err.map(|e| e.code)); + + let err = ddog_shared_runtime_after_fork_parent(Some(handle.as_ref())); + assert!(err.is_none(), "{:?}", err.map(|e| e.code)); + + ddog_shared_runtime_free(handle); + } + } + + #[test] + fn test_shutdown() { + unsafe { + let mut handle: MaybeUninit>> = MaybeUninit::uninit(); + ddog_shared_runtime_new(NonNull::new_unchecked(&mut handle).cast()); + let handle = handle.assume_init(); + + let err = ddog_shared_runtime_shutdown(Some(handle.as_ref()), 0); + assert!(err.is_none()); + + ddog_shared_runtime_free(handle); + } + } + + #[test] + fn test_error_free() { + let error = Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "test error", + )); + 
unsafe { ddog_shared_runtime_error_free(Some(error)) }; + } +} diff --git a/libdd-data-pipeline-ffi/src/trace_exporter.rs b/libdd-data-pipeline-ffi/src/trace_exporter.rs index 20a3f380f0..cca62501c6 100644 --- a/libdd-data-pipeline-ffi/src/trace_exporter.rs +++ b/libdd-data-pipeline-ffi/src/trace_exporter.rs @@ -7,10 +7,11 @@ use libdd_common_ffi::{ CharSlice, {slice::AsBytes, slice::ByteSlice}, }; +use libdd_data_pipeline::shared_runtime::SharedRuntime; use libdd_data_pipeline::trace_exporter::{ TelemetryConfig, TraceExporter, TraceExporterInputFormat, TraceExporterOutputFormat, }; -use std::{ptr::NonNull, time::Duration}; +use std::{ptr::NonNull, sync::Arc, time::Duration}; use tracing::{debug, error}; #[cfg(all(feature = "catch_panic", panic = "unwind"))] @@ -100,6 +101,7 @@ pub struct TraceExporterConfig { health_metrics_enabled: bool, test_session_token: Option, connection_timeout: Option, + shared_runtime: Option>, } #[no_mangle] @@ -426,6 +428,28 @@ pub unsafe extern "C" fn ddog_trace_exporter_config_set_connection_timeout( ) } +/// Sets a shared runtime for the TraceExporter to use for background workers. +/// +/// When set, the exporter will use the provided runtime instead of creating its own. +/// This allows multiple exporters (or other components) to share a single runtime. +/// The config holds a clone of the `Arc`, so the original handle remains valid. +#[no_mangle] +pub unsafe extern "C" fn ddog_trace_exporter_config_set_shared_runtime( + config: Option<&mut TraceExporterConfig>, + handle: Option<&Arc>, +) -> Option> { + catch_panic!( + match (config, handle) { + (Some(config), Some(runtime)) => { + config.shared_runtime = Some(runtime.clone()); + None + } + _ => gen_error!(ErrorCode::InvalidArgument), + }, + gen_error!(ErrorCode::Panic) + ) +} + /// Create a new TraceExporter instance. 
/// /// # Arguments @@ -478,6 +502,10 @@ pub unsafe extern "C" fn ddog_trace_exporter_new( builder.enable_health_metrics(); } + if let Some(runtime) = config.shared_runtime.clone() { + builder.set_shared_runtime(runtime); + } + match builder.build() { Ok(exporter) => { out_handle.as_ptr().write(Box::new(exporter)); diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-data-pipeline/src/shared_runtime.rs index 792fb4a2be..4db4755be7 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-data-pipeline/src/shared_runtime.rs @@ -197,34 +197,24 @@ impl SharedRuntime { /// It ensures that no background tasks are running when the fork occurs, /// preventing potential deadlocks in the child process. /// - /// # Errors - /// Returns an error if workers cannot be paused or the runtime is in an invalid state. - pub fn before_fork(&self) -> Result<(), Vec> { + /// Worker errors are logged but do not cause the function to fail. + pub fn before_fork(&self) { + use tracing::error; + if let Some(runtime) = self.runtime.lock_or_panic().take() { let mut workers_lock = self.workers.lock_or_panic(); - let results = runtime.block_on(async move { - let mut results = Vec::new(); + runtime.block_on(async move { for worker_entry in workers_lock.iter_mut() { let _ = worker_entry.worker.request_pause(); } for worker_entry in workers_lock.iter_mut() { - results.push(worker_entry.worker.join().await); + if let Err(e) = worker_entry.worker.join().await { + error!("Worker failed to pause before fork: {:?}", e); + } } - results }); - - // Collect all errors - let errors: Vec = results - .into_iter() - .filter_map(|r| Some(r.err()?.into())) - .collect(); - - if !errors.is_empty() { - return Err(errors); - } } - Ok(()) } fn restart_runtime(&self) -> Result<(), SharedRuntimeError> { @@ -439,7 +429,7 @@ mod tests { let shared_runtime = SharedRuntime::new().unwrap(); // Test before_fork - assert!(shared_runtime.before_fork().is_ok()); + shared_runtime.before_fork(); // Test 
after_fork_parent (synchronous) assert!(shared_runtime.after_fork_parent().is_ok()); diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index b248d93bc9..40da5b94d5 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -213,21 +213,6 @@ impl TraceExporter { }) } - /// Stop all background workers and drop the tokio runtime - pub fn stop_worker(&self) { - let errors = self.shared_runtime.before_fork(); - if let Err(errors) = errors { - error!("Some workers failed to stop: {errors:?}"); - } - - // When the info fetcher is paused, the trigger channel keeps a reference to the runtime's - // IoStack through the waker. This prevents the IoStack from being dropped when shutting - // down the runtime. By manually sending a message to the trigger channel we trigger the - // waker releasing the reference to the IoStack. Finally we drain the channel to - // avoid triggering a fetch when the info fetcher is restarted. 
- self.info_response_observer.manual_trigger(); - } - /// Send msgpack serialized traces to the agent /// /// # Arguments @@ -1694,15 +1679,6 @@ mod tests { assert_eq!(exporter.endpoint.timeout_ms, 42); } - - #[test] - #[cfg_attr(miri, ignore)] - fn stop_and_start_runtime() { - let builder = TraceExporterBuilder::default(); - let exporter = builder.build().unwrap(); - exporter.stop_worker(); - exporter.run_worker().unwrap(); - } } #[cfg(test)] From 2869d39b553ac3e9626abe47ea6c045866194813 Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 10 Mar 2026 12:25:10 +0100 Subject: [PATCH 26/80] feat(trace_exporter): add shutdown method --- .../src/trace_exporter/builder.rs | 17 +++-- libdd-data-pipeline/src/trace_exporter/mod.rs | 65 ++++++++++++++++++- 2 files changed, 74 insertions(+), 8 deletions(-) diff --git a/libdd-data-pipeline/src/trace_exporter/builder.rs b/libdd-data-pipeline/src/trace_exporter/builder.rs index 7ac135624e..60f0b83c35 100644 --- a/libdd-data-pipeline/src/trace_exporter/builder.rs +++ b/libdd-data-pipeline/src/trace_exporter/builder.rs @@ -8,7 +8,8 @@ use crate::trace_exporter::agent_response::AgentResponsePayloadVersion; use crate::trace_exporter::error::BuilderErrorKind; use crate::trace_exporter::{ add_path, StatsComputationStatus, TelemetryConfig, TraceExporter, TraceExporterError, - TraceExporterInputFormat, TraceExporterOutputFormat, TracerMetadata, INFO_ENDPOINT, + TraceExporterInputFormat, TraceExporterOutputFormat, TracerMetadata, TraceExporterWorkers, + INFO_ENDPOINT, }; use arc_swap::ArcSwap; use libdd_common::http_common::new_default_client; @@ -258,7 +259,7 @@ impl TraceExporterBuilder { let info_endpoint = Endpoint::from_url(add_path(&agent_url, INFO_ENDPOINT)); let (info_fetcher, info_response_observer) = AgentInfoFetcher::new(info_endpoint.clone(), Duration::from_secs(5 * 60)); - shared_runtime.spawn_worker(info_fetcher).map_err(|e| { + let info_fetcher_handle = shared_runtime.spawn_worker(info_fetcher).map_err(|e| { 
TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration(e.to_string())) })?; @@ -290,9 +291,9 @@ impl TraceExporterBuilder { Ok(builder.build(runtime.handle().clone())) }); - let telemetry_client = match telemetry { + let (telemetry_client, telemetry_handle) = match telemetry { Some(Ok((client, worker))) => { - shared_runtime.spawn_worker(worker).map_err(|e| { + let handle = shared_runtime.spawn_worker(worker).map_err(|e| { TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration( e.to_string(), )) @@ -305,10 +306,10 @@ impl TraceExporterBuilder { )) })? .block_on(client.start()); - Some(client) + (Some(client), Some(handle)) } Some(Err(e)) => return Err(e), - None => None, + None => (None, None), }; Ok(TraceExporter { @@ -350,6 +351,10 @@ impl TraceExporterBuilder { .agent_rates_payload_version_enabled .then(AgentResponsePayloadVersion::new), http_client: new_default_client(), + workers: TraceExporterWorkers { + info_fetcher: info_fetcher_handle, + telemetry: telemetry_handle, + }, }) } diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 40da5b94d5..bd10d7a6c9 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -15,12 +15,12 @@ use self::metrics::MetricsEmitter; use self::stats::StatsComputationStatus; use self::trace_serializer::TraceSerializer; use crate::agent_info::ResponseObserver; -use crate::shared_runtime::SharedRuntime; +use crate::shared_runtime::{SharedRuntime, WorkerHandle}; use crate::telemetry::{SendPayloadTelemetry, TelemetryClient}; use crate::trace_exporter::agent_response::{ AgentResponsePayloadVersion, DATADOG_RATES_PAYLOAD_VERSION_HEADER, }; -use crate::trace_exporter::error::{InternalErrorKind, RequestError, TraceExporterError}; +use crate::trace_exporter::error::{InternalErrorKind, RequestError, ShutdownError, TraceExporterError}; use crate::{ agent_info::{self, schema::AgentInfo}, health_metrics, @@ 
-43,6 +43,7 @@ use std::io; use std::sync::Arc; use std::{borrow::Borrow, collections::HashMap, str::FromStr}; use tokio::runtime::Runtime; +use tokio::task::JoinSet; use tracing::{debug, error, warn}; const INFO_ENDPOINT: &str = "/info"; @@ -149,6 +150,13 @@ impl<'a> From<&'a TracerMetadata> for HashMap<&'static str, String> { } } +/// Handles for the background workers owned by a [`TraceExporter`]. +#[derive(Debug)] +pub(crate) struct TraceExporterWorkers { + info_fetcher: WorkerHandle, + telemetry: Option, +} + /// The TraceExporter ingest traces from the tracers serialized as messagepack and forward them to /// the agent while applying some transformation. /// @@ -191,6 +199,7 @@ pub struct TraceExporter { health_metrics_enabled: bool, agent_payload_response_version: Option, http_client: HttpClient, + workers: TraceExporterWorkers, } impl TraceExporter { @@ -199,6 +208,58 @@ impl TraceExporter { TraceExporterBuilder::default() } + /// Stop the background workers owned by this exporter. + /// + /// Only the workers spawned for this exporter are stopped. Workers from other components + /// sharing the same [`SharedRuntime`] are unaffected. + /// + /// # Errors + /// Returns [`SharedRuntimeError::ShutdownTimedOut`] if a timeout was given and elapsed before + /// all workers finished. + pub fn shutdown(self, timeout: Option) -> Result<(), TraceExporterError> { + let runtime = self.runtime()?; + if let Some(timeout) = timeout { + match runtime.block_on(async { + tokio::time::timeout(timeout, self.shutdown_workers()).await + }) { + Ok(()) => Ok(()), + Err(_) => Err(TraceExporterError::Shutdown(ShutdownError::TimedOut( + timeout, + ))), + } + } else { + runtime.block_on(self.shutdown_workers()); + Ok(()) + } + } + + async fn shutdown_workers(self) { + let mut join_set = JoinSet::new(); + + // Extract the stats handle before moving other fields. + if let StatsComputationStatus::Enabled { worker_handle, .. 
} = + &**self.client_side_stats.load() + { + let handle = worker_handle.clone(); + join_set.spawn(async move { handle.stop().await }); + } + + let info_fetcher = self.workers.info_fetcher; + let telemetry = self.workers.telemetry; + + join_set.spawn(async move { info_fetcher.stop().await }); + + if let Some(telemetry) = telemetry { + join_set.spawn(async move { telemetry.stop().await }); + } + + while let Some(result) = join_set.join_next().await { + if let Ok(Err(e)) = result { + error!("Worker failed to shutdown: {:?}", e); + } + } + } + /// Return a runtime from the shared runtime manager. fn runtime(&self) -> Result, TraceExporterError> { self.shared_runtime From 4b1d0b6e06bbcf1714b76e8c642cd92ee8d21cd5 Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 10 Mar 2026 13:34:23 +0100 Subject: [PATCH 27/80] fix(shared_runtime): add on_pause hook to release waker in info fetcher --- libdd-common/src/worker.rs | 8 ++++++++ libdd-data-pipeline/src/agent_info/fetcher.rs | 9 +++++++++ libdd-data-pipeline/src/pausable_worker.rs | 3 ++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/libdd-common/src/worker.rs b/libdd-common/src/worker.rs index 5812a068f1..a88c81a192 100644 --- a/libdd-common/src/worker.rs +++ b/libdd-common/src/worker.rs @@ -28,6 +28,10 @@ pub trait Worker: std::fmt::Debug { /// Reset the worker in the child after a fork fn reset(&mut self) {} + /// Hook called after the worker has been paused (e.g. before a fork). + /// Default is a no-op. + async fn on_pause(&mut self) {} + /// Hook called when the app is shutting down. Can be used to flush remaining data. 
async fn shutdown(&mut self) {} } @@ -51,6 +55,10 @@ impl Worker for Box { (**self).reset() } + async fn on_pause(&mut self) { + (**self).on_pause().await + } + async fn shutdown(&mut self) { (**self).shutdown().await } diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index 944c83fb52..a279c17e18 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -144,6 +144,7 @@ pub struct AgentInfoFetcher { info_endpoint: Endpoint, refresh_interval: Duration, trigger_rx: Option>, + trigger_tx: mpsc::Sender<()>, } impl AgentInfoFetcher { @@ -161,6 +162,7 @@ impl AgentInfoFetcher { info_endpoint, refresh_interval, trigger_rx: Some(trigger_rx), + trigger_tx: trigger_tx.clone(), }; let response_observer = ResponseObserver::new(trigger_tx); @@ -210,6 +212,13 @@ impl Worker for AgentInfoFetcher { } } + async fn on_pause(&mut self) { + // Release the IoStack waker stored in trigger_rx by waking the channel, + // then drain the message to avoid a spurious fetch on restart. 
+ let _ = self.trigger_tx.try_send(()); + self.drain(); + } + fn reset(&mut self) { // Drain all messages from the channel to remove messages sent to release the reference on self.drain(); diff --git a/libdd-data-pipeline/src/pausable_worker.rs b/libdd-data-pipeline/src/pausable_worker.rs index 1e0d378f0d..f2a1ad5090 100644 --- a/libdd-data-pipeline/src/pausable_worker.rs +++ b/libdd-data-pipeline/src/pausable_worker.rs @@ -149,7 +149,8 @@ impl PausableWorker { stop_token.cancel(); } - if let Ok(worker) = handle.await { + if let Ok(mut worker) = handle.await { + worker.on_pause().await; *self = PausableWorker::Paused { worker }; Ok(()) } else { From 0e74cad1ff7fe3ae761c4d641cb29a172af1a32d Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 10 Mar 2026 16:59:45 +0100 Subject: [PATCH 28/80] fix(telemetry): add reset hook to telemetry --- libdd-telemetry/src/worker/mod.rs | 250 +++++++++++++++++++++++++++- libdd-telemetry/src/worker/store.rs | 5 + 2 files changed, 254 insertions(+), 1 deletion(-) diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index 27ca18caa3..f66f298f85 100644 --- a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -184,8 +184,29 @@ impl Worker for TelemetryWorker { } }; } + } - // TODO: Handle action result and add support to stop worker from `run` + /// Reset the worker state in the child process after a fork. + /// + /// Discards inherited pending telemetry state without sending anything, and drains + /// the mailbox so that actions queued before the fork are not processed by the child. + /// Dedupe history is preserved across forks so the child does not re-emit already + /// seen dependencies, integrations, or configurations unless they are observed again + /// as new data. + fn reset(&mut self) { + // Drain all actions queued in the mailbox before the fork. + while self.mailbox.try_recv().is_ok() {} + + // Discard any action that was staged by the last trigger() call. 
+ self.next_action = None; + + // Clear all unbuffered telemetry data; the child must not send pre-fork data. + self.data.logs = store::QueueHashMap::default(); + self.data.metric_buckets = MetricBuckets::default(); + self.data.dependencies.clear(); + self.data.integrations.clear(); + self.data.configurations.clear(); + self.data.endpoints.clear(); } async fn shutdown(&mut self) { @@ -1190,4 +1211,231 @@ mod tests { #[allow(clippy::redundant_closure)] let _ = |h: TelemetryWorkerHandle| is_sync(h); } + + mod reset { + use super::super::*; + use crate::data::{ + metrics::{MetricNamespace, MetricType}, + Configuration, ConfigurationOrigin, Dependency, Endpoint, Integration, Log, LogLevel, + }; + use libdd_common::worker::Worker; + + fn build_test_worker() -> (TelemetryWorkerHandle, TelemetryWorker) { + let builder = TelemetryWorkerBuilder::new( + "hostname".to_string(), + "test-service".to_string(), + "rust".to_string(), + "1.0.0".to_string(), + "1.0.0".to_string(), + ); + // build_worker requires a tokio Handle; tests using this must be #[tokio::test] + builder.build_worker(tokio::runtime::Handle::current()) + } + + fn make_log(id: u64, message: &str) -> (LogIdentifier, Log) { + ( + LogIdentifier { identifier: id }, + Log { + message: message.to_string(), + level: LogLevel::Warn, + stack_trace: None, + count: 1, + tags: String::new(), + is_sensitive: false, + is_crash: false, + }, + ) + } + + /// After reset(), pending buffered telemetry is cleared while dedupe history is preserved. + #[tokio::test] + async fn test_reset_clears_buffered_data() { + let (handle, mut worker) = build_test_worker(); + + // Populate every data field that reset() should clear. 
+ worker.data.dependencies.insert(Dependency { + name: "dep".to_string(), + version: None, + }); + worker.data.integrations.insert(Integration { + name: "integration".to_string(), + version: None, + enabled: true, + compatible: None, + auto_enabled: None, + }); + worker.data.configurations.insert(Configuration { + name: "cfg".to_string(), + value: "true".to_string(), + origin: ConfigurationOrigin::Code, + config_id: None, + seq_id: None, + }); + worker.data.endpoints.insert(Endpoint { + operation_name: "GET /health".to_string(), + resource_name: "/health".to_string(), + ..Default::default() + }); + let (id, log) = make_log(42, "msg"); + worker.data.logs.get_mut_or_insert(id, log); + + // Register a metric context and add a data point. + let key = handle.register_metric_context( + "test.metric".to_string(), + vec![], + MetricType::Count, + false, + MetricNamespace::Tracers, + ); + worker.data.metric_buckets.add_point(key, 1.0, vec![]); + + worker.reset(); + + let stats = worker.stats(); + assert_eq!( + stats.dependencies_stored, 1, + "dependency dedupe history should be preserved" + ); + assert_eq!( + stats.dependencies_unflushed, 0, + "dependency pending queue should be cleared" + ); + assert_eq!( + stats.integrations_stored, 1, + "integration dedupe history should be preserved" + ); + assert_eq!( + stats.integrations_unflushed, 0, + "integration pending queue should be cleared" + ); + assert_eq!( + stats.configurations_stored, 1, + "configuration dedupe history should be preserved" + ); + assert_eq!( + stats.configurations_unflushed, 0, + "configuration pending queue should be cleared" + ); + assert_eq!(stats.logs, 0, "logs should be cleared"); + assert_eq!( + stats.metric_buckets.buckets, 0, + "metric buckets should be cleared" + ); + assert_eq!( + stats.metric_buckets.series, 0, + "metric series should be cleared" + ); + assert!( + worker.data.endpoints.is_empty(), + "endpoints should be cleared" + ); + assert!(worker.next_action.is_none(), "next_action should 
be None"); + } + + /// After reset(), actions queued in the mailbox before the fork are discarded. + #[tokio::test] + async fn test_reset_drains_mailbox() { + let (handle, mut worker) = build_test_worker(); + + // Enqueue several actions that should be discarded. + handle + .try_send_msg(TelemetryActions::AddDependency(Dependency { + name: "dep".to_string(), + version: None, + })) + .unwrap(); + let (id, log) = make_log(1, "pre-fork log"); + handle + .try_send_msg(TelemetryActions::AddLog((id, log))) + .unwrap(); + + // Stage one action as if trigger() had already stored it. + worker.next_action = Some(TelemetryActions::Lifecycle(LifecycleAction::Start)); + + worker.reset(); + + // The mailbox must be empty and next_action cleared. + assert!( + worker.mailbox.try_recv().is_err(), + "mailbox should be empty" + ); + assert!(worker.next_action.is_none(), "next_action should be None"); + // None of the queued actions should have been applied to pending state. + let stats = worker.stats(); + assert_eq!( + stats.dependencies_stored, 0, + "queued AddDependency must not be applied" + ); + assert_eq!( + stats.dependencies_unflushed, 0, + "queued AddDependency must not be pending" + ); + assert_eq!(stats.logs, 0, "queued AddLog must be discarded"); + } + + /// After reset(), the worker accepts new telemetry and processes it normally. + #[tokio::test] + async fn test_worker_accepts_new_data_after_reset() { + let (handle, mut worker) = build_test_worker(); + worker.flavor = TelemetryWorkerFlavor::MetricsLogs; + + // Populate state before reset – this data must not survive. + let (id, log) = make_log(99, "pre-fork"); + worker.data.logs.get_mut_or_insert(id, log); + + worker.reset(); + + // Send a new log from the child side. + let (id2, log2) = make_log(1, "post-fork"); + handle + .try_send_msg(TelemetryActions::AddLog((id2, log2))) + .unwrap(); + + // Simulate one trigger() + run() cycle. 
+ worker.trigger().await; + worker.run().await; + + let stats = worker.stats(); + // Only the new post-fork log should be buffered. + assert_eq!(stats.logs, 1, "only post-fork log should be present"); + } + + /// After reset(), lifecycle state needed to keep periodic flushing alive is preserved. + #[tokio::test] + async fn test_reset_preserves_started_and_deadlines() { + let (_handle, mut worker) = build_test_worker(); + + worker.data.started = true; + worker + .deadlines + .schedule_event(LifecycleAction::FlushMetricAggr) + .unwrap(); + worker + .deadlines + .schedule_event(LifecycleAction::FlushData) + .unwrap(); + + let deadlines_before = worker.deadlines.deadlines.clone(); + + worker.reset(); + + assert!(worker.data.started, "started flag should be preserved"); + assert_eq!( + worker.deadlines.deadlines.len(), + deadlines_before.len(), + "scheduled deadlines should be preserved" + ); + for ((_, actual), (_, expected)) in worker + .deadlines + .deadlines + .iter() + .zip(deadlines_before.iter()) + { + assert_eq!( + actual, expected, + "deadline kinds should be preserved across reset" + ); + } + } + } } diff --git a/libdd-telemetry/src/worker/store.rs b/libdd-telemetry/src/worker/store.rs index 3986941de1..1c2d400900 100644 --- a/libdd-telemetry/src/worker/store.rs +++ b/libdd-telemetry/src/worker/store.rs @@ -208,6 +208,11 @@ where pub fn len_stored(&self) -> usize { self.items.len() } + + /// Discard only pending unflushed items while preserving stored dedupe history. 
+ pub fn clear(&mut self) { + self.unflushed.clear(); + } } impl Extend for Store From 544b840c97b3921af041c7d8879ae730e375546e Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 10 Mar 2026 17:59:36 +0100 Subject: [PATCH 29/80] fix(telemetry): fix spawn and run loop for telemetry --- libdd-telemetry/src/worker/mod.rs | 62 +++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index f66f298f85..b453d4b2ce 100644 --- a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -836,6 +836,59 @@ impl TelemetryWorker { metric_buckets: self.data.metric_buckets.stats(), } } + + // Runs a state machine that waits for actions, either from the worker's + // mailbox, or scheduled actions from the worker's deadline object. + async fn run_loop(mut self) { + debug!( + worker.flavor = ?self.flavor, + worker.runtime_id = %self.runtime_id, + "Starting telemetry worker" + ); + + loop { + if self.cancellation_token.is_cancelled() { + debug!( + worker.runtime_id = %self.runtime_id, + "Telemetry worker cancelled, shutting down" + ); + return; + } + + let action = self.recv_next_action().await; + debug!( + worker.runtime_id = %self.runtime_id, + action = ?action, + "Received telemetry action" + ); + + let action_result = match self.flavor { + TelemetryWorkerFlavor::Full => self.dispatch_action(action).await, + TelemetryWorkerFlavor::MetricsLogs => { + self.dispatch_metrics_logs_action(action).await + } + }; + + match action_result { + ControlFlow::Continue(()) => {} + ControlFlow::Break(()) => { + debug!( + worker.runtime_id = %self.runtime_id, + worker.restartable = self.config.restartable, + "Telemetry worker received break signal" + ); + if !self.config.restartable { + break; + } + } + }; + } + + debug!( + worker.runtime_id = %self.runtime_id, + "Telemetry worker stopped" + ); + } } #[derive(Debug)] @@ -1162,6 +1215,7 @@ impl TelemetryWorkerBuilder { 
shutdown, cancellation_token: token, runtime: tokio_runtime, + contexts, }, worker, @@ -1173,9 +1227,9 @@ impl TelemetryWorkerBuilder { pub fn spawn(self) -> (TelemetryWorkerHandle, JoinHandle<()>) { let tokio_runtime = tokio::runtime::Handle::current(); - let (worker_handle, mut worker) = self.build_worker(tokio_runtime.clone()); + let (worker_handle, worker) = self.build_worker(tokio_runtime.clone()); - let join_handle = tokio_runtime.spawn(async move { worker.run().await }); + let join_handle = tokio_runtime.spawn(async move { worker.run_loop().await }); (worker_handle, join_handle) } @@ -1185,10 +1239,10 @@ impl TelemetryWorkerBuilder { let runtime = tokio::runtime::Builder::new_current_thread() .enable_all() .build()?; - let (handle, mut worker) = self.build_worker(runtime.handle().clone()); + let (handle, worker) = self.build_worker(runtime.handle().clone()); let notify_shutdown = handle.shutdown.clone(); std::thread::spawn(move || { - runtime.block_on(worker.run()); + runtime.block_on(worker.run_loop()); runtime.shutdown_background(); notify_shutdown.shutdown_finished(); }); From a3e2c380aaa9fc9825b2dcad9e5023c4bc399988 Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 10 Mar 2026 18:07:24 +0100 Subject: [PATCH 30/80] docs(info_fetcher): update doc for running the fetcher --- libdd-data-pipeline/src/agent_info/fetcher.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index a279c17e18..cfe3c00af7 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -98,9 +98,11 @@ async fn fetch_and_hash_response(info_endpoint: &Endpoint) -> Result<(String, by /// Fetch the info endpoint and update an ArcSwap keeping it up-to-date. /// -/// Once the run method has been started, the fetcher will -/// update the global info state based on the given refresh interval. 
You can access the current -/// state with [`crate::agent_info::get_agent_info`] +/// This type implements [`libdd_common::worker::Worker`] and is intended to be driven by a worker +/// runner such as [`crate::shared_runtime::SharedRuntime`]. +/// In that lifecycle, `trigger()` waits for the next refresh event and `run()` performs a single fetch. +/// +/// You can access the current state with [`crate::agent_info::get_agent_info`]. /// /// # Response observer /// When the fetcher is created it also returns a [`ResponseObserver`] which can be used to check @@ -121,10 +123,9 @@ async fn fetch_and_hash_response(info_endpoint: &Endpoint) -> Result<(String, by /// endpoint, /// std::time::Duration::from_secs(5 * 60), /// ); -/// // Start the runner -/// tokio::spawn(async move { -/// fetcher.run().await; -/// }); +/// // Start the fetcher on a shared runtime +/// let runtime = libdd_data_pipeline::shared_runtime::SharedRuntime::new()?; +/// runtime.spawn_worker(fetcher)?; /// /// // Get the Arc to access the info /// let agent_info_arc = agent_info::get_agent_info(); From 67fcf2a036a363dc52c0fe06c74eda79caebf6fb Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 11 Mar 2026 11:31:06 +0100 Subject: [PATCH 31/80] feat(runtime-ffi): remove redundant allocation of Box --- libdd-data-pipeline-ffi/src/shared_runtime.rs | 183 +++++++++--------- 1 file changed, 95 insertions(+), 88 deletions(-) diff --git a/libdd-data-pipeline-ffi/src/shared_runtime.rs b/libdd-data-pipeline-ffi/src/shared_runtime.rs index 4731184ec0..ff817afcfc 100644 --- a/libdd-data-pipeline-ffi/src/shared_runtime.rs +++ b/libdd-data-pipeline-ffi/src/shared_runtime.rs @@ -71,27 +71,31 @@ pub unsafe extern "C" fn ddog_shared_runtime_error_free(error: Option>>, + out_handle: NonNull<*const SharedRuntime>, ) -> Option> { match SharedRuntime::new() { Ok(runtime) => { - out_handle.as_ptr().write(Box::new(Arc::new(runtime))); + out_handle.as_ptr().write(Arc::into_raw(Arc::new(runtime))); None } Err(err) => 
Some(Box::new(SharedRuntimeFFIError::from(err))), } } -/// Clone the `Arc`, incrementing the reference count. +/// Clone the handle, incrementing the `Arc` strong count. /// /// The new handle is independent from the original: either can be freed /// without affecting the other. The underlying runtime is only destroyed @@ -100,29 +104,32 @@ pub unsafe extern "C" fn ddog_shared_runtime_new( /// On success writes the cloned handle into `*out_handle` and returns `None`. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_clone( - handle: Option<&Arc>, - out_handle: NonNull>>, + handle: *const SharedRuntime, + out_handle: NonNull<*const SharedRuntime>, ) -> Option> { - match handle { - Some(arc) => { - out_handle.as_ptr().write(Box::new(arc.clone())); - None - } - None => Some(Box::new(SharedRuntimeFFIError::new( + if handle.is_null() { + return Some(Box::new(SharedRuntimeFFIError::new( SharedRuntimeErrorCode::InvalidArgument, "handle is null", - ))), + ))); } + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. + Arc::increment_strong_count(handle); + out_handle.as_ptr().write(handle); + None } -/// Free a `SharedRuntime` handle, decrementing the `Arc` reference count. +/// Free a handle, decrementing the `Arc` strong count. /// -/// The underlying runtime is only shut down once the last handle is freed. +/// The underlying runtime is only destroyed once the last handle is freed. /// Use [`ddog_shared_runtime_shutdown`] to explicitly stop the runtime and /// all its workers before the last handle is freed. #[no_mangle] -pub unsafe extern "C" fn ddog_shared_runtime_free(handle: Box>) { - drop(handle); +pub unsafe extern "C" fn ddog_shared_runtime_free(handle: *const SharedRuntime) { + if !handle.is_null() { + // SAFETY: handle was produced by Arc::into_raw; this call takes ownership. + drop(Arc::from_raw(handle)); + } } /// Must be called in the parent process before `fork()`. 
@@ -133,19 +140,16 @@ pub unsafe extern "C" fn ddog_shared_runtime_free(handle: Box /// Returns an error if `handle` is null. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_before_fork( - handle: Option<&Arc>, + handle: *const SharedRuntime, ) -> Option> { - let runtime = match handle { - Some(r) => r, - None => { - return Some(Box::new(SharedRuntimeFFIError::new( - SharedRuntimeErrorCode::InvalidArgument, - "handle is null", - ))) - } - }; - - runtime.before_fork(); + if handle.is_null() { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))); + } + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. + (*handle).before_fork(); None } @@ -156,19 +160,16 @@ pub unsafe extern "C" fn ddog_shared_runtime_before_fork( /// Returns `None` on success, or an error if workers could not be restarted. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_after_fork_parent( - handle: Option<&Arc>, + handle: *const SharedRuntime, ) -> Option> { - let runtime = match handle { - Some(r) => r, - None => { - return Some(Box::new(SharedRuntimeFFIError::new( - SharedRuntimeErrorCode::InvalidArgument, - "handle is null", - ))) - } - }; - - match runtime.after_fork_parent() { + if handle.is_null() { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))); + } + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. + match (*handle).after_fork_parent() { Ok(()) => None, Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), } @@ -183,19 +184,16 @@ pub unsafe extern "C" fn ddog_shared_runtime_after_fork_parent( /// reinitialized. 
#[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_after_fork_child( - handle: Option<&Arc>, + handle: *const SharedRuntime, ) -> Option> { - let runtime = match handle { - Some(r) => r, - None => { - return Some(Box::new(SharedRuntimeFFIError::new( - SharedRuntimeErrorCode::InvalidArgument, - "handle is null", - ))) - } - }; - - match runtime.after_fork_child() { + if handle.is_null() { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))); + } + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. + match (*handle).after_fork_child() { Ok(()) => None, Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), } @@ -210,18 +208,15 @@ pub unsafe extern "C" fn ddog_shared_runtime_after_fork_child( /// if the timeout was reached. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_shutdown( - handle: Option<&Arc>, + handle: *const SharedRuntime, timeout_ms: u64, ) -> Option> { - let runtime = match handle { - Some(r) => r, - None => { - return Some(Box::new(SharedRuntimeFFIError::new( - SharedRuntimeErrorCode::InvalidArgument, - "handle is null", - ))) - } - }; + if handle.is_null() { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))); + } let timeout = if timeout_ms > 0 { Some(std::time::Duration::from_millis(timeout_ms)) @@ -229,7 +224,8 @@ pub unsafe extern "C" fn ddog_shared_runtime_shutdown( None }; - match runtime.shutdown(timeout) { + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. + match (*handle).shutdown(timeout) { Ok(()) => None, Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), } @@ -240,11 +236,19 @@ mod tests { use super::*; use std::mem::MaybeUninit; + unsafe fn strong_count(handle: *const SharedRuntime) -> usize { + // Reconstruct the Arc temporarily without dropping it. 
+ let arc = Arc::from_raw(handle); + let count = Arc::strong_count(&arc); + std::mem::forget(arc); + count + } + #[test] fn test_new_and_free() { unsafe { - let mut handle: MaybeUninit>> = MaybeUninit::uninit(); - let err = ddog_shared_runtime_new(NonNull::new_unchecked(&mut handle).cast()); + let mut handle: MaybeUninit<*const SharedRuntime> = MaybeUninit::uninit(); + let err = ddog_shared_runtime_new(NonNull::new_unchecked(handle.as_mut_ptr())); assert!(err.is_none()); ddog_shared_runtime_free(handle.assume_init()); } @@ -253,22 +257,22 @@ mod tests { #[test] fn test_clone() { unsafe { - let mut handle: MaybeUninit>> = MaybeUninit::uninit(); - ddog_shared_runtime_new(NonNull::new_unchecked(&mut handle).cast()); + let mut handle: MaybeUninit<*const SharedRuntime> = MaybeUninit::uninit(); + ddog_shared_runtime_new(NonNull::new_unchecked(handle.as_mut_ptr())); let handle = handle.assume_init(); - let mut cloned: MaybeUninit>> = MaybeUninit::uninit(); + let mut cloned: MaybeUninit<*const SharedRuntime> = MaybeUninit::uninit(); let err = ddog_shared_runtime_clone( - Some(handle.as_ref()), - NonNull::new_unchecked(&mut cloned).cast(), + handle, + NonNull::new_unchecked(cloned.as_mut_ptr()), ); assert!(err.is_none()); - // Both handles should point to the same underlying runtime (strong count == 2). - assert_eq!(Arc::strong_count(handle.as_ref()), 2); + // Both handles point to the same underlying runtime (strong count == 2). 
+ assert_eq!(strong_count(handle), 2); ddog_shared_runtime_free(cloned.assume_init()); - assert_eq!(Arc::strong_count(handle.as_ref()), 1); + assert_eq!(strong_count(handle), 1); ddog_shared_runtime_free(handle); } @@ -277,8 +281,11 @@ mod tests { #[test] fn test_clone_null_handle() { unsafe { - let mut cloned: MaybeUninit>> = MaybeUninit::uninit(); - let err = ddog_shared_runtime_clone(None, NonNull::new_unchecked(&mut cloned).cast()); + let mut cloned: MaybeUninit<*const SharedRuntime> = MaybeUninit::uninit(); + let err = ddog_shared_runtime_clone( + std::ptr::null(), + NonNull::new_unchecked(cloned.as_mut_ptr()), + ); assert!(err.is_some()); assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); } @@ -287,13 +294,13 @@ mod tests { #[test] fn test_before_after_fork_null() { unsafe { - let err = ddog_shared_runtime_before_fork(None); + let err = ddog_shared_runtime_before_fork(std::ptr::null()); assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); - let err = ddog_shared_runtime_after_fork_parent(None); + let err = ddog_shared_runtime_after_fork_parent(std::ptr::null()); assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); - let err = ddog_shared_runtime_after_fork_child(None); + let err = ddog_shared_runtime_after_fork_child(std::ptr::null()); assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); } } @@ -301,14 +308,14 @@ mod tests { #[test] fn test_fork_lifecycle() { unsafe { - let mut handle: MaybeUninit>> = MaybeUninit::uninit(); - ddog_shared_runtime_new(NonNull::new_unchecked(&mut handle).cast()); + let mut handle: MaybeUninit<*const SharedRuntime> = MaybeUninit::uninit(); + ddog_shared_runtime_new(NonNull::new_unchecked(handle.as_mut_ptr())); let handle = handle.assume_init(); - let err = ddog_shared_runtime_before_fork(Some(handle.as_ref())); + let err = ddog_shared_runtime_before_fork(handle); assert!(err.is_none(), "{:?}", err.map(|e| e.code)); - let err = 
ddog_shared_runtime_after_fork_parent(Some(handle.as_ref())); + let err = ddog_shared_runtime_after_fork_parent(handle); assert!(err.is_none(), "{:?}", err.map(|e| e.code)); ddog_shared_runtime_free(handle); @@ -318,11 +325,11 @@ mod tests { #[test] fn test_shutdown() { unsafe { - let mut handle: MaybeUninit>> = MaybeUninit::uninit(); - ddog_shared_runtime_new(NonNull::new_unchecked(&mut handle).cast()); + let mut handle: MaybeUninit<*const SharedRuntime> = MaybeUninit::uninit(); + ddog_shared_runtime_new(NonNull::new_unchecked(handle.as_mut_ptr())); let handle = handle.assume_init(); - let err = ddog_shared_runtime_shutdown(Some(handle.as_ref()), 0); + let err = ddog_shared_runtime_shutdown(handle, 0); assert!(err.is_none()); ddog_shared_runtime_free(handle); From 6420f28e9cf7c653711b2e96d65130fbf35453a6 Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 11 Mar 2026 11:37:03 +0100 Subject: [PATCH 32/80] format --- libdd-data-pipeline-ffi/src/shared_runtime.rs | 6 ++---- libdd-data-pipeline/src/agent_info/fetcher.rs | 3 ++- libdd-data-pipeline/src/trace_exporter/builder.rs | 2 +- libdd-data-pipeline/src/trace_exporter/mod.rs | 10 ++++++---- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/libdd-data-pipeline-ffi/src/shared_runtime.rs b/libdd-data-pipeline-ffi/src/shared_runtime.rs index ff817afcfc..220cebf50d 100644 --- a/libdd-data-pipeline-ffi/src/shared_runtime.rs +++ b/libdd-data-pipeline-ffi/src/shared_runtime.rs @@ -262,10 +262,8 @@ mod tests { let handle = handle.assume_init(); let mut cloned: MaybeUninit<*const SharedRuntime> = MaybeUninit::uninit(); - let err = ddog_shared_runtime_clone( - handle, - NonNull::new_unchecked(cloned.as_mut_ptr()), - ); + let err = + ddog_shared_runtime_clone(handle, NonNull::new_unchecked(cloned.as_mut_ptr())); assert!(err.is_none()); // Both handles point to the same underlying runtime (strong count == 2). 
diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index cfe3c00af7..773496afa7 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -100,7 +100,8 @@ async fn fetch_and_hash_response(info_endpoint: &Endpoint) -> Result<(String, by /// /// This type implements [`libdd_common::worker::Worker`] and is intended to be driven by a worker /// runner such as [`crate::shared_runtime::SharedRuntime`]. -/// In that lifecycle, `trigger()` waits for the next refresh event and `run()` performs a single fetch. +/// In that lifecycle, `trigger()` waits for the next refresh event and `run()` performs a single +/// fetch. /// /// You can access the current state with [`crate::agent_info::get_agent_info`]. /// diff --git a/libdd-data-pipeline/src/trace_exporter/builder.rs b/libdd-data-pipeline/src/trace_exporter/builder.rs index 60f0b83c35..e68851c332 100644 --- a/libdd-data-pipeline/src/trace_exporter/builder.rs +++ b/libdd-data-pipeline/src/trace_exporter/builder.rs @@ -8,7 +8,7 @@ use crate::trace_exporter::agent_response::AgentResponsePayloadVersion; use crate::trace_exporter::error::BuilderErrorKind; use crate::trace_exporter::{ add_path, StatsComputationStatus, TelemetryConfig, TraceExporter, TraceExporterError, - TraceExporterInputFormat, TraceExporterOutputFormat, TracerMetadata, TraceExporterWorkers, + TraceExporterInputFormat, TraceExporterOutputFormat, TraceExporterWorkers, TracerMetadata, INFO_ENDPOINT, }; use arc_swap::ArcSwap; diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index bd10d7a6c9..d7c20b4805 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -20,7 +20,9 @@ use crate::telemetry::{SendPayloadTelemetry, TelemetryClient}; use crate::trace_exporter::agent_response::{ AgentResponsePayloadVersion, 
DATADOG_RATES_PAYLOAD_VERSION_HEADER, }; -use crate::trace_exporter::error::{InternalErrorKind, RequestError, ShutdownError, TraceExporterError}; +use crate::trace_exporter::error::{ + InternalErrorKind, RequestError, ShutdownError, TraceExporterError, +}; use crate::{ agent_info::{self, schema::AgentInfo}, health_metrics, @@ -219,9 +221,9 @@ impl TraceExporter { pub fn shutdown(self, timeout: Option) -> Result<(), TraceExporterError> { let runtime = self.runtime()?; if let Some(timeout) = timeout { - match runtime.block_on(async { - tokio::time::timeout(timeout, self.shutdown_workers()).await - }) { + match runtime + .block_on(async { tokio::time::timeout(timeout, self.shutdown_workers()).await }) + { Ok(()) => Ok(()), Err(_) => Err(TraceExporterError::Shutdown(ShutdownError::TimedOut( timeout, From 72f61f63f333d364e175519a8abe8aa761d2ad61 Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 11 Mar 2026 13:54:36 +0100 Subject: [PATCH 33/80] feat(runtime-ffi): use new handle in trace exporter builder --- libdd-data-pipeline-ffi/src/shared_runtime.rs | 76 +------------------ libdd-data-pipeline-ffi/src/trace_exporter.rs | 18 +++-- libdd-data-pipeline/src/trace_exporter/mod.rs | 2 + 3 files changed, 17 insertions(+), 79 deletions(-) diff --git a/libdd-data-pipeline-ffi/src/shared_runtime.rs b/libdd-data-pipeline-ffi/src/shared_runtime.rs index 220cebf50d..917f60dffa 100644 --- a/libdd-data-pipeline-ffi/src/shared_runtime.rs +++ b/libdd-data-pipeline-ffi/src/shared_runtime.rs @@ -76,10 +76,6 @@ pub unsafe extern "C" fn ddog_shared_runtime_error_free(error: Option, -) -> Option> { - if handle.is_null() { - return Some(Box::new(SharedRuntimeFFIError::new( - SharedRuntimeErrorCode::InvalidArgument, - "handle is null", - ))); - } - // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. - Arc::increment_strong_count(handle); - out_handle.as_ptr().write(handle); - None -} - /// Free a handle, decrementing the `Arc` strong count. 
/// -/// The underlying runtime is only destroyed once the last handle is freed. -/// Use [`ddog_shared_runtime_shutdown`] to explicitly stop the runtime and -/// all its workers before the last handle is freed. +/// The underlying runtime may not be dropped if other components are still using it. +/// Use [`ddog_shared_runtime_shutdown`] to cleanly stop workers. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_free(handle: *const SharedRuntime) { if !handle.is_null() { @@ -236,14 +207,6 @@ mod tests { use super::*; use std::mem::MaybeUninit; - unsafe fn strong_count(handle: *const SharedRuntime) -> usize { - // Reconstruct the Arc temporarily without dropping it. - let arc = Arc::from_raw(handle); - let count = Arc::strong_count(&arc); - std::mem::forget(arc); - count - } - #[test] fn test_new_and_free() { unsafe { @@ -254,41 +217,6 @@ mod tests { } } - #[test] - fn test_clone() { - unsafe { - let mut handle: MaybeUninit<*const SharedRuntime> = MaybeUninit::uninit(); - ddog_shared_runtime_new(NonNull::new_unchecked(handle.as_mut_ptr())); - let handle = handle.assume_init(); - - let mut cloned: MaybeUninit<*const SharedRuntime> = MaybeUninit::uninit(); - let err = - ddog_shared_runtime_clone(handle, NonNull::new_unchecked(cloned.as_mut_ptr())); - assert!(err.is_none()); - - // Both handles point to the same underlying runtime (strong count == 2). 
- assert_eq!(strong_count(handle), 2); - - ddog_shared_runtime_free(cloned.assume_init()); - assert_eq!(strong_count(handle), 1); - - ddog_shared_runtime_free(handle); - } - } - - #[test] - fn test_clone_null_handle() { - unsafe { - let mut cloned: MaybeUninit<*const SharedRuntime> = MaybeUninit::uninit(); - let err = ddog_shared_runtime_clone( - std::ptr::null(), - NonNull::new_unchecked(cloned.as_mut_ptr()), - ); - assert!(err.is_some()); - assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); - } - } - #[test] fn test_before_after_fork_null() { unsafe { diff --git a/libdd-data-pipeline-ffi/src/trace_exporter.rs b/libdd-data-pipeline-ffi/src/trace_exporter.rs index 1703fb6a8d..ce26d11051 100644 --- a/libdd-data-pipeline-ffi/src/trace_exporter.rs +++ b/libdd-data-pipeline-ffi/src/trace_exporter.rs @@ -396,18 +396,26 @@ pub unsafe extern "C" fn ddog_trace_exporter_config_set_connection_timeout( /// Sets a shared runtime for the TraceExporter to use for background workers. /// +/// `handle` must have been initialized with [`ddog_shared_runtime_new`]. +/// /// When set, the exporter will use the provided runtime instead of creating its own. /// This allows multiple exporters (or other components) to share a single runtime. -/// The config holds a clone of the `Arc`, so the original handle remains valid. +/// The config holds a clone of the `Arc` (increments the strong count), so the +/// original handle remains valid and must still be freed with +/// [`ddog_shared_runtime_free`]. #[no_mangle] pub unsafe extern "C" fn ddog_trace_exporter_config_set_shared_runtime( config: Option<&mut TraceExporterConfig>, - handle: Option<&Arc>, + handle: *const SharedRuntime, ) -> Option> { catch_panic!( - match (config, handle) { - (Some(config), Some(runtime)) => { - config.shared_runtime = Some(runtime.clone()); + match config { + Some(config) if !handle.is_null() => { + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. 
+ // Increment the strong count before reconstructing so the config's Arc + // is independent from the caller's handle. + Arc::increment_strong_count(handle); + config.shared_runtime = Some(Arc::from_raw(handle)); None } _ => gen_error!(ErrorCode::InvalidArgument), diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index d7c20b4805..550862bb28 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -43,6 +43,8 @@ use libdd_trace_utils::span::{v04::Span, TraceData}; use libdd_trace_utils::trace_utils::TracerHeaderTags; use std::io; use std::sync::Arc; +#[cfg(feature = "test-utils")] +use std::time::Duration; use std::{borrow::Borrow, collections::HashMap, str::FromStr}; use tokio::runtime::Runtime; use tokio::task::JoinSet; From 7bac7195600d48b28f862d07550e03c1635243de Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 13 Mar 2026 13:58:51 +0100 Subject: [PATCH 34/80] chore: catch panics on ffi --- libdd-data-pipeline-ffi/src/lib.rs | 6 +- libdd-data-pipeline-ffi/src/shared_runtime.rs | 159 +++++++++++------- libdd-data-pipeline-ffi/src/trace_exporter.rs | 2 +- 3 files changed, 104 insertions(+), 63 deletions(-) diff --git a/libdd-data-pipeline-ffi/src/lib.rs b/libdd-data-pipeline-ffi/src/lib.rs index c1c71bad49..a276ac260b 100644 --- a/libdd-data-pipeline-ffi/src/lib.rs +++ b/libdd-data-pipeline-ffi/src/lib.rs @@ -18,11 +18,11 @@ macro_rules! 
catch_panic { Ok(ret) => ret, Err(info) => { if let Some(s) = info.downcast_ref::() { - error!(error = %ErrorCode::Panic, s); + tracing::error!("panic: {}", s); } else if let Some(s) = info.downcast_ref::<&str>() { - error!(error = %ErrorCode::Panic, s); + tracing::error!("panic: {}", s); } else { - error!(error = %ErrorCode::Panic, "Unable to retrieve panic context"); + tracing::error!("panic: unable to retrieve panic context"); } $err } diff --git a/libdd-data-pipeline-ffi/src/shared_runtime.rs b/libdd-data-pipeline-ffi/src/shared_runtime.rs index 917f60dffa..5e98cd6c52 100644 --- a/libdd-data-pipeline-ffi/src/shared_runtime.rs +++ b/libdd-data-pipeline-ffi/src/shared_runtime.rs @@ -1,6 +1,7 @@ // Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 +use crate::catch_panic; use libdd_data_pipeline::shared_runtime::{SharedRuntime, SharedRuntimeError}; use std::ffi::{c_char, CString}; use std::ptr::NonNull; @@ -22,6 +23,9 @@ pub enum SharedRuntimeErrorCode { RuntimeCreation, /// Shutdown timed out. ShutdownTimedOut, + /// An unexpected panic occurred inside the FFI call. + #[cfg(feature = "catch_panic")] + Panic, } /// Error returned by SharedRuntime FFI functions. @@ -65,10 +69,19 @@ impl Drop for SharedRuntimeFFIError { } } +macro_rules! panic_error { + () => { + Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::Panic, + "panic", + ))) + }; +} + /// Frees a `SharedRuntimeFFIError`. After this call the pointer is invalid. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_error_free(error: Option>) { - drop(error); + catch_panic!(drop(error), ()) } /// Create a new `SharedRuntime`. 
@@ -82,13 +95,16 @@ pub unsafe extern "C" fn ddog_shared_runtime_error_free(error: Option, ) -> Option> { - match SharedRuntime::new() { - Ok(runtime) => { - out_handle.as_ptr().write(Arc::into_raw(Arc::new(runtime))); - None - } - Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), - } + catch_panic!( + match SharedRuntime::new() { + Ok(runtime) => { + out_handle.as_ptr().write(Arc::into_raw(Arc::new(runtime))); + None + } + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + }, + panic_error!() + ) } /// Free a handle, decrementing the `Arc` strong count. @@ -97,10 +113,15 @@ pub unsafe extern "C" fn ddog_shared_runtime_new( /// Use [`ddog_shared_runtime_shutdown`] to cleanly stop workers. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_free(handle: *const SharedRuntime) { - if !handle.is_null() { - // SAFETY: handle was produced by Arc::into_raw; this call takes ownership. - drop(Arc::from_raw(handle)); - } + catch_panic!( + { + if !handle.is_null() { + // SAFETY: handle was produced by Arc::into_raw; this call takes ownership. + drop(Arc::from_raw(handle)); + } + }, + () + ) } /// Must be called in the parent process before `fork()`. @@ -113,15 +134,20 @@ pub unsafe extern "C" fn ddog_shared_runtime_free(handle: *const SharedRuntime) pub unsafe extern "C" fn ddog_shared_runtime_before_fork( handle: *const SharedRuntime, ) -> Option> { - if handle.is_null() { - return Some(Box::new(SharedRuntimeFFIError::new( - SharedRuntimeErrorCode::InvalidArgument, - "handle is null", - ))); - } - // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. - (*handle).before_fork(); - None + catch_panic!( + { + if handle.is_null() { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))); + } + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. 
+ (*handle).before_fork(); + None + }, + panic_error!() + ) } /// Must be called in the parent process after `fork()`. @@ -133,17 +159,22 @@ pub unsafe extern "C" fn ddog_shared_runtime_before_fork( pub unsafe extern "C" fn ddog_shared_runtime_after_fork_parent( handle: *const SharedRuntime, ) -> Option> { - if handle.is_null() { - return Some(Box::new(SharedRuntimeFFIError::new( - SharedRuntimeErrorCode::InvalidArgument, - "handle is null", - ))); - } - // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. - match (*handle).after_fork_parent() { - Ok(()) => None, - Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), - } + catch_panic!( + { + if handle.is_null() { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))); + } + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. + match (*handle).after_fork_parent() { + Ok(()) => None, + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + } + }, + panic_error!() + ) } /// Must be called in the child process after `fork()`. @@ -157,17 +188,22 @@ pub unsafe extern "C" fn ddog_shared_runtime_after_fork_parent( pub unsafe extern "C" fn ddog_shared_runtime_after_fork_child( handle: *const SharedRuntime, ) -> Option> { - if handle.is_null() { - return Some(Box::new(SharedRuntimeFFIError::new( - SharedRuntimeErrorCode::InvalidArgument, - "handle is null", - ))); - } - // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. - match (*handle).after_fork_child() { - Ok(()) => None, - Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), - } + catch_panic!( + { + if handle.is_null() { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))); + } + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. 
+ match (*handle).after_fork_child() { + Ok(()) => None, + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + } + }, + panic_error!() + ) } /// Shut down the `SharedRuntime`, stopping all workers. @@ -182,24 +218,29 @@ pub unsafe extern "C" fn ddog_shared_runtime_shutdown( handle: *const SharedRuntime, timeout_ms: u64, ) -> Option> { - if handle.is_null() { - return Some(Box::new(SharedRuntimeFFIError::new( - SharedRuntimeErrorCode::InvalidArgument, - "handle is null", - ))); - } + catch_panic!( + { + if handle.is_null() { + return Some(Box::new(SharedRuntimeFFIError::new( + SharedRuntimeErrorCode::InvalidArgument, + "handle is null", + ))); + } - let timeout = if timeout_ms > 0 { - Some(std::time::Duration::from_millis(timeout_ms)) - } else { - None - }; + let timeout = if timeout_ms > 0 { + Some(std::time::Duration::from_millis(timeout_ms)) + } else { + None + }; - // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. - match (*handle).shutdown(timeout) { - Ok(()) => None, - Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), - } + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. 
+ match (*handle).shutdown(timeout) { + Ok(()) => None, + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + } + }, + panic_error!() + ) } #[cfg(test)] diff --git a/libdd-data-pipeline-ffi/src/trace_exporter.rs b/libdd-data-pipeline-ffi/src/trace_exporter.rs index ce26d11051..40c1ba717b 100644 --- a/libdd-data-pipeline-ffi/src/trace_exporter.rs +++ b/libdd-data-pipeline-ffi/src/trace_exporter.rs @@ -13,7 +13,7 @@ use libdd_data_pipeline::trace_exporter::{ TelemetryConfig, TraceExporter, TraceExporterInputFormat, TraceExporterOutputFormat, }; use std::{ptr::NonNull, sync::Arc, time::Duration}; -use tracing::{debug, error}; +use tracing::debug; #[inline] fn sanitize_string(str: CharSlice) -> Result> { From fc1c81b6d9e1f03e9d7ee2a784016f449385e5f5 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 16 Mar 2026 11:22:58 +0100 Subject: [PATCH 35/80] feat(telemetry): add check for existing action --- libdd-telemetry/src/worker/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index b453d4b2ce..e2d73d4f52 100644 --- a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -162,6 +162,10 @@ impl Debug for TelemetryWorker { #[async_trait] impl Worker for TelemetryWorker { async fn trigger(&mut self) { + if self.next_action.is_some() { + // An action is already available and hasn't been executed + return; + } // Wait for the next action and store it let action = self.recv_next_action().await; self.next_action = Some(action); From 57de7d4caa86ec0b756b8baa4eebd23dbc32eb64 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 20 Mar 2026 15:52:24 +0100 Subject: [PATCH 36/80] refactor: move shared runtime to a separate crate --- Cargo.lock | 24 +++++++++++- Cargo.toml | 2 + LICENSE-3rdparty.yml | 2 +- libdd-common/src/lib.rs | 1 - libdd-data-pipeline-ffi/Cargo.toml | 1 + libdd-data-pipeline-ffi/cbindgen.toml | 2 +- libdd-data-pipeline-ffi/src/lib.rs | 1 - 
libdd-data-pipeline-ffi/src/trace_exporter.rs | 2 +- libdd-data-pipeline/Cargo.toml | 2 +- libdd-data-pipeline/src/agent_info/fetcher.rs | 13 ++++--- libdd-data-pipeline/src/lib.rs | 2 - libdd-data-pipeline/src/stats_exporter.rs | 5 ++- libdd-data-pipeline/src/telemetry/mod.rs | 2 +- .../src/trace_exporter/builder.rs | 2 +- libdd-data-pipeline/src/trace_exporter/mod.rs | 2 +- .../src/trace_exporter/stats.rs | 2 +- libdd-shared-runtime-ffi/Cargo.toml | 26 +++++++++++++ libdd-shared-runtime-ffi/build.rs | 11 ++++++ libdd-shared-runtime-ffi/cbindgen.toml | 28 ++++++++++++++ libdd-shared-runtime-ffi/src/lib.rs | 37 +++++++++++++++++++ .../src/shared_runtime.rs | 7 +++- libdd-shared-runtime/Cargo.toml | 26 +++++++++++++ libdd-shared-runtime/src/lib.rs | 14 +++++++ .../src/worker/mod.rs | 3 ++ .../src/worker}/pausable_worker.rs | 4 +- .../src/worker}/shared_runtime.rs | 7 +++- libdd-telemetry/Cargo.toml | 1 + libdd-telemetry/src/worker/mod.rs | 5 ++- tools/docker/Dockerfile.build | 2 + 29 files changed, 208 insertions(+), 28 deletions(-) create mode 100644 libdd-shared-runtime-ffi/Cargo.toml create mode 100644 libdd-shared-runtime-ffi/build.rs create mode 100644 libdd-shared-runtime-ffi/cbindgen.toml create mode 100644 libdd-shared-runtime-ffi/src/lib.rs rename {libdd-data-pipeline-ffi => libdd-shared-runtime-ffi}/src/shared_runtime.rs (95%) create mode 100644 libdd-shared-runtime/Cargo.toml create mode 100644 libdd-shared-runtime/src/lib.rs rename libdd-common/src/worker.rs => libdd-shared-runtime/src/worker/mod.rs (97%) rename {libdd-data-pipeline/src => libdd-shared-runtime/src/worker}/pausable_worker.rs (98%) rename {libdd-data-pipeline/src => libdd-shared-runtime/src/worker}/shared_runtime.rs (99%) diff --git a/Cargo.lock b/Cargo.lock index 8bd066a7c2..9c83859b3d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3107,6 +3107,7 @@ dependencies = [ "libdd-ddsketch", "libdd-dogstatsd-client", "libdd-log", + "libdd-shared-runtime", "libdd-telemetry", "libdd-tinybytes", 
"libdd-trace-protobuf", @@ -3120,7 +3121,6 @@ dependencies = [ "sha2", "tempfile", "tokio", - "tokio-util", "tracing", "uuid", ] @@ -3133,6 +3133,7 @@ dependencies = [ "httpmock", "libdd-common-ffi", "libdd-data-pipeline", + "libdd-shared-runtime", "libdd-tinybytes", "libdd-trace-utils", "rmp-serde", @@ -3313,6 +3314,26 @@ dependencies = [ "prost", ] +[[package]] +name = "libdd-shared-runtime" +version = "28.0.3" +dependencies = [ + "async-trait", + "libdd-common", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "libdd-shared-runtime-ffi" +version = "28.0.3" +dependencies = [ + "build_common", + "libdd-shared-runtime", + "tracing", +] + [[package]] name = "libdd-telemetry" version = "3.0.0" @@ -3327,6 +3348,7 @@ dependencies = [ "libc", "libdd-common", "libdd-ddsketch", + "libdd-shared-runtime", "serde", "serde_json", "sys-info", diff --git a/Cargo.toml b/Cargo.toml index 6009b61bad..b50571416b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,8 @@ members = [ "spawn_worker", "tests/spawn_from_lib", "bin_tests", + "libdd-shared-runtime", + "libdd-shared-runtime-ffi", "libdd-data-pipeline", "libdd-data-pipeline-ffi", "libdd-ddsketch", diff --git a/LICENSE-3rdparty.yml b/LICENSE-3rdparty.yml index db7415cf26..b0bccd7c2e 100644 --- a/LICENSE-3rdparty.yml +++ b/LICENSE-3rdparty.yml @@ -1,4 +1,4 @@ -root_name: builder, build_common, tools, libdd-alloc, libdd-crashtracker, libdd-common, libdd-telemetry, libdd-ddsketch, libdd-libunwind-sys, libdd-crashtracker-ffi, libdd-common-ffi, datadog-ffe, datadog-ffe-ffi, datadog-ipc, datadog-ipc-macros, libdd-tinybytes, tarpc, tarpc-plugins, spawn_worker, cc_utils, libdd-library-config, libdd-trace-protobuf, libdd-library-config-ffi, datadog-live-debugger, libdd-data-pipeline, libdd-dogstatsd-client, libdd-trace-stats, libdd-trace-utils, libdd-trace-normalization, libdd-log, datadog-live-debugger-ffi, libdd-profiling, libdd-profiling-protobuf, libdd-profiling-ffi, libdd-data-pipeline-ffi, libdd-ddsketch-ffi, 
libdd-log-ffi, libdd-telemetry-ffi, symbolizer-ffi, datadog-profiling-replayer, datadog-remote-config, datadog-sidecar, datadog-sidecar-macros, datadog-sidecar-ffi, libdd-trace-obfuscation, datadog-tracer-flare, sidecar_mockgen, test_spawn_from_lib +root_name: builder, build_common, tools, libdd-alloc, libdd-crashtracker, libdd-common, libdd-telemetry, libdd-ddsketch, libdd-libunwind-sys, libdd-crashtracker-ffi, libdd-common-ffi, datadog-ffe, datadog-ffe-ffi, datadog-ipc, datadog-ipc-macros, libdd-tinybytes, tarpc, tarpc-plugins, spawn_worker, cc_utils, libdd-library-config, libdd-trace-protobuf, libdd-library-config-ffi, datadog-live-debugger, libdd-data-pipeline, libdd-dogstatsd-client, libdd-trace-stats, libdd-trace-utils, libdd-trace-normalization, libdd-log, datadog-live-debugger-ffi, libdd-profiling, libdd-profiling-protobuf, libdd-profiling-ffi, libdd-data-pipeline-ffi, libdd-ddsketch-ffi, libdd-log-ffi, libdd-telemetry-ffi, symbolizer-ffi, datadog-profiling-replayer, datadog-remote-config, datadog-sidecar, datadog-sidecar-macros, datadog-sidecar-ffi, libdd-trace-obfuscation, datadog-tracer-flare, sidecar_mockgen, test_spawn_from_lib, libdd-shared-runtime, libdd-shared-runtime-ffi third_party_libraries: - package_name: addr2line package_version: 0.24.2 diff --git a/libdd-common/src/lib.rs b/libdd-common/src/lib.rs index 0d841543fe..6b0dcb58b7 100644 --- a/libdd-common/src/lib.rs +++ b/libdd-common/src/lib.rs @@ -31,7 +31,6 @@ pub mod test_utils; pub mod threading; pub mod timeout; pub mod unix_utils; -pub mod worker; /// Extension trait for `Mutex` to provide a method that acquires a lock, panicking if the lock is /// poisoned. 
diff --git a/libdd-data-pipeline-ffi/Cargo.toml b/libdd-data-pipeline-ffi/Cargo.toml index cb61cbea65..dbf766d471 100644 --- a/libdd-data-pipeline-ffi/Cargo.toml +++ b/libdd-data-pipeline-ffi/Cargo.toml @@ -30,6 +30,7 @@ libdd-trace-utils = { path = "../libdd-trace-utils" } [dependencies] libdd-data-pipeline = { path = "../libdd-data-pipeline" } +libdd-shared-runtime = { path = "../libdd-shared-runtime" } libdd-common-ffi = { path = "../libdd-common-ffi", default-features = false } libdd-tinybytes = { path = "../libdd-tinybytes" } tracing = { version = "0.1", default-features = false } diff --git a/libdd-data-pipeline-ffi/cbindgen.toml b/libdd-data-pipeline-ffi/cbindgen.toml index 34ace82fbb..56f952de39 100644 --- a/libdd-data-pipeline-ffi/cbindgen.toml +++ b/libdd-data-pipeline-ffi/cbindgen.toml @@ -36,4 +36,4 @@ must_use = "DDOG_CHECK_RETURN" [parse] parse_deps = true -include = ["libdd-common", "libdd-common-ffi", "libdd-data-pipeline"] +include = ["libdd-common", "libdd-common-ffi", "libdd-shared-runtime", "libdd-data-pipeline"] diff --git a/libdd-data-pipeline-ffi/src/lib.rs b/libdd-data-pipeline-ffi/src/lib.rs index a276ac260b..894b03845d 100644 --- a/libdd-data-pipeline-ffi/src/lib.rs +++ b/libdd-data-pipeline-ffi/src/lib.rs @@ -8,7 +8,6 @@ mod error; mod response; -mod shared_runtime; mod trace_exporter; #[cfg(all(feature = "catch_panic", panic = "unwind"))] diff --git a/libdd-data-pipeline-ffi/src/trace_exporter.rs b/libdd-data-pipeline-ffi/src/trace_exporter.rs index 40c1ba717b..2a926b590a 100644 --- a/libdd-data-pipeline-ffi/src/trace_exporter.rs +++ b/libdd-data-pipeline-ffi/src/trace_exporter.rs @@ -8,7 +8,7 @@ use libdd_common_ffi::{ CharSlice, {slice::AsBytes, slice::ByteSlice}, }; -use libdd_data_pipeline::shared_runtime::SharedRuntime; +use libdd_shared_runtime::SharedRuntime; use libdd_data_pipeline::trace_exporter::{ TelemetryConfig, TraceExporter, TraceExporterInputFormat, TraceExporterOutputFormat, }; diff --git a/libdd-data-pipeline/Cargo.toml 
b/libdd-data-pipeline/Cargo.toml index c513c4c5a5..7f764a97b9 100644 --- a/libdd-data-pipeline/Cargo.toml +++ b/libdd-data-pipeline/Cargo.toml @@ -30,8 +30,8 @@ tokio = { version = "1.23", features = [ "time", ], default-features = false } uuid = { version = "1.10.0", features = ["v4"] } -tokio-util = "0.7.11" libdd-common = { version = "2.0.0", path = "../libdd-common", default-features = false } +libdd-shared-runtime = { path = "../libdd-shared-runtime" } libdd-telemetry = { version = "3.0.0", path = "../libdd-telemetry", default-features = false } libdd-trace-protobuf = { version = "1.1.0", path = "../libdd-trace-protobuf" } libdd-trace-stats = { version = "1.0.1", path = "../libdd-trace-stats" } diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index 773496afa7..b4359064f8 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -8,7 +8,8 @@ use anyhow::{anyhow, Result}; use async_trait::async_trait; use http::header::HeaderName; use http_body_util::BodyExt; -use libdd_common::{http_common, worker::Worker, Endpoint}; +use libdd_common::{http_common, Endpoint}; +use libdd_shared_runtime::Worker; use sha2::{Digest, Sha256}; use std::sync::Arc; use std::time::Duration; @@ -98,8 +99,8 @@ async fn fetch_and_hash_response(info_endpoint: &Endpoint) -> Result<(String, by /// Fetch the info endpoint and update an ArcSwap keeping it up-to-date. /// -/// This type implements [`libdd_common::worker::Worker`] and is intended to be driven by a worker -/// runner such as [`crate::shared_runtime::SharedRuntime`]. +/// This type implements [`libdd_shared_runtime::Worker`] and is intended to be driven by a worker +/// runner such as [`libdd_shared_runtime::SharedRuntime`]. /// In that lifecycle, `trigger()` waits for the next refresh event and `run()` performs a single /// fetch. 
/// @@ -113,7 +114,7 @@ async fn fetch_and_hash_response(info_endpoint: &Endpoint) -> Result<(String, by /// # Example /// ```no_run /// # use anyhow::Result; -/// # use libdd_common::worker::Worker; +/// # use libdd_shared_runtime::Worker; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// // Define the endpoint @@ -125,7 +126,7 @@ async fn fetch_and_hash_response(info_endpoint: &Endpoint) -> Result<(String, by /// std::time::Duration::from_secs(5 * 60), /// ); /// // Start the fetcher on a shared runtime -/// let runtime = libdd_data_pipeline::shared_runtime::SharedRuntime::new()?; +/// let runtime = libdd_shared_runtime::SharedRuntime::new()?; /// runtime.spawn_worker(fetcher)?; /// /// // Get the Arc to access the info @@ -303,7 +304,7 @@ impl ResponseObserver { mod single_threaded_tests { use super::*; use crate::agent_info; - use crate::shared_runtime::SharedRuntime; + use libdd_shared_runtime::SharedRuntime; use httpmock::prelude::*; const TEST_INFO: &str = r#"{ diff --git a/libdd-data-pipeline/src/lib.rs b/libdd-data-pipeline/src/lib.rs index c059939a55..17df4ba61a 100644 --- a/libdd-data-pipeline/src/lib.rs +++ b/libdd-data-pipeline/src/lib.rs @@ -12,8 +12,6 @@ pub mod agent_info; mod health_metrics; -mod pausable_worker; -pub mod shared_runtime; #[allow(missing_docs)] pub mod stats_exporter; pub(crate) mod telemetry; diff --git a/libdd-data-pipeline/src/stats_exporter.rs b/libdd-data-pipeline/src/stats_exporter.rs index 4dc67e2f01..759e8b45a5 100644 --- a/libdd-data-pipeline/src/stats_exporter.rs +++ b/libdd-data-pipeline/src/stats_exporter.rs @@ -13,7 +13,8 @@ use std::{ use crate::trace_exporter::TracerMetadata; use async_trait::async_trait; -use libdd_common::{worker::Worker, Endpoint, HttpClient}; +use libdd_common::{Endpoint, HttpClient}; +use libdd_shared_runtime::Worker; use libdd_trace_protobuf::pb; use libdd_trace_stats::span_concentrator::SpanConcentrator; use libdd_trace_utils::send_with_retry::{send_with_retry, RetryStrategy}; @@ 
-181,7 +182,7 @@ pub fn stats_url_from_agent_url(agent_url: &str) -> anyhow::Result { #[cfg(test)] mod tests { use super::*; - use crate::shared_runtime::SharedRuntime; + use libdd_shared_runtime::SharedRuntime; use httpmock::prelude::*; use httpmock::MockServer; use libdd_common::http_common::new_default_client; diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index 0ec7d1817f..9700382624 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -310,7 +310,7 @@ mod tests { use tokio::time::sleep; use super::*; - use crate::shared_runtime::{SharedRuntime, WorkerHandle}; + use libdd_shared_runtime::{SharedRuntime, WorkerHandle}; fn get_test_client(url: &str, runtime: &SharedRuntime) -> (TelemetryClient, WorkerHandle) { let (client, worker) = TelemetryClientBuilder::default() diff --git a/libdd-data-pipeline/src/trace_exporter/builder.rs b/libdd-data-pipeline/src/trace_exporter/builder.rs index e68851c332..47266c60cb 100644 --- a/libdd-data-pipeline/src/trace_exporter/builder.rs +++ b/libdd-data-pipeline/src/trace_exporter/builder.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use crate::agent_info::AgentInfoFetcher; -use crate::shared_runtime::SharedRuntime; +use libdd_shared_runtime::SharedRuntime; use crate::telemetry::TelemetryClientBuilder; use crate::trace_exporter::agent_response::AgentResponsePayloadVersion; use crate::trace_exporter::error::BuilderErrorKind; diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 550862bb28..8a5f88cdf0 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -15,7 +15,7 @@ use self::metrics::MetricsEmitter; use self::stats::StatsComputationStatus; use self::trace_serializer::TraceSerializer; use crate::agent_info::ResponseObserver; -use crate::shared_runtime::{SharedRuntime, WorkerHandle}; +use 
libdd_shared_runtime::{SharedRuntime, WorkerHandle}; use crate::telemetry::{SendPayloadTelemetry, TelemetryClient}; use crate::trace_exporter::agent_response::{ AgentResponsePayloadVersion, DATADOG_RATES_PAYLOAD_VERSION_HEADER, diff --git a/libdd-data-pipeline/src/trace_exporter/stats.rs b/libdd-data-pipeline/src/trace_exporter/stats.rs index b33a014c1d..ba042c845f 100644 --- a/libdd-data-pipeline/src/trace_exporter/stats.rs +++ b/libdd-data-pipeline/src/trace_exporter/stats.rs @@ -8,7 +8,7 @@ //! and processing traces for stats collection. use crate::agent_info::schema::AgentInfo; -use crate::shared_runtime::{SharedRuntime, WorkerHandle}; +use libdd_shared_runtime::{SharedRuntime, WorkerHandle}; use crate::stats_exporter; use arc_swap::ArcSwap; use libdd_common::{Endpoint, HttpClient, MutexExt}; diff --git a/libdd-shared-runtime-ffi/Cargo.toml b/libdd-shared-runtime-ffi/Cargo.toml new file mode 100644 index 0000000000..2b31805dc1 --- /dev/null +++ b/libdd-shared-runtime-ffi/Cargo.toml @@ -0,0 +1,26 @@ +# Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ +# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "libdd-shared-runtime-ffi" +version.workspace = true +edition.workspace = true +rust-version.workspace = true +license.workspace = true +publish = false + +[lib] +crate-type = ["lib", "staticlib", "cdylib"] +bench = false + +[features] +default = ["cbindgen", "catch_panic"] +catch_panic = [] +cbindgen = ["build_common/cbindgen"] + +[build-dependencies] +build_common = { path = "../build-common" } + +[dependencies] +libdd-shared-runtime = { path = "../libdd-shared-runtime" } +tracing = { version = "0.1", default-features = false } diff --git a/libdd-shared-runtime-ffi/build.rs b/libdd-shared-runtime-ffi/build.rs new file mode 100644 index 0000000000..5cefa15d31 --- /dev/null +++ b/libdd-shared-runtime-ffi/build.rs @@ -0,0 +1,11 @@ +// Copyright 2025-Present Datadog, Inc. 
https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 + +extern crate build_common; + +use build_common::generate_and_configure_header; + +fn main() { + let header_name = "shared-runtime.h"; + generate_and_configure_header(header_name); +} diff --git a/libdd-shared-runtime-ffi/cbindgen.toml b/libdd-shared-runtime-ffi/cbindgen.toml new file mode 100644 index 0000000000..f294158074 --- /dev/null +++ b/libdd-shared-runtime-ffi/cbindgen.toml @@ -0,0 +1,28 @@ +# Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ +# SPDX-License-Identifier: Apache-2.0 + +language = "C" +cpp_compat = true +tab_width = 2 +header = """// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 +""" +include_guard = "DDOG_SHARED_RUNTIME_H" + +[export] +prefix = "ddog_" +renaming_overrides_prefixing = true + +[export.mangle] +rename_types = "PascalCase" + +[enum] +prefix_with_name = true +rename_variants = "ScreamingSnakeCase" + +[fn] +must_use = "DDOG_CHECK_RETURN" + +[parse] +parse_deps = true +include = ["libdd-shared-runtime"] diff --git a/libdd-shared-runtime-ffi/src/lib.rs b/libdd-shared-runtime-ffi/src/lib.rs new file mode 100644 index 0000000000..3387a8d03c --- /dev/null +++ b/libdd-shared-runtime-ffi/src/lib.rs @@ -0,0 +1,37 @@ +// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 +#![cfg_attr(not(test), deny(clippy::panic))] +#![cfg_attr(not(test), deny(clippy::unwrap_used))] +#![cfg_attr(not(test), deny(clippy::expect_used))] +#![cfg_attr(not(test), deny(clippy::todo))] +#![cfg_attr(not(test), deny(clippy::unimplemented))] + +mod shared_runtime; + +#[cfg(all(feature = "catch_panic", panic = "unwind"))] +macro_rules! 
catch_panic { + ($f:expr, $err:expr) => { + match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| $f)) { + Ok(ret) => ret, + Err(info) => { + if let Some(s) = info.downcast_ref::() { + tracing::error!("panic: {}", s); + } else if let Some(s) = info.downcast_ref::<&str>() { + tracing::error!("panic: {}", s); + } else { + tracing::error!("panic: unable to retrieve panic context"); + } + $err + } + } + }; +} + +#[cfg(any(not(feature = "catch_panic"), panic = "abort"))] +macro_rules! catch_panic { + ($f:expr, $err:expr) => { + $f + }; +} + +pub(crate) use catch_panic; diff --git a/libdd-data-pipeline-ffi/src/shared_runtime.rs b/libdd-shared-runtime-ffi/src/shared_runtime.rs similarity index 95% rename from libdd-data-pipeline-ffi/src/shared_runtime.rs rename to libdd-shared-runtime-ffi/src/shared_runtime.rs index 5e98cd6c52..99b34c2285 100644 --- a/libdd-data-pipeline-ffi/src/shared_runtime.rs +++ b/libdd-shared-runtime-ffi/src/shared_runtime.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use crate::catch_panic; -use libdd_data_pipeline::shared_runtime::{SharedRuntime, SharedRuntimeError}; +use libdd_shared_runtime::{SharedRuntime, SharedRuntimeError}; use std::ffi::{c_char, CString}; use std::ptr::NonNull; use std::sync::Arc; @@ -91,6 +91,7 @@ pub unsafe extern "C" fn ddog_shared_runtime_error_free(error: Option, @@ -130,6 +131,7 @@ pub unsafe extern "C" fn ddog_shared_runtime_free(handle: *const SharedRuntime) /// fork, preventing deadlocks in the child process. /// /// Returns an error if `handle` is null. +/// The handle must have been initialized with `ddog_shared_runtime_new`. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_before_fork( handle: *const SharedRuntime, @@ -155,6 +157,7 @@ pub unsafe extern "C" fn ddog_shared_runtime_before_fork( /// Restarts all workers that were paused by [`ddog_shared_runtime_before_fork`]. /// /// Returns `None` on success, or an error if workers could not be restarted. 
+/// The handle must have been initialized with `ddog_shared_runtime_new`. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_after_fork_parent( handle: *const SharedRuntime, @@ -184,6 +187,7 @@ pub unsafe extern "C" fn ddog_shared_runtime_after_fork_parent( /// /// Returns `None` on success, or an error if the runtime could not be /// reinitialized. +/// The handle must have been initialized with `ddog_shared_runtime_new`. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_after_fork_child( handle: *const SharedRuntime, @@ -213,6 +217,7 @@ pub unsafe extern "C" fn ddog_shared_runtime_after_fork_child( /// /// Returns `None` on success, or `SharedRuntimeErrorCode::ShutdownTimedOut` /// if the timeout was reached. +/// The handle must have been initialized with `ddog_shared_runtime_new`. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_shutdown( handle: *const SharedRuntime, diff --git a/libdd-shared-runtime/Cargo.toml b/libdd-shared-runtime/Cargo.toml new file mode 100644 index 0000000000..437959036b --- /dev/null +++ b/libdd-shared-runtime/Cargo.toml @@ -0,0 +1,26 @@ +# Copyright 2025-Present Datadog, Inc. 
https://www.datadoghq.com/ +# SPDX-License-Identifier: Apache-2.0 + +[package] +name = "libdd-shared-runtime" +version.workspace = true +description = "Shared tokio runtime with fork-safe worker management for Datadog libraries" +homepage = "https://github.com/DataDog/libdatadog/tree/main/libdd-shared-runtime" +repository = "https://github.com/DataDog/libdatadog/tree/main/libdd-shared-runtime" +edition.workspace = true +rust-version.workspace = true +license.workspace = true + +[lib] +crate-type = ["lib"] +bench = false + +[dependencies] +async-trait = "0.1" +tokio = { version = "1.23", features = ["rt", "rt-multi-thread", "time"] } +tokio-util = "0.7.11" +tracing = { version = "0.1", default-features = false } +libdd-common = { version = "2.0.0", path = "../libdd-common", default-features = false } + +[lints.rust] +unexpected_cfgs = { level = "warn", check-cfg = ['cfg(coverage)'] } diff --git a/libdd-shared-runtime/src/lib.rs b/libdd-shared-runtime/src/lib.rs new file mode 100644 index 0000000000..61bff93faf --- /dev/null +++ b/libdd-shared-runtime/src/lib.rs @@ -0,0 +1,14 @@ +// Copyright 2025-Present Datadog, Inc. 
https://www.datadoghq.com/ +// SPDX-License-Identifier: Apache-2.0 +#![cfg_attr(not(test), deny(clippy::panic))] +#![cfg_attr(not(test), deny(clippy::unwrap_used))] +#![cfg_attr(not(test), deny(clippy::expect_used))] +#![cfg_attr(not(test), deny(clippy::todo))] +#![cfg_attr(not(test), deny(clippy::unimplemented))] + +pub mod worker; + +// Top-level re-exports for convenience +pub use worker::pausable_worker::{PausableWorker, PausableWorkerError}; +pub use worker::shared_runtime::{SharedRuntime, SharedRuntimeError, WorkerHandle, WorkerHandleError}; +pub use worker::Worker; diff --git a/libdd-common/src/worker.rs b/libdd-shared-runtime/src/worker/mod.rs similarity index 97% rename from libdd-common/src/worker.rs rename to libdd-shared-runtime/src/worker/mod.rs index a88c81a192..ea821b1203 100644 --- a/libdd-common/src/worker.rs +++ b/libdd-shared-runtime/src/worker/mod.rs @@ -1,6 +1,9 @@ // Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 +pub mod pausable_worker; +pub mod shared_runtime; + use async_trait::async_trait; /// Trait representing a generic worker. diff --git a/libdd-data-pipeline/src/pausable_worker.rs b/libdd-shared-runtime/src/worker/pausable_worker.rs similarity index 98% rename from libdd-data-pipeline/src/pausable_worker.rs rename to libdd-shared-runtime/src/worker/pausable_worker.rs index f2a1ad5090..2c1d57a856 100644 --- a/libdd-data-pipeline/src/pausable_worker.rs +++ b/libdd-shared-runtime/src/worker/pausable_worker.rs @@ -3,14 +3,14 @@ //! Defines a pausable worker to be able to stop background processes before forks -use libdd_common::worker::Worker; +use super::Worker; use std::fmt::Display; use tokio::{runtime::Runtime, select, task::JoinHandle}; use tokio_util::sync::CancellationToken; /// A pausable worker which can be paused and restarted on forks. 
/// -/// Used to allow a [`libdd_common::worker::Worker`] to be paused while saving its state when +/// Used to allow a [`super::Worker`] to be paused while saving its state when /// dropping a tokio runtime to be able to restart with the same state on a new runtime. This is /// used to stop all threads before a fork to avoid deadlocks in child. /// diff --git a/libdd-data-pipeline/src/shared_runtime.rs b/libdd-shared-runtime/src/worker/shared_runtime.rs similarity index 99% rename from libdd-data-pipeline/src/shared_runtime.rs rename to libdd-shared-runtime/src/worker/shared_runtime.rs index 4db4755be7..b435cb6762 100644 --- a/libdd-data-pipeline/src/shared_runtime.rs +++ b/libdd-shared-runtime/src/worker/shared_runtime.rs @@ -8,8 +8,11 @@ //! fork operations by pausing workers before fork and restarting them appropriately //! in parent and child processes. -use crate::pausable_worker::{PausableWorker, PausableWorkerError}; -use libdd_common::{worker::Worker, MutexExt}; +use super::{ + pausable_worker::{PausableWorker, PausableWorkerError}, + Worker, +}; +use libdd_common::MutexExt; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::{fmt, io}; diff --git a/libdd-telemetry/Cargo.toml b/libdd-telemetry/Cargo.toml index b41cb0d01e..d3b4833b20 100644 --- a/libdd-telemetry/Cargo.toml +++ b/libdd-telemetry/Cargo.toml @@ -33,6 +33,7 @@ uuid = { version = "1.3", features = ["v4"] } hashbrown = "0.15" libdd-common = { version = "2.0.0", path = "../libdd-common", default-features = false } +libdd-shared-runtime = { path = "../libdd-shared-runtime" } libdd-ddsketch = { version = "1.0.1", path = "../libdd-ddsketch" } [target."cfg(unix)".dependencies] diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index e2d73d4f52..6afabd4975 100644 --- a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -12,7 +12,8 @@ use crate::{ }; use async_trait::async_trait; -use libdd_common::{http_common, 
tag::Tag, worker::Worker}; +use libdd_common::{http_common, tag::Tag}; +use libdd_shared_runtime::Worker; use std::iter::Sum; use std::ops::Add; @@ -1276,7 +1277,7 @@ mod tests { metrics::{MetricNamespace, MetricType}, Configuration, ConfigurationOrigin, Dependency, Endpoint, Integration, Log, LogLevel, }; - use libdd_common::worker::Worker; + use libdd_shared_runtime::Worker; fn build_test_worker() -> (TelemetryWorkerHandle, TelemetryWorker) { let builder = TelemetryWorkerBuilder::new( diff --git a/tools/docker/Dockerfile.build b/tools/docker/Dockerfile.build index b6871a1101..1717f328c9 100644 --- a/tools/docker/Dockerfile.build +++ b/tools/docker/Dockerfile.build @@ -114,6 +114,8 @@ COPY "datadog-ipc/Cargo.toml" "datadog-ipc/" COPY "datadog-ipc-macros/Cargo.toml" "datadog-ipc-macros/" COPY "datadog-ipc/tarpc/Cargo.toml" "datadog-ipc/tarpc/" COPY "datadog-ipc/plugins/Cargo.toml" "datadog-ipc/plugins/" +COPY "libdd-shared-runtime/Cargo.toml" "libdd-shared-runtime/" +COPY "libdd-shared-runtime-ffi/Cargo.toml" "libdd-shared-runtime-ffi/" COPY "libdd-data-pipeline/Cargo.toml" "libdd-data-pipeline/" COPY "libdd-data-pipeline-ffi/Cargo.toml" "libdd-data-pipeline-ffi/" COPY "bin_tests/Cargo.toml" "bin_tests/" From fb187cea37403ebd4d398ffd18d700aec11015e4 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 23 Mar 2026 10:45:23 +0100 Subject: [PATCH 37/80] chore: apply suggestions --- libdd-data-pipeline/src/agent_info/fetcher.rs | 3 ++- .../src/trace_exporter/stats.rs | 4 ++-- libdd-shared-runtime/src/lib.rs | 19 +++++++++++++++---- libdd-shared-runtime/src/worker/mod.rs | 13 +++++++------ .../src/worker/shared_runtime.rs | 18 +++++++++++------- 5 files changed, 37 insertions(+), 20 deletions(-) diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index b4359064f8..23584e4fe3 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -224,6 +224,7 @@ impl Worker for 
AgentInfoFetcher { fn reset(&mut self) { // Drain all messages from the channel to remove messages sent to release the reference on + // IoStack self.drain(); } @@ -646,4 +647,4 @@ mod single_threaded_tests { // Should not trigger a fetch since the state is the same mock.assert_calls(0); } -} +} \ No newline at end of file diff --git a/libdd-data-pipeline/src/trace_exporter/stats.rs b/libdd-data-pipeline/src/trace_exporter/stats.rs index ba042c845f..6071ffc5df 100644 --- a/libdd-data-pipeline/src/trace_exporter/stats.rs +++ b/libdd-data-pipeline/src/trace_exporter/stats.rs @@ -100,7 +100,7 @@ fn create_and_start_stats_worker( let worker_handle = ctx .shared_runtime .spawn_worker(stats_exporter) - .map_err(|e| anyhow::anyhow!(e.to_string()))?; + .map_err(|e| anyhow::anyhow!(e))?; // Update the stats computation state with the new worker components. client_side_stats.store(Arc::new(StatsComputationStatus::Enabled { @@ -238,4 +238,4 @@ pub(crate) fn is_stats_worker_active(client_side_stats: &ArcSwap for WorkerHandleError { fn from(err: PausableWorkerError) -> Self { Self::WorkerError(err) @@ -174,12 +176,14 @@ impl SharedRuntime { let mut pausable_worker = PausableWorker::new(boxed_worker); let worker_id = self.next_worker_id.fetch_add(1, Ordering::Relaxed); - let runtime_lock = self.runtime.lock_or_panic(); + { + let runtime_lock = self.runtime.lock_or_panic(); - // If the runtime is not available, it's added to the worker list and will be started when - // the runtime is recreated. - if let Some(runtime) = runtime_lock.as_ref() { - pausable_worker.start(runtime)?; + // If the runtime is not available, it's added to the worker list and will be started + // when the runtime is recreated. + if let Some(runtime) = runtime_lock.as_ref() { + pausable_worker.start(runtime)?; + } } let mut workers_lock = self.workers.lock_or_panic(); @@ -290,7 +294,7 @@ impl SharedRuntime { /// This allows external code to spawn additional tasks on the runtime if needed. 
/// /// # Warning - /// Since this method can return a single-threaded runtime it should only be use to + /// Since this method can return a single-threaded runtime it should only be used to /// execute async code with `block_on` if you need to spawn async code on it without blocking, /// you should us a `Worker` instead. /// From 0f91bb35496ed56a5ba3ef3a501afac5969dcace Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 23 Mar 2026 10:47:43 +0100 Subject: [PATCH 38/80] chore(codeowners): add libdd-shared-runtime to codeowners --- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index efc0feb1b6..384debb255 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -48,6 +48,7 @@ libdd-library-config*/ @DataDog/apm-sdk-capabilities libdd-libunwind*/ @DataDog/libdatadog-profiling libdd-log*/ @DataDog/apm-common-components-core libdd-profiling*/ @DataDog/libdatadog-profiling +libdd-shared-runtime*/ @DataDog/apm-common-components-core libdd-telemetry*/ @DataDog/apm-common-components-core libdd-tinybytes @DataDog/apm-common-components-core libdd-trace-normalization @DataDog/serverless @DataDog/libdatadog-apm From f12131f93f590f15dd481fb10a2cea35e97a8e40 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 23 Mar 2026 13:25:41 +0100 Subject: [PATCH 39/80] docs(shared-runtime-ffi): add comment to error msg --- libdd-shared-runtime-ffi/src/shared_runtime.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libdd-shared-runtime-ffi/src/shared_runtime.rs b/libdd-shared-runtime-ffi/src/shared_runtime.rs index 99b34c2285..5442423221 100644 --- a/libdd-shared-runtime-ffi/src/shared_runtime.rs +++ b/libdd-shared-runtime-ffi/src/shared_runtime.rs @@ -32,6 +32,8 @@ pub enum SharedRuntimeErrorCode { #[repr(C)] pub struct SharedRuntimeFFIError { pub code: SharedRuntimeErrorCode, + /// The error message is always defined when the error is returned by a ddog_shared_runtime + /// ffi. 
pub msg: *mut c_char, } From 1943f130474f55b15d5932fca159591d62180a0b Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 23 Mar 2026 13:46:35 +0100 Subject: [PATCH 40/80] test: use thread::sleep instead of tokio --- libdd-data-pipeline/src/trace_exporter/mod.rs | 28 +++++-------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 8a5f88cdf0..b66f5cd29a 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -784,7 +784,6 @@ mod tests { use std::collections::HashMap; use std::net; use std::time::Duration; - use tokio::time::sleep; // v05 messagepack empty payload -> [[""], []] const V5_EMPTY: [u8; 4] = [0x92, 0x91, 0xA0, 0x90]; @@ -1427,9 +1426,7 @@ mod tests { traces_endpoint.assert_calls(1); while metrics_endpoint.calls() == 0 { - exporter.shared_runtime.runtime().unwrap().block_on(async { - sleep(Duration::from_millis(100)).await; - }) + std::thread::sleep(Duration::from_millis(100)); } metrics_endpoint.assert_calls(1); } @@ -1479,9 +1476,7 @@ mod tests { traces_endpoint.assert_calls(1); while metrics_endpoint.calls() == 0 { - exporter.shared_runtime.runtime().unwrap().block_on(async { - sleep(Duration::from_millis(100)).await; - }) + std::thread::sleep(Duration::from_millis(100)); } metrics_endpoint.assert_calls(1); } @@ -1542,9 +1537,7 @@ mod tests { traces_endpoint.assert_calls(1); while metrics_endpoint.calls() == 0 { - exporter.shared_runtime.runtime().unwrap().block_on(async { - sleep(Duration::from_millis(100)).await; - }) + std::thread::sleep(Duration::from_millis(100)); } metrics_endpoint.assert_calls(1); } @@ -1719,9 +1712,7 @@ mod tests { // Wait for the info fetcher to get the config while mock_info.calls() == 0 { - exporter.shared_runtime.runtime().unwrap().block_on(async { - sleep(Duration::from_millis(100)).await; - }) + std::thread::sleep(Duration::from_millis(100)); } let _ = 
exporter.send(data.as_ref()).unwrap(); @@ -1754,7 +1745,6 @@ mod single_threaded_tests { use libdd_trace_utils::msgpack_encoder; use libdd_trace_utils::span::v04::SpanBytes; use std::time::Duration; - use tokio::time::sleep; #[cfg_attr(miri, ignore)] #[test] @@ -1812,9 +1802,7 @@ mod single_threaded_tests { // Wait for the info fetcher to get the config while agent_info::get_agent_info().is_none() { - exporter.shared_runtime.runtime().unwrap().block_on(async { - sleep(Duration::from_millis(100)).await; - }) + std::thread::sleep(Duration::from_millis(100)); } let result = exporter.send(data.as_ref()); @@ -1914,9 +1902,7 @@ mod single_threaded_tests { // Wait for agent_info to be present so that sending a trace will trigger the stats worker // to start while agent_info::get_agent_info().is_none() { - exporter.shared_runtime.runtime().unwrap().block_on(async { - sleep(Duration::from_millis(100)).await; - }) + std::thread::sleep(Duration::from_millis(100)); } exporter.send(data.as_ref()).unwrap(); @@ -1937,4 +1923,4 @@ mod single_threaded_tests { mock_traces.assert(); } -} +} \ No newline at end of file From f1c8ea92a3fdfcde4b7805f07b7d63a7e5f51947 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 23 Mar 2026 13:56:16 +0100 Subject: [PATCH 41/80] chore: rename join to wait_for_pause --- libdd-shared-runtime/src/worker/pausable_worker.rs | 6 +++--- libdd-shared-runtime/src/worker/shared_runtime.rs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libdd-shared-runtime/src/worker/pausable_worker.rs b/libdd-shared-runtime/src/worker/pausable_worker.rs index 2c1d57a856..29b2bb9bad 100644 --- a/libdd-shared-runtime/src/worker/pausable_worker.rs +++ b/libdd-shared-runtime/src/worker/pausable_worker.rs @@ -130,11 +130,11 @@ impl PausableWorker { /// This method will cancel the worker's cancellation token if it hasn't been cancelled yet, /// then wait for the worker to finish and store its state. 
Calling [`Self::request_pause`] /// before this method is optional - it's only needed when shutting down multiple workers - /// simultaneously to allow them to pause concurrently before waiting for all of them. + /// simultaneously to allow them to pause concurrently before waiting for all workers to pause. /// /// # Errors /// Fails if the worker handle has been aborted preventing the worker from being retrieved. - pub async fn join(&mut self) -> Result<(), PausableWorkerError> { + pub async fn wait_for_pause(&mut self) -> Result<(), PausableWorkerError> { match self { PausableWorker::Running { .. } => { let PausableWorker::Running { handle, stop_token } = @@ -219,7 +219,7 @@ mod tests { pausable_worker.start(&runtime).unwrap(); assert_eq!(receiver.recv().unwrap(), 0); - runtime.block_on(async { pausable_worker.join().await.unwrap() }); + runtime.block_on(async { pausable_worker.wait_for_pause().await.unwrap() }); // Empty the message queue and get the last message let mut next_message = 1; for message in receiver.try_iter() { diff --git a/libdd-shared-runtime/src/worker/shared_runtime.rs b/libdd-shared-runtime/src/worker/shared_runtime.rs index a532d1ba1d..420f062203 100644 --- a/libdd-shared-runtime/src/worker/shared_runtime.rs +++ b/libdd-shared-runtime/src/worker/shared_runtime.rs @@ -78,7 +78,7 @@ impl WorkerHandle { let WorkerEntry { worker, .. 
} = workers_lock.swap_remove(position); worker }; - worker.join().await?; + worker.wait_for_pause().await?; worker.shutdown().await; Ok(()) } @@ -216,7 +216,7 @@ impl SharedRuntime { } for worker_entry in workers_lock.iter_mut() { - if let Err(e) = worker_entry.worker.join().await { + if let Err(e) = worker_entry.worker.wait_for_pause().await { error!("Worker failed to pause before fork: {:?}", e); } } @@ -355,7 +355,7 @@ impl SharedRuntime { let mut join_set = JoinSet::new(); for mut worker_entry in workers { join_set.spawn(async move { - let result = worker_entry.worker.join().await; + let result = worker_entry.worker.wait_for_pause().await; if let Err(e) = result { error!("Worker failed to shutdown: {:?}", e); return; From 16c20b6c4bb48d7c74a19ae47b6019c535fca3d6 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 23 Mar 2026 14:05:47 +0100 Subject: [PATCH 42/80] docs: update after_fork_child doc --- libdd-shared-runtime/src/worker/shared_runtime.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdd-shared-runtime/src/worker/shared_runtime.rs b/libdd-shared-runtime/src/worker/shared_runtime.rs index 420f062203..27f93b0c5d 100644 --- a/libdd-shared-runtime/src/worker/shared_runtime.rs +++ b/libdd-shared-runtime/src/worker/shared_runtime.rs @@ -266,7 +266,7 @@ impl SharedRuntime { /// /// This method reinitializes the runtime and workers in the child process. /// A new runtime must be created since tokio runtimes cannot be safely forked. - /// Workers can optionally be restarted to resume operations in the child. + /// Workers are reset and restarted to resume operations in the child. /// /// # Errors /// Returns an error if the runtime cannot be reinitialized or workers cannot be started. 
From 366dff5bc0fd15b3c5621fb4fea26f8c93a25e39 Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 24 Mar 2026 13:07:18 +0100 Subject: [PATCH 43/80] refactor(shared-runtime): move shared runtime to a module --- libdd-shared-runtime/src/lib.rs | 5 ++--- .../shared_runtime.rs => shared_runtime/mod.rs} | 13 +++++-------- .../{worker => shared_runtime}/pausable_worker.rs | 4 ++-- .../src/{worker/mod.rs => worker.rs} | 7 ++----- 4 files changed, 11 insertions(+), 18 deletions(-) rename libdd-shared-runtime/src/{worker/shared_runtime.rs => shared_runtime/mod.rs} (98%) rename libdd-shared-runtime/src/{worker => shared_runtime}/pausable_worker.rs (99%) rename libdd-shared-runtime/src/{worker/mod.rs => worker.rs} (95%) diff --git a/libdd-shared-runtime/src/lib.rs b/libdd-shared-runtime/src/lib.rs index 8878ef9fe4..a2ea817946 100644 --- a/libdd-shared-runtime/src/lib.rs +++ b/libdd-shared-runtime/src/lib.rs @@ -16,10 +16,9 @@ //! `after_fork_child`) that pause and restart workers around `fork()` calls, preventing //! deadlocks in child processes. +pub mod shared_runtime; pub mod worker; // Top-level re-exports for convenience -pub use worker::shared_runtime::{ - SharedRuntime, SharedRuntimeError, WorkerHandle, WorkerHandleError, -}; +pub use shared_runtime::{SharedRuntime, SharedRuntimeError, WorkerHandle, WorkerHandleError}; pub use worker::Worker; \ No newline at end of file diff --git a/libdd-shared-runtime/src/worker/shared_runtime.rs b/libdd-shared-runtime/src/shared_runtime/mod.rs similarity index 98% rename from libdd-shared-runtime/src/worker/shared_runtime.rs rename to libdd-shared-runtime/src/shared_runtime/mod.rs index 27f93b0c5d..3ca713de0c 100644 --- a/libdd-shared-runtime/src/worker/shared_runtime.rs +++ b/libdd-shared-runtime/src/shared_runtime/mod.rs @@ -8,10 +8,10 @@ //! fork operations by pausing workers before fork and restarting them appropriately //! in parent and child processes. 
-use super::{ - pausable_worker::{PausableWorker, PausableWorkerError}, - Worker, -}; +pub(crate) mod pausable_worker; + +use crate::worker::Worker; +use pausable_worker::{PausableWorker, PausableWorkerError}; use libdd_common::MutexExt; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; @@ -341,9 +341,6 @@ impl SharedRuntime { /// background workers and the runtime. /// /// Worker errors are logged but do not cause the function to fail. - /// - /// This function should not take ownership of the SharedRuntime as it will cause the runtime - /// to be dropped in a non-blocking context causing a panic. pub async fn shutdown_async(&self) { use tracing::error; @@ -458,4 +455,4 @@ mod tests { // This should succeed as we're not in an async context assert!(shared_runtime.after_fork_child().is_ok()); } -} +} \ No newline at end of file diff --git a/libdd-shared-runtime/src/worker/pausable_worker.rs b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs similarity index 99% rename from libdd-shared-runtime/src/worker/pausable_worker.rs rename to libdd-shared-runtime/src/shared_runtime/pausable_worker.rs index 29b2bb9bad..80652f6512 100644 --- a/libdd-shared-runtime/src/worker/pausable_worker.rs +++ b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs @@ -3,7 +3,7 @@ //! 
Defines a pausable worker to be able to stop background processes before forks -use super::Worker; +use crate::worker::Worker; use std::fmt::Display; use tokio::{runtime::Runtime, select, task::JoinHandle}; use tokio_util::sync::CancellationToken; @@ -228,4 +228,4 @@ mod tests { pausable_worker.start(&runtime).unwrap(); assert_eq!(receiver.recv().unwrap(), next_message); } -} +} \ No newline at end of file diff --git a/libdd-shared-runtime/src/worker/mod.rs b/libdd-shared-runtime/src/worker.rs similarity index 95% rename from libdd-shared-runtime/src/worker/mod.rs rename to libdd-shared-runtime/src/worker.rs index f546e3ffc8..e05b772e38 100644 --- a/libdd-shared-runtime/src/worker/mod.rs +++ b/libdd-shared-runtime/src/worker.rs @@ -1,12 +1,9 @@ // Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/ // SPDX-License-Identifier: Apache-2.0 -pub mod pausable_worker; -pub mod shared_runtime; - use async_trait::async_trait; -/// A background worker meant to be spawned on a [`SharedRuntime`](shared_runtime::SharedRuntime). +/// A background worker meant to be spawned on a [`SharedRuntime`](crate::SharedRuntime). 
/// /// # Lifecycle /// The worker's [`run`](Self::run) method is executed every time [`trigger`](Self::trigger) @@ -66,4 +63,4 @@ impl Worker for Box { async fn shutdown(&mut self) { (**self).shutdown().await } -} +} \ No newline at end of file From 0a2cac47b2e34e1c0f4bfa40d7498be474d25321 Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 24 Mar 2026 14:22:54 +0100 Subject: [PATCH 44/80] chore: add debug logs --- libdd-shared-runtime/src/shared_runtime/mod.rs | 17 +++++++++++------ .../src/shared_runtime/pausable_worker.rs | 7 ++++++- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/libdd-shared-runtime/src/shared_runtime/mod.rs b/libdd-shared-runtime/src/shared_runtime/mod.rs index 3ca713de0c..40a67b1d69 100644 --- a/libdd-shared-runtime/src/shared_runtime/mod.rs +++ b/libdd-shared-runtime/src/shared_runtime/mod.rs @@ -11,13 +11,14 @@ pub(crate) mod pausable_worker; use crate::worker::Worker; -use pausable_worker::{PausableWorker, PausableWorkerError}; use libdd_common::MutexExt; +use pausable_worker::{PausableWorker, PausableWorkerError}; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::{fmt, io}; use tokio::runtime::{Builder, Runtime}; use tokio::task::JoinSet; +use tracing::{debug, error}; type BoxedWorker = Box; @@ -149,6 +150,7 @@ impl SharedRuntime { /// # Errors /// Returns an error if the tokio runtime cannot be created. pub fn new() -> Result { + debug!("Creating new SharedRuntime"); let runtime = tokio::runtime::Builder::new_multi_thread() .worker_threads(1) .enable_all() @@ -173,6 +175,7 @@ impl SharedRuntime { worker: T, ) -> Result { let boxed_worker: BoxedWorker = Box::new(worker); + debug!(?boxed_worker, "Spawning worker on SharedRuntime"); let mut pausable_worker = PausableWorker::new(boxed_worker); let worker_id = self.next_worker_id.fetch_add(1, Ordering::Relaxed); @@ -206,8 +209,7 @@ impl SharedRuntime { /// /// Worker errors are logged but do not cause the function to fail. 
pub fn before_fork(&self) { - use tracing::error; - + debug!("before_fork: pausing all workers"); if let Some(runtime) = self.runtime.lock_or_panic().take() { let mut workers_lock = self.workers.lock_or_panic(); runtime.block_on(async move { @@ -245,6 +247,7 @@ impl SharedRuntime { /// # Errors /// Returns an error if workers cannot be restarted or the runtime cannot be recreated. pub fn after_fork_parent(&self) -> Result<(), SharedRuntimeError> { + debug!("after_fork_parent: restarting runtime and workers"); self.restart_runtime()?; let runtime_lock = self.runtime.lock_or_panic(); @@ -271,6 +274,7 @@ impl SharedRuntime { /// # Errors /// Returns an error if the runtime cannot be reinitialized or workers cannot be started. pub fn after_fork_child(&self) -> Result<(), SharedRuntimeError> { + debug!("after_fork_child: reinitializing runtime and workers"); self.restart_runtime()?; let runtime_lock = self.runtime.lock_or_panic(); @@ -316,6 +320,7 @@ impl SharedRuntime { /// # Errors /// Returns an error only if shutdown times out. pub fn shutdown(&self, timeout: Option) -> Result<(), SharedRuntimeError> { + debug!(?timeout, "Shutting down SharedRuntime"); match self.runtime.lock_or_panic().take() { Some(runtime) => { let result = if let Some(timeout) = timeout { @@ -342,8 +347,7 @@ impl SharedRuntime { /// /// Worker errors are logged but do not cause the function to fail. 
pub async fn shutdown_async(&self) { - use tracing::error; - + debug!("Shutting down all workers asynchronously"); let workers = { let mut workers_lock = self.workers.lock_or_panic(); std::mem::take(&mut *workers_lock) @@ -455,4 +459,5 @@ mod tests { // This should succeed as we're not in an async context assert!(shared_runtime.after_fork_child().is_ok()); } -} \ No newline at end of file +} + diff --git a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs index 80652f6512..c027bb3ddd 100644 --- a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs +++ b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs @@ -7,6 +7,7 @@ use crate::worker::Worker; use std::fmt::Display; use tokio::{runtime::Runtime, select, task::JoinHandle}; use tokio_util::sync::CancellationToken; +use tracing::debug; /// A pausable worker which can be paused and restarted on forks. /// @@ -67,7 +68,8 @@ impl PausableWorker { pub fn start(&mut self, rt: &Runtime) -> Result<(), PausableWorkerError> { match self { PausableWorker::Running { .. } => Ok(()), - PausableWorker::Paused { .. } => { + PausableWorker::Paused { worker } => { + debug!(?worker, "Starting pausable worker"); let PausableWorker::Paused { mut worker } = std::mem::replace(self, PausableWorker::InvalidState) else { @@ -117,6 +119,7 @@ impl PausableWorker { pub fn request_pause(&self) -> Result<(), PausableWorkerError> { match self { PausableWorker::Running { stop_token, .. } => { + debug!("Requesting pause for worker"); stop_token.cancel(); Ok(()) } @@ -137,6 +140,7 @@ impl PausableWorker { pub async fn wait_for_pause(&mut self) -> Result<(), PausableWorkerError> { match self { PausableWorker::Running { .. 
} => { + debug!("Waiting for worker to pause"); let PausableWorker::Running { handle, stop_token } = std::mem::replace(self, PausableWorker::InvalidState) else { @@ -150,6 +154,7 @@ impl PausableWorker { } if let Ok(mut worker) = handle.await { + debug!(?worker, "Worker paused successfully"); worker.on_pause().await; *self = PausableWorker::Paused { worker }; Ok(()) From 040ccdd9607cf9fb235758eaa191ad4fc9ce4d06 Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 26 Mar 2026 15:18:06 +0100 Subject: [PATCH 45/80] feat(shared_runtime): add block_on method to runtime --- libdd-data-pipeline-ffi/src/trace_exporter.rs | 2 +- libdd-data-pipeline/src/agent_info/fetcher.rs | 4 +- libdd-data-pipeline/src/stats_exporter.rs | 7 +- libdd-data-pipeline/src/telemetry/mod.rs | 591 +++++++++--------- .../src/trace_exporter/builder.rs | 26 +- libdd-data-pipeline/src/trace_exporter/mod.rs | 24 +- .../src/trace_exporter/stats.rs | 11 +- libdd-data-pipeline/tests/test_fetch_info.rs | 3 +- libdd-shared-runtime/src/lib.rs | 2 +- .../src/shared_runtime/mod.rs | 28 +- .../src/shared_runtime/pausable_worker.rs | 2 +- libdd-shared-runtime/src/worker.rs | 2 +- libdd-telemetry/src/worker/mod.rs | 30 +- 13 files changed, 361 insertions(+), 371 deletions(-) diff --git a/libdd-data-pipeline-ffi/src/trace_exporter.rs b/libdd-data-pipeline-ffi/src/trace_exporter.rs index 2a926b590a..485c0dbbf8 100644 --- a/libdd-data-pipeline-ffi/src/trace_exporter.rs +++ b/libdd-data-pipeline-ffi/src/trace_exporter.rs @@ -8,10 +8,10 @@ use libdd_common_ffi::{ CharSlice, {slice::AsBytes, slice::ByteSlice}, }; -use libdd_shared_runtime::SharedRuntime; use libdd_data_pipeline::trace_exporter::{ TelemetryConfig, TraceExporter, TraceExporterInputFormat, TraceExporterOutputFormat, }; +use libdd_shared_runtime::SharedRuntime; use std::{ptr::NonNull, sync::Arc, time::Duration}; use tracing::debug; diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index 
23584e4fe3..e5844c2bd6 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -305,8 +305,8 @@ impl ResponseObserver { mod single_threaded_tests { use super::*; use crate::agent_info; - use libdd_shared_runtime::SharedRuntime; use httpmock::prelude::*; + use libdd_shared_runtime::SharedRuntime; const TEST_INFO: &str = r#"{ "version": "0.0.0", @@ -647,4 +647,4 @@ mod single_threaded_tests { // Should not trigger a fetch since the state is the same mock.assert_calls(0); } -} \ No newline at end of file +} diff --git a/libdd-data-pipeline/src/stats_exporter.rs b/libdd-data-pipeline/src/stats_exporter.rs index 759e8b45a5..6f49beb9a7 100644 --- a/libdd-data-pipeline/src/stats_exporter.rs +++ b/libdd-data-pipeline/src/stats_exporter.rs @@ -182,10 +182,10 @@ pub fn stats_url_from_agent_url(agent_url: &str) -> anyhow::Result { #[cfg(test)] mod tests { use super::*; - use libdd_shared_runtime::SharedRuntime; use httpmock::prelude::*; use httpmock::MockServer; use libdd_common::http_common::new_default_client; + use libdd_shared_runtime::SharedRuntime; use libdd_trace_utils::span::{trace_utils, v04::SpanSlice}; use libdd_trace_utils::test_utils::poll_for_mock_hit; use time::Duration; @@ -341,7 +341,6 @@ mod tests { #[test] fn test_worker_shutdown() { let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); let server = MockServer::start(); @@ -370,7 +369,9 @@ mod tests { shared_runtime.shutdown(None).unwrap(); assert!( - rt.block_on(poll_for_mock_hit(&mut mock, 10, 100, 1, false)), + shared_runtime + .block_on(poll_for_mock_hit(&mut mock, 10, 100, 1, false)) + .expect("Failed to get runtime"), "Expected max retry attempts" ); } diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index 9700382624..faf83d68b2 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ 
b/libdd-data-pipeline/src/telemetry/mod.rs @@ -16,7 +16,6 @@ use libdd_trace_utils::{ trace_utils::SendDataResult, }; use std::{collections::HashMap, time::Duration}; -use tokio::runtime::Handle; /// Structure to build a Telemetry client. /// @@ -100,7 +99,7 @@ impl TelemetryClientBuilder { } /// Builds the telemetry client. - pub fn build(self, runtime: Handle) -> (TelemetryClient, TelemetryWorker) { + pub fn build(self) -> (TelemetryClient, TelemetryWorker) { #[allow(clippy::unwrap_used)] let mut builder = TelemetryWorkerBuilder::new_fetch_host( self.service_name.unwrap(), @@ -118,7 +117,7 @@ impl TelemetryClientBuilder { builder.runtime_id = Some(id); } - let (worker_handle, worker) = builder.build_worker(runtime); + let (worker_handle, worker) = builder.build_worker(None); ( TelemetryClient { @@ -324,7 +323,7 @@ mod tests { .set_url(url) .set_heartbeat(100) .set_debug_enabled(true) - .build(Handle::current()); + .build(); let handle = runtime .spawn_worker(worker) .expect("Failed to spawn worker"); @@ -365,34 +364,34 @@ mod tests { fn api_bytes_test() { let payload = Regex::new(r#""metric":"trace_api.bytes","tags":\["src_library:libdatadog"\],"sketch_b64":".+","common":true,"interval":\d+,"type":"distribution""#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - bytes_sent: 1, - ..Default::default() - }; + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + bytes_sent: 1, + 
..Default::default() + }; - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - let _ = client.send(&data); - // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" - ); - }); + handle.stop().await.expect("Failed to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + .expect("Failed to get runtime"); } #[cfg_attr(miri, ignore)] @@ -400,34 +399,34 @@ mod tests { fn requests_test() { let payload = Regex::new(r#""metric":"trace_api.requests","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog"\],"common":true,"type":"count""#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - requests_count: 1, - ..Default::default() - }; + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + requests_count: 1, + ..Default::default() + }; - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - 
client.start().await; - let _ = client.send(&data); - // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" - ); - }); + handle.stop().await.expect("Failed to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + .expect("Failed to get runtime"); } #[cfg_attr(miri, ignore)] @@ -435,34 +434,34 @@ mod tests { fn responses_per_code_test() { let payload = Regex::new(r#""metric":"trace_api.responses","points":\[\[\d+,1\.0\]\],"tags":\["status_code:200","src_library:libdatadog"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - responses_count_per_code: HashMap::from([(200, 1)]), - ..Default::default() - }; + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + responses_count_per_code: HashMap::from([(200, 1)]), + ..Default::default() + }; - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - let _ 
= client.send(&data); - // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" - ); - }); + handle.stop().await.expect("Failed to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + .expect("Failed to get runtime"); } #[cfg_attr(miri, ignore)] @@ -470,34 +469,34 @@ mod tests { fn errors_timeout_test() { let payload = Regex::new(r#""metric":"trace_api.errors","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","type:timeout"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - errors_timeout: 1, - ..Default::default() - }; + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + errors_timeout: 1, + ..Default::default() + }; - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - let _ = client.send(&data); - // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + let 
(client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" - ); - }); + handle.stop().await.expect("Failed to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + .expect("Failed to get runtime"); } #[cfg_attr(miri, ignore)] @@ -505,34 +504,34 @@ mod tests { fn errors_network_test() { let payload = Regex::new(r#""metric":"trace_api.errors","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","type:network"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - errors_network: 1, - ..Default::default() - }; + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + errors_network: 1, + ..Default::default() + }; - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - let _ = client.send(&data); - // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + let _ = 
client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" - ); - }); + handle.stop().await.expect("Failed to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + .expect("Failed to get runtime"); } #[cfg_attr(miri, ignore)] @@ -540,34 +539,34 @@ mod tests { fn errors_status_code_test() { let payload = Regex::new(r#""metric":"trace_api.errors","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","type:status_code"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - errors_status_code: 1, - ..Default::default() - }; + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + errors_status_code: 1, + ..Default::default() + }; - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - let _ = client.send(&data); - // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + let _ = client.send(&data); + // Wait for send to be processed + 
sleep(Duration::from_millis(1)).await; - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" - ); - }); + handle.stop().await.expect("Failed to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + .expect("Failed to get runtime"); } #[cfg_attr(miri, ignore)] @@ -575,34 +574,34 @@ mod tests { fn chunks_sent_test() { let payload = Regex::new(r#""metric":"trace_chunks_sent","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_sent: 1, - ..Default::default() - }; + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + chunks_sent: 1, + ..Default::default() + }; - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - let _ = client.send(&data); - // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut 
telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" - ); - }); + handle.stop().await.expect("Failed to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + .expect("Failed to get runtime"); } #[cfg_attr(miri, ignore)] @@ -610,34 +609,34 @@ mod tests { fn chunks_dropped_send_failure_test() { let payload = Regex::new(r#""metric":"trace_chunks_dropped","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","reason:send_failure"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_dropped_send_failure: 1, - ..Default::default() - }; + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + chunks_dropped_send_failure: 1, + ..Default::default() + }; - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - let _ = client.send(&data); - // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry 
server did not receive calls within timeout" - ); - }); + handle.stop().await.expect("Failed to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + .expect("Failed to get runtime"); } #[cfg_attr(miri, ignore)] @@ -645,34 +644,34 @@ mod tests { fn chunks_dropped_p0_test() { let payload = Regex::new(r#""metric":"trace_chunks_dropped","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","reason:p0_drop"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_dropped_p0: 1, - ..Default::default() - }; + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + chunks_dropped_p0: 1, + ..Default::default() + }; - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - let _ = client.send(&data); - // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" - ); - }); + handle.stop().await.expect("Failed 
to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + .expect("Failed to get runtime"); } #[cfg_attr(miri, ignore)] @@ -680,34 +679,34 @@ mod tests { fn chunks_dropped_serialization_error_test() { let payload = Regex::new(r#""metric":"trace_chunks_dropped","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","reason:serialization_error"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_dropped_serialization_error: 1, - ..Default::default() - }; + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }) + .await; + + let data = SendPayloadTelemetry { + chunks_dropped_serialization_error: 1, + ..Default::default() + }; - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - let _ = client.send(&data); - // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + let _ = client.send(&data); + // Wait for send to be processed + sleep(Duration::from_millis(1)).await; - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" - ); - }); + handle.stop().await.expect("Failed to stop worker"); + assert!( 
+ poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + .expect("Failed to get runtime"); } #[test] @@ -853,44 +852,42 @@ mod tests { #[test] fn runtime_id_test() { let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_includes(r#""runtime_id":"foo""#); - then.status(200).body(""); - }) - .await; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - client - .send(&SendPayloadTelemetry { - requests_count: 1, - ..Default::default() - }) - .unwrap(); - // Wait for send to be processed - sleep(Duration::from_millis(10)).await; - - handle.stop().await.expect("Failed to stop worker"); - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" - ); - // One payload generate-metrics - }); + shared_runtime + .block_on(async { + let server = MockServer::start_async().await; + let mut telemetry_srv = server + .mock_async(|when, then| { + when.method(POST).body_includes(r#""runtime_id":"foo""#); + then.status(200).body(""); + }) + .await; + + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + client.start().await; + client + .send(&SendPayloadTelemetry { + requests_count: 1, + ..Default::default() + }) + .unwrap(); + // Wait for send to be processed + sleep(Duration::from_millis(10)).await; + + handle.stop().await.expect("Failed to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + // One payload generate-metrics + }) + .expect("Failed to get runtime"); } #[cfg_attr(miri, ignore)] #[test] fn 
application_metadata_test() { let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let rt = shared_runtime.runtime().expect("Failed to get runtime"); - - rt.block_on(async { + shared_runtime.block_on(async { let server = MockServer::start_async().await; let mut telemetry_srv = server .mock_async(|when, then| { @@ -918,6 +915,6 @@ mod tests { "telemetry server did not receive calls within timeout" ); // One payload generate-metrics - }); + }).expect("Failed to get runtime"); } } diff --git a/libdd-data-pipeline/src/trace_exporter/builder.rs b/libdd-data-pipeline/src/trace_exporter/builder.rs index 47266c60cb..88d816c166 100644 --- a/libdd-data-pipeline/src/trace_exporter/builder.rs +++ b/libdd-data-pipeline/src/trace_exporter/builder.rs @@ -2,7 +2,6 @@ // SPDX-License-Identifier: Apache-2.0 use crate::agent_info::AgentInfoFetcher; -use libdd_shared_runtime::SharedRuntime; use crate::telemetry::TelemetryClientBuilder; use crate::trace_exporter::agent_response::AgentResponsePayloadVersion; use crate::trace_exporter::error::BuilderErrorKind; @@ -15,6 +14,7 @@ use arc_swap::ArcSwap; use libdd_common::http_common::new_default_client; use libdd_common::{parse_uri, tag, Endpoint}; use libdd_dogstatsd_client::new; +use libdd_shared_runtime::SharedRuntime; use std::sync::Arc; use std::time::Duration; @@ -283,12 +283,7 @@ impl TraceExporterBuilder { if let Some(id) = telemetry_config.runtime_id { builder = builder.set_runtime_id(&id); } - let runtime = shared_runtime.runtime().map_err(|e| { - TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration(e.to_string())) - })?; - // This handle is never used since we run it as a SharedRuntime worker. So it is fine - // if the tokio runtime is dropped by SharedRuntime. 
- Ok(builder.build(runtime.handle().clone())) + Ok(builder.build()) }); let (telemetry_client, telemetry_handle) = match telemetry { @@ -298,14 +293,11 @@ impl TraceExporterBuilder { e.to_string(), )) })?; - shared_runtime - .runtime() - .map_err(|e| { - TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration( - e.to_string(), - )) - })? - .block_on(client.start()); + shared_runtime.block_on(client.start()).map_err(|e| { + TraceExporterError::Builder(BuilderErrorKind::InvalidConfiguration( + e.to_string(), + )) + })?; (Some(client), Some(handle)) } Some(Err(e)) => return Err(e), @@ -437,10 +429,6 @@ mod tests { assert_eq!(exporter.metadata.language_interpreter, ""); assert!(!exporter.metadata.client_computed_stats); assert!(exporter.telemetry.is_none()); - assert!( - exporter.shared_runtime.runtime().is_ok(), - "default shared runtime should be initialized" - ); } #[cfg_attr(miri, ignore)] diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index b66f5cd29a..6100abb269 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -15,7 +15,6 @@ use self::metrics::MetricsEmitter; use self::stats::StatsComputationStatus; use self::trace_serializer::TraceSerializer; use crate::agent_info::ResponseObserver; -use libdd_shared_runtime::{SharedRuntime, WorkerHandle}; use crate::telemetry::{SendPayloadTelemetry, TelemetryClient}; use crate::trace_exporter::agent_response::{ AgentResponsePayloadVersion, DATADOG_RATES_PAYLOAD_VERSION_HEADER, @@ -35,6 +34,7 @@ use http_body_util::BodyExt; use libdd_common::tag::Tag; use libdd_common::{http_common, Endpoint, HttpClient}; use libdd_dogstatsd_client::Client; +use libdd_shared_runtime::{SharedRuntime, WorkerHandle}; use libdd_trace_utils::msgpack_decoder; use libdd_trace_utils::send_with_retry::{ send_with_retry, RetryStrategy, SendWithRetryError, SendWithRetryResult, @@ -46,7 +46,6 @@ use std::sync::Arc; 
#[cfg(feature = "test-utils")] use std::time::Duration; use std::{borrow::Borrow, collections::HashMap, str::FromStr}; -use tokio::runtime::Runtime; use tokio::task::JoinSet; use tracing::{debug, error, warn}; @@ -221,10 +220,11 @@ impl TraceExporter { /// Returns [`SharedRuntimeError::ShutdownTimedOut`] if a timeout was given and elapsed before /// all workers finished. pub fn shutdown(self, timeout: Option) -> Result<(), TraceExporterError> { - let runtime = self.runtime()?; + let runtime = self.shared_runtime.clone(); if let Some(timeout) = timeout { match runtime .block_on(async { tokio::time::timeout(timeout, self.shutdown_workers()).await }) + .map_err(TraceExporterError::Io)? { Ok(()) => Ok(()), Err(_) => Err(TraceExporterError::Shutdown(ShutdownError::TimedOut( @@ -232,7 +232,9 @@ impl TraceExporter { ))), } } else { - runtime.block_on(self.shutdown_workers()); + runtime + .block_on(self.shutdown_workers()) + .map_err(TraceExporterError::Io)?; Ok(()) } } @@ -264,10 +266,10 @@ impl TraceExporter { } } - /// Return a runtime from the shared runtime manager. - fn runtime(&self) -> Result, TraceExporterError> { + /// Run a future to completion on the shared runtime. + fn block_on(&self, f: F) -> Result { self.shared_runtime - .runtime() + .block_on(f) .map_err(TraceExporterError::Io) } @@ -412,8 +414,7 @@ impl TraceExporter { trace_chunks: Vec>>, ) -> Result { self.check_agent_info(); - self.runtime()? - .block_on(async { self.send_trace_chunks_inner(trace_chunks).await }) + self.block_on(async { self.send_trace_chunks_inner(trace_chunks).await })? } /// Send a list of trace chunks to the agent, asynchronously @@ -459,8 +460,7 @@ impl TraceExporter { None, ); - self.runtime()? - .block_on(async { self.send_trace_chunks_inner(traces).await }) + self.block_on(async { self.send_trace_chunks_inner(traces).await })? 
} /// Send traces payload to agent with retry and telemetry reporting @@ -1923,4 +1923,4 @@ mod single_threaded_tests { mock_traces.assert(); } -} \ No newline at end of file +} diff --git a/libdd-data-pipeline/src/trace_exporter/stats.rs b/libdd-data-pipeline/src/trace_exporter/stats.rs index 6071ffc5df..bafad1d46a 100644 --- a/libdd-data-pipeline/src/trace_exporter/stats.rs +++ b/libdd-data-pipeline/src/trace_exporter/stats.rs @@ -8,10 +8,10 @@ //! and processing traces for stats collection. use crate::agent_info::schema::AgentInfo; -use libdd_shared_runtime::{SharedRuntime, WorkerHandle}; use crate::stats_exporter; use arc_swap::ArcSwap; use libdd_common::{Endpoint, HttpClient, MutexExt}; +use libdd_shared_runtime::{SharedRuntime, WorkerHandle}; use libdd_trace_stats::span_concentrator::SpanConcentrator; use std::sync::{Arc, Mutex}; use std::time::Duration; @@ -127,11 +127,10 @@ pub(crate) fn stop_stats_computation( client_side_stats.store(Arc::new(StatsComputationStatus::DisabledByAgent { bucket_size, })); - match ctx.shared_runtime.runtime() { - Ok(runtime) => { - let _ = runtime.block_on(async { worker_handle.clone().stop().await }); - } + match ctx.shared_runtime.block_on(worker_handle.clone().stop()) { + Ok(Err(e)) => error!("Failed to stop stats worker: {e}"), Err(e) => error!("Failed to stop stats worker: {e}"), + _ => {} } } } @@ -238,4 +237,4 @@ pub(crate) fn is_stats_worker_active(client_side_stats: &ArcSwap Result, io::Error> { - match self.runtime.lock_or_panic().as_ref() { - None => Ok(Arc::new( - Builder::new_current_thread().enable_all().build()?, - )), - Some(runtime) => Ok(runtime.clone()), - } + /// Returns an error if it fails to create a fallback runtime. 
+ pub fn block_on(&self, f: F) -> Result { + let runtime = match self.runtime.lock_or_panic().as_ref() { + None => Arc::new(Builder::new_current_thread().enable_all().build()?), + Some(runtime) => runtime.clone(), + }; + Ok(runtime.block_on(f)) } /// Shutdown the runtime and all workers synchronously with optional timeout. @@ -459,5 +454,4 @@ mod tests { // This should succeed as we're not in an async context assert!(shared_runtime.after_fork_child().is_ok()); } -} - +} \ No newline at end of file diff --git a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs index c027bb3ddd..ea7778912c 100644 --- a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs +++ b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs @@ -233,4 +233,4 @@ mod tests { pausable_worker.start(&runtime).unwrap(); assert_eq!(receiver.recv().unwrap(), next_message); } -} \ No newline at end of file +} diff --git a/libdd-shared-runtime/src/worker.rs b/libdd-shared-runtime/src/worker.rs index e05b772e38..e6bc0e9160 100644 --- a/libdd-shared-runtime/src/worker.rs +++ b/libdd-shared-runtime/src/worker.rs @@ -63,4 +63,4 @@ impl Worker for Box { async fn shutdown(&mut self) { (**self).shutdown().await } -} \ No newline at end of file +} diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index 6afabd4975..945a895010 100644 --- a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -933,8 +933,9 @@ pub struct TelemetryWorkerHandle { sender: mpsc::Sender, shutdown: Arc, cancellation_token: CancellationToken, - // Used to spawn cancellation tasks - runtime: runtime::Handle, + // Used to spawn cancellation tasks. Should be None when running as a SharedRuntime worker, + // since the runtime is not guaranteed to exist for the lifetime of the worker. 
+ runtime: Option, contexts: MetricContexts, } @@ -992,12 +993,16 @@ impl TelemetryWorkerHandle { } fn cancel_requests_with_deadline(&self, deadline: time::Instant) { + let Some(runtime) = &self.runtime else { + tracing::error!("Cannot schedule cancellation deadline: no runtime handle available"); + return; + }; let token = self.cancellation_token.clone(); let f = async move { tokio::time::sleep_until(deadline.into()).await; token.cancel() }; - self.runtime.spawn(f); + runtime.spawn(f); } pub fn wait_for_shutdown_deadline(&self, deadline: time::Instant) { @@ -1161,10 +1166,15 @@ impl TelemetryWorkerBuilder { } } - /// Build the corresponding worker and it's handle. - /// The runtime handle is wrapped in the worker handle and should be the one used to run the - /// worker task. - pub fn build_worker(self, tokio_runtime: Handle) -> (TelemetryWorkerHandle, TelemetryWorker) { + /// Build the corresponding worker and its handle. + /// + /// The optional runtime handle is stored in the worker handle and should be the one used to run + /// the worker task cancellation deadlines. Pass `None` when the worker will be run via a + /// [`SharedRuntime`](libdd_shared_runtime::SharedRuntime). 
+ pub fn build_worker( + self, + tokio_runtime: Option, + ) -> (TelemetryWorkerHandle, TelemetryWorker) { let (tx, mailbox) = mpsc::channel(5000); let shutdown = Arc::new(InnerTelemetryShutdown { is_shutdown: Mutex::new(false), @@ -1232,7 +1242,7 @@ impl TelemetryWorkerBuilder { pub fn spawn(self) -> (TelemetryWorkerHandle, JoinHandle<()>) { let tokio_runtime = tokio::runtime::Handle::current(); - let (worker_handle, worker) = self.build_worker(tokio_runtime.clone()); + let (worker_handle, worker) = self.build_worker(Some(tokio_runtime.clone())); let join_handle = tokio_runtime.spawn(async move { worker.run_loop().await }); @@ -1244,7 +1254,7 @@ impl TelemetryWorkerBuilder { let runtime = tokio::runtime::Builder::new_current_thread() .enable_all() .build()?; - let (handle, worker) = self.build_worker(runtime.handle().clone()); + let (handle, worker) = self.build_worker(Some(runtime.handle().clone())); let notify_shutdown = handle.shutdown.clone(); std::thread::spawn(move || { runtime.block_on(worker.run_loop()); @@ -1288,7 +1298,7 @@ mod tests { "1.0.0".to_string(), ); // build_worker requires a tokio Handle; tests using this must be #[tokio::test] - builder.build_worker(tokio::runtime::Handle::current()) + builder.build_worker(Some(tokio::runtime::Handle::current())) } fn make_log(id: u64, message: &str) -> (LogIdentifier, Log) { From 92e5ddc217ac9ac480da4a09a4d9da0afe1b2c21 Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 26 Mar 2026 18:04:42 +0100 Subject: [PATCH 46/80] chore: undo catch panic change --- libdd-data-pipeline-ffi/src/lib.rs | 6 +++--- libdd-data-pipeline-ffi/src/trace_exporter.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libdd-data-pipeline-ffi/src/lib.rs b/libdd-data-pipeline-ffi/src/lib.rs index 894b03845d..9e4e4bc278 100644 --- a/libdd-data-pipeline-ffi/src/lib.rs +++ b/libdd-data-pipeline-ffi/src/lib.rs @@ -17,11 +17,11 @@ macro_rules! 
catch_panic { Ok(ret) => ret, Err(info) => { if let Some(s) = info.downcast_ref::() { - tracing::error!("panic: {}", s); + error!(error = %ErrorCode::Panic, s); } else if let Some(s) = info.downcast_ref::<&str>() { - tracing::error!("panic: {}", s); + error!(error = %ErrorCode::Panic, s); } else { - tracing::error!("panic: unable to retrieve panic context"); + error!(error = %ErrorCode::Panic, "Unable to retrieve panic context"); } $err } diff --git a/libdd-data-pipeline-ffi/src/trace_exporter.rs b/libdd-data-pipeline-ffi/src/trace_exporter.rs index 485c0dbbf8..4a08648e5d 100644 --- a/libdd-data-pipeline-ffi/src/trace_exporter.rs +++ b/libdd-data-pipeline-ffi/src/trace_exporter.rs @@ -13,7 +13,7 @@ use libdd_data_pipeline::trace_exporter::{ }; use libdd_shared_runtime::SharedRuntime; use std::{ptr::NonNull, sync::Arc, time::Duration}; -use tracing::debug; +use tracing::{debug, error}; #[inline] fn sanitize_string(str: CharSlice) -> Result> { From 64b9bd276098f99c56fc1d58e7884aa26d101bab Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 26 Mar 2026 18:05:35 +0100 Subject: [PATCH 47/80] chore: remove async trait from ddcommon --- Cargo.lock | 1 - libdd-common/Cargo.toml | 1 - 2 files changed, 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9c83859b3d..61260cf7ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2980,7 +2980,6 @@ name = "libdd-common" version = "2.0.0" dependencies = [ "anyhow", - "async-trait", "bytes", "cc", "const_format", diff --git a/libdd-common/Cargo.toml b/libdd-common/Cargo.toml index e418591b26..aef95db03c 100644 --- a/libdd-common/Cargo.toml +++ b/libdd-common/Cargo.toml @@ -17,7 +17,6 @@ bench = false [dependencies] anyhow = "1.0" -async-trait = "0.1" futures = "0.3" futures-core = { version = "0.3.0", default-features = false } futures-util = { version = "0.3.0", default-features = false } From 37b8105a21c128fdf0eabb51fbd1e977e6cab557 Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 26 Mar 2026 18:05:56 +0100 Subject: [PATCH 48/80] 
refactor: use option to handle null pointer --- libdd-data-pipeline-ffi/src/trace_exporter.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/libdd-data-pipeline-ffi/src/trace_exporter.rs b/libdd-data-pipeline-ffi/src/trace_exporter.rs index 4a08648e5d..eb59e46e28 100644 --- a/libdd-data-pipeline-ffi/src/trace_exporter.rs +++ b/libdd-data-pipeline-ffi/src/trace_exporter.rs @@ -406,16 +406,16 @@ pub unsafe extern "C" fn ddog_trace_exporter_config_set_connection_timeout( #[no_mangle] pub unsafe extern "C" fn ddog_trace_exporter_config_set_shared_runtime( config: Option<&mut TraceExporterConfig>, - handle: *const SharedRuntime, + handle: Option>, ) -> Option> { catch_panic!( - match config { - Some(config) if !handle.is_null() => { + match (config, handle) { + (Some(config), Some(handle)) => { // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. // Increment the strong count before reconstructing so the config's Arc // is independent from the caller's handle. 
- Arc::increment_strong_count(handle); - config.shared_runtime = Some(Arc::from_raw(handle)); + Arc::increment_strong_count(handle.as_ptr()); + config.shared_runtime = Some(Arc::from_raw(handle.as_ptr())); None } _ => gen_error!(ErrorCode::InvalidArgument), @@ -1162,3 +1162,4 @@ mod tests { } } } + From febf92067e46bd1feec81f8242a5043a16216a04 Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 26 Mar 2026 18:42:41 +0100 Subject: [PATCH 49/80] docs: add comment to telemetry unused action --- libdd-telemetry/src/worker/mod.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index 945a895010..430588beea 100644 --- a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -182,6 +182,8 @@ impl Worker for TelemetryWorker { "Received telemetry action" ); + // When running as a [libdd_shared_runtime::Worker] Shutdown is handled by stopping the + // Worker from the handle and not by sending stop action let _action_result = match self.flavor { TelemetryWorkerFlavor::Full => self.dispatch_action(action).await, TelemetryWorkerFlavor::MetricsLogs => { From fc45ce94e62ff7da2b2c2e0647b47381e85d96ee Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 26 Mar 2026 19:02:42 +0100 Subject: [PATCH 50/80] feat: use futures unordered instead of JoinSet --- Cargo.lock | 1 + libdd-shared-runtime/Cargo.toml | 3 +- .../src/shared_runtime/mod.rs | 42 +++++++++---------- .../src/shared_runtime/pausable_worker.rs | 25 +---------- 4 files changed, 26 insertions(+), 45 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 61260cf7ad..cbb7290678 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3318,6 +3318,7 @@ name = "libdd-shared-runtime" version = "28.0.3" dependencies = [ "async-trait", + "futures", "libdd-common", "tokio", "tokio-util", diff --git a/libdd-shared-runtime/Cargo.toml b/libdd-shared-runtime/Cargo.toml index 437959036b..7a3e8b2edd 100644 --- a/libdd-shared-runtime/Cargo.toml +++ 
b/libdd-shared-runtime/Cargo.toml @@ -17,10 +17,11 @@ bench = false [dependencies] async-trait = "0.1" +futures = { version = "0.3", default-features = false, features = ["alloc"] } tokio = { version = "1.23", features = ["rt", "rt-multi-thread", "time"] } tokio-util = "0.7.11" tracing = { version = "0.1", default-features = false } libdd-common = { version = "2.0.0", path = "../libdd-common", default-features = false } [lints.rust] -unexpected_cfgs = { level = "warn", check-cfg = ['cfg(coverage)'] } +unexpected_cfgs = { level = "warn", check-cfg = ['cfg(coverage)'] } \ No newline at end of file diff --git a/libdd-shared-runtime/src/shared_runtime/mod.rs b/libdd-shared-runtime/src/shared_runtime/mod.rs index 38755383b1..46234ac4e0 100644 --- a/libdd-shared-runtime/src/shared_runtime/mod.rs +++ b/libdd-shared-runtime/src/shared_runtime/mod.rs @@ -11,13 +11,13 @@ pub(crate) mod pausable_worker; use crate::worker::Worker; +use futures::stream::{FuturesUnordered, StreamExt}; use libdd_common::MutexExt; use pausable_worker::{PausableWorker, PausableWorkerError}; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex}; use std::{fmt, io}; use tokio::runtime::{Builder, Runtime}; -use tokio::task::JoinSet; use tracing::{debug, error}; type BoxedWorker = Box; @@ -79,7 +79,7 @@ impl WorkerHandle { let WorkerEntry { worker, .. 
} = workers_lock.swap_remove(position); worker }; - worker.wait_for_pause().await?; + worker.pause().await?; worker.shutdown().await; Ok(()) } @@ -212,16 +212,17 @@ impl SharedRuntime { debug!("before_fork: pausing all workers"); if let Some(runtime) = self.runtime.lock_or_panic().take() { let mut workers_lock = self.workers.lock_or_panic(); - runtime.block_on(async move { - for worker_entry in workers_lock.iter_mut() { - let _ = worker_entry.worker.request_pause(); - } - - for worker_entry in workers_lock.iter_mut() { - if let Err(e) = worker_entry.worker.wait_for_pause().await { - error!("Worker failed to pause before fork: {:?}", e); - } - } + runtime.block_on(async { + let futures: FuturesUnordered<_> = workers_lock + .iter_mut() + .map(|worker_entry| async { + if let Err(e) = worker_entry.worker.pause().await { + error!("Worker failed to pause before fork: {:?}", e); + } + }) + .collect(); + + futures.collect::<()>().await; }); } } @@ -348,19 +349,18 @@ impl SharedRuntime { std::mem::take(&mut *workers_lock) }; - let mut join_set = JoinSet::new(); - for mut worker_entry in workers { - join_set.spawn(async move { - let result = worker_entry.worker.wait_for_pause().await; - if let Err(e) = result { + let futures: FuturesUnordered<_> = workers + .into_iter() + .map(|mut worker_entry| async move { + if let Err(e) = worker_entry.worker.pause().await { error!("Worker failed to shutdown: {:?}", e); return; } worker_entry.worker.shutdown().await; - }); - } + }) + .collect(); - join_set.join_all().await; + futures.collect::<()>().await; } } @@ -454,4 +454,4 @@ mod tests { // This should succeed as we're not in an async context assert!(shared_runtime.after_fork_child().is_ok()); } -} \ No newline at end of file +} diff --git a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs index ea7778912c..23ce9448e0 100644 --- a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs +++ 
b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs @@ -113,31 +113,11 @@ impl PausableWorker { } } - /// Request the worker to pause without waiting for task termination. - /// - /// This is useful when pausing multiple workers in parallel. - pub fn request_pause(&self) -> Result<(), PausableWorkerError> { - match self { - PausableWorker::Running { stop_token, .. } => { - debug!("Requesting pause for worker"); - stop_token.cancel(); - Ok(()) - } - PausableWorker::Paused { .. } => Ok(()), - PausableWorker::InvalidState => Err(PausableWorkerError::InvalidState), - } - } - /// Pause the worker and wait for it to complete, storing its state for restart. /// - /// This method will cancel the worker's cancellation token if it hasn't been cancelled yet, - /// then wait for the worker to finish and store its state. Calling [`Self::request_pause`] - /// before this method is optional - it's only needed when shutting down multiple workers - /// simultaneously to allow them to pause concurrently before waiting for all workers to pause. - /// /// # Errors /// Fails if the worker handle has been aborted preventing the worker from being retrieved. - pub async fn wait_for_pause(&mut self) -> Result<(), PausableWorkerError> { + pub async fn pause(&mut self) -> Result<(), PausableWorkerError> { match self { PausableWorker::Running { .. 
} => { debug!("Waiting for worker to pause"); @@ -148,7 +128,6 @@ impl PausableWorker { return Ok(()); }; - // Cancel the token if it hasn't been cancelled yet to avoid deadlock if !stop_token.is_cancelled() { stop_token.cancel(); } @@ -224,7 +203,7 @@ mod tests { pausable_worker.start(&runtime).unwrap(); assert_eq!(receiver.recv().unwrap(), 0); - runtime.block_on(async { pausable_worker.wait_for_pause().await.unwrap() }); + runtime.block_on(async { pausable_worker.pause().await.unwrap() }); // Empty the message queue and get the last message let mut next_message = 1; for message in receiver.try_iter() { From c0c7b2ea5c71876b9027ed052a6c42993784242e Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 27 Mar 2026 13:59:28 +0100 Subject: [PATCH 51/80] fix(telemetry): clear items in telemetry store --- libdd-telemetry/src/worker/store.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/libdd-telemetry/src/worker/store.rs b/libdd-telemetry/src/worker/store.rs index 1c2d400900..e085c8a9b1 100644 --- a/libdd-telemetry/src/worker/store.rs +++ b/libdd-telemetry/src/worker/store.rs @@ -38,6 +38,13 @@ mod queuehashmap { self.items.is_empty() } + /// Clear the map, reusing existing allocations + pub fn clear(&mut self) { + self.table.clear(); + self.items.clear(); + self.popped = 0; + } + // Remove the oldest item in the queue and return it pub fn pop_front(&mut self) -> Option<(K, V)> { let (k, v) = self.items.pop_front()?; @@ -209,9 +216,10 @@ where self.items.len() } - /// Discard only pending unflushed items while preserving stored dedupe history. + /// Discard pending unflushed items and clear stored dedupe history. 
pub fn clear(&mut self) { self.unflushed.clear(); + self.items.clear(); } } From ec6fbec974dce2dcc6ca87c58352cb419f6ca467 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 27 Mar 2026 15:23:17 +0100 Subject: [PATCH 52/80] feat: use biased select to reduce time-to-pause --- .../src/shared_runtime/pausable_worker.rs | 29 +++++++++---------- libdd-shared-runtime/src/worker.rs | 7 +++++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs index 23ce9448e0..ec618e8417 100644 --- a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs +++ b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs @@ -14,15 +14,6 @@ use tracing::debug; /// Used to allow a [`super::Worker`] to be paused while saving its state when /// dropping a tokio runtime to be able to restart with the same state on a new runtime. This is /// used to stop all threads before a fork to avoid deadlocks in child. -/// -/// # Time-to-pause -/// This loop should yield regularly to reduce time-to-pause. See [`tokio::task::yield_now`]. -/// -/// # Cancellation safety -/// The main loop can be interrupted at any yield point (`.await`ed call). The state of the worker -/// at this point will be saved and used to restart the worker. To be able to safely restart, the -/// worker must be in a valid state on every call to `.await`. -/// See [`tokio::select#cancellation-safety`] for more details. #[derive(Debug)] pub enum PausableWorker { Running { @@ -84,23 +75,31 @@ impl PausableWorker { let handle = rt.spawn(async move { // First iteration using initial_trigger select! { - _ = worker.initial_trigger() => { - worker.run().await; - } + // Always check for cancellation first, to reduce time-to-pause in case + // the initial trigger is always ready. 
+ biased; + _ = cloned_token.cancelled() => { return worker; } + _ = worker.initial_trigger() => { + worker.run().await; + } } // Regular iterations loop { select! { - _ = worker.trigger() => { - worker.run().await; - } + // Always check for cancellation first, to reduce time-to-pause in case + // the trigger is always ready. + biased; + _ = cloned_token.cancelled() => { break; } + _ = worker.trigger() => { + worker.run().await; + } } } worker diff --git a/libdd-shared-runtime/src/worker.rs b/libdd-shared-runtime/src/worker.rs index e6bc0e9160..d3b10ba2a1 100644 --- a/libdd-shared-runtime/src/worker.rs +++ b/libdd-shared-runtime/src/worker.rs @@ -9,6 +9,12 @@ use async_trait::async_trait; /// The worker's [`run`](Self::run) method is executed every time [`trigger`](Self::trigger) /// returns. On startup [`initial_trigger`](Self::initial_trigger) is called before the first /// [`run`](Self::run). +/// +/// # Cancellation safety +/// The `trigger` function can be interrupted at any yield point (`.await`ed call). The state of the worker +/// at this point will be saved and used to restart the worker. To be able to safely restart, the +/// worker must be in a valid state on every call to `.await` within the trigger function. +/// See [`tokio::select#cancellation-safety`] for more details. #[async_trait] pub trait Worker: std::fmt::Debug { /// Main worker function @@ -18,6 +24,7 @@ pub trait Worker: std::fmt::Debug { async fn run(&mut self); /// Function called between each `run` to wait for the next run. + /// This function should be cancellation safe as it can be cancelled at any yield point. async fn trigger(&mut self); /// Alternative trigger called on start to provide custom behavior. 
From a523c1df50e0dcf8bf0410239227244c7b40b619 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 27 Mar 2026 17:06:37 +0100 Subject: [PATCH 53/80] chore(shared-runtime): address mutex lock order concerns --- libdd-shared-runtime/src/shared_runtime/mod.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/libdd-shared-runtime/src/shared_runtime/mod.rs b/libdd-shared-runtime/src/shared_runtime/mod.rs index 46234ac4e0..55bbfd0928 100644 --- a/libdd-shared-runtime/src/shared_runtime/mod.rs +++ b/libdd-shared-runtime/src/shared_runtime/mod.rs @@ -137,6 +137,10 @@ impl From for SharedRuntimeError { /// The SharedRuntime owns a tokio runtime and tracks PausableWorkers spawned on it. /// It provides methods to safely pause workers before forking and restart them /// after fork in both parent and child processes. +/// +/// # Mutex lock order +/// When locking both [Self::runtime] and [Self::workers], the mutex must be locked in the order of +/// the fields in the struct. When possible avoid holding both locks simultaneously. #[derive(Debug)] pub struct SharedRuntime { runtime: Arc>>>, @@ -254,13 +258,15 @@ impl SharedRuntime { let runtime_lock = self.runtime.lock_or_panic(); let runtime = runtime_lock .as_ref() - .ok_or(SharedRuntimeError::RuntimeUnavailable)?; + .ok_or(SharedRuntimeError::RuntimeUnavailable)? + .clone(); + drop(runtime_lock); let mut workers_lock = self.workers.lock_or_panic(); // Restart all workers for worker_entry in workers_lock.iter_mut() { - worker_entry.worker.start(runtime)?; + worker_entry.worker.start(&runtime)?; } Ok(()) @@ -281,14 +287,16 @@ impl SharedRuntime { let runtime_lock = self.runtime.lock_or_panic(); let runtime = runtime_lock .as_ref() - .ok_or(SharedRuntimeError::RuntimeUnavailable)?; + .ok_or(SharedRuntimeError::RuntimeUnavailable)? 
+ .clone(); + drop(runtime_lock); let mut workers_lock = self.workers.lock_or_panic(); // Restart all workers in child process for worker_entry in workers_lock.iter_mut() { worker_entry.worker.reset(); - worker_entry.worker.start(runtime)?; + worker_entry.worker.start(&runtime)?; } Ok(()) From 21402c2e14aab2be891620a45e7ac3081379181a Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 27 Mar 2026 17:36:20 +0100 Subject: [PATCH 54/80] style: clippy and fmt --- libdd-data-pipeline-ffi/src/trace_exporter.rs | 1 - libdd-data-pipeline/examples/send-traces-with-stats.rs | 8 +++----- libdd-shared-runtime/src/worker.rs | 6 +++--- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/libdd-data-pipeline-ffi/src/trace_exporter.rs b/libdd-data-pipeline-ffi/src/trace_exporter.rs index eb59e46e28..354adc656d 100644 --- a/libdd-data-pipeline-ffi/src/trace_exporter.rs +++ b/libdd-data-pipeline-ffi/src/trace_exporter.rs @@ -1162,4 +1162,3 @@ mod tests { } } } - diff --git a/libdd-data-pipeline/examples/send-traces-with-stats.rs b/libdd-data-pipeline/examples/send-traces-with-stats.rs index 7542826a7e..03573045d0 100644 --- a/libdd-data-pipeline/examples/send-traces-with-stats.rs +++ b/libdd-data-pipeline/examples/send-traces-with-stats.rs @@ -2,15 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 use clap::Parser; -use libdd_data_pipeline::{ - shared_runtime::SharedRuntime, - trace_exporter::{ - TelemetryConfig, TraceExporter, TraceExporterInputFormat, TraceExporterOutputFormat, - }, +use libdd_data_pipeline::trace_exporter::{ + TelemetryConfig, TraceExporter, TraceExporterInputFormat, TraceExporterOutputFormat, }; use libdd_log::logger::{ logger_configure_std, logger_set_log_level, LogEventLevel, StdConfig, StdTarget, }; +use libdd_shared_runtime::SharedRuntime; use libdd_trace_protobuf::pb; use std::{ collections::HashMap, diff --git a/libdd-shared-runtime/src/worker.rs b/libdd-shared-runtime/src/worker.rs index d3b10ba2a1..0d7cf6d0da 100644 --- 
a/libdd-shared-runtime/src/worker.rs +++ b/libdd-shared-runtime/src/worker.rs @@ -11,9 +11,9 @@ use async_trait::async_trait; /// [`run`](Self::run). /// /// # Cancellation safety -/// The `trigger` function can be interrupted at any yield point (`.await`ed call). The state of the worker -/// at this point will be saved and used to restart the worker. To be able to safely restart, the -/// worker must be in a valid state on every call to `.await` within the trigger function. +/// The `trigger` function can be interrupted at any yield point (`.await`ed call). The state of the +/// worker at this point will be saved and used to restart the worker. To be able to safely restart, +/// the worker must be in a valid state on every call to `.await` within the trigger function. /// See [`tokio::select#cancellation-safety`] for more details. #[async_trait] pub trait Worker: std::fmt::Debug { From f6d69c17509c6db4bb9a4370f3fc0996981c64f3 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 27 Mar 2026 17:50:53 +0100 Subject: [PATCH 55/80] chore: update 3rd party --- LICENSE-3rdparty.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE-3rdparty.yml b/LICENSE-3rdparty.yml index b0bccd7c2e..ffb2d472c8 100644 --- a/LICENSE-3rdparty.yml +++ b/LICENSE-3rdparty.yml @@ -1,4 +1,4 @@ -root_name: builder, build_common, tools, libdd-alloc, libdd-crashtracker, libdd-common, libdd-telemetry, libdd-ddsketch, libdd-libunwind-sys, libdd-crashtracker-ffi, libdd-common-ffi, datadog-ffe, datadog-ffe-ffi, datadog-ipc, datadog-ipc-macros, libdd-tinybytes, tarpc, tarpc-plugins, spawn_worker, cc_utils, libdd-library-config, libdd-trace-protobuf, libdd-library-config-ffi, datadog-live-debugger, libdd-data-pipeline, libdd-dogstatsd-client, libdd-trace-stats, libdd-trace-utils, libdd-trace-normalization, libdd-log, datadog-live-debugger-ffi, libdd-profiling, libdd-profiling-protobuf, libdd-profiling-ffi, libdd-data-pipeline-ffi, libdd-ddsketch-ffi, libdd-log-ffi, 
libdd-telemetry-ffi, symbolizer-ffi, datadog-profiling-replayer, datadog-remote-config, datadog-sidecar, datadog-sidecar-macros, datadog-sidecar-ffi, libdd-trace-obfuscation, datadog-tracer-flare, sidecar_mockgen, test_spawn_from_lib, libdd-shared-runtime, libdd-shared-runtime-ffi +root_name: builder, build_common, tools, libdd-alloc, libdd-crashtracker, libdd-common, libdd-telemetry, libdd-ddsketch, libdd-shared-runtime, libdd-libunwind-sys, libdd-crashtracker-ffi, libdd-common-ffi, datadog-ffe, datadog-ffe-ffi, datadog-ipc, datadog-ipc-macros, libdd-tinybytes, tarpc, tarpc-plugins, spawn_worker, cc_utils, libdd-library-config, libdd-trace-protobuf, libdd-library-config-ffi, datadog-live-debugger, libdd-data-pipeline, libdd-dogstatsd-client, libdd-trace-stats, libdd-trace-utils, libdd-trace-normalization, libdd-log, datadog-live-debugger-ffi, libdd-profiling, libdd-profiling-protobuf, libdd-profiling-ffi, libdd-data-pipeline-ffi, libdd-ddsketch-ffi, libdd-log-ffi, libdd-telemetry-ffi, symbolizer-ffi, datadog-profiling-replayer, datadog-remote-config, datadog-sidecar, datadog-sidecar-macros, datadog-sidecar-ffi, libdd-trace-obfuscation, datadog-tracer-flare, sidecar_mockgen, test_spawn_from_lib, bin_tests third_party_libraries: - package_name: addr2line package_version: 0.24.2 From 35d34c0c203214e7c1d32fe0c45e429d25a1c380 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 30 Mar 2026 14:27:03 +0200 Subject: [PATCH 56/80] chore: fix conflicts --- Cargo.lock | 4 ++-- libdd-data-pipeline-ffi/src/trace_exporter.rs | 1 + libdd-data-pipeline/src/stats_exporter.rs | 2 +- libdd-shared-runtime/Cargo.toml | 4 ++-- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9cd28ee36f..2297816486 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3181,7 +3181,7 @@ dependencies = [ [[package]] name = "libdd-shared-runtime" -version = "28.0.3" +version = "29.0.0" dependencies = [ "async-trait", "futures", @@ -3193,7 +3193,7 @@ dependencies = [ 
[[package]] name = "libdd-shared-runtime-ffi" -version = "28.0.3" +version = "29.0.0" dependencies = [ "build_common", "libdd-shared-runtime", diff --git a/libdd-data-pipeline-ffi/src/trace_exporter.rs b/libdd-data-pipeline-ffi/src/trace_exporter.rs index 160774804b..695231b3a2 100644 --- a/libdd-data-pipeline-ffi/src/trace_exporter.rs +++ b/libdd-data-pipeline-ffi/src/trace_exporter.rs @@ -530,6 +530,7 @@ pub unsafe extern "C" fn ddog_trace_exporter_new( if let Some(runtime) = config.shared_runtime.clone() { builder.set_shared_runtime(runtime); + } if let Some(ref url) = config.otlp_endpoint { builder.set_otlp_endpoint(url); diff --git a/libdd-data-pipeline/src/stats_exporter.rs b/libdd-data-pipeline/src/stats_exporter.rs index caa18190a6..bedd1e2ed7 100644 --- a/libdd-data-pipeline/src/stats_exporter.rs +++ b/libdd-data-pipeline/src/stats_exporter.rs @@ -354,7 +354,7 @@ mod tests { when.method(POST) .header("Content-type", "application/msgpack") .path("/v0.6/stats") - .body_includes("libdatadog-test"); + .body_includes("libdatadog-test") .body_includes("key1:value1,key2:value2"); then.status(200).body(""); }); diff --git a/libdd-shared-runtime/Cargo.toml b/libdd-shared-runtime/Cargo.toml index 7a3e8b2edd..9b2325e617 100644 --- a/libdd-shared-runtime/Cargo.toml +++ b/libdd-shared-runtime/Cargo.toml @@ -21,7 +21,7 @@ futures = { version = "0.3", default-features = false, features = ["alloc"] } tokio = { version = "1.23", features = ["rt", "rt-multi-thread", "time"] } tokio-util = "0.7.11" tracing = { version = "0.1", default-features = false } -libdd-common = { version = "2.0.0", path = "../libdd-common", default-features = false } +libdd-common = { version = "3.0.1", path = "../libdd-common", default-features = false } [lints.rust] -unexpected_cfgs = { level = "warn", check-cfg = ['cfg(coverage)'] } \ No newline at end of file +unexpected_cfgs = { level = "warn", check-cfg = ['cfg(coverage)'] } From b1b5f9b68f9d780a78f25e3fe520dcee4352ee84 Mon Sep 17 00:00:00 2001 
From: vianney Date: Mon, 30 Mar 2026 14:43:22 +0200 Subject: [PATCH 57/80] chore: remove legacy test --- libdd-data-pipeline/src/trace_exporter/mod.rs | 9 --------- 1 file changed, 9 deletions(-) diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 5236b3aa08..0396af61df 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -1830,15 +1830,6 @@ mod tests { ); mock_otlp.assert(); } - - #[test] - #[cfg_attr(miri, ignore)] - fn stop_and_start_runtime() { - let builder = TraceExporterBuilder::default(); - let exporter = builder.build().unwrap(); - exporter.stop_worker(); - exporter.run_worker().unwrap(); - } } #[cfg(test)] From da9ce1e0b5d8c582084224eeb53d87d2ee2b0de0 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 30 Mar 2026 14:47:27 +0200 Subject: [PATCH 58/80] chore: bump libdd-common version --- Cargo.lock | 4 ++-- libdd-shared-runtime/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a4d067c4fe..5b73745e67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3180,7 +3180,7 @@ dependencies = [ [[package]] name = "libdd-shared-runtime" -version = "29.0.0" +version = "30.0.0" dependencies = [ "async-trait", "futures", @@ -3192,7 +3192,7 @@ dependencies = [ [[package]] name = "libdd-shared-runtime-ffi" -version = "29.0.0" +version = "30.0.0" dependencies = [ "build_common", "libdd-shared-runtime", diff --git a/libdd-shared-runtime/Cargo.toml b/libdd-shared-runtime/Cargo.toml index 9b2325e617..e0cf38fe78 100644 --- a/libdd-shared-runtime/Cargo.toml +++ b/libdd-shared-runtime/Cargo.toml @@ -21,7 +21,7 @@ futures = { version = "0.3", default-features = false, features = ["alloc"] } tokio = { version = "1.23", features = ["rt", "rt-multi-thread", "time"] } tokio-util = "0.7.11" tracing = { version = "0.1", default-features = false } -libdd-common = { version = "3.0.1", path = "../libdd-common", 
default-features = false } +libdd-common = { version = "3.0.2", path = "../libdd-common", default-features = false } [lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ['cfg(coverage)'] } From b9fe72e148cc11a66dd3a2ca319d67c2bebdcea1 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 30 Mar 2026 14:54:02 +0200 Subject: [PATCH 59/80] chore: set shared runtime version --- Cargo.lock | 2 +- libdd-data-pipeline-ffi/Cargo.toml | 2 +- libdd-data-pipeline/Cargo.toml | 2 +- libdd-shared-runtime-ffi/Cargo.toml | 2 +- libdd-shared-runtime/Cargo.toml | 2 +- libdd-telemetry/Cargo.toml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b73745e67..940a4f9629 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3180,7 +3180,7 @@ dependencies = [ [[package]] name = "libdd-shared-runtime" -version = "30.0.0" +version = "1.0.0" dependencies = [ "async-trait", "futures", diff --git a/libdd-data-pipeline-ffi/Cargo.toml b/libdd-data-pipeline-ffi/Cargo.toml index dbf766d471..0598040d16 100644 --- a/libdd-data-pipeline-ffi/Cargo.toml +++ b/libdd-data-pipeline-ffi/Cargo.toml @@ -30,7 +30,7 @@ libdd-trace-utils = { path = "../libdd-trace-utils" } [dependencies] libdd-data-pipeline = { path = "../libdd-data-pipeline" } -libdd-shared-runtime = { path = "../libdd-shared-runtime" } +libdd-shared-runtime = { version = "1.0.0", path = "../libdd-shared-runtime" } libdd-common-ffi = { path = "../libdd-common-ffi", default-features = false } libdd-tinybytes = { path = "../libdd-tinybytes" } tracing = { version = "0.1", default-features = false } diff --git a/libdd-data-pipeline/Cargo.toml b/libdd-data-pipeline/Cargo.toml index 503e1982bb..0432802309 100644 --- a/libdd-data-pipeline/Cargo.toml +++ b/libdd-data-pipeline/Cargo.toml @@ -31,7 +31,7 @@ tokio = { version = "1.23", features = [ ], default-features = false } uuid = { version = "1.10.0", features = ["v4"] } libdd-common = { version = "3.0.2", path = "../libdd-common", default-features = false 
} -libdd-shared-runtime = { path = "../libdd-shared-runtime" } +libdd-shared-runtime = { version = "1.0.0", path = "../libdd-shared-runtime" } libdd-telemetry = { version = "4.0.0", path = "../libdd-telemetry", default-features = false } libdd-trace-protobuf = { version = "3.0.1", path = "../libdd-trace-protobuf" } libdd-trace-stats = { version = "2.0.0", path = "../libdd-trace-stats" } diff --git a/libdd-shared-runtime-ffi/Cargo.toml b/libdd-shared-runtime-ffi/Cargo.toml index 2b31805dc1..d965ef9240 100644 --- a/libdd-shared-runtime-ffi/Cargo.toml +++ b/libdd-shared-runtime-ffi/Cargo.toml @@ -22,5 +22,5 @@ cbindgen = ["build_common/cbindgen"] build_common = { path = "../build-common" } [dependencies] -libdd-shared-runtime = { path = "../libdd-shared-runtime" } +libdd-shared-runtime = { version = "1.0.0", path = "../libdd-shared-runtime" } tracing = { version = "0.1", default-features = false } diff --git a/libdd-shared-runtime/Cargo.toml b/libdd-shared-runtime/Cargo.toml index e0cf38fe78..2ba5227da1 100644 --- a/libdd-shared-runtime/Cargo.toml +++ b/libdd-shared-runtime/Cargo.toml @@ -3,7 +3,7 @@ [package] name = "libdd-shared-runtime" -version.workspace = true +version = "1.0.0" description = "Shared tokio runtime with fork-safe worker management for Datadog libraries" homepage = "https://github.com/DataDog/libdatadog/tree/main/libdd-shared-runtime" repository = "https://github.com/DataDog/libdatadog/tree/main/libdd-shared-runtime" diff --git a/libdd-telemetry/Cargo.toml b/libdd-telemetry/Cargo.toml index 9c18dfa40c..af1597386e 100644 --- a/libdd-telemetry/Cargo.toml +++ b/libdd-telemetry/Cargo.toml @@ -33,7 +33,7 @@ uuid = { version = "1.3", features = ["v4"] } hashbrown = "0.15" libdd-common = { version = "3.0.2", path = "../libdd-common", default-features = false } -libdd-shared-runtime = { path = "../libdd-shared-runtime" } +libdd-shared-runtime = { version = "1.0.0", path = "../libdd-shared-runtime" } libdd-ddsketch = { version = "1.0.1", 
path = "../libdd-ddsketch" } [target."cfg(unix)".dependencies] From 264d9562c2e1b463bfb1756e74c5d1ce7214b4c8 Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 30 Mar 2026 14:58:14 +0200 Subject: [PATCH 60/80] chore: update 3rd party license files --- LICENSE-3rdparty.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE-3rdparty.yml b/LICENSE-3rdparty.yml index 043f1db5f8..05b9f82194 100644 --- a/LICENSE-3rdparty.yml +++ b/LICENSE-3rdparty.yml @@ -1,4 +1,4 @@ -root_name: builder, build_common, tools, libdd-alloc, libdd-crashtracker, libdd-common, libdd-telemetry, libdd-ddsketch, libdd-shared-runtime, libdd-libunwind-sys, libdd-crashtracker-ffi, libdd-common-ffi, datadog-ffe, datadog-ffe-ffi, datadog-ipc, datadog-ipc-macros, libdd-tinybytes, spawn_worker, cc_utils, libdd-library-config, libdd-trace-protobuf, libdd-library-config-ffi, datadog-live-debugger, libdd-data-pipeline, libdd-dogstatsd-client, libdd-trace-stats, libdd-trace-utils, libdd-trace-normalization, libdd-log, datadog-live-debugger-ffi, libdd-profiling, libdd-profiling-protobuf, libdd-profiling-ffi, libdd-data-pipeline-ffi, libdd-ddsketch-ffi, libdd-log-ffi, libdd-telemetry-ffi, symbolizer-ffi, datadog-profiling-replayer, datadog-remote-config, datadog-sidecar, datadog-sidecar-macros, datadog-sidecar-ffi, libdd-trace-obfuscation, datadog-tracer-flare, sidecar_mockgen, test_spawn_from_lib +root_name: builder, build_common, tools, libdd-alloc, libdd-crashtracker, libdd-common, libdd-telemetry, libdd-ddsketch, libdd-shared-runtime, libdd-libunwind-sys, libdd-crashtracker-ffi, libdd-common-ffi, datadog-ffe, datadog-ffe-ffi, datadog-ipc, datadog-ipc-macros, libdd-tinybytes, spawn_worker, cc_utils, libdd-library-config, libdd-trace-protobuf, libdd-library-config-ffi, datadog-live-debugger, libdd-data-pipeline, libdd-dogstatsd-client, libdd-trace-stats, libdd-trace-utils, libdd-trace-normalization, libdd-log, datadog-live-debugger-ffi, libdd-profiling, libdd-profiling-protobuf, 
libdd-profiling-ffi, libdd-data-pipeline-ffi, libdd-ddsketch-ffi, libdd-log-ffi, libdd-telemetry-ffi, symbolizer-ffi, datadog-profiling-replayer, datadog-remote-config, datadog-sidecar, datadog-sidecar-macros, datadog-sidecar-ffi, libdd-trace-obfuscation, datadog-tracer-flare, sidecar_mockgen, test_spawn_from_lib, bin_tests third_party_libraries: - package_name: addr2line package_version: 0.24.2 From 41bff26e74e36a98cff59386b3af0d285b7b9f7d Mon Sep 17 00:00:00 2001 From: vianney Date: Mon, 30 Mar 2026 15:55:11 +0200 Subject: [PATCH 61/80] test: update test to shared runtime --- libdd-data-pipeline/src/stats_exporter.rs | 50 ++++++++++++----------- libdd-telemetry/src/worker/mod.rs | 12 +++--- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/libdd-data-pipeline/src/stats_exporter.rs b/libdd-data-pipeline/src/stats_exporter.rs index bedd1e2ed7..f97d4a1d45 100644 --- a/libdd-data-pipeline/src/stats_exporter.rs +++ b/libdd-data-pipeline/src/stats_exporter.rs @@ -306,39 +306,41 @@ mod tests { } #[cfg_attr(miri, ignore)] - #[tokio::test] - async fn test_run() { - let server = MockServer::start_async().await; + #[test] + fn test_run() { + let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); - let mut mock = server - .mock_async(|when, then| { - when.method(POST) - .header("Content-type", "application/msgpack") - .path("/v0.6/stats") - .body_includes("libdatadog-test") - .body_includes("key1:value1,key2:value2"); - then.status(200).body(""); - }) - .await; + let server = MockServer::start(); - let mut stats_exporter = StatsExporter::new( - BUCKETS_DURATION, + let mut mock = server.mock(|when, then| { + when.method(POST) + .header("Content-type", "application/msgpack") + .path("/v0.6/stats") + .body_includes("libdatadog-test") + .body_includes("key1:value1,key2:value2"); + then.status(200).body(""); + }); + + let stats_exporter = StatsExporter::new( + // Use smaller buckets duration to speed up test + Duration::from_secs(1), 
Arc::new(Mutex::new(get_test_concentrator())), get_test_metadata(), Endpoint::from_url(stats_url_from_agent_url(&server.url("/")).unwrap()), new_default_client(), ); - tokio::time::pause(); - tokio::spawn(async move { - stats_exporter.run().await; - }); - // Wait for the stats to be flushed - tokio::time::sleep(BUCKETS_DURATION + Duration::from_secs(1)).await; - // Resume time to sleep while the stats are being sent - tokio::time::resume(); + let _handle = shared_runtime + .spawn_worker(stats_exporter) + .expect("Failed to spawn worker"); + + // Wait for stats to be flushed + std::thread::sleep(Duration::from_secs(1)); + assert!( - poll_for_mock_hit(&mut mock, 10, 100, 1, false).await, + shared_runtime + .block_on(poll_for_mock_hit(&mut mock, 10, 100, 1, false)) + .expect("Failed to use runtime"), "Expected max retry attempts" ); } diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index 430588beea..3dc19d4087 100644 --- a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -1364,24 +1364,24 @@ mod tests { let stats = worker.stats(); assert_eq!( - stats.dependencies_stored, 1, - "dependency dedupe history should be preserved" + stats.dependencies_stored, 0, + "dependency dedupe history should be cleared" ); assert_eq!( stats.dependencies_unflushed, 0, "dependency pending queue should be cleared" ); assert_eq!( - stats.integrations_stored, 1, - "integration dedupe history should be preserved" + stats.integrations_stored, 0, + "integration dedupe history should be cleared" ); assert_eq!( stats.integrations_unflushed, 0, "integration pending queue should be cleared" ); assert_eq!( - stats.configurations_stored, 1, - "configuration dedupe history should be preserved" + stats.configurations_stored, 0, + "configuration dedupe history should be cleared" ); assert_eq!( stats.configurations_unflushed, 0, From a0047b4c70c12ca792c4f7e67afadf53fc6c2fe7 Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 31 Mar 2026 
18:09:16 +0200 Subject: [PATCH 62/80] feat!: bump libdd-data-pipeline to major version --- Cargo.lock | 2 +- libdd-data-pipeline/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 940a4f9629..1bb7e806f6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2955,7 +2955,7 @@ dependencies = [ [[package]] name = "libdd-data-pipeline" -version = "3.0.1" +version = "4.0.0" dependencies = [ "anyhow", "arc-swap", diff --git a/libdd-data-pipeline/Cargo.toml b/libdd-data-pipeline/Cargo.toml index 0432802309..88fc4a4398 100644 --- a/libdd-data-pipeline/Cargo.toml +++ b/libdd-data-pipeline/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libdd-data-pipeline" -version= "3.0.1" +version= "4.0.0" description = "Trace exporter package allowing sending data from datadog SDKs to the Trace Agent." homepage = "https://github.com/DataDog/libdatadog/tree/main/libdd-data-pipeline" repository = "https://github.com/DataDog/libdatadog/tree/main/libdd-data-pipeline" From 39d0920fe3cffcc504f374273b832b45469264bd Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 2 Apr 2026 11:26:07 +0200 Subject: [PATCH 63/80] feat(trace-exporter): remove exporter block on method --- libdd-data-pipeline/src/trace_exporter/mod.rs | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index fae93c6938..46a96fa858 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -240,9 +240,9 @@ impl TraceExporter { /// Returns [`SharedRuntimeError::ShutdownTimedOut`] if a timeout was given and elapsed before /// all workers finished. 
pub fn shutdown(self, timeout: Option) -> Result<(), TraceExporterError> { - let runtime = self.shared_runtime.clone(); if let Some(timeout) = timeout { - match runtime + match self + .shared_runtime .block_on(async { tokio::time::timeout(timeout, self.shutdown_workers()).await }) .map_err(TraceExporterError::Io)? { @@ -252,7 +252,7 @@ impl TraceExporter { ))), } } else { - runtime + self.shared_runtime .block_on(self.shutdown_workers()) .map_err(TraceExporterError::Io)?; Ok(()) @@ -286,13 +286,6 @@ impl TraceExporter { } } - /// Run a future to completion on the shared runtime. - fn block_on(&self, f: F) -> Result { - self.shared_runtime - .block_on(f) - .map_err(TraceExporterError::Io) - } - /// Manually start all workers #[cfg(not(target_arch = "wasm32"))] pub fn run_worker(&self) -> Result<(), TraceExporterError> { @@ -476,7 +469,8 @@ impl TraceExporter { trace_chunks: Vec>>, ) -> Result { self.check_agent_info(); - self.block_on(async { self.send_trace_chunks_inner(trace_chunks).await })? + self.shared_runtime + .block_on(async { self.send_trace_chunks_inner(trace_chunks).await })? } /// Send a list of trace chunks to the agent, asynchronously (or OTLP when configured). @@ -553,7 +547,8 @@ impl TraceExporter { None, ); - self.block_on(async { self.send_trace_chunks_inner(traces).await })? + self.shared_runtime + .block_on(async { self.send_trace_chunks_inner(traces).await })? 
} /// Send traces payload to agent with retry and telemetry reporting From b6a2b9550c33f3b7aae1d9d98ef7d3fea952d5fa Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 2 Apr 2026 13:49:28 +0200 Subject: [PATCH 64/80] doc(shared-runtime): Add warnings to doc --- libdd-shared-runtime/src/shared_runtime/mod.rs | 6 ++++++ libdd-shared-runtime/src/worker.rs | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/libdd-shared-runtime/src/shared_runtime/mod.rs b/libdd-shared-runtime/src/shared_runtime/mod.rs index 55bbfd0928..1148e5da14 100644 --- a/libdd-shared-runtime/src/shared_runtime/mod.rs +++ b/libdd-shared-runtime/src/shared_runtime/mod.rs @@ -67,6 +67,11 @@ impl WorkerHandle { /// /// # Errors /// Returns an error if the worker has already been stopped. + /// + /// # Cancel safety + /// This function is *NOT* cancel safe and shouldn't be called in [Worker::trigger]. + /// If cancelled, the stopped worker can end up in an invalid state if a fork occurs while + /// stopping. pub async fn stop(self) -> Result<(), WorkerHandleError> { let mut worker = { let mut workers_lock = self.workers.lock_or_panic(); @@ -212,6 +217,7 @@ impl SharedRuntime { /// preventing potential deadlocks in the child process. /// /// Worker errors are logged but do not cause the function to fail. + /// If the worker fails to pause it is dropped without calling shutdown. 
pub fn before_fork(&self) { debug!("before_fork: pausing all workers"); if let Some(runtime) = self.runtime.lock_or_panic().take() { diff --git a/libdd-shared-runtime/src/worker.rs b/libdd-shared-runtime/src/worker.rs index 0d7cf6d0da..71e7211b4d 100644 --- a/libdd-shared-runtime/src/worker.rs +++ b/libdd-shared-runtime/src/worker.rs @@ -19,7 +19,7 @@ use async_trait::async_trait; pub trait Worker: std::fmt::Debug { /// Main worker function /// - /// Code in this function should always use timeout on long-running await calls to avoid + /// Code in this function must always use timeout on long-running await calls to avoid /// blocking forks if an await call takes too long to complete. async fn run(&mut self); From a0932f80aab776274bfc1decbd6799866f0c8b8e Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 2 Apr 2026 13:50:06 +0200 Subject: [PATCH 65/80] fix(shared-runtime): fix race-condition between spawn_worker and before_fork --- .../src/shared_runtime/mod.rs | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/libdd-shared-runtime/src/shared_runtime/mod.rs b/libdd-shared-runtime/src/shared_runtime/mod.rs index 1148e5da14..ebc4702aaf 100644 --- a/libdd-shared-runtime/src/shared_runtime/mod.rs +++ b/libdd-shared-runtime/src/shared_runtime/mod.rs @@ -186,20 +186,24 @@ impl SharedRuntime { let boxed_worker: BoxedWorker = Box::new(worker); debug!(?boxed_worker, "Spawning worker on SharedRuntime"); let mut pausable_worker = PausableWorker::new(boxed_worker); - let worker_id = self.next_worker_id.fetch_add(1, Ordering::Relaxed); - - { - let runtime_lock = self.runtime.lock_or_panic(); - // If the runtime is not available, it's added to the worker list and will be started - // when the runtime is recreated. 
- if let Some(runtime) = runtime_lock.as_ref() { - pausable_worker.start(runtime)?; + // Hold the workers lock while starting the worker to avoid a race with + // before_fork: without this, before_fork could run after the worker is started but + // before it's added to the list, not pausing the worker before the runtime is dropped. + let runtime = self.runtime.lock_or_panic().clone(); + let mut workers_guard = self.workers.lock_or_panic(); + + // If the runtime is not available, the worker will be started + // when the runtime is recreated (after_fork_parent/child). + if let Some(runtime) = runtime { + if let Err(e) = pausable_worker.start(&runtime) { + return Err(e.into()); } } - let mut workers_lock = self.workers.lock_or_panic(); - workers_lock.push(WorkerEntry { + let worker_id = self.next_worker_id.fetch_add(1, Ordering::Relaxed); + + workers_guard.push(WorkerEntry { id: worker_id, worker: pausable_worker, }); From 27801f12aaa735852efbf8ab82b7e70e4ef819be Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 2 Apr 2026 13:50:43 +0200 Subject: [PATCH 66/80] chore: fix formating --- libdd-data-pipeline/src/trace_exporter/mod.rs | 8 -------- .../src/shared_runtime/pausable_worker.rs | 2 +- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 46a96fa858..ea4efe9fe0 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -286,14 +286,6 @@ impl TraceExporter { } } - /// Manually start all workers - #[cfg(not(target_arch = "wasm32"))] - pub fn run_worker(&self) -> Result<(), TraceExporterError> { - self.shared_runtime.after_fork_parent().map_err(|e| { - TraceExporterError::Internal(InternalErrorKind::InvalidWorkerState(e.to_string())) - }) - } - /// Send msgpack serialized traces to the agent /// /// # Arguments diff --git a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs 
b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs index b906f801f6..ad4e9c54ce 100644 --- a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs +++ b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs @@ -3,8 +3,8 @@ //! Defines a pausable worker to be able to stop background processes before forks -use libdd_capabilities::MaybeSend; use crate::worker::Worker; +use libdd_capabilities::MaybeSend; use std::fmt::Display; use tokio::{runtime::Runtime, select, task::JoinHandle}; use tokio_util::sync::CancellationToken; From 9807d94c0586d820d23368fb908c6770096919c6 Mon Sep 17 00:00:00 2001 From: vianney Date: Thu, 2 Apr 2026 14:09:10 +0200 Subject: [PATCH 67/80] chore: use MaybeSend --- libdd-shared-runtime/src/shared_runtime/mod.rs | 4 ++-- libdd-shared-runtime/src/worker.rs | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/libdd-shared-runtime/src/shared_runtime/mod.rs b/libdd-shared-runtime/src/shared_runtime/mod.rs index ebc4702aaf..1f63044f98 100644 --- a/libdd-shared-runtime/src/shared_runtime/mod.rs +++ b/libdd-shared-runtime/src/shared_runtime/mod.rs @@ -20,7 +20,7 @@ use std::{fmt, io}; use tokio::runtime::{Builder, Runtime}; use tracing::{debug, error}; -type BoxedWorker = Box; +type BoxedWorker = Box; #[derive(Debug)] struct WorkerEntry { @@ -179,7 +179,7 @@ impl SharedRuntime { /// /// # Errors /// Returns an error if the runtime is not available or the worker cannot be started. - pub fn spawn_worker( + pub fn spawn_worker( &self, worker: T, ) -> Result { diff --git a/libdd-shared-runtime/src/worker.rs b/libdd-shared-runtime/src/worker.rs index 71e7211b4d..1df33882a9 100644 --- a/libdd-shared-runtime/src/worker.rs +++ b/libdd-shared-runtime/src/worker.rs @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use async_trait::async_trait; +use libdd_capabilities::MaybeSend; /// A background worker meant to be spawned on a [`SharedRuntime`](crate::SharedRuntime). 
/// @@ -16,7 +17,7 @@ use async_trait::async_trait; /// the worker must be in a valid state on every call to `.await` within the trigger function. /// See [`tokio::select#cancellation-safety`] for more details. #[async_trait] -pub trait Worker: std::fmt::Debug { +pub trait Worker: std::fmt::Debug + MaybeSend { /// Main worker function /// /// Code in this function must always use timeout on long-running await calls to avoid @@ -46,7 +47,7 @@ pub trait Worker: std::fmt::Debug { // Blanket implementation for boxed trait objects #[async_trait] -impl Worker for Box { +impl Worker for Box { async fn run(&mut self) { (**self).run().await } From a0ec79c48ae6ae24f71abe6d77acc396883417c7 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 3 Apr 2026 10:36:54 +0200 Subject: [PATCH 68/80] chore(shared-runtime): fix build on wasm-32 --- libdd-data-pipeline/src/agent_info/fetcher.rs | 4 +- libdd-data-pipeline/src/stats_exporter.rs | 5 +- .../src/trace_exporter/builder.rs | 6 ++- libdd-data-pipeline/src/trace_exporter/mod.rs | 48 +++++++++++-------- libdd-shared-runtime/Cargo.toml | 5 +- .../src/shared_runtime/mod.rs | 40 ++++++++++------ .../src/shared_runtime/pausable_worker.rs | 3 ++ libdd-shared-runtime/src/worker.rs | 6 ++- 8 files changed, 74 insertions(+), 43 deletions(-) diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index 159f74a3f2..3ba1e5be35 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -18,7 +18,6 @@ use std::marker::PhantomData; use std::sync::Arc; use std::time::Duration; use tokio::sync::mpsc; -#[cfg(not(target_arch = "wasm32"))] use tokio::time::sleep; use tracing::{debug, warn}; /// Whether the agent reported the same value or not. 
@@ -222,7 +221,8 @@ impl AgentInfoFetcher { } } -#[async_trait] +#[cfg_attr(not(target_arch = "wasm32"), async_trait)] +#[cfg_attr(target_arch = "wasm32", async_trait(?Send))] impl Worker for AgentInfoFetcher { async fn initial_trigger(&mut self) { // Skip initial wait if cache is not populated diff --git a/libdd-data-pipeline/src/stats_exporter.rs b/libdd-data-pipeline/src/stats_exporter.rs index bb69322904..e23b0b5f4b 100644 --- a/libdd-data-pipeline/src/stats_exporter.rs +++ b/libdd-data-pipeline/src/stats_exporter.rs @@ -13,7 +13,7 @@ use std::{ use crate::trace_exporter::TracerMetadata; use async_trait::async_trait; use libdd_capabilities::{HttpClientTrait, MaybeSend}; -use libdd_common::{Endpoint}; +use libdd_common::Endpoint; use libdd_shared_runtime::Worker; use libdd_trace_protobuf::pb; use libdd_trace_stats::span_concentrator::SpanConcentrator; @@ -132,7 +132,8 @@ impl StatsExporter { } } -#[async_trait] +#[cfg_attr(not(target_arch = "wasm32"), async_trait)] +#[cfg_attr(target_arch = "wasm32", async_trait(?Send))] impl Worker for StatsExporter { async fn trigger(&mut self) { tokio::time::sleep(self.flush_interval).await; diff --git a/libdd-data-pipeline/src/trace_exporter/builder.rs b/libdd-data-pipeline/src/trace_exporter/builder.rs index 818fa23635..29c3d78657 100644 --- a/libdd-data-pipeline/src/trace_exporter/builder.rs +++ b/libdd-data-pipeline/src/trace_exporter/builder.rs @@ -427,7 +427,9 @@ impl TraceExporterBuilder { #[cfg(target_arch = "wasm32")] { - drop(info_fetcher); + let info_endpoint = Endpoint::from_url(add_path(&agent_url, INFO_ENDPOINT)); + let (_info_fetcher, info_response_observer) = + AgentInfoFetcher::::new(info_endpoint, Duration::from_secs(5 * 60)); Ok(TraceExporter { endpoint: Endpoint { @@ -457,7 +459,7 @@ impl TraceExporterBuilder { input_format: self.input_format, output_format: self.output_format, client_computed_top_level: self.client_computed_top_level, - runtime: Arc::new(Mutex::new(Some(runtime))), + shared_runtime, 
dogstatsd, common_stats_tags: vec![libdatadog_version], client_side_stats: ArcSwap::new(stats.into()), diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index ea4efe9fe0..1a49f9a515 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -158,6 +158,7 @@ impl<'a> From<&'a TracerMetadata> for HeaderMap { } /// Handles for the background workers owned by a [`TraceExporter`]. +#[cfg(not(target_arch = "wasm32"))] #[derive(Debug)] pub(crate) struct TraceExporterWorkers { info_fetcher: WorkerHandle, @@ -219,6 +220,7 @@ pub struct TraceExporter { telemetry: Option, health_metrics_enabled: bool, client: H, + #[cfg(not(target_arch = "wasm32"))] workers: TraceExporterWorkers, agent_payload_response_version: Option, /// When set, traces are exported via OTLP HTTP/JSON instead of the Datadog agent. @@ -240,9 +242,9 @@ impl TraceExporter { /// Returns [`SharedRuntimeError::ShutdownTimedOut`] if a timeout was given and elapsed before /// all workers finished. pub fn shutdown(self, timeout: Option) -> Result<(), TraceExporterError> { + let runtime = self.shared_runtime.clone(); if let Some(timeout) = timeout { - match self - .shared_runtime + match runtime .block_on(async { tokio::time::timeout(timeout, self.shutdown_workers()).await }) .map_err(TraceExporterError::Io)? { @@ -252,7 +254,7 @@ impl TraceExporter { ))), } } else { - self.shared_runtime + runtime .block_on(self.shutdown_workers()) .map_err(TraceExporterError::Io)?; Ok(()) @@ -260,30 +262,36 @@ impl TraceExporter { } async fn shutdown_workers(self) { - let mut join_set = JoinSet::new(); - - // Extract the stats handle before moving other fields. - if let StatsComputationStatus::Enabled { worker_handle, .. 
} = - &**self.client_side_stats.load() + #[cfg(not(target_arch = "wasm32"))] { - let handle = worker_handle.clone(); - join_set.spawn(async move { handle.stop().await }); - } + let mut join_set = JoinSet::new(); - let info_fetcher = self.workers.info_fetcher; - let telemetry = self.workers.telemetry; + // Extract the stats handle before moving other fields. + if let StatsComputationStatus::Enabled { worker_handle, .. } = + &**self.client_side_stats.load() + { + let handle = worker_handle.clone(); + join_set.spawn(async move { handle.stop().await }); + } - join_set.spawn(async move { info_fetcher.stop().await }); + let info_fetcher = self.workers.info_fetcher; + join_set.spawn(async move { info_fetcher.stop().await }); - if let Some(telemetry) = telemetry { - join_set.spawn(async move { telemetry.stop().await }); - } + #[cfg(feature = "telemetry")] + if let Some(telemetry) = self.workers.telemetry { + join_set.spawn(async move { telemetry.stop().await }); + } - while let Some(result) = join_set.join_next().await { - if let Ok(Err(e)) = result { - error!("Worker failed to shutdown: {:?}", e); + while let Some(result) = join_set.join_next().await { + if let Ok(Err(e)) = result { + error!("Worker failed to shutdown: {:?}", e); + } } } + + // On wasm32 workers are no-ops, nothing to stop. 
+ #[cfg(target_arch = "wasm32")] + let _ = self; } /// Send msgpack serialized traces to the agent diff --git a/libdd-shared-runtime/Cargo.toml b/libdd-shared-runtime/Cargo.toml index 1b344f0d4f..612cfee8c9 100644 --- a/libdd-shared-runtime/Cargo.toml +++ b/libdd-shared-runtime/Cargo.toml @@ -18,8 +18,11 @@ bench = false [dependencies] async-trait = "0.1" futures = { version = "0.3", default-features = false, features = ["alloc"] } -tokio = { version = "1.23", features = ["rt", "rt-multi-thread", "time"] } +tokio = { version = "1.23", features = ["rt", "macros", "time"] } tokio-util = "0.7.11" tracing = { version = "0.1", default-features = false } libdd-capabilities = { path = "../libdd-capabilities", version = "0.1.0" } libdd-common = { version = "3.0.2", path = "../libdd-common", default-features = false } + +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +tokio = { version = "1.23", features = ["rt-multi-thread"] } diff --git a/libdd-shared-runtime/src/shared_runtime/mod.rs b/libdd-shared-runtime/src/shared_runtime/mod.rs index 1f63044f98..e25ec2b6da 100644 --- a/libdd-shared-runtime/src/shared_runtime/mod.rs +++ b/libdd-shared-runtime/src/shared_runtime/mod.rs @@ -153,17 +153,32 @@ pub struct SharedRuntime { next_worker_id: AtomicU64, } +/// Build a tokio runtime appropriate for the current platform. +/// +/// On wasm32, a single-threaded current-thread runtime is used since multi-threading +/// is not available. On all other platforms a multi-threaded runtime is used. +fn build_runtime() -> Result { + #[cfg(not(target_arch = "wasm32"))] + { + Builder::new_multi_thread() + .worker_threads(1) + .enable_all() + .build() + } + #[cfg(target_arch = "wasm32")] + { + Builder::new_current_thread().enable_all().build() + } +} + impl SharedRuntime { - /// Create a new SharedRuntime with a default multi-threaded tokio runtime. + /// Create a new SharedRuntime with a default tokio runtime. 
/// /// # Errors /// Returns an error if the tokio runtime cannot be created. pub fn new() -> Result { debug!("Creating new SharedRuntime"); - let runtime = tokio::runtime::Builder::new_multi_thread() - .worker_threads(1) - .enable_all() - .build()?; + let runtime = build_runtime()?; Ok(Self { runtime: Arc::new(Mutex::new(Some(Arc::new(runtime)))), @@ -222,6 +237,7 @@ impl SharedRuntime { /// /// Worker errors are logged but do not cause the function to fail. /// If the worker fails to pause it is dropped without calling shutdown. + #[cfg(not(target_arch = "wasm32"))] pub fn before_fork(&self) { debug!("before_fork: pausing all workers"); if let Some(runtime) = self.runtime.lock_or_panic().take() { @@ -244,12 +260,7 @@ impl SharedRuntime { fn restart_runtime(&self) -> Result<(), SharedRuntimeError> { let mut runtime_lock = self.runtime.lock_or_panic(); if runtime_lock.is_none() { - *runtime_lock = Some(Arc::new( - Builder::new_multi_thread() - .worker_threads(1) - .enable_all() - .build()?, - )); + *runtime_lock = Some(Arc::new(build_runtime()?)); } Ok(()) } @@ -261,6 +272,7 @@ impl SharedRuntime { /// /// # Errors /// Returns an error if workers cannot be restarted or the runtime cannot be recreated. + #[cfg(not(target_arch = "wasm32"))] pub fn after_fork_parent(&self) -> Result<(), SharedRuntimeError> { debug!("after_fork_parent: restarting runtime and workers"); self.restart_runtime()?; @@ -290,6 +302,7 @@ impl SharedRuntime { /// /// # Errors /// Returns an error if the runtime cannot be reinitialized or workers cannot be started. 
+ #[cfg(not(target_arch = "wasm32"))] pub fn after_fork_child(&self) -> Result<(), SharedRuntimeError> { debug!("after_fork_child: reinitializing runtime and workers"); self.restart_runtime()?; @@ -337,7 +350,7 @@ impl SharedRuntime { debug!(?timeout, "Shutting down SharedRuntime"); match self.runtime.lock_or_panic().take() { Some(runtime) => { - let result = if let Some(timeout) = timeout { + if let Some(timeout) = timeout { match runtime.block_on(async { tokio::time::timeout(timeout, self.shutdown_async()).await }) { @@ -347,8 +360,7 @@ impl SharedRuntime { } else { runtime.block_on(self.shutdown_async()); Ok(()) - }; - result + } } None => Ok(()), // The runtime is not running so there's nothing to shutdown } diff --git a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs index ad4e9c54ce..e3dcec8701 100644 --- a/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs +++ b/libdd-shared-runtime/src/shared_runtime/pausable_worker.rs @@ -58,6 +58,9 @@ impl PausableWorker { /// /// The worker's main loop will be run on the runtime. pub fn start(&mut self, rt: &Runtime) -> Result<(), PausableWorkerError> { + #[cfg(target_arch = "wasm32")] + return Ok(()); + #[cfg(not(target_arch = "wasm32"))] match self { PausableWorker::Running { .. } => Ok(()), PausableWorker::Paused { worker } => { diff --git a/libdd-shared-runtime/src/worker.rs b/libdd-shared-runtime/src/worker.rs index 1df33882a9..9d76ab8374 100644 --- a/libdd-shared-runtime/src/worker.rs +++ b/libdd-shared-runtime/src/worker.rs @@ -16,7 +16,8 @@ use libdd_capabilities::MaybeSend; /// worker at this point will be saved and used to restart the worker. To be able to safely restart, /// the worker must be in a valid state on every call to `.await` within the trigger function. /// See [`tokio::select#cancellation-safety`] for more details. 
-#[async_trait] +#[cfg_attr(not(target_arch = "wasm32"), async_trait)] +#[cfg_attr(target_arch = "wasm32", async_trait(?Send))] pub trait Worker: std::fmt::Debug + MaybeSend { /// Main worker function /// @@ -46,7 +47,8 @@ pub trait Worker: std::fmt::Debug + MaybeSend { } // Blanket implementation for boxed trait objects -#[async_trait] +#[cfg_attr(not(target_arch = "wasm32"), async_trait)] +#[cfg_attr(target_arch = "wasm32", async_trait(?Send))] impl Worker for Box { async fn run(&mut self) { (**self).run().await From 8ee44b673d4329dee90505b271fd47946ca8e73c Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 3 Apr 2026 10:41:34 +0200 Subject: [PATCH 69/80] chore(data-pipeline): revert version bump --- Cargo.lock | 2 +- libdd-data-pipeline/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4903bc36d9..cb3ead76b6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3003,7 +3003,7 @@ dependencies = [ [[package]] name = "libdd-data-pipeline" -version = "4.0.0" +version = "3.0.1" dependencies = [ "anyhow", "arc-swap", diff --git a/libdd-data-pipeline/Cargo.toml b/libdd-data-pipeline/Cargo.toml index 9284ef2ab7..8d5db11a82 100644 --- a/libdd-data-pipeline/Cargo.toml +++ b/libdd-data-pipeline/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libdd-data-pipeline" -version= "4.0.0" +version= "3.0.1" description = "Trace exporter package allowing sending data from datadog SDKs to the Trace Agent." 
homepage = "https://github.com/DataDog/libdatadog/tree/main/libdd-data-pipeline" repository = "https://github.com/DataDog/libdatadog/tree/main/libdd-data-pipeline" From 0aa1cf9e3ca5878ed2f96623015838da422aa35e Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 3 Apr 2026 15:21:39 +0200 Subject: [PATCH 70/80] docs(telemetry): fix reset docs --- libdd-telemetry/src/worker/mod.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index 5af10abb59..226ed811d3 100644 --- a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -195,11 +195,8 @@ impl Worker for TelemetryWorker { /// Reset the worker state in the child process after a fork. /// - /// Discards inherited pending telemetry state without sending anything, and drains + /// Discards inherited pending telemetry state and dedupe history without sending anything, and drains /// the mailbox so that actions queued before the fork are not processed by the child. - /// Dedupe history is preserved across forks so the child does not re-emit already - /// seen dependencies, integrations, or configurations unless they are observed again - /// as new data. fn reset(&mut self) { // Drain all actions queued in the mailbox before the fork. while self.mailbox.try_recv().is_ok() {} @@ -1319,7 +1316,7 @@ mod tests { ) } - /// After reset(), pending buffered telemetry is cleared while dedupe history is preserved. + /// After reset(), pending buffered telemetry and dedupe history is cleared. 
#[tokio::test] async fn test_reset_clears_buffered_data() { let (handle, mut worker) = build_test_worker(); From a14fb0439903766df0270bd4a6de0a30a3b61e02 Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 3 Apr 2026 15:23:06 +0200 Subject: [PATCH 71/80] docs(trace-exporter): add warning for send_async --- libdd-data-pipeline/src/trace_exporter/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libdd-data-pipeline/src/trace_exporter/mod.rs b/libdd-data-pipeline/src/trace_exporter/mod.rs index 1a49f9a515..180193f507 100644 --- a/libdd-data-pipeline/src/trace_exporter/mod.rs +++ b/libdd-data-pipeline/src/trace_exporter/mod.rs @@ -317,7 +317,8 @@ impl TraceExporter { Ok(res) } - /// Async version of [`Self::send`] for platforms that cannot use `block_on` (e.g. wasm). + /// **WARNING**: This method is experimental and should not be used for production. + /// Async version of [`Self::send`] for platforms that cannot use `block_on` (e.g. wasm) pub async fn send_async(&self, data: &[u8]) -> Result { self.check_agent_info(); From 1e124a176fe6082166c5355f5a46c2363b0c6afe Mon Sep 17 00:00:00 2001 From: vianney Date: Fri, 3 Apr 2026 15:34:59 +0200 Subject: [PATCH 72/80] fix(info-fetcher): skip drain if the channel is not empty --- libdd-data-pipeline/src/agent_info/fetcher.rs | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index 3ba1e5be35..292002c99a 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -256,16 +256,13 @@ impl Worker for AgentInfoFetche } async fn on_pause(&mut self) { - // Release the IoStack waker stored in trigger_rx by waking the channel, - // then drain the message to avoid a spurious fetch on restart. 
- let _ = self.trigger_tx.try_send(()); - self.drain(); - } - - fn reset(&mut self) { - // Drain all messages from the channel to remove messages sent to release the reference on - // IoStack - self.drain(); + // Release the IoStack waker stored in trigger_rx by waking the channel and drain the + // message to avoid a spurious fetch on restart. If the channel is not empty then it has + // already been waked. + if self.trigger_rx.as_ref().is_some_and(|rx| rx.is_empty()) { + let _ = self.trigger_tx.try_send(()); + self.drain(); + }; } async fn run(&mut self) { From 3ddce4a6367b50cc974d8efd4891b77195ff817e Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 7 Apr 2026 11:07:24 +0200 Subject: [PATCH 73/80] fix(shared-runtime-ffi): use Option to handle null poniters --- .../src/shared_runtime.rs | 102 +++++++++--------- 1 file changed, 54 insertions(+), 48 deletions(-) diff --git a/libdd-shared-runtime-ffi/src/shared_runtime.rs b/libdd-shared-runtime-ffi/src/shared_runtime.rs index 5442423221..45b9cef452 100644 --- a/libdd-shared-runtime-ffi/src/shared_runtime.rs +++ b/libdd-shared-runtime-ffi/src/shared_runtime.rs @@ -93,7 +93,6 @@ pub unsafe extern "C" fn ddog_shared_runtime_error_free(error: Option, @@ -136,19 +135,21 @@ pub unsafe extern "C" fn ddog_shared_runtime_free(handle: *const SharedRuntime) /// The handle must have been initialized with `ddog_shared_runtime_new`. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_before_fork( - handle: *const SharedRuntime, + handle: Option<&SharedRuntime>, ) -> Option> { catch_panic!( { - if handle.is_null() { - return Some(Box::new(SharedRuntimeFFIError::new( + match handle { + Some(runtime) => { + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. + runtime.before_fork(); + None + } + None => Some(Box::new(SharedRuntimeFFIError::new( SharedRuntimeErrorCode::InvalidArgument, "handle is null", - ))); + ))), } - // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. 
- (*handle).before_fork(); - None }, panic_error!() ) @@ -162,20 +163,22 @@ pub unsafe extern "C" fn ddog_shared_runtime_before_fork( /// The handle must have been initialized with `ddog_shared_runtime_new`. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_after_fork_parent( - handle: *const SharedRuntime, + handle: Option<&SharedRuntime>, ) -> Option> { catch_panic!( { - if handle.is_null() { - return Some(Box::new(SharedRuntimeFFIError::new( + match handle { + Some(runtime) => { + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. + match runtime.after_fork_parent() { + Ok(()) => None, + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + } + } + None => Some(Box::new(SharedRuntimeFFIError::new( SharedRuntimeErrorCode::InvalidArgument, "handle is null", - ))); - } - // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. - match (*handle).after_fork_parent() { - Ok(()) => None, - Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + ))), } }, panic_error!() @@ -192,20 +195,22 @@ pub unsafe extern "C" fn ddog_shared_runtime_after_fork_parent( /// The handle must have been initialized with `ddog_shared_runtime_new`. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_after_fork_child( - handle: *const SharedRuntime, + handle: Option<&SharedRuntime>, ) -> Option> { catch_panic!( { - if handle.is_null() { - return Some(Box::new(SharedRuntimeFFIError::new( + match handle { + Some(runtime) => { + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. + match runtime.after_fork_child() { + Ok(()) => None, + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + } + } + None => Some(Box::new(SharedRuntimeFFIError::new( SharedRuntimeErrorCode::InvalidArgument, "handle is null", - ))); - } - // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. 
- match (*handle).after_fork_child() { - Ok(()) => None, - Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + ))), } }, panic_error!() @@ -222,28 +227,29 @@ pub unsafe extern "C" fn ddog_shared_runtime_after_fork_child( /// The handle must have been initialized with `ddog_shared_runtime_new`. #[no_mangle] pub unsafe extern "C" fn ddog_shared_runtime_shutdown( - handle: *const SharedRuntime, + handle: Option<&SharedRuntime>, timeout_ms: u64, ) -> Option> { catch_panic!( { - if handle.is_null() { - return Some(Box::new(SharedRuntimeFFIError::new( + match handle { + Some(runtime) => { + let timeout = if timeout_ms > 0 { + Some(std::time::Duration::from_millis(timeout_ms)) + } else { + None + }; + + // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. + match runtime.shutdown(timeout) { + Ok(()) => None, + Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + } + } + None => Some(Box::new(SharedRuntimeFFIError::new( SharedRuntimeErrorCode::InvalidArgument, "handle is null", - ))); - } - - let timeout = if timeout_ms > 0 { - Some(std::time::Duration::from_millis(timeout_ms)) - } else { - None - }; - - // SAFETY: handle was produced by Arc::into_raw and the Arc is still alive. 
- match (*handle).shutdown(timeout) { - Ok(()) => None, - Err(err) => Some(Box::new(SharedRuntimeFFIError::from(err))), + ))), } }, panic_error!() @@ -268,13 +274,13 @@ mod tests { #[test] fn test_before_after_fork_null() { unsafe { - let err = ddog_shared_runtime_before_fork(std::ptr::null()); + let err = ddog_shared_runtime_before_fork(None); assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); - let err = ddog_shared_runtime_after_fork_parent(std::ptr::null()); + let err = ddog_shared_runtime_after_fork_parent(None); assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); - let err = ddog_shared_runtime_after_fork_child(std::ptr::null()); + let err = ddog_shared_runtime_after_fork_child(None); assert_eq!(err.unwrap().code, SharedRuntimeErrorCode::InvalidArgument); } } @@ -286,10 +292,10 @@ mod tests { ddog_shared_runtime_new(NonNull::new_unchecked(handle.as_mut_ptr())); let handle = handle.assume_init(); - let err = ddog_shared_runtime_before_fork(handle); + let err = ddog_shared_runtime_before_fork(std::mem::transmute(handle)); assert!(err.is_none(), "{:?}", err.map(|e| e.code)); - let err = ddog_shared_runtime_after_fork_parent(handle); + let err = ddog_shared_runtime_after_fork_parent(std::mem::transmute(handle)); assert!(err.is_none(), "{:?}", err.map(|e| e.code)); ddog_shared_runtime_free(handle); @@ -303,7 +309,7 @@ mod tests { ddog_shared_runtime_new(NonNull::new_unchecked(handle.as_mut_ptr())); let handle = handle.assume_init(); - let err = ddog_shared_runtime_shutdown(handle, 0); + let err = ddog_shared_runtime_shutdown(std::mem::transmute(handle), 0); assert!(err.is_none()); ddog_shared_runtime_free(handle); From fd1903f5ee9e53c5ed2b2d5761ee20be9b9c87f5 Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 7 Apr 2026 14:10:16 +0200 Subject: [PATCH 74/80] test(shared-runtime): improve tests --- .../src/shared_runtime/mod.rs | 137 +++++++++++++++--- libdd-telemetry/src/worker/mod.rs | 5 +- 2 files changed, 116 
insertions(+), 26 deletions(-) diff --git a/libdd-shared-runtime/src/shared_runtime/mod.rs b/libdd-shared-runtime/src/shared_runtime/mod.rs index e25ec2b6da..21213acd12 100644 --- a/libdd-shared-runtime/src/shared_runtime/mod.rs +++ b/libdd-shared-runtime/src/shared_runtime/mod.rs @@ -398,14 +398,19 @@ impl SharedRuntime { mod tests { use super::*; use async_trait::async_trait; - use std::sync::mpsc::{channel, Sender}; + use std::sync::mpsc::{channel, Receiver, Sender}; use std::time::Duration; use tokio::time::sleep; #[derive(Debug)] struct TestWorker { - state: u32, - sender: Sender, + state: i32, + sender: Sender, + } + + fn make_test_worker() -> (TestWorker, Receiver) { + let (sender, receiver) = channel::(); + (TestWorker { state: 0, sender }, receiver) } #[async_trait] @@ -418,6 +423,15 @@ mod tests { async fn trigger(&mut self) { sleep(Duration::from_millis(100)).await; } + + fn reset(&mut self) { + self.state = 0; + } + + async fn shutdown(&mut self) { + self.state = -1; + let _ = self.sender.send(self.state); + } } #[test] @@ -429,59 +443,134 @@ mod tests { #[test] fn test_spawn_worker() { let shared_runtime = SharedRuntime::new().unwrap(); - let (sender, _receiver) = channel::(); - let worker = TestWorker { state: 0, sender }; + let (worker, receiver) = make_test_worker(); let result = shared_runtime.spawn_worker(worker); assert!(result.is_ok()); assert_eq!(shared_runtime.workers.lock_or_panic().len(), 1); + + // Verify the worker is actually running by receiving its first output + assert_eq!( + receiver + .recv_timeout(Duration::from_secs(1)) + .expect("worker did not run"), + 0 + ); } #[test] - fn test_worker_handle_stop_removes_worker() { + fn test_worker_handle_stop() { let rt = tokio::runtime::Runtime::new().unwrap(); let shared_runtime = SharedRuntime::new().unwrap(); - let (sender, _receiver) = channel::(); - let worker = TestWorker { state: 0, sender }; + let (worker, receiver) = make_test_worker(); let handle = 
shared_runtime.spawn_worker(worker).unwrap(); assert_eq!(shared_runtime.workers.lock_or_panic().len(), 1); + // Wait for at least one run before stopping + receiver + .recv_timeout(Duration::from_secs(1)) + .expect("worker did not run"); + rt.block_on(async { assert!(handle.stop().await.is_ok()); }); assert_eq!(shared_runtime.workers.lock_or_panic().len(), 0); + + // Drain all messages after stop — the last one must be the shutdown sentinel + let mut last = receiver + .recv_timeout(Duration::from_secs(1)) + .expect("shutdown did not send a value"); + while let Ok(v) = receiver.try_recv() { + last = v; + } + assert_eq!(last, -1); } #[test] fn test_before_and_after_fork_parent() { - // Run in a separate thread to ensure we're not in any async context - let handle = std::thread::spawn(|| { - let rt = tokio::runtime::Runtime::new().unwrap(); - let shared_runtime = SharedRuntime::new().unwrap(); + let shared_runtime = SharedRuntime::new().unwrap(); + let (worker, receiver) = make_test_worker(); - // Test before_fork - shared_runtime.before_fork(); + shared_runtime.spawn_worker(worker).unwrap(); - // Test after_fork_parent (synchronous) - assert!(shared_runtime.after_fork_parent().is_ok()); + // Let the worker run until state > 0 so that preservation is observable + let mut state_before_fork = 0; + while state_before_fork == 0 { + state_before_fork = receiver + .recv_timeout(Duration::from_secs(1)) + .expect("worker did not advance state before fork"); + } - // Clean shutdown - rt.block_on(async { - shared_runtime.shutdown_async().await; - }); - }); + shared_runtime.before_fork(); + // Drain pre-fork buffered messages now that the worker is paused + while receiver.try_recv().is_ok() {} - handle.join().expect("Thread panicked"); + assert!(shared_runtime.after_fork_parent().is_ok()); + + // State must be preserved (not reset) after fork in the parent + let after_fork_value = receiver + .recv_timeout(Duration::from_secs(1)) + .expect("worker did not resume after fork"); + 
assert!( + after_fork_value > state_before_fork, + "after_fork_parent should preserve state: got {after_fork_value}, expected > {state_before_fork}" + ); } #[test] fn test_after_fork_child() { - // Test after_fork_child in a non-async context let shared_runtime = SharedRuntime::new().unwrap(); + let (worker, receiver) = make_test_worker(); + + shared_runtime.spawn_worker(worker).unwrap(); + + // Let the worker run until state > 0 so that the reset is observable + let mut state_before_fork = 0; + while state_before_fork == 0 { + state_before_fork = receiver + .recv_timeout(Duration::from_secs(1)) + .expect("worker did not advance state before fork"); + } + + shared_runtime.before_fork(); + // Drain pre-fork buffered messages now that the worker is paused + while receiver.try_recv().is_ok() {} - // This should succeed as we're not in an async context assert!(shared_runtime.after_fork_child().is_ok()); + + // State must be reset to 0 in the child + let after_fork_value = receiver + .recv_timeout(Duration::from_secs(1)) + .expect("worker did not resume after fork child"); + assert_eq!( + after_fork_value, 0, + "after_fork_child should reset state to 0, got {after_fork_value}" + ); + } + + #[test] + fn test_shutdown() { + let shared_runtime = SharedRuntime::new().unwrap(); + let (worker, receiver) = make_test_worker(); + + shared_runtime.spawn_worker(worker).unwrap(); + + // Wait for at least one run before shutting down + receiver + .recv_timeout(Duration::from_secs(1)) + .expect("worker did not run"); + + shared_runtime.shutdown(None).unwrap(); + + // Drain all messages after shutdown — the last one must be the shutdown sentinel + let mut last = receiver + .recv_timeout(Duration::from_secs(1)) + .expect("shutdown did not send a value"); + while let Ok(v) = receiver.try_recv() { + last = v; + } + assert_eq!(last, -1); } } diff --git a/libdd-telemetry/src/worker/mod.rs b/libdd-telemetry/src/worker/mod.rs index 226ed811d3..a7a19d09fe 100644 --- 
a/libdd-telemetry/src/worker/mod.rs +++ b/libdd-telemetry/src/worker/mod.rs @@ -195,8 +195,9 @@ impl Worker for TelemetryWorker { /// Reset the worker state in the child process after a fork. /// - /// Discards inherited pending telemetry state and dedupe history without sending anything, and drains - /// the mailbox so that actions queued before the fork are not processed by the child. + /// Discards inherited pending telemetry state and dedupe history without sending anything, and + /// drains the mailbox so that actions queued before the fork are not processed by the + /// child. fn reset(&mut self) { // Drain all actions queued in the mailbox before the fork. while self.mailbox.try_recv().is_ok() {} From 664a8bee10450897e78683813df99b90e8731a2b Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 7 Apr 2026 14:24:54 +0200 Subject: [PATCH 75/80] test(shared-runtime-ffi): add transmute annotations --- libdd-shared-runtime-ffi/src/shared_runtime.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/libdd-shared-runtime-ffi/src/shared_runtime.rs b/libdd-shared-runtime-ffi/src/shared_runtime.rs index 45b9cef452..7a9a0ac084 100644 --- a/libdd-shared-runtime-ffi/src/shared_runtime.rs +++ b/libdd-shared-runtime-ffi/src/shared_runtime.rs @@ -292,10 +292,16 @@ mod tests { ddog_shared_runtime_new(NonNull::new_unchecked(handle.as_mut_ptr())); let handle = handle.assume_init(); - let err = ddog_shared_runtime_before_fork(std::mem::transmute(handle)); + let err = ddog_shared_runtime_before_fork(std::mem::transmute::< + *const SharedRuntime, + Option<&SharedRuntime>, + >(handle)); assert!(err.is_none(), "{:?}", err.map(|e| e.code)); - let err = ddog_shared_runtime_after_fork_parent(std::mem::transmute(handle)); + let err = ddog_shared_runtime_after_fork_parent(std::mem::transmute::< + *const SharedRuntime, + Option<&SharedRuntime>, + >(handle)); assert!(err.is_none(), "{:?}", err.map(|e| e.code)); ddog_shared_runtime_free(handle); @@ -309,7 
+315,10 @@ mod tests { ddog_shared_runtime_new(NonNull::new_unchecked(handle.as_mut_ptr())); let handle = handle.assume_init(); - let err = ddog_shared_runtime_shutdown(std::mem::transmute(handle), 0); + let err = ddog_shared_runtime_shutdown( + std::mem::transmute::<*const SharedRuntime, Option<&SharedRuntime>>(handle), + 0, + ); assert!(err.is_none()); ddog_shared_runtime_free(handle); From ec1ccf4a9d631e5766012b4ff128868dad1478f5 Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 7 Apr 2026 18:12:46 +0200 Subject: [PATCH 76/80] test(info_fetcher): remove needless assert --- libdd-data-pipeline/src/agent_info/fetcher.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/libdd-data-pipeline/src/agent_info/fetcher.rs b/libdd-data-pipeline/src/agent_info/fetcher.rs index 292002c99a..865bd5a87c 100644 --- a/libdd-data-pipeline/src/agent_info/fetcher.rs +++ b/libdd-data-pipeline/src/agent_info/fetcher.rs @@ -593,7 +593,6 @@ mod single_threaded_tests { .clone() .unwrap(); assert_eq!(version_1, "1"); - mock_v1.assert(); // Update the info endpoint mock_v1.delete(); From a171916e57af88f5cd9ff6284df3e32410f68f59 Mon Sep 17 00:00:00 2001 From: vianney Date: Tue, 7 Apr 2026 18:34:57 +0200 Subject: [PATCH 77/80] test(telemetry): increase sleep time --- libdd-data-pipeline/src/telemetry/mod.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index a12a8c93ee..322ceeb30c 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -388,7 +388,7 @@ mod tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -423,7 +423,7 @@ mod tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - 
sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -458,7 +458,7 @@ mod tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -493,7 +493,7 @@ mod tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -528,7 +528,7 @@ mod tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -563,7 +563,7 @@ mod tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -598,7 +598,7 @@ mod tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -633,7 +633,7 @@ mod tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -668,7 +668,7 @@ mod tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -703,7 +703,7 @@ mod 
tests { client.start().await; let _ = client.send(&data); // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -879,7 +879,7 @@ mod tests { }) .unwrap(); // Wait for send to be processed - sleep(Duration::from_millis(10)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -914,7 +914,7 @@ mod tests { }) .unwrap(); // Wait for send to be processed - sleep(Duration::from_millis(1)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); // Wait for the server to receive at least one call, but don't hang forever. From ad747d43b751f52e36b4f2b872b8d807940c1c4f Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 8 Apr 2026 13:12:45 +0200 Subject: [PATCH 78/80] test(telemetry): increase sleep --- libdd-data-pipeline/src/telemetry/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index 322ceeb30c..5b98091a60 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -879,7 +879,7 @@ mod tests { }) .unwrap(); // Wait for send to be processed - sleep(Duration::from_millis(100)).await; + sleep(Duration::from_millis(500)).await; handle.stop().await.expect("Failed to stop worker"); assert!( @@ -914,7 +914,7 @@ mod tests { }) .unwrap(); // Wait for send to be processed - sleep(Duration::from_millis(100)).await; + sleep(Duration::from_millis(500)).await; handle.stop().await.expect("Failed to stop worker"); // Wait for the server to receive at least one call, but don't hang forever. 
From d4916fc04a7c9330012f0591b02c482c6d8c7300 Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 8 Apr 2026 15:06:47 +0200 Subject: [PATCH 79/80] test(telemetry): spawn worker outside of the block_on --- libdd-data-pipeline/src/telemetry/mod.rs | 317 ++++++++++------------- 1 file changed, 135 insertions(+), 182 deletions(-) diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index 5b98091a60..af85285f62 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -305,17 +305,16 @@ impl TelemetryClient { #[cfg(test)] mod tests { + use super::*; use bytes::Bytes; use httpmock::Method::POST; use httpmock::MockServer; use libdd_capabilities::HttpError; + use libdd_shared_runtime::{SharedRuntime, WorkerHandle}; use libdd_trace_utils::test_utils::poll_for_mock_hit; use regex::Regex; use tokio::time::sleep; - use super::*; - use libdd_shared_runtime::{SharedRuntime, WorkerHandle}; - fn get_test_client(url: &str, runtime: &SharedRuntime) -> (TelemetryClient, WorkerHandle) { let (client, worker) = TelemetryClientBuilder::default() .set_service_name("test_service") @@ -369,22 +368,18 @@ mod tests { fn api_bytes_test() { let payload = Regex::new(r#""metric":"trace_api.bytes","tags":\["src_library:libdatadog"\],"sketch_b64":".+","common":true,"interval":\d+,"type":"distribution""#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }); + let data = SendPayloadTelemetry { + bytes_sent: 1, + ..Default::default() + }; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - 
then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - bytes_sent: 1, - ..Default::default() - }; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); // Wait for send to be processed @@ -404,22 +399,18 @@ mod tests { fn requests_test() { let payload = Regex::new(r#""metric":"trace_api.requests","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog"\],"common":true,"type":"count""#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }); + let data = SendPayloadTelemetry { + requests_count: 1, + ..Default::default() + }; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - requests_count: 1, - ..Default::default() - }; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); // Wait for send to be processed @@ -439,22 +430,18 @@ mod tests { fn responses_per_code_test() { let payload = Regex::new(r#""metric":"trace_api.responses","points":\[\[\d+,1\.0\]\],"tags":\["status_code:200","src_library:libdatadog"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }); + let data = SendPayloadTelemetry { + responses_count_per_code: HashMap::from([(200, 1)]), + ..Default::default() + }; 
+ let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - responses_count_per_code: HashMap::from([(200, 1)]), - ..Default::default() - }; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); // Wait for send to be processed @@ -474,22 +461,18 @@ mod tests { fn errors_timeout_test() { let payload = Regex::new(r#""metric":"trace_api.errors","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","type:timeout"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }); + let data = SendPayloadTelemetry { + errors_timeout: 1, + ..Default::default() + }; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - errors_timeout: 1, - ..Default::default() - }; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); // Wait for send to be processed @@ -509,22 +492,18 @@ mod tests { fn errors_network_test() { let payload = Regex::new(r#""metric":"trace_api.errors","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","type:network"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = 
SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }); + let data = SendPayloadTelemetry { + errors_network: 1, + ..Default::default() + }; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - errors_network: 1, - ..Default::default() - }; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); // Wait for send to be processed @@ -544,22 +523,18 @@ mod tests { fn errors_status_code_test() { let payload = Regex::new(r#""metric":"trace_api.errors","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","type:status_code"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }); + let data = SendPayloadTelemetry { + errors_status_code: 1, + ..Default::default() + }; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - errors_status_code: 1, - ..Default::default() - }; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); // Wait for send to be 
processed @@ -579,22 +554,18 @@ mod tests { fn chunks_sent_test() { let payload = Regex::new(r#""metric":"trace_chunks_sent","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }); + let data = SendPayloadTelemetry { + chunks_sent: 1, + ..Default::default() + }; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_sent: 1, - ..Default::default() - }; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); // Wait for send to be processed @@ -614,22 +585,18 @@ mod tests { fn chunks_dropped_send_failure_test() { let payload = Regex::new(r#""metric":"trace_chunks_dropped","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","reason:send_failure"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }); + let data = SendPayloadTelemetry { + chunks_dropped_send_failure: 1, + ..Default::default() + }; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - 
then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_dropped_send_failure: 1, - ..Default::default() - }; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); // Wait for send to be processed @@ -649,22 +616,18 @@ mod tests { fn chunks_dropped_p0_test() { let payload = Regex::new(r#""metric":"trace_chunks_dropped","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","reason:p0_drop"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }); + let data = SendPayloadTelemetry { + chunks_dropped_p0: 1, + ..Default::default() + }; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_dropped_p0: 1, - ..Default::default() - }; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); // Wait for send to be processed @@ -684,22 +647,18 @@ mod tests { fn chunks_dropped_serialization_error_test() { let payload = Regex::new(r#""metric":"trace_chunks_dropped","points":\[\[\d+,1\.0\]\],"tags":\["src_library:libdatadog","reason:serialization_error"\],"common":true,"type":"count"#).unwrap(); let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_matches(payload); + then.status(200).body(""); + }); + let data = SendPayloadTelemetry { + 
chunks_dropped_serialization_error: 1, + ..Default::default() + }; + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_matches(payload); - then.status(200).body(""); - }) - .await; - - let data = SendPayloadTelemetry { - chunks_dropped_serialization_error: 1, - ..Default::default() - }; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; let _ = client.send(&data); // Wait for send to be processed @@ -860,17 +819,14 @@ mod tests { #[test] fn runtime_id_test() { let shared_runtime = SharedRuntime::new().expect("Failed to create runtime"); + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_includes(r#""runtime_id":"foo""#); + then.status(200).body(""); + }); + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); shared_runtime .block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST).body_includes(r#""runtime_id":"foo""#); - then.status(200).body(""); - }) - .await; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); client.start().await; client .send(&SendPayloadTelemetry { @@ -879,14 +835,13 @@ mod tests { }) .unwrap(); // Wait for send to be processed - sleep(Duration::from_millis(500)).await; + sleep(Duration::from_millis(100)).await; handle.stop().await.expect("Failed to stop worker"); assert!( poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, "telemetry server did not receive calls within timeout" ); - // One payload generate-metrics }) .expect("Failed to get runtime"); } @@ -895,34 +850,32 @@ mod tests { #[test] fn application_metadata_test() { let shared_runtime = SharedRuntime::new().expect("Failed to create 
runtime"); - shared_runtime.block_on(async { - let server = MockServer::start_async().await; - let mut telemetry_srv = server - .mock_async(|when, then| { - when.method(POST) - .body_includes(r#""application":{"service_name":"test_service","service_version":"test_version","env":"test_env","language_name":"test_language","language_version":"test_language_version","tracer_version":"test_tracer_version"}"#); - then.status(200).body(""); - }) - .await; - - let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); - client.start().await; - client - .send(&SendPayloadTelemetry { - requests_count: 1, - ..Default::default() - }) - .unwrap(); - // Wait for send to be processed - sleep(Duration::from_millis(500)).await; - - handle.stop().await.expect("Failed to stop worker"); - // Wait for the server to receive at least one call, but don't hang forever. - assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, - "telemetry server did not receive calls within timeout" + let server = MockServer::start(); + let mut telemetry_srv = server.mock(|when, then| { + when.method(POST).body_includes( + r#""application":{"service_name":"test_service","service_version":"test_version","env":"test_env","language_name":"test_language","language_version":"test_language_version","tracer_version":"test_tracer_version"}"#, ); - // One payload generate-metrics - }).expect("Failed to get runtime"); + then.status(200).body(""); + }); + let (client, handle) = get_test_client(&server.url("/"), &shared_runtime); + shared_runtime + .block_on(async { + client.start().await; + client + .send(&SendPayloadTelemetry { + requests_count: 1, + ..Default::default() + }) + .unwrap(); + // Wait for send to be processed + sleep(Duration::from_millis(100)).await; + + handle.stop().await.expect("Failed to stop worker"); + assert!( + poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + "telemetry server did not receive calls within timeout" + ); + }) + 
.expect("Failed to get runtime"); } } From 22687bb0e3ec8e827e78e5af404b73dabd608e99 Mon Sep 17 00:00:00 2001 From: vianney Date: Wed, 8 Apr 2026 16:00:34 +0200 Subject: [PATCH 80/80] test(telemetry): use poll_for_mock_hits --- libdd-data-pipeline/src/telemetry/mod.rs | 26 ++++++++++++------------ libdd-trace-utils/src/test_utils/mod.rs | 19 +++++++++++++++++ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/libdd-data-pipeline/src/telemetry/mod.rs b/libdd-data-pipeline/src/telemetry/mod.rs index af85285f62..86b7a302a7 100644 --- a/libdd-data-pipeline/src/telemetry/mod.rs +++ b/libdd-data-pipeline/src/telemetry/mod.rs @@ -311,7 +311,7 @@ mod tests { use httpmock::MockServer; use libdd_capabilities::HttpError; use libdd_shared_runtime::{SharedRuntime, WorkerHandle}; - use libdd_trace_utils::test_utils::poll_for_mock_hit; + use libdd_trace_utils::test_utils::poll_for_mock_hits; use regex::Regex; use tokio::time::sleep; @@ -387,7 +387,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) @@ -418,7 +418,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) @@ -449,7 +449,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) @@ -480,7 +480,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 
1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) @@ -511,7 +511,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) @@ -542,7 +542,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) @@ -573,7 +573,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) @@ -604,7 +604,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) @@ -635,7 +635,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) @@ -666,7 +666,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) @@ -839,7 +839,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 
1).await, "telemetry server did not receive calls within timeout" ); }) @@ -872,7 +872,7 @@ mod tests { handle.stop().await.expect("Failed to stop worker"); assert!( - poll_for_mock_hit(&mut telemetry_srv, 1000, 10, 1, false).await, + poll_for_mock_hits(&mut telemetry_srv, 1000, 10, 1).await, "telemetry server did not receive calls within timeout" ); }) diff --git a/libdd-trace-utils/src/test_utils/mod.rs b/libdd-trace-utils/src/test_utils/mod.rs index da4d2b8cff..e71f65fdd1 100644 --- a/libdd-trace-utils/src/test_utils/mod.rs +++ b/libdd-trace-utils/src/test_utils/mod.rs @@ -433,6 +433,25 @@ pub async fn poll_for_mock_hit( mock_hit } +/// Poll for a mock to be hit at least `min_hits` times. +/// +/// Returns `true` as soon as the mock has been called at least `min_hits` times, +/// or `false` if `poll_attempts` is exhausted before that threshold is reached. +pub async fn poll_for_mock_hits( + mock: &mut Mock<'_>, + poll_attempts: i32, + sleep_interval_ms: u64, + min_hits: usize, +) -> bool { + for _ in 0..poll_attempts { + sleep(Duration::from_millis(sleep_interval_ms)).await; + if mock.calls_async().await >= min_hits { + return true; + } + } + false +} + /// Creates a `SendData` object with the specified size and target endpoint. /// /// This function is a test helper to create a `SendData` object.