Skip to content

Commit c56d358

Browse files
authored
Keep window manager state consistent on mmap failures. (netdata#21693)
* Fix window-manager state consistency after failed mmap Invalidate `active_window_idx` before/after window removal to ensure the index doesn't point to a non-existent window if `create_window` fails. Also adds error logging when mmap creation fails, and two separate tests to ensure the window manager's consistency in such cases. * Improve error handling at journal file access call-sites - Stop `EntryDataIterator` on data read errors instead of continuing - Add warning log when find_log_entries fails for a file - Properly propagate errors from `get_timestamp_field`, ignoring only `MissingFieldName` variant.
1 parent d877b8f commit c56d358

File tree

4 files changed

+239
-7
lines changed

4 files changed

+239
-7
lines changed

src/crates/journal-core/src/file/file.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1158,7 +1158,11 @@ impl<'a, M: MemoryMap> Iterator for EntryDataIterator<'a, M> {
11581158
// Try to get the data object
11591159
match self.journal.data_ref(data_offset) {
11601160
Ok(data_guard) => Some(Ok(data_guard)),
1161-
Err(e) => Some(Err(e)),
1161+
Err(e) => {
1162+
// If we can't read the data, return the error and stop iteration
1163+
self.current_index = self.total_items;
1164+
Some(Err(e))
1165+
}
11621166
}
11631167
}
11641168
Err(e) => {

src/crates/journal-core/src/file/mmap.rs

Lines changed: 223 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ use crate::error::Result;
22
use journal_common::compat::is_multiple_of;
33
use std::fs::File;
44
use std::ops::{Deref, DerefMut};
5+
use tracing::error;
56

67
// Re-export memmap2 types for other crates and import for internal use
78
pub use memmap2::{Mmap, MmapMut, MmapOptions};
@@ -141,7 +142,16 @@ impl<M: MemoryMap> WindowManager<M> {
141142
debug_assert_ne!(chunk_count, 0);
142143

143144
let size = chunk_count * self.chunk_size;
144-
let mmap = M::create(&self.file, window_start, size)?;
145+
let mmap = M::create(&self.file, window_start, size).map_err(|e| {
146+
error!(
147+
window_start,
148+
size,
149+
chunk_count,
150+
chunk_size = self.chunk_size,
151+
"mmap failed: {e}"
152+
);
153+
e
154+
})?;
145155
Ok(Window {
146156
offset: window_start,
147157
size,
@@ -197,6 +207,9 @@ impl<M: MemoryMap> WindowManager<M> {
197207
// Remap the window
198208

199209
let window = self.windows.remove(idx);
210+
// Invalidate active_window_idx before removal to maintain consistency.
211+
// If create_window fails, the index won't point to a non-existent window.
212+
self.active_window_idx = None;
200213

201214
let window_start = window.offset;
202215
let window_end = self.get_chunk_aligned_end(position + size_needed);
@@ -212,10 +225,11 @@ impl<M: MemoryMap> WindowManager<M> {
212225

213226
if self.windows.len() >= self.max_windows {
214227
self.windows.remove(self.find_window_to_evict());
228+
// Invalidate active_window_idx after removal to maintain consistency.
229+
// If create_window fails below, the index won't point to a non-existent window.
230+
self.active_window_idx = None;
215231
}
216232

217-
// NOTE: the active window index might have been invalidated. In
218-
// the scope that follows, we should not use code that relies on it.
219233
{
220234
// Calculate window start for this position
221235
let window_start = self.get_chunk_aligned_start(position);
@@ -250,3 +264,209 @@ impl<M: MemoryMapMut> WindowManager<M> {
250264
Ok(())
251265
}
252266
}
267+
268+
#[cfg(test)]
269+
mod tests {
270+
use super::*;
271+
use crate::error::JournalError;
272+
use std::cell::Cell;
273+
use std::io::Write;
274+
use std::rc::Rc;
275+
use tempfile::NamedTempFile;
276+
277+
const PAGE_SIZE_TEST: u64 = 4096;
278+
279+
/// A mock MemoryMap that can be configured to fail on specific calls.
280+
/// This allows us to test error handling in WindowManager.
281+
struct FailingMmap {
282+
data: Vec<u8>,
283+
}
284+
285+
impl Deref for FailingMmap {
286+
type Target = [u8];
287+
fn deref(&self) -> &[u8] {
288+
&self.data
289+
}
290+
}
291+
292+
/// Shared state to control when the mock should fail
293+
struct MockController {
294+
fail_next_create: Cell<bool>,
295+
create_count: Cell<usize>,
296+
}
297+
298+
impl MockController {
299+
fn new() -> Self {
300+
Self {
301+
fail_next_create: Cell::new(false),
302+
create_count: Cell::new(0),
303+
}
304+
}
305+
306+
fn set_fail_next(&self, fail: bool) {
307+
self.fail_next_create.set(fail);
308+
}
309+
310+
fn should_fail(&self) -> bool {
311+
let count = self.create_count.get();
312+
self.create_count.set(count + 1);
313+
self.fail_next_create.get()
314+
}
315+
}
316+
317+
// Thread-local controller for the mock
318+
thread_local! {
319+
static MOCK_CONTROLLER: Rc<MockController> = Rc::new(MockController::new());
320+
}
321+
322+
impl MemoryMap for FailingMmap {
323+
fn create(_file: &File, _offset: u64, size: u64) -> Result<Self> {
324+
MOCK_CONTROLLER.with(|ctrl| {
325+
if ctrl.should_fail() {
326+
return Err(JournalError::Io(std::io::Error::new(
327+
std::io::ErrorKind::Other,
328+
"simulated mmap failure",
329+
)));
330+
}
331+
// Create a mock mmap with zeros
332+
Ok(FailingMmap {
333+
data: vec![0u8; size as usize],
334+
})
335+
})
336+
}
337+
}
338+
339+
/// This test verifies that WindowManager maintains consistent state
340+
/// after a failed remap operation.
341+
///
342+
/// The scenario:
343+
/// 1. A window exists at some position
344+
/// 2. A request comes in that requires remapping (position in window, but size extends beyond)
345+
/// 3. The old window is removed
346+
/// 4. Creating the new (larger) window fails (e.g., mmap error)
347+
/// 5. The WindowManager should remain in a consistent state
348+
/// 6. Subsequent operations should not panic
349+
#[test]
350+
fn test_consistent_state_after_failed_remap() {
351+
// Create a temporary file (content doesn't matter for mock)
352+
let mut temp_file = NamedTempFile::new().unwrap();
353+
temp_file.write_all(&[0u8; 8192]).unwrap();
354+
temp_file.flush().unwrap();
355+
356+
let file = File::open(temp_file.path()).unwrap();
357+
358+
// Create WindowManager with mock mmap, 4KB chunks, max 1 window
359+
let mut wm: WindowManager<FailingMmap> =
360+
WindowManager::new(file, PAGE_SIZE_TEST, 1).unwrap();
361+
362+
// Reset controller state
363+
MOCK_CONTROLLER.with(|ctrl| {
364+
ctrl.set_fail_next(false);
365+
ctrl.create_count.set(0);
366+
});
367+
368+
// First read: creates a window at offset 0, size 4KB (this should succeed)
369+
{
370+
let slice = wm.get_slice(0, 100).unwrap();
371+
assert_eq!(slice.len(), 100);
372+
}
373+
assert_eq!(wm.windows.len(), 1);
374+
assert_eq!(wm.active_window_idx, Some(0));
375+
376+
// Configure mock to fail on the next create call
377+
MOCK_CONTROLLER.with(|ctrl| ctrl.set_fail_next(true));
378+
379+
// Request a slice that requires remapping:
380+
// - Position 100 is within the existing window [0, 4096)
381+
// - But size 4000 means we need bytes [100, 4100), which extends beyond window
382+
// - This triggers the "Remap the window" branch
383+
// - The old window is removed
384+
// - Then create_window is called and FAILS
385+
let remap_result = wm.get_slice(100, 4000);
386+
assert!(remap_result.is_err(), "Expected remap to fail");
387+
388+
// Verify state is consistent after the failure:
389+
// - windows is empty (the old window was removed, new one failed to create)
390+
// - active_window_idx should be None (not pointing to non-existent window)
391+
assert_eq!(wm.windows.len(), 0);
392+
assert_eq!(wm.active_window_idx, None);
393+
394+
// Allow the next create to succeed
395+
MOCK_CONTROLLER.with(|ctrl| ctrl.set_fail_next(false));
396+
397+
// The next operation should NOT panic - it should succeed by creating a new window
398+
let result = wm.get_slice(0, 100);
399+
assert!(
400+
result.is_ok(),
401+
"Expected get_slice to succeed after recovery"
402+
);
403+
assert_eq!(wm.windows.len(), 1);
404+
}
405+
406+
/// This test verifies that WindowManager maintains consistent state
407+
/// after a failed window creation in the eviction path.
408+
///
409+
/// The scenario:
410+
/// 1. A window exists and we're at max_windows
411+
/// 2. A request comes in for a different region requiring a new window
412+
/// 3. The old window is evicted to make room
413+
/// 4. Creating the new window fails (e.g., mmap error)
414+
/// 5. The WindowManager should remain in a consistent state
415+
/// 6. Subsequent operations should not panic
416+
#[test]
417+
fn test_consistent_state_after_failed_eviction() {
418+
// Create a temporary file
419+
let mut temp_file = NamedTempFile::new().unwrap();
420+
temp_file.write_all(&[0u8; 8192]).unwrap();
421+
temp_file.flush().unwrap();
422+
423+
let file = File::open(temp_file.path()).unwrap();
424+
425+
// Create WindowManager with mock mmap, 4KB chunks, max 1 window
426+
let mut wm: WindowManager<FailingMmap> =
427+
WindowManager::new(file, PAGE_SIZE_TEST, 1).unwrap();
428+
429+
// Reset controller state
430+
MOCK_CONTROLLER.with(|ctrl| {
431+
ctrl.set_fail_next(false);
432+
ctrl.create_count.set(0);
433+
});
434+
435+
// Create first window at offset 0
436+
{
437+
let _slice = wm.get_slice(0, 100).unwrap();
438+
}
439+
assert_eq!(wm.windows.len(), 1);
440+
assert_eq!(wm.active_window_idx, Some(0));
441+
442+
// Configure mock to fail on the next create call
443+
MOCK_CONTROLLER.with(|ctrl| ctrl.set_fail_next(true));
444+
445+
// Request a slice at a completely different position (second page)
446+
// This triggers:
447+
// - lookup_window_by_range returns None (position 4096 not in window [0, 4096))
448+
// - lookup_window_by_position returns None
449+
// - "Create a brand new window" branch
450+
// - Eviction: windows.remove(0) since we're at max_windows
451+
// - create_window fails
452+
let result = wm.get_slice(4096, 100);
453+
assert!(result.is_err(), "Expected mmap to fail");
454+
455+
// Verify state is consistent after the failure:
456+
// - windows is empty (the old window was evicted, new one failed to create)
457+
// - active_window_idx should be None (not pointing to non-existent window)
458+
assert_eq!(wm.windows.len(), 0);
459+
assert_eq!(wm.active_window_idx, None);
460+
461+
// Allow the next create to succeed
462+
MOCK_CONTROLLER.with(|ctrl| ctrl.set_fail_next(false));
463+
464+
// The next operation should NOT panic - it should succeed by creating a new window
465+
let result = wm.get_slice(0, 100);
466+
assert!(
467+
result.is_ok(),
468+
"Expected get_slice to succeed after recovery"
469+
);
470+
assert_eq!(wm.windows.len(), 1);
471+
}
472+
}

src/crates/journal-engine/src/logs/query.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ use journal_index::{
1313
use journal_registry::File;
1414
use std::collections::HashMap;
1515
use std::num::NonZeroU64;
16+
use tracing::warn;
1617

1718
/// Pagination state for multi-file log queries.
1819
///
@@ -314,7 +315,10 @@ fn retrieve_log_entries(
314315

315316
let new_entries = match file_index.find_log_entries(file, &file_params) {
316317
Ok(entries) => entries,
317-
Err(_) => continue, // Skip files that fail to read
318+
Err(e) => {
319+
warn!(file = file.path(), "failed to retrieve log entries: {e}");
320+
continue;
321+
}
318322
};
319323

320324
if new_entries.is_empty() {

src/crates/journal-index/src/file_index.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -410,8 +410,12 @@ fn get_entry_timestamp(
410410
) -> Result<u64> {
411411
// Try to read the source timestamp field if specified
412412
if let Some(field_name) = source_timestamp_field {
413-
if let Ok(timestamp) = get_timestamp_field(journal_file, field_name, entry_offset) {
414-
return Ok(timestamp);
413+
match get_timestamp_field(journal_file, field_name, entry_offset) {
414+
Ok(timestamp) => return Ok(timestamp),
415+
Err(IndexError::MissingFieldName) => {
416+
// Field not found, fall back to realtime timestamp
417+
}
418+
Err(e) => return Err(e),
415419
}
416420
}
417421

0 commit comments

Comments
 (0)