1212import org .apache .parquet .column .impl .ColumnReadStoreImpl ;
1313import org .apache .parquet .column .page .PageReader ;
1414import org .apache .parquet .compression .CompressionCodecFactory ;
15+ import org .apache .parquet .filter2 .predicate .FilterPredicate ;
1516import org .apache .parquet .hadoop .ParquetFileReader ;
1617import org .apache .parquet .hadoop .metadata .BlockMetaData ;
1718import org .apache .parquet .hadoop .metadata .ColumnChunkMetaData ;
6263 * {@link BlockFactory#breaker()}) before async I/O starts. The reservation is released
6364 * when prefetched data is consumed and cleared. If the breaker would trip, prefetch is
6465 * skipped and the query falls back to synchronous I/O for that row group.
66+ *
67+ * <p><b>Trivially-passes guard:</b> when late materialization is enabled and row-group
68+ * statistics prove every row satisfies the pushed filter ({@link TriviallyPassesChecker}),
69+ * the iterator reads that entire row group through {@code nextStandard}, skipping per-row
70+ * filter evaluation and survivor compaction. This benefits queries
71+ * that mix selective and non-selective row groups (e.g., time-bucketed data with skewed
72+ * filter selectivity).
6573 */
6674final class OptimizedParquetColumnIterator implements CloseableIterator <Page > {
6775
@@ -111,8 +119,28 @@ final class OptimizedParquetColumnIterator implements CloseableIterator<Page> {
111119 private final boolean lateMaterialization ;
112120 private final boolean [] isPredicateColumn ;
113121 private final ParquetPushedExpressions pushedExpressions ;
122+ /**
123+ * The parquet-mr {@link FilterPredicate} resolved by {@code ParquetFormatReader} for this scan,
124+ * passed through unchanged so the trivially-passes check sees the same predicate that drove
125+ * row-group pruning and ColumnIndex {@link RowRanges} computation. Translating again here
126+ * would be wasted work and could subtly diverge if the caller's schema differs from
127+ * {@link #projectedSchema}.
128+ *
129+ * <p>{@code null} when the trivially-passes guard is inactive: late materialization is off,
130+ * the reader did not resolve a file-level predicate, or predicate resolution failed earlier
131+ * (in which case the reader logged a warning and passed {@code null} through).
132+ */
133+ private final FilterPredicate triviallyPassesPredicate ;
114134 private final WordMask survivorMask ;
115135 private long rowsEliminatedByLateMaterialization ;
136+ /**
137+ * When {@code true}, row-group statistics prove every row in the current row group satisfies
138+ * the pushed filter, so late materialization is bypassed for this row group: filter
139+ * evaluation and survivor compaction are skipped, and the standard read path is used.
140+ */
141+ private boolean currentRowGroupTriviallyPasses ;
142+ /** Diagnostic counter: number of row groups for which the filter was proven to trivially pass. */
143+ private long rowGroupsWithTrivialFilter ;
116144
117145 OptimizedParquetColumnIterator (
118146 ParquetFileReader reader ,
@@ -129,7 +157,8 @@ final class OptimizedParquetColumnIterator implements CloseableIterator<Page> {
129157 RowRanges [] allRowRanges ,
130158 boolean [] survivingRowGroups ,
131159 CompressionCodecFactory codecFactory ,
132- ParquetPushedExpressions pushedExpressions
160+ ParquetPushedExpressions pushedExpressions ,
161+ FilterPredicate triviallyPassesPredicate
133162 ) {
134163 this .reader = reader ;
135164 this .projectedSchema = projectedSchema ;
@@ -151,6 +180,9 @@ final class OptimizedParquetColumnIterator implements CloseableIterator<Page> {
151180 this .isPredicateColumn = classifyPredicateColumns (attributes , columnInfos , pushedExpressions );
152181 this .lateMaterialization = pushedExpressions != null && hasProjectionOnlyColumns (isPredicateColumn , columnInfos );
153182 this .survivorMask = lateMaterialization ? new WordMask () : null ;
183+ // Caller supplies null when late materialization is off; defensively also drop it here so
184+ // the trivially-passes check is gated by a single condition below.
185+ this .triviallyPassesPredicate = lateMaterialization ? triviallyPassesPredicate : null ;
154186
155187 this .projectedColumnPaths = buildProjectedColumnPaths (columnInfos );
156188 this .prefetchDepth = computePrefetchDepth (reader .getRowGroups (), this .projectedColumnPaths );
@@ -333,18 +365,34 @@ private boolean advanceRowGroup() throws IOException {
333365 if (rowsEliminatedByLateMaterialization > 0 ) {
334366 logger .debug ("Late materialization eliminated [{}] rows in [{}]" , rowsEliminatedByLateMaterialization , fileLocation );
335367 }
368+ if (rowGroupsWithTrivialFilter > 0 ) {
369+ logger .debug (
370+ "Trivially-passes guard skipped late-materialization for [{}] row groups in [{}]" ,
371+ rowGroupsWithTrivialFilter ,
372+ fileLocation
373+ );
374+ }
336375 return false ;
337376 }
338377 rowGroupOrdinal = nextOrdinal ;
339378 pageBatchIndexInRowGroup = 0 ;
340379
341380 BlockMetaData block = reader .getRowGroups ().get (rowGroupOrdinal );
381+ // Per-row-group trivially-passes check: when stats prove every row matches the filter,
382+ // the late-materialization machinery (decode predicate columns → evaluate filter → compact
383+ // survivors) is pure overhead. Switching to the standard path eliminates filter evaluation.
384+ currentRowGroupTriviallyPasses = triviallyPassesPredicate != null && TriviallyPassesChecker .check (triviallyPassesPredicate , block );
385+ if (currentRowGroupTriviallyPasses ) {
386+ rowGroupsWithTrivialFilter ++;
387+ }
342388 NavigableMap <Long , ColumnChunkPrefetcher .PrefetchedChunk > chunks = takePendingPrefetch (rowGroupOrdinal );
343389 try {
344390 RowRanges currentRowRanges = resolveCurrentRowRanges (block );
345391 // When late materialization is active, skip ColumnIndex page filtering — late-mat handles
346392 // row-level filtering itself via the survivor mask. Applying both ColumnIndex RowRanges
347393 // AND late-mat evaluation causes double-filtering that drops rows incorrectly.
394+ // The trivially-passes case is handled the same way: we already know all rows match,
395+ // so leaving page filtering off is consistent and safe (RowRanges would be all() anyway).
348396 RowRanges buildRowRanges = lateMaterialization ? null : currentRowRanges ;
349397 rowGroup = PrefetchedRowGroupBuilder .build (
350398 block ,
@@ -551,12 +599,13 @@ public Page next() {
551599 }
552600 int rowsToRead = (int ) Math .min (effectiveBatch , rowsRemainingInGroup );
553601
554- Page result = lateMaterialization ? nextWithLateMaterialization (rowsToRead ) : nextStandard (rowsToRead );
602+ boolean useLateMaterialization = lateMaterialization && currentRowGroupTriviallyPasses == false ;
603+ Page result = useLateMaterialization ? nextWithLateMaterialization (rowsToRead ) : nextStandard (rowsToRead );
555604
556605 pageBatchIndexInRowGroup ++;
557606 rowsRemainingInGroup -= rowsToRead ;
558607 if (rowBudget != FormatReader .NO_LIMIT ) {
559- rowBudget -= lateMaterialization ? result .getPositionCount () : rowsToRead ;
608+ rowBudget -= useLateMaterialization ? result .getPositionCount () : rowsToRead ;
560609 }
561610 return result ;
562611 }
0 commit comments